unitxt.standard module¶

class unitxt.standard.AddDemosField(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, source_stream: str = None, target_field: str = None, sampler: Sampler = None)¶: Bases: SpreadSplit

class unitxt.standard.BaseRecipe(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, max_steps: int | None = None, caching: bool = None, card: ~unitxt.card.TaskCard, template: ~unitxt.templates.Template = None, system_prompt: ~unitxt.system_prompts.SystemPrompt = EmptySystemPrompt(__type__='empty_system_prompt', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True), format: ~unitxt.formats.Format = SystemFormat(__type__='system_format', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, demos_field='demos', demo_format='{source}\\N{target_prefix}{target}\n\n', model_input_format='{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}', format_args={}), metrics: ~typing.List[str] = None, postprocessors: ~typing.List[str] = None, loader_limit: int = None, max_train_instances: int = None, max_validation_instances: int = None, max_test_instances: int = None, train_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), validation_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), test_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, 
dont_apply_to_streams=None, max_instances=None), demos_pool_size: int = None, num_demos: int = 0, demos_removed_from_data: bool = True, demos_pool_name: str = 'demos_pool', demos_taken_from: str = 'train', demos_field: str = 'demos', sampler: ~unitxt.splitters.Sampler = None, augmentor: ~unitxt.operators.Augmentor = NullAugmentor(__type__='null_augmentor', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, augment_task_input=False, augment_model_input=False))¶: Bases: Recipe, SourceSequentialOperator

class unitxt.standard.CreateDemosPool(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, from_split: str, to_split_names: ~typing.List[str], to_split_sizes: ~typing.List[int], remove_targets_from_source_split: bool = True)¶: Bases: SeparateSplit

class unitxt.standard.StandardRecipe(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, max_steps: int | None = None, caching: bool = None, card: ~unitxt.card.TaskCard, template: ~unitxt.templates.Template = None, system_prompt: ~unitxt.system_prompts.SystemPrompt = EmptySystemPrompt(__type__='empty_system_prompt', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True), format: ~unitxt.formats.Format = SystemFormat(__type__='system_format', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, demos_field='demos', demo_format='{source}\\N{target_prefix}{target}\n\n', model_input_format='{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}', format_args={}), metrics: ~typing.List[str] = None, postprocessors: ~typing.List[str] = None, loader_limit: int = None, max_train_instances: int = None, max_validation_instances: int = None, max_test_instances: int = None, train_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), validation_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), test_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, 
dont_apply_to_streams=None, max_instances=None), demos_pool_size: int = None, num_demos: int = 0, demos_removed_from_data: bool = True, demos_pool_name: str = 'demos_pool', demos_taken_from: str = 'train', demos_field: str = 'demos', sampler: ~unitxt.splitters.Sampler = None, augmentor: ~unitxt.operators.Augmentor = NullAugmentor(__type__='null_augmentor', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, augment_task_input=False, augment_model_input=False), template_card_index: int = None)¶

Bases: StandardRecipeWithIndexes

This class represents a standard recipe for data processing and preparation.

This class can be used to prepare a recipe with all necessary steps, refiners and renderers included. It allows setting various parameters and steps in a sequential manner for preparing the recipe.

card¶

TaskCard object associated with the recipe.

Type:: TaskCard

template¶

Template object to be used for the recipe.

Type:: Template, optional

system_prompt¶

SystemPrompt object to be used for the recipe.

Type:: SystemPrompt, optional

loader_limit¶

Specifies the maximum number of instances per stream to be returned from the loader (used to reduce loading time in large datasets).

Type:: int, optional

format¶

SystemFormat object to be used for the recipe.

Type:: SystemFormat, optional

metrics¶

List of catalog metrics to use with this recipe.

Type:: List[str]

postprocessors¶

List of catalog processors to apply at post-processing. (Not recommended to use from here.)

Type:: List[str]

train_refiner¶

Train refiner to be used in the recipe.

Type:: StreamRefiner, optional

max_train_instances¶

Maximum training instances for the refiner.

Type:: int, optional

validation_refiner¶

Validation refiner to be used in the recipe.

Type:: StreamRefiner, optional

max_validation_instances¶

Maximum validation instances for the refiner.

Type:: int, optional

test_refiner¶

Test refiner to be used in the recipe.

Type:: StreamRefiner, optional

max_test_instances¶

Maximum test instances for the refiner.

Type:: int, optional

demos_pool_size¶

Size of the demos pool.

Type:: int, optional

num_demos¶

Number of demos to be used.

Type:: int, optional

demos_pool_name¶

Name of the demos pool. Default is “demos_pool”.

Type:: str, optional

demos_taken_from¶

Specifies from where the demos are taken. Default is “train”.

Type:: str, optional

demos_field¶

Field name for demos. Default is “demos”.

Type:: str, optional

demos_removed_from_data¶

Whether to remove the demos from the source data. Default is True.

Type:: bool, optional

sampler¶

The Sampler used to select the demonstrations when num_demos > 0.

Type:: Sampler, optional

steps¶

List of StreamingOperator objects to be used in the recipe.

Type:: List[StreamingOperator], optional

augmentor¶

Augmentor to be used to pseudo-randomly augment the source text.

Type:: Augmentor

instruction_card_index¶

Index of instruction card to be used for preparing the recipe.

Type:: int, optional

template_card_index¶

Index of template card to be used for preparing the recipe.

Type:: int, optional

prepare()¶: This overridden method is used for preparing the recipe by arranging all the steps, refiners, and renderers in a sequential manner.

Raises:: AssertionError – If both template and template_card_index are specified at the same time.

class unitxt.standard.StandardRecipeWithIndexes(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, max_steps: int | None = None, caching: bool = None, card: ~unitxt.card.TaskCard, template: ~unitxt.templates.Template = None, system_prompt: ~unitxt.system_prompts.SystemPrompt = EmptySystemPrompt(__type__='empty_system_prompt', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True), format: ~unitxt.formats.Format = SystemFormat(__type__='system_format', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, demos_field='demos', demo_format='{source}\\N{target_prefix}{target}\n\n', model_input_format='{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}', format_args={}), metrics: ~typing.List[str] = None, postprocessors: ~typing.List[str] = None, loader_limit: int = None, max_train_instances: int = None, max_validation_instances: int = None, max_test_instances: int = None, train_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), validation_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), test_refiner: ~unitxt.operators.StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, 
dont_apply_to_streams=None, max_instances=None), demos_pool_size: int = None, num_demos: int = 0, demos_removed_from_data: bool = True, demos_pool_name: str = 'demos_pool', demos_taken_from: str = 'train', demos_field: str = 'demos', sampler: ~unitxt.splitters.Sampler = None, augmentor: ~unitxt.operators.Augmentor = NullAugmentor(__type__='null_augmentor', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, augment_task_input=False, augment_model_input=False), template_card_index: int = None)¶: Bases: BaseRecipe