unitxt.standard module¶
- class unitxt.standard.BaseRecipe(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, max_steps: int | None = None, caching: bool = None, card: TaskCard = None, task: Task = None, template: Template | List[Template] | TemplatesList = None, system_prompt: SystemPrompt = EmptySystemPrompt(__type__='empty_system_prompt', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True), format: Format = SystemFormat(__type__='system_format', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, demos_field='demos', demo_format='{source}\\N{target_prefix}{target}\n\n', model_input_format='{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}', format_args={}), serializer: SingleTypeSerializer | List[SingleTypeSerializer] = None, template_card_index: int = None, metrics: List[str] = None, postprocessors: List[str] = None, group_by: List[str | List[str]] = [], loader_limit: int = None, max_train_instances: int = None, max_validation_instances: int = None, max_test_instances: int = None, train_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), validation_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), test_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], 
caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), demos_pool_size: int = None, num_demos: int | List[int] | None = 0, demos_removed_from_data: bool = True, demos_pool_name: str = 'demos_pool', demos_taken_from: str = 'train', demos_field: str = 'demos', sampler: Sampler = None, augmentor: Augmentor = NullAugmentor(__type__='null_augmentor', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, field=None, to_field=None, field_to_field=None, use_query=None, process_every_value=False, get_default=None, not_exist_ok=False, not_exist_do_nothing=False, operator=Identity(__type__='identity', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, field=None, to_field=None, field_to_field=None, use_query=None, process_every_value=False, get_default=None, not_exist_ok=False, not_exist_do_nothing=False)))¶
Bases:
Recipe, SourceSequentialOperator
- group_by: List[str | List[str]] = []¶
- property has_custom_demos_pool¶
- property max_demos_size¶
- property use_demos¶
- class unitxt.standard.CreateDemosPool(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, from_split: str, to_split_names: ~typing.List[str], to_split_sizes: ~typing.List[int], remove_targets_from_source_split: bool = True)¶
Bases:
SeparateSplit
- class unitxt.standard.StandardRecipe(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, max_steps: int | None = None, caching: bool = None, card: TaskCard = None, task: Task = None, template: Template | List[Template] | TemplatesList = None, system_prompt: SystemPrompt = EmptySystemPrompt(__type__='empty_system_prompt', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True), format: Format = SystemFormat(__type__='system_format', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, demos_field='demos', demo_format='{source}\\N{target_prefix}{target}\n\n', model_input_format='{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}', format_args={}), serializer: SingleTypeSerializer | List[SingleTypeSerializer] = None, template_card_index: int = None, metrics: List[str] = None, postprocessors: List[str] = None, group_by: List[str | List[str]] = [], loader_limit: int = None, max_train_instances: int = None, max_validation_instances: int = None, max_test_instances: int = None, train_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), validation_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), test_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], 
caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), demos_pool_size: int = None, num_demos: int | List[int] | None = 0, demos_removed_from_data: bool = True, demos_pool_name: str = 'demos_pool', demos_taken_from: str = 'train', demos_field: str = 'demos', sampler: Sampler = None, augmentor: Augmentor = NullAugmentor(__type__='null_augmentor', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, field=None, to_field=None, field_to_field=None, use_query=None, process_every_value=False, get_default=None, not_exist_ok=False, not_exist_do_nothing=False, operator=Identity(__type__='identity', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, field=None, to_field=None, field_to_field=None, use_query=None, process_every_value=False, get_default=None, not_exist_ok=False, not_exist_do_nothing=False)))¶
Bases:
StandardRecipeWithIndexes
This class represents a standard recipe for data processing and preparation.
This class can be used to prepare a recipe with all necessary steps, refiners, and renderers included. It allows setting various parameters and steps in a sequential manner for preparing the recipe.
- system_prompt¶
SystemPrompt object to be used for the recipe.
- Type:
SystemPrompt, optional
- loader_limit¶
Specifies the maximum number of instances per stream to be returned from the loader (used to reduce loading time in large datasets).
- Type:
int, optional
- format¶
SystemFormat object to be used for the recipe.
- Type:
SystemFormat, optional
- metrics¶
List of catalog metrics to use with this recipe.
- Type:
List[str]
- postprocessors¶
List of catalog processors to apply at post-processing (not recommended to use from here).
- Type:
List[str]
- group_by¶
List of task_data or metadata keys to group global scores by.
- Type:
List[Union[str, List[str]]]
- train_refiner¶
Train refiner to be used in the recipe.
- Type:
StreamRefiner, optional
- max_train_instances¶
Maximum training instances for the refiner.
- Type:
int, optional
- validation_refiner¶
Validation refiner to be used in the recipe.
- Type:
StreamRefiner, optional
- max_validation_instances¶
Maximum validation instances for the refiner.
- Type:
int, optional
- test_refiner¶
Test refiner to be used in the recipe.
- Type:
StreamRefiner, optional
- max_test_instances¶
Maximum test instances for the refiner.
- Type:
int, optional
- demos_pool_size¶
Size of the demos pool.
- Type:
int, optional
- num_demos¶
Number of demos to be used.
- Type:
int or List[int], optional
- demos_pool_name¶
Name of the demos pool. Default is “demos_pool”.
- Type:
str, optional
- demos_taken_from¶
Specifies from where the demos are taken. Default is “train”.
- Type:
str, optional
- demos_field¶
Field name for demos. Default is “demos”.
- Type:
str, optional
- demos_removed_from_data¶
Whether to remove the demos from the source data. Default is True.
- Type:
bool, optional
- steps¶
List of StreamingOperator objects to be used in the recipe.
- Type:
List[StreamingOperator], optional
- instruction_card_index¶
Index of instruction card to be used for preparing the recipe.
- Type:
int, optional
- template_card_index¶
Index of template card to be used for preparing the recipe.
- Type:
int, optional
- prepare()¶
This overridden method is used for preparing the recipe by arranging all the steps, refiners, and renderers in a sequential manner.
- Raises:
AssertionError – If both template and template_card_index are specified at the same time.
- class unitxt.standard.StandardRecipeWithIndexes(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, max_steps: int | None = None, caching: bool = None, card: TaskCard = None, task: Task = None, template: Template | List[Template] | TemplatesList = None, system_prompt: SystemPrompt = EmptySystemPrompt(__type__='empty_system_prompt', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, skip_rendered_instance=True), format: Format = SystemFormat(__type__='system_format', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, demos_field='demos', demo_format='{source}\\N{target_prefix}{target}\n\n', model_input_format='{system_prompt}\\N{instruction}\\N{demos}{source}\\N{target_prefix}', format_args={}), serializer: SingleTypeSerializer | List[SingleTypeSerializer] = None, template_card_index: int = None, metrics: List[str] = None, postprocessors: List[str] = None, group_by: List[str | List[str]] = [], loader_limit: int = None, max_train_instances: int = None, max_validation_instances: int = None, max_test_instances: int = None, train_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), validation_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), test_refiner: StreamRefiner = StreamRefiner(__type__='stream_refiner', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, 
_requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, max_instances=None), demos_pool_size: int = None, num_demos: int | List[int] | None = 0, demos_removed_from_data: bool = True, demos_pool_name: str = 'demos_pool', demos_taken_from: str = 'train', demos_field: str = 'demos', sampler: Sampler = None, augmentor: Augmentor = NullAugmentor(__type__='null_augmentor', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, field=None, to_field=None, field_to_field=None, use_query=None, process_every_value=False, get_default=None, not_exist_ok=False, not_exist_do_nothing=False, operator=Identity(__type__='identity', __description__=None, __tags__={}, __id__=None, data_classification_policy=None, _requirements_list=[], caching=None, apply_to_streams=None, dont_apply_to_streams=None, field=None, to_field=None, field_to_field=None, use_query=None, process_every_value=False, get_default=None, not_exist_ok=False, not_exist_do_nothing=False)))¶
Bases:
BaseRecipe