unitxt.processors module

class unitxt.processors.Capitalize(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ConvertToBoolean(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.DictOfListsToPairs(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, position_key_before_value: bool = True)

Bases: FieldOperator

class unitxt.processors.ExtractArenaHardNumericalJudgment(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractMtBenchLabelJudgment(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractMtBenchRatingJudgment(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractSafeUnsafeJudgment(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractWithRegex(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, regex: str, termination_regex: str = None)

Bases: RegexParser

class unitxt.processors.FirstCharacter(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.FixWhiteSpace(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.GetStringAfter(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, substring: str)

Bases: FieldOperator

class unitxt.processors.InferDictsToBinaryLogprobs(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, neg_class_name: str, pos_class_name: str, take_logprobs_from_end: bool = False, num_logprobs_to_take: int = 3)

Bases: FieldOperator

class unitxt.processors.ListToEmptyEntitiesTuples(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.LiteralEval(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.Lower(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.LowerCaseTillPunc(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.MatchClosestOption(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, options_field: str = 'options')

Bases: InstanceFieldOperator

class unitxt.processors.PostProcess(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, operator: unitxt.operators.InstanceFieldOperator, process_prediction: bool = True, process_references: bool = True)

Bases: MultiStreamOperator

class unitxt.processors.RegexParser(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, regex: str, termination_regex: str = None)

Bases: FieldOperator

A processor that parses a string using a regular expression.

class unitxt.processors.RemoveArticles(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.RemovePunctuations(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.SplitStrip(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, delimiter: str = ' ', strip_every_element: bool = False)

Bases: FieldOperator

class unitxt.processors.StanceToProCon(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.StrToFloatFormat(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.StringEquals(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, string: str)

Bases: FieldOperator

class unitxt.processors.Substring(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, begin: int = 0, end: int = None)

Bases: FieldOperator

class unitxt.processors.TakeFirstNonEmptyLine(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.TakeFirstWord(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ToListByComma(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, delimiter: str = ',', strip_every_element: bool = True)

Bases: SplitStrip

class unitxt.processors.ToListByCommaSpace(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False, delimiter: str = ', ', strip_every_element: bool = True)

Bases: SplitStrip

class unitxt.processors.ToString(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ToStringStripped(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.ToYesOrNone(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.YesNoToInt(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

class unitxt.processors.YesToOneElseZero(__tags__: Dict[str, str] = {}, data_classification_policy: List[str] = None, caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: List[List[str]] | Dict[str, str] | None = None, use_query: bool | None = None, process_every_value: bool = False, get_default: Any = None, not_exist_ok: bool = False, not_exist_do_nothing: bool = False)

Bases: FieldOperator

unitxt.processors.process_instance_value(self, value, instance)