unitxt.processors module

class unitxt.processors.Capitalize(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ConvertToBoolean(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.DictOfListsToPairs(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, position_key_before_value: bool = True)

Bases: FieldOperator

class unitxt.processors.ExtractArenaHardNumericalJudgment(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractMtBenchLabelJudgment(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractMtBenchRatingJudgment(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractSafeUnsafeJudgment(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ExtractWithRegex(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, regex: str, termination_regex: str = None)

Bases: RegexParser

class unitxt.processors.FirstCharacter(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.GetStringAfter(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, substring: str)

Bases: FieldOperator

class unitxt.processors.ListToEmptyEntitiesTuples(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.LiteralEval(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.LowerCase(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.LowerCaseTillPunc(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.MatchClosestOption(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, options_field: str = 'options')

Bases: InstanceFieldOperator

class unitxt.processors.RegexParser(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, regex: str, termination_regex: str = None)

Bases: FieldOperator

A processor that uses a regular expression to parse a string.

class unitxt.processors.SplitStrip(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, delimiter: str = ' ', strip_every_element: bool = False)

Bases: FieldOperator

class unitxt.processors.StanceToProCon(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.StrToFloatFormat(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.StringOrNotString(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, string: str)

Bases: FieldOperator

class unitxt.processors.Substring(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, begin: int = 0, end: int = None)

Bases: FieldOperator

class unitxt.processors.TakeFirstNonEmptyLine(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.TakeFirstWord(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ToListByComma(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, delimiter: str = ', ', strip_every_element: bool = True)

Bases: SplitStrip

class unitxt.processors.ToListByCommaSpace(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False, delimiter: str = ', ', strip_every_element: bool = True)

Bases: SplitStrip

class unitxt.processors.ToString(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ToStringStripped(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.ToYesOrNone(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.YesNoToInt(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

class unitxt.processors.YesToOneElseZero(__tags__: ~typing.Dict[str, str] = {}, data_classification_policy: ~typing.List[str] = None, caching: bool = None, apply_to_streams: ~typing.List[str] = None, dont_apply_to_streams: ~typing.List[str] = None, field: str | None = None, to_field: str | None = None, field_to_field: ~typing.List[~typing.List[str]] | ~typing.Dict[str, str] | None = None, use_query: bool, process_every_value: bool = False, get_default: ~typing.Any = None, not_exist_ok: bool = False)

Bases: FieldOperator

unitxt.processors.process_instance_value(self, value, instance)