unitxt.llm_as_judge module

class unitxt.llm_as_judge.LLMJudge(data_classification_policy: List[str] = None, main_score: str = __required__, prediction_type: Union[Any, str] = typing.Any, single_reference_per_prediction: bool = False, score_prefix: str = '', n_resamples: int = 1000, confidence_level: float = 0.95, ci_scores: List[str] = None, _requirements_list: Union[List[str], Dict[str, str]] = [], requirements: Union[List[str], Dict[str, str]] = [], caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, reduction_map: Dict[str, List[str]] = __required__, implemented_reductions: List[str] = ['mean', 'weighted_win_rate'], inference_engine: unitxt.inference.InferenceEngine = __required__, evaluator_name: EvaluatorNameEnum = None, check_positional_bias: bool = True, context_fields: Union[str, List[str], Dict[str, str]] = ['context'], generate_summaries: bool = True, include_prompts_in_result: bool = False, criteria_field: str = None, criteria: unitxt.llm_as_judge_constants.Criteria = None)[source]

Bases: BulkInstanceMetric

context_fields: str | List[str] | Dict[str, str] = ['context']
logger = <Logger unitxt (INFO)>
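
The context_fields attribute selects which task fields are passed to the judge as context. Per the Union[str, List[str], Dict[str, str]] annotation above it accepts three forms; the sketch below illustrates them with hypothetical field names, and the mapping direction of the dict form (display label to task field name) is our assumption, to be verified against your unitxt version:

    # a single task field
    context_fields = "context"
    # several task fields
    context_fields = ["context", "question"]
    # assumed: display label shown to the judge -> task field name
    context_fields = {"Background": "context", "Question": "question"}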
class unitxt.llm_as_judge.LLMJudgeDirect(data_classification_policy: List[str] = None, main_score: str = 'llm_as_judge', prediction_type: Union[Any, str] = typing.Any, single_reference_per_prediction: bool = False, score_prefix: str = '', n_resamples: int = 1000, confidence_level: float = 0.95, ci_scores: List[str] = None, _requirements_list: Union[List[str], Dict[str, str]] = [], requirements: Union[List[str], Dict[str, str]] = [], caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, reduction_map: Dict[str, List[str]] = {'mean': ['llm_as_judge']}, implemented_reductions: List[str] = ['mean', 'weighted_win_rate'], inference_engine: unitxt.inference.InferenceEngine = __required__, evaluator_name: EvaluatorNameEnum = None, check_positional_bias: bool = True, context_fields: Union[str, List[str], Dict[str, str]] = ['context'], generate_summaries: bool = True, include_prompts_in_result: bool = False, criteria_field: str = None, criteria: unitxt.llm_as_judge_constants.CriteriaWithOptions = None)[source]

Bases: LLMJudge

reduction_map: Dict[str, List[str]] = {'mean': ['llm_as_judge']}
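
A minimal usage sketch for LLMJudgeDirect. It assumes CrossProviderInferenceEngine is available in unitxt.inference and that CriteriaWithOptions and CriteriaOption accept the fields shown (name, description, options, option_map); the model name, provider, criterion text, and scores are placeholders rather than values taken from this page:

    from unitxt.inference import CrossProviderInferenceEngine
    from unitxt.llm_as_judge import LLMJudgeDirect
    from unitxt.llm_as_judge_constants import CriteriaOption, CriteriaWithOptions

    # Hypothetical yes/no criterion; option_map turns the chosen option into a numeric score.
    answer_quality = CriteriaWithOptions(
        name="answer_quality",
        description="Is the response accurate and grounded in the provided context?",
        options=[
            CriteriaOption(name="Yes", description="The response is accurate and grounded."),
            CriteriaOption(name="No", description="The response is inaccurate or ungrounded."),
        ],
        option_map={"Yes": 1.0, "No": 0.0},
    )

    judge = LLMJudgeDirect(
        inference_engine=CrossProviderInferenceEngine(
            model="llama-3-3-70b-instruct", provider="watsonx"  # placeholder model/provider
        ),
        criteria=answer_quality,
        context_fields=["context"],   # task fields shown to the judge
        check_positional_bias=True,   # default per the signature above, shown for illustration
    )

Since LLMJudgeDirect is a BulkInstanceMetric (via LLMJudge), the resulting object can be used wherever unitxt expects a metric; its main score is reported under the llm_as_judge key, matching the reduction_map above.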
class unitxt.llm_as_judge.LLMJudgePairwise(data_classification_policy: List[str] = None, main_score: str = '1_winrate', prediction_type: Union[Any, str] = typing.List[str], single_reference_per_prediction: bool = False, score_prefix: str = '', n_resamples: int = 1000, confidence_level: float = 0.95, ci_scores: List[str] = None, _requirements_list: Union[List[str], Dict[str, str]] = [], requirements: Union[List[str], Dict[str, str]] = [], caching: bool = None, apply_to_streams: List[str] = None, dont_apply_to_streams: List[str] = None, reduction_map: Dict[str, List[str]] = {'mean': ['score']}, implemented_reductions: List[str] = ['mean', 'weighted_win_rate'], inference_engine: unitxt.inference.InferenceEngine = __required__, evaluator_name: EvaluatorNameEnum = None, check_positional_bias: bool = True, context_fields: Union[str, List[str], Dict[str, str]] = ['context'], generate_summaries: bool = True, include_prompts_in_result: bool = False, criteria_field: str = None, criteria: unitxt.llm_as_judge_constants.Criteria = None)[source]

Bases: LLMJudge

prediction_type

alias of List[str]

reduction_map: Dict[str, List[str]] = {'mean': ['score']}
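
A minimal sketch for LLMJudgePairwise under the same assumptions about the inference engine, and assuming Criteria takes a name and a description. Per the prediction_type alias above, each prediction is a List[str] of candidate responses to compare; reading the default main score 1_winrate as the win rate of the first system is our interpretation, not something stated on this page. Criterion text and field names are placeholders:

    from unitxt.inference import CrossProviderInferenceEngine
    from unitxt.llm_as_judge import LLMJudgePairwise
    from unitxt.llm_as_judge_constants import Criteria

    judge = LLMJudgePairwise(
        inference_engine=CrossProviderInferenceEngine(
            model="llama-3-3-70b-instruct", provider="watsonx"  # placeholder model/provider
        ),
        criteria=Criteria(
            name="preferred_response",
            description="Which response answers the user's question more completely and accurately?",
        ),
        context_fields=["question"],  # hypothetical task field
    )

    # Each instance's prediction is a list of responses from the systems being compared,
    # e.g. ["response from system 1", "response from system 2", "response from system 3"].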