πŸ“„ Bluebench

BlueBench is an open-source benchmark developed by domain experts to represent the needs of enterprise users.


It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency, leveraging unitxt's capabilities for dynamic and flexible text processing.

As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.

benchmarks.bluebench

Benchmark(
    subsets={
        "bias": Benchmark(
            subsets={
                "safety_bbq_age": "recipes.bluebench.bias.safety_bbq_age",
                "safety_bbq_disability_status": "recipes.bluebench.bias.safety_bbq_disability_status",
                "safety_bbq_gender_identity": "recipes.bluebench.bias.safety_bbq_gender_identity",
                "safety_bbq_nationality": "recipes.bluebench.bias.safety_bbq_nationality",
                "safety_bbq_physical_appearance": "recipes.bluebench.bias.safety_bbq_physical_appearance",
                "safety_bbq_race_ethnicity": "recipes.bluebench.bias.safety_bbq_race_ethnicity",
                "safety_bbq_race_x_gender": "recipes.bluebench.bias.safety_bbq_race_x_gender",
                "safety_bbq_race_x_ses": "recipes.bluebench.bias.safety_bbq_race_x_ses",
                "safety_bbq_religion": "recipes.bluebench.bias.safety_bbq_religion",
                "safety_bbq_ses": "recipes.bluebench.bias.safety_bbq_ses",
                "safety_bbq_sexual_orientation": "recipes.bluebench.bias.safety_bbq_sexual_orientation",
            },
        ),
        "chatbot_abilities": Benchmark(
            subsets={
                "arena_hard_generation_english_gpt_4_0314_reference": "recipes.bluebench.chatbot_abilities.arena_hard_generation_english_gpt_4_0314_reference",
            },
        ),
        "entity_extraction": Benchmark(
            subsets={
                "universal_ner_en_ewt": "recipes.bluebench.entity_extraction.universal_ner_en_ewt",
            },
        ),
        "knowledge": Benchmark(
            subsets={
                "mmlu_pro_biology": "recipes.bluebench.knowledge.mmlu_pro_biology",
                "mmlu_pro_business": "recipes.bluebench.knowledge.mmlu_pro_business",
                "mmlu_pro_chemistry": "recipes.bluebench.knowledge.mmlu_pro_chemistry",
                "mmlu_pro_computer_science": "recipes.bluebench.knowledge.mmlu_pro_computer_science",
                "mmlu_pro_economics": "recipes.bluebench.knowledge.mmlu_pro_economics",
                "mmlu_pro_engineering": "recipes.bluebench.knowledge.mmlu_pro_engineering",
                "mmlu_pro_health": "recipes.bluebench.knowledge.mmlu_pro_health",
                "mmlu_pro_history": "recipes.bluebench.knowledge.mmlu_pro_history",
                "mmlu_pro_law": "recipes.bluebench.knowledge.mmlu_pro_law",
                "mmlu_pro_math": "recipes.bluebench.knowledge.mmlu_pro_math",
                "mmlu_pro_other": "recipes.bluebench.knowledge.mmlu_pro_other",
                "mmlu_pro_philosophy": "recipes.bluebench.knowledge.mmlu_pro_philosophy",
                "mmlu_pro_physics": "recipes.bluebench.knowledge.mmlu_pro_physics",
                "mmlu_pro_psychology": "recipes.bluebench.knowledge.mmlu_pro_psychology",
            },
        ),
        "legal": Benchmark(
            subsets={
                "legalbench_abercrombie": "recipes.bluebench.legal.legalbench_abercrombie",
                "legalbench_corporate_lobbying": "recipes.bluebench.legal.legalbench_corporate_lobbying",
                "legalbench_function_of_decision_section": "recipes.bluebench.legal.legalbench_function_of_decision_section",
                "legalbench_international_citizenship_questions": "recipes.bluebench.legal.legalbench_international_citizenship_questions",
                "legalbench_proa": "recipes.bluebench.legal.legalbench_proa",
            },
        ),
        "news_classification": Benchmark(
            subsets={
                "20_newsgroups_short": "recipes.bluebench.news_classification.20_newsgroups_short",
            },
        ),
        "product_help": Benchmark(
            subsets={
                "cfpb_product_2023": "recipes.bluebench.product_help.cfpb_product_2023",
                "cfpb_product_watsonx": "recipes.bluebench.product_help.cfpb_product_watsonx",
            },
        ),
        "qa_finance": Benchmark(
            subsets={
                "fin_qa": "recipes.bluebench.qa_finance.fin_qa",
            },
        ),
        "rag_general": Benchmark(
            subsets={
                "rag_response_generation_clapnq": "recipes.bluebench.rag_general.rag_response_generation_clapnq",
            },
        ),
        "reasoning": Benchmark(
            subsets={
                "hellaswag": "recipes.bluebench.reasoning.hellaswag",
                "openbook_qa": "recipes.bluebench.reasoning.openbook_qa",
            },
        ),
        "safety": Benchmark(
            subsets={
                "attaq_500": "recipes.bluebench.safety.attaq_500",
            },
        ),
        "summarization": Benchmark(
            subsets={
                "billsum_document_filtered_to_6000_chars": "recipes.bluebench.summarization.billsum_document_filtered_to_6000_chars",
                "tldr_document_filtered_to_6000_chars": "recipes.bluebench.summarization.tldr_document_filtered_to_6000_chars",
            },
        ),
        "translation": Benchmark(
            subsets={
                "mt_flores_101_ara_eng": "recipes.bluebench.translation.mt_flores_101_ara_eng",
                "mt_flores_101_deu_eng": "recipes.bluebench.translation.mt_flores_101_deu_eng",
                "mt_flores_101_eng_ara": "recipes.bluebench.translation.mt_flores_101_eng_ara",
                "mt_flores_101_eng_deu": "recipes.bluebench.translation.mt_flores_101_eng_deu",
                "mt_flores_101_eng_fra": "recipes.bluebench.translation.mt_flores_101_eng_fra",
                "mt_flores_101_eng_kor": "recipes.bluebench.translation.mt_flores_101_eng_kor",
                "mt_flores_101_eng_por": "recipes.bluebench.translation.mt_flores_101_eng_por",
                "mt_flores_101_eng_ron": "recipes.bluebench.translation.mt_flores_101_eng_ron",
                "mt_flores_101_eng_spa": "recipes.bluebench.translation.mt_flores_101_eng_spa",
                "mt_flores_101_fra_eng": "recipes.bluebench.translation.mt_flores_101_fra_eng",
                "mt_flores_101_jpn_eng": "recipes.bluebench.translation.mt_flores_101_jpn_eng",
                "mt_flores_101_kor_eng": "recipes.bluebench.translation.mt_flores_101_kor_eng",
                "mt_flores_101_por_eng": "recipes.bluebench.translation.mt_flores_101_por_eng",
                "mt_flores_101_ron_eng": "recipes.bluebench.translation.mt_flores_101_ron_eng",
                "mt_flores_101_spa_eng": "recipes.bluebench.translation.mt_flores_101_spa_eng",
            },
        ),
    },
)
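
The full benchmark can be loaded and scored through unitxt. The sketch below is illustrative and not tied to a specific unitxt release: it assumes that load_dataset accepts the catalog name benchmarks.bluebench (optionally with a max_samples_per_subset cap) and that evaluate pairs model predictions with the loaded data; my_model is a placeholder for a real inference call.

from unitxt import evaluate, load_dataset

def my_model(prompt: str) -> str:
    # Placeholder: swap in a real model call (API client, local pipeline, ...).
    return ""

# Load the benchmark from the catalog; max_samples_per_subset (an assumed
# parameter) caps each subset for a quick smoke run; drop it to evaluate the
# full benchmark.
dataset = load_dataset("benchmarks.bluebench[max_samples_per_subset=8]", split="test")

# Generate one prediction per rendered prompt.
predictions = [my_model(instance["source"]) for instance in dataset]

# Score the predictions; unitxt reports per-subset scores as well as an
# aggregated benchmark score (exact result fields vary across versions).
results = evaluate(predictions=predictions, data=dataset)
print(results)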

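Individual subsets can also be run on their own through their recipe names, which is useful when iterating on a single capability. The snippet below is a sketch under the same assumptions; the chosen subset is arbitrary, and any recipe from the listing above can be substituted.

from unitxt import load_dataset

# Load a single BlueBench subset by its catalog recipe name (arbitrary choice).
subset = load_dataset("recipes.bluebench.legal.legalbench_proa", split="test")

# Each instance carries the rendered prompt and its expected references.
print(subset[0]["source"])
print(subset[0]["references"])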

Read more about catalog usage in the unitxt documentation.