Bluebench
Note
ID: benchmarks.bluebench | Type: Benchmark
{
"__type__": "benchmark",
"subsets": {
"bias": {
"__type__": "benchmark",
"subsets": {
"safety_bbq_age": "recipes.bluebench.bias.safety_bbq_age",
"safety_bbq_disability_status": "recipes.bluebench.bias.safety_bbq_disability_status",
"safety_bbq_gender_identity": "recipes.bluebench.bias.safety_bbq_gender_identity",
"safety_bbq_nationality": "recipes.bluebench.bias.safety_bbq_nationality",
"safety_bbq_physical_appearance": "recipes.bluebench.bias.safety_bbq_physical_appearance",
"safety_bbq_race_ethnicity": "recipes.bluebench.bias.safety_bbq_race_ethnicity",
"safety_bbq_race_x_gender": "recipes.bluebench.bias.safety_bbq_race_x_gender",
"safety_bbq_race_x_ses": "recipes.bluebench.bias.safety_bbq_race_x_ses",
"safety_bbq_religion": "recipes.bluebench.bias.safety_bbq_religion",
"safety_bbq_ses": "recipes.bluebench.bias.safety_bbq_ses",
"safety_bbq_sexual_orientation": "recipes.bluebench.bias.safety_bbq_sexual_orientation"
}
},
"chatbot_abilities": {
"__type__": "benchmark",
"subsets": {
"arena_hard_generation_english_gpt_4_0314_reference": "recipes.bluebench.chatbot_abilities.arena_hard_generation_english_gpt_4_0314_reference"
}
},
"entity_extraction": {
"__type__": "benchmark",
"subsets": {
"universal_ner_en_ewt": "recipes.bluebench.entity_extraction.universal_ner_en_ewt"
}
},
"knowledge": {
"__type__": "benchmark",
"subsets": {
"mmlu_pro_biology": "recipes.bluebench.knowledge.mmlu_pro_biology",
"mmlu_pro_business": "recipes.bluebench.knowledge.mmlu_pro_business",
"mmlu_pro_chemistry": "recipes.bluebench.knowledge.mmlu_pro_chemistry",
"mmlu_pro_computer_science": "recipes.bluebench.knowledge.mmlu_pro_computer_science",
"mmlu_pro_economics": "recipes.bluebench.knowledge.mmlu_pro_economics",
"mmlu_pro_engineering": "recipes.bluebench.knowledge.mmlu_pro_engineering",
"mmlu_pro_health": "recipes.bluebench.knowledge.mmlu_pro_health",
"mmlu_pro_history": "recipes.bluebench.knowledge.mmlu_pro_history",
"mmlu_pro_law": "recipes.bluebench.knowledge.mmlu_pro_law",
"mmlu_pro_math": "recipes.bluebench.knowledge.mmlu_pro_math",
"mmlu_pro_other": "recipes.bluebench.knowledge.mmlu_pro_other",
"mmlu_pro_philosophy": "recipes.bluebench.knowledge.mmlu_pro_philosophy",
"mmlu_pro_physics": "recipes.bluebench.knowledge.mmlu_pro_physics",
"mmlu_pro_psychology": "recipes.bluebench.knowledge.mmlu_pro_psychology"
}
},
"legal": {
"__type__": "benchmark",
"subsets": {
"legalbench_abercrombie": "recipes.bluebench.legal.legalbench_abercrombie",
"legalbench_corporate_lobbying": "recipes.bluebench.legal.legalbench_corporate_lobbying",
"legalbench_function_of_decision_section": "recipes.bluebench.legal.legalbench_function_of_decision_section",
"legalbench_international_citizenship_questions": "recipes.bluebench.legal.legalbench_international_citizenship_questions",
"legalbench_proa": "recipes.bluebench.legal.legalbench_proa"
}
},
"news_classification": {
"__type__": "benchmark",
"subsets": {
"20_newsgroups": "recipes.bluebench.news_classification.20_newsgroups"
}
},
"product_help": {
"__type__": "benchmark",
"subsets": {
"cfpb_product_2023": "recipes.bluebench.product_help.cfpb_product_2023",
"cfpb_product_watsonx": "recipes.bluebench.product_help.cfpb_product_watsonx"
}
},
"qa_finance": {
"__type__": "benchmark",
"subsets": {
"fin_qa": "recipes.bluebench.qa_finance.fin_qa"
}
},
"rag_general": {
"__type__": "benchmark",
"subsets": {
"rag_response_generation_clapnq": "recipes.bluebench.rag_general.rag_response_generation_clapnq"
}
},
"reasoning": {
"__type__": "benchmark",
"subsets": {
"hellaswag": "recipes.bluebench.reasoning.hellaswag",
"openbook_qa": "recipes.bluebench.reasoning.openbook_qa"
}
},
"safety": {
"__type__": "benchmark",
"subsets": {
"attaq_500": "recipes.bluebench.safety.attaq_500"
}
},
"summarization": {
"__type__": "benchmark",
"subsets": {
"billsum_document_filtered_to_6000_chars": "recipes.bluebench.summarization.billsum_document_filtered_to_6000_chars",
"tldr_document_filtered_to_6000_chars": "recipes.bluebench.summarization.tldr_document_filtered_to_6000_chars"
}
},
"translation": {
"__type__": "benchmark",
"subsets": {
"mt_flores_101_ara_eng": "recipes.bluebench.translation.mt_flores_101_ara_eng",
"mt_flores_101_deu_eng": "recipes.bluebench.translation.mt_flores_101_deu_eng",
"mt_flores_101_eng_ara": "recipes.bluebench.translation.mt_flores_101_eng_ara",
"mt_flores_101_eng_deu": "recipes.bluebench.translation.mt_flores_101_eng_deu",
"mt_flores_101_eng_fra": "recipes.bluebench.translation.mt_flores_101_eng_fra",
"mt_flores_101_eng_kor": "recipes.bluebench.translation.mt_flores_101_eng_kor",
"mt_flores_101_eng_por": "recipes.bluebench.translation.mt_flores_101_eng_por",
"mt_flores_101_eng_ron": "recipes.bluebench.translation.mt_flores_101_eng_ron",
"mt_flores_101_eng_spa": "recipes.bluebench.translation.mt_flores_101_eng_spa",
"mt_flores_101_fra_eng": "recipes.bluebench.translation.mt_flores_101_fra_eng",
"mt_flores_101_jpn_eng": "recipes.bluebench.translation.mt_flores_101_jpn_eng",
"mt_flores_101_kor_eng": "recipes.bluebench.translation.mt_flores_101_kor_eng",
"mt_flores_101_por_eng": "recipes.bluebench.translation.mt_flores_101_por_eng",
"mt_flores_101_ron_eng": "recipes.bluebench.translation.mt_flores_101_ron_eng",
"mt_flores_101_spa_eng": "recipes.bluebench.translation.mt_flores_101_spa_eng"
}
}
}
}
References: recipes.bluebench.knowledge.mmlu_pro_biology, recipes.bluebench.bias.safety_bbq_race_x_gender, recipes.bluebench.chatbot_abilities.arena_hard_generation_english_gpt_4_0314_reference, recipes.bluebench.translation.mt_flores_101_spa_eng, recipes.bluebench.knowledge.mmlu_pro_business, recipes.bluebench.knowledge.mmlu_pro_law, recipes.bluebench.knowledge.mmlu_pro_engineering, recipes.bluebench.legal.legalbench_international_citizenship_questions, recipes.bluebench.knowledge.mmlu_pro_economics, recipes.bluebench.translation.mt_flores_101_eng_ron, recipes.bluebench.legal.legalbench_corporate_lobbying, recipes.bluebench.translation.mt_flores_101_eng_deu, recipes.bluebench.reasoning.openbook_qa, recipes.bluebench.bias.safety_bbq_gender_identity, recipes.bluebench.knowledge.mmlu_pro_psychology, recipes.bluebench.bias.safety_bbq_race_ethnicity, recipes.bluebench.legal.legalbench_function_of_decision_section, recipes.bluebench.knowledge.mmlu_pro_history, recipes.bluebench.knowledge.mmlu_pro_philosophy, recipes.bluebench.legal.legalbench_proa, recipes.bluebench.summarization.billsum_document_filtered_to_6000_chars, recipes.bluebench.bias.safety_bbq_religion, recipes.bluebench.translation.mt_flores_101_eng_ara, recipes.bluebench.translation.mt_flores_101_ron_eng, recipes.bluebench.translation.mt_flores_101_por_eng, recipes.bluebench.bias.safety_bbq_age, recipes.bluebench.summarization.tldr_document_filtered_to_6000_chars, recipes.bluebench.safety.attaq_500, recipes.bluebench.knowledge.mmlu_pro_computer_science, recipes.bluebench.entity_extraction.universal_ner_en_ewt, recipes.bluebench.news_classification.20_newsgroups, recipes.bluebench.reasoning.hellaswag, recipes.bluebench.translation.mt_flores_101_eng_fra, recipes.bluebench.bias.safety_bbq_ses, recipes.bluebench.translation.mt_flores_101_ara_eng, recipes.bluebench.bias.safety_bbq_disability_status, recipes.bluebench.product_help.cfpb_product_watsonx, recipes.bluebench.translation.mt_flores_101_eng_por, 
recipes.bluebench.product_help.cfpb_product_2023, recipes.bluebench.translation.mt_flores_101_kor_eng, recipes.bluebench.qa_finance.fin_qa, recipes.bluebench.bias.safety_bbq_physical_appearance, recipes.bluebench.translation.mt_flores_101_deu_eng, recipes.bluebench.bias.safety_bbq_race_x_ses, recipes.bluebench.knowledge.mmlu_pro_chemistry, recipes.bluebench.knowledge.mmlu_pro_physics, recipes.bluebench.knowledge.mmlu_pro_math, recipes.bluebench.translation.mt_flores_101_jpn_eng, recipes.bluebench.bias.safety_bbq_nationality, recipes.bluebench.translation.mt_flores_101_eng_spa, recipes.bluebench.legal.legalbench_abercrombie, recipes.bluebench.knowledge.mmlu_pro_health, recipes.bluebench.rag_general.rag_response_generation_clapnq, recipes.bluebench.translation.mt_flores_101_fra_eng, recipes.bluebench.translation.mt_flores_101_eng_kor, recipes.bluebench.knowledge.mmlu_pro_other, recipes.bluebench.bias.safety_bbq_sexual_orientation
Read more about catalog usage here.