BlueBench
BlueBench is an open-source benchmark developed by domain experts to represent the needs of enterprise users.

It is constructed using state-of-the-art benchmarking methodologies to ensure validity, robustness, and efficiency by utilizing unitxt's abilities for dynamic and flexible text processing.
As a dynamic and evolving benchmark, BlueBench currently encompasses diverse domains such as legal, finance, customer support, and news. It also evaluates a range of capabilities, including RAG, pro-social behavior, summarization, and chatbot performance, with additional tasks and domains to be integrated over time.
benchmarks.bluebench
type: Benchmark
subsets:
  bias:
    type: Benchmark
    subsets:
      safety_bbq_age: recipes.bluebench.bias.safety_bbq_age
      safety_bbq_disability_status: recipes.bluebench.bias.safety_bbq_disability_status
      safety_bbq_gender_identity: recipes.bluebench.bias.safety_bbq_gender_identity
      safety_bbq_nationality: recipes.bluebench.bias.safety_bbq_nationality
      safety_bbq_physical_appearance: recipes.bluebench.bias.safety_bbq_physical_appearance
      safety_bbq_race_ethnicity: recipes.bluebench.bias.safety_bbq_race_ethnicity
      safety_bbq_race_x_gender: recipes.bluebench.bias.safety_bbq_race_x_gender
      safety_bbq_race_x_ses: recipes.bluebench.bias.safety_bbq_race_x_ses
      safety_bbq_religion: recipes.bluebench.bias.safety_bbq_religion
      safety_bbq_ses: recipes.bluebench.bias.safety_bbq_ses
      safety_bbq_sexual_orientation: recipes.bluebench.bias.safety_bbq_sexual_orientation
  chatbot_abilities:
    type: Benchmark
    subsets:
      arena_hard_generation_english_gpt_4_0314_reference: recipes.bluebench.chatbot_abilities.arena_hard_generation_english_gpt_4_0314_reference
  entity_extraction:
    type: Benchmark
    subsets:
      universal_ner_en_ewt: recipes.bluebench.entity_extraction.universal_ner_en_ewt
  knowledge:
    type: Benchmark
    subsets:
      mmlu_pro_biology: recipes.bluebench.knowledge.mmlu_pro_biology
      mmlu_pro_business: recipes.bluebench.knowledge.mmlu_pro_business
      mmlu_pro_chemistry: recipes.bluebench.knowledge.mmlu_pro_chemistry
      mmlu_pro_computer_science: recipes.bluebench.knowledge.mmlu_pro_computer_science
      mmlu_pro_economics: recipes.bluebench.knowledge.mmlu_pro_economics
      mmlu_pro_engineering: recipes.bluebench.knowledge.mmlu_pro_engineering
      mmlu_pro_health: recipes.bluebench.knowledge.mmlu_pro_health
      mmlu_pro_history: recipes.bluebench.knowledge.mmlu_pro_history
      mmlu_pro_law: recipes.bluebench.knowledge.mmlu_pro_law
      mmlu_pro_math: recipes.bluebench.knowledge.mmlu_pro_math
      mmlu_pro_other: recipes.bluebench.knowledge.mmlu_pro_other
      mmlu_pro_philosophy: recipes.bluebench.knowledge.mmlu_pro_philosophy
      mmlu_pro_physics: recipes.bluebench.knowledge.mmlu_pro_physics
      mmlu_pro_psychology: recipes.bluebench.knowledge.mmlu_pro_psychology
  legal:
    type: Benchmark
    subsets:
      legalbench_abercrombie: recipes.bluebench.legal.legalbench_abercrombie
      legalbench_corporate_lobbying: recipes.bluebench.legal.legalbench_corporate_lobbying
      legalbench_function_of_decision_section: recipes.bluebench.legal.legalbench_function_of_decision_section
      legalbench_international_citizenship_questions: recipes.bluebench.legal.legalbench_international_citizenship_questions
      legalbench_proa: recipes.bluebench.legal.legalbench_proa
  news_classification:
    type: Benchmark
    subsets:
      20_newsgroups_short: recipes.bluebench.news_classification.20_newsgroups_short
  product_help:
    type: Benchmark
    subsets:
      cfpb_product_2023: recipes.bluebench.product_help.cfpb_product_2023
      cfpb_product_watsonx: recipes.bluebench.product_help.cfpb_product_watsonx
  qa_finance:
    type: Benchmark
    subsets:
      fin_qa: recipes.bluebench.qa_finance.fin_qa
  rag_general:
    type: Benchmark
    subsets:
      rag_response_generation_clapnq: recipes.bluebench.rag_general.rag_response_generation_clapnq
  reasoning:
    type: Benchmark
    subsets:
      hellaswag: recipes.bluebench.reasoning.hellaswag
      openbook_qa: recipes.bluebench.reasoning.openbook_qa
  safety:
    type: Benchmark
    subsets:
      attaq_500: recipes.bluebench.safety.attaq_500
  summarization:
    type: Benchmark
    subsets:
      billsum_document_filtered_to_6000_chars: recipes.bluebench.summarization.billsum_document_filtered_to_6000_chars
      tldr_document_filtered_to_6000_chars: recipes.bluebench.summarization.tldr_document_filtered_to_6000_chars
  translation:
    type: Benchmark
    subsets:
      mt_flores_101_ara_eng: recipes.bluebench.translation.mt_flores_101_ara_eng
      mt_flores_101_deu_eng: recipes.bluebench.translation.mt_flores_101_deu_eng
      mt_flores_101_eng_ara: recipes.bluebench.translation.mt_flores_101_eng_ara
      mt_flores_101_eng_deu: recipes.bluebench.translation.mt_flores_101_eng_deu
      mt_flores_101_eng_fra: recipes.bluebench.translation.mt_flores_101_eng_fra
      mt_flores_101_eng_kor: recipes.bluebench.translation.mt_flores_101_eng_kor
      mt_flores_101_eng_por: recipes.bluebench.translation.mt_flores_101_eng_por
      mt_flores_101_eng_ron: recipes.bluebench.translation.mt_flores_101_eng_ron
      mt_flores_101_eng_spa: recipes.bluebench.translation.mt_flores_101_eng_spa
      mt_flores_101_fra_eng: recipes.bluebench.translation.mt_flores_101_fra_eng
      mt_flores_101_jpn_eng: recipes.bluebench.translation.mt_flores_101_jpn_eng
      mt_flores_101_kor_eng: recipes.bluebench.translation.mt_flores_101_kor_eng
      mt_flores_101_por_eng: recipes.bluebench.translation.mt_flores_101_por_eng
      mt_flores_101_ron_eng: recipes.bluebench.translation.mt_flores_101_ron_eng
      mt_flores_101_spa_eng: recipes.bluebench.translation.mt_flores_101_spa_eng
[source]
References: recipes.bluebench.chatbot_abilities.arena_hard_generation_english_gpt_4_0314_reference, recipes.bluebench.summarization.billsum_document_filtered_to_6000_chars, recipes.bluebench.legal.legalbench_international_citizenship_questions, recipes.bluebench.summarization.tldr_document_filtered_to_6000_chars, recipes.bluebench.legal.legalbench_function_of_decision_section, recipes.bluebench.rag_general.rag_response_generation_clapnq, recipes.bluebench.news_classification.20_newsgroups_short, recipes.bluebench.entity_extraction.universal_ner_en_ewt, recipes.bluebench.legal.legalbench_corporate_lobbying, recipes.bluebench.bias.safety_bbq_physical_appearance, recipes.bluebench.knowledge.mmlu_pro_computer_science, recipes.bluebench.bias.safety_bbq_sexual_orientation, recipes.bluebench.translation.mt_flores_101_deu_eng, recipes.bluebench.translation.mt_flores_101_jpn_eng, recipes.bluebench.translation.mt_flores_101_fra_eng, recipes.bluebench.translation.mt_flores_101_ron_eng, recipes.bluebench.translation.mt_flores_101_kor_eng, recipes.bluebench.translation.mt_flores_101_eng_deu, recipes.bluebench.translation.mt_flores_101_eng_fra, recipes.bluebench.translation.mt_flores_101_eng_ron, recipes.bluebench.product_help.cfpb_product_watsonx, recipes.bluebench.translation.mt_flores_101_por_eng, recipes.bluebench.translation.mt_flores_101_spa_eng, recipes.bluebench.translation.mt_flores_101_eng_por, recipes.bluebench.translation.mt_flores_101_ara_eng, recipes.bluebench.translation.mt_flores_101_eng_kor, recipes.bluebench.translation.mt_flores_101_eng_ara, recipes.bluebench.bias.safety_bbq_disability_status, recipes.bluebench.translation.mt_flores_101_eng_spa, recipes.bluebench.bias.safety_bbq_gender_identity, recipes.bluebench.product_help.cfpb_product_2023, recipes.bluebench.bias.safety_bbq_race_ethnicity, recipes.bluebench.knowledge.mmlu_pro_engineering, recipes.bluebench.knowledge.mmlu_pro_psychology, recipes.bluebench.bias.safety_bbq_race_x_gender,
recipes.bluebench.knowledge.mmlu_pro_philosophy, recipes.bluebench.knowledge.mmlu_pro_economics, recipes.bluebench.legal.legalbench_abercrombie, recipes.bluebench.knowledge.mmlu_pro_chemistry, recipes.bluebench.bias.safety_bbq_nationality, recipes.bluebench.knowledge.mmlu_pro_business, recipes.bluebench.knowledge.mmlu_pro_physics, recipes.bluebench.knowledge.mmlu_pro_history, recipes.bluebench.knowledge.mmlu_pro_biology, recipes.bluebench.bias.safety_bbq_race_x_ses, recipes.bluebench.knowledge.mmlu_pro_health, recipes.bluebench.bias.safety_bbq_religion, recipes.bluebench.knowledge.mmlu_pro_other, recipes.bluebench.knowledge.mmlu_pro_math, recipes.bluebench.knowledge.mmlu_pro_law, recipes.bluebench.reasoning.openbook_qa, recipes.bluebench.legal.legalbench_proa, recipes.bluebench.bias.safety_bbq_age, recipes.bluebench.reasoning.hellaswag, recipes.bluebench.bias.safety_bbq_ses, recipes.bluebench.qa_finance.fin_qa, recipes.bluebench.safety.attaq_500
Read more about catalog usage here.