{ "results": { "mmlu": { "acc,none": 0.47172767412049565, "acc_stderr,none": 0.12050494358253985, "alias": "mmlu" }, "mmlu_humanities": { "alias": " - humanities", "acc,none": 0.4401700318809777, "acc_stderr,none": 0.13537651956030122 }, "mmlu_formal_logic": { "alias": " - formal_logic", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04006168083848879 }, "mmlu_high_school_european_history": { "alias": " - high_school_european_history", "acc,none": 0.5878787878787879, "acc_stderr,none": 0.03843566993588717 }, "mmlu_high_school_us_history": { "alias": " - high_school_us_history", "acc,none": 0.6519607843137255, "acc_stderr,none": 0.03343311240488418 }, "mmlu_high_school_world_history": { "alias": " - high_school_world_history", "acc,none": 0.6751054852320675, "acc_stderr,none": 0.030486039389105307 }, "mmlu_international_law": { "alias": " - international_law", "acc,none": 0.6694214876033058, "acc_stderr,none": 0.04294340845212095 }, "mmlu_jurisprudence": { "alias": " - jurisprudence", "acc,none": 0.5648148148148148, "acc_stderr,none": 0.04792898170907061 }, "mmlu_logical_fallacies": { "alias": " - logical_fallacies", "acc,none": 0.5705521472392638, "acc_stderr,none": 0.03889066619112722 }, "mmlu_moral_disputes": { "alias": " - moral_disputes", "acc,none": 0.5433526011560693, "acc_stderr,none": 0.026817718130348916 }, "mmlu_moral_scenarios": { "alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808838 }, "mmlu_philosophy": { "alias": " - philosophy", "acc,none": 0.5273311897106109, "acc_stderr,none": 0.02835563356832818 }, "mmlu_prehistory": { "alias": " - prehistory", "acc,none": 0.5648148148148148, "acc_stderr,none": 0.02758600622160773 }, "mmlu_professional_law": { "alias": " - professional_law", "acc,none": 0.35071707953063885, "acc_stderr,none": 0.012187773370741522 }, "mmlu_world_religions": { "alias": " - world_religions", "acc,none": 0.6842105263157895, "acc_stderr,none": 0.035650796707083106 }, "mmlu_other": { "alias": " - other", "acc,none": 0.5329900225297715, "acc_stderr,none": 0.09928206618537083 }, "mmlu_business_ethics": { "alias": " - business_ethics", "acc,none": 0.55, "acc_stderr,none": 0.04999999999999999 }, "mmlu_clinical_knowledge": { "alias": " - clinical_knowledge", "acc,none": 0.4679245283018868, "acc_stderr,none": 0.03070948699255655 }, "mmlu_college_medicine": { "alias": " - college_medicine", "acc,none": 0.44508670520231214, "acc_stderr,none": 0.03789401760283648 }, "mmlu_global_facts": { "alias": " - global_facts", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605 }, "mmlu_human_aging": { "alias": " - human_aging", "acc,none": 0.5515695067264574, "acc_stderr,none": 0.033378837362550984 }, "mmlu_management": { "alias": " - management", "acc,none": 0.5728155339805825, "acc_stderr,none": 0.04897957737781168 }, "mmlu_marketing": { "alias": " - marketing", "acc,none": 0.7264957264957265, "acc_stderr,none": 0.029202540153431177 }, "mmlu_medical_genetics": { "alias": " - medical_genetics", "acc,none": 0.53, "acc_stderr,none": 0.050161355804659205 }, "mmlu_miscellaneous": { "alias": " - miscellaneous", "acc,none": 0.6666666666666666, "acc_stderr,none": 0.01685739124747255 }, "mmlu_nutrition": { "alias": " - nutrition", "acc,none": 0.5130718954248366, "acc_stderr,none": 0.028620130800700246 }, "mmlu_professional_accounting": { "alias": " - professional_accounting", "acc,none": 0.3723404255319149, "acc_stderr,none": 0.028838921471251458 }, "mmlu_professional_medicine": { "alias": " - professional_medicine", "acc,none": 0.4007352941176471, "acc_stderr,none": 0.029768263528933105 }, "mmlu_virology": { "alias": " - virology", "acc,none": 0.41566265060240964, "acc_stderr,none": 0.03836722176598053 }, "mmlu_social_sciences": { "alias": " - social_sciences", "acc,none": 0.5368865778355542, "acc_stderr,none": 0.1040064106772047 }, "mmlu_econometrics": { "alias": " - econometrics", "acc,none": 0.2543859649122807, "acc_stderr,none": 0.040969851398436695 }, "mmlu_high_school_geography": { "alias": " - high_school_geography", "acc,none": 0.6060606060606061, "acc_stderr,none": 0.034812853382329624 }, "mmlu_high_school_government_and_politics": { "alias": " - high_school_government_and_politics", "acc,none": 0.6683937823834197, "acc_stderr,none": 0.03397636541089118 }, "mmlu_high_school_macroeconomics": { "alias": " - high_school_macroeconomics", "acc,none": 0.358974358974359, "acc_stderr,none": 0.024321738484602354 }, "mmlu_high_school_microeconomics": { "alias": " - high_school_microeconomics", "acc,none": 0.4411764705882353, "acc_stderr,none": 0.0322529423239964 }, "mmlu_high_school_psychology": { "alias": " - high_school_psychology", "acc,none": 0.6146788990825688, "acc_stderr,none": 0.020865850852794122 }, "mmlu_human_sexuality": { "alias": " - human_sexuality", "acc,none": 0.6030534351145038, "acc_stderr,none": 0.04291135671009224 }, "mmlu_professional_psychology": { "alias": " - professional_psychology", "acc,none": 0.47875816993464054, "acc_stderr,none": 0.020209572388600255 }, "mmlu_public_relations": { "alias": " - public_relations", "acc,none": 0.5909090909090909, "acc_stderr,none": 0.04709306978661896 }, "mmlu_security_studies": { "alias": " - security_studies", "acc,none": 0.6081632653061224, "acc_stderr,none": 0.03125127591089164 }, "mmlu_sociology": { "alias": " - sociology", "acc,none": 0.6716417910447762, "acc_stderr,none": 0.033206858897443244 }, "mmlu_us_foreign_policy": { "alias": " - us_foreign_policy", "acc,none": 0.73, "acc_stderr,none": 0.0446196043338474 }, "mmlu_stem": { "alias": " - stem", "acc,none": 0.3948620361560419, "acc_stderr,none": 0.09664316035329332 }, "mmlu_abstract_algebra": { "alias": " - abstract_algebra", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127 }, "mmlu_anatomy": { "alias": " - anatomy", "acc,none": 0.4888888888888889, "acc_stderr,none": 0.04318275491977976 }, "mmlu_astronomy": { "alias": " - astronomy", "acc,none": 0.506578947368421, "acc_stderr,none": 0.04068590050224971 }, "mmlu_college_biology": { "alias": " - college_biology", "acc,none": 0.4583333333333333, "acc_stderr,none": 0.04166666666666665 }, "mmlu_college_chemistry": { "alias": " - college_chemistry", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695235 }, "mmlu_college_computer_science": { "alias": " - college_computer_science", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316 }, "mmlu_college_mathematics": { "alias": " - college_mathematics", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102 }, "mmlu_college_physics": { "alias": " - college_physics", "acc,none": 0.30392156862745096, "acc_stderr,none": 0.045766654032077636 }, "mmlu_computer_security": { "alias": " - computer_security", "acc,none": 0.58, "acc_stderr,none": 0.049604496374885836 }, "mmlu_conceptual_physics": { "alias": " - conceptual_physics", "acc,none": 0.33191489361702126, "acc_stderr,none": 0.03078373675774564 }, "mmlu_electrical_engineering": { "alias": " - electrical_engineering", "acc,none": 0.5517241379310345, "acc_stderr,none": 0.04144311810878151 }, "mmlu_elementary_mathematics": { "alias": " - elementary_mathematics", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.023809523809523867 }, "mmlu_high_school_biology": { "alias": " - high_school_biology", "acc,none": 0.5258064516129032, "acc_stderr,none": 0.02840609505765332 }, "mmlu_high_school_chemistry": { "alias": " - high_school_chemistry", "acc,none": 0.3793103448275862, "acc_stderr,none": 0.034139638059062345 }, "mmlu_high_school_computer_science": { "alias": " - high_school_computer_science", "acc,none": 0.57, "acc_stderr,none": 0.04975698519562428 }, "mmlu_high_school_mathematics": { "alias": " - high_school_mathematics", "acc,none": 0.29259259259259257, "acc_stderr,none": 0.027738969632176088 }, "mmlu_high_school_physics": { "alias": " - high_school_physics", "acc,none": 0.33774834437086093, "acc_stderr,none": 0.03861557546255168 }, "mmlu_high_school_statistics": { "alias": " - high_school_statistics", "acc,none": 0.35185185185185186, "acc_stderr,none": 0.03256850570293647 }, "mmlu_machine_learning": { "alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044 } }, "groups": { "mmlu": { "acc,none": 0.47172767412049565, "acc_stderr,none": 0.12050494358253985, "alias": "mmlu" }, "mmlu_humanities": { "alias": " - humanities", "acc,none": 0.4401700318809777, "acc_stderr,none": 0.13537651956030122 }, "mmlu_other": { "alias": " - other", "acc,none": 0.5329900225297715, "acc_stderr,none": 0.09928206618537083 }, "mmlu_social_sciences": { "alias": " - social_sciences", "acc,none": 0.5368865778355542, "acc_stderr,none": 0.1040064106772047 }, "mmlu_stem": { "alias": " - stem", "acc,none": 0.3948620361560419, "acc_stderr,none": 0.09664316035329332 } }, "configs": { "mmlu_abstract_algebra": { "task": "mmlu_abstract_algebra", "task_alias": "abstract_algebra", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "abstract_algebra", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_anatomy": { "task": "mmlu_anatomy", "task_alias": "anatomy", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "anatomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_astronomy": { "task": "mmlu_astronomy", "task_alias": "astronomy", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "astronomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_business_ethics": { "task": "mmlu_business_ethics", "task_alias": "business_ethics", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "business_ethics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_clinical_knowledge": { "task": "mmlu_clinical_knowledge", "task_alias": "clinical_knowledge", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "clinical_knowledge", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_biology": { "task": "mmlu_college_biology", "task_alias": "college_biology", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_chemistry": { "task": "mmlu_college_chemistry", "task_alias": "college_chemistry", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_computer_science": { "task": "mmlu_college_computer_science", "task_alias": "college_computer_science", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_mathematics": { "task": "mmlu_college_mathematics", "task_alias": "college_mathematics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_medicine": { "task": "mmlu_college_medicine", "task_alias": "college_medicine", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_college_physics": { "task": "mmlu_college_physics", "task_alias": "college_physics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "college_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_computer_security": { "task": "mmlu_computer_security", "task_alias": "computer_security", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "computer_security", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about computer security.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_conceptual_physics": { "task": "mmlu_conceptual_physics", "task_alias": "conceptual_physics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "conceptual_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_econometrics": { "task": "mmlu_econometrics", "task_alias": "econometrics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "econometrics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_electrical_engineering": { "task": "mmlu_electrical_engineering", "task_alias": "electrical_engineering", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "electrical_engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_elementary_mathematics": { "task": "mmlu_elementary_mathematics", "task_alias": "elementary_mathematics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "elementary_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_formal_logic": { "task": "mmlu_formal_logic", "task_alias": "formal_logic", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "formal_logic", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_global_facts": { "task": "mmlu_global_facts", "task_alias": "global_facts", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "global_facts", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about global facts.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_biology": { "task": "mmlu_high_school_biology", "task_alias": "high_school_biology", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_chemistry": { "task": "mmlu_high_school_chemistry", "task_alias": "high_school_chemistry", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_computer_science": { "task": "mmlu_high_school_computer_science", "task_alias": "high_school_computer_science", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_european_history": { "task": "mmlu_high_school_european_history", "task_alias": "high_school_european_history", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_european_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_geography": { "task": "mmlu_high_school_geography", "task_alias": "high_school_geography", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_geography", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_government_and_politics": { "task": "mmlu_high_school_government_and_politics", "task_alias": "high_school_government_and_politics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_government_and_politics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_macroeconomics": { "task": "mmlu_high_school_macroeconomics", "task_alias": "high_school_macroeconomics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_macroeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_mathematics": { "task": "mmlu_high_school_mathematics", "task_alias": "high_school_mathematics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_microeconomics": { "task": "mmlu_high_school_microeconomics", "task_alias": "high_school_microeconomics", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_microeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_physics": { "task": "mmlu_high_school_physics", "task_alias": "high_school_physics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_psychology": { "task": "mmlu_high_school_psychology", "task_alias": "high_school_psychology", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_statistics": { "task": "mmlu_high_school_statistics", "task_alias": "high_school_statistics", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_statistics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_us_history": { "task": "mmlu_high_school_us_history", "task_alias": "high_school_us_history", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_us_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_high_school_world_history": { "task": "mmlu_high_school_world_history", "task_alias": "high_school_world_history", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "high_school_world_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_human_aging": { "task": "mmlu_human_aging", "task_alias": "human_aging", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "human_aging", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human aging.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_human_sexuality": { "task": "mmlu_human_sexuality", "task_alias": "human_sexuality", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "human_sexuality", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_international_law": { "task": "mmlu_international_law", "task_alias": "international_law", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "international_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about international law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_jurisprudence": { "task": "mmlu_jurisprudence", "task_alias": "jurisprudence", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "jurisprudence", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_logical_fallacies": { "task": "mmlu_logical_fallacies", "task_alias": "logical_fallacies", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "logical_fallacies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_machine_learning": { "task": "mmlu_machine_learning", "task_alias": "machine_learning", "group": "mmlu_stem", "group_alias": "stem", "dataset_path": "hails/mmlu_no_train", "dataset_name": "machine_learning", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_management": { "task": "mmlu_management", "task_alias": "management", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "management", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about management.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_marketing": { "task": "mmlu_marketing", "task_alias": "marketing", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "marketing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about marketing.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_medical_genetics": { "task": "mmlu_medical_genetics", "task_alias": "medical_genetics", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "medical_genetics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_miscellaneous": { "task": "mmlu_miscellaneous", "task_alias": "miscellaneous", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "miscellaneous", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_moral_disputes": { "task": "mmlu_moral_disputes", "task_alias": "moral_disputes", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "moral_disputes", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_moral_scenarios": { "task": "mmlu_moral_scenarios", "task_alias": "moral_scenarios", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "moral_scenarios", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_nutrition": { "task": "mmlu_nutrition", "task_alias": "nutrition", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "nutrition", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_philosophy": { "task": "mmlu_philosophy", "task_alias": "philosophy", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "philosophy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_prehistory": { "task": "mmlu_prehistory", "task_alias": "prehistory", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "prehistory", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_accounting": { "task": "mmlu_professional_accounting", "task_alias": "professional_accounting", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_accounting", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_law": { "task": "mmlu_professional_law", "task_alias": "professional_law", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_medicine": { "task": "mmlu_professional_medicine", "task_alias": "professional_medicine", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_professional_psychology": { "task": "mmlu_professional_psychology", "task_alias": "professional_psychology", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "professional_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_public_relations": { "task": "mmlu_public_relations", "task_alias": "public_relations", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "public_relations", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about public relations.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_security_studies": { "task": "mmlu_security_studies", "task_alias": "security_studies", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "security_studies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about security studies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_sociology": { "task": "mmlu_sociology", "task_alias": "sociology", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "sociology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about sociology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_us_foreign_policy": { "task": "mmlu_us_foreign_policy", "task_alias": "us_foreign_policy", "group": "mmlu_social_sciences", "group_alias": "social_sciences", "dataset_path": "hails/mmlu_no_train", "dataset_name": "us_foreign_policy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_virology": { "task": "mmlu_virology", "task_alias": "virology", "group": "mmlu_other", "group_alias": "other", "dataset_path": "hails/mmlu_no_train", "dataset_name": "virology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about virology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } }, "mmlu_world_religions": { "task": "mmlu_world_religions", "task_alias": "world_religions", "group": "mmlu_humanities", "group_alias": "humanities", "dataset_path": "hails/mmlu_no_train", "dataset_name": "world_religions", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about world religions.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 0.0 } } }, "versions": { "mmlu": "N/A", "mmlu_abstract_algebra": "Yaml", "mmlu_anatomy": "Yaml", "mmlu_astronomy": "Yaml", "mmlu_business_ethics": "Yaml", "mmlu_clinical_knowledge": "Yaml", "mmlu_college_biology": "Yaml", "mmlu_college_chemistry": "Yaml", "mmlu_college_computer_science": "Yaml", "mmlu_college_mathematics": "Yaml", "mmlu_college_medicine": "Yaml", "mmlu_college_physics": "Yaml", "mmlu_computer_security": "Yaml", "mmlu_conceptual_physics": "Yaml", "mmlu_econometrics": "Yaml", "mmlu_electrical_engineering": "Yaml", "mmlu_elementary_mathematics": "Yaml", "mmlu_formal_logic": "Yaml", "mmlu_global_facts": "Yaml", "mmlu_high_school_biology": "Yaml", "mmlu_high_school_chemistry": "Yaml", "mmlu_high_school_computer_science": "Yaml", "mmlu_high_school_european_history": "Yaml", "mmlu_high_school_geography": "Yaml", "mmlu_high_school_government_and_politics": "Yaml", "mmlu_high_school_macroeconomics": "Yaml", "mmlu_high_school_mathematics": "Yaml", "mmlu_high_school_microeconomics": "Yaml", "mmlu_high_school_physics": "Yaml", "mmlu_high_school_psychology": "Yaml", "mmlu_high_school_statistics": "Yaml", "mmlu_high_school_us_history": "Yaml", "mmlu_high_school_world_history": "Yaml", "mmlu_human_aging": "Yaml", "mmlu_human_sexuality": "Yaml", "mmlu_humanities": "N/A", "mmlu_international_law": "Yaml", "mmlu_jurisprudence": "Yaml", "mmlu_logical_fallacies": "Yaml", "mmlu_machine_learning": "Yaml", "mmlu_management": "Yaml", "mmlu_marketing": "Yaml", "mmlu_medical_genetics": "Yaml", "mmlu_miscellaneous": "Yaml", "mmlu_moral_disputes": "Yaml", "mmlu_moral_scenarios": "Yaml", "mmlu_nutrition": "Yaml", "mmlu_other": "N/A", "mmlu_philosophy": "Yaml", "mmlu_prehistory": "Yaml", "mmlu_professional_accounting": "Yaml", "mmlu_professional_law": "Yaml", "mmlu_professional_medicine": "Yaml", "mmlu_professional_psychology": "Yaml", "mmlu_public_relations": "Yaml", "mmlu_security_studies": "Yaml", "mmlu_social_sciences": "N/A", "mmlu_sociology": "Yaml", "mmlu_stem": "N/A", "mmlu_us_foreign_policy": "Yaml", "mmlu_virology": "Yaml", "mmlu_world_religions": "Yaml" }, "n-shot": { "mmlu": 0, "mmlu_abstract_algebra": 0, "mmlu_anatomy": 0, "mmlu_astronomy": 0, "mmlu_business_ethics": 0, "mmlu_clinical_knowledge": 0, "mmlu_college_biology": 0, "mmlu_college_chemistry": 0, "mmlu_college_computer_science": 0, "mmlu_college_mathematics": 0, "mmlu_college_medicine": 0, "mmlu_college_physics": 0, "mmlu_computer_security": 0, "mmlu_conceptual_physics": 0, "mmlu_econometrics": 0, "mmlu_electrical_engineering": 0, "mmlu_elementary_mathematics": 0, "mmlu_formal_logic": 0, "mmlu_global_facts": 0, "mmlu_high_school_biology": 0, "mmlu_high_school_chemistry": 0, "mmlu_high_school_computer_science": 0, "mmlu_high_school_european_history": 0, "mmlu_high_school_geography": 0, "mmlu_high_school_government_and_politics": 0, "mmlu_high_school_macroeconomics": 0, "mmlu_high_school_mathematics": 0, "mmlu_high_school_microeconomics": 0, "mmlu_high_school_physics": 0, "mmlu_high_school_psychology": 0, "mmlu_high_school_statistics": 0, "mmlu_high_school_us_history": 0, "mmlu_high_school_world_history": 0, "mmlu_human_aging": 0, "mmlu_human_sexuality": 0, "mmlu_humanities": 0, "mmlu_international_law": 0, "mmlu_jurisprudence": 0, "mmlu_logical_fallacies": 0, "mmlu_machine_learning": 0, "mmlu_management": 0, "mmlu_marketing": 0, "mmlu_medical_genetics": 0, "mmlu_miscellaneous": 0, "mmlu_moral_disputes": 0, "mmlu_moral_scenarios": 0, "mmlu_nutrition": 0, "mmlu_other": 0, "mmlu_philosophy": 0, "mmlu_prehistory": 0, "mmlu_professional_accounting": 0, "mmlu_professional_law": 0, "mmlu_professional_medicine": 0, "mmlu_professional_psychology": 0, "mmlu_public_relations": 0, "mmlu_security_studies": 0, "mmlu_social_sciences": 0, "mmlu_sociology": 0, "mmlu_stem": 0, "mmlu_us_foreign_policy": 0, "mmlu_virology": 0, "mmlu_world_religions": 0 }, "config": { "model": "hf", "model_args": "pretrained=baichuan-inc/Baichuan2-7B-Base,trust_remote_code=True,load_in_4bit=True,peft=./out/lora/p12", "batch_size": "16", "batch_sizes": [], "device": "cuda:0", "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "dd6c6de" }