File size: 2,919 Bytes
d9f31f1 524e209 d9f31f1 524e209 d9f31f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
# EXCLUDED_KEYS = {
# "pretty_env_info",
# "chat_template",
# "group_subtasks",
# }
# EXCLUDED_RESULTS_KEYS = {
# "leaderboard",
# }
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
# "alias",
# }
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
DETAILS_FILENAME = "samples_{subtask}_*.json"
TASKS = {
# "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
"leaderboard_bbh": ("BBH", "leaderboard_bbh"),
"leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
"leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
"leaderboard_math_hard": ("MATH", "leaderboard_math"),
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
}
SUBTASKS = {
# "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
"leaderboard_bbh": [
"leaderboard_bbh_boolean_expressions",
"leaderboard_bbh_causal_judgement",
"leaderboard_bbh_date_understanding",
"leaderboard_bbh_disambiguation_qa",
"leaderboard_bbh_formal_fallacies",
"leaderboard_bbh_geometric_shapes",
"leaderboard_bbh_hyperbaton",
"leaderboard_bbh_logical_deduction_five_objects",
"leaderboard_bbh_logical_deduction_seven_objects",
"leaderboard_bbh_logical_deduction_three_objects",
"leaderboard_bbh_movie_recommendation",
"leaderboard_bbh_navigate",
"leaderboard_bbh_object_counting",
"leaderboard_bbh_penguins_in_a_table",
"leaderboard_bbh_reasoning_about_colored_objects",
"leaderboard_bbh_ruin_names",
"leaderboard_bbh_salient_translation_error_detection",
"leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
"leaderboard_bbh_temporal_sequences",
"leaderboard_bbh_tracking_shuffled_objects_five_objects",
"leaderboard_bbh_tracking_shuffled_objects_seven_objects",
"leaderboard_bbh_tracking_shuffled_objects_three_objects",
"leaderboard_bbh_web_of_lies",
],
"leaderboard_gpqa": [
"leaderboard_gpqa_extended",
"leaderboard_gpqa_diamond",
"leaderboard_gpqa_main",
],
"leaderboard_ifeval": ["leaderboard_ifeval"],
# "leaderboard_math_hard": [
"leaderboard_math": [
"leaderboard_math_algebra_hard",
"leaderboard_math_counting_and_prob_hard",
"leaderboard_math_geometry_hard",
"leaderboard_math_intermediate_algebra_hard",
"leaderboard_math_num_theory_hard",
"leaderboard_math_prealgebra_hard",
"leaderboard_math_precalculus_hard",
],
"leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
"leaderboard_musr": [
"leaderboard_musr_murder_mysteries",
"leaderboard_musr_object_placements",
"leaderboard_musr_team_allocation",
],
}
|