File size: 2,919 Bytes
d9f31f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524e209
d9f31f1
 
 
 
 
 
 
 
524e209
d9f31f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Path-style identifier of the leaderboard results dataset.
# NOTE(review): the "datasets/<org>/<name>" shape looks like a Hugging Face Hub
# repository path — confirm against the code that consumes this constant.
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
# EXCLUDED_KEYS =  {
#     "pretty_env_info",
#     "chat_template",
#     "group_subtasks",
# }
# EXCLUDED_RESULTS_KEYS = {
#     "leaderboard",
# }
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
#     "alias",
# }

# Templates used to locate per-model details data. Both contain "{...}" fields
# that the consumer must fill in (presumably via str.format — TODO confirm):
#   - DETAILS_DATASET_ID has a `model_name_sanitized` field.
#   - DETAILS_FILENAME has a `subtask` field; the "*" is presumably a glob
#     wildcard for a variable part of the file name — verify against the caller.
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
DETAILS_FILENAME = "samples_{subtask}_*.json"
# Benchmark tasks, keyed by task identifier.
# Each value is a 2-tuple of strings. The first element looks like a short
# display label; the second element matches a key of SUBTASKS. The dict key and
# the second element are identical for every task except
# "leaderboard_math_hard", whose second element is "leaderboard_math".
# NOTE(review): how the two tuple fields are consumed is not visible in this
# file — confirm against the callers before changing either identifier.
TASKS = {
    # "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
}
# Subtask identifiers for each task group, in the order they are listed here.
# The keys are the second tuple elements of TASKS, so the math group is keyed
# "leaderboard_math" even though its TASKS key is "leaderboard_math_hard".
# Groups without finer-grained subtasks list themselves as their only entry.
SUBTASKS = {
    # "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],

    # BBH: 24 subtasks.
    "leaderboard_bbh": [
        "leaderboard_bbh_boolean_expressions",
        "leaderboard_bbh_causal_judgement",
        "leaderboard_bbh_date_understanding",
        "leaderboard_bbh_disambiguation_qa",
        "leaderboard_bbh_formal_fallacies",
        "leaderboard_bbh_geometric_shapes",
        "leaderboard_bbh_hyperbaton",
        "leaderboard_bbh_logical_deduction_five_objects",
        "leaderboard_bbh_logical_deduction_seven_objects",
        "leaderboard_bbh_logical_deduction_three_objects",
        "leaderboard_bbh_movie_recommendation",
        "leaderboard_bbh_navigate",
        "leaderboard_bbh_object_counting",
        "leaderboard_bbh_penguins_in_a_table",
        "leaderboard_bbh_reasoning_about_colored_objects",
        "leaderboard_bbh_ruin_names",
        "leaderboard_bbh_salient_translation_error_detection",
        "leaderboard_bbh_snarks",
        "leaderboard_bbh_sports_understanding",
        "leaderboard_bbh_temporal_sequences",
        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
        "leaderboard_bbh_web_of_lies",
    ],

    # GPQA: three subsets.
    "leaderboard_gpqa": [
        "leaderboard_gpqa_extended",
        "leaderboard_gpqa_diamond",
        "leaderboard_gpqa_main",
    ],

    # IFEval: the group is its own single subtask.
    "leaderboard_ifeval": ["leaderboard_ifeval"],

    # MATH: seven "_hard" subtasks.
    # "leaderboard_math_hard": [
    "leaderboard_math": [
        "leaderboard_math_algebra_hard",
        "leaderboard_math_counting_and_prob_hard",
        "leaderboard_math_geometry_hard",
        "leaderboard_math_intermediate_algebra_hard",
        "leaderboard_math_num_theory_hard",
        "leaderboard_math_prealgebra_hard",
        "leaderboard_math_precalculus_hard",
    ],

    # MMLU-Pro: the group is its own single subtask.
    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],

    # MuSR: three subtasks.
    "leaderboard_musr": [
        "leaderboard_musr_murder_mysteries",
        "leaderboard_musr_object_placements",
        "leaderboard_musr_team_allocation",
    ],
}