T145 committed on
Commit
88ed67a
·
1 Parent(s): 52a5d9f

Round results

Files changed (1)
  1. functions.py +13 -8
functions.py CHANGED
@@ -26,6 +26,10 @@ def get_details_url(repo):
     return f"https://huggingface.co/datasets/open-llm-leaderboard/{author}__{model}-details"
 
 
+def get_contents_url(repo):
+    return f"https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q={repo}"
+
+
 def get_query_url(repo):
     return f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query={repo}"
 
@@ -36,7 +40,7 @@ def get_task_summary(results):
             "dataset_type": "HuggingFaceH4/ifeval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value": results["IFEval"],
+            "metric_value": round(results["IFEval"], 2),
             "dataset_config": None, # don't know
             "dataset_split": None, # don't know
             "dataset_revision": None,
@@ -47,7 +51,7 @@ def get_task_summary(results):
             "dataset_type": "BBH",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["BBH"],
+            "metric_value": round(results["BBH"], 2),
             "dataset_config": None, # don't know
             "dataset_split": None, # don't know
             "dataset_revision": None,
@@ -58,7 +62,7 @@ def get_task_summary(results):
             "dataset_type": "hendrycks/competition_math",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
-            "metric_value": results["MATH Lvl 5"],
+            "metric_value": round(results["MATH Lvl 5"], 2),
             "dataset_config": None, # don't know
             "dataset_split": None, # don't know
             "dataset_revision": None,
@@ -69,7 +73,7 @@ def get_task_summary(results):
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["GPQA"],
+            "metric_value": round(results["GPQA"], 2),
             "dataset_config": None, # don't know
             "dataset_split": None, # don't know
             "dataset_revision": None,
@@ -80,7 +84,7 @@ def get_task_summary(results):
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["MUSR"],
+            "metric_value": round(results["MUSR"], 2),
             "dataset_config": None, # don't know
             "dataset_split": None, # don't know
             "dataset_args": {"num_few_shot": 0},
@@ -90,7 +94,7 @@ def get_task_summary(results):
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value": results["MMLU-PRO"],
+            "metric_value": round(results["MMLU-PRO"], 2),
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
@@ -104,13 +108,14 @@ def get_eval_results(df, repo):
     task_summary = get_task_summary(results)
     md_writer = MarkdownTableWriter()
     md_writer.headers = ["Metric", "Value"]
-    md_writer.value_matrix = [["Avg.", results["Average ⬆️"]]] + [
+    md_writer.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]
 
     text = f"""
 # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
-Detailed results can be found [here]({get_details_url(repo)})
+Detailed results can be found [here]({get_details_url(repo)})!
+Summarized results can be found [here]({get_contents_url(repo)})!
 
 {md_writer.dumps()}
 """