T145 committed
Commit 8323cf6 · 1 parent: 4e2bd19

Updated datasets

Files changed (1):
  1. functions.py (+21, -23)
functions.py CHANGED
@@ -1,4 +1,3 @@
-
 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
@@ -40,34 +39,34 @@ def get_query_url(repo):
 def get_task_summary(results):
     return {
         "IFEval": {
-            "dataset_type": "HuggingFaceH4/ifeval",
+            "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
             "metric_value": round(results["IFEval"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
+            "dataset_config": None,
+            "dataset_split": "train",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
-            "metric_name": "strict accuracy",
+            "metric_name": "averaged accuracy",
         },
         "BBH": {
-            "dataset_type": "BBH",
+            "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["BBH"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
+            "dataset_config": None,
+            "dataset_split": "test",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
         "MATH Lvl 5": {
-            "dataset_type": "hendrycks/competition_math",
+            "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
             "metric_value": round(results["MATH Lvl 5"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
+            "dataset_config": None,
+            "dataset_split": "test",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
@@ -77,8 +76,8 @@ def get_task_summary(results):
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["GPQA"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
+            "dataset_config": None,
+            "dataset_split": "train",
             "dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
@@ -88,8 +87,8 @@ def get_task_summary(results):
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
             "metric_value": round(results["MUSR"], 2),
-            "dataset_config": None,  # don't know
-            "dataset_split": None,  # don't know
+            "dataset_config": None,
+            "dataset_split": None,  # three test splits
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
@@ -109,18 +108,17 @@ def get_task_summary(results):
 def get_eval_results(df, repo):
     results = search(df, repo)
     task_summary = get_task_summary(results)
-    md_writer = MarkdownTableWriter()
-    md_writer.headers = ["Metric", "% Value"]
-    md_writer.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
+    table = MarkdownTableWriter()
+    table.headers = ["Metric", "% Value"]
+    table.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]
 
-    text = f"""
-# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
+    text = f"""# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
 
 Detailed results can be found [here]({get_details_url(repo)})!
 Summarized results can be found [here]({get_contents_url(repo)})!
 
-{md_writer.dumps()}
+{table.dumps()}
 """
     return text
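For context: the renamed `table` writer above comes from pytablewriter. A minimal, self-contained sketch of what `get_eval_results` builds, with made-up scores standing in for real leaderboard results:

```python
# Standalone sketch of the table construction in get_eval_results.
# The scores below are invented for illustration only.
from pytablewriter import MarkdownTableWriter

results = {"Average ⬆️": 31.74, "IFEval": 71.13, "BBH": 34.01}

table = MarkdownTableWriter()
table.headers = ["Metric", "% Value"]
table.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
    [name, round(value, 2)] for name, value in results.items() if name != "Average ⬆️"
]

print(table.dumps())  # dumps() returns the rendered markdown table as a string
```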
 
@@ -175,8 +173,8 @@ def commit(
     if repo.startswith("https://huggingface.co/"):
         try:
             repo = RepoUrl(repo).repo_id
-        except Exception:
-            raise gr.Error(f"Not a valid repo id: {str(repo)}")
+        except Exception as e:
+            raise gr.Error(f"Not a valid repo id: {str(repo)}") from e
 
     edited = {"revision": f"refs/pr/{pr_number}"} if pr_number else {"create_pr": True}
 
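The `from e` addition chains the original parsing failure onto the user-facing error, so the traceback shows the root cause instead of swallowing it. A small sketch of the same pattern, with `ValueError` standing in for `gr.Error` so it runs without Gradio:

```python
# Sketch of exception chaining as used above; ValueError replaces gr.Error.
from huggingface_hub import RepoUrl

def parse_repo_id(url: str) -> str:
    try:
        return RepoUrl(url).repo_id
    except Exception as e:
        # "raise ... from e" stores the original exception in __cause__,
        # so the full failure chain appears in the traceback.
        raise ValueError(f"Not a valid repo id: {url}") from e
```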
 
 
 
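Taken together, the per-task dicts carry the fields a model card's `model-index` results section expects. A hypothetical mapping for one task entry; the helper and the assumed `text-generation` task type are illustrative and not part of this repo:

```python
# Hypothetical helper: reshape one task summary into a model-index style
# result entry. Field names follow Hub card-metadata conventions; this
# function does not exist in functions.py.
def to_model_index_result(task: dict) -> dict:
    return {
        "task": {"type": "text-generation"},  # assumed task type
        "dataset": {
            "type": task["dataset_type"],    # e.g. "wis-k/instruction-following-eval"
            "name": task["dataset_name"],    # e.g. "IFEval (0-Shot)"
            "config": task["dataset_config"],
            "split": task["dataset_split"],  # "train" for IFEval after this commit
            "args": task["dataset_args"],
        },
        "metrics": [{
            "type": task["metric_type"],
            "value": task["metric_value"],
            "name": task["metric_name"],
        }],
    }
```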