pminervini committed on
Commit 7d1a89f
1 Parent(s): 9a14053
src/display/css_html_js.py CHANGED
@@ -34,7 +34,7 @@ custom_css = """
 }
 
 #leaderboard-table table td:first-child {
-    text-align: right;
+    text-align: right !important;
 }
 
 #leaderboard-table-lite {
src/display/utils.py CHANGED
@@ -16,14 +16,6 @@ class Task:
 
 
 class Tasks(Enum):
-    # arc = Task("arc:challenge", "acc_norm", "ARC")
-    # hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    # mmlu = Task("hendrycksTest", "acc", "MMLU")
-    # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    # winogrande = Task("winogrande", "acc", "Winogrande")
-    # gsm8k = Task("gsm8k", "acc", "GSM8K")
-    # drop = Task("drop", "f1", "DROP")
-
     nqopen = Task("nq8", "em", "NQ Open/EM")
     triviaqa = Task("tqa8", "em", "TriviaQA/EM")
 
@@ -40,7 +32,7 @@ class Tasks(Enum):
     cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
 
     race = Task("race", "acc", "RACE/Acc")
-    # squadv2 = Task("squadv2", "exact_normalised", "SQUaDv2/EM")
+    squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")
 
     memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
     ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
@@ -69,10 +61,13 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
+
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
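For reference, a minimal self-contained sketch of how a Tasks entry such as the new squadv2 one is consumed downstream. The Task field names (benchmark, metric, col_name) are inferred from how Task values are accessed elsewhere in this commit; the snippet is illustrative, not the repo module itself.

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key expected under data["results"], e.g. "squadv2"
    metric: str     # metric read from that benchmark's entry, e.g. "exact"
    col_name: str   # leaderboard column header, e.g. "SQUaDv2/EM"


class Tasks(Enum):
    squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")


# Iterating the enum, as the auto_eval_column_dict loop does:
for task in Tasks:
    print(task.name, task.value.benchmark, task.value.metric, task.value.col_name)
    # -> squadv2 squadv2 exact SQUaDv2/EM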
src/leaderboard/read_evals.py CHANGED
@@ -4,7 +4,7 @@ import os
 from dataclasses import dataclass
 
 import dateutil
-import numpy as np
+# import numpy as np
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
@@ -22,11 +22,11 @@ def is_float(string):
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
     org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
@@ -72,55 +72,41 @@ class EvalResult:
         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
-        results = {}
-
-        task_iterator = Tasks
-        if is_backend is True:
-            from src.backend.envs import Tasks as BackendTasks
-            task_iterator = BackendTasks
-
-        for task in task_iterator:
-            task = task.value
-
-            def post_process_results(results: dict) -> dict:
-                # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
-                res_copy = results.copy()
-
-                for task_name in res_copy.keys():
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "exact_match" in k:
-                            results[task_name][k.replace("exact_match", "em")] = v
-                        if "squadv2" in task_name:
-                            value = results[task_name][k]
-                            if is_float(value) and 'normalised' not in k:
-                                results[task_name][f"{k}_normalised"] = value / 100.0
-                        else:
-                            del results[task_name][k]
-
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "," in k:
-                            tokens = k.split(",")
-                            results[task_name][tokens[0]] = v
-
-                return results
-
-            accs = np.array([v.get(task.metric, None) for k, v in post_process_results(data["results"]).items() if task.benchmark in k])
-
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            # print(accs)
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
-
-        return EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
-                          precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
-                          architecture=architecture)
+
+        # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
+
+        results = {}
+        for benchmark, benchmark_results in data['results'].items():
+            if benchmark not in results:
+                results[benchmark] = {}
+
+            for metric, value in benchmark_results.items():
+                to_add = True
+                if '_stderr' in metric:
+                    to_add = False
+                if 'alias' in metric:
+                    to_add = False
+
+                if ',' in metric:
+                    metric = metric.split(',')[0]
+                metric = metric.replace("exact_match", "em")
+
+                if to_add is True:
+                    multiplier = 100.0
+                    if 'rouge' in metric:
+                        multiplier = 1.0
+                    if 'squad' in benchmark:
+                        multiplier = 1.0
+
+                    # print('RESULTS', data['results'])
+                    # print('XXX', benchmark, metric, value, multiplier)
+                    results[benchmark][metric] = value * multiplier
+
+        res = EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
+                         precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
+                         architecture=architecture)
+
+        return res
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
@@ -129,6 +115,7 @@ class EvalResult:
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
+
             self.model_type = ModelType.from_str(request.get("model_type", ""))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
@@ -146,7 +133,10 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
+        # breakpoint()
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -157,7 +147,7 @@ class EvalResult:
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            # AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -165,7 +155,7 @@ class EvalResult:
         }
 
         for task in Tasks:
-            if task.value.benchmark in self.results: # XXX
+            if task.value.benchmark in self.results:
                 data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
@@ -182,14 +172,11 @@ def get_request_file_for_model(requests_path, model_name, precision):
     # Select correct request file (precision)
     request_file = ""
    request_files = sorted(request_files, reverse=True)
-    # print('XXX', request_files)
+
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                # req_content["status"] in ["FINISHED", "RUNNING"] and
-                req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
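As a sanity check on the new per-benchmark loop in init_from_json_file, here is a standalone sketch that applies the same filtering and scaling rules to a payload shaped like the data['results'] comment above. The helper name post_process and the printed value are illustrative only, not part of the repo.

def post_process(raw_results: dict) -> dict:
    # Mirrors the rewritten loop: drop *_stderr and alias entries, keep only the
    # part of the metric name before any comma, rename exact_match -> em, and
    # scale fractional scores to percentages (rouge and squadv2 are left as-is).
    results = {}
    for benchmark, benchmark_results in raw_results.items():
        results.setdefault(benchmark, {})
        for metric, value in benchmark_results.items():
            if '_stderr' in metric or 'alias' in metric:
                continue
            if ',' in metric:
                metric = metric.split(',')[0]
            metric = metric.replace("exact_match", "em")
            multiplier = 1.0 if ('rouge' in metric or 'squad' in benchmark) else 100.0
            results[benchmark][metric] = value * multiplier
    return results


print(post_process({'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}))
# -> {'nq_open': {'em': 24.29...}}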
src/populate.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 
+import copy
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -8,24 +9,58 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
 
+from src.backend.envs import Tasks as BackendTasks
+from src.display.utils import Tasks
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list[EvalResult], pd.DataFrame]:
+
+def get_leaderboard_df(results_path: str,
+                       requests_path: str,
+                       cols: list,
+                       benchmark_cols: list,
+                       is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
     # Returns a list of EvalResult
-    # raw_data[0]:
-    # EvalResult(eval_name='EleutherAI_pythia-1.3b_torch.float16', full_model='EleutherAI/pythia-1.3b', org='EleutherAI', model='pythia-1.3b', revision='34b668ff0acfe56f2d541aa46b385557ee39eb3f', results={'arc:challenge': 31.14334470989761, 'hellaswag': 51.43397729535949, 'hendrycksTest': 26.55151159544371, 'truthfulqa:mc': 39.24322830092449, 'winogrande': 57.37963693764798, 'gsm8k': 0.9855951478392722, 'drop': 4.056312919463095}, precision='torch.float16', model_type=<ModelType.PT: ModelTypeDetails(name='pretrained', symbol='🟢')>, weight_type='Original', architecture='GPTNeoXForCausalLM', license='apache-2.0', likes=7, num_params=1.312, date='2023-09-09T10:52:17Z', still_on_hub=True)
-    # EvalResult and get_raw_eval_results are defined in ./src/leaderboard/read_evals.py, the results slots are not hardcoded
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
+    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path)
+
+    all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
+
+    name_to_bm_map = {}
+
+    task_iterator = Tasks
+    if is_backend is True:
+        task_iterator = BackendTasks
+
+    for task in task_iterator:
+        task = task.value
+        name = task.col_name
+        bm = (task.benchmark, task.metric)
+        name_to_bm_map[name] = bm
+
+    # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
+
+    all_data_json = []
+    for entry in all_data_json_:
+        new_entry = copy.deepcopy(entry)
+
+        for k, v in entry.items():
+            if k in name_to_bm_map:
+                benchmark, metric = name_to_bm_map[k]
+                new_entry[k] = entry[k][metric]
+
+        all_data_json += [new_entry]
+
     # all_data_json.append(baseline_row)
     filter_models(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
-    if AutoEvalColumn.average.name in df:
-        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    # if AutoEvalColumn.average.name in df:
+    #     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+
     return raw_data, df
 
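To make the flattening step in get_leaderboard_df concrete, a small hedged sketch of what the name_to_bm_map lookup does to a single to_dict() row; the column name, benchmark key and numbers below are made-up examples.

import copy

# One hypothetical leaderboard row as produced by EvalResult.to_dict(), where a
# task column still holds the whole per-metric dict for that benchmark.
entry = {
    "eval_name": "org_model_float16",          # illustrative id
    "NQ Open/EM": {"em": 24.29, "f1": 31.05},  # illustrative scores
}

# Built from the Tasks enum: display column name -> (benchmark, metric)
name_to_bm_map = {"NQ Open/EM": ("nq8", "em")}

new_entry = copy.deepcopy(entry)
for k, v in entry.items():
    if k in name_to_bm_map:
        benchmark, metric = name_to_bm_map[k]
        new_entry[k] = entry[k][metric]  # keep only the configured metric

print(new_entry)
# -> {'eval_name': 'org_model_float16', 'NQ Open/EM': 24.29}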