pminervini committed • Commit 7d1a89f • 1 Parent(s): 9a14053
update
Files changed:
- src/display/css_html_js.py  +1 -1
- src/display/utils.py  +5 -10
- src/leaderboard/read_evals.py  +46 -59
- src/populate.py  +46 -11
src/display/css_html_js.py CHANGED
@@ -34,7 +34,7 @@ custom_css = """
 }

 #leaderboard-table table td:first-child {
-    text-align: right;
+    text-align: right !important;
 }

 #leaderboard-table-lite {
src/display/utils.py CHANGED
@@ -16,14 +16,6 @@ class Task:


 class Tasks(Enum):
-    # arc = Task("arc:challenge", "acc_norm", "ARC")
-    # hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
-    # mmlu = Task("hendrycksTest", "acc", "MMLU")
-    # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    # winogrande = Task("winogrande", "acc", "Winogrande")
-    # gsm8k = Task("gsm8k", "acc", "GSM8K")
-    # drop = Task("drop", "f1", "DROP")
-
     nqopen = Task("nq8", "em", "NQ Open/EM")
     triviaqa = Task("tqa8", "em", "TriviaQA/EM")

@@ -40,7 +32,7 @@ class Tasks(Enum):
     cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")

     race = Task("race", "acc", "RACE/Acc")
-
+    squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")

     memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
     ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
@@ -69,10 +61,13 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
+
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
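Note: for readers unfamiliar with this file, the sketch below re-creates the Task/Tasks pattern the diff extends. It is a standalone illustration, not the repository code: the Task fields (benchmark, metric, col_name) and their order are inferred from how they are used above, and only three of the tasks are shown.

from dataclasses import dataclass
from enum import Enum


@dataclass(frozen=True)
class Task:
    benchmark: str  # key used in the results files, e.g. "nq8"
    metric: str     # metric read for that benchmark, e.g. "em"
    col_name: str   # column header shown on the leaderboard


class Tasks(Enum):
    nqopen = Task("nq8", "em", "NQ Open/EM")
    triviaqa = Task("tqa8", "em", "TriviaQA/EM")
    squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")  # entry added by this commit


# Mirrors the `for task in Tasks:` loop above: one numeric display column per task.
columns = [(task.name, task.value.col_name) for task in Tasks]
print(columns)
# [('nqopen', 'NQ Open/EM'), ('triviaqa', 'TriviaQA/EM'), ('squadv2', 'SQUaDv2/EM')]

With the hard-coded Avg column commented out, the display columns are driven entirely by this enum, which is why adding the squadv2 entry is enough to surface the new benchmark column.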
src/leaderboard/read_evals.py CHANGED
@@ -4,7 +4,7 @@ import os
 from dataclasses import dataclass

 import dateutil
-import numpy as np
+# import numpy as np

 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
@@ -22,11 +22,11 @@ def is_float(string):
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str
-    full_model: str
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
     org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
@@ -72,55 +72,41 @@ class EvalResult:
         architecture = ";".join(architectures)

         # Extract results available in this file (some results are split in several files)
-        results = {}
-
-        task_iterator = Tasks
-        if is_backend is True:
-            from src.backend.envs import Tasks as BackendTasks
-            task_iterator = BackendTasks
-
-        for task in task_iterator:
-            task = task.value
-
-            def post_process_results(results: dict) -> dict:
-                # {'nq_open': {'em': 0.018005540166204988, 'em_stderr': 0.0022134216580395583}}
-                res_copy = results.copy()
-
-                for task_name in res_copy.keys():
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "exact_match" in k:
-                            results[task_name][k.replace("exact_match", "em")] = v
-                        if "squadv2" in task_name:
-                            value = results[task_name][k]
-                            if is_float(value) and 'normalised' not in k:
-                                results[task_name][f"{k}_normalised"] = value / 100.0
-                        else:
-                            del results[task_name][k]
-
-                    entry_copy = results[task_name].copy()
-
-                    for k, v in entry_copy.items():
-                        if "," in k:
-                            tokens = k.split(",")
-                            results[task_name][tokens[0]] = v
-
-                return results

-
+        # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}

-
-
-
-
-
-
-
-
-
-
-
+        results = {}
+        for benchmark, benchmark_results in data['results'].items():
+            if benchmark not in results:
+                results[benchmark] = {}
+
+            for metric, value in benchmark_results.items():
+                to_add = True
+                if '_stderr' in metric:
+                    to_add = False
+                if 'alias' in metric:
+                    to_add = False
+
+                if ',' in metric:
+                    metric = metric.split(',')[0]
+                metric = metric.replace("exact_match", "em")
+
+                if to_add is True:
+                    multiplier = 100.0
+                    if 'rouge' in metric:
+                        multiplier = 1.0
+                    if 'squad' in benchmark:
+                        multiplier = 1.0
+
+                    # print('RESULTS', data['results'])
+                    # print('XXX', benchmark, metric, value, multiplier)
+                    results[benchmark][metric] = value * multiplier
+
+        res = EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
+                         precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
+                         architecture=architecture)
+
+        return res

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
@@ -129,6 +115,7 @@ class EvalResult:
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
+
             self.model_type = ModelType.from_str(request.get("model_type", ""))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
@@ -146,7 +133,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
+        # breakpoint()
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -157,7 +147,7 @@ class EvalResult:
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            # AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -165,7 +155,7 @@ class EvalResult:
         }

         for task in Tasks:
-            if task.value.benchmark in self.results:
+            if task.value.benchmark in self.results:
                 data_dict[task.value.col_name] = self.results[task.value.benchmark]

         return data_dict
@@ -182,14 +172,11 @@ def get_request_file_for_model(requests_path, model_name, precision):
     # Select correct request file (precision)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
-
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                # req_content["status"] in ["FINISHED", "RUNNING"] and
-                req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file

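Note: the sketch below condenses the new parsing loop added above into a standalone form and runs it on a made-up lm-eval-harness style payload (the values and the ',none' filter suffixes are illustrative). It applies the same rules: drop stderr and alias entries, strip the ',<filter>' suffix, rename exact_match to em, and scale fractional scores to percentages except for rouge metrics and squad benchmarks.

# Illustrative payload shaped like data['results'] in the diff above.
data = {"results": {
    "nq8": {"em,none": 0.25, "em_stderr,none": 0.01, "alias": "nq8"},
    "squadv2": {"exact,none": 71.3, "exact_stderr,none": 0.4},
    "cnndm_v2": {"rouge1,none": 0.31},
}}

results = {}
for benchmark, benchmark_results in data["results"].items():
    results.setdefault(benchmark, {})
    for metric, value in benchmark_results.items():
        if "_stderr" in metric or "alias" in metric:
            continue  # skip stderr and alias entries
        metric = metric.split(",")[0].replace("exact_match", "em")
        # Fractions become percentages; rouge and squad scores keep their native scale.
        multiplier = 1.0 if ("rouge" in metric or "squad" in benchmark) else 100.0
        results[benchmark][metric] = value * multiplier

print(results)
# {'nq8': {'em': 25.0}, 'squadv2': {'exact': 71.3}, 'cnndm_v2': {'rouge1': 0.31}}

Each EvalResult now carries a per-benchmark dict of already-rescaled metrics, so to_dict can expose self.results[benchmark] directly and leave the choice of which metric to display to populate.py (next file).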
src/populate.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os

+import copy
 import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -8,24 +9,58 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results, EvalResult

+from src.backend.envs import Tasks as BackendTasks
+from src.display.utils import Tasks

-
+
+def get_leaderboard_df(results_path: str,
+                       requests_path: str,
+                       cols: list,
+                       benchmark_cols: list,
+                       is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
     # Returns a list of EvalResult
-
-
-
-
-
+    raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path)
+
+    all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
+
+    name_to_bm_map = {}
+
+    task_iterator = Tasks
+    if is_backend is True:
+        task_iterator = BackendTasks
+
+    for task in task_iterator:
+        task = task.value
+        name = task.col_name
+        bm = (task.benchmark, task.metric)
+        name_to_bm_map[name] = bm
+
+    # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
+
+    all_data_json = []
+    for entry in all_data_json_:
+        new_entry = copy.deepcopy(entry)
+
+        for k, v in entry.items():
+            if k in name_to_bm_map:
+                benchmark, metric = name_to_bm_map[k]
+                new_entry[k] = entry[k][metric]
+
+        all_data_json += [new_entry]
+
     # all_data_json.append(baseline_row)
     filter_models(all_data_json)

     df = pd.DataFrame.from_records(all_data_json)
-    if AutoEvalColumn.average.name in df:
-        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)

-
-
+    # if AutoEvalColumn.average.name in df:
+    #     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+
     return raw_data, df

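Note: a small standalone sketch of the remapping step inside the new get_leaderboard_df. Each row from EvalResult.to_dict() stores a {metric: value} dict under a task's column name, and the loop keeps only the metric that column is defined to show. The toy map and row below are assumptions shaped to match the code above, not real leaderboard data.

import copy

# Column name -> (benchmark, metric), as built from the Tasks enum.
name_to_bm_map = {
    "NQ Open/EM": ("nq8", "em"),
    "SQUaDv2/EM": ("squadv2", "exact"),
}

# One row as to_dict() would emit it: task columns hold per-metric dicts.
entry = {
    "Model": "org/model",
    "NQ Open/EM": {"em": 25.0},
    "SQUaDv2/EM": {"exact": 71.3},
}

new_entry = copy.deepcopy(entry)  # deepcopy so the nested dicts in `entry` are not mutated
for k, v in entry.items():
    if k in name_to_bm_map:
        benchmark, metric = name_to_bm_map[k]
        new_entry[k] = entry[k][metric]  # keep only the metric this column displays

print(new_entry)
# {'Model': 'org/model', 'NQ Open/EM': 25.0, 'SQUaDv2/EM': 71.3}

After this pass, pd.DataFrame.from_records sees plain numbers in the benchmark columns, so the later round(decimals=2) and the has_no_nan_values filter behave as before.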