Spaces:
Runtime error
Runtime error
jinsol-neubla
committed on
Commit
•
9a04f8c
1
Parent(s):
3066149
Fix GSM8k key change issue
Browse files(get-answer -> strict-match)
Signed-off-by: jinsol-neubla <jinsol.kim@neubla.com>
- app.py +18 -18
- requirements.txt +3 -3
- src/display/utils.py +15 -4
- src/leaderboard/read_evals.py +6 -3
app.py
CHANGED
@@ -80,7 +80,7 @@ leaderboard_df, original_df, plot_df = init_space()
|
|
80 |
def update_table(
|
81 |
hidden_df: pd.DataFrame,
|
82 |
columns: list,
|
83 |
-
type_query: list,
|
84 |
weight_precision_query: str,
|
85 |
activation_precision_query: str,
|
86 |
size_query: list,
|
@@ -90,7 +90,7 @@ def update_table(
|
|
90 |
):
|
91 |
filtered_df = filter_models(
|
92 |
df=hidden_df,
|
93 |
-
type_query=type_query,
|
94 |
size_query=size_query,
|
95 |
weight_precision_query=weight_precision_query,
|
96 |
activation_precision_query=activation_precision_query,
|
@@ -151,7 +151,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
|
|
151 |
|
152 |
def filter_models(
|
153 |
df: pd.DataFrame,
|
154 |
-
type_query: list,
|
155 |
size_query: list,
|
156 |
weight_precision_query: list,
|
157 |
activation_precision_query: list,
|
@@ -173,8 +173,8 @@ def filter_models(
|
|
173 |
if "Flagged" in hide_models:
|
174 |
filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
|
175 |
|
176 |
-
type_emoji = [t[0] for t in type_query]
|
177 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
178 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.weight_precision.name].isin(weight_precision_query + ["None"])]
|
179 |
filtered_df = filtered_df.loc[
|
180 |
df[AutoEvalColumn.activation_precision.name].isin(activation_precision_query + ["None"])
|
@@ -191,7 +191,7 @@ def filter_models(
|
|
191 |
|
192 |
leaderboard_df = filter_models(
|
193 |
df=leaderboard_df,
|
194 |
-
type_query=[t.to_str(" : ") for t in ModelType],
|
195 |
size_query=list(NUMERIC_INTERVALS.keys()),
|
196 |
weight_precision_query=[i.value.name for i in Precision],
|
197 |
activation_precision_query=[i.value.name for i in Precision],
|
@@ -239,13 +239,13 @@ with demo:
|
|
239 |
)
|
240 |
with gr.Column(min_width=320):
|
241 |
# with gr.Box(elem_id="box-filter"):
|
242 |
-
filter_columns_type = gr.CheckboxGroup(
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
)
|
249 |
filter_columns_weight_precision = gr.CheckboxGroup(
|
250 |
label="Weight Precision",
|
251 |
choices=[i.value.name for i in Precision],
|
@@ -301,7 +301,7 @@ with demo:
|
|
301 |
[
|
302 |
hidden_leaderboard_table_for_search,
|
303 |
shown_columns,
|
304 |
-
filter_columns_type,
|
305 |
filter_columns_weight_precision,
|
306 |
filter_columns_activation_precision,
|
307 |
filter_columns_size,
|
@@ -319,7 +319,7 @@ with demo:
|
|
319 |
[
|
320 |
hidden_leaderboard_table_for_search,
|
321 |
shown_columns,
|
322 |
-
filter_columns_type,
|
323 |
filter_columns_weight_precision,
|
324 |
filter_columns_activation_precision,
|
325 |
filter_columns_size,
|
@@ -334,7 +334,7 @@ with demo:
|
|
334 |
|
335 |
for selector in [
|
336 |
shown_columns,
|
337 |
-
filter_columns_type,
|
338 |
filter_columns_weight_precision,
|
339 |
filter_columns_activation_precision,
|
340 |
filter_columns_size,
|
@@ -346,7 +346,7 @@ with demo:
|
|
346 |
[
|
347 |
hidden_leaderboard_table_for_search,
|
348 |
shown_columns,
|
349 |
-
filter_columns_type,
|
350 |
filter_columns_weight_precision,
|
351 |
filter_columns_activation_precision,
|
352 |
filter_columns_size,
|
@@ -391,4 +391,4 @@ scheduler = BackgroundScheduler()
|
|
391 |
scheduler.add_job(restart_space, "interval", seconds=1800) # restarted every 3h
|
392 |
scheduler.start()
|
393 |
|
394 |
-
demo.queue(default_concurrency_limit=40).launch(
|
|
|
80 |
def update_table(
|
81 |
hidden_df: pd.DataFrame,
|
82 |
columns: list,
|
83 |
+
# type_query: list,
|
84 |
weight_precision_query: str,
|
85 |
activation_precision_query: str,
|
86 |
size_query: list,
|
|
|
90 |
):
|
91 |
filtered_df = filter_models(
|
92 |
df=hidden_df,
|
93 |
+
# type_query=type_query,
|
94 |
size_query=size_query,
|
95 |
weight_precision_query=weight_precision_query,
|
96 |
activation_precision_query=activation_precision_query,
|
|
|
151 |
|
152 |
def filter_models(
|
153 |
df: pd.DataFrame,
|
154 |
+
# type_query: list,
|
155 |
size_query: list,
|
156 |
weight_precision_query: list,
|
157 |
activation_precision_query: list,
|
|
|
173 |
if "Flagged" in hide_models:
|
174 |
filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
|
175 |
|
176 |
+
# type_emoji = [t[0] for t in type_query]
|
177 |
+
# filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
178 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.weight_precision.name].isin(weight_precision_query + ["None"])]
|
179 |
filtered_df = filtered_df.loc[
|
180 |
df[AutoEvalColumn.activation_precision.name].isin(activation_precision_query + ["None"])
|
|
|
191 |
|
192 |
leaderboard_df = filter_models(
|
193 |
df=leaderboard_df,
|
194 |
+
# type_query=[t.to_str(" : ") for t in ModelType],
|
195 |
size_query=list(NUMERIC_INTERVALS.keys()),
|
196 |
weight_precision_query=[i.value.name for i in Precision],
|
197 |
activation_precision_query=[i.value.name for i in Precision],
|
|
|
239 |
)
|
240 |
with gr.Column(min_width=320):
|
241 |
# with gr.Box(elem_id="box-filter"):
|
242 |
+
# filter_columns_type = gr.CheckboxGroup(
|
243 |
+
# label="Model types",
|
244 |
+
# choices=[t.to_str() for t in ModelType],
|
245 |
+
# value=[t.to_str() for t in ModelType],
|
246 |
+
# interactive=True,
|
247 |
+
# elem_id="filter-columns-type",
|
248 |
+
# )
|
249 |
filter_columns_weight_precision = gr.CheckboxGroup(
|
250 |
label="Weight Precision",
|
251 |
choices=[i.value.name for i in Precision],
|
|
|
301 |
[
|
302 |
hidden_leaderboard_table_for_search,
|
303 |
shown_columns,
|
304 |
+
# filter_columns_type,
|
305 |
filter_columns_weight_precision,
|
306 |
filter_columns_activation_precision,
|
307 |
filter_columns_size,
|
|
|
319 |
[
|
320 |
hidden_leaderboard_table_for_search,
|
321 |
shown_columns,
|
322 |
+
# filter_columns_type,
|
323 |
filter_columns_weight_precision,
|
324 |
filter_columns_activation_precision,
|
325 |
filter_columns_size,
|
|
|
334 |
|
335 |
for selector in [
|
336 |
shown_columns,
|
337 |
+
# filter_columns_type,
|
338 |
filter_columns_weight_precision,
|
339 |
filter_columns_activation_precision,
|
340 |
filter_columns_size,
|
|
|
346 |
[
|
347 |
hidden_leaderboard_table_for_search,
|
348 |
shown_columns,
|
349 |
+
# filter_columns_type,
|
350 |
filter_columns_weight_precision,
|
351 |
filter_columns_activation_precision,
|
352 |
filter_columns_size,
|
|
|
391 |
scheduler.add_job(restart_space, "interval", seconds=1800) # restarted every 3h
|
392 |
scheduler.start()
|
393 |
|
394 |
+
demo.queue(default_concurrency_limit=40).launch()
|
requirements.txt
CHANGED
@@ -2,15 +2,15 @@ APScheduler==3.10.1
|
|
2 |
black==23.11.0
|
3 |
click==8.1.3
|
4 |
datasets==2.14.5
|
5 |
-
gradio==4.
|
6 |
-
gradio_client
|
7 |
huggingface-hub>=0.18.0
|
8 |
matplotlib==3.7.1
|
9 |
numpy==1.24.2
|
10 |
pandas==2.0.0
|
11 |
plotly==5.14.1
|
12 |
python-dateutil==2.8.2
|
13 |
-
requests
|
14 |
sentencepiece
|
15 |
tqdm==4.65.0
|
16 |
transformers==4.37.0
|
|
|
2 |
black==23.11.0
|
3 |
click==8.1.3
|
4 |
datasets==2.14.5
|
5 |
+
gradio==4.29.0
|
6 |
+
gradio_client
|
7 |
huggingface-hub>=0.18.0
|
8 |
matplotlib==3.7.1
|
9 |
numpy==1.24.2
|
10 |
pandas==2.0.0
|
11 |
plotly==5.14.1
|
12 |
python-dateutil==2.8.2
|
13 |
+
requests
|
14 |
sentencepiece
|
15 |
tqdm==4.65.0
|
16 |
transformers==4.37.0
|
src/display/utils.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
from altair import Column
|
|
|
4 |
|
5 |
import pandas as pd
|
6 |
|
@@ -12,7 +13,7 @@ def fields(raw_class):
|
|
12 |
@dataclass
|
13 |
class Task:
|
14 |
benchmark: str
|
15 |
-
metric: str
|
16 |
col_name: str
|
17 |
|
18 |
|
@@ -22,7 +23,17 @@ class Tasks(Enum):
|
|
22 |
mmlu = Task("mmlu", "acc", "MMLU")
|
23 |
truthfulqa = Task("truthfulqa_mc2", "acc", "TruthfulQA")
|
24 |
winogrande = Task("winogrande", "acc", "Winogrande")
|
25 |
-
gsm8k = Task("gsm8k", "exact_match,get-answer", "GSM8K")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
|
28 |
# These classes are for user facing column names,
|
@@ -40,7 +51,7 @@ class ColumnContent:
|
|
40 |
|
41 |
auto_eval_column_dict = []
|
42 |
# Init
|
43 |
-
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
44 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
45 |
# Scores
|
46 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
@@ -173,7 +184,7 @@ class Precision(Enum):
|
|
173 |
Unknown = ModelDetails("?")
|
174 |
|
175 |
def from_str(precision):
|
176 |
-
if precision in ["torch.float16", "float16"]:
|
177 |
return Precision.float16
|
178 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
179 |
return Precision.bfloat16
|
|
|
1 |
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
from altair import Column
|
4 |
+
from typing import Union, List, Dict
|
5 |
|
6 |
import pandas as pd
|
7 |
|
|
|
13 |
@dataclass
|
14 |
class Task:
|
15 |
benchmark: str
|
16 |
+
metric: Union[str, List[str]]
|
17 |
col_name: str
|
18 |
|
19 |
|
|
|
23 |
mmlu = Task("mmlu", "acc", "MMLU")
|
24 |
truthfulqa = Task("truthfulqa_mc2", "acc", "TruthfulQA")
|
25 |
winogrande = Task("winogrande", "acc", "Winogrande")
|
26 |
+
gsm8k = Task("gsm8k", ["exact_match,get-answer", "exact_match,strict-match"], "GSM8K")
|
27 |
+
|
28 |
+
@staticmethod
|
29 |
+
def get_metric(task: Task, dict_results: Dict[str, float]):
|
30 |
+
if isinstance(task.metric, str):
|
31 |
+
return dict_results[task.metric]
|
32 |
+
else:
|
33 |
+
for metric in task.metric:
|
34 |
+
if metric in dict_results:
|
35 |
+
return dict_results[metric]
|
36 |
+
return None
|
37 |
|
38 |
|
39 |
# These classes are for user facing column names,
|
|
|
51 |
|
52 |
auto_eval_column_dict = []
|
53 |
# Init
|
54 |
+
# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
55 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
56 |
# Scores
|
57 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
|
|
184 |
Unknown = ModelDetails("?")
|
185 |
|
186 |
def from_str(precision):
|
187 |
+
if precision in ["torch.float16", "float16", "fp16"]:
|
188 |
return Precision.float16
|
189 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
190 |
return Precision.bfloat16
|
src/leaderboard/read_evals.py
CHANGED
@@ -94,7 +94,7 @@ class EvalResult:
|
|
94 |
if task.benchmark == "mmlu":
|
95 |
accs = np.array([data["results"].get(task.benchmark, {}).get(task.metric, None)])
|
96 |
else:
|
97 |
-
accs = np.array([
|
98 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
99 |
continue
|
100 |
|
@@ -154,7 +154,7 @@ class EvalResult:
|
|
154 |
AutoEvalColumn.weight_precision.name: self.weight_precision.value.name,
|
155 |
AutoEvalColumn.activation_precision.name: self.activation_precision.value.name,
|
156 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
157 |
-
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
158 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
159 |
AutoEvalColumn.architecture.name: self.architecture,
|
160 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
@@ -216,6 +216,7 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
|
216 |
|
217 |
eval_results = {}
|
218 |
for model_result_filepath in model_result_filepaths:
|
|
|
219 |
# Creation of result
|
220 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
221 |
|
@@ -232,7 +233,9 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
|
232 |
if v.status == "FINISHED":
|
233 |
v.to_dict() # we test if the dict version is complete
|
234 |
results.append(v)
|
235 |
-
except KeyError: # not all eval values present
|
|
|
|
|
236 |
continue
|
237 |
|
238 |
return results
|
|
|
94 |
if task.benchmark == "mmlu":
|
95 |
accs = np.array([data["results"].get(task.benchmark, {}).get(task.metric, None)])
|
96 |
else:
|
97 |
+
accs = np.array([Tasks.get_metric(task, v) for k, v in data["results"].items() if task.benchmark in k])
|
98 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
99 |
continue
|
100 |
|
|
|
154 |
AutoEvalColumn.weight_precision.name: self.weight_precision.value.name,
|
155 |
AutoEvalColumn.activation_precision.name: self.activation_precision.value.name,
|
156 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
157 |
+
# AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
158 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
159 |
AutoEvalColumn.architecture.name: self.architecture,
|
160 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
|
216 |
|
217 |
eval_results = {}
|
218 |
for model_result_filepath in model_result_filepaths:
|
219 |
+
print(f"Read {model_result_filepath}")
|
220 |
# Creation of result
|
221 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
222 |
|
|
|
233 |
if v.status == "FINISHED":
|
234 |
v.to_dict() # we test if the dict version is complete
|
235 |
results.append(v)
|
236 |
+
except KeyError as e: # not all eval values present
|
237 |
+
print(f"Fail to get results from {v.eval_name} with the error {e}")
|
238 |
+
print(v)
|
239 |
continue
|
240 |
|
241 |
return results
|