jinsol-neubla committed
Commit 9a04f8c
Parent: 3066149

Fix GSM8k key change issue


(get-answer -> strict-match)

Signed-off-by: jinsol-neubla <jinsol.kim@neubla.com>
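For context, the commit message refers to the key under which lm-evaluation-harness reports the filtered GSM8K score: newer harness versions emit it as "exact_match,strict-match", while older ones used "exact_match,get-answer". A minimal sketch of the two result shapes the leaderboard now has to accept (scores invented for illustration):

# Hypothetical entries from a results JSON; only the metric key differs.
old_style = {"gsm8k": {"exact_match,get-answer": 0.42}}    # older harness output
new_style = {"gsm8k": {"exact_match,strict-match": 0.42}}  # newer harness output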

app.py CHANGED
@@ -80,7 +80,7 @@ leaderboard_df, original_df, plot_df = init_space()
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
-    type_query: list,
+    # type_query: list,
     weight_precision_query: str,
     activation_precision_query: str,
     size_query: list,
@@ -90,7 +90,7 @@ def update_table(
 ):
     filtered_df = filter_models(
         df=hidden_df,
-        type_query=type_query,
+        # type_query=type_query,
         size_query=size_query,
         weight_precision_query=weight_precision_query,
         activation_precision_query=activation_precision_query,
@@ -151,7 +151,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
 
 def filter_models(
     df: pd.DataFrame,
-    type_query: list,
+    # type_query: list,
     size_query: list,
     weight_precision_query: list,
     activation_precision_query: list,
@@ -173,8 +173,8 @@ def filter_models(
     if "Flagged" in hide_models:
         filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
 
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    # type_emoji = [t[0] for t in type_query]
+    # filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.weight_precision.name].isin(weight_precision_query + ["None"])]
     filtered_df = filtered_df.loc[
         df[AutoEvalColumn.activation_precision.name].isin(activation_precision_query + ["None"])
@@ -191,7 +191,7 @@ def filter_models(
 
 leaderboard_df = filter_models(
     df=leaderboard_df,
-    type_query=[t.to_str(" : ") for t in ModelType],
+    # type_query=[t.to_str(" : ") for t in ModelType],
     size_query=list(NUMERIC_INTERVALS.keys()),
     weight_precision_query=[i.value.name for i in Precision],
     activation_precision_query=[i.value.name for i in Precision],
@@ -239,13 +239,13 @@ with demo:
     )
     with gr.Column(min_width=320):
         # with gr.Box(elem_id="box-filter"):
-        filter_columns_type = gr.CheckboxGroup(
-            label="Model types",
-            choices=[t.to_str() for t in ModelType],
-            value=[t.to_str() for t in ModelType],
-            interactive=True,
-            elem_id="filter-columns-type",
-        )
+        # filter_columns_type = gr.CheckboxGroup(
+        #     label="Model types",
+        #     choices=[t.to_str() for t in ModelType],
+        #     value=[t.to_str() for t in ModelType],
+        #     interactive=True,
+        #     elem_id="filter-columns-type",
+        # )
         filter_columns_weight_precision = gr.CheckboxGroup(
             label="Weight Precision",
             choices=[i.value.name for i in Precision],
@@ -301,7 +301,7 @@ with demo:
     [
         hidden_leaderboard_table_for_search,
         shown_columns,
-        filter_columns_type,
+        # filter_columns_type,
         filter_columns_weight_precision,
         filter_columns_activation_precision,
         filter_columns_size,
@@ -319,7 +319,7 @@ with demo:
     [
         hidden_leaderboard_table_for_search,
         shown_columns,
-        filter_columns_type,
+        # filter_columns_type,
         filter_columns_weight_precision,
         filter_columns_activation_precision,
         filter_columns_size,
@@ -334,7 +334,7 @@ with demo:
 
     for selector in [
         shown_columns,
-        filter_columns_type,
+        # filter_columns_type,
         filter_columns_weight_precision,
         filter_columns_activation_precision,
         filter_columns_size,
@@ -346,7 +346,7 @@ with demo:
     [
         hidden_leaderboard_table_for_search,
         shown_columns,
-        filter_columns_type,
+        # filter_columns_type,
         filter_columns_weight_precision,
         filter_columns_activation_precision,
         filter_columns_size,
@@ -391,4 +391,4 @@ scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)  # restarted every 30 minutes
 scheduler.start()
 
-demo.queue(default_concurrency_limit=40).launch(share=True)
+demo.queue(default_concurrency_limit=40).launch()
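One detail worth noting in the app.py hunks: the disabled type_query arguments are commented out in place, inside parameter lists and call sites. Python allows comments between arguments, so the signatures and calls remain syntactically valid; a minimal illustration with hypothetical names:

def update_table(
    hidden_df,
    # type_query,  # filter disabled but kept for reference
    size_query,
):
    return hidden_df

update_table(
    hidden_df=[1, 2, 3],
    # type_query=["pretrained"],
    size_query=["small"],
)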
requirements.txt CHANGED
@@ -2,15 +2,15 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==4.9.0
-gradio_client==0.7.2
+gradio==4.29.0
+gradio_client
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
 pandas==2.0.0
 plotly==5.14.1
 python-dateutil==2.8.2
-requests==2.28.2
+requests
 sentencepiece
 tqdm==4.65.0
 transformers==4.37.0
src/display/utils.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 from altair import Column
+from typing import Union, List, Dict
 
 import pandas as pd
 
@@ -12,7 +13,7 @@ def fields(raw_class):
 @dataclass
 class Task:
     benchmark: str
-    metric: str
+    metric: Union[str, List[str]]
     col_name: str
 
 
@@ -22,7 +23,17 @@ class Tasks(Enum):
     mmlu = Task("mmlu", "acc", "MMLU")
     truthfulqa = Task("truthfulqa_mc2", "acc", "TruthfulQA")
     winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "exact_match,get-answer", "GSM8K")
+    gsm8k = Task("gsm8k", ["exact_match,get-answer", "exact_match,strict-match"], "GSM8K")
+
+    @staticmethod
+    def get_metric(task: Task, dict_results: Dict[str, float]):
+        if isinstance(task.metric, str):
+            return dict_results[task.metric]
+        else:
+            for metric in task.metric:
+                if metric in dict_results:
+                    return dict_results[metric]
+            return None
 
 
 # These classes are for user facing column names,
@@ -40,7 +51,7 @@ class ColumnContent:
 
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 # Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
@@ -173,7 +184,7 @@ class Precision(Enum):
     Unknown = ModelDetails("?")
 
     def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
+        if precision in ["torch.float16", "float16", "fp16"]:
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
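The new Tasks.get_metric helper tries each candidate key in order, so result files produced by either harness version resolve to the same score. A quick usage sketch (the import path follows the repo layout; values are invented):

from src.display.utils import Tasks

gsm8k = Tasks.gsm8k.value
print(Tasks.get_metric(gsm8k, {"exact_match,get-answer": 0.41}))    # 0.41, older result file
print(Tasks.get_metric(gsm8k, {"exact_match,strict-match": 0.43}))  # 0.43, newer result file
print(Tasks.get_metric(Tasks.mmlu.value, {"acc": 0.55}))            # 0.55, single-key task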
src/leaderboard/read_evals.py CHANGED
@@ -94,7 +94,7 @@ class EvalResult:
             if task.benchmark == "mmlu":
                 accs = np.array([data["results"].get(task.benchmark, {}).get(task.metric, None)])
             else:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+                accs = np.array([Tasks.get_metric(task, v) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -154,7 +154,7 @@ class EvalResult:
             AutoEvalColumn.weight_precision.name: self.weight_precision.value.name,
             AutoEvalColumn.activation_precision.name: self.activation_precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
@@ -216,6 +216,7 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+        print(f"Read {model_result_filepath}")
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
@@ -232,7 +233,9 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
             if v.status == "FINISHED":
                 v.to_dict()  # we test if the dict version is complete
                 results.append(v)
-        except KeyError:  # not all eval values present
+        except KeyError as e:  # not all eval values present
+            print(f"Fail to get results from {v.eval_name} with the error {e}")
+            print(v)
             continue
 
     return results
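Downstream, the updated aggregation line hands each per-benchmark results dict to Tasks.get_metric; if neither GSM8K key is present the helper returns None and the existing guard skips the task instead of raising. A minimal sketch of that path (result data invented; import path per the repo layout):

import numpy as np
from src.display.utils import Tasks

# Hypothetical data["results"] slice for a run scored by a newer harness.
results = {"gsm8k": {"exact_match,strict-match": 0.44}}
task = Tasks.gsm8k.value

accs = np.array([Tasks.get_metric(task, v) for k, v in results.items() if task.benchmark in k])
if accs.size == 0 or any(acc is None for acc in accs):
    print("skipping task: metric missing")
else:
    print(f"gsm8k acc: {accs.mean():.2f}")  # gsm8k acc: 0.44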