eduagarcia committed
Commit 359d8a9
1 Parent(s): ebb5810

Evaluation time metric and plot

app.py CHANGED
@@ -38,6 +38,7 @@ from src.tools.plots import (
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
+    create_lat_score_mem_plot_obj
 )
 
 # Start ephemeral Spaces on PRs (see config in README.md)
@@ -344,7 +345,7 @@ with demo:
                 queue=True,
             )
 
-        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+        with gr.TabItem("📈 Metrics", elem_id="llm-benchmark-tab-table", id=4):
             with gr.Row():
                 with gr.Column():
                     chart = create_metric_plot_obj(
@@ -359,7 +360,17 @@ with demo:
                        BENCHMARK_COLS,
                        title="Top Scores and Human Baseline Over Time (from last update)",
                    )
-                   gr.Plot(value=chart, min_width=500)
+                   gr.Plot(value=chart, min_width=500)
+            with gr.Row():
+                with gr.Column():
+                    fig = create_lat_score_mem_plot_obj(leaderboard_df)
+                    plot = gr.components.Plot(
+                        value=fig,
+                        elem_id="plot",
+                        show_label=False,
+                    )
+            gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
+            gr.HTML('This plot shows the evaluation time on our backend GPU (an NVIDIA A100-80G) to run all the benchmarks. It is not a precise performance benchmark of the models; for that, see the <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard" target="_blank">🤗 LLM-Perf Leaderboard</a>.', elem_id="text")
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
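
For quick local testing of the same wiring, here is a minimal sketch (not part of the commit): it assumes only gradio, plotly and pandas are installed, and the figure and data below are invented placeholders.

# Minimal sketch of the gr.Plot pattern used by the new "📈 Metrics" tab.
# The DataFrame and figure are illustrative, not taken from the leaderboard.
import gradio as gr
import pandas as pd
import plotly.express as px

demo_df = pd.DataFrame(
    {"Evaluation Time (min)": [12, 45, 180], "LLM Average Score": [55.0, 62.3, 71.8]}
)
demo_figure = px.scatter(demo_df, x="Evaluation Time (min)", y="LLM Average Score", log_x=True)

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Plot(value=demo_figure, show_label=False)

if __name__ == "__main__":
    demo.launch()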
src/display/utils.py CHANGED
@@ -109,8 +109,11 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+auto_eval_column_dict.append(["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)])
 # Dummy column for the search bar (hidden by the custom CSS)
-auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("Model Name", "str", False, dummy=True)])
+
+
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -147,7 +150,8 @@ baseline_row = {
     AutoEvalColumn.likes.name: 0,
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
-    AutoEvalColumn.moe.name: False
+    AutoEvalColumn.moe.name: False,
+    AutoEvalColumn.eval_time.name: 0.0
 }
 
 baseline_list = []
@@ -187,7 +191,8 @@ human_baseline_row = {
     AutoEvalColumn.likes.name: 0,
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
-    AutoEvalColumn.moe.name: False
+    AutoEvalColumn.moe.name: False,
+    AutoEvalColumn.eval_time.name: 0.0
 }
 
 baseline_list = []
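
Because the columns are built with make_dataclass, the rest of the codebase reads the display strings through AutoEvalColumn.<field>.name. A minimal sketch of that pattern, with a simplified stand-in for ColumnContent (the real class in src/display/utils.py has more fields):

# Simplified sketch of the make_dataclass pattern used in src/display/utils.py.
# ColumnContent here is a reduced stand-in, not the repo's full definition.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

auto_eval_column_dict = [
    ["eval_time", ColumnContent, ColumnContent("Evaluation Time (s)", "number", False)],
    ["dummy", ColumnContent, ColumnContent("Model Name", "str", False)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.eval_time.name)  # Evaluation Time (s)
print(AutoEvalColumn.dummy.name)      # Model Name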
src/leaderboard/filter_models.py CHANGED
@@ -99,7 +99,7 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.flagged.name] == True:
             flag_key = "merged"
         else:
-            flag_key = model_data["model_name_for_query"]
+            flag_key = model_data[AutoEvalColumn.dummy.name]
 
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -118,7 +118,7 @@ def flag_models(leaderboard_data: list[dict]):
 def remove_forbidden_models(leaderboard_data: list[dict]):
     indices_to_remove = []
     for ix, model in enumerate(leaderboard_data):
-        if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
+        if model[AutoEvalColumn.dummy.name] in DO_NOT_SUBMIT_MODELS:
             indices_to_remove.append(ix)
 
     for ix in reversed(indices_to_remove):
src/leaderboard/read_evals.py CHANGED
@@ -36,6 +36,7 @@ class EvalResult:
     status: str = "FINISHED"
     tags: list = None
     json_filename: str = None
+    eval_time: float = 0.0
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -103,7 +104,8 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
-            json_filename=json_filename
+            json_filename=json_filename,
+            eval_time=config.get("total_evaluation_time_seconds", 0.0)
         )
 
     def update_with_request_file(self, requests_path):
@@ -151,7 +153,8 @@ class EvalResult:
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
-            AutoEvalColumn.flagged.name: self.flagged
+            AutoEvalColumn.flagged.name: self.flagged,
+            AutoEvalColumn.eval_time.name: self.eval_time,
         }
 
         for task in Tasks:
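
The new eval_time value is read from the result file's config block. A minimal sketch of that parsing (the JSON payload below is illustrative; only the keys touched by this commit are shown):

# Illustrative results payload; keys other than the ones used here are omitted.
import json

raw = """
{
  "config": {
    "model_sha": "abc123",
    "total_evaluation_time_seconds": 5432.1
  },
  "results": {}
}
"""
config = json.loads(raw)["config"]
# Mirrors the parsing above: missing timings fall back to 0.0.
eval_time = config.get("total_evaluation_time_seconds", 0.0)
print(eval_time)        # 5432.1 (seconds, as stored in the new column)
print(eval_time / 60)   # minutes, as used by the new scatter plot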
src/tools/plots.py CHANGED
@@ -151,6 +151,61 @@ def create_metric_plot_obj(
 
     return fig
 
+def create_lat_score_mem_plot_obj(leaderboard_df):
+    copy_df = leaderboard_df.copy()
+    copy_df = copy_df[~(copy_df[AutoEvalColumn.dummy.name].isin(["baseline", "human_baseline"]))]
+    # plot
+    SCORE_MEMORY_LATENCY_DATA = [
+        AutoEvalColumn.dummy.name,
+        AutoEvalColumn.average.name,
+        AutoEvalColumn.params.name,
+        AutoEvalColumn.architecture.name,
+        "Evaluation Time (min)"
+    ]
+
+    copy_df["LLM Average Score"] = copy_df[AutoEvalColumn.average.name]
+    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60
+
+    #copy_df["size"] = copy_df[AutoEvalColumn.params.name]
+    copy_df["size"] = copy_df[AutoEvalColumn.params.name].apply(lambda x: 0.5 if 0 <= x < 0.8 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 0.8 if 0.8 <= x < 2 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 1.5 if 2 <= x < 5 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 2.0 if 5 <= x < 10 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 3.0 if 10 <= x < 20 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 4.5 if 20 <= x < 40 else x)
+    copy_df["size"] = copy_df["size"].apply(lambda x: 7.0 if x > 40 else x)
+
+    fig = px.scatter(
+        copy_df,
+        x="Evaluation Time (min)",
+        y="LLM Average Score",
+        size="size",
+        color=AutoEvalColumn.architecture.name,
+        custom_data=SCORE_MEMORY_LATENCY_DATA,
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        log_x=True
+    )
+    fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
+        )
+    )
+    fig.update_layout(
+        title={
+            "text": "Eval Time vs. Score vs. #Params",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="Time To Evaluate (min)",
+        yaxis_title="LLM Average Score",
+        legend_title="LLM Architecture",
+        width=1200,
+        height=600,
+    )
+
+    return fig
 
 # Example Usage:
 # human_baselines dictionary is defined.
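
A minimal sketch of exercising the new helper outside the app (run from the Space's repository root; the two model rows are invented, and the column keys come from AutoEvalColumn so no display names are hard-coded here):

# Quick local check of the new plot helper with made-up leaderboard rows.
import pandas as pd

from src.display.utils import AutoEvalColumn
from src.tools.plots import create_lat_score_mem_plot_obj

toy_df = pd.DataFrame(
    [
        {
            AutoEvalColumn.dummy.name: "org/model-7b",          # illustrative model name
            AutoEvalColumn.average.name: 62.3,
            AutoEvalColumn.params.name: 7.0,
            AutoEvalColumn.architecture.name: "LlamaForCausalLM",
            AutoEvalColumn.eval_time.name: 5432.1,               # seconds
        },
        {
            AutoEvalColumn.dummy.name: "org/model-70b",
            AutoEvalColumn.average.name: 71.8,
            AutoEvalColumn.params.name: 70.0,
            AutoEvalColumn.architecture.name: "LlamaForCausalLM",
            AutoEvalColumn.eval_time.name: 43210.5,
        },
    ]
)

fig = create_lat_score_mem_plot_obj(toy_df)
fig.write_html("eval_time_vs_score.html")  # or fig.show() for interactive inspection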