display button unification for benchmarks (#24)
app.py CHANGED
@@ -2,10 +2,11 @@
 import os
 import datetime
 import socket
+from threading import Thread
 
 import gradio as gr
 import pandas as pd
-
+import time
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from huggingface_hub import snapshot_download
@@ -35,13 +36,27 @@ from src.display.utils import (
     fields,
     WeightType,
     Precision,
+    GPUType
 )
 
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
+    QUEUE_REPO, REPO_ID, RESULTS_REPO, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.utils import get_dataset_summary_table
 
+def get_args():
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run the LLM Leaderboard")
+    parser.add_argument("--debug", action="store_true", help="Run in debug mode")
+    return parser.parse_args()
+
+args = get_args()
+if args.debug:
+    print("Running in debug mode")
+    QUEUE_REPO = DEBUG_QUEUE_REPO
+    RESULTS_REPO = DEBUG_RESULTS_REPO
 
 def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
     try:
@@ -75,17 +90,13 @@ def init_space():
     )
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
-
-dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
-leaderboard_df = original_df.copy()
-
-
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
     filtered_df = filter_queries(query, filtered_df)
+    columns.extend(add_benchmark_columns(columns))
     df = select_columns(filtered_df, columns)
     return df
 
@@ -141,7 +152,62 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     # filtered_df = filtered_df.loc[mask]
 
     return filtered_df
+
+
+def add_benchmark_columns(shown_columns):
+    benchmark_columns = []
+    for benchmark in BENCHMARK_COLS:
+        if benchmark in shown_columns:
+            for c in COLS:
+                if benchmark in c and benchmark != c:
+                    benchmark_columns.append(c)
+    return benchmark_columns
+
+shown_columns = None
+dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+leaderboard_df = original_df.copy()
+
+# def update_leaderboard_table():
+#     global leaderboard_df, shown_columns
+#     print("Updating leaderboard table")
+#     return leaderboard_df[
+#         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+#         + shown_columns.value
+#         + [AutoEvalColumn.dummy.name]
+#     ] if not leaderboard_df.empty else leaderboard_df
+
+
+# def update_hidden_leaderboard_table():
+#     global original_df
+#     return original_df[COLS] if original_df.empty is False else original_df
+
+# def update_dataset_table():
+#     global dataset_df
+#     return dataset_df
 
+# def update_finish_table():
+#     global finished_eval_queue_df
+#     return finished_eval_queue_df
+
+# def update_running_table():
+#     global running_eval_queue_df
+#     return running_eval_queue_df
+
+# def update_pending_table():
+#     global pending_eval_queue_df
+#     return pending_eval_queue_df
+
+# def update_finish_num():
+#     global finished_eval_queue_df
+#     return len(finished_eval_queue_df)
+
+# def update_running_num():
+#     global running_eval_queue_df
+#     return len(running_eval_queue_df)
+
+# def update_pending_num():
+#     global pending_eval_queue_df
+#     return len(pending_eval_queue_df)
 
 # triggered only once at startup => read query parameter if it exists
 def load_query(request: gr.Request):
@@ -162,7 +228,7 @@ with demo:
             search_bar = gr.Textbox(
                 placeholder=" 🔍 Model search (separate multiple queries with `;`)",
                 show_label=False,
-                elem_id="search-bar",
+                elem_id="search-bar"
             )
             with gr.Row():
                 shown_columns = gr.CheckboxGroup(
@@ -215,18 +281,19 @@ with demo:
             # )
 
             # breakpoint()
-
+            benchmark_columns = add_benchmark_columns(shown_columns.value)
             leaderboard_table = gr.components.Dataframe(
                 value=(
                     leaderboard_df[
                         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                         + shown_columns.value
+                        + benchmark_columns
                        + [AutoEvalColumn.dummy.name]
                     ]
                     if leaderboard_df.empty is False
                     else leaderboard_df
                 ),
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + benchmark_columns,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
@@ -251,14 +318,14 @@ with demo:
                     filter_columns_size,
                     search_bar,
                 ],
-                leaderboard_table,
+                leaderboard_table
            )
 
            # Check query parameter once at startup and update search bar
            demo.load(load_query, inputs=[], outputs=[search_bar])
 
            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
-                selector.change(
+                selector.select(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
@@ -323,6 +390,15 @@ with demo:
                         value=None,
                         interactive=True,
                     )
+
+                    gpu_type = gr.Dropdown(
+                        choices=[t.to_str() for t in GPUType],
+                        label="GPU type",
+                        multiselect=False,
+                        value="NVIDIA-A100-PCIe-80GB",
+                        interactive=True,
+                    )
+
 
                 with gr.Row():
                     with gr.Column():
@@ -358,6 +434,7 @@ with demo:
 
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
+            debug = gr.Checkbox(value=args.debug, label="Debug", visible=False)
             submit_button.click(
                 add_new_eval,
                 [
@@ -369,6 +446,8 @@ with demo:
                     weight_type,
                     model_type,
                     inference_framework,
+                    debug,
+                    gpu_type
                 ],
                 submission_result,
             )
@@ -385,8 +464,7 @@ with demo:
 
 scheduler = BackgroundScheduler()
 
-scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
-
+scheduler.add_job(restart_space, "interval", hours=6)
 
 def launch_backend():
     import subprocess
@@ -395,8 +473,9 @@ def launch_backend():
     if DEVICE not in {"cpu"}:
         _ = subprocess.run(["python", "backend-cli.py"])
 
-
+# Thread(target=periodic_init, daemon=True).start()
 # scheduler.add_job(launch_backend, "interval", seconds=120)
-
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+if __name__ == "__main__":
+    scheduler.start()
+    demo.queue(default_concurrency_limit=40).launch()
+
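For reference, the column-expansion logic behind the unified display buttons can be exercised on its own. This is a minimal sketch: the column names are invented stand-ins, not the real `BENCHMARK_COLS`/`COLS` from `src.display.utils`; only the function body mirrors the hunk above.

```python
# Illustrative stand-ins for the real BENCHMARK_COLS / COLS in src.display.utils.
BENCHMARK_COLS = ["MMLU", "GSM8K"]
COLS = [
    "Model", "MMLU", "MMLU E2E(s)", "MMLU GPU Mem(G)",
    "GSM8K", "GSM8K E2E(s)", "GSM8K Decode T/s",
]

def add_benchmark_columns(shown_columns):
    # Same body as in the diff: every column that mentions a selected benchmark,
    # except the benchmark score column itself, gets pulled into the table.
    benchmark_columns = []
    for benchmark in BENCHMARK_COLS:
        if benchmark in shown_columns:
            for c in COLS:
                if benchmark in c and benchmark != c:
                    benchmark_columns.append(c)
    return benchmark_columns

# Ticking only the "MMLU" button also surfaces its hidden system-metric columns:
print(add_benchmark_columns(["Model", "MMLU"]))
# -> ['MMLU E2E(s)', 'MMLU GPU Mem(G)']
```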
backend-cli.py CHANGED
@@ -16,13 +16,13 @@ from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PAT
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
-from src.envs import QUEUE_REPO, RESULTS_REPO, API
+from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
 from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
 from typing import Optional
-
+import GPUtil
 import time
 
 import pprint
@@ -126,6 +126,9 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
+
+    if args.debug:
+        RESULTS_REPO = DEBUG_RESULTS_REPO
 
     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
@@ -364,9 +367,22 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
     return False
 
 
+def get_gpu_details():
+    gpus = GPUtil.getGPUs()
+    gpu = gpus[0]
+    name = gpu.name.replace(" ", "-")
+    # Convert memory from MB to GB and round to nearest whole number
+    memory_gb = round(gpu.memoryTotal / 1024)
+    memory = f"{memory_gb}GB"
+    formatted_name = f"{name}-{memory}"
+    return formatted_name
+
 def process_pending_requests() -> bool:
+    if args.debug:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+
     sanity_checks()
-
+    print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
@@ -385,6 +401,12 @@ def process_pending_requests() -> bool:
 
     eval_request = eval_requests[0]
     pp.pprint(eval_request)
+
+    gpu_type = eval_request.gpu_type
+    curr_gpu_type = get_gpu_details()
+    if gpu_type != curr_gpu_type:
+        print(f"GPU type mismatch: {gpu_type} vs {curr_gpu_type}")
+        return False
 
     my_snapshot_download(
         repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
@@ -426,6 +448,8 @@ def get_args():
     parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
+    parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
+                        help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
     return parser.parse_args()
 
 
@@ -454,8 +478,13 @@ if __name__ == "__main__":
                 status="",
                 json_filepath="",
                 precision=precision,  # Use precision from arguments
-                inference_framework=args.inference_framework  # Use inference framework from arguments
+                inference_framework=args.inference_framework,  # Use inference framework from arguments
+                gpu_type=args.gpu_type
             )
+            curr_gpu_type = get_gpu_details()
+            if eval_request.gpu_type != curr_gpu_type:
+                print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
+                raise Exception("GPU type mismatch")
             results = process_evaluation(task, eval_request, limit=args.limit)
         except Exception as e:
             print(f"debug running error: {e}")
requirements.txt CHANGED
@@ -27,6 +27,7 @@ cchardet
 rouge_score
 bert-score
 evaluate
-spacy
+spacy==3.7.4
 selfcheckgpt
 immutabledict
+gputil
src/backend/manage_requests.py CHANGED
@@ -28,6 +28,7 @@ class EvalRequest:
     params: Optional[int] = None
     license: Optional[str] = ""
     batch_size: Optional[int] = 1
+    gpu_type: Optional[str] = "NVIDIA-A100-PCIe-80GB"
 
     def get_model_args(self) -> str:
         model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"  # ,max_length=4096"
src/display/utils.py CHANGED
@@ -104,16 +104,16 @@ auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnConten
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True)])
-    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False)])
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True)])
+    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
 
 
 # Model information
@@ -140,6 +140,7 @@ class EvalQueueColumn:  # Queue column
     private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
     weight_type = ColumnContent("weight_type", "str", "Original")
+    model_framework = ColumnContent("inference_framework", "str", True)
     status = ColumnContent("status", "str", True)
 
 
@@ -189,7 +190,25 @@ class InferenceFramework(Enum):
             return InferenceFramework.HF_Chat
         return InferenceFramework.Unknown
 
+class GPUType(Enum):
+    H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB")
+    A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
+    A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
+    Unknown = ModelDetails("?")
 
+    def to_str(self):
+        return self.value.name
+
+    @staticmethod
+    def from_str(gpu_type: str):
+        if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
+            return GPUType.A100_pcie
+        if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
+            return GPUType.H100_pcie
+        if gpu_type in ["NVIDIA-A5000-24GB"]:
+            return GPUType.A5000
+        return GPUType.Unknown
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -223,8 +242,8 @@ class Precision(Enum):
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
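The choices offered by the new GPU-type dropdown in app.py come straight from `to_str()`. A small sketch, assuming `ModelDetails` keeps the display string in its `name` field, as the existing `to_str()` implementations suggest:

```python
from src.display.utils import GPUType

# Strings rendered in the app.py dropdown (one per enum member, in definition order):
print([t.to_str() for t in GPUType])
# ['NVIDIA-H100-PCIe-80GB', 'NVIDIA-A100-PCIe-80GB', 'NVIDIA-RTX-A5000-24GB', '?']
```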
src/envs.py CHANGED
@@ -12,8 +12,8 @@ QUEUE_REPO = "sparse-generative-ai/requests"
 QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests"
 RESULTS_REPO = "sparse-generative-ai/results"
 
-PRIVATE_QUEUE_REPO = "sparse-generative-ai/private-requests"
-PRIVATE_RESULTS_REPO = "sparse-generative-ai/private-results"
+DEBUG_QUEUE_REPO = "sparse-generative-ai/debug_requests"
+DEBUG_RESULTS_REPO = "sparse-generative-ai/debug_results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 
src/populate.py CHANGED
@@ -95,6 +95,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, p
 
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-")
 
             all_evals.append(data)
         elif ".md" not in entry:
@@ -107,6 +108,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, p
 
                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-")
                 all_evals.append(data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
src/submission/check_validity.py CHANGED
@@ -130,7 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                 continue
             with open(os.path.join(root, file), "r") as f:
                 info = json.load(f)
-                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}")
+                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
 
             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
src/submission/submit.py CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA, DEBUG_QUEUE_REPO
 from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 from src.submission.check_validity import (
     already_submitted_models,
@@ -26,12 +26,17 @@ def add_new_eval(
     weight_type: str,
     model_type: str,
     inference_framework: str,
+    debug: bool = False,
+    gpu_type: str = "NVIDIA-A100-PCIe-80GB",
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+    if debug:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+
     user_name = ""
     model_path = model
     if "/" in model:
@@ -110,17 +115,18 @@
         "params": model_size,
         "license": license,
         "inference_framework": inference_framework,
+        "gpu_type": gpu_type
     }
 
     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}_{inference_framework}" in REQUESTED_MODELS:
+    if f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
 
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
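To make the new bookkeeping concrete, here is the shape of the duplicate-submission key and request filename once gpu_type is folded in; all values below are hypothetical.

```python
model, model_path, revision = "org/some-model", "some-model", "main"
precision, weight_type = "float16", "Original"
inference_framework, gpu_type = "hf-chat", "NVIDIA-A100-PCIe-80GB"
private = False

# Key checked against REQUESTED_MODELS (and built the same way in check_validity.py):
dedup_key = f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}"
# Filename written under EVAL_REQUESTS_PATH/<user_name>/:
out_name = f"{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json"

print(dedup_key)  # org/some-model_main_float16_hf-chat_NVIDIA-A100-PCIe-80GB
print(out_name)   # some-model_eval_request_False_float16_Original_hf-chat_NVIDIA-A100-PCIe-80GB.json
```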