display button unification for benchmarks
#24 by zhiminy - opened
- app.py +97 -18
- backend-cli.py +33 -4
- requirements.txt +2 -1
- src/backend/manage_requests.py +1 -0
- src/display/utils.py +29 -10
- src/envs.py +2 -2
- src/populate.py +2 -0
- src/submission/check_validity.py +1 -1
- src/submission/submit.py +9 -3
app.py CHANGED

@@ -2,10 +2,11 @@
 import os
 import datetime
 import socket
+from threading import Thread

 import gradio as gr
 import pandas as pd
-
+import time
 from apscheduler.schedulers.background import BackgroundScheduler

 from huggingface_hub import snapshot_download
@@ -35,13 +36,27 @@ from src.display.utils import (
     fields,
     WeightType,
     Precision,
+    GPUType
 )

-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC,
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
+    QUEUE_REPO, REPO_ID, RESULTS_REPO, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.utils import get_dataset_summary_table

+def get_args():
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run the LLM Leaderboard")
+    parser.add_argument("--debug", action="store_true", help="Run in debug mode")
+    return parser.parse_args()
+
+args = get_args()
+if args.debug:
+    print("Running in debug mode")
+    QUEUE_REPO = DEBUG_QUEUE_REPO
+    RESULTS_REPO = DEBUG_RESULTS_REPO

 def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
     try:
@@ -75,17 +90,13 @@ def init_space():
     )
     return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

-
-dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
-leaderboard_df = original_df.copy()
-
-
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
     filtered_df = filter_queries(query, filtered_df)
+    columns.extend(add_benchmark_columns(columns))
     df = select_columns(filtered_df, columns)
     return df

@@ -141,7 +152,62 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     # filtered_df = filtered_df.loc[mask]

     return filtered_df
+
+
+def add_benchmark_columns(shown_columns):
+    benchmark_columns = []
+    for benchmark in BENCHMARK_COLS:
+        if benchmark in shown_columns:
+            for c in COLS:
+                if benchmark in c and benchmark != c:
+                    benchmark_columns.append(c)
+    return benchmark_columns
+
+shown_columns = None
+dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+leaderboard_df = original_df.copy()
+
+# def update_leaderboard_table():
+#     global leaderboard_df, shown_columns
+#     print("Updating leaderboard table")
+#     return leaderboard_df[
+#         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+#         + shown_columns.value
+#         + [AutoEvalColumn.dummy.name]
+#     ] if not leaderboard_df.empty else leaderboard_df
+
+
+# def update_hidden_leaderboard_table():
+#     global original_df
+#     return original_df[COLS] if original_df.empty is False else original_df
+
+# def update_dataset_table():
+#     global dataset_df
+#     return dataset_df

+# def update_finish_table():
+#     global finished_eval_queue_df
+#     return finished_eval_queue_df
+
+# def update_running_table():
+#     global running_eval_queue_df
+#     return running_eval_queue_df
+
+# def update_pending_table():
+#     global pending_eval_queue_df
+#     return pending_eval_queue_df
+
+# def update_finish_num():
+#     global finished_eval_queue_df
+#     return len(finished_eval_queue_df)
+
+# def update_running_num():
+#     global running_eval_queue_df
+#     return len(running_eval_queue_df)
+
+# def update_pending_num():
+#     global pending_eval_queue_df
+#     return len(pending_eval_queue_df)

 # triggered only once at startup => read query parameter if it exists
 def load_query(request: gr.Request):
@@ -162,7 +228,7 @@ with demo:
             search_bar = gr.Textbox(
                 placeholder=" 🔍 Model search (separate multiple queries with `;`)",
                 show_label=False,
-                elem_id="search-bar"
+                elem_id="search-bar"
             )
             with gr.Row():
                 shown_columns = gr.CheckboxGroup(
@@ -215,18 +281,19 @@ with demo:
             # )

             # breakpoint()
-
+            benchmark_columns = add_benchmark_columns(shown_columns.value)
             leaderboard_table = gr.components.Dataframe(
                 value=(
                     leaderboard_df[
                         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                         + shown_columns.value
+                        + benchmark_columns
                         + [AutoEvalColumn.dummy.name]
                     ]
                     if leaderboard_df.empty is False
                     else leaderboard_df
                 ),
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + benchmark_columns,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
@@ -251,14 +318,14 @@ with demo:
                    filter_columns_size,
                    search_bar,
                ],
-               leaderboard_table
+               leaderboard_table
            )

            # Check query parameter once at startup and update search bar
            demo.load(load_query, inputs=[], outputs=[search_bar])

            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
-               selector.
+               selector.select(
                    update_table,
                    [
                        hidden_leaderboard_table_for_search,
@@ -323,6 +390,15 @@ with demo:
                         value=None,
                         interactive=True,
                     )
+
+                    gpu_type = gr.Dropdown(
+                        choices=[t.to_str() for t in GPUType],
+                        label="GPU type",
+                        multiselect=False,
+                        value="NVIDIA-A100-PCIe-80GB",
+                        interactive=True,
+                    )
+

                     with gr.Row():
                         with gr.Column():
@@ -358,6 +434,7 @@ with demo:

                     submit_button = gr.Button("Submit Eval")
                     submission_result = gr.Markdown()
+                    debug = gr.Checkbox(value=args.debug, label="Debug", visible=False)
                     submit_button.click(
                         add_new_eval,
                         [
@@ -369,6 +446,8 @@ with demo:
                             weight_type,
                             model_type,
                             inference_framework,
+                            debug,
+                            gpu_type
                         ],
                         submission_result,
                     )
@@ -385,8 +464,7 @@

 scheduler = BackgroundScheduler()

-scheduler.add_job(restart_space, "interval",
-
+scheduler.add_job(restart_space, "interval", hours=6)

 def launch_backend():
     import subprocess
@@ -395,8 +473,9 @@ def launch_backend():
     if DEVICE not in {"cpu"}:
         _ = subprocess.run(["python", "backend-cli.py"])

-
+# Thread(target=periodic_init, daemon=True).start()
 # scheduler.add_job(launch_backend, "interval", seconds=120)
-
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+if __name__ == "__main__":
+    scheduler.start()
+    demo.queue(default_concurrency_limit=40).launch()
+
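A minimal, self-contained sketch of what the new add_benchmark_columns helper does (the toy column names below are illustrative stand-ins, not the leaderboard's real BENCHMARK_COLS/COLS): selecting a benchmark's display button also pulls in that benchmark's extra per-metric columns, which is the "unification" this PR implements.

# Illustrative stand-ins for the leaderboard's BENCHMARK_COLS / COLS globals.
BENCHMARK_COLS = ["MMLU"]
COLS = ["Model", "MMLU", "MMLU E2E(s)", "MMLU Batch", "MMLU GPU-Util(%)", "Precision"]

def add_benchmark_columns(shown_columns):
    # Same logic as the helper added in app.py: for each selected benchmark,
    # collect every other column whose name contains the benchmark's name.
    benchmark_columns = []
    for benchmark in BENCHMARK_COLS:
        if benchmark in shown_columns:
            for c in COLS:
                if benchmark in c and benchmark != c:
                    benchmark_columns.append(c)
    return benchmark_columns

print(add_benchmark_columns(["Model", "MMLU"]))
# -> ['MMLU E2E(s)', 'MMLU Batch', 'MMLU GPU-Util(%)']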
backend-cli.py CHANGED

@@ -16,13 +16,13 @@ from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PAT
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult

-from src.envs import QUEUE_REPO, RESULTS_REPO, API
+from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
 from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus

 from src.leaderboard.read_evals import get_raw_eval_results

 from typing import Optional
-
+import GPUtil
 import time

 import pprint
@@ -126,6 +126,9 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
+
+    if args.debug:
+        RESULTS_REPO = DEBUG_RESULTS_REPO

     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
@@ -364,9 +367,22 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
     return False


+def get_gpu_details():
+    gpus = GPUtil.getGPUs()
+    gpu = gpus[0]
+    name = gpu.name.replace(" ", "-")
+    # Convert memory from MB to GB and round to nearest whole number
+    memory_gb = round(gpu.memoryTotal / 1024)
+    memory = f"{memory_gb}GB"
+    formatted_name = f"{name}-{memory}"
+    return formatted_name
+
 def process_pending_requests() -> bool:
+    if args.debug:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+
     sanity_checks()
-
+    print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]

     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
@@ -385,6 +401,12 @@ def process_pending_requests() -> bool:

     eval_request = eval_requests[0]
     pp.pprint(eval_request)
+
+    gpu_type = eval_request.gpu_type
+    curr_gpu_type = get_gpu_details()
+    if gpu_type != curr_gpu_type:
+        print(f"GPU type mismatch: {gpu_type} vs {curr_gpu_type}")
+        return False

     my_snapshot_download(
         repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
@@ -426,6 +448,8 @@ def get_args():
     parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
+    parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
+                        help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
     return parser.parse_args()


@@ -454,8 +478,13 @@ if __name__ == "__main__":
             status="",
             json_filepath="",
             precision=precision,  # Use precision from arguments
-            inference_framework=args.inference_framework  # Use inference framework from arguments
+            inference_framework=args.inference_framework,  # Use inference framework from arguments
+            gpu_type=args.gpu_type
         )
+        curr_gpu_type = get_gpu_details()
+        if eval_request.gpu_type != curr_gpu_type:
+            print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
+            raise Exception("GPU type mismatch")
         results = process_evaluation(task, eval_request, limit=args.limit)
     except Exception as e:
         print(f"debug running error: {e}")
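The GPU gating above relies on the gputil package added to requirements.txt. A rough usage sketch, assuming a machine with at least one NVIDIA GPU visible to GPUtil (the requested type below is a made-up example value, standing in for eval_request.gpu_type):

import GPUtil

def get_gpu_details():
    # Mirrors the helper added in backend-cli.py:
    # "<name with spaces replaced by dashes>-<total memory rounded to GB>GB",
    # e.g. "NVIDIA-A100-PCIe-80GB".
    gpu = GPUtil.getGPUs()[0]
    name = gpu.name.replace(" ", "-")
    memory_gb = round(gpu.memoryTotal / 1024)  # GPUtil reports memoryTotal in MB
    return f"{name}-{memory_gb}GB"

requested_gpu_type = "NVIDIA-A100-PCIe-80GB"  # stand-in for eval_request.gpu_type

if get_gpu_details() != requested_gpu_type:
    # The backend skips pending requests (or raises in debug mode) when the
    # requested GPU type does not match the GPU it is actually running on.
    print(f"GPU type mismatch: {requested_gpu_type} vs {get_gpu_details()}")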
requirements.txt CHANGED

@@ -27,6 +27,7 @@ cchardet
 rouge_score
 bert-score
 evaluate
-spacy
+spacy==3.7.4
 selfcheckgpt
 immutabledict
+gputil
src/backend/manage_requests.py CHANGED

@@ -28,6 +28,7 @@ class EvalRequest:
     params: Optional[int] = None
     license: Optional[str] = ""
     batch_size: Optional[int] = 1
+    gpu_type: Optional[str] = "NVIDIA-A100-PCIe-80GB"

     def get_model_args(self) -> str:
         model_args = f"pretrained={self.model},revision={self.revision},parallelize=True"  # ,max_length=4096"
src/display/utils.py CHANGED

@@ -104,16 +104,16 @@ auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnConten
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True)])
-    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False)])
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True)])
+    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])


 # Model information
@@ -140,6 +140,7 @@ class EvalQueueColumn: # Queue column
     private = ColumnContent("private", "bool", True)
     precision = ColumnContent("precision", "str", True)
     weight_type = ColumnContent("weight_type", "str", "Original")
+    model_framework = ColumnContent("inference_framework", "str", True)
     status = ColumnContent("status", "str", True)


@@ -189,7 +190,25 @@ class InferenceFramework(Enum):
             return InferenceFramework.HF_Chat
         return InferenceFramework.Unknown

+class GPUType(Enum):
+    H100_pcie = ModelDetails("NVIDIA-H100-PCIe-80GB")
+    A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
+    A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
+    Unknown = ModelDetails("?")

+    def to_str(self):
+        return self.value.name
+
+    @staticmethod
+    def from_str(gpu_type: str):
+        if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
+            return GPUType.A100_pcie
+        if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
+            return GPUType.H100_pcie
+        if gpu_type in ["NVIDIA-A5000-24GB"]:
+            return GPUType.A5000
+        return GPUType.Unknown
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -223,8 +242,8 @@ class Precision(Enum):


 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)
-TYPES = [c.type for c in fields(AutoEvalColumn)
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]

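The hidden=True flag on the per-benchmark metric columns is what makes the dropped filter on COLS/TYPES significant: COLS must keep the hidden metric columns so add_benchmark_columns can find them, while COLS_LITE still keeps them out of the checkbox group. A simplified sketch with an assumed ColumnContent-like dataclass (the real class and its field names are not shown in this diff):

from dataclasses import dataclass

@dataclass
class ColumnContent:  # simplified stand-in, field names assumed
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

columns = [
    ColumnContent("MMLU", "number", True),                      # benchmark score (a display button)
    ColumnContent("MMLU E2E(s)", "number", True, hidden=True),  # hidden per-benchmark metric
]

COLS = [c.name for c in columns]                                                  # ['MMLU', 'MMLU E2E(s)']
COLS_LITE = [c.name for c in columns if c.displayed_by_default and not c.hidden]  # ['MMLU']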
src/envs.py CHANGED

@@ -12,8 +12,8 @@ QUEUE_REPO = "sparse-generative-ai/requests"
 QUEUE_REPO_OPEN_LLM = "open-llm-leaderboard/requests"
 RESULTS_REPO = "sparse-generative-ai/results"

-
-
+DEBUG_QUEUE_REPO = "sparse-generative-ai/debug_requests"
+DEBUG_RESULTS_REPO = "sparse-generative-ai/debug_results"

 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))

src/populate.py CHANGED

@@ -95,6 +95,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, p

             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-")

             all_evals.append(data)
         elif ".md" not in entry:
@@ -107,6 +108,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, p

                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                data[EvalQueueColumn.model_framework.name] = data.get("inference_framework", "-")
                 all_evals.append(data)

     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
src/submission/check_validity.py CHANGED

@@ -130,7 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                 continue
             with open(os.path.join(root, file), "r") as f:
                 info = json.load(f)
-                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}")
+                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")

             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
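Taken together with the submit.py change below, the GPU type becomes part of both the duplicate-submission key and the request-file name. A small illustration with made-up submission values (only the string formats come from this PR):

# Hypothetical submission values, for illustration only.
model, revision, precision = "org/model-7b", "main", "float16"
weight_type, inference_framework, gpu_type = "Original", "hf-chat", "NVIDIA-A100-PCIe-80GB"
model_path, private = "model-7b", False
EVAL_REQUESTS_PATH, user_name = "./eval-queue", "org"

# Key checked against REQUESTED_MODELS to reject duplicate submissions.
dedup_key = f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}"
# -> "org/model-7b_main_float16_hf-chat_NVIDIA-A100-PCIe-80GB"

# Request file written by add_new_eval and later re-read by already_submitted_models.
out_path = f"{EVAL_REQUESTS_PATH}/{user_name}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json"
# -> "./eval-queue/org/model-7b_eval_request_False_float16_Original_hf-chat_NVIDIA-A100-PCIe-80GB.json"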
src/submission/submit.py CHANGED

@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone

 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA, DEBUG_QUEUE_REPO
 from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 from src.submission.check_validity import (
     already_submitted_models,
@@ -26,12 +26,17 @@ def add_new_eval(
     weight_type: str,
     model_type: str,
     inference_framework: str,
+    debug: bool = False,
+    gpu_type: str = "NVIDIA-A100-PCIe-80GB",
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

+    if debug:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+
     user_name = ""
     model_path = model
     if "/" in model:
@@ -110,17 +115,18 @@ def add_new_eval(
         "params": model_size,
         "license": license,
         "inference_framework": inference_framework,
+        "gpu_type": gpu_type
     }

     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}_{inference_framework}" in REQUESTED_MODELS:
+    if f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")

     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}.json"
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))