Spaces:
Running
Running
BenchmarkBot
committed on
Commit
·
67cbded
1
Parent(s):
bf0a261
made scores clickable
Browse files- app.py +9 -30
- src/assets/css_html_js.py +0 -36
- src/assets/text_content.py +7 -9
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import os
|
2 |
-
import json
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
@@ -21,7 +20,7 @@ COLUMNS_MAPPING = {
|
|
21 |
"forward.peak_memory(MB)": "Peak Memory (MB) β¬οΈ",
|
22 |
"generate.throughput(tokens/s)": "Throughput (tokens/s) β¬οΈ",
|
23 |
}
|
24 |
-
COLUMNS_DATATYPES = ["markdown", "str", "str", "
|
25 |
SORTING_COLUMN = ["Throughput (tokens/s) β¬οΈ"]
|
26 |
|
27 |
|
@@ -39,8 +38,8 @@ def get_benchmark_df(benchmark):
|
|
39 |
scores_df = pd.read_csv(
|
40 |
f"./llm-perf-dataset/reports/average_scores.csv")
|
41 |
bench_df = bench_df.merge(scores_df, on="model", how="left")
|
42 |
-
|
43 |
-
|
44 |
|
45 |
# preprocess
|
46 |
bench_df["model"] = bench_df["model"].apply(make_clickable_model)
|
@@ -54,33 +53,19 @@ def get_benchmark_df(benchmark):
|
|
54 |
return bench_df
|
55 |
|
56 |
|
57 |
-
# def change_tab(query_param):
|
58 |
-
# query_param = query_param.replace("'", '"')
|
59 |
-
# query_param = json.loads(query_param)
|
60 |
-
|
61 |
-
# if (
|
62 |
-
# isinstance(query_param, dict)
|
63 |
-
# and "tab" in query_param
|
64 |
-
# and query_param["tab"] == "evaluation"
|
65 |
-
# ):
|
66 |
-
# return gr.Tabs.update(selected=1)
|
67 |
-
# else:
|
68 |
-
# return gr.Tabs.update(selected=0)
|
69 |
-
|
70 |
-
|
71 |
def submit_query(text, backends, datatypes, threshold, raw_df):
|
72 |
|
73 |
# extract the average score (float) from the clickable score (clickable markdown)
|
74 |
-
|
75 |
-
|
76 |
filtered_df = raw_df[
|
77 |
raw_df["Model π€"].str.lower().str.contains(text.lower()) &
|
78 |
raw_df["Backend π"].isin(backends) &
|
79 |
raw_df["Datatype π₯"].isin(datatypes) &
|
80 |
(raw_df["Average H4 Score β¬οΈ"] >= threshold)
|
81 |
]
|
82 |
-
|
83 |
-
|
84 |
|
85 |
return filtered_df
|
86 |
|
@@ -91,6 +76,7 @@ with demo:
|
|
91 |
gr.HTML(TITLE)
|
92 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
93 |
|
|
|
94 |
with gr.Row():
|
95 |
search_bar = gr.Textbox(
|
96 |
label="Model π€",
|
@@ -127,6 +113,7 @@ with demo:
|
|
127 |
elem_id="submit-button",
|
128 |
)
|
129 |
|
|
|
130 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
131 |
with gr.TabItem("π₯οΈ A100-80GB Benchmark ποΈ", elem_id="A100-benchmark", id=0):
|
132 |
gr.HTML(SINGLE_A100_TEXT)
|
@@ -166,14 +153,6 @@ with demo:
|
|
166 |
elem_id="citation-button",
|
167 |
).style(show_copy_button=True)
|
168 |
|
169 |
-
# dummy = gr.Textbox(visible=False)
|
170 |
-
# demo.load(
|
171 |
-
# change_tab,
|
172 |
-
# dummy,
|
173 |
-
# tabs,
|
174 |
-
# _js=get_window_url_params,
|
175 |
-
# )
|
176 |
-
|
177 |
# Restart space every hour
|
178 |
scheduler = BackgroundScheduler()
|
179 |
scheduler.add_job(restart_space, "interval", seconds=3600,
|
|
|
1 |
import os
|
|
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
20 |
"forward.peak_memory(MB)": "Peak Memory (MB) β¬οΈ",
|
21 |
"generate.throughput(tokens/s)": "Throughput (tokens/s) β¬οΈ",
|
22 |
}
|
23 |
+
COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
|
24 |
SORTING_COLUMN = ["Throughput (tokens/s) β¬οΈ"]
|
25 |
|
26 |
|
|
|
38 |
scores_df = pd.read_csv(
|
39 |
f"./llm-perf-dataset/reports/average_scores.csv")
|
40 |
bench_df = bench_df.merge(scores_df, on="model", how="left")
|
41 |
+
bench_df["average"] = bench_df["average"].apply(
|
42 |
+
make_clickable_score)
|
43 |
|
44 |
# preprocess
|
45 |
bench_df["model"] = bench_df["model"].apply(make_clickable_model)
|
|
|
53 |
return bench_df
|
54 |
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def submit_query(text, backends, datatypes, threshold, raw_df):
|
57 |
|
58 |
# extract the average score (float) from the clickable score (clickable markdown)
|
59 |
+
raw_df["Average H4 Score β¬οΈ"] = raw_df["Average H4 Score β¬οΈ"].apply(
|
60 |
+
extract_score_from_clickable)
|
61 |
filtered_df = raw_df[
|
62 |
raw_df["Model π€"].str.lower().str.contains(text.lower()) &
|
63 |
raw_df["Backend π"].isin(backends) &
|
64 |
raw_df["Datatype π₯"].isin(datatypes) &
|
65 |
(raw_df["Average H4 Score β¬οΈ"] >= threshold)
|
66 |
]
|
67 |
+
filtered_df["Average H4 Score β¬οΈ"] = filtered_df["Average H4 Score β¬οΈ"].apply(
|
68 |
+
make_clickable_score)
|
69 |
|
70 |
return filtered_df
|
71 |
|
|
|
76 |
gr.HTML(TITLE)
|
77 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
78 |
|
79 |
+
# controls
|
80 |
with gr.Row():
|
81 |
search_bar = gr.Textbox(
|
82 |
label="Model π€",
|
|
|
113 |
elem_id="submit-button",
|
114 |
)
|
115 |
|
116 |
+
# leaderboard tabs
|
117 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
118 |
with gr.TabItem("π₯οΈ A100-80GB Benchmark ποΈ", elem_id="A100-benchmark", id=0):
|
119 |
gr.HTML(SINGLE_A100_TEXT)
|
|
|
153 |
elem_id="citation-button",
|
154 |
).style(show_copy_button=True)
|
155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
# Restart space every hour
|
157 |
scheduler = BackgroundScheduler()
|
158 |
scheduler.add_job(restart_space, "interval", seconds=3600,
|
src/assets/css_html_js.py
CHANGED
@@ -1,12 +1,4 @@
|
|
1 |
custom_css = """
|
2 |
-
#changelog-text {
|
3 |
-
font-size: 16px !important;
|
4 |
-
}
|
5 |
-
|
6 |
-
#changelog-text h2 {
|
7 |
-
font-size: 18px !important;
|
8 |
-
}
|
9 |
-
|
10 |
.markdown-text {
|
11 |
font-size: 16px !important;
|
12 |
}
|
@@ -28,26 +20,11 @@ custom_css = """
|
|
28 |
transform: scale(1.3);
|
29 |
}
|
30 |
|
31 |
-
#leaderboard-table {
|
32 |
-
margin-top: 15px
|
33 |
-
}
|
34 |
-
|
35 |
-
#leaderboard-table-lite {
|
36 |
-
margin-top: 15px
|
37 |
-
}
|
38 |
-
|
39 |
#search-bar-table-box > div:first-child {
|
40 |
background: none;
|
41 |
border: none;
|
42 |
}
|
43 |
|
44 |
-
|
45 |
-
/* Hides the final AutoEvalColumn */
|
46 |
-
#llm-benchmark-tab-table table td:last-child,
|
47 |
-
#llm-benchmark-tab-table table th:last-child {
|
48 |
-
display: none;
|
49 |
-
}
|
50 |
-
|
51 |
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
52 |
table td:first-child,
|
53 |
table th:first-child {
|
@@ -59,19 +36,6 @@ table th:first-child {
|
|
59 |
.tab-buttons button {
|
60 |
font-size: 20px;
|
61 |
}
|
62 |
-
|
63 |
-
#scale-logo {
|
64 |
-
border-style: none !important;
|
65 |
-
box-shadow: none;
|
66 |
-
display: block;
|
67 |
-
margin-left: auto;
|
68 |
-
margin-right: auto;
|
69 |
-
max-width: 600px;
|
70 |
-
}
|
71 |
-
|
72 |
-
#scale-logo .download {
|
73 |
-
display: none;
|
74 |
-
}
|
75 |
"""
|
76 |
|
77 |
get_window_url_params = """
|
|
|
1 |
custom_css = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
.markdown-text {
|
3 |
font-size: 16px !important;
|
4 |
}
|
|
|
20 |
transform: scale(1.3);
|
21 |
}
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
#search-bar-table-box > div:first-child {
|
24 |
background: none;
|
25 |
border: none;
|
26 |
}
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
29 |
table td:first-child,
|
30 |
table th:first-child {
|
|
|
36 |
.tab-buttons button {
|
37 |
font-size: 20px;
|
38 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
"""
|
40 |
|
41 |
get_window_url_params = """
|
src/assets/text_content.py
CHANGED
@@ -1,22 +1,20 @@
|
|
1 |
TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
|
2 |
|
3 |
INTRODUCTION_TEXT = f"""
|
4 |
-
The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs)
|
5 |
-
Anyone from the community can request a model or a hardware+backend configuration for automated benchmarking:
|
6 |
-
- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ once they're publicly available.
|
7 |
-
- Hardware+Backend requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions).
|
8 |
|
9 |
-
|
|
|
|
|
10 |
"""
|
11 |
|
12 |
-
SINGLE_A100_TEXT = """<h3>Single-GPU (1xA100):</h3>
|
13 |
<ul>
|
14 |
<li>Singleton Batch (1)</li>
|
15 |
<li>Thousand Tokens (1000)</li>
|
16 |
</ul>
|
17 |
"""
|
18 |
|
19 |
-
|
20 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
21 |
CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
|
22 |
author = {Ilyas Moutawwakil},
|
@@ -25,8 +23,8 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
|
|
25 |
publisher = {Hugging Face},
|
26 |
howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
|
27 |
@software{optimum-benchmark,
|
28 |
-
author
|
29 |
publisher = {Hugging Face},
|
30 |
-
title
|
31 |
}
|
32 |
"""
|
|
|
1 |
TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
|
2 |
|
3 |
INTRODUCTION_TEXT = f"""
|
4 |
+
The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
|
|
|
|
|
|
|
5 |
|
6 |
+
Anyone from the community can request a model or a hardware+backend+optimization configuration for automated benchmarking:
|
7 |
+
- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically once they're publicly available. That's mostly because we don't want to benchmark models that don't have an evaluation score yet.
|
8 |
+
- Hardware+Backend+Optimization requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
|
9 |
"""
|
10 |
|
11 |
+
SINGLE_A100_TEXT = """<h3>Single-GPU Benchmarks (1xA100):</h3>
|
12 |
<ul>
|
13 |
<li>Singleton Batch (1)</li>
|
14 |
<li>Thousand Tokens (1000)</li>
|
15 |
</ul>
|
16 |
"""
|
17 |
|
|
|
18 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
19 |
CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
|
20 |
author = {Ilyas Moutawwakil},
|
|
|
23 |
publisher = {Hugging Face},
|
24 |
howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
|
25 |
@software{optimum-benchmark,
|
26 |
+
author = {Ilyas Moutawwakil},
|
27 |
publisher = {Hugging Face},
|
28 |
+
title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.},
|
29 |
}
|
30 |
"""
|