BenchmarkBot committed on
Commit
8e8c463
·
1 Parent(s): 67cbded

added plot

Browse files
Files changed (3) hide show
  1. app.py +106 -39
  2. src/assets/text_content.py +1 -1
  3. src/utils.py +2 -1
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import gradio as gr
3
  import pandas as pd
@@ -16,9 +17,9 @@ COLUMNS_MAPPING = {
16
  "model": "Model πŸ€—",
17
  "backend.name": "Backend 🏭",
18
  "backend.torch_dtype": "Datatype πŸ“₯",
19
- "average": "Average H4 Score ⬆️",
20
  "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
21
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
 
22
  }
23
  COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
24
  SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
@@ -33,16 +34,14 @@ def get_benchmark_df(benchmark):
33
 
34
  # load
35
  bench_df = pd.read_csv(
36
- f"./llm-perf-dataset/reports/{benchmark}/inference_report.csv")
37
-
38
  scores_df = pd.read_csv(
39
- f"./llm-perf-dataset/reports/average_scores.csv")
40
  bench_df = bench_df.merge(scores_df, on="model", how="left")
41
- bench_df["average"] = bench_df["average"].apply(
42
- make_clickable_score)
43
 
44
  # preprocess
45
  bench_df["model"] = bench_df["model"].apply(make_clickable_model)
 
46
  # filter
47
  bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
48
  # rename
@@ -53,55 +52,98 @@ def get_benchmark_df(benchmark):
53
  return bench_df
54
 
55
 
56
- def submit_query(text, backends, datatypes, threshold, raw_df):
 
57
 
58
- # extract the average score (float) from the clickable score (clickable markdown)
59
- raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
60
- extract_score_from_clickable)
61
- filtered_df = raw_df[
62
- raw_df["Model πŸ€—"].str.lower().str.contains(text.lower()) &
63
- raw_df["Backend 🏭"].isin(backends) &
64
- raw_df["Datatype πŸ“₯"].isin(datatypes) &
65
- (raw_df["Average H4 Score ⬆️"] >= threshold)
66
- ]
67
- filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
68
- make_clickable_score)
69
 
70
- return filtered_df
 
 
71
 
 
 
 
 
 
 
72
 
73
- # Define demo interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  demo = gr.Blocks(css=custom_css)
75
  with demo:
 
76
  gr.HTML(TITLE)
 
 
77
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
78
 
79
- # controls
 
 
 
80
  with gr.Row():
81
  search_bar = gr.Textbox(
82
  label="Model πŸ€—",
83
- info="Search for a model name",
84
  elem_id="search-bar",
85
  )
86
  backend_checkboxes = gr.CheckboxGroup(
87
  label="Backends 🏭",
88
  choices=["pytorch", "onnxruntime"],
89
  value=["pytorch", "onnxruntime"],
90
- info="Select the backends",
91
  elem_id="backend-checkboxes",
92
  )
93
  datatype_checkboxes = gr.CheckboxGroup(
94
  label="Datatypes πŸ“₯",
95
  choices=["float32", "float16"],
96
  value=["float32", "float16"],
97
- info="Select the load datatypes",
98
  elem_id="datatype-checkboxes",
99
  )
100
-
101
- with gr.Row():
102
  threshold_slider = gr.Slider(
103
  label="Average H4 Score πŸ“ˆ",
104
- info="Filter by minimum average H4 score",
105
  value=0.0,
106
  elem_id="threshold-slider",
107
  )
@@ -109,16 +151,14 @@ with demo:
109
  with gr.Row():
110
  submit_button = gr.Button(
111
  value="Submit πŸš€",
112
- info="Submit the filters",
113
  elem_id="submit-button",
114
  )
115
 
116
  # leaderboard tabs
117
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
118
- with gr.TabItem("πŸ–₯️ A100-80GB Benchmark πŸ‹οΈ", elem_id="A100-benchmark", id=0):
119
  gr.HTML(SINGLE_A100_TEXT)
120
 
121
- single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
122
  # Original leaderboard table
123
  single_A100_leaderboard = gr.components.Dataframe(
124
  value=single_A100_df,
@@ -135,15 +175,15 @@ with demo:
135
  visible=False,
136
  )
137
 
138
- # Callbacks
139
- submit_button.click(
140
- submit_query,
141
- [
142
- search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider,
143
- single_A100_for_search
144
- ],
145
- [single_A100_leaderboard]
146
- )
147
 
148
  with gr.Row():
149
  with gr.Accordion("πŸ“™ Citation", open=False):
@@ -153,6 +193,33 @@ with demo:
153
  elem_id="citation-button",
154
  ).style(show_copy_button=True)
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  # Restart space every hour
157
  scheduler = BackgroundScheduler()
158
  scheduler.add_job(restart_space, "interval", seconds=3600,
 
1
+ import plotly.express as px
2
  import os
3
  import gradio as gr
4
  import pandas as pd
 
17
  "model": "Model πŸ€—",
18
  "backend.name": "Backend 🏭",
19
  "backend.torch_dtype": "Datatype πŸ“₯",
 
20
  "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
21
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
22
+ "h4_score": "H4 Score ⬆️",
23
  }
24
# Positional datatypes for the COLUMNS_MAPPING columns, in mapping order
# (assumed to be consumed positionally by the leaderboard table -- confirm at
# the Dataframe call site). "model" and "h4_score" are rewritten into clickable
# markup by get_benchmark_df, so they are "markdown"; the two measurements in
# between are plain numbers. "h4_score" is the LAST mapping entry, so its
# "markdown" slot must be last as well.
COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "markdown"]
25
  SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
 
34
 
35
  # load
36
  bench_df = pd.read_csv(
37
+ f"./llm-perf-dataset/reports/{benchmark}.csv")
 
38
  scores_df = pd.read_csv(
39
+ f"./llm-perf-dataset/reports/additional_data.csv")
40
  bench_df = bench_df.merge(scores_df, on="model", how="left")
 
 
41
 
42
  # preprocess
43
  bench_df["model"] = bench_df["model"].apply(make_clickable_model)
44
+ bench_df["h4_score"] = bench_df["h4_score"].apply(make_clickable_score)
45
  # filter
46
  bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
47
  # rename
 
52
  return bench_df
53
 
54
 
55
+ # Dataframes
56
+ single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
57
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
def get_benchmark_plot(benchmark):
    """Build the score / latency / memory scatter plot for one benchmark.

    If ``llm_perf_dataset_repo`` is set, it is pulled first. The report
    ``./llm-perf-dataset/reports/{benchmark}.csv`` is then left-merged with
    ``./llm-perf-dataset/reports/additional_data.csv`` on the ``model``
    column and drawn as a Plotly Express scatter plot.

    Args:
        benchmark: Name of the report file (without ``.csv``), e.g.
            ``"1xA100-80GB"``.

    Returns:
        The Plotly figure: ``h4_score`` on x, ``generate.latency(s)`` on y,
        colored by ``model_type``, with the marker symbol taken from
        ``backend.name`` and the marker size from ``forward.peak_memory(MB)``.
    """
    if llm_perf_dataset_repo:
        llm_perf_dataset_repo.git_pull()

    # load
    bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
    scores_df = pd.read_csv("./llm-perf-dataset/reports/additional_data.csv")
    bench_df = bench_df.merge(scores_df, on="model", how="left")

    fig = px.scatter(
        bench_df,
        x="h4_score",
        y="generate.latency(s)",
        color="model_type",
        symbol="backend.name",
        size="forward.peak_memory(MB)",
        # Extra per-point values referenced by index in the hover template.
        custom_data=[
            "model",
            "backend.name",
            "backend.torch_dtype",
            "forward.peak_memory(MB)",
            "generate.throughput(tokens/s)",
        ],
    )

    # Title, axis titles and legend are *layout* properties. Passing them to
    # update_traces() raises because traces have no such attributes, so they
    # must go through update_layout().
    fig.update_layout(
        title={
            "text": "Model Score vs. Latency vs. Memory",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        xaxis_title="Average H4 Score",
        yaxis_title="Latency per 1000 Tokens (s)",
        legend_title="Model Type",
        legend=dict(
            orientation="h",
            yanchor="middle",
            xanchor="center",
            y=-0.15,
            x=0.5,
        ),
    )

    # The hover template is a per-trace property, so it stays on the traces.
    fig.update_traces(
        hovertemplate="<br>".join([
            "Model: %{customdata[0]}",
            "Backend: %{customdata[1]}",
            "Datatype: %{customdata[2]}",
            "Peak Memory (MB): %{customdata[3]}",
            "Throughput (tokens/s): %{customdata[4]}",
            "Latency per 1000 Tokens (s): %{y}",
            "Average H4 Score: %{x}",
        ])
    )

    return fig
106
+
107
+
108
+ # Plots
109
+ single_A100_plot = get_benchmark_plot(benchmark="1xA100-80GB")
110
+
111
+ # Demo interface
112
  demo = gr.Blocks(css=custom_css)
113
  with demo:
114
+ # leaderboard title
115
  gr.HTML(TITLE)
116
+
117
+ # introduction text
118
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
119
 
120
+ # control panel title
121
+ gr.HTML("<h2>Control Panel πŸŽ›οΈ</h2>")
122
+
123
+ # control panel interface
124
  with gr.Row():
125
  search_bar = gr.Textbox(
126
  label="Model πŸ€—",
127
+ info="πŸ” Search for a model name",
128
  elem_id="search-bar",
129
  )
130
  backend_checkboxes = gr.CheckboxGroup(
131
  label="Backends 🏭",
132
  choices=["pytorch", "onnxruntime"],
133
  value=["pytorch", "onnxruntime"],
134
+ info="β˜‘οΈ Select the backends",
135
  elem_id="backend-checkboxes",
136
  )
137
  datatype_checkboxes = gr.CheckboxGroup(
138
  label="Datatypes πŸ“₯",
139
  choices=["float32", "float16"],
140
  value=["float32", "float16"],
141
+ info="β˜‘οΈ Select the load datatypes",
142
  elem_id="datatype-checkboxes",
143
  )
 
 
144
  threshold_slider = gr.Slider(
145
  label="Average H4 Score πŸ“ˆ",
146
+ info="Filter by minimum average H4 score",
147
  value=0.0,
148
  elem_id="threshold-slider",
149
  )
 
151
  with gr.Row():
152
  submit_button = gr.Button(
153
  value="Submit πŸš€",
 
154
  elem_id="submit-button",
155
  )
156
 
157
  # leaderboard tabs
158
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
159
+ with gr.TabItem("πŸ–₯️ A100-80GB Leaderboard πŸ†", id=0):
160
  gr.HTML(SINGLE_A100_TEXT)
161
 
 
162
  # Original leaderboard table
163
  single_A100_leaderboard = gr.components.Dataframe(
164
  value=single_A100_df,
 
175
  visible=False,
176
  )
177
 
178
+ with gr.TabItem("πŸ–₯️ A100-80GB Plot πŸ“ˆ", id=1):
179
+ # Original leaderboard plot
180
+ gr.HTML(SINGLE_A100_TEXT)
181
+
182
+ single_A100_plotly = gr.components.Plot(
183
+ value=single_A100_plot,
184
+ elem_id="1xA100-plot",
185
+ show_label=False,
186
+ )
187
 
188
  with gr.Row():
189
  with gr.Accordion("πŸ“™ Citation", open=False):
 
193
  elem_id="citation-button",
194
  ).style(show_copy_button=True)
195
 
196
+
197
def submit_query(text, backends, datatypes, threshold, raw_df):
    """Filter the leaderboard table by the control-panel values.

    Args:
        text: Search string, matched case-insensitively against the model
            column. It is treated as a regular expression; if it is not a
            valid pattern it is matched as a literal substring instead.
        backends: Backend names to keep.
        datatypes: Datatype names to keep.
        threshold: Minimum H4 score (inclusive).
        raw_df: Unfiltered table whose score column holds clickable markup.
            It is not modified.

    Returns:
        A new DataFrame with the matching rows, with the score column
        converted back to clickable markup.
    """
    import re  # local import: only needed to detect invalid search patterns

    score_col = "H4 Score ⬆️"

    # Work on a copy so the caller's frame keeps its clickable scores; the
    # score column is temporarily turned into floats for the comparison.
    table = raw_df.copy()
    table[score_col] = table[score_col].apply(extract_score_from_clickable)

    names = table["Model πŸ€—"].str.lower()
    query = text.lower()
    try:
        name_match = names.str.contains(query)
    except re.error:
        # Input such as "(" is not a valid regex; fall back to a plain
        # substring search instead of failing the whole callback.
        name_match = names.str.contains(query, regex=False)

    keep = (
        name_match
        & table["Backend 🏭"].isin(backends)
        & table["Datatype πŸ“₯"].isin(datatypes)
        & (table[score_col] >= threshold)
    )

    # Explicit copy: assigning into a boolean-indexed slice is chained
    # assignment and may silently not take effect.
    filtered_df = table[keep].copy()
    filtered_df[score_col] = filtered_df[score_col].apply(make_clickable_score)
    return filtered_df
211
+
212
+
213
+ # Callbacks
214
+ submit_button.click(
215
+ submit_query,
216
+ [
217
+ search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider,
218
+ single_A100_for_search
219
+ ],
220
+ [single_A100_leaderboard]
221
+ )
222
+
223
  # Restart space every hour
224
  scheduler = BackgroundScheduler()
225
  scheduler.add_job(restart_space, "interval", seconds=3600,
src/assets/text_content.py CHANGED
@@ -8,7 +8,7 @@ Anyone from the community can request a model or a hardware+backend+optimization
8
  - Hardware+Backend+Optimization requests should be made in the πŸ€— Open LLM-Perf Leaderboard πŸ‹οΈ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
9
  """
10
 
11
- SINGLE_A100_TEXT = """<h3>Single-GPU Benchmarks (1xA100):</h3>
12
  <ul>
13
  <li>Singleton Batch (1)</li>
14
  <li>Thousand Tokens (1000)</li>
 
8
  - Hardware+Backend+Optimization requests should be made in the πŸ€— Open LLM-Perf Leaderboard πŸ‹οΈ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
9
  """
10
 
11
+ SINGLE_A100_TEXT = """<h3>Single-GPU Benchmark (1xA100):</h3>
12
  <ul>
13
  <li>Singleton Batch (1)</li>
14
  <li>Thousand Tokens (1000)</li>
src/utils.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from huggingface_hub import HfApi, Repository
2
 
3
 
@@ -68,4 +69,4 @@ def make_clickable_score(score):
68
 
69
 
70
  def extract_score_from_clickable(clickable_score) -> float:
71
- return float(clickable_score.split(">")[1].split("<")[0])
 
1
+ import re
2
  from huggingface_hub import HfApi, Repository
3
 
4
 
 
69
 
70
 
71
def extract_score_from_clickable(clickable_score) -> float:
    """Return the last decimal number found in *clickable_score* as a float.

    Only numbers written with a decimal point (digits, ``.``, digits) are
    considered; when several are present the last one wins.

    Raises:
        IndexError: If *clickable_score* contains no decimal number.
    """
    matches = re.findall(r"\d+\.\d+", clickable_score)
    if not matches:
        # Same exception type as indexing the empty list would raise, but
        # with a message that shows which cell could not be parsed.
        raise IndexError(f"no decimal score found in {clickable_score!r}")
    return float(matches[-1])