IlyasMoutawwakil committed
Commit dc685a9 (1 parent: 14d526b)

updated layout
app.py CHANGED
@@ -4,10 +4,10 @@ import gradio as gr

from src.control_panel import create_control_panel, create_control_callback
from src.latency_score_memory import create_lat_score_mem_plot
+ from src.quantization_kernels import create_quant_plots
from src.leaderboard import create_leaderboard_table
from src.bettertransformer import create_bt_plots
from src.flashattentionv2 import create_fa2_plots
- from src.custom_kernels import create_custom_kernels_plots
from src.llm_perf import get_llm_perf_df
from src.assets import custom_css
from src.content import (
@@ -52,18 +52,14 @@ with demo:
####################### LEADERBOARD TAB #######################
with gr.TabItem("Leaderboard 🏅", id=0):
leaderboard_table = create_leaderboard_table(llm_perf_df)
- ####################### LAT. vs. SCORE vs. MEM. TAB #######################
- with gr.TabItem("Latency vs. Score vs. Memory 📊", id=1):
lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
####################### BETTERTRANSFORMER SPEEDUP TAB #######################
- with gr.TabItem("BetterTransformer Speedup 📈", id=2):
+ with gr.TabItem("BetterTransformer 📈", id=2):
bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
- with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
+ with gr.TabItem("FlashAttentionV2 📈", id=3):
fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
- with gr.TabItem("Custom Quantization Kernels Comparison 🏆", id=4):
- custom_kernels_prefill_plot, custom_kernels_decode_plot = create_custom_kernels_plots(
- llm_perf_df
- )
+ with gr.TabItem("Custom Quantization Kernels 📈", id=4):
+ quant_prefill_plot, quant_decode_plot = create_quant_plots(llm_perf_df)

####################### CONTROL CALLBACK #######################
create_control_callback(
@@ -84,8 +80,8 @@ with demo:
bt_decode_plot,
fa2_prefill_plot,
fa2_decode_plot,
- custom_kernels_prefill_plot,
- custom_kernels_decode_plot,
+ quant_prefill_plot,
+ quant_decode_plot,
)
####################### ABOUT TAB #######################
with gr.TabItem("About 📖", id=3):
src/bettertransformer.py CHANGED
@@ -14,7 +14,9 @@ BETTERTRANSFORMER_DATA = [
# deployment settings
"DType 📥",
"Backend 🏭",
+ "Optimization 🛠️",
"Quantization 🗜️",
+ "Optimization 🛠️ BetterTransformer",
# primary measurements
"Prefill Latency (s)",
"Prefill Latency (s) BetterTransformer",
@@ -29,10 +31,10 @@ BETTERTRANSFORMER_DATA = [
]


def get_bt_df(llm_perf_df):
- bt_df = llm_perf_df.copy()
+ copy_df = llm_perf_df.copy()
# seperate original model experiments from BetterTransformer experiments
- original_df = bt_df[(bt_df["Optimization 🛠️"] == "None") & (bt_df["DType 📥"] == "float16")]
- bt_df = bt_df[bt_df["Optimization 🛠️"] == "BetterTransformer"]
+ original_df = copy_df[(copy_df["Optimization 🛠️"] == "None") & (copy_df["DType 📥"] == "float16")]
+ bt_df = copy_df[(copy_df["Optimization 🛠️"] == "BetterTransformer") & (copy_df["DType 📥"] == "float16")]
# merge the two dataframes
bt_df = pd.merge(
original_df,
@@ -54,78 +56,78 @@ def get_bt_df(llm_perf_df):
return bt_df


- def get_bt_decode_fig(llm_perf_df):
+ def get_bt_prefill_fig(llm_perf_df):
bt_df = get_bt_df(llm_perf_df)
# plot
- decode_fig = px.box(
+ prefill_fig = px.box(
bt_df,
x="Arch 🏛️",
- y="Decode Throughput Speedup (%)",
+ y="Prefill Latency Speedup (%)",
color_discrete_sequence=px.colors.qualitative.Light24,
custom_data=BETTERTRANSFORMER_DATA,
color="Quantization 🗜️",
points="all",
)
# add hover data
- decode_fig.update_traces(
+ prefill_fig.update_traces(
hovertemplate="<br>".join(
[f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
)
)
# add layout
- decode_fig.update_layout(
+ prefill_fig.update_layout(
title={
- "text": "Decode Throughput Speedup per Architecture",
+ "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
"y": 0.95,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
xaxis_title="LLM Architecture",
- yaxis_title="Decode Speedup (%)",
+ yaxis_title="Prefill Speedup (%)",
legend_title="Quantization Scheme",
width=1200,
height=600,
)

- return decode_fig
+ return prefill_fig


- def get_bt_prefill_fig(llm_perf_df):
+ def get_bt_decode_fig(llm_perf_df):
bt_df = get_bt_df(llm_perf_df)
# plot
- prefill_fig = px.box(
+ decode_fig = px.box(
bt_df,
x="Arch 🏛️",
- y="Prefill Latency Speedup (%)",
+ y="Decode Throughput Speedup (%)",
color_discrete_sequence=px.colors.qualitative.Light24,
custom_data=BETTERTRANSFORMER_DATA,
color="Quantization 🗜️",
points="all",
)
# add hover data
- prefill_fig.update_traces(
+ decode_fig.update_traces(
hovertemplate="<br>".join(
[f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
)
)
# add layout
- prefill_fig.update_layout(
+ decode_fig.update_layout(
title={
- "text": "Prefill Latency Speedup per Architecture",
+ "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
"y": 0.95,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
xaxis_title="LLM Architecture",
- yaxis_title="Prefill Speedup (%)",
+ yaxis_title="Decode Speedup (%)",
legend_title="Quantization Scheme",
width=1200,
height=600,
)

- return prefill_fig
+ return decode_fig


def create_bt_plots(llm_perf_df):
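The speedup columns plotted by this module come from pairing each optimized run with its float16, non-optimized baseline, as the filtering and merge above suggest. A rough, self-contained sketch of that pairing and the percentage computation follows; the toy dataframe and the merge key "Model 🤗" are assumptions, since the actual merge keys fall outside the hunks shown.

import pandas as pd

llm_perf_df = pd.DataFrame({
    "Model 🤗": ["some-model", "some-model"],
    "Optimization 🛠️": ["None", "BetterTransformer"],
    "DType 📥": ["float16", "float16"],
    "Prefill Latency (s)": [0.50, 0.40],
    "Decode Throughput (tokens/s)": [30.0, 36.0],
})

original_df = llm_perf_df[(llm_perf_df["Optimization 🛠️"] == "None") & (llm_perf_df["DType 📥"] == "float16")]
bt_df = llm_perf_df[(llm_perf_df["Optimization 🛠️"] == "BetterTransformer") & (llm_perf_df["DType 📥"] == "float16")]

# pairing baseline and optimized rows yields "<col>" / "<col> BetterTransformer" column pairs
bt_df = pd.merge(original_df, bt_df, on="Model 🤗", suffixes=["", " BetterTransformer"])

# latency speedup: how much faster the optimized prefill is, in percent
bt_df["Prefill Latency Speedup (%)"] = (
    (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
).round(2) - 100  # 0.50 s vs 0.40 s -> 25.0 %

# throughput speedup: how much more the optimized decode produces, in percent
bt_df["Decode Throughput Speedup (%)"] = (
    (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
).round(2) - 100  # 36 vs 30 tokens/s -> 20.0 %

print(bt_df[["Model 🤗", "Prefill Latency Speedup (%)", "Decode Throughput Speedup (%)"]])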
src/control_panel.py CHANGED
@@ -5,7 +5,7 @@ from src.leaderboard import get_leaderboard_df
from src.latency_score_memory import get_lat_score_mem_fig
from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
- from src.custom_kernels import get_custom_kernels_prefill_fig, get_custom_kernels_decode_fig
+ from src.quantization_kernels import get_quant_prefill_fig, get_quant_decode_fig


def create_control_panel(machine: str = "hf-dgx-01"):
@@ -133,8 +133,8 @@ def filter_fn(
filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
- filtered_custom_kernels_prefill_fig = get_custom_kernels_prefill_fig(filtered_df)
- filtered_custom_kernels_decode_fig = get_custom_kernels_decode_fig(filtered_df)
+ filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
+ filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)

return [
filtered_leaderboard_df,
@@ -143,8 +143,8 @@ def filter_fn(
filtered_bt_decode_fig,
filtered_fa2_prefill_fig,
filtered_fa2_decode_fig,
- filtered_custom_kernels_prefill_fig,
- filtered_custom_kernels_decode_fig,
+ filtered_quant_prefill_fig,
+ filtered_quant_decode_fig,
]


@@ -167,8 +167,8 @@ def create_control_callback(
bt_decode_plot,
fa2_prefill_plot,
fa2_decode_plot,
- exllama_prefill_plot,
- exllama_decode_plot,
+ quant_prefill_plot,
+ quant_decode_plot,
):
filter_button.click(
fn=filter_fn,
@@ -189,7 +189,7 @@ def create_control_callback(
bt_decode_plot,
fa2_prefill_plot,
fa2_decode_plot,
- exllama_prefill_plot,
- exllama_decode_plot,
+ quant_prefill_plot,
+ quant_decode_plot,
],
)
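The callback change above works because Gradio maps the values returned by filter_fn onto the components in the outputs list positionally, which is why the two quantization figures are appended to the return list and to outputs in the same order. A minimal, self-contained sketch of that contract, using dummy components that stand in for the real control panel:

import gradio as gr
import plotly.express as px

def filter_fn(query):
    # real filtering elided; return one value per output component, in order
    fig = px.box(x=["llama", "mistral"], y=[10.0, 20.0])
    return [f"rows matching: {query}", fig, fig]

with gr.Blocks() as demo:
    query = gr.Textbox(label="filter")
    filter_button = gr.Button("Filter")
    leaderboard_table = gr.Textbox(label="leaderboard")
    quant_prefill_plot = gr.Plot()
    quant_decode_plot = gr.Plot()

    filter_button.click(
        fn=filter_fn,
        inputs=[query],
        outputs=[leaderboard_table, quant_prefill_plot, quant_decode_plot],
    )

if __name__ == "__main__":
    demo.launch()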
src/flashattentionv2.py CHANGED
@@ -14,7 +14,9 @@ FLASHATTENTIONV2_DATA = [
# deployment settings
"DType 📥",
"Backend 🏭",
+ "Optimization 🛠️",
"Quantization 🗜️",
+ "Optimization 🛠️ FlashAttentionV2",
# primary measurements
"Prefill Latency (s)",
"Prefill Latency (s) FlashAttentionV2",
@@ -29,10 +31,10 @@ FLASHATTENTIONV2_DATA = [
]


def get_fa2_df(llm_perf_df):
- fa2_df = llm_perf_df.copy()
+ copy_df = llm_perf_df.copy()
# seperate original model experiments from FlashAttentionV2 experiments
- original_df = fa2_df[(fa2_df["Optimization 🛠️"] == "None") & (fa2_df["DType 📥"] == "float16")]
- fa2_df = fa2_df[fa2_df["Optimization 🛠️"] == "FlashAttentionV2"]
+ original_df = copy_df[(copy_df["Optimization 🛠️"] == "None") & (copy_df["DType 📥"] == "float16")]
+ fa2_df = copy_df[(copy_df["Optimization 🛠️"] == "FlashAttentionV2") & (copy_df["DType 📥"] == "float16")]
# merge the two dataframes
fa2_df = pd.merge(
original_df,
@@ -47,7 +49,6 @@ def get_fa2_df(llm_perf_df):
fa2_df["Decode Throughput Speedup (%)"] = (
(fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
).round(2) - 100
-
# filter speedups > 1000%
fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]
@@ -76,7 +77,7 @@ def get_fa2_decode_fig(llm_perf_df):
# add layout
decode_fig.update_layout(
title={
- "text": "Decode Throughput Speedup per Architecture",
+ "text": "Decode Throughput Speedup per Architecture, Compared To Non-Optimized Model",
"y": 0.95,
"x": 0.5,
"xanchor": "center",
@@ -113,7 +114,7 @@ def get_fa2_prefill_fig(llm_perf_df):
# add layout
prefill_fig.update_layout(
title={
- "text": "Prefill Latency Speedup per Architecture",
+ "text": "Prefill Latency Speedup per Architecture, Compared To Non-Optimized Model",
"y": 0.95,
"x": 0.5,
"xanchor": "center",
src/latency_score_memory.py CHANGED
@@ -8,6 +8,8 @@ SCORE_MEMORY_LATENCY_DATA = [
"Params (B)",
"DType 📥",
"Backend 🏭",
+ "Optimization 🛠️",
+ "Quantization 🗜️",
"Open LLM Score (%)",
"Prefill Latency (s)",
"Decode Throughput (tokens/s)",
@@ -42,7 +44,7 @@ def get_lat_score_mem_fig(llm_perf_df):
"xanchor": "center",
"yanchor": "top",
},
- xaxis_title="Per 256 Tokens Latency (s)",
+ xaxis_title="Time To Generate 256 Tokens (s)",
yaxis_title="Open LLM Score (%)",
legend_title="LLM Architecture",
width=1200,
src/{custom_kernels.py → quantization_kernels.py} RENAMED
@@ -3,7 +3,7 @@ import pandas as pd
import plotly.express as px


- CUSTOM_KERNELS_DATA = [
+ QUANT_DATA = [
# open llm
"Model 🤗",
"Arch 🏛️",
@@ -29,13 +29,13 @@ CUSTOM_KERNELS_DATA = [
]


- def get_custom_kernels_df(llm_perf_df):
+ def get_quant_df(llm_perf_df):
copy_df = llm_perf_df.copy()
# seperate vanilla GPTQ experiments from Custom Kernel experiments
vanilla_df = copy_df[
- (copy_df["Backend 🏭"] == "pytorch") &
+ (copy_df["Backend 🏭"] == "pytorch") &
(copy_df["Quantization 🗜️"] == "None") &
- (copy_df["Optimization 🛠️"] == "None") &
+ (copy_df["Optimization 🛠️"] == "None") &
(copy_df["DType 📥"] == "float16")
]
exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
@@ -68,42 +68,36 @@ def get_custom_kernels_df(llm_perf_df):
suffixes=["", " Custom Kernel"],
)
# concat the two dataframes row-wise
- custom_kernels_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
+ quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
# compute speedups
- custom_kernels_df["Prefill Latency Speedup (%)"] = (
- (custom_kernels_df["Prefill Latency (s)"] / custom_kernels_df["Prefill Latency (s) Custom Kernel"]) * 100
+ quant_df["Prefill Latency Speedup (%)"] = (
+ (quant_df["Prefill Latency (s)"] / quant_df["Prefill Latency (s) Custom Kernel"]) * 100
).round(2) - 100
- custom_kernels_df["Decode Throughput Speedup (%)"] = (
- (
- custom_kernels_df["Decode Throughput (tokens/s) Custom Kernel"]
- / custom_kernels_df["Decode Throughput (tokens/s)"]
- )
- * 100
+ quant_df["Decode Throughput Speedup (%)"] = (
+ (quant_df["Decode Throughput (tokens/s) Custom Kernel"] / quant_df["Decode Throughput (tokens/s)"]) * 100
).round(2) - 100
# filter speedups > 1000%
- custom_kernels_df = custom_kernels_df[custom_kernels_df["Prefill Latency Speedup (%)"] < 1000]
- custom_kernels_df = custom_kernels_df[custom_kernels_df["Decode Throughput Speedup (%)"] < 1000]
+ quant_df = quant_df[quant_df["Prefill Latency Speedup (%)"] < 1000]
+ quant_df = quant_df[quant_df["Decode Throughput Speedup (%)"] < 1000]

- return custom_kernels_df
+ return quant_df


- def get_custom_kernels_decode_fig(llm_perf_df):
- custom_kernels_df = get_custom_kernels_df(llm_perf_df)
+ def get_quant_decode_fig(llm_perf_df):
+ quant_df = get_quant_df(llm_perf_df)
# plot
decode_fig = px.box(
- custom_kernels_df,
+ quant_df,
x="Arch 🏛️",
y="Decode Throughput Speedup (%)",
color_discrete_sequence=px.colors.qualitative.Light24,
- custom_data=CUSTOM_KERNELS_DATA,
+ custom_data=QUANT_DATA,
color="Quantization 🗜️ Custom Kernel",
points="all",
)
# add hover data
decode_fig.update_traces(
- hovertemplate="<br>".join(
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(CUSTOM_KERNELS_DATA)]
- )
+ hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(QUANT_DATA)])
)
# add layout
decode_fig.update_layout(
@@ -124,23 +118,21 @@ def get_custom_kernels_decode_fig(llm_perf_df):
return decode_fig


- def get_custom_kernels_prefill_fig(llm_perf_df):
- custom_kernels_df = get_custom_kernels_df(llm_perf_df)
+ def get_quant_prefill_fig(llm_perf_df):
+ quant_df = get_quant_df(llm_perf_df)
# plot
prefill_fig = px.box(
- custom_kernels_df,
+ quant_df,
x="Arch 🏛️",
y="Prefill Latency Speedup (%)",
color_discrete_sequence=px.colors.qualitative.Light24,
- custom_data=CUSTOM_KERNELS_DATA,
+ custom_data=QUANT_DATA,
color="Quantization 🗜️ Custom Kernel",
points="all",
)
# add hover data
prefill_fig.update_traces(
- hovertemplate="<br>".join(
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(CUSTOM_KERNELS_DATA)]
- )
+ hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(QUANT_DATA)])
)
# add layout
prefill_fig.update_layout(
@@ -161,12 +153,12 @@ def get_custom_kernels_prefill_fig(llm_perf_df):
return prefill_fig


- def create_custom_kernels_plots(llm_perf_df):
+ def create_quant_plots(llm_perf_df):
# descriptive text
gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
# get figures
- prefill_fig = get_custom_kernels_prefill_fig(llm_perf_df)
- decode_fig = get_custom_kernels_decode_fig(llm_perf_df)
+ prefill_fig = get_quant_prefill_fig(llm_perf_df)
+ decode_fig = get_quant_decode_fig(llm_perf_df)

# create plots
prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
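All of these plot modules attach hover information the same way: each column passed via custom_data becomes a %{customdata[i]} field in the hover template, in list order. A small self-contained sketch of that mechanism with toy data; the HOVER_COLUMNS list and the values below are illustrative, not the real QUANT_DATA list.

import pandas as pd
import plotly.express as px

HOVER_COLUMNS = ["Model 🤗", "Quantization 🗜️ Custom Kernel"]

df = pd.DataFrame({
    "Arch 🏛️": ["llama", "llama"],
    "Prefill Latency Speedup (%)": [12.0, 30.0],
    "Model 🤗": ["model-a", "model-b"],
    "Quantization 🗜️ Custom Kernel": ["GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
})

fig = px.box(
    df,
    x="Arch 🏛️",
    y="Prefill Latency Speedup (%)",
    custom_data=HOVER_COLUMNS,
    points="all",
)
# each custom_data column i is referenced as %{customdata[i]} in the hover template
fig.update_traces(
    hovertemplate="<br>".join(
        [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(HOVER_COLUMNS)]
    )
)
fig.show()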