Spaces:
Running
Running
Commit
•
08604d0
1
Parent(s):
affd732
added awq
Browse files
- src/content.py +1 -1
- src/control_panel.py +26 -8
- src/exllama.py +5 -5
- src/llm_perf.py +1 -0
- src/utils.py +4 -0
src/content.py
CHANGED
@@ -7,7 +7,7 @@ The 🤗 LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency
|
|
7 |
|
8 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
9 |
- Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
|
10 |
-
- Hardware/Backend/Optimization performance requests should be made in the [
|
11 |
"""
|
12 |
|
13 |
ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
|
|
|
7 |
|
8 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
9 |
- Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
|
10 |
+
- Hardware/Backend/Optimization performance requests should be made in the [llm-perf-backend repository](https://github.com/IlyasMoutawwakil/llm-perf-backend) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
|
11 |
"""
|
12 |
|
13 |
ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
|
src/control_panel.py
CHANGED
@@ -10,7 +10,7 @@ from src.exllama import get_exllama_prefill_fig, get_exllama_decode_fig
|
|
10 |
|
11 |
def create_control_panel(machine: str = "hf-dgx-01"):
|
12 |
# descriptive text
|
13 |
-
gr.HTML("Use this control panel to filter
|
14 |
# controls
|
15 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
16 |
with gr.Row():
|
@@ -21,14 +21,14 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
21 |
elem_id="search-bar",
|
22 |
)
|
23 |
with gr.Row():
|
24 |
-
with gr.Column(scale=1):
|
25 |
score_slider = gr.Slider(
|
26 |
label="Open LLM Score (%) π",
|
27 |
info="ποΈ Slide to minimum Open LLM score",
|
28 |
value=0,
|
29 |
elem_id="threshold-slider",
|
30 |
)
|
31 |
-
with gr.Column(scale=1):
|
32 |
memory_slider = gr.Slider(
|
33 |
label="Peak Memory (MB) π",
|
34 |
info="ποΈ Slide to maximum Peak Memory",
|
@@ -46,7 +46,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
46 |
elem_id="backend-checkboxes",
|
47 |
)
|
48 |
with gr.Row():
|
49 |
-
with gr.Column(scale=1):
|
50 |
datatype_checkboxes = gr.CheckboxGroup(
|
51 |
label="Load DTypes π₯",
|
52 |
choices=["float32", "float16", "bfloat16"],
|
@@ -54,7 +54,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
54 |
info="βοΈ Select the load data types",
|
55 |
elem_id="dtype-checkboxes",
|
56 |
)
|
57 |
-
with gr.Column(scale=1):
|
58 |
optimization_checkboxes = gr.CheckboxGroup(
|
59 |
label="Optimizations π οΈ",
|
60 |
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
@@ -62,11 +62,29 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
62 |
info="βοΈ Select the optimization",
|
63 |
elem_id="optimization-checkboxes",
|
64 |
)
|
65 |
-
with gr.Column(scale=
|
66 |
quantization_checkboxes = gr.CheckboxGroup(
|
67 |
label="Quantizations ποΈ",
|
68 |
-
choices=[
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
info="βοΈ Select the quantization schemes",
|
71 |
elem_id="quantization-checkboxes",
|
72 |
)
|
|
|
10 |
|
11 |
def create_control_panel(machine: str = "hf-dgx-01"):
|
12 |
# descriptive text
|
13 |
+
gr.HTML("Use this control panel to filter the leaderboard.", elem_id="text")
|
14 |
# controls
|
15 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
16 |
with gr.Row():
|
|
|
21 |
elem_id="search-bar",
|
22 |
)
|
23 |
with gr.Row():
|
24 |
+
with gr.Column(scale=1, variant="panel"):
|
25 |
score_slider = gr.Slider(
|
26 |
label="Open LLM Score (%) π",
|
27 |
info="ποΈ Slide to minimum Open LLM score",
|
28 |
value=0,
|
29 |
elem_id="threshold-slider",
|
30 |
)
|
31 |
+
with gr.Column(scale=1, variant="panel"):
|
32 |
memory_slider = gr.Slider(
|
33 |
label="Peak Memory (MB) π",
|
34 |
info="ποΈ Slide to maximum Peak Memory",
|
|
|
46 |
elem_id="backend-checkboxes",
|
47 |
)
|
48 |
with gr.Row():
|
49 |
+
with gr.Column(scale=1, variant="panel"):
|
50 |
datatype_checkboxes = gr.CheckboxGroup(
|
51 |
label="Load DTypes π₯",
|
52 |
choices=["float32", "float16", "bfloat16"],
|
|
|
54 |
info="βοΈ Select the load data types",
|
55 |
elem_id="dtype-checkboxes",
|
56 |
)
|
57 |
+
with gr.Column(scale=1, variant="panel"):
|
58 |
optimization_checkboxes = gr.CheckboxGroup(
|
59 |
label="Optimizations π οΈ",
|
60 |
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
|
|
62 |
info="βοΈ Select the optimization",
|
63 |
elem_id="optimization-checkboxes",
|
64 |
)
|
65 |
+
with gr.Column(scale=2):
|
66 |
quantization_checkboxes = gr.CheckboxGroup(
|
67 |
label="Quantizations ποΈ",
|
68 |
+
choices=[
|
69 |
+
"None",
|
70 |
+
"BnB.4bit",
|
71 |
+
"BnB.8bit",
|
72 |
+
"GPTQ.4bit",
|
73 |
+
"GPTQ.4bit+ExllamaV1",
|
74 |
+
"GPTQ.4bit+ExllamaV2",
|
75 |
+
"AWQ.4bit+GEMM",
|
76 |
+
"AWQ.4bit+GEMV",
|
77 |
+
],
|
78 |
+
value=[
|
79 |
+
"None",
|
80 |
+
"BnB.4bit",
|
81 |
+
"BnB.8bit",
|
82 |
+
"GPTQ.4bit",
|
83 |
+
"GPTQ.4bit+ExllamaV1",
|
84 |
+
"GPTQ.4bit+ExllamaV2",
|
85 |
+
"AWQ.4bit+GEMM",
|
86 |
+
"AWQ.4bit+GEMV",
|
87 |
+
],
|
88 |
info="βοΈ Select the quantization schemes",
|
89 |
elem_id="quantization-checkboxes",
|
90 |
)
|
src/exllama.py
CHANGED
@@ -29,11 +29,11 @@ EXLLAMA_DATA = [
|
|
29 |
|
30 |
|
31 |
def get_exllama_df(llm_perf_df):
|
32 |
-
|
33 |
-
# separate
|
34 |
-
gptq_df =
|
35 |
-
exllamav1_df =
|
36 |
-
exllamav2_df =
|
37 |
# merge the three dataframes
|
38 |
exllamav1_df = pd.merge(
|
39 |
gptq_df,
|
|
|
29 |
|
30 |
|
31 |
def get_exllama_df(llm_perf_df):
|
32 |
+
copy_df = llm_perf_df.copy()
|
33 |
+
# separate vanilla GPTQ experiments from Exllama experiments
|
34 |
+
gptq_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit")]
|
35 |
+
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
36 |
+
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
37 |
# merge the three dataframes
|
38 |
exllamav1_df = pd.merge(
|
39 |
gptq_df,
|
src/llm_perf.py
CHANGED
@@ -94,6 +94,7 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
|
|
94 |
[
|
95 |
"backend.quantization_scheme",
|
96 |
"backend.quantization_config.bits",
|
|
|
97 |
"backend.quantization_config.load_in_4bit",
|
98 |
"backend.quantization_config.load_in_8bit",
|
99 |
"backend.quantization_config.exllama_config.version",
|
|
|
94 |
[
|
95 |
"backend.quantization_scheme",
|
96 |
"backend.quantization_config.bits",
|
97 |
+
"backend.quantization_config.version",
|
98 |
"backend.quantization_config.load_in_4bit",
|
99 |
"backend.quantization_config.load_in_8bit",
|
100 |
"backend.quantization_config.exllama_config.version",
|
src/utils.py
CHANGED
@@ -62,6 +62,10 @@ def process_quantization_scheme(x):
|
|
62 |
return "GPTQ.4bit+ExllamaV2"
|
63 |
elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
|
64 |
return "GPTQ.4bit"
|
|
|
|
|
|
|
|
|
65 |
else:
|
66 |
return "None"
|
67 |
|
|
|
62 |
return "GPTQ.4bit+ExllamaV2"
|
63 |
elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
|
64 |
return "GPTQ.4bit"
|
65 |
+
elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
|
66 |
+
return "AWQ.4bit+GEMM"
|
67 |
+
elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
|
68 |
+
return "AWQ.4bit+GEMV"
|
69 |
else:
|
70 |
return "None"
|
71 |
|