Spaces:
Running
Running
Commit
•
08604d0
1
Parent(s):
affd732
added awq
Browse files
- src/content.py +1 -1
- src/control_panel.py +26 -8
- src/exllama.py +5 -5
- src/llm_perf.py +1 -0
- src/utils.py +4 -0
src/content.py
CHANGED
@@ -7,7 +7,7 @@ The 🤗 LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency
|
|
7 |
|
8 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
9 |
- Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
|
10 |
-
- Hardware/Backend/Optimization performance requests should be made in the [
|
11 |
"""
|
12 |
|
13 |
ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
|
|
|
7 |
|
8 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
9 |
- Model evaluation requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
|
10 |
+
- Hardware/Backend/Optimization performance requests should be made in the [llm-perf-backend repository](https://github.com/IlyasMoutawwakil/llm-perf-backend) and will be added to the [🤗 LLM Performance Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) automatically.
|
11 |
"""
|
12 |
|
13 |
ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
|
src/control_panel.py
CHANGED
@@ -10,7 +10,7 @@ from src.exllama import get_exllama_prefill_fig, get_exllama_decode_fig
|
|
10 |
|
11 |
def create_control_panel(machine: str = "hf-dgx-01"):
|
12 |
# descriptive text
|
13 |
-
gr.HTML("Use this control panel to filter
|
14 |
# controls
|
15 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
16 |
with gr.Row():
|
@@ -21,14 +21,14 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
21 |
elem_id="search-bar",
|
22 |
)
|
23 |
with gr.Row():
|
24 |
-
with gr.Column(scale=1):
|
25 |
score_slider = gr.Slider(
|
26 |
label="Open LLM Score (%) π",
|
27 |
info="ποΈ Slide to minimum Open LLM score",
|
28 |
value=0,
|
29 |
elem_id="threshold-slider",
|
30 |
)
|
31 |
-
with gr.Column(scale=1):
|
32 |
memory_slider = gr.Slider(
|
33 |
label="Peak Memory (MB) π",
|
34 |
info="ποΈ Slide to maximum Peak Memory",
|
@@ -46,7 +46,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
46 |
elem_id="backend-checkboxes",
|
47 |
)
|
48 |
with gr.Row():
|
49 |
-
with gr.Column(scale=1):
|
50 |
datatype_checkboxes = gr.CheckboxGroup(
|
51 |
label="Load DTypes π₯",
|
52 |
choices=["float32", "float16", "bfloat16"],
|
@@ -54,7 +54,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
54 |
info="βοΈ Select the load data types",
|
55 |
elem_id="dtype-checkboxes",
|
56 |
)
|
57 |
-
with gr.Column(scale=1):
|
58 |
optimization_checkboxes = gr.CheckboxGroup(
|
59 |
label="Optimizations π οΈ",
|
60 |
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
@@ -62,11 +62,29 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
62 |
info="βοΈ Select the optimization",
|
63 |
elem_id="optimization-checkboxes",
|
64 |
)
|
65 |
-
with gr.Column(scale=
|
66 |
quantization_checkboxes = gr.CheckboxGroup(
|
67 |
label="Quantizations ποΈ",
|
68 |
-
choices=[
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
info="βοΈ Select the quantization schemes",
|
71 |
elem_id="quantization-checkboxes",
|
72 |
)
|
|
|
10 |
|
11 |
def create_control_panel(machine: str = "hf-dgx-01"):
|
12 |
# descriptive text
|
13 |
+
gr.HTML("Use this control panel to filter the leaderboard.", elem_id="text")
|
14 |
# controls
|
15 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
16 |
with gr.Row():
|
|
|
21 |
elem_id="search-bar",
|
22 |
)
|
23 |
with gr.Row():
|
24 |
+
with gr.Column(scale=1, variant="panel"):
|
25 |
score_slider = gr.Slider(
|
26 |
label="Open LLM Score (%) π",
|
27 |
info="ποΈ Slide to minimum Open LLM score",
|
28 |
value=0,
|
29 |
elem_id="threshold-slider",
|
30 |
)
|
31 |
+
with gr.Column(scale=1, variant="panel"):
|
32 |
memory_slider = gr.Slider(
|
33 |
label="Peak Memory (MB) π",
|
34 |
info="ποΈ Slide to maximum Peak Memory",
|
|
|
46 |
elem_id="backend-checkboxes",
|
47 |
)
|
48 |
with gr.Row():
|
49 |
+
with gr.Column(scale=1, variant="panel"):
|
50 |
datatype_checkboxes = gr.CheckboxGroup(
|
51 |
label="Load DTypes π₯",
|
52 |
choices=["float32", "float16", "bfloat16"],
|
|
|
54 |
info="βοΈ Select the load data types",
|
55 |
elem_id="dtype-checkboxes",
|
56 |
)
|
57 |
+
with gr.Column(scale=1, variant="panel"):
|
58 |
optimization_checkboxes = gr.CheckboxGroup(
|
59 |
label="Optimizations π οΈ",
|
60 |
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
|
|
62 |
info="βοΈ Select the optimization",
|
63 |
elem_id="optimization-checkboxes",
|
64 |
)
|
65 |
+
with gr.Column(scale=2):
|
66 |
quantization_checkboxes = gr.CheckboxGroup(
|
67 |
label="Quantizations ποΈ",
|
68 |
+
choices=[
|
69 |
+
"None",
|
70 |
+
"BnB.4bit",
|
71 |
+
"BnB.8bit",
|
72 |
+
"GPTQ.4bit",
|
73 |
+
"GPTQ.4bit+ExllamaV1",
|
74 |
+
"GPTQ.4bit+ExllamaV2",
|
75 |
+
"AWQ.4bit+GEMM",
|
76 |
+
"AWQ.4bit+GEMV",
|
77 |
+
],
|
78 |
+
value=[
|
79 |
+
"None",
|
80 |
+
"BnB.4bit",
|
81 |
+
"BnB.8bit",
|
82 |
+
"GPTQ.4bit",
|
83 |
+
"GPTQ.4bit+ExllamaV1",
|
84 |
+
"GPTQ.4bit+ExllamaV2",
|
85 |
+
"AWQ.4bit+GEMM",
|
86 |
+
"AWQ.4bit+GEMV",
|
87 |
+
],
|
88 |
info="βοΈ Select the quantization schemes",
|
89 |
elem_id="quantization-checkboxes",
|
90 |
)
|
src/exllama.py
CHANGED
@@ -29,11 +29,11 @@ EXLLAMA_DATA = [
|
|
29 |
|
30 |
|
31 |
def get_exllama_df(llm_perf_df):
|
32 |
-
|
33 |
-
# separate
|
34 |
-
gptq_df =
|
35 |
-
exllamav1_df =
|
36 |
-
exllamav2_df =
|
37 |
# merge the three dataframes
|
38 |
exllamav1_df = pd.merge(
|
39 |
gptq_df,
|
|
|
29 |
|
30 |
|
31 |
def get_exllama_df(llm_perf_df):
|
32 |
+
copy_df = llm_perf_df.copy()
|
33 |
+
# separate vanilla GPTQ experiments from Exllama experiments
|
34 |
+
gptq_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit")]
|
35 |
+
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
36 |
+
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
37 |
# merge the three dataframes
|
38 |
exllamav1_df = pd.merge(
|
39 |
gptq_df,
|
src/llm_perf.py
CHANGED
@@ -94,6 +94,7 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
|
|
94 |
[
|
95 |
"backend.quantization_scheme",
|
96 |
"backend.quantization_config.bits",
|
|
|
97 |
"backend.quantization_config.load_in_4bit",
|
98 |
"backend.quantization_config.load_in_8bit",
|
99 |
"backend.quantization_config.exllama_config.version",
|
|
|
94 |
[
|
95 |
"backend.quantization_scheme",
|
96 |
"backend.quantization_config.bits",
|
97 |
+
"backend.quantization_config.version",
|
98 |
"backend.quantization_config.load_in_4bit",
|
99 |
"backend.quantization_config.load_in_8bit",
|
100 |
"backend.quantization_config.exllama_config.version",
|
src/utils.py
CHANGED
@@ -62,6 +62,10 @@ def process_quantization_scheme(x):
|
|
62 |
return "GPTQ.4bit+ExllamaV2"
|
63 |
elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
|
64 |
return "GPTQ.4bit"
|
|
|
|
|
|
|
|
|
65 |
else:
|
66 |
return "None"
|
67 |
|
|
|
62 |
return "GPTQ.4bit+ExllamaV2"
|
63 |
elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
|
64 |
return "GPTQ.4bit"
|
65 |
+
elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
|
66 |
+
return "AWQ.4bit+GEMM"
|
67 |
+
elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
|
68 |
+
return "AWQ.4bit+GEMV"
|
69 |
else:
|
70 |
return "None"
|
71 |
|