Spaces:

optimum
/

llm-perf-leaderboard

Running

App Files Files Community

IlyasMoutawwakil HF staff commited on May 16

Commit

76b423c

•

1 Parent(s): 57896bb

update

Browse files

Files changed (11) hide show

app.py +36 -34
requirements.txt +1 -0
src/{flashattentionv2.py → attention.py} +78 -65
src/bettertransformer.py +0 -144
src/content.py +2 -2
src/control_panel.py +37 -37
src/{quantization_kernels.py → kernels.py} +21 -12
src/leaderboard.py +7 -8
src/llm_perf.py +84 -98
src/{latency_score_memory.py → map.py} +9 -6
src/utils.py +53 -23

app.py CHANGED Viewed

@@ -1,26 +1,25 @@
-import os
 import gradio as gr
-from src.control_panel import create_control_panel, create_control_callback, create_select_callback
-from src.latency_score_memory import create_lat_score_mem_plot
-from src.quantization_kernels import create_quant_plots
-from src.leaderboard import create_leaderboard_table
-from src.bettertransformer import create_bt_plots
-from src.flashattentionv2 import create_fa2_plots
-from src.llm_perf import get_llm_perf_df
 from src.assets import custom_css
-from src.content import (
-    LOGO,
-    TITLE,
-    ABOUT,
-    CITATION_BUTTON,
-    CITATION_BUTTON_LABEL,
 )
-MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB-275W 🖥️", "audace": "RTX4090-24GB-450W 💻"}
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
 demo = gr.Blocks(css=custom_css)
@@ -41,22 +40,27 @@ with demo:
                     datatype_checkboxes,
                     optimization_checkboxes,
                     quantization_checkboxes,
-                ) = create_control_panel()
                 ####################### HARDWARE SUBTABS #######################
                 with gr.Tabs(elem_classes="subtabs"):
-                    llm_perf_df = get_llm_perf_df(machine=machine)
                     ####################### LEADERBOARD TAB #######################
                     with gr.TabItem("Leaderboard 🏅", id=0):
-                        search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(llm_perf_df)
                     with gr.TabItem("Find Your Best Model 🧭", id=1):
-                        lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
-                    ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
-                    with gr.TabItem("ScaledDotProductAttention 📈", id=2):
-                        bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
-                    with gr.TabItem("FlashAttentionV2 📈", id=3):
-                        fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
-                    with gr.TabItem("Quantization Kernels 📈", id=4):
-                        quant_prefill_plot, quant_decode_plot = create_quant_plots(llm_perf_df)
                 ####################### CONTROL CALLBACK #######################
                 create_control_callback(
@@ -75,12 +79,10 @@ with demo:
                     # outputs
                     leaderboard_table,
                     lat_score_mem_plot,
-                    bt_prefill_plot,
-                    bt_decode_plot,
-                    fa2_prefill_plot,
-                    fa2_decode_plot,
-                    quant_prefill_plot,
-                    quant_decode_plot,
                 )
                 create_select_callback(

 import gradio as gr
 from src.assets import custom_css
+# from src.attention import create_attn_plots
+from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
+from src.control_panel import (
+    create_control_callback,
+    create_control_panel,
+    create_select_callback,
 )
+from src.leaderboard import create_leaderboard_table
+from src.llm_perf import get_llm_perf_df
+from src.map import create_lat_score_mem_plot
+# from custom_kernels import create_quant_krnl_plots
+MACHINE_TO_HARDWARE = {
+    "1xA10": "A10-24GB-150W 🖥️",
+    "1xA100": "A100-80GB-275W 🖥️",
+    # "1xH100": "H100-80GB-700W 🖥️",
+}
 demo = gr.Blocks(css=custom_css)
                     datatype_checkboxes,
                     optimization_checkboxes,
                     quantization_checkboxes,
+                ) = create_control_panel(machine=machine)
                 ####################### HARDWARE SUBTABS #######################
                 with gr.Tabs(elem_classes="subtabs"):
+                    open_llm_perf_df = get_llm_perf_df(machine=machine)
                     ####################### LEADERBOARD TAB #######################
                     with gr.TabItem("Leaderboard 🏅", id=0):
+                        search_bar, columns_checkboxes, leaderboard_table = (
+                            create_leaderboard_table(open_llm_perf_df)
+                        )
                     with gr.TabItem("Find Your Best Model 🧭", id=1):
+                        lat_score_mem_plot = create_lat_score_mem_plot(open_llm_perf_df)
+                    ###################### ATTENTIONS SPEEDUP TAB #######################
+                    # with gr.TabItem("Attention 📈", id=2):
+                    #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
+                    #         open_llm_perf_df
+                    #     )
+                    # ####################### KERNELS SPEEDUP TAB #######################
+                    # with gr.TabItem("Kernels 📈", id=4):
+                    #     quant_krnl_prefill_plot, quant_krnl_decode_plot = (
+                    #         create_quant_krnl_plots(llm_perf_df)
+                    #     )
                 ####################### CONTROL CALLBACK #######################
                 create_control_callback(
                     # outputs
                     leaderboard_table,
                     lat_score_mem_plot,
+                    # attn_prefill_plot,
+                    # attn_decode_plot,
+                    # quant_krnl_prefill_plot,
+                    # quant_krnl_decode_plot,
                 )
                 create_select_callback(

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 huggingface_hub
 gradio
 plotly
 pandas

 huggingface_hub
+transformers
 gradio
 plotly
 pandas

src/{flashattentionv2.py → attention.py} RENAMED Viewed

@@ -2,143 +2,156 @@ import gradio as gr
 import pandas as pd
 import plotly.express as px
-FLASHATTENTIONV2_DATA = [
     # open llm
     "Model 🤗",
-    "DType 📥",
-    "Backend 🏭",
     "Params (B)",
     "Architecture 🏛️",
     "Open LLM Score (%)",
     # deployment settings
-    "DType 📥",
     "Backend 🏭",
-    "Optimization 🛠️",
     "Quantization 🗜️",
-    "Optimization 🛠️ FlashAttentionV2",
     # primary measurements
     "Prefill (s)",
-    "Prefill (s) FlashAttentionV2",
     "Decode (tokens/s)",
-    "Decode (tokens/s) FlashAttentionV2",
-    "End-to-End (tokens/s)",
-    "End-to-End (tokens/s) FlashAttentionV2",
     # speedups
     "Prefill Speedup (%)",
     "Decode Speedup (%)",
 ]
-def get_fa2_df(llm_perf_df):
-    copy_df = llm_perf_df.copy()
-    # seperate original model experiments from FlashAttentionV2 experiments
-    original_df = copy_df[(copy_df["Optimization 🛠️"] == "None") & (copy_df["DType 📥"] == "float16")]
-    fa2_df = copy_df[(copy_df["Optimization 🛠️"] == "FlashAttentionV2") & (copy_df["DType 📥"] == "float16")]
-    # merge the two dataframes
     fa2_df = pd.merge(
-        original_df,
         fa2_df,
-        on=["Model 🤗", "Quantization 🗜️"],
-        suffixes=["", " FlashAttentionV2"],
     )
     # compute speedups
-    fa2_df["Prefill Speedup (%)"] = ((fa2_df["Prefill (s)"] / fa2_df["Prefill (s) FlashAttentionV2"]) * 100).round(
-        2
-    ) - 100
-    fa2_df["Decode Speedup (%)"] = (
-        (fa2_df["Decode (tokens/s) FlashAttentionV2"] / fa2_df["Decode (tokens/s)"]) * 100
     ).round(2) - 100
-    # filter speedups > 1000%
-    fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
-    fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
-    return fa2_df
-def get_fa2_decode_fig(llm_perf_df):
-    fa2_df = get_fa2_df(llm_perf_df)
     # plot
-    decode_fig = px.box(
-        fa2_df,
         x="Architecture 🏛️",
-        y="Decode Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=FLASHATTENTIONV2_DATA,
-        color="Quantization 🗜️",
         points="all",
     )
     # add hover data
-    decode_fig.update_traces(
         hovertemplate="<br>".join(
-            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
         )
     )
     # add layout
-    decode_fig.update_layout(
         title={
-            "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
-            "y": 0.95,
-            "x": 0.5,
             "xanchor": "center",
             "yanchor": "top",
         },
         xaxis_title="LLM Architecture",
-        yaxis_title="Decode Speedup (%)",
-        legend_title="Quantization Scheme",
         width=1200,
         height=600,
     )
-    return decode_fig
-def get_fa2_prefill_fig(llm_perf_df):
-    fa2_df = get_fa2_df(llm_perf_df)
     # plot
-    prefill_fig = px.box(
-        fa2_df,
         x="Architecture 🏛️",
-        y="Prefill Speedup (%)",
         color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=FLASHATTENTIONV2_DATA,
-        color="Quantization 🗜️",
         points="all",
     )
     # add hover data
-    prefill_fig.update_traces(
         hovertemplate="<br>".join(
-            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
         )
     )
     # add layout
-    prefill_fig.update_layout(
         title={
-            "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
-            "y": 0.95,
-            "x": 0.5,
             "xanchor": "center",
             "yanchor": "top",
         },
         xaxis_title="LLM Architecture",
-        yaxis_title="Prefill Speedup (%)",
-        legend_title="Quantization Scheme",
         width=1200,
         height=600,
     )
-    return prefill_fig
-def create_fa2_plots(llm_perf_df):
     # descriptive text
     gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
     # get figures
-    prefill_fig = get_fa2_prefill_fig(llm_perf_df)
-    decode_fig = get_fa2_decode_fig(llm_perf_df)
     # create plots
-    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
     decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
     return prefill_plot, decode_plot

 import pandas as pd
 import plotly.express as px
# Columns attached to every point of the attention speedup box plots.
# The figure builders pass this list to plotly as `custom_data` and generate
# the hover template from it in the same order, so each entry becomes one
# "<b>column:</b> value" line in the hover box.
# NOTE(review): after the merge in get_attn_df the left (Eager) frame keeps the
# unsuffixed column names, so these entries presumably show the Eager baseline
# row rather than the compared " other" row -- confirm that is intended.
ATTN_DATA = [
    # open llm
    "Model 🤗",
    "Experiment 🧪",
    "Params (B)",
    "Architecture 🏛️",
    "Open LLM Score (%)",
    # deployment settings
    "Backend 🏭",
    "Quantization 🗜️",
    "Precision 📥",
    "Attention 👁️",
    "Kernel ⚛️",
    # primary measurements
    "Prefill (s)",
    "Decode (tokens/s)",
    # speedups (computed in get_attn_df, not read from the input frame)
    "Prefill Speedup (%)",
    "Decode Speedup (%)",
]
def get_attn_df(open_llm_perf_df):
    """Pair every Eager-attention run with its SDPA / FAv2 counterpart.

    Rows are matched on "Model 🤗" and on the combined
    "Quantization 🗜️" + " & " + "Kernel ⚛️" label. In the merged frame the
    Eager (baseline) columns keep their original names and the columns of the
    compared attention implementation carry the " other" suffix.

    Args:
        open_llm_perf_df: DataFrame holding at least the columns "Model 🤗",
            "Quantization 🗜️", "Kernel ⚛️", "Attention 👁️", "Prefill (s)" and
            "Decode (tokens/s)". It is copied, never modified.

    Returns:
        A DataFrame with one row per (Eager run, SDPA-or-FAv2 run) pair, a
        fresh 0..n-1 index, and two extra columns, "Prefill Speedup (%)" and
        "Decode Speedup (%)", both relative to the Eager run.
    """
    copy_df = open_llm_perf_df.copy()
    copy_df["Quantization & Kernel"] = (
        copy_df["Quantization 🗜️"] + " & " + copy_df["Kernel ⚛️"]
    )

    eager_df = copy_df[copy_df["Attention 👁️"] == "Eager"]
    # one merge per compared implementation instead of two copy-pasted merges
    paired_dfs = [
        pd.merge(
            eager_df,
            copy_df[copy_df["Attention 👁️"] == attention],
            on=["Model 🤗", "Quantization & Kernel"],
            suffixes=("", " other"),
        )
        for attention in ("SDPA", "FAv2")
    ]
    # ignore_index: each merge result is numbered from 0, so a plain concat
    # would leave duplicate index labels in the combined frame
    attn_df = pd.concat(paired_dfs, ignore_index=True)

    # compute speedups
    # prefill is a latency (lower is better): baseline / other
    attn_df["Prefill Speedup (%)"] = (
        (attn_df["Prefill (s)"] / attn_df["Prefill (s) other"]) * 100
    ).round(2) - 100
    # decode is a throughput (higher is better): other / baseline
    attn_df["Decode Speedup (%)"] = (
        (attn_df["Decode (tokens/s) other"] / attn_df["Decode (tokens/s)"]) * 100
    ).round(2) - 100

    return attn_df
def get_attn_prefill_fig(open_llm_perf_df):
    """Return the box plot of prefill speedup per architecture.

    The speedup table comes from get_attn_df; points are coloured by the
    attention implementation that was compared against Eager.
    """
    speedup_df = get_attn_df(open_llm_perf_df)

    # one "<b>column:</b> value" hover line per ATTN_DATA entry, same order
    # as the custom_data handed to plotly below
    hover_lines = []
    for position, name in enumerate(ATTN_DATA):
        hover_lines.append(f"<b>{name}:</b> %{{customdata[{position}]}}")

    figure = px.box(
        speedup_df,
        x="Architecture 🏛️",
        y="Prefill Speedup (%)",
        color="Attention 👁️ other",
        custom_data=ATTN_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
        points="all",
    )
    figure.update_traces(hovertemplate="<br>".join(hover_lines))

    centered_title = {
        "text": "Prefill Speedup per Architecture, Compared To Eager Attention",
        "xanchor": "center",
        "yanchor": "top",
        "y": 0.95,
        "x": 0.5,
    }
    figure.update_layout(
        title=centered_title,
        yaxis_title="Prefill Speedup (%)",
        xaxis_title="LLM Architecture",
        legend_title="Attention",
        width=1200,
        height=600,
    )
    return figure
def get_attn_decode_fig(open_llm_perf_df):
    """Return the box plot of decode speedup per architecture.

    The speedup table comes from get_attn_df; points are coloured by the
    attention implementation that was compared against Eager, and every
    ATTN_DATA column is exposed in the hover box.
    """
    attn_df = get_attn_df(open_llm_perf_df)
    # plot
    decode_fig = px.box(
        attn_df,
        x="Architecture 🏛️",
        y="Decode Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=ATTN_DATA,
        color="Attention 👁️ other",
        points="all",
    )
    # add hover data: one line per ATTN_DATA column, indexed like custom_data
    decode_fig.update_traces(
        hovertemplate="<br>".join(
            [
                f"<b>{column}:</b> %{{customdata[{i}]}}"
                for i, column in enumerate(ATTN_DATA)
            ]
        )
    )
    # add layout
    decode_fig.update_layout(
        title={
            "text": "Decode Speedup per Architecture, Compared To Eager Attention",
            "xanchor": "center",
            "yanchor": "top",
            "y": 0.95,
            "x": 0.5,
        },
        yaxis_title="Decode Speedup (%)",
        xaxis_title="LLM Architecture",
        legend_title="Attention",
        width=1200,
        height=600,
    )

    return decode_fig
def create_attn_plots(open_llm_perf_df):
    """Create the hint text and the two attention speedup plot components.

    Returns the (prefill_plot, decode_plot) pair of gradio Plot components.
    """
    # hint shown above the plots
    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")

    # build both figures before creating any plot component
    figures = (
        get_attn_prefill_fig(open_llm_perf_df),
        get_attn_decode_fig(open_llm_perf_df),
    )

    # both plots share the same element id and hide their label
    plot_options = {"elem_id": "plot", "show_label": False}
    prefill_plot, decode_plot = (
        gr.components.Plot(value=figure, **plot_options) for figure in figures
    )
    return prefill_plot, decode_plot

src/bettertransformer.py DELETED Viewed

@@ -1,144 +0,0 @@
-import gradio as gr
-import pandas as pd
-import plotly.express as px
-BETTERTRANSFORMER_DATA = [
-    # open llm
-    "Model 🤗",
-    "DType 📥",
-    "Backend 🏭",
-    "Params (B)",
-    "Architecture 🏛️",
-    "Open LLM Score (%)",
-    # deployment settings
-    "DType 📥",
-    "Backend 🏭",
-    "Optimization 🛠️",
-    "Quantization 🗜️",
-    "Optimization 🛠️ BetterTransformer",
-    # primary measurements
-    "Prefill (s)",
-    "Prefill (s) BetterTransformer",
-    "Decode (tokens/s)",
-    "Decode (tokens/s) BetterTransformer",
-    "End-to-End (tokens/s)",
-    "End-to-End (tokens/s) BetterTransformer",
-    # speedups
-    "Prefill Speedup (%)",
-    "Decode Speedup (%)",
-]
-def get_bt_df(llm_perf_df):
-    copy_df = llm_perf_df.copy()
-    # seperate original model experiments from BetterTransformer experiments
-    original_df = copy_df[(copy_df["Optimization 🛠️"] == "None") & (copy_df["DType 📥"] == "float16")]
-    bt_df = copy_df[(copy_df["Optimization 🛠️"] == "BetterTransformer") & (copy_df["DType 📥"] == "float16")]
-    # merge the two dataframes
-    bt_df = pd.merge(
-        original_df,
-        bt_df,
-        on=["Model 🤗", "Quantization 🗜️"],
-        suffixes=["", " BetterTransformer"],
-    )
-    # compute speedups
-    bt_df["Prefill Speedup (%)"] = (
-        (bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
-    ).round(2) - 100
-    bt_df["Decode Speedup (%)"] = (
-        (bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
-    ).round(2) - 100
-    # filter speedups > 1000%
-    bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
-    bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
-    return bt_df
-def get_bt_prefill_fig(llm_perf_df):
-    bt_df = get_bt_df(llm_perf_df)
-    # plot
-    prefill_fig = px.box(
-        bt_df,
-        x="Architecture 🏛️",
-        y="Prefill Speedup (%)",
-        color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=BETTERTRANSFORMER_DATA,
-        color="Quantization 🗜️",
-        points="all",
-    )
-    # add hover data
-    prefill_fig.update_traces(
-        hovertemplate="<br>".join(
-            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
-        )
-    )
-    # add layout
-    prefill_fig.update_layout(
-        title={
-            "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
-            "y": 0.95,
-            "x": 0.5,
-            "xanchor": "center",
-            "yanchor": "top",
-        },
-        xaxis_title="LLM Architecture",
-        yaxis_title="Prefill Speedup (%)",
-        legend_title="Quantization Scheme",
-        width=1200,
-        height=600,
-    )
-    return prefill_fig
-def get_bt_decode_fig(llm_perf_df):
-    bt_df = get_bt_df(llm_perf_df)
-    # plot
-    decode_fig = px.box(
-        bt_df,
-        x="Architecture 🏛️",
-        y="Decode Speedup (%)",
-        color_discrete_sequence=px.colors.qualitative.Light24,
-        custom_data=BETTERTRANSFORMER_DATA,
-        color="Quantization 🗜️",
-        points="all",
-    )
-    # add hover data
-    decode_fig.update_traces(
-        hovertemplate="<br>".join(
-            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
-        )
-    )
-    # add layout
-    decode_fig.update_layout(
-        title={
-            "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
-            "y": 0.95,
-            "x": 0.5,
-            "xanchor": "center",
-            "yanchor": "top",
-        },
-        xaxis_title="LLM Architecture",
-        yaxis_title="Decode Speedup (%)",
-        legend_title="Quantization Scheme",
-        width=1200,
-        height=600,
-    )
-    return decode_fig
-def create_bt_plots(llm_perf_df):
-    # descriptive text
-    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
-    # get figures
-    prefill_fig = get_bt_prefill_fig(llm_perf_df)
-    decode_fig = get_bt_decode_fig(llm_perf_df)
-    # create plots
-    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
-    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
-    return prefill_plot, decode_plot

src/content.py CHANGED Viewed

@@ -14,7 +14,7 @@ configuration for automated benchmarking:
 - Model evaluation requests should be made in the
 [🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
-we scrape the list of pretrained base models from there.
 - Hardware/Backend/Optimization configuration requests should be made in the
 [🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
 [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
@@ -23,7 +23,7 @@ we scrape the list of pretrained base models from there.
 - To avoid communication-dependent results, only one GPU is used.
 - Score is the average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
-- LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.
 - Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
 - We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.

 - Model evaluation requests should be made in the
 [🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
+we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
 - Hardware/Backend/Optimization configuration requests should be made in the
 [🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
 [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
 - To avoid communication-dependent results, only one GPU is used.
 - Score is the average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+- LLMs are run on a singleton batch with a prompt size of 256, generating 64 tokens, for at least 10 iterations and 10 seconds.
 - Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
 - We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.

src/control_panel.py CHANGED Viewed

@@ -1,14 +1,14 @@
 import gradio as gr
-from src.llm_perf import get_llm_perf_df
 from src.leaderboard import get_leaderboard_df
-from src.latency_score_memory import get_lat_score_mem_fig
-from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
-from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
-from src.quantization_kernels import get_quant_prefill_fig, get_quant_decode_fig
-def create_control_panel(machine: str = "hf-dgx-01"):
     # controls
     machine_textbox = gr.Textbox(value=machine, visible=False)
     with gr.Accordion("Control Panel 🎛️", open=False, elem_id="control-panel"):
@@ -29,7 +29,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
                     value=80 * 1024,
                     elem_id="memory-slider",
                 )
-            with gr.Column(scale=1):
                 backend_checkboxes = gr.CheckboxGroup(
                     label="Backends 🏭",
                     choices=["pytorch"],
@@ -40,7 +40,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
         with gr.Row():
             with gr.Column(scale=1, variant="panel"):
                 datatype_checkboxes = gr.CheckboxGroup(
-                    label="Load DTypes 📥",
                     choices=["float32", "float16", "bfloat16"],
                     value=["float32", "float16", "bfloat16"],
                     info="☑️ Select the load data types",
@@ -48,13 +48,13 @@ def create_control_panel(machine: str = "hf-dgx-01"):
                 )
             with gr.Column(scale=1, variant="panel"):
                 optimization_checkboxes = gr.CheckboxGroup(
-                    label="Optimizations 🛠️",
                     choices=["None", "BetterTransformer", "FlashAttentionV2"],
                     value=["None", "BetterTransformer", "FlashAttentionV2"],
                     info="☑️ Select the optimization",
                     elem_id="optimization-checkboxes",
                 )
-            with gr.Column(scale=2):
                 quantization_checkboxes = gr.CheckboxGroup(
                     label="Quantizations 🗜️",
                     choices=[
@@ -118,29 +118,29 @@ def filter_fn(
         # raw_df["Model 🤗"].str.contains(model, case=False)
         raw_df["Backend 🏭"].isin(backends)
         & raw_df["DType 📥"].isin(datatypes)
-        & raw_df["Optimization 🛠️"].isin(optimizations)
         & raw_df["Quantization 🗜️"].isin(quantizations)
         & (raw_df["Open LLM Score (%)"] >= score)
         & (raw_df["Allocated Memory (MB)"] <= memory)
     ]
     filtered_leaderboard_df = select_fn(machine, columns, search)
     filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
-    filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
-    filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
-    filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
-    filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
-    filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
-    filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)
     return [
         filtered_leaderboard_df,
         filtered_lat_score_mem_fig,
-        filtered_bt_prefill_fig,
-        filtered_bt_decode_fig,
-        filtered_fa2_prefill_fig,
-        filtered_fa2_decode_fig,
-        filtered_quant_prefill_fig,
-        filtered_quant_decode_fig,
     ]
@@ -162,12 +162,12 @@ def create_control_callback(
     # outputs
     leaderboard_table,
     lat_score_mem_plot,
-    bt_prefill_plot,
-    bt_decode_plot,
-    fa2_prefill_plot,
-    fa2_decode_plot,
-    quant_prefill_plot,
-    quant_decode_plot,
 ):
     filter_button.click(
         fn=filter_fn,
@@ -188,19 +188,19 @@ def create_control_callback(
         outputs=[
             leaderboard_table,
             lat_score_mem_plot,
-            bt_prefill_plot,
-            bt_decode_plot,
-            fa2_prefill_plot,
-            fa2_decode_plot,
-            quant_prefill_plot,
-            quant_decode_plot,
         ],
     )
 def select_fn(machine, columns, search):
-    raw_df = get_llm_perf_df(machine=machine)
-    selected_leaderboard_df = get_leaderboard_df(raw_df)
     selected_leaderboard_df = selected_leaderboard_df[
         selected_leaderboard_df["Model 🤗"].str.contains(search, case=False)
     ]

 import gradio as gr
 from src.leaderboard import get_leaderboard_df
+from src.llm_perf import get_llm_perf_df
+# from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig
+# from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig
+from src.map import get_lat_score_mem_fig
+def create_control_panel(machine: str):
     # controls
     machine_textbox = gr.Textbox(value=machine, visible=False)
     with gr.Accordion("Control Panel 🎛️", open=False, elem_id="control-panel"):
                     value=80 * 1024,
                     elem_id="memory-slider",
                 )
+            with gr.Column(scale=1, variant="panel"):
                 backend_checkboxes = gr.CheckboxGroup(
                     label="Backends 🏭",
                     choices=["pytorch"],
         with gr.Row():
             with gr.Column(scale=1, variant="panel"):
                 datatype_checkboxes = gr.CheckboxGroup(
+                    label="DTypes 📥",
                     choices=["float32", "float16", "bfloat16"],
                     value=["float32", "float16", "bfloat16"],
                     info="☑️ Select the load data types",
                 )
             with gr.Column(scale=1, variant="panel"):
                 optimization_checkboxes = gr.CheckboxGroup(
+                    label="Attentions 👁️",
                     choices=["None", "BetterTransformer", "FlashAttentionV2"],
                     value=["None", "BetterTransformer", "FlashAttentionV2"],
                     info="☑️ Select the optimization",
                     elem_id="optimization-checkboxes",
                 )
+            with gr.Column(scale=2, variant="panel"):
                 quantization_checkboxes = gr.CheckboxGroup(
                     label="Quantizations 🗜️",
                     choices=[
         # raw_df["Model 🤗"].str.contains(model, case=False)
         raw_df["Backend 🏭"].isin(backends)
         & raw_df["DType 📥"].isin(datatypes)
+        & raw_df["Attention 👁️"].isin(optimizations)
         & raw_df["Quantization 🗜️"].isin(quantizations)
         & (raw_df["Open LLM Score (%)"] >= score)
         & (raw_df["Allocated Memory (MB)"] <= memory)
     ]
     filtered_leaderboard_df = select_fn(machine, columns, search)
     filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
+    # filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
+    # filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
+    # filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
+    # filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
+    # filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
+    # filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)
     return [
         filtered_leaderboard_df,
         filtered_lat_score_mem_fig,
+        # filtered_bt_prefill_fig,
+        # filtered_bt_decode_fig,
+        # filtered_fa2_prefill_fig,
+        # filtered_fa2_decode_fig,
+        # filtered_quant_prefill_fig,
+        # filtered_quant_decode_fig,
     ]
     # outputs
     leaderboard_table,
     lat_score_mem_plot,
+    # attn_prefill_plot,
+    # attn_decode_plot,
+    # fa2_prefill_plot,
+    # fa2_decode_plot,
+    # quant_prefill_plot,
+    # quant_decode_plot,
 ):
     filter_button.click(
         fn=filter_fn,
         outputs=[
             leaderboard_table,
             lat_score_mem_plot,
+            # attn_prefill_plot,
+            # attn_decode_plot,
+            # fa2_prefill_plot,
+            # fa2_decode_plot,
+            # quant_prefill_plot,
+            # quant_decode_plot,
         ],
     )
 def select_fn(machine, columns, search):
+    llm_perf_df = get_llm_perf_df(machine=machine)
+    selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
     selected_leaderboard_df = selected_leaderboard_df[
         selected_leaderboard_df["Model 🤗"].str.contains(search, case=False)
     ]

src/{quantization_kernels.py → kernels.py} RENAMED Viewed

@@ -2,7 +2,6 @@ import gradio as gr
 import pandas as pd
 import plotly.express as px
 QUANT_DATA = [
     # open llm
     "Model 🤗",
@@ -14,9 +13,7 @@ QUANT_DATA = [
     # deployment settings
     "DType 📥",
     "Backend 🏭",
-    "Optimization 🛠️",
     "Quantization 🗜️",
-    "Optimization 🛠️ Custom Kernel",
     "Quantization 🗜️ Custom Kernel",
     # primary measurements
     "Prefill (s)",
@@ -34,9 +31,8 @@ def get_quant_df(llm_perf_df):
     # seperate vanilla GPTQ experiments from Custom Kernel experiments
     vanilla_df = copy_df[
         (copy_df["Backend 🏭"] == "pytorch")
-        & (copy_df["Quantization 🗜️"] == "None")
-        & (copy_df["Optimization 🛠️"] == "None")
         & (copy_df["DType 📥"] == "float16")
     ]
     exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
@@ -70,11 +66,12 @@ def get_quant_df(llm_perf_df):
     # concat the two dataframes row-wise
     quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
-    quant_df["Prefill Speedup (%)"] = ((quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100).round(
-        2
-    ) - 100
     quant_df["Decode Speedup (%)"] = (
-        (quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"]) * 100
     ).round(2) - 100
     # filter speedups > 1000%
     quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
@@ -97,7 +94,12 @@ def get_quant_decode_fig(llm_perf_df):
     )
     # add hover data
     decode_fig.update_traces(
-        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(QUANT_DATA)])
     )
     # add layout
     decode_fig.update_layout(
@@ -132,7 +134,12 @@ def get_quant_prefill_fig(llm_perf_df):
     )
     # add hover data
     prefill_fig.update_traces(
-        hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(QUANT_DATA)])
     )
     # add layout
     prefill_fig.update_layout(
@@ -161,7 +168,9 @@ def create_quant_plots(llm_perf_df):
     decode_fig = get_quant_decode_fig(llm_perf_df)
     # create plots
-    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
     decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
     return prefill_plot, decode_plot

 import pandas as pd
 import plotly.express as px
 QUANT_DATA = [
     # open llm
     "Model 🤗",
     # deployment settings
     "DType 📥",
     "Backend 🏭",
     "Quantization 🗜️",
     "Quantization 🗜️ Custom Kernel",
     # primary measurements
     "Prefill (s)",
     # separate vanilla GPTQ experiments from Custom Kernel experiments
     vanilla_df = copy_df[
         (copy_df["Backend 🏭"] == "pytorch")
         & (copy_df["DType 📥"] == "float16")
+        & (copy_df["Quantization 🗜️"] == "None")
     ]
     exllamav1_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV1")]
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
     # concat the two dataframes row-wise
     quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
+    quant_df["Prefill Speedup (%)"] = (
+        (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
+    ).round(2) - 100
     quant_df["Decode Speedup (%)"] = (
+        (quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"])
+        * 100
     ).round(2) - 100
     # filter speedups > 1000%
     quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
     )
     # add hover data
     decode_fig.update_traces(
+        hovertemplate="<br>".join(
+            [
+                f"<b>{column}:</b> %{{customdata[{i}]}}"
+                for i, column in enumerate(QUANT_DATA)
+            ]
+        )
     )
     # add layout
     decode_fig.update_layout(
     )
     # add hover data
     prefill_fig.update_traces(
+        hovertemplate="<br>".join(
+            [
+                f"<b>{column}:</b> %{{customdata[{i}]}}"
+                for i, column in enumerate(QUANT_DATA)
+            ]
+        )
     )
     # add layout
     prefill_fig.update_layout(
     decode_fig = get_quant_decode_fig(llm_perf_df)
     # create plots
+    prefill_plot = gr.components.Plot(
+        value=prefill_fig, elem_id="plot", show_label=False
+    )
     decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
     return prefill_plot, decode_plot

src/leaderboard.py CHANGED Viewed

@@ -2,7 +2,6 @@ import gradio as gr
 from src.utils import model_hyperlink, process_score
 LEADERBOARD_COLUMN_TO_DATATYPE = {
     # open llm
     "Model 🤗": "markdown",
@@ -13,18 +12,18 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
     "Memory (MB)": "number",
     "Energy (tokens/kWh)": "number",
     # deployment settings
-    "DType 📥": "str",
     "Backend 🏭": "str",
-    "Optimization 🛠️": "str",
     "Quantization 🗜️": "str",
     # additional measurements
-    "Architecture 🏛️": "markdown",
-    "Params (B)": "number",
     "Open LLM Score (%)": "number",
     "End-to-End (s)": "number",
-    "End-to-End (tokens/s)": "number",
-    "Reserved Memory (MB)": "number",
-    "Used Memory (MB)": "number",
 }
 PRIMARY_COLUMNS = [

 from src.utils import model_hyperlink, process_score
 LEADERBOARD_COLUMN_TO_DATATYPE = {
     # open llm
     "Model 🤗": "markdown",
     "Memory (MB)": "number",
     "Energy (tokens/kWh)": "number",
     # deployment settings
     "Backend 🏭": "str",
+    "Precision 📥": "str",
     "Quantization 🗜️": "str",
+    "Attention 👁️": "str",
+    "Kernel ⚛️": "str",
     # additional measurements
+    # "Reserved Memory (MB)": "number",
+    # "Used Memory (MB)": "number",
     "Open LLM Score (%)": "number",
     "End-to-End (s)": "number",
+    "Architecture 🏛️": "str",
+    "Params (B)": "number",
 }
 PRIMARY_COLUMNS = [

src/llm_perf.py CHANGED Viewed

@@ -1,123 +1,98 @@
 import os
 import pandas as pd
-from huggingface_hub import hf_hub_download
-from .utils import process_quantization_scheme, process_arch
-LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
 COLUMNS_MAPPING = {
-    "Model": "Model 🤗",
-    "experiment_name": "Experiment 🧪",
     # primary measurements
-    "forward.latency(s)": "Prefill (s)",
-    "decode.throughput(tokens/s)": "Decode (tokens/s)",
-    "generate.max_memory_allocated(MB)": "Memory (MB)",
-    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
     # deployment settings
-    "backend.name": "Backend 🏭",
-    "backend.torch_dtype": "DType 📥",
-    "optimization": "Optimization 🛠️",
     "quantization": "Quantization 🗜️",
-    # additional measurements
-    "Size": "Params (B)",
-    "Arch": "Architecture 🏛️",
-    "Score": "Open LLM Score (%)",
-    "generate.latency(s)": "End-to-End (s)",
-    "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
-    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
-    "generate.max_memory_used(MB)": "Used Memory (MB)",
 }
-SORTING_COLUMNS = [
-    "Open LLM Score (%)",
-    "Decode (tokens/s)",
-    "Prefill (s)",
-]
 SORTING_ASCENDING = [False, True, False]
-def get_llm_df():
-    # commented for now since scraping script is not working
-    hf_hub_download(
-        repo_id=LLM_PERF_DATASET_REPO,
-        filename="open-llm.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
-    )
-    llm_df = pd.read_csv("dataset/open-llm.csv")
-    return llm_df
-def get_perf_df(machine: str = "hf-dgx-01"):
-    hf_hub_download(
-        repo_id=LLM_PERF_DATASET_REPO,
-        filename=f"{machine}/perf-report.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
     )
-    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")
-    return perf_df
-def get_llm_perf_df(machine: str = "hf-dgx-01"):
-    # get dataframes
-    llm_df = get_llm_df()
-    perf_df = get_perf_df(machine=machine)
-    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
     # some assertions
-    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
-    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
-    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
-    # transpose energy consumption
-    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
-        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
-    ).astype(int)
-    # fix nan values
-    llm_perf_df.loc[
-        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
-        "generate.energy_consumption(tokens/kWh)",
-    ] = pd.NA
-    # add optimization column
-    llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
-        lambda x: (
-            "BetterTransformer"
-            if x["backend.to_bettertransformer"]
-            else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
-        ),
-        axis=1,
     )
-    # add quantization scheme
-    llm_perf_df["quantization"] = llm_perf_df[
-        [
-            "backend.quantization_scheme",
-            "backend.quantization_config.bits",
-            "backend.quantization_config.version",
-            "backend.quantization_config.load_in_4bit",
-            "backend.quantization_config.load_in_8bit",
-            "backend.quantization_config.exllama_config.version",
-        ]
-    ].apply(lambda x: process_quantization_scheme(x), axis=1)
-    # process experiment name
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
-        lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
     )
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-4bit", "BnB-4bit"))
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-8bit", "BnB-8bit"))
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "AWQ-4bit"))
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "GPTQ-4bit"))
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "SDPA"))
-    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA-v2"))
-    # add arch
-    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
     # filter columns
     llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
     # rename columns
@@ -130,3 +105,14 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
     )
     return llm_perf_df

 import os
 import pandas as pd
+from .utils import process_kernels, process_quantizations
 COLUMNS_MAPPING = {
+    # Keys are dataframe column names; processed_llm_perf_df keeps only these
+    # keys when it filters columns. Values are presumably the display names
+    # applied by the rename step (that code is not visible here — confirm).
+    # "quantization", "attention", "kernel", "architecture" and
+    # "prefill+decode" are derived columns built in processed_llm_perf_df.
+    "config.name": "Experiment 🧪",
+    "config.backend.model": "Model 🤗",
     # primary measurements
+    "report.prefill.latency.p50": "Prefill (s)",
+    "report.per_token.latency.p50": "Per Token (s)",
+    "report.decode.throughput.value": "Decode (tokens/s)",
+    "report.decode.efficiency.value": "Energy (tokens/kWh)",
+    "report.decode.memory.max_allocated": "Memory (MB)",
     # deployment settings
+    "config.backend.name": "Backend 🏭",
+    "config.backend.torch_dtype": "Precision 📥",
     "quantization": "Quantization 🗜️",
+    "attention": "Attention 👁️",
+    "kernel": "Kernel ⚛️",
+    # additional information
+    "architecture": "Architecture 🏛️",
+    "prefill+decode": "End-to-End (s)",
+    # NOTE(review): the two keys below presumably come from the Open LLM
+    # leaderboard CSV merged in get_raw_llm_perf_df — confirm column names.
+    "Average ⬆️": "Open LLM Score (%)",
+    "#Params (B)": "Params (B)",
 }
+# Display-name columns presumably used to order the leaderboard (the sorting
+# call itself is not visible in this part of the diff).
+SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
+# Dataset subsets; get_raw_llm_perf_df reads one CSV per entry.
+SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
+# NOTE(review): if these flags pair positionally with SORTING_COLUMNS, decode
+# throughput is sorted ascending and prefill latency descending, which looks
+# reversed (higher tokens/s and lower seconds are usually better) — confirm.
 SORTING_ASCENDING = [False, True, False]
+def get_raw_llm_perf_df(machine: str = "1xA10"):
+    # Read one benchmark CSV per entry in SUBSETS for `machine`, concatenate
+    # them, and merge the result with the Open LLM leaderboard CSV.
+    dfs = []
+    for subset in SUBSETS:
+        try:
+            dfs.append(
+                pd.read_csv(
+                    f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-perf-leaderboard-{subset}-{machine}.csv"
+                )
+            )
+        # NOTE(review): every exception type is caught, so failures other than
+        # a missing file (network, auth, parsing) are also printed as
+        # "not found" and the subset is silently skipped.
+        except Exception:
+            print(f"Subset {subset} for machine {machine} not found")
+    # NOTE(review): if every subset failed, `dfs` is empty and pd.concat
+    # raises ValueError — confirm whether that is the intended failure mode.
+    llm_perf_df = pd.concat(dfs)
+    open_llm_df = pd.read_csv(
+        "hf://datasets/optimum-benchmark/open-llm-leaderboard/open-llm-leaderboard.csv"
+    )
+    # No `how` is passed, so pandas' default (inner) join applies: benchmark
+    # rows whose model has no matching "Model" entry are dropped.
+    llm_perf_df = pd.merge(
+        open_llm_df, llm_perf_df, left_on="Model", right_on="config.backend.model"
     )
+    return llm_perf_df
+def processed_llm_perf_df(llm_perf_df):
     # some assertions
+    assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
+    assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
+    assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
+    assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
+    # fix a couple of things
+    llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace(
+        "flash_attention_2", "fa2"
     )
+    llm_perf_df["prefill+decode"] = (
+        llm_perf_df["report.prefill.latency.p50"]
+        + (llm_perf_df["report.decode.latency.p50"])
+    )
+    # llm_perf_df["architecture"] = llm_perf_df["config.backend.model"].apply(
+    #     process_architectures
+    # )
+    llm_perf_df["architecture"] = llm_perf_df["Architecture"]
+    llm_perf_df["attention"] = (
+        llm_perf_df["config.backend.attn_implementation"]
+        .str.replace("flash_attention_2", "FAv2")
+        .str.replace("eager", "Eager")
+        .str.replace("sdpa", "SDPA")
+    )
+    llm_perf_df["quantization"] = llm_perf_df.apply(process_quantizations, axis=1)
+    llm_perf_df["kernel"] = llm_perf_df.apply(process_kernels, axis=1)
+    # round numerical columns
+    llm_perf_df = llm_perf_df.round(
+        {
+            "report.prefill.latency.p50": 3,
+            "report.decode.latency.p50": 3,
+            "report.decode.throughput.value": 3,
+            "report.decode.efficiency.value": 3,
+            "report.decode.memory.max_allocated": 3,
+            "Average ⬆️": 3,
+            "prefill+decode": 3,
+            "#Params (B)": 3,
+        }
     )
     # filter columns
     llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
     # rename columns
     )
     return llm_perf_df
+def get_llm_perf_df(machine: str = "1xA10"):
+    # Return the processed leaderboard dataframe for `machine`, using a CSV
+    # at a relative path (resolved against the working directory) as a cache.
+    if os.path.exists(f"llm-perf-leaderboard-{machine}.csv"):
+        # cache hit: load the existing file instead of fetching again
+        llm_perf_df = pd.read_csv(f"llm-perf-leaderboard-{machine}.csv")
+    else:
+        # cache miss: fetch the raw data, process it, then persist the result
+        llm_perf_df = get_raw_llm_perf_df(machine)
+        llm_perf_df = processed_llm_perf_df(llm_perf_df)
+        llm_perf_df.to_csv(f"llm-perf-leaderboard-{machine}.csv", index=False)
+    # NOTE(review): nothing in this function refreshes or removes the cached
+    # file, so an existing CSV is reused for as long as it stays on disk.
+    return llm_perf_df

src/{latency_score_memory.py → map.py} RENAMED Viewed

@@ -1,20 +1,20 @@
 import gradio as gr
 import plotly.express as px
 SCORE_MEMORY_LATENCY_DATA = [
     "Model 🤗",
-    "DType 📥",
     "Backend 🏭",
     "Params (B)",
-    "Architecture 🏛️",
-    "Optimization 🛠️",
     "Quantization 🗜️",
     "Open LLM Score (%)",
     "Prefill (s)",
     "Decode (tokens/s)",
     "Memory (MB)",
     "End-to-End (s)",
 ]
@@ -32,7 +32,10 @@ def get_lat_score_mem_fig(llm_perf_df):
     )
     fig.update_traces(
         hovertemplate="<br>".join(
-            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
         )
     )
     fig.update_layout(
@@ -43,7 +46,7 @@ def get_lat_score_mem_fig(llm_perf_df):
             "xanchor": "center",
             "yanchor": "top",
         },
-        xaxis_title="Time To Generate 256 Tokens (s)",
         yaxis_title="Open LLM Score (%)",
         legend_title="LLM Architecture",
         width=1200,

 import gradio as gr
 import plotly.express as px
 SCORE_MEMORY_LATENCY_DATA = [
+    # Column names listed in the figure's hover template; the template is
+    # built by enumerating this list, so position i maps to customdata[i].
     "Model 🤗",
     "Backend 🏭",
+    "Precision 📥",
     "Params (B)",
     "Quantization 🗜️",
+    "Attention 👁️",
+    "Kernel ⚛️",
     "Open LLM Score (%)",
     "Prefill (s)",
     "Decode (tokens/s)",
     "Memory (MB)",
     "End-to-End (s)",
+    "Architecture 🏛️",
 ]
     )
     fig.update_traces(
         hovertemplate="<br>".join(
+            [
+                f"<b>{column}:</b> %{{customdata[{i}]}}"
+                for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)
+            ]
         )
     )
     fig.update_layout(
             "xanchor": "center",
             "yanchor": "top",
         },
+        xaxis_title="Time To Generate 64 Tokens (s)",
         yaxis_title="Open LLM Score (%)",
         legend_title="LLM Architecture",
         width=1200,

src/utils.py CHANGED Viewed

@@ -1,3 +1,5 @@
 LLM_MODEL_ARCHS = {
     "stablelm_epoch": "🔴 StableLM-Epoch",
     "stablelm_alpha": "🔴 StableLM-Alpha",
@@ -16,16 +18,14 @@ LLM_MODEL_ARCHS = {
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
     "deci": "🔵 deci",
-    "Yi": "🫂 Yi 人", # people
     "mpt": "🧱 MPT",
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
     "gpt2": "GPT-2",
     "gptj": "GPT-J",
-    "xglm": "XGLM",
     "bart": "BART",
-    "opt": "OPT",
 }
@@ -33,11 +33,13 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-def process_arch(model_arch):
-    if model_arch in LLM_MODEL_ARCHS:
-        return LLM_MODEL_ARCHS[model_arch]
-    else:
-        return model_arch
 def process_score(score, quantization):
@@ -47,25 +49,53 @@ def process_score(score, quantization):
         return f"{score:.2f} "
-def process_quantization_scheme(x):
-    if x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_4bit"] == True:
         return "BnB.4bit"
-    elif x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_8bit"] == True:
-        return "BnB.8bit"
-    elif (x["backend.quantization_scheme"] == "gptq") and (
-        x["backend.quantization_config.exllama_config.version"] == 1
     ):
-        return "GPTQ.4bit+ExllamaV1"
-    elif (x["backend.quantization_scheme"] == "gptq") and (
-        x["backend.quantization_config.exllama_config.version"] == 2
     ):
-        return "GPTQ.4bit+ExllamaV2"
-    elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
         return "GPTQ.4bit"
-    elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
-        return "AWQ.4bit+GEMM"
-    elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
-        return "AWQ.4bit+GEMV"
     else:
         return "None"

+from transformers import AutoConfig
 LLM_MODEL_ARCHS = {
     "stablelm_epoch": "🔴 StableLM-Epoch",
     "stablelm_alpha": "🔴 StableLM-Alpha",
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
     "deci": "🔵 deci",
+    "Yi": "🫂 Yi 人",  # people
     "mpt": "🧱 MPT",
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
     "gpt2": "GPT-2",
     "gptj": "GPT-J",
     "bart": "BART",
 }
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+def process_architectures(model):
+    # Map a model identifier to a display label from LLM_MODEL_ARCHS, based on
+    # the `model_type` of its loaded config; returns "Unknown" when the type
+    # is not in the table or when loading the config raises.
+    # return "Unknown"
+    try:
+        # NOTE(review): trust_remote_code=True presumably allows code shipped
+        # with the model repository to run while loading — confirm intended.
+        config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+        return LLM_MODEL_ARCHS.get(config.model_type, "Unknown")
+    # Any failure is collapsed into the same "Unknown" label.
+    except Exception:
+        return "Unknown"
 def process_score(score, quantization):
         return f"{score:.2f} "
+def process_quantizations(x):
+    # Return the quantization label for one record `x` (applied row-wise via
+    # DataFrame.apply(axis=1) in processed_llm_perf_df). Branches are checked
+    # in order; anything unmatched yields the string "None".
+    # NOTE(review): `is True` is an identity check, so only the Python bool
+    # singleton matches — confirm the row values are never another truthy
+    # type (e.g. a NumPy boolean or the string "True").
+    if (
+        x["config.backend.quantization_scheme"] == "bnb"
+        and x["config.backend.quantization_config.load_in_4bit"] is True
+    ):
         return "BnB.4bit"
+    elif (
+        x["config.backend.quantization_scheme"] == "bnb"
+        and x["config.backend.quantization_config.load_in_8bit"] is True
     ):
+        return "BnB.8bit"
+    elif (
+        x["config.backend.quantization_scheme"] == "gptq"
+        and x["config.backend.quantization_config.bits"] == 4
     ):
         return "GPTQ.4bit"
+    elif (
+        x["config.backend.quantization_scheme"] == "awq"
+        and x["config.backend.quantization_config.bits"] == 4
+    ):
+        return "AWQ.4bit"
+    else:
+        return "None"
+def process_kernels(x):
+    # Return the kernel label for one record `x` (applied row-wise via
+    # DataFrame.apply(axis=1) in processed_llm_perf_df); anything unmatched
+    # yields the string "None".
+    # NOTE(review): the GPTQ branches compare
+    # "config.backend.quantization_config.version" with the integers 1 and 2,
+    # while the AWQ branches compare the same key with "gemm"/"gemv". The
+    # code removed in this diff read the Exllama version from
+    # "...quantization_config.exllama_config.version" — confirm which key
+    # holds the Exllama version in the new dataset.
+    if (
+        x["config.backend.quantization_scheme"] == "gptq"
+        and x["config.backend.quantization_config.version"] == 1
+    ):
+        return "GPTQ.ExllamaV1"
+    elif (
+        x["config.backend.quantization_scheme"] == "gptq"
+        and x["config.backend.quantization_config.version"] == 2
+    ):
+        return "GPTQ.ExllamaV2"
+    elif (
+        x["config.backend.quantization_scheme"] == "awq"
+        and x["config.backend.quantization_config.version"] == "gemm"
+    ):
+        return "AWQ.GEMM"
+    elif (
+        x["config.backend.quantization_scheme"] == "awq"
+        and x["config.backend.quantization_config.version"] == "gemv"
+    ):
+        return "AWQ.GEMV"
     else:
         return "None"