fix missing value
- app.py +5 -3
- src/build.py +1 -1
app.py
CHANGED
@@ -14,6 +14,8 @@ def plot_throughput(bs=1):
    df.loc[df['Models'].str.contains('StarCoder|SantaCoder'), 'color'] = 'orange'
    df.loc[df['Models'].str.contains('CodeGen'), 'color'] = 'pink'
    df.loc[df['Models'].str.contains('Replit'), 'color'] = 'purple'
+   df.loc[df['Models'].str.contains('Wizard'), 'color'] = '#00b3b3'
+   df.loc[df['Models'].str.contains('CodeGeeX'), 'color'] = '#00cc00'

    fig = go.Figure()

@@ -65,7 +67,7 @@ with demo:
            leaderboard_df = gr.components.Dataframe(
                value=df, headers=headers, datatype=["str" for _ in range(len(headers))]
            )
-
+           """
        with gr.TabItem("📊 Performance Plot", id=1):
            with gr.Row():
                bs_1_plot = gr.components.Plot(
@@ -77,7 +79,7 @@ with demo:
                    value=plot_throughput(bs=50),
                    elem_id="bs50-plot",
                    show_label=False,
-               )
+               )"""
        with gr.Row():
            gr.Markdown(
                """Notes:
@@ -85,7 +87,7 @@ with demo:
    <li> Throughputs and peak memory usage are measured using <a href="https://github.com/huggingface/optimum-benchmark/tree/main">Optimum-Benchmark</a> which powers <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">Open LLM-Perf Leaderboard</a>. (0 throughput corresponds to OOM).</li>
    <li> All models were evaluated with the <a href="https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main">bigcode-evaluation-harness</a> with top-p=0.95, temperature=0.2 and n_samples=50.</li>
    <li> HumanEval-Python, reports the pass@1 on HumanEval, the rest is from MultiPL-E benchmark.</li>
-   <li> Average score is the average pass@1 over all languages. For Win Rate, we rank
+   <li> Average score is the average pass@1 over all languages. For Win Rate, we compute model rank for each language as <pre><code>num_models - (rank -1)</code></pre> and average their rankings.</li>
    <li> #Languages column represents the number of programming languages included during the pretraining.
    </ul>"""
    )
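The reworded note above spells out the Win Rate formula: a model's per-language rank is turned into points as num_models - (rank - 1), and those points are averaged across languages, so the best model in a language earns num_models points and the worst earns 1. A minimal pandas sketch of that computation, with made-up model names and scores (nothing below is taken from the leaderboard data):

import pandas as pd

# Toy pass@1 table; models and numbers are illustrative only.
scores = pd.DataFrame(
    {
        "humaneval-python": [33.6, 30.4, 15.1],
        "rust": [21.8, 24.5, 10.2],
    },
    index=["model-a", "model-b", "model-c"],
)

num_models = len(scores)
# rank() with ascending=False assigns rank 1 to the highest pass@1;
# num_models - (rank - 1) converts ranks into points, ties sharing the mean.
points = num_models - (scores.rank(ascending=False, method="average") - 1)
win_rate = points.mean(axis=1)  # average the points across languages
print(win_rate.sort_values(ascending=False))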
src/build.py
CHANGED
@@ -35,7 +35,7 @@ data = {
    "racket": [0.66, 0.07, 11.77, 11.08, 7.87, 3.22, 0, 5.03, 4.07, 10.37, 11.35,13.39],
    "rust": [4.21, 21.84, 24.46, 22.60, 16.32, 15.19, 2.00, 10.24, 7.83, 21.84, 19.94, 33.74],
    "swift": [1.25, 22.74, 16.74, 15.10, 9.98, 5.88, 0.70, 3.92, 1.71, 16.62, 20.81, 27.06],
-   "Throughput (tokens/s) bs=50": [0, 1490.00, 1460.00, 1700.00, 1770.00, 577.00, 2270.00, 2360.00, 687.00, 680.00,
+   "Throughput (tokens/s) bs=50": [0, 1490.00, 1460.00, 1700.00, 1770.00, 577.00, 2270.00, 2360.00, 687.00, 680.00, 0, 1470.00],
    "Peak Memory (MB)": [32890, 33461, 32366, 16512, 8414, 7176, 4602, 4586, 15336, 15336, 0, 32414],
}

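The missing value of the commit title is visible in the removed line: the bs=50 throughput list is shorter than the other columns, which all carry twelve entries (one per model). Assuming src/build.py feeds this dict straight into a pandas DataFrame, which the data = { hunk context suggests but this commit does not show, a short column fails loudly; a minimal sketch of that failure mode:

import pandas as pd

# One column shorter than the rest: pandas refuses to build the frame.
broken = {
    "swift": [1.25, 22.74, 16.74],
    "Throughput (tokens/s) bs=50": [0, 1490.00],  # one value short
}
try:
    pd.DataFrame(broken)
except ValueError as err:
    print(err)  # All arrays must be of the same length

# With every value present (a 0 marks an OOM run, per the notes in
# app.py), each column has one entry per model and the build succeeds.
fixed = {
    "swift": [1.25, 22.74, 16.74],
    "Throughput (tokens/s) bs=50": [0, 1490.00, 1460.00],
}
print(pd.DataFrame(fixed))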