File size: 7,856 Bytes
3ebc784 77a9749 caa834f 3ebc784 77a9749 5b15f5e 376d3eb 3ebc784 d5c587b 376d3eb 3ebc784 376d3eb 77a9749 1a20ca0 caa834f 376d3eb 2b8f53a 77a9749 376d3eb 3ebc784 376d3eb 3ebc784 376d3eb 3ebc784 376d3eb 3ebc784 376d3eb 3ebc784 376d3eb 3ebc784 376d3eb 3ebc784 d5c587b 376d3eb a6ca949 3ebc784 376d3eb 3ebc784 376d3eb 3ebc784 376d3eb caa834f 10a2425 caa834f 376d3eb 1a20ca0 caa834f 77a9749 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
import gradio as gr
import pandas as pd
from src.utils import AutoEvalColumn, fields, make_clickable_names, plot_throughput
df = pd.read_csv("data/code_eval_board.csv")
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [
c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
]
TYPES_LITE = [
c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
]
def select_columns(df, columns):
always_here_cols = [
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
# We use COLS to maintain sorting
filtered_df = df[
always_here_cols
+ [c for c in COLS if c in df.columns and c in columns]
]
return filtered_df
def filter_items(df, leaderboard_table, query):
if query == "all":
return df[leaderboard_table.columns]
else:
query = query[0] # take only the emoji character
filtered_df = df[(df["T"] == query)]
return filtered_df[leaderboard_table.columns]
def search_table(df, leaderboard_table, query):
filtered_df = df[(df["Models"].str.contains(query, case=False))]
return filtered_df[leaderboard_table.columns]
df = make_clickable_names(df)
demo = gr.Blocks()
with demo:
with gr.Row():
gr.Markdown(
"""<div style="text-align: center;"><h1> β Multilingual <span style='color: #e6b800;'>Code</span> Models <span style='color: #e6b800;'>Evaluation</span></h1></div>\
<br>\
<p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">π€ Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">π€ Open LLM-Perf Leaderboard ποΈ</a>, we compare performance of base multilingual code generation models on <a href="https://huggingface.co/datasets/openai_humaneval">HumanEval</a> benchmark and <a href="https://huggingface.co/datasets/nuprl/MultiPL-E">MultiPL-E</a>. We also measure throughput and provide\
information about the models. We only compare pre-trained multilingual code models, that people can start from as base models for their trainings.</p>"""
)
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.Column():
with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
with gr.TabItem("π Evaluation table", id=0):
with gr.Column():
# with gr.Column(min_width=780):
shown_columns = gr.CheckboxGroup(
choices=[
c
for c in COLS
if c
not in [
AutoEvalColumn.dummy.name,
AutoEvalColumn.model.name,
AutoEvalColumn.model_type_symbol.name,
]
],
value=[
c
for c in COLS_LITE
if c
not in [
AutoEvalColumn.dummy.name,
AutoEvalColumn.model.name,
AutoEvalColumn.model_type_symbol.name,
]
],
label="Select columns to show",
elem_id="column-select",
interactive=True,
)
with gr.Row():
search_bar = gr.Textbox(
placeholder="π Search for your model and press ENTER...",
show_label=False,
elem_id="search-bar",
)
filter_columns = gr.Radio(
label="β Filter model types",
choices=["all", "π’ base", "πΆ instruction-tuned"],
value="all",
elem_id="filter-columns",
)
leaderboard_df = gr.components.Dataframe(
value=df[
[
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
+ shown_columns.value
],
headers=[
AutoEvalColumn.model_type_symbol.name,
AutoEvalColumn.model.name,
]
+ shown_columns.value,
datatype=TYPES,
elem_id="leaderboard-table",
)
hidden_leaderboard_df = gr.components.Dataframe(
value=df,
headers=COLS,
datatype=["str" for _ in range(len(COLS))],
visible=False,
)
search_bar.submit(
search_table,
[hidden_leaderboard_df, leaderboard_df, search_bar],
leaderboard_df,
)
shown_columns.change(
select_columns,
[hidden_leaderboard_df, shown_columns],
leaderboard_df,
)
filter_columns.change(
filter_items,
[hidden_leaderboard_df, leaderboard_df, filter_columns],
leaderboard_df,
)
with gr.TabItem("π Performance Plot", id=1):
with gr.Row():
bs_1_plot = gr.components.Plot(
value=plot_throughput(df, bs=1),
elem_id="bs1-plot",
show_label=False,
)
bs_50_plt = gr.components.Plot(
value=plot_throughput(df, bs=50),
elem_id="bs50-plot",
show_label=False,
)
with gr.Row():
gr.Markdown(
"""Notes:
<ul>
<li> Throughputs and peak memory usage are measured using <a href="https://github.com/huggingface/optimum-benchmark/tree/main">Optimum-Benchmark</a> which powers <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">Open LLM-Perf Leaderboard</a>. (0 throughput corresponds to OOM).</li>
<li> All models were evaluated with the <a href="https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main">bigcode-evaluation-harness</a> with top-p=0.95, temperature=0.2 and n_samples=50.</li>
<li> HumanEval-Python, reports the pass@1 on HumanEval, the rest is from MultiPL-E benchmark.</li>
<li> Average score is the average pass@1 over all languages. For Win Rate, we compute model rank for each language as <code style="white-space: nowrap; display: inline;">num_models - (rank -1)</code> and average their rankings.</li>
<li> #Languages column represents the number of programming languages included during the pretraining.
</ul>"""
)
demo.launch()
|