bigcode-models-leaderboard

Running

File size: 7,856 Bytes

3ebc784
77a9749
caa834f
3ebc784
 
77a9749
5b15f5e
376d3eb
 
 
3ebc784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5c587b
 
376d3eb
 
 
 
3ebc784
376d3eb
 
 
 
 
 
 
 
 
 
 
 
77a9749
 
 
 
1a20ca0
caa834f
376d3eb
2b8f53a
77a9749
 
376d3eb
 
 
 
 
3ebc784
376d3eb
3ebc784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376d3eb
 
 
 
 
 
 
 
 
 
3ebc784
376d3eb
3ebc784
376d3eb
3ebc784
376d3eb
3ebc784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376d3eb
3ebc784
 
 
 
d5c587b
376d3eb
 
 
 
a6ca949
3ebc784
 
 
 
 
 
 
 
 
 
376d3eb
 
 
 
3ebc784
376d3eb
 
 
 
3ebc784
376d3eb
 
 
caa834f
 
 
 
10a2425
 
caa834f
376d3eb
1a20ca0
caa834f
 
77a9749

# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
import gradio as gr
import pandas as pd
from src.utils import AutoEvalColumn, fields, make_clickable_names, plot_throughput


# Leaderboard data backing every table and plot built further down.
df = pd.read_csv("data/code_eval_board.csv")

# Column metadata is declared on AutoEvalColumn. Hidden fields are excluded
# from all four lists; the *_LITE variants additionally keep only the fields
# flagged as displayed by default. Names and types are projected from the same
# field lists so that COLS[i] / TYPES[i] always describe the same column.
_visible_fields = [f for f in fields(AutoEvalColumn) if not f.hidden]
_default_fields = [f for f in _visible_fields if f.displayed_by_default]

COLS = [f.name for f in _visible_fields]
TYPES = [f.type for f in _visible_fields]
COLS_LITE = [f.name for f in _default_fields]
TYPES_LITE = [f.type for f in _default_fields]


def select_columns(df, columns):
    """Project ``df`` onto the user-selected columns.

    The model-type symbol and model name columns are always placed first,
    whether or not they were selected.

    Args:
        df: Full leaderboard frame.
        columns: Column names ticked in the column selector.

    Returns:
        ``df`` restricted to the two fixed columns followed by the selected
        columns that exist in ``df``.
    """
    pinned = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]

    # Walk COLS rather than `columns` so the result follows the canonical
    # column order instead of the order in which boxes were ticked.
    selected = []
    for name in COLS:
        if name in df.columns and name in columns:
            selected.append(name)

    return df[pinned + selected]


def filter_items(df, leaderboard_table, query):
    """Restrict the leaderboard to a single model type.

    Args:
        df: Full leaderboard frame; must contain a "T" column holding the
            model-type symbol.
        leaderboard_table: Currently displayed table. Only its ``columns``
            are used, to project the result onto what is shown.
        query: Selected choice, either "all" or a label whose first character
            is the model-type symbol.

    Returns:
        The rows of ``df`` matching the requested type (every row for
        "all"), limited to the displayed columns.
    """
    visible = leaderboard_table.columns
    if query == "all":
        return df[visible]

    # Only the leading emoji of the label is compared against "T".
    symbol = query[0]
    matches = df["T"] == symbol
    return df[matches][visible]


def search_table(df, leaderboard_table, query):
    """Return the rows whose model name contains ``query``.

    The match is a case-insensitive, literal substring test against the
    "Models" column.

    Args:
        df: Full leaderboard frame; must contain a "Models" column.
        leaderboard_table: Currently displayed table. Only its ``columns``
            are used, to project the result onto what is shown.
        query: Free text typed into the search box.

    Returns:
        The matching rows of ``df``, limited to the displayed columns.
    """
    # regex=False: the query is free text typed by a user, so characters such
    # as "+" or "(" must match literally rather than be parsed as a pattern
    # (as a regex, "c++" raises re.error).
    # na=False: a missing model name never matches, which keeps the mask
    # strictly boolean so it can be used for row selection.
    mask = df["Models"].str.contains(query, case=False, regex=False, na=False)
    return df[mask][leaderboard_table.columns]


# Post-process the frame with make_clickable_names (imported from src.utils)
# before anything is rendered from it.
df = make_clickable_names(df)

# Columns that are always displayed, in this order, ahead of the selectable ones.
_PINNED_COLUMNS = [
    AutoEvalColumn.model_type_symbol.name,
    AutoEvalColumn.model.name,
]
# Columns that are never offered as checkboxes in the column selector.
_NOT_SELECTABLE = [
    AutoEvalColumn.dummy.name,
    AutoEvalColumn.model.name,
    AutoEvalColumn.model_type_symbol.name,
]

# Components are laid out in the order they are created inside the nested
# context managers, so the statement order below is the page layout.
with gr.Blocks() as demo:
    # Page header.
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> ⭐ Multilingual <span style='color: #e6b800;'>Code</span> Models <span style='color: #e6b800;'>Evaluation</span></h1></div>\
            <br>\
            <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">🤗 Open LLM-Perf Leaderboard 🏋️</a>, we compare performance of base multilingual code generation models on <a href="https://huggingface.co/datasets/openai_humaneval">HumanEval</a> benchmark and <a href="https://huggingface.co/datasets/nuprl/MultiPL-E">MultiPL-E</a>. We also measure throughput and provide\
            information about the models. We only compare pre-trained multilingual code models, that people can start from as base models for their trainings.</p>"""
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluation table", id=0):
                    # Controls: column selector, search box and model-type filter.
                    with gr.Column():
                        shown_columns = gr.CheckboxGroup(
                            choices=[c for c in COLS if c not in _NOT_SELECTABLE],
                            value=[c for c in COLS_LITE if c not in _NOT_SELECTABLE],
                            label="Select columns to show",
                            elem_id="column-select",
                            interactive=True,
                        )
                        with gr.Row():
                            search_bar = gr.Textbox(
                                placeholder="🔍 Search for your model and press ENTER...",
                                show_label=False,
                                elem_id="search-bar",
                            )
                            filter_columns = gr.Radio(
                                label="⏚ Filter model types",
                                choices=["all", "🟢 base", "🔶 instruction-tuned"],
                                value="all",
                                elem_id="filter-columns",
                            )

                    # Visible table: pinned columns plus the initially ticked ones.
                    initial_columns = _PINNED_COLUMNS + shown_columns.value
                    leaderboard_df = gr.components.Dataframe(
                        value=df[initial_columns],
                        headers=initial_columns,
                        datatype=TYPES,
                        elem_id="leaderboard-table",
                    )

                    # Invisible copy of the full frame; every callback below
                    # reads from it and writes its result to the visible table.
                    hidden_leaderboard_df = gr.components.Dataframe(
                        value=df,
                        headers=COLS,
                        datatype=["str"] * len(COLS),
                        visible=False,
                    )

                    search_bar.submit(
                        fn=search_table,
                        inputs=[hidden_leaderboard_df, leaderboard_df, search_bar],
                        outputs=leaderboard_df,
                    )
                    shown_columns.change(
                        fn=select_columns,
                        inputs=[hidden_leaderboard_df, shown_columns],
                        outputs=leaderboard_df,
                    )
                    filter_columns.change(
                        fn=filter_items,
                        inputs=[hidden_leaderboard_df, leaderboard_df, filter_columns],
                        outputs=leaderboard_df,
                    )

                with gr.TabItem("📊 Performance Plot", id=1):
                    # One throughput plot per batch size, side by side.
                    with gr.Row():
                        bs_1_plot = gr.components.Plot(
                            value=plot_throughput(df, bs=1),
                            elem_id="bs1-plot",
                            show_label=False,
                        )
                        bs_50_plt = gr.components.Plot(
                            value=plot_throughput(df, bs=50),
                            elem_id="bs50-plot",
                            show_label=False,
                        )

    # Footnotes shown below the tabs.
    with gr.Row():
        gr.Markdown(
            """Notes:
            <ul>
            <li> Throughputs and peak memory usage are measured using <a href="https://github.com/huggingface/optimum-benchmark/tree/main">Optimum-Benchmark</a> which powers <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">Open LLM-Perf Leaderboard</a>. (0 throughput corresponds to OOM).</li>
            <li> All models were evaluated with the <a href="https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main">bigcode-evaluation-harness</a> with top-p=0.95, temperature=0.2 and n_samples=50.</li>
            <li> HumanEval-Python, reports the pass@1 on HumanEval, the rest is from MultiPL-E benchmark.</li>
            <li> Average score is the average pass@1 over all languages. For Win Rate, we compute model rank for each language as <code style="white-space: nowrap; display: inline;">num_models - (rank -1)</code> and average their rankings.</li>
            <li> #Languages column represents the number of programming languages included during the pretraining.
            </ul>"""
        )

demo.launch()