bigcode-models-leaderboard

Running

File size: 11,005 Bytes

3ebc784
7eda93e
 
 
 
77a9749
caa834f
7eda93e
c36a1a2
 
748d750
7eda93e
 
 
 
 
 
 
 
 
77a9749
7eda93e
 
5b15f5e
376d3eb
7eda93e
 
376d3eb
 
3ebc784
 
 
 
 
 
 
 
7eda93e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ebc784
 
 
 
 
 
 
d8c70e0
3ebc784
 
d5c587b
 
376d3eb
 
 
 
3ebc784
376d3eb
 
 
 
 
 
 
 
 
 
 
 
c36a1a2
77a9749
 
 
ec727b9
caa834f
376d3eb
43aa67d
60aa158
 
43aa67d
 
7eda93e
 
77a9749
376d3eb
 
 
ec727b9
376d3eb
425a9c7
ec727b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8f90f9
376d3eb
 
 
 
 
 
 
 
6df95f1
376d3eb
3ebc784
376d3eb
3ebc784
376d3eb
3ebc784
 
 
 
 
 
 
 
 
 
 
 
 
 
40e97a1
3ebc784
 
376d3eb
3ebc784
 
 
 
d5c587b
376d3eb
 
 
 
a6ca949
3ebc784
 
 
 
 
c8f90f9
 
 
 
 
7eda93e
 
ec727b9
 
7dcd740
ec727b9
947eb06
7eda93e
 
 
 
ec727b9
376d3eb
 
3ebc784
376d3eb
 
 
 
3ebc784
376d3eb
 
 
7eda93e
 
 
 
ec727b9
d8c70e0
 
43aa67d
c8f90f9
 
7eda93e

# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
import json
import os
from datetime import datetime, timezone

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi

from src.css_html import custom_css
from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3
from src.utils import (
    AutoEvalColumn,
    fields,
    is_model_on_hub,
    make_clickable_names,
    plot_throughput,
    styled_error,
    styled_message,
)

# Hub access token read from the environment; None when HF_TOKEN is not set.
TOKEN = os.environ.get("HF_TOKEN", None)
# The first positional parameter of HfApi is `endpoint`, so the token has to be
# passed by keyword; passing it positionally would use it as the endpoint URL.
api = HfApi(token=TOKEN)
# Leaderboard results, loaded once when the module is imported.
df = pd.read_csv("data/code_eval_board.csv")

# Dataset repo that receives evaluation requests, and the local directory in
# which request files are staged before being uploaded.
QUEUE_REPO = "bigcode/evaluation-requests"
EVAL_REQUESTS_PATH = "eval-queue"
# Names / types of every non-hidden AutoEvalColumn field.
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
# Same, restricted to the fields flagged as displayed by default.
COLS_LITE = [
    c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
]
TYPES_LITE = [
    c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
]


def add_new_eval(
    model: str,
    revision: str,
    precision: str,
    model_type: str,
):
    """Submit an evaluation request for ``model`` to the requests dataset.

    The request is serialized to a JSON file under ``EVAL_REQUESTS_PATH``,
    uploaded to the ``QUEUE_REPO`` dataset repo, and the local file is then
    deleted (also when the upload fails).

    Args:
        model: Hub model id, either ``name`` or ``user/name``.
        revision: Revision to evaluate; an empty value falls back to ``"main"``.
        precision: Precision label, stored in the request and used in the
            request file name.
        model_type: Selected model type, formatted as ``"<symbol> <label>"``;
            only the label is stored in the request.

    Returns:
        The value of ``styled_error(...)`` when no model type was selected or
        the model is not found on the Hub, otherwise the value of
        ``styled_message(...)`` confirming the submission.
    """
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if not model_type:
        return styled_error("Please select a model type.")

    # check the model actually exists before adding the eval
    if not revision:
        revision = "main"

    model_on_hub, error = is_model_on_hub(model, revision)
    if not model_on_hub:
        return styled_error(f'Model "{model}" {error}')

    print("adding new eval")

    # Keep the label that follows the symbol; fall back to the whole value when
    # there is no space instead of raising IndexError.
    type_parts = model_type.split(" ")
    eval_entry = {
        "model": model,
        "revision": revision,
        "precision": precision,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": type_parts[1] if len(type_parts) > 1 else type_parts[0],
    }

    user_name = ""
    model_path = model
    if "/" in model:
        user_name, model_path = model.split("/")[:2]

    file_name = f"{model_path}_eval_request_{precision}.json"
    out_dir = os.path.join(EVAL_REQUESTS_PATH, user_name)
    out_path = os.path.join(out_dir, file_name)
    # Repo paths always use "/" and must not start with one, which matters for
    # models that have no user/organization prefix.
    path_in_repo = f"{user_name}/{file_name}" if user_name else file_name

    os.makedirs(out_dir, exist_ok=True)
    print(f"Saving eval request to {out_path}")

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(eval_entry, f)

        api.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=path_in_repo,
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    finally:
        # remove the local file, even if writing or uploading failed
        if os.path.exists(out_path):
            os.remove(out_path)

    return styled_message("Your request has been submitted to the evaluation queue!\n")


def select_columns(df, columns):
    """Return ``df`` restricted to the pinned columns plus the selected ones.

    The model-type symbol and model name columns are always kept, in that
    order. The remaining columns are those present both in ``df`` and in
    ``columns``, emitted in the order in which they appear in ``COLS``.
    """
    selection = [
        AutoEvalColumn.model_type_symbol.name,
        AutoEvalColumn.model.name,
        # Iterate over COLS rather than `columns` so the ordering stays stable
        *(name for name in COLS if name in df.columns and name in columns),
    ]
    return df[selection]


def filter_items(df, leaderboard_table, query):
    """Filter ``df`` by model type, keeping the columns of ``leaderboard_table``.

    ``query`` is either ``"all"`` (no row filtering) or a label whose first
    character is the symbol compared against the ``"T"`` column of ``df``.
    """
    visible_columns = leaderboard_table.columns
    if query == "all":
        return df[visible_columns]

    symbol = query[0]  # only the leading emoji character is compared
    matches_type = df["T"] == symbol
    return df[matches_type][visible_columns]


def search_table(df, leaderboard_table, query):
    """Return the rows of ``df`` whose ``"Models"`` value matches ``query``.

    Matching is case-insensitive. ``query`` is treated as a regular expression
    when it compiles as one; otherwise (e.g. an unbalanced ``"("`` typed by a
    user) it is matched as literal text instead of raising. Rows with a missing
    model name never match a non-empty query. An empty query returns every row.

    Only the columns currently present in ``leaderboard_table`` are returned.
    """
    import re  # local import: `re` is not imported at module level

    visible_columns = leaderboard_table.columns
    if not query:
        return df[visible_columns]

    try:
        re.compile(query)
        is_regex = True
    except re.error:
        is_regex = False

    # na=False keeps the mask strictly boolean when a model name is missing
    mask = df["Models"].str.contains(query, case=False, na=False, regex=is_regex)
    return df[mask][visible_columns]


# Post-process the model names once, before the table is handed to the UI.
# NOTE(review): presumably turns names into HTML links — confirm in src.utils.
df = make_clickable_names(df)


# Build the Gradio interface: a header, then tabs for the evaluation table,
# the throughput plots, the about text and the submission instructions.
demo = gr.Blocks(css=custom_css)
with demo:
    # Page header: title, short description and the "last updated" warning box.
    with gr.Row():
        gr.Markdown(
            """<div style="text-align: center;"><h1> ⭐ Big <span style='color: #e6b800;'>Code</span> Models <span style='color: #e6b800;'>Leaderboard</span></h1></div>\
            <br>\
            <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/optimum/llm-perf-leaderboard">🤗 Open LLM-Perf Leaderboard 🏋️</a>, we compare performance of base multilingual code generation models on <a href="https://huggingface.co/datasets/openai_humaneval">HumanEval</a> benchmark and <a href="https://huggingface.co/datasets/nuprl/MultiPL-E">MultiPL-E</a>. We also measure throughput and provide\
            information about the models. We only compare open pre-trained multilingual code models, that people can start from as base models for their trainings.</p>
            <div style='background-color: #F5F1CB; text-align: center; padding: 10px;'>
                <p><b>Warning</b>: This leaderboard was last updated as of the release of <a href="https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct">DeepSeek-Coder-33b-instruct</a> on November 2023. Stronger models might have been released since, check the <b>Submit Results</b> section for submitting new evaluation results for the leaderboard.
            You can also check other code leaderboards like <a href="https://huggingface.co/spaces/mike-ravkine/can-ai-code-results">Can-AI-Code</a> .</p>
            </div>""",
            elem_classes="markdown-text",
        )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Column():
            with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
                with gr.TabItem("🔍 Evaluation table", id=0):
                    with gr.Column():
                        with gr.Accordion("➡️ See All Columns", open=False):
                            # Column picker: offers every non-hidden column except
                            # the dummy column and the two always-shown columns;
                            # the default-displayed (COLS_LITE) ones start ticked.
                            shown_columns = gr.CheckboxGroup(
                                choices=[
                                    c
                                    for c in COLS
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                value=[
                                    c
                                    for c in COLS_LITE
                                    if c
                                    not in [
                                        AutoEvalColumn.dummy.name,
                                        AutoEvalColumn.model.name,
                                        AutoEvalColumn.model_type_symbol.name,
                                    ]
                                ],
                                label="",
                                elem_id="column-select",
                                interactive=True,
                            )
                        # with gr.Column(min_width=780):
                        # Search box and model-type filter share one row.
                        with gr.Row():
                            search_bar = gr.Textbox(
                                placeholder="🔍 Search for your model and press ENTER...",
                                show_label=False,
                                elem_id="search-bar",
                            )
                            # filter_items compares only the first character of
                            # the chosen label with the "T" column.
                            filter_columns = gr.Radio(
                                label="⏚ Filter model types",
                                choices=["all", "🟢 base", "🔶 instruction-tuned", "🔴 external-evaluation"],
                                value="all",
                                elem_id="filter-columns",
                            )

                    # Visible table: the two pinned columns plus the initially
                    # selected ones.
                    # NOTE(review): datatype=TYPES covers every non-hidden column
                    # while headers only list the displayed subset — confirm the
                    # mismatch is intended.
                    leaderboard_df = gr.components.Dataframe(
                        value=df[
                            [
                                AutoEvalColumn.model_type_symbol.name,
                                AutoEvalColumn.model.name,
                            ]
                            + shown_columns.value
                        ],
                        headers=[
                            AutoEvalColumn.model_type_symbol.name,
                            AutoEvalColumn.model.name,
                        ]
                        + shown_columns.value,
                        datatype=TYPES,
                        elem_id="leaderboard-table",
                        interactive=False,
                    )

                    # Invisible copy of the full table; it is the data source the
                    # callbacks below filter from, so filters never compound.
                    hidden_leaderboard_df = gr.components.Dataframe(
                        value=df,
                        headers=COLS,
                        datatype=["str" for _ in range(len(COLS))],
                        visible=False,
                    )
                    # Each callback reads the hidden full table and writes its
                    # result into the visible table.
                    search_bar.submit(
                        search_table,
                        [hidden_leaderboard_df, leaderboard_df, search_bar],
                        leaderboard_df,
                    )
                    filter_columns.change(
                        filter_items,
                        [hidden_leaderboard_df, leaderboard_df, filter_columns],
                        leaderboard_df,
                    )
                    shown_columns.change(
                        select_columns,
                        [hidden_leaderboard_df, shown_columns],
                        leaderboard_df,
                    )
                    gr.Markdown(
                        """
                    **Notes:**
                    - Win Rate represents how often a model outperforms other models in each language, averaged across all languages.
                    - The scores of instruction-tuned models might be significantly higher on humaneval-python than other languages. We use the instruction format of HumanEval. For other languages, we use base MultiPL-E prompts.
                    - For more details check the 📝 About section.
                    - Models with a 🔴 symbol represent external evaluation submission, this means that we didn't verify the results, you can find the author's submission under `Submission PR` field from `See All Columns` tab.
                    """,
                        elem_classes="markdown-text",
                    )

                with gr.TabItem("📊 Performance Plot", id=1):
                    # Two throughput plots side by side, for bs=1 and bs=50.
                    with gr.Row():
                        bs_1_plot = gr.components.Plot(
                            value=plot_throughput(df, bs=1),
                            elem_id="bs1-plot",
                            show_label=False,
                        )
                        bs_50_plt = gr.components.Plot(
                            value=plot_throughput(df, bs=50),
                            elem_id="bs50-plot",
                            show_label=False,
                        )
                    gr.Markdown(
                        "**Note:** Zero throughput on the right plot refers to OOM, for more details check the 📝 About section.",
                        elem_classes="markdown-text",
                    )
                with gr.TabItem("📝 About", id=2):
                    gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
                # This tab only renders static text; add_new_eval is not wired to
                # any component in this block.
                with gr.TabItem("Submit results 🚀", id=3):
                    gr.Markdown(SUBMISSION_TEXT_3)


# Start the app as soon as the module is executed.
demo.launch()