# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main import json import os from datetime import datetime, timezone import gradio as gr import pandas as pd from huggingface_hub import HfApi from src.css_html import custom_css from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3 from src.utils import ( AutoEvalColumn, fields, is_model_on_hub, make_clickable_names, plot_throughput, styled_error, styled_message, ) TOKEN = os.environ.get("HF_TOKEN", None) api = HfApi(TOKEN) df = pd.read_csv("data/code_eval_board.csv") QUEUE_REPO = "bigcode/evaluation-requests" EVAL_REQUESTS_PATH = "eval-queue" COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden] COLS_LITE = [ c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden ] TYPES_LITE = [ c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden ] def add_new_eval( model: str, revision: str, precision: str, model_type: str, ): precision = precision current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") if model_type is None or model_type == "": return styled_error("Please select a model type.") # check the model actually exists before adding the eval if revision == "": revision = "main" model_on_hub, error = is_model_on_hub(model, revision) if not model_on_hub: return styled_error(f'Model "{model}" {error}') print("adding new eval") eval_entry = { "model": model, "revision": revision, "precision": precision, "status": "PENDING", "submitted_time": current_time, "model_type": model_type.split(" ")[1], } user_name = "" model_path = model if "/" in model: user_name = model.split("/")[0] model_path = model.split("/")[1] OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" os.makedirs(OUT_DIR, exist_ok=True) out_path = f"{OUT_DIR}/{model_path}_eval_request_{precision}.json" print(f"Saving eval request to {out_path}") with open(out_path, 
"w") as f: f.write(json.dumps(eval_entry)) api.upload_file( path_or_fileobj=out_path, path_in_repo=out_path.split("eval-queue/")[1], repo_id=QUEUE_REPO, repo_type="dataset", commit_message=f"Add {model} to eval queue", ) # remove the local file os.remove(out_path) return styled_message("Your request has been submitted to the evaluation queue!\n") def select_columns(df, columns): always_here_cols = [ AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name, ] # We use COLS to maintain sorting filtered_df = df[ always_here_cols + [c for c in COLS if c in df.columns and c in columns] ] return filtered_df def filter_items(df, leaderboard_table, query): if query == "all": return df[leaderboard_table.columns] else: query = query[0] # take only the emoji character filtered_df = df[(df["T"] == query)] return filtered_df[leaderboard_table.columns] def search_table(df, leaderboard_table, query): filtered_df = df[(df["Models"].str.contains(query, case=False))] return filtered_df[leaderboard_table.columns] df = make_clickable_names(df) demo = gr.Blocks(css=custom_css) with demo: with gr.Row(): gr.Markdown( """
Inspired by the 🤗 Open LLM Leaderboard and 🤗 Open LLM-Perf Leaderboard 🏋️, we compare the performance of base multilingual code generation models on the HumanEval benchmark and MultiPL-E. We also measure throughput and provide information about the models. We only compare open pre-trained multilingual code models that people can start from as base models for their training.
Warning: This leaderboard was last updated as of the release of DeepSeek-Coder-33b-instruct in November 2023. Stronger models might have been released since; check the Submit Results section to submit new evaluation results to the leaderboard. You can also check other code leaderboards like EvalPlus & Can-AI-Code.