import json
from collections import defaultdict
from pathlib import Path

import pandas as pd
import gradio as gr

from content import *
from css import *

ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
# Metric reported for each benchmark, in the same order as BENCHMARKS.
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(",")
LANG_NAME = {
    "ar": "Arabic",
    "bn": "Bengali",
    "ca": "Catalan",
    "da": "Danish",
    "de": "German",
    "es": "Spanish",
    "eu": "Basque",
    "fr": "French",
    "gu": "Gujarati",
    "hi": "Hindi",
    "hr": "Croatian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "it": "Italian",
    "kn": "Kannada",
    "ml": "Malayalam",
    "mr": "Marathi",
    "ne": "Nepali",
    "nl": "Dutch",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sk": "Slovak",
    "sr": "Serbian",
    "sv": "Swedish",
    "ta": "Tamil",
    "te": "Telugu",
    "uk": "Ukrainian",
    "vi": "Vietnamese",
    "zh": "Chinese",
}


def collect_results():
    """Collect per-(model, language) benchmark scores from the JSON result files under evals/."""
    performance_dict = defaultdict(dict)
    pretrained_models = set()
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data:
            continue
        if "config" not in data:
            continue
        results = data["results"]
        config = data["config"]
        if "model_args" not in config:
            continue

        # Extract the model name from the "pretrained=..." entry in model_args.
        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]
        pretrained_models.add(pretrained)

        # Result keys are of the form "<benchmark>_<lang>", e.g. "arc_vi".
        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS
            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return performance_dict, pretrained_models


def get_leaderboard_df(performance_dict, pretrained_models):
    """Build the leaderboard dataframe with one row per (model, language) pair."""
    df = list()
    for (pretrained, lang), perfs in performance_dict.items():
        lang_name = LANG_NAME[lang]
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        notes = " ".join([pretrained, lang_name])
        row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
        df.append(row)

    df = pd.DataFrame.from_records(df, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)
    df = df[COLS]
    return df


def search_table(df, query):
    """Filter rows whose Notes column (model name + language name) matches the query."""
    filtered_df = df[df[NOTES_COL].str.contains(query, case=False)]
    return filtered_df


MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
NOTES_COL = "Notes"  # For search only

COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
TYPES = ["str", "number", "number", "number", "number", "number", "str"]

args = collect_results()
original_df = get_leaderboard_df(*args)

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

    leaderboard_table = gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        max_rows=5,
        elem_id="leaderboard-table",
    )

    # Dummy leaderboard for handling the case when the user uses the backspace key
    hidden_leaderboard_table_for_search = gr.components.Dataframe(
        value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
    )

    search_bar.change(
        search_table,
        [hidden_leaderboard_table_for_search, search_bar],
        leaderboard_table,
    )

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()