"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" | |
import argparse | |
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
import gradio as gr | |
import pandas as pd | |
import json | |
from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS | |
LAST_UPDATED = "Feb 28th 2024" | |
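# Custom CSS: font sizes for the markdown blocks, centered table cells, and bold tab buttons.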
css = """ | |
.markdown-text{font-size: 15pt} | |
.markdown-text-small{font-size: 13pt} | |
th { | |
text-align: center; | |
} | |
td { | |
font-size: 15px; /* Adjust the font size as needed */ | |
text-align: center; | |
} | |
#od-benchmark-tab-table-button{ | |
font-size: 15pt; | |
font-weight: bold; | |
} | |
""" | |
column_names = {
    "model": "Model",
    "Overall": "All 🎯",
    "Turn 1": "Turn 1️⃣",
    "Turn 2": "Turn 2️⃣",
}
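# Registry of evaluated models: each key maps to its Hugging Face repo id (or an
# external URL for API-only models) and the display name used in the tables.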
model_info = {
    "gpt-4": {"hf_name": "https://platform.openai.com/", "pretty_name": "gpt-4"},
    "gpt-3.5-turbo": {"hf_name": "https://platform.openai.com/", "pretty_name": "gpt-3.5-turbo"},
    "Llama-2-70b-hf": {"hf_name": "meta-llama/Llama-2-70b-hf", "pretty_name": "Llama-2-70B"},
    "Llama-2-13b-hf": {"hf_name": "meta-llama/Llama-2-13b-hf", "pretty_name": "Llama-2-13B"},
    "Llama-2-7b-hf": {"hf_name": "meta-llama/Llama-2-7b-hf", "pretty_name": "Llama-2-7B"},
    "Mixtral-8x7B-v0.1": {"hf_name": "mistralai/Mixtral-8x7B-v0.1", "pretty_name": "Mixtral-8x7B"},
    "Mistral-7b-v0.1": {"hf_name": "mistralai/Mistral-7B-v0.1", "pretty_name": "Mistral-7B v0.1"},
    "Mistral-7b-v0.2": {"hf_name": "alpindale/Mistral-7B-v0.2-hf", "pretty_name": "Mistral-7B v0.2"},
    "Yi-34B": {"hf_name": "01-ai/Yi-34B", "pretty_name": "Yi-34B"},
    "Yi-6B": {"hf_name": "01-ai/Yi-6B", "pretty_name": "Yi-6B"},
    "gemma-7b": {"hf_name": "google/gemma-7b", "pretty_name": "Gemma-7B"},
    "gemma-2b": {"hf_name": "google/gemma-2b", "pretty_name": "Gemma-2B"},
    "phi-2": {"hf_name": "microsoft/phi-2", "pretty_name": "Phi-2 @hf"},
    "olmo": {"hf_name": "allenai/OLMo-7B", "pretty_name": "OLMo-7B @hf"},
    "phi-2-vllm": {"hf_name": "microsoft/phi-2", "pretty_name": "Phi-2 (2.7B)"},
    "olmo-7b-vllm": {"hf_name": "allenai/OLMo-7B", "pretty_name": "OLMo-7B"},
    "falcon-7b": {"hf_name": "tiiuae/falcon-7b", "pretty_name": "Falcon-7B"},
    "mpt-7b": {"hf_name": "mosaicml/mpt-7b", "pretty_name": "MPT-7B"},
    "amber": {"hf_name": "LLM360/Amber", "pretty_name": "Amber (7B)"},
    "dbrx": {"hf_name": "databricks/dbrx-base", "pretty_name": "DBRX-base"},
}
def formatter(x):
    """Round a numeric score to two decimal places for display."""
    return round(x, 2)
def make_clickable_model(model_name, model_info):
    """Return an HTML link to the model's Hugging Face page (or external URL)."""
    if model_info[model_name]['hf_name'].startswith("http"):
        link = model_info[model_name]['hf_name']
    else:
        link = f"https://huggingface.co/{model_info[model_name]['hf_name']}"
    # Proprietary (gpt-*) models get a gray background to set them apart from open models.
    if model_name.startswith("gpt"):
        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted; background-color: lightgray;">{model_info[model_name]["pretty_name"]}</a>'
    else:
        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
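# Example: make_clickable_model("Yi-6B", model_info) yields
# '<a target="_blank" href="https://huggingface.co/01-ai/Yi-6B" ...>Yi-6B</a>'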
def build_demo(original_df, full_df, TYPES):
    with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
        # gr.HTML(BANNER, elem_id="banner")
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            with gr.TabItem("🏆 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                leaderboard_table = gr.components.Dataframe(
                    value=original_df,
                    datatype=TYPES,
                    height=1000,
                    wrap=False,
                    elem_id="leaderboard-table",
                    interactive=False,
                    visible=True,
                    min_width=60,
                )
            with gr.TabItem("URIAL + 🤗 OpenLLM", elem_id="od-benchmark-tab-table", id=1):
                gr.Markdown("### More results from the awesome 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)", elem_classes="markdown-text")
                leaderboard_table_full = gr.components.Dataframe(
                    value=full_df,
                    datatype=TYPES,
                    height=1000,
                    wrap=False,
                    elem_id="leaderboard-table-full",
                    interactive=False,
                    visible=True,
                    min_width=60,
                )
        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
        with gr.Row():
            with gr.Accordion("Citation", open=False):
                gr.Textbox(
                    value=CITATION_TEXT, lines=18,
                    label="Copy the BibTeX to cite URIAL and MT-Bench",
                    elem_id="citation-button",
                    show_copy_button=True)
    return demo
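# Script entry point: parse CLI args, merge the Open LLM Leaderboard dump with the
# URIAL results, build the two display tables, and launch the app.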
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--share", action="store_true") | |
parser.add_argument("--result_file", help="Path to results table", default="leaderboard_data.jsonl") | |
args = parser.parse_args() | |
all_model_hf_ids = {v["hf_name"]: k for k, v in model_info.items()} | |
    # Load the Open LLM Leaderboard dump and keep only the models we evaluate.
    with open("open-llm-leaderboard.json") as f:
        open_llm_leaderboard = json.load(f)
    full_leaderboard = {}
    tasks = ["HellaSwag", "ARC", "Winogrande", "TruthfulQA", "MMLU", "GSM8K"]
    for item in open_llm_leaderboard:
        if item["Model"] in all_model_hf_ids:
            full_bench_item = {}
            full_bench_item["model_name"] = all_model_hf_ids[item["Model"]]
            for task in tasks:
                full_bench_item[task] = item[task]
            full_bench_item["HF_AVG"] = item["Average \u2b06\ufe0f"]
            full_leaderboard[all_model_hf_ids[item["Model"]]] = full_bench_item
    # Load the URIAL leaderboard and attach each model's overall score.
    with open(args.result_file) as f:
        for line in f:
            item = json.loads(line)
            if item["model"] in full_leaderboard:
                full_leaderboard[item["model"]]["URIAL_AVG"] = item["Overall"]
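    # At this point each entry of full_leaderboard has the shape:
    # {"model_name": ..., "HellaSwag": ..., ..., "GSM8K": ..., "HF_AVG": ..., "URIAL_AVG": ...}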
    # Process the URIAL Benchmark Tab
    original_df = pd.read_json(args.result_file, lines=True)
    for col in original_df.columns:
        if col == "model":
            original_df[col] = original_df[col].apply(lambda x: make_clickable_model(x, model_info))
        else:
            original_df[col] = original_df[col].apply(formatter)  # numerical columns
    # Keep the model column first, put 'Overall' second, then append the remaining columns.
    new_order = [original_df.columns[0], 'Overall'] + [col for col in original_df.columns if col not in [original_df.columns[0], 'Overall']]
    reordered_df = original_df[new_order].copy()
    reordered_df.sort_values(by='Overall', inplace=True, ascending=False)
    reordered_df.rename(columns=column_names, inplace=True)
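    # reordered_df now holds the URIAL tab: Model | All 🎯 | Turn 1️⃣ | Turn 2️⃣,
    # sorted by the overall score in descending order.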
    # Process the Full Benchmark Tab
    full_df = pd.DataFrame(full_leaderboard).T
    full_df = full_df.reset_index()
    full_df.rename(columns={"index": "model"}, inplace=True)
    full_df = full_df[["model", "URIAL_AVG", "HF_AVG", "HellaSwag", "ARC", "Winogrande", "TruthfulQA", "MMLU", "GSM8K"]]
    full_df.sort_values(by='URIAL_AVG', inplace=True, ascending=False)
    full_df["model"] = full_df["model"].apply(lambda x: make_clickable_model(x, model_info))
    full_df.rename(columns=column_names, inplace=True)
    # Apply the formatter to every numerical column (everything except the model links).
    for col in full_df.columns:
        if col not in ["Model"]:
            full_df[col] = full_df[col].apply(formatter)
    TYPES = ["markdown", "number"]
    demo = build_demo(reordered_df, full_df, TYPES)
    demo.launch(share=args.share)
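# Usage sketch (assuming this file is saved as app.py, the Hugging Face Spaces
# convention, with the JSON/JSONL data files in the working directory):
#   python app.py                                  # serve locally
#   python app.py --share                          # also create a public share link
#   python app.py --result_file other_results.jsonl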