# app.py
import glob
import json
import os
from datetime import datetime

import streamlit as st
# st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')

# Snapshot directory names are timestamps in YYYYmmdd_HHMM form (the format
# parsed by format_dir_date), so sorting them as strings also sorts them
# chronologically. os.listdir() returns entries in arbitrary order, and the
# date selector uses the last entry as its default, so the list has to be
# sorted for that default to be the most recent snapshot.
# Plain files in ./data are skipped: only directories can hold a snapshot.
directories = sorted(
    entry
    for entry in os.listdir("./data")
    if os.path.isdir(os.path.join("./data", entry))
)
def format_dir_date(data_dir: str) -> str:
    """Convert a snapshot directory name into a human-readable timestamp.

    Args:
        data_dir: Directory name in ``YYYYmmdd_HHMM`` form, for example
            ``"20231005_1430"``.

    Returns:
        The same moment formatted as ``"Oct 05, 2023 14:30"``.

    Raises:
        ValueError: If ``data_dir`` does not match the ``YYYYmmdd_HHMM``
            pattern.
    """
    # The directory name itself encodes the generation date and time.
    parsed_date = datetime.strptime(data_dir, "%Y%m%d_%H%M")
    return parsed_date.strftime("%b %d, %Y %H:%M")
# Without at least one snapshot there is nothing to select or render; stop
# with a readable message instead of failing further down on a missing value.
if not directories:
    st.error("No leaderboard snapshots were found in ./data.")
    st.stop()

# Let the user pick a snapshot; the last entry of the list is preselected.
data_dir = st.selectbox(
    'Select different Date',
    directories,
    format_func=format_dir_date,
    index=len(directories) - 1,
)

# Caption shown under each comparison chart, keyed by the image file name.
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
}

st.write("Generated on: <b>" + format_dir_date(data_dir) + "</b>", unsafe_allow_html=True)
st.divider()

data_path = './data/' + data_dir

# glob.glob() yields files in arbitrary order, but the diagrams are later
# addressed by position, so sort to make that order deterministic.
# NOTE(review): assumes the sorted file names match the intended positions
# (0 = all models, 1 = other/permissive, 2 = permissive only, 3 = TruthfulQA)
# -- confirm against the generated file names.
imgs = sorted(glob.glob(os.path.join(data_path, '*.png')))

# Images whose file name contains "hf_llm_diagram"
hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
# Every other image in the snapshot (the leaderboard comparison charts)
remaining_imgs = [img for img in imgs if 'hf_llm_diagram' not in os.path.basename(img)]
def _model_list_html(model_ids):
    """Return an HTML ``<ul>`` holding one Hugging Face link per model id."""
    items = []
    for model_id in model_ids:
        # Long ids keep only their last 35 characters so the visible label
        # stays short; the link itself always uses the full id.
        label = model_id if len(model_id) <= 35 else '...' + model_id[-35:]
        items.append(f'<li><a href="https://huggingface.co/{model_id}">{label}</a></li>')
    return "<ul>" + "".join(items) + "</ul>"


def print_model_list(file_name, st, split_into_two=False):
    """Render the model ids that accompany a diagram as a linked bullet list.

    The ids are loaded from the JSON file that shares the image's path and
    base name (``diagram.png`` -> ``diagram.json``) and written as HTML links
    to ``https://huggingface.co/<model id>``.

    Args:
        file_name: Path of the diagram image; its extension is replaced by
            ``.json`` to locate the list of model ids.
        st: Target to render into. It must provide ``write`` and, when
            ``split_into_two`` is true, ``columns``. The parameter shadows
            the module-level ``st`` import inside this function; the name is
            kept so existing callers keep working.
        split_into_two: When true, show the list in two side-by-side columns,
            with the first column taking the extra item for odd lengths.
            Otherwise show one single list.

    Raises:
        FileNotFoundError: If the companion JSON file does not exist.
    """
    json_path = os.path.splitext(file_name)[0] + '.json'
    with open(json_path, 'r', encoding='utf-8') as file:
        model_ids = json.load(file)

    if not split_into_two:
        st.write(_model_list_html(model_ids), unsafe_allow_html=True)
        return

    # Round the midpoint up so the left column is never the shorter one.
    midpoint = (len(model_ids) + 1) // 2
    left_col, right_col = st.columns(2)
    left_col.write(_model_list_html(model_ids[:midpoint]), unsafe_allow_html=True)
    right_col.write(_model_list_html(model_ids[midpoint:]), unsafe_allow_html=True)
# --- Section 1: Open LLM leaderboard diagrams grouped by model size ---------
st.write("HuggingFace Open LLM leaderboard by Model Size")
if len(hf_llm_diagrams) < 3:
    # The three diagrams below are addressed by position; show a warning
    # rather than failing with an IndexError when the snapshot is incomplete.
    st.warning("Model size diagrams are missing for the selected date.")
else:
    # Full-width overview diagram with its model list split into two columns.
    st.image(hf_llm_diagrams[0], use_column_width="auto")
    print_model_list(hf_llm_diagrams[0], st, True)

    # Two license-filtered diagrams side by side, each followed by its list.
    cols = st.columns(2)
    cols[0].image(hf_llm_diagrams[1], caption="Other or commercially permissive licenses only", use_column_width="auto")
    print_model_list(hf_llm_diagrams[1], cols[0])
    cols[1].image(hf_llm_diagrams[2], caption="Commercially permissive license only", use_column_width="auto")
    print_model_list(hf_llm_diagrams[2], cols[1])
st.divider()

# --- Section 2: comparison charts against the other leaderboards ------------
st.write("HuggingFace and Other Leaderboards: A Comparative Model Evaluation")
st.caption("Only models evaluated on both leaderboards are included.")
cols = st.columns(2)
for i, img in enumerate(remaining_imgs):
    # Captions are keyed by file name; unknown files get an empty caption.
    caption = captions_map.get(os.path.basename(img), "")
    # Alternate between the left and right column.
    cols[i % 2].image(img, caption=caption, width=None)

# The HTML stays flush left inside the literal on purpose: indented lines
# could be treated as a code block by the Markdown renderer.
st.write(
    """
<p>Leaderboards tracked:</p>
<ul>
<li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench</a> GPT4 judged evaluation of models</li>
<li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
<li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
<li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
<li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
</ul>
""", unsafe_allow_html=True
)
st.divider()

# --- Section 3: TruthfulQA-reweighted diagram --------------------------------
st.write("TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
if len(hf_llm_diagrams) < 4:
    st.warning("The TruthfulQA diagram is missing for the selected date.")
else:
    st.image(hf_llm_diagrams[3], use_column_width="auto")
st.divider()

# --- About -------------------------------------------------------------------
st.subheader('About')
st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.', unsafe_allow_html=True)