# NOTE: web-viewer residue (file size 5,768 bytes, blame-hash gutter, line-number gutter) was here; it is not part of the program.
# app.py
import json
import streamlit as st
import glob
import os
from datetime import datetime
#st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')
# Snapshot directory names are parsed with "%Y%m%d_%H%M", so a lexical sort is
# also chronological. os.listdir() returns entries in arbitrary order, and the
# selector further down preselects the LAST entry, so sort to make that the
# newest snapshot. Plain files are skipped because their names would not parse
# as a snapshot timestamp.
directories = sorted(
    name
    for name in os.listdir("./data")
    if os.path.isdir(os.path.join("./data", name))
)
#data_dir = directories[0]
def format_dir_date(data_dir: str) -> str:
    """Turn a snapshot directory name into a human-readable timestamp.

    Args:
        data_dir: Directory name in ``%Y%m%d_%H%M`` form, e.g. ``20231005_1430``.

    Returns:
        The same moment formatted as ``%b %d, %Y %H:%M``, e.g. ``Oct 05, 2023 14:30``.

    Raises:
        ValueError: If ``data_dir`` does not match the expected pattern.
    """
    # Extracting date and time information from the directory name
    parsed_date = datetime.strptime(data_dir, "%Y%m%d_%H%M")
    # Formatting the parsed date
    return parsed_date.strftime("%b %d, %Y %H:%M")
# Snapshot picker: options are shown as formatted dates and the last entry of
# `directories` is preselected.
data_dir = st.selectbox(
    'Select different Date',
    directories,
    format_func=format_dir_date,
    index=len(directories)-1
)

# Captions for the comparison charts, keyed by image file name.
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    # A stray "</a>" used to be embedded in this caption; removed.
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare"
}

st.write("Generated on: <b>" + format_dir_date(data_dir) + "</b>", unsafe_allow_html=True)
st.divider()

data_path = './data/' + data_dir
imgs = glob.glob(os.path.join(data_path, '*.png'))
# NOTE(review): glob.glob() does not promise any ordering, yet the page later
# indexes hf_llm_diagrams by position ([0]..[4]) - confirm the file naming
# yields the intended order on the deployment file system.
# Images whose file name contains "hf_llm_diagram"
hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
# Getting the remaining images
remaining_imgs = [img for img in imgs if 'hf_llm_diagram' not in os.path.basename(img)]
def _model_links_html(model_ids):
    """Return one ``<li>`` Hugging Face link per model id, joined into one string."""
    items = []
    for model_id in model_ids:
        # Ids longer than 35 characters are shown as "..." plus their last 35
        # characters; the link target always uses the full id.
        label = model_id if len(model_id) <= 35 else '...' + model_id[-35:]
        items.append(f'<li><a href="https://huggingface.co/{model_id}">{label}</a></li>')
    return "".join(items)


def print_model_list(file_name, st, split_into_two=False):
    """Render the model ids stored beside an image as a list of links.

    Args:
        file_name: Path of the image. The ids are loaded from the file with the
            same path but a ``.json`` extension, which must hold a JSON list
            of model id strings.
        st: Render target exposing ``write`` (and ``columns`` when
            ``split_into_two`` is true). Callers in this file pass either the
            streamlit module or a single column; the parameter intentionally
            shadows the module-level ``st`` inside this function.
        split_into_two: When true, the list is split across two columns, the
            left one receiving the extra item for odd lengths. Otherwise a
            single ``<ul>`` is written.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # splitext() instead of slicing off four characters, so the lookup does
    # not depend on the image extension being exactly ".png".
    json_path = os.path.splitext(file_name)[0] + '.json'
    # JSON is UTF-8; do not rely on the platform default encoding.
    with open(json_path, 'r', encoding='utf-8') as file:
        model_ids = json.load(file)

    if not split_into_two:
        st.write("<ul>" + _model_links_html(model_ids) + "</ul>", unsafe_allow_html=True)
        return

    # Ceiling division: the left column gets the extra entry for odd lengths.
    midpoint = (len(model_ids) + 1) // 2
    left_html = "<ul>" + _model_links_html(model_ids[:midpoint]) + "</ul>"
    right_html = "<ul>" + _model_links_html(model_ids[midpoint:]) + "</ul>"

    cols = st.columns(2)
    cols[0].write(left_html, unsafe_allow_html=True)
    cols[1].write(right_html, unsafe_allow_html=True)
# --- Section: leaderboard by model size --------------------------------------
st.write("HuggingFace Open LLM leaderboard by Model Size")
overall_diagram = hf_llm_diagrams[0]
st.image(overall_diagram, use_column_width="auto")
# The full-width chart gets its model list split across two columns.
print_model_list(overall_diagram, st, True)

# Diagrams 1 and 2 are shown side by side, each with its own caption and
# single-column model list.
cols = st.columns(2)
license_captions = (
    "Other or commercially permissive licenses only",
    "Commercially permissive license only",
)
for position, license_caption in enumerate(license_captions):
    column = cols[position]
    diagram = hf_llm_diagrams[position + 1]
    column.image(diagram, caption=license_caption, use_column_width="auto")
    print_model_list(diagram, column)
# --- Section: comparison against other leaderboards --------------------------
st.divider()
st.write("HuggingFace and Other Leaderboards: A Comparative Model Evaluation")
st.caption("Only models evaluated on both leaderboards are included.")
cols = st.columns(2)
# Lay the comparison charts out in two columns, alternating left/right.
for i, img in enumerate(remaining_imgs):
    # Extract the filename from the full image path
    filename = os.path.basename(img)
    # Look up the caption; images without an entry get an empty caption
    caption = captions_map.get(filename, "")
    # Display the image with the caption
    cols[i % 2].image(img, caption=caption, width=None)
# Static HTML list of the tracked leaderboards, each with a link.
leaderboards_html = """
<p>Leaderboards tracked:</p>
<ul>
<li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench</a> GPT4 judged evaluation of models</li>
<li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
<li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
<li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
<li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
</ul>
"""
# Raw HTML is required so the anchors render as links.
st.write(leaderboards_html, unsafe_allow_html=True)
# --- Section: re-weighted leaderboard diagrams -------------------------------
st.divider()
cols = st.columns(2)
# Diagrams 3 and 4 are shown side by side, each under its own heading and
# followed by a single-column model list.
weighted_headings = (
    "TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size",
    "ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size",
)
for position, heading in enumerate(weighted_headings):
    column = cols[position]
    column.write(heading)
    diagram = hf_llm_diagrams[position + 3]
    column.image(diagram, use_column_width="auto")
    print_model_list(diagram, column, False)
# --- Section: about ----------------------------------------------------------
st.divider()
st.subheader('About')
# Raw HTML is enabled so the contact handle renders as a link.
st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.', unsafe_allow_html=True)