File size: 11,809 Bytes
7411847 6d7a85e 7411847 d1b3326 7411847 8d4c97a 7411847 9a34cda 7411847 9a34cda 8fcd96c 6d7a85e 8fcd96c 6d7a85e 0b04027 5669002 0b04027 d1b3326 0b04027 d1b3326 0b04027 6d7a85e 0b04027 6d7a85e 0b04027 84f536a 0b04027 5d6c941 0b04027 5d6c941 0b04027 5d6c941 0b04027 5d6c941 0b04027 5d6c941 0b04027 e254e41 0b04027 e254e41 0b04027 e254e41 0b04027 e254e41 b9fbf95 0b04027 e254e41 b9fbf95 0b04027 b9fbf95 0b04027 0110cd9 0b04027 e254e41 5d6c941 0b04027 e254e41 0b04027 e254e41 0b04027 7411847 0b04027 84f536a 0b04027 84f536a 0b04027 b0337e1 9a34cda 0e46dc8 f765ec9 5d6c941 0e46dc8 b9fbf95 9a34cda 6ca74e2 9a34cda 5669002 8d4c97a b0337e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 |
# app.py
import json
import streamlit as st
import glob
import os
from datetime import datetime
# --- Page setup -------------------------------------------------------------
st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')
st.write("Combine data from various open LLM leaderboards into one useful visualization page")
# Vertical spacer. Fixed: the original "<nbsp/>" is not a real HTML tag/entity;
# the non-breaking-space entity is "&nbsp;".
st.write("&nbsp;", unsafe_allow_html=True)
# Each sub-directory of ./data is one generation run named %Y%m%d_%H%M.
# os.listdir() returns entries in arbitrary, OS-dependent order, so sort
# explicitly: lexicographic order of this timestamp format equals
# chronological order, which makes the `index=len(directories)-1` defaults
# used by the selectboxes below reliably pick the newest run.
directories = sorted(os.listdir("./data"))
def format_dir_date(data_dir):
    """Render a data-directory name like ``20240115_0930`` as ``Jan 15, 2024 09:30``."""
    return datetime.strptime(data_dir, "%Y%m%d_%H%M").strftime("%b %d, %Y %H:%M")
def print_model_list(file_name, st, split_into_two=False):
    """Render the model list stored next to a chart image as HTML links.

    For a chart ``<base>.png`` the model ids are read from ``<base>.json``
    (a JSON array of Hugging Face model ids) and rendered as a bulleted list
    of links, each truncated to the last 35 characters for display.

    Args:
        file_name: path to the chart image; its extension is swapped for
            ``.json`` to locate the model-id list.
        st: a Streamlit module or container object providing ``write`` (and
            ``columns`` when ``split_into_two`` is set).
        split_into_two: when True, render the first half of the list in the
            left of two columns and the second half in the right; otherwise
            render one combined list.
    """
    # Fixed: use splitext instead of slicing off the last 4 characters, which
    # silently corrupted the path for any extension that isn't 4 chars long.
    file_path = os.path.splitext(file_name)[0] + '.json'
    with open(file_path, 'r') as file:
        model_ids = json.load(file)

    def _links_html(ids):
        # One <li> hyperlink per model id; long ids keep only their tail so
        # the column width stays bounded (the href keeps the full id).
        items = []
        for model_id in ids:
            trunc = model_id if len(model_id) <= 35 else '...' + model_id[-35:]
            items.append(f'<li><a href="https://huggingface.co/{model_id}">{trunc}</a></li>')
        return ''.join(items)

    # Ceiling split: the left/first half gets the extra element on odd counts.
    midpoint = (len(model_ids) + 1) // 2
    left_html = _links_html(model_ids[:midpoint])
    right_html = _links_html(model_ids[midpoint:])
    if split_into_two:
        cols = st.columns(2)
        cols[0].write("<ul>" + left_html + "</ul>", unsafe_allow_html=True)
        cols[1].write("<ul>" + right_html + "</ul>", unsafe_allow_html=True)
    else:
        st.write("<ul>" + left_html + right_html + "</ul>", unsafe_allow_html=True)
# Date-selection controls: choose which generation run to display and,
# optionally, a second run to compare against side by side.
col1, col2 = st.columns(2)
with col1:
    # Dropdown over the ./data sub-directories, rendered as friendly dates.
    # NOTE(review): index=len(directories)-1 assumes the last entry is the
    # newest run — only true if `directories` is sorted; confirm upstream.
    data_dir = st.selectbox(
        'Select different data generation date',
        directories,
        format_func=format_dir_date,
        index=len(directories)-1,
    )
with col2:
    compare_mode = st.checkbox('Enable compare to different date')
    if compare_mode:
        # Second run for the side-by-side view; defaults to the same last
        # entry as the primary dropdown. Only defined when compare_mode is
        # on — downstream code must guard on compare_mode before using it.
        compare_data_dir = st.selectbox(
            'Select date for comparison',
            directories,
            format_func=format_dir_date,
            index=len(directories)-1,
        )
# Maps a comparison-chart file name to its human-readable caption; files not
# listed here fall back to an empty caption below.
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare"
}
with col1:
    # Echo the chosen generation date under the dropdown.
    st.write("<div style=\"text-align: center\" >Generated on: <b>" + format_dir_date(data_dir) + "</b></div>", unsafe_allow_html=True)
# Directory holding the pre-generated charts for the selected run.
data_path = './data/' + data_dir
if compare_mode:
    # --- Side-by-side compare between two generation runs -------------------
    compare_data_path = './data/' + compare_data_dir
    # Fixed: sort both glob results. glob order is arbitrary/OS-dependent, and
    # the positional zip pairing below only lines up matching charts when both
    # listings share a deterministic (file-name) order.
    imgs = sorted(glob.glob(os.path.join(data_path, '*.png')))
    compare_imgs = sorted(glob.glob(os.path.join(compare_data_path, '*.png')))

    def extract_images(keyword, img_list):
        # Images whose file name contains `keyword`.
        return [img for img in img_list if keyword in os.path.basename(img)]

    # Bucket each run's images by leaderboard.
    hf_llm_diagrams = extract_images('hf_llm_diagram', imgs)
    bigcode_diagrams = extract_images('bigcode', imgs)
    mt_bench_diagrams = extract_images('lmsys_leaderboard_mt_bench', imgs)
    arena_diagrams = extract_images('lmsys_leaderboard_arena', imgs)
    opencompass_diagrams = extract_images('opencompass_leaderboard', imgs)
    compare_hf_llm_diagrams = extract_images('hf_llm_diagram', compare_imgs)
    compare_bigcode_diagrams = extract_images('bigcode', compare_imgs)
    compare_mt_bench_diagrams = extract_images('lmsys_leaderboard_mt_bench', compare_imgs)
    compare_arena_diagrams = extract_images('lmsys_leaderboard_arena', compare_imgs)
    compare_opencompass_diagrams = extract_images('opencompass_leaderboard', compare_imgs)

    def display_side_by_side(diagrams1, diagrams2, title):
        # One row per image pair: current run on the left, comparison run on
        # the right. Extra images in the longer list are dropped by zip.
        st.subheader(title, divider=True)
        for d1, d2 in zip(diagrams1, diagrams2):
            cols = st.columns(2)
            cols[0].image(d1, use_column_width="auto")
            cols[1].image(d2, use_column_width="auto")

    display_side_by_side(hf_llm_diagrams, compare_hf_llm_diagrams, "HuggingFace Open LLM leaderboard by Model Size")
    display_side_by_side(bigcode_diagrams, compare_bigcode_diagrams, "Big Code Models Leaderboard")
    display_side_by_side(mt_bench_diagrams, compare_mt_bench_diagrams, "MT-Bench Models Leaderboard")
    display_side_by_side(arena_diagrams, compare_arena_diagrams, "LMSYS Arena Elo Models Leaderboard")
    display_side_by_side(opencompass_diagrams, compare_opencompass_diagrams, "OpenCompass Models Leaderboard")

    # Everything not claimed by a category above. Fixed: arena_diagrams was
    # missing from this subtraction (unlike the non-compare branch), so arena
    # charts were shown a second time here. Sorted so the zip pairing between
    # the two runs is deterministic.
    categorized = (set(hf_llm_diagrams) | set(bigcode_diagrams) | set(mt_bench_diagrams)
                   | set(arena_diagrams) | set(opencompass_diagrams))
    compare_categorized = (set(compare_hf_llm_diagrams) | set(compare_bigcode_diagrams)
                           | set(compare_mt_bench_diagrams) | set(compare_arena_diagrams)
                           | set(compare_opencompass_diagrams))
    remaining_imgs = sorted(set(imgs) - categorized)
    compare_remaining_imgs = sorted(set(compare_imgs) - compare_categorized)

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")
    # Display remaining (cross-leaderboard comparison) images side by side.
    for img, compare_img in zip(remaining_imgs, compare_remaining_imgs):
        cols = st.columns(2)
        caption = captions_map.get(os.path.basename(img), "")
        compare_caption = captions_map.get(os.path.basename(compare_img), "")
        cols[0].image(img, caption=caption, width=None)
        cols[1].image(compare_img, caption=compare_caption, width=None)
else:
imgs = glob.glob(os.path.join(data_path, '*.png'))
# Extracting images that start with "hf_llm_diagram"
hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
bigcode_diagrams = [img for img in imgs if 'bigcode' in os.path.basename(img)]
mt_bench_diagrams = [img for img in imgs if 'lmsys_leaderboard_mt_bench' in os.path.basename(img)]
arena_diagrams = [img for img in imgs if 'lmsys_leaderboard_arena' in os.path.basename(img)]
opencompass_diagrams = [img for img in imgs if 'opencompass_leaderboard' in os.path.basename(img)]
# Getting the remaining images
remaining_imgs = list(set(imgs) - set(hf_llm_diagrams) - set(bigcode_diagrams) - set(mt_bench_diagrams) - set(arena_diagrams) - set(opencompass_diagrams))
st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
cols = st.columns(2)
cols[0].image(hf_llm_diagrams[0], caption="Main chart using all the models", use_column_width="auto")
print_model_list(hf_llm_diagrams[0],st, True)
st.write("<nbsp/>", unsafe_allow_html=True)
cols = st.columns(2)
cols[0].image(hf_llm_diagrams[1],caption="Other or commercially permissive licenses only", use_column_width="auto")
print_model_list(hf_llm_diagrams[1],cols[0])
cols[1].image(hf_llm_diagrams[2],caption="Commercially permissive license only", use_column_width="auto")
print_model_list(hf_llm_diagrams[2],cols[1])
st.write("<nbsp/>", unsafe_allow_html=True)
cols = st.columns(2)
cols[0].image(hf_llm_diagrams[3],caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
print_model_list(hf_llm_diagrams[3],cols[0],False)
cols[1].image(hf_llm_diagrams[4],caption="ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
print_model_list(hf_llm_diagrams[4],cols[1],False)
st.subheader("Big Code Models Leaderboard", divider=True)
cols = st.columns(2)
cols[0].image(bigcode_diagrams[0], use_column_width="auto")
print_model_list(bigcode_diagrams[0],st,True)
st.subheader("MT-Bench Models Leaderboard", divider=True)
cols = st.columns(2)
cols[0].image(mt_bench_diagrams[0], use_column_width="auto")
print_model_list(mt_bench_diagrams[0],st,True)
st.subheader("LMSYS Arena Elo Models Leaderboard", divider=True)
cols = st.columns(2)
cols[0].image(arena_diagrams[0], use_column_width="auto")
print_model_list(arena_diagrams[0],st,True)
st.subheader("OpenCompass Models Leaderboard", divider=True)
cols = st.columns(2)
cols[0].image(opencompass_diagrams[0], use_column_width="auto")
print_model_list(opencompass_diagrams[0],st,True)
st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
st.caption("Only models evaluated on both leaderboards are included.")
cols = st.columns(2)
for i, img in enumerate(remaining_imgs):
# Extract the filename from the full image path
filename = os.path.basename(img)
# Get the caption from the captions_map dictionary
caption = captions_map.get(filename, "") # If no caption is found, it will default to an empty string
# Display the image with the caption
cols[i % 2].image(img, caption=caption, width=None)
# Static footer: links to every leaderboard this page aggregates.
st.write(
    """
<p>Leaderboards tracked:</p>
<ul>
<li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench and Arena Elo</a>MT-Bench is GPT4 judged evaluation of models, Arena Elo is users ranking outputs between models.</li>
<li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
<li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
<li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
<li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
<li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">BigCode</a> Compare performance of base multilingual code generation models</li>
</ul>
<sub>HuggingFace models that have been flagged as contaminated or do not provide any model card information are excluded.</sub>
""", unsafe_allow_html=True
)
# Maintainer / contact section.
st.subheader('About', divider=True)
st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.', unsafe_allow_html=True)