# app.py
"""Streamlit dashboard for the "Meta Open LLM leaderboard".

The app lists the snapshot directories found under ``./data`` (named with
the ``%Y%m%d_%H%M`` pattern), lets the user pick one, and displays the
pre-rendered ``*.png`` charts stored in it.  Most charts have a sibling
``*.json`` file holding the list of model ids that appear in the chart.

NOTE(review): the reviewed copy of this file had its whitespace collapsed
and the HTML markup inside its string literals removed.  The markup used
below (``<ul>``/``<li>`` lists, Markdown bullets) is a minimal
reconstruction -- confirm it against the intended design, in particular
any hyperlinks that used to wrap the model ids and leaderboard names.
"""
import glob
import html
import json
import os
from datetime import datetime

DATA_ROOT = "./data"
# Snapshot directories are named after their generation time.
DIR_DATE_FORMAT = "%Y%m%d_%H%M"
# Model ids longer than this are shortened to their last N characters.
MAX_MODEL_ID_LEN = 35

captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
}

# Rendered as Markdown; bullets keep each leaderboard on its own line.
LEADERBOARDS_TRACKED = """
Leaderboards tracked:

- Hugging Face Open LLM
- MT-Bench GPT4 judged evaluation of models
- AlpacaEval GPT4 judged evaluation of models
- MosaicML Balanced set of static benchmarks
- OpenCompass Balanced set of static benchmarks
- AgentBench Benchmark evaluating Agent abilities
- BigCode Compare performance of base multilingual code generation models

"""


def format_dir_date(data_dir):
    """Return a human-readable date for a snapshot directory name.

    Args:
        data_dir: Directory name in ``%Y%m%d_%H%M`` form,
            e.g. ``"20231005_1430"``.

    Returns:
        The date formatted as ``"%b %d, %Y %H:%M"``,
        e.g. ``"Oct 05, 2023 14:30"``.

    Raises:
        ValueError: If ``data_dir`` does not match the expected pattern.
    """
    parsed_date = datetime.strptime(data_dir, DIR_DATE_FORMAT)
    return parsed_date.strftime("%b %d, %Y %H:%M")


def list_snapshot_dirs(root=DATA_ROOT):
    """Return the snapshot directory names under ``root``, oldest first.

    ``os.listdir`` yields entries in arbitrary order and may include stray
    files, so entries are filtered to directories whose name parses with
    ``DIR_DATE_FORMAT`` and then sorted chronologically.  This guarantees
    that the last element is the most recent snapshot.
    """
    dated = []
    for name in os.listdir(root):
        if not os.path.isdir(os.path.join(root, name)):
            continue
        try:
            stamp = datetime.strptime(name, DIR_DATE_FORMAT)
        except ValueError:
            continue
        dated.append((stamp, name))
    return [name for _, name in sorted(dated)]


def truncate_model_id(model_id, max_len=MAX_MODEL_ID_LEN):
    """Shorten ``model_id`` to ``'...'`` plus its last ``max_len`` characters.

    Ids of at most ``max_len`` characters are returned unchanged.
    """
    if len(model_id) <= max_len:
        return model_id
    return "..." + model_id[-max_len:]


def model_list_html(model_ids):
    """Build an HTML ``<ul>`` with one ``<li>`` per model id.

    The visible text is the truncated id; the full id goes into the
    ``title`` attribute.  Both are HTML-escaped because the result is
    rendered with ``unsafe_allow_html=True``.
    """
    items = "".join(
        f'<li title="{html.escape(model_id)}">'
        f"{html.escape(truncate_model_id(model_id))}</li>"
        for model_id in model_ids
    )
    return f"<ul>{items}</ul>"


def print_model_list(file_name, st, split_into_two=False):
    """Render the model ids that accompany a chart image.

    Args:
        file_name: Path of the chart image; the ids are read from the file
            with the same stem and a ``.json`` extension, which is expected
            to hold a JSON list of strings.
        st: Streamlit container to draw into (the ``streamlit`` module
            itself or a column); it must provide ``write`` and ``columns``.
        split_into_two: When true, the list is split into two halves shown
            side by side; for an odd count the left half gets the extra item.

    Raises:
        FileNotFoundError: If the sibling ``.json`` file does not exist.
    """
    json_path = os.path.splitext(file_name)[0] + ".json"
    with open(json_path, "r", encoding="utf-8") as file:
        model_ids = json.load(file)

    if not split_into_two:
        st.write(model_list_html(model_ids), unsafe_allow_html=True)
        return

    midpoint = (len(model_ids) + 1) // 2  # ceil(len / 2)
    left_col, right_col = st.columns(2)
    left_col.write(model_list_html(model_ids[:midpoint]), unsafe_allow_html=True)
    right_col.write(model_list_html(model_ids[midpoint:]), unsafe_allow_html=True)


def main():
    """Render the whole dashboard page."""
    # Imported here so the helpers above stay importable without Streamlit.
    import streamlit as st

    # st.set_page_config(layout="wide")
    st.title('Meta Open LLM leaderboard')

    directories = list_snapshot_dirs(DATA_ROOT)
    if not directories:
        st.warning("No leaderboard snapshots were found in " + DATA_ROOT)
        return

    # The newest snapshot is last in the sorted list and is preselected.
    data_dir = st.selectbox(
        'Select different Date',
        directories,
        format_func=format_dir_date,
        index=len(directories) - 1,
    )

    st.write("Generated on: " + format_dir_date(data_dir), unsafe_allow_html=True)
    st.divider()

    data_path = os.path.join(DATA_ROOT, data_dir)
    # glob returns matches in arbitrary order; sort so the positional
    # indices used below always refer to the same files.
    # NOTE(review): confirm that sorted filename order matches the chart
    # each index is meant to show.
    imgs = sorted(glob.glob(os.path.join(data_path, '*.png')))

    hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
    bigcode_diagrams = [img for img in imgs if 'bigcode' in os.path.basename(img)]
    # NOTE(review): this still contains the bigcode diagrams, so they are
    # shown again in the comparison section -- confirm that is intended.
    remaining_imgs = [img for img in imgs if 'hf_llm_diagram' not in os.path.basename(img)]

    st.write("HuggingFace Open LLM leaderboard by Model Size")
    st.image(hf_llm_diagrams[0], use_column_width="auto")
    print_model_list(hf_llm_diagrams[0], st, True)

    cols = st.columns(2)
    cols[0].image(
        hf_llm_diagrams[1],
        caption="Other or commercially permissive licenses only",
        use_column_width="auto",
    )
    print_model_list(hf_llm_diagrams[1], cols[0])
    cols[1].image(
        hf_llm_diagrams[2],
        caption="Commercially permissive license only",
        use_column_width="auto",
    )
    print_model_list(hf_llm_diagrams[2], cols[1])

    st.divider()
    st.write("Big Code Models Leaderboard")
    st.image(bigcode_diagrams[0], use_column_width="auto")
    print_model_list(bigcode_diagrams[0], st, True)

    st.divider()
    st.write("HuggingFace and Other Leaderboards: A Comparative Model Evaluation")
    st.caption("Only models evaluated on both leaderboards are included.")

    # Alternate the comparison charts between two columns.
    cols = st.columns(2)
    for i, img in enumerate(remaining_imgs):
        # Files without an entry in captions_map get an empty caption.
        caption = captions_map.get(os.path.basename(img), "")
        cols[i % 2].image(img, caption=caption, width=None)

    st.write(LEADERBOARDS_TRACKED, unsafe_allow_html=True)

    st.divider()
    cols = st.columns(2)
    cols[0].write("TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
    cols[0].image(hf_llm_diagrams[3], use_column_width="auto")
    print_model_list(hf_llm_diagrams[3], cols[0], False)

    cols[1].write("ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size")
    cols[1].image(hf_llm_diagrams[4], use_column_width="auto")
    print_model_list(hf_llm_diagrams[4], cols[1], False)

    st.divider()
    st.subheader('About')
    st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at @FZaslavskiy or here via community discussions.', unsafe_allow_html=True)


# `streamlit run app.py` executes this file with __name__ == "__main__".
if __name__ == "__main__":
    main()