File size: 3,375 Bytes
7411847
 
 
 
 
 
84f536a
7411847
 
9a34cda
7411847
9a34cda
7411847
9a34cda
 
 
 
 
 
 
 
 
 
24806ea
 
9a34cda
7411847
84f536a
 
9a34cda
84f536a
 
 
 
 
9a34cda
84f536a
 
9a34cda
 
7411847
 
84f536a
 
 
 
 
 
 
5669002
 
84f536a
 
 
 
7411847
 
84f536a
 
 
 
 
 
 
 
 
b0337e1
9a34cda
 
0e46dc8
 
f765ec9
0e46dc8
 
 
 
 
9a34cda
 
 
5669002
 
 
 
 
b0337e1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# app.py
import streamlit as st
import glob
import os
from datetime import datetime

#st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')

# Snapshot directories under ./data are named with a "%Y%m%d_%H%M" timestamp.
# os.listdir() returns entries in arbitrary order, so sort them: for this
# zero-padded format, lexicographic order is also chronological, which is what
# makes "last entry" mean "newest snapshot" for the date selector.
# Plain files (e.g. .DS_Store) are skipped since only directories hold charts.
directories = sorted(
    entry for entry in os.listdir("./data")
    if os.path.isdir(os.path.join("./data", entry))
)

#data_dir = directories[0]

def format_dir_date(data_dir):
    """Turn a snapshot directory name into a human-readable timestamp.

    Args:
        data_dir: Name in ``%Y%m%d_%H%M`` form, e.g. ``"20231005_1430"``.

    Returns:
        The same moment formatted as ``%b %d, %Y %H:%M``,
        e.g. ``"Oct 05, 2023 14:30"``.

    Raises:
        ValueError: If ``data_dir`` does not match the expected pattern.
    """
    return datetime.strptime(data_dir, "%Y%m%d_%H%M").strftime("%b %d, %Y %H:%M")

# Date picker over the snapshot list: each entry is labelled through
# format_dir_date and the last entry of the list is pre-selected.
data_dir = st.selectbox(
    label='Select different Date',
    options=directories,
    index=len(directories) - 1,
    format_func=format_dir_date,
)

# Maps a comparison chart's file name to the caption shown under it.
# Captions are passed to st.image as plain text, so they must not contain
# HTML markup (a stray "</a>" would be displayed literally).
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare",
}

# Tell the reader which snapshot is on screen, then close off the header area.
st.write(
    f"Generated on: <b>{format_dir_date(data_dir)}</b>",
    unsafe_allow_html=True,
)
st.divider()

# Directory holding the PNG charts of the selected snapshot.
data_path = './data/' + data_dir

# glob makes no ordering guarantee, so sort the matches: the positional
# lookups into hf_llm_diagrams and the left/right placement of the comparison
# charts must not depend on filesystem enumeration order.
# NOTE(review): this assumes the "by Model Size" diagram sorts before the
# TruthfulQA one by file name — confirm against the generated file names.
imgs = sorted(glob.glob(os.path.join(data_path, '*.png')))

# Images whose file name contains "hf_llm_diagram"
hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]

# Every other image is treated as a leaderboard comparison chart
remaining_imgs = [img for img in imgs if 'hf_llm_diagram' not in os.path.basename(img)]

st.write("HuggingFace Open LLM leaderboard by Model Size")
# A snapshot directory may contain no diagram at all; indexing [0]
# unconditionally would raise IndexError and abort the rest of the page.
if hf_llm_diagrams:
    st.image(hf_llm_diagrams[0], use_column_width="auto")
else:
    st.warning("No leaderboard diagram is available for the selected date.")


st.divider()
st.write("HuggingFace and Other Leaderboards: A Comparative Model Evaluation")
st.caption("Only models evaluated on both leaderboards are included.")
cols = st.columns(2)

# Deal the comparison charts out over the two columns, alternating left/right.
for position, image_path in enumerate(remaining_imgs):
    # Captions are keyed by bare file name; unknown files get an empty caption.
    chart_caption = captions_map.get(os.path.basename(image_path), "")
    cols[position % 2].image(image_path, caption=chart_caption, width=None)

# Static HTML list of the leaderboards this page tracks, with source links.
leaderboards_html = """
    <p>Leaderboards tracked:</p>
     <ul>
        <li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
        <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench</a> GPT4 judged evaluation of models</li>
        <li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
        <li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
        <li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
        <li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
        </ul>
    """
st.write(leaderboards_html, unsafe_allow_html=True)
st.divider()

st.write("TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
# A snapshot directory may hold fewer than two diagrams; indexing [1]
# unconditionally would raise IndexError and abort the rest of the page.
if len(hf_llm_diagrams) > 1:
    st.image(hf_llm_diagrams[1], use_column_width="auto")
else:
    st.info("This diagram is not available for the selected date.")

# Footer: maintainer credit and contact link (rendered as raw HTML).
st.divider()
st.subheader('About')
st.write(
    'This meta leaderboard is built and maintained by Felix Zaslavskiy. '
    'For feedback, correction, suggestions please reach out on X at '
    '<a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> '
    'or here via community discussions.',
    unsafe_allow_html=True,
)