File size: 11,809 Bytes
7411847
6d7a85e
 
7411847
 
 
 
 
d1b3326
7411847
8d4c97a
 
7411847
9a34cda
7411847
9a34cda
 
 
 
 
 
 
8fcd96c
6d7a85e
 
 
 
8fcd96c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d7a85e
0b04027
5669002
0b04027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1b3326
0b04027
 
 
 
 
 
 
d1b3326
0b04027
 
6d7a85e
 
0b04027
6d7a85e
0b04027
 
84f536a
0b04027
 
 
 
 
 
 
 
 
 
 
 
 
5d6c941
 
0b04027
 
 
 
5d6c941
 
0b04027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d6c941
 
 
0b04027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d6c941
 
0b04027
 
 
5d6c941
0b04027
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e254e41
0b04027
e254e41
0b04027
 
 
e254e41
0b04027
 
e254e41
b9fbf95
0b04027
 
 
e254e41
b9fbf95
0b04027
b9fbf95
0b04027
 
 
0110cd9
0b04027
e254e41
5d6c941
 
 
 
 
 
0b04027
 
 
 
e254e41
0b04027
 
e254e41
0b04027
7411847
0b04027
 
 
84f536a
0b04027
 
84f536a
0b04027
 
b0337e1
9a34cda
 
0e46dc8
 
f765ec9
5d6c941
0e46dc8
 
 
 
b9fbf95
9a34cda
6ca74e2
9a34cda
 
5669002
 
8d4c97a
b0337e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# app.py
import json

import streamlit as st
import glob
import os
from datetime import datetime

# Page chrome: wide layout, title, one-line description and a spacer.
st.set_page_config(layout="wide")
st.title('Meta Open LLM leaderboard')
st.write("Combine data from various open LLM leaderboards into one useful visualization page")
st.write("<nbsp/>", unsafe_allow_html=True)

# Every sub-directory of ./data is one data generation run whose name is a
# "%Y%m%d_%H%M" timestamp (see format_dir_date). os.listdir() returns entries
# in arbitrary order, while the selectors below preselect the LAST entry as
# the most recent run, so sort the names: for this timestamp format the
# lexicographic order is also the chronological order. Stray files are
# dropped because they cannot be parsed as a run timestamp.
directories = sorted(
    entry
    for entry in os.listdir("./data")
    if os.path.isdir(os.path.join("./data", entry))
)

def format_dir_date(data_dir):
    """Turn a data directory name into a human-readable timestamp.

    Args:
        data_dir: Directory name in ``%Y%m%d_%H%M`` form, e.g. ``"20231015_1430"``.

    Returns:
        The timestamp rendered as ``"%b %d, %Y %H:%M"``, e.g. ``"Oct 15, 2023 14:30"``.

    Raises:
        ValueError: If ``data_dir`` does not match the expected format.
    """
    stamp = datetime.strptime(data_dir, "%Y%m%d_%H%M")
    # A datetime format spec inside an f-string is forwarded to strftime().
    return f"{stamp:%b %d, %Y %H:%M}"

def _model_list_html(model_ids):
    """Return a ``<ul>`` of Hugging Face links, one ``<li>`` per model id."""
    # Local import: the file's import block does not bring in ``html``.
    from html import escape

    items = []
    for model_id in model_ids:
        # Long ids keep only their last 35 characters, prefixed with an ellipsis.
        label = model_id if len(model_id) <= 35 else '...' + model_id[-35:]
        # The ids come from a data file and end up in raw HTML, so escape them
        # both inside the href attribute and in the link text.
        items.append(
            f'<li><a href="https://huggingface.co/{escape(model_id)}">{escape(label)}</a></li>'
        )
    return "<ul>" + "".join(items) + "</ul>"


def print_model_list(file_name, st, split_into_two=False):
    """Render the list of models that belongs to a chart image.

    The model ids are read from the JSON file that sits next to the image and
    shares its name (``chart.png`` -> ``chart.json``); the file must contain a
    JSON list of model id strings.

    Args:
        file_name: Path of the chart image.
        st: Streamlit module or container to render into. It must provide
            ``write`` and, when ``split_into_two`` is true, ``columns``.
        split_into_two: When true, the list is split into two halves shown in
            two columns (the first column gets the extra item for odd
            lengths); otherwise a single list is written.

    Raises:
        FileNotFoundError: If the companion JSON file does not exist.
    """
    file_path = os.path.splitext(file_name)[0] + '.json'
    with open(file_path, 'r', encoding='utf-8') as file:
        model_ids = json.load(file)

    if not split_into_two:
        st.write(_model_list_html(model_ids), unsafe_allow_html=True)
        return

    # Ceiling division so that the left column is never the shorter one.
    midpoint = (len(model_ids) + 1) // 2
    cols = st.columns(2)
    cols[0].write(_model_list_html(model_ids[:midpoint]), unsafe_allow_html=True)
    cols[1].write(_model_list_html(model_ids[midpoint:]), unsafe_allow_html=True)

# Two-column control row: the primary date selector on the left, the optional
# comparison controls on the right.
col1, col2 = st.columns(2)

with col1:
    # Entries of `directories` are shown through format_dir_date; the last
    # entry of the list is preselected.
    data_dir = st.selectbox(
        'Select different data generation date',
        directories,
        format_func=format_dir_date,
        index=len(directories)-1,
        )
with col2:
    compare_mode = st.checkbox('Enable compare to different date')
    # NOTE: compare_data_dir is only assigned while the checkbox is ticked, so
    # it must only be read behind a `compare_mode` check.
    if compare_mode:
        compare_data_dir = st.selectbox(
            'Select date for comparison',
            directories,
            format_func=format_dir_date,
            index=len(directories)-1,
        )

# Captions keyed by image file name; looked up with .get(filename, "") when
# the images outside the per-leaderboard categories are rendered.
captions_map = {
    "hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
    "hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
    "hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
    "hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
    "hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare"
}

# Show the formatted timestamp of the selected run underneath its selector.
with col1:
    st.write("<div style=\"text-align: center\" >Generated on: <b>" + format_dir_date(data_dir) + "</b></div>", unsafe_allow_html=True)


# Directory holding the images and JSON files of the selected run.
data_path = './data/' + data_dir

# Adjust the data path loading logic
def extract_images(keyword, img_list):
    """Return the paths in img_list whose file name contains keyword."""
    return [img for img in img_list if keyword in os.path.basename(img)]


def pair_by_filename(primary, other):
    """Pair up paths from two run directories that share a file name.

    glob() and set() give no ordering guarantee, so zipping two listings can
    put different charts next to each other. Matching on the file name keeps
    each chart beside its counterpart. Pairs are returned in file-name order;
    files that exist in only one of the two lists are skipped.
    """
    other_by_name = {os.path.basename(path): path for path in other}
    pairs = []
    for path in sorted(primary, key=os.path.basename):
        match = other_by_name.get(os.path.basename(path))
        if match is not None:
            pairs.append((path, match))
    return pairs


def display_side_by_side(diagrams1, diagrams2, title):
    """Render a titled section with matching charts of both runs side by side."""
    st.subheader(title, divider=True)
    for d1, d2 in pair_by_filename(diagrams1, diagrams2):
        cols = st.columns(2)
        cols[0].image(d1, use_column_width="auto")
        cols[1].image(d2, use_column_width="auto")


def show_chart(diagrams, index, image_target, list_target, caption=None, split_into_two=False):
    """Show diagrams[index] followed by its model list.

    Does nothing when the run has no chart at that index, so selecting a run
    that lacks a chart no longer raises IndexError.
    """
    if index >= len(diagrams):
        return
    image_target.image(diagrams[index], caption=caption, use_column_width="auto")
    print_model_list(diagrams[index], list_target, split_into_two)


# (keyword contained in the image file name, section title)
HF_SECTION = ('hf_llm_diagram', "HuggingFace Open LLM leaderboard by Model Size")
SINGLE_CHART_SECTIONS = [
    ('bigcode', "Big Code Models Leaderboard"),
    ('lmsys_leaderboard_mt_bench', "MT-Bench Models Leaderboard"),
    ('lmsys_leaderboard_arena', "LMSYS Arena Elo Models Leaderboard"),
    ('opencompass_leaderboard', "OpenCompass Models Leaderboard"),
]
ALL_SECTIONS = [HF_SECTION, *SINGLE_CHART_SECTIONS]

imgs = glob.glob(os.path.join(data_path, '*.png'))
diagrams_by_keyword = {keyword: extract_images(keyword, imgs) for keyword, _ in ALL_SECTIONS}

# Everything that belongs to no leaderboard section (arena charts included) is
# a cross-leaderboard comparison chart. Sorted so the layout is stable between
# reruns instead of following set iteration order.
categorized = set().union(*diagrams_by_keyword.values())
remaining_imgs = sorted(img for img in imgs if img not in categorized)

if compare_mode:
    # Side by side compare of the selected run against the comparison run.
    compare_data_path = './data/' + compare_data_dir
    compare_imgs = glob.glob(os.path.join(compare_data_path, '*.png'))

    compare_categorized = set()
    for keyword, title in ALL_SECTIONS:
        compare_diagrams = extract_images(keyword, compare_imgs)
        compare_categorized.update(compare_diagrams)
        display_side_by_side(diagrams_by_keyword[keyword], compare_diagrams, title)

    compare_remaining_imgs = [img for img in compare_imgs if img not in compare_categorized]

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")

    for img, compare_img in pair_by_filename(remaining_imgs, compare_remaining_imgs):
        cols = st.columns(2)

        # Both images share a file name, hence also the caption.
        caption = captions_map.get(os.path.basename(img), "")

        cols[0].image(img, caption=caption, width=None)
        cols[1].image(compare_img, caption=caption, width=None)

else:
    # NOTE(review): the captions below are tied to positions in
    # hf_llm_diagrams, which follows glob() order; confirm that this order is
    # what the captions expect on the deployment file system.
    hf_llm_diagrams = diagrams_by_keyword[HF_SECTION[0]]

    st.subheader(HF_SECTION[1], divider=True)
    cols = st.columns(2)
    show_chart(hf_llm_diagrams, 0, cols[0], st,
               caption="Main chart using all the models", split_into_two=True)
    st.write("<nbsp/>", unsafe_allow_html=True)

    cols = st.columns(2)
    show_chart(hf_llm_diagrams, 1, cols[0], cols[0],
               caption="Other or commercially permissive licenses only")
    show_chart(hf_llm_diagrams, 2, cols[1], cols[1],
               caption="Commercially permissive license only")

    st.write("<nbsp/>", unsafe_allow_html=True)

    cols = st.columns(2)
    show_chart(hf_llm_diagrams, 3, cols[0], cols[0],
               caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size")
    show_chart(hf_llm_diagrams, 4, cols[1], cols[1],
               caption="ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size")

    # The other leaderboards each show one chart plus a two-column model list.
    for keyword, title in SINGLE_CHART_SECTIONS:
        diagrams = diagrams_by_keyword[keyword]
        if not diagrams:
            # This run has no chart for the leaderboard; skip the section.
            continue
        st.subheader(title, divider=True)
        cols = st.columns(2)
        show_chart(diagrams, 0, cols[0], st, split_into_two=True)

    st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
    st.caption("Only models evaluated on both leaderboards are included.")

    cols = st.columns(2)

    for i, img in enumerate(remaining_imgs):
        # Caption defaults to an empty string for unknown file names.
        caption = captions_map.get(os.path.basename(img), "")

        # Alternate between the left and the right column.
        cols[i % 2].image(img, caption=caption, width=None)

# Footer: the list of source leaderboards, each with a short description,
# followed by a note on which HuggingFace models are excluded.
st.write(
    """
    <p>Leaderboards tracked:</p>
     <ul>
        <li><a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">Hugging Face Open LLM</a></li>
        <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">MT-Bench and Arena Elo</a> MT-Bench is GPT4 judged evaluation of models, Arena Elo is users ranking outputs between models.</li>
        <li><a href="https://tatsu-lab.github.io/alpaca_eval/">AlpacaEval</a> GPT4 judged evaluation of models</li>
        <li><a href="https://www.mosaicml.com/llm-evaluation">MosaicML</a> Balanced set of static benchmarks</li>
        <li><a href="https://opencompass.org.cn/leaderboard-llm">OpenCompass</a> Balanced set of static benchmarks</li>
        <li><a href="https://llmbench.ai/data">AgentBench</a> Benchmark evaluating Agent abilities</li>
        <li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">BigCode</a>  Compare performance of base multilingual code generation models</li>
        </ul>
        <sub>HuggingFace models that have been flagged as contaminated or do not provide any model card information are excluded.</sub>
    """, unsafe_allow_html=True
)


# "About" section with maintainer contact details.
st.subheader('About', divider=True)
st.write('This meta leaderboard is built and maintained by Felix Zaslavskiy. For feedback, correction, suggestions please reach out on X at <a href="https://twitter.com/FZaslavskiy" >@FZaslavskiy</a> or here via community discussions.', unsafe_allow_html=True)