Spaces:
Sleeping
Sleeping
felix
committed on
Commit
•
0b04027
1
Parent(s):
ee1c446
add side by side compare
Browse files
app.py
CHANGED
@@ -20,40 +20,6 @@ def format_dir_date(data_dir):
|
|
20 |
# Formatting the parsed date
|
21 |
return parsed_date.strftime("%b %d, %Y %H:%M")
|
22 |
|
23 |
-
col1, col2 = st.columns(2)
|
24 |
-
|
25 |
-
with col1:
|
26 |
-
data_dir = st.selectbox(
|
27 |
-
'Select different data generation date',
|
28 |
-
directories,
|
29 |
-
format_func=format_dir_date,
|
30 |
-
index=len(directories)-1,
|
31 |
-
)
|
32 |
-
|
33 |
-
captions_map = {
|
34 |
-
"hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
|
35 |
-
"hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
|
36 |
-
"hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
|
37 |
-
"hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
|
38 |
-
"hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare"
|
39 |
-
}
|
40 |
-
with col2:
|
41 |
-
st.write("<div style=\"text-align: center\" >Generated on: <b>" + format_dir_date(data_dir) + "</b></div>", unsafe_allow_html=True)
|
42 |
-
|
43 |
-
|
44 |
-
data_path = './data/' + data_dir
|
45 |
-
|
46 |
-
imgs = glob.glob(os.path.join(data_path, '*.png'))
|
47 |
-
|
48 |
-
# Extracting images that start with "hf_llm_diagram"
|
49 |
-
hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
|
50 |
-
bigcode_diagrams = [img for img in imgs if 'bigcode' in os.path.basename(img)]
|
51 |
-
mt_bench_diagrams = [img for img in imgs if 'mt_bench_leaderboard' in os.path.basename(img)]
|
52 |
-
opencompass_diagrams = [img for img in imgs if 'opencompass_leaderboard' in os.path.basename(img)]
|
53 |
-
|
54 |
-
# Getting the remaining images
|
55 |
-
remaining_imgs = list(set(imgs) - set(hf_llm_diagrams) - set(bigcode_diagrams) - set(mt_bench_diagrams) - set(opencompass_diagrams))
|
56 |
-
|
57 |
def print_model_list(file_name, st, split_into_two=False):
|
58 |
file_path = file_name[:-4] + '.json'
|
59 |
# Read the list from the JSON file
|
@@ -96,66 +62,176 @@ def print_model_list(file_name, st, split_into_two=False):
|
|
96 |
final_html += "</ul>"
|
97 |
st.write(final_html, unsafe_allow_html=True)
|
98 |
|
|
|
99 |
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
-
|
106 |
-
st.write("<
|
107 |
|
108 |
-
cols = st.columns(2)
|
109 |
|
110 |
-
|
111 |
-
print_model_list(hf_llm_diagrams[1],cols[0])
|
112 |
|
113 |
-
|
114 |
-
|
115 |
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
-
|
119 |
-
cols[0].image(hf_llm_diagrams[3],caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
|
120 |
-
print_model_list(hf_llm_diagrams[3],cols[0],False)
|
121 |
|
122 |
-
cols
|
123 |
-
|
|
|
124 |
|
|
|
|
|
125 |
|
126 |
|
127 |
-
st.subheader("Big Code Models Leaderboard", divider=True)
|
128 |
-
cols = st.columns(2)
|
129 |
-
cols[0].image(bigcode_diagrams[0], use_column_width="auto")
|
130 |
|
131 |
|
132 |
-
print_model_list(bigcode_diagrams[0],st,True)
|
133 |
|
134 |
-
st.subheader("MT-Bench Models Leaderboard", divider=True)
|
135 |
-
cols = st.columns(2)
|
136 |
-
cols[0].image(mt_bench_diagrams[0], use_column_width="auto")
|
137 |
|
138 |
-
print_model_list(mt_bench_diagrams[0],st,True)
|
139 |
|
140 |
-
st.subheader("OpenCompass Models Leaderboard", divider=True)
|
141 |
-
cols = st.columns(2)
|
142 |
-
cols[0].image(opencompass_diagrams[0], use_column_width="auto")
|
143 |
-
print_model_list(opencompass_diagrams[0],st,True)
|
144 |
|
145 |
-
st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
|
146 |
-
st.caption("Only models evaluated on both leaderboards are included.")
|
147 |
|
148 |
-
cols = st.columns(2)
|
149 |
|
150 |
-
for i, img in enumerate(remaining_imgs):
|
151 |
-
|
152 |
-
|
153 |
|
154 |
-
|
155 |
-
|
156 |
|
157 |
-
|
158 |
-
|
159 |
|
160 |
st.write(
|
161 |
"""
|
|
|
20 |
# Formatting the parsed date
|
21 |
return parsed_date.strftime("%b %d, %Y %H:%M")
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def print_model_list(file_name, st, split_into_two=False):
|
24 |
file_path = file_name[:-4] + '.json'
|
25 |
# Read the list from the JSON file
|
|
|
62 |
final_html += "</ul>"
|
63 |
st.write(final_html, unsafe_allow_html=True)
|
64 |
|
65 |
+
col1, col2 = st.columns(2)
|
66 |
|
67 |
+
with col1:
|
68 |
+
data_dir = st.selectbox(
|
69 |
+
'Select different data generation date',
|
70 |
+
directories,
|
71 |
+
format_func=format_dir_date,
|
72 |
+
index=len(directories)-1,
|
73 |
+
)
|
74 |
+
with col2:
|
75 |
+
compare_mode = st.checkbox('Enable compare to different date')
|
76 |
+
if compare_mode:
|
77 |
+
compare_data_dir = st.selectbox(
|
78 |
+
'Select date for comparison',
|
79 |
+
directories,
|
80 |
+
format_func=format_dir_date,
|
81 |
+
index=len(directories)-1,
|
82 |
+
)
|
83 |
|
84 |
+
captions_map = {
|
85 |
+
"hg_average_to_agentbench_compare.png": "HF to AgentBench compare",
|
86 |
+
"hg_average_to_opencompass_compare.png": "HF to OpenCompass compare",
|
87 |
+
"hg_average_to_mt_bench_compare.png": "HF to MT-Bench compare",
|
88 |
+
"hg_average_to_mosaic_compare.png": "HF to MosaicML compare",
|
89 |
+
"hg_average_to_alpacaeval_compare.png": "HF to AlpacaEval compare"
|
90 |
+
}
|
91 |
|
92 |
+
with col1:
|
93 |
+
st.write("<div style=\"text-align: center\" >Generated on: <b>" + format_dir_date(data_dir) + "</b></div>", unsafe_allow_html=True)
|
94 |
|
|
|
95 |
|
96 |
+
data_path = './data/' + data_dir
|
|
|
97 |
|
98 |
+
# Adjust the data path loading logic
|
99 |
+
if compare_mode:
|
100 |
|
101 |
+
# Side by side compare:
|
102 |
+
compare_data_path = './data/' + compare_data_dir
|
103 |
+
|
104 |
+
# Load images from both directories
|
105 |
+
imgs = glob.glob(os.path.join(data_path, '*.png'))
|
106 |
+
compare_imgs = glob.glob(os.path.join(compare_data_path, '*.png'))
|
107 |
+
|
108 |
+
# Extracting images that start with specific keywords from both sets
|
109 |
+
def extract_images(keyword, img_list):
|
110 |
+
return [img for img in img_list if keyword in os.path.basename(img)]
|
111 |
+
|
112 |
+
hf_llm_diagrams = extract_images('hf_llm_diagram', imgs)
|
113 |
+
bigcode_diagrams = extract_images('bigcode', imgs)
|
114 |
+
mt_bench_diagrams = extract_images('mt_bench_leaderboard', imgs)
|
115 |
+
opencompass_diagrams = extract_images('opencompass_leaderboard', imgs)
|
116 |
+
|
117 |
+
compare_hf_llm_diagrams = extract_images('hf_llm_diagram', compare_imgs)
|
118 |
+
compare_bigcode_diagrams = extract_images('bigcode', compare_imgs)
|
119 |
+
compare_mt_bench_diagrams = extract_images('mt_bench_leaderboard', compare_imgs)
|
120 |
+
compare_opencompass_diagrams = extract_images('opencompass_leaderboard', compare_imgs)
|
121 |
+
|
122 |
+
# Display each category side by side
|
123 |
+
def display_side_by_side(diagrams1, diagrams2, title):
|
124 |
+
st.subheader(title, divider=True)
|
125 |
+
for d1, d2 in zip(diagrams1, diagrams2):
|
126 |
+
cols = st.columns(2)
|
127 |
+
cols[0].image(d1, use_column_width="auto")
|
128 |
+
cols[1].image(d2, use_column_width="auto")
|
129 |
+
|
130 |
+
# Displaying HuggingFace LLM Leaderboard
|
131 |
+
display_side_by_side(hf_llm_diagrams, compare_hf_llm_diagrams, "HuggingFace Open LLM leaderboard by Model Size")
|
132 |
+
|
133 |
+
# Displaying Big Code Models Leaderboard
|
134 |
+
display_side_by_side(bigcode_diagrams, compare_bigcode_diagrams, "Big Code Models Leaderboard")
|
135 |
+
|
136 |
+
# Displaying MT-Bench Models Leaderboard
|
137 |
+
display_side_by_side(mt_bench_diagrams, compare_mt_bench_diagrams, "MT-Bench Models Leaderboard")
|
138 |
+
|
139 |
+
# Displaying OpenCompass Models Leaderboard
|
140 |
+
display_side_by_side(opencompass_diagrams, compare_opencompass_diagrams, "OpenCompass Models Leaderboard")
|
141 |
+
|
142 |
+
# Extracting remaining images from both sets
|
143 |
+
remaining_imgs = list(set(imgs) - set(hf_llm_diagrams) - set(bigcode_diagrams) - set(mt_bench_diagrams) - set(opencompass_diagrams))
|
144 |
+
compare_remaining_imgs = list(set(compare_imgs) - set(compare_hf_llm_diagrams) - set(compare_bigcode_diagrams) - set(compare_mt_bench_diagrams) - set(compare_opencompass_diagrams))
|
145 |
+
|
146 |
+
st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
|
147 |
+
st.caption("Only models evaluated on both leaderboards are included.")
|
148 |
+
|
149 |
+
# Display remaining images side by side
|
150 |
+
for img, compare_img in zip(remaining_imgs, compare_remaining_imgs):
|
151 |
+
cols = st.columns(2)
|
152 |
+
|
153 |
+
# Extract the filename and caption for the first image
|
154 |
+
filename = os.path.basename(img)
|
155 |
+
caption = captions_map.get(filename, "")
|
156 |
+
|
157 |
+
# Extract the filename and caption for the comparison image
|
158 |
+
compare_filename = os.path.basename(compare_img)
|
159 |
+
compare_caption = captions_map.get(compare_filename, "")
|
160 |
+
|
161 |
+
# Display the images with captions
|
162 |
+
cols[0].image(img, caption=caption, width=None)
|
163 |
+
cols[1].image(compare_img, caption=compare_caption, width=None)
|
164 |
+
|
165 |
+
else:
|
166 |
+
imgs = glob.glob(os.path.join(data_path, '*.png'))
|
167 |
+
|
168 |
+
# Extracting images that start with "hf_llm_diagram"
|
169 |
+
hf_llm_diagrams = [img for img in imgs if 'hf_llm_diagram' in os.path.basename(img)]
|
170 |
+
bigcode_diagrams = [img for img in imgs if 'bigcode' in os.path.basename(img)]
|
171 |
+
mt_bench_diagrams = [img for img in imgs if 'mt_bench_leaderboard' in os.path.basename(img)]
|
172 |
+
opencompass_diagrams = [img for img in imgs if 'opencompass_leaderboard' in os.path.basename(img)]
|
173 |
+
|
174 |
+
# Getting the remaining images
|
175 |
+
remaining_imgs = list(set(imgs) - set(hf_llm_diagrams) - set(bigcode_diagrams) - set(mt_bench_diagrams) - set(opencompass_diagrams))
|
176 |
+
|
177 |
+
st.subheader("HuggingFace Open LLM leaderboard by Model Size", divider=True)
|
178 |
+
cols = st.columns(2)
|
179 |
+
|
180 |
+
cols[0].image(hf_llm_diagrams[0], caption="Main chart using all the models", use_column_width="auto")
|
181 |
+
|
182 |
+
print_model_list(hf_llm_diagrams[0],st, True)
|
183 |
+
st.write("<nbsp/>", unsafe_allow_html=True)
|
184 |
+
|
185 |
+
cols = st.columns(2)
|
186 |
+
|
187 |
+
cols[0].image(hf_llm_diagrams[1],caption="Other or commercially permissive licenses only", use_column_width="auto")
|
188 |
+
print_model_list(hf_llm_diagrams[1],cols[0])
|
189 |
+
|
190 |
+
cols[1].image(hf_llm_diagrams[2],caption="Commercially permissive license only", use_column_width="auto")
|
191 |
+
print_model_list(hf_llm_diagrams[2],cols[1])
|
192 |
|
193 |
+
st.write("<nbsp/>", unsafe_allow_html=True)
|
|
|
|
|
194 |
|
195 |
+
cols = st.columns(2)
|
196 |
+
cols[0].image(hf_llm_diagrams[3],caption="TruthfulQA at 10% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
|
197 |
+
print_model_list(hf_llm_diagrams[3],cols[0],False)
|
198 |
|
199 |
+
cols[1].image(hf_llm_diagrams[4],caption="ARC at 50% and MMLU at 50% for HuggingFace Open LLM leaderboard by Model Size", use_column_width="auto")
|
200 |
+
print_model_list(hf_llm_diagrams[4],cols[1],False)
|
201 |
|
202 |
|
203 |
+
st.subheader("Big Code Models Leaderboard", divider=True)
|
204 |
+
cols = st.columns(2)
|
205 |
+
cols[0].image(bigcode_diagrams[0], use_column_width="auto")
|
206 |
|
207 |
|
208 |
+
print_model_list(bigcode_diagrams[0],st,True)
|
209 |
|
210 |
+
st.subheader("MT-Bench Models Leaderboard", divider=True)
|
211 |
+
cols = st.columns(2)
|
212 |
+
cols[0].image(mt_bench_diagrams[0], use_column_width="auto")
|
213 |
|
214 |
+
print_model_list(mt_bench_diagrams[0],st,True)
|
215 |
|
216 |
+
st.subheader("OpenCompass Models Leaderboard", divider=True)
|
217 |
+
cols = st.columns(2)
|
218 |
+
cols[0].image(opencompass_diagrams[0], use_column_width="auto")
|
219 |
+
print_model_list(opencompass_diagrams[0],st,True)
|
220 |
|
221 |
+
st.subheader("HuggingFace and Other Leaderboards: A Comparative Model Evaluation", divider=True)
|
222 |
+
st.caption("Only models evaluated on both leaderboards are included.")
|
223 |
|
224 |
+
cols = st.columns(2)
|
225 |
|
226 |
+
for i, img in enumerate(remaining_imgs):
|
227 |
+
# Extract the filename from the full image path
|
228 |
+
filename = os.path.basename(img)
|
229 |
|
230 |
+
# Get the caption from the captions_map dictionary
|
231 |
+
caption = captions_map.get(filename, "") # If no caption is found, it will default to an empty string
|
232 |
|
233 |
+
# Display the image with the caption
|
234 |
+
cols[i % 2].image(img, caption=caption, width=None)
|
235 |
|
236 |
st.write(
|
237 |
"""
|