Upload app.py

Code cleanup and added comparison table to compare models.

app.py (CHANGED)
@@ -1,3 +1,4 @@
+# Importing necessary libraries
 import re
 import streamlit as st
 import requests
@@ -6,11 +7,35 @@ from io import StringIO
 import plotly.graph_objs as go
 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
-
 from yall import create_yall
+from functools import cache
+
+
 
+# Function to get model info from Hugging Face API using caching
+@cache
+def cached_model_info(api, model):
+    try:
+        return api.model_info(repo_id=str(model))
+    except (RepositoryNotFoundError, RevisionNotFoundError):
+        return None
 
+# Function to get model info from DataFrame and update it with likes and tags
+@st.cache
+def get_model_info(df):
+    api = HfApi()
 
+    for index, row in df.iterrows():
+        model_info = cached_model_info(api, row['Model'].strip())
+        if model_info:
+            df.loc[index, 'Likes'] = model_info.likes
+            df.loc[index, 'Tags'] = ', '.join(model_info.tags)
+        else:
+            df.loc[index, 'Likes'] = -1
+            df.loc[index, 'Tags'] = ''
+    return df
+
+# Function to convert markdown table to DataFrame and extract Hugging Face URLs
 def convert_markdown_table_to_dataframe(md_content):
     """
     Converts markdown table to Pandas DataFrame, handling special characters and links,
@@ -59,8 +84,7 @@ def get_model_info(df):
 
     return df
 
-
-
+# Function to create bar chart for a given category
 def create_bar_chart(df, category):
     """Create and display a bar chart for a given category."""
     st.write(f"### {category} Scores")
@@ -73,7 +97,7 @@ def create_bar_chart(df, category):
         x=sorted_df[category],
         y=sorted_df['Model'],
         orientation='h',
-        marker=dict(color=sorted_df[category], colorscale='Viridis')
+        marker=dict(color=sorted_df[category], colorscale='Spectral')  # You can change 'Viridis' to another color scale
     ))
 
     # Update layout for better readability
@@ -82,17 +106,17 @@ def create_bar_chart(df, category):
     )
 
     # Adjust the height of the chart based on the number of rows in the DataFrame
-    st.plotly_chart(fig, use_container_width=True, height=35)
-
-# Example usage:
-# create_bar_chart(your_dataframe, 'Your_Category')
-
+    st.plotly_chart(fig, use_container_width=True, height=len(df) * 35)
 
+# Main function to run the Streamlit app
 def main():
+    # Set page configuration and title
     st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
 
     st.title("🏆 YALL - Yet Another LLM Leaderboard")
     st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using [Nous](https://huggingface.co/NousResearch) benchmark suite.")
+
+    # Create tabs for leaderboard and about section
     content = create_yall()
     tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])
 
@@ -104,21 +128,19 @@ def main():
 
         # Display dataframe
         full_df = convert_markdown_table_to_dataframe(content)
+
         for col in score_columns:
            # Corrected use of pd.to_numeric
            full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
+
         full_df = get_model_info(full_df)
         full_df['Tags'] = full_df['Tags'].fillna('')
         df = pd.DataFrame(columns=full_df.columns)
 
-        # Toggles
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            show_phi = st.checkbox("Phi (2.8B)", value=True)
-        with col2:
-            show_mistral = st.checkbox("Mistral (7B)", value=True)
-        with col3:
-            show_other = st.checkbox("Other", value=True)
+        # Toggles for filtering by tags
+        show_phi = st.checkbox("Phi (2.8B)", value=True)
+        show_mistral = st.checkbox("Mistral (7B)", value=True)
+        show_other = st.checkbox("Other", value=True)
 
         # Create a DataFrame based on selected filters
         dfs_to_concat = []
@@ -135,9 +157,6 @@ def main():
         if dfs_to_concat:
             df = pd.concat(dfs_to_concat, ignore_index=True)
 
-        # Sort values
-        df = df.sort_values(by='Average', ascending=False)
-
         # Add a search bar
         search_query = st.text_input("Search models", "")
 
@@ -158,9 +177,11 @@ def main():
                 "URL": st.column_config.LinkColumn("URL"),
             },
             hide_index=True,
-            height=
+            height=len(df) * 37,
         )
-
+        selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+        comparison_df = df[df['Model'].isin(selected_models)]
+        st.dataframe(comparison_df)
         # Add a button to export data to CSV
         if st.button("Export to CSV"):
             # Export the DataFrame to CSV
@@ -203,27 +224,28 @@ def main():
     with tab2:
         st.markdown('''
        ### Nous benchmark suite
-
+
        Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
-
+
        * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
        * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
        * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
        * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
-
+
        ### Reproducibility
-
+
        You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
-
+
        ### Clone this space
-
+
        You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
-
+
        * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
        * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
-
-        A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations
+
+        A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
        ''')
-
+
+# Run the main function if this script is run directly
 if __name__ == "__main__":
     main()
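The "comparison table" mentioned in the commit message is the three lines added after the main `st.dataframe` call: a `st.multiselect` picks model names, the frame is filtered with `isin`, and the result is displayed. A self-contained sketch of that pattern, with made-up sample data standing in for the real leaderboard frame produced by `create_yall()`:

```python
import pandas as pd
import streamlit as st

# Toy stand-in for the leaderboard DataFrame; the real app builds it from create_yall()
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Average": [52.1, 48.7, 45.3],
})

# Same pattern as the diff: pick models, filter rows, display the comparison
selected_models = st.multiselect("Select models to compare", df["Model"].unique())
comparison_df = df[df["Model"].isin(selected_models)]
st.dataframe(comparison_df)
```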
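`convert_markdown_table_to_dataframe` itself is unchanged by this commit, so only its signature and docstring appear in the diff. As a rough illustration of the kind of parsing the docstring describes, here is a minimal, hypothetical sketch of turning a pipe-delimited markdown table into a DataFrame; the function name and cleanup steps are assumptions, not the app's actual implementation (which also handles special characters and Hugging Face links):

```python
import re
from io import StringIO

import pandas as pd


def markdown_table_to_df_sketch(md_content: str) -> pd.DataFrame:
    """Illustrative only: parse a pipe-delimited markdown table into a DataFrame."""
    # Drop the |---|---| separator row that sits under the markdown header row
    lines = [ln for ln in md_content.strip().splitlines()
             if ln.strip() and not re.fullmatch(r"[\s|:\-]+", ln)]
    df = pd.read_csv(StringIO("\n".join(lines)), sep="|")
    # Leading/trailing pipes produce empty "Unnamed" columns; drop them and trim headers
    df = df.drop(columns=[c for c in df.columns if str(c).startswith("Unnamed")])
    df.columns = [str(c).strip() for c in df.columns]
    return df
```

In the app, the resulting frame is then cleaned further, e.g. the score columns are passed through `pd.to_numeric` and `get_model_info` adds likes and tags, as shown in the diff above.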
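The "Clone this space" instructions in the About tab refer to two settings that live outside this file: the `gist_id` in yall.py and a Space secret named "github". Since yall.py is not part of this diff, the following is only a rough sketch of how a leaderboard gist could be fetched with such a token; the function name, the environment-variable lookup, and taking the first file of the gist are assumptions, not the space's actual code:

```python
import os

import requests


def fetch_gist_markdown(gist_id: str) -> str:
    """Illustrative only: download leaderboard markdown stored in a GitHub gist."""
    # On Hugging Face Spaces, a secret named "github" is exposed as an environment variable
    token = os.environ.get("github", "")
    headers = {"Authorization": f"token {token}"} if token else {}
    resp = requests.get(f"https://api.github.com/gists/{gist_id}", headers=headers, timeout=30)
    resp.raise_for_status()
    files = resp.json()["files"]
    # A gist can contain several files; this sketch simply takes the first one's content
    return next(iter(files.values()))["content"]
```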