Files changed (1)
  1. app.py +54 -32
app.py CHANGED
@@ -1,3 +1,4 @@
+# Importing necessary libraries
 import re
 import streamlit as st
 import requests
@@ -6,11 +7,35 @@ from io import StringIO
 import plotly.graph_objs as go
 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
-
 from yall import create_yall
+from functools import cache
+
+
 
+# Function to get model info from Hugging Face API using caching
+@cache
+def cached_model_info(api, model):
+    try:
+        return api.model_info(repo_id=str(model))
+    except (RepositoryNotFoundError, RevisionNotFoundError):
+        return None
 
+# Function to get model info from DataFrame and update it with likes and tags
+@st.cache
+def get_model_info(df):
+    api = HfApi()
 
+    for index, row in df.iterrows():
+        model_info = cached_model_info(api, row['Model'].strip())
+        if model_info:
+            df.loc[index, 'Likes'] = model_info.likes
+            df.loc[index, 'Tags'] = ', '.join(model_info.tags)
+        else:
+            df.loc[index, 'Likes'] = -1
+            df.loc[index, 'Tags'] = ''
+    return df
+
+# Function to convert markdown table to DataFrame and extract Hugging Face URLs
 def convert_markdown_table_to_dataframe(md_content):
     """
     Converts markdown table to Pandas DataFrame, handling special characters and links,
@@ -59,8 +84,7 @@ def get_model_info(df):
 
     return df
 
-
-
+# Function to create bar chart for a given category
 def create_bar_chart(df, category):
     """Create and display a bar chart for a given category."""
     st.write(f"### {category} Scores")
@@ -73,7 +97,7 @@ def create_bar_chart(df, category):
         x=sorted_df[category],
         y=sorted_df['Model'],
         orientation='h',
-        marker=dict(color=sorted_df[category], colorscale='Inferno')
+        marker=dict(color=sorted_df[category], colorscale='Spectral')  # You can change 'Spectral' to another color scale
     ))
 
     # Update layout for better readability
@@ -82,17 +106,17 @@ def create_bar_chart(df, category):
     )
 
     # Adjust the height of the chart based on the number of rows in the DataFrame
-    st.plotly_chart(fig, use_container_width=True, height=35)
-
-    # Example usage:
-    # create_bar_chart(your_dataframe, 'Your_Category')
-
+    st.plotly_chart(fig, use_container_width=True, height=len(df) * 35)
 
+# Main function to run the Streamlit app
 def main():
+    # Set page configuration and title
     st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
 
    st.title("🏆 YALL - Yet Another LLM Leaderboard")
    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using [Nous](https://huggingface.co/NousResearch) benchmark suite.")
+
+    # Create tabs for leaderboard and about section
     content = create_yall()
     tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])
 
@@ -104,21 +128,19 @@ def main():
 
         # Display dataframe
         full_df = convert_markdown_table_to_dataframe(content)
+
         for col in score_columns:
             # Corrected use of pd.to_numeric
             full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
+
         full_df = get_model_info(full_df)
         full_df['Tags'] = full_df['Tags'].fillna('')
         df = pd.DataFrame(columns=full_df.columns)
 
-        # Toggles
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            show_phi = st.checkbox("Phi (2.8B)", value=True)
-        with col2:
-            show_mistral = st.checkbox("Mistral (7B)", value=True)
-        with col3:
-            show_other = st.checkbox("Other", value=True)
+        # Toggles for filtering by tags
+        show_phi = st.checkbox("Phi (2.8B)", value=True)
+        show_mistral = st.checkbox("Mistral (7B)", value=True)
+        show_other = st.checkbox("Other", value=True)
 
         # Create a DataFrame based on selected filters
         dfs_to_concat = []
@@ -135,9 +157,6 @@ def main():
         if dfs_to_concat:
             df = pd.concat(dfs_to_concat, ignore_index=True)
 
-        # Sort values
-        df = df.sort_values(by='Average', ascending=False)
-
         # Add a search bar
         search_query = st.text_input("Search models", "")
 
@@ -158,9 +177,11 @@ def main():
                 "URL": st.column_config.LinkColumn("URL"),
             },
             hide_index=True,
-            height=int(len(df) * 36.2),
+            height=len(df) * 37,
         )
-
+        selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+        comparison_df = df[df['Model'].isin(selected_models)]
+        st.dataframe(comparison_df)
         # Add a button to export data to CSV
         if st.button("Export to CSV"):
             # Export the DataFrame to CSV
@@ -203,27 +224,28 @@ def main():
     with tab2:
         st.markdown('''
        ### Nous benchmark suite
-
+
        Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
-
+
        * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
        * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
        * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
        * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
-
+
        ### Reproducibility
-
+
        You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
-
+
        ### Clone this space
-
+
        You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
-
+
        * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
        * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
-
-        A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations and [CultriX](https://huggingface.co/CultriX) for the CSV export and search bar.
+
+        A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
        ''')
-
+
+# Run the main function if this script is run directly
 if __name__ == "__main__":
     main()
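
Note on the new caching helper: `functools.cache` memoizes a function on its argument values, so repeated rows that point at the same model are answered from memory instead of re-querying the Hub. Because the `api` object is part of the cache key, the cache only hits while the same `HfApi` instance is reused. The sketch below is not part of app.py; it uses a hypothetical `fake_model_info` stand-in purely to illustrate the memoization behaviour.

    from functools import cache

    calls = 0

    @cache
    def fake_model_info(model_id):
        # Stand-in for api.model_info(): count how many "requests" actually run.
        global calls
        calls += 1
        return {"id": model_id, "likes": 42}

    fake_model_info("org/model-a")  # first call: the function body runs
    fake_model_info("org/model-a")  # same argument: served from the cache
    fake_model_info("org/model-b")  # new argument: the function body runs again
    print(calls)                    # -> 2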