eduagarcia committed on
Commit
91c6e89
·
1 Parent(s): c8b2c09

change 'proprietary' models to 'external' models and added new models

Browse files
proprietary_models_results.json → external_models_results.json RENAMED
@@ -6,6 +6,7 @@
6
  "date": "2024-04-12",
7
  "status": "full",
8
  "main_language": "Portuguese",
 
9
  "result_metrics": {
10
  "enem_challenge": 0.7172848145556333,
11
  "bluex": 0.5549374130737135,
@@ -27,6 +28,7 @@
27
  "date": "2024-04-13",
28
  "status": "full",
29
  "main_language": "Portuguese",
 
30
  "result_metrics": {
31
  "enem_challenge": 0.8180545836249126,
32
  "bluex": 0.717663421418637,
@@ -48,6 +50,7 @@
48
  "date": "2024-03-08",
49
  "status": "full",
50
  "main_language": "English",
 
51
  "result_metrics": {
52
  "enem_challenge": 0.7214835549335199,
53
  "bluex": 0.6244784422809457,
@@ -69,6 +72,7 @@
69
  "date": "2024-04-13",
70
  "status": "full",
71
  "main_language": "English",
 
72
  "result_metrics": {
73
  "enem_challenge": 0.7718684394681595,
74
  "bluex": 0.6662030598052852,
@@ -90,6 +94,7 @@
90
  "date": "2024-03-08",
91
  "status": "full",
92
  "main_language": "English",
 
93
  "result_metrics": {
94
  "enem_challenge": 0.7130860741777467,
95
  "bluex": 0.5869262865090403,
@@ -111,6 +116,7 @@
111
  "date": "2024-04-15",
112
  "status": "full",
113
  "main_language": "English",
 
114
  "result_metrics": {
115
  "enem_challenge": 0.8509447165850245,
116
  "bluex": 0.7719054242002782,
@@ -132,6 +138,7 @@
132
  "date": "2024-05-18",
133
  "status": "full",
134
  "main_language": "English",
 
135
  "result_metrics": {
136
  "enem_challenge": 0.7844646606018194,
137
  "bluex": 0.6954102920723226,
@@ -153,6 +160,7 @@
153
  "date": "2024-05-18",
154
  "status": "full",
155
  "main_language": "English",
 
156
  "result_metrics": {
157
  "enem_challenge": 0.8264520643806857,
158
  "bluex": 0.7482614742698191,
@@ -166,5 +174,72 @@
166
  },
167
  "result_metrics_average": 0.7914657682594597,
168
  "result_metrics_npm": 0.6834036936130392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  }
170
  ]
 
6
  "date": "2024-04-12",
7
  "status": "full",
8
  "main_language": "Portuguese",
9
+ "model_type": "proprietary",
10
  "result_metrics": {
11
  "enem_challenge": 0.7172848145556333,
12
  "bluex": 0.5549374130737135,
 
28
  "date": "2024-04-13",
29
  "status": "full",
30
  "main_language": "Portuguese",
31
+ "model_type": "proprietary",
32
  "result_metrics": {
33
  "enem_challenge": 0.8180545836249126,
34
  "bluex": 0.717663421418637,
 
50
  "date": "2024-03-08",
51
  "status": "full",
52
  "main_language": "English",
53
+ "model_type": "proprietary",
54
  "result_metrics": {
55
  "enem_challenge": 0.7214835549335199,
56
  "bluex": 0.6244784422809457,
 
72
  "date": "2024-04-13",
73
  "status": "full",
74
  "main_language": "English",
75
+ "model_type": "proprietary",
76
  "result_metrics": {
77
  "enem_challenge": 0.7718684394681595,
78
  "bluex": 0.6662030598052852,
 
94
  "date": "2024-03-08",
95
  "status": "full",
96
  "main_language": "English",
97
+ "model_type": "proprietary",
98
  "result_metrics": {
99
  "enem_challenge": 0.7130860741777467,
100
  "bluex": 0.5869262865090403,
 
116
  "date": "2024-04-15",
117
  "status": "full",
118
  "main_language": "English",
119
+ "model_type": "proprietary",
120
  "result_metrics": {
121
  "enem_challenge": 0.8509447165850245,
122
  "bluex": 0.7719054242002782,
 
138
  "date": "2024-05-18",
139
  "status": "full",
140
  "main_language": "English",
141
+ "model_type": "proprietary",
142
  "result_metrics": {
143
  "enem_challenge": 0.7844646606018194,
144
  "bluex": 0.6954102920723226,
 
160
  "date": "2024-05-18",
161
  "status": "full",
162
  "main_language": "English",
163
+ "model_type": "proprietary",
164
  "result_metrics": {
165
  "enem_challenge": 0.8264520643806857,
166
  "bluex": 0.7482614742698191,
 
174
  },
175
  "result_metrics_average": 0.7914657682594597,
176
  "result_metrics_npm": 0.6834036936130392
177
+ },
178
+ {
179
+ "model": "gemini-1.5-flash",
180
+ "name": "Gemini 1.5 Flash",
181
+ "link": "https://cloud.google.com/vertex-ai",
182
+ "date": "2024-08-09",
183
+ "status": "full",
184
+ "main_language": "English",
185
+ "model_type": "proprietary",
186
+ "result_metrics": {
187
+ "enem_challenge": 0.8306508047585724,
188
+ "bluex": 0.7579972183588317,
189
+ "oab_exams": 0.6446469248291572,
190
+ "assin2_sts": 0.838806085610371,
191
+ "assin2_rte": 0.9366169973822607,
192
+ "faquad_nli": 0.7963910785668922,
193
+ "hatebr_offensive": 0.9092078461170015,
194
+ "portuguese_hate_speech": 0.6932563987219857,
195
+ "tweetsentbr": 0.7312948963367732
196
+ },
197
+ "result_metrics_average": 0.7932075834090939,
198
+ "result_metrics_npm": 0.6855338135928848
199
+ },
200
+ {
201
+ "model": "gpt-4o-mini-2024-07-18",
202
+ "name": "GPT 4o Mini (2024-07-18)",
203
+ "link": "https://www.openai.com/",
204
+ "date": "2024-07-25",
205
+ "status": "full",
206
+ "main_language": "English",
207
+ "model_type": "proprietary",
208
+ "result_metrics": {
209
+ "enem_challenge": 0.7669699090272918,
210
+ "bluex": 0.6842837273991655,
211
+ "oab_exams": 0.6013667425968109,
212
+ "assin2_sts": 0.7259038954527597,
213
+ "assin2_rte": 0.942809846745341,
214
+ "faquad_nli": 0.819807735300693,
215
+ "hatebr_offensive": 0.8682357029532165,
216
+ "portuguese_hate_speech": 0.7501413502853012,
217
+ "tweetsentbr": 0.7509303825869922
218
+ },
219
+ "result_metrics_average": 0.7678276991497301,
220
+ "result_metrics_npm": 0.6595966999910003
221
+ },
222
+ {
223
+ "model": "nemotron-4-340b-instruct",
224
+ "name": "nvidia/Nemotron-4-340B-Instruct (Nvidia API)",
225
+ "link": "https://build.nvidia.com/nvidia/nemotron-4-340b-instruct",
226
+ "date": "2024-06-30",
227
+ "status": "full",
228
+ "main_language": "English",
229
+ "model_type": "chat",
230
+ "params": 340.0,
231
+ "result_metrics": {
232
+ "enem_challenge": 0.6648005598320503,
233
+ "bluex": 0.6578581363004172,
234
+ "oab_exams": 0.7020501138952164,
235
+ "assin2_sts": 0.7857731021403329,
236
+ "assin2_rte": 0.9489354458928496,
237
+ "faquad_nli": 0.8194444444444444,
238
+ "hatebr_offensive": 0.8641580001234928,
239
+ "portuguese_hate_speech": 0.7761835184102864,
240
+ "tweetsentbr": 0.780880021326841
241
+ },
242
+ "result_metrics_average": 0.7777870380406591,
243
+ "result_metrics_npm": 0.6740728488043128
244
  }
245
  ]
src/display/utils.py CHANGED
@@ -166,24 +166,30 @@ human_baseline_row[AutoEvalColumn.npm.name] = round(sum(npm) / len(npm), 2)
166
  if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
167
  human_baseline_row["🤗 Leaderboard Average"] = None
168
 
169
- #Proprietary models
170
- proprietary_rows = []
171
- if os.path.exists('proprietary_models_results.json'):
172
- with open('proprietary_models_results.json', 'r', encoding='utf8') as f:
173
  all_models = json.load(f)
174
  for model_data in all_models:
175
  model_row = deepcopy(baseline_row)
176
  model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
177
  model_row[AutoEvalColumn.dummy.name] = model_data['model']
178
- model_row[AutoEvalColumn.license.name] = "Proprietary"
179
  for task in Tasks:
180
  model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
181
  model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
182
  model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
183
- model_row[AutoEvalColumn.model_type.name] = "proprietary models (closed)"
184
- model_row[AutoEvalColumn.model_type_symbol.name] = "🔒"
 
 
 
 
 
 
 
185
  model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
186
- proprietary_rows.append(model_row)
187
 
188
  @dataclass
189
  class ModelDetails:
 
166
  if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
167
  human_baseline_row["🤗 Leaderboard Average"] = None
168
 
169
+ #External models
170
+ external_rows = []
171
+ if os.path.exists('external_models_results.json'):
172
+ with open('external_models_results.json', 'r', encoding='utf8') as f:
173
  all_models = json.load(f)
174
  for model_data in all_models:
175
  model_row = deepcopy(baseline_row)
176
  model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
177
  model_row[AutoEvalColumn.dummy.name] = model_data['model']
 
178
  for task in Tasks:
179
  model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
180
  model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
181
  model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
182
+
183
+ model_type = ModelType.from_str(model_data['model_type'])
184
+ model_row[AutoEvalColumn.model_type.name] = model_type.name
185
+ model_row[AutoEvalColumn.model_type_symbol.name] = model_type.symbol
186
+ if model_type == ModelType.proprietary:
187
+ model_row[AutoEvalColumn.license.name] = "Proprietary"
188
+ if 'params' in model_data:
189
+ model_row[AutoEvalColumn.params.name] = model_data['params']
190
+
191
  model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
192
+ external_rows.append(model_row)
193
 
194
  @dataclass
195
  class ModelDetails:
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import copy
5
  import pandas as pd
6
 
7
  from src.display.formatting import has_no_nan_values, make_requests_clickable_model
8
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, proprietary_rows
9
  from src.leaderboard.filter_models import filter_models_flags
10
  from src.leaderboard.read_evals import get_raw_eval_results
11
 
@@ -14,8 +14,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
14
  raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
  all_data_json.append(baseline_row)
17
- for proprietary_row in proprietary_rows:
18
- all_data_json.append(proprietary_row)
19
  filter_models_flags(all_data_json)
20
 
21
  df = pd.DataFrame.from_records(all_data_json)
 
5
  import pandas as pd
6
 
7
  from src.display.formatting import has_no_nan_values, make_requests_clickable_model
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, external_rows
9
  from src.leaderboard.filter_models import filter_models_flags
10
  from src.leaderboard.read_evals import get_raw_eval_results
11
 
 
14
  raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
  all_data_json.append(baseline_row)
17
+ for external_row in external_rows:
18
+ all_data_json.append(external_row)
19
  filter_models_flags(all_data_json)
20
 
21
  df = pd.DataFrame.from_records(all_data_json)