crumb committed on
Commit
1cabb81
1 Parent(s): 4f18efa

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +65 -1
README.md CHANGED
@@ -122,7 +122,71 @@ open llm leaderboard eval datasets and settings
122
  |winogrande| 1|none | 5|acc |0.517|± | 0.014|
123
  |hellaswag| 1|none | 10|acc |0.2803|± |0.0045|
124
  | | |none | 10|acc_norm|0.2886|± |0.0045|
125
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  #### Summary
128
 
 
122
  |winogrande| 1|none | 5|acc |0.517|± | 0.014|
123
  |hellaswag| 1|none | 10|acc |0.2803|± |0.0045|
124
  | | |none | 10|acc_norm|0.2886|± |0.0045|
125
+ |gsm8k| 3|strict-match | 5|exact_match|0.0008|± |0.0008|
126
+ | | |flexible-extract| 5|exact_match|0.0099|± |0.0027|
127
+
128
+ #### MMLU
129
+
130
+ value, stderr = (0.253980701754386, 0.004428598058450528)
131
+ | Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
132
+ |-----------------------------------|------:|------|-----:|------|-----:|---|-----:|
133
+ |world_religions | 0|none | 5|acc |0.2222|± |0.0319|
134
+ |virology | 0|none | 5|acc |0.2711|± |0.0346|
135
+ |us_foreign_policy | 0|none | 5|acc |0.3300|± |0.0473|
136
+ |sociology | 0|none | 5|acc |0.2388|± |0.0301|
137
+ |security_studies | 0|none | 5|acc |0.2367|± |0.0272|
138
+ |public_relations | 0|none | 5|acc |0.2273|± |0.0401|
139
+ |professional_psychology | 0|none | 5|acc |0.2484|± |0.0175|
140
+ |professional_medicine | 0|none | 5|acc |0.4596|± |0.0303|
141
+ |professional_law | 0|none | 5|acc |0.2464|± |0.0110|
142
+ |professional_accounting | 0|none | 5|acc |0.2021|± |0.0240|
143
+ |prehistory | 0|none | 5|acc |0.2130|± |0.0228|
144
+ |philosophy | 0|none | 5|acc |0.2219|± |0.0236|
145
+ |nutrition | 0|none | 5|acc |0.2157|± |0.0236|
146
+ |moral_scenarios | 0|none | 5|acc |0.2380|± |0.0142|
147
+ |moral_disputes | 0|none | 5|acc |0.2486|± |0.0233|
148
+ |miscellaneous | 0|none | 5|acc |0.2516|± |0.0155|
149
+ |medical_genetics | 0|none | 5|acc |0.3000|± |0.0461|
150
+ |marketing | 0|none | 5|acc |0.2265|± |0.0274|
151
+ |management | 0|none | 5|acc |0.1748|± |0.0376|
152
+ |machine_learning | 0|none | 5|acc |0.3125|± |0.0440|
153
+ |logical_fallacies | 0|none | 5|acc |0.2393|± |0.0335|
154
+ |jurisprudence | 0|none | 5|acc |0.2315|± |0.0408|
155
+ |international_law | 0|none | 5|acc |0.3140|± |0.0424|
156
+ |human_sexuality | 0|none | 5|acc |0.2519|± |0.0381|
157
+ |human_aging | 0|none | 5|acc |0.3049|± |0.0309|
158
+ |high_school_world_history | 0|none | 5|acc |0.2658|± |0.0288|
159
+ |high_school_us_history | 0|none | 5|acc |0.2451|± |0.0302|
160
+ |high_school_statistics | 0|none | 5|acc |0.4722|± |0.0340|
161
+ |high_school_psychology | 0|none | 5|acc |0.1963|± |0.0170|
162
+ |high_school_physics | 0|none | 5|acc |0.3046|± |0.0376|
163
+ |high_school_microeconomics | 0|none | 5|acc |0.2773|± |0.0291|
164
+ |high_school_mathematics | 0|none | 5|acc |0.2667|± |0.0270|
165
+ |high_school_macroeconomics | 0|none | 5|acc |0.2667|± |0.0224|
166
+ |high_school_government_and_politics| 0|none | 5|acc |0.2591|± |0.0316|
167
+ |high_school_geography | 0|none | 5|acc |0.2424|± |0.0305|
168
+ |high_school_european_history | 0|none | 5|acc |0.2242|± |0.0326|
169
+ |high_school_computer_science | 0|none | 5|acc |0.2800|± |0.0451|
170
+ |high_school_chemistry | 0|none | 5|acc |0.2857|± |0.0318|
171
+ |high_school_biology | 0|none | 5|acc |0.3129|± |0.0264|
172
+ |global_facts | 0|none | 5|acc |0.1500|± |0.0359|
173
+ |formal_logic | 0|none | 5|acc |0.1905|± |0.0351|
174
+ |elementary_mathematics | 0|none | 5|acc |0.2513|± |0.0223|
175
+ |electrical_engineering | 0|none | 5|acc |0.2759|± |0.0372|
176
+ |econometrics | 0|none | 5|acc |0.2456|± |0.0405|
177
+ |conceptual_physics | 0|none | 5|acc |0.2638|± |0.0288|
178
+ |computer_security | 0|none | 5|acc |0.1800|± |0.0386|
179
+ |college_physics | 0|none | 5|acc |0.2549|± |0.0434|
180
+ |college_medicine | 0|none | 5|acc |0.2023|± |0.0306|
181
+ |college_mathematics | 0|none | 5|acc |0.2900|± |0.0456|
182
+ |college_computer_science | 0|none | 5|acc |0.2700|± |0.0446|
183
+ |college_chemistry | 0|none | 5|acc |0.2500|± |0.0435|
184
+ |college_biology | 0|none | 5|acc |0.2222|± |0.0348|
185
+ |clinical_knowledge | 0|none | 5|acc |0.2377|± |0.0262|
186
+ |business_ethics | 0|none | 5|acc |0.2100|± |0.0409|
187
+ |astronomy | 0|none | 5|acc |0.1776|± |0.0311|
188
+ |anatomy | 0|none | 5|acc |0.2593|± |0.0379|
189
+ |abstract_algebra | 0|none | 5|acc |0.2200|± |0.0416|
190
 
191
  #### Summary
192