Update README.md
Browse files
README.md
CHANGED
@@ -122,7 +122,71 @@ open llm leaderboard eval datasets and settings
|
|
122 |
|winogrande| 1|none | 5|acc |0.517|± | 0.014|
|
123 |
|hellaswag| 1|none | 10|acc |0.2803|± |0.0045|
|
124 |
| | |none | 10|acc_norm|0.2886|± |0.0045|
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
#### Summary
|
128 |
|
|
|
122 |
|winogrande| 1|none | 5|acc |0.517|± | 0.014|
|
123 |
|hellaswag| 1|none | 10|acc |0.2803|± |0.0045|
|
124 |
| | |none | 10|acc_norm|0.2886|± |0.0045|
|
125 |
+
|gsm8k| 3|strict-match | 5|exact_match|0.0008|± |0.0008|
|
126 |
+
| | |flexible-extract| 5|exact_match|0.0099|± |0.0027|
|
127 |
+
|
128 |
+
#### MMLU
|
129 |
+
|
130 |
+
Unweighted mean accuracy over the 57 MMLU subtasks listed below (5-shot): value, stderr = (0.253980701754386, 0.004428598058450528)
|
131 |
+
| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|
132 |
+
|-----------------------------------|------:|------|-----:|------|-----:|---|-----:|
|
133 |
+
|world_religions | 0|none | 5|acc |0.2222|± |0.0319|
|
134 |
+
|virology | 0|none | 5|acc |0.2711|± |0.0346|
|
135 |
+
|us_foreign_policy | 0|none | 5|acc |0.3300|± |0.0473|
|
136 |
+
|sociology | 0|none | 5|acc |0.2388|± |0.0301|
|
137 |
+
|security_studies | 0|none | 5|acc |0.2367|± |0.0272|
|
138 |
+
|public_relations | 0|none | 5|acc |0.2273|± |0.0401|
|
139 |
+
|professional_psychology | 0|none | 5|acc |0.2484|± |0.0175|
|
140 |
+
|professional_medicine | 0|none | 5|acc |0.4596|± |0.0303|
|
141 |
+
|professional_law | 0|none | 5|acc |0.2464|± |0.0110|
|
142 |
+
|professional_accounting | 0|none | 5|acc |0.2021|± |0.0240|
|
143 |
+
|prehistory | 0|none | 5|acc |0.2130|± |0.0228|
|
144 |
+
|philosophy | 0|none | 5|acc |0.2219|± |0.0236|
|
145 |
+
|nutrition | 0|none | 5|acc |0.2157|± |0.0236|
|
146 |
+
|moral_scenarios | 0|none | 5|acc |0.2380|± |0.0142|
|
147 |
+
|moral_disputes | 0|none | 5|acc |0.2486|± |0.0233|
|
148 |
+
|miscellaneous | 0|none | 5|acc |0.2516|± |0.0155|
|
149 |
+
|medical_genetics | 0|none | 5|acc |0.3000|± |0.0461|
|
150 |
+
|marketing | 0|none | 5|acc |0.2265|± |0.0274|
|
151 |
+
|management | 0|none | 5|acc |0.1748|± |0.0376|
|
152 |
+
|machine_learning | 0|none | 5|acc |0.3125|± |0.0440|
|
153 |
+
|logical_fallacies | 0|none | 5|acc |0.2393|± |0.0335|
|
154 |
+
|jurisprudence | 0|none | 5|acc |0.2315|± |0.0408|
|
155 |
+
|international_law | 0|none | 5|acc |0.3140|± |0.0424|
|
156 |
+
|human_sexuality | 0|none | 5|acc |0.2519|± |0.0381|
|
157 |
+
|human_aging | 0|none | 5|acc |0.3049|± |0.0309|
|
158 |
+
|high_school_world_history | 0|none | 5|acc |0.2658|± |0.0288|
|
159 |
+
|high_school_us_history | 0|none | 5|acc |0.2451|± |0.0302|
|
160 |
+
|high_school_statistics | 0|none | 5|acc |0.4722|± |0.0340|
|
161 |
+
|high_school_psychology | 0|none | 5|acc |0.1963|± |0.0170|
|
162 |
+
|high_school_physics | 0|none | 5|acc |0.3046|± |0.0376|
|
163 |
+
|high_school_microeconomics | 0|none | 5|acc |0.2773|± |0.0291|
|
164 |
+
|high_school_mathematics | 0|none | 5|acc |0.2667|± |0.0270|
|
165 |
+
|high_school_macroeconomics | 0|none | 5|acc |0.2667|± |0.0224|
|
166 |
+
|high_school_government_and_politics| 0|none | 5|acc |0.2591|± |0.0316|
|
167 |
+
|high_school_geography | 0|none | 5|acc |0.2424|± |0.0305|
|
168 |
+
|high_school_european_history | 0|none | 5|acc |0.2242|± |0.0326|
|
169 |
+
|high_school_computer_science | 0|none | 5|acc |0.2800|± |0.0451|
|
170 |
+
|high_school_chemistry | 0|none | 5|acc |0.2857|± |0.0318|
|
171 |
+
|high_school_biology | 0|none | 5|acc |0.3129|± |0.0264|
|
172 |
+
|global_facts | 0|none | 5|acc |0.1500|± |0.0359|
|
173 |
+
|formal_logic | 0|none | 5|acc |0.1905|± |0.0351|
|
174 |
+
|elementary_mathematics | 0|none | 5|acc |0.2513|± |0.0223|
|
175 |
+
|electrical_engineering | 0|none | 5|acc |0.2759|± |0.0372|
|
176 |
+
|econometrics | 0|none | 5|acc |0.2456|± |0.0405|
|
177 |
+
|conceptual_physics | 0|none | 5|acc |0.2638|± |0.0288|
|
178 |
+
|computer_security | 0|none | 5|acc |0.1800|± |0.0386|
|
179 |
+
|college_physics | 0|none | 5|acc |0.2549|± |0.0434|
|
180 |
+
|college_medicine | 0|none | 5|acc |0.2023|± |0.0306|
|
181 |
+
|college_mathematics | 0|none | 5|acc |0.2900|± |0.0456|
|
182 |
+
|college_computer_science | 0|none | 5|acc |0.2700|± |0.0446|
|
183 |
+
|college_chemistry | 0|none | 5|acc |0.2500|± |0.0435|
|
184 |
+
|college_biology | 0|none | 5|acc |0.2222|± |0.0348|
|
185 |
+
|clinical_knowledge | 0|none | 5|acc |0.2377|± |0.0262|
|
186 |
+
|business_ethics | 0|none | 5|acc |0.2100|± |0.0409|
|
187 |
+
|astronomy | 0|none | 5|acc |0.1776|± |0.0311|
|
188 |
+
|anatomy | 0|none | 5|acc |0.2593|± |0.0379|
|
189 |
+
|abstract_algebra | 0|none | 5|acc |0.2200|± |0.0416|
|
190 |
|
191 |
#### Summary
|
192 |
|