| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.5940|± |0.0039|
| - humanities |N/A |none | 0|acc |0.5288|± |0.0068|
| - formal_logic | 0|none | 0|acc |0.4365|± |0.0444|
| - high_school_european_history | 0|none | 0|acc |0.6909|± |0.0361|
| - high_school_us_history | 0|none | 0|acc |0.7500|± |0.0304|
| - high_school_world_history | 0|none | 0|acc |0.8059|± |0.0257|
| - international_law | 0|none | 0|acc |0.7851|± |0.0375|
| - jurisprudence | 0|none | 0|acc |0.7130|± |0.0437|
| - logical_fallacies | 0|none | 0|acc |0.7055|± |0.0358|
| - moral_disputes | 0|none | 0|acc |0.6503|± |0.0257|
| - moral_scenarios | 0|none | 0|acc |0.2380|± |0.0142|
| - philosophy | 0|none | 0|acc |0.6785|± |0.0265|
| - prehistory | 0|none | 0|acc |0.6698|± |0.0262|
| - professional_law | 0|none | 0|acc |0.4478|± |0.0127|
| - world_religions | 0|none | 0|acc |0.7895|± |0.0313|
| - other |N/A |none | 0|acc |0.6827|± |0.0081|
| - business_ethics | 0|none | 0|acc |0.6200|± |0.0488|
| - clinical_knowledge | 0|none | 0|acc |0.7132|± |0.0278|
| - college_medicine | 0|none | 0|acc |0.6185|± |0.0370|
| - global_facts | 0|none | 0|acc |0.3800|± |0.0488|
| - human_aging | 0|none | 0|acc |0.6637|± |0.0317|
| - management | 0|none | 0|acc |0.7864|± |0.0406|
| - marketing | 0|none | 0|acc |0.8333|± |0.0244|
| - medical_genetics | 0|none | 0|acc |0.7800|± |0.0416|
| - miscellaneous | 0|none | 0|acc |0.7791|± |0.0148|
| - nutrition | 0|none | 0|acc |0.7124|± |0.0259|
| - professional_accounting | 0|none | 0|acc |0.4681|± |0.0298|
| - professional_medicine | 0|none | 0|acc |0.6434|± |0.0291|
| - virology | 0|none | 0|acc |0.5301|± |0.0389|
| - social_sciences |N/A |none | 0|acc |0.6961|± |0.0081|
| - econometrics | 0|none | 0|acc |0.3246|± |0.0440|
| - high_school_geography | 0|none | 0|acc |0.7374|± |0.0314|
| - high_school_government_and_politics| 0|none | 0|acc |0.8238|± |0.0275|
| - high_school_macroeconomics | 0|none | 0|acc |0.6179|± |0.0246|
| - high_school_microeconomics | 0|none | 0|acc |0.6597|± |0.0308|
| - high_school_psychology | 0|none | 0|acc |0.7835|± |0.0177|
| - human_sexuality | 0|none | 0|acc |0.7328|± |0.0388|
| - professional_psychology | 0|none | 0|acc |0.6405|± |0.0194|
| - public_relations | 0|none | 0|acc |0.6455|± |0.0458|
| - security_studies | 0|none | 0|acc |0.6857|± |0.0297|
| - sociology | 0|none | 0|acc |0.8109|± |0.0277|
| - us_foreign_policy | 0|none | 0|acc |0.8500|± |0.0359|
| - stem |N/A |none | 0|acc |0.5043|± |0.0086|
| - abstract_algebra | 0|none | 0|acc |0.3000|± |0.0461|
| - anatomy | 0|none | 0|acc |0.6222|± |0.0419|
| - astronomy | 0|none | 0|acc |0.6711|± |0.0382|
| - college_biology | 0|none | 0|acc |0.7361|± |0.0369|
| - college_chemistry | 0|none | 0|acc |0.3900|± |0.0490|
| - college_computer_science | 0|none | 0|acc |0.4800|± |0.0502|
| - college_mathematics | 0|none | 0|acc |0.3700|± |0.0485|
| - college_physics | 0|none | 0|acc |0.3431|± |0.0472|
| - computer_security | 0|none | 0|acc |0.7100|± |0.0456|
| - conceptual_physics | 0|none | 0|acc |0.5064|± |0.0327|
| - electrical_engineering | 0|none | 0|acc |0.5586|± |0.0414|
| - elementary_mathematics | 0|none | 0|acc |0.4127|± |0.0254|
| - high_school_biology | 0|none | 0|acc |0.7161|± |0.0256|
| - high_school_chemistry | 0|none | 0|acc |0.5123|± |0.0352|
| - high_school_computer_science | 0|none | 0|acc |0.6200|± |0.0488|
| - high_school_mathematics | 0|none | 0|acc |0.3333|± |0.0287|
| - high_school_physics | 0|none | 0|acc |0.4106|± |0.0402|
| - high_school_statistics | 0|none | 0|acc |0.4907|± |0.0341|
| - machine_learning | 0|none | 0|acc |0.3214|± |0.0443|