File size: 5,758 Bytes
53a2780 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.5940|± |0.0039|
| - humanities |N/A |none | 0|acc |0.5288|± |0.0068|
| - formal_logic | 0|none | 0|acc |0.4365|± |0.0444|
| - high_school_european_history | 0|none | 0|acc |0.6909|± |0.0361|
| - high_school_us_history | 0|none | 0|acc |0.7500|± |0.0304|
| - high_school_world_history | 0|none | 0|acc |0.8059|± |0.0257|
| - international_law | 0|none | 0|acc |0.7851|± |0.0375|
| - jurisprudence | 0|none | 0|acc |0.7130|± |0.0437|
| - logical_fallacies | 0|none | 0|acc |0.7055|± |0.0358|
| - moral_disputes | 0|none | 0|acc |0.6503|± |0.0257|
| - moral_scenarios | 0|none | 0|acc |0.2380|± |0.0142|
| - philosophy | 0|none | 0|acc |0.6785|± |0.0265|
| - prehistory | 0|none | 0|acc |0.6698|± |0.0262|
| - professional_law | 0|none | 0|acc |0.4478|± |0.0127|
| - world_religions | 0|none | 0|acc |0.7895|± |0.0313|
| - other |N/A |none | 0|acc |0.6827|± |0.0081|
| - business_ethics | 0|none | 0|acc |0.6200|± |0.0488|
| - clinical_knowledge | 0|none | 0|acc |0.7132|± |0.0278|
| - college_medicine | 0|none | 0|acc |0.6185|± |0.0370|
| - global_facts | 0|none | 0|acc |0.3800|± |0.0488|
| - human_aging | 0|none | 0|acc |0.6637|± |0.0317|
| - management | 0|none | 0|acc |0.7864|± |0.0406|
| - marketing | 0|none | 0|acc |0.8333|± |0.0244|
| - medical_genetics | 0|none | 0|acc |0.7800|± |0.0416|
| - miscellaneous | 0|none | 0|acc |0.7791|± |0.0148|
| - nutrition | 0|none | 0|acc |0.7124|± |0.0259|
| - professional_accounting | 0|none | 0|acc |0.4681|± |0.0298|
| - professional_medicine | 0|none | 0|acc |0.6434|± |0.0291|
| - virology | 0|none | 0|acc |0.5301|± |0.0389|
| - social_sciences |N/A |none | 0|acc |0.6961|± |0.0081|
| - econometrics | 0|none | 0|acc |0.3246|± |0.0440|
| - high_school_geography | 0|none | 0|acc |0.7374|± |0.0314|
| - high_school_government_and_politics| 0|none | 0|acc |0.8238|± |0.0275|
| - high_school_macroeconomics | 0|none | 0|acc |0.6179|± |0.0246|
| - high_school_microeconomics | 0|none | 0|acc |0.6597|± |0.0308|
| - high_school_psychology | 0|none | 0|acc |0.7835|± |0.0177|
| - human_sexuality | 0|none | 0|acc |0.7328|± |0.0388|
| - professional_psychology | 0|none | 0|acc |0.6405|± |0.0194|
| - public_relations | 0|none | 0|acc |0.6455|± |0.0458|
| - security_studies | 0|none | 0|acc |0.6857|± |0.0297|
| - sociology | 0|none | 0|acc |0.8109|± |0.0277|
| - us_foreign_policy | 0|none | 0|acc |0.8500|± |0.0359|
| - stem |N/A |none | 0|acc |0.5043|± |0.0086|
| - abstract_algebra | 0|none | 0|acc |0.3000|± |0.0461|
| - anatomy | 0|none | 0|acc |0.6222|± |0.0419|
| - astronomy | 0|none | 0|acc |0.6711|± |0.0382|
| - college_biology | 0|none | 0|acc |0.7361|± |0.0369|
| - college_chemistry | 0|none | 0|acc |0.3900|± |0.0490|
| - college_computer_science | 0|none | 0|acc |0.4800|± |0.0502|
| - college_mathematics | 0|none | 0|acc |0.3700|± |0.0485|
| - college_physics | 0|none | 0|acc |0.3431|± |0.0472|
| - computer_security | 0|none | 0|acc |0.7100|± |0.0456|
| - conceptual_physics | 0|none | 0|acc |0.5064|± |0.0327|
| - electrical_engineering | 0|none | 0|acc |0.5586|± |0.0414|
| - elementary_mathematics | 0|none | 0|acc |0.4127|± |0.0254|
| - high_school_biology | 0|none | 0|acc |0.7161|± |0.0256|
| - high_school_chemistry | 0|none | 0|acc |0.5123|± |0.0352|
| - high_school_computer_science | 0|none | 0|acc |0.6200|± |0.0488|
| - high_school_mathematics | 0|none | 0|acc |0.3333|± |0.0287|
| - high_school_physics | 0|none | 0|acc |0.4106|± |0.0402|
| - high_school_statistics | 0|none | 0|acc |0.4907|± |0.0341|
| - machine_learning | 0|none | 0|acc |0.3214|± |0.0443|
|