File size: 5,758 Bytes
53a2780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|                 Tasks                 |Version|Filter|n-shot|Metric|Value |   |Stderr|
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu                                   |N/A    |none  |     0|acc   |0.5940|±  |0.0039|
| - humanities                          |N/A    |none  |     0|acc   |0.5288|±  |0.0068|
|  - formal_logic                       |      0|none  |     0|acc   |0.4365|±  |0.0444|
|  - high_school_european_history       |      0|none  |     0|acc   |0.6909|±  |0.0361|
|  - high_school_us_history             |      0|none  |     0|acc   |0.7500|±  |0.0304|
|  - high_school_world_history          |      0|none  |     0|acc   |0.8059|±  |0.0257|
|  - international_law                  |      0|none  |     0|acc   |0.7851|±  |0.0375|
|  - jurisprudence                      |      0|none  |     0|acc   |0.7130|±  |0.0437|
|  - logical_fallacies                  |      0|none  |     0|acc   |0.7055|±  |0.0358|
|  - moral_disputes                     |      0|none  |     0|acc   |0.6503|±  |0.0257|
|  - moral_scenarios                    |      0|none  |     0|acc   |0.2380|±  |0.0142|
|  - philosophy                         |      0|none  |     0|acc   |0.6785|±  |0.0265|
|  - prehistory                         |      0|none  |     0|acc   |0.6698|±  |0.0262|
|  - professional_law                   |      0|none  |     0|acc   |0.4478|±  |0.0127|
|  - world_religions                    |      0|none  |     0|acc   |0.7895|±  |0.0313|
| - other                               |N/A    |none  |     0|acc   |0.6827|±  |0.0081|
|  - business_ethics                    |      0|none  |     0|acc   |0.6200|±  |0.0488|
|  - clinical_knowledge                 |      0|none  |     0|acc   |0.7132|±  |0.0278|
|  - college_medicine                   |      0|none  |     0|acc   |0.6185|±  |0.0370|
|  - global_facts                       |      0|none  |     0|acc   |0.3800|±  |0.0488|
|  - human_aging                        |      0|none  |     0|acc   |0.6637|±  |0.0317|
|  - management                         |      0|none  |     0|acc   |0.7864|±  |0.0406|
|  - marketing                          |      0|none  |     0|acc   |0.8333|±  |0.0244|
|  - medical_genetics                   |      0|none  |     0|acc   |0.7800|±  |0.0416|
|  - miscellaneous                      |      0|none  |     0|acc   |0.7791|±  |0.0148|
|  - nutrition                          |      0|none  |     0|acc   |0.7124|±  |0.0259|
|  - professional_accounting            |      0|none  |     0|acc   |0.4681|±  |0.0298|
|  - professional_medicine              |      0|none  |     0|acc   |0.6434|±  |0.0291|
|  - virology                           |      0|none  |     0|acc   |0.5301|±  |0.0389|
| - social_sciences                     |N/A    |none  |     0|acc   |0.6961|±  |0.0081|
|  - econometrics                       |      0|none  |     0|acc   |0.3246|±  |0.0440|
|  - high_school_geography              |      0|none  |     0|acc   |0.7374|±  |0.0314|
|  - high_school_government_and_politics|      0|none  |     0|acc   |0.8238|±  |0.0275|
|  - high_school_macroeconomics         |      0|none  |     0|acc   |0.6179|±  |0.0246|
|  - high_school_microeconomics         |      0|none  |     0|acc   |0.6597|±  |0.0308|
|  - high_school_psychology             |      0|none  |     0|acc   |0.7835|±  |0.0177|
|  - human_sexuality                    |      0|none  |     0|acc   |0.7328|±  |0.0388|
|  - professional_psychology            |      0|none  |     0|acc   |0.6405|±  |0.0194|
|  - public_relations                   |      0|none  |     0|acc   |0.6455|±  |0.0458|
|  - security_studies                   |      0|none  |     0|acc   |0.6857|±  |0.0297|
|  - sociology                          |      0|none  |     0|acc   |0.8109|±  |0.0277|
|  - us_foreign_policy                  |      0|none  |     0|acc   |0.8500|±  |0.0359|
| - stem                                |N/A    |none  |     0|acc   |0.5043|±  |0.0086|
|  - abstract_algebra                   |      0|none  |     0|acc   |0.3000|±  |0.0461|
|  - anatomy                            |      0|none  |     0|acc   |0.6222|±  |0.0419|
|  - astronomy                          |      0|none  |     0|acc   |0.6711|±  |0.0382|
|  - college_biology                    |      0|none  |     0|acc   |0.7361|±  |0.0369|
|  - college_chemistry                  |      0|none  |     0|acc   |0.3900|±  |0.0490|
|  - college_computer_science           |      0|none  |     0|acc   |0.4800|±  |0.0502|
|  - college_mathematics                |      0|none  |     0|acc   |0.3700|±  |0.0485|
|  - college_physics                    |      0|none  |     0|acc   |0.3431|±  |0.0472|
|  - computer_security                  |      0|none  |     0|acc   |0.7100|±  |0.0456|
|  - conceptual_physics                 |      0|none  |     0|acc   |0.5064|±  |0.0327|
|  - electrical_engineering             |      0|none  |     0|acc   |0.5586|±  |0.0414|
|  - elementary_mathematics             |      0|none  |     0|acc   |0.4127|±  |0.0254|
|  - high_school_biology                |      0|none  |     0|acc   |0.7161|±  |0.0256|
|  - high_school_chemistry              |      0|none  |     0|acc   |0.5123|±  |0.0352|
|  - high_school_computer_science       |      0|none  |     0|acc   |0.6200|±  |0.0488|
|  - high_school_mathematics            |      0|none  |     0|acc   |0.3333|±  |0.0287|
|  - high_school_physics                |      0|none  |     0|acc   |0.4106|±  |0.0402|
|  - high_school_statistics             |      0|none  |     0|acc   |0.4907|±  |0.0341|
|  - machine_learning                   |      0|none  |     0|acc   |0.3214|±  |0.0443|