fblgit committed on
Commit
a798e0c
1 Parent(s): c63d063

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +95 -3
README.md CHANGED
@@ -36,14 +36,33 @@ Merges:
36
  - Datasets 2.14.6
37
  - Tokenizers 0.14.1
38
 
39
- ## Evals
 
 
40
 
41
  ```
42
- hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (64)
 
 
 
 
 
 
 
 
 
 
 
43
  | Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
44
  |--------------|-------|------|-----:|------|-----:|---|-----:|
45
  |truthfulqa_mc2|Yaml |none | 0|acc |0.7297|± |0.0149|
46
 
 
 
 
 
 
 
47
  hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (32)
48
  | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
49
  |--------------|-------|------|-----:|----------|-----:|---|-----:|
@@ -63,7 +82,6 @@ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs:
63
  | | |none | 0|acc_norm|0.3568|± |0.0088|
64
  |pubmedqa|Yaml |none | 0|acc |0.5400|± |0.0223|
65
 
66
- 100%|______________________________________________________________________________________________________________________________________________________________________________________| 3256/3256 [50:04<00:00, 1.08it/s]
67
  hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto
68
  | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
69
  |------------------------------------------------------|-------|------|-----:|-----------|-----:|---|-----:|
@@ -99,6 +117,80 @@ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs:
99
  | Groups |Version|Filter|n-shot| Metric |Value| |Stderr|
100
  |-----------|-------|------|-----:|-----------|----:|---|-----:|
101
  |bbh_fewshot|N/A |none | 0|exact_match|0.466|± |0.1771|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  ```
103
 
104
 
 
36
  - Datasets 2.14.6
37
  - Tokenizers 0.14.1
38
 
39
+ ## Evals LM-Evaluation Harness
40
+
41
+ `big-refactor` branch:
42
 
43
  ```
44
+ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 25, batch_size: auto (32)
45
+ | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
46
+ |-------------|-------|------|-----:|--------|-----:|---|-----:|
47
+ |arc_challenge|Yaml |none | 25|acc |0.6954|± |0.0134|
48
+ | | |none | 25|acc_norm|0.7167|± |0.0132|
49
+
50
+ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
51
+ |Tasks|Version| Filter |n-shot| Metric |Value| |Stderr|
52
+ |-----|-------|----------|-----:|-----------|----:|---|-----:|
53
+ |gsm8k|Yaml |get-answer| 5|exact_match|0.671|± |0.0129|
54
+
55
+ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (64)
56
  | Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
57
  |--------------|-------|------|-----:|------|-----:|---|-----:|
58
  |truthfulqa_mc2|Yaml |none | 0|acc |0.7297|± |0.0149|
59
 
60
+ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 10, batch_size: auto (32)
61
+ | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
62
+ |---------|-------|------|-----:|--------|-----:|---|-----:|
63
+ |hellaswag|Yaml |none | 10|acc |0.7091|± |0.0045|
64
+ | | |none | 10|acc_norm|0.8821|± |0.0032|
65
+
66
  hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (32)
67
  | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
68
  |--------------|-------|------|-----:|----------|-----:|---|-----:|
 
82
  | | |none | 0|acc_norm|0.3568|± |0.0088|
83
  |pubmedqa|Yaml |none | 0|acc |0.5400|± |0.0223|
84
 
 
85
  hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto
86
  | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
87
  |------------------------------------------------------|-------|------|-----:|-----------|-----:|---|-----:|
 
117
  | Groups |Version|Filter|n-shot| Metric |Value| |Stderr|
118
  |-----------|-------|------|-----:|-----------|----:|---|-----:|
119
  |bbh_fewshot|N/A |none | 0|exact_match|0.466|± |0.1771|
120
+
121
+ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto (16)
122
+ | Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
123
+ |---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
124
+ |mmlu |N/A |none | 0|acc |0.6513|± |0.1221|
125
+ | - humanities |N/A |none | 5|acc |0.6077|± |0.1185|
126
+ | - formal_logic |Yaml |none | 5|acc |0.4444|± |0.0444|
127
+ | - high_school_european_history |Yaml |none | 5|acc |0.8121|± |0.0305|
128
+ | - high_school_us_history |Yaml |none | 5|acc |0.8431|± |0.0255|
129
+ | - high_school_world_history |Yaml |none | 5|acc |0.8523|± |0.0231|
130
+ | - international_law |Yaml |none | 5|acc |0.7851|± |0.0375|
131
+ | - jurisprudence |Yaml |none | 5|acc |0.7870|± |0.0396|
132
+ | - logical_fallacies |Yaml |none | 5|acc |0.7546|± |0.0338|
133
+ | - moral_disputes |Yaml |none | 5|acc |0.7370|± |0.0237|
134
+ | - moral_scenarios |Yaml |none | 5|acc |0.4101|± |0.0164|
135
+ | - philosophy |Yaml |none | 5|acc |0.7170|± |0.0256|
136
+ | - prehistory |Yaml |none | 5|acc |0.7840|± |0.0229|
137
+ | - professional_law |Yaml |none | 5|acc |0.4941|± |0.0128|
138
+ | - world_religions |Yaml |none | 5|acc |0.7895|± |0.0313|
139
+ | - other |N/A |none | 5|acc |0.7116|± |0.0939|
140
+ | - business_ethics |Yaml |none | 5|acc |0.7600|± |0.0429|
141
+ | - clinical_knowledge |Yaml |none | 5|acc |0.6792|± |0.0287|
142
+ | - college_medicine |Yaml |none | 5|acc |0.6590|± |0.0361|
143
+ | - global_facts |Yaml |none | 5|acc |0.3400|± |0.0476|
144
+ | - human_aging |Yaml |none | 5|acc |0.6816|± |0.0313|
145
+ | - management |Yaml |none | 5|acc |0.8350|± |0.0368|
146
+ | - marketing |Yaml |none | 5|acc |0.8547|± |0.0231|
147
+ | - medical_genetics |Yaml |none | 5|acc |0.7000|± |0.0461|
148
+ | - miscellaneous |Yaml |none | 5|acc |0.8020|± |0.0142|
149
+ | - nutrition |Yaml |none | 5|acc |0.7418|± |0.0251|
150
+ | - professional_accounting |Yaml |none | 5|acc |0.5071|± |0.0298|
151
+ | - professional_medicine |Yaml |none | 5|acc |0.7500|± |0.0263|
152
+ | - virology |Yaml |none | 5|acc |0.5843|± |0.0384|
153
+ | - social_sciences |N/A |none | 5|acc |0.7537|± |0.0681|
154
+ | - econometrics |Yaml |none | 5|acc |0.5000|± |0.0470|
155
+ | - high_school_geography |Yaml |none | 5|acc |0.8586|± |0.0248|
156
+ | - high_school_government_and_politics|Yaml |none | 5|acc |0.9016|± |0.0215|
157
+ | - high_school_macroeconomics |Yaml |none | 5|acc |0.6615|± |0.0240|
158
+ | - high_school_microeconomics |Yaml |none | 5|acc |0.7311|± |0.0288|
159
+ | - high_school_psychology |Yaml |none | 5|acc |0.8404|± |0.0157|
160
+ | - human_sexuality |Yaml |none | 5|acc |0.7328|± |0.0388|
161
+ | - professional_psychology |Yaml |none | 5|acc |0.6814|± |0.0189|
162
+ | - public_relations |Yaml |none | 5|acc |0.6909|± |0.0443|
163
+ | - security_studies |Yaml |none | 5|acc |0.7469|± |0.0278|
164
+ | - sociology |Yaml |none | 5|acc |0.8308|± |0.0265|
165
+ | - us_foreign_policy |Yaml |none | 5|acc |0.8900|± |0.0314|
166
+ | - stem |N/A |none | 5|acc |0.5569|± |0.1380|
167
+ | - abstract_algebra |Yaml |none | 5|acc |0.4100|± |0.0494|
168
+ | - anatomy |Yaml |none | 5|acc |0.6222|± |0.0419|
169
+ | - astronomy |Yaml |none | 5|acc |0.7368|± |0.0358|
170
+ | - college_biology |Yaml |none | 5|acc |0.8056|± |0.0331|
171
+ | - college_chemistry |Yaml |none | 5|acc |0.4700|± |0.0502|
172
+ | - college_computer_science |Yaml |none | 5|acc |0.5100|± |0.0502|
173
+ | - college_mathematics |Yaml |none | 5|acc |0.2800|± |0.0451|
174
+ | - college_physics |Yaml |none | 5|acc |0.3431|± |0.0472|
175
+ | - computer_security |Yaml |none | 5|acc |0.7400|± |0.0441|
176
+ | - conceptual_physics |Yaml |none | 5|acc |0.6340|± |0.0315|
177
+ | - electrical_engineering |Yaml |none | 5|acc |0.6000|± |0.0408|
178
+ | - elementary_mathematics |Yaml |none | 5|acc |0.4815|± |0.0257|
179
+ | - high_school_biology |Yaml |none | 5|acc |0.8032|± |0.0226|
180
+ | - high_school_chemistry |Yaml |none | 5|acc |0.4877|± |0.0352|
181
+ | - high_school_computer_science |Yaml |none | 5|acc |0.7200|± |0.0451|
182
+ | - high_school_mathematics |Yaml |none | 5|acc |0.3815|± |0.0296|
183
+ | - high_school_physics |Yaml |none | 5|acc |0.3576|± |0.0391|
184
+ | - high_school_statistics |Yaml |none | 5|acc |0.5602|± |0.0339|
185
+ | - machine_learning |Yaml |none | 5|acc |0.4643|± |0.0473|
186
+
187
+ | Groups |Version|Filter|n-shot|Metric|Value | |Stderr|
188
+ |------------------|-------|------|-----:|------|-----:|---|-----:|
189
+ |mmlu |N/A |none | 0|acc |0.6513|± |0.1221|
190
+ | - humanities |N/A |none | 5|acc |0.6077|± |0.1185|
191
+ | - other |N/A |none | 5|acc |0.7116|± |0.0939|
192
+ | - social_sciences|N/A |none | 5|acc |0.7537|± |0.0681|
193
+ | - stem |N/A |none | 5|acc |0.5569|± |0.1380|
194
  ```
195
 
196