Update README.md
Browse files
README.md
CHANGED
@@ -36,14 +36,33 @@ Merges:
|
|
36 |
- Datasets 2.14.6
|
37 |
- Tokenizers 0.14.1
|
38 |
|
39 |
-
## Evals
|
|
|
|
|
40 |
|
41 |
```
|
42 |
-
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|
44 |
|--------------|-------|------|-----:|------|-----:|---|-----:|
|
45 |
|truthfulqa_mc2|Yaml |none | 0|acc |0.7297|± |0.0149|
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (32)
|
48 |
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|
49 |
|--------------|-------|------|-----:|----------|-----:|---|-----:|
|
@@ -63,7 +82,6 @@ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs:
|
|
63 |
| | |none | 0|acc_norm|0.3568|± |0.0088|
|
64 |
|pubmedqa|Yaml |none | 0|acc |0.5400|± |0.0223|
|
65 |
|
66 |
-
100%|______________________________________________________________________________________________________________________________________________________________________________________| 3256/3256 [50:04<00:00, 1.08it/s]
|
67 |
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto
|
68 |
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|
69 |
|------------------------------------------------------|-------|------|-----:|-----------|-----:|---|-----:|
|
@@ -99,6 +117,80 @@ hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs:
|
|
99 |
| Groups |Version|Filter|n-shot| Metric |Value| |Stderr|
|
100 |
|-----------|-------|------|-----:|-----------|----:|---|-----:|
|
101 |
|bbh_fewshot|N/A |none | 0|exact_match|0.466|± |0.1771|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
```
|
103 |
|
104 |
|
|
|
36 |
- Datasets 2.14.6
|
37 |
- Tokenizers 0.14.1
|
38 |
|
39 |
+
## Evals (LM-Evaluation Harness)
|
40 |
+
|
41 |
+
`big-refactor` branch:
|
42 |
|
43 |
```
|
44 |
+
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 25, batch_size: auto (32)
|
45 |
+
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|
46 |
+
|-------------|-------|------|-----:|--------|-----:|---|-----:|
|
47 |
+
|arc_challenge|Yaml |none | 25|acc |0.6954|± |0.0134|
|
48 |
+
| | |none | 25|acc_norm|0.7167|± |0.0132|
|
49 |
+
|
50 |
+
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
|
51 |
+
|Tasks|Version| Filter |n-shot| Metric |Value| |Stderr|
|
52 |
+
|-----|-------|----------|-----:|-----------|----:|---|-----:|
|
53 |
+
|gsm8k|Yaml |get-answer| 5|exact_match|0.671|± |0.0129|
|
54 |
+
|
55 |
+
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (64)
|
56 |
| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|
57 |
|--------------|-------|------|-----:|------|-----:|---|-----:|
|
58 |
|truthfulqa_mc2|Yaml |none | 0|acc |0.7297|± |0.0149|
|
59 |
|
60 |
+
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 10, batch_size: auto (32)
|
61 |
+
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|
62 |
+
|---------|-------|------|-----:|--------|-----:|---|-----:|
|
63 |
+
|hellaswag|Yaml |none | 10|acc |0.7091|± |0.0045|
|
64 |
+
| | |none | 10|acc_norm|0.8821|± |0.0032|
|
65 |
+
|
66 |
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto (32)
|
67 |
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|
68 |
|--------------|-------|------|-----:|----------|-----:|---|-----:|
|
|
|
82 |
| | |none | 0|acc_norm|0.3568|± |0.0088|
|
83 |
|pubmedqa|Yaml |none | 0|acc |0.5400|± |0.0223|
|
84 |
|
|
|
85 |
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0,dtype=float16), gen_kwargs: (), limit: None, num_fewshot: 0, batch_size: auto
|
86 |
| Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
|
87 |
|------------------------------------------------------|-------|------|-----:|-----------|-----:|---|-----:|
|
|
|
117 |
| Groups |Version|Filter|n-shot| Metric |Value| |Stderr|
|
118 |
|-----------|-------|------|-----:|-----------|----:|---|-----:|
|
119 |
|bbh_fewshot|N/A |none | 0|exact_match|0.466|± |0.1771|
|
120 |
+
|
121 |
+
hf (pretrained=fblgit/UNA-SOLAR-10.7B-Instruct-v1.0), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto (16)
|
122 |
+
| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|
123 |
+
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|
124 |
+
|mmlu |N/A |none | 0|acc |0.6513|± |0.1221|
|
125 |
+
| - humanities |N/A |none | 5|acc |0.6077|± |0.1185|
|
126 |
+
| - formal_logic |Yaml |none | 5|acc |0.4444|± |0.0444|
|
127 |
+
| - high_school_european_history |Yaml |none | 5|acc |0.8121|± |0.0305|
|
128 |
+
| - high_school_us_history |Yaml |none | 5|acc |0.8431|± |0.0255|
|
129 |
+
| - high_school_world_history |Yaml |none | 5|acc |0.8523|± |0.0231|
|
130 |
+
| - international_law |Yaml |none | 5|acc |0.7851|± |0.0375|
|
131 |
+
| - jurisprudence |Yaml |none | 5|acc |0.7870|± |0.0396|
|
132 |
+
| - logical_fallacies |Yaml |none | 5|acc |0.7546|± |0.0338|
|
133 |
+
| - moral_disputes |Yaml |none | 5|acc |0.7370|± |0.0237|
|
134 |
+
| - moral_scenarios |Yaml |none | 5|acc |0.4101|± |0.0164|
|
135 |
+
| - philosophy |Yaml |none | 5|acc |0.7170|± |0.0256|
|
136 |
+
| - prehistory |Yaml |none | 5|acc |0.7840|± |0.0229|
|
137 |
+
| - professional_law |Yaml |none | 5|acc |0.4941|± |0.0128|
|
138 |
+
| - world_religions |Yaml |none | 5|acc |0.7895|± |0.0313|
|
139 |
+
| - other |N/A |none | 5|acc |0.7116|± |0.0939|
|
140 |
+
| - business_ethics |Yaml |none | 5|acc |0.7600|± |0.0429|
|
141 |
+
| - clinical_knowledge |Yaml |none | 5|acc |0.6792|± |0.0287|
|
142 |
+
| - college_medicine |Yaml |none | 5|acc |0.6590|± |0.0361|
|
143 |
+
| - global_facts |Yaml |none | 5|acc |0.3400|± |0.0476|
|
144 |
+
| - human_aging |Yaml |none | 5|acc |0.6816|± |0.0313|
|
145 |
+
| - management |Yaml |none | 5|acc |0.8350|± |0.0368|
|
146 |
+
| - marketing |Yaml |none | 5|acc |0.8547|± |0.0231|
|
147 |
+
| - medical_genetics |Yaml |none | 5|acc |0.7000|± |0.0461|
|
148 |
+
| - miscellaneous |Yaml |none | 5|acc |0.8020|± |0.0142|
|
149 |
+
| - nutrition |Yaml |none | 5|acc |0.7418|± |0.0251|
|
150 |
+
| - professional_accounting |Yaml |none | 5|acc |0.5071|± |0.0298|
|
151 |
+
| - professional_medicine |Yaml |none | 5|acc |0.7500|± |0.0263|
|
152 |
+
| - virology |Yaml |none | 5|acc |0.5843|± |0.0384|
|
153 |
+
| - social_sciences |N/A |none | 5|acc |0.7537|± |0.0681|
|
154 |
+
| - econometrics |Yaml |none | 5|acc |0.5000|± |0.0470|
|
155 |
+
| - high_school_geography |Yaml |none | 5|acc |0.8586|± |0.0248|
|
156 |
+
| - high_school_government_and_politics|Yaml |none | 5|acc |0.9016|± |0.0215|
|
157 |
+
| - high_school_macroeconomics |Yaml |none | 5|acc |0.6615|± |0.0240|
|
158 |
+
| - high_school_microeconomics |Yaml |none | 5|acc |0.7311|± |0.0288|
|
159 |
+
| - high_school_psychology |Yaml |none | 5|acc |0.8404|± |0.0157|
|
160 |
+
| - human_sexuality |Yaml |none | 5|acc |0.7328|± |0.0388|
|
161 |
+
| - professional_psychology |Yaml |none | 5|acc |0.6814|± |0.0189|
|
162 |
+
| - public_relations |Yaml |none | 5|acc |0.6909|± |0.0443|
|
163 |
+
| - security_studies |Yaml |none | 5|acc |0.7469|± |0.0278|
|
164 |
+
| - sociology |Yaml |none | 5|acc |0.8308|± |0.0265|
|
165 |
+
| - us_foreign_policy |Yaml |none | 5|acc |0.8900|± |0.0314|
|
166 |
+
| - stem |N/A |none | 5|acc |0.5569|± |0.1380|
|
167 |
+
| - abstract_algebra |Yaml |none | 5|acc |0.4100|± |0.0494|
|
168 |
+
| - anatomy |Yaml |none | 5|acc |0.6222|± |0.0419|
|
169 |
+
| - astronomy |Yaml |none | 5|acc |0.7368|± |0.0358|
|
170 |
+
| - college_biology |Yaml |none | 5|acc |0.8056|± |0.0331|
|
171 |
+
| - college_chemistry |Yaml |none | 5|acc |0.4700|± |0.0502|
|
172 |
+
| - college_computer_science |Yaml |none | 5|acc |0.5100|± |0.0502|
|
173 |
+
| - college_mathematics |Yaml |none | 5|acc |0.2800|± |0.0451|
|
174 |
+
| - college_physics |Yaml |none | 5|acc |0.3431|± |0.0472|
|
175 |
+
| - computer_security |Yaml |none | 5|acc |0.7400|± |0.0441|
|
176 |
+
| - conceptual_physics |Yaml |none | 5|acc |0.6340|± |0.0315|
|
177 |
+
| - electrical_engineering |Yaml |none | 5|acc |0.6000|± |0.0408|
|
178 |
+
| - elementary_mathematics |Yaml |none | 5|acc |0.4815|± |0.0257|
|
179 |
+
| - high_school_biology |Yaml |none | 5|acc |0.8032|± |0.0226|
|
180 |
+
| - high_school_chemistry |Yaml |none | 5|acc |0.4877|± |0.0352|
|
181 |
+
| - high_school_computer_science |Yaml |none | 5|acc |0.7200|± |0.0451|
|
182 |
+
| - high_school_mathematics |Yaml |none | 5|acc |0.3815|± |0.0296|
|
183 |
+
| - high_school_physics |Yaml |none | 5|acc |0.3576|± |0.0391|
|
184 |
+
| - high_school_statistics |Yaml |none | 5|acc |0.5602|± |0.0339|
|
185 |
+
| - machine_learning |Yaml |none | 5|acc |0.4643|± |0.0473|
|
186 |
+
|
187 |
+
| Groups |Version|Filter|n-shot|Metric|Value | |Stderr|
|
188 |
+
|------------------|-------|------|-----:|------|-----:|---|-----:|
|
189 |
+
|mmlu |N/A |none | 0|acc |0.6513|± |0.1221|
|
190 |
+
| - humanities |N/A |none | 5|acc |0.6077|± |0.1185|
|
191 |
+
| - other |N/A |none | 5|acc |0.7116|± |0.0939|
|
192 |
+
| - social_sciences|N/A |none | 5|acc |0.7537|± |0.0681|
|
193 |
+
| - stem |N/A |none | 5|acc |0.5569|± |0.1380|
|
194 |
```
|
195 |
|
196 |
|