alexmarques committed on
Commit
4432a30
1 Parent(s): 9064e71

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +82 -21
README.md CHANGED
@@ -155,54 +155,84 @@ This version of the lm-evaluation-harness includes versions of ARC-Challenge, GS
155
  <td><strong>Recovery</strong>
156
  </td>
157
  </tr>
 
 
 
 
 
 
 
 
 
 
158
  <tr>
159
  <td>MMLU-cot (0-shot)
160
  </td>
161
- <td>55.22
162
  </td>
163
- <td>55.28
164
  </td>
165
- <td>100.1%
166
  </td>
167
  </tr>
168
  <tr>
169
  <td>ARC Challenge (0-shot)
170
  </td>
171
- <td>77.39
172
  </td>
173
- <td>76.62
174
  </td>
175
- <td>99.0%
176
  </td>
177
  </tr>
178
  <tr>
179
  <td>GSM-8K-cot (8-shot, strict-match)
180
  </td>
181
- <td>77.56
182
  </td>
183
- <td>76.12
184
  </td>
185
- <td>98.1%
186
  </td>
187
  </tr>
188
  <tr>
189
  <td>Winogrande (5-shot)
190
  </td>
191
- <td>70.2
 
 
 
 
 
 
 
 
 
 
192
  </td>
193
- <td>69.3
194
  </td>
195
- <td>98.7%
196
  </td>
197
  </tr>
 
 
 
 
 
 
 
 
 
 
198
  <tr>
199
  <td><strong>Average</strong>
200
  </td>
201
- <td><strong>70.09</strong>
202
  </td>
203
- <td><strong>69.33</strong>
204
  </td>
205
- <td><strong>98.92%</strong>
206
  </td>
207
  </tr>
208
  </table>
@@ -212,11 +242,23 @@ This version of the lm-evaluation-harness includes versions of ARC-Challenge, GS
212
  The results were obtained using the following commands:
213
 
214
 
215
- #### MMLU-cot
216
  ```
217
  lm_eval \
218
  --model vllm \
219
- --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
 
 
 
 
 
 
 
 
 
 
 
 
220
  --tasks mmlu_cot_0shot_llama_3.1_instruct \
221
  --apply_chat_template \
222
  --num_fewshot 0 \
@@ -227,7 +269,7 @@ lm_eval \
227
  ```
228
  lm_eval \
229
  --model vllm \
230
- --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
231
  --tasks arc_challenge_llama_3.1_instruct \
232
  --apply_chat_template \
233
  --num_fewshot 0 \
@@ -238,21 +280,40 @@ lm_eval \
238
  ```
239
  lm_eval \
240
  --model vllm \
241
- --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
242
  --tasks gsm8k_cot_llama_3.1_instruct \
243
- --apply_chat_template \
244
  --fewshot_as_multiturn \
 
245
  --num_fewshot 8 \
246
  --batch_size auto
247
  ```
248
 
 
 
 
 
 
 
 
 
 
 
249
  #### Winogrande
250
  ```
251
  lm_eval \
252
  --model vllm \
253
- --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
254
  --tasks winogrande \
255
  --num_fewshot 5 \
256
  --batch_size auto
257
  ```
258
 
 
 
 
 
 
 
 
 
 
 
155
  <td><strong>Recovery</strong>
156
  </td>
157
  </tr>
158
+ <tr>
159
+ <td>MMLU (5-shot)
160
+ </td>
161
+ <td>62.98
162
+ </td>
163
+ <td>62.95
164
+ </td>
165
+ <td>100.0%
166
+ </td>
167
+ </tr>
168
  <tr>
169
  <td>MMLU-cot (0-shot)
170
  </td>
171
+ <td>65.40
172
  </td>
173
+ <td>65.23
174
  </td>
175
+ <td>99.7%
176
  </td>
177
  </tr>
178
  <tr>
179
  <td>ARC Challenge (0-shot)
180
  </td>
181
+ <td>77.13
182
  </td>
183
+ <td>76.71
184
  </td>
185
+ <td>99.4%
186
  </td>
187
  </tr>
188
  <tr>
189
  <td>GSM-8K-cot (8-shot, strict-match)
190
  </td>
191
+ <td>77.94
192
  </td>
193
+ <td>76.72
194
  </td>
195
+ <td>98.4%
196
  </td>
197
  </tr>
198
  <tr>
199
  <td>Winogrande (5-shot)
200
  </td>
201
+ <td>71.11
202
+ </td>
203
+ <td>71.11
204
+ </td>
205
+ <td>100.0%
206
+ </td>
207
+ </tr>
208
+ <tr>
209
+ <td>Hellaswag (10-shot)
210
+ </td>
211
+ <td>73.62
212
  </td>
213
+ <td>73.54
214
  </td>
215
+ <td>99.9%
216
  </td>
217
  </tr>
218
+ <tr>
219
+ <td>TruthfulQA (0-shot, mc2)
220
+ </td>
221
+ <td>51.47
222
+ </td>
223
+ <td>51.06
224
+ </td>
225
+ <td>99.2%
226
+ </td>
227
+ </tr>
228
  <tr>
229
  <td><strong>Average</strong>
230
  </td>
231
+ <td><strong>68.52</strong>
232
  </td>
233
+ <td><strong>68.19</strong>
234
  </td>
235
+ <td><strong>99.5%</strong>
236
  </td>
237
  </tr>
238
  </table>
 
242
  The results were obtained using the following commands:
243
 
244
 
245
+ #### MMLU
246
  ```
247
  lm_eval \
248
  --model vllm \
249
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
250
+ --tasks mmlu_llama_3.1_instruct \
251
+ --fewshot_as_multiturn \
252
+ --apply_chat_template \
253
+ --num_fewshot 5 \
254
+ --batch_size auto
255
+ ```
256
+
257
+ #### MMLU-CoT
258
+ ```
259
+ lm_eval \
260
+ --model vllm \
261
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
262
  --tasks mmlu_cot_0shot_llama_3.1_instruct \
263
  --apply_chat_template \
264
  --num_fewshot 0 \
 
269
  ```
270
  lm_eval \
271
  --model vllm \
272
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
273
  --tasks arc_challenge_llama_3.1_instruct \
274
  --apply_chat_template \
275
  --num_fewshot 0 \
 
280
  ```
281
  lm_eval \
282
  --model vllm \
283
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
284
  --tasks gsm8k_cot_llama_3.1_instruct \
 
285
  --fewshot_as_multiturn \
286
+ --apply_chat_template \
287
  --num_fewshot 8 \
288
  --batch_size auto
289
  ```
290
 
291
+ #### Hellaswag
292
+ ```
293
+ lm_eval \
294
+ --model vllm \
295
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
296
+ --tasks hellaswag \
297
+ --num_fewshot 10 \
298
+ --batch_size auto
299
+ ```
300
+
301
  #### Winogrande
302
  ```
303
  lm_eval \
304
  --model vllm \
305
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
306
  --tasks winogrande \
307
  --num_fewshot 5 \
308
  --batch_size auto
309
  ```
310
 
311
+ #### TruthfulQA
312
+ ```
313
+ lm_eval \
314
+ --model vllm \
315
+ --model_args pretrained="neuralmagic/Llama-3.2-3B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
316
+ --tasks truthfulqa \
317
+ --num_fewshot 0 \
318
+ --batch_size auto
319
+ ```