ramdhanfirdaus commited on
Commit
6fc059c
1 Parent(s): f75e93f

Training in progress, step 3900, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -216,4 +216,11 @@ The following `bitsandbytes` quantization config was used during training:
216
  ### Framework versions
217
 
218
 
 
 
 
 
 
 
 
219
  - PEFT 0.6.0.dev0
 
216
  ### Framework versions
217
 
218
 
219
+ - PEFT 0.6.0.dev0
220
+ ## Training procedure
221
+
222
+
223
+ ### Framework versions
224
+
225
+
226
  - PEFT 0.6.0.dev0
last-checkpoint/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f94abbb3dadd9d5509333e208387643f054f7063935df921cf742244dcb1378e
3
  size 50349441
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4f2142fbeae4dfbadd1cbea04fd1c45db2c3d583a2629ae7dbc95fef606daf9
3
  size 50349441
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b4208b79f50c1c24b94328cd125469fdbe97c696f5aa8c8821ad36e25844571
3
- size 100693001
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f48ff6733dc97f2bdbdd5ecb48f0be052a4b1b694220408c6889c85dff2a54a3
3
+ size 100691721
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aba10e0017806276ad36c9962a8d3fa89bfabe64354bedff59216e48a02da40c
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71128331774d3b754a22777700176c9e9d829274c21a3111675887d72431a159
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ab644d38b64875fe4b9db6a2dd788efbee5d9e5dadfabf0dd125e8dae428442
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:794a0a668693664bb45fb10418080981a1873b71841f3a55ed72c5aa6d455597
3
  size 627
last-checkpoint/special_tokens_map.json CHANGED
@@ -1,6 +1,24 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
4
  "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
6
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
  "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
  }
last-checkpoint/tokenizer_config.json CHANGED
@@ -13,8 +13,12 @@
13
  "bos_token": "<|endoftext|>",
14
  "clean_up_tokenization_spaces": true,
15
  "eos_token": "<|endoftext|>",
 
16
  "model_max_length": 1024,
17
  "pad_token": "<|endoftext|>",
 
18
  "tokenizer_class": "GPT2Tokenizer",
 
 
19
  "unk_token": "<|endoftext|>"
20
  }
 
13
  "bos_token": "<|endoftext|>",
14
  "clean_up_tokenization_spaces": true,
15
  "eos_token": "<|endoftext|>",
16
+ "max_length": 512,
17
  "model_max_length": 1024,
18
  "pad_token": "<|endoftext|>",
19
+ "stride": 0,
20
  "tokenizer_class": "GPT2Tokenizer",
21
+ "truncation_side": "right",
22
+ "truncation_strategy": "longest_first",
23
  "unk_token": "<|endoftext|>"
24
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.217284917831421,
3
- "best_model_checkpoint": "./outputs/checkpoint-4000",
4
- "epoch": 2.9143897996357016,
5
  "eval_steps": 100,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -151,429 +151,415 @@
151
  {
152
  "epoch": 0.8,
153
  "learning_rate": 0.0002,
154
- "loss": 1.6268,
155
  "step": 1100
156
  },
157
  {
158
  "epoch": 0.8,
159
- "eval_loss": 1.6125303506851196,
160
- "eval_runtime": 144.0648,
161
- "eval_samples_per_second": 43.55,
162
- "eval_steps_per_second": 5.449,
163
  "step": 1100
164
  },
165
  {
166
  "epoch": 0.87,
167
  "learning_rate": 0.0002,
168
- "loss": 1.5911,
169
  "step": 1200
170
  },
171
  {
172
  "epoch": 0.87,
173
- "eval_loss": 1.5925209522247314,
174
- "eval_runtime": 144.042,
175
- "eval_samples_per_second": 43.557,
176
- "eval_steps_per_second": 5.45,
177
  "step": 1200
178
  },
179
  {
180
  "epoch": 0.95,
181
  "learning_rate": 0.0002,
182
- "loss": 1.5986,
183
  "step": 1300
184
  },
185
  {
186
  "epoch": 0.95,
187
- "eval_loss": 1.571681261062622,
188
- "eval_runtime": 144.1746,
189
- "eval_samples_per_second": 43.517,
190
- "eval_steps_per_second": 5.445,
191
  "step": 1300
192
  },
193
  {
194
  "epoch": 1.02,
195
  "learning_rate": 0.0002,
196
- "loss": 1.5514,
197
  "step": 1400
198
  },
199
  {
200
  "epoch": 1.02,
201
- "eval_loss": 1.5524405241012573,
202
- "eval_runtime": 183.3697,
203
- "eval_samples_per_second": 34.215,
204
- "eval_steps_per_second": 4.281,
205
  "step": 1400
206
  },
207
  {
208
  "epoch": 1.09,
209
  "learning_rate": 0.0002,
210
- "loss": 1.5235,
211
  "step": 1500
212
  },
213
  {
214
  "epoch": 1.09,
215
- "eval_loss": 1.534788966178894,
216
- "eval_runtime": 317.3733,
217
- "eval_samples_per_second": 19.769,
218
- "eval_steps_per_second": 2.473,
219
  "step": 1500
220
  },
221
  {
222
  "epoch": 1.17,
223
  "learning_rate": 0.0002,
224
- "loss": 1.5051,
225
  "step": 1600
226
  },
227
  {
228
  "epoch": 1.17,
229
- "eval_loss": 1.517040491104126,
230
- "eval_runtime": 315.5897,
231
- "eval_samples_per_second": 19.88,
232
- "eval_steps_per_second": 2.487,
233
  "step": 1600
234
  },
235
  {
236
  "epoch": 1.24,
237
  "learning_rate": 0.0002,
238
- "loss": 1.5036,
239
  "step": 1700
240
  },
241
  {
242
  "epoch": 1.24,
243
- "eval_loss": 1.500235915184021,
244
- "eval_runtime": 314.1201,
245
- "eval_samples_per_second": 19.973,
246
- "eval_steps_per_second": 2.499,
247
  "step": 1700
248
  },
249
  {
250
  "epoch": 1.31,
251
  "learning_rate": 0.0002,
252
- "loss": 1.4767,
253
  "step": 1800
254
  },
255
  {
256
  "epoch": 1.31,
257
- "eval_loss": 1.4854458570480347,
258
- "eval_runtime": 313.3904,
259
- "eval_samples_per_second": 20.02,
260
- "eval_steps_per_second": 2.505,
261
  "step": 1800
262
  },
263
  {
264
  "epoch": 1.38,
265
  "learning_rate": 0.0002,
266
- "loss": 1.4665,
267
  "step": 1900
268
  },
269
  {
270
  "epoch": 1.38,
271
- "eval_loss": 1.4697930812835693,
272
- "eval_runtime": 314.584,
273
- "eval_samples_per_second": 19.944,
274
- "eval_steps_per_second": 2.495,
275
  "step": 1900
276
  },
277
  {
278
  "epoch": 1.46,
279
  "learning_rate": 0.0002,
280
- "loss": 1.4498,
281
  "step": 2000
282
  },
283
  {
284
  "epoch": 1.46,
285
- "eval_loss": 1.456108808517456,
286
- "eval_runtime": 316.2748,
287
- "eval_samples_per_second": 19.837,
288
- "eval_steps_per_second": 2.482,
289
  "step": 2000
290
  },
291
  {
292
  "epoch": 1.53,
293
  "learning_rate": 0.0002,
294
- "loss": 1.4358,
295
  "step": 2100
296
  },
297
  {
298
  "epoch": 1.53,
299
- "eval_loss": 1.4408069849014282,
300
- "eval_runtime": 317.862,
301
- "eval_samples_per_second": 19.738,
302
- "eval_steps_per_second": 2.47,
303
  "step": 2100
304
  },
305
  {
306
  "epoch": 1.6,
307
  "learning_rate": 0.0002,
308
- "loss": 1.4099,
309
  "step": 2200
310
  },
311
  {
312
  "epoch": 1.6,
313
- "eval_loss": 1.4274669885635376,
314
- "eval_runtime": 319.6918,
315
- "eval_samples_per_second": 19.625,
316
- "eval_steps_per_second": 2.455,
317
  "step": 2200
318
  },
319
  {
320
  "epoch": 1.68,
321
  "learning_rate": 0.0002,
322
- "loss": 1.409,
323
  "step": 2300
324
  },
325
  {
326
  "epoch": 1.68,
327
- "eval_loss": 1.4145855903625488,
328
- "eval_runtime": 320.1441,
329
- "eval_samples_per_second": 19.597,
330
- "eval_steps_per_second": 2.452,
331
  "step": 2300
332
  },
333
  {
334
  "epoch": 1.75,
335
  "learning_rate": 0.0002,
336
- "loss": 1.4082,
337
  "step": 2400
338
  },
339
  {
340
  "epoch": 1.75,
341
- "eval_loss": 1.4013197422027588,
342
- "eval_runtime": 318.5235,
343
- "eval_samples_per_second": 19.697,
344
- "eval_steps_per_second": 2.464,
345
  "step": 2400
346
  },
347
  {
348
  "epoch": 1.82,
349
  "learning_rate": 0.0002,
350
- "loss": 1.3797,
351
  "step": 2500
352
  },
353
  {
354
  "epoch": 1.82,
355
- "eval_loss": 1.388313889503479,
356
- "eval_runtime": 316.5542,
357
- "eval_samples_per_second": 19.82,
358
- "eval_steps_per_second": 2.48,
359
  "step": 2500
360
  },
361
  {
362
  "epoch": 1.89,
363
  "learning_rate": 0.0002,
364
- "loss": 1.37,
365
  "step": 2600
366
  },
367
  {
368
  "epoch": 1.89,
369
- "eval_loss": 1.3754903078079224,
370
- "eval_runtime": 314.9023,
371
- "eval_samples_per_second": 19.924,
372
- "eval_steps_per_second": 2.493,
373
  "step": 2600
374
  },
375
  {
376
  "epoch": 1.97,
377
  "learning_rate": 0.0002,
378
- "loss": 1.3591,
379
  "step": 2700
380
  },
381
  {
382
  "epoch": 1.97,
383
- "eval_loss": 1.361141324043274,
384
- "eval_runtime": 313.2121,
385
- "eval_samples_per_second": 20.031,
386
- "eval_steps_per_second": 2.506,
387
  "step": 2700
388
  },
389
  {
390
  "epoch": 2.04,
391
  "learning_rate": 0.0002,
392
- "loss": 1.3293,
393
  "step": 2800
394
  },
395
  {
396
  "epoch": 2.04,
397
- "eval_loss": 1.35151207447052,
398
- "eval_runtime": 313.6525,
399
- "eval_samples_per_second": 20.003,
400
- "eval_steps_per_second": 2.503,
401
  "step": 2800
402
  },
403
  {
404
  "epoch": 2.11,
405
  "learning_rate": 0.0002,
406
- "loss": 1.3038,
407
  "step": 2900
408
  },
409
  {
410
  "epoch": 2.11,
411
- "eval_loss": 1.3370529413223267,
412
- "eval_runtime": 315.5675,
413
- "eval_samples_per_second": 19.882,
414
- "eval_steps_per_second": 2.488,
415
  "step": 2900
416
  },
417
  {
418
  "epoch": 2.19,
419
  "learning_rate": 0.0002,
420
- "loss": 1.3045,
421
  "step": 3000
422
  },
423
  {
424
  "epoch": 2.19,
425
- "eval_loss": 1.3253566026687622,
426
- "eval_runtime": 317.1772,
427
- "eval_samples_per_second": 19.781,
428
- "eval_steps_per_second": 2.475,
429
  "step": 3000
430
  },
431
  {
432
  "epoch": 2.26,
433
  "learning_rate": 0.0002,
434
- "loss": 1.2858,
435
  "step": 3100
436
  },
437
  {
438
  "epoch": 2.26,
439
- "eval_loss": 1.3158119916915894,
440
- "eval_runtime": 318.7074,
441
- "eval_samples_per_second": 19.686,
442
- "eval_steps_per_second": 2.463,
443
  "step": 3100
444
  },
445
  {
446
  "epoch": 2.33,
447
  "learning_rate": 0.0002,
448
- "loss": 1.2892,
449
  "step": 3200
450
  },
451
  {
452
  "epoch": 2.33,
453
- "eval_loss": 1.3039228916168213,
454
- "eval_runtime": 320.2143,
455
- "eval_samples_per_second": 19.593,
456
- "eval_steps_per_second": 2.451,
457
  "step": 3200
458
  },
459
  {
460
  "epoch": 2.4,
461
  "learning_rate": 0.0002,
462
- "loss": 1.2707,
463
  "step": 3300
464
  },
465
  {
466
  "epoch": 2.4,
467
- "eval_loss": 1.2919011116027832,
468
- "eval_runtime": 319.3459,
469
- "eval_samples_per_second": 19.646,
470
- "eval_steps_per_second": 2.458,
471
  "step": 3300
472
  },
473
  {
474
  "epoch": 2.48,
475
  "learning_rate": 0.0002,
476
- "loss": 1.2513,
477
  "step": 3400
478
  },
479
  {
480
  "epoch": 2.48,
481
- "eval_loss": 1.2831082344055176,
482
- "eval_runtime": 317.9367,
483
- "eval_samples_per_second": 19.733,
484
- "eval_steps_per_second": 2.469,
485
  "step": 3400
486
  },
487
  {
488
  "epoch": 2.55,
489
  "learning_rate": 0.0002,
490
- "loss": 1.2533,
491
  "step": 3500
492
  },
493
  {
494
  "epoch": 2.55,
495
- "eval_loss": 1.2713383436203003,
496
- "eval_runtime": 316.0824,
497
- "eval_samples_per_second": 19.849,
498
- "eval_steps_per_second": 2.484,
499
  "step": 3500
500
  },
501
  {
502
  "epoch": 2.62,
503
  "learning_rate": 0.0002,
504
- "loss": 1.2401,
505
  "step": 3600
506
  },
507
  {
508
  "epoch": 2.62,
509
- "eval_loss": 1.2587928771972656,
510
- "eval_runtime": 314.209,
511
- "eval_samples_per_second": 19.968,
512
- "eval_steps_per_second": 2.498,
513
  "step": 3600
514
  },
515
  {
516
  "epoch": 2.7,
517
  "learning_rate": 0.0002,
518
- "loss": 1.2235,
519
  "step": 3700
520
  },
521
  {
522
  "epoch": 2.7,
523
- "eval_loss": 1.2472882270812988,
524
- "eval_runtime": 313.2806,
525
- "eval_samples_per_second": 20.027,
526
- "eval_steps_per_second": 2.506,
527
  "step": 3700
528
  },
529
  {
530
  "epoch": 2.77,
531
  "learning_rate": 0.0002,
532
- "loss": 1.2008,
533
  "step": 3800
534
  },
535
  {
536
  "epoch": 2.77,
537
- "eval_loss": 1.2382943630218506,
538
- "eval_runtime": 314.1806,
539
- "eval_samples_per_second": 19.969,
540
- "eval_steps_per_second": 2.499,
541
  "step": 3800
542
  },
543
  {
544
  "epoch": 2.84,
545
  "learning_rate": 0.0002,
546
- "loss": 1.2127,
547
  "step": 3900
548
  },
549
  {
550
  "epoch": 2.84,
551
- "eval_loss": 1.2249549627304077,
552
- "eval_runtime": 316.2071,
553
- "eval_samples_per_second": 19.841,
554
- "eval_steps_per_second": 2.483,
555
  "step": 3900
556
- },
557
- {
558
- "epoch": 2.91,
559
- "learning_rate": 0.0002,
560
- "loss": 1.1949,
561
- "step": 4000
562
- },
563
- {
564
- "epoch": 2.91,
565
- "eval_loss": 1.217284917831421,
566
- "eval_runtime": 317.7217,
567
- "eval_samples_per_second": 19.747,
568
- "eval_steps_per_second": 2.471,
569
- "step": 4000
570
  }
571
  ],
572
  "logging_steps": 100,
573
  "max_steps": 4116,
574
  "num_train_epochs": 3,
575
  "save_steps": 100,
576
- "total_flos": 2.3843183295659213e+17,
577
  "trial_name": null,
578
  "trial_params": null
579
  }
 
1
  {
2
+ "best_metric": 1.2214804887771606,
3
+ "best_model_checkpoint": "./outputs/checkpoint-3900",
4
+ "epoch": 2.841530054644809,
5
  "eval_steps": 100,
6
+ "global_step": 3900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
151
  {
152
  "epoch": 0.8,
153
  "learning_rate": 0.0002,
154
+ "loss": 1.6461,
155
  "step": 1100
156
  },
157
  {
158
  "epoch": 0.8,
159
+ "eval_loss": 1.6240431070327759,
160
+ "eval_runtime": 293.5989,
161
+ "eval_samples_per_second": 21.369,
162
+ "eval_steps_per_second": 2.674,
163
  "step": 1100
164
  },
165
  {
166
  "epoch": 0.87,
167
  "learning_rate": 0.0002,
168
+ "loss": 1.5992,
169
  "step": 1200
170
  },
171
  {
172
  "epoch": 0.87,
173
+ "eval_loss": 1.5974311828613281,
174
+ "eval_runtime": 291.7,
175
+ "eval_samples_per_second": 21.508,
176
+ "eval_steps_per_second": 2.691,
177
  "step": 1200
178
  },
179
  {
180
  "epoch": 0.95,
181
  "learning_rate": 0.0002,
182
+ "loss": 1.6021,
183
  "step": 1300
184
  },
185
  {
186
  "epoch": 0.95,
187
+ "eval_loss": 1.5751127004623413,
188
+ "eval_runtime": 289.9524,
189
+ "eval_samples_per_second": 21.638,
190
+ "eval_steps_per_second": 2.707,
191
  "step": 1300
192
  },
193
  {
194
  "epoch": 1.02,
195
  "learning_rate": 0.0002,
196
+ "loss": 1.5538,
197
  "step": 1400
198
  },
199
  {
200
  "epoch": 1.02,
201
+ "eval_loss": 1.5539450645446777,
202
+ "eval_runtime": 287.8748,
203
+ "eval_samples_per_second": 21.794,
204
+ "eval_steps_per_second": 2.727,
205
  "step": 1400
206
  },
207
  {
208
  "epoch": 1.09,
209
  "learning_rate": 0.0002,
210
+ "loss": 1.5249,
211
  "step": 1500
212
  },
213
  {
214
  "epoch": 1.09,
215
+ "eval_loss": 1.5348094701766968,
216
+ "eval_runtime": 287.891,
217
+ "eval_samples_per_second": 21.793,
218
+ "eval_steps_per_second": 2.727,
219
  "step": 1500
220
  },
221
  {
222
  "epoch": 1.17,
223
  "learning_rate": 0.0002,
224
+ "loss": 1.506,
225
  "step": 1600
226
  },
227
  {
228
  "epoch": 1.17,
229
+ "eval_loss": 1.515953540802002,
230
+ "eval_runtime": 289.836,
231
+ "eval_samples_per_second": 21.647,
232
+ "eval_steps_per_second": 2.708,
233
  "step": 1600
234
  },
235
  {
236
  "epoch": 1.24,
237
  "learning_rate": 0.0002,
238
+ "loss": 1.5042,
239
  "step": 1700
240
  },
241
  {
242
  "epoch": 1.24,
243
+ "eval_loss": 1.4988901615142822,
244
+ "eval_runtime": 291.5471,
245
+ "eval_samples_per_second": 21.52,
246
+ "eval_steps_per_second": 2.693,
247
  "step": 1700
248
  },
249
  {
250
  "epoch": 1.31,
251
  "learning_rate": 0.0002,
252
+ "loss": 1.4762,
253
  "step": 1800
254
  },
255
  {
256
  "epoch": 1.31,
257
+ "eval_loss": 1.4844294786453247,
258
+ "eval_runtime": 293.6668,
259
+ "eval_samples_per_second": 21.364,
260
+ "eval_steps_per_second": 2.673,
261
  "step": 1800
262
  },
263
  {
264
  "epoch": 1.38,
265
  "learning_rate": 0.0002,
266
+ "loss": 1.4652,
267
  "step": 1900
268
  },
269
  {
270
  "epoch": 1.38,
271
+ "eval_loss": 1.4694663286209106,
272
+ "eval_runtime": 295.5867,
273
+ "eval_samples_per_second": 21.226,
274
+ "eval_steps_per_second": 2.656,
275
  "step": 1900
276
  },
277
  {
278
  "epoch": 1.46,
279
  "learning_rate": 0.0002,
280
+ "loss": 1.4481,
281
  "step": 2000
282
  },
283
  {
284
  "epoch": 1.46,
285
+ "eval_loss": 1.4534634351730347,
286
+ "eval_runtime": 296.7451,
287
+ "eval_samples_per_second": 21.143,
288
+ "eval_steps_per_second": 2.645,
289
  "step": 2000
290
  },
291
  {
292
  "epoch": 1.53,
293
  "learning_rate": 0.0002,
294
+ "loss": 1.4335,
295
  "step": 2100
296
  },
297
  {
298
  "epoch": 1.53,
299
+ "eval_loss": 1.4383305311203003,
300
+ "eval_runtime": 294.659,
301
+ "eval_samples_per_second": 21.292,
302
+ "eval_steps_per_second": 2.664,
303
  "step": 2100
304
  },
305
  {
306
  "epoch": 1.6,
307
  "learning_rate": 0.0002,
308
+ "loss": 1.4075,
309
  "step": 2200
310
  },
311
  {
312
  "epoch": 1.6,
313
+ "eval_loss": 1.4232139587402344,
314
+ "eval_runtime": 292.737,
315
+ "eval_samples_per_second": 21.432,
316
+ "eval_steps_per_second": 2.682,
317
  "step": 2200
318
  },
319
  {
320
  "epoch": 1.68,
321
  "learning_rate": 0.0002,
322
+ "loss": 1.4059,
323
  "step": 2300
324
  },
325
  {
326
  "epoch": 1.68,
327
+ "eval_loss": 1.411597490310669,
328
+ "eval_runtime": 290.6759,
329
+ "eval_samples_per_second": 21.584,
330
+ "eval_steps_per_second": 2.701,
331
  "step": 2300
332
  },
333
  {
334
  "epoch": 1.75,
335
  "learning_rate": 0.0002,
336
+ "loss": 1.4055,
337
  "step": 2400
338
  },
339
  {
340
  "epoch": 1.75,
341
+ "eval_loss": 1.3974188566207886,
342
+ "eval_runtime": 288.6807,
343
+ "eval_samples_per_second": 21.733,
344
+ "eval_steps_per_second": 2.719,
345
  "step": 2400
346
  },
347
  {
348
  "epoch": 1.82,
349
  "learning_rate": 0.0002,
350
+ "loss": 1.3772,
351
  "step": 2500
352
  },
353
  {
354
  "epoch": 1.82,
355
+ "eval_loss": 1.384261965751648,
356
+ "eval_runtime": 287.1594,
357
+ "eval_samples_per_second": 21.848,
358
+ "eval_steps_per_second": 2.734,
359
  "step": 2500
360
  },
361
  {
362
  "epoch": 1.89,
363
  "learning_rate": 0.0002,
364
+ "loss": 1.3669,
365
  "step": 2600
366
  },
367
  {
368
  "epoch": 1.89,
369
+ "eval_loss": 1.3715009689331055,
370
+ "eval_runtime": 288.6305,
371
+ "eval_samples_per_second": 21.737,
372
+ "eval_steps_per_second": 2.72,
373
  "step": 2600
374
  },
375
  {
376
  "epoch": 1.97,
377
  "learning_rate": 0.0002,
378
+ "loss": 1.3548,
379
  "step": 2700
380
  },
381
  {
382
  "epoch": 1.97,
383
+ "eval_loss": 1.3575737476348877,
384
+ "eval_runtime": 290.5389,
385
+ "eval_samples_per_second": 21.594,
386
+ "eval_steps_per_second": 2.702,
387
  "step": 2700
388
  },
389
  {
390
  "epoch": 2.04,
391
  "learning_rate": 0.0002,
392
+ "loss": 1.3266,
393
  "step": 2800
394
  },
395
  {
396
  "epoch": 2.04,
397
+ "eval_loss": 1.3451271057128906,
398
+ "eval_runtime": 292.4987,
399
+ "eval_samples_per_second": 21.45,
400
+ "eval_steps_per_second": 2.684,
401
  "step": 2800
402
  },
403
  {
404
  "epoch": 2.11,
405
  "learning_rate": 0.0002,
406
+ "loss": 1.3004,
407
  "step": 2900
408
  },
409
  {
410
  "epoch": 2.11,
411
+ "eval_loss": 1.333436369895935,
412
+ "eval_runtime": 294.3881,
413
+ "eval_samples_per_second": 21.312,
414
+ "eval_steps_per_second": 2.667,
415
  "step": 2900
416
  },
417
  {
418
  "epoch": 2.19,
419
  "learning_rate": 0.0002,
420
+ "loss": 1.3009,
421
  "step": 3000
422
  },
423
  {
424
  "epoch": 2.19,
425
+ "eval_loss": 1.3215913772583008,
426
+ "eval_runtime": 296.2165,
427
+ "eval_samples_per_second": 21.18,
428
+ "eval_steps_per_second": 2.65,
429
  "step": 3000
430
  },
431
  {
432
  "epoch": 2.26,
433
  "learning_rate": 0.0002,
434
+ "loss": 1.2822,
435
  "step": 3100
436
  },
437
  {
438
  "epoch": 2.26,
439
+ "eval_loss": 1.311138391494751,
440
+ "eval_runtime": 295.6634,
441
+ "eval_samples_per_second": 21.22,
442
+ "eval_steps_per_second": 2.655,
443
  "step": 3100
444
  },
445
  {
446
  "epoch": 2.33,
447
  "learning_rate": 0.0002,
448
+ "loss": 1.2846,
449
  "step": 3200
450
  },
451
  {
452
  "epoch": 2.33,
453
+ "eval_loss": 1.3013139963150024,
454
+ "eval_runtime": 293.7639,
455
+ "eval_samples_per_second": 21.357,
456
+ "eval_steps_per_second": 2.672,
457
  "step": 3200
458
  },
459
  {
460
  "epoch": 2.4,
461
  "learning_rate": 0.0002,
462
+ "loss": 1.2674,
463
  "step": 3300
464
  },
465
  {
466
  "epoch": 2.4,
467
+ "eval_loss": 1.2875950336456299,
468
+ "eval_runtime": 291.8744,
469
+ "eval_samples_per_second": 21.496,
470
+ "eval_steps_per_second": 2.69,
471
  "step": 3300
472
  },
473
  {
474
  "epoch": 2.48,
475
  "learning_rate": 0.0002,
476
+ "loss": 1.2485,
477
  "step": 3400
478
  },
479
  {
480
  "epoch": 2.48,
481
+ "eval_loss": 1.2778161764144897,
482
+ "eval_runtime": 289.8138,
483
+ "eval_samples_per_second": 21.648,
484
+ "eval_steps_per_second": 2.709,
485
  "step": 3400
486
  },
487
  {
488
  "epoch": 2.55,
489
  "learning_rate": 0.0002,
490
+ "loss": 1.2499,
491
  "step": 3500
492
  },
493
  {
494
  "epoch": 2.55,
495
+ "eval_loss": 1.2662204504013062,
496
+ "eval_runtime": 288.0522,
497
+ "eval_samples_per_second": 21.781,
498
+ "eval_steps_per_second": 2.725,
499
  "step": 3500
500
  },
501
  {
502
  "epoch": 2.62,
503
  "learning_rate": 0.0002,
504
+ "loss": 1.2363,
505
  "step": 3600
506
  },
507
  {
508
  "epoch": 2.62,
509
+ "eval_loss": 1.2541649341583252,
510
+ "eval_runtime": 287.6646,
511
+ "eval_samples_per_second": 21.81,
512
+ "eval_steps_per_second": 2.729,
513
  "step": 3600
514
  },
515
  {
516
  "epoch": 2.7,
517
  "learning_rate": 0.0002,
518
+ "loss": 1.22,
519
  "step": 3700
520
  },
521
  {
522
  "epoch": 2.7,
523
+ "eval_loss": 1.2425366640090942,
524
+ "eval_runtime": 289.5395,
525
+ "eval_samples_per_second": 21.669,
526
+ "eval_steps_per_second": 2.711,
527
  "step": 3700
528
  },
529
  {
530
  "epoch": 2.77,
531
  "learning_rate": 0.0002,
532
+ "loss": 1.1977,
533
  "step": 3800
534
  },
535
  {
536
  "epoch": 2.77,
537
+ "eval_loss": 1.2353510856628418,
538
+ "eval_runtime": 291.5958,
539
+ "eval_samples_per_second": 21.516,
540
+ "eval_steps_per_second": 2.692,
541
  "step": 3800
542
  },
543
  {
544
  "epoch": 2.84,
545
  "learning_rate": 0.0002,
546
+ "loss": 1.2087,
547
  "step": 3900
548
  },
549
  {
550
  "epoch": 2.84,
551
+ "eval_loss": 1.2214804887771606,
552
+ "eval_runtime": 293.3693,
553
+ "eval_samples_per_second": 21.386,
554
+ "eval_steps_per_second": 2.676,
555
  "step": 3900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  }
557
  ],
558
  "logging_steps": 100,
559
  "max_steps": 4116,
560
  "num_train_epochs": 3,
561
  "save_steps": 100,
562
+ "total_flos": 2.324829790449746e+17,
563
  "trial_name": null,
564
  "trial_params": null
565
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:435ef416a520c327b44f1a335ae059bdb8b9a978d39dfecd5bff01684de2670c
3
  size 4155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1ab4a714067324690a64db56d021644d50462360424c147b3e9df6b69650fa0
3
  size 4155