error577 commited on
Commit
b730ce2
·
verified ·
1 Parent(s): 05ab4bb

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
- "down_proj",
25
- "q_proj",
26
- "gate_proj",
27
  "up_proj",
28
  "v_proj",
29
- "o_proj"
 
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
 
 
23
  "up_proj",
24
  "v_proj",
25
+ "o_proj",
26
+ "q_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
+ "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e091209793459243110477d9b79453c4bafaa92003a432b92672aef52cf4d29
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c23ee6f81dd496aa2a837a563f66f71607eec837c9c178d4351919ddf89c7c59
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a80d3a702d920c9d1d698eb3308180707a61b26022b22ecdf692f396f752474
3
  size 43122580
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b983c4dc1d54aeb8fa04b4467ff343795c06f02e2408d7abb5bedd3c7ea4498
3
  size 43122580
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c3449066c51062e3580aecd4101da9ad8e106705402d76bc7d67c630b7508d5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1301d8ff77a21126a7a758614a70c01a552c16ba64fe53295c29cdd7bbf62b76
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7362d3d5b178b687d19e464d780e55e8b7b069063a073dfd94da0497a241db57
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e69e2b49ea642509f0c688c16fb190b7cf27dac0a18903a5e2d1467d0343d8b8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,569 +1,393 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.05847953216374269,
5
- "eval_steps": 2,
6
  "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0011695906432748538,
13
- "grad_norm": 8.476790428161621,
14
- "learning_rate": 1.0000000000000001e-07,
15
- "loss": 3.1297,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.0011695906432748538,
20
  "eval_loss": 3.3358547687530518,
21
- "eval_runtime": 14.0872,
22
- "eval_samples_per_second": 6.389,
23
- "eval_steps_per_second": 6.389,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.0023391812865497076,
28
- "grad_norm": 11.907588958740234,
29
- "learning_rate": 2.0000000000000002e-07,
30
- "loss": 3.1984,
31
- "step": 2
32
- },
33
- {
34
- "epoch": 0.0023391812865497076,
35
- "eval_loss": 3.336097478866577,
36
- "eval_runtime": 13.9412,
37
- "eval_samples_per_second": 6.456,
38
- "eval_steps_per_second": 6.456,
39
  "step": 2
40
  },
41
  {
42
- "epoch": 0.0035087719298245615,
43
- "grad_norm": 9.732194900512695,
44
- "learning_rate": 3.0000000000000004e-07,
45
- "loss": 2.9559,
46
  "step": 3
47
  },
48
  {
49
- "epoch": 0.004678362573099415,
50
- "grad_norm": 8.285279273986816,
51
- "learning_rate": 4.0000000000000003e-07,
52
- "loss": 2.7216,
53
- "step": 4
54
- },
55
- {
56
- "epoch": 0.004678362573099415,
57
- "eval_loss": 3.336188793182373,
58
- "eval_runtime": 13.9965,
59
- "eval_samples_per_second": 6.43,
60
- "eval_steps_per_second": 6.43,
61
  "step": 4
62
  },
63
  {
64
- "epoch": 0.005847953216374269,
65
- "grad_norm": 11.178544044494629,
66
- "learning_rate": 5.000000000000001e-07,
67
- "loss": 3.3045,
68
  "step": 5
69
  },
70
  {
71
- "epoch": 0.007017543859649123,
72
- "grad_norm": 31.653640747070312,
73
- "learning_rate": 6.000000000000001e-07,
74
- "loss": 5.7524,
75
- "step": 6
76
- },
77
- {
78
- "epoch": 0.007017543859649123,
79
- "eval_loss": 3.3365745544433594,
80
- "eval_runtime": 14.1369,
81
- "eval_samples_per_second": 6.366,
82
- "eval_steps_per_second": 6.366,
83
  "step": 6
84
  },
85
  {
86
- "epoch": 0.008187134502923977,
87
- "grad_norm": 10.910638809204102,
88
- "learning_rate": 7.000000000000001e-07,
89
- "loss": 3.5631,
90
  "step": 7
91
  },
92
  {
93
- "epoch": 0.00935672514619883,
94
- "grad_norm": 12.292998313903809,
95
- "learning_rate": 8.000000000000001e-07,
96
- "loss": 3.2113,
97
- "step": 8
98
- },
99
- {
100
- "epoch": 0.00935672514619883,
101
- "eval_loss": 3.335761785507202,
102
- "eval_runtime": 14.0918,
103
- "eval_samples_per_second": 6.387,
104
- "eval_steps_per_second": 6.387,
105
  "step": 8
106
  },
107
  {
108
- "epoch": 0.010526315789473684,
109
- "grad_norm": 10.52215576171875,
110
- "learning_rate": 9e-07,
111
- "loss": 3.4784,
112
  "step": 9
113
  },
114
  {
115
- "epoch": 0.011695906432748537,
116
- "grad_norm": 10.834870338439941,
117
- "learning_rate": 1.0000000000000002e-06,
118
- "loss": 3.5036,
119
- "step": 10
120
- },
121
- {
122
- "epoch": 0.011695906432748537,
123
- "eval_loss": 3.335782289505005,
124
- "eval_runtime": 14.2442,
125
- "eval_samples_per_second": 6.318,
126
- "eval_steps_per_second": 6.318,
127
  "step": 10
128
  },
129
  {
130
- "epoch": 0.012865497076023392,
131
- "grad_norm": 14.306832313537598,
132
- "learning_rate": 1.1e-06,
133
- "loss": 3.7758,
134
  "step": 11
135
  },
136
  {
137
- "epoch": 0.014035087719298246,
138
- "grad_norm": 9.94528579711914,
139
- "learning_rate": 1.2000000000000002e-06,
140
- "loss": 3.173,
141
- "step": 12
142
- },
143
- {
144
- "epoch": 0.014035087719298246,
145
- "eval_loss": 3.33567476272583,
146
- "eval_runtime": 14.2012,
147
- "eval_samples_per_second": 6.338,
148
- "eval_steps_per_second": 6.338,
149
  "step": 12
150
  },
151
  {
152
- "epoch": 0.0152046783625731,
153
- "grad_norm": 12.635165214538574,
154
- "learning_rate": 1.3e-06,
155
- "loss": 2.9429,
156
  "step": 13
157
  },
158
  {
159
- "epoch": 0.016374269005847954,
160
- "grad_norm": 10.34188461303711,
161
- "learning_rate": 1.4000000000000001e-06,
162
- "loss": 3.5056,
163
- "step": 14
 
164
  },
165
  {
166
- "epoch": 0.016374269005847954,
167
- "eval_loss": 3.33496356010437,
168
- "eval_runtime": 13.9923,
169
- "eval_samples_per_second": 6.432,
170
- "eval_steps_per_second": 6.432,
171
  "step": 14
172
  },
173
  {
174
- "epoch": 0.017543859649122806,
175
- "grad_norm": 10.396870613098145,
176
- "learning_rate": 1.5e-06,
177
- "loss": 3.1286,
178
  "step": 15
179
  },
180
  {
181
- "epoch": 0.01871345029239766,
182
- "grad_norm": 11.446793556213379,
183
- "learning_rate": 1.6000000000000001e-06,
184
- "loss": 3.5737,
185
- "step": 16
186
- },
187
- {
188
- "epoch": 0.01871345029239766,
189
- "eval_loss": 3.333711862564087,
190
- "eval_runtime": 13.9756,
191
- "eval_samples_per_second": 6.44,
192
- "eval_steps_per_second": 6.44,
193
  "step": 16
194
  },
195
  {
196
- "epoch": 0.019883040935672516,
197
- "grad_norm": 8.924163818359375,
198
- "learning_rate": 1.7000000000000002e-06,
199
- "loss": 3.246,
200
  "step": 17
201
  },
202
  {
203
- "epoch": 0.021052631578947368,
204
- "grad_norm": 12.621112823486328,
205
- "learning_rate": 1.8e-06,
206
- "loss": 3.3298,
207
- "step": 18
208
- },
209
- {
210
- "epoch": 0.021052631578947368,
211
- "eval_loss": 3.332756996154785,
212
- "eval_runtime": 13.9715,
213
- "eval_samples_per_second": 6.442,
214
- "eval_steps_per_second": 6.442,
215
  "step": 18
216
  },
217
  {
218
- "epoch": 0.022222222222222223,
219
- "grad_norm": 16.083580017089844,
220
- "learning_rate": 1.9e-06,
221
- "loss": 3.8307,
222
  "step": 19
223
  },
224
  {
225
- "epoch": 0.023391812865497075,
226
- "grad_norm": 9.164115905761719,
227
- "learning_rate": 2.0000000000000003e-06,
228
- "loss": 3.2996,
229
- "step": 20
230
- },
231
- {
232
- "epoch": 0.023391812865497075,
233
- "eval_loss": 3.3321051597595215,
234
- "eval_runtime": 14.0785,
235
- "eval_samples_per_second": 6.393,
236
- "eval_steps_per_second": 6.393,
237
  "step": 20
238
  },
239
  {
240
- "epoch": 0.02456140350877193,
241
- "grad_norm": 13.98554801940918,
242
- "learning_rate": 2.1000000000000002e-06,
243
- "loss": 3.6964,
244
  "step": 21
245
  },
246
  {
247
- "epoch": 0.025730994152046785,
248
- "grad_norm": 9.490047454833984,
249
- "learning_rate": 2.2e-06,
250
- "loss": 3.5336,
251
- "step": 22
252
- },
253
- {
254
- "epoch": 0.025730994152046785,
255
- "eval_loss": 3.330921173095703,
256
- "eval_runtime": 13.9892,
257
- "eval_samples_per_second": 6.434,
258
- "eval_steps_per_second": 6.434,
259
  "step": 22
260
  },
261
  {
262
- "epoch": 0.026900584795321637,
263
- "grad_norm": 11.965221405029297,
264
- "learning_rate": 2.3e-06,
265
- "loss": 4.1548,
266
  "step": 23
267
  },
268
  {
269
- "epoch": 0.028070175438596492,
270
- "grad_norm": 7.872015953063965,
271
- "learning_rate": 2.4000000000000003e-06,
272
- "loss": 2.6803,
273
  "step": 24
274
  },
275
  {
276
- "epoch": 0.028070175438596492,
277
- "eval_loss": 3.330366373062134,
278
- "eval_runtime": 14.0471,
279
- "eval_samples_per_second": 6.407,
280
- "eval_steps_per_second": 6.407,
281
- "step": 24
282
- },
283
- {
284
- "epoch": 0.029239766081871343,
285
- "grad_norm": 15.018107414245605,
286
- "learning_rate": 2.5e-06,
287
- "loss": 3.3743,
288
  "step": 25
289
  },
290
  {
291
- "epoch": 0.0304093567251462,
292
- "grad_norm": 8.211061477661133,
293
- "learning_rate": 2.6e-06,
294
- "loss": 2.9239,
295
  "step": 26
296
  },
297
  {
298
- "epoch": 0.0304093567251462,
299
- "eval_loss": 3.3289644718170166,
300
- "eval_runtime": 14.0324,
301
- "eval_samples_per_second": 6.414,
302
- "eval_steps_per_second": 6.414,
303
  "step": 26
304
  },
305
  {
306
- "epoch": 0.031578947368421054,
307
- "grad_norm": 9.60824203491211,
308
- "learning_rate": 2.7e-06,
309
- "loss": 2.8382,
310
  "step": 27
311
  },
312
  {
313
- "epoch": 0.03274853801169591,
314
- "grad_norm": 16.03299903869629,
315
- "learning_rate": 2.8000000000000003e-06,
316
- "loss": 3.9005,
317
- "step": 28
318
- },
319
- {
320
- "epoch": 0.03274853801169591,
321
- "eval_loss": 3.3265655040740967,
322
- "eval_runtime": 13.9247,
323
- "eval_samples_per_second": 6.463,
324
- "eval_steps_per_second": 6.463,
325
  "step": 28
326
  },
327
  {
328
- "epoch": 0.03391812865497076,
329
- "grad_norm": 7.8519768714904785,
330
- "learning_rate": 2.9e-06,
331
- "loss": 2.8852,
332
  "step": 29
333
  },
334
  {
335
- "epoch": 0.03508771929824561,
336
- "grad_norm": 11.132136344909668,
337
- "learning_rate": 3e-06,
338
- "loss": 2.6383,
339
- "step": 30
340
- },
341
- {
342
- "epoch": 0.03508771929824561,
343
- "eval_loss": 3.324815273284912,
344
- "eval_runtime": 13.9716,
345
- "eval_samples_per_second": 6.442,
346
- "eval_steps_per_second": 6.442,
347
  "step": 30
348
  },
349
  {
350
- "epoch": 0.03625730994152047,
351
- "grad_norm": 10.680882453918457,
352
- "learning_rate": 3.1e-06,
353
- "loss": 3.8337,
354
  "step": 31
355
  },
356
  {
357
- "epoch": 0.03742690058479532,
358
- "grad_norm": 10.323698043823242,
359
- "learning_rate": 3.2000000000000003e-06,
360
- "loss": 3.2712,
361
- "step": 32
362
- },
363
- {
364
- "epoch": 0.03742690058479532,
365
- "eval_loss": 3.3221709728240967,
366
- "eval_runtime": 14.0736,
367
- "eval_samples_per_second": 6.395,
368
- "eval_steps_per_second": 6.395,
369
  "step": 32
370
  },
371
  {
372
- "epoch": 0.03859649122807018,
373
- "grad_norm": 11.598480224609375,
374
- "learning_rate": 3.3e-06,
375
- "loss": 3.2872,
376
  "step": 33
377
  },
378
  {
379
- "epoch": 0.03976608187134503,
380
- "grad_norm": 11.635000228881836,
381
- "learning_rate": 3.4000000000000005e-06,
382
- "loss": 3.2332,
383
- "step": 34
384
- },
385
- {
386
- "epoch": 0.03976608187134503,
387
- "eval_loss": 3.32070255279541,
388
- "eval_runtime": 14.048,
389
- "eval_samples_per_second": 6.407,
390
- "eval_steps_per_second": 6.407,
391
  "step": 34
392
  },
393
  {
394
- "epoch": 0.04093567251461988,
395
- "grad_norm": 9.629935264587402,
396
- "learning_rate": 3.5000000000000004e-06,
397
- "loss": 3.4841,
398
  "step": 35
399
  },
400
  {
401
- "epoch": 0.042105263157894736,
402
- "grad_norm": 9.597766876220703,
403
- "learning_rate": 3.6e-06,
404
- "loss": 3.2372,
405
  "step": 36
406
  },
407
  {
408
- "epoch": 0.042105263157894736,
409
- "eval_loss": 3.3169195652008057,
410
- "eval_runtime": 14.0422,
411
- "eval_samples_per_second": 6.409,
412
- "eval_steps_per_second": 6.409,
413
- "step": 36
414
- },
415
- {
416
- "epoch": 0.04327485380116959,
417
- "grad_norm": 17.730506896972656,
418
- "learning_rate": 3.7e-06,
419
- "loss": 4.7899,
420
  "step": 37
421
  },
422
  {
423
- "epoch": 0.044444444444444446,
424
- "grad_norm": 10.158425331115723,
425
- "learning_rate": 3.8e-06,
426
- "loss": 3.1066,
427
- "step": 38
428
- },
429
- {
430
- "epoch": 0.044444444444444446,
431
- "eval_loss": 3.313861131668091,
432
- "eval_runtime": 14.0748,
433
- "eval_samples_per_second": 6.394,
434
- "eval_steps_per_second": 6.394,
435
  "step": 38
436
  },
437
  {
438
- "epoch": 0.0456140350877193,
439
- "grad_norm": 71.36988067626953,
440
- "learning_rate": 3.9e-06,
441
- "loss": 3.6712,
442
  "step": 39
443
  },
444
  {
445
- "epoch": 0.04678362573099415,
446
- "grad_norm": 9.100869178771973,
447
- "learning_rate": 4.000000000000001e-06,
448
- "loss": 3.0616,
449
- "step": 40
 
450
  },
451
  {
452
- "epoch": 0.04678362573099415,
453
- "eval_loss": 3.3106253147125244,
454
- "eval_runtime": 14.0735,
455
- "eval_samples_per_second": 6.395,
456
- "eval_steps_per_second": 6.395,
457
  "step": 40
458
  },
459
  {
460
- "epoch": 0.047953216374269005,
461
- "grad_norm": 11.145156860351562,
462
- "learning_rate": 4.1000000000000006e-06,
463
- "loss": 3.4375,
464
  "step": 41
465
  },
466
  {
467
- "epoch": 0.04912280701754386,
468
- "grad_norm": 12.307095527648926,
469
- "learning_rate": 4.2000000000000004e-06,
470
- "loss": 2.689,
471
- "step": 42
472
- },
473
- {
474
- "epoch": 0.04912280701754386,
475
- "eval_loss": 3.3057522773742676,
476
- "eval_runtime": 13.9972,
477
- "eval_samples_per_second": 6.43,
478
- "eval_steps_per_second": 6.43,
479
  "step": 42
480
  },
481
  {
482
- "epoch": 0.050292397660818715,
483
- "grad_norm": 11.216985702514648,
484
- "learning_rate": 4.2999999999999995e-06,
485
- "loss": 2.7962,
486
  "step": 43
487
  },
488
  {
489
- "epoch": 0.05146198830409357,
490
- "grad_norm": 11.059319496154785,
491
- "learning_rate": 4.4e-06,
492
- "loss": 2.7182,
493
- "step": 44
494
- },
495
- {
496
- "epoch": 0.05146198830409357,
497
- "eval_loss": 3.300640821456909,
498
- "eval_runtime": 13.9132,
499
- "eval_samples_per_second": 6.469,
500
- "eval_steps_per_second": 6.469,
501
  "step": 44
502
  },
503
  {
504
- "epoch": 0.05263157894736842,
505
- "grad_norm": 10.133870124816895,
506
- "learning_rate": 4.5e-06,
507
- "loss": 2.9549,
508
  "step": 45
509
  },
510
  {
511
- "epoch": 0.05380116959064327,
512
- "grad_norm": 9.002756118774414,
513
- "learning_rate": 4.6e-06,
514
- "loss": 3.1854,
515
  "step": 46
516
  },
517
  {
518
- "epoch": 0.05380116959064327,
519
- "eval_loss": 3.294647455215454,
520
- "eval_runtime": 13.9772,
521
- "eval_samples_per_second": 6.439,
522
- "eval_steps_per_second": 6.439,
523
- "step": 46
524
- },
525
- {
526
- "epoch": 0.05497076023391813,
527
- "grad_norm": 11.244136810302734,
528
- "learning_rate": 4.7e-06,
529
- "loss": 3.8369,
530
  "step": 47
531
  },
532
  {
533
- "epoch": 0.056140350877192984,
534
- "grad_norm": 9.751367568969727,
535
- "learning_rate": 4.800000000000001e-06,
536
- "loss": 3.5293,
537
- "step": 48
538
- },
539
- {
540
- "epoch": 0.056140350877192984,
541
- "eval_loss": 3.2886362075805664,
542
- "eval_runtime": 13.9805,
543
- "eval_samples_per_second": 6.438,
544
- "eval_steps_per_second": 6.438,
545
  "step": 48
546
  },
547
  {
548
- "epoch": 0.05730994152046784,
549
- "grad_norm": 13.132854461669922,
550
- "learning_rate": 4.9000000000000005e-06,
551
- "loss": 3.4583,
552
  "step": 49
553
  },
554
  {
555
- "epoch": 0.05847953216374269,
556
- "grad_norm": 13.29298210144043,
557
- "learning_rate": 5e-06,
558
- "loss": 3.3806,
559
- "step": 50
560
- },
561
- {
562
- "epoch": 0.05847953216374269,
563
- "eval_loss": 3.2816646099090576,
564
- "eval_runtime": 14.0016,
565
- "eval_samples_per_second": 6.428,
566
- "eval_steps_per_second": 6.428,
567
  "step": 50
568
  }
569
  ],
@@ -571,7 +395,7 @@
571
  "max_steps": 50,
572
  "num_input_tokens_seen": 0,
573
  "num_train_epochs": 1,
574
- "save_steps": 2,
575
  "stateful_callbacks": {
576
  "TrainerControl": {
577
  "args": {
@@ -584,7 +408,7 @@
584
  "attributes": {}
585
  }
586
  },
587
- "total_flos": 1068798207590400.0,
588
  "train_batch_size": 1,
589
  "trial_name": null,
590
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.4678362573099415,
5
+ "eval_steps": 13,
6
  "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.00935672514619883,
13
+ "grad_norm": 4.390414714813232,
14
+ "learning_rate": 2e-05,
15
+ "loss": 3.2142,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.00935672514619883,
20
  "eval_loss": 3.3358547687530518,
21
+ "eval_runtime": 13.9154,
22
+ "eval_samples_per_second": 6.468,
23
+ "eval_steps_per_second": 6.468,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.01871345029239766,
28
+ "grad_norm": 4.702328205108643,
29
+ "learning_rate": 4e-05,
30
+ "loss": 3.3766,
 
 
 
 
 
 
 
 
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.028070175438596492,
35
+ "grad_norm": 3.9439697265625,
36
+ "learning_rate": 6e-05,
37
+ "loss": 3.3938,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.03742690058479532,
42
+ "grad_norm": 3.85864520072937,
43
+ "learning_rate": 8e-05,
44
+ "loss": 3.0785,
 
 
 
 
 
 
 
 
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.04678362573099415,
49
+ "grad_norm": 4.246149063110352,
50
+ "learning_rate": 0.0001,
51
+ "loss": 3.3005,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.056140350877192984,
56
+ "grad_norm": 3.976107597351074,
57
+ "learning_rate": 0.00012,
58
+ "loss": 3.0138,
 
 
 
 
 
 
 
 
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.06549707602339182,
63
+ "grad_norm": 4.311132907867432,
64
+ "learning_rate": 0.00014,
65
+ "loss": 3.0914,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.07485380116959064,
70
+ "grad_norm": 3.888803482055664,
71
+ "learning_rate": 0.00016,
72
+ "loss": 2.9978,
 
 
 
 
 
 
 
 
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.08421052631578947,
77
+ "grad_norm": 3.422027826309204,
78
+ "learning_rate": 0.00018,
79
+ "loss": 2.9429,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.0935672514619883,
84
+ "grad_norm": 3.8653438091278076,
85
+ "learning_rate": 0.0002,
86
+ "loss": 2.7896,
 
 
 
 
 
 
 
 
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.10292397660818714,
91
+ "grad_norm": 3.4567666053771973,
92
+ "learning_rate": 0.0001996917333733128,
93
+ "loss": 2.8054,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.11228070175438597,
98
+ "grad_norm": 3.503476619720459,
99
+ "learning_rate": 0.00019876883405951377,
100
+ "loss": 2.6823,
 
 
 
 
 
 
 
 
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.1216374269005848,
105
+ "grad_norm": 3.275860548019409,
106
+ "learning_rate": 0.00019723699203976766,
107
+ "loss": 2.6355,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.1216374269005848,
112
+ "eval_loss": 2.705251932144165,
113
+ "eval_runtime": 14.0018,
114
+ "eval_samples_per_second": 6.428,
115
+ "eval_steps_per_second": 6.428,
116
+ "step": 13
117
  },
118
  {
119
+ "epoch": 0.13099415204678364,
120
+ "grad_norm": 3.013310432434082,
121
+ "learning_rate": 0.00019510565162951537,
122
+ "loss": 2.7933,
 
123
  "step": 14
124
  },
125
  {
126
+ "epoch": 0.14035087719298245,
127
+ "grad_norm": 2.840459108352661,
128
+ "learning_rate": 0.0001923879532511287,
129
+ "loss": 2.5431,
130
  "step": 15
131
  },
132
  {
133
+ "epoch": 0.1497076023391813,
134
+ "grad_norm": 2.7099273204803467,
135
+ "learning_rate": 0.0001891006524188368,
136
+ "loss": 2.7605,
 
 
 
 
 
 
 
 
137
  "step": 16
138
  },
139
  {
140
+ "epoch": 0.15906432748538013,
141
+ "grad_norm": 3.015941858291626,
142
+ "learning_rate": 0.00018526401643540922,
143
+ "loss": 2.7942,
144
  "step": 17
145
  },
146
  {
147
+ "epoch": 0.16842105263157894,
148
+ "grad_norm": 2.8122756481170654,
149
+ "learning_rate": 0.00018090169943749476,
150
+ "loss": 2.4924,
 
 
 
 
 
 
 
 
151
  "step": 18
152
  },
153
  {
154
+ "epoch": 0.17777777777777778,
155
+ "grad_norm": 3.007946729660034,
156
+ "learning_rate": 0.0001760405965600031,
157
+ "loss": 2.603,
158
  "step": 19
159
  },
160
  {
161
+ "epoch": 0.1871345029239766,
162
+ "grad_norm": 2.713869571685791,
163
+ "learning_rate": 0.00017071067811865476,
164
+ "loss": 2.6412,
 
 
 
 
 
 
 
 
165
  "step": 20
166
  },
167
  {
168
+ "epoch": 0.19649122807017544,
169
+ "grad_norm": 3.0754899978637695,
170
+ "learning_rate": 0.00016494480483301836,
171
+ "loss": 2.7249,
172
  "step": 21
173
  },
174
  {
175
+ "epoch": 0.20584795321637428,
176
+ "grad_norm": 2.7693493366241455,
177
+ "learning_rate": 0.00015877852522924732,
178
+ "loss": 2.4397,
 
 
 
 
 
 
 
 
179
  "step": 22
180
  },
181
  {
182
+ "epoch": 0.2152046783625731,
183
+ "grad_norm": 3.2417685985565186,
184
+ "learning_rate": 0.0001522498564715949,
185
+ "loss": 2.6893,
186
  "step": 23
187
  },
188
  {
189
+ "epoch": 0.22456140350877193,
190
+ "grad_norm": 2.7010700702667236,
191
+ "learning_rate": 0.00014539904997395468,
192
+ "loss": 2.5236,
193
  "step": 24
194
  },
195
  {
196
+ "epoch": 0.23391812865497075,
197
+ "grad_norm": 2.3874123096466064,
198
+ "learning_rate": 0.000138268343236509,
199
+ "loss": 2.6103,
 
 
 
 
 
 
 
 
200
  "step": 25
201
  },
202
  {
203
+ "epoch": 0.2432748538011696,
204
+ "grad_norm": 2.6382222175598145,
205
+ "learning_rate": 0.00013090169943749476,
206
+ "loss": 2.5479,
207
  "step": 26
208
  },
209
  {
210
+ "epoch": 0.2432748538011696,
211
+ "eval_loss": 2.524278163909912,
212
+ "eval_runtime": 14.0548,
213
+ "eval_samples_per_second": 6.404,
214
+ "eval_steps_per_second": 6.404,
215
  "step": 26
216
  },
217
  {
218
+ "epoch": 0.25263157894736843,
219
+ "grad_norm": 2.8567240238189697,
220
+ "learning_rate": 0.00012334453638559057,
221
+ "loss": 2.5996,
222
  "step": 27
223
  },
224
  {
225
+ "epoch": 0.26198830409356727,
226
+ "grad_norm": 2.801514148712158,
227
+ "learning_rate": 0.0001156434465040231,
228
+ "loss": 2.418,
 
 
 
 
 
 
 
 
229
  "step": 28
230
  },
231
  {
232
+ "epoch": 0.27134502923976606,
233
+ "grad_norm": 2.5751523971557617,
234
+ "learning_rate": 0.0001078459095727845,
235
+ "loss": 2.3009,
236
  "step": 29
237
  },
238
  {
239
+ "epoch": 0.2807017543859649,
240
+ "grad_norm": 2.7880630493164062,
241
+ "learning_rate": 0.0001,
242
+ "loss": 2.6674,
 
 
 
 
 
 
 
 
243
  "step": 30
244
  },
245
  {
246
+ "epoch": 0.29005847953216374,
247
+ "grad_norm": 2.797865152359009,
248
+ "learning_rate": 9.215409042721552e-05,
249
+ "loss": 2.2681,
250
  "step": 31
251
  },
252
  {
253
+ "epoch": 0.2994152046783626,
254
+ "grad_norm": 2.4833743572235107,
255
+ "learning_rate": 8.435655349597689e-05,
256
+ "loss": 2.3812,
 
 
 
 
 
 
 
 
257
  "step": 32
258
  },
259
  {
260
+ "epoch": 0.3087719298245614,
261
+ "grad_norm": 2.8263983726501465,
262
+ "learning_rate": 7.66554636144095e-05,
263
+ "loss": 2.3514,
264
  "step": 33
265
  },
266
  {
267
+ "epoch": 0.31812865497076026,
268
+ "grad_norm": 2.8448684215545654,
269
+ "learning_rate": 6.909830056250527e-05,
270
+ "loss": 2.4968,
 
 
 
 
 
 
 
 
271
  "step": 34
272
  },
273
  {
274
+ "epoch": 0.32748538011695905,
275
+ "grad_norm": 2.7989699840545654,
276
+ "learning_rate": 6.173165676349103e-05,
277
+ "loss": 2.3572,
278
  "step": 35
279
  },
280
  {
281
+ "epoch": 0.3368421052631579,
282
+ "grad_norm": 2.8134164810180664,
283
+ "learning_rate": 5.4600950026045326e-05,
284
+ "loss": 2.4871,
285
  "step": 36
286
  },
287
  {
288
+ "epoch": 0.34619883040935673,
289
+ "grad_norm": 2.7917492389678955,
290
+ "learning_rate": 4.7750143528405126e-05,
291
+ "loss": 2.5673,
 
 
 
 
 
 
 
 
292
  "step": 37
293
  },
294
  {
295
+ "epoch": 0.35555555555555557,
296
+ "grad_norm": 2.7606780529022217,
297
+ "learning_rate": 4.12214747707527e-05,
298
+ "loss": 2.4209,
 
 
 
 
 
 
 
 
299
  "step": 38
300
  },
301
  {
302
+ "epoch": 0.3649122807017544,
303
+ "grad_norm": 2.731044054031372,
304
+ "learning_rate": 3.5055195166981645e-05,
305
+ "loss": 2.3921,
306
  "step": 39
307
  },
308
  {
309
+ "epoch": 0.3649122807017544,
310
+ "eval_loss": 2.4570887088775635,
311
+ "eval_runtime": 13.8846,
312
+ "eval_samples_per_second": 6.482,
313
+ "eval_steps_per_second": 6.482,
314
+ "step": 39
315
  },
316
  {
317
+ "epoch": 0.3742690058479532,
318
+ "grad_norm": 2.5024521350860596,
319
+ "learning_rate": 2.9289321881345254e-05,
320
+ "loss": 2.4786,
 
321
  "step": 40
322
  },
323
  {
324
+ "epoch": 0.38362573099415204,
325
+ "grad_norm": 2.737299919128418,
326
+ "learning_rate": 2.3959403439996907e-05,
327
+ "loss": 2.5444,
328
  "step": 41
329
  },
330
  {
331
+ "epoch": 0.3929824561403509,
332
+ "grad_norm": 2.6610891819000244,
333
+ "learning_rate": 1.9098300562505266e-05,
334
+ "loss": 2.3979,
 
 
 
 
 
 
 
 
335
  "step": 42
336
  },
337
  {
338
+ "epoch": 0.4023391812865497,
339
+ "grad_norm": 2.5578110218048096,
340
+ "learning_rate": 1.4735983564590783e-05,
341
+ "loss": 2.5223,
342
  "step": 43
343
  },
344
  {
345
+ "epoch": 0.41169590643274856,
346
+ "grad_norm": 2.6688404083251953,
347
+ "learning_rate": 1.0899347581163221e-05,
348
+ "loss": 2.3434,
 
 
 
 
 
 
 
 
349
  "step": 44
350
  },
351
  {
352
+ "epoch": 0.42105263157894735,
353
+ "grad_norm": 2.8133082389831543,
354
+ "learning_rate": 7.612046748871327e-06,
355
+ "loss": 2.6974,
356
  "step": 45
357
  },
358
  {
359
+ "epoch": 0.4304093567251462,
360
+ "grad_norm": 3.1550958156585693,
361
+ "learning_rate": 4.8943483704846475e-06,
362
+ "loss": 2.8442,
363
  "step": 46
364
  },
365
  {
366
+ "epoch": 0.439766081871345,
367
+ "grad_norm": 2.393209218978882,
368
+ "learning_rate": 2.7630079602323442e-06,
369
+ "loss": 2.3097,
 
 
 
 
 
 
 
 
370
  "step": 47
371
  },
372
  {
373
+ "epoch": 0.44912280701754387,
374
+ "grad_norm": 2.543856382369995,
375
+ "learning_rate": 1.231165940486234e-06,
376
+ "loss": 2.3707,
 
 
 
 
 
 
 
 
377
  "step": 48
378
  },
379
  {
380
+ "epoch": 0.4584795321637427,
381
+ "grad_norm": 2.64045786857605,
382
+ "learning_rate": 3.0826662668720364e-07,
383
+ "loss": 2.6261,
384
  "step": 49
385
  },
386
  {
387
+ "epoch": 0.4678362573099415,
388
+ "grad_norm": 2.683619976043701,
389
+ "learning_rate": 0.0,
390
+ "loss": 2.5441,
 
 
 
 
 
 
 
 
391
  "step": 50
392
  }
393
  ],
 
395
  "max_steps": 50,
396
  "num_input_tokens_seen": 0,
397
  "num_train_epochs": 1,
398
+ "save_steps": 13,
399
  "stateful_callbacks": {
400
  "TrainerControl": {
401
  "args": {
 
408
  "attributes": {}
409
  }
410
  },
411
+ "total_flos": 8696642257551360.0,
412
  "train_batch_size": 1,
413
  "trial_name": null,
414
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98f0f0c46bb8a8aea130ec5fac61dac3c1395e8dd1d272da8feeb25551e173ef
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcda8f02659f8934defef4e265587a384ef2bea0c9e7337b7be092c1ae1125f1
3
  size 6776