DeepMount00 commited on
Commit
2cf980c
·
verified ·
1 Parent(s): 125a4bc

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -1365
trainer_state.json DELETED
@@ -1,1365 +0,0 @@
1
- {
2
- "best_metric": 1.2660380601882935,
3
- "best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-900",
4
- "epoch": 0.1152,
5
- "eval_steps": 100,
6
- "global_step": 900,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.00064,
13
- "grad_norm": 7.650606632232666,
14
- "learning_rate": 1.0000000000000002e-06,
15
- "loss": 6.29,
16
- "step": 5
17
- },
18
- {
19
- "epoch": 0.00128,
20
- "grad_norm": 4.541823387145996,
21
- "learning_rate": 2.0000000000000003e-06,
22
- "loss": 6.3815,
23
- "step": 10
24
- },
25
- {
26
- "epoch": 0.00192,
27
- "grad_norm": 4.245054721832275,
28
- "learning_rate": 3e-06,
29
- "loss": 6.2854,
30
- "step": 15
31
- },
32
- {
33
- "epoch": 0.00256,
34
- "grad_norm": 4.5587897300720215,
35
- "learning_rate": 4.000000000000001e-06,
36
- "loss": 6.0674,
37
- "step": 20
38
- },
39
- {
40
- "epoch": 0.0032,
41
- "grad_norm": 3.7703804969787598,
42
- "learning_rate": 4.800000000000001e-06,
43
- "loss": 6.2961,
44
- "step": 25
45
- },
46
- {
47
- "epoch": 0.00384,
48
- "grad_norm": 3.8425862789154053,
49
- "learning_rate": 5.8e-06,
50
- "loss": 6.3326,
51
- "step": 30
52
- },
53
- {
54
- "epoch": 0.00448,
55
- "grad_norm": 4.413463115692139,
56
- "learning_rate": 6.800000000000001e-06,
57
- "loss": 6.183,
58
- "step": 35
59
- },
60
- {
61
- "epoch": 0.00512,
62
- "grad_norm": 4.1980509757995605,
63
- "learning_rate": 7.800000000000002e-06,
64
- "loss": 6.2654,
65
- "step": 40
66
- },
67
- {
68
- "epoch": 0.00576,
69
- "grad_norm": 3.9166719913482666,
70
- "learning_rate": 8.8e-06,
71
- "loss": 6.0916,
72
- "step": 45
73
- },
74
- {
75
- "epoch": 0.0064,
76
- "grad_norm": 3.4706904888153076,
77
- "learning_rate": 9.800000000000001e-06,
78
- "loss": 6.103,
79
- "step": 50
80
- },
81
- {
82
- "epoch": 0.00704,
83
- "grad_norm": 5.138203144073486,
84
- "learning_rate": 9.999998372356185e-06,
85
- "loss": 6.2379,
86
- "step": 55
87
- },
88
- {
89
- "epoch": 0.00768,
90
- "grad_norm": 3.7806520462036133,
91
- "learning_rate": 9.999991760055e-06,
92
- "loss": 6.1776,
93
- "step": 60
94
- },
95
- {
96
- "epoch": 0.00832,
97
- "grad_norm": 3.5731871128082275,
98
- "learning_rate": 9.999980061375427e-06,
99
- "loss": 6.2082,
100
- "step": 65
101
- },
102
- {
103
- "epoch": 0.00896,
104
- "grad_norm": 3.661797285079956,
105
- "learning_rate": 9.999963276329369e-06,
106
- "loss": 6.0704,
107
- "step": 70
108
- },
109
- {
110
- "epoch": 0.0096,
111
- "grad_norm": 3.6181113719940186,
112
- "learning_rate": 9.999941404933902e-06,
113
- "loss": 6.2081,
114
- "step": 75
115
- },
116
- {
117
- "epoch": 0.01024,
118
- "grad_norm": 3.3162803649902344,
119
- "learning_rate": 9.99991444721127e-06,
120
- "loss": 5.8807,
121
- "step": 80
122
- },
123
- {
124
- "epoch": 0.01088,
125
- "grad_norm": 3.6022472381591797,
126
- "learning_rate": 9.999882403188902e-06,
127
- "loss": 6.1092,
128
- "step": 85
129
- },
130
- {
131
- "epoch": 0.01152,
132
- "grad_norm": 7.291418552398682,
133
- "learning_rate": 9.999845272899393e-06,
134
- "loss": 5.7668,
135
- "step": 90
136
- },
137
- {
138
- "epoch": 0.01216,
139
- "grad_norm": 3.522437810897827,
140
- "learning_rate": 9.999803056380517e-06,
141
- "loss": 6.1621,
142
- "step": 95
143
- },
144
- {
145
- "epoch": 0.0128,
146
- "grad_norm": 3.9014439582824707,
147
- "learning_rate": 9.999755753675216e-06,
148
- "loss": 6.0573,
149
- "step": 100
150
- },
151
- {
152
- "epoch": 0.0128,
153
- "eval_loss": 1.5072969198226929,
154
- "eval_runtime": 11.1161,
155
- "eval_samples_per_second": 89.96,
156
- "eval_steps_per_second": 11.245,
157
- "step": 100
158
- },
159
- {
160
- "epoch": 0.01344,
161
- "grad_norm": 3.7579081058502197,
162
- "learning_rate": 9.999703364831614e-06,
163
- "loss": 6.1671,
164
- "step": 105
165
- },
166
- {
167
- "epoch": 0.01408,
168
- "grad_norm": 3.7058262825012207,
169
- "learning_rate": 9.999645889903002e-06,
170
- "loss": 6.1348,
171
- "step": 110
172
- },
173
- {
174
- "epoch": 0.01472,
175
- "grad_norm": 5.018667697906494,
176
- "learning_rate": 9.99958332894785e-06,
177
- "loss": 5.9376,
178
- "step": 115
179
- },
180
- {
181
- "epoch": 0.01536,
182
- "grad_norm": 3.5420188903808594,
183
- "learning_rate": 9.999515682029798e-06,
184
- "loss": 5.9961,
185
- "step": 120
186
- },
187
- {
188
- "epoch": 0.016,
189
- "grad_norm": 3.5725393295288086,
190
- "learning_rate": 9.999442949217663e-06,
191
- "loss": 5.8439,
192
- "step": 125
193
- },
194
- {
195
- "epoch": 0.01664,
196
- "grad_norm": 3.8440959453582764,
197
- "learning_rate": 9.999365130585435e-06,
198
- "loss": 5.7857,
199
- "step": 130
200
- },
201
- {
202
- "epoch": 0.01728,
203
- "grad_norm": 3.4371285438537598,
204
- "learning_rate": 9.999282226212276e-06,
205
- "loss": 5.799,
206
- "step": 135
207
- },
208
- {
209
- "epoch": 0.01792,
210
- "grad_norm": 3.996847152709961,
211
- "learning_rate": 9.999194236182523e-06,
212
- "loss": 6.0022,
213
- "step": 140
214
- },
215
- {
216
- "epoch": 0.01856,
217
- "grad_norm": 3.720330238342285,
218
- "learning_rate": 9.999101160585687e-06,
219
- "loss": 5.925,
220
- "step": 145
221
- },
222
- {
223
- "epoch": 0.0192,
224
- "grad_norm": 3.8822953701019287,
225
- "learning_rate": 9.99900299951645e-06,
226
- "loss": 5.8085,
227
- "step": 150
228
- },
229
- {
230
- "epoch": 0.01984,
231
- "grad_norm": 3.599283456802368,
232
- "learning_rate": 9.99889975307467e-06,
233
- "loss": 5.6533,
234
- "step": 155
235
- },
236
- {
237
- "epoch": 0.02048,
238
- "grad_norm": 3.4847381114959717,
239
- "learning_rate": 9.998791421365376e-06,
240
- "loss": 5.9021,
241
- "step": 160
242
- },
243
- {
244
- "epoch": 0.02112,
245
- "grad_norm": 3.4302055835723877,
246
- "learning_rate": 9.998678004498774e-06,
247
- "loss": 5.962,
248
- "step": 165
249
- },
250
- {
251
- "epoch": 0.02176,
252
- "grad_norm": 4.561929702758789,
253
- "learning_rate": 9.99855950259024e-06,
254
- "loss": 5.9011,
255
- "step": 170
256
- },
257
- {
258
- "epoch": 0.0224,
259
- "grad_norm": 4.069271087646484,
260
- "learning_rate": 9.998435915760323e-06,
261
- "loss": 5.6782,
262
- "step": 175
263
- },
264
- {
265
- "epoch": 0.02304,
266
- "grad_norm": 3.5959055423736572,
267
- "learning_rate": 9.998307244134741e-06,
268
- "loss": 5.8107,
269
- "step": 180
270
- },
271
- {
272
- "epoch": 0.02368,
273
- "grad_norm": 3.5477242469787598,
274
- "learning_rate": 9.998173487844396e-06,
275
- "loss": 5.8335,
276
- "step": 185
277
- },
278
- {
279
- "epoch": 0.02432,
280
- "grad_norm": 4.488218307495117,
281
- "learning_rate": 9.998034647025349e-06,
282
- "loss": 5.8285,
283
- "step": 190
284
- },
285
- {
286
- "epoch": 0.02496,
287
- "grad_norm": 3.555074691772461,
288
- "learning_rate": 9.997890721818844e-06,
289
- "loss": 5.817,
290
- "step": 195
291
- },
292
- {
293
- "epoch": 0.0256,
294
- "grad_norm": 3.6248419284820557,
295
- "learning_rate": 9.99774171237129e-06,
296
- "loss": 5.8368,
297
- "step": 200
298
- },
299
- {
300
- "epoch": 0.0256,
301
- "eval_loss": 1.440572738647461,
302
- "eval_runtime": 6.6468,
303
- "eval_samples_per_second": 150.448,
304
- "eval_steps_per_second": 18.806,
305
- "step": 200
306
- },
307
- {
308
- "epoch": 0.02624,
309
- "grad_norm": 3.432421922683716,
310
- "learning_rate": 9.997587618834272e-06,
311
- "loss": 5.7842,
312
- "step": 205
313
- },
314
- {
315
- "epoch": 0.02688,
316
- "grad_norm": 3.333038806915283,
317
- "learning_rate": 9.997428441364546e-06,
318
- "loss": 5.7173,
319
- "step": 210
320
- },
321
- {
322
- "epoch": 0.02752,
323
- "grad_norm": 3.7716541290283203,
324
- "learning_rate": 9.997264180124038e-06,
325
- "loss": 5.719,
326
- "step": 215
327
- },
328
- {
329
- "epoch": 0.02816,
330
- "grad_norm": 3.345600128173828,
331
- "learning_rate": 9.99709483527985e-06,
332
- "loss": 5.8428,
333
- "step": 220
334
- },
335
- {
336
- "epoch": 0.0288,
337
- "grad_norm": 3.7677502632141113,
338
- "learning_rate": 9.99692040700425e-06,
339
- "loss": 5.7393,
340
- "step": 225
341
- },
342
- {
343
- "epoch": 0.02944,
344
- "grad_norm": 11.996383666992188,
345
- "learning_rate": 9.996740895474682e-06,
346
- "loss": 5.5566,
347
- "step": 230
348
- },
349
- {
350
- "epoch": 0.03008,
351
- "grad_norm": 3.6089084148406982,
352
- "learning_rate": 9.996556300873758e-06,
353
- "loss": 5.6939,
354
- "step": 235
355
- },
356
- {
357
- "epoch": 0.03072,
358
- "grad_norm": 3.834825038909912,
359
- "learning_rate": 9.996366623389263e-06,
360
- "loss": 5.8123,
361
- "step": 240
362
- },
363
- {
364
- "epoch": 0.03136,
365
- "grad_norm": 3.570263147354126,
366
- "learning_rate": 9.99617186321415e-06,
367
- "loss": 5.6839,
368
- "step": 245
369
- },
370
- {
371
- "epoch": 0.032,
372
- "grad_norm": 3.5728812217712402,
373
- "learning_rate": 9.995972020546545e-06,
374
- "loss": 5.7764,
375
- "step": 250
376
- },
377
- {
378
- "epoch": 0.03264,
379
- "grad_norm": 3.4725637435913086,
380
- "learning_rate": 9.995767095589743e-06,
381
- "loss": 5.6879,
382
- "step": 255
383
- },
384
- {
385
- "epoch": 0.03328,
386
- "grad_norm": 3.811537742614746,
387
- "learning_rate": 9.99555708855221e-06,
388
- "loss": 5.6418,
389
- "step": 260
390
- },
391
- {
392
- "epoch": 0.03392,
393
- "grad_norm": 3.494992971420288,
394
- "learning_rate": 9.99534199964758e-06,
395
- "loss": 5.6927,
396
- "step": 265
397
- },
398
- {
399
- "epoch": 0.03456,
400
- "grad_norm": 3.8107383251190186,
401
- "learning_rate": 9.995121829094662e-06,
402
- "loss": 5.5658,
403
- "step": 270
404
- },
405
- {
406
- "epoch": 0.0352,
407
- "grad_norm": 3.570551633834839,
408
- "learning_rate": 9.994896577117425e-06,
409
- "loss": 5.8131,
410
- "step": 275
411
- },
412
- {
413
- "epoch": 0.03584,
414
- "grad_norm": 3.540811538696289,
415
- "learning_rate": 9.994666243945018e-06,
416
- "loss": 5.6009,
417
- "step": 280
418
- },
419
- {
420
- "epoch": 0.03648,
421
- "grad_norm": 3.7275819778442383,
422
- "learning_rate": 9.99443082981175e-06,
423
- "loss": 5.6407,
424
- "step": 285
425
- },
426
- {
427
- "epoch": 0.03712,
428
- "grad_norm": 4.194495677947998,
429
- "learning_rate": 9.994190334957103e-06,
430
- "loss": 5.8319,
431
- "step": 290
432
- },
433
- {
434
- "epoch": 0.03776,
435
- "grad_norm": 3.5107626914978027,
436
- "learning_rate": 9.993944759625728e-06,
437
- "loss": 5.5765,
438
- "step": 295
439
- },
440
- {
441
- "epoch": 0.0384,
442
- "grad_norm": 3.4100208282470703,
443
- "learning_rate": 9.993694104067444e-06,
444
- "loss": 5.7473,
445
- "step": 300
446
- },
447
- {
448
- "epoch": 0.0384,
449
- "eval_loss": 1.407908320426941,
450
- "eval_runtime": 6.6542,
451
- "eval_samples_per_second": 150.281,
452
- "eval_steps_per_second": 18.785,
453
- "step": 300
454
- },
455
- {
456
- "epoch": 0.03904,
457
- "grad_norm": 3.7727818489074707,
458
- "learning_rate": 9.993438368537236e-06,
459
- "loss": 5.6802,
460
- "step": 305
461
- },
462
- {
463
- "epoch": 0.03968,
464
- "grad_norm": 3.445909023284912,
465
- "learning_rate": 9.993177553295258e-06,
466
- "loss": 5.7484,
467
- "step": 310
468
- },
469
- {
470
- "epoch": 0.04032,
471
- "grad_norm": 3.4199888706207275,
472
- "learning_rate": 9.992911658606832e-06,
473
- "loss": 5.7648,
474
- "step": 315
475
- },
476
- {
477
- "epoch": 0.04096,
478
- "grad_norm": 4.9640655517578125,
479
- "learning_rate": 9.992640684742445e-06,
480
- "loss": 5.7922,
481
- "step": 320
482
- },
483
- {
484
- "epoch": 0.0416,
485
- "grad_norm": 3.3730976581573486,
486
- "learning_rate": 9.992364631977754e-06,
487
- "loss": 5.677,
488
- "step": 325
489
- },
490
- {
491
- "epoch": 0.04224,
492
- "grad_norm": 3.540597915649414,
493
- "learning_rate": 9.99208350059358e-06,
494
- "loss": 5.5495,
495
- "step": 330
496
- },
497
- {
498
- "epoch": 0.04288,
499
- "grad_norm": 3.6853768825531006,
500
- "learning_rate": 9.991797290875915e-06,
501
- "loss": 5.4089,
502
- "step": 335
503
- },
504
- {
505
- "epoch": 0.04352,
506
- "grad_norm": 3.6380045413970947,
507
- "learning_rate": 9.991506003115911e-06,
508
- "loss": 5.4849,
509
- "step": 340
510
- },
511
- {
512
- "epoch": 0.04416,
513
- "grad_norm": 3.265488862991333,
514
- "learning_rate": 9.991209637609887e-06,
515
- "loss": 5.523,
516
- "step": 345
517
- },
518
- {
519
- "epoch": 0.0448,
520
- "grad_norm": 3.2634189128875732,
521
- "learning_rate": 9.990908194659332e-06,
522
- "loss": 5.5664,
523
- "step": 350
524
- },
525
- {
526
- "epoch": 0.04544,
527
- "grad_norm": 3.569810152053833,
528
- "learning_rate": 9.990601674570895e-06,
529
- "loss": 5.5059,
530
- "step": 355
531
- },
532
- {
533
- "epoch": 0.04608,
534
- "grad_norm": 3.580211877822876,
535
- "learning_rate": 9.990290077656393e-06,
536
- "loss": 5.4079,
537
- "step": 360
538
- },
539
- {
540
- "epoch": 0.04672,
541
- "grad_norm": 3.4860317707061768,
542
- "learning_rate": 9.989973404232805e-06,
543
- "loss": 5.6858,
544
- "step": 365
545
- },
546
- {
547
- "epoch": 0.04736,
548
- "grad_norm": 4.026730060577393,
549
- "learning_rate": 9.989651654622277e-06,
550
- "loss": 5.5662,
551
- "step": 370
552
- },
553
- {
554
- "epoch": 0.048,
555
- "grad_norm": 3.364692449569702,
556
- "learning_rate": 9.989324829152119e-06,
557
- "loss": 5.5304,
558
- "step": 375
559
- },
560
- {
561
- "epoch": 0.04864,
562
- "grad_norm": 3.611964464187622,
563
- "learning_rate": 9.9889929281548e-06,
564
- "loss": 5.3911,
565
- "step": 380
566
- },
567
- {
568
- "epoch": 0.04928,
569
- "grad_norm": 3.2946035861968994,
570
- "learning_rate": 9.988655951967958e-06,
571
- "loss": 5.4102,
572
- "step": 385
573
- },
574
- {
575
- "epoch": 0.04992,
576
- "grad_norm": 3.963909864425659,
577
- "learning_rate": 9.98831390093439e-06,
578
- "loss": 5.549,
579
- "step": 390
580
- },
581
- {
582
- "epoch": 0.05056,
583
- "grad_norm": 3.2876341342926025,
584
- "learning_rate": 9.987966775402056e-06,
585
- "loss": 5.5388,
586
- "step": 395
587
- },
588
- {
589
- "epoch": 0.0512,
590
- "grad_norm": 3.8467471599578857,
591
- "learning_rate": 9.98761457572408e-06,
592
- "loss": 5.454,
593
- "step": 400
594
- },
595
- {
596
- "epoch": 0.0512,
597
- "eval_loss": 1.3826359510421753,
598
- "eval_runtime": 7.0199,
599
- "eval_samples_per_second": 142.452,
600
- "eval_steps_per_second": 17.807,
601
- "step": 400
602
- },
603
- {
604
- "epoch": 0.05184,
605
- "grad_norm": 3.675231695175171,
606
- "learning_rate": 9.987257302258748e-06,
607
- "loss": 5.674,
608
- "step": 405
609
- },
610
- {
611
- "epoch": 0.05248,
612
- "grad_norm": 3.787940263748169,
613
- "learning_rate": 9.986894955369504e-06,
614
- "loss": 5.5466,
615
- "step": 410
616
- },
617
- {
618
- "epoch": 0.05312,
619
- "grad_norm": 3.677966833114624,
620
- "learning_rate": 9.986527535424956e-06,
621
- "loss": 5.4762,
622
- "step": 415
623
- },
624
- {
625
- "epoch": 0.05376,
626
- "grad_norm": 3.5083606243133545,
627
- "learning_rate": 9.986155042798874e-06,
628
- "loss": 5.3145,
629
- "step": 420
630
- },
631
- {
632
- "epoch": 0.0544,
633
- "grad_norm": 3.536379098892212,
634
- "learning_rate": 9.98577747787018e-06,
635
- "loss": 5.3769,
636
- "step": 425
637
- },
638
- {
639
- "epoch": 0.05504,
640
- "grad_norm": 3.5448412895202637,
641
- "learning_rate": 9.98539484102297e-06,
642
- "loss": 5.3996,
643
- "step": 430
644
- },
645
- {
646
- "epoch": 0.05568,
647
- "grad_norm": 3.359647274017334,
648
- "learning_rate": 9.985007132646489e-06,
649
- "loss": 5.3114,
650
- "step": 435
651
- },
652
- {
653
- "epoch": 0.05632,
654
- "grad_norm": 3.3419110774993896,
655
- "learning_rate": 9.984614353135143e-06,
656
- "loss": 5.4383,
657
- "step": 440
658
- },
659
- {
660
- "epoch": 0.05696,
661
- "grad_norm": 3.558025360107422,
662
- "learning_rate": 9.984216502888496e-06,
663
- "loss": 5.5239,
664
- "step": 445
665
- },
666
- {
667
- "epoch": 0.0576,
668
- "grad_norm": 3.6349422931671143,
669
- "learning_rate": 9.983813582311277e-06,
670
- "loss": 5.5639,
671
- "step": 450
672
- },
673
- {
674
- "epoch": 0.05824,
675
- "grad_norm": 3.2916922569274902,
676
- "learning_rate": 9.983405591813362e-06,
677
- "loss": 5.3886,
678
- "step": 455
679
- },
680
- {
681
- "epoch": 0.05888,
682
- "grad_norm": 3.32891845703125,
683
- "learning_rate": 9.982992531809796e-06,
684
- "loss": 5.526,
685
- "step": 460
686
- },
687
- {
688
- "epoch": 0.05952,
689
- "grad_norm": 3.8752880096435547,
690
- "learning_rate": 9.982574402720773e-06,
691
- "loss": 5.6599,
692
- "step": 465
693
- },
694
- {
695
- "epoch": 0.06016,
696
- "grad_norm": 3.604433536529541,
697
- "learning_rate": 9.982151204971646e-06,
698
- "loss": 5.4567,
699
- "step": 470
700
- },
701
- {
702
- "epoch": 0.0608,
703
- "grad_norm": 3.3058159351348877,
704
- "learning_rate": 9.981722938992926e-06,
705
- "loss": 5.4981,
706
- "step": 475
707
- },
708
- {
709
- "epoch": 0.06144,
710
- "grad_norm": 3.7341926097869873,
711
- "learning_rate": 9.981289605220276e-06,
712
- "loss": 5.3278,
713
- "step": 480
714
- },
715
- {
716
- "epoch": 0.06208,
717
- "grad_norm": 3.51798415184021,
718
- "learning_rate": 9.980851204094519e-06,
719
- "loss": 5.5029,
720
- "step": 485
721
- },
722
- {
723
- "epoch": 0.06272,
724
- "grad_norm": 3.6541428565979004,
725
- "learning_rate": 9.980407736061629e-06,
726
- "loss": 5.3987,
727
- "step": 490
728
- },
729
- {
730
- "epoch": 0.06336,
731
- "grad_norm": 3.420767307281494,
732
- "learning_rate": 9.979959201572736e-06,
733
- "loss": 5.405,
734
- "step": 495
735
- },
736
- {
737
- "epoch": 0.064,
738
- "grad_norm": 3.7169559001922607,
739
- "learning_rate": 9.979505601084124e-06,
740
- "loss": 5.498,
741
- "step": 500
742
- },
743
- {
744
- "epoch": 0.064,
745
- "eval_loss": 1.3493109941482544,
746
- "eval_runtime": 7.1309,
747
- "eval_samples_per_second": 140.234,
748
- "eval_steps_per_second": 17.529,
749
- "step": 500
750
- },
751
- {
752
- "epoch": 0.06464,
753
- "grad_norm": 4.536627769470215,
754
- "learning_rate": 9.97904693505723e-06,
755
- "loss": 5.5237,
756
- "step": 505
757
- },
758
- {
759
- "epoch": 0.06528,
760
- "grad_norm": 3.204948902130127,
761
- "learning_rate": 9.978583203958649e-06,
762
- "loss": 5.3746,
763
- "step": 510
764
- },
765
- {
766
- "epoch": 0.06592,
767
- "grad_norm": 3.4658005237579346,
768
- "learning_rate": 9.978114408260118e-06,
769
- "loss": 5.4567,
770
- "step": 515
771
- },
772
- {
773
- "epoch": 0.06656,
774
- "grad_norm": 4.932333469390869,
775
- "learning_rate": 9.977640548438534e-06,
776
- "loss": 5.1959,
777
- "step": 520
778
- },
779
- {
780
- "epoch": 0.0672,
781
- "grad_norm": 3.4697563648223877,
782
- "learning_rate": 9.977161624975948e-06,
783
- "loss": 5.4013,
784
- "step": 525
785
- },
786
- {
787
- "epoch": 0.06784,
788
- "grad_norm": 3.441819667816162,
789
- "learning_rate": 9.976677638359553e-06,
790
- "loss": 5.4899,
791
- "step": 530
792
- },
793
- {
794
- "epoch": 0.06848,
795
- "grad_norm": 3.4293930530548096,
796
- "learning_rate": 9.9761885890817e-06,
797
- "loss": 5.3569,
798
- "step": 535
799
- },
800
- {
801
- "epoch": 0.06912,
802
- "grad_norm": 3.5388574600219727,
803
- "learning_rate": 9.975694477639885e-06,
804
- "loss": 5.2739,
805
- "step": 540
806
- },
807
- {
808
- "epoch": 0.06976,
809
- "grad_norm": 3.735548973083496,
810
- "learning_rate": 9.97519530453676e-06,
811
- "loss": 5.4253,
812
- "step": 545
813
- },
814
- {
815
- "epoch": 0.0704,
816
- "grad_norm": 3.33503794670105,
817
- "learning_rate": 9.974691070280121e-06,
818
- "loss": 5.1569,
819
- "step": 550
820
- },
821
- {
822
- "epoch": 0.07104,
823
- "grad_norm": 3.5171401500701904,
824
- "learning_rate": 9.974181775382915e-06,
825
- "loss": 5.3242,
826
- "step": 555
827
- },
828
- {
829
- "epoch": 0.07168,
830
- "grad_norm": 3.565356969833374,
831
- "learning_rate": 9.973667420363233e-06,
832
- "loss": 5.3893,
833
- "step": 560
834
- },
835
- {
836
- "epoch": 0.07232,
837
- "grad_norm": 3.172163248062134,
838
- "learning_rate": 9.973148005744319e-06,
839
- "loss": 5.3824,
840
- "step": 565
841
- },
842
- {
843
- "epoch": 0.07296,
844
- "grad_norm": 3.517838716506958,
845
- "learning_rate": 9.972623532054564e-06,
846
- "loss": 5.2673,
847
- "step": 570
848
- },
849
- {
850
- "epoch": 0.0736,
851
- "grad_norm": 3.328416585922241,
852
- "learning_rate": 9.9720939998275e-06,
853
- "loss": 5.2649,
854
- "step": 575
855
- },
856
- {
857
- "epoch": 0.07424,
858
- "grad_norm": 3.475539445877075,
859
- "learning_rate": 9.971559409601807e-06,
860
- "loss": 5.3318,
861
- "step": 580
862
- },
863
- {
864
- "epoch": 0.07488,
865
- "grad_norm": 3.492013692855835,
866
- "learning_rate": 9.971019761921317e-06,
867
- "loss": 5.2735,
868
- "step": 585
869
- },
870
- {
871
- "epoch": 0.07552,
872
- "grad_norm": 3.474803924560547,
873
- "learning_rate": 9.970475057334997e-06,
874
- "loss": 5.3722,
875
- "step": 590
876
- },
877
- {
878
- "epoch": 0.07616,
879
- "grad_norm": 3.4162726402282715,
880
- "learning_rate": 9.96992529639696e-06,
881
- "loss": 5.3901,
882
- "step": 595
883
- },
884
- {
885
- "epoch": 0.0768,
886
- "grad_norm": 3.3643155097961426,
887
- "learning_rate": 9.969370479666473e-06,
888
- "loss": 5.2384,
889
- "step": 600
890
- },
891
- {
892
- "epoch": 0.0768,
893
- "eval_loss": 1.3373793363571167,
894
- "eval_runtime": 6.5847,
895
- "eval_samples_per_second": 151.867,
896
- "eval_steps_per_second": 18.983,
897
- "step": 600
898
- },
899
- {
900
- "epoch": 0.07744,
901
- "grad_norm": 3.44301176071167,
902
- "learning_rate": 9.968810607707933e-06,
903
- "loss": 5.2322,
904
- "step": 605
905
- },
906
- {
907
- "epoch": 0.07808,
908
- "grad_norm": 3.422262668609619,
909
- "learning_rate": 9.968245681090887e-06,
910
- "loss": 5.1708,
911
- "step": 610
912
- },
913
- {
914
- "epoch": 0.07872,
915
- "grad_norm": 3.2879252433776855,
916
- "learning_rate": 9.96767570039002e-06,
917
- "loss": 5.2291,
918
- "step": 615
919
- },
920
- {
921
- "epoch": 0.07936,
922
- "grad_norm": 3.6026480197906494,
923
- "learning_rate": 9.967100666185163e-06,
924
- "loss": 5.4241,
925
- "step": 620
926
- },
927
- {
928
- "epoch": 0.08,
929
- "grad_norm": 3.3642101287841797,
930
- "learning_rate": 9.966520579061286e-06,
931
- "loss": 5.4473,
932
- "step": 625
933
- },
934
- {
935
- "epoch": 0.08064,
936
- "grad_norm": 3.5968470573425293,
937
- "learning_rate": 9.965935439608493e-06,
938
- "loss": 5.3982,
939
- "step": 630
940
- },
941
- {
942
- "epoch": 0.08128,
943
- "grad_norm": 3.352083206176758,
944
- "learning_rate": 9.96534524842204e-06,
945
- "loss": 5.3953,
946
- "step": 635
947
- },
948
- {
949
- "epoch": 0.08192,
950
- "grad_norm": 3.3571720123291016,
951
- "learning_rate": 9.964750006102311e-06,
952
- "loss": 5.3159,
953
- "step": 640
954
- },
955
- {
956
- "epoch": 0.08256,
957
- "grad_norm": 3.486246109008789,
958
- "learning_rate": 9.964149713254833e-06,
959
- "loss": 5.211,
960
- "step": 645
961
- },
962
- {
963
- "epoch": 0.0832,
964
- "grad_norm": 3.674906015396118,
965
- "learning_rate": 9.96354437049027e-06,
966
- "loss": 5.3374,
967
- "step": 650
968
- },
969
- {
970
- "epoch": 0.08384,
971
- "grad_norm": 3.590810537338257,
972
- "learning_rate": 9.962933978424426e-06,
973
- "loss": 5.2194,
974
- "step": 655
975
- },
976
- {
977
- "epoch": 0.08448,
978
- "grad_norm": 3.551786184310913,
979
- "learning_rate": 9.962318537678238e-06,
980
- "loss": 5.1187,
981
- "step": 660
982
- },
983
- {
984
- "epoch": 0.08512,
985
- "grad_norm": 3.5391581058502197,
986
- "learning_rate": 9.961698048877776e-06,
987
- "loss": 5.2001,
988
- "step": 665
989
- },
990
- {
991
- "epoch": 0.08576,
992
- "grad_norm": 3.6105592250823975,
993
- "learning_rate": 9.961072512654255e-06,
994
- "loss": 5.2758,
995
- "step": 670
996
- },
997
- {
998
- "epoch": 0.0864,
999
- "grad_norm": 3.7463858127593994,
1000
- "learning_rate": 9.960441929644017e-06,
1001
- "loss": 5.2137,
1002
- "step": 675
1003
- },
1004
- {
1005
- "epoch": 0.08704,
1006
- "grad_norm": 3.9237470626831055,
1007
- "learning_rate": 9.959806300488538e-06,
1008
- "loss": 5.2047,
1009
- "step": 680
1010
- },
1011
- {
1012
- "epoch": 0.08768,
1013
- "grad_norm": 3.392827272415161,
1014
- "learning_rate": 9.95916562583443e-06,
1015
- "loss": 5.3071,
1016
- "step": 685
1017
- },
1018
- {
1019
- "epoch": 0.08832,
1020
- "grad_norm": 3.221484661102295,
1021
- "learning_rate": 9.958519906333438e-06,
1022
- "loss": 5.183,
1023
- "step": 690
1024
- },
1025
- {
1026
- "epoch": 0.08896,
1027
- "grad_norm": 3.5143983364105225,
1028
- "learning_rate": 9.957869142642437e-06,
1029
- "loss": 5.3171,
1030
- "step": 695
1031
- },
1032
- {
1033
- "epoch": 0.0896,
1034
- "grad_norm": 3.497072696685791,
1035
- "learning_rate": 9.957213335423433e-06,
1036
- "loss": 5.1784,
1037
- "step": 700
1038
- },
1039
- {
1040
- "epoch": 0.0896,
1041
- "eval_loss": 1.2988511323928833,
1042
- "eval_runtime": 6.9763,
1043
- "eval_samples_per_second": 143.342,
1044
- "eval_steps_per_second": 17.918,
1045
- "step": 700
1046
- },
1047
- {
1048
- "epoch": 0.09024,
1049
- "grad_norm": 3.3822438716888428,
1050
- "learning_rate": 9.956552485343566e-06,
1051
- "loss": 5.1732,
1052
- "step": 705
1053
- },
1054
- {
1055
- "epoch": 0.09088,
1056
- "grad_norm": 3.3949694633483887,
1057
- "learning_rate": 9.955886593075101e-06,
1058
- "loss": 5.2725,
1059
- "step": 710
1060
- },
1061
- {
1062
- "epoch": 0.09152,
1063
- "grad_norm": 3.2577288150787354,
1064
- "learning_rate": 9.955215659295438e-06,
1065
- "loss": 5.2207,
1066
- "step": 715
1067
- },
1068
- {
1069
- "epoch": 0.09216,
1070
- "grad_norm": 3.769519567489624,
1071
- "learning_rate": 9.954539684687103e-06,
1072
- "loss": 5.2152,
1073
- "step": 720
1074
- },
1075
- {
1076
- "epoch": 0.0928,
1077
- "grad_norm": 3.3824892044067383,
1078
- "learning_rate": 9.953858669937746e-06,
1079
- "loss": 5.2085,
1080
- "step": 725
1081
- },
1082
- {
1083
- "epoch": 0.09344,
1084
- "grad_norm": 3.771742105484009,
1085
- "learning_rate": 9.953172615740152e-06,
1086
- "loss": 5.1575,
1087
- "step": 730
1088
- },
1089
- {
1090
- "epoch": 0.09408,
1091
- "grad_norm": 3.7706689834594727,
1092
- "learning_rate": 9.952481522792226e-06,
1093
- "loss": 4.9608,
1094
- "step": 735
1095
- },
1096
- {
1097
- "epoch": 0.09472,
1098
- "grad_norm": 3.8110334873199463,
1099
- "learning_rate": 9.951785391797001e-06,
1100
- "loss": 5.21,
1101
- "step": 740
1102
- },
1103
- {
1104
- "epoch": 0.09536,
1105
- "grad_norm": 3.3012993335723877,
1106
- "learning_rate": 9.951084223462636e-06,
1107
- "loss": 5.2475,
1108
- "step": 745
1109
- },
1110
- {
1111
- "epoch": 0.096,
1112
- "grad_norm": 3.6353518962860107,
1113
- "learning_rate": 9.950378018502415e-06,
1114
- "loss": 5.0985,
1115
- "step": 750
1116
- },
1117
- {
1118
- "epoch": 0.09664,
1119
- "grad_norm": 3.369378089904785,
1120
- "learning_rate": 9.949666777634743e-06,
1121
- "loss": 5.1986,
1122
- "step": 755
1123
- },
1124
- {
1125
- "epoch": 0.09728,
1126
- "grad_norm": 3.2247676849365234,
1127
- "learning_rate": 9.948950501583147e-06,
1128
- "loss": 5.3192,
1129
- "step": 760
1130
- },
1131
- {
1132
- "epoch": 0.09792,
1133
- "grad_norm": 3.6966888904571533,
1134
- "learning_rate": 9.948229191076284e-06,
1135
- "loss": 5.1654,
1136
- "step": 765
1137
- },
1138
- {
1139
- "epoch": 0.09856,
1140
- "grad_norm": 3.5823962688446045,
1141
- "learning_rate": 9.947502846847921e-06,
1142
- "loss": 5.1351,
1143
- "step": 770
1144
- },
1145
- {
1146
- "epoch": 0.0992,
1147
- "grad_norm": 3.5258729457855225,
1148
- "learning_rate": 9.946771469636955e-06,
1149
- "loss": 5.1745,
1150
- "step": 775
1151
- },
1152
- {
1153
- "epoch": 0.09984,
1154
- "grad_norm": 3.42067813873291,
1155
- "learning_rate": 9.946035060187398e-06,
1156
- "loss": 5.1569,
1157
- "step": 780
1158
- },
1159
- {
1160
- "epoch": 0.10048,
1161
- "grad_norm": 3.9832825660705566,
1162
- "learning_rate": 9.945293619248383e-06,
1163
- "loss": 4.9796,
1164
- "step": 785
1165
- },
1166
- {
1167
- "epoch": 0.10112,
1168
- "grad_norm": 3.742013692855835,
1169
- "learning_rate": 9.944547147574162e-06,
1170
- "loss": 5.1625,
1171
- "step": 790
1172
- },
1173
- {
1174
- "epoch": 0.10176,
1175
- "grad_norm": 3.3150367736816406,
1176
- "learning_rate": 9.943795645924104e-06,
1177
- "loss": 5.099,
1178
- "step": 795
1179
- },
1180
- {
1181
- "epoch": 0.1024,
1182
- "grad_norm": 3.359069585800171,
1183
- "learning_rate": 9.943039115062691e-06,
1184
- "loss": 5.1877,
1185
- "step": 800
1186
- },
1187
- {
1188
- "epoch": 0.1024,
1189
- "eval_loss": 1.2946017980575562,
1190
- "eval_runtime": 7.4306,
1191
- "eval_samples_per_second": 134.579,
1192
- "eval_steps_per_second": 16.822,
1193
- "step": 800
1194
- },
1195
- {
1196
- "epoch": 0.10304,
1197
- "grad_norm": 3.703000545501709,
1198
- "learning_rate": 9.94227755575953e-06,
1199
- "loss": 5.1581,
1200
- "step": 805
1201
- },
1202
- {
1203
- "epoch": 0.10368,
1204
- "grad_norm": 3.5370070934295654,
1205
- "learning_rate": 9.941510968789334e-06,
1206
- "loss": 5.2402,
1207
- "step": 810
1208
- },
1209
- {
1210
- "epoch": 0.10432,
1211
- "grad_norm": 3.5010828971862793,
1212
- "learning_rate": 9.940739354931936e-06,
1213
- "loss": 5.1828,
1214
- "step": 815
1215
- },
1216
- {
1217
- "epoch": 0.10496,
1218
- "grad_norm": 3.4637820720672607,
1219
- "learning_rate": 9.93996271497228e-06,
1220
- "loss": 5.1792,
1221
- "step": 820
1222
- },
1223
- {
1224
- "epoch": 0.1056,
1225
- "grad_norm": 3.409712076187134,
1226
- "learning_rate": 9.939181049700427e-06,
1227
- "loss": 5.0721,
1228
- "step": 825
1229
- },
1230
- {
1231
- "epoch": 0.10624,
1232
- "grad_norm": 3.589414596557617,
1233
- "learning_rate": 9.938394359911545e-06,
1234
- "loss": 5.234,
1235
- "step": 830
1236
- },
1237
- {
1238
- "epoch": 0.10688,
1239
- "grad_norm": 3.444977045059204,
1240
- "learning_rate": 9.937602646405918e-06,
1241
- "loss": 4.9763,
1242
- "step": 835
1243
- },
1244
- {
1245
- "epoch": 0.10752,
1246
- "grad_norm": 3.3560900688171387,
1247
- "learning_rate": 9.936805909988935e-06,
1248
- "loss": 5.2006,
1249
- "step": 840
1250
- },
1251
- {
1252
- "epoch": 0.10816,
1253
- "grad_norm": 3.345703601837158,
1254
- "learning_rate": 9.9360041514711e-06,
1255
- "loss": 5.0287,
1256
- "step": 845
1257
- },
1258
- {
1259
- "epoch": 0.1088,
1260
- "grad_norm": 3.492363691329956,
1261
- "learning_rate": 9.935197371668024e-06,
1262
- "loss": 5.0908,
1263
- "step": 850
1264
- },
1265
- {
1266
- "epoch": 0.10944,
1267
- "grad_norm": 7.459951400756836,
1268
- "learning_rate": 9.934385571400425e-06,
1269
- "loss": 5.1735,
1270
- "step": 855
1271
- },
1272
- {
1273
- "epoch": 0.11008,
1274
- "grad_norm": 3.5033841133117676,
1275
- "learning_rate": 9.933568751494131e-06,
1276
- "loss": 5.053,
1277
- "step": 860
1278
- },
1279
- {
1280
- "epoch": 0.11072,
1281
- "grad_norm": 3.5542259216308594,
1282
- "learning_rate": 9.93274691278007e-06,
1283
- "loss": 5.1463,
1284
- "step": 865
1285
- },
1286
- {
1287
- "epoch": 0.11136,
1288
- "grad_norm": 3.3819243907928467,
1289
- "learning_rate": 9.931920056094285e-06,
1290
- "loss": 5.0397,
1291
- "step": 870
1292
- },
1293
- {
1294
- "epoch": 0.112,
1295
- "grad_norm": 3.406768798828125,
1296
- "learning_rate": 9.931088182277915e-06,
1297
- "loss": 5.179,
1298
- "step": 875
1299
- },
1300
- {
1301
- "epoch": 0.11264,
1302
- "grad_norm": 5.960773944854736,
1303
- "learning_rate": 9.930251292177206e-06,
1304
- "loss": 5.217,
1305
- "step": 880
1306
- },
1307
- {
1308
- "epoch": 0.11328,
1309
- "grad_norm": 3.5821049213409424,
1310
- "learning_rate": 9.929409386643511e-06,
1311
- "loss": 5.0374,
1312
- "step": 885
1313
- },
1314
- {
1315
- "epoch": 0.11392,
1316
- "grad_norm": 3.3204903602600098,
1317
- "learning_rate": 9.928562466533279e-06,
1318
- "loss": 5.1856,
1319
- "step": 890
1320
- },
1321
- {
1322
- "epoch": 0.11456,
1323
- "grad_norm": 4.022350788116455,
1324
- "learning_rate": 9.927710532708064e-06,
1325
- "loss": 5.1051,
1326
- "step": 895
1327
- },
1328
- {
1329
- "epoch": 0.1152,
1330
- "grad_norm": 3.3810718059539795,
1331
- "learning_rate": 9.926853586034515e-06,
1332
- "loss": 5.1691,
1333
- "step": 900
1334
- },
1335
- {
1336
- "epoch": 0.1152,
1337
- "eval_loss": 1.2660380601882935,
1338
- "eval_runtime": 6.8853,
1339
- "eval_samples_per_second": 145.238,
1340
- "eval_steps_per_second": 18.155,
1341
- "step": 900
1342
- }
1343
- ],
1344
- "logging_steps": 5,
1345
- "max_steps": 15624,
1346
- "num_input_tokens_seen": 0,
1347
- "num_train_epochs": 2,
1348
- "save_steps": 100,
1349
- "stateful_callbacks": {
1350
- "TrainerControl": {
1351
- "args": {
1352
- "should_epoch_stop": false,
1353
- "should_evaluate": false,
1354
- "should_log": false,
1355
- "should_save": true,
1356
- "should_training_stop": false
1357
- },
1358
- "attributes": {}
1359
- }
1360
- },
1361
- "total_flos": 3.141806761967616e+17,
1362
- "train_batch_size": 32,
1363
- "trial_name": null,
1364
- "trial_params": null
1365
- }