Undi95 commited on
Commit
f9fc7f2
1 Parent(s): d93dd4a

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -2313
trainer_state.json DELETED
@@ -1,2313 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.9781591263650546,
5
- "eval_steps": 80,
6
- "global_step": 320,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.0062402496099844,
13
- "grad_norm": 19.714784622192383,
14
- "learning_rate": 2.0000000000000002e-07,
15
- "loss": 1.3589,
16
- "step": 1
17
- },
18
- {
19
- "epoch": 0.0062402496099844,
20
- "eval_loss": 1.3540421724319458,
21
- "eval_runtime": 132.5999,
22
- "eval_samples_per_second": 102.195,
23
- "eval_steps_per_second": 6.388,
24
- "step": 1
25
- },
26
- {
27
- "epoch": 0.0124804992199688,
28
- "grad_norm": 20.498014450073242,
29
- "learning_rate": 4.0000000000000003e-07,
30
- "loss": 1.3662,
31
- "step": 2
32
- },
33
- {
34
- "epoch": 0.0187207488299532,
35
- "grad_norm": 19.82619285583496,
36
- "learning_rate": 6.000000000000001e-07,
37
- "loss": 1.3336,
38
- "step": 3
39
- },
40
- {
41
- "epoch": 0.0249609984399376,
42
- "grad_norm": 18.423460006713867,
43
- "learning_rate": 8.000000000000001e-07,
44
- "loss": 1.3555,
45
- "step": 4
46
- },
47
- {
48
- "epoch": 0.031201248049921998,
49
- "grad_norm": 16.555850982666016,
50
- "learning_rate": 1.0000000000000002e-06,
51
- "loss": 1.3527,
52
- "step": 5
53
- },
54
- {
55
- "epoch": 0.0374414976599064,
56
- "grad_norm": 10.684965133666992,
57
- "learning_rate": 1.2000000000000002e-06,
58
- "loss": 1.3491,
59
- "step": 6
60
- },
61
- {
62
- "epoch": 0.0436817472698908,
63
- "grad_norm": 8.396592140197754,
64
- "learning_rate": 1.4000000000000001e-06,
65
- "loss": 1.3181,
66
- "step": 7
67
- },
68
- {
69
- "epoch": 0.0499219968798752,
70
- "grad_norm": 3.145500421524048,
71
- "learning_rate": 1.6000000000000001e-06,
72
- "loss": 1.2984,
73
- "step": 8
74
- },
75
- {
76
- "epoch": 0.056162246489859596,
77
- "grad_norm": 2.981050491333008,
78
- "learning_rate": 1.8000000000000001e-06,
79
- "loss": 1.2901,
80
- "step": 9
81
- },
82
- {
83
- "epoch": 0.062402496099843996,
84
- "grad_norm": 2.741509199142456,
85
- "learning_rate": 2.0000000000000003e-06,
86
- "loss": 1.2948,
87
- "step": 10
88
- },
89
- {
90
- "epoch": 0.0686427457098284,
91
- "grad_norm": 3.8496174812316895,
92
- "learning_rate": 2.2e-06,
93
- "loss": 1.2524,
94
- "step": 11
95
- },
96
- {
97
- "epoch": 0.0748829953198128,
98
- "grad_norm": 3.039551258087158,
99
- "learning_rate": 2.4000000000000003e-06,
100
- "loss": 1.2369,
101
- "step": 12
102
- },
103
- {
104
- "epoch": 0.0811232449297972,
105
- "grad_norm": 2.215259313583374,
106
- "learning_rate": 2.6e-06,
107
- "loss": 1.244,
108
- "step": 13
109
- },
110
- {
111
- "epoch": 0.0873634945397816,
112
- "grad_norm": 1.4627336263656616,
113
- "learning_rate": 2.8000000000000003e-06,
114
- "loss": 1.2201,
115
- "step": 14
116
- },
117
- {
118
- "epoch": 0.093603744149766,
119
- "grad_norm": 2.0100812911987305,
120
- "learning_rate": 3e-06,
121
- "loss": 1.2097,
122
- "step": 15
123
- },
124
- {
125
- "epoch": 0.0998439937597504,
126
- "grad_norm": 2.0757627487182617,
127
- "learning_rate": 3.2000000000000003e-06,
128
- "loss": 1.2058,
129
- "step": 16
130
- },
131
- {
132
- "epoch": 0.1060842433697348,
133
- "grad_norm": 1.6582179069519043,
134
- "learning_rate": 3.4000000000000005e-06,
135
- "loss": 1.1775,
136
- "step": 17
137
- },
138
- {
139
- "epoch": 0.11232449297971919,
140
- "grad_norm": 1.2454713582992554,
141
- "learning_rate": 3.6000000000000003e-06,
142
- "loss": 1.1405,
143
- "step": 18
144
- },
145
- {
146
- "epoch": 0.11856474258970359,
147
- "grad_norm": 1.0032132863998413,
148
- "learning_rate": 3.8000000000000005e-06,
149
- "loss": 1.1442,
150
- "step": 19
151
- },
152
- {
153
- "epoch": 0.12480499219968799,
154
- "grad_norm": 1.3543955087661743,
155
- "learning_rate": 4.000000000000001e-06,
156
- "loss": 1.1874,
157
- "step": 20
158
- },
159
- {
160
- "epoch": 0.1310452418096724,
161
- "grad_norm": 1.2795507907867432,
162
- "learning_rate": 4.2000000000000004e-06,
163
- "loss": 1.1423,
164
- "step": 21
165
- },
166
- {
167
- "epoch": 0.1372854914196568,
168
- "grad_norm": 1.0040080547332764,
169
- "learning_rate": 4.4e-06,
170
- "loss": 1.1269,
171
- "step": 22
172
- },
173
- {
174
- "epoch": 0.1435257410296412,
175
- "grad_norm": 0.9706005454063416,
176
- "learning_rate": 4.600000000000001e-06,
177
- "loss": 1.1508,
178
- "step": 23
179
- },
180
- {
181
- "epoch": 0.1497659906396256,
182
- "grad_norm": 0.905784547328949,
183
- "learning_rate": 4.800000000000001e-06,
184
- "loss": 1.1003,
185
- "step": 24
186
- },
187
- {
188
- "epoch": 0.15600624024961,
189
- "grad_norm": 0.8688749074935913,
190
- "learning_rate": 5e-06,
191
- "loss": 1.1046,
192
- "step": 25
193
- },
194
- {
195
- "epoch": 0.1622464898595944,
196
- "grad_norm": 0.7418661713600159,
197
- "learning_rate": 5.2e-06,
198
- "loss": 1.0736,
199
- "step": 26
200
- },
201
- {
202
- "epoch": 0.1684867394695788,
203
- "grad_norm": 0.7218017578125,
204
- "learning_rate": 5.400000000000001e-06,
205
- "loss": 1.0924,
206
- "step": 27
207
- },
208
- {
209
- "epoch": 0.1747269890795632,
210
- "grad_norm": 0.7364180684089661,
211
- "learning_rate": 5.600000000000001e-06,
212
- "loss": 1.0666,
213
- "step": 28
214
- },
215
- {
216
- "epoch": 0.1809672386895476,
217
- "grad_norm": 0.6347681879997253,
218
- "learning_rate": 5.8e-06,
219
- "loss": 1.0533,
220
- "step": 29
221
- },
222
- {
223
- "epoch": 0.187207488299532,
224
- "grad_norm": 0.672021210193634,
225
- "learning_rate": 6e-06,
226
- "loss": 1.0719,
227
- "step": 30
228
- },
229
- {
230
- "epoch": 0.1934477379095164,
231
- "grad_norm": 0.6880649328231812,
232
- "learning_rate": 6.200000000000001e-06,
233
- "loss": 1.0555,
234
- "step": 31
235
- },
236
- {
237
- "epoch": 0.1996879875195008,
238
- "grad_norm": 0.5669052004814148,
239
- "learning_rate": 6.4000000000000006e-06,
240
- "loss": 1.0845,
241
- "step": 32
242
- },
243
- {
244
- "epoch": 0.2059282371294852,
245
- "grad_norm": 0.6051258444786072,
246
- "learning_rate": 6.600000000000001e-06,
247
- "loss": 1.0656,
248
- "step": 33
249
- },
250
- {
251
- "epoch": 0.2121684867394696,
252
- "grad_norm": 0.5937217473983765,
253
- "learning_rate": 6.800000000000001e-06,
254
- "loss": 1.0738,
255
- "step": 34
256
- },
257
- {
258
- "epoch": 0.21840873634945399,
259
- "grad_norm": 0.5861482620239258,
260
- "learning_rate": 7e-06,
261
- "loss": 1.0497,
262
- "step": 35
263
- },
264
- {
265
- "epoch": 0.22464898595943839,
266
- "grad_norm": 0.5939168334007263,
267
- "learning_rate": 7.2000000000000005e-06,
268
- "loss": 1.0657,
269
- "step": 36
270
- },
271
- {
272
- "epoch": 0.23088923556942278,
273
- "grad_norm": 0.5843105316162109,
274
- "learning_rate": 7.4e-06,
275
- "loss": 1.0498,
276
- "step": 37
277
- },
278
- {
279
- "epoch": 0.23712948517940718,
280
- "grad_norm": 0.5303648710250854,
281
- "learning_rate": 7.600000000000001e-06,
282
- "loss": 1.0604,
283
- "step": 38
284
- },
285
- {
286
- "epoch": 0.24336973478939158,
287
- "grad_norm": 0.558338463306427,
288
- "learning_rate": 7.800000000000002e-06,
289
- "loss": 1.0383,
290
- "step": 39
291
- },
292
- {
293
- "epoch": 0.24960998439937598,
294
- "grad_norm": 0.49629613757133484,
295
- "learning_rate": 8.000000000000001e-06,
296
- "loss": 1.0521,
297
- "step": 40
298
- },
299
- {
300
- "epoch": 0.25585023400936036,
301
- "grad_norm": 0.5873180627822876,
302
- "learning_rate": 8.2e-06,
303
- "loss": 1.0403,
304
- "step": 41
305
- },
306
- {
307
- "epoch": 0.2620904836193448,
308
- "grad_norm": 0.5466005802154541,
309
- "learning_rate": 8.400000000000001e-06,
310
- "loss": 1.0127,
311
- "step": 42
312
- },
313
- {
314
- "epoch": 0.26833073322932915,
315
- "grad_norm": 0.5514444708824158,
316
- "learning_rate": 8.6e-06,
317
- "loss": 1.0399,
318
- "step": 43
319
- },
320
- {
321
- "epoch": 0.2745709828393136,
322
- "grad_norm": 0.5304705500602722,
323
- "learning_rate": 8.8e-06,
324
- "loss": 1.0057,
325
- "step": 44
326
- },
327
- {
328
- "epoch": 0.28081123244929795,
329
- "grad_norm": 0.5105130076408386,
330
- "learning_rate": 9e-06,
331
- "loss": 1.0174,
332
- "step": 45
333
- },
334
- {
335
- "epoch": 0.2870514820592824,
336
- "grad_norm": 0.533640444278717,
337
- "learning_rate": 9.200000000000002e-06,
338
- "loss": 1.0342,
339
- "step": 46
340
- },
341
- {
342
- "epoch": 0.29329173166926675,
343
- "grad_norm": 0.48208147287368774,
344
- "learning_rate": 9.4e-06,
345
- "loss": 1.0195,
346
- "step": 47
347
- },
348
- {
349
- "epoch": 0.2995319812792512,
350
- "grad_norm": 0.5069381594657898,
351
- "learning_rate": 9.600000000000001e-06,
352
- "loss": 1.0382,
353
- "step": 48
354
- },
355
- {
356
- "epoch": 0.30577223088923555,
357
- "grad_norm": 0.4819696843624115,
358
- "learning_rate": 9.800000000000001e-06,
359
- "loss": 1.05,
360
- "step": 49
361
- },
362
- {
363
- "epoch": 0.31201248049922,
364
- "grad_norm": 0.5414313673973083,
365
- "learning_rate": 1e-05,
366
- "loss": 1.0245,
367
- "step": 50
368
- },
369
- {
370
- "epoch": 0.31825273010920435,
371
- "grad_norm": 0.4769354462623596,
372
- "learning_rate": 1.02e-05,
373
- "loss": 1.005,
374
- "step": 51
375
- },
376
- {
377
- "epoch": 0.3244929797191888,
378
- "grad_norm": 0.5051629543304443,
379
- "learning_rate": 1.04e-05,
380
- "loss": 1.0158,
381
- "step": 52
382
- },
383
- {
384
- "epoch": 0.33073322932917315,
385
- "grad_norm": 0.5432644486427307,
386
- "learning_rate": 1.0600000000000002e-05,
387
- "loss": 1.0122,
388
- "step": 53
389
- },
390
- {
391
- "epoch": 0.3369734789391576,
392
- "grad_norm": 0.4705195128917694,
393
- "learning_rate": 1.0800000000000002e-05,
394
- "loss": 1.0053,
395
- "step": 54
396
- },
397
- {
398
- "epoch": 0.34321372854914195,
399
- "grad_norm": 0.5468801856040955,
400
- "learning_rate": 1.1000000000000001e-05,
401
- "loss": 1.0173,
402
- "step": 55
403
- },
404
- {
405
- "epoch": 0.3494539781591264,
406
- "grad_norm": 0.6218928694725037,
407
- "learning_rate": 1.1200000000000001e-05,
408
- "loss": 0.9974,
409
- "step": 56
410
- },
411
- {
412
- "epoch": 0.35569422776911075,
413
- "grad_norm": 0.532873272895813,
414
- "learning_rate": 1.14e-05,
415
- "loss": 1.006,
416
- "step": 57
417
- },
418
- {
419
- "epoch": 0.3619344773790952,
420
- "grad_norm": 0.48144450783729553,
421
- "learning_rate": 1.16e-05,
422
- "loss": 1.0025,
423
- "step": 58
424
- },
425
- {
426
- "epoch": 0.36817472698907955,
427
- "grad_norm": 0.5385976433753967,
428
- "learning_rate": 1.18e-05,
429
- "loss": 0.9976,
430
- "step": 59
431
- },
432
- {
433
- "epoch": 0.374414976599064,
434
- "grad_norm": 0.5179689526557922,
435
- "learning_rate": 1.2e-05,
436
- "loss": 0.9988,
437
- "step": 60
438
- },
439
- {
440
- "epoch": 0.38065522620904835,
441
- "grad_norm": 0.4646259844303131,
442
- "learning_rate": 1.22e-05,
443
- "loss": 0.9959,
444
- "step": 61
445
- },
446
- {
447
- "epoch": 0.3868954758190328,
448
- "grad_norm": 0.5259431004524231,
449
- "learning_rate": 1.2400000000000002e-05,
450
- "loss": 0.9933,
451
- "step": 62
452
- },
453
- {
454
- "epoch": 0.39313572542901715,
455
- "grad_norm": 0.5602505803108215,
456
- "learning_rate": 1.2600000000000001e-05,
457
- "loss": 0.9732,
458
- "step": 63
459
- },
460
- {
461
- "epoch": 0.3993759750390016,
462
- "grad_norm": 0.5400233864784241,
463
- "learning_rate": 1.2800000000000001e-05,
464
- "loss": 1.0021,
465
- "step": 64
466
- },
467
- {
468
- "epoch": 0.40561622464898595,
469
- "grad_norm": 0.5008605718612671,
470
- "learning_rate": 1.3000000000000001e-05,
471
- "loss": 1.0098,
472
- "step": 65
473
- },
474
- {
475
- "epoch": 0.4118564742589704,
476
- "grad_norm": 0.5215092301368713,
477
- "learning_rate": 1.3200000000000002e-05,
478
- "loss": 0.9805,
479
- "step": 66
480
- },
481
- {
482
- "epoch": 0.41809672386895474,
483
- "grad_norm": 0.6043874025344849,
484
- "learning_rate": 1.3400000000000002e-05,
485
- "loss": 0.9651,
486
- "step": 67
487
- },
488
- {
489
- "epoch": 0.4243369734789392,
490
- "grad_norm": 0.5744293928146362,
491
- "learning_rate": 1.3600000000000002e-05,
492
- "loss": 0.9715,
493
- "step": 68
494
- },
495
- {
496
- "epoch": 0.43057722308892354,
497
- "grad_norm": 0.6228943467140198,
498
- "learning_rate": 1.38e-05,
499
- "loss": 0.9942,
500
- "step": 69
501
- },
502
- {
503
- "epoch": 0.43681747269890797,
504
- "grad_norm": 0.6340550780296326,
505
- "learning_rate": 1.4e-05,
506
- "loss": 1.0278,
507
- "step": 70
508
- },
509
- {
510
- "epoch": 0.44305772230889234,
511
- "grad_norm": 0.6537193655967712,
512
- "learning_rate": 1.4200000000000001e-05,
513
- "loss": 1.005,
514
- "step": 71
515
- },
516
- {
517
- "epoch": 0.44929797191887677,
518
- "grad_norm": 0.6706846356391907,
519
- "learning_rate": 1.4400000000000001e-05,
520
- "loss": 0.9736,
521
- "step": 72
522
- },
523
- {
524
- "epoch": 0.45553822152886114,
525
- "grad_norm": 0.5686175227165222,
526
- "learning_rate": 1.46e-05,
527
- "loss": 0.9753,
528
- "step": 73
529
- },
530
- {
531
- "epoch": 0.46177847113884557,
532
- "grad_norm": 0.5182248950004578,
533
- "learning_rate": 1.48e-05,
534
- "loss": 0.9964,
535
- "step": 74
536
- },
537
- {
538
- "epoch": 0.46801872074882994,
539
- "grad_norm": 0.5445067286491394,
540
- "learning_rate": 1.5000000000000002e-05,
541
- "loss": 0.9702,
542
- "step": 75
543
- },
544
- {
545
- "epoch": 0.47425897035881437,
546
- "grad_norm": 0.6168459057807922,
547
- "learning_rate": 1.5200000000000002e-05,
548
- "loss": 0.9791,
549
- "step": 76
550
- },
551
- {
552
- "epoch": 0.48049921996879874,
553
- "grad_norm": 0.6475315093994141,
554
- "learning_rate": 1.54e-05,
555
- "loss": 0.98,
556
- "step": 77
557
- },
558
- {
559
- "epoch": 0.48673946957878317,
560
- "grad_norm": 0.8365716934204102,
561
- "learning_rate": 1.5600000000000003e-05,
562
- "loss": 0.987,
563
- "step": 78
564
- },
565
- {
566
- "epoch": 0.49297971918876754,
567
- "grad_norm": 1.0882554054260254,
568
- "learning_rate": 1.58e-05,
569
- "loss": 0.9796,
570
- "step": 79
571
- },
572
- {
573
- "epoch": 0.49921996879875197,
574
- "grad_norm": 1.109529972076416,
575
- "learning_rate": 1.6000000000000003e-05,
576
- "loss": 0.9825,
577
- "step": 80
578
- },
579
- {
580
- "epoch": 0.49921996879875197,
581
- "eval_loss": 0.9798125624656677,
582
- "eval_runtime": 132.8615,
583
- "eval_samples_per_second": 101.993,
584
- "eval_steps_per_second": 6.375,
585
- "step": 80
586
- },
587
- {
588
- "epoch": 0.5054602184087363,
589
- "grad_norm": 0.9999867677688599,
590
- "learning_rate": 1.62e-05,
591
- "loss": 0.9591,
592
- "step": 81
593
- },
594
- {
595
- "epoch": 0.5117004680187207,
596
- "grad_norm": 0.8042426705360413,
597
- "learning_rate": 1.64e-05,
598
- "loss": 0.9832,
599
- "step": 82
600
- },
601
- {
602
- "epoch": 0.5179407176287052,
603
- "grad_norm": 0.5843170881271362,
604
- "learning_rate": 1.66e-05,
605
- "loss": 0.9769,
606
- "step": 83
607
- },
608
- {
609
- "epoch": 0.5241809672386896,
610
- "grad_norm": 0.6988096237182617,
611
- "learning_rate": 1.6800000000000002e-05,
612
- "loss": 0.9556,
613
- "step": 84
614
- },
615
- {
616
- "epoch": 0.5304212168486739,
617
- "grad_norm": 0.7298963665962219,
618
- "learning_rate": 1.7e-05,
619
- "loss": 0.983,
620
- "step": 85
621
- },
622
- {
623
- "epoch": 0.5366614664586583,
624
- "grad_norm": 0.7856214046478271,
625
- "learning_rate": 1.72e-05,
626
- "loss": 0.9541,
627
- "step": 86
628
- },
629
- {
630
- "epoch": 0.5429017160686428,
631
- "grad_norm": 0.6896259188652039,
632
- "learning_rate": 1.7400000000000003e-05,
633
- "loss": 0.983,
634
- "step": 87
635
- },
636
- {
637
- "epoch": 0.5491419656786272,
638
- "grad_norm": 0.5555576682090759,
639
- "learning_rate": 1.76e-05,
640
- "loss": 0.9395,
641
- "step": 88
642
- },
643
- {
644
- "epoch": 0.5553822152886115,
645
- "grad_norm": 0.6107622385025024,
646
- "learning_rate": 1.7800000000000002e-05,
647
- "loss": 0.9601,
648
- "step": 89
649
- },
650
- {
651
- "epoch": 0.5616224648985959,
652
- "grad_norm": 0.7116836309432983,
653
- "learning_rate": 1.8e-05,
654
- "loss": 0.9647,
655
- "step": 90
656
- },
657
- {
658
- "epoch": 0.5678627145085804,
659
- "grad_norm": 0.5782715082168579,
660
- "learning_rate": 1.8200000000000002e-05,
661
- "loss": 0.9604,
662
- "step": 91
663
- },
664
- {
665
- "epoch": 0.5741029641185648,
666
- "grad_norm": 0.512100338935852,
667
- "learning_rate": 1.8400000000000003e-05,
668
- "loss": 0.9433,
669
- "step": 92
670
- },
671
- {
672
- "epoch": 0.5803432137285491,
673
- "grad_norm": 0.6315212249755859,
674
- "learning_rate": 1.86e-05,
675
- "loss": 0.969,
676
- "step": 93
677
- },
678
- {
679
- "epoch": 0.5865834633385335,
680
- "grad_norm": 0.6883739233016968,
681
- "learning_rate": 1.88e-05,
682
- "loss": 0.9832,
683
- "step": 94
684
- },
685
- {
686
- "epoch": 0.592823712948518,
687
- "grad_norm": 0.6760767102241516,
688
- "learning_rate": 1.9e-05,
689
- "loss": 0.9414,
690
- "step": 95
691
- },
692
- {
693
- "epoch": 0.5990639625585024,
694
- "grad_norm": 0.6577237248420715,
695
- "learning_rate": 1.9200000000000003e-05,
696
- "loss": 0.9748,
697
- "step": 96
698
- },
699
- {
700
- "epoch": 0.6053042121684867,
701
- "grad_norm": 0.9515029788017273,
702
- "learning_rate": 1.94e-05,
703
- "loss": 0.9863,
704
- "step": 97
705
- },
706
- {
707
- "epoch": 0.6115444617784711,
708
- "grad_norm": 1.40570068359375,
709
- "learning_rate": 1.9600000000000002e-05,
710
- "loss": 0.9445,
711
- "step": 98
712
- },
713
- {
714
- "epoch": 0.6177847113884556,
715
- "grad_norm": 0.9026833176612854,
716
- "learning_rate": 1.98e-05,
717
- "loss": 0.9436,
718
- "step": 99
719
- },
720
- {
721
- "epoch": 0.62402496099844,
722
- "grad_norm": 0.6666714549064636,
723
- "learning_rate": 2e-05,
724
- "loss": 0.9832,
725
- "step": 100
726
- },
727
- {
728
- "epoch": 0.6302652106084243,
729
- "grad_norm": 0.8047837018966675,
730
- "learning_rate": 1.9998980430094333e-05,
731
- "loss": 0.9498,
732
- "step": 101
733
- },
734
- {
735
- "epoch": 0.6365054602184087,
736
- "grad_norm": 0.9035269618034363,
737
- "learning_rate": 1.9995921928281893e-05,
738
- "loss": 0.9541,
739
- "step": 102
740
- },
741
- {
742
- "epoch": 0.6427457098283932,
743
- "grad_norm": 1.027601718902588,
744
- "learning_rate": 1.9990825118233958e-05,
745
- "loss": 0.9786,
746
- "step": 103
747
- },
748
- {
749
- "epoch": 0.6489859594383776,
750
- "grad_norm": 1.1459457874298096,
751
- "learning_rate": 1.9983691039261358e-05,
752
- "loss": 0.9482,
753
- "step": 104
754
- },
755
- {
756
- "epoch": 0.6552262090483619,
757
- "grad_norm": 0.7179874777793884,
758
- "learning_rate": 1.9974521146102535e-05,
759
- "loss": 0.9743,
760
- "step": 105
761
- },
762
- {
763
- "epoch": 0.6614664586583463,
764
- "grad_norm": 0.6881632208824158,
765
- "learning_rate": 1.9963317308626916e-05,
766
- "loss": 0.9797,
767
- "step": 106
768
- },
769
- {
770
- "epoch": 0.6677067082683308,
771
- "grad_norm": 0.7822304368019104,
772
- "learning_rate": 1.9950081811453598e-05,
773
- "loss": 0.9682,
774
- "step": 107
775
- },
776
- {
777
- "epoch": 0.6739469578783152,
778
- "grad_norm": 0.8269001841545105,
779
- "learning_rate": 1.99348173534855e-05,
780
- "loss": 0.9455,
781
- "step": 108
782
- },
783
- {
784
- "epoch": 0.6801872074882995,
785
- "grad_norm": 0.8077254295349121,
786
- "learning_rate": 1.991752704735903e-05,
787
- "loss": 0.9243,
788
- "step": 109
789
- },
790
- {
791
- "epoch": 0.6864274570982839,
792
- "grad_norm": 0.8119699954986572,
793
- "learning_rate": 1.989821441880933e-05,
794
- "loss": 0.9273,
795
- "step": 110
796
- },
797
- {
798
- "epoch": 0.6926677067082684,
799
- "grad_norm": 0.8220670223236084,
800
- "learning_rate": 1.9876883405951378e-05,
801
- "loss": 0.9455,
802
- "step": 111
803
- },
804
- {
805
- "epoch": 0.6989079563182528,
806
- "grad_norm": 0.8622007966041565,
807
- "learning_rate": 1.9853538358476933e-05,
808
- "loss": 0.9624,
809
- "step": 112
810
- },
811
- {
812
- "epoch": 0.7051482059282371,
813
- "grad_norm": 0.8222960233688354,
814
- "learning_rate": 1.9828184036767556e-05,
815
- "loss": 0.955,
816
- "step": 113
817
- },
818
- {
819
- "epoch": 0.7113884555382215,
820
- "grad_norm": 0.62811678647995,
821
- "learning_rate": 1.9800825610923937e-05,
822
- "loss": 0.9551,
823
- "step": 114
824
- },
825
- {
826
- "epoch": 0.717628705148206,
827
- "grad_norm": 0.7614508271217346,
828
- "learning_rate": 1.9771468659711595e-05,
829
- "loss": 0.9413,
830
- "step": 115
831
- },
832
- {
833
- "epoch": 0.7238689547581904,
834
- "grad_norm": 0.6695716977119446,
835
- "learning_rate": 1.9740119169423337e-05,
836
- "loss": 0.9384,
837
- "step": 116
838
- },
839
- {
840
- "epoch": 0.7301092043681747,
841
- "grad_norm": 0.5493482947349548,
842
- "learning_rate": 1.9706783532658528e-05,
843
- "loss": 0.9601,
844
- "step": 117
845
- },
846
- {
847
- "epoch": 0.7363494539781591,
848
- "grad_norm": 0.7798200249671936,
849
- "learning_rate": 1.9671468547019575e-05,
850
- "loss": 0.9555,
851
- "step": 118
852
- },
853
- {
854
- "epoch": 0.7425897035881436,
855
- "grad_norm": 0.8122205138206482,
856
- "learning_rate": 1.963418141372579e-05,
857
- "loss": 0.9351,
858
- "step": 119
859
- },
860
- {
861
- "epoch": 0.748829953198128,
862
- "grad_norm": 0.6351688504219055,
863
- "learning_rate": 1.9594929736144978e-05,
864
- "loss": 0.9517,
865
- "step": 120
866
- },
867
- {
868
- "epoch": 0.7550702028081123,
869
- "grad_norm": 0.8507185578346252,
870
- "learning_rate": 1.955372151824297e-05,
871
- "loss": 0.9482,
872
- "step": 121
873
- },
874
- {
875
- "epoch": 0.7613104524180967,
876
- "grad_norm": 1.057692050933838,
877
- "learning_rate": 1.9510565162951538e-05,
878
- "loss": 0.9626,
879
- "step": 122
880
- },
881
- {
882
- "epoch": 0.7675507020280812,
883
- "grad_norm": 0.789968729019165,
884
- "learning_rate": 1.94654694704549e-05,
885
- "loss": 0.9504,
886
- "step": 123
887
- },
888
- {
889
- "epoch": 0.7737909516380655,
890
- "grad_norm": 0.8988214731216431,
891
- "learning_rate": 1.941844363639525e-05,
892
- "loss": 0.9339,
893
- "step": 124
894
- },
895
- {
896
- "epoch": 0.7800312012480499,
897
- "grad_norm": 0.6798993945121765,
898
- "learning_rate": 1.936949724999762e-05,
899
- "loss": 0.9387,
900
- "step": 125
901
- },
902
- {
903
- "epoch": 0.7862714508580343,
904
- "grad_norm": 0.7597091794013977,
905
- "learning_rate": 1.9318640292114526e-05,
906
- "loss": 0.9884,
907
- "step": 126
908
- },
909
- {
910
- "epoch": 0.7925117004680188,
911
- "grad_norm": 0.9357583522796631,
912
- "learning_rate": 1.9265883133190715e-05,
913
- "loss": 0.9382,
914
- "step": 127
915
- },
916
- {
917
- "epoch": 0.7987519500780031,
918
- "grad_norm": 0.8738594055175781,
919
- "learning_rate": 1.92112365311485e-05,
920
- "loss": 0.9482,
921
- "step": 128
922
- },
923
- {
924
- "epoch": 0.8049921996879875,
925
- "grad_norm": 0.8523539900779724,
926
- "learning_rate": 1.9154711629194062e-05,
927
- "loss": 0.9299,
928
- "step": 129
929
- },
930
- {
931
- "epoch": 0.8112324492979719,
932
- "grad_norm": 0.5781116485595703,
933
- "learning_rate": 1.9096319953545186e-05,
934
- "loss": 0.9636,
935
- "step": 130
936
- },
937
- {
938
- "epoch": 0.8174726989079563,
939
- "grad_norm": 0.7737751007080078,
940
- "learning_rate": 1.9036073411080917e-05,
941
- "loss": 0.9482,
942
- "step": 131
943
- },
944
- {
945
- "epoch": 0.8237129485179407,
946
- "grad_norm": 0.7203546762466431,
947
- "learning_rate": 1.8973984286913584e-05,
948
- "loss": 0.9298,
949
- "step": 132
950
- },
951
- {
952
- "epoch": 0.8299531981279251,
953
- "grad_norm": 0.5875493288040161,
954
- "learning_rate": 1.891006524188368e-05,
955
- "loss": 0.9239,
956
- "step": 133
957
- },
958
- {
959
- "epoch": 0.8361934477379095,
960
- "grad_norm": 0.7981539964675903,
961
- "learning_rate": 1.8844329309978146e-05,
962
- "loss": 0.9546,
963
- "step": 134
964
- },
965
- {
966
- "epoch": 0.8424336973478939,
967
- "grad_norm": 0.7623902559280396,
968
- "learning_rate": 1.8776789895672557e-05,
969
- "loss": 0.9335,
970
- "step": 135
971
- },
972
- {
973
- "epoch": 0.8486739469578783,
974
- "grad_norm": 0.6350914239883423,
975
- "learning_rate": 1.8707460771197773e-05,
976
- "loss": 0.9585,
977
- "step": 136
978
- },
979
- {
980
- "epoch": 0.8549141965678627,
981
- "grad_norm": 0.6981391310691833,
982
- "learning_rate": 1.863635607373157e-05,
983
- "loss": 0.9271,
984
- "step": 137
985
- },
986
- {
987
- "epoch": 0.8611544461778471,
988
- "grad_norm": 0.7900795936584473,
989
- "learning_rate": 1.856349030251589e-05,
990
- "loss": 0.9022,
991
- "step": 138
992
- },
993
- {
994
- "epoch": 0.8673946957878315,
995
- "grad_norm": 0.7494855523109436,
996
- "learning_rate": 1.8488878315900228e-05,
997
- "loss": 0.9534,
998
- "step": 139
999
- },
1000
- {
1001
- "epoch": 0.8736349453978159,
1002
- "grad_norm": 0.5757277011871338,
1003
- "learning_rate": 1.8412535328311813e-05,
1004
- "loss": 0.9397,
1005
- "step": 140
1006
- },
1007
- {
1008
- "epoch": 0.8798751950078003,
1009
- "grad_norm": 0.6893640756607056,
1010
- "learning_rate": 1.8334476907153177e-05,
1011
- "loss": 0.952,
1012
- "step": 141
1013
- },
1014
- {
1015
- "epoch": 0.8861154446177847,
1016
- "grad_norm": 0.7050842046737671,
1017
- "learning_rate": 1.825471896962774e-05,
1018
- "loss": 0.9417,
1019
- "step": 142
1020
- },
1021
- {
1022
- "epoch": 0.8923556942277691,
1023
- "grad_norm": 0.5544989109039307,
1024
- "learning_rate": 1.817327777949407e-05,
1025
- "loss": 0.9008,
1026
- "step": 143
1027
- },
1028
- {
1029
- "epoch": 0.8985959438377535,
1030
- "grad_norm": 0.6469840407371521,
1031
- "learning_rate": 1.8090169943749477e-05,
1032
- "loss": 0.9471,
1033
- "step": 144
1034
- },
1035
- {
1036
- "epoch": 0.9048361934477379,
1037
- "grad_norm": 0.6894209384918213,
1038
- "learning_rate": 1.8005412409243604e-05,
1039
- "loss": 0.9553,
1040
- "step": 145
1041
- },
1042
- {
1043
- "epoch": 0.9110764430577223,
1044
- "grad_norm": 0.5356501936912537,
1045
- "learning_rate": 1.7919022459222754e-05,
1046
- "loss": 0.9496,
1047
- "step": 146
1048
- },
1049
- {
1050
- "epoch": 0.9173166926677067,
1051
- "grad_norm": 0.6416233777999878,
1052
- "learning_rate": 1.7831017709805555e-05,
1053
- "loss": 0.9558,
1054
- "step": 147
1055
- },
1056
- {
1057
- "epoch": 0.9235569422776911,
1058
- "grad_norm": 0.7085059881210327,
1059
- "learning_rate": 1.7741416106390828e-05,
1060
- "loss": 0.9168,
1061
- "step": 148
1062
- },
1063
- {
1064
- "epoch": 0.9297971918876755,
1065
- "grad_norm": 0.6492967009544373,
1066
- "learning_rate": 1.7650235919998234e-05,
1067
- "loss": 0.9065,
1068
- "step": 149
1069
- },
1070
- {
1071
- "epoch": 0.9360374414976599,
1072
- "grad_norm": 0.7753322124481201,
1073
- "learning_rate": 1.7557495743542586e-05,
1074
- "loss": 0.9285,
1075
- "step": 150
1076
- },
1077
- {
1078
- "epoch": 0.9422776911076443,
1079
- "grad_norm": 0.6451005935668945,
1080
- "learning_rate": 1.7463214488042472e-05,
1081
- "loss": 0.9567,
1082
- "step": 151
1083
- },
1084
- {
1085
- "epoch": 0.9485179407176287,
1086
- "grad_norm": 0.4824198782444,
1087
- "learning_rate": 1.736741137876405e-05,
1088
- "loss": 0.905,
1089
- "step": 152
1090
- },
1091
- {
1092
- "epoch": 0.9547581903276131,
1093
- "grad_norm": 0.5846424102783203,
1094
- "learning_rate": 1.727010595130074e-05,
1095
- "loss": 0.9426,
1096
- "step": 153
1097
- },
1098
- {
1099
- "epoch": 0.9609984399375975,
1100
- "grad_norm": 0.5984904170036316,
1101
- "learning_rate": 1.7171318047589637e-05,
1102
- "loss": 0.9398,
1103
- "step": 154
1104
- },
1105
- {
1106
- "epoch": 0.9672386895475819,
1107
- "grad_norm": 0.545465886592865,
1108
- "learning_rate": 1.7071067811865477e-05,
1109
- "loss": 0.9185,
1110
- "step": 155
1111
- },
1112
- {
1113
- "epoch": 0.9734789391575663,
1114
- "grad_norm": 0.5724261403083801,
1115
- "learning_rate": 1.696937568655294e-05,
1116
- "loss": 0.9222,
1117
- "step": 156
1118
- },
1119
- {
1120
- "epoch": 0.9797191887675507,
1121
- "grad_norm": 0.5120018124580383,
1122
- "learning_rate": 1.6866262408098134e-05,
1123
- "loss": 0.93,
1124
- "step": 157
1125
- },
1126
- {
1127
- "epoch": 0.9859594383775351,
1128
- "grad_norm": 0.5575640797615051,
1129
- "learning_rate": 1.6761749002740195e-05,
1130
- "loss": 0.9483,
1131
- "step": 158
1132
- },
1133
- {
1134
- "epoch": 0.9921996879875195,
1135
- "grad_norm": 0.603184163570404,
1136
- "learning_rate": 1.6655856782223682e-05,
1137
- "loss": 0.9394,
1138
- "step": 159
1139
- },
1140
- {
1141
- "epoch": 0.9984399375975039,
1142
- "grad_norm": 0.536756157875061,
1143
- "learning_rate": 1.6548607339452853e-05,
1144
- "loss": 0.9227,
1145
- "step": 160
1146
- },
1147
- {
1148
- "epoch": 0.9984399375975039,
1149
- "eval_loss": 0.9286661744117737,
1150
- "eval_runtime": 134.8358,
1151
- "eval_samples_per_second": 100.5,
1152
- "eval_steps_per_second": 6.282,
1153
- "step": 160
1154
- },
1155
- {
1156
- "epoch": 1.0046801872074882,
1157
- "grad_norm": 0.5680767297744751,
1158
- "learning_rate": 1.6440022544088553e-05,
1159
- "loss": 0.9263,
1160
- "step": 161
1161
- },
1162
- {
1163
- "epoch": 1.0109204368174727,
1164
- "grad_norm": 0.6374309062957764,
1165
- "learning_rate": 1.6330124538088705e-05,
1166
- "loss": 0.942,
1167
- "step": 162
1168
- },
1169
- {
1170
- "epoch": 1.0171606864274572,
1171
- "grad_norm": 0.5749344825744629,
1172
- "learning_rate": 1.6218935731193223e-05,
1173
- "loss": 0.9264,
1174
- "step": 163
1175
- },
1176
- {
1177
- "epoch": 1.0046801872074882,
1178
- "grad_norm": 0.9867531657218933,
1179
- "learning_rate": 1.6106478796354382e-05,
1180
- "loss": 0.8513,
1181
- "step": 164
1182
- },
1183
- {
1184
- "epoch": 1.0109204368174727,
1185
- "grad_norm": 0.7155735492706299,
1186
- "learning_rate": 1.599277666511347e-05,
1187
- "loss": 0.8023,
1188
- "step": 165
1189
- },
1190
- {
1191
- "epoch": 1.0171606864274572,
1192
- "grad_norm": 1.3990559577941895,
1193
- "learning_rate": 1.5877852522924733e-05,
1194
- "loss": 0.7992,
1195
- "step": 166
1196
- },
1197
- {
1198
- "epoch": 1.0234009360374414,
1199
- "grad_norm": 0.9339443445205688,
1200
- "learning_rate": 1.576172980442753e-05,
1201
- "loss": 0.7982,
1202
- "step": 167
1203
- },
1204
- {
1205
- "epoch": 1.029641185647426,
1206
- "grad_norm": 0.8383383750915527,
1207
- "learning_rate": 1.5644432188667695e-05,
1208
- "loss": 0.7764,
1209
- "step": 168
1210
- },
1211
- {
1212
- "epoch": 1.0358814352574104,
1213
- "grad_norm": 0.8227719664573669,
1214
- "learning_rate": 1.5525983594269026e-05,
1215
- "loss": 0.7985,
1216
- "step": 169
1217
- },
1218
- {
1219
- "epoch": 1.0421216848673946,
1220
- "grad_norm": 0.8231950998306274,
1221
- "learning_rate": 1.5406408174555978e-05,
1222
- "loss": 0.8097,
1223
- "step": 170
1224
- },
1225
- {
1226
- "epoch": 1.0483619344773791,
1227
- "grad_norm": 1.1454997062683105,
1228
- "learning_rate": 1.528573031262842e-05,
1229
- "loss": 0.7779,
1230
- "step": 171
1231
- },
1232
- {
1233
- "epoch": 1.0546021840873634,
1234
- "grad_norm": 0.6939067840576172,
1235
- "learning_rate": 1.5163974616389621e-05,
1236
- "loss": 0.8217,
1237
- "step": 172
1238
- },
1239
- {
1240
- "epoch": 1.0608424336973479,
1241
- "grad_norm": 0.7526265978813171,
1242
- "learning_rate": 1.504116591352832e-05,
1243
- "loss": 0.7886,
1244
- "step": 173
1245
- },
1246
- {
1247
- "epoch": 1.0670826833073324,
1248
- "grad_norm": 0.7892379760742188,
1249
- "learning_rate": 1.491732924645604e-05,
1250
- "loss": 0.7959,
1251
- "step": 174
1252
- },
1253
- {
1254
- "epoch": 1.0733229329173166,
1255
- "grad_norm": 0.7279461026191711,
1256
- "learning_rate": 1.479248986720057e-05,
1257
- "loss": 0.7977,
1258
- "step": 175
1259
- },
1260
- {
1261
- "epoch": 1.079563182527301,
1262
- "grad_norm": 0.7360721230506897,
1263
- "learning_rate": 1.4666673232256738e-05,
1264
- "loss": 0.7883,
1265
- "step": 176
1266
- },
1267
- {
1268
- "epoch": 1.0858034321372856,
1269
- "grad_norm": 0.6525989174842834,
1270
- "learning_rate": 1.4539904997395468e-05,
1271
- "loss": 0.785,
1272
- "step": 177
1273
- },
1274
- {
1275
- "epoch": 1.0920436817472698,
1276
- "grad_norm": 0.7803720235824585,
1277
- "learning_rate": 1.4412211012432213e-05,
1278
- "loss": 0.7998,
1279
- "step": 178
1280
- },
1281
- {
1282
- "epoch": 1.0982839313572543,
1283
- "grad_norm": 0.6588256359100342,
1284
- "learning_rate": 1.4283617315955815e-05,
1285
- "loss": 0.7999,
1286
- "step": 179
1287
- },
1288
- {
1289
- "epoch": 1.1045241809672386,
1290
- "grad_norm": 0.5983767509460449,
1291
- "learning_rate": 1.4154150130018867e-05,
1292
- "loss": 0.7848,
1293
- "step": 180
1294
- },
1295
- {
1296
- "epoch": 1.110764430577223,
1297
- "grad_norm": 0.641603946685791,
1298
- "learning_rate": 1.4023835854790682e-05,
1299
- "loss": 0.7937,
1300
- "step": 181
1301
- },
1302
- {
1303
- "epoch": 1.1170046801872076,
1304
- "grad_norm": 0.6453792452812195,
1305
- "learning_rate": 1.3892701063173917e-05,
1306
- "loss": 0.8004,
1307
- "step": 182
1308
- },
1309
- {
1310
- "epoch": 1.1232449297971918,
1311
- "grad_norm": 0.6428067088127136,
1312
- "learning_rate": 1.3760772495385998e-05,
1313
- "loss": 0.792,
1314
- "step": 183
1315
- },
1316
- {
1317
- "epoch": 1.1294851794071763,
1318
- "grad_norm": 0.6279442310333252,
1319
- "learning_rate": 1.362807705350641e-05,
1320
- "loss": 0.7859,
1321
- "step": 184
1322
- },
1323
- {
1324
- "epoch": 1.1357254290171608,
1325
- "grad_norm": 0.6000891327857971,
1326
- "learning_rate": 1.3494641795990986e-05,
1327
- "loss": 0.8039,
1328
- "step": 185
1329
- },
1330
- {
1331
- "epoch": 1.141965678627145,
1332
- "grad_norm": 0.6628398895263672,
1333
- "learning_rate": 1.3360493932154301e-05,
1334
- "loss": 0.7829,
1335
- "step": 186
1336
- },
1337
- {
1338
- "epoch": 1.1482059282371295,
1339
- "grad_norm": 0.6268762946128845,
1340
- "learning_rate": 1.3225660816621342e-05,
1341
- "loss": 0.778,
1342
- "step": 187
1343
- },
1344
- {
1345
- "epoch": 1.154446177847114,
1346
- "grad_norm": 0.639062225818634,
1347
- "learning_rate": 1.3090169943749475e-05,
1348
- "loss": 0.7796,
1349
- "step": 188
1350
- },
1351
- {
1352
- "epoch": 1.1606864274570983,
1353
- "grad_norm": 0.6048714518547058,
1354
- "learning_rate": 1.2954048942022002e-05,
1355
- "loss": 0.7883,
1356
- "step": 189
1357
- },
1358
- {
1359
- "epoch": 1.1669266770670828,
1360
- "grad_norm": 0.5929723381996155,
1361
- "learning_rate": 1.2817325568414299e-05,
1362
- "loss": 0.7736,
1363
- "step": 190
1364
- },
1365
- {
1366
- "epoch": 1.173166926677067,
1367
- "grad_norm": 0.5971985459327698,
1368
- "learning_rate": 1.2680027702733791e-05,
1369
- "loss": 0.8008,
1370
- "step": 191
1371
- },
1372
- {
1373
- "epoch": 1.1794071762870515,
1374
- "grad_norm": 0.6460970044136047,
1375
- "learning_rate": 1.2542183341934873e-05,
1376
- "loss": 0.7867,
1377
- "step": 192
1378
- },
1379
- {
1380
- "epoch": 1.185647425897036,
1381
- "grad_norm": 0.5345771908760071,
1382
- "learning_rate": 1.2403820594409926e-05,
1383
- "loss": 0.7808,
1384
- "step": 193
1385
- },
1386
- {
1387
- "epoch": 1.1918876755070202,
1388
- "grad_norm": 0.6704164743423462,
1389
- "learning_rate": 1.2264967674257647e-05,
1390
- "loss": 0.7785,
1391
- "step": 194
1392
- },
1393
- {
1394
- "epoch": 1.1981279251170047,
1395
- "grad_norm": 0.5182461738586426,
1396
- "learning_rate": 1.2125652895529766e-05,
1397
- "loss": 0.7907,
1398
- "step": 195
1399
- },
1400
- {
1401
- "epoch": 1.204368174726989,
1402
- "grad_norm": 0.6421562433242798,
1403
- "learning_rate": 1.1985904666457455e-05,
1404
- "loss": 0.7914,
1405
- "step": 196
1406
- },
1407
- {
1408
- "epoch": 1.2106084243369735,
1409
- "grad_norm": 0.5846896171569824,
1410
- "learning_rate": 1.1845751483658454e-05,
1411
- "loss": 0.7631,
1412
- "step": 197
1413
- },
1414
- {
1415
- "epoch": 1.216848673946958,
1416
- "grad_norm": 0.5582466721534729,
1417
- "learning_rate": 1.170522192632624e-05,
1418
- "loss": 0.7912,
1419
- "step": 198
1420
- },
1421
- {
1422
- "epoch": 1.2230889235569422,
1423
- "grad_norm": 0.5527791976928711,
1424
- "learning_rate": 1.156434465040231e-05,
1425
- "loss": 0.7938,
1426
- "step": 199
1427
- },
1428
- {
1429
- "epoch": 1.2293291731669267,
1430
- "grad_norm": 0.5673221945762634,
1431
- "learning_rate": 1.1423148382732854e-05,
1432
- "loss": 0.7947,
1433
- "step": 200
1434
- },
1435
- {
1436
- "epoch": 1.2355694227769112,
1437
- "grad_norm": 0.5078392028808594,
1438
- "learning_rate": 1.1281661915210931e-05,
1439
- "loss": 0.7771,
1440
- "step": 201
1441
- },
1442
- {
1443
- "epoch": 1.2418096723868954,
1444
- "grad_norm": 0.5475752353668213,
1445
- "learning_rate": 1.1139914098905406e-05,
1446
- "loss": 0.7781,
1447
- "step": 202
1448
- },
1449
- {
1450
- "epoch": 1.24804992199688,
1451
- "grad_norm": 0.5290600657463074,
1452
- "learning_rate": 1.0997933838177828e-05,
1453
- "loss": 0.7622,
1454
- "step": 203
1455
- },
1456
- {
1457
- "epoch": 1.2542901716068644,
1458
- "grad_norm": 0.4957723915576935,
1459
- "learning_rate": 1.08557500847884e-05,
1460
- "loss": 0.7857,
1461
- "step": 204
1462
- },
1463
- {
1464
- "epoch": 1.2605304212168487,
1465
- "grad_norm": 0.5119233727455139,
1466
- "learning_rate": 1.0713391831992324e-05,
1467
- "loss": 0.7585,
1468
- "step": 205
1469
- },
1470
- {
1471
- "epoch": 1.2667706708268331,
1472
- "grad_norm": 0.5187195539474487,
1473
- "learning_rate": 1.0570888108627682e-05,
1474
- "loss": 0.7885,
1475
- "step": 206
1476
- },
1477
- {
1478
- "epoch": 1.2730109204368174,
1479
- "grad_norm": 0.5066515803337097,
1480
- "learning_rate": 1.0428267973196027e-05,
1481
- "loss": 0.7691,
1482
- "step": 207
1483
- },
1484
- {
1485
- "epoch": 1.2792511700468019,
1486
- "grad_norm": 0.48673221468925476,
1487
- "learning_rate": 1.0285560507936962e-05,
1488
- "loss": 0.7715,
1489
- "step": 208
1490
- },
1491
- {
1492
- "epoch": 1.2854914196567861,
1493
- "grad_norm": 0.5083721876144409,
1494
- "learning_rate": 1.0142794812897874e-05,
1495
- "loss": 0.7812,
1496
- "step": 209
1497
- },
1498
- {
1499
- "epoch": 1.2917316692667706,
1500
- "grad_norm": 0.5033391118049622,
1501
- "learning_rate": 1e-05,
1502
- "loss": 0.7756,
1503
- "step": 210
1504
- },
1505
- {
1506
- "epoch": 1.2979719188767551,
1507
- "grad_norm": 0.532008945941925,
1508
- "learning_rate": 9.85720518710213e-06,
1509
- "loss": 0.7898,
1510
- "step": 211
1511
- },
1512
- {
1513
- "epoch": 1.3042121684867394,
1514
- "grad_norm": 0.5123456716537476,
1515
- "learning_rate": 9.71443949206304e-06,
1516
- "loss": 0.7779,
1517
- "step": 212
1518
- },
1519
- {
1520
- "epoch": 1.3104524180967239,
1521
- "grad_norm": 0.48444995284080505,
1522
- "learning_rate": 9.571732026803978e-06,
1523
- "loss": 0.7598,
1524
- "step": 213
1525
- },
1526
- {
1527
- "epoch": 1.3166926677067083,
1528
- "grad_norm": 0.5265589356422424,
1529
- "learning_rate": 9.42911189137232e-06,
1530
- "loss": 0.783,
1531
- "step": 214
1532
- },
1533
- {
1534
- "epoch": 1.3229329173166926,
1535
- "grad_norm": 0.5039641261100769,
1536
- "learning_rate": 9.286608168007678e-06,
1537
- "loss": 0.7798,
1538
- "step": 215
1539
- },
1540
- {
1541
- "epoch": 1.329173166926677,
1542
- "grad_norm": 0.5092752575874329,
1543
- "learning_rate": 9.144249915211605e-06,
1544
- "loss": 0.7635,
1545
- "step": 216
1546
- },
1547
- {
1548
- "epoch": 1.3354134165366616,
1549
- "grad_norm": 0.5394583940505981,
1550
- "learning_rate": 9.002066161822174e-06,
1551
- "loss": 0.7999,
1552
- "step": 217
1553
- },
1554
- {
1555
- "epoch": 1.3416536661466458,
1556
- "grad_norm": 0.5039361119270325,
1557
- "learning_rate": 8.860085901094595e-06,
1558
- "loss": 0.7745,
1559
- "step": 218
1560
- },
1561
- {
1562
- "epoch": 1.3478939157566303,
1563
- "grad_norm": 0.5087692737579346,
1564
- "learning_rate": 8.718338084789074e-06,
1565
- "loss": 0.8093,
1566
- "step": 219
1567
- },
1568
- {
1569
- "epoch": 1.3541341653666148,
1570
- "grad_norm": 0.5368974208831787,
1571
- "learning_rate": 8.576851617267151e-06,
1572
- "loss": 0.7596,
1573
- "step": 220
1574
- },
1575
- {
1576
- "epoch": 1.360374414976599,
1577
- "grad_norm": 0.5123481750488281,
1578
- "learning_rate": 8.43565534959769e-06,
1579
- "loss": 0.7665,
1580
- "step": 221
1581
- },
1582
- {
1583
- "epoch": 1.3666146645865835,
1584
- "grad_norm": 0.5048606395721436,
1585
- "learning_rate": 8.294778073673762e-06,
1586
- "loss": 0.769,
1587
- "step": 222
1588
- },
1589
- {
1590
- "epoch": 1.3728549141965678,
1591
- "grad_norm": 0.5156130790710449,
1592
- "learning_rate": 8.154248516341547e-06,
1593
- "loss": 0.7989,
1594
- "step": 223
1595
- },
1596
- {
1597
- "epoch": 1.3790951638065523,
1598
- "grad_norm": 0.5228000283241272,
1599
- "learning_rate": 8.014095333542548e-06,
1600
- "loss": 0.7775,
1601
- "step": 224
1602
- },
1603
- {
1604
- "epoch": 1.3853354134165365,
1605
- "grad_norm": 0.5356248617172241,
1606
- "learning_rate": 7.874347104470234e-06,
1607
- "loss": 0.7629,
1608
- "step": 225
1609
- },
1610
- {
1611
- "epoch": 1.391575663026521,
1612
- "grad_norm": 0.49624764919281006,
1613
- "learning_rate": 7.735032325742355e-06,
1614
- "loss": 0.8026,
1615
- "step": 226
1616
- },
1617
- {
1618
- "epoch": 1.3978159126365055,
1619
- "grad_norm": 0.5164198279380798,
1620
- "learning_rate": 7.596179405590076e-06,
1621
- "loss": 0.7803,
1622
- "step": 227
1623
- },
1624
- {
1625
- "epoch": 1.4040561622464898,
1626
- "grad_norm": 0.47972792387008667,
1627
- "learning_rate": 7.4578166580651335e-06,
1628
- "loss": 0.7783,
1629
- "step": 228
1630
- },
1631
- {
1632
- "epoch": 1.4102964118564743,
1633
- "grad_norm": 0.4989663064479828,
1634
- "learning_rate": 7.319972297266215e-06,
1635
- "loss": 0.7728,
1636
- "step": 229
1637
- },
1638
- {
1639
- "epoch": 1.4165366614664587,
1640
- "grad_norm": 0.5109092593193054,
1641
- "learning_rate": 7.182674431585703e-06,
1642
- "loss": 0.7919,
1643
- "step": 230
1644
- },
1645
- {
1646
- "epoch": 1.422776911076443,
1647
- "grad_norm": 0.5069689750671387,
1648
- "learning_rate": 7.045951057978001e-06,
1649
- "loss": 0.8015,
1650
- "step": 231
1651
- },
1652
- {
1653
- "epoch": 1.4290171606864275,
1654
- "grad_norm": 0.5074580907821655,
1655
- "learning_rate": 6.909830056250527e-06,
1656
- "loss": 0.7844,
1657
- "step": 232
1658
- },
1659
- {
1660
- "epoch": 1.435257410296412,
1661
- "grad_norm": 0.47711381316185,
1662
- "learning_rate": 6.774339183378663e-06,
1663
- "loss": 0.7601,
1664
- "step": 233
1665
- },
1666
- {
1667
- "epoch": 1.4414976599063962,
1668
- "grad_norm": 0.4926273822784424,
1669
- "learning_rate": 6.639506067845698e-06,
1670
- "loss": 0.7904,
1671
- "step": 234
1672
- },
1673
- {
1674
- "epoch": 1.4477379095163807,
1675
- "grad_norm": 0.5004563927650452,
1676
- "learning_rate": 6.505358204009018e-06,
1677
- "loss": 0.7638,
1678
- "step": 235
1679
- },
1680
- {
1681
- "epoch": 1.4539781591263652,
1682
- "grad_norm": 0.5014521479606628,
1683
- "learning_rate": 6.3719229464935915e-06,
1684
- "loss": 0.794,
1685
- "step": 236
1686
- },
1687
- {
1688
- "epoch": 1.4602184087363494,
1689
- "grad_norm": 0.5181212425231934,
1690
- "learning_rate": 6.239227504614004e-06,
1691
- "loss": 0.7495,
1692
- "step": 237
1693
- },
1694
- {
1695
- "epoch": 1.466458658346334,
1696
- "grad_norm": 0.5317255258560181,
1697
- "learning_rate": 6.107298936826086e-06,
1698
- "loss": 0.7884,
1699
- "step": 238
1700
- },
1701
- {
1702
- "epoch": 1.4726989079563182,
1703
- "grad_norm": 0.5126049518585205,
1704
- "learning_rate": 5.9761641452093225e-06,
1705
- "loss": 0.7869,
1706
- "step": 239
1707
- },
1708
- {
1709
- "epoch": 1.4789391575663027,
1710
- "grad_norm": 0.5322765707969666,
1711
- "learning_rate": 5.845849869981137e-06,
1712
- "loss": 0.7712,
1713
- "step": 240
1714
- },
1715
- {
1716
- "epoch": 1.4789391575663027,
1717
- "eval_loss": 0.9110648036003113,
1718
- "eval_runtime": 134.5659,
1719
- "eval_samples_per_second": 100.702,
1720
- "eval_steps_per_second": 6.294,
1721
- "step": 240
1722
- },
1723
- {
1724
- "epoch": 1.485179407176287,
1725
- "grad_norm": 0.5006371140480042,
1726
- "learning_rate": 5.716382684044191e-06,
1727
- "loss": 0.7939,
1728
- "step": 241
1729
- },
1730
- {
1731
- "epoch": 1.4914196567862714,
1732
- "grad_norm": 0.5099849104881287,
1733
- "learning_rate": 5.587788987567785e-06,
1734
- "loss": 0.771,
1735
- "step": 242
1736
- },
1737
- {
1738
- "epoch": 1.497659906396256,
1739
- "grad_norm": 0.5397711396217346,
1740
- "learning_rate": 5.460095002604533e-06,
1741
- "loss": 0.7642,
1742
- "step": 243
1743
- },
1744
- {
1745
- "epoch": 1.5039001560062402,
1746
- "grad_norm": 0.47754916548728943,
1747
- "learning_rate": 5.333326767743263e-06,
1748
- "loss": 0.7848,
1749
- "step": 244
1750
- },
1751
- {
1752
- "epoch": 1.5101404056162246,
1753
- "grad_norm": 0.5114724636077881,
1754
- "learning_rate": 5.207510132799436e-06,
1755
- "loss": 0.7743,
1756
- "step": 245
1757
- },
1758
- {
1759
- "epoch": 1.5163806552262091,
1760
- "grad_norm": 0.4859448969364166,
1761
- "learning_rate": 5.082670753543961e-06,
1762
- "loss": 0.7748,
1763
- "step": 246
1764
- },
1765
- {
1766
- "epoch": 1.5226209048361934,
1767
- "grad_norm": 0.46158257126808167,
1768
- "learning_rate": 4.958834086471683e-06,
1769
- "loss": 0.7951,
1770
- "step": 247
1771
- },
1772
- {
1773
- "epoch": 1.5288611544461779,
1774
- "grad_norm": 0.48695865273475647,
1775
- "learning_rate": 4.836025383610382e-06,
1776
- "loss": 0.7968,
1777
- "step": 248
1778
- },
1779
- {
1780
- "epoch": 1.5351014040561624,
1781
- "grad_norm": 0.4924914240837097,
1782
- "learning_rate": 4.714269687371581e-06,
1783
- "loss": 0.792,
1784
- "step": 249
1785
- },
1786
- {
1787
- "epoch": 1.5413416536661466,
1788
- "grad_norm": 0.5044175982475281,
1789
- "learning_rate": 4.593591825444028e-06,
1790
- "loss": 0.781,
1791
- "step": 250
1792
- },
1793
- {
1794
- "epoch": 1.547581903276131,
1795
- "grad_norm": 0.4598456919193268,
1796
- "learning_rate": 4.474016405730973e-06,
1797
- "loss": 0.794,
1798
- "step": 251
1799
- },
1800
- {
1801
- "epoch": 1.5538221528861156,
1802
- "grad_norm": 0.48866939544677734,
1803
- "learning_rate": 4.355567811332311e-06,
1804
- "loss": 0.7853,
1805
- "step": 252
1806
- },
1807
- {
1808
- "epoch": 1.5600624024960998,
1809
- "grad_norm": 0.4878495931625366,
1810
- "learning_rate": 4.2382701955724724e-06,
1811
- "loss": 0.743,
1812
- "step": 253
1813
- },
1814
- {
1815
- "epoch": 1.566302652106084,
1816
- "grad_norm": 0.4770466387271881,
1817
- "learning_rate": 4.12214747707527e-06,
1818
- "loss": 0.7442,
1819
- "step": 254
1820
- },
1821
- {
1822
- "epoch": 1.5725429017160688,
1823
- "grad_norm": 0.4467732012271881,
1824
- "learning_rate": 4.007223334886531e-06,
1825
- "loss": 0.7611,
1826
- "step": 255
1827
- },
1828
- {
1829
- "epoch": 1.578783151326053,
1830
- "grad_norm": 0.47116416692733765,
1831
- "learning_rate": 3.893521203645618e-06,
1832
- "loss": 0.7921,
1833
- "step": 256
1834
- },
1835
- {
1836
- "epoch": 1.5850234009360373,
1837
- "grad_norm": 0.468517005443573,
1838
- "learning_rate": 3.78106426880678e-06,
1839
- "loss": 0.7811,
1840
- "step": 257
1841
- },
1842
- {
1843
- "epoch": 1.5912636505460218,
1844
- "grad_norm": 0.46981289982795715,
1845
- "learning_rate": 3.6698754619112974e-06,
1846
- "loss": 0.7756,
1847
- "step": 258
1848
- },
1849
- {
1850
- "epoch": 1.5975039001560063,
1851
- "grad_norm": 0.45571863651275635,
1852
- "learning_rate": 3.5599774559114475e-06,
1853
- "loss": 0.7469,
1854
- "step": 259
1855
- },
1856
- {
1857
- "epoch": 1.6037441497659906,
1858
- "grad_norm": 0.4486157298088074,
1859
- "learning_rate": 3.4513926605471504e-06,
1860
- "loss": 0.7566,
1861
- "step": 260
1862
- },
1863
- {
1864
- "epoch": 1.609984399375975,
1865
- "grad_norm": 0.47735777497291565,
1866
- "learning_rate": 3.344143217776319e-06,
1867
- "loss": 0.7753,
1868
- "step": 261
1869
- },
1870
- {
1871
- "epoch": 1.6162246489859595,
1872
- "grad_norm": 0.4546492099761963,
1873
- "learning_rate": 3.2382509972598087e-06,
1874
- "loss": 0.7741,
1875
- "step": 262
1876
- },
1877
- {
1878
- "epoch": 1.6224648985959438,
1879
- "grad_norm": 0.4690036177635193,
1880
- "learning_rate": 3.133737591901864e-06,
1881
- "loss": 0.7693,
1882
- "step": 263
1883
- },
1884
- {
1885
- "epoch": 1.6287051482059283,
1886
- "grad_norm": 0.4607780873775482,
1887
- "learning_rate": 3.0306243134470668e-06,
1888
- "loss": 0.7312,
1889
- "step": 264
1890
- },
1891
- {
1892
- "epoch": 1.6349453978159127,
1893
- "grad_norm": 0.4510229825973511,
1894
- "learning_rate": 2.9289321881345257e-06,
1895
- "loss": 0.7705,
1896
- "step": 265
1897
- },
1898
- {
1899
- "epoch": 1.641185647425897,
1900
- "grad_norm": 0.43797171115875244,
1901
- "learning_rate": 2.8286819524103657e-06,
1902
- "loss": 0.7666,
1903
- "step": 266
1904
- },
1905
- {
1906
- "epoch": 1.6474258970358813,
1907
- "grad_norm": 0.45275524258613586,
1908
- "learning_rate": 2.7298940486992654e-06,
1909
- "loss": 0.8028,
1910
- "step": 267
1911
- },
1912
- {
1913
- "epoch": 1.653666146645866,
1914
- "grad_norm": 0.4479183852672577,
1915
- "learning_rate": 2.6325886212359496e-06,
1916
- "loss": 0.7647,
1917
- "step": 268
1918
- },
1919
- {
1920
- "epoch": 1.6599063962558502,
1921
- "grad_norm": 0.4575185775756836,
1922
- "learning_rate": 2.5367855119575314e-06,
1923
- "loss": 0.7731,
1924
- "step": 269
1925
- },
1926
- {
1927
- "epoch": 1.6661466458658345,
1928
- "grad_norm": 0.45092037320137024,
1929
- "learning_rate": 2.4425042564574186e-06,
1930
- "loss": 0.7895,
1931
- "step": 270
1932
- },
1933
- {
1934
- "epoch": 1.672386895475819,
1935
- "grad_norm": 0.4297903776168823,
1936
- "learning_rate": 2.3497640800017687e-06,
1937
- "loss": 0.7597,
1938
- "step": 271
1939
- },
1940
- {
1941
- "epoch": 1.6786271450858035,
1942
- "grad_norm": 0.43731772899627686,
1943
- "learning_rate": 2.2585838936091753e-06,
1944
- "loss": 0.7775,
1945
- "step": 272
1946
- },
1947
- {
1948
- "epoch": 1.6848673946957877,
1949
- "grad_norm": 0.4288908541202545,
1950
- "learning_rate": 2.1689822901944456e-06,
1951
- "loss": 0.787,
1952
- "step": 273
1953
- },
1954
- {
1955
- "epoch": 1.6911076443057722,
1956
- "grad_norm": 0.42555147409439087,
1957
- "learning_rate": 2.0809775407772505e-06,
1958
- "loss": 0.7751,
1959
- "step": 274
1960
- },
1961
- {
1962
- "epoch": 1.6973478939157567,
1963
- "grad_norm": 0.45276904106140137,
1964
- "learning_rate": 1.994587590756397e-06,
1965
- "loss": 0.7893,
1966
- "step": 275
1967
- },
1968
- {
1969
- "epoch": 1.703588143525741,
1970
- "grad_norm": 0.44294846057891846,
1971
- "learning_rate": 1.9098300562505266e-06,
1972
- "loss": 0.7794,
1973
- "step": 276
1974
- },
1975
- {
1976
- "epoch": 1.7098283931357254,
1977
- "grad_norm": 0.4307778775691986,
1978
- "learning_rate": 1.826722220505931e-06,
1979
- "loss": 0.7791,
1980
- "step": 277
1981
- },
1982
- {
1983
- "epoch": 1.71606864274571,
1984
- "grad_norm": 0.43245357275009155,
1985
- "learning_rate": 1.74528103037226e-06,
1986
- "loss": 0.7497,
1987
- "step": 278
1988
- },
1989
- {
1990
- "epoch": 1.7223088923556942,
1991
- "grad_norm": 0.4251644015312195,
1992
- "learning_rate": 1.6655230928468257e-06,
1993
- "loss": 0.7743,
1994
- "step": 279
1995
- },
1996
- {
1997
- "epoch": 1.7285491419656787,
1998
- "grad_norm": 0.4321819543838501,
1999
- "learning_rate": 1.587464671688187e-06,
2000
- "loss": 0.7522,
2001
- "step": 280
2002
- },
2003
- {
2004
- "epoch": 1.7347893915756631,
2005
- "grad_norm": 0.4292062222957611,
2006
- "learning_rate": 1.5111216840997745e-06,
2007
- "loss": 0.7698,
2008
- "step": 281
2009
- },
2010
- {
2011
- "epoch": 1.7410296411856474,
2012
- "grad_norm": 0.45093026757240295,
2013
- "learning_rate": 1.436509697484111e-06,
2014
- "loss": 0.7647,
2015
- "step": 282
2016
- },
2017
- {
2018
- "epoch": 1.7472698907956317,
2019
- "grad_norm": 0.42376089096069336,
2020
- "learning_rate": 1.3636439262684299e-06,
2021
- "loss": 0.7556,
2022
- "step": 283
2023
- },
2024
- {
2025
- "epoch": 1.7535101404056164,
2026
- "grad_norm": 0.4337958097457886,
2027
- "learning_rate": 1.2925392288022299e-06,
2028
- "loss": 0.7756,
2029
- "step": 284
2030
- },
2031
- {
2032
- "epoch": 1.7597503900156006,
2033
- "grad_norm": 0.4319595992565155,
2034
- "learning_rate": 1.2232101043274437e-06,
2035
- "loss": 0.7816,
2036
- "step": 285
2037
- },
2038
- {
2039
- "epoch": 1.765990639625585,
2040
- "grad_norm": 0.4335976243019104,
2041
- "learning_rate": 1.1556706900218572e-06,
2042
- "loss": 0.7902,
2043
- "step": 286
2044
- },
2045
- {
2046
- "epoch": 1.7722308892355694,
2047
- "grad_norm": 0.4158109724521637,
2048
- "learning_rate": 1.0899347581163222e-06,
2049
- "loss": 0.7396,
2050
- "step": 287
2051
- },
2052
- {
2053
- "epoch": 1.7784711388455539,
2054
- "grad_norm": 0.42235442996025085,
2055
- "learning_rate": 1.0260157130864178e-06,
2056
- "loss": 0.7747,
2057
- "step": 288
2058
- },
2059
- {
2060
- "epoch": 1.7847113884555381,
2061
- "grad_norm": 0.43022453784942627,
2062
- "learning_rate": 9.63926588919083e-07,
2063
- "loss": 0.7616,
2064
- "step": 289
2065
- },
2066
- {
2067
- "epoch": 1.7909516380655226,
2068
- "grad_norm": 0.42115500569343567,
2069
- "learning_rate": 9.036800464548157e-07,
2070
- "loss": 0.7609,
2071
- "step": 290
2072
- },
2073
- {
2074
- "epoch": 1.797191887675507,
2075
- "grad_norm": 0.43961915373802185,
2076
- "learning_rate": 8.4528837080594e-07,
2077
- "loss": 0.7665,
2078
- "step": 291
2079
- },
2080
- {
2081
- "epoch": 1.8034321372854913,
2082
- "grad_norm": 0.42159175872802734,
2083
- "learning_rate": 7.887634688515e-07,
2084
- "loss": 0.7684,
2085
- "step": 292
2086
- },
2087
- {
2088
- "epoch": 1.8096723868954758,
2089
- "grad_norm": 0.4211348295211792,
2090
- "learning_rate": 7.341168668092857e-07,
2091
- "loss": 0.7701,
2092
- "step": 293
2093
- },
2094
- {
2095
- "epoch": 1.8159126365054603,
2096
- "grad_norm": 0.447083055973053,
2097
- "learning_rate": 6.813597078854772e-07,
2098
- "loss": 0.8051,
2099
- "step": 294
2100
- },
2101
- {
2102
- "epoch": 1.8221528861154446,
2103
- "grad_norm": 0.42439502477645874,
2104
- "learning_rate": 6.305027500023841e-07,
2105
- "loss": 0.7794,
2106
- "step": 295
2107
- },
2108
- {
2109
- "epoch": 1.828393135725429,
2110
- "grad_norm": 0.43265220522880554,
2111
- "learning_rate": 5.815563636047539e-07,
2112
- "loss": 0.7722,
2113
- "step": 296
2114
- },
2115
- {
2116
- "epoch": 1.8346333853354135,
2117
- "grad_norm": 0.4244794249534607,
2118
- "learning_rate": 5.345305295450997e-07,
2119
- "loss": 0.7638,
2120
- "step": 297
2121
- },
2122
- {
2123
- "epoch": 1.8408736349453978,
2124
- "grad_norm": 0.41255486011505127,
2125
- "learning_rate": 4.894348370484648e-07,
2126
- "loss": 0.772,
2127
- "step": 298
2128
- },
2129
- {
2130
- "epoch": 1.847113884555382,
2131
- "grad_norm": 0.4242880344390869,
2132
- "learning_rate": 4.4627848175703315e-07,
2133
- "loss": 0.7643,
2134
- "step": 299
2135
- },
2136
- {
2137
- "epoch": 1.8533541341653668,
2138
- "grad_norm": 0.41673314571380615,
2139
- "learning_rate": 4.0507026385502747e-07,
2140
- "loss": 0.7601,
2141
- "step": 300
2142
- },
2143
- {
2144
- "epoch": 1.859594383775351,
2145
- "grad_norm": 0.42518967390060425,
2146
- "learning_rate": 3.658185862742103e-07,
2147
- "loss": 0.7699,
2148
- "step": 301
2149
- },
2150
- {
2151
- "epoch": 1.8658346333853353,
2152
- "grad_norm": 0.42029449343681335,
2153
- "learning_rate": 3.2853145298042954e-07,
2154
- "loss": 0.7498,
2155
- "step": 302
2156
- },
2157
- {
2158
- "epoch": 1.8720748829953198,
2159
- "grad_norm": 0.4201337695121765,
2160
- "learning_rate": 2.93216467341475e-07,
2161
- "loss": 0.7626,
2162
- "step": 303
2163
- },
2164
- {
2165
- "epoch": 1.8783151326053042,
2166
- "grad_norm": 0.42505332827568054,
2167
- "learning_rate": 2.5988083057666534e-07,
2168
- "loss": 0.774,
2169
- "step": 304
2170
- },
2171
- {
2172
- "epoch": 1.8845553822152885,
2173
- "grad_norm": 0.41834697127342224,
2174
- "learning_rate": 2.2853134028840594e-07,
2175
- "loss": 0.7638,
2176
- "step": 305
2177
- },
2178
- {
2179
- "epoch": 1.890795631825273,
2180
- "grad_norm": 0.4278232455253601,
2181
- "learning_rate": 1.9917438907606556e-07,
2182
- "loss": 0.7906,
2183
- "step": 306
2184
- },
2185
- {
2186
- "epoch": 1.8970358814352575,
2187
- "grad_norm": 0.429078608751297,
2188
- "learning_rate": 1.7181596323244453e-07,
2189
- "loss": 0.7839,
2190
- "step": 307
2191
- },
2192
- {
2193
- "epoch": 1.9032761310452417,
2194
- "grad_norm": 0.4142579138278961,
2195
- "learning_rate": 1.464616415230702e-07,
2196
- "loss": 0.7687,
2197
- "step": 308
2198
- },
2199
- {
2200
- "epoch": 1.9095163806552262,
2201
- "grad_norm": 0.40971171855926514,
2202
- "learning_rate": 1.231165940486234e-07,
2203
- "loss": 0.7647,
2204
- "step": 309
2205
- },
2206
- {
2207
- "epoch": 1.9157566302652107,
2208
- "grad_norm": 0.4336109161376953,
2209
- "learning_rate": 1.0178558119067316e-07,
2210
- "loss": 0.7691,
2211
- "step": 310
2212
- },
2213
- {
2214
- "epoch": 1.921996879875195,
2215
- "grad_norm": 0.40623047947883606,
2216
- "learning_rate": 8.247295264097288e-08,
2217
- "loss": 0.7728,
2218
- "step": 311
2219
- },
2220
- {
2221
- "epoch": 1.9282371294851794,
2222
- "grad_norm": 0.4205041527748108,
2223
- "learning_rate": 6.51826465144978e-08,
2224
- "loss": 0.7533,
2225
- "step": 312
2226
- },
2227
- {
2228
- "epoch": 1.934477379095164,
2229
- "grad_norm": 0.416535347700119,
2230
- "learning_rate": 4.991818854640396e-08,
2231
- "loss": 0.7826,
2232
- "step": 313
2233
- },
2234
- {
2235
- "epoch": 1.9407176287051482,
2236
- "grad_norm": 0.41483184695243835,
2237
- "learning_rate": 3.668269137308666e-08,
2238
- "loss": 0.7688,
2239
- "step": 314
2240
- },
2241
- {
2242
- "epoch": 1.9469578783151325,
2243
- "grad_norm": 0.4072718322277069,
2244
- "learning_rate": 2.547885389746485e-08,
2245
- "loss": 0.7943,
2246
- "step": 315
2247
- },
2248
- {
2249
- "epoch": 1.9531981279251172,
2250
- "grad_norm": 0.413289338350296,
2251
- "learning_rate": 1.630896073864352e-08,
2252
- "loss": 0.7867,
2253
- "step": 316
2254
- },
2255
- {
2256
- "epoch": 1.9594383775351014,
2257
- "grad_norm": 0.4177180528640747,
2258
- "learning_rate": 9.174881766043086e-09,
2259
- "loss": 0.781,
2260
- "step": 317
2261
- },
2262
- {
2263
- "epoch": 1.9656786271450857,
2264
- "grad_norm": 0.41807225346565247,
2265
- "learning_rate": 4.0780717181077015e-09,
2266
- "loss": 0.769,
2267
- "step": 318
2268
- },
2269
- {
2270
- "epoch": 1.9719188767550702,
2271
- "grad_norm": 0.41558825969696045,
2272
- "learning_rate": 1.019569905666984e-09,
2273
- "loss": 0.7504,
2274
- "step": 319
2275
- },
2276
- {
2277
- "epoch": 1.9781591263650546,
2278
- "grad_norm": 0.4160574674606323,
2279
- "learning_rate": 0.0,
2280
- "loss": 0.8025,
2281
- "step": 320
2282
- },
2283
- {
2284
- "epoch": 1.9781591263650546,
2285
- "eval_loss": 0.903252899646759,
2286
- "eval_runtime": 134.5566,
2287
- "eval_samples_per_second": 100.709,
2288
- "eval_steps_per_second": 6.295,
2289
- "step": 320
2290
- }
2291
- ],
2292
- "logging_steps": 1,
2293
- "max_steps": 320,
2294
- "num_input_tokens_seen": 0,
2295
- "num_train_epochs": 2,
2296
- "save_steps": 80,
2297
- "stateful_callbacks": {
2298
- "TrainerControl": {
2299
- "args": {
2300
- "should_epoch_stop": false,
2301
- "should_evaluate": false,
2302
- "should_log": false,
2303
- "should_save": true,
2304
- "should_training_stop": true
2305
- },
2306
- "attributes": {}
2307
- }
2308
- },
2309
- "total_flos": 1.9476972312723456e+18,
2310
- "train_batch_size": 2,
2311
- "trial_name": null,
2312
- "trial_params": null
2313
- }