lesso commited on
Commit
06fb472
1 Parent(s): 11fd728

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -21,12 +21,12 @@
21
  "revision": null,
22
  "target_modules": [
23
  "up_proj",
 
24
  "gate_proj",
25
- "k_proj",
26
- "q_proj",
27
  "o_proj",
28
  "down_proj",
29
- "v_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
21
  "revision": null,
22
  "target_modules": [
23
  "up_proj",
24
+ "v_proj",
25
  "gate_proj",
 
 
26
  "o_proj",
27
  "down_proj",
28
+ "q_proj",
29
+ "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dba72936d57e106236c848f5987a7326552c870be18d806f1896dada593a52ac
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a73204c7e6e779cc19da937007bd4260d435de15c6652ad9b2e0eb17fc9a82
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c005c4b25b09181742e883a4b3e8b822f1c6821cd9b81c0b1f15d5078209824
3
  size 168037626
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6604f471f08bf2b2f37be27ac2049084a70dba84bf293f556fc4df6eb230bfb7
3
  size 168037626
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0af971a7867e3fdcd241f658f44aff4afb10f86dc4126cb7bbb2d154917b715c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1601a226abaab371fb1a1637d693b97a9298cac19004527736e3a00675bc6cd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41bfbb26011cd1f6da678dc46ec4b53dc777067eecadd603a97390cd94b34f6a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dc773b9542ac3137b07542444676719c19d7023a9079e4b27cc1feb775f8ea8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -24,2327 +24,2327 @@
24
  },
25
  {
26
  "epoch": 0.009009009009009009,
27
- "grad_norm": 8.254266738891602,
28
  "learning_rate": 0.0001999955498150411,
29
  "loss": 2.3407,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.012012012012012012,
34
- "grad_norm": 2.888874053955078,
35
  "learning_rate": 0.00019998219965624734,
36
- "loss": 1.982,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.015015015015015015,
41
- "grad_norm": 2.099008083343506,
42
  "learning_rate": 0.0001999599507118322,
43
- "loss": 1.3602,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.018018018018018018,
48
- "grad_norm": 4.129324913024902,
49
  "learning_rate": 0.000199928804962034,
50
- "loss": 2.4564,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.021021021021021023,
55
- "grad_norm": 3.6175684928894043,
56
  "learning_rate": 0.0001998887651789398,
57
- "loss": 1.9079,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.024024024024024024,
62
- "grad_norm": 3.246344804763794,
63
  "learning_rate": 0.00019983983492623833,
64
- "loss": 1.6866,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.02702702702702703,
69
- "grad_norm": 3.2417070865631104,
70
  "learning_rate": 0.00019978201855890308,
71
- "loss": 1.4891,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.03003003003003003,
76
- "grad_norm": 5.3369832038879395,
77
  "learning_rate": 0.00019971532122280464,
78
- "loss": 2.0856,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.03303303303303303,
83
- "grad_norm": 3.8723933696746826,
84
  "learning_rate": 0.00019963974885425266,
85
- "loss": 1.9191,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.036036036036036036,
90
- "grad_norm": 5.872828006744385,
91
  "learning_rate": 0.00019955530817946748,
92
- "loss": 1.9715,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.03903903903903904,
97
- "grad_norm": 5.758902549743652,
98
  "learning_rate": 0.0001994620067139815,
99
- "loss": 1.9539,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.042042042042042045,
104
- "grad_norm": 4.494828224182129,
105
  "learning_rate": 0.0001993598527619703,
106
- "loss": 1.8494,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.04504504504504504,
111
- "grad_norm": 4.702944755554199,
112
  "learning_rate": 0.0001992488554155135,
113
- "loss": 1.7613,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.04804804804804805,
118
- "grad_norm": 4.821094512939453,
119
  "learning_rate": 0.00019912902455378556,
120
- "loss": 1.5019,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.05105105105105105,
125
- "grad_norm": 4.842085838317871,
126
  "learning_rate": 0.00019900037084217637,
127
- "loss": 1.014,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.05405405405405406,
132
- "grad_norm": 4.73835563659668,
133
  "learning_rate": 0.00019886290573134228,
134
- "loss": 1.7198,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.057057057057057055,
139
- "grad_norm": 5.116694927215576,
140
  "learning_rate": 0.00019871664145618657,
141
- "loss": 1.029,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.06006006006006006,
146
- "grad_norm": 3.1407368183135986,
147
  "learning_rate": 0.00019856159103477086,
148
- "loss": 1.0887,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.06306306306306306,
153
- "grad_norm": 4.0600666999816895,
154
  "learning_rate": 0.00019839776826715614,
155
- "loss": 1.7268,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.06606606606606606,
160
- "grad_norm": 5.035831928253174,
161
  "learning_rate": 0.0001982251877341748,
162
- "loss": 1.2828,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.06906906906906907,
167
- "grad_norm": 6.41340446472168,
168
  "learning_rate": 0.0001980438647961327,
169
- "loss": 1.6047,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.07207207207207207,
174
- "grad_norm": 4.085684776306152,
175
  "learning_rate": 0.00019785381559144196,
176
- "loss": 1.477,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.07507507507507508,
181
- "grad_norm": 3.9870212078094482,
182
  "learning_rate": 0.00019765505703518496,
183
- "loss": 0.8285,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.07807807807807808,
188
- "grad_norm": 4.476605415344238,
189
  "learning_rate": 0.00019744760681760832,
190
- "loss": 1.3635,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.08108108108108109,
195
- "grad_norm": 4.659677982330322,
196
  "learning_rate": 0.00019723148340254892,
197
- "loss": 1.7153,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.08408408408408409,
202
- "grad_norm": 4.323594570159912,
203
  "learning_rate": 0.00019700670602579008,
204
- "loss": 1.5809,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.08708708708708708,
209
- "grad_norm": 3.4452884197235107,
210
  "learning_rate": 0.0001967732946933499,
211
- "loss": 1.4132,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.09009009009009009,
216
- "grad_norm": 5.972443103790283,
217
  "learning_rate": 0.00019653127017970034,
218
- "loss": 1.6207,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.09309309309309309,
223
- "grad_norm": 2.909363269805908,
224
  "learning_rate": 0.00019628065402591845,
225
- "loss": 1.2312,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.0960960960960961,
230
- "grad_norm": 3.1354382038116455,
231
  "learning_rate": 0.00019602146853776894,
232
- "loss": 1.2398,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.0990990990990991,
237
- "grad_norm": 4.161502361297607,
238
  "learning_rate": 0.00019575373678371909,
239
- "loss": 1.5237,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1021021021021021,
244
- "grad_norm": 3.4831933975219727,
245
  "learning_rate": 0.00019547748259288536,
246
- "loss": 1.3345,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.10510510510510511,
251
- "grad_norm": 3.050612688064575,
252
  "learning_rate": 0.00019519273055291266,
253
- "loss": 1.233,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.10810810810810811,
258
- "grad_norm": 3.520677089691162,
259
  "learning_rate": 0.0001948995060077859,
260
- "loss": 1.3311,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1111111111111111,
265
- "grad_norm": 3.586829423904419,
266
  "learning_rate": 0.00019459783505557424,
267
- "loss": 1.212,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.11411411411411411,
272
- "grad_norm": 2.9168314933776855,
273
  "learning_rate": 0.00019428774454610843,
274
- "loss": 1.084,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.11711711711711711,
279
- "grad_norm": 3.1117663383483887,
280
  "learning_rate": 0.00019396926207859084,
281
- "loss": 0.978,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.12012012012012012,
286
- "grad_norm": 3.9365386962890625,
287
  "learning_rate": 0.00019364241599913924,
288
- "loss": 0.91,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.12312312312312312,
293
- "grad_norm": 5.414772987365723,
294
  "learning_rate": 0.00019330723539826375,
295
- "loss": 1.5547,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.12612612612612611,
300
- "grad_norm": 3.866046905517578,
301
  "learning_rate": 0.00019296375010827773,
302
- "loss": 1.3311,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.12912912912912913,
307
- "grad_norm": 4.769541263580322,
308
  "learning_rate": 0.0001926119907006426,
309
- "loss": 0.8167,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.13213213213213212,
314
- "grad_norm": 3.6581079959869385,
315
  "learning_rate": 0.0001922519884832469,
316
- "loss": 1.2906,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.13513513513513514,
321
- "grad_norm": 4.367158889770508,
322
  "learning_rate": 0.00019188377549761963,
323
- "loss": 1.2016,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.13813813813813813,
328
- "grad_norm": 3.591726541519165,
329
  "learning_rate": 0.0001915073845160786,
330
- "loss": 1.4798,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.14114114114114115,
335
- "grad_norm": 4.844123840332031,
336
  "learning_rate": 0.0001911228490388136,
337
- "loss": 1.4799,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.14414414414414414,
342
- "grad_norm": 5.353418350219727,
343
  "learning_rate": 0.00019073020329090444,
344
- "loss": 1.2234,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.14714714714714713,
349
- "grad_norm": 3.3358774185180664,
350
  "learning_rate": 0.00019032948221927524,
351
- "loss": 1.0556,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.15015015015015015,
356
- "grad_norm": 3.674219846725464,
357
  "learning_rate": 0.00018992072148958368,
358
- "loss": 1.1757,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.15315315315315314,
363
- "grad_norm": 4.523245334625244,
364
  "learning_rate": 0.00018950395748304678,
365
- "loss": 1.2357,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.15615615615615616,
370
- "grad_norm": 4.535871982574463,
371
  "learning_rate": 0.00018907922729320285,
372
- "loss": 1.5018,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.15915915915915915,
377
- "grad_norm": 3.2581071853637695,
378
  "learning_rate": 0.00018864656872260985,
379
  "loss": 1.1452,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.16216216216216217,
384
- "grad_norm": 3.132718324661255,
385
  "learning_rate": 0.00018820602027948114,
386
- "loss": 1.1584,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.16516516516516516,
391
- "grad_norm": 4.070220470428467,
392
  "learning_rate": 0.00018775762117425777,
393
- "loss": 1.1046,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.16816816816816818,
398
- "grad_norm": 3.8557350635528564,
399
  "learning_rate": 0.00018730141131611882,
400
- "loss": 0.9695,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.17117117117117117,
405
- "grad_norm": 3.3533196449279785,
406
  "learning_rate": 0.00018683743130942928,
407
- "loss": 1.1877,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.17417417417417416,
412
- "grad_norm": 3.0984268188476562,
413
  "learning_rate": 0.00018636572245012606,
414
- "loss": 1.3991,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.17717717717717718,
419
- "grad_norm": 10.649101257324219,
420
  "learning_rate": 0.00018588632672204264,
421
- "loss": 1.1049,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.18018018018018017,
426
- "grad_norm": 4.183538436889648,
427
  "learning_rate": 0.0001853992867931721,
428
- "loss": 1.4726,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.1831831831831832,
433
- "grad_norm": 3.4067232608795166,
434
  "learning_rate": 0.0001849046460118698,
435
- "loss": 1.3283,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.18618618618618618,
440
- "grad_norm": 3.2232909202575684,
441
  "learning_rate": 0.00018440244840299506,
442
- "loss": 1.0828,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.1891891891891892,
447
- "grad_norm": 3.587733030319214,
448
  "learning_rate": 0.00018389273866399275,
449
- "loss": 1.4606,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.1921921921921922,
454
- "grad_norm": 3.715364456176758,
455
  "learning_rate": 0.00018337556216091517,
456
- "loss": 1.3308,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.19519519519519518,
461
- "grad_norm": 3.0346460342407227,
462
  "learning_rate": 0.00018285096492438424,
463
- "loss": 1.1785,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.1981981981981982,
468
- "grad_norm": 2.9115614891052246,
469
  "learning_rate": 0.00018231899364549455,
470
- "loss": 0.8434,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2012012012012012,
475
- "grad_norm": 3.1807260513305664,
476
  "learning_rate": 0.0001817796956716578,
477
- "loss": 1.0018,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2042042042042042,
482
- "grad_norm": 3.0817840099334717,
483
  "learning_rate": 0.0001812331190023886,
484
- "loss": 1.0185,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.2072072072072072,
489
- "grad_norm": 3.2719409465789795,
490
  "learning_rate": 0.00018067931228503246,
491
- "loss": 1.4434,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.21021021021021022,
496
- "grad_norm": 4.187236309051514,
497
  "learning_rate": 0.00018011832481043576,
498
- "loss": 1.0969,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.2132132132132132,
503
- "grad_norm": 3.2186405658721924,
504
  "learning_rate": 0.000179550206508559,
505
- "loss": 1.2105,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.21621621621621623,
510
- "grad_norm": 3.406670570373535,
511
  "learning_rate": 0.0001789750079440326,
512
- "loss": 1.3271,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.21921921921921922,
517
- "grad_norm": 3.0242960453033447,
518
  "learning_rate": 0.00017839278031165658,
519
- "loss": 1.1269,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.2222222222222222,
524
- "grad_norm": 4.0513691902160645,
525
  "learning_rate": 0.00017780357543184397,
526
- "loss": 1.1778,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.22522522522522523,
531
- "grad_norm": 4.116867542266846,
532
  "learning_rate": 0.00017720744574600863,
533
- "loss": 1.2357,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.22822822822822822,
538
- "grad_norm": 3.957306146621704,
539
  "learning_rate": 0.0001766044443118978,
540
- "loss": 1.4235,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.23123123123123124,
545
- "grad_norm": 7.28619384765625,
546
  "learning_rate": 0.00017599462479886974,
547
- "loss": 1.102,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.23423423423423423,
552
- "grad_norm": 4.001276969909668,
553
  "learning_rate": 0.00017537804148311695,
554
- "loss": 1.1948,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.23723723723723725,
559
- "grad_norm": 2.7599635124206543,
560
  "learning_rate": 0.00017475474924283536,
561
- "loss": 1.0386,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.24024024024024024,
566
- "grad_norm": 3.36354923248291,
567
  "learning_rate": 0.00017412480355334005,
568
- "loss": 1.6077,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.24324324324324326,
573
- "grad_norm": NaN,
574
- "learning_rate": 0.00017412480355334005,
575
- "loss": 1.4291,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.24624624624624625,
580
- "grad_norm": 4.241931915283203,
581
- "learning_rate": 0.0001734882604821276,
582
- "loss": 1.37,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.24924924924924924,
587
- "grad_norm": 3.40556001663208,
588
- "learning_rate": 0.0001728451766838861,
589
- "loss": 1.4526,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.25225225225225223,
594
- "grad_norm": 3.1627070903778076,
595
- "learning_rate": 0.00017219560939545246,
596
- "loss": 1.1918,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.2552552552552553,
601
- "grad_norm": 2.915019989013672,
602
- "learning_rate": 0.0001715396164307182,
603
- "loss": 1.1523,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.25825825825825827,
608
- "grad_norm": 3.027531862258911,
609
- "learning_rate": 0.00017087725617548385,
610
- "loss": 1.097,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.26126126126126126,
615
- "grad_norm": 3.198509931564331,
616
- "learning_rate": 0.00017020858758226229,
617
- "loss": 1.1172,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.26426426426426425,
622
- "grad_norm": 2.9282870292663574,
623
- "learning_rate": 0.00016953367016503182,
624
- "loss": 0.9913,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.2672672672672673,
629
- "grad_norm": 2.8152143955230713,
630
- "learning_rate": 0.00016885256399393924,
631
- "loss": 0.9582,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.2702702702702703,
636
- "grad_norm": 5.513664245605469,
637
- "learning_rate": 0.00016816532968995328,
638
- "loss": 1.5983,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.2732732732732733,
643
- "grad_norm": 3.0489561557769775,
644
- "learning_rate": 0.00016747202841946928,
645
- "loss": 1.3503,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.27627627627627627,
650
- "grad_norm": 3.1149275302886963,
651
- "learning_rate": 0.00016677272188886483,
652
- "loss": 1.0439,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.27927927927927926,
657
- "grad_norm": 5.150669097900391,
658
- "learning_rate": 0.00016606747233900815,
659
- "loss": 1.5164,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.2822822822822823,
664
- "grad_norm": 3.6513853073120117,
665
- "learning_rate": 0.00016535634253971794,
666
- "loss": 0.8632,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.2852852852852853,
671
- "grad_norm": 2.818716287612915,
672
- "learning_rate": 0.00016463939578417692,
673
- "loss": 0.9685,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.2882882882882883,
678
- "grad_norm": 4.800100803375244,
679
- "learning_rate": 0.0001639166958832985,
680
- "loss": 1.6414,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.2912912912912913,
685
- "grad_norm": 5.5120649337768555,
686
- "learning_rate": 0.00016318830716004722,
687
- "loss": 2.0318,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.29429429429429427,
692
- "grad_norm": 3.650167226791382,
693
- "learning_rate": 0.0001624542944437139,
694
- "loss": 1.3504,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.2972972972972973,
699
- "grad_norm": 3.262024402618408,
700
- "learning_rate": 0.00016171472306414554,
701
- "loss": 0.9678,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.3003003003003003,
706
- "grad_norm": 3.3580527305603027,
707
- "learning_rate": 0.0001609696588459307,
708
- "loss": 1.2071,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.3033033033033033,
713
- "grad_norm": 2.174422025680542,
714
- "learning_rate": 0.00016021916810254097,
715
- "loss": 0.8678,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.3063063063063063,
720
- "grad_norm": 3.338526487350464,
721
- "learning_rate": 0.00015946331763042867,
722
- "loss": 1.1964,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.30930930930930933,
727
- "grad_norm": 2.4254307746887207,
728
- "learning_rate": 0.00015870217470308188,
729
- "loss": 1.1689,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.3123123123123123,
734
- "grad_norm": 3.5244359970092773,
735
- "learning_rate": 0.0001579358070650367,
736
- "loss": 1.5078,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.3153153153153153,
741
- "grad_norm": 3.7619662284851074,
742
- "learning_rate": 0.00015716428292584787,
743
- "loss": 1.2496,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.3183183183183183,
748
- "grad_norm": 4.3766303062438965,
749
- "learning_rate": 0.0001563876709540178,
750
- "loss": 1.673,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.3213213213213213,
755
- "grad_norm": 1.925053596496582,
756
- "learning_rate": 0.00015560604027088477,
757
- "loss": 0.7445,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.32432432432432434,
762
- "grad_norm": 2.5130960941314697,
763
- "learning_rate": 0.00015481946044447099,
764
- "loss": 0.8784,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.32732732732732733,
769
- "grad_norm": 3.086637020111084,
770
- "learning_rate": 0.00015402800148329071,
771
- "loss": 1.1608,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.3303303303303303,
776
- "grad_norm": 2.877929925918579,
777
- "learning_rate": 0.0001532317338301192,
778
- "loss": 0.9467,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.3333333333333333,
783
- "grad_norm": 2.76326847076416,
784
- "learning_rate": 0.00015243072835572318,
785
- "loss": 1.0425,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.33633633633633636,
790
- "grad_norm": 3.9744467735290527,
791
- "learning_rate": 0.00015162505635255287,
792
- "loss": 1.3432,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.33933933933933935,
797
- "grad_norm": 5.023432731628418,
798
- "learning_rate": 0.00015081478952839693,
799
- "loss": 1.6413,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.34234234234234234,
804
- "grad_norm": 3.5280025005340576,
805
- "learning_rate": 0.00015000000000000001,
806
- "loss": 1.0987,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.34534534534534533,
811
- "grad_norm": 2.802316665649414,
812
- "learning_rate": 0.0001491807602866442,
813
- "loss": 0.7508,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.3483483483483483,
818
- "grad_norm": 3.0784826278686523,
819
- "learning_rate": 0.00014835714330369446,
820
- "loss": 1.199,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.35135135135135137,
825
- "grad_norm": 4.75717306137085,
826
- "learning_rate": 0.000147529222356109,
827
- "loss": 1.3701,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.35435435435435436,
832
- "grad_norm": 2.9851152896881104,
833
- "learning_rate": 0.00014669707113191483,
834
- "loss": 1.037,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.35735735735735735,
839
- "grad_norm": 4.035271644592285,
840
- "learning_rate": 0.00014586076369564908,
841
- "loss": 1.0533,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.36036036036036034,
846
- "grad_norm": 9.006219863891602,
847
- "learning_rate": 0.00014502037448176734,
848
- "loss": 1.0655,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.3633633633633634,
853
- "grad_norm": 3.5663275718688965,
854
- "learning_rate": 0.00014417597828801832,
855
- "loss": 1.568,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.3663663663663664,
860
- "grad_norm": 2.996906280517578,
861
- "learning_rate": 0.00014332765026878687,
862
- "loss": 1.0472,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.36936936936936937,
867
- "grad_norm": 2.879059076309204,
868
- "learning_rate": 0.0001424754659284048,
869
- "loss": 1.214,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.37237237237237236,
874
- "grad_norm": 3.065863847732544,
875
- "learning_rate": 0.00014161950111443077,
876
- "loss": 0.8353,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.37537537537537535,
881
- "grad_norm": 4.549298286437988,
882
- "learning_rate": 0.00014075983201089964,
883
- "loss": 0.9154,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.3783783783783784,
888
- "grad_norm": 4.853454113006592,
889
- "learning_rate": 0.00013989653513154165,
890
- "loss": 1.5181,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.3813813813813814,
895
- "grad_norm": 2.2490577697753906,
896
- "learning_rate": 0.00013902968731297255,
897
- "loss": 0.9221,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.3843843843843844,
902
- "grad_norm": 4.746774673461914,
903
- "learning_rate": 0.00013815936570785487,
904
- "loss": 0.7953,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.38738738738738737,
909
- "grad_norm": 3.5353190898895264,
910
- "learning_rate": 0.00013728564777803088,
911
- "loss": 0.9101,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.39039039039039036,
916
- "grad_norm": 3.052245855331421,
917
- "learning_rate": 0.0001364086112876284,
918
- "loss": 1.3159,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.3933933933933934,
923
- "grad_norm": 4.477910041809082,
924
- "learning_rate": 0.00013552833429613938,
925
- "loss": 1.2709,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.3963963963963964,
930
- "grad_norm": 7.228553295135498,
931
- "learning_rate": 0.00013464489515147238,
932
- "loss": 1.4622,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.3993993993993994,
937
- "grad_norm": 2.7557127475738525,
938
- "learning_rate": 0.00013375837248297926,
939
- "loss": 1.4031,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.4024024024024024,
944
- "grad_norm": 5.027473449707031,
945
- "learning_rate": 0.0001328688451944569,
946
- "loss": 1.2564,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.40540540540540543,
951
- "grad_norm": 2.9252965450286865,
952
- "learning_rate": 0.00013197639245712454,
953
- "loss": 1.0321,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.4084084084084084,
958
- "grad_norm": 2.841583728790283,
959
- "learning_rate": 0.00013108109370257712,
960
- "loss": 0.9534,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.4114114114114114,
965
- "grad_norm": 4.3325114250183105,
966
- "learning_rate": 0.0001301830286157157,
967
- "loss": 1.4278,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.4144144144144144,
972
- "grad_norm": 3.721039056777954,
973
- "learning_rate": 0.00012928227712765504,
974
- "loss": 1.4167,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.4174174174174174,
979
- "grad_norm": 4.287530422210693,
980
- "learning_rate": 0.00012837891940860972,
981
- "loss": 1.4874,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.42042042042042044,
986
- "grad_norm": 3.366671085357666,
987
- "learning_rate": 0.0001274730358607583,
988
- "loss": 1.0406,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.42342342342342343,
993
- "grad_norm": 4.121444225311279,
994
- "learning_rate": 0.00012656470711108764,
995
- "loss": 1.6264,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.4264264264264264,
1000
- "grad_norm": 3.2435896396636963,
1001
- "learning_rate": 0.00012565401400421651,
1002
- "loss": 1.1203,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.4294294294294294,
1007
- "grad_norm": 2.8955342769622803,
1008
- "learning_rate": 0.00012474103759520027,
1009
- "loss": 1.2582,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.43243243243243246,
1014
- "grad_norm": 4.04188346862793,
1015
- "learning_rate": 0.0001238258591423165,
1016
- "loss": 1.0391,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.43543543543543545,
1021
- "grad_norm": 4.958117961883545,
1022
- "learning_rate": 0.000122908560099833,
1023
- "loss": 1.4298,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.43843843843843844,
1028
- "grad_norm": 1.997272253036499,
1029
- "learning_rate": 0.00012198922211075778,
1030
- "loss": 0.9793,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.44144144144144143,
1035
- "grad_norm": 2.4838814735412598,
1036
- "learning_rate": 0.00012106792699957263,
1037
- "loss": 1.0245,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.4444444444444444,
1042
- "grad_norm": 3.6618947982788086,
1043
- "learning_rate": 0.00012014475676495052,
1044
- "loss": 1.2034,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.44744744744744747,
1049
- "grad_norm": 2.2039713859558105,
1050
- "learning_rate": 0.0001192197935724573,
1051
- "loss": 0.9593,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.45045045045045046,
1056
- "grad_norm": 3.798941135406494,
1057
- "learning_rate": 0.00011829311974723867,
1058
- "loss": 1.6691,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.45345345345345345,
1063
- "grad_norm": 1.9355311393737793,
1064
- "learning_rate": 0.00011736481776669306,
1065
- "loss": 0.6611,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.45645645645645644,
1070
- "grad_norm": 2.3967108726501465,
1071
- "learning_rate": 0.00011643497025313061,
1072
- "loss": 1.0664,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.4594594594594595,
1077
- "grad_norm": 4.215780735015869,
1078
- "learning_rate": 0.00011550365996641979,
1079
- "loss": 0.9672,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.4624624624624625,
1084
- "grad_norm": 3.002946615219116,
1085
- "learning_rate": 0.00011457096979662114,
1086
- "loss": 1.6195,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.46546546546546547,
1091
- "grad_norm": 2.1367151737213135,
1092
- "learning_rate": 0.00011363698275661001,
1093
- "loss": 0.7318,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.46846846846846846,
1098
- "grad_norm": 2.3746261596679688,
1099
- "learning_rate": 0.00011270178197468789,
1100
- "loss": 0.7527,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.47147147147147145,
1105
- "grad_norm": 4.308503150939941,
1106
- "learning_rate": 0.00011176545068718385,
1107
- "loss": 1.6989,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.4744744744744745,
1112
- "grad_norm": 2.32792067527771,
1113
- "learning_rate": 0.0001108280722310462,
1114
- "loss": 1.262,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.4774774774774775,
1119
- "grad_norm": 4.225987911224365,
1120
- "learning_rate": 0.00010988973003642499,
1121
- "loss": 1.4396,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.4804804804804805,
1126
- "grad_norm": 3.8827497959136963,
1127
- "learning_rate": 0.00010895050761924668,
1128
- "loss": 1.5353,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.48348348348348347,
1133
- "grad_norm": 3.854318380355835,
1134
- "learning_rate": 0.00010801048857378071,
1135
- "loss": 1.4654,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.4864864864864865,
1140
- "grad_norm": 2.3658535480499268,
1141
- "learning_rate": 0.00010706975656519946,
1142
- "loss": 1.0751,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.4894894894894895,
1147
- "grad_norm": 6.701672554016113,
1148
- "learning_rate": 0.00010612839532213164,
1149
- "loss": 1.6829,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.4924924924924925,
1154
- "grad_norm": 3.207954168319702,
1155
- "learning_rate": 0.00010518648862921012,
1156
- "loss": 1.4072,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.4954954954954955,
1161
- "grad_norm": 2.800644636154175,
1162
- "learning_rate": 0.00010424412031961484,
1163
- "loss": 1.0728,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.4984984984984985,
1168
- "grad_norm": 4.086751461029053,
1169
- "learning_rate": 0.00010330137426761135,
1170
- "loss": 1.1628,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.5015015015015015,
1175
- "grad_norm": 2.659355640411377,
1176
- "learning_rate": 0.00010235833438108571,
1177
- "loss": 1.1724,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.5045045045045045,
1182
- "grad_norm": 3.152151107788086,
1183
- "learning_rate": 0.00010141508459407623,
1184
- "loss": 1.2078,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.5075075075075075,
1189
- "grad_norm": 3.8074827194213867,
1190
- "learning_rate": 0.00010047170885930324,
1191
- "loss": 1.1832,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.5105105105105106,
1196
- "grad_norm": 3.1562180519104004,
1197
- "learning_rate": 9.95282911406968e-05,
1198
- "loss": 1.1005,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.5135135135135135,
1203
- "grad_norm": 2.793989419937134,
1204
- "learning_rate": 9.858491540592382e-05,
1205
- "loss": 1.0381,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.5165165165165165,
1210
- "grad_norm": 3.589442491531372,
1211
- "learning_rate": 9.764166561891432e-05,
1212
- "loss": 0.9335,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.5195195195195195,
1217
- "grad_norm": 4.19896125793457,
1218
- "learning_rate": 9.669862573238863e-05,
1219
- "loss": 1.1411,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.5225225225225225,
1224
- "grad_norm": 3.931088924407959,
1225
- "learning_rate": 9.57558796803852e-05,
1226
- "loss": 1.0498,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.5255255255255256,
1231
- "grad_norm": 2.746661424636841,
1232
- "learning_rate": 9.48135113707899e-05,
1233
- "loss": 1.0747,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.5285285285285285,
1238
- "grad_norm": 2.2940986156463623,
1239
- "learning_rate": 9.38716046778684e-05,
1240
- "loss": 1.0899,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.5315315315315315,
1245
- "grad_norm": 2.696507692337036,
1246
- "learning_rate": 9.293024343480055e-05,
1247
- "loss": 0.7636,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.5345345345345346,
1252
- "grad_norm": 2.972304344177246,
1253
- "learning_rate": 9.198951142621929e-05,
1254
- "loss": 1.349,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.5375375375375375,
1259
- "grad_norm": 2.4269001483917236,
1260
- "learning_rate": 9.104949238075336e-05,
1261
- "loss": 1.1242,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.5405405405405406,
1266
- "grad_norm": 3.105961799621582,
1267
- "learning_rate": 9.011026996357503e-05,
1268
- "loss": 1.0303,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.5435435435435435,
1273
- "grad_norm": 2.861002206802368,
1274
- "learning_rate": 8.917192776895382e-05,
1275
- "loss": 1.2752,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.5465465465465466,
1280
- "grad_norm": 3.16262149810791,
1281
- "learning_rate": 8.823454931281616e-05,
1282
- "loss": 0.9932,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.5495495495495496,
1287
- "grad_norm": 2.2218563556671143,
1288
- "learning_rate": 8.729821802531212e-05,
1289
- "loss": 0.9025,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.5525525525525525,
1294
- "grad_norm": 3.3385350704193115,
1295
- "learning_rate": 8.636301724339004e-05,
1296
- "loss": 1.1192,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.5555555555555556,
1301
- "grad_norm": 1.895119071006775,
1302
- "learning_rate": 8.542903020337887e-05,
1303
- "loss": 0.7014,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.5585585585585585,
1308
- "grad_norm": 2.023733377456665,
1309
- "learning_rate": 8.449634003358022e-05,
1310
- "loss": 0.8134,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.5615615615615616,
1315
- "grad_norm": 3.346677303314209,
1316
- "learning_rate": 8.356502974686941e-05,
1317
- "loss": 0.8806,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.5645645645645646,
1322
- "grad_norm": 2.9996836185455322,
1323
- "learning_rate": 8.263518223330697e-05,
1324
- "loss": 1.3907,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.5675675675675675,
1329
- "grad_norm": 4.624329090118408,
1330
- "learning_rate": 8.170688025276134e-05,
1331
- "loss": 1.2675,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.5705705705705706,
1336
- "grad_norm": 4.569002628326416,
1337
- "learning_rate": 8.078020642754274e-05,
1338
- "loss": 1.176,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.5735735735735735,
1343
- "grad_norm": 4.073513031005859,
1344
- "learning_rate": 7.985524323504948e-05,
1345
- "loss": 1.058,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.5765765765765766,
1350
- "grad_norm": 3.1799652576446533,
1351
- "learning_rate": 7.89320730004274e-05,
1352
- "loss": 1.1819,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.5795795795795796,
1357
- "grad_norm": 3.175107002258301,
1358
- "learning_rate": 7.801077788924224e-05,
1359
- "loss": 1.1484,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.5825825825825826,
1364
- "grad_norm": 7.873251914978027,
1365
- "learning_rate": 7.709143990016702e-05,
1366
- "loss": 1.172,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.5855855855855856,
1371
- "grad_norm": 2.4473395347595215,
1372
- "learning_rate": 7.617414085768351e-05,
1373
- "loss": 1.0935,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.5885885885885885,
1378
- "grad_norm": 2.298689842224121,
1379
- "learning_rate": 7.525896240479976e-05,
1380
- "loss": 1.1773,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.5915915915915916,
1385
- "grad_norm": 3.7667293548583984,
1386
- "learning_rate": 7.434598599578351e-05,
1387
- "loss": 1.0643,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.5945945945945946,
1392
- "grad_norm": 2.9455878734588623,
1393
- "learning_rate": 7.343529288891239e-05,
1394
- "loss": 1.227,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.5975975975975976,
1399
- "grad_norm": 7.777294158935547,
1400
- "learning_rate": 7.252696413924174e-05,
1401
- "loss": 1.2713,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.6006006006006006,
1406
- "grad_norm": 2.827641248703003,
1407
- "learning_rate": 7.162108059139032e-05,
1408
- "loss": 1.2447,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.6036036036036037,
1413
- "grad_norm": 2.9091506004333496,
1414
- "learning_rate": 7.071772287234497e-05,
1415
- "loss": 1.0227,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.6066066066066066,
1420
- "grad_norm": 2.9833085536956787,
1421
- "learning_rate": 6.981697138428434e-05,
1422
- "loss": 0.9794,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.6096096096096096,
1427
- "grad_norm": 3.3673200607299805,
1428
- "learning_rate": 6.891890629742288e-05,
1429
- "loss": 0.9187,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.6126126126126126,
1434
- "grad_norm": 2.38704252243042,
1435
- "learning_rate": 6.802360754287547e-05,
1436
- "loss": 1.0263,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.6156156156156156,
1441
- "grad_norm": 4.426513195037842,
1442
- "learning_rate": 6.713115480554313e-05,
1443
- "loss": 1.1677,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.6186186186186187,
1448
- "grad_norm": 3.6128177642822266,
1449
- "learning_rate": 6.624162751702076e-05,
1450
- "loss": 1.3946,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.6216216216216216,
1455
- "grad_norm": 3.2020866870880127,
1456
- "learning_rate": 6.535510484852767e-05,
1457
- "loss": 1.5255,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.6246246246246246,
1462
- "grad_norm": 2.582364082336426,
1463
- "learning_rate": 6.447166570386063e-05,
1464
- "loss": 1.1196,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.6276276276276276,
1469
- "grad_norm": 3.6883230209350586,
1470
- "learning_rate": 6.35913887123716e-05,
1471
- "loss": 1.2505,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.6306306306306306,
1476
- "grad_norm": 2.0863749980926514,
1477
- "learning_rate": 6.271435222196916e-05,
1478
- "loss": 0.5439,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.6336336336336337,
1483
- "grad_norm": 1.9187757968902588,
1484
- "learning_rate": 6.184063429214515e-05,
1485
- "loss": 0.6077,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.6366366366366366,
1490
- "grad_norm": 2.5035970211029053,
1491
- "learning_rate": 6.097031268702746e-05,
1492
- "loss": 1.0272,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.6396396396396397,
1497
- "grad_norm": 2.1252877712249756,
1498
- "learning_rate": 6.010346486845837e-05,
1499
- "loss": 0.877,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.6426426426426426,
1504
- "grad_norm": 2.2181661128997803,
1505
- "learning_rate": 5.924016798910037e-05,
1506
- "loss": 0.7115,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.6456456456456456,
1511
- "grad_norm": 2.6346538066864014,
1512
- "learning_rate": 5.838049888556925e-05,
1513
- "loss": 1.2154,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.6486486486486487,
1518
- "grad_norm": 3.103731870651245,
1519
- "learning_rate": 5.752453407159522e-05,
1520
- "loss": 1.021,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.6516516516516516,
1525
- "grad_norm": 3.415545701980591,
1526
- "learning_rate": 5.667234973121317e-05,
1527
- "loss": 1.5522,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.6546546546546547,
1532
- "grad_norm": 3.22251558303833,
1533
- "learning_rate": 5.5824021711981686e-05,
1534
- "loss": 0.9561,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.6576576576576577,
1539
- "grad_norm": 2.920732259750366,
1540
- "learning_rate": 5.497962551823266e-05,
1541
- "loss": 1.0558,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.6606606606606606,
1546
- "grad_norm": 4.05772590637207,
1547
- "learning_rate": 5.4139236304350935e-05,
1548
- "loss": 1.0468,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.6636636636636637,
1553
- "grad_norm": 2.826936960220337,
1554
- "learning_rate": 5.33029288680852e-05,
1555
- "loss": 1.1075,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.6666666666666666,
1560
- "grad_norm": 4.209110260009766,
1561
- "learning_rate": 5.247077764389099e-05,
1562
- "loss": 1.1172,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.6696696696696697,
1567
- "grad_norm": 3.3651530742645264,
1568
- "learning_rate": 5.1642856696305575e-05,
1569
- "loss": 1.0825,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.6726726726726727,
1574
- "grad_norm": 2.487950086593628,
1575
- "learning_rate": 5.081923971335582e-05,
1576
- "loss": 0.7995,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.6756756756756757,
1581
- "grad_norm": 3.0736727714538574,
1582
- "learning_rate": 5.000000000000002e-05,
1583
- "loss": 1.0645,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.6786786786786787,
1588
- "grad_norm": 4.154182434082031,
1589
- "learning_rate": 4.918521047160308e-05,
1590
- "loss": 1.784,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.6816816816816816,
1595
- "grad_norm": 2.6880645751953125,
1596
- "learning_rate": 4.837494364744711e-05,
1597
- "loss": 0.806,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.6846846846846847,
1602
- "grad_norm": 8.7504301071167,
1603
- "learning_rate": 4.756927164427685e-05,
1604
- "loss": 1.2716,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.6876876876876877,
1609
- "grad_norm": 3.641247510910034,
1610
- "learning_rate": 4.6768266169880804e-05,
1611
- "loss": 1.0403,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.6906906906906907,
1616
- "grad_norm": 2.9611194133758545,
1617
- "learning_rate": 4.597199851670932e-05,
1618
- "loss": 1.2985,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.6936936936936937,
1623
- "grad_norm": 2.1863701343536377,
1624
- "learning_rate": 4.518053955552903e-05,
1625
- "loss": 0.9741,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.6966966966966966,
1630
- "grad_norm": 2.5241053104400635,
1631
- "learning_rate": 4.4393959729115244e-05,
1632
- "loss": 1.0619,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 0.6996996996996997,
1637
- "grad_norm": 2.0366642475128174,
1638
- "learning_rate": 4.3612329045982236e-05,
1639
- "loss": 1.0319,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 0.7027027027027027,
1644
- "grad_norm": 3.1480021476745605,
1645
- "learning_rate": 4.283571707415214e-05,
1646
- "loss": 1.2822,
1647
  "step": 234
1648
  },
1649
  {
1650
  "epoch": 0.7057057057057057,
1651
- "grad_norm": 3.6605899333953857,
1652
- "learning_rate": 4.206419293496333e-05,
1653
- "loss": 1.0404,
1654
  "step": 235
1655
  },
1656
  {
1657
  "epoch": 0.7087087087087087,
1658
- "grad_norm": 2.7231175899505615,
1659
- "learning_rate": 4.129782529691815e-05,
1660
- "loss": 1.0927,
1661
  "step": 236
1662
  },
1663
  {
1664
  "epoch": 0.7117117117117117,
1665
- "grad_norm": 2.513577938079834,
1666
- "learning_rate": 4.053668236957134e-05,
1667
- "loss": 1.2299,
1668
  "step": 237
1669
  },
1670
  {
1671
  "epoch": 0.7147147147147147,
1672
- "grad_norm": 1.9379348754882812,
1673
- "learning_rate": 3.978083189745907e-05,
1674
- "loss": 0.7212,
1675
  "step": 238
1676
  },
1677
  {
1678
  "epoch": 0.7177177177177178,
1679
- "grad_norm": 2.7951018810272217,
1680
- "learning_rate": 3.903034115406931e-05,
1681
- "loss": 1.0704,
1682
  "step": 239
1683
  },
1684
  {
1685
  "epoch": 0.7207207207207207,
1686
- "grad_norm": 2.4771158695220947,
1687
- "learning_rate": 3.828527693585451e-05,
1688
- "loss": 1.0166,
1689
  "step": 240
1690
  },
1691
  {
1692
  "epoch": 0.7237237237237237,
1693
- "grad_norm": 3.3941121101379395,
1694
- "learning_rate": 3.7545705556286126e-05,
1695
- "loss": 1.5398,
1696
  "step": 241
1697
  },
1698
  {
1699
  "epoch": 0.7267267267267268,
1700
- "grad_norm": 2.322334051132202,
1701
- "learning_rate": 3.681169283995279e-05,
1702
- "loss": 1.2624,
1703
  "step": 242
1704
  },
1705
  {
1706
  "epoch": 0.7297297297297297,
1707
- "grad_norm": 2.8599162101745605,
1708
- "learning_rate": 3.6083304116701535e-05,
1709
- "loss": 1.1381,
1710
  "step": 243
1711
  },
1712
  {
1713
  "epoch": 0.7327327327327328,
1714
- "grad_norm": 4.170224666595459,
1715
- "learning_rate": 3.536060421582309e-05,
1716
- "loss": 1.4004,
1717
  "step": 244
1718
  },
1719
  {
1720
  "epoch": 0.7357357357357357,
1721
- "grad_norm": 3.191027879714966,
1722
- "learning_rate": 3.464365746028208e-05,
1723
- "loss": 1.1464,
1724
  "step": 245
1725
  },
1726
  {
1727
  "epoch": 0.7387387387387387,
1728
- "grad_norm": 2.834343671798706,
1729
- "learning_rate": 3.393252766099187e-05,
1730
- "loss": 1.142,
1731
  "step": 246
1732
  },
1733
  {
1734
  "epoch": 0.7417417417417418,
1735
- "grad_norm": 3.2168824672698975,
1736
- "learning_rate": 3.322727811113516e-05,
1737
- "loss": 1.2666,
1738
  "step": 247
1739
  },
1740
  {
1741
  "epoch": 0.7447447447447447,
1742
- "grad_norm": 2.456047296524048,
1743
- "learning_rate": 3.252797158053077e-05,
1744
- "loss": 1.2571,
1745
  "step": 248
1746
  },
1747
  {
1748
  "epoch": 0.7477477477477478,
1749
- "grad_norm": 2.9790220260620117,
1750
- "learning_rate": 3.1834670310046734e-05,
1751
- "loss": 1.2003,
1752
  "step": 249
1753
  },
1754
  {
1755
  "epoch": 0.7507507507507507,
1756
- "grad_norm": 3.3574981689453125,
1757
- "learning_rate": 3.114743600606078e-05,
1758
- "loss": 1.3745,
1759
  "step": 250
1760
  },
1761
  {
1762
  "epoch": 0.7537537537537538,
1763
- "grad_norm": 2.140984296798706,
1764
- "learning_rate": 3.0466329834968233e-05,
1765
- "loss": 0.8346,
1766
  "step": 251
1767
  },
1768
  {
1769
  "epoch": 0.7567567567567568,
1770
- "grad_norm": 3.07039475440979,
1771
- "learning_rate": 2.979141241773775e-05,
1772
- "loss": 1.2459,
1773
  "step": 252
1774
  },
1775
  {
1776
  "epoch": 0.7597597597597597,
1777
- "grad_norm": 2.29309344291687,
1778
- "learning_rate": 2.9122743824516195e-05,
1779
- "loss": 1.0559,
1780
  "step": 253
1781
  },
1782
  {
1783
  "epoch": 0.7627627627627628,
1784
- "grad_norm": 2.319108247756958,
1785
- "learning_rate": 2.8460383569281824e-05,
1786
- "loss": 0.8545,
1787
  "step": 254
1788
  },
1789
  {
1790
  "epoch": 0.7657657657657657,
1791
- "grad_norm": 3.5860605239868164,
1792
- "learning_rate": 2.7804390604547557e-05,
1793
- "loss": 0.9624,
1794
  "step": 255
1795
  },
1796
  {
1797
  "epoch": 0.7687687687687688,
1798
- "grad_norm": 2.4737586975097656,
1799
- "learning_rate": 2.7154823316113932e-05,
1800
- "loss": 1.1101,
1801
  "step": 256
1802
  },
1803
  {
1804
  "epoch": 0.7717717717717718,
1805
- "grad_norm": 2.6330103874206543,
1806
- "learning_rate": 2.6511739517872426e-05,
1807
- "loss": 1.1027,
1808
  "step": 257
1809
  },
1810
  {
1811
  "epoch": 0.7747747747747747,
1812
- "grad_norm": 2.807921886444092,
1813
- "learning_rate": 2.587519644666001e-05,
1814
- "loss": 0.8148,
1815
  "step": 258
1816
  },
1817
  {
1818
  "epoch": 0.7777777777777778,
1819
- "grad_norm": 4.0378546714782715,
1820
- "learning_rate": 2.5245250757164663e-05,
1821
- "loss": 1.3036,
1822
  "step": 259
1823
  },
1824
  {
1825
  "epoch": 0.7807807807807807,
1826
- "grad_norm": 2.8638672828674316,
1827
- "learning_rate": 2.462195851688306e-05,
1828
- "loss": 0.8851,
1829
  "step": 260
1830
  },
1831
  {
1832
  "epoch": 0.7837837837837838,
1833
- "grad_norm": 2.558032512664795,
1834
- "learning_rate": 2.4005375201130274e-05,
1835
- "loss": 1.1158,
1836
  "step": 261
1837
  },
1838
  {
1839
  "epoch": 0.7867867867867868,
1840
- "grad_norm": 1.8975292444229126,
1841
- "learning_rate": 2.339555568810221e-05,
1842
- "loss": 0.8933,
1843
  "step": 262
1844
  },
1845
  {
1846
  "epoch": 0.7897897897897898,
1847
- "grad_norm": 2.8043062686920166,
1848
- "learning_rate": 2.2792554253991415e-05,
1849
- "loss": 0.9539,
1850
  "step": 263
1851
  },
1852
  {
1853
  "epoch": 0.7927927927927928,
1854
- "grad_norm": 3.37785267829895,
1855
- "learning_rate": 2.2196424568156073e-05,
1856
- "loss": 1.3296,
1857
  "step": 264
1858
  },
1859
  {
1860
  "epoch": 0.7957957957957958,
1861
- "grad_norm": 2.7959485054016113,
1862
- "learning_rate": 2.160721968834344e-05,
1863
- "loss": 1.1462,
1864
  "step": 265
1865
  },
1866
  {
1867
  "epoch": 0.7987987987987988,
1868
- "grad_norm": 7.449779510498047,
1869
- "learning_rate": 2.102499205596743e-05,
1870
- "loss": 1.0893,
1871
  "step": 266
1872
  },
1873
  {
1874
  "epoch": 0.8018018018018018,
1875
- "grad_norm": 6.506124019622803,
1876
- "learning_rate": 2.0449793491441028e-05,
1877
- "loss": 1.1367,
1878
  "step": 267
1879
  },
1880
  {
1881
  "epoch": 0.8048048048048048,
1882
- "grad_norm": 2.821166515350342,
1883
- "learning_rate": 1.9881675189564254e-05,
1884
- "loss": 1.6805,
1885
  "step": 268
1886
  },
1887
  {
1888
  "epoch": 0.8078078078078078,
1889
- "grad_norm": 2.951826333999634,
1890
- "learning_rate": 1.93206877149676e-05,
1891
- "loss": 0.8443,
1892
  "step": 269
1893
  },
1894
  {
1895
  "epoch": 0.8108108108108109,
1896
- "grad_norm": 3.3361992835998535,
1897
- "learning_rate": 1.8766880997611424e-05,
1898
- "loss": 1.2579,
1899
  "step": 270
1900
  },
1901
  {
1902
  "epoch": 0.8138138138138138,
1903
- "grad_norm": 1.7760874032974243,
1904
- "learning_rate": 1.8220304328342252e-05,
1905
- "loss": 0.6704,
1906
  "step": 271
1907
  },
1908
  {
1909
  "epoch": 0.8168168168168168,
1910
- "grad_norm": 3.165902614593506,
1911
- "learning_rate": 1.7681006354505493e-05,
1912
- "loss": 1.0814,
1913
  "step": 272
1914
  },
1915
  {
1916
  "epoch": 0.8198198198198198,
1917
- "grad_norm": 2.863865613937378,
1918
- "learning_rate": 1.7149035075615794e-05,
1919
- "loss": 1.0898,
1920
  "step": 273
1921
  },
1922
  {
1923
  "epoch": 0.8228228228228228,
1924
- "grad_norm": 2.5821480751037598,
1925
- "learning_rate": 1.6624437839084862e-05,
1926
- "loss": 0.8588,
1927
  "step": 274
1928
  },
1929
  {
1930
  "epoch": 0.8258258258258259,
1931
- "grad_norm": 3.5799527168273926,
1932
- "learning_rate": 1.6107261336007285e-05,
1933
- "loss": 1.2082,
1934
  "step": 275
1935
  },
1936
  {
1937
  "epoch": 0.8288288288288288,
1938
- "grad_norm": 1.9022583961486816,
1939
- "learning_rate": 1.5597551597004966e-05,
1940
- "loss": 0.8508,
1941
  "step": 276
1942
  },
1943
  {
1944
  "epoch": 0.8318318318318318,
1945
- "grad_norm": 3.3668415546417236,
1946
- "learning_rate": 1.5095353988130235e-05,
1947
- "loss": 1.4462,
1948
  "step": 277
1949
  },
1950
  {
1951
  "epoch": 0.8348348348348348,
1952
- "grad_norm": 3.1614718437194824,
1953
- "learning_rate": 1.4600713206827932e-05,
1954
- "loss": 0.7332,
1955
  "step": 278
1956
  },
1957
  {
1958
  "epoch": 0.8378378378378378,
1959
- "grad_norm": 3.1946263313293457,
1960
- "learning_rate": 1.4113673277957395e-05,
1961
- "loss": 1.2424,
1962
  "step": 279
1963
  },
1964
  {
1965
  "epoch": 0.8408408408408409,
1966
- "grad_norm": 3.346984386444092,
1967
- "learning_rate": 1.3634277549873953e-05,
1968
- "loss": 1.1425,
1969
  "step": 280
1970
  },
1971
  {
1972
  "epoch": 0.8438438438438438,
1973
- "grad_norm": 2.6733968257904053,
1974
- "learning_rate": 1.3162568690570743e-05,
1975
- "loss": 1.0175,
1976
  "step": 281
1977
  },
1978
  {
1979
  "epoch": 0.8468468468468469,
1980
- "grad_norm": 3.606302499771118,
1981
- "learning_rate": 1.2698588683881186e-05,
1982
- "loss": 1.1272,
1983
  "step": 282
1984
  },
1985
  {
1986
  "epoch": 0.8498498498498499,
1987
- "grad_norm": 3.11902117729187,
1988
- "learning_rate": 1.224237882574224e-05,
1989
- "loss": 0.8483,
1990
  "step": 283
1991
  },
1992
  {
1993
  "epoch": 0.8528528528528528,
1994
- "grad_norm": 2.8845720291137695,
1995
- "learning_rate": 1.1793979720518866e-05,
1996
- "loss": 0.9861,
1997
  "step": 284
1998
  },
1999
  {
2000
  "epoch": 0.8558558558558559,
2001
- "grad_norm": 3.02492094039917,
2002
- "learning_rate": 1.1353431277390126e-05,
2003
- "loss": 1.2887,
2004
  "step": 285
2005
  },
2006
  {
2007
  "epoch": 0.8588588588588588,
2008
- "grad_norm": 3.848069906234741,
2009
- "learning_rate": 1.0920772706797167e-05,
2010
- "loss": 1.0295,
2011
  "step": 286
2012
  },
2013
  {
2014
  "epoch": 0.8618618618618619,
2015
- "grad_norm": 2.647904396057129,
2016
- "learning_rate": 1.0496042516953209e-05,
2017
- "loss": 0.9551,
2018
  "step": 287
2019
  },
2020
  {
2021
  "epoch": 0.8648648648648649,
2022
- "grad_norm": 2.807253360748291,
2023
- "learning_rate": 1.0079278510416313e-05,
2024
- "loss": 0.7592,
2025
  "step": 288
2026
  },
2027
  {
2028
  "epoch": 0.8678678678678678,
2029
- "grad_norm": 3.8293511867523193,
2030
- "learning_rate": 9.670517780724775e-06,
2031
- "loss": 1.0225,
2032
  "step": 289
2033
  },
2034
  {
2035
  "epoch": 0.8708708708708709,
2036
- "grad_norm": 2.4294826984405518,
2037
- "learning_rate": 9.269796709095558e-06,
2038
- "loss": 1.4176,
2039
  "step": 290
2040
  },
2041
  {
2042
  "epoch": 0.8738738738738738,
2043
- "grad_norm": 4.781210899353027,
2044
- "learning_rate": 8.87715096118642e-06,
2045
- "loss": 0.8798,
2046
  "step": 291
2047
  },
2048
  {
2049
  "epoch": 0.8768768768768769,
2050
- "grad_norm": 2.406425714492798,
2051
- "learning_rate": 8.492615483921395e-06,
2052
- "loss": 0.8694,
2053
  "step": 292
2054
  },
2055
  {
2056
  "epoch": 0.8798798798798799,
2057
- "grad_norm": 1.9549373388290405,
2058
- "learning_rate": 8.116224502380387e-06,
2059
- "loss": 1.1899,
2060
  "step": 293
2061
  },
2062
  {
2063
  "epoch": 0.8828828828828829,
2064
- "grad_norm": 3.0934197902679443,
2065
- "learning_rate": 7.74801151675314e-06,
2066
- "loss": 0.9323,
2067
  "step": 294
2068
  },
2069
  {
2070
  "epoch": 0.8858858858858859,
2071
- "grad_norm": 4.107941150665283,
2072
- "learning_rate": 7.3880092993574125e-06,
2073
- "loss": 1.3923,
2074
  "step": 295
2075
  },
2076
  {
2077
  "epoch": 0.8888888888888888,
2078
- "grad_norm": 2.7095839977264404,
2079
- "learning_rate": 7.03624989172228e-06,
2080
- "loss": 0.8522,
2081
  "step": 296
2082
  },
2083
  {
2084
  "epoch": 0.8918918918918919,
2085
- "grad_norm": 1.8515009880065918,
2086
- "learning_rate": 6.692764601736268e-06,
2087
- "loss": 0.6926,
2088
  "step": 297
2089
  },
2090
  {
2091
  "epoch": 0.8948948948948949,
2092
- "grad_norm": 3.2831356525421143,
2093
- "learning_rate": 6.357584000860761e-06,
2094
  "loss": 0.9239,
2095
  "step": 298
2096
  },
2097
  {
2098
  "epoch": 0.8978978978978979,
2099
- "grad_norm": 2.5845866203308105,
2100
- "learning_rate": 6.030737921409169e-06,
2101
- "loss": 0.8028,
2102
  "step": 299
2103
  },
2104
  {
2105
  "epoch": 0.9009009009009009,
2106
- "grad_norm": 2.855114221572876,
2107
- "learning_rate": 5.71225545389158e-06,
2108
- "loss": 1.1832,
2109
  "step": 300
2110
  },
2111
  {
2112
  "epoch": 0.9039039039039038,
2113
- "grad_norm": 3.3236331939697266,
2114
- "learning_rate": 5.402164944425758e-06,
2115
- "loss": 1.1724,
2116
  "step": 301
2117
  },
2118
  {
2119
  "epoch": 0.9069069069069069,
2120
- "grad_norm": 6.83448600769043,
2121
- "learning_rate": 5.100493992214128e-06,
2122
- "loss": 1.3314,
2123
  "step": 302
2124
  },
2125
  {
2126
  "epoch": 0.9099099099099099,
2127
- "grad_norm": 2.863325834274292,
2128
- "learning_rate": 4.807269447087348e-06,
2129
- "loss": 1.0996,
2130
  "step": 303
2131
  },
2132
  {
2133
  "epoch": 0.9129129129129129,
2134
- "grad_norm": 3.2937326431274414,
2135
- "learning_rate": 4.5225174071146455e-06,
2136
- "loss": 1.1262,
2137
  "step": 304
2138
  },
2139
  {
2140
  "epoch": 0.9159159159159159,
2141
- "grad_norm": 4.532299995422363,
2142
- "learning_rate": 4.24626321628091e-06,
2143
- "loss": 0.8499,
2144
  "step": 305
2145
  },
2146
  {
2147
  "epoch": 0.918918918918919,
2148
- "grad_norm": 3.1992688179016113,
2149
- "learning_rate": 3.9785314622310495e-06,
2150
- "loss": 1.3525,
2151
  "step": 306
2152
  },
2153
  {
2154
  "epoch": 0.9219219219219219,
2155
- "grad_norm": 2.4705193042755127,
2156
- "learning_rate": 3.7193459740815674e-06,
2157
- "loss": 1.0908,
2158
  "step": 307
2159
  },
2160
  {
2161
  "epoch": 0.924924924924925,
2162
- "grad_norm": 2.904529333114624,
2163
- "learning_rate": 3.4687298202996655e-06,
2164
- "loss": 0.9582,
2165
  "step": 308
2166
  },
2167
  {
2168
  "epoch": 0.9279279279279279,
2169
- "grad_norm": 2.3138301372528076,
2170
- "learning_rate": 3.226705306650113e-06,
2171
- "loss": 1.2733,
2172
  "step": 309
2173
  },
2174
  {
2175
  "epoch": 0.9309309309309309,
2176
- "grad_norm": 2.09416127204895,
2177
- "learning_rate": 2.9932939742099208e-06,
2178
- "loss": 0.7504,
2179
  "step": 310
2180
  },
2181
  {
2182
  "epoch": 0.933933933933934,
2183
- "grad_norm": 3.167368173599243,
2184
- "learning_rate": 2.7685165974510986e-06,
2185
- "loss": 1.2412,
2186
  "step": 311
2187
  },
2188
  {
2189
  "epoch": 0.9369369369369369,
2190
- "grad_norm": 3.094839572906494,
2191
- "learning_rate": 2.552393182391677e-06,
2192
- "loss": 0.9607,
2193
  "step": 312
2194
  },
2195
  {
2196
  "epoch": 0.93993993993994,
2197
- "grad_norm": 2.5439722537994385,
2198
- "learning_rate": 2.3449429648150665e-06,
2199
- "loss": 1.1211,
2200
  "step": 313
2201
  },
2202
  {
2203
  "epoch": 0.9429429429429429,
2204
- "grad_norm": 3.0354812145233154,
2205
- "learning_rate": 2.1461844085580385e-06,
2206
- "loss": 1.1613,
2207
  "step": 314
2208
  },
2209
  {
2210
  "epoch": 0.9459459459459459,
2211
- "grad_norm": 2.3744139671325684,
2212
- "learning_rate": 1.9561352038673263e-06,
2213
- "loss": 0.9695,
2214
  "step": 315
2215
  },
2216
  {
2217
  "epoch": 0.948948948948949,
2218
- "grad_norm": 4.216712474822998,
2219
- "learning_rate": 1.7748122658251876e-06,
2220
- "loss": 0.7317,
2221
  "step": 316
2222
  },
2223
  {
2224
  "epoch": 0.9519519519519519,
2225
- "grad_norm": 2.2783091068267822,
2226
- "learning_rate": 1.6022317328438506e-06,
2227
- "loss": 1.0139,
2228
  "step": 317
2229
  },
2230
  {
2231
  "epoch": 0.954954954954955,
2232
- "grad_norm": 3.2791147232055664,
2233
- "learning_rate": 1.4384089652291543e-06,
2234
- "loss": 1.1532,
2235
  "step": 318
2236
  },
2237
  {
2238
  "epoch": 0.9579579579579579,
2239
- "grad_norm": 2.5588083267211914,
2240
- "learning_rate": 1.2833585438134287e-06,
2241
- "loss": 0.9971,
2242
  "step": 319
2243
  },
2244
  {
2245
  "epoch": 0.960960960960961,
2246
- "grad_norm": 2.4663734436035156,
2247
- "learning_rate": 1.1370942686577347e-06,
2248
- "loss": 1.1583,
2249
  "step": 320
2250
  },
2251
  {
2252
  "epoch": 0.963963963963964,
2253
- "grad_norm": 3.0910394191741943,
2254
- "learning_rate": 9.996291578236228e-07,
2255
- "loss": 1.3676,
2256
  "step": 321
2257
  },
2258
  {
2259
  "epoch": 0.9669669669669669,
2260
- "grad_norm": 2.7286369800567627,
2261
- "learning_rate": 8.709754462144615e-07,
2262
- "loss": 1.1954,
2263
  "step": 322
2264
  },
2265
  {
2266
  "epoch": 0.96996996996997,
2267
- "grad_norm": 2.4788362979888916,
2268
- "learning_rate": 7.511445844864962e-07,
2269
- "loss": 0.6394,
2270
  "step": 323
2271
  },
2272
  {
2273
  "epoch": 0.972972972972973,
2274
- "grad_norm": 2.686504364013672,
2275
- "learning_rate": 6.401472380297091e-07,
2276
- "loss": 1.2232,
2277
  "step": 324
2278
  },
2279
  {
2280
  "epoch": 0.975975975975976,
2281
- "grad_norm": 2.3043410778045654,
2282
- "learning_rate": 5.379932860185122e-07,
2283
- "loss": 0.8467,
2284
  "step": 325
2285
  },
2286
  {
2287
  "epoch": 0.978978978978979,
2288
- "grad_norm": 2.39601993560791,
2289
- "learning_rate": 4.44691820532539e-07,
2290
- "loss": 1.031,
2291
  "step": 326
2292
  },
2293
  {
2294
  "epoch": 0.9819819819819819,
2295
- "grad_norm": 3.4683892726898193,
2296
- "learning_rate": 3.6025114574734785e-07,
2297
- "loss": 1.2397,
2298
  "step": 327
2299
  },
2300
  {
2301
  "epoch": 0.984984984984985,
2302
- "grad_norm": 3.130098819732666,
2303
- "learning_rate": 2.846787771953574e-07,
2304
- "loss": 1.0141,
2305
  "step": 328
2306
  },
2307
  {
2308
  "epoch": 0.987987987987988,
2309
- "grad_norm": 3.894436836242676,
2310
- "learning_rate": 2.179814410969261e-07,
2311
- "loss": 1.4286,
2312
  "step": 329
2313
  },
2314
  {
2315
  "epoch": 0.990990990990991,
2316
- "grad_norm": 3.297236919403076,
2317
- "learning_rate": 1.6016507376169777e-07,
2318
- "loss": 1.2933,
2319
  "step": 330
2320
  },
2321
  {
2322
  "epoch": 0.993993993993994,
2323
- "grad_norm": 4.986927509307861,
2324
- "learning_rate": 1.1123482106021322e-07,
2325
- "loss": 1.6548,
2326
  "step": 331
2327
  },
2328
  {
2329
  "epoch": 0.996996996996997,
2330
- "grad_norm": 3.8665215969085693,
2331
- "learning_rate": 7.119503796599868e-08,
2332
- "loss": 0.8686,
2333
  "step": 332
2334
  },
2335
  {
2336
  "epoch": 1.0,
2337
- "grad_norm": 2.562675952911377,
2338
- "learning_rate": 4.0049288167842705e-08,
2339
- "loss": 0.8482,
2340
  "step": 333
2341
  },
2342
  {
2343
  "epoch": 1.0,
2344
- "eval_loss": 1.0635182857513428,
2345
- "eval_runtime": 6.3055,
2346
- "eval_samples_per_second": 11.26,
2347
- "eval_steps_per_second": 11.26,
2348
  "step": 333
2349
  }
2350
  ],
 
24
  },
25
  {
26
  "epoch": 0.009009009009009009,
27
+ "grad_norm": 8.949604988098145,
28
  "learning_rate": 0.0001999955498150411,
29
  "loss": 2.3407,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.012012012012012012,
34
+ "grad_norm": 3.158568859100342,
35
  "learning_rate": 0.00019998219965624734,
36
+ "loss": 1.9823,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.015015015015015015,
41
+ "grad_norm": 2.2516987323760986,
42
  "learning_rate": 0.0001999599507118322,
43
+ "loss": 1.3624,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.018018018018018018,
48
+ "grad_norm": 4.390817642211914,
49
  "learning_rate": 0.000199928804962034,
50
+ "loss": 2.4517,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.021021021021021023,
55
+ "grad_norm": 3.761112689971924,
56
  "learning_rate": 0.0001998887651789398,
57
+ "loss": 1.9101,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.024024024024024024,
62
+ "grad_norm": 3.4166648387908936,
63
  "learning_rate": 0.00019983983492623833,
64
+ "loss": 1.6885,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.02702702702702703,
69
+ "grad_norm": 3.3827452659606934,
70
  "learning_rate": 0.00019978201855890308,
71
+ "loss": 1.4844,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.03003003003003003,
76
+ "grad_norm": 5.928102016448975,
77
  "learning_rate": 0.00019971532122280464,
78
+ "loss": 2.0922,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.03303303303303303,
83
+ "grad_norm": 3.9921727180480957,
84
  "learning_rate": 0.00019963974885425266,
85
+ "loss": 1.9214,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.036036036036036036,
90
+ "grad_norm": 6.364567279815674,
91
  "learning_rate": 0.00019955530817946748,
92
+ "loss": 1.9784,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.03903903903903904,
97
+ "grad_norm": 6.44351053237915,
98
  "learning_rate": 0.0001994620067139815,
99
+ "loss": 1.9525,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.042042042042042045,
104
+ "grad_norm": 4.626143455505371,
105
  "learning_rate": 0.0001993598527619703,
106
+ "loss": 1.8573,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.04504504504504504,
111
+ "grad_norm": 5.128786087036133,
112
  "learning_rate": 0.0001992488554155135,
113
+ "loss": 1.7945,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.04804804804804805,
118
+ "grad_norm": 4.881668567657471,
119
  "learning_rate": 0.00019912902455378556,
120
+ "loss": 1.5265,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.05105105105105105,
125
+ "grad_norm": 5.330976486206055,
126
  "learning_rate": 0.00019900037084217637,
127
+ "loss": 1.0192,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.05405405405405406,
132
+ "grad_norm": 4.7276458740234375,
133
  "learning_rate": 0.00019886290573134228,
134
+ "loss": 1.7171,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.057057057057057055,
139
+ "grad_norm": 4.941858768463135,
140
  "learning_rate": 0.00019871664145618657,
141
+ "loss": 1.0095,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.06006006006006006,
146
+ "grad_norm": 3.3863754272460938,
147
  "learning_rate": 0.00019856159103477086,
148
+ "loss": 1.0899,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.06306306306306306,
153
+ "grad_norm": 4.0780029296875,
154
  "learning_rate": 0.00019839776826715614,
155
+ "loss": 1.7246,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.06606606606606606,
160
+ "grad_norm": 4.969494342803955,
161
  "learning_rate": 0.0001982251877341748,
162
+ "loss": 1.2525,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.06906906906906907,
167
+ "grad_norm": 9.143044471740723,
168
  "learning_rate": 0.0001980438647961327,
169
+ "loss": 1.642,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.07207207207207207,
174
+ "grad_norm": 4.1555070877075195,
175
  "learning_rate": 0.00019785381559144196,
176
+ "loss": 1.4793,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.07507507507507508,
181
+ "grad_norm": 4.046628952026367,
182
  "learning_rate": 0.00019765505703518496,
183
+ "loss": 0.85,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.07807807807807808,
188
+ "grad_norm": 4.636494159698486,
189
  "learning_rate": 0.00019744760681760832,
190
+ "loss": 1.3454,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.08108108108108109,
195
+ "grad_norm": 4.610245704650879,
196
  "learning_rate": 0.00019723148340254892,
197
+ "loss": 1.7203,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.08408408408408409,
202
+ "grad_norm": 4.697399139404297,
203
  "learning_rate": 0.00019700670602579008,
204
+ "loss": 1.5735,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.08708708708708708,
209
+ "grad_norm": 3.4270856380462646,
210
  "learning_rate": 0.0001967732946933499,
211
+ "loss": 1.4224,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.09009009009009009,
216
+ "grad_norm": 6.80886697769165,
217
  "learning_rate": 0.00019653127017970034,
218
+ "loss": 1.5906,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.09309309309309309,
223
+ "grad_norm": 2.8819735050201416,
224
  "learning_rate": 0.00019628065402591845,
225
+ "loss": 1.2217,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.0960960960960961,
230
+ "grad_norm": 3.6171391010284424,
231
  "learning_rate": 0.00019602146853776894,
232
+ "loss": 1.2468,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.0990990990990991,
237
+ "grad_norm": 4.199411869049072,
238
  "learning_rate": 0.00019575373678371909,
239
+ "loss": 1.563,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.1021021021021021,
244
+ "grad_norm": 3.5157337188720703,
245
  "learning_rate": 0.00019547748259288536,
246
+ "loss": 1.3674,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.10510510510510511,
251
+ "grad_norm": 3.1449432373046875,
252
  "learning_rate": 0.00019519273055291266,
253
+ "loss": 1.2334,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.10810810810810811,
258
+ "grad_norm": 3.480318546295166,
259
  "learning_rate": 0.0001948995060077859,
260
+ "loss": 1.3391,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1111111111111111,
265
+ "grad_norm": 3.561872720718384,
266
  "learning_rate": 0.00019459783505557424,
267
+ "loss": 1.2206,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.11411411411411411,
272
+ "grad_norm": 2.9358267784118652,
273
  "learning_rate": 0.00019428774454610843,
274
+ "loss": 1.0831,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.11711711711711711,
279
+ "grad_norm": 3.375826358795166,
280
  "learning_rate": 0.00019396926207859084,
281
+ "loss": 0.991,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.12012012012012012,
286
+ "grad_norm": 4.3282246589660645,
287
  "learning_rate": 0.00019364241599913924,
288
+ "loss": 0.9534,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.12312312312312312,
293
+ "grad_norm": 5.404246807098389,
294
  "learning_rate": 0.00019330723539826375,
295
+ "loss": 1.5467,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.12612612612612611,
300
+ "grad_norm": 4.246641159057617,
301
  "learning_rate": 0.00019296375010827773,
302
+ "loss": 1.3542,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.12912912912912913,
307
+ "grad_norm": 4.657464027404785,
308
  "learning_rate": 0.0001926119907006426,
309
+ "loss": 0.8422,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.13213213213213212,
314
+ "grad_norm": 3.8296756744384766,
315
  "learning_rate": 0.0001922519884832469,
316
+ "loss": 1.3043,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.13513513513513514,
321
+ "grad_norm": 4.33470344543457,
322
  "learning_rate": 0.00019188377549761963,
323
+ "loss": 1.2141,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.13813813813813813,
328
+ "grad_norm": 3.89436674118042,
329
  "learning_rate": 0.0001915073845160786,
330
+ "loss": 1.4874,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.14114114114114115,
335
+ "grad_norm": 4.670392990112305,
336
  "learning_rate": 0.0001911228490388136,
337
+ "loss": 1.475,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.14414414414414414,
342
+ "grad_norm": 3.8425967693328857,
343
  "learning_rate": 0.00019073020329090444,
344
+ "loss": 1.2083,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.14714714714714713,
349
+ "grad_norm": 3.335406541824341,
350
  "learning_rate": 0.00019032948221927524,
351
+ "loss": 1.0553,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.15015015015015015,
356
+ "grad_norm": 3.949777364730835,
357
  "learning_rate": 0.00018992072148958368,
358
+ "loss": 1.1888,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.15315315315315314,
363
+ "grad_norm": 5.711514949798584,
364
  "learning_rate": 0.00018950395748304678,
365
+ "loss": 1.2669,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.15615615615615616,
370
+ "grad_norm": 3.32204008102417,
371
  "learning_rate": 0.00018907922729320285,
372
+ "loss": 1.5032,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.15915915915915915,
377
+ "grad_norm": 3.5603139400482178,
378
  "learning_rate": 0.00018864656872260985,
379
  "loss": 1.1452,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.16216216216216217,
384
+ "grad_norm": 2.997321128845215,
385
  "learning_rate": 0.00018820602027948114,
386
+ "loss": 1.1479,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.16516516516516516,
391
+ "grad_norm": 3.761380672454834,
392
  "learning_rate": 0.00018775762117425777,
393
+ "loss": 1.1018,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.16816816816816818,
398
+ "grad_norm": 3.998138904571533,
399
  "learning_rate": 0.00018730141131611882,
400
+ "loss": 0.9831,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.17117117117117117,
405
+ "grad_norm": 3.5861902236938477,
406
  "learning_rate": 0.00018683743130942928,
407
+ "loss": 1.181,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.17417417417417416,
412
+ "grad_norm": 3.1308274269104004,
413
  "learning_rate": 0.00018636572245012606,
414
+ "loss": 1.3986,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.17717717717717718,
419
+ "grad_norm": 10.603771209716797,
420
  "learning_rate": 0.00018588632672204264,
421
+ "loss": 1.1666,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.18018018018018017,
426
+ "grad_norm": 4.469778537750244,
427
  "learning_rate": 0.0001853992867931721,
428
+ "loss": 1.4776,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.1831831831831832,
433
+ "grad_norm": 3.3826351165771484,
434
  "learning_rate": 0.0001849046460118698,
435
+ "loss": 1.3352,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.18618618618618618,
440
+ "grad_norm": 3.4881980419158936,
441
  "learning_rate": 0.00018440244840299506,
442
+ "loss": 1.1307,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.1891891891891892,
447
+ "grad_norm": 3.8292853832244873,
448
  "learning_rate": 0.00018389273866399275,
449
+ "loss": 1.4379,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.1921921921921922,
454
+ "grad_norm": 3.452981472015381,
455
  "learning_rate": 0.00018337556216091517,
456
+ "loss": 1.3291,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.19519519519519518,
461
+ "grad_norm": 3.15205717086792,
462
  "learning_rate": 0.00018285096492438424,
463
+ "loss": 1.1697,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.1981981981981982,
468
+ "grad_norm": 2.998598098754883,
469
  "learning_rate": 0.00018231899364549455,
470
+ "loss": 0.8398,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.2012012012012012,
475
+ "grad_norm": 3.3119194507598877,
476
  "learning_rate": 0.0001817796956716578,
477
+ "loss": 0.9666,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.2042042042042042,
482
+ "grad_norm": 3.2154204845428467,
483
  "learning_rate": 0.0001812331190023886,
484
+ "loss": 1.0111,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.2072072072072072,
489
+ "grad_norm": 3.498479127883911,
490
  "learning_rate": 0.00018067931228503246,
491
+ "loss": 1.4255,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.21021021021021022,
496
+ "grad_norm": 3.3297171592712402,
497
  "learning_rate": 0.00018011832481043576,
498
+ "loss": 1.0459,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.2132132132132132,
503
+ "grad_norm": 3.0861032009124756,
504
  "learning_rate": 0.000179550206508559,
505
+ "loss": 1.1825,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.21621621621621623,
510
+ "grad_norm": 3.7197835445404053,
511
  "learning_rate": 0.0001789750079440326,
512
+ "loss": 1.3545,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.21921921921921922,
517
+ "grad_norm": 2.9662370681762695,
518
  "learning_rate": 0.00017839278031165658,
519
+ "loss": 1.1287,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.2222222222222222,
524
+ "grad_norm": 3.009904623031616,
525
  "learning_rate": 0.00017780357543184397,
526
+ "loss": 1.1675,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.22522522522522523,
531
+ "grad_norm": 3.819753885269165,
532
  "learning_rate": 0.00017720744574600863,
533
+ "loss": 1.2138,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.22822822822822822,
538
+ "grad_norm": 3.731964111328125,
539
  "learning_rate": 0.0001766044443118978,
540
+ "loss": 1.4024,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.23123123123123124,
545
+ "grad_norm": 7.4664082527160645,
546
  "learning_rate": 0.00017599462479886974,
547
+ "loss": 1.1637,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.23423423423423423,
552
+ "grad_norm": 3.3976633548736572,
553
  "learning_rate": 0.00017537804148311695,
554
+ "loss": 1.2395,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.23723723723723725,
559
+ "grad_norm": 2.8055222034454346,
560
  "learning_rate": 0.00017475474924283536,
561
+ "loss": 1.0323,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.24024024024024024,
566
+ "grad_norm": 5.117397308349609,
567
  "learning_rate": 0.00017412480355334005,
568
+ "loss": 1.6292,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.24324324324324326,
573
+ "grad_norm": 6.144725799560547,
574
+ "learning_rate": 0.0001734882604821276,
575
+ "loss": 1.5008,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.24624624624624625,
580
+ "grad_norm": 3.7815804481506348,
581
+ "learning_rate": 0.0001728451766838861,
582
+ "loss": 1.3559,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.24924924924924924,
587
+ "grad_norm": 3.6141855716705322,
588
+ "learning_rate": 0.00017219560939545246,
589
+ "loss": 1.505,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.25225225225225223,
594
+ "grad_norm": 2.9968454837799072,
595
+ "learning_rate": 0.0001715396164307182,
596
+ "loss": 1.1781,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.2552552552552553,
601
+ "grad_norm": 2.8306217193603516,
602
+ "learning_rate": 0.00017087725617548385,
603
+ "loss": 1.1847,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.25825825825825827,
608
+ "grad_norm": 2.852055788040161,
609
+ "learning_rate": 0.00017020858758226229,
610
+ "loss": 1.0893,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.26126126126126126,
615
+ "grad_norm": 3.723259687423706,
616
+ "learning_rate": 0.00016953367016503182,
617
+ "loss": 1.2045,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.26426426426426425,
622
+ "grad_norm": 2.880476236343384,
623
+ "learning_rate": 0.00016885256399393924,
624
+ "loss": 0.9824,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.2672672672672673,
629
+ "grad_norm": 2.7408432960510254,
630
+ "learning_rate": 0.00016816532968995328,
631
+ "loss": 0.9685,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.2702702702702703,
636
+ "grad_norm": 6.530210494995117,
637
+ "learning_rate": 0.00016747202841946928,
638
+ "loss": 1.5825,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.2732732732732733,
643
+ "grad_norm": 3.415400266647339,
644
+ "learning_rate": 0.00016677272188886483,
645
+ "loss": 1.4,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.27627627627627627,
650
+ "grad_norm": 3.045269250869751,
651
+ "learning_rate": 0.00016606747233900815,
652
+ "loss": 1.0537,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.27927927927927926,
657
+ "grad_norm": 4.4474992752075195,
658
+ "learning_rate": 0.00016535634253971794,
659
+ "loss": 1.4919,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.2822822822822823,
664
+ "grad_norm": 3.2899246215820312,
665
+ "learning_rate": 0.00016463939578417692,
666
+ "loss": 0.9117,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.2852852852852853,
671
+ "grad_norm": 3.275552749633789,
672
+ "learning_rate": 0.0001639166958832985,
673
+ "loss": 0.9903,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.2882882882882883,
678
+ "grad_norm": 4.8841118812561035,
679
+ "learning_rate": 0.00016318830716004722,
680
+ "loss": 1.615,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.2912912912912913,
685
+ "grad_norm": 4.650697231292725,
686
+ "learning_rate": 0.0001624542944437139,
687
+ "loss": 2.0446,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.29429429429429427,
692
+ "grad_norm": 3.8506672382354736,
693
+ "learning_rate": 0.00016171472306414554,
694
+ "loss": 1.3541,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.2972972972972973,
699
+ "grad_norm": 3.125844955444336,
700
+ "learning_rate": 0.0001609696588459307,
701
+ "loss": 0.9896,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.3003003003003003,
706
+ "grad_norm": 3.528669595718384,
707
+ "learning_rate": 0.00016021916810254097,
708
+ "loss": 1.2391,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.3033033033033033,
713
+ "grad_norm": 2.1554737091064453,
714
+ "learning_rate": 0.00015946331763042867,
715
+ "loss": 0.8665,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.3063063063063063,
720
+ "grad_norm": 3.8150954246520996,
721
+ "learning_rate": 0.00015870217470308188,
722
+ "loss": 1.2792,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.30930930930930933,
727
+ "grad_norm": 2.630399465560913,
728
+ "learning_rate": 0.0001579358070650367,
729
+ "loss": 1.177,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.3123123123123123,
734
+ "grad_norm": 3.1974780559539795,
735
+ "learning_rate": 0.00015716428292584787,
736
+ "loss": 1.5204,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.3153153153153153,
741
+ "grad_norm": 3.4595398902893066,
742
+ "learning_rate": 0.0001563876709540178,
743
+ "loss": 1.261,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.3183183183183183,
748
+ "grad_norm": 4.504167556762695,
749
+ "learning_rate": 0.00015560604027088477,
750
+ "loss": 1.7519,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.3213213213213213,
755
+ "grad_norm": 2.0379419326782227,
756
+ "learning_rate": 0.00015481946044447099,
757
+ "loss": 0.7486,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.32432432432432434,
762
+ "grad_norm": 2.478163957595825,
763
+ "learning_rate": 0.00015402800148329071,
764
+ "loss": 0.8723,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.32732732732732733,
769
+ "grad_norm": 2.9465012550354004,
770
+ "learning_rate": 0.0001532317338301192,
771
+ "loss": 1.1437,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.3303303303303303,
776
+ "grad_norm": 3.080664873123169,
777
+ "learning_rate": 0.00015243072835572318,
778
+ "loss": 0.968,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.3333333333333333,
783
+ "grad_norm": 2.778723955154419,
784
+ "learning_rate": 0.00015162505635255287,
785
+ "loss": 1.0228,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.33633633633633636,
790
+ "grad_norm": 3.867264986038208,
791
+ "learning_rate": 0.00015081478952839693,
792
+ "loss": 1.3582,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.33933933933933935,
797
+ "grad_norm": 4.303924560546875,
798
+ "learning_rate": 0.00015000000000000001,
799
+ "loss": 1.6162,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.34234234234234234,
804
+ "grad_norm": 3.681415319442749,
805
+ "learning_rate": 0.0001491807602866442,
806
+ "loss": 1.075,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.34534534534534533,
811
+ "grad_norm": 3.065018892288208,
812
+ "learning_rate": 0.00014835714330369446,
813
+ "loss": 0.7634,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.3483483483483483,
818
+ "grad_norm": 3.532973051071167,
819
+ "learning_rate": 0.000147529222356109,
820
+ "loss": 1.2287,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.35135135135135137,
825
+ "grad_norm": 4.905853748321533,
826
+ "learning_rate": 0.00014669707113191483,
827
+ "loss": 1.3884,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.35435435435435436,
832
+ "grad_norm": 2.8591182231903076,
833
+ "learning_rate": 0.00014586076369564908,
834
+ "loss": 1.0066,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.35735735735735735,
839
+ "grad_norm": 5.718966007232666,
840
+ "learning_rate": 0.00014502037448176734,
841
+ "loss": 1.0447,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.36036036036036034,
846
+ "grad_norm": 3.5775551795959473,
847
+ "learning_rate": 0.00014417597828801832,
848
+ "loss": 1.0552,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.3633633633633634,
853
+ "grad_norm": 3.6629276275634766,
854
+ "learning_rate": 0.00014332765026878687,
855
+ "loss": 1.5753,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.3663663663663664,
860
+ "grad_norm": 3.239745616912842,
861
+ "learning_rate": 0.0001424754659284048,
862
+ "loss": 1.0734,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.36936936936936937,
867
+ "grad_norm": 3.027463674545288,
868
+ "learning_rate": 0.00014161950111443077,
869
+ "loss": 1.2182,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.37237237237237236,
874
+ "grad_norm": 2.7655885219573975,
875
+ "learning_rate": 0.00014075983201089964,
876
+ "loss": 0.8409,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.37537537537537535,
881
+ "grad_norm": 2.6868016719818115,
882
+ "learning_rate": 0.00013989653513154165,
883
+ "loss": 0.9132,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.3783783783783784,
888
+ "grad_norm": 4.335800647735596,
889
+ "learning_rate": 0.00013902968731297255,
890
+ "loss": 1.448,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.3813813813813814,
895
+ "grad_norm": 2.2871532440185547,
896
+ "learning_rate": 0.00013815936570785487,
897
+ "loss": 0.889,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.3843843843843844,
902
+ "grad_norm": 4.444313049316406,
903
+ "learning_rate": 0.00013728564777803088,
904
+ "loss": 0.8284,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.38738738738738737,
909
+ "grad_norm": 3.820131778717041,
910
+ "learning_rate": 0.0001364086112876284,
911
+ "loss": 0.8969,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.39039039039039036,
916
+ "grad_norm": 4.358783721923828,
917
+ "learning_rate": 0.00013552833429613938,
918
+ "loss": 1.3421,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.3933933933933934,
923
+ "grad_norm": 5.490592002868652,
924
+ "learning_rate": 0.00013464489515147238,
925
+ "loss": 1.3454,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.3963963963963964,
930
+ "grad_norm": 5.831946849822998,
931
+ "learning_rate": 0.00013375837248297926,
932
+ "loss": 1.3816,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.3993993993993994,
937
+ "grad_norm": 2.685199022293091,
938
+ "learning_rate": 0.0001328688451944569,
939
+ "loss": 1.4342,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.4024024024024024,
944
+ "grad_norm": 3.5410306453704834,
945
+ "learning_rate": 0.00013197639245712454,
946
+ "loss": 1.2398,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.40540540540540543,
951
+ "grad_norm": 4.072748184204102,
952
+ "learning_rate": 0.00013108109370257712,
953
+ "loss": 1.0573,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.4084084084084084,
958
+ "grad_norm": 2.7305827140808105,
959
+ "learning_rate": 0.0001301830286157157,
960
+ "loss": 0.9445,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.4114114114114114,
965
+ "grad_norm": 3.4876914024353027,
966
+ "learning_rate": 0.00012928227712765504,
967
+ "loss": 1.424,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.4144144144144144,
972
+ "grad_norm": 3.93389630317688,
973
+ "learning_rate": 0.00012837891940860972,
974
+ "loss": 1.3908,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.4174174174174174,
979
+ "grad_norm": 4.051177501678467,
980
+ "learning_rate": 0.0001274730358607583,
981
+ "loss": 1.487,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.42042042042042044,
986
+ "grad_norm": 3.429589033126831,
987
+ "learning_rate": 0.00012656470711108764,
988
+ "loss": 1.0791,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.42342342342342343,
993
+ "grad_norm": 4.349189281463623,
994
+ "learning_rate": 0.00012565401400421651,
995
+ "loss": 1.5873,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.4264264264264264,
1000
+ "grad_norm": 3.6553049087524414,
1001
+ "learning_rate": 0.00012474103759520027,
1002
+ "loss": 1.1005,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.4294294294294294,
1007
+ "grad_norm": 2.88079571723938,
1008
+ "learning_rate": 0.0001238258591423165,
1009
+ "loss": 1.2325,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.43243243243243246,
1014
+ "grad_norm": 3.8122270107269287,
1015
+ "learning_rate": 0.000122908560099833,
1016
+ "loss": 1.0523,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.43543543543543545,
1021
+ "grad_norm": 3.8025801181793213,
1022
+ "learning_rate": 0.00012198922211075778,
1023
+ "loss": 1.3929,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.43843843843843844,
1028
+ "grad_norm": 2.05446195602417,
1029
+ "learning_rate": 0.00012106792699957263,
1030
+ "loss": 0.9597,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.44144144144144143,
1035
+ "grad_norm": 2.6509203910827637,
1036
+ "learning_rate": 0.00012014475676495052,
1037
+ "loss": 1.0216,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.4444444444444444,
1042
+ "grad_norm": 3.2547495365142822,
1043
+ "learning_rate": 0.0001192197935724573,
1044
+ "loss": 1.1975,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.44744744744744747,
1049
+ "grad_norm": 2.5003983974456787,
1050
+ "learning_rate": 0.00011829311974723867,
1051
+ "loss": 0.9849,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.45045045045045046,
1056
+ "grad_norm": 4.184463024139404,
1057
+ "learning_rate": 0.00011736481776669306,
1058
+ "loss": 1.6907,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.45345345345345345,
1063
+ "grad_norm": 1.7590183019638062,
1064
+ "learning_rate": 0.00011643497025313061,
1065
+ "loss": 0.6678,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.45645645645645644,
1070
+ "grad_norm": 2.4469213485717773,
1071
+ "learning_rate": 0.00011550365996641979,
1072
+ "loss": 1.084,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.4594594594594595,
1077
+ "grad_norm": 4.256711959838867,
1078
+ "learning_rate": 0.00011457096979662114,
1079
+ "loss": 0.999,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.4624624624624625,
1084
+ "grad_norm": 3.6662344932556152,
1085
+ "learning_rate": 0.00011363698275661001,
1086
+ "loss": 1.6327,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.46546546546546547,
1091
+ "grad_norm": 2.139150619506836,
1092
+ "learning_rate": 0.00011270178197468789,
1093
+ "loss": 0.7365,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.46846846846846846,
1098
+ "grad_norm": 2.397113561630249,
1099
+ "learning_rate": 0.00011176545068718385,
1100
+ "loss": 0.7469,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.47147147147147145,
1105
+ "grad_norm": 4.046786785125732,
1106
+ "learning_rate": 0.0001108280722310462,
1107
+ "loss": 1.7294,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.4744744744744745,
1112
+ "grad_norm": 3.422342538833618,
1113
+ "learning_rate": 0.00010988973003642499,
1114
+ "loss": 1.3217,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.4774774774774775,
1119
+ "grad_norm": 4.208369255065918,
1120
+ "learning_rate": 0.00010895050761924668,
1121
+ "loss": 1.4273,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.4804804804804805,
1126
+ "grad_norm": 4.233487129211426,
1127
+ "learning_rate": 0.00010801048857378071,
1128
+ "loss": 1.5523,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.48348348348348347,
1133
+ "grad_norm": 3.681049346923828,
1134
+ "learning_rate": 0.00010706975656519946,
1135
+ "loss": 1.4675,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.4864864864864865,
1140
+ "grad_norm": 2.444506883621216,
1141
+ "learning_rate": 0.00010612839532213164,
1142
+ "loss": 1.0676,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.4894894894894895,
1147
+ "grad_norm": 5.741203784942627,
1148
+ "learning_rate": 0.00010518648862921012,
1149
+ "loss": 1.6099,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.4924924924924925,
1154
+ "grad_norm": 3.325943946838379,
1155
+ "learning_rate": 0.00010424412031961484,
1156
+ "loss": 1.4014,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.4954954954954955,
1161
+ "grad_norm": 3.2008140087127686,
1162
+ "learning_rate": 0.00010330137426761135,
1163
+ "loss": 1.0755,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.4984984984984985,
1168
+ "grad_norm": 4.403491020202637,
1169
+ "learning_rate": 0.00010235833438108571,
1170
+ "loss": 1.1924,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.5015015015015015,
1175
+ "grad_norm": 2.671586513519287,
1176
+ "learning_rate": 0.00010141508459407623,
1177
+ "loss": 1.2323,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.5045045045045045,
1182
+ "grad_norm": 3.2359049320220947,
1183
+ "learning_rate": 0.00010047170885930324,
1184
+ "loss": 1.2092,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.5075075075075075,
1189
+ "grad_norm": 4.325455188751221,
1190
+ "learning_rate": 9.95282911406968e-05,
1191
+ "loss": 1.1624,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.5105105105105106,
1196
+ "grad_norm": 3.6989574432373047,
1197
+ "learning_rate": 9.858491540592382e-05,
1198
+ "loss": 1.0925,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.5135135135135135,
1203
+ "grad_norm": 3.4216420650482178,
1204
+ "learning_rate": 9.764166561891432e-05,
1205
+ "loss": 1.0068,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.5165165165165165,
1210
+ "grad_norm": 3.371088743209839,
1211
+ "learning_rate": 9.669862573238863e-05,
1212
+ "loss": 0.9391,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.5195195195195195,
1217
+ "grad_norm": 3.9000446796417236,
1218
+ "learning_rate": 9.57558796803852e-05,
1219
+ "loss": 1.1322,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.5225225225225225,
1224
+ "grad_norm": 4.006927967071533,
1225
+ "learning_rate": 9.48135113707899e-05,
1226
+ "loss": 1.0941,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.5255255255255256,
1231
+ "grad_norm": 2.8356428146362305,
1232
+ "learning_rate": 9.38716046778684e-05,
1233
+ "loss": 1.0821,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.5285285285285285,
1238
+ "grad_norm": 2.2834103107452393,
1239
+ "learning_rate": 9.293024343480055e-05,
1240
+ "loss": 1.0736,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.5315315315315315,
1245
+ "grad_norm": 2.1150338649749756,
1246
+ "learning_rate": 9.198951142621929e-05,
1247
+ "loss": 0.7405,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.5345345345345346,
1252
+ "grad_norm": 4.170374393463135,
1253
+ "learning_rate": 9.104949238075336e-05,
1254
+ "loss": 1.3548,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.5375375375375375,
1259
+ "grad_norm": 2.320065975189209,
1260
+ "learning_rate": 9.011026996357503e-05,
1261
+ "loss": 1.1287,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.5405405405405406,
1266
+ "grad_norm": 3.3114922046661377,
1267
+ "learning_rate": 8.917192776895382e-05,
1268
+ "loss": 1.0874,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.5435435435435435,
1273
+ "grad_norm": 2.822983980178833,
1274
+ "learning_rate": 8.823454931281616e-05,
1275
+ "loss": 1.2849,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.5465465465465466,
1280
+ "grad_norm": 5.101154804229736,
1281
+ "learning_rate": 8.729821802531212e-05,
1282
+ "loss": 1.0091,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.5495495495495496,
1287
+ "grad_norm": 2.1716604232788086,
1288
+ "learning_rate": 8.636301724339004e-05,
1289
+ "loss": 0.9004,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.5525525525525525,
1294
+ "grad_norm": 3.0461654663085938,
1295
+ "learning_rate": 8.542903020337887e-05,
1296
+ "loss": 1.1352,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.5555555555555556,
1301
+ "grad_norm": 1.9528117179870605,
1302
+ "learning_rate": 8.449634003358022e-05,
1303
+ "loss": 0.7149,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.5585585585585585,
1308
+ "grad_norm": 2.121596097946167,
1309
+ "learning_rate": 8.356502974686941e-05,
1310
+ "loss": 0.8156,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.5615615615615616,
1315
+ "grad_norm": 3.440051317214966,
1316
+ "learning_rate": 8.263518223330697e-05,
1317
+ "loss": 0.8836,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.5645645645645646,
1322
+ "grad_norm": 3.4065990447998047,
1323
+ "learning_rate": 8.170688025276134e-05,
1324
+ "loss": 1.3883,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.5675675675675675,
1329
+ "grad_norm": 4.436142444610596,
1330
+ "learning_rate": 8.078020642754274e-05,
1331
+ "loss": 1.247,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.5705705705705706,
1336
+ "grad_norm": 4.292184829711914,
1337
+ "learning_rate": 7.985524323504948e-05,
1338
+ "loss": 1.1434,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.5735735735735735,
1343
+ "grad_norm": 3.944159984588623,
1344
+ "learning_rate": 7.89320730004274e-05,
1345
+ "loss": 1.0251,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.5765765765765766,
1350
+ "grad_norm": 3.428690195083618,
1351
+ "learning_rate": 7.801077788924224e-05,
1352
+ "loss": 1.2212,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.5795795795795796,
1357
+ "grad_norm": 2.958847761154175,
1358
+ "learning_rate": 7.709143990016702e-05,
1359
+ "loss": 1.1812,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.5825825825825826,
1364
+ "grad_norm": 4.4013214111328125,
1365
+ "learning_rate": 7.617414085768351e-05,
1366
+ "loss": 1.1677,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.5855855855855856,
1371
+ "grad_norm": 2.435934543609619,
1372
+ "learning_rate": 7.525896240479976e-05,
1373
+ "loss": 1.0538,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.5885885885885885,
1378
+ "grad_norm": 2.3836328983306885,
1379
+ "learning_rate": 7.434598599578351e-05,
1380
+ "loss": 1.1743,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.5915915915915916,
1385
+ "grad_norm": 3.5238161087036133,
1386
+ "learning_rate": 7.343529288891239e-05,
1387
+ "loss": 1.112,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.5945945945945946,
1392
+ "grad_norm": 3.281916379928589,
1393
+ "learning_rate": 7.252696413924174e-05,
1394
+ "loss": 1.2468,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.5975975975975976,
1399
+ "grad_norm": 9.559804916381836,
1400
+ "learning_rate": 7.162108059139032e-05,
1401
+ "loss": 1.3009,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.6006006006006006,
1406
+ "grad_norm": 2.9067914485931396,
1407
+ "learning_rate": 7.071772287234497e-05,
1408
+ "loss": 1.2777,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.6036036036036037,
1413
+ "grad_norm": 3.17146897315979,
1414
+ "learning_rate": 6.981697138428434e-05,
1415
+ "loss": 1.0077,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.6066066066066066,
1420
+ "grad_norm": 2.9536023139953613,
1421
+ "learning_rate": 6.891890629742288e-05,
1422
+ "loss": 0.9833,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.6096096096096096,
1427
+ "grad_norm": 3.4014861583709717,
1428
+ "learning_rate": 6.802360754287547e-05,
1429
+ "loss": 0.9078,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.6126126126126126,
1434
+ "grad_norm": 2.692471981048584,
1435
+ "learning_rate": 6.713115480554313e-05,
1436
+ "loss": 1.0275,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.6156156156156156,
1441
+ "grad_norm": 4.035511016845703,
1442
+ "learning_rate": 6.624162751702076e-05,
1443
+ "loss": 1.1448,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.6186186186186187,
1448
+ "grad_norm": 3.6545097827911377,
1449
+ "learning_rate": 6.535510484852767e-05,
1450
+ "loss": 1.3975,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.6216216216216216,
1455
+ "grad_norm": 3.810364007949829,
1456
+ "learning_rate": 6.447166570386063e-05,
1457
+ "loss": 1.543,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.6246246246246246,
1462
+ "grad_norm": 2.650916337966919,
1463
+ "learning_rate": 6.35913887123716e-05,
1464
+ "loss": 1.1199,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.6276276276276276,
1469
+ "grad_norm": 4.023564338684082,
1470
+ "learning_rate": 6.271435222196916e-05,
1471
+ "loss": 1.256,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.6306306306306306,
1476
+ "grad_norm": 2.7010152339935303,
1477
+ "learning_rate": 6.184063429214515e-05,
1478
+ "loss": 0.5501,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.6336336336336337,
1483
+ "grad_norm": 1.9381567239761353,
1484
+ "learning_rate": 6.097031268702746e-05,
1485
+ "loss": 0.6213,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.6366366366366366,
1490
+ "grad_norm": 2.4009511470794678,
1491
+ "learning_rate": 6.010346486845837e-05,
1492
+ "loss": 1.0056,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.6396396396396397,
1497
+ "grad_norm": 2.6076042652130127,
1498
+ "learning_rate": 5.924016798910037e-05,
1499
+ "loss": 0.9052,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.6426426426426426,
1504
+ "grad_norm": 2.4669220447540283,
1505
+ "learning_rate": 5.838049888556925e-05,
1506
+ "loss": 0.7062,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.6456456456456456,
1511
+ "grad_norm": 2.6070826053619385,
1512
+ "learning_rate": 5.752453407159522e-05,
1513
+ "loss": 1.2074,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.6486486486486487,
1518
+ "grad_norm": 3.0781683921813965,
1519
+ "learning_rate": 5.667234973121317e-05,
1520
+ "loss": 1.027,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.6516516516516516,
1525
+ "grad_norm": 3.333461284637451,
1526
+ "learning_rate": 5.5824021711981686e-05,
1527
+ "loss": 1.5397,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.6546546546546547,
1532
+ "grad_norm": 3.359348773956299,
1533
+ "learning_rate": 5.497962551823266e-05,
1534
+ "loss": 0.9678,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.6576576576576577,
1539
+ "grad_norm": 3.1527299880981445,
1540
+ "learning_rate": 5.4139236304350935e-05,
1541
+ "loss": 1.0678,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.6606606606606606,
1546
+ "grad_norm": 3.3422963619232178,
1547
+ "learning_rate": 5.33029288680852e-05,
1548
+ "loss": 1.0821,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.6636636636636637,
1553
+ "grad_norm": 2.5791242122650146,
1554
+ "learning_rate": 5.247077764389099e-05,
1555
+ "loss": 1.0926,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.6666666666666666,
1560
+ "grad_norm": 4.386955738067627,
1561
+ "learning_rate": 5.1642856696305575e-05,
1562
+ "loss": 1.1258,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.6696696696696697,
1567
+ "grad_norm": 2.5412847995758057,
1568
+ "learning_rate": 5.081923971335582e-05,
1569
+ "loss": 1.0448,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.6726726726726727,
1574
+ "grad_norm": 2.4744250774383545,
1575
+ "learning_rate": 5.000000000000002e-05,
1576
+ "loss": 0.8041,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.6756756756756757,
1581
+ "grad_norm": 3.471261978149414,
1582
+ "learning_rate": 4.918521047160308e-05,
1583
+ "loss": 1.077,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.6786786786786787,
1588
+ "grad_norm": 4.173779487609863,
1589
+ "learning_rate": 4.837494364744711e-05,
1590
+ "loss": 1.798,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.6816816816816816,
1595
+ "grad_norm": 3.305908203125,
1596
+ "learning_rate": 4.756927164427685e-05,
1597
+ "loss": 0.7939,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.6846846846846847,
1602
+ "grad_norm": 3.1384143829345703,
1603
+ "learning_rate": 4.6768266169880804e-05,
1604
+ "loss": 1.2368,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.6876876876876877,
1609
+ "grad_norm": 3.3017380237579346,
1610
+ "learning_rate": 4.597199851670932e-05,
1611
+ "loss": 1.0357,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.6906906906906907,
1616
+ "grad_norm": 3.067096710205078,
1617
+ "learning_rate": 4.518053955552903e-05,
1618
+ "loss": 1.2825,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.6936936936936937,
1623
+ "grad_norm": 2.126237392425537,
1624
+ "learning_rate": 4.4393959729115244e-05,
1625
+ "loss": 0.9703,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.6966966966966966,
1630
+ "grad_norm": 2.740218162536621,
1631
+ "learning_rate": 4.3612329045982236e-05,
1632
+ "loss": 1.0771,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 0.6996996996996997,
1637
+ "grad_norm": 2.1864259243011475,
1638
+ "learning_rate": 4.283571707415214e-05,
1639
+ "loss": 1.0516,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 0.7027027027027027,
1644
+ "grad_norm": 3.2261359691619873,
1645
+ "learning_rate": 4.206419293496333e-05,
1646
+ "loss": 1.317,
1647
  "step": 234
1648
  },
1649
  {
1650
  "epoch": 0.7057057057057057,
1651
+ "grad_norm": 3.8331665992736816,
1652
+ "learning_rate": 4.129782529691815e-05,
1653
+ "loss": 1.0472,
1654
  "step": 235
1655
  },
1656
  {
1657
  "epoch": 0.7087087087087087,
1658
+ "grad_norm": 2.4469823837280273,
1659
+ "learning_rate": 4.053668236957134e-05,
1660
+ "loss": 1.0914,
1661
  "step": 236
1662
  },
1663
  {
1664
  "epoch": 0.7117117117117117,
1665
+ "grad_norm": 2.4445111751556396,
1666
+ "learning_rate": 3.978083189745907e-05,
1667
+ "loss": 1.2176,
1668
  "step": 237
1669
  },
1670
  {
1671
  "epoch": 0.7147147147147147,
1672
+ "grad_norm": 1.970625400543213,
1673
+ "learning_rate": 3.903034115406931e-05,
1674
+ "loss": 0.7056,
1675
  "step": 238
1676
  },
1677
  {
1678
  "epoch": 0.7177177177177178,
1679
+ "grad_norm": 2.919154644012451,
1680
+ "learning_rate": 3.828527693585451e-05,
1681
+ "loss": 1.0735,
1682
  "step": 239
1683
  },
1684
  {
1685
  "epoch": 0.7207207207207207,
1686
+ "grad_norm": 3.0129635334014893,
1687
+ "learning_rate": 3.7545705556286126e-05,
1688
+ "loss": 1.0387,
1689
  "step": 240
1690
  },
1691
  {
1692
  "epoch": 0.7237237237237237,
1693
+ "grad_norm": 3.2934532165527344,
1694
+ "learning_rate": 3.681169283995279e-05,
1695
+ "loss": 1.5478,
1696
  "step": 241
1697
  },
1698
  {
1699
  "epoch": 0.7267267267267268,
1700
+ "grad_norm": 2.250554084777832,
1701
+ "learning_rate": 3.6083304116701535e-05,
1702
+ "loss": 1.2424,
1703
  "step": 242
1704
  },
1705
  {
1706
  "epoch": 0.7297297297297297,
1707
+ "grad_norm": 2.934713125228882,
1708
+ "learning_rate": 3.536060421582309e-05,
1709
+ "loss": 1.1404,
1710
  "step": 243
1711
  },
1712
  {
1713
  "epoch": 0.7327327327327328,
1714
+ "grad_norm": 3.759920358657837,
1715
+ "learning_rate": 3.464365746028208e-05,
1716
+ "loss": 1.4122,
1717
  "step": 244
1718
  },
1719
  {
1720
  "epoch": 0.7357357357357357,
1721
+ "grad_norm": 3.219802141189575,
1722
+ "learning_rate": 3.393252766099187e-05,
1723
+ "loss": 1.1218,
1724
  "step": 245
1725
  },
1726
  {
1727
  "epoch": 0.7387387387387387,
1728
+ "grad_norm": 2.7375502586364746,
1729
+ "learning_rate": 3.322727811113516e-05,
1730
+ "loss": 1.1416,
1731
  "step": 246
1732
  },
1733
  {
1734
  "epoch": 0.7417417417417418,
1735
+ "grad_norm": 3.227217674255371,
1736
+ "learning_rate": 3.252797158053077e-05,
1737
+ "loss": 1.3001,
1738
  "step": 247
1739
  },
1740
  {
1741
  "epoch": 0.7447447447447447,
1742
+ "grad_norm": 2.4813485145568848,
1743
+ "learning_rate": 3.1834670310046734e-05,
1744
+ "loss": 1.251,
1745
  "step": 248
1746
  },
1747
  {
1748
  "epoch": 0.7477477477477478,
1749
+ "grad_norm": 3.096740245819092,
1750
+ "learning_rate": 3.114743600606078e-05,
1751
+ "loss": 1.2156,
1752
  "step": 249
1753
  },
1754
  {
1755
  "epoch": 0.7507507507507507,
1756
+ "grad_norm": 3.479696750640869,
1757
+ "learning_rate": 3.0466329834968233e-05,
1758
+ "loss": 1.3375,
1759
  "step": 250
1760
  },
1761
  {
1762
  "epoch": 0.7537537537537538,
1763
+ "grad_norm": 2.182914972305298,
1764
+ "learning_rate": 2.979141241773775e-05,
1765
+ "loss": 0.8452,
1766
  "step": 251
1767
  },
1768
  {
1769
  "epoch": 0.7567567567567568,
1770
+ "grad_norm": 3.065699338912964,
1771
+ "learning_rate": 2.9122743824516195e-05,
1772
+ "loss": 1.2515,
1773
  "step": 252
1774
  },
1775
  {
1776
  "epoch": 0.7597597597597597,
1777
+ "grad_norm": 2.5178773403167725,
1778
+ "learning_rate": 2.8460383569281824e-05,
1779
+ "loss": 1.066,
1780
  "step": 253
1781
  },
1782
  {
1783
  "epoch": 0.7627627627627628,
1784
+ "grad_norm": 2.5615763664245605,
1785
+ "learning_rate": 2.7804390604547557e-05,
1786
+ "loss": 0.8671,
1787
  "step": 254
1788
  },
1789
  {
1790
  "epoch": 0.7657657657657657,
1791
+ "grad_norm": 3.6489756107330322,
1792
+ "learning_rate": 2.7154823316113932e-05,
1793
+ "loss": 0.9719,
1794
  "step": 255
1795
  },
1796
  {
1797
  "epoch": 0.7687687687687688,
1798
+ "grad_norm": 2.6565563678741455,
1799
+ "learning_rate": 2.6511739517872426e-05,
1800
+ "loss": 1.1096,
1801
  "step": 256
1802
  },
1803
  {
1804
  "epoch": 0.7717717717717718,
1805
+ "grad_norm": 2.854301691055298,
1806
+ "learning_rate": 2.587519644666001e-05,
1807
+ "loss": 1.1078,
1808
  "step": 257
1809
  },
1810
  {
1811
  "epoch": 0.7747747747747747,
1812
+ "grad_norm": 3.2702455520629883,
1813
+ "learning_rate": 2.5245250757164663e-05,
1814
+ "loss": 0.8153,
1815
  "step": 258
1816
  },
1817
  {
1818
  "epoch": 0.7777777777777778,
1819
+ "grad_norm": 4.333303928375244,
1820
+ "learning_rate": 2.462195851688306e-05,
1821
+ "loss": 1.3388,
1822
  "step": 259
1823
  },
1824
  {
1825
  "epoch": 0.7807807807807807,
1826
+ "grad_norm": 3.0669684410095215,
1827
+ "learning_rate": 2.4005375201130274e-05,
1828
+ "loss": 0.8611,
1829
  "step": 260
1830
  },
1831
  {
1832
  "epoch": 0.7837837837837838,
1833
+ "grad_norm": 2.525719404220581,
1834
+ "learning_rate": 2.339555568810221e-05,
1835
+ "loss": 1.1003,
1836
  "step": 261
1837
  },
1838
  {
1839
  "epoch": 0.7867867867867868,
1840
+ "grad_norm": 2.2282748222351074,
1841
+ "learning_rate": 2.2792554253991415e-05,
1842
+ "loss": 0.9158,
1843
  "step": 262
1844
  },
1845
  {
1846
  "epoch": 0.7897897897897898,
1847
+ "grad_norm": 2.8235883712768555,
1848
+ "learning_rate": 2.2196424568156073e-05,
1849
+ "loss": 0.9614,
1850
  "step": 263
1851
  },
1852
  {
1853
  "epoch": 0.7927927927927928,
1854
+ "grad_norm": 3.252837896347046,
1855
+ "learning_rate": 2.160721968834344e-05,
1856
+ "loss": 1.3249,
1857
  "step": 264
1858
  },
1859
  {
1860
  "epoch": 0.7957957957957958,
1861
+ "grad_norm": 2.533545732498169,
1862
+ "learning_rate": 2.102499205596743e-05,
1863
+ "loss": 1.1374,
1864
  "step": 265
1865
  },
1866
  {
1867
  "epoch": 0.7987987987987988,
1868
+ "grad_norm": 6.542397975921631,
1869
+ "learning_rate": 2.0449793491441028e-05,
1870
+ "loss": 1.1266,
1871
  "step": 266
1872
  },
1873
  {
1874
  "epoch": 0.8018018018018018,
1875
+ "grad_norm": 4.502058982849121,
1876
+ "learning_rate": 1.9881675189564254e-05,
1877
+ "loss": 1.0552,
1878
  "step": 267
1879
  },
1880
  {
1881
  "epoch": 0.8048048048048048,
1882
+ "grad_norm": 2.8032190799713135,
1883
+ "learning_rate": 1.93206877149676e-05,
1884
+ "loss": 1.6937,
1885
  "step": 268
1886
  },
1887
  {
1888
  "epoch": 0.8078078078078078,
1889
+ "grad_norm": 2.657517194747925,
1890
+ "learning_rate": 1.8766880997611424e-05,
1891
+ "loss": 0.8906,
1892
  "step": 269
1893
  },
1894
  {
1895
  "epoch": 0.8108108108108109,
1896
+ "grad_norm": 2.660195827484131,
1897
+ "learning_rate": 1.8220304328342252e-05,
1898
+ "loss": 1.2647,
1899
  "step": 270
1900
  },
1901
  {
1902
  "epoch": 0.8138138138138138,
1903
+ "grad_norm": 1.883196473121643,
1904
+ "learning_rate": 1.7681006354505493e-05,
1905
+ "loss": 0.6738,
1906
  "step": 271
1907
  },
1908
  {
1909
  "epoch": 0.8168168168168168,
1910
+ "grad_norm": 3.13309645652771,
1911
+ "learning_rate": 1.7149035075615794e-05,
1912
+ "loss": 1.066,
1913
  "step": 272
1914
  },
1915
  {
1916
  "epoch": 0.8198198198198198,
1917
+ "grad_norm": 2.7584190368652344,
1918
+ "learning_rate": 1.6624437839084862e-05,
1919
+ "loss": 1.0969,
1920
  "step": 273
1921
  },
1922
  {
1923
  "epoch": 0.8228228228228228,
1924
+ "grad_norm": 2.3647868633270264,
1925
+ "learning_rate": 1.6107261336007285e-05,
1926
+ "loss": 0.8421,
1927
  "step": 274
1928
  },
1929
  {
1930
  "epoch": 0.8258258258258259,
1931
+ "grad_norm": 3.7344658374786377,
1932
+ "learning_rate": 1.5597551597004966e-05,
1933
+ "loss": 1.2212,
1934
  "step": 275
1935
  },
1936
  {
1937
  "epoch": 0.8288288288288288,
1938
+ "grad_norm": 1.9428234100341797,
1939
+ "learning_rate": 1.5095353988130235e-05,
1940
+ "loss": 0.8606,
1941
  "step": 276
1942
  },
1943
  {
1944
  "epoch": 0.8318318318318318,
1945
+ "grad_norm": 4.204195499420166,
1946
+ "learning_rate": 1.4600713206827932e-05,
1947
+ "loss": 1.4646,
1948
  "step": 277
1949
  },
1950
  {
1951
  "epoch": 0.8348348348348348,
1952
+ "grad_norm": 3.156510353088379,
1953
+ "learning_rate": 1.4113673277957395e-05,
1954
+ "loss": 0.7277,
1955
  "step": 278
1956
  },
1957
  {
1958
  "epoch": 0.8378378378378378,
1959
+ "grad_norm": 2.952470064163208,
1960
+ "learning_rate": 1.3634277549873953e-05,
1961
+ "loss": 1.2394,
1962
  "step": 279
1963
  },
1964
  {
1965
  "epoch": 0.8408408408408409,
1966
+ "grad_norm": 3.20043683052063,
1967
+ "learning_rate": 1.3162568690570743e-05,
1968
+ "loss": 1.1353,
1969
  "step": 280
1970
  },
1971
  {
1972
  "epoch": 0.8438438438438438,
1973
+ "grad_norm": 2.7060203552246094,
1974
+ "learning_rate": 1.2698588683881186e-05,
1975
+ "loss": 1.03,
1976
  "step": 281
1977
  },
1978
  {
1979
  "epoch": 0.8468468468468469,
1980
+ "grad_norm": 3.0565147399902344,
1981
+ "learning_rate": 1.224237882574224e-05,
1982
+ "loss": 1.0829,
1983
  "step": 282
1984
  },
1985
  {
1986
  "epoch": 0.8498498498498499,
1987
+ "grad_norm": 3.7081923484802246,
1988
+ "learning_rate": 1.1793979720518866e-05,
1989
+ "loss": 0.861,
1990
  "step": 283
1991
  },
1992
  {
1993
  "epoch": 0.8528528528528528,
1994
+ "grad_norm": 2.978846311569214,
1995
+ "learning_rate": 1.1353431277390126e-05,
1996
+ "loss": 0.9791,
1997
  "step": 284
1998
  },
1999
  {
2000
  "epoch": 0.8558558558558559,
2001
+ "grad_norm": 3.253490447998047,
2002
+ "learning_rate": 1.0920772706797167e-05,
2003
+ "loss": 1.2799,
2004
  "step": 285
2005
  },
2006
  {
2007
  "epoch": 0.8588588588588588,
2008
+ "grad_norm": 3.5447134971618652,
2009
+ "learning_rate": 1.0496042516953209e-05,
2010
+ "loss": 0.9983,
2011
  "step": 286
2012
  },
2013
  {
2014
  "epoch": 0.8618618618618619,
2015
+ "grad_norm": 3.0113589763641357,
2016
+ "learning_rate": 1.0079278510416313e-05,
2017
+ "loss": 0.9575,
2018
  "step": 287
2019
  },
2020
  {
2021
  "epoch": 0.8648648648648649,
2022
+ "grad_norm": 2.7044694423675537,
2023
+ "learning_rate": 9.670517780724775e-06,
2024
+ "loss": 0.7598,
2025
  "step": 288
2026
  },
2027
  {
2028
  "epoch": 0.8678678678678678,
2029
+ "grad_norm": 3.9310684204101562,
2030
+ "learning_rate": 9.269796709095558e-06,
2031
+ "loss": 1.0002,
2032
  "step": 289
2033
  },
2034
  {
2035
  "epoch": 0.8708708708708709,
2036
+ "grad_norm": 2.882972478866577,
2037
+ "learning_rate": 8.87715096118642e-06,
2038
+ "loss": 1.4311,
2039
  "step": 290
2040
  },
2041
  {
2042
  "epoch": 0.8738738738738738,
2043
+ "grad_norm": 4.454821586608887,
2044
+ "learning_rate": 8.492615483921395e-06,
2045
+ "loss": 0.8625,
2046
  "step": 291
2047
  },
2048
  {
2049
  "epoch": 0.8768768768768769,
2050
+ "grad_norm": 2.4712319374084473,
2051
+ "learning_rate": 8.116224502380387e-06,
2052
+ "loss": 0.868,
2053
  "step": 292
2054
  },
2055
  {
2056
  "epoch": 0.8798798798798799,
2057
+ "grad_norm": 2.3259506225585938,
2058
+ "learning_rate": 7.74801151675314e-06,
2059
+ "loss": 1.21,
2060
  "step": 293
2061
  },
2062
  {
2063
  "epoch": 0.8828828828828829,
2064
+ "grad_norm": 2.29254412651062,
2065
+ "learning_rate": 7.3880092993574125e-06,
2066
+ "loss": 0.9161,
2067
  "step": 294
2068
  },
2069
  {
2070
  "epoch": 0.8858858858858859,
2071
+ "grad_norm": 3.6006367206573486,
2072
+ "learning_rate": 7.03624989172228e-06,
2073
+ "loss": 1.3222,
2074
  "step": 295
2075
  },
2076
  {
2077
  "epoch": 0.8888888888888888,
2078
+ "grad_norm": 2.694861888885498,
2079
+ "learning_rate": 6.692764601736268e-06,
2080
+ "loss": 0.8479,
2081
  "step": 296
2082
  },
2083
  {
2084
  "epoch": 0.8918918918918919,
2085
+ "grad_norm": 1.8544005155563354,
2086
+ "learning_rate": 6.357584000860761e-06,
2087
+ "loss": 0.7101,
2088
  "step": 297
2089
  },
2090
  {
2091
  "epoch": 0.8948948948948949,
2092
+ "grad_norm": 3.2788209915161133,
2093
+ "learning_rate": 6.030737921409169e-06,
2094
  "loss": 0.9239,
2095
  "step": 298
2096
  },
2097
  {
2098
  "epoch": 0.8978978978978979,
2099
+ "grad_norm": 2.618945598602295,
2100
+ "learning_rate": 5.71225545389158e-06,
2101
+ "loss": 0.827,
2102
  "step": 299
2103
  },
2104
  {
2105
  "epoch": 0.9009009009009009,
2106
+ "grad_norm": 2.999677896499634,
2107
+ "learning_rate": 5.402164944425758e-06,
2108
+ "loss": 1.1641,
2109
  "step": 300
2110
  },
2111
  {
2112
  "epoch": 0.9039039039039038,
2113
+ "grad_norm": 2.9957101345062256,
2114
+ "learning_rate": 5.100493992214128e-06,
2115
+ "loss": 1.1766,
2116
  "step": 301
2117
  },
2118
  {
2119
  "epoch": 0.9069069069069069,
2120
+ "grad_norm": 7.010153293609619,
2121
+ "learning_rate": 4.807269447087348e-06,
2122
+ "loss": 1.3179,
2123
  "step": 302
2124
  },
2125
  {
2126
  "epoch": 0.9099099099099099,
2127
+ "grad_norm": 2.962459087371826,
2128
+ "learning_rate": 4.5225174071146455e-06,
2129
+ "loss": 1.1249,
2130
  "step": 303
2131
  },
2132
  {
2133
  "epoch": 0.9129129129129129,
2134
+ "grad_norm": 3.1985106468200684,
2135
+ "learning_rate": 4.24626321628091e-06,
2136
+ "loss": 1.0897,
2137
  "step": 304
2138
  },
2139
  {
2140
  "epoch": 0.9159159159159159,
2141
+ "grad_norm": 3.808082103729248,
2142
+ "learning_rate": 3.9785314622310495e-06,
2143
+ "loss": 0.8197,
2144
  "step": 305
2145
  },
2146
  {
2147
  "epoch": 0.918918918918919,
2148
+ "grad_norm": 3.2726027965545654,
2149
+ "learning_rate": 3.7193459740815674e-06,
2150
+ "loss": 1.3593,
2151
  "step": 306
2152
  },
2153
  {
2154
  "epoch": 0.9219219219219219,
2155
+ "grad_norm": 2.5150132179260254,
2156
+ "learning_rate": 3.4687298202996655e-06,
2157
+ "loss": 1.1216,
2158
  "step": 307
2159
  },
2160
  {
2161
  "epoch": 0.924924924924925,
2162
+ "grad_norm": 2.964665174484253,
2163
+ "learning_rate": 3.226705306650113e-06,
2164
+ "loss": 0.9923,
2165
  "step": 308
2166
  },
2167
  {
2168
  "epoch": 0.9279279279279279,
2169
+ "grad_norm": 2.368405342102051,
2170
+ "learning_rate": 2.9932939742099208e-06,
2171
+ "loss": 1.2781,
2172
  "step": 309
2173
  },
2174
  {
2175
  "epoch": 0.9309309309309309,
2176
+ "grad_norm": 2.1340932846069336,
2177
+ "learning_rate": 2.7685165974510986e-06,
2178
+ "loss": 0.771,
2179
  "step": 310
2180
  },
2181
  {
2182
  "epoch": 0.933933933933934,
2183
+ "grad_norm": 3.647322654724121,
2184
+ "learning_rate": 2.552393182391677e-06,
2185
+ "loss": 1.2626,
2186
  "step": 311
2187
  },
2188
  {
2189
  "epoch": 0.9369369369369369,
2190
+ "grad_norm": 3.0686633586883545,
2191
+ "learning_rate": 2.3449429648150665e-06,
2192
+ "loss": 0.9542,
2193
  "step": 312
2194
  },
2195
  {
2196
  "epoch": 0.93993993993994,
2197
+ "grad_norm": 2.4045958518981934,
2198
+ "learning_rate": 2.1461844085580385e-06,
2199
+ "loss": 1.1157,
2200
  "step": 313
2201
  },
2202
  {
2203
  "epoch": 0.9429429429429429,
2204
+ "grad_norm": 3.171212673187256,
2205
+ "learning_rate": 1.9561352038673263e-06,
2206
+ "loss": 1.154,
2207
  "step": 314
2208
  },
2209
  {
2210
  "epoch": 0.9459459459459459,
2211
+ "grad_norm": 2.600048065185547,
2212
+ "learning_rate": 1.7748122658251876e-06,
2213
+ "loss": 0.985,
2214
  "step": 315
2215
  },
2216
  {
2217
  "epoch": 0.948948948948949,
2218
+ "grad_norm": 3.850813388824463,
2219
+ "learning_rate": 1.6022317328438506e-06,
2220
+ "loss": 0.7198,
2221
  "step": 316
2222
  },
2223
  {
2224
  "epoch": 0.9519519519519519,
2225
+ "grad_norm": 2.7421891689300537,
2226
+ "learning_rate": 1.4384089652291543e-06,
2227
+ "loss": 1.0198,
2228
  "step": 317
2229
  },
2230
  {
2231
  "epoch": 0.954954954954955,
2232
+ "grad_norm": 3.2563424110412598,
2233
+ "learning_rate": 1.2833585438134287e-06,
2234
+ "loss": 1.1602,
2235
  "step": 318
2236
  },
2237
  {
2238
  "epoch": 0.9579579579579579,
2239
+ "grad_norm": 2.593169689178467,
2240
+ "learning_rate": 1.1370942686577347e-06,
2241
+ "loss": 1.0172,
2242
  "step": 319
2243
  },
2244
  {
2245
  "epoch": 0.960960960960961,
2246
+ "grad_norm": 2.467902421951294,
2247
+ "learning_rate": 9.996291578236228e-07,
2248
+ "loss": 1.158,
2249
  "step": 320
2250
  },
2251
  {
2252
  "epoch": 0.963963963963964,
2253
+ "grad_norm": 3.1894989013671875,
2254
+ "learning_rate": 8.709754462144615e-07,
2255
+ "loss": 1.366,
2256
  "step": 321
2257
  },
2258
  {
2259
  "epoch": 0.9669669669669669,
2260
+ "grad_norm": 2.78214955329895,
2261
+ "learning_rate": 7.511445844864962e-07,
2262
+ "loss": 1.19,
2263
  "step": 322
2264
  },
2265
  {
2266
  "epoch": 0.96996996996997,
2267
+ "grad_norm": 2.4907963275909424,
2268
+ "learning_rate": 6.401472380297091e-07,
2269
+ "loss": 0.645,
2270
  "step": 323
2271
  },
2272
  {
2273
  "epoch": 0.972972972972973,
2274
+ "grad_norm": 2.79376482963562,
2275
+ "learning_rate": 5.379932860185122e-07,
2276
+ "loss": 1.2244,
2277
  "step": 324
2278
  },
2279
  {
2280
  "epoch": 0.975975975975976,
2281
+ "grad_norm": 2.3573567867279053,
2282
+ "learning_rate": 4.44691820532539e-07,
2283
+ "loss": 0.8455,
2284
  "step": 325
2285
  },
2286
  {
2287
  "epoch": 0.978978978978979,
2288
+ "grad_norm": 2.453552484512329,
2289
+ "learning_rate": 3.6025114574734785e-07,
2290
+ "loss": 1.0234,
2291
  "step": 326
2292
  },
2293
  {
2294
  "epoch": 0.9819819819819819,
2295
+ "grad_norm": 3.1703238487243652,
2296
+ "learning_rate": 2.846787771953574e-07,
2297
+ "loss": 1.2226,
2298
  "step": 327
2299
  },
2300
  {
2301
  "epoch": 0.984984984984985,
2302
+ "grad_norm": 3.3608462810516357,
2303
+ "learning_rate": 2.179814410969261e-07,
2304
+ "loss": 1.0183,
2305
  "step": 328
2306
  },
2307
  {
2308
  "epoch": 0.987987987987988,
2309
+ "grad_norm": 4.895633220672607,
2310
+ "learning_rate": 1.6016507376169777e-07,
2311
+ "loss": 1.4253,
2312
  "step": 329
2313
  },
2314
  {
2315
  "epoch": 0.990990990990991,
2316
+ "grad_norm": 3.5701801776885986,
2317
+ "learning_rate": 1.1123482106021322e-07,
2318
+ "loss": 1.3085,
2319
  "step": 330
2320
  },
2321
  {
2322
  "epoch": 0.993993993993994,
2323
+ "grad_norm": 5.8992815017700195,
2324
+ "learning_rate": 7.119503796599868e-08,
2325
+ "loss": 1.6318,
2326
  "step": 331
2327
  },
2328
  {
2329
  "epoch": 0.996996996996997,
2330
+ "grad_norm": 2.636950969696045,
2331
+ "learning_rate": 4.0049288167842705e-08,
2332
+ "loss": 0.8327,
2333
  "step": 332
2334
  },
2335
  {
2336
  "epoch": 1.0,
2337
+ "grad_norm": 2.5497934818267822,
2338
+ "learning_rate": 1.7800343752683023e-08,
2339
+ "loss": 0.8514,
2340
  "step": 333
2341
  },
2342
  {
2343
  "epoch": 1.0,
2344
+ "eval_loss": 1.0642523765563965,
2345
+ "eval_runtime": 7.6037,
2346
+ "eval_samples_per_second": 9.338,
2347
+ "eval_steps_per_second": 9.338,
2348
  "step": 333
2349
  }
2350
  ],
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd5b5e9829e2746865387002c1cb7d44db56f890fcf35267b4f105596244cfe5
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d90574b685ffd93795646be8586cc2b85e736d6506f44768b8536997b506b439
3
  size 6776