farmery commited on
Commit
d577fe9
·
verified ·
1 Parent(s): b061139

Training in progress, step 717, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c4997e18d6700e84a7a1858b10bd3fa2f40ad1b5a50ac23a01d6313cf7a86bf
3
  size 2145944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e464bbe95cc23854803b672fb9cf729efc290b58ff545522f92b64c7016eecd
3
  size 2145944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0573ad3f43148d339cbbccad27331089947e54f860e5c0cf8a0921bc15ef729
3
  size 4310020
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:822935ab8fc15c5f806a5134daaf8d3b53f885d21a92bd3e33b1903ae914170d
3
  size 4310020
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e332756dd15a82d20ce17b777006dd2414bdb046adc04f889ee5f59016e85d6b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e63ec6d4f8a228ccb62beccd67d6f6e9016931e6b30ae5344fdba8c30cfe35a8
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b49661ddb4fae0f6df31a87bb09d4b90662fde4fd5d2cb5af8ca4dcf708abd4
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33825bed89719638e2e37f21656a413d50d0dc8cf99d86b4d7152f50e5bbcd6e
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05b1ece00ef562283a959bc5a4e7ade2e3c2b36df26d9fb285587bf971ab91f4
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f3682dfb503773bdb3a7d4868d8abf3b6eed45d692e22e2299624c46632a667
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3e533310474c2318451aa098b68802354c32a1c88b8e865d3b0f3ad75e1d1cc
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cb07e5da45d7c5643fe3179d2cfab1712f94a8c62afddc66fee2e58ea42f700
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4b14aaf5ba6519837f768b68712c4b778ccd3447e8099bc8e1769acc4d8955d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f024d2f6fb6610551c472834de25d1d904c6aa9a110ea61cc065fb2a17fa713f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5733133433283358,
5
  "eval_steps": 500,
6
- "global_step": 478,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3353,6 +3353,1679 @@
3353
  "learning_rate": 4.7536583265546775e-05,
3354
  "loss": 9.609,
3355
  "step": 478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3356
  }
3357
  ],
3358
  "logging_steps": 1,
@@ -3372,7 +5045,7 @@
3372
  "attributes": {}
3373
  }
3374
  },
3375
- "total_flos": 396899546824704.0,
3376
  "train_batch_size": 4,
3377
  "trial_name": null,
3378
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8599700149925037,
5
  "eval_steps": 500,
6
+ "global_step": 717,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3353
  "learning_rate": 4.7536583265546775e-05,
3354
  "loss": 9.609,
3355
  "step": 478
3356
+ },
3357
+ {
3358
+ "epoch": 0.5745127436281859,
3359
+ "grad_norm": 0.2816186845302582,
3360
+ "learning_rate": 4.7322569886417006e-05,
3361
+ "loss": 9.6101,
3362
+ "step": 479
3363
+ },
3364
+ {
3365
+ "epoch": 0.5757121439280359,
3366
+ "grad_norm": 0.2793320417404175,
3367
+ "learning_rate": 4.71086056896371e-05,
3368
+ "loss": 9.6206,
3369
+ "step": 480
3370
+ },
3371
+ {
3372
+ "epoch": 0.576911544227886,
3373
+ "grad_norm": 0.2865123748779297,
3374
+ "learning_rate": 4.689469460556626e-05,
3375
+ "loss": 9.6109,
3376
+ "step": 481
3377
+ },
3378
+ {
3379
+ "epoch": 0.5781109445277361,
3380
+ "grad_norm": 0.2744526267051697,
3381
+ "learning_rate": 4.6680840563587966e-05,
3382
+ "loss": 9.6222,
3383
+ "step": 482
3384
+ },
3385
+ {
3386
+ "epoch": 0.5793103448275863,
3387
+ "grad_norm": 0.30048128962516785,
3388
+ "learning_rate": 4.646704749203793e-05,
3389
+ "loss": 9.6182,
3390
+ "step": 483
3391
+ },
3392
+ {
3393
+ "epoch": 0.5805097451274362,
3394
+ "grad_norm": 0.29160621762275696,
3395
+ "learning_rate": 4.6253319318131926e-05,
3396
+ "loss": 9.618,
3397
+ "step": 484
3398
+ },
3399
+ {
3400
+ "epoch": 0.5817091454272864,
3401
+ "grad_norm": 0.31267857551574707,
3402
+ "learning_rate": 4.60396599678935e-05,
3403
+ "loss": 9.622,
3404
+ "step": 485
3405
+ },
3406
+ {
3407
+ "epoch": 0.5829085457271365,
3408
+ "grad_norm": 0.3598839044570923,
3409
+ "learning_rate": 4.582607336608205e-05,
3410
+ "loss": 9.6176,
3411
+ "step": 486
3412
+ },
3413
+ {
3414
+ "epoch": 0.5841079460269865,
3415
+ "grad_norm": 0.33458805084228516,
3416
+ "learning_rate": 4.561256343612061e-05,
3417
+ "loss": 9.6256,
3418
+ "step": 487
3419
+ },
3420
+ {
3421
+ "epoch": 0.5853073463268366,
3422
+ "grad_norm": 0.27461162209510803,
3423
+ "learning_rate": 4.539913410002378e-05,
3424
+ "loss": 9.6119,
3425
+ "step": 488
3426
+ },
3427
+ {
3428
+ "epoch": 0.5865067466266867,
3429
+ "grad_norm": 0.2723887264728546,
3430
+ "learning_rate": 4.518578927832577e-05,
3431
+ "loss": 9.6056,
3432
+ "step": 489
3433
+ },
3434
+ {
3435
+ "epoch": 0.5877061469265368,
3436
+ "grad_norm": 0.2768537998199463,
3437
+ "learning_rate": 4.4972532890008313e-05,
3438
+ "loss": 9.6079,
3439
+ "step": 490
3440
+ },
3441
+ {
3442
+ "epoch": 0.5889055472263868,
3443
+ "grad_norm": 0.2774599492549896,
3444
+ "learning_rate": 4.4759368852428625e-05,
3445
+ "loss": 9.6092,
3446
+ "step": 491
3447
+ },
3448
+ {
3449
+ "epoch": 0.5901049475262369,
3450
+ "grad_norm": 0.27346640825271606,
3451
+ "learning_rate": 4.45463010812476e-05,
3452
+ "loss": 9.6143,
3453
+ "step": 492
3454
+ },
3455
+ {
3456
+ "epoch": 0.591304347826087,
3457
+ "grad_norm": 0.2797171175479889,
3458
+ "learning_rate": 4.433333349035773e-05,
3459
+ "loss": 9.6168,
3460
+ "step": 493
3461
+ },
3462
+ {
3463
+ "epoch": 0.592503748125937,
3464
+ "grad_norm": 0.2800818085670471,
3465
+ "learning_rate": 4.4120469991811296e-05,
3466
+ "loss": 9.6165,
3467
+ "step": 494
3468
+ },
3469
+ {
3470
+ "epoch": 0.5937031484257871,
3471
+ "grad_norm": 0.280519038438797,
3472
+ "learning_rate": 4.390771449574846e-05,
3473
+ "loss": 9.6195,
3474
+ "step": 495
3475
+ },
3476
+ {
3477
+ "epoch": 0.5949025487256372,
3478
+ "grad_norm": 0.2884778678417206,
3479
+ "learning_rate": 4.369507091032551e-05,
3480
+ "loss": 9.6132,
3481
+ "step": 496
3482
+ },
3483
+ {
3484
+ "epoch": 0.5961019490254873,
3485
+ "grad_norm": 0.2894138693809509,
3486
+ "learning_rate": 4.3482543141642943e-05,
3487
+ "loss": 9.6147,
3488
+ "step": 497
3489
+ },
3490
+ {
3491
+ "epoch": 0.5973013493253373,
3492
+ "grad_norm": 0.2868705093860626,
3493
+ "learning_rate": 4.327013509367386e-05,
3494
+ "loss": 9.6242,
3495
+ "step": 498
3496
+ },
3497
+ {
3498
+ "epoch": 0.5985007496251874,
3499
+ "grad_norm": 0.2994021773338318,
3500
+ "learning_rate": 4.305785066819218e-05,
3501
+ "loss": 9.6189,
3502
+ "step": 499
3503
+ },
3504
+ {
3505
+ "epoch": 0.5997001499250375,
3506
+ "grad_norm": 0.3168644607067108,
3507
+ "learning_rate": 4.2845693764700914e-05,
3508
+ "loss": 9.6247,
3509
+ "step": 500
3510
+ },
3511
+ {
3512
+ "epoch": 0.6008995502248875,
3513
+ "grad_norm": 0.26666632294654846,
3514
+ "learning_rate": 4.263366828036065e-05,
3515
+ "loss": 9.6057,
3516
+ "step": 501
3517
+ },
3518
+ {
3519
+ "epoch": 0.6020989505247376,
3520
+ "grad_norm": 0.26327091455459595,
3521
+ "learning_rate": 4.242177810991789e-05,
3522
+ "loss": 9.6115,
3523
+ "step": 502
3524
+ },
3525
+ {
3526
+ "epoch": 0.6032983508245877,
3527
+ "grad_norm": 0.27538183331489563,
3528
+ "learning_rate": 4.221002714563347e-05,
3529
+ "loss": 9.6082,
3530
+ "step": 503
3531
+ },
3532
+ {
3533
+ "epoch": 0.6044977511244378,
3534
+ "grad_norm": 0.27597832679748535,
3535
+ "learning_rate": 4.19984192772112e-05,
3536
+ "loss": 9.6075,
3537
+ "step": 504
3538
+ },
3539
+ {
3540
+ "epoch": 0.6056971514242878,
3541
+ "grad_norm": 0.28365880250930786,
3542
+ "learning_rate": 4.1786958391726314e-05,
3543
+ "loss": 9.6136,
3544
+ "step": 505
3545
+ },
3546
+ {
3547
+ "epoch": 0.6068965517241379,
3548
+ "grad_norm": 0.2802659273147583,
3549
+ "learning_rate": 4.1575648373554e-05,
3550
+ "loss": 9.6158,
3551
+ "step": 506
3552
+ },
3553
+ {
3554
+ "epoch": 0.608095952023988,
3555
+ "grad_norm": 0.2841864228248596,
3556
+ "learning_rate": 4.136449310429822e-05,
3557
+ "loss": 9.6115,
3558
+ "step": 507
3559
+ },
3560
+ {
3561
+ "epoch": 0.6092953523238381,
3562
+ "grad_norm": 0.2928536832332611,
3563
+ "learning_rate": 4.115349646272029e-05,
3564
+ "loss": 9.6156,
3565
+ "step": 508
3566
+ },
3567
+ {
3568
+ "epoch": 0.6104947526236881,
3569
+ "grad_norm": 0.2854699492454529,
3570
+ "learning_rate": 4.0942662324667627e-05,
3571
+ "loss": 9.6137,
3572
+ "step": 509
3573
+ },
3574
+ {
3575
+ "epoch": 0.6116941529235382,
3576
+ "grad_norm": 0.29192522168159485,
3577
+ "learning_rate": 4.0731994563002606e-05,
3578
+ "loss": 9.6136,
3579
+ "step": 510
3580
+ },
3581
+ {
3582
+ "epoch": 0.6128935532233883,
3583
+ "grad_norm": 0.3441016674041748,
3584
+ "learning_rate": 4.052149704753142e-05,
3585
+ "loss": 9.6224,
3586
+ "step": 511
3587
+ },
3588
+ {
3589
+ "epoch": 0.6140929535232383,
3590
+ "grad_norm": 0.3597991466522217,
3591
+ "learning_rate": 4.03111736449329e-05,
3592
+ "loss": 9.6219,
3593
+ "step": 512
3594
+ },
3595
+ {
3596
+ "epoch": 0.6152923538230884,
3597
+ "grad_norm": 0.2781412899494171,
3598
+ "learning_rate": 4.010102821868762e-05,
3599
+ "loss": 9.6056,
3600
+ "step": 513
3601
+ },
3602
+ {
3603
+ "epoch": 0.6164917541229386,
3604
+ "grad_norm": 0.27280357480049133,
3605
+ "learning_rate": 3.989106462900686e-05,
3606
+ "loss": 9.6063,
3607
+ "step": 514
3608
+ },
3609
+ {
3610
+ "epoch": 0.6176911544227887,
3611
+ "grad_norm": 0.27366748452186584,
3612
+ "learning_rate": 3.968128673276165e-05,
3613
+ "loss": 9.6104,
3614
+ "step": 515
3615
+ },
3616
+ {
3617
+ "epoch": 0.6188905547226387,
3618
+ "grad_norm": 0.27588704228401184,
3619
+ "learning_rate": 3.947169838341202e-05,
3620
+ "loss": 9.605,
3621
+ "step": 516
3622
+ },
3623
+ {
3624
+ "epoch": 0.6200899550224888,
3625
+ "grad_norm": 0.27753859758377075,
3626
+ "learning_rate": 3.9262303430936164e-05,
3627
+ "loss": 9.6033,
3628
+ "step": 517
3629
+ },
3630
+ {
3631
+ "epoch": 0.6212893553223389,
3632
+ "grad_norm": 0.27255064249038696,
3633
+ "learning_rate": 3.9053105721759696e-05,
3634
+ "loss": 9.6098,
3635
+ "step": 518
3636
+ },
3637
+ {
3638
+ "epoch": 0.6224887556221889,
3639
+ "grad_norm": 0.2782951295375824,
3640
+ "learning_rate": 3.8844109098685045e-05,
3641
+ "loss": 9.6184,
3642
+ "step": 519
3643
+ },
3644
+ {
3645
+ "epoch": 0.623688155922039,
3646
+ "grad_norm": 0.28660768270492554,
3647
+ "learning_rate": 3.8635317400820855e-05,
3648
+ "loss": 9.6113,
3649
+ "step": 520
3650
+ },
3651
+ {
3652
+ "epoch": 0.6248875562218891,
3653
+ "grad_norm": 0.28494128584861755,
3654
+ "learning_rate": 3.842673446351138e-05,
3655
+ "loss": 9.6105,
3656
+ "step": 521
3657
+ },
3658
+ {
3659
+ "epoch": 0.6260869565217392,
3660
+ "grad_norm": 0.28198301792144775,
3661
+ "learning_rate": 3.82183641182662e-05,
3662
+ "loss": 9.626,
3663
+ "step": 522
3664
+ },
3665
+ {
3666
+ "epoch": 0.6272863568215892,
3667
+ "grad_norm": 0.2875995337963104,
3668
+ "learning_rate": 3.801021019268969e-05,
3669
+ "loss": 9.6176,
3670
+ "step": 523
3671
+ },
3672
+ {
3673
+ "epoch": 0.6284857571214393,
3674
+ "grad_norm": 0.2956449091434479,
3675
+ "learning_rate": 3.780227651041073e-05,
3676
+ "loss": 9.6229,
3677
+ "step": 524
3678
+ },
3679
+ {
3680
+ "epoch": 0.6296851574212894,
3681
+ "grad_norm": 0.37847524881362915,
3682
+ "learning_rate": 3.7594566891012546e-05,
3683
+ "loss": 9.6214,
3684
+ "step": 525
3685
+ },
3686
+ {
3687
+ "epoch": 0.6308845577211394,
3688
+ "grad_norm": 0.27030348777770996,
3689
+ "learning_rate": 3.7387085149962507e-05,
3690
+ "loss": 9.6011,
3691
+ "step": 526
3692
+ },
3693
+ {
3694
+ "epoch": 0.6320839580209895,
3695
+ "grad_norm": 0.274962455034256,
3696
+ "learning_rate": 3.717983509854198e-05,
3697
+ "loss": 9.6023,
3698
+ "step": 527
3699
+ },
3700
+ {
3701
+ "epoch": 0.6332833583208396,
3702
+ "grad_norm": 0.27726373076438904,
3703
+ "learning_rate": 3.69728205437764e-05,
3704
+ "loss": 9.6102,
3705
+ "step": 528
3706
+ },
3707
+ {
3708
+ "epoch": 0.6344827586206897,
3709
+ "grad_norm": 0.27569401264190674,
3710
+ "learning_rate": 3.676604528836535e-05,
3711
+ "loss": 9.6077,
3712
+ "step": 529
3713
+ },
3714
+ {
3715
+ "epoch": 0.6356821589205397,
3716
+ "grad_norm": 0.2719118893146515,
3717
+ "learning_rate": 3.6559513130612565e-05,
3718
+ "loss": 9.6078,
3719
+ "step": 530
3720
+ },
3721
+ {
3722
+ "epoch": 0.6368815592203898,
3723
+ "grad_norm": 0.27930060029029846,
3724
+ "learning_rate": 3.635322786435635e-05,
3725
+ "loss": 9.6099,
3726
+ "step": 531
3727
+ },
3728
+ {
3729
+ "epoch": 0.6380809595202399,
3730
+ "grad_norm": 0.2761722505092621,
3731
+ "learning_rate": 3.614719327889978e-05,
3732
+ "loss": 9.6161,
3733
+ "step": 532
3734
+ },
3735
+ {
3736
+ "epoch": 0.6392803598200899,
3737
+ "grad_norm": 0.2825543284416199,
3738
+ "learning_rate": 3.594141315894108e-05,
3739
+ "loss": 9.616,
3740
+ "step": 533
3741
+ },
3742
+ {
3743
+ "epoch": 0.64047976011994,
3744
+ "grad_norm": 0.28519946336746216,
3745
+ "learning_rate": 3.573589128450418e-05,
3746
+ "loss": 9.6134,
3747
+ "step": 534
3748
+ },
3749
+ {
3750
+ "epoch": 0.6416791604197901,
3751
+ "grad_norm": 0.2859567105770111,
3752
+ "learning_rate": 3.5530631430869234e-05,
3753
+ "loss": 9.6181,
3754
+ "step": 535
3755
+ },
3756
+ {
3757
+ "epoch": 0.6428785607196402,
3758
+ "grad_norm": 0.293560653924942,
3759
+ "learning_rate": 3.532563736850322e-05,
3760
+ "loss": 9.6141,
3761
+ "step": 536
3762
+ },
3763
+ {
3764
+ "epoch": 0.6440779610194902,
3765
+ "grad_norm": 0.31543228030204773,
3766
+ "learning_rate": 3.512091286299081e-05,
3767
+ "loss": 9.6132,
3768
+ "step": 537
3769
+ },
3770
+ {
3771
+ "epoch": 0.6452773613193403,
3772
+ "grad_norm": 0.28361520171165466,
3773
+ "learning_rate": 3.491646167496507e-05,
3774
+ "loss": 9.5993,
3775
+ "step": 538
3776
+ },
3777
+ {
3778
+ "epoch": 0.6464767616191904,
3779
+ "grad_norm": 0.2670563757419586,
3780
+ "learning_rate": 3.4712287560038446e-05,
3781
+ "loss": 9.6042,
3782
+ "step": 539
3783
+ },
3784
+ {
3785
+ "epoch": 0.6476761619190404,
3786
+ "grad_norm": 0.2657446265220642,
3787
+ "learning_rate": 3.450839426873378e-05,
3788
+ "loss": 9.6106,
3789
+ "step": 540
3790
+ },
3791
+ {
3792
+ "epoch": 0.6488755622188905,
3793
+ "grad_norm": 0.271816611289978,
3794
+ "learning_rate": 3.4304785546415374e-05,
3795
+ "loss": 9.608,
3796
+ "step": 541
3797
+ },
3798
+ {
3799
+ "epoch": 0.6500749625187406,
3800
+ "grad_norm": 0.27191296219825745,
3801
+ "learning_rate": 3.41014651332202e-05,
3802
+ "loss": 9.6103,
3803
+ "step": 542
3804
+ },
3805
+ {
3806
+ "epoch": 0.6512743628185907,
3807
+ "grad_norm": 0.27644070982933044,
3808
+ "learning_rate": 3.3898436763989247e-05,
3809
+ "loss": 9.6039,
3810
+ "step": 543
3811
+ },
3812
+ {
3813
+ "epoch": 0.6524737631184407,
3814
+ "grad_norm": 0.27742430567741394,
3815
+ "learning_rate": 3.369570416819889e-05,
3816
+ "loss": 9.6053,
3817
+ "step": 544
3818
+ },
3819
+ {
3820
+ "epoch": 0.6536731634182908,
3821
+ "grad_norm": 0.2793113589286804,
3822
+ "learning_rate": 3.349327106989232e-05,
3823
+ "loss": 9.615,
3824
+ "step": 545
3825
+ },
3826
+ {
3827
+ "epoch": 0.654872563718141,
3828
+ "grad_norm": 0.28077057003974915,
3829
+ "learning_rate": 3.329114118761123e-05,
3830
+ "loss": 9.6101,
3831
+ "step": 546
3832
+ },
3833
+ {
3834
+ "epoch": 0.656071964017991,
3835
+ "grad_norm": 0.2894865870475769,
3836
+ "learning_rate": 3.308931823432744e-05,
3837
+ "loss": 9.6093,
3838
+ "step": 547
3839
+ },
3840
+ {
3841
+ "epoch": 0.6572713643178411,
3842
+ "grad_norm": 0.2894723415374756,
3843
+ "learning_rate": 3.288780591737474e-05,
3844
+ "loss": 9.6141,
3845
+ "step": 548
3846
+ },
3847
+ {
3848
+ "epoch": 0.6584707646176912,
3849
+ "grad_norm": 0.3010658323764801,
3850
+ "learning_rate": 3.268660793838074e-05,
3851
+ "loss": 9.6249,
3852
+ "step": 549
3853
+ },
3854
+ {
3855
+ "epoch": 0.6596701649175413,
3856
+ "grad_norm": 0.3542385399341583,
3857
+ "learning_rate": 3.2485727993198945e-05,
3858
+ "loss": 9.6182,
3859
+ "step": 550
3860
+ },
3861
+ {
3862
+ "epoch": 0.6608695652173913,
3863
+ "grad_norm": 0.2821604907512665,
3864
+ "learning_rate": 3.228516977184075e-05,
3865
+ "loss": 9.6229,
3866
+ "step": 551
3867
+ },
3868
+ {
3869
+ "epoch": 0.6620689655172414,
3870
+ "grad_norm": 0.27113085985183716,
3871
+ "learning_rate": 3.2084936958407805e-05,
3872
+ "loss": 9.6041,
3873
+ "step": 552
3874
+ },
3875
+ {
3876
+ "epoch": 0.6632683658170915,
3877
+ "grad_norm": 0.26982516050338745,
3878
+ "learning_rate": 3.188503323102425e-05,
3879
+ "loss": 9.6084,
3880
+ "step": 553
3881
+ },
3882
+ {
3883
+ "epoch": 0.6644677661169416,
3884
+ "grad_norm": 0.2756569981575012,
3885
+ "learning_rate": 3.1685462261769105e-05,
3886
+ "loss": 9.6126,
3887
+ "step": 554
3888
+ },
3889
+ {
3890
+ "epoch": 0.6656671664167916,
3891
+ "grad_norm": 0.27629488706588745,
3892
+ "learning_rate": 3.1486227716608946e-05,
3893
+ "loss": 9.6056,
3894
+ "step": 555
3895
+ },
3896
+ {
3897
+ "epoch": 0.6668665667166417,
3898
+ "grad_norm": 0.28036460280418396,
3899
+ "learning_rate": 3.128733325533047e-05,
3900
+ "loss": 9.6054,
3901
+ "step": 556
3902
+ },
3903
+ {
3904
+ "epoch": 0.6680659670164918,
3905
+ "grad_norm": 0.27844056487083435,
3906
+ "learning_rate": 3.1088782531473266e-05,
3907
+ "loss": 9.6111,
3908
+ "step": 557
3909
+ },
3910
+ {
3911
+ "epoch": 0.6692653673163418,
3912
+ "grad_norm": 0.2862386405467987,
3913
+ "learning_rate": 3.089057919226277e-05,
3914
+ "loss": 9.612,
3915
+ "step": 558
3916
+ },
3917
+ {
3918
+ "epoch": 0.6704647676161919,
3919
+ "grad_norm": 0.2859496474266052,
3920
+ "learning_rate": 3.069272687854322e-05,
3921
+ "loss": 9.6114,
3922
+ "step": 559
3923
+ },
3924
+ {
3925
+ "epoch": 0.671664167916042,
3926
+ "grad_norm": 0.28554123640060425,
3927
+ "learning_rate": 3.049522922471075e-05,
3928
+ "loss": 9.6105,
3929
+ "step": 560
3930
+ },
3931
+ {
3932
+ "epoch": 0.6728635682158921,
3933
+ "grad_norm": 0.30089861154556274,
3934
+ "learning_rate": 3.02980898586467e-05,
3935
+ "loss": 9.6205,
3936
+ "step": 561
3937
+ },
3938
+ {
3939
+ "epoch": 0.6740629685157421,
3940
+ "grad_norm": 0.30331140756607056,
3941
+ "learning_rate": 3.0101312401650937e-05,
3942
+ "loss": 9.6158,
3943
+ "step": 562
3944
+ },
3945
+ {
3946
+ "epoch": 0.6752623688155922,
3947
+ "grad_norm": 0.2732248902320862,
3948
+ "learning_rate": 2.9904900468375297e-05,
3949
+ "loss": 9.6064,
3950
+ "step": 563
3951
+ },
3952
+ {
3953
+ "epoch": 0.6764617691154423,
3954
+ "grad_norm": 0.27510005235671997,
3955
+ "learning_rate": 2.9708857666757246e-05,
3956
+ "loss": 9.6019,
3957
+ "step": 564
3958
+ },
3959
+ {
3960
+ "epoch": 0.6776611694152923,
3961
+ "grad_norm": 0.27365824580192566,
3962
+ "learning_rate": 2.9513187597953607e-05,
3963
+ "loss": 9.5995,
3964
+ "step": 565
3965
+ },
3966
+ {
3967
+ "epoch": 0.6788605697151424,
3968
+ "grad_norm": 0.2792357802391052,
3969
+ "learning_rate": 2.931789385627433e-05,
3970
+ "loss": 9.606,
3971
+ "step": 566
3972
+ },
3973
+ {
3974
+ "epoch": 0.6800599700149925,
3975
+ "grad_norm": 0.2759556770324707,
3976
+ "learning_rate": 2.9122980029116586e-05,
3977
+ "loss": 9.6039,
3978
+ "step": 567
3979
+ },
3980
+ {
3981
+ "epoch": 0.6812593703148426,
3982
+ "grad_norm": 0.2814030647277832,
3983
+ "learning_rate": 2.8928449696898763e-05,
3984
+ "loss": 9.602,
3985
+ "step": 568
3986
+ },
3987
+ {
3988
+ "epoch": 0.6824587706146926,
3989
+ "grad_norm": 0.2769099771976471,
3990
+ "learning_rate": 2.8734306432994735e-05,
3991
+ "loss": 9.6079,
3992
+ "step": 569
3993
+ },
3994
+ {
3995
+ "epoch": 0.6836581709145427,
3996
+ "grad_norm": 0.2809275686740875,
3997
+ "learning_rate": 2.8540553803668252e-05,
3998
+ "loss": 9.613,
3999
+ "step": 570
4000
+ },
4001
+ {
4002
+ "epoch": 0.6848575712143928,
4003
+ "grad_norm": 0.275016725063324,
4004
+ "learning_rate": 2.8347195368007418e-05,
4005
+ "loss": 9.6097,
4006
+ "step": 571
4007
+ },
4008
+ {
4009
+ "epoch": 0.6860569715142428,
4010
+ "grad_norm": 0.2964610755443573,
4011
+ "learning_rate": 2.815423467785925e-05,
4012
+ "loss": 9.6111,
4013
+ "step": 572
4014
+ },
4015
+ {
4016
+ "epoch": 0.6872563718140929,
4017
+ "grad_norm": 0.2884480059146881,
4018
+ "learning_rate": 2.7961675277764498e-05,
4019
+ "loss": 9.6089,
4020
+ "step": 573
4021
+ },
4022
+ {
4023
+ "epoch": 0.688455772113943,
4024
+ "grad_norm": 0.30310893058776855,
4025
+ "learning_rate": 2.7769520704892566e-05,
4026
+ "loss": 9.6102,
4027
+ "step": 574
4028
+ },
4029
+ {
4030
+ "epoch": 0.6896551724137931,
4031
+ "grad_norm": 0.4733683466911316,
4032
+ "learning_rate": 2.757777448897646e-05,
4033
+ "loss": 9.6083,
4034
+ "step": 575
4035
+ },
4036
+ {
4037
+ "epoch": 0.6908545727136431,
4038
+ "grad_norm": 0.272512823343277,
4039
+ "learning_rate": 2.7386440152247933e-05,
4040
+ "loss": 9.5963,
4041
+ "step": 576
4042
+ },
4043
+ {
4044
+ "epoch": 0.6920539730134933,
4045
+ "grad_norm": 0.2810138165950775,
4046
+ "learning_rate": 2.71955212093729e-05,
4047
+ "loss": 9.6012,
4048
+ "step": 577
4049
+ },
4050
+ {
4051
+ "epoch": 0.6932533733133434,
4052
+ "grad_norm": 0.2755623161792755,
4053
+ "learning_rate": 2.7005021167386803e-05,
4054
+ "loss": 9.6022,
4055
+ "step": 578
4056
+ },
4057
+ {
4058
+ "epoch": 0.6944527736131934,
4059
+ "grad_norm": 0.2718299329280853,
4060
+ "learning_rate": 2.681494352563013e-05,
4061
+ "loss": 9.6096,
4062
+ "step": 579
4063
+ },
4064
+ {
4065
+ "epoch": 0.6956521739130435,
4066
+ "grad_norm": 0.2746315896511078,
4067
+ "learning_rate": 2.6625291775684292e-05,
4068
+ "loss": 9.6124,
4069
+ "step": 580
4070
+ },
4071
+ {
4072
+ "epoch": 0.6968515742128936,
4073
+ "grad_norm": 0.2844776511192322,
4074
+ "learning_rate": 2.6436069401307284e-05,
4075
+ "loss": 9.6054,
4076
+ "step": 581
4077
+ },
4078
+ {
4079
+ "epoch": 0.6980509745127437,
4080
+ "grad_norm": 0.2785060703754425,
4081
+ "learning_rate": 2.624727987836991e-05,
4082
+ "loss": 9.6112,
4083
+ "step": 582
4084
+ },
4085
+ {
4086
+ "epoch": 0.6992503748125937,
4087
+ "grad_norm": 0.2840147316455841,
4088
+ "learning_rate": 2.6058926674791728e-05,
4089
+ "loss": 9.6061,
4090
+ "step": 583
4091
+ },
4092
+ {
4093
+ "epoch": 0.7004497751124438,
4094
+ "grad_norm": 0.28523436188697815,
4095
+ "learning_rate": 2.5871013250477528e-05,
4096
+ "loss": 9.6057,
4097
+ "step": 584
4098
+ },
4099
+ {
4100
+ "epoch": 0.7016491754122939,
4101
+ "grad_norm": 0.29284006357192993,
4102
+ "learning_rate": 2.56835430572536e-05,
4103
+ "loss": 9.6091,
4104
+ "step": 585
4105
+ },
4106
+ {
4107
+ "epoch": 0.7028485757121439,
4108
+ "grad_norm": 0.29574641585350037,
4109
+ "learning_rate": 2.5496519538804486e-05,
4110
+ "loss": 9.6155,
4111
+ "step": 586
4112
+ },
4113
+ {
4114
+ "epoch": 0.704047976011994,
4115
+ "grad_norm": 0.3032572269439697,
4116
+ "learning_rate": 2.530994613060965e-05,
4117
+ "loss": 9.6162,
4118
+ "step": 587
4119
+ },
4120
+ {
4121
+ "epoch": 0.7052473763118441,
4122
+ "grad_norm": 0.2718828320503235,
4123
+ "learning_rate": 2.5123826259880323e-05,
4124
+ "loss": 9.6001,
4125
+ "step": 588
4126
+ },
4127
+ {
4128
+ "epoch": 0.7064467766116942,
4129
+ "grad_norm": 0.27074381709098816,
4130
+ "learning_rate": 2.493816334549664e-05,
4131
+ "loss": 9.6014,
4132
+ "step": 589
4133
+ },
4134
+ {
4135
+ "epoch": 0.7076461769115442,
4136
+ "grad_norm": 0.2791549265384674,
4137
+ "learning_rate": 2.4752960797944802e-05,
4138
+ "loss": 9.5998,
4139
+ "step": 590
4140
+ },
4141
+ {
4142
+ "epoch": 0.7088455772113943,
4143
+ "grad_norm": 0.28340011835098267,
4144
+ "learning_rate": 2.4568222019254377e-05,
4145
+ "loss": 9.5979,
4146
+ "step": 591
4147
+ },
4148
+ {
4149
+ "epoch": 0.7100449775112444,
4150
+ "grad_norm": 0.2762751579284668,
4151
+ "learning_rate": 2.43839504029359e-05,
4152
+ "loss": 9.6032,
4153
+ "step": 592
4154
+ },
4155
+ {
4156
+ "epoch": 0.7112443778110945,
4157
+ "grad_norm": 0.2753763198852539,
4158
+ "learning_rate": 2.4200149333918487e-05,
4159
+ "loss": 9.6089,
4160
+ "step": 593
4161
+ },
4162
+ {
4163
+ "epoch": 0.7124437781109445,
4164
+ "grad_norm": 0.27482444047927856,
4165
+ "learning_rate": 2.4016822188487603e-05,
4166
+ "loss": 9.6081,
4167
+ "step": 594
4168
+ },
4169
+ {
4170
+ "epoch": 0.7136431784107946,
4171
+ "grad_norm": 0.28210797905921936,
4172
+ "learning_rate": 2.383397233422318e-05,
4173
+ "loss": 9.6041,
4174
+ "step": 595
4175
+ },
4176
+ {
4177
+ "epoch": 0.7148425787106447,
4178
+ "grad_norm": 0.2853706479072571,
4179
+ "learning_rate": 2.3651603129937592e-05,
4180
+ "loss": 9.6042,
4181
+ "step": 596
4182
+ },
4183
+ {
4184
+ "epoch": 0.7160419790104947,
4185
+ "grad_norm": 0.3066234886646271,
4186
+ "learning_rate": 2.346971792561413e-05,
4187
+ "loss": 9.6053,
4188
+ "step": 597
4189
+ },
4190
+ {
4191
+ "epoch": 0.7172413793103448,
4192
+ "grad_norm": 0.2879929542541504,
4193
+ "learning_rate": 2.3288320062345277e-05,
4194
+ "loss": 9.6069,
4195
+ "step": 598
4196
+ },
4197
+ {
4198
+ "epoch": 0.7184407796101949,
4199
+ "grad_norm": 0.35332369804382324,
4200
+ "learning_rate": 2.3107412872271518e-05,
4201
+ "loss": 9.6162,
4202
+ "step": 599
4203
+ },
4204
+ {
4205
+ "epoch": 0.719640179910045,
4206
+ "grad_norm": 0.5152252316474915,
4207
+ "learning_rate": 2.2926999678519974e-05,
4208
+ "loss": 9.6182,
4209
+ "step": 600
4210
+ },
4211
+ {
4212
+ "epoch": 0.720839580209895,
4213
+ "grad_norm": 0.2663346230983734,
4214
+ "learning_rate": 2.274708379514348e-05,
4215
+ "loss": 9.5986,
4216
+ "step": 601
4217
+ },
4218
+ {
4219
+ "epoch": 0.7220389805097451,
4220
+ "grad_norm": 0.27524423599243164,
4221
+ "learning_rate": 2.256766852705967e-05,
4222
+ "loss": 9.5986,
4223
+ "step": 602
4224
+ },
4225
+ {
4226
+ "epoch": 0.7232383808095952,
4227
+ "grad_norm": 0.2814219295978546,
4228
+ "learning_rate": 2.238875716999019e-05,
4229
+ "loss": 9.6037,
4230
+ "step": 603
4231
+ },
4232
+ {
4233
+ "epoch": 0.7244377811094452,
4234
+ "grad_norm": 0.2859136760234833,
4235
+ "learning_rate": 2.221035301040027e-05,
4236
+ "loss": 9.6002,
4237
+ "step": 604
4238
+ },
4239
+ {
4240
+ "epoch": 0.7256371814092953,
4241
+ "grad_norm": 0.27460747957229614,
4242
+ "learning_rate": 2.2032459325438336e-05,
4243
+ "loss": 9.6031,
4244
+ "step": 605
4245
+ },
4246
+ {
4247
+ "epoch": 0.7268365817091454,
4248
+ "grad_norm": 0.2745445966720581,
4249
+ "learning_rate": 2.185507938287572e-05,
4250
+ "loss": 9.6072,
4251
+ "step": 606
4252
+ },
4253
+ {
4254
+ "epoch": 0.7280359820089956,
4255
+ "grad_norm": 0.2816024124622345,
4256
+ "learning_rate": 2.1678216441046734e-05,
4257
+ "loss": 9.6128,
4258
+ "step": 607
4259
+ },
4260
+ {
4261
+ "epoch": 0.7292353823088455,
4262
+ "grad_norm": 0.28734058141708374,
4263
+ "learning_rate": 2.1501873748788802e-05,
4264
+ "loss": 9.6127,
4265
+ "step": 608
4266
+ },
4267
+ {
4268
+ "epoch": 0.7304347826086957,
4269
+ "grad_norm": 0.28445249795913696,
4270
+ "learning_rate": 2.1326054545382695e-05,
4271
+ "loss": 9.6118,
4272
+ "step": 609
4273
+ },
4274
+ {
4275
+ "epoch": 0.7316341829085458,
4276
+ "grad_norm": 0.2825443148612976,
4277
+ "learning_rate": 2.1150762060493155e-05,
4278
+ "loss": 9.6182,
4279
+ "step": 610
4280
+ },
4281
+ {
4282
+ "epoch": 0.7328335832083958,
4283
+ "grad_norm": 0.29409319162368774,
4284
+ "learning_rate": 2.09759995141095e-05,
4285
+ "loss": 9.611,
4286
+ "step": 611
4287
+ },
4288
+ {
4289
+ "epoch": 0.7340329835082459,
4290
+ "grad_norm": 0.30348506569862366,
4291
+ "learning_rate": 2.0801770116486447e-05,
4292
+ "loss": 9.6193,
4293
+ "step": 612
4294
+ },
4295
+ {
4296
+ "epoch": 0.735232383808096,
4297
+ "grad_norm": 0.2586905360221863,
4298
+ "learning_rate": 2.0628077068085173e-05,
4299
+ "loss": 9.6146,
4300
+ "step": 613
4301
+ },
4302
+ {
4303
+ "epoch": 0.7364317841079461,
4304
+ "grad_norm": 0.27243587374687195,
4305
+ "learning_rate": 2.0454923559514595e-05,
4306
+ "loss": 9.6025,
4307
+ "step": 614
4308
+ },
4309
+ {
4310
+ "epoch": 0.7376311844077961,
4311
+ "grad_norm": 0.27491042017936707,
4312
+ "learning_rate": 2.028231277147261e-05,
4313
+ "loss": 9.6013,
4314
+ "step": 615
4315
+ },
4316
+ {
4317
+ "epoch": 0.7388305847076462,
4318
+ "grad_norm": 0.279153048992157,
4319
+ "learning_rate": 2.0110247874687815e-05,
4320
+ "loss": 9.5937,
4321
+ "step": 616
4322
+ },
4323
+ {
4324
+ "epoch": 0.7400299850074963,
4325
+ "grad_norm": 0.27780649065971375,
4326
+ "learning_rate": 1.993873202986119e-05,
4327
+ "loss": 9.6022,
4328
+ "step": 617
4329
+ },
4330
+ {
4331
+ "epoch": 0.7412293853073463,
4332
+ "grad_norm": 0.2798539698123932,
4333
+ "learning_rate": 1.976776838760801e-05,
4334
+ "loss": 9.6022,
4335
+ "step": 618
4336
+ },
4337
+ {
4338
+ "epoch": 0.7424287856071964,
4339
+ "grad_norm": 0.27843162417411804,
4340
+ "learning_rate": 1.9597360088400052e-05,
4341
+ "loss": 9.6062,
4342
+ "step": 619
4343
+ },
4344
+ {
4345
+ "epoch": 0.7436281859070465,
4346
+ "grad_norm": 0.27371302247047424,
4347
+ "learning_rate": 1.9427510262507864e-05,
4348
+ "loss": 9.6119,
4349
+ "step": 620
4350
+ },
4351
+ {
4352
+ "epoch": 0.7448275862068966,
4353
+ "grad_norm": 0.2873663604259491,
4354
+ "learning_rate": 1.925822202994323e-05,
4355
+ "loss": 9.6004,
4356
+ "step": 621
4357
+ },
4358
+ {
4359
+ "epoch": 0.7460269865067466,
4360
+ "grad_norm": 0.2875591218471527,
4361
+ "learning_rate": 1.9089498500401914e-05,
4362
+ "loss": 9.6119,
4363
+ "step": 622
4364
+ },
4365
+ {
4366
+ "epoch": 0.7472263868065967,
4367
+ "grad_norm": 0.2853778004646301,
4368
+ "learning_rate": 1.892134277320655e-05,
4369
+ "loss": 9.6091,
4370
+ "step": 623
4371
+ },
4372
+ {
4373
+ "epoch": 0.7484257871064468,
4374
+ "grad_norm": 0.2952004075050354,
4375
+ "learning_rate": 1.87537579372496e-05,
4376
+ "loss": 9.6182,
4377
+ "step": 624
4378
+ },
4379
+ {
4380
+ "epoch": 0.7496251874062968,
4381
+ "grad_norm": 0.3686712980270386,
4382
+ "learning_rate": 1.858674707093675e-05,
4383
+ "loss": 9.614,
4384
+ "step": 625
4385
+ },
4386
+ {
4387
+ "epoch": 0.7508245877061469,
4388
+ "grad_norm": 0.2664184868335724,
4389
+ "learning_rate": 1.8420313242130293e-05,
4390
+ "loss": 9.6005,
4391
+ "step": 626
4392
+ },
4393
+ {
4394
+ "epoch": 0.752023988005997,
4395
+ "grad_norm": 0.2688407301902771,
4396
+ "learning_rate": 1.8254459508092768e-05,
4397
+ "loss": 9.5988,
4398
+ "step": 627
4399
+ },
4400
+ {
4401
+ "epoch": 0.7532233883058471,
4402
+ "grad_norm": 0.2794104516506195,
4403
+ "learning_rate": 1.8089188915430793e-05,
4404
+ "loss": 9.5987,
4405
+ "step": 628
4406
+ },
4407
+ {
4408
+ "epoch": 0.7544227886056971,
4409
+ "grad_norm": 0.26486334204673767,
4410
+ "learning_rate": 1.792450450003919e-05,
4411
+ "loss": 9.6129,
4412
+ "step": 629
4413
+ },
4414
+ {
4415
+ "epoch": 0.7556221889055472,
4416
+ "grad_norm": 0.2762359082698822,
4417
+ "learning_rate": 1.7760409287045078e-05,
4418
+ "loss": 9.6052,
4419
+ "step": 630
4420
+ },
4421
+ {
4422
+ "epoch": 0.7568215892053973,
4423
+ "grad_norm": 0.27764591574668884,
4424
+ "learning_rate": 1.7596906290752425e-05,
4425
+ "loss": 9.6056,
4426
+ "step": 631
4427
+ },
4428
+ {
4429
+ "epoch": 0.7580209895052473,
4430
+ "grad_norm": 0.276153028011322,
4431
+ "learning_rate": 1.743399851458663e-05,
4432
+ "loss": 9.609,
4433
+ "step": 632
4434
+ },
4435
+ {
4436
+ "epoch": 0.7592203898050974,
4437
+ "grad_norm": 0.2780199646949768,
4438
+ "learning_rate": 1.727168895103931e-05,
4439
+ "loss": 9.6081,
4440
+ "step": 633
4441
+ },
4442
+ {
4443
+ "epoch": 0.7604197901049475,
4444
+ "grad_norm": 0.276457816362381,
4445
+ "learning_rate": 1.7109980581613417e-05,
4446
+ "loss": 9.6062,
4447
+ "step": 634
4448
+ },
4449
+ {
4450
+ "epoch": 0.7616191904047976,
4451
+ "grad_norm": 0.2808220088481903,
4452
+ "learning_rate": 1.6948876376768418e-05,
4453
+ "loss": 9.6123,
4454
+ "step": 635
4455
+ },
4456
+ {
4457
+ "epoch": 0.7628185907046476,
4458
+ "grad_norm": 0.29566583037376404,
4459
+ "learning_rate": 1.6788379295865704e-05,
4460
+ "loss": 9.6094,
4461
+ "step": 636
4462
+ },
4463
+ {
4464
+ "epoch": 0.7640179910044977,
4465
+ "grad_norm": 0.33136534690856934,
4466
+ "learning_rate": 1.6628492287114296e-05,
4467
+ "loss": 9.614,
4468
+ "step": 637
4469
+ },
4470
+ {
4471
+ "epoch": 0.7652173913043478,
4472
+ "grad_norm": 0.27251994609832764,
4473
+ "learning_rate": 1.6469218287516664e-05,
4474
+ "loss": 9.6011,
4475
+ "step": 638
4476
+ },
4477
+ {
4478
+ "epoch": 0.766416791604198,
4479
+ "grad_norm": 0.2670121490955353,
4480
+ "learning_rate": 1.6310560222814714e-05,
4481
+ "loss": 9.6037,
4482
+ "step": 639
4483
+ },
4484
+ {
4485
+ "epoch": 0.767616191904048,
4486
+ "grad_norm": 0.2792399227619171,
4487
+ "learning_rate": 1.6152521007436145e-05,
4488
+ "loss": 9.6036,
4489
+ "step": 640
4490
+ },
4491
+ {
4492
+ "epoch": 0.7688155922038981,
4493
+ "grad_norm": 0.275511234998703,
4494
+ "learning_rate": 1.599510354444087e-05,
4495
+ "loss": 9.5973,
4496
+ "step": 641
4497
+ },
4498
+ {
4499
+ "epoch": 0.7700149925037482,
4500
+ "grad_norm": 0.2751782536506653,
4501
+ "learning_rate": 1.5838310725467644e-05,
4502
+ "loss": 9.6005,
4503
+ "step": 642
4504
+ },
4505
+ {
4506
+ "epoch": 0.7712143928035982,
4507
+ "grad_norm": 0.28111734986305237,
4508
+ "learning_rate": 1.5682145430681027e-05,
4509
+ "loss": 9.6015,
4510
+ "step": 643
4511
+ },
4512
+ {
4513
+ "epoch": 0.7724137931034483,
4514
+ "grad_norm": 0.2826797068119049,
4515
+ "learning_rate": 1.5526610528718415e-05,
4516
+ "loss": 9.6054,
4517
+ "step": 644
4518
+ },
4519
+ {
4520
+ "epoch": 0.7736131934032984,
4521
+ "grad_norm": 0.28505128622055054,
4522
+ "learning_rate": 1.5371708876637354e-05,
4523
+ "loss": 9.6042,
4524
+ "step": 645
4525
+ },
4526
+ {
4527
+ "epoch": 0.7748125937031485,
4528
+ "grad_norm": 0.28200674057006836,
4529
+ "learning_rate": 1.5217443319863112e-05,
4530
+ "loss": 9.6051,
4531
+ "step": 646
4532
+ },
4533
+ {
4534
+ "epoch": 0.7760119940029985,
4535
+ "grad_norm": 0.2859637439250946,
4536
+ "learning_rate": 1.5063816692136373e-05,
4537
+ "loss": 9.6004,
4538
+ "step": 647
4539
+ },
4540
+ {
4541
+ "epoch": 0.7772113943028486,
4542
+ "grad_norm": 0.28504401445388794,
4543
+ "learning_rate": 1.4910831815461123e-05,
4544
+ "loss": 9.6177,
4545
+ "step": 648
4546
+ },
4547
+ {
4548
+ "epoch": 0.7784107946026987,
4549
+ "grad_norm": 0.2949487268924713,
4550
+ "learning_rate": 1.4758491500052924e-05,
4551
+ "loss": 9.6204,
4552
+ "step": 649
4553
+ },
4554
+ {
4555
+ "epoch": 0.7796101949025487,
4556
+ "grad_norm": 0.3952041268348694,
4557
+ "learning_rate": 1.4606798544287243e-05,
4558
+ "loss": 9.62,
4559
+ "step": 650
4560
+ },
4561
+ {
4562
+ "epoch": 0.7808095952023988,
4563
+ "grad_norm": 0.2684868574142456,
4564
+ "learning_rate": 1.445575573464799e-05,
4565
+ "loss": 9.5986,
4566
+ "step": 651
4567
+ },
4568
+ {
4569
+ "epoch": 0.7820089955022489,
4570
+ "grad_norm": 0.2751760184764862,
4571
+ "learning_rate": 1.4305365845676439e-05,
4572
+ "loss": 9.5993,
4573
+ "step": 652
4574
+ },
4575
+ {
4576
+ "epoch": 0.783208395802099,
4577
+ "grad_norm": 0.27565452456474304,
4578
+ "learning_rate": 1.4155631639920209e-05,
4579
+ "loss": 9.5939,
4580
+ "step": 653
4581
+ },
4582
+ {
4583
+ "epoch": 0.784407796101949,
4584
+ "grad_norm": 0.27967387437820435,
4585
+ "learning_rate": 1.4006555867882464e-05,
4586
+ "loss": 9.6024,
4587
+ "step": 654
4588
+ },
4589
+ {
4590
+ "epoch": 0.7856071964017991,
4591
+ "grad_norm": 0.28178393840789795,
4592
+ "learning_rate": 1.3858141267971491e-05,
4593
+ "loss": 9.6057,
4594
+ "step": 655
4595
+ },
4596
+ {
4597
+ "epoch": 0.7868065967016492,
4598
+ "grad_norm": 0.27983683347702026,
4599
+ "learning_rate": 1.3710390566450366e-05,
4600
+ "loss": 9.6059,
4601
+ "step": 656
4602
+ },
4603
+ {
4604
+ "epoch": 0.7880059970014992,
4605
+ "grad_norm": 0.286726713180542,
4606
+ "learning_rate": 1.3563306477386784e-05,
4607
+ "loss": 9.6032,
4608
+ "step": 657
4609
+ },
4610
+ {
4611
+ "epoch": 0.7892053973013493,
4612
+ "grad_norm": 0.2814926505088806,
4613
+ "learning_rate": 1.3416891702603358e-05,
4614
+ "loss": 9.6077,
4615
+ "step": 658
4616
+ },
4617
+ {
4618
+ "epoch": 0.7904047976011994,
4619
+ "grad_norm": 0.291660875082016,
4620
+ "learning_rate": 1.3271148931627858e-05,
4621
+ "loss": 9.6055,
4622
+ "step": 659
4623
+ },
4624
+ {
4625
+ "epoch": 0.7916041979010495,
4626
+ "grad_norm": 0.2863795757293701,
4627
+ "learning_rate": 1.3126080841643856e-05,
4628
+ "loss": 9.6111,
4629
+ "step": 660
4630
+ },
4631
+ {
4632
+ "epoch": 0.7928035982008995,
4633
+ "grad_norm": 0.2854698896408081,
4634
+ "learning_rate": 1.2981690097441573e-05,
4635
+ "loss": 9.6172,
4636
+ "step": 661
4637
+ },
4638
+ {
4639
+ "epoch": 0.7940029985007496,
4640
+ "grad_norm": 0.3119170367717743,
4641
+ "learning_rate": 1.2837979351368912e-05,
4642
+ "loss": 9.6102,
4643
+ "step": 662
4644
+ },
4645
+ {
4646
+ "epoch": 0.7952023988005997,
4647
+ "grad_norm": 0.27526015043258667,
4648
+ "learning_rate": 1.2694951243282683e-05,
4649
+ "loss": 9.6006,
4650
+ "step": 663
4651
+ },
4652
+ {
4653
+ "epoch": 0.7964017991004497,
4654
+ "grad_norm": 0.27086350321769714,
4655
+ "learning_rate": 1.2552608400500199e-05,
4656
+ "loss": 9.6,
4657
+ "step": 664
4658
+ },
4659
+ {
4660
+ "epoch": 0.7976011994002998,
4661
+ "grad_norm": 0.2674426734447479,
4662
+ "learning_rate": 1.2410953437750966e-05,
4663
+ "loss": 9.599,
4664
+ "step": 665
4665
+ },
4666
+ {
4667
+ "epoch": 0.7988005997001499,
4668
+ "grad_norm": 0.26960834860801697,
4669
+ "learning_rate": 1.2269988957128636e-05,
4670
+ "loss": 9.6059,
4671
+ "step": 666
4672
+ },
4673
+ {
4674
+ "epoch": 0.8,
4675
+ "grad_norm": 0.27745890617370605,
4676
+ "learning_rate": 1.212971754804324e-05,
4677
+ "loss": 9.6046,
4678
+ "step": 667
4679
+ },
4680
+ {
4681
+ "epoch": 0.80119940029985,
4682
+ "grad_norm": 0.2803892493247986,
4683
+ "learning_rate": 1.1990141787173648e-05,
4684
+ "loss": 9.6036,
4685
+ "step": 668
4686
+ },
4687
+ {
4688
+ "epoch": 0.8023988005997001,
4689
+ "grad_norm": 0.2826705574989319,
4690
+ "learning_rate": 1.1851264238420135e-05,
4691
+ "loss": 9.6031,
4692
+ "step": 669
4693
+ },
4694
+ {
4695
+ "epoch": 0.8035982008995503,
4696
+ "grad_norm": 0.28543218970298767,
4697
+ "learning_rate": 1.1713087452857408e-05,
4698
+ "loss": 9.6047,
4699
+ "step": 670
4700
+ },
4701
+ {
4702
+ "epoch": 0.8047976011994002,
4703
+ "grad_norm": 0.2749161124229431,
4704
+ "learning_rate": 1.1575613968687682e-05,
4705
+ "loss": 9.6061,
4706
+ "step": 671
4707
+ },
4708
+ {
4709
+ "epoch": 0.8059970014992504,
4710
+ "grad_norm": 0.2880239486694336,
4711
+ "learning_rate": 1.1438846311194024e-05,
4712
+ "loss": 9.607,
4713
+ "step": 672
4714
+ },
4715
+ {
4716
+ "epoch": 0.8071964017991005,
4717
+ "grad_norm": 0.2794909179210663,
4718
+ "learning_rate": 1.1302786992694048e-05,
4719
+ "loss": 9.6098,
4720
+ "step": 673
4721
+ },
4722
+ {
4723
+ "epoch": 0.8083958020989506,
4724
+ "grad_norm": 0.3027547299861908,
4725
+ "learning_rate": 1.1167438512493683e-05,
4726
+ "loss": 9.6116,
4727
+ "step": 674
4728
+ },
4729
+ {
4730
+ "epoch": 0.8095952023988006,
4731
+ "grad_norm": 0.34445720911026,
4732
+ "learning_rate": 1.1032803356841342e-05,
4733
+ "loss": 9.6171,
4734
+ "step": 675
4735
+ },
4736
+ {
4737
+ "epoch": 0.8107946026986507,
4738
+ "grad_norm": 0.2722347378730774,
4739
+ "learning_rate": 1.0898883998882158e-05,
4740
+ "loss": 9.601,
4741
+ "step": 676
4742
+ },
4743
+ {
4744
+ "epoch": 0.8119940029985008,
4745
+ "grad_norm": 0.27299922704696655,
4746
+ "learning_rate": 1.0765682898612656e-05,
4747
+ "loss": 9.5976,
4748
+ "step": 677
4749
+ },
4750
+ {
4751
+ "epoch": 0.8131934032983508,
4752
+ "grad_norm": 0.2737182080745697,
4753
+ "learning_rate": 1.0633202502835494e-05,
4754
+ "loss": 9.5965,
4755
+ "step": 678
4756
+ },
4757
+ {
4758
+ "epoch": 0.8143928035982009,
4759
+ "grad_norm": 0.2752780020236969,
4760
+ "learning_rate": 1.0501445245114522e-05,
4761
+ "loss": 9.6009,
4762
+ "step": 679
4763
+ },
4764
+ {
4765
+ "epoch": 0.815592203898051,
4766
+ "grad_norm": 0.2721465826034546,
4767
+ "learning_rate": 1.0370413545730118e-05,
4768
+ "loss": 9.6064,
4769
+ "step": 680
4770
+ },
4771
+ {
4772
+ "epoch": 0.8167916041979011,
4773
+ "grad_norm": 0.2846396267414093,
4774
+ "learning_rate": 1.0240109811634712e-05,
4775
+ "loss": 9.5995,
4776
+ "step": 681
4777
+ },
4778
+ {
4779
+ "epoch": 0.8179910044977511,
4780
+ "grad_norm": 0.28411293029785156,
4781
+ "learning_rate": 1.0110536436408535e-05,
4782
+ "loss": 9.5975,
4783
+ "step": 682
4784
+ },
4785
+ {
4786
+ "epoch": 0.8191904047976012,
4787
+ "grad_norm": 0.2815098762512207,
4788
+ "learning_rate": 9.9816958002157e-06,
4789
+ "loss": 9.6078,
4790
+ "step": 683
4791
+ },
4792
+ {
4793
+ "epoch": 0.8203898050974513,
4794
+ "grad_norm": 0.278131902217865,
4795
+ "learning_rate": 9.853590269760493e-06,
4796
+ "loss": 9.6143,
4797
+ "step": 684
4798
+ },
4799
+ {
4800
+ "epoch": 0.8215892053973014,
4801
+ "grad_norm": 0.2930939197540283,
4802
+ "learning_rate": 9.726222198243806e-06,
4803
+ "loss": 9.6042,
4804
+ "step": 685
4805
+ },
4806
+ {
4807
+ "epoch": 0.8227886056971514,
4808
+ "grad_norm": 0.2876308560371399,
4809
+ "learning_rate": 9.599593925320016e-06,
4810
+ "loss": 9.6187,
4811
+ "step": 686
4812
+ },
4813
+ {
4814
+ "epoch": 0.8239880059970015,
4815
+ "grad_norm": 0.3398456573486328,
4816
+ "learning_rate": 9.47370777705397e-06,
4817
+ "loss": 9.6115,
4818
+ "step": 687
4819
+ },
4820
+ {
4821
+ "epoch": 0.8251874062968516,
4822
+ "grad_norm": 0.28324592113494873,
4823
+ "learning_rate": 9.348566065878217e-06,
4824
+ "loss": 9.5972,
4825
+ "step": 688
4826
+ },
4827
+ {
4828
+ "epoch": 0.8263868065967016,
4829
+ "grad_norm": 0.271178662776947,
4830
+ "learning_rate": 9.224171090550571e-06,
4831
+ "loss": 9.6004,
4832
+ "step": 689
4833
+ },
4834
+ {
4835
+ "epoch": 0.8275862068965517,
4836
+ "grad_norm": 0.26743438839912415,
4837
+ "learning_rate": 9.100525136111915e-06,
4838
+ "loss": 9.604,
4839
+ "step": 690
4840
+ },
4841
+ {
4842
+ "epoch": 0.8287856071964018,
4843
+ "grad_norm": 0.2741158604621887,
4844
+ "learning_rate": 8.97763047384414e-06,
4845
+ "loss": 9.6024,
4846
+ "step": 691
4847
+ },
4848
+ {
4849
+ "epoch": 0.8299850074962519,
4850
+ "grad_norm": 0.2776412069797516,
4851
+ "learning_rate": 8.855489361228496e-06,
4852
+ "loss": 9.5996,
4853
+ "step": 692
4854
+ },
4855
+ {
4856
+ "epoch": 0.8311844077961019,
4857
+ "grad_norm": 0.2762274742126465,
4858
+ "learning_rate": 8.734104041904129e-06,
4859
+ "loss": 9.6041,
4860
+ "step": 693
4861
+ },
4862
+ {
4863
+ "epoch": 0.832383808095952,
4864
+ "grad_norm": 0.2758176624774933,
4865
+ "learning_rate": 8.61347674562677e-06,
4866
+ "loss": 9.6084,
4867
+ "step": 694
4868
+ },
4869
+ {
4870
+ "epoch": 0.8335832083958021,
4871
+ "grad_norm": 0.28230342268943787,
4872
+ "learning_rate": 8.4936096882279e-06,
4873
+ "loss": 9.6047,
4874
+ "step": 695
4875
+ },
4876
+ {
4877
+ "epoch": 0.8347826086956521,
4878
+ "grad_norm": 0.28801631927490234,
4879
+ "learning_rate": 8.37450507157399e-06,
4880
+ "loss": 9.6084,
4881
+ "step": 696
4882
+ },
4883
+ {
4884
+ "epoch": 0.8359820089955022,
4885
+ "grad_norm": 0.289760559797287,
4886
+ "learning_rate": 8.256165083526019e-06,
4887
+ "loss": 9.6033,
4888
+ "step": 697
4889
+ },
4890
+ {
4891
+ "epoch": 0.8371814092953523,
4892
+ "grad_norm": 0.29011571407318115,
4893
+ "learning_rate": 8.138591897899345e-06,
4894
+ "loss": 9.6161,
4895
+ "step": 698
4896
+ },
4897
+ {
4898
+ "epoch": 0.8383808095952024,
4899
+ "grad_norm": 0.3083633780479431,
4900
+ "learning_rate": 8.021787674423775e-06,
4901
+ "loss": 9.6152,
4902
+ "step": 699
4903
+ },
4904
+ {
4905
+ "epoch": 0.8395802098950524,
4906
+ "grad_norm": 0.36470380425453186,
4907
+ "learning_rate": 7.905754558703803e-06,
4908
+ "loss": 9.6132,
4909
+ "step": 700
4910
+ },
4911
+ {
4912
+ "epoch": 0.8407796101949025,
4913
+ "grad_norm": 0.26850220561027527,
4914
+ "learning_rate": 7.790494682179317e-06,
4915
+ "loss": 9.5949,
4916
+ "step": 701
4917
+ },
4918
+ {
4919
+ "epoch": 0.8419790104947527,
4920
+ "grad_norm": 0.2714633643627167,
4921
+ "learning_rate": 7.676010162086388e-06,
4922
+ "loss": 9.604,
4923
+ "step": 702
4924
+ },
4925
+ {
4926
+ "epoch": 0.8431784107946027,
4927
+ "grad_norm": 0.2753824293613434,
4928
+ "learning_rate": 7.56230310141835e-06,
4929
+ "loss": 9.5993,
4930
+ "step": 703
4931
+ },
4932
+ {
4933
+ "epoch": 0.8443778110944528,
4934
+ "grad_norm": 0.2757047116756439,
4935
+ "learning_rate": 7.449375588887203e-06,
4936
+ "loss": 9.5993,
4937
+ "step": 704
4938
+ },
4939
+ {
4940
+ "epoch": 0.8455772113943029,
4941
+ "grad_norm": 0.27331098914146423,
4942
+ "learning_rate": 7.337229698885279e-06,
4943
+ "loss": 9.6088,
4944
+ "step": 705
4945
+ },
4946
+ {
4947
+ "epoch": 0.846776611694153,
4948
+ "grad_norm": 0.2818980813026428,
4949
+ "learning_rate": 7.225867491447053e-06,
4950
+ "loss": 9.6,
4951
+ "step": 706
4952
+ },
4953
+ {
4954
+ "epoch": 0.847976011994003,
4955
+ "grad_norm": 0.2784759998321533,
4956
+ "learning_rate": 7.115291012211383e-06,
4957
+ "loss": 9.6056,
4958
+ "step": 707
4959
+ },
4960
+ {
4961
+ "epoch": 0.8491754122938531,
4962
+ "grad_norm": 0.2809768319129944,
4963
+ "learning_rate": 7.005502292383898e-06,
4964
+ "loss": 9.6092,
4965
+ "step": 708
4966
+ },
4967
+ {
4968
+ "epoch": 0.8503748125937032,
4969
+ "grad_norm": 0.29430076479911804,
4970
+ "learning_rate": 6.896503348699657e-06,
4971
+ "loss": 9.6031,
4972
+ "step": 709
4973
+ },
4974
+ {
4975
+ "epoch": 0.8515742128935532,
4976
+ "grad_norm": 0.28350192308425903,
4977
+ "learning_rate": 6.788296183386162e-06,
4978
+ "loss": 9.6105,
4979
+ "step": 710
4980
+ },
4981
+ {
4982
+ "epoch": 0.8527736131934033,
4983
+ "grad_norm": 0.29121461510658264,
4984
+ "learning_rate": 6.680882784126552e-06,
4985
+ "loss": 9.6108,
4986
+ "step": 711
4987
+ },
4988
+ {
4989
+ "epoch": 0.8539730134932534,
4990
+ "grad_norm": 0.3215639889240265,
4991
+ "learning_rate": 6.5742651240230545e-06,
4992
+ "loss": 9.6104,
4993
+ "step": 712
4994
+ },
4995
+ {
4996
+ "epoch": 0.8551724137931035,
4997
+ "grad_norm": 0.27074047923088074,
4998
+ "learning_rate": 6.46844516156081e-06,
4999
+ "loss": 9.598,
5000
+ "step": 713
5001
+ },
5002
+ {
5003
+ "epoch": 0.8563718140929535,
5004
+ "grad_norm": 0.2728975713253021,
5005
+ "learning_rate": 6.363424840571869e-06,
5006
+ "loss": 9.5965,
5007
+ "step": 714
5008
+ },
5009
+ {
5010
+ "epoch": 0.8575712143928036,
5011
+ "grad_norm": 0.2756417393684387,
5012
+ "learning_rate": 6.259206090199426e-06,
5013
+ "loss": 9.6021,
5014
+ "step": 715
5015
+ },
5016
+ {
5017
+ "epoch": 0.8587706146926537,
5018
+ "grad_norm": 0.28334730863571167,
5019
+ "learning_rate": 6.155790824862484e-06,
5020
+ "loss": 9.5923,
5021
+ "step": 716
5022
+ },
5023
+ {
5024
+ "epoch": 0.8599700149925037,
5025
+ "grad_norm": 0.2780725359916687,
5026
+ "learning_rate": 6.053180944220627e-06,
5027
+ "loss": 9.5977,
5028
+ "step": 717
5029
  }
5030
  ],
5031
  "logging_steps": 1,
 
5045
  "attributes": {}
5046
  }
5047
  },
5048
+ "total_flos": 595349320237056.0,
5049
  "train_batch_size": 4,
5050
  "trial_name": null,
5051
  "trial_params": null