diaenra commited on
Commit
28e1531
·
verified ·
1 Parent(s): 6c26a2d

Training in progress, step 5846, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7161eb678b753e3707eaa8342a389dd6f3264b8b3862eb1eb500e43fffbc98c
3
  size 50358592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25745c4e48331c53fd5a30e76601bed2b2c294b2fa84b0b99dc053205d796695
3
  size 50358592
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2c6899a3ad6d41d03cfc558affb1a967a0b7457195cad8501bb7d092cd157ef
3
  size 100824826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2be6675b9b9b80a7d2257d3cf7288a27aea082042953111315f2974cd98cf39a
3
  size 100824826
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a3e550049197115447b65598046189db78e16d4fe3b16c292eb5ffb8cdcb810
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9f870579aad726ad5dd33eb2c2ef3c173af7d0105d1964cc901d9f9fddc786e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd55cb27e87133306bf758a06b4c53c228d86f0cb1d2a25f2580caea010551e6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01d224b5db2d008aab6901376593307ed413c8767c47ba9d506ac5ec4e271040
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9623674307218613,
5
  "eval_steps": 500,
6
- "global_step": 5736,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -40167,6 +40167,776 @@
40167
  "learning_rate": 9.039886378601204e-08,
40168
  "loss": 22.0672,
40169
  "step": 5736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40170
  }
40171
  ],
40172
  "logging_steps": 1,
@@ -40181,12 +40951,12 @@
40181
  "should_evaluate": false,
40182
  "should_log": false,
40183
  "should_save": true,
40184
- "should_training_stop": false
40185
  },
40186
  "attributes": {}
40187
  }
40188
  },
40189
- "total_flos": 1.8139386024768307e+17,
40190
  "train_batch_size": 4,
40191
  "trial_name": null,
40192
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 500,
6
+ "global_step": 5846,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
40167
  "learning_rate": 9.039886378601204e-08,
40168
  "loss": 22.0672,
40169
  "step": 5736
40170
+ },
40171
+ {
40172
+ "epoch": 1.962709544988026,
40173
+ "grad_norm": 6.91142463684082,
40174
+ "learning_rate": 8.87632033329977e-08,
40175
+ "loss": 19.9954,
40176
+ "step": 5737
40177
+ },
40178
+ {
40179
+ "epoch": 1.963051659254191,
40180
+ "grad_norm": 6.9987006187438965,
40181
+ "learning_rate": 8.71424628112738e-08,
40182
+ "loss": 19.3593,
40183
+ "step": 5738
40184
+ },
40185
+ {
40186
+ "epoch": 1.9633937735203557,
40187
+ "grad_norm": 7.478214740753174,
40188
+ "learning_rate": 8.553664270531947e-08,
40189
+ "loss": 20.6897,
40190
+ "step": 5739
40191
+ },
40192
+ {
40193
+ "epoch": 1.9637358877865208,
40194
+ "grad_norm": 9.3041410446167,
40195
+ "learning_rate": 8.394574349517293e-08,
40196
+ "loss": 19.2391,
40197
+ "step": 5740
40198
+ },
40199
+ {
40200
+ "epoch": 1.9640780020526856,
40201
+ "grad_norm": 9.374120712280273,
40202
+ "learning_rate": 8.236976565639265e-08,
40203
+ "loss": 19.4552,
40204
+ "step": 5741
40205
+ },
40206
+ {
40207
+ "epoch": 1.9644201163188506,
40208
+ "grad_norm": 9.557108879089355,
40209
+ "learning_rate": 8.080870966008514e-08,
40210
+ "loss": 19.067,
40211
+ "step": 5742
40212
+ },
40213
+ {
40214
+ "epoch": 1.9647622305850154,
40215
+ "grad_norm": 8.268561363220215,
40216
+ "learning_rate": 7.926257597289376e-08,
40217
+ "loss": 20.9947,
40218
+ "step": 5743
40219
+ },
40220
+ {
40221
+ "epoch": 1.9651043448511802,
40222
+ "grad_norm": 8.512595176696777,
40223
+ "learning_rate": 7.773136505700995e-08,
40224
+ "loss": 20.4771,
40225
+ "step": 5744
40226
+ },
40227
+ {
40228
+ "epoch": 1.965446459117345,
40229
+ "grad_norm": 8.379534721374512,
40230
+ "learning_rate": 7.621507737015088e-08,
40231
+ "loss": 21.1643,
40232
+ "step": 5745
40233
+ },
40234
+ {
40235
+ "epoch": 1.96578857338351,
40236
+ "grad_norm": 9.614562034606934,
40237
+ "learning_rate": 7.471371336558174e-08,
40238
+ "loss": 20.1646,
40239
+ "step": 5746
40240
+ },
40241
+ {
40242
+ "epoch": 1.9661306876496751,
40243
+ "grad_norm": 9.188095092773438,
40244
+ "learning_rate": 7.322727349211023e-08,
40245
+ "loss": 21.6648,
40246
+ "step": 5747
40247
+ },
40248
+ {
40249
+ "epoch": 1.96647280191584,
40250
+ "grad_norm": 10.135308265686035,
40251
+ "learning_rate": 7.175575819406421e-08,
40252
+ "loss": 19.3838,
40253
+ "step": 5748
40254
+ },
40255
+ {
40256
+ "epoch": 1.9668149161820048,
40257
+ "grad_norm": 9.67789363861084,
40258
+ "learning_rate": 7.029916791133074e-08,
40259
+ "loss": 18.6511,
40260
+ "step": 5749
40261
+ },
40262
+ {
40263
+ "epoch": 1.9671570304481696,
40264
+ "grad_norm": 10.633426666259766,
40265
+ "learning_rate": 6.885750307933369e-08,
40266
+ "loss": 20.5941,
40267
+ "step": 5750
40268
+ },
40269
+ {
40270
+ "epoch": 1.9674991447143346,
40271
+ "grad_norm": 10.650802612304688,
40272
+ "learning_rate": 6.743076412901173e-08,
40273
+ "loss": 20.8643,
40274
+ "step": 5751
40275
+ },
40276
+ {
40277
+ "epoch": 1.9678412589804994,
40278
+ "grad_norm": 10.619598388671875,
40279
+ "learning_rate": 6.601895148687365e-08,
40280
+ "loss": 20.3627,
40281
+ "step": 5752
40282
+ },
40283
+ {
40284
+ "epoch": 1.9681833732466645,
40285
+ "grad_norm": 12.187576293945312,
40286
+ "learning_rate": 6.462206557494854e-08,
40287
+ "loss": 20.6054,
40288
+ "step": 5753
40289
+ },
40290
+ {
40291
+ "epoch": 1.9685254875128293,
40292
+ "grad_norm": 12.619000434875488,
40293
+ "learning_rate": 6.324010681080239e-08,
40294
+ "loss": 21.6063,
40295
+ "step": 5754
40296
+ },
40297
+ {
40298
+ "epoch": 1.9688676017789941,
40299
+ "grad_norm": 15.741616249084473,
40300
+ "learning_rate": 6.187307560754363e-08,
40301
+ "loss": 22.4582,
40302
+ "step": 5755
40303
+ },
40304
+ {
40305
+ "epoch": 1.969209716045159,
40306
+ "grad_norm": 14.282339096069336,
40307
+ "learning_rate": 6.052097237382315e-08,
40308
+ "loss": 23.1004,
40309
+ "step": 5756
40310
+ },
40311
+ {
40312
+ "epoch": 1.969551830311324,
40313
+ "grad_norm": 15.24374771118164,
40314
+ "learning_rate": 5.918379751381764e-08,
40315
+ "loss": 24.0451,
40316
+ "step": 5757
40317
+ },
40318
+ {
40319
+ "epoch": 1.969893944577489,
40320
+ "grad_norm": 14.704069137573242,
40321
+ "learning_rate": 5.78615514272518e-08,
40322
+ "loss": 22.778,
40323
+ "step": 5758
40324
+ },
40325
+ {
40326
+ "epoch": 1.9702360588436538,
40327
+ "grad_norm": 16.28843116760254,
40328
+ "learning_rate": 5.655423450938724e-08,
40329
+ "loss": 19.8016,
40330
+ "step": 5759
40331
+ },
40332
+ {
40333
+ "epoch": 1.9705781731098186,
40334
+ "grad_norm": 16.319915771484375,
40335
+ "learning_rate": 5.5261847151011346e-08,
40336
+ "loss": 20.8609,
40337
+ "step": 5760
40338
+ },
40339
+ {
40340
+ "epoch": 1.9709202873759835,
40341
+ "grad_norm": 16.698453903198242,
40342
+ "learning_rate": 5.398438973845954e-08,
40343
+ "loss": 21.4025,
40344
+ "step": 5761
40345
+ },
40346
+ {
40347
+ "epoch": 1.9712624016421485,
40348
+ "grad_norm": 17.40521240234375,
40349
+ "learning_rate": 5.272186265360413e-08,
40350
+ "loss": 24.3388,
40351
+ "step": 5762
40352
+ },
40353
+ {
40354
+ "epoch": 1.9716045159083135,
40355
+ "grad_norm": 19.207595825195312,
40356
+ "learning_rate": 5.147426627384877e-08,
40357
+ "loss": 20.9791,
40358
+ "step": 5763
40359
+ },
40360
+ {
40361
+ "epoch": 1.9719466301744784,
40362
+ "grad_norm": 21.78555679321289,
40363
+ "learning_rate": 5.0241600972139594e-08,
40364
+ "loss": 21.4271,
40365
+ "step": 5764
40366
+ },
40367
+ {
40368
+ "epoch": 1.9722887444406432,
40369
+ "grad_norm": 21.837677001953125,
40370
+ "learning_rate": 4.9023867116948506e-08,
40371
+ "loss": 22.5372,
40372
+ "step": 5765
40373
+ },
40374
+ {
40375
+ "epoch": 1.972630858706808,
40376
+ "grad_norm": 24.989015579223633,
40377
+ "learning_rate": 4.7821065072301e-08,
40378
+ "loss": 20.4801,
40379
+ "step": 5766
40380
+ },
40381
+ {
40382
+ "epoch": 1.972972972972973,
40383
+ "grad_norm": 26.47430992126465,
40384
+ "learning_rate": 4.6633195197742784e-08,
40385
+ "loss": 19.5036,
40386
+ "step": 5767
40387
+ },
40388
+ {
40389
+ "epoch": 1.9733150872391378,
40390
+ "grad_norm": 28.229026794433594,
40391
+ "learning_rate": 4.5460257848373156e-08,
40392
+ "loss": 24.5793,
40393
+ "step": 5768
40394
+ },
40395
+ {
40396
+ "epoch": 1.9736572015053029,
40397
+ "grad_norm": 29.432619094848633,
40398
+ "learning_rate": 4.430225337480609e-08,
40399
+ "loss": 20.9656,
40400
+ "step": 5769
40401
+ },
40402
+ {
40403
+ "epoch": 1.9739993157714677,
40404
+ "grad_norm": 29.067018508911133,
40405
+ "learning_rate": 4.315918212320358e-08,
40406
+ "loss": 23.4043,
40407
+ "step": 5770
40408
+ },
40409
+ {
40410
+ "epoch": 1.9743414300376325,
40411
+ "grad_norm": 33.93965530395508,
40412
+ "learning_rate": 4.2031044435270063e-08,
40413
+ "loss": 21.3011,
40414
+ "step": 5771
40415
+ },
40416
+ {
40417
+ "epoch": 1.9746835443037973,
40418
+ "grad_norm": 37.01821517944336,
40419
+ "learning_rate": 4.0917840648241334e-08,
40420
+ "loss": 25.6067,
40421
+ "step": 5772
40422
+ },
40423
+ {
40424
+ "epoch": 1.9750256585699624,
40425
+ "grad_norm": 42.83258056640625,
40426
+ "learning_rate": 3.9819571094878993e-08,
40427
+ "loss": 25.7397,
40428
+ "step": 5773
40429
+ },
40430
+ {
40431
+ "epoch": 1.9753677728361274,
40432
+ "grad_norm": 8.533278465270996,
40433
+ "learning_rate": 3.873623610348709e-08,
40434
+ "loss": 19.5421,
40435
+ "step": 5774
40436
+ },
40437
+ {
40438
+ "epoch": 1.9757098871022922,
40439
+ "grad_norm": 6.471829414367676,
40440
+ "learning_rate": 3.766783599791213e-08,
40441
+ "loss": 18.0767,
40442
+ "step": 5775
40443
+ },
40444
+ {
40445
+ "epoch": 1.976052001368457,
40446
+ "grad_norm": 6.663585186004639,
40447
+ "learning_rate": 3.661437109752641e-08,
40448
+ "loss": 17.9218,
40449
+ "step": 5776
40450
+ },
40451
+ {
40452
+ "epoch": 1.9763941156346219,
40453
+ "grad_norm": 6.855971813201904,
40454
+ "learning_rate": 3.5575841717239157e-08,
40455
+ "loss": 18.6801,
40456
+ "step": 5777
40457
+ },
40458
+ {
40459
+ "epoch": 1.976736229900787,
40460
+ "grad_norm": 6.7459330558776855,
40461
+ "learning_rate": 3.455224816750757e-08,
40462
+ "loss": 17.6948,
40463
+ "step": 5778
40464
+ },
40465
+ {
40466
+ "epoch": 1.9770783441669517,
40467
+ "grad_norm": 6.8546552658081055,
40468
+ "learning_rate": 3.354359075430358e-08,
40469
+ "loss": 18.4751,
40470
+ "step": 5779
40471
+ },
40472
+ {
40473
+ "epoch": 1.9774204584331168,
40474
+ "grad_norm": 6.872806549072266,
40475
+ "learning_rate": 3.254986977914709e-08,
40476
+ "loss": 19.4655,
40477
+ "step": 5780
40478
+ },
40479
+ {
40480
+ "epoch": 1.9777625726992816,
40481
+ "grad_norm": 18.566171646118164,
40482
+ "learning_rate": 3.1571085539089384e-08,
40483
+ "loss": 19.6944,
40484
+ "step": 5781
40485
+ },
40486
+ {
40487
+ "epoch": 1.9781046869654464,
40488
+ "grad_norm": 6.615222454071045,
40489
+ "learning_rate": 3.0607238326724186e-08,
40490
+ "loss": 18.9052,
40491
+ "step": 5782
40492
+ },
40493
+ {
40494
+ "epoch": 1.9784468012316112,
40495
+ "grad_norm": 7.05173921585083,
40496
+ "learning_rate": 2.9658328430165472e-08,
40497
+ "loss": 19.9629,
40498
+ "step": 5783
40499
+ },
40500
+ {
40501
+ "epoch": 1.9787889154977762,
40502
+ "grad_norm": 6.960274696350098,
40503
+ "learning_rate": 2.8724356133075226e-08,
40504
+ "loss": 20.1454,
40505
+ "step": 5784
40506
+ },
40507
+ {
40508
+ "epoch": 1.9791310297639413,
40509
+ "grad_norm": 7.180263996124268,
40510
+ "learning_rate": 2.780532171464123e-08,
40511
+ "loss": 20.9793,
40512
+ "step": 5785
40513
+ },
40514
+ {
40515
+ "epoch": 1.979473144030106,
40516
+ "grad_norm": 7.336266040802002,
40517
+ "learning_rate": 2.6901225449593726e-08,
40518
+ "loss": 19.6709,
40519
+ "step": 5786
40520
+ },
40521
+ {
40522
+ "epoch": 1.979815258296271,
40523
+ "grad_norm": 8.403460502624512,
40524
+ "learning_rate": 2.6012067608194303e-08,
40525
+ "loss": 19.4493,
40526
+ "step": 5787
40527
+ },
40528
+ {
40529
+ "epoch": 1.9801573725624357,
40530
+ "grad_norm": 11.085399627685547,
40531
+ "learning_rate": 2.513784845623035e-08,
40532
+ "loss": 21.0099,
40533
+ "step": 5788
40534
+ },
40535
+ {
40536
+ "epoch": 1.9804994868286008,
40537
+ "grad_norm": 7.737322807312012,
40538
+ "learning_rate": 2.427856825504282e-08,
40539
+ "loss": 19.7549,
40540
+ "step": 5789
40541
+ },
40542
+ {
40543
+ "epoch": 1.9808416010947658,
40544
+ "grad_norm": 7.702394008636475,
40545
+ "learning_rate": 2.3434227261487362e-08,
40546
+ "loss": 20.8117,
40547
+ "step": 5790
40548
+ },
40549
+ {
40550
+ "epoch": 1.9811837153609306,
40551
+ "grad_norm": 7.653695106506348,
40552
+ "learning_rate": 2.2604825727962075e-08,
40553
+ "loss": 18.3907,
40554
+ "step": 5791
40555
+ },
40556
+ {
40557
+ "epoch": 1.9815258296270954,
40558
+ "grad_norm": 7.7871575355529785,
40559
+ "learning_rate": 2.179036390240752e-08,
40560
+ "loss": 19.3076,
40561
+ "step": 5792
40562
+ },
40563
+ {
40564
+ "epoch": 1.9818679438932603,
40565
+ "grad_norm": 8.504340171813965,
40566
+ "learning_rate": 2.0990842028284496e-08,
40567
+ "loss": 21.0861,
40568
+ "step": 5793
40569
+ },
40570
+ {
40571
+ "epoch": 1.9822100581594253,
40572
+ "grad_norm": 8.407926559448242,
40573
+ "learning_rate": 2.020626034459072e-08,
40574
+ "loss": 19.538,
40575
+ "step": 5794
40576
+ },
40577
+ {
40578
+ "epoch": 1.9825521724255901,
40579
+ "grad_norm": 8.925575256347656,
40580
+ "learning_rate": 1.943661908586636e-08,
40581
+ "loss": 20.1966,
40582
+ "step": 5795
40583
+ },
40584
+ {
40585
+ "epoch": 1.9828942866917552,
40586
+ "grad_norm": 8.412221908569336,
40587
+ "learning_rate": 1.8681918482177375e-08,
40588
+ "loss": 19.9603,
40589
+ "step": 5796
40590
+ },
40591
+ {
40592
+ "epoch": 1.98323640095792,
40593
+ "grad_norm": 9.20413589477539,
40594
+ "learning_rate": 1.7942158759126637e-08,
40595
+ "loss": 19.8316,
40596
+ "step": 5797
40597
+ },
40598
+ {
40599
+ "epoch": 1.9835785152240848,
40600
+ "grad_norm": 9.628589630126953,
40601
+ "learning_rate": 1.721734013784837e-08,
40602
+ "loss": 19.4429,
40603
+ "step": 5798
40604
+ },
40605
+ {
40606
+ "epoch": 1.9839206294902496,
40607
+ "grad_norm": 9.633048057556152,
40608
+ "learning_rate": 1.6507462835013697e-08,
40609
+ "loss": 19.0393,
40610
+ "step": 5799
40611
+ },
40612
+ {
40613
+ "epoch": 1.9842627437564146,
40614
+ "grad_norm": 12.57917308807373,
40615
+ "learning_rate": 1.581252706281955e-08,
40616
+ "loss": 21.2715,
40617
+ "step": 5800
40618
+ },
40619
+ {
40620
+ "epoch": 1.9846048580225797,
40621
+ "grad_norm": 10.881157875061035,
40622
+ "learning_rate": 1.5132533029016405e-08,
40623
+ "loss": 20.7091,
40624
+ "step": 5801
40625
+ },
40626
+ {
40627
+ "epoch": 1.9849469722887445,
40628
+ "grad_norm": 11.508024215698242,
40629
+ "learning_rate": 1.4467480936858347e-08,
40630
+ "loss": 20.5245,
40631
+ "step": 5802
40632
+ },
40633
+ {
40634
+ "epoch": 1.9852890865549093,
40635
+ "grad_norm": 12.641556739807129,
40636
+ "learning_rate": 1.3817370985164113e-08,
40637
+ "loss": 20.9665,
40638
+ "step": 5803
40639
+ },
40640
+ {
40641
+ "epoch": 1.9856312008210741,
40642
+ "grad_norm": 54.07820510864258,
40643
+ "learning_rate": 1.3182203368256041e-08,
40644
+ "loss": 21.8294,
40645
+ "step": 5804
40646
+ },
40647
+ {
40648
+ "epoch": 1.9859733150872392,
40649
+ "grad_norm": 13.773715019226074,
40650
+ "learning_rate": 1.2561978276015574e-08,
40651
+ "loss": 20.4164,
40652
+ "step": 5805
40653
+ },
40654
+ {
40655
+ "epoch": 1.986315429353404,
40656
+ "grad_norm": 13.61760425567627,
40657
+ "learning_rate": 1.1956695893844405e-08,
40658
+ "loss": 24.0563,
40659
+ "step": 5806
40660
+ },
40661
+ {
40662
+ "epoch": 1.986657543619569,
40663
+ "grad_norm": 14.020088195800781,
40664
+ "learning_rate": 1.1366356402670032e-08,
40665
+ "loss": 20.9675,
40666
+ "step": 5807
40667
+ },
40668
+ {
40669
+ "epoch": 1.9869996578857338,
40670
+ "grad_norm": 14.913092613220215,
40671
+ "learning_rate": 1.0790959978973503e-08,
40672
+ "loss": 21.9978,
40673
+ "step": 5808
40674
+ },
40675
+ {
40676
+ "epoch": 1.9873417721518987,
40677
+ "grad_norm": 15.699370384216309,
40678
+ "learning_rate": 1.0230506794750572e-08,
40679
+ "loss": 19.9998,
40680
+ "step": 5809
40681
+ },
40682
+ {
40683
+ "epoch": 1.9876838864180635,
40684
+ "grad_norm": 16.686582565307617,
40685
+ "learning_rate": 9.684997017544995e-09,
40686
+ "loss": 18.9121,
40687
+ "step": 5810
40688
+ },
40689
+ {
40690
+ "epoch": 1.9880260006842285,
40691
+ "grad_norm": 17.049999237060547,
40692
+ "learning_rate": 9.154430810415227e-09,
40693
+ "loss": 21.2301,
40694
+ "step": 5811
40695
+ },
40696
+ {
40697
+ "epoch": 1.9883681149503936,
40698
+ "grad_norm": 19.32858657836914,
40699
+ "learning_rate": 8.63880833197328e-09,
40700
+ "loss": 22.4003,
40701
+ "step": 5812
40702
+ },
40703
+ {
40704
+ "epoch": 1.9887102292165584,
40705
+ "grad_norm": 19.711584091186523,
40706
+ "learning_rate": 8.138129736340317e-09,
40707
+ "loss": 22.6676,
40708
+ "step": 5813
40709
+ },
40710
+ {
40711
+ "epoch": 1.9890523434827232,
40712
+ "grad_norm": 22.30181884765625,
40713
+ "learning_rate": 7.652395173202154e-09,
40714
+ "loss": 19.9137,
40715
+ "step": 5814
40716
+ },
40717
+ {
40718
+ "epoch": 1.989394457748888,
40719
+ "grad_norm": 33.89694595336914,
40720
+ "learning_rate": 7.181604787742657e-09,
40721
+ "loss": 22.0087,
40722
+ "step": 5815
40723
+ },
40724
+ {
40725
+ "epoch": 1.989736572015053,
40726
+ "grad_norm": 27.179582595825195,
40727
+ "learning_rate": 6.7257587207048e-09,
40728
+ "loss": 25.1015,
40729
+ "step": 5816
40730
+ },
40731
+ {
40732
+ "epoch": 1.990078686281218,
40733
+ "grad_norm": 22.285991668701172,
40734
+ "learning_rate": 6.284857108346254e-09,
40735
+ "loss": 18.6589,
40736
+ "step": 5817
40737
+ },
40738
+ {
40739
+ "epoch": 1.990420800547383,
40740
+ "grad_norm": 24.79540252685547,
40741
+ "learning_rate": 5.858900082472696e-09,
40742
+ "loss": 20.7354,
40743
+ "step": 5818
40744
+ },
40745
+ {
40746
+ "epoch": 1.9907629148135477,
40747
+ "grad_norm": 27.761415481567383,
40748
+ "learning_rate": 5.447887770415605e-09,
40749
+ "loss": 25.7395,
40750
+ "step": 5819
40751
+ },
40752
+ {
40753
+ "epoch": 1.9911050290797125,
40754
+ "grad_norm": 26.287521362304688,
40755
+ "learning_rate": 5.051820295032261e-09,
40756
+ "loss": 22.7141,
40757
+ "step": 5820
40758
+ },
40759
+ {
40760
+ "epoch": 1.9914471433458774,
40761
+ "grad_norm": 30.279993057250977,
40762
+ "learning_rate": 4.670697774722399e-09,
40763
+ "loss": 22.8432,
40764
+ "step": 5821
40765
+ },
40766
+ {
40767
+ "epoch": 1.9917892576120424,
40768
+ "grad_norm": 34.489742279052734,
40769
+ "learning_rate": 4.3045203234115535e-09,
40770
+ "loss": 21.1695,
40771
+ "step": 5822
40772
+ },
40773
+ {
40774
+ "epoch": 1.9921313718782074,
40775
+ "grad_norm": 42.58619689941406,
40776
+ "learning_rate": 3.953288050567716e-09,
40777
+ "loss": 27.3762,
40778
+ "step": 5823
40779
+ },
40780
+ {
40781
+ "epoch": 1.9924734861443723,
40782
+ "grad_norm": 6.662477970123291,
40783
+ "learning_rate": 3.617001061179126e-09,
40784
+ "loss": 18.4796,
40785
+ "step": 5824
40786
+ },
40787
+ {
40788
+ "epoch": 1.992815600410537,
40789
+ "grad_norm": 9.788829803466797,
40790
+ "learning_rate": 3.2956594557764785e-09,
40791
+ "loss": 17.4526,
40792
+ "step": 5825
40793
+ },
40794
+ {
40795
+ "epoch": 1.9931577146767019,
40796
+ "grad_norm": 6.742092609405518,
40797
+ "learning_rate": 2.9892633304107186e-09,
40798
+ "loss": 19.3224,
40799
+ "step": 5826
40800
+ },
40801
+ {
40802
+ "epoch": 1.993499828942867,
40803
+ "grad_norm": 12.77696418762207,
40804
+ "learning_rate": 2.697812776680797e-09,
40805
+ "loss": 18.8464,
40806
+ "step": 5827
40807
+ },
40808
+ {
40809
+ "epoch": 1.993841943209032,
40810
+ "grad_norm": 7.268036365509033,
40811
+ "learning_rate": 2.4213078817059143e-09,
40812
+ "loss": 19.2486,
40813
+ "step": 5828
40814
+ },
40815
+ {
40816
+ "epoch": 1.9941840574751968,
40817
+ "grad_norm": 6.920243740081787,
40818
+ "learning_rate": 2.1597487281366235e-09,
40819
+ "loss": 20.0134,
40820
+ "step": 5829
40821
+ },
40822
+ {
40823
+ "epoch": 1.9945261717413616,
40824
+ "grad_norm": 7.157983779907227,
40825
+ "learning_rate": 1.9131353941714836e-09,
40826
+ "loss": 20.9693,
40827
+ "step": 5830
40828
+ },
40829
+ {
40830
+ "epoch": 1.9948682860075264,
40831
+ "grad_norm": 7.626955986022949,
40832
+ "learning_rate": 1.681467953518201e-09,
40833
+ "loss": 19.7387,
40834
+ "step": 5831
40835
+ },
40836
+ {
40837
+ "epoch": 1.9952104002736915,
40838
+ "grad_norm": 7.880003929138184,
40839
+ "learning_rate": 1.4647464754380391e-09,
40840
+ "loss": 20.6881,
40841
+ "step": 5832
40842
+ },
40843
+ {
40844
+ "epoch": 1.9955525145398563,
40845
+ "grad_norm": 8.679123878479004,
40846
+ "learning_rate": 1.2629710247180627e-09,
40847
+ "loss": 19.434,
40848
+ "step": 5833
40849
+ },
40850
+ {
40851
+ "epoch": 1.9958946288060213,
40852
+ "grad_norm": 10.085281372070312,
40853
+ "learning_rate": 1.076141661660035e-09,
40854
+ "loss": 20.0026,
40855
+ "step": 5834
40856
+ },
40857
+ {
40858
+ "epoch": 1.9962367430721861,
40859
+ "grad_norm": 10.148959159851074,
40860
+ "learning_rate": 9.04258442130379e-10,
40861
+ "loss": 19.6373,
40862
+ "step": 5835
40863
+ },
40864
+ {
40865
+ "epoch": 1.996578857338351,
40866
+ "grad_norm": 11.545489311218262,
40867
+ "learning_rate": 7.473214174935628e-10,
40868
+ "loss": 20.4251,
40869
+ "step": 5836
40870
+ },
40871
+ {
40872
+ "epoch": 1.9969209716045158,
40873
+ "grad_norm": 14.113458633422852,
40874
+ "learning_rate": 6.053306346787135e-10,
40875
+ "loss": 23.9484,
40876
+ "step": 5837
40877
+ },
40878
+ {
40879
+ "epoch": 1.9972630858706808,
40880
+ "grad_norm": 13.124404907226562,
40881
+ "learning_rate": 4.782861361185554e-10,
40882
+ "loss": 21.1148,
40883
+ "step": 5838
40884
+ },
40885
+ {
40886
+ "epoch": 1.9976052001368458,
40887
+ "grad_norm": 15.107348442077637,
40888
+ "learning_rate": 3.661879597938178e-10,
40889
+ "loss": 20.0414,
40890
+ "step": 5839
40891
+ },
40892
+ {
40893
+ "epoch": 1.9979473144030107,
40894
+ "grad_norm": 14.699893951416016,
40895
+ "learning_rate": 2.690361392221341e-10,
40896
+ "loss": 18.2349,
40897
+ "step": 5840
40898
+ },
40899
+ {
40900
+ "epoch": 1.9982894286691755,
40901
+ "grad_norm": 19.990739822387695,
40902
+ "learning_rate": 1.868307034302852e-10,
40903
+ "loss": 22.159,
40904
+ "step": 5841
40905
+ },
40906
+ {
40907
+ "epoch": 1.9986315429353403,
40908
+ "grad_norm": 22.302776336669922,
40909
+ "learning_rate": 1.1957167700415996e-10,
40910
+ "loss": 24.7348,
40911
+ "step": 5842
40912
+ },
40913
+ {
40914
+ "epoch": 1.9989736572015053,
40915
+ "grad_norm": 23.003162384033203,
40916
+ "learning_rate": 6.725908004434622e-11,
40917
+ "loss": 24.44,
40918
+ "step": 5843
40919
+ },
40920
+ {
40921
+ "epoch": 1.9993157714676704,
40922
+ "grad_norm": 25.137348175048828,
40923
+ "learning_rate": 2.9892928188335335e-11,
40924
+ "loss": 23.8174,
40925
+ "step": 5844
40926
+ },
40927
+ {
40928
+ "epoch": 1.9996578857338352,
40929
+ "grad_norm": 31.00333023071289,
40930
+ "learning_rate": 7.473232604970903e-12,
40931
+ "loss": 22.5995,
40932
+ "step": 5845
40933
+ },
40934
+ {
40935
+ "epoch": 2.0,
40936
+ "grad_norm": 44.4307975769043,
40937
+ "learning_rate": 0.0,
40938
+ "loss": 32.1773,
40939
+ "step": 5846
40940
  }
40941
  ],
40942
  "logging_steps": 1,
 
40951
  "should_evaluate": false,
40952
  "should_log": false,
40953
  "should_save": true,
40954
+ "should_training_stop": true
40955
  },
40956
  "attributes": {}
40957
  }
40958
  },
40959
+ "total_flos": 1.8484734148686643e+17,
40960
  "train_batch_size": 4,
40961
  "trial_name": null,
40962
  "trial_params": null