fats-fme committed (verified)
Commit 0ac1fbe · 1 Parent(s): 02cfd10

Training in progress, step 563, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:efedcd7712efe5df4242d40d0fc157567550dc57198de0fde11a067a253c3786
+ oid sha256:a8c595720a41e9384906f2d3e480d3aa304689f4e0612ea898704da5876df4ce
  size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2a3093ef84d124bf4f3a388a3f58cedd89b5fbf3ec80a866e1189f65649a0f5e
+ oid sha256:aa85267159afa7ee96961d8e086ff9125093d3784893802d3ad17d9a528ea772
  size 203713238
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9e52b4ddcd925a725a65812af6610fe4debc708c6e4fc1ee7e0e17160e2a6fc5
+ oid sha256:da5aca99bcde1ec8b0dc9a1dd61af6a832b8ca17d5e8974363414a00fe156561
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7d9aa8c4c4812086f9a0cd74c7d98dc727224f492c2c8deb8168a9fa04e2846e
+ oid sha256:6355acd2ed897b92e4c2ba4445c30b5ccb7ab5d77d60dc3141cf3e52bd674a29
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b2bb049f58262ac24b66ea8e4bbb35c588cda72b0f20c7495d16197e65e5d114
+ oid sha256:1aa6a07fde8b7b9172b2dbbfa971ec598626e705bbfe8cea4899774e3eba905a
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.751165371809101,
+ "epoch": 0.9997780244173141,
  "eval_steps": 141,
- "global_step": 423,
+ "global_step": 563,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3000,6 +3000,986 @@
  "eval_samples_per_second": 6.06,
  "eval_steps_per_second": 1.52,
  "step": 423
3003
+ },
3004
+ {
3005
+ "epoch": 0.7529411764705882,
3006
+ "grad_norm": 0.44857800006866455,
3007
+ "learning_rate": 3.4094199810279924e-05,
3008
+ "loss": 0.6288,
3009
+ "step": 424
3010
+ },
3011
+ {
3012
+ "epoch": 0.7547169811320755,
3013
+ "grad_norm": 0.564264178276062,
3014
+ "learning_rate": 3.363486013983788e-05,
3015
+ "loss": 0.784,
3016
+ "step": 425
3017
+ },
3018
+ {
3019
+ "epoch": 0.7564927857935627,
3020
+ "grad_norm": 0.26752522587776184,
3021
+ "learning_rate": 3.317800934775696e-05,
3022
+ "loss": 0.8435,
3023
+ "step": 426
3024
+ },
3025
+ {
3026
+ "epoch": 0.7582685904550499,
3027
+ "grad_norm": 0.2690868377685547,
3028
+ "learning_rate": 3.2723664567219626e-05,
3029
+ "loss": 0.8125,
3030
+ "step": 427
3031
+ },
3032
+ {
3033
+ "epoch": 0.7600443951165372,
3034
+ "grad_norm": 0.28497472405433655,
3035
+ "learning_rate": 3.227184283742591e-05,
3036
+ "loss": 0.7349,
3037
+ "step": 428
3038
+ },
3039
+ {
3040
+ "epoch": 0.7618201997780244,
3041
+ "grad_norm": 0.3184243142604828,
3042
+ "learning_rate": 3.182256110295437e-05,
3043
+ "loss": 0.8037,
3044
+ "step": 429
3045
+ },
3046
+ {
3047
+ "epoch": 0.7635960044395117,
3048
+ "grad_norm": 0.29851233959198,
3049
+ "learning_rate": 3.137583621312665e-05,
3050
+ "loss": 0.7631,
3051
+ "step": 430
3052
+ },
3053
+ {
3054
+ "epoch": 0.7653718091009989,
3055
+ "grad_norm": 0.31429582834243774,
3056
+ "learning_rate": 3.093168492137557e-05,
3057
+ "loss": 0.7446,
3058
+ "step": 431
3059
+ },
3060
+ {
3061
+ "epoch": 0.7671476137624861,
3062
+ "grad_norm": 0.3191598653793335,
3063
+ "learning_rate": 3.0490123884616796e-05,
3064
+ "loss": 0.7301,
3065
+ "step": 432
3066
+ },
3067
+ {
3068
+ "epoch": 0.7689234184239734,
3069
+ "grad_norm": 0.32959750294685364,
3070
+ "learning_rate": 3.0051169662624225e-05,
3071
+ "loss": 0.7891,
3072
+ "step": 433
3073
+ },
3074
+ {
3075
+ "epoch": 0.7706992230854606,
3076
+ "grad_norm": 0.306445449590683,
3077
+ "learning_rate": 2.9614838717408867e-05,
3078
+ "loss": 0.7205,
3079
+ "step": 434
3080
+ },
3081
+ {
3082
+ "epoch": 0.7724750277469479,
3083
+ "grad_norm": 0.3152560889720917,
3084
+ "learning_rate": 2.9181147412601562e-05,
3085
+ "loss": 0.7436,
3086
+ "step": 435
3087
+ },
3088
+ {
3089
+ "epoch": 0.7742508324084351,
3090
+ "grad_norm": 0.3343624174594879,
3091
+ "learning_rate": 2.8750112012839214e-05,
3092
+ "loss": 0.7004,
3093
+ "step": 436
3094
+ },
3095
+ {
3096
+ "epoch": 0.7760266370699223,
3097
+ "grad_norm": 0.3389514982700348,
3098
+ "learning_rate": 2.8321748683154893e-05,
3099
+ "loss": 0.7037,
3100
+ "step": 437
3101
+ },
3102
+ {
3103
+ "epoch": 0.7778024417314096,
3104
+ "grad_norm": 0.36133134365081787,
3105
+ "learning_rate": 2.789607348837153e-05,
3106
+ "loss": 0.7648,
3107
+ "step": 438
3108
+ },
3109
+ {
3110
+ "epoch": 0.7795782463928967,
3111
+ "grad_norm": 0.3495211601257324,
3112
+ "learning_rate": 2.7473102392499518e-05,
3113
+ "loss": 0.7668,
3114
+ "step": 439
3115
+ },
3116
+ {
3117
+ "epoch": 0.781354051054384,
3118
+ "grad_norm": 0.3677636384963989,
3119
+ "learning_rate": 2.7052851258137935e-05,
3120
+ "loss": 0.7267,
3121
+ "step": 440
3122
+ },
3123
+ {
3124
+ "epoch": 0.7831298557158712,
3125
+ "grad_norm": 0.36717966198921204,
3126
+ "learning_rate": 2.6635335845879737e-05,
3127
+ "loss": 0.7577,
3128
+ "step": 441
3129
+ },
3130
+ {
3131
+ "epoch": 0.7849056603773585,
3132
+ "grad_norm": 0.38047298789024353,
3133
+ "learning_rate": 2.622057181372063e-05,
3134
+ "loss": 0.6682,
3135
+ "step": 442
3136
+ },
3137
+ {
3138
+ "epoch": 0.7866814650388457,
3139
+ "grad_norm": 0.37885257601737976,
3140
+ "learning_rate": 2.5808574716471856e-05,
3141
+ "loss": 0.7454,
3142
+ "step": 443
3143
+ },
3144
+ {
3145
+ "epoch": 0.7884572697003329,
3146
+ "grad_norm": 0.37553516030311584,
3147
+ "learning_rate": 2.5399360005176886e-05,
3148
+ "loss": 0.6721,
3149
+ "step": 444
3150
+ },
3151
+ {
3152
+ "epoch": 0.7902330743618202,
3153
+ "grad_norm": 0.42358729243278503,
3154
+ "learning_rate": 2.4992943026531935e-05,
3155
+ "loss": 0.7339,
3156
+ "step": 445
3157
+ },
3158
+ {
3159
+ "epoch": 0.7920088790233074,
3160
+ "grad_norm": 0.3923121392726898,
3161
+ "learning_rate": 2.4589339022310386e-05,
3162
+ "loss": 0.6952,
3163
+ "step": 446
3164
+ },
3165
+ {
3166
+ "epoch": 0.7937846836847947,
3167
+ "grad_norm": 0.39447784423828125,
3168
+ "learning_rate": 2.4188563128791254e-05,
3169
+ "loss": 0.6898,
3170
+ "step": 447
3171
+ },
3172
+ {
3173
+ "epoch": 0.7955604883462819,
3174
+ "grad_norm": 0.4486071467399597,
3175
+ "learning_rate": 2.379063037619146e-05,
3176
+ "loss": 0.6485,
3177
+ "step": 448
3178
+ },
3179
+ {
3180
+ "epoch": 0.7973362930077691,
3181
+ "grad_norm": 0.47466063499450684,
3182
+ "learning_rate": 2.339555568810221e-05,
3183
+ "loss": 0.6961,
3184
+ "step": 449
3185
+ },
3186
+ {
3187
+ "epoch": 0.7991120976692564,
3188
+ "grad_norm": 0.5662741661071777,
3189
+ "learning_rate": 2.300335388092929e-05,
3190
+ "loss": 0.7295,
3191
+ "step": 450
3192
+ },
3193
+ {
3194
+ "epoch": 0.8008879023307436,
3195
+ "grad_norm": 0.28682488203048706,
3196
+ "learning_rate": 2.2614039663337417e-05,
3197
+ "loss": 0.8068,
3198
+ "step": 451
3199
+ },
3200
+ {
3201
+ "epoch": 0.8026637069922309,
3202
+ "grad_norm": 0.2780812680721283,
3203
+ "learning_rate": 2.222762763569862e-05,
3204
+ "loss": 0.8236,
3205
+ "step": 452
3206
+ },
3207
+ {
3208
+ "epoch": 0.8044395116537181,
3209
+ "grad_norm": 0.29560670256614685,
3210
+ "learning_rate": 2.184413228954468e-05,
3211
+ "loss": 0.7894,
3212
+ "step": 453
3213
+ },
3214
+ {
3215
+ "epoch": 0.8062153163152054,
3216
+ "grad_norm": 0.2896682918071747,
3217
+ "learning_rate": 2.1463568007023704e-05,
3218
+ "loss": 0.7534,
3219
+ "step": 454
3220
+ },
3221
+ {
3222
+ "epoch": 0.8079911209766926,
3223
+ "grad_norm": 0.2878231108188629,
3224
+ "learning_rate": 2.1085949060360654e-05,
3225
+ "loss": 0.7245,
3226
+ "step": 455
3227
+ },
3228
+ {
3229
+ "epoch": 0.8097669256381798,
3230
+ "grad_norm": 0.3214217722415924,
3231
+ "learning_rate": 2.0711289611322204e-05,
3232
+ "loss": 0.7731,
3233
+ "step": 456
3234
+ },
3235
+ {
3236
+ "epoch": 0.8115427302996671,
3237
+ "grad_norm": 0.3201158940792084,
3238
+ "learning_rate": 2.033960371068557e-05,
3239
+ "loss": 0.7475,
3240
+ "step": 457
3241
+ },
3242
+ {
3243
+ "epoch": 0.8133185349611542,
3244
+ "grad_norm": 0.33665937185287476,
3245
+ "learning_rate": 1.9970905297711606e-05,
3246
+ "loss": 0.721,
3247
+ "step": 458
3248
+ },
3249
+ {
3250
+ "epoch": 0.8150943396226416,
3251
+ "grad_norm": 0.3305956721305847,
3252
+ "learning_rate": 1.9605208199621995e-05,
3253
+ "loss": 0.7249,
3254
+ "step": 459
3255
+ },
3256
+ {
3257
+ "epoch": 0.8168701442841287,
3258
+ "grad_norm": 0.3384665548801422,
3259
+ "learning_rate": 1.924252613108073e-05,
3260
+ "loss": 0.724,
3261
+ "step": 460
3262
+ },
3263
+ {
3264
+ "epoch": 0.8186459489456159,
3265
+ "grad_norm": 0.359944224357605,
3266
+ "learning_rate": 1.888287269367979e-05,
3267
+ "loss": 0.7516,
3268
+ "step": 461
3269
+ },
3270
+ {
3271
+ "epoch": 0.8204217536071032,
3272
+ "grad_norm": 0.3357195556163788,
3273
+ "learning_rate": 1.8526261375428955e-05,
3274
+ "loss": 0.7327,
3275
+ "step": 462
3276
+ },
3277
+ {
3278
+ "epoch": 0.8221975582685904,
3279
+ "grad_norm": 0.34272313117980957,
3280
+ "learning_rate": 1.8172705550250092e-05,
3281
+ "loss": 0.7161,
3282
+ "step": 463
3283
+ },
3284
+ {
3285
+ "epoch": 0.8239733629300777,
3286
+ "grad_norm": 0.32507383823394775,
3287
+ "learning_rate": 1.7822218477475494e-05,
3288
+ "loss": 0.6392,
3289
+ "step": 464
3290
+ },
3291
+ {
3292
+ "epoch": 0.8257491675915649,
3293
+ "grad_norm": 0.37359514832496643,
3294
+ "learning_rate": 1.7474813301350666e-05,
3295
+ "loss": 0.7298,
3296
+ "step": 465
3297
+ },
3298
+ {
3299
+ "epoch": 0.8275249722530522,
3300
+ "grad_norm": 0.3348104655742645,
3301
+ "learning_rate": 1.7130503050541368e-05,
3302
+ "loss": 0.6568,
3303
+ "step": 466
3304
+ },
3305
+ {
3306
+ "epoch": 0.8293007769145394,
3307
+ "grad_norm": 0.3460003435611725,
3308
+ "learning_rate": 1.6789300637645e-05,
3309
+ "loss": 0.6742,
3310
+ "step": 467
3311
+ },
3312
+ {
3313
+ "epoch": 0.8310765815760266,
3314
+ "grad_norm": 0.3993259370326996,
3315
+ "learning_rate": 1.6451218858706374e-05,
3316
+ "loss": 0.7365,
3317
+ "step": 468
3318
+ },
3319
+ {
3320
+ "epoch": 0.8328523862375139,
3321
+ "grad_norm": 0.3710460960865021,
3322
+ "learning_rate": 1.6116270392737754e-05,
3323
+ "loss": 0.699,
3324
+ "step": 469
3325
+ },
3326
+ {
3327
+ "epoch": 0.8346281908990011,
3328
+ "grad_norm": 0.41104644536972046,
3329
+ "learning_rate": 1.578446780124344e-05,
3330
+ "loss": 0.7185,
3331
+ "step": 470
3332
+ },
3333
+ {
3334
+ "epoch": 0.8364039955604884,
3335
+ "grad_norm": 0.39822298288345337,
3336
+ "learning_rate": 1.5455823527748626e-05,
3337
+ "loss": 0.6968,
3338
+ "step": 471
3339
+ },
3340
+ {
3341
+ "epoch": 0.8381798002219756,
3342
+ "grad_norm": 0.3909063935279846,
3343
+ "learning_rate": 1.5130349897332763e-05,
3344
+ "loss": 0.6427,
3345
+ "step": 472
3346
+ },
3347
+ {
3348
+ "epoch": 0.8399556048834628,
3349
+ "grad_norm": 0.39908355474472046,
3350
+ "learning_rate": 1.4808059116167305e-05,
3351
+ "loss": 0.6307,
3352
+ "step": 473
3353
+ },
3354
+ {
3355
+ "epoch": 0.8417314095449501,
3356
+ "grad_norm": 0.4725106954574585,
3357
+ "learning_rate": 1.4488963271057943e-05,
3358
+ "loss": 0.7274,
3359
+ "step": 474
3360
+ },
3361
+ {
3362
+ "epoch": 0.8435072142064373,
3363
+ "grad_norm": 0.58518385887146,
3364
+ "learning_rate": 1.4173074328991377e-05,
3365
+ "loss": 0.7112,
3366
+ "step": 475
3367
+ },
3368
+ {
3369
+ "epoch": 0.8452830188679246,
3370
+ "grad_norm": 0.2664174735546112,
3371
+ "learning_rate": 1.3860404136686411e-05,
3372
+ "loss": 0.8515,
3373
+ "step": 476
3374
+ },
3375
+ {
3376
+ "epoch": 0.8470588235294118,
3377
+ "grad_norm": 0.30460578203201294,
3378
+ "learning_rate": 1.355096442014977e-05,
3379
+ "loss": 0.8107,
3380
+ "step": 477
3381
+ },
3382
+ {
3383
+ "epoch": 0.848834628190899,
3384
+ "grad_norm": 0.28965044021606445,
3385
+ "learning_rate": 1.3244766784236307e-05,
3386
+ "loss": 0.7361,
3387
+ "step": 478
3388
+ },
3389
+ {
3390
+ "epoch": 0.8506104328523862,
3391
+ "grad_norm": 0.3329102396965027,
3392
+ "learning_rate": 1.294182271221377e-05,
3393
+ "loss": 0.7712,
3394
+ "step": 479
3395
+ },
3396
+ {
3397
+ "epoch": 0.8523862375138734,
3398
+ "grad_norm": 0.30333012342453003,
3399
+ "learning_rate": 1.2642143565332154e-05,
3400
+ "loss": 0.7245,
3401
+ "step": 480
3402
+ },
3403
+ {
3404
+ "epoch": 0.8541620421753607,
3405
+ "grad_norm": 0.328744500875473,
3406
+ "learning_rate": 1.2345740582397648e-05,
3407
+ "loss": 0.7557,
3408
+ "step": 481
3409
+ },
3410
+ {
3411
+ "epoch": 0.8559378468368479,
3412
+ "grad_norm": 0.33845219016075134,
3413
+ "learning_rate": 1.2052624879351104e-05,
3414
+ "loss": 0.7857,
3415
+ "step": 482
3416
+ },
3417
+ {
3418
+ "epoch": 0.8577136514983352,
3419
+ "grad_norm": 0.3305346667766571,
3420
+ "learning_rate": 1.176280744885121e-05,
3421
+ "loss": 0.7512,
3422
+ "step": 483
3423
+ },
3424
+ {
3425
+ "epoch": 0.8594894561598224,
3426
+ "grad_norm": 0.340707391500473,
3427
+ "learning_rate": 1.1476299159862203e-05,
3428
+ "loss": 0.7678,
3429
+ "step": 484
3430
+ },
3431
+ {
3432
+ "epoch": 0.8612652608213096,
3433
+ "grad_norm": 0.3646427392959595,
3434
+ "learning_rate": 1.119311075724625e-05,
3435
+ "loss": 0.7473,
3436
+ "step": 485
3437
+ },
3438
+ {
3439
+ "epoch": 0.8630410654827969,
3440
+ "grad_norm": 0.35034942626953125,
3441
+ "learning_rate": 1.09132528613605e-05,
3442
+ "loss": 0.7679,
3443
+ "step": 486
3444
+ },
3445
+ {
3446
+ "epoch": 0.8648168701442841,
3447
+ "grad_norm": 0.35471147298812866,
3448
+ "learning_rate": 1.0636735967658784e-05,
3449
+ "loss": 0.7416,
3450
+ "step": 487
3451
+ },
3452
+ {
3453
+ "epoch": 0.8665926748057714,
3454
+ "grad_norm": 0.3597537875175476,
3455
+ "learning_rate": 1.0363570446297999e-05,
3456
+ "loss": 0.7125,
3457
+ "step": 488
3458
+ },
3459
+ {
3460
+ "epoch": 0.8683684794672586,
3461
+ "grad_norm": 0.35092103481292725,
3462
+ "learning_rate": 1.0093766541749205e-05,
3463
+ "loss": 0.692,
3464
+ "step": 489
3465
+ },
3466
+ {
3467
+ "epoch": 0.8701442841287459,
3468
+ "grad_norm": 0.35275334119796753,
3469
+ "learning_rate": 9.827334372413444e-06,
3470
+ "loss": 0.6683,
3471
+ "step": 490
3472
+ },
3473
+ {
3474
+ "epoch": 0.8719200887902331,
3475
+ "grad_norm": 0.3727843463420868,
3476
+ "learning_rate": 9.564283930242257e-06,
3477
+ "loss": 0.665,
3478
+ "step": 491
3479
+ },
3480
+ {
3481
+ "epoch": 0.8736958934517203,
3482
+ "grad_norm": 0.3570787310600281,
3483
+ "learning_rate": 9.30462508036294e-06,
3484
+ "loss": 0.6736,
3485
+ "step": 492
3486
+ },
3487
+ {
3488
+ "epoch": 0.8754716981132076,
3489
+ "grad_norm": 0.39428988099098206,
3490
+ "learning_rate": 9.048367560708604e-06,
3491
+ "loss": 0.7076,
3492
+ "step": 493
3493
+ },
3494
+ {
3495
+ "epoch": 0.8772475027746948,
3496
+ "grad_norm": 0.3717636168003082,
3497
+ "learning_rate": 8.795520981652961e-06,
3498
+ "loss": 0.6807,
3499
+ "step": 494
3500
+ },
3501
+ {
3502
+ "epoch": 0.8790233074361821,
3503
+ "grad_norm": 0.4105593264102936,
3504
+ "learning_rate": 8.546094825649908e-06,
3505
+ "loss": 0.7068,
3506
+ "step": 495
3507
+ },
3508
+ {
3509
+ "epoch": 0.8807991120976693,
3510
+ "grad_norm": 0.45720747113227844,
3511
+ "learning_rate": 8.300098446877923e-06,
3512
+ "loss": 0.7189,
3513
+ "step": 496
3514
+ },
3515
+ {
3516
+ "epoch": 0.8825749167591564,
3517
+ "grad_norm": 0.44911620020866394,
3518
+ "learning_rate": 8.05754107088923e-06,
3519
+ "loss": 0.6891,
3520
+ "step": 497
3521
+ },
3522
+ {
3523
+ "epoch": 0.8843507214206437,
3524
+ "grad_norm": 0.4414433240890503,
3525
+ "learning_rate": 7.818431794263836e-06,
3526
+ "loss": 0.7167,
3527
+ "step": 498
3528
+ },
3529
+ {
3530
+ "epoch": 0.8861265260821309,
3531
+ "grad_norm": 0.49208080768585205,
3532
+ "learning_rate": 7.582779584268373e-06,
3533
+ "loss": 0.7084,
3534
+ "step": 499
3535
+ },
3536
+ {
3537
+ "epoch": 0.8879023307436182,
3538
+ "grad_norm": 0.5831857323646545,
3539
+ "learning_rate": 7.350593278519824e-06,
3540
+ "loss": 0.7877,
3541
+ "step": 500
3542
+ },
3543
+ {
3544
+ "epoch": 0.8896781354051054,
3545
+ "grad_norm": 0.25118499994277954,
3546
+ "learning_rate": 7.121881584654056e-06,
3547
+ "loss": 0.8006,
3548
+ "step": 501
3549
+ },
3550
+ {
3551
+ "epoch": 0.8914539400665926,
3552
+ "grad_norm": 0.2842087745666504,
3553
+ "learning_rate": 6.896653079999249e-06,
3554
+ "loss": 0.7796,
3555
+ "step": 502
3556
+ },
3557
+ {
3558
+ "epoch": 0.8932297447280799,
3559
+ "grad_norm": 0.2935945391654968,
3560
+ "learning_rate": 6.674916211254289e-06,
3561
+ "loss": 0.7614,
3562
+ "step": 503
3563
+ },
3564
+ {
3565
+ "epoch": 0.8950055493895671,
3566
+ "grad_norm": 0.3194078505039215,
3567
+ "learning_rate": 6.45667929417193e-06,
3568
+ "loss": 0.7537,
3569
+ "step": 504
3570
+ },
3571
+ {
3572
+ "epoch": 0.8967813540510544,
3573
+ "grad_norm": 0.32085007429122925,
3574
+ "learning_rate": 6.2419505132469305e-06,
3575
+ "loss": 0.7843,
3576
+ "step": 505
3577
+ },
3578
+ {
3579
+ "epoch": 0.8985571587125416,
3580
+ "grad_norm": 0.32116949558258057,
3581
+ "learning_rate": 6.030737921409169e-06,
3582
+ "loss": 0.736,
3583
+ "step": 506
3584
+ },
3585
+ {
3586
+ "epoch": 0.9003329633740289,
3587
+ "grad_norm": 0.32136133313179016,
3588
+ "learning_rate": 5.823049439721561e-06,
3589
+ "loss": 0.7388,
3590
+ "step": 507
3591
+ },
3592
+ {
3593
+ "epoch": 0.9021087680355161,
3594
+ "grad_norm": 0.33068105578422546,
3595
+ "learning_rate": 5.618892857083069e-06,
3596
+ "loss": 0.6994,
3597
+ "step": 508
3598
+ },
3599
+ {
3600
+ "epoch": 0.9038845726970033,
3601
+ "grad_norm": 0.3484742343425751,
3602
+ "learning_rate": 5.418275829936537e-06,
3603
+ "loss": 0.7431,
3604
+ "step": 509
3605
+ },
3606
+ {
3607
+ "epoch": 0.9056603773584906,
3608
+ "grad_norm": 0.35299912095069885,
3609
+ "learning_rate": 5.221205881981595e-06,
3610
+ "loss": 0.7568,
3611
+ "step": 510
3612
+ },
3613
+ {
3614
+ "epoch": 0.9074361820199778,
3615
+ "grad_norm": 0.34243056178092957,
3616
+ "learning_rate": 5.02769040389246e-06,
3617
+ "loss": 0.6817,
3618
+ "step": 511
3619
+ },
3620
+ {
3621
+ "epoch": 0.9092119866814651,
3622
+ "grad_norm": 0.38018926978111267,
3623
+ "learning_rate": 4.8377366530408254e-06,
3624
+ "loss": 0.759,
3625
+ "step": 512
3626
+ },
3627
+ {
3628
+ "epoch": 0.9109877913429523,
3629
+ "grad_norm": 0.36619412899017334,
3630
+ "learning_rate": 4.65135175322361e-06,
3631
+ "loss": 0.7202,
3632
+ "step": 513
3633
+ },
3634
+ {
3635
+ "epoch": 0.9127635960044395,
3636
+ "grad_norm": 0.38696765899658203,
3637
+ "learning_rate": 4.468542694395861e-06,
3638
+ "loss": 0.7202,
3639
+ "step": 514
3640
+ },
3641
+ {
3642
+ "epoch": 0.9145394006659268,
3643
+ "grad_norm": 0.39391985535621643,
3644
+ "learning_rate": 4.2893163324085885e-06,
3645
+ "loss": 0.7091,
3646
+ "step": 515
3647
+ },
3648
+ {
3649
+ "epoch": 0.916315205327414,
3650
+ "grad_norm": 0.37037429213523865,
3651
+ "learning_rate": 4.1136793887516345e-06,
3652
+ "loss": 0.6974,
3653
+ "step": 516
3654
+ },
3655
+ {
3656
+ "epoch": 0.9180910099889013,
3657
+ "grad_norm": 0.39087411761283875,
3658
+ "learning_rate": 3.941638450301644e-06,
3659
+ "loss": 0.7328,
3660
+ "step": 517
3661
+ },
3662
+ {
3663
+ "epoch": 0.9198668146503884,
3664
+ "grad_norm": 0.38174766302108765,
3665
+ "learning_rate": 3.7731999690749585e-06,
3666
+ "loss": 0.7184,
3667
+ "step": 518
3668
+ },
3669
+ {
3670
+ "epoch": 0.9216426193118757,
3671
+ "grad_norm": 0.3953598737716675,
3672
+ "learning_rate": 3.6083702619857605e-06,
3673
+ "loss": 0.7121,
3674
+ "step": 519
3675
+ },
3676
+ {
3677
+ "epoch": 0.9234184239733629,
3678
+ "grad_norm": 0.39504143595695496,
3679
+ "learning_rate": 3.447155510609057e-06,
3680
+ "loss": 0.6665,
3681
+ "step": 520
3682
+ },
3683
+ {
3684
+ "epoch": 0.9251942286348501,
3685
+ "grad_norm": 0.4065762162208557,
3686
+ "learning_rate": 3.2895617609489336e-06,
3687
+ "loss": 0.7019,
3688
+ "step": 521
3689
+ },
3690
+ {
3691
+ "epoch": 0.9269700332963374,
3692
+ "grad_norm": 0.39577242732048035,
3693
+ "learning_rate": 3.135594923211771e-06,
3694
+ "loss": 0.6444,
3695
+ "step": 522
3696
+ },
3697
+ {
3698
+ "epoch": 0.9287458379578246,
3699
+ "grad_norm": 0.4378613531589508,
3700
+ "learning_rate": 2.9852607715846193e-06,
3701
+ "loss": 0.7066,
3702
+ "step": 523
3703
+ },
3704
+ {
3705
+ "epoch": 0.9305216426193119,
3706
+ "grad_norm": 0.4423007369041443,
3707
+ "learning_rate": 2.838564944018618e-06,
3708
+ "loss": 0.6555,
3709
+ "step": 524
3710
+ },
3711
+ {
3712
+ "epoch": 0.9322974472807991,
3713
+ "grad_norm": 0.5693342089653015,
3714
+ "learning_rate": 2.6955129420176196e-06,
3715
+ "loss": 0.8152,
3716
+ "step": 525
3717
+ },
3718
+ {
3719
+ "epoch": 0.9340732519422863,
3720
+ "grad_norm": 0.2593931555747986,
3721
+ "learning_rate": 2.556110130431788e-06,
3722
+ "loss": 0.813,
3723
+ "step": 526
3724
+ },
3725
+ {
3726
+ "epoch": 0.9358490566037736,
3727
+ "grad_norm": 0.30711933970451355,
3728
+ "learning_rate": 2.420361737256438e-06,
3729
+ "loss": 0.7564,
3730
+ "step": 527
3731
+ },
3732
+ {
3733
+ "epoch": 0.9376248612652608,
3734
+ "grad_norm": 0.29575708508491516,
3735
+ "learning_rate": 2.288272853436013e-06,
3736
+ "loss": 0.7813,
3737
+ "step": 528
3738
+ },
3739
+ {
3740
+ "epoch": 0.9394006659267481,
3741
+ "grad_norm": 0.3270512521266937,
3742
+ "learning_rate": 2.1598484326730837e-06,
3743
+ "loss": 0.7658,
3744
+ "step": 529
3745
+ },
3746
+ {
3747
+ "epoch": 0.9411764705882353,
3748
+ "grad_norm": 0.3134397268295288,
3749
+ "learning_rate": 2.035093291242607e-06,
3750
+ "loss": 0.7335,
3751
+ "step": 530
3752
+ },
3753
+ {
3754
+ "epoch": 0.9429522752497226,
3755
+ "grad_norm": 0.34165453910827637,
3756
+ "learning_rate": 1.914012107811336e-06,
3757
+ "loss": 0.798,
3758
+ "step": 531
3759
+ },
3760
+ {
3761
+ "epoch": 0.9447280799112098,
3762
+ "grad_norm": 0.33172738552093506,
3763
+ "learning_rate": 1.7966094232622855e-06,
3764
+ "loss": 0.7516,
3765
+ "step": 532
3766
+ },
3767
+ {
3768
+ "epoch": 0.946503884572697,
3769
+ "grad_norm": 0.35185980796813965,
3770
+ "learning_rate": 1.6828896405244988e-06,
3771
+ "loss": 0.7745,
3772
+ "step": 533
3773
+ },
3774
+ {
3775
+ "epoch": 0.9482796892341843,
3776
+ "grad_norm": 0.3368275761604309,
3777
+ "learning_rate": 1.572857024407881e-06,
3778
+ "loss": 0.749,
3779
+ "step": 534
3780
+ },
3781
+ {
3782
+ "epoch": 0.9500554938956715,
3783
+ "grad_norm": 0.3556804656982422,
3784
+ "learning_rate": 1.466515701443294e-06,
3785
+ "loss": 0.737,
3786
+ "step": 535
3787
+ },
3788
+ {
3789
+ "epoch": 0.9518312985571588,
3790
+ "grad_norm": 0.35035377740859985,
3791
+ "learning_rate": 1.3638696597277679e-06,
3792
+ "loss": 0.716,
3793
+ "step": 536
3794
+ },
3795
+ {
3796
+ "epoch": 0.953607103218646,
3797
+ "grad_norm": 0.356507807970047,
3798
+ "learning_rate": 1.2649227487749548e-06,
3799
+ "loss": 0.7292,
3800
+ "step": 537
3801
+ },
3802
+ {
3803
+ "epoch": 0.9553829078801331,
3804
+ "grad_norm": 0.3645875155925751,
3805
+ "learning_rate": 1.1696786793707781e-06,
3806
+ "loss": 0.7325,
3807
+ "step": 538
3808
+ },
3809
+ {
3810
+ "epoch": 0.9571587125416204,
3811
+ "grad_norm": 0.35591599345207214,
3812
+ "learning_rate": 1.0781410234342094e-06,
3813
+ "loss": 0.7203,
3814
+ "step": 539
3815
+ },
3816
+ {
3817
+ "epoch": 0.9589345172031076,
3818
+ "grad_norm": 0.35885104537010193,
3819
+ "learning_rate": 9.90313213883376e-07,
3820
+ "loss": 0.665,
3821
+ "step": 540
3822
+ },
3823
+ {
3824
+ "epoch": 0.9607103218645949,
3825
+ "grad_norm": 0.38247016072273254,
3826
+ "learning_rate": 9.061985445067756e-07,
3827
+ "loss": 0.6885,
3828
+ "step": 541
3829
+ },
3830
+ {
3831
+ "epoch": 0.9624861265260821,
3832
+ "grad_norm": 0.38850679993629456,
3833
+ "learning_rate": 8.258001698397744e-07,
3834
+ "loss": 0.707,
3835
+ "step": 542
3836
+ },
3837
+ {
3838
+ "epoch": 0.9642619311875694,
3839
+ "grad_norm": 0.3912898004055023,
3840
+ "learning_rate": 7.491211050462798e-07,
3841
+ "loss": 0.6818,
3842
+ "step": 543
3843
+ },
3844
+ {
3845
+ "epoch": 0.9660377358490566,
3846
+ "grad_norm": 0.3983571529388428,
3847
+ "learning_rate": 6.761642258056978e-07,
3848
+ "loss": 0.6786,
3849
+ "step": 544
3850
+ },
3851
+ {
3852
+ "epoch": 0.9678135405105438,
3853
+ "grad_norm": 0.4406982660293579,
3854
+ "learning_rate": 6.069322682050516e-07,
3855
+ "loss": 0.6564,
3856
+ "step": 545
3857
+ },
3858
+ {
3859
+ "epoch": 0.9695893451720311,
3860
+ "grad_norm": 0.38563239574432373,
3861
+ "learning_rate": 5.414278286363761e-07,
3862
+ "loss": 0.5921,
3863
+ "step": 546
3864
+ },
3865
+ {
3866
+ "epoch": 0.9713651498335183,
3867
+ "grad_norm": 0.4474928081035614,
3868
+ "learning_rate": 4.796533636993727e-07,
3869
+ "loss": 0.681,
3870
+ "step": 547
3871
+ },
3872
+ {
3873
+ "epoch": 0.9731409544950056,
3874
+ "grad_norm": 0.43392056226730347,
3875
+ "learning_rate": 4.216111901092501e-07,
3876
+ "loss": 0.6673,
3877
+ "step": 548
3878
+ },
3879
+ {
3880
+ "epoch": 0.9749167591564928,
3881
+ "grad_norm": 0.5074283480644226,
3882
+ "learning_rate": 3.6730348460985996e-07,
3883
+ "loss": 0.7363,
3884
+ "step": 549
3885
+ },
3886
+ {
3887
+ "epoch": 0.97669256381798,
3888
+ "grad_norm": 0.5351345539093018,
3889
+ "learning_rate": 3.1673228389204055e-07,
3890
+ "loss": 0.6898,
3891
+ "step": 550
3892
+ },
3893
+ {
3894
+ "epoch": 0.9784683684794673,
3895
+ "grad_norm": 0.2640553414821625,
3896
+ "learning_rate": 2.6989948451726643e-07,
3897
+ "loss": 0.7556,
3898
+ "step": 551
3899
+ },
3900
+ {
3901
+ "epoch": 0.9802441731409545,
3902
+ "grad_norm": 0.28559839725494385,
3903
+ "learning_rate": 2.2680684284650533e-07,
3904
+ "loss": 0.7636,
3905
+ "step": 552
3906
+ },
3907
+ {
3908
+ "epoch": 0.9820199778024418,
3909
+ "grad_norm": 0.3131345510482788,
3910
+ "learning_rate": 1.8745597497433765e-07,
3911
+ "loss": 0.7366,
3912
+ "step": 553
3913
+ },
3914
+ {
3915
+ "epoch": 0.983795782463929,
3916
+ "grad_norm": 0.3156748116016388,
3917
+ "learning_rate": 1.518483566683826e-07,
3918
+ "loss": 0.7377,
3919
+ "step": 554
3920
+ },
3921
+ {
3922
+ "epoch": 0.9855715871254163,
3923
+ "grad_norm": 0.34887486696243286,
3924
+ "learning_rate": 1.199853233138981e-07,
3925
+ "loss": 0.7588,
3926
+ "step": 555
3927
+ },
3928
+ {
3929
+ "epoch": 0.9873473917869034,
3930
+ "grad_norm": 0.3328061103820801,
3931
+ "learning_rate": 9.186806986376529e-08,
3932
+ "loss": 0.744,
3933
+ "step": 556
3934
+ },
3935
+ {
3936
+ "epoch": 0.9891231964483906,
3937
+ "grad_norm": 0.36429402232170105,
3938
+ "learning_rate": 6.749765079363534e-08,
3939
+ "loss": 0.7261,
3940
+ "step": 557
3941
+ },
3942
+ {
3943
+ "epoch": 0.9908990011098779,
3944
+ "grad_norm": 0.36065101623535156,
3945
+ "learning_rate": 4.687498006236135e-08,
3946
+ "loss": 0.6815,
3947
+ "step": 558
3948
+ },
3949
+ {
3950
+ "epoch": 0.9926748057713651,
3951
+ "grad_norm": 0.3841758966445923,
3952
+ "learning_rate": 3.000083107780327e-08,
3953
+ "loss": 0.7204,
3954
+ "step": 559
3955
+ },
3956
+ {
3957
+ "epoch": 0.9944506104328524,
3958
+ "grad_norm": 0.43167850375175476,
3959
+ "learning_rate": 1.687583666772907e-08,
3960
+ "loss": 0.725,
3961
+ "step": 560
3962
+ },
3963
+ {
3964
+ "epoch": 0.9962264150943396,
3965
+ "grad_norm": 0.4131225347518921,
3966
+ "learning_rate": 7.500489056133652e-09,
3967
+ "loss": 0.6763,
3968
+ "step": 561
3969
+ },
3970
+ {
3971
+ "epoch": 0.9980022197558268,
3972
+ "grad_norm": 0.4707167446613312,
3973
+ "learning_rate": 1.8751398447758306e-09,
3974
+ "loss": 0.6854,
3975
+ "step": 562
3976
+ },
3977
+ {
3978
+ "epoch": 0.9997780244173141,
3979
+ "grad_norm": 0.4748234152793884,
3980
+ "learning_rate": 0.0,
3981
+ "loss": 0.6381,
3982
+ "step": 563
3983
  }
  ],
  "logging_steps": 1,
@@ -3014,12 +3994,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 5.571234948741857e+17,
+ "total_flos": 7.415142496788808e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null