{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4819,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00020751193193608634,
"grad_norm": 23.81510217286444,
"learning_rate": 2.0746887966804982e-08,
"loss": 1.3923,
"step": 1
},
{
"epoch": 0.0010375596596804316,
"grad_norm": 23.986426679578184,
"learning_rate": 1.037344398340249e-07,
"loss": 1.4149,
"step": 5
},
{
"epoch": 0.002075119319360863,
"grad_norm": 21.906176707284583,
"learning_rate": 2.074688796680498e-07,
"loss": 1.4046,
"step": 10
},
{
"epoch": 0.003112678979041295,
"grad_norm": 15.167476223997951,
"learning_rate": 3.112033195020747e-07,
"loss": 1.3592,
"step": 15
},
{
"epoch": 0.004150238638721726,
"grad_norm": 9.140589856353586,
"learning_rate": 4.149377593360996e-07,
"loss": 1.2548,
"step": 20
},
{
"epoch": 0.005187798298402158,
"grad_norm": 10.826361201441884,
"learning_rate": 5.186721991701245e-07,
"loss": 1.1628,
"step": 25
},
{
"epoch": 0.00622535795808259,
"grad_norm": 9.062979414726044,
"learning_rate": 6.224066390041494e-07,
"loss": 1.0633,
"step": 30
},
{
"epoch": 0.007262917617763021,
"grad_norm": 3.7471401092788903,
"learning_rate": 7.261410788381744e-07,
"loss": 1.0125,
"step": 35
},
{
"epoch": 0.008300477277443452,
"grad_norm": 3.4052281488293517,
"learning_rate": 8.298755186721992e-07,
"loss": 0.9673,
"step": 40
},
{
"epoch": 0.009338036937123885,
"grad_norm": 3.4155319860158584,
"learning_rate": 9.336099585062241e-07,
"loss": 0.9582,
"step": 45
},
{
"epoch": 0.010375596596804317,
"grad_norm": 3.0460439284849947,
"learning_rate": 1.037344398340249e-06,
"loss": 0.9393,
"step": 50
},
{
"epoch": 0.011413156256484747,
"grad_norm": 3.0573711664380117,
"learning_rate": 1.141078838174274e-06,
"loss": 0.928,
"step": 55
},
{
"epoch": 0.01245071591616518,
"grad_norm": 3.104368834698698,
"learning_rate": 1.2448132780082988e-06,
"loss": 0.9128,
"step": 60
},
{
"epoch": 0.013488275575845612,
"grad_norm": 3.1380190764833094,
"learning_rate": 1.3485477178423237e-06,
"loss": 0.9156,
"step": 65
},
{
"epoch": 0.014525835235526042,
"grad_norm": 3.1006570296836182,
"learning_rate": 1.4522821576763488e-06,
"loss": 0.905,
"step": 70
},
{
"epoch": 0.015563394895206474,
"grad_norm": 3.127744100257649,
"learning_rate": 1.5560165975103735e-06,
"loss": 0.9163,
"step": 75
},
{
"epoch": 0.016600954554886905,
"grad_norm": 3.038437602227199,
"learning_rate": 1.6597510373443984e-06,
"loss": 0.8884,
"step": 80
},
{
"epoch": 0.017638514214567337,
"grad_norm": 2.9596644094641413,
"learning_rate": 1.7634854771784235e-06,
"loss": 0.8923,
"step": 85
},
{
"epoch": 0.01867607387424777,
"grad_norm": 2.958917381373649,
"learning_rate": 1.8672199170124482e-06,
"loss": 0.8939,
"step": 90
},
{
"epoch": 0.0197136335339282,
"grad_norm": 3.13129582575627,
"learning_rate": 1.970954356846473e-06,
"loss": 0.8671,
"step": 95
},
{
"epoch": 0.020751193193608634,
"grad_norm": 3.212698731191082,
"learning_rate": 2.074688796680498e-06,
"loss": 0.8846,
"step": 100
},
{
"epoch": 0.021788752853289062,
"grad_norm": 2.9778959899019273,
"learning_rate": 2.178423236514523e-06,
"loss": 0.8747,
"step": 105
},
{
"epoch": 0.022826312512969495,
"grad_norm": 3.02722706158588,
"learning_rate": 2.282157676348548e-06,
"loss": 0.8841,
"step": 110
},
{
"epoch": 0.023863872172649927,
"grad_norm": 3.104436315782777,
"learning_rate": 2.385892116182573e-06,
"loss": 0.8646,
"step": 115
},
{
"epoch": 0.02490143183233036,
"grad_norm": 3.0389426041639847,
"learning_rate": 2.4896265560165977e-06,
"loss": 0.8769,
"step": 120
},
{
"epoch": 0.02593899149201079,
"grad_norm": 3.0481198307330772,
"learning_rate": 2.5933609958506228e-06,
"loss": 0.8597,
"step": 125
},
{
"epoch": 0.026976551151691223,
"grad_norm": 3.02592811695882,
"learning_rate": 2.6970954356846475e-06,
"loss": 0.8668,
"step": 130
},
{
"epoch": 0.028014110811371652,
"grad_norm": 3.2120467143006954,
"learning_rate": 2.8008298755186726e-06,
"loss": 0.8651,
"step": 135
},
{
"epoch": 0.029051670471052084,
"grad_norm": 2.9463572893956633,
"learning_rate": 2.9045643153526977e-06,
"loss": 0.8648,
"step": 140
},
{
"epoch": 0.030089230130732517,
"grad_norm": 3.2030553684980054,
"learning_rate": 3.008298755186722e-06,
"loss": 0.861,
"step": 145
},
{
"epoch": 0.03112678979041295,
"grad_norm": 3.134802130941582,
"learning_rate": 3.112033195020747e-06,
"loss": 0.8572,
"step": 150
},
{
"epoch": 0.03216434945009338,
"grad_norm": 3.0051811744191075,
"learning_rate": 3.215767634854772e-06,
"loss": 0.8548,
"step": 155
},
{
"epoch": 0.03320190910977381,
"grad_norm": 2.9087769254486013,
"learning_rate": 3.319502074688797e-06,
"loss": 0.857,
"step": 160
},
{
"epoch": 0.034239468769454245,
"grad_norm": 3.251836422833884,
"learning_rate": 3.423236514522822e-06,
"loss": 0.8479,
"step": 165
},
{
"epoch": 0.035277028429134674,
"grad_norm": 2.988883877751971,
"learning_rate": 3.526970954356847e-06,
"loss": 0.8573,
"step": 170
},
{
"epoch": 0.03631458808881511,
"grad_norm": 2.9670335974516155,
"learning_rate": 3.6307053941908714e-06,
"loss": 0.8509,
"step": 175
},
{
"epoch": 0.03735214774849554,
"grad_norm": 3.0847182089806333,
"learning_rate": 3.7344398340248965e-06,
"loss": 0.8421,
"step": 180
},
{
"epoch": 0.03838970740817597,
"grad_norm": 2.8264906420734843,
"learning_rate": 3.838174273858922e-06,
"loss": 0.8484,
"step": 185
},
{
"epoch": 0.0394272670678564,
"grad_norm": 2.974170420918456,
"learning_rate": 3.941908713692946e-06,
"loss": 0.8474,
"step": 190
},
{
"epoch": 0.04046482672753683,
"grad_norm": 3.077208581236518,
"learning_rate": 4.045643153526971e-06,
"loss": 0.8275,
"step": 195
},
{
"epoch": 0.04150238638721727,
"grad_norm": 3.134791812920161,
"learning_rate": 4.149377593360996e-06,
"loss": 0.8419,
"step": 200
},
{
"epoch": 0.042539946046897696,
"grad_norm": 3.2479409082629935,
"learning_rate": 4.253112033195021e-06,
"loss": 0.8466,
"step": 205
},
{
"epoch": 0.043577505706578125,
"grad_norm": 2.8988040172466767,
"learning_rate": 4.356846473029046e-06,
"loss": 0.8489,
"step": 210
},
{
"epoch": 0.04461506536625856,
"grad_norm": 3.0923511376582797,
"learning_rate": 4.460580912863071e-06,
"loss": 0.8455,
"step": 215
},
{
"epoch": 0.04565262502593899,
"grad_norm": 3.1659114421543895,
"learning_rate": 4.564315352697096e-06,
"loss": 0.8366,
"step": 220
},
{
"epoch": 0.046690184685619425,
"grad_norm": 2.9510079953090953,
"learning_rate": 4.66804979253112e-06,
"loss": 0.8492,
"step": 225
},
{
"epoch": 0.047727744345299854,
"grad_norm": 2.9528441973334503,
"learning_rate": 4.771784232365146e-06,
"loss": 0.8317,
"step": 230
},
{
"epoch": 0.04876530400498029,
"grad_norm": 3.009602261579148,
"learning_rate": 4.875518672199171e-06,
"loss": 0.8282,
"step": 235
},
{
"epoch": 0.04980286366466072,
"grad_norm": 3.0538602837457702,
"learning_rate": 4.979253112033195e-06,
"loss": 0.8379,
"step": 240
},
{
"epoch": 0.05084042332434115,
"grad_norm": 3.067939083048346,
"learning_rate": 5.08298755186722e-06,
"loss": 0.8255,
"step": 245
},
{
"epoch": 0.05187798298402158,
"grad_norm": 2.8381217796695233,
"learning_rate": 5.1867219917012455e-06,
"loss": 0.8424,
"step": 250
},
{
"epoch": 0.05291554264370201,
"grad_norm": 3.4784967911008247,
"learning_rate": 5.29045643153527e-06,
"loss": 0.8288,
"step": 255
},
{
"epoch": 0.05395310230338245,
"grad_norm": 3.308889130459959,
"learning_rate": 5.394190871369295e-06,
"loss": 0.8458,
"step": 260
},
{
"epoch": 0.054990661963062876,
"grad_norm": 3.043690822549995,
"learning_rate": 5.4979253112033204e-06,
"loss": 0.8233,
"step": 265
},
{
"epoch": 0.056028221622743304,
"grad_norm": 3.168880712945065,
"learning_rate": 5.601659751037345e-06,
"loss": 0.831,
"step": 270
},
{
"epoch": 0.05706578128242374,
"grad_norm": 3.1149299465392644,
"learning_rate": 5.70539419087137e-06,
"loss": 0.8363,
"step": 275
},
{
"epoch": 0.05810334094210417,
"grad_norm": 3.1047167518774814,
"learning_rate": 5.809128630705395e-06,
"loss": 0.8391,
"step": 280
},
{
"epoch": 0.059140900601784605,
"grad_norm": 2.8999047365073927,
"learning_rate": 5.91286307053942e-06,
"loss": 0.8368,
"step": 285
},
{
"epoch": 0.06017846026146503,
"grad_norm": 3.066022536608264,
"learning_rate": 6.016597510373444e-06,
"loss": 0.8379,
"step": 290
},
{
"epoch": 0.06121601992114547,
"grad_norm": 3.116983544786236,
"learning_rate": 6.1203319502074694e-06,
"loss": 0.8169,
"step": 295
},
{
"epoch": 0.0622535795808259,
"grad_norm": 7.508628361145243,
"learning_rate": 6.224066390041494e-06,
"loss": 0.8186,
"step": 300
},
{
"epoch": 0.06329113924050633,
"grad_norm": 2.959364581244697,
"learning_rate": 6.327800829875519e-06,
"loss": 0.823,
"step": 305
},
{
"epoch": 0.06432869890018676,
"grad_norm": 3.3247616385013554,
"learning_rate": 6.431535269709544e-06,
"loss": 0.8336,
"step": 310
},
{
"epoch": 0.06536625855986719,
"grad_norm": 2.97755825450701,
"learning_rate": 6.535269709543569e-06,
"loss": 0.8403,
"step": 315
},
{
"epoch": 0.06640381821954762,
"grad_norm": 2.920924218399819,
"learning_rate": 6.639004149377594e-06,
"loss": 0.8352,
"step": 320
},
{
"epoch": 0.06744137787922806,
"grad_norm": 2.996226962381477,
"learning_rate": 6.742738589211619e-06,
"loss": 0.8149,
"step": 325
},
{
"epoch": 0.06847893753890849,
"grad_norm": 2.9599991745332743,
"learning_rate": 6.846473029045644e-06,
"loss": 0.8227,
"step": 330
},
{
"epoch": 0.06951649719858892,
"grad_norm": 3.3083929869948374,
"learning_rate": 6.950207468879669e-06,
"loss": 0.8293,
"step": 335
},
{
"epoch": 0.07055405685826935,
"grad_norm": 2.921592069295814,
"learning_rate": 7.053941908713694e-06,
"loss": 0.8063,
"step": 340
},
{
"epoch": 0.07159161651794978,
"grad_norm": 2.9356160908302296,
"learning_rate": 7.157676348547719e-06,
"loss": 0.8249,
"step": 345
},
{
"epoch": 0.07262917617763022,
"grad_norm": 3.127667490017175,
"learning_rate": 7.261410788381743e-06,
"loss": 0.805,
"step": 350
},
{
"epoch": 0.07366673583731065,
"grad_norm": 2.811701116040028,
"learning_rate": 7.365145228215769e-06,
"loss": 0.8141,
"step": 355
},
{
"epoch": 0.07470429549699108,
"grad_norm": 3.428565494126869,
"learning_rate": 7.468879668049793e-06,
"loss": 0.8137,
"step": 360
},
{
"epoch": 0.0757418551566715,
"grad_norm": 3.0132405721717794,
"learning_rate": 7.572614107883818e-06,
"loss": 0.8121,
"step": 365
},
{
"epoch": 0.07677941481635193,
"grad_norm": 3.0044504000522045,
"learning_rate": 7.676348547717844e-06,
"loss": 0.8112,
"step": 370
},
{
"epoch": 0.07781697447603238,
"grad_norm": 2.938773166801736,
"learning_rate": 7.780082987551869e-06,
"loss": 0.8071,
"step": 375
},
{
"epoch": 0.0788545341357128,
"grad_norm": 2.816233639978962,
"learning_rate": 7.883817427385892e-06,
"loss": 0.8269,
"step": 380
},
{
"epoch": 0.07989209379539323,
"grad_norm": 3.2016060279962297,
"learning_rate": 7.987551867219918e-06,
"loss": 0.8173,
"step": 385
},
{
"epoch": 0.08092965345507366,
"grad_norm": 3.0338701288383567,
"learning_rate": 8.091286307053943e-06,
"loss": 0.7997,
"step": 390
},
{
"epoch": 0.08196721311475409,
"grad_norm": 2.8900162172973025,
"learning_rate": 8.195020746887967e-06,
"loss": 0.8296,
"step": 395
},
{
"epoch": 0.08300477277443453,
"grad_norm": 2.775870448004777,
"learning_rate": 8.298755186721992e-06,
"loss": 0.8139,
"step": 400
},
{
"epoch": 0.08404233243411496,
"grad_norm": 2.997946086685837,
"learning_rate": 8.402489626556017e-06,
"loss": 0.8244,
"step": 405
},
{
"epoch": 0.08507989209379539,
"grad_norm": 2.79051986833281,
"learning_rate": 8.506224066390042e-06,
"loss": 0.8207,
"step": 410
},
{
"epoch": 0.08611745175347582,
"grad_norm": 2.8248639051971334,
"learning_rate": 8.609958506224068e-06,
"loss": 0.805,
"step": 415
},
{
"epoch": 0.08715501141315625,
"grad_norm": 2.9096736006801485,
"learning_rate": 8.713692946058093e-06,
"loss": 0.8037,
"step": 420
},
{
"epoch": 0.08819257107283669,
"grad_norm": 2.940919950600375,
"learning_rate": 8.817427385892117e-06,
"loss": 0.8186,
"step": 425
},
{
"epoch": 0.08923013073251712,
"grad_norm": 2.896130509805648,
"learning_rate": 8.921161825726142e-06,
"loss": 0.8245,
"step": 430
},
{
"epoch": 0.09026769039219755,
"grad_norm": 2.8401858033389713,
"learning_rate": 9.024896265560167e-06,
"loss": 0.8081,
"step": 435
},
{
"epoch": 0.09130525005187798,
"grad_norm": 2.86912546506336,
"learning_rate": 9.128630705394191e-06,
"loss": 0.8164,
"step": 440
},
{
"epoch": 0.09234280971155842,
"grad_norm": 2.7242600466916453,
"learning_rate": 9.232365145228218e-06,
"loss": 0.8073,
"step": 445
},
{
"epoch": 0.09338036937123885,
"grad_norm": 3.044646841531881,
"learning_rate": 9.33609958506224e-06,
"loss": 0.8145,
"step": 450
},
{
"epoch": 0.09441792903091928,
"grad_norm": 2.8295153855299198,
"learning_rate": 9.439834024896265e-06,
"loss": 0.7957,
"step": 455
},
{
"epoch": 0.09545548869059971,
"grad_norm": 2.910948125766166,
"learning_rate": 9.543568464730292e-06,
"loss": 0.8053,
"step": 460
},
{
"epoch": 0.09649304835028014,
"grad_norm": 2.8219575214924597,
"learning_rate": 9.647302904564317e-06,
"loss": 0.8128,
"step": 465
},
{
"epoch": 0.09753060800996058,
"grad_norm": 2.9677851670315167,
"learning_rate": 9.751037344398341e-06,
"loss": 0.8061,
"step": 470
},
{
"epoch": 0.09856816766964101,
"grad_norm": 3.1531286571785326,
"learning_rate": 9.854771784232366e-06,
"loss": 0.7962,
"step": 475
},
{
"epoch": 0.09960572732932144,
"grad_norm": 3.1075603430133105,
"learning_rate": 9.95850622406639e-06,
"loss": 0.8072,
"step": 480
},
{
"epoch": 0.10064328698900186,
"grad_norm": 2.844518989686556,
"learning_rate": 9.99998819398724e-06,
"loss": 0.8198,
"step": 485
},
{
"epoch": 0.1016808466486823,
"grad_norm": 2.8894422413511127,
"learning_rate": 9.999916046333384e-06,
"loss": 0.8146,
"step": 490
},
{
"epoch": 0.10271840630836274,
"grad_norm": 2.8815754440809878,
"learning_rate": 9.999778310866921e-06,
"loss": 0.7899,
"step": 495
},
{
"epoch": 0.10375596596804317,
"grad_norm": 2.885119932296298,
"learning_rate": 9.999574989394634e-06,
"loss": 0.8013,
"step": 500
},
{
"epoch": 0.1047935256277236,
"grad_norm": 2.8642122606932174,
"learning_rate": 9.99930608458365e-06,
"loss": 0.805,
"step": 505
},
{
"epoch": 0.10583108528740402,
"grad_norm": 2.9310957652228082,
"learning_rate": 9.998971599961405e-06,
"loss": 0.7915,
"step": 510
},
{
"epoch": 0.10686864494708445,
"grad_norm": 2.8531320874329746,
"learning_rate": 9.998571539915592e-06,
"loss": 0.7981,
"step": 515
},
{
"epoch": 0.1079062046067649,
"grad_norm": 2.839970691252372,
"learning_rate": 9.998105909694117e-06,
"loss": 0.7999,
"step": 520
},
{
"epoch": 0.10894376426644532,
"grad_norm": 2.853880263858815,
"learning_rate": 9.997574715405011e-06,
"loss": 0.8311,
"step": 525
},
{
"epoch": 0.10998132392612575,
"grad_norm": 2.9338168378898337,
"learning_rate": 9.996977964016371e-06,
"loss": 0.8005,
"step": 530
},
{
"epoch": 0.11101888358580618,
"grad_norm": 2.7032896393481254,
"learning_rate": 9.996315663356247e-06,
"loss": 0.8003,
"step": 535
},
{
"epoch": 0.11205644324548661,
"grad_norm": 2.761074984380135,
"learning_rate": 9.995587822112558e-06,
"loss": 0.8044,
"step": 540
},
{
"epoch": 0.11309400290516705,
"grad_norm": 2.6815711646329556,
"learning_rate": 9.994794449832966e-06,
"loss": 0.7887,
"step": 545
},
{
"epoch": 0.11413156256484748,
"grad_norm": 2.8649570007362657,
"learning_rate": 9.993935556924756e-06,
"loss": 0.7776,
"step": 550
},
{
"epoch": 0.11516912222452791,
"grad_norm": 2.9788872729907547,
"learning_rate": 9.993011154654702e-06,
"loss": 0.7778,
"step": 555
},
{
"epoch": 0.11620668188420834,
"grad_norm": 2.771583627699142,
"learning_rate": 9.992021255148907e-06,
"loss": 0.7876,
"step": 560
},
{
"epoch": 0.11724424154388878,
"grad_norm": 3.156864019145989,
"learning_rate": 9.990965871392662e-06,
"loss": 0.7924,
"step": 565
},
{
"epoch": 0.11828180120356921,
"grad_norm": 2.6854537961356653,
"learning_rate": 9.989845017230258e-06,
"loss": 0.7841,
"step": 570
},
{
"epoch": 0.11931936086324964,
"grad_norm": 3.194287668643439,
"learning_rate": 9.988658707364819e-06,
"loss": 0.7807,
"step": 575
},
{
"epoch": 0.12035692052293007,
"grad_norm": 2.7638444846888173,
"learning_rate": 9.9874069573581e-06,
"loss": 0.7846,
"step": 580
},
{
"epoch": 0.1213944801826105,
"grad_norm": 2.7802968983308936,
"learning_rate": 9.986089783630286e-06,
"loss": 0.775,
"step": 585
},
{
"epoch": 0.12243203984229094,
"grad_norm": 2.8469947394820103,
"learning_rate": 9.984707203459774e-06,
"loss": 0.7672,
"step": 590
},
{
"epoch": 0.12346959950197137,
"grad_norm": 2.9990572740062493,
"learning_rate": 9.983259234982951e-06,
"loss": 0.7779,
"step": 595
},
{
"epoch": 0.1245071591616518,
"grad_norm": 3.1849226285333345,
"learning_rate": 9.981745897193955e-06,
"loss": 0.7714,
"step": 600
},
{
"epoch": 0.12554471882133222,
"grad_norm": 2.835199675947756,
"learning_rate": 9.98016720994442e-06,
"loss": 0.7784,
"step": 605
},
{
"epoch": 0.12658227848101267,
"grad_norm": 2.8294757615641886,
"learning_rate": 9.978523193943222e-06,
"loss": 0.7905,
"step": 610
},
{
"epoch": 0.12761983814069308,
"grad_norm": 2.6228325540550657,
"learning_rate": 9.976813870756209e-06,
"loss": 0.7695,
"step": 615
},
{
"epoch": 0.12865739780037352,
"grad_norm": 2.6754726256364902,
"learning_rate": 9.975039262805907e-06,
"loss": 0.7784,
"step": 620
},
{
"epoch": 0.12969495746005397,
"grad_norm": 2.6428193830815663,
"learning_rate": 9.973199393371242e-06,
"loss": 0.7768,
"step": 625
},
{
"epoch": 0.13073251711973438,
"grad_norm": 2.7019164377169824,
"learning_rate": 9.97129428658722e-06,
"loss": 0.7787,
"step": 630
},
{
"epoch": 0.13177007677941482,
"grad_norm": 2.7193094088840635,
"learning_rate": 9.969323967444616e-06,
"loss": 0.7691,
"step": 635
},
{
"epoch": 0.13280763643909524,
"grad_norm": 2.9547462574056227,
"learning_rate": 9.96728846178965e-06,
"loss": 0.7791,
"step": 640
},
{
"epoch": 0.13384519609877568,
"grad_norm": 2.7070337933579673,
"learning_rate": 9.965187796323643e-06,
"loss": 0.7793,
"step": 645
},
{
"epoch": 0.13488275575845612,
"grad_norm": 2.846568092264262,
"learning_rate": 9.96302199860267e-06,
"loss": 0.7657,
"step": 650
},
{
"epoch": 0.13592031541813654,
"grad_norm": 2.6430732770071885,
"learning_rate": 9.96079109703719e-06,
"loss": 0.7613,
"step": 655
},
{
"epoch": 0.13695787507781698,
"grad_norm": 2.564352165321478,
"learning_rate": 9.95849512089169e-06,
"loss": 0.7716,
"step": 660
},
{
"epoch": 0.1379954347374974,
"grad_norm": 2.787666582812226,
"learning_rate": 9.956134100284285e-06,
"loss": 0.7788,
"step": 665
},
{
"epoch": 0.13903299439717784,
"grad_norm": 2.565898584923949,
"learning_rate": 9.95370806618633e-06,
"loss": 0.7612,
"step": 670
},
{
"epoch": 0.14007055405685828,
"grad_norm": 2.9186277859262244,
"learning_rate": 9.951217050422013e-06,
"loss": 0.7787,
"step": 675
},
{
"epoch": 0.1411081137165387,
"grad_norm": 2.7573443860042546,
"learning_rate": 9.94866108566794e-06,
"loss": 0.7556,
"step": 680
},
{
"epoch": 0.14214567337621914,
"grad_norm": 2.840469194753,
"learning_rate": 9.946040205452699e-06,
"loss": 0.7456,
"step": 685
},
{
"epoch": 0.14318323303589955,
"grad_norm": 2.55261542401345,
"learning_rate": 9.943354444156428e-06,
"loss": 0.7789,
"step": 690
},
{
"epoch": 0.14422079269558,
"grad_norm": 2.6744948199976535,
"learning_rate": 9.940603837010358e-06,
"loss": 0.773,
"step": 695
},
{
"epoch": 0.14525835235526044,
"grad_norm": 2.7013083342024107,
"learning_rate": 9.937788420096362e-06,
"loss": 0.7735,
"step": 700
},
{
"epoch": 0.14629591201494085,
"grad_norm": 2.731487086002391,
"learning_rate": 9.934908230346462e-06,
"loss": 0.7523,
"step": 705
},
{
"epoch": 0.1473334716746213,
"grad_norm": 2.6033671380903334,
"learning_rate": 9.931963305542363e-06,
"loss": 0.7517,
"step": 710
},
{
"epoch": 0.1483710313343017,
"grad_norm": 2.662624062037032,
"learning_rate": 9.92895368431495e-06,
"loss": 0.7659,
"step": 715
},
{
"epoch": 0.14940859099398215,
"grad_norm": 3.714026552589638,
"learning_rate": 9.925879406143779e-06,
"loss": 0.7646,
"step": 720
},
{
"epoch": 0.1504461506536626,
"grad_norm": 3.3049933455540073,
"learning_rate": 9.922740511356565e-06,
"loss": 0.7681,
"step": 725
},
{
"epoch": 0.151483710313343,
"grad_norm": 2.7320515915400816,
"learning_rate": 9.919537041128647e-06,
"loss": 0.746,
"step": 730
},
{
"epoch": 0.15252126997302345,
"grad_norm": 2.635817269274888,
"learning_rate": 9.916269037482452e-06,
"loss": 0.7306,
"step": 735
},
{
"epoch": 0.15355882963270387,
"grad_norm": 2.662103702116443,
"learning_rate": 9.912936543286939e-06,
"loss": 0.7536,
"step": 740
},
{
"epoch": 0.1545963892923843,
"grad_norm": 2.85326362399482,
"learning_rate": 9.909539602257048e-06,
"loss": 0.7673,
"step": 745
},
{
"epoch": 0.15563394895206475,
"grad_norm": 2.567231081949611,
"learning_rate": 9.90607825895311e-06,
"loss": 0.738,
"step": 750
},
{
"epoch": 0.15667150861174517,
"grad_norm": 2.869479144385386,
"learning_rate": 9.902552558780276e-06,
"loss": 0.7598,
"step": 755
},
{
"epoch": 0.1577090682714256,
"grad_norm": 2.730929830901487,
"learning_rate": 9.898962547987913e-06,
"loss": 0.748,
"step": 760
},
{
"epoch": 0.15874662793110603,
"grad_norm": 2.793985046920194,
"learning_rate": 9.895308273669007e-06,
"loss": 0.7328,
"step": 765
},
{
"epoch": 0.15978418759078647,
"grad_norm": 2.7966738638089246,
"learning_rate": 9.89158978375953e-06,
"loss": 0.7676,
"step": 770
},
{
"epoch": 0.1608217472504669,
"grad_norm": 2.6679897290967576,
"learning_rate": 9.887807127037827e-06,
"loss": 0.7295,
"step": 775
},
{
"epoch": 0.16185930691014733,
"grad_norm": 2.5457872650558455,
"learning_rate": 9.88396035312397e-06,
"loss": 0.728,
"step": 780
},
{
"epoch": 0.16289686656982777,
"grad_norm": 2.8332820570127626,
"learning_rate": 9.880049512479097e-06,
"loss": 0.7421,
"step": 785
},
{
"epoch": 0.16393442622950818,
"grad_norm": 2.7639539461730114,
"learning_rate": 9.876074656404773e-06,
"loss": 0.7534,
"step": 790
},
{
"epoch": 0.16497198588918863,
"grad_norm": 2.7213075641667928,
"learning_rate": 9.872035837042292e-06,
"loss": 0.7363,
"step": 795
},
{
"epoch": 0.16600954554886907,
"grad_norm": 2.6178852546410183,
"learning_rate": 9.86793310737201e-06,
"loss": 0.7318,
"step": 800
},
{
"epoch": 0.16704710520854948,
"grad_norm": 2.7566382614918585,
"learning_rate": 9.863766521212646e-06,
"loss": 0.7507,
"step": 805
},
{
"epoch": 0.16808466486822993,
"grad_norm": 2.75430515948912,
"learning_rate": 9.859536133220569e-06,
"loss": 0.7481,
"step": 810
},
{
"epoch": 0.16912222452791034,
"grad_norm": 2.7773285238684813,
"learning_rate": 9.855241998889091e-06,
"loss": 0.7456,
"step": 815
},
{
"epoch": 0.17015978418759078,
"grad_norm": 2.9942393187975904,
"learning_rate": 9.850884174547734e-06,
"loss": 0.7512,
"step": 820
},
{
"epoch": 0.17119734384727123,
"grad_norm": 2.8176703833149706,
"learning_rate": 9.846462717361489e-06,
"loss": 0.7229,
"step": 825
},
{
"epoch": 0.17223490350695164,
"grad_norm": 2.606112734187681,
"learning_rate": 9.841977685330074e-06,
"loss": 0.7544,
"step": 830
},
{
"epoch": 0.17327246316663208,
"grad_norm": 2.7643282324128324,
"learning_rate": 9.837429137287164e-06,
"loss": 0.7233,
"step": 835
},
{
"epoch": 0.1743100228263125,
"grad_norm": 2.58906230921786,
"learning_rate": 9.832817132899622e-06,
"loss": 0.7496,
"step": 840
},
{
"epoch": 0.17534758248599294,
"grad_norm": 2.9173083959154913,
"learning_rate": 9.828141732666722e-06,
"loss": 0.7405,
"step": 845
},
{
"epoch": 0.17638514214567338,
"grad_norm": 2.807434396354819,
"learning_rate": 9.823402997919346e-06,
"loss": 0.7032,
"step": 850
},
{
"epoch": 0.1774227018053538,
"grad_norm": 2.7235521219589343,
"learning_rate": 9.818600990819193e-06,
"loss": 0.7162,
"step": 855
},
{
"epoch": 0.17846026146503424,
"grad_norm": 3.0405865223857247,
"learning_rate": 9.813735774357942e-06,
"loss": 0.7286,
"step": 860
},
{
"epoch": 0.17949782112471468,
"grad_norm": 2.6580664453159786,
"learning_rate": 9.80880741235645e-06,
"loss": 0.7153,
"step": 865
},
{
"epoch": 0.1805353807843951,
"grad_norm": 2.6234079932920764,
"learning_rate": 9.803815969463898e-06,
"loss": 0.7267,
"step": 870
},
{
"epoch": 0.18157294044407554,
"grad_norm": 2.7632489453595457,
"learning_rate": 9.798761511156948e-06,
"loss": 0.7198,
"step": 875
},
{
"epoch": 0.18261050010375596,
"grad_norm": 2.607364882820461,
"learning_rate": 9.79364410373889e-06,
"loss": 0.7197,
"step": 880
},
{
"epoch": 0.1836480597634364,
"grad_norm": 2.7165078647946856,
"learning_rate": 9.78846381433876e-06,
"loss": 0.7311,
"step": 885
},
{
"epoch": 0.18468561942311684,
"grad_norm": 2.510454703638443,
"learning_rate": 9.783220710910471e-06,
"loss": 0.7318,
"step": 890
},
{
"epoch": 0.18572317908279726,
"grad_norm": 2.638416867941834,
"learning_rate": 9.777914862231912e-06,
"loss": 0.73,
"step": 895
},
{
"epoch": 0.1867607387424777,
"grad_norm": 2.7242863471244148,
"learning_rate": 9.772546337904054e-06,
"loss": 0.7191,
"step": 900
},
{
"epoch": 0.18779829840215811,
"grad_norm": 2.6641307804734806,
"learning_rate": 9.767115208350035e-06,
"loss": 0.7207,
"step": 905
},
{
"epoch": 0.18883585806183856,
"grad_norm": 2.4803694238796905,
"learning_rate": 9.761621544814232e-06,
"loss": 0.7366,
"step": 910
},
{
"epoch": 0.189873417721519,
"grad_norm": 2.54201756384705,
"learning_rate": 9.756065419361329e-06,
"loss": 0.6971,
"step": 915
},
{
"epoch": 0.19091097738119941,
"grad_norm": 2.663357207599627,
"learning_rate": 9.750446904875374e-06,
"loss": 0.7093,
"step": 920
},
{
"epoch": 0.19194853704087986,
"grad_norm": 2.665290508123204,
"learning_rate": 9.744766075058817e-06,
"loss": 0.7092,
"step": 925
},
{
"epoch": 0.19298609670056027,
"grad_norm": 2.561102249391226,
"learning_rate": 9.739023004431553e-06,
"loss": 0.7022,
"step": 930
},
{
"epoch": 0.19402365636024071,
"grad_norm": 2.8532699081444344,
"learning_rate": 9.733217768329934e-06,
"loss": 0.7125,
"step": 935
},
{
"epoch": 0.19506121601992116,
"grad_norm": 2.658165458053829,
"learning_rate": 9.727350442905786e-06,
"loss": 0.713,
"step": 940
},
{
"epoch": 0.19609877567960157,
"grad_norm": 2.8346689183428233,
"learning_rate": 9.721421105125409e-06,
"loss": 0.7111,
"step": 945
},
{
"epoch": 0.19713633533928202,
"grad_norm": 2.691363256166988,
"learning_rate": 9.715429832768566e-06,
"loss": 0.6997,
"step": 950
},
{
"epoch": 0.19817389499896243,
"grad_norm": 2.670459556620767,
"learning_rate": 9.709376704427471e-06,
"loss": 0.7002,
"step": 955
},
{
"epoch": 0.19921145465864287,
"grad_norm": 2.8298140051123624,
"learning_rate": 9.703261799505743e-06,
"loss": 0.6919,
"step": 960
},
{
"epoch": 0.20024901431832332,
"grad_norm": 2.690706447862383,
"learning_rate": 9.697085198217378e-06,
"loss": 0.6951,
"step": 965
},
{
"epoch": 0.20128657397800373,
"grad_norm": 2.6190530004747536,
"learning_rate": 9.690846981585689e-06,
"loss": 0.7088,
"step": 970
},
{
"epoch": 0.20232413363768417,
"grad_norm": 2.896807508207353,
"learning_rate": 9.684547231442248e-06,
"loss": 0.7036,
"step": 975
},
{
"epoch": 0.2033616932973646,
"grad_norm": 2.653583975726927,
"learning_rate": 9.678186030425806e-06,
"loss": 0.7014,
"step": 980
},
{
"epoch": 0.20439925295704503,
"grad_norm": 2.7468040879725857,
"learning_rate": 9.67176346198122e-06,
"loss": 0.6887,
"step": 985
},
{
"epoch": 0.20543681261672547,
"grad_norm": 2.6425850355422607,
"learning_rate": 9.665279610358347e-06,
"loss": 0.6912,
"step": 990
},
{
"epoch": 0.2064743722764059,
"grad_norm": 2.607466586766701,
"learning_rate": 9.658734560610942e-06,
"loss": 0.6986,
"step": 995
},
{
"epoch": 0.20751193193608633,
"grad_norm": 2.5797036833444684,
"learning_rate": 9.652128398595548e-06,
"loss": 0.6893,
"step": 1000
},
{
"epoch": 0.20854949159576675,
"grad_norm": 2.627466198960634,
"learning_rate": 9.645461210970363e-06,
"loss": 0.6939,
"step": 1005
},
{
"epoch": 0.2095870512554472,
"grad_norm": 2.7514054708664486,
"learning_rate": 9.638733085194105e-06,
"loss": 0.6879,
"step": 1010
},
{
"epoch": 0.21062461091512763,
"grad_norm": 2.762232274342595,
"learning_rate": 9.631944109524867e-06,
"loss": 0.7206,
"step": 1015
},
{
"epoch": 0.21166217057480805,
"grad_norm": 2.573942783710098,
"learning_rate": 9.625094373018957e-06,
"loss": 0.672,
"step": 1020
},
{
"epoch": 0.2126997302344885,
"grad_norm": 2.9862062253264154,
"learning_rate": 9.61818396552973e-06,
"loss": 0.7027,
"step": 1025
},
{
"epoch": 0.2137372898941689,
"grad_norm": 2.604158278569754,
"learning_rate": 9.61121297770641e-06,
"loss": 0.6832,
"step": 1030
},
{
"epoch": 0.21477484955384935,
"grad_norm": 2.847316696943195,
"learning_rate": 9.604181500992904e-06,
"loss": 0.6799,
"step": 1035
},
{
"epoch": 0.2158124092135298,
"grad_norm": 2.779271975810847,
"learning_rate": 9.597089627626594e-06,
"loss": 0.6804,
"step": 1040
},
{
"epoch": 0.2168499688732102,
"grad_norm": 2.5638323571099546,
"learning_rate": 9.589937450637134e-06,
"loss": 0.6837,
"step": 1045
},
{
"epoch": 0.21788752853289065,
"grad_norm": 2.7020801988998384,
"learning_rate": 9.58272506384523e-06,
"loss": 0.684,
"step": 1050
},
{
"epoch": 0.21892508819257106,
"grad_norm": 2.549003888633134,
"learning_rate": 9.5754525618614e-06,
"loss": 0.6871,
"step": 1055
},
{
"epoch": 0.2199626478522515,
"grad_norm": 2.8154214183624284,
"learning_rate": 9.568120040084752e-06,
"loss": 0.6652,
"step": 1060
},
{
"epoch": 0.22100020751193195,
"grad_norm": 2.6474463880567884,
"learning_rate": 9.56072759470171e-06,
"loss": 0.6896,
"step": 1065
},
{
"epoch": 0.22203776717161236,
"grad_norm": 2.623449606437885,
"learning_rate": 9.553275322684769e-06,
"loss": 0.6731,
"step": 1070
},
{
"epoch": 0.2230753268312928,
"grad_norm": 2.5852337807326324,
"learning_rate": 9.545763321791213e-06,
"loss": 0.6914,
"step": 1075
},
{
"epoch": 0.22411288649097322,
"grad_norm": 2.5768442254313424,
"learning_rate": 9.538191690561838e-06,
"loss": 0.6827,
"step": 1080
},
{
"epoch": 0.22515044615065366,
"grad_norm": 2.616849381186977,
"learning_rate": 9.530560528319657e-06,
"loss": 0.6861,
"step": 1085
},
{
"epoch": 0.2261880058103341,
"grad_norm": 2.542283706279765,
"learning_rate": 9.522869935168601e-06,
"loss": 0.6673,
"step": 1090
},
{
"epoch": 0.22722556547001452,
"grad_norm": 2.6428940149632916,
"learning_rate": 9.515120011992199e-06,
"loss": 0.6595,
"step": 1095
},
{
"epoch": 0.22826312512969496,
"grad_norm": 2.6516305122608324,
"learning_rate": 9.507310860452258e-06,
"loss": 0.6508,
"step": 1100
},
{
"epoch": 0.22930068478937538,
"grad_norm": 2.618199521456939,
"learning_rate": 9.499442582987535e-06,
"loss": 0.672,
"step": 1105
},
{
"epoch": 0.23033824444905582,
"grad_norm": 2.580520201322647,
"learning_rate": 9.491515282812383e-06,
"loss": 0.6798,
"step": 1110
},
{
"epoch": 0.23137580410873626,
"grad_norm": 2.5578402107468285,
"learning_rate": 9.483529063915405e-06,
"loss": 0.6575,
"step": 1115
},
{
"epoch": 0.23241336376841668,
"grad_norm": 2.5898669244363743,
"learning_rate": 9.475484031058081e-06,
"loss": 0.6686,
"step": 1120
},
{
"epoch": 0.23345092342809712,
"grad_norm": 2.581092169980836,
"learning_rate": 9.46738028977341e-06,
"loss": 0.676,
"step": 1125
},
{
"epoch": 0.23448848308777756,
"grad_norm": 2.564857684910852,
"learning_rate": 9.459217946364508e-06,
"loss": 0.6603,
"step": 1130
},
{
"epoch": 0.23552604274745798,
"grad_norm": 2.6352764131909394,
"learning_rate": 9.450997107903222e-06,
"loss": 0.673,
"step": 1135
},
{
"epoch": 0.23656360240713842,
"grad_norm": 2.7623386766168494,
"learning_rate": 9.442717882228727e-06,
"loss": 0.6713,
"step": 1140
},
{
"epoch": 0.23760116206681883,
"grad_norm": 2.646657632225148,
"learning_rate": 9.434380377946104e-06,
"loss": 0.6714,
"step": 1145
},
{
"epoch": 0.23863872172649928,
"grad_norm": 2.547166874529112,
"learning_rate": 9.425984704424927e-06,
"loss": 0.6664,
"step": 1150
},
{
"epoch": 0.23967628138617972,
"grad_norm": 2.5806952200679225,
"learning_rate": 9.417530971797812e-06,
"loss": 0.6733,
"step": 1155
},
{
"epoch": 0.24071384104586013,
"grad_norm": 2.574640603606341,
"learning_rate": 9.409019290958993e-06,
"loss": 0.6737,
"step": 1160
},
{
"epoch": 0.24175140070554058,
"grad_norm": 2.5060886593359113,
"learning_rate": 9.400449773562849e-06,
"loss": 0.6762,
"step": 1165
},
{
"epoch": 0.242788960365221,
"grad_norm": 2.716564112603813,
"learning_rate": 9.391822532022445e-06,
"loss": 0.6551,
"step": 1170
},
{
"epoch": 0.24382652002490143,
"grad_norm": 2.7171895315845216,
"learning_rate": 9.383137679508063e-06,
"loss": 0.6561,
"step": 1175
},
{
"epoch": 0.24486407968458188,
"grad_norm": 2.58866525923876,
"learning_rate": 9.374395329945714e-06,
"loss": 0.6586,
"step": 1180
},
{
"epoch": 0.2459016393442623,
"grad_norm": 2.641911339118785,
"learning_rate": 9.365595598015635e-06,
"loss": 0.6879,
"step": 1185
},
{
"epoch": 0.24693919900394273,
"grad_norm": 2.6209891495095716,
"learning_rate": 9.356738599150805e-06,
"loss": 0.6562,
"step": 1190
},
{
"epoch": 0.24797675866362315,
"grad_norm": 2.537931458295151,
"learning_rate": 9.347824449535406e-06,
"loss": 0.671,
"step": 1195
},
{
"epoch": 0.2490143183233036,
"grad_norm": 2.6373525596001777,
"learning_rate": 9.338853266103318e-06,
"loss": 0.6469,
"step": 1200
},
{
"epoch": 0.250051877982984,
"grad_norm": 2.49609290349947,
"learning_rate": 9.329825166536578e-06,
"loss": 0.6494,
"step": 1205
},
{
"epoch": 0.25108943764266445,
"grad_norm": 2.597719369713314,
"learning_rate": 9.32074026926383e-06,
"loss": 0.6644,
"step": 1210
},
{
"epoch": 0.2521269973023449,
"grad_norm": 2.622902413291871,
"learning_rate": 9.31159869345879e-06,
"loss": 0.6277,
"step": 1215
},
{
"epoch": 0.25316455696202533,
"grad_norm": 2.4773937545219615,
"learning_rate": 9.302400559038658e-06,
"loss": 0.6435,
"step": 1220
},
{
"epoch": 0.2542021166217057,
"grad_norm": 2.417435834815959,
"learning_rate": 9.293145986662567e-06,
"loss": 0.6551,
"step": 1225
},
{
"epoch": 0.25523967628138616,
"grad_norm": 2.7620863467259134,
"learning_rate": 9.283835097729984e-06,
"loss": 0.6524,
"step": 1230
},
{
"epoch": 0.2562772359410666,
"grad_norm": 2.661471370041919,
"learning_rate": 9.27446801437913e-06,
"loss": 0.6564,
"step": 1235
},
{
"epoch": 0.25731479560074705,
"grad_norm": 2.6022353359733996,
"learning_rate": 9.265044859485369e-06,
"loss": 0.6504,
"step": 1240
},
{
"epoch": 0.2583523552604275,
"grad_norm": 2.67544454301597,
"learning_rate": 9.2555657566596e-06,
"loss": 0.6379,
"step": 1245
},
{
"epoch": 0.25938991492010793,
"grad_norm": 2.7294894592664742,
"learning_rate": 9.246030830246633e-06,
"loss": 0.653,
"step": 1250
},
{
"epoch": 0.2604274745797883,
"grad_norm": 2.4934502710929642,
"learning_rate": 9.236440205323564e-06,
"loss": 0.6504,
"step": 1255
},
{
"epoch": 0.26146503423946876,
"grad_norm": 2.732333826367612,
"learning_rate": 9.226794007698128e-06,
"loss": 0.6417,
"step": 1260
},
{
"epoch": 0.2625025938991492,
"grad_norm": 2.6288344277507023,
"learning_rate": 9.217092363907047e-06,
"loss": 0.6193,
"step": 1265
},
{
"epoch": 0.26354015355882965,
"grad_norm": 2.7169648920990483,
"learning_rate": 9.207335401214379e-06,
"loss": 0.6536,
"step": 1270
},
{
"epoch": 0.2645777132185101,
"grad_norm": 2.520924205856965,
"learning_rate": 9.197523247609839e-06,
"loss": 0.6375,
"step": 1275
},
{
"epoch": 0.2656152728781905,
"grad_norm": 2.515144443600501,
"learning_rate": 9.187656031807129e-06,
"loss": 0.6442,
"step": 1280
},
{
"epoch": 0.2666528325378709,
"grad_norm": 2.5704491825236473,
"learning_rate": 9.177733883242244e-06,
"loss": 0.6586,
"step": 1285
},
{
"epoch": 0.26769039219755136,
"grad_norm": 2.8282989819254634,
"learning_rate": 9.167756932071769e-06,
"loss": 0.6609,
"step": 1290
},
{
"epoch": 0.2687279518572318,
"grad_norm": 2.485389299186558,
"learning_rate": 9.157725309171183e-06,
"loss": 0.6459,
"step": 1295
},
{
"epoch": 0.26976551151691225,
"grad_norm": 2.7788274917257327,
"learning_rate": 9.147639146133142e-06,
"loss": 0.6433,
"step": 1300
},
{
"epoch": 0.27080307117659264,
"grad_norm": 2.521611630972388,
"learning_rate": 9.137498575265736e-06,
"loss": 0.6271,
"step": 1305
},
{
"epoch": 0.2718406308362731,
"grad_norm": 2.67661014478125,
"learning_rate": 9.12730372959077e-06,
"loss": 0.6551,
"step": 1310
},
{
"epoch": 0.2728781904959535,
"grad_norm": 2.678821798528819,
"learning_rate": 9.11705474284202e-06,
"loss": 0.6275,
"step": 1315
},
{
"epoch": 0.27391575015563396,
"grad_norm": 2.5914261808444574,
"learning_rate": 9.106751749463463e-06,
"loss": 0.6401,
"step": 1320
},
{
"epoch": 0.2749533098153144,
"grad_norm": 2.5454105011810664,
"learning_rate": 9.09639488460753e-06,
"loss": 0.6275,
"step": 1325
},
{
"epoch": 0.2759908694749948,
"grad_norm": 2.746226513585246,
"learning_rate": 9.08598428413333e-06,
"loss": 0.6199,
"step": 1330
},
{
"epoch": 0.27702842913467524,
"grad_norm": 2.4978350247576886,
"learning_rate": 9.075520084604849e-06,
"loss": 0.6081,
"step": 1335
},
{
"epoch": 0.2780659887943557,
"grad_norm": 2.565415348716322,
"learning_rate": 9.065002423289189e-06,
"loss": 0.6117,
"step": 1340
},
{
"epoch": 0.2791035484540361,
"grad_norm": 2.5742091961939892,
"learning_rate": 9.054431438154745e-06,
"loss": 0.613,
"step": 1345
},
{
"epoch": 0.28014110811371656,
"grad_norm": 2.5626823209269642,
"learning_rate": 9.043807267869403e-06,
"loss": 0.624,
"step": 1350
},
{
"epoch": 0.28117866777339695,
"grad_norm": 2.4910808560943942,
"learning_rate": 9.033130051798725e-06,
"loss": 0.6314,
"step": 1355
},
{
"epoch": 0.2822162274330774,
"grad_norm": 2.4573499982125524,
"learning_rate": 9.022399930004106e-06,
"loss": 0.625,
"step": 1360
},
{
"epoch": 0.28325378709275784,
"grad_norm": 2.531960067205625,
"learning_rate": 9.011617043240956e-06,
"loss": 0.6261,
"step": 1365
},
{
"epoch": 0.2842913467524383,
"grad_norm": 2.6336708253712917,
"learning_rate": 9.000781532956844e-06,
"loss": 0.6057,
"step": 1370
},
{
"epoch": 0.2853289064121187,
"grad_norm": 2.5416291249522063,
"learning_rate": 8.989893541289636e-06,
"loss": 0.6114,
"step": 1375
},
{
"epoch": 0.2863664660717991,
"grad_norm": 2.6016836625846227,
"learning_rate": 8.978953211065645e-06,
"loss": 0.6308,
"step": 1380
},
{
"epoch": 0.28740402573147955,
"grad_norm": 2.670421258864357,
"learning_rate": 8.96796068579774e-06,
"loss": 0.6373,
"step": 1385
},
{
"epoch": 0.28844158539116,
"grad_norm": 2.6256018056635195,
"learning_rate": 8.956916109683488e-06,
"loss": 0.6136,
"step": 1390
},
{
"epoch": 0.28947914505084044,
"grad_norm": 2.665986383822575,
"learning_rate": 8.945819627603235e-06,
"loss": 0.6294,
"step": 1395
},
{
"epoch": 0.2905167047105209,
"grad_norm": 2.620184006600422,
"learning_rate": 8.934671385118224e-06,
"loss": 0.6154,
"step": 1400
},
{
"epoch": 0.29155426437020127,
"grad_norm": 2.5773178200606797,
"learning_rate": 8.923471528468675e-06,
"loss": 0.6263,
"step": 1405
},
{
"epoch": 0.2925918240298817,
"grad_norm": 2.528035051025028,
"learning_rate": 8.912220204571878e-06,
"loss": 0.6139,
"step": 1410
},
{
"epoch": 0.29362938368956215,
"grad_norm": 2.6830704473334053,
"learning_rate": 8.900917561020255e-06,
"loss": 0.6256,
"step": 1415
},
{
"epoch": 0.2946669433492426,
"grad_norm": 2.5701558374316256,
"learning_rate": 8.889563746079428e-06,
"loss": 0.6163,
"step": 1420
},
{
"epoch": 0.29570450300892304,
"grad_norm": 2.542727546558585,
"learning_rate": 8.878158908686276e-06,
"loss": 0.6214,
"step": 1425
},
{
"epoch": 0.2967420626686034,
"grad_norm": 2.6289522750495684,
"learning_rate": 8.86670319844698e-06,
"loss": 0.6184,
"step": 1430
},
{
"epoch": 0.29777962232828387,
"grad_norm": 2.605979434187302,
"learning_rate": 8.855196765635055e-06,
"loss": 0.6148,
"step": 1435
},
{
"epoch": 0.2988171819879643,
"grad_norm": 2.5378051705763136,
"learning_rate": 8.843639761189392e-06,
"loss": 0.6309,
"step": 1440
},
{
"epoch": 0.29985474164764475,
"grad_norm": 3.122542869019648,
"learning_rate": 8.83203233671226e-06,
"loss": 0.6148,
"step": 1445
},
{
"epoch": 0.3008923013073252,
"grad_norm": 2.7847080279432355,
"learning_rate": 8.820374644467334e-06,
"loss": 0.6149,
"step": 1450
},
{
"epoch": 0.3019298609670056,
"grad_norm": 2.7571745781556176,
"learning_rate": 8.808666837377688e-06,
"loss": 0.6043,
"step": 1455
},
{
"epoch": 0.302967420626686,
"grad_norm": 2.6729505157429645,
"learning_rate": 8.796909069023793e-06,
"loss": 0.6091,
"step": 1460
},
{
"epoch": 0.30400498028636647,
"grad_norm": 2.5670622136089123,
"learning_rate": 8.7851014936415e-06,
"loss": 0.5973,
"step": 1465
},
{
"epoch": 0.3050425399460469,
"grad_norm": 2.5809462309222586,
"learning_rate": 8.77324426612002e-06,
"loss": 0.601,
"step": 1470
},
{
"epoch": 0.30608009960572735,
"grad_norm": 2.6928125038468695,
"learning_rate": 8.761337541999884e-06,
"loss": 0.603,
"step": 1475
},
{
"epoch": 0.30711765926540774,
"grad_norm": 2.5859004844354887,
"learning_rate": 8.749381477470915e-06,
"loss": 0.5902,
"step": 1480
},
{
"epoch": 0.3081552189250882,
"grad_norm": 2.706857250295533,
"learning_rate": 8.73737622937017e-06,
"loss": 0.6068,
"step": 1485
},
{
"epoch": 0.3091927785847686,
"grad_norm": 2.5825402406347995,
"learning_rate": 8.725321955179886e-06,
"loss": 0.5943,
"step": 1490
},
{
"epoch": 0.31023033824444907,
"grad_norm": 2.564859545646307,
"learning_rate": 8.713218813025412e-06,
"loss": 0.6166,
"step": 1495
},
{
"epoch": 0.3112678979041295,
"grad_norm": 2.4788537811048514,
"learning_rate": 8.70106696167314e-06,
"loss": 0.6107,
"step": 1500
},
{
"epoch": 0.3123054575638099,
"grad_norm": 2.5086302932261857,
"learning_rate": 8.688866560528414e-06,
"loss": 0.5953,
"step": 1505
},
{
"epoch": 0.31334301722349034,
"grad_norm": 2.641157195225993,
"learning_rate": 8.676617769633449e-06,
"loss": 0.5942,
"step": 1510
},
{
"epoch": 0.3143805768831708,
"grad_norm": 2.5041601695953952,
"learning_rate": 8.66432074966522e-06,
"loss": 0.614,
"step": 1515
},
{
"epoch": 0.3154181365428512,
"grad_norm": 2.6463153946217988,
"learning_rate": 8.651975661933368e-06,
"loss": 0.6046,
"step": 1520
},
{
"epoch": 0.31645569620253167,
"grad_norm": 2.5329735196813186,
"learning_rate": 8.639582668378068e-06,
"loss": 0.5939,
"step": 1525
},
{
"epoch": 0.31749325586221205,
"grad_norm": 2.626444758743183,
"learning_rate": 8.627141931567918e-06,
"loss": 0.5955,
"step": 1530
},
{
"epoch": 0.3185308155218925,
"grad_norm": 2.4901380695636672,
"learning_rate": 8.614653614697804e-06,
"loss": 0.5887,
"step": 1535
},
{
"epoch": 0.31956837518157294,
"grad_norm": 2.562038119980849,
"learning_rate": 8.602117881586748e-06,
"loss": 0.5887,
"step": 1540
},
{
"epoch": 0.3206059348412534,
"grad_norm": 2.705668712762943,
"learning_rate": 8.589534896675782e-06,
"loss": 0.6155,
"step": 1545
},
{
"epoch": 0.3216434945009338,
"grad_norm": 2.4700710695591144,
"learning_rate": 8.576904825025763e-06,
"loss": 0.5805,
"step": 1550
},
{
"epoch": 0.3226810541606142,
"grad_norm": 2.569997778071618,
"learning_rate": 8.56422783231523e-06,
"loss": 0.5861,
"step": 1555
},
{
"epoch": 0.32371861382029465,
"grad_norm": 2.6800463634463454,
"learning_rate": 8.551504084838217e-06,
"loss": 0.5888,
"step": 1560
},
{
"epoch": 0.3247561734799751,
"grad_norm": 2.614913869620652,
"learning_rate": 8.538733749502084e-06,
"loss": 0.5916,
"step": 1565
},
{
"epoch": 0.32579373313965554,
"grad_norm": 2.5392913094762015,
"learning_rate": 8.525916993825312e-06,
"loss": 0.5845,
"step": 1570
},
{
"epoch": 0.326831292799336,
"grad_norm": 2.5970061681752714,
"learning_rate": 8.51305398593532e-06,
"loss": 0.5885,
"step": 1575
},
{
"epoch": 0.32786885245901637,
"grad_norm": 2.892436571898512,
"learning_rate": 8.50014489456625e-06,
"loss": 0.5735,
"step": 1580
},
{
"epoch": 0.3289064121186968,
"grad_norm": 2.6138781972835874,
"learning_rate": 8.487189889056758e-06,
"loss": 0.5559,
"step": 1585
},
{
"epoch": 0.32994397177837725,
"grad_norm": 2.623576409499823,
"learning_rate": 8.474189139347795e-06,
"loss": 0.5846,
"step": 1590
},
{
"epoch": 0.3309815314380577,
"grad_norm": 2.468823382349316,
"learning_rate": 8.461142815980368e-06,
"loss": 0.5986,
"step": 1595
},
{
"epoch": 0.33201909109773814,
"grad_norm": 2.672237640145444,
"learning_rate": 8.448051090093315e-06,
"loss": 0.591,
"step": 1600
},
{
"epoch": 0.3330566507574185,
"grad_norm": 2.4728672068736937,
"learning_rate": 8.434914133421053e-06,
"loss": 0.5845,
"step": 1605
},
{
"epoch": 0.33409421041709897,
"grad_norm": 2.5895072383457087,
"learning_rate": 8.421732118291326e-06,
"loss": 0.5782,
"step": 1610
},
{
"epoch": 0.3351317700767794,
"grad_norm": 2.5040782851199137,
"learning_rate": 8.408505217622942e-06,
"loss": 0.5815,
"step": 1615
},
{
"epoch": 0.33616932973645985,
"grad_norm": 3.6988626884139846,
"learning_rate": 8.395233604923515e-06,
"loss": 0.5843,
"step": 1620
},
{
"epoch": 0.3372068893961403,
"grad_norm": 2.5828684207028547,
"learning_rate": 8.381917454287175e-06,
"loss": 0.5793,
"step": 1625
},
{
"epoch": 0.3382444490558207,
"grad_norm": 2.638686184144453,
"learning_rate": 8.368556940392295e-06,
"loss": 0.5841,
"step": 1630
},
{
"epoch": 0.3392820087155011,
"grad_norm": 2.650538574655116,
"learning_rate": 8.355152238499192e-06,
"loss": 0.5875,
"step": 1635
},
{
"epoch": 0.34031956837518157,
"grad_norm": 2.486723068143089,
"learning_rate": 8.341703524447834e-06,
"loss": 0.5752,
"step": 1640
},
{
"epoch": 0.341357128034862,
"grad_norm": 2.542548425745693,
"learning_rate": 8.328210974655534e-06,
"loss": 0.582,
"step": 1645
},
{
"epoch": 0.34239468769454245,
"grad_norm": 2.5238474016255346,
"learning_rate": 8.314674766114625e-06,
"loss": 0.5886,
"step": 1650
},
{
"epoch": 0.34343224735422284,
"grad_norm": 2.6663652283930164,
"learning_rate": 8.301095076390151e-06,
"loss": 0.5703,
"step": 1655
},
{
"epoch": 0.3444698070139033,
"grad_norm": 2.5916048021671765,
"learning_rate": 8.287472083617534e-06,
"loss": 0.5578,
"step": 1660
},
{
"epoch": 0.3455073666735837,
"grad_norm": 2.64819187278271,
"learning_rate": 8.273805966500233e-06,
"loss": 0.566,
"step": 1665
},
{
"epoch": 0.34654492633326417,
"grad_norm": 2.7355426760241115,
"learning_rate": 8.260096904307404e-06,
"loss": 0.5724,
"step": 1670
},
{
"epoch": 0.3475824859929446,
"grad_norm": 2.5288788248400422,
"learning_rate": 8.246345076871548e-06,
"loss": 0.5852,
"step": 1675
},
{
"epoch": 0.348620045652625,
"grad_norm": 2.4333668936065727,
"learning_rate": 8.232550664586145e-06,
"loss": 0.562,
"step": 1680
},
{
"epoch": 0.34965760531230544,
"grad_norm": 2.666095931772747,
"learning_rate": 8.218713848403306e-06,
"loss": 0.5761,
"step": 1685
},
{
"epoch": 0.3506951649719859,
"grad_norm": 2.585591858891525,
"learning_rate": 8.204834809831377e-06,
"loss": 0.579,
"step": 1690
},
{
"epoch": 0.3517327246316663,
"grad_norm": 2.5419925763820457,
"learning_rate": 8.190913730932567e-06,
"loss": 0.5792,
"step": 1695
},
{
"epoch": 0.35277028429134677,
"grad_norm": 2.5691748657197575,
"learning_rate": 8.176950794320572e-06,
"loss": 0.5647,
"step": 1700
},
{
"epoch": 0.35380784395102716,
"grad_norm": 2.5977820134449603,
"learning_rate": 8.16294618315816e-06,
"loss": 0.5708,
"step": 1705
},
{
"epoch": 0.3548454036107076,
"grad_norm": 2.423522377595987,
"learning_rate": 8.148900081154773e-06,
"loss": 0.5666,
"step": 1710
},
{
"epoch": 0.35588296327038804,
"grad_norm": 2.569485264156882,
"learning_rate": 8.134812672564131e-06,
"loss": 0.5504,
"step": 1715
},
{
"epoch": 0.3569205229300685,
"grad_norm": 2.758198111046606,
"learning_rate": 8.1206841421818e-06,
"loss": 0.5691,
"step": 1720
},
{
"epoch": 0.3579580825897489,
"grad_norm": 2.5786351112814545,
"learning_rate": 8.10651467534277e-06,
"loss": 0.57,
"step": 1725
},
{
"epoch": 0.35899564224942937,
"grad_norm": 2.4962078269510015,
"learning_rate": 8.092304457919028e-06,
"loss": 0.557,
"step": 1730
},
{
"epoch": 0.36003320190910976,
"grad_norm": 2.5552381714765384,
"learning_rate": 8.078053676317124e-06,
"loss": 0.5673,
"step": 1735
},
{
"epoch": 0.3610707615687902,
"grad_norm": 2.611892754049072,
"learning_rate": 8.06376251747571e-06,
"loss": 0.5535,
"step": 1740
},
{
"epoch": 0.36210832122847064,
"grad_norm": 2.4981862280677545,
"learning_rate": 8.049431168863107e-06,
"loss": 0.5543,
"step": 1745
},
{
"epoch": 0.3631458808881511,
"grad_norm": 2.590859008799774,
"learning_rate": 8.035059818474833e-06,
"loss": 0.5688,
"step": 1750
},
{
"epoch": 0.3641834405478315,
"grad_norm": 2.7049066460444657,
"learning_rate": 8.02064865483114e-06,
"loss": 0.5666,
"step": 1755
},
{
"epoch": 0.3652210002075119,
"grad_norm": 2.50555658991844,
"learning_rate": 8.00619786697454e-06,
"loss": 0.553,
"step": 1760
},
{
"epoch": 0.36625855986719236,
"grad_norm": 2.4491820476275805,
"learning_rate": 7.991707644467335e-06,
"loss": 0.5635,
"step": 1765
},
{
"epoch": 0.3672961195268728,
"grad_norm": 2.560789498381096,
"learning_rate": 7.97717817738911e-06,
"loss": 0.5408,
"step": 1770
},
{
"epoch": 0.36833367918655324,
"grad_norm": 2.6052621436325416,
"learning_rate": 7.962609656334262e-06,
"loss": 0.5488,
"step": 1775
},
{
"epoch": 0.3693712388462337,
"grad_norm": 2.5278436185793582,
"learning_rate": 7.94800227240948e-06,
"loss": 0.5573,
"step": 1780
},
{
"epoch": 0.37040879850591407,
"grad_norm": 2.5168973470980607,
"learning_rate": 7.933356217231261e-06,
"loss": 0.5358,
"step": 1785
},
{
"epoch": 0.3714463581655945,
"grad_norm": 2.5427476959285475,
"learning_rate": 7.918671682923371e-06,
"loss": 0.557,
"step": 1790
},
{
"epoch": 0.37248391782527496,
"grad_norm": 2.5597051499240315,
"learning_rate": 7.90394886211434e-06,
"loss": 0.5443,
"step": 1795
},
{
"epoch": 0.3735214774849554,
"grad_norm": 2.6134854223700033,
"learning_rate": 7.889187947934939e-06,
"loss": 0.5643,
"step": 1800
},
{
"epoch": 0.37455903714463584,
"grad_norm": 2.5638490649548507,
"learning_rate": 7.874389134015627e-06,
"loss": 0.5515,
"step": 1805
},
{
"epoch": 0.37559659680431623,
"grad_norm": 2.638245607620569,
"learning_rate": 7.859552614484035e-06,
"loss": 0.5512,
"step": 1810
},
{
"epoch": 0.37663415646399667,
"grad_norm": 2.5439284853350683,
"learning_rate": 7.844678583962403e-06,
"loss": 0.5357,
"step": 1815
},
{
"epoch": 0.3776717161236771,
"grad_norm": 2.7382120697501264,
"learning_rate": 7.829767237565027e-06,
"loss": 0.5499,
"step": 1820
},
{
"epoch": 0.37870927578335756,
"grad_norm": 2.693790343217592,
"learning_rate": 7.814818770895718e-06,
"loss": 0.5447,
"step": 1825
},
{
"epoch": 0.379746835443038,
"grad_norm": 2.5002163516357685,
"learning_rate": 7.79983338004521e-06,
"loss": 0.5548,
"step": 1830
},
{
"epoch": 0.3807843951027184,
"grad_norm": 2.5140469437451514,
"learning_rate": 7.784811261588605e-06,
"loss": 0.5396,
"step": 1835
},
{
"epoch": 0.38182195476239883,
"grad_norm": 2.3632709986503384,
"learning_rate": 7.769752612582793e-06,
"loss": 0.5455,
"step": 1840
},
{
"epoch": 0.3828595144220793,
"grad_norm": 2.6174115493874415,
"learning_rate": 7.754657630563855e-06,
"loss": 0.5501,
"step": 1845
},
{
"epoch": 0.3838970740817597,
"grad_norm": 2.662869705594198,
"learning_rate": 7.739526513544492e-06,
"loss": 0.5458,
"step": 1850
},
{
"epoch": 0.38493463374144016,
"grad_norm": 2.595342018363226,
"learning_rate": 7.724359460011406e-06,
"loss": 0.5484,
"step": 1855
},
{
"epoch": 0.38597219340112054,
"grad_norm": 2.574988659923898,
"learning_rate": 7.709156668922715e-06,
"loss": 0.5465,
"step": 1860
},
{
"epoch": 0.387009753060801,
"grad_norm": 2.696253501363756,
"learning_rate": 7.693918339705327e-06,
"loss": 0.5416,
"step": 1865
},
{
"epoch": 0.38804731272048143,
"grad_norm": 2.571358317228633,
"learning_rate": 7.678644672252334e-06,
"loss": 0.5432,
"step": 1870
},
{
"epoch": 0.3890848723801619,
"grad_norm": 2.6773383342412007,
"learning_rate": 7.663335866920389e-06,
"loss": 0.5435,
"step": 1875
},
{
"epoch": 0.3901224320398423,
"grad_norm": 2.605715400204335,
"learning_rate": 7.647992124527076e-06,
"loss": 0.5394,
"step": 1880
},
{
"epoch": 0.3911599916995227,
"grad_norm": 2.542289996117672,
"learning_rate": 7.632613646348273e-06,
"loss": 0.5365,
"step": 1885
},
{
"epoch": 0.39219755135920314,
"grad_norm": 2.640322094229756,
"learning_rate": 7.617200634115516e-06,
"loss": 0.5473,
"step": 1890
},
{
"epoch": 0.3932351110188836,
"grad_norm": 2.5708221170868355,
"learning_rate": 7.601753290013353e-06,
"loss": 0.5209,
"step": 1895
},
{
"epoch": 0.39427267067856403,
"grad_norm": 2.525915351063816,
"learning_rate": 7.586271816676687e-06,
"loss": 0.5288,
"step": 1900
},
{
"epoch": 0.3953102303382445,
"grad_norm": 2.5364307727837234,
"learning_rate": 7.570756417188123e-06,
"loss": 0.5429,
"step": 1905
},
{
"epoch": 0.39634778999792486,
"grad_norm": 2.559190451038623,
"learning_rate": 7.555207295075303e-06,
"loss": 0.5128,
"step": 1910
},
{
"epoch": 0.3973853496576053,
"grad_norm": 2.7079589585625095,
"learning_rate": 7.539624654308231e-06,
"loss": 0.5333,
"step": 1915
},
{
"epoch": 0.39842290931728574,
"grad_norm": 2.4387259072975267,
"learning_rate": 7.5240086992966045e-06,
"loss": 0.5334,
"step": 1920
},
{
"epoch": 0.3994604689769662,
"grad_norm": 2.5588774252471818,
"learning_rate": 7.508359634887128e-06,
"loss": 0.5429,
"step": 1925
},
{
"epoch": 0.40049802863664663,
"grad_norm": 2.50003757681576,
"learning_rate": 7.4926776663608305e-06,
"loss": 0.5353,
"step": 1930
},
{
"epoch": 0.401535588296327,
"grad_norm": 2.471663404094866,
"learning_rate": 7.476962999430368e-06,
"loss": 0.5373,
"step": 1935
},
{
"epoch": 0.40257314795600746,
"grad_norm": 2.4541757324292215,
"learning_rate": 7.461215840237329e-06,
"loss": 0.5278,
"step": 1940
},
{
"epoch": 0.4036107076156879,
"grad_norm": 2.5133909898638738,
"learning_rate": 7.4454363953495255e-06,
"loss": 0.5224,
"step": 1945
},
{
"epoch": 0.40464826727536835,
"grad_norm": 2.456759039414412,
"learning_rate": 7.429624871758289e-06,
"loss": 0.5274,
"step": 1950
},
{
"epoch": 0.4056858269350488,
"grad_norm": 2.6254399143011025,
"learning_rate": 7.41378147687575e-06,
"loss": 0.539,
"step": 1955
},
{
"epoch": 0.4067233865947292,
"grad_norm": 2.77782256038311,
"learning_rate": 7.397906418532124e-06,
"loss": 0.5255,
"step": 1960
},
{
"epoch": 0.4077609462544096,
"grad_norm": 2.401784319404263,
"learning_rate": 7.381999904972974e-06,
"loss": 0.5373,
"step": 1965
},
{
"epoch": 0.40879850591409006,
"grad_norm": 2.4340739184784996,
"learning_rate": 7.366062144856494e-06,
"loss": 0.5292,
"step": 1970
},
{
"epoch": 0.4098360655737705,
"grad_norm": 2.5139056739660104,
"learning_rate": 7.350093347250754e-06,
"loss": 0.524,
"step": 1975
},
{
"epoch": 0.41087362523345095,
"grad_norm": 2.513276255127495,
"learning_rate": 7.334093721630976e-06,
"loss": 0.5231,
"step": 1980
},
{
"epoch": 0.41191118489313133,
"grad_norm": 2.6310824706160636,
"learning_rate": 7.318063477876775e-06,
"loss": 0.5233,
"step": 1985
},
{
"epoch": 0.4129487445528118,
"grad_norm": 2.491475901603011,
"learning_rate": 7.302002826269401e-06,
"loss": 0.5341,
"step": 1990
},
{
"epoch": 0.4139863042124922,
"grad_norm": 2.506660610799657,
"learning_rate": 7.285911977488995e-06,
"loss": 0.5182,
"step": 1995
},
{
"epoch": 0.41502386387217266,
"grad_norm": 2.4794467040525507,
"learning_rate": 7.269791142611819e-06,
"loss": 0.5305,
"step": 2000
},
{
"epoch": 0.4160614235318531,
"grad_norm": 2.4409844087069197,
"learning_rate": 7.253640533107482e-06,
"loss": 0.5097,
"step": 2005
},
{
"epoch": 0.4170989831915335,
"grad_norm": 2.5831589194501334,
"learning_rate": 7.23746036083617e-06,
"loss": 0.509,
"step": 2010
},
{
"epoch": 0.41813654285121393,
"grad_norm": 2.5848519997923134,
"learning_rate": 7.221250838045866e-06,
"loss": 0.5212,
"step": 2015
},
{
"epoch": 0.4191741025108944,
"grad_norm": 2.4649428551198507,
"learning_rate": 7.205012177369573e-06,
"loss": 0.5097,
"step": 2020
},
{
"epoch": 0.4202116621705748,
"grad_norm": 2.6470337729349334,
"learning_rate": 7.188744591822514e-06,
"loss": 0.5265,
"step": 2025
},
{
"epoch": 0.42124922183025526,
"grad_norm": 2.6777002487915427,
"learning_rate": 7.17244829479934e-06,
"loss": 0.5132,
"step": 2030
},
{
"epoch": 0.42228678148993565,
"grad_norm": 2.628723636023145,
"learning_rate": 7.156123500071337e-06,
"loss": 0.5383,
"step": 2035
},
{
"epoch": 0.4233243411496161,
"grad_norm": 2.6077956409975322,
"learning_rate": 7.139770421783616e-06,
"loss": 0.5143,
"step": 2040
},
{
"epoch": 0.42436190080929653,
"grad_norm": 2.5195148593267165,
"learning_rate": 7.1233892744523055e-06,
"loss": 0.5292,
"step": 2045
},
{
"epoch": 0.425399460468977,
"grad_norm": 2.4869546800490276,
"learning_rate": 7.1069802729617385e-06,
"loss": 0.5219,
"step": 2050
},
{
"epoch": 0.4264370201286574,
"grad_norm": 2.498457038824186,
"learning_rate": 7.090543632561632e-06,
"loss": 0.5227,
"step": 2055
},
{
"epoch": 0.4274745797883378,
"grad_norm": 2.5334280776647855,
"learning_rate": 7.0740795688642635e-06,
"loss": 0.5174,
"step": 2060
},
{
"epoch": 0.42851213944801825,
"grad_norm": 2.4142684408837667,
"learning_rate": 7.057588297841645e-06,
"loss": 0.5154,
"step": 2065
},
{
"epoch": 0.4295496991076987,
"grad_norm": 2.7421341811351967,
"learning_rate": 7.041070035822687e-06,
"loss": 0.4983,
"step": 2070
},
{
"epoch": 0.43058725876737913,
"grad_norm": 2.385233500154077,
"learning_rate": 7.024524999490364e-06,
"loss": 0.535,
"step": 2075
},
{
"epoch": 0.4316248184270596,
"grad_norm": 2.493022029900477,
"learning_rate": 7.007953405878867e-06,
"loss": 0.5036,
"step": 2080
},
{
"epoch": 0.43266237808673996,
"grad_norm": 2.5007769964543507,
"learning_rate": 6.991355472370762e-06,
"loss": 0.5288,
"step": 2085
},
{
"epoch": 0.4336999377464204,
"grad_norm": 2.4139386188608842,
"learning_rate": 6.974731416694135e-06,
"loss": 0.5142,
"step": 2090
},
{
"epoch": 0.43473749740610085,
"grad_norm": 2.5053762871669303,
"learning_rate": 6.958081456919737e-06,
"loss": 0.502,
"step": 2095
},
{
"epoch": 0.4357750570657813,
"grad_norm": 2.5446406360038267,
"learning_rate": 6.941405811458126e-06,
"loss": 0.5079,
"step": 2100
},
{
"epoch": 0.43681261672546173,
"grad_norm": 2.5024786886158465,
"learning_rate": 6.924704699056792e-06,
"loss": 0.5102,
"step": 2105
},
{
"epoch": 0.4378501763851421,
"grad_norm": 2.5991293471527754,
"learning_rate": 6.907978338797304e-06,
"loss": 0.5033,
"step": 2110
},
{
"epoch": 0.43888773604482256,
"grad_norm": 2.45095417799665,
"learning_rate": 6.891226950092422e-06,
"loss": 0.5033,
"step": 2115
},
{
"epoch": 0.439925295704503,
"grad_norm": 2.5068078607240762,
"learning_rate": 6.874450752683223e-06,
"loss": 0.5131,
"step": 2120
},
{
"epoch": 0.44096285536418345,
"grad_norm": 2.615502411671901,
"learning_rate": 6.85764996663622e-06,
"loss": 0.514,
"step": 2125
},
{
"epoch": 0.4420004150238639,
"grad_norm": 2.483179103106064,
"learning_rate": 6.840824812340476e-06,
"loss": 0.482,
"step": 2130
},
{
"epoch": 0.4430379746835443,
"grad_norm": 2.615056810287462,
"learning_rate": 6.82397551050471e-06,
"loss": 0.4981,
"step": 2135
},
{
"epoch": 0.4440755343432247,
"grad_norm": 2.5081191557582962,
"learning_rate": 6.807102282154406e-06,
"loss": 0.5038,
"step": 2140
},
{
"epoch": 0.44511309400290516,
"grad_norm": 2.4228335632525857,
"learning_rate": 6.790205348628902e-06,
"loss": 0.5116,
"step": 2145
},
{
"epoch": 0.4461506536625856,
"grad_norm": 2.4865567217795834,
"learning_rate": 6.773284931578508e-06,
"loss": 0.4923,
"step": 2150
},
{
"epoch": 0.44718821332226605,
"grad_norm": 2.5028391081664707,
"learning_rate": 6.756341252961575e-06,
"loss": 0.507,
"step": 2155
},
{
"epoch": 0.44822577298194644,
"grad_norm": 2.4437292124023755,
"learning_rate": 6.739374535041601e-06,
"loss": 0.5041,
"step": 2160
},
{
"epoch": 0.4492633326416269,
"grad_norm": 2.4804781219836705,
"learning_rate": 6.722385000384305e-06,
"loss": 0.5071,
"step": 2165
},
{
"epoch": 0.4503008923013073,
"grad_norm": 2.613493029619529,
"learning_rate": 6.705372871854713e-06,
"loss": 0.5045,
"step": 2170
},
{
"epoch": 0.45133845196098776,
"grad_norm": 2.5550452506734613,
"learning_rate": 6.688338372614232e-06,
"loss": 0.4954,
"step": 2175
},
{
"epoch": 0.4523760116206682,
"grad_norm": 2.5823699134656795,
"learning_rate": 6.671281726117721e-06,
"loss": 0.5029,
"step": 2180
},
{
"epoch": 0.4534135712803486,
"grad_norm": 2.6550826128938287,
"learning_rate": 6.654203156110565e-06,
"loss": 0.4942,
"step": 2185
},
{
"epoch": 0.45445113094002904,
"grad_norm": 2.4268474548849976,
"learning_rate": 6.6371028866257355e-06,
"loss": 0.5027,
"step": 2190
},
{
"epoch": 0.4554886905997095,
"grad_norm": 2.437142297358699,
"learning_rate": 6.6199811419808525e-06,
"loss": 0.4949,
"step": 2195
},
{
"epoch": 0.4565262502593899,
"grad_norm": 2.494477319925722,
"learning_rate": 6.602838146775243e-06,
"loss": 0.4796,
"step": 2200
},
{
"epoch": 0.45756380991907036,
"grad_norm": 2.5244330207655223,
"learning_rate": 6.585674125886996e-06,
"loss": 0.5066,
"step": 2205
},
{
"epoch": 0.45860136957875075,
"grad_norm": 2.6127909654783608,
"learning_rate": 6.568489304470007e-06,
"loss": 0.4909,
"step": 2210
},
{
"epoch": 0.4596389292384312,
"grad_norm": 2.5733537948603105,
"learning_rate": 6.551283907951031e-06,
"loss": 0.4886,
"step": 2215
},
{
"epoch": 0.46067648889811164,
"grad_norm": 2.627653050074194,
"learning_rate": 6.534058162026724e-06,
"loss": 0.4871,
"step": 2220
},
{
"epoch": 0.4617140485577921,
"grad_norm": 2.493707567539853,
"learning_rate": 6.516812292660675e-06,
"loss": 0.5115,
"step": 2225
},
{
"epoch": 0.4627516082174725,
"grad_norm": 2.576298574074138,
"learning_rate": 6.499546526080457e-06,
"loss": 0.4935,
"step": 2230
},
{
"epoch": 0.46378916787715296,
"grad_norm": 2.3441840780878227,
"learning_rate": 6.482261088774642e-06,
"loss": 0.4918,
"step": 2235
},
{
"epoch": 0.46482672753683335,
"grad_norm": 2.5855232225238702,
"learning_rate": 6.464956207489843e-06,
"loss": 0.5009,
"step": 2240
},
{
"epoch": 0.4658642871965138,
"grad_norm": 2.578617818062702,
"learning_rate": 6.447632109227735e-06,
"loss": 0.4931,
"step": 2245
},
{
"epoch": 0.46690184685619424,
"grad_norm": 2.6300654942955752,
"learning_rate": 6.4302890212420735e-06,
"loss": 0.4924,
"step": 2250
},
{
"epoch": 0.4679394065158747,
"grad_norm": 2.535272976711584,
"learning_rate": 6.412927171035721e-06,
"loss": 0.4864,
"step": 2255
},
{
"epoch": 0.4689769661755551,
"grad_norm": 2.497392960102334,
"learning_rate": 6.3955467863576555e-06,
"loss": 0.502,
"step": 2260
},
{
"epoch": 0.4700145258352355,
"grad_norm": 2.476885100955309,
"learning_rate": 6.37814809519999e-06,
"loss": 0.498,
"step": 2265
},
{
"epoch": 0.47105208549491595,
"grad_norm": 2.3650744048382726,
"learning_rate": 6.360731325794975e-06,
"loss": 0.486,
"step": 2270
},
{
"epoch": 0.4720896451545964,
"grad_norm": 2.469078585494707,
"learning_rate": 6.343296706612008e-06,
"loss": 0.4745,
"step": 2275
},
{
"epoch": 0.47312720481427684,
"grad_norm": 2.5527304594928326,
"learning_rate": 6.325844466354637e-06,
"loss": 0.4959,
"step": 2280
},
{
"epoch": 0.4741647644739573,
"grad_norm": 2.388876155874213,
"learning_rate": 6.308374833957556e-06,
"loss": 0.4787,
"step": 2285
},
{
"epoch": 0.47520232413363767,
"grad_norm": 2.642090924554838,
"learning_rate": 6.290888038583611e-06,
"loss": 0.4951,
"step": 2290
},
{
"epoch": 0.4762398837933181,
"grad_norm": 2.5493215911731557,
"learning_rate": 6.273384309620785e-06,
"loss": 0.4799,
"step": 2295
},
{
"epoch": 0.47727744345299855,
"grad_norm": 2.3751142920110944,
"learning_rate": 6.25586387667919e-06,
"loss": 0.4841,
"step": 2300
},
{
"epoch": 0.478315003112679,
"grad_norm": 2.4695590079808625,
"learning_rate": 6.238326969588062e-06,
"loss": 0.4739,
"step": 2305
},
{
"epoch": 0.47935256277235944,
"grad_norm": 2.6302561474616937,
"learning_rate": 6.220773818392738e-06,
"loss": 0.4809,
"step": 2310
},
{
"epoch": 0.4803901224320398,
"grad_norm": 2.5254595904398744,
"learning_rate": 6.203204653351642e-06,
"loss": 0.4964,
"step": 2315
},
{
"epoch": 0.48142768209172027,
"grad_norm": 2.4902195244274288,
"learning_rate": 6.185619704933267e-06,
"loss": 0.4654,
"step": 2320
},
{
"epoch": 0.4824652417514007,
"grad_norm": 2.5224684668035007,
"learning_rate": 6.168019203813143e-06,
"loss": 0.479,
"step": 2325
},
{
"epoch": 0.48350280141108115,
"grad_norm": 2.531197814844583,
"learning_rate": 6.15040338087082e-06,
"loss": 0.4756,
"step": 2330
},
{
"epoch": 0.4845403610707616,
"grad_norm": 2.3546555565826135,
"learning_rate": 6.132772467186841e-06,
"loss": 0.4649,
"step": 2335
},
{
"epoch": 0.485577920730442,
"grad_norm": 2.3784120564924622,
"learning_rate": 6.115126694039699e-06,
"loss": 0.4709,
"step": 2340
},
{
"epoch": 0.4866154803901224,
"grad_norm": 2.56815940386139,
"learning_rate": 6.097466292902815e-06,
"loss": 0.486,
"step": 2345
},
{
"epoch": 0.48765304004980287,
"grad_norm": 2.5301845486075103,
"learning_rate": 6.079791495441491e-06,
"loss": 0.4754,
"step": 2350
},
{
"epoch": 0.4886905997094833,
"grad_norm": 2.4650710165826806,
"learning_rate": 6.062102533509886e-06,
"loss": 0.4663,
"step": 2355
},
{
"epoch": 0.48972815936916375,
"grad_norm": 2.7587452612942145,
"learning_rate": 6.044399639147957e-06,
"loss": 0.4632,
"step": 2360
},
{
"epoch": 0.49076571902884414,
"grad_norm": 2.543480240365517,
"learning_rate": 6.026683044578427e-06,
"loss": 0.4689,
"step": 2365
},
{
"epoch": 0.4918032786885246,
"grad_norm": 2.4525809117166366,
"learning_rate": 6.008952982203737e-06,
"loss": 0.4843,
"step": 2370
},
{
"epoch": 0.492840838348205,
"grad_norm": 2.417071327082838,
"learning_rate": 5.991209684602991e-06,
"loss": 0.4677,
"step": 2375
},
{
"epoch": 0.49387839800788547,
"grad_norm": 2.8159423032357394,
"learning_rate": 5.9734533845289144e-06,
"loss": 0.466,
"step": 2380
},
{
"epoch": 0.4949159576675659,
"grad_norm": 2.5331145358141023,
"learning_rate": 5.955684314904795e-06,
"loss": 0.491,
"step": 2385
},
{
"epoch": 0.4959535173272463,
"grad_norm": 2.5850294725860103,
"learning_rate": 5.937902708821427e-06,
"loss": 0.4727,
"step": 2390
},
{
"epoch": 0.49699107698692674,
"grad_norm": 2.3455874492137627,
"learning_rate": 5.920108799534059e-06,
"loss": 0.4699,
"step": 2395
},
{
"epoch": 0.4980286366466072,
"grad_norm": 2.496007013879939,
"learning_rate": 5.902302820459324e-06,
"loss": 0.4599,
"step": 2400
},
{
"epoch": 0.4990661963062876,
"grad_norm": 2.412699996479322,
"learning_rate": 5.884485005172189e-06,
"loss": 0.474,
"step": 2405
},
{
"epoch": 0.500103755965968,
"grad_norm": 2.6197335120279805,
"learning_rate": 5.866655587402886e-06,
"loss": 0.4815,
"step": 2410
},
{
"epoch": 0.5011413156256485,
"grad_norm": 2.632568563481647,
"learning_rate": 5.8488148010338445e-06,
"loss": 0.474,
"step": 2415
},
{
"epoch": 0.5021788752853289,
"grad_norm": 2.383359491818944,
"learning_rate": 5.8309628800966225e-06,
"loss": 0.464,
"step": 2420
},
{
"epoch": 0.5032164349450093,
"grad_norm": 2.687191301722998,
"learning_rate": 5.813100058768841e-06,
"loss": 0.4671,
"step": 2425
},
{
"epoch": 0.5042539946046898,
"grad_norm": 2.455559246352499,
"learning_rate": 5.795226571371114e-06,
"loss": 0.4682,
"step": 2430
},
{
"epoch": 0.5052915542643702,
"grad_norm": 2.589166158482144,
"learning_rate": 5.777342652363963e-06,
"loss": 0.4756,
"step": 2435
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2.5173329990266295,
"learning_rate": 5.759448536344753e-06,
"loss": 0.4849,
"step": 2440
},
{
"epoch": 0.5073666735837311,
"grad_norm": 2.3813334648732227,
"learning_rate": 5.741544458044611e-06,
"loss": 0.4725,
"step": 2445
},
{
"epoch": 0.5084042332434114,
"grad_norm": 2.4645035637647683,
"learning_rate": 5.723630652325349e-06,
"loss": 0.4523,
"step": 2450
},
{
"epoch": 0.5094417929030919,
"grad_norm": 2.5330130007551848,
"learning_rate": 5.705707354176377e-06,
"loss": 0.4655,
"step": 2455
},
{
"epoch": 0.5104793525627723,
"grad_norm": 2.3938015606881886,
"learning_rate": 5.687774798711627e-06,
"loss": 0.468,
"step": 2460
},
{
"epoch": 0.5115169122224528,
"grad_norm": 2.647072227845224,
"learning_rate": 5.669833221166469e-06,
"loss": 0.4695,
"step": 2465
},
{
"epoch": 0.5125544718821332,
"grad_norm": 2.4713289117715544,
"learning_rate": 5.651882856894615e-06,
"loss": 0.4617,
"step": 2470
},
{
"epoch": 0.5135920315418137,
"grad_norm": 2.494015444510411,
"learning_rate": 5.633923941365049e-06,
"loss": 0.4659,
"step": 2475
},
{
"epoch": 0.5146295912014941,
"grad_norm": 2.3455915787636807,
"learning_rate": 5.615956710158921e-06,
"loss": 0.4563,
"step": 2480
},
{
"epoch": 0.5156671508611745,
"grad_norm": 2.453795440026443,
"learning_rate": 5.597981398966468e-06,
"loss": 0.4698,
"step": 2485
},
{
"epoch": 0.516704710520855,
"grad_norm": 2.3862841672431143,
"learning_rate": 5.579998243583919e-06,
"loss": 0.4583,
"step": 2490
},
{
"epoch": 0.5177422701805354,
"grad_norm": 2.412824706136417,
"learning_rate": 5.562007479910396e-06,
"loss": 0.4714,
"step": 2495
},
{
"epoch": 0.5187798298402159,
"grad_norm": 2.5121998422157543,
"learning_rate": 5.544009343944834e-06,
"loss": 0.4597,
"step": 2500
},
{
"epoch": 0.5198173894998962,
"grad_norm": 2.479588579717285,
"learning_rate": 5.526004071782868e-06,
"loss": 0.461,
"step": 2505
},
{
"epoch": 0.5208549491595766,
"grad_norm": 2.4676776770935365,
"learning_rate": 5.507991899613746e-06,
"loss": 0.4632,
"step": 2510
},
{
"epoch": 0.5218925088192571,
"grad_norm": 2.4315208171491838,
"learning_rate": 5.489973063717233e-06,
"loss": 0.4702,
"step": 2515
},
{
"epoch": 0.5229300684789375,
"grad_norm": 2.430974288239865,
"learning_rate": 5.471947800460502e-06,
"loss": 0.4389,
"step": 2520
},
{
"epoch": 0.523967628138618,
"grad_norm": 2.481418769699562,
"learning_rate": 5.453916346295043e-06,
"loss": 0.4516,
"step": 2525
},
{
"epoch": 0.5250051877982984,
"grad_norm": 2.454530070786792,
"learning_rate": 5.435878937753553e-06,
"loss": 0.4461,
"step": 2530
},
{
"epoch": 0.5260427474579789,
"grad_norm": 2.412430721171224,
"learning_rate": 5.417835811446839e-06,
"loss": 0.4516,
"step": 2535
},
{
"epoch": 0.5270803071176593,
"grad_norm": 2.5095500970858566,
"learning_rate": 5.3997872040607154e-06,
"loss": 0.4647,
"step": 2540
},
{
"epoch": 0.5281178667773397,
"grad_norm": 2.47241909883305,
"learning_rate": 5.3817333523528895e-06,
"loss": 0.4529,
"step": 2545
},
{
"epoch": 0.5291554264370202,
"grad_norm": 2.5164134259250677,
"learning_rate": 5.363674493149868e-06,
"loss": 0.4584,
"step": 2550
},
{
"epoch": 0.5301929860967005,
"grad_norm": 2.4698416937451073,
"learning_rate": 5.345610863343843e-06,
"loss": 0.4479,
"step": 2555
},
{
"epoch": 0.531230545756381,
"grad_norm": 2.388109532933269,
"learning_rate": 5.327542699889586e-06,
"loss": 0.4527,
"step": 2560
},
{
"epoch": 0.5322681054160614,
"grad_norm": 2.532256574198705,
"learning_rate": 5.309470239801343e-06,
"loss": 0.4541,
"step": 2565
},
{
"epoch": 0.5333056650757418,
"grad_norm": 2.414906181196464,
"learning_rate": 5.291393720149716e-06,
"loss": 0.4415,
"step": 2570
},
{
"epoch": 0.5343432247354223,
"grad_norm": 2.5860824474566146,
"learning_rate": 5.273313378058566e-06,
"loss": 0.4377,
"step": 2575
},
{
"epoch": 0.5353807843951027,
"grad_norm": 2.6418841514226967,
"learning_rate": 5.255229450701893e-06,
"loss": 0.4342,
"step": 2580
},
{
"epoch": 0.5364183440547832,
"grad_norm": 2.4497315903217247,
"learning_rate": 5.237142175300726e-06,
"loss": 0.4533,
"step": 2585
},
{
"epoch": 0.5374559037144636,
"grad_norm": 2.4935146868944265,
"learning_rate": 5.219051789120015e-06,
"loss": 0.44,
"step": 2590
},
{
"epoch": 0.538493463374144,
"grad_norm": 2.4091632402760346,
"learning_rate": 5.200958529465517e-06,
"loss": 0.454,
"step": 2595
},
{
"epoch": 0.5395310230338245,
"grad_norm": 2.460401303833881,
"learning_rate": 5.182862633680683e-06,
"loss": 0.4512,
"step": 2600
},
{
"epoch": 0.5405685826935048,
"grad_norm": 2.49980869010523,
"learning_rate": 5.164764339143542e-06,
"loss": 0.4531,
"step": 2605
},
{
"epoch": 0.5416061423531853,
"grad_norm": 2.3750857632390767,
"learning_rate": 5.14666388326359e-06,
"loss": 0.4548,
"step": 2610
},
{
"epoch": 0.5426437020128657,
"grad_norm": 2.402759567147983,
"learning_rate": 5.128561503478676e-06,
"loss": 0.4582,
"step": 2615
},
{
"epoch": 0.5436812616725462,
"grad_norm": 2.387820424005501,
"learning_rate": 5.110457437251886e-06,
"loss": 0.4413,
"step": 2620
},
{
"epoch": 0.5447188213322266,
"grad_norm": 2.3796099453754764,
"learning_rate": 5.092351922068427e-06,
"loss": 0.4524,
"step": 2625
},
{
"epoch": 0.545756380991907,
"grad_norm": 2.3669548551534563,
"learning_rate": 5.0742451954325156e-06,
"loss": 0.4473,
"step": 2630
},
{
"epoch": 0.5467939406515875,
"grad_norm": 2.463149244250711,
"learning_rate": 5.056137494864259e-06,
"loss": 0.447,
"step": 2635
},
{
"epoch": 0.5478315003112679,
"grad_norm": 2.3744815980543494,
"learning_rate": 5.0380290578965375e-06,
"loss": 0.4404,
"step": 2640
},
{
"epoch": 0.5488690599709484,
"grad_norm": 2.3391410347348796,
"learning_rate": 5.019920122071896e-06,
"loss": 0.4388,
"step": 2645
},
{
"epoch": 0.5499066196306288,
"grad_norm": 2.3768157404751085,
"learning_rate": 5.00181092493942e-06,
"loss": 0.4386,
"step": 2650
},
{
"epoch": 0.5509441792903091,
"grad_norm": 2.6059542471826824,
"learning_rate": 4.983701704051625e-06,
"loss": 0.4528,
"step": 2655
},
{
"epoch": 0.5519817389499896,
"grad_norm": 2.417585242968673,
"learning_rate": 4.965592696961335e-06,
"loss": 0.4501,
"step": 2660
},
{
"epoch": 0.55301929860967,
"grad_norm": 2.4947079086664123,
"learning_rate": 4.947484141218572e-06,
"loss": 0.4385,
"step": 2665
},
{
"epoch": 0.5540568582693505,
"grad_norm": 2.5191730683436604,
"learning_rate": 4.929376274367438e-06,
"loss": 0.4324,
"step": 2670
},
{
"epoch": 0.5550944179290309,
"grad_norm": 2.4257268049363336,
"learning_rate": 4.911269333942994e-06,
"loss": 0.4388,
"step": 2675
},
{
"epoch": 0.5561319775887114,
"grad_norm": 2.5184876591899648,
"learning_rate": 4.893163557468155e-06,
"loss": 0.4316,
"step": 2680
},
{
"epoch": 0.5571695372483918,
"grad_norm": 2.380468990224694,
"learning_rate": 4.87505918245056e-06,
"loss": 0.437,
"step": 2685
},
{
"epoch": 0.5582070969080722,
"grad_norm": 2.3265663935377727,
"learning_rate": 4.856956446379472e-06,
"loss": 0.4307,
"step": 2690
},
{
"epoch": 0.5592446565677527,
"grad_norm": 2.3400451782909646,
"learning_rate": 4.838855586722647e-06,
"loss": 0.4351,
"step": 2695
},
{
"epoch": 0.5602822162274331,
"grad_norm": 2.4589440190512675,
"learning_rate": 4.820756840923232e-06,
"loss": 0.4447,
"step": 2700
},
{
"epoch": 0.5613197758871135,
"grad_norm": 2.4951528488343184,
"learning_rate": 4.802660446396642e-06,
"loss": 0.4445,
"step": 2705
},
{
"epoch": 0.5623573355467939,
"grad_norm": 2.666275326085189,
"learning_rate": 4.784566640527451e-06,
"loss": 0.4406,
"step": 2710
},
{
"epoch": 0.5633948952064743,
"grad_norm": 2.4165213736699522,
"learning_rate": 4.766475660666271e-06,
"loss": 0.4222,
"step": 2715
},
{
"epoch": 0.5644324548661548,
"grad_norm": 2.345464539142852,
"learning_rate": 4.748387744126649e-06,
"loss": 0.4355,
"step": 2720
},
{
"epoch": 0.5654700145258352,
"grad_norm": 2.5576836547836215,
"learning_rate": 4.730303128181944e-06,
"loss": 0.4289,
"step": 2725
},
{
"epoch": 0.5665075741855157,
"grad_norm": 2.5283882321035858,
"learning_rate": 4.712222050062219e-06,
"loss": 0.4283,
"step": 2730
},
{
"epoch": 0.5675451338451961,
"grad_norm": 2.556873104995108,
"learning_rate": 4.694144746951131e-06,
"loss": 0.434,
"step": 2735
},
{
"epoch": 0.5685826935048766,
"grad_norm": 2.280037463289983,
"learning_rate": 4.676071455982811e-06,
"loss": 0.4294,
"step": 2740
},
{
"epoch": 0.569620253164557,
"grad_norm": 2.3570715386535555,
"learning_rate": 4.658002414238771e-06,
"loss": 0.4357,
"step": 2745
},
{
"epoch": 0.5706578128242374,
"grad_norm": 2.4558176026821217,
"learning_rate": 4.63993785874477e-06,
"loss": 0.4421,
"step": 2750
},
{
"epoch": 0.5716953724839178,
"grad_norm": 2.4779834247504082,
"learning_rate": 4.621878026467725e-06,
"loss": 0.4336,
"step": 2755
},
{
"epoch": 0.5727329321435982,
"grad_norm": 2.3737617720589483,
"learning_rate": 4.603823154312593e-06,
"loss": 0.4263,
"step": 2760
},
{
"epoch": 0.5737704918032787,
"grad_norm": 2.4648165069495147,
"learning_rate": 4.585773479119265e-06,
"loss": 0.4487,
"step": 2765
},
{
"epoch": 0.5748080514629591,
"grad_norm": 2.3105842246846064,
"learning_rate": 4.567729237659459e-06,
"loss": 0.4252,
"step": 2770
},
{
"epoch": 0.5758456111226395,
"grad_norm": 2.560822029486094,
"learning_rate": 4.549690666633615e-06,
"loss": 0.4432,
"step": 2775
},
{
"epoch": 0.57688317078232,
"grad_norm": 2.6130981185416,
"learning_rate": 4.531658002667787e-06,
"loss": 0.4402,
"step": 2780
},
{
"epoch": 0.5779207304420004,
"grad_norm": 2.363414455159581,
"learning_rate": 4.51363148231055e-06,
"loss": 0.4351,
"step": 2785
},
{
"epoch": 0.5789582901016809,
"grad_norm": 2.363075041688293,
"learning_rate": 4.495611342029875e-06,
"loss": 0.428,
"step": 2790
},
{
"epoch": 0.5799958497613613,
"grad_norm": 2.4754540310783733,
"learning_rate": 4.477597818210054e-06,
"loss": 0.4246,
"step": 2795
},
{
"epoch": 0.5810334094210418,
"grad_norm": 2.6816555150046755,
"learning_rate": 4.459591147148575e-06,
"loss": 0.4253,
"step": 2800
},
{
"epoch": 0.5820709690807221,
"grad_norm": 2.4036860880645894,
"learning_rate": 4.441591565053041e-06,
"loss": 0.4272,
"step": 2805
},
{
"epoch": 0.5831085287404025,
"grad_norm": 2.4802074251251156,
"learning_rate": 4.423599308038057e-06,
"loss": 0.4209,
"step": 2810
},
{
"epoch": 0.584146088400083,
"grad_norm": 2.415585886184946,
"learning_rate": 4.405614612122145e-06,
"loss": 0.4226,
"step": 2815
},
{
"epoch": 0.5851836480597634,
"grad_norm": 2.4533356227902816,
"learning_rate": 4.387637713224638e-06,
"loss": 0.4343,
"step": 2820
},
{
"epoch": 0.5862212077194439,
"grad_norm": 2.7240629036546284,
"learning_rate": 4.36966884716259e-06,
"loss": 0.4287,
"step": 2825
},
{
"epoch": 0.5872587673791243,
"grad_norm": 2.3878324516316276,
"learning_rate": 4.3517082496476845e-06,
"loss": 0.4184,
"step": 2830
},
{
"epoch": 0.5882963270388047,
"grad_norm": 2.422038027754476,
"learning_rate": 4.333756156283136e-06,
"loss": 0.4209,
"step": 2835
},
{
"epoch": 0.5893338866984852,
"grad_norm": 2.5027178602080697,
"learning_rate": 4.315812802560609e-06,
"loss": 0.4151,
"step": 2840
},
{
"epoch": 0.5903714463581656,
"grad_norm": 2.440654195509635,
"learning_rate": 4.2978784238571145e-06,
"loss": 0.424,
"step": 2845
},
{
"epoch": 0.5914090060178461,
"grad_norm": 2.4534970838374206,
"learning_rate": 4.279953255431944e-06,
"loss": 0.427,
"step": 2850
},
{
"epoch": 0.5924465656775264,
"grad_norm": 2.4846011931620087,
"learning_rate": 4.262037532423556e-06,
"loss": 0.4376,
"step": 2855
},
{
"epoch": 0.5934841253372068,
"grad_norm": 2.4555820059315,
"learning_rate": 4.244131489846519e-06,
"loss": 0.4102,
"step": 2860
},
{
"epoch": 0.5945216849968873,
"grad_norm": 2.443517931338813,
"learning_rate": 4.2262353625884054e-06,
"loss": 0.4138,
"step": 2865
},
{
"epoch": 0.5955592446565677,
"grad_norm": 2.369110831081435,
"learning_rate": 4.208349385406729e-06,
"loss": 0.4364,
"step": 2870
},
{
"epoch": 0.5965968043162482,
"grad_norm": 2.3567775981536423,
"learning_rate": 4.190473792925851e-06,
"loss": 0.4277,
"step": 2875
},
{
"epoch": 0.5976343639759286,
"grad_norm": 2.3822530664354216,
"learning_rate": 4.1726088196339106e-06,
"loss": 0.4266,
"step": 2880
},
{
"epoch": 0.5986719236356091,
"grad_norm": 2.3958405655171293,
"learning_rate": 4.154754699879748e-06,
"loss": 0.4177,
"step": 2885
},
{
"epoch": 0.5997094832952895,
"grad_norm": 2.3762721908121573,
"learning_rate": 4.136911667869827e-06,
"loss": 0.4146,
"step": 2890
},
{
"epoch": 0.60074704295497,
"grad_norm": 2.2932296484786137,
"learning_rate": 4.119079957665163e-06,
"loss": 0.4074,
"step": 2895
},
{
"epoch": 0.6017846026146504,
"grad_norm": 2.446668454803684,
"learning_rate": 4.101259803178265e-06,
"loss": 0.4318,
"step": 2900
},
{
"epoch": 0.6028221622743307,
"grad_norm": 2.361867602889005,
"learning_rate": 4.083451438170039e-06,
"loss": 0.4098,
"step": 2905
},
{
"epoch": 0.6038597219340112,
"grad_norm": 2.323202296910631,
"learning_rate": 4.065655096246755e-06,
"loss": 0.408,
"step": 2910
},
{
"epoch": 0.6048972815936916,
"grad_norm": 2.3845839128387722,
"learning_rate": 4.047871010856959e-06,
"loss": 0.4071,
"step": 2915
},
{
"epoch": 0.605934841253372,
"grad_norm": 2.317266250356725,
"learning_rate": 4.03009941528842e-06,
"loss": 0.4051,
"step": 2920
},
{
"epoch": 0.6069724009130525,
"grad_norm": 2.4278074780829852,
"learning_rate": 4.012340542665067e-06,
"loss": 0.4002,
"step": 2925
},
{
"epoch": 0.6080099605727329,
"grad_norm": 2.4948657955155853,
"learning_rate": 3.994594625943936e-06,
"loss": 0.4103,
"step": 2930
},
{
"epoch": 0.6090475202324134,
"grad_norm": 2.4097506379402596,
"learning_rate": 3.976861897912106e-06,
"loss": 0.4137,
"step": 2935
},
{
"epoch": 0.6100850798920938,
"grad_norm": 2.695103229335665,
"learning_rate": 3.959142591183652e-06,
"loss": 0.4184,
"step": 2940
},
{
"epoch": 0.6111226395517743,
"grad_norm": 2.433881870125308,
"learning_rate": 3.9414369381965904e-06,
"loss": 0.4084,
"step": 2945
},
{
"epoch": 0.6121601992114547,
"grad_norm": 2.424853521797893,
"learning_rate": 3.92374517120983e-06,
"loss": 0.4157,
"step": 2950
},
{
"epoch": 0.6131977588711351,
"grad_norm": 2.3289318124697926,
"learning_rate": 3.90606752230013e-06,
"loss": 0.4002,
"step": 2955
},
{
"epoch": 0.6142353185308155,
"grad_norm": 2.4465640629382186,
"learning_rate": 3.888404223359045e-06,
"loss": 0.4057,
"step": 2960
},
{
"epoch": 0.6152728781904959,
"grad_norm": 2.572631649766761,
"learning_rate": 3.870755506089899e-06,
"loss": 0.4144,
"step": 2965
},
{
"epoch": 0.6163104378501764,
"grad_norm": 2.4132420332744204,
"learning_rate": 3.8531216020047246e-06,
"loss": 0.4116,
"step": 2970
},
{
"epoch": 0.6173479975098568,
"grad_norm": 2.3892278386798593,
"learning_rate": 3.835502742421251e-06,
"loss": 0.4093,
"step": 2975
},
{
"epoch": 0.6183855571695372,
"grad_norm": 2.2641401615974406,
"learning_rate": 3.8178991584598474e-06,
"loss": 0.4131,
"step": 2980
},
{
"epoch": 0.6194231168292177,
"grad_norm": 2.438762066594083,
"learning_rate": 3.8003110810405065e-06,
"loss": 0.4064,
"step": 2985
},
{
"epoch": 0.6204606764888981,
"grad_norm": 2.4506479186056396,
"learning_rate": 3.782738740879806e-06,
"loss": 0.4052,
"step": 2990
},
{
"epoch": 0.6214982361485786,
"grad_norm": 2.5283170692610617,
"learning_rate": 3.7651823684878884e-06,
"loss": 0.396,
"step": 2995
},
{
"epoch": 0.622535795808259,
"grad_norm": 2.4749410345861453,
"learning_rate": 3.7476421941654318e-06,
"loss": 0.4193,
"step": 3000
},
{
"epoch": 0.6235733554679395,
"grad_norm": 2.317549282593909,
"learning_rate": 3.7301184480006337e-06,
"loss": 0.3973,
"step": 3005
},
{
"epoch": 0.6246109151276198,
"grad_norm": 2.5252500112854275,
"learning_rate": 3.712611359866188e-06,
"loss": 0.4147,
"step": 3010
},
{
"epoch": 0.6256484747873002,
"grad_norm": 2.4400394840947475,
"learning_rate": 3.6951211594162784e-06,
"loss": 0.4089,
"step": 3015
},
{
"epoch": 0.6266860344469807,
"grad_norm": 2.349957863539589,
"learning_rate": 3.677648076083549e-06,
"loss": 0.3992,
"step": 3020
},
{
"epoch": 0.6277235941066611,
"grad_norm": 2.3832033455743833,
"learning_rate": 3.6601923390761156e-06,
"loss": 0.4131,
"step": 3025
},
{
"epoch": 0.6287611537663416,
"grad_norm": 2.576394180910392,
"learning_rate": 3.6427541773745433e-06,
"loss": 0.3968,
"step": 3030
},
{
"epoch": 0.629798713426022,
"grad_norm": 2.4507894648738877,
"learning_rate": 3.6253338197288505e-06,
"loss": 0.4023,
"step": 3035
},
{
"epoch": 0.6308362730857024,
"grad_norm": 2.3382065135829997,
"learning_rate": 3.607931494655504e-06,
"loss": 0.3918,
"step": 3040
},
{
"epoch": 0.6318738327453829,
"grad_norm": 2.4244914619480373,
"learning_rate": 3.5905474304344225e-06,
"loss": 0.4117,
"step": 3045
},
{
"epoch": 0.6329113924050633,
"grad_norm": 2.567189154890026,
"learning_rate": 3.573181855105986e-06,
"loss": 0.42,
"step": 3050
},
{
"epoch": 0.6339489520647438,
"grad_norm": 2.278238684757535,
"learning_rate": 3.555834996468039e-06,
"loss": 0.4041,
"step": 3055
},
{
"epoch": 0.6349865117244241,
"grad_norm": 2.504690078277028,
"learning_rate": 3.538507082072905e-06,
"loss": 0.3944,
"step": 3060
},
{
"epoch": 0.6360240713841046,
"grad_norm": 2.508152236947373,
"learning_rate": 3.5211983392243996e-06,
"loss": 0.4,
"step": 3065
},
{
"epoch": 0.637061631043785,
"grad_norm": 2.3910619945271683,
"learning_rate": 3.503908994974856e-06,
"loss": 0.4093,
"step": 3070
},
{
"epoch": 0.6380991907034654,
"grad_norm": 2.3248985179294652,
"learning_rate": 3.4866392761221303e-06,
"loss": 0.4065,
"step": 3075
},
{
"epoch": 0.6391367503631459,
"grad_norm": 2.361597466745494,
"learning_rate": 3.4693894092066483e-06,
"loss": 0.3907,
"step": 3080
},
{
"epoch": 0.6401743100228263,
"grad_norm": 2.4949060134883743,
"learning_rate": 3.452159620508414e-06,
"loss": 0.4004,
"step": 3085
},
{
"epoch": 0.6412118696825068,
"grad_norm": 2.3586999565378886,
"learning_rate": 3.4349501360440556e-06,
"loss": 0.3977,
"step": 3090
},
{
"epoch": 0.6422494293421872,
"grad_norm": 2.2746540643844426,
"learning_rate": 3.417761181563849e-06,
"loss": 0.3949,
"step": 3095
},
{
"epoch": 0.6432869890018676,
"grad_norm": 2.4940700061779064,
"learning_rate": 3.4005929825487684e-06,
"loss": 0.4011,
"step": 3100
},
{
"epoch": 0.6443245486615481,
"grad_norm": 2.3950183894327934,
"learning_rate": 3.383445764207516e-06,
"loss": 0.408,
"step": 3105
},
{
"epoch": 0.6453621083212284,
"grad_norm": 2.30382353091722,
"learning_rate": 3.366319751473579e-06,
"loss": 0.4022,
"step": 3110
},
{
"epoch": 0.6463996679809089,
"grad_norm": 2.4321346720182766,
"learning_rate": 3.3492151690022712e-06,
"loss": 0.3986,
"step": 3115
},
{
"epoch": 0.6474372276405893,
"grad_norm": 2.3061011632873285,
"learning_rate": 3.332132241167793e-06,
"loss": 0.3972,
"step": 3120
},
{
"epoch": 0.6484747873002698,
"grad_norm": 2.4917204012266327,
"learning_rate": 3.3150711920602765e-06,
"loss": 0.4042,
"step": 3125
},
{
"epoch": 0.6495123469599502,
"grad_norm": 2.190798555385459,
"learning_rate": 3.2980322454828617e-06,
"loss": 0.3917,
"step": 3130
},
{
"epoch": 0.6505499066196306,
"grad_norm": 2.343964465011578,
"learning_rate": 3.281015624948746e-06,
"loss": 0.3893,
"step": 3135
},
{
"epoch": 0.6515874662793111,
"grad_norm": 2.61426487111248,
"learning_rate": 3.264021553678264e-06,
"loss": 0.4087,
"step": 3140
},
{
"epoch": 0.6526250259389915,
"grad_norm": 2.3381914892622553,
"learning_rate": 3.247050254595947e-06,
"loss": 0.3996,
"step": 3145
},
{
"epoch": 0.653662585598672,
"grad_norm": 2.4097371109758945,
"learning_rate": 3.2301019503276144e-06,
"loss": 0.404,
"step": 3150
},
{
"epoch": 0.6547001452583524,
"grad_norm": 2.3425157027203127,
"learning_rate": 3.2131768631974375e-06,
"loss": 0.4025,
"step": 3155
},
{
"epoch": 0.6557377049180327,
"grad_norm": 2.405066414410578,
"learning_rate": 3.196275215225032e-06,
"loss": 0.4095,
"step": 3160
},
{
"epoch": 0.6567752645777132,
"grad_norm": 2.332707293053418,
"learning_rate": 3.179397228122547e-06,
"loss": 0.4,
"step": 3165
},
{
"epoch": 0.6578128242373936,
"grad_norm": 2.374933561243685,
"learning_rate": 3.162543123291749e-06,
"loss": 0.3887,
"step": 3170
},
{
"epoch": 0.6588503838970741,
"grad_norm": 2.265067002723173,
"learning_rate": 3.1457131218211263e-06,
"loss": 0.3974,
"step": 3175
},
{
"epoch": 0.6598879435567545,
"grad_norm": 2.363694486377073,
"learning_rate": 3.1289074444829783e-06,
"loss": 0.3932,
"step": 3180
},
{
"epoch": 0.660925503216435,
"grad_norm": 2.5691413705019603,
"learning_rate": 3.1121263117305355e-06,
"loss": 0.3848,
"step": 3185
},
{
"epoch": 0.6619630628761154,
"grad_norm": 2.3700655398167005,
"learning_rate": 3.0953699436950464e-06,
"loss": 0.3942,
"step": 3190
},
{
"epoch": 0.6630006225357958,
"grad_norm": 2.5079197198404932,
"learning_rate": 3.0786385601829114e-06,
"loss": 0.3921,
"step": 3195
},
{
"epoch": 0.6640381821954763,
"grad_norm": 2.3572529532760136,
"learning_rate": 3.061932380672783e-06,
"loss": 0.389,
"step": 3200
},
{
"epoch": 0.6650757418551567,
"grad_norm": 2.3406368361183643,
"learning_rate": 3.0452516243126955e-06,
"loss": 0.3942,
"step": 3205
},
{
"epoch": 0.666113301514837,
"grad_norm": 2.2910811346177953,
"learning_rate": 3.0285965099171864e-06,
"loss": 0.3954,
"step": 3210
},
{
"epoch": 0.6671508611745175,
"grad_norm": 2.3711327251353254,
"learning_rate": 3.0119672559644313e-06,
"loss": 0.3825,
"step": 3215
},
{
"epoch": 0.6681884208341979,
"grad_norm": 2.346432027672407,
"learning_rate": 2.995364080593368e-06,
"loss": 0.3862,
"step": 3220
},
{
"epoch": 0.6692259804938784,
"grad_norm": 2.25488355968479,
"learning_rate": 2.978787201600847e-06,
"loss": 0.3869,
"step": 3225
},
{
"epoch": 0.6702635401535588,
"grad_norm": 2.5145720297012204,
"learning_rate": 2.9622368364387626e-06,
"loss": 0.3979,
"step": 3230
},
{
"epoch": 0.6713010998132393,
"grad_norm": 2.430288285642923,
"learning_rate": 2.9457132022112156e-06,
"loss": 0.3876,
"step": 3235
},
{
"epoch": 0.6723386594729197,
"grad_norm": 2.52733248085438,
"learning_rate": 2.9292165156716447e-06,
"loss": 0.3918,
"step": 3240
},
{
"epoch": 0.6733762191326002,
"grad_norm": 2.4117862477617464,
"learning_rate": 2.9127469932200034e-06,
"loss": 0.3904,
"step": 3245
},
{
"epoch": 0.6744137787922806,
"grad_norm": 2.544838045591555,
"learning_rate": 2.89630485089991e-06,
"loss": 0.3968,
"step": 3250
},
{
"epoch": 0.675451338451961,
"grad_norm": 2.3114731154281687,
"learning_rate": 2.879890304395816e-06,
"loss": 0.389,
"step": 3255
},
{
"epoch": 0.6764888981116414,
"grad_norm": 2.214278091013657,
"learning_rate": 2.8635035690301725e-06,
"loss": 0.368,
"step": 3260
},
{
"epoch": 0.6775264577713218,
"grad_norm": 2.4150879417081033,
"learning_rate": 2.847144859760622e-06,
"loss": 0.3997,
"step": 3265
},
{
"epoch": 0.6785640174310023,
"grad_norm": 2.2503456214277766,
"learning_rate": 2.8308143911771555e-06,
"loss": 0.3785,
"step": 3270
},
{
"epoch": 0.6796015770906827,
"grad_norm": 2.400256629243757,
"learning_rate": 2.8145123774993075e-06,
"loss": 0.3873,
"step": 3275
},
{
"epoch": 0.6806391367503631,
"grad_norm": 2.4464548102871393,
"learning_rate": 2.798239032573362e-06,
"loss": 0.3811,
"step": 3280
},
{
"epoch": 0.6816766964100436,
"grad_norm": 2.5094627463784973,
"learning_rate": 2.7819945698695148e-06,
"loss": 0.387,
"step": 3285
},
{
"epoch": 0.682714256069724,
"grad_norm": 2.4588062818595944,
"learning_rate": 2.765779202479103e-06,
"loss": 0.3848,
"step": 3290
},
{
"epoch": 0.6837518157294045,
"grad_norm": 2.2530973533871035,
"learning_rate": 2.749593143111793e-06,
"loss": 0.3641,
"step": 3295
},
{
"epoch": 0.6847893753890849,
"grad_norm": 2.3169454419169684,
"learning_rate": 2.733436604092797e-06,
"loss": 0.378,
"step": 3300
},
{
"epoch": 0.6858269350487654,
"grad_norm": 2.5056936406549917,
"learning_rate": 2.7173097973600806e-06,
"loss": 0.3837,
"step": 3305
},
{
"epoch": 0.6868644947084457,
"grad_norm": 2.315516361770829,
"learning_rate": 2.7012129344615933e-06,
"loss": 0.3797,
"step": 3310
},
{
"epoch": 0.6879020543681261,
"grad_norm": 2.3765574424439504,
"learning_rate": 2.6851462265524862e-06,
"loss": 0.3821,
"step": 3315
},
{
"epoch": 0.6889396140278066,
"grad_norm": 2.632664268427571,
"learning_rate": 2.6691098843923464e-06,
"loss": 0.3869,
"step": 3320
},
{
"epoch": 0.689977173687487,
"grad_norm": 2.4558633695260554,
"learning_rate": 2.65310411834242e-06,
"loss": 0.3778,
"step": 3325
},
{
"epoch": 0.6910147333471675,
"grad_norm": 2.3802011004843053,
"learning_rate": 2.637129138362877e-06,
"loss": 0.3818,
"step": 3330
},
{
"epoch": 0.6920522930068479,
"grad_norm": 2.3545532106676625,
"learning_rate": 2.62118515401003e-06,
"loss": 0.3745,
"step": 3335
},
{
"epoch": 0.6930898526665283,
"grad_norm": 2.2370151266527913,
"learning_rate": 2.6052723744336027e-06,
"loss": 0.382,
"step": 3340
},
{
"epoch": 0.6941274123262088,
"grad_norm": 2.3233925949072645,
"learning_rate": 2.589391008373982e-06,
"loss": 0.3901,
"step": 3345
},
{
"epoch": 0.6951649719858892,
"grad_norm": 2.385447861484446,
"learning_rate": 2.5735412641594804e-06,
"loss": 0.3804,
"step": 3350
},
{
"epoch": 0.6962025316455697,
"grad_norm": 2.3578389155683217,
"learning_rate": 2.5577233497035943e-06,
"loss": 0.3888,
"step": 3355
},
{
"epoch": 0.69724009130525,
"grad_norm": 2.3006918024124965,
"learning_rate": 2.541937472502293e-06,
"loss": 0.3661,
"step": 3360
},
{
"epoch": 0.6982776509649304,
"grad_norm": 2.303381728187516,
"learning_rate": 2.526183839631283e-06,
"loss": 0.3827,
"step": 3365
},
{
"epoch": 0.6993152106246109,
"grad_norm": 2.324125134648238,
"learning_rate": 2.5104626577433022e-06,
"loss": 0.376,
"step": 3370
},
{
"epoch": 0.7003527702842913,
"grad_norm": 2.4863826311733614,
"learning_rate": 2.4947741330653942e-06,
"loss": 0.3765,
"step": 3375
},
{
"epoch": 0.7013903299439718,
"grad_norm": 2.309178058770916,
"learning_rate": 2.4791184713962207e-06,
"loss": 0.3665,
"step": 3380
},
{
"epoch": 0.7024278896036522,
"grad_norm": 2.338910146993185,
"learning_rate": 2.463495878103352e-06,
"loss": 0.3601,
"step": 3385
},
{
"epoch": 0.7034654492633327,
"grad_norm": 2.386442857917931,
"learning_rate": 2.4479065581205673e-06,
"loss": 0.3775,
"step": 3390
},
{
"epoch": 0.7045030089230131,
"grad_norm": 2.4264131593130913,
"learning_rate": 2.4323507159451887e-06,
"loss": 0.3775,
"step": 3395
},
{
"epoch": 0.7055405685826935,
"grad_norm": 35.69245904532589,
"learning_rate": 2.416828555635368e-06,
"loss": 0.3836,
"step": 3400
},
{
"epoch": 0.706578128242374,
"grad_norm": 2.3981493771293385,
"learning_rate": 2.4013402808074356e-06,
"loss": 0.3648,
"step": 3405
},
{
"epoch": 0.7076156879020543,
"grad_norm": 2.458681006711934,
"learning_rate": 2.3858860946332148e-06,
"loss": 0.3706,
"step": 3410
},
{
"epoch": 0.7086532475617348,
"grad_norm": 2.5072358874605545,
"learning_rate": 2.3704661998373652e-06,
"loss": 0.3786,
"step": 3415
},
{
"epoch": 0.7096908072214152,
"grad_norm": 2.4783372667192576,
"learning_rate": 2.3550807986947133e-06,
"loss": 0.3635,
"step": 3420
},
{
"epoch": 0.7107283668810956,
"grad_norm": 2.583412985853997,
"learning_rate": 2.3397300930276116e-06,
"loss": 0.3574,
"step": 3425
},
{
"epoch": 0.7117659265407761,
"grad_norm": 2.39052336222071,
"learning_rate": 2.3244142842032823e-06,
"loss": 0.3642,
"step": 3430
},
{
"epoch": 0.7128034862004565,
"grad_norm": 2.2998480766526974,
"learning_rate": 2.309133573131181e-06,
"loss": 0.3659,
"step": 3435
},
{
"epoch": 0.713841045860137,
"grad_norm": 2.452595068182353,
"learning_rate": 2.2938881602603496e-06,
"loss": 0.3767,
"step": 3440
},
{
"epoch": 0.7148786055198174,
"grad_norm": 2.4427339177937597,
"learning_rate": 2.2786782455768113e-06,
"loss": 0.3701,
"step": 3445
},
{
"epoch": 0.7159161651794979,
"grad_norm": 2.5344881209783647,
"learning_rate": 2.2635040286009163e-06,
"loss": 0.3636,
"step": 3450
},
{
"epoch": 0.7169537248391783,
"grad_norm": 2.4517299777866914,
"learning_rate": 2.2483657083847487e-06,
"loss": 0.3638,
"step": 3455
},
{
"epoch": 0.7179912844988587,
"grad_norm": 2.333164872282216,
"learning_rate": 2.233263483509505e-06,
"loss": 0.3652,
"step": 3460
},
{
"epoch": 0.7190288441585391,
"grad_norm": 2.536346581261384,
"learning_rate": 2.218197552082893e-06,
"loss": 0.3843,
"step": 3465
},
{
"epoch": 0.7200664038182195,
"grad_norm": 2.3170765412707444,
"learning_rate": 2.203168111736524e-06,
"loss": 0.3702,
"step": 3470
},
{
"epoch": 0.7211039634779,
"grad_norm": 2.3720426965424926,
"learning_rate": 2.1881753596233334e-06,
"loss": 0.3661,
"step": 3475
},
{
"epoch": 0.7221415231375804,
"grad_norm": 2.4784864232706427,
"learning_rate": 2.173219492414988e-06,
"loss": 0.3557,
"step": 3480
},
{
"epoch": 0.7231790827972608,
"grad_norm": 2.2572170707401598,
"learning_rate": 2.1583007062993037e-06,
"loss": 0.3626,
"step": 3485
},
{
"epoch": 0.7242166424569413,
"grad_norm": 2.3751613572262618,
"learning_rate": 2.1434191969776787e-06,
"loss": 0.372,
"step": 3490
},
{
"epoch": 0.7252542021166217,
"grad_norm": 2.403916983756595,
"learning_rate": 2.1285751596625153e-06,
"loss": 0.3793,
"step": 3495
},
{
"epoch": 0.7262917617763022,
"grad_norm": 2.422300937120251,
"learning_rate": 2.1137687890746733e-06,
"loss": 0.3757,
"step": 3500
},
{
"epoch": 0.7273293214359826,
"grad_norm": 2.3278136492746904,
"learning_rate": 2.099000279440905e-06,
"loss": 0.3695,
"step": 3505
},
{
"epoch": 0.728366881095663,
"grad_norm": 2.3521512237809774,
"learning_rate": 2.0842698244913146e-06,
"loss": 0.3577,
"step": 3510
},
{
"epoch": 0.7294044407553434,
"grad_norm": 2.4420360123708837,
"learning_rate": 2.0695776174568054e-06,
"loss": 0.3617,
"step": 3515
},
{
"epoch": 0.7304420004150238,
"grad_norm": 2.355326415228057,
"learning_rate": 2.054923851066561e-06,
"loss": 0.366,
"step": 3520
},
{
"epoch": 0.7314795600747043,
"grad_norm": 2.408952585713157,
"learning_rate": 2.0403087175455044e-06,
"loss": 0.3592,
"step": 3525
},
{
"epoch": 0.7325171197343847,
"grad_norm": 2.3814394489123556,
"learning_rate": 2.025732408611786e-06,
"loss": 0.3702,
"step": 3530
},
{
"epoch": 0.7335546793940652,
"grad_norm": 2.505258380168417,
"learning_rate": 2.0111951154742526e-06,
"loss": 0.3541,
"step": 3535
},
{
"epoch": 0.7345922390537456,
"grad_norm": 2.2378548590102962,
"learning_rate": 1.9966970288299666e-06,
"loss": 0.367,
"step": 3540
},
{
"epoch": 0.735629798713426,
"grad_norm": 2.5282472044177657,
"learning_rate": 1.982238338861673e-06,
"loss": 0.372,
"step": 3545
},
{
"epoch": 0.7366673583731065,
"grad_norm": 2.395346352470493,
"learning_rate": 1.9678192352353282e-06,
"loss": 0.3647,
"step": 3550
},
{
"epoch": 0.7377049180327869,
"grad_norm": 2.377816912239941,
"learning_rate": 1.9534399070976013e-06,
"loss": 0.3694,
"step": 3555
},
{
"epoch": 0.7387424776924674,
"grad_norm": 2.395069006251528,
"learning_rate": 1.9391005430733973e-06,
"loss": 0.3643,
"step": 3560
},
{
"epoch": 0.7397800373521477,
"grad_norm": 2.360921498446891,
"learning_rate": 1.924801331263375e-06,
"loss": 0.3592,
"step": 3565
},
{
"epoch": 0.7408175970118281,
"grad_norm": 2.4026862396255977,
"learning_rate": 1.9105424592414905e-06,
"loss": 0.3596,
"step": 3570
},
{
"epoch": 0.7418551566715086,
"grad_norm": 2.3493974707324323,
"learning_rate": 1.8963241140525302e-06,
"loss": 0.352,
"step": 3575
},
{
"epoch": 0.742892716331189,
"grad_norm": 2.255383769774033,
"learning_rate": 1.8821464822096587e-06,
"loss": 0.3545,
"step": 3580
},
{
"epoch": 0.7439302759908695,
"grad_norm": 2.3439036527893804,
"learning_rate": 1.8680097496919663e-06,
"loss": 0.3614,
"step": 3585
},
{
"epoch": 0.7449678356505499,
"grad_norm": 2.402995035755627,
"learning_rate": 1.8539141019420459e-06,
"loss": 0.3591,
"step": 3590
},
{
"epoch": 0.7460053953102304,
"grad_norm": 2.3095366349656845,
"learning_rate": 1.8398597238635375e-06,
"loss": 0.3555,
"step": 3595
},
{
"epoch": 0.7470429549699108,
"grad_norm": 2.454441768372595,
"learning_rate": 1.825846799818722e-06,
"loss": 0.3645,
"step": 3600
},
{
"epoch": 0.7480805146295912,
"grad_norm": 2.3783282437356883,
"learning_rate": 1.8118755136260941e-06,
"loss": 0.364,
"step": 3605
},
{
"epoch": 0.7491180742892717,
"grad_norm": 2.4804513717759975,
"learning_rate": 1.7979460485579486e-06,
"loss": 0.3577,
"step": 3610
},
{
"epoch": 0.750155633948952,
"grad_norm": 2.399533389844388,
"learning_rate": 1.784058587337984e-06,
"loss": 0.3548,
"step": 3615
},
{
"epoch": 0.7511931936086325,
"grad_norm": 2.5287966988574992,
"learning_rate": 1.7702133121388999e-06,
"loss": 0.3702,
"step": 3620
},
{
"epoch": 0.7522307532683129,
"grad_norm": 2.5280174627826097,
"learning_rate": 1.7564104045800101e-06,
"loss": 0.3717,
"step": 3625
},
{
"epoch": 0.7532683129279933,
"grad_norm": 2.454120509191578,
"learning_rate": 1.7426500457248552e-06,
"loss": 0.3508,
"step": 3630
},
{
"epoch": 0.7543058725876738,
"grad_norm": 2.2917885535880176,
"learning_rate": 1.7289324160788346e-06,
"loss": 0.3509,
"step": 3635
},
{
"epoch": 0.7553434322473542,
"grad_norm": 2.478759913910432,
"learning_rate": 1.7152576955868338e-06,
"loss": 0.3597,
"step": 3640
},
{
"epoch": 0.7563809919070347,
"grad_norm": 2.3448679544687256,
"learning_rate": 1.701626063630869e-06,
"loss": 0.367,
"step": 3645
},
{
"epoch": 0.7574185515667151,
"grad_norm": 2.3452502536112485,
"learning_rate": 1.6880376990277202e-06,
"loss": 0.3548,
"step": 3650
},
{
"epoch": 0.7584561112263956,
"grad_norm": 2.3587859019469084,
"learning_rate": 1.674492780026611e-06,
"loss": 0.3635,
"step": 3655
},
{
"epoch": 0.759493670886076,
"grad_norm": 2.4281427875718635,
"learning_rate": 1.6609914843068403e-06,
"loss": 0.3535,
"step": 3660
},
{
"epoch": 0.7605312305457563,
"grad_norm": 2.644672013859547,
"learning_rate": 1.6475339889754755e-06,
"loss": 0.3546,
"step": 3665
},
{
"epoch": 0.7615687902054368,
"grad_norm": 2.231863917742126,
"learning_rate": 1.6341204705650155e-06,
"loss": 0.3565,
"step": 3670
},
{
"epoch": 0.7626063498651172,
"grad_norm": 2.3627064704364003,
"learning_rate": 1.6207511050310842e-06,
"loss": 0.3601,
"step": 3675
},
{
"epoch": 0.7636439095247977,
"grad_norm": 2.5482612194562746,
"learning_rate": 1.6074260677501102e-06,
"loss": 0.3606,
"step": 3680
},
{
"epoch": 0.7646814691844781,
"grad_norm": 2.4794463771443187,
"learning_rate": 1.5941455335170408e-06,
"loss": 0.3538,
"step": 3685
},
{
"epoch": 0.7657190288441585,
"grad_norm": 2.6762628509028565,
"learning_rate": 1.5809096765430387e-06,
"loss": 0.3517,
"step": 3690
},
{
"epoch": 0.766756588503839,
"grad_norm": 2.2429463729622565,
"learning_rate": 1.5677186704532016e-06,
"loss": 0.3457,
"step": 3695
},
{
"epoch": 0.7677941481635194,
"grad_norm": 2.36484474879296,
"learning_rate": 1.5545726882842782e-06,
"loss": 0.3573,
"step": 3700
},
{
"epoch": 0.7688317078231999,
"grad_norm": 2.294696024445774,
"learning_rate": 1.5414719024824127e-06,
"loss": 0.3509,
"step": 3705
},
{
"epoch": 0.7698692674828803,
"grad_norm": 2.51233601030323,
"learning_rate": 1.5284164849008648e-06,
"loss": 0.3589,
"step": 3710
},
{
"epoch": 0.7709068271425606,
"grad_norm": 2.473322215306534,
"learning_rate": 1.515406606797763e-06,
"loss": 0.3491,
"step": 3715
},
{
"epoch": 0.7719443868022411,
"grad_norm": 2.2061318656296067,
"learning_rate": 1.5024424388338682e-06,
"loss": 0.3499,
"step": 3720
},
{
"epoch": 0.7729819464619215,
"grad_norm": 2.435209027478781,
"learning_rate": 1.4895241510703157e-06,
"loss": 0.3643,
"step": 3725
},
{
"epoch": 0.774019506121602,
"grad_norm": 2.369611296713981,
"learning_rate": 1.4766519129663992e-06,
"loss": 0.3483,
"step": 3730
},
{
"epoch": 0.7750570657812824,
"grad_norm": 2.355834071175694,
"learning_rate": 1.4638258933773425e-06,
"loss": 0.3519,
"step": 3735
},
{
"epoch": 0.7760946254409629,
"grad_norm": 2.469740057096309,
"learning_rate": 1.451046260552086e-06,
"loss": 0.3475,
"step": 3740
},
{
"epoch": 0.7771321851006433,
"grad_norm": 2.4205796114015077,
"learning_rate": 1.438313182131073e-06,
"loss": 0.3523,
"step": 3745
},
{
"epoch": 0.7781697447603237,
"grad_norm": 2.252004484682199,
"learning_rate": 1.4256268251440631e-06,
"loss": 0.3501,
"step": 3750
},
{
"epoch": 0.7792073044200042,
"grad_norm": 2.3823736067008556,
"learning_rate": 1.412987356007931e-06,
"loss": 0.3439,
"step": 3755
},
{
"epoch": 0.7802448640796846,
"grad_norm": 2.440498718677342,
"learning_rate": 1.4003949405244888e-06,
"loss": 0.3545,
"step": 3760
},
{
"epoch": 0.781282423739365,
"grad_norm": 2.4557991100819905,
"learning_rate": 1.3878497438783035e-06,
"loss": 0.3476,
"step": 3765
},
{
"epoch": 0.7823199833990454,
"grad_norm": 2.3768196794994956,
"learning_rate": 1.3753519306345443e-06,
"loss": 0.3566,
"step": 3770
},
{
"epoch": 0.7833575430587258,
"grad_norm": 2.3763354198113427,
"learning_rate": 1.3629016647368077e-06,
"loss": 0.3508,
"step": 3775
},
{
"epoch": 0.7843951027184063,
"grad_norm": 2.5905901892830725,
"learning_rate": 1.3504991095049774e-06,
"loss": 0.3499,
"step": 3780
},
{
"epoch": 0.7854326623780867,
"grad_norm": 2.4256417735699407,
"learning_rate": 1.338144427633079e-06,
"loss": 0.3504,
"step": 3785
},
{
"epoch": 0.7864702220377672,
"grad_norm": 2.4053687510971664,
"learning_rate": 1.3258377811871481e-06,
"loss": 0.3484,
"step": 3790
},
{
"epoch": 0.7875077816974476,
"grad_norm": 2.4549151530337654,
"learning_rate": 1.3135793316030958e-06,
"loss": 0.3482,
"step": 3795
},
{
"epoch": 0.7885453413571281,
"grad_norm": 2.4124091117521576,
"learning_rate": 1.3013692396846028e-06,
"loss": 0.3417,
"step": 3800
},
{
"epoch": 0.7895829010168085,
"grad_norm": 2.492914244103795,
"learning_rate": 1.2892076656010017e-06,
"loss": 0.346,
"step": 3805
},
{
"epoch": 0.790620460676489,
"grad_norm": 2.5012298456270807,
"learning_rate": 1.277094768885182e-06,
"loss": 0.3359,
"step": 3810
},
{
"epoch": 0.7916580203361693,
"grad_norm": 2.3887971930046,
"learning_rate": 1.2650307084314872e-06,
"loss": 0.3434,
"step": 3815
},
{
"epoch": 0.7926955799958497,
"grad_norm": 2.4930229591528605,
"learning_rate": 1.2530156424936469e-06,
"loss": 0.362,
"step": 3820
},
{
"epoch": 0.7937331396555302,
"grad_norm": 2.386338375070925,
"learning_rate": 1.241049728682684e-06,
"loss": 0.3484,
"step": 3825
},
{
"epoch": 0.7947706993152106,
"grad_norm": 2.303121828761511,
"learning_rate": 1.229133123964853e-06,
"loss": 0.355,
"step": 3830
},
{
"epoch": 0.795808258974891,
"grad_norm": 2.418730350979906,
"learning_rate": 1.2172659846595924e-06,
"loss": 0.3534,
"step": 3835
},
{
"epoch": 0.7968458186345715,
"grad_norm": 2.4002454150472143,
"learning_rate": 1.2054484664374533e-06,
"loss": 0.3484,
"step": 3840
},
{
"epoch": 0.7978833782942519,
"grad_norm": 2.453421003455541,
"learning_rate": 1.1936807243180743e-06,
"loss": 0.3493,
"step": 3845
},
{
"epoch": 0.7989209379539324,
"grad_norm": 2.4746199776766473,
"learning_rate": 1.1819629126681398e-06,
"loss": 0.3387,
"step": 3850
},
{
"epoch": 0.7999584976136128,
"grad_norm": 2.38307853790052,
"learning_rate": 1.1702951851993598e-06,
"loss": 0.3483,
"step": 3855
},
{
"epoch": 0.8009960572732933,
"grad_norm": 2.4103963437096114,
"learning_rate": 1.1586776949664453e-06,
"loss": 0.3423,
"step": 3860
},
{
"epoch": 0.8020336169329736,
"grad_norm": 2.3551774825792924,
"learning_rate": 1.1471105943651117e-06,
"loss": 0.341,
"step": 3865
},
{
"epoch": 0.803071176592654,
"grad_norm": 2.518707782804587,
"learning_rate": 1.1355940351300715e-06,
"loss": 0.349,
"step": 3870
},
{
"epoch": 0.8041087362523345,
"grad_norm": 2.3344993783154964,
"learning_rate": 1.1241281683330486e-06,
"loss": 0.3519,
"step": 3875
},
{
"epoch": 0.8051462959120149,
"grad_norm": 2.3344944133616665,
"learning_rate": 1.1127131443807887e-06,
"loss": 0.3412,
"step": 3880
},
{
"epoch": 0.8061838555716954,
"grad_norm": 2.3019549791312754,
"learning_rate": 1.1013491130131027e-06,
"loss": 0.3346,
"step": 3885
},
{
"epoch": 0.8072214152313758,
"grad_norm": 2.3790731919640415,
"learning_rate": 1.0900362233008804e-06,
"loss": 0.3414,
"step": 3890
},
{
"epoch": 0.8082589748910562,
"grad_norm": 2.4317034804688453,
"learning_rate": 1.0787746236441538e-06,
"loss": 0.3521,
"step": 3895
},
{
"epoch": 0.8092965345507367,
"grad_norm": 2.387356188471124,
"learning_rate": 1.0675644617701402e-06,
"loss": 0.3528,
"step": 3900
},
{
"epoch": 0.8103340942104171,
"grad_norm": 2.53615626667828,
"learning_rate": 1.0564058847313108e-06,
"loss": 0.351,
"step": 3905
},
{
"epoch": 0.8113716538700976,
"grad_norm": 2.4249799199654376,
"learning_rate": 1.0452990389034507e-06,
"loss": 0.3392,
"step": 3910
},
{
"epoch": 0.8124092135297779,
"grad_norm": 2.3915606563087946,
"learning_rate": 1.0342440699837537e-06,
"loss": 0.3361,
"step": 3915
},
{
"epoch": 0.8134467731894583,
"grad_norm": 2.5405240392917094,
"learning_rate": 1.0232411229888994e-06,
"loss": 0.3457,
"step": 3920
},
{
"epoch": 0.8144843328491388,
"grad_norm": 2.424497940803967,
"learning_rate": 1.0122903422531588e-06,
"loss": 0.3424,
"step": 3925
},
{
"epoch": 0.8155218925088192,
"grad_norm": 2.4383046197013187,
"learning_rate": 1.001391871426492e-06,
"loss": 0.3559,
"step": 3930
},
{
"epoch": 0.8165594521684997,
"grad_norm": 2.3911832191085054,
"learning_rate": 9.90545853472673e-07,
"loss": 0.3429,
"step": 3935
},
{
"epoch": 0.8175970118281801,
"grad_norm": 2.279983678354184,
"learning_rate": 9.797524306674104e-07,
"loss": 0.3437,
"step": 3940
},
{
"epoch": 0.8186345714878606,
"grad_norm": 2.50140074824066,
"learning_rate": 9.69011744596477e-07,
"loss": 0.3402,
"step": 3945
},
{
"epoch": 0.819672131147541,
"grad_norm": 2.4365828792120294,
"learning_rate": 9.583239361538638e-07,
"loss": 0.3417,
"step": 3950
},
{
"epoch": 0.8207096908072214,
"grad_norm": 2.3579736935494955,
"learning_rate": 9.476891455399168e-07,
"loss": 0.3346,
"step": 3955
},
{
"epoch": 0.8217472504669019,
"grad_norm": 2.5090769048416566,
"learning_rate": 9.371075122595103e-07,
"loss": 0.3231,
"step": 3960
},
{
"epoch": 0.8227848101265823,
"grad_norm": 2.439152419655965,
"learning_rate": 9.265791751202113e-07,
"loss": 0.3437,
"step": 3965
},
{
"epoch": 0.8238223697862627,
"grad_norm": 2.4534357310565214,
"learning_rate": 9.161042722304609e-07,
"loss": 0.3409,
"step": 3970
},
{
"epoch": 0.8248599294459431,
"grad_norm": 2.2889519820080357,
"learning_rate": 9.056829409977574e-07,
"loss": 0.3423,
"step": 3975
},
{
"epoch": 0.8258974891056236,
"grad_norm": 2.6079615773622544,
"learning_rate": 8.953153181268609e-07,
"loss": 0.3412,
"step": 3980
},
{
"epoch": 0.826935048765304,
"grad_norm": 2.462659677967327,
"learning_rate": 8.850015396179962e-07,
"loss": 0.3399,
"step": 3985
},
{
"epoch": 0.8279726084249844,
"grad_norm": 2.453876430295839,
"learning_rate": 8.747417407650704e-07,
"loss": 0.3368,
"step": 3990
},
{
"epoch": 0.8290101680846649,
"grad_norm": 2.5260638258531722,
"learning_rate": 8.645360561538935e-07,
"loss": 0.3506,
"step": 3995
},
{
"epoch": 0.8300477277443453,
"grad_norm": 2.392012536793574,
"learning_rate": 8.543846196604239e-07,
"loss": 0.3434,
"step": 4000
},
{
"epoch": 0.8310852874040258,
"grad_norm": 2.4984903138745587,
"learning_rate": 8.442875644489962e-07,
"loss": 0.3351,
"step": 4005
},
{
"epoch": 0.8321228470637062,
"grad_norm": 2.464203117108214,
"learning_rate": 8.342450229705889e-07,
"loss": 0.3455,
"step": 4010
},
{
"epoch": 0.8331604067233866,
"grad_norm": 2.489210802914686,
"learning_rate": 8.2425712696108e-07,
"loss": 0.33,
"step": 4015
},
{
"epoch": 0.834197966383067,
"grad_norm": 2.306914933986251,
"learning_rate": 8.143240074395198e-07,
"loss": 0.3418,
"step": 4020
},
{
"epoch": 0.8352355260427474,
"grad_norm": 2.513591795617815,
"learning_rate": 8.044457947064116e-07,
"loss": 0.3418,
"step": 4025
},
{
"epoch": 0.8362730857024279,
"grad_norm": 2.3537491795821524,
"learning_rate": 7.946226183420047e-07,
"loss": 0.3479,
"step": 4030
},
{
"epoch": 0.8373106453621083,
"grad_norm": 2.422572559987635,
"learning_rate": 7.848546072045932e-07,
"loss": 0.3446,
"step": 4035
},
{
"epoch": 0.8383482050217888,
"grad_norm": 2.3037996424039084,
"learning_rate": 7.75141889428826e-07,
"loss": 0.3257,
"step": 4040
},
{
"epoch": 0.8393857646814692,
"grad_norm": 2.3570961713348186,
"learning_rate": 7.654845924240228e-07,
"loss": 0.3341,
"step": 4045
},
{
"epoch": 0.8404233243411496,
"grad_norm": 2.3236115130562762,
"learning_rate": 7.558828428725102e-07,
"loss": 0.3328,
"step": 4050
},
{
"epoch": 0.8414608840008301,
"grad_norm": 2.4888053313991603,
"learning_rate": 7.463367667279515e-07,
"loss": 0.3429,
"step": 4055
},
{
"epoch": 0.8424984436605105,
"grad_norm": 2.4813791916370334,
"learning_rate": 7.368464892137006e-07,
"loss": 0.3412,
"step": 4060
},
{
"epoch": 0.843536003320191,
"grad_norm": 2.3113527571110737,
"learning_rate": 7.274121348211582e-07,
"loss": 0.3475,
"step": 4065
},
{
"epoch": 0.8445735629798713,
"grad_norm": 2.587556104836598,
"learning_rate": 7.180338273081327e-07,
"loss": 0.3354,
"step": 4070
},
{
"epoch": 0.8456111226395517,
"grad_norm": 2.296420055640647,
"learning_rate": 7.087116896972268e-07,
"loss": 0.3357,
"step": 4075
},
{
"epoch": 0.8466486822992322,
"grad_norm": 2.3266676608899663,
"learning_rate": 6.994458442742163e-07,
"loss": 0.3362,
"step": 4080
},
{
"epoch": 0.8476862419589126,
"grad_norm": 2.6045589584755984,
"learning_rate": 6.902364125864496e-07,
"loss": 0.34,
"step": 4085
},
{
"epoch": 0.8487238016185931,
"grad_norm": 2.352378817551258,
"learning_rate": 6.810835154412487e-07,
"loss": 0.339,
"step": 4090
},
{
"epoch": 0.8497613612782735,
"grad_norm": 2.3547116125315446,
"learning_rate": 6.719872729043331e-07,
"loss": 0.3378,
"step": 4095
},
{
"epoch": 0.850798920937954,
"grad_norm": 2.321427219175219,
"learning_rate": 6.629478042982346e-07,
"loss": 0.3229,
"step": 4100
},
{
"epoch": 0.8518364805976344,
"grad_norm": 2.521475742171532,
"learning_rate": 6.539652282007386e-07,
"loss": 0.3376,
"step": 4105
},
{
"epoch": 0.8528740402573148,
"grad_norm": 2.3926877720494573,
"learning_rate": 6.450396624433286e-07,
"loss": 0.3325,
"step": 4110
},
{
"epoch": 0.8539115999169953,
"grad_norm": 2.425081699530235,
"learning_rate": 6.361712241096374e-07,
"loss": 0.3314,
"step": 4115
},
{
"epoch": 0.8549491595766756,
"grad_norm": 2.487314370898736,
"learning_rate": 6.273600295339111e-07,
"loss": 0.3352,
"step": 4120
},
{
"epoch": 0.855986719236356,
"grad_norm": 2.4556339075670723,
"learning_rate": 6.186061942994864e-07,
"loss": 0.3338,
"step": 4125
},
{
"epoch": 0.8570242788960365,
"grad_norm": 2.417547682627604,
"learning_rate": 6.099098332372733e-07,
"loss": 0.3299,
"step": 4130
},
{
"epoch": 0.8580618385557169,
"grad_norm": 2.611592074254707,
"learning_rate": 6.012710604242478e-07,
"loss": 0.3331,
"step": 4135
},
{
"epoch": 0.8590993982153974,
"grad_norm": 2.448052874786836,
"learning_rate": 5.926899891819521e-07,
"loss": 0.3376,
"step": 4140
},
{
"epoch": 0.8601369578750778,
"grad_norm": 2.475864292987021,
"learning_rate": 5.841667320750188e-07,
"loss": 0.3437,
"step": 4145
},
{
"epoch": 0.8611745175347583,
"grad_norm": 2.373719046516931,
"learning_rate": 5.757014009096801e-07,
"loss": 0.3357,
"step": 4150
},
{
"epoch": 0.8622120771944387,
"grad_norm": 2.3162935725557032,
"learning_rate": 5.672941067323124e-07,
"loss": 0.3305,
"step": 4155
},
{
"epoch": 0.8632496368541192,
"grad_norm": 2.3285428336849736,
"learning_rate": 5.589449598279762e-07,
"loss": 0.3278,
"step": 4160
},
{
"epoch": 0.8642871965137996,
"grad_norm": 2.6975663812633823,
"learning_rate": 5.506540697189638e-07,
"loss": 0.3235,
"step": 4165
},
{
"epoch": 0.8653247561734799,
"grad_norm": 2.385391836897359,
"learning_rate": 5.424215451633719e-07,
"loss": 0.3366,
"step": 4170
},
{
"epoch": 0.8663623158331604,
"grad_norm": 2.509848709680898,
"learning_rate": 5.342474941536701e-07,
"loss": 0.3349,
"step": 4175
},
{
"epoch": 0.8673998754928408,
"grad_norm": 2.4992507740450063,
"learning_rate": 5.261320239152851e-07,
"loss": 0.3248,
"step": 4180
},
{
"epoch": 0.8684374351525213,
"grad_norm": 2.4274228991197826,
"learning_rate": 5.180752409051892e-07,
"loss": 0.3276,
"step": 4185
},
{
"epoch": 0.8694749948122017,
"grad_norm": 2.3127395460474975,
"learning_rate": 5.100772508105139e-07,
"loss": 0.318,
"step": 4190
},
{
"epoch": 0.8705125544718821,
"grad_norm": 2.468734058758486,
"learning_rate": 5.021381585471563e-07,
"loss": 0.3332,
"step": 4195
},
{
"epoch": 0.8715501141315626,
"grad_norm": 2.4531926615746564,
"learning_rate": 4.942580682584041e-07,
"loss": 0.3286,
"step": 4200
},
{
"epoch": 0.872587673791243,
"grad_norm": 2.377223115961875,
"learning_rate": 4.864370833135673e-07,
"loss": 0.3342,
"step": 4205
},
{
"epoch": 0.8736252334509235,
"grad_norm": 2.449018104688608,
"learning_rate": 4.786753063066318e-07,
"loss": 0.3355,
"step": 4210
},
{
"epoch": 0.8746627931106039,
"grad_norm": 2.4173614341699685,
"learning_rate": 4.7097283905489956e-07,
"loss": 0.3288,
"step": 4215
},
{
"epoch": 0.8757003527702842,
"grad_norm": 2.603118901855438,
"learning_rate": 4.633297825976635e-07,
"loss": 0.3371,
"step": 4220
},
{
"epoch": 0.8767379124299647,
"grad_norm": 2.4773560425700727,
"learning_rate": 4.5574623719487787e-07,
"loss": 0.3256,
"step": 4225
},
{
"epoch": 0.8777754720896451,
"grad_norm": 2.4564917228085053,
"learning_rate": 4.482223023258453e-07,
"loss": 0.337,
"step": 4230
},
{
"epoch": 0.8788130317493256,
"grad_norm": 2.4727605694693553,
"learning_rate": 4.407580766879066e-07,
"loss": 0.3221,
"step": 4235
},
{
"epoch": 0.879850591409006,
"grad_norm": 2.593564549202061,
"learning_rate": 4.333536581951542e-07,
"loss": 0.3364,
"step": 4240
},
{
"epoch": 0.8808881510686865,
"grad_norm": 2.3065543830266204,
"learning_rate": 4.2600914397714023e-07,
"loss": 0.3266,
"step": 4245
},
{
"epoch": 0.8819257107283669,
"grad_norm": 2.3918281813023783,
"learning_rate": 4.1872463037760823e-07,
"loss": 0.3311,
"step": 4250
},
{
"epoch": 0.8829632703880473,
"grad_norm": 2.5113381009120417,
"learning_rate": 4.1150021295322306e-07,
"loss": 0.3373,
"step": 4255
},
{
"epoch": 0.8840008300477278,
"grad_norm": 2.4230720157484935,
"learning_rate": 4.043359864723262e-07,
"loss": 0.3329,
"step": 4260
},
{
"epoch": 0.8850383897074082,
"grad_norm": 2.616499204627849,
"learning_rate": 3.972320449136829e-07,
"loss": 0.3295,
"step": 4265
},
{
"epoch": 0.8860759493670886,
"grad_norm": 2.5866660581654677,
"learning_rate": 3.90188481465254e-07,
"loss": 0.3276,
"step": 4270
},
{
"epoch": 0.887113509026769,
"grad_norm": 2.4795883896583364,
"learning_rate": 3.8320538852297694e-07,
"loss": 0.3339,
"step": 4275
},
{
"epoch": 0.8881510686864494,
"grad_norm": 2.3898148517858244,
"learning_rate": 3.762828576895472e-07,
"loss": 0.3373,
"step": 4280
},
{
"epoch": 0.8891886283461299,
"grad_norm": 2.4124000504118044,
"learning_rate": 3.694209797732201e-07,
"loss": 0.332,
"step": 4285
},
{
"epoch": 0.8902261880058103,
"grad_norm": 2.724223147572049,
"learning_rate": 3.6261984478662025e-07,
"loss": 0.3417,
"step": 4290
},
{
"epoch": 0.8912637476654908,
"grad_norm": 2.4453049065554278,
"learning_rate": 3.558795419455596e-07,
"loss": 0.3198,
"step": 4295
},
{
"epoch": 0.8923013073251712,
"grad_norm": 2.3658613859590165,
"learning_rate": 3.492001596678651e-07,
"loss": 0.3248,
"step": 4300
},
{
"epoch": 0.8933388669848517,
"grad_norm": 2.415791137686327,
"learning_rate": 3.4258178557222354e-07,
"loss": 0.3238,
"step": 4305
},
{
"epoch": 0.8943764266445321,
"grad_norm": 2.2801295759850566,
"learning_rate": 3.3602450647702847e-07,
"loss": 0.3317,
"step": 4310
},
{
"epoch": 0.8954139863042125,
"grad_norm": 2.532814151310614,
"learning_rate": 3.295284083992434e-07,
"loss": 0.3217,
"step": 4315
},
{
"epoch": 0.8964515459638929,
"grad_norm": 2.5019562185543993,
"learning_rate": 3.2309357655326945e-07,
"loss": 0.333,
"step": 4320
},
{
"epoch": 0.8974891056235733,
"grad_norm": 2.2943698094322955,
"learning_rate": 3.167200953498367e-07,
"loss": 0.3181,
"step": 4325
},
{
"epoch": 0.8985266652832538,
"grad_norm": 2.405841184133046,
"learning_rate": 3.1040804839488406e-07,
"loss": 0.3339,
"step": 4330
},
{
"epoch": 0.8995642249429342,
"grad_norm": 2.386316529188201,
"learning_rate": 3.041575184884732e-07,
"loss": 0.3341,
"step": 4335
},
{
"epoch": 0.9006017846026146,
"grad_norm": 2.517011507851569,
"learning_rate": 2.979685876236982e-07,
"loss": 0.3346,
"step": 4340
},
{
"epoch": 0.9016393442622951,
"grad_norm": 2.5508437593952094,
"learning_rate": 2.918413369856105e-07,
"loss": 0.3258,
"step": 4345
},
{
"epoch": 0.9026769039219755,
"grad_norm": 2.457005114059522,
"learning_rate": 2.857758469501509e-07,
"loss": 0.3197,
"step": 4350
},
{
"epoch": 0.903714463581656,
"grad_norm": 2.3964711097696836,
"learning_rate": 2.7977219708310134e-07,
"loss": 0.3288,
"step": 4355
},
{
"epoch": 0.9047520232413364,
"grad_norm": 2.4092985201196084,
"learning_rate": 2.7383046613903676e-07,
"loss": 0.325,
"step": 4360
},
{
"epoch": 0.9057895829010169,
"grad_norm": 2.519420182065836,
"learning_rate": 2.679507320602931e-07,
"loss": 0.325,
"step": 4365
},
{
"epoch": 0.9068271425606972,
"grad_norm": 2.4612555403219214,
"learning_rate": 2.6213307197594353e-07,
"loss": 0.3261,
"step": 4370
},
{
"epoch": 0.9078647022203776,
"grad_norm": 2.568118436273184,
"learning_rate": 2.5637756220079135e-07,
"loss": 0.3246,
"step": 4375
},
{
"epoch": 0.9089022618800581,
"grad_norm": 2.5397440534691373,
"learning_rate": 2.506842782343627e-07,
"loss": 0.3162,
"step": 4380
},
{
"epoch": 0.9099398215397385,
"grad_norm": 2.4925975703816245,
"learning_rate": 2.4505329475991823e-07,
"loss": 0.331,
"step": 4385
},
{
"epoch": 0.910977381199419,
"grad_norm": 3.3523863369544986,
"learning_rate": 2.3948468564347904e-07,
"loss": 0.3277,
"step": 4390
},
{
"epoch": 0.9120149408590994,
"grad_norm": 2.575554252160631,
"learning_rate": 2.3397852393284792e-07,
"loss": 0.3404,
"step": 4395
},
{
"epoch": 0.9130525005187798,
"grad_norm": 2.481671747693274,
"learning_rate": 2.2853488185665796e-07,
"loss": 0.3206,
"step": 4400
},
{
"epoch": 0.9140900601784603,
"grad_norm": 2.4222594768146415,
"learning_rate": 2.231538308234249e-07,
"loss": 0.3216,
"step": 4405
},
{
"epoch": 0.9151276198381407,
"grad_norm": 2.579114792911807,
"learning_rate": 2.178354414206063e-07,
"loss": 0.3267,
"step": 4410
},
{
"epoch": 0.9161651794978212,
"grad_norm": 2.472569881694958,
"learning_rate": 2.125797834136789e-07,
"loss": 0.3212,
"step": 4415
},
{
"epoch": 0.9172027391575015,
"grad_norm": 2.4091565145640605,
"learning_rate": 2.0738692574522324e-07,
"loss": 0.3246,
"step": 4420
},
{
"epoch": 0.9182402988171819,
"grad_norm": 2.4603025192987644,
"learning_rate": 2.0225693653401824e-07,
"loss": 0.3221,
"step": 4425
},
{
"epoch": 0.9192778584768624,
"grad_norm": 2.536120994058227,
"learning_rate": 1.9718988307414866e-07,
"loss": 0.3289,
"step": 4430
},
{
"epoch": 0.9203154181365428,
"grad_norm": 2.423050703239737,
"learning_rate": 1.921858318341191e-07,
"loss": 0.3258,
"step": 4435
},
{
"epoch": 0.9213529777962233,
"grad_norm": 2.429414838855423,
"learning_rate": 1.8724484845598855e-07,
"loss": 0.3196,
"step": 4440
},
{
"epoch": 0.9223905374559037,
"grad_norm": 2.7662680215892004,
"learning_rate": 1.8236699775450338e-07,
"loss": 0.3258,
"step": 4445
},
{
"epoch": 0.9234280971155842,
"grad_norm": 2.45993148237687,
"learning_rate": 1.7755234371624908e-07,
"loss": 0.318,
"step": 4450
},
{
"epoch": 0.9244656567752646,
"grad_norm": 2.4765798027549644,
"learning_rate": 1.7280094949881144e-07,
"loss": 0.3287,
"step": 4455
},
{
"epoch": 0.925503216434945,
"grad_norm": 2.5990110307214027,
"learning_rate": 1.6811287742994897e-07,
"loss": 0.3203,
"step": 4460
},
{
"epoch": 0.9265407760946255,
"grad_norm": 2.7248202388018563,
"learning_rate": 1.6348818900677077e-07,
"loss": 0.3196,
"step": 4465
},
{
"epoch": 0.9275783357543059,
"grad_norm": 2.360215127100717,
"learning_rate": 1.5892694489493598e-07,
"loss": 0.3266,
"step": 4470
},
{
"epoch": 0.9286158954139863,
"grad_norm": 2.3701662902928415,
"learning_rate": 1.5442920492785396e-07,
"loss": 0.3217,
"step": 4475
},
{
"epoch": 0.9296534550736667,
"grad_norm": 2.5408968036091593,
"learning_rate": 1.4999502810590094e-07,
"loss": 0.3147,
"step": 4480
},
{
"epoch": 0.9306910147333471,
"grad_norm": 2.3872752954425276,
"learning_rate": 1.456244725956446e-07,
"loss": 0.3314,
"step": 4485
},
{
"epoch": 0.9317285743930276,
"grad_norm": 2.510401039927896,
"learning_rate": 1.4131759572908354e-07,
"loss": 0.3233,
"step": 4490
},
{
"epoch": 0.932766134052708,
"grad_norm": 2.452160220911644,
"learning_rate": 1.370744540028929e-07,
"loss": 0.3216,
"step": 4495
},
{
"epoch": 0.9338036937123885,
"grad_norm": 2.5517139832044826,
"learning_rate": 1.328951030776826e-07,
"loss": 0.322,
"step": 4500
},
{
"epoch": 0.9348412533720689,
"grad_norm": 2.600821629157991,
"learning_rate": 1.2877959777727212e-07,
"loss": 0.328,
"step": 4505
},
{
"epoch": 0.9358788130317494,
"grad_norm": 2.4402893254633207,
"learning_rate": 1.2472799208796517e-07,
"loss": 0.3352,
"step": 4510
},
{
"epoch": 0.9369163726914298,
"grad_norm": 2.4890223515899748,
"learning_rate": 1.2074033915784543e-07,
"loss": 0.3273,
"step": 4515
},
{
"epoch": 0.9379539323511102,
"grad_norm": 2.541162954952577,
"learning_rate": 1.168166912960772e-07,
"loss": 0.3198,
"step": 4520
},
{
"epoch": 0.9389914920107906,
"grad_norm": 2.510264694405438,
"learning_rate": 1.1295709997222182e-07,
"loss": 0.3278,
"step": 4525
},
{
"epoch": 0.940029051670471,
"grad_norm": 2.4830766706425895,
"learning_rate": 1.0916161581555895e-07,
"loss": 0.3153,
"step": 4530
},
{
"epoch": 0.9410666113301515,
"grad_norm": 2.3227486456210222,
"learning_rate": 1.0543028861442539e-07,
"loss": 0.3177,
"step": 4535
},
{
"epoch": 0.9421041709898319,
"grad_norm": 2.5419821606370174,
"learning_rate": 1.0176316731556112e-07,
"loss": 0.3322,
"step": 4540
},
{
"epoch": 0.9431417306495123,
"grad_norm": 2.6673497102822536,
"learning_rate": 9.816030002346766e-08,
"loss": 0.3319,
"step": 4545
},
{
"epoch": 0.9441792903091928,
"grad_norm": 2.3937271331548904,
"learning_rate": 9.462173399977348e-08,
"loss": 0.3267,
"step": 4550
},
{
"epoch": 0.9452168499688732,
"grad_norm": 2.3407051661833123,
"learning_rate": 9.11475156626207e-08,
"loss": 0.3103,
"step": 4555
},
{
"epoch": 0.9462544096285537,
"grad_norm": 2.5377283482688755,
"learning_rate": 8.773769058605053e-08,
"loss": 0.3356,
"step": 4560
},
{
"epoch": 0.9472919692882341,
"grad_norm": 2.3498925102259904,
"learning_rate": 8.439230349940708e-08,
"loss": 0.3254,
"step": 4565
},
{
"epoch": 0.9483295289479146,
"grad_norm": 2.3416477533349145,
"learning_rate": 8.111139828675175e-08,
"loss": 0.3293,
"step": 4570
},
{
"epoch": 0.9493670886075949,
"grad_norm": 2.559009235835814,
"learning_rate": 7.78950179862864e-08,
"loss": 0.3185,
"step": 4575
},
{
"epoch": 0.9504046482672753,
"grad_norm": 2.586458213825969,
"learning_rate": 7.474320478978946e-08,
"loss": 0.3246,
"step": 4580
},
{
"epoch": 0.9514422079269558,
"grad_norm": 2.3485714776360407,
"learning_rate": 7.16560000420613e-08,
"loss": 0.3238,
"step": 4585
},
{
"epoch": 0.9524797675866362,
"grad_norm": 2.369694918601893,
"learning_rate": 6.863344424038354e-08,
"loss": 0.3103,
"step": 4590
},
{
"epoch": 0.9535173272463167,
"grad_norm": 2.4252802076691538,
"learning_rate": 6.567557703398675e-08,
"loss": 0.3273,
"step": 4595
},
{
"epoch": 0.9545548869059971,
"grad_norm": 2.552882664412225,
"learning_rate": 6.278243722352973e-08,
"loss": 0.3182,
"step": 4600
},
{
"epoch": 0.9555924465656775,
"grad_norm": 2.5353013413397507,
"learning_rate": 5.995406276059267e-08,
"loss": 0.3335,
"step": 4605
},
{
"epoch": 0.956630006225358,
"grad_norm": 2.4812443315722934,
"learning_rate": 5.719049074717764e-08,
"loss": 0.3191,
"step": 4610
},
{
"epoch": 0.9576675658850384,
"grad_norm": 2.427622171598608,
"learning_rate": 5.4491757435220505e-08,
"loss": 0.3236,
"step": 4615
},
{
"epoch": 0.9587051255447189,
"grad_norm": 2.601332407638946,
"learning_rate": 5.185789822612086e-08,
"loss": 0.3335,
"step": 4620
},
{
"epoch": 0.9597426852043992,
"grad_norm": 2.4380419721098976,
"learning_rate": 4.9288947670270146e-08,
"loss": 0.3193,
"step": 4625
},
{
"epoch": 0.9607802448640796,
"grad_norm": 2.36715515753985,
"learning_rate": 4.678493946660423e-08,
"loss": 0.3228,
"step": 4630
},
{
"epoch": 0.9618178045237601,
"grad_norm": 2.4303066509314677,
"learning_rate": 4.434590646215819e-08,
"loss": 0.3266,
"step": 4635
},
{
"epoch": 0.9628553641834405,
"grad_norm": 2.865610610806469,
"learning_rate": 4.1971880651638376e-08,
"loss": 0.3247,
"step": 4640
},
{
"epoch": 0.963892923843121,
"grad_norm": 2.5407706997414268,
"learning_rate": 3.966289317699878e-08,
"loss": 0.3288,
"step": 4645
},
{
"epoch": 0.9649304835028014,
"grad_norm": 2.4204826042021437,
"learning_rate": 3.74189743270359e-08,
"loss": 0.3145,
"step": 4650
},
{
"epoch": 0.9659680431624819,
"grad_norm": 2.2543367363310716,
"learning_rate": 3.5240153536988954e-08,
"loss": 0.3307,
"step": 4655
},
{
"epoch": 0.9670056028221623,
"grad_norm": 2.3642879401404677,
"learning_rate": 3.312645938815695e-08,
"loss": 0.3111,
"step": 4660
},
{
"epoch": 0.9680431624818427,
"grad_norm": 2.551087158396502,
"learning_rate": 3.107791960752005e-08,
"loss": 0.3202,
"step": 4665
},
{
"epoch": 0.9690807221415232,
"grad_norm": 2.431842119612971,
"learning_rate": 2.909456106737818e-08,
"loss": 0.3271,
"step": 4670
},
{
"epoch": 0.9701182818012035,
"grad_norm": 2.469650411851549,
"learning_rate": 2.7176409784998027e-08,
"loss": 0.3225,
"step": 4675
},
{
"epoch": 0.971155841460884,
"grad_norm": 2.6570472493964337,
"learning_rate": 2.5323490922271044e-08,
"loss": 0.3233,
"step": 4680
},
{
"epoch": 0.9721934011205644,
"grad_norm": 2.792428588323876,
"learning_rate": 2.3535828785384296e-08,
"loss": 0.3263,
"step": 4685
},
{
"epoch": 0.9732309607802448,
"grad_norm": 2.3558732345665456,
"learning_rate": 2.1813446824502372e-08,
"loss": 0.3136,
"step": 4690
},
{
"epoch": 0.9742685204399253,
"grad_norm": 2.356877231683961,
"learning_rate": 2.0156367633455965e-08,
"loss": 0.3239,
"step": 4695
},
{
"epoch": 0.9753060800996057,
"grad_norm": 2.4529847227301818,
"learning_rate": 1.8564612949451555e-08,
"loss": 0.3168,
"step": 4700
},
{
"epoch": 0.9763436397592862,
"grad_norm": 2.382662217743029,
"learning_rate": 1.7038203652781083e-08,
"loss": 0.3234,
"step": 4705
},
{
"epoch": 0.9773811994189666,
"grad_norm": 2.333476690943818,
"learning_rate": 1.5577159766548832e-08,
"loss": 0.3274,
"step": 4710
},
{
"epoch": 0.9784187590786471,
"grad_norm": 2.402141955867801,
"learning_rate": 1.4181500456412755e-08,
"loss": 0.3215,
"step": 4715
},
{
"epoch": 0.9794563187383275,
"grad_norm": 2.501308170912781,
"learning_rate": 1.2851244030328004e-08,
"loss": 0.3212,
"step": 4720
},
{
"epoch": 0.9804938783980078,
"grad_norm": 2.6177200025512626,
"learning_rate": 1.1586407938308785e-08,
"loss": 0.3136,
"step": 4725
},
{
"epoch": 0.9815314380576883,
"grad_norm": 2.548386853689889,
"learning_rate": 1.0387008772200779e-08,
"loss": 0.342,
"step": 4730
},
{
"epoch": 0.9825689977173687,
"grad_norm": 2.552595538532663,
"learning_rate": 9.253062265461855e-09,
"loss": 0.3234,
"step": 4735
},
{
"epoch": 0.9836065573770492,
"grad_norm": 2.489168583450402,
"learning_rate": 8.184583292955572e-09,
"loss": 0.318,
"step": 4740
},
{
"epoch": 0.9846441170367296,
"grad_norm": 2.380606466941288,
"learning_rate": 7.181585870757457e-09,
"loss": 0.326,
"step": 4745
},
{
"epoch": 0.98568167669641,
"grad_norm": 2.4173383787142297,
"learning_rate": 6.2440831559690275e-09,
"loss": 0.3185,
"step": 4750
},
{
"epoch": 0.9867192363560905,
"grad_norm": 2.542878596032387,
"learning_rate": 5.372087446547935e-09,
"loss": 0.3252,
"step": 4755
},
{
"epoch": 0.9877567960157709,
"grad_norm": 2.4745977048628918,
"learning_rate": 4.565610181144209e-09,
"loss": 0.3304,
"step": 4760
},
{
"epoch": 0.9887943556754514,
"grad_norm": 2.4052051630616895,
"learning_rate": 3.824661938951479e-09,
"loss": 0.3243,
"step": 4765
},
{
"epoch": 0.9898319153351318,
"grad_norm": 2.6094698060669264,
"learning_rate": 3.1492524395682065e-09,
"loss": 0.3189,
"step": 4770
},
{
"epoch": 0.9908694749948121,
"grad_norm": 2.3476458011761134,
"learning_rate": 2.5393905428688913e-09,
"loss": 0.3231,
"step": 4775
},
{
"epoch": 0.9919070346544926,
"grad_norm": 2.3445329353887456,
"learning_rate": 1.9950842488891674e-09,
"loss": 0.3355,
"step": 4780
},
{
"epoch": 0.992944594314173,
"grad_norm": 2.4693805125337094,
"learning_rate": 1.5163406977219963e-09,
"loss": 0.3256,
"step": 4785
},
{
"epoch": 0.9939821539738535,
"grad_norm": 2.563629879915604,
"learning_rate": 1.103166169420522e-09,
"loss": 0.3254,
"step": 4790
},
{
"epoch": 0.9950197136335339,
"grad_norm": 2.494757369109791,
"learning_rate": 7.555660839181356e-10,
"loss": 0.3088,
"step": 4795
},
{
"epoch": 0.9960572732932144,
"grad_norm": 2.373070173051403,
"learning_rate": 4.735450009579756e-10,
"loss": 0.323,
"step": 4800
},
{
"epoch": 0.9970948329528948,
"grad_norm": 2.397230200383968,
"learning_rate": 2.571066200307559e-10,
"loss": 0.3303,
"step": 4805
},
{
"epoch": 0.9981323926125752,
"grad_norm": 2.6318567902913985,
"learning_rate": 1.0625378032813604e-10,
"loss": 0.3156,
"step": 4810
},
{
"epoch": 0.9991699522722557,
"grad_norm": 2.3100958526551696,
"learning_rate": 2.0988460705528846e-11,
"loss": 0.3192,
"step": 4815
},
{
"epoch": 1.0,
"eval_loss": 0.2939795255661011,
"eval_runtime": 0.9809,
"eval_samples_per_second": 3.058,
"eval_steps_per_second": 1.019,
"step": 4819
},
{
"epoch": 1.0,
"step": 4819,
"total_flos": 504500280360960.0,
"train_loss": 0.5211530197307415,
"train_runtime": 55738.8815,
"train_samples_per_second": 1.383,
"train_steps_per_second": 0.086
}
],
"logging_steps": 5,
"max_steps": 4819,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 504500280360960.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}