{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4819, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020751193193608634, "grad_norm": 23.81510217286444, "learning_rate": 2.0746887966804982e-08, "loss": 1.3923, "step": 1 }, { "epoch": 0.0010375596596804316, "grad_norm": 23.986426679578184, "learning_rate": 1.037344398340249e-07, "loss": 1.4149, "step": 5 }, { "epoch": 0.002075119319360863, "grad_norm": 21.906176707284583, "learning_rate": 2.074688796680498e-07, "loss": 1.4046, "step": 10 }, { "epoch": 0.003112678979041295, "grad_norm": 15.167476223997951, "learning_rate": 3.112033195020747e-07, "loss": 1.3592, "step": 15 }, { "epoch": 0.004150238638721726, "grad_norm": 9.140589856353586, "learning_rate": 4.149377593360996e-07, "loss": 1.2548, "step": 20 }, { "epoch": 0.005187798298402158, "grad_norm": 10.826361201441884, "learning_rate": 5.186721991701245e-07, "loss": 1.1628, "step": 25 }, { "epoch": 0.00622535795808259, "grad_norm": 9.062979414726044, "learning_rate": 6.224066390041494e-07, "loss": 1.0633, "step": 30 }, { "epoch": 0.007262917617763021, "grad_norm": 3.7471401092788903, "learning_rate": 7.261410788381744e-07, "loss": 1.0125, "step": 35 }, { "epoch": 0.008300477277443452, "grad_norm": 3.4052281488293517, "learning_rate": 8.298755186721992e-07, "loss": 0.9673, "step": 40 }, { "epoch": 0.009338036937123885, "grad_norm": 3.4155319860158584, "learning_rate": 9.336099585062241e-07, "loss": 0.9582, "step": 45 }, { "epoch": 0.010375596596804317, "grad_norm": 3.0460439284849947, "learning_rate": 1.037344398340249e-06, "loss": 0.9393, "step": 50 }, { "epoch": 0.011413156256484747, "grad_norm": 3.0573711664380117, "learning_rate": 1.141078838174274e-06, "loss": 0.928, "step": 55 }, { "epoch": 0.01245071591616518, "grad_norm": 3.104368834698698, "learning_rate": 1.2448132780082988e-06, "loss": 0.9128, "step": 60 }, { "epoch": 0.013488275575845612, "grad_norm": 3.1380190764833094, "learning_rate": 1.3485477178423237e-06, "loss": 0.9156, "step": 65 }, { "epoch": 0.014525835235526042, "grad_norm": 3.1006570296836182, "learning_rate": 1.4522821576763488e-06, "loss": 0.905, "step": 70 }, { "epoch": 0.015563394895206474, "grad_norm": 3.127744100257649, "learning_rate": 1.5560165975103735e-06, "loss": 0.9163, "step": 75 }, { "epoch": 0.016600954554886905, "grad_norm": 3.038437602227199, "learning_rate": 1.6597510373443984e-06, "loss": 0.8884, "step": 80 }, { "epoch": 0.017638514214567337, "grad_norm": 2.9596644094641413, "learning_rate": 1.7634854771784235e-06, "loss": 0.8923, "step": 85 }, { "epoch": 0.01867607387424777, "grad_norm": 2.958917381373649, "learning_rate": 1.8672199170124482e-06, "loss": 0.8939, "step": 90 }, { "epoch": 0.0197136335339282, "grad_norm": 3.13129582575627, "learning_rate": 1.970954356846473e-06, "loss": 0.8671, "step": 95 }, { "epoch": 0.020751193193608634, "grad_norm": 3.212698731191082, "learning_rate": 2.074688796680498e-06, "loss": 0.8846, "step": 100 }, { "epoch": 0.021788752853289062, "grad_norm": 2.9778959899019273, "learning_rate": 2.178423236514523e-06, "loss": 0.8747, "step": 105 }, { "epoch": 0.022826312512969495, "grad_norm": 3.02722706158588, "learning_rate": 2.282157676348548e-06, "loss": 0.8841, "step": 110 }, { "epoch": 0.023863872172649927, "grad_norm": 3.104436315782777, "learning_rate": 2.385892116182573e-06, "loss": 0.8646, "step": 115 }, { "epoch": 0.02490143183233036, "grad_norm": 3.0389426041639847, "learning_rate": 2.4896265560165977e-06, "loss": 0.8769, "step": 120 }, { "epoch": 0.02593899149201079, "grad_norm": 3.0481198307330772, "learning_rate": 2.5933609958506228e-06, "loss": 0.8597, "step": 125 }, { "epoch": 0.026976551151691223, "grad_norm": 3.02592811695882, "learning_rate": 2.6970954356846475e-06, "loss": 0.8668, "step": 130 }, { "epoch": 0.028014110811371652, "grad_norm": 3.2120467143006954, "learning_rate": 2.8008298755186726e-06, "loss": 0.8651, "step": 135 }, { "epoch": 0.029051670471052084, "grad_norm": 2.9463572893956633, "learning_rate": 2.9045643153526977e-06, "loss": 0.8648, "step": 140 }, { "epoch": 0.030089230130732517, "grad_norm": 3.2030553684980054, "learning_rate": 3.008298755186722e-06, "loss": 0.861, "step": 145 }, { "epoch": 0.03112678979041295, "grad_norm": 3.134802130941582, "learning_rate": 3.112033195020747e-06, "loss": 0.8572, "step": 150 }, { "epoch": 0.03216434945009338, "grad_norm": 3.0051811744191075, "learning_rate": 3.215767634854772e-06, "loss": 0.8548, "step": 155 }, { "epoch": 0.03320190910977381, "grad_norm": 2.9087769254486013, "learning_rate": 3.319502074688797e-06, "loss": 0.857, "step": 160 }, { "epoch": 0.034239468769454245, "grad_norm": 3.251836422833884, "learning_rate": 3.423236514522822e-06, "loss": 0.8479, "step": 165 }, { "epoch": 0.035277028429134674, "grad_norm": 2.988883877751971, "learning_rate": 3.526970954356847e-06, "loss": 0.8573, "step": 170 }, { "epoch": 0.03631458808881511, "grad_norm": 2.9670335974516155, "learning_rate": 3.6307053941908714e-06, "loss": 0.8509, "step": 175 }, { "epoch": 0.03735214774849554, "grad_norm": 3.0847182089806333, "learning_rate": 3.7344398340248965e-06, "loss": 0.8421, "step": 180 }, { "epoch": 0.03838970740817597, "grad_norm": 2.8264906420734843, "learning_rate": 3.838174273858922e-06, "loss": 0.8484, "step": 185 }, { "epoch": 0.0394272670678564, "grad_norm": 2.974170420918456, "learning_rate": 3.941908713692946e-06, "loss": 0.8474, "step": 190 }, { "epoch": 0.04046482672753683, "grad_norm": 3.077208581236518, "learning_rate": 4.045643153526971e-06, "loss": 0.8275, "step": 195 }, { "epoch": 0.04150238638721727, "grad_norm": 3.134791812920161, "learning_rate": 4.149377593360996e-06, "loss": 0.8419, "step": 200 }, { "epoch": 0.042539946046897696, "grad_norm": 3.2479409082629935, "learning_rate": 4.253112033195021e-06, "loss": 0.8466, "step": 205 }, { "epoch": 0.043577505706578125, "grad_norm": 2.8988040172466767, "learning_rate": 4.356846473029046e-06, "loss": 0.8489, "step": 210 }, { "epoch": 0.04461506536625856, "grad_norm": 3.0923511376582797, "learning_rate": 4.460580912863071e-06, "loss": 0.8455, "step": 215 }, { "epoch": 0.04565262502593899, "grad_norm": 3.1659114421543895, "learning_rate": 4.564315352697096e-06, "loss": 0.8366, "step": 220 }, { "epoch": 0.046690184685619425, "grad_norm": 2.9510079953090953, "learning_rate": 4.66804979253112e-06, "loss": 0.8492, "step": 225 }, { "epoch": 0.047727744345299854, "grad_norm": 2.9528441973334503, "learning_rate": 4.771784232365146e-06, "loss": 0.8317, "step": 230 }, { "epoch": 0.04876530400498029, "grad_norm": 3.009602261579148, "learning_rate": 4.875518672199171e-06, "loss": 0.8282, "step": 235 }, { "epoch": 0.04980286366466072, "grad_norm": 3.0538602837457702, "learning_rate": 4.979253112033195e-06, "loss": 0.8379, "step": 240 }, { "epoch": 0.05084042332434115, "grad_norm": 3.067939083048346, "learning_rate": 5.08298755186722e-06, "loss": 0.8255, "step": 245 }, { "epoch": 0.05187798298402158, "grad_norm": 2.8381217796695233, "learning_rate": 5.1867219917012455e-06, "loss": 0.8424, "step": 250 }, { "epoch": 0.05291554264370201, "grad_norm": 3.4784967911008247, "learning_rate": 5.29045643153527e-06, "loss": 0.8288, "step": 255 }, { "epoch": 0.05395310230338245, "grad_norm": 3.308889130459959, "learning_rate": 5.394190871369295e-06, "loss": 0.8458, "step": 260 }, { "epoch": 0.054990661963062876, "grad_norm": 3.043690822549995, "learning_rate": 5.4979253112033204e-06, "loss": 0.8233, "step": 265 }, { "epoch": 0.056028221622743304, "grad_norm": 3.168880712945065, "learning_rate": 5.601659751037345e-06, "loss": 0.831, "step": 270 }, { "epoch": 0.05706578128242374, "grad_norm": 3.1149299465392644, "learning_rate": 5.70539419087137e-06, "loss": 0.8363, "step": 275 }, { "epoch": 0.05810334094210417, "grad_norm": 3.1047167518774814, "learning_rate": 5.809128630705395e-06, "loss": 0.8391, "step": 280 }, { "epoch": 0.059140900601784605, "grad_norm": 2.8999047365073927, "learning_rate": 5.91286307053942e-06, "loss": 0.8368, "step": 285 }, { "epoch": 0.06017846026146503, "grad_norm": 3.066022536608264, "learning_rate": 6.016597510373444e-06, "loss": 0.8379, "step": 290 }, { "epoch": 0.06121601992114547, "grad_norm": 3.116983544786236, "learning_rate": 6.1203319502074694e-06, "loss": 0.8169, "step": 295 }, { "epoch": 0.0622535795808259, "grad_norm": 7.508628361145243, "learning_rate": 6.224066390041494e-06, "loss": 0.8186, "step": 300 }, { "epoch": 0.06329113924050633, "grad_norm": 2.959364581244697, "learning_rate": 6.327800829875519e-06, "loss": 0.823, "step": 305 }, { "epoch": 0.06432869890018676, "grad_norm": 3.3247616385013554, "learning_rate": 6.431535269709544e-06, "loss": 0.8336, "step": 310 }, { "epoch": 0.06536625855986719, "grad_norm": 2.97755825450701, "learning_rate": 6.535269709543569e-06, "loss": 0.8403, "step": 315 }, { "epoch": 0.06640381821954762, "grad_norm": 2.920924218399819, "learning_rate": 6.639004149377594e-06, "loss": 0.8352, "step": 320 }, { "epoch": 0.06744137787922806, "grad_norm": 2.996226962381477, "learning_rate": 6.742738589211619e-06, "loss": 0.8149, "step": 325 }, { "epoch": 0.06847893753890849, "grad_norm": 2.9599991745332743, "learning_rate": 6.846473029045644e-06, "loss": 0.8227, "step": 330 }, { "epoch": 0.06951649719858892, "grad_norm": 3.3083929869948374, "learning_rate": 6.950207468879669e-06, "loss": 0.8293, "step": 335 }, { "epoch": 0.07055405685826935, "grad_norm": 2.921592069295814, "learning_rate": 7.053941908713694e-06, "loss": 0.8063, "step": 340 }, { "epoch": 0.07159161651794978, "grad_norm": 2.9356160908302296, "learning_rate": 7.157676348547719e-06, "loss": 0.8249, "step": 345 }, { "epoch": 0.07262917617763022, "grad_norm": 3.127667490017175, "learning_rate": 7.261410788381743e-06, "loss": 0.805, "step": 350 }, { "epoch": 0.07366673583731065, "grad_norm": 2.811701116040028, "learning_rate": 7.365145228215769e-06, "loss": 0.8141, "step": 355 }, { "epoch": 0.07470429549699108, "grad_norm": 3.428565494126869, "learning_rate": 7.468879668049793e-06, "loss": 0.8137, "step": 360 }, { "epoch": 0.0757418551566715, "grad_norm": 3.0132405721717794, "learning_rate": 7.572614107883818e-06, "loss": 0.8121, "step": 365 }, { "epoch": 0.07677941481635193, "grad_norm": 3.0044504000522045, "learning_rate": 7.676348547717844e-06, "loss": 0.8112, "step": 370 }, { "epoch": 0.07781697447603238, "grad_norm": 2.938773166801736, "learning_rate": 7.780082987551869e-06, "loss": 0.8071, "step": 375 }, { "epoch": 0.0788545341357128, "grad_norm": 2.816233639978962, "learning_rate": 7.883817427385892e-06, "loss": 0.8269, "step": 380 }, { "epoch": 0.07989209379539323, "grad_norm": 3.2016060279962297, "learning_rate": 7.987551867219918e-06, "loss": 0.8173, "step": 385 }, { "epoch": 0.08092965345507366, "grad_norm": 3.0338701288383567, "learning_rate": 8.091286307053943e-06, "loss": 0.7997, "step": 390 }, { "epoch": 0.08196721311475409, "grad_norm": 2.8900162172973025, "learning_rate": 8.195020746887967e-06, "loss": 0.8296, "step": 395 }, { "epoch": 0.08300477277443453, "grad_norm": 2.775870448004777, "learning_rate": 8.298755186721992e-06, "loss": 0.8139, "step": 400 }, { "epoch": 0.08404233243411496, "grad_norm": 2.997946086685837, "learning_rate": 8.402489626556017e-06, "loss": 0.8244, "step": 405 }, { "epoch": 0.08507989209379539, "grad_norm": 2.79051986833281, "learning_rate": 8.506224066390042e-06, "loss": 0.8207, "step": 410 }, { "epoch": 0.08611745175347582, "grad_norm": 2.8248639051971334, "learning_rate": 8.609958506224068e-06, "loss": 0.805, "step": 415 }, { "epoch": 0.08715501141315625, "grad_norm": 2.9096736006801485, "learning_rate": 8.713692946058093e-06, "loss": 0.8037, "step": 420 }, { "epoch": 0.08819257107283669, "grad_norm": 2.940919950600375, "learning_rate": 8.817427385892117e-06, "loss": 0.8186, "step": 425 }, { "epoch": 0.08923013073251712, "grad_norm": 2.896130509805648, "learning_rate": 8.921161825726142e-06, "loss": 0.8245, "step": 430 }, { "epoch": 0.09026769039219755, "grad_norm": 2.8401858033389713, "learning_rate": 9.024896265560167e-06, "loss": 0.8081, "step": 435 }, { "epoch": 0.09130525005187798, "grad_norm": 2.86912546506336, "learning_rate": 9.128630705394191e-06, "loss": 0.8164, "step": 440 }, { "epoch": 0.09234280971155842, "grad_norm": 2.7242600466916453, "learning_rate": 9.232365145228218e-06, "loss": 0.8073, "step": 445 }, { "epoch": 0.09338036937123885, "grad_norm": 3.044646841531881, "learning_rate": 9.33609958506224e-06, "loss": 0.8145, "step": 450 }, { "epoch": 0.09441792903091928, "grad_norm": 2.8295153855299198, "learning_rate": 9.439834024896265e-06, "loss": 0.7957, "step": 455 }, { "epoch": 0.09545548869059971, "grad_norm": 2.910948125766166, "learning_rate": 9.543568464730292e-06, "loss": 0.8053, "step": 460 }, { "epoch": 0.09649304835028014, "grad_norm": 2.8219575214924597, "learning_rate": 9.647302904564317e-06, "loss": 0.8128, "step": 465 }, { "epoch": 0.09753060800996058, "grad_norm": 2.9677851670315167, "learning_rate": 9.751037344398341e-06, "loss": 0.8061, "step": 470 }, { "epoch": 0.09856816766964101, "grad_norm": 3.1531286571785326, "learning_rate": 9.854771784232366e-06, "loss": 0.7962, "step": 475 }, { "epoch": 0.09960572732932144, "grad_norm": 3.1075603430133105, "learning_rate": 9.95850622406639e-06, "loss": 0.8072, "step": 480 }, { "epoch": 0.10064328698900186, "grad_norm": 2.844518989686556, "learning_rate": 9.99998819398724e-06, "loss": 0.8198, "step": 485 }, { "epoch": 0.1016808466486823, "grad_norm": 2.8894422413511127, "learning_rate": 9.999916046333384e-06, "loss": 0.8146, "step": 490 }, { "epoch": 0.10271840630836274, "grad_norm": 2.8815754440809878, "learning_rate": 9.999778310866921e-06, "loss": 0.7899, "step": 495 }, { "epoch": 0.10375596596804317, "grad_norm": 2.885119932296298, "learning_rate": 9.999574989394634e-06, "loss": 0.8013, "step": 500 }, { "epoch": 0.1047935256277236, "grad_norm": 2.8642122606932174, "learning_rate": 9.99930608458365e-06, "loss": 0.805, "step": 505 }, { "epoch": 0.10583108528740402, "grad_norm": 2.9310957652228082, "learning_rate": 9.998971599961405e-06, "loss": 0.7915, "step": 510 }, { "epoch": 0.10686864494708445, "grad_norm": 2.8531320874329746, "learning_rate": 9.998571539915592e-06, "loss": 0.7981, "step": 515 }, { "epoch": 0.1079062046067649, "grad_norm": 2.839970691252372, "learning_rate": 9.998105909694117e-06, "loss": 0.7999, "step": 520 }, { "epoch": 0.10894376426644532, "grad_norm": 2.853880263858815, "learning_rate": 9.997574715405011e-06, "loss": 0.8311, "step": 525 }, { "epoch": 0.10998132392612575, "grad_norm": 2.9338168378898337, "learning_rate": 9.996977964016371e-06, "loss": 0.8005, "step": 530 }, { "epoch": 0.11101888358580618, "grad_norm": 2.7032896393481254, "learning_rate": 9.996315663356247e-06, "loss": 0.8003, "step": 535 }, { "epoch": 0.11205644324548661, "grad_norm": 2.761074984380135, "learning_rate": 9.995587822112558e-06, "loss": 0.8044, "step": 540 }, { "epoch": 0.11309400290516705, "grad_norm": 2.6815711646329556, "learning_rate": 9.994794449832966e-06, "loss": 0.7887, "step": 545 }, { "epoch": 0.11413156256484748, "grad_norm": 2.8649570007362657, "learning_rate": 9.993935556924756e-06, "loss": 0.7776, "step": 550 }, { "epoch": 0.11516912222452791, "grad_norm": 2.9788872729907547, "learning_rate": 9.993011154654702e-06, "loss": 0.7778, "step": 555 }, { "epoch": 0.11620668188420834, "grad_norm": 2.771583627699142, "learning_rate": 9.992021255148907e-06, "loss": 0.7876, "step": 560 }, { "epoch": 0.11724424154388878, "grad_norm": 3.156864019145989, "learning_rate": 9.990965871392662e-06, "loss": 0.7924, "step": 565 }, { "epoch": 0.11828180120356921, "grad_norm": 2.6854537961356653, "learning_rate": 9.989845017230258e-06, "loss": 0.7841, "step": 570 }, { "epoch": 0.11931936086324964, "grad_norm": 3.194287668643439, "learning_rate": 9.988658707364819e-06, "loss": 0.7807, "step": 575 }, { "epoch": 0.12035692052293007, "grad_norm": 2.7638444846888173, "learning_rate": 9.9874069573581e-06, "loss": 0.7846, "step": 580 }, { "epoch": 0.1213944801826105, "grad_norm": 2.7802968983308936, "learning_rate": 9.986089783630286e-06, "loss": 0.775, "step": 585 }, { "epoch": 0.12243203984229094, "grad_norm": 2.8469947394820103, "learning_rate": 9.984707203459774e-06, "loss": 0.7672, "step": 590 }, { "epoch": 0.12346959950197137, "grad_norm": 2.9990572740062493, "learning_rate": 9.983259234982951e-06, "loss": 0.7779, "step": 595 }, { "epoch": 0.1245071591616518, "grad_norm": 3.1849226285333345, "learning_rate": 9.981745897193955e-06, "loss": 0.7714, "step": 600 }, { "epoch": 0.12554471882133222, "grad_norm": 2.835199675947756, "learning_rate": 9.98016720994442e-06, "loss": 0.7784, "step": 605 }, { "epoch": 0.12658227848101267, "grad_norm": 2.8294757615641886, "learning_rate": 9.978523193943222e-06, "loss": 0.7905, "step": 610 }, { "epoch": 0.12761983814069308, "grad_norm": 2.6228325540550657, "learning_rate": 9.976813870756209e-06, "loss": 0.7695, "step": 615 }, { "epoch": 0.12865739780037352, "grad_norm": 2.6754726256364902, "learning_rate": 9.975039262805907e-06, "loss": 0.7784, "step": 620 }, { "epoch": 0.12969495746005397, "grad_norm": 2.6428193830815663, "learning_rate": 9.973199393371242e-06, "loss": 0.7768, "step": 625 }, { "epoch": 0.13073251711973438, "grad_norm": 2.7019164377169824, "learning_rate": 9.97129428658722e-06, "loss": 0.7787, "step": 630 }, { "epoch": 0.13177007677941482, "grad_norm": 2.7193094088840635, "learning_rate": 9.969323967444616e-06, "loss": 0.7691, "step": 635 }, { "epoch": 0.13280763643909524, "grad_norm": 2.9547462574056227, "learning_rate": 9.96728846178965e-06, "loss": 0.7791, "step": 640 }, { "epoch": 0.13384519609877568, "grad_norm": 2.7070337933579673, "learning_rate": 9.965187796323643e-06, "loss": 0.7793, "step": 645 }, { "epoch": 0.13488275575845612, "grad_norm": 2.846568092264262, "learning_rate": 9.96302199860267e-06, "loss": 0.7657, "step": 650 }, { "epoch": 0.13592031541813654, "grad_norm": 2.6430732770071885, "learning_rate": 9.96079109703719e-06, "loss": 0.7613, "step": 655 }, { "epoch": 0.13695787507781698, "grad_norm": 2.564352165321478, "learning_rate": 9.95849512089169e-06, "loss": 0.7716, "step": 660 }, { "epoch": 0.1379954347374974, "grad_norm": 2.787666582812226, "learning_rate": 9.956134100284285e-06, "loss": 0.7788, "step": 665 }, { "epoch": 0.13903299439717784, "grad_norm": 2.565898584923949, "learning_rate": 9.95370806618633e-06, "loss": 0.7612, "step": 670 }, { "epoch": 0.14007055405685828, "grad_norm": 2.9186277859262244, "learning_rate": 9.951217050422013e-06, "loss": 0.7787, "step": 675 }, { "epoch": 0.1411081137165387, "grad_norm": 2.7573443860042546, "learning_rate": 9.94866108566794e-06, "loss": 0.7556, "step": 680 }, { "epoch": 0.14214567337621914, "grad_norm": 2.840469194753, "learning_rate": 9.946040205452699e-06, "loss": 0.7456, "step": 685 }, { "epoch": 0.14318323303589955, "grad_norm": 2.55261542401345, "learning_rate": 9.943354444156428e-06, "loss": 0.7789, "step": 690 }, { "epoch": 0.14422079269558, "grad_norm": 2.6744948199976535, "learning_rate": 9.940603837010358e-06, "loss": 0.773, "step": 695 }, { "epoch": 0.14525835235526044, "grad_norm": 2.7013083342024107, "learning_rate": 9.937788420096362e-06, "loss": 0.7735, "step": 700 }, { "epoch": 0.14629591201494085, "grad_norm": 2.731487086002391, "learning_rate": 9.934908230346462e-06, "loss": 0.7523, "step": 705 }, { "epoch": 0.1473334716746213, "grad_norm": 2.6033671380903334, "learning_rate": 9.931963305542363e-06, "loss": 0.7517, "step": 710 }, { "epoch": 0.1483710313343017, "grad_norm": 2.662624062037032, "learning_rate": 9.92895368431495e-06, "loss": 0.7659, "step": 715 }, { "epoch": 0.14940859099398215, "grad_norm": 3.714026552589638, "learning_rate": 9.925879406143779e-06, "loss": 0.7646, "step": 720 }, { "epoch": 0.1504461506536626, "grad_norm": 3.3049933455540073, "learning_rate": 9.922740511356565e-06, "loss": 0.7681, "step": 725 }, { "epoch": 0.151483710313343, "grad_norm": 2.7320515915400816, "learning_rate": 9.919537041128647e-06, "loss": 0.746, "step": 730 }, { "epoch": 0.15252126997302345, "grad_norm": 2.635817269274888, "learning_rate": 9.916269037482452e-06, "loss": 0.7306, "step": 735 }, { "epoch": 0.15355882963270387, "grad_norm": 2.662103702116443, "learning_rate": 9.912936543286939e-06, "loss": 0.7536, "step": 740 }, { "epoch": 0.1545963892923843, "grad_norm": 2.85326362399482, "learning_rate": 9.909539602257048e-06, "loss": 0.7673, "step": 745 }, { "epoch": 0.15563394895206475, "grad_norm": 2.567231081949611, "learning_rate": 9.90607825895311e-06, "loss": 0.738, "step": 750 }, { "epoch": 0.15667150861174517, "grad_norm": 2.869479144385386, "learning_rate": 9.902552558780276e-06, "loss": 0.7598, "step": 755 }, { "epoch": 0.1577090682714256, "grad_norm": 2.730929830901487, "learning_rate": 9.898962547987913e-06, "loss": 0.748, "step": 760 }, { "epoch": 0.15874662793110603, "grad_norm": 2.793985046920194, "learning_rate": 9.895308273669007e-06, "loss": 0.7328, "step": 765 }, { "epoch": 0.15978418759078647, "grad_norm": 2.7966738638089246, "learning_rate": 9.89158978375953e-06, "loss": 0.7676, "step": 770 }, { "epoch": 0.1608217472504669, "grad_norm": 2.6679897290967576, "learning_rate": 9.887807127037827e-06, "loss": 0.7295, "step": 775 }, { "epoch": 0.16185930691014733, "grad_norm": 2.5457872650558455, "learning_rate": 9.88396035312397e-06, "loss": 0.728, "step": 780 }, { "epoch": 0.16289686656982777, "grad_norm": 2.8332820570127626, "learning_rate": 9.880049512479097e-06, "loss": 0.7421, "step": 785 }, { "epoch": 0.16393442622950818, "grad_norm": 2.7639539461730114, "learning_rate": 9.876074656404773e-06, "loss": 0.7534, "step": 790 }, { "epoch": 0.16497198588918863, "grad_norm": 2.7213075641667928, "learning_rate": 9.872035837042292e-06, "loss": 0.7363, "step": 795 }, { "epoch": 0.16600954554886907, "grad_norm": 2.6178852546410183, "learning_rate": 9.86793310737201e-06, "loss": 0.7318, "step": 800 }, { "epoch": 0.16704710520854948, "grad_norm": 2.7566382614918585, "learning_rate": 9.863766521212646e-06, "loss": 0.7507, "step": 805 }, { "epoch": 0.16808466486822993, "grad_norm": 2.75430515948912, "learning_rate": 9.859536133220569e-06, "loss": 0.7481, "step": 810 }, { "epoch": 0.16912222452791034, "grad_norm": 2.7773285238684813, "learning_rate": 9.855241998889091e-06, "loss": 0.7456, "step": 815 }, { "epoch": 0.17015978418759078, "grad_norm": 2.9942393187975904, "learning_rate": 9.850884174547734e-06, "loss": 0.7512, "step": 820 }, { "epoch": 0.17119734384727123, "grad_norm": 2.8176703833149706, "learning_rate": 9.846462717361489e-06, "loss": 0.7229, "step": 825 }, { "epoch": 0.17223490350695164, "grad_norm": 2.606112734187681, "learning_rate": 9.841977685330074e-06, "loss": 0.7544, "step": 830 }, { "epoch": 0.17327246316663208, "grad_norm": 2.7643282324128324, "learning_rate": 9.837429137287164e-06, "loss": 0.7233, "step": 835 }, { "epoch": 0.1743100228263125, "grad_norm": 2.58906230921786, "learning_rate": 9.832817132899622e-06, "loss": 0.7496, "step": 840 }, { "epoch": 0.17534758248599294, "grad_norm": 2.9173083959154913, "learning_rate": 9.828141732666722e-06, "loss": 0.7405, "step": 845 }, { "epoch": 0.17638514214567338, "grad_norm": 2.807434396354819, "learning_rate": 9.823402997919346e-06, "loss": 0.7032, "step": 850 }, { "epoch": 0.1774227018053538, "grad_norm": 2.7235521219589343, "learning_rate": 9.818600990819193e-06, "loss": 0.7162, "step": 855 }, { "epoch": 0.17846026146503424, "grad_norm": 3.0405865223857247, "learning_rate": 9.813735774357942e-06, "loss": 0.7286, "step": 860 }, { "epoch": 0.17949782112471468, "grad_norm": 2.6580664453159786, "learning_rate": 9.80880741235645e-06, "loss": 0.7153, "step": 865 }, { "epoch": 0.1805353807843951, "grad_norm": 2.6234079932920764, "learning_rate": 9.803815969463898e-06, "loss": 0.7267, "step": 870 }, { "epoch": 0.18157294044407554, "grad_norm": 2.7632489453595457, "learning_rate": 9.798761511156948e-06, "loss": 0.7198, "step": 875 }, { "epoch": 0.18261050010375596, "grad_norm": 2.607364882820461, "learning_rate": 9.79364410373889e-06, "loss": 0.7197, "step": 880 }, { "epoch": 0.1836480597634364, "grad_norm": 2.7165078647946856, "learning_rate": 9.78846381433876e-06, "loss": 0.7311, "step": 885 }, { "epoch": 0.18468561942311684, "grad_norm": 2.510454703638443, "learning_rate": 9.783220710910471e-06, "loss": 0.7318, "step": 890 }, { "epoch": 0.18572317908279726, "grad_norm": 2.638416867941834, "learning_rate": 9.777914862231912e-06, "loss": 0.73, "step": 895 }, { "epoch": 0.1867607387424777, "grad_norm": 2.7242863471244148, "learning_rate": 9.772546337904054e-06, "loss": 0.7191, "step": 900 }, { "epoch": 0.18779829840215811, "grad_norm": 2.6641307804734806, "learning_rate": 9.767115208350035e-06, "loss": 0.7207, "step": 905 }, { "epoch": 0.18883585806183856, "grad_norm": 2.4803694238796905, "learning_rate": 9.761621544814232e-06, "loss": 0.7366, "step": 910 }, { "epoch": 0.189873417721519, "grad_norm": 2.54201756384705, "learning_rate": 9.756065419361329e-06, "loss": 0.6971, "step": 915 }, { "epoch": 0.19091097738119941, "grad_norm": 2.663357207599627, "learning_rate": 9.750446904875374e-06, "loss": 0.7093, "step": 920 }, { "epoch": 0.19194853704087986, "grad_norm": 2.665290508123204, "learning_rate": 9.744766075058817e-06, "loss": 0.7092, "step": 925 }, { "epoch": 0.19298609670056027, "grad_norm": 2.561102249391226, "learning_rate": 9.739023004431553e-06, "loss": 0.7022, "step": 930 }, { "epoch": 0.19402365636024071, "grad_norm": 2.8532699081444344, "learning_rate": 9.733217768329934e-06, "loss": 0.7125, "step": 935 }, { "epoch": 0.19506121601992116, "grad_norm": 2.658165458053829, "learning_rate": 9.727350442905786e-06, "loss": 0.713, "step": 940 }, { "epoch": 0.19609877567960157, "grad_norm": 2.8346689183428233, "learning_rate": 9.721421105125409e-06, "loss": 0.7111, "step": 945 }, { "epoch": 0.19713633533928202, "grad_norm": 2.691363256166988, "learning_rate": 9.715429832768566e-06, "loss": 0.6997, "step": 950 }, { "epoch": 0.19817389499896243, "grad_norm": 2.670459556620767, "learning_rate": 9.709376704427471e-06, "loss": 0.7002, "step": 955 }, { "epoch": 0.19921145465864287, "grad_norm": 2.8298140051123624, "learning_rate": 9.703261799505743e-06, "loss": 0.6919, "step": 960 }, { "epoch": 0.20024901431832332, "grad_norm": 2.690706447862383, "learning_rate": 9.697085198217378e-06, "loss": 0.6951, "step": 965 }, { "epoch": 0.20128657397800373, "grad_norm": 2.6190530004747536, "learning_rate": 9.690846981585689e-06, "loss": 0.7088, "step": 970 }, { "epoch": 0.20232413363768417, "grad_norm": 2.896807508207353, "learning_rate": 9.684547231442248e-06, "loss": 0.7036, "step": 975 }, { "epoch": 0.2033616932973646, "grad_norm": 2.653583975726927, "learning_rate": 9.678186030425806e-06, "loss": 0.7014, "step": 980 }, { "epoch": 0.20439925295704503, "grad_norm": 2.7468040879725857, "learning_rate": 9.67176346198122e-06, "loss": 0.6887, "step": 985 }, { "epoch": 0.20543681261672547, "grad_norm": 2.6425850355422607, "learning_rate": 9.665279610358347e-06, "loss": 0.6912, "step": 990 }, { "epoch": 0.2064743722764059, "grad_norm": 2.607466586766701, "learning_rate": 9.658734560610942e-06, "loss": 0.6986, "step": 995 }, { "epoch": 0.20751193193608633, "grad_norm": 2.5797036833444684, "learning_rate": 9.652128398595548e-06, "loss": 0.6893, "step": 1000 }, { "epoch": 0.20854949159576675, "grad_norm": 2.627466198960634, "learning_rate": 9.645461210970363e-06, "loss": 0.6939, "step": 1005 }, { "epoch": 0.2095870512554472, "grad_norm": 2.7514054708664486, "learning_rate": 9.638733085194105e-06, "loss": 0.6879, "step": 1010 }, { "epoch": 0.21062461091512763, "grad_norm": 2.762232274342595, "learning_rate": 9.631944109524867e-06, "loss": 0.7206, "step": 1015 }, { "epoch": 0.21166217057480805, "grad_norm": 2.573942783710098, "learning_rate": 9.625094373018957e-06, "loss": 0.672, "step": 1020 }, { "epoch": 0.2126997302344885, "grad_norm": 2.9862062253264154, "learning_rate": 9.61818396552973e-06, "loss": 0.7027, "step": 1025 }, { "epoch": 0.2137372898941689, "grad_norm": 2.604158278569754, "learning_rate": 9.61121297770641e-06, "loss": 0.6832, "step": 1030 }, { "epoch": 0.21477484955384935, "grad_norm": 2.847316696943195, "learning_rate": 9.604181500992904e-06, "loss": 0.6799, "step": 1035 }, { "epoch": 0.2158124092135298, "grad_norm": 2.779271975810847, "learning_rate": 9.597089627626594e-06, "loss": 0.6804, "step": 1040 }, { "epoch": 0.2168499688732102, "grad_norm": 2.5638323571099546, "learning_rate": 9.589937450637134e-06, "loss": 0.6837, "step": 1045 }, { "epoch": 0.21788752853289065, "grad_norm": 2.7020801988998384, "learning_rate": 9.58272506384523e-06, "loss": 0.684, "step": 1050 }, { "epoch": 0.21892508819257106, "grad_norm": 2.549003888633134, "learning_rate": 9.5754525618614e-06, "loss": 0.6871, "step": 1055 }, { "epoch": 0.2199626478522515, "grad_norm": 2.8154214183624284, "learning_rate": 9.568120040084752e-06, "loss": 0.6652, "step": 1060 }, { "epoch": 0.22100020751193195, "grad_norm": 2.6474463880567884, "learning_rate": 9.56072759470171e-06, "loss": 0.6896, "step": 1065 }, { "epoch": 0.22203776717161236, "grad_norm": 2.623449606437885, "learning_rate": 9.553275322684769e-06, "loss": 0.6731, "step": 1070 }, { "epoch": 0.2230753268312928, "grad_norm": 2.5852337807326324, "learning_rate": 9.545763321791213e-06, "loss": 0.6914, "step": 1075 }, { "epoch": 0.22411288649097322, "grad_norm": 2.5768442254313424, "learning_rate": 9.538191690561838e-06, "loss": 0.6827, "step": 1080 }, { "epoch": 0.22515044615065366, "grad_norm": 2.616849381186977, "learning_rate": 9.530560528319657e-06, "loss": 0.6861, "step": 1085 }, { "epoch": 0.2261880058103341, "grad_norm": 2.542283706279765, "learning_rate": 9.522869935168601e-06, "loss": 0.6673, "step": 1090 }, { "epoch": 0.22722556547001452, "grad_norm": 2.6428940149632916, "learning_rate": 9.515120011992199e-06, "loss": 0.6595, "step": 1095 }, { "epoch": 0.22826312512969496, "grad_norm": 2.6516305122608324, "learning_rate": 9.507310860452258e-06, "loss": 0.6508, "step": 1100 }, { "epoch": 0.22930068478937538, "grad_norm": 2.618199521456939, "learning_rate": 9.499442582987535e-06, "loss": 0.672, "step": 1105 }, { "epoch": 0.23033824444905582, "grad_norm": 2.580520201322647, "learning_rate": 9.491515282812383e-06, "loss": 0.6798, "step": 1110 }, { "epoch": 0.23137580410873626, "grad_norm": 2.5578402107468285, "learning_rate": 9.483529063915405e-06, "loss": 0.6575, "step": 1115 }, { "epoch": 0.23241336376841668, "grad_norm": 2.5898669244363743, "learning_rate": 9.475484031058081e-06, "loss": 0.6686, "step": 1120 }, { "epoch": 0.23345092342809712, "grad_norm": 2.581092169980836, "learning_rate": 9.46738028977341e-06, "loss": 0.676, "step": 1125 }, { "epoch": 0.23448848308777756, "grad_norm": 2.564857684910852, "learning_rate": 9.459217946364508e-06, "loss": 0.6603, "step": 1130 }, { "epoch": 0.23552604274745798, "grad_norm": 2.6352764131909394, "learning_rate": 9.450997107903222e-06, "loss": 0.673, "step": 1135 }, { "epoch": 0.23656360240713842, "grad_norm": 2.7623386766168494, "learning_rate": 9.442717882228727e-06, "loss": 0.6713, "step": 1140 }, { "epoch": 0.23760116206681883, "grad_norm": 2.646657632225148, "learning_rate": 9.434380377946104e-06, "loss": 0.6714, "step": 1145 }, { "epoch": 0.23863872172649928, "grad_norm": 2.547166874529112, "learning_rate": 9.425984704424927e-06, "loss": 0.6664, "step": 1150 }, { "epoch": 0.23967628138617972, "grad_norm": 2.5806952200679225, "learning_rate": 9.417530971797812e-06, "loss": 0.6733, "step": 1155 }, { "epoch": 0.24071384104586013, "grad_norm": 2.574640603606341, "learning_rate": 9.409019290958993e-06, "loss": 0.6737, "step": 1160 }, { "epoch": 0.24175140070554058, "grad_norm": 2.5060886593359113, "learning_rate": 9.400449773562849e-06, "loss": 0.6762, "step": 1165 }, { "epoch": 0.242788960365221, "grad_norm": 2.716564112603813, "learning_rate": 9.391822532022445e-06, "loss": 0.6551, "step": 1170 }, { "epoch": 0.24382652002490143, "grad_norm": 2.7171895315845216, "learning_rate": 9.383137679508063e-06, "loss": 0.6561, "step": 1175 }, { "epoch": 0.24486407968458188, "grad_norm": 2.58866525923876, "learning_rate": 9.374395329945714e-06, "loss": 0.6586, "step": 1180 }, { "epoch": 0.2459016393442623, "grad_norm": 2.641911339118785, "learning_rate": 9.365595598015635e-06, "loss": 0.6879, "step": 1185 }, { "epoch": 0.24693919900394273, "grad_norm": 2.6209891495095716, "learning_rate": 9.356738599150805e-06, "loss": 0.6562, "step": 1190 }, { "epoch": 0.24797675866362315, "grad_norm": 2.537931458295151, "learning_rate": 9.347824449535406e-06, "loss": 0.671, "step": 1195 }, { "epoch": 0.2490143183233036, "grad_norm": 2.6373525596001777, "learning_rate": 9.338853266103318e-06, "loss": 0.6469, "step": 1200 }, { "epoch": 0.250051877982984, "grad_norm": 2.49609290349947, "learning_rate": 9.329825166536578e-06, "loss": 0.6494, "step": 1205 }, { "epoch": 0.25108943764266445, "grad_norm": 2.597719369713314, "learning_rate": 9.32074026926383e-06, "loss": 0.6644, "step": 1210 }, { "epoch": 0.2521269973023449, "grad_norm": 2.622902413291871, "learning_rate": 9.31159869345879e-06, "loss": 0.6277, "step": 1215 }, { "epoch": 0.25316455696202533, "grad_norm": 2.4773937545219615, "learning_rate": 9.302400559038658e-06, "loss": 0.6435, "step": 1220 }, { "epoch": 0.2542021166217057, "grad_norm": 2.417435834815959, "learning_rate": 9.293145986662567e-06, "loss": 0.6551, "step": 1225 }, { "epoch": 0.25523967628138616, "grad_norm": 2.7620863467259134, "learning_rate": 9.283835097729984e-06, "loss": 0.6524, "step": 1230 }, { "epoch": 0.2562772359410666, "grad_norm": 2.661471370041919, "learning_rate": 9.27446801437913e-06, "loss": 0.6564, "step": 1235 }, { "epoch": 0.25731479560074705, "grad_norm": 2.6022353359733996, "learning_rate": 9.265044859485369e-06, "loss": 0.6504, "step": 1240 }, { "epoch": 0.2583523552604275, "grad_norm": 2.67544454301597, "learning_rate": 9.2555657566596e-06, "loss": 0.6379, "step": 1245 }, { "epoch": 0.25938991492010793, "grad_norm": 2.7294894592664742, "learning_rate": 9.246030830246633e-06, "loss": 0.653, "step": 1250 }, { "epoch": 0.2604274745797883, "grad_norm": 2.4934502710929642, "learning_rate": 9.236440205323564e-06, "loss": 0.6504, "step": 1255 }, { "epoch": 0.26146503423946876, "grad_norm": 2.732333826367612, "learning_rate": 9.226794007698128e-06, "loss": 0.6417, "step": 1260 }, { "epoch": 0.2625025938991492, "grad_norm": 2.6288344277507023, "learning_rate": 9.217092363907047e-06, "loss": 0.6193, "step": 1265 }, { "epoch": 0.26354015355882965, "grad_norm": 2.7169648920990483, "learning_rate": 9.207335401214379e-06, "loss": 0.6536, "step": 1270 }, { "epoch": 0.2645777132185101, "grad_norm": 2.520924205856965, "learning_rate": 9.197523247609839e-06, "loss": 0.6375, "step": 1275 }, { "epoch": 0.2656152728781905, "grad_norm": 2.515144443600501, "learning_rate": 9.187656031807129e-06, "loss": 0.6442, "step": 1280 }, { "epoch": 0.2666528325378709, "grad_norm": 2.5704491825236473, "learning_rate": 9.177733883242244e-06, "loss": 0.6586, "step": 1285 }, { "epoch": 0.26769039219755136, "grad_norm": 2.8282989819254634, "learning_rate": 9.167756932071769e-06, "loss": 0.6609, "step": 1290 }, { "epoch": 0.2687279518572318, "grad_norm": 2.485389299186558, "learning_rate": 9.157725309171183e-06, "loss": 0.6459, "step": 1295 }, { "epoch": 0.26976551151691225, "grad_norm": 2.7788274917257327, "learning_rate": 9.147639146133142e-06, "loss": 0.6433, "step": 1300 }, { "epoch": 0.27080307117659264, "grad_norm": 2.521611630972388, "learning_rate": 9.137498575265736e-06, "loss": 0.6271, "step": 1305 }, { "epoch": 0.2718406308362731, "grad_norm": 2.67661014478125, "learning_rate": 9.12730372959077e-06, "loss": 0.6551, "step": 1310 }, { "epoch": 0.2728781904959535, "grad_norm": 2.678821798528819, "learning_rate": 9.11705474284202e-06, "loss": 0.6275, "step": 1315 }, { "epoch": 0.27391575015563396, "grad_norm": 2.5914261808444574, "learning_rate": 9.106751749463463e-06, "loss": 0.6401, "step": 1320 }, { "epoch": 0.2749533098153144, "grad_norm": 2.5454105011810664, "learning_rate": 9.09639488460753e-06, "loss": 0.6275, "step": 1325 }, { "epoch": 0.2759908694749948, "grad_norm": 2.746226513585246, "learning_rate": 9.08598428413333e-06, "loss": 0.6199, "step": 1330 }, { "epoch": 0.27702842913467524, "grad_norm": 2.4978350247576886, "learning_rate": 9.075520084604849e-06, "loss": 0.6081, "step": 1335 }, { "epoch": 0.2780659887943557, "grad_norm": 2.565415348716322, "learning_rate": 9.065002423289189e-06, "loss": 0.6117, "step": 1340 }, { "epoch": 0.2791035484540361, "grad_norm": 2.5742091961939892, "learning_rate": 9.054431438154745e-06, "loss": 0.613, "step": 1345 }, { "epoch": 0.28014110811371656, "grad_norm": 2.5626823209269642, "learning_rate": 9.043807267869403e-06, "loss": 0.624, "step": 1350 }, { "epoch": 0.28117866777339695, "grad_norm": 2.4910808560943942, "learning_rate": 9.033130051798725e-06, "loss": 0.6314, "step": 1355 }, { "epoch": 0.2822162274330774, "grad_norm": 2.4573499982125524, "learning_rate": 9.022399930004106e-06, "loss": 0.625, "step": 1360 }, { "epoch": 0.28325378709275784, "grad_norm": 2.531960067205625, "learning_rate": 9.011617043240956e-06, "loss": 0.6261, "step": 1365 }, { "epoch": 0.2842913467524383, "grad_norm": 2.6336708253712917, "learning_rate": 9.000781532956844e-06, "loss": 0.6057, "step": 1370 }, { "epoch": 0.2853289064121187, "grad_norm": 2.5416291249522063, "learning_rate": 8.989893541289636e-06, "loss": 0.6114, "step": 1375 }, { "epoch": 0.2863664660717991, "grad_norm": 2.6016836625846227, "learning_rate": 8.978953211065645e-06, "loss": 0.6308, "step": 1380 }, { "epoch": 0.28740402573147955, "grad_norm": 2.670421258864357, "learning_rate": 8.96796068579774e-06, "loss": 0.6373, "step": 1385 }, { "epoch": 0.28844158539116, "grad_norm": 2.6256018056635195, "learning_rate": 8.956916109683488e-06, "loss": 0.6136, "step": 1390 }, { "epoch": 0.28947914505084044, "grad_norm": 2.665986383822575, "learning_rate": 8.945819627603235e-06, "loss": 0.6294, "step": 1395 }, { "epoch": 0.2905167047105209, "grad_norm": 2.620184006600422, "learning_rate": 8.934671385118224e-06, "loss": 0.6154, "step": 1400 }, { "epoch": 0.29155426437020127, "grad_norm": 2.5773178200606797, "learning_rate": 8.923471528468675e-06, "loss": 0.6263, "step": 1405 }, { "epoch": 0.2925918240298817, "grad_norm": 2.528035051025028, "learning_rate": 8.912220204571878e-06, "loss": 0.6139, "step": 1410 }, { "epoch": 0.29362938368956215, "grad_norm": 2.6830704473334053, "learning_rate": 8.900917561020255e-06, "loss": 0.6256, "step": 1415 }, { "epoch": 0.2946669433492426, "grad_norm": 2.5701558374316256, "learning_rate": 8.889563746079428e-06, "loss": 0.6163, "step": 1420 }, { "epoch": 0.29570450300892304, "grad_norm": 2.542727546558585, "learning_rate": 8.878158908686276e-06, "loss": 0.6214, "step": 1425 }, { "epoch": 0.2967420626686034, "grad_norm": 2.6289522750495684, "learning_rate": 8.86670319844698e-06, "loss": 0.6184, "step": 1430 }, { "epoch": 0.29777962232828387, "grad_norm": 2.605979434187302, "learning_rate": 8.855196765635055e-06, "loss": 0.6148, "step": 1435 }, { "epoch": 0.2988171819879643, "grad_norm": 2.5378051705763136, "learning_rate": 8.843639761189392e-06, "loss": 0.6309, "step": 1440 }, { "epoch": 0.29985474164764475, "grad_norm": 3.122542869019648, "learning_rate": 8.83203233671226e-06, "loss": 0.6148, "step": 1445 }, { "epoch": 0.3008923013073252, "grad_norm": 2.7847080279432355, "learning_rate": 8.820374644467334e-06, "loss": 0.6149, "step": 1450 }, { "epoch": 0.3019298609670056, "grad_norm": 2.7571745781556176, "learning_rate": 8.808666837377688e-06, "loss": 0.6043, "step": 1455 }, { "epoch": 0.302967420626686, "grad_norm": 2.6729505157429645, "learning_rate": 8.796909069023793e-06, "loss": 0.6091, "step": 1460 }, { "epoch": 0.30400498028636647, "grad_norm": 2.5670622136089123, "learning_rate": 8.7851014936415e-06, "loss": 0.5973, "step": 1465 }, { "epoch": 0.3050425399460469, "grad_norm": 2.5809462309222586, "learning_rate": 8.77324426612002e-06, "loss": 0.601, "step": 1470 }, { "epoch": 0.30608009960572735, "grad_norm": 2.6928125038468695, "learning_rate": 8.761337541999884e-06, "loss": 0.603, "step": 1475 }, { "epoch": 0.30711765926540774, "grad_norm": 2.5859004844354887, "learning_rate": 8.749381477470915e-06, "loss": 0.5902, "step": 1480 }, { "epoch": 0.3081552189250882, "grad_norm": 2.706857250295533, "learning_rate": 8.73737622937017e-06, "loss": 0.6068, "step": 1485 }, { "epoch": 0.3091927785847686, "grad_norm": 2.5825402406347995, "learning_rate": 8.725321955179886e-06, "loss": 0.5943, "step": 1490 }, { "epoch": 0.31023033824444907, "grad_norm": 2.564859545646307, "learning_rate": 8.713218813025412e-06, "loss": 0.6166, "step": 1495 }, { "epoch": 0.3112678979041295, "grad_norm": 2.4788537811048514, "learning_rate": 8.70106696167314e-06, "loss": 0.6107, "step": 1500 }, { "epoch": 0.3123054575638099, "grad_norm": 2.5086302932261857, "learning_rate": 8.688866560528414e-06, "loss": 0.5953, "step": 1505 }, { "epoch": 0.31334301722349034, "grad_norm": 2.641157195225993, "learning_rate": 8.676617769633449e-06, "loss": 0.5942, "step": 1510 }, { "epoch": 0.3143805768831708, "grad_norm": 2.5041601695953952, "learning_rate": 8.66432074966522e-06, "loss": 0.614, "step": 1515 }, { "epoch": 0.3154181365428512, "grad_norm": 2.6463153946217988, "learning_rate": 8.651975661933368e-06, "loss": 0.6046, "step": 1520 }, { "epoch": 0.31645569620253167, "grad_norm": 2.5329735196813186, "learning_rate": 8.639582668378068e-06, "loss": 0.5939, "step": 1525 }, { "epoch": 0.31749325586221205, "grad_norm": 2.626444758743183, "learning_rate": 8.627141931567918e-06, "loss": 0.5955, "step": 1530 }, { "epoch": 0.3185308155218925, "grad_norm": 2.4901380695636672, "learning_rate": 8.614653614697804e-06, "loss": 0.5887, "step": 1535 }, { "epoch": 0.31956837518157294, "grad_norm": 2.562038119980849, "learning_rate": 8.602117881586748e-06, "loss": 0.5887, "step": 1540 }, { "epoch": 0.3206059348412534, "grad_norm": 2.705668712762943, "learning_rate": 8.589534896675782e-06, "loss": 0.6155, "step": 1545 }, { "epoch": 0.3216434945009338, "grad_norm": 2.4700710695591144, "learning_rate": 8.576904825025763e-06, "loss": 0.5805, "step": 1550 }, { "epoch": 0.3226810541606142, "grad_norm": 2.569997778071618, "learning_rate": 8.56422783231523e-06, "loss": 0.5861, "step": 1555 }, { "epoch": 0.32371861382029465, "grad_norm": 2.6800463634463454, "learning_rate": 8.551504084838217e-06, "loss": 0.5888, "step": 1560 }, { "epoch": 0.3247561734799751, "grad_norm": 2.614913869620652, "learning_rate": 8.538733749502084e-06, "loss": 0.5916, "step": 1565 }, { "epoch": 0.32579373313965554, "grad_norm": 2.5392913094762015, "learning_rate": 8.525916993825312e-06, "loss": 0.5845, "step": 1570 }, { "epoch": 0.326831292799336, "grad_norm": 2.5970061681752714, "learning_rate": 8.51305398593532e-06, "loss": 0.5885, "step": 1575 }, { "epoch": 0.32786885245901637, "grad_norm": 2.892436571898512, "learning_rate": 8.50014489456625e-06, "loss": 0.5735, "step": 1580 }, { "epoch": 0.3289064121186968, "grad_norm": 2.6138781972835874, "learning_rate": 8.487189889056758e-06, "loss": 0.5559, "step": 1585 }, { "epoch": 0.32994397177837725, "grad_norm": 2.623576409499823, "learning_rate": 8.474189139347795e-06, "loss": 0.5846, "step": 1590 }, { "epoch": 0.3309815314380577, "grad_norm": 2.468823382349316, "learning_rate": 8.461142815980368e-06, "loss": 0.5986, "step": 1595 }, { "epoch": 0.33201909109773814, "grad_norm": 2.672237640145444, "learning_rate": 8.448051090093315e-06, "loss": 0.591, "step": 1600 }, { "epoch": 0.3330566507574185, "grad_norm": 2.4728672068736937, "learning_rate": 8.434914133421053e-06, "loss": 0.5845, "step": 1605 }, { "epoch": 0.33409421041709897, "grad_norm": 2.5895072383457087, "learning_rate": 8.421732118291326e-06, "loss": 0.5782, "step": 1610 }, { "epoch": 0.3351317700767794, "grad_norm": 2.5040782851199137, "learning_rate": 8.408505217622942e-06, "loss": 0.5815, "step": 1615 }, { "epoch": 0.33616932973645985, "grad_norm": 3.6988626884139846, "learning_rate": 8.395233604923515e-06, "loss": 0.5843, "step": 1620 }, { "epoch": 0.3372068893961403, "grad_norm": 2.5828684207028547, "learning_rate": 8.381917454287175e-06, "loss": 0.5793, "step": 1625 }, { "epoch": 0.3382444490558207, "grad_norm": 2.638686184144453, "learning_rate": 8.368556940392295e-06, "loss": 0.5841, "step": 1630 }, { "epoch": 0.3392820087155011, "grad_norm": 2.650538574655116, "learning_rate": 8.355152238499192e-06, "loss": 0.5875, "step": 1635 }, { "epoch": 0.34031956837518157, "grad_norm": 2.486723068143089, "learning_rate": 8.341703524447834e-06, "loss": 0.5752, "step": 1640 }, { "epoch": 0.341357128034862, "grad_norm": 2.542548425745693, "learning_rate": 8.328210974655534e-06, "loss": 0.582, "step": 1645 }, { "epoch": 0.34239468769454245, "grad_norm": 2.5238474016255346, "learning_rate": 8.314674766114625e-06, "loss": 0.5886, "step": 1650 }, { "epoch": 0.34343224735422284, "grad_norm": 2.6663652283930164, "learning_rate": 8.301095076390151e-06, "loss": 0.5703, "step": 1655 }, { "epoch": 0.3444698070139033, "grad_norm": 2.5916048021671765, "learning_rate": 8.287472083617534e-06, "loss": 0.5578, "step": 1660 }, { "epoch": 0.3455073666735837, "grad_norm": 2.64819187278271, "learning_rate": 8.273805966500233e-06, "loss": 0.566, "step": 1665 }, { "epoch": 0.34654492633326417, "grad_norm": 2.7355426760241115, "learning_rate": 8.260096904307404e-06, "loss": 0.5724, "step": 1670 }, { "epoch": 0.3475824859929446, "grad_norm": 2.5288788248400422, "learning_rate": 8.246345076871548e-06, "loss": 0.5852, "step": 1675 }, { "epoch": 0.348620045652625, "grad_norm": 2.4333668936065727, "learning_rate": 8.232550664586145e-06, "loss": 0.562, "step": 1680 }, { "epoch": 0.34965760531230544, "grad_norm": 2.666095931772747, "learning_rate": 8.218713848403306e-06, "loss": 0.5761, "step": 1685 }, { "epoch": 0.3506951649719859, "grad_norm": 2.585591858891525, "learning_rate": 8.204834809831377e-06, "loss": 0.579, "step": 1690 }, { "epoch": 0.3517327246316663, "grad_norm": 2.5419925763820457, "learning_rate": 8.190913730932567e-06, "loss": 0.5792, "step": 1695 }, { "epoch": 0.35277028429134677, "grad_norm": 2.5691748657197575, "learning_rate": 8.176950794320572e-06, "loss": 0.5647, "step": 1700 }, { "epoch": 0.35380784395102716, "grad_norm": 2.5977820134449603, "learning_rate": 8.16294618315816e-06, "loss": 0.5708, "step": 1705 }, { "epoch": 0.3548454036107076, "grad_norm": 2.423522377595987, "learning_rate": 8.148900081154773e-06, "loss": 0.5666, "step": 1710 }, { "epoch": 0.35588296327038804, "grad_norm": 2.569485264156882, "learning_rate": 8.134812672564131e-06, "loss": 0.5504, "step": 1715 }, { "epoch": 0.3569205229300685, "grad_norm": 2.758198111046606, "learning_rate": 8.1206841421818e-06, "loss": 0.5691, "step": 1720 }, { "epoch": 0.3579580825897489, "grad_norm": 2.5786351112814545, "learning_rate": 8.10651467534277e-06, "loss": 0.57, "step": 1725 }, { "epoch": 0.35899564224942937, "grad_norm": 2.4962078269510015, "learning_rate": 8.092304457919028e-06, "loss": 0.557, "step": 1730 }, { "epoch": 0.36003320190910976, "grad_norm": 2.5552381714765384, "learning_rate": 8.078053676317124e-06, "loss": 0.5673, "step": 1735 }, { "epoch": 0.3610707615687902, "grad_norm": 2.611892754049072, "learning_rate": 8.06376251747571e-06, "loss": 0.5535, "step": 1740 }, { "epoch": 0.36210832122847064, "grad_norm": 2.4981862280677545, "learning_rate": 8.049431168863107e-06, "loss": 0.5543, "step": 1745 }, { "epoch": 0.3631458808881511, "grad_norm": 2.590859008799774, "learning_rate": 8.035059818474833e-06, "loss": 0.5688, "step": 1750 }, { "epoch": 0.3641834405478315, "grad_norm": 2.7049066460444657, "learning_rate": 8.02064865483114e-06, "loss": 0.5666, "step": 1755 }, { "epoch": 0.3652210002075119, "grad_norm": 2.50555658991844, "learning_rate": 8.00619786697454e-06, "loss": 0.553, "step": 1760 }, { "epoch": 0.36625855986719236, "grad_norm": 2.4491820476275805, "learning_rate": 7.991707644467335e-06, "loss": 0.5635, "step": 1765 }, { "epoch": 0.3672961195268728, "grad_norm": 2.560789498381096, "learning_rate": 7.97717817738911e-06, "loss": 0.5408, "step": 1770 }, { "epoch": 0.36833367918655324, "grad_norm": 2.6052621436325416, "learning_rate": 7.962609656334262e-06, "loss": 0.5488, "step": 1775 }, { "epoch": 0.3693712388462337, "grad_norm": 2.5278436185793582, "learning_rate": 7.94800227240948e-06, "loss": 0.5573, "step": 1780 }, { "epoch": 0.37040879850591407, "grad_norm": 2.5168973470980607, "learning_rate": 7.933356217231261e-06, "loss": 0.5358, "step": 1785 }, { "epoch": 0.3714463581655945, "grad_norm": 2.5427476959285475, "learning_rate": 7.918671682923371e-06, "loss": 0.557, "step": 1790 }, { "epoch": 0.37248391782527496, "grad_norm": 2.5597051499240315, "learning_rate": 7.90394886211434e-06, "loss": 0.5443, "step": 1795 }, { "epoch": 0.3735214774849554, "grad_norm": 2.6134854223700033, "learning_rate": 7.889187947934939e-06, "loss": 0.5643, "step": 1800 }, { "epoch": 0.37455903714463584, "grad_norm": 2.5638490649548507, "learning_rate": 7.874389134015627e-06, "loss": 0.5515, "step": 1805 }, { "epoch": 0.37559659680431623, "grad_norm": 2.638245607620569, "learning_rate": 7.859552614484035e-06, "loss": 0.5512, "step": 1810 }, { "epoch": 0.37663415646399667, "grad_norm": 2.5439284853350683, "learning_rate": 7.844678583962403e-06, "loss": 0.5357, "step": 1815 }, { "epoch": 0.3776717161236771, "grad_norm": 2.7382120697501264, "learning_rate": 7.829767237565027e-06, "loss": 0.5499, "step": 1820 }, { "epoch": 0.37870927578335756, "grad_norm": 2.693790343217592, "learning_rate": 7.814818770895718e-06, "loss": 0.5447, "step": 1825 }, { "epoch": 0.379746835443038, "grad_norm": 2.5002163516357685, "learning_rate": 7.79983338004521e-06, "loss": 0.5548, "step": 1830 }, { "epoch": 0.3807843951027184, "grad_norm": 2.5140469437451514, "learning_rate": 7.784811261588605e-06, "loss": 0.5396, "step": 1835 }, { "epoch": 0.38182195476239883, "grad_norm": 2.3632709986503384, "learning_rate": 7.769752612582793e-06, "loss": 0.5455, "step": 1840 }, { "epoch": 0.3828595144220793, "grad_norm": 2.6174115493874415, "learning_rate": 7.754657630563855e-06, "loss": 0.5501, "step": 1845 }, { "epoch": 0.3838970740817597, "grad_norm": 2.662869705594198, "learning_rate": 7.739526513544492e-06, "loss": 0.5458, "step": 1850 }, { "epoch": 0.38493463374144016, "grad_norm": 2.595342018363226, "learning_rate": 7.724359460011406e-06, "loss": 0.5484, "step": 1855 }, { "epoch": 0.38597219340112054, "grad_norm": 2.574988659923898, "learning_rate": 7.709156668922715e-06, "loss": 0.5465, "step": 1860 }, { "epoch": 0.387009753060801, "grad_norm": 2.696253501363756, "learning_rate": 7.693918339705327e-06, "loss": 0.5416, "step": 1865 }, { "epoch": 0.38804731272048143, "grad_norm": 2.571358317228633, "learning_rate": 7.678644672252334e-06, "loss": 0.5432, "step": 1870 }, { "epoch": 0.3890848723801619, "grad_norm": 2.6773383342412007, "learning_rate": 7.663335866920389e-06, "loss": 0.5435, "step": 1875 }, { "epoch": 0.3901224320398423, "grad_norm": 2.605715400204335, "learning_rate": 7.647992124527076e-06, "loss": 0.5394, "step": 1880 }, { "epoch": 0.3911599916995227, "grad_norm": 2.542289996117672, "learning_rate": 7.632613646348273e-06, "loss": 0.5365, "step": 1885 }, { "epoch": 0.39219755135920314, "grad_norm": 2.640322094229756, "learning_rate": 7.617200634115516e-06, "loss": 0.5473, "step": 1890 }, { "epoch": 0.3932351110188836, "grad_norm": 2.5708221170868355, "learning_rate": 7.601753290013353e-06, "loss": 0.5209, "step": 1895 }, { "epoch": 0.39427267067856403, "grad_norm": 2.525915351063816, "learning_rate": 7.586271816676687e-06, "loss": 0.5288, "step": 1900 }, { "epoch": 0.3953102303382445, "grad_norm": 2.5364307727837234, "learning_rate": 7.570756417188123e-06, "loss": 0.5429, "step": 1905 }, { "epoch": 0.39634778999792486, "grad_norm": 2.559190451038623, "learning_rate": 7.555207295075303e-06, "loss": 0.5128, "step": 1910 }, { "epoch": 0.3973853496576053, "grad_norm": 2.7079589585625095, "learning_rate": 7.539624654308231e-06, "loss": 0.5333, "step": 1915 }, { "epoch": 0.39842290931728574, "grad_norm": 2.4387259072975267, "learning_rate": 7.5240086992966045e-06, "loss": 0.5334, "step": 1920 }, { "epoch": 0.3994604689769662, "grad_norm": 2.5588774252471818, "learning_rate": 7.508359634887128e-06, "loss": 0.5429, "step": 1925 }, { "epoch": 0.40049802863664663, "grad_norm": 2.50003757681576, "learning_rate": 7.4926776663608305e-06, "loss": 0.5353, "step": 1930 }, { "epoch": 0.401535588296327, "grad_norm": 2.471663404094866, "learning_rate": 7.476962999430368e-06, "loss": 0.5373, "step": 1935 }, { "epoch": 0.40257314795600746, "grad_norm": 2.4541757324292215, "learning_rate": 7.461215840237329e-06, "loss": 0.5278, "step": 1940 }, { "epoch": 0.4036107076156879, "grad_norm": 2.5133909898638738, "learning_rate": 7.4454363953495255e-06, "loss": 0.5224, "step": 1945 }, { "epoch": 0.40464826727536835, "grad_norm": 2.456759039414412, "learning_rate": 7.429624871758289e-06, "loss": 0.5274, "step": 1950 }, { "epoch": 0.4056858269350488, "grad_norm": 2.6254399143011025, "learning_rate": 7.41378147687575e-06, "loss": 0.539, "step": 1955 }, { "epoch": 0.4067233865947292, "grad_norm": 2.77782256038311, "learning_rate": 7.397906418532124e-06, "loss": 0.5255, "step": 1960 }, { "epoch": 0.4077609462544096, "grad_norm": 2.401784319404263, "learning_rate": 7.381999904972974e-06, "loss": 0.5373, "step": 1965 }, { "epoch": 0.40879850591409006, "grad_norm": 2.4340739184784996, "learning_rate": 7.366062144856494e-06, "loss": 0.5292, "step": 1970 }, { "epoch": 0.4098360655737705, "grad_norm": 2.5139056739660104, "learning_rate": 7.350093347250754e-06, "loss": 0.524, "step": 1975 }, { "epoch": 0.41087362523345095, "grad_norm": 2.513276255127495, "learning_rate": 7.334093721630976e-06, "loss": 0.5231, "step": 1980 }, { "epoch": 0.41191118489313133, "grad_norm": 2.6310824706160636, "learning_rate": 7.318063477876775e-06, "loss": 0.5233, "step": 1985 }, { "epoch": 0.4129487445528118, "grad_norm": 2.491475901603011, "learning_rate": 7.302002826269401e-06, "loss": 0.5341, "step": 1990 }, { "epoch": 0.4139863042124922, "grad_norm": 2.506660610799657, "learning_rate": 7.285911977488995e-06, "loss": 0.5182, "step": 1995 }, { "epoch": 0.41502386387217266, "grad_norm": 2.4794467040525507, "learning_rate": 7.269791142611819e-06, "loss": 0.5305, "step": 2000 }, { "epoch": 0.4160614235318531, "grad_norm": 2.4409844087069197, "learning_rate": 7.253640533107482e-06, "loss": 0.5097, "step": 2005 }, { "epoch": 0.4170989831915335, "grad_norm": 2.5831589194501334, "learning_rate": 7.23746036083617e-06, "loss": 0.509, "step": 2010 }, { "epoch": 0.41813654285121393, "grad_norm": 2.5848519997923134, "learning_rate": 7.221250838045866e-06, "loss": 0.5212, "step": 2015 }, { "epoch": 0.4191741025108944, "grad_norm": 2.4649428551198507, "learning_rate": 7.205012177369573e-06, "loss": 0.5097, "step": 2020 }, { "epoch": 0.4202116621705748, "grad_norm": 2.6470337729349334, "learning_rate": 7.188744591822514e-06, "loss": 0.5265, "step": 2025 }, { "epoch": 0.42124922183025526, "grad_norm": 2.6777002487915427, "learning_rate": 7.17244829479934e-06, "loss": 0.5132, "step": 2030 }, { "epoch": 0.42228678148993565, "grad_norm": 2.628723636023145, "learning_rate": 7.156123500071337e-06, "loss": 0.5383, "step": 2035 }, { "epoch": 0.4233243411496161, "grad_norm": 2.6077956409975322, "learning_rate": 7.139770421783616e-06, "loss": 0.5143, "step": 2040 }, { "epoch": 0.42436190080929653, "grad_norm": 2.5195148593267165, "learning_rate": 7.1233892744523055e-06, "loss": 0.5292, "step": 2045 }, { "epoch": 0.425399460468977, "grad_norm": 2.4869546800490276, "learning_rate": 7.1069802729617385e-06, "loss": 0.5219, "step": 2050 }, { "epoch": 0.4264370201286574, "grad_norm": 2.498457038824186, "learning_rate": 7.090543632561632e-06, "loss": 0.5227, "step": 2055 }, { "epoch": 0.4274745797883378, "grad_norm": 2.5334280776647855, "learning_rate": 7.0740795688642635e-06, "loss": 0.5174, "step": 2060 }, { "epoch": 0.42851213944801825, "grad_norm": 2.4142684408837667, "learning_rate": 7.057588297841645e-06, "loss": 0.5154, "step": 2065 }, { "epoch": 0.4295496991076987, "grad_norm": 2.7421341811351967, "learning_rate": 7.041070035822687e-06, "loss": 0.4983, "step": 2070 }, { "epoch": 0.43058725876737913, "grad_norm": 2.385233500154077, "learning_rate": 7.024524999490364e-06, "loss": 0.535, "step": 2075 }, { "epoch": 0.4316248184270596, "grad_norm": 2.493022029900477, "learning_rate": 7.007953405878867e-06, "loss": 0.5036, "step": 2080 }, { "epoch": 0.43266237808673996, "grad_norm": 2.5007769964543507, "learning_rate": 6.991355472370762e-06, "loss": 0.5288, "step": 2085 }, { "epoch": 0.4336999377464204, "grad_norm": 2.4139386188608842, "learning_rate": 6.974731416694135e-06, "loss": 0.5142, "step": 2090 }, { "epoch": 0.43473749740610085, "grad_norm": 2.5053762871669303, "learning_rate": 6.958081456919737e-06, "loss": 0.502, "step": 2095 }, { "epoch": 0.4357750570657813, "grad_norm": 2.5446406360038267, "learning_rate": 6.941405811458126e-06, "loss": 0.5079, "step": 2100 }, { "epoch": 0.43681261672546173, "grad_norm": 2.5024786886158465, "learning_rate": 6.924704699056792e-06, "loss": 0.5102, "step": 2105 }, { "epoch": 0.4378501763851421, "grad_norm": 2.5991293471527754, "learning_rate": 6.907978338797304e-06, "loss": 0.5033, "step": 2110 }, { "epoch": 0.43888773604482256, "grad_norm": 2.45095417799665, "learning_rate": 6.891226950092422e-06, "loss": 0.5033, "step": 2115 }, { "epoch": 0.439925295704503, "grad_norm": 2.5068078607240762, "learning_rate": 6.874450752683223e-06, "loss": 0.5131, "step": 2120 }, { "epoch": 0.44096285536418345, "grad_norm": 2.615502411671901, "learning_rate": 6.85764996663622e-06, "loss": 0.514, "step": 2125 }, { "epoch": 0.4420004150238639, "grad_norm": 2.483179103106064, "learning_rate": 6.840824812340476e-06, "loss": 0.482, "step": 2130 }, { "epoch": 0.4430379746835443, "grad_norm": 2.615056810287462, "learning_rate": 6.82397551050471e-06, "loss": 0.4981, "step": 2135 }, { "epoch": 0.4440755343432247, "grad_norm": 2.5081191557582962, "learning_rate": 6.807102282154406e-06, "loss": 0.5038, "step": 2140 }, { "epoch": 0.44511309400290516, "grad_norm": 2.4228335632525857, "learning_rate": 6.790205348628902e-06, "loss": 0.5116, "step": 2145 }, { "epoch": 0.4461506536625856, "grad_norm": 2.4865567217795834, "learning_rate": 6.773284931578508e-06, "loss": 0.4923, "step": 2150 }, { "epoch": 0.44718821332226605, "grad_norm": 2.5028391081664707, "learning_rate": 6.756341252961575e-06, "loss": 0.507, "step": 2155 }, { "epoch": 0.44822577298194644, "grad_norm": 2.4437292124023755, "learning_rate": 6.739374535041601e-06, "loss": 0.5041, "step": 2160 }, { "epoch": 0.4492633326416269, "grad_norm": 2.4804781219836705, "learning_rate": 6.722385000384305e-06, "loss": 0.5071, "step": 2165 }, { "epoch": 0.4503008923013073, "grad_norm": 2.613493029619529, "learning_rate": 6.705372871854713e-06, "loss": 0.5045, "step": 2170 }, { "epoch": 0.45133845196098776, "grad_norm": 2.5550452506734613, "learning_rate": 6.688338372614232e-06, "loss": 0.4954, "step": 2175 }, { "epoch": 0.4523760116206682, "grad_norm": 2.5823699134656795, "learning_rate": 6.671281726117721e-06, "loss": 0.5029, "step": 2180 }, { "epoch": 0.4534135712803486, "grad_norm": 2.6550826128938287, "learning_rate": 6.654203156110565e-06, "loss": 0.4942, "step": 2185 }, { "epoch": 0.45445113094002904, "grad_norm": 2.4268474548849976, "learning_rate": 6.6371028866257355e-06, "loss": 0.5027, "step": 2190 }, { "epoch": 0.4554886905997095, "grad_norm": 2.437142297358699, "learning_rate": 6.6199811419808525e-06, "loss": 0.4949, "step": 2195 }, { "epoch": 0.4565262502593899, "grad_norm": 2.494477319925722, "learning_rate": 6.602838146775243e-06, "loss": 0.4796, "step": 2200 }, { "epoch": 0.45756380991907036, "grad_norm": 2.5244330207655223, "learning_rate": 6.585674125886996e-06, "loss": 0.5066, "step": 2205 }, { "epoch": 0.45860136957875075, "grad_norm": 2.6127909654783608, "learning_rate": 6.568489304470007e-06, "loss": 0.4909, "step": 2210 }, { "epoch": 0.4596389292384312, "grad_norm": 2.5733537948603105, "learning_rate": 6.551283907951031e-06, "loss": 0.4886, "step": 2215 }, { "epoch": 0.46067648889811164, "grad_norm": 2.627653050074194, "learning_rate": 6.534058162026724e-06, "loss": 0.4871, "step": 2220 }, { "epoch": 0.4617140485577921, "grad_norm": 2.493707567539853, "learning_rate": 6.516812292660675e-06, "loss": 0.5115, "step": 2225 }, { "epoch": 0.4627516082174725, "grad_norm": 2.576298574074138, "learning_rate": 6.499546526080457e-06, "loss": 0.4935, "step": 2230 }, { "epoch": 0.46378916787715296, "grad_norm": 2.3441840780878227, "learning_rate": 6.482261088774642e-06, "loss": 0.4918, "step": 2235 }, { "epoch": 0.46482672753683335, "grad_norm": 2.5855232225238702, "learning_rate": 6.464956207489843e-06, "loss": 0.5009, "step": 2240 }, { "epoch": 0.4658642871965138, "grad_norm": 2.578617818062702, "learning_rate": 6.447632109227735e-06, "loss": 0.4931, "step": 2245 }, { "epoch": 0.46690184685619424, "grad_norm": 2.6300654942955752, "learning_rate": 6.4302890212420735e-06, "loss": 0.4924, "step": 2250 }, { "epoch": 0.4679394065158747, "grad_norm": 2.535272976711584, "learning_rate": 6.412927171035721e-06, "loss": 0.4864, "step": 2255 }, { "epoch": 0.4689769661755551, "grad_norm": 2.497392960102334, "learning_rate": 6.3955467863576555e-06, "loss": 0.502, "step": 2260 }, { "epoch": 0.4700145258352355, "grad_norm": 2.476885100955309, "learning_rate": 6.37814809519999e-06, "loss": 0.498, "step": 2265 }, { "epoch": 0.47105208549491595, "grad_norm": 2.3650744048382726, "learning_rate": 6.360731325794975e-06, "loss": 0.486, "step": 2270 }, { "epoch": 0.4720896451545964, "grad_norm": 2.469078585494707, "learning_rate": 6.343296706612008e-06, "loss": 0.4745, "step": 2275 }, { "epoch": 0.47312720481427684, "grad_norm": 2.5527304594928326, "learning_rate": 6.325844466354637e-06, "loss": 0.4959, "step": 2280 }, { "epoch": 0.4741647644739573, "grad_norm": 2.388876155874213, "learning_rate": 6.308374833957556e-06, "loss": 0.4787, "step": 2285 }, { "epoch": 0.47520232413363767, "grad_norm": 2.642090924554838, "learning_rate": 6.290888038583611e-06, "loss": 0.4951, "step": 2290 }, { "epoch": 0.4762398837933181, "grad_norm": 2.5493215911731557, "learning_rate": 6.273384309620785e-06, "loss": 0.4799, "step": 2295 }, { "epoch": 0.47727744345299855, "grad_norm": 2.3751142920110944, "learning_rate": 6.25586387667919e-06, "loss": 0.4841, "step": 2300 }, { "epoch": 0.478315003112679, "grad_norm": 2.4695590079808625, "learning_rate": 6.238326969588062e-06, "loss": 0.4739, "step": 2305 }, { "epoch": 0.47935256277235944, "grad_norm": 2.6302561474616937, "learning_rate": 6.220773818392738e-06, "loss": 0.4809, "step": 2310 }, { "epoch": 0.4803901224320398, "grad_norm": 2.5254595904398744, "learning_rate": 6.203204653351642e-06, "loss": 0.4964, "step": 2315 }, { "epoch": 0.48142768209172027, "grad_norm": 2.4902195244274288, "learning_rate": 6.185619704933267e-06, "loss": 0.4654, "step": 2320 }, { "epoch": 0.4824652417514007, "grad_norm": 2.5224684668035007, "learning_rate": 6.168019203813143e-06, "loss": 0.479, "step": 2325 }, { "epoch": 0.48350280141108115, "grad_norm": 2.531197814844583, "learning_rate": 6.15040338087082e-06, "loss": 0.4756, "step": 2330 }, { "epoch": 0.4845403610707616, "grad_norm": 2.3546555565826135, "learning_rate": 6.132772467186841e-06, "loss": 0.4649, "step": 2335 }, { "epoch": 0.485577920730442, "grad_norm": 2.3784120564924622, "learning_rate": 6.115126694039699e-06, "loss": 0.4709, "step": 2340 }, { "epoch": 0.4866154803901224, "grad_norm": 2.56815940386139, "learning_rate": 6.097466292902815e-06, "loss": 0.486, "step": 2345 }, { "epoch": 0.48765304004980287, "grad_norm": 2.5301845486075103, "learning_rate": 6.079791495441491e-06, "loss": 0.4754, "step": 2350 }, { "epoch": 0.4886905997094833, "grad_norm": 2.4650710165826806, "learning_rate": 6.062102533509886e-06, "loss": 0.4663, "step": 2355 }, { "epoch": 0.48972815936916375, "grad_norm": 2.7587452612942145, "learning_rate": 6.044399639147957e-06, "loss": 0.4632, "step": 2360 }, { "epoch": 0.49076571902884414, "grad_norm": 2.543480240365517, "learning_rate": 6.026683044578427e-06, "loss": 0.4689, "step": 2365 }, { "epoch": 0.4918032786885246, "grad_norm": 2.4525809117166366, "learning_rate": 6.008952982203737e-06, "loss": 0.4843, "step": 2370 }, { "epoch": 0.492840838348205, "grad_norm": 2.417071327082838, "learning_rate": 5.991209684602991e-06, "loss": 0.4677, "step": 2375 }, { "epoch": 0.49387839800788547, "grad_norm": 2.8159423032357394, "learning_rate": 5.9734533845289144e-06, "loss": 0.466, "step": 2380 }, { "epoch": 0.4949159576675659, "grad_norm": 2.5331145358141023, "learning_rate": 5.955684314904795e-06, "loss": 0.491, "step": 2385 }, { "epoch": 0.4959535173272463, "grad_norm": 2.5850294725860103, "learning_rate": 5.937902708821427e-06, "loss": 0.4727, "step": 2390 }, { "epoch": 0.49699107698692674, "grad_norm": 2.3455874492137627, "learning_rate": 5.920108799534059e-06, "loss": 0.4699, "step": 2395 }, { "epoch": 0.4980286366466072, "grad_norm": 2.496007013879939, "learning_rate": 5.902302820459324e-06, "loss": 0.4599, "step": 2400 }, { "epoch": 0.4990661963062876, "grad_norm": 2.412699996479322, "learning_rate": 5.884485005172189e-06, "loss": 0.474, "step": 2405 }, { "epoch": 0.500103755965968, "grad_norm": 2.6197335120279805, "learning_rate": 5.866655587402886e-06, "loss": 0.4815, "step": 2410 }, { "epoch": 0.5011413156256485, "grad_norm": 2.632568563481647, "learning_rate": 5.8488148010338445e-06, "loss": 0.474, "step": 2415 }, { "epoch": 0.5021788752853289, "grad_norm": 2.383359491818944, "learning_rate": 5.8309628800966225e-06, "loss": 0.464, "step": 2420 }, { "epoch": 0.5032164349450093, "grad_norm": 2.687191301722998, "learning_rate": 5.813100058768841e-06, "loss": 0.4671, "step": 2425 }, { "epoch": 0.5042539946046898, "grad_norm": 2.455559246352499, "learning_rate": 5.795226571371114e-06, "loss": 0.4682, "step": 2430 }, { "epoch": 0.5052915542643702, "grad_norm": 2.589166158482144, "learning_rate": 5.777342652363963e-06, "loss": 0.4756, "step": 2435 }, { "epoch": 0.5063291139240507, "grad_norm": 2.5173329990266295, "learning_rate": 5.759448536344753e-06, "loss": 0.4849, "step": 2440 }, { "epoch": 0.5073666735837311, "grad_norm": 2.3813334648732227, "learning_rate": 5.741544458044611e-06, "loss": 0.4725, "step": 2445 }, { "epoch": 0.5084042332434114, "grad_norm": 2.4645035637647683, "learning_rate": 5.723630652325349e-06, "loss": 0.4523, "step": 2450 }, { "epoch": 0.5094417929030919, "grad_norm": 2.5330130007551848, "learning_rate": 5.705707354176377e-06, "loss": 0.4655, "step": 2455 }, { "epoch": 0.5104793525627723, "grad_norm": 2.3938015606881886, "learning_rate": 5.687774798711627e-06, "loss": 0.468, "step": 2460 }, { "epoch": 0.5115169122224528, "grad_norm": 2.647072227845224, "learning_rate": 5.669833221166469e-06, "loss": 0.4695, "step": 2465 }, { "epoch": 0.5125544718821332, "grad_norm": 2.4713289117715544, "learning_rate": 5.651882856894615e-06, "loss": 0.4617, "step": 2470 }, { "epoch": 0.5135920315418137, "grad_norm": 2.494015444510411, "learning_rate": 5.633923941365049e-06, "loss": 0.4659, "step": 2475 }, { "epoch": 0.5146295912014941, "grad_norm": 2.3455915787636807, "learning_rate": 5.615956710158921e-06, "loss": 0.4563, "step": 2480 }, { "epoch": 0.5156671508611745, "grad_norm": 2.453795440026443, "learning_rate": 5.597981398966468e-06, "loss": 0.4698, "step": 2485 }, { "epoch": 0.516704710520855, "grad_norm": 2.3862841672431143, "learning_rate": 5.579998243583919e-06, "loss": 0.4583, "step": 2490 }, { "epoch": 0.5177422701805354, "grad_norm": 2.412824706136417, "learning_rate": 5.562007479910396e-06, "loss": 0.4714, "step": 2495 }, { "epoch": 0.5187798298402159, "grad_norm": 2.5121998422157543, "learning_rate": 5.544009343944834e-06, "loss": 0.4597, "step": 2500 }, { "epoch": 0.5198173894998962, "grad_norm": 2.479588579717285, "learning_rate": 5.526004071782868e-06, "loss": 0.461, "step": 2505 }, { "epoch": 0.5208549491595766, "grad_norm": 2.4676776770935365, "learning_rate": 5.507991899613746e-06, "loss": 0.4632, "step": 2510 }, { "epoch": 0.5218925088192571, "grad_norm": 2.4315208171491838, "learning_rate": 5.489973063717233e-06, "loss": 0.4702, "step": 2515 }, { "epoch": 0.5229300684789375, "grad_norm": 2.430974288239865, "learning_rate": 5.471947800460502e-06, "loss": 0.4389, "step": 2520 }, { "epoch": 0.523967628138618, "grad_norm": 2.481418769699562, "learning_rate": 5.453916346295043e-06, "loss": 0.4516, "step": 2525 }, { "epoch": 0.5250051877982984, "grad_norm": 2.454530070786792, "learning_rate": 5.435878937753553e-06, "loss": 0.4461, "step": 2530 }, { "epoch": 0.5260427474579789, "grad_norm": 2.412430721171224, "learning_rate": 5.417835811446839e-06, "loss": 0.4516, "step": 2535 }, { "epoch": 0.5270803071176593, "grad_norm": 2.5095500970858566, "learning_rate": 5.3997872040607154e-06, "loss": 0.4647, "step": 2540 }, { "epoch": 0.5281178667773397, "grad_norm": 2.47241909883305, "learning_rate": 5.3817333523528895e-06, "loss": 0.4529, "step": 2545 }, { "epoch": 0.5291554264370202, "grad_norm": 2.5164134259250677, "learning_rate": 5.363674493149868e-06, "loss": 0.4584, "step": 2550 }, { "epoch": 0.5301929860967005, "grad_norm": 2.4698416937451073, "learning_rate": 5.345610863343843e-06, "loss": 0.4479, "step": 2555 }, { "epoch": 0.531230545756381, "grad_norm": 2.388109532933269, "learning_rate": 5.327542699889586e-06, "loss": 0.4527, "step": 2560 }, { "epoch": 0.5322681054160614, "grad_norm": 2.532256574198705, "learning_rate": 5.309470239801343e-06, "loss": 0.4541, "step": 2565 }, { "epoch": 0.5333056650757418, "grad_norm": 2.414906181196464, "learning_rate": 5.291393720149716e-06, "loss": 0.4415, "step": 2570 }, { "epoch": 0.5343432247354223, "grad_norm": 2.5860824474566146, "learning_rate": 5.273313378058566e-06, "loss": 0.4377, "step": 2575 }, { "epoch": 0.5353807843951027, "grad_norm": 2.6418841514226967, "learning_rate": 5.255229450701893e-06, "loss": 0.4342, "step": 2580 }, { "epoch": 0.5364183440547832, "grad_norm": 2.4497315903217247, "learning_rate": 5.237142175300726e-06, "loss": 0.4533, "step": 2585 }, { "epoch": 0.5374559037144636, "grad_norm": 2.4935146868944265, "learning_rate": 5.219051789120015e-06, "loss": 0.44, "step": 2590 }, { "epoch": 0.538493463374144, "grad_norm": 2.4091632402760346, "learning_rate": 5.200958529465517e-06, "loss": 0.454, "step": 2595 }, { "epoch": 0.5395310230338245, "grad_norm": 2.460401303833881, "learning_rate": 5.182862633680683e-06, "loss": 0.4512, "step": 2600 }, { "epoch": 0.5405685826935048, "grad_norm": 2.49980869010523, "learning_rate": 5.164764339143542e-06, "loss": 0.4531, "step": 2605 }, { "epoch": 0.5416061423531853, "grad_norm": 2.3750857632390767, "learning_rate": 5.14666388326359e-06, "loss": 0.4548, "step": 2610 }, { "epoch": 0.5426437020128657, "grad_norm": 2.402759567147983, "learning_rate": 5.128561503478676e-06, "loss": 0.4582, "step": 2615 }, { "epoch": 0.5436812616725462, "grad_norm": 2.387820424005501, "learning_rate": 5.110457437251886e-06, "loss": 0.4413, "step": 2620 }, { "epoch": 0.5447188213322266, "grad_norm": 2.3796099453754764, "learning_rate": 5.092351922068427e-06, "loss": 0.4524, "step": 2625 }, { "epoch": 0.545756380991907, "grad_norm": 2.3669548551534563, "learning_rate": 5.0742451954325156e-06, "loss": 0.4473, "step": 2630 }, { "epoch": 0.5467939406515875, "grad_norm": 2.463149244250711, "learning_rate": 5.056137494864259e-06, "loss": 0.447, "step": 2635 }, { "epoch": 0.5478315003112679, "grad_norm": 2.3744815980543494, "learning_rate": 5.0380290578965375e-06, "loss": 0.4404, "step": 2640 }, { "epoch": 0.5488690599709484, "grad_norm": 2.3391410347348796, "learning_rate": 5.019920122071896e-06, "loss": 0.4388, "step": 2645 }, { "epoch": 0.5499066196306288, "grad_norm": 2.3768157404751085, "learning_rate": 5.00181092493942e-06, "loss": 0.4386, "step": 2650 }, { "epoch": 0.5509441792903091, "grad_norm": 2.6059542471826824, "learning_rate": 4.983701704051625e-06, "loss": 0.4528, "step": 2655 }, { "epoch": 0.5519817389499896, "grad_norm": 2.417585242968673, "learning_rate": 4.965592696961335e-06, "loss": 0.4501, "step": 2660 }, { "epoch": 0.55301929860967, "grad_norm": 2.4947079086664123, "learning_rate": 4.947484141218572e-06, "loss": 0.4385, "step": 2665 }, { "epoch": 0.5540568582693505, "grad_norm": 2.5191730683436604, "learning_rate": 4.929376274367438e-06, "loss": 0.4324, "step": 2670 }, { "epoch": 0.5550944179290309, "grad_norm": 2.4257268049363336, "learning_rate": 4.911269333942994e-06, "loss": 0.4388, "step": 2675 }, { "epoch": 0.5561319775887114, "grad_norm": 2.5184876591899648, "learning_rate": 4.893163557468155e-06, "loss": 0.4316, "step": 2680 }, { "epoch": 0.5571695372483918, "grad_norm": 2.380468990224694, "learning_rate": 4.87505918245056e-06, "loss": 0.437, "step": 2685 }, { "epoch": 0.5582070969080722, "grad_norm": 2.3265663935377727, "learning_rate": 4.856956446379472e-06, "loss": 0.4307, "step": 2690 }, { "epoch": 0.5592446565677527, "grad_norm": 2.3400451782909646, "learning_rate": 4.838855586722647e-06, "loss": 0.4351, "step": 2695 }, { "epoch": 0.5602822162274331, "grad_norm": 2.4589440190512675, "learning_rate": 4.820756840923232e-06, "loss": 0.4447, "step": 2700 }, { "epoch": 0.5613197758871135, "grad_norm": 2.4951528488343184, "learning_rate": 4.802660446396642e-06, "loss": 0.4445, "step": 2705 }, { "epoch": 0.5623573355467939, "grad_norm": 2.666275326085189, "learning_rate": 4.784566640527451e-06, "loss": 0.4406, "step": 2710 }, { "epoch": 0.5633948952064743, "grad_norm": 2.4165213736699522, "learning_rate": 4.766475660666271e-06, "loss": 0.4222, "step": 2715 }, { "epoch": 0.5644324548661548, "grad_norm": 2.345464539142852, "learning_rate": 4.748387744126649e-06, "loss": 0.4355, "step": 2720 }, { "epoch": 0.5654700145258352, "grad_norm": 2.5576836547836215, "learning_rate": 4.730303128181944e-06, "loss": 0.4289, "step": 2725 }, { "epoch": 0.5665075741855157, "grad_norm": 2.5283882321035858, "learning_rate": 4.712222050062219e-06, "loss": 0.4283, "step": 2730 }, { "epoch": 0.5675451338451961, "grad_norm": 2.556873104995108, "learning_rate": 4.694144746951131e-06, "loss": 0.434, "step": 2735 }, { "epoch": 0.5685826935048766, "grad_norm": 2.280037463289983, "learning_rate": 4.676071455982811e-06, "loss": 0.4294, "step": 2740 }, { "epoch": 0.569620253164557, "grad_norm": 2.3570715386535555, "learning_rate": 4.658002414238771e-06, "loss": 0.4357, "step": 2745 }, { "epoch": 0.5706578128242374, "grad_norm": 2.4558176026821217, "learning_rate": 4.63993785874477e-06, "loss": 0.4421, "step": 2750 }, { "epoch": 0.5716953724839178, "grad_norm": 2.4779834247504082, "learning_rate": 4.621878026467725e-06, "loss": 0.4336, "step": 2755 }, { "epoch": 0.5727329321435982, "grad_norm": 2.3737617720589483, "learning_rate": 4.603823154312593e-06, "loss": 0.4263, "step": 2760 }, { "epoch": 0.5737704918032787, "grad_norm": 2.4648165069495147, "learning_rate": 4.585773479119265e-06, "loss": 0.4487, "step": 2765 }, { "epoch": 0.5748080514629591, "grad_norm": 2.3105842246846064, "learning_rate": 4.567729237659459e-06, "loss": 0.4252, "step": 2770 }, { "epoch": 0.5758456111226395, "grad_norm": 2.560822029486094, "learning_rate": 4.549690666633615e-06, "loss": 0.4432, "step": 2775 }, { "epoch": 0.57688317078232, "grad_norm": 2.6130981185416, "learning_rate": 4.531658002667787e-06, "loss": 0.4402, "step": 2780 }, { "epoch": 0.5779207304420004, "grad_norm": 2.363414455159581, "learning_rate": 4.51363148231055e-06, "loss": 0.4351, "step": 2785 }, { "epoch": 0.5789582901016809, "grad_norm": 2.363075041688293, "learning_rate": 4.495611342029875e-06, "loss": 0.428, "step": 2790 }, { "epoch": 0.5799958497613613, "grad_norm": 2.4754540310783733, "learning_rate": 4.477597818210054e-06, "loss": 0.4246, "step": 2795 }, { "epoch": 0.5810334094210418, "grad_norm": 2.6816555150046755, "learning_rate": 4.459591147148575e-06, "loss": 0.4253, "step": 2800 }, { "epoch": 0.5820709690807221, "grad_norm": 2.4036860880645894, "learning_rate": 4.441591565053041e-06, "loss": 0.4272, "step": 2805 }, { "epoch": 0.5831085287404025, "grad_norm": 2.4802074251251156, "learning_rate": 4.423599308038057e-06, "loss": 0.4209, "step": 2810 }, { "epoch": 0.584146088400083, "grad_norm": 2.415585886184946, "learning_rate": 4.405614612122145e-06, "loss": 0.4226, "step": 2815 }, { "epoch": 0.5851836480597634, "grad_norm": 2.4533356227902816, "learning_rate": 4.387637713224638e-06, "loss": 0.4343, "step": 2820 }, { "epoch": 0.5862212077194439, "grad_norm": 2.7240629036546284, "learning_rate": 4.36966884716259e-06, "loss": 0.4287, "step": 2825 }, { "epoch": 0.5872587673791243, "grad_norm": 2.3878324516316276, "learning_rate": 4.3517082496476845e-06, "loss": 0.4184, "step": 2830 }, { "epoch": 0.5882963270388047, "grad_norm": 2.422038027754476, "learning_rate": 4.333756156283136e-06, "loss": 0.4209, "step": 2835 }, { "epoch": 0.5893338866984852, "grad_norm": 2.5027178602080697, "learning_rate": 4.315812802560609e-06, "loss": 0.4151, "step": 2840 }, { "epoch": 0.5903714463581656, "grad_norm": 2.440654195509635, "learning_rate": 4.2978784238571145e-06, "loss": 0.424, "step": 2845 }, { "epoch": 0.5914090060178461, "grad_norm": 2.4534970838374206, "learning_rate": 4.279953255431944e-06, "loss": 0.427, "step": 2850 }, { "epoch": 0.5924465656775264, "grad_norm": 2.4846011931620087, "learning_rate": 4.262037532423556e-06, "loss": 0.4376, "step": 2855 }, { "epoch": 0.5934841253372068, "grad_norm": 2.4555820059315, "learning_rate": 4.244131489846519e-06, "loss": 0.4102, "step": 2860 }, { "epoch": 0.5945216849968873, "grad_norm": 2.443517931338813, "learning_rate": 4.2262353625884054e-06, "loss": 0.4138, "step": 2865 }, { "epoch": 0.5955592446565677, "grad_norm": 2.369110831081435, "learning_rate": 4.208349385406729e-06, "loss": 0.4364, "step": 2870 }, { "epoch": 0.5965968043162482, "grad_norm": 2.3567775981536423, "learning_rate": 4.190473792925851e-06, "loss": 0.4277, "step": 2875 }, { "epoch": 0.5976343639759286, "grad_norm": 2.3822530664354216, "learning_rate": 4.1726088196339106e-06, "loss": 0.4266, "step": 2880 }, { "epoch": 0.5986719236356091, "grad_norm": 2.3958405655171293, "learning_rate": 4.154754699879748e-06, "loss": 0.4177, "step": 2885 }, { "epoch": 0.5997094832952895, "grad_norm": 2.3762721908121573, "learning_rate": 4.136911667869827e-06, "loss": 0.4146, "step": 2890 }, { "epoch": 0.60074704295497, "grad_norm": 2.2932296484786137, "learning_rate": 4.119079957665163e-06, "loss": 0.4074, "step": 2895 }, { "epoch": 0.6017846026146504, "grad_norm": 2.446668454803684, "learning_rate": 4.101259803178265e-06, "loss": 0.4318, "step": 2900 }, { "epoch": 0.6028221622743307, "grad_norm": 2.361867602889005, "learning_rate": 4.083451438170039e-06, "loss": 0.4098, "step": 2905 }, { "epoch": 0.6038597219340112, "grad_norm": 2.323202296910631, "learning_rate": 4.065655096246755e-06, "loss": 0.408, "step": 2910 }, { "epoch": 0.6048972815936916, "grad_norm": 2.3845839128387722, "learning_rate": 4.047871010856959e-06, "loss": 0.4071, "step": 2915 }, { "epoch": 0.605934841253372, "grad_norm": 2.317266250356725, "learning_rate": 4.03009941528842e-06, "loss": 0.4051, "step": 2920 }, { "epoch": 0.6069724009130525, "grad_norm": 2.4278074780829852, "learning_rate": 4.012340542665067e-06, "loss": 0.4002, "step": 2925 }, { "epoch": 0.6080099605727329, "grad_norm": 2.4948657955155853, "learning_rate": 3.994594625943936e-06, "loss": 0.4103, "step": 2930 }, { "epoch": 0.6090475202324134, "grad_norm": 2.4097506379402596, "learning_rate": 3.976861897912106e-06, "loss": 0.4137, "step": 2935 }, { "epoch": 0.6100850798920938, "grad_norm": 2.695103229335665, "learning_rate": 3.959142591183652e-06, "loss": 0.4184, "step": 2940 }, { "epoch": 0.6111226395517743, "grad_norm": 2.433881870125308, "learning_rate": 3.9414369381965904e-06, "loss": 0.4084, "step": 2945 }, { "epoch": 0.6121601992114547, "grad_norm": 2.424853521797893, "learning_rate": 3.92374517120983e-06, "loss": 0.4157, "step": 2950 }, { "epoch": 0.6131977588711351, "grad_norm": 2.3289318124697926, "learning_rate": 3.90606752230013e-06, "loss": 0.4002, "step": 2955 }, { "epoch": 0.6142353185308155, "grad_norm": 2.4465640629382186, "learning_rate": 3.888404223359045e-06, "loss": 0.4057, "step": 2960 }, { "epoch": 0.6152728781904959, "grad_norm": 2.572631649766761, "learning_rate": 3.870755506089899e-06, "loss": 0.4144, "step": 2965 }, { "epoch": 0.6163104378501764, "grad_norm": 2.4132420332744204, "learning_rate": 3.8531216020047246e-06, "loss": 0.4116, "step": 2970 }, { "epoch": 0.6173479975098568, "grad_norm": 2.3892278386798593, "learning_rate": 3.835502742421251e-06, "loss": 0.4093, "step": 2975 }, { "epoch": 0.6183855571695372, "grad_norm": 2.2641401615974406, "learning_rate": 3.8178991584598474e-06, "loss": 0.4131, "step": 2980 }, { "epoch": 0.6194231168292177, "grad_norm": 2.438762066594083, "learning_rate": 3.8003110810405065e-06, "loss": 0.4064, "step": 2985 }, { "epoch": 0.6204606764888981, "grad_norm": 2.4506479186056396, "learning_rate": 3.782738740879806e-06, "loss": 0.4052, "step": 2990 }, { "epoch": 0.6214982361485786, "grad_norm": 2.5283170692610617, "learning_rate": 3.7651823684878884e-06, "loss": 0.396, "step": 2995 }, { "epoch": 0.622535795808259, "grad_norm": 2.4749410345861453, "learning_rate": 3.7476421941654318e-06, "loss": 0.4193, "step": 3000 }, { "epoch": 0.6235733554679395, "grad_norm": 2.317549282593909, "learning_rate": 3.7301184480006337e-06, "loss": 0.3973, "step": 3005 }, { "epoch": 0.6246109151276198, "grad_norm": 2.5252500112854275, "learning_rate": 3.712611359866188e-06, "loss": 0.4147, "step": 3010 }, { "epoch": 0.6256484747873002, "grad_norm": 2.4400394840947475, "learning_rate": 3.6951211594162784e-06, "loss": 0.4089, "step": 3015 }, { "epoch": 0.6266860344469807, "grad_norm": 2.349957863539589, "learning_rate": 3.677648076083549e-06, "loss": 0.3992, "step": 3020 }, { "epoch": 0.6277235941066611, "grad_norm": 2.3832033455743833, "learning_rate": 3.6601923390761156e-06, "loss": 0.4131, "step": 3025 }, { "epoch": 0.6287611537663416, "grad_norm": 2.576394180910392, "learning_rate": 3.6427541773745433e-06, "loss": 0.3968, "step": 3030 }, { "epoch": 0.629798713426022, "grad_norm": 2.4507894648738877, "learning_rate": 3.6253338197288505e-06, "loss": 0.4023, "step": 3035 }, { "epoch": 0.6308362730857024, "grad_norm": 2.3382065135829997, "learning_rate": 3.607931494655504e-06, "loss": 0.3918, "step": 3040 }, { "epoch": 0.6318738327453829, "grad_norm": 2.4244914619480373, "learning_rate": 3.5905474304344225e-06, "loss": 0.4117, "step": 3045 }, { "epoch": 0.6329113924050633, "grad_norm": 2.567189154890026, "learning_rate": 3.573181855105986e-06, "loss": 0.42, "step": 3050 }, { "epoch": 0.6339489520647438, "grad_norm": 2.278238684757535, "learning_rate": 3.555834996468039e-06, "loss": 0.4041, "step": 3055 }, { "epoch": 0.6349865117244241, "grad_norm": 2.504690078277028, "learning_rate": 3.538507082072905e-06, "loss": 0.3944, "step": 3060 }, { "epoch": 0.6360240713841046, "grad_norm": 2.508152236947373, "learning_rate": 3.5211983392243996e-06, "loss": 0.4, "step": 3065 }, { "epoch": 0.637061631043785, "grad_norm": 2.3910619945271683, "learning_rate": 3.503908994974856e-06, "loss": 0.4093, "step": 3070 }, { "epoch": 0.6380991907034654, "grad_norm": 2.3248985179294652, "learning_rate": 3.4866392761221303e-06, "loss": 0.4065, "step": 3075 }, { "epoch": 0.6391367503631459, "grad_norm": 2.361597466745494, "learning_rate": 3.4693894092066483e-06, "loss": 0.3907, "step": 3080 }, { "epoch": 0.6401743100228263, "grad_norm": 2.4949060134883743, "learning_rate": 3.452159620508414e-06, "loss": 0.4004, "step": 3085 }, { "epoch": 0.6412118696825068, "grad_norm": 2.3586999565378886, "learning_rate": 3.4349501360440556e-06, "loss": 0.3977, "step": 3090 }, { "epoch": 0.6422494293421872, "grad_norm": 2.2746540643844426, "learning_rate": 3.417761181563849e-06, "loss": 0.3949, "step": 3095 }, { "epoch": 0.6432869890018676, "grad_norm": 2.4940700061779064, "learning_rate": 3.4005929825487684e-06, "loss": 0.4011, "step": 3100 }, { "epoch": 0.6443245486615481, "grad_norm": 2.3950183894327934, "learning_rate": 3.383445764207516e-06, "loss": 0.408, "step": 3105 }, { "epoch": 0.6453621083212284, "grad_norm": 2.30382353091722, "learning_rate": 3.366319751473579e-06, "loss": 0.4022, "step": 3110 }, { "epoch": 0.6463996679809089, "grad_norm": 2.4321346720182766, "learning_rate": 3.3492151690022712e-06, "loss": 0.3986, "step": 3115 }, { "epoch": 0.6474372276405893, "grad_norm": 2.3061011632873285, "learning_rate": 3.332132241167793e-06, "loss": 0.3972, "step": 3120 }, { "epoch": 0.6484747873002698, "grad_norm": 2.4917204012266327, "learning_rate": 3.3150711920602765e-06, "loss": 0.4042, "step": 3125 }, { "epoch": 0.6495123469599502, "grad_norm": 2.190798555385459, "learning_rate": 3.2980322454828617e-06, "loss": 0.3917, "step": 3130 }, { "epoch": 0.6505499066196306, "grad_norm": 2.343964465011578, "learning_rate": 3.281015624948746e-06, "loss": 0.3893, "step": 3135 }, { "epoch": 0.6515874662793111, "grad_norm": 2.61426487111248, "learning_rate": 3.264021553678264e-06, "loss": 0.4087, "step": 3140 }, { "epoch": 0.6526250259389915, "grad_norm": 2.3381914892622553, "learning_rate": 3.247050254595947e-06, "loss": 0.3996, "step": 3145 }, { "epoch": 0.653662585598672, "grad_norm": 2.4097371109758945, "learning_rate": 3.2301019503276144e-06, "loss": 0.404, "step": 3150 }, { "epoch": 0.6547001452583524, "grad_norm": 2.3425157027203127, "learning_rate": 3.2131768631974375e-06, "loss": 0.4025, "step": 3155 }, { "epoch": 0.6557377049180327, "grad_norm": 2.405066414410578, "learning_rate": 3.196275215225032e-06, "loss": 0.4095, "step": 3160 }, { "epoch": 0.6567752645777132, "grad_norm": 2.332707293053418, "learning_rate": 3.179397228122547e-06, "loss": 0.4, "step": 3165 }, { "epoch": 0.6578128242373936, "grad_norm": 2.374933561243685, "learning_rate": 3.162543123291749e-06, "loss": 0.3887, "step": 3170 }, { "epoch": 0.6588503838970741, "grad_norm": 2.265067002723173, "learning_rate": 3.1457131218211263e-06, "loss": 0.3974, "step": 3175 }, { "epoch": 0.6598879435567545, "grad_norm": 2.363694486377073, "learning_rate": 3.1289074444829783e-06, "loss": 0.3932, "step": 3180 }, { "epoch": 0.660925503216435, "grad_norm": 2.5691413705019603, "learning_rate": 3.1121263117305355e-06, "loss": 0.3848, "step": 3185 }, { "epoch": 0.6619630628761154, "grad_norm": 2.3700655398167005, "learning_rate": 3.0953699436950464e-06, "loss": 0.3942, "step": 3190 }, { "epoch": 0.6630006225357958, "grad_norm": 2.5079197198404932, "learning_rate": 3.0786385601829114e-06, "loss": 0.3921, "step": 3195 }, { "epoch": 0.6640381821954763, "grad_norm": 2.3572529532760136, "learning_rate": 3.061932380672783e-06, "loss": 0.389, "step": 3200 }, { "epoch": 0.6650757418551567, "grad_norm": 2.3406368361183643, "learning_rate": 3.0452516243126955e-06, "loss": 0.3942, "step": 3205 }, { "epoch": 0.666113301514837, "grad_norm": 2.2910811346177953, "learning_rate": 3.0285965099171864e-06, "loss": 0.3954, "step": 3210 }, { "epoch": 0.6671508611745175, "grad_norm": 2.3711327251353254, "learning_rate": 3.0119672559644313e-06, "loss": 0.3825, "step": 3215 }, { "epoch": 0.6681884208341979, "grad_norm": 2.346432027672407, "learning_rate": 2.995364080593368e-06, "loss": 0.3862, "step": 3220 }, { "epoch": 0.6692259804938784, "grad_norm": 2.25488355968479, "learning_rate": 2.978787201600847e-06, "loss": 0.3869, "step": 3225 }, { "epoch": 0.6702635401535588, "grad_norm": 2.5145720297012204, "learning_rate": 2.9622368364387626e-06, "loss": 0.3979, "step": 3230 }, { "epoch": 0.6713010998132393, "grad_norm": 2.430288285642923, "learning_rate": 2.9457132022112156e-06, "loss": 0.3876, "step": 3235 }, { "epoch": 0.6723386594729197, "grad_norm": 2.52733248085438, "learning_rate": 2.9292165156716447e-06, "loss": 0.3918, "step": 3240 }, { "epoch": 0.6733762191326002, "grad_norm": 2.4117862477617464, "learning_rate": 2.9127469932200034e-06, "loss": 0.3904, "step": 3245 }, { "epoch": 0.6744137787922806, "grad_norm": 2.544838045591555, "learning_rate": 2.89630485089991e-06, "loss": 0.3968, "step": 3250 }, { "epoch": 0.675451338451961, "grad_norm": 2.3114731154281687, "learning_rate": 2.879890304395816e-06, "loss": 0.389, "step": 3255 }, { "epoch": 0.6764888981116414, "grad_norm": 2.214278091013657, "learning_rate": 2.8635035690301725e-06, "loss": 0.368, "step": 3260 }, { "epoch": 0.6775264577713218, "grad_norm": 2.4150879417081033, "learning_rate": 2.847144859760622e-06, "loss": 0.3997, "step": 3265 }, { "epoch": 0.6785640174310023, "grad_norm": 2.2503456214277766, "learning_rate": 2.8308143911771555e-06, "loss": 0.3785, "step": 3270 }, { "epoch": 0.6796015770906827, "grad_norm": 2.400256629243757, "learning_rate": 2.8145123774993075e-06, "loss": 0.3873, "step": 3275 }, { "epoch": 0.6806391367503631, "grad_norm": 2.4464548102871393, "learning_rate": 2.798239032573362e-06, "loss": 0.3811, "step": 3280 }, { "epoch": 0.6816766964100436, "grad_norm": 2.5094627463784973, "learning_rate": 2.7819945698695148e-06, "loss": 0.387, "step": 3285 }, { "epoch": 0.682714256069724, "grad_norm": 2.4588062818595944, "learning_rate": 2.765779202479103e-06, "loss": 0.3848, "step": 3290 }, { "epoch": 0.6837518157294045, "grad_norm": 2.2530973533871035, "learning_rate": 2.749593143111793e-06, "loss": 0.3641, "step": 3295 }, { "epoch": 0.6847893753890849, "grad_norm": 2.3169454419169684, "learning_rate": 2.733436604092797e-06, "loss": 0.378, "step": 3300 }, { "epoch": 0.6858269350487654, "grad_norm": 2.5056936406549917, "learning_rate": 2.7173097973600806e-06, "loss": 0.3837, "step": 3305 }, { "epoch": 0.6868644947084457, "grad_norm": 2.315516361770829, "learning_rate": 2.7012129344615933e-06, "loss": 0.3797, "step": 3310 }, { "epoch": 0.6879020543681261, "grad_norm": 2.3765574424439504, "learning_rate": 2.6851462265524862e-06, "loss": 0.3821, "step": 3315 }, { "epoch": 0.6889396140278066, "grad_norm": 2.632664268427571, "learning_rate": 2.6691098843923464e-06, "loss": 0.3869, "step": 3320 }, { "epoch": 0.689977173687487, "grad_norm": 2.4558633695260554, "learning_rate": 2.65310411834242e-06, "loss": 0.3778, "step": 3325 }, { "epoch": 0.6910147333471675, "grad_norm": 2.3802011004843053, "learning_rate": 2.637129138362877e-06, "loss": 0.3818, "step": 3330 }, { "epoch": 0.6920522930068479, "grad_norm": 2.3545532106676625, "learning_rate": 2.62118515401003e-06, "loss": 0.3745, "step": 3335 }, { "epoch": 0.6930898526665283, "grad_norm": 2.2370151266527913, "learning_rate": 2.6052723744336027e-06, "loss": 0.382, "step": 3340 }, { "epoch": 0.6941274123262088, "grad_norm": 2.3233925949072645, "learning_rate": 2.589391008373982e-06, "loss": 0.3901, "step": 3345 }, { "epoch": 0.6951649719858892, "grad_norm": 2.385447861484446, "learning_rate": 2.5735412641594804e-06, "loss": 0.3804, "step": 3350 }, { "epoch": 0.6962025316455697, "grad_norm": 2.3578389155683217, "learning_rate": 2.5577233497035943e-06, "loss": 0.3888, "step": 3355 }, { "epoch": 0.69724009130525, "grad_norm": 2.3006918024124965, "learning_rate": 2.541937472502293e-06, "loss": 0.3661, "step": 3360 }, { "epoch": 0.6982776509649304, "grad_norm": 2.303381728187516, "learning_rate": 2.526183839631283e-06, "loss": 0.3827, "step": 3365 }, { "epoch": 0.6993152106246109, "grad_norm": 2.324125134648238, "learning_rate": 2.5104626577433022e-06, "loss": 0.376, "step": 3370 }, { "epoch": 0.7003527702842913, "grad_norm": 2.4863826311733614, "learning_rate": 2.4947741330653942e-06, "loss": 0.3765, "step": 3375 }, { "epoch": 0.7013903299439718, "grad_norm": 2.309178058770916, "learning_rate": 2.4791184713962207e-06, "loss": 0.3665, "step": 3380 }, { "epoch": 0.7024278896036522, "grad_norm": 2.338910146993185, "learning_rate": 2.463495878103352e-06, "loss": 0.3601, "step": 3385 }, { "epoch": 0.7034654492633327, "grad_norm": 2.386442857917931, "learning_rate": 2.4479065581205673e-06, "loss": 0.3775, "step": 3390 }, { "epoch": 0.7045030089230131, "grad_norm": 2.4264131593130913, "learning_rate": 2.4323507159451887e-06, "loss": 0.3775, "step": 3395 }, { "epoch": 0.7055405685826935, "grad_norm": 35.69245904532589, "learning_rate": 2.416828555635368e-06, "loss": 0.3836, "step": 3400 }, { "epoch": 0.706578128242374, "grad_norm": 2.3981493771293385, "learning_rate": 2.4013402808074356e-06, "loss": 0.3648, "step": 3405 }, { "epoch": 0.7076156879020543, "grad_norm": 2.458681006711934, "learning_rate": 2.3858860946332148e-06, "loss": 0.3706, "step": 3410 }, { "epoch": 0.7086532475617348, "grad_norm": 2.5072358874605545, "learning_rate": 2.3704661998373652e-06, "loss": 0.3786, "step": 3415 }, { "epoch": 0.7096908072214152, "grad_norm": 2.4783372667192576, "learning_rate": 2.3550807986947133e-06, "loss": 0.3635, "step": 3420 }, { "epoch": 0.7107283668810956, "grad_norm": 2.583412985853997, "learning_rate": 2.3397300930276116e-06, "loss": 0.3574, "step": 3425 }, { "epoch": 0.7117659265407761, "grad_norm": 2.39052336222071, "learning_rate": 2.3244142842032823e-06, "loss": 0.3642, "step": 3430 }, { "epoch": 0.7128034862004565, "grad_norm": 2.2998480766526974, "learning_rate": 2.309133573131181e-06, "loss": 0.3659, "step": 3435 }, { "epoch": 0.713841045860137, "grad_norm": 2.452595068182353, "learning_rate": 2.2938881602603496e-06, "loss": 0.3767, "step": 3440 }, { "epoch": 0.7148786055198174, "grad_norm": 2.4427339177937597, "learning_rate": 2.2786782455768113e-06, "loss": 0.3701, "step": 3445 }, { "epoch": 0.7159161651794979, "grad_norm": 2.5344881209783647, "learning_rate": 2.2635040286009163e-06, "loss": 0.3636, "step": 3450 }, { "epoch": 0.7169537248391783, "grad_norm": 2.4517299777866914, "learning_rate": 2.2483657083847487e-06, "loss": 0.3638, "step": 3455 }, { "epoch": 0.7179912844988587, "grad_norm": 2.333164872282216, "learning_rate": 2.233263483509505e-06, "loss": 0.3652, "step": 3460 }, { "epoch": 0.7190288441585391, "grad_norm": 2.536346581261384, "learning_rate": 2.218197552082893e-06, "loss": 0.3843, "step": 3465 }, { "epoch": 0.7200664038182195, "grad_norm": 2.3170765412707444, "learning_rate": 2.203168111736524e-06, "loss": 0.3702, "step": 3470 }, { "epoch": 0.7211039634779, "grad_norm": 2.3720426965424926, "learning_rate": 2.1881753596233334e-06, "loss": 0.3661, "step": 3475 }, { "epoch": 0.7221415231375804, "grad_norm": 2.4784864232706427, "learning_rate": 2.173219492414988e-06, "loss": 0.3557, "step": 3480 }, { "epoch": 0.7231790827972608, "grad_norm": 2.2572170707401598, "learning_rate": 2.1583007062993037e-06, "loss": 0.3626, "step": 3485 }, { "epoch": 0.7242166424569413, "grad_norm": 2.3751613572262618, "learning_rate": 2.1434191969776787e-06, "loss": 0.372, "step": 3490 }, { "epoch": 0.7252542021166217, "grad_norm": 2.403916983756595, "learning_rate": 2.1285751596625153e-06, "loss": 0.3793, "step": 3495 }, { "epoch": 0.7262917617763022, "grad_norm": 2.422300937120251, "learning_rate": 2.1137687890746733e-06, "loss": 0.3757, "step": 3500 }, { "epoch": 0.7273293214359826, "grad_norm": 2.3278136492746904, "learning_rate": 2.099000279440905e-06, "loss": 0.3695, "step": 3505 }, { "epoch": 0.728366881095663, "grad_norm": 2.3521512237809774, "learning_rate": 2.0842698244913146e-06, "loss": 0.3577, "step": 3510 }, { "epoch": 0.7294044407553434, "grad_norm": 2.4420360123708837, "learning_rate": 2.0695776174568054e-06, "loss": 0.3617, "step": 3515 }, { "epoch": 0.7304420004150238, "grad_norm": 2.355326415228057, "learning_rate": 2.054923851066561e-06, "loss": 0.366, "step": 3520 }, { "epoch": 0.7314795600747043, "grad_norm": 2.408952585713157, "learning_rate": 2.0403087175455044e-06, "loss": 0.3592, "step": 3525 }, { "epoch": 0.7325171197343847, "grad_norm": 2.3814394489123556, "learning_rate": 2.025732408611786e-06, "loss": 0.3702, "step": 3530 }, { "epoch": 0.7335546793940652, "grad_norm": 2.505258380168417, "learning_rate": 2.0111951154742526e-06, "loss": 0.3541, "step": 3535 }, { "epoch": 0.7345922390537456, "grad_norm": 2.2378548590102962, "learning_rate": 1.9966970288299666e-06, "loss": 0.367, "step": 3540 }, { "epoch": 0.735629798713426, "grad_norm": 2.5282472044177657, "learning_rate": 1.982238338861673e-06, "loss": 0.372, "step": 3545 }, { "epoch": 0.7366673583731065, "grad_norm": 2.395346352470493, "learning_rate": 1.9678192352353282e-06, "loss": 0.3647, "step": 3550 }, { "epoch": 0.7377049180327869, "grad_norm": 2.377816912239941, "learning_rate": 1.9534399070976013e-06, "loss": 0.3694, "step": 3555 }, { "epoch": 0.7387424776924674, "grad_norm": 2.395069006251528, "learning_rate": 1.9391005430733973e-06, "loss": 0.3643, "step": 3560 }, { "epoch": 0.7397800373521477, "grad_norm": 2.360921498446891, "learning_rate": 1.924801331263375e-06, "loss": 0.3592, "step": 3565 }, { "epoch": 0.7408175970118281, "grad_norm": 2.4026862396255977, "learning_rate": 1.9105424592414905e-06, "loss": 0.3596, "step": 3570 }, { "epoch": 0.7418551566715086, "grad_norm": 2.3493974707324323, "learning_rate": 1.8963241140525302e-06, "loss": 0.352, "step": 3575 }, { "epoch": 0.742892716331189, "grad_norm": 2.255383769774033, "learning_rate": 1.8821464822096587e-06, "loss": 0.3545, "step": 3580 }, { "epoch": 0.7439302759908695, "grad_norm": 2.3439036527893804, "learning_rate": 1.8680097496919663e-06, "loss": 0.3614, "step": 3585 }, { "epoch": 0.7449678356505499, "grad_norm": 2.402995035755627, "learning_rate": 1.8539141019420459e-06, "loss": 0.3591, "step": 3590 }, { "epoch": 0.7460053953102304, "grad_norm": 2.3095366349656845, "learning_rate": 1.8398597238635375e-06, "loss": 0.3555, "step": 3595 }, { "epoch": 0.7470429549699108, "grad_norm": 2.454441768372595, "learning_rate": 1.825846799818722e-06, "loss": 0.3645, "step": 3600 }, { "epoch": 0.7480805146295912, "grad_norm": 2.3783282437356883, "learning_rate": 1.8118755136260941e-06, "loss": 0.364, "step": 3605 }, { "epoch": 0.7491180742892717, "grad_norm": 2.4804513717759975, "learning_rate": 1.7979460485579486e-06, "loss": 0.3577, "step": 3610 }, { "epoch": 0.750155633948952, "grad_norm": 2.399533389844388, "learning_rate": 1.784058587337984e-06, "loss": 0.3548, "step": 3615 }, { "epoch": 0.7511931936086325, "grad_norm": 2.5287966988574992, "learning_rate": 1.7702133121388999e-06, "loss": 0.3702, "step": 3620 }, { "epoch": 0.7522307532683129, "grad_norm": 2.5280174627826097, "learning_rate": 1.7564104045800101e-06, "loss": 0.3717, "step": 3625 }, { "epoch": 0.7532683129279933, "grad_norm": 2.454120509191578, "learning_rate": 1.7426500457248552e-06, "loss": 0.3508, "step": 3630 }, { "epoch": 0.7543058725876738, "grad_norm": 2.2917885535880176, "learning_rate": 1.7289324160788346e-06, "loss": 0.3509, "step": 3635 }, { "epoch": 0.7553434322473542, "grad_norm": 2.478759913910432, "learning_rate": 1.7152576955868338e-06, "loss": 0.3597, "step": 3640 }, { "epoch": 0.7563809919070347, "grad_norm": 2.3448679544687256, "learning_rate": 1.701626063630869e-06, "loss": 0.367, "step": 3645 }, { "epoch": 0.7574185515667151, "grad_norm": 2.3452502536112485, "learning_rate": 1.6880376990277202e-06, "loss": 0.3548, "step": 3650 }, { "epoch": 0.7584561112263956, "grad_norm": 2.3587859019469084, "learning_rate": 1.674492780026611e-06, "loss": 0.3635, "step": 3655 }, { "epoch": 0.759493670886076, "grad_norm": 2.4281427875718635, "learning_rate": 1.6609914843068403e-06, "loss": 0.3535, "step": 3660 }, { "epoch": 0.7605312305457563, "grad_norm": 2.644672013859547, "learning_rate": 1.6475339889754755e-06, "loss": 0.3546, "step": 3665 }, { "epoch": 0.7615687902054368, "grad_norm": 2.231863917742126, "learning_rate": 1.6341204705650155e-06, "loss": 0.3565, "step": 3670 }, { "epoch": 0.7626063498651172, "grad_norm": 2.3627064704364003, "learning_rate": 1.6207511050310842e-06, "loss": 0.3601, "step": 3675 }, { "epoch": 0.7636439095247977, "grad_norm": 2.5482612194562746, "learning_rate": 1.6074260677501102e-06, "loss": 0.3606, "step": 3680 }, { "epoch": 0.7646814691844781, "grad_norm": 2.4794463771443187, "learning_rate": 1.5941455335170408e-06, "loss": 0.3538, "step": 3685 }, { "epoch": 0.7657190288441585, "grad_norm": 2.6762628509028565, "learning_rate": 1.5809096765430387e-06, "loss": 0.3517, "step": 3690 }, { "epoch": 0.766756588503839, "grad_norm": 2.2429463729622565, "learning_rate": 1.5677186704532016e-06, "loss": 0.3457, "step": 3695 }, { "epoch": 0.7677941481635194, "grad_norm": 2.36484474879296, "learning_rate": 1.5545726882842782e-06, "loss": 0.3573, "step": 3700 }, { "epoch": 0.7688317078231999, "grad_norm": 2.294696024445774, "learning_rate": 1.5414719024824127e-06, "loss": 0.3509, "step": 3705 }, { "epoch": 0.7698692674828803, "grad_norm": 2.51233601030323, "learning_rate": 1.5284164849008648e-06, "loss": 0.3589, "step": 3710 }, { "epoch": 0.7709068271425606, "grad_norm": 2.473322215306534, "learning_rate": 1.515406606797763e-06, "loss": 0.3491, "step": 3715 }, { "epoch": 0.7719443868022411, "grad_norm": 2.2061318656296067, "learning_rate": 1.5024424388338682e-06, "loss": 0.3499, "step": 3720 }, { "epoch": 0.7729819464619215, "grad_norm": 2.435209027478781, "learning_rate": 1.4895241510703157e-06, "loss": 0.3643, "step": 3725 }, { "epoch": 0.774019506121602, "grad_norm": 2.369611296713981, "learning_rate": 1.4766519129663992e-06, "loss": 0.3483, "step": 3730 }, { "epoch": 0.7750570657812824, "grad_norm": 2.355834071175694, "learning_rate": 1.4638258933773425e-06, "loss": 0.3519, "step": 3735 }, { "epoch": 0.7760946254409629, "grad_norm": 2.469740057096309, "learning_rate": 1.451046260552086e-06, "loss": 0.3475, "step": 3740 }, { "epoch": 0.7771321851006433, "grad_norm": 2.4205796114015077, "learning_rate": 1.438313182131073e-06, "loss": 0.3523, "step": 3745 }, { "epoch": 0.7781697447603237, "grad_norm": 2.252004484682199, "learning_rate": 1.4256268251440631e-06, "loss": 0.3501, "step": 3750 }, { "epoch": 0.7792073044200042, "grad_norm": 2.3823736067008556, "learning_rate": 1.412987356007931e-06, "loss": 0.3439, "step": 3755 }, { "epoch": 0.7802448640796846, "grad_norm": 2.440498718677342, "learning_rate": 1.4003949405244888e-06, "loss": 0.3545, "step": 3760 }, { "epoch": 0.781282423739365, "grad_norm": 2.4557991100819905, "learning_rate": 1.3878497438783035e-06, "loss": 0.3476, "step": 3765 }, { "epoch": 0.7823199833990454, "grad_norm": 2.3768196794994956, "learning_rate": 1.3753519306345443e-06, "loss": 0.3566, "step": 3770 }, { "epoch": 0.7833575430587258, "grad_norm": 2.3763354198113427, "learning_rate": 1.3629016647368077e-06, "loss": 0.3508, "step": 3775 }, { "epoch": 0.7843951027184063, "grad_norm": 2.5905901892830725, "learning_rate": 1.3504991095049774e-06, "loss": 0.3499, "step": 3780 }, { "epoch": 0.7854326623780867, "grad_norm": 2.4256417735699407, "learning_rate": 1.338144427633079e-06, "loss": 0.3504, "step": 3785 }, { "epoch": 0.7864702220377672, "grad_norm": 2.4053687510971664, "learning_rate": 1.3258377811871481e-06, "loss": 0.3484, "step": 3790 }, { "epoch": 0.7875077816974476, "grad_norm": 2.4549151530337654, "learning_rate": 1.3135793316030958e-06, "loss": 0.3482, "step": 3795 }, { "epoch": 0.7885453413571281, "grad_norm": 2.4124091117521576, "learning_rate": 1.3013692396846028e-06, "loss": 0.3417, "step": 3800 }, { "epoch": 0.7895829010168085, "grad_norm": 2.492914244103795, "learning_rate": 1.2892076656010017e-06, "loss": 0.346, "step": 3805 }, { "epoch": 0.790620460676489, "grad_norm": 2.5012298456270807, "learning_rate": 1.277094768885182e-06, "loss": 0.3359, "step": 3810 }, { "epoch": 0.7916580203361693, "grad_norm": 2.3887971930046, "learning_rate": 1.2650307084314872e-06, "loss": 0.3434, "step": 3815 }, { "epoch": 0.7926955799958497, "grad_norm": 2.4930229591528605, "learning_rate": 1.2530156424936469e-06, "loss": 0.362, "step": 3820 }, { "epoch": 0.7937331396555302, "grad_norm": 2.386338375070925, "learning_rate": 1.241049728682684e-06, "loss": 0.3484, "step": 3825 }, { "epoch": 0.7947706993152106, "grad_norm": 2.303121828761511, "learning_rate": 1.229133123964853e-06, "loss": 0.355, "step": 3830 }, { "epoch": 0.795808258974891, "grad_norm": 2.418730350979906, "learning_rate": 1.2172659846595924e-06, "loss": 0.3534, "step": 3835 }, { "epoch": 0.7968458186345715, "grad_norm": 2.4002454150472143, "learning_rate": 1.2054484664374533e-06, "loss": 0.3484, "step": 3840 }, { "epoch": 0.7978833782942519, "grad_norm": 2.453421003455541, "learning_rate": 1.1936807243180743e-06, "loss": 0.3493, "step": 3845 }, { "epoch": 0.7989209379539324, "grad_norm": 2.4746199776766473, "learning_rate": 1.1819629126681398e-06, "loss": 0.3387, "step": 3850 }, { "epoch": 0.7999584976136128, "grad_norm": 2.38307853790052, "learning_rate": 1.1702951851993598e-06, "loss": 0.3483, "step": 3855 }, { "epoch": 0.8009960572732933, "grad_norm": 2.4103963437096114, "learning_rate": 1.1586776949664453e-06, "loss": 0.3423, "step": 3860 }, { "epoch": 0.8020336169329736, "grad_norm": 2.3551774825792924, "learning_rate": 1.1471105943651117e-06, "loss": 0.341, "step": 3865 }, { "epoch": 0.803071176592654, "grad_norm": 2.518707782804587, "learning_rate": 1.1355940351300715e-06, "loss": 0.349, "step": 3870 }, { "epoch": 0.8041087362523345, "grad_norm": 2.3344993783154964, "learning_rate": 1.1241281683330486e-06, "loss": 0.3519, "step": 3875 }, { "epoch": 0.8051462959120149, "grad_norm": 2.3344944133616665, "learning_rate": 1.1127131443807887e-06, "loss": 0.3412, "step": 3880 }, { "epoch": 0.8061838555716954, "grad_norm": 2.3019549791312754, "learning_rate": 1.1013491130131027e-06, "loss": 0.3346, "step": 3885 }, { "epoch": 0.8072214152313758, "grad_norm": 2.3790731919640415, "learning_rate": 1.0900362233008804e-06, "loss": 0.3414, "step": 3890 }, { "epoch": 0.8082589748910562, "grad_norm": 2.4317034804688453, "learning_rate": 1.0787746236441538e-06, "loss": 0.3521, "step": 3895 }, { "epoch": 0.8092965345507367, "grad_norm": 2.387356188471124, "learning_rate": 1.0675644617701402e-06, "loss": 0.3528, "step": 3900 }, { "epoch": 0.8103340942104171, "grad_norm": 2.53615626667828, "learning_rate": 1.0564058847313108e-06, "loss": 0.351, "step": 3905 }, { "epoch": 0.8113716538700976, "grad_norm": 2.4249799199654376, "learning_rate": 1.0452990389034507e-06, "loss": 0.3392, "step": 3910 }, { "epoch": 0.8124092135297779, "grad_norm": 2.3915606563087946, "learning_rate": 1.0342440699837537e-06, "loss": 0.3361, "step": 3915 }, { "epoch": 0.8134467731894583, "grad_norm": 2.5405240392917094, "learning_rate": 1.0232411229888994e-06, "loss": 0.3457, "step": 3920 }, { "epoch": 0.8144843328491388, "grad_norm": 2.424497940803967, "learning_rate": 1.0122903422531588e-06, "loss": 0.3424, "step": 3925 }, { "epoch": 0.8155218925088192, "grad_norm": 2.4383046197013187, "learning_rate": 1.001391871426492e-06, "loss": 0.3559, "step": 3930 }, { "epoch": 0.8165594521684997, "grad_norm": 2.3911832191085054, "learning_rate": 9.90545853472673e-07, "loss": 0.3429, "step": 3935 }, { "epoch": 0.8175970118281801, "grad_norm": 2.279983678354184, "learning_rate": 9.797524306674104e-07, "loss": 0.3437, "step": 3940 }, { "epoch": 0.8186345714878606, "grad_norm": 2.50140074824066, "learning_rate": 9.69011744596477e-07, "loss": 0.3402, "step": 3945 }, { "epoch": 0.819672131147541, "grad_norm": 2.4365828792120294, "learning_rate": 9.583239361538638e-07, "loss": 0.3417, "step": 3950 }, { "epoch": 0.8207096908072214, "grad_norm": 2.3579736935494955, "learning_rate": 9.476891455399168e-07, "loss": 0.3346, "step": 3955 }, { "epoch": 0.8217472504669019, "grad_norm": 2.5090769048416566, "learning_rate": 9.371075122595103e-07, "loss": 0.3231, "step": 3960 }, { "epoch": 0.8227848101265823, "grad_norm": 2.439152419655965, "learning_rate": 9.265791751202113e-07, "loss": 0.3437, "step": 3965 }, { "epoch": 0.8238223697862627, "grad_norm": 2.4534357310565214, "learning_rate": 9.161042722304609e-07, "loss": 0.3409, "step": 3970 }, { "epoch": 0.8248599294459431, "grad_norm": 2.2889519820080357, "learning_rate": 9.056829409977574e-07, "loss": 0.3423, "step": 3975 }, { "epoch": 0.8258974891056236, "grad_norm": 2.6079615773622544, "learning_rate": 8.953153181268609e-07, "loss": 0.3412, "step": 3980 }, { "epoch": 0.826935048765304, "grad_norm": 2.462659677967327, "learning_rate": 8.850015396179962e-07, "loss": 0.3399, "step": 3985 }, { "epoch": 0.8279726084249844, "grad_norm": 2.453876430295839, "learning_rate": 8.747417407650704e-07, "loss": 0.3368, "step": 3990 }, { "epoch": 0.8290101680846649, "grad_norm": 2.5260638258531722, "learning_rate": 8.645360561538935e-07, "loss": 0.3506, "step": 3995 }, { "epoch": 0.8300477277443453, "grad_norm": 2.392012536793574, "learning_rate": 8.543846196604239e-07, "loss": 0.3434, "step": 4000 }, { "epoch": 0.8310852874040258, "grad_norm": 2.4984903138745587, "learning_rate": 8.442875644489962e-07, "loss": 0.3351, "step": 4005 }, { "epoch": 0.8321228470637062, "grad_norm": 2.464203117108214, "learning_rate": 8.342450229705889e-07, "loss": 0.3455, "step": 4010 }, { "epoch": 0.8331604067233866, "grad_norm": 2.489210802914686, "learning_rate": 8.2425712696108e-07, "loss": 0.33, "step": 4015 }, { "epoch": 0.834197966383067, "grad_norm": 2.306914933986251, "learning_rate": 8.143240074395198e-07, "loss": 0.3418, "step": 4020 }, { "epoch": 0.8352355260427474, "grad_norm": 2.513591795617815, "learning_rate": 8.044457947064116e-07, "loss": 0.3418, "step": 4025 }, { "epoch": 0.8362730857024279, "grad_norm": 2.3537491795821524, "learning_rate": 7.946226183420047e-07, "loss": 0.3479, "step": 4030 }, { "epoch": 0.8373106453621083, "grad_norm": 2.422572559987635, "learning_rate": 7.848546072045932e-07, "loss": 0.3446, "step": 4035 }, { "epoch": 0.8383482050217888, "grad_norm": 2.3037996424039084, "learning_rate": 7.75141889428826e-07, "loss": 0.3257, "step": 4040 }, { "epoch": 0.8393857646814692, "grad_norm": 2.3570961713348186, "learning_rate": 7.654845924240228e-07, "loss": 0.3341, "step": 4045 }, { "epoch": 0.8404233243411496, "grad_norm": 2.3236115130562762, "learning_rate": 7.558828428725102e-07, "loss": 0.3328, "step": 4050 }, { "epoch": 0.8414608840008301, "grad_norm": 2.4888053313991603, "learning_rate": 7.463367667279515e-07, "loss": 0.3429, "step": 4055 }, { "epoch": 0.8424984436605105, "grad_norm": 2.4813791916370334, "learning_rate": 7.368464892137006e-07, "loss": 0.3412, "step": 4060 }, { "epoch": 0.843536003320191, "grad_norm": 2.3113527571110737, "learning_rate": 7.274121348211582e-07, "loss": 0.3475, "step": 4065 }, { "epoch": 0.8445735629798713, "grad_norm": 2.587556104836598, "learning_rate": 7.180338273081327e-07, "loss": 0.3354, "step": 4070 }, { "epoch": 0.8456111226395517, "grad_norm": 2.296420055640647, "learning_rate": 7.087116896972268e-07, "loss": 0.3357, "step": 4075 }, { "epoch": 0.8466486822992322, "grad_norm": 2.3266676608899663, "learning_rate": 6.994458442742163e-07, "loss": 0.3362, "step": 4080 }, { "epoch": 0.8476862419589126, "grad_norm": 2.6045589584755984, "learning_rate": 6.902364125864496e-07, "loss": 0.34, "step": 4085 }, { "epoch": 0.8487238016185931, "grad_norm": 2.352378817551258, "learning_rate": 6.810835154412487e-07, "loss": 0.339, "step": 4090 }, { "epoch": 0.8497613612782735, "grad_norm": 2.3547116125315446, "learning_rate": 6.719872729043331e-07, "loss": 0.3378, "step": 4095 }, { "epoch": 0.850798920937954, "grad_norm": 2.321427219175219, "learning_rate": 6.629478042982346e-07, "loss": 0.3229, "step": 4100 }, { "epoch": 0.8518364805976344, "grad_norm": 2.521475742171532, "learning_rate": 6.539652282007386e-07, "loss": 0.3376, "step": 4105 }, { "epoch": 0.8528740402573148, "grad_norm": 2.3926877720494573, "learning_rate": 6.450396624433286e-07, "loss": 0.3325, "step": 4110 }, { "epoch": 0.8539115999169953, "grad_norm": 2.425081699530235, "learning_rate": 6.361712241096374e-07, "loss": 0.3314, "step": 4115 }, { "epoch": 0.8549491595766756, "grad_norm": 2.487314370898736, "learning_rate": 6.273600295339111e-07, "loss": 0.3352, "step": 4120 }, { "epoch": 0.855986719236356, "grad_norm": 2.4556339075670723, "learning_rate": 6.186061942994864e-07, "loss": 0.3338, "step": 4125 }, { "epoch": 0.8570242788960365, "grad_norm": 2.417547682627604, "learning_rate": 6.099098332372733e-07, "loss": 0.3299, "step": 4130 }, { "epoch": 0.8580618385557169, "grad_norm": 2.611592074254707, "learning_rate": 6.012710604242478e-07, "loss": 0.3331, "step": 4135 }, { "epoch": 0.8590993982153974, "grad_norm": 2.448052874786836, "learning_rate": 5.926899891819521e-07, "loss": 0.3376, "step": 4140 }, { "epoch": 0.8601369578750778, "grad_norm": 2.475864292987021, "learning_rate": 5.841667320750188e-07, "loss": 0.3437, "step": 4145 }, { "epoch": 0.8611745175347583, "grad_norm": 2.373719046516931, "learning_rate": 5.757014009096801e-07, "loss": 0.3357, "step": 4150 }, { "epoch": 0.8622120771944387, "grad_norm": 2.3162935725557032, "learning_rate": 5.672941067323124e-07, "loss": 0.3305, "step": 4155 }, { "epoch": 0.8632496368541192, "grad_norm": 2.3285428336849736, "learning_rate": 5.589449598279762e-07, "loss": 0.3278, "step": 4160 }, { "epoch": 0.8642871965137996, "grad_norm": 2.6975663812633823, "learning_rate": 5.506540697189638e-07, "loss": 0.3235, "step": 4165 }, { "epoch": 0.8653247561734799, "grad_norm": 2.385391836897359, "learning_rate": 5.424215451633719e-07, "loss": 0.3366, "step": 4170 }, { "epoch": 0.8663623158331604, "grad_norm": 2.509848709680898, "learning_rate": 5.342474941536701e-07, "loss": 0.3349, "step": 4175 }, { "epoch": 0.8673998754928408, "grad_norm": 2.4992507740450063, "learning_rate": 5.261320239152851e-07, "loss": 0.3248, "step": 4180 }, { "epoch": 0.8684374351525213, "grad_norm": 2.4274228991197826, "learning_rate": 5.180752409051892e-07, "loss": 0.3276, "step": 4185 }, { "epoch": 0.8694749948122017, "grad_norm": 2.3127395460474975, "learning_rate": 5.100772508105139e-07, "loss": 0.318, "step": 4190 }, { "epoch": 0.8705125544718821, "grad_norm": 2.468734058758486, "learning_rate": 5.021381585471563e-07, "loss": 0.3332, "step": 4195 }, { "epoch": 0.8715501141315626, "grad_norm": 2.4531926615746564, "learning_rate": 4.942580682584041e-07, "loss": 0.3286, "step": 4200 }, { "epoch": 0.872587673791243, "grad_norm": 2.377223115961875, "learning_rate": 4.864370833135673e-07, "loss": 0.3342, "step": 4205 }, { "epoch": 0.8736252334509235, "grad_norm": 2.449018104688608, "learning_rate": 4.786753063066318e-07, "loss": 0.3355, "step": 4210 }, { "epoch": 0.8746627931106039, "grad_norm": 2.4173614341699685, "learning_rate": 4.7097283905489956e-07, "loss": 0.3288, "step": 4215 }, { "epoch": 0.8757003527702842, "grad_norm": 2.603118901855438, "learning_rate": 4.633297825976635e-07, "loss": 0.3371, "step": 4220 }, { "epoch": 0.8767379124299647, "grad_norm": 2.4773560425700727, "learning_rate": 4.5574623719487787e-07, "loss": 0.3256, "step": 4225 }, { "epoch": 0.8777754720896451, "grad_norm": 2.4564917228085053, "learning_rate": 4.482223023258453e-07, "loss": 0.337, "step": 4230 }, { "epoch": 0.8788130317493256, "grad_norm": 2.4727605694693553, "learning_rate": 4.407580766879066e-07, "loss": 0.3221, "step": 4235 }, { "epoch": 0.879850591409006, "grad_norm": 2.593564549202061, "learning_rate": 4.333536581951542e-07, "loss": 0.3364, "step": 4240 }, { "epoch": 0.8808881510686865, "grad_norm": 2.3065543830266204, "learning_rate": 4.2600914397714023e-07, "loss": 0.3266, "step": 4245 }, { "epoch": 0.8819257107283669, "grad_norm": 2.3918281813023783, "learning_rate": 4.1872463037760823e-07, "loss": 0.3311, "step": 4250 }, { "epoch": 0.8829632703880473, "grad_norm": 2.5113381009120417, "learning_rate": 4.1150021295322306e-07, "loss": 0.3373, "step": 4255 }, { "epoch": 0.8840008300477278, "grad_norm": 2.4230720157484935, "learning_rate": 4.043359864723262e-07, "loss": 0.3329, "step": 4260 }, { "epoch": 0.8850383897074082, "grad_norm": 2.616499204627849, "learning_rate": 3.972320449136829e-07, "loss": 0.3295, "step": 4265 }, { "epoch": 0.8860759493670886, "grad_norm": 2.5866660581654677, "learning_rate": 3.90188481465254e-07, "loss": 0.3276, "step": 4270 }, { "epoch": 0.887113509026769, "grad_norm": 2.4795883896583364, "learning_rate": 3.8320538852297694e-07, "loss": 0.3339, "step": 4275 }, { "epoch": 0.8881510686864494, "grad_norm": 2.3898148517858244, "learning_rate": 3.762828576895472e-07, "loss": 0.3373, "step": 4280 }, { "epoch": 0.8891886283461299, "grad_norm": 2.4124000504118044, "learning_rate": 3.694209797732201e-07, "loss": 0.332, "step": 4285 }, { "epoch": 0.8902261880058103, "grad_norm": 2.724223147572049, "learning_rate": 3.6261984478662025e-07, "loss": 0.3417, "step": 4290 }, { "epoch": 0.8912637476654908, "grad_norm": 2.4453049065554278, "learning_rate": 3.558795419455596e-07, "loss": 0.3198, "step": 4295 }, { "epoch": 0.8923013073251712, "grad_norm": 2.3658613859590165, "learning_rate": 3.492001596678651e-07, "loss": 0.3248, "step": 4300 }, { "epoch": 0.8933388669848517, "grad_norm": 2.415791137686327, "learning_rate": 3.4258178557222354e-07, "loss": 0.3238, "step": 4305 }, { "epoch": 0.8943764266445321, "grad_norm": 2.2801295759850566, "learning_rate": 3.3602450647702847e-07, "loss": 0.3317, "step": 4310 }, { "epoch": 0.8954139863042125, "grad_norm": 2.532814151310614, "learning_rate": 3.295284083992434e-07, "loss": 0.3217, "step": 4315 }, { "epoch": 0.8964515459638929, "grad_norm": 2.5019562185543993, "learning_rate": 3.2309357655326945e-07, "loss": 0.333, "step": 4320 }, { "epoch": 0.8974891056235733, "grad_norm": 2.2943698094322955, "learning_rate": 3.167200953498367e-07, "loss": 0.3181, "step": 4325 }, { "epoch": 0.8985266652832538, "grad_norm": 2.405841184133046, "learning_rate": 3.1040804839488406e-07, "loss": 0.3339, "step": 4330 }, { "epoch": 0.8995642249429342, "grad_norm": 2.386316529188201, "learning_rate": 3.041575184884732e-07, "loss": 0.3341, "step": 4335 }, { "epoch": 0.9006017846026146, "grad_norm": 2.517011507851569, "learning_rate": 2.979685876236982e-07, "loss": 0.3346, "step": 4340 }, { "epoch": 0.9016393442622951, "grad_norm": 2.5508437593952094, "learning_rate": 2.918413369856105e-07, "loss": 0.3258, "step": 4345 }, { "epoch": 0.9026769039219755, "grad_norm": 2.457005114059522, "learning_rate": 2.857758469501509e-07, "loss": 0.3197, "step": 4350 }, { "epoch": 0.903714463581656, "grad_norm": 2.3964711097696836, "learning_rate": 2.7977219708310134e-07, "loss": 0.3288, "step": 4355 }, { "epoch": 0.9047520232413364, "grad_norm": 2.4092985201196084, "learning_rate": 2.7383046613903676e-07, "loss": 0.325, "step": 4360 }, { "epoch": 0.9057895829010169, "grad_norm": 2.519420182065836, "learning_rate": 2.679507320602931e-07, "loss": 0.325, "step": 4365 }, { "epoch": 0.9068271425606972, "grad_norm": 2.4612555403219214, "learning_rate": 2.6213307197594353e-07, "loss": 0.3261, "step": 4370 }, { "epoch": 0.9078647022203776, "grad_norm": 2.568118436273184, "learning_rate": 2.5637756220079135e-07, "loss": 0.3246, "step": 4375 }, { "epoch": 0.9089022618800581, "grad_norm": 2.5397440534691373, "learning_rate": 2.506842782343627e-07, "loss": 0.3162, "step": 4380 }, { "epoch": 0.9099398215397385, "grad_norm": 2.4925975703816245, "learning_rate": 2.4505329475991823e-07, "loss": 0.331, "step": 4385 }, { "epoch": 0.910977381199419, "grad_norm": 3.3523863369544986, "learning_rate": 2.3948468564347904e-07, "loss": 0.3277, "step": 4390 }, { "epoch": 0.9120149408590994, "grad_norm": 2.575554252160631, "learning_rate": 2.3397852393284792e-07, "loss": 0.3404, "step": 4395 }, { "epoch": 0.9130525005187798, "grad_norm": 2.481671747693274, "learning_rate": 2.2853488185665796e-07, "loss": 0.3206, "step": 4400 }, { "epoch": 0.9140900601784603, "grad_norm": 2.4222594768146415, "learning_rate": 2.231538308234249e-07, "loss": 0.3216, "step": 4405 }, { "epoch": 0.9151276198381407, "grad_norm": 2.579114792911807, "learning_rate": 2.178354414206063e-07, "loss": 0.3267, "step": 4410 }, { "epoch": 0.9161651794978212, "grad_norm": 2.472569881694958, "learning_rate": 2.125797834136789e-07, "loss": 0.3212, "step": 4415 }, { "epoch": 0.9172027391575015, "grad_norm": 2.4091565145640605, "learning_rate": 2.0738692574522324e-07, "loss": 0.3246, "step": 4420 }, { "epoch": 0.9182402988171819, "grad_norm": 2.4603025192987644, "learning_rate": 2.0225693653401824e-07, "loss": 0.3221, "step": 4425 }, { "epoch": 0.9192778584768624, "grad_norm": 2.536120994058227, "learning_rate": 1.9718988307414866e-07, "loss": 0.3289, "step": 4430 }, { "epoch": 0.9203154181365428, "grad_norm": 2.423050703239737, "learning_rate": 1.921858318341191e-07, "loss": 0.3258, "step": 4435 }, { "epoch": 0.9213529777962233, "grad_norm": 2.429414838855423, "learning_rate": 1.8724484845598855e-07, "loss": 0.3196, "step": 4440 }, { "epoch": 0.9223905374559037, "grad_norm": 2.7662680215892004, "learning_rate": 1.8236699775450338e-07, "loss": 0.3258, "step": 4445 }, { "epoch": 0.9234280971155842, "grad_norm": 2.45993148237687, "learning_rate": 1.7755234371624908e-07, "loss": 0.318, "step": 4450 }, { "epoch": 0.9244656567752646, "grad_norm": 2.4765798027549644, "learning_rate": 1.7280094949881144e-07, "loss": 0.3287, "step": 4455 }, { "epoch": 0.925503216434945, "grad_norm": 2.5990110307214027, "learning_rate": 1.6811287742994897e-07, "loss": 0.3203, "step": 4460 }, { "epoch": 0.9265407760946255, "grad_norm": 2.7248202388018563, "learning_rate": 1.6348818900677077e-07, "loss": 0.3196, "step": 4465 }, { "epoch": 0.9275783357543059, "grad_norm": 2.360215127100717, "learning_rate": 1.5892694489493598e-07, "loss": 0.3266, "step": 4470 }, { "epoch": 0.9286158954139863, "grad_norm": 2.3701662902928415, "learning_rate": 1.5442920492785396e-07, "loss": 0.3217, "step": 4475 }, { "epoch": 0.9296534550736667, "grad_norm": 2.5408968036091593, "learning_rate": 1.4999502810590094e-07, "loss": 0.3147, "step": 4480 }, { "epoch": 0.9306910147333471, "grad_norm": 2.3872752954425276, "learning_rate": 1.456244725956446e-07, "loss": 0.3314, "step": 4485 }, { "epoch": 0.9317285743930276, "grad_norm": 2.510401039927896, "learning_rate": 1.4131759572908354e-07, "loss": 0.3233, "step": 4490 }, { "epoch": 0.932766134052708, "grad_norm": 2.452160220911644, "learning_rate": 1.370744540028929e-07, "loss": 0.3216, "step": 4495 }, { "epoch": 0.9338036937123885, "grad_norm": 2.5517139832044826, "learning_rate": 1.328951030776826e-07, "loss": 0.322, "step": 4500 }, { "epoch": 0.9348412533720689, "grad_norm": 2.600821629157991, "learning_rate": 1.2877959777727212e-07, "loss": 0.328, "step": 4505 }, { "epoch": 0.9358788130317494, "grad_norm": 2.4402893254633207, "learning_rate": 1.2472799208796517e-07, "loss": 0.3352, "step": 4510 }, { "epoch": 0.9369163726914298, "grad_norm": 2.4890223515899748, "learning_rate": 1.2074033915784543e-07, "loss": 0.3273, "step": 4515 }, { "epoch": 0.9379539323511102, "grad_norm": 2.541162954952577, "learning_rate": 1.168166912960772e-07, "loss": 0.3198, "step": 4520 }, { "epoch": 0.9389914920107906, "grad_norm": 2.510264694405438, "learning_rate": 1.1295709997222182e-07, "loss": 0.3278, "step": 4525 }, { "epoch": 0.940029051670471, "grad_norm": 2.4830766706425895, "learning_rate": 1.0916161581555895e-07, "loss": 0.3153, "step": 4530 }, { "epoch": 0.9410666113301515, "grad_norm": 2.3227486456210222, "learning_rate": 1.0543028861442539e-07, "loss": 0.3177, "step": 4535 }, { "epoch": 0.9421041709898319, "grad_norm": 2.5419821606370174, "learning_rate": 1.0176316731556112e-07, "loss": 0.3322, "step": 4540 }, { "epoch": 0.9431417306495123, "grad_norm": 2.6673497102822536, "learning_rate": 9.816030002346766e-08, "loss": 0.3319, "step": 4545 }, { "epoch": 0.9441792903091928, "grad_norm": 2.3937271331548904, "learning_rate": 9.462173399977348e-08, "loss": 0.3267, "step": 4550 }, { "epoch": 0.9452168499688732, "grad_norm": 2.3407051661833123, "learning_rate": 9.11475156626207e-08, "loss": 0.3103, "step": 4555 }, { "epoch": 0.9462544096285537, "grad_norm": 2.5377283482688755, "learning_rate": 8.773769058605053e-08, "loss": 0.3356, "step": 4560 }, { "epoch": 0.9472919692882341, "grad_norm": 2.3498925102259904, "learning_rate": 8.439230349940708e-08, "loss": 0.3254, "step": 4565 }, { "epoch": 0.9483295289479146, "grad_norm": 2.3416477533349145, "learning_rate": 8.111139828675175e-08, "loss": 0.3293, "step": 4570 }, { "epoch": 0.9493670886075949, "grad_norm": 2.559009235835814, "learning_rate": 7.78950179862864e-08, "loss": 0.3185, "step": 4575 }, { "epoch": 0.9504046482672753, "grad_norm": 2.586458213825969, "learning_rate": 7.474320478978946e-08, "loss": 0.3246, "step": 4580 }, { "epoch": 0.9514422079269558, "grad_norm": 2.3485714776360407, "learning_rate": 7.16560000420613e-08, "loss": 0.3238, "step": 4585 }, { "epoch": 0.9524797675866362, "grad_norm": 2.369694918601893, "learning_rate": 6.863344424038354e-08, "loss": 0.3103, "step": 4590 }, { "epoch": 0.9535173272463167, "grad_norm": 2.4252802076691538, "learning_rate": 6.567557703398675e-08, "loss": 0.3273, "step": 4595 }, { "epoch": 0.9545548869059971, "grad_norm": 2.552882664412225, "learning_rate": 6.278243722352973e-08, "loss": 0.3182, "step": 4600 }, { "epoch": 0.9555924465656775, "grad_norm": 2.5353013413397507, "learning_rate": 5.995406276059267e-08, "loss": 0.3335, "step": 4605 }, { "epoch": 0.956630006225358, "grad_norm": 2.4812443315722934, "learning_rate": 5.719049074717764e-08, "loss": 0.3191, "step": 4610 }, { "epoch": 0.9576675658850384, "grad_norm": 2.427622171598608, "learning_rate": 5.4491757435220505e-08, "loss": 0.3236, "step": 4615 }, { "epoch": 0.9587051255447189, "grad_norm": 2.601332407638946, "learning_rate": 5.185789822612086e-08, "loss": 0.3335, "step": 4620 }, { "epoch": 0.9597426852043992, "grad_norm": 2.4380419721098976, "learning_rate": 4.9288947670270146e-08, "loss": 0.3193, "step": 4625 }, { "epoch": 0.9607802448640796, "grad_norm": 2.36715515753985, "learning_rate": 4.678493946660423e-08, "loss": 0.3228, "step": 4630 }, { "epoch": 0.9618178045237601, "grad_norm": 2.4303066509314677, "learning_rate": 4.434590646215819e-08, "loss": 0.3266, "step": 4635 }, { "epoch": 0.9628553641834405, "grad_norm": 2.865610610806469, "learning_rate": 4.1971880651638376e-08, "loss": 0.3247, "step": 4640 }, { "epoch": 0.963892923843121, "grad_norm": 2.5407706997414268, "learning_rate": 3.966289317699878e-08, "loss": 0.3288, "step": 4645 }, { "epoch": 0.9649304835028014, "grad_norm": 2.4204826042021437, "learning_rate": 3.74189743270359e-08, "loss": 0.3145, "step": 4650 }, { "epoch": 0.9659680431624819, "grad_norm": 2.2543367363310716, "learning_rate": 3.5240153536988954e-08, "loss": 0.3307, "step": 4655 }, { "epoch": 0.9670056028221623, "grad_norm": 2.3642879401404677, "learning_rate": 3.312645938815695e-08, "loss": 0.3111, "step": 4660 }, { "epoch": 0.9680431624818427, "grad_norm": 2.551087158396502, "learning_rate": 3.107791960752005e-08, "loss": 0.3202, "step": 4665 }, { "epoch": 0.9690807221415232, "grad_norm": 2.431842119612971, "learning_rate": 2.909456106737818e-08, "loss": 0.3271, "step": 4670 }, { "epoch": 0.9701182818012035, "grad_norm": 2.469650411851549, "learning_rate": 2.7176409784998027e-08, "loss": 0.3225, "step": 4675 }, { "epoch": 0.971155841460884, "grad_norm": 2.6570472493964337, "learning_rate": 2.5323490922271044e-08, "loss": 0.3233, "step": 4680 }, { "epoch": 0.9721934011205644, "grad_norm": 2.792428588323876, "learning_rate": 2.3535828785384296e-08, "loss": 0.3263, "step": 4685 }, { "epoch": 0.9732309607802448, "grad_norm": 2.3558732345665456, "learning_rate": 2.1813446824502372e-08, "loss": 0.3136, "step": 4690 }, { "epoch": 0.9742685204399253, "grad_norm": 2.356877231683961, "learning_rate": 2.0156367633455965e-08, "loss": 0.3239, "step": 4695 }, { "epoch": 0.9753060800996057, "grad_norm": 2.4529847227301818, "learning_rate": 1.8564612949451555e-08, "loss": 0.3168, "step": 4700 }, { "epoch": 0.9763436397592862, "grad_norm": 2.382662217743029, "learning_rate": 1.7038203652781083e-08, "loss": 0.3234, "step": 4705 }, { "epoch": 0.9773811994189666, "grad_norm": 2.333476690943818, "learning_rate": 1.5577159766548832e-08, "loss": 0.3274, "step": 4710 }, { "epoch": 0.9784187590786471, "grad_norm": 2.402141955867801, "learning_rate": 1.4181500456412755e-08, "loss": 0.3215, "step": 4715 }, { "epoch": 0.9794563187383275, "grad_norm": 2.501308170912781, "learning_rate": 1.2851244030328004e-08, "loss": 0.3212, "step": 4720 }, { "epoch": 0.9804938783980078, "grad_norm": 2.6177200025512626, "learning_rate": 1.1586407938308785e-08, "loss": 0.3136, "step": 4725 }, { "epoch": 0.9815314380576883, "grad_norm": 2.548386853689889, "learning_rate": 1.0387008772200779e-08, "loss": 0.342, "step": 4730 }, { "epoch": 0.9825689977173687, "grad_norm": 2.552595538532663, "learning_rate": 9.253062265461855e-09, "loss": 0.3234, "step": 4735 }, { "epoch": 0.9836065573770492, "grad_norm": 2.489168583450402, "learning_rate": 8.184583292955572e-09, "loss": 0.318, "step": 4740 }, { "epoch": 0.9846441170367296, "grad_norm": 2.380606466941288, "learning_rate": 7.181585870757457e-09, "loss": 0.326, "step": 4745 }, { "epoch": 0.98568167669641, "grad_norm": 2.4173383787142297, "learning_rate": 6.2440831559690275e-09, "loss": 0.3185, "step": 4750 }, { "epoch": 0.9867192363560905, "grad_norm": 2.542878596032387, "learning_rate": 5.372087446547935e-09, "loss": 0.3252, "step": 4755 }, { "epoch": 0.9877567960157709, "grad_norm": 2.4745977048628918, "learning_rate": 4.565610181144209e-09, "loss": 0.3304, "step": 4760 }, { "epoch": 0.9887943556754514, "grad_norm": 2.4052051630616895, "learning_rate": 3.824661938951479e-09, "loss": 0.3243, "step": 4765 }, { "epoch": 0.9898319153351318, "grad_norm": 2.6094698060669264, "learning_rate": 3.1492524395682065e-09, "loss": 0.3189, "step": 4770 }, { "epoch": 0.9908694749948121, "grad_norm": 2.3476458011761134, "learning_rate": 2.5393905428688913e-09, "loss": 0.3231, "step": 4775 }, { "epoch": 0.9919070346544926, "grad_norm": 2.3445329353887456, "learning_rate": 1.9950842488891674e-09, "loss": 0.3355, "step": 4780 }, { "epoch": 0.992944594314173, "grad_norm": 2.4693805125337094, "learning_rate": 1.5163406977219963e-09, "loss": 0.3256, "step": 4785 }, { "epoch": 0.9939821539738535, "grad_norm": 2.563629879915604, "learning_rate": 1.103166169420522e-09, "loss": 0.3254, "step": 4790 }, { "epoch": 0.9950197136335339, "grad_norm": 2.494757369109791, "learning_rate": 7.555660839181356e-10, "loss": 0.3088, "step": 4795 }, { "epoch": 0.9960572732932144, "grad_norm": 2.373070173051403, "learning_rate": 4.735450009579756e-10, "loss": 0.323, "step": 4800 }, { "epoch": 0.9970948329528948, "grad_norm": 2.397230200383968, "learning_rate": 2.571066200307559e-10, "loss": 0.3303, "step": 4805 }, { "epoch": 0.9981323926125752, "grad_norm": 2.6318567902913985, "learning_rate": 1.0625378032813604e-10, "loss": 0.3156, "step": 4810 }, { "epoch": 0.9991699522722557, "grad_norm": 2.3100958526551696, "learning_rate": 2.0988460705528846e-11, "loss": 0.3192, "step": 4815 }, { "epoch": 1.0, "eval_loss": 0.2939795255661011, "eval_runtime": 0.9809, "eval_samples_per_second": 3.058, "eval_steps_per_second": 1.019, "step": 4819 }, { "epoch": 1.0, "step": 4819, "total_flos": 504500280360960.0, "train_loss": 0.5211530197307415, "train_runtime": 55738.8815, "train_samples_per_second": 1.383, "train_steps_per_second": 0.086 } ], "logging_steps": 5, "max_steps": 4819, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 504500280360960.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }