{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2413, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004144218814753419, "grad_norm": 24.514733364424554, "learning_rate": 4.132231404958678e-08, "loss": 1.4169, "step": 1 }, { "epoch": 0.0020721094073767096, "grad_norm": 23.537184095238235, "learning_rate": 2.066115702479339e-07, "loss": 1.3953, "step": 5 }, { "epoch": 0.004144218814753419, "grad_norm": 15.367263243187544, "learning_rate": 4.132231404958678e-07, "loss": 1.3778, "step": 10 }, { "epoch": 0.006216328222130129, "grad_norm": 8.898381617118186, "learning_rate": 6.198347107438018e-07, "loss": 1.2602, "step": 15 }, { "epoch": 0.008288437629506838, "grad_norm": 10.642672694622197, "learning_rate": 8.264462809917356e-07, "loss": 1.1548, "step": 20 }, { "epoch": 0.010360547036883548, "grad_norm": 4.617567060806481, "learning_rate": 1.0330578512396695e-06, "loss": 1.0324, "step": 25 }, { "epoch": 0.012432656444260257, "grad_norm": 3.417020439965166, "learning_rate": 1.2396694214876035e-06, "loss": 0.9692, "step": 30 }, { "epoch": 0.014504765851636967, "grad_norm": 3.395692919461883, "learning_rate": 1.4462809917355372e-06, "loss": 0.9523, "step": 35 }, { "epoch": 0.016576875259013676, "grad_norm": 3.151546235627503, "learning_rate": 1.6528925619834712e-06, "loss": 0.9274, "step": 40 }, { "epoch": 0.018648984666390384, "grad_norm": 2.9582253473643485, "learning_rate": 1.859504132231405e-06, "loss": 0.9323, "step": 45 }, { "epoch": 0.020721094073767096, "grad_norm": 3.0341795582664997, "learning_rate": 2.066115702479339e-06, "loss": 0.8943, "step": 50 }, { "epoch": 0.022793203481143803, "grad_norm": 3.0341130670598164, "learning_rate": 2.2727272727272728e-06, "loss": 0.8975, "step": 55 }, { "epoch": 0.024865312888520515, "grad_norm": 3.1718207048254627, "learning_rate": 2.479338842975207e-06, "loss": 0.8814, "step": 60 }, { "epoch": 0.026937422295897222, "grad_norm": 3.0690467751730495, "learning_rate": 2.6859504132231405e-06, "loss": 0.8886, "step": 65 }, { "epoch": 0.029009531703273934, "grad_norm": 3.1173422324045084, "learning_rate": 2.8925619834710743e-06, "loss": 0.8779, "step": 70 }, { "epoch": 0.03108164111065064, "grad_norm": 3.138742895479937, "learning_rate": 3.0991735537190086e-06, "loss": 0.8896, "step": 75 }, { "epoch": 0.03315375051802735, "grad_norm": 3.172114081672577, "learning_rate": 3.3057851239669424e-06, "loss": 0.852, "step": 80 }, { "epoch": 0.035225859925404064, "grad_norm": 3.0136111662154126, "learning_rate": 3.5123966942148763e-06, "loss": 0.8718, "step": 85 }, { "epoch": 0.03729796933278077, "grad_norm": 3.258761073776976, "learning_rate": 3.71900826446281e-06, "loss": 0.8515, "step": 90 }, { "epoch": 0.03937007874015748, "grad_norm": 3.1414312111507323, "learning_rate": 3.925619834710744e-06, "loss": 0.8653, "step": 95 }, { "epoch": 0.04144218814753419, "grad_norm": 3.207392945640886, "learning_rate": 4.132231404958678e-06, "loss": 0.8688, "step": 100 }, { "epoch": 0.0435142975549109, "grad_norm": 3.2052306999586695, "learning_rate": 4.338842975206612e-06, "loss": 0.8662, "step": 105 }, { "epoch": 0.04558640696228761, "grad_norm": 3.061390265998909, "learning_rate": 4.5454545454545455e-06, "loss": 0.8397, "step": 110 }, { "epoch": 0.04765851636966432, "grad_norm": 3.0154741328483023, "learning_rate": 4.75206611570248e-06, "loss": 0.8583, "step": 115 }, { "epoch": 0.04973062577704103, "grad_norm": 2.9687539148068796, "learning_rate": 4.958677685950414e-06, "loss": 0.8542, "step": 120 }, { "epoch": 0.05180273518441774, "grad_norm": 3.0956570078052925, "learning_rate": 5.165289256198347e-06, "loss": 0.8343, "step": 125 }, { "epoch": 0.053874844591794445, "grad_norm": 2.939810124211356, "learning_rate": 5.371900826446281e-06, "loss": 0.8457, "step": 130 }, { "epoch": 0.055946953999171156, "grad_norm": 3.2352916246928203, "learning_rate": 5.578512396694216e-06, "loss": 0.8169, "step": 135 }, { "epoch": 0.05801906340654787, "grad_norm": 3.1434079053605273, "learning_rate": 5.785123966942149e-06, "loss": 0.8257, "step": 140 }, { "epoch": 0.06009117281392457, "grad_norm": 3.1532690460920803, "learning_rate": 5.991735537190083e-06, "loss": 0.8392, "step": 145 }, { "epoch": 0.06216328222130128, "grad_norm": 3.0343466546862037, "learning_rate": 6.198347107438017e-06, "loss": 0.8442, "step": 150 }, { "epoch": 0.06423539162867799, "grad_norm": 2.9508007571282318, "learning_rate": 6.404958677685951e-06, "loss": 0.8218, "step": 155 }, { "epoch": 0.0663075010360547, "grad_norm": 3.0527743250572033, "learning_rate": 6.611570247933885e-06, "loss": 0.8294, "step": 160 }, { "epoch": 0.06837961044343141, "grad_norm": 3.075132609753167, "learning_rate": 6.818181818181818e-06, "loss": 0.8266, "step": 165 }, { "epoch": 0.07045171985080813, "grad_norm": 2.972687092057336, "learning_rate": 7.0247933884297525e-06, "loss": 0.8139, "step": 170 }, { "epoch": 0.07252382925818483, "grad_norm": 3.068027468513739, "learning_rate": 7.231404958677687e-06, "loss": 0.7992, "step": 175 }, { "epoch": 0.07459593866556154, "grad_norm": 3.158357724987422, "learning_rate": 7.43801652892562e-06, "loss": 0.8134, "step": 180 }, { "epoch": 0.07666804807293826, "grad_norm": 3.051134008290911, "learning_rate": 7.644628099173555e-06, "loss": 0.8344, "step": 185 }, { "epoch": 0.07874015748031496, "grad_norm": 3.3235580452981175, "learning_rate": 7.851239669421489e-06, "loss": 0.8041, "step": 190 }, { "epoch": 0.08081226688769166, "grad_norm": 3.0678026193628805, "learning_rate": 8.057851239669421e-06, "loss": 0.8045, "step": 195 }, { "epoch": 0.08288437629506838, "grad_norm": 3.415940865122294, "learning_rate": 8.264462809917356e-06, "loss": 0.7953, "step": 200 }, { "epoch": 0.08495648570244509, "grad_norm": 3.815817395153579, "learning_rate": 8.47107438016529e-06, "loss": 0.8047, "step": 205 }, { "epoch": 0.0870285951098218, "grad_norm": 3.2984513943211153, "learning_rate": 8.677685950413224e-06, "loss": 0.8056, "step": 210 }, { "epoch": 0.08910070451719851, "grad_norm": 2.8880894973116793, "learning_rate": 8.884297520661158e-06, "loss": 0.795, "step": 215 }, { "epoch": 0.09117281392457521, "grad_norm": 2.974477767781155, "learning_rate": 9.090909090909091e-06, "loss": 0.8014, "step": 220 }, { "epoch": 0.09324492333195193, "grad_norm": 3.0809082608070444, "learning_rate": 9.297520661157025e-06, "loss": 0.812, "step": 225 }, { "epoch": 0.09531703273932864, "grad_norm": 3.11471679186006, "learning_rate": 9.50413223140496e-06, "loss": 0.815, "step": 230 }, { "epoch": 0.09738914214670534, "grad_norm": 3.3405073257172333, "learning_rate": 9.710743801652894e-06, "loss": 0.7991, "step": 235 }, { "epoch": 0.09946125155408206, "grad_norm": 3.1375906477529485, "learning_rate": 9.917355371900828e-06, "loss": 0.7902, "step": 240 }, { "epoch": 0.10153336096145876, "grad_norm": 2.8343248219683823, "learning_rate": 9.999952884702848e-06, "loss": 0.8215, "step": 245 }, { "epoch": 0.10360547036883548, "grad_norm": 2.9925055543397563, "learning_rate": 9.999664961102495e-06, "loss": 0.8084, "step": 250 }, { "epoch": 0.10567757977621219, "grad_norm": 3.0441664134730275, "learning_rate": 9.999115304121459e-06, "loss": 0.8085, "step": 255 }, { "epoch": 0.10774968918358889, "grad_norm": 3.039997210231939, "learning_rate": 9.998303942534383e-06, "loss": 0.7938, "step": 260 }, { "epoch": 0.10982179859096561, "grad_norm": 3.0431241796795696, "learning_rate": 9.997230918816193e-06, "loss": 0.7985, "step": 265 }, { "epoch": 0.11189390799834231, "grad_norm": 2.85027037020918, "learning_rate": 9.99589628913988e-06, "loss": 0.7861, "step": 270 }, { "epoch": 0.11396601740571902, "grad_norm": 3.051497861941328, "learning_rate": 9.994300123373554e-06, "loss": 0.7716, "step": 275 }, { "epoch": 0.11603812681309573, "grad_norm": 2.895125032094719, "learning_rate": 9.992442505076788e-06, "loss": 0.7834, "step": 280 }, { "epoch": 0.11811023622047244, "grad_norm": 3.0061812704069886, "learning_rate": 9.990323531496235e-06, "loss": 0.7756, "step": 285 }, { "epoch": 0.12018234562784914, "grad_norm": 3.0347710812056525, "learning_rate": 9.98794331356056e-06, "loss": 0.7846, "step": 290 }, { "epoch": 0.12225445503522586, "grad_norm": 2.805184541271578, "learning_rate": 9.985301975874604e-06, "loss": 0.7731, "step": 295 }, { "epoch": 0.12432656444260257, "grad_norm": 3.0842132440120147, "learning_rate": 9.982399656712884e-06, "loss": 0.8042, "step": 300 }, { "epoch": 0.12639867384997927, "grad_norm": 2.9008174183880233, "learning_rate": 9.979236508012341e-06, "loss": 0.7681, "step": 305 }, { "epoch": 0.12847078325735597, "grad_norm": 2.8236286325458244, "learning_rate": 9.975812695364391e-06, "loss": 0.7891, "step": 310 }, { "epoch": 0.1305428926647327, "grad_norm": 2.8468044382747384, "learning_rate": 9.97212839800626e-06, "loss": 0.7681, "step": 315 }, { "epoch": 0.1326150020721094, "grad_norm": 2.9534507280362003, "learning_rate": 9.968183808811586e-06, "loss": 0.7564, "step": 320 }, { "epoch": 0.13468711147948612, "grad_norm": 2.965082743686242, "learning_rate": 9.963979134280344e-06, "loss": 0.7529, "step": 325 }, { "epoch": 0.13675922088686282, "grad_norm": 2.847254353400155, "learning_rate": 9.959514594528018e-06, "loss": 0.7438, "step": 330 }, { "epoch": 0.13883133029423952, "grad_norm": 2.7966052001739206, "learning_rate": 9.954790423274086e-06, "loss": 0.7591, "step": 335 }, { "epoch": 0.14090343970161626, "grad_norm": 2.7319025763391576, "learning_rate": 9.94980686782978e-06, "loss": 0.7406, "step": 340 }, { "epoch": 0.14297554910899296, "grad_norm": 2.870893839424329, "learning_rate": 9.94456418908515e-06, "loss": 0.7541, "step": 345 }, { "epoch": 0.14504765851636967, "grad_norm": 3.009522094390881, "learning_rate": 9.939062661495387e-06, "loss": 0.7511, "step": 350 }, { "epoch": 0.14711976792374637, "grad_norm": 2.7182810127315573, "learning_rate": 9.933302573066477e-06, "loss": 0.7688, "step": 355 }, { "epoch": 0.14919187733112307, "grad_norm": 3.012302144551778, "learning_rate": 9.927284225340105e-06, "loss": 0.7341, "step": 360 }, { "epoch": 0.1512639867384998, "grad_norm": 3.337745192726695, "learning_rate": 9.921007933377886e-06, "loss": 0.7539, "step": 365 }, { "epoch": 0.1533360961458765, "grad_norm": 3.17882772460756, "learning_rate": 9.914474025744855e-06, "loss": 0.7506, "step": 370 }, { "epoch": 0.15540820555325321, "grad_norm": 3.2860227088727383, "learning_rate": 9.907682844492283e-06, "loss": 0.7514, "step": 375 }, { "epoch": 0.15748031496062992, "grad_norm": 3.1971805376225517, "learning_rate": 9.900634745139759e-06, "loss": 0.7475, "step": 380 }, { "epoch": 0.15955242436800662, "grad_norm": 2.8028862036100843, "learning_rate": 9.893330096656576e-06, "loss": 0.7285, "step": 385 }, { "epoch": 0.16162453377538333, "grad_norm": 2.708278578400365, "learning_rate": 9.885769281442426e-06, "loss": 0.7224, "step": 390 }, { "epoch": 0.16369664318276006, "grad_norm": 2.771966186486157, "learning_rate": 9.877952695307382e-06, "loss": 0.7287, "step": 395 }, { "epoch": 0.16576875259013676, "grad_norm": 2.645488728201908, "learning_rate": 9.869880747451164e-06, "loss": 0.7389, "step": 400 }, { "epoch": 0.16784086199751347, "grad_norm": 2.9190913357286306, "learning_rate": 9.861553860441726e-06, "loss": 0.7414, "step": 405 }, { "epoch": 0.16991297140489017, "grad_norm": 2.6920496051405998, "learning_rate": 9.852972470193136e-06, "loss": 0.7259, "step": 410 }, { "epoch": 0.17198508081226688, "grad_norm": 2.9415197302471663, "learning_rate": 9.844137025942755e-06, "loss": 0.7266, "step": 415 }, { "epoch": 0.1740571902196436, "grad_norm": 2.7808373109476294, "learning_rate": 9.835047990227713e-06, "loss": 0.7119, "step": 420 }, { "epoch": 0.1761292996270203, "grad_norm": 2.810707277824952, "learning_rate": 9.825705838860699e-06, "loss": 0.7361, "step": 425 }, { "epoch": 0.17820140903439702, "grad_norm": 3.0525714644872117, "learning_rate": 9.816111060905063e-06, "loss": 0.7146, "step": 430 }, { "epoch": 0.18027351844177372, "grad_norm": 2.769816103147695, "learning_rate": 9.806264158649193e-06, "loss": 0.7104, "step": 435 }, { "epoch": 0.18234562784915043, "grad_norm": 2.8698277588389516, "learning_rate": 9.796165647580233e-06, "loss": 0.7015, "step": 440 }, { "epoch": 0.18441773725652713, "grad_norm": 2.632541484736724, "learning_rate": 9.785816056357096e-06, "loss": 0.7148, "step": 445 }, { "epoch": 0.18648984666390386, "grad_norm": 2.6937205983501276, "learning_rate": 9.775215926782788e-06, "loss": 0.7203, "step": 450 }, { "epoch": 0.18856195607128057, "grad_norm": 2.7810565830560354, "learning_rate": 9.764365813776042e-06, "loss": 0.7068, "step": 455 }, { "epoch": 0.19063406547865727, "grad_norm": 2.8012616966262134, "learning_rate": 9.753266285342271e-06, "loss": 0.7104, "step": 460 }, { "epoch": 0.19270617488603398, "grad_norm": 2.970139751785694, "learning_rate": 9.741917922543831e-06, "loss": 0.6881, "step": 465 }, { "epoch": 0.19477828429341068, "grad_norm": 2.6941473339327824, "learning_rate": 9.7303213194696e-06, "loss": 0.6996, "step": 470 }, { "epoch": 0.1968503937007874, "grad_norm": 3.0409394398260554, "learning_rate": 9.718477083203888e-06, "loss": 0.6933, "step": 475 }, { "epoch": 0.19892250310816412, "grad_norm": 2.8048791766929146, "learning_rate": 9.706385833794639e-06, "loss": 0.6748, "step": 480 }, { "epoch": 0.20099461251554082, "grad_norm": 2.7366882872916887, "learning_rate": 9.694048204220986e-06, "loss": 0.7044, "step": 485 }, { "epoch": 0.20306672192291753, "grad_norm": 2.7267358802595374, "learning_rate": 9.681464840360105e-06, "loss": 0.6912, "step": 490 }, { "epoch": 0.20513883133029423, "grad_norm": 2.754099178327694, "learning_rate": 9.668636400953411e-06, "loss": 0.6731, "step": 495 }, { "epoch": 0.20721094073767096, "grad_norm": 3.11521121490046, "learning_rate": 9.655563557572068e-06, "loss": 0.7018, "step": 500 }, { "epoch": 0.20928305014504767, "grad_norm": 2.729648091432038, "learning_rate": 9.642246994581833e-06, "loss": 0.6919, "step": 505 }, { "epoch": 0.21135515955242437, "grad_norm": 2.788565650588865, "learning_rate": 9.62868740910723e-06, "loss": 0.6718, "step": 510 }, { "epoch": 0.21342726895980108, "grad_norm": 2.6821583529570527, "learning_rate": 9.614885510995047e-06, "loss": 0.6696, "step": 515 }, { "epoch": 0.21549937836717778, "grad_norm": 2.669430236726845, "learning_rate": 9.600842022777198e-06, "loss": 0.686, "step": 520 }, { "epoch": 0.21757148777455448, "grad_norm": 2.6329122210908613, "learning_rate": 9.58655767963287e-06, "loss": 0.6649, "step": 525 }, { "epoch": 0.21964359718193122, "grad_norm": 2.712511744397499, "learning_rate": 9.57203322935006e-06, "loss": 0.6691, "step": 530 }, { "epoch": 0.22171570658930792, "grad_norm": 2.971121715072785, "learning_rate": 9.557269432286406e-06, "loss": 0.6568, "step": 535 }, { "epoch": 0.22378781599668462, "grad_norm": 3.5850165463286587, "learning_rate": 9.542267061329407e-06, "loss": 0.6535, "step": 540 }, { "epoch": 0.22585992540406133, "grad_norm": 2.6756133538389593, "learning_rate": 9.52702690185594e-06, "loss": 0.6578, "step": 545 }, { "epoch": 0.22793203481143803, "grad_norm": 2.7169962232785947, "learning_rate": 9.511549751691159e-06, "loss": 0.6696, "step": 550 }, { "epoch": 0.23000414421881477, "grad_norm": 2.66297533869507, "learning_rate": 9.495836421066722e-06, "loss": 0.6594, "step": 555 }, { "epoch": 0.23207625362619147, "grad_norm": 2.974286886191662, "learning_rate": 9.47988773257838e-06, "loss": 0.6784, "step": 560 }, { "epoch": 0.23414836303356817, "grad_norm": 2.7185575923361243, "learning_rate": 9.46370452114291e-06, "loss": 0.658, "step": 565 }, { "epoch": 0.23622047244094488, "grad_norm": 2.829971742294898, "learning_rate": 9.447287633954406e-06, "loss": 0.6593, "step": 570 }, { "epoch": 0.23829258184832158, "grad_norm": 2.620399538248119, "learning_rate": 9.430637930439933e-06, "loss": 0.6641, "step": 575 }, { "epoch": 0.2403646912556983, "grad_norm": 2.8089738469657286, "learning_rate": 9.413756282214538e-06, "loss": 0.6443, "step": 580 }, { "epoch": 0.24243680066307502, "grad_norm": 2.6610020148736657, "learning_rate": 9.396643573035609e-06, "loss": 0.6619, "step": 585 }, { "epoch": 0.24450891007045172, "grad_norm": 2.693009848715325, "learning_rate": 9.37930069875662e-06, "loss": 0.6469, "step": 590 }, { "epoch": 0.24658101947782843, "grad_norm": 2.8338530690883657, "learning_rate": 9.36172856728023e-06, "loss": 0.6571, "step": 595 }, { "epoch": 0.24865312888520513, "grad_norm": 2.676634556349702, "learning_rate": 9.343928098510759e-06, "loss": 0.6358, "step": 600 }, { "epoch": 0.25072523829258186, "grad_norm": 2.6815297814713674, "learning_rate": 9.325900224306019e-06, "loss": 0.6366, "step": 605 }, { "epoch": 0.25279734769995854, "grad_norm": 2.693794960810057, "learning_rate": 9.307645888428542e-06, "loss": 0.6441, "step": 610 }, { "epoch": 0.2548694571073353, "grad_norm": 2.6733329911215757, "learning_rate": 9.289166046496172e-06, "loss": 0.6284, "step": 615 }, { "epoch": 0.25694156651471195, "grad_norm": 2.735971515551987, "learning_rate": 9.270461665932035e-06, "loss": 0.6394, "step": 620 }, { "epoch": 0.2590136759220887, "grad_norm": 2.7764753108215623, "learning_rate": 9.251533725913893e-06, "loss": 0.6308, "step": 625 }, { "epoch": 0.2610857853294654, "grad_norm": 2.9846661778059596, "learning_rate": 9.23238321732289e-06, "loss": 0.6381, "step": 630 }, { "epoch": 0.2631578947368421, "grad_norm": 2.639475488829137, "learning_rate": 9.213011142691672e-06, "loss": 0.6298, "step": 635 }, { "epoch": 0.2652300041442188, "grad_norm": 2.8031268295287037, "learning_rate": 9.193418516151913e-06, "loss": 0.6314, "step": 640 }, { "epoch": 0.2673021135515955, "grad_norm": 2.751071861402199, "learning_rate": 9.173606363381218e-06, "loss": 0.6243, "step": 645 }, { "epoch": 0.26937422295897223, "grad_norm": 2.8637192589002507, "learning_rate": 9.15357572154943e-06, "loss": 0.6226, "step": 650 }, { "epoch": 0.27144633236634896, "grad_norm": 2.757948609951417, "learning_rate": 9.133327639264334e-06, "loss": 0.6195, "step": 655 }, { "epoch": 0.27351844177372564, "grad_norm": 2.7078393676429724, "learning_rate": 9.112863176516761e-06, "loss": 0.6063, "step": 660 }, { "epoch": 0.2755905511811024, "grad_norm": 2.7174885956763517, "learning_rate": 9.092183404625107e-06, "loss": 0.6201, "step": 665 }, { "epoch": 0.27766266058847905, "grad_norm": 2.692439395697707, "learning_rate": 9.071289406179233e-06, "loss": 0.6186, "step": 670 }, { "epoch": 0.2797347699958558, "grad_norm": 2.705577759760543, "learning_rate": 9.0501822749838e-06, "loss": 0.6208, "step": 675 }, { "epoch": 0.2818068794032325, "grad_norm": 2.6961720306129497, "learning_rate": 9.028863116001013e-06, "loss": 0.6217, "step": 680 }, { "epoch": 0.2838789888106092, "grad_norm": 2.823153530451939, "learning_rate": 9.007333045292764e-06, "loss": 0.6095, "step": 685 }, { "epoch": 0.2859510982179859, "grad_norm": 2.7424489434092885, "learning_rate": 8.98559318996222e-06, "loss": 0.6071, "step": 690 }, { "epoch": 0.2880232076253626, "grad_norm": 2.8108371792200724, "learning_rate": 8.963644688094806e-06, "loss": 0.6123, "step": 695 }, { "epoch": 0.29009531703273933, "grad_norm": 2.7296570733868046, "learning_rate": 8.941488688698635e-06, "loss": 0.6038, "step": 700 }, { "epoch": 0.29216742644011606, "grad_norm": 2.5282731359900983, "learning_rate": 8.919126351644351e-06, "loss": 0.6051, "step": 705 }, { "epoch": 0.29423953584749274, "grad_norm": 2.733801097585659, "learning_rate": 8.896558847604414e-06, "loss": 0.6169, "step": 710 }, { "epoch": 0.29631164525486947, "grad_norm": 2.5922131116520433, "learning_rate": 8.873787357991811e-06, "loss": 0.6062, "step": 715 }, { "epoch": 0.29838375466224615, "grad_norm": 2.6565071641175844, "learning_rate": 8.850813074898218e-06, "loss": 0.6069, "step": 720 }, { "epoch": 0.3004558640696229, "grad_norm": 3.0927970126548154, "learning_rate": 8.827637201031579e-06, "loss": 0.5879, "step": 725 }, { "epoch": 0.3025279734769996, "grad_norm": 2.8105831813533033, "learning_rate": 8.804260949653154e-06, "loss": 0.6124, "step": 730 }, { "epoch": 0.3046000828843763, "grad_norm": 2.584377915155431, "learning_rate": 8.780685544514006e-06, "loss": 0.6073, "step": 735 }, { "epoch": 0.306672192291753, "grad_norm": 2.773926231343915, "learning_rate": 8.756912219790933e-06, "loss": 0.5999, "step": 740 }, { "epoch": 0.3087443016991297, "grad_norm": 2.732022843788419, "learning_rate": 8.732942220021859e-06, "loss": 0.5762, "step": 745 }, { "epoch": 0.31081641110650643, "grad_norm": 2.595571184744404, "learning_rate": 8.708776800040679e-06, "loss": 0.5846, "step": 750 }, { "epoch": 0.3128885205138831, "grad_norm": 2.6942143484830985, "learning_rate": 8.684417224911579e-06, "loss": 0.6003, "step": 755 }, { "epoch": 0.31496062992125984, "grad_norm": 2.6835198662003155, "learning_rate": 8.659864769862797e-06, "loss": 0.5838, "step": 760 }, { "epoch": 0.31703273932863657, "grad_norm": 2.7778129772748383, "learning_rate": 8.635120720219877e-06, "loss": 0.5794, "step": 765 }, { "epoch": 0.31910484873601325, "grad_norm": 2.6022076775767493, "learning_rate": 8.610186371338364e-06, "loss": 0.586, "step": 770 }, { "epoch": 0.32117695814339, "grad_norm": 2.7328885442816557, "learning_rate": 8.585063028536015e-06, "loss": 0.5987, "step": 775 }, { "epoch": 0.32324906755076666, "grad_norm": 2.635761747619665, "learning_rate": 8.559752007024449e-06, "loss": 0.5859, "step": 780 }, { "epoch": 0.3253211769581434, "grad_norm": 2.79153427275607, "learning_rate": 8.534254631840297e-06, "loss": 0.5976, "step": 785 }, { "epoch": 0.3273932863655201, "grad_norm": 2.656496652025873, "learning_rate": 8.50857223777584e-06, "loss": 0.578, "step": 790 }, { "epoch": 0.3294653957728968, "grad_norm": 2.7338530782471655, "learning_rate": 8.482706169309139e-06, "loss": 0.5648, "step": 795 }, { "epoch": 0.33153750518027353, "grad_norm": 2.667291889393192, "learning_rate": 8.456657780533633e-06, "loss": 0.5641, "step": 800 }, { "epoch": 0.3336096145876502, "grad_norm": 2.9479902385492562, "learning_rate": 8.430428435087267e-06, "loss": 0.5665, "step": 805 }, { "epoch": 0.33568172399502694, "grad_norm": 2.9408488784356748, "learning_rate": 8.404019506081103e-06, "loss": 0.5834, "step": 810 }, { "epoch": 0.33775383340240367, "grad_norm": 2.707200722617604, "learning_rate": 8.377432376027437e-06, "loss": 0.5756, "step": 815 }, { "epoch": 0.33982594280978035, "grad_norm": 2.70828380166259, "learning_rate": 8.350668436767413e-06, "loss": 0.5686, "step": 820 }, { "epoch": 0.3418980522171571, "grad_norm": 2.7745931767789886, "learning_rate": 8.323729089398182e-06, "loss": 0.5521, "step": 825 }, { "epoch": 0.34397016162453375, "grad_norm": 2.685590822904809, "learning_rate": 8.296615744199533e-06, "loss": 0.5707, "step": 830 }, { "epoch": 0.3460422710319105, "grad_norm": 2.6439480580475023, "learning_rate": 8.269329820560074e-06, "loss": 0.549, "step": 835 }, { "epoch": 0.3481143804392872, "grad_norm": 2.6684012534208845, "learning_rate": 8.241872746902934e-06, "loss": 0.5614, "step": 840 }, { "epoch": 0.3501864898466639, "grad_norm": 2.6586973407105847, "learning_rate": 8.214245960610966e-06, "loss": 0.5596, "step": 845 }, { "epoch": 0.3522585992540406, "grad_norm": 2.630842021442866, "learning_rate": 8.18645090795152e-06, "loss": 0.5435, "step": 850 }, { "epoch": 0.3543307086614173, "grad_norm": 2.783114608569901, "learning_rate": 8.158489044000712e-06, "loss": 0.554, "step": 855 }, { "epoch": 0.35640281806879404, "grad_norm": 2.6861386548441266, "learning_rate": 8.13036183256727e-06, "loss": 0.5503, "step": 860 }, { "epoch": 0.35847492747617077, "grad_norm": 2.760451069908695, "learning_rate": 8.102070746115888e-06, "loss": 0.5504, "step": 865 }, { "epoch": 0.36054703688354744, "grad_norm": 3.1912807546036155, "learning_rate": 8.073617265690144e-06, "loss": 0.5585, "step": 870 }, { "epoch": 0.3626191462909242, "grad_norm": 2.7551688368780365, "learning_rate": 8.045002880834975e-06, "loss": 0.5499, "step": 875 }, { "epoch": 0.36469125569830085, "grad_norm": 2.656529615625788, "learning_rate": 8.016229089518695e-06, "loss": 0.5472, "step": 880 }, { "epoch": 0.3667633651056776, "grad_norm": 2.456139438562433, "learning_rate": 7.987297398054572e-06, "loss": 0.5444, "step": 885 }, { "epoch": 0.36883547451305426, "grad_norm": 2.7021257640939678, "learning_rate": 7.95820932102198e-06, "loss": 0.5467, "step": 890 }, { "epoch": 0.370907583920431, "grad_norm": 2.576896456231456, "learning_rate": 7.9289663811871e-06, "loss": 0.5453, "step": 895 }, { "epoch": 0.3729796933278077, "grad_norm": 2.509602711427261, "learning_rate": 7.899570109423219e-06, "loss": 0.5315, "step": 900 }, { "epoch": 0.3750518027351844, "grad_norm": 2.7824790919014486, "learning_rate": 7.870022044630569e-06, "loss": 0.5367, "step": 905 }, { "epoch": 0.37712391214256114, "grad_norm": 2.635566041127458, "learning_rate": 7.84032373365578e-06, "loss": 0.5458, "step": 910 }, { "epoch": 0.3791960215499378, "grad_norm": 2.5686297030493277, "learning_rate": 7.810476731210897e-06, "loss": 0.5538, "step": 915 }, { "epoch": 0.38126813095731454, "grad_norm": 2.4885943009976366, "learning_rate": 7.780482599791987e-06, "loss": 0.5501, "step": 920 }, { "epoch": 0.3833402403646913, "grad_norm": 2.6312255974545704, "learning_rate": 7.750342909597353e-06, "loss": 0.5412, "step": 925 }, { "epoch": 0.38541234977206795, "grad_norm": 2.522165622730654, "learning_rate": 7.72005923844532e-06, "loss": 0.5313, "step": 930 }, { "epoch": 0.3874844591794447, "grad_norm": 2.6702491559670167, "learning_rate": 7.689633171691646e-06, "loss": 0.5345, "step": 935 }, { "epoch": 0.38955656858682136, "grad_norm": 2.6631872386085185, "learning_rate": 7.659066302146523e-06, "loss": 0.5452, "step": 940 }, { "epoch": 0.3916286779941981, "grad_norm": 2.5785637853885195, "learning_rate": 7.628360229991198e-06, "loss": 0.5288, "step": 945 }, { "epoch": 0.3937007874015748, "grad_norm": 2.6837857193306935, "learning_rate": 7.597516562694198e-06, "loss": 0.5306, "step": 950 }, { "epoch": 0.3957728968089515, "grad_norm": 2.550858198260864, "learning_rate": 7.56653691492718e-06, "loss": 0.5233, "step": 955 }, { "epoch": 0.39784500621632823, "grad_norm": 2.6115170843406132, "learning_rate": 7.535422908480408e-06, "loss": 0.5424, "step": 960 }, { "epoch": 0.3999171156237049, "grad_norm": 2.5494802625123105, "learning_rate": 7.504176172177842e-06, "loss": 0.5171, "step": 965 }, { "epoch": 0.40198922503108164, "grad_norm": 2.5129023648194284, "learning_rate": 7.472798341791877e-06, "loss": 0.5148, "step": 970 }, { "epoch": 0.4040613344384584, "grad_norm": 2.6166709552589866, "learning_rate": 7.441291059957709e-06, "loss": 0.5292, "step": 975 }, { "epoch": 0.40613344384583505, "grad_norm": 2.5745519804152073, "learning_rate": 7.409655976087338e-06, "loss": 0.5228, "step": 980 }, { "epoch": 0.4082055532532118, "grad_norm": 2.7301378295783643, "learning_rate": 7.377894746283227e-06, "loss": 0.5343, "step": 985 }, { "epoch": 0.41027766266058846, "grad_norm": 2.431218699982397, "learning_rate": 7.3460090332516e-06, "loss": 0.508, "step": 990 }, { "epoch": 0.4123497720679652, "grad_norm": 2.556296703014823, "learning_rate": 7.314000506215402e-06, "loss": 0.5148, "step": 995 }, { "epoch": 0.4144218814753419, "grad_norm": 2.5152139819517023, "learning_rate": 7.281870840826912e-06, "loss": 0.4999, "step": 1000 }, { "epoch": 0.4164939908827186, "grad_norm": 2.786935816025543, "learning_rate": 7.249621719080026e-06, "loss": 0.5177, "step": 1005 }, { "epoch": 0.41856610029009533, "grad_norm": 2.5491738667702672, "learning_rate": 7.217254829222201e-06, "loss": 0.5114, "step": 1010 }, { "epoch": 0.420638209697472, "grad_norm": 2.50660251762613, "learning_rate": 7.1847718656660755e-06, "loss": 0.5156, "step": 1015 }, { "epoch": 0.42271031910484874, "grad_norm": 2.5986079473520878, "learning_rate": 7.152174528900773e-06, "loss": 0.4954, "step": 1020 }, { "epoch": 0.4247824285122254, "grad_norm": 2.635934213356069, "learning_rate": 7.119464525402867e-06, "loss": 0.504, "step": 1025 }, { "epoch": 0.42685453791960215, "grad_norm": 2.861696123619274, "learning_rate": 7.08664356754706e-06, "loss": 0.5063, "step": 1030 }, { "epoch": 0.4289266473269789, "grad_norm": 2.5689042563262223, "learning_rate": 7.053713373516538e-06, "loss": 0.5181, "step": 1035 }, { "epoch": 0.43099875673435556, "grad_norm": 2.5132849205491286, "learning_rate": 7.020675667213015e-06, "loss": 0.5043, "step": 1040 }, { "epoch": 0.4330708661417323, "grad_norm": 2.704397404776844, "learning_rate": 6.987532178166496e-06, "loss": 0.5022, "step": 1045 }, { "epoch": 0.43514297554910897, "grad_norm": 2.4785754646781104, "learning_rate": 6.9542846414447306e-06, "loss": 0.5027, "step": 1050 }, { "epoch": 0.4372150849564857, "grad_norm": 2.3912935006226594, "learning_rate": 6.920934797562385e-06, "loss": 0.5051, "step": 1055 }, { "epoch": 0.43928719436386243, "grad_norm": 2.3476041200753546, "learning_rate": 6.887484392389923e-06, "loss": 0.5043, "step": 1060 }, { "epoch": 0.4413593037712391, "grad_norm": 2.5914055427224465, "learning_rate": 6.853935177062208e-06, "loss": 0.4974, "step": 1065 }, { "epoch": 0.44343141317861584, "grad_norm": 2.594951575952019, "learning_rate": 6.8202889078868395e-06, "loss": 0.5061, "step": 1070 }, { "epoch": 0.4455035225859925, "grad_norm": 2.6023391478211293, "learning_rate": 6.786547346252198e-06, "loss": 0.4963, "step": 1075 }, { "epoch": 0.44757563199336925, "grad_norm": 2.3528096807317995, "learning_rate": 6.7527122585352435e-06, "loss": 0.4883, "step": 1080 }, { "epoch": 0.449647741400746, "grad_norm": 2.7398459124724814, "learning_rate": 6.718785416009044e-06, "loss": 0.4968, "step": 1085 }, { "epoch": 0.45171985080812266, "grad_norm": 2.838272166911696, "learning_rate": 6.6847685947500495e-06, "loss": 0.4915, "step": 1090 }, { "epoch": 0.4537919602154994, "grad_norm": 3.0284763209341623, "learning_rate": 6.650663575545111e-06, "loss": 0.4762, "step": 1095 }, { "epoch": 0.45586406962287607, "grad_norm": 2.6967943018540517, "learning_rate": 6.61647214379826e-06, "loss": 0.4737, "step": 1100 }, { "epoch": 0.4579361790302528, "grad_norm": 2.551888108931838, "learning_rate": 6.582196089437241e-06, "loss": 0.5076, "step": 1105 }, { "epoch": 0.46000828843762953, "grad_norm": 2.514843118690436, "learning_rate": 6.547837206819804e-06, "loss": 0.4876, "step": 1110 }, { "epoch": 0.4620803978450062, "grad_norm": 2.4499641238461005, "learning_rate": 6.513397294639778e-06, "loss": 0.4785, "step": 1115 }, { "epoch": 0.46415250725238294, "grad_norm": 2.5439805360709378, "learning_rate": 6.478878155832904e-06, "loss": 0.4609, "step": 1120 }, { "epoch": 0.4662246166597596, "grad_norm": 2.567465771254044, "learning_rate": 6.444281597482449e-06, "loss": 0.4826, "step": 1125 }, { "epoch": 0.46829672606713635, "grad_norm": 2.4194190626785863, "learning_rate": 6.409609430724607e-06, "loss": 0.4639, "step": 1130 }, { "epoch": 0.4703688354745131, "grad_norm": 2.4928927116754216, "learning_rate": 6.3748634706536905e-06, "loss": 0.4755, "step": 1135 }, { "epoch": 0.47244094488188976, "grad_norm": 2.5328967096855517, "learning_rate": 6.340045536227101e-06, "loss": 0.4676, "step": 1140 }, { "epoch": 0.4745130542892665, "grad_norm": 2.580851369974543, "learning_rate": 6.305157450170112e-06, "loss": 0.4679, "step": 1145 }, { "epoch": 0.47658516369664317, "grad_norm": 2.771781788352758, "learning_rate": 6.270201038880451e-06, "loss": 0.4748, "step": 1150 }, { "epoch": 0.4786572731040199, "grad_norm": 2.3357211287270294, "learning_rate": 6.235178132332678e-06, "loss": 0.4733, "step": 1155 }, { "epoch": 0.4807293825113966, "grad_norm": 2.4376737050512993, "learning_rate": 6.200090563982397e-06, "loss": 0.4623, "step": 1160 }, { "epoch": 0.4828014919187733, "grad_norm": 2.727521949059151, "learning_rate": 6.164940170670266e-06, "loss": 0.4763, "step": 1165 }, { "epoch": 0.48487360132615004, "grad_norm": 2.485190311900293, "learning_rate": 6.129728792525847e-06, "loss": 0.4653, "step": 1170 }, { "epoch": 0.4869457107335267, "grad_norm": 2.3795187623979746, "learning_rate": 6.094458272871259e-06, "loss": 0.4576, "step": 1175 }, { "epoch": 0.48901782014090345, "grad_norm": 2.43415537867363, "learning_rate": 6.0591304581247005e-06, "loss": 0.4606, "step": 1180 }, { "epoch": 0.4910899295482801, "grad_norm": 2.5114902879972374, "learning_rate": 6.023747197703771e-06, "loss": 0.4671, "step": 1185 }, { "epoch": 0.49316203895565686, "grad_norm": 2.5122364778542225, "learning_rate": 5.988310343928665e-06, "loss": 0.4678, "step": 1190 }, { "epoch": 0.4952341483630336, "grad_norm": 2.6311201696718283, "learning_rate": 5.9528217519252e-06, "loss": 0.4653, "step": 1195 }, { "epoch": 0.49730625777041026, "grad_norm": 2.634373162867216, "learning_rate": 5.9172832795276965e-06, "loss": 0.4858, "step": 1200 }, { "epoch": 0.499378367177787, "grad_norm": 2.602845367016974, "learning_rate": 5.881696787181724e-06, "loss": 0.4646, "step": 1205 }, { "epoch": 0.5014504765851637, "grad_norm": 2.4997645055614544, "learning_rate": 5.846064137846704e-06, "loss": 0.4723, "step": 1210 }, { "epoch": 0.5035225859925404, "grad_norm": 2.5259484476620853, "learning_rate": 5.810387196898387e-06, "loss": 0.4592, "step": 1215 }, { "epoch": 0.5055946953999171, "grad_norm": 2.460185161862939, "learning_rate": 5.7746678320311955e-06, "loss": 0.4563, "step": 1220 }, { "epoch": 0.5076668048072939, "grad_norm": 2.522924782621583, "learning_rate": 5.738907913160452e-06, "loss": 0.455, "step": 1225 }, { "epoch": 0.5097389142146705, "grad_norm": 2.450518901210104, "learning_rate": 5.703109312324493e-06, "loss": 0.4631, "step": 1230 }, { "epoch": 0.5118110236220472, "grad_norm": 2.512677663120476, "learning_rate": 5.667273903586656e-06, "loss": 0.4636, "step": 1235 }, { "epoch": 0.5138831330294239, "grad_norm": 2.508483790573203, "learning_rate": 5.6314035629371835e-06, "loss": 0.4494, "step": 1240 }, { "epoch": 0.5159552424368007, "grad_norm": 2.539619573950297, "learning_rate": 5.595500168195007e-06, "loss": 0.4657, "step": 1245 }, { "epoch": 0.5180273518441774, "grad_norm": 2.457760045030727, "learning_rate": 5.5595655989094525e-06, "loss": 0.4562, "step": 1250 }, { "epoch": 0.520099461251554, "grad_norm": 2.5810506702287546, "learning_rate": 5.52360173626183e-06, "loss": 0.4587, "step": 1255 }, { "epoch": 0.5221715706589308, "grad_norm": 2.533803600220524, "learning_rate": 5.487610462966969e-06, "loss": 0.4473, "step": 1260 }, { "epoch": 0.5242436800663075, "grad_norm": 2.408501431878237, "learning_rate": 5.451593663174647e-06, "loss": 0.4466, "step": 1265 }, { "epoch": 0.5263157894736842, "grad_norm": 2.5248933210116427, "learning_rate": 5.4155532223709625e-06, "loss": 0.4427, "step": 1270 }, { "epoch": 0.528387898881061, "grad_norm": 2.5355682898170704, "learning_rate": 5.379491027279622e-06, "loss": 0.4624, "step": 1275 }, { "epoch": 0.5304600082884376, "grad_norm": 2.7849780348488156, "learning_rate": 5.343408965763174e-06, "loss": 0.4487, "step": 1280 }, { "epoch": 0.5325321176958143, "grad_norm": 2.401434318103116, "learning_rate": 5.3073089267241805e-06, "loss": 0.4393, "step": 1285 }, { "epoch": 0.534604227103191, "grad_norm": 2.4324353094703475, "learning_rate": 5.271192800006325e-06, "loss": 0.4405, "step": 1290 }, { "epoch": 0.5366763365105678, "grad_norm": 2.5008934086234027, "learning_rate": 5.235062476295488e-06, "loss": 0.4206, "step": 1295 }, { "epoch": 0.5387484459179445, "grad_norm": 2.523177697285061, "learning_rate": 5.198919847020765e-06, "loss": 0.4378, "step": 1300 }, { "epoch": 0.5408205553253211, "grad_norm": 2.561504860219997, "learning_rate": 5.162766804255446e-06, "loss": 0.4369, "step": 1305 }, { "epoch": 0.5428926647326979, "grad_norm": 2.4342748058160457, "learning_rate": 5.1266052406179755e-06, "loss": 0.4429, "step": 1310 }, { "epoch": 0.5449647741400746, "grad_norm": 2.5003051526668054, "learning_rate": 5.090437049172861e-06, "loss": 0.4479, "step": 1315 }, { "epoch": 0.5470368835474513, "grad_norm": 2.41044724225261, "learning_rate": 5.054264123331583e-06, "loss": 0.4348, "step": 1320 }, { "epoch": 0.5491089929548281, "grad_norm": 2.534587174570914, "learning_rate": 5.018088356753463e-06, "loss": 0.4346, "step": 1325 }, { "epoch": 0.5511811023622047, "grad_norm": 2.3874656931988953, "learning_rate": 4.981911643246539e-06, "loss": 0.4463, "step": 1330 }, { "epoch": 0.5532532117695814, "grad_norm": 2.344775548708239, "learning_rate": 4.9457358766684175e-06, "loss": 0.4323, "step": 1335 }, { "epoch": 0.5553253211769581, "grad_norm": 2.394852642174165, "learning_rate": 4.9095629508271396e-06, "loss": 0.412, "step": 1340 }, { "epoch": 0.5573974305843349, "grad_norm": 2.4107260444211067, "learning_rate": 4.873394759382025e-06, "loss": 0.434, "step": 1345 }, { "epoch": 0.5594695399917116, "grad_norm": 2.39603695241891, "learning_rate": 4.837233195744556e-06, "loss": 0.4368, "step": 1350 }, { "epoch": 0.5615416493990882, "grad_norm": 2.484288425740336, "learning_rate": 4.8010801529792375e-06, "loss": 0.4247, "step": 1355 }, { "epoch": 0.563613758806465, "grad_norm": 2.491968721281532, "learning_rate": 4.7649375237045135e-06, "loss": 0.4239, "step": 1360 }, { "epoch": 0.5656858682138417, "grad_norm": 2.405464453263017, "learning_rate": 4.728807199993677e-06, "loss": 0.4156, "step": 1365 }, { "epoch": 0.5677579776212184, "grad_norm": 2.3649504461482405, "learning_rate": 4.692691073275822e-06, "loss": 0.4272, "step": 1370 }, { "epoch": 0.5698300870285951, "grad_norm": 2.578097130955108, "learning_rate": 4.656591034236827e-06, "loss": 0.4318, "step": 1375 }, { "epoch": 0.5719021964359718, "grad_norm": 2.3565047434019233, "learning_rate": 4.620508972720379e-06, "loss": 0.4157, "step": 1380 }, { "epoch": 0.5739743058433485, "grad_norm": 2.4914369771154776, "learning_rate": 4.584446777629038e-06, "loss": 0.4131, "step": 1385 }, { "epoch": 0.5760464152507252, "grad_norm": 2.5403254221008016, "learning_rate": 4.548406336825354e-06, "loss": 0.4209, "step": 1390 }, { "epoch": 0.578118524658102, "grad_norm": 2.4433869039162763, "learning_rate": 4.512389537033032e-06, "loss": 0.4156, "step": 1395 }, { "epoch": 0.5801906340654787, "grad_norm": 2.5532014778576606, "learning_rate": 4.476398263738171e-06, "loss": 0.4187, "step": 1400 }, { "epoch": 0.5822627434728553, "grad_norm": 2.422240802637448, "learning_rate": 4.440434401090549e-06, "loss": 0.4123, "step": 1405 }, { "epoch": 0.5843348528802321, "grad_norm": 2.327309317585392, "learning_rate": 4.404499831804993e-06, "loss": 0.4167, "step": 1410 }, { "epoch": 0.5864069622876088, "grad_norm": 2.337179254235494, "learning_rate": 4.368596437062819e-06, "loss": 0.4253, "step": 1415 }, { "epoch": 0.5884790716949855, "grad_norm": 2.6203872163413, "learning_rate": 4.332726096413346e-06, "loss": 0.4035, "step": 1420 }, { "epoch": 0.5905511811023622, "grad_norm": 2.45290146849787, "learning_rate": 4.29689068767551e-06, "loss": 0.4205, "step": 1425 }, { "epoch": 0.5926232905097389, "grad_norm": 2.442505011249494, "learning_rate": 4.261092086839549e-06, "loss": 0.4199, "step": 1430 }, { "epoch": 0.5946953999171156, "grad_norm": 2.469896647219146, "learning_rate": 4.225332167968808e-06, "loss": 0.4197, "step": 1435 }, { "epoch": 0.5967675093244923, "grad_norm": 2.525933254390451, "learning_rate": 4.189612803101614e-06, "loss": 0.4136, "step": 1440 }, { "epoch": 0.5988396187318691, "grad_norm": 2.3481926369028154, "learning_rate": 4.153935862153299e-06, "loss": 0.4098, "step": 1445 }, { "epoch": 0.6009117281392458, "grad_norm": 2.4652012656554336, "learning_rate": 4.118303212818277e-06, "loss": 0.4111, "step": 1450 }, { "epoch": 0.6029838375466224, "grad_norm": 2.597755453602896, "learning_rate": 4.082716720472304e-06, "loss": 0.4141, "step": 1455 }, { "epoch": 0.6050559469539992, "grad_norm": 2.3332725702199912, "learning_rate": 4.0471782480748005e-06, "loss": 0.3886, "step": 1460 }, { "epoch": 0.6071280563613759, "grad_norm": 2.387505345147473, "learning_rate": 4.011689656071334e-06, "loss": 0.4167, "step": 1465 }, { "epoch": 0.6092001657687526, "grad_norm": 2.4113537140405388, "learning_rate": 3.97625280229623e-06, "loss": 0.4002, "step": 1470 }, { "epoch": 0.6112722751761293, "grad_norm": 2.4554203658965488, "learning_rate": 3.940869541875301e-06, "loss": 0.3881, "step": 1475 }, { "epoch": 0.613344384583506, "grad_norm": 2.425914982917716, "learning_rate": 3.905541727128743e-06, "loss": 0.4069, "step": 1480 }, { "epoch": 0.6154164939908827, "grad_norm": 2.5231106398066476, "learning_rate": 3.870271207474154e-06, "loss": 0.4002, "step": 1485 }, { "epoch": 0.6174886033982594, "grad_norm": 2.4689338483413135, "learning_rate": 3.8350598293297345e-06, "loss": 0.4141, "step": 1490 }, { "epoch": 0.6195607128056362, "grad_norm": 2.496046599900669, "learning_rate": 3.7999094360176036e-06, "loss": 0.3965, "step": 1495 }, { "epoch": 0.6216328222130129, "grad_norm": 2.3726201269683553, "learning_rate": 3.7648218676673232e-06, "loss": 0.4017, "step": 1500 }, { "epoch": 0.6237049316203895, "grad_norm": 2.418446591018178, "learning_rate": 3.7297989611195504e-06, "loss": 0.3938, "step": 1505 }, { "epoch": 0.6257770410277662, "grad_norm": 2.4157452764623963, "learning_rate": 3.694842549829889e-06, "loss": 0.3871, "step": 1510 }, { "epoch": 0.627849150435143, "grad_norm": 2.4088293626826114, "learning_rate": 3.659954463772901e-06, "loss": 0.4002, "step": 1515 }, { "epoch": 0.6299212598425197, "grad_norm": 2.5779980646499543, "learning_rate": 3.625136529346312e-06, "loss": 0.4055, "step": 1520 }, { "epoch": 0.6319933692498964, "grad_norm": 2.4110720231898806, "learning_rate": 3.590390569275395e-06, "loss": 0.3913, "step": 1525 }, { "epoch": 0.6340654786572731, "grad_norm": 2.3982253161569234, "learning_rate": 3.555718402517554e-06, "loss": 0.3962, "step": 1530 }, { "epoch": 0.6361375880646498, "grad_norm": 2.392216618491618, "learning_rate": 3.521121844167098e-06, "loss": 0.399, "step": 1535 }, { "epoch": 0.6382096974720265, "grad_norm": 2.3550943907800193, "learning_rate": 3.486602705360224e-06, "loss": 0.3927, "step": 1540 }, { "epoch": 0.6402818068794033, "grad_norm": 2.4744483911955153, "learning_rate": 3.4521627931801976e-06, "loss": 0.3961, "step": 1545 }, { "epoch": 0.64235391628678, "grad_norm": 2.416762173736933, "learning_rate": 3.41780391056276e-06, "loss": 0.3959, "step": 1550 }, { "epoch": 0.6444260256941566, "grad_norm": 2.3621515478388906, "learning_rate": 3.3835278562017405e-06, "loss": 0.3889, "step": 1555 }, { "epoch": 0.6464981351015333, "grad_norm": 2.3799558311767917, "learning_rate": 3.349336424454889e-06, "loss": 0.395, "step": 1560 }, { "epoch": 0.6485702445089101, "grad_norm": 2.4290023388512143, "learning_rate": 3.3152314052499513e-06, "loss": 0.3921, "step": 1565 }, { "epoch": 0.6506423539162868, "grad_norm": 2.386844861650698, "learning_rate": 3.2812145839909566e-06, "loss": 0.382, "step": 1570 }, { "epoch": 0.6527144633236635, "grad_norm": 2.58783476073575, "learning_rate": 3.247287741464758e-06, "loss": 0.3961, "step": 1575 }, { "epoch": 0.6547865727310402, "grad_norm": 2.4711795095903906, "learning_rate": 3.2134526537478034e-06, "loss": 0.403, "step": 1580 }, { "epoch": 0.6568586821384169, "grad_norm": 2.445518124608817, "learning_rate": 3.1797110921131626e-06, "loss": 0.3949, "step": 1585 }, { "epoch": 0.6589307915457936, "grad_norm": 2.3302808051599957, "learning_rate": 3.1460648229377933e-06, "loss": 0.4003, "step": 1590 }, { "epoch": 0.6610029009531704, "grad_norm": 2.4700969540161086, "learning_rate": 3.1125156076100804e-06, "loss": 0.3845, "step": 1595 }, { "epoch": 0.6630750103605471, "grad_norm": 2.2736405570858684, "learning_rate": 3.0790652024376163e-06, "loss": 0.3755, "step": 1600 }, { "epoch": 0.6651471197679237, "grad_norm": 2.3793447678069732, "learning_rate": 3.0457153585552724e-06, "loss": 0.3764, "step": 1605 }, { "epoch": 0.6672192291753004, "grad_norm": 2.381350136652911, "learning_rate": 3.012467821833506e-06, "loss": 0.388, "step": 1610 }, { "epoch": 0.6692913385826772, "grad_norm": 2.440306240554211, "learning_rate": 2.979324332786987e-06, "loss": 0.3914, "step": 1615 }, { "epoch": 0.6713634479900539, "grad_norm": 2.473851856313499, "learning_rate": 2.946286626483463e-06, "loss": 0.3844, "step": 1620 }, { "epoch": 0.6734355573974306, "grad_norm": 2.266055102817957, "learning_rate": 2.913356432452942e-06, "loss": 0.3789, "step": 1625 }, { "epoch": 0.6755076668048073, "grad_norm": 2.4641402988030126, "learning_rate": 2.8805354745971336e-06, "loss": 0.37, "step": 1630 }, { "epoch": 0.677579776212184, "grad_norm": 2.3943294283961984, "learning_rate": 2.847825471099227e-06, "loss": 0.3777, "step": 1635 }, { "epoch": 0.6796518856195607, "grad_norm": 2.384842311722, "learning_rate": 2.815228134333925e-06, "loss": 0.382, "step": 1640 }, { "epoch": 0.6817239950269374, "grad_norm": 2.469151674287745, "learning_rate": 2.782745170777801e-06, "loss": 0.3704, "step": 1645 }, { "epoch": 0.6837961044343142, "grad_norm": 2.583143124954603, "learning_rate": 2.750378280919975e-06, "loss": 0.3736, "step": 1650 }, { "epoch": 0.6858682138416908, "grad_norm": 2.4321963843177947, "learning_rate": 2.7181291591730885e-06, "loss": 0.3782, "step": 1655 }, { "epoch": 0.6879403232490675, "grad_norm": 2.3531730582824975, "learning_rate": 2.6859994937846002e-06, "loss": 0.376, "step": 1660 }, { "epoch": 0.6900124326564443, "grad_norm": 2.4367688917720485, "learning_rate": 2.653990966748401e-06, "loss": 0.377, "step": 1665 }, { "epoch": 0.692084542063821, "grad_norm": 2.3875592466498605, "learning_rate": 2.622105253716774e-06, "loss": 0.3772, "step": 1670 }, { "epoch": 0.6941566514711976, "grad_norm": 2.424175019670591, "learning_rate": 2.5903440239126633e-06, "loss": 0.3762, "step": 1675 }, { "epoch": 0.6962287608785744, "grad_norm": 2.337415256255787, "learning_rate": 2.5587089400422936e-06, "loss": 0.3906, "step": 1680 }, { "epoch": 0.6983008702859511, "grad_norm": 2.384059902937286, "learning_rate": 2.5272016582081236e-06, "loss": 0.3661, "step": 1685 }, { "epoch": 0.7003729796933278, "grad_norm": 2.346884231086367, "learning_rate": 2.4958238278221603e-06, "loss": 0.3632, "step": 1690 }, { "epoch": 0.7024450891007045, "grad_norm": 2.3658711079290264, "learning_rate": 2.464577091519594e-06, "loss": 0.3695, "step": 1695 }, { "epoch": 0.7045171985080813, "grad_norm": 2.3082126926984277, "learning_rate": 2.43346308507282e-06, "loss": 0.3697, "step": 1700 }, { "epoch": 0.7065893079154579, "grad_norm": 2.2724590255115324, "learning_rate": 2.4024834373058024e-06, "loss": 0.3708, "step": 1705 }, { "epoch": 0.7086614173228346, "grad_norm": 2.4685609897195633, "learning_rate": 2.371639770008804e-06, "loss": 0.3589, "step": 1710 }, { "epoch": 0.7107335267302114, "grad_norm": 2.4411171519085704, "learning_rate": 2.3409336978534785e-06, "loss": 0.3657, "step": 1715 }, { "epoch": 0.7128056361375881, "grad_norm": 2.3755249742291227, "learning_rate": 2.3103668283083563e-06, "loss": 0.3688, "step": 1720 }, { "epoch": 0.7148777455449647, "grad_norm": 2.3679474209669307, "learning_rate": 2.2799407615546816e-06, "loss": 0.3738, "step": 1725 }, { "epoch": 0.7169498549523415, "grad_norm": 2.351814749989449, "learning_rate": 2.2496570904026484e-06, "loss": 0.3647, "step": 1730 }, { "epoch": 0.7190219643597182, "grad_norm": 2.4116114120516103, "learning_rate": 2.219517400208015e-06, "loss": 0.3736, "step": 1735 }, { "epoch": 0.7210940737670949, "grad_norm": 2.3864727744957412, "learning_rate": 2.1895232687891044e-06, "loss": 0.3484, "step": 1740 }, { "epoch": 0.7231661831744716, "grad_norm": 2.2512303655303874, "learning_rate": 2.159676266344222e-06, "loss": 0.3599, "step": 1745 }, { "epoch": 0.7252382925818484, "grad_norm": 2.2576815601903872, "learning_rate": 2.1299779553694323e-06, "loss": 0.358, "step": 1750 }, { "epoch": 0.727310401989225, "grad_norm": 2.4261849484523594, "learning_rate": 2.100429890576782e-06, "loss": 0.3638, "step": 1755 }, { "epoch": 0.7293825113966017, "grad_norm": 2.3998781669660008, "learning_rate": 2.0710336188129e-06, "loss": 0.3626, "step": 1760 }, { "epoch": 0.7314546208039785, "grad_norm": 2.2949968092513697, "learning_rate": 2.0417906789780236e-06, "loss": 0.3556, "step": 1765 }, { "epoch": 0.7335267302113552, "grad_norm": 2.3439568873845222, "learning_rate": 2.0127026019454305e-06, "loss": 0.3753, "step": 1770 }, { "epoch": 0.7355988396187318, "grad_norm": 2.2377793345094896, "learning_rate": 1.9837709104813075e-06, "loss": 0.358, "step": 1775 }, { "epoch": 0.7376709490261085, "grad_norm": 2.2885196165888653, "learning_rate": 1.9549971191650263e-06, "loss": 0.3515, "step": 1780 }, { "epoch": 0.7397430584334853, "grad_norm": 2.558424767409142, "learning_rate": 1.9263827343098596e-06, "loss": 0.3657, "step": 1785 }, { "epoch": 0.741815167840862, "grad_norm": 2.44623978739584, "learning_rate": 1.8979292538841133e-06, "loss": 0.3623, "step": 1790 }, { "epoch": 0.7438872772482387, "grad_norm": 2.2614597017692692, "learning_rate": 1.8696381674327308e-06, "loss": 0.3553, "step": 1795 }, { "epoch": 0.7459593866556155, "grad_norm": 2.2881907635344576, "learning_rate": 1.8415109559992883e-06, "loss": 0.3531, "step": 1800 }, { "epoch": 0.7480314960629921, "grad_norm": 2.463206144892447, "learning_rate": 1.8135490920484832e-06, "loss": 0.3559, "step": 1805 }, { "epoch": 0.7501036054703688, "grad_norm": 2.3074379267386766, "learning_rate": 1.7857540393890337e-06, "loss": 0.3544, "step": 1810 }, { "epoch": 0.7521757148777456, "grad_norm": 2.345951065809859, "learning_rate": 1.7581272530970666e-06, "loss": 0.3495, "step": 1815 }, { "epoch": 0.7542478242851223, "grad_norm": 2.436580121963066, "learning_rate": 1.7306701794399266e-06, "loss": 0.351, "step": 1820 }, { "epoch": 0.756319933692499, "grad_norm": 2.477449269541472, "learning_rate": 1.7033842558004692e-06, "loss": 0.357, "step": 1825 }, { "epoch": 0.7583920430998756, "grad_norm": 2.336059640702188, "learning_rate": 1.6762709106018193e-06, "loss": 0.3566, "step": 1830 }, { "epoch": 0.7604641525072524, "grad_norm": 2.327604564240487, "learning_rate": 1.6493315632325873e-06, "loss": 0.3693, "step": 1835 }, { "epoch": 0.7625362619146291, "grad_norm": 2.4521027562031805, "learning_rate": 1.6225676239725663e-06, "loss": 0.3557, "step": 1840 }, { "epoch": 0.7646083713220058, "grad_norm": 2.3928441636264055, "learning_rate": 1.5959804939188962e-06, "loss": 0.35, "step": 1845 }, { "epoch": 0.7666804807293826, "grad_norm": 2.3515743827298263, "learning_rate": 1.5695715649127347e-06, "loss": 0.3597, "step": 1850 }, { "epoch": 0.7687525901367592, "grad_norm": 2.4659862823505843, "learning_rate": 1.5433422194663694e-06, "loss": 0.3544, "step": 1855 }, { "epoch": 0.7708246995441359, "grad_norm": 2.3479456978647684, "learning_rate": 1.5172938306908624e-06, "loss": 0.3479, "step": 1860 }, { "epoch": 0.7728968089515127, "grad_norm": 2.491588705252418, "learning_rate": 1.4914277622241596e-06, "loss": 0.348, "step": 1865 }, { "epoch": 0.7749689183588894, "grad_norm": 2.2957936532852026, "learning_rate": 1.4657453681597056e-06, "loss": 0.3535, "step": 1870 }, { "epoch": 0.777041027766266, "grad_norm": 2.2660578633915787, "learning_rate": 1.440247992975553e-06, "loss": 0.3503, "step": 1875 }, { "epoch": 0.7791131371736427, "grad_norm": 2.4214309256458755, "learning_rate": 1.4149369714639856e-06, "loss": 0.3525, "step": 1880 }, { "epoch": 0.7811852465810195, "grad_norm": 2.378191297196036, "learning_rate": 1.3898136286616364e-06, "loss": 0.3449, "step": 1885 }, { "epoch": 0.7832573559883962, "grad_norm": 2.369189696994275, "learning_rate": 1.3648792797801264e-06, "loss": 0.3411, "step": 1890 }, { "epoch": 0.7853294653957729, "grad_norm": 2.357227748595689, "learning_rate": 1.3401352301372039e-06, "loss": 0.3428, "step": 1895 }, { "epoch": 0.7874015748031497, "grad_norm": 2.283274818402107, "learning_rate": 1.315582775088421e-06, "loss": 0.3476, "step": 1900 }, { "epoch": 0.7894736842105263, "grad_norm": 2.443389423960905, "learning_rate": 1.2912231999593222e-06, "loss": 0.3456, "step": 1905 }, { "epoch": 0.791545793617903, "grad_norm": 2.3098901374263785, "learning_rate": 1.267057779978143e-06, "loss": 0.3387, "step": 1910 }, { "epoch": 0.7936179030252797, "grad_norm": 2.3606277283639403, "learning_rate": 1.2430877802090674e-06, "loss": 0.3505, "step": 1915 }, { "epoch": 0.7956900124326565, "grad_norm": 2.778705490024244, "learning_rate": 1.2193144554859938e-06, "loss": 0.334, "step": 1920 }, { "epoch": 0.7977621218400331, "grad_norm": 2.3112033884634418, "learning_rate": 1.195739050346848e-06, "loss": 0.349, "step": 1925 }, { "epoch": 0.7998342312474098, "grad_norm": 2.3575714411861517, "learning_rate": 1.172362798968424e-06, "loss": 0.3449, "step": 1930 }, { "epoch": 0.8019063406547866, "grad_norm": 2.491996861817634, "learning_rate": 1.1491869251017833e-06, "loss": 0.3414, "step": 1935 }, { "epoch": 0.8039784500621633, "grad_norm": 2.3991288004159883, "learning_rate": 1.1262126420081887e-06, "loss": 0.3457, "step": 1940 }, { "epoch": 0.80605055946954, "grad_norm": 2.1821189723068612, "learning_rate": 1.103441152395588e-06, "loss": 0.3283, "step": 1945 }, { "epoch": 0.8081226688769167, "grad_norm": 2.417668328035057, "learning_rate": 1.0808736483556486e-06, "loss": 0.3386, "step": 1950 }, { "epoch": 0.8101947782842934, "grad_norm": 2.424032614605958, "learning_rate": 1.0585113113013656e-06, "loss": 0.3451, "step": 1955 }, { "epoch": 0.8122668876916701, "grad_norm": 2.3278404790134823, "learning_rate": 1.036355311905194e-06, "loss": 0.3455, "step": 1960 }, { "epoch": 0.8143389970990468, "grad_norm": 2.3039554290948185, "learning_rate": 1.0144068100377818e-06, "loss": 0.3381, "step": 1965 }, { "epoch": 0.8164111065064236, "grad_norm": 2.556306207112663, "learning_rate": 9.926669547072365e-07, "loss": 0.3485, "step": 1970 }, { "epoch": 0.8184832159138002, "grad_norm": 2.4582213305283775, "learning_rate": 9.711368839989904e-07, "loss": 0.3335, "step": 1975 }, { "epoch": 0.8205553253211769, "grad_norm": 2.3579094295454657, "learning_rate": 9.498177250162022e-07, "loss": 0.3346, "step": 1980 }, { "epoch": 0.8226274347285537, "grad_norm": 2.3492342765735934, "learning_rate": 9.287105938207691e-07, "loss": 0.3365, "step": 1985 }, { "epoch": 0.8246995441359304, "grad_norm": 2.4946764945886017, "learning_rate": 9.078165953748936e-07, "loss": 0.336, "step": 1990 }, { "epoch": 0.8267716535433071, "grad_norm": 2.5266114489992955, "learning_rate": 8.871368234832378e-07, "loss": 0.3367, "step": 1995 }, { "epoch": 0.8288437629506838, "grad_norm": 2.473228872985738, "learning_rate": 8.66672360735668e-07, "loss": 0.3247, "step": 2000 }, { "epoch": 0.8309158723580605, "grad_norm": 2.393007820983065, "learning_rate": 8.4642427845057e-07, "loss": 0.3289, "step": 2005 }, { "epoch": 0.8329879817654372, "grad_norm": 2.492764210538724, "learning_rate": 8.263936366187825e-07, "loss": 0.3325, "step": 2010 }, { "epoch": 0.8350600911728139, "grad_norm": 2.4227393176245418, "learning_rate": 8.065814838480879e-07, "loss": 0.3288, "step": 2015 }, { "epoch": 0.8371322005801907, "grad_norm": 2.3030572780595366, "learning_rate": 7.869888573083295e-07, "loss": 0.3401, "step": 2020 }, { "epoch": 0.8392043099875673, "grad_norm": 2.3219953415669528, "learning_rate": 7.676167826771125e-07, "loss": 0.331, "step": 2025 }, { "epoch": 0.841276419394944, "grad_norm": 2.273400790612548, "learning_rate": 7.484662740861093e-07, "loss": 0.3383, "step": 2030 }, { "epoch": 0.8433485288023208, "grad_norm": 2.374061200093819, "learning_rate": 7.295383340679668e-07, "loss": 0.3343, "step": 2035 }, { "epoch": 0.8454206382096975, "grad_norm": 2.377090176007411, "learning_rate": 7.108339535038278e-07, "loss": 0.3298, "step": 2040 }, { "epoch": 0.8474927476170742, "grad_norm": 2.3279278715664016, "learning_rate": 6.923541115714577e-07, "loss": 0.3319, "step": 2045 }, { "epoch": 0.8495648570244508, "grad_norm": 2.36303398687802, "learning_rate": 6.740997756939826e-07, "loss": 0.3418, "step": 2050 }, { "epoch": 0.8516369664318276, "grad_norm": 2.415624778572703, "learning_rate": 6.560719014892425e-07, "loss": 0.3328, "step": 2055 }, { "epoch": 0.8537090758392043, "grad_norm": 2.287618281117491, "learning_rate": 6.382714327197703e-07, "loss": 0.3321, "step": 2060 }, { "epoch": 0.855781185246581, "grad_norm": 2.2595148250907036, "learning_rate": 6.206993012433815e-07, "loss": 0.3336, "step": 2065 }, { "epoch": 0.8578532946539578, "grad_norm": 2.3926760910481186, "learning_rate": 6.033564269643927e-07, "loss": 0.3342, "step": 2070 }, { "epoch": 0.8599254040613344, "grad_norm": 2.316219130405351, "learning_rate": 5.862437177854629e-07, "loss": 0.3311, "step": 2075 }, { "epoch": 0.8619975134687111, "grad_norm": 2.3766431440321707, "learning_rate": 5.693620695600671e-07, "loss": 0.3153, "step": 2080 }, { "epoch": 0.8640696228760879, "grad_norm": 2.1660555465277405, "learning_rate": 5.527123660455968e-07, "loss": 0.3268, "step": 2085 }, { "epoch": 0.8661417322834646, "grad_norm": 2.3890878403617815, "learning_rate": 5.362954788570929e-07, "loss": 0.317, "step": 2090 }, { "epoch": 0.8682138416908413, "grad_norm": 2.3910404086765515, "learning_rate": 5.201122674216208e-07, "loss": 0.329, "step": 2095 }, { "epoch": 0.8702859510982179, "grad_norm": 2.398893233495637, "learning_rate": 5.041635789332783e-07, "loss": 0.3306, "step": 2100 }, { "epoch": 0.8723580605055947, "grad_norm": 2.5664259292956495, "learning_rate": 4.884502483088421e-07, "loss": 0.3153, "step": 2105 }, { "epoch": 0.8744301699129714, "grad_norm": 2.4313343193724073, "learning_rate": 4.7297309814406113e-07, "loss": 0.3286, "step": 2110 }, { "epoch": 0.8765022793203481, "grad_norm": 2.3267523640919374, "learning_rate": 4.577329386705942e-07, "loss": 0.3321, "step": 2115 }, { "epoch": 0.8785743887277249, "grad_norm": 2.37298005463995, "learning_rate": 4.42730567713594e-07, "loss": 0.3303, "step": 2120 }, { "epoch": 0.8806464981351015, "grad_norm": 2.4400767630128084, "learning_rate": 4.2796677064994243e-07, "loss": 0.3187, "step": 2125 }, { "epoch": 0.8827186075424782, "grad_norm": 2.600660724910327, "learning_rate": 4.134423203671295e-07, "loss": 0.3347, "step": 2130 }, { "epoch": 0.884790716949855, "grad_norm": 2.416615232949827, "learning_rate": 3.9915797722280323e-07, "loss": 0.3317, "step": 2135 }, { "epoch": 0.8868628263572317, "grad_norm": 2.3982415986507344, "learning_rate": 3.851144890049535e-07, "loss": 0.326, "step": 2140 }, { "epoch": 0.8889349357646084, "grad_norm": 2.3596784596238445, "learning_rate": 3.713125908927728e-07, "loss": 0.3274, "step": 2145 }, { "epoch": 0.891007045171985, "grad_norm": 2.395731949083915, "learning_rate": 3.577530054181677e-07, "loss": 0.3337, "step": 2150 }, { "epoch": 0.8930791545793618, "grad_norm": 2.4132993194541363, "learning_rate": 3.4443644242793226e-07, "loss": 0.3205, "step": 2155 }, { "epoch": 0.8951512639867385, "grad_norm": 2.421557012145629, "learning_rate": 3.313635990465902e-07, "loss": 0.3286, "step": 2160 }, { "epoch": 0.8972233733941152, "grad_norm": 2.3514364773273053, "learning_rate": 3.1853515963989613e-07, "loss": 0.3244, "step": 2165 }, { "epoch": 0.899295482801492, "grad_norm": 2.3631578697068156, "learning_rate": 3.059517957790165e-07, "loss": 0.3224, "step": 2170 }, { "epoch": 0.9013675922088686, "grad_norm": 2.3165934226078693, "learning_rate": 2.936141662053621e-07, "loss": 0.3189, "step": 2175 }, { "epoch": 0.9034397016162453, "grad_norm": 2.354830595345679, "learning_rate": 2.8152291679611254e-07, "loss": 0.3164, "step": 2180 }, { "epoch": 0.905511811023622, "grad_norm": 2.4591886130861984, "learning_rate": 2.6967868053039916e-07, "loss": 0.3194, "step": 2185 }, { "epoch": 0.9075839204309988, "grad_norm": 2.4651307442780515, "learning_rate": 2.580820774561704e-07, "loss": 0.3329, "step": 2190 }, { "epoch": 0.9096560298383755, "grad_norm": 2.356380428139328, "learning_rate": 2.467337146577298e-07, "loss": 0.3125, "step": 2195 }, { "epoch": 0.9117281392457521, "grad_norm": 2.4718310107961923, "learning_rate": 2.3563418622395863e-07, "loss": 0.3267, "step": 2200 }, { "epoch": 0.9138002486531289, "grad_norm": 2.2943822251497834, "learning_rate": 2.2478407321721295e-07, "loss": 0.3124, "step": 2205 }, { "epoch": 0.9158723580605056, "grad_norm": 2.37802636947732, "learning_rate": 2.141839436429055e-07, "loss": 0.3261, "step": 2210 }, { "epoch": 0.9179444674678823, "grad_norm": 2.4929237268669504, "learning_rate": 2.038343524197689e-07, "loss": 0.3266, "step": 2215 }, { "epoch": 0.9200165768752591, "grad_norm": 2.4497686225084645, "learning_rate": 1.9373584135080893e-07, "loss": 0.3155, "step": 2220 }, { "epoch": 0.9220886862826357, "grad_norm": 2.3085410601255063, "learning_rate": 1.8388893909493776e-07, "loss": 0.3143, "step": 2225 }, { "epoch": 0.9241607956900124, "grad_norm": 2.516522380423302, "learning_rate": 1.742941611393012e-07, "loss": 0.3282, "step": 2230 }, { "epoch": 0.9262329050973891, "grad_norm": 2.250941160020719, "learning_rate": 1.6495200977228897e-07, "loss": 0.3313, "step": 2235 }, { "epoch": 0.9283050145047659, "grad_norm": 2.243976361659756, "learning_rate": 1.558629740572465e-07, "loss": 0.3123, "step": 2240 }, { "epoch": 0.9303771239121426, "grad_norm": 2.411662333101451, "learning_rate": 1.4702752980686463e-07, "loss": 0.3227, "step": 2245 }, { "epoch": 0.9324492333195192, "grad_norm": 2.4064884421540254, "learning_rate": 1.3844613955827536e-07, "loss": 0.326, "step": 2250 }, { "epoch": 0.934521342726896, "grad_norm": 2.4948851626678326, "learning_rate": 1.301192525488376e-07, "loss": 0.3177, "step": 2255 }, { "epoch": 0.9365934521342727, "grad_norm": 2.256448839133944, "learning_rate": 1.2204730469261905e-07, "loss": 0.3224, "step": 2260 }, { "epoch": 0.9386655615416494, "grad_norm": 2.386604983970806, "learning_rate": 1.1423071855757473e-07, "loss": 0.3177, "step": 2265 }, { "epoch": 0.9407376709490262, "grad_norm": 2.4955559903632087, "learning_rate": 1.0666990334342708e-07, "loss": 0.3235, "step": 2270 }, { "epoch": 0.9428097803564028, "grad_norm": 2.4680594179132953, "learning_rate": 9.936525486024362e-08, "loss": 0.3234, "step": 2275 }, { "epoch": 0.9448818897637795, "grad_norm": 2.352943986977247, "learning_rate": 9.23171555077168e-08, "loss": 0.3189, "step": 2280 }, { "epoch": 0.9469539991711562, "grad_norm": 2.3685608851078106, "learning_rate": 8.552597425514508e-08, "loss": 0.3191, "step": 2285 }, { "epoch": 0.949026108578533, "grad_norm": 2.4481455211139944, "learning_rate": 7.8992066622115e-08, "loss": 0.3283, "step": 2290 }, { "epoch": 0.9510982179859097, "grad_norm": 2.298587702409287, "learning_rate": 7.271577465989554e-08, "loss": 0.3181, "step": 2295 }, { "epoch": 0.9531703273932863, "grad_norm": 2.3601024907505956, "learning_rate": 6.669742693352522e-08, "loss": 0.3183, "step": 2300 }, { "epoch": 0.9552424368006631, "grad_norm": 2.3846417205933537, "learning_rate": 6.093733850461359e-08, "loss": 0.311, "step": 2305 }, { "epoch": 0.9573145462080398, "grad_norm": 2.5721713736853427, "learning_rate": 5.5435810914851176e-08, "loss": 0.3225, "step": 2310 }, { "epoch": 0.9593866556154165, "grad_norm": 2.4391728621326667, "learning_rate": 5.0193132170219814e-08, "loss": 0.3219, "step": 2315 }, { "epoch": 0.9614587650227931, "grad_norm": 2.314964895648817, "learning_rate": 4.5209576725915305e-08, "loss": 0.3146, "step": 2320 }, { "epoch": 0.9635308744301699, "grad_norm": 2.3953921094993427, "learning_rate": 4.0485405471983317e-08, "loss": 0.3099, "step": 2325 }, { "epoch": 0.9656029838375466, "grad_norm": 2.4903270174654977, "learning_rate": 3.6020865719657015e-08, "loss": 0.3261, "step": 2330 }, { "epoch": 0.9676750932449233, "grad_norm": 2.5600866933116153, "learning_rate": 3.181619118841517e-08, "loss": 0.3207, "step": 2335 }, { "epoch": 0.9697472026523001, "grad_norm": 2.2542713880681466, "learning_rate": 2.7871601993741947e-08, "loss": 0.3154, "step": 2340 }, { "epoch": 0.9718193120596768, "grad_norm": 2.3906714544505, "learning_rate": 2.4187304635608922e-08, "loss": 0.3264, "step": 2345 }, { "epoch": 0.9738914214670534, "grad_norm": 2.3437139004611396, "learning_rate": 2.0763491987659812e-08, "loss": 0.3131, "step": 2350 }, { "epoch": 0.9759635308744302, "grad_norm": 2.3352282804892504, "learning_rate": 1.7600343287116904e-08, "loss": 0.3236, "step": 2355 }, { "epoch": 0.9780356402818069, "grad_norm": 2.2788999882928267, "learning_rate": 1.4698024125396893e-08, "loss": 0.3165, "step": 2360 }, { "epoch": 0.9801077496891836, "grad_norm": 2.484272820389643, "learning_rate": 1.205668643944169e-08, "loss": 0.3265, "step": 2365 }, { "epoch": 0.9821798590965602, "grad_norm": 2.3492923712340357, "learning_rate": 9.676468503765356e-09, "loss": 0.3185, "step": 2370 }, { "epoch": 0.984251968503937, "grad_norm": 2.2726204174567806, "learning_rate": 7.557494923214338e-09, "loss": 0.3187, "step": 2375 }, { "epoch": 0.9863240779113137, "grad_norm": 2.502123973833959, "learning_rate": 5.699876626446554e-09, "loss": 0.315, "step": 2380 }, { "epoch": 0.9883961873186904, "grad_norm": 2.5216195549598046, "learning_rate": 4.103710860120513e-09, "loss": 0.3232, "step": 2385 }, { "epoch": 0.9904682967260672, "grad_norm": 2.3006568402247987, "learning_rate": 2.769081183808253e-09, "loss": 0.3132, "step": 2390 }, { "epoch": 0.9925404061334439, "grad_norm": 2.3057165957876977, "learning_rate": 1.69605746561885e-09, "loss": 0.3099, "step": 2395 }, { "epoch": 0.9946125155408205, "grad_norm": 2.3117976401405853, "learning_rate": 8.846958785418969e-10, "loss": 0.3166, "step": 2400 }, { "epoch": 0.9966846249481973, "grad_norm": 2.4474241574477613, "learning_rate": 3.3503889750485794e-10, "loss": 0.3198, "step": 2405 }, { "epoch": 0.998756734355574, "grad_norm": 2.475055392564647, "learning_rate": 4.711529715262231e-11, "loss": 0.3188, "step": 2410 }, { "epoch": 1.0, "eval_loss": 0.2978341579437256, "eval_runtime": 1.1902, "eval_samples_per_second": 2.521, "eval_steps_per_second": 0.84, "step": 2413 }, { "epoch": 1.0, "step": 2413, "total_flos": 252616554577920.0, "train_loss": 0.0, "train_runtime": 0.0113, "train_samples_per_second": 3408953.689, "train_steps_per_second": 213070.643 } ], "logging_steps": 5, "max_steps": 2413, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 252616554577920.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }