{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999953267752413, "eval_steps": 500, "global_step": 26748, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001869289903497909, "grad_norm": 2.171574354171753, "learning_rate": 1.9999998275643267e-05, "loss": 0.8437, "step": 5 }, { "epoch": 0.0003738579806995818, "grad_norm": 2.646630048751831, "learning_rate": 1.999999310257367e-05, "loss": 0.6249, "step": 10 }, { "epoch": 0.0005607869710493726, "grad_norm": 1.4575234651565552, "learning_rate": 1.9999984480792985e-05, "loss": 0.4614, "step": 15 }, { "epoch": 0.0007477159613991635, "grad_norm": 0.6195482015609741, "learning_rate": 1.9999972410304184e-05, "loss": 0.4029, "step": 20 }, { "epoch": 0.0009346449517489544, "grad_norm": 1.0422546863555908, "learning_rate": 1.999995689111144e-05, "loss": 0.4514, "step": 25 }, { "epoch": 0.0011215739420987452, "grad_norm": 1.1793930530548096, "learning_rate": 1.9999937923220094e-05, "loss": 0.3913, "step": 30 }, { "epoch": 0.0013085029324485362, "grad_norm": 0.8712885975837708, "learning_rate": 1.9999915506636697e-05, "loss": 0.4137, "step": 35 }, { "epoch": 0.001495431922798327, "grad_norm": 0.795931875705719, "learning_rate": 1.9999889641368975e-05, "loss": 0.4616, "step": 40 }, { "epoch": 0.0016823609131481178, "grad_norm": 1.453735589981079, "learning_rate": 1.9999860327425846e-05, "loss": 0.4453, "step": 45 }, { "epoch": 0.0018692899034979088, "grad_norm": 0.865328311920166, "learning_rate": 1.9999827564817424e-05, "loss": 0.358, "step": 50 }, { "epoch": 0.0020562188938476997, "grad_norm": 0.9025278091430664, "learning_rate": 1.9999791353555008e-05, "loss": 0.4263, "step": 55 }, { "epoch": 0.0022431478841974904, "grad_norm": 0.8391484618186951, "learning_rate": 1.9999751693651083e-05, "loss": 0.3838, "step": 60 }, { "epoch": 0.002430076874547281, "grad_norm": 0.7536634206771851, "learning_rate": 1.9999708585119328e-05, "loss": 0.5194, "step": 65 }, { "epoch": 0.0026170058648970723, "grad_norm": 1.1140562295913696, "learning_rate": 1.999966202797461e-05, "loss": 0.379, "step": 70 }, { "epoch": 0.002803934855246863, "grad_norm": 0.9421278238296509, "learning_rate": 1.9999612022232982e-05, "loss": 0.3929, "step": 75 }, { "epoch": 0.002990863845596654, "grad_norm": 0.5064008831977844, "learning_rate": 1.9999558567911697e-05, "loss": 0.4323, "step": 80 }, { "epoch": 0.003177792835946445, "grad_norm": 0.7780022621154785, "learning_rate": 1.9999501665029185e-05, "loss": 0.3719, "step": 85 }, { "epoch": 0.0033647218262962356, "grad_norm": 1.0911474227905273, "learning_rate": 1.9999441313605068e-05, "loss": 0.3523, "step": 90 }, { "epoch": 0.003551650816646027, "grad_norm": 0.665628969669342, "learning_rate": 1.9999377513660167e-05, "loss": 0.3953, "step": 95 }, { "epoch": 0.0037385798069958175, "grad_norm": 1.2322030067443848, "learning_rate": 1.9999310265216472e-05, "loss": 0.4117, "step": 100 }, { "epoch": 0.003925508797345608, "grad_norm": 0.8758599758148193, "learning_rate": 1.999923956829719e-05, "loss": 0.3031, "step": 105 }, { "epoch": 0.004112437787695399, "grad_norm": 0.8419223427772522, "learning_rate": 1.9999165422926696e-05, "loss": 0.3016, "step": 110 }, { "epoch": 0.00429936677804519, "grad_norm": 1.1489617824554443, "learning_rate": 1.9999087829130554e-05, "loss": 0.4597, "step": 115 }, { "epoch": 0.004486295768394981, "grad_norm": 0.46470773220062256, "learning_rate": 1.9999006786935532e-05, "loss": 0.337, "step": 120 }, { "epoch": 0.004673224758744772, "grad_norm": 0.4822236895561218, "learning_rate": 1.999892229636958e-05, "loss": 0.3476, "step": 125 }, { "epoch": 0.004860153749094562, "grad_norm": 0.691170334815979, "learning_rate": 1.9998834357461834e-05, "loss": 0.3542, "step": 130 }, { "epoch": 0.0050470827394443534, "grad_norm": 0.6940286755561829, "learning_rate": 1.9998742970242614e-05, "loss": 0.3268, "step": 135 }, { "epoch": 0.005234011729794145, "grad_norm": 0.943358302116394, "learning_rate": 1.999864813474345e-05, "loss": 0.3943, "step": 140 }, { "epoch": 0.005420940720143935, "grad_norm": 1.0193995237350464, "learning_rate": 1.9998549850997044e-05, "loss": 0.3066, "step": 145 }, { "epoch": 0.005607869710493726, "grad_norm": 0.7878953814506531, "learning_rate": 1.999844811903728e-05, "loss": 0.3635, "step": 150 }, { "epoch": 0.005794798700843517, "grad_norm": 0.4438617527484894, "learning_rate": 1.9998342938899257e-05, "loss": 0.4297, "step": 155 }, { "epoch": 0.005981727691193308, "grad_norm": 0.35109004378318787, "learning_rate": 1.999823431061924e-05, "loss": 0.4432, "step": 160 }, { "epoch": 0.006168656681543099, "grad_norm": 0.7942967414855957, "learning_rate": 1.99981222342347e-05, "loss": 0.4196, "step": 165 }, { "epoch": 0.00635558567189289, "grad_norm": 0.8898890018463135, "learning_rate": 1.9998006709784283e-05, "loss": 0.3688, "step": 170 }, { "epoch": 0.006542514662242681, "grad_norm": 1.1046160459518433, "learning_rate": 1.999788773730783e-05, "loss": 0.3895, "step": 175 }, { "epoch": 0.006729443652592471, "grad_norm": 0.5191996693611145, "learning_rate": 1.999776531684637e-05, "loss": 0.3963, "step": 180 }, { "epoch": 0.006916372642942262, "grad_norm": 0.5099379420280457, "learning_rate": 1.9997639448442125e-05, "loss": 0.3328, "step": 185 }, { "epoch": 0.007103301633292054, "grad_norm": 1.2763317823410034, "learning_rate": 1.9997510132138505e-05, "loss": 0.2926, "step": 190 }, { "epoch": 0.007290230623641844, "grad_norm": 1.150545358657837, "learning_rate": 1.9997377367980104e-05, "loss": 0.4188, "step": 195 }, { "epoch": 0.007477159613991635, "grad_norm": 0.8739756941795349, "learning_rate": 1.999724115601271e-05, "loss": 0.4185, "step": 200 }, { "epoch": 0.007664088604341426, "grad_norm": 1.0264869928359985, "learning_rate": 1.99971014962833e-05, "loss": 0.3185, "step": 205 }, { "epoch": 0.007851017594691216, "grad_norm": 0.5631968975067139, "learning_rate": 1.9996958388840036e-05, "loss": 0.3156, "step": 210 }, { "epoch": 0.008037946585041008, "grad_norm": 0.6525606513023376, "learning_rate": 1.999681183373227e-05, "loss": 0.3579, "step": 215 }, { "epoch": 0.008224875575390799, "grad_norm": 0.894953727722168, "learning_rate": 1.999666183101055e-05, "loss": 0.3544, "step": 220 }, { "epoch": 0.00841180456574059, "grad_norm": 0.5953158140182495, "learning_rate": 1.9996508380726608e-05, "loss": 0.3366, "step": 225 }, { "epoch": 0.00859873355609038, "grad_norm": 0.47904253005981445, "learning_rate": 1.9996351482933355e-05, "loss": 0.448, "step": 230 }, { "epoch": 0.00878566254644017, "grad_norm": 0.9660627841949463, "learning_rate": 1.9996191137684913e-05, "loss": 0.4367, "step": 235 }, { "epoch": 0.008972591536789962, "grad_norm": 1.3929287195205688, "learning_rate": 1.9996027345036574e-05, "loss": 0.4314, "step": 240 }, { "epoch": 0.009159520527139753, "grad_norm": 0.5760064721107483, "learning_rate": 1.999586010504482e-05, "loss": 0.389, "step": 245 }, { "epoch": 0.009346449517489544, "grad_norm": 1.9712883234024048, "learning_rate": 1.999568941776734e-05, "loss": 0.3424, "step": 250 }, { "epoch": 0.009533378507839335, "grad_norm": 0.6370198726654053, "learning_rate": 1.9995515283262993e-05, "loss": 0.4801, "step": 255 }, { "epoch": 0.009720307498189125, "grad_norm": 1.003278136253357, "learning_rate": 1.999533770159183e-05, "loss": 0.3515, "step": 260 }, { "epoch": 0.009907236488538916, "grad_norm": 0.4942162036895752, "learning_rate": 1.9995156672815096e-05, "loss": 0.3689, "step": 265 }, { "epoch": 0.010094165478888707, "grad_norm": 0.5202749371528625, "learning_rate": 1.9994972196995223e-05, "loss": 0.3163, "step": 270 }, { "epoch": 0.010281094469238498, "grad_norm": 0.6780266165733337, "learning_rate": 1.9994784274195834e-05, "loss": 0.3699, "step": 275 }, { "epoch": 0.01046802345958829, "grad_norm": 0.40867510437965393, "learning_rate": 1.9994592904481732e-05, "loss": 0.3337, "step": 280 }, { "epoch": 0.01065495244993808, "grad_norm": 0.36129143834114075, "learning_rate": 1.999439808791892e-05, "loss": 0.3753, "step": 285 }, { "epoch": 0.01084188144028787, "grad_norm": 0.4958774447441101, "learning_rate": 1.9994199824574583e-05, "loss": 0.3724, "step": 290 }, { "epoch": 0.011028810430637661, "grad_norm": 0.9592061042785645, "learning_rate": 1.9993998114517096e-05, "loss": 0.4238, "step": 295 }, { "epoch": 0.011215739420987452, "grad_norm": 0.6660071015357971, "learning_rate": 1.9993792957816027e-05, "loss": 0.3774, "step": 300 }, { "epoch": 0.011402668411337243, "grad_norm": 1.37131929397583, "learning_rate": 1.999358435454212e-05, "loss": 0.4088, "step": 305 }, { "epoch": 0.011589597401687034, "grad_norm": 0.8485438227653503, "learning_rate": 1.9993372304767327e-05, "loss": 0.3734, "step": 310 }, { "epoch": 0.011776526392036826, "grad_norm": 0.41537079215049744, "learning_rate": 1.999315680856477e-05, "loss": 0.2756, "step": 315 }, { "epoch": 0.011963455382386617, "grad_norm": 0.36821046471595764, "learning_rate": 1.999293786600877e-05, "loss": 0.3323, "step": 320 }, { "epoch": 0.012150384372736406, "grad_norm": 0.6895926594734192, "learning_rate": 1.9992715477174832e-05, "loss": 0.2989, "step": 325 }, { "epoch": 0.012337313363086197, "grad_norm": 0.7400907278060913, "learning_rate": 1.9992489642139654e-05, "loss": 0.4937, "step": 330 }, { "epoch": 0.012524242353435988, "grad_norm": 0.6539915800094604, "learning_rate": 1.999226036098112e-05, "loss": 0.2889, "step": 335 }, { "epoch": 0.01271117134378578, "grad_norm": 0.7676389217376709, "learning_rate": 1.9992027633778303e-05, "loss": 0.3172, "step": 340 }, { "epoch": 0.01289810033413557, "grad_norm": 0.6981975436210632, "learning_rate": 1.9991791460611464e-05, "loss": 0.3242, "step": 345 }, { "epoch": 0.013085029324485362, "grad_norm": 0.6231608986854553, "learning_rate": 1.999155184156205e-05, "loss": 0.4696, "step": 350 }, { "epoch": 0.013271958314835151, "grad_norm": 0.5723335146903992, "learning_rate": 1.9991308776712697e-05, "loss": 0.3813, "step": 355 }, { "epoch": 0.013458887305184943, "grad_norm": 0.5270363092422485, "learning_rate": 1.9991062266147237e-05, "loss": 0.3927, "step": 360 }, { "epoch": 0.013645816295534734, "grad_norm": 1.0495415925979614, "learning_rate": 1.999081230995068e-05, "loss": 0.3547, "step": 365 }, { "epoch": 0.013832745285884525, "grad_norm": 0.9163959622383118, "learning_rate": 1.9990558908209234e-05, "loss": 0.3432, "step": 370 }, { "epoch": 0.014019674276234316, "grad_norm": 1.1537407636642456, "learning_rate": 1.9990302061010282e-05, "loss": 0.3514, "step": 375 }, { "epoch": 0.014206603266584107, "grad_norm": 0.7320118546485901, "learning_rate": 1.999004176844241e-05, "loss": 0.4305, "step": 380 }, { "epoch": 0.014393532256933897, "grad_norm": 0.47620531916618347, "learning_rate": 1.9989778030595385e-05, "loss": 0.3117, "step": 385 }, { "epoch": 0.014580461247283688, "grad_norm": 0.5840729475021362, "learning_rate": 1.9989510847560157e-05, "loss": 0.3125, "step": 390 }, { "epoch": 0.014767390237633479, "grad_norm": 0.5005705952644348, "learning_rate": 1.9989240219428873e-05, "loss": 0.2977, "step": 395 }, { "epoch": 0.01495431922798327, "grad_norm": 0.8225285410881042, "learning_rate": 1.9988966146294867e-05, "loss": 0.3511, "step": 400 }, { "epoch": 0.015141248218333061, "grad_norm": 0.5237758755683899, "learning_rate": 1.9988688628252656e-05, "loss": 0.3672, "step": 405 }, { "epoch": 0.015328177208682852, "grad_norm": 0.6276610493659973, "learning_rate": 1.9988407665397952e-05, "loss": 0.3592, "step": 410 }, { "epoch": 0.015515106199032642, "grad_norm": 0.6155140995979309, "learning_rate": 1.9988123257827646e-05, "loss": 0.3913, "step": 415 }, { "epoch": 0.015702035189382433, "grad_norm": 1.1031259298324585, "learning_rate": 1.9987835405639827e-05, "loss": 0.3597, "step": 420 }, { "epoch": 0.015888964179732224, "grad_norm": 0.9590380787849426, "learning_rate": 1.9987544108933758e-05, "loss": 0.3436, "step": 425 }, { "epoch": 0.016075893170082015, "grad_norm": 0.6944395899772644, "learning_rate": 1.998724936780991e-05, "loss": 0.4223, "step": 430 }, { "epoch": 0.016262822160431806, "grad_norm": 0.8819741606712341, "learning_rate": 1.9986951182369923e-05, "loss": 0.3349, "step": 435 }, { "epoch": 0.016449751150781598, "grad_norm": 0.8658530712127686, "learning_rate": 1.998664955271664e-05, "loss": 0.3447, "step": 440 }, { "epoch": 0.01663668014113139, "grad_norm": 0.7974675297737122, "learning_rate": 1.9986344478954078e-05, "loss": 0.4101, "step": 445 }, { "epoch": 0.01682360913148118, "grad_norm": 0.46253588795661926, "learning_rate": 1.998603596118745e-05, "loss": 0.3763, "step": 450 }, { "epoch": 0.01701053812183097, "grad_norm": 0.67314213514328, "learning_rate": 1.9985723999523154e-05, "loss": 0.3419, "step": 455 }, { "epoch": 0.01719746711218076, "grad_norm": 0.5673795938491821, "learning_rate": 1.998540859406878e-05, "loss": 0.3533, "step": 460 }, { "epoch": 0.01738439610253055, "grad_norm": 0.34820371866226196, "learning_rate": 1.99850897449331e-05, "loss": 0.3095, "step": 465 }, { "epoch": 0.01757132509288034, "grad_norm": 1.0526245832443237, "learning_rate": 1.998476745222607e-05, "loss": 0.3775, "step": 470 }, { "epoch": 0.017758254083230132, "grad_norm": 0.2607547342777252, "learning_rate": 1.9984441716058855e-05, "loss": 0.319, "step": 475 }, { "epoch": 0.017945183073579923, "grad_norm": 0.5493923425674438, "learning_rate": 1.9984112536543774e-05, "loss": 0.2914, "step": 480 }, { "epoch": 0.018132112063929715, "grad_norm": 0.6481868028640747, "learning_rate": 1.9983779913794366e-05, "loss": 0.39, "step": 485 }, { "epoch": 0.018319041054279506, "grad_norm": 0.4423280954360962, "learning_rate": 1.9983443847925334e-05, "loss": 0.383, "step": 490 }, { "epoch": 0.018505970044629297, "grad_norm": 0.24521741271018982, "learning_rate": 1.998310433905258e-05, "loss": 0.3797, "step": 495 }, { "epoch": 0.018692899034979088, "grad_norm": 0.4319721460342407, "learning_rate": 1.9982761387293196e-05, "loss": 0.3635, "step": 500 }, { "epoch": 0.01887982802532888, "grad_norm": 0.6990941166877747, "learning_rate": 1.998241499276545e-05, "loss": 0.2869, "step": 505 }, { "epoch": 0.01906675701567867, "grad_norm": 0.6501472592353821, "learning_rate": 1.9982065155588803e-05, "loss": 0.4004, "step": 510 }, { "epoch": 0.01925368600602846, "grad_norm": 0.5194607973098755, "learning_rate": 1.9981711875883908e-05, "loss": 0.3062, "step": 515 }, { "epoch": 0.01944061499637825, "grad_norm": 0.5444199442863464, "learning_rate": 1.9981355153772603e-05, "loss": 0.3218, "step": 520 }, { "epoch": 0.01962754398672804, "grad_norm": 0.8375357985496521, "learning_rate": 1.9980994989377902e-05, "loss": 0.2581, "step": 525 }, { "epoch": 0.01981447297707783, "grad_norm": 0.47779718041419983, "learning_rate": 1.998063138282402e-05, "loss": 0.2845, "step": 530 }, { "epoch": 0.020001401967427623, "grad_norm": 0.5198971629142761, "learning_rate": 1.998026433423636e-05, "loss": 0.3539, "step": 535 }, { "epoch": 0.020188330957777414, "grad_norm": 0.6025378108024597, "learning_rate": 1.9979893843741498e-05, "loss": 0.3003, "step": 540 }, { "epoch": 0.020375259948127205, "grad_norm": 0.454045832157135, "learning_rate": 1.997951991146721e-05, "loss": 0.3132, "step": 545 }, { "epoch": 0.020562188938476996, "grad_norm": 0.9200149178504944, "learning_rate": 1.9979142537542455e-05, "loss": 0.3469, "step": 550 }, { "epoch": 0.020749117928826787, "grad_norm": 0.5452691912651062, "learning_rate": 1.997876172209738e-05, "loss": 0.3445, "step": 555 }, { "epoch": 0.02093604691917658, "grad_norm": 0.46097660064697266, "learning_rate": 1.997837746526331e-05, "loss": 0.3302, "step": 560 }, { "epoch": 0.02112297590952637, "grad_norm": 0.7622095346450806, "learning_rate": 1.997798976717277e-05, "loss": 0.3539, "step": 565 }, { "epoch": 0.02130990489987616, "grad_norm": 0.44673827290534973, "learning_rate": 1.9977598627959467e-05, "loss": 0.3418, "step": 570 }, { "epoch": 0.021496833890225952, "grad_norm": 0.38425078988075256, "learning_rate": 1.9977204047758293e-05, "loss": 0.342, "step": 575 }, { "epoch": 0.02168376288057574, "grad_norm": 1.0525974035263062, "learning_rate": 1.997680602670532e-05, "loss": 0.3249, "step": 580 }, { "epoch": 0.02187069187092553, "grad_norm": 0.5911397933959961, "learning_rate": 1.9976404564937825e-05, "loss": 0.2806, "step": 585 }, { "epoch": 0.022057620861275322, "grad_norm": 0.7274040579795837, "learning_rate": 1.9975999662594254e-05, "loss": 0.3742, "step": 590 }, { "epoch": 0.022244549851625113, "grad_norm": 0.8570606708526611, "learning_rate": 1.9975591319814248e-05, "loss": 0.4207, "step": 595 }, { "epoch": 0.022431478841974904, "grad_norm": 0.6169318556785583, "learning_rate": 1.9975179536738633e-05, "loss": 0.3025, "step": 600 }, { "epoch": 0.022618407832324695, "grad_norm": 0.7750716805458069, "learning_rate": 1.997476431350942e-05, "loss": 0.2713, "step": 605 }, { "epoch": 0.022805336822674487, "grad_norm": 0.6509002447128296, "learning_rate": 1.9974345650269812e-05, "loss": 0.2948, "step": 610 }, { "epoch": 0.022992265813024278, "grad_norm": 0.875956654548645, "learning_rate": 1.9973923547164183e-05, "loss": 0.4136, "step": 615 }, { "epoch": 0.02317919480337407, "grad_norm": 0.5139532089233398, "learning_rate": 1.9973498004338115e-05, "loss": 0.3333, "step": 620 }, { "epoch": 0.02336612379372386, "grad_norm": 0.7778888940811157, "learning_rate": 1.9973069021938366e-05, "loss": 0.3016, "step": 625 }, { "epoch": 0.02355305278407365, "grad_norm": 0.4906633794307709, "learning_rate": 1.9972636600112873e-05, "loss": 0.3478, "step": 630 }, { "epoch": 0.023739981774423442, "grad_norm": 0.43176034092903137, "learning_rate": 1.997220073901077e-05, "loss": 0.2925, "step": 635 }, { "epoch": 0.023926910764773233, "grad_norm": 0.6139638423919678, "learning_rate": 1.997176143878237e-05, "loss": 0.336, "step": 640 }, { "epoch": 0.02411383975512302, "grad_norm": 0.5035068988800049, "learning_rate": 1.9971318699579177e-05, "loss": 0.322, "step": 645 }, { "epoch": 0.024300768745472812, "grad_norm": 0.8413116931915283, "learning_rate": 1.997087252155388e-05, "loss": 0.437, "step": 650 }, { "epoch": 0.024487697735822603, "grad_norm": 0.33662039041519165, "learning_rate": 1.9970422904860352e-05, "loss": 0.2931, "step": 655 }, { "epoch": 0.024674626726172395, "grad_norm": 0.6480875611305237, "learning_rate": 1.9969969849653652e-05, "loss": 0.2838, "step": 660 }, { "epoch": 0.024861555716522186, "grad_norm": 1.1229877471923828, "learning_rate": 1.9969513356090027e-05, "loss": 0.3157, "step": 665 }, { "epoch": 0.025048484706871977, "grad_norm": 0.5545793771743774, "learning_rate": 1.9969053424326908e-05, "loss": 0.4423, "step": 670 }, { "epoch": 0.025235413697221768, "grad_norm": 0.5844056010246277, "learning_rate": 1.9968590054522914e-05, "loss": 0.4168, "step": 675 }, { "epoch": 0.02542234268757156, "grad_norm": 0.5306451916694641, "learning_rate": 1.996812324683784e-05, "loss": 0.3978, "step": 680 }, { "epoch": 0.02560927167792135, "grad_norm": 0.4964573085308075, "learning_rate": 1.996765300143269e-05, "loss": 0.351, "step": 685 }, { "epoch": 0.02579620066827114, "grad_norm": 0.4789879024028778, "learning_rate": 1.9967179318469626e-05, "loss": 0.3372, "step": 690 }, { "epoch": 0.025983129658620933, "grad_norm": 0.48183321952819824, "learning_rate": 1.996670219811201e-05, "loss": 0.2788, "step": 695 }, { "epoch": 0.026170058648970724, "grad_norm": 0.2901974320411682, "learning_rate": 1.996622164052439e-05, "loss": 0.3617, "step": 700 }, { "epoch": 0.02635698763932051, "grad_norm": 0.8643401861190796, "learning_rate": 1.996573764587249e-05, "loss": 0.3683, "step": 705 }, { "epoch": 0.026543916629670303, "grad_norm": 0.6790117621421814, "learning_rate": 1.9965250214323232e-05, "loss": 0.4397, "step": 710 }, { "epoch": 0.026730845620020094, "grad_norm": 0.7502917051315308, "learning_rate": 1.996475934604472e-05, "loss": 0.3743, "step": 715 }, { "epoch": 0.026917774610369885, "grad_norm": 0.6247079372406006, "learning_rate": 1.996426504120623e-05, "loss": 0.3414, "step": 720 }, { "epoch": 0.027104703600719676, "grad_norm": 1.0865947008132935, "learning_rate": 1.9963767299978243e-05, "loss": 0.332, "step": 725 }, { "epoch": 0.027291632591069467, "grad_norm": 0.8259230256080627, "learning_rate": 1.9963266122532416e-05, "loss": 0.3709, "step": 730 }, { "epoch": 0.02747856158141926, "grad_norm": 0.7781583070755005, "learning_rate": 1.9962761509041578e-05, "loss": 0.357, "step": 735 }, { "epoch": 0.02766549057176905, "grad_norm": 0.4106627404689789, "learning_rate": 1.996225345967977e-05, "loss": 0.4027, "step": 740 }, { "epoch": 0.02785241956211884, "grad_norm": 0.538112223148346, "learning_rate": 1.99617419746222e-05, "loss": 0.3817, "step": 745 }, { "epoch": 0.028039348552468632, "grad_norm": 0.4181678593158722, "learning_rate": 1.996122705404526e-05, "loss": 0.3379, "step": 750 }, { "epoch": 0.028226277542818423, "grad_norm": 0.7423063516616821, "learning_rate": 1.9960708698126536e-05, "loss": 0.3501, "step": 755 }, { "epoch": 0.028413206533168214, "grad_norm": 0.9700421690940857, "learning_rate": 1.996018690704479e-05, "loss": 0.3911, "step": 760 }, { "epoch": 0.028600135523518002, "grad_norm": 0.6143549084663391, "learning_rate": 1.995966168097998e-05, "loss": 0.3379, "step": 765 }, { "epoch": 0.028787064513867793, "grad_norm": 0.5459659695625305, "learning_rate": 1.995913302011323e-05, "loss": 0.3502, "step": 770 }, { "epoch": 0.028973993504217584, "grad_norm": 0.8577987551689148, "learning_rate": 1.9958600924626873e-05, "loss": 0.3926, "step": 775 }, { "epoch": 0.029160922494567375, "grad_norm": 0.8069790601730347, "learning_rate": 1.9958065394704406e-05, "loss": 0.3648, "step": 780 }, { "epoch": 0.029347851484917167, "grad_norm": 1.1032791137695312, "learning_rate": 1.9957526430530514e-05, "loss": 0.3606, "step": 785 }, { "epoch": 0.029534780475266958, "grad_norm": 0.5585815906524658, "learning_rate": 1.995698403229108e-05, "loss": 0.3777, "step": 790 }, { "epoch": 0.02972170946561675, "grad_norm": 0.524401843547821, "learning_rate": 1.9956438200173155e-05, "loss": 0.2764, "step": 795 }, { "epoch": 0.02990863845596654, "grad_norm": 0.4294147789478302, "learning_rate": 1.9955888934364985e-05, "loss": 0.3334, "step": 800 }, { "epoch": 0.03009556744631633, "grad_norm": 0.6010425686836243, "learning_rate": 1.995533623505599e-05, "loss": 0.3192, "step": 805 }, { "epoch": 0.030282496436666122, "grad_norm": 0.8194842338562012, "learning_rate": 1.9954780102436786e-05, "loss": 0.3757, "step": 810 }, { "epoch": 0.030469425427015914, "grad_norm": 0.7027654647827148, "learning_rate": 1.995422053669917e-05, "loss": 0.3285, "step": 815 }, { "epoch": 0.030656354417365705, "grad_norm": 0.3135683238506317, "learning_rate": 1.9953657538036105e-05, "loss": 0.3776, "step": 820 }, { "epoch": 0.030843283407715492, "grad_norm": 0.45088908076286316, "learning_rate": 1.9953091106641772e-05, "loss": 0.3567, "step": 825 }, { "epoch": 0.031030212398065284, "grad_norm": 0.6652802228927612, "learning_rate": 1.9952521242711504e-05, "loss": 0.3103, "step": 830 }, { "epoch": 0.031217141388415075, "grad_norm": 0.396379292011261, "learning_rate": 1.9951947946441835e-05, "loss": 0.3106, "step": 835 }, { "epoch": 0.031404070378764866, "grad_norm": 0.5185825228691101, "learning_rate": 1.9951371218030483e-05, "loss": 0.3243, "step": 840 }, { "epoch": 0.03159099936911466, "grad_norm": 0.48012250661849976, "learning_rate": 1.995079105767634e-05, "loss": 0.352, "step": 845 }, { "epoch": 0.03177792835946445, "grad_norm": 0.576318621635437, "learning_rate": 1.9950207465579483e-05, "loss": 0.4734, "step": 850 }, { "epoch": 0.03196485734981424, "grad_norm": 0.41435566544532776, "learning_rate": 1.9949620441941183e-05, "loss": 0.3589, "step": 855 }, { "epoch": 0.03215178634016403, "grad_norm": 0.5953770875930786, "learning_rate": 1.994902998696389e-05, "loss": 0.3151, "step": 860 }, { "epoch": 0.03233871533051382, "grad_norm": 0.43497735261917114, "learning_rate": 1.9948436100851224e-05, "loss": 0.3793, "step": 865 }, { "epoch": 0.03252564432086361, "grad_norm": 0.8335877656936646, "learning_rate": 1.9947838783808005e-05, "loss": 0.362, "step": 870 }, { "epoch": 0.032712573311213404, "grad_norm": 0.9302036762237549, "learning_rate": 1.9947238036040234e-05, "loss": 0.2667, "step": 875 }, { "epoch": 0.032899502301563195, "grad_norm": 0.733805239200592, "learning_rate": 1.994663385775509e-05, "loss": 0.3825, "step": 880 }, { "epoch": 0.033086431291912986, "grad_norm": 0.46970003843307495, "learning_rate": 1.9946026249160935e-05, "loss": 0.2545, "step": 885 }, { "epoch": 0.03327336028226278, "grad_norm": 0.9111189246177673, "learning_rate": 1.9945415210467316e-05, "loss": 0.3019, "step": 890 }, { "epoch": 0.03346028927261257, "grad_norm": 0.35354894399642944, "learning_rate": 1.9944800741884963e-05, "loss": 0.296, "step": 895 }, { "epoch": 0.03364721826296236, "grad_norm": 0.5118780732154846, "learning_rate": 1.9944182843625786e-05, "loss": 0.3321, "step": 900 }, { "epoch": 0.03383414725331215, "grad_norm": 0.6458944082260132, "learning_rate": 1.9943561515902886e-05, "loss": 0.2565, "step": 905 }, { "epoch": 0.03402107624366194, "grad_norm": 0.34854066371917725, "learning_rate": 1.9942936758930537e-05, "loss": 0.3219, "step": 910 }, { "epoch": 0.034208005234011726, "grad_norm": 0.4601725935935974, "learning_rate": 1.9942308572924204e-05, "loss": 0.274, "step": 915 }, { "epoch": 0.03439493422436152, "grad_norm": 0.998417854309082, "learning_rate": 1.9941676958100526e-05, "loss": 0.3507, "step": 920 }, { "epoch": 0.03458186321471131, "grad_norm": 0.7837041616439819, "learning_rate": 1.9941041914677327e-05, "loss": 0.3192, "step": 925 }, { "epoch": 0.0347687922050611, "grad_norm": 0.541322648525238, "learning_rate": 1.994040344287362e-05, "loss": 0.3525, "step": 930 }, { "epoch": 0.03495572119541089, "grad_norm": 0.7367837429046631, "learning_rate": 1.9939761542909594e-05, "loss": 0.3349, "step": 935 }, { "epoch": 0.03514265018576068, "grad_norm": 0.6418364644050598, "learning_rate": 1.9939116215006626e-05, "loss": 0.3164, "step": 940 }, { "epoch": 0.03532957917611047, "grad_norm": 0.6306796669960022, "learning_rate": 1.993846745938726e-05, "loss": 0.3192, "step": 945 }, { "epoch": 0.035516508166460264, "grad_norm": 0.2718103229999542, "learning_rate": 1.9937815276275247e-05, "loss": 0.3019, "step": 950 }, { "epoch": 0.035703437156810056, "grad_norm": 0.8403748869895935, "learning_rate": 1.9937159665895494e-05, "loss": 0.2813, "step": 955 }, { "epoch": 0.03589036614715985, "grad_norm": 0.4756128191947937, "learning_rate": 1.9936500628474115e-05, "loss": 0.4187, "step": 960 }, { "epoch": 0.03607729513750964, "grad_norm": 0.8436702489852905, "learning_rate": 1.993583816423838e-05, "loss": 0.3813, "step": 965 }, { "epoch": 0.03626422412785943, "grad_norm": 0.3379393517971039, "learning_rate": 1.9935172273416762e-05, "loss": 0.3279, "step": 970 }, { "epoch": 0.03645115311820922, "grad_norm": 0.5459058880805969, "learning_rate": 1.9934502956238905e-05, "loss": 0.3037, "step": 975 }, { "epoch": 0.03663808210855901, "grad_norm": 0.6367349028587341, "learning_rate": 1.9933830212935637e-05, "loss": 0.4326, "step": 980 }, { "epoch": 0.0368250110989088, "grad_norm": 0.35184261202812195, "learning_rate": 1.993315404373897e-05, "loss": 0.309, "step": 985 }, { "epoch": 0.037011940089258594, "grad_norm": 0.49862006306648254, "learning_rate": 1.9932474448882097e-05, "loss": 0.3977, "step": 990 }, { "epoch": 0.037198869079608385, "grad_norm": 0.4049328863620758, "learning_rate": 1.9931791428599386e-05, "loss": 0.3397, "step": 995 }, { "epoch": 0.037385798069958176, "grad_norm": 0.4314810633659363, "learning_rate": 1.993110498312639e-05, "loss": 0.3494, "step": 1000 }, { "epoch": 0.03757272706030797, "grad_norm": 0.5296122431755066, "learning_rate": 1.993041511269985e-05, "loss": 0.2799, "step": 1005 }, { "epoch": 0.03775965605065776, "grad_norm": 0.506058394908905, "learning_rate": 1.9929721817557682e-05, "loss": 0.2591, "step": 1010 }, { "epoch": 0.03794658504100755, "grad_norm": 0.49406224489212036, "learning_rate": 1.9929025097938978e-05, "loss": 0.4447, "step": 1015 }, { "epoch": 0.03813351403135734, "grad_norm": 0.4546676576137543, "learning_rate": 1.9928324954084023e-05, "loss": 0.445, "step": 1020 }, { "epoch": 0.03832044302170713, "grad_norm": 0.3749759793281555, "learning_rate": 1.9927621386234274e-05, "loss": 0.2486, "step": 1025 }, { "epoch": 0.03850737201205692, "grad_norm": 1.062105655670166, "learning_rate": 1.9926914394632368e-05, "loss": 0.4197, "step": 1030 }, { "epoch": 0.038694301002406714, "grad_norm": 0.4789983034133911, "learning_rate": 1.992620397952213e-05, "loss": 0.3374, "step": 1035 }, { "epoch": 0.0388812299927565, "grad_norm": 0.4073324501514435, "learning_rate": 1.9925490141148564e-05, "loss": 0.3918, "step": 1040 }, { "epoch": 0.03906815898310629, "grad_norm": 0.6208945512771606, "learning_rate": 1.9924772879757848e-05, "loss": 0.3542, "step": 1045 }, { "epoch": 0.03925508797345608, "grad_norm": 0.4267159104347229, "learning_rate": 1.9924052195597346e-05, "loss": 0.3543, "step": 1050 }, { "epoch": 0.03944201696380587, "grad_norm": 0.7129971385002136, "learning_rate": 1.9923328088915603e-05, "loss": 0.3433, "step": 1055 }, { "epoch": 0.03962894595415566, "grad_norm": 0.7188255786895752, "learning_rate": 1.992260055996234e-05, "loss": 0.311, "step": 1060 }, { "epoch": 0.039815874944505454, "grad_norm": 0.760067343711853, "learning_rate": 1.9921869608988464e-05, "loss": 0.3193, "step": 1065 }, { "epoch": 0.040002803934855245, "grad_norm": 0.5148400664329529, "learning_rate": 1.9921135236246058e-05, "loss": 0.316, "step": 1070 }, { "epoch": 0.040189732925205036, "grad_norm": 0.6598585844039917, "learning_rate": 1.9920397441988384e-05, "loss": 0.3337, "step": 1075 }, { "epoch": 0.04037666191555483, "grad_norm": 0.40405112504959106, "learning_rate": 1.991965622646989e-05, "loss": 0.3585, "step": 1080 }, { "epoch": 0.04056359090590462, "grad_norm": 0.6789656281471252, "learning_rate": 1.9918911589946193e-05, "loss": 0.3226, "step": 1085 }, { "epoch": 0.04075051989625441, "grad_norm": 0.6832015514373779, "learning_rate": 1.99181635326741e-05, "loss": 0.3428, "step": 1090 }, { "epoch": 0.0409374488866042, "grad_norm": 0.4636186361312866, "learning_rate": 1.99174120549116e-05, "loss": 0.4286, "step": 1095 }, { "epoch": 0.04112437787695399, "grad_norm": 0.673987865447998, "learning_rate": 1.9916657156917852e-05, "loss": 0.3035, "step": 1100 }, { "epoch": 0.04131130686730378, "grad_norm": 0.7312000393867493, "learning_rate": 1.99158988389532e-05, "loss": 0.3005, "step": 1105 }, { "epoch": 0.041498235857653575, "grad_norm": 0.45022085309028625, "learning_rate": 1.9915137101279163e-05, "loss": 0.2549, "step": 1110 }, { "epoch": 0.041685164848003366, "grad_norm": 0.695288360118866, "learning_rate": 1.9914371944158445e-05, "loss": 0.367, "step": 1115 }, { "epoch": 0.04187209383835316, "grad_norm": 0.4332289695739746, "learning_rate": 1.9913603367854927e-05, "loss": 0.2729, "step": 1120 }, { "epoch": 0.04205902282870295, "grad_norm": 0.36939793825149536, "learning_rate": 1.9912831372633665e-05, "loss": 0.3085, "step": 1125 }, { "epoch": 0.04224595181905274, "grad_norm": 0.5997341871261597, "learning_rate": 1.99120559587609e-05, "loss": 0.3238, "step": 1130 }, { "epoch": 0.04243288080940253, "grad_norm": 0.5867207646369934, "learning_rate": 1.9911277126504056e-05, "loss": 0.3768, "step": 1135 }, { "epoch": 0.04261980979975232, "grad_norm": 0.45231306552886963, "learning_rate": 1.9910494876131726e-05, "loss": 0.3349, "step": 1140 }, { "epoch": 0.04280673879010211, "grad_norm": 0.9641021490097046, "learning_rate": 1.990970920791368e-05, "loss": 0.3352, "step": 1145 }, { "epoch": 0.042993667780451904, "grad_norm": 0.24853017926216125, "learning_rate": 1.990892012212088e-05, "loss": 0.3269, "step": 1150 }, { "epoch": 0.043180596770801695, "grad_norm": 0.357064813375473, "learning_rate": 1.9908127619025458e-05, "loss": 0.2854, "step": 1155 }, { "epoch": 0.04336752576115148, "grad_norm": 0.6567901968955994, "learning_rate": 1.990733169890072e-05, "loss": 0.4157, "step": 1160 }, { "epoch": 0.04355445475150127, "grad_norm": 0.7041186690330505, "learning_rate": 1.9906532362021164e-05, "loss": 0.3406, "step": 1165 }, { "epoch": 0.04374138374185106, "grad_norm": 0.7448705434799194, "learning_rate": 1.990572960866245e-05, "loss": 0.3272, "step": 1170 }, { "epoch": 0.04392831273220085, "grad_norm": 0.35884958505630493, "learning_rate": 1.9904923439101432e-05, "loss": 0.3539, "step": 1175 }, { "epoch": 0.044115241722550644, "grad_norm": 0.8289523124694824, "learning_rate": 1.9904113853616128e-05, "loss": 0.296, "step": 1180 }, { "epoch": 0.044302170712900435, "grad_norm": 0.5754719972610474, "learning_rate": 1.990330085248575e-05, "loss": 0.3425, "step": 1185 }, { "epoch": 0.044489099703250226, "grad_norm": 0.6339451670646667, "learning_rate": 1.990248443599067e-05, "loss": 0.4721, "step": 1190 }, { "epoch": 0.04467602869360002, "grad_norm": 0.35451385378837585, "learning_rate": 1.9901664604412453e-05, "loss": 0.3476, "step": 1195 }, { "epoch": 0.04486295768394981, "grad_norm": 0.466686487197876, "learning_rate": 1.990084135803383e-05, "loss": 0.3168, "step": 1200 }, { "epoch": 0.0450498866742996, "grad_norm": 0.3746946454048157, "learning_rate": 1.9900014697138718e-05, "loss": 0.2576, "step": 1205 }, { "epoch": 0.04523681566464939, "grad_norm": 0.4259583353996277, "learning_rate": 1.9899184622012208e-05, "loss": 0.3146, "step": 1210 }, { "epoch": 0.04542374465499918, "grad_norm": 0.6210399270057678, "learning_rate": 1.989835113294057e-05, "loss": 0.3105, "step": 1215 }, { "epoch": 0.04561067364534897, "grad_norm": 0.45747750997543335, "learning_rate": 1.989751423021125e-05, "loss": 0.2911, "step": 1220 }, { "epoch": 0.045797602635698764, "grad_norm": 0.4624209702014923, "learning_rate": 1.989667391411287e-05, "loss": 0.3643, "step": 1225 }, { "epoch": 0.045984531626048555, "grad_norm": 0.7682859301567078, "learning_rate": 1.9895830184935233e-05, "loss": 0.3374, "step": 1230 }, { "epoch": 0.04617146061639835, "grad_norm": 0.6099391579627991, "learning_rate": 1.989498304296932e-05, "loss": 0.299, "step": 1235 }, { "epoch": 0.04635838960674814, "grad_norm": 0.5577886700630188, "learning_rate": 1.989413248850728e-05, "loss": 0.3821, "step": 1240 }, { "epoch": 0.04654531859709793, "grad_norm": 0.4436083436012268, "learning_rate": 1.9893278521842448e-05, "loss": 0.358, "step": 1245 }, { "epoch": 0.04673224758744772, "grad_norm": 0.6322022676467896, "learning_rate": 1.989242114326933e-05, "loss": 0.2856, "step": 1250 }, { "epoch": 0.04691917657779751, "grad_norm": 0.4859846234321594, "learning_rate": 1.9891560353083616e-05, "loss": 0.2951, "step": 1255 }, { "epoch": 0.0471061055681473, "grad_norm": 0.4296378791332245, "learning_rate": 1.989069615158217e-05, "loss": 0.3627, "step": 1260 }, { "epoch": 0.047293034558497093, "grad_norm": 0.4696434438228607, "learning_rate": 1.9889828539063017e-05, "loss": 0.3404, "step": 1265 }, { "epoch": 0.047479963548846885, "grad_norm": 0.5244821310043335, "learning_rate": 1.9888957515825383e-05, "loss": 0.3347, "step": 1270 }, { "epoch": 0.047666892539196676, "grad_norm": 0.5117040872573853, "learning_rate": 1.9888083082169657e-05, "loss": 0.3017, "step": 1275 }, { "epoch": 0.04785382152954647, "grad_norm": 0.6786810159683228, "learning_rate": 1.9887205238397405e-05, "loss": 0.3495, "step": 1280 }, { "epoch": 0.04804075051989625, "grad_norm": 0.41801556944847107, "learning_rate": 1.988632398481137e-05, "loss": 0.2713, "step": 1285 }, { "epoch": 0.04822767951024604, "grad_norm": 0.3679557740688324, "learning_rate": 1.988543932171547e-05, "loss": 0.2617, "step": 1290 }, { "epoch": 0.048414608500595833, "grad_norm": 0.3665943145751953, "learning_rate": 1.9884551249414806e-05, "loss": 0.3392, "step": 1295 }, { "epoch": 0.048601537490945625, "grad_norm": 0.6276513934135437, "learning_rate": 1.9883659768215642e-05, "loss": 0.3797, "step": 1300 }, { "epoch": 0.048788466481295416, "grad_norm": 0.3099709451198578, "learning_rate": 1.988276487842543e-05, "loss": 0.384, "step": 1305 }, { "epoch": 0.04897539547164521, "grad_norm": 0.3573191463947296, "learning_rate": 1.9881866580352783e-05, "loss": 0.3194, "step": 1310 }, { "epoch": 0.049162324461995, "grad_norm": 0.4190731644630432, "learning_rate": 1.9880964874307506e-05, "loss": 0.284, "step": 1315 }, { "epoch": 0.04934925345234479, "grad_norm": 0.4039645791053772, "learning_rate": 1.988005976060057e-05, "loss": 0.2548, "step": 1320 }, { "epoch": 0.04953618244269458, "grad_norm": 0.4496673345565796, "learning_rate": 1.987915123954412e-05, "loss": 0.3529, "step": 1325 }, { "epoch": 0.04972311143304437, "grad_norm": 0.8639070987701416, "learning_rate": 1.9878239311451483e-05, "loss": 0.3073, "step": 1330 }, { "epoch": 0.04991004042339416, "grad_norm": 0.5019544959068298, "learning_rate": 1.9877323976637153e-05, "loss": 0.3829, "step": 1335 }, { "epoch": 0.050096969413743954, "grad_norm": 0.5682312846183777, "learning_rate": 1.9876405235416808e-05, "loss": 0.4139, "step": 1340 }, { "epoch": 0.050283898404093745, "grad_norm": 0.671947717666626, "learning_rate": 1.987548308810729e-05, "loss": 0.2905, "step": 1345 }, { "epoch": 0.050470827394443536, "grad_norm": 0.2840399444103241, "learning_rate": 1.9874557535026623e-05, "loss": 0.3269, "step": 1350 }, { "epoch": 0.05065775638479333, "grad_norm": 0.5511133074760437, "learning_rate": 1.9873628576494004e-05, "loss": 0.3221, "step": 1355 }, { "epoch": 0.05084468537514312, "grad_norm": 0.4360320270061493, "learning_rate": 1.9872696212829804e-05, "loss": 0.3094, "step": 1360 }, { "epoch": 0.05103161436549291, "grad_norm": 0.342729777097702, "learning_rate": 1.987176044435557e-05, "loss": 0.3401, "step": 1365 }, { "epoch": 0.0512185433558427, "grad_norm": 0.43248656392097473, "learning_rate": 1.987082127139402e-05, "loss": 0.2907, "step": 1370 }, { "epoch": 0.05140547234619249, "grad_norm": 0.6353283524513245, "learning_rate": 1.9869878694269048e-05, "loss": 0.3599, "step": 1375 }, { "epoch": 0.05159240133654228, "grad_norm": 0.5560038685798645, "learning_rate": 1.9868932713305723e-05, "loss": 0.3177, "step": 1380 }, { "epoch": 0.051779330326892074, "grad_norm": 0.49626341462135315, "learning_rate": 1.9867983328830283e-05, "loss": 0.3419, "step": 1385 }, { "epoch": 0.051966259317241865, "grad_norm": 0.5227237939834595, "learning_rate": 1.986703054117015e-05, "loss": 0.2357, "step": 1390 }, { "epoch": 0.05215318830759166, "grad_norm": 0.45654603838920593, "learning_rate": 1.986607435065391e-05, "loss": 0.2313, "step": 1395 }, { "epoch": 0.05234011729794145, "grad_norm": 0.4107493460178375, "learning_rate": 1.9865114757611322e-05, "loss": 0.3056, "step": 1400 }, { "epoch": 0.05252704628829123, "grad_norm": 0.9286690354347229, "learning_rate": 1.9864151762373323e-05, "loss": 0.3116, "step": 1405 }, { "epoch": 0.05271397527864102, "grad_norm": 0.5546656847000122, "learning_rate": 1.986318536527203e-05, "loss": 0.3392, "step": 1410 }, { "epoch": 0.052900904268990814, "grad_norm": 0.45263174176216125, "learning_rate": 1.986221556664072e-05, "loss": 0.3175, "step": 1415 }, { "epoch": 0.053087833259340605, "grad_norm": 0.2992209792137146, "learning_rate": 1.9861242366813846e-05, "loss": 0.2616, "step": 1420 }, { "epoch": 0.0532747622496904, "grad_norm": 0.39666202664375305, "learning_rate": 1.9860265766127045e-05, "loss": 0.3728, "step": 1425 }, { "epoch": 0.05346169124004019, "grad_norm": 0.616473376750946, "learning_rate": 1.9859285764917108e-05, "loss": 0.3823, "step": 1430 }, { "epoch": 0.05364862023038998, "grad_norm": 0.3824799954891205, "learning_rate": 1.985830236352202e-05, "loss": 0.2945, "step": 1435 }, { "epoch": 0.05383554922073977, "grad_norm": 0.45680978894233704, "learning_rate": 1.9857315562280923e-05, "loss": 0.3025, "step": 1440 }, { "epoch": 0.05402247821108956, "grad_norm": 0.6660309433937073, "learning_rate": 1.9856325361534133e-05, "loss": 0.2585, "step": 1445 }, { "epoch": 0.05420940720143935, "grad_norm": 0.6838333010673523, "learning_rate": 1.9855331761623143e-05, "loss": 0.2684, "step": 1450 }, { "epoch": 0.054396336191789144, "grad_norm": 0.36576396226882935, "learning_rate": 1.9854334762890626e-05, "loss": 0.3123, "step": 1455 }, { "epoch": 0.054583265182138935, "grad_norm": 0.4945361614227295, "learning_rate": 1.9853334365680408e-05, "loss": 0.3113, "step": 1460 }, { "epoch": 0.054770194172488726, "grad_norm": 0.4163508117198944, "learning_rate": 1.98523305703375e-05, "loss": 0.3394, "step": 1465 }, { "epoch": 0.05495712316283852, "grad_norm": 0.42047226428985596, "learning_rate": 1.985132337720809e-05, "loss": 0.2486, "step": 1470 }, { "epoch": 0.05514405215318831, "grad_norm": 0.4733279347419739, "learning_rate": 1.9850312786639513e-05, "loss": 0.3164, "step": 1475 }, { "epoch": 0.0553309811435381, "grad_norm": 0.36004531383514404, "learning_rate": 1.984929879898031e-05, "loss": 0.2725, "step": 1480 }, { "epoch": 0.05551791013388789, "grad_norm": 0.32165494561195374, "learning_rate": 1.9848281414580167e-05, "loss": 0.3624, "step": 1485 }, { "epoch": 0.05570483912423768, "grad_norm": 0.7197104096412659, "learning_rate": 1.9847260633789953e-05, "loss": 0.3337, "step": 1490 }, { "epoch": 0.05589176811458747, "grad_norm": 0.4705203175544739, "learning_rate": 1.984623645696171e-05, "loss": 0.3503, "step": 1495 }, { "epoch": 0.056078697104937264, "grad_norm": 0.6553797125816345, "learning_rate": 1.984520888444864e-05, "loss": 0.4203, "step": 1500 }, { "epoch": 0.056265626095287055, "grad_norm": 0.46082812547683716, "learning_rate": 1.9844177916605126e-05, "loss": 0.299, "step": 1505 }, { "epoch": 0.056452555085636846, "grad_norm": 0.6781812906265259, "learning_rate": 1.984314355378672e-05, "loss": 0.3063, "step": 1510 }, { "epoch": 0.05663948407598664, "grad_norm": 0.6019364595413208, "learning_rate": 1.9842105796350143e-05, "loss": 0.389, "step": 1515 }, { "epoch": 0.05682641306633643, "grad_norm": 0.4727480113506317, "learning_rate": 1.9841064644653293e-05, "loss": 0.367, "step": 1520 }, { "epoch": 0.05701334205668621, "grad_norm": 0.5794846415519714, "learning_rate": 1.9840020099055226e-05, "loss": 0.3148, "step": 1525 }, { "epoch": 0.057200271047036004, "grad_norm": 0.48816463351249695, "learning_rate": 1.983897215991618e-05, "loss": 0.3408, "step": 1530 }, { "epoch": 0.057387200037385795, "grad_norm": 0.2970341145992279, "learning_rate": 1.983792082759756e-05, "loss": 0.3218, "step": 1535 }, { "epoch": 0.057574129027735586, "grad_norm": 0.2880699336528778, "learning_rate": 1.9836866102461933e-05, "loss": 0.3219, "step": 1540 }, { "epoch": 0.05776105801808538, "grad_norm": 0.368488609790802, "learning_rate": 1.9835807984873055e-05, "loss": 0.3782, "step": 1545 }, { "epoch": 0.05794798700843517, "grad_norm": 0.5185800194740295, "learning_rate": 1.983474647519583e-05, "loss": 0.2973, "step": 1550 }, { "epoch": 0.05813491599878496, "grad_norm": 0.5686115026473999, "learning_rate": 1.9833681573796352e-05, "loss": 0.317, "step": 1555 }, { "epoch": 0.05832184498913475, "grad_norm": 0.6750605702400208, "learning_rate": 1.983261328104187e-05, "loss": 0.3243, "step": 1560 }, { "epoch": 0.05850877397948454, "grad_norm": 0.36497625708580017, "learning_rate": 1.9831541597300804e-05, "loss": 0.3339, "step": 1565 }, { "epoch": 0.05869570296983433, "grad_norm": 0.5015937089920044, "learning_rate": 1.983046652294275e-05, "loss": 0.3349, "step": 1570 }, { "epoch": 0.058882631960184124, "grad_norm": 0.36055922508239746, "learning_rate": 1.982938805833847e-05, "loss": 0.2826, "step": 1575 }, { "epoch": 0.059069560950533916, "grad_norm": 0.36943939328193665, "learning_rate": 1.9828306203859896e-05, "loss": 0.3517, "step": 1580 }, { "epoch": 0.05925648994088371, "grad_norm": 0.47426244616508484, "learning_rate": 1.982722095988013e-05, "loss": 0.3198, "step": 1585 }, { "epoch": 0.0594434189312335, "grad_norm": 0.5041938424110413, "learning_rate": 1.9826132326773443e-05, "loss": 0.3636, "step": 1590 }, { "epoch": 0.05963034792158329, "grad_norm": 0.5388785600662231, "learning_rate": 1.982504030491527e-05, "loss": 0.3527, "step": 1595 }, { "epoch": 0.05981727691193308, "grad_norm": 0.399564653635025, "learning_rate": 1.982394489468222e-05, "loss": 0.3333, "step": 1600 }, { "epoch": 0.06000420590228287, "grad_norm": 1.5852733850479126, "learning_rate": 1.9822846096452064e-05, "loss": 0.3851, "step": 1605 }, { "epoch": 0.06019113489263266, "grad_norm": 0.3177071809768677, "learning_rate": 1.982174391060375e-05, "loss": 0.3981, "step": 1610 }, { "epoch": 0.060378063882982454, "grad_norm": 0.9603758454322815, "learning_rate": 1.982063833751739e-05, "loss": 0.3691, "step": 1615 }, { "epoch": 0.060564992873332245, "grad_norm": 0.3188650608062744, "learning_rate": 1.9819529377574265e-05, "loss": 0.3143, "step": 1620 }, { "epoch": 0.060751921863682036, "grad_norm": 0.48925361037254333, "learning_rate": 1.9818417031156826e-05, "loss": 0.3063, "step": 1625 }, { "epoch": 0.06093885085403183, "grad_norm": 0.4794778823852539, "learning_rate": 1.9817301298648683e-05, "loss": 0.3278, "step": 1630 }, { "epoch": 0.06112577984438162, "grad_norm": 0.4850408434867859, "learning_rate": 1.9816182180434622e-05, "loss": 0.3245, "step": 1635 }, { "epoch": 0.06131270883473141, "grad_norm": 0.5055322647094727, "learning_rate": 1.9815059676900597e-05, "loss": 0.2969, "step": 1640 }, { "epoch": 0.0614996378250812, "grad_norm": 0.8014519214630127, "learning_rate": 1.9813933788433724e-05, "loss": 0.2473, "step": 1645 }, { "epoch": 0.061686566815430985, "grad_norm": 0.3348430395126343, "learning_rate": 1.9812804515422298e-05, "loss": 0.3928, "step": 1650 }, { "epoch": 0.061873495805780776, "grad_norm": 0.6837722659111023, "learning_rate": 1.9811671858255764e-05, "loss": 0.2898, "step": 1655 }, { "epoch": 0.06206042479613057, "grad_norm": 0.422375351190567, "learning_rate": 1.9810535817324746e-05, "loss": 0.3522, "step": 1660 }, { "epoch": 0.06224735378648036, "grad_norm": 0.8504202365875244, "learning_rate": 1.980939639302103e-05, "loss": 0.4159, "step": 1665 }, { "epoch": 0.06243428277683015, "grad_norm": 0.31550297141075134, "learning_rate": 1.9808253585737577e-05, "loss": 0.3423, "step": 1670 }, { "epoch": 0.06262121176717994, "grad_norm": 0.52349454164505, "learning_rate": 1.9807107395868503e-05, "loss": 0.337, "step": 1675 }, { "epoch": 0.06280814075752973, "grad_norm": 0.9710644483566284, "learning_rate": 1.9805957823809095e-05, "loss": 0.3331, "step": 1680 }, { "epoch": 0.06299506974787952, "grad_norm": 0.5250943899154663, "learning_rate": 1.9804804869955815e-05, "loss": 0.3059, "step": 1685 }, { "epoch": 0.06318199873822931, "grad_norm": 0.6633884310722351, "learning_rate": 1.980364853470627e-05, "loss": 0.3314, "step": 1690 }, { "epoch": 0.0633689277285791, "grad_norm": 0.35779082775115967, "learning_rate": 1.9802488818459263e-05, "loss": 0.2959, "step": 1695 }, { "epoch": 0.0635558567189289, "grad_norm": 0.3499528169631958, "learning_rate": 1.980132572161474e-05, "loss": 0.3142, "step": 1700 }, { "epoch": 0.06374278570927869, "grad_norm": 0.45091912150382996, "learning_rate": 1.9800159244573813e-05, "loss": 0.342, "step": 1705 }, { "epoch": 0.06392971469962848, "grad_norm": 0.46202903985977173, "learning_rate": 1.9798989387738776e-05, "loss": 0.3052, "step": 1710 }, { "epoch": 0.06411664368997827, "grad_norm": 0.4897025227546692, "learning_rate": 1.9797816151513075e-05, "loss": 0.3864, "step": 1715 }, { "epoch": 0.06430357268032806, "grad_norm": 0.44095808267593384, "learning_rate": 1.979663953630133e-05, "loss": 0.3165, "step": 1720 }, { "epoch": 0.06449050167067785, "grad_norm": 0.4061107337474823, "learning_rate": 1.9795459542509314e-05, "loss": 0.3123, "step": 1725 }, { "epoch": 0.06467743066102764, "grad_norm": 0.37612634897232056, "learning_rate": 1.9794276170543974e-05, "loss": 0.3726, "step": 1730 }, { "epoch": 0.06486435965137743, "grad_norm": 0.8302399516105652, "learning_rate": 1.979308942081343e-05, "loss": 0.3406, "step": 1735 }, { "epoch": 0.06505128864172723, "grad_norm": 0.6120762228965759, "learning_rate": 1.9791899293726947e-05, "loss": 0.3493, "step": 1740 }, { "epoch": 0.06523821763207702, "grad_norm": 1.227718710899353, "learning_rate": 1.9790705789694977e-05, "loss": 0.3243, "step": 1745 }, { "epoch": 0.06542514662242681, "grad_norm": 0.2927764356136322, "learning_rate": 1.978950890912912e-05, "loss": 0.2685, "step": 1750 }, { "epoch": 0.0656120756127766, "grad_norm": 0.18635421991348267, "learning_rate": 1.9788308652442137e-05, "loss": 0.326, "step": 1755 }, { "epoch": 0.06579900460312639, "grad_norm": 0.5031612515449524, "learning_rate": 1.978710502004798e-05, "loss": 0.2777, "step": 1760 }, { "epoch": 0.06598593359347618, "grad_norm": 0.5212199091911316, "learning_rate": 1.9785898012361732e-05, "loss": 0.3813, "step": 1765 }, { "epoch": 0.06617286258382597, "grad_norm": 0.40999093651771545, "learning_rate": 1.978468762979966e-05, "loss": 0.3218, "step": 1770 }, { "epoch": 0.06635979157417576, "grad_norm": 0.29708200693130493, "learning_rate": 1.9783473872779192e-05, "loss": 0.2983, "step": 1775 }, { "epoch": 0.06654672056452555, "grad_norm": 0.48487138748168945, "learning_rate": 1.978225674171892e-05, "loss": 0.2956, "step": 1780 }, { "epoch": 0.06673364955487535, "grad_norm": 0.7466865181922913, "learning_rate": 1.9781036237038593e-05, "loss": 0.4038, "step": 1785 }, { "epoch": 0.06692057854522514, "grad_norm": 0.5608679056167603, "learning_rate": 1.977981235915913e-05, "loss": 0.31, "step": 1790 }, { "epoch": 0.06710750753557493, "grad_norm": 0.3457610011100769, "learning_rate": 1.9778585108502613e-05, "loss": 0.3171, "step": 1795 }, { "epoch": 0.06729443652592472, "grad_norm": 0.3912654221057892, "learning_rate": 1.977735448549228e-05, "loss": 0.2944, "step": 1800 }, { "epoch": 0.06748136551627451, "grad_norm": 0.660007119178772, "learning_rate": 1.9776120490552545e-05, "loss": 0.3354, "step": 1805 }, { "epoch": 0.0676682945066243, "grad_norm": 0.5889789462089539, "learning_rate": 1.9774883124108975e-05, "loss": 0.2601, "step": 1810 }, { "epoch": 0.06785522349697409, "grad_norm": 0.43226462602615356, "learning_rate": 1.97736423865883e-05, "loss": 0.26, "step": 1815 }, { "epoch": 0.06804215248732388, "grad_norm": 0.38729366660118103, "learning_rate": 1.9772398278418414e-05, "loss": 0.2675, "step": 1820 }, { "epoch": 0.06822908147767368, "grad_norm": 0.472239226102829, "learning_rate": 1.9771150800028376e-05, "loss": 0.3722, "step": 1825 }, { "epoch": 0.06841601046802345, "grad_norm": 0.5041793584823608, "learning_rate": 1.976989995184841e-05, "loss": 0.2839, "step": 1830 }, { "epoch": 0.06860293945837324, "grad_norm": 0.5347047448158264, "learning_rate": 1.9768645734309896e-05, "loss": 0.365, "step": 1835 }, { "epoch": 0.06878986844872303, "grad_norm": 0.3948882818222046, "learning_rate": 1.976738814784537e-05, "loss": 0.3337, "step": 1840 }, { "epoch": 0.06897679743907283, "grad_norm": 0.8375716209411621, "learning_rate": 1.9766127192888543e-05, "loss": 0.2908, "step": 1845 }, { "epoch": 0.06916372642942262, "grad_norm": 0.6502942442893982, "learning_rate": 1.9764862869874282e-05, "loss": 0.2642, "step": 1850 }, { "epoch": 0.06935065541977241, "grad_norm": 0.36091551184654236, "learning_rate": 1.9763595179238617e-05, "loss": 0.3685, "step": 1855 }, { "epoch": 0.0695375844101222, "grad_norm": 0.6670817732810974, "learning_rate": 1.9762324121418735e-05, "loss": 0.2686, "step": 1860 }, { "epoch": 0.06972451340047199, "grad_norm": 0.50603187084198, "learning_rate": 1.9761049696852996e-05, "loss": 0.4413, "step": 1865 }, { "epoch": 0.06991144239082178, "grad_norm": 0.4863389730453491, "learning_rate": 1.97597719059809e-05, "loss": 0.216, "step": 1870 }, { "epoch": 0.07009837138117157, "grad_norm": 0.3261346220970154, "learning_rate": 1.975849074924313e-05, "loss": 0.3634, "step": 1875 }, { "epoch": 0.07028530037152136, "grad_norm": 0.44674980640411377, "learning_rate": 1.9757206227081514e-05, "loss": 0.268, "step": 1880 }, { "epoch": 0.07047222936187116, "grad_norm": 0.4099423587322235, "learning_rate": 1.975591833993905e-05, "loss": 0.3505, "step": 1885 }, { "epoch": 0.07065915835222095, "grad_norm": 1.4500172138214111, "learning_rate": 1.9754627088259894e-05, "loss": 0.3023, "step": 1890 }, { "epoch": 0.07084608734257074, "grad_norm": 0.49058449268341064, "learning_rate": 1.975333247248936e-05, "loss": 0.2925, "step": 1895 }, { "epoch": 0.07103301633292053, "grad_norm": 0.4941057860851288, "learning_rate": 1.9752034493073924e-05, "loss": 0.2957, "step": 1900 }, { "epoch": 0.07121994532327032, "grad_norm": 0.8845500349998474, "learning_rate": 1.9750733150461225e-05, "loss": 0.3244, "step": 1905 }, { "epoch": 0.07140687431362011, "grad_norm": 1.2214986085891724, "learning_rate": 1.9749428445100053e-05, "loss": 0.3882, "step": 1910 }, { "epoch": 0.0715938033039699, "grad_norm": 0.6646692752838135, "learning_rate": 1.974812037744037e-05, "loss": 0.4436, "step": 1915 }, { "epoch": 0.0717807322943197, "grad_norm": 0.5141096115112305, "learning_rate": 1.9746808947933285e-05, "loss": 0.2812, "step": 1920 }, { "epoch": 0.07196766128466948, "grad_norm": 0.39117810130119324, "learning_rate": 1.9745494157031075e-05, "loss": 0.2966, "step": 1925 }, { "epoch": 0.07215459027501928, "grad_norm": 0.5144658088684082, "learning_rate": 1.9744176005187173e-05, "loss": 0.3422, "step": 1930 }, { "epoch": 0.07234151926536907, "grad_norm": 0.6618564128875732, "learning_rate": 1.9742854492856178e-05, "loss": 0.3295, "step": 1935 }, { "epoch": 0.07252844825571886, "grad_norm": 0.7013601660728455, "learning_rate": 1.9741529620493833e-05, "loss": 0.3461, "step": 1940 }, { "epoch": 0.07271537724606865, "grad_norm": 0.5347566604614258, "learning_rate": 1.9740201388557053e-05, "loss": 0.278, "step": 1945 }, { "epoch": 0.07290230623641844, "grad_norm": 0.4391633868217468, "learning_rate": 1.9738869797503905e-05, "loss": 0.2967, "step": 1950 }, { "epoch": 0.07308923522676823, "grad_norm": 0.4526468813419342, "learning_rate": 1.9737534847793622e-05, "loss": 0.2962, "step": 1955 }, { "epoch": 0.07327616421711802, "grad_norm": 0.652113676071167, "learning_rate": 1.9736196539886578e-05, "loss": 0.2854, "step": 1960 }, { "epoch": 0.07346309320746781, "grad_norm": 0.39846867322921753, "learning_rate": 1.9734854874244332e-05, "loss": 0.2783, "step": 1965 }, { "epoch": 0.0736500221978176, "grad_norm": 0.3684355318546295, "learning_rate": 1.9733509851329574e-05, "loss": 0.2901, "step": 1970 }, { "epoch": 0.0738369511881674, "grad_norm": 0.6221095323562622, "learning_rate": 1.973216147160617e-05, "loss": 0.3012, "step": 1975 }, { "epoch": 0.07402388017851719, "grad_norm": 0.41872674226760864, "learning_rate": 1.9730809735539134e-05, "loss": 0.3818, "step": 1980 }, { "epoch": 0.07421080916886698, "grad_norm": 0.35373455286026, "learning_rate": 1.9729454643594646e-05, "loss": 0.354, "step": 1985 }, { "epoch": 0.07439773815921677, "grad_norm": 0.795148491859436, "learning_rate": 1.9728096196240035e-05, "loss": 0.3374, "step": 1990 }, { "epoch": 0.07458466714956656, "grad_norm": 0.32079294323921204, "learning_rate": 1.972673439394379e-05, "loss": 0.2758, "step": 1995 }, { "epoch": 0.07477159613991635, "grad_norm": 0.3288242518901825, "learning_rate": 1.9725369237175562e-05, "loss": 0.2796, "step": 2000 }, { "epoch": 0.07495852513026614, "grad_norm": 0.428270548582077, "learning_rate": 1.9724000726406148e-05, "loss": 0.2982, "step": 2005 }, { "epoch": 0.07514545412061593, "grad_norm": 0.38480132818222046, "learning_rate": 1.9722628862107514e-05, "loss": 0.2905, "step": 2010 }, { "epoch": 0.07533238311096573, "grad_norm": 0.5246176719665527, "learning_rate": 1.9721253644752774e-05, "loss": 0.3551, "step": 2015 }, { "epoch": 0.07551931210131552, "grad_norm": 0.5422767400741577, "learning_rate": 1.97198750748162e-05, "loss": 0.3718, "step": 2020 }, { "epoch": 0.07570624109166531, "grad_norm": 0.5160179138183594, "learning_rate": 1.971849315277322e-05, "loss": 0.3526, "step": 2025 }, { "epoch": 0.0758931700820151, "grad_norm": 0.5089318752288818, "learning_rate": 1.9717107879100426e-05, "loss": 0.344, "step": 2030 }, { "epoch": 0.07608009907236489, "grad_norm": 0.6274924874305725, "learning_rate": 1.971571925427555e-05, "loss": 0.2923, "step": 2035 }, { "epoch": 0.07626702806271468, "grad_norm": 0.5537222027778625, "learning_rate": 1.9714327278777495e-05, "loss": 0.3163, "step": 2040 }, { "epoch": 0.07645395705306447, "grad_norm": 0.7514776587486267, "learning_rate": 1.9712931953086314e-05, "loss": 0.4283, "step": 2045 }, { "epoch": 0.07664088604341426, "grad_norm": 0.33425864577293396, "learning_rate": 1.9711533277683214e-05, "loss": 0.2521, "step": 2050 }, { "epoch": 0.07682781503376405, "grad_norm": 0.4531971514225006, "learning_rate": 1.9710131253050555e-05, "loss": 0.3491, "step": 2055 }, { "epoch": 0.07701474402411385, "grad_norm": 0.2874973714351654, "learning_rate": 1.9708725879671856e-05, "loss": 0.334, "step": 2060 }, { "epoch": 0.07720167301446364, "grad_norm": 0.4508047103881836, "learning_rate": 1.9707317158031794e-05, "loss": 0.2811, "step": 2065 }, { "epoch": 0.07738860200481343, "grad_norm": 0.233233243227005, "learning_rate": 1.9705905088616195e-05, "loss": 0.2521, "step": 2070 }, { "epoch": 0.0775755309951632, "grad_norm": 0.31307104229927063, "learning_rate": 1.970448967191204e-05, "loss": 0.322, "step": 2075 }, { "epoch": 0.077762459985513, "grad_norm": 0.6601381897926331, "learning_rate": 1.9703070908407463e-05, "loss": 0.3456, "step": 2080 }, { "epoch": 0.07794938897586279, "grad_norm": 0.37102803587913513, "learning_rate": 1.970164879859176e-05, "loss": 0.374, "step": 2085 }, { "epoch": 0.07813631796621258, "grad_norm": 0.37135210633277893, "learning_rate": 1.9700223342955374e-05, "loss": 0.3463, "step": 2090 }, { "epoch": 0.07832324695656237, "grad_norm": 0.5516648292541504, "learning_rate": 1.9698794541989903e-05, "loss": 0.4849, "step": 2095 }, { "epoch": 0.07851017594691216, "grad_norm": 0.5477395057678223, "learning_rate": 1.96973623961881e-05, "loss": 0.2745, "step": 2100 }, { "epoch": 0.07869710493726195, "grad_norm": 0.5191754102706909, "learning_rate": 1.969592690604387e-05, "loss": 0.3044, "step": 2105 }, { "epoch": 0.07888403392761174, "grad_norm": 1.8438035249710083, "learning_rate": 1.9694488072052275e-05, "loss": 0.3786, "step": 2110 }, { "epoch": 0.07907096291796153, "grad_norm": 0.3772248327732086, "learning_rate": 1.9693045894709524e-05, "loss": 0.2567, "step": 2115 }, { "epoch": 0.07925789190831133, "grad_norm": 0.6036641597747803, "learning_rate": 1.9691600374512988e-05, "loss": 0.3108, "step": 2120 }, { "epoch": 0.07944482089866112, "grad_norm": 0.4737173020839691, "learning_rate": 1.969015151196118e-05, "loss": 0.2473, "step": 2125 }, { "epoch": 0.07963174988901091, "grad_norm": 0.741744339466095, "learning_rate": 1.9688699307553774e-05, "loss": 0.3534, "step": 2130 }, { "epoch": 0.0798186788793607, "grad_norm": 0.4742072522640228, "learning_rate": 1.9687243761791595e-05, "loss": 0.3127, "step": 2135 }, { "epoch": 0.08000560786971049, "grad_norm": 0.4502277374267578, "learning_rate": 1.9685784875176613e-05, "loss": 0.3633, "step": 2140 }, { "epoch": 0.08019253686006028, "grad_norm": 0.9466716647148132, "learning_rate": 1.9684322648211964e-05, "loss": 0.3007, "step": 2145 }, { "epoch": 0.08037946585041007, "grad_norm": 0.4492679834365845, "learning_rate": 1.9682857081401923e-05, "loss": 0.2861, "step": 2150 }, { "epoch": 0.08056639484075986, "grad_norm": 0.8264315724372864, "learning_rate": 1.9681388175251925e-05, "loss": 0.3445, "step": 2155 }, { "epoch": 0.08075332383110966, "grad_norm": 0.7091488242149353, "learning_rate": 1.9679915930268553e-05, "loss": 0.3153, "step": 2160 }, { "epoch": 0.08094025282145945, "grad_norm": 0.5118813514709473, "learning_rate": 1.967844034695954e-05, "loss": 0.3658, "step": 2165 }, { "epoch": 0.08112718181180924, "grad_norm": 0.6051716208457947, "learning_rate": 1.967696142583377e-05, "loss": 0.3552, "step": 2170 }, { "epoch": 0.08131411080215903, "grad_norm": 0.5814588665962219, "learning_rate": 1.967547916740129e-05, "loss": 0.3133, "step": 2175 }, { "epoch": 0.08150103979250882, "grad_norm": 0.6085551977157593, "learning_rate": 1.967399357217328e-05, "loss": 0.3033, "step": 2180 }, { "epoch": 0.08168796878285861, "grad_norm": 0.5651482939720154, "learning_rate": 1.9672504640662083e-05, "loss": 0.3819, "step": 2185 }, { "epoch": 0.0818748977732084, "grad_norm": 0.3710572123527527, "learning_rate": 1.9671012373381188e-05, "loss": 0.3037, "step": 2190 }, { "epoch": 0.0820618267635582, "grad_norm": 0.43990039825439453, "learning_rate": 1.9669516770845233e-05, "loss": 0.4436, "step": 2195 }, { "epoch": 0.08224875575390798, "grad_norm": 0.6509809494018555, "learning_rate": 1.966801783357001e-05, "loss": 0.2811, "step": 2200 }, { "epoch": 0.08243568474425778, "grad_norm": 0.5239195823669434, "learning_rate": 1.9666515562072463e-05, "loss": 0.3285, "step": 2205 }, { "epoch": 0.08262261373460757, "grad_norm": 0.6424049139022827, "learning_rate": 1.9665009956870678e-05, "loss": 0.2595, "step": 2210 }, { "epoch": 0.08280954272495736, "grad_norm": 0.3653183579444885, "learning_rate": 1.9663501018483897e-05, "loss": 0.3201, "step": 2215 }, { "epoch": 0.08299647171530715, "grad_norm": 0.6920386552810669, "learning_rate": 1.9661988747432508e-05, "loss": 0.2718, "step": 2220 }, { "epoch": 0.08318340070565694, "grad_norm": 0.4126833379268646, "learning_rate": 1.966047314423805e-05, "loss": 0.336, "step": 2225 }, { "epoch": 0.08337032969600673, "grad_norm": 0.11903409659862518, "learning_rate": 1.9658954209423214e-05, "loss": 0.3856, "step": 2230 }, { "epoch": 0.08355725868635652, "grad_norm": 0.44001504778862, "learning_rate": 1.9657431943511837e-05, "loss": 0.3288, "step": 2235 }, { "epoch": 0.08374418767670631, "grad_norm": 0.2602495849132538, "learning_rate": 1.96559063470289e-05, "loss": 0.2417, "step": 2240 }, { "epoch": 0.0839311166670561, "grad_norm": 0.48337632417678833, "learning_rate": 1.965437742050054e-05, "loss": 0.4299, "step": 2245 }, { "epoch": 0.0841180456574059, "grad_norm": 0.40766072273254395, "learning_rate": 1.9652845164454044e-05, "loss": 0.3488, "step": 2250 }, { "epoch": 0.08430497464775569, "grad_norm": 0.4196990132331848, "learning_rate": 1.9651309579417835e-05, "loss": 0.3123, "step": 2255 }, { "epoch": 0.08449190363810548, "grad_norm": 0.4198533296585083, "learning_rate": 1.96497706659215e-05, "loss": 0.2318, "step": 2260 }, { "epoch": 0.08467883262845527, "grad_norm": 0.48852840065956116, "learning_rate": 1.9648228424495765e-05, "loss": 0.2639, "step": 2265 }, { "epoch": 0.08486576161880506, "grad_norm": 0.42771658301353455, "learning_rate": 1.96466828556725e-05, "loss": 0.2426, "step": 2270 }, { "epoch": 0.08505269060915485, "grad_norm": 0.46135228872299194, "learning_rate": 1.9645133959984733e-05, "loss": 0.3736, "step": 2275 }, { "epoch": 0.08523961959950464, "grad_norm": 0.673338770866394, "learning_rate": 1.9643581737966628e-05, "loss": 0.2744, "step": 2280 }, { "epoch": 0.08542654858985443, "grad_norm": 0.5732733011245728, "learning_rate": 1.964202619015351e-05, "loss": 0.3472, "step": 2285 }, { "epoch": 0.08561347758020423, "grad_norm": 0.6712455749511719, "learning_rate": 1.9640467317081833e-05, "loss": 0.2671, "step": 2290 }, { "epoch": 0.08580040657055402, "grad_norm": 0.6656973958015442, "learning_rate": 1.9638905119289215e-05, "loss": 0.3475, "step": 2295 }, { "epoch": 0.08598733556090381, "grad_norm": 0.3552950620651245, "learning_rate": 1.963733959731441e-05, "loss": 0.2821, "step": 2300 }, { "epoch": 0.0861742645512536, "grad_norm": 0.7310354709625244, "learning_rate": 1.9635770751697326e-05, "loss": 0.3669, "step": 2305 }, { "epoch": 0.08636119354160339, "grad_norm": 0.3826546370983124, "learning_rate": 1.9634198582979005e-05, "loss": 0.3274, "step": 2310 }, { "epoch": 0.08654812253195318, "grad_norm": 0.8085774183273315, "learning_rate": 1.963262309170165e-05, "loss": 0.303, "step": 2315 }, { "epoch": 0.08673505152230296, "grad_norm": 0.2076191008090973, "learning_rate": 1.96310442784086e-05, "loss": 0.2976, "step": 2320 }, { "epoch": 0.08692198051265275, "grad_norm": 0.8541048169136047, "learning_rate": 1.962946214364434e-05, "loss": 0.3923, "step": 2325 }, { "epoch": 0.08710890950300254, "grad_norm": 0.4931800663471222, "learning_rate": 1.9627876687954508e-05, "loss": 0.3641, "step": 2330 }, { "epoch": 0.08729583849335233, "grad_norm": 0.33070865273475647, "learning_rate": 1.9626287911885882e-05, "loss": 0.3196, "step": 2335 }, { "epoch": 0.08748276748370212, "grad_norm": 0.2637079060077667, "learning_rate": 1.9624695815986383e-05, "loss": 0.3229, "step": 2340 }, { "epoch": 0.08766969647405191, "grad_norm": 0.34368112683296204, "learning_rate": 1.9623100400805076e-05, "loss": 0.2956, "step": 2345 }, { "epoch": 0.0878566254644017, "grad_norm": 0.5603893995285034, "learning_rate": 1.9621501666892178e-05, "loss": 0.2941, "step": 2350 }, { "epoch": 0.0880435544547515, "grad_norm": 0.5940320491790771, "learning_rate": 1.9619899614799046e-05, "loss": 0.315, "step": 2355 }, { "epoch": 0.08823048344510129, "grad_norm": 0.36051857471466064, "learning_rate": 1.9618294245078184e-05, "loss": 0.391, "step": 2360 }, { "epoch": 0.08841741243545108, "grad_norm": 0.5375169515609741, "learning_rate": 1.9616685558283234e-05, "loss": 0.3213, "step": 2365 }, { "epoch": 0.08860434142580087, "grad_norm": 0.6388794779777527, "learning_rate": 1.9615073554968988e-05, "loss": 0.347, "step": 2370 }, { "epoch": 0.08879127041615066, "grad_norm": 1.1879587173461914, "learning_rate": 1.961345823569138e-05, "loss": 0.2717, "step": 2375 }, { "epoch": 0.08897819940650045, "grad_norm": 0.21704648435115814, "learning_rate": 1.961183960100749e-05, "loss": 0.3559, "step": 2380 }, { "epoch": 0.08916512839685024, "grad_norm": 1.803924560546875, "learning_rate": 1.961021765147553e-05, "loss": 0.3177, "step": 2385 }, { "epoch": 0.08935205738720003, "grad_norm": 0.6759556531906128, "learning_rate": 1.9608592387654873e-05, "loss": 0.2972, "step": 2390 }, { "epoch": 0.08953898637754983, "grad_norm": 0.2949463725090027, "learning_rate": 1.9606963810106023e-05, "loss": 0.3087, "step": 2395 }, { "epoch": 0.08972591536789962, "grad_norm": 0.34233054518699646, "learning_rate": 1.9605331919390627e-05, "loss": 0.4371, "step": 2400 }, { "epoch": 0.08991284435824941, "grad_norm": 0.3700423836708069, "learning_rate": 1.960369671607148e-05, "loss": 0.2686, "step": 2405 }, { "epoch": 0.0900997733485992, "grad_norm": 0.528294026851654, "learning_rate": 1.9602058200712516e-05, "loss": 0.3552, "step": 2410 }, { "epoch": 0.09028670233894899, "grad_norm": 2.334455966949463, "learning_rate": 1.9600416373878815e-05, "loss": 0.3232, "step": 2415 }, { "epoch": 0.09047363132929878, "grad_norm": 0.4088938236236572, "learning_rate": 1.959877123613659e-05, "loss": 0.2649, "step": 2420 }, { "epoch": 0.09066056031964857, "grad_norm": 0.5448519587516785, "learning_rate": 1.959712278805321e-05, "loss": 0.3223, "step": 2425 }, { "epoch": 0.09084748930999836, "grad_norm": 0.5493069291114807, "learning_rate": 1.9595471030197165e-05, "loss": 0.345, "step": 2430 }, { "epoch": 0.09103441830034815, "grad_norm": 0.39371412992477417, "learning_rate": 1.9593815963138115e-05, "loss": 0.3379, "step": 2435 }, { "epoch": 0.09122134729069795, "grad_norm": 0.5203854441642761, "learning_rate": 1.9592157587446833e-05, "loss": 0.2653, "step": 2440 }, { "epoch": 0.09140827628104774, "grad_norm": 0.4320240616798401, "learning_rate": 1.9590495903695248e-05, "loss": 0.3827, "step": 2445 }, { "epoch": 0.09159520527139753, "grad_norm": 0.35967013239860535, "learning_rate": 1.958883091245643e-05, "loss": 0.4025, "step": 2450 }, { "epoch": 0.09178213426174732, "grad_norm": 0.47668302059173584, "learning_rate": 1.9587162614304588e-05, "loss": 0.2894, "step": 2455 }, { "epoch": 0.09196906325209711, "grad_norm": 0.463037371635437, "learning_rate": 1.958549100981506e-05, "loss": 0.2681, "step": 2460 }, { "epoch": 0.0921559922424469, "grad_norm": 0.5469281077384949, "learning_rate": 1.9583816099564346e-05, "loss": 0.346, "step": 2465 }, { "epoch": 0.0923429212327967, "grad_norm": 0.40648287534713745, "learning_rate": 1.958213788413007e-05, "loss": 0.3613, "step": 2470 }, { "epoch": 0.09252985022314648, "grad_norm": 0.35621514916419983, "learning_rate": 1.9580456364091003e-05, "loss": 0.3235, "step": 2475 }, { "epoch": 0.09271677921349628, "grad_norm": 0.6592122316360474, "learning_rate": 1.957877154002705e-05, "loss": 0.3155, "step": 2480 }, { "epoch": 0.09290370820384607, "grad_norm": 0.515332818031311, "learning_rate": 1.9577083412519258e-05, "loss": 0.2627, "step": 2485 }, { "epoch": 0.09309063719419586, "grad_norm": 0.3281778395175934, "learning_rate": 1.9575391982149814e-05, "loss": 0.3066, "step": 2490 }, { "epoch": 0.09327756618454565, "grad_norm": 0.4038369953632355, "learning_rate": 1.9573697249502046e-05, "loss": 0.2842, "step": 2495 }, { "epoch": 0.09346449517489544, "grad_norm": 0.5899633169174194, "learning_rate": 1.957199921516042e-05, "loss": 0.2495, "step": 2500 }, { "epoch": 0.09365142416524523, "grad_norm": 0.2653183043003082, "learning_rate": 1.9570297879710533e-05, "loss": 0.3687, "step": 2505 }, { "epoch": 0.09383835315559502, "grad_norm": 0.5264580249786377, "learning_rate": 1.9568593243739133e-05, "loss": 0.3317, "step": 2510 }, { "epoch": 0.09402528214594481, "grad_norm": 0.8166529536247253, "learning_rate": 1.95668853078341e-05, "loss": 0.3921, "step": 2515 }, { "epoch": 0.0942122111362946, "grad_norm": 3.684605360031128, "learning_rate": 1.9565174072584448e-05, "loss": 0.383, "step": 2520 }, { "epoch": 0.0943991401266444, "grad_norm": 0.3563925325870514, "learning_rate": 1.9563459538580337e-05, "loss": 0.2752, "step": 2525 }, { "epoch": 0.09458606911699419, "grad_norm": 0.696751594543457, "learning_rate": 1.9561741706413055e-05, "loss": 0.2909, "step": 2530 }, { "epoch": 0.09477299810734398, "grad_norm": 0.47342103719711304, "learning_rate": 1.956002057667504e-05, "loss": 0.3115, "step": 2535 }, { "epoch": 0.09495992709769377, "grad_norm": 0.3338661193847656, "learning_rate": 1.955829614995986e-05, "loss": 0.3501, "step": 2540 }, { "epoch": 0.09514685608804356, "grad_norm": 0.48812028765678406, "learning_rate": 1.9556568426862214e-05, "loss": 0.3285, "step": 2545 }, { "epoch": 0.09533378507839335, "grad_norm": 0.5304372310638428, "learning_rate": 1.9554837407977948e-05, "loss": 0.3472, "step": 2550 }, { "epoch": 0.09552071406874314, "grad_norm": 0.5442197322845459, "learning_rate": 1.9553103093904043e-05, "loss": 0.3138, "step": 2555 }, { "epoch": 0.09570764305909293, "grad_norm": 0.34639909863471985, "learning_rate": 1.955136548523861e-05, "loss": 0.2917, "step": 2560 }, { "epoch": 0.09589457204944271, "grad_norm": 0.43235430121421814, "learning_rate": 1.9549624582580905e-05, "loss": 0.382, "step": 2565 }, { "epoch": 0.0960815010397925, "grad_norm": 0.37063688039779663, "learning_rate": 1.9547880386531307e-05, "loss": 0.3394, "step": 2570 }, { "epoch": 0.0962684300301423, "grad_norm": 1.0126583576202393, "learning_rate": 1.954613289769135e-05, "loss": 0.3272, "step": 2575 }, { "epoch": 0.09645535902049208, "grad_norm": 0.6808075308799744, "learning_rate": 1.9544382116663687e-05, "loss": 0.3496, "step": 2580 }, { "epoch": 0.09664228801084188, "grad_norm": 0.4482937753200531, "learning_rate": 1.9542628044052115e-05, "loss": 0.3691, "step": 2585 }, { "epoch": 0.09682921700119167, "grad_norm": 0.5843492150306702, "learning_rate": 1.954087068046156e-05, "loss": 0.3311, "step": 2590 }, { "epoch": 0.09701614599154146, "grad_norm": 0.45115911960601807, "learning_rate": 1.9539110026498085e-05, "loss": 0.2578, "step": 2595 }, { "epoch": 0.09720307498189125, "grad_norm": 0.4495108425617218, "learning_rate": 1.9537346082768894e-05, "loss": 0.3572, "step": 2600 }, { "epoch": 0.09739000397224104, "grad_norm": 0.5485822558403015, "learning_rate": 1.9535578849882318e-05, "loss": 0.3271, "step": 2605 }, { "epoch": 0.09757693296259083, "grad_norm": 0.3917326331138611, "learning_rate": 1.9533808328447828e-05, "loss": 0.2825, "step": 2610 }, { "epoch": 0.09776386195294062, "grad_norm": 0.7909905314445496, "learning_rate": 1.953203451907602e-05, "loss": 0.299, "step": 2615 }, { "epoch": 0.09795079094329041, "grad_norm": 0.6403763294219971, "learning_rate": 1.9530257422378635e-05, "loss": 0.4017, "step": 2620 }, { "epoch": 0.0981377199336402, "grad_norm": 0.5497360229492188, "learning_rate": 1.9528477038968542e-05, "loss": 0.4055, "step": 2625 }, { "epoch": 0.09832464892399, "grad_norm": 0.35415706038475037, "learning_rate": 1.9526693369459747e-05, "loss": 0.2735, "step": 2630 }, { "epoch": 0.09851157791433979, "grad_norm": 0.6400545239448547, "learning_rate": 1.9524906414467376e-05, "loss": 0.3236, "step": 2635 }, { "epoch": 0.09869850690468958, "grad_norm": 0.6510913968086243, "learning_rate": 1.952311617460771e-05, "loss": 0.2668, "step": 2640 }, { "epoch": 0.09888543589503937, "grad_norm": 0.36324068903923035, "learning_rate": 1.9521322650498148e-05, "loss": 0.3129, "step": 2645 }, { "epoch": 0.09907236488538916, "grad_norm": 0.4775453209877014, "learning_rate": 1.9519525842757223e-05, "loss": 0.265, "step": 2650 }, { "epoch": 0.09925929387573895, "grad_norm": 0.6153637170791626, "learning_rate": 1.9517725752004605e-05, "loss": 0.2791, "step": 2655 }, { "epoch": 0.09944622286608874, "grad_norm": 0.704619288444519, "learning_rate": 1.951592237886109e-05, "loss": 0.2513, "step": 2660 }, { "epoch": 0.09963315185643853, "grad_norm": 0.49600765109062195, "learning_rate": 1.9514115723948612e-05, "loss": 0.416, "step": 2665 }, { "epoch": 0.09982008084678833, "grad_norm": 0.5411927103996277, "learning_rate": 1.9512305787890237e-05, "loss": 0.3892, "step": 2670 }, { "epoch": 0.10000700983713812, "grad_norm": 0.4148860275745392, "learning_rate": 1.9510492571310157e-05, "loss": 0.316, "step": 2675 }, { "epoch": 0.10019393882748791, "grad_norm": 0.7685550451278687, "learning_rate": 1.95086760748337e-05, "loss": 0.3357, "step": 2680 }, { "epoch": 0.1003808678178377, "grad_norm": 0.7916879057884216, "learning_rate": 1.950685629908732e-05, "loss": 0.4379, "step": 2685 }, { "epoch": 0.10056779680818749, "grad_norm": 0.4039478003978729, "learning_rate": 1.9505033244698614e-05, "loss": 0.2756, "step": 2690 }, { "epoch": 0.10075472579853728, "grad_norm": 0.4937886595726013, "learning_rate": 1.950320691229629e-05, "loss": 0.2877, "step": 2695 }, { "epoch": 0.10094165478888707, "grad_norm": 0.7463287711143494, "learning_rate": 1.9501377302510204e-05, "loss": 0.277, "step": 2700 }, { "epoch": 0.10112858377923686, "grad_norm": 0.591262936592102, "learning_rate": 1.9499544415971337e-05, "loss": 0.316, "step": 2705 }, { "epoch": 0.10131551276958665, "grad_norm": 0.3209913372993469, "learning_rate": 1.94977082533118e-05, "loss": 0.2486, "step": 2710 }, { "epoch": 0.10150244175993645, "grad_norm": 0.5596148371696472, "learning_rate": 1.9495868815164827e-05, "loss": 0.3891, "step": 2715 }, { "epoch": 0.10168937075028624, "grad_norm": 0.3874911367893219, "learning_rate": 1.949402610216479e-05, "loss": 0.3855, "step": 2720 }, { "epoch": 0.10187629974063603, "grad_norm": 0.5168285369873047, "learning_rate": 1.9492180114947187e-05, "loss": 0.2817, "step": 2725 }, { "epoch": 0.10206322873098582, "grad_norm": 0.5476559996604919, "learning_rate": 1.949033085414865e-05, "loss": 0.3411, "step": 2730 }, { "epoch": 0.10225015772133561, "grad_norm": 0.8609138131141663, "learning_rate": 1.9488478320406937e-05, "loss": 0.3765, "step": 2735 }, { "epoch": 0.1024370867116854, "grad_norm": 0.5176308751106262, "learning_rate": 1.9486622514360928e-05, "loss": 0.2958, "step": 2740 }, { "epoch": 0.10262401570203519, "grad_norm": 0.49085232615470886, "learning_rate": 1.9484763436650637e-05, "loss": 0.3155, "step": 2745 }, { "epoch": 0.10281094469238498, "grad_norm": 0.5792192816734314, "learning_rate": 1.948290108791721e-05, "loss": 0.3046, "step": 2750 }, { "epoch": 0.10299787368273478, "grad_norm": 0.5799537301063538, "learning_rate": 1.9481035468802922e-05, "loss": 0.3144, "step": 2755 }, { "epoch": 0.10318480267308457, "grad_norm": 2.7372591495513916, "learning_rate": 1.9479166579951162e-05, "loss": 0.5132, "step": 2760 }, { "epoch": 0.10337173166343436, "grad_norm": 0.6942354440689087, "learning_rate": 1.9477294422006462e-05, "loss": 0.312, "step": 2765 }, { "epoch": 0.10355866065378415, "grad_norm": 0.42621228098869324, "learning_rate": 1.947541899561448e-05, "loss": 0.3342, "step": 2770 }, { "epoch": 0.10374558964413394, "grad_norm": 0.2672087252140045, "learning_rate": 1.9473540301421985e-05, "loss": 0.3697, "step": 2775 }, { "epoch": 0.10393251863448373, "grad_norm": 0.3490724563598633, "learning_rate": 1.9471658340076895e-05, "loss": 0.397, "step": 2780 }, { "epoch": 0.10411944762483352, "grad_norm": 0.4320790767669678, "learning_rate": 1.9469773112228237e-05, "loss": 0.354, "step": 2785 }, { "epoch": 0.10430637661518331, "grad_norm": 0.19510655105113983, "learning_rate": 1.946788461852618e-05, "loss": 0.3459, "step": 2790 }, { "epoch": 0.1044933056055331, "grad_norm": 0.4755474627017975, "learning_rate": 1.9465992859622006e-05, "loss": 0.3761, "step": 2795 }, { "epoch": 0.1046802345958829, "grad_norm": 0.3486561179161072, "learning_rate": 1.946409783616813e-05, "loss": 0.2303, "step": 2800 }, { "epoch": 0.10486716358623269, "grad_norm": 0.7743209004402161, "learning_rate": 1.946219954881809e-05, "loss": 0.3435, "step": 2805 }, { "epoch": 0.10505409257658246, "grad_norm": 1.9734418392181396, "learning_rate": 1.9460297998226552e-05, "loss": 0.3507, "step": 2810 }, { "epoch": 0.10524102156693226, "grad_norm": 0.5270185470581055, "learning_rate": 1.9458393185049303e-05, "loss": 0.4341, "step": 2815 }, { "epoch": 0.10542795055728205, "grad_norm": 0.41794025897979736, "learning_rate": 1.945648510994327e-05, "loss": 0.3172, "step": 2820 }, { "epoch": 0.10561487954763184, "grad_norm": 0.22198650240898132, "learning_rate": 1.9454573773566478e-05, "loss": 0.2623, "step": 2825 }, { "epoch": 0.10580180853798163, "grad_norm": 0.36959734559059143, "learning_rate": 1.94526591765781e-05, "loss": 0.2379, "step": 2830 }, { "epoch": 0.10598873752833142, "grad_norm": 0.21317122876644135, "learning_rate": 1.945074131963843e-05, "loss": 0.3395, "step": 2835 }, { "epoch": 0.10617566651868121, "grad_norm": 0.5071830153465271, "learning_rate": 1.944882020340887e-05, "loss": 0.3268, "step": 2840 }, { "epoch": 0.106362595509031, "grad_norm": 0.4251273274421692, "learning_rate": 1.944689582855197e-05, "loss": 0.3505, "step": 2845 }, { "epoch": 0.1065495244993808, "grad_norm": 0.6162049174308777, "learning_rate": 1.9444968195731384e-05, "loss": 0.308, "step": 2850 }, { "epoch": 0.10673645348973058, "grad_norm": 0.522985577583313, "learning_rate": 1.94430373056119e-05, "loss": 0.2516, "step": 2855 }, { "epoch": 0.10692338248008038, "grad_norm": 0.7239044308662415, "learning_rate": 1.9441103158859427e-05, "loss": 0.3029, "step": 2860 }, { "epoch": 0.10711031147043017, "grad_norm": 0.5046908855438232, "learning_rate": 1.9439165756141e-05, "loss": 0.3179, "step": 2865 }, { "epoch": 0.10729724046077996, "grad_norm": 0.4853164553642273, "learning_rate": 1.9437225098124765e-05, "loss": 0.2714, "step": 2870 }, { "epoch": 0.10748416945112975, "grad_norm": 0.472434401512146, "learning_rate": 1.9435281185480007e-05, "loss": 0.2768, "step": 2875 }, { "epoch": 0.10767109844147954, "grad_norm": 0.549471914768219, "learning_rate": 1.943333401887712e-05, "loss": 0.3117, "step": 2880 }, { "epoch": 0.10785802743182933, "grad_norm": 0.3362756669521332, "learning_rate": 1.943138359898763e-05, "loss": 0.2906, "step": 2885 }, { "epoch": 0.10804495642217912, "grad_norm": 0.6930930614471436, "learning_rate": 1.9429429926484184e-05, "loss": 0.2813, "step": 2890 }, { "epoch": 0.10823188541252891, "grad_norm": 1.1984028816223145, "learning_rate": 1.942747300204054e-05, "loss": 0.3159, "step": 2895 }, { "epoch": 0.1084188144028787, "grad_norm": 0.6312728524208069, "learning_rate": 1.9425512826331593e-05, "loss": 0.3663, "step": 2900 }, { "epoch": 0.1086057433932285, "grad_norm": 0.4989972710609436, "learning_rate": 1.9423549400033344e-05, "loss": 0.3155, "step": 2905 }, { "epoch": 0.10879267238357829, "grad_norm": 0.22321349382400513, "learning_rate": 1.9421582723822926e-05, "loss": 0.3021, "step": 2910 }, { "epoch": 0.10897960137392808, "grad_norm": 0.4356032609939575, "learning_rate": 1.9419612798378588e-05, "loss": 0.341, "step": 2915 }, { "epoch": 0.10916653036427787, "grad_norm": 0.6972177028656006, "learning_rate": 1.9417639624379704e-05, "loss": 0.31, "step": 2920 }, { "epoch": 0.10935345935462766, "grad_norm": 0.6108505725860596, "learning_rate": 1.9415663202506757e-05, "loss": 0.2865, "step": 2925 }, { "epoch": 0.10954038834497745, "grad_norm": 0.29747897386550903, "learning_rate": 1.941368353344137e-05, "loss": 0.2782, "step": 2930 }, { "epoch": 0.10972731733532724, "grad_norm": 0.46353158354759216, "learning_rate": 1.9411700617866268e-05, "loss": 0.3503, "step": 2935 }, { "epoch": 0.10991424632567703, "grad_norm": 0.3761847913265228, "learning_rate": 1.9409714456465303e-05, "loss": 0.3263, "step": 2940 }, { "epoch": 0.11010117531602683, "grad_norm": 0.3461897671222687, "learning_rate": 1.9407725049923443e-05, "loss": 0.2596, "step": 2945 }, { "epoch": 0.11028810430637662, "grad_norm": 0.3752954602241516, "learning_rate": 1.940573239892678e-05, "loss": 0.3043, "step": 2950 }, { "epoch": 0.11047503329672641, "grad_norm": 0.41294610500335693, "learning_rate": 1.940373650416252e-05, "loss": 0.327, "step": 2955 }, { "epoch": 0.1106619622870762, "grad_norm": 0.4009648859500885, "learning_rate": 1.940173736631899e-05, "loss": 0.2781, "step": 2960 }, { "epoch": 0.11084889127742599, "grad_norm": 0.20430001616477966, "learning_rate": 1.9399734986085636e-05, "loss": 0.3943, "step": 2965 }, { "epoch": 0.11103582026777578, "grad_norm": 0.7819133400917053, "learning_rate": 1.9397729364153025e-05, "loss": 0.295, "step": 2970 }, { "epoch": 0.11122274925812557, "grad_norm": 0.7665151953697205, "learning_rate": 1.9395720501212833e-05, "loss": 0.2599, "step": 2975 }, { "epoch": 0.11140967824847536, "grad_norm": 0.38904261589050293, "learning_rate": 1.9393708397957863e-05, "loss": 0.3369, "step": 2980 }, { "epoch": 0.11159660723882515, "grad_norm": 0.4791386127471924, "learning_rate": 1.9391693055082028e-05, "loss": 0.3276, "step": 2985 }, { "epoch": 0.11178353622917495, "grad_norm": 0.5694244503974915, "learning_rate": 1.9389674473280365e-05, "loss": 0.3659, "step": 2990 }, { "epoch": 0.11197046521952474, "grad_norm": 0.7917852401733398, "learning_rate": 1.9387652653249023e-05, "loss": 0.333, "step": 2995 }, { "epoch": 0.11215739420987453, "grad_norm": 0.41513529419898987, "learning_rate": 1.9385627595685275e-05, "loss": 0.2983, "step": 3000 }, { "epoch": 0.11234432320022432, "grad_norm": 0.39530879259109497, "learning_rate": 1.9383599301287498e-05, "loss": 0.2979, "step": 3005 }, { "epoch": 0.11253125219057411, "grad_norm": 0.40975919365882874, "learning_rate": 1.93815677707552e-05, "loss": 0.3145, "step": 3010 }, { "epoch": 0.1127181811809239, "grad_norm": 0.7045682072639465, "learning_rate": 1.9379533004788992e-05, "loss": 0.3777, "step": 3015 }, { "epoch": 0.11290511017127369, "grad_norm": 0.5002708435058594, "learning_rate": 1.9377495004090605e-05, "loss": 0.3937, "step": 3020 }, { "epoch": 0.11309203916162348, "grad_norm": 0.9641180634498596, "learning_rate": 1.937545376936289e-05, "loss": 0.2889, "step": 3025 }, { "epoch": 0.11327896815197327, "grad_norm": 0.41359230875968933, "learning_rate": 1.9373409301309817e-05, "loss": 0.2924, "step": 3030 }, { "epoch": 0.11346589714232307, "grad_norm": 0.5380722284317017, "learning_rate": 1.9371361600636452e-05, "loss": 0.3907, "step": 3035 }, { "epoch": 0.11365282613267286, "grad_norm": 0.8563070297241211, "learning_rate": 1.9369310668049e-05, "loss": 0.341, "step": 3040 }, { "epoch": 0.11383975512302265, "grad_norm": 1.5900834798812866, "learning_rate": 1.936725650425476e-05, "loss": 0.3633, "step": 3045 }, { "epoch": 0.11402668411337243, "grad_norm": 0.4389062821865082, "learning_rate": 1.936519910996216e-05, "loss": 0.341, "step": 3050 }, { "epoch": 0.11421361310372222, "grad_norm": 0.4638637602329254, "learning_rate": 1.936313848588073e-05, "loss": 0.3595, "step": 3055 }, { "epoch": 0.11440054209407201, "grad_norm": 0.541275680065155, "learning_rate": 1.9361074632721125e-05, "loss": 0.3566, "step": 3060 }, { "epoch": 0.1145874710844218, "grad_norm": 0.38814595341682434, "learning_rate": 1.935900755119511e-05, "loss": 0.28, "step": 3065 }, { "epoch": 0.11477440007477159, "grad_norm": 0.3759201169013977, "learning_rate": 1.935693724201556e-05, "loss": 0.3574, "step": 3070 }, { "epoch": 0.11496132906512138, "grad_norm": 0.4212549030780792, "learning_rate": 1.9354863705896464e-05, "loss": 0.2823, "step": 3075 }, { "epoch": 0.11514825805547117, "grad_norm": 0.24448025226593018, "learning_rate": 1.9352786943552925e-05, "loss": 0.3305, "step": 3080 }, { "epoch": 0.11533518704582096, "grad_norm": 1.0636529922485352, "learning_rate": 1.9350706955701163e-05, "loss": 0.4083, "step": 3085 }, { "epoch": 0.11552211603617075, "grad_norm": 0.554562509059906, "learning_rate": 1.9348623743058504e-05, "loss": 0.3184, "step": 3090 }, { "epoch": 0.11570904502652055, "grad_norm": 0.5117464661598206, "learning_rate": 1.9346537306343384e-05, "loss": 0.2759, "step": 3095 }, { "epoch": 0.11589597401687034, "grad_norm": 0.2832476794719696, "learning_rate": 1.9344447646275367e-05, "loss": 0.2997, "step": 3100 }, { "epoch": 0.11608290300722013, "grad_norm": 0.38118836283683777, "learning_rate": 1.9342354763575103e-05, "loss": 0.3417, "step": 3105 }, { "epoch": 0.11626983199756992, "grad_norm": 0.42836499214172363, "learning_rate": 1.9340258658964376e-05, "loss": 0.2619, "step": 3110 }, { "epoch": 0.11645676098791971, "grad_norm": 0.6346667408943176, "learning_rate": 1.9338159333166063e-05, "loss": 0.2843, "step": 3115 }, { "epoch": 0.1166436899782695, "grad_norm": 0.4850265681743622, "learning_rate": 1.9336056786904175e-05, "loss": 0.3905, "step": 3120 }, { "epoch": 0.1168306189686193, "grad_norm": 0.6557062864303589, "learning_rate": 1.9333951020903812e-05, "loss": 0.3158, "step": 3125 }, { "epoch": 0.11701754795896908, "grad_norm": 0.30885252356529236, "learning_rate": 1.9331842035891193e-05, "loss": 0.2957, "step": 3130 }, { "epoch": 0.11720447694931888, "grad_norm": 0.3093513250350952, "learning_rate": 1.9329729832593646e-05, "loss": 0.3044, "step": 3135 }, { "epoch": 0.11739140593966867, "grad_norm": 0.4149554967880249, "learning_rate": 1.932761441173961e-05, "loss": 0.3545, "step": 3140 }, { "epoch": 0.11757833493001846, "grad_norm": 0.5750139951705933, "learning_rate": 1.932549577405864e-05, "loss": 0.2389, "step": 3145 }, { "epoch": 0.11776526392036825, "grad_norm": 0.5002467632293701, "learning_rate": 1.932337392028138e-05, "loss": 0.3064, "step": 3150 }, { "epoch": 0.11795219291071804, "grad_norm": 0.7145326733589172, "learning_rate": 1.9321248851139605e-05, "loss": 0.3383, "step": 3155 }, { "epoch": 0.11813912190106783, "grad_norm": 0.3954809010028839, "learning_rate": 1.9319120567366186e-05, "loss": 0.3623, "step": 3160 }, { "epoch": 0.11832605089141762, "grad_norm": 0.637115478515625, "learning_rate": 1.9316989069695112e-05, "loss": 0.2777, "step": 3165 }, { "epoch": 0.11851297988176741, "grad_norm": 0.45427948236465454, "learning_rate": 1.9314854358861478e-05, "loss": 0.5832, "step": 3170 }, { "epoch": 0.1186999088721172, "grad_norm": 0.41445136070251465, "learning_rate": 1.931271643560147e-05, "loss": 0.3425, "step": 3175 }, { "epoch": 0.118886837862467, "grad_norm": 0.6264867186546326, "learning_rate": 1.9310575300652416e-05, "loss": 0.2942, "step": 3180 }, { "epoch": 0.11907376685281679, "grad_norm": 0.2800760269165039, "learning_rate": 1.9308430954752717e-05, "loss": 0.2636, "step": 3185 }, { "epoch": 0.11926069584316658, "grad_norm": 0.49179184436798096, "learning_rate": 1.9306283398641906e-05, "loss": 0.2834, "step": 3190 }, { "epoch": 0.11944762483351637, "grad_norm": 0.17769566178321838, "learning_rate": 1.9304132633060605e-05, "loss": 0.3386, "step": 3195 }, { "epoch": 0.11963455382386616, "grad_norm": 0.35259896516799927, "learning_rate": 1.930197865875056e-05, "loss": 0.3855, "step": 3200 }, { "epoch": 0.11982148281421595, "grad_norm": 0.44811856746673584, "learning_rate": 1.929982147645461e-05, "loss": 0.3237, "step": 3205 }, { "epoch": 0.12000841180456574, "grad_norm": 0.6957955360412598, "learning_rate": 1.9297661086916704e-05, "loss": 0.3235, "step": 3210 }, { "epoch": 0.12019534079491553, "grad_norm": 0.35209953784942627, "learning_rate": 1.9295497490881902e-05, "loss": 0.3576, "step": 3215 }, { "epoch": 0.12038226978526533, "grad_norm": 0.30803924798965454, "learning_rate": 1.9293330689096366e-05, "loss": 0.313, "step": 3220 }, { "epoch": 0.12056919877561512, "grad_norm": 0.39552509784698486, "learning_rate": 1.9291160682307363e-05, "loss": 0.2947, "step": 3225 }, { "epoch": 0.12075612776596491, "grad_norm": 0.5420857667922974, "learning_rate": 1.9288987471263266e-05, "loss": 0.3885, "step": 3230 }, { "epoch": 0.1209430567563147, "grad_norm": 0.36529114842414856, "learning_rate": 1.928681105671355e-05, "loss": 0.3692, "step": 3235 }, { "epoch": 0.12112998574666449, "grad_norm": 0.27940261363983154, "learning_rate": 1.9284631439408804e-05, "loss": 0.2968, "step": 3240 }, { "epoch": 0.12131691473701428, "grad_norm": 0.38085952401161194, "learning_rate": 1.9282448620100716e-05, "loss": 0.3231, "step": 3245 }, { "epoch": 0.12150384372736407, "grad_norm": 0.4754383862018585, "learning_rate": 1.928026259954207e-05, "loss": 0.396, "step": 3250 }, { "epoch": 0.12169077271771386, "grad_norm": 0.5790641903877258, "learning_rate": 1.927807337848677e-05, "loss": 0.2934, "step": 3255 }, { "epoch": 0.12187770170806365, "grad_norm": 0.4564800560474396, "learning_rate": 1.927588095768981e-05, "loss": 0.2913, "step": 3260 }, { "epoch": 0.12206463069841345, "grad_norm": 0.5212480425834656, "learning_rate": 1.9273685337907295e-05, "loss": 0.2929, "step": 3265 }, { "epoch": 0.12225155968876324, "grad_norm": 0.43916815519332886, "learning_rate": 1.9271486519896434e-05, "loss": 0.345, "step": 3270 }, { "epoch": 0.12243848867911303, "grad_norm": 0.47202667593955994, "learning_rate": 1.926928450441553e-05, "loss": 0.3599, "step": 3275 }, { "epoch": 0.12262541766946282, "grad_norm": 0.3427533507347107, "learning_rate": 1.9267079292224005e-05, "loss": 0.2895, "step": 3280 }, { "epoch": 0.12281234665981261, "grad_norm": 0.23820289969444275, "learning_rate": 1.9264870884082362e-05, "loss": 0.2214, "step": 3285 }, { "epoch": 0.1229992756501624, "grad_norm": 0.7368319630622864, "learning_rate": 1.9262659280752224e-05, "loss": 0.2429, "step": 3290 }, { "epoch": 0.12318620464051218, "grad_norm": 0.36256974935531616, "learning_rate": 1.9260444482996313e-05, "loss": 0.3072, "step": 3295 }, { "epoch": 0.12337313363086197, "grad_norm": 0.44561657309532166, "learning_rate": 1.9258226491578443e-05, "loss": 0.2567, "step": 3300 }, { "epoch": 0.12356006262121176, "grad_norm": 0.708128035068512, "learning_rate": 1.9256005307263536e-05, "loss": 0.3169, "step": 3305 }, { "epoch": 0.12374699161156155, "grad_norm": 0.37106049060821533, "learning_rate": 1.925378093081762e-05, "loss": 0.2737, "step": 3310 }, { "epoch": 0.12393392060191134, "grad_norm": 3.3657779693603516, "learning_rate": 1.925155336300781e-05, "loss": 0.3468, "step": 3315 }, { "epoch": 0.12412084959226113, "grad_norm": 0.33611029386520386, "learning_rate": 1.9249322604602342e-05, "loss": 0.3304, "step": 3320 }, { "epoch": 0.12430777858261093, "grad_norm": 0.4634288251399994, "learning_rate": 1.9247088656370528e-05, "loss": 0.2953, "step": 3325 }, { "epoch": 0.12449470757296072, "grad_norm": 1.3842734098434448, "learning_rate": 1.9244851519082802e-05, "loss": 0.4189, "step": 3330 }, { "epoch": 0.12468163656331051, "grad_norm": 0.5978188514709473, "learning_rate": 1.924261119351069e-05, "loss": 0.3122, "step": 3335 }, { "epoch": 0.1248685655536603, "grad_norm": 0.30435624718666077, "learning_rate": 1.9240367680426804e-05, "loss": 0.3596, "step": 3340 }, { "epoch": 0.1250554945440101, "grad_norm": 0.30297866463661194, "learning_rate": 1.923812098060488e-05, "loss": 0.3459, "step": 3345 }, { "epoch": 0.12524242353435988, "grad_norm": 0.3620128929615021, "learning_rate": 1.923587109481973e-05, "loss": 0.342, "step": 3350 }, { "epoch": 0.1254293525247097, "grad_norm": 0.3281639516353607, "learning_rate": 1.9233618023847285e-05, "loss": 0.2632, "step": 3355 }, { "epoch": 0.12561628151505946, "grad_norm": 0.3437563478946686, "learning_rate": 1.923136176846456e-05, "loss": 0.3144, "step": 3360 }, { "epoch": 0.12580321050540927, "grad_norm": 0.4065919816493988, "learning_rate": 1.9229102329449674e-05, "loss": 0.328, "step": 3365 }, { "epoch": 0.12599013949575905, "grad_norm": 0.714131236076355, "learning_rate": 1.9226839707581838e-05, "loss": 0.3692, "step": 3370 }, { "epoch": 0.12617706848610885, "grad_norm": 0.6316368579864502, "learning_rate": 1.9224573903641374e-05, "loss": 0.3259, "step": 3375 }, { "epoch": 0.12636399747645863, "grad_norm": 0.23467862606048584, "learning_rate": 1.9222304918409684e-05, "loss": 0.2745, "step": 3380 }, { "epoch": 0.12655092646680843, "grad_norm": 0.5963695645332336, "learning_rate": 1.9220032752669282e-05, "loss": 0.297, "step": 3385 }, { "epoch": 0.1267378554571582, "grad_norm": 0.38383743166923523, "learning_rate": 1.921775740720377e-05, "loss": 0.3192, "step": 3390 }, { "epoch": 0.12692478444750802, "grad_norm": 0.5108599662780762, "learning_rate": 1.9215478882797852e-05, "loss": 0.321, "step": 3395 }, { "epoch": 0.1271117134378578, "grad_norm": 0.6184415221214294, "learning_rate": 1.9213197180237325e-05, "loss": 0.3903, "step": 3400 }, { "epoch": 0.1272986424282076, "grad_norm": 0.4752747416496277, "learning_rate": 1.921091230030908e-05, "loss": 0.2943, "step": 3405 }, { "epoch": 0.12748557141855738, "grad_norm": 0.4967573583126068, "learning_rate": 1.9208624243801107e-05, "loss": 0.3427, "step": 3410 }, { "epoch": 0.12767250040890718, "grad_norm": 0.6884995102882385, "learning_rate": 1.9206333011502497e-05, "loss": 0.2472, "step": 3415 }, { "epoch": 0.12785942939925696, "grad_norm": 0.49374425411224365, "learning_rate": 1.9204038604203423e-05, "loss": 0.3751, "step": 3420 }, { "epoch": 0.12804635838960673, "grad_norm": 0.38376346230506897, "learning_rate": 1.9201741022695165e-05, "loss": 0.3392, "step": 3425 }, { "epoch": 0.12823328737995654, "grad_norm": 0.4707247018814087, "learning_rate": 1.919944026777009e-05, "loss": 0.2892, "step": 3430 }, { "epoch": 0.12842021637030632, "grad_norm": 0.5075487494468689, "learning_rate": 1.9197136340221667e-05, "loss": 0.2932, "step": 3435 }, { "epoch": 0.12860714536065612, "grad_norm": 0.3079968988895416, "learning_rate": 1.9194829240844448e-05, "loss": 0.3229, "step": 3440 }, { "epoch": 0.1287940743510059, "grad_norm": 0.41244247555732727, "learning_rate": 1.919251897043409e-05, "loss": 0.3227, "step": 3445 }, { "epoch": 0.1289810033413557, "grad_norm": 0.6245453953742981, "learning_rate": 1.9190205529787336e-05, "loss": 0.3171, "step": 3450 }, { "epoch": 0.12916793233170548, "grad_norm": 0.5110964775085449, "learning_rate": 1.9187888919702035e-05, "loss": 0.3235, "step": 3455 }, { "epoch": 0.1293548613220553, "grad_norm": 0.5428165197372437, "learning_rate": 1.9185569140977104e-05, "loss": 0.2773, "step": 3460 }, { "epoch": 0.12954179031240506, "grad_norm": 0.24074698984622955, "learning_rate": 1.9183246194412583e-05, "loss": 0.2858, "step": 3465 }, { "epoch": 0.12972871930275487, "grad_norm": 0.5543522834777832, "learning_rate": 1.9180920080809575e-05, "loss": 0.2618, "step": 3470 }, { "epoch": 0.12991564829310465, "grad_norm": 0.15020422637462616, "learning_rate": 1.9178590800970302e-05, "loss": 0.3799, "step": 3475 }, { "epoch": 0.13010257728345445, "grad_norm": 0.5467866659164429, "learning_rate": 1.9176258355698062e-05, "loss": 0.335, "step": 3480 }, { "epoch": 0.13028950627380423, "grad_norm": 0.3544618487358093, "learning_rate": 1.917392274579725e-05, "loss": 0.322, "step": 3485 }, { "epoch": 0.13047643526415403, "grad_norm": 0.32355251908302307, "learning_rate": 1.9171583972073345e-05, "loss": 0.2544, "step": 3490 }, { "epoch": 0.1306633642545038, "grad_norm": 0.3455124795436859, "learning_rate": 1.916924203533293e-05, "loss": 0.2684, "step": 3495 }, { "epoch": 0.13085029324485362, "grad_norm": 0.5765972137451172, "learning_rate": 1.9166896936383668e-05, "loss": 0.3054, "step": 3500 }, { "epoch": 0.1310372222352034, "grad_norm": 0.5195266008377075, "learning_rate": 1.9164548676034312e-05, "loss": 0.3135, "step": 3505 }, { "epoch": 0.1312241512255532, "grad_norm": 0.35077986121177673, "learning_rate": 1.9162197255094722e-05, "loss": 0.3495, "step": 3510 }, { "epoch": 0.13141108021590298, "grad_norm": 0.34569740295410156, "learning_rate": 1.915984267437583e-05, "loss": 0.2855, "step": 3515 }, { "epoch": 0.13159800920625278, "grad_norm": 0.36829873919487, "learning_rate": 1.915748493468966e-05, "loss": 0.3321, "step": 3520 }, { "epoch": 0.13178493819660256, "grad_norm": 0.5086042881011963, "learning_rate": 1.915512403684933e-05, "loss": 0.3072, "step": 3525 }, { "epoch": 0.13197186718695236, "grad_norm": 0.3874140679836273, "learning_rate": 1.9152759981669046e-05, "loss": 0.2989, "step": 3530 }, { "epoch": 0.13215879617730214, "grad_norm": 0.4253312945365906, "learning_rate": 1.9150392769964106e-05, "loss": 0.3197, "step": 3535 }, { "epoch": 0.13234572516765195, "grad_norm": 0.309948205947876, "learning_rate": 1.914802240255089e-05, "loss": 0.3147, "step": 3540 }, { "epoch": 0.13253265415800172, "grad_norm": 0.38400158286094666, "learning_rate": 1.9145648880246877e-05, "loss": 0.3448, "step": 3545 }, { "epoch": 0.13271958314835153, "grad_norm": 0.2646450996398926, "learning_rate": 1.914327220387062e-05, "loss": 0.4444, "step": 3550 }, { "epoch": 0.1329065121387013, "grad_norm": 0.4903238117694855, "learning_rate": 1.914089237424176e-05, "loss": 0.3533, "step": 3555 }, { "epoch": 0.1330934411290511, "grad_norm": 0.48070570826530457, "learning_rate": 1.9138509392181047e-05, "loss": 0.2815, "step": 3560 }, { "epoch": 0.1332803701194009, "grad_norm": 0.3975090980529785, "learning_rate": 1.9136123258510292e-05, "loss": 0.3287, "step": 3565 }, { "epoch": 0.1334672991097507, "grad_norm": 0.3869096040725708, "learning_rate": 1.9133733974052412e-05, "loss": 0.3055, "step": 3570 }, { "epoch": 0.13365422810010047, "grad_norm": 0.26971435546875, "learning_rate": 1.9131341539631395e-05, "loss": 0.3044, "step": 3575 }, { "epoch": 0.13384115709045027, "grad_norm": 0.5639915466308594, "learning_rate": 1.912894595607233e-05, "loss": 0.2953, "step": 3580 }, { "epoch": 0.13402808608080005, "grad_norm": 0.463853120803833, "learning_rate": 1.912654722420138e-05, "loss": 0.2509, "step": 3585 }, { "epoch": 0.13421501507114986, "grad_norm": 0.38717523217201233, "learning_rate": 1.9124145344845804e-05, "loss": 0.3899, "step": 3590 }, { "epoch": 0.13440194406149963, "grad_norm": 0.2911614179611206, "learning_rate": 1.9121740318833938e-05, "loss": 0.2567, "step": 3595 }, { "epoch": 0.13458887305184944, "grad_norm": 0.5385882258415222, "learning_rate": 1.9119332146995205e-05, "loss": 0.265, "step": 3600 }, { "epoch": 0.13477580204219922, "grad_norm": 0.3904414176940918, "learning_rate": 1.9116920830160117e-05, "loss": 0.251, "step": 3605 }, { "epoch": 0.13496273103254902, "grad_norm": 0.41085904836654663, "learning_rate": 1.9114506369160267e-05, "loss": 0.3327, "step": 3610 }, { "epoch": 0.1351496600228988, "grad_norm": 0.6957544088363647, "learning_rate": 1.9112088764828335e-05, "loss": 0.3716, "step": 3615 }, { "epoch": 0.1353365890132486, "grad_norm": 0.4141400456428528, "learning_rate": 1.910966801799808e-05, "loss": 0.2635, "step": 3620 }, { "epoch": 0.13552351800359838, "grad_norm": 0.773308277130127, "learning_rate": 1.910724412950435e-05, "loss": 0.2762, "step": 3625 }, { "epoch": 0.13571044699394819, "grad_norm": 0.4335046708583832, "learning_rate": 1.910481710018308e-05, "loss": 0.3463, "step": 3630 }, { "epoch": 0.13589737598429796, "grad_norm": 0.6109732985496521, "learning_rate": 1.9102386930871276e-05, "loss": 0.3244, "step": 3635 }, { "epoch": 0.13608430497464777, "grad_norm": 0.3992200791835785, "learning_rate": 1.9099953622407038e-05, "loss": 0.4066, "step": 3640 }, { "epoch": 0.13627123396499755, "grad_norm": 0.2817442715167999, "learning_rate": 1.9097517175629535e-05, "loss": 0.382, "step": 3645 }, { "epoch": 0.13645816295534735, "grad_norm": 0.6066967248916626, "learning_rate": 1.9095077591379044e-05, "loss": 0.3066, "step": 3650 }, { "epoch": 0.13664509194569713, "grad_norm": 0.638088047504425, "learning_rate": 1.9092634870496892e-05, "loss": 0.341, "step": 3655 }, { "epoch": 0.1368320209360469, "grad_norm": 0.5067809224128723, "learning_rate": 1.9090189013825515e-05, "loss": 0.3039, "step": 3660 }, { "epoch": 0.1370189499263967, "grad_norm": 0.3270426392555237, "learning_rate": 1.908774002220841e-05, "loss": 0.3165, "step": 3665 }, { "epoch": 0.1372058789167465, "grad_norm": 0.46038034558296204, "learning_rate": 1.908528789649017e-05, "loss": 0.3287, "step": 3670 }, { "epoch": 0.1373928079070963, "grad_norm": 0.35730430483818054, "learning_rate": 1.9082832637516458e-05, "loss": 0.2788, "step": 3675 }, { "epoch": 0.13757973689744607, "grad_norm": 0.4533166289329529, "learning_rate": 1.908037424613403e-05, "loss": 0.3384, "step": 3680 }, { "epoch": 0.13776666588779587, "grad_norm": 0.6102305054664612, "learning_rate": 1.907791272319071e-05, "loss": 0.3651, "step": 3685 }, { "epoch": 0.13795359487814565, "grad_norm": 0.6289255023002625, "learning_rate": 1.9075448069535406e-05, "loss": 0.311, "step": 3690 }, { "epoch": 0.13814052386849546, "grad_norm": 0.44820600748062134, "learning_rate": 1.9072980286018104e-05, "loss": 0.3194, "step": 3695 }, { "epoch": 0.13832745285884523, "grad_norm": 0.36444956064224243, "learning_rate": 1.907050937348988e-05, "loss": 0.293, "step": 3700 }, { "epoch": 0.13851438184919504, "grad_norm": 0.3149167001247406, "learning_rate": 1.9068035332802874e-05, "loss": 0.3288, "step": 3705 }, { "epoch": 0.13870131083954482, "grad_norm": 0.4111466705799103, "learning_rate": 1.9065558164810312e-05, "loss": 0.2532, "step": 3710 }, { "epoch": 0.13888823982989462, "grad_norm": 0.37402135133743286, "learning_rate": 1.9063077870366504e-05, "loss": 0.3662, "step": 3715 }, { "epoch": 0.1390751688202444, "grad_norm": 0.706342875957489, "learning_rate": 1.9060594450326824e-05, "loss": 0.2887, "step": 3720 }, { "epoch": 0.1392620978105942, "grad_norm": 0.5995466709136963, "learning_rate": 1.9058107905547737e-05, "loss": 0.3387, "step": 3725 }, { "epoch": 0.13944902680094398, "grad_norm": 0.16998915374279022, "learning_rate": 1.9055618236886784e-05, "loss": 0.3133, "step": 3730 }, { "epoch": 0.1396359557912938, "grad_norm": 0.3938615024089813, "learning_rate": 1.9053125445202574e-05, "loss": 0.3253, "step": 3735 }, { "epoch": 0.13982288478164356, "grad_norm": 0.7660725712776184, "learning_rate": 1.9050629531354806e-05, "loss": 0.3604, "step": 3740 }, { "epoch": 0.14000981377199337, "grad_norm": 0.4029218256473541, "learning_rate": 1.9048130496204247e-05, "loss": 0.3705, "step": 3745 }, { "epoch": 0.14019674276234315, "grad_norm": 0.5488607883453369, "learning_rate": 1.9045628340612737e-05, "loss": 0.2867, "step": 3750 }, { "epoch": 0.14038367175269295, "grad_norm": 0.38521480560302734, "learning_rate": 1.904312306544321e-05, "loss": 0.2713, "step": 3755 }, { "epoch": 0.14057060074304273, "grad_norm": 0.23381686210632324, "learning_rate": 1.9040614671559647e-05, "loss": 0.3445, "step": 3760 }, { "epoch": 0.14075752973339253, "grad_norm": 0.6382274031639099, "learning_rate": 1.9038103159827136e-05, "loss": 0.3306, "step": 3765 }, { "epoch": 0.1409444587237423, "grad_norm": 0.17501187324523926, "learning_rate": 1.9035588531111818e-05, "loss": 0.2653, "step": 3770 }, { "epoch": 0.14113138771409212, "grad_norm": 0.4778057038784027, "learning_rate": 1.903307078628092e-05, "loss": 0.3657, "step": 3775 }, { "epoch": 0.1413183167044419, "grad_norm": 0.22998036444187164, "learning_rate": 1.9030549926202732e-05, "loss": 0.2792, "step": 3780 }, { "epoch": 0.1415052456947917, "grad_norm": 0.35750317573547363, "learning_rate": 1.902802595174664e-05, "loss": 0.2231, "step": 3785 }, { "epoch": 0.14169217468514148, "grad_norm": 0.5821340680122375, "learning_rate": 1.902549886378308e-05, "loss": 0.4172, "step": 3790 }, { "epoch": 0.14187910367549128, "grad_norm": 0.6900777816772461, "learning_rate": 1.902296866318357e-05, "loss": 0.3648, "step": 3795 }, { "epoch": 0.14206603266584106, "grad_norm": 0.4445832669734955, "learning_rate": 1.9020435350820715e-05, "loss": 0.2523, "step": 3800 }, { "epoch": 0.14225296165619086, "grad_norm": 0.43811532855033875, "learning_rate": 1.9017898927568173e-05, "loss": 0.3028, "step": 3805 }, { "epoch": 0.14243989064654064, "grad_norm": 0.4545620381832123, "learning_rate": 1.9015359394300686e-05, "loss": 0.3506, "step": 3810 }, { "epoch": 0.14262681963689045, "grad_norm": 0.401100754737854, "learning_rate": 1.901281675189407e-05, "loss": 0.3335, "step": 3815 }, { "epoch": 0.14281374862724022, "grad_norm": 0.9869058132171631, "learning_rate": 1.9010271001225203e-05, "loss": 0.3321, "step": 3820 }, { "epoch": 0.14300067761759003, "grad_norm": 0.5079956650733948, "learning_rate": 1.9007722143172046e-05, "loss": 0.2671, "step": 3825 }, { "epoch": 0.1431876066079398, "grad_norm": 0.5241971015930176, "learning_rate": 1.9005170178613624e-05, "loss": 0.2777, "step": 3830 }, { "epoch": 0.1433745355982896, "grad_norm": 0.39962106943130493, "learning_rate": 1.900261510843004e-05, "loss": 0.3102, "step": 3835 }, { "epoch": 0.1435614645886394, "grad_norm": 0.4119807183742523, "learning_rate": 1.9000056933502466e-05, "loss": 0.3591, "step": 3840 }, { "epoch": 0.1437483935789892, "grad_norm": 0.4325822591781616, "learning_rate": 1.8997495654713133e-05, "loss": 0.3438, "step": 3845 }, { "epoch": 0.14393532256933897, "grad_norm": 0.2697000503540039, "learning_rate": 1.8994931272945364e-05, "loss": 0.3043, "step": 3850 }, { "epoch": 0.14412225155968877, "grad_norm": 0.4875340461730957, "learning_rate": 1.8992363789083534e-05, "loss": 0.2718, "step": 3855 }, { "epoch": 0.14430918055003855, "grad_norm": 0.2768021821975708, "learning_rate": 1.89897932040131e-05, "loss": 0.2831, "step": 3860 }, { "epoch": 0.14449610954038836, "grad_norm": 0.3775290548801422, "learning_rate": 1.8987219518620573e-05, "loss": 0.3754, "step": 3865 }, { "epoch": 0.14468303853073813, "grad_norm": 0.4714037775993347, "learning_rate": 1.8984642733793556e-05, "loss": 0.3069, "step": 3870 }, { "epoch": 0.14486996752108794, "grad_norm": 0.3473433554172516, "learning_rate": 1.8982062850420705e-05, "loss": 0.2979, "step": 3875 }, { "epoch": 0.14505689651143772, "grad_norm": 0.4559517204761505, "learning_rate": 1.897947986939174e-05, "loss": 0.3666, "step": 3880 }, { "epoch": 0.14524382550178752, "grad_norm": 0.5417743921279907, "learning_rate": 1.8976893791597465e-05, "loss": 0.3124, "step": 3885 }, { "epoch": 0.1454307544921373, "grad_norm": 0.2836984097957611, "learning_rate": 1.8974304617929746e-05, "loss": 0.2611, "step": 3890 }, { "epoch": 0.1456176834824871, "grad_norm": 0.3992460370063782, "learning_rate": 1.8971712349281506e-05, "loss": 0.3606, "step": 3895 }, { "epoch": 0.14580461247283688, "grad_norm": 0.40167370438575745, "learning_rate": 1.896911698654675e-05, "loss": 0.3131, "step": 3900 }, { "epoch": 0.14599154146318666, "grad_norm": 0.45140784978866577, "learning_rate": 1.8966518530620542e-05, "loss": 0.3149, "step": 3905 }, { "epoch": 0.14617847045353646, "grad_norm": 0.5974220633506775, "learning_rate": 1.8963916982399014e-05, "loss": 0.2877, "step": 3910 }, { "epoch": 0.14636539944388624, "grad_norm": 0.2815289795398712, "learning_rate": 1.8961312342779374e-05, "loss": 0.3502, "step": 3915 }, { "epoch": 0.14655232843423605, "grad_norm": 0.3491719365119934, "learning_rate": 1.8958704612659876e-05, "loss": 0.2682, "step": 3920 }, { "epoch": 0.14673925742458582, "grad_norm": 0.28988122940063477, "learning_rate": 1.8956093792939855e-05, "loss": 0.369, "step": 3925 }, { "epoch": 0.14692618641493563, "grad_norm": 0.6046833992004395, "learning_rate": 1.895347988451971e-05, "loss": 0.4345, "step": 3930 }, { "epoch": 0.1471131154052854, "grad_norm": 0.5748773813247681, "learning_rate": 1.895086288830091e-05, "loss": 0.3157, "step": 3935 }, { "epoch": 0.1473000443956352, "grad_norm": 2.4972338676452637, "learning_rate": 1.8948242805185966e-05, "loss": 0.4136, "step": 3940 }, { "epoch": 0.147486973385985, "grad_norm": 0.41845616698265076, "learning_rate": 1.8945619636078483e-05, "loss": 0.2917, "step": 3945 }, { "epoch": 0.1476739023763348, "grad_norm": 0.42868664860725403, "learning_rate": 1.894299338188311e-05, "loss": 0.2979, "step": 3950 }, { "epoch": 0.14786083136668457, "grad_norm": 0.5431304574012756, "learning_rate": 1.8940364043505568e-05, "loss": 0.3204, "step": 3955 }, { "epoch": 0.14804776035703437, "grad_norm": 0.40625184774398804, "learning_rate": 1.893773162185264e-05, "loss": 0.2749, "step": 3960 }, { "epoch": 0.14823468934738415, "grad_norm": 0.4661557078361511, "learning_rate": 1.893509611783218e-05, "loss": 0.326, "step": 3965 }, { "epoch": 0.14842161833773396, "grad_norm": 0.27836883068084717, "learning_rate": 1.8932457532353087e-05, "loss": 0.2812, "step": 3970 }, { "epoch": 0.14860854732808373, "grad_norm": 0.3862547278404236, "learning_rate": 1.892981586632534e-05, "loss": 0.3615, "step": 3975 }, { "epoch": 0.14879547631843354, "grad_norm": 0.46969127655029297, "learning_rate": 1.892717112065997e-05, "loss": 0.3065, "step": 3980 }, { "epoch": 0.14898240530878332, "grad_norm": 0.4676092565059662, "learning_rate": 1.8924523296269077e-05, "loss": 0.3214, "step": 3985 }, { "epoch": 0.14916933429913312, "grad_norm": 0.5357167720794678, "learning_rate": 1.8921872394065822e-05, "loss": 0.3075, "step": 3990 }, { "epoch": 0.1493562632894829, "grad_norm": 0.5506564974784851, "learning_rate": 1.891921841496442e-05, "loss": 0.2875, "step": 3995 }, { "epoch": 0.1495431922798327, "grad_norm": 0.2982819080352783, "learning_rate": 1.8916561359880153e-05, "loss": 0.3099, "step": 4000 }, { "epoch": 0.14973012127018248, "grad_norm": 1.2449369430541992, "learning_rate": 1.8913901229729367e-05, "loss": 0.385, "step": 4005 }, { "epoch": 0.1499170502605323, "grad_norm": 0.7360920310020447, "learning_rate": 1.8911238025429464e-05, "loss": 0.3481, "step": 4010 }, { "epoch": 0.15010397925088206, "grad_norm": 0.37266823649406433, "learning_rate": 1.8908571747898902e-05, "loss": 0.262, "step": 4015 }, { "epoch": 0.15029090824123187, "grad_norm": 0.5776626467704773, "learning_rate": 1.8905902398057208e-05, "loss": 0.3394, "step": 4020 }, { "epoch": 0.15047783723158165, "grad_norm": 0.6980562210083008, "learning_rate": 1.8903229976824963e-05, "loss": 0.3497, "step": 4025 }, { "epoch": 0.15066476622193145, "grad_norm": 0.46382442116737366, "learning_rate": 1.890055448512381e-05, "loss": 0.2935, "step": 4030 }, { "epoch": 0.15085169521228123, "grad_norm": 0.4823243319988251, "learning_rate": 1.889787592387645e-05, "loss": 0.2739, "step": 4035 }, { "epoch": 0.15103862420263103, "grad_norm": 0.4817698895931244, "learning_rate": 1.8895194294006635e-05, "loss": 0.2478, "step": 4040 }, { "epoch": 0.1512255531929808, "grad_norm": 0.6213811039924622, "learning_rate": 1.8892509596439192e-05, "loss": 0.3307, "step": 4045 }, { "epoch": 0.15141248218333062, "grad_norm": 0.4142370820045471, "learning_rate": 1.8889821832099988e-05, "loss": 0.3091, "step": 4050 }, { "epoch": 0.1515994111736804, "grad_norm": 0.5155577659606934, "learning_rate": 1.8887131001915964e-05, "loss": 0.3226, "step": 4055 }, { "epoch": 0.1517863401640302, "grad_norm": 0.6737167239189148, "learning_rate": 1.8884437106815103e-05, "loss": 0.3247, "step": 4060 }, { "epoch": 0.15197326915437998, "grad_norm": 0.2817344665527344, "learning_rate": 1.8881740147726458e-05, "loss": 0.3737, "step": 4065 }, { "epoch": 0.15216019814472978, "grad_norm": 0.3459798991680145, "learning_rate": 1.887904012558013e-05, "loss": 0.3037, "step": 4070 }, { "epoch": 0.15234712713507956, "grad_norm": 0.34796640276908875, "learning_rate": 1.8876337041307275e-05, "loss": 0.2671, "step": 4075 }, { "epoch": 0.15253405612542936, "grad_norm": 0.6775915622711182, "learning_rate": 1.8873630895840114e-05, "loss": 0.3625, "step": 4080 }, { "epoch": 0.15272098511577914, "grad_norm": 0.35140863060951233, "learning_rate": 1.887092169011192e-05, "loss": 0.2942, "step": 4085 }, { "epoch": 0.15290791410612894, "grad_norm": 0.8965099453926086, "learning_rate": 1.8868209425057025e-05, "loss": 0.3787, "step": 4090 }, { "epoch": 0.15309484309647872, "grad_norm": 0.4382193684577942, "learning_rate": 1.88654941016108e-05, "loss": 0.3455, "step": 4095 }, { "epoch": 0.15328177208682853, "grad_norm": 0.4126281142234802, "learning_rate": 1.8862775720709686e-05, "loss": 0.2807, "step": 4100 }, { "epoch": 0.1534687010771783, "grad_norm": 0.31981319189071655, "learning_rate": 1.886005428329118e-05, "loss": 0.3167, "step": 4105 }, { "epoch": 0.1536556300675281, "grad_norm": 0.41247910261154175, "learning_rate": 1.8857329790293824e-05, "loss": 0.2936, "step": 4110 }, { "epoch": 0.1538425590578779, "grad_norm": 0.48248907923698425, "learning_rate": 1.885460224265722e-05, "loss": 0.3004, "step": 4115 }, { "epoch": 0.1540294880482277, "grad_norm": 0.578687846660614, "learning_rate": 1.8851871641322016e-05, "loss": 0.28, "step": 4120 }, { "epoch": 0.15421641703857747, "grad_norm": 0.20185329020023346, "learning_rate": 1.884913798722992e-05, "loss": 0.334, "step": 4125 }, { "epoch": 0.15440334602892727, "grad_norm": 0.5341799259185791, "learning_rate": 1.8846401281323693e-05, "loss": 0.3087, "step": 4130 }, { "epoch": 0.15459027501927705, "grad_norm": 0.2198217362165451, "learning_rate": 1.884366152454715e-05, "loss": 0.2779, "step": 4135 }, { "epoch": 0.15477720400962686, "grad_norm": 0.4372238218784332, "learning_rate": 1.8840918717845146e-05, "loss": 0.3726, "step": 4140 }, { "epoch": 0.15496413299997663, "grad_norm": 0.48354169726371765, "learning_rate": 1.88381728621636e-05, "loss": 0.3891, "step": 4145 }, { "epoch": 0.1551510619903264, "grad_norm": 0.5194223523139954, "learning_rate": 1.883542395844948e-05, "loss": 0.3032, "step": 4150 }, { "epoch": 0.15533799098067622, "grad_norm": 0.23291267454624176, "learning_rate": 1.8832672007650805e-05, "loss": 0.2777, "step": 4155 }, { "epoch": 0.155524919971026, "grad_norm": 0.8497887253761292, "learning_rate": 1.882991701071664e-05, "loss": 0.3094, "step": 4160 }, { "epoch": 0.1557118489613758, "grad_norm": 0.4399624466896057, "learning_rate": 1.8827158968597113e-05, "loss": 0.3451, "step": 4165 }, { "epoch": 0.15589877795172558, "grad_norm": 0.3427572250366211, "learning_rate": 1.8824397882243382e-05, "loss": 0.2513, "step": 4170 }, { "epoch": 0.15608570694207538, "grad_norm": 0.5564326643943787, "learning_rate": 1.8821633752607672e-05, "loss": 0.3496, "step": 4175 }, { "epoch": 0.15627263593242516, "grad_norm": 0.4042254090309143, "learning_rate": 1.8818866580643254e-05, "loss": 0.3207, "step": 4180 }, { "epoch": 0.15645956492277496, "grad_norm": 0.25726428627967834, "learning_rate": 1.8816096367304447e-05, "loss": 0.2839, "step": 4185 }, { "epoch": 0.15664649391312474, "grad_norm": 0.6856520175933838, "learning_rate": 1.8813323113546614e-05, "loss": 0.3228, "step": 4190 }, { "epoch": 0.15683342290347455, "grad_norm": 0.27812889218330383, "learning_rate": 1.8810546820326173e-05, "loss": 0.2724, "step": 4195 }, { "epoch": 0.15702035189382432, "grad_norm": 0.48270055651664734, "learning_rate": 1.880776748860059e-05, "loss": 0.4022, "step": 4200 }, { "epoch": 0.15720728088417413, "grad_norm": 0.8398357033729553, "learning_rate": 1.8804985119328375e-05, "loss": 0.3086, "step": 4205 }, { "epoch": 0.1573942098745239, "grad_norm": 0.34941208362579346, "learning_rate": 1.8802199713469084e-05, "loss": 0.3078, "step": 4210 }, { "epoch": 0.1575811388648737, "grad_norm": 0.3891669511795044, "learning_rate": 1.8799411271983325e-05, "loss": 0.2914, "step": 4215 }, { "epoch": 0.1577680678552235, "grad_norm": 0.4491298794746399, "learning_rate": 1.8796619795832758e-05, "loss": 0.3676, "step": 4220 }, { "epoch": 0.1579549968455733, "grad_norm": 0.3361729383468628, "learning_rate": 1.8793825285980076e-05, "loss": 0.3815, "step": 4225 }, { "epoch": 0.15814192583592307, "grad_norm": 0.3507533371448517, "learning_rate": 1.879102774338903e-05, "loss": 0.3435, "step": 4230 }, { "epoch": 0.15832885482627287, "grad_norm": 0.44350340962409973, "learning_rate": 1.8788227169024406e-05, "loss": 0.2433, "step": 4235 }, { "epoch": 0.15851578381662265, "grad_norm": 0.4336090683937073, "learning_rate": 1.878542356385205e-05, "loss": 0.3624, "step": 4240 }, { "epoch": 0.15870271280697246, "grad_norm": 0.5720579028129578, "learning_rate": 1.878261692883884e-05, "loss": 0.3331, "step": 4245 }, { "epoch": 0.15888964179732223, "grad_norm": 0.5189465880393982, "learning_rate": 1.8779807264952704e-05, "loss": 0.3359, "step": 4250 }, { "epoch": 0.15907657078767204, "grad_norm": 0.2319631725549698, "learning_rate": 1.8776994573162615e-05, "loss": 0.3025, "step": 4255 }, { "epoch": 0.15926349977802182, "grad_norm": 0.25383996963500977, "learning_rate": 1.877417885443859e-05, "loss": 0.332, "step": 4260 }, { "epoch": 0.15945042876837162, "grad_norm": 0.4644213020801544, "learning_rate": 1.8771360109751694e-05, "loss": 0.4095, "step": 4265 }, { "epoch": 0.1596373577587214, "grad_norm": 0.37912002205848694, "learning_rate": 1.8768538340074024e-05, "loss": 0.3924, "step": 4270 }, { "epoch": 0.1598242867490712, "grad_norm": 0.3221535086631775, "learning_rate": 1.8765713546378733e-05, "loss": 0.3375, "step": 4275 }, { "epoch": 0.16001121573942098, "grad_norm": 0.3765072822570801, "learning_rate": 1.8762885729640007e-05, "loss": 0.3454, "step": 4280 }, { "epoch": 0.16019814472977079, "grad_norm": 0.43610912561416626, "learning_rate": 1.8760054890833083e-05, "loss": 0.2819, "step": 4285 }, { "epoch": 0.16038507372012056, "grad_norm": 0.5509352684020996, "learning_rate": 1.8757221030934234e-05, "loss": 0.3796, "step": 4290 }, { "epoch": 0.16057200271047037, "grad_norm": 0.43372535705566406, "learning_rate": 1.8754384150920777e-05, "loss": 0.3896, "step": 4295 }, { "epoch": 0.16075893170082015, "grad_norm": 0.35043320059776306, "learning_rate": 1.8751544251771072e-05, "loss": 0.3586, "step": 4300 }, { "epoch": 0.16094586069116995, "grad_norm": 0.6886361837387085, "learning_rate": 1.874870133446452e-05, "loss": 0.3648, "step": 4305 }, { "epoch": 0.16113278968151973, "grad_norm": 0.6359232068061829, "learning_rate": 1.8745855399981555e-05, "loss": 0.2569, "step": 4310 }, { "epoch": 0.16131971867186953, "grad_norm": 0.4465806782245636, "learning_rate": 1.8743006449303663e-05, "loss": 0.3305, "step": 4315 }, { "epoch": 0.1615066476622193, "grad_norm": 0.6533617973327637, "learning_rate": 1.874015448341337e-05, "loss": 0.3634, "step": 4320 }, { "epoch": 0.16169357665256912, "grad_norm": 0.4724544286727905, "learning_rate": 1.8737299503294233e-05, "loss": 0.2838, "step": 4325 }, { "epoch": 0.1618805056429189, "grad_norm": 0.4181117117404938, "learning_rate": 1.873444150993085e-05, "loss": 0.2875, "step": 4330 }, { "epoch": 0.1620674346332687, "grad_norm": 0.2569214105606079, "learning_rate": 1.8731580504308865e-05, "loss": 0.4176, "step": 4335 }, { "epoch": 0.16225436362361847, "grad_norm": 1.067435622215271, "learning_rate": 1.872871648741495e-05, "loss": 0.3486, "step": 4340 }, { "epoch": 0.16244129261396828, "grad_norm": 0.20301935076713562, "learning_rate": 1.8725849460236833e-05, "loss": 0.2639, "step": 4345 }, { "epoch": 0.16262822160431806, "grad_norm": 0.3194063901901245, "learning_rate": 1.8722979423763264e-05, "loss": 0.2772, "step": 4350 }, { "epoch": 0.16281515059466786, "grad_norm": 0.5217241048812866, "learning_rate": 1.872010637898404e-05, "loss": 0.3767, "step": 4355 }, { "epoch": 0.16300207958501764, "grad_norm": 0.658311128616333, "learning_rate": 1.8717230326889984e-05, "loss": 0.4492, "step": 4360 }, { "epoch": 0.16318900857536744, "grad_norm": 0.26515570282936096, "learning_rate": 1.871435126847297e-05, "loss": 0.2921, "step": 4365 }, { "epoch": 0.16337593756571722, "grad_norm": 0.3176003396511078, "learning_rate": 1.87114692047259e-05, "loss": 0.3403, "step": 4370 }, { "epoch": 0.16356286655606703, "grad_norm": 0.5091459155082703, "learning_rate": 1.8708584136642717e-05, "loss": 0.3209, "step": 4375 }, { "epoch": 0.1637497955464168, "grad_norm": 0.35556361079216003, "learning_rate": 1.8705696065218398e-05, "loss": 0.311, "step": 4380 }, { "epoch": 0.1639367245367666, "grad_norm": 0.43555188179016113, "learning_rate": 1.8702804991448955e-05, "loss": 0.3309, "step": 4385 }, { "epoch": 0.1641236535271164, "grad_norm": 0.2813642919063568, "learning_rate": 1.8699910916331438e-05, "loss": 0.2802, "step": 4390 }, { "epoch": 0.16431058251746616, "grad_norm": 0.32983142137527466, "learning_rate": 1.869701384086393e-05, "loss": 0.3307, "step": 4395 }, { "epoch": 0.16449751150781597, "grad_norm": 0.8605227470397949, "learning_rate": 1.8694113766045552e-05, "loss": 0.2888, "step": 4400 }, { "epoch": 0.16468444049816575, "grad_norm": 0.4275653660297394, "learning_rate": 1.869121069287645e-05, "loss": 0.3053, "step": 4405 }, { "epoch": 0.16487136948851555, "grad_norm": 0.32885563373565674, "learning_rate": 1.8688304622357817e-05, "loss": 0.3125, "step": 4410 }, { "epoch": 0.16505829847886533, "grad_norm": 0.38488566875457764, "learning_rate": 1.868539555549187e-05, "loss": 0.2482, "step": 4415 }, { "epoch": 0.16524522746921513, "grad_norm": 0.3290596902370453, "learning_rate": 1.8682483493281864e-05, "loss": 0.3019, "step": 4420 }, { "epoch": 0.1654321564595649, "grad_norm": 0.9788287878036499, "learning_rate": 1.8679568436732084e-05, "loss": 0.3028, "step": 4425 }, { "epoch": 0.16561908544991472, "grad_norm": 0.728779137134552, "learning_rate": 1.8676650386847855e-05, "loss": 0.3395, "step": 4430 }, { "epoch": 0.1658060144402645, "grad_norm": 0.40499404072761536, "learning_rate": 1.8673729344635524e-05, "loss": 0.4077, "step": 4435 }, { "epoch": 0.1659929434306143, "grad_norm": 1.272477626800537, "learning_rate": 1.8670805311102477e-05, "loss": 0.3258, "step": 4440 }, { "epoch": 0.16617987242096408, "grad_norm": 0.32982587814331055, "learning_rate": 1.8667878287257124e-05, "loss": 0.283, "step": 4445 }, { "epoch": 0.16636680141131388, "grad_norm": 0.4656912088394165, "learning_rate": 1.8664948274108918e-05, "loss": 0.2622, "step": 4450 }, { "epoch": 0.16655373040166366, "grad_norm": 0.3891690671443939, "learning_rate": 1.866201527266834e-05, "loss": 0.3216, "step": 4455 }, { "epoch": 0.16674065939201346, "grad_norm": 0.40880194306373596, "learning_rate": 1.8659079283946882e-05, "loss": 0.2894, "step": 4460 }, { "epoch": 0.16692758838236324, "grad_norm": 1.5993868112564087, "learning_rate": 1.86561403089571e-05, "loss": 0.3405, "step": 4465 }, { "epoch": 0.16711451737271305, "grad_norm": 0.5294637084007263, "learning_rate": 1.8653198348712552e-05, "loss": 0.2742, "step": 4470 }, { "epoch": 0.16730144636306282, "grad_norm": 0.47895947098731995, "learning_rate": 1.865025340422784e-05, "loss": 0.271, "step": 4475 }, { "epoch": 0.16748837535341263, "grad_norm": 0.17551173269748688, "learning_rate": 1.864730547651859e-05, "loss": 0.2728, "step": 4480 }, { "epoch": 0.1676753043437624, "grad_norm": 0.5507614612579346, "learning_rate": 1.8644354566601458e-05, "loss": 0.3346, "step": 4485 }, { "epoch": 0.1678622333341122, "grad_norm": 0.4217040240764618, "learning_rate": 1.864140067549413e-05, "loss": 0.3026, "step": 4490 }, { "epoch": 0.168049162324462, "grad_norm": 0.31403791904449463, "learning_rate": 1.8638443804215315e-05, "loss": 0.2778, "step": 4495 }, { "epoch": 0.1682360913148118, "grad_norm": 0.3219631016254425, "learning_rate": 1.8635483953784755e-05, "loss": 0.3311, "step": 4500 }, { "epoch": 0.16842302030516157, "grad_norm": 0.708217442035675, "learning_rate": 1.8632521125223215e-05, "loss": 0.4032, "step": 4505 }, { "epoch": 0.16860994929551137, "grad_norm": 1.9042284488677979, "learning_rate": 1.8629555319552492e-05, "loss": 0.3184, "step": 4510 }, { "epoch": 0.16879687828586115, "grad_norm": 0.42551693320274353, "learning_rate": 1.862658653779541e-05, "loss": 0.2671, "step": 4515 }, { "epoch": 0.16898380727621096, "grad_norm": 0.22539173066616058, "learning_rate": 1.8623614780975813e-05, "loss": 0.3398, "step": 4520 }, { "epoch": 0.16917073626656073, "grad_norm": 0.353363037109375, "learning_rate": 1.862064005011858e-05, "loss": 0.2993, "step": 4525 }, { "epoch": 0.16935766525691054, "grad_norm": 0.25013571977615356, "learning_rate": 1.86176623462496e-05, "loss": 0.2873, "step": 4530 }, { "epoch": 0.16954459424726032, "grad_norm": 0.28396397829055786, "learning_rate": 1.8614681670395808e-05, "loss": 0.326, "step": 4535 }, { "epoch": 0.16973152323761012, "grad_norm": 0.33238470554351807, "learning_rate": 1.8611698023585146e-05, "loss": 0.3196, "step": 4540 }, { "epoch": 0.1699184522279599, "grad_norm": 0.4479984939098358, "learning_rate": 1.8608711406846595e-05, "loss": 0.3363, "step": 4545 }, { "epoch": 0.1701053812183097, "grad_norm": 0.4897647500038147, "learning_rate": 1.8605721821210146e-05, "loss": 0.3515, "step": 4550 }, { "epoch": 0.17029231020865948, "grad_norm": 0.33985239267349243, "learning_rate": 1.8602729267706833e-05, "loss": 0.3559, "step": 4555 }, { "epoch": 0.17047923919900929, "grad_norm": 0.4013058543205261, "learning_rate": 1.859973374736869e-05, "loss": 0.2976, "step": 4560 }, { "epoch": 0.17066616818935906, "grad_norm": 0.3209231197834015, "learning_rate": 1.8596735261228793e-05, "loss": 0.3168, "step": 4565 }, { "epoch": 0.17085309717970887, "grad_norm": 0.5259041786193848, "learning_rate": 1.859373381032123e-05, "loss": 0.3052, "step": 4570 }, { "epoch": 0.17104002617005865, "grad_norm": 0.34932300448417664, "learning_rate": 1.8590729395681122e-05, "loss": 0.3515, "step": 4575 }, { "epoch": 0.17122695516040845, "grad_norm": 0.615368127822876, "learning_rate": 1.8587722018344598e-05, "loss": 0.2997, "step": 4580 }, { "epoch": 0.17141388415075823, "grad_norm": 0.36148715019226074, "learning_rate": 1.8584711679348818e-05, "loss": 0.2574, "step": 4585 }, { "epoch": 0.17160081314110803, "grad_norm": 0.23802785575389862, "learning_rate": 1.8581698379731965e-05, "loss": 0.2897, "step": 4590 }, { "epoch": 0.1717877421314578, "grad_norm": 0.3403271436691284, "learning_rate": 1.857868212053324e-05, "loss": 0.3736, "step": 4595 }, { "epoch": 0.17197467112180762, "grad_norm": 0.2740768492221832, "learning_rate": 1.8575662902792854e-05, "loss": 0.3326, "step": 4600 }, { "epoch": 0.1721616001121574, "grad_norm": 0.32492953538894653, "learning_rate": 1.8572640727552064e-05, "loss": 0.3538, "step": 4605 }, { "epoch": 0.1723485291025072, "grad_norm": 0.25937604904174805, "learning_rate": 1.856961559585312e-05, "loss": 0.268, "step": 4610 }, { "epoch": 0.17253545809285697, "grad_norm": 0.28443995118141174, "learning_rate": 1.8566587508739312e-05, "loss": 0.278, "step": 4615 }, { "epoch": 0.17272238708320678, "grad_norm": 0.31061285734176636, "learning_rate": 1.856355646725493e-05, "loss": 0.3424, "step": 4620 }, { "epoch": 0.17290931607355656, "grad_norm": 0.29571932554244995, "learning_rate": 1.8560522472445304e-05, "loss": 0.3045, "step": 4625 }, { "epoch": 0.17309624506390636, "grad_norm": 0.38453951478004456, "learning_rate": 1.8557485525356765e-05, "loss": 0.3163, "step": 4630 }, { "epoch": 0.17328317405425614, "grad_norm": 0.21908405423164368, "learning_rate": 1.855444562703667e-05, "loss": 0.2375, "step": 4635 }, { "epoch": 0.17347010304460592, "grad_norm": 0.3414257764816284, "learning_rate": 1.85514027785334e-05, "loss": 0.3415, "step": 4640 }, { "epoch": 0.17365703203495572, "grad_norm": 0.6744435429573059, "learning_rate": 1.8548356980896337e-05, "loss": 0.3277, "step": 4645 }, { "epoch": 0.1738439610253055, "grad_norm": 0.6530652046203613, "learning_rate": 1.854530823517589e-05, "loss": 0.3593, "step": 4650 }, { "epoch": 0.1740308900156553, "grad_norm": 0.5472162961959839, "learning_rate": 1.854225654242349e-05, "loss": 0.295, "step": 4655 }, { "epoch": 0.17421781900600508, "grad_norm": 0.5321584939956665, "learning_rate": 1.8539201903691574e-05, "loss": 0.3521, "step": 4660 }, { "epoch": 0.1744047479963549, "grad_norm": 0.4973386824131012, "learning_rate": 1.8536144320033602e-05, "loss": 0.3237, "step": 4665 }, { "epoch": 0.17459167698670466, "grad_norm": 0.37571483850479126, "learning_rate": 1.8533083792504043e-05, "loss": 0.3179, "step": 4670 }, { "epoch": 0.17477860597705447, "grad_norm": 0.31560763716697693, "learning_rate": 1.8530020322158392e-05, "loss": 0.3937, "step": 4675 }, { "epoch": 0.17496553496740425, "grad_norm": 0.3787771165370941, "learning_rate": 1.8526953910053143e-05, "loss": 0.2801, "step": 4680 }, { "epoch": 0.17515246395775405, "grad_norm": 0.2740449905395508, "learning_rate": 1.852388455724582e-05, "loss": 0.3061, "step": 4685 }, { "epoch": 0.17533939294810383, "grad_norm": 0.43204188346862793, "learning_rate": 1.8520812264794954e-05, "loss": 0.3166, "step": 4690 }, { "epoch": 0.17552632193845363, "grad_norm": 0.2784161865711212, "learning_rate": 1.851773703376009e-05, "loss": 0.2652, "step": 4695 }, { "epoch": 0.1757132509288034, "grad_norm": 0.45802968740463257, "learning_rate": 1.8514658865201786e-05, "loss": 0.3275, "step": 4700 }, { "epoch": 0.17590017991915322, "grad_norm": 0.30260100960731506, "learning_rate": 1.8511577760181615e-05, "loss": 0.3034, "step": 4705 }, { "epoch": 0.176087108909503, "grad_norm": 0.44082552194595337, "learning_rate": 1.8508493719762162e-05, "loss": 0.3504, "step": 4710 }, { "epoch": 0.1762740378998528, "grad_norm": 0.3765546977519989, "learning_rate": 1.850540674500703e-05, "loss": 0.3524, "step": 4715 }, { "epoch": 0.17646096689020258, "grad_norm": 0.30463677644729614, "learning_rate": 1.8502316836980815e-05, "loss": 0.3177, "step": 4720 }, { "epoch": 0.17664789588055238, "grad_norm": 0.39802396297454834, "learning_rate": 1.8499223996749148e-05, "loss": 0.2554, "step": 4725 }, { "epoch": 0.17683482487090216, "grad_norm": 0.44730278849601746, "learning_rate": 1.849612822537866e-05, "loss": 0.3275, "step": 4730 }, { "epoch": 0.17702175386125196, "grad_norm": 0.20249329507350922, "learning_rate": 1.849302952393699e-05, "loss": 0.3258, "step": 4735 }, { "epoch": 0.17720868285160174, "grad_norm": 0.4662615656852722, "learning_rate": 1.8489927893492794e-05, "loss": 0.3144, "step": 4740 }, { "epoch": 0.17739561184195154, "grad_norm": 0.43017593026161194, "learning_rate": 1.8486823335115735e-05, "loss": 0.3936, "step": 4745 }, { "epoch": 0.17758254083230132, "grad_norm": 0.3419100344181061, "learning_rate": 1.8483715849876486e-05, "loss": 0.4101, "step": 4750 }, { "epoch": 0.17776946982265113, "grad_norm": 0.08327355980873108, "learning_rate": 1.8480605438846724e-05, "loss": 0.3454, "step": 4755 }, { "epoch": 0.1779563988130009, "grad_norm": 0.386849045753479, "learning_rate": 1.847749210309915e-05, "loss": 0.2913, "step": 4760 }, { "epoch": 0.1781433278033507, "grad_norm": 0.7153798937797546, "learning_rate": 1.8474375843707464e-05, "loss": 0.2839, "step": 4765 }, { "epoch": 0.1783302567937005, "grad_norm": 0.3605651259422302, "learning_rate": 1.8471256661746367e-05, "loss": 0.3087, "step": 4770 }, { "epoch": 0.1785171857840503, "grad_norm": 0.3076781630516052, "learning_rate": 1.8468134558291582e-05, "loss": 0.3429, "step": 4775 }, { "epoch": 0.17870411477440007, "grad_norm": 0.29881712794303894, "learning_rate": 1.846500953441983e-05, "loss": 0.3679, "step": 4780 }, { "epoch": 0.17889104376474987, "grad_norm": 0.39074286818504333, "learning_rate": 1.8461881591208843e-05, "loss": 0.4408, "step": 4785 }, { "epoch": 0.17907797275509965, "grad_norm": 0.5738774538040161, "learning_rate": 1.8458750729737356e-05, "loss": 0.3933, "step": 4790 }, { "epoch": 0.17926490174544946, "grad_norm": 0.6319369077682495, "learning_rate": 1.8455616951085118e-05, "loss": 0.3198, "step": 4795 }, { "epoch": 0.17945183073579923, "grad_norm": 0.4841550886631012, "learning_rate": 1.845248025633288e-05, "loss": 0.389, "step": 4800 }, { "epoch": 0.17963875972614904, "grad_norm": 0.31131798028945923, "learning_rate": 1.8449340646562396e-05, "loss": 0.2501, "step": 4805 }, { "epoch": 0.17982568871649882, "grad_norm": 0.24542942643165588, "learning_rate": 1.844619812285642e-05, "loss": 0.3488, "step": 4810 }, { "epoch": 0.18001261770684862, "grad_norm": 0.31681129336357117, "learning_rate": 1.8443052686298733e-05, "loss": 0.331, "step": 4815 }, { "epoch": 0.1801995466971984, "grad_norm": 0.22183604538440704, "learning_rate": 1.8439904337974095e-05, "loss": 0.259, "step": 4820 }, { "epoch": 0.1803864756875482, "grad_norm": 1.1499030590057373, "learning_rate": 1.843675307896829e-05, "loss": 0.2915, "step": 4825 }, { "epoch": 0.18057340467789798, "grad_norm": 0.2945778965950012, "learning_rate": 1.8433598910368085e-05, "loss": 0.3305, "step": 4830 }, { "epoch": 0.18076033366824779, "grad_norm": 0.3298734724521637, "learning_rate": 1.8430441833261273e-05, "loss": 0.2948, "step": 4835 }, { "epoch": 0.18094726265859756, "grad_norm": 0.46152833104133606, "learning_rate": 1.8427281848736632e-05, "loss": 0.3241, "step": 4840 }, { "epoch": 0.18113419164894737, "grad_norm": 0.37429046630859375, "learning_rate": 1.8424118957883954e-05, "loss": 0.4391, "step": 4845 }, { "epoch": 0.18132112063929715, "grad_norm": 0.44875389337539673, "learning_rate": 1.8420953161794033e-05, "loss": 0.2826, "step": 4850 }, { "epoch": 0.18150804962964695, "grad_norm": 0.287893146276474, "learning_rate": 1.8417784461558656e-05, "loss": 0.2415, "step": 4855 }, { "epoch": 0.18169497861999673, "grad_norm": 0.36352619528770447, "learning_rate": 1.8414612858270616e-05, "loss": 0.2222, "step": 4860 }, { "epoch": 0.18188190761034653, "grad_norm": 0.35056108236312866, "learning_rate": 1.841143835302371e-05, "loss": 0.2468, "step": 4865 }, { "epoch": 0.1820688366006963, "grad_norm": 0.384459912776947, "learning_rate": 1.8408260946912736e-05, "loss": 0.2602, "step": 4870 }, { "epoch": 0.18225576559104611, "grad_norm": 0.38520097732543945, "learning_rate": 1.8405080641033487e-05, "loss": 0.2951, "step": 4875 }, { "epoch": 0.1824426945813959, "grad_norm": 0.2897312343120575, "learning_rate": 1.8401897436482762e-05, "loss": 0.3556, "step": 4880 }, { "epoch": 0.18262962357174567, "grad_norm": 0.5431839823722839, "learning_rate": 1.8398711334358355e-05, "loss": 0.2967, "step": 4885 }, { "epoch": 0.18281655256209547, "grad_norm": 0.6530733108520508, "learning_rate": 1.8395522335759067e-05, "loss": 0.296, "step": 4890 }, { "epoch": 0.18300348155244525, "grad_norm": 0.5999047160148621, "learning_rate": 1.8392330441784683e-05, "loss": 0.3362, "step": 4895 }, { "epoch": 0.18319041054279506, "grad_norm": 0.5073540806770325, "learning_rate": 1.8389135653535998e-05, "loss": 0.2777, "step": 4900 }, { "epoch": 0.18337733953314483, "grad_norm": 0.48142045736312866, "learning_rate": 1.838593797211481e-05, "loss": 0.2659, "step": 4905 }, { "epoch": 0.18356426852349464, "grad_norm": 0.2865387201309204, "learning_rate": 1.8382737398623904e-05, "loss": 0.2702, "step": 4910 }, { "epoch": 0.18375119751384442, "grad_norm": 0.6015134453773499, "learning_rate": 1.8379533934167063e-05, "loss": 0.3637, "step": 4915 }, { "epoch": 0.18393812650419422, "grad_norm": 0.33356979489326477, "learning_rate": 1.8376327579849068e-05, "loss": 0.3628, "step": 4920 }, { "epoch": 0.184125055494544, "grad_norm": 0.7000239491462708, "learning_rate": 1.8373118336775707e-05, "loss": 0.3346, "step": 4925 }, { "epoch": 0.1843119844848938, "grad_norm": 0.4333287477493286, "learning_rate": 1.8369906206053753e-05, "loss": 0.2867, "step": 4930 }, { "epoch": 0.18449891347524358, "grad_norm": 0.4251829981803894, "learning_rate": 1.8366691188790976e-05, "loss": 0.2435, "step": 4935 }, { "epoch": 0.1846858424655934, "grad_norm": 0.41755521297454834, "learning_rate": 1.836347328609614e-05, "loss": 0.3786, "step": 4940 }, { "epoch": 0.18487277145594316, "grad_norm": 0.40600693225860596, "learning_rate": 1.8360252499079015e-05, "loss": 0.2621, "step": 4945 }, { "epoch": 0.18505970044629297, "grad_norm": 0.5500521063804626, "learning_rate": 1.8357028828850356e-05, "loss": 0.2544, "step": 4950 }, { "epoch": 0.18524662943664275, "grad_norm": 0.5678396224975586, "learning_rate": 1.8353802276521908e-05, "loss": 0.3178, "step": 4955 }, { "epoch": 0.18543355842699255, "grad_norm": 0.6940974593162537, "learning_rate": 1.8350572843206425e-05, "loss": 0.3536, "step": 4960 }, { "epoch": 0.18562048741734233, "grad_norm": 0.2244710922241211, "learning_rate": 1.8347340530017642e-05, "loss": 0.3592, "step": 4965 }, { "epoch": 0.18580741640769213, "grad_norm": 0.3188341557979584, "learning_rate": 1.8344105338070294e-05, "loss": 0.35, "step": 4970 }, { "epoch": 0.1859943453980419, "grad_norm": 0.33533531427383423, "learning_rate": 1.8340867268480102e-05, "loss": 0.3611, "step": 4975 }, { "epoch": 0.18618127438839172, "grad_norm": 0.3602057099342346, "learning_rate": 1.8337626322363782e-05, "loss": 0.3204, "step": 4980 }, { "epoch": 0.1863682033787415, "grad_norm": 0.4599422216415405, "learning_rate": 1.833438250083905e-05, "loss": 0.3248, "step": 4985 }, { "epoch": 0.1865551323690913, "grad_norm": 0.24511076509952545, "learning_rate": 1.8331135805024606e-05, "loss": 0.3311, "step": 4990 }, { "epoch": 0.18674206135944107, "grad_norm": 0.6302551031112671, "learning_rate": 1.8327886236040137e-05, "loss": 0.3417, "step": 4995 }, { "epoch": 0.18692899034979088, "grad_norm": 0.3400326073169708, "learning_rate": 1.832463379500633e-05, "loss": 0.2946, "step": 5000 }, { "epoch": 0.18711591934014066, "grad_norm": 0.2726198732852936, "learning_rate": 1.8321378483044855e-05, "loss": 0.317, "step": 5005 }, { "epoch": 0.18730284833049046, "grad_norm": 0.28910648822784424, "learning_rate": 1.8318120301278382e-05, "loss": 0.4624, "step": 5010 }, { "epoch": 0.18748977732084024, "grad_norm": 0.47752344608306885, "learning_rate": 1.831485925083056e-05, "loss": 0.3485, "step": 5015 }, { "epoch": 0.18767670631119004, "grad_norm": 0.4108814597129822, "learning_rate": 1.8311595332826034e-05, "loss": 0.2964, "step": 5020 }, { "epoch": 0.18786363530153982, "grad_norm": 0.38854309916496277, "learning_rate": 1.8308328548390437e-05, "loss": 0.3555, "step": 5025 }, { "epoch": 0.18805056429188963, "grad_norm": 0.3640444576740265, "learning_rate": 1.8305058898650387e-05, "loss": 0.2959, "step": 5030 }, { "epoch": 0.1882374932822394, "grad_norm": 0.5036413073539734, "learning_rate": 1.830178638473349e-05, "loss": 0.2471, "step": 5035 }, { "epoch": 0.1884244222725892, "grad_norm": 0.8091880083084106, "learning_rate": 1.8298511007768347e-05, "loss": 0.3366, "step": 5040 }, { "epoch": 0.188611351262939, "grad_norm": 0.43067753314971924, "learning_rate": 1.829523276888454e-05, "loss": 0.2554, "step": 5045 }, { "epoch": 0.1887982802532888, "grad_norm": 0.1925475299358368, "learning_rate": 1.8291951669212637e-05, "loss": 0.2483, "step": 5050 }, { "epoch": 0.18898520924363857, "grad_norm": 0.5737656354904175, "learning_rate": 1.82886677098842e-05, "loss": 0.3381, "step": 5055 }, { "epoch": 0.18917213823398837, "grad_norm": 0.6750152707099915, "learning_rate": 1.828538089203177e-05, "loss": 0.3419, "step": 5060 }, { "epoch": 0.18935906722433815, "grad_norm": 0.4922321140766144, "learning_rate": 1.828209121678888e-05, "loss": 0.268, "step": 5065 }, { "epoch": 0.18954599621468796, "grad_norm": 0.557983934879303, "learning_rate": 1.8278798685290037e-05, "loss": 0.2987, "step": 5070 }, { "epoch": 0.18973292520503773, "grad_norm": 0.4714908301830292, "learning_rate": 1.8275503298670742e-05, "loss": 0.3768, "step": 5075 }, { "epoch": 0.18991985419538754, "grad_norm": 0.38415268063545227, "learning_rate": 1.8272205058067488e-05, "loss": 0.2899, "step": 5080 }, { "epoch": 0.19010678318573732, "grad_norm": 0.640612006187439, "learning_rate": 1.8268903964617738e-05, "loss": 0.3488, "step": 5085 }, { "epoch": 0.19029371217608712, "grad_norm": 0.3866754472255707, "learning_rate": 1.826560001945994e-05, "loss": 0.2575, "step": 5090 }, { "epoch": 0.1904806411664369, "grad_norm": 0.6181028485298157, "learning_rate": 1.826229322373354e-05, "loss": 0.3078, "step": 5095 }, { "epoch": 0.1906675701567867, "grad_norm": 0.5132717490196228, "learning_rate": 1.825898357857895e-05, "loss": 0.3201, "step": 5100 }, { "epoch": 0.19085449914713648, "grad_norm": 0.3530460298061371, "learning_rate": 1.825567108513757e-05, "loss": 0.27, "step": 5105 }, { "epoch": 0.19104142813748629, "grad_norm": 0.6483767628669739, "learning_rate": 1.825235574455179e-05, "loss": 0.3382, "step": 5110 }, { "epoch": 0.19122835712783606, "grad_norm": 0.6425184607505798, "learning_rate": 1.8249037557964975e-05, "loss": 0.2799, "step": 5115 }, { "epoch": 0.19141528611818587, "grad_norm": 0.5412061810493469, "learning_rate": 1.8245716526521475e-05, "loss": 0.3593, "step": 5120 }, { "epoch": 0.19160221510853565, "grad_norm": 0.4309811592102051, "learning_rate": 1.8242392651366607e-05, "loss": 0.3289, "step": 5125 }, { "epoch": 0.19178914409888542, "grad_norm": 0.48865464329719543, "learning_rate": 1.823906593364669e-05, "loss": 0.2389, "step": 5130 }, { "epoch": 0.19197607308923523, "grad_norm": 0.4698772132396698, "learning_rate": 1.8235736374509015e-05, "loss": 0.3453, "step": 5135 }, { "epoch": 0.192163002079585, "grad_norm": 0.39952799677848816, "learning_rate": 1.8232403975101845e-05, "loss": 0.2537, "step": 5140 }, { "epoch": 0.1923499310699348, "grad_norm": 0.3500300347805023, "learning_rate": 1.8229068736574434e-05, "loss": 0.3181, "step": 5145 }, { "epoch": 0.1925368600602846, "grad_norm": 0.3574506342411041, "learning_rate": 1.8225730660077007e-05, "loss": 0.2995, "step": 5150 }, { "epoch": 0.1927237890506344, "grad_norm": 0.5655731558799744, "learning_rate": 1.8222389746760774e-05, "loss": 0.2967, "step": 5155 }, { "epoch": 0.19291071804098417, "grad_norm": 0.5786092877388, "learning_rate": 1.8219045997777916e-05, "loss": 0.265, "step": 5160 }, { "epoch": 0.19309764703133397, "grad_norm": 0.30824795365333557, "learning_rate": 1.8215699414281602e-05, "loss": 0.3385, "step": 5165 }, { "epoch": 0.19328457602168375, "grad_norm": 0.379058301448822, "learning_rate": 1.8212349997425967e-05, "loss": 0.3196, "step": 5170 }, { "epoch": 0.19347150501203356, "grad_norm": 0.2118494063615799, "learning_rate": 1.820899774836613e-05, "loss": 0.2843, "step": 5175 }, { "epoch": 0.19365843400238333, "grad_norm": 0.49254727363586426, "learning_rate": 1.820564266825819e-05, "loss": 0.3449, "step": 5180 }, { "epoch": 0.19384536299273314, "grad_norm": 0.45277997851371765, "learning_rate": 1.8202284758259215e-05, "loss": 0.3104, "step": 5185 }, { "epoch": 0.19403229198308292, "grad_norm": 0.45387160778045654, "learning_rate": 1.8198924019527252e-05, "loss": 0.3288, "step": 5190 }, { "epoch": 0.19421922097343272, "grad_norm": 0.30750924348831177, "learning_rate": 1.8195560453221322e-05, "loss": 0.2391, "step": 5195 }, { "epoch": 0.1944061499637825, "grad_norm": 0.2546007037162781, "learning_rate": 1.8192194060501428e-05, "loss": 0.3273, "step": 5200 }, { "epoch": 0.1945930789541323, "grad_norm": 0.37746095657348633, "learning_rate": 1.8188824842528535e-05, "loss": 0.401, "step": 5205 }, { "epoch": 0.19478000794448208, "grad_norm": 0.3548900783061981, "learning_rate": 1.8185452800464593e-05, "loss": 0.2809, "step": 5210 }, { "epoch": 0.19496693693483189, "grad_norm": 0.21679793298244476, "learning_rate": 1.8182077935472525e-05, "loss": 0.337, "step": 5215 }, { "epoch": 0.19515386592518166, "grad_norm": 0.6175905466079712, "learning_rate": 1.8178700248716225e-05, "loss": 0.3285, "step": 5220 }, { "epoch": 0.19534079491553147, "grad_norm": 0.42689886689186096, "learning_rate": 1.8175319741360553e-05, "loss": 0.3127, "step": 5225 }, { "epoch": 0.19552772390588125, "grad_norm": 0.26765209436416626, "learning_rate": 1.8171936414571358e-05, "loss": 0.2753, "step": 5230 }, { "epoch": 0.19571465289623105, "grad_norm": 0.2162155956029892, "learning_rate": 1.816855026951545e-05, "loss": 0.3051, "step": 5235 }, { "epoch": 0.19590158188658083, "grad_norm": 0.4493172764778137, "learning_rate": 1.8165161307360613e-05, "loss": 0.2757, "step": 5240 }, { "epoch": 0.19608851087693063, "grad_norm": 0.3421705961227417, "learning_rate": 1.81617695292756e-05, "loss": 0.3372, "step": 5245 }, { "epoch": 0.1962754398672804, "grad_norm": 0.48634567856788635, "learning_rate": 1.8158374936430144e-05, "loss": 0.3392, "step": 5250 }, { "epoch": 0.19646236885763022, "grad_norm": 0.3790494203567505, "learning_rate": 1.8154977529994934e-05, "loss": 0.2742, "step": 5255 }, { "epoch": 0.19664929784798, "grad_norm": 0.48067718744277954, "learning_rate": 1.8151577311141647e-05, "loss": 0.304, "step": 5260 }, { "epoch": 0.1968362268383298, "grad_norm": 0.374163419008255, "learning_rate": 1.814817428104292e-05, "loss": 0.2721, "step": 5265 }, { "epoch": 0.19702315582867957, "grad_norm": 0.3786110281944275, "learning_rate": 1.8144768440872353e-05, "loss": 0.3316, "step": 5270 }, { "epoch": 0.19721008481902938, "grad_norm": 0.4367314875125885, "learning_rate": 1.8141359791804532e-05, "loss": 0.3282, "step": 5275 }, { "epoch": 0.19739701380937916, "grad_norm": 0.3415897786617279, "learning_rate": 1.8137948335014998e-05, "loss": 0.2399, "step": 5280 }, { "epoch": 0.19758394279972896, "grad_norm": 0.46700483560562134, "learning_rate": 1.813453407168026e-05, "loss": 0.2813, "step": 5285 }, { "epoch": 0.19777087179007874, "grad_norm": 0.30502578616142273, "learning_rate": 1.813111700297781e-05, "loss": 0.3091, "step": 5290 }, { "epoch": 0.19795780078042854, "grad_norm": 0.37599360942840576, "learning_rate": 1.812769713008609e-05, "loss": 0.2241, "step": 5295 }, { "epoch": 0.19814472977077832, "grad_norm": 0.4802122712135315, "learning_rate": 1.812427445418452e-05, "loss": 0.2691, "step": 5300 }, { "epoch": 0.19833165876112813, "grad_norm": 0.41680780053138733, "learning_rate": 1.8120848976453475e-05, "loss": 0.4307, "step": 5305 }, { "epoch": 0.1985185877514779, "grad_norm": 0.4099524915218353, "learning_rate": 1.8117420698074318e-05, "loss": 0.3089, "step": 5310 }, { "epoch": 0.1987055167418277, "grad_norm": 0.6154845952987671, "learning_rate": 1.811398962022935e-05, "loss": 0.3269, "step": 5315 }, { "epoch": 0.1988924457321775, "grad_norm": 0.29491856694221497, "learning_rate": 1.811055574410186e-05, "loss": 0.2856, "step": 5320 }, { "epoch": 0.1990793747225273, "grad_norm": 0.3641104996204376, "learning_rate": 1.810711907087609e-05, "loss": 0.2881, "step": 5325 }, { "epoch": 0.19926630371287707, "grad_norm": 0.33928000926971436, "learning_rate": 1.8103679601737244e-05, "loss": 0.2707, "step": 5330 }, { "epoch": 0.19945323270322687, "grad_norm": 0.21978841722011566, "learning_rate": 1.810023733787151e-05, "loss": 0.2939, "step": 5335 }, { "epoch": 0.19964016169357665, "grad_norm": 0.3131517469882965, "learning_rate": 1.8096792280466016e-05, "loss": 0.2912, "step": 5340 }, { "epoch": 0.19982709068392646, "grad_norm": 0.42469385266304016, "learning_rate": 1.8093344430708873e-05, "loss": 0.2581, "step": 5345 }, { "epoch": 0.20001401967427623, "grad_norm": 0.31522685289382935, "learning_rate": 1.8089893789789134e-05, "loss": 0.3042, "step": 5350 }, { "epoch": 0.20020094866462604, "grad_norm": 0.5416140556335449, "learning_rate": 1.8086440358896834e-05, "loss": 0.2743, "step": 5355 }, { "epoch": 0.20038787765497582, "grad_norm": 0.424735963344574, "learning_rate": 1.808298413922296e-05, "loss": 0.2687, "step": 5360 }, { "epoch": 0.20057480664532562, "grad_norm": 0.24916088581085205, "learning_rate": 1.807952513195946e-05, "loss": 0.3091, "step": 5365 }, { "epoch": 0.2007617356356754, "grad_norm": 0.5970064401626587, "learning_rate": 1.8076063338299254e-05, "loss": 0.3088, "step": 5370 }, { "epoch": 0.20094866462602518, "grad_norm": 0.5019383430480957, "learning_rate": 1.807259875943621e-05, "loss": 0.2508, "step": 5375 }, { "epoch": 0.20113559361637498, "grad_norm": 0.39974379539489746, "learning_rate": 1.8069131396565164e-05, "loss": 0.2661, "step": 5380 }, { "epoch": 0.20132252260672476, "grad_norm": 0.42386549711227417, "learning_rate": 1.8065661250881908e-05, "loss": 0.2875, "step": 5385 }, { "epoch": 0.20150945159707456, "grad_norm": 0.626395583152771, "learning_rate": 1.8062188323583193e-05, "loss": 0.3221, "step": 5390 }, { "epoch": 0.20169638058742434, "grad_norm": 0.5648181438446045, "learning_rate": 1.805871261586674e-05, "loss": 0.308, "step": 5395 }, { "epoch": 0.20188330957777414, "grad_norm": 0.39882054924964905, "learning_rate": 1.8055234128931218e-05, "loss": 0.2746, "step": 5400 }, { "epoch": 0.20207023856812392, "grad_norm": 0.3538493812084198, "learning_rate": 1.8051752863976257e-05, "loss": 0.364, "step": 5405 }, { "epoch": 0.20225716755847373, "grad_norm": 0.5525597333908081, "learning_rate": 1.804826882220244e-05, "loss": 0.315, "step": 5410 }, { "epoch": 0.2024440965488235, "grad_norm": 0.42425400018692017, "learning_rate": 1.8044782004811325e-05, "loss": 0.3296, "step": 5415 }, { "epoch": 0.2026310255391733, "grad_norm": 0.3802151083946228, "learning_rate": 1.8041292413005406e-05, "loss": 0.2939, "step": 5420 }, { "epoch": 0.2028179545295231, "grad_norm": 0.6813692450523376, "learning_rate": 1.8037800047988145e-05, "loss": 0.329, "step": 5425 }, { "epoch": 0.2030048835198729, "grad_norm": 0.4450630247592926, "learning_rate": 1.8034304910963957e-05, "loss": 0.2982, "step": 5430 }, { "epoch": 0.20319181251022267, "grad_norm": 0.4038054347038269, "learning_rate": 1.8030807003138223e-05, "loss": 0.4678, "step": 5435 }, { "epoch": 0.20337874150057247, "grad_norm": 0.6570813655853271, "learning_rate": 1.8027306325717263e-05, "loss": 0.2544, "step": 5440 }, { "epoch": 0.20356567049092225, "grad_norm": 0.3153902292251587, "learning_rate": 1.802380287990836e-05, "loss": 0.2733, "step": 5445 }, { "epoch": 0.20375259948127206, "grad_norm": 0.3435896635055542, "learning_rate": 1.802029666691976e-05, "loss": 0.2971, "step": 5450 }, { "epoch": 0.20393952847162183, "grad_norm": 0.24299199879169464, "learning_rate": 1.8016787687960645e-05, "loss": 0.2603, "step": 5455 }, { "epoch": 0.20412645746197164, "grad_norm": 0.7944544553756714, "learning_rate": 1.801327594424117e-05, "loss": 0.3944, "step": 5460 }, { "epoch": 0.20431338645232142, "grad_norm": 0.3297552764415741, "learning_rate": 1.8009761436972427e-05, "loss": 0.282, "step": 5465 }, { "epoch": 0.20450031544267122, "grad_norm": 0.48783180117607117, "learning_rate": 1.8006244167366478e-05, "loss": 0.2335, "step": 5470 }, { "epoch": 0.204687244433021, "grad_norm": 0.6959665417671204, "learning_rate": 1.800272413663632e-05, "loss": 0.2497, "step": 5475 }, { "epoch": 0.2048741734233708, "grad_norm": 0.3114238679409027, "learning_rate": 1.7999201345995918e-05, "loss": 0.248, "step": 5480 }, { "epoch": 0.20506110241372058, "grad_norm": 0.4099029004573822, "learning_rate": 1.7995675796660175e-05, "loss": 0.3176, "step": 5485 }, { "epoch": 0.20524803140407039, "grad_norm": 0.9323566555976868, "learning_rate": 1.7992147489844956e-05, "loss": 0.3653, "step": 5490 }, { "epoch": 0.20543496039442016, "grad_norm": 0.5710521340370178, "learning_rate": 1.798861642676707e-05, "loss": 0.2791, "step": 5495 }, { "epoch": 0.20562188938476997, "grad_norm": 0.39266684651374817, "learning_rate": 1.7985082608644285e-05, "loss": 0.2759, "step": 5500 }, { "epoch": 0.20580881837511975, "grad_norm": 0.9457874298095703, "learning_rate": 1.7981546036695307e-05, "loss": 0.354, "step": 5505 }, { "epoch": 0.20599574736546955, "grad_norm": 0.507675290107727, "learning_rate": 1.7978006712139802e-05, "loss": 0.3088, "step": 5510 }, { "epoch": 0.20618267635581933, "grad_norm": 0.44382691383361816, "learning_rate": 1.797446463619838e-05, "loss": 0.3084, "step": 5515 }, { "epoch": 0.20636960534616913, "grad_norm": 0.6009101271629333, "learning_rate": 1.7970919810092603e-05, "loss": 0.2258, "step": 5520 }, { "epoch": 0.2065565343365189, "grad_norm": 0.46515893936157227, "learning_rate": 1.7967372235044975e-05, "loss": 0.3775, "step": 5525 }, { "epoch": 0.20674346332686871, "grad_norm": 0.3712780475616455, "learning_rate": 1.7963821912278963e-05, "loss": 0.3656, "step": 5530 }, { "epoch": 0.2069303923172185, "grad_norm": 0.6315010190010071, "learning_rate": 1.7960268843018964e-05, "loss": 0.3175, "step": 5535 }, { "epoch": 0.2071173213075683, "grad_norm": 0.4107741117477417, "learning_rate": 1.7956713028490332e-05, "loss": 0.2603, "step": 5540 }, { "epoch": 0.20730425029791807, "grad_norm": 0.2759909927845001, "learning_rate": 1.7953154469919365e-05, "loss": 0.2768, "step": 5545 }, { "epoch": 0.20749117928826788, "grad_norm": 0.3919317126274109, "learning_rate": 1.7949593168533304e-05, "loss": 0.4107, "step": 5550 }, { "epoch": 0.20767810827861766, "grad_norm": 0.23572702705860138, "learning_rate": 1.7946029125560352e-05, "loss": 0.3234, "step": 5555 }, { "epoch": 0.20786503726896746, "grad_norm": 0.40106138586997986, "learning_rate": 1.794246234222963e-05, "loss": 0.3748, "step": 5560 }, { "epoch": 0.20805196625931724, "grad_norm": 0.5880841016769409, "learning_rate": 1.793889281977123e-05, "loss": 0.2439, "step": 5565 }, { "epoch": 0.20823889524966704, "grad_norm": 0.2483060657978058, "learning_rate": 1.7935320559416173e-05, "loss": 0.3053, "step": 5570 }, { "epoch": 0.20842582424001682, "grad_norm": 1.2637639045715332, "learning_rate": 1.7931745562396432e-05, "loss": 0.3467, "step": 5575 }, { "epoch": 0.20861275323036663, "grad_norm": 0.26382896304130554, "learning_rate": 1.7928167829944917e-05, "loss": 0.2091, "step": 5580 }, { "epoch": 0.2087996822207164, "grad_norm": 0.3013812005519867, "learning_rate": 1.7924587363295493e-05, "loss": 0.2777, "step": 5585 }, { "epoch": 0.2089866112110662, "grad_norm": 0.5539411306381226, "learning_rate": 1.792100416368295e-05, "loss": 0.3764, "step": 5590 }, { "epoch": 0.209173540201416, "grad_norm": 0.40408313274383545, "learning_rate": 1.791741823234304e-05, "loss": 0.3294, "step": 5595 }, { "epoch": 0.2093604691917658, "grad_norm": 0.7178158164024353, "learning_rate": 1.7913829570512445e-05, "loss": 0.3448, "step": 5600 }, { "epoch": 0.20954739818211557, "grad_norm": 0.31547850370407104, "learning_rate": 1.791023817942879e-05, "loss": 0.3079, "step": 5605 }, { "epoch": 0.20973432717246537, "grad_norm": 0.4582591950893402, "learning_rate": 1.7906644060330646e-05, "loss": 0.2815, "step": 5610 }, { "epoch": 0.20992125616281515, "grad_norm": 0.7002764344215393, "learning_rate": 1.7903047214457517e-05, "loss": 0.2797, "step": 5615 }, { "epoch": 0.21010818515316493, "grad_norm": 0.4524244964122772, "learning_rate": 1.7899447643049855e-05, "loss": 0.402, "step": 5620 }, { "epoch": 0.21029511414351473, "grad_norm": 0.40630853176116943, "learning_rate": 1.7895845347349047e-05, "loss": 0.3465, "step": 5625 }, { "epoch": 0.2104820431338645, "grad_norm": 0.21824871003627777, "learning_rate": 1.7892240328597427e-05, "loss": 0.2492, "step": 5630 }, { "epoch": 0.21066897212421432, "grad_norm": 0.3022218644618988, "learning_rate": 1.7888632588038256e-05, "loss": 0.2882, "step": 5635 }, { "epoch": 0.2108559011145641, "grad_norm": 0.761012077331543, "learning_rate": 1.7885022126915743e-05, "loss": 0.3003, "step": 5640 }, { "epoch": 0.2110428301049139, "grad_norm": 0.39378008246421814, "learning_rate": 1.7881408946475035e-05, "loss": 0.296, "step": 5645 }, { "epoch": 0.21122975909526367, "grad_norm": 0.6320983171463013, "learning_rate": 1.787779304796221e-05, "loss": 0.2578, "step": 5650 }, { "epoch": 0.21141668808561348, "grad_norm": 0.3272438943386078, "learning_rate": 1.787417443262429e-05, "loss": 0.297, "step": 5655 }, { "epoch": 0.21160361707596326, "grad_norm": 0.4508860409259796, "learning_rate": 1.7870553101709232e-05, "loss": 0.2263, "step": 5660 }, { "epoch": 0.21179054606631306, "grad_norm": 0.3349319398403168, "learning_rate": 1.7866929056465933e-05, "loss": 0.3182, "step": 5665 }, { "epoch": 0.21197747505666284, "grad_norm": 0.7591943740844727, "learning_rate": 1.7863302298144218e-05, "loss": 0.2884, "step": 5670 }, { "epoch": 0.21216440404701264, "grad_norm": 0.35864681005477905, "learning_rate": 1.785967282799485e-05, "loss": 0.2924, "step": 5675 }, { "epoch": 0.21235133303736242, "grad_norm": 0.42692896723747253, "learning_rate": 1.785604064726953e-05, "loss": 0.2845, "step": 5680 }, { "epoch": 0.21253826202771223, "grad_norm": 0.5757957100868225, "learning_rate": 1.7852405757220898e-05, "loss": 0.3391, "step": 5685 }, { "epoch": 0.212725191018062, "grad_norm": 0.48782822489738464, "learning_rate": 1.7848768159102522e-05, "loss": 0.3037, "step": 5690 }, { "epoch": 0.2129121200084118, "grad_norm": 0.31845298409461975, "learning_rate": 1.78451278541689e-05, "loss": 0.2772, "step": 5695 }, { "epoch": 0.2130990489987616, "grad_norm": 0.33685699105262756, "learning_rate": 1.7841484843675473e-05, "loss": 0.319, "step": 5700 }, { "epoch": 0.2132859779891114, "grad_norm": 0.391272634267807, "learning_rate": 1.7837839128878608e-05, "loss": 0.324, "step": 5705 }, { "epoch": 0.21347290697946117, "grad_norm": 0.42025449872016907, "learning_rate": 1.7834190711035613e-05, "loss": 0.2744, "step": 5710 }, { "epoch": 0.21365983596981097, "grad_norm": 0.5056503415107727, "learning_rate": 1.7830539591404717e-05, "loss": 0.2977, "step": 5715 }, { "epoch": 0.21384676496016075, "grad_norm": 0.5261359214782715, "learning_rate": 1.7826885771245094e-05, "loss": 0.3341, "step": 5720 }, { "epoch": 0.21403369395051056, "grad_norm": 0.44913962483406067, "learning_rate": 1.7823229251816836e-05, "loss": 0.2768, "step": 5725 }, { "epoch": 0.21422062294086033, "grad_norm": 0.2031090408563614, "learning_rate": 1.7819570034380968e-05, "loss": 0.2962, "step": 5730 }, { "epoch": 0.21440755193121014, "grad_norm": 0.3239678144454956, "learning_rate": 1.7815908120199462e-05, "loss": 0.2717, "step": 5735 }, { "epoch": 0.21459448092155992, "grad_norm": 0.25322166085243225, "learning_rate": 1.7812243510535194e-05, "loss": 0.3227, "step": 5740 }, { "epoch": 0.21478140991190972, "grad_norm": 0.2611919045448303, "learning_rate": 1.780857620665199e-05, "loss": 0.3009, "step": 5745 }, { "epoch": 0.2149683389022595, "grad_norm": 0.580700695514679, "learning_rate": 1.7804906209814597e-05, "loss": 0.26, "step": 5750 }, { "epoch": 0.2151552678926093, "grad_norm": 0.7626398205757141, "learning_rate": 1.780123352128869e-05, "loss": 0.3374, "step": 5755 }, { "epoch": 0.21534219688295908, "grad_norm": 0.34990110993385315, "learning_rate": 1.779755814234087e-05, "loss": 0.2281, "step": 5760 }, { "epoch": 0.21552912587330889, "grad_norm": 0.5652785301208496, "learning_rate": 1.779388007423868e-05, "loss": 0.324, "step": 5765 }, { "epoch": 0.21571605486365866, "grad_norm": 0.45265763998031616, "learning_rate": 1.779019931825058e-05, "loss": 0.3418, "step": 5770 }, { "epoch": 0.21590298385400847, "grad_norm": 0.47476527094841003, "learning_rate": 1.7786515875645945e-05, "loss": 0.2861, "step": 5775 }, { "epoch": 0.21608991284435825, "grad_norm": 0.4736066162586212, "learning_rate": 1.77828297476951e-05, "loss": 0.3211, "step": 5780 }, { "epoch": 0.21627684183470805, "grad_norm": 0.4887428879737854, "learning_rate": 1.777914093566928e-05, "loss": 0.3205, "step": 5785 }, { "epoch": 0.21646377082505783, "grad_norm": 0.7084537148475647, "learning_rate": 1.7775449440840656e-05, "loss": 0.2698, "step": 5790 }, { "epoch": 0.21665069981540763, "grad_norm": 0.7960801124572754, "learning_rate": 1.777175526448231e-05, "loss": 0.3193, "step": 5795 }, { "epoch": 0.2168376288057574, "grad_norm": 0.48259130120277405, "learning_rate": 1.776805840786826e-05, "loss": 0.3232, "step": 5800 }, { "epoch": 0.21702455779610721, "grad_norm": 0.5537059307098389, "learning_rate": 1.7764358872273452e-05, "loss": 0.2775, "step": 5805 }, { "epoch": 0.217211486786457, "grad_norm": 0.2787805497646332, "learning_rate": 1.7760656658973748e-05, "loss": 0.285, "step": 5810 }, { "epoch": 0.2173984157768068, "grad_norm": 0.6352003216743469, "learning_rate": 1.7756951769245933e-05, "loss": 0.3008, "step": 5815 }, { "epoch": 0.21758534476715657, "grad_norm": 0.45176759362220764, "learning_rate": 1.7753244204367713e-05, "loss": 0.3192, "step": 5820 }, { "epoch": 0.21777227375750638, "grad_norm": 0.38481858372688293, "learning_rate": 1.7749533965617726e-05, "loss": 0.3189, "step": 5825 }, { "epoch": 0.21795920274785616, "grad_norm": 0.2588234841823578, "learning_rate": 1.7745821054275533e-05, "loss": 0.3369, "step": 5830 }, { "epoch": 0.21814613173820596, "grad_norm": 0.4388332664966583, "learning_rate": 1.7742105471621597e-05, "loss": 0.3092, "step": 5835 }, { "epoch": 0.21833306072855574, "grad_norm": 0.5704757571220398, "learning_rate": 1.7738387218937326e-05, "loss": 0.3485, "step": 5840 }, { "epoch": 0.21851998971890554, "grad_norm": 0.23019002377986908, "learning_rate": 1.7734666297505034e-05, "loss": 0.3094, "step": 5845 }, { "epoch": 0.21870691870925532, "grad_norm": 0.582446277141571, "learning_rate": 1.7730942708607965e-05, "loss": 0.3464, "step": 5850 }, { "epoch": 0.21889384769960513, "grad_norm": 0.621008574962616, "learning_rate": 1.7727216453530275e-05, "loss": 0.3706, "step": 5855 }, { "epoch": 0.2190807766899549, "grad_norm": 0.4594647288322449, "learning_rate": 1.7723487533557043e-05, "loss": 0.3341, "step": 5860 }, { "epoch": 0.21926770568030468, "grad_norm": 0.3757644593715668, "learning_rate": 1.7719755949974264e-05, "loss": 0.2758, "step": 5865 }, { "epoch": 0.21945463467065449, "grad_norm": 0.38085031509399414, "learning_rate": 1.7716021704068858e-05, "loss": 0.248, "step": 5870 }, { "epoch": 0.21964156366100426, "grad_norm": 0.22712992131710052, "learning_rate": 1.771228479712866e-05, "loss": 0.2829, "step": 5875 }, { "epoch": 0.21982849265135407, "grad_norm": 0.7901753783226013, "learning_rate": 1.770854523044242e-05, "loss": 0.3236, "step": 5880 }, { "epoch": 0.22001542164170385, "grad_norm": 0.3070071041584015, "learning_rate": 1.7704803005299806e-05, "loss": 0.2932, "step": 5885 }, { "epoch": 0.22020235063205365, "grad_norm": 0.6579908132553101, "learning_rate": 1.7701058122991407e-05, "loss": 0.3087, "step": 5890 }, { "epoch": 0.22038927962240343, "grad_norm": 0.29225432872772217, "learning_rate": 1.7697310584808726e-05, "loss": 0.2783, "step": 5895 }, { "epoch": 0.22057620861275323, "grad_norm": 0.4828365743160248, "learning_rate": 1.769356039204418e-05, "loss": 0.3803, "step": 5900 }, { "epoch": 0.220763137603103, "grad_norm": 0.47344258427619934, "learning_rate": 1.76898075459911e-05, "loss": 0.3296, "step": 5905 }, { "epoch": 0.22095006659345282, "grad_norm": 0.335570752620697, "learning_rate": 1.768605204794374e-05, "loss": 0.4109, "step": 5910 }, { "epoch": 0.2211369955838026, "grad_norm": 0.4336024820804596, "learning_rate": 1.7682293899197264e-05, "loss": 0.3487, "step": 5915 }, { "epoch": 0.2213239245741524, "grad_norm": 0.43026483058929443, "learning_rate": 1.7678533101047745e-05, "loss": 0.4125, "step": 5920 }, { "epoch": 0.22151085356450217, "grad_norm": 0.3426781892776489, "learning_rate": 1.767476965479218e-05, "loss": 0.274, "step": 5925 }, { "epoch": 0.22169778255485198, "grad_norm": 0.26045331358909607, "learning_rate": 1.7671003561728468e-05, "loss": 0.2431, "step": 5930 }, { "epoch": 0.22188471154520176, "grad_norm": 0.3758314251899719, "learning_rate": 1.766723482315543e-05, "loss": 0.3625, "step": 5935 }, { "epoch": 0.22207164053555156, "grad_norm": 0.5306219458580017, "learning_rate": 1.7663463440372795e-05, "loss": 0.3293, "step": 5940 }, { "epoch": 0.22225856952590134, "grad_norm": 0.4184575080871582, "learning_rate": 1.7659689414681208e-05, "loss": 0.2876, "step": 5945 }, { "epoch": 0.22244549851625114, "grad_norm": 0.4982224702835083, "learning_rate": 1.765591274738222e-05, "loss": 0.4022, "step": 5950 }, { "epoch": 0.22263242750660092, "grad_norm": 0.6839665770530701, "learning_rate": 1.765213343977829e-05, "loss": 0.3823, "step": 5955 }, { "epoch": 0.22281935649695073, "grad_norm": 0.3338620066642761, "learning_rate": 1.7648351493172805e-05, "loss": 0.2864, "step": 5960 }, { "epoch": 0.2230062854873005, "grad_norm": 0.436450332403183, "learning_rate": 1.764456690887004e-05, "loss": 0.3375, "step": 5965 }, { "epoch": 0.2231932144776503, "grad_norm": 0.4416438341140747, "learning_rate": 1.7640779688175192e-05, "loss": 0.3351, "step": 5970 }, { "epoch": 0.2233801434680001, "grad_norm": 2.5495193004608154, "learning_rate": 1.7636989832394365e-05, "loss": 0.3343, "step": 5975 }, { "epoch": 0.2235670724583499, "grad_norm": 0.6355164647102356, "learning_rate": 1.7633197342834574e-05, "loss": 0.3348, "step": 5980 }, { "epoch": 0.22375400144869967, "grad_norm": 0.6266055703163147, "learning_rate": 1.762940222080374e-05, "loss": 0.3222, "step": 5985 }, { "epoch": 0.22394093043904947, "grad_norm": 0.5203749537467957, "learning_rate": 1.7625604467610685e-05, "loss": 0.2935, "step": 5990 }, { "epoch": 0.22412785942939925, "grad_norm": 0.40464696288108826, "learning_rate": 1.7621804084565153e-05, "loss": 0.3872, "step": 5995 }, { "epoch": 0.22431478841974906, "grad_norm": 0.43349775671958923, "learning_rate": 1.7618001072977783e-05, "loss": 0.2921, "step": 6000 }, { "epoch": 0.22450171741009883, "grad_norm": 2.7697973251342773, "learning_rate": 1.7614195434160128e-05, "loss": 0.3545, "step": 6005 }, { "epoch": 0.22468864640044864, "grad_norm": 1.0596578121185303, "learning_rate": 1.7610387169424643e-05, "loss": 0.3049, "step": 6010 }, { "epoch": 0.22487557539079842, "grad_norm": 0.39064550399780273, "learning_rate": 1.7606576280084688e-05, "loss": 0.3061, "step": 6015 }, { "epoch": 0.22506250438114822, "grad_norm": 0.18736739456653595, "learning_rate": 1.760276276745453e-05, "loss": 0.337, "step": 6020 }, { "epoch": 0.225249433371498, "grad_norm": 0.4019501805305481, "learning_rate": 1.7598946632849338e-05, "loss": 0.2916, "step": 6025 }, { "epoch": 0.2254363623618478, "grad_norm": 0.5677745938301086, "learning_rate": 1.759512787758519e-05, "loss": 0.3083, "step": 6030 }, { "epoch": 0.22562329135219758, "grad_norm": 0.6863211393356323, "learning_rate": 1.7591306502979066e-05, "loss": 0.3761, "step": 6035 }, { "epoch": 0.22581022034254739, "grad_norm": 0.25616636872291565, "learning_rate": 1.7587482510348848e-05, "loss": 0.2742, "step": 6040 }, { "epoch": 0.22599714933289716, "grad_norm": 0.37293756008148193, "learning_rate": 1.7583655901013323e-05, "loss": 0.3009, "step": 6045 }, { "epoch": 0.22618407832324697, "grad_norm": 0.33705809712409973, "learning_rate": 1.757982667629217e-05, "loss": 0.2773, "step": 6050 }, { "epoch": 0.22637100731359674, "grad_norm": 0.43777912855148315, "learning_rate": 1.757599483750599e-05, "loss": 0.3039, "step": 6055 }, { "epoch": 0.22655793630394655, "grad_norm": 0.4632292091846466, "learning_rate": 1.7572160385976273e-05, "loss": 0.3124, "step": 6060 }, { "epoch": 0.22674486529429633, "grad_norm": 0.2688206434249878, "learning_rate": 1.756832332302541e-05, "loss": 0.2935, "step": 6065 }, { "epoch": 0.22693179428464613, "grad_norm": 0.5086357593536377, "learning_rate": 1.7564483649976686e-05, "loss": 0.342, "step": 6070 }, { "epoch": 0.2271187232749959, "grad_norm": 1.0471845865249634, "learning_rate": 1.7560641368154307e-05, "loss": 0.3716, "step": 6075 }, { "epoch": 0.22730565226534571, "grad_norm": 0.4194478988647461, "learning_rate": 1.7556796478883355e-05, "loss": 0.2641, "step": 6080 }, { "epoch": 0.2274925812556955, "grad_norm": 0.6006360650062561, "learning_rate": 1.755294898348983e-05, "loss": 0.3655, "step": 6085 }, { "epoch": 0.2276795102460453, "grad_norm": 0.48307010531425476, "learning_rate": 1.754909888330062e-05, "loss": 0.2505, "step": 6090 }, { "epoch": 0.22786643923639507, "grad_norm": 0.3270209729671478, "learning_rate": 1.7545246179643513e-05, "loss": 0.3142, "step": 6095 }, { "epoch": 0.22805336822674485, "grad_norm": 0.5060667395591736, "learning_rate": 1.7541390873847195e-05, "loss": 0.4389, "step": 6100 }, { "epoch": 0.22824029721709466, "grad_norm": 0.15000495314598083, "learning_rate": 1.7537532967241255e-05, "loss": 0.3561, "step": 6105 }, { "epoch": 0.22842722620744443, "grad_norm": 0.6780317425727844, "learning_rate": 1.7533672461156173e-05, "loss": 0.2892, "step": 6110 }, { "epoch": 0.22861415519779424, "grad_norm": 0.4353383481502533, "learning_rate": 1.7529809356923327e-05, "loss": 0.3026, "step": 6115 }, { "epoch": 0.22880108418814402, "grad_norm": 0.34928640723228455, "learning_rate": 1.7525943655874987e-05, "loss": 0.3004, "step": 6120 }, { "epoch": 0.22898801317849382, "grad_norm": 0.538777232170105, "learning_rate": 1.752207535934433e-05, "loss": 0.3053, "step": 6125 }, { "epoch": 0.2291749421688436, "grad_norm": 0.6690159440040588, "learning_rate": 1.7518204468665415e-05, "loss": 0.3058, "step": 6130 }, { "epoch": 0.2293618711591934, "grad_norm": 0.29182419180870056, "learning_rate": 1.75143309851732e-05, "loss": 0.2783, "step": 6135 }, { "epoch": 0.22954880014954318, "grad_norm": 0.4138169586658478, "learning_rate": 1.751045491020354e-05, "loss": 0.363, "step": 6140 }, { "epoch": 0.22973572913989299, "grad_norm": 0.433714359998703, "learning_rate": 1.750657624509319e-05, "loss": 0.2915, "step": 6145 }, { "epoch": 0.22992265813024276, "grad_norm": 0.41636890172958374, "learning_rate": 1.750269499117978e-05, "loss": 0.275, "step": 6150 }, { "epoch": 0.23010958712059257, "grad_norm": 0.38736167550086975, "learning_rate": 1.7498811149801845e-05, "loss": 0.4277, "step": 6155 }, { "epoch": 0.23029651611094235, "grad_norm": 0.578729510307312, "learning_rate": 1.749492472229881e-05, "loss": 0.3489, "step": 6160 }, { "epoch": 0.23048344510129215, "grad_norm": 0.6496894359588623, "learning_rate": 1.7491035710011e-05, "loss": 0.2933, "step": 6165 }, { "epoch": 0.23067037409164193, "grad_norm": 0.2986353039741516, "learning_rate": 1.748714411427962e-05, "loss": 0.2677, "step": 6170 }, { "epoch": 0.23085730308199173, "grad_norm": 0.38289961218833923, "learning_rate": 1.7483249936446768e-05, "loss": 0.2705, "step": 6175 }, { "epoch": 0.2310442320723415, "grad_norm": 0.4401322603225708, "learning_rate": 1.7479353177855434e-05, "loss": 0.2505, "step": 6180 }, { "epoch": 0.23123116106269131, "grad_norm": 0.48002535104751587, "learning_rate": 1.74754538398495e-05, "loss": 0.3417, "step": 6185 }, { "epoch": 0.2314180900530411, "grad_norm": 0.32335153222084045, "learning_rate": 1.7471551923773732e-05, "loss": 0.3704, "step": 6190 }, { "epoch": 0.2316050190433909, "grad_norm": 0.34287896752357483, "learning_rate": 1.7467647430973796e-05, "loss": 0.3006, "step": 6195 }, { "epoch": 0.23179194803374067, "grad_norm": 0.6139170527458191, "learning_rate": 1.7463740362796235e-05, "loss": 0.4078, "step": 6200 }, { "epoch": 0.23197887702409048, "grad_norm": 0.5172100067138672, "learning_rate": 1.7459830720588486e-05, "loss": 0.4518, "step": 6205 }, { "epoch": 0.23216580601444026, "grad_norm": 0.5083472728729248, "learning_rate": 1.7455918505698876e-05, "loss": 0.2808, "step": 6210 }, { "epoch": 0.23235273500479006, "grad_norm": 0.3061027526855469, "learning_rate": 1.745200371947661e-05, "loss": 0.3108, "step": 6215 }, { "epoch": 0.23253966399513984, "grad_norm": 0.5908476710319519, "learning_rate": 1.7448086363271785e-05, "loss": 0.2467, "step": 6220 }, { "epoch": 0.23272659298548964, "grad_norm": 0.5854301452636719, "learning_rate": 1.7444166438435392e-05, "loss": 0.3062, "step": 6225 }, { "epoch": 0.23291352197583942, "grad_norm": 0.6378867030143738, "learning_rate": 1.7440243946319294e-05, "loss": 0.2687, "step": 6230 }, { "epoch": 0.23310045096618923, "grad_norm": 0.195028617978096, "learning_rate": 1.7436318888276252e-05, "loss": 0.3161, "step": 6235 }, { "epoch": 0.233287379956539, "grad_norm": 0.22017711400985718, "learning_rate": 1.74323912656599e-05, "loss": 0.381, "step": 6240 }, { "epoch": 0.2334743089468888, "grad_norm": 0.43266749382019043, "learning_rate": 1.742846107982477e-05, "loss": 0.2814, "step": 6245 }, { "epoch": 0.2336612379372386, "grad_norm": 0.30291399359703064, "learning_rate": 1.742452833212626e-05, "loss": 0.2848, "step": 6250 }, { "epoch": 0.2338481669275884, "grad_norm": 0.38583922386169434, "learning_rate": 1.7420593023920673e-05, "loss": 0.4128, "step": 6255 }, { "epoch": 0.23403509591793817, "grad_norm": 0.34890538454055786, "learning_rate": 1.7416655156565173e-05, "loss": 0.2512, "step": 6260 }, { "epoch": 0.23422202490828797, "grad_norm": 0.35292932391166687, "learning_rate": 1.741271473141783e-05, "loss": 0.3371, "step": 6265 }, { "epoch": 0.23440895389863775, "grad_norm": 0.45217952132225037, "learning_rate": 1.7408771749837572e-05, "loss": 0.2664, "step": 6270 }, { "epoch": 0.23459588288898756, "grad_norm": 0.32959669828414917, "learning_rate": 1.7404826213184228e-05, "loss": 0.392, "step": 6275 }, { "epoch": 0.23478281187933733, "grad_norm": 0.4836936891078949, "learning_rate": 1.74008781228185e-05, "loss": 0.2778, "step": 6280 }, { "epoch": 0.23496974086968714, "grad_norm": 0.31659331917762756, "learning_rate": 1.7396927480101968e-05, "loss": 0.2593, "step": 6285 }, { "epoch": 0.23515666986003692, "grad_norm": 0.3063521981239319, "learning_rate": 1.7392974286397096e-05, "loss": 0.2791, "step": 6290 }, { "epoch": 0.23534359885038672, "grad_norm": 0.3990836441516876, "learning_rate": 1.7389018543067227e-05, "loss": 0.3199, "step": 6295 }, { "epoch": 0.2355305278407365, "grad_norm": 0.38259872794151306, "learning_rate": 1.738506025147659e-05, "loss": 0.2996, "step": 6300 }, { "epoch": 0.2357174568310863, "grad_norm": 0.4150097072124481, "learning_rate": 1.7381099412990276e-05, "loss": 0.338, "step": 6305 }, { "epoch": 0.23590438582143608, "grad_norm": 0.18883201479911804, "learning_rate": 1.737713602897427e-05, "loss": 0.3198, "step": 6310 }, { "epoch": 0.23609131481178589, "grad_norm": 0.2726154923439026, "learning_rate": 1.7373170100795433e-05, "loss": 0.2758, "step": 6315 }, { "epoch": 0.23627824380213566, "grad_norm": 0.3209015727043152, "learning_rate": 1.736920162982149e-05, "loss": 0.3209, "step": 6320 }, { "epoch": 0.23646517279248547, "grad_norm": 0.32282012701034546, "learning_rate": 1.736523061742107e-05, "loss": 0.3198, "step": 6325 }, { "epoch": 0.23665210178283524, "grad_norm": 0.6019655466079712, "learning_rate": 1.736125706496364e-05, "loss": 0.3568, "step": 6330 }, { "epoch": 0.23683903077318505, "grad_norm": 0.42038559913635254, "learning_rate": 1.7357280973819576e-05, "loss": 0.3844, "step": 6335 }, { "epoch": 0.23702595976353483, "grad_norm": 0.6067512631416321, "learning_rate": 1.7353302345360118e-05, "loss": 0.2941, "step": 6340 }, { "epoch": 0.2372128887538846, "grad_norm": 0.3025815486907959, "learning_rate": 1.7349321180957382e-05, "loss": 0.2795, "step": 6345 }, { "epoch": 0.2373998177442344, "grad_norm": 0.30944767594337463, "learning_rate": 1.7345337481984355e-05, "loss": 0.2756, "step": 6350 }, { "epoch": 0.2375867467345842, "grad_norm": 0.33044394850730896, "learning_rate": 1.73413512498149e-05, "loss": 0.2536, "step": 6355 }, { "epoch": 0.237773675724934, "grad_norm": 0.45606037974357605, "learning_rate": 1.7337362485823757e-05, "loss": 0.3169, "step": 6360 }, { "epoch": 0.23796060471528377, "grad_norm": 0.38473767042160034, "learning_rate": 1.7333371191386535e-05, "loss": 0.2228, "step": 6365 }, { "epoch": 0.23814753370563357, "grad_norm": 0.3136763870716095, "learning_rate": 1.7329377367879715e-05, "loss": 0.2625, "step": 6370 }, { "epoch": 0.23833446269598335, "grad_norm": 0.33016493916511536, "learning_rate": 1.7325381016680657e-05, "loss": 0.4031, "step": 6375 }, { "epoch": 0.23852139168633316, "grad_norm": 0.6611918807029724, "learning_rate": 1.7321382139167578e-05, "loss": 0.346, "step": 6380 }, { "epoch": 0.23870832067668293, "grad_norm": 0.3350640833377838, "learning_rate": 1.7317380736719588e-05, "loss": 0.2905, "step": 6385 }, { "epoch": 0.23889524966703274, "grad_norm": 0.6378026008605957, "learning_rate": 1.7313376810716654e-05, "loss": 0.2809, "step": 6390 }, { "epoch": 0.23908217865738252, "grad_norm": 0.5519046783447266, "learning_rate": 1.7309370362539607e-05, "loss": 0.3125, "step": 6395 }, { "epoch": 0.23926910764773232, "grad_norm": 0.4152654707431793, "learning_rate": 1.7305361393570165e-05, "loss": 0.267, "step": 6400 }, { "epoch": 0.2394560366380821, "grad_norm": 0.47034916281700134, "learning_rate": 1.7301349905190904e-05, "loss": 0.2861, "step": 6405 }, { "epoch": 0.2396429656284319, "grad_norm": 1.0928969383239746, "learning_rate": 1.729733589878527e-05, "loss": 0.306, "step": 6410 }, { "epoch": 0.23982989461878168, "grad_norm": 0.28271111845970154, "learning_rate": 1.729331937573758e-05, "loss": 0.2999, "step": 6415 }, { "epoch": 0.24001682360913149, "grad_norm": 0.27167975902557373, "learning_rate": 1.7289300337433017e-05, "loss": 0.2157, "step": 6420 }, { "epoch": 0.24020375259948126, "grad_norm": 0.3268529176712036, "learning_rate": 1.7285278785257633e-05, "loss": 0.2802, "step": 6425 }, { "epoch": 0.24039068158983107, "grad_norm": 0.6253595352172852, "learning_rate": 1.7281254720598342e-05, "loss": 0.3881, "step": 6430 }, { "epoch": 0.24057761058018085, "grad_norm": 0.3022212088108063, "learning_rate": 1.727722814484294e-05, "loss": 0.3592, "step": 6435 }, { "epoch": 0.24076453957053065, "grad_norm": 1.0882967710494995, "learning_rate": 1.7273199059380062e-05, "loss": 0.2548, "step": 6440 }, { "epoch": 0.24095146856088043, "grad_norm": 0.23813073337078094, "learning_rate": 1.7269167465599236e-05, "loss": 0.2696, "step": 6445 }, { "epoch": 0.24113839755123023, "grad_norm": 0.4797143042087555, "learning_rate": 1.7265133364890842e-05, "loss": 0.2992, "step": 6450 }, { "epoch": 0.24132532654158, "grad_norm": 0.3314330577850342, "learning_rate": 1.7261096758646115e-05, "loss": 0.4107, "step": 6455 }, { "epoch": 0.24151225553192981, "grad_norm": 0.38850629329681396, "learning_rate": 1.7257057648257174e-05, "loss": 0.3134, "step": 6460 }, { "epoch": 0.2416991845222796, "grad_norm": 0.39222410321235657, "learning_rate": 1.7253016035117e-05, "loss": 0.2126, "step": 6465 }, { "epoch": 0.2418861135126294, "grad_norm": 0.18110430240631104, "learning_rate": 1.7248971920619413e-05, "loss": 0.2886, "step": 6470 }, { "epoch": 0.24207304250297917, "grad_norm": 0.47243568301200867, "learning_rate": 1.724492530615912e-05, "loss": 0.3672, "step": 6475 }, { "epoch": 0.24225997149332898, "grad_norm": 0.3299618661403656, "learning_rate": 1.7240876193131685e-05, "loss": 0.3709, "step": 6480 }, { "epoch": 0.24244690048367876, "grad_norm": 0.34539106488227844, "learning_rate": 1.7236824582933525e-05, "loss": 0.2648, "step": 6485 }, { "epoch": 0.24263382947402856, "grad_norm": 0.5265419483184814, "learning_rate": 1.7232770476961932e-05, "loss": 0.2256, "step": 6490 }, { "epoch": 0.24282075846437834, "grad_norm": 0.5372416377067566, "learning_rate": 1.7228713876615043e-05, "loss": 0.2817, "step": 6495 }, { "epoch": 0.24300768745472814, "grad_norm": 0.4615037739276886, "learning_rate": 1.7224654783291867e-05, "loss": 0.2583, "step": 6500 }, { "epoch": 0.24319461644507792, "grad_norm": 0.5872400403022766, "learning_rate": 1.722059319839227e-05, "loss": 0.3394, "step": 6505 }, { "epoch": 0.24338154543542773, "grad_norm": 0.5134822726249695, "learning_rate": 1.7216529123316975e-05, "loss": 0.2918, "step": 6510 }, { "epoch": 0.2435684744257775, "grad_norm": 0.3390646278858185, "learning_rate": 1.7212462559467567e-05, "loss": 0.338, "step": 6515 }, { "epoch": 0.2437554034161273, "grad_norm": 0.24070733785629272, "learning_rate": 1.7208393508246484e-05, "loss": 0.3063, "step": 6520 }, { "epoch": 0.24394233240647709, "grad_norm": 0.4138699471950531, "learning_rate": 1.7204321971057024e-05, "loss": 0.2647, "step": 6525 }, { "epoch": 0.2441292613968269, "grad_norm": 0.5086038708686829, "learning_rate": 1.720024794930335e-05, "loss": 0.2562, "step": 6530 }, { "epoch": 0.24431619038717667, "grad_norm": 0.8470866680145264, "learning_rate": 1.719617144439047e-05, "loss": 0.276, "step": 6535 }, { "epoch": 0.24450311937752647, "grad_norm": 0.5553832650184631, "learning_rate": 1.7192092457724254e-05, "loss": 0.2874, "step": 6540 }, { "epoch": 0.24469004836787625, "grad_norm": 0.32335466146469116, "learning_rate": 1.718801099071143e-05, "loss": 0.2903, "step": 6545 }, { "epoch": 0.24487697735822606, "grad_norm": 0.46024224162101746, "learning_rate": 1.7183927044759576e-05, "loss": 0.3618, "step": 6550 }, { "epoch": 0.24506390634857583, "grad_norm": 0.29070380330085754, "learning_rate": 1.7179840621277132e-05, "loss": 0.4624, "step": 6555 }, { "epoch": 0.24525083533892564, "grad_norm": 0.35333648324012756, "learning_rate": 1.7175751721673384e-05, "loss": 0.2641, "step": 6560 }, { "epoch": 0.24543776432927542, "grad_norm": 0.5118494629859924, "learning_rate": 1.7171660347358482e-05, "loss": 0.289, "step": 6565 }, { "epoch": 0.24562469331962522, "grad_norm": 0.4456328749656677, "learning_rate": 1.7167566499743417e-05, "loss": 0.2784, "step": 6570 }, { "epoch": 0.245811622309975, "grad_norm": 0.42578622698783875, "learning_rate": 1.7163470180240047e-05, "loss": 0.242, "step": 6575 }, { "epoch": 0.2459985513003248, "grad_norm": 0.38927942514419556, "learning_rate": 1.7159371390261067e-05, "loss": 0.3147, "step": 6580 }, { "epoch": 0.24618548029067458, "grad_norm": 0.4326514005661011, "learning_rate": 1.715527013122004e-05, "loss": 0.3499, "step": 6585 }, { "epoch": 0.24637240928102436, "grad_norm": 0.4785178601741791, "learning_rate": 1.7151166404531365e-05, "loss": 0.3341, "step": 6590 }, { "epoch": 0.24655933827137416, "grad_norm": 0.09519144147634506, "learning_rate": 1.7147060211610305e-05, "loss": 0.2694, "step": 6595 }, { "epoch": 0.24674626726172394, "grad_norm": 0.4483666718006134, "learning_rate": 1.7142951553872968e-05, "loss": 0.4049, "step": 6600 }, { "epoch": 0.24693319625207374, "grad_norm": 0.2505245804786682, "learning_rate": 1.713884043273631e-05, "loss": 0.3139, "step": 6605 }, { "epoch": 0.24712012524242352, "grad_norm": 0.5089213848114014, "learning_rate": 1.7134726849618144e-05, "loss": 0.3026, "step": 6610 }, { "epoch": 0.24730705423277333, "grad_norm": 0.3417368531227112, "learning_rate": 1.7130610805937123e-05, "loss": 0.219, "step": 6615 }, { "epoch": 0.2474939832231231, "grad_norm": 0.428158164024353, "learning_rate": 1.712649230311275e-05, "loss": 0.3102, "step": 6620 }, { "epoch": 0.2476809122134729, "grad_norm": 0.4470468759536743, "learning_rate": 1.7122371342565384e-05, "loss": 0.2733, "step": 6625 }, { "epoch": 0.2478678412038227, "grad_norm": 0.9151923060417175, "learning_rate": 1.7118247925716223e-05, "loss": 0.3076, "step": 6630 }, { "epoch": 0.2480547701941725, "grad_norm": 0.7877910137176514, "learning_rate": 1.7114122053987318e-05, "loss": 0.3281, "step": 6635 }, { "epoch": 0.24824169918452227, "grad_norm": 0.47747930884361267, "learning_rate": 1.7109993728801556e-05, "loss": 0.258, "step": 6640 }, { "epoch": 0.24842862817487207, "grad_norm": 0.3599812388420105, "learning_rate": 1.710586295158269e-05, "loss": 0.2615, "step": 6645 }, { "epoch": 0.24861555716522185, "grad_norm": 0.34985432028770447, "learning_rate": 1.7101729723755296e-05, "loss": 0.3707, "step": 6650 }, { "epoch": 0.24880248615557166, "grad_norm": 0.48352763056755066, "learning_rate": 1.7097594046744815e-05, "loss": 0.3285, "step": 6655 }, { "epoch": 0.24898941514592143, "grad_norm": 0.3733879625797272, "learning_rate": 1.7093455921977516e-05, "loss": 0.3347, "step": 6660 }, { "epoch": 0.24917634413627124, "grad_norm": 1.183453917503357, "learning_rate": 1.7089315350880525e-05, "loss": 0.395, "step": 6665 }, { "epoch": 0.24936327312662102, "grad_norm": 0.3995364010334015, "learning_rate": 1.70851723348818e-05, "loss": 0.3719, "step": 6670 }, { "epoch": 0.24955020211697082, "grad_norm": 0.2912960350513458, "learning_rate": 1.7081026875410156e-05, "loss": 0.2581, "step": 6675 }, { "epoch": 0.2497371311073206, "grad_norm": 0.32783398032188416, "learning_rate": 1.7076878973895242e-05, "loss": 0.3313, "step": 6680 }, { "epoch": 0.2499240600976704, "grad_norm": 0.45547330379486084, "learning_rate": 1.7072728631767543e-05, "loss": 0.2585, "step": 6685 }, { "epoch": 0.2501109890880202, "grad_norm": 0.4544295370578766, "learning_rate": 1.7068575850458402e-05, "loss": 0.2982, "step": 6690 }, { "epoch": 0.25029791807837, "grad_norm": 0.5787556171417236, "learning_rate": 1.7064420631399986e-05, "loss": 0.2737, "step": 6695 }, { "epoch": 0.25048484706871976, "grad_norm": 0.4456428587436676, "learning_rate": 1.706026297602532e-05, "loss": 0.2809, "step": 6700 }, { "epoch": 0.25067177605906954, "grad_norm": 0.27072015404701233, "learning_rate": 1.705610288576825e-05, "loss": 0.2661, "step": 6705 }, { "epoch": 0.2508587050494194, "grad_norm": 0.2884485721588135, "learning_rate": 1.7051940362063486e-05, "loss": 0.3243, "step": 6710 }, { "epoch": 0.25104563403976915, "grad_norm": 0.3565753400325775, "learning_rate": 1.7047775406346548e-05, "loss": 0.4407, "step": 6715 }, { "epoch": 0.2512325630301189, "grad_norm": 0.5307539105415344, "learning_rate": 1.7043608020053823e-05, "loss": 0.3288, "step": 6720 }, { "epoch": 0.2514194920204687, "grad_norm": 0.2853598892688751, "learning_rate": 1.7039438204622515e-05, "loss": 0.3014, "step": 6725 }, { "epoch": 0.25160642101081854, "grad_norm": 0.476520836353302, "learning_rate": 1.7035265961490673e-05, "loss": 0.337, "step": 6730 }, { "epoch": 0.2517933500011683, "grad_norm": 0.34905505180358887, "learning_rate": 1.7031091292097186e-05, "loss": 0.2644, "step": 6735 }, { "epoch": 0.2519802789915181, "grad_norm": 0.4414178729057312, "learning_rate": 1.702691419788178e-05, "loss": 0.3584, "step": 6740 }, { "epoch": 0.25216720798186787, "grad_norm": 0.4812869131565094, "learning_rate": 1.7022734680285013e-05, "loss": 0.2968, "step": 6745 }, { "epoch": 0.2523541369722177, "grad_norm": 0.5577378869056702, "learning_rate": 1.701855274074828e-05, "loss": 0.2407, "step": 6750 }, { "epoch": 0.2525410659625675, "grad_norm": 0.3004097044467926, "learning_rate": 1.701436838071382e-05, "loss": 0.2893, "step": 6755 }, { "epoch": 0.25272799495291726, "grad_norm": 0.5560919046401978, "learning_rate": 1.7010181601624687e-05, "loss": 0.305, "step": 6760 }, { "epoch": 0.25291492394326703, "grad_norm": 0.5027373433113098, "learning_rate": 1.700599240492479e-05, "loss": 0.2415, "step": 6765 }, { "epoch": 0.25310185293361687, "grad_norm": 0.2853597104549408, "learning_rate": 1.7001800792058856e-05, "loss": 0.236, "step": 6770 }, { "epoch": 0.25328878192396664, "grad_norm": 0.36475807428359985, "learning_rate": 1.6997606764472457e-05, "loss": 0.2476, "step": 6775 }, { "epoch": 0.2534757109143164, "grad_norm": 0.2956780791282654, "learning_rate": 1.6993410323611993e-05, "loss": 0.3781, "step": 6780 }, { "epoch": 0.2536626399046662, "grad_norm": 0.3027787208557129, "learning_rate": 1.6989211470924694e-05, "loss": 0.2951, "step": 6785 }, { "epoch": 0.25384956889501603, "grad_norm": 0.36557620763778687, "learning_rate": 1.6985010207858624e-05, "loss": 0.2903, "step": 6790 }, { "epoch": 0.2540364978853658, "grad_norm": 0.2095050811767578, "learning_rate": 1.6980806535862683e-05, "loss": 0.2859, "step": 6795 }, { "epoch": 0.2542234268757156, "grad_norm": 0.4164871573448181, "learning_rate": 1.6976600456386593e-05, "loss": 0.24, "step": 6800 }, { "epoch": 0.25441035586606536, "grad_norm": 0.2171694040298462, "learning_rate": 1.6972391970880906e-05, "loss": 0.4062, "step": 6805 }, { "epoch": 0.2545972848564152, "grad_norm": 0.6664031744003296, "learning_rate": 1.6968181080797012e-05, "loss": 0.2982, "step": 6810 }, { "epoch": 0.254784213846765, "grad_norm": 0.3303912878036499, "learning_rate": 1.6963967787587133e-05, "loss": 0.2349, "step": 6815 }, { "epoch": 0.25497114283711475, "grad_norm": 0.42287972569465637, "learning_rate": 1.6959752092704302e-05, "loss": 0.3224, "step": 6820 }, { "epoch": 0.2551580718274645, "grad_norm": 0.5297410488128662, "learning_rate": 1.69555339976024e-05, "loss": 0.3097, "step": 6825 }, { "epoch": 0.25534500081781436, "grad_norm": 0.44122767448425293, "learning_rate": 1.695131350373612e-05, "loss": 0.2884, "step": 6830 }, { "epoch": 0.25553192980816414, "grad_norm": 0.4497906267642975, "learning_rate": 1.6947090612560995e-05, "loss": 0.2767, "step": 6835 }, { "epoch": 0.2557188587985139, "grad_norm": 0.383256196975708, "learning_rate": 1.6942865325533374e-05, "loss": 0.3156, "step": 6840 }, { "epoch": 0.2559057877888637, "grad_norm": 0.45405879616737366, "learning_rate": 1.6938637644110442e-05, "loss": 0.3549, "step": 6845 }, { "epoch": 0.25609271677921347, "grad_norm": 0.26836884021759033, "learning_rate": 1.6934407569750208e-05, "loss": 0.265, "step": 6850 }, { "epoch": 0.2562796457695633, "grad_norm": 0.42300936579704285, "learning_rate": 1.6930175103911492e-05, "loss": 0.3418, "step": 6855 }, { "epoch": 0.2564665747599131, "grad_norm": 0.4314024746417999, "learning_rate": 1.6925940248053964e-05, "loss": 0.3864, "step": 6860 }, { "epoch": 0.25665350375026286, "grad_norm": 0.4368232488632202, "learning_rate": 1.6921703003638094e-05, "loss": 0.3285, "step": 6865 }, { "epoch": 0.25684043274061263, "grad_norm": 0.33699876070022583, "learning_rate": 1.691746337212519e-05, "loss": 0.3067, "step": 6870 }, { "epoch": 0.25702736173096247, "grad_norm": 0.28076720237731934, "learning_rate": 1.691322135497738e-05, "loss": 0.2381, "step": 6875 }, { "epoch": 0.25721429072131224, "grad_norm": 0.5777077674865723, "learning_rate": 1.6908976953657612e-05, "loss": 0.2946, "step": 6880 }, { "epoch": 0.257401219711662, "grad_norm": 0.43163543939590454, "learning_rate": 1.690473016962966e-05, "loss": 0.307, "step": 6885 }, { "epoch": 0.2575881487020118, "grad_norm": 0.4134599268436432, "learning_rate": 1.6900481004358123e-05, "loss": 0.2846, "step": 6890 }, { "epoch": 0.25777507769236163, "grad_norm": 0.40419724583625793, "learning_rate": 1.6896229459308405e-05, "loss": 0.2843, "step": 6895 }, { "epoch": 0.2579620066827114, "grad_norm": 0.4465784728527069, "learning_rate": 1.6891975535946753e-05, "loss": 0.277, "step": 6900 }, { "epoch": 0.2581489356730612, "grad_norm": 0.39851880073547363, "learning_rate": 1.6887719235740216e-05, "loss": 0.2687, "step": 6905 }, { "epoch": 0.25833586466341096, "grad_norm": 0.30198317766189575, "learning_rate": 1.6883460560156673e-05, "loss": 0.2999, "step": 6910 }, { "epoch": 0.2585227936537608, "grad_norm": 0.43067798018455505, "learning_rate": 1.687919951066482e-05, "loss": 0.3473, "step": 6915 }, { "epoch": 0.2587097226441106, "grad_norm": 0.2943825423717499, "learning_rate": 1.687493608873417e-05, "loss": 0.3303, "step": 6920 }, { "epoch": 0.25889665163446035, "grad_norm": 0.3708643317222595, "learning_rate": 1.6870670295835055e-05, "loss": 0.3585, "step": 6925 }, { "epoch": 0.25908358062481013, "grad_norm": 0.45866164565086365, "learning_rate": 1.6866402133438623e-05, "loss": 0.3606, "step": 6930 }, { "epoch": 0.25927050961515996, "grad_norm": 0.4115564227104187, "learning_rate": 1.6862131603016844e-05, "loss": 0.2645, "step": 6935 }, { "epoch": 0.25945743860550974, "grad_norm": 0.21493466198444366, "learning_rate": 1.68578587060425e-05, "loss": 0.312, "step": 6940 }, { "epoch": 0.2596443675958595, "grad_norm": 0.8563496470451355, "learning_rate": 1.6853583443989186e-05, "loss": 0.362, "step": 6945 }, { "epoch": 0.2598312965862093, "grad_norm": 0.36561504006385803, "learning_rate": 1.684930581833133e-05, "loss": 0.3779, "step": 6950 }, { "epoch": 0.2600182255765591, "grad_norm": 0.37505069375038147, "learning_rate": 1.6845025830544147e-05, "loss": 0.301, "step": 6955 }, { "epoch": 0.2602051545669089, "grad_norm": 0.45678603649139404, "learning_rate": 1.68407434821037e-05, "loss": 0.3282, "step": 6960 }, { "epoch": 0.2603920835572587, "grad_norm": 0.5878984332084656, "learning_rate": 1.6836458774486827e-05, "loss": 0.3805, "step": 6965 }, { "epoch": 0.26057901254760846, "grad_norm": 0.20040619373321533, "learning_rate": 1.683217170917122e-05, "loss": 0.2674, "step": 6970 }, { "epoch": 0.2607659415379583, "grad_norm": 0.45022228360176086, "learning_rate": 1.6827882287635353e-05, "loss": 0.315, "step": 6975 }, { "epoch": 0.26095287052830807, "grad_norm": 0.39582857489585876, "learning_rate": 1.682359051135853e-05, "loss": 0.3558, "step": 6980 }, { "epoch": 0.26113979951865784, "grad_norm": 0.5623447895050049, "learning_rate": 1.681929638182086e-05, "loss": 0.2928, "step": 6985 }, { "epoch": 0.2613267285090076, "grad_norm": 0.35395896434783936, "learning_rate": 1.6814999900503265e-05, "loss": 0.3276, "step": 6990 }, { "epoch": 0.26151365749935745, "grad_norm": 0.2845790684223175, "learning_rate": 1.681070106888748e-05, "loss": 0.3025, "step": 6995 }, { "epoch": 0.26170058648970723, "grad_norm": 0.3993929326534271, "learning_rate": 1.6806399888456043e-05, "loss": 0.327, "step": 7000 }, { "epoch": 0.261887515480057, "grad_norm": 0.23860380053520203, "learning_rate": 1.6802096360692316e-05, "loss": 0.3101, "step": 7005 }, { "epoch": 0.2620744444704068, "grad_norm": 0.20932121574878693, "learning_rate": 1.679779048708046e-05, "loss": 0.2384, "step": 7010 }, { "epoch": 0.2622613734607566, "grad_norm": 0.4419725239276886, "learning_rate": 1.6793482269105446e-05, "loss": 0.2384, "step": 7015 }, { "epoch": 0.2624483024511064, "grad_norm": 0.30131080746650696, "learning_rate": 1.6789171708253052e-05, "loss": 0.3222, "step": 7020 }, { "epoch": 0.2626352314414562, "grad_norm": 0.2808610498905182, "learning_rate": 1.6784858806009875e-05, "loss": 0.249, "step": 7025 }, { "epoch": 0.26282216043180595, "grad_norm": 0.4096542298793793, "learning_rate": 1.67805435638633e-05, "loss": 0.2912, "step": 7030 }, { "epoch": 0.2630090894221558, "grad_norm": 0.32321956753730774, "learning_rate": 1.6776225983301543e-05, "loss": 0.2665, "step": 7035 }, { "epoch": 0.26319601841250556, "grad_norm": 0.27172431349754333, "learning_rate": 1.6771906065813607e-05, "loss": 0.2614, "step": 7040 }, { "epoch": 0.26338294740285534, "grad_norm": 0.2878829836845398, "learning_rate": 1.676758381288931e-05, "loss": 0.2974, "step": 7045 }, { "epoch": 0.2635698763932051, "grad_norm": 0.3150484263896942, "learning_rate": 1.6763259226019267e-05, "loss": 0.2796, "step": 7050 }, { "epoch": 0.26375680538355495, "grad_norm": 0.39884239435195923, "learning_rate": 1.6758932306694913e-05, "loss": 0.261, "step": 7055 }, { "epoch": 0.2639437343739047, "grad_norm": 0.42523136734962463, "learning_rate": 1.6754603056408473e-05, "loss": 0.3706, "step": 7060 }, { "epoch": 0.2641306633642545, "grad_norm": 0.5379177927970886, "learning_rate": 1.675027147665298e-05, "loss": 0.275, "step": 7065 }, { "epoch": 0.2643175923546043, "grad_norm": 1.383110761642456, "learning_rate": 1.674593756892228e-05, "loss": 0.2515, "step": 7070 }, { "epoch": 0.2645045213449541, "grad_norm": 0.3739633560180664, "learning_rate": 1.6741601334711004e-05, "loss": 0.3131, "step": 7075 }, { "epoch": 0.2646914503353039, "grad_norm": 0.485005646944046, "learning_rate": 1.67372627755146e-05, "loss": 0.2868, "step": 7080 }, { "epoch": 0.26487837932565367, "grad_norm": 0.4296754002571106, "learning_rate": 1.6732921892829313e-05, "loss": 0.3443, "step": 7085 }, { "epoch": 0.26506530831600345, "grad_norm": 0.6618226170539856, "learning_rate": 1.6728578688152186e-05, "loss": 0.3642, "step": 7090 }, { "epoch": 0.2652522373063532, "grad_norm": 0.3030281662940979, "learning_rate": 1.6724233162981067e-05, "loss": 0.2784, "step": 7095 }, { "epoch": 0.26543916629670306, "grad_norm": 0.5461763143539429, "learning_rate": 1.6719885318814604e-05, "loss": 0.4215, "step": 7100 }, { "epoch": 0.26562609528705283, "grad_norm": 0.38784629106521606, "learning_rate": 1.6715535157152244e-05, "loss": 0.3164, "step": 7105 }, { "epoch": 0.2658130242774026, "grad_norm": 0.46060433983802795, "learning_rate": 1.6711182679494232e-05, "loss": 0.2347, "step": 7110 }, { "epoch": 0.2659999532677524, "grad_norm": 0.7013404369354248, "learning_rate": 1.6706827887341613e-05, "loss": 0.2908, "step": 7115 }, { "epoch": 0.2661868822581022, "grad_norm": 0.7355599999427795, "learning_rate": 1.6702470782196237e-05, "loss": 0.2816, "step": 7120 }, { "epoch": 0.266373811248452, "grad_norm": 0.23429067432880402, "learning_rate": 1.6698111365560733e-05, "loss": 0.3136, "step": 7125 }, { "epoch": 0.2665607402388018, "grad_norm": 0.7480411529541016, "learning_rate": 1.6693749638938544e-05, "loss": 0.2388, "step": 7130 }, { "epoch": 0.26674766922915155, "grad_norm": 0.6238270998001099, "learning_rate": 1.6689385603833907e-05, "loss": 0.4418, "step": 7135 }, { "epoch": 0.2669345982195014, "grad_norm": 0.15828560292720795, "learning_rate": 1.6685019261751848e-05, "loss": 0.251, "step": 7140 }, { "epoch": 0.26712152720985116, "grad_norm": 0.44209611415863037, "learning_rate": 1.6680650614198194e-05, "loss": 0.2724, "step": 7145 }, { "epoch": 0.26730845620020094, "grad_norm": 0.49470680952072144, "learning_rate": 1.667627966267957e-05, "loss": 0.2514, "step": 7150 }, { "epoch": 0.2674953851905507, "grad_norm": 0.3461754620075226, "learning_rate": 1.6671906408703394e-05, "loss": 0.2894, "step": 7155 }, { "epoch": 0.26768231418090055, "grad_norm": 0.31336525082588196, "learning_rate": 1.6667530853777865e-05, "loss": 0.2696, "step": 7160 }, { "epoch": 0.2678692431712503, "grad_norm": 0.5694870352745056, "learning_rate": 1.6663152999411998e-05, "loss": 0.2751, "step": 7165 }, { "epoch": 0.2680561721616001, "grad_norm": 0.8204647898674011, "learning_rate": 1.6658772847115584e-05, "loss": 0.4243, "step": 7170 }, { "epoch": 0.2682431011519499, "grad_norm": 0.44578155875205994, "learning_rate": 1.6654390398399213e-05, "loss": 0.2863, "step": 7175 }, { "epoch": 0.2684300301422997, "grad_norm": 0.42481744289398193, "learning_rate": 1.6650005654774265e-05, "loss": 0.2577, "step": 7180 }, { "epoch": 0.2686169591326495, "grad_norm": 0.5261092782020569, "learning_rate": 1.6645618617752914e-05, "loss": 0.2702, "step": 7185 }, { "epoch": 0.26880388812299927, "grad_norm": 0.468940794467926, "learning_rate": 1.6641229288848123e-05, "loss": 0.3976, "step": 7190 }, { "epoch": 0.26899081711334905, "grad_norm": 0.5336934328079224, "learning_rate": 1.6636837669573647e-05, "loss": 0.3726, "step": 7195 }, { "epoch": 0.2691777461036989, "grad_norm": 0.2591412365436554, "learning_rate": 1.6632443761444027e-05, "loss": 0.3307, "step": 7200 }, { "epoch": 0.26936467509404866, "grad_norm": 0.5266516804695129, "learning_rate": 1.6628047565974594e-05, "loss": 0.307, "step": 7205 }, { "epoch": 0.26955160408439843, "grad_norm": 0.38717594742774963, "learning_rate": 1.6623649084681477e-05, "loss": 0.3238, "step": 7210 }, { "epoch": 0.2697385330747482, "grad_norm": 0.17658643424510956, "learning_rate": 1.6619248319081583e-05, "loss": 0.2985, "step": 7215 }, { "epoch": 0.26992546206509804, "grad_norm": 0.30146273970603943, "learning_rate": 1.6614845270692606e-05, "loss": 0.294, "step": 7220 }, { "epoch": 0.2701123910554478, "grad_norm": 0.6402953863143921, "learning_rate": 1.6610439941033038e-05, "loss": 0.2622, "step": 7225 }, { "epoch": 0.2702993200457976, "grad_norm": 0.5695781111717224, "learning_rate": 1.6606032331622148e-05, "loss": 0.2279, "step": 7230 }, { "epoch": 0.2704862490361474, "grad_norm": 0.23263224959373474, "learning_rate": 1.6601622443979987e-05, "loss": 0.2919, "step": 7235 }, { "epoch": 0.2706731780264972, "grad_norm": 0.4788752496242523, "learning_rate": 1.659721027962741e-05, "loss": 0.2335, "step": 7240 }, { "epoch": 0.270860107016847, "grad_norm": 0.4149765372276306, "learning_rate": 1.659279584008604e-05, "loss": 0.3214, "step": 7245 }, { "epoch": 0.27104703600719676, "grad_norm": 0.5265377759933472, "learning_rate": 1.6588379126878293e-05, "loss": 0.3483, "step": 7250 }, { "epoch": 0.27123396499754654, "grad_norm": 0.6144738793373108, "learning_rate": 1.6583960141527367e-05, "loss": 0.424, "step": 7255 }, { "epoch": 0.27142089398789637, "grad_norm": 0.3015303909778595, "learning_rate": 1.6579538885557242e-05, "loss": 0.343, "step": 7260 }, { "epoch": 0.27160782297824615, "grad_norm": 0.4408928155899048, "learning_rate": 1.6575115360492683e-05, "loss": 0.2704, "step": 7265 }, { "epoch": 0.2717947519685959, "grad_norm": 0.3285323679447174, "learning_rate": 1.6570689567859237e-05, "loss": 0.2789, "step": 7270 }, { "epoch": 0.2719816809589457, "grad_norm": 0.09705019742250443, "learning_rate": 1.6566261509183232e-05, "loss": 0.3737, "step": 7275 }, { "epoch": 0.27216860994929554, "grad_norm": 0.24546971917152405, "learning_rate": 1.6561831185991782e-05, "loss": 0.2472, "step": 7280 }, { "epoch": 0.2723555389396453, "grad_norm": 0.1780613362789154, "learning_rate": 1.6557398599812774e-05, "loss": 0.2864, "step": 7285 }, { "epoch": 0.2725424679299951, "grad_norm": 0.5108126997947693, "learning_rate": 1.655296375217488e-05, "loss": 0.3593, "step": 7290 }, { "epoch": 0.27272939692034487, "grad_norm": 0.14681376516819, "learning_rate": 1.654852664460756e-05, "loss": 0.318, "step": 7295 }, { "epoch": 0.2729163259106947, "grad_norm": 0.42053887248039246, "learning_rate": 1.6544087278641037e-05, "loss": 0.2802, "step": 7300 }, { "epoch": 0.2731032549010445, "grad_norm": 0.5258027911186218, "learning_rate": 1.653964565580632e-05, "loss": 0.3422, "step": 7305 }, { "epoch": 0.27329018389139426, "grad_norm": 0.2713332772254944, "learning_rate": 1.6535201777635206e-05, "loss": 0.3497, "step": 7310 }, { "epoch": 0.27347711288174403, "grad_norm": 0.5129302740097046, "learning_rate": 1.6530755645660254e-05, "loss": 0.2799, "step": 7315 }, { "epoch": 0.2736640418720938, "grad_norm": 0.33113789558410645, "learning_rate": 1.6526307261414812e-05, "loss": 0.351, "step": 7320 }, { "epoch": 0.27385097086244364, "grad_norm": 0.3590730130672455, "learning_rate": 1.6521856626432992e-05, "loss": 0.3425, "step": 7325 }, { "epoch": 0.2740378998527934, "grad_norm": 0.2738291323184967, "learning_rate": 1.65174037422497e-05, "loss": 0.3775, "step": 7330 }, { "epoch": 0.2742248288431432, "grad_norm": 0.3311813771724701, "learning_rate": 1.6512948610400606e-05, "loss": 0.3176, "step": 7335 }, { "epoch": 0.274411757833493, "grad_norm": 0.2696897089481354, "learning_rate": 1.6508491232422153e-05, "loss": 0.314, "step": 7340 }, { "epoch": 0.2745986868238428, "grad_norm": 0.2976570725440979, "learning_rate": 1.6504031609851567e-05, "loss": 0.4029, "step": 7345 }, { "epoch": 0.2747856158141926, "grad_norm": 0.25655868649482727, "learning_rate": 1.6499569744226843e-05, "loss": 0.3326, "step": 7350 }, { "epoch": 0.27497254480454236, "grad_norm": 0.21505354344844818, "learning_rate": 1.649510563708675e-05, "loss": 0.2943, "step": 7355 }, { "epoch": 0.27515947379489214, "grad_norm": 0.4883136749267578, "learning_rate": 1.6490639289970834e-05, "loss": 0.2989, "step": 7360 }, { "epoch": 0.275346402785242, "grad_norm": 0.26657021045684814, "learning_rate": 1.6486170704419402e-05, "loss": 0.2817, "step": 7365 }, { "epoch": 0.27553333177559175, "grad_norm": 0.40542852878570557, "learning_rate": 1.648169988197355e-05, "loss": 0.2866, "step": 7370 }, { "epoch": 0.2757202607659415, "grad_norm": 0.901739776134491, "learning_rate": 1.647722682417513e-05, "loss": 0.2885, "step": 7375 }, { "epoch": 0.2759071897562913, "grad_norm": 0.42218196392059326, "learning_rate": 1.6472751532566777e-05, "loss": 0.225, "step": 7380 }, { "epoch": 0.27609411874664114, "grad_norm": 0.5886632800102234, "learning_rate": 1.6468274008691888e-05, "loss": 0.3544, "step": 7385 }, { "epoch": 0.2762810477369909, "grad_norm": 0.4941720962524414, "learning_rate": 1.646379425409463e-05, "loss": 0.3206, "step": 7390 }, { "epoch": 0.2764679767273407, "grad_norm": 0.3011486530303955, "learning_rate": 1.6459312270319946e-05, "loss": 0.2848, "step": 7395 }, { "epoch": 0.27665490571769047, "grad_norm": 0.5940350890159607, "learning_rate": 1.6454828058913544e-05, "loss": 0.3851, "step": 7400 }, { "epoch": 0.2768418347080403, "grad_norm": 0.41424158215522766, "learning_rate": 1.64503416214219e-05, "loss": 0.3082, "step": 7405 }, { "epoch": 0.2770287636983901, "grad_norm": 0.43419891595840454, "learning_rate": 1.6445852959392257e-05, "loss": 0.2973, "step": 7410 }, { "epoch": 0.27721569268873986, "grad_norm": 0.4673122465610504, "learning_rate": 1.644136207437262e-05, "loss": 0.2796, "step": 7415 }, { "epoch": 0.27740262167908963, "grad_norm": 0.5394338965415955, "learning_rate": 1.6436868967911777e-05, "loss": 0.4372, "step": 7420 }, { "epoch": 0.27758955066943947, "grad_norm": 0.39527252316474915, "learning_rate": 1.6432373641559266e-05, "loss": 0.2922, "step": 7425 }, { "epoch": 0.27777647965978924, "grad_norm": 0.35765504837036133, "learning_rate": 1.6427876096865394e-05, "loss": 0.3001, "step": 7430 }, { "epoch": 0.277963408650139, "grad_norm": 0.6142287254333496, "learning_rate": 1.642337633538124e-05, "loss": 0.3109, "step": 7435 }, { "epoch": 0.2781503376404888, "grad_norm": 0.45784667134284973, "learning_rate": 1.641887435865864e-05, "loss": 0.3899, "step": 7440 }, { "epoch": 0.27833726663083863, "grad_norm": 0.3831481337547302, "learning_rate": 1.64143701682502e-05, "loss": 0.2393, "step": 7445 }, { "epoch": 0.2785241956211884, "grad_norm": 0.628398060798645, "learning_rate": 1.6409863765709282e-05, "loss": 0.3072, "step": 7450 }, { "epoch": 0.2787111246115382, "grad_norm": 0.3460518717765808, "learning_rate": 1.6405355152590018e-05, "loss": 0.3902, "step": 7455 }, { "epoch": 0.27889805360188796, "grad_norm": 0.48453855514526367, "learning_rate": 1.64008443304473e-05, "loss": 0.339, "step": 7460 }, { "epoch": 0.2790849825922378, "grad_norm": 0.4171224534511566, "learning_rate": 1.6396331300836778e-05, "loss": 0.358, "step": 7465 }, { "epoch": 0.2792719115825876, "grad_norm": 0.3539869487285614, "learning_rate": 1.6391816065314865e-05, "loss": 0.3195, "step": 7470 }, { "epoch": 0.27945884057293735, "grad_norm": 0.39307695627212524, "learning_rate": 1.6387298625438743e-05, "loss": 0.378, "step": 7475 }, { "epoch": 0.2796457695632871, "grad_norm": 0.28309181332588196, "learning_rate": 1.6382778982766347e-05, "loss": 0.2541, "step": 7480 }, { "epoch": 0.27983269855363696, "grad_norm": 0.4474238455295563, "learning_rate": 1.6378257138856365e-05, "loss": 0.2949, "step": 7485 }, { "epoch": 0.28001962754398674, "grad_norm": 0.5505223274230957, "learning_rate": 1.6373733095268258e-05, "loss": 0.3328, "step": 7490 }, { "epoch": 0.2802065565343365, "grad_norm": 0.3556166887283325, "learning_rate": 1.636920685356224e-05, "loss": 0.3287, "step": 7495 }, { "epoch": 0.2803934855246863, "grad_norm": 0.3573717772960663, "learning_rate": 1.6364678415299274e-05, "loss": 0.363, "step": 7500 }, { "epoch": 0.2805804145150361, "grad_norm": 0.3663536608219147, "learning_rate": 1.6360147782041097e-05, "loss": 0.3575, "step": 7505 }, { "epoch": 0.2807673435053859, "grad_norm": 0.26305726170539856, "learning_rate": 1.6355614955350187e-05, "loss": 0.3127, "step": 7510 }, { "epoch": 0.2809542724957357, "grad_norm": 0.5720646977424622, "learning_rate": 1.6351079936789792e-05, "loss": 0.377, "step": 7515 }, { "epoch": 0.28114120148608546, "grad_norm": 0.3031104505062103, "learning_rate": 1.634654272792391e-05, "loss": 0.3199, "step": 7520 }, { "epoch": 0.2813281304764353, "grad_norm": 0.4231772720813751, "learning_rate": 1.6342003330317295e-05, "loss": 0.3691, "step": 7525 }, { "epoch": 0.28151505946678507, "grad_norm": 0.4373528063297272, "learning_rate": 1.6337461745535446e-05, "loss": 0.3065, "step": 7530 }, { "epoch": 0.28170198845713484, "grad_norm": 0.3790377676486969, "learning_rate": 1.6332917975144638e-05, "loss": 0.3129, "step": 7535 }, { "epoch": 0.2818889174474846, "grad_norm": 0.5475689172744751, "learning_rate": 1.632837202071188e-05, "loss": 0.3161, "step": 7540 }, { "epoch": 0.28207584643783445, "grad_norm": 0.3444899320602417, "learning_rate": 1.6323823883804942e-05, "loss": 0.3612, "step": 7545 }, { "epoch": 0.28226277542818423, "grad_norm": 0.38195663690567017, "learning_rate": 1.631927356599235e-05, "loss": 0.3729, "step": 7550 }, { "epoch": 0.282449704418534, "grad_norm": 0.3962342143058777, "learning_rate": 1.6314721068843367e-05, "loss": 0.3536, "step": 7555 }, { "epoch": 0.2826366334088838, "grad_norm": 0.41847673058509827, "learning_rate": 1.6310166393928036e-05, "loss": 0.3601, "step": 7560 }, { "epoch": 0.28282356239923356, "grad_norm": 0.4625031650066376, "learning_rate": 1.630560954281712e-05, "loss": 0.3712, "step": 7565 }, { "epoch": 0.2830104913895834, "grad_norm": 0.24907182157039642, "learning_rate": 1.6301050517082154e-05, "loss": 0.2722, "step": 7570 }, { "epoch": 0.2831974203799332, "grad_norm": 0.4163358211517334, "learning_rate": 1.629648931829541e-05, "loss": 0.3019, "step": 7575 }, { "epoch": 0.28338434937028295, "grad_norm": 0.24772894382476807, "learning_rate": 1.6291925948029918e-05, "loss": 0.2406, "step": 7580 }, { "epoch": 0.28357127836063273, "grad_norm": 0.42197006940841675, "learning_rate": 1.6287360407859452e-05, "loss": 0.2812, "step": 7585 }, { "epoch": 0.28375820735098256, "grad_norm": 0.38658830523490906, "learning_rate": 1.628279269935854e-05, "loss": 0.3223, "step": 7590 }, { "epoch": 0.28394513634133234, "grad_norm": 0.6128913164138794, "learning_rate": 1.627822282410245e-05, "loss": 0.3035, "step": 7595 }, { "epoch": 0.2841320653316821, "grad_norm": 0.44375917315483093, "learning_rate": 1.62736507836672e-05, "loss": 0.3087, "step": 7600 }, { "epoch": 0.2843189943220319, "grad_norm": 0.35817980766296387, "learning_rate": 1.626907657962956e-05, "loss": 0.2697, "step": 7605 }, { "epoch": 0.2845059233123817, "grad_norm": 0.3279922902584076, "learning_rate": 1.6264500213567038e-05, "loss": 0.2629, "step": 7610 }, { "epoch": 0.2846928523027315, "grad_norm": 0.4176391363143921, "learning_rate": 1.625992168705789e-05, "loss": 0.2812, "step": 7615 }, { "epoch": 0.2848797812930813, "grad_norm": 0.5126670002937317, "learning_rate": 1.6255341001681125e-05, "loss": 0.2976, "step": 7620 }, { "epoch": 0.28506671028343106, "grad_norm": 0.40457814931869507, "learning_rate": 1.625075815901649e-05, "loss": 0.258, "step": 7625 }, { "epoch": 0.2852536392737809, "grad_norm": 0.6561230421066284, "learning_rate": 1.624617316064447e-05, "loss": 0.3499, "step": 7630 }, { "epoch": 0.28544056826413067, "grad_norm": 0.17372195422649384, "learning_rate": 1.62415860081463e-05, "loss": 0.2878, "step": 7635 }, { "epoch": 0.28562749725448044, "grad_norm": 0.3161996603012085, "learning_rate": 1.6236996703103963e-05, "loss": 0.2953, "step": 7640 }, { "epoch": 0.2858144262448302, "grad_norm": 0.41537266969680786, "learning_rate": 1.6232405247100173e-05, "loss": 0.4672, "step": 7645 }, { "epoch": 0.28600135523518005, "grad_norm": 0.5545313358306885, "learning_rate": 1.6227811641718392e-05, "loss": 0.251, "step": 7650 }, { "epoch": 0.28618828422552983, "grad_norm": 0.4810824692249298, "learning_rate": 1.6223215888542832e-05, "loss": 0.2838, "step": 7655 }, { "epoch": 0.2863752132158796, "grad_norm": 0.3870065212249756, "learning_rate": 1.6218617989158426e-05, "loss": 0.2886, "step": 7660 }, { "epoch": 0.2865621422062294, "grad_norm": 0.5083006024360657, "learning_rate": 1.6214017945150863e-05, "loss": 0.2963, "step": 7665 }, { "epoch": 0.2867490711965792, "grad_norm": 0.4420605003833771, "learning_rate": 1.6209415758106565e-05, "loss": 0.3387, "step": 7670 }, { "epoch": 0.286936000186929, "grad_norm": 0.29529669880867004, "learning_rate": 1.620481142961269e-05, "loss": 0.2956, "step": 7675 }, { "epoch": 0.2871229291772788, "grad_norm": 0.25295814871788025, "learning_rate": 1.6200204961257148e-05, "loss": 0.2808, "step": 7680 }, { "epoch": 0.28730985816762855, "grad_norm": 0.3369937241077423, "learning_rate": 1.619559635462857e-05, "loss": 0.3474, "step": 7685 }, { "epoch": 0.2874967871579784, "grad_norm": 0.5575014352798462, "learning_rate": 1.6190985611316336e-05, "loss": 0.3019, "step": 7690 }, { "epoch": 0.28768371614832816, "grad_norm": 0.5619780421257019, "learning_rate": 1.618637273291056e-05, "loss": 0.2889, "step": 7695 }, { "epoch": 0.28787064513867794, "grad_norm": 0.6334633231163025, "learning_rate": 1.6181757721002092e-05, "loss": 0.406, "step": 7700 }, { "epoch": 0.2880575741290277, "grad_norm": 0.27999556064605713, "learning_rate": 1.617714057718251e-05, "loss": 0.3111, "step": 7705 }, { "epoch": 0.28824450311937755, "grad_norm": 0.2326452136039734, "learning_rate": 1.6172521303044145e-05, "loss": 0.3168, "step": 7710 }, { "epoch": 0.2884314321097273, "grad_norm": 0.3384558856487274, "learning_rate": 1.616789990018005e-05, "loss": 0.3016, "step": 7715 }, { "epoch": 0.2886183611000771, "grad_norm": 0.29935914278030396, "learning_rate": 1.616327637018401e-05, "loss": 0.2635, "step": 7720 }, { "epoch": 0.2888052900904269, "grad_norm": 0.3248922526836395, "learning_rate": 1.615865071465055e-05, "loss": 0.3969, "step": 7725 }, { "epoch": 0.2889922190807767, "grad_norm": 0.2685738205909729, "learning_rate": 1.6154022935174923e-05, "loss": 0.3624, "step": 7730 }, { "epoch": 0.2891791480711265, "grad_norm": 0.34825339913368225, "learning_rate": 1.614939303335312e-05, "loss": 0.2855, "step": 7735 }, { "epoch": 0.28936607706147627, "grad_norm": 0.41288071870803833, "learning_rate": 1.6144761010781867e-05, "loss": 0.2983, "step": 7740 }, { "epoch": 0.28955300605182605, "grad_norm": 0.34375739097595215, "learning_rate": 1.614012686905861e-05, "loss": 0.2286, "step": 7745 }, { "epoch": 0.2897399350421759, "grad_norm": 0.16923807561397552, "learning_rate": 1.6135490609781534e-05, "loss": 0.2648, "step": 7750 }, { "epoch": 0.28992686403252566, "grad_norm": 0.391319215297699, "learning_rate": 1.613085223454955e-05, "loss": 0.3083, "step": 7755 }, { "epoch": 0.29011379302287543, "grad_norm": 0.35019999742507935, "learning_rate": 1.61262117449623e-05, "loss": 0.2544, "step": 7760 }, { "epoch": 0.2903007220132252, "grad_norm": 0.35723742842674255, "learning_rate": 1.612156914262016e-05, "loss": 0.2807, "step": 7765 }, { "epoch": 0.29048765100357504, "grad_norm": 0.5157048106193542, "learning_rate": 1.6116924429124222e-05, "loss": 0.3461, "step": 7770 }, { "epoch": 0.2906745799939248, "grad_norm": 0.504540741443634, "learning_rate": 1.6112277606076325e-05, "loss": 0.3168, "step": 7775 }, { "epoch": 0.2908615089842746, "grad_norm": 0.4043572247028351, "learning_rate": 1.6107628675079023e-05, "loss": 0.2517, "step": 7780 }, { "epoch": 0.2910484379746244, "grad_norm": 0.29459133744239807, "learning_rate": 1.6102977637735587e-05, "loss": 0.2881, "step": 7785 }, { "epoch": 0.2912353669649742, "grad_norm": 0.20902110636234283, "learning_rate": 1.6098324495650044e-05, "loss": 0.3696, "step": 7790 }, { "epoch": 0.291422295955324, "grad_norm": 0.3627723157405853, "learning_rate": 1.609366925042712e-05, "loss": 0.1937, "step": 7795 }, { "epoch": 0.29160922494567376, "grad_norm": 0.4867866337299347, "learning_rate": 1.6089011903672277e-05, "loss": 0.362, "step": 7800 }, { "epoch": 0.29179615393602354, "grad_norm": 0.2678326964378357, "learning_rate": 1.6084352456991704e-05, "loss": 0.36, "step": 7805 }, { "epoch": 0.2919830829263733, "grad_norm": 0.7733646631240845, "learning_rate": 1.6079690911992304e-05, "loss": 0.2725, "step": 7810 }, { "epoch": 0.29217001191672315, "grad_norm": 1.191166639328003, "learning_rate": 1.6075027270281713e-05, "loss": 0.2314, "step": 7815 }, { "epoch": 0.2923569409070729, "grad_norm": 0.3552202880382538, "learning_rate": 1.607036153346829e-05, "loss": 0.3046, "step": 7820 }, { "epoch": 0.2925438698974227, "grad_norm": 0.42656761407852173, "learning_rate": 1.6065693703161113e-05, "loss": 0.315, "step": 7825 }, { "epoch": 0.2927307988877725, "grad_norm": 0.3393990099430084, "learning_rate": 1.606102378096998e-05, "loss": 0.2877, "step": 7830 }, { "epoch": 0.2929177278781223, "grad_norm": 0.3915458619594574, "learning_rate": 1.605635176850541e-05, "loss": 0.3389, "step": 7835 }, { "epoch": 0.2931046568684721, "grad_norm": 0.4257470369338989, "learning_rate": 1.605167766737866e-05, "loss": 0.317, "step": 7840 }, { "epoch": 0.29329158585882187, "grad_norm": 0.36982858180999756, "learning_rate": 1.604700147920168e-05, "loss": 0.3218, "step": 7845 }, { "epoch": 0.29347851484917165, "grad_norm": 0.4049963057041168, "learning_rate": 1.604232320558716e-05, "loss": 0.2894, "step": 7850 }, { "epoch": 0.2936654438395215, "grad_norm": 0.7059416174888611, "learning_rate": 1.6037642848148502e-05, "loss": 0.2444, "step": 7855 }, { "epoch": 0.29385237282987126, "grad_norm": 0.635959267616272, "learning_rate": 1.6032960408499824e-05, "loss": 0.2997, "step": 7860 }, { "epoch": 0.29403930182022103, "grad_norm": 0.24313776195049286, "learning_rate": 1.602827588825597e-05, "loss": 0.2254, "step": 7865 }, { "epoch": 0.2942262308105708, "grad_norm": 0.3153620660305023, "learning_rate": 1.6023589289032494e-05, "loss": 0.2558, "step": 7870 }, { "epoch": 0.29441315980092064, "grad_norm": 0.45469948649406433, "learning_rate": 1.6018900612445665e-05, "loss": 0.2513, "step": 7875 }, { "epoch": 0.2946000887912704, "grad_norm": 0.47144100069999695, "learning_rate": 1.601420986011248e-05, "loss": 0.2372, "step": 7880 }, { "epoch": 0.2947870177816202, "grad_norm": 0.22326113283634186, "learning_rate": 1.6009517033650643e-05, "loss": 0.2896, "step": 7885 }, { "epoch": 0.29497394677197, "grad_norm": 0.5031372308731079, "learning_rate": 1.6004822134678577e-05, "loss": 0.2962, "step": 7890 }, { "epoch": 0.2951608757623198, "grad_norm": 0.6720777750015259, "learning_rate": 1.6000125164815418e-05, "loss": 0.3161, "step": 7895 }, { "epoch": 0.2953478047526696, "grad_norm": 0.4197002351284027, "learning_rate": 1.5995426125681014e-05, "loss": 0.3376, "step": 7900 }, { "epoch": 0.29553473374301936, "grad_norm": 0.13400337100028992, "learning_rate": 1.599072501889593e-05, "loss": 0.3168, "step": 7905 }, { "epoch": 0.29572166273336914, "grad_norm": 0.3738100230693817, "learning_rate": 1.598602184608144e-05, "loss": 0.2846, "step": 7910 }, { "epoch": 0.29590859172371897, "grad_norm": 0.5275940895080566, "learning_rate": 1.598131660885954e-05, "loss": 0.2793, "step": 7915 }, { "epoch": 0.29609552071406875, "grad_norm": 0.41306841373443604, "learning_rate": 1.597660930885293e-05, "loss": 0.3141, "step": 7920 }, { "epoch": 0.2962824497044185, "grad_norm": 0.2588225305080414, "learning_rate": 1.5971899947685018e-05, "loss": 0.2731, "step": 7925 }, { "epoch": 0.2964693786947683, "grad_norm": 0.1754133701324463, "learning_rate": 1.5967188526979928e-05, "loss": 0.2432, "step": 7930 }, { "epoch": 0.29665630768511814, "grad_norm": 0.18629756569862366, "learning_rate": 1.5962475048362498e-05, "loss": 0.3458, "step": 7935 }, { "epoch": 0.2968432366754679, "grad_norm": 0.6092668175697327, "learning_rate": 1.5957759513458274e-05, "loss": 0.3054, "step": 7940 }, { "epoch": 0.2970301656658177, "grad_norm": 0.3897908926010132, "learning_rate": 1.59530419238935e-05, "loss": 0.2734, "step": 7945 }, { "epoch": 0.29721709465616747, "grad_norm": 0.4030907154083252, "learning_rate": 1.5948322281295147e-05, "loss": 0.3481, "step": 7950 }, { "epoch": 0.2974040236465173, "grad_norm": 0.3852006793022156, "learning_rate": 1.594360058729088e-05, "loss": 0.2923, "step": 7955 }, { "epoch": 0.2975909526368671, "grad_norm": 0.3748472034931183, "learning_rate": 1.5938876843509072e-05, "loss": 0.3313, "step": 7960 }, { "epoch": 0.29777788162721686, "grad_norm": 0.5024619698524475, "learning_rate": 1.5934151051578814e-05, "loss": 0.2947, "step": 7965 }, { "epoch": 0.29796481061756663, "grad_norm": 0.4308888018131256, "learning_rate": 1.592942321312989e-05, "loss": 0.2932, "step": 7970 }, { "epoch": 0.29815173960791647, "grad_norm": 0.4150945842266083, "learning_rate": 1.5924693329792808e-05, "loss": 0.351, "step": 7975 }, { "epoch": 0.29833866859826624, "grad_norm": 0.2940865755081177, "learning_rate": 1.5919961403198752e-05, "loss": 0.2743, "step": 7980 }, { "epoch": 0.298525597588616, "grad_norm": 0.2742552161216736, "learning_rate": 1.591522743497964e-05, "loss": 0.266, "step": 7985 }, { "epoch": 0.2987125265789658, "grad_norm": 0.5203600525856018, "learning_rate": 1.591049142676808e-05, "loss": 0.3469, "step": 7990 }, { "epoch": 0.29889945556931563, "grad_norm": 0.29020723700523376, "learning_rate": 1.590575338019738e-05, "loss": 0.2792, "step": 7995 }, { "epoch": 0.2990863845596654, "grad_norm": 0.5410657525062561, "learning_rate": 1.590101329690156e-05, "loss": 0.2447, "step": 8000 }, { "epoch": 0.2992733135500152, "grad_norm": 0.36561673879623413, "learning_rate": 1.589627117851534e-05, "loss": 0.2502, "step": 8005 }, { "epoch": 0.29946024254036496, "grad_norm": 0.2769913673400879, "learning_rate": 1.589152702667414e-05, "loss": 0.3428, "step": 8010 }, { "epoch": 0.2996471715307148, "grad_norm": 0.34356051683425903, "learning_rate": 1.5886780843014085e-05, "loss": 0.3429, "step": 8015 }, { "epoch": 0.2998341005210646, "grad_norm": 0.2786838710308075, "learning_rate": 1.5882032629171993e-05, "loss": 0.3215, "step": 8020 }, { "epoch": 0.30002102951141435, "grad_norm": 0.46318674087524414, "learning_rate": 1.587728238678539e-05, "loss": 0.3538, "step": 8025 }, { "epoch": 0.3002079585017641, "grad_norm": 0.3690614402294159, "learning_rate": 1.5872530117492495e-05, "loss": 0.3055, "step": 8030 }, { "epoch": 0.30039488749211396, "grad_norm": 0.8592503070831299, "learning_rate": 1.5867775822932233e-05, "loss": 0.3292, "step": 8035 }, { "epoch": 0.30058181648246374, "grad_norm": 0.4920271635055542, "learning_rate": 1.5863019504744222e-05, "loss": 0.2921, "step": 8040 }, { "epoch": 0.3007687454728135, "grad_norm": 0.3858177363872528, "learning_rate": 1.585826116456878e-05, "loss": 0.3832, "step": 8045 }, { "epoch": 0.3009556744631633, "grad_norm": 0.37820449471473694, "learning_rate": 1.5853500804046926e-05, "loss": 0.3231, "step": 8050 }, { "epoch": 0.30114260345351307, "grad_norm": 0.24952290952205658, "learning_rate": 1.5848738424820366e-05, "loss": 0.3362, "step": 8055 }, { "epoch": 0.3013295324438629, "grad_norm": 0.39352136850357056, "learning_rate": 1.584397402853151e-05, "loss": 0.2533, "step": 8060 }, { "epoch": 0.3015164614342127, "grad_norm": 0.34481966495513916, "learning_rate": 1.5839207616823468e-05, "loss": 0.3445, "step": 8065 }, { "epoch": 0.30170339042456246, "grad_norm": 0.6301359534263611, "learning_rate": 1.583443919134003e-05, "loss": 0.332, "step": 8070 }, { "epoch": 0.30189031941491223, "grad_norm": 0.6552008986473083, "learning_rate": 1.582966875372569e-05, "loss": 0.3042, "step": 8075 }, { "epoch": 0.30207724840526207, "grad_norm": 0.3802003860473633, "learning_rate": 1.582489630562564e-05, "loss": 0.2635, "step": 8080 }, { "epoch": 0.30226417739561184, "grad_norm": 0.4597533941268921, "learning_rate": 1.5820121848685758e-05, "loss": 0.2614, "step": 8085 }, { "epoch": 0.3024511063859616, "grad_norm": 0.4375554025173187, "learning_rate": 1.581534538455262e-05, "loss": 0.342, "step": 8090 }, { "epoch": 0.3026380353763114, "grad_norm": 0.36787232756614685, "learning_rate": 1.5810566914873487e-05, "loss": 0.3305, "step": 8095 }, { "epoch": 0.30282496436666123, "grad_norm": 0.342817485332489, "learning_rate": 1.580578644129632e-05, "loss": 0.3672, "step": 8100 }, { "epoch": 0.303011893357011, "grad_norm": 0.8170397281646729, "learning_rate": 1.5801003965469764e-05, "loss": 0.2564, "step": 8105 }, { "epoch": 0.3031988223473608, "grad_norm": 0.1437387764453888, "learning_rate": 1.5796219489043164e-05, "loss": 0.3617, "step": 8110 }, { "epoch": 0.30338575133771056, "grad_norm": 0.4565275013446808, "learning_rate": 1.5791433013666544e-05, "loss": 0.2808, "step": 8115 }, { "epoch": 0.3035726803280604, "grad_norm": 0.5784963369369507, "learning_rate": 1.5786644540990622e-05, "loss": 0.234, "step": 8120 }, { "epoch": 0.3037596093184102, "grad_norm": 0.44726547598838806, "learning_rate": 1.578185407266681e-05, "loss": 0.3222, "step": 8125 }, { "epoch": 0.30394653830875995, "grad_norm": 0.260372132062912, "learning_rate": 1.5777061610347197e-05, "loss": 0.2867, "step": 8130 }, { "epoch": 0.3041334672991097, "grad_norm": 0.5115578174591064, "learning_rate": 1.5772267155684565e-05, "loss": 0.3603, "step": 8135 }, { "epoch": 0.30432039628945956, "grad_norm": 0.4961742162704468, "learning_rate": 1.576747071033239e-05, "loss": 0.2761, "step": 8140 }, { "epoch": 0.30450732527980934, "grad_norm": 0.07623039931058884, "learning_rate": 1.5762672275944826e-05, "loss": 0.3036, "step": 8145 }, { "epoch": 0.3046942542701591, "grad_norm": 0.7987903356552124, "learning_rate": 1.5757871854176716e-05, "loss": 0.2701, "step": 8150 }, { "epoch": 0.3048811832605089, "grad_norm": 0.34833961725234985, "learning_rate": 1.5753069446683586e-05, "loss": 0.2672, "step": 8155 }, { "epoch": 0.3050681122508587, "grad_norm": 0.5639849305152893, "learning_rate": 1.574826505512165e-05, "loss": 0.3335, "step": 8160 }, { "epoch": 0.3052550412412085, "grad_norm": 0.2684768736362457, "learning_rate": 1.5743458681147807e-05, "loss": 0.2879, "step": 8165 }, { "epoch": 0.3054419702315583, "grad_norm": 0.32152795791625977, "learning_rate": 1.5738650326419636e-05, "loss": 0.3091, "step": 8170 }, { "epoch": 0.30562889922190806, "grad_norm": 0.4024050533771515, "learning_rate": 1.57338399925954e-05, "loss": 0.3315, "step": 8175 }, { "epoch": 0.3058158282122579, "grad_norm": 0.32922789454460144, "learning_rate": 1.5729027681334043e-05, "loss": 0.3058, "step": 8180 }, { "epoch": 0.30600275720260767, "grad_norm": 0.37226831912994385, "learning_rate": 1.57242133942952e-05, "loss": 0.2949, "step": 8185 }, { "epoch": 0.30618968619295744, "grad_norm": 0.5778080821037292, "learning_rate": 1.5719397133139172e-05, "loss": 0.3008, "step": 8190 }, { "epoch": 0.3063766151833072, "grad_norm": 0.30780428647994995, "learning_rate": 1.5714578899526957e-05, "loss": 0.2412, "step": 8195 }, { "epoch": 0.30656354417365705, "grad_norm": 0.13755351305007935, "learning_rate": 1.5709758695120222e-05, "loss": 0.2439, "step": 8200 }, { "epoch": 0.30675047316400683, "grad_norm": 0.5074127316474915, "learning_rate": 1.570493652158132e-05, "loss": 0.3904, "step": 8205 }, { "epoch": 0.3069374021543566, "grad_norm": 0.27860695123672485, "learning_rate": 1.5700112380573277e-05, "loss": 0.313, "step": 8210 }, { "epoch": 0.3071243311447064, "grad_norm": 0.4412946403026581, "learning_rate": 1.5695286273759805e-05, "loss": 0.3682, "step": 8215 }, { "epoch": 0.3073112601350562, "grad_norm": 0.402165025472641, "learning_rate": 1.569045820280529e-05, "loss": 0.2835, "step": 8220 }, { "epoch": 0.307498189125406, "grad_norm": 0.49247145652770996, "learning_rate": 1.5685628169374793e-05, "loss": 0.2833, "step": 8225 }, { "epoch": 0.3076851181157558, "grad_norm": 0.41755223274230957, "learning_rate": 1.568079617513405e-05, "loss": 0.2886, "step": 8230 }, { "epoch": 0.30787204710610555, "grad_norm": 0.34908807277679443, "learning_rate": 1.5675962221749478e-05, "loss": 0.275, "step": 8235 }, { "epoch": 0.3080589760964554, "grad_norm": 0.3702501356601715, "learning_rate": 1.567112631088818e-05, "loss": 0.3203, "step": 8240 }, { "epoch": 0.30824590508680516, "grad_norm": 0.42112669348716736, "learning_rate": 1.5666288444217915e-05, "loss": 0.2694, "step": 8245 }, { "epoch": 0.30843283407715494, "grad_norm": 0.29839271306991577, "learning_rate": 1.5661448623407122e-05, "loss": 0.288, "step": 8250 }, { "epoch": 0.3086197630675047, "grad_norm": 0.4897206425666809, "learning_rate": 1.565660685012492e-05, "loss": 0.3968, "step": 8255 }, { "epoch": 0.30880669205785455, "grad_norm": 0.6160528659820557, "learning_rate": 1.5651763126041098e-05, "loss": 0.2819, "step": 8260 }, { "epoch": 0.3089936210482043, "grad_norm": 0.2819858193397522, "learning_rate": 1.564691745282612e-05, "loss": 0.2829, "step": 8265 }, { "epoch": 0.3091805500385541, "grad_norm": 0.33518272638320923, "learning_rate": 1.5642069832151116e-05, "loss": 0.2391, "step": 8270 }, { "epoch": 0.3093674790289039, "grad_norm": 0.3020849823951721, "learning_rate": 1.5637220265687894e-05, "loss": 0.2829, "step": 8275 }, { "epoch": 0.3095544080192537, "grad_norm": 0.42169663310050964, "learning_rate": 1.5632368755108926e-05, "loss": 0.312, "step": 8280 }, { "epoch": 0.3097413370096035, "grad_norm": 0.34373271465301514, "learning_rate": 1.5627515302087362e-05, "loss": 0.2879, "step": 8285 }, { "epoch": 0.30992826599995327, "grad_norm": 0.4371539056301117, "learning_rate": 1.562265990829702e-05, "loss": 0.3221, "step": 8290 }, { "epoch": 0.31011519499030304, "grad_norm": 0.3401534855365753, "learning_rate": 1.5617802575412385e-05, "loss": 0.2821, "step": 8295 }, { "epoch": 0.3103021239806528, "grad_norm": 0.605153501033783, "learning_rate": 1.5612943305108615e-05, "loss": 0.3055, "step": 8300 }, { "epoch": 0.31048905297100265, "grad_norm": 0.4191633462905884, "learning_rate": 1.5608082099061528e-05, "loss": 0.2807, "step": 8305 }, { "epoch": 0.31067598196135243, "grad_norm": 0.2965903580188751, "learning_rate": 1.560321895894762e-05, "loss": 0.2722, "step": 8310 }, { "epoch": 0.3108629109517022, "grad_norm": 0.39185696840286255, "learning_rate": 1.559835388644404e-05, "loss": 0.3032, "step": 8315 }, { "epoch": 0.311049839942052, "grad_norm": 0.2846728265285492, "learning_rate": 1.5593486883228617e-05, "loss": 0.2977, "step": 8320 }, { "epoch": 0.3112367689324018, "grad_norm": 0.35575684905052185, "learning_rate": 1.5588617950979846e-05, "loss": 0.3072, "step": 8325 }, { "epoch": 0.3114236979227516, "grad_norm": 0.12662628293037415, "learning_rate": 1.5583747091376877e-05, "loss": 0.2764, "step": 8330 }, { "epoch": 0.3116106269131014, "grad_norm": 0.439426451921463, "learning_rate": 1.5578874306099533e-05, "loss": 0.2643, "step": 8335 }, { "epoch": 0.31179755590345115, "grad_norm": 0.13574817776679993, "learning_rate": 1.5573999596828292e-05, "loss": 0.2879, "step": 8340 }, { "epoch": 0.311984484893801, "grad_norm": 0.6407355666160583, "learning_rate": 1.5569122965244306e-05, "loss": 0.2412, "step": 8345 }, { "epoch": 0.31217141388415076, "grad_norm": 0.5395145416259766, "learning_rate": 1.5564244413029385e-05, "loss": 0.3068, "step": 8350 }, { "epoch": 0.31235834287450054, "grad_norm": 0.17821340262889862, "learning_rate": 1.5559363941866005e-05, "loss": 0.4122, "step": 8355 }, { "epoch": 0.3125452718648503, "grad_norm": 0.3062966465950012, "learning_rate": 1.5554481553437294e-05, "loss": 0.2104, "step": 8360 }, { "epoch": 0.31273220085520015, "grad_norm": 0.5049282908439636, "learning_rate": 1.5549597249427052e-05, "loss": 0.253, "step": 8365 }, { "epoch": 0.3129191298455499, "grad_norm": 0.49796777963638306, "learning_rate": 1.5544711031519736e-05, "loss": 0.2669, "step": 8370 }, { "epoch": 0.3131060588358997, "grad_norm": 0.34029170870780945, "learning_rate": 1.553982290140046e-05, "loss": 0.3638, "step": 8375 }, { "epoch": 0.3132929878262495, "grad_norm": 0.5282084941864014, "learning_rate": 1.5534932860755e-05, "loss": 0.2841, "step": 8380 }, { "epoch": 0.3134799168165993, "grad_norm": 0.4704183340072632, "learning_rate": 1.553004091126979e-05, "loss": 0.3326, "step": 8385 }, { "epoch": 0.3136668458069491, "grad_norm": 0.442396879196167, "learning_rate": 1.552514705463193e-05, "loss": 0.2584, "step": 8390 }, { "epoch": 0.31385377479729887, "grad_norm": 0.40408533811569214, "learning_rate": 1.5520251292529165e-05, "loss": 0.3079, "step": 8395 }, { "epoch": 0.31404070378764865, "grad_norm": 0.34462475776672363, "learning_rate": 1.5515353626649905e-05, "loss": 0.2727, "step": 8400 }, { "epoch": 0.3142276327779985, "grad_norm": 0.043053608387708664, "learning_rate": 1.5510454058683216e-05, "loss": 0.2879, "step": 8405 }, { "epoch": 0.31441456176834826, "grad_norm": 0.3124004006385803, "learning_rate": 1.5505552590318814e-05, "loss": 0.3576, "step": 8410 }, { "epoch": 0.31460149075869803, "grad_norm": 0.4973233938217163, "learning_rate": 1.5500649223247076e-05, "loss": 0.2801, "step": 8415 }, { "epoch": 0.3147884197490478, "grad_norm": 0.4319716989994049, "learning_rate": 1.5495743959159034e-05, "loss": 0.2454, "step": 8420 }, { "epoch": 0.31497534873939764, "grad_norm": 0.4788093864917755, "learning_rate": 1.549083679974638e-05, "loss": 0.3537, "step": 8425 }, { "epoch": 0.3151622777297474, "grad_norm": 0.6235319972038269, "learning_rate": 1.548592774670144e-05, "loss": 0.2974, "step": 8430 }, { "epoch": 0.3153492067200972, "grad_norm": 0.3328862190246582, "learning_rate": 1.5481016801717213e-05, "loss": 0.37, "step": 8435 }, { "epoch": 0.315536135710447, "grad_norm": 0.6881264448165894, "learning_rate": 1.5476103966487345e-05, "loss": 0.3738, "step": 8440 }, { "epoch": 0.3157230647007968, "grad_norm": 0.4371614456176758, "learning_rate": 1.547118924270613e-05, "loss": 0.2477, "step": 8445 }, { "epoch": 0.3159099936911466, "grad_norm": 0.6535786986351013, "learning_rate": 1.546627263206851e-05, "loss": 0.3318, "step": 8450 }, { "epoch": 0.31609692268149636, "grad_norm": 0.5133426189422607, "learning_rate": 1.546135413627009e-05, "loss": 0.2697, "step": 8455 }, { "epoch": 0.31628385167184614, "grad_norm": 0.5738974809646606, "learning_rate": 1.5456433757007115e-05, "loss": 0.247, "step": 8460 }, { "epoch": 0.31647078066219597, "grad_norm": 0.12999732792377472, "learning_rate": 1.5451511495976483e-05, "loss": 0.2726, "step": 8465 }, { "epoch": 0.31665770965254575, "grad_norm": 0.30150121450424194, "learning_rate": 1.5446587354875742e-05, "loss": 0.2792, "step": 8470 }, { "epoch": 0.3168446386428955, "grad_norm": 0.36538806557655334, "learning_rate": 1.544166133540309e-05, "loss": 0.3656, "step": 8475 }, { "epoch": 0.3170315676332453, "grad_norm": 0.5512821078300476, "learning_rate": 1.5436733439257362e-05, "loss": 0.2357, "step": 8480 }, { "epoch": 0.31721849662359514, "grad_norm": 0.4331134259700775, "learning_rate": 1.5431803668138056e-05, "loss": 0.2833, "step": 8485 }, { "epoch": 0.3174054256139449, "grad_norm": 0.2866262197494507, "learning_rate": 1.5426872023745305e-05, "loss": 0.1892, "step": 8490 }, { "epoch": 0.3175923546042947, "grad_norm": 0.36052101850509644, "learning_rate": 1.5421938507779893e-05, "loss": 0.2636, "step": 8495 }, { "epoch": 0.31777928359464447, "grad_norm": 0.4259335398674011, "learning_rate": 1.5417003121943247e-05, "loss": 0.2615, "step": 8500 }, { "epoch": 0.3179662125849943, "grad_norm": 0.270948201417923, "learning_rate": 1.541206586793744e-05, "loss": 0.3241, "step": 8505 }, { "epoch": 0.3181531415753441, "grad_norm": 0.2681352496147156, "learning_rate": 1.5407126747465195e-05, "loss": 0.2701, "step": 8510 }, { "epoch": 0.31834007056569386, "grad_norm": 0.4917761981487274, "learning_rate": 1.5402185762229864e-05, "loss": 0.2454, "step": 8515 }, { "epoch": 0.31852699955604363, "grad_norm": 0.4351048767566681, "learning_rate": 1.539724291393546e-05, "loss": 0.2836, "step": 8520 }, { "epoch": 0.31871392854639347, "grad_norm": 0.7462584376335144, "learning_rate": 1.5392298204286623e-05, "loss": 0.2905, "step": 8525 }, { "epoch": 0.31890085753674324, "grad_norm": 0.36334559321403503, "learning_rate": 1.5387351634988644e-05, "loss": 0.2733, "step": 8530 }, { "epoch": 0.319087786527093, "grad_norm": 0.22820745408535004, "learning_rate": 1.5382403207747453e-05, "loss": 0.253, "step": 8535 }, { "epoch": 0.3192747155174428, "grad_norm": 0.45713189244270325, "learning_rate": 1.537745292426962e-05, "loss": 0.2474, "step": 8540 }, { "epoch": 0.3194616445077926, "grad_norm": 0.1870334893465042, "learning_rate": 1.5372500786262357e-05, "loss": 0.3835, "step": 8545 }, { "epoch": 0.3196485734981424, "grad_norm": 0.6428795456886292, "learning_rate": 1.5367546795433517e-05, "loss": 0.3194, "step": 8550 }, { "epoch": 0.3198355024884922, "grad_norm": 0.47562816739082336, "learning_rate": 1.5362590953491586e-05, "loss": 0.2485, "step": 8555 }, { "epoch": 0.32002243147884196, "grad_norm": 0.418270468711853, "learning_rate": 1.535763326214569e-05, "loss": 0.318, "step": 8560 }, { "epoch": 0.32020936046919174, "grad_norm": 0.7207987904548645, "learning_rate": 1.53526737231056e-05, "loss": 0.347, "step": 8565 }, { "epoch": 0.32039628945954157, "grad_norm": 0.4449211061000824, "learning_rate": 1.5347712338081717e-05, "loss": 0.2959, "step": 8570 }, { "epoch": 0.32058321844989135, "grad_norm": 0.29263216257095337, "learning_rate": 1.5342749108785084e-05, "loss": 0.2846, "step": 8575 }, { "epoch": 0.3207701474402411, "grad_norm": 0.5871028900146484, "learning_rate": 1.5337784036927367e-05, "loss": 0.3511, "step": 8580 }, { "epoch": 0.3209570764305909, "grad_norm": 0.3245595395565033, "learning_rate": 1.533281712422088e-05, "loss": 0.3302, "step": 8585 }, { "epoch": 0.32114400542094074, "grad_norm": 0.6525892019271851, "learning_rate": 1.5327848372378574e-05, "loss": 0.3053, "step": 8590 }, { "epoch": 0.3213309344112905, "grad_norm": 0.3770609498023987, "learning_rate": 1.5322877783114027e-05, "loss": 0.3547, "step": 8595 }, { "epoch": 0.3215178634016403, "grad_norm": 0.4422697126865387, "learning_rate": 1.5317905358141456e-05, "loss": 0.2285, "step": 8600 }, { "epoch": 0.32170479239199007, "grad_norm": 0.3939485549926758, "learning_rate": 1.53129310991757e-05, "loss": 0.3421, "step": 8605 }, { "epoch": 0.3218917213823399, "grad_norm": 0.3674333095550537, "learning_rate": 1.5307955007932243e-05, "loss": 0.2389, "step": 8610 }, { "epoch": 0.3220786503726897, "grad_norm": 0.27263256907463074, "learning_rate": 1.5302977086127194e-05, "loss": 0.2633, "step": 8615 }, { "epoch": 0.32226557936303946, "grad_norm": 0.3871941864490509, "learning_rate": 1.5297997335477302e-05, "loss": 0.314, "step": 8620 }, { "epoch": 0.32245250835338923, "grad_norm": 0.4470764398574829, "learning_rate": 1.5293015757699935e-05, "loss": 0.2881, "step": 8625 }, { "epoch": 0.32263943734373907, "grad_norm": 0.6860572695732117, "learning_rate": 1.5288032354513095e-05, "loss": 0.3377, "step": 8630 }, { "epoch": 0.32282636633408884, "grad_norm": 0.4774250090122223, "learning_rate": 1.5283047127635418e-05, "loss": 0.2989, "step": 8635 }, { "epoch": 0.3230132953244386, "grad_norm": 0.2891339957714081, "learning_rate": 1.5278060078786166e-05, "loss": 0.2577, "step": 8640 }, { "epoch": 0.3232002243147884, "grad_norm": 0.7874906659126282, "learning_rate": 1.5273071209685227e-05, "loss": 0.3161, "step": 8645 }, { "epoch": 0.32338715330513823, "grad_norm": 0.3914336562156677, "learning_rate": 1.526808052205312e-05, "loss": 0.2692, "step": 8650 }, { "epoch": 0.323574082295488, "grad_norm": 0.9589570164680481, "learning_rate": 1.526308801761099e-05, "loss": 0.2913, "step": 8655 }, { "epoch": 0.3237610112858378, "grad_norm": 0.19636771082878113, "learning_rate": 1.5258093698080614e-05, "loss": 0.3008, "step": 8660 }, { "epoch": 0.32394794027618756, "grad_norm": 0.27566975355148315, "learning_rate": 1.5253097565184382e-05, "loss": 0.2644, "step": 8665 }, { "epoch": 0.3241348692665374, "grad_norm": 0.3337952494621277, "learning_rate": 1.5248099620645321e-05, "loss": 0.3182, "step": 8670 }, { "epoch": 0.3243217982568872, "grad_norm": 0.30488675832748413, "learning_rate": 1.5243099866187076e-05, "loss": 0.3166, "step": 8675 }, { "epoch": 0.32450872724723695, "grad_norm": 0.2742447853088379, "learning_rate": 1.5238098303533923e-05, "loss": 0.2807, "step": 8680 }, { "epoch": 0.3246956562375867, "grad_norm": 0.35254549980163574, "learning_rate": 1.5233094934410755e-05, "loss": 0.2999, "step": 8685 }, { "epoch": 0.32488258522793656, "grad_norm": 0.1497417390346527, "learning_rate": 1.522808976054309e-05, "loss": 0.3525, "step": 8690 }, { "epoch": 0.32506951421828634, "grad_norm": 0.22152622044086456, "learning_rate": 1.522308278365707e-05, "loss": 0.3084, "step": 8695 }, { "epoch": 0.3252564432086361, "grad_norm": 0.4548346996307373, "learning_rate": 1.521807400547946e-05, "loss": 0.3306, "step": 8700 }, { "epoch": 0.3254433721989859, "grad_norm": 0.7274771928787231, "learning_rate": 1.5213063427737639e-05, "loss": 0.2431, "step": 8705 }, { "epoch": 0.3256303011893357, "grad_norm": 0.4136773347854614, "learning_rate": 1.5208051052159618e-05, "loss": 0.2497, "step": 8710 }, { "epoch": 0.3258172301796855, "grad_norm": 0.4158473014831543, "learning_rate": 1.520303688047402e-05, "loss": 0.3343, "step": 8715 }, { "epoch": 0.3260041591700353, "grad_norm": 0.26390954852104187, "learning_rate": 1.5198020914410085e-05, "loss": 0.2489, "step": 8720 }, { "epoch": 0.32619108816038506, "grad_norm": 0.3240726888179779, "learning_rate": 1.5193003155697681e-05, "loss": 0.3173, "step": 8725 }, { "epoch": 0.3263780171507349, "grad_norm": 0.4203800857067108, "learning_rate": 1.5187983606067284e-05, "loss": 0.2545, "step": 8730 }, { "epoch": 0.32656494614108467, "grad_norm": 0.40211477875709534, "learning_rate": 1.5182962267249997e-05, "loss": 0.2805, "step": 8735 }, { "epoch": 0.32675187513143444, "grad_norm": 0.35995474457740784, "learning_rate": 1.5177939140977535e-05, "loss": 0.3068, "step": 8740 }, { "epoch": 0.3269388041217842, "grad_norm": 0.36302804946899414, "learning_rate": 1.517291422898223e-05, "loss": 0.2778, "step": 8745 }, { "epoch": 0.32712573311213405, "grad_norm": 0.4798825681209564, "learning_rate": 1.5167887532997032e-05, "loss": 0.2644, "step": 8750 }, { "epoch": 0.32731266210248383, "grad_norm": 0.3247123658657074, "learning_rate": 1.51628590547555e-05, "loss": 0.259, "step": 8755 }, { "epoch": 0.3274995910928336, "grad_norm": 0.23680318892002106, "learning_rate": 1.5157828795991813e-05, "loss": 0.3716, "step": 8760 }, { "epoch": 0.3276865200831834, "grad_norm": 0.4908321797847748, "learning_rate": 1.5152796758440769e-05, "loss": 0.2849, "step": 8765 }, { "epoch": 0.3278734490735332, "grad_norm": 0.8543740510940552, "learning_rate": 1.514776294383777e-05, "loss": 0.3428, "step": 8770 }, { "epoch": 0.328060378063883, "grad_norm": 0.6384404301643372, "learning_rate": 1.514272735391883e-05, "loss": 0.291, "step": 8775 }, { "epoch": 0.3282473070542328, "grad_norm": 0.33483198285102844, "learning_rate": 1.5137689990420583e-05, "loss": 0.3363, "step": 8780 }, { "epoch": 0.32843423604458255, "grad_norm": 0.26816031336784363, "learning_rate": 1.5132650855080275e-05, "loss": 0.298, "step": 8785 }, { "epoch": 0.3286211650349323, "grad_norm": 0.3883032500743866, "learning_rate": 1.5127609949635753e-05, "loss": 0.2433, "step": 8790 }, { "epoch": 0.32880809402528216, "grad_norm": 0.31391748785972595, "learning_rate": 1.5122567275825486e-05, "loss": 0.3271, "step": 8795 }, { "epoch": 0.32899502301563194, "grad_norm": 0.5481862425804138, "learning_rate": 1.5117522835388545e-05, "loss": 0.2805, "step": 8800 }, { "epoch": 0.3291819520059817, "grad_norm": 0.38565173745155334, "learning_rate": 1.5112476630064615e-05, "loss": 0.2974, "step": 8805 }, { "epoch": 0.3293688809963315, "grad_norm": 0.3033200800418854, "learning_rate": 1.5107428661593983e-05, "loss": 0.2851, "step": 8810 }, { "epoch": 0.3295558099866813, "grad_norm": 0.3411104381084442, "learning_rate": 1.5102378931717556e-05, "loss": 0.3146, "step": 8815 }, { "epoch": 0.3297427389770311, "grad_norm": 0.32924318313598633, "learning_rate": 1.5097327442176837e-05, "loss": 0.2499, "step": 8820 }, { "epoch": 0.3299296679673809, "grad_norm": 0.6186853647232056, "learning_rate": 1.5092274194713933e-05, "loss": 0.3388, "step": 8825 }, { "epoch": 0.33011659695773066, "grad_norm": 0.2822064757347107, "learning_rate": 1.5087219191071579e-05, "loss": 0.307, "step": 8830 }, { "epoch": 0.3303035259480805, "grad_norm": 0.4423992931842804, "learning_rate": 1.5082162432993092e-05, "loss": 0.2772, "step": 8835 }, { "epoch": 0.33049045493843027, "grad_norm": 0.6045133471488953, "learning_rate": 1.5077103922222402e-05, "loss": 0.3682, "step": 8840 }, { "epoch": 0.33067738392878004, "grad_norm": 0.26116663217544556, "learning_rate": 1.507204366050405e-05, "loss": 0.4336, "step": 8845 }, { "epoch": 0.3308643129191298, "grad_norm": 0.621884286403656, "learning_rate": 1.5066981649583168e-05, "loss": 0.2781, "step": 8850 }, { "epoch": 0.33105124190947965, "grad_norm": 0.43372565507888794, "learning_rate": 1.5061917891205504e-05, "loss": 0.3472, "step": 8855 }, { "epoch": 0.33123817089982943, "grad_norm": 1.028779149055481, "learning_rate": 1.5056852387117405e-05, "loss": 0.4705, "step": 8860 }, { "epoch": 0.3314250998901792, "grad_norm": 0.4312193691730499, "learning_rate": 1.505178513906581e-05, "loss": 0.3649, "step": 8865 }, { "epoch": 0.331612028880529, "grad_norm": 0.3952298164367676, "learning_rate": 1.5046716148798273e-05, "loss": 0.3109, "step": 8870 }, { "epoch": 0.3317989578708788, "grad_norm": 0.19110475480556488, "learning_rate": 1.5041645418062942e-05, "loss": 0.3217, "step": 8875 }, { "epoch": 0.3319858868612286, "grad_norm": 0.7014629244804382, "learning_rate": 1.5036572948608572e-05, "loss": 0.3183, "step": 8880 }, { "epoch": 0.3321728158515784, "grad_norm": 0.27941253781318665, "learning_rate": 1.5031498742184507e-05, "loss": 0.2359, "step": 8885 }, { "epoch": 0.33235974484192815, "grad_norm": 0.29573166370391846, "learning_rate": 1.5026422800540694e-05, "loss": 0.2383, "step": 8890 }, { "epoch": 0.332546673832278, "grad_norm": 0.36960336565971375, "learning_rate": 1.5021345125427684e-05, "loss": 0.3709, "step": 8895 }, { "epoch": 0.33273360282262776, "grad_norm": 0.3210650086402893, "learning_rate": 1.501626571859662e-05, "loss": 0.3651, "step": 8900 }, { "epoch": 0.33292053181297754, "grad_norm": 0.6550555229187012, "learning_rate": 1.5011184581799243e-05, "loss": 0.3616, "step": 8905 }, { "epoch": 0.3331074608033273, "grad_norm": 0.3219583034515381, "learning_rate": 1.5006101716787896e-05, "loss": 0.3333, "step": 8910 }, { "epoch": 0.33329438979367715, "grad_norm": 0.40844401717185974, "learning_rate": 1.5001017125315503e-05, "loss": 0.3207, "step": 8915 }, { "epoch": 0.3334813187840269, "grad_norm": 0.5592089295387268, "learning_rate": 1.4995930809135604e-05, "loss": 0.2584, "step": 8920 }, { "epoch": 0.3336682477743767, "grad_norm": 0.3412552773952484, "learning_rate": 1.4990842770002321e-05, "loss": 0.3286, "step": 8925 }, { "epoch": 0.3338551767647265, "grad_norm": 0.2517843246459961, "learning_rate": 1.4985753009670375e-05, "loss": 0.3027, "step": 8930 }, { "epoch": 0.3340421057550763, "grad_norm": 0.274789422750473, "learning_rate": 1.4980661529895073e-05, "loss": 0.3058, "step": 8935 }, { "epoch": 0.3342290347454261, "grad_norm": 0.3245002031326294, "learning_rate": 1.4975568332432322e-05, "loss": 0.356, "step": 8940 }, { "epoch": 0.33441596373577587, "grad_norm": 0.2871999442577362, "learning_rate": 1.4970473419038623e-05, "loss": 0.2805, "step": 8945 }, { "epoch": 0.33460289272612564, "grad_norm": 0.3988129496574402, "learning_rate": 1.4965376791471062e-05, "loss": 0.2596, "step": 8950 }, { "epoch": 0.3347898217164755, "grad_norm": 0.3798826038837433, "learning_rate": 1.4960278451487327e-05, "loss": 0.3168, "step": 8955 }, { "epoch": 0.33497675070682525, "grad_norm": 0.336479127407074, "learning_rate": 1.4955178400845678e-05, "loss": 0.294, "step": 8960 }, { "epoch": 0.33516367969717503, "grad_norm": 0.31239262223243713, "learning_rate": 1.4950076641304984e-05, "loss": 0.2486, "step": 8965 }, { "epoch": 0.3353506086875248, "grad_norm": 0.4820721745491028, "learning_rate": 1.4944973174624695e-05, "loss": 0.3584, "step": 8970 }, { "epoch": 0.33553753767787464, "grad_norm": 0.7084938287734985, "learning_rate": 1.493986800256485e-05, "loss": 0.2835, "step": 8975 }, { "epoch": 0.3357244666682244, "grad_norm": 0.13603666424751282, "learning_rate": 1.4934761126886077e-05, "loss": 0.2406, "step": 8980 }, { "epoch": 0.3359113956585742, "grad_norm": 0.28574085235595703, "learning_rate": 1.4929652549349587e-05, "loss": 0.331, "step": 8985 }, { "epoch": 0.336098324648924, "grad_norm": 0.3157019317150116, "learning_rate": 1.4924542271717186e-05, "loss": 0.2754, "step": 8990 }, { "epoch": 0.3362852536392738, "grad_norm": 0.4523063004016876, "learning_rate": 1.4919430295751262e-05, "loss": 0.2865, "step": 8995 }, { "epoch": 0.3364721826296236, "grad_norm": 0.32611748576164246, "learning_rate": 1.4914316623214788e-05, "loss": 0.3052, "step": 9000 }, { "epoch": 0.33665911161997336, "grad_norm": 0.34001392126083374, "learning_rate": 1.4909201255871325e-05, "loss": 0.253, "step": 9005 }, { "epoch": 0.33684604061032314, "grad_norm": 0.3103577792644501, "learning_rate": 1.4904084195485014e-05, "loss": 0.2962, "step": 9010 }, { "epoch": 0.33703296960067297, "grad_norm": 0.3384459614753723, "learning_rate": 1.4898965443820584e-05, "loss": 0.2931, "step": 9015 }, { "epoch": 0.33721989859102275, "grad_norm": 0.47207170724868774, "learning_rate": 1.4893845002643345e-05, "loss": 0.3106, "step": 9020 }, { "epoch": 0.3374068275813725, "grad_norm": 0.25210464000701904, "learning_rate": 1.4888722873719195e-05, "loss": 0.3618, "step": 9025 }, { "epoch": 0.3375937565717223, "grad_norm": 0.4364457130432129, "learning_rate": 1.4883599058814602e-05, "loss": 0.3376, "step": 9030 }, { "epoch": 0.3377806855620721, "grad_norm": 0.4352976381778717, "learning_rate": 1.4878473559696625e-05, "loss": 0.3076, "step": 9035 }, { "epoch": 0.3379676145524219, "grad_norm": 0.4709399342536926, "learning_rate": 1.4873346378132905e-05, "loss": 0.301, "step": 9040 }, { "epoch": 0.3381545435427717, "grad_norm": 0.4114588499069214, "learning_rate": 1.4868217515891657e-05, "loss": 0.2685, "step": 9045 }, { "epoch": 0.33834147253312147, "grad_norm": 0.7446982860565186, "learning_rate": 1.4863086974741684e-05, "loss": 0.294, "step": 9050 }, { "epoch": 0.33852840152347125, "grad_norm": 0.29010334610939026, "learning_rate": 1.4857954756452353e-05, "loss": 0.356, "step": 9055 }, { "epoch": 0.3387153305138211, "grad_norm": 0.4005201756954193, "learning_rate": 1.4852820862793626e-05, "loss": 0.2966, "step": 9060 }, { "epoch": 0.33890225950417086, "grad_norm": 0.7483821511268616, "learning_rate": 1.4847685295536037e-05, "loss": 0.3161, "step": 9065 }, { "epoch": 0.33908918849452063, "grad_norm": 0.4686259925365448, "learning_rate": 1.4842548056450692e-05, "loss": 0.3532, "step": 9070 }, { "epoch": 0.3392761174848704, "grad_norm": 0.44194498658180237, "learning_rate": 1.4837409147309276e-05, "loss": 0.2582, "step": 9075 }, { "epoch": 0.33946304647522024, "grad_norm": 0.47664833068847656, "learning_rate": 1.4832268569884058e-05, "loss": 0.3903, "step": 9080 }, { "epoch": 0.33964997546557, "grad_norm": 0.488346129655838, "learning_rate": 1.4827126325947872e-05, "loss": 0.2816, "step": 9085 }, { "epoch": 0.3398369044559198, "grad_norm": 0.3450961709022522, "learning_rate": 1.4821982417274128e-05, "loss": 0.3073, "step": 9090 }, { "epoch": 0.3400238334462696, "grad_norm": 0.32565736770629883, "learning_rate": 1.4816836845636817e-05, "loss": 0.3467, "step": 9095 }, { "epoch": 0.3402107624366194, "grad_norm": 0.3779214322566986, "learning_rate": 1.4811689612810498e-05, "loss": 0.2372, "step": 9100 }, { "epoch": 0.3403976914269692, "grad_norm": 11.968618392944336, "learning_rate": 1.4806540720570306e-05, "loss": 0.4737, "step": 9105 }, { "epoch": 0.34058462041731896, "grad_norm": 0.4189259111881256, "learning_rate": 1.4801390170691941e-05, "loss": 0.2739, "step": 9110 }, { "epoch": 0.34077154940766874, "grad_norm": 0.33259811997413635, "learning_rate": 1.4796237964951686e-05, "loss": 0.2829, "step": 9115 }, { "epoch": 0.34095847839801857, "grad_norm": 0.2681080400943756, "learning_rate": 1.4791084105126385e-05, "loss": 0.2564, "step": 9120 }, { "epoch": 0.34114540738836835, "grad_norm": 0.35501420497894287, "learning_rate": 1.478592859299346e-05, "loss": 0.2472, "step": 9125 }, { "epoch": 0.3413323363787181, "grad_norm": 0.4222002625465393, "learning_rate": 1.4780771430330894e-05, "loss": 0.3695, "step": 9130 }, { "epoch": 0.3415192653690679, "grad_norm": 0.6013131737709045, "learning_rate": 1.477561261891725e-05, "loss": 0.2431, "step": 9135 }, { "epoch": 0.34170619435941774, "grad_norm": 0.4116293787956238, "learning_rate": 1.4770452160531652e-05, "loss": 0.2915, "step": 9140 }, { "epoch": 0.3418931233497675, "grad_norm": 0.4117601811885834, "learning_rate": 1.4765290056953796e-05, "loss": 0.2381, "step": 9145 }, { "epoch": 0.3420800523401173, "grad_norm": 0.5726933479309082, "learning_rate": 1.476012630996394e-05, "loss": 0.3691, "step": 9150 }, { "epoch": 0.34226698133046707, "grad_norm": 0.43034738302230835, "learning_rate": 1.4754960921342916e-05, "loss": 0.2324, "step": 9155 }, { "epoch": 0.3424539103208169, "grad_norm": 0.23042064905166626, "learning_rate": 1.4749793892872115e-05, "loss": 0.2359, "step": 9160 }, { "epoch": 0.3426408393111667, "grad_norm": 0.4107089340686798, "learning_rate": 1.4744625226333502e-05, "loss": 0.2775, "step": 9165 }, { "epoch": 0.34282776830151646, "grad_norm": 0.1352885216474533, "learning_rate": 1.47394549235096e-05, "loss": 0.2663, "step": 9170 }, { "epoch": 0.34301469729186623, "grad_norm": 0.34713098406791687, "learning_rate": 1.4734282986183494e-05, "loss": 0.3152, "step": 9175 }, { "epoch": 0.34320162628221607, "grad_norm": 0.3076459765434265, "learning_rate": 1.4729109416138843e-05, "loss": 0.3371, "step": 9180 }, { "epoch": 0.34338855527256584, "grad_norm": 0.2737236022949219, "learning_rate": 1.4723934215159858e-05, "loss": 0.3022, "step": 9185 }, { "epoch": 0.3435754842629156, "grad_norm": 0.3773277997970581, "learning_rate": 1.4718757385031321e-05, "loss": 0.2924, "step": 9190 }, { "epoch": 0.3437624132532654, "grad_norm": 0.39832785725593567, "learning_rate": 1.4713578927538573e-05, "loss": 0.2438, "step": 9195 }, { "epoch": 0.34394934224361523, "grad_norm": 0.19407474994659424, "learning_rate": 1.4708398844467512e-05, "loss": 0.3117, "step": 9200 }, { "epoch": 0.344136271233965, "grad_norm": 0.3654159605503082, "learning_rate": 1.4703217137604604e-05, "loss": 0.3485, "step": 9205 }, { "epoch": 0.3443232002243148, "grad_norm": 0.37599438428878784, "learning_rate": 1.4698033808736867e-05, "loss": 0.2486, "step": 9210 }, { "epoch": 0.34451012921466456, "grad_norm": 0.42682337760925293, "learning_rate": 1.4692848859651889e-05, "loss": 0.2533, "step": 9215 }, { "epoch": 0.3446970582050144, "grad_norm": 0.3610582947731018, "learning_rate": 1.4687662292137804e-05, "loss": 0.2781, "step": 9220 }, { "epoch": 0.3448839871953642, "grad_norm": 0.4574204385280609, "learning_rate": 1.4682474107983314e-05, "loss": 0.2965, "step": 9225 }, { "epoch": 0.34507091618571395, "grad_norm": 0.513286828994751, "learning_rate": 1.467728430897767e-05, "loss": 0.2859, "step": 9230 }, { "epoch": 0.3452578451760637, "grad_norm": 0.23457178473472595, "learning_rate": 1.4672092896910692e-05, "loss": 0.2623, "step": 9235 }, { "epoch": 0.34544477416641356, "grad_norm": 0.34765762090682983, "learning_rate": 1.4666899873572747e-05, "loss": 0.291, "step": 9240 }, { "epoch": 0.34563170315676334, "grad_norm": 0.3738343417644501, "learning_rate": 1.4661705240754757e-05, "loss": 0.2701, "step": 9245 }, { "epoch": 0.3458186321471131, "grad_norm": 0.3144400119781494, "learning_rate": 1.4656509000248207e-05, "loss": 0.2842, "step": 9250 }, { "epoch": 0.3460055611374629, "grad_norm": 0.5743440985679626, "learning_rate": 1.4651311153845127e-05, "loss": 0.2436, "step": 9255 }, { "epoch": 0.3461924901278127, "grad_norm": 0.2858171761035919, "learning_rate": 1.4646111703338108e-05, "loss": 0.318, "step": 9260 }, { "epoch": 0.3463794191181625, "grad_norm": 0.4376906156539917, "learning_rate": 1.464091065052029e-05, "loss": 0.2822, "step": 9265 }, { "epoch": 0.3465663481085123, "grad_norm": 0.2567811906337738, "learning_rate": 1.4635707997185367e-05, "loss": 0.2571, "step": 9270 }, { "epoch": 0.34675327709886206, "grad_norm": 0.3876672089099884, "learning_rate": 1.4630503745127587e-05, "loss": 0.3234, "step": 9275 }, { "epoch": 0.34694020608921183, "grad_norm": 0.48617517948150635, "learning_rate": 1.4625297896141741e-05, "loss": 0.2896, "step": 9280 }, { "epoch": 0.34712713507956167, "grad_norm": 0.4633631110191345, "learning_rate": 1.462009045202319e-05, "loss": 0.3156, "step": 9285 }, { "epoch": 0.34731406406991144, "grad_norm": 0.18796226382255554, "learning_rate": 1.461488141456782e-05, "loss": 0.2849, "step": 9290 }, { "epoch": 0.3475009930602612, "grad_norm": 0.8571456074714661, "learning_rate": 1.4609670785572084e-05, "loss": 0.3728, "step": 9295 }, { "epoch": 0.347687922050611, "grad_norm": 0.22898966073989868, "learning_rate": 1.4604458566832977e-05, "loss": 0.2989, "step": 9300 }, { "epoch": 0.34787485104096083, "grad_norm": 0.32521340250968933, "learning_rate": 1.4599244760148046e-05, "loss": 0.2966, "step": 9305 }, { "epoch": 0.3480617800313106, "grad_norm": 0.6073753833770752, "learning_rate": 1.459402936731538e-05, "loss": 0.3742, "step": 9310 }, { "epoch": 0.3482487090216604, "grad_norm": 0.3876205384731293, "learning_rate": 1.4588812390133624e-05, "loss": 0.249, "step": 9315 }, { "epoch": 0.34843563801201016, "grad_norm": 0.46591299772262573, "learning_rate": 1.4583593830401958e-05, "loss": 0.2723, "step": 9320 }, { "epoch": 0.34862256700236, "grad_norm": 0.22240598499774933, "learning_rate": 1.4578373689920116e-05, "loss": 0.3037, "step": 9325 }, { "epoch": 0.3488094959927098, "grad_norm": 0.5356066823005676, "learning_rate": 1.457315197048838e-05, "loss": 0.3212, "step": 9330 }, { "epoch": 0.34899642498305955, "grad_norm": 0.5799317955970764, "learning_rate": 1.4567928673907563e-05, "loss": 0.3065, "step": 9335 }, { "epoch": 0.3491833539734093, "grad_norm": 0.2084297090768814, "learning_rate": 1.4562703801979035e-05, "loss": 0.3081, "step": 9340 }, { "epoch": 0.34937028296375916, "grad_norm": 0.520650327205658, "learning_rate": 1.4557477356504702e-05, "loss": 0.3058, "step": 9345 }, { "epoch": 0.34955721195410894, "grad_norm": 0.34877529740333557, "learning_rate": 1.455224933928702e-05, "loss": 0.2876, "step": 9350 }, { "epoch": 0.3497441409444587, "grad_norm": 0.30944302678108215, "learning_rate": 1.4547019752128977e-05, "loss": 0.3491, "step": 9355 }, { "epoch": 0.3499310699348085, "grad_norm": 0.5403395295143127, "learning_rate": 1.4541788596834111e-05, "loss": 0.3018, "step": 9360 }, { "epoch": 0.3501179989251583, "grad_norm": 0.43393194675445557, "learning_rate": 1.4536555875206497e-05, "loss": 0.3163, "step": 9365 }, { "epoch": 0.3503049279155081, "grad_norm": 0.3072391450405121, "learning_rate": 1.4531321589050749e-05, "loss": 0.2298, "step": 9370 }, { "epoch": 0.3504918569058579, "grad_norm": 0.18442721664905548, "learning_rate": 1.4526085740172025e-05, "loss": 0.4021, "step": 9375 }, { "epoch": 0.35067878589620766, "grad_norm": 0.4270346462726593, "learning_rate": 1.4520848330376019e-05, "loss": 0.2749, "step": 9380 }, { "epoch": 0.3508657148865575, "grad_norm": 0.4557771384716034, "learning_rate": 1.4515609361468959e-05, "loss": 0.2412, "step": 9385 }, { "epoch": 0.35105264387690727, "grad_norm": 0.7857339978218079, "learning_rate": 1.4510368835257622e-05, "loss": 0.2772, "step": 9390 }, { "epoch": 0.35123957286725704, "grad_norm": 0.30803146958351135, "learning_rate": 1.450512675354931e-05, "loss": 0.3163, "step": 9395 }, { "epoch": 0.3514265018576068, "grad_norm": 0.4320001006126404, "learning_rate": 1.449988311815187e-05, "loss": 0.2661, "step": 9400 }, { "epoch": 0.35161343084795665, "grad_norm": 0.13867487013339996, "learning_rate": 1.4494637930873683e-05, "loss": 0.2416, "step": 9405 }, { "epoch": 0.35180035983830643, "grad_norm": 0.27354222536087036, "learning_rate": 1.4489391193523658e-05, "loss": 0.2661, "step": 9410 }, { "epoch": 0.3519872888286562, "grad_norm": 0.3791649043560028, "learning_rate": 1.4484142907911247e-05, "loss": 0.2581, "step": 9415 }, { "epoch": 0.352174217819006, "grad_norm": 0.38127240538597107, "learning_rate": 1.4478893075846436e-05, "loss": 0.2605, "step": 9420 }, { "epoch": 0.3523611468093558, "grad_norm": 0.23698148131370544, "learning_rate": 1.447364169913974e-05, "loss": 0.3725, "step": 9425 }, { "epoch": 0.3525480757997056, "grad_norm": 0.3475520610809326, "learning_rate": 1.4468388779602207e-05, "loss": 0.2851, "step": 9430 }, { "epoch": 0.3527350047900554, "grad_norm": 0.45811226963996887, "learning_rate": 1.446313431904542e-05, "loss": 0.3581, "step": 9435 }, { "epoch": 0.35292193378040515, "grad_norm": 0.6400048136711121, "learning_rate": 1.4457878319281491e-05, "loss": 0.2837, "step": 9440 }, { "epoch": 0.353108862770755, "grad_norm": 0.4961070716381073, "learning_rate": 1.4452620782123063e-05, "loss": 0.3202, "step": 9445 }, { "epoch": 0.35329579176110476, "grad_norm": 0.33612340688705444, "learning_rate": 1.4447361709383312e-05, "loss": 0.2923, "step": 9450 }, { "epoch": 0.35348272075145454, "grad_norm": 0.4333072006702423, "learning_rate": 1.4442101102875942e-05, "loss": 0.2749, "step": 9455 }, { "epoch": 0.3536696497418043, "grad_norm": 0.3720749616622925, "learning_rate": 1.443683896441518e-05, "loss": 0.333, "step": 9460 }, { "epoch": 0.35385657873215415, "grad_norm": 0.638292670249939, "learning_rate": 1.4431575295815793e-05, "loss": 0.2825, "step": 9465 }, { "epoch": 0.3540435077225039, "grad_norm": 0.27355220913887024, "learning_rate": 1.4426310098893069e-05, "loss": 0.334, "step": 9470 }, { "epoch": 0.3542304367128537, "grad_norm": 0.3119034171104431, "learning_rate": 1.4421043375462821e-05, "loss": 0.3194, "step": 9475 }, { "epoch": 0.3544173657032035, "grad_norm": 0.3145153820514679, "learning_rate": 1.441577512734139e-05, "loss": 0.2671, "step": 9480 }, { "epoch": 0.3546042946935533, "grad_norm": 0.25988125801086426, "learning_rate": 1.4410505356345645e-05, "loss": 0.2174, "step": 9485 }, { "epoch": 0.3547912236839031, "grad_norm": 0.4911235570907593, "learning_rate": 1.440523406429298e-05, "loss": 0.4387, "step": 9490 }, { "epoch": 0.35497815267425287, "grad_norm": 0.21677200496196747, "learning_rate": 1.4399961253001316e-05, "loss": 0.2656, "step": 9495 }, { "epoch": 0.35516508166460264, "grad_norm": 0.3045201897621155, "learning_rate": 1.4394686924289087e-05, "loss": 0.2927, "step": 9500 }, { "epoch": 0.3553520106549525, "grad_norm": 0.38031861186027527, "learning_rate": 1.4389411079975262e-05, "loss": 0.333, "step": 9505 }, { "epoch": 0.35553893964530225, "grad_norm": 0.31916332244873047, "learning_rate": 1.4384133721879327e-05, "loss": 0.2274, "step": 9510 }, { "epoch": 0.35572586863565203, "grad_norm": 0.4681374430656433, "learning_rate": 1.4378854851821294e-05, "loss": 0.2061, "step": 9515 }, { "epoch": 0.3559127976260018, "grad_norm": 0.43480637669563293, "learning_rate": 1.4373574471621694e-05, "loss": 0.3167, "step": 9520 }, { "epoch": 0.3560997266163516, "grad_norm": 0.2742566764354706, "learning_rate": 1.4368292583101578e-05, "loss": 0.3607, "step": 9525 }, { "epoch": 0.3562866556067014, "grad_norm": 0.2994193732738495, "learning_rate": 1.4363009188082513e-05, "loss": 0.34, "step": 9530 }, { "epoch": 0.3564735845970512, "grad_norm": 0.38814255595207214, "learning_rate": 1.43577242883866e-05, "loss": 0.2461, "step": 9535 }, { "epoch": 0.356660513587401, "grad_norm": 0.29775434732437134, "learning_rate": 1.4352437885836441e-05, "loss": 0.2581, "step": 9540 }, { "epoch": 0.35684744257775075, "grad_norm": 0.36987629532814026, "learning_rate": 1.4347149982255168e-05, "loss": 0.3376, "step": 9545 }, { "epoch": 0.3570343715681006, "grad_norm": 0.2949317693710327, "learning_rate": 1.4341860579466428e-05, "loss": 0.3628, "step": 9550 }, { "epoch": 0.35722130055845036, "grad_norm": 0.3889232873916626, "learning_rate": 1.4336569679294385e-05, "loss": 0.2386, "step": 9555 }, { "epoch": 0.35740822954880014, "grad_norm": 0.498597115278244, "learning_rate": 1.433127728356372e-05, "loss": 0.2282, "step": 9560 }, { "epoch": 0.3575951585391499, "grad_norm": 0.332461416721344, "learning_rate": 1.4325983394099626e-05, "loss": 0.3273, "step": 9565 }, { "epoch": 0.35778208752949975, "grad_norm": 0.5877690315246582, "learning_rate": 1.4320688012727814e-05, "loss": 0.2397, "step": 9570 }, { "epoch": 0.3579690165198495, "grad_norm": 0.1286364197731018, "learning_rate": 1.4315391141274508e-05, "loss": 0.2346, "step": 9575 }, { "epoch": 0.3581559455101993, "grad_norm": 0.3990229368209839, "learning_rate": 1.4310092781566452e-05, "loss": 0.3775, "step": 9580 }, { "epoch": 0.3583428745005491, "grad_norm": 0.4049658179283142, "learning_rate": 1.4304792935430893e-05, "loss": 0.2323, "step": 9585 }, { "epoch": 0.3585298034908989, "grad_norm": 0.27659088373184204, "learning_rate": 1.42994916046956e-05, "loss": 0.2626, "step": 9590 }, { "epoch": 0.3587167324812487, "grad_norm": 0.38999220728874207, "learning_rate": 1.4294188791188846e-05, "loss": 0.3191, "step": 9595 }, { "epoch": 0.35890366147159847, "grad_norm": 0.5159905552864075, "learning_rate": 1.4288884496739424e-05, "loss": 0.2849, "step": 9600 }, { "epoch": 0.35909059046194824, "grad_norm": 0.3492240905761719, "learning_rate": 1.4283578723176632e-05, "loss": 0.3292, "step": 9605 }, { "epoch": 0.3592775194522981, "grad_norm": 0.3294449746608734, "learning_rate": 1.4278271472330277e-05, "loss": 0.3421, "step": 9610 }, { "epoch": 0.35946444844264785, "grad_norm": 0.543605625629425, "learning_rate": 1.4272962746030678e-05, "loss": 0.3525, "step": 9615 }, { "epoch": 0.35965137743299763, "grad_norm": 0.5134566426277161, "learning_rate": 1.4267652546108668e-05, "loss": 0.2269, "step": 9620 }, { "epoch": 0.3598383064233474, "grad_norm": 0.30253830552101135, "learning_rate": 1.4262340874395574e-05, "loss": 0.2389, "step": 9625 }, { "epoch": 0.36002523541369724, "grad_norm": 0.30939239263534546, "learning_rate": 1.4257027732723247e-05, "loss": 0.3656, "step": 9630 }, { "epoch": 0.360212164404047, "grad_norm": 0.3319133222103119, "learning_rate": 1.4251713122924034e-05, "loss": 0.2553, "step": 9635 }, { "epoch": 0.3603990933943968, "grad_norm": 0.37184301018714905, "learning_rate": 1.424639704683079e-05, "loss": 0.305, "step": 9640 }, { "epoch": 0.3605860223847466, "grad_norm": 0.4830484688282013, "learning_rate": 1.424107950627688e-05, "loss": 0.2698, "step": 9645 }, { "epoch": 0.3607729513750964, "grad_norm": 0.5102275013923645, "learning_rate": 1.423576050309617e-05, "loss": 0.2744, "step": 9650 }, { "epoch": 0.3609598803654462, "grad_norm": 0.2823433578014374, "learning_rate": 1.4230440039123032e-05, "loss": 0.2687, "step": 9655 }, { "epoch": 0.36114680935579596, "grad_norm": 0.36009663343429565, "learning_rate": 1.4225118116192344e-05, "loss": 0.3377, "step": 9660 }, { "epoch": 0.36133373834614574, "grad_norm": 0.5339310765266418, "learning_rate": 1.4219794736139484e-05, "loss": 0.2668, "step": 9665 }, { "epoch": 0.36152066733649557, "grad_norm": 0.565954864025116, "learning_rate": 1.421446990080033e-05, "loss": 0.3104, "step": 9670 }, { "epoch": 0.36170759632684535, "grad_norm": 0.5683127641677856, "learning_rate": 1.4209143612011268e-05, "loss": 0.2727, "step": 9675 }, { "epoch": 0.3618945253171951, "grad_norm": 0.4476057291030884, "learning_rate": 1.4203815871609177e-05, "loss": 0.2485, "step": 9680 }, { "epoch": 0.3620814543075449, "grad_norm": 0.3725626468658447, "learning_rate": 1.419848668143145e-05, "loss": 0.288, "step": 9685 }, { "epoch": 0.36226838329789474, "grad_norm": 0.5492215752601624, "learning_rate": 1.4193156043315967e-05, "loss": 0.3209, "step": 9690 }, { "epoch": 0.3624553122882445, "grad_norm": 0.3779137134552002, "learning_rate": 1.4187823959101116e-05, "loss": 0.2553, "step": 9695 }, { "epoch": 0.3626422412785943, "grad_norm": 0.296917587518692, "learning_rate": 1.4182490430625775e-05, "loss": 0.3266, "step": 9700 }, { "epoch": 0.36282917026894407, "grad_norm": 0.2786087691783905, "learning_rate": 1.417715545972933e-05, "loss": 0.2164, "step": 9705 }, { "epoch": 0.3630160992592939, "grad_norm": 0.44238272309303284, "learning_rate": 1.4171819048251658e-05, "loss": 0.2699, "step": 9710 }, { "epoch": 0.3632030282496437, "grad_norm": 0.5811543464660645, "learning_rate": 1.4166481198033136e-05, "loss": 0.2559, "step": 9715 }, { "epoch": 0.36338995723999346, "grad_norm": 0.26523107290267944, "learning_rate": 1.4161141910914632e-05, "loss": 0.2773, "step": 9720 }, { "epoch": 0.36357688623034323, "grad_norm": 0.5079830288887024, "learning_rate": 1.415580118873751e-05, "loss": 0.2821, "step": 9725 }, { "epoch": 0.36376381522069307, "grad_norm": 0.44632482528686523, "learning_rate": 1.4150459033343639e-05, "loss": 0.3243, "step": 9730 }, { "epoch": 0.36395074421104284, "grad_norm": 0.3747384250164032, "learning_rate": 1.4145115446575377e-05, "loss": 0.3253, "step": 9735 }, { "epoch": 0.3641376732013926, "grad_norm": 0.423495888710022, "learning_rate": 1.4139770430275567e-05, "loss": 0.259, "step": 9740 }, { "epoch": 0.3643246021917424, "grad_norm": 0.365408718585968, "learning_rate": 1.4134423986287555e-05, "loss": 0.3372, "step": 9745 }, { "epoch": 0.36451153118209223, "grad_norm": 0.31306222081184387, "learning_rate": 1.4129076116455176e-05, "loss": 0.2578, "step": 9750 }, { "epoch": 0.364698460172442, "grad_norm": 0.4542996287345886, "learning_rate": 1.4123726822622758e-05, "loss": 0.3076, "step": 9755 }, { "epoch": 0.3648853891627918, "grad_norm": 0.2852497398853302, "learning_rate": 1.411837610663512e-05, "loss": 0.2407, "step": 9760 }, { "epoch": 0.36507231815314156, "grad_norm": 0.4517068564891815, "learning_rate": 1.411302397033757e-05, "loss": 0.2958, "step": 9765 }, { "epoch": 0.36525924714349134, "grad_norm": 0.24500373005867004, "learning_rate": 1.4107670415575902e-05, "loss": 0.3155, "step": 9770 }, { "epoch": 0.36544617613384117, "grad_norm": 0.35521599650382996, "learning_rate": 1.410231544419641e-05, "loss": 0.2573, "step": 9775 }, { "epoch": 0.36563310512419095, "grad_norm": 0.5142762660980225, "learning_rate": 1.4096959058045867e-05, "loss": 0.3081, "step": 9780 }, { "epoch": 0.3658200341145407, "grad_norm": 0.46503591537475586, "learning_rate": 1.4091601258971537e-05, "loss": 0.2943, "step": 9785 }, { "epoch": 0.3660069631048905, "grad_norm": 0.35828927159309387, "learning_rate": 1.4086242048821172e-05, "loss": 0.2945, "step": 9790 }, { "epoch": 0.36619389209524034, "grad_norm": 0.43609440326690674, "learning_rate": 1.408088142944301e-05, "loss": 0.3362, "step": 9795 }, { "epoch": 0.3663808210855901, "grad_norm": 0.7864788770675659, "learning_rate": 1.4075519402685775e-05, "loss": 0.2748, "step": 9800 }, { "epoch": 0.3665677500759399, "grad_norm": 0.2505030930042267, "learning_rate": 1.4070155970398676e-05, "loss": 0.3193, "step": 9805 }, { "epoch": 0.36675467906628967, "grad_norm": 0.2732783257961273, "learning_rate": 1.406479113443141e-05, "loss": 0.4463, "step": 9810 }, { "epoch": 0.3669416080566395, "grad_norm": 0.4646552801132202, "learning_rate": 1.405942489663415e-05, "loss": 0.2432, "step": 9815 }, { "epoch": 0.3671285370469893, "grad_norm": 0.23349085450172424, "learning_rate": 1.405405725885756e-05, "loss": 0.27, "step": 9820 }, { "epoch": 0.36731546603733906, "grad_norm": 0.318560391664505, "learning_rate": 1.4048688222952787e-05, "loss": 0.2645, "step": 9825 }, { "epoch": 0.36750239502768883, "grad_norm": 0.3994881510734558, "learning_rate": 1.4043317790771455e-05, "loss": 0.2549, "step": 9830 }, { "epoch": 0.36768932401803867, "grad_norm": 0.4488486647605896, "learning_rate": 1.4037945964165674e-05, "loss": 0.2432, "step": 9835 }, { "epoch": 0.36787625300838844, "grad_norm": 0.47077760100364685, "learning_rate": 1.4032572744988029e-05, "loss": 0.2743, "step": 9840 }, { "epoch": 0.3680631819987382, "grad_norm": 0.3115878403186798, "learning_rate": 1.4027198135091594e-05, "loss": 0.3284, "step": 9845 }, { "epoch": 0.368250110989088, "grad_norm": 0.802329421043396, "learning_rate": 1.4021822136329914e-05, "loss": 0.3841, "step": 9850 }, { "epoch": 0.36843703997943783, "grad_norm": 0.22909800708293915, "learning_rate": 1.4016444750557022e-05, "loss": 0.3512, "step": 9855 }, { "epoch": 0.3686239689697876, "grad_norm": 0.4044862687587738, "learning_rate": 1.4011065979627418e-05, "loss": 0.2226, "step": 9860 }, { "epoch": 0.3688108979601374, "grad_norm": 0.7386347651481628, "learning_rate": 1.4005685825396091e-05, "loss": 0.2814, "step": 9865 }, { "epoch": 0.36899782695048716, "grad_norm": 0.49540725350379944, "learning_rate": 1.4000304289718498e-05, "loss": 0.3408, "step": 9870 }, { "epoch": 0.369184755940837, "grad_norm": 0.33938613533973694, "learning_rate": 1.3994921374450584e-05, "loss": 0.3386, "step": 9875 }, { "epoch": 0.3693716849311868, "grad_norm": 0.6717550158500671, "learning_rate": 1.3989537081448752e-05, "loss": 0.351, "step": 9880 }, { "epoch": 0.36955861392153655, "grad_norm": 0.3765832483768463, "learning_rate": 1.3984151412569898e-05, "loss": 0.2666, "step": 9885 }, { "epoch": 0.3697455429118863, "grad_norm": 0.38686302304267883, "learning_rate": 1.3978764369671378e-05, "loss": 0.2667, "step": 9890 }, { "epoch": 0.36993247190223616, "grad_norm": 0.4677604138851166, "learning_rate": 1.3973375954611037e-05, "loss": 0.263, "step": 9895 }, { "epoch": 0.37011940089258594, "grad_norm": 0.19471901655197144, "learning_rate": 1.396798616924718e-05, "loss": 0.4968, "step": 9900 }, { "epoch": 0.3703063298829357, "grad_norm": 0.37783077359199524, "learning_rate": 1.3962595015438592e-05, "loss": 0.3632, "step": 9905 }, { "epoch": 0.3704932588732855, "grad_norm": 0.26753556728363037, "learning_rate": 1.3957202495044525e-05, "loss": 0.2798, "step": 9910 }, { "epoch": 0.3706801878636353, "grad_norm": 0.3343743085861206, "learning_rate": 1.3951808609924703e-05, "loss": 0.2431, "step": 9915 }, { "epoch": 0.3708671168539851, "grad_norm": 0.2634985148906708, "learning_rate": 1.3946413361939324e-05, "loss": 0.2888, "step": 9920 }, { "epoch": 0.3710540458443349, "grad_norm": 0.8201472163200378, "learning_rate": 1.3941016752949059e-05, "loss": 0.3404, "step": 9925 }, { "epoch": 0.37124097483468466, "grad_norm": 0.5062202215194702, "learning_rate": 1.3935618784815042e-05, "loss": 0.2689, "step": 9930 }, { "epoch": 0.3714279038250345, "grad_norm": 0.5348239541053772, "learning_rate": 1.3930219459398872e-05, "loss": 0.2577, "step": 9935 }, { "epoch": 0.37161483281538427, "grad_norm": 0.3352089822292328, "learning_rate": 1.3924818778562627e-05, "loss": 0.3134, "step": 9940 }, { "epoch": 0.37180176180573404, "grad_norm": 0.4589434266090393, "learning_rate": 1.3919416744168846e-05, "loss": 0.3105, "step": 9945 }, { "epoch": 0.3719886907960838, "grad_norm": 0.5832353234291077, "learning_rate": 1.3914013358080536e-05, "loss": 0.2674, "step": 9950 }, { "epoch": 0.37217561978643365, "grad_norm": 0.46412187814712524, "learning_rate": 1.3908608622161168e-05, "loss": 0.2989, "step": 9955 }, { "epoch": 0.37236254877678343, "grad_norm": 0.22348807752132416, "learning_rate": 1.390320253827468e-05, "loss": 0.306, "step": 9960 }, { "epoch": 0.3725494777671332, "grad_norm": 0.3156212270259857, "learning_rate": 1.389779510828548e-05, "loss": 0.2838, "step": 9965 }, { "epoch": 0.372736406757483, "grad_norm": 0.32380181550979614, "learning_rate": 1.3892386334058433e-05, "loss": 0.2335, "step": 9970 }, { "epoch": 0.3729233357478328, "grad_norm": 0.39215198159217834, "learning_rate": 1.3886976217458872e-05, "loss": 0.2667, "step": 9975 }, { "epoch": 0.3731102647381826, "grad_norm": 0.6366074085235596, "learning_rate": 1.3881564760352589e-05, "loss": 0.2668, "step": 9980 }, { "epoch": 0.3732971937285324, "grad_norm": 0.4511488676071167, "learning_rate": 1.387615196460584e-05, "loss": 0.2636, "step": 9985 }, { "epoch": 0.37348412271888215, "grad_norm": 0.2915279269218445, "learning_rate": 1.3870737832085344e-05, "loss": 0.2942, "step": 9990 }, { "epoch": 0.373671051709232, "grad_norm": 0.2640349864959717, "learning_rate": 1.386532236465828e-05, "loss": 0.2608, "step": 9995 }, { "epoch": 0.37385798069958176, "grad_norm": 0.5220035910606384, "learning_rate": 1.3859905564192291e-05, "loss": 0.3061, "step": 10000 }, { "epoch": 0.37404490968993154, "grad_norm": 0.26642879843711853, "learning_rate": 1.3854487432555467e-05, "loss": 0.2558, "step": 10005 }, { "epoch": 0.3742318386802813, "grad_norm": 0.5303171277046204, "learning_rate": 1.3849067971616377e-05, "loss": 0.264, "step": 10010 }, { "epoch": 0.3744187676706311, "grad_norm": 0.412943035364151, "learning_rate": 1.384364718324403e-05, "loss": 0.363, "step": 10015 }, { "epoch": 0.3746056966609809, "grad_norm": 0.7517222166061401, "learning_rate": 1.3838225069307911e-05, "loss": 0.3029, "step": 10020 }, { "epoch": 0.3747926256513307, "grad_norm": 0.34779641032218933, "learning_rate": 1.383280163167794e-05, "loss": 0.2702, "step": 10025 }, { "epoch": 0.3749795546416805, "grad_norm": 0.5778641104698181, "learning_rate": 1.382737687222451e-05, "loss": 0.2748, "step": 10030 }, { "epoch": 0.37516648363203026, "grad_norm": 0.43767881393432617, "learning_rate": 1.3821950792818464e-05, "loss": 0.3038, "step": 10035 }, { "epoch": 0.3753534126223801, "grad_norm": 0.43027085065841675, "learning_rate": 1.3816523395331104e-05, "loss": 0.2593, "step": 10040 }, { "epoch": 0.37554034161272987, "grad_norm": 0.40595659613609314, "learning_rate": 1.3811094681634183e-05, "loss": 0.3294, "step": 10045 }, { "epoch": 0.37572727060307964, "grad_norm": 0.21250957250595093, "learning_rate": 1.3805664653599905e-05, "loss": 0.3234, "step": 10050 }, { "epoch": 0.3759141995934294, "grad_norm": 0.3901183605194092, "learning_rate": 1.3800233313100935e-05, "loss": 0.3291, "step": 10055 }, { "epoch": 0.37610112858377925, "grad_norm": 0.40727564692497253, "learning_rate": 1.3794800662010386e-05, "loss": 0.2554, "step": 10060 }, { "epoch": 0.37628805757412903, "grad_norm": 0.6711639165878296, "learning_rate": 1.3789366702201824e-05, "loss": 0.2909, "step": 10065 }, { "epoch": 0.3764749865644788, "grad_norm": 0.3564465045928955, "learning_rate": 1.3783931435549266e-05, "loss": 0.2833, "step": 10070 }, { "epoch": 0.3766619155548286, "grad_norm": 0.3789138197898865, "learning_rate": 1.377849486392718e-05, "loss": 0.2947, "step": 10075 }, { "epoch": 0.3768488445451784, "grad_norm": 0.19771960377693176, "learning_rate": 1.377305698921048e-05, "loss": 0.2381, "step": 10080 }, { "epoch": 0.3770357735355282, "grad_norm": 0.12256388366222382, "learning_rate": 1.3767617813274537e-05, "loss": 0.2924, "step": 10085 }, { "epoch": 0.377222702525878, "grad_norm": 0.656947672367096, "learning_rate": 1.376217733799517e-05, "loss": 0.326, "step": 10090 }, { "epoch": 0.37740963151622775, "grad_norm": 0.2615041136741638, "learning_rate": 1.3756735565248634e-05, "loss": 0.2551, "step": 10095 }, { "epoch": 0.3775965605065776, "grad_norm": 0.4703410267829895, "learning_rate": 1.3751292496911646e-05, "loss": 0.273, "step": 10100 }, { "epoch": 0.37778348949692736, "grad_norm": 0.3060661256313324, "learning_rate": 1.3745848134861367e-05, "loss": 0.381, "step": 10105 }, { "epoch": 0.37797041848727714, "grad_norm": 0.30581894516944885, "learning_rate": 1.3740402480975394e-05, "loss": 0.2714, "step": 10110 }, { "epoch": 0.3781573474776269, "grad_norm": 0.2676241397857666, "learning_rate": 1.3734955537131786e-05, "loss": 0.3834, "step": 10115 }, { "epoch": 0.37834427646797675, "grad_norm": 0.28147178888320923, "learning_rate": 1.3729507305209033e-05, "loss": 0.2662, "step": 10120 }, { "epoch": 0.3785312054583265, "grad_norm": 0.34182241559028625, "learning_rate": 1.3724057787086073e-05, "loss": 0.2477, "step": 10125 }, { "epoch": 0.3787181344486763, "grad_norm": 0.30519115924835205, "learning_rate": 1.3718606984642292e-05, "loss": 0.2354, "step": 10130 }, { "epoch": 0.3789050634390261, "grad_norm": 0.2952529788017273, "learning_rate": 1.3713154899757508e-05, "loss": 0.3169, "step": 10135 }, { "epoch": 0.3790919924293759, "grad_norm": 0.4137369692325592, "learning_rate": 1.3707701534311999e-05, "loss": 0.3089, "step": 10140 }, { "epoch": 0.3792789214197257, "grad_norm": 0.2354464828968048, "learning_rate": 1.3702246890186469e-05, "loss": 0.318, "step": 10145 }, { "epoch": 0.37946585041007547, "grad_norm": 0.15742668509483337, "learning_rate": 1.3696790969262068e-05, "loss": 0.2672, "step": 10150 }, { "epoch": 0.37965277940042524, "grad_norm": 0.21879169344902039, "learning_rate": 1.369133377342039e-05, "loss": 0.2698, "step": 10155 }, { "epoch": 0.3798397083907751, "grad_norm": 0.8999713063240051, "learning_rate": 1.3685875304543463e-05, "loss": 0.4024, "step": 10160 }, { "epoch": 0.38002663738112485, "grad_norm": 0.2633039355278015, "learning_rate": 1.3680415564513759e-05, "loss": 0.347, "step": 10165 }, { "epoch": 0.38021356637147463, "grad_norm": 0.343563437461853, "learning_rate": 1.3674954555214182e-05, "loss": 0.3596, "step": 10170 }, { "epoch": 0.3804004953618244, "grad_norm": 0.5179651975631714, "learning_rate": 1.3669492278528079e-05, "loss": 0.2785, "step": 10175 }, { "epoch": 0.38058742435217424, "grad_norm": 0.24854378402233124, "learning_rate": 1.3664028736339234e-05, "loss": 0.2958, "step": 10180 }, { "epoch": 0.380774353342524, "grad_norm": 0.39850738644599915, "learning_rate": 1.3658563930531865e-05, "loss": 0.313, "step": 10185 }, { "epoch": 0.3809612823328738, "grad_norm": 0.3405385911464691, "learning_rate": 1.365309786299063e-05, "loss": 0.2671, "step": 10190 }, { "epoch": 0.3811482113232236, "grad_norm": 0.3381601572036743, "learning_rate": 1.3647630535600613e-05, "loss": 0.2986, "step": 10195 }, { "epoch": 0.3813351403135734, "grad_norm": 0.3975812792778015, "learning_rate": 1.3642161950247345e-05, "loss": 0.3056, "step": 10200 }, { "epoch": 0.3815220693039232, "grad_norm": 0.5125154852867126, "learning_rate": 1.363669210881678e-05, "loss": 0.2916, "step": 10205 }, { "epoch": 0.38170899829427296, "grad_norm": 0.29686829447746277, "learning_rate": 1.3631221013195313e-05, "loss": 0.3361, "step": 10210 }, { "epoch": 0.38189592728462274, "grad_norm": 0.596336841583252, "learning_rate": 1.3625748665269765e-05, "loss": 0.2738, "step": 10215 }, { "epoch": 0.38208285627497257, "grad_norm": 0.24768860638141632, "learning_rate": 1.3620275066927392e-05, "loss": 0.3018, "step": 10220 }, { "epoch": 0.38226978526532235, "grad_norm": 0.30476251244544983, "learning_rate": 1.3614800220055884e-05, "loss": 0.2602, "step": 10225 }, { "epoch": 0.3824567142556721, "grad_norm": 0.46634262800216675, "learning_rate": 1.3609324126543357e-05, "loss": 0.2068, "step": 10230 }, { "epoch": 0.3826436432460219, "grad_norm": 0.264059454202652, "learning_rate": 1.3603846788278357e-05, "loss": 0.3386, "step": 10235 }, { "epoch": 0.38283057223637174, "grad_norm": 0.37096327543258667, "learning_rate": 1.3598368207149865e-05, "loss": 0.3036, "step": 10240 }, { "epoch": 0.3830175012267215, "grad_norm": 0.35662344098091125, "learning_rate": 1.3592888385047285e-05, "loss": 0.3593, "step": 10245 }, { "epoch": 0.3832044302170713, "grad_norm": 0.4874442517757416, "learning_rate": 1.358740732386045e-05, "loss": 0.2687, "step": 10250 }, { "epoch": 0.38339135920742107, "grad_norm": 0.3303247392177582, "learning_rate": 1.3581925025479618e-05, "loss": 0.3349, "step": 10255 }, { "epoch": 0.38357828819777084, "grad_norm": 0.31140390038490295, "learning_rate": 1.3576441491795484e-05, "loss": 0.2713, "step": 10260 }, { "epoch": 0.3837652171881207, "grad_norm": 0.2794208824634552, "learning_rate": 1.3570956724699156e-05, "loss": 0.2632, "step": 10265 }, { "epoch": 0.38395214617847045, "grad_norm": 0.5220376253128052, "learning_rate": 1.3565470726082176e-05, "loss": 0.3356, "step": 10270 }, { "epoch": 0.38413907516882023, "grad_norm": 0.31608107686042786, "learning_rate": 1.3559983497836504e-05, "loss": 0.3079, "step": 10275 }, { "epoch": 0.38432600415917, "grad_norm": 0.44200223684310913, "learning_rate": 1.3554495041854532e-05, "loss": 0.2978, "step": 10280 }, { "epoch": 0.38451293314951984, "grad_norm": 0.2726639211177826, "learning_rate": 1.3549005360029066e-05, "loss": 0.2671, "step": 10285 }, { "epoch": 0.3846998621398696, "grad_norm": 0.410034716129303, "learning_rate": 1.3543514454253346e-05, "loss": 0.2938, "step": 10290 }, { "epoch": 0.3848867911302194, "grad_norm": 0.33769235014915466, "learning_rate": 1.3538022326421023e-05, "loss": 0.3121, "step": 10295 }, { "epoch": 0.3850737201205692, "grad_norm": 0.4343665540218353, "learning_rate": 1.3532528978426178e-05, "loss": 0.2907, "step": 10300 }, { "epoch": 0.385260649110919, "grad_norm": 0.45513853430747986, "learning_rate": 1.352703441216331e-05, "loss": 0.2766, "step": 10305 }, { "epoch": 0.3854475781012688, "grad_norm": 0.49546775221824646, "learning_rate": 1.3521538629527336e-05, "loss": 0.3089, "step": 10310 }, { "epoch": 0.38563450709161856, "grad_norm": 0.6482167840003967, "learning_rate": 1.351604163241359e-05, "loss": 0.2859, "step": 10315 }, { "epoch": 0.38582143608196834, "grad_norm": 0.6202002763748169, "learning_rate": 1.3510543422717835e-05, "loss": 0.3149, "step": 10320 }, { "epoch": 0.38600836507231817, "grad_norm": 0.7666960954666138, "learning_rate": 1.3505044002336241e-05, "loss": 0.2812, "step": 10325 }, { "epoch": 0.38619529406266795, "grad_norm": 0.42803874611854553, "learning_rate": 1.3499543373165402e-05, "loss": 0.3083, "step": 10330 }, { "epoch": 0.3863822230530177, "grad_norm": 0.5874767899513245, "learning_rate": 1.3494041537102332e-05, "loss": 0.3362, "step": 10335 }, { "epoch": 0.3865691520433675, "grad_norm": 0.5874844789505005, "learning_rate": 1.348853849604445e-05, "loss": 0.3628, "step": 10340 }, { "epoch": 0.38675608103371734, "grad_norm": 0.454783171415329, "learning_rate": 1.34830342518896e-05, "loss": 0.2621, "step": 10345 }, { "epoch": 0.3869430100240671, "grad_norm": 0.3771190345287323, "learning_rate": 1.347752880653604e-05, "loss": 0.2833, "step": 10350 }, { "epoch": 0.3871299390144169, "grad_norm": 0.30891087651252747, "learning_rate": 1.3472022161882439e-05, "loss": 0.2938, "step": 10355 }, { "epoch": 0.38731686800476667, "grad_norm": 0.5641288757324219, "learning_rate": 1.346651431982788e-05, "loss": 0.3008, "step": 10360 }, { "epoch": 0.3875037969951165, "grad_norm": 0.19588641822338104, "learning_rate": 1.3461005282271857e-05, "loss": 0.2773, "step": 10365 }, { "epoch": 0.3876907259854663, "grad_norm": 0.6388927102088928, "learning_rate": 1.3455495051114283e-05, "loss": 0.2966, "step": 10370 }, { "epoch": 0.38787765497581606, "grad_norm": 0.4509037733078003, "learning_rate": 1.344998362825548e-05, "loss": 0.3202, "step": 10375 }, { "epoch": 0.38806458396616583, "grad_norm": 0.3036980628967285, "learning_rate": 1.344447101559618e-05, "loss": 0.2795, "step": 10380 }, { "epoch": 0.38825151295651567, "grad_norm": 0.34260863065719604, "learning_rate": 1.343895721503752e-05, "loss": 0.2886, "step": 10385 }, { "epoch": 0.38843844194686544, "grad_norm": 0.2766103744506836, "learning_rate": 1.3433442228481055e-05, "loss": 0.3462, "step": 10390 }, { "epoch": 0.3886253709372152, "grad_norm": 0.39682698249816895, "learning_rate": 1.3427926057828749e-05, "loss": 0.294, "step": 10395 }, { "epoch": 0.388812299927565, "grad_norm": 0.3901638388633728, "learning_rate": 1.3422408704982968e-05, "loss": 0.2803, "step": 10400 }, { "epoch": 0.38899922891791483, "grad_norm": 0.39301908016204834, "learning_rate": 1.3416890171846486e-05, "loss": 0.3174, "step": 10405 }, { "epoch": 0.3891861579082646, "grad_norm": 0.39801543951034546, "learning_rate": 1.3411370460322493e-05, "loss": 0.3569, "step": 10410 }, { "epoch": 0.3893730868986144, "grad_norm": 0.3659820854663849, "learning_rate": 1.3405849572314574e-05, "loss": 0.3338, "step": 10415 }, { "epoch": 0.38956001588896416, "grad_norm": 0.28667154908180237, "learning_rate": 1.3400327509726726e-05, "loss": 0.2569, "step": 10420 }, { "epoch": 0.389746944879314, "grad_norm": 0.4689178466796875, "learning_rate": 1.3394804274463358e-05, "loss": 0.279, "step": 10425 }, { "epoch": 0.38993387386966377, "grad_norm": 0.4040241539478302, "learning_rate": 1.3389279868429264e-05, "loss": 0.2931, "step": 10430 }, { "epoch": 0.39012080286001355, "grad_norm": 0.33305883407592773, "learning_rate": 1.3383754293529659e-05, "loss": 0.3057, "step": 10435 }, { "epoch": 0.3903077318503633, "grad_norm": 0.37228256464004517, "learning_rate": 1.3378227551670155e-05, "loss": 0.315, "step": 10440 }, { "epoch": 0.39049466084071316, "grad_norm": 0.4526333510875702, "learning_rate": 1.3372699644756769e-05, "loss": 0.294, "step": 10445 }, { "epoch": 0.39068158983106294, "grad_norm": 0.4754495620727539, "learning_rate": 1.3367170574695916e-05, "loss": 0.2909, "step": 10450 }, { "epoch": 0.3908685188214127, "grad_norm": 0.4171309769153595, "learning_rate": 1.3361640343394411e-05, "loss": 0.2908, "step": 10455 }, { "epoch": 0.3910554478117625, "grad_norm": 0.3807438313961029, "learning_rate": 1.3356108952759472e-05, "loss": 0.3104, "step": 10460 }, { "epoch": 0.3912423768021123, "grad_norm": 0.20601804554462433, "learning_rate": 1.3350576404698725e-05, "loss": 0.299, "step": 10465 }, { "epoch": 0.3914293057924621, "grad_norm": 0.4232652187347412, "learning_rate": 1.334504270112018e-05, "loss": 0.3072, "step": 10470 }, { "epoch": 0.3916162347828119, "grad_norm": 0.40767017006874084, "learning_rate": 1.3339507843932259e-05, "loss": 0.273, "step": 10475 }, { "epoch": 0.39180316377316166, "grad_norm": 0.49084317684173584, "learning_rate": 1.3333971835043767e-05, "loss": 0.2653, "step": 10480 }, { "epoch": 0.3919900927635115, "grad_norm": 0.5394593477249146, "learning_rate": 1.332843467636392e-05, "loss": 0.3149, "step": 10485 }, { "epoch": 0.39217702175386127, "grad_norm": 0.2563055157661438, "learning_rate": 1.3322896369802325e-05, "loss": 0.3443, "step": 10490 }, { "epoch": 0.39236395074421104, "grad_norm": 0.2230866700410843, "learning_rate": 1.3317356917268987e-05, "loss": 0.2512, "step": 10495 }, { "epoch": 0.3925508797345608, "grad_norm": 0.5398097038269043, "learning_rate": 1.33118163206743e-05, "loss": 0.2493, "step": 10500 }, { "epoch": 0.3927378087249106, "grad_norm": 0.36351341009140015, "learning_rate": 1.3306274581929059e-05, "loss": 0.3472, "step": 10505 }, { "epoch": 0.39292473771526043, "grad_norm": 0.34674593806266785, "learning_rate": 1.3300731702944453e-05, "loss": 0.3117, "step": 10510 }, { "epoch": 0.3931116667056102, "grad_norm": 0.2768007516860962, "learning_rate": 1.329518768563206e-05, "loss": 0.2991, "step": 10515 }, { "epoch": 0.39329859569596, "grad_norm": 0.7572014331817627, "learning_rate": 1.3289642531903857e-05, "loss": 0.3921, "step": 10520 }, { "epoch": 0.39348552468630976, "grad_norm": 0.34211087226867676, "learning_rate": 1.32840962436722e-05, "loss": 0.3197, "step": 10525 }, { "epoch": 0.3936724536766596, "grad_norm": 0.5510493516921997, "learning_rate": 1.3278548822849853e-05, "loss": 0.2646, "step": 10530 }, { "epoch": 0.3938593826670094, "grad_norm": 0.39801833033561707, "learning_rate": 1.3273000271349959e-05, "loss": 0.2889, "step": 10535 }, { "epoch": 0.39404631165735915, "grad_norm": 0.3157884180545807, "learning_rate": 1.3267450591086051e-05, "loss": 0.3597, "step": 10540 }, { "epoch": 0.3942332406477089, "grad_norm": 0.5300944447517395, "learning_rate": 1.3261899783972062e-05, "loss": 0.3158, "step": 10545 }, { "epoch": 0.39442016963805876, "grad_norm": 0.48455068469047546, "learning_rate": 1.32563478519223e-05, "loss": 0.2964, "step": 10550 }, { "epoch": 0.39460709862840854, "grad_norm": 0.33385178446769714, "learning_rate": 1.3250794796851474e-05, "loss": 0.3481, "step": 10555 }, { "epoch": 0.3947940276187583, "grad_norm": 0.35784122347831726, "learning_rate": 1.3245240620674667e-05, "loss": 0.3654, "step": 10560 }, { "epoch": 0.3949809566091081, "grad_norm": 0.19756436347961426, "learning_rate": 1.3239685325307359e-05, "loss": 0.3123, "step": 10565 }, { "epoch": 0.3951678855994579, "grad_norm": 0.33110281825065613, "learning_rate": 1.323412891266541e-05, "loss": 0.291, "step": 10570 }, { "epoch": 0.3953548145898077, "grad_norm": 0.2022867351770401, "learning_rate": 1.322857138466507e-05, "loss": 0.3221, "step": 10575 }, { "epoch": 0.3955417435801575, "grad_norm": 0.49643459916114807, "learning_rate": 1.3223012743222967e-05, "loss": 0.2867, "step": 10580 }, { "epoch": 0.39572867257050726, "grad_norm": 0.43748632073402405, "learning_rate": 1.3217452990256122e-05, "loss": 0.334, "step": 10585 }, { "epoch": 0.3959156015608571, "grad_norm": 0.6567883491516113, "learning_rate": 1.3211892127681934e-05, "loss": 0.3096, "step": 10590 }, { "epoch": 0.39610253055120687, "grad_norm": 0.5701555609703064, "learning_rate": 1.320633015741818e-05, "loss": 0.249, "step": 10595 }, { "epoch": 0.39628945954155664, "grad_norm": 0.33283892273902893, "learning_rate": 1.3200767081383028e-05, "loss": 0.3294, "step": 10600 }, { "epoch": 0.3964763885319064, "grad_norm": 0.3843041956424713, "learning_rate": 1.3195202901495024e-05, "loss": 0.2428, "step": 10605 }, { "epoch": 0.39666331752225625, "grad_norm": 0.2713102698326111, "learning_rate": 1.3189637619673095e-05, "loss": 0.2753, "step": 10610 }, { "epoch": 0.39685024651260603, "grad_norm": 0.28656595945358276, "learning_rate": 1.3184071237836544e-05, "loss": 0.2965, "step": 10615 }, { "epoch": 0.3970371755029558, "grad_norm": 0.30529990792274475, "learning_rate": 1.3178503757905058e-05, "loss": 0.3222, "step": 10620 }, { "epoch": 0.3972241044933056, "grad_norm": 0.3871190845966339, "learning_rate": 1.3172935181798703e-05, "loss": 0.362, "step": 10625 }, { "epoch": 0.3974110334836554, "grad_norm": 0.5080790519714355, "learning_rate": 1.3167365511437919e-05, "loss": 0.238, "step": 10630 }, { "epoch": 0.3975979624740052, "grad_norm": 0.3145684003829956, "learning_rate": 1.3161794748743525e-05, "loss": 0.2148, "step": 10635 }, { "epoch": 0.397784891464355, "grad_norm": 0.7068213820457458, "learning_rate": 1.315622289563672e-05, "loss": 0.2557, "step": 10640 }, { "epoch": 0.39797182045470475, "grad_norm": 0.3006476163864136, "learning_rate": 1.3150649954039078e-05, "loss": 0.3824, "step": 10645 }, { "epoch": 0.3981587494450546, "grad_norm": 0.269481360912323, "learning_rate": 1.3145075925872543e-05, "loss": 0.3222, "step": 10650 }, { "epoch": 0.39834567843540436, "grad_norm": 0.42372992634773254, "learning_rate": 1.3139500813059438e-05, "loss": 0.2562, "step": 10655 }, { "epoch": 0.39853260742575414, "grad_norm": 0.4050086736679077, "learning_rate": 1.3133924617522462e-05, "loss": 0.2818, "step": 10660 }, { "epoch": 0.3987195364161039, "grad_norm": 0.34156128764152527, "learning_rate": 1.3128347341184684e-05, "loss": 0.3041, "step": 10665 }, { "epoch": 0.39890646540645375, "grad_norm": 0.25688794255256653, "learning_rate": 1.3122768985969546e-05, "loss": 0.3239, "step": 10670 }, { "epoch": 0.3990933943968035, "grad_norm": 0.4633738398551941, "learning_rate": 1.3117189553800861e-05, "loss": 0.267, "step": 10675 }, { "epoch": 0.3992803233871533, "grad_norm": 0.41393613815307617, "learning_rate": 1.311160904660282e-05, "loss": 0.2762, "step": 10680 }, { "epoch": 0.3994672523775031, "grad_norm": 0.41371089220046997, "learning_rate": 1.3106027466299977e-05, "loss": 0.2862, "step": 10685 }, { "epoch": 0.3996541813678529, "grad_norm": 0.4804210662841797, "learning_rate": 1.310044481481726e-05, "loss": 0.2986, "step": 10690 }, { "epoch": 0.3998411103582027, "grad_norm": 0.5706376433372498, "learning_rate": 1.3094861094079965e-05, "loss": 0.3143, "step": 10695 }, { "epoch": 0.40002803934855247, "grad_norm": 0.6555120944976807, "learning_rate": 1.3089276306013759e-05, "loss": 0.2877, "step": 10700 }, { "epoch": 0.40021496833890224, "grad_norm": 0.40146076679229736, "learning_rate": 1.3083690452544673e-05, "loss": 0.2458, "step": 10705 }, { "epoch": 0.4004018973292521, "grad_norm": 0.24645639955997467, "learning_rate": 1.307810353559911e-05, "loss": 0.3734, "step": 10710 }, { "epoch": 0.40058882631960185, "grad_norm": 0.34604454040527344, "learning_rate": 1.3072515557103835e-05, "loss": 0.3489, "step": 10715 }, { "epoch": 0.40077575530995163, "grad_norm": 0.3194914758205414, "learning_rate": 1.3066926518985984e-05, "loss": 0.2924, "step": 10720 }, { "epoch": 0.4009626843003014, "grad_norm": 0.38813045620918274, "learning_rate": 1.3061336423173053e-05, "loss": 0.3895, "step": 10725 }, { "epoch": 0.40114961329065124, "grad_norm": 0.4124408960342407, "learning_rate": 1.305574527159291e-05, "loss": 0.2377, "step": 10730 }, { "epoch": 0.401336542281001, "grad_norm": 0.37710294127464294, "learning_rate": 1.305015306617378e-05, "loss": 0.2393, "step": 10735 }, { "epoch": 0.4015234712713508, "grad_norm": 0.34080588817596436, "learning_rate": 1.3044559808844257e-05, "loss": 0.3022, "step": 10740 }, { "epoch": 0.4017104002617006, "grad_norm": 0.4511391520500183, "learning_rate": 1.3038965501533291e-05, "loss": 0.2905, "step": 10745 }, { "epoch": 0.40189732925205035, "grad_norm": 0.5546760559082031, "learning_rate": 1.3033370146170201e-05, "loss": 0.3291, "step": 10750 }, { "epoch": 0.4020842582424002, "grad_norm": 0.3765566647052765, "learning_rate": 1.3027773744684669e-05, "loss": 0.2396, "step": 10755 }, { "epoch": 0.40227118723274996, "grad_norm": 0.32864052057266235, "learning_rate": 1.3022176299006726e-05, "loss": 0.3394, "step": 10760 }, { "epoch": 0.40245811622309974, "grad_norm": 0.3548523187637329, "learning_rate": 1.3016577811066775e-05, "loss": 0.3865, "step": 10765 }, { "epoch": 0.4026450452134495, "grad_norm": 0.8239120841026306, "learning_rate": 1.3010978282795569e-05, "loss": 0.2682, "step": 10770 }, { "epoch": 0.40283197420379935, "grad_norm": 0.3894842565059662, "learning_rate": 1.3005377716124232e-05, "loss": 0.335, "step": 10775 }, { "epoch": 0.4030189031941491, "grad_norm": 0.5358176231384277, "learning_rate": 1.2999776112984237e-05, "loss": 0.2949, "step": 10780 }, { "epoch": 0.4032058321844989, "grad_norm": 0.5368191003799438, "learning_rate": 1.2994173475307412e-05, "loss": 0.2752, "step": 10785 }, { "epoch": 0.4033927611748487, "grad_norm": 0.30042341351509094, "learning_rate": 1.298856980502595e-05, "loss": 0.2578, "step": 10790 }, { "epoch": 0.4035796901651985, "grad_norm": 0.3132897913455963, "learning_rate": 1.2982965104072397e-05, "loss": 0.3735, "step": 10795 }, { "epoch": 0.4037666191555483, "grad_norm": 2.302870988845825, "learning_rate": 1.2977359374379652e-05, "loss": 0.3051, "step": 10800 }, { "epoch": 0.40395354814589807, "grad_norm": 0.31478455662727356, "learning_rate": 1.2971752617880972e-05, "loss": 0.2558, "step": 10805 }, { "epoch": 0.40414047713624784, "grad_norm": 0.5957106947898865, "learning_rate": 1.2966144836509964e-05, "loss": 0.3231, "step": 10810 }, { "epoch": 0.4043274061265977, "grad_norm": 0.49217236042022705, "learning_rate": 1.2960536032200592e-05, "loss": 0.2385, "step": 10815 }, { "epoch": 0.40451433511694745, "grad_norm": 0.39011693000793457, "learning_rate": 1.2954926206887174e-05, "loss": 0.3561, "step": 10820 }, { "epoch": 0.40470126410729723, "grad_norm": 0.4122646152973175, "learning_rate": 1.2949315362504376e-05, "loss": 0.3262, "step": 10825 }, { "epoch": 0.404888193097647, "grad_norm": 0.3655274510383606, "learning_rate": 1.2943703500987218e-05, "loss": 0.2638, "step": 10830 }, { "epoch": 0.40507512208799684, "grad_norm": 0.39280015230178833, "learning_rate": 1.293809062427107e-05, "loss": 0.3355, "step": 10835 }, { "epoch": 0.4052620510783466, "grad_norm": 0.5871797800064087, "learning_rate": 1.2932476734291652e-05, "loss": 0.3217, "step": 10840 }, { "epoch": 0.4054489800686964, "grad_norm": 0.22152677178382874, "learning_rate": 1.2926861832985036e-05, "loss": 0.2709, "step": 10845 }, { "epoch": 0.4056359090590462, "grad_norm": 0.4744000732898712, "learning_rate": 1.292124592228764e-05, "loss": 0.3501, "step": 10850 }, { "epoch": 0.405822838049396, "grad_norm": 0.34133681654930115, "learning_rate": 1.2915629004136228e-05, "loss": 0.2422, "step": 10855 }, { "epoch": 0.4060097670397458, "grad_norm": 0.5644049048423767, "learning_rate": 1.2910011080467917e-05, "loss": 0.2745, "step": 10860 }, { "epoch": 0.40619669603009556, "grad_norm": 0.48984065651893616, "learning_rate": 1.2904392153220164e-05, "loss": 0.3229, "step": 10865 }, { "epoch": 0.40638362502044534, "grad_norm": 0.30956414341926575, "learning_rate": 1.2898772224330778e-05, "loss": 0.3053, "step": 10870 }, { "epoch": 0.40657055401079517, "grad_norm": 0.24445974826812744, "learning_rate": 1.2893151295737916e-05, "loss": 0.2011, "step": 10875 }, { "epoch": 0.40675748300114495, "grad_norm": 1.567341923713684, "learning_rate": 1.288752936938007e-05, "loss": 0.3163, "step": 10880 }, { "epoch": 0.4069444119914947, "grad_norm": 0.28451815247535706, "learning_rate": 1.2881906447196082e-05, "loss": 0.2405, "step": 10885 }, { "epoch": 0.4071313409818445, "grad_norm": 0.252210408449173, "learning_rate": 1.287628253112514e-05, "loss": 0.245, "step": 10890 }, { "epoch": 0.40731826997219434, "grad_norm": 0.4983895421028137, "learning_rate": 1.2870657623106766e-05, "loss": 0.3025, "step": 10895 }, { "epoch": 0.4075051989625441, "grad_norm": 0.6142060160636902, "learning_rate": 1.2865031725080834e-05, "loss": 0.3184, "step": 10900 }, { "epoch": 0.4076921279528939, "grad_norm": 0.34046661853790283, "learning_rate": 1.2859404838987552e-05, "loss": 0.3181, "step": 10905 }, { "epoch": 0.40787905694324367, "grad_norm": 0.7890579700469971, "learning_rate": 1.2853776966767475e-05, "loss": 0.3095, "step": 10910 }, { "epoch": 0.4080659859335935, "grad_norm": 0.4729807376861572, "learning_rate": 1.284814811036149e-05, "loss": 0.3228, "step": 10915 }, { "epoch": 0.4082529149239433, "grad_norm": 0.5021012425422668, "learning_rate": 1.2842518271710836e-05, "loss": 0.2234, "step": 10920 }, { "epoch": 0.40843984391429305, "grad_norm": 0.36319997906684875, "learning_rate": 1.2836887452757076e-05, "loss": 0.2692, "step": 10925 }, { "epoch": 0.40862677290464283, "grad_norm": 0.44691482186317444, "learning_rate": 1.2831255655442122e-05, "loss": 0.3243, "step": 10930 }, { "epoch": 0.40881370189499266, "grad_norm": 0.2757837772369385, "learning_rate": 1.2825622881708218e-05, "loss": 0.3383, "step": 10935 }, { "epoch": 0.40900063088534244, "grad_norm": 0.3655282258987427, "learning_rate": 1.2819989133497945e-05, "loss": 0.216, "step": 10940 }, { "epoch": 0.4091875598756922, "grad_norm": 0.2834169864654541, "learning_rate": 1.2814354412754227e-05, "loss": 0.2807, "step": 10945 }, { "epoch": 0.409374488866042, "grad_norm": 0.49131330847740173, "learning_rate": 1.2808718721420308e-05, "loss": 0.2887, "step": 10950 }, { "epoch": 0.40956141785639183, "grad_norm": 0.2866436541080475, "learning_rate": 1.2803082061439784e-05, "loss": 0.3064, "step": 10955 }, { "epoch": 0.4097483468467416, "grad_norm": 0.5651163458824158, "learning_rate": 1.2797444434756571e-05, "loss": 0.2649, "step": 10960 }, { "epoch": 0.4099352758370914, "grad_norm": 0.3305072486400604, "learning_rate": 1.2791805843314937e-05, "loss": 0.3414, "step": 10965 }, { "epoch": 0.41012220482744116, "grad_norm": 0.4713725745677948, "learning_rate": 1.2786166289059461e-05, "loss": 0.2735, "step": 10970 }, { "epoch": 0.410309133817791, "grad_norm": 0.45371806621551514, "learning_rate": 1.2780525773935063e-05, "loss": 0.231, "step": 10975 }, { "epoch": 0.41049606280814077, "grad_norm": 0.3463742136955261, "learning_rate": 1.2774884299887e-05, "loss": 0.338, "step": 10980 }, { "epoch": 0.41068299179849055, "grad_norm": 0.30682888627052307, "learning_rate": 1.2769241868860851e-05, "loss": 0.2203, "step": 10985 }, { "epoch": 0.4108699207888403, "grad_norm": 0.5777801275253296, "learning_rate": 1.2763598482802531e-05, "loss": 0.2488, "step": 10990 }, { "epoch": 0.4110568497791901, "grad_norm": 0.3914337754249573, "learning_rate": 1.2757954143658285e-05, "loss": 0.3193, "step": 10995 }, { "epoch": 0.41124377876953994, "grad_norm": 0.37003767490386963, "learning_rate": 1.2752308853374675e-05, "loss": 0.3423, "step": 11000 }, { "epoch": 0.4114307077598897, "grad_norm": 0.6970301866531372, "learning_rate": 1.2746662613898605e-05, "loss": 0.3753, "step": 11005 }, { "epoch": 0.4116176367502395, "grad_norm": 0.5969115495681763, "learning_rate": 1.2741015427177304e-05, "loss": 0.3479, "step": 11010 }, { "epoch": 0.41180456574058927, "grad_norm": 0.4280428886413574, "learning_rate": 1.2735367295158324e-05, "loss": 0.2633, "step": 11015 }, { "epoch": 0.4119914947309391, "grad_norm": 0.47678300738334656, "learning_rate": 1.2729718219789538e-05, "loss": 0.2922, "step": 11020 }, { "epoch": 0.4121784237212889, "grad_norm": 0.4477633833885193, "learning_rate": 1.2724068203019155e-05, "loss": 0.3189, "step": 11025 }, { "epoch": 0.41236535271163866, "grad_norm": 0.330090194940567, "learning_rate": 1.2718417246795702e-05, "loss": 0.2159, "step": 11030 }, { "epoch": 0.41255228170198843, "grad_norm": 0.33988553285598755, "learning_rate": 1.2712765353068036e-05, "loss": 0.2479, "step": 11035 }, { "epoch": 0.41273921069233827, "grad_norm": 0.22661490738391876, "learning_rate": 1.2707112523785327e-05, "loss": 0.3321, "step": 11040 }, { "epoch": 0.41292613968268804, "grad_norm": 0.29933539032936096, "learning_rate": 1.2701458760897083e-05, "loss": 0.2927, "step": 11045 }, { "epoch": 0.4131130686730378, "grad_norm": 0.39779335260391235, "learning_rate": 1.2695804066353113e-05, "loss": 0.3024, "step": 11050 }, { "epoch": 0.4132999976633876, "grad_norm": 0.3741395175457001, "learning_rate": 1.2690148442103567e-05, "loss": 0.3496, "step": 11055 }, { "epoch": 0.41348692665373743, "grad_norm": 0.33514922857284546, "learning_rate": 1.2684491890098907e-05, "loss": 0.2939, "step": 11060 }, { "epoch": 0.4136738556440872, "grad_norm": 0.2723214030265808, "learning_rate": 1.2678834412289915e-05, "loss": 0.2748, "step": 11065 }, { "epoch": 0.413860784634437, "grad_norm": 0.2613365054130554, "learning_rate": 1.267317601062769e-05, "loss": 0.268, "step": 11070 }, { "epoch": 0.41404771362478676, "grad_norm": 0.3753436207771301, "learning_rate": 1.2667516687063657e-05, "loss": 0.2862, "step": 11075 }, { "epoch": 0.4142346426151366, "grad_norm": 0.38865718245506287, "learning_rate": 1.2661856443549551e-05, "loss": 0.3599, "step": 11080 }, { "epoch": 0.41442157160548637, "grad_norm": 0.3524521291255951, "learning_rate": 1.2656195282037432e-05, "loss": 0.2869, "step": 11085 }, { "epoch": 0.41460850059583615, "grad_norm": 0.28500306606292725, "learning_rate": 1.265053320447967e-05, "loss": 0.2769, "step": 11090 }, { "epoch": 0.4147954295861859, "grad_norm": 0.514741837978363, "learning_rate": 1.2644870212828949e-05, "loss": 0.234, "step": 11095 }, { "epoch": 0.41498235857653576, "grad_norm": 0.33767464756965637, "learning_rate": 1.2639206309038279e-05, "loss": 0.3528, "step": 11100 }, { "epoch": 0.41516928756688554, "grad_norm": 0.4422234296798706, "learning_rate": 1.2633541495060975e-05, "loss": 0.2436, "step": 11105 }, { "epoch": 0.4153562165572353, "grad_norm": 0.41787955164909363, "learning_rate": 1.2627875772850672e-05, "loss": 0.3395, "step": 11110 }, { "epoch": 0.4155431455475851, "grad_norm": 1.9292072057724, "learning_rate": 1.2622209144361313e-05, "loss": 0.2723, "step": 11115 }, { "epoch": 0.4157300745379349, "grad_norm": 0.3203314244747162, "learning_rate": 1.2616541611547155e-05, "loss": 0.2928, "step": 11120 }, { "epoch": 0.4159170035282847, "grad_norm": 0.49962037801742554, "learning_rate": 1.2610873176362767e-05, "loss": 0.3809, "step": 11125 }, { "epoch": 0.4161039325186345, "grad_norm": 0.4445996880531311, "learning_rate": 1.2605203840763034e-05, "loss": 0.3369, "step": 11130 }, { "epoch": 0.41629086150898426, "grad_norm": 0.43643853068351746, "learning_rate": 1.2599533606703144e-05, "loss": 0.4172, "step": 11135 }, { "epoch": 0.4164777904993341, "grad_norm": 0.7465499639511108, "learning_rate": 1.2593862476138598e-05, "loss": 0.4366, "step": 11140 }, { "epoch": 0.41666471948968387, "grad_norm": 0.34575432538986206, "learning_rate": 1.2588190451025209e-05, "loss": 0.2783, "step": 11145 }, { "epoch": 0.41685164848003364, "grad_norm": 0.33487460017204285, "learning_rate": 1.2582517533319094e-05, "loss": 0.3308, "step": 11150 }, { "epoch": 0.4170385774703834, "grad_norm": 0.33010798692703247, "learning_rate": 1.257684372497668e-05, "loss": 0.3056, "step": 11155 }, { "epoch": 0.41722550646073325, "grad_norm": 0.49482452869415283, "learning_rate": 1.2571169027954702e-05, "loss": 0.2821, "step": 11160 }, { "epoch": 0.41741243545108303, "grad_norm": 0.23514960706233978, "learning_rate": 1.25654934442102e-05, "loss": 0.3404, "step": 11165 }, { "epoch": 0.4175993644414328, "grad_norm": 0.7681819200515747, "learning_rate": 1.2559816975700518e-05, "loss": 0.3222, "step": 11170 }, { "epoch": 0.4177862934317826, "grad_norm": 0.32423636317253113, "learning_rate": 1.2554139624383307e-05, "loss": 0.263, "step": 11175 }, { "epoch": 0.4179732224221324, "grad_norm": 0.26687395572662354, "learning_rate": 1.2548461392216531e-05, "loss": 0.2467, "step": 11180 }, { "epoch": 0.4181601514124822, "grad_norm": 0.3909884989261627, "learning_rate": 1.2542782281158438e-05, "loss": 0.2949, "step": 11185 }, { "epoch": 0.418347080402832, "grad_norm": 0.27226680517196655, "learning_rate": 1.2537102293167598e-05, "loss": 0.2839, "step": 11190 }, { "epoch": 0.41853400939318175, "grad_norm": 0.3636428415775299, "learning_rate": 1.2531421430202875e-05, "loss": 0.2814, "step": 11195 }, { "epoch": 0.4187209383835316, "grad_norm": 0.43354684114456177, "learning_rate": 1.2525739694223436e-05, "loss": 0.3003, "step": 11200 }, { "epoch": 0.41890786737388136, "grad_norm": 0.3936502933502197, "learning_rate": 1.2520057087188748e-05, "loss": 0.2926, "step": 11205 }, { "epoch": 0.41909479636423114, "grad_norm": 0.37949803471565247, "learning_rate": 1.2514373611058578e-05, "loss": 0.3242, "step": 11210 }, { "epoch": 0.4192817253545809, "grad_norm": 0.2999209463596344, "learning_rate": 1.2508689267792994e-05, "loss": 0.2679, "step": 11215 }, { "epoch": 0.41946865434493075, "grad_norm": 0.37106993794441223, "learning_rate": 1.2503004059352369e-05, "loss": 0.2887, "step": 11220 }, { "epoch": 0.4196555833352805, "grad_norm": 0.286905437707901, "learning_rate": 1.2497317987697359e-05, "loss": 0.2298, "step": 11225 }, { "epoch": 0.4198425123256303, "grad_norm": 0.5081347823143005, "learning_rate": 1.2491631054788936e-05, "loss": 0.305, "step": 11230 }, { "epoch": 0.4200294413159801, "grad_norm": 0.43394070863723755, "learning_rate": 1.2485943262588353e-05, "loss": 0.2757, "step": 11235 }, { "epoch": 0.42021637030632986, "grad_norm": 0.6610644459724426, "learning_rate": 1.2480254613057172e-05, "loss": 0.3212, "step": 11240 }, { "epoch": 0.4204032992966797, "grad_norm": 0.6914903521537781, "learning_rate": 1.2474565108157244e-05, "loss": 0.3204, "step": 11245 }, { "epoch": 0.42059022828702947, "grad_norm": 0.31729546189308167, "learning_rate": 1.2468874749850715e-05, "loss": 0.2934, "step": 11250 }, { "epoch": 0.42077715727737924, "grad_norm": 0.40817737579345703, "learning_rate": 1.2463183540100028e-05, "loss": 0.2576, "step": 11255 }, { "epoch": 0.420964086267729, "grad_norm": 0.48512113094329834, "learning_rate": 1.2457491480867917e-05, "loss": 0.3062, "step": 11260 }, { "epoch": 0.42115101525807885, "grad_norm": 0.32723644375801086, "learning_rate": 1.2451798574117406e-05, "loss": 0.2556, "step": 11265 }, { "epoch": 0.42133794424842863, "grad_norm": 0.3997679054737091, "learning_rate": 1.2446104821811825e-05, "loss": 0.2993, "step": 11270 }, { "epoch": 0.4215248732387784, "grad_norm": 0.24848319590091705, "learning_rate": 1.2440410225914779e-05, "loss": 0.32, "step": 11275 }, { "epoch": 0.4217118022291282, "grad_norm": 0.33824488520622253, "learning_rate": 1.243471478839017e-05, "loss": 0.3054, "step": 11280 }, { "epoch": 0.421898731219478, "grad_norm": 0.3888397216796875, "learning_rate": 1.2429018511202195e-05, "loss": 0.2567, "step": 11285 }, { "epoch": 0.4220856602098278, "grad_norm": 0.25221407413482666, "learning_rate": 1.2423321396315338e-05, "loss": 0.2519, "step": 11290 }, { "epoch": 0.4222725892001776, "grad_norm": 0.4825991094112396, "learning_rate": 1.2417623445694367e-05, "loss": 0.2718, "step": 11295 }, { "epoch": 0.42245951819052735, "grad_norm": 0.4877346456050873, "learning_rate": 1.2411924661304346e-05, "loss": 0.2421, "step": 11300 }, { "epoch": 0.4226464471808772, "grad_norm": 0.734813392162323, "learning_rate": 1.2406225045110617e-05, "loss": 0.2855, "step": 11305 }, { "epoch": 0.42283337617122696, "grad_norm": 0.4634631276130676, "learning_rate": 1.2400524599078816e-05, "loss": 0.3167, "step": 11310 }, { "epoch": 0.42302030516157674, "grad_norm": 0.2482909858226776, "learning_rate": 1.2394823325174866e-05, "loss": 0.333, "step": 11315 }, { "epoch": 0.4232072341519265, "grad_norm": 0.31496381759643555, "learning_rate": 1.2389121225364968e-05, "loss": 0.3233, "step": 11320 }, { "epoch": 0.42339416314227635, "grad_norm": 0.46915552020072937, "learning_rate": 1.2383418301615622e-05, "loss": 0.33, "step": 11325 }, { "epoch": 0.4235810921326261, "grad_norm": 0.3510735332965851, "learning_rate": 1.2377714555893595e-05, "loss": 0.2903, "step": 11330 }, { "epoch": 0.4237680211229759, "grad_norm": 0.7571256756782532, "learning_rate": 1.2372009990165948e-05, "loss": 0.3083, "step": 11335 }, { "epoch": 0.4239549501133257, "grad_norm": 0.37375253438949585, "learning_rate": 1.2366304606400021e-05, "loss": 0.2759, "step": 11340 }, { "epoch": 0.4241418791036755, "grad_norm": 0.4373493790626526, "learning_rate": 1.236059840656344e-05, "loss": 0.2503, "step": 11345 }, { "epoch": 0.4243288080940253, "grad_norm": 0.284297376871109, "learning_rate": 1.235489139262411e-05, "loss": 0.2491, "step": 11350 }, { "epoch": 0.42451573708437507, "grad_norm": 0.24776756763458252, "learning_rate": 1.2349183566550212e-05, "loss": 0.2494, "step": 11355 }, { "epoch": 0.42470266607472484, "grad_norm": 1.0645568370819092, "learning_rate": 1.2343474930310213e-05, "loss": 0.2053, "step": 11360 }, { "epoch": 0.4248895950650747, "grad_norm": 0.325014591217041, "learning_rate": 1.2337765485872859e-05, "loss": 0.2916, "step": 11365 }, { "epoch": 0.42507652405542445, "grad_norm": 0.9632283449172974, "learning_rate": 1.2332055235207179e-05, "loss": 0.3179, "step": 11370 }, { "epoch": 0.42526345304577423, "grad_norm": 1.18338143825531, "learning_rate": 1.2326344180282468e-05, "loss": 0.3688, "step": 11375 }, { "epoch": 0.425450382036124, "grad_norm": 0.4803732633590698, "learning_rate": 1.2320632323068306e-05, "loss": 0.3574, "step": 11380 }, { "epoch": 0.42563731102647384, "grad_norm": 0.38758915662765503, "learning_rate": 1.2314919665534552e-05, "loss": 0.3133, "step": 11385 }, { "epoch": 0.4258242400168236, "grad_norm": 0.4360613524913788, "learning_rate": 1.2309206209651336e-05, "loss": 0.2571, "step": 11390 }, { "epoch": 0.4260111690071734, "grad_norm": 0.39218610525131226, "learning_rate": 1.2303491957389069e-05, "loss": 0.2504, "step": 11395 }, { "epoch": 0.4261980979975232, "grad_norm": 0.4993368685245514, "learning_rate": 1.2297776910718424e-05, "loss": 0.3063, "step": 11400 }, { "epoch": 0.426385026987873, "grad_norm": 0.5808417201042175, "learning_rate": 1.2292061071610365e-05, "loss": 0.2726, "step": 11405 }, { "epoch": 0.4265719559782228, "grad_norm": 0.31993383169174194, "learning_rate": 1.2286344442036114e-05, "loss": 0.386, "step": 11410 }, { "epoch": 0.42675888496857256, "grad_norm": 0.17474965751171112, "learning_rate": 1.228062702396718e-05, "loss": 0.2783, "step": 11415 }, { "epoch": 0.42694581395892234, "grad_norm": 0.5042828917503357, "learning_rate": 1.2274908819375335e-05, "loss": 0.2297, "step": 11420 }, { "epoch": 0.42713274294927217, "grad_norm": 0.4278635084629059, "learning_rate": 1.2269189830232622e-05, "loss": 0.2978, "step": 11425 }, { "epoch": 0.42731967193962195, "grad_norm": 0.5413123369216919, "learning_rate": 1.2263470058511355e-05, "loss": 0.3018, "step": 11430 }, { "epoch": 0.4275066009299717, "grad_norm": 0.3124532401561737, "learning_rate": 1.2257749506184125e-05, "loss": 0.3438, "step": 11435 }, { "epoch": 0.4276935299203215, "grad_norm": 0.3359132707118988, "learning_rate": 1.2252028175223778e-05, "loss": 0.2612, "step": 11440 }, { "epoch": 0.42788045891067134, "grad_norm": 0.5123247504234314, "learning_rate": 1.224630606760345e-05, "loss": 0.288, "step": 11445 }, { "epoch": 0.4280673879010211, "grad_norm": 0.48125380277633667, "learning_rate": 1.2240583185296517e-05, "loss": 0.3077, "step": 11450 }, { "epoch": 0.4282543168913709, "grad_norm": 0.24318033456802368, "learning_rate": 1.2234859530276647e-05, "loss": 0.2642, "step": 11455 }, { "epoch": 0.42844124588172067, "grad_norm": 0.3051791489124298, "learning_rate": 1.2229135104517757e-05, "loss": 0.3298, "step": 11460 }, { "epoch": 0.4286281748720705, "grad_norm": 0.5335453152656555, "learning_rate": 1.2223409909994048e-05, "loss": 0.3253, "step": 11465 }, { "epoch": 0.4288151038624203, "grad_norm": 0.7249236702919006, "learning_rate": 1.221768394867997e-05, "loss": 0.3483, "step": 11470 }, { "epoch": 0.42900203285277005, "grad_norm": 0.16120702028274536, "learning_rate": 1.221195722255024e-05, "loss": 0.2528, "step": 11475 }, { "epoch": 0.42918896184311983, "grad_norm": 0.6941366791725159, "learning_rate": 1.2206229733579846e-05, "loss": 0.2684, "step": 11480 }, { "epoch": 0.4293758908334696, "grad_norm": 0.4987410306930542, "learning_rate": 1.2200501483744032e-05, "loss": 0.3327, "step": 11485 }, { "epoch": 0.42956281982381944, "grad_norm": 0.20793752372264862, "learning_rate": 1.2194772475018309e-05, "loss": 0.2491, "step": 11490 }, { "epoch": 0.4297497488141692, "grad_norm": 0.4314498007297516, "learning_rate": 1.218904270937845e-05, "loss": 0.2784, "step": 11495 }, { "epoch": 0.429936677804519, "grad_norm": 0.5785399079322815, "learning_rate": 1.2183312188800483e-05, "loss": 0.2836, "step": 11500 }, { "epoch": 0.4301236067948688, "grad_norm": 0.4460437297821045, "learning_rate": 1.2177580915260698e-05, "loss": 0.3361, "step": 11505 }, { "epoch": 0.4303105357852186, "grad_norm": 0.34010326862335205, "learning_rate": 1.2171848890735655e-05, "loss": 0.2738, "step": 11510 }, { "epoch": 0.4304974647755684, "grad_norm": 0.2805013060569763, "learning_rate": 1.2166116117202162e-05, "loss": 0.2981, "step": 11515 }, { "epoch": 0.43068439376591816, "grad_norm": 0.3375371992588043, "learning_rate": 1.2160382596637286e-05, "loss": 0.2759, "step": 11520 }, { "epoch": 0.43087132275626794, "grad_norm": 0.5983409881591797, "learning_rate": 1.2154648331018355e-05, "loss": 0.2989, "step": 11525 }, { "epoch": 0.43105825174661777, "grad_norm": 0.4123179614543915, "learning_rate": 1.2148913322322952e-05, "loss": 0.3054, "step": 11530 }, { "epoch": 0.43124518073696755, "grad_norm": 0.6426059603691101, "learning_rate": 1.214317757252892e-05, "loss": 0.3205, "step": 11535 }, { "epoch": 0.4314321097273173, "grad_norm": 0.35368436574935913, "learning_rate": 1.2137441083614351e-05, "loss": 0.297, "step": 11540 }, { "epoch": 0.4316190387176671, "grad_norm": 0.3455871343612671, "learning_rate": 1.21317038575576e-05, "loss": 0.3098, "step": 11545 }, { "epoch": 0.43180596770801694, "grad_norm": 0.34755730628967285, "learning_rate": 1.2125965896337266e-05, "loss": 0.3382, "step": 11550 }, { "epoch": 0.4319928966983667, "grad_norm": 0.5450564622879028, "learning_rate": 1.2120227201932213e-05, "loss": 0.2472, "step": 11555 }, { "epoch": 0.4321798256887165, "grad_norm": 0.28686073422431946, "learning_rate": 1.2114487776321553e-05, "loss": 0.2876, "step": 11560 }, { "epoch": 0.43236675467906627, "grad_norm": 0.33250150084495544, "learning_rate": 1.2108747621484645e-05, "loss": 0.3267, "step": 11565 }, { "epoch": 0.4325536836694161, "grad_norm": 0.5938117504119873, "learning_rate": 1.2103006739401105e-05, "loss": 0.319, "step": 11570 }, { "epoch": 0.4327406126597659, "grad_norm": 0.32513856887817383, "learning_rate": 1.20972651320508e-05, "loss": 0.2741, "step": 11575 }, { "epoch": 0.43292754165011565, "grad_norm": 0.39806506037712097, "learning_rate": 1.2091522801413844e-05, "loss": 0.3497, "step": 11580 }, { "epoch": 0.43311447064046543, "grad_norm": 0.41519683599472046, "learning_rate": 1.2085779749470603e-05, "loss": 0.3165, "step": 11585 }, { "epoch": 0.43330139963081526, "grad_norm": 0.5625133514404297, "learning_rate": 1.2080035978201695e-05, "loss": 0.2961, "step": 11590 }, { "epoch": 0.43348832862116504, "grad_norm": 0.3750125765800476, "learning_rate": 1.2074291489587972e-05, "loss": 0.3051, "step": 11595 }, { "epoch": 0.4336752576115148, "grad_norm": 0.43589699268341064, "learning_rate": 1.2068546285610556e-05, "loss": 0.3055, "step": 11600 }, { "epoch": 0.4338621866018646, "grad_norm": 0.29387718439102173, "learning_rate": 1.2062800368250795e-05, "loss": 0.2602, "step": 11605 }, { "epoch": 0.43404911559221443, "grad_norm": 0.4198211133480072, "learning_rate": 1.2057053739490297e-05, "loss": 0.2962, "step": 11610 }, { "epoch": 0.4342360445825642, "grad_norm": 0.38284847140312195, "learning_rate": 1.2051306401310904e-05, "loss": 0.3328, "step": 11615 }, { "epoch": 0.434422973572914, "grad_norm": 0.49905702471733093, "learning_rate": 1.204555835569471e-05, "loss": 0.2479, "step": 11620 }, { "epoch": 0.43460990256326376, "grad_norm": 0.2864576578140259, "learning_rate": 1.2039809604624053e-05, "loss": 0.2545, "step": 11625 }, { "epoch": 0.4347968315536136, "grad_norm": 0.3651660084724426, "learning_rate": 1.203406015008151e-05, "loss": 0.2738, "step": 11630 }, { "epoch": 0.43498376054396337, "grad_norm": 0.5334829688072205, "learning_rate": 1.2028309994049907e-05, "loss": 0.2902, "step": 11635 }, { "epoch": 0.43517068953431315, "grad_norm": 0.4753320813179016, "learning_rate": 1.20225591385123e-05, "loss": 0.3009, "step": 11640 }, { "epoch": 0.4353576185246629, "grad_norm": 0.3971807360649109, "learning_rate": 1.2016807585452004e-05, "loss": 0.4801, "step": 11645 }, { "epoch": 0.43554454751501276, "grad_norm": 0.3786313831806183, "learning_rate": 1.201105533685256e-05, "loss": 0.2662, "step": 11650 }, { "epoch": 0.43573147650536254, "grad_norm": 0.5463781356811523, "learning_rate": 1.2005302394697755e-05, "loss": 0.2264, "step": 11655 }, { "epoch": 0.4359184054957123, "grad_norm": 0.29028788208961487, "learning_rate": 1.1999548760971614e-05, "loss": 0.2305, "step": 11660 }, { "epoch": 0.4361053344860621, "grad_norm": 0.30021342635154724, "learning_rate": 1.1993794437658397e-05, "loss": 0.3219, "step": 11665 }, { "epoch": 0.4362922634764119, "grad_norm": 0.3362751007080078, "learning_rate": 1.1988039426742608e-05, "loss": 0.3006, "step": 11670 }, { "epoch": 0.4364791924667617, "grad_norm": 0.25194594264030457, "learning_rate": 1.1982283730208988e-05, "loss": 0.2612, "step": 11675 }, { "epoch": 0.4366661214571115, "grad_norm": 0.5895513296127319, "learning_rate": 1.1976527350042507e-05, "loss": 0.2559, "step": 11680 }, { "epoch": 0.43685305044746126, "grad_norm": 0.6330528855323792, "learning_rate": 1.1970770288228377e-05, "loss": 0.2361, "step": 11685 }, { "epoch": 0.4370399794378111, "grad_norm": 0.37626221776008606, "learning_rate": 1.1965012546752047e-05, "loss": 0.2266, "step": 11690 }, { "epoch": 0.43722690842816087, "grad_norm": 0.3309417963027954, "learning_rate": 1.1959254127599191e-05, "loss": 0.265, "step": 11695 }, { "epoch": 0.43741383741851064, "grad_norm": 0.5689538717269897, "learning_rate": 1.1953495032755726e-05, "loss": 0.2866, "step": 11700 }, { "epoch": 0.4376007664088604, "grad_norm": 0.4492878019809723, "learning_rate": 1.1947735264207804e-05, "loss": 0.2061, "step": 11705 }, { "epoch": 0.43778769539921025, "grad_norm": 0.40549832582473755, "learning_rate": 1.1941974823941795e-05, "loss": 0.3215, "step": 11710 }, { "epoch": 0.43797462438956003, "grad_norm": 0.667451798915863, "learning_rate": 1.1936213713944315e-05, "loss": 0.292, "step": 11715 }, { "epoch": 0.4381615533799098, "grad_norm": 0.349060982465744, "learning_rate": 1.1930451936202203e-05, "loss": 0.3204, "step": 11720 }, { "epoch": 0.4383484823702596, "grad_norm": 0.3580839931964874, "learning_rate": 1.1924689492702534e-05, "loss": 0.2576, "step": 11725 }, { "epoch": 0.43853541136060936, "grad_norm": 0.49423980712890625, "learning_rate": 1.1918926385432608e-05, "loss": 0.2562, "step": 11730 }, { "epoch": 0.4387223403509592, "grad_norm": 0.2973886728286743, "learning_rate": 1.1913162616379956e-05, "loss": 0.3051, "step": 11735 }, { "epoch": 0.43890926934130897, "grad_norm": 0.50880366563797, "learning_rate": 1.1907398187532337e-05, "loss": 0.2851, "step": 11740 }, { "epoch": 0.43909619833165875, "grad_norm": 0.4905702471733093, "learning_rate": 1.1901633100877736e-05, "loss": 0.2635, "step": 11745 }, { "epoch": 0.4392831273220085, "grad_norm": 0.21061702072620392, "learning_rate": 1.1895867358404369e-05, "loss": 0.2665, "step": 11750 }, { "epoch": 0.43947005631235836, "grad_norm": 0.32583504915237427, "learning_rate": 1.1890100962100672e-05, "loss": 0.2656, "step": 11755 }, { "epoch": 0.43965698530270814, "grad_norm": 0.2895315885543823, "learning_rate": 1.1884333913955312e-05, "loss": 0.2501, "step": 11760 }, { "epoch": 0.4398439142930579, "grad_norm": 0.40925106406211853, "learning_rate": 1.187856621595718e-05, "loss": 0.2625, "step": 11765 }, { "epoch": 0.4400308432834077, "grad_norm": 0.1345365196466446, "learning_rate": 1.1872797870095385e-05, "loss": 0.3494, "step": 11770 }, { "epoch": 0.4402177722737575, "grad_norm": 0.5240778923034668, "learning_rate": 1.1867028878359266e-05, "loss": 0.2615, "step": 11775 }, { "epoch": 0.4404047012641073, "grad_norm": 0.3817775547504425, "learning_rate": 1.1861259242738386e-05, "loss": 0.272, "step": 11780 }, { "epoch": 0.4405916302544571, "grad_norm": 0.3994652032852173, "learning_rate": 1.1855488965222526e-05, "loss": 0.2841, "step": 11785 }, { "epoch": 0.44077855924480686, "grad_norm": 0.461319237947464, "learning_rate": 1.1849718047801686e-05, "loss": 0.3008, "step": 11790 }, { "epoch": 0.4409654882351567, "grad_norm": 0.5307036638259888, "learning_rate": 1.1843946492466093e-05, "loss": 0.2497, "step": 11795 }, { "epoch": 0.44115241722550647, "grad_norm": 0.37287020683288574, "learning_rate": 1.1838174301206194e-05, "loss": 0.2826, "step": 11800 }, { "epoch": 0.44133934621585624, "grad_norm": 0.538077712059021, "learning_rate": 1.1832401476012645e-05, "loss": 0.2475, "step": 11805 }, { "epoch": 0.441526275206206, "grad_norm": 0.3736201524734497, "learning_rate": 1.1826628018876334e-05, "loss": 0.265, "step": 11810 }, { "epoch": 0.44171320419655585, "grad_norm": 0.272097647190094, "learning_rate": 1.1820853931788357e-05, "loss": 0.3053, "step": 11815 }, { "epoch": 0.44190013318690563, "grad_norm": 0.5018362998962402, "learning_rate": 1.1815079216740033e-05, "loss": 0.3002, "step": 11820 }, { "epoch": 0.4420870621772554, "grad_norm": 0.5243871808052063, "learning_rate": 1.1809303875722896e-05, "loss": 0.3192, "step": 11825 }, { "epoch": 0.4422739911676052, "grad_norm": 0.20126721262931824, "learning_rate": 1.1803527910728695e-05, "loss": 0.3049, "step": 11830 }, { "epoch": 0.442460920157955, "grad_norm": 0.824718177318573, "learning_rate": 1.1797751323749396e-05, "loss": 0.3462, "step": 11835 }, { "epoch": 0.4426478491483048, "grad_norm": 0.6242169737815857, "learning_rate": 1.1791974116777179e-05, "loss": 0.2373, "step": 11840 }, { "epoch": 0.4428347781386546, "grad_norm": 0.23284803330898285, "learning_rate": 1.1786196291804432e-05, "loss": 0.2514, "step": 11845 }, { "epoch": 0.44302170712900435, "grad_norm": 0.5042092800140381, "learning_rate": 1.1780417850823768e-05, "loss": 0.2548, "step": 11850 }, { "epoch": 0.4432086361193542, "grad_norm": 0.28074219822883606, "learning_rate": 1.1774638795828004e-05, "loss": 0.2807, "step": 11855 }, { "epoch": 0.44339556510970396, "grad_norm": 0.36168691515922546, "learning_rate": 1.1768859128810167e-05, "loss": 0.2616, "step": 11860 }, { "epoch": 0.44358249410005374, "grad_norm": 0.6940954923629761, "learning_rate": 1.1763078851763497e-05, "loss": 0.3262, "step": 11865 }, { "epoch": 0.4437694230904035, "grad_norm": 0.29294854402542114, "learning_rate": 1.1757297966681455e-05, "loss": 0.2751, "step": 11870 }, { "epoch": 0.44395635208075335, "grad_norm": 0.2644307017326355, "learning_rate": 1.1751516475557696e-05, "loss": 0.2402, "step": 11875 }, { "epoch": 0.4441432810711031, "grad_norm": 1.0284477472305298, "learning_rate": 1.1745734380386091e-05, "loss": 0.3128, "step": 11880 }, { "epoch": 0.4443302100614529, "grad_norm": 0.3318440616130829, "learning_rate": 1.1739951683160719e-05, "loss": 0.2859, "step": 11885 }, { "epoch": 0.4445171390518027, "grad_norm": 0.49884122610092163, "learning_rate": 1.173416838587587e-05, "loss": 0.3245, "step": 11890 }, { "epoch": 0.4447040680421525, "grad_norm": 0.12781549990177155, "learning_rate": 1.1728384490526035e-05, "loss": 0.2808, "step": 11895 }, { "epoch": 0.4448909970325023, "grad_norm": 0.4701847434043884, "learning_rate": 1.1722599999105913e-05, "loss": 0.3074, "step": 11900 }, { "epoch": 0.44507792602285207, "grad_norm": 0.28572845458984375, "learning_rate": 1.1716814913610409e-05, "loss": 0.3912, "step": 11905 }, { "epoch": 0.44526485501320184, "grad_norm": 0.36786168813705444, "learning_rate": 1.1711029236034633e-05, "loss": 0.2701, "step": 11910 }, { "epoch": 0.4454517840035517, "grad_norm": 0.49527883529663086, "learning_rate": 1.1705242968373903e-05, "loss": 0.2451, "step": 11915 }, { "epoch": 0.44563871299390145, "grad_norm": 0.46449634432792664, "learning_rate": 1.1699456112623733e-05, "loss": 0.2992, "step": 11920 }, { "epoch": 0.44582564198425123, "grad_norm": 0.4579066038131714, "learning_rate": 1.1693668670779847e-05, "loss": 0.2084, "step": 11925 }, { "epoch": 0.446012570974601, "grad_norm": 0.1956116259098053, "learning_rate": 1.1687880644838164e-05, "loss": 0.3574, "step": 11930 }, { "epoch": 0.44619949996495084, "grad_norm": 0.37438297271728516, "learning_rate": 1.1682092036794812e-05, "loss": 0.3066, "step": 11935 }, { "epoch": 0.4463864289553006, "grad_norm": 0.37410229444503784, "learning_rate": 1.1676302848646116e-05, "loss": 0.3174, "step": 11940 }, { "epoch": 0.4465733579456504, "grad_norm": 0.23035752773284912, "learning_rate": 1.16705130823886e-05, "loss": 0.2645, "step": 11945 }, { "epoch": 0.4467602869360002, "grad_norm": 0.21039049327373505, "learning_rate": 1.1664722740018983e-05, "loss": 0.3209, "step": 11950 }, { "epoch": 0.44694721592635, "grad_norm": 0.4789251387119293, "learning_rate": 1.1658931823534196e-05, "loss": 0.3161, "step": 11955 }, { "epoch": 0.4471341449166998, "grad_norm": 0.33394578099250793, "learning_rate": 1.1653140334931357e-05, "loss": 0.2549, "step": 11960 }, { "epoch": 0.44732107390704956, "grad_norm": 0.5714621543884277, "learning_rate": 1.1647348276207789e-05, "loss": 0.3469, "step": 11965 }, { "epoch": 0.44750800289739934, "grad_norm": 0.8689758777618408, "learning_rate": 1.1641555649360998e-05, "loss": 0.4199, "step": 11970 }, { "epoch": 0.4476949318877491, "grad_norm": 0.1597885936498642, "learning_rate": 1.1635762456388702e-05, "loss": 0.3961, "step": 11975 }, { "epoch": 0.44788186087809895, "grad_norm": 0.46579235792160034, "learning_rate": 1.1629968699288805e-05, "loss": 0.2853, "step": 11980 }, { "epoch": 0.4480687898684487, "grad_norm": 0.34550490975379944, "learning_rate": 1.162417438005941e-05, "loss": 0.2943, "step": 11985 }, { "epoch": 0.4482557188587985, "grad_norm": 0.4065869450569153, "learning_rate": 1.1618379500698808e-05, "loss": 0.3224, "step": 11990 }, { "epoch": 0.4484426478491483, "grad_norm": 0.19699564576148987, "learning_rate": 1.161258406320549e-05, "loss": 0.3538, "step": 11995 }, { "epoch": 0.4486295768394981, "grad_norm": 0.851565420627594, "learning_rate": 1.1606788069578132e-05, "loss": 0.3719, "step": 12000 }, { "epoch": 0.4488165058298479, "grad_norm": 0.36260199546813965, "learning_rate": 1.160099152181561e-05, "loss": 0.3062, "step": 12005 }, { "epoch": 0.44900343482019767, "grad_norm": 0.36545851826667786, "learning_rate": 1.1595194421916988e-05, "loss": 0.329, "step": 12010 }, { "epoch": 0.44919036381054744, "grad_norm": 0.339874804019928, "learning_rate": 1.1589396771881518e-05, "loss": 0.2977, "step": 12015 }, { "epoch": 0.4493772928008973, "grad_norm": 0.4972774386405945, "learning_rate": 1.1583598573708642e-05, "loss": 0.385, "step": 12020 }, { "epoch": 0.44956422179124705, "grad_norm": 0.41324669122695923, "learning_rate": 1.1577799829397996e-05, "loss": 0.2967, "step": 12025 }, { "epoch": 0.44975115078159683, "grad_norm": 0.44996240735054016, "learning_rate": 1.1572000540949398e-05, "loss": 0.2517, "step": 12030 }, { "epoch": 0.4499380797719466, "grad_norm": 0.4872210621833801, "learning_rate": 1.1566200710362854e-05, "loss": 0.2805, "step": 12035 }, { "epoch": 0.45012500876229644, "grad_norm": 0.3203275799751282, "learning_rate": 1.1560400339638567e-05, "loss": 0.2458, "step": 12040 }, { "epoch": 0.4503119377526462, "grad_norm": 0.733971118927002, "learning_rate": 1.1554599430776906e-05, "loss": 0.2399, "step": 12045 }, { "epoch": 0.450498866742996, "grad_norm": 0.38670045137405396, "learning_rate": 1.1548797985778452e-05, "loss": 0.2588, "step": 12050 }, { "epoch": 0.4506857957333458, "grad_norm": 0.33743932843208313, "learning_rate": 1.1542996006643952e-05, "loss": 0.2816, "step": 12055 }, { "epoch": 0.4508727247236956, "grad_norm": 0.23398756980895996, "learning_rate": 1.1537193495374342e-05, "loss": 0.2784, "step": 12060 }, { "epoch": 0.4510596537140454, "grad_norm": 0.5807647705078125, "learning_rate": 1.153139045397074e-05, "loss": 0.3134, "step": 12065 }, { "epoch": 0.45124658270439516, "grad_norm": 0.4573014974594116, "learning_rate": 1.152558688443445e-05, "loss": 0.2615, "step": 12070 }, { "epoch": 0.45143351169474494, "grad_norm": 0.31440410017967224, "learning_rate": 1.1519782788766957e-05, "loss": 0.3578, "step": 12075 }, { "epoch": 0.45162044068509477, "grad_norm": 0.19732816517353058, "learning_rate": 1.1513978168969929e-05, "loss": 0.2974, "step": 12080 }, { "epoch": 0.45180736967544455, "grad_norm": 0.17791102826595306, "learning_rate": 1.1508173027045214e-05, "loss": 0.2286, "step": 12085 }, { "epoch": 0.4519942986657943, "grad_norm": 0.35120418667793274, "learning_rate": 1.1502367364994832e-05, "loss": 0.3173, "step": 12090 }, { "epoch": 0.4521812276561441, "grad_norm": 0.5374488234519958, "learning_rate": 1.1496561184820991e-05, "loss": 0.2903, "step": 12095 }, { "epoch": 0.45236815664649394, "grad_norm": 0.37684205174446106, "learning_rate": 1.1490754488526084e-05, "loss": 0.3131, "step": 12100 }, { "epoch": 0.4525550856368437, "grad_norm": 0.3075244426727295, "learning_rate": 1.1484947278112673e-05, "loss": 0.2625, "step": 12105 }, { "epoch": 0.4527420146271935, "grad_norm": 0.5102051496505737, "learning_rate": 1.147913955558349e-05, "loss": 0.3038, "step": 12110 }, { "epoch": 0.45292894361754327, "grad_norm": 0.2178102284669876, "learning_rate": 1.1473331322941457e-05, "loss": 0.2539, "step": 12115 }, { "epoch": 0.4531158726078931, "grad_norm": 0.20256425440311432, "learning_rate": 1.1467522582189667e-05, "loss": 0.2735, "step": 12120 }, { "epoch": 0.4533028015982429, "grad_norm": 0.5971643924713135, "learning_rate": 1.1461713335331389e-05, "loss": 0.4114, "step": 12125 }, { "epoch": 0.45348973058859265, "grad_norm": 0.5443209409713745, "learning_rate": 1.1455903584370065e-05, "loss": 0.3009, "step": 12130 }, { "epoch": 0.45367665957894243, "grad_norm": 0.6024529933929443, "learning_rate": 1.1450093331309314e-05, "loss": 0.2964, "step": 12135 }, { "epoch": 0.45386358856929226, "grad_norm": 0.4719208776950836, "learning_rate": 1.1444282578152918e-05, "loss": 0.3716, "step": 12140 }, { "epoch": 0.45405051755964204, "grad_norm": 0.3875749707221985, "learning_rate": 1.1438471326904847e-05, "loss": 0.2173, "step": 12145 }, { "epoch": 0.4542374465499918, "grad_norm": 0.40976300835609436, "learning_rate": 1.1432659579569234e-05, "loss": 0.2978, "step": 12150 }, { "epoch": 0.4544243755403416, "grad_norm": 0.32276174426078796, "learning_rate": 1.1426847338150386e-05, "loss": 0.3428, "step": 12155 }, { "epoch": 0.45461130453069143, "grad_norm": 0.2936963140964508, "learning_rate": 1.1421034604652771e-05, "loss": 0.3055, "step": 12160 }, { "epoch": 0.4547982335210412, "grad_norm": 0.24648843705654144, "learning_rate": 1.1415221381081039e-05, "loss": 0.3737, "step": 12165 }, { "epoch": 0.454985162511391, "grad_norm": 0.3906302750110626, "learning_rate": 1.1409407669440005e-05, "loss": 0.2709, "step": 12170 }, { "epoch": 0.45517209150174076, "grad_norm": 0.23113980889320374, "learning_rate": 1.140359347173465e-05, "loss": 0.2874, "step": 12175 }, { "epoch": 0.4553590204920906, "grad_norm": 0.4913095533847809, "learning_rate": 1.1397778789970126e-05, "loss": 0.253, "step": 12180 }, { "epoch": 0.45554594948244037, "grad_norm": 0.34004727005958557, "learning_rate": 1.1391963626151745e-05, "loss": 0.2647, "step": 12185 }, { "epoch": 0.45573287847279015, "grad_norm": 0.41701698303222656, "learning_rate": 1.1386147982284996e-05, "loss": 0.2685, "step": 12190 }, { "epoch": 0.4559198074631399, "grad_norm": 0.12583187222480774, "learning_rate": 1.1380331860375527e-05, "loss": 0.2764, "step": 12195 }, { "epoch": 0.4561067364534897, "grad_norm": 0.29735517501831055, "learning_rate": 1.1374515262429154e-05, "loss": 0.2472, "step": 12200 }, { "epoch": 0.45629366544383954, "grad_norm": 0.5592828989028931, "learning_rate": 1.1368698190451848e-05, "loss": 0.3773, "step": 12205 }, { "epoch": 0.4564805944341893, "grad_norm": 0.5007039904594421, "learning_rate": 1.1362880646449755e-05, "loss": 0.2827, "step": 12210 }, { "epoch": 0.4566675234245391, "grad_norm": 0.5219624042510986, "learning_rate": 1.1357062632429177e-05, "loss": 0.3227, "step": 12215 }, { "epoch": 0.45685445241488887, "grad_norm": 0.45227476954460144, "learning_rate": 1.1351244150396581e-05, "loss": 0.329, "step": 12220 }, { "epoch": 0.4570413814052387, "grad_norm": 0.5245965123176575, "learning_rate": 1.1345425202358597e-05, "loss": 0.3021, "step": 12225 }, { "epoch": 0.4572283103955885, "grad_norm": 0.43462806940078735, "learning_rate": 1.1339605790322016e-05, "loss": 0.3904, "step": 12230 }, { "epoch": 0.45741523938593825, "grad_norm": 0.44027209281921387, "learning_rate": 1.1333785916293776e-05, "loss": 0.2519, "step": 12235 }, { "epoch": 0.45760216837628803, "grad_norm": 0.7230022549629211, "learning_rate": 1.1327965582280995e-05, "loss": 0.3982, "step": 12240 }, { "epoch": 0.45778909736663786, "grad_norm": 0.5000762939453125, "learning_rate": 1.1322144790290935e-05, "loss": 0.3101, "step": 12245 }, { "epoch": 0.45797602635698764, "grad_norm": 0.3863626718521118, "learning_rate": 1.1316323542331022e-05, "loss": 0.3012, "step": 12250 }, { "epoch": 0.4581629553473374, "grad_norm": 0.34956860542297363, "learning_rate": 1.1310501840408837e-05, "loss": 0.3426, "step": 12255 }, { "epoch": 0.4583498843376872, "grad_norm": 0.3766677975654602, "learning_rate": 1.1304679686532116e-05, "loss": 0.2716, "step": 12260 }, { "epoch": 0.45853681332803703, "grad_norm": 0.5153632164001465, "learning_rate": 1.1298857082708754e-05, "loss": 0.2259, "step": 12265 }, { "epoch": 0.4587237423183868, "grad_norm": 0.4910812973976135, "learning_rate": 1.1293034030946804e-05, "loss": 0.3003, "step": 12270 }, { "epoch": 0.4589106713087366, "grad_norm": 0.43328529596328735, "learning_rate": 1.1287210533254464e-05, "loss": 0.3041, "step": 12275 }, { "epoch": 0.45909760029908636, "grad_norm": 0.24060022830963135, "learning_rate": 1.1281386591640096e-05, "loss": 0.3299, "step": 12280 }, { "epoch": 0.4592845292894362, "grad_norm": 0.2626323997974396, "learning_rate": 1.127556220811221e-05, "loss": 0.3169, "step": 12285 }, { "epoch": 0.45947145827978597, "grad_norm": 0.27137619256973267, "learning_rate": 1.1269737384679465e-05, "loss": 0.2479, "step": 12290 }, { "epoch": 0.45965838727013575, "grad_norm": 0.3326461911201477, "learning_rate": 1.1263912123350679e-05, "loss": 0.2324, "step": 12295 }, { "epoch": 0.4598453162604855, "grad_norm": 0.20982854068279266, "learning_rate": 1.1258086426134822e-05, "loss": 0.2547, "step": 12300 }, { "epoch": 0.46003224525083536, "grad_norm": 0.3248259127140045, "learning_rate": 1.1252260295041003e-05, "loss": 0.2574, "step": 12305 }, { "epoch": 0.46021917424118514, "grad_norm": 0.35238370299339294, "learning_rate": 1.1246433732078487e-05, "loss": 0.2612, "step": 12310 }, { "epoch": 0.4604061032315349, "grad_norm": 0.5635769367218018, "learning_rate": 1.1240606739256694e-05, "loss": 0.3106, "step": 12315 }, { "epoch": 0.4605930322218847, "grad_norm": 0.33287879824638367, "learning_rate": 1.1234779318585182e-05, "loss": 0.24, "step": 12320 }, { "epoch": 0.4607799612122345, "grad_norm": 0.5921958684921265, "learning_rate": 1.1228951472073669e-05, "loss": 0.2757, "step": 12325 }, { "epoch": 0.4609668902025843, "grad_norm": 0.32586291432380676, "learning_rate": 1.1223123201732002e-05, "loss": 0.3202, "step": 12330 }, { "epoch": 0.4611538191929341, "grad_norm": 0.4058232605457306, "learning_rate": 1.1217294509570193e-05, "loss": 0.2255, "step": 12335 }, { "epoch": 0.46134074818328386, "grad_norm": 0.5720903873443604, "learning_rate": 1.1211465397598387e-05, "loss": 0.3099, "step": 12340 }, { "epoch": 0.4615276771736337, "grad_norm": 0.43414196372032166, "learning_rate": 1.1205635867826878e-05, "loss": 0.2535, "step": 12345 }, { "epoch": 0.46171460616398347, "grad_norm": 0.5103111863136292, "learning_rate": 1.1199805922266105e-05, "loss": 0.2551, "step": 12350 }, { "epoch": 0.46190153515433324, "grad_norm": 0.4893711805343628, "learning_rate": 1.1193975562926646e-05, "loss": 0.2794, "step": 12355 }, { "epoch": 0.462088464144683, "grad_norm": 0.48441797494888306, "learning_rate": 1.1188144791819226e-05, "loss": 0.4306, "step": 12360 }, { "epoch": 0.46227539313503285, "grad_norm": 0.2794596254825592, "learning_rate": 1.1182313610954716e-05, "loss": 0.2915, "step": 12365 }, { "epoch": 0.46246232212538263, "grad_norm": 0.5436064004898071, "learning_rate": 1.1176482022344115e-05, "loss": 0.297, "step": 12370 }, { "epoch": 0.4626492511157324, "grad_norm": 0.40309974551200867, "learning_rate": 1.1170650027998577e-05, "loss": 0.4027, "step": 12375 }, { "epoch": 0.4628361801060822, "grad_norm": 0.1574869155883789, "learning_rate": 1.116481762992939e-05, "loss": 0.2339, "step": 12380 }, { "epoch": 0.463023109096432, "grad_norm": 0.43917763233184814, "learning_rate": 1.1158984830147975e-05, "loss": 0.3213, "step": 12385 }, { "epoch": 0.4632100380867818, "grad_norm": 0.34497371315956116, "learning_rate": 1.1153151630665902e-05, "loss": 0.2609, "step": 12390 }, { "epoch": 0.46339696707713157, "grad_norm": 0.3034207224845886, "learning_rate": 1.1147318033494876e-05, "loss": 0.2787, "step": 12395 }, { "epoch": 0.46358389606748135, "grad_norm": 0.28706076741218567, "learning_rate": 1.1141484040646732e-05, "loss": 0.2624, "step": 12400 }, { "epoch": 0.4637708250578312, "grad_norm": 0.3962991237640381, "learning_rate": 1.1135649654133453e-05, "loss": 0.2194, "step": 12405 }, { "epoch": 0.46395775404818096, "grad_norm": 0.6293958425521851, "learning_rate": 1.1129814875967143e-05, "loss": 0.3141, "step": 12410 }, { "epoch": 0.46414468303853074, "grad_norm": 0.4316769242286682, "learning_rate": 1.1123979708160064e-05, "loss": 0.2896, "step": 12415 }, { "epoch": 0.4643316120288805, "grad_norm": 0.4425215423107147, "learning_rate": 1.1118144152724584e-05, "loss": 0.3304, "step": 12420 }, { "epoch": 0.46451854101923035, "grad_norm": 0.43086567521095276, "learning_rate": 1.1112308211673226e-05, "loss": 0.3048, "step": 12425 }, { "epoch": 0.4647054700095801, "grad_norm": 0.5925473570823669, "learning_rate": 1.1106471887018637e-05, "loss": 0.3241, "step": 12430 }, { "epoch": 0.4648923989999299, "grad_norm": 0.4298711121082306, "learning_rate": 1.11006351807736e-05, "loss": 0.3052, "step": 12435 }, { "epoch": 0.4650793279902797, "grad_norm": 0.40438607335090637, "learning_rate": 1.1094798094951027e-05, "loss": 0.3083, "step": 12440 }, { "epoch": 0.46526625698062946, "grad_norm": 0.1780761182308197, "learning_rate": 1.1088960631563958e-05, "loss": 0.3185, "step": 12445 }, { "epoch": 0.4654531859709793, "grad_norm": 0.5064865946769714, "learning_rate": 1.108312279262557e-05, "loss": 0.2251, "step": 12450 }, { "epoch": 0.46564011496132907, "grad_norm": 0.49286672472953796, "learning_rate": 1.1077284580149169e-05, "loss": 0.2824, "step": 12455 }, { "epoch": 0.46582704395167884, "grad_norm": 0.35339727997779846, "learning_rate": 1.1071445996148182e-05, "loss": 0.3037, "step": 12460 }, { "epoch": 0.4660139729420286, "grad_norm": 0.5857782959938049, "learning_rate": 1.1065607042636173e-05, "loss": 0.3552, "step": 12465 }, { "epoch": 0.46620090193237845, "grad_norm": 0.4888255000114441, "learning_rate": 1.1059767721626828e-05, "loss": 0.3064, "step": 12470 }, { "epoch": 0.46638783092272823, "grad_norm": 0.2896244525909424, "learning_rate": 1.1053928035133964e-05, "loss": 0.2304, "step": 12475 }, { "epoch": 0.466574759913078, "grad_norm": 0.26369529962539673, "learning_rate": 1.1048087985171517e-05, "loss": 0.2565, "step": 12480 }, { "epoch": 0.4667616889034278, "grad_norm": 0.4036164879798889, "learning_rate": 1.1042247573753555e-05, "loss": 0.2744, "step": 12485 }, { "epoch": 0.4669486178937776, "grad_norm": 0.03450818732380867, "learning_rate": 1.103640680289427e-05, "loss": 0.2852, "step": 12490 }, { "epoch": 0.4671355468841274, "grad_norm": 0.39615771174430847, "learning_rate": 1.1030565674607976e-05, "loss": 0.2949, "step": 12495 }, { "epoch": 0.4673224758744772, "grad_norm": 0.26349276304244995, "learning_rate": 1.1024724190909109e-05, "loss": 0.2381, "step": 12500 }, { "epoch": 0.46750940486482695, "grad_norm": 0.6820989847183228, "learning_rate": 1.101888235381223e-05, "loss": 0.2836, "step": 12505 }, { "epoch": 0.4676963338551768, "grad_norm": 0.30709752440452576, "learning_rate": 1.1013040165332024e-05, "loss": 0.3014, "step": 12510 }, { "epoch": 0.46788326284552656, "grad_norm": 0.31798475980758667, "learning_rate": 1.1007197627483292e-05, "loss": 0.2755, "step": 12515 }, { "epoch": 0.46807019183587634, "grad_norm": 0.4482247233390808, "learning_rate": 1.1001354742280959e-05, "loss": 0.2384, "step": 12520 }, { "epoch": 0.4682571208262261, "grad_norm": 0.503452479839325, "learning_rate": 1.0995511511740066e-05, "loss": 0.3171, "step": 12525 }, { "epoch": 0.46844404981657595, "grad_norm": 0.3593085706233978, "learning_rate": 1.0989667937875778e-05, "loss": 0.372, "step": 12530 }, { "epoch": 0.4686309788069257, "grad_norm": 0.4590424597263336, "learning_rate": 1.0983824022703377e-05, "loss": 0.297, "step": 12535 }, { "epoch": 0.4688179077972755, "grad_norm": 0.49093323945999146, "learning_rate": 1.0977979768238261e-05, "loss": 0.2479, "step": 12540 }, { "epoch": 0.4690048367876253, "grad_norm": 0.29465728998184204, "learning_rate": 1.0972135176495942e-05, "loss": 0.2881, "step": 12545 }, { "epoch": 0.4691917657779751, "grad_norm": 0.16378015279769897, "learning_rate": 1.0966290249492057e-05, "loss": 0.2464, "step": 12550 }, { "epoch": 0.4693786947683249, "grad_norm": 0.3140955865383148, "learning_rate": 1.0960444989242355e-05, "loss": 0.1831, "step": 12555 }, { "epoch": 0.46956562375867467, "grad_norm": 0.3941444158554077, "learning_rate": 1.0954599397762695e-05, "loss": 0.2958, "step": 12560 }, { "epoch": 0.46975255274902444, "grad_norm": 0.3791537582874298, "learning_rate": 1.0948753477069057e-05, "loss": 0.3068, "step": 12565 }, { "epoch": 0.4699394817393743, "grad_norm": 0.12725694477558136, "learning_rate": 1.0942907229177526e-05, "loss": 0.2489, "step": 12570 }, { "epoch": 0.47012641072972405, "grad_norm": 0.6161131858825684, "learning_rate": 1.0937060656104312e-05, "loss": 0.3096, "step": 12575 }, { "epoch": 0.47031333972007383, "grad_norm": 0.40583592653274536, "learning_rate": 1.0931213759865729e-05, "loss": 0.3163, "step": 12580 }, { "epoch": 0.4705002687104236, "grad_norm": 3.2883963584899902, "learning_rate": 1.0925366542478205e-05, "loss": 0.2901, "step": 12585 }, { "epoch": 0.47068719770077344, "grad_norm": 0.5457201600074768, "learning_rate": 1.0919519005958268e-05, "loss": 0.332, "step": 12590 }, { "epoch": 0.4708741266911232, "grad_norm": 0.42853930592536926, "learning_rate": 1.0913671152322578e-05, "loss": 0.2741, "step": 12595 }, { "epoch": 0.471061055681473, "grad_norm": 0.43397367000579834, "learning_rate": 1.0907822983587888e-05, "loss": 0.366, "step": 12600 }, { "epoch": 0.4712479846718228, "grad_norm": 0.6656988263130188, "learning_rate": 1.0901974501771065e-05, "loss": 0.2831, "step": 12605 }, { "epoch": 0.4714349136621726, "grad_norm": 0.3851011097431183, "learning_rate": 1.0896125708889077e-05, "loss": 0.2331, "step": 12610 }, { "epoch": 0.4716218426525224, "grad_norm": 0.25610148906707764, "learning_rate": 1.0890276606959011e-05, "loss": 0.2575, "step": 12615 }, { "epoch": 0.47180877164287216, "grad_norm": 0.4998786449432373, "learning_rate": 1.0884427197998054e-05, "loss": 0.2438, "step": 12620 }, { "epoch": 0.47199570063322194, "grad_norm": 0.21021483838558197, "learning_rate": 1.0878577484023496e-05, "loss": 0.2544, "step": 12625 }, { "epoch": 0.47218262962357177, "grad_norm": 0.3968941271305084, "learning_rate": 1.0872727467052741e-05, "loss": 0.2302, "step": 12630 }, { "epoch": 0.47236955861392155, "grad_norm": 0.4154810905456543, "learning_rate": 1.0866877149103286e-05, "loss": 0.2776, "step": 12635 }, { "epoch": 0.4725564876042713, "grad_norm": 0.4162626564502716, "learning_rate": 1.0861026532192745e-05, "loss": 0.2388, "step": 12640 }, { "epoch": 0.4727434165946211, "grad_norm": 0.4286978244781494, "learning_rate": 1.0855175618338823e-05, "loss": 0.2645, "step": 12645 }, { "epoch": 0.47293034558497093, "grad_norm": 0.3265318274497986, "learning_rate": 1.0849324409559334e-05, "loss": 0.2832, "step": 12650 }, { "epoch": 0.4731172745753207, "grad_norm": 0.3723618686199188, "learning_rate": 1.0843472907872192e-05, "loss": 0.2681, "step": 12655 }, { "epoch": 0.4733042035656705, "grad_norm": 0.47997230291366577, "learning_rate": 1.0837621115295414e-05, "loss": 0.4914, "step": 12660 }, { "epoch": 0.47349113255602027, "grad_norm": 0.3690032362937927, "learning_rate": 1.0831769033847113e-05, "loss": 0.2536, "step": 12665 }, { "epoch": 0.4736780615463701, "grad_norm": 0.44639065861701965, "learning_rate": 1.0825916665545506e-05, "loss": 0.3222, "step": 12670 }, { "epoch": 0.4738649905367199, "grad_norm": 0.3845580816268921, "learning_rate": 1.0820064012408905e-05, "loss": 0.3047, "step": 12675 }, { "epoch": 0.47405191952706965, "grad_norm": 0.5291163921356201, "learning_rate": 1.0814211076455727e-05, "loss": 0.2676, "step": 12680 }, { "epoch": 0.47423884851741943, "grad_norm": 0.20399098098278046, "learning_rate": 1.0808357859704478e-05, "loss": 0.2618, "step": 12685 }, { "epoch": 0.4744257775077692, "grad_norm": 0.5159106254577637, "learning_rate": 1.0802504364173763e-05, "loss": 0.3227, "step": 12690 }, { "epoch": 0.47461270649811904, "grad_norm": 0.24743987619876862, "learning_rate": 1.079665059188229e-05, "loss": 0.2674, "step": 12695 }, { "epoch": 0.4747996354884688, "grad_norm": 0.41947150230407715, "learning_rate": 1.0790796544848853e-05, "loss": 0.297, "step": 12700 }, { "epoch": 0.4749865644788186, "grad_norm": 0.5480782389640808, "learning_rate": 1.078494222509235e-05, "loss": 0.2761, "step": 12705 }, { "epoch": 0.4751734934691684, "grad_norm": 0.2428130805492401, "learning_rate": 1.0779087634631763e-05, "loss": 0.2324, "step": 12710 }, { "epoch": 0.4753604224595182, "grad_norm": 0.2542586624622345, "learning_rate": 1.0773232775486173e-05, "loss": 0.3582, "step": 12715 }, { "epoch": 0.475547351449868, "grad_norm": 0.341899573802948, "learning_rate": 1.0767377649674755e-05, "loss": 0.2975, "step": 12720 }, { "epoch": 0.47573428044021776, "grad_norm": 0.6047893762588501, "learning_rate": 1.0761522259216777e-05, "loss": 0.2313, "step": 12725 }, { "epoch": 0.47592120943056754, "grad_norm": 0.3485582768917084, "learning_rate": 1.0755666606131588e-05, "loss": 0.2606, "step": 12730 }, { "epoch": 0.47610813842091737, "grad_norm": 0.38747602701187134, "learning_rate": 1.074981069243864e-05, "loss": 0.3315, "step": 12735 }, { "epoch": 0.47629506741126715, "grad_norm": 0.5207949876785278, "learning_rate": 1.0743954520157471e-05, "loss": 0.34, "step": 12740 }, { "epoch": 0.4764819964016169, "grad_norm": 0.6631015539169312, "learning_rate": 1.0738098091307703e-05, "loss": 0.2457, "step": 12745 }, { "epoch": 0.4766689253919667, "grad_norm": 0.2515263855457306, "learning_rate": 1.0732241407909057e-05, "loss": 0.2445, "step": 12750 }, { "epoch": 0.47685585438231654, "grad_norm": 0.27167659997940063, "learning_rate": 1.0726384471981326e-05, "loss": 0.2514, "step": 12755 }, { "epoch": 0.4770427833726663, "grad_norm": 0.44566571712493896, "learning_rate": 1.0720527285544406e-05, "loss": 0.2548, "step": 12760 }, { "epoch": 0.4772297123630161, "grad_norm": 0.5117403268814087, "learning_rate": 1.071466985061827e-05, "loss": 0.2884, "step": 12765 }, { "epoch": 0.47741664135336587, "grad_norm": 0.36228641867637634, "learning_rate": 1.070881216922298e-05, "loss": 0.3223, "step": 12770 }, { "epoch": 0.4776035703437157, "grad_norm": 0.35445940494537354, "learning_rate": 1.0702954243378685e-05, "loss": 0.2761, "step": 12775 }, { "epoch": 0.4777904993340655, "grad_norm": 0.2719997763633728, "learning_rate": 1.069709607510561e-05, "loss": 0.3556, "step": 12780 }, { "epoch": 0.47797742832441525, "grad_norm": 0.33862072229385376, "learning_rate": 1.0691237666424077e-05, "loss": 0.2534, "step": 12785 }, { "epoch": 0.47816435731476503, "grad_norm": 0.2040008008480072, "learning_rate": 1.0685379019354476e-05, "loss": 0.2666, "step": 12790 }, { "epoch": 0.47835128630511486, "grad_norm": 0.3832045793533325, "learning_rate": 1.0679520135917293e-05, "loss": 0.3088, "step": 12795 }, { "epoch": 0.47853821529546464, "grad_norm": 0.4770766496658325, "learning_rate": 1.0673661018133086e-05, "loss": 0.2701, "step": 12800 }, { "epoch": 0.4787251442858144, "grad_norm": 1.6293139457702637, "learning_rate": 1.0667801668022496e-05, "loss": 0.2655, "step": 12805 }, { "epoch": 0.4789120732761642, "grad_norm": 0.44511693716049194, "learning_rate": 1.0661942087606243e-05, "loss": 0.354, "step": 12810 }, { "epoch": 0.47909900226651403, "grad_norm": 0.22285427153110504, "learning_rate": 1.065608227890513e-05, "loss": 0.2925, "step": 12815 }, { "epoch": 0.4792859312568638, "grad_norm": 0.34608471393585205, "learning_rate": 1.0650222243940043e-05, "loss": 0.245, "step": 12820 }, { "epoch": 0.4794728602472136, "grad_norm": 0.42804983258247375, "learning_rate": 1.0644361984731932e-05, "loss": 0.288, "step": 12825 }, { "epoch": 0.47965978923756336, "grad_norm": 0.4652557373046875, "learning_rate": 1.0638501503301837e-05, "loss": 0.3612, "step": 12830 }, { "epoch": 0.4798467182279132, "grad_norm": 0.24128127098083496, "learning_rate": 1.0632640801670868e-05, "loss": 0.2583, "step": 12835 }, { "epoch": 0.48003364721826297, "grad_norm": 0.45303890109062195, "learning_rate": 1.0626779881860213e-05, "loss": 0.2702, "step": 12840 }, { "epoch": 0.48022057620861275, "grad_norm": 0.33776313066482544, "learning_rate": 1.0620918745891143e-05, "loss": 0.3176, "step": 12845 }, { "epoch": 0.4804075051989625, "grad_norm": 0.304514616727829, "learning_rate": 1.0615057395784983e-05, "loss": 0.2317, "step": 12850 }, { "epoch": 0.48059443418931236, "grad_norm": 0.3652326762676239, "learning_rate": 1.0609195833563153e-05, "loss": 0.271, "step": 12855 }, { "epoch": 0.48078136317966214, "grad_norm": 0.397659033536911, "learning_rate": 1.0603334061247133e-05, "loss": 0.3553, "step": 12860 }, { "epoch": 0.4809682921700119, "grad_norm": 0.33267614245414734, "learning_rate": 1.0597472080858485e-05, "loss": 0.3367, "step": 12865 }, { "epoch": 0.4811552211603617, "grad_norm": 0.35334113240242004, "learning_rate": 1.0591609894418835e-05, "loss": 0.3743, "step": 12870 }, { "epoch": 0.4813421501507115, "grad_norm": 0.7049282789230347, "learning_rate": 1.0585747503949883e-05, "loss": 0.3031, "step": 12875 }, { "epoch": 0.4815290791410613, "grad_norm": 0.504492998123169, "learning_rate": 1.0579884911473404e-05, "loss": 0.2968, "step": 12880 }, { "epoch": 0.4817160081314111, "grad_norm": 0.21734552085399628, "learning_rate": 1.0574022119011234e-05, "loss": 0.304, "step": 12885 }, { "epoch": 0.48190293712176085, "grad_norm": 0.5968211889266968, "learning_rate": 1.0568159128585283e-05, "loss": 0.2402, "step": 12890 }, { "epoch": 0.4820898661121107, "grad_norm": 0.5871456265449524, "learning_rate": 1.056229594221753e-05, "loss": 0.2916, "step": 12895 }, { "epoch": 0.48227679510246046, "grad_norm": 0.4001029431819916, "learning_rate": 1.0556432561930014e-05, "loss": 0.2675, "step": 12900 }, { "epoch": 0.48246372409281024, "grad_norm": 0.40792810916900635, "learning_rate": 1.0550568989744852e-05, "loss": 0.2857, "step": 12905 }, { "epoch": 0.48265065308316, "grad_norm": 0.23005032539367676, "learning_rate": 1.0544705227684223e-05, "loss": 0.3099, "step": 12910 }, { "epoch": 0.48283758207350985, "grad_norm": 0.4021388292312622, "learning_rate": 1.053884127777037e-05, "loss": 0.3509, "step": 12915 }, { "epoch": 0.48302451106385963, "grad_norm": 0.40135905146598816, "learning_rate": 1.0532977142025595e-05, "loss": 0.3563, "step": 12920 }, { "epoch": 0.4832114400542094, "grad_norm": 1.3269644975662231, "learning_rate": 1.0527112822472279e-05, "loss": 0.34, "step": 12925 }, { "epoch": 0.4833983690445592, "grad_norm": 0.28477320075035095, "learning_rate": 1.0521248321132853e-05, "loss": 0.3246, "step": 12930 }, { "epoch": 0.48358529803490896, "grad_norm": 0.38668492436408997, "learning_rate": 1.0515383640029819e-05, "loss": 0.348, "step": 12935 }, { "epoch": 0.4837722270252588, "grad_norm": 0.5541387796401978, "learning_rate": 1.0509518781185735e-05, "loss": 0.2915, "step": 12940 }, { "epoch": 0.48395915601560857, "grad_norm": 0.43491193652153015, "learning_rate": 1.0503653746623221e-05, "loss": 0.2804, "step": 12945 }, { "epoch": 0.48414608500595835, "grad_norm": 0.3402020037174225, "learning_rate": 1.0497788538364961e-05, "loss": 0.3074, "step": 12950 }, { "epoch": 0.4843330139963081, "grad_norm": 0.4688038229942322, "learning_rate": 1.0491923158433696e-05, "loss": 0.328, "step": 12955 }, { "epoch": 0.48451994298665796, "grad_norm": 0.2162652462720871, "learning_rate": 1.0486057608852236e-05, "loss": 0.2978, "step": 12960 }, { "epoch": 0.48470687197700774, "grad_norm": 0.33628934621810913, "learning_rate": 1.0480191891643427e-05, "loss": 0.2905, "step": 12965 }, { "epoch": 0.4848938009673575, "grad_norm": 0.30055081844329834, "learning_rate": 1.0474326008830198e-05, "loss": 0.288, "step": 12970 }, { "epoch": 0.4850807299577073, "grad_norm": 0.30387982726097107, "learning_rate": 1.0468459962435517e-05, "loss": 0.2517, "step": 12975 }, { "epoch": 0.4852676589480571, "grad_norm": 0.3027805685997009, "learning_rate": 1.046259375448242e-05, "loss": 0.2283, "step": 12980 }, { "epoch": 0.4854545879384069, "grad_norm": 0.6029101014137268, "learning_rate": 1.0456727386993992e-05, "loss": 0.313, "step": 12985 }, { "epoch": 0.4856415169287567, "grad_norm": 0.23959915339946747, "learning_rate": 1.0450860861993374e-05, "loss": 0.2493, "step": 12990 }, { "epoch": 0.48582844591910646, "grad_norm": 0.6326263546943665, "learning_rate": 1.0444994181503764e-05, "loss": 0.31, "step": 12995 }, { "epoch": 0.4860153749094563, "grad_norm": 0.3467031419277191, "learning_rate": 1.043912734754841e-05, "loss": 0.2751, "step": 13000 }, { "epoch": 0.48620230389980607, "grad_norm": 0.24758736789226532, "learning_rate": 1.0433260362150618e-05, "loss": 0.3156, "step": 13005 }, { "epoch": 0.48638923289015584, "grad_norm": 0.4706723093986511, "learning_rate": 1.0427393227333742e-05, "loss": 0.2606, "step": 13010 }, { "epoch": 0.4865761618805056, "grad_norm": 0.4599349796772003, "learning_rate": 1.0421525945121187e-05, "loss": 0.257, "step": 13015 }, { "epoch": 0.48676309087085545, "grad_norm": 0.3387102484703064, "learning_rate": 1.0415658517536414e-05, "loss": 0.2991, "step": 13020 }, { "epoch": 0.48695001986120523, "grad_norm": 0.4700387120246887, "learning_rate": 1.0409790946602926e-05, "loss": 0.2578, "step": 13025 }, { "epoch": 0.487136948851555, "grad_norm": 0.5611972808837891, "learning_rate": 1.0403923234344282e-05, "loss": 0.3931, "step": 13030 }, { "epoch": 0.4873238778419048, "grad_norm": 0.3460509479045868, "learning_rate": 1.0398055382784094e-05, "loss": 0.2514, "step": 13035 }, { "epoch": 0.4875108068322546, "grad_norm": 0.45094433426856995, "learning_rate": 1.0392187393946004e-05, "loss": 0.2802, "step": 13040 }, { "epoch": 0.4876977358226044, "grad_norm": 0.44121870398521423, "learning_rate": 1.0386319269853719e-05, "loss": 0.3061, "step": 13045 }, { "epoch": 0.48788466481295417, "grad_norm": 0.35361143946647644, "learning_rate": 1.038045101253099e-05, "loss": 0.2768, "step": 13050 }, { "epoch": 0.48807159380330395, "grad_norm": 1.3324558734893799, "learning_rate": 1.0374582624001608e-05, "loss": 0.3261, "step": 13055 }, { "epoch": 0.4882585227936538, "grad_norm": 0.31909024715423584, "learning_rate": 1.0368714106289412e-05, "loss": 0.2433, "step": 13060 }, { "epoch": 0.48844545178400356, "grad_norm": 0.8846080899238586, "learning_rate": 1.0362845461418286e-05, "loss": 0.3087, "step": 13065 }, { "epoch": 0.48863238077435334, "grad_norm": 0.2497347593307495, "learning_rate": 1.0356976691412156e-05, "loss": 0.2936, "step": 13070 }, { "epoch": 0.4888193097647031, "grad_norm": 0.33820840716362, "learning_rate": 1.0351107798294994e-05, "loss": 0.3137, "step": 13075 }, { "epoch": 0.48900623875505295, "grad_norm": 0.23961906135082245, "learning_rate": 1.0345238784090816e-05, "loss": 0.2316, "step": 13080 }, { "epoch": 0.4891931677454027, "grad_norm": 0.172140970826149, "learning_rate": 1.0339369650823672e-05, "loss": 0.3032, "step": 13085 }, { "epoch": 0.4893800967357525, "grad_norm": 0.21862946450710297, "learning_rate": 1.0333500400517656e-05, "loss": 0.2501, "step": 13090 }, { "epoch": 0.4895670257261023, "grad_norm": 0.35493218898773193, "learning_rate": 1.032763103519691e-05, "loss": 0.2768, "step": 13095 }, { "epoch": 0.4897539547164521, "grad_norm": 0.2946685254573822, "learning_rate": 1.0321761556885608e-05, "loss": 0.3169, "step": 13100 }, { "epoch": 0.4899408837068019, "grad_norm": 0.39835453033447266, "learning_rate": 1.0315891967607968e-05, "loss": 0.3315, "step": 13105 }, { "epoch": 0.49012781269715167, "grad_norm": 0.4330670237541199, "learning_rate": 1.0310022269388236e-05, "loss": 0.3063, "step": 13110 }, { "epoch": 0.49031474168750144, "grad_norm": 0.3474862277507782, "learning_rate": 1.0304152464250707e-05, "loss": 0.3044, "step": 13115 }, { "epoch": 0.4905016706778513, "grad_norm": 0.3377613425254822, "learning_rate": 1.0298282554219707e-05, "loss": 0.2859, "step": 13120 }, { "epoch": 0.49068859966820105, "grad_norm": 0.2067660242319107, "learning_rate": 1.0292412541319603e-05, "loss": 0.2821, "step": 13125 }, { "epoch": 0.49087552865855083, "grad_norm": 0.5813116431236267, "learning_rate": 1.0286542427574794e-05, "loss": 0.3219, "step": 13130 }, { "epoch": 0.4910624576489006, "grad_norm": 0.5601276159286499, "learning_rate": 1.0280672215009706e-05, "loss": 0.2605, "step": 13135 }, { "epoch": 0.49124938663925044, "grad_norm": 0.35725921392440796, "learning_rate": 1.0274801905648816e-05, "loss": 0.2353, "step": 13140 }, { "epoch": 0.4914363156296002, "grad_norm": 0.3625008165836334, "learning_rate": 1.0268931501516626e-05, "loss": 0.3755, "step": 13145 }, { "epoch": 0.49162324461995, "grad_norm": 0.2937968671321869, "learning_rate": 1.0263061004637666e-05, "loss": 0.2857, "step": 13150 }, { "epoch": 0.4918101736102998, "grad_norm": 0.48958075046539307, "learning_rate": 1.0257190417036502e-05, "loss": 0.289, "step": 13155 }, { "epoch": 0.4919971026006496, "grad_norm": 0.4188752770423889, "learning_rate": 1.0251319740737732e-05, "loss": 0.2968, "step": 13160 }, { "epoch": 0.4921840315909994, "grad_norm": 0.32891568541526794, "learning_rate": 1.0245448977765986e-05, "loss": 0.299, "step": 13165 }, { "epoch": 0.49237096058134916, "grad_norm": 0.4842504858970642, "learning_rate": 1.023957813014592e-05, "loss": 0.212, "step": 13170 }, { "epoch": 0.49255788957169894, "grad_norm": 0.5407660007476807, "learning_rate": 1.0233707199902223e-05, "loss": 0.24, "step": 13175 }, { "epoch": 0.4927448185620487, "grad_norm": 0.268076092004776, "learning_rate": 1.0227836189059606e-05, "loss": 0.229, "step": 13180 }, { "epoch": 0.49293174755239855, "grad_norm": 0.6369917392730713, "learning_rate": 1.0221965099642817e-05, "loss": 0.2855, "step": 13185 }, { "epoch": 0.4931186765427483, "grad_norm": 0.39120930433273315, "learning_rate": 1.0216093933676625e-05, "loss": 0.2355, "step": 13190 }, { "epoch": 0.4933056055330981, "grad_norm": 0.31261909008026123, "learning_rate": 1.0210222693185829e-05, "loss": 0.2723, "step": 13195 }, { "epoch": 0.4934925345234479, "grad_norm": 0.5893165469169617, "learning_rate": 1.0204351380195249e-05, "loss": 0.2922, "step": 13200 }, { "epoch": 0.4936794635137977, "grad_norm": 0.4819238781929016, "learning_rate": 1.0198479996729736e-05, "loss": 0.3932, "step": 13205 }, { "epoch": 0.4938663925041475, "grad_norm": 0.35270988941192627, "learning_rate": 1.0192608544814155e-05, "loss": 0.35, "step": 13210 }, { "epoch": 0.49405332149449727, "grad_norm": 0.6773056387901306, "learning_rate": 1.0186737026473408e-05, "loss": 0.3297, "step": 13215 }, { "epoch": 0.49424025048484704, "grad_norm": 0.28301501274108887, "learning_rate": 1.0180865443732408e-05, "loss": 0.3117, "step": 13220 }, { "epoch": 0.4944271794751969, "grad_norm": 0.3076423406600952, "learning_rate": 1.0174993798616101e-05, "loss": 0.2898, "step": 13225 }, { "epoch": 0.49461410846554665, "grad_norm": 0.6145200133323669, "learning_rate": 1.0169122093149449e-05, "loss": 0.271, "step": 13230 }, { "epoch": 0.49480103745589643, "grad_norm": 0.6317368745803833, "learning_rate": 1.016325032935743e-05, "loss": 0.3003, "step": 13235 }, { "epoch": 0.4949879664462462, "grad_norm": 0.5369362235069275, "learning_rate": 1.0157378509265053e-05, "loss": 0.2718, "step": 13240 }, { "epoch": 0.49517489543659604, "grad_norm": 0.3894631564617157, "learning_rate": 1.015150663489734e-05, "loss": 0.3061, "step": 13245 }, { "epoch": 0.4953618244269458, "grad_norm": 0.35104823112487793, "learning_rate": 1.0145634708279324e-05, "loss": 0.3223, "step": 13250 }, { "epoch": 0.4955487534172956, "grad_norm": 0.5245153903961182, "learning_rate": 1.013976273143607e-05, "loss": 0.3233, "step": 13255 }, { "epoch": 0.4957356824076454, "grad_norm": 0.4358454942703247, "learning_rate": 1.013389070639266e-05, "loss": 0.344, "step": 13260 }, { "epoch": 0.4959226113979952, "grad_norm": 0.3596687912940979, "learning_rate": 1.0128018635174177e-05, "loss": 0.2782, "step": 13265 }, { "epoch": 0.496109540388345, "grad_norm": 0.314059317111969, "learning_rate": 1.0122146519805736e-05, "loss": 0.27, "step": 13270 }, { "epoch": 0.49629646937869476, "grad_norm": 0.49434518814086914, "learning_rate": 1.0116274362312462e-05, "loss": 0.2295, "step": 13275 }, { "epoch": 0.49648339836904454, "grad_norm": 0.42047810554504395, "learning_rate": 1.011040216471949e-05, "loss": 0.2461, "step": 13280 }, { "epoch": 0.49667032735939437, "grad_norm": 0.44239863753318787, "learning_rate": 1.0104529929051977e-05, "loss": 0.3447, "step": 13285 }, { "epoch": 0.49685725634974415, "grad_norm": 0.4856170117855072, "learning_rate": 1.0098657657335083e-05, "loss": 0.3261, "step": 13290 }, { "epoch": 0.4970441853400939, "grad_norm": 0.7703909277915955, "learning_rate": 1.0092785351593995e-05, "loss": 0.2374, "step": 13295 }, { "epoch": 0.4972311143304437, "grad_norm": 0.44741690158843994, "learning_rate": 1.0086913013853894e-05, "loss": 0.2753, "step": 13300 }, { "epoch": 0.49741804332079353, "grad_norm": 0.38895756006240845, "learning_rate": 1.0081040646139985e-05, "loss": 0.2716, "step": 13305 }, { "epoch": 0.4976049723111433, "grad_norm": 0.7058769464492798, "learning_rate": 1.0075168250477482e-05, "loss": 0.3592, "step": 13310 }, { "epoch": 0.4977919013014931, "grad_norm": 0.34860917925834656, "learning_rate": 1.00692958288916e-05, "loss": 0.3461, "step": 13315 }, { "epoch": 0.49797883029184287, "grad_norm": 0.3462105691432953, "learning_rate": 1.0063423383407575e-05, "loss": 0.2995, "step": 13320 }, { "epoch": 0.4981657592821927, "grad_norm": 0.351172536611557, "learning_rate": 1.005755091605064e-05, "loss": 0.4439, "step": 13325 }, { "epoch": 0.4983526882725425, "grad_norm": 0.16182276606559753, "learning_rate": 1.0051678428846046e-05, "loss": 0.2951, "step": 13330 }, { "epoch": 0.49853961726289225, "grad_norm": 0.41749125719070435, "learning_rate": 1.0045805923819039e-05, "loss": 0.2411, "step": 13335 }, { "epoch": 0.49872654625324203, "grad_norm": 0.3556321859359741, "learning_rate": 1.0039933402994885e-05, "loss": 0.2667, "step": 13340 }, { "epoch": 0.49891347524359186, "grad_norm": 0.5114247798919678, "learning_rate": 1.0034060868398843e-05, "loss": 0.248, "step": 13345 }, { "epoch": 0.49910040423394164, "grad_norm": 0.29534754157066345, "learning_rate": 1.0028188322056183e-05, "loss": 0.2799, "step": 13350 }, { "epoch": 0.4992873332242914, "grad_norm": 0.5048911571502686, "learning_rate": 1.0022315765992179e-05, "loss": 0.2801, "step": 13355 }, { "epoch": 0.4994742622146412, "grad_norm": 0.3704678416252136, "learning_rate": 1.0016443202232107e-05, "loss": 0.2554, "step": 13360 }, { "epoch": 0.49966119120499103, "grad_norm": 0.4027446210384369, "learning_rate": 1.0010570632801244e-05, "loss": 0.2369, "step": 13365 }, { "epoch": 0.4998481201953408, "grad_norm": 0.4065941572189331, "learning_rate": 1.0004698059724873e-05, "loss": 0.3427, "step": 13370 }, { "epoch": 0.5000350491856906, "grad_norm": 0.1492055058479309, "learning_rate": 9.998825485028277e-06, "loss": 0.3667, "step": 13375 }, { "epoch": 0.5002219781760404, "grad_norm": 0.7147864103317261, "learning_rate": 9.99295291073674e-06, "loss": 0.3257, "step": 13380 }, { "epoch": 0.5004089071663902, "grad_norm": 0.355305552482605, "learning_rate": 9.987080338875537e-06, "loss": 0.384, "step": 13385 }, { "epoch": 0.50059583615674, "grad_norm": 0.24693630635738373, "learning_rate": 9.981207771469956e-06, "loss": 0.3332, "step": 13390 }, { "epoch": 0.5007827651470897, "grad_norm": 0.4253278076648712, "learning_rate": 9.975335210545279e-06, "loss": 0.2696, "step": 13395 }, { "epoch": 0.5009696941374395, "grad_norm": 0.22588220238685608, "learning_rate": 9.969462658126778e-06, "loss": 0.3232, "step": 13400 }, { "epoch": 0.5011566231277893, "grad_norm": 0.6063287854194641, "learning_rate": 9.963590116239734e-06, "loss": 0.2805, "step": 13405 }, { "epoch": 0.5013435521181391, "grad_norm": 0.5377234816551208, "learning_rate": 9.957717586909415e-06, "loss": 0.3082, "step": 13410 }, { "epoch": 0.501530481108489, "grad_norm": 0.6075440645217896, "learning_rate": 9.95184507216109e-06, "loss": 0.2732, "step": 13415 }, { "epoch": 0.5017174100988387, "grad_norm": 0.6097599864006042, "learning_rate": 9.945972574020015e-06, "loss": 0.3988, "step": 13420 }, { "epoch": 0.5019043390891885, "grad_norm": 0.3699340224266052, "learning_rate": 9.940100094511457e-06, "loss": 0.3371, "step": 13425 }, { "epoch": 0.5020912680795383, "grad_norm": 0.30382204055786133, "learning_rate": 9.934227635660654e-06, "loss": 0.2644, "step": 13430 }, { "epoch": 0.5022781970698881, "grad_norm": 0.38782837986946106, "learning_rate": 9.928355199492859e-06, "loss": 0.2467, "step": 13435 }, { "epoch": 0.5024651260602379, "grad_norm": 0.22586065530776978, "learning_rate": 9.922482788033304e-06, "loss": 0.2594, "step": 13440 }, { "epoch": 0.5026520550505876, "grad_norm": 0.38370853662490845, "learning_rate": 9.91661040330721e-06, "loss": 0.2202, "step": 13445 }, { "epoch": 0.5028389840409374, "grad_norm": 0.3413105010986328, "learning_rate": 9.910738047339801e-06, "loss": 0.2385, "step": 13450 }, { "epoch": 0.5030259130312872, "grad_norm": 0.5473100543022156, "learning_rate": 9.90486572215628e-06, "loss": 0.2837, "step": 13455 }, { "epoch": 0.5032128420216371, "grad_norm": 0.5079526305198669, "learning_rate": 9.898993429781848e-06, "loss": 0.328, "step": 13460 }, { "epoch": 0.5033997710119869, "grad_norm": 0.23653025925159454, "learning_rate": 9.893121172241686e-06, "loss": 0.2817, "step": 13465 }, { "epoch": 0.5035867000023366, "grad_norm": 0.3025575280189514, "learning_rate": 9.887248951560972e-06, "loss": 0.3381, "step": 13470 }, { "epoch": 0.5037736289926864, "grad_norm": 0.3230903148651123, "learning_rate": 9.88137676976486e-06, "loss": 0.2964, "step": 13475 }, { "epoch": 0.5039605579830362, "grad_norm": 0.8575633764266968, "learning_rate": 9.875504628878502e-06, "loss": 0.3539, "step": 13480 }, { "epoch": 0.504147486973386, "grad_norm": 0.5326083302497864, "learning_rate": 9.869632530927033e-06, "loss": 0.3046, "step": 13485 }, { "epoch": 0.5043344159637357, "grad_norm": 0.5689330697059631, "learning_rate": 9.863760477935565e-06, "loss": 0.2426, "step": 13490 }, { "epoch": 0.5045213449540855, "grad_norm": 0.4615536034107208, "learning_rate": 9.857888471929207e-06, "loss": 0.3155, "step": 13495 }, { "epoch": 0.5047082739444354, "grad_norm": 0.284390926361084, "learning_rate": 9.85201651493304e-06, "loss": 0.2493, "step": 13500 }, { "epoch": 0.5048952029347852, "grad_norm": 0.382242351770401, "learning_rate": 9.846144608972141e-06, "loss": 0.3991, "step": 13505 }, { "epoch": 0.505082131925135, "grad_norm": 0.3869224190711975, "learning_rate": 9.840272756071556e-06, "loss": 0.2395, "step": 13510 }, { "epoch": 0.5052690609154847, "grad_norm": 0.2770150601863861, "learning_rate": 9.834400958256322e-06, "loss": 0.2365, "step": 13515 }, { "epoch": 0.5054559899058345, "grad_norm": 0.1850321739912033, "learning_rate": 9.828529217551448e-06, "loss": 0.4006, "step": 13520 }, { "epoch": 0.5056429188961843, "grad_norm": 0.5925412178039551, "learning_rate": 9.822657535981936e-06, "loss": 0.3216, "step": 13525 }, { "epoch": 0.5058298478865341, "grad_norm": 0.4952634572982788, "learning_rate": 9.816785915572762e-06, "loss": 0.3249, "step": 13530 }, { "epoch": 0.5060167768768838, "grad_norm": 0.4701783061027527, "learning_rate": 9.81091435834887e-06, "loss": 0.2652, "step": 13535 }, { "epoch": 0.5062037058672337, "grad_norm": 0.8160406351089478, "learning_rate": 9.805042866335202e-06, "loss": 0.2646, "step": 13540 }, { "epoch": 0.5063906348575835, "grad_norm": 0.32033684849739075, "learning_rate": 9.79917144155666e-06, "loss": 0.3069, "step": 13545 }, { "epoch": 0.5065775638479333, "grad_norm": 0.39233770966529846, "learning_rate": 9.793300086038137e-06, "loss": 0.2425, "step": 13550 }, { "epoch": 0.5067644928382831, "grad_norm": 0.4731293022632599, "learning_rate": 9.78742880180449e-06, "loss": 0.2729, "step": 13555 }, { "epoch": 0.5069514218286328, "grad_norm": 0.45634377002716064, "learning_rate": 9.781557590880559e-06, "loss": 0.2657, "step": 13560 }, { "epoch": 0.5071383508189826, "grad_norm": 0.22461484372615814, "learning_rate": 9.775686455291153e-06, "loss": 0.3041, "step": 13565 }, { "epoch": 0.5073252798093324, "grad_norm": 0.19289721548557281, "learning_rate": 9.769815397061062e-06, "loss": 0.2881, "step": 13570 }, { "epoch": 0.5075122087996822, "grad_norm": 0.5333814024925232, "learning_rate": 9.763944418215047e-06, "loss": 0.3384, "step": 13575 }, { "epoch": 0.5076991377900321, "grad_norm": 0.3757210373878479, "learning_rate": 9.758073520777837e-06, "loss": 0.2283, "step": 13580 }, { "epoch": 0.5078860667803818, "grad_norm": 0.2797081470489502, "learning_rate": 9.75220270677414e-06, "loss": 0.2902, "step": 13585 }, { "epoch": 0.5080729957707316, "grad_norm": 0.22128711640834808, "learning_rate": 9.746331978228623e-06, "loss": 0.2467, "step": 13590 }, { "epoch": 0.5082599247610814, "grad_norm": 0.6138377785682678, "learning_rate": 9.740461337165945e-06, "loss": 0.2697, "step": 13595 }, { "epoch": 0.5084468537514312, "grad_norm": 0.43696799874305725, "learning_rate": 9.734590785610713e-06, "loss": 0.2734, "step": 13600 }, { "epoch": 0.508633782741781, "grad_norm": 0.38263434171676636, "learning_rate": 9.728720325587515e-06, "loss": 0.2258, "step": 13605 }, { "epoch": 0.5088207117321307, "grad_norm": 0.634128749370575, "learning_rate": 9.722849959120899e-06, "loss": 0.3134, "step": 13610 }, { "epoch": 0.5090076407224805, "grad_norm": 0.3166172504425049, "learning_rate": 9.716979688235392e-06, "loss": 0.309, "step": 13615 }, { "epoch": 0.5091945697128304, "grad_norm": 0.29206401109695435, "learning_rate": 9.711109514955485e-06, "loss": 0.2797, "step": 13620 }, { "epoch": 0.5093814987031802, "grad_norm": 0.648830235004425, "learning_rate": 9.705239441305626e-06, "loss": 0.2557, "step": 13625 }, { "epoch": 0.50956842769353, "grad_norm": 0.601845383644104, "learning_rate": 9.699369469310238e-06, "loss": 0.3296, "step": 13630 }, { "epoch": 0.5097553566838797, "grad_norm": 0.20830629765987396, "learning_rate": 9.693499600993705e-06, "loss": 0.2889, "step": 13635 }, { "epoch": 0.5099422856742295, "grad_norm": 0.1208687499165535, "learning_rate": 9.68762983838038e-06, "loss": 0.2389, "step": 13640 }, { "epoch": 0.5101292146645793, "grad_norm": 0.38630300760269165, "learning_rate": 9.681760183494568e-06, "loss": 0.3048, "step": 13645 }, { "epoch": 0.510316143654929, "grad_norm": 0.2682333290576935, "learning_rate": 9.675890638360556e-06, "loss": 0.3013, "step": 13650 }, { "epoch": 0.5105030726452788, "grad_norm": 0.46546033024787903, "learning_rate": 9.670021205002573e-06, "loss": 0.3102, "step": 13655 }, { "epoch": 0.5106900016356287, "grad_norm": 0.2961997985839844, "learning_rate": 9.66415188544482e-06, "loss": 0.2771, "step": 13660 }, { "epoch": 0.5108769306259785, "grad_norm": 0.4869229197502136, "learning_rate": 9.65828268171146e-06, "loss": 0.292, "step": 13665 }, { "epoch": 0.5110638596163283, "grad_norm": 0.42666685581207275, "learning_rate": 9.652413595826612e-06, "loss": 0.3738, "step": 13670 }, { "epoch": 0.511250788606678, "grad_norm": 0.5058145523071289, "learning_rate": 9.646544629814357e-06, "loss": 0.3383, "step": 13675 }, { "epoch": 0.5114377175970278, "grad_norm": 0.4218425154685974, "learning_rate": 9.640675785698726e-06, "loss": 0.2804, "step": 13680 }, { "epoch": 0.5116246465873776, "grad_norm": 0.5010436177253723, "learning_rate": 9.634807065503726e-06, "loss": 0.2663, "step": 13685 }, { "epoch": 0.5118115755777274, "grad_norm": 0.44729965925216675, "learning_rate": 9.628938471253302e-06, "loss": 0.347, "step": 13690 }, { "epoch": 0.5119985045680772, "grad_norm": 0.4109884202480316, "learning_rate": 9.62307000497137e-06, "loss": 0.2371, "step": 13695 }, { "epoch": 0.5121854335584269, "grad_norm": 0.35468733310699463, "learning_rate": 9.61720166868179e-06, "loss": 0.2445, "step": 13700 }, { "epoch": 0.5123723625487768, "grad_norm": 0.31252321600914, "learning_rate": 9.611333464408383e-06, "loss": 0.3187, "step": 13705 }, { "epoch": 0.5125592915391266, "grad_norm": 0.31071966886520386, "learning_rate": 9.605465394174933e-06, "loss": 0.235, "step": 13710 }, { "epoch": 0.5127462205294764, "grad_norm": 0.4738657474517822, "learning_rate": 9.599597460005161e-06, "loss": 0.2461, "step": 13715 }, { "epoch": 0.5129331495198262, "grad_norm": 0.35220175981521606, "learning_rate": 9.593729663922752e-06, "loss": 0.3399, "step": 13720 }, { "epoch": 0.5131200785101759, "grad_norm": 0.3583666682243347, "learning_rate": 9.587862007951343e-06, "loss": 0.307, "step": 13725 }, { "epoch": 0.5133070075005257, "grad_norm": 0.5065263509750366, "learning_rate": 9.581994494114518e-06, "loss": 0.3031, "step": 13730 }, { "epoch": 0.5134939364908755, "grad_norm": 0.5432529449462891, "learning_rate": 9.576127124435811e-06, "loss": 0.2919, "step": 13735 }, { "epoch": 0.5136808654812253, "grad_norm": 0.36350008845329285, "learning_rate": 9.570259900938717e-06, "loss": 0.284, "step": 13740 }, { "epoch": 0.5138677944715752, "grad_norm": 0.3522621691226959, "learning_rate": 9.564392825646669e-06, "loss": 0.2702, "step": 13745 }, { "epoch": 0.5140547234619249, "grad_norm": 0.24228136241436005, "learning_rate": 9.55852590058305e-06, "loss": 0.2595, "step": 13750 }, { "epoch": 0.5142416524522747, "grad_norm": 0.40059664845466614, "learning_rate": 9.552659127771204e-06, "loss": 0.3595, "step": 13755 }, { "epoch": 0.5144285814426245, "grad_norm": 0.3901945948600769, "learning_rate": 9.5467925092344e-06, "loss": 0.2465, "step": 13760 }, { "epoch": 0.5146155104329743, "grad_norm": 0.4316410422325134, "learning_rate": 9.54092604699588e-06, "loss": 0.3, "step": 13765 }, { "epoch": 0.514802439423324, "grad_norm": 0.2531914710998535, "learning_rate": 9.53505974307881e-06, "loss": 0.275, "step": 13770 }, { "epoch": 0.5149893684136738, "grad_norm": 1.1252632141113281, "learning_rate": 9.529193599506313e-06, "loss": 0.3713, "step": 13775 }, { "epoch": 0.5151762974040236, "grad_norm": 0.3661787807941437, "learning_rate": 9.52332761830145e-06, "loss": 0.2687, "step": 13780 }, { "epoch": 0.5153632263943735, "grad_norm": 0.234545037150383, "learning_rate": 9.517461801487239e-06, "loss": 0.3, "step": 13785 }, { "epoch": 0.5155501553847233, "grad_norm": 0.29935866594314575, "learning_rate": 9.51159615108662e-06, "loss": 0.3166, "step": 13790 }, { "epoch": 0.515737084375073, "grad_norm": 0.5030335783958435, "learning_rate": 9.505730669122494e-06, "loss": 0.3111, "step": 13795 }, { "epoch": 0.5159240133654228, "grad_norm": 0.33726051449775696, "learning_rate": 9.499865357617703e-06, "loss": 0.2607, "step": 13800 }, { "epoch": 0.5161109423557726, "grad_norm": 0.4101554751396179, "learning_rate": 9.494000218595015e-06, "loss": 0.297, "step": 13805 }, { "epoch": 0.5162978713461224, "grad_norm": 0.3463868796825409, "learning_rate": 9.488135254077155e-06, "loss": 0.2925, "step": 13810 }, { "epoch": 0.5164848003364721, "grad_norm": 0.15302151441574097, "learning_rate": 9.482270466086778e-06, "loss": 0.3581, "step": 13815 }, { "epoch": 0.5166717293268219, "grad_norm": 0.2611302435398102, "learning_rate": 9.476405856646485e-06, "loss": 0.2782, "step": 13820 }, { "epoch": 0.5168586583171718, "grad_norm": 0.6067391633987427, "learning_rate": 9.470541427778805e-06, "loss": 0.2788, "step": 13825 }, { "epoch": 0.5170455873075216, "grad_norm": 0.2857629954814911, "learning_rate": 9.46467718150622e-06, "loss": 0.3126, "step": 13830 }, { "epoch": 0.5172325162978714, "grad_norm": 0.45087161660194397, "learning_rate": 9.45881311985113e-06, "loss": 0.3074, "step": 13835 }, { "epoch": 0.5174194452882211, "grad_norm": 0.29453983902931213, "learning_rate": 9.452949244835893e-06, "loss": 0.2423, "step": 13840 }, { "epoch": 0.5176063742785709, "grad_norm": 0.4951644241809845, "learning_rate": 9.447085558482787e-06, "loss": 0.2675, "step": 13845 }, { "epoch": 0.5177933032689207, "grad_norm": 0.43756482005119324, "learning_rate": 9.441222062814024e-06, "loss": 0.2897, "step": 13850 }, { "epoch": 0.5179802322592705, "grad_norm": 0.6385819911956787, "learning_rate": 9.435358759851767e-06, "loss": 0.2524, "step": 13855 }, { "epoch": 0.5181671612496203, "grad_norm": 0.3710041046142578, "learning_rate": 9.42949565161809e-06, "loss": 0.2996, "step": 13860 }, { "epoch": 0.5183540902399701, "grad_norm": 0.47704362869262695, "learning_rate": 9.423632740135021e-06, "loss": 0.2501, "step": 13865 }, { "epoch": 0.5185410192303199, "grad_norm": 0.30794888734817505, "learning_rate": 9.417770027424499e-06, "loss": 0.2924, "step": 13870 }, { "epoch": 0.5187279482206697, "grad_norm": 0.567466139793396, "learning_rate": 9.411907515508415e-06, "loss": 0.3012, "step": 13875 }, { "epoch": 0.5189148772110195, "grad_norm": 0.5275660157203674, "learning_rate": 9.406045206408574e-06, "loss": 0.3539, "step": 13880 }, { "epoch": 0.5191018062013693, "grad_norm": 0.17727385461330414, "learning_rate": 9.400183102146726e-06, "loss": 0.2714, "step": 13885 }, { "epoch": 0.519288735191719, "grad_norm": 0.5720701813697815, "learning_rate": 9.394321204744538e-06, "loss": 0.2234, "step": 13890 }, { "epoch": 0.5194756641820688, "grad_norm": 0.19189219176769257, "learning_rate": 9.388459516223611e-06, "loss": 0.3371, "step": 13895 }, { "epoch": 0.5196625931724186, "grad_norm": 0.7133250832557678, "learning_rate": 9.382598038605477e-06, "loss": 0.3986, "step": 13900 }, { "epoch": 0.5198495221627685, "grad_norm": 0.3994240164756775, "learning_rate": 9.376736773911583e-06, "loss": 0.3639, "step": 13905 }, { "epoch": 0.5200364511531183, "grad_norm": 0.27001604437828064, "learning_rate": 9.370875724163322e-06, "loss": 0.2863, "step": 13910 }, { "epoch": 0.520223380143468, "grad_norm": 0.5072554349899292, "learning_rate": 9.365014891381996e-06, "loss": 0.2816, "step": 13915 }, { "epoch": 0.5204103091338178, "grad_norm": 0.4427657723426819, "learning_rate": 9.35915427758884e-06, "loss": 0.257, "step": 13920 }, { "epoch": 0.5205972381241676, "grad_norm": 0.3535575270652771, "learning_rate": 9.353293884805008e-06, "loss": 0.2465, "step": 13925 }, { "epoch": 0.5207841671145174, "grad_norm": 0.2692049443721771, "learning_rate": 9.347433715051585e-06, "loss": 0.2309, "step": 13930 }, { "epoch": 0.5209710961048671, "grad_norm": 0.316593199968338, "learning_rate": 9.341573770349579e-06, "loss": 0.2596, "step": 13935 }, { "epoch": 0.5211580250952169, "grad_norm": 0.5187430381774902, "learning_rate": 9.33571405271991e-06, "loss": 0.3025, "step": 13940 }, { "epoch": 0.5213449540855667, "grad_norm": 0.4760133922100067, "learning_rate": 9.329854564183433e-06, "loss": 0.2937, "step": 13945 }, { "epoch": 0.5215318830759166, "grad_norm": 0.36547884345054626, "learning_rate": 9.323995306760909e-06, "loss": 0.2883, "step": 13950 }, { "epoch": 0.5217188120662664, "grad_norm": 0.3512476980686188, "learning_rate": 9.31813628247304e-06, "loss": 0.283, "step": 13955 }, { "epoch": 0.5219057410566161, "grad_norm": 0.40813568234443665, "learning_rate": 9.312277493340428e-06, "loss": 0.278, "step": 13960 }, { "epoch": 0.5220926700469659, "grad_norm": 0.35714733600616455, "learning_rate": 9.306418941383602e-06, "loss": 0.2797, "step": 13965 }, { "epoch": 0.5222795990373157, "grad_norm": 0.4371785521507263, "learning_rate": 9.300560628623007e-06, "loss": 0.2286, "step": 13970 }, { "epoch": 0.5224665280276655, "grad_norm": 0.8280014991760254, "learning_rate": 9.294702557079012e-06, "loss": 0.3348, "step": 13975 }, { "epoch": 0.5226534570180152, "grad_norm": 0.6949049234390259, "learning_rate": 9.288844728771898e-06, "loss": 0.327, "step": 13980 }, { "epoch": 0.522840386008365, "grad_norm": 0.573320209980011, "learning_rate": 9.282987145721853e-06, "loss": 0.2654, "step": 13985 }, { "epoch": 0.5230273149987149, "grad_norm": 0.28571873903274536, "learning_rate": 9.277129809949004e-06, "loss": 0.3184, "step": 13990 }, { "epoch": 0.5232142439890647, "grad_norm": 0.49826252460479736, "learning_rate": 9.271272723473365e-06, "loss": 0.2586, "step": 13995 }, { "epoch": 0.5234011729794145, "grad_norm": 0.33316782116889954, "learning_rate": 9.265415888314887e-06, "loss": 0.3419, "step": 14000 }, { "epoch": 0.5235881019697642, "grad_norm": 0.589277446269989, "learning_rate": 9.25955930649342e-06, "loss": 0.2259, "step": 14005 }, { "epoch": 0.523775030960114, "grad_norm": 0.48725539445877075, "learning_rate": 9.253702980028732e-06, "loss": 0.3722, "step": 14010 }, { "epoch": 0.5239619599504638, "grad_norm": 0.7195634245872498, "learning_rate": 9.2478469109405e-06, "loss": 0.2598, "step": 14015 }, { "epoch": 0.5241488889408136, "grad_norm": 0.23791752755641937, "learning_rate": 9.241991101248314e-06, "loss": 0.2484, "step": 14020 }, { "epoch": 0.5243358179311634, "grad_norm": 0.32968446612358093, "learning_rate": 9.236135552971684e-06, "loss": 0.2552, "step": 14025 }, { "epoch": 0.5245227469215132, "grad_norm": 0.32867518067359924, "learning_rate": 9.230280268130011e-06, "loss": 0.2843, "step": 14030 }, { "epoch": 0.524709675911863, "grad_norm": 0.4443724453449249, "learning_rate": 9.22442524874262e-06, "loss": 0.313, "step": 14035 }, { "epoch": 0.5248966049022128, "grad_norm": 0.38702741265296936, "learning_rate": 9.218570496828733e-06, "loss": 0.2381, "step": 14040 }, { "epoch": 0.5250835338925626, "grad_norm": 0.27657172083854675, "learning_rate": 9.212716014407498e-06, "loss": 0.2563, "step": 14045 }, { "epoch": 0.5252704628829123, "grad_norm": 0.247044637799263, "learning_rate": 9.206861803497946e-06, "loss": 0.2164, "step": 14050 }, { "epoch": 0.5254573918732621, "grad_norm": 0.3950256407260895, "learning_rate": 9.201007866119035e-06, "loss": 0.3164, "step": 14055 }, { "epoch": 0.5256443208636119, "grad_norm": 0.4747505784034729, "learning_rate": 9.195154204289614e-06, "loss": 0.2861, "step": 14060 }, { "epoch": 0.5258312498539617, "grad_norm": 0.27522939443588257, "learning_rate": 9.189300820028444e-06, "loss": 0.3118, "step": 14065 }, { "epoch": 0.5260181788443116, "grad_norm": 0.4004516303539276, "learning_rate": 9.183447715354197e-06, "loss": 0.2722, "step": 14070 }, { "epoch": 0.5262051078346613, "grad_norm": 1.1275408267974854, "learning_rate": 9.177594892285434e-06, "loss": 0.2359, "step": 14075 }, { "epoch": 0.5263920368250111, "grad_norm": 0.221548393368721, "learning_rate": 9.171742352840628e-06, "loss": 0.2919, "step": 14080 }, { "epoch": 0.5265789658153609, "grad_norm": 0.3027479946613312, "learning_rate": 9.165890099038149e-06, "loss": 0.2954, "step": 14085 }, { "epoch": 0.5267658948057107, "grad_norm": 0.39747804403305054, "learning_rate": 9.160038132896279e-06, "loss": 0.2939, "step": 14090 }, { "epoch": 0.5269528237960605, "grad_norm": 0.3907001316547394, "learning_rate": 9.154186456433185e-06, "loss": 0.3942, "step": 14095 }, { "epoch": 0.5271397527864102, "grad_norm": 0.4810253083705902, "learning_rate": 9.148335071666949e-06, "loss": 0.2717, "step": 14100 }, { "epoch": 0.52732668177676, "grad_norm": 0.21729324758052826, "learning_rate": 9.142483980615545e-06, "loss": 0.2178, "step": 14105 }, { "epoch": 0.5275136107671099, "grad_norm": 0.331342875957489, "learning_rate": 9.13663318529684e-06, "loss": 0.3857, "step": 14110 }, { "epoch": 0.5277005397574597, "grad_norm": 0.44765812158584595, "learning_rate": 9.130782687728615e-06, "loss": 0.2857, "step": 14115 }, { "epoch": 0.5278874687478095, "grad_norm": 0.4248161315917969, "learning_rate": 9.124932489928535e-06, "loss": 0.3475, "step": 14120 }, { "epoch": 0.5280743977381592, "grad_norm": 0.2518801689147949, "learning_rate": 9.119082593914164e-06, "loss": 0.3148, "step": 14125 }, { "epoch": 0.528261326728509, "grad_norm": 0.24737538397312164, "learning_rate": 9.113233001702963e-06, "loss": 0.2533, "step": 14130 }, { "epoch": 0.5284482557188588, "grad_norm": 0.36829543113708496, "learning_rate": 9.107383715312294e-06, "loss": 0.3111, "step": 14135 }, { "epoch": 0.5286351847092086, "grad_norm": 0.42858853936195374, "learning_rate": 9.101534736759402e-06, "loss": 0.2648, "step": 14140 }, { "epoch": 0.5288221136995583, "grad_norm": 0.436758428812027, "learning_rate": 9.095686068061439e-06, "loss": 0.296, "step": 14145 }, { "epoch": 0.5290090426899082, "grad_norm": 0.4839959442615509, "learning_rate": 9.089837711235436e-06, "loss": 0.3344, "step": 14150 }, { "epoch": 0.529195971680258, "grad_norm": 0.3259483277797699, "learning_rate": 9.083989668298326e-06, "loss": 0.2556, "step": 14155 }, { "epoch": 0.5293829006706078, "grad_norm": 0.3588860332965851, "learning_rate": 9.078141941266934e-06, "loss": 0.2971, "step": 14160 }, { "epoch": 0.5295698296609576, "grad_norm": 0.33428633213043213, "learning_rate": 9.072294532157973e-06, "loss": 0.2645, "step": 14165 }, { "epoch": 0.5297567586513073, "grad_norm": 0.7300418615341187, "learning_rate": 9.066447442988044e-06, "loss": 0.3597, "step": 14170 }, { "epoch": 0.5299436876416571, "grad_norm": 0.3091430366039276, "learning_rate": 9.060600675773644e-06, "loss": 0.2246, "step": 14175 }, { "epoch": 0.5301306166320069, "grad_norm": 0.47581833600997925, "learning_rate": 9.054754232531153e-06, "loss": 0.2581, "step": 14180 }, { "epoch": 0.5303175456223567, "grad_norm": 0.4748400151729584, "learning_rate": 9.04890811527684e-06, "loss": 0.2788, "step": 14185 }, { "epoch": 0.5305044746127064, "grad_norm": 0.21533802151679993, "learning_rate": 9.04306232602687e-06, "loss": 0.2591, "step": 14190 }, { "epoch": 0.5306914036030563, "grad_norm": 0.3625734746456146, "learning_rate": 9.037216866797281e-06, "loss": 0.2998, "step": 14195 }, { "epoch": 0.5308783325934061, "grad_norm": 0.6287603378295898, "learning_rate": 9.031371739604006e-06, "loss": 0.3302, "step": 14200 }, { "epoch": 0.5310652615837559, "grad_norm": 0.28976184129714966, "learning_rate": 9.025526946462868e-06, "loss": 0.2641, "step": 14205 }, { "epoch": 0.5312521905741057, "grad_norm": 0.4596756398677826, "learning_rate": 9.01968248938956e-06, "loss": 0.2452, "step": 14210 }, { "epoch": 0.5314391195644554, "grad_norm": 0.3350590765476227, "learning_rate": 9.013838370399675e-06, "loss": 0.3145, "step": 14215 }, { "epoch": 0.5316260485548052, "grad_norm": 0.31188416481018066, "learning_rate": 9.007994591508677e-06, "loss": 0.2753, "step": 14220 }, { "epoch": 0.531812977545155, "grad_norm": 0.22735527157783508, "learning_rate": 9.002151154731922e-06, "loss": 0.3517, "step": 14225 }, { "epoch": 0.5319999065355048, "grad_norm": 0.32361796498298645, "learning_rate": 8.996308062084638e-06, "loss": 0.3578, "step": 14230 }, { "epoch": 0.5321868355258547, "grad_norm": 0.44661062955856323, "learning_rate": 8.990465315581947e-06, "loss": 0.3692, "step": 14235 }, { "epoch": 0.5323737645162044, "grad_norm": 0.6878971457481384, "learning_rate": 8.984622917238842e-06, "loss": 0.3867, "step": 14240 }, { "epoch": 0.5325606935065542, "grad_norm": 0.33011388778686523, "learning_rate": 8.978780869070198e-06, "loss": 0.2912, "step": 14245 }, { "epoch": 0.532747622496904, "grad_norm": 0.38461020588874817, "learning_rate": 8.972939173090768e-06, "loss": 0.2551, "step": 14250 }, { "epoch": 0.5329345514872538, "grad_norm": 0.6979146599769592, "learning_rate": 8.967097831315188e-06, "loss": 0.3248, "step": 14255 }, { "epoch": 0.5331214804776035, "grad_norm": 0.2943269610404968, "learning_rate": 8.961256845757973e-06, "loss": 0.2884, "step": 14260 }, { "epoch": 0.5333084094679533, "grad_norm": 0.2779577076435089, "learning_rate": 8.955416218433506e-06, "loss": 0.2509, "step": 14265 }, { "epoch": 0.5334953384583031, "grad_norm": 0.16579632461071014, "learning_rate": 8.949575951356057e-06, "loss": 0.2584, "step": 14270 }, { "epoch": 0.533682267448653, "grad_norm": 0.30751311779022217, "learning_rate": 8.94373604653976e-06, "loss": 0.2895, "step": 14275 }, { "epoch": 0.5338691964390028, "grad_norm": 4.094731330871582, "learning_rate": 8.937896505998638e-06, "loss": 0.3417, "step": 14280 }, { "epoch": 0.5340561254293525, "grad_norm": 0.3279981017112732, "learning_rate": 8.932057331746576e-06, "loss": 0.3156, "step": 14285 }, { "epoch": 0.5342430544197023, "grad_norm": 0.6543190479278564, "learning_rate": 8.926218525797342e-06, "loss": 0.2528, "step": 14290 }, { "epoch": 0.5344299834100521, "grad_norm": 0.28789252042770386, "learning_rate": 8.920380090164569e-06, "loss": 0.3179, "step": 14295 }, { "epoch": 0.5346169124004019, "grad_norm": 0.4924599230289459, "learning_rate": 8.914542026861765e-06, "loss": 0.3752, "step": 14300 }, { "epoch": 0.5348038413907517, "grad_norm": 0.33676034212112427, "learning_rate": 8.908704337902318e-06, "loss": 0.2342, "step": 14305 }, { "epoch": 0.5349907703811014, "grad_norm": 0.3200925290584564, "learning_rate": 8.902867025299475e-06, "loss": 0.3312, "step": 14310 }, { "epoch": 0.5351776993714513, "grad_norm": 0.39076340198516846, "learning_rate": 8.897030091066359e-06, "loss": 0.2409, "step": 14315 }, { "epoch": 0.5353646283618011, "grad_norm": 0.1652345061302185, "learning_rate": 8.891193537215956e-06, "loss": 0.2935, "step": 14320 }, { "epoch": 0.5355515573521509, "grad_norm": 0.33995094895362854, "learning_rate": 8.885357365761136e-06, "loss": 0.2829, "step": 14325 }, { "epoch": 0.5357384863425007, "grad_norm": 0.48137718439102173, "learning_rate": 8.879521578714617e-06, "loss": 0.3732, "step": 14330 }, { "epoch": 0.5359254153328504, "grad_norm": 0.3473806381225586, "learning_rate": 8.873686178089004e-06, "loss": 0.2582, "step": 14335 }, { "epoch": 0.5361123443232002, "grad_norm": 0.9458356499671936, "learning_rate": 8.867851165896752e-06, "loss": 0.2783, "step": 14340 }, { "epoch": 0.53629927331355, "grad_norm": 0.2733253538608551, "learning_rate": 8.862016544150192e-06, "loss": 0.3056, "step": 14345 }, { "epoch": 0.5364862023038998, "grad_norm": 0.33390501141548157, "learning_rate": 8.856182314861524e-06, "loss": 0.2556, "step": 14350 }, { "epoch": 0.5366731312942497, "grad_norm": 0.5112034678459167, "learning_rate": 8.850348480042794e-06, "loss": 0.3071, "step": 14355 }, { "epoch": 0.5368600602845994, "grad_norm": 0.3586961328983307, "learning_rate": 8.844515041705938e-06, "loss": 0.242, "step": 14360 }, { "epoch": 0.5370469892749492, "grad_norm": 0.4830436408519745, "learning_rate": 8.838682001862732e-06, "loss": 0.227, "step": 14365 }, { "epoch": 0.537233918265299, "grad_norm": 0.34037473797798157, "learning_rate": 8.83284936252483e-06, "loss": 0.3112, "step": 14370 }, { "epoch": 0.5374208472556488, "grad_norm": 0.21011002361774445, "learning_rate": 8.827017125703735e-06, "loss": 0.2921, "step": 14375 }, { "epoch": 0.5376077762459985, "grad_norm": 0.42194753885269165, "learning_rate": 8.821185293410827e-06, "loss": 0.3428, "step": 14380 }, { "epoch": 0.5377947052363483, "grad_norm": 0.48263490200042725, "learning_rate": 8.815353867657334e-06, "loss": 0.2702, "step": 14385 }, { "epoch": 0.5379816342266981, "grad_norm": 0.46013012528419495, "learning_rate": 8.809522850454343e-06, "loss": 0.3728, "step": 14390 }, { "epoch": 0.538168563217048, "grad_norm": 0.4305642247200012, "learning_rate": 8.803692243812816e-06, "loss": 0.2556, "step": 14395 }, { "epoch": 0.5383554922073978, "grad_norm": 0.3640795052051544, "learning_rate": 8.79786204974355e-06, "loss": 0.3871, "step": 14400 }, { "epoch": 0.5385424211977475, "grad_norm": 0.413748562335968, "learning_rate": 8.792032270257223e-06, "loss": 0.2976, "step": 14405 }, { "epoch": 0.5387293501880973, "grad_norm": 0.28498637676239014, "learning_rate": 8.786202907364349e-06, "loss": 0.3093, "step": 14410 }, { "epoch": 0.5389162791784471, "grad_norm": 0.3062063455581665, "learning_rate": 8.780373963075315e-06, "loss": 0.3509, "step": 14415 }, { "epoch": 0.5391032081687969, "grad_norm": 0.34864166378974915, "learning_rate": 8.774545439400352e-06, "loss": 0.3229, "step": 14420 }, { "epoch": 0.5392901371591466, "grad_norm": 0.27013546228408813, "learning_rate": 8.768717338349557e-06, "loss": 0.2775, "step": 14425 }, { "epoch": 0.5394770661494964, "grad_norm": 0.23966774344444275, "learning_rate": 8.762889661932869e-06, "loss": 0.3353, "step": 14430 }, { "epoch": 0.5396639951398462, "grad_norm": 0.3070327639579773, "learning_rate": 8.757062412160085e-06, "loss": 0.2272, "step": 14435 }, { "epoch": 0.5398509241301961, "grad_norm": 0.9705345630645752, "learning_rate": 8.751235591040867e-06, "loss": 0.2292, "step": 14440 }, { "epoch": 0.5400378531205459, "grad_norm": 0.2887144386768341, "learning_rate": 8.745409200584707e-06, "loss": 0.2384, "step": 14445 }, { "epoch": 0.5402247821108956, "grad_norm": 0.22318506240844727, "learning_rate": 8.739583242800968e-06, "loss": 0.2734, "step": 14450 }, { "epoch": 0.5404117111012454, "grad_norm": 0.4824393093585968, "learning_rate": 8.733757719698854e-06, "loss": 0.2736, "step": 14455 }, { "epoch": 0.5405986400915952, "grad_norm": 0.677008867263794, "learning_rate": 8.72793263328742e-06, "loss": 0.2748, "step": 14460 }, { "epoch": 0.540785569081945, "grad_norm": 0.32696858048439026, "learning_rate": 8.722107985575567e-06, "loss": 0.2841, "step": 14465 }, { "epoch": 0.5409724980722947, "grad_norm": 0.35395094752311707, "learning_rate": 8.716283778572058e-06, "loss": 0.3439, "step": 14470 }, { "epoch": 0.5411594270626445, "grad_norm": 0.27938494086265564, "learning_rate": 8.710460014285486e-06, "loss": 0.2786, "step": 14475 }, { "epoch": 0.5413463560529944, "grad_norm": 0.4673710763454437, "learning_rate": 8.704636694724309e-06, "loss": 0.2452, "step": 14480 }, { "epoch": 0.5415332850433442, "grad_norm": 0.4600588083267212, "learning_rate": 8.69881382189682e-06, "loss": 0.327, "step": 14485 }, { "epoch": 0.541720214033694, "grad_norm": 0.33369168639183044, "learning_rate": 8.692991397811157e-06, "loss": 0.3612, "step": 14490 }, { "epoch": 0.5419071430240437, "grad_norm": 0.4194375276565552, "learning_rate": 8.687169424475312e-06, "loss": 0.2282, "step": 14495 }, { "epoch": 0.5420940720143935, "grad_norm": 0.3811163902282715, "learning_rate": 8.681347903897115e-06, "loss": 0.324, "step": 14500 }, { "epoch": 0.5422810010047433, "grad_norm": 0.6883916854858398, "learning_rate": 8.675526838084244e-06, "loss": 0.3881, "step": 14505 }, { "epoch": 0.5424679299950931, "grad_norm": 0.36151525378227234, "learning_rate": 8.66970622904421e-06, "loss": 0.2661, "step": 14510 }, { "epoch": 0.5426548589854429, "grad_norm": 0.4361328184604645, "learning_rate": 8.663886078784386e-06, "loss": 0.3693, "step": 14515 }, { "epoch": 0.5428417879757927, "grad_norm": 0.3427176773548126, "learning_rate": 8.658066389311963e-06, "loss": 0.3023, "step": 14520 }, { "epoch": 0.5430287169661425, "grad_norm": 0.4090884029865265, "learning_rate": 8.652247162633994e-06, "loss": 0.2677, "step": 14525 }, { "epoch": 0.5432156459564923, "grad_norm": 0.4226764440536499, "learning_rate": 8.646428400757363e-06, "loss": 0.294, "step": 14530 }, { "epoch": 0.5434025749468421, "grad_norm": 0.5746899843215942, "learning_rate": 8.640610105688787e-06, "loss": 0.325, "step": 14535 }, { "epoch": 0.5435895039371919, "grad_norm": 0.37049710750579834, "learning_rate": 8.634792279434838e-06, "loss": 0.2668, "step": 14540 }, { "epoch": 0.5437764329275416, "grad_norm": 0.40856534242630005, "learning_rate": 8.62897492400191e-06, "loss": 0.3502, "step": 14545 }, { "epoch": 0.5439633619178914, "grad_norm": 0.3843344449996948, "learning_rate": 8.623158041396251e-06, "loss": 0.317, "step": 14550 }, { "epoch": 0.5441502909082412, "grad_norm": 0.4334682822227478, "learning_rate": 8.617341633623928e-06, "loss": 0.3195, "step": 14555 }, { "epoch": 0.5443372198985911, "grad_norm": 0.55138099193573, "learning_rate": 8.611525702690861e-06, "loss": 0.2328, "step": 14560 }, { "epoch": 0.5445241488889409, "grad_norm": 0.599747359752655, "learning_rate": 8.60571025060279e-06, "loss": 0.2468, "step": 14565 }, { "epoch": 0.5447110778792906, "grad_norm": 0.6509507298469543, "learning_rate": 8.599895279365303e-06, "loss": 0.3409, "step": 14570 }, { "epoch": 0.5448980068696404, "grad_norm": 0.2125466763973236, "learning_rate": 8.59408079098382e-06, "loss": 0.2632, "step": 14575 }, { "epoch": 0.5450849358599902, "grad_norm": 0.3049337565898895, "learning_rate": 8.588266787463582e-06, "loss": 0.2482, "step": 14580 }, { "epoch": 0.54527186485034, "grad_norm": 0.372995525598526, "learning_rate": 8.582453270809682e-06, "loss": 0.3199, "step": 14585 }, { "epoch": 0.5454587938406897, "grad_norm": 0.3478299379348755, "learning_rate": 8.576640243027027e-06, "loss": 0.2878, "step": 14590 }, { "epoch": 0.5456457228310395, "grad_norm": 0.32996904850006104, "learning_rate": 8.570827706120373e-06, "loss": 0.2564, "step": 14595 }, { "epoch": 0.5458326518213894, "grad_norm": 0.33126890659332275, "learning_rate": 8.565015662094289e-06, "loss": 0.2473, "step": 14600 }, { "epoch": 0.5460195808117392, "grad_norm": 0.6810954809188843, "learning_rate": 8.559204112953187e-06, "loss": 0.3061, "step": 14605 }, { "epoch": 0.546206509802089, "grad_norm": 0.286606103181839, "learning_rate": 8.5533930607013e-06, "loss": 0.4564, "step": 14610 }, { "epoch": 0.5463934387924387, "grad_norm": 0.23572850227355957, "learning_rate": 8.547582507342696e-06, "loss": 0.2598, "step": 14615 }, { "epoch": 0.5465803677827885, "grad_norm": 0.35271137952804565, "learning_rate": 8.54177245488127e-06, "loss": 0.2811, "step": 14620 }, { "epoch": 0.5467672967731383, "grad_norm": 0.3139854967594147, "learning_rate": 8.535962905320739e-06, "loss": 0.2576, "step": 14625 }, { "epoch": 0.5469542257634881, "grad_norm": 0.5428050756454468, "learning_rate": 8.530153860664657e-06, "loss": 0.2428, "step": 14630 }, { "epoch": 0.5471411547538378, "grad_norm": 0.5763229131698608, "learning_rate": 8.524345322916383e-06, "loss": 0.2484, "step": 14635 }, { "epoch": 0.5473280837441876, "grad_norm": 0.3398786783218384, "learning_rate": 8.518537294079132e-06, "loss": 0.3553, "step": 14640 }, { "epoch": 0.5475150127345375, "grad_norm": 0.40959233045578003, "learning_rate": 8.512729776155917e-06, "loss": 0.255, "step": 14645 }, { "epoch": 0.5477019417248873, "grad_norm": 0.34932756423950195, "learning_rate": 8.506922771149588e-06, "loss": 0.3235, "step": 14650 }, { "epoch": 0.5478888707152371, "grad_norm": 0.2500455379486084, "learning_rate": 8.501116281062809e-06, "loss": 0.2978, "step": 14655 }, { "epoch": 0.5480757997055868, "grad_norm": 0.37845414876937866, "learning_rate": 8.495310307898076e-06, "loss": 0.3534, "step": 14660 }, { "epoch": 0.5482627286959366, "grad_norm": 0.1609395295381546, "learning_rate": 8.489504853657707e-06, "loss": 0.2224, "step": 14665 }, { "epoch": 0.5484496576862864, "grad_norm": 0.37711602449417114, "learning_rate": 8.48369992034383e-06, "loss": 0.2195, "step": 14670 }, { "epoch": 0.5486365866766362, "grad_norm": 0.373844712972641, "learning_rate": 8.477895509958407e-06, "loss": 0.2513, "step": 14675 }, { "epoch": 0.548823515666986, "grad_norm": 0.24388177692890167, "learning_rate": 8.472091624503204e-06, "loss": 0.3141, "step": 14680 }, { "epoch": 0.5490104446573358, "grad_norm": 0.5337631106376648, "learning_rate": 8.466288265979822e-06, "loss": 0.3304, "step": 14685 }, { "epoch": 0.5491973736476856, "grad_norm": 0.3623740077018738, "learning_rate": 8.460485436389672e-06, "loss": 0.2617, "step": 14690 }, { "epoch": 0.5493843026380354, "grad_norm": 0.47312211990356445, "learning_rate": 8.454683137733982e-06, "loss": 0.2966, "step": 14695 }, { "epoch": 0.5495712316283852, "grad_norm": 0.4503675401210785, "learning_rate": 8.448881372013795e-06, "loss": 0.299, "step": 14700 }, { "epoch": 0.549758160618735, "grad_norm": 0.3221707344055176, "learning_rate": 8.443080141229978e-06, "loss": 0.2533, "step": 14705 }, { "epoch": 0.5499450896090847, "grad_norm": 0.25598636269569397, "learning_rate": 8.437279447383213e-06, "loss": 0.2354, "step": 14710 }, { "epoch": 0.5501320185994345, "grad_norm": 0.3973354399204254, "learning_rate": 8.431479292473986e-06, "loss": 0.3455, "step": 14715 }, { "epoch": 0.5503189475897843, "grad_norm": 0.30457204580307007, "learning_rate": 8.42567967850261e-06, "loss": 0.2669, "step": 14720 }, { "epoch": 0.5505058765801342, "grad_norm": 0.5336440205574036, "learning_rate": 8.4198806074692e-06, "loss": 0.3144, "step": 14725 }, { "epoch": 0.550692805570484, "grad_norm": 0.39468759298324585, "learning_rate": 8.414082081373695e-06, "loss": 0.3424, "step": 14730 }, { "epoch": 0.5508797345608337, "grad_norm": 0.5090770125389099, "learning_rate": 8.408284102215833e-06, "loss": 0.2587, "step": 14735 }, { "epoch": 0.5510666635511835, "grad_norm": 0.2587001621723175, "learning_rate": 8.40248667199518e-06, "loss": 0.2465, "step": 14740 }, { "epoch": 0.5512535925415333, "grad_norm": 0.4725680351257324, "learning_rate": 8.396689792711098e-06, "loss": 0.2382, "step": 14745 }, { "epoch": 0.551440521531883, "grad_norm": 0.4414125084877014, "learning_rate": 8.390893466362765e-06, "loss": 0.2707, "step": 14750 }, { "epoch": 0.5516274505222328, "grad_norm": 0.43323227763175964, "learning_rate": 8.385097694949171e-06, "loss": 0.2367, "step": 14755 }, { "epoch": 0.5518143795125826, "grad_norm": 0.4801645874977112, "learning_rate": 8.379302480469109e-06, "loss": 0.2377, "step": 14760 }, { "epoch": 0.5520013085029325, "grad_norm": 0.42729660868644714, "learning_rate": 8.373507824921184e-06, "loss": 0.311, "step": 14765 }, { "epoch": 0.5521882374932823, "grad_norm": 0.3292931318283081, "learning_rate": 8.3677137303038e-06, "loss": 0.2404, "step": 14770 }, { "epoch": 0.552375166483632, "grad_norm": 0.2796480655670166, "learning_rate": 8.361920198615182e-06, "loss": 0.2542, "step": 14775 }, { "epoch": 0.5525620954739818, "grad_norm": 0.1017402783036232, "learning_rate": 8.35612723185335e-06, "loss": 0.3207, "step": 14780 }, { "epoch": 0.5527490244643316, "grad_norm": 0.296451598405838, "learning_rate": 8.350334832016136e-06, "loss": 0.269, "step": 14785 }, { "epoch": 0.5529359534546814, "grad_norm": 0.4688802659511566, "learning_rate": 8.344543001101167e-06, "loss": 0.2914, "step": 14790 }, { "epoch": 0.5531228824450312, "grad_norm": 0.2229701429605484, "learning_rate": 8.33875174110588e-06, "loss": 0.3238, "step": 14795 }, { "epoch": 0.5533098114353809, "grad_norm": 0.6549943685531616, "learning_rate": 8.332961054027522e-06, "loss": 0.2368, "step": 14800 }, { "epoch": 0.5534967404257308, "grad_norm": 0.4109474718570709, "learning_rate": 8.327170941863124e-06, "loss": 0.2954, "step": 14805 }, { "epoch": 0.5536836694160806, "grad_norm": 0.4711514711380005, "learning_rate": 8.32138140660954e-06, "loss": 0.2371, "step": 14810 }, { "epoch": 0.5538705984064304, "grad_norm": 0.32290342450141907, "learning_rate": 8.31559245026341e-06, "loss": 0.2736, "step": 14815 }, { "epoch": 0.5540575273967802, "grad_norm": 0.34553197026252747, "learning_rate": 8.309804074821179e-06, "loss": 0.3564, "step": 14820 }, { "epoch": 0.5542444563871299, "grad_norm": 0.3491702377796173, "learning_rate": 8.304016282279089e-06, "loss": 0.281, "step": 14825 }, { "epoch": 0.5544313853774797, "grad_norm": 0.2792864739894867, "learning_rate": 8.298229074633192e-06, "loss": 0.2732, "step": 14830 }, { "epoch": 0.5546183143678295, "grad_norm": 0.48167872428894043, "learning_rate": 8.292442453879324e-06, "loss": 0.277, "step": 14835 }, { "epoch": 0.5548052433581793, "grad_norm": 0.563044548034668, "learning_rate": 8.286656422013122e-06, "loss": 0.3111, "step": 14840 }, { "epoch": 0.5549921723485292, "grad_norm": 0.3365761637687683, "learning_rate": 8.280870981030031e-06, "loss": 0.286, "step": 14845 }, { "epoch": 0.5551791013388789, "grad_norm": 0.3140876591205597, "learning_rate": 8.275086132925277e-06, "loss": 0.3126, "step": 14850 }, { "epoch": 0.5553660303292287, "grad_norm": 0.4990136921405792, "learning_rate": 8.269301879693892e-06, "loss": 0.2471, "step": 14855 }, { "epoch": 0.5555529593195785, "grad_norm": 0.3173409402370453, "learning_rate": 8.263518223330698e-06, "loss": 0.3087, "step": 14860 }, { "epoch": 0.5557398883099283, "grad_norm": 0.32432377338409424, "learning_rate": 8.257735165830314e-06, "loss": 0.1955, "step": 14865 }, { "epoch": 0.555926817300278, "grad_norm": 0.41975530982017517, "learning_rate": 8.251952709187145e-06, "loss": 0.2308, "step": 14870 }, { "epoch": 0.5561137462906278, "grad_norm": 0.22599612176418304, "learning_rate": 8.2461708553954e-06, "loss": 0.2998, "step": 14875 }, { "epoch": 0.5563006752809776, "grad_norm": 0.2920520603656769, "learning_rate": 8.240389606449075e-06, "loss": 0.2895, "step": 14880 }, { "epoch": 0.5564876042713274, "grad_norm": 0.2696673274040222, "learning_rate": 8.234608964341953e-06, "loss": 0.3093, "step": 14885 }, { "epoch": 0.5566745332616773, "grad_norm": 0.37283971905708313, "learning_rate": 8.228828931067618e-06, "loss": 0.3073, "step": 14890 }, { "epoch": 0.556861462252027, "grad_norm": 0.22317394614219666, "learning_rate": 8.223049508619429e-06, "loss": 0.3098, "step": 14895 }, { "epoch": 0.5570483912423768, "grad_norm": 0.5522322058677673, "learning_rate": 8.217270698990555e-06, "loss": 0.3913, "step": 14900 }, { "epoch": 0.5572353202327266, "grad_norm": 0.3208252787590027, "learning_rate": 8.21149250417393e-06, "loss": 0.2539, "step": 14905 }, { "epoch": 0.5574222492230764, "grad_norm": 0.3736477494239807, "learning_rate": 8.205714926162298e-06, "loss": 0.4014, "step": 14910 }, { "epoch": 0.5576091782134261, "grad_norm": 0.3176252543926239, "learning_rate": 8.199937966948168e-06, "loss": 0.3256, "step": 14915 }, { "epoch": 0.5577961072037759, "grad_norm": 0.3628195822238922, "learning_rate": 8.194161628523863e-06, "loss": 0.3898, "step": 14920 }, { "epoch": 0.5579830361941257, "grad_norm": 0.3258436620235443, "learning_rate": 8.18838591288146e-06, "loss": 0.2693, "step": 14925 }, { "epoch": 0.5581699651844756, "grad_norm": 0.4312792420387268, "learning_rate": 8.18261082201285e-06, "loss": 0.2869, "step": 14930 }, { "epoch": 0.5583568941748254, "grad_norm": 0.43801358342170715, "learning_rate": 8.176836357909697e-06, "loss": 0.3012, "step": 14935 }, { "epoch": 0.5585438231651751, "grad_norm": 0.4672282934188843, "learning_rate": 8.171062522563438e-06, "loss": 0.2744, "step": 14940 }, { "epoch": 0.5587307521555249, "grad_norm": 0.36364656686782837, "learning_rate": 8.165289317965314e-06, "loss": 0.3149, "step": 14945 }, { "epoch": 0.5589176811458747, "grad_norm": 0.6358222961425781, "learning_rate": 8.159516746106331e-06, "loss": 0.3848, "step": 14950 }, { "epoch": 0.5591046101362245, "grad_norm": 0.34080687165260315, "learning_rate": 8.153744808977287e-06, "loss": 0.287, "step": 14955 }, { "epoch": 0.5592915391265743, "grad_norm": 0.4259713888168335, "learning_rate": 8.147973508568753e-06, "loss": 0.2668, "step": 14960 }, { "epoch": 0.559478468116924, "grad_norm": 0.25825029611587524, "learning_rate": 8.142202846871093e-06, "loss": 0.2385, "step": 14965 }, { "epoch": 0.5596653971072739, "grad_norm": 0.5594242215156555, "learning_rate": 8.136432825874433e-06, "loss": 0.3407, "step": 14970 }, { "epoch": 0.5598523260976237, "grad_norm": 0.39524081349372864, "learning_rate": 8.130663447568696e-06, "loss": 0.2397, "step": 14975 }, { "epoch": 0.5600392550879735, "grad_norm": 0.391757071018219, "learning_rate": 8.124894713943576e-06, "loss": 0.2447, "step": 14980 }, { "epoch": 0.5602261840783233, "grad_norm": 0.4008239209651947, "learning_rate": 8.119126626988535e-06, "loss": 0.2966, "step": 14985 }, { "epoch": 0.560413113068673, "grad_norm": 0.3706965148448944, "learning_rate": 8.11335918869283e-06, "loss": 0.3095, "step": 14990 }, { "epoch": 0.5606000420590228, "grad_norm": 1.8738888502120972, "learning_rate": 8.10759240104548e-06, "loss": 0.3566, "step": 14995 }, { "epoch": 0.5607869710493726, "grad_norm": 0.28992828726768494, "learning_rate": 8.10182626603529e-06, "loss": 0.2649, "step": 15000 }, { "epoch": 0.5609739000397224, "grad_norm": 0.2337641566991806, "learning_rate": 8.096060785650829e-06, "loss": 0.2402, "step": 15005 }, { "epoch": 0.5611608290300723, "grad_norm": 0.28632795810699463, "learning_rate": 8.09029596188045e-06, "loss": 0.25, "step": 15010 }, { "epoch": 0.561347758020422, "grad_norm": 0.4603891670703888, "learning_rate": 8.08453179671227e-06, "loss": 0.3261, "step": 15015 }, { "epoch": 0.5615346870107718, "grad_norm": 0.4253883361816406, "learning_rate": 8.07876829213419e-06, "loss": 0.3169, "step": 15020 }, { "epoch": 0.5617216160011216, "grad_norm": 0.3845186233520508, "learning_rate": 8.073005450133877e-06, "loss": 0.2886, "step": 15025 }, { "epoch": 0.5619085449914714, "grad_norm": 0.31487545371055603, "learning_rate": 8.067243272698766e-06, "loss": 0.2967, "step": 15030 }, { "epoch": 0.5620954739818211, "grad_norm": 0.29043373465538025, "learning_rate": 8.061481761816073e-06, "loss": 0.2707, "step": 15035 }, { "epoch": 0.5622824029721709, "grad_norm": 0.4541192054748535, "learning_rate": 8.055720919472771e-06, "loss": 0.3279, "step": 15040 }, { "epoch": 0.5624693319625207, "grad_norm": 0.3087974190711975, "learning_rate": 8.049960747655618e-06, "loss": 0.231, "step": 15045 }, { "epoch": 0.5626562609528706, "grad_norm": 1.4350837469100952, "learning_rate": 8.044201248351125e-06, "loss": 0.3562, "step": 15050 }, { "epoch": 0.5628431899432204, "grad_norm": 0.5882908701896667, "learning_rate": 8.038442423545583e-06, "loss": 0.337, "step": 15055 }, { "epoch": 0.5630301189335701, "grad_norm": 0.3661154806613922, "learning_rate": 8.032684275225038e-06, "loss": 0.4017, "step": 15060 }, { "epoch": 0.5632170479239199, "grad_norm": 0.552562952041626, "learning_rate": 8.026926805375319e-06, "loss": 0.2177, "step": 15065 }, { "epoch": 0.5634039769142697, "grad_norm": 0.5023968815803528, "learning_rate": 8.021170015982009e-06, "loss": 0.2301, "step": 15070 }, { "epoch": 0.5635909059046195, "grad_norm": 0.22658932209014893, "learning_rate": 8.01541390903046e-06, "loss": 0.3225, "step": 15075 }, { "epoch": 0.5637778348949692, "grad_norm": 0.32941967248916626, "learning_rate": 8.00965848650579e-06, "loss": 0.264, "step": 15080 }, { "epoch": 0.563964763885319, "grad_norm": 0.5150809288024902, "learning_rate": 8.003903750392872e-06, "loss": 0.2672, "step": 15085 }, { "epoch": 0.5641516928756689, "grad_norm": 0.2671123743057251, "learning_rate": 7.99814970267636e-06, "loss": 0.2327, "step": 15090 }, { "epoch": 0.5643386218660187, "grad_norm": 0.4776862561702728, "learning_rate": 7.992396345340654e-06, "loss": 0.2703, "step": 15095 }, { "epoch": 0.5645255508563685, "grad_norm": 0.34787583351135254, "learning_rate": 7.986643680369925e-06, "loss": 0.2732, "step": 15100 }, { "epoch": 0.5647124798467182, "grad_norm": 0.34038421511650085, "learning_rate": 7.980891709748097e-06, "loss": 0.2566, "step": 15105 }, { "epoch": 0.564899408837068, "grad_norm": 0.5559023022651672, "learning_rate": 7.975140435458864e-06, "loss": 0.3003, "step": 15110 }, { "epoch": 0.5650863378274178, "grad_norm": 0.6152315139770508, "learning_rate": 7.969389859485679e-06, "loss": 0.3102, "step": 15115 }, { "epoch": 0.5652732668177676, "grad_norm": 0.28499045968055725, "learning_rate": 7.963639983811744e-06, "loss": 0.3374, "step": 15120 }, { "epoch": 0.5654601958081173, "grad_norm": 0.4572030007839203, "learning_rate": 7.957890810420033e-06, "loss": 0.2414, "step": 15125 }, { "epoch": 0.5656471247984671, "grad_norm": 0.44276192784309387, "learning_rate": 7.952142341293264e-06, "loss": 0.3226, "step": 15130 }, { "epoch": 0.565834053788817, "grad_norm": 0.44015127420425415, "learning_rate": 7.946394578413923e-06, "loss": 0.2718, "step": 15135 }, { "epoch": 0.5660209827791668, "grad_norm": 0.4518749415874481, "learning_rate": 7.940647523764251e-06, "loss": 0.2729, "step": 15140 }, { "epoch": 0.5662079117695166, "grad_norm": 0.13977321982383728, "learning_rate": 7.93490117932624e-06, "loss": 0.3054, "step": 15145 }, { "epoch": 0.5663948407598663, "grad_norm": 0.3425142765045166, "learning_rate": 7.929155547081637e-06, "loss": 0.2925, "step": 15150 }, { "epoch": 0.5665817697502161, "grad_norm": 1.439296841621399, "learning_rate": 7.923410629011947e-06, "loss": 0.2461, "step": 15155 }, { "epoch": 0.5667686987405659, "grad_norm": 0.3877362310886383, "learning_rate": 7.917666427098434e-06, "loss": 0.2503, "step": 15160 }, { "epoch": 0.5669556277309157, "grad_norm": 0.4661048352718353, "learning_rate": 7.911922943322102e-06, "loss": 0.3003, "step": 15165 }, { "epoch": 0.5671425567212655, "grad_norm": 0.5539686679840088, "learning_rate": 7.906180179663719e-06, "loss": 0.3221, "step": 15170 }, { "epoch": 0.5673294857116153, "grad_norm": 0.45953917503356934, "learning_rate": 7.900438138103791e-06, "loss": 0.2582, "step": 15175 }, { "epoch": 0.5675164147019651, "grad_norm": 0.5768008828163147, "learning_rate": 7.894696820622594e-06, "loss": 0.239, "step": 15180 }, { "epoch": 0.5677033436923149, "grad_norm": 0.3129447400569916, "learning_rate": 7.888956229200134e-06, "loss": 0.3238, "step": 15185 }, { "epoch": 0.5678902726826647, "grad_norm": 0.3107287287712097, "learning_rate": 7.883216365816186e-06, "loss": 0.2606, "step": 15190 }, { "epoch": 0.5680772016730145, "grad_norm": 0.35008296370506287, "learning_rate": 7.877477232450258e-06, "loss": 0.252, "step": 15195 }, { "epoch": 0.5682641306633642, "grad_norm": 0.33052533864974976, "learning_rate": 7.871738831081613e-06, "loss": 0.2242, "step": 15200 }, { "epoch": 0.568451059653714, "grad_norm": 0.36529335379600525, "learning_rate": 7.866001163689264e-06, "loss": 0.4166, "step": 15205 }, { "epoch": 0.5686379886440638, "grad_norm": 0.5268134474754333, "learning_rate": 7.860264232251968e-06, "loss": 0.3337, "step": 15210 }, { "epoch": 0.5688249176344137, "grad_norm": 0.5382173657417297, "learning_rate": 7.85452803874823e-06, "loss": 0.3197, "step": 15215 }, { "epoch": 0.5690118466247635, "grad_norm": 0.29692742228507996, "learning_rate": 7.84879258515629e-06, "loss": 0.3722, "step": 15220 }, { "epoch": 0.5691987756151132, "grad_norm": 0.31395187973976135, "learning_rate": 7.843057873454151e-06, "loss": 0.2898, "step": 15225 }, { "epoch": 0.569385704605463, "grad_norm": 0.16513025760650635, "learning_rate": 7.837323905619543e-06, "loss": 0.2928, "step": 15230 }, { "epoch": 0.5695726335958128, "grad_norm": 0.396751344203949, "learning_rate": 7.831590683629957e-06, "loss": 0.3757, "step": 15235 }, { "epoch": 0.5697595625861626, "grad_norm": 0.6405811905860901, "learning_rate": 7.825858209462609e-06, "loss": 0.3309, "step": 15240 }, { "epoch": 0.5699464915765123, "grad_norm": 0.3609001934528351, "learning_rate": 7.820126485094465e-06, "loss": 0.1863, "step": 15245 }, { "epoch": 0.5701334205668621, "grad_norm": 0.3763851821422577, "learning_rate": 7.814395512502239e-06, "loss": 0.254, "step": 15250 }, { "epoch": 0.570320349557212, "grad_norm": 0.37580713629722595, "learning_rate": 7.80866529366237e-06, "loss": 0.2708, "step": 15255 }, { "epoch": 0.5705072785475618, "grad_norm": 0.4320635497570038, "learning_rate": 7.802935830551058e-06, "loss": 0.2882, "step": 15260 }, { "epoch": 0.5706942075379116, "grad_norm": 0.4472927153110504, "learning_rate": 7.797207125144222e-06, "loss": 0.2909, "step": 15265 }, { "epoch": 0.5708811365282613, "grad_norm": 0.32192474603652954, "learning_rate": 7.791479179417532e-06, "loss": 0.2998, "step": 15270 }, { "epoch": 0.5710680655186111, "grad_norm": 0.39683884382247925, "learning_rate": 7.785751995346385e-06, "loss": 0.2981, "step": 15275 }, { "epoch": 0.5712549945089609, "grad_norm": 0.5190780758857727, "learning_rate": 7.780025574905935e-06, "loss": 0.2564, "step": 15280 }, { "epoch": 0.5714419234993107, "grad_norm": 0.41170167922973633, "learning_rate": 7.774299920071052e-06, "loss": 0.3135, "step": 15285 }, { "epoch": 0.5716288524896604, "grad_norm": 0.5996455550193787, "learning_rate": 7.768575032816347e-06, "loss": 0.3116, "step": 15290 }, { "epoch": 0.5718157814800103, "grad_norm": 0.40233224630355835, "learning_rate": 7.762850915116183e-06, "loss": 0.261, "step": 15295 }, { "epoch": 0.5720027104703601, "grad_norm": 1.258935809135437, "learning_rate": 7.757127568944629e-06, "loss": 0.2613, "step": 15300 }, { "epoch": 0.5721896394607099, "grad_norm": 0.4860970377922058, "learning_rate": 7.751404996275515e-06, "loss": 0.3349, "step": 15305 }, { "epoch": 0.5723765684510597, "grad_norm": 0.6576589941978455, "learning_rate": 7.745683199082385e-06, "loss": 0.3026, "step": 15310 }, { "epoch": 0.5725634974414094, "grad_norm": 0.39305242896080017, "learning_rate": 7.739962179338528e-06, "loss": 0.3438, "step": 15315 }, { "epoch": 0.5727504264317592, "grad_norm": 0.23568610846996307, "learning_rate": 7.734241939016953e-06, "loss": 0.2488, "step": 15320 }, { "epoch": 0.572937355422109, "grad_norm": 0.39915916323661804, "learning_rate": 7.728522480090415e-06, "loss": 0.3723, "step": 15325 }, { "epoch": 0.5731242844124588, "grad_norm": 0.4517640173435211, "learning_rate": 7.722803804531385e-06, "loss": 0.2179, "step": 15330 }, { "epoch": 0.5733112134028087, "grad_norm": 0.5729992985725403, "learning_rate": 7.717085914312071e-06, "loss": 0.2954, "step": 15335 }, { "epoch": 0.5734981423931584, "grad_norm": 0.26858532428741455, "learning_rate": 7.711368811404417e-06, "loss": 0.3368, "step": 15340 }, { "epoch": 0.5736850713835082, "grad_norm": 0.3362521231174469, "learning_rate": 7.705652497780076e-06, "loss": 0.2627, "step": 15345 }, { "epoch": 0.573872000373858, "grad_norm": 0.5297291874885559, "learning_rate": 7.699936975410452e-06, "loss": 0.3659, "step": 15350 }, { "epoch": 0.5740589293642078, "grad_norm": 0.8421899080276489, "learning_rate": 7.694222246266659e-06, "loss": 0.2792, "step": 15355 }, { "epoch": 0.5742458583545575, "grad_norm": 0.3951667845249176, "learning_rate": 7.688508312319545e-06, "loss": 0.3122, "step": 15360 }, { "epoch": 0.5744327873449073, "grad_norm": 0.23748940229415894, "learning_rate": 7.682795175539677e-06, "loss": 0.3775, "step": 15365 }, { "epoch": 0.5746197163352571, "grad_norm": 0.42018768191337585, "learning_rate": 7.677082837897362e-06, "loss": 0.2916, "step": 15370 }, { "epoch": 0.5748066453256069, "grad_norm": 0.21799716353416443, "learning_rate": 7.671371301362613e-06, "loss": 0.2834, "step": 15375 }, { "epoch": 0.5749935743159568, "grad_norm": 0.3239682912826538, "learning_rate": 7.66566056790518e-06, "loss": 0.3178, "step": 15380 }, { "epoch": 0.5751805033063065, "grad_norm": 0.2807358205318451, "learning_rate": 7.659950639494531e-06, "loss": 0.2459, "step": 15385 }, { "epoch": 0.5753674322966563, "grad_norm": 0.39530667662620544, "learning_rate": 7.654241518099851e-06, "loss": 0.306, "step": 15390 }, { "epoch": 0.5755543612870061, "grad_norm": 0.2550390362739563, "learning_rate": 7.648533205690062e-06, "loss": 0.3522, "step": 15395 }, { "epoch": 0.5757412902773559, "grad_norm": 0.4049346148967743, "learning_rate": 7.64282570423379e-06, "loss": 0.3467, "step": 15400 }, { "epoch": 0.5759282192677057, "grad_norm": 0.7592940926551819, "learning_rate": 7.637119015699394e-06, "loss": 0.2773, "step": 15405 }, { "epoch": 0.5761151482580554, "grad_norm": 0.43101072311401367, "learning_rate": 7.631413142054938e-06, "loss": 0.2392, "step": 15410 }, { "epoch": 0.5763020772484052, "grad_norm": 0.2688102126121521, "learning_rate": 7.625708085268227e-06, "loss": 0.2488, "step": 15415 }, { "epoch": 0.5764890062387551, "grad_norm": 0.4400135576725006, "learning_rate": 7.620003847306761e-06, "loss": 0.3319, "step": 15420 }, { "epoch": 0.5766759352291049, "grad_norm": 0.35837605595588684, "learning_rate": 7.6143004301377735e-06, "loss": 0.3054, "step": 15425 }, { "epoch": 0.5768628642194547, "grad_norm": 0.42280635237693787, "learning_rate": 7.6085978357282105e-06, "loss": 0.31, "step": 15430 }, { "epoch": 0.5770497932098044, "grad_norm": 0.36921992897987366, "learning_rate": 7.60289606604473e-06, "loss": 0.2359, "step": 15435 }, { "epoch": 0.5772367222001542, "grad_norm": 0.676522433757782, "learning_rate": 7.597195123053711e-06, "loss": 0.2833, "step": 15440 }, { "epoch": 0.577423651190504, "grad_norm": 0.4574277102947235, "learning_rate": 7.591495008721243e-06, "loss": 0.2164, "step": 15445 }, { "epoch": 0.5776105801808538, "grad_norm": 0.3864579200744629, "learning_rate": 7.585795725013138e-06, "loss": 0.3748, "step": 15450 }, { "epoch": 0.5777975091712035, "grad_norm": 0.5847076177597046, "learning_rate": 7.580097273894911e-06, "loss": 0.3585, "step": 15455 }, { "epoch": 0.5779844381615534, "grad_norm": 0.2284291684627533, "learning_rate": 7.574399657331796e-06, "loss": 0.3382, "step": 15460 }, { "epoch": 0.5781713671519032, "grad_norm": 0.5911282300949097, "learning_rate": 7.568702877288732e-06, "loss": 0.2877, "step": 15465 }, { "epoch": 0.578358296142253, "grad_norm": 0.2202225774526596, "learning_rate": 7.5630069357303835e-06, "loss": 0.2816, "step": 15470 }, { "epoch": 0.5785452251326028, "grad_norm": 0.4826430082321167, "learning_rate": 7.557311834621116e-06, "loss": 0.2662, "step": 15475 }, { "epoch": 0.5787321541229525, "grad_norm": 0.23101915419101715, "learning_rate": 7.551617575925001e-06, "loss": 0.3696, "step": 15480 }, { "epoch": 0.5789190831133023, "grad_norm": 0.4637044668197632, "learning_rate": 7.545924161605832e-06, "loss": 0.3342, "step": 15485 }, { "epoch": 0.5791060121036521, "grad_norm": 0.3432970345020294, "learning_rate": 7.540231593627098e-06, "loss": 0.2257, "step": 15490 }, { "epoch": 0.5792929410940019, "grad_norm": 0.18397098779678345, "learning_rate": 7.5345398739520105e-06, "loss": 0.2383, "step": 15495 }, { "epoch": 0.5794798700843518, "grad_norm": 0.47786015272140503, "learning_rate": 7.528849004543473e-06, "loss": 0.3306, "step": 15500 }, { "epoch": 0.5796667990747015, "grad_norm": 0.2156762033700943, "learning_rate": 7.52315898736411e-06, "loss": 0.3126, "step": 15505 }, { "epoch": 0.5798537280650513, "grad_norm": 0.22711989283561707, "learning_rate": 7.517469824376238e-06, "loss": 0.2556, "step": 15510 }, { "epoch": 0.5800406570554011, "grad_norm": 0.4607297480106354, "learning_rate": 7.5117815175418914e-06, "loss": 0.3023, "step": 15515 }, { "epoch": 0.5802275860457509, "grad_norm": 0.43681520223617554, "learning_rate": 7.506094068822801e-06, "loss": 0.2141, "step": 15520 }, { "epoch": 0.5804145150361006, "grad_norm": 0.33917364478111267, "learning_rate": 7.50040748018041e-06, "loss": 0.3088, "step": 15525 }, { "epoch": 0.5806014440264504, "grad_norm": 0.2788265347480774, "learning_rate": 7.494721753575856e-06, "loss": 0.2605, "step": 15530 }, { "epoch": 0.5807883730168002, "grad_norm": 0.34832385182380676, "learning_rate": 7.489036890969981e-06, "loss": 0.2741, "step": 15535 }, { "epoch": 0.5809753020071501, "grad_norm": 0.3322335481643677, "learning_rate": 7.483352894323339e-06, "loss": 0.2696, "step": 15540 }, { "epoch": 0.5811622309974999, "grad_norm": 0.27210044860839844, "learning_rate": 7.47766976559617e-06, "loss": 0.272, "step": 15545 }, { "epoch": 0.5813491599878496, "grad_norm": 0.4030235707759857, "learning_rate": 7.471987506748426e-06, "loss": 0.3164, "step": 15550 }, { "epoch": 0.5815360889781994, "grad_norm": 0.31630632281303406, "learning_rate": 7.466306119739751e-06, "loss": 0.2757, "step": 15555 }, { "epoch": 0.5817230179685492, "grad_norm": 0.34428203105926514, "learning_rate": 7.4606256065295e-06, "loss": 0.3276, "step": 15560 }, { "epoch": 0.581909946958899, "grad_norm": 0.41596320271492004, "learning_rate": 7.4549459690767105e-06, "loss": 0.3167, "step": 15565 }, { "epoch": 0.5820968759492487, "grad_norm": 0.275072306394577, "learning_rate": 7.4492672093401345e-06, "loss": 0.2606, "step": 15570 }, { "epoch": 0.5822838049395985, "grad_norm": 0.29760444164276123, "learning_rate": 7.443589329278211e-06, "loss": 0.3137, "step": 15575 }, { "epoch": 0.5824707339299484, "grad_norm": 0.40786728262901306, "learning_rate": 7.4379123308490735e-06, "loss": 0.2841, "step": 15580 }, { "epoch": 0.5826576629202982, "grad_norm": 0.40761783719062805, "learning_rate": 7.432236216010564e-06, "loss": 0.2717, "step": 15585 }, { "epoch": 0.582844591910648, "grad_norm": 0.36162450909614563, "learning_rate": 7.426560986720206e-06, "loss": 0.314, "step": 15590 }, { "epoch": 0.5830315209009977, "grad_norm": 0.36004146933555603, "learning_rate": 7.4208866449352275e-06, "loss": 0.3408, "step": 15595 }, { "epoch": 0.5832184498913475, "grad_norm": 0.42730987071990967, "learning_rate": 7.4152131926125405e-06, "loss": 0.2853, "step": 15600 }, { "epoch": 0.5834053788816973, "grad_norm": 0.4817928671836853, "learning_rate": 7.409540631708763e-06, "loss": 0.3038, "step": 15605 }, { "epoch": 0.5835923078720471, "grad_norm": 0.20412497222423553, "learning_rate": 7.403868964180192e-06, "loss": 0.262, "step": 15610 }, { "epoch": 0.5837792368623969, "grad_norm": 0.18026775121688843, "learning_rate": 7.398198191982828e-06, "loss": 0.2356, "step": 15615 }, { "epoch": 0.5839661658527466, "grad_norm": 0.22581899166107178, "learning_rate": 7.39252831707236e-06, "loss": 0.3154, "step": 15620 }, { "epoch": 0.5841530948430965, "grad_norm": 0.26380541920661926, "learning_rate": 7.386859341404158e-06, "loss": 0.3565, "step": 15625 }, { "epoch": 0.5843400238334463, "grad_norm": 0.4216972291469574, "learning_rate": 7.3811912669332965e-06, "loss": 0.2868, "step": 15630 }, { "epoch": 0.5845269528237961, "grad_norm": 0.4482162892818451, "learning_rate": 7.375524095614524e-06, "loss": 0.2696, "step": 15635 }, { "epoch": 0.5847138818141459, "grad_norm": 0.34576284885406494, "learning_rate": 7.369857829402294e-06, "loss": 0.2722, "step": 15640 }, { "epoch": 0.5849008108044956, "grad_norm": 0.35120487213134766, "learning_rate": 7.364192470250735e-06, "loss": 0.2365, "step": 15645 }, { "epoch": 0.5850877397948454, "grad_norm": 0.5268846154212952, "learning_rate": 7.358528020113669e-06, "loss": 0.3953, "step": 15650 }, { "epoch": 0.5852746687851952, "grad_norm": 0.5416049957275391, "learning_rate": 7.352864480944597e-06, "loss": 0.3447, "step": 15655 }, { "epoch": 0.585461597775545, "grad_norm": 0.3339357376098633, "learning_rate": 7.3472018546967175e-06, "loss": 0.3417, "step": 15660 }, { "epoch": 0.5856485267658949, "grad_norm": 0.4160725474357605, "learning_rate": 7.341540143322907e-06, "loss": 0.331, "step": 15665 }, { "epoch": 0.5858354557562446, "grad_norm": 0.4474146366119385, "learning_rate": 7.335879348775724e-06, "loss": 0.2705, "step": 15670 }, { "epoch": 0.5860223847465944, "grad_norm": 0.39802151918411255, "learning_rate": 7.33021947300742e-06, "loss": 0.2973, "step": 15675 }, { "epoch": 0.5862093137369442, "grad_norm": 0.3424316346645355, "learning_rate": 7.324560517969918e-06, "loss": 0.2956, "step": 15680 }, { "epoch": 0.586396242727294, "grad_norm": 0.31695395708084106, "learning_rate": 7.318902485614836e-06, "loss": 0.2828, "step": 15685 }, { "epoch": 0.5865831717176437, "grad_norm": 0.30436766147613525, "learning_rate": 7.313245377893461e-06, "loss": 0.2704, "step": 15690 }, { "epoch": 0.5867701007079935, "grad_norm": 0.349795401096344, "learning_rate": 7.307589196756772e-06, "loss": 0.3184, "step": 15695 }, { "epoch": 0.5869570296983433, "grad_norm": 0.3413912057876587, "learning_rate": 7.301933944155417e-06, "loss": 0.2635, "step": 15700 }, { "epoch": 0.5871439586886932, "grad_norm": 0.3070078194141388, "learning_rate": 7.296279622039737e-06, "loss": 0.36, "step": 15705 }, { "epoch": 0.587330887679043, "grad_norm": 0.29595261812210083, "learning_rate": 7.290626232359746e-06, "loss": 0.2837, "step": 15710 }, { "epoch": 0.5875178166693927, "grad_norm": 0.47354787588119507, "learning_rate": 7.284973777065134e-06, "loss": 0.2849, "step": 15715 }, { "epoch": 0.5877047456597425, "grad_norm": 0.5047826766967773, "learning_rate": 7.279322258105272e-06, "loss": 0.2876, "step": 15720 }, { "epoch": 0.5878916746500923, "grad_norm": 0.5029893517494202, "learning_rate": 7.273671677429202e-06, "loss": 0.3886, "step": 15725 }, { "epoch": 0.5880786036404421, "grad_norm": 0.27061954140663147, "learning_rate": 7.2680220369856546e-06, "loss": 0.3005, "step": 15730 }, { "epoch": 0.5882655326307918, "grad_norm": 0.5909748673439026, "learning_rate": 7.2623733387230245e-06, "loss": 0.2534, "step": 15735 }, { "epoch": 0.5884524616211416, "grad_norm": 0.3261001408100128, "learning_rate": 7.256725584589388e-06, "loss": 0.2761, "step": 15740 }, { "epoch": 0.5886393906114915, "grad_norm": 0.47955918312072754, "learning_rate": 7.25107877653249e-06, "loss": 0.2752, "step": 15745 }, { "epoch": 0.5888263196018413, "grad_norm": 0.49793973565101624, "learning_rate": 7.245432916499755e-06, "loss": 0.3089, "step": 15750 }, { "epoch": 0.5890132485921911, "grad_norm": 0.5249269604682922, "learning_rate": 7.2397880064382816e-06, "loss": 0.3462, "step": 15755 }, { "epoch": 0.5892001775825408, "grad_norm": 0.7072009444236755, "learning_rate": 7.234144048294833e-06, "loss": 0.2022, "step": 15760 }, { "epoch": 0.5893871065728906, "grad_norm": 0.22332626581192017, "learning_rate": 7.228501044015854e-06, "loss": 0.3619, "step": 15765 }, { "epoch": 0.5895740355632404, "grad_norm": 0.6335933208465576, "learning_rate": 7.222858995547446e-06, "loss": 0.2363, "step": 15770 }, { "epoch": 0.5897609645535902, "grad_norm": 0.40368562936782837, "learning_rate": 7.2172179048354e-06, "loss": 0.3412, "step": 15775 }, { "epoch": 0.58994789354394, "grad_norm": 0.41344451904296875, "learning_rate": 7.211577773825157e-06, "loss": 0.3029, "step": 15780 }, { "epoch": 0.5901348225342898, "grad_norm": 0.15166114270687103, "learning_rate": 7.205938604461846e-06, "loss": 0.2663, "step": 15785 }, { "epoch": 0.5903217515246396, "grad_norm": 0.18254537880420685, "learning_rate": 7.2003003986902474e-06, "loss": 0.2779, "step": 15790 }, { "epoch": 0.5905086805149894, "grad_norm": 0.6421458125114441, "learning_rate": 7.19466315845482e-06, "loss": 0.2935, "step": 15795 }, { "epoch": 0.5906956095053392, "grad_norm": 0.296409010887146, "learning_rate": 7.189026885699688e-06, "loss": 0.2955, "step": 15800 }, { "epoch": 0.590882538495689, "grad_norm": 0.444355845451355, "learning_rate": 7.183391582368637e-06, "loss": 0.2687, "step": 15805 }, { "epoch": 0.5910694674860387, "grad_norm": 0.2945992052555084, "learning_rate": 7.177757250405126e-06, "loss": 0.2612, "step": 15810 }, { "epoch": 0.5912563964763885, "grad_norm": 0.2884933650493622, "learning_rate": 7.172123891752268e-06, "loss": 0.2812, "step": 15815 }, { "epoch": 0.5914433254667383, "grad_norm": 0.5446357727050781, "learning_rate": 7.166491508352853e-06, "loss": 0.2699, "step": 15820 }, { "epoch": 0.5916302544570882, "grad_norm": 0.3308735489845276, "learning_rate": 7.160860102149323e-06, "loss": 0.2325, "step": 15825 }, { "epoch": 0.5918171834474379, "grad_norm": 0.5215713977813721, "learning_rate": 7.155229675083797e-06, "loss": 0.269, "step": 15830 }, { "epoch": 0.5920041124377877, "grad_norm": 0.39702996611595154, "learning_rate": 7.1496002290980415e-06, "loss": 0.3225, "step": 15835 }, { "epoch": 0.5921910414281375, "grad_norm": 0.3645925521850586, "learning_rate": 7.14397176613349e-06, "loss": 0.2705, "step": 15840 }, { "epoch": 0.5923779704184873, "grad_norm": 0.32077428698539734, "learning_rate": 7.138344288131245e-06, "loss": 0.3051, "step": 15845 }, { "epoch": 0.592564899408837, "grad_norm": 0.5416418313980103, "learning_rate": 7.132717797032056e-06, "loss": 0.3141, "step": 15850 }, { "epoch": 0.5927518283991868, "grad_norm": 0.5128157734870911, "learning_rate": 7.127092294776343e-06, "loss": 0.3008, "step": 15855 }, { "epoch": 0.5929387573895366, "grad_norm": 0.820091962814331, "learning_rate": 7.121467783304174e-06, "loss": 0.3023, "step": 15860 }, { "epoch": 0.5931256863798864, "grad_norm": 0.2677062153816223, "learning_rate": 7.1158442645552896e-06, "loss": 0.2956, "step": 15865 }, { "epoch": 0.5933126153702363, "grad_norm": 0.3159048855304718, "learning_rate": 7.110221740469074e-06, "loss": 0.3155, "step": 15870 }, { "epoch": 0.593499544360586, "grad_norm": 0.2539639472961426, "learning_rate": 7.10460021298458e-06, "loss": 0.3094, "step": 15875 }, { "epoch": 0.5936864733509358, "grad_norm": 0.36676567792892456, "learning_rate": 7.098979684040508e-06, "loss": 0.2947, "step": 15880 }, { "epoch": 0.5938734023412856, "grad_norm": 0.3667091131210327, "learning_rate": 7.093360155575218e-06, "loss": 0.2889, "step": 15885 }, { "epoch": 0.5940603313316354, "grad_norm": 0.6631130576133728, "learning_rate": 7.087741629526726e-06, "loss": 0.2206, "step": 15890 }, { "epoch": 0.5942472603219852, "grad_norm": 0.7274545431137085, "learning_rate": 7.082124107832695e-06, "loss": 0.3667, "step": 15895 }, { "epoch": 0.5944341893123349, "grad_norm": 0.5482187867164612, "learning_rate": 7.076507592430457e-06, "loss": 0.3779, "step": 15900 }, { "epoch": 0.5946211183026847, "grad_norm": 0.4760439991950989, "learning_rate": 7.070892085256978e-06, "loss": 0.2652, "step": 15905 }, { "epoch": 0.5948080472930346, "grad_norm": 0.27402257919311523, "learning_rate": 7.065277588248893e-06, "loss": 0.2663, "step": 15910 }, { "epoch": 0.5949949762833844, "grad_norm": 0.5011516213417053, "learning_rate": 7.059664103342473e-06, "loss": 0.364, "step": 15915 }, { "epoch": 0.5951819052737342, "grad_norm": 0.3318321108818054, "learning_rate": 7.0540516324736556e-06, "loss": 0.3262, "step": 15920 }, { "epoch": 0.5953688342640839, "grad_norm": 0.34024515748023987, "learning_rate": 7.0484401775780175e-06, "loss": 0.3343, "step": 15925 }, { "epoch": 0.5955557632544337, "grad_norm": 0.2316112518310547, "learning_rate": 7.0428297405907865e-06, "loss": 0.3235, "step": 15930 }, { "epoch": 0.5957426922447835, "grad_norm": 0.3854818046092987, "learning_rate": 7.0372203234468474e-06, "loss": 0.3301, "step": 15935 }, { "epoch": 0.5959296212351333, "grad_norm": 0.34826406836509705, "learning_rate": 7.031611928080721e-06, "loss": 0.2321, "step": 15940 }, { "epoch": 0.596116550225483, "grad_norm": 0.5255212783813477, "learning_rate": 7.02600455642659e-06, "loss": 0.2714, "step": 15945 }, { "epoch": 0.5963034792158329, "grad_norm": 0.4153469204902649, "learning_rate": 7.020398210418269e-06, "loss": 0.3229, "step": 15950 }, { "epoch": 0.5964904082061827, "grad_norm": 0.532195508480072, "learning_rate": 7.014792891989232e-06, "loss": 0.2495, "step": 15955 }, { "epoch": 0.5966773371965325, "grad_norm": 0.12149854749441147, "learning_rate": 7.009188603072586e-06, "loss": 0.2962, "step": 15960 }, { "epoch": 0.5968642661868823, "grad_norm": 0.5596807599067688, "learning_rate": 7.003585345601095e-06, "loss": 0.342, "step": 15965 }, { "epoch": 0.597051195177232, "grad_norm": 0.4083070456981659, "learning_rate": 6.9979831215071566e-06, "loss": 0.2877, "step": 15970 }, { "epoch": 0.5972381241675818, "grad_norm": 0.5761303901672363, "learning_rate": 6.9923819327228235e-06, "loss": 0.2234, "step": 15975 }, { "epoch": 0.5974250531579316, "grad_norm": 0.42254355549812317, "learning_rate": 6.986781781179786e-06, "loss": 0.3016, "step": 15980 }, { "epoch": 0.5976119821482814, "grad_norm": 0.43202266097068787, "learning_rate": 6.981182668809365e-06, "loss": 0.3697, "step": 15985 }, { "epoch": 0.5977989111386313, "grad_norm": 0.30627140402793884, "learning_rate": 6.975584597542549e-06, "loss": 0.3126, "step": 15990 }, { "epoch": 0.597985840128981, "grad_norm": 0.5201455950737, "learning_rate": 6.9699875693099415e-06, "loss": 0.2483, "step": 15995 }, { "epoch": 0.5981727691193308, "grad_norm": 0.49727630615234375, "learning_rate": 6.964391586041803e-06, "loss": 0.3368, "step": 16000 }, { "epoch": 0.5983596981096806, "grad_norm": 0.3812571167945862, "learning_rate": 6.95879664966802e-06, "loss": 0.3413, "step": 16005 }, { "epoch": 0.5985466271000304, "grad_norm": 0.35332605242729187, "learning_rate": 6.953202762118137e-06, "loss": 0.2443, "step": 16010 }, { "epoch": 0.5987335560903801, "grad_norm": 0.45708781480789185, "learning_rate": 6.947609925321314e-06, "loss": 0.2457, "step": 16015 }, { "epoch": 0.5989204850807299, "grad_norm": 0.43644988536834717, "learning_rate": 6.942018141206368e-06, "loss": 0.2944, "step": 16020 }, { "epoch": 0.5991074140710797, "grad_norm": 0.4386439323425293, "learning_rate": 6.9364274117017446e-06, "loss": 0.3314, "step": 16025 }, { "epoch": 0.5992943430614296, "grad_norm": 0.48612180352211, "learning_rate": 6.930837738735521e-06, "loss": 0.2833, "step": 16030 }, { "epoch": 0.5994812720517794, "grad_norm": 0.5489234924316406, "learning_rate": 6.925249124235423e-06, "loss": 0.3497, "step": 16035 }, { "epoch": 0.5996682010421291, "grad_norm": 0.47888511419296265, "learning_rate": 6.919661570128796e-06, "loss": 0.3317, "step": 16040 }, { "epoch": 0.5998551300324789, "grad_norm": 0.12318491190671921, "learning_rate": 6.914075078342632e-06, "loss": 0.2215, "step": 16045 }, { "epoch": 0.6000420590228287, "grad_norm": 0.5285260081291199, "learning_rate": 6.908489650803549e-06, "loss": 0.2446, "step": 16050 }, { "epoch": 0.6002289880131785, "grad_norm": 0.34625720977783203, "learning_rate": 6.902905289437807e-06, "loss": 0.2358, "step": 16055 }, { "epoch": 0.6004159170035283, "grad_norm": 0.5598260760307312, "learning_rate": 6.897321996171281e-06, "loss": 0.2095, "step": 16060 }, { "epoch": 0.600602845993878, "grad_norm": 0.35202714800834656, "learning_rate": 6.891739772929499e-06, "loss": 0.2681, "step": 16065 }, { "epoch": 0.6007897749842279, "grad_norm": 0.39519229531288147, "learning_rate": 6.886158621637608e-06, "loss": 0.2615, "step": 16070 }, { "epoch": 0.6009767039745777, "grad_norm": 0.6391472220420837, "learning_rate": 6.880578544220382e-06, "loss": 0.2642, "step": 16075 }, { "epoch": 0.6011636329649275, "grad_norm": 0.3933602571487427, "learning_rate": 6.874999542602237e-06, "loss": 0.2837, "step": 16080 }, { "epoch": 0.6013505619552773, "grad_norm": 0.6235082745552063, "learning_rate": 6.8694216187072015e-06, "loss": 0.3124, "step": 16085 }, { "epoch": 0.601537490945627, "grad_norm": 0.46364444494247437, "learning_rate": 6.863844774458954e-06, "loss": 0.2186, "step": 16090 }, { "epoch": 0.6017244199359768, "grad_norm": 0.37542879581451416, "learning_rate": 6.8582690117807784e-06, "loss": 0.28, "step": 16095 }, { "epoch": 0.6019113489263266, "grad_norm": 0.4766859710216522, "learning_rate": 6.852694332595601e-06, "loss": 0.2415, "step": 16100 }, { "epoch": 0.6020982779166764, "grad_norm": 0.384067177772522, "learning_rate": 6.847120738825962e-06, "loss": 0.2473, "step": 16105 }, { "epoch": 0.6022852069070261, "grad_norm": 0.3013162314891815, "learning_rate": 6.841548232394041e-06, "loss": 0.2942, "step": 16110 }, { "epoch": 0.602472135897376, "grad_norm": 0.4402746558189392, "learning_rate": 6.835976815221637e-06, "loss": 0.2965, "step": 16115 }, { "epoch": 0.6026590648877258, "grad_norm": 0.42539361119270325, "learning_rate": 6.830406489230162e-06, "loss": 0.2081, "step": 16120 }, { "epoch": 0.6028459938780756, "grad_norm": 0.47078409790992737, "learning_rate": 6.824837256340674e-06, "loss": 0.2934, "step": 16125 }, { "epoch": 0.6030329228684254, "grad_norm": 0.4631364345550537, "learning_rate": 6.819269118473833e-06, "loss": 0.2897, "step": 16130 }, { "epoch": 0.6032198518587751, "grad_norm": 0.48256585001945496, "learning_rate": 6.813702077549935e-06, "loss": 0.2225, "step": 16135 }, { "epoch": 0.6034067808491249, "grad_norm": 0.5263893604278564, "learning_rate": 6.808136135488892e-06, "loss": 0.2288, "step": 16140 }, { "epoch": 0.6035937098394747, "grad_norm": 0.3626660108566284, "learning_rate": 6.802571294210239e-06, "loss": 0.2377, "step": 16145 }, { "epoch": 0.6037806388298245, "grad_norm": 1.2106562852859497, "learning_rate": 6.797007555633124e-06, "loss": 0.2988, "step": 16150 }, { "epoch": 0.6039675678201744, "grad_norm": 0.2370201051235199, "learning_rate": 6.791444921676327e-06, "loss": 0.2449, "step": 16155 }, { "epoch": 0.6041544968105241, "grad_norm": 0.5193468928337097, "learning_rate": 6.785883394258241e-06, "loss": 0.2805, "step": 16160 }, { "epoch": 0.6043414258008739, "grad_norm": 0.5827836990356445, "learning_rate": 6.780322975296877e-06, "loss": 0.2585, "step": 16165 }, { "epoch": 0.6045283547912237, "grad_norm": 0.22835935652256012, "learning_rate": 6.7747636667098645e-06, "loss": 0.2602, "step": 16170 }, { "epoch": 0.6047152837815735, "grad_norm": 0.5145224928855896, "learning_rate": 6.769205470414445e-06, "loss": 0.278, "step": 16175 }, { "epoch": 0.6049022127719232, "grad_norm": 0.26391491293907166, "learning_rate": 6.763648388327488e-06, "loss": 0.2507, "step": 16180 }, { "epoch": 0.605089141762273, "grad_norm": 0.23646554350852966, "learning_rate": 6.758092422365468e-06, "loss": 0.2388, "step": 16185 }, { "epoch": 0.6052760707526228, "grad_norm": 0.4633139669895172, "learning_rate": 6.75253757444448e-06, "loss": 0.3711, "step": 16190 }, { "epoch": 0.6054629997429727, "grad_norm": 0.614550769329071, "learning_rate": 6.746983846480226e-06, "loss": 0.3509, "step": 16195 }, { "epoch": 0.6056499287333225, "grad_norm": 1.0739442110061646, "learning_rate": 6.7414312403880345e-06, "loss": 0.351, "step": 16200 }, { "epoch": 0.6058368577236722, "grad_norm": 0.317594051361084, "learning_rate": 6.735879758082841e-06, "loss": 0.341, "step": 16205 }, { "epoch": 0.606023786714022, "grad_norm": 0.32532641291618347, "learning_rate": 6.730329401479189e-06, "loss": 0.2996, "step": 16210 }, { "epoch": 0.6062107157043718, "grad_norm": 0.4760509431362152, "learning_rate": 6.724780172491241e-06, "loss": 0.2035, "step": 16215 }, { "epoch": 0.6063976446947216, "grad_norm": 0.4633803963661194, "learning_rate": 6.71923207303276e-06, "loss": 0.3453, "step": 16220 }, { "epoch": 0.6065845736850713, "grad_norm": 0.27814042568206787, "learning_rate": 6.713685105017135e-06, "loss": 0.3061, "step": 16225 }, { "epoch": 0.6067715026754211, "grad_norm": 0.331685334444046, "learning_rate": 6.708139270357348e-06, "loss": 0.2412, "step": 16230 }, { "epoch": 0.606958431665771, "grad_norm": 0.2586578130722046, "learning_rate": 6.702594570966008e-06, "loss": 0.2551, "step": 16235 }, { "epoch": 0.6071453606561208, "grad_norm": 0.35025131702423096, "learning_rate": 6.697051008755315e-06, "loss": 0.2946, "step": 16240 }, { "epoch": 0.6073322896464706, "grad_norm": 0.403495192527771, "learning_rate": 6.691508585637085e-06, "loss": 0.2499, "step": 16245 }, { "epoch": 0.6075192186368203, "grad_norm": 0.6013374328613281, "learning_rate": 6.6859673035227495e-06, "loss": 0.2661, "step": 16250 }, { "epoch": 0.6077061476271701, "grad_norm": 0.09546211361885071, "learning_rate": 6.680427164323329e-06, "loss": 0.2791, "step": 16255 }, { "epoch": 0.6078930766175199, "grad_norm": 0.38992223143577576, "learning_rate": 6.674888169949463e-06, "loss": 0.3381, "step": 16260 }, { "epoch": 0.6080800056078697, "grad_norm": 0.33082830905914307, "learning_rate": 6.669350322311388e-06, "loss": 0.3829, "step": 16265 }, { "epoch": 0.6082669345982195, "grad_norm": 0.30394282937049866, "learning_rate": 6.663813623318954e-06, "loss": 0.287, "step": 16270 }, { "epoch": 0.6084538635885693, "grad_norm": 0.27760547399520874, "learning_rate": 6.658278074881605e-06, "loss": 0.2794, "step": 16275 }, { "epoch": 0.6086407925789191, "grad_norm": 0.7337043881416321, "learning_rate": 6.652743678908399e-06, "loss": 0.2566, "step": 16280 }, { "epoch": 0.6088277215692689, "grad_norm": 0.36123204231262207, "learning_rate": 6.647210437307985e-06, "loss": 0.2533, "step": 16285 }, { "epoch": 0.6090146505596187, "grad_norm": 0.4499099552631378, "learning_rate": 6.641678351988619e-06, "loss": 0.314, "step": 16290 }, { "epoch": 0.6092015795499685, "grad_norm": 0.3133924901485443, "learning_rate": 6.6361474248581655e-06, "loss": 0.2611, "step": 16295 }, { "epoch": 0.6093885085403182, "grad_norm": 0.39866819977760315, "learning_rate": 6.630617657824078e-06, "loss": 0.2835, "step": 16300 }, { "epoch": 0.609575437530668, "grad_norm": 0.45732995867729187, "learning_rate": 6.625089052793417e-06, "loss": 0.3571, "step": 16305 }, { "epoch": 0.6097623665210178, "grad_norm": 0.41225141286849976, "learning_rate": 6.619561611672834e-06, "loss": 0.3494, "step": 16310 }, { "epoch": 0.6099492955113677, "grad_norm": 0.5021333694458008, "learning_rate": 6.6140353363685914e-06, "loss": 0.2647, "step": 16315 }, { "epoch": 0.6101362245017175, "grad_norm": 0.2890709936618805, "learning_rate": 6.6085102287865385e-06, "loss": 0.278, "step": 16320 }, { "epoch": 0.6103231534920672, "grad_norm": 0.5777745842933655, "learning_rate": 6.602986290832134e-06, "loss": 0.2815, "step": 16325 }, { "epoch": 0.610510082482417, "grad_norm": 0.4748448133468628, "learning_rate": 6.597463524410418e-06, "loss": 0.2811, "step": 16330 }, { "epoch": 0.6106970114727668, "grad_norm": 0.5496982336044312, "learning_rate": 6.591941931426036e-06, "loss": 0.2409, "step": 16335 }, { "epoch": 0.6108839404631166, "grad_norm": 0.2635287046432495, "learning_rate": 6.5864215137832325e-06, "loss": 0.311, "step": 16340 }, { "epoch": 0.6110708694534663, "grad_norm": 0.2311626374721527, "learning_rate": 6.580902273385834e-06, "loss": 0.2865, "step": 16345 }, { "epoch": 0.6112577984438161, "grad_norm": 0.39265650510787964, "learning_rate": 6.575384212137275e-06, "loss": 0.2652, "step": 16350 }, { "epoch": 0.6114447274341659, "grad_norm": 0.35634520649909973, "learning_rate": 6.569867331940571e-06, "loss": 0.2976, "step": 16355 }, { "epoch": 0.6116316564245158, "grad_norm": 0.7832450866699219, "learning_rate": 6.564351634698343e-06, "loss": 0.2907, "step": 16360 }, { "epoch": 0.6118185854148656, "grad_norm": 0.28447359800338745, "learning_rate": 6.558837122312787e-06, "loss": 0.2886, "step": 16365 }, { "epoch": 0.6120055144052153, "grad_norm": 0.9879940748214722, "learning_rate": 6.553323796685709e-06, "loss": 0.3565, "step": 16370 }, { "epoch": 0.6121924433955651, "grad_norm": 0.35768356919288635, "learning_rate": 6.547811659718492e-06, "loss": 0.3012, "step": 16375 }, { "epoch": 0.6123793723859149, "grad_norm": 0.3880248963832855, "learning_rate": 6.542300713312113e-06, "loss": 0.372, "step": 16380 }, { "epoch": 0.6125663013762647, "grad_norm": 0.1782693862915039, "learning_rate": 6.536790959367149e-06, "loss": 0.2869, "step": 16385 }, { "epoch": 0.6127532303666144, "grad_norm": 0.4560245871543884, "learning_rate": 6.5312823997837425e-06, "loss": 0.2764, "step": 16390 }, { "epoch": 0.6129401593569642, "grad_norm": 0.3002925217151642, "learning_rate": 6.525775036461652e-06, "loss": 0.2596, "step": 16395 }, { "epoch": 0.6131270883473141, "grad_norm": 0.45390433073043823, "learning_rate": 6.520268871300198e-06, "loss": 0.2895, "step": 16400 }, { "epoch": 0.6133140173376639, "grad_norm": 0.19361938536167145, "learning_rate": 6.514763906198307e-06, "loss": 0.3229, "step": 16405 }, { "epoch": 0.6135009463280137, "grad_norm": 0.6821458339691162, "learning_rate": 6.509260143054474e-06, "loss": 0.2455, "step": 16410 }, { "epoch": 0.6136878753183634, "grad_norm": 0.405543714761734, "learning_rate": 6.503757583766802e-06, "loss": 0.3357, "step": 16415 }, { "epoch": 0.6138748043087132, "grad_norm": 0.6047316789627075, "learning_rate": 6.4982562302329535e-06, "loss": 0.3021, "step": 16420 }, { "epoch": 0.614061733299063, "grad_norm": 0.49325546622276306, "learning_rate": 6.492756084350196e-06, "loss": 0.3521, "step": 16425 }, { "epoch": 0.6142486622894128, "grad_norm": 0.8090613484382629, "learning_rate": 6.4872571480153725e-06, "loss": 0.3772, "step": 16430 }, { "epoch": 0.6144355912797625, "grad_norm": 0.4884081780910492, "learning_rate": 6.4817594231249e-06, "loss": 0.2789, "step": 16435 }, { "epoch": 0.6146225202701124, "grad_norm": 0.4141273498535156, "learning_rate": 6.476262911574797e-06, "loss": 0.3313, "step": 16440 }, { "epoch": 0.6148094492604622, "grad_norm": 0.6283908486366272, "learning_rate": 6.470767615260647e-06, "loss": 0.2758, "step": 16445 }, { "epoch": 0.614996378250812, "grad_norm": 0.3659379184246063, "learning_rate": 6.465273536077623e-06, "loss": 0.2827, "step": 16450 }, { "epoch": 0.6151833072411618, "grad_norm": 0.4448320269584656, "learning_rate": 6.459780675920468e-06, "loss": 0.2774, "step": 16455 }, { "epoch": 0.6153702362315115, "grad_norm": 0.43518292903900146, "learning_rate": 6.454289036683523e-06, "loss": 0.2542, "step": 16460 }, { "epoch": 0.6155571652218613, "grad_norm": 0.26366111636161804, "learning_rate": 6.448798620260688e-06, "loss": 0.2958, "step": 16465 }, { "epoch": 0.6157440942122111, "grad_norm": 0.5211677551269531, "learning_rate": 6.443309428545457e-06, "loss": 0.2952, "step": 16470 }, { "epoch": 0.6159310232025609, "grad_norm": 0.2922613322734833, "learning_rate": 6.4378214634308925e-06, "loss": 0.194, "step": 16475 }, { "epoch": 0.6161179521929108, "grad_norm": 0.3419044613838196, "learning_rate": 6.4323347268096316e-06, "loss": 0.2536, "step": 16480 }, { "epoch": 0.6163048811832605, "grad_norm": 0.37760069966316223, "learning_rate": 6.426849220573901e-06, "loss": 0.3227, "step": 16485 }, { "epoch": 0.6164918101736103, "grad_norm": 0.8805325627326965, "learning_rate": 6.4213649466154894e-06, "loss": 0.3393, "step": 16490 }, { "epoch": 0.6166787391639601, "grad_norm": 0.47188910841941833, "learning_rate": 6.415881906825767e-06, "loss": 0.2425, "step": 16495 }, { "epoch": 0.6168656681543099, "grad_norm": 0.47129133343696594, "learning_rate": 6.4104001030956755e-06, "loss": 0.3161, "step": 16500 }, { "epoch": 0.6170525971446597, "grad_norm": 0.43512147665023804, "learning_rate": 6.404919537315737e-06, "loss": 0.2694, "step": 16505 }, { "epoch": 0.6172395261350094, "grad_norm": 0.1717224419116974, "learning_rate": 6.399440211376033e-06, "loss": 0.3666, "step": 16510 }, { "epoch": 0.6174264551253592, "grad_norm": 0.6517991423606873, "learning_rate": 6.393962127166233e-06, "loss": 0.3249, "step": 16515 }, { "epoch": 0.6176133841157091, "grad_norm": 0.30633100867271423, "learning_rate": 6.388485286575572e-06, "loss": 0.2839, "step": 16520 }, { "epoch": 0.6178003131060589, "grad_norm": 0.5757652521133423, "learning_rate": 6.383009691492847e-06, "loss": 0.2825, "step": 16525 }, { "epoch": 0.6179872420964087, "grad_norm": 0.4330870509147644, "learning_rate": 6.377535343806446e-06, "loss": 0.2922, "step": 16530 }, { "epoch": 0.6181741710867584, "grad_norm": 0.279230535030365, "learning_rate": 6.372062245404302e-06, "loss": 0.2907, "step": 16535 }, { "epoch": 0.6183611000771082, "grad_norm": 0.6289684176445007, "learning_rate": 6.366590398173942e-06, "loss": 0.4383, "step": 16540 }, { "epoch": 0.618548029067458, "grad_norm": 0.6121999621391296, "learning_rate": 6.3611198040024405e-06, "loss": 0.2341, "step": 16545 }, { "epoch": 0.6187349580578078, "grad_norm": 0.421684592962265, "learning_rate": 6.355650464776453e-06, "loss": 0.2145, "step": 16550 }, { "epoch": 0.6189218870481575, "grad_norm": 0.45362263917922974, "learning_rate": 6.350182382382193e-06, "loss": 0.3438, "step": 16555 }, { "epoch": 0.6191088160385074, "grad_norm": 0.35668495297431946, "learning_rate": 6.34471555870545e-06, "loss": 0.2971, "step": 16560 }, { "epoch": 0.6192957450288572, "grad_norm": 0.3048703372478485, "learning_rate": 6.339249995631575e-06, "loss": 0.2846, "step": 16565 }, { "epoch": 0.619482674019207, "grad_norm": 0.42319127917289734, "learning_rate": 6.33378569504548e-06, "loss": 0.2441, "step": 16570 }, { "epoch": 0.6196696030095568, "grad_norm": 0.40297672152519226, "learning_rate": 6.328322658831652e-06, "loss": 0.2304, "step": 16575 }, { "epoch": 0.6198565319999065, "grad_norm": 0.32730260491371155, "learning_rate": 6.322860888874129e-06, "loss": 0.2582, "step": 16580 }, { "epoch": 0.6200434609902563, "grad_norm": 0.31443408131599426, "learning_rate": 6.3174003870565256e-06, "loss": 0.3275, "step": 16585 }, { "epoch": 0.6202303899806061, "grad_norm": 0.440044641494751, "learning_rate": 6.311941155262007e-06, "loss": 0.295, "step": 16590 }, { "epoch": 0.6204173189709559, "grad_norm": 0.396515816450119, "learning_rate": 6.306483195373309e-06, "loss": 0.3439, "step": 16595 }, { "epoch": 0.6206042479613056, "grad_norm": 0.43022897839546204, "learning_rate": 6.301026509272721e-06, "loss": 0.334, "step": 16600 }, { "epoch": 0.6207911769516555, "grad_norm": 0.20897158980369568, "learning_rate": 6.2955710988421e-06, "loss": 0.2489, "step": 16605 }, { "epoch": 0.6209781059420053, "grad_norm": 0.43990305066108704, "learning_rate": 6.290116965962867e-06, "loss": 0.2715, "step": 16610 }, { "epoch": 0.6211650349323551, "grad_norm": 0.21257735788822174, "learning_rate": 6.284664112515988e-06, "loss": 0.2052, "step": 16615 }, { "epoch": 0.6213519639227049, "grad_norm": 0.4820898175239563, "learning_rate": 6.279212540382e-06, "loss": 0.3537, "step": 16620 }, { "epoch": 0.6215388929130546, "grad_norm": 0.2106875628232956, "learning_rate": 6.273762251440991e-06, "loss": 0.2318, "step": 16625 }, { "epoch": 0.6217258219034044, "grad_norm": 0.3450741767883301, "learning_rate": 6.268313247572614e-06, "loss": 0.3046, "step": 16630 }, { "epoch": 0.6219127508937542, "grad_norm": 0.2806203365325928, "learning_rate": 6.262865530656069e-06, "loss": 0.2983, "step": 16635 }, { "epoch": 0.622099679884104, "grad_norm": 0.5870185494422913, "learning_rate": 6.257419102570122e-06, "loss": 0.2252, "step": 16640 }, { "epoch": 0.6222866088744539, "grad_norm": 0.33746689558029175, "learning_rate": 6.251973965193085e-06, "loss": 0.2831, "step": 16645 }, { "epoch": 0.6224735378648036, "grad_norm": 0.4964042901992798, "learning_rate": 6.246530120402833e-06, "loss": 0.3002, "step": 16650 }, { "epoch": 0.6226604668551534, "grad_norm": 0.31772905588150024, "learning_rate": 6.241087570076796e-06, "loss": 0.3224, "step": 16655 }, { "epoch": 0.6228473958455032, "grad_norm": 0.547088623046875, "learning_rate": 6.235646316091945e-06, "loss": 0.2861, "step": 16660 }, { "epoch": 0.623034324835853, "grad_norm": 0.5915294885635376, "learning_rate": 6.230206360324822e-06, "loss": 0.3202, "step": 16665 }, { "epoch": 0.6232212538262027, "grad_norm": 0.3158590793609619, "learning_rate": 6.224767704651502e-06, "loss": 0.3383, "step": 16670 }, { "epoch": 0.6234081828165525, "grad_norm": 0.4911237061023712, "learning_rate": 6.219330350947632e-06, "loss": 0.247, "step": 16675 }, { "epoch": 0.6235951118069023, "grad_norm": 0.253912091255188, "learning_rate": 6.213894301088388e-06, "loss": 0.2208, "step": 16680 }, { "epoch": 0.6237820407972522, "grad_norm": 0.3339253067970276, "learning_rate": 6.208459556948519e-06, "loss": 0.2759, "step": 16685 }, { "epoch": 0.623968969787602, "grad_norm": 0.6338766813278198, "learning_rate": 6.2030261204023055e-06, "loss": 0.2522, "step": 16690 }, { "epoch": 0.6241558987779517, "grad_norm": 0.3243858814239502, "learning_rate": 6.197593993323583e-06, "loss": 0.3472, "step": 16695 }, { "epoch": 0.6243428277683015, "grad_norm": 0.5515966415405273, "learning_rate": 6.192163177585745e-06, "loss": 0.2825, "step": 16700 }, { "epoch": 0.6245297567586513, "grad_norm": 0.3765512704849243, "learning_rate": 6.1867336750617155e-06, "loss": 0.2992, "step": 16705 }, { "epoch": 0.6247166857490011, "grad_norm": 0.3543432950973511, "learning_rate": 6.181305487623981e-06, "loss": 0.3033, "step": 16710 }, { "epoch": 0.6249036147393509, "grad_norm": 0.29796430468559265, "learning_rate": 6.175878617144559e-06, "loss": 0.2713, "step": 16715 }, { "epoch": 0.6250905437297006, "grad_norm": 0.38573959469795227, "learning_rate": 6.170453065495032e-06, "loss": 0.2466, "step": 16720 }, { "epoch": 0.6252774727200505, "grad_norm": 0.44721537828445435, "learning_rate": 6.165028834546507e-06, "loss": 0.2684, "step": 16725 }, { "epoch": 0.6254644017104003, "grad_norm": 0.290237694978714, "learning_rate": 6.1596059261696564e-06, "loss": 0.266, "step": 16730 }, { "epoch": 0.6256513307007501, "grad_norm": 0.36702224612236023, "learning_rate": 6.154184342234678e-06, "loss": 0.2362, "step": 16735 }, { "epoch": 0.6258382596910999, "grad_norm": 0.4877489507198334, "learning_rate": 6.148764084611325e-06, "loss": 0.2409, "step": 16740 }, { "epoch": 0.6260251886814496, "grad_norm": 0.7535521388053894, "learning_rate": 6.143345155168885e-06, "loss": 0.2288, "step": 16745 }, { "epoch": 0.6262121176717994, "grad_norm": 0.2947673201560974, "learning_rate": 6.137927555776194e-06, "loss": 0.2387, "step": 16750 }, { "epoch": 0.6263990466621492, "grad_norm": 0.4088467061519623, "learning_rate": 6.1325112883016306e-06, "loss": 0.2728, "step": 16755 }, { "epoch": 0.626585975652499, "grad_norm": 0.5079031586647034, "learning_rate": 6.1270963546131005e-06, "loss": 0.2597, "step": 16760 }, { "epoch": 0.6267729046428488, "grad_norm": 0.31118345260620117, "learning_rate": 6.121682756578069e-06, "loss": 0.3091, "step": 16765 }, { "epoch": 0.6269598336331986, "grad_norm": 0.7571595311164856, "learning_rate": 6.116270496063523e-06, "loss": 0.2882, "step": 16770 }, { "epoch": 0.6271467626235484, "grad_norm": 0.3171136975288391, "learning_rate": 6.110859574936006e-06, "loss": 0.2565, "step": 16775 }, { "epoch": 0.6273336916138982, "grad_norm": 0.448777973651886, "learning_rate": 6.105449995061579e-06, "loss": 0.2222, "step": 16780 }, { "epoch": 0.627520620604248, "grad_norm": 0.44783681631088257, "learning_rate": 6.1000417583058595e-06, "loss": 0.3105, "step": 16785 }, { "epoch": 0.6277075495945977, "grad_norm": 0.24276278913021088, "learning_rate": 6.094634866533984e-06, "loss": 0.2999, "step": 16790 }, { "epoch": 0.6278944785849475, "grad_norm": 0.25277918577194214, "learning_rate": 6.089229321610641e-06, "loss": 0.2993, "step": 16795 }, { "epoch": 0.6280814075752973, "grad_norm": 0.32562288641929626, "learning_rate": 6.083825125400052e-06, "loss": 0.2675, "step": 16800 }, { "epoch": 0.6282683365656472, "grad_norm": 0.5652545094490051, "learning_rate": 6.078422279765961e-06, "loss": 0.2676, "step": 16805 }, { "epoch": 0.628455265555997, "grad_norm": 0.5595022439956665, "learning_rate": 6.073020786571662e-06, "loss": 0.2698, "step": 16810 }, { "epoch": 0.6286421945463467, "grad_norm": 0.18715853989124298, "learning_rate": 6.067620647679966e-06, "loss": 0.3081, "step": 16815 }, { "epoch": 0.6288291235366965, "grad_norm": 0.4392125904560089, "learning_rate": 6.062221864953237e-06, "loss": 0.2487, "step": 16820 }, { "epoch": 0.6290160525270463, "grad_norm": 0.542143702507019, "learning_rate": 6.0568244402533525e-06, "loss": 0.3002, "step": 16825 }, { "epoch": 0.6292029815173961, "grad_norm": 0.3177799582481384, "learning_rate": 6.051428375441735e-06, "loss": 0.2498, "step": 16830 }, { "epoch": 0.6293899105077458, "grad_norm": 0.731010377407074, "learning_rate": 6.046033672379325e-06, "loss": 0.2696, "step": 16835 }, { "epoch": 0.6295768394980956, "grad_norm": 0.3050804138183594, "learning_rate": 6.040640332926606e-06, "loss": 0.2236, "step": 16840 }, { "epoch": 0.6297637684884454, "grad_norm": 0.48626509308815, "learning_rate": 6.035248358943591e-06, "loss": 0.3319, "step": 16845 }, { "epoch": 0.6299506974787953, "grad_norm": 0.4176667034626007, "learning_rate": 6.0298577522898095e-06, "loss": 0.2927, "step": 16850 }, { "epoch": 0.6301376264691451, "grad_norm": 0.3971768319606781, "learning_rate": 6.024468514824333e-06, "loss": 0.2824, "step": 16855 }, { "epoch": 0.6303245554594948, "grad_norm": 0.4421550929546356, "learning_rate": 6.019080648405747e-06, "loss": 0.2794, "step": 16860 }, { "epoch": 0.6305114844498446, "grad_norm": 0.30311697721481323, "learning_rate": 6.013694154892183e-06, "loss": 0.2382, "step": 16865 }, { "epoch": 0.6306984134401944, "grad_norm": 0.14912711083889008, "learning_rate": 6.008309036141279e-06, "loss": 0.2516, "step": 16870 }, { "epoch": 0.6308853424305442, "grad_norm": 0.4212360978126526, "learning_rate": 6.0029252940102154e-06, "loss": 0.3549, "step": 16875 }, { "epoch": 0.631072271420894, "grad_norm": 1.1165913343429565, "learning_rate": 5.997542930355685e-06, "loss": 0.3441, "step": 16880 }, { "epoch": 0.6312592004112437, "grad_norm": 0.5724372267723083, "learning_rate": 5.992161947033912e-06, "loss": 0.273, "step": 16885 }, { "epoch": 0.6314461294015936, "grad_norm": 0.39647606015205383, "learning_rate": 5.9867823459006466e-06, "loss": 0.2822, "step": 16890 }, { "epoch": 0.6316330583919434, "grad_norm": 0.6420783996582031, "learning_rate": 5.981404128811157e-06, "loss": 0.353, "step": 16895 }, { "epoch": 0.6318199873822932, "grad_norm": 0.21946968138217926, "learning_rate": 5.976027297620237e-06, "loss": 0.2933, "step": 16900 }, { "epoch": 0.632006916372643, "grad_norm": 0.48058822751045227, "learning_rate": 5.970651854182197e-06, "loss": 0.2776, "step": 16905 }, { "epoch": 0.6321938453629927, "grad_norm": 0.6671790480613708, "learning_rate": 5.965277800350879e-06, "loss": 0.2776, "step": 16910 }, { "epoch": 0.6323807743533425, "grad_norm": 0.3052579164505005, "learning_rate": 5.959905137979637e-06, "loss": 0.3139, "step": 16915 }, { "epoch": 0.6325677033436923, "grad_norm": 0.37527140974998474, "learning_rate": 5.954533868921352e-06, "loss": 0.2721, "step": 16920 }, { "epoch": 0.632754632334042, "grad_norm": 0.3904482126235962, "learning_rate": 5.949163995028418e-06, "loss": 0.2949, "step": 16925 }, { "epoch": 0.6329415613243919, "grad_norm": 0.3578369617462158, "learning_rate": 5.943795518152747e-06, "loss": 0.315, "step": 16930 }, { "epoch": 0.6331284903147417, "grad_norm": 0.2731132507324219, "learning_rate": 5.9384284401457824e-06, "loss": 0.4051, "step": 16935 }, { "epoch": 0.6333154193050915, "grad_norm": 0.3908912241458893, "learning_rate": 5.933062762858467e-06, "loss": 0.2259, "step": 16940 }, { "epoch": 0.6335023482954413, "grad_norm": 0.6744188070297241, "learning_rate": 5.927698488141273e-06, "loss": 0.2906, "step": 16945 }, { "epoch": 0.633689277285791, "grad_norm": 0.3423663377761841, "learning_rate": 5.9223356178441835e-06, "loss": 0.2811, "step": 16950 }, { "epoch": 0.6338762062761408, "grad_norm": 0.4647408723831177, "learning_rate": 5.9169741538167015e-06, "loss": 0.2524, "step": 16955 }, { "epoch": 0.6340631352664906, "grad_norm": 0.33333298563957214, "learning_rate": 5.9116140979078364e-06, "loss": 0.2098, "step": 16960 }, { "epoch": 0.6342500642568404, "grad_norm": 0.41983723640441895, "learning_rate": 5.906255451966127e-06, "loss": 0.257, "step": 16965 }, { "epoch": 0.6344369932471903, "grad_norm": 0.5163545608520508, "learning_rate": 5.900898217839608e-06, "loss": 0.3418, "step": 16970 }, { "epoch": 0.63462392223754, "grad_norm": 0.5699335932731628, "learning_rate": 5.895542397375837e-06, "loss": 0.2937, "step": 16975 }, { "epoch": 0.6348108512278898, "grad_norm": 0.3931114077568054, "learning_rate": 5.8901879924218915e-06, "loss": 0.3099, "step": 16980 }, { "epoch": 0.6349977802182396, "grad_norm": 0.19795525074005127, "learning_rate": 5.884835004824343e-06, "loss": 0.3461, "step": 16985 }, { "epoch": 0.6351847092085894, "grad_norm": 0.34459546208381653, "learning_rate": 5.87948343642929e-06, "loss": 0.2567, "step": 16990 }, { "epoch": 0.6353716381989392, "grad_norm": 0.37620869278907776, "learning_rate": 5.874133289082329e-06, "loss": 0.2748, "step": 16995 }, { "epoch": 0.6355585671892889, "grad_norm": 0.30327436327934265, "learning_rate": 5.868784564628578e-06, "loss": 0.2824, "step": 17000 }, { "epoch": 0.6357454961796387, "grad_norm": 0.3879458010196686, "learning_rate": 5.863437264912653e-06, "loss": 0.3167, "step": 17005 }, { "epoch": 0.6359324251699886, "grad_norm": 0.3175660967826843, "learning_rate": 5.858091391778691e-06, "loss": 0.3064, "step": 17010 }, { "epoch": 0.6361193541603384, "grad_norm": 0.7663373351097107, "learning_rate": 5.852746947070326e-06, "loss": 0.3244, "step": 17015 }, { "epoch": 0.6363062831506882, "grad_norm": 0.3658159375190735, "learning_rate": 5.847403932630702e-06, "loss": 0.2626, "step": 17020 }, { "epoch": 0.6364932121410379, "grad_norm": 0.3354896008968353, "learning_rate": 5.84206235030248e-06, "loss": 0.2767, "step": 17025 }, { "epoch": 0.6366801411313877, "grad_norm": 0.4783341586589813, "learning_rate": 5.836722201927809e-06, "loss": 0.383, "step": 17030 }, { "epoch": 0.6368670701217375, "grad_norm": 0.5083820819854736, "learning_rate": 5.831383489348361e-06, "loss": 0.3022, "step": 17035 }, { "epoch": 0.6370539991120873, "grad_norm": 0.34456634521484375, "learning_rate": 5.826046214405298e-06, "loss": 0.3142, "step": 17040 }, { "epoch": 0.637240928102437, "grad_norm": 0.5159575343132019, "learning_rate": 5.820710378939301e-06, "loss": 0.2298, "step": 17045 }, { "epoch": 0.6374278570927869, "grad_norm": 0.5502740740776062, "learning_rate": 5.815375984790543e-06, "loss": 0.4202, "step": 17050 }, { "epoch": 0.6376147860831367, "grad_norm": 0.6210566759109497, "learning_rate": 5.810043033798702e-06, "loss": 0.3434, "step": 17055 }, { "epoch": 0.6378017150734865, "grad_norm": 0.47056567668914795, "learning_rate": 5.804711527802957e-06, "loss": 0.2732, "step": 17060 }, { "epoch": 0.6379886440638363, "grad_norm": 0.5329228043556213, "learning_rate": 5.799381468641998e-06, "loss": 0.2628, "step": 17065 }, { "epoch": 0.638175573054186, "grad_norm": 0.5529435276985168, "learning_rate": 5.794052858154009e-06, "loss": 0.3167, "step": 17070 }, { "epoch": 0.6383625020445358, "grad_norm": 0.27884042263031006, "learning_rate": 5.788725698176672e-06, "loss": 0.246, "step": 17075 }, { "epoch": 0.6385494310348856, "grad_norm": 0.4293839931488037, "learning_rate": 5.783399990547176e-06, "loss": 0.2365, "step": 17080 }, { "epoch": 0.6387363600252354, "grad_norm": 0.47030767798423767, "learning_rate": 5.778075737102198e-06, "loss": 0.3154, "step": 17085 }, { "epoch": 0.6389232890155851, "grad_norm": 0.2606756091117859, "learning_rate": 5.772752939677929e-06, "loss": 0.2479, "step": 17090 }, { "epoch": 0.639110218005935, "grad_norm": 0.527047872543335, "learning_rate": 5.767431600110042e-06, "loss": 0.3171, "step": 17095 }, { "epoch": 0.6392971469962848, "grad_norm": 0.49253425002098083, "learning_rate": 5.7621117202337205e-06, "loss": 0.3053, "step": 17100 }, { "epoch": 0.6394840759866346, "grad_norm": 0.5128486752510071, "learning_rate": 5.7567933018836365e-06, "loss": 0.3014, "step": 17105 }, { "epoch": 0.6396710049769844, "grad_norm": 0.46093523502349854, "learning_rate": 5.751476346893956e-06, "loss": 0.3348, "step": 17110 }, { "epoch": 0.6398579339673341, "grad_norm": 0.3886370360851288, "learning_rate": 5.746160857098351e-06, "loss": 0.2569, "step": 17115 }, { "epoch": 0.6400448629576839, "grad_norm": 0.6568734049797058, "learning_rate": 5.740846834329974e-06, "loss": 0.2808, "step": 17120 }, { "epoch": 0.6402317919480337, "grad_norm": 0.39880597591400146, "learning_rate": 5.735534280421489e-06, "loss": 0.3035, "step": 17125 }, { "epoch": 0.6404187209383835, "grad_norm": 0.47509804368019104, "learning_rate": 5.730223197205034e-06, "loss": 0.2522, "step": 17130 }, { "epoch": 0.6406056499287334, "grad_norm": 0.22290876507759094, "learning_rate": 5.7249135865122575e-06, "loss": 0.2568, "step": 17135 }, { "epoch": 0.6407925789190831, "grad_norm": 0.36923328042030334, "learning_rate": 5.719605450174283e-06, "loss": 0.2765, "step": 17140 }, { "epoch": 0.6409795079094329, "grad_norm": 0.43416711688041687, "learning_rate": 5.7142987900217464e-06, "loss": 0.3103, "step": 17145 }, { "epoch": 0.6411664368997827, "grad_norm": 0.32463186979293823, "learning_rate": 5.708993607884754e-06, "loss": 0.2868, "step": 17150 }, { "epoch": 0.6413533658901325, "grad_norm": 0.21319809556007385, "learning_rate": 5.703689905592911e-06, "loss": 0.244, "step": 17155 }, { "epoch": 0.6415402948804823, "grad_norm": 0.39962127804756165, "learning_rate": 5.698387684975317e-06, "loss": 0.3021, "step": 17160 }, { "epoch": 0.641727223870832, "grad_norm": 0.48100176453590393, "learning_rate": 5.693086947860551e-06, "loss": 0.3278, "step": 17165 }, { "epoch": 0.6419141528611818, "grad_norm": 0.5302535891532898, "learning_rate": 5.687787696076692e-06, "loss": 0.2589, "step": 17170 }, { "epoch": 0.6421010818515317, "grad_norm": 0.5527657866477966, "learning_rate": 5.682489931451292e-06, "loss": 0.3159, "step": 17175 }, { "epoch": 0.6422880108418815, "grad_norm": 0.3536984920501709, "learning_rate": 5.677193655811406e-06, "loss": 0.2313, "step": 17180 }, { "epoch": 0.6424749398322313, "grad_norm": 0.49602222442626953, "learning_rate": 5.67189887098356e-06, "loss": 0.3013, "step": 17185 }, { "epoch": 0.642661868822581, "grad_norm": 0.5658937096595764, "learning_rate": 5.666605578793782e-06, "loss": 0.2457, "step": 17190 }, { "epoch": 0.6428487978129308, "grad_norm": 0.10081786662340164, "learning_rate": 5.661313781067572e-06, "loss": 0.2528, "step": 17195 }, { "epoch": 0.6430357268032806, "grad_norm": 0.2929933965206146, "learning_rate": 5.656023479629915e-06, "loss": 0.282, "step": 17200 }, { "epoch": 0.6432226557936304, "grad_norm": 0.27145472168922424, "learning_rate": 5.650734676305295e-06, "loss": 0.2697, "step": 17205 }, { "epoch": 0.6434095847839801, "grad_norm": 0.45547887682914734, "learning_rate": 5.645447372917658e-06, "loss": 0.2184, "step": 17210 }, { "epoch": 0.64359651377433, "grad_norm": 0.3759501576423645, "learning_rate": 5.640161571290452e-06, "loss": 0.259, "step": 17215 }, { "epoch": 0.6437834427646798, "grad_norm": 0.5527650117874146, "learning_rate": 5.6348772732465925e-06, "loss": 0.271, "step": 17220 }, { "epoch": 0.6439703717550296, "grad_norm": 0.6469981670379639, "learning_rate": 5.629594480608487e-06, "loss": 0.2916, "step": 17225 }, { "epoch": 0.6441573007453794, "grad_norm": 0.3704184591770172, "learning_rate": 5.6243131951980145e-06, "loss": 0.2614, "step": 17230 }, { "epoch": 0.6443442297357291, "grad_norm": 0.3571639358997345, "learning_rate": 5.619033418836545e-06, "loss": 0.3213, "step": 17235 }, { "epoch": 0.6445311587260789, "grad_norm": 0.40060603618621826, "learning_rate": 5.613755153344918e-06, "loss": 0.2876, "step": 17240 }, { "epoch": 0.6447180877164287, "grad_norm": 0.38819968700408936, "learning_rate": 5.608478400543455e-06, "loss": 0.2737, "step": 17245 }, { "epoch": 0.6449050167067785, "grad_norm": 0.38324692845344543, "learning_rate": 5.60320316225196e-06, "loss": 0.276, "step": 17250 }, { "epoch": 0.6450919456971284, "grad_norm": 0.2735843360424042, "learning_rate": 5.597929440289709e-06, "loss": 0.3314, "step": 17255 }, { "epoch": 0.6452788746874781, "grad_norm": 0.5796816349029541, "learning_rate": 5.592657236475461e-06, "loss": 0.3573, "step": 17260 }, { "epoch": 0.6454658036778279, "grad_norm": 0.3068085312843323, "learning_rate": 5.587386552627442e-06, "loss": 0.3163, "step": 17265 }, { "epoch": 0.6456527326681777, "grad_norm": 0.5472904443740845, "learning_rate": 5.582117390563368e-06, "loss": 0.3123, "step": 17270 }, { "epoch": 0.6458396616585275, "grad_norm": 0.3581070005893707, "learning_rate": 5.576849752100413e-06, "loss": 0.2803, "step": 17275 }, { "epoch": 0.6460265906488772, "grad_norm": 0.40146610140800476, "learning_rate": 5.571583639055243e-06, "loss": 0.3476, "step": 17280 }, { "epoch": 0.646213519639227, "grad_norm": 0.43631863594055176, "learning_rate": 5.566319053243983e-06, "loss": 0.2792, "step": 17285 }, { "epoch": 0.6464004486295768, "grad_norm": 0.3435373604297638, "learning_rate": 5.561055996482243e-06, "loss": 0.291, "step": 17290 }, { "epoch": 0.6465873776199267, "grad_norm": 0.5556064248085022, "learning_rate": 5.555794470585099e-06, "loss": 0.2866, "step": 17295 }, { "epoch": 0.6467743066102765, "grad_norm": 0.6091718077659607, "learning_rate": 5.550534477367096e-06, "loss": 0.3378, "step": 17300 }, { "epoch": 0.6469612356006262, "grad_norm": 0.3173150420188904, "learning_rate": 5.54527601864226e-06, "loss": 0.2673, "step": 17305 }, { "epoch": 0.647148164590976, "grad_norm": 0.629828929901123, "learning_rate": 5.540019096224079e-06, "loss": 0.2741, "step": 17310 }, { "epoch": 0.6473350935813258, "grad_norm": 0.6558170914649963, "learning_rate": 5.534763711925522e-06, "loss": 0.3303, "step": 17315 }, { "epoch": 0.6475220225716756, "grad_norm": 0.3189551532268524, "learning_rate": 5.529509867559011e-06, "loss": 0.2273, "step": 17320 }, { "epoch": 0.6477089515620253, "grad_norm": 0.4152987003326416, "learning_rate": 5.524257564936454e-06, "loss": 0.2371, "step": 17325 }, { "epoch": 0.6478958805523751, "grad_norm": 0.46988746523857117, "learning_rate": 5.519006805869213e-06, "loss": 0.2364, "step": 17330 }, { "epoch": 0.6480828095427249, "grad_norm": 0.25283321738243103, "learning_rate": 5.513757592168132e-06, "loss": 0.2328, "step": 17335 }, { "epoch": 0.6482697385330748, "grad_norm": 0.792521059513092, "learning_rate": 5.508509925643511e-06, "loss": 0.2674, "step": 17340 }, { "epoch": 0.6484566675234246, "grad_norm": 0.7444917559623718, "learning_rate": 5.503263808105114e-06, "loss": 0.2861, "step": 17345 }, { "epoch": 0.6486435965137743, "grad_norm": 0.4276942312717438, "learning_rate": 5.4980192413621866e-06, "loss": 0.2641, "step": 17350 }, { "epoch": 0.6488305255041241, "grad_norm": 0.4908183217048645, "learning_rate": 5.49277622722342e-06, "loss": 0.2743, "step": 17355 }, { "epoch": 0.6490174544944739, "grad_norm": 0.39298996329307556, "learning_rate": 5.487534767496989e-06, "loss": 0.2977, "step": 17360 }, { "epoch": 0.6492043834848237, "grad_norm": 0.1759922206401825, "learning_rate": 5.482294863990514e-06, "loss": 0.3114, "step": 17365 }, { "epoch": 0.6493913124751735, "grad_norm": 0.43271568417549133, "learning_rate": 5.477056518511096e-06, "loss": 0.2849, "step": 17370 }, { "epoch": 0.6495782414655232, "grad_norm": 0.4080221951007843, "learning_rate": 5.471819732865282e-06, "loss": 0.2555, "step": 17375 }, { "epoch": 0.6497651704558731, "grad_norm": 0.3136856257915497, "learning_rate": 5.466584508859096e-06, "loss": 0.1905, "step": 17380 }, { "epoch": 0.6499520994462229, "grad_norm": 0.3478422164916992, "learning_rate": 5.461350848298016e-06, "loss": 0.356, "step": 17385 }, { "epoch": 0.6501390284365727, "grad_norm": 0.3305181562900543, "learning_rate": 5.456118752986975e-06, "loss": 0.3352, "step": 17390 }, { "epoch": 0.6503259574269225, "grad_norm": 0.35783740878105164, "learning_rate": 5.4508882247303815e-06, "loss": 0.2462, "step": 17395 }, { "epoch": 0.6505128864172722, "grad_norm": 0.14547370374202728, "learning_rate": 5.445659265332087e-06, "loss": 0.2325, "step": 17400 }, { "epoch": 0.650699815407622, "grad_norm": 0.26868870854377747, "learning_rate": 5.440431876595418e-06, "loss": 0.2746, "step": 17405 }, { "epoch": 0.6508867443979718, "grad_norm": 0.4959259629249573, "learning_rate": 5.435206060323142e-06, "loss": 0.2315, "step": 17410 }, { "epoch": 0.6510736733883216, "grad_norm": 0.5970064997673035, "learning_rate": 5.4299818183175006e-06, "loss": 0.3014, "step": 17415 }, { "epoch": 0.6512606023786714, "grad_norm": 0.4646480679512024, "learning_rate": 5.424759152380179e-06, "loss": 0.3339, "step": 17420 }, { "epoch": 0.6514475313690212, "grad_norm": 0.31364089250564575, "learning_rate": 5.419538064312333e-06, "loss": 0.3644, "step": 17425 }, { "epoch": 0.651634460359371, "grad_norm": 0.4947362244129181, "learning_rate": 5.414318555914563e-06, "loss": 0.2448, "step": 17430 }, { "epoch": 0.6518213893497208, "grad_norm": 0.3628036677837372, "learning_rate": 5.409100628986921e-06, "loss": 0.2198, "step": 17435 }, { "epoch": 0.6520083183400706, "grad_norm": 0.38185828924179077, "learning_rate": 5.4038842853289305e-06, "loss": 0.3506, "step": 17440 }, { "epoch": 0.6521952473304203, "grad_norm": 0.37392547726631165, "learning_rate": 5.398669526739551e-06, "loss": 0.2647, "step": 17445 }, { "epoch": 0.6523821763207701, "grad_norm": 0.3992078900337219, "learning_rate": 5.39345635501721e-06, "loss": 0.2584, "step": 17450 }, { "epoch": 0.6525691053111199, "grad_norm": 0.27611681818962097, "learning_rate": 5.388244771959777e-06, "loss": 0.2419, "step": 17455 }, { "epoch": 0.6527560343014698, "grad_norm": 0.42422646284103394, "learning_rate": 5.38303477936458e-06, "loss": 0.2328, "step": 17460 }, { "epoch": 0.6529429632918196, "grad_norm": 0.25942620635032654, "learning_rate": 5.3778263790283905e-06, "loss": 0.2184, "step": 17465 }, { "epoch": 0.6531298922821693, "grad_norm": 0.4233878552913666, "learning_rate": 5.372619572747442e-06, "loss": 0.3018, "step": 17470 }, { "epoch": 0.6533168212725191, "grad_norm": 0.6455023884773254, "learning_rate": 5.3674143623174144e-06, "loss": 0.269, "step": 17475 }, { "epoch": 0.6535037502628689, "grad_norm": 0.635086178779602, "learning_rate": 5.362210749533434e-06, "loss": 0.2835, "step": 17480 }, { "epoch": 0.6536906792532187, "grad_norm": 0.21044528484344482, "learning_rate": 5.357008736190077e-06, "loss": 0.3771, "step": 17485 }, { "epoch": 0.6538776082435684, "grad_norm": 0.5174884796142578, "learning_rate": 5.351808324081362e-06, "loss": 0.2438, "step": 17490 }, { "epoch": 0.6540645372339182, "grad_norm": 0.4793969988822937, "learning_rate": 5.346609515000775e-06, "loss": 0.2707, "step": 17495 }, { "epoch": 0.6542514662242681, "grad_norm": 0.39202970266342163, "learning_rate": 5.341412310741225e-06, "loss": 0.2867, "step": 17500 }, { "epoch": 0.6544383952146179, "grad_norm": 0.3729572296142578, "learning_rate": 5.336216713095087e-06, "loss": 0.3556, "step": 17505 }, { "epoch": 0.6546253242049677, "grad_norm": 0.2661343514919281, "learning_rate": 5.3310227238541665e-06, "loss": 0.2963, "step": 17510 }, { "epoch": 0.6548122531953174, "grad_norm": 0.6629586815834045, "learning_rate": 5.325830344809726e-06, "loss": 0.3187, "step": 17515 }, { "epoch": 0.6549991821856672, "grad_norm": 0.5217400789260864, "learning_rate": 5.320639577752471e-06, "loss": 0.2794, "step": 17520 }, { "epoch": 0.655186111176017, "grad_norm": 0.40809670090675354, "learning_rate": 5.315450424472546e-06, "loss": 0.3342, "step": 17525 }, { "epoch": 0.6553730401663668, "grad_norm": 0.3134659230709076, "learning_rate": 5.31026288675954e-06, "loss": 0.2624, "step": 17530 }, { "epoch": 0.6555599691567165, "grad_norm": 0.5745450258255005, "learning_rate": 5.305076966402483e-06, "loss": 0.3727, "step": 17535 }, { "epoch": 0.6557468981470664, "grad_norm": 0.6128538846969604, "learning_rate": 5.299892665189856e-06, "loss": 0.2772, "step": 17540 }, { "epoch": 0.6559338271374162, "grad_norm": 0.2917860746383667, "learning_rate": 5.2947099849095695e-06, "loss": 0.3028, "step": 17545 }, { "epoch": 0.656120756127766, "grad_norm": 0.6575778722763062, "learning_rate": 5.2895289273489915e-06, "loss": 0.2669, "step": 17550 }, { "epoch": 0.6563076851181158, "grad_norm": 0.4796570837497711, "learning_rate": 5.2843494942949095e-06, "loss": 0.3287, "step": 17555 }, { "epoch": 0.6564946141084655, "grad_norm": 0.5242589712142944, "learning_rate": 5.279171687533566e-06, "loss": 0.2346, "step": 17560 }, { "epoch": 0.6566815430988153, "grad_norm": 0.4839895963668823, "learning_rate": 5.273995508850643e-06, "loss": 0.2419, "step": 17565 }, { "epoch": 0.6568684720891651, "grad_norm": 0.4953691065311432, "learning_rate": 5.268820960031252e-06, "loss": 0.3018, "step": 17570 }, { "epoch": 0.6570554010795149, "grad_norm": 0.34803852438926697, "learning_rate": 5.263648042859945e-06, "loss": 0.2719, "step": 17575 }, { "epoch": 0.6572423300698647, "grad_norm": 0.4037022888660431, "learning_rate": 5.258476759120713e-06, "loss": 0.251, "step": 17580 }, { "epoch": 0.6574292590602145, "grad_norm": 0.38551491498947144, "learning_rate": 5.253307110596988e-06, "loss": 0.1929, "step": 17585 }, { "epoch": 0.6576161880505643, "grad_norm": 0.33148205280303955, "learning_rate": 5.248139099071625e-06, "loss": 0.2561, "step": 17590 }, { "epoch": 0.6578031170409141, "grad_norm": 0.42896026372909546, "learning_rate": 5.242972726326934e-06, "loss": 0.2475, "step": 17595 }, { "epoch": 0.6579900460312639, "grad_norm": 0.5869537591934204, "learning_rate": 5.23780799414464e-06, "loss": 0.3102, "step": 17600 }, { "epoch": 0.6581769750216137, "grad_norm": 0.42332693934440613, "learning_rate": 5.232644904305914e-06, "loss": 0.2316, "step": 17605 }, { "epoch": 0.6583639040119634, "grad_norm": 0.20432531833648682, "learning_rate": 5.227483458591364e-06, "loss": 0.3037, "step": 17610 }, { "epoch": 0.6585508330023132, "grad_norm": 0.5013585686683655, "learning_rate": 5.222323658781018e-06, "loss": 0.2422, "step": 17615 }, { "epoch": 0.658737761992663, "grad_norm": 0.5252900719642639, "learning_rate": 5.21716550665434e-06, "loss": 0.2841, "step": 17620 }, { "epoch": 0.6589246909830129, "grad_norm": 0.3324468433856964, "learning_rate": 5.212009003990237e-06, "loss": 0.2643, "step": 17625 }, { "epoch": 0.6591116199733627, "grad_norm": 0.5473644137382507, "learning_rate": 5.206854152567036e-06, "loss": 0.2717, "step": 17630 }, { "epoch": 0.6592985489637124, "grad_norm": 1.3611345291137695, "learning_rate": 5.201700954162493e-06, "loss": 0.2884, "step": 17635 }, { "epoch": 0.6594854779540622, "grad_norm": 0.6788679957389832, "learning_rate": 5.196549410553806e-06, "loss": 0.2607, "step": 17640 }, { "epoch": 0.659672406944412, "grad_norm": 0.5106036067008972, "learning_rate": 5.191399523517586e-06, "loss": 0.3072, "step": 17645 }, { "epoch": 0.6598593359347618, "grad_norm": 0.8317034840583801, "learning_rate": 5.1862512948298885e-06, "loss": 0.3571, "step": 17650 }, { "epoch": 0.6600462649251115, "grad_norm": 0.3770235776901245, "learning_rate": 5.181104726266191e-06, "loss": 0.3051, "step": 17655 }, { "epoch": 0.6602331939154613, "grad_norm": 0.33272841572761536, "learning_rate": 5.17595981960139e-06, "loss": 0.3451, "step": 17660 }, { "epoch": 0.6604201229058112, "grad_norm": 0.36768677830696106, "learning_rate": 5.170816576609825e-06, "loss": 0.259, "step": 17665 }, { "epoch": 0.660607051896161, "grad_norm": 0.4106055498123169, "learning_rate": 5.16567499906525e-06, "loss": 0.342, "step": 17670 }, { "epoch": 0.6607939808865108, "grad_norm": 0.44016748666763306, "learning_rate": 5.1605350887408466e-06, "loss": 0.3125, "step": 17675 }, { "epoch": 0.6609809098768605, "grad_norm": 0.40315133333206177, "learning_rate": 5.1553968474092185e-06, "loss": 0.2638, "step": 17680 }, { "epoch": 0.6611678388672103, "grad_norm": 0.22286388278007507, "learning_rate": 5.150260276842407e-06, "loss": 0.2822, "step": 17685 }, { "epoch": 0.6613547678575601, "grad_norm": 0.3545617163181305, "learning_rate": 5.1451253788118595e-06, "loss": 0.3389, "step": 17690 }, { "epoch": 0.6615416968479099, "grad_norm": 0.4508470594882965, "learning_rate": 5.139992155088458e-06, "loss": 0.28, "step": 17695 }, { "epoch": 0.6617286258382596, "grad_norm": 0.3894427418708801, "learning_rate": 5.13486060744251e-06, "loss": 0.2735, "step": 17700 }, { "epoch": 0.6619155548286095, "grad_norm": 0.5788761377334595, "learning_rate": 5.12973073764373e-06, "loss": 0.3752, "step": 17705 }, { "epoch": 0.6621024838189593, "grad_norm": 0.2697499394416809, "learning_rate": 5.124602547461273e-06, "loss": 0.2663, "step": 17710 }, { "epoch": 0.6622894128093091, "grad_norm": 0.41945934295654297, "learning_rate": 5.119476038663699e-06, "loss": 0.3097, "step": 17715 }, { "epoch": 0.6624763417996589, "grad_norm": 0.20760861039161682, "learning_rate": 5.1143512130189935e-06, "loss": 0.2761, "step": 17720 }, { "epoch": 0.6626632707900086, "grad_norm": 0.47304192185401917, "learning_rate": 5.109228072294561e-06, "loss": 0.277, "step": 17725 }, { "epoch": 0.6628501997803584, "grad_norm": 0.3576745390892029, "learning_rate": 5.1041066182572296e-06, "loss": 0.2783, "step": 17730 }, { "epoch": 0.6630371287707082, "grad_norm": 0.41994425654411316, "learning_rate": 5.098986852673239e-06, "loss": 0.2465, "step": 17735 }, { "epoch": 0.663224057761058, "grad_norm": 0.3337131440639496, "learning_rate": 5.093868777308251e-06, "loss": 0.3563, "step": 17740 }, { "epoch": 0.6634109867514079, "grad_norm": 0.6296584010124207, "learning_rate": 5.088752393927345e-06, "loss": 0.2916, "step": 17745 }, { "epoch": 0.6635979157417576, "grad_norm": 0.1845739334821701, "learning_rate": 5.08363770429501e-06, "loss": 0.2693, "step": 17750 }, { "epoch": 0.6637848447321074, "grad_norm": 0.3726886510848999, "learning_rate": 5.0785247101751645e-06, "loss": 0.2992, "step": 17755 }, { "epoch": 0.6639717737224572, "grad_norm": 0.26406246423721313, "learning_rate": 5.073413413331128e-06, "loss": 0.2997, "step": 17760 }, { "epoch": 0.664158702712807, "grad_norm": 0.421535849571228, "learning_rate": 5.068303815525639e-06, "loss": 0.2156, "step": 17765 }, { "epoch": 0.6643456317031567, "grad_norm": 0.40606680512428284, "learning_rate": 5.0631959185208514e-06, "loss": 0.2638, "step": 17770 }, { "epoch": 0.6645325606935065, "grad_norm": 0.31488245725631714, "learning_rate": 5.0580897240783365e-06, "loss": 0.2754, "step": 17775 }, { "epoch": 0.6647194896838563, "grad_norm": 0.4173290729522705, "learning_rate": 5.052985233959069e-06, "loss": 0.2472, "step": 17780 }, { "epoch": 0.6649064186742062, "grad_norm": 0.4380118250846863, "learning_rate": 5.047882449923444e-06, "loss": 0.2227, "step": 17785 }, { "epoch": 0.665093347664556, "grad_norm": 0.22158585488796234, "learning_rate": 5.04278137373127e-06, "loss": 0.2522, "step": 17790 }, { "epoch": 0.6652802766549057, "grad_norm": 0.5004199147224426, "learning_rate": 5.037682007141754e-06, "loss": 0.3162, "step": 17795 }, { "epoch": 0.6654672056452555, "grad_norm": 0.5138402581214905, "learning_rate": 5.03258435191353e-06, "loss": 0.2277, "step": 17800 }, { "epoch": 0.6656541346356053, "grad_norm": 0.569689929485321, "learning_rate": 5.027488409804624e-06, "loss": 0.2435, "step": 17805 }, { "epoch": 0.6658410636259551, "grad_norm": 0.50981605052948, "learning_rate": 5.022394182572487e-06, "loss": 0.2654, "step": 17810 }, { "epoch": 0.6660279926163049, "grad_norm": 0.39164555072784424, "learning_rate": 5.017301671973973e-06, "loss": 0.2715, "step": 17815 }, { "epoch": 0.6662149216066546, "grad_norm": 0.31828561425209045, "learning_rate": 5.012210879765339e-06, "loss": 0.2559, "step": 17820 }, { "epoch": 0.6664018505970044, "grad_norm": 0.6788938641548157, "learning_rate": 5.0071218077022495e-06, "loss": 0.3659, "step": 17825 }, { "epoch": 0.6665887795873543, "grad_norm": 0.2640472948551178, "learning_rate": 5.002034457539786e-06, "loss": 0.2719, "step": 17830 }, { "epoch": 0.6667757085777041, "grad_norm": 0.3954325318336487, "learning_rate": 4.996948831032431e-06, "loss": 0.2705, "step": 17835 }, { "epoch": 0.6669626375680539, "grad_norm": 0.25173521041870117, "learning_rate": 4.991864929934065e-06, "loss": 0.2495, "step": 17840 }, { "epoch": 0.6671495665584036, "grad_norm": 0.3650529384613037, "learning_rate": 4.986782755997987e-06, "loss": 0.3812, "step": 17845 }, { "epoch": 0.6673364955487534, "grad_norm": 0.5799317359924316, "learning_rate": 4.981702310976887e-06, "loss": 0.2965, "step": 17850 }, { "epoch": 0.6675234245391032, "grad_norm": 0.3628406822681427, "learning_rate": 4.97662359662287e-06, "loss": 0.3047, "step": 17855 }, { "epoch": 0.667710353529453, "grad_norm": 0.8548519015312195, "learning_rate": 4.971546614687437e-06, "loss": 0.3368, "step": 17860 }, { "epoch": 0.6678972825198027, "grad_norm": 0.4021095037460327, "learning_rate": 4.966471366921493e-06, "loss": 0.3546, "step": 17865 }, { "epoch": 0.6680842115101526, "grad_norm": 0.23902134597301483, "learning_rate": 4.961397855075343e-06, "loss": 0.3073, "step": 17870 }, { "epoch": 0.6682711405005024, "grad_norm": 0.24928809702396393, "learning_rate": 4.956326080898697e-06, "loss": 0.2987, "step": 17875 }, { "epoch": 0.6684580694908522, "grad_norm": 0.4533480703830719, "learning_rate": 4.951256046140671e-06, "loss": 0.3074, "step": 17880 }, { "epoch": 0.668644998481202, "grad_norm": 0.3241102695465088, "learning_rate": 4.946187752549766e-06, "loss": 0.2708, "step": 17885 }, { "epoch": 0.6688319274715517, "grad_norm": 0.5674617290496826, "learning_rate": 4.9411212018738984e-06, "loss": 0.3197, "step": 17890 }, { "epoch": 0.6690188564619015, "grad_norm": 0.41915619373321533, "learning_rate": 4.936056395860369e-06, "loss": 0.3046, "step": 17895 }, { "epoch": 0.6692057854522513, "grad_norm": 0.2280120998620987, "learning_rate": 4.930993336255892e-06, "loss": 0.2369, "step": 17900 }, { "epoch": 0.6693927144426011, "grad_norm": 0.4832659661769867, "learning_rate": 4.925932024806569e-06, "loss": 0.3249, "step": 17905 }, { "epoch": 0.669579643432951, "grad_norm": 0.5970139503479004, "learning_rate": 4.9208724632579e-06, "loss": 0.3852, "step": 17910 }, { "epoch": 0.6697665724233007, "grad_norm": 0.3375006914138794, "learning_rate": 4.915814653354779e-06, "loss": 0.2146, "step": 17915 }, { "epoch": 0.6699535014136505, "grad_norm": 0.3070432245731354, "learning_rate": 4.910758596841504e-06, "loss": 0.2901, "step": 17920 }, { "epoch": 0.6701404304040003, "grad_norm": 0.4842531681060791, "learning_rate": 4.905704295461767e-06, "loss": 0.1967, "step": 17925 }, { "epoch": 0.6703273593943501, "grad_norm": 0.24802975356578827, "learning_rate": 4.900651750958645e-06, "loss": 0.3403, "step": 17930 }, { "epoch": 0.6705142883846998, "grad_norm": 0.32612282037734985, "learning_rate": 4.895600965074623e-06, "loss": 0.2696, "step": 17935 }, { "epoch": 0.6707012173750496, "grad_norm": 0.3664465844631195, "learning_rate": 4.890551939551565e-06, "loss": 0.2614, "step": 17940 }, { "epoch": 0.6708881463653994, "grad_norm": 0.4860963523387909, "learning_rate": 4.885504676130743e-06, "loss": 0.2449, "step": 17945 }, { "epoch": 0.6710750753557493, "grad_norm": 0.38064733147621155, "learning_rate": 4.880459176552808e-06, "loss": 0.285, "step": 17950 }, { "epoch": 0.6712620043460991, "grad_norm": 0.38366106152534485, "learning_rate": 4.87541544255781e-06, "loss": 0.3247, "step": 17955 }, { "epoch": 0.6714489333364488, "grad_norm": 0.7270559668540955, "learning_rate": 4.8703734758851854e-06, "loss": 0.3462, "step": 17960 }, { "epoch": 0.6716358623267986, "grad_norm": 0.23612673580646515, "learning_rate": 4.865333278273768e-06, "loss": 0.2441, "step": 17965 }, { "epoch": 0.6718227913171484, "grad_norm": 0.24252595007419586, "learning_rate": 4.860294851461774e-06, "loss": 0.2234, "step": 17970 }, { "epoch": 0.6720097203074982, "grad_norm": 0.5922762751579285, "learning_rate": 4.8552581971868154e-06, "loss": 0.2714, "step": 17975 }, { "epoch": 0.672196649297848, "grad_norm": 0.477463960647583, "learning_rate": 4.850223317185891e-06, "loss": 0.2555, "step": 17980 }, { "epoch": 0.6723835782881977, "grad_norm": 0.4230765700340271, "learning_rate": 4.845190213195382e-06, "loss": 0.2681, "step": 17985 }, { "epoch": 0.6725705072785476, "grad_norm": 0.7639047503471375, "learning_rate": 4.840158886951069e-06, "loss": 0.4085, "step": 17990 }, { "epoch": 0.6727574362688974, "grad_norm": 0.27008453011512756, "learning_rate": 4.835129340188101e-06, "loss": 0.2701, "step": 17995 }, { "epoch": 0.6729443652592472, "grad_norm": 0.48787766695022583, "learning_rate": 4.8301015746410385e-06, "loss": 0.3341, "step": 18000 }, { "epoch": 0.673131294249597, "grad_norm": 0.4696618914604187, "learning_rate": 4.825075592043805e-06, "loss": 0.245, "step": 18005 }, { "epoch": 0.6733182232399467, "grad_norm": 0.2987324893474579, "learning_rate": 4.82005139412972e-06, "loss": 0.3017, "step": 18010 }, { "epoch": 0.6735051522302965, "grad_norm": 0.35396862030029297, "learning_rate": 4.81502898263148e-06, "loss": 0.2776, "step": 18015 }, { "epoch": 0.6736920812206463, "grad_norm": 0.1255078911781311, "learning_rate": 4.810008359281176e-06, "loss": 0.2967, "step": 18020 }, { "epoch": 0.673879010210996, "grad_norm": 0.4665721356868744, "learning_rate": 4.804989525810282e-06, "loss": 0.3912, "step": 18025 }, { "epoch": 0.6740659392013459, "grad_norm": 0.4541762173175812, "learning_rate": 4.79997248394964e-06, "loss": 0.3989, "step": 18030 }, { "epoch": 0.6742528681916957, "grad_norm": 0.17799994349479675, "learning_rate": 4.794957235429491e-06, "loss": 0.3017, "step": 18035 }, { "epoch": 0.6744397971820455, "grad_norm": 0.5030390024185181, "learning_rate": 4.789943781979447e-06, "loss": 0.2906, "step": 18040 }, { "epoch": 0.6746267261723953, "grad_norm": 0.3190663158893585, "learning_rate": 4.784932125328507e-06, "loss": 0.2697, "step": 18045 }, { "epoch": 0.674813655162745, "grad_norm": 0.5195890069007874, "learning_rate": 4.779922267205048e-06, "loss": 0.2506, "step": 18050 }, { "epoch": 0.6750005841530948, "grad_norm": 0.48416566848754883, "learning_rate": 4.774914209336824e-06, "loss": 0.367, "step": 18055 }, { "epoch": 0.6751875131434446, "grad_norm": 0.38188669085502625, "learning_rate": 4.769907953450968e-06, "loss": 0.277, "step": 18060 }, { "epoch": 0.6753744421337944, "grad_norm": 0.4269798696041107, "learning_rate": 4.764903501273999e-06, "loss": 0.3099, "step": 18065 }, { "epoch": 0.6755613711241442, "grad_norm": 0.5362254977226257, "learning_rate": 4.75990085453181e-06, "loss": 0.3038, "step": 18070 }, { "epoch": 0.675748300114494, "grad_norm": 0.3357028067111969, "learning_rate": 4.754900014949665e-06, "loss": 0.2606, "step": 18075 }, { "epoch": 0.6759352291048438, "grad_norm": 0.43899163603782654, "learning_rate": 4.749900984252218e-06, "loss": 0.308, "step": 18080 }, { "epoch": 0.6761221580951936, "grad_norm": 0.2953328788280487, "learning_rate": 4.744903764163483e-06, "loss": 0.2463, "step": 18085 }, { "epoch": 0.6763090870855434, "grad_norm": 0.2157057821750641, "learning_rate": 4.739908356406866e-06, "loss": 0.3184, "step": 18090 }, { "epoch": 0.6764960160758932, "grad_norm": 0.5682590007781982, "learning_rate": 4.7349147627051365e-06, "loss": 0.2243, "step": 18095 }, { "epoch": 0.6766829450662429, "grad_norm": 0.3066447675228119, "learning_rate": 4.729922984780441e-06, "loss": 0.2636, "step": 18100 }, { "epoch": 0.6768698740565927, "grad_norm": 0.35900914669036865, "learning_rate": 4.724933024354298e-06, "loss": 0.2162, "step": 18105 }, { "epoch": 0.6770568030469425, "grad_norm": 0.40200889110565186, "learning_rate": 4.719944883147605e-06, "loss": 0.292, "step": 18110 }, { "epoch": 0.6772437320372924, "grad_norm": 0.23843464255332947, "learning_rate": 4.714958562880633e-06, "loss": 0.2521, "step": 18115 }, { "epoch": 0.6774306610276422, "grad_norm": 0.6935325860977173, "learning_rate": 4.709974065273013e-06, "loss": 0.2634, "step": 18120 }, { "epoch": 0.6776175900179919, "grad_norm": 0.1865166425704956, "learning_rate": 4.704991392043763e-06, "loss": 0.3864, "step": 18125 }, { "epoch": 0.6778045190083417, "grad_norm": 0.6192229986190796, "learning_rate": 4.700010544911257e-06, "loss": 0.2523, "step": 18130 }, { "epoch": 0.6779914479986915, "grad_norm": 0.22409962117671967, "learning_rate": 4.695031525593254e-06, "loss": 0.2784, "step": 18135 }, { "epoch": 0.6781783769890413, "grad_norm": 0.118705615401268, "learning_rate": 4.690054335806872e-06, "loss": 0.2382, "step": 18140 }, { "epoch": 0.678365305979391, "grad_norm": 0.39932093024253845, "learning_rate": 4.6850789772685955e-06, "loss": 0.223, "step": 18145 }, { "epoch": 0.6785522349697408, "grad_norm": 0.8999297022819519, "learning_rate": 4.6801054516942924e-06, "loss": 0.2789, "step": 18150 }, { "epoch": 0.6787391639600907, "grad_norm": 0.44918587803840637, "learning_rate": 4.675133760799181e-06, "loss": 0.3355, "step": 18155 }, { "epoch": 0.6789260929504405, "grad_norm": 0.4980359673500061, "learning_rate": 4.6701639062978624e-06, "loss": 0.2608, "step": 18160 }, { "epoch": 0.6791130219407903, "grad_norm": 0.30972304940223694, "learning_rate": 4.6651958899042895e-06, "loss": 0.207, "step": 18165 }, { "epoch": 0.67929995093114, "grad_norm": 0.34727078676223755, "learning_rate": 4.660229713331797e-06, "loss": 0.2291, "step": 18170 }, { "epoch": 0.6794868799214898, "grad_norm": 0.2929920256137848, "learning_rate": 4.655265378293068e-06, "loss": 0.2834, "step": 18175 }, { "epoch": 0.6796738089118396, "grad_norm": 0.37373247742652893, "learning_rate": 4.650302886500168e-06, "loss": 0.3505, "step": 18180 }, { "epoch": 0.6798607379021894, "grad_norm": 0.3892793655395508, "learning_rate": 4.645342239664511e-06, "loss": 0.287, "step": 18185 }, { "epoch": 0.6800476668925391, "grad_norm": 0.21411560475826263, "learning_rate": 4.640383439496888e-06, "loss": 0.1991, "step": 18190 }, { "epoch": 0.680234595882889, "grad_norm": 0.30452221632003784, "learning_rate": 4.635426487707445e-06, "loss": 0.2585, "step": 18195 }, { "epoch": 0.6804215248732388, "grad_norm": 0.3732444941997528, "learning_rate": 4.630471386005688e-06, "loss": 0.2668, "step": 18200 }, { "epoch": 0.6806084538635886, "grad_norm": 0.4895915687084198, "learning_rate": 4.625518136100498e-06, "loss": 0.3098, "step": 18205 }, { "epoch": 0.6807953828539384, "grad_norm": 0.25300148129463196, "learning_rate": 4.6205667397001e-06, "loss": 0.2581, "step": 18210 }, { "epoch": 0.6809823118442881, "grad_norm": 0.38394615054130554, "learning_rate": 4.615617198512097e-06, "loss": 0.2979, "step": 18215 }, { "epoch": 0.6811692408346379, "grad_norm": 0.3098186254501343, "learning_rate": 4.6106695142434355e-06, "loss": 0.2881, "step": 18220 }, { "epoch": 0.6813561698249877, "grad_norm": 0.5445083379745483, "learning_rate": 4.60572368860044e-06, "loss": 0.2482, "step": 18225 }, { "epoch": 0.6815430988153375, "grad_norm": 0.3289521634578705, "learning_rate": 4.600779723288774e-06, "loss": 0.3112, "step": 18230 }, { "epoch": 0.6817300278056874, "grad_norm": 0.6174635291099548, "learning_rate": 4.595837620013478e-06, "loss": 0.2644, "step": 18235 }, { "epoch": 0.6819169567960371, "grad_norm": 0.3422519266605377, "learning_rate": 4.5908973804789385e-06, "loss": 0.2874, "step": 18240 }, { "epoch": 0.6821038857863869, "grad_norm": 0.42430946230888367, "learning_rate": 4.585959006388898e-06, "loss": 0.288, "step": 18245 }, { "epoch": 0.6822908147767367, "grad_norm": 0.4555477499961853, "learning_rate": 4.581022499446468e-06, "loss": 0.3142, "step": 18250 }, { "epoch": 0.6824777437670865, "grad_norm": 1.2911605834960938, "learning_rate": 4.576087861354101e-06, "loss": 0.54, "step": 18255 }, { "epoch": 0.6826646727574363, "grad_norm": 0.3238535523414612, "learning_rate": 4.571155093813619e-06, "loss": 0.2987, "step": 18260 }, { "epoch": 0.682851601747786, "grad_norm": 0.2774864137172699, "learning_rate": 4.5662241985261865e-06, "loss": 0.2595, "step": 18265 }, { "epoch": 0.6830385307381358, "grad_norm": 0.3135109543800354, "learning_rate": 4.5612951771923345e-06, "loss": 0.2846, "step": 18270 }, { "epoch": 0.6832254597284857, "grad_norm": 0.6082348823547363, "learning_rate": 4.556368031511932e-06, "loss": 0.2893, "step": 18275 }, { "epoch": 0.6834123887188355, "grad_norm": 0.5355720520019531, "learning_rate": 4.551442763184221e-06, "loss": 0.3026, "step": 18280 }, { "epoch": 0.6835993177091853, "grad_norm": 0.6126582026481628, "learning_rate": 4.546519373907778e-06, "loss": 0.2891, "step": 18285 }, { "epoch": 0.683786246699535, "grad_norm": 0.47081178426742554, "learning_rate": 4.541597865380539e-06, "loss": 0.245, "step": 18290 }, { "epoch": 0.6839731756898848, "grad_norm": 0.6696823835372925, "learning_rate": 4.536678239299797e-06, "loss": 0.3148, "step": 18295 }, { "epoch": 0.6841601046802346, "grad_norm": 0.41798293590545654, "learning_rate": 4.531760497362181e-06, "loss": 0.2587, "step": 18300 }, { "epoch": 0.6843470336705844, "grad_norm": 0.4290241003036499, "learning_rate": 4.526844641263689e-06, "loss": 0.2839, "step": 18305 }, { "epoch": 0.6845339626609341, "grad_norm": 0.4475858509540558, "learning_rate": 4.521930672699651e-06, "loss": 0.3038, "step": 18310 }, { "epoch": 0.6847208916512839, "grad_norm": 0.3587439954280853, "learning_rate": 4.517018593364761e-06, "loss": 0.2565, "step": 18315 }, { "epoch": 0.6849078206416338, "grad_norm": 0.6752985715866089, "learning_rate": 4.512108404953048e-06, "loss": 0.2591, "step": 18320 }, { "epoch": 0.6850947496319836, "grad_norm": 0.3796422779560089, "learning_rate": 4.507200109157901e-06, "loss": 0.2285, "step": 18325 }, { "epoch": 0.6852816786223334, "grad_norm": 0.23058107495307922, "learning_rate": 4.502293707672044e-06, "loss": 0.2528, "step": 18330 }, { "epoch": 0.6854686076126831, "grad_norm": 0.29818716645240784, "learning_rate": 4.497389202187562e-06, "loss": 0.265, "step": 18335 }, { "epoch": 0.6856555366030329, "grad_norm": 0.48177286982536316, "learning_rate": 4.492486594395875e-06, "loss": 0.2451, "step": 18340 }, { "epoch": 0.6858424655933827, "grad_norm": 0.4235864281654358, "learning_rate": 4.487585885987747e-06, "loss": 0.3306, "step": 18345 }, { "epoch": 0.6860293945837325, "grad_norm": 0.5081339478492737, "learning_rate": 4.4826870786533e-06, "loss": 0.3331, "step": 18350 }, { "epoch": 0.6862163235740822, "grad_norm": 0.5094728469848633, "learning_rate": 4.477790174081984e-06, "loss": 0.2585, "step": 18355 }, { "epoch": 0.6864032525644321, "grad_norm": 0.29224979877471924, "learning_rate": 4.47289517396261e-06, "loss": 0.2684, "step": 18360 }, { "epoch": 0.6865901815547819, "grad_norm": 0.2801983058452606, "learning_rate": 4.468002079983315e-06, "loss": 0.3254, "step": 18365 }, { "epoch": 0.6867771105451317, "grad_norm": 0.688702404499054, "learning_rate": 4.463110893831596e-06, "loss": 0.2631, "step": 18370 }, { "epoch": 0.6869640395354815, "grad_norm": 0.43564876914024353, "learning_rate": 4.458221617194273e-06, "loss": 0.2727, "step": 18375 }, { "epoch": 0.6871509685258312, "grad_norm": 0.28386905789375305, "learning_rate": 4.453334251757526e-06, "loss": 0.3505, "step": 18380 }, { "epoch": 0.687337897516181, "grad_norm": 0.3805743157863617, "learning_rate": 4.448448799206863e-06, "loss": 0.2635, "step": 18385 }, { "epoch": 0.6875248265065308, "grad_norm": 0.2629093825817108, "learning_rate": 4.443565261227134e-06, "loss": 0.2978, "step": 18390 }, { "epoch": 0.6877117554968806, "grad_norm": 0.26355159282684326, "learning_rate": 4.438683639502538e-06, "loss": 0.3312, "step": 18395 }, { "epoch": 0.6878986844872305, "grad_norm": 0.42087700963020325, "learning_rate": 4.4338039357165985e-06, "loss": 0.2332, "step": 18400 }, { "epoch": 0.6880856134775802, "grad_norm": 0.42999762296676636, "learning_rate": 4.428926151552194e-06, "loss": 0.3205, "step": 18405 }, { "epoch": 0.68827254246793, "grad_norm": 0.38532260060310364, "learning_rate": 4.424050288691525e-06, "loss": 0.2726, "step": 18410 }, { "epoch": 0.6884594714582798, "grad_norm": 0.40023544430732727, "learning_rate": 4.419176348816144e-06, "loss": 0.2823, "step": 18415 }, { "epoch": 0.6886464004486296, "grad_norm": 0.35504770278930664, "learning_rate": 4.414304333606926e-06, "loss": 0.2744, "step": 18420 }, { "epoch": 0.6888333294389793, "grad_norm": 0.5017563700675964, "learning_rate": 4.409434244744095e-06, "loss": 0.3578, "step": 18425 }, { "epoch": 0.6890202584293291, "grad_norm": 0.4949704706668854, "learning_rate": 4.4045660839072045e-06, "loss": 0.2341, "step": 18430 }, { "epoch": 0.6892071874196789, "grad_norm": 0.40918684005737305, "learning_rate": 4.399699852775138e-06, "loss": 0.2464, "step": 18435 }, { "epoch": 0.6893941164100288, "grad_norm": 0.7892428040504456, "learning_rate": 4.394835553026128e-06, "loss": 0.4378, "step": 18440 }, { "epoch": 0.6895810454003786, "grad_norm": 0.2694770395755768, "learning_rate": 4.3899731863377225e-06, "loss": 0.2667, "step": 18445 }, { "epoch": 0.6897679743907283, "grad_norm": 0.35795870423316956, "learning_rate": 4.385112754386821e-06, "loss": 0.2793, "step": 18450 }, { "epoch": 0.6899549033810781, "grad_norm": 0.3480895161628723, "learning_rate": 4.380254258849641e-06, "loss": 0.2119, "step": 18455 }, { "epoch": 0.6901418323714279, "grad_norm": 0.5051251649856567, "learning_rate": 4.375397701401745e-06, "loss": 0.2969, "step": 18460 }, { "epoch": 0.6903287613617777, "grad_norm": 0.3847442865371704, "learning_rate": 4.370543083718012e-06, "loss": 0.2343, "step": 18465 }, { "epoch": 0.6905156903521275, "grad_norm": 0.3477119207382202, "learning_rate": 4.365690407472668e-06, "loss": 0.2356, "step": 18470 }, { "epoch": 0.6907026193424772, "grad_norm": 0.6154903769493103, "learning_rate": 4.36083967433926e-06, "loss": 0.2936, "step": 18475 }, { "epoch": 0.6908895483328271, "grad_norm": 0.5352329611778259, "learning_rate": 4.355990885990663e-06, "loss": 0.2828, "step": 18480 }, { "epoch": 0.6910764773231769, "grad_norm": 0.39464572072029114, "learning_rate": 4.351144044099091e-06, "loss": 0.2015, "step": 18485 }, { "epoch": 0.6912634063135267, "grad_norm": 0.483270525932312, "learning_rate": 4.346299150336074e-06, "loss": 0.2555, "step": 18490 }, { "epoch": 0.6914503353038765, "grad_norm": 0.39474979043006897, "learning_rate": 4.341456206372485e-06, "loss": 0.3986, "step": 18495 }, { "epoch": 0.6916372642942262, "grad_norm": 0.5631116628646851, "learning_rate": 4.336615213878509e-06, "loss": 0.2867, "step": 18500 }, { "epoch": 0.691824193284576, "grad_norm": 0.623786985874176, "learning_rate": 4.331776174523673e-06, "loss": 0.3206, "step": 18505 }, { "epoch": 0.6920111222749258, "grad_norm": 0.2712309658527374, "learning_rate": 4.326939089976815e-06, "loss": 0.3039, "step": 18510 }, { "epoch": 0.6921980512652756, "grad_norm": 0.34029409289360046, "learning_rate": 4.322103961906113e-06, "loss": 0.2136, "step": 18515 }, { "epoch": 0.6923849802556254, "grad_norm": 0.52415931224823, "learning_rate": 4.317270791979063e-06, "loss": 0.1956, "step": 18520 }, { "epoch": 0.6925719092459752, "grad_norm": 0.38135063648223877, "learning_rate": 4.312439581862488e-06, "loss": 0.2654, "step": 18525 }, { "epoch": 0.692758838236325, "grad_norm": 0.8677012920379639, "learning_rate": 4.307610333222532e-06, "loss": 0.28, "step": 18530 }, { "epoch": 0.6929457672266748, "grad_norm": 0.19962137937545776, "learning_rate": 4.30278304772466e-06, "loss": 0.2984, "step": 18535 }, { "epoch": 0.6931326962170246, "grad_norm": 0.4274756610393524, "learning_rate": 4.297957727033673e-06, "loss": 0.2288, "step": 18540 }, { "epoch": 0.6933196252073743, "grad_norm": 0.3446785509586334, "learning_rate": 4.293134372813678e-06, "loss": 0.2482, "step": 18545 }, { "epoch": 0.6935065541977241, "grad_norm": 0.30156296491622925, "learning_rate": 4.288312986728119e-06, "loss": 0.254, "step": 18550 }, { "epoch": 0.6936934831880739, "grad_norm": 0.20253904163837433, "learning_rate": 4.283493570439746e-06, "loss": 0.2544, "step": 18555 }, { "epoch": 0.6938804121784237, "grad_norm": 0.2638651728630066, "learning_rate": 4.278676125610644e-06, "loss": 0.2614, "step": 18560 }, { "epoch": 0.6940673411687736, "grad_norm": 0.2870006859302521, "learning_rate": 4.2738606539022105e-06, "loss": 0.3594, "step": 18565 }, { "epoch": 0.6942542701591233, "grad_norm": 0.4544857442378998, "learning_rate": 4.269047156975166e-06, "loss": 0.3418, "step": 18570 }, { "epoch": 0.6944411991494731, "grad_norm": 0.21792225539684296, "learning_rate": 4.264235636489542e-06, "loss": 0.3146, "step": 18575 }, { "epoch": 0.6946281281398229, "grad_norm": 0.5876854658126831, "learning_rate": 4.2594260941046935e-06, "loss": 0.2753, "step": 18580 }, { "epoch": 0.6948150571301727, "grad_norm": 0.4777786135673523, "learning_rate": 4.254618531479301e-06, "loss": 0.3524, "step": 18585 }, { "epoch": 0.6950019861205224, "grad_norm": 0.3921513259410858, "learning_rate": 4.249812950271347e-06, "loss": 0.3673, "step": 18590 }, { "epoch": 0.6951889151108722, "grad_norm": 0.284984290599823, "learning_rate": 4.245009352138146e-06, "loss": 0.3071, "step": 18595 }, { "epoch": 0.695375844101222, "grad_norm": 0.4456053078174591, "learning_rate": 4.240207738736315e-06, "loss": 0.3151, "step": 18600 }, { "epoch": 0.6955627730915719, "grad_norm": 0.5414977669715881, "learning_rate": 4.235408111721796e-06, "loss": 0.249, "step": 18605 }, { "epoch": 0.6957497020819217, "grad_norm": 0.7804057598114014, "learning_rate": 4.230610472749847e-06, "loss": 0.3312, "step": 18610 }, { "epoch": 0.6959366310722714, "grad_norm": 0.4957026541233063, "learning_rate": 4.225814823475031e-06, "loss": 0.222, "step": 18615 }, { "epoch": 0.6961235600626212, "grad_norm": 0.2853618860244751, "learning_rate": 4.221021165551232e-06, "loss": 0.2572, "step": 18620 }, { "epoch": 0.696310489052971, "grad_norm": 0.4198627769947052, "learning_rate": 4.21622950063164e-06, "loss": 0.2863, "step": 18625 }, { "epoch": 0.6964974180433208, "grad_norm": 0.25387126207351685, "learning_rate": 4.211439830368771e-06, "loss": 0.2627, "step": 18630 }, { "epoch": 0.6966843470336705, "grad_norm": 0.3733402192592621, "learning_rate": 4.206652156414437e-06, "loss": 0.3436, "step": 18635 }, { "epoch": 0.6968712760240203, "grad_norm": 0.24343183636665344, "learning_rate": 4.2018664804197784e-06, "loss": 0.2873, "step": 18640 }, { "epoch": 0.6970582050143702, "grad_norm": 0.2914699614048004, "learning_rate": 4.19708280403523e-06, "loss": 0.2815, "step": 18645 }, { "epoch": 0.69724513400472, "grad_norm": 0.5060830116271973, "learning_rate": 4.192301128910546e-06, "loss": 0.2341, "step": 18650 }, { "epoch": 0.6974320629950698, "grad_norm": 0.6719004511833191, "learning_rate": 4.187521456694797e-06, "loss": 0.2972, "step": 18655 }, { "epoch": 0.6976189919854195, "grad_norm": 0.49988633394241333, "learning_rate": 4.182743789036346e-06, "loss": 0.3975, "step": 18660 }, { "epoch": 0.6978059209757693, "grad_norm": 0.3273427188396454, "learning_rate": 4.1779681275828795e-06, "loss": 0.278, "step": 18665 }, { "epoch": 0.6979928499661191, "grad_norm": 1.3453400135040283, "learning_rate": 4.173194473981379e-06, "loss": 0.3058, "step": 18670 }, { "epoch": 0.6981797789564689, "grad_norm": 0.30567625164985657, "learning_rate": 4.168422829878148e-06, "loss": 0.3541, "step": 18675 }, { "epoch": 0.6983667079468187, "grad_norm": 0.45551925897598267, "learning_rate": 4.163653196918784e-06, "loss": 0.334, "step": 18680 }, { "epoch": 0.6985536369371685, "grad_norm": 0.33953145146369934, "learning_rate": 4.158885576748205e-06, "loss": 0.3156, "step": 18685 }, { "epoch": 0.6987405659275183, "grad_norm": 0.5441725254058838, "learning_rate": 4.154119971010616e-06, "loss": 0.2464, "step": 18690 }, { "epoch": 0.6989274949178681, "grad_norm": 0.25098875164985657, "learning_rate": 4.149356381349544e-06, "loss": 0.2644, "step": 18695 }, { "epoch": 0.6991144239082179, "grad_norm": 0.35483863949775696, "learning_rate": 4.144594809407818e-06, "loss": 0.2527, "step": 18700 }, { "epoch": 0.6993013528985677, "grad_norm": 0.6076955795288086, "learning_rate": 4.139835256827559e-06, "loss": 0.2728, "step": 18705 }, { "epoch": 0.6994882818889174, "grad_norm": 0.26839420199394226, "learning_rate": 4.135077725250209e-06, "loss": 0.2568, "step": 18710 }, { "epoch": 0.6996752108792672, "grad_norm": 0.3890112042427063, "learning_rate": 4.130322216316502e-06, "loss": 0.2509, "step": 18715 }, { "epoch": 0.699862139869617, "grad_norm": 0.4457304775714874, "learning_rate": 4.125568731666473e-06, "loss": 0.2774, "step": 18720 }, { "epoch": 0.7000490688599669, "grad_norm": 0.3597055971622467, "learning_rate": 4.120817272939462e-06, "loss": 0.2028, "step": 18725 }, { "epoch": 0.7002359978503166, "grad_norm": 0.2370787411928177, "learning_rate": 4.116067841774116e-06, "loss": 0.3078, "step": 18730 }, { "epoch": 0.7004229268406664, "grad_norm": 0.7779338359832764, "learning_rate": 4.111320439808373e-06, "loss": 0.3016, "step": 18735 }, { "epoch": 0.7006098558310162, "grad_norm": 0.3370744585990906, "learning_rate": 4.106575068679477e-06, "loss": 0.274, "step": 18740 }, { "epoch": 0.700796784821366, "grad_norm": 0.46344175934791565, "learning_rate": 4.101831730023978e-06, "loss": 0.3209, "step": 18745 }, { "epoch": 0.7009837138117158, "grad_norm": 0.3550253212451935, "learning_rate": 4.097090425477706e-06, "loss": 0.2878, "step": 18750 }, { "epoch": 0.7011706428020655, "grad_norm": 0.5622664093971252, "learning_rate": 4.092351156675809e-06, "loss": 0.2869, "step": 18755 }, { "epoch": 0.7013575717924153, "grad_norm": 0.27434638142585754, "learning_rate": 4.087613925252723e-06, "loss": 0.3175, "step": 18760 }, { "epoch": 0.7015445007827652, "grad_norm": 0.5039125084877014, "learning_rate": 4.082878732842185e-06, "loss": 0.279, "step": 18765 }, { "epoch": 0.701731429773115, "grad_norm": 0.37588292360305786, "learning_rate": 4.07814558107722e-06, "loss": 0.2938, "step": 18770 }, { "epoch": 0.7019183587634648, "grad_norm": 0.36863166093826294, "learning_rate": 4.073414471590165e-06, "loss": 0.3311, "step": 18775 }, { "epoch": 0.7021052877538145, "grad_norm": 0.3147136867046356, "learning_rate": 4.068685406012637e-06, "loss": 0.3818, "step": 18780 }, { "epoch": 0.7022922167441643, "grad_norm": 0.3484380841255188, "learning_rate": 4.06395838597556e-06, "loss": 0.3061, "step": 18785 }, { "epoch": 0.7024791457345141, "grad_norm": 0.34823089838027954, "learning_rate": 4.059233413109148e-06, "loss": 0.2871, "step": 18790 }, { "epoch": 0.7026660747248639, "grad_norm": 0.21362809836864471, "learning_rate": 4.054510489042906e-06, "loss": 0.2635, "step": 18795 }, { "epoch": 0.7028530037152136, "grad_norm": 0.4324122369289398, "learning_rate": 4.049789615405638e-06, "loss": 0.2149, "step": 18800 }, { "epoch": 0.7030399327055634, "grad_norm": 0.6976597905158997, "learning_rate": 4.0450707938254385e-06, "loss": 0.2078, "step": 18805 }, { "epoch": 0.7032268616959133, "grad_norm": 0.7980237603187561, "learning_rate": 4.0403540259296905e-06, "loss": 0.3072, "step": 18810 }, { "epoch": 0.7034137906862631, "grad_norm": 0.3287063241004944, "learning_rate": 4.03563931334507e-06, "loss": 0.2836, "step": 18815 }, { "epoch": 0.7036007196766129, "grad_norm": 0.6408100128173828, "learning_rate": 4.030926657697554e-06, "loss": 0.2664, "step": 18820 }, { "epoch": 0.7037876486669626, "grad_norm": 0.44458410143852234, "learning_rate": 4.0262160606123946e-06, "loss": 0.3106, "step": 18825 }, { "epoch": 0.7039745776573124, "grad_norm": 0.19885852932929993, "learning_rate": 4.021507523714145e-06, "loss": 0.2218, "step": 18830 }, { "epoch": 0.7041615066476622, "grad_norm": 0.46404361724853516, "learning_rate": 4.016801048626648e-06, "loss": 0.251, "step": 18835 }, { "epoch": 0.704348435638012, "grad_norm": 0.6827042102813721, "learning_rate": 4.012096636973027e-06, "loss": 0.2417, "step": 18840 }, { "epoch": 0.7045353646283617, "grad_norm": 0.3932143747806549, "learning_rate": 4.007394290375703e-06, "loss": 0.3334, "step": 18845 }, { "epoch": 0.7047222936187116, "grad_norm": 0.5734818577766418, "learning_rate": 4.002694010456379e-06, "loss": 0.3273, "step": 18850 }, { "epoch": 0.7049092226090614, "grad_norm": 0.6905113458633423, "learning_rate": 3.997995798836046e-06, "loss": 0.2476, "step": 18855 }, { "epoch": 0.7050961515994112, "grad_norm": 0.5125166177749634, "learning_rate": 3.993299657134979e-06, "loss": 0.2125, "step": 18860 }, { "epoch": 0.705283080589761, "grad_norm": 0.22294095158576965, "learning_rate": 3.98860558697275e-06, "loss": 0.2655, "step": 18865 }, { "epoch": 0.7054700095801107, "grad_norm": 0.5617648959159851, "learning_rate": 3.983913589968202e-06, "loss": 0.3351, "step": 18870 }, { "epoch": 0.7056569385704605, "grad_norm": 0.2911111116409302, "learning_rate": 3.979223667739475e-06, "loss": 0.2118, "step": 18875 }, { "epoch": 0.7058438675608103, "grad_norm": 0.49967023730278015, "learning_rate": 3.974535821903992e-06, "loss": 0.2808, "step": 18880 }, { "epoch": 0.7060307965511601, "grad_norm": 0.20392078161239624, "learning_rate": 3.969850054078448e-06, "loss": 0.2092, "step": 18885 }, { "epoch": 0.70621772554151, "grad_norm": 0.31886929273605347, "learning_rate": 3.965166365878839e-06, "loss": 0.2398, "step": 18890 }, { "epoch": 0.7064046545318597, "grad_norm": 0.4940277934074402, "learning_rate": 3.960484758920426e-06, "loss": 0.2671, "step": 18895 }, { "epoch": 0.7065915835222095, "grad_norm": 0.4553227424621582, "learning_rate": 3.95580523481777e-06, "loss": 0.3112, "step": 18900 }, { "epoch": 0.7067785125125593, "grad_norm": 0.4576134979724884, "learning_rate": 3.9511277951847e-06, "loss": 0.302, "step": 18905 }, { "epoch": 0.7069654415029091, "grad_norm": 0.4832530617713928, "learning_rate": 3.946452441634332e-06, "loss": 0.2758, "step": 18910 }, { "epoch": 0.7071523704932589, "grad_norm": 0.4215802550315857, "learning_rate": 3.9417791757790565e-06, "loss": 0.2889, "step": 18915 }, { "epoch": 0.7073392994836086, "grad_norm": 0.6536740660667419, "learning_rate": 3.937107999230554e-06, "loss": 0.3214, "step": 18920 }, { "epoch": 0.7075262284739584, "grad_norm": 0.4992019832134247, "learning_rate": 3.932438913599781e-06, "loss": 0.2528, "step": 18925 }, { "epoch": 0.7077131574643083, "grad_norm": 0.21796809136867523, "learning_rate": 3.927771920496967e-06, "loss": 0.3074, "step": 18930 }, { "epoch": 0.7079000864546581, "grad_norm": 0.25290024280548096, "learning_rate": 3.923107021531629e-06, "loss": 0.3147, "step": 18935 }, { "epoch": 0.7080870154450079, "grad_norm": 0.2794661521911621, "learning_rate": 3.918444218312551e-06, "loss": 0.304, "step": 18940 }, { "epoch": 0.7082739444353576, "grad_norm": 0.9677742719650269, "learning_rate": 3.913783512447806e-06, "loss": 0.3212, "step": 18945 }, { "epoch": 0.7084608734257074, "grad_norm": 0.3141147792339325, "learning_rate": 3.909124905544737e-06, "loss": 0.2581, "step": 18950 }, { "epoch": 0.7086478024160572, "grad_norm": 0.8014910817146301, "learning_rate": 3.9044683992099616e-06, "loss": 0.3311, "step": 18955 }, { "epoch": 0.708834731406407, "grad_norm": 0.2427234798669815, "learning_rate": 3.899813995049373e-06, "loss": 0.285, "step": 18960 }, { "epoch": 0.7090216603967567, "grad_norm": 0.49129942059516907, "learning_rate": 3.895161694668144e-06, "loss": 0.2669, "step": 18965 }, { "epoch": 0.7092085893871066, "grad_norm": 0.351224422454834, "learning_rate": 3.890511499670726e-06, "loss": 0.2503, "step": 18970 }, { "epoch": 0.7093955183774564, "grad_norm": 0.4046824872493744, "learning_rate": 3.885863411660829e-06, "loss": 0.2601, "step": 18975 }, { "epoch": 0.7095824473678062, "grad_norm": 0.49013751745224, "learning_rate": 3.881217432241451e-06, "loss": 0.2791, "step": 18980 }, { "epoch": 0.709769376358156, "grad_norm": 0.3391354978084564, "learning_rate": 3.876573563014854e-06, "loss": 0.2573, "step": 18985 }, { "epoch": 0.7099563053485057, "grad_norm": 0.28667691349983215, "learning_rate": 3.8719318055825785e-06, "loss": 0.2763, "step": 18990 }, { "epoch": 0.7101432343388555, "grad_norm": 0.2968943417072296, "learning_rate": 3.8672921615454325e-06, "loss": 0.2952, "step": 18995 }, { "epoch": 0.7103301633292053, "grad_norm": 0.2967950403690338, "learning_rate": 3.862654632503495e-06, "loss": 0.3106, "step": 19000 }, { "epoch": 0.7105170923195551, "grad_norm": 0.21600590646266937, "learning_rate": 3.858019220056115e-06, "loss": 0.1917, "step": 19005 }, { "epoch": 0.710704021309905, "grad_norm": 0.5353944897651672, "learning_rate": 3.853385925801916e-06, "loss": 0.2285, "step": 19010 }, { "epoch": 0.7108909503002547, "grad_norm": 0.44608166813850403, "learning_rate": 3.848754751338792e-06, "loss": 0.2501, "step": 19015 }, { "epoch": 0.7110778792906045, "grad_norm": 0.38213208317756653, "learning_rate": 3.844125698263896e-06, "loss": 0.2357, "step": 19020 }, { "epoch": 0.7112648082809543, "grad_norm": 0.2469312697649002, "learning_rate": 3.8394987681736596e-06, "loss": 0.2981, "step": 19025 }, { "epoch": 0.7114517372713041, "grad_norm": 0.30986660718917847, "learning_rate": 3.834873962663775e-06, "loss": 0.3125, "step": 19030 }, { "epoch": 0.7116386662616538, "grad_norm": 0.5508708357810974, "learning_rate": 3.830251283329211e-06, "loss": 0.2756, "step": 19035 }, { "epoch": 0.7118255952520036, "grad_norm": 0.44314640760421753, "learning_rate": 3.825630731764195e-06, "loss": 0.3161, "step": 19040 }, { "epoch": 0.7120125242423534, "grad_norm": 0.3538496494293213, "learning_rate": 3.8210123095622164e-06, "loss": 0.2465, "step": 19045 }, { "epoch": 0.7121994532327032, "grad_norm": 0.4966811537742615, "learning_rate": 3.816396018316047e-06, "loss": 0.2047, "step": 19050 }, { "epoch": 0.7123863822230531, "grad_norm": 0.4113473892211914, "learning_rate": 3.8117818596177035e-06, "loss": 0.232, "step": 19055 }, { "epoch": 0.7125733112134028, "grad_norm": 0.42282649874687195, "learning_rate": 3.807169835058485e-06, "loss": 0.3454, "step": 19060 }, { "epoch": 0.7127602402037526, "grad_norm": 0.495076060295105, "learning_rate": 3.8025599462289407e-06, "loss": 0.2909, "step": 19065 }, { "epoch": 0.7129471691941024, "grad_norm": 0.4929969608783722, "learning_rate": 3.7979521947188946e-06, "loss": 0.3251, "step": 19070 }, { "epoch": 0.7131340981844522, "grad_norm": 0.2015666663646698, "learning_rate": 3.793346582117422e-06, "loss": 0.2641, "step": 19075 }, { "epoch": 0.713321027174802, "grad_norm": 0.42519763112068176, "learning_rate": 3.7887431100128746e-06, "loss": 0.3006, "step": 19080 }, { "epoch": 0.7135079561651517, "grad_norm": 0.543765664100647, "learning_rate": 3.784141779992849e-06, "loss": 0.319, "step": 19085 }, { "epoch": 0.7136948851555015, "grad_norm": 0.39010390639305115, "learning_rate": 3.77954259364422e-06, "loss": 0.2191, "step": 19090 }, { "epoch": 0.7138818141458514, "grad_norm": 0.6654005646705627, "learning_rate": 3.7749455525531122e-06, "loss": 0.2967, "step": 19095 }, { "epoch": 0.7140687431362012, "grad_norm": 0.3600732982158661, "learning_rate": 3.7703506583049097e-06, "loss": 0.333, "step": 19100 }, { "epoch": 0.714255672126551, "grad_norm": 0.42062684893608093, "learning_rate": 3.765757912484266e-06, "loss": 0.3078, "step": 19105 }, { "epoch": 0.7144426011169007, "grad_norm": 0.324552983045578, "learning_rate": 3.7611673166750816e-06, "loss": 0.2595, "step": 19110 }, { "epoch": 0.7146295301072505, "grad_norm": 0.522148072719574, "learning_rate": 3.7565788724605278e-06, "loss": 0.2575, "step": 19115 }, { "epoch": 0.7148164590976003, "grad_norm": 0.2913423776626587, "learning_rate": 3.751992581423021e-06, "loss": 0.2701, "step": 19120 }, { "epoch": 0.71500338808795, "grad_norm": 0.5544187426567078, "learning_rate": 3.7474084451442484e-06, "loss": 0.2332, "step": 19125 }, { "epoch": 0.7151903170782998, "grad_norm": 0.7291717529296875, "learning_rate": 3.7428264652051393e-06, "loss": 0.3321, "step": 19130 }, { "epoch": 0.7153772460686497, "grad_norm": 0.5494731664657593, "learning_rate": 3.7382466431858966e-06, "loss": 0.318, "step": 19135 }, { "epoch": 0.7155641750589995, "grad_norm": 0.3049234449863434, "learning_rate": 3.733668980665963e-06, "loss": 0.2599, "step": 19140 }, { "epoch": 0.7157511040493493, "grad_norm": 0.3012910783290863, "learning_rate": 3.729093479224043e-06, "loss": 0.2457, "step": 19145 }, { "epoch": 0.715938033039699, "grad_norm": 0.23188547790050507, "learning_rate": 3.7245201404381006e-06, "loss": 0.2909, "step": 19150 }, { "epoch": 0.7161249620300488, "grad_norm": 0.1563846319913864, "learning_rate": 3.7199489658853428e-06, "loss": 0.3587, "step": 19155 }, { "epoch": 0.7163118910203986, "grad_norm": 0.34965288639068604, "learning_rate": 3.7153799571422444e-06, "loss": 0.2779, "step": 19160 }, { "epoch": 0.7164988200107484, "grad_norm": 0.3783499300479889, "learning_rate": 3.710813115784517e-06, "loss": 0.3366, "step": 19165 }, { "epoch": 0.7166857490010982, "grad_norm": 0.5413819551467896, "learning_rate": 3.7062484433871426e-06, "loss": 0.3233, "step": 19170 }, { "epoch": 0.716872677991448, "grad_norm": 0.43029576539993286, "learning_rate": 3.7016859415243377e-06, "loss": 0.2896, "step": 19175 }, { "epoch": 0.7170596069817978, "grad_norm": 0.23014762997627258, "learning_rate": 3.6971256117695853e-06, "loss": 0.34, "step": 19180 }, { "epoch": 0.7172465359721476, "grad_norm": 0.5080450773239136, "learning_rate": 3.692567455695609e-06, "loss": 0.2345, "step": 19185 }, { "epoch": 0.7174334649624974, "grad_norm": 0.36543214321136475, "learning_rate": 3.6880114748743832e-06, "loss": 0.3117, "step": 19190 }, { "epoch": 0.7176203939528472, "grad_norm": 0.38558727502822876, "learning_rate": 3.683457670877142e-06, "loss": 0.2428, "step": 19195 }, { "epoch": 0.7178073229431969, "grad_norm": 0.16458238661289215, "learning_rate": 3.678906045274355e-06, "loss": 0.2913, "step": 19200 }, { "epoch": 0.7179942519335467, "grad_norm": 0.4184258282184601, "learning_rate": 3.6743565996357534e-06, "loss": 0.3226, "step": 19205 }, { "epoch": 0.7181811809238965, "grad_norm": 0.33344846963882446, "learning_rate": 3.6698093355303054e-06, "loss": 0.3476, "step": 19210 }, { "epoch": 0.7183681099142464, "grad_norm": 0.9731647372245789, "learning_rate": 3.6652642545262374e-06, "loss": 0.4277, "step": 19215 }, { "epoch": 0.7185550389045962, "grad_norm": 0.38592931628227234, "learning_rate": 3.6607213581910116e-06, "loss": 0.3507, "step": 19220 }, { "epoch": 0.7187419678949459, "grad_norm": 0.5030714273452759, "learning_rate": 3.65618064809135e-06, "loss": 0.2249, "step": 19225 }, { "epoch": 0.7189288968852957, "grad_norm": 0.3900645673274994, "learning_rate": 3.6516421257932054e-06, "loss": 0.304, "step": 19230 }, { "epoch": 0.7191158258756455, "grad_norm": 0.4885767698287964, "learning_rate": 3.6471057928617913e-06, "loss": 0.2138, "step": 19235 }, { "epoch": 0.7193027548659953, "grad_norm": 0.33923253417015076, "learning_rate": 3.6425716508615574e-06, "loss": 0.2623, "step": 19240 }, { "epoch": 0.719489683856345, "grad_norm": 0.3621087670326233, "learning_rate": 3.638039701356193e-06, "loss": 0.2484, "step": 19245 }, { "epoch": 0.7196766128466948, "grad_norm": 0.18470481038093567, "learning_rate": 3.6335099459086453e-06, "loss": 0.3044, "step": 19250 }, { "epoch": 0.7198635418370447, "grad_norm": 0.3943520188331604, "learning_rate": 3.6289823860810926e-06, "loss": 0.2953, "step": 19255 }, { "epoch": 0.7200504708273945, "grad_norm": 0.47943511605262756, "learning_rate": 3.624457023434964e-06, "loss": 0.3055, "step": 19260 }, { "epoch": 0.7202373998177443, "grad_norm": 0.5454005002975464, "learning_rate": 3.619933859530923e-06, "loss": 0.2572, "step": 19265 }, { "epoch": 0.720424328808094, "grad_norm": 0.5797293186187744, "learning_rate": 3.6154128959288847e-06, "loss": 0.311, "step": 19270 }, { "epoch": 0.7206112577984438, "grad_norm": 0.2344737946987152, "learning_rate": 3.610894134187993e-06, "loss": 0.2628, "step": 19275 }, { "epoch": 0.7207981867887936, "grad_norm": 0.279786616563797, "learning_rate": 3.6063775758666485e-06, "loss": 0.2749, "step": 19280 }, { "epoch": 0.7209851157791434, "grad_norm": 0.28806987404823303, "learning_rate": 3.601863222522477e-06, "loss": 0.4239, "step": 19285 }, { "epoch": 0.7211720447694931, "grad_norm": 0.3944340646266937, "learning_rate": 3.5973510757123464e-06, "loss": 0.3092, "step": 19290 }, { "epoch": 0.7213589737598429, "grad_norm": 0.7157394886016846, "learning_rate": 3.5928411369923743e-06, "loss": 0.3422, "step": 19295 }, { "epoch": 0.7215459027501928, "grad_norm": 0.6664456129074097, "learning_rate": 3.5883334079179023e-06, "loss": 0.2451, "step": 19300 }, { "epoch": 0.7217328317405426, "grad_norm": 0.32842952013015747, "learning_rate": 3.583827890043524e-06, "loss": 0.2177, "step": 19305 }, { "epoch": 0.7219197607308924, "grad_norm": 0.3629484474658966, "learning_rate": 3.5793245849230563e-06, "loss": 0.2519, "step": 19310 }, { "epoch": 0.7221066897212421, "grad_norm": 0.506600022315979, "learning_rate": 3.574823494109567e-06, "loss": 0.3817, "step": 19315 }, { "epoch": 0.7222936187115919, "grad_norm": 0.38654983043670654, "learning_rate": 3.5703246191553463e-06, "loss": 0.2811, "step": 19320 }, { "epoch": 0.7224805477019417, "grad_norm": 0.488619863986969, "learning_rate": 3.565827961611935e-06, "loss": 0.2265, "step": 19325 }, { "epoch": 0.7226674766922915, "grad_norm": 0.38500773906707764, "learning_rate": 3.561333523030097e-06, "loss": 0.2155, "step": 19330 }, { "epoch": 0.7228544056826413, "grad_norm": 0.3992687463760376, "learning_rate": 3.5568413049598326e-06, "loss": 0.2698, "step": 19335 }, { "epoch": 0.7230413346729911, "grad_norm": 0.5223041772842407, "learning_rate": 3.552351308950386e-06, "loss": 0.3776, "step": 19340 }, { "epoch": 0.7232282636633409, "grad_norm": 0.32374194264411926, "learning_rate": 3.54786353655022e-06, "loss": 0.2399, "step": 19345 }, { "epoch": 0.7234151926536907, "grad_norm": 0.3728601038455963, "learning_rate": 3.5433779893070477e-06, "loss": 0.2624, "step": 19350 }, { "epoch": 0.7236021216440405, "grad_norm": 0.29935961961746216, "learning_rate": 3.538894668767797e-06, "loss": 0.2591, "step": 19355 }, { "epoch": 0.7237890506343903, "grad_norm": 0.3965166211128235, "learning_rate": 3.534413576478645e-06, "loss": 0.2904, "step": 19360 }, { "epoch": 0.72397597962474, "grad_norm": 0.32862594723701477, "learning_rate": 3.5299347139849836e-06, "loss": 0.3974, "step": 19365 }, { "epoch": 0.7241629086150898, "grad_norm": 0.3905617594718933, "learning_rate": 3.5254580828314524e-06, "loss": 0.2661, "step": 19370 }, { "epoch": 0.7243498376054396, "grad_norm": 0.7117414474487305, "learning_rate": 3.5209836845619093e-06, "loss": 0.2346, "step": 19375 }, { "epoch": 0.7245367665957895, "grad_norm": 0.2594413757324219, "learning_rate": 3.5165115207194435e-06, "loss": 0.2891, "step": 19380 }, { "epoch": 0.7247236955861392, "grad_norm": 0.6064930558204651, "learning_rate": 3.5120415928463813e-06, "loss": 0.2849, "step": 19385 }, { "epoch": 0.724910624576489, "grad_norm": 0.34241193532943726, "learning_rate": 3.507573902484267e-06, "loss": 0.2779, "step": 19390 }, { "epoch": 0.7250975535668388, "grad_norm": 0.4620073735713959, "learning_rate": 3.5031084511738855e-06, "loss": 0.3194, "step": 19395 }, { "epoch": 0.7252844825571886, "grad_norm": 0.36498600244522095, "learning_rate": 3.4986452404552362e-06, "loss": 0.3193, "step": 19400 }, { "epoch": 0.7254714115475384, "grad_norm": 0.24594563245773315, "learning_rate": 3.49418427186756e-06, "loss": 0.3006, "step": 19405 }, { "epoch": 0.7256583405378881, "grad_norm": 0.6980525255203247, "learning_rate": 3.4897255469493096e-06, "loss": 0.2412, "step": 19410 }, { "epoch": 0.7258452695282379, "grad_norm": 0.5322422385215759, "learning_rate": 3.4852690672381785e-06, "loss": 0.2604, "step": 19415 }, { "epoch": 0.7260321985185878, "grad_norm": 0.27667078375816345, "learning_rate": 3.480814834271072e-06, "loss": 0.306, "step": 19420 }, { "epoch": 0.7262191275089376, "grad_norm": 0.38868507742881775, "learning_rate": 3.476362849584134e-06, "loss": 0.2896, "step": 19425 }, { "epoch": 0.7264060564992874, "grad_norm": 0.2022697925567627, "learning_rate": 3.4719131147127237e-06, "loss": 0.3543, "step": 19430 }, { "epoch": 0.7265929854896371, "grad_norm": 0.41005682945251465, "learning_rate": 3.4674656311914233e-06, "loss": 0.3022, "step": 19435 }, { "epoch": 0.7267799144799869, "grad_norm": 0.33465057611465454, "learning_rate": 3.463020400554049e-06, "loss": 0.3551, "step": 19440 }, { "epoch": 0.7269668434703367, "grad_norm": 0.3077739477157593, "learning_rate": 3.4585774243336277e-06, "loss": 0.2626, "step": 19445 }, { "epoch": 0.7271537724606865, "grad_norm": 1.222378134727478, "learning_rate": 3.45413670406242e-06, "loss": 0.3006, "step": 19450 }, { "epoch": 0.7273407014510362, "grad_norm": 0.28431418538093567, "learning_rate": 3.449698241271897e-06, "loss": 0.2822, "step": 19455 }, { "epoch": 0.7275276304413861, "grad_norm": 0.5643324851989746, "learning_rate": 3.445262037492765e-06, "loss": 0.306, "step": 19460 }, { "epoch": 0.7277145594317359, "grad_norm": 0.39042869210243225, "learning_rate": 3.4408280942549343e-06, "loss": 0.2474, "step": 19465 }, { "epoch": 0.7279014884220857, "grad_norm": 0.5540686249732971, "learning_rate": 3.436396413087555e-06, "loss": 0.2843, "step": 19470 }, { "epoch": 0.7280884174124355, "grad_norm": 0.3987734317779541, "learning_rate": 3.4319669955189806e-06, "loss": 0.34, "step": 19475 }, { "epoch": 0.7282753464027852, "grad_norm": 0.4319436252117157, "learning_rate": 3.427539843076788e-06, "loss": 0.3197, "step": 19480 }, { "epoch": 0.728462275393135, "grad_norm": 0.3150979280471802, "learning_rate": 3.423114957287783e-06, "loss": 0.3086, "step": 19485 }, { "epoch": 0.7286492043834848, "grad_norm": 0.34863778948783875, "learning_rate": 3.4186923396779735e-06, "loss": 0.2332, "step": 19490 }, { "epoch": 0.7288361333738346, "grad_norm": 0.41178667545318604, "learning_rate": 3.414271991772602e-06, "loss": 0.2602, "step": 19495 }, { "epoch": 0.7290230623641845, "grad_norm": 0.5623154044151306, "learning_rate": 3.4098539150961107e-06, "loss": 0.2037, "step": 19500 }, { "epoch": 0.7292099913545342, "grad_norm": 0.26398512721061707, "learning_rate": 3.4054381111721767e-06, "loss": 0.2504, "step": 19505 }, { "epoch": 0.729396920344884, "grad_norm": 0.4237319231033325, "learning_rate": 3.4010245815236775e-06, "loss": 0.2686, "step": 19510 }, { "epoch": 0.7295838493352338, "grad_norm": 0.3419848084449768, "learning_rate": 3.3966133276727178e-06, "loss": 0.238, "step": 19515 }, { "epoch": 0.7297707783255836, "grad_norm": 0.34581613540649414, "learning_rate": 3.392204351140611e-06, "loss": 0.2861, "step": 19520 }, { "epoch": 0.7299577073159333, "grad_norm": 0.27548453211784363, "learning_rate": 3.3877976534478816e-06, "loss": 0.3436, "step": 19525 }, { "epoch": 0.7301446363062831, "grad_norm": 0.5232365727424622, "learning_rate": 3.383393236114283e-06, "loss": 0.2877, "step": 19530 }, { "epoch": 0.7303315652966329, "grad_norm": 0.36319199204444885, "learning_rate": 3.378991100658764e-06, "loss": 0.4038, "step": 19535 }, { "epoch": 0.7305184942869827, "grad_norm": 0.3827814757823944, "learning_rate": 3.3745912485995e-06, "loss": 0.2816, "step": 19540 }, { "epoch": 0.7307054232773326, "grad_norm": 0.7302720546722412, "learning_rate": 3.370193681453872e-06, "loss": 0.3484, "step": 19545 }, { "epoch": 0.7308923522676823, "grad_norm": 0.3170529901981354, "learning_rate": 3.3657984007384757e-06, "loss": 0.2089, "step": 19550 }, { "epoch": 0.7310792812580321, "grad_norm": 0.6026202440261841, "learning_rate": 3.361405407969115e-06, "loss": 0.3252, "step": 19555 }, { "epoch": 0.7312662102483819, "grad_norm": 0.6313705444335938, "learning_rate": 3.3570147046608125e-06, "loss": 0.2608, "step": 19560 }, { "epoch": 0.7314531392387317, "grad_norm": 0.6118645071983337, "learning_rate": 3.352626292327793e-06, "loss": 0.303, "step": 19565 }, { "epoch": 0.7316400682290815, "grad_norm": 0.9984606504440308, "learning_rate": 3.348240172483491e-06, "loss": 0.269, "step": 19570 }, { "epoch": 0.7318269972194312, "grad_norm": 0.4851844906806946, "learning_rate": 3.3438563466405595e-06, "loss": 0.2853, "step": 19575 }, { "epoch": 0.732013926209781, "grad_norm": 0.5005273222923279, "learning_rate": 3.339474816310847e-06, "loss": 0.3988, "step": 19580 }, { "epoch": 0.7322008552001309, "grad_norm": 0.31317126750946045, "learning_rate": 3.3350955830054267e-06, "loss": 0.3109, "step": 19585 }, { "epoch": 0.7323877841904807, "grad_norm": 0.33924540877342224, "learning_rate": 3.330718648234562e-06, "loss": 0.3024, "step": 19590 }, { "epoch": 0.7325747131808305, "grad_norm": 0.5410972833633423, "learning_rate": 3.326344013507741e-06, "loss": 0.3328, "step": 19595 }, { "epoch": 0.7327616421711802, "grad_norm": 0.4295196235179901, "learning_rate": 3.321971680333641e-06, "loss": 0.2226, "step": 19600 }, { "epoch": 0.73294857116153, "grad_norm": 0.3176816999912262, "learning_rate": 3.317601650220159e-06, "loss": 0.2551, "step": 19605 }, { "epoch": 0.7331355001518798, "grad_norm": 0.5756362676620483, "learning_rate": 3.313233924674396e-06, "loss": 0.2447, "step": 19610 }, { "epoch": 0.7333224291422296, "grad_norm": 0.6090577840805054, "learning_rate": 3.3088685052026524e-06, "loss": 0.3359, "step": 19615 }, { "epoch": 0.7335093581325793, "grad_norm": 0.27702534198760986, "learning_rate": 3.3045053933104366e-06, "loss": 0.2796, "step": 19620 }, { "epoch": 0.7336962871229292, "grad_norm": 0.3996005654335022, "learning_rate": 3.3001445905024567e-06, "loss": 0.2924, "step": 19625 }, { "epoch": 0.733883216113279, "grad_norm": 0.4653981029987335, "learning_rate": 3.2957860982826363e-06, "loss": 0.242, "step": 19630 }, { "epoch": 0.7340701451036288, "grad_norm": 0.43299591541290283, "learning_rate": 3.2914299181540866e-06, "loss": 0.3355, "step": 19635 }, { "epoch": 0.7342570740939786, "grad_norm": 0.35684916377067566, "learning_rate": 3.287076051619137e-06, "loss": 0.3164, "step": 19640 }, { "epoch": 0.7344440030843283, "grad_norm": 0.5310947895050049, "learning_rate": 3.282724500179304e-06, "loss": 0.3019, "step": 19645 }, { "epoch": 0.7346309320746781, "grad_norm": 0.38951942324638367, "learning_rate": 3.2783752653353164e-06, "loss": 0.2736, "step": 19650 }, { "epoch": 0.7348178610650279, "grad_norm": 0.19010038673877716, "learning_rate": 3.2740283485871038e-06, "loss": 0.2463, "step": 19655 }, { "epoch": 0.7350047900553777, "grad_norm": 0.3501550853252411, "learning_rate": 3.269683751433791e-06, "loss": 0.2534, "step": 19660 }, { "epoch": 0.7351917190457276, "grad_norm": 0.40544456243515015, "learning_rate": 3.2653414753737047e-06, "loss": 0.2525, "step": 19665 }, { "epoch": 0.7353786480360773, "grad_norm": 1.1853599548339844, "learning_rate": 3.261001521904368e-06, "loss": 0.2623, "step": 19670 }, { "epoch": 0.7355655770264271, "grad_norm": 0.24595674872398376, "learning_rate": 3.2566638925225113e-06, "loss": 0.2254, "step": 19675 }, { "epoch": 0.7357525060167769, "grad_norm": 0.2667876183986664, "learning_rate": 3.2523285887240553e-06, "loss": 0.3439, "step": 19680 }, { "epoch": 0.7359394350071267, "grad_norm": 0.44477343559265137, "learning_rate": 3.2479956120041266e-06, "loss": 0.3082, "step": 19685 }, { "epoch": 0.7361263639974764, "grad_norm": 0.455649197101593, "learning_rate": 3.243664963857038e-06, "loss": 0.2301, "step": 19690 }, { "epoch": 0.7363132929878262, "grad_norm": 0.4365465044975281, "learning_rate": 3.239336645776311e-06, "loss": 0.2563, "step": 19695 }, { "epoch": 0.736500221978176, "grad_norm": 0.3654913902282715, "learning_rate": 3.23501065925466e-06, "loss": 0.2998, "step": 19700 }, { "epoch": 0.7366871509685259, "grad_norm": 0.46953657269477844, "learning_rate": 3.230687005783992e-06, "loss": 0.2587, "step": 19705 }, { "epoch": 0.7368740799588757, "grad_norm": 0.5593171119689941, "learning_rate": 3.2263656868554092e-06, "loss": 0.2911, "step": 19710 }, { "epoch": 0.7370610089492254, "grad_norm": 0.4246465861797333, "learning_rate": 3.2220467039592097e-06, "loss": 0.2599, "step": 19715 }, { "epoch": 0.7372479379395752, "grad_norm": 0.38245370984077454, "learning_rate": 3.2177300585848916e-06, "loss": 0.1948, "step": 19720 }, { "epoch": 0.737434866929925, "grad_norm": 0.525850772857666, "learning_rate": 3.213415752221136e-06, "loss": 0.261, "step": 19725 }, { "epoch": 0.7376217959202748, "grad_norm": 0.3949277997016907, "learning_rate": 3.2091037863558316e-06, "loss": 0.2761, "step": 19730 }, { "epoch": 0.7378087249106245, "grad_norm": 0.47665491700172424, "learning_rate": 3.2047941624760435e-06, "loss": 0.2357, "step": 19735 }, { "epoch": 0.7379956539009743, "grad_norm": 0.3770470917224884, "learning_rate": 3.2004868820680423e-06, "loss": 0.2551, "step": 19740 }, { "epoch": 0.7381825828913242, "grad_norm": 0.24684104323387146, "learning_rate": 3.196181946617287e-06, "loss": 0.2372, "step": 19745 }, { "epoch": 0.738369511881674, "grad_norm": 0.5357893705368042, "learning_rate": 3.191879357608425e-06, "loss": 0.3861, "step": 19750 }, { "epoch": 0.7385564408720238, "grad_norm": 0.4339257776737213, "learning_rate": 3.187579116525291e-06, "loss": 0.2941, "step": 19755 }, { "epoch": 0.7387433698623735, "grad_norm": 0.4764021933078766, "learning_rate": 3.1832812248509235e-06, "loss": 0.2143, "step": 19760 }, { "epoch": 0.7389302988527233, "grad_norm": 0.433449923992157, "learning_rate": 3.178985684067537e-06, "loss": 0.2832, "step": 19765 }, { "epoch": 0.7391172278430731, "grad_norm": 0.5205506086349487, "learning_rate": 3.1746924956565385e-06, "loss": 0.2622, "step": 19770 }, { "epoch": 0.7393041568334229, "grad_norm": 0.37828561663627625, "learning_rate": 3.1704016610985313e-06, "loss": 0.2705, "step": 19775 }, { "epoch": 0.7394910858237727, "grad_norm": 0.24176892638206482, "learning_rate": 3.166113181873296e-06, "loss": 0.3125, "step": 19780 }, { "epoch": 0.7396780148141224, "grad_norm": 0.40736111998558044, "learning_rate": 3.1618270594598076e-06, "loss": 0.3194, "step": 19785 }, { "epoch": 0.7398649438044723, "grad_norm": 0.47427597641944885, "learning_rate": 3.1575432953362317e-06, "loss": 0.3029, "step": 19790 }, { "epoch": 0.7400518727948221, "grad_norm": 0.32264068722724915, "learning_rate": 3.1532618909799095e-06, "loss": 0.3093, "step": 19795 }, { "epoch": 0.7402388017851719, "grad_norm": 0.5120447874069214, "learning_rate": 3.14898284786738e-06, "loss": 0.2977, "step": 19800 }, { "epoch": 0.7404257307755217, "grad_norm": 0.3379383981227875, "learning_rate": 3.1447061674743594e-06, "loss": 0.2976, "step": 19805 }, { "epoch": 0.7406126597658714, "grad_norm": 0.6211456060409546, "learning_rate": 3.1404318512757525e-06, "loss": 0.2624, "step": 19810 }, { "epoch": 0.7407995887562212, "grad_norm": 0.306180864572525, "learning_rate": 3.1361599007456456e-06, "loss": 0.2349, "step": 19815 }, { "epoch": 0.740986517746571, "grad_norm": 0.5241755247116089, "learning_rate": 3.131890317357319e-06, "loss": 0.3073, "step": 19820 }, { "epoch": 0.7411734467369208, "grad_norm": 0.4787485599517822, "learning_rate": 3.1276231025832217e-06, "loss": 0.2572, "step": 19825 }, { "epoch": 0.7413603757272706, "grad_norm": 0.27676764130592346, "learning_rate": 3.123358257894997e-06, "loss": 0.3357, "step": 19830 }, { "epoch": 0.7415473047176204, "grad_norm": 0.4201495945453644, "learning_rate": 3.119095784763473e-06, "loss": 0.2738, "step": 19835 }, { "epoch": 0.7417342337079702, "grad_norm": 0.27705347537994385, "learning_rate": 3.114835684658647e-06, "loss": 0.2443, "step": 19840 }, { "epoch": 0.74192116269832, "grad_norm": 0.4122698903083801, "learning_rate": 3.1105779590497108e-06, "loss": 0.3632, "step": 19845 }, { "epoch": 0.7421080916886698, "grad_norm": 0.39222484827041626, "learning_rate": 3.1063226094050304e-06, "loss": 0.2928, "step": 19850 }, { "epoch": 0.7422950206790195, "grad_norm": 0.5306823253631592, "learning_rate": 3.102069637192152e-06, "loss": 0.2495, "step": 19855 }, { "epoch": 0.7424819496693693, "grad_norm": 0.3978886306285858, "learning_rate": 3.0978190438778022e-06, "loss": 0.2175, "step": 19860 }, { "epoch": 0.7426688786597191, "grad_norm": 0.34500226378440857, "learning_rate": 3.0935708309278956e-06, "loss": 0.2046, "step": 19865 }, { "epoch": 0.742855807650069, "grad_norm": 0.3210207521915436, "learning_rate": 3.0893249998075116e-06, "loss": 0.2618, "step": 19870 }, { "epoch": 0.7430427366404188, "grad_norm": 0.7616083025932312, "learning_rate": 3.0850815519809184e-06, "loss": 0.3427, "step": 19875 }, { "epoch": 0.7432296656307685, "grad_norm": 0.44957032799720764, "learning_rate": 3.080840488911565e-06, "loss": 0.2465, "step": 19880 }, { "epoch": 0.7434165946211183, "grad_norm": 0.3584950566291809, "learning_rate": 3.0766018120620643e-06, "loss": 0.2426, "step": 19885 }, { "epoch": 0.7436035236114681, "grad_norm": 0.6310529112815857, "learning_rate": 3.072365522894221e-06, "loss": 0.2662, "step": 19890 }, { "epoch": 0.7437904526018179, "grad_norm": 0.575434684753418, "learning_rate": 3.068131622869007e-06, "loss": 0.2728, "step": 19895 }, { "epoch": 0.7439773815921676, "grad_norm": 0.051615819334983826, "learning_rate": 3.063900113446574e-06, "loss": 0.204, "step": 19900 }, { "epoch": 0.7441643105825174, "grad_norm": 0.575394332408905, "learning_rate": 3.0596709960862436e-06, "loss": 0.2808, "step": 19905 }, { "epoch": 0.7443512395728673, "grad_norm": 0.6706823110580444, "learning_rate": 3.055444272246524e-06, "loss": 0.3299, "step": 19910 }, { "epoch": 0.7445381685632171, "grad_norm": 0.2847936451435089, "learning_rate": 3.0512199433850855e-06, "loss": 0.2401, "step": 19915 }, { "epoch": 0.7447250975535669, "grad_norm": 0.4267846941947937, "learning_rate": 3.0469980109587803e-06, "loss": 0.3545, "step": 19920 }, { "epoch": 0.7449120265439166, "grad_norm": 0.3833447992801666, "learning_rate": 3.042778476423637e-06, "loss": 0.3209, "step": 19925 }, { "epoch": 0.7450989555342664, "grad_norm": 0.41514524817466736, "learning_rate": 3.0385613412348423e-06, "loss": 0.222, "step": 19930 }, { "epoch": 0.7452858845246162, "grad_norm": 0.22775611281394958, "learning_rate": 3.0343466068467752e-06, "loss": 0.2635, "step": 19935 }, { "epoch": 0.745472813514966, "grad_norm": 0.4837653636932373, "learning_rate": 3.030134274712968e-06, "loss": 0.294, "step": 19940 }, { "epoch": 0.7456597425053157, "grad_norm": 0.41959109902381897, "learning_rate": 3.0259243462861423e-06, "loss": 0.3014, "step": 19945 }, { "epoch": 0.7458466714956656, "grad_norm": 0.3355468511581421, "learning_rate": 3.021716823018176e-06, "loss": 0.2535, "step": 19950 }, { "epoch": 0.7460336004860154, "grad_norm": 0.31884121894836426, "learning_rate": 3.0175117063601235e-06, "loss": 0.2149, "step": 19955 }, { "epoch": 0.7462205294763652, "grad_norm": 1.0035449266433716, "learning_rate": 3.0133089977622076e-06, "loss": 0.3098, "step": 19960 }, { "epoch": 0.746407458466715, "grad_norm": 0.34479397535324097, "learning_rate": 3.009108698673825e-06, "loss": 0.2726, "step": 19965 }, { "epoch": 0.7465943874570647, "grad_norm": 0.619194746017456, "learning_rate": 3.00491081054354e-06, "loss": 0.3021, "step": 19970 }, { "epoch": 0.7467813164474145, "grad_norm": 0.4144986867904663, "learning_rate": 3.0007153348190786e-06, "loss": 0.2219, "step": 19975 }, { "epoch": 0.7469682454377643, "grad_norm": 0.580058753490448, "learning_rate": 2.9965222729473474e-06, "loss": 0.3113, "step": 19980 }, { "epoch": 0.7471551744281141, "grad_norm": 0.1870926320552826, "learning_rate": 2.992331626374405e-06, "loss": 0.2202, "step": 19985 }, { "epoch": 0.747342103418464, "grad_norm": 0.48588237166404724, "learning_rate": 2.988143396545493e-06, "loss": 0.2335, "step": 19990 }, { "epoch": 0.7475290324088137, "grad_norm": 0.36052894592285156, "learning_rate": 2.9839575849050094e-06, "loss": 0.2429, "step": 19995 }, { "epoch": 0.7477159613991635, "grad_norm": 0.42960643768310547, "learning_rate": 2.9797741928965185e-06, "loss": 0.2448, "step": 20000 }, { "epoch": 0.7479028903895133, "grad_norm": 0.455585241317749, "learning_rate": 2.9755932219627514e-06, "loss": 0.262, "step": 20005 }, { "epoch": 0.7480898193798631, "grad_norm": 0.22561267018318176, "learning_rate": 2.9714146735456063e-06, "loss": 0.289, "step": 20010 }, { "epoch": 0.7482767483702129, "grad_norm": 0.3798793852329254, "learning_rate": 2.96723854908615e-06, "loss": 0.3006, "step": 20015 }, { "epoch": 0.7484636773605626, "grad_norm": 0.45209431648254395, "learning_rate": 2.9630648500245993e-06, "loss": 0.3088, "step": 20020 }, { "epoch": 0.7486506063509124, "grad_norm": 0.5453251004219055, "learning_rate": 2.9588935778003526e-06, "loss": 0.4175, "step": 20025 }, { "epoch": 0.7488375353412622, "grad_norm": 0.5267879366874695, "learning_rate": 2.9547247338519547e-06, "loss": 0.2464, "step": 20030 }, { "epoch": 0.7490244643316121, "grad_norm": 0.3326359987258911, "learning_rate": 2.950558319617126e-06, "loss": 0.2875, "step": 20035 }, { "epoch": 0.7492113933219618, "grad_norm": 0.3526333272457123, "learning_rate": 2.9463943365327406e-06, "loss": 0.2775, "step": 20040 }, { "epoch": 0.7493983223123116, "grad_norm": 0.2863275110721588, "learning_rate": 2.9422327860348377e-06, "loss": 0.2816, "step": 20045 }, { "epoch": 0.7495852513026614, "grad_norm": 0.5883467793464661, "learning_rate": 2.938073669558613e-06, "loss": 0.2752, "step": 20050 }, { "epoch": 0.7497721802930112, "grad_norm": 0.5096171498298645, "learning_rate": 2.93391698853843e-06, "loss": 0.348, "step": 20055 }, { "epoch": 0.749959109283361, "grad_norm": 0.2166975438594818, "learning_rate": 2.9297627444078115e-06, "loss": 0.2923, "step": 20060 }, { "epoch": 0.7501460382737107, "grad_norm": 0.3335493803024292, "learning_rate": 2.9256109385994326e-06, "loss": 0.2554, "step": 20065 }, { "epoch": 0.7503329672640605, "grad_norm": 1.0838913917541504, "learning_rate": 2.9214615725451354e-06, "loss": 0.3143, "step": 20070 }, { "epoch": 0.7505198962544104, "grad_norm": 0.5258083939552307, "learning_rate": 2.917314647675914e-06, "loss": 0.2903, "step": 20075 }, { "epoch": 0.7507068252447602, "grad_norm": 0.4601660370826721, "learning_rate": 2.913170165421929e-06, "loss": 0.2453, "step": 20080 }, { "epoch": 0.75089375423511, "grad_norm": 0.19206954538822174, "learning_rate": 2.909028127212491e-06, "loss": 0.2646, "step": 20085 }, { "epoch": 0.7510806832254597, "grad_norm": 0.3423701524734497, "learning_rate": 2.904888534476069e-06, "loss": 0.2862, "step": 20090 }, { "epoch": 0.7512676122158095, "grad_norm": 0.5520522594451904, "learning_rate": 2.9007513886402884e-06, "loss": 0.3379, "step": 20095 }, { "epoch": 0.7514545412061593, "grad_norm": 0.2825580835342407, "learning_rate": 2.896616691131934e-06, "loss": 0.3071, "step": 20100 }, { "epoch": 0.7516414701965091, "grad_norm": 0.4348238706588745, "learning_rate": 2.892484443376948e-06, "loss": 0.2731, "step": 20105 }, { "epoch": 0.7518283991868588, "grad_norm": 0.4713153541088104, "learning_rate": 2.8883546468004196e-06, "loss": 0.2838, "step": 20110 }, { "epoch": 0.7520153281772087, "grad_norm": 1.168811559677124, "learning_rate": 2.884227302826601e-06, "loss": 0.2472, "step": 20115 }, { "epoch": 0.7522022571675585, "grad_norm": 0.5519205331802368, "learning_rate": 2.8801024128788903e-06, "loss": 0.3046, "step": 20120 }, { "epoch": 0.7523891861579083, "grad_norm": 0.5526771545410156, "learning_rate": 2.8759799783798503e-06, "loss": 0.2885, "step": 20125 }, { "epoch": 0.7525761151482581, "grad_norm": 0.4945368468761444, "learning_rate": 2.871860000751182e-06, "loss": 0.3089, "step": 20130 }, { "epoch": 0.7527630441386078, "grad_norm": 0.5120470523834229, "learning_rate": 2.8677424814137565e-06, "loss": 0.3819, "step": 20135 }, { "epoch": 0.7529499731289576, "grad_norm": 0.3834647536277771, "learning_rate": 2.8636274217875846e-06, "loss": 0.3076, "step": 20140 }, { "epoch": 0.7531369021193074, "grad_norm": 0.7137815356254578, "learning_rate": 2.859514823291829e-06, "loss": 0.2814, "step": 20145 }, { "epoch": 0.7533238311096572, "grad_norm": 0.5744923949241638, "learning_rate": 2.8554046873448127e-06, "loss": 0.3303, "step": 20150 }, { "epoch": 0.7535107601000071, "grad_norm": 0.7045077085494995, "learning_rate": 2.8512970153639976e-06, "loss": 0.227, "step": 20155 }, { "epoch": 0.7536976890903568, "grad_norm": 0.6118825078010559, "learning_rate": 2.8471918087660087e-06, "loss": 0.3046, "step": 20160 }, { "epoch": 0.7538846180807066, "grad_norm": 0.6602465510368347, "learning_rate": 2.843089068966609e-06, "loss": 0.2856, "step": 20165 }, { "epoch": 0.7540715470710564, "grad_norm": 0.5076795220375061, "learning_rate": 2.8389887973807207e-06, "loss": 0.2302, "step": 20170 }, { "epoch": 0.7542584760614062, "grad_norm": 0.5142863988876343, "learning_rate": 2.8348909954224037e-06, "loss": 0.2519, "step": 20175 }, { "epoch": 0.754445405051756, "grad_norm": 0.3825984001159668, "learning_rate": 2.8307956645048795e-06, "loss": 0.2828, "step": 20180 }, { "epoch": 0.7546323340421057, "grad_norm": 0.5186363458633423, "learning_rate": 2.8267028060405066e-06, "loss": 0.3063, "step": 20185 }, { "epoch": 0.7548192630324555, "grad_norm": 0.43224039673805237, "learning_rate": 2.8226124214407912e-06, "loss": 0.2481, "step": 20190 }, { "epoch": 0.7550061920228054, "grad_norm": 0.3157874047756195, "learning_rate": 2.8185245121163986e-06, "loss": 0.3186, "step": 20195 }, { "epoch": 0.7551931210131552, "grad_norm": 0.2855377197265625, "learning_rate": 2.8144390794771215e-06, "loss": 0.2652, "step": 20200 }, { "epoch": 0.7553800500035049, "grad_norm": 0.29042690992355347, "learning_rate": 2.810356124931918e-06, "loss": 0.3023, "step": 20205 }, { "epoch": 0.7555669789938547, "grad_norm": 0.38155996799468994, "learning_rate": 2.806275649888873e-06, "loss": 0.2603, "step": 20210 }, { "epoch": 0.7557539079842045, "grad_norm": 0.3918231427669525, "learning_rate": 2.8021976557552346e-06, "loss": 0.2806, "step": 20215 }, { "epoch": 0.7559408369745543, "grad_norm": 0.3024686574935913, "learning_rate": 2.7981221439373774e-06, "loss": 0.3187, "step": 20220 }, { "epoch": 0.756127765964904, "grad_norm": 0.5795729160308838, "learning_rate": 2.7940491158408367e-06, "loss": 0.2417, "step": 20225 }, { "epoch": 0.7563146949552538, "grad_norm": 1.9002882242202759, "learning_rate": 2.7899785728702787e-06, "loss": 0.2685, "step": 20230 }, { "epoch": 0.7565016239456037, "grad_norm": 0.6263231039047241, "learning_rate": 2.785910516429515e-06, "loss": 0.3513, "step": 20235 }, { "epoch": 0.7566885529359535, "grad_norm": 0.36941060423851013, "learning_rate": 2.781844947921508e-06, "loss": 0.4064, "step": 20240 }, { "epoch": 0.7568754819263033, "grad_norm": 0.30503350496292114, "learning_rate": 2.7777818687483483e-06, "loss": 0.2724, "step": 20245 }, { "epoch": 0.757062410916653, "grad_norm": 0.27014967799186707, "learning_rate": 2.7737212803112824e-06, "loss": 0.2964, "step": 20250 }, { "epoch": 0.7572493399070028, "grad_norm": 0.432959645986557, "learning_rate": 2.7696631840106847e-06, "loss": 0.271, "step": 20255 }, { "epoch": 0.7574362688973526, "grad_norm": 0.19095739722251892, "learning_rate": 2.7656075812460835e-06, "loss": 0.3006, "step": 20260 }, { "epoch": 0.7576231978877024, "grad_norm": 0.3333872854709625, "learning_rate": 2.7615544734161315e-06, "loss": 0.2004, "step": 20265 }, { "epoch": 0.7578101268780522, "grad_norm": 0.37564340233802795, "learning_rate": 2.757503861918638e-06, "loss": 0.3621, "step": 20270 }, { "epoch": 0.7579970558684019, "grad_norm": 0.3397802412509918, "learning_rate": 2.7534557481505385e-06, "loss": 0.249, "step": 20275 }, { "epoch": 0.7581839848587518, "grad_norm": 0.5983813405036926, "learning_rate": 2.7494101335079094e-06, "loss": 0.2332, "step": 20280 }, { "epoch": 0.7583709138491016, "grad_norm": 0.38677194714546204, "learning_rate": 2.7453670193859716e-06, "loss": 0.3318, "step": 20285 }, { "epoch": 0.7585578428394514, "grad_norm": 0.45902055501937866, "learning_rate": 2.7413264071790747e-06, "loss": 0.2212, "step": 20290 }, { "epoch": 0.7587447718298012, "grad_norm": 0.6074469685554504, "learning_rate": 2.737288298280715e-06, "loss": 0.2984, "step": 20295 }, { "epoch": 0.7589317008201509, "grad_norm": 0.4030331075191498, "learning_rate": 2.7332526940835156e-06, "loss": 0.3323, "step": 20300 }, { "epoch": 0.7591186298105007, "grad_norm": 0.3118947744369507, "learning_rate": 2.729219595979247e-06, "loss": 0.263, "step": 20305 }, { "epoch": 0.7593055588008505, "grad_norm": 0.2766875624656677, "learning_rate": 2.7251890053588015e-06, "loss": 0.2779, "step": 20310 }, { "epoch": 0.7594924877912003, "grad_norm": 0.4155539572238922, "learning_rate": 2.7211609236122216e-06, "loss": 0.2856, "step": 20315 }, { "epoch": 0.7596794167815502, "grad_norm": 0.37500113248825073, "learning_rate": 2.717135352128671e-06, "loss": 0.2353, "step": 20320 }, { "epoch": 0.7598663457718999, "grad_norm": 0.60191810131073, "learning_rate": 2.7131122922964603e-06, "loss": 0.3352, "step": 20325 }, { "epoch": 0.7600532747622497, "grad_norm": 0.2800582945346832, "learning_rate": 2.709091745503024e-06, "loss": 0.3133, "step": 20330 }, { "epoch": 0.7602402037525995, "grad_norm": 0.4130517840385437, "learning_rate": 2.7050737131349315e-06, "loss": 0.2603, "step": 20335 }, { "epoch": 0.7604271327429493, "grad_norm": 0.5701847076416016, "learning_rate": 2.7010581965778914e-06, "loss": 0.2705, "step": 20340 }, { "epoch": 0.760614061733299, "grad_norm": 0.47026559710502625, "learning_rate": 2.6970451972167355e-06, "loss": 0.2877, "step": 20345 }, { "epoch": 0.7608009907236488, "grad_norm": 0.5374239683151245, "learning_rate": 2.6930347164354376e-06, "loss": 0.2791, "step": 20350 }, { "epoch": 0.7609879197139986, "grad_norm": 0.29200440645217896, "learning_rate": 2.6890267556170925e-06, "loss": 0.2677, "step": 20355 }, { "epoch": 0.7611748487043485, "grad_norm": 0.3506094515323639, "learning_rate": 2.6850213161439363e-06, "loss": 0.2583, "step": 20360 }, { "epoch": 0.7613617776946983, "grad_norm": 0.652692973613739, "learning_rate": 2.6810183993973247e-06, "loss": 0.259, "step": 20365 }, { "epoch": 0.761548706685048, "grad_norm": 0.6381142139434814, "learning_rate": 2.6770180067577547e-06, "loss": 0.3052, "step": 20370 }, { "epoch": 0.7617356356753978, "grad_norm": 0.3937336206436157, "learning_rate": 2.6730201396048437e-06, "loss": 0.2877, "step": 20375 }, { "epoch": 0.7619225646657476, "grad_norm": 0.44311031699180603, "learning_rate": 2.6690247993173393e-06, "loss": 0.374, "step": 20380 }, { "epoch": 0.7621094936560974, "grad_norm": 2.144091844558716, "learning_rate": 2.6650319872731258e-06, "loss": 0.4321, "step": 20385 }, { "epoch": 0.7622964226464471, "grad_norm": 0.479422390460968, "learning_rate": 2.661041704849203e-06, "loss": 0.274, "step": 20390 }, { "epoch": 0.7624833516367969, "grad_norm": 0.392315536737442, "learning_rate": 2.657053953421712e-06, "loss": 0.2914, "step": 20395 }, { "epoch": 0.7626702806271468, "grad_norm": 0.34140515327453613, "learning_rate": 2.6530687343659067e-06, "loss": 0.2824, "step": 20400 }, { "epoch": 0.7628572096174966, "grad_norm": 0.2744537889957428, "learning_rate": 2.649086049056182e-06, "loss": 0.2552, "step": 20405 }, { "epoch": 0.7630441386078464, "grad_norm": 0.23247292637825012, "learning_rate": 2.645105898866046e-06, "loss": 0.2724, "step": 20410 }, { "epoch": 0.7632310675981961, "grad_norm": 0.2504008412361145, "learning_rate": 2.641128285168144e-06, "loss": 0.2436, "step": 20415 }, { "epoch": 0.7634179965885459, "grad_norm": 0.3857523500919342, "learning_rate": 2.637153209334239e-06, "loss": 0.294, "step": 20420 }, { "epoch": 0.7636049255788957, "grad_norm": 0.42099565267562866, "learning_rate": 2.633180672735215e-06, "loss": 0.3373, "step": 20425 }, { "epoch": 0.7637918545692455, "grad_norm": 0.2764213979244232, "learning_rate": 2.6292106767410953e-06, "loss": 0.2514, "step": 20430 }, { "epoch": 0.7639787835595953, "grad_norm": 0.8510448932647705, "learning_rate": 2.62524322272101e-06, "loss": 0.2924, "step": 20435 }, { "epoch": 0.7641657125499451, "grad_norm": 0.271930456161499, "learning_rate": 2.621278312043226e-06, "loss": 0.2673, "step": 20440 }, { "epoch": 0.7643526415402949, "grad_norm": 0.4646458327770233, "learning_rate": 2.617315946075123e-06, "loss": 0.2376, "step": 20445 }, { "epoch": 0.7645395705306447, "grad_norm": 0.5632447600364685, "learning_rate": 2.613356126183212e-06, "loss": 0.2435, "step": 20450 }, { "epoch": 0.7647264995209945, "grad_norm": 0.29168501496315, "learning_rate": 2.6093988537331163e-06, "loss": 0.3008, "step": 20455 }, { "epoch": 0.7649134285113443, "grad_norm": 0.24056875705718994, "learning_rate": 2.6054441300895905e-06, "loss": 0.2312, "step": 20460 }, { "epoch": 0.765100357501694, "grad_norm": 0.3789188265800476, "learning_rate": 2.601491956616504e-06, "loss": 0.2843, "step": 20465 }, { "epoch": 0.7652872864920438, "grad_norm": 0.35783255100250244, "learning_rate": 2.597542334676846e-06, "loss": 0.2711, "step": 20470 }, { "epoch": 0.7654742154823936, "grad_norm": 0.6306200623512268, "learning_rate": 2.59359526563273e-06, "loss": 0.3031, "step": 20475 }, { "epoch": 0.7656611444727435, "grad_norm": 0.30697113275527954, "learning_rate": 2.589650750845385e-06, "loss": 0.2589, "step": 20480 }, { "epoch": 0.7658480734630932, "grad_norm": 0.41806843876838684, "learning_rate": 2.5857087916751656e-06, "loss": 0.2346, "step": 20485 }, { "epoch": 0.766035002453443, "grad_norm": 0.4137856960296631, "learning_rate": 2.5817693894815342e-06, "loss": 0.2488, "step": 20490 }, { "epoch": 0.7662219314437928, "grad_norm": 1.0388176441192627, "learning_rate": 2.5778325456230845e-06, "loss": 0.2959, "step": 20495 }, { "epoch": 0.7664088604341426, "grad_norm": 0.3841733932495117, "learning_rate": 2.5738982614575147e-06, "loss": 0.3137, "step": 20500 }, { "epoch": 0.7665957894244924, "grad_norm": 0.3577060103416443, "learning_rate": 2.569966538341654e-06, "loss": 0.3331, "step": 20505 }, { "epoch": 0.7667827184148421, "grad_norm": 0.23667654395103455, "learning_rate": 2.5660373776314318e-06, "loss": 0.3054, "step": 20510 }, { "epoch": 0.7669696474051919, "grad_norm": 0.5192760825157166, "learning_rate": 2.5621107806819125e-06, "loss": 0.2532, "step": 20515 }, { "epoch": 0.7671565763955417, "grad_norm": 0.2387147843837738, "learning_rate": 2.558186748847262e-06, "loss": 0.2256, "step": 20520 }, { "epoch": 0.7673435053858916, "grad_norm": 0.5738237500190735, "learning_rate": 2.5542652834807634e-06, "loss": 0.2913, "step": 20525 }, { "epoch": 0.7675304343762414, "grad_norm": 0.5508410334587097, "learning_rate": 2.5503463859348245e-06, "loss": 0.2927, "step": 20530 }, { "epoch": 0.7677173633665911, "grad_norm": 0.5864548087120056, "learning_rate": 2.5464300575609547e-06, "loss": 0.355, "step": 20535 }, { "epoch": 0.7679042923569409, "grad_norm": 0.5955525636672974, "learning_rate": 2.5425162997097896e-06, "loss": 0.263, "step": 20540 }, { "epoch": 0.7680912213472907, "grad_norm": 0.3412562906742096, "learning_rate": 2.538605113731065e-06, "loss": 0.3519, "step": 20545 }, { "epoch": 0.7682781503376405, "grad_norm": 0.44794994592666626, "learning_rate": 2.5346965009736445e-06, "loss": 0.3314, "step": 20550 }, { "epoch": 0.7684650793279902, "grad_norm": 0.5643365979194641, "learning_rate": 2.5307904627854895e-06, "loss": 0.2622, "step": 20555 }, { "epoch": 0.76865200831834, "grad_norm": 0.3417246639728546, "learning_rate": 2.526887000513687e-06, "loss": 0.3421, "step": 20560 }, { "epoch": 0.7688389373086899, "grad_norm": 0.1929822713136673, "learning_rate": 2.5229861155044254e-06, "loss": 0.267, "step": 20565 }, { "epoch": 0.7690258662990397, "grad_norm": 0.2237076312303543, "learning_rate": 2.5190878091030067e-06, "loss": 0.2641, "step": 20570 }, { "epoch": 0.7692127952893895, "grad_norm": 0.43035122752189636, "learning_rate": 2.5151920826538514e-06, "loss": 0.2557, "step": 20575 }, { "epoch": 0.7693997242797392, "grad_norm": 0.3139476180076599, "learning_rate": 2.511298937500476e-06, "loss": 0.2566, "step": 20580 }, { "epoch": 0.769586653270089, "grad_norm": 0.2134442925453186, "learning_rate": 2.5074083749855216e-06, "loss": 0.2199, "step": 20585 }, { "epoch": 0.7697735822604388, "grad_norm": 0.6720663905143738, "learning_rate": 2.503520396450725e-06, "loss": 0.2664, "step": 20590 }, { "epoch": 0.7699605112507886, "grad_norm": 0.7962212562561035, "learning_rate": 2.4996350032369467e-06, "loss": 0.3161, "step": 20595 }, { "epoch": 0.7701474402411383, "grad_norm": 0.24116773903369904, "learning_rate": 2.4957521966841393e-06, "loss": 0.3047, "step": 20600 }, { "epoch": 0.7703343692314882, "grad_norm": 0.21904657781124115, "learning_rate": 2.4918719781313782e-06, "loss": 0.3049, "step": 20605 }, { "epoch": 0.770521298221838, "grad_norm": 0.40606293082237244, "learning_rate": 2.487994348916837e-06, "loss": 0.3494, "step": 20610 }, { "epoch": 0.7707082272121878, "grad_norm": 0.586861789226532, "learning_rate": 2.484119310377796e-06, "loss": 0.2998, "step": 20615 }, { "epoch": 0.7708951562025376, "grad_norm": 0.5949821472167969, "learning_rate": 2.4802468638506505e-06, "loss": 0.3086, "step": 20620 }, { "epoch": 0.7710820851928873, "grad_norm": 0.3286888599395752, "learning_rate": 2.4763770106708907e-06, "loss": 0.2415, "step": 20625 }, { "epoch": 0.7712690141832371, "grad_norm": 0.1509125977754593, "learning_rate": 2.4725097521731232e-06, "loss": 0.2471, "step": 20630 }, { "epoch": 0.7714559431735869, "grad_norm": 0.5915478467941284, "learning_rate": 2.4686450896910497e-06, "loss": 0.2614, "step": 20635 }, { "epoch": 0.7716428721639367, "grad_norm": 0.725960910320282, "learning_rate": 2.4647830245574865e-06, "loss": 0.3098, "step": 20640 }, { "epoch": 0.7718298011542866, "grad_norm": 0.4178033769130707, "learning_rate": 2.4609235581043457e-06, "loss": 0.2995, "step": 20645 }, { "epoch": 0.7720167301446363, "grad_norm": 0.7236891984939575, "learning_rate": 2.4570666916626484e-06, "loss": 0.2878, "step": 20650 }, { "epoch": 0.7722036591349861, "grad_norm": 0.33673229813575745, "learning_rate": 2.4532124265625155e-06, "loss": 0.3246, "step": 20655 }, { "epoch": 0.7723905881253359, "grad_norm": 0.4866393804550171, "learning_rate": 2.4493607641331762e-06, "loss": 0.2776, "step": 20660 }, { "epoch": 0.7725775171156857, "grad_norm": 0.7614867091178894, "learning_rate": 2.4455117057029566e-06, "loss": 0.3473, "step": 20665 }, { "epoch": 0.7727644461060355, "grad_norm": 0.46578964591026306, "learning_rate": 2.441665252599282e-06, "loss": 0.2915, "step": 20670 }, { "epoch": 0.7729513750963852, "grad_norm": 0.5103464126586914, "learning_rate": 2.4378214061486925e-06, "loss": 0.246, "step": 20675 }, { "epoch": 0.773138304086735, "grad_norm": 0.5652568936347961, "learning_rate": 2.433980167676813e-06, "loss": 0.3631, "step": 20680 }, { "epoch": 0.7733252330770849, "grad_norm": 0.46092647314071655, "learning_rate": 2.4301415385083828e-06, "loss": 0.3188, "step": 20685 }, { "epoch": 0.7735121620674347, "grad_norm": 0.5116592645645142, "learning_rate": 2.426305519967228e-06, "loss": 0.2404, "step": 20690 }, { "epoch": 0.7736990910577844, "grad_norm": 0.6798794269561768, "learning_rate": 2.4224721133762864e-06, "loss": 0.2467, "step": 20695 }, { "epoch": 0.7738860200481342, "grad_norm": 0.4135216772556305, "learning_rate": 2.418641320057592e-06, "loss": 0.2992, "step": 20700 }, { "epoch": 0.774072949038484, "grad_norm": 0.5038582682609558, "learning_rate": 2.414813141332274e-06, "loss": 0.2296, "step": 20705 }, { "epoch": 0.7742598780288338, "grad_norm": 0.35031184554100037, "learning_rate": 2.4109875785205593e-06, "loss": 0.25, "step": 20710 }, { "epoch": 0.7744468070191836, "grad_norm": 0.21241429448127747, "learning_rate": 2.407164632941773e-06, "loss": 0.2752, "step": 20715 }, { "epoch": 0.7746337360095333, "grad_norm": 0.2852620482444763, "learning_rate": 2.403344305914346e-06, "loss": 0.2454, "step": 20720 }, { "epoch": 0.7748206649998832, "grad_norm": 0.20307046175003052, "learning_rate": 2.3995265987557925e-06, "loss": 0.3172, "step": 20725 }, { "epoch": 0.775007593990233, "grad_norm": 0.38747474551200867, "learning_rate": 2.395711512782738e-06, "loss": 0.3205, "step": 20730 }, { "epoch": 0.7751945229805828, "grad_norm": 0.2965136468410492, "learning_rate": 2.3918990493108884e-06, "loss": 0.2596, "step": 20735 }, { "epoch": 0.7753814519709326, "grad_norm": 0.3963155150413513, "learning_rate": 2.3880892096550578e-06, "loss": 0.2905, "step": 20740 }, { "epoch": 0.7755683809612823, "grad_norm": 0.31911271810531616, "learning_rate": 2.384281995129153e-06, "loss": 0.3044, "step": 20745 }, { "epoch": 0.7757553099516321, "grad_norm": 0.8403980731964111, "learning_rate": 2.380477407046169e-06, "loss": 0.3053, "step": 20750 }, { "epoch": 0.7759422389419819, "grad_norm": 0.35401323437690735, "learning_rate": 2.3766754467182006e-06, "loss": 0.2962, "step": 20755 }, { "epoch": 0.7761291679323317, "grad_norm": 0.5200223922729492, "learning_rate": 2.3728761154564326e-06, "loss": 0.3908, "step": 20760 }, { "epoch": 0.7763160969226814, "grad_norm": 0.4293026626110077, "learning_rate": 2.3690794145711505e-06, "loss": 0.3042, "step": 20765 }, { "epoch": 0.7765030259130313, "grad_norm": 0.47955235838890076, "learning_rate": 2.365285345371722e-06, "loss": 0.2323, "step": 20770 }, { "epoch": 0.7766899549033811, "grad_norm": 0.29286670684814453, "learning_rate": 2.3614939091666177e-06, "loss": 0.3391, "step": 20775 }, { "epoch": 0.7768768838937309, "grad_norm": 0.21747294068336487, "learning_rate": 2.3577051072633907e-06, "loss": 0.3265, "step": 20780 }, { "epoch": 0.7770638128840807, "grad_norm": 0.9644585251808167, "learning_rate": 2.3539189409686937e-06, "loss": 0.3585, "step": 20785 }, { "epoch": 0.7772507418744304, "grad_norm": 0.21008965373039246, "learning_rate": 2.350135411588267e-06, "loss": 0.2923, "step": 20790 }, { "epoch": 0.7774376708647802, "grad_norm": 0.4879824221134186, "learning_rate": 2.346354520426942e-06, "loss": 0.329, "step": 20795 }, { "epoch": 0.77762459985513, "grad_norm": 0.26359811425209045, "learning_rate": 2.3425762687886378e-06, "loss": 0.2053, "step": 20800 }, { "epoch": 0.7778115288454798, "grad_norm": 0.15240828692913055, "learning_rate": 2.3388006579763623e-06, "loss": 0.2589, "step": 20805 }, { "epoch": 0.7779984578358297, "grad_norm": 0.27661359310150146, "learning_rate": 2.3350276892922218e-06, "loss": 0.2356, "step": 20810 }, { "epoch": 0.7781853868261794, "grad_norm": 0.5237915515899658, "learning_rate": 2.3312573640373994e-06, "loss": 0.3032, "step": 20815 }, { "epoch": 0.7783723158165292, "grad_norm": 0.33490797877311707, "learning_rate": 2.3274896835121772e-06, "loss": 0.2319, "step": 20820 }, { "epoch": 0.778559244806879, "grad_norm": 0.5029880404472351, "learning_rate": 2.323724649015916e-06, "loss": 0.2439, "step": 20825 }, { "epoch": 0.7787461737972288, "grad_norm": 0.47843530774116516, "learning_rate": 2.31996226184707e-06, "loss": 0.3145, "step": 20830 }, { "epoch": 0.7789331027875785, "grad_norm": 0.5605517625808716, "learning_rate": 2.3162025233031814e-06, "loss": 0.3438, "step": 20835 }, { "epoch": 0.7791200317779283, "grad_norm": 0.47056886553764343, "learning_rate": 2.3124454346808713e-06, "loss": 0.2864, "step": 20840 }, { "epoch": 0.7793069607682781, "grad_norm": 0.5489882230758667, "learning_rate": 2.3086909972758577e-06, "loss": 0.272, "step": 20845 }, { "epoch": 0.779493889758628, "grad_norm": 0.2519620656967163, "learning_rate": 2.304939212382934e-06, "loss": 0.2637, "step": 20850 }, { "epoch": 0.7796808187489778, "grad_norm": 0.16588373482227325, "learning_rate": 2.3011900812959855e-06, "loss": 0.248, "step": 20855 }, { "epoch": 0.7798677477393275, "grad_norm": 0.48377254605293274, "learning_rate": 2.2974436053079764e-06, "loss": 0.2892, "step": 20860 }, { "epoch": 0.7800546767296773, "grad_norm": 0.32265156507492065, "learning_rate": 2.2936997857109644e-06, "loss": 0.2799, "step": 20865 }, { "epoch": 0.7802416057200271, "grad_norm": 0.25994792580604553, "learning_rate": 2.2899586237960793e-06, "loss": 0.2736, "step": 20870 }, { "epoch": 0.7804285347103769, "grad_norm": 0.4288146495819092, "learning_rate": 2.286220120853545e-06, "loss": 0.2761, "step": 20875 }, { "epoch": 0.7806154637007267, "grad_norm": 0.2761852741241455, "learning_rate": 2.2824842781726665e-06, "loss": 0.2326, "step": 20880 }, { "epoch": 0.7808023926910764, "grad_norm": 0.561490535736084, "learning_rate": 2.2787510970418215e-06, "loss": 0.2932, "step": 20885 }, { "epoch": 0.7809893216814263, "grad_norm": 0.28700560331344604, "learning_rate": 2.2750205787484846e-06, "loss": 0.1987, "step": 20890 }, { "epoch": 0.7811762506717761, "grad_norm": 0.2016209363937378, "learning_rate": 2.271292724579203e-06, "loss": 0.2507, "step": 20895 }, { "epoch": 0.7813631796621259, "grad_norm": 0.3742198944091797, "learning_rate": 2.2675675358196037e-06, "loss": 0.2465, "step": 20900 }, { "epoch": 0.7815501086524757, "grad_norm": 0.301685094833374, "learning_rate": 2.2638450137543967e-06, "loss": 0.281, "step": 20905 }, { "epoch": 0.7817370376428254, "grad_norm": 0.6065481305122375, "learning_rate": 2.2601251596673778e-06, "loss": 0.249, "step": 20910 }, { "epoch": 0.7819239666331752, "grad_norm": 0.41621100902557373, "learning_rate": 2.2564079748414138e-06, "loss": 0.254, "step": 20915 }, { "epoch": 0.782110895623525, "grad_norm": 0.33886462450027466, "learning_rate": 2.252693460558456e-06, "loss": 0.2795, "step": 20920 }, { "epoch": 0.7822978246138748, "grad_norm": 0.6175533533096313, "learning_rate": 2.2489816180995395e-06, "loss": 0.3793, "step": 20925 }, { "epoch": 0.7824847536042246, "grad_norm": 0.7338931560516357, "learning_rate": 2.245272448744765e-06, "loss": 0.289, "step": 20930 }, { "epoch": 0.7826716825945744, "grad_norm": 0.292777419090271, "learning_rate": 2.241565953773325e-06, "loss": 0.2248, "step": 20935 }, { "epoch": 0.7828586115849242, "grad_norm": 0.5425524711608887, "learning_rate": 2.237862134463479e-06, "loss": 0.2553, "step": 20940 }, { "epoch": 0.783045540575274, "grad_norm": 0.29631131887435913, "learning_rate": 2.2341609920925698e-06, "loss": 0.2826, "step": 20945 }, { "epoch": 0.7832324695656238, "grad_norm": 0.313694566488266, "learning_rate": 2.230462527937013e-06, "loss": 0.3969, "step": 20950 }, { "epoch": 0.7834193985559735, "grad_norm": 0.7053239941596985, "learning_rate": 2.2267667432723073e-06, "loss": 0.3284, "step": 20955 }, { "epoch": 0.7836063275463233, "grad_norm": 0.22134394943714142, "learning_rate": 2.2230736393730178e-06, "loss": 0.276, "step": 20960 }, { "epoch": 0.7837932565366731, "grad_norm": 0.2921614348888397, "learning_rate": 2.2193832175127928e-06, "loss": 0.247, "step": 20965 }, { "epoch": 0.783980185527023, "grad_norm": 0.4508916735649109, "learning_rate": 2.215695478964357e-06, "loss": 0.2421, "step": 20970 }, { "epoch": 0.7841671145173728, "grad_norm": 0.4369545578956604, "learning_rate": 2.212010424999498e-06, "loss": 0.2226, "step": 20975 }, { "epoch": 0.7843540435077225, "grad_norm": 0.21277235448360443, "learning_rate": 2.2083280568890918e-06, "loss": 0.292, "step": 20980 }, { "epoch": 0.7845409724980723, "grad_norm": 0.4735264778137207, "learning_rate": 2.20464837590308e-06, "loss": 0.2305, "step": 20985 }, { "epoch": 0.7847279014884221, "grad_norm": 0.5336333513259888, "learning_rate": 2.2009713833104785e-06, "loss": 0.2775, "step": 20990 }, { "epoch": 0.7849148304787719, "grad_norm": 0.4231540262699127, "learning_rate": 2.1972970803793726e-06, "loss": 0.2616, "step": 20995 }, { "epoch": 0.7851017594691216, "grad_norm": 0.5644051432609558, "learning_rate": 2.193625468376931e-06, "loss": 0.2852, "step": 21000 }, { "epoch": 0.7852886884594714, "grad_norm": 0.5196647047996521, "learning_rate": 2.189956548569382e-06, "loss": 0.3341, "step": 21005 }, { "epoch": 0.7854756174498212, "grad_norm": 0.4049712121486664, "learning_rate": 2.186290322222033e-06, "loss": 0.4098, "step": 21010 }, { "epoch": 0.7856625464401711, "grad_norm": 0.4484502375125885, "learning_rate": 2.182626790599265e-06, "loss": 0.2502, "step": 21015 }, { "epoch": 0.7858494754305209, "grad_norm": 0.1917799860239029, "learning_rate": 2.1789659549645158e-06, "loss": 0.2588, "step": 21020 }, { "epoch": 0.7860364044208706, "grad_norm": 0.3756994307041168, "learning_rate": 2.175307816580312e-06, "loss": 0.2758, "step": 21025 }, { "epoch": 0.7862233334112204, "grad_norm": 0.4414009153842926, "learning_rate": 2.171652376708233e-06, "loss": 0.2278, "step": 21030 }, { "epoch": 0.7864102624015702, "grad_norm": 0.34547463059425354, "learning_rate": 2.1679996366089428e-06, "loss": 0.3164, "step": 21035 }, { "epoch": 0.78659719139192, "grad_norm": 0.3485727608203888, "learning_rate": 2.1643495975421612e-06, "loss": 0.3466, "step": 21040 }, { "epoch": 0.7867841203822697, "grad_norm": 0.772977888584137, "learning_rate": 2.160702260766684e-06, "loss": 0.2409, "step": 21045 }, { "epoch": 0.7869710493726195, "grad_norm": 0.5560535192489624, "learning_rate": 2.157057627540371e-06, "loss": 0.3805, "step": 21050 }, { "epoch": 0.7871579783629694, "grad_norm": 0.5626355409622192, "learning_rate": 2.1534156991201528e-06, "loss": 0.2906, "step": 21055 }, { "epoch": 0.7873449073533192, "grad_norm": 0.27478352189064026, "learning_rate": 2.149776476762029e-06, "loss": 0.2498, "step": 21060 }, { "epoch": 0.787531836343669, "grad_norm": 0.8276031017303467, "learning_rate": 2.146139961721059e-06, "loss": 0.2522, "step": 21065 }, { "epoch": 0.7877187653340187, "grad_norm": 0.459084153175354, "learning_rate": 2.142506155251377e-06, "loss": 0.3553, "step": 21070 }, { "epoch": 0.7879056943243685, "grad_norm": 0.6312773823738098, "learning_rate": 2.1388750586061735e-06, "loss": 0.2408, "step": 21075 }, { "epoch": 0.7880926233147183, "grad_norm": 0.3618089556694031, "learning_rate": 2.1352466730377164e-06, "loss": 0.3, "step": 21080 }, { "epoch": 0.7882795523050681, "grad_norm": 0.5155732035636902, "learning_rate": 2.131620999797327e-06, "loss": 0.2281, "step": 21085 }, { "epoch": 0.7884664812954179, "grad_norm": 0.5344289541244507, "learning_rate": 2.1279980401353972e-06, "loss": 0.2373, "step": 21090 }, { "epoch": 0.7886534102857677, "grad_norm": 0.4500960111618042, "learning_rate": 2.12437779530138e-06, "loss": 0.3069, "step": 21095 }, { "epoch": 0.7888403392761175, "grad_norm": 0.6218135952949524, "learning_rate": 2.1207602665437953e-06, "loss": 0.288, "step": 21100 }, { "epoch": 0.7890272682664673, "grad_norm": 0.7415568828582764, "learning_rate": 2.117145455110229e-06, "loss": 0.2563, "step": 21105 }, { "epoch": 0.7892141972568171, "grad_norm": 0.47236132621765137, "learning_rate": 2.1135333622473208e-06, "loss": 0.3075, "step": 21110 }, { "epoch": 0.7894011262471669, "grad_norm": 0.3604547083377838, "learning_rate": 2.1099239892007815e-06, "loss": 0.3614, "step": 21115 }, { "epoch": 0.7895880552375166, "grad_norm": 0.32508572936058044, "learning_rate": 2.1063173372153778e-06, "loss": 0.2853, "step": 21120 }, { "epoch": 0.7897749842278664, "grad_norm": 0.40659722685813904, "learning_rate": 2.102713407534943e-06, "loss": 0.3314, "step": 21125 }, { "epoch": 0.7899619132182162, "grad_norm": 0.5361529588699341, "learning_rate": 2.099112201402369e-06, "loss": 0.2541, "step": 21130 }, { "epoch": 0.7901488422085661, "grad_norm": 0.32573097944259644, "learning_rate": 2.0955137200596077e-06, "loss": 0.2379, "step": 21135 }, { "epoch": 0.7903357711989158, "grad_norm": 0.5124326348304749, "learning_rate": 2.0919179647476694e-06, "loss": 0.3336, "step": 21140 }, { "epoch": 0.7905227001892656, "grad_norm": 0.6217731833457947, "learning_rate": 2.0883249367066294e-06, "loss": 0.341, "step": 21145 }, { "epoch": 0.7907096291796154, "grad_norm": 0.44866645336151123, "learning_rate": 2.0847346371756237e-06, "loss": 0.2955, "step": 21150 }, { "epoch": 0.7908965581699652, "grad_norm": 0.4434383809566498, "learning_rate": 2.081147067392838e-06, "loss": 0.3195, "step": 21155 }, { "epoch": 0.791083487160315, "grad_norm": 0.5388154983520508, "learning_rate": 2.0775622285955264e-06, "loss": 0.2346, "step": 21160 }, { "epoch": 0.7912704161506647, "grad_norm": 0.33477190136909485, "learning_rate": 2.073980122019994e-06, "loss": 0.3177, "step": 21165 }, { "epoch": 0.7914573451410145, "grad_norm": 0.5977397561073303, "learning_rate": 2.070400748901611e-06, "loss": 0.3434, "step": 21170 }, { "epoch": 0.7916442741313644, "grad_norm": 0.3142024874687195, "learning_rate": 2.066824110474798e-06, "loss": 0.2778, "step": 21175 }, { "epoch": 0.7918312031217142, "grad_norm": 0.5590510964393616, "learning_rate": 2.0632502079730356e-06, "loss": 0.2392, "step": 21180 }, { "epoch": 0.792018132112064, "grad_norm": 0.38960909843444824, "learning_rate": 2.059679042628856e-06, "loss": 0.2559, "step": 21185 }, { "epoch": 0.7922050611024137, "grad_norm": 0.7024688720703125, "learning_rate": 2.056110615673855e-06, "loss": 0.2569, "step": 21190 }, { "epoch": 0.7923919900927635, "grad_norm": 0.5414541959762573, "learning_rate": 2.0525449283386855e-06, "loss": 0.2865, "step": 21195 }, { "epoch": 0.7925789190831133, "grad_norm": 0.4854254126548767, "learning_rate": 2.0489819818530443e-06, "loss": 0.267, "step": 21200 }, { "epoch": 0.7927658480734631, "grad_norm": 0.34291431307792664, "learning_rate": 2.045421777445694e-06, "loss": 0.3289, "step": 21205 }, { "epoch": 0.7929527770638128, "grad_norm": 0.366192489862442, "learning_rate": 2.041864316344443e-06, "loss": 0.3097, "step": 21210 }, { "epoch": 0.7931397060541627, "grad_norm": 0.35338735580444336, "learning_rate": 2.0383095997761628e-06, "loss": 0.2791, "step": 21215 }, { "epoch": 0.7933266350445125, "grad_norm": 0.5580964684486389, "learning_rate": 2.0347576289667657e-06, "loss": 0.2732, "step": 21220 }, { "epoch": 0.7935135640348623, "grad_norm": 0.47220098972320557, "learning_rate": 2.031208405141234e-06, "loss": 0.243, "step": 21225 }, { "epoch": 0.7937004930252121, "grad_norm": 0.6070525646209717, "learning_rate": 2.027661929523588e-06, "loss": 0.2208, "step": 21230 }, { "epoch": 0.7938874220155618, "grad_norm": 0.4596133828163147, "learning_rate": 2.0241182033369034e-06, "loss": 0.2809, "step": 21235 }, { "epoch": 0.7940743510059116, "grad_norm": 0.17286808788776398, "learning_rate": 2.0205772278033153e-06, "loss": 0.245, "step": 21240 }, { "epoch": 0.7942612799962614, "grad_norm": 0.42331114411354065, "learning_rate": 2.017039004143999e-06, "loss": 0.2423, "step": 21245 }, { "epoch": 0.7944482089866112, "grad_norm": 0.6041852235794067, "learning_rate": 2.013503533579193e-06, "loss": 0.2279, "step": 21250 }, { "epoch": 0.794635137976961, "grad_norm": 0.4134674668312073, "learning_rate": 2.009970817328173e-06, "loss": 0.2674, "step": 21255 }, { "epoch": 0.7948220669673108, "grad_norm": 0.44482535123825073, "learning_rate": 2.0064408566092762e-06, "loss": 0.2763, "step": 21260 }, { "epoch": 0.7950089959576606, "grad_norm": 0.6239652037620544, "learning_rate": 2.002913652639883e-06, "loss": 0.2297, "step": 21265 }, { "epoch": 0.7951959249480104, "grad_norm": 0.4844609498977661, "learning_rate": 1.999389206636426e-06, "loss": 0.2071, "step": 21270 }, { "epoch": 0.7953828539383602, "grad_norm": 0.37075021862983704, "learning_rate": 1.9958675198143873e-06, "loss": 0.2251, "step": 21275 }, { "epoch": 0.79556978292871, "grad_norm": 0.16581667959690094, "learning_rate": 1.992348593388289e-06, "loss": 0.1926, "step": 21280 }, { "epoch": 0.7957567119190597, "grad_norm": 0.4773890972137451, "learning_rate": 1.9888324285717166e-06, "loss": 0.3475, "step": 21285 }, { "epoch": 0.7959436409094095, "grad_norm": 0.57837975025177, "learning_rate": 1.985319026577287e-06, "loss": 0.3069, "step": 21290 }, { "epoch": 0.7961305698997593, "grad_norm": 0.29405614733695984, "learning_rate": 1.9818083886166795e-06, "loss": 0.2997, "step": 21295 }, { "epoch": 0.7963174988901092, "grad_norm": 0.2895260155200958, "learning_rate": 1.978300515900604e-06, "loss": 0.2789, "step": 21300 }, { "epoch": 0.7965044278804589, "grad_norm": 0.36291879415512085, "learning_rate": 1.9747954096388343e-06, "loss": 0.2466, "step": 21305 }, { "epoch": 0.7966913568708087, "grad_norm": 0.477522611618042, "learning_rate": 1.9712930710401735e-06, "loss": 0.2304, "step": 21310 }, { "epoch": 0.7968782858611585, "grad_norm": 0.7150763273239136, "learning_rate": 1.967793501312483e-06, "loss": 0.2485, "step": 21315 }, { "epoch": 0.7970652148515083, "grad_norm": 0.4385119378566742, "learning_rate": 1.9642967016626624e-06, "loss": 0.2579, "step": 21320 }, { "epoch": 0.797252143841858, "grad_norm": 0.4212881624698639, "learning_rate": 1.9608026732966544e-06, "loss": 0.2947, "step": 21325 }, { "epoch": 0.7974390728322078, "grad_norm": 0.54429692029953, "learning_rate": 1.957311417419455e-06, "loss": 0.2657, "step": 21330 }, { "epoch": 0.7976260018225576, "grad_norm": 0.6954907178878784, "learning_rate": 1.9538229352350924e-06, "loss": 0.2948, "step": 21335 }, { "epoch": 0.7978129308129075, "grad_norm": 0.56966632604599, "learning_rate": 1.95033722794665e-06, "loss": 0.2921, "step": 21340 }, { "epoch": 0.7979998598032573, "grad_norm": 0.6537004113197327, "learning_rate": 1.9468542967562443e-06, "loss": 0.4463, "step": 21345 }, { "epoch": 0.798186788793607, "grad_norm": 1.18008291721344, "learning_rate": 1.943374142865042e-06, "loss": 0.3565, "step": 21350 }, { "epoch": 0.7983737177839568, "grad_norm": 0.28714442253112793, "learning_rate": 1.939896767473243e-06, "loss": 0.2965, "step": 21355 }, { "epoch": 0.7985606467743066, "grad_norm": 1.268236756324768, "learning_rate": 1.936422171780101e-06, "loss": 0.3542, "step": 21360 }, { "epoch": 0.7987475757646564, "grad_norm": 0.5523670315742493, "learning_rate": 1.9329503569839002e-06, "loss": 0.2992, "step": 21365 }, { "epoch": 0.7989345047550062, "grad_norm": 0.391519695520401, "learning_rate": 1.92948132428197e-06, "loss": 0.3191, "step": 21370 }, { "epoch": 0.7991214337453559, "grad_norm": 0.6156255602836609, "learning_rate": 1.926015074870683e-06, "loss": 0.2582, "step": 21375 }, { "epoch": 0.7993083627357058, "grad_norm": 0.29384270310401917, "learning_rate": 1.9225516099454456e-06, "loss": 0.29, "step": 21380 }, { "epoch": 0.7994952917260556, "grad_norm": 0.46407338976860046, "learning_rate": 1.919090930700712e-06, "loss": 0.2988, "step": 21385 }, { "epoch": 0.7996822207164054, "grad_norm": 0.16256499290466309, "learning_rate": 1.915633038329967e-06, "loss": 0.3916, "step": 21390 }, { "epoch": 0.7998691497067552, "grad_norm": 0.6829771399497986, "learning_rate": 1.912177934025743e-06, "loss": 0.2818, "step": 21395 }, { "epoch": 0.8000560786971049, "grad_norm": 0.6164911389350891, "learning_rate": 1.9087256189796012e-06, "loss": 0.3556, "step": 21400 }, { "epoch": 0.8002430076874547, "grad_norm": 0.5262479782104492, "learning_rate": 1.9052760943821513e-06, "loss": 0.2089, "step": 21405 }, { "epoch": 0.8004299366778045, "grad_norm": 0.3013521730899811, "learning_rate": 1.901829361423031e-06, "loss": 0.2159, "step": 21410 }, { "epoch": 0.8006168656681543, "grad_norm": 0.37884315848350525, "learning_rate": 1.8983854212909247e-06, "loss": 0.2776, "step": 21415 }, { "epoch": 0.8008037946585042, "grad_norm": 0.27459797263145447, "learning_rate": 1.894944275173547e-06, "loss": 0.3337, "step": 21420 }, { "epoch": 0.8009907236488539, "grad_norm": 0.6611989736557007, "learning_rate": 1.8915059242576462e-06, "loss": 0.2814, "step": 21425 }, { "epoch": 0.8011776526392037, "grad_norm": 0.2187768518924713, "learning_rate": 1.888070369729016e-06, "loss": 0.2271, "step": 21430 }, { "epoch": 0.8013645816295535, "grad_norm": 0.37497401237487793, "learning_rate": 1.8846376127724775e-06, "loss": 0.2489, "step": 21435 }, { "epoch": 0.8015515106199033, "grad_norm": 0.44571053981781006, "learning_rate": 1.881207654571895e-06, "loss": 0.2394, "step": 21440 }, { "epoch": 0.801738439610253, "grad_norm": 0.34269171953201294, "learning_rate": 1.8777804963101553e-06, "loss": 0.2689, "step": 21445 }, { "epoch": 0.8019253686006028, "grad_norm": 0.3532237410545349, "learning_rate": 1.8743561391691955e-06, "loss": 0.2671, "step": 21450 }, { "epoch": 0.8021122975909526, "grad_norm": 0.27271535992622375, "learning_rate": 1.8709345843299708e-06, "loss": 0.2663, "step": 21455 }, { "epoch": 0.8022992265813025, "grad_norm": 0.3709903061389923, "learning_rate": 1.867515832972484e-06, "loss": 0.3473, "step": 21460 }, { "epoch": 0.8024861555716523, "grad_norm": 0.5782517790794373, "learning_rate": 1.864099886275761e-06, "loss": 0.3205, "step": 21465 }, { "epoch": 0.802673084562002, "grad_norm": 0.40941113233566284, "learning_rate": 1.8606867454178612e-06, "loss": 0.3808, "step": 21470 }, { "epoch": 0.8028600135523518, "grad_norm": 0.29615747928619385, "learning_rate": 1.8572764115758846e-06, "loss": 0.3111, "step": 21475 }, { "epoch": 0.8030469425427016, "grad_norm": 0.8128417134284973, "learning_rate": 1.8538688859259534e-06, "loss": 0.3365, "step": 21480 }, { "epoch": 0.8032338715330514, "grad_norm": 0.5973396897315979, "learning_rate": 1.850464169643229e-06, "loss": 0.2514, "step": 21485 }, { "epoch": 0.8034208005234011, "grad_norm": 0.46153488755226135, "learning_rate": 1.8470622639018964e-06, "loss": 0.3385, "step": 21490 }, { "epoch": 0.8036077295137509, "grad_norm": 0.45193207263946533, "learning_rate": 1.8436631698751806e-06, "loss": 0.2695, "step": 21495 }, { "epoch": 0.8037946585041007, "grad_norm": 0.5333691835403442, "learning_rate": 1.840266888735326e-06, "loss": 0.2737, "step": 21500 }, { "epoch": 0.8039815874944506, "grad_norm": 0.46720612049102783, "learning_rate": 1.8368734216536176e-06, "loss": 0.2952, "step": 21505 }, { "epoch": 0.8041685164848004, "grad_norm": 0.24607418477535248, "learning_rate": 1.8334827698003644e-06, "loss": 0.3072, "step": 21510 }, { "epoch": 0.8043554454751501, "grad_norm": 0.3917045295238495, "learning_rate": 1.8300949343449003e-06, "loss": 0.2582, "step": 21515 }, { "epoch": 0.8045423744654999, "grad_norm": 0.27164673805236816, "learning_rate": 1.8267099164555978e-06, "loss": 0.2381, "step": 21520 }, { "epoch": 0.8047293034558497, "grad_norm": 0.7924597859382629, "learning_rate": 1.8233277172998486e-06, "loss": 0.2594, "step": 21525 }, { "epoch": 0.8049162324461995, "grad_norm": 0.17336755990982056, "learning_rate": 1.8199483380440808e-06, "loss": 0.2524, "step": 21530 }, { "epoch": 0.8051031614365493, "grad_norm": 0.5164810419082642, "learning_rate": 1.8165717798537407e-06, "loss": 0.31, "step": 21535 }, { "epoch": 0.805290090426899, "grad_norm": 0.6605789065361023, "learning_rate": 1.8131980438933117e-06, "loss": 0.3017, "step": 21540 }, { "epoch": 0.8054770194172489, "grad_norm": 0.31877684593200684, "learning_rate": 1.809827131326294e-06, "loss": 0.2414, "step": 21545 }, { "epoch": 0.8056639484075987, "grad_norm": 0.4892379641532898, "learning_rate": 1.8064590433152218e-06, "loss": 0.2236, "step": 21550 }, { "epoch": 0.8058508773979485, "grad_norm": 0.4304451644420624, "learning_rate": 1.8030937810216486e-06, "loss": 0.2741, "step": 21555 }, { "epoch": 0.8060378063882983, "grad_norm": 0.5623677968978882, "learning_rate": 1.7997313456061615e-06, "loss": 0.2997, "step": 21560 }, { "epoch": 0.806224735378648, "grad_norm": 0.24887755513191223, "learning_rate": 1.7963717382283663e-06, "loss": 0.2429, "step": 21565 }, { "epoch": 0.8064116643689978, "grad_norm": 0.4200691282749176, "learning_rate": 1.7930149600468927e-06, "loss": 0.2047, "step": 21570 }, { "epoch": 0.8065985933593476, "grad_norm": 0.5015040040016174, "learning_rate": 1.7896610122194015e-06, "loss": 0.3563, "step": 21575 }, { "epoch": 0.8067855223496974, "grad_norm": 0.7050318121910095, "learning_rate": 1.7863098959025692e-06, "loss": 0.2453, "step": 21580 }, { "epoch": 0.8069724513400472, "grad_norm": 0.39785218238830566, "learning_rate": 1.7829616122521043e-06, "loss": 0.2441, "step": 21585 }, { "epoch": 0.807159380330397, "grad_norm": 0.5187550187110901, "learning_rate": 1.7796161624227281e-06, "loss": 0.3743, "step": 21590 }, { "epoch": 0.8073463093207468, "grad_norm": 0.20182538032531738, "learning_rate": 1.7762735475681947e-06, "loss": 0.2975, "step": 21595 }, { "epoch": 0.8075332383110966, "grad_norm": 0.5243845582008362, "learning_rate": 1.7729337688412772e-06, "loss": 0.2309, "step": 21600 }, { "epoch": 0.8077201673014464, "grad_norm": 0.39806708693504333, "learning_rate": 1.769596827393768e-06, "loss": 0.3061, "step": 21605 }, { "epoch": 0.8079070962917961, "grad_norm": 0.37377694249153137, "learning_rate": 1.7662627243764808e-06, "loss": 0.3372, "step": 21610 }, { "epoch": 0.8080940252821459, "grad_norm": 0.4244140088558197, "learning_rate": 1.7629314609392523e-06, "loss": 0.2972, "step": 21615 }, { "epoch": 0.8082809542724957, "grad_norm": 0.44188788533210754, "learning_rate": 1.7596030382309436e-06, "loss": 0.3291, "step": 21620 }, { "epoch": 0.8084678832628456, "grad_norm": 0.37357720732688904, "learning_rate": 1.7562774573994267e-06, "loss": 0.2561, "step": 21625 }, { "epoch": 0.8086548122531954, "grad_norm": 0.28990504145622253, "learning_rate": 1.7529547195916052e-06, "loss": 0.2949, "step": 21630 }, { "epoch": 0.8088417412435451, "grad_norm": 0.49605655670166016, "learning_rate": 1.7496348259533902e-06, "loss": 0.3587, "step": 21635 }, { "epoch": 0.8090286702338949, "grad_norm": 0.3280414044857025, "learning_rate": 1.7463177776297202e-06, "loss": 0.2554, "step": 21640 }, { "epoch": 0.8092155992242447, "grad_norm": 0.4728814959526062, "learning_rate": 1.7430035757645546e-06, "loss": 0.3598, "step": 21645 }, { "epoch": 0.8094025282145945, "grad_norm": 0.45704665780067444, "learning_rate": 1.7396922215008628e-06, "loss": 0.3224, "step": 21650 }, { "epoch": 0.8095894572049442, "grad_norm": 0.47432267665863037, "learning_rate": 1.7363837159806352e-06, "loss": 0.276, "step": 21655 }, { "epoch": 0.809776386195294, "grad_norm": 0.4984903037548065, "learning_rate": 1.7330780603448794e-06, "loss": 0.3783, "step": 21660 }, { "epoch": 0.8099633151856439, "grad_norm": 0.4554220736026764, "learning_rate": 1.7297752557336257e-06, "loss": 0.2547, "step": 21665 }, { "epoch": 0.8101502441759937, "grad_norm": 0.3859618008136749, "learning_rate": 1.7264753032859115e-06, "loss": 0.3227, "step": 21670 }, { "epoch": 0.8103371731663435, "grad_norm": 0.2784779369831085, "learning_rate": 1.7231782041398015e-06, "loss": 0.3581, "step": 21675 }, { "epoch": 0.8105241021566932, "grad_norm": 0.4222283363342285, "learning_rate": 1.7198839594323658e-06, "loss": 0.3204, "step": 21680 }, { "epoch": 0.810711031147043, "grad_norm": 0.8711488842964172, "learning_rate": 1.7165925702997e-06, "loss": 0.3242, "step": 21685 }, { "epoch": 0.8108979601373928, "grad_norm": 0.39160558581352234, "learning_rate": 1.7133040378769039e-06, "loss": 0.2511, "step": 21690 }, { "epoch": 0.8110848891277426, "grad_norm": 1.2719933986663818, "learning_rate": 1.7100183632981039e-06, "loss": 0.3113, "step": 21695 }, { "epoch": 0.8112718181180923, "grad_norm": 0.23821887373924255, "learning_rate": 1.706735547696432e-06, "loss": 0.3062, "step": 21700 }, { "epoch": 0.8114587471084422, "grad_norm": 0.4011630117893219, "learning_rate": 1.7034555922040351e-06, "loss": 0.2491, "step": 21705 }, { "epoch": 0.811645676098792, "grad_norm": 0.28980720043182373, "learning_rate": 1.7001784979520808e-06, "loss": 0.2797, "step": 21710 }, { "epoch": 0.8118326050891418, "grad_norm": 0.41728466749191284, "learning_rate": 1.6969042660707413e-06, "loss": 0.3272, "step": 21715 }, { "epoch": 0.8120195340794916, "grad_norm": 0.22036214172840118, "learning_rate": 1.693632897689208e-06, "loss": 0.2638, "step": 21720 }, { "epoch": 0.8122064630698413, "grad_norm": 0.5138218402862549, "learning_rate": 1.6903643939356784e-06, "loss": 0.2993, "step": 21725 }, { "epoch": 0.8123933920601911, "grad_norm": 0.6489676237106323, "learning_rate": 1.6870987559373709e-06, "loss": 0.2849, "step": 21730 }, { "epoch": 0.8125803210505409, "grad_norm": 0.2285657376050949, "learning_rate": 1.6838359848205055e-06, "loss": 0.2543, "step": 21735 }, { "epoch": 0.8127672500408907, "grad_norm": 0.29291707277297974, "learning_rate": 1.6805760817103201e-06, "loss": 0.2461, "step": 21740 }, { "epoch": 0.8129541790312405, "grad_norm": 0.4174777865409851, "learning_rate": 1.6773190477310652e-06, "loss": 0.2149, "step": 21745 }, { "epoch": 0.8131411080215903, "grad_norm": 0.2554204761981964, "learning_rate": 1.6740648840059958e-06, "loss": 0.2763, "step": 21750 }, { "epoch": 0.8133280370119401, "grad_norm": 0.8167181611061096, "learning_rate": 1.6708135916573797e-06, "loss": 0.2746, "step": 21755 }, { "epoch": 0.8135149660022899, "grad_norm": 0.304171085357666, "learning_rate": 1.6675651718064922e-06, "loss": 0.2495, "step": 21760 }, { "epoch": 0.8137018949926397, "grad_norm": 0.39825260639190674, "learning_rate": 1.6643196255736239e-06, "loss": 0.2251, "step": 21765 }, { "epoch": 0.8138888239829895, "grad_norm": 0.36910250782966614, "learning_rate": 1.661076954078068e-06, "loss": 0.3272, "step": 21770 }, { "epoch": 0.8140757529733392, "grad_norm": 0.31329768896102905, "learning_rate": 1.6578371584381326e-06, "loss": 0.2188, "step": 21775 }, { "epoch": 0.814262681963689, "grad_norm": 0.2181299477815628, "learning_rate": 1.6546002397711247e-06, "loss": 0.3479, "step": 21780 }, { "epoch": 0.8144496109540388, "grad_norm": 0.8821444511413574, "learning_rate": 1.6513661991933694e-06, "loss": 0.2365, "step": 21785 }, { "epoch": 0.8146365399443887, "grad_norm": 0.3816254436969757, "learning_rate": 1.6481350378201954e-06, "loss": 0.348, "step": 21790 }, { "epoch": 0.8148234689347384, "grad_norm": 0.7494456171989441, "learning_rate": 1.644906756765935e-06, "loss": 0.2938, "step": 21795 }, { "epoch": 0.8150103979250882, "grad_norm": 0.5195353031158447, "learning_rate": 1.6416813571439305e-06, "loss": 0.3243, "step": 21800 }, { "epoch": 0.815197326915438, "grad_norm": 0.3925173282623291, "learning_rate": 1.638458840066528e-06, "loss": 0.2577, "step": 21805 }, { "epoch": 0.8153842559057878, "grad_norm": 0.5360116958618164, "learning_rate": 1.635239206645085e-06, "loss": 0.3019, "step": 21810 }, { "epoch": 0.8155711848961376, "grad_norm": 0.384461909532547, "learning_rate": 1.632022457989958e-06, "loss": 0.2346, "step": 21815 }, { "epoch": 0.8157581138864873, "grad_norm": 0.3784307539463043, "learning_rate": 1.6288085952105126e-06, "loss": 0.2576, "step": 21820 }, { "epoch": 0.8159450428768371, "grad_norm": 0.3324683606624603, "learning_rate": 1.6255976194151168e-06, "loss": 0.3859, "step": 21825 }, { "epoch": 0.816131971867187, "grad_norm": 0.6262183785438538, "learning_rate": 1.6223895317111449e-06, "loss": 0.2211, "step": 21830 }, { "epoch": 0.8163189008575368, "grad_norm": 0.3607613444328308, "learning_rate": 1.6191843332049762e-06, "loss": 0.2672, "step": 21835 }, { "epoch": 0.8165058298478866, "grad_norm": 0.33100825548171997, "learning_rate": 1.615982025001991e-06, "loss": 0.2289, "step": 21840 }, { "epoch": 0.8166927588382363, "grad_norm": 0.7562757134437561, "learning_rate": 1.6127826082065723e-06, "loss": 0.2542, "step": 21845 }, { "epoch": 0.8168796878285861, "grad_norm": 0.359550803899765, "learning_rate": 1.6095860839221055e-06, "loss": 0.3022, "step": 21850 }, { "epoch": 0.8170666168189359, "grad_norm": 0.8539102077484131, "learning_rate": 1.6063924532509856e-06, "loss": 0.3232, "step": 21855 }, { "epoch": 0.8172535458092857, "grad_norm": 0.23160411417484283, "learning_rate": 1.603201717294599e-06, "loss": 0.2307, "step": 21860 }, { "epoch": 0.8174404747996354, "grad_norm": 0.5298076272010803, "learning_rate": 1.6000138771533424e-06, "loss": 0.223, "step": 21865 }, { "epoch": 0.8176274037899853, "grad_norm": 0.5675511360168457, "learning_rate": 1.5968289339266084e-06, "loss": 0.3385, "step": 21870 }, { "epoch": 0.8178143327803351, "grad_norm": 0.8937890529632568, "learning_rate": 1.5936468887127932e-06, "loss": 0.3094, "step": 21875 }, { "epoch": 0.8180012617706849, "grad_norm": 0.418905645608902, "learning_rate": 1.5904677426092964e-06, "loss": 0.3251, "step": 21880 }, { "epoch": 0.8181881907610347, "grad_norm": 0.26117852330207825, "learning_rate": 1.587291496712512e-06, "loss": 0.3468, "step": 21885 }, { "epoch": 0.8183751197513844, "grad_norm": 0.9076757431030273, "learning_rate": 1.584118152117835e-06, "loss": 0.26, "step": 21890 }, { "epoch": 0.8185620487417342, "grad_norm": 0.46946465969085693, "learning_rate": 1.5809477099196592e-06, "loss": 0.3238, "step": 21895 }, { "epoch": 0.818748977732084, "grad_norm": 0.28860294818878174, "learning_rate": 1.577780171211385e-06, "loss": 0.2435, "step": 21900 }, { "epoch": 0.8189359067224338, "grad_norm": 0.24636733531951904, "learning_rate": 1.5746155370853998e-06, "loss": 0.2566, "step": 21905 }, { "epoch": 0.8191228357127837, "grad_norm": 0.12167595326900482, "learning_rate": 1.571453808633101e-06, "loss": 0.3848, "step": 21910 }, { "epoch": 0.8193097647031334, "grad_norm": 0.6894955039024353, "learning_rate": 1.5682949869448715e-06, "loss": 0.3028, "step": 21915 }, { "epoch": 0.8194966936934832, "grad_norm": 0.3803219199180603, "learning_rate": 1.5651390731101035e-06, "loss": 0.2547, "step": 21920 }, { "epoch": 0.819683622683833, "grad_norm": 0.28584155440330505, "learning_rate": 1.5619860682171817e-06, "loss": 0.2718, "step": 21925 }, { "epoch": 0.8198705516741828, "grad_norm": 0.5630354285240173, "learning_rate": 1.558835973353483e-06, "loss": 0.2787, "step": 21930 }, { "epoch": 0.8200574806645325, "grad_norm": 0.2100723534822464, "learning_rate": 1.5556887896053896e-06, "loss": 0.3048, "step": 21935 }, { "epoch": 0.8202444096548823, "grad_norm": 0.38809457421302795, "learning_rate": 1.5525445180582721e-06, "loss": 0.2565, "step": 21940 }, { "epoch": 0.8204313386452321, "grad_norm": 0.4679727554321289, "learning_rate": 1.549403159796501e-06, "loss": 0.2861, "step": 21945 }, { "epoch": 0.820618267635582, "grad_norm": 0.39624401926994324, "learning_rate": 1.5462647159034362e-06, "loss": 0.2531, "step": 21950 }, { "epoch": 0.8208051966259318, "grad_norm": 0.32841363549232483, "learning_rate": 1.543129187461444e-06, "loss": 0.2593, "step": 21955 }, { "epoch": 0.8209921256162815, "grad_norm": 0.3810194134712219, "learning_rate": 1.539996575551872e-06, "loss": 0.2991, "step": 21960 }, { "epoch": 0.8211790546066313, "grad_norm": 0.37340226769447327, "learning_rate": 1.5368668812550724e-06, "loss": 0.2569, "step": 21965 }, { "epoch": 0.8213659835969811, "grad_norm": 0.511825680732727, "learning_rate": 1.5337401056503876e-06, "loss": 0.2649, "step": 21970 }, { "epoch": 0.8215529125873309, "grad_norm": 0.5602962374687195, "learning_rate": 1.5306162498161493e-06, "loss": 0.2674, "step": 21975 }, { "epoch": 0.8217398415776807, "grad_norm": 0.27479755878448486, "learning_rate": 1.52749531482969e-06, "loss": 0.304, "step": 21980 }, { "epoch": 0.8219267705680304, "grad_norm": 0.3318054676055908, "learning_rate": 1.5243773017673292e-06, "loss": 0.2914, "step": 21985 }, { "epoch": 0.8221136995583802, "grad_norm": 0.44144922494888306, "learning_rate": 1.5212622117043784e-06, "loss": 0.262, "step": 21990 }, { "epoch": 0.8223006285487301, "grad_norm": 0.8043171763420105, "learning_rate": 1.5181500457151432e-06, "loss": 0.3467, "step": 21995 }, { "epoch": 0.8224875575390799, "grad_norm": 0.4182523787021637, "learning_rate": 1.5150408048729226e-06, "loss": 0.3729, "step": 22000 }, { "epoch": 0.8226744865294296, "grad_norm": 0.22673340141773224, "learning_rate": 1.5119344902500022e-06, "loss": 0.3049, "step": 22005 }, { "epoch": 0.8228614155197794, "grad_norm": 0.3719673454761505, "learning_rate": 1.5088311029176628e-06, "loss": 0.2519, "step": 22010 }, { "epoch": 0.8230483445101292, "grad_norm": 0.3984321355819702, "learning_rate": 1.5057306439461738e-06, "loss": 0.2315, "step": 22015 }, { "epoch": 0.823235273500479, "grad_norm": 0.24741186201572418, "learning_rate": 1.5026331144047935e-06, "loss": 0.3394, "step": 22020 }, { "epoch": 0.8234222024908288, "grad_norm": 0.43265968561172485, "learning_rate": 1.4995385153617725e-06, "loss": 0.3895, "step": 22025 }, { "epoch": 0.8236091314811785, "grad_norm": 0.34110236167907715, "learning_rate": 1.4964468478843496e-06, "loss": 0.2754, "step": 22030 }, { "epoch": 0.8237960604715284, "grad_norm": 0.21157729625701904, "learning_rate": 1.4933581130387509e-06, "loss": 0.253, "step": 22035 }, { "epoch": 0.8239829894618782, "grad_norm": 0.5705170035362244, "learning_rate": 1.4902723118901907e-06, "loss": 0.2533, "step": 22040 }, { "epoch": 0.824169918452228, "grad_norm": 0.41971489787101746, "learning_rate": 1.4871894455028778e-06, "loss": 0.2568, "step": 22045 }, { "epoch": 0.8243568474425778, "grad_norm": 0.5276783108711243, "learning_rate": 1.4841095149399998e-06, "loss": 0.3132, "step": 22050 }, { "epoch": 0.8245437764329275, "grad_norm": 0.2946646213531494, "learning_rate": 1.481032521263739e-06, "loss": 0.304, "step": 22055 }, { "epoch": 0.8247307054232773, "grad_norm": 0.6871627569198608, "learning_rate": 1.4779584655352652e-06, "loss": 0.322, "step": 22060 }, { "epoch": 0.8249176344136271, "grad_norm": 0.37137484550476074, "learning_rate": 1.4748873488147264e-06, "loss": 0.3303, "step": 22065 }, { "epoch": 0.8251045634039769, "grad_norm": 0.34239670634269714, "learning_rate": 1.4718191721612684e-06, "loss": 0.297, "step": 22070 }, { "epoch": 0.8252914923943268, "grad_norm": 0.6226434707641602, "learning_rate": 1.468753936633014e-06, "loss": 0.3276, "step": 22075 }, { "epoch": 0.8254784213846765, "grad_norm": 0.5200907588005066, "learning_rate": 1.4656916432870737e-06, "loss": 0.2458, "step": 22080 }, { "epoch": 0.8256653503750263, "grad_norm": 0.513783872127533, "learning_rate": 1.4626322931795489e-06, "loss": 0.2929, "step": 22085 }, { "epoch": 0.8258522793653761, "grad_norm": 3.522709846496582, "learning_rate": 1.4595758873655198e-06, "loss": 0.2378, "step": 22090 }, { "epoch": 0.8260392083557259, "grad_norm": 0.12881094217300415, "learning_rate": 1.4565224268990507e-06, "loss": 0.2293, "step": 22095 }, { "epoch": 0.8262261373460756, "grad_norm": 0.5438992381095886, "learning_rate": 1.4534719128331953e-06, "loss": 0.2544, "step": 22100 }, { "epoch": 0.8264130663364254, "grad_norm": 0.28744834661483765, "learning_rate": 1.4504243462199896e-06, "loss": 0.2217, "step": 22105 }, { "epoch": 0.8265999953267752, "grad_norm": 0.39146536588668823, "learning_rate": 1.4473797281104485e-06, "loss": 0.2257, "step": 22110 }, { "epoch": 0.8267869243171251, "grad_norm": 0.6850770115852356, "learning_rate": 1.4443380595545787e-06, "loss": 0.3273, "step": 22115 }, { "epoch": 0.8269738533074749, "grad_norm": 0.3585718870162964, "learning_rate": 1.4412993416013588e-06, "loss": 0.2597, "step": 22120 }, { "epoch": 0.8271607822978246, "grad_norm": 0.6018190979957581, "learning_rate": 1.4382635752987606e-06, "loss": 0.2203, "step": 22125 }, { "epoch": 0.8273477112881744, "grad_norm": 0.48121556639671326, "learning_rate": 1.43523076169373e-06, "loss": 0.2179, "step": 22130 }, { "epoch": 0.8275346402785242, "grad_norm": 0.3579247295856476, "learning_rate": 1.432200901832198e-06, "loss": 0.2088, "step": 22135 }, { "epoch": 0.827721569268874, "grad_norm": 0.58817058801651, "learning_rate": 1.4291739967590746e-06, "loss": 0.2333, "step": 22140 }, { "epoch": 0.8279084982592237, "grad_norm": 0.44945281744003296, "learning_rate": 1.4261500475182543e-06, "loss": 0.2591, "step": 22145 }, { "epoch": 0.8280954272495735, "grad_norm": 0.6072959899902344, "learning_rate": 1.4231290551526133e-06, "loss": 0.2795, "step": 22150 }, { "epoch": 0.8282823562399234, "grad_norm": 0.5549390912055969, "learning_rate": 1.4201110207039993e-06, "loss": 0.2617, "step": 22155 }, { "epoch": 0.8284692852302732, "grad_norm": 0.419281929731369, "learning_rate": 1.4170959452132526e-06, "loss": 0.2794, "step": 22160 }, { "epoch": 0.828656214220623, "grad_norm": 0.5446984171867371, "learning_rate": 1.414083829720181e-06, "loss": 0.2595, "step": 22165 }, { "epoch": 0.8288431432109727, "grad_norm": 0.562941312789917, "learning_rate": 1.4110746752635806e-06, "loss": 0.3615, "step": 22170 }, { "epoch": 0.8290300722013225, "grad_norm": 0.42342409491539, "learning_rate": 1.4080684828812219e-06, "loss": 0.2564, "step": 22175 }, { "epoch": 0.8292170011916723, "grad_norm": 0.3505541682243347, "learning_rate": 1.4050652536098518e-06, "loss": 0.2427, "step": 22180 }, { "epoch": 0.8294039301820221, "grad_norm": 0.5046654939651489, "learning_rate": 1.4020649884851988e-06, "loss": 0.2361, "step": 22185 }, { "epoch": 0.8295908591723719, "grad_norm": 0.46937140822410583, "learning_rate": 1.3990676885419685e-06, "loss": 0.2695, "step": 22190 }, { "epoch": 0.8297777881627217, "grad_norm": 1.1296278238296509, "learning_rate": 1.3960733548138472e-06, "loss": 0.3084, "step": 22195 }, { "epoch": 0.8299647171530715, "grad_norm": 0.5058994889259338, "learning_rate": 1.3930819883334901e-06, "loss": 0.2994, "step": 22200 }, { "epoch": 0.8301516461434213, "grad_norm": 0.26385819911956787, "learning_rate": 1.3900935901325374e-06, "loss": 0.2441, "step": 22205 }, { "epoch": 0.8303385751337711, "grad_norm": 0.5458356738090515, "learning_rate": 1.3871081612415982e-06, "loss": 0.391, "step": 22210 }, { "epoch": 0.8305255041241209, "grad_norm": 0.28343480825424194, "learning_rate": 1.3841257026902665e-06, "loss": 0.2884, "step": 22215 }, { "epoch": 0.8307124331144706, "grad_norm": 0.3122907876968384, "learning_rate": 1.3811462155071043e-06, "loss": 0.2776, "step": 22220 }, { "epoch": 0.8308993621048204, "grad_norm": 0.7874910831451416, "learning_rate": 1.3781697007196493e-06, "loss": 0.2769, "step": 22225 }, { "epoch": 0.8310862910951702, "grad_norm": 0.5420087575912476, "learning_rate": 1.3751961593544171e-06, "loss": 0.2888, "step": 22230 }, { "epoch": 0.83127322008552, "grad_norm": 0.2580297887325287, "learning_rate": 1.3722255924368965e-06, "loss": 0.3204, "step": 22235 }, { "epoch": 0.8314601490758698, "grad_norm": 0.40570923686027527, "learning_rate": 1.3692580009915557e-06, "loss": 0.2792, "step": 22240 }, { "epoch": 0.8316470780662196, "grad_norm": 0.3946312963962555, "learning_rate": 1.3662933860418249e-06, "loss": 0.2946, "step": 22245 }, { "epoch": 0.8318340070565694, "grad_norm": 0.26744893193244934, "learning_rate": 1.3633317486101205e-06, "loss": 0.2792, "step": 22250 }, { "epoch": 0.8320209360469192, "grad_norm": 0.41380831599235535, "learning_rate": 1.3603730897178226e-06, "loss": 0.2922, "step": 22255 }, { "epoch": 0.832207865037269, "grad_norm": 0.27136343717575073, "learning_rate": 1.3574174103852922e-06, "loss": 0.315, "step": 22260 }, { "epoch": 0.8323947940276187, "grad_norm": 0.6136317849159241, "learning_rate": 1.3544647116318522e-06, "loss": 0.2856, "step": 22265 }, { "epoch": 0.8325817230179685, "grad_norm": 0.42765137553215027, "learning_rate": 1.3515149944758098e-06, "loss": 0.208, "step": 22270 }, { "epoch": 0.8327686520083183, "grad_norm": 0.41445282101631165, "learning_rate": 1.3485682599344351e-06, "loss": 0.3367, "step": 22275 }, { "epoch": 0.8329555809986682, "grad_norm": 0.2044340819120407, "learning_rate": 1.3456245090239706e-06, "loss": 0.2173, "step": 22280 }, { "epoch": 0.833142509989018, "grad_norm": 0.3924507796764374, "learning_rate": 1.3426837427596363e-06, "loss": 0.3163, "step": 22285 }, { "epoch": 0.8333294389793677, "grad_norm": 0.6259861588478088, "learning_rate": 1.339745962155613e-06, "loss": 0.3531, "step": 22290 }, { "epoch": 0.8335163679697175, "grad_norm": 0.6004974246025085, "learning_rate": 1.336811168225063e-06, "loss": 0.2664, "step": 22295 }, { "epoch": 0.8337032969600673, "grad_norm": 0.3294597268104553, "learning_rate": 1.333879361980106e-06, "loss": 0.2862, "step": 22300 }, { "epoch": 0.8338902259504171, "grad_norm": 0.8053619861602783, "learning_rate": 1.3309505444318439e-06, "loss": 0.2701, "step": 22305 }, { "epoch": 0.8340771549407668, "grad_norm": 0.47687965631484985, "learning_rate": 1.328024716590336e-06, "loss": 0.2936, "step": 22310 }, { "epoch": 0.8342640839311166, "grad_norm": 0.2641568183898926, "learning_rate": 1.3251018794646232e-06, "loss": 0.2435, "step": 22315 }, { "epoch": 0.8344510129214665, "grad_norm": 0.3692987859249115, "learning_rate": 1.3221820340627044e-06, "loss": 0.2775, "step": 22320 }, { "epoch": 0.8346379419118163, "grad_norm": 0.6120216846466064, "learning_rate": 1.319265181391549e-06, "loss": 0.3437, "step": 22325 }, { "epoch": 0.8348248709021661, "grad_norm": 0.5429954528808594, "learning_rate": 1.3163513224571012e-06, "loss": 0.3351, "step": 22330 }, { "epoch": 0.8350117998925158, "grad_norm": 0.5047636032104492, "learning_rate": 1.3134404582642612e-06, "loss": 0.3325, "step": 22335 }, { "epoch": 0.8351987288828656, "grad_norm": 0.5833032131195068, "learning_rate": 1.3105325898169075e-06, "loss": 0.2929, "step": 22340 }, { "epoch": 0.8353856578732154, "grad_norm": 0.35585522651672363, "learning_rate": 1.3076277181178775e-06, "loss": 0.246, "step": 22345 }, { "epoch": 0.8355725868635652, "grad_norm": 0.4089027941226959, "learning_rate": 1.3047258441689815e-06, "loss": 0.3233, "step": 22350 }, { "epoch": 0.835759515853915, "grad_norm": 0.4178864657878876, "learning_rate": 1.3018269689709883e-06, "loss": 0.3107, "step": 22355 }, { "epoch": 0.8359464448442648, "grad_norm": 0.6142130494117737, "learning_rate": 1.2989310935236421e-06, "loss": 0.2829, "step": 22360 }, { "epoch": 0.8361333738346146, "grad_norm": 0.3945229947566986, "learning_rate": 1.2960382188256438e-06, "loss": 0.2744, "step": 22365 }, { "epoch": 0.8363203028249644, "grad_norm": 0.3515075147151947, "learning_rate": 1.2931483458746618e-06, "loss": 0.2536, "step": 22370 }, { "epoch": 0.8365072318153142, "grad_norm": 0.31164297461509705, "learning_rate": 1.2902614756673348e-06, "loss": 0.2761, "step": 22375 }, { "epoch": 0.836694160805664, "grad_norm": 0.24446457624435425, "learning_rate": 1.2873776091992574e-06, "loss": 0.3083, "step": 22380 }, { "epoch": 0.8368810897960137, "grad_norm": 0.8027161955833435, "learning_rate": 1.284496747464996e-06, "loss": 0.314, "step": 22385 }, { "epoch": 0.8370680187863635, "grad_norm": 0.2881372570991516, "learning_rate": 1.281618891458073e-06, "loss": 0.2548, "step": 22390 }, { "epoch": 0.8372549477767133, "grad_norm": 0.6596559286117554, "learning_rate": 1.278744042170984e-06, "loss": 0.2581, "step": 22395 }, { "epoch": 0.8374418767670632, "grad_norm": 0.5899741649627686, "learning_rate": 1.2758722005951773e-06, "loss": 0.3011, "step": 22400 }, { "epoch": 0.8376288057574129, "grad_norm": 0.2703937888145447, "learning_rate": 1.273003367721073e-06, "loss": 0.252, "step": 22405 }, { "epoch": 0.8378157347477627, "grad_norm": 0.36412695050239563, "learning_rate": 1.2701375445380459e-06, "loss": 0.2738, "step": 22410 }, { "epoch": 0.8380026637381125, "grad_norm": 0.35132333636283875, "learning_rate": 1.2672747320344359e-06, "loss": 0.2776, "step": 22415 }, { "epoch": 0.8381895927284623, "grad_norm": 0.4513058662414551, "learning_rate": 1.2644149311975494e-06, "loss": 0.2587, "step": 22420 }, { "epoch": 0.838376521718812, "grad_norm": 0.3663261830806732, "learning_rate": 1.2615581430136449e-06, "loss": 0.2893, "step": 22425 }, { "epoch": 0.8385634507091618, "grad_norm": 0.7394602298736572, "learning_rate": 1.25870436846795e-06, "loss": 0.2527, "step": 22430 }, { "epoch": 0.8387503796995116, "grad_norm": 0.3970726728439331, "learning_rate": 1.255853608544647e-06, "loss": 0.2582, "step": 22435 }, { "epoch": 0.8389373086898615, "grad_norm": 0.6118816137313843, "learning_rate": 1.253005864226885e-06, "loss": 0.3117, "step": 22440 }, { "epoch": 0.8391242376802113, "grad_norm": 0.561377763748169, "learning_rate": 1.2501611364967647e-06, "loss": 0.2964, "step": 22445 }, { "epoch": 0.839311166670561, "grad_norm": 0.41377291083335876, "learning_rate": 1.247319426335356e-06, "loss": 0.2623, "step": 22450 }, { "epoch": 0.8394980956609108, "grad_norm": 0.6088568568229675, "learning_rate": 1.2444807347226795e-06, "loss": 0.2458, "step": 22455 }, { "epoch": 0.8396850246512606, "grad_norm": 0.2927430272102356, "learning_rate": 1.241645062637723e-06, "loss": 0.2587, "step": 22460 }, { "epoch": 0.8398719536416104, "grad_norm": 0.4997739791870117, "learning_rate": 1.2388124110584255e-06, "loss": 0.2811, "step": 22465 }, { "epoch": 0.8400588826319602, "grad_norm": 0.5061297416687012, "learning_rate": 1.2359827809616843e-06, "loss": 0.2929, "step": 22470 }, { "epoch": 0.8402458116223099, "grad_norm": 0.5074501037597656, "learning_rate": 1.2331561733233644e-06, "loss": 0.3264, "step": 22475 }, { "epoch": 0.8404327406126597, "grad_norm": 0.7985924482345581, "learning_rate": 1.230332589118276e-06, "loss": 0.3159, "step": 22480 }, { "epoch": 0.8406196696030096, "grad_norm": 0.39413031935691833, "learning_rate": 1.2275120293201969e-06, "loss": 0.2703, "step": 22485 }, { "epoch": 0.8408065985933594, "grad_norm": 0.5623971819877625, "learning_rate": 1.2246944949018525e-06, "loss": 0.3326, "step": 22490 }, { "epoch": 0.8409935275837092, "grad_norm": 0.3700483739376068, "learning_rate": 1.2218799868349362e-06, "loss": 0.3001, "step": 22495 }, { "epoch": 0.8411804565740589, "grad_norm": 0.42255735397338867, "learning_rate": 1.2190685060900843e-06, "loss": 0.3042, "step": 22500 }, { "epoch": 0.8413673855644087, "grad_norm": 0.38053280115127563, "learning_rate": 1.2162600536369018e-06, "loss": 0.2707, "step": 22505 }, { "epoch": 0.8415543145547585, "grad_norm": 0.42679354548454285, "learning_rate": 1.2134546304439398e-06, "loss": 0.2857, "step": 22510 }, { "epoch": 0.8417412435451083, "grad_norm": 0.20769809186458588, "learning_rate": 1.2106522374787078e-06, "loss": 0.3186, "step": 22515 }, { "epoch": 0.841928172535458, "grad_norm": 0.32969167828559875, "learning_rate": 1.2078528757076746e-06, "loss": 0.3293, "step": 22520 }, { "epoch": 0.8421151015258079, "grad_norm": 0.40005597472190857, "learning_rate": 1.205056546096256e-06, "loss": 0.2552, "step": 22525 }, { "epoch": 0.8423020305161577, "grad_norm": 0.500026524066925, "learning_rate": 1.2022632496088294e-06, "loss": 0.3076, "step": 22530 }, { "epoch": 0.8424889595065075, "grad_norm": 0.34214553236961365, "learning_rate": 1.1994729872087185e-06, "loss": 0.2829, "step": 22535 }, { "epoch": 0.8426758884968573, "grad_norm": 0.35944414138793945, "learning_rate": 1.1966857598582104e-06, "loss": 0.2663, "step": 22540 }, { "epoch": 0.842862817487207, "grad_norm": 0.8880605101585388, "learning_rate": 1.193901568518534e-06, "loss": 0.3047, "step": 22545 }, { "epoch": 0.8430497464775568, "grad_norm": 0.316691517829895, "learning_rate": 1.1911204141498821e-06, "loss": 0.2996, "step": 22550 }, { "epoch": 0.8432366754679066, "grad_norm": 0.49218663573265076, "learning_rate": 1.1883422977113935e-06, "loss": 0.2315, "step": 22555 }, { "epoch": 0.8434236044582564, "grad_norm": 0.27020570635795593, "learning_rate": 1.1855672201611578e-06, "loss": 0.2329, "step": 22560 }, { "epoch": 0.8436105334486063, "grad_norm": 0.321402907371521, "learning_rate": 1.1827951824562245e-06, "loss": 0.3207, "step": 22565 }, { "epoch": 0.843797462438956, "grad_norm": 0.8666585087776184, "learning_rate": 1.1800261855525862e-06, "loss": 0.2414, "step": 22570 }, { "epoch": 0.8439843914293058, "grad_norm": 0.41892504692077637, "learning_rate": 1.177260230405194e-06, "loss": 0.2862, "step": 22575 }, { "epoch": 0.8441713204196556, "grad_norm": 0.4117008149623871, "learning_rate": 1.1744973179679431e-06, "loss": 0.311, "step": 22580 }, { "epoch": 0.8443582494100054, "grad_norm": 0.2825338840484619, "learning_rate": 1.171737449193686e-06, "loss": 0.3535, "step": 22585 }, { "epoch": 0.8445451784003551, "grad_norm": 0.5039531588554382, "learning_rate": 1.1689806250342196e-06, "loss": 0.266, "step": 22590 }, { "epoch": 0.8447321073907049, "grad_norm": 0.8699703216552734, "learning_rate": 1.166226846440297e-06, "loss": 0.2462, "step": 22595 }, { "epoch": 0.8449190363810547, "grad_norm": 0.4708864986896515, "learning_rate": 1.1634761143616159e-06, "loss": 0.2971, "step": 22600 }, { "epoch": 0.8451059653714046, "grad_norm": 0.34658095240592957, "learning_rate": 1.1607284297468214e-06, "loss": 0.2854, "step": 22605 }, { "epoch": 0.8452928943617544, "grad_norm": 0.2925431430339813, "learning_rate": 1.1579837935435168e-06, "loss": 0.2688, "step": 22610 }, { "epoch": 0.8454798233521041, "grad_norm": 0.48946163058280945, "learning_rate": 1.1552422066982437e-06, "loss": 0.2699, "step": 22615 }, { "epoch": 0.8456667523424539, "grad_norm": 0.2582431733608246, "learning_rate": 1.1525036701565018e-06, "loss": 0.2689, "step": 22620 }, { "epoch": 0.8458536813328037, "grad_norm": 0.26818692684173584, "learning_rate": 1.1497681848627284e-06, "loss": 0.2724, "step": 22625 }, { "epoch": 0.8460406103231535, "grad_norm": 0.26108282804489136, "learning_rate": 1.1470357517603192e-06, "loss": 0.2324, "step": 22630 }, { "epoch": 0.8462275393135033, "grad_norm": 0.3785153925418854, "learning_rate": 1.1443063717916081e-06, "loss": 0.2595, "step": 22635 }, { "epoch": 0.846414468303853, "grad_norm": 0.4359034597873688, "learning_rate": 1.141580045897881e-06, "loss": 0.2702, "step": 22640 }, { "epoch": 0.8466013972942029, "grad_norm": 0.41941553354263306, "learning_rate": 1.1388567750193725e-06, "loss": 0.2944, "step": 22645 }, { "epoch": 0.8467883262845527, "grad_norm": 0.2629951536655426, "learning_rate": 1.1361365600952589e-06, "loss": 0.2684, "step": 22650 }, { "epoch": 0.8469752552749025, "grad_norm": 0.32962939143180847, "learning_rate": 1.1334194020636635e-06, "loss": 0.3649, "step": 22655 }, { "epoch": 0.8471621842652522, "grad_norm": 0.6077668070793152, "learning_rate": 1.1307053018616543e-06, "loss": 0.2784, "step": 22660 }, { "epoch": 0.847349113255602, "grad_norm": 0.2332148402929306, "learning_rate": 1.1279942604252514e-06, "loss": 0.2693, "step": 22665 }, { "epoch": 0.8475360422459518, "grad_norm": 0.3998556435108185, "learning_rate": 1.1252862786894103e-06, "loss": 0.241, "step": 22670 }, { "epoch": 0.8477229712363016, "grad_norm": 0.6394714117050171, "learning_rate": 1.1225813575880417e-06, "loss": 0.3319, "step": 22675 }, { "epoch": 0.8479099002266514, "grad_norm": 0.271589457988739, "learning_rate": 1.1198794980539908e-06, "loss": 0.2994, "step": 22680 }, { "epoch": 0.8480968292170012, "grad_norm": 0.35430166125297546, "learning_rate": 1.1171807010190528e-06, "loss": 0.2702, "step": 22685 }, { "epoch": 0.848283758207351, "grad_norm": 0.5060386061668396, "learning_rate": 1.114484967413969e-06, "loss": 0.3112, "step": 22690 }, { "epoch": 0.8484706871977008, "grad_norm": 0.3983868956565857, "learning_rate": 1.1117922981684172e-06, "loss": 0.2696, "step": 22695 }, { "epoch": 0.8486576161880506, "grad_norm": 0.569534957408905, "learning_rate": 1.1091026942110217e-06, "loss": 0.2857, "step": 22700 }, { "epoch": 0.8488445451784004, "grad_norm": 0.9335552453994751, "learning_rate": 1.1064161564693486e-06, "loss": 0.2731, "step": 22705 }, { "epoch": 0.8490314741687501, "grad_norm": 0.36156487464904785, "learning_rate": 1.1037326858699126e-06, "loss": 0.2699, "step": 22710 }, { "epoch": 0.8492184031590999, "grad_norm": 0.31813082098960876, "learning_rate": 1.101052283338161e-06, "loss": 0.199, "step": 22715 }, { "epoch": 0.8494053321494497, "grad_norm": 0.42129233479499817, "learning_rate": 1.0983749497984908e-06, "loss": 0.2301, "step": 22720 }, { "epoch": 0.8495922611397995, "grad_norm": 0.39161425828933716, "learning_rate": 1.0957006861742348e-06, "loss": 0.2858, "step": 22725 }, { "epoch": 0.8497791901301494, "grad_norm": 0.3193294107913971, "learning_rate": 1.093029493387673e-06, "loss": 0.2037, "step": 22730 }, { "epoch": 0.8499661191204991, "grad_norm": 0.3217445909976959, "learning_rate": 1.0903613723600225e-06, "loss": 0.288, "step": 22735 }, { "epoch": 0.8501530481108489, "grad_norm": 0.3758181035518646, "learning_rate": 1.0876963240114413e-06, "loss": 0.1978, "step": 22740 }, { "epoch": 0.8503399771011987, "grad_norm": 0.4037764370441437, "learning_rate": 1.0850343492610282e-06, "loss": 0.2588, "step": 22745 }, { "epoch": 0.8505269060915485, "grad_norm": 0.655259907245636, "learning_rate": 1.0823754490268213e-06, "loss": 0.2726, "step": 22750 }, { "epoch": 0.8507138350818982, "grad_norm": 0.20327606797218323, "learning_rate": 1.0797196242258002e-06, "loss": 0.2396, "step": 22755 }, { "epoch": 0.850900764072248, "grad_norm": 0.3590863049030304, "learning_rate": 1.0770668757738811e-06, "loss": 0.2711, "step": 22760 }, { "epoch": 0.8510876930625978, "grad_norm": 0.5582608580589294, "learning_rate": 1.0744172045859236e-06, "loss": 0.2555, "step": 22765 }, { "epoch": 0.8512746220529477, "grad_norm": 0.9252792596817017, "learning_rate": 1.071770611575721e-06, "loss": 0.3225, "step": 22770 }, { "epoch": 0.8514615510432975, "grad_norm": 0.48875078558921814, "learning_rate": 1.0691270976560075e-06, "loss": 0.2456, "step": 22775 }, { "epoch": 0.8516484800336472, "grad_norm": 0.691635251045227, "learning_rate": 1.0664866637384574e-06, "loss": 0.3181, "step": 22780 }, { "epoch": 0.851835409023997, "grad_norm": 0.7403832077980042, "learning_rate": 1.0638493107336811e-06, "loss": 0.2726, "step": 22785 }, { "epoch": 0.8520223380143468, "grad_norm": 0.37632712721824646, "learning_rate": 1.0612150395512233e-06, "loss": 0.315, "step": 22790 }, { "epoch": 0.8522092670046966, "grad_norm": 0.3026890158653259, "learning_rate": 1.0585838510995684e-06, "loss": 0.2889, "step": 22795 }, { "epoch": 0.8523961959950463, "grad_norm": 0.516616702079773, "learning_rate": 1.0559557462861403e-06, "loss": 0.2493, "step": 22800 }, { "epoch": 0.8525831249853961, "grad_norm": 0.3008388876914978, "learning_rate": 1.0533307260172954e-06, "loss": 0.2922, "step": 22805 }, { "epoch": 0.852770053975746, "grad_norm": 0.4109887182712555, "learning_rate": 1.0507087911983293e-06, "loss": 0.2831, "step": 22810 }, { "epoch": 0.8529569829660958, "grad_norm": 0.3522479832172394, "learning_rate": 1.04808994273347e-06, "loss": 0.3247, "step": 22815 }, { "epoch": 0.8531439119564456, "grad_norm": 0.5156667232513428, "learning_rate": 1.045474181525885e-06, "loss": 0.2889, "step": 22820 }, { "epoch": 0.8533308409467953, "grad_norm": 0.4093747138977051, "learning_rate": 1.0428615084776772e-06, "loss": 0.243, "step": 22825 }, { "epoch": 0.8535177699371451, "grad_norm": 0.3295019567012787, "learning_rate": 1.0402519244898778e-06, "loss": 0.2638, "step": 22830 }, { "epoch": 0.8537046989274949, "grad_norm": 0.7684919238090515, "learning_rate": 1.037645430462464e-06, "loss": 0.2124, "step": 22835 }, { "epoch": 0.8538916279178447, "grad_norm": 0.43423688411712646, "learning_rate": 1.0350420272943362e-06, "loss": 0.2397, "step": 22840 }, { "epoch": 0.8540785569081945, "grad_norm": 0.6224250197410583, "learning_rate": 1.0324417158833343e-06, "loss": 0.2438, "step": 22845 }, { "epoch": 0.8542654858985443, "grad_norm": 0.35096117854118347, "learning_rate": 1.02984449712623e-06, "loss": 0.2792, "step": 22850 }, { "epoch": 0.8544524148888941, "grad_norm": 0.3952169716358185, "learning_rate": 1.0272503719187332e-06, "loss": 0.3134, "step": 22855 }, { "epoch": 0.8546393438792439, "grad_norm": 0.18396589159965515, "learning_rate": 1.0246593411554796e-06, "loss": 0.2938, "step": 22860 }, { "epoch": 0.8548262728695937, "grad_norm": 0.35608750581741333, "learning_rate": 1.022071405730043e-06, "loss": 0.2834, "step": 22865 }, { "epoch": 0.8550132018599435, "grad_norm": 0.45381414890289307, "learning_rate": 1.0194865665349296e-06, "loss": 0.3398, "step": 22870 }, { "epoch": 0.8552001308502932, "grad_norm": 0.5310588479042053, "learning_rate": 1.0169048244615742e-06, "loss": 0.4079, "step": 22875 }, { "epoch": 0.855387059840643, "grad_norm": 0.2279624491930008, "learning_rate": 1.0143261804003479e-06, "loss": 0.2207, "step": 22880 }, { "epoch": 0.8555739888309928, "grad_norm": 0.408582478761673, "learning_rate": 1.0117506352405503e-06, "loss": 0.2455, "step": 22885 }, { "epoch": 0.8557609178213427, "grad_norm": 0.4538606107234955, "learning_rate": 1.0091781898704123e-06, "loss": 0.2467, "step": 22890 }, { "epoch": 0.8559478468116924, "grad_norm": 0.36869016289711, "learning_rate": 1.006608845177095e-06, "loss": 0.298, "step": 22895 }, { "epoch": 0.8561347758020422, "grad_norm": 0.3675679564476013, "learning_rate": 1.0040426020466965e-06, "loss": 0.2797, "step": 22900 }, { "epoch": 0.856321704792392, "grad_norm": 0.5693662762641907, "learning_rate": 1.0014794613642354e-06, "loss": 0.2798, "step": 22905 }, { "epoch": 0.8565086337827418, "grad_norm": 0.35014966130256653, "learning_rate": 9.98919424013669e-07, "loss": 0.2168, "step": 22910 }, { "epoch": 0.8566955627730916, "grad_norm": 0.30344653129577637, "learning_rate": 9.963624908778791e-07, "loss": 0.2438, "step": 22915 }, { "epoch": 0.8568824917634413, "grad_norm": 0.22757522761821747, "learning_rate": 9.938086628386778e-07, "loss": 0.2447, "step": 22920 }, { "epoch": 0.8570694207537911, "grad_norm": 0.5425657629966736, "learning_rate": 9.912579407768118e-07, "loss": 0.2094, "step": 22925 }, { "epoch": 0.857256349744141, "grad_norm": 0.6510607600212097, "learning_rate": 9.887103255719489e-07, "loss": 0.3355, "step": 22930 }, { "epoch": 0.8574432787344908, "grad_norm": 0.4659566283226013, "learning_rate": 9.861658181026878e-07, "loss": 0.2521, "step": 22935 }, { "epoch": 0.8576302077248406, "grad_norm": 0.40870416164398193, "learning_rate": 9.83624419246555e-07, "loss": 0.3023, "step": 22940 }, { "epoch": 0.8578171367151903, "grad_norm": 0.24827082455158234, "learning_rate": 9.810861298800111e-07, "loss": 0.2788, "step": 22945 }, { "epoch": 0.8580040657055401, "grad_norm": 0.6757010221481323, "learning_rate": 9.78550950878433e-07, "loss": 0.3783, "step": 22950 }, { "epoch": 0.8581909946958899, "grad_norm": 0.3025006651878357, "learning_rate": 9.760188831161376e-07, "loss": 0.3317, "step": 22955 }, { "epoch": 0.8583779236862397, "grad_norm": 0.45728322863578796, "learning_rate": 9.734899274663578e-07, "loss": 0.3688, "step": 22960 }, { "epoch": 0.8585648526765894, "grad_norm": 0.5989465713500977, "learning_rate": 9.709640848012602e-07, "loss": 0.2258, "step": 22965 }, { "epoch": 0.8587517816669392, "grad_norm": 0.2060554027557373, "learning_rate": 9.684413559919358e-07, "loss": 0.3065, "step": 22970 }, { "epoch": 0.8589387106572891, "grad_norm": 0.3968755006790161, "learning_rate": 9.65921741908402e-07, "loss": 0.3119, "step": 22975 }, { "epoch": 0.8591256396476389, "grad_norm": 0.4152187407016754, "learning_rate": 9.634052434195983e-07, "loss": 0.2567, "step": 22980 }, { "epoch": 0.8593125686379887, "grad_norm": 0.34883493185043335, "learning_rate": 9.60891861393396e-07, "loss": 0.2796, "step": 22985 }, { "epoch": 0.8594994976283384, "grad_norm": 0.13147194683551788, "learning_rate": 9.583815966965882e-07, "loss": 0.2369, "step": 22990 }, { "epoch": 0.8596864266186882, "grad_norm": 0.49323028326034546, "learning_rate": 9.558744501948903e-07, "loss": 0.3564, "step": 22995 }, { "epoch": 0.859873355609038, "grad_norm": 0.23689912259578705, "learning_rate": 9.533704227529494e-07, "loss": 0.2567, "step": 23000 }, { "epoch": 0.8600602845993878, "grad_norm": 0.8092080950737, "learning_rate": 9.508695152343295e-07, "loss": 0.3302, "step": 23005 }, { "epoch": 0.8602472135897375, "grad_norm": 0.3174912929534912, "learning_rate": 9.483717285015237e-07, "loss": 0.245, "step": 23010 }, { "epoch": 0.8604341425800874, "grad_norm": 0.7012712955474854, "learning_rate": 9.458770634159475e-07, "loss": 0.3679, "step": 23015 }, { "epoch": 0.8606210715704372, "grad_norm": 0.21312133967876434, "learning_rate": 9.433855208379383e-07, "loss": 0.2804, "step": 23020 }, { "epoch": 0.860808000560787, "grad_norm": 0.3727993667125702, "learning_rate": 9.408971016267588e-07, "loss": 0.4381, "step": 23025 }, { "epoch": 0.8609949295511368, "grad_norm": 0.3259866237640381, "learning_rate": 9.384118066405934e-07, "loss": 0.315, "step": 23030 }, { "epoch": 0.8611818585414865, "grad_norm": 0.48218241333961487, "learning_rate": 9.359296367365478e-07, "loss": 0.3288, "step": 23035 }, { "epoch": 0.8613687875318363, "grad_norm": 0.2950020134449005, "learning_rate": 9.334505927706516e-07, "loss": 0.3175, "step": 23040 }, { "epoch": 0.8615557165221861, "grad_norm": 0.36184942722320557, "learning_rate": 9.309746755978566e-07, "loss": 0.2991, "step": 23045 }, { "epoch": 0.8617426455125359, "grad_norm": 0.7623559236526489, "learning_rate": 9.285018860720352e-07, "loss": 0.2409, "step": 23050 }, { "epoch": 0.8619295745028858, "grad_norm": 0.10221162438392639, "learning_rate": 9.260322250459808e-07, "loss": 0.2914, "step": 23055 }, { "epoch": 0.8621165034932355, "grad_norm": 0.4181196093559265, "learning_rate": 9.235656933714121e-07, "loss": 0.3329, "step": 23060 }, { "epoch": 0.8623034324835853, "grad_norm": 0.5724478960037231, "learning_rate": 9.21102291898962e-07, "loss": 0.3349, "step": 23065 }, { "epoch": 0.8624903614739351, "grad_norm": 0.3799975514411926, "learning_rate": 9.186420214781888e-07, "loss": 0.3123, "step": 23070 }, { "epoch": 0.8626772904642849, "grad_norm": 0.48635974526405334, "learning_rate": 9.161848829575693e-07, "loss": 0.2522, "step": 23075 }, { "epoch": 0.8628642194546347, "grad_norm": 0.3440147340297699, "learning_rate": 9.137308771844988e-07, "loss": 0.2328, "step": 23080 }, { "epoch": 0.8630511484449844, "grad_norm": 0.6380581259727478, "learning_rate": 9.112800050052927e-07, "loss": 0.3319, "step": 23085 }, { "epoch": 0.8632380774353342, "grad_norm": 0.37005263566970825, "learning_rate": 9.088322672651906e-07, "loss": 0.3111, "step": 23090 }, { "epoch": 0.8634250064256841, "grad_norm": 0.37442389130592346, "learning_rate": 9.063876648083414e-07, "loss": 0.2227, "step": 23095 }, { "epoch": 0.8636119354160339, "grad_norm": 0.7940928339958191, "learning_rate": 9.039461984778231e-07, "loss": 0.3284, "step": 23100 }, { "epoch": 0.8637988644063836, "grad_norm": 0.4885554015636444, "learning_rate": 9.015078691156265e-07, "loss": 0.2938, "step": 23105 }, { "epoch": 0.8639857933967334, "grad_norm": 0.35439759492874146, "learning_rate": 8.990726775626602e-07, "loss": 0.2331, "step": 23110 }, { "epoch": 0.8641727223870832, "grad_norm": 0.4514514207839966, "learning_rate": 8.966406246587545e-07, "loss": 0.2141, "step": 23115 }, { "epoch": 0.864359651377433, "grad_norm": 0.5355989336967468, "learning_rate": 8.942117112426529e-07, "loss": 0.2842, "step": 23120 }, { "epoch": 0.8645465803677828, "grad_norm": 2.2368109226226807, "learning_rate": 8.917859381520189e-07, "loss": 0.2435, "step": 23125 }, { "epoch": 0.8647335093581325, "grad_norm": 0.5107064843177795, "learning_rate": 8.893633062234285e-07, "loss": 0.2857, "step": 23130 }, { "epoch": 0.8649204383484824, "grad_norm": 0.4561639726161957, "learning_rate": 8.869438162923838e-07, "loss": 0.247, "step": 23135 }, { "epoch": 0.8651073673388322, "grad_norm": 0.40335312485694885, "learning_rate": 8.845274691932926e-07, "loss": 0.3122, "step": 23140 }, { "epoch": 0.865294296329182, "grad_norm": 0.25808554887771606, "learning_rate": 8.821142657594861e-07, "loss": 0.2516, "step": 23145 }, { "epoch": 0.8654812253195318, "grad_norm": 0.4519065320491791, "learning_rate": 8.797042068232098e-07, "loss": 0.2665, "step": 23150 }, { "epoch": 0.8656681543098815, "grad_norm": 0.7045873403549194, "learning_rate": 8.772972932156221e-07, "loss": 0.4179, "step": 23155 }, { "epoch": 0.8658550833002313, "grad_norm": 0.4974367618560791, "learning_rate": 8.748935257668012e-07, "loss": 0.3216, "step": 23160 }, { "epoch": 0.8660420122905811, "grad_norm": 0.4620605707168579, "learning_rate": 8.72492905305734e-07, "loss": 0.2646, "step": 23165 }, { "epoch": 0.8662289412809309, "grad_norm": 0.45460769534111023, "learning_rate": 8.700954326603295e-07, "loss": 0.3191, "step": 23170 }, { "epoch": 0.8664158702712808, "grad_norm": 0.484361857175827, "learning_rate": 8.67701108657405e-07, "loss": 0.2041, "step": 23175 }, { "epoch": 0.8666027992616305, "grad_norm": 0.8345504999160767, "learning_rate": 8.653099341226956e-07, "loss": 0.382, "step": 23180 }, { "epoch": 0.8667897282519803, "grad_norm": 0.5114479660987854, "learning_rate": 8.62921909880845e-07, "loss": 0.2313, "step": 23185 }, { "epoch": 0.8669766572423301, "grad_norm": 0.35048356652259827, "learning_rate": 8.605370367554178e-07, "loss": 0.2199, "step": 23190 }, { "epoch": 0.8671635862326799, "grad_norm": 0.5474756360054016, "learning_rate": 8.581553155688894e-07, "loss": 0.3222, "step": 23195 }, { "epoch": 0.8673505152230296, "grad_norm": 0.4555470645427704, "learning_rate": 8.557767471426448e-07, "loss": 0.2262, "step": 23200 }, { "epoch": 0.8675374442133794, "grad_norm": 0.5740458965301514, "learning_rate": 8.534013322969859e-07, "loss": 0.3478, "step": 23205 }, { "epoch": 0.8677243732037292, "grad_norm": 0.4381164014339447, "learning_rate": 8.510290718511227e-07, "loss": 0.3054, "step": 23210 }, { "epoch": 0.867911302194079, "grad_norm": 0.27854111790657043, "learning_rate": 8.486599666231832e-07, "loss": 0.2728, "step": 23215 }, { "epoch": 0.8680982311844289, "grad_norm": 0.5790026187896729, "learning_rate": 8.462940174302026e-07, "loss": 0.3049, "step": 23220 }, { "epoch": 0.8682851601747786, "grad_norm": 0.18455126881599426, "learning_rate": 8.439312250881282e-07, "loss": 0.266, "step": 23225 }, { "epoch": 0.8684720891651284, "grad_norm": 0.4800267517566681, "learning_rate": 8.415715904118171e-07, "loss": 0.3171, "step": 23230 }, { "epoch": 0.8686590181554782, "grad_norm": 0.36994990706443787, "learning_rate": 8.392151142150428e-07, "loss": 0.3076, "step": 23235 }, { "epoch": 0.868845947145828, "grad_norm": 0.7822495698928833, "learning_rate": 8.368617973104887e-07, "loss": 0.2798, "step": 23240 }, { "epoch": 0.8690328761361777, "grad_norm": 0.38529595732688904, "learning_rate": 8.345116405097408e-07, "loss": 0.321, "step": 23245 }, { "epoch": 0.8692198051265275, "grad_norm": 0.5859056115150452, "learning_rate": 8.321646446233056e-07, "loss": 0.2563, "step": 23250 }, { "epoch": 0.8694067341168773, "grad_norm": 0.5239048004150391, "learning_rate": 8.29820810460591e-07, "loss": 0.2624, "step": 23255 }, { "epoch": 0.8695936631072272, "grad_norm": 0.5495345592498779, "learning_rate": 8.274801388299225e-07, "loss": 0.2581, "step": 23260 }, { "epoch": 0.869780592097577, "grad_norm": 0.4601067900657654, "learning_rate": 8.251426305385268e-07, "loss": 0.2837, "step": 23265 }, { "epoch": 0.8699675210879267, "grad_norm": 0.37067458033561707, "learning_rate": 8.228082863925457e-07, "loss": 0.2406, "step": 23270 }, { "epoch": 0.8701544500782765, "grad_norm": 0.8990246653556824, "learning_rate": 8.204771071970253e-07, "loss": 0.303, "step": 23275 }, { "epoch": 0.8703413790686263, "grad_norm": 0.23729777336120605, "learning_rate": 8.181490937559234e-07, "loss": 0.323, "step": 23280 }, { "epoch": 0.8705283080589761, "grad_norm": 0.6693614721298218, "learning_rate": 8.158242468721078e-07, "loss": 0.277, "step": 23285 }, { "epoch": 0.8707152370493259, "grad_norm": 0.7663869857788086, "learning_rate": 8.135025673473474e-07, "loss": 0.303, "step": 23290 }, { "epoch": 0.8709021660396756, "grad_norm": 0.389027863740921, "learning_rate": 8.11184055982327e-07, "loss": 0.2435, "step": 23295 }, { "epoch": 0.8710890950300255, "grad_norm": 0.38779979944229126, "learning_rate": 8.088687135766316e-07, "loss": 0.3011, "step": 23300 }, { "epoch": 0.8712760240203753, "grad_norm": 0.33862584829330444, "learning_rate": 8.06556540928759e-07, "loss": 0.304, "step": 23305 }, { "epoch": 0.8714629530107251, "grad_norm": 0.31352442502975464, "learning_rate": 8.042475388361104e-07, "loss": 0.2957, "step": 23310 }, { "epoch": 0.8716498820010748, "grad_norm": 0.4300394058227539, "learning_rate": 8.019417080949932e-07, "loss": 0.2494, "step": 23315 }, { "epoch": 0.8718368109914246, "grad_norm": 0.4149588942527771, "learning_rate": 7.996390495006223e-07, "loss": 0.2895, "step": 23320 }, { "epoch": 0.8720237399817744, "grad_norm": 0.3871593177318573, "learning_rate": 7.973395638471182e-07, "loss": 0.2378, "step": 23325 }, { "epoch": 0.8722106689721242, "grad_norm": 0.4816226363182068, "learning_rate": 7.95043251927512e-07, "loss": 0.2898, "step": 23330 }, { "epoch": 0.872397597962474, "grad_norm": 0.6048807501792908, "learning_rate": 7.927501145337302e-07, "loss": 0.2711, "step": 23335 }, { "epoch": 0.8725845269528238, "grad_norm": 0.3330458700656891, "learning_rate": 7.904601524566157e-07, "loss": 0.278, "step": 23340 }, { "epoch": 0.8727714559431736, "grad_norm": 0.5402839183807373, "learning_rate": 7.881733664859048e-07, "loss": 0.2908, "step": 23345 }, { "epoch": 0.8729583849335234, "grad_norm": 0.3132112920284271, "learning_rate": 7.858897574102508e-07, "loss": 0.2525, "step": 23350 }, { "epoch": 0.8731453139238732, "grad_norm": 0.5713052153587341, "learning_rate": 7.836093260171995e-07, "loss": 0.2595, "step": 23355 }, { "epoch": 0.873332242914223, "grad_norm": 0.39907294511795044, "learning_rate": 7.813320730932094e-07, "loss": 0.2347, "step": 23360 }, { "epoch": 0.8735191719045727, "grad_norm": 0.6258445978164673, "learning_rate": 7.790579994236402e-07, "loss": 0.3143, "step": 23365 }, { "epoch": 0.8737061008949225, "grad_norm": 0.5387104153633118, "learning_rate": 7.767871057927512e-07, "loss": 0.2598, "step": 23370 }, { "epoch": 0.8738930298852723, "grad_norm": 0.32804661989212036, "learning_rate": 7.745193929837136e-07, "loss": 0.2496, "step": 23375 }, { "epoch": 0.8740799588756222, "grad_norm": 0.17273744940757751, "learning_rate": 7.722548617785907e-07, "loss": 0.3359, "step": 23380 }, { "epoch": 0.874266887865972, "grad_norm": 0.2972503900527954, "learning_rate": 7.699935129583602e-07, "loss": 0.3159, "step": 23385 }, { "epoch": 0.8744538168563217, "grad_norm": 0.49442118406295776, "learning_rate": 7.677353473028926e-07, "loss": 0.3058, "step": 23390 }, { "epoch": 0.8746407458466715, "grad_norm": 0.5777555108070374, "learning_rate": 7.654803655909671e-07, "loss": 0.2823, "step": 23395 }, { "epoch": 0.8748276748370213, "grad_norm": 0.549485445022583, "learning_rate": 7.632285686002594e-07, "loss": 0.2715, "step": 23400 }, { "epoch": 0.8750146038273711, "grad_norm": 0.44350236654281616, "learning_rate": 7.609799571073529e-07, "loss": 0.2928, "step": 23405 }, { "epoch": 0.8752015328177208, "grad_norm": 0.3184381425380707, "learning_rate": 7.587345318877282e-07, "loss": 0.3997, "step": 23410 }, { "epoch": 0.8753884618080706, "grad_norm": 0.5490055084228516, "learning_rate": 7.564922937157659e-07, "loss": 0.2365, "step": 23415 }, { "epoch": 0.8755753907984205, "grad_norm": 0.2554487884044647, "learning_rate": 7.542532433647532e-07, "loss": 0.2517, "step": 23420 }, { "epoch": 0.8757623197887703, "grad_norm": 0.6824100613594055, "learning_rate": 7.52017381606871e-07, "loss": 0.3281, "step": 23425 }, { "epoch": 0.8759492487791201, "grad_norm": 0.33024176955223083, "learning_rate": 7.497847092132071e-07, "loss": 0.2404, "step": 23430 }, { "epoch": 0.8761361777694698, "grad_norm": 0.5682893395423889, "learning_rate": 7.475552269537434e-07, "loss": 0.3247, "step": 23435 }, { "epoch": 0.8763231067598196, "grad_norm": 0.5878921747207642, "learning_rate": 7.453289355973669e-07, "loss": 0.2347, "step": 23440 }, { "epoch": 0.8765100357501694, "grad_norm": 0.5948970913887024, "learning_rate": 7.431058359118593e-07, "loss": 0.223, "step": 23445 }, { "epoch": 0.8766969647405192, "grad_norm": 0.4269593358039856, "learning_rate": 7.408859286639069e-07, "loss": 0.2496, "step": 23450 }, { "epoch": 0.876883893730869, "grad_norm": 0.35104551911354065, "learning_rate": 7.38669214619091e-07, "loss": 0.2273, "step": 23455 }, { "epoch": 0.8770708227212187, "grad_norm": 0.5351817607879639, "learning_rate": 7.36455694541891e-07, "loss": 0.2464, "step": 23460 }, { "epoch": 0.8772577517115686, "grad_norm": 0.3460119962692261, "learning_rate": 7.342453691956886e-07, "loss": 0.286, "step": 23465 }, { "epoch": 0.8774446807019184, "grad_norm": 0.25786739587783813, "learning_rate": 7.320382393427595e-07, "loss": 0.3703, "step": 23470 }, { "epoch": 0.8776316096922682, "grad_norm": 0.47995084524154663, "learning_rate": 7.298343057442825e-07, "loss": 0.2853, "step": 23475 }, { "epoch": 0.8778185386826179, "grad_norm": 0.31466034054756165, "learning_rate": 7.276335691603276e-07, "loss": 0.2484, "step": 23480 }, { "epoch": 0.8780054676729677, "grad_norm": 0.42213770747184753, "learning_rate": 7.254360303498697e-07, "loss": 0.2988, "step": 23485 }, { "epoch": 0.8781923966633175, "grad_norm": 0.5641787052154541, "learning_rate": 7.232416900707739e-07, "loss": 0.3115, "step": 23490 }, { "epoch": 0.8783793256536673, "grad_norm": 0.3340662717819214, "learning_rate": 7.210505490798081e-07, "loss": 0.2782, "step": 23495 }, { "epoch": 0.878566254644017, "grad_norm": 0.4800506830215454, "learning_rate": 7.188626081326322e-07, "loss": 0.2921, "step": 23500 }, { "epoch": 0.8787531836343669, "grad_norm": 1.0108988285064697, "learning_rate": 7.166778679838026e-07, "loss": 0.2956, "step": 23505 }, { "epoch": 0.8789401126247167, "grad_norm": 0.4938255846500397, "learning_rate": 7.144963293867779e-07, "loss": 0.3058, "step": 23510 }, { "epoch": 0.8791270416150665, "grad_norm": 0.2628172039985657, "learning_rate": 7.123179930939028e-07, "loss": 0.3074, "step": 23515 }, { "epoch": 0.8793139706054163, "grad_norm": 0.3008750081062317, "learning_rate": 7.101428598564286e-07, "loss": 0.2431, "step": 23520 }, { "epoch": 0.879500899595766, "grad_norm": 0.5366243124008179, "learning_rate": 7.07970930424493e-07, "loss": 0.3393, "step": 23525 }, { "epoch": 0.8796878285861158, "grad_norm": 0.3495098054409027, "learning_rate": 7.058022055471337e-07, "loss": 0.1941, "step": 23530 }, { "epoch": 0.8798747575764656, "grad_norm": 0.5412205457687378, "learning_rate": 7.036366859722798e-07, "loss": 0.2646, "step": 23535 }, { "epoch": 0.8800616865668154, "grad_norm": 0.4367334842681885, "learning_rate": 7.014743724467609e-07, "loss": 0.2841, "step": 23540 }, { "epoch": 0.8802486155571653, "grad_norm": 0.2953815460205078, "learning_rate": 6.993152657162916e-07, "loss": 0.2756, "step": 23545 }, { "epoch": 0.880435544547515, "grad_norm": 0.5892349481582642, "learning_rate": 6.971593665254917e-07, "loss": 0.2218, "step": 23550 }, { "epoch": 0.8806224735378648, "grad_norm": 0.4493967890739441, "learning_rate": 6.950066756178653e-07, "loss": 0.2927, "step": 23555 }, { "epoch": 0.8808094025282146, "grad_norm": 0.4241293668746948, "learning_rate": 6.928571937358131e-07, "loss": 0.2596, "step": 23560 }, { "epoch": 0.8809963315185644, "grad_norm": 0.36346501111984253, "learning_rate": 6.907109216206342e-07, "loss": 0.2376, "step": 23565 }, { "epoch": 0.8811832605089142, "grad_norm": 0.5046384334564209, "learning_rate": 6.885678600125101e-07, "loss": 0.3006, "step": 23570 }, { "epoch": 0.8813701894992639, "grad_norm": 0.5586752891540527, "learning_rate": 6.864280096505283e-07, "loss": 0.2569, "step": 23575 }, { "epoch": 0.8815571184896137, "grad_norm": 0.2976597547531128, "learning_rate": 6.842913712726551e-07, "loss": 0.2507, "step": 23580 }, { "epoch": 0.8817440474799636, "grad_norm": 0.5597246289253235, "learning_rate": 6.821579456157612e-07, "loss": 0.3023, "step": 23585 }, { "epoch": 0.8819309764703134, "grad_norm": 0.6922735571861267, "learning_rate": 6.800277334156013e-07, "loss": 0.2563, "step": 23590 }, { "epoch": 0.8821179054606632, "grad_norm": 0.32556474208831787, "learning_rate": 6.779007354068257e-07, "loss": 0.2421, "step": 23595 }, { "epoch": 0.8823048344510129, "grad_norm": 0.2897225618362427, "learning_rate": 6.757769523229751e-07, "loss": 0.255, "step": 23600 }, { "epoch": 0.8824917634413627, "grad_norm": 0.45104631781578064, "learning_rate": 6.736563848964784e-07, "loss": 0.2586, "step": 23605 }, { "epoch": 0.8826786924317125, "grad_norm": 0.5284656286239624, "learning_rate": 6.715390338586636e-07, "loss": 0.2726, "step": 23610 }, { "epoch": 0.8828656214220623, "grad_norm": 0.42080292105674744, "learning_rate": 6.694248999397402e-07, "loss": 0.1887, "step": 23615 }, { "epoch": 0.883052550412412, "grad_norm": 0.7159053087234497, "learning_rate": 6.673139838688148e-07, "loss": 0.3204, "step": 23620 }, { "epoch": 0.8832394794027619, "grad_norm": 0.635303795337677, "learning_rate": 6.652062863738795e-07, "loss": 0.3029, "step": 23625 }, { "epoch": 0.8834264083931117, "grad_norm": 0.4003882110118866, "learning_rate": 6.63101808181823e-07, "loss": 0.3126, "step": 23630 }, { "epoch": 0.8836133373834615, "grad_norm": 0.35992348194122314, "learning_rate": 6.610005500184147e-07, "loss": 0.2064, "step": 23635 }, { "epoch": 0.8838002663738113, "grad_norm": 0.29172858595848083, "learning_rate": 6.589025126083216e-07, "loss": 0.3099, "step": 23640 }, { "epoch": 0.883987195364161, "grad_norm": 0.43935173749923706, "learning_rate": 6.568076966750958e-07, "loss": 0.3544, "step": 23645 }, { "epoch": 0.8841741243545108, "grad_norm": 0.6253902316093445, "learning_rate": 6.547161029411775e-07, "loss": 0.3469, "step": 23650 }, { "epoch": 0.8843610533448606, "grad_norm": 0.570335865020752, "learning_rate": 6.526277321279006e-07, "loss": 0.2235, "step": 23655 }, { "epoch": 0.8845479823352104, "grad_norm": 1.0959036350250244, "learning_rate": 6.505425849554825e-07, "loss": 0.2871, "step": 23660 }, { "epoch": 0.8847349113255603, "grad_norm": 0.6984147429466248, "learning_rate": 6.484606621430312e-07, "loss": 0.2487, "step": 23665 }, { "epoch": 0.88492184031591, "grad_norm": 0.6359334588050842, "learning_rate": 6.463819644085412e-07, "loss": 0.284, "step": 23670 }, { "epoch": 0.8851087693062598, "grad_norm": 0.4095711410045624, "learning_rate": 6.443064924688969e-07, "loss": 0.2513, "step": 23675 }, { "epoch": 0.8852956982966096, "grad_norm": 0.28603771328926086, "learning_rate": 6.422342470398679e-07, "loss": 0.2653, "step": 23680 }, { "epoch": 0.8854826272869594, "grad_norm": 0.36115431785583496, "learning_rate": 6.40165228836116e-07, "loss": 0.2031, "step": 23685 }, { "epoch": 0.8856695562773091, "grad_norm": 0.3040491044521332, "learning_rate": 6.380994385711803e-07, "loss": 0.3044, "step": 23690 }, { "epoch": 0.8858564852676589, "grad_norm": 0.4708629548549652, "learning_rate": 6.360368769574977e-07, "loss": 0.3036, "step": 23695 }, { "epoch": 0.8860434142580087, "grad_norm": 0.19724823534488678, "learning_rate": 6.339775447063856e-07, "loss": 0.2344, "step": 23700 }, { "epoch": 0.8862303432483585, "grad_norm": 0.6166355609893799, "learning_rate": 6.319214425280451e-07, "loss": 0.2477, "step": 23705 }, { "epoch": 0.8864172722387084, "grad_norm": 0.4069211184978485, "learning_rate": 6.298685711315722e-07, "loss": 0.2525, "step": 23710 }, { "epoch": 0.8866042012290581, "grad_norm": 0.6085258722305298, "learning_rate": 6.278189312249395e-07, "loss": 0.3372, "step": 23715 }, { "epoch": 0.8867911302194079, "grad_norm": 0.3506983518600464, "learning_rate": 6.257725235150113e-07, "loss": 0.2526, "step": 23720 }, { "epoch": 0.8869780592097577, "grad_norm": 0.6167741417884827, "learning_rate": 6.237293487075324e-07, "loss": 0.2678, "step": 23725 }, { "epoch": 0.8871649882001075, "grad_norm": 0.3610391318798065, "learning_rate": 6.216894075071378e-07, "loss": 0.1962, "step": 23730 }, { "epoch": 0.8873519171904573, "grad_norm": 0.5724371075630188, "learning_rate": 6.196527006173447e-07, "loss": 0.2723, "step": 23735 }, { "epoch": 0.887538846180807, "grad_norm": 0.4358518719673157, "learning_rate": 6.176192287405547e-07, "loss": 0.2568, "step": 23740 }, { "epoch": 0.8877257751711568, "grad_norm": 0.34022319316864014, "learning_rate": 6.155889925780534e-07, "loss": 0.25, "step": 23745 }, { "epoch": 0.8879127041615067, "grad_norm": 0.5654661059379578, "learning_rate": 6.135619928300096e-07, "loss": 0.3062, "step": 23750 }, { "epoch": 0.8880996331518565, "grad_norm": 0.27159449458122253, "learning_rate": 6.115382301954809e-07, "loss": 0.2635, "step": 23755 }, { "epoch": 0.8882865621422062, "grad_norm": 0.33312445878982544, "learning_rate": 6.095177053724011e-07, "loss": 0.2983, "step": 23760 }, { "epoch": 0.888473491132556, "grad_norm": 0.7186973094940186, "learning_rate": 6.07500419057595e-07, "loss": 0.2881, "step": 23765 }, { "epoch": 0.8886604201229058, "grad_norm": 0.4082525074481964, "learning_rate": 6.054863719467641e-07, "loss": 0.2782, "step": 23770 }, { "epoch": 0.8888473491132556, "grad_norm": 0.22623296082019806, "learning_rate": 6.034755647344958e-07, "loss": 0.198, "step": 23775 }, { "epoch": 0.8890342781036054, "grad_norm": 0.26331770420074463, "learning_rate": 6.014679981142635e-07, "loss": 0.2145, "step": 23780 }, { "epoch": 0.8892212070939551, "grad_norm": 0.613768458366394, "learning_rate": 5.994636727784153e-07, "loss": 0.282, "step": 23785 }, { "epoch": 0.889408136084305, "grad_norm": 0.27621743083000183, "learning_rate": 5.974625894181874e-07, "loss": 0.2704, "step": 23790 }, { "epoch": 0.8895950650746548, "grad_norm": 1.3323360681533813, "learning_rate": 5.954647487236942e-07, "loss": 0.2972, "step": 23795 }, { "epoch": 0.8897819940650046, "grad_norm": 0.46217793226242065, "learning_rate": 5.934701513839369e-07, "loss": 0.3143, "step": 23800 }, { "epoch": 0.8899689230553544, "grad_norm": 0.6428271532058716, "learning_rate": 5.91478798086792e-07, "loss": 0.2476, "step": 23805 }, { "epoch": 0.8901558520457041, "grad_norm": 0.5378536581993103, "learning_rate": 5.894906895190222e-07, "loss": 0.2954, "step": 23810 }, { "epoch": 0.8903427810360539, "grad_norm": 0.34355923533439636, "learning_rate": 5.875058263662669e-07, "loss": 0.2932, "step": 23815 }, { "epoch": 0.8905297100264037, "grad_norm": 0.42726051807403564, "learning_rate": 5.855242093130498e-07, "loss": 0.2709, "step": 23820 }, { "epoch": 0.8907166390167535, "grad_norm": 0.4561409652233124, "learning_rate": 5.835458390427762e-07, "loss": 0.2824, "step": 23825 }, { "epoch": 0.8909035680071034, "grad_norm": 0.523114800453186, "learning_rate": 5.815707162377271e-07, "loss": 0.2704, "step": 23830 }, { "epoch": 0.8910904969974531, "grad_norm": 3.815091848373413, "learning_rate": 5.795988415790655e-07, "loss": 0.3295, "step": 23835 }, { "epoch": 0.8912774259878029, "grad_norm": 0.3353985548019409, "learning_rate": 5.776302157468338e-07, "loss": 0.3024, "step": 23840 }, { "epoch": 0.8914643549781527, "grad_norm": 0.4886054992675781, "learning_rate": 5.756648394199571e-07, "loss": 0.3643, "step": 23845 }, { "epoch": 0.8916512839685025, "grad_norm": 0.28375887870788574, "learning_rate": 5.737027132762341e-07, "loss": 0.2608, "step": 23850 }, { "epoch": 0.8918382129588522, "grad_norm": 0.39414143562316895, "learning_rate": 5.71743837992349e-07, "loss": 0.3279, "step": 23855 }, { "epoch": 0.892025141949202, "grad_norm": 0.29853355884552, "learning_rate": 5.697882142438594e-07, "loss": 0.2457, "step": 23860 }, { "epoch": 0.8922120709395518, "grad_norm": 0.49426108598709106, "learning_rate": 5.678358427052045e-07, "loss": 0.2018, "step": 23865 }, { "epoch": 0.8923989999299017, "grad_norm": 0.48164868354797363, "learning_rate": 5.658867240497034e-07, "loss": 0.2593, "step": 23870 }, { "epoch": 0.8925859289202515, "grad_norm": 0.3493533432483673, "learning_rate": 5.639408589495476e-07, "loss": 0.3238, "step": 23875 }, { "epoch": 0.8927728579106012, "grad_norm": 0.40912967920303345, "learning_rate": 5.619982480758146e-07, "loss": 0.2422, "step": 23880 }, { "epoch": 0.892959786900951, "grad_norm": 0.50095134973526, "learning_rate": 5.600588920984529e-07, "loss": 0.2204, "step": 23885 }, { "epoch": 0.8931467158913008, "grad_norm": 0.3179953992366791, "learning_rate": 5.581227916862907e-07, "loss": 0.2518, "step": 23890 }, { "epoch": 0.8933336448816506, "grad_norm": 0.39287543296813965, "learning_rate": 5.56189947507032e-07, "loss": 0.291, "step": 23895 }, { "epoch": 0.8935205738720003, "grad_norm": 0.44171902537345886, "learning_rate": 5.542603602272622e-07, "loss": 0.2908, "step": 23900 }, { "epoch": 0.8937075028623501, "grad_norm": 0.8764915466308594, "learning_rate": 5.523340305124381e-07, "loss": 0.3125, "step": 23905 }, { "epoch": 0.8938944318527, "grad_norm": 0.3236830234527588, "learning_rate": 5.50410959026898e-07, "loss": 0.2637, "step": 23910 }, { "epoch": 0.8940813608430498, "grad_norm": 0.5709930062294006, "learning_rate": 5.484911464338539e-07, "loss": 0.2993, "step": 23915 }, { "epoch": 0.8942682898333996, "grad_norm": 0.22036674618721008, "learning_rate": 5.465745933953914e-07, "loss": 0.3086, "step": 23920 }, { "epoch": 0.8944552188237493, "grad_norm": 0.36622971296310425, "learning_rate": 5.446613005724788e-07, "loss": 0.287, "step": 23925 }, { "epoch": 0.8946421478140991, "grad_norm": 0.37628886103630066, "learning_rate": 5.427512686249537e-07, "loss": 0.2985, "step": 23930 }, { "epoch": 0.8948290768044489, "grad_norm": 0.3855689764022827, "learning_rate": 5.408444982115313e-07, "loss": 0.2803, "step": 23935 }, { "epoch": 0.8950160057947987, "grad_norm": 0.0829666256904602, "learning_rate": 5.389409899898013e-07, "loss": 0.2945, "step": 23940 }, { "epoch": 0.8952029347851485, "grad_norm": 0.5317602753639221, "learning_rate": 5.370407446162318e-07, "loss": 0.2388, "step": 23945 }, { "epoch": 0.8953898637754982, "grad_norm": 0.635329008102417, "learning_rate": 5.351437627461598e-07, "loss": 0.351, "step": 23950 }, { "epoch": 0.8955767927658481, "grad_norm": 0.48536187410354614, "learning_rate": 5.332500450338018e-07, "loss": 0.3298, "step": 23955 }, { "epoch": 0.8957637217561979, "grad_norm": 0.3373602330684662, "learning_rate": 5.313595921322479e-07, "loss": 0.2615, "step": 23960 }, { "epoch": 0.8959506507465477, "grad_norm": 0.49494582414627075, "learning_rate": 5.294724046934585e-07, "loss": 0.2406, "step": 23965 }, { "epoch": 0.8961375797368974, "grad_norm": 0.36144334077835083, "learning_rate": 5.275884833682721e-07, "loss": 0.2532, "step": 23970 }, { "epoch": 0.8963245087272472, "grad_norm": 0.6861150860786438, "learning_rate": 5.257078288064e-07, "loss": 0.2478, "step": 23975 }, { "epoch": 0.896511437717597, "grad_norm": 0.22255679965019226, "learning_rate": 5.238304416564243e-07, "loss": 0.3317, "step": 23980 }, { "epoch": 0.8966983667079468, "grad_norm": 0.2977222800254822, "learning_rate": 5.21956322565802e-07, "loss": 0.2917, "step": 23985 }, { "epoch": 0.8968852956982966, "grad_norm": 0.7013628482818604, "learning_rate": 5.200854721808645e-07, "loss": 0.3236, "step": 23990 }, { "epoch": 0.8970722246886464, "grad_norm": 0.46044737100601196, "learning_rate": 5.182178911468128e-07, "loss": 0.2712, "step": 23995 }, { "epoch": 0.8972591536789962, "grad_norm": 0.21883058547973633, "learning_rate": 5.163535801077235e-07, "loss": 0.2518, "step": 24000 }, { "epoch": 0.897446082669346, "grad_norm": 0.3243284523487091, "learning_rate": 5.144925397065437e-07, "loss": 0.3362, "step": 24005 }, { "epoch": 0.8976330116596958, "grad_norm": 0.5240607261657715, "learning_rate": 5.12634770585092e-07, "loss": 0.2729, "step": 24010 }, { "epoch": 0.8978199406500456, "grad_norm": 0.318803608417511, "learning_rate": 5.107802733840616e-07, "loss": 0.2773, "step": 24015 }, { "epoch": 0.8980068696403953, "grad_norm": 0.41666528582572937, "learning_rate": 5.089290487430154e-07, "loss": 0.233, "step": 24020 }, { "epoch": 0.8981937986307451, "grad_norm": 0.5397652387619019, "learning_rate": 5.070810973003859e-07, "loss": 0.2415, "step": 24025 }, { "epoch": 0.8983807276210949, "grad_norm": 0.4510786533355713, "learning_rate": 5.052364196934779e-07, "loss": 0.2595, "step": 24030 }, { "epoch": 0.8985676566114448, "grad_norm": 0.4586493670940399, "learning_rate": 5.033950165584711e-07, "loss": 0.4253, "step": 24035 }, { "epoch": 0.8987545856017946, "grad_norm": 0.6832088828086853, "learning_rate": 5.01556888530409e-07, "loss": 0.3542, "step": 24040 }, { "epoch": 0.8989415145921443, "grad_norm": 0.2557532489299774, "learning_rate": 4.99722036243212e-07, "loss": 0.2235, "step": 24045 }, { "epoch": 0.8991284435824941, "grad_norm": 0.2579525113105774, "learning_rate": 4.978904603296686e-07, "loss": 0.2518, "step": 24050 }, { "epoch": 0.8993153725728439, "grad_norm": 0.7935723066329956, "learning_rate": 4.960621614214334e-07, "loss": 0.2589, "step": 24055 }, { "epoch": 0.8995023015631937, "grad_norm": 0.6913199424743652, "learning_rate": 4.942371401490386e-07, "loss": 0.2652, "step": 24060 }, { "epoch": 0.8996892305535434, "grad_norm": 0.604274570941925, "learning_rate": 4.924153971418777e-07, "loss": 0.2887, "step": 24065 }, { "epoch": 0.8998761595438932, "grad_norm": 0.35394561290740967, "learning_rate": 4.905969330282212e-07, "loss": 0.2466, "step": 24070 }, { "epoch": 0.9000630885342431, "grad_norm": 0.39302387833595276, "learning_rate": 4.887817484352031e-07, "loss": 0.3516, "step": 24075 }, { "epoch": 0.9002500175245929, "grad_norm": 0.5640286207199097, "learning_rate": 4.869698439888304e-07, "loss": 0.3531, "step": 24080 }, { "epoch": 0.9004369465149427, "grad_norm": 0.4714716076850891, "learning_rate": 4.851612203139733e-07, "loss": 0.2918, "step": 24085 }, { "epoch": 0.9006238755052924, "grad_norm": 0.41106295585632324, "learning_rate": 4.833558780343772e-07, "loss": 0.3051, "step": 24090 }, { "epoch": 0.9008108044956422, "grad_norm": 0.48679038882255554, "learning_rate": 4.815538177726531e-07, "loss": 0.2615, "step": 24095 }, { "epoch": 0.900997733485992, "grad_norm": 0.5820590257644653, "learning_rate": 4.797550401502782e-07, "loss": 0.2669, "step": 24100 }, { "epoch": 0.9011846624763418, "grad_norm": 0.5170806050300598, "learning_rate": 4.779595457876019e-07, "loss": 0.2829, "step": 24105 }, { "epoch": 0.9013715914666915, "grad_norm": 0.4748556613922119, "learning_rate": 4.761673353038354e-07, "loss": 0.2686, "step": 24110 }, { "epoch": 0.9015585204570414, "grad_norm": 0.3551413118839264, "learning_rate": 4.743784093170645e-07, "loss": 0.3079, "step": 24115 }, { "epoch": 0.9017454494473912, "grad_norm": 0.47169989347457886, "learning_rate": 4.725927684442366e-07, "loss": 0.2218, "step": 24120 }, { "epoch": 0.901932378437741, "grad_norm": 0.39478862285614014, "learning_rate": 4.7081041330116816e-07, "loss": 0.2159, "step": 24125 }, { "epoch": 0.9021193074280908, "grad_norm": 0.33222270011901855, "learning_rate": 4.6903134450254186e-07, "loss": 0.3028, "step": 24130 }, { "epoch": 0.9023062364184405, "grad_norm": 0.39596304297447205, "learning_rate": 4.6725556266190687e-07, "loss": 0.1949, "step": 24135 }, { "epoch": 0.9024931654087903, "grad_norm": 0.31857696175575256, "learning_rate": 4.6548306839168224e-07, "loss": 0.2729, "step": 24140 }, { "epoch": 0.9026800943991401, "grad_norm": 0.4975537359714508, "learning_rate": 4.6371386230314785e-07, "loss": 0.3893, "step": 24145 }, { "epoch": 0.9028670233894899, "grad_norm": 0.4041035771369934, "learning_rate": 4.619479450064535e-07, "loss": 0.3135, "step": 24150 }, { "epoch": 0.9030539523798398, "grad_norm": 0.5101954936981201, "learning_rate": 4.6018531711061297e-07, "loss": 0.2647, "step": 24155 }, { "epoch": 0.9032408813701895, "grad_norm": 0.4400761127471924, "learning_rate": 4.5842597922350683e-07, "loss": 0.2631, "step": 24160 }, { "epoch": 0.9034278103605393, "grad_norm": 0.4869976341724396, "learning_rate": 4.566699319518808e-07, "loss": 0.2841, "step": 24165 }, { "epoch": 0.9036147393508891, "grad_norm": 0.6493037343025208, "learning_rate": 4.5491717590134377e-07, "loss": 0.3233, "step": 24170 }, { "epoch": 0.9038016683412389, "grad_norm": 0.2384498566389084, "learning_rate": 4.5316771167637e-07, "loss": 0.2755, "step": 24175 }, { "epoch": 0.9039885973315887, "grad_norm": 0.5434466004371643, "learning_rate": 4.5142153988030236e-07, "loss": 0.2591, "step": 24180 }, { "epoch": 0.9041755263219384, "grad_norm": 0.2663435935974121, "learning_rate": 4.4967866111534254e-07, "loss": 0.2451, "step": 24185 }, { "epoch": 0.9043624553122882, "grad_norm": 0.48140108585357666, "learning_rate": 4.4793907598256193e-07, "loss": 0.3392, "step": 24190 }, { "epoch": 0.904549384302638, "grad_norm": 0.29158666729927063, "learning_rate": 4.4620278508189395e-07, "loss": 0.2499, "step": 24195 }, { "epoch": 0.9047363132929879, "grad_norm": 0.8527382612228394, "learning_rate": 4.44469789012133e-07, "loss": 0.2645, "step": 24200 }, { "epoch": 0.9049232422833376, "grad_norm": 0.469309538602829, "learning_rate": 4.4274008837094316e-07, "loss": 0.3296, "step": 24205 }, { "epoch": 0.9051101712736874, "grad_norm": 0.7557592988014221, "learning_rate": 4.410136837548462e-07, "loss": 0.2996, "step": 24210 }, { "epoch": 0.9052971002640372, "grad_norm": 0.36758342385292053, "learning_rate": 4.392905757592303e-07, "loss": 0.2624, "step": 24215 }, { "epoch": 0.905484029254387, "grad_norm": 0.6781210899353027, "learning_rate": 4.3757076497834337e-07, "loss": 0.3425, "step": 24220 }, { "epoch": 0.9056709582447368, "grad_norm": 0.4783806800842285, "learning_rate": 4.358542520053044e-07, "loss": 0.3332, "step": 24225 }, { "epoch": 0.9058578872350865, "grad_norm": 0.33384254574775696, "learning_rate": 4.3414103743208426e-07, "loss": 0.2122, "step": 24230 }, { "epoch": 0.9060448162254363, "grad_norm": 0.31750744581222534, "learning_rate": 4.3243112184952365e-07, "loss": 0.31, "step": 24235 }, { "epoch": 0.9062317452157862, "grad_norm": 0.32413434982299805, "learning_rate": 4.307245058473253e-07, "loss": 0.2356, "step": 24240 }, { "epoch": 0.906418674206136, "grad_norm": 0.5775876641273499, "learning_rate": 4.290211900140495e-07, "loss": 0.3082, "step": 24245 }, { "epoch": 0.9066056031964858, "grad_norm": 0.3418591618537903, "learning_rate": 4.27321174937122e-07, "loss": 0.3737, "step": 24250 }, { "epoch": 0.9067925321868355, "grad_norm": 0.48873278498649597, "learning_rate": 4.256244612028293e-07, "loss": 0.3144, "step": 24255 }, { "epoch": 0.9069794611771853, "grad_norm": 0.43501177430152893, "learning_rate": 4.2393104939632e-07, "loss": 0.3015, "step": 24260 }, { "epoch": 0.9071663901675351, "grad_norm": 0.46144506335258484, "learning_rate": 4.222409401016025e-07, "loss": 0.3018, "step": 24265 }, { "epoch": 0.9073533191578849, "grad_norm": 0.4738104045391083, "learning_rate": 4.205541339015484e-07, "loss": 0.2621, "step": 24270 }, { "epoch": 0.9075402481482346, "grad_norm": 0.5475369691848755, "learning_rate": 4.1887063137788565e-07, "loss": 0.3754, "step": 24275 }, { "epoch": 0.9077271771385845, "grad_norm": 0.26058679819107056, "learning_rate": 4.1719043311120757e-07, "loss": 0.2475, "step": 24280 }, { "epoch": 0.9079141061289343, "grad_norm": 0.6181069016456604, "learning_rate": 4.155135396809684e-07, "loss": 0.3012, "step": 24285 }, { "epoch": 0.9081010351192841, "grad_norm": 0.4764721095561981, "learning_rate": 4.13839951665479e-07, "loss": 0.28, "step": 24290 }, { "epoch": 0.9082879641096339, "grad_norm": 0.20308902859687805, "learning_rate": 4.1216966964191194e-07, "loss": 0.341, "step": 24295 }, { "epoch": 0.9084748930999836, "grad_norm": 0.3182278573513031, "learning_rate": 4.1050269418629887e-07, "loss": 0.3403, "step": 24300 }, { "epoch": 0.9086618220903334, "grad_norm": 0.4016958475112915, "learning_rate": 4.088390258735342e-07, "loss": 0.2859, "step": 24305 }, { "epoch": 0.9088487510806832, "grad_norm": 0.5920534729957581, "learning_rate": 4.071786652773679e-07, "loss": 0.3373, "step": 24310 }, { "epoch": 0.909035680071033, "grad_norm": 0.5127608180046082, "learning_rate": 4.0552161297041094e-07, "loss": 0.3156, "step": 24315 }, { "epoch": 0.9092226090613829, "grad_norm": 0.40801453590393066, "learning_rate": 4.038678695241316e-07, "loss": 0.3016, "step": 24320 }, { "epoch": 0.9094095380517326, "grad_norm": 0.5536433458328247, "learning_rate": 4.022174355088593e-07, "loss": 0.3749, "step": 24325 }, { "epoch": 0.9095964670420824, "grad_norm": 0.3775192201137543, "learning_rate": 4.005703114937842e-07, "loss": 0.3362, "step": 24330 }, { "epoch": 0.9097833960324322, "grad_norm": 0.3780711591243744, "learning_rate": 3.989264980469498e-07, "loss": 0.2515, "step": 24335 }, { "epoch": 0.909970325022782, "grad_norm": 0.27277469635009766, "learning_rate": 3.972859957352604e-07, "loss": 0.2751, "step": 24340 }, { "epoch": 0.9101572540131317, "grad_norm": 0.28428831696510315, "learning_rate": 3.956488051244789e-07, "loss": 0.3447, "step": 24345 }, { "epoch": 0.9103441830034815, "grad_norm": 0.3745942711830139, "learning_rate": 3.9401492677922483e-07, "loss": 0.2715, "step": 24350 }, { "epoch": 0.9105311119938313, "grad_norm": 0.33236637711524963, "learning_rate": 3.9238436126297743e-07, "loss": 0.3158, "step": 24355 }, { "epoch": 0.9107180409841812, "grad_norm": 0.4709293246269226, "learning_rate": 3.9075710913807016e-07, "loss": 0.2759, "step": 24360 }, { "epoch": 0.910904969974531, "grad_norm": 0.43469110131263733, "learning_rate": 3.8913317096569427e-07, "loss": 0.2859, "step": 24365 }, { "epoch": 0.9110918989648807, "grad_norm": 0.3980669379234314, "learning_rate": 3.875125473059027e-07, "loss": 0.3309, "step": 24370 }, { "epoch": 0.9112788279552305, "grad_norm": 0.5004674196243286, "learning_rate": 3.8589523871760183e-07, "loss": 0.1799, "step": 24375 }, { "epoch": 0.9114657569455803, "grad_norm": 0.28025054931640625, "learning_rate": 3.8428124575855317e-07, "loss": 0.2318, "step": 24380 }, { "epoch": 0.9116526859359301, "grad_norm": 0.6791032552719116, "learning_rate": 3.826705689853782e-07, "loss": 0.2544, "step": 24385 }, { "epoch": 0.9118396149262799, "grad_norm": 0.44797998666763306, "learning_rate": 3.810632089535526e-07, "loss": 0.2337, "step": 24390 }, { "epoch": 0.9120265439166296, "grad_norm": 0.6669244766235352, "learning_rate": 3.794591662174096e-07, "loss": 0.1961, "step": 24395 }, { "epoch": 0.9122134729069794, "grad_norm": 0.9826619625091553, "learning_rate": 3.778584413301356e-07, "loss": 0.24, "step": 24400 }, { "epoch": 0.9124004018973293, "grad_norm": 0.5112035870552063, "learning_rate": 3.7626103484377674e-07, "loss": 0.2262, "step": 24405 }, { "epoch": 0.9125873308876791, "grad_norm": 0.3886997699737549, "learning_rate": 3.7466694730923124e-07, "loss": 0.2582, "step": 24410 }, { "epoch": 0.9127742598780288, "grad_norm": 0.26052147150039673, "learning_rate": 3.7307617927625494e-07, "loss": 0.3355, "step": 24415 }, { "epoch": 0.9129611888683786, "grad_norm": 0.18169300258159637, "learning_rate": 3.7148873129345896e-07, "loss": 0.2485, "step": 24420 }, { "epoch": 0.9131481178587284, "grad_norm": 0.6013249158859253, "learning_rate": 3.6990460390830653e-07, "loss": 0.2622, "step": 24425 }, { "epoch": 0.9133350468490782, "grad_norm": 0.12792934477329254, "learning_rate": 3.6832379766712057e-07, "loss": 0.2445, "step": 24430 }, { "epoch": 0.913521975839428, "grad_norm": 0.6494380235671997, "learning_rate": 3.667463131150728e-07, "loss": 0.2618, "step": 24435 }, { "epoch": 0.9137089048297777, "grad_norm": 1.2967865467071533, "learning_rate": 3.6517215079619583e-07, "loss": 0.3326, "step": 24440 }, { "epoch": 0.9138958338201276, "grad_norm": 0.43164533376693726, "learning_rate": 3.6360131125336983e-07, "loss": 0.2116, "step": 24445 }, { "epoch": 0.9140827628104774, "grad_norm": 0.2929776608943939, "learning_rate": 3.620337950283348e-07, "loss": 0.3048, "step": 24450 }, { "epoch": 0.9142696918008272, "grad_norm": 0.4373328685760498, "learning_rate": 3.6046960266168163e-07, "loss": 0.2887, "step": 24455 }, { "epoch": 0.914456620791177, "grad_norm": 0.29323285818099976, "learning_rate": 3.5890873469285325e-07, "loss": 0.3293, "step": 24460 }, { "epoch": 0.9146435497815267, "grad_norm": 0.3576362729072571, "learning_rate": 3.573511916601513e-07, "loss": 0.2113, "step": 24465 }, { "epoch": 0.9148304787718765, "grad_norm": 0.5483099222183228, "learning_rate": 3.55796974100725e-07, "loss": 0.2588, "step": 24470 }, { "epoch": 0.9150174077622263, "grad_norm": 0.30468040704727173, "learning_rate": 3.5424608255058334e-07, "loss": 0.2478, "step": 24475 }, { "epoch": 0.9152043367525761, "grad_norm": 0.3078984320163727, "learning_rate": 3.526985175445796e-07, "loss": 0.2239, "step": 24480 }, { "epoch": 0.915391265742926, "grad_norm": 0.2722003757953644, "learning_rate": 3.511542796164291e-07, "loss": 0.2702, "step": 24485 }, { "epoch": 0.9155781947332757, "grad_norm": 0.317244291305542, "learning_rate": 3.496133692986914e-07, "loss": 0.2063, "step": 24490 }, { "epoch": 0.9157651237236255, "grad_norm": 0.23974300920963287, "learning_rate": 3.480757871227858e-07, "loss": 0.2249, "step": 24495 }, { "epoch": 0.9159520527139753, "grad_norm": 0.8161414265632629, "learning_rate": 3.4654153361897815e-07, "loss": 0.3797, "step": 24500 }, { "epoch": 0.9161389817043251, "grad_norm": 0.28393059968948364, "learning_rate": 3.4501060931638743e-07, "loss": 0.2648, "step": 24505 }, { "epoch": 0.9163259106946748, "grad_norm": 0.44209572672843933, "learning_rate": 3.4348301474298906e-07, "loss": 0.3289, "step": 24510 }, { "epoch": 0.9165128396850246, "grad_norm": 0.35628998279571533, "learning_rate": 3.4195875042560276e-07, "loss": 0.2093, "step": 24515 }, { "epoch": 0.9166997686753744, "grad_norm": 0.3359036445617676, "learning_rate": 3.4043781688990696e-07, "loss": 0.2941, "step": 24520 }, { "epoch": 0.9168866976657243, "grad_norm": 0.6143574118614197, "learning_rate": 3.3892021466042646e-07, "loss": 0.303, "step": 24525 }, { "epoch": 0.9170736266560741, "grad_norm": 0.24244384467601776, "learning_rate": 3.374059442605393e-07, "loss": 0.2823, "step": 24530 }, { "epoch": 0.9172605556464238, "grad_norm": 0.8676662445068359, "learning_rate": 3.3589500621247215e-07, "loss": 0.3931, "step": 24535 }, { "epoch": 0.9174474846367736, "grad_norm": 0.5118948817253113, "learning_rate": 3.3438740103730716e-07, "loss": 0.2997, "step": 24540 }, { "epoch": 0.9176344136271234, "grad_norm": 1.1253736019134521, "learning_rate": 3.3288312925497277e-07, "loss": 0.274, "step": 24545 }, { "epoch": 0.9178213426174732, "grad_norm": 0.32212045788764954, "learning_rate": 3.3138219138424856e-07, "loss": 0.2363, "step": 24550 }, { "epoch": 0.918008271607823, "grad_norm": 0.5436800122261047, "learning_rate": 3.298845879427659e-07, "loss": 0.2476, "step": 24555 }, { "epoch": 0.9181952005981727, "grad_norm": 0.6046520471572876, "learning_rate": 3.2839031944700394e-07, "loss": 0.2633, "step": 24560 }, { "epoch": 0.9183821295885226, "grad_norm": 0.5511953234672546, "learning_rate": 3.2689938641229603e-07, "loss": 0.3388, "step": 24565 }, { "epoch": 0.9185690585788724, "grad_norm": 0.30859842896461487, "learning_rate": 3.254117893528186e-07, "loss": 0.2729, "step": 24570 }, { "epoch": 0.9187559875692222, "grad_norm": 0.39766547083854675, "learning_rate": 3.239275287816035e-07, "loss": 0.3074, "step": 24575 }, { "epoch": 0.9189429165595719, "grad_norm": 0.3398101031780243, "learning_rate": 3.224466052105291e-07, "loss": 0.3279, "step": 24580 }, { "epoch": 0.9191298455499217, "grad_norm": 0.7181034684181213, "learning_rate": 3.209690191503245e-07, "loss": 0.3621, "step": 24585 }, { "epoch": 0.9193167745402715, "grad_norm": 0.541123628616333, "learning_rate": 3.194947711105645e-07, "loss": 0.2336, "step": 24590 }, { "epoch": 0.9195037035306213, "grad_norm": 0.24448862671852112, "learning_rate": 3.180238615996778e-07, "loss": 0.3324, "step": 24595 }, { "epoch": 0.919690632520971, "grad_norm": 0.8702734112739563, "learning_rate": 3.165562911249376e-07, "loss": 0.2881, "step": 24600 }, { "epoch": 0.9198775615113209, "grad_norm": 0.38017889857292175, "learning_rate": 3.1509206019246564e-07, "loss": 0.2344, "step": 24605 }, { "epoch": 0.9200644905016707, "grad_norm": 0.48280656337738037, "learning_rate": 3.136311693072347e-07, "loss": 0.2581, "step": 24610 }, { "epoch": 0.9202514194920205, "grad_norm": 0.37911441922187805, "learning_rate": 3.1217361897306395e-07, "loss": 0.2333, "step": 24615 }, { "epoch": 0.9204383484823703, "grad_norm": 0.4707018733024597, "learning_rate": 3.107194096926214e-07, "loss": 0.2859, "step": 24620 }, { "epoch": 0.92062527747272, "grad_norm": 0.308724969625473, "learning_rate": 3.0926854196742017e-07, "loss": 0.28, "step": 24625 }, { "epoch": 0.9208122064630698, "grad_norm": 0.3189759850502014, "learning_rate": 3.0782101629782456e-07, "loss": 0.2821, "step": 24630 }, { "epoch": 0.9209991354534196, "grad_norm": 0.24063967168331146, "learning_rate": 3.063768331830441e-07, "loss": 0.2809, "step": 24635 }, { "epoch": 0.9211860644437694, "grad_norm": 0.5102589726448059, "learning_rate": 3.049359931211382e-07, "loss": 0.3776, "step": 24640 }, { "epoch": 0.9213729934341192, "grad_norm": 0.5993364453315735, "learning_rate": 3.034984966090082e-07, "loss": 0.3378, "step": 24645 }, { "epoch": 0.921559922424469, "grad_norm": 0.5947995185852051, "learning_rate": 3.020643441424065e-07, "loss": 0.2313, "step": 24650 }, { "epoch": 0.9217468514148188, "grad_norm": 0.35502389073371887, "learning_rate": 3.006335362159329e-07, "loss": 0.2421, "step": 24655 }, { "epoch": 0.9219337804051686, "grad_norm": 0.5855832099914551, "learning_rate": 2.9920607332302844e-07, "loss": 0.2156, "step": 24660 }, { "epoch": 0.9221207093955184, "grad_norm": 0.2624458074569702, "learning_rate": 2.97781955955988e-07, "loss": 0.2104, "step": 24665 }, { "epoch": 0.9223076383858682, "grad_norm": 0.41134822368621826, "learning_rate": 2.9636118460594667e-07, "loss": 0.3424, "step": 24670 }, { "epoch": 0.9224945673762179, "grad_norm": 0.28406789898872375, "learning_rate": 2.94943759762889e-07, "loss": 0.2791, "step": 24675 }, { "epoch": 0.9226814963665677, "grad_norm": 0.6110890507698059, "learning_rate": 2.93529681915643e-07, "loss": 0.3006, "step": 24680 }, { "epoch": 0.9228684253569175, "grad_norm": 0.4325246810913086, "learning_rate": 2.9211895155188406e-07, "loss": 0.2617, "step": 24685 }, { "epoch": 0.9230553543472674, "grad_norm": 0.3159783184528351, "learning_rate": 2.9071156915813413e-07, "loss": 0.2363, "step": 24690 }, { "epoch": 0.9232422833376172, "grad_norm": 0.3180788457393646, "learning_rate": 2.8930753521975496e-07, "loss": 0.2499, "step": 24695 }, { "epoch": 0.9234292123279669, "grad_norm": 0.4603706896305084, "learning_rate": 2.879068502209625e-07, "loss": 0.2631, "step": 24700 }, { "epoch": 0.9236161413183167, "grad_norm": 0.5309453010559082, "learning_rate": 2.865095146448105e-07, "loss": 0.2576, "step": 24705 }, { "epoch": 0.9238030703086665, "grad_norm": 0.5529757738113403, "learning_rate": 2.8511552897319997e-07, "loss": 0.229, "step": 24710 }, { "epoch": 0.9239899992990163, "grad_norm": 0.38355591893196106, "learning_rate": 2.837248936868764e-07, "loss": 0.2191, "step": 24715 }, { "epoch": 0.924176928289366, "grad_norm": 0.40068626403808594, "learning_rate": 2.823376092654306e-07, "loss": 0.3506, "step": 24720 }, { "epoch": 0.9243638572797158, "grad_norm": 0.46175092458724976, "learning_rate": 2.8095367618729664e-07, "loss": 0.2266, "step": 24725 }, { "epoch": 0.9245507862700657, "grad_norm": 0.4619698226451874, "learning_rate": 2.7957309492975483e-07, "loss": 0.2883, "step": 24730 }, { "epoch": 0.9247377152604155, "grad_norm": 0.3867049217224121, "learning_rate": 2.781958659689277e-07, "loss": 0.2381, "step": 24735 }, { "epoch": 0.9249246442507653, "grad_norm": 0.34441909193992615, "learning_rate": 2.768219897797797e-07, "loss": 0.2983, "step": 24740 }, { "epoch": 0.925111573241115, "grad_norm": 0.1862328201532364, "learning_rate": 2.7545146683612413e-07, "loss": 0.2152, "step": 24745 }, { "epoch": 0.9252985022314648, "grad_norm": 0.4825479984283447, "learning_rate": 2.7408429761061393e-07, "loss": 0.2617, "step": 24750 }, { "epoch": 0.9254854312218146, "grad_norm": 0.40720510482788086, "learning_rate": 2.727204825747476e-07, "loss": 0.3691, "step": 24755 }, { "epoch": 0.9256723602121644, "grad_norm": 0.48773303627967834, "learning_rate": 2.7136002219886326e-07, "loss": 0.3045, "step": 24760 }, { "epoch": 0.9258592892025141, "grad_norm": 0.5098922848701477, "learning_rate": 2.700029169521479e-07, "loss": 0.2355, "step": 24765 }, { "epoch": 0.926046218192864, "grad_norm": 0.3420339524745941, "learning_rate": 2.6864916730262593e-07, "loss": 0.2178, "step": 24770 }, { "epoch": 0.9262331471832138, "grad_norm": 0.8477463722229004, "learning_rate": 2.672987737171673e-07, "loss": 0.2843, "step": 24775 }, { "epoch": 0.9264200761735636, "grad_norm": 0.481887549161911, "learning_rate": 2.65951736661485e-07, "loss": 0.2758, "step": 24780 }, { "epoch": 0.9266070051639134, "grad_norm": 0.2788504660129547, "learning_rate": 2.646080566001341e-07, "loss": 0.289, "step": 24785 }, { "epoch": 0.9267939341542631, "grad_norm": 0.46086716651916504, "learning_rate": 2.632677339965095e-07, "loss": 0.285, "step": 24790 }, { "epoch": 0.9269808631446129, "grad_norm": 0.7587642669677734, "learning_rate": 2.6193076931285035e-07, "loss": 0.3542, "step": 24795 }, { "epoch": 0.9271677921349627, "grad_norm": 0.32218098640441895, "learning_rate": 2.6059716301023885e-07, "loss": 0.2192, "step": 24800 }, { "epoch": 0.9273547211253125, "grad_norm": 0.6089930534362793, "learning_rate": 2.5926691554859497e-07, "loss": 0.2849, "step": 24805 }, { "epoch": 0.9275416501156624, "grad_norm": 0.8761005401611328, "learning_rate": 2.579400273866861e-07, "loss": 0.2688, "step": 24810 }, { "epoch": 0.9277285791060121, "grad_norm": 0.4644549787044525, "learning_rate": 2.5661649898211515e-07, "loss": 0.2952, "step": 24815 }, { "epoch": 0.9279155080963619, "grad_norm": 0.19469016790390015, "learning_rate": 2.552963307913303e-07, "loss": 0.2141, "step": 24820 }, { "epoch": 0.9281024370867117, "grad_norm": 0.5360405445098877, "learning_rate": 2.5397952326962183e-07, "loss": 0.2797, "step": 24825 }, { "epoch": 0.9282893660770615, "grad_norm": 0.5923690795898438, "learning_rate": 2.526660768711153e-07, "loss": 0.2603, "step": 24830 }, { "epoch": 0.9284762950674113, "grad_norm": 0.47409185767173767, "learning_rate": 2.5135599204878403e-07, "loss": 0.2873, "step": 24835 }, { "epoch": 0.928663224057761, "grad_norm": 0.7585999965667725, "learning_rate": 2.500492692544354e-07, "loss": 0.2297, "step": 24840 }, { "epoch": 0.9288501530481108, "grad_norm": 0.5913994908332825, "learning_rate": 2.487459089387234e-07, "loss": 0.2202, "step": 24845 }, { "epoch": 0.9290370820384607, "grad_norm": 0.5756853818893433, "learning_rate": 2.474459115511374e-07, "loss": 0.2813, "step": 24850 }, { "epoch": 0.9292240110288105, "grad_norm": 0.8577892780303955, "learning_rate": 2.461492775400121e-07, "loss": 0.2344, "step": 24855 }, { "epoch": 0.9294109400191602, "grad_norm": 0.26759210228919983, "learning_rate": 2.448560073525164e-07, "loss": 0.3403, "step": 24860 }, { "epoch": 0.92959786900951, "grad_norm": 1.6753215789794922, "learning_rate": 2.4356610143466353e-07, "loss": 0.2956, "step": 24865 }, { "epoch": 0.9297847979998598, "grad_norm": 0.6404578685760498, "learning_rate": 2.422795602313066e-07, "loss": 0.2465, "step": 24870 }, { "epoch": 0.9299717269902096, "grad_norm": 0.33225345611572266, "learning_rate": 2.40996384186134e-07, "loss": 0.2499, "step": 24875 }, { "epoch": 0.9301586559805594, "grad_norm": 0.5298144221305847, "learning_rate": 2.3971657374167956e-07, "loss": 0.2753, "step": 24880 }, { "epoch": 0.9303455849709091, "grad_norm": 0.3931209444999695, "learning_rate": 2.3844012933930906e-07, "loss": 0.2161, "step": 24885 }, { "epoch": 0.9305325139612589, "grad_norm": 0.8054724931716919, "learning_rate": 2.371670514192348e-07, "loss": 0.2659, "step": 24890 }, { "epoch": 0.9307194429516088, "grad_norm": 0.33258944749832153, "learning_rate": 2.358973404205034e-07, "loss": 0.2871, "step": 24895 }, { "epoch": 0.9309063719419586, "grad_norm": 0.475870281457901, "learning_rate": 2.3463099678100344e-07, "loss": 0.2974, "step": 24900 }, { "epoch": 0.9310933009323084, "grad_norm": 0.43682655692100525, "learning_rate": 2.333680209374578e-07, "loss": 0.2509, "step": 24905 }, { "epoch": 0.9312802299226581, "grad_norm": 0.3373750150203705, "learning_rate": 2.3210841332543254e-07, "loss": 0.2143, "step": 24910 }, { "epoch": 0.9314671589130079, "grad_norm": 0.33678069710731506, "learning_rate": 2.3085217437933127e-07, "loss": 0.2806, "step": 24915 }, { "epoch": 0.9316540879033577, "grad_norm": 0.47912338376045227, "learning_rate": 2.295993045323941e-07, "loss": 0.2977, "step": 24920 }, { "epoch": 0.9318410168937075, "grad_norm": 0.3937797248363495, "learning_rate": 2.2834980421669872e-07, "loss": 0.2392, "step": 24925 }, { "epoch": 0.9320279458840572, "grad_norm": 0.6482548117637634, "learning_rate": 2.2710367386316156e-07, "loss": 0.2579, "step": 24930 }, { "epoch": 0.9322148748744071, "grad_norm": 1.0093536376953125, "learning_rate": 2.2586091390153996e-07, "loss": 0.3091, "step": 24935 }, { "epoch": 0.9324018038647569, "grad_norm": 0.29904359579086304, "learning_rate": 2.246215247604233e-07, "loss": 0.3163, "step": 24940 }, { "epoch": 0.9325887328551067, "grad_norm": 0.6364412903785706, "learning_rate": 2.2338550686724413e-07, "loss": 0.2685, "step": 24945 }, { "epoch": 0.9327756618454565, "grad_norm": 0.7019013166427612, "learning_rate": 2.221528606482659e-07, "loss": 0.3034, "step": 24950 }, { "epoch": 0.9329625908358062, "grad_norm": 0.5660467743873596, "learning_rate": 2.2092358652859634e-07, "loss": 0.25, "step": 24955 }, { "epoch": 0.933149519826156, "grad_norm": 0.5666435956954956, "learning_rate": 2.1969768493217747e-07, "loss": 0.2411, "step": 24960 }, { "epoch": 0.9333364488165058, "grad_norm": 0.3779709041118622, "learning_rate": 2.1847515628178328e-07, "loss": 0.3184, "step": 24965 }, { "epoch": 0.9335233778068556, "grad_norm": 0.6057195067405701, "learning_rate": 2.1725600099903433e-07, "loss": 0.23, "step": 24970 }, { "epoch": 0.9337103067972055, "grad_norm": 0.5047149062156677, "learning_rate": 2.160402195043776e-07, "loss": 0.3242, "step": 24975 }, { "epoch": 0.9338972357875552, "grad_norm": 1.0450160503387451, "learning_rate": 2.1482781221710437e-07, "loss": 0.2839, "step": 24980 }, { "epoch": 0.934084164777905, "grad_norm": 0.44621530175209045, "learning_rate": 2.1361877955533682e-07, "loss": 0.2289, "step": 24985 }, { "epoch": 0.9342710937682548, "grad_norm": 0.547173023223877, "learning_rate": 2.1241312193603814e-07, "loss": 0.3041, "step": 24990 }, { "epoch": 0.9344580227586046, "grad_norm": 0.4551314413547516, "learning_rate": 2.1121083977500346e-07, "loss": 0.2025, "step": 24995 }, { "epoch": 0.9346449517489543, "grad_norm": 0.19889132678508759, "learning_rate": 2.1001193348686444e-07, "loss": 0.2547, "step": 25000 }, { "epoch": 0.9348318807393041, "grad_norm": 0.4394857585430145, "learning_rate": 2.0881640348509258e-07, "loss": 0.3064, "step": 25005 }, { "epoch": 0.9350188097296539, "grad_norm": 0.29480358958244324, "learning_rate": 2.0762425018199028e-07, "loss": 0.2712, "step": 25010 }, { "epoch": 0.9352057387200038, "grad_norm": 0.306598037481308, "learning_rate": 2.0643547398869646e-07, "loss": 0.2319, "step": 25015 }, { "epoch": 0.9353926677103536, "grad_norm": 0.18189042806625366, "learning_rate": 2.052500753151876e-07, "loss": 0.2516, "step": 25020 }, { "epoch": 0.9355795967007033, "grad_norm": 0.3686217963695526, "learning_rate": 2.0406805457027225e-07, "loss": 0.2465, "step": 25025 }, { "epoch": 0.9357665256910531, "grad_norm": 0.3795625567436218, "learning_rate": 2.028894121615943e-07, "loss": 0.3719, "step": 25030 }, { "epoch": 0.9359534546814029, "grad_norm": 0.3678499758243561, "learning_rate": 2.0171414849563753e-07, "loss": 0.2745, "step": 25035 }, { "epoch": 0.9361403836717527, "grad_norm": 0.20017379522323608, "learning_rate": 2.0054226397771216e-07, "loss": 0.2769, "step": 25040 }, { "epoch": 0.9363273126621025, "grad_norm": 0.42065608501434326, "learning_rate": 1.9937375901197154e-07, "loss": 0.3136, "step": 25045 }, { "epoch": 0.9365142416524522, "grad_norm": 0.4758951663970947, "learning_rate": 1.9820863400139778e-07, "loss": 0.2509, "step": 25050 }, { "epoch": 0.9367011706428021, "grad_norm": 0.30327993631362915, "learning_rate": 1.9704688934780946e-07, "loss": 0.2372, "step": 25055 }, { "epoch": 0.9368880996331519, "grad_norm": 0.4054654836654663, "learning_rate": 1.9588852545185831e-07, "loss": 0.3371, "step": 25060 }, { "epoch": 0.9370750286235017, "grad_norm": 0.40502023696899414, "learning_rate": 1.9473354271303258e-07, "loss": 0.2511, "step": 25065 }, { "epoch": 0.9372619576138514, "grad_norm": 0.4846619665622711, "learning_rate": 1.9358194152965139e-07, "loss": 0.2792, "step": 25070 }, { "epoch": 0.9374488866042012, "grad_norm": 0.9523441195487976, "learning_rate": 1.9243372229886704e-07, "loss": 0.3035, "step": 25075 }, { "epoch": 0.937635815594551, "grad_norm": 0.7273759245872498, "learning_rate": 1.9128888541667167e-07, "loss": 0.2516, "step": 25080 }, { "epoch": 0.9378227445849008, "grad_norm": 0.32927486300468445, "learning_rate": 1.9014743127788392e-07, "loss": 0.2888, "step": 25085 }, { "epoch": 0.9380096735752506, "grad_norm": 0.31658056378364563, "learning_rate": 1.890093602761589e-07, "loss": 0.2707, "step": 25090 }, { "epoch": 0.9381966025656004, "grad_norm": 0.41149401664733887, "learning_rate": 1.8787467280398597e-07, "loss": 0.2212, "step": 25095 }, { "epoch": 0.9383835315559502, "grad_norm": 0.3825901746749878, "learning_rate": 1.8674336925268434e-07, "loss": 0.3109, "step": 25100 }, { "epoch": 0.9385704605463, "grad_norm": 0.28844115138053894, "learning_rate": 1.8561545001240967e-07, "loss": 0.315, "step": 25105 }, { "epoch": 0.9387573895366498, "grad_norm": 0.6432269811630249, "learning_rate": 1.844909154721497e-07, "loss": 0.3375, "step": 25110 }, { "epoch": 0.9389443185269996, "grad_norm": 0.3793524205684662, "learning_rate": 1.83369766019722e-07, "loss": 0.3181, "step": 25115 }, { "epoch": 0.9391312475173493, "grad_norm": 0.42755433917045593, "learning_rate": 1.8225200204177952e-07, "loss": 0.3514, "step": 25120 }, { "epoch": 0.9393181765076991, "grad_norm": 0.2594048082828522, "learning_rate": 1.811376239238083e-07, "loss": 0.256, "step": 25125 }, { "epoch": 0.9395051054980489, "grad_norm": 0.6199470162391663, "learning_rate": 1.8002663205012428e-07, "loss": 0.2888, "step": 25130 }, { "epoch": 0.9396920344883987, "grad_norm": 0.5659986734390259, "learning_rate": 1.7891902680387652e-07, "loss": 0.2381, "step": 25135 }, { "epoch": 0.9398789634787486, "grad_norm": 0.31770560145378113, "learning_rate": 1.7781480856704835e-07, "loss": 0.3309, "step": 25140 }, { "epoch": 0.9400658924690983, "grad_norm": 0.44480013847351074, "learning_rate": 1.7671397772044962e-07, "loss": 0.3511, "step": 25145 }, { "epoch": 0.9402528214594481, "grad_norm": 0.48626697063446045, "learning_rate": 1.7561653464372885e-07, "loss": 0.2877, "step": 25150 }, { "epoch": 0.9404397504497979, "grad_norm": 0.4127686023712158, "learning_rate": 1.7452247971535995e-07, "loss": 0.2773, "step": 25155 }, { "epoch": 0.9406266794401477, "grad_norm": 0.9780619144439697, "learning_rate": 1.7343181331265336e-07, "loss": 0.2247, "step": 25160 }, { "epoch": 0.9408136084304974, "grad_norm": 0.4334399104118347, "learning_rate": 1.7234453581174704e-07, "loss": 0.2965, "step": 25165 }, { "epoch": 0.9410005374208472, "grad_norm": 0.37765777111053467, "learning_rate": 1.7126064758761217e-07, "loss": 0.3012, "step": 25170 }, { "epoch": 0.941187466411197, "grad_norm": 0.258969247341156, "learning_rate": 1.701801490140509e-07, "loss": 0.215, "step": 25175 }, { "epoch": 0.9413743954015469, "grad_norm": 0.762693464756012, "learning_rate": 1.6910304046369618e-07, "loss": 0.3182, "step": 25180 }, { "epoch": 0.9415613243918967, "grad_norm": 0.2748546600341797, "learning_rate": 1.6802932230801205e-07, "loss": 0.2761, "step": 25185 }, { "epoch": 0.9417482533822464, "grad_norm": 0.556158721446991, "learning_rate": 1.669589949172934e-07, "loss": 0.2127, "step": 25190 }, { "epoch": 0.9419351823725962, "grad_norm": 0.3723142445087433, "learning_rate": 1.6589205866066493e-07, "loss": 0.277, "step": 25195 }, { "epoch": 0.942122111362946, "grad_norm": 0.42889419198036194, "learning_rate": 1.6482851390608235e-07, "loss": 0.3108, "step": 25200 }, { "epoch": 0.9423090403532958, "grad_norm": 1.3041778802871704, "learning_rate": 1.6376836102033223e-07, "loss": 0.3281, "step": 25205 }, { "epoch": 0.9424959693436455, "grad_norm": 0.5724756121635437, "learning_rate": 1.6271160036903099e-07, "loss": 0.3212, "step": 25210 }, { "epoch": 0.9426828983339953, "grad_norm": 0.2716809809207916, "learning_rate": 1.616582323166249e-07, "loss": 0.2562, "step": 25215 }, { "epoch": 0.9428698273243452, "grad_norm": 0.5220561027526855, "learning_rate": 1.6060825722639005e-07, "loss": 0.3082, "step": 25220 }, { "epoch": 0.943056756314695, "grad_norm": 0.3205987811088562, "learning_rate": 1.5956167546043234e-07, "loss": 0.2507, "step": 25225 }, { "epoch": 0.9432436853050448, "grad_norm": 0.36693549156188965, "learning_rate": 1.5851848737968968e-07, "loss": 0.2144, "step": 25230 }, { "epoch": 0.9434306142953945, "grad_norm": 0.26295924186706543, "learning_rate": 1.574786933439254e-07, "loss": 0.3514, "step": 25235 }, { "epoch": 0.9436175432857443, "grad_norm": 0.5648385286331177, "learning_rate": 1.5644229371173714e-07, "loss": 0.3295, "step": 25240 }, { "epoch": 0.9438044722760941, "grad_norm": 0.43656599521636963, "learning_rate": 1.5540928884054674e-07, "loss": 0.2869, "step": 25245 }, { "epoch": 0.9439914012664439, "grad_norm": 0.5460593104362488, "learning_rate": 1.5437967908661143e-07, "loss": 0.2633, "step": 25250 }, { "epoch": 0.9441783302567937, "grad_norm": 0.4732210040092468, "learning_rate": 1.5335346480501056e-07, "loss": 0.2891, "step": 25255 }, { "epoch": 0.9443652592471435, "grad_norm": 0.2938053011894226, "learning_rate": 1.5233064634965878e-07, "loss": 0.3107, "step": 25260 }, { "epoch": 0.9445521882374933, "grad_norm": 0.28973618149757385, "learning_rate": 1.513112240732939e-07, "loss": 0.3202, "step": 25265 }, { "epoch": 0.9447391172278431, "grad_norm": 0.34332942962646484, "learning_rate": 1.5029519832748807e-07, "loss": 0.3028, "step": 25270 }, { "epoch": 0.9449260462181929, "grad_norm": 0.37853550910949707, "learning_rate": 1.4928256946263875e-07, "loss": 0.3199, "step": 25275 }, { "epoch": 0.9451129752085426, "grad_norm": 0.8875368237495422, "learning_rate": 1.4827333782797216e-07, "loss": 0.2674, "step": 25280 }, { "epoch": 0.9452999041988924, "grad_norm": 0.5367876291275024, "learning_rate": 1.472675037715443e-07, "loss": 0.2802, "step": 25285 }, { "epoch": 0.9454868331892422, "grad_norm": 0.2964636981487274, "learning_rate": 1.4626506764023663e-07, "loss": 0.2168, "step": 25290 }, { "epoch": 0.945673762179592, "grad_norm": 0.25260117650032043, "learning_rate": 1.4526602977976368e-07, "loss": 0.3053, "step": 25295 }, { "epoch": 0.9458606911699419, "grad_norm": 0.49495014548301697, "learning_rate": 1.4427039053466207e-07, "loss": 0.2923, "step": 25300 }, { "epoch": 0.9460476201602916, "grad_norm": 0.4212638735771179, "learning_rate": 1.432781502483005e-07, "loss": 0.2597, "step": 25305 }, { "epoch": 0.9462345491506414, "grad_norm": 0.5896459817886353, "learning_rate": 1.422893092628741e-07, "loss": 0.2654, "step": 25310 }, { "epoch": 0.9464214781409912, "grad_norm": 0.9404630661010742, "learning_rate": 1.4130386791940564e-07, "loss": 0.2247, "step": 25315 }, { "epoch": 0.946608407131341, "grad_norm": 0.4611830413341522, "learning_rate": 1.4032182655774661e-07, "loss": 0.2525, "step": 25320 }, { "epoch": 0.9467953361216908, "grad_norm": 0.5101840496063232, "learning_rate": 1.3934318551657277e-07, "loss": 0.2983, "step": 25325 }, { "epoch": 0.9469822651120405, "grad_norm": 0.6482624411582947, "learning_rate": 1.383679451333919e-07, "loss": 0.3196, "step": 25330 }, { "epoch": 0.9471691941023903, "grad_norm": 0.2823061943054199, "learning_rate": 1.373961057445339e-07, "loss": 0.2113, "step": 25335 }, { "epoch": 0.9473561230927402, "grad_norm": 0.6522537469863892, "learning_rate": 1.364276676851617e-07, "loss": 0.2637, "step": 25340 }, { "epoch": 0.94754305208309, "grad_norm": 0.338235080242157, "learning_rate": 1.354626312892582e-07, "loss": 0.2926, "step": 25345 }, { "epoch": 0.9477299810734398, "grad_norm": 0.3630608022212982, "learning_rate": 1.3450099688963823e-07, "loss": 0.232, "step": 25350 }, { "epoch": 0.9479169100637895, "grad_norm": 0.32054030895233154, "learning_rate": 1.3354276481794325e-07, "loss": 0.2809, "step": 25355 }, { "epoch": 0.9481038390541393, "grad_norm": 0.40766042470932007, "learning_rate": 1.3258793540463778e-07, "loss": 0.2871, "step": 25360 }, { "epoch": 0.9482907680444891, "grad_norm": 0.6181418299674988, "learning_rate": 1.3163650897901724e-07, "loss": 0.3037, "step": 25365 }, { "epoch": 0.9484776970348389, "grad_norm": 0.39881226420402527, "learning_rate": 1.3068848586920035e-07, "loss": 0.27, "step": 25370 }, { "epoch": 0.9486646260251886, "grad_norm": 0.548456609249115, "learning_rate": 1.2974386640213333e-07, "loss": 0.2401, "step": 25375 }, { "epoch": 0.9488515550155384, "grad_norm": 0.3512159585952759, "learning_rate": 1.2880265090358668e-07, "loss": 0.3958, "step": 25380 }, { "epoch": 0.9490384840058883, "grad_norm": 0.33197692036628723, "learning_rate": 1.278648396981619e-07, "loss": 0.2512, "step": 25385 }, { "epoch": 0.9492254129962381, "grad_norm": 0.4836597144603729, "learning_rate": 1.2693043310928022e-07, "loss": 0.3093, "step": 25390 }, { "epoch": 0.9494123419865879, "grad_norm": 0.3625621795654297, "learning_rate": 1.2599943145919392e-07, "loss": 0.3484, "step": 25395 }, { "epoch": 0.9495992709769376, "grad_norm": 0.41878730058670044, "learning_rate": 1.2507183506897723e-07, "loss": 0.267, "step": 25400 }, { "epoch": 0.9497861999672874, "grad_norm": 0.2932325303554535, "learning_rate": 1.241476442585321e-07, "loss": 0.2598, "step": 25405 }, { "epoch": 0.9499731289576372, "grad_norm": 0.3391314744949341, "learning_rate": 1.2322685934658573e-07, "loss": 0.2433, "step": 25410 }, { "epoch": 0.950160057947987, "grad_norm": 0.4093460142612457, "learning_rate": 1.223094806506897e-07, "loss": 0.2741, "step": 25415 }, { "epoch": 0.9503469869383367, "grad_norm": 0.4316021800041199, "learning_rate": 1.2139550848722203e-07, "loss": 0.2655, "step": 25420 }, { "epoch": 0.9505339159286866, "grad_norm": 0.7140107750892639, "learning_rate": 1.2048494317138615e-07, "loss": 0.3403, "step": 25425 }, { "epoch": 0.9507208449190364, "grad_norm": 0.5122732520103455, "learning_rate": 1.1957778501720973e-07, "loss": 0.2101, "step": 25430 }, { "epoch": 0.9509077739093862, "grad_norm": 1.210839867591858, "learning_rate": 1.1867403433754476e-07, "loss": 0.2857, "step": 25435 }, { "epoch": 0.951094702899736, "grad_norm": 0.507588267326355, "learning_rate": 1.177736914440697e-07, "loss": 0.2956, "step": 25440 }, { "epoch": 0.9512816318900857, "grad_norm": 0.4931941330432892, "learning_rate": 1.1687675664728837e-07, "loss": 0.2871, "step": 25445 }, { "epoch": 0.9514685608804355, "grad_norm": 0.5191053748130798, "learning_rate": 1.1598323025652447e-07, "loss": 0.2739, "step": 25450 }, { "epoch": 0.9516554898707853, "grad_norm": 0.37645572423934937, "learning_rate": 1.150931125799315e-07, "loss": 0.2422, "step": 25455 }, { "epoch": 0.9518424188611351, "grad_norm": 0.46614864468574524, "learning_rate": 1.1420640392448612e-07, "loss": 0.2909, "step": 25460 }, { "epoch": 0.952029347851485, "grad_norm": 0.3071075677871704, "learning_rate": 1.1332310459598928e-07, "loss": 0.1938, "step": 25465 }, { "epoch": 0.9522162768418347, "grad_norm": 0.4759249985218048, "learning_rate": 1.1244321489906285e-07, "loss": 0.2831, "step": 25470 }, { "epoch": 0.9524032058321845, "grad_norm": 0.4241246283054352, "learning_rate": 1.1156673513715744e-07, "loss": 0.2166, "step": 25475 }, { "epoch": 0.9525901348225343, "grad_norm": 0.26274579763412476, "learning_rate": 1.1069366561254679e-07, "loss": 0.3136, "step": 25480 }, { "epoch": 0.9527770638128841, "grad_norm": 0.3445034325122833, "learning_rate": 1.0982400662632564e-07, "loss": 0.2326, "step": 25485 }, { "epoch": 0.9529639928032339, "grad_norm": 0.46550294756889343, "learning_rate": 1.0895775847841516e-07, "loss": 0.2672, "step": 25490 }, { "epoch": 0.9531509217935836, "grad_norm": 0.3357786238193512, "learning_rate": 1.0809492146755973e-07, "loss": 0.2343, "step": 25495 }, { "epoch": 0.9533378507839334, "grad_norm": 0.1831052303314209, "learning_rate": 1.0723549589132687e-07, "loss": 0.3252, "step": 25500 }, { "epoch": 0.9535247797742833, "grad_norm": 0.726946234703064, "learning_rate": 1.0637948204610837e-07, "loss": 0.3023, "step": 25505 }, { "epoch": 0.9537117087646331, "grad_norm": 0.5573680996894836, "learning_rate": 1.055268802271181e-07, "loss": 0.2786, "step": 25510 }, { "epoch": 0.9538986377549828, "grad_norm": 0.5966600179672241, "learning_rate": 1.0467769072839307e-07, "loss": 0.3758, "step": 25515 }, { "epoch": 0.9540855667453326, "grad_norm": 0.31094256043434143, "learning_rate": 1.0383191384279789e-07, "loss": 0.2691, "step": 25520 }, { "epoch": 0.9542724957356824, "grad_norm": 0.6314263343811035, "learning_rate": 1.029895498620126e-07, "loss": 0.3314, "step": 25525 }, { "epoch": 0.9544594247260322, "grad_norm": 0.6387194991111755, "learning_rate": 1.0215059907654811e-07, "loss": 0.2326, "step": 25530 }, { "epoch": 0.954646353716382, "grad_norm": 0.27241405844688416, "learning_rate": 1.013150617757308e-07, "loss": 0.2931, "step": 25535 }, { "epoch": 0.9548332827067317, "grad_norm": 0.6504563093185425, "learning_rate": 1.0048293824771682e-07, "loss": 0.3224, "step": 25540 }, { "epoch": 0.9550202116970816, "grad_norm": 0.5727891325950623, "learning_rate": 9.965422877948106e-08, "loss": 0.2353, "step": 25545 }, { "epoch": 0.9552071406874314, "grad_norm": 0.5274751782417297, "learning_rate": 9.882893365681934e-08, "loss": 0.3013, "step": 25550 }, { "epoch": 0.9553940696777812, "grad_norm": 0.41574156284332275, "learning_rate": 9.80070531643551e-08, "loss": 0.2363, "step": 25555 }, { "epoch": 0.955580998668131, "grad_norm": 0.5117254257202148, "learning_rate": 9.71885875855294e-08, "loss": 0.3033, "step": 25560 }, { "epoch": 0.9557679276584807, "grad_norm": 0.5276907086372375, "learning_rate": 9.637353720260867e-08, "loss": 0.3117, "step": 25565 }, { "epoch": 0.9559548566488305, "grad_norm": 0.6600742340087891, "learning_rate": 9.556190229668027e-08, "loss": 0.3827, "step": 25570 }, { "epoch": 0.9561417856391803, "grad_norm": 0.43783825635910034, "learning_rate": 9.475368314765365e-08, "loss": 0.3325, "step": 25575 }, { "epoch": 0.9563287146295301, "grad_norm": 0.4194599390029907, "learning_rate": 9.394888003426028e-08, "loss": 0.2532, "step": 25580 }, { "epoch": 0.95651564361988, "grad_norm": 0.3996281921863556, "learning_rate": 9.314749323405481e-08, "loss": 0.2825, "step": 25585 }, { "epoch": 0.9567025726102297, "grad_norm": 0.4395323395729065, "learning_rate": 9.234952302341172e-08, "loss": 0.2375, "step": 25590 }, { "epoch": 0.9568895016005795, "grad_norm": 0.5239723920822144, "learning_rate": 9.155496967752642e-08, "loss": 0.3791, "step": 25595 }, { "epoch": 0.9570764305909293, "grad_norm": 0.6656010746955872, "learning_rate": 9.076383347042084e-08, "loss": 0.3139, "step": 25600 }, { "epoch": 0.9572633595812791, "grad_norm": 0.6281600594520569, "learning_rate": 8.997611467493228e-08, "loss": 0.3157, "step": 25605 }, { "epoch": 0.9574502885716288, "grad_norm": 0.39084675908088684, "learning_rate": 8.919181356272454e-08, "loss": 0.2807, "step": 25610 }, { "epoch": 0.9576372175619786, "grad_norm": 0.4941912293434143, "learning_rate": 8.841093040427907e-08, "loss": 0.2826, "step": 25615 }, { "epoch": 0.9578241465523284, "grad_norm": 0.500826358795166, "learning_rate": 8.76334654689015e-08, "loss": 0.3496, "step": 25620 }, { "epoch": 0.9580110755426782, "grad_norm": 0.6868979930877686, "learning_rate": 8.685941902471628e-08, "loss": 0.3204, "step": 25625 }, { "epoch": 0.9581980045330281, "grad_norm": 0.30598723888397217, "learning_rate": 8.608879133866988e-08, "loss": 0.3358, "step": 25630 }, { "epoch": 0.9583849335233778, "grad_norm": 0.3103514611721039, "learning_rate": 8.53215826765308e-08, "loss": 0.285, "step": 25635 }, { "epoch": 0.9585718625137276, "grad_norm": 0.5536108016967773, "learning_rate": 8.455779330288516e-08, "loss": 0.3143, "step": 25640 }, { "epoch": 0.9587587915040774, "grad_norm": 0.589036226272583, "learning_rate": 8.379742348114339e-08, "loss": 0.286, "step": 25645 }, { "epoch": 0.9589457204944272, "grad_norm": 0.2255747765302658, "learning_rate": 8.30404734735346e-08, "loss": 0.26, "step": 25650 }, { "epoch": 0.959132649484777, "grad_norm": 0.5313140749931335, "learning_rate": 8.228694354111111e-08, "loss": 0.2988, "step": 25655 }, { "epoch": 0.9593195784751267, "grad_norm": 0.30977725982666016, "learning_rate": 8.153683394374057e-08, "loss": 0.2458, "step": 25660 }, { "epoch": 0.9595065074654765, "grad_norm": 0.8567424416542053, "learning_rate": 8.079014494011827e-08, "loss": 0.2961, "step": 25665 }, { "epoch": 0.9596934364558264, "grad_norm": 0.569592297077179, "learning_rate": 8.004687678775158e-08, "loss": 0.2552, "step": 25670 }, { "epoch": 0.9598803654461762, "grad_norm": 0.2618090808391571, "learning_rate": 7.930702974297544e-08, "loss": 0.2402, "step": 25675 }, { "epoch": 0.9600672944365259, "grad_norm": 0.5399622917175293, "learning_rate": 7.85706040609413e-08, "loss": 0.2917, "step": 25680 }, { "epoch": 0.9602542234268757, "grad_norm": 0.4389611780643463, "learning_rate": 7.783759999562046e-08, "loss": 0.2601, "step": 25685 }, { "epoch": 0.9604411524172255, "grad_norm": 0.3556120693683624, "learning_rate": 7.710801779980514e-08, "loss": 0.3564, "step": 25690 }, { "epoch": 0.9606280814075753, "grad_norm": 0.3294890522956848, "learning_rate": 7.638185772510854e-08, "loss": 0.2801, "step": 25695 }, { "epoch": 0.960815010397925, "grad_norm": 0.37796953320503235, "learning_rate": 7.565912002196141e-08, "loss": 0.2577, "step": 25700 }, { "epoch": 0.9610019393882748, "grad_norm": 0.4705111086368561, "learning_rate": 7.493980493961439e-08, "loss": 0.2638, "step": 25705 }, { "epoch": 0.9611888683786247, "grad_norm": 0.2994195520877838, "learning_rate": 7.422391272614016e-08, "loss": 0.2883, "step": 25710 }, { "epoch": 0.9613757973689745, "grad_norm": 0.2776239812374115, "learning_rate": 7.351144362842898e-08, "loss": 0.2938, "step": 25715 }, { "epoch": 0.9615627263593243, "grad_norm": 0.4236242175102234, "learning_rate": 7.280239789219213e-08, "loss": 0.2281, "step": 25720 }, { "epoch": 0.961749655349674, "grad_norm": 0.7493228912353516, "learning_rate": 7.209677576195617e-08, "loss": 0.3591, "step": 25725 }, { "epoch": 0.9619365843400238, "grad_norm": 0.598604679107666, "learning_rate": 7.139457748107314e-08, "loss": 0.2865, "step": 25730 }, { "epoch": 0.9621235133303736, "grad_norm": 0.147347092628479, "learning_rate": 7.069580329170933e-08, "loss": 0.302, "step": 25735 }, { "epoch": 0.9623104423207234, "grad_norm": 0.5269376635551453, "learning_rate": 7.000045343485306e-08, "loss": 0.3699, "step": 25740 }, { "epoch": 0.9624973713110732, "grad_norm": 0.30106881260871887, "learning_rate": 6.93085281503092e-08, "loss": 0.2679, "step": 25745 }, { "epoch": 0.962684300301423, "grad_norm": 0.6084346771240234, "learning_rate": 6.862002767670351e-08, "loss": 0.288, "step": 25750 }, { "epoch": 0.9628712292917728, "grad_norm": 0.31380781531333923, "learning_rate": 6.793495225148161e-08, "loss": 0.2513, "step": 25755 }, { "epoch": 0.9630581582821226, "grad_norm": 0.5058198571205139, "learning_rate": 6.725330211090342e-08, "loss": 0.2661, "step": 25760 }, { "epoch": 0.9632450872724724, "grad_norm": 0.3875492811203003, "learning_rate": 6.65750774900531e-08, "loss": 0.2098, "step": 25765 }, { "epoch": 0.9634320162628222, "grad_norm": 0.34512069821357727, "learning_rate": 6.59002786228291e-08, "loss": 0.2959, "step": 25770 }, { "epoch": 0.9636189452531719, "grad_norm": 0.5447210669517517, "learning_rate": 6.522890574195195e-08, "loss": 0.2606, "step": 25775 }, { "epoch": 0.9638058742435217, "grad_norm": 0.34275734424591064, "learning_rate": 6.456095907895754e-08, "loss": 0.3135, "step": 25780 }, { "epoch": 0.9639928032338715, "grad_norm": 0.5490571856498718, "learning_rate": 6.389643886420161e-08, "loss": 0.2562, "step": 25785 }, { "epoch": 0.9641797322242214, "grad_norm": 0.921448290348053, "learning_rate": 6.323534532685971e-08, "loss": 0.3327, "step": 25790 }, { "epoch": 0.9643666612145712, "grad_norm": 0.8506829738616943, "learning_rate": 6.25776786949217e-08, "loss": 0.238, "step": 25795 }, { "epoch": 0.9645535902049209, "grad_norm": 0.24562324583530426, "learning_rate": 6.192343919519949e-08, "loss": 0.2228, "step": 25800 }, { "epoch": 0.9647405191952707, "grad_norm": 0.4626258909702301, "learning_rate": 6.127262705332148e-08, "loss": 0.2939, "step": 25805 }, { "epoch": 0.9649274481856205, "grad_norm": 0.46563923358917236, "learning_rate": 6.06252424937337e-08, "loss": 0.2838, "step": 25810 }, { "epoch": 0.9651143771759703, "grad_norm": 0.3600118160247803, "learning_rate": 5.998128573969975e-08, "loss": 0.2402, "step": 25815 }, { "epoch": 0.96530130616632, "grad_norm": 0.25077444314956665, "learning_rate": 5.9340757013304215e-08, "loss": 0.261, "step": 25820 }, { "epoch": 0.9654882351566698, "grad_norm": 0.4118346869945526, "learning_rate": 5.8703656535444853e-08, "loss": 0.3058, "step": 25825 }, { "epoch": 0.9656751641470197, "grad_norm": 0.5594443678855896, "learning_rate": 5.806998452584034e-08, "loss": 0.2788, "step": 25830 }, { "epoch": 0.9658620931373695, "grad_norm": 0.28521931171417236, "learning_rate": 5.743974120302587e-08, "loss": 0.2705, "step": 25835 }, { "epoch": 0.9660490221277193, "grad_norm": 0.44842952489852905, "learning_rate": 5.681292678435424e-08, "loss": 0.2366, "step": 25840 }, { "epoch": 0.966235951118069, "grad_norm": 0.6687315702438354, "learning_rate": 5.618954148599587e-08, "loss": 0.2382, "step": 25845 }, { "epoch": 0.9664228801084188, "grad_norm": 0.3889986574649811, "learning_rate": 5.556958552293878e-08, "loss": 0.2423, "step": 25850 }, { "epoch": 0.9666098090987686, "grad_norm": 0.41925016045570374, "learning_rate": 5.4953059108987516e-08, "loss": 0.2204, "step": 25855 }, { "epoch": 0.9667967380891184, "grad_norm": 0.502301037311554, "learning_rate": 5.4339962456763096e-08, "loss": 0.2187, "step": 25860 }, { "epoch": 0.9669836670794681, "grad_norm": 0.5296496748924255, "learning_rate": 5.37302957777075e-08, "loss": 0.2298, "step": 25865 }, { "epoch": 0.9671705960698179, "grad_norm": 0.34486472606658936, "learning_rate": 5.3124059282076975e-08, "loss": 0.2691, "step": 25870 }, { "epoch": 0.9673575250601678, "grad_norm": 0.41491997241973877, "learning_rate": 5.2521253178944295e-08, "loss": 0.2574, "step": 25875 }, { "epoch": 0.9675444540505176, "grad_norm": 0.4363487660884857, "learning_rate": 5.192187767619872e-08, "loss": 0.2644, "step": 25880 }, { "epoch": 0.9677313830408674, "grad_norm": 0.45662906765937805, "learning_rate": 5.1325932980550444e-08, "loss": 0.2408, "step": 25885 }, { "epoch": 0.9679183120312171, "grad_norm": 0.5905013680458069, "learning_rate": 5.073341929752174e-08, "loss": 0.2843, "step": 25890 }, { "epoch": 0.9681052410215669, "grad_norm": 0.43980005383491516, "learning_rate": 5.0144336831453586e-08, "loss": 0.2896, "step": 25895 }, { "epoch": 0.9682921700119167, "grad_norm": 0.6901749968528748, "learning_rate": 4.95586857855046e-08, "loss": 0.3044, "step": 25900 }, { "epoch": 0.9684790990022665, "grad_norm": 0.9550485014915466, "learning_rate": 4.897646636164877e-08, "loss": 0.2959, "step": 25905 }, { "epoch": 0.9686660279926163, "grad_norm": 0.13232779502868652, "learning_rate": 4.839767876067658e-08, "loss": 0.271, "step": 25910 }, { "epoch": 0.9688529569829661, "grad_norm": 0.3293937146663666, "learning_rate": 4.782232318219615e-08, "loss": 0.2995, "step": 25915 }, { "epoch": 0.9690398859733159, "grad_norm": 0.4516461491584778, "learning_rate": 4.7250399824629867e-08, "loss": 0.3644, "step": 25920 }, { "epoch": 0.9692268149636657, "grad_norm": 0.3880869746208191, "learning_rate": 4.668190888521884e-08, "loss": 0.3003, "step": 25925 }, { "epoch": 0.9694137439540155, "grad_norm": 0.3466978967189789, "learning_rate": 4.611685056001847e-08, "loss": 0.2949, "step": 25930 }, { "epoch": 0.9696006729443652, "grad_norm": 0.4129624664783478, "learning_rate": 4.555522504390175e-08, "loss": 0.2601, "step": 25935 }, { "epoch": 0.969787601934715, "grad_norm": 0.30791693925857544, "learning_rate": 4.499703253055709e-08, "loss": 0.2434, "step": 25940 }, { "epoch": 0.9699745309250648, "grad_norm": 0.7145932912826538, "learning_rate": 4.4442273212488286e-08, "loss": 0.2478, "step": 25945 }, { "epoch": 0.9701614599154146, "grad_norm": 0.359036386013031, "learning_rate": 4.3890947281016725e-08, "loss": 0.2566, "step": 25950 }, { "epoch": 0.9703483889057645, "grad_norm": 0.4460180401802063, "learning_rate": 4.3343054926279216e-08, "loss": 0.2508, "step": 25955 }, { "epoch": 0.9705353178961142, "grad_norm": 0.38728684186935425, "learning_rate": 4.279859633722794e-08, "loss": 0.2721, "step": 25960 }, { "epoch": 0.970722246886464, "grad_norm": 0.4190647602081299, "learning_rate": 4.225757170163047e-08, "loss": 0.3867, "step": 25965 }, { "epoch": 0.9709091758768138, "grad_norm": 0.37819820642471313, "learning_rate": 4.1719981206072015e-08, "loss": 0.2931, "step": 25970 }, { "epoch": 0.9710961048671636, "grad_norm": 0.620923638343811, "learning_rate": 4.118582503594981e-08, "loss": 0.2239, "step": 25975 }, { "epoch": 0.9712830338575134, "grad_norm": 0.4498381018638611, "learning_rate": 4.065510337548206e-08, "loss": 0.338, "step": 25980 }, { "epoch": 0.9714699628478631, "grad_norm": 0.5090503096580505, "learning_rate": 4.0127816407696805e-08, "loss": 0.2363, "step": 25985 }, { "epoch": 0.9716568918382129, "grad_norm": 0.3429947793483734, "learning_rate": 3.96039643144408e-08, "loss": 0.263, "step": 25990 }, { "epoch": 0.9718438208285628, "grad_norm": 0.392718642950058, "learning_rate": 3.90835472763762e-08, "loss": 0.2374, "step": 25995 }, { "epoch": 0.9720307498189126, "grad_norm": 0.3899400532245636, "learning_rate": 3.8566565472980545e-08, "loss": 0.29, "step": 26000 }, { "epoch": 0.9722176788092624, "grad_norm": 0.561836302280426, "learning_rate": 3.805301908254455e-08, "loss": 0.2512, "step": 26005 }, { "epoch": 0.9724046077996121, "grad_norm": 0.6161816120147705, "learning_rate": 3.754290828217655e-08, "loss": 0.1973, "step": 26010 }, { "epoch": 0.9725915367899619, "grad_norm": 1.0645028352737427, "learning_rate": 3.7036233247799144e-08, "loss": 0.4018, "step": 26015 }, { "epoch": 0.9727784657803117, "grad_norm": 0.5088971257209778, "learning_rate": 3.6532994154150347e-08, "loss": 0.2651, "step": 26020 }, { "epoch": 0.9729653947706615, "grad_norm": 0.3199444115161896, "learning_rate": 3.603319117478244e-08, "loss": 0.2447, "step": 26025 }, { "epoch": 0.9731523237610112, "grad_norm": 0.3997444808483124, "learning_rate": 3.5536824482061974e-08, "loss": 0.2706, "step": 26030 }, { "epoch": 0.9733392527513611, "grad_norm": 0.22671836614608765, "learning_rate": 3.504389424717314e-08, "loss": 0.3388, "step": 26035 }, { "epoch": 0.9735261817417109, "grad_norm": 0.36706823110580444, "learning_rate": 3.455440064011328e-08, "loss": 0.2259, "step": 26040 }, { "epoch": 0.9737131107320607, "grad_norm": 0.4109925329685211, "learning_rate": 3.406834382969515e-08, "loss": 0.1845, "step": 26045 }, { "epoch": 0.9739000397224105, "grad_norm": 0.47747719287872314, "learning_rate": 3.358572398354465e-08, "loss": 0.2509, "step": 26050 }, { "epoch": 0.9740869687127602, "grad_norm": 0.1783730685710907, "learning_rate": 3.310654126810309e-08, "loss": 0.273, "step": 26055 }, { "epoch": 0.97427389770311, "grad_norm": 0.3882327079772949, "learning_rate": 3.263079584862938e-08, "loss": 0.2216, "step": 26060 }, { "epoch": 0.9744608266934598, "grad_norm": 0.3611951768398285, "learning_rate": 3.2158487889192294e-08, "loss": 0.2725, "step": 26065 }, { "epoch": 0.9746477556838096, "grad_norm": 0.5424373149871826, "learning_rate": 3.168961755267819e-08, "loss": 0.242, "step": 26070 }, { "epoch": 0.9748346846741595, "grad_norm": 0.6144366264343262, "learning_rate": 3.12241850007855e-08, "loss": 0.2658, "step": 26075 }, { "epoch": 0.9750216136645092, "grad_norm": 0.17419812083244324, "learning_rate": 3.076219039403139e-08, "loss": 0.2147, "step": 26080 }, { "epoch": 0.975208542654859, "grad_norm": 0.23372456431388855, "learning_rate": 3.0303633891742844e-08, "loss": 0.2811, "step": 26085 }, { "epoch": 0.9753954716452088, "grad_norm": 0.1994643360376358, "learning_rate": 2.984851565206226e-08, "loss": 0.3354, "step": 26090 }, { "epoch": 0.9755824006355586, "grad_norm": 0.45933806896209717, "learning_rate": 2.9396835831947412e-08, "loss": 0.3562, "step": 26095 }, { "epoch": 0.9757693296259083, "grad_norm": 0.4124797284603119, "learning_rate": 2.8948594587170366e-08, "loss": 0.2709, "step": 26100 }, { "epoch": 0.9759562586162581, "grad_norm": 0.9764533638954163, "learning_rate": 2.850379207231746e-08, "loss": 0.4274, "step": 26105 }, { "epoch": 0.9761431876066079, "grad_norm": 0.3561939001083374, "learning_rate": 2.8062428440785994e-08, "loss": 0.2174, "step": 26110 }, { "epoch": 0.9763301165969577, "grad_norm": 0.35872069001197815, "learning_rate": 2.7624503844790872e-08, "loss": 0.2391, "step": 26115 }, { "epoch": 0.9765170455873076, "grad_norm": 0.20135623216629028, "learning_rate": 2.7190018435360178e-08, "loss": 0.2471, "step": 26120 }, { "epoch": 0.9767039745776573, "grad_norm": 0.866783082485199, "learning_rate": 2.6758972362334046e-08, "loss": 0.3307, "step": 26125 }, { "epoch": 0.9768909035680071, "grad_norm": 0.34653496742248535, "learning_rate": 2.633136577436912e-08, "loss": 0.2525, "step": 26130 }, { "epoch": 0.9770778325583569, "grad_norm": 0.3317651152610779, "learning_rate": 2.590719881893522e-08, "loss": 0.2919, "step": 26135 }, { "epoch": 0.9772647615487067, "grad_norm": 0.22476986050605774, "learning_rate": 2.5486471642314215e-08, "loss": 0.3969, "step": 26140 }, { "epoch": 0.9774516905390565, "grad_norm": 0.3792192339897156, "learning_rate": 2.5069184389602264e-08, "loss": 0.2839, "step": 26145 }, { "epoch": 0.9776386195294062, "grad_norm": 0.5724767446517944, "learning_rate": 2.4655337204712027e-08, "loss": 0.2301, "step": 26150 }, { "epoch": 0.977825548519756, "grad_norm": 0.30545639991760254, "learning_rate": 2.4244930230364894e-08, "loss": 0.2735, "step": 26155 }, { "epoch": 0.9780124775101059, "grad_norm": 0.5367255210876465, "learning_rate": 2.3837963608100977e-08, "loss": 0.2457, "step": 26160 }, { "epoch": 0.9781994065004557, "grad_norm": 0.431430459022522, "learning_rate": 2.3434437478269124e-08, "loss": 0.2527, "step": 26165 }, { "epoch": 0.9783863354908054, "grad_norm": 0.5572644472122192, "learning_rate": 2.3034351980035784e-08, "loss": 0.2407, "step": 26170 }, { "epoch": 0.9785732644811552, "grad_norm": 0.43069931864738464, "learning_rate": 2.263770725137837e-08, "loss": 0.247, "step": 26175 }, { "epoch": 0.978760193471505, "grad_norm": 0.4567778706550598, "learning_rate": 2.224450342908746e-08, "loss": 0.2365, "step": 26180 }, { "epoch": 0.9789471224618548, "grad_norm": 0.3975851535797119, "learning_rate": 2.1854740648769023e-08, "loss": 0.3065, "step": 26185 }, { "epoch": 0.9791340514522046, "grad_norm": 0.3134264647960663, "learning_rate": 2.1468419044839984e-08, "loss": 0.296, "step": 26190 }, { "epoch": 0.9793209804425543, "grad_norm": 0.27878788113594055, "learning_rate": 2.1085538750531542e-08, "loss": 0.3218, "step": 26195 }, { "epoch": 0.9795079094329042, "grad_norm": 0.6269852519035339, "learning_rate": 2.0706099897890296e-08, "loss": 0.3444, "step": 26200 }, { "epoch": 0.979694838423254, "grad_norm": 0.38990554213523865, "learning_rate": 2.0330102617771575e-08, "loss": 0.2459, "step": 26205 }, { "epoch": 0.9798817674136038, "grad_norm": 0.3752122223377228, "learning_rate": 1.995754703984609e-08, "loss": 0.2583, "step": 26210 }, { "epoch": 0.9800686964039536, "grad_norm": 0.6185774207115173, "learning_rate": 1.9588433292598852e-08, "loss": 0.2597, "step": 26215 }, { "epoch": 0.9802556253943033, "grad_norm": 0.8538332581520081, "learning_rate": 1.9222761503325803e-08, "loss": 0.3271, "step": 26220 }, { "epoch": 0.9804425543846531, "grad_norm": 0.39858320355415344, "learning_rate": 1.886053179813718e-08, "loss": 0.2141, "step": 26225 }, { "epoch": 0.9806294833750029, "grad_norm": 0.4068199396133423, "learning_rate": 1.850174430195528e-08, "loss": 0.237, "step": 26230 }, { "epoch": 0.9808164123653527, "grad_norm": 0.457328200340271, "learning_rate": 1.814639913851557e-08, "loss": 0.3351, "step": 26235 }, { "epoch": 0.9810033413557026, "grad_norm": 0.2778027653694153, "learning_rate": 1.779449643036668e-08, "loss": 0.3103, "step": 26240 }, { "epoch": 0.9811902703460523, "grad_norm": 0.6216102242469788, "learning_rate": 1.7446036298869316e-08, "loss": 0.2385, "step": 26245 }, { "epoch": 0.9813771993364021, "grad_norm": 0.33470118045806885, "learning_rate": 1.710101886419735e-08, "loss": 0.3129, "step": 26250 }, { "epoch": 0.9815641283267519, "grad_norm": 0.37651658058166504, "learning_rate": 1.6759444245338928e-08, "loss": 0.2858, "step": 26255 }, { "epoch": 0.9817510573171017, "grad_norm": 0.3364080488681793, "learning_rate": 1.642131256009094e-08, "loss": 0.3334, "step": 26260 }, { "epoch": 0.9819379863074514, "grad_norm": 0.34844186902046204, "learning_rate": 1.608662392506677e-08, "loss": 0.3211, "step": 26265 }, { "epoch": 0.9821249152978012, "grad_norm": 0.31410226225852966, "learning_rate": 1.575537845569075e-08, "loss": 0.2541, "step": 26270 }, { "epoch": 0.982311844288151, "grad_norm": 0.6750185489654541, "learning_rate": 1.5427576266200394e-08, "loss": 0.2703, "step": 26275 }, { "epoch": 0.9824987732785009, "grad_norm": 0.7728235721588135, "learning_rate": 1.510321746964416e-08, "loss": 0.3094, "step": 26280 }, { "epoch": 0.9826857022688507, "grad_norm": 0.7537628412246704, "learning_rate": 1.4782302177884789e-08, "loss": 0.2852, "step": 26285 }, { "epoch": 0.9828726312592004, "grad_norm": 0.5353776812553406, "learning_rate": 1.4464830501597082e-08, "loss": 0.3261, "step": 26290 }, { "epoch": 0.9830595602495502, "grad_norm": 0.6199917793273926, "learning_rate": 1.4150802550267905e-08, "loss": 0.2453, "step": 26295 }, { "epoch": 0.9832464892399, "grad_norm": 0.15641337633132935, "learning_rate": 1.3840218432195074e-08, "loss": 0.2621, "step": 26300 }, { "epoch": 0.9834334182302498, "grad_norm": 0.38754504919052124, "learning_rate": 1.3533078254492905e-08, "loss": 0.2353, "step": 26305 }, { "epoch": 0.9836203472205995, "grad_norm": 0.6075884699821472, "learning_rate": 1.3229382123082223e-08, "loss": 0.3138, "step": 26310 }, { "epoch": 0.9838072762109493, "grad_norm": 0.46125859022140503, "learning_rate": 1.2929130142700363e-08, "loss": 0.2389, "step": 26315 }, { "epoch": 0.9839942052012992, "grad_norm": 0.44463780522346497, "learning_rate": 1.2632322416896715e-08, "loss": 0.2877, "step": 26320 }, { "epoch": 0.984181134191649, "grad_norm": 0.4823720455169678, "learning_rate": 1.2338959048029398e-08, "loss": 0.2585, "step": 26325 }, { "epoch": 0.9843680631819988, "grad_norm": 0.3346082866191864, "learning_rate": 1.2049040137273038e-08, "loss": 0.2628, "step": 26330 }, { "epoch": 0.9845549921723485, "grad_norm": 0.7445553541183472, "learning_rate": 1.1762565784612101e-08, "loss": 0.2554, "step": 26335 }, { "epoch": 0.9847419211626983, "grad_norm": 0.5069872736930847, "learning_rate": 1.1479536088843112e-08, "loss": 0.3881, "step": 26340 }, { "epoch": 0.9849288501530481, "grad_norm": 0.4112212061882019, "learning_rate": 1.1199951147574661e-08, "loss": 0.277, "step": 26345 }, { "epoch": 0.9851157791433979, "grad_norm": 0.4882351756095886, "learning_rate": 1.0923811057227396e-08, "loss": 0.2854, "step": 26350 }, { "epoch": 0.9853027081337477, "grad_norm": 0.37586453557014465, "learning_rate": 1.0651115913035137e-08, "loss": 0.2478, "step": 26355 }, { "epoch": 0.9854896371240974, "grad_norm": 1.524063229560852, "learning_rate": 1.0381865809040437e-08, "loss": 0.3372, "step": 26360 }, { "epoch": 0.9856765661144473, "grad_norm": 0.4818149209022522, "learning_rate": 1.0116060838103458e-08, "loss": 0.2763, "step": 26365 }, { "epoch": 0.9858634951047971, "grad_norm": 0.2843870222568512, "learning_rate": 9.853701091888656e-09, "loss": 0.2549, "step": 26370 }, { "epoch": 0.9860504240951469, "grad_norm": 0.44535329937934875, "learning_rate": 9.594786660880317e-09, "loss": 0.3165, "step": 26375 }, { "epoch": 0.9862373530854966, "grad_norm": 0.2467801719903946, "learning_rate": 9.339317634367017e-09, "loss": 0.225, "step": 26380 }, { "epoch": 0.9864242820758464, "grad_norm": 0.4127419590950012, "learning_rate": 9.087294100456056e-09, "loss": 0.2825, "step": 26385 }, { "epoch": 0.9866112110661962, "grad_norm": 0.4206787645816803, "learning_rate": 8.838716146060134e-09, "loss": 0.2727, "step": 26390 }, { "epoch": 0.986798140056546, "grad_norm": 0.5794749855995178, "learning_rate": 8.59358385690956e-09, "loss": 0.3581, "step": 26395 }, { "epoch": 0.9869850690468958, "grad_norm": 0.37697890400886536, "learning_rate": 8.351897317541157e-09, "loss": 0.271, "step": 26400 }, { "epoch": 0.9871719980372456, "grad_norm": 0.5611318945884705, "learning_rate": 8.11365661130603e-09, "loss": 0.3294, "step": 26405 }, { "epoch": 0.9873589270275954, "grad_norm": 0.2812879979610443, "learning_rate": 7.878861820367345e-09, "loss": 0.3604, "step": 26410 }, { "epoch": 0.9875458560179452, "grad_norm": 0.3695289194583893, "learning_rate": 7.647513025698105e-09, "loss": 0.2133, "step": 26415 }, { "epoch": 0.987732785008295, "grad_norm": 0.46063148975372314, "learning_rate": 7.4196103070856005e-09, "loss": 0.2794, "step": 26420 }, { "epoch": 0.9879197139986448, "grad_norm": 0.3603619337081909, "learning_rate": 7.195153743124739e-09, "loss": 0.2773, "step": 26425 }, { "epoch": 0.9881066429889945, "grad_norm": 0.515447735786438, "learning_rate": 6.97414341122582e-09, "loss": 0.3515, "step": 26430 }, { "epoch": 0.9882935719793443, "grad_norm": 0.4539223611354828, "learning_rate": 6.756579387607875e-09, "loss": 0.2721, "step": 26435 }, { "epoch": 0.9884805009696941, "grad_norm": 0.927209734916687, "learning_rate": 6.542461747304218e-09, "loss": 0.2898, "step": 26440 }, { "epoch": 0.988667429960044, "grad_norm": 0.33204635977745056, "learning_rate": 6.331790564155782e-09, "loss": 0.3234, "step": 26445 }, { "epoch": 0.9888543589503938, "grad_norm": 0.4017919898033142, "learning_rate": 6.124565910818891e-09, "loss": 0.2895, "step": 26450 }, { "epoch": 0.9890412879407435, "grad_norm": 0.3101350665092468, "learning_rate": 5.9207878587574926e-09, "loss": 0.2818, "step": 26455 }, { "epoch": 0.9892282169310933, "grad_norm": 0.38816362619400024, "learning_rate": 5.7204564782498136e-09, "loss": 0.2629, "step": 26460 }, { "epoch": 0.9894151459214431, "grad_norm": 0.49870437383651733, "learning_rate": 5.523571838385034e-09, "loss": 0.3173, "step": 26465 }, { "epoch": 0.9896020749117929, "grad_norm": 0.5789703726768494, "learning_rate": 5.330134007062171e-09, "loss": 0.2963, "step": 26470 }, { "epoch": 0.9897890039021426, "grad_norm": 0.3744274079799652, "learning_rate": 5.1401430509923075e-09, "loss": 0.3257, "step": 26475 }, { "epoch": 0.9899759328924924, "grad_norm": 0.4980723559856415, "learning_rate": 4.953599035697476e-09, "loss": 0.2974, "step": 26480 }, { "epoch": 0.9901628618828423, "grad_norm": 0.71192467212677, "learning_rate": 4.77050202551288e-09, "loss": 0.265, "step": 26485 }, { "epoch": 0.9903497908731921, "grad_norm": 0.3702625036239624, "learning_rate": 4.590852083582453e-09, "loss": 0.2456, "step": 26490 }, { "epoch": 0.9905367198635419, "grad_norm": 0.459648996591568, "learning_rate": 4.414649271863303e-09, "loss": 0.2939, "step": 26495 }, { "epoch": 0.9907236488538916, "grad_norm": 0.2367364466190338, "learning_rate": 4.241893651120155e-09, "loss": 0.2624, "step": 26500 }, { "epoch": 0.9909105778442414, "grad_norm": 0.43819648027420044, "learning_rate": 4.0725852809342384e-09, "loss": 0.345, "step": 26505 }, { "epoch": 0.9910975068345912, "grad_norm": 0.3360496461391449, "learning_rate": 3.906724219694402e-09, "loss": 0.2522, "step": 26510 }, { "epoch": 0.991284435824941, "grad_norm": 0.2503529191017151, "learning_rate": 3.744310524600447e-09, "loss": 0.2847, "step": 26515 }, { "epoch": 0.9914713648152907, "grad_norm": 0.49219122529029846, "learning_rate": 3.585344251665346e-09, "loss": 0.2354, "step": 26520 }, { "epoch": 0.9916582938056406, "grad_norm": 0.42718371748924255, "learning_rate": 3.4298254557108e-09, "loss": 0.3146, "step": 26525 }, { "epoch": 0.9918452227959904, "grad_norm": 0.6982577443122864, "learning_rate": 3.2777541903716845e-09, "loss": 0.2527, "step": 26530 }, { "epoch": 0.9920321517863402, "grad_norm": 0.5282479524612427, "learning_rate": 3.129130508092715e-09, "loss": 0.2256, "step": 26535 }, { "epoch": 0.99221908077669, "grad_norm": 0.24284540116786957, "learning_rate": 2.983954460130667e-09, "loss": 0.3069, "step": 26540 }, { "epoch": 0.9924060097670397, "grad_norm": 0.4188280403614044, "learning_rate": 2.8422260965510485e-09, "loss": 0.303, "step": 26545 }, { "epoch": 0.9925929387573895, "grad_norm": 0.8391254544258118, "learning_rate": 2.7039454662336484e-09, "loss": 0.2957, "step": 26550 }, { "epoch": 0.9927798677477393, "grad_norm": 0.3748670518398285, "learning_rate": 2.5691126168669866e-09, "loss": 0.2781, "step": 26555 }, { "epoch": 0.9929667967380891, "grad_norm": 0.44531556963920593, "learning_rate": 2.437727594949424e-09, "loss": 0.3105, "step": 26560 }, { "epoch": 0.993153725728439, "grad_norm": 0.43134966492652893, "learning_rate": 2.309790445794713e-09, "loss": 0.2255, "step": 26565 }, { "epoch": 0.9933406547187887, "grad_norm": 0.471179723739624, "learning_rate": 2.1853012135220065e-09, "loss": 0.302, "step": 26570 }, { "epoch": 0.9935275837091385, "grad_norm": 0.3568282425403595, "learning_rate": 2.0642599410658493e-09, "loss": 0.2448, "step": 26575 }, { "epoch": 0.9937145126994883, "grad_norm": 0.5308835506439209, "learning_rate": 1.946666670169517e-09, "loss": 0.2497, "step": 26580 }, { "epoch": 0.9939014416898381, "grad_norm": 0.5474340319633484, "learning_rate": 1.832521441387236e-09, "loss": 0.3189, "step": 26585 }, { "epoch": 0.9940883706801878, "grad_norm": 0.31903812289237976, "learning_rate": 1.7218242940841845e-09, "loss": 0.2821, "step": 26590 }, { "epoch": 0.9942752996705376, "grad_norm": 0.33196452260017395, "learning_rate": 1.6145752664376014e-09, "loss": 0.3051, "step": 26595 }, { "epoch": 0.9944622286608874, "grad_norm": 0.7084635496139526, "learning_rate": 1.5107743954334565e-09, "loss": 0.1976, "step": 26600 }, { "epoch": 0.9946491576512372, "grad_norm": 0.3358515799045563, "learning_rate": 1.4104217168708911e-09, "loss": 0.2774, "step": 26605 }, { "epoch": 0.9948360866415871, "grad_norm": 0.46138903498649597, "learning_rate": 1.3135172653577777e-09, "loss": 0.2587, "step": 26610 }, { "epoch": 0.9950230156319368, "grad_norm": 0.47722405195236206, "learning_rate": 1.2200610743129394e-09, "loss": 0.2649, "step": 26615 }, { "epoch": 0.9952099446222866, "grad_norm": 0.30284929275512695, "learning_rate": 1.1300531759694811e-09, "loss": 0.302, "step": 26620 }, { "epoch": 0.9953968736126364, "grad_norm": 0.768482506275177, "learning_rate": 1.043493601365908e-09, "loss": 0.2393, "step": 26625 }, { "epoch": 0.9955838026029862, "grad_norm": 0.6742646098136902, "learning_rate": 9.60382380355007e-10, "loss": 0.2402, "step": 26630 }, { "epoch": 0.995770731593336, "grad_norm": 0.5979500412940979, "learning_rate": 8.807195415994063e-10, "loss": 0.3245, "step": 26635 }, { "epoch": 0.9959576605836857, "grad_norm": 1.0489237308502197, "learning_rate": 8.045051125726844e-10, "loss": 0.313, "step": 26640 }, { "epoch": 0.9961445895740355, "grad_norm": 0.5769110918045044, "learning_rate": 7.317391195593714e-10, "loss": 0.3009, "step": 26645 }, { "epoch": 0.9963315185643854, "grad_norm": 0.7712821364402771, "learning_rate": 6.624215876538387e-10, "loss": 0.262, "step": 26650 }, { "epoch": 0.9965184475547352, "grad_norm": 0.4885789752006531, "learning_rate": 5.965525407614081e-10, "loss": 0.2446, "step": 26655 }, { "epoch": 0.996705376545085, "grad_norm": 0.2925719916820526, "learning_rate": 5.341320015994633e-10, "loss": 0.2598, "step": 26660 }, { "epoch": 0.9968923055354347, "grad_norm": 0.22376123070716858, "learning_rate": 4.751599916941185e-10, "loss": 0.2542, "step": 26665 }, { "epoch": 0.9970792345257845, "grad_norm": 0.45399028062820435, "learning_rate": 4.1963653138354933e-10, "loss": 0.257, "step": 26670 }, { "epoch": 0.9972661635161343, "grad_norm": 0.3218282461166382, "learning_rate": 3.675616398157722e-10, "loss": 0.308, "step": 26675 }, { "epoch": 0.9974530925064841, "grad_norm": 0.3195207118988037, "learning_rate": 3.1893533495086505e-10, "loss": 0.2604, "step": 26680 }, { "epoch": 0.9976400214968338, "grad_norm": 0.31073522567749023, "learning_rate": 2.737576335576364e-10, "loss": 0.3143, "step": 26685 }, { "epoch": 0.9978269504871837, "grad_norm": 0.3902645409107208, "learning_rate": 2.320285512169562e-10, "loss": 0.2869, "step": 26690 }, { "epoch": 0.9980138794775335, "grad_norm": 0.3908766806125641, "learning_rate": 1.9374810231953533e-10, "loss": 0.2434, "step": 26695 }, { "epoch": 0.9982008084678833, "grad_norm": 0.5233207941055298, "learning_rate": 1.5891630006814595e-10, "loss": 0.29, "step": 26700 }, { "epoch": 0.9983877374582331, "grad_norm": 0.7636908292770386, "learning_rate": 1.275331564754012e-10, "loss": 0.3399, "step": 26705 }, { "epoch": 0.9985746664485828, "grad_norm": 0.21469980478286743, "learning_rate": 9.959868236375514e-11, "loss": 0.2314, "step": 26710 }, { "epoch": 0.9987615954389326, "grad_norm": 0.3713749349117279, "learning_rate": 7.511288736661293e-11, "loss": 0.3933, "step": 26715 }, { "epoch": 0.9989485244292824, "grad_norm": 0.4366331994533539, "learning_rate": 5.407577992944113e-11, "loss": 0.2541, "step": 26720 }, { "epoch": 0.9991354534196322, "grad_norm": 0.4484507739543915, "learning_rate": 3.648736730643698e-11, "loss": 0.2271, "step": 26725 }, { "epoch": 0.9993223824099821, "grad_norm": 0.3740657567977905, "learning_rate": 2.2347655563859096e-11, "loss": 0.3167, "step": 26730 }, { "epoch": 0.9995093114003318, "grad_norm": 0.5398233532905579, "learning_rate": 1.165664957780699e-11, "loss": 0.2958, "step": 26735 }, { "epoch": 0.9996962403906816, "grad_norm": 0.46718209981918335, "learning_rate": 4.414353035331331e-12, "loss": 0.2451, "step": 26740 }, { "epoch": 0.9998831693810314, "grad_norm": 0.2932209074497223, "learning_rate": 6.207684344339271e-13, "loss": 0.2126, "step": 26745 }, { "epoch": 0.9999953267752413, "step": 26748, "total_flos": 5.335451483137671e+18, "train_loss": 0.2982786745278668, "train_runtime": 62383.2691, "train_samples_per_second": 3.43, "train_steps_per_second": 0.429 } ], "logging_steps": 5, "max_steps": 26748, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.335451483137671e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }