diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,63112 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999889013440473, + "eval_steps": 500, + "global_step": 45050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001109865595276412, + "grad_norm": 3.869100332260132, + "learning_rate": 1.9999999392116768e-05, + "loss": 1.2, + "step": 5 + }, + { + "epoch": 0.0002219731190552824, + "grad_norm": 4.66916561126709, + "learning_rate": 1.9999997568467152e-05, + "loss": 0.9035, + "step": 10 + }, + { + "epoch": 0.0003329596785829236, + "grad_norm": 1.8067188262939453, + "learning_rate": 1.9999994529051363e-05, + "loss": 0.8638, + "step": 15 + }, + { + "epoch": 0.0004439462381105648, + "grad_norm": 1.7449718713760376, + "learning_rate": 1.999999027386978e-05, + "loss": 0.7258, + "step": 20 + }, + { + "epoch": 0.000554932797638206, + "grad_norm": 2.081190586090088, + "learning_rate": 1.9999984802922915e-05, + "loss": 0.7764, + "step": 25 + }, + { + "epoch": 0.0006659193571658472, + "grad_norm": 4.897317886352539, + "learning_rate": 1.9999978116211434e-05, + "loss": 0.8836, + "step": 30 + }, + { + "epoch": 0.0007769059166934884, + "grad_norm": 2.0032882690429688, + "learning_rate": 1.9999970213736153e-05, + "loss": 0.7637, + "step": 35 + }, + { + "epoch": 0.0008878924762211296, + "grad_norm": 1.2999836206436157, + "learning_rate": 1.999996109549803e-05, + "loss": 0.6827, + "step": 40 + }, + { + "epoch": 0.0009988790357487707, + "grad_norm": 2.4639241695404053, + "learning_rate": 1.9999950761498176e-05, + "loss": 0.8187, + "step": 45 + }, + { + "epoch": 0.001109865595276412, + "grad_norm": 1.5862199068069458, + "learning_rate": 1.9999939211737842e-05, + "loss": 0.826, + "step": 50 + }, + { + "epoch": 0.0012208521548040532, + "grad_norm": 2.156992197036743, + "learning_rate": 1.999992644621844e-05, + "loss": 0.6859, + "step": 55 + }, + { + "epoch": 0.0013318387143316944, + "grad_norm": 1.9007676839828491, + "learning_rate": 1.9999912464941515e-05, + "loss": 0.8637, + "step": 60 + }, + { + "epoch": 0.0014428252738593356, + "grad_norm": 1.9827724695205688, + "learning_rate": 1.999989726790877e-05, + "loss": 0.8192, + "step": 65 + }, + { + "epoch": 0.0015538118333869768, + "grad_norm": 1.9432623386383057, + "learning_rate": 1.999988085512205e-05, + "loss": 0.7394, + "step": 70 + }, + { + "epoch": 0.001664798392914618, + "grad_norm": 1.9758037328720093, + "learning_rate": 1.9999863226583357e-05, + "loss": 0.5363, + "step": 75 + }, + { + "epoch": 0.0017757849524422593, + "grad_norm": 1.2889810800552368, + "learning_rate": 1.9999844382294827e-05, + "loss": 0.7151, + "step": 80 + }, + { + "epoch": 0.0018867715119699005, + "grad_norm": 1.4423288106918335, + "learning_rate": 1.9999824322258754e-05, + "loss": 0.7741, + "step": 85 + }, + { + "epoch": 0.0019977580714975415, + "grad_norm": 1.5920696258544922, + "learning_rate": 1.9999803046477578e-05, + "loss": 0.7286, + "step": 90 + }, + { + "epoch": 0.0021087446310251827, + "grad_norm": 1.4489409923553467, + "learning_rate": 1.9999780554953886e-05, + "loss": 0.6861, + "step": 95 + }, + { + "epoch": 0.002219731190552824, + "grad_norm": 1.9922000169754028, + "learning_rate": 1.9999756847690408e-05, + "loss": 0.6803, + "step": 100 + }, + { + "epoch": 0.002330717750080465, + "grad_norm": 1.5229793787002563, + "learning_rate": 1.9999731924690028e-05, + "loss": 0.7924, + "step": 105 + }, + { + "epoch": 0.0024417043096081063, + "grad_norm": 1.939287543296814, + "learning_rate": 1.999970578595578e-05, + "loss": 0.7265, + "step": 110 + }, + { + "epoch": 0.0025526908691357476, + "grad_norm": 2.4952993392944336, + "learning_rate": 1.999967843149084e-05, + "loss": 0.66, + "step": 115 + }, + { + "epoch": 0.002663677428663389, + "grad_norm": 1.9537633657455444, + "learning_rate": 1.999964986129853e-05, + "loss": 0.6382, + "step": 120 + }, + { + "epoch": 0.00277466398819103, + "grad_norm": 1.8744527101516724, + "learning_rate": 1.9999620075382326e-05, + "loss": 0.7525, + "step": 125 + }, + { + "epoch": 0.0028856505477186712, + "grad_norm": 2.0237603187561035, + "learning_rate": 1.999958907374585e-05, + "loss": 0.7356, + "step": 130 + }, + { + "epoch": 0.0029966371072463124, + "grad_norm": 1.49530827999115, + "learning_rate": 1.999955685639287e-05, + "loss": 0.5226, + "step": 135 + }, + { + "epoch": 0.0031076236667739537, + "grad_norm": 2.271005153656006, + "learning_rate": 1.9999523423327304e-05, + "loss": 0.7534, + "step": 140 + }, + { + "epoch": 0.003218610226301595, + "grad_norm": 1.3122650384902954, + "learning_rate": 1.9999488774553213e-05, + "loss": 0.6388, + "step": 145 + }, + { + "epoch": 0.003329596785829236, + "grad_norm": 2.2374744415283203, + "learning_rate": 1.9999452910074817e-05, + "loss": 0.637, + "step": 150 + }, + { + "epoch": 0.0034405833453568773, + "grad_norm": 1.688567042350769, + "learning_rate": 1.9999415829896466e-05, + "loss": 0.6446, + "step": 155 + }, + { + "epoch": 0.0035515699048845185, + "grad_norm": 1.5541659593582153, + "learning_rate": 1.999937753402268e-05, + "loss": 0.744, + "step": 160 + }, + { + "epoch": 0.0036625564644121597, + "grad_norm": 1.51518976688385, + "learning_rate": 1.9999338022458107e-05, + "loss": 0.6366, + "step": 165 + }, + { + "epoch": 0.003773543023939801, + "grad_norm": 2.0031962394714355, + "learning_rate": 1.999929729520755e-05, + "loss": 0.6538, + "step": 170 + }, + { + "epoch": 0.003884529583467442, + "grad_norm": 2.005131721496582, + "learning_rate": 1.9999255352275965e-05, + "loss": 0.5916, + "step": 175 + }, + { + "epoch": 0.003995516142995083, + "grad_norm": 1.7754985094070435, + "learning_rate": 1.9999212193668448e-05, + "loss": 0.6784, + "step": 180 + }, + { + "epoch": 0.004106502702522725, + "grad_norm": 1.4435465335845947, + "learning_rate": 1.999916781939025e-05, + "loss": 0.7264, + "step": 185 + }, + { + "epoch": 0.004217489262050365, + "grad_norm": 1.649026870727539, + "learning_rate": 1.9999122229446758e-05, + "loss": 0.6314, + "step": 190 + }, + { + "epoch": 0.004328475821578007, + "grad_norm": 2.0196125507354736, + "learning_rate": 1.9999075423843525e-05, + "loss": 0.7619, + "step": 195 + }, + { + "epoch": 0.004439462381105648, + "grad_norm": 1.6189427375793457, + "learning_rate": 1.9999027402586235e-05, + "loss": 0.7378, + "step": 200 + }, + { + "epoch": 0.0045504489406332895, + "grad_norm": 1.5108414888381958, + "learning_rate": 1.9998978165680728e-05, + "loss": 0.558, + "step": 205 + }, + { + "epoch": 0.00466143550016093, + "grad_norm": 1.4623931646347046, + "learning_rate": 1.9998927713132986e-05, + "loss": 0.8298, + "step": 210 + }, + { + "epoch": 0.004772422059688572, + "grad_norm": 1.4418224096298218, + "learning_rate": 1.999887604494915e-05, + "loss": 0.5532, + "step": 215 + }, + { + "epoch": 0.004883408619216213, + "grad_norm": 1.648908257484436, + "learning_rate": 1.9998823161135498e-05, + "loss": 0.6328, + "step": 220 + }, + { + "epoch": 0.004994395178743854, + "grad_norm": 1.5702770948410034, + "learning_rate": 1.9998769061698457e-05, + "loss": 0.658, + "step": 225 + }, + { + "epoch": 0.005105381738271495, + "grad_norm": 1.683289647102356, + "learning_rate": 1.9998713746644606e-05, + "loss": 0.565, + "step": 230 + }, + { + "epoch": 0.005216368297799137, + "grad_norm": 2.054819345474243, + "learning_rate": 1.9998657215980674e-05, + "loss": 0.7035, + "step": 235 + }, + { + "epoch": 0.005327354857326778, + "grad_norm": 1.2517290115356445, + "learning_rate": 1.9998599469713528e-05, + "loss": 0.5752, + "step": 240 + }, + { + "epoch": 0.005438341416854419, + "grad_norm": 2.055407762527466, + "learning_rate": 1.9998540507850195e-05, + "loss": 0.7533, + "step": 245 + }, + { + "epoch": 0.00554932797638206, + "grad_norm": 1.7456523180007935, + "learning_rate": 1.9998480330397836e-05, + "loss": 0.5713, + "step": 250 + }, + { + "epoch": 0.005660314535909702, + "grad_norm": 1.9250489473342896, + "learning_rate": 1.999841893736377e-05, + "loss": 0.8103, + "step": 255 + }, + { + "epoch": 0.0057713010954373424, + "grad_norm": 1.3744711875915527, + "learning_rate": 1.9998356328755467e-05, + "loss": 0.6879, + "step": 260 + }, + { + "epoch": 0.005882287654964984, + "grad_norm": 1.9396206140518188, + "learning_rate": 1.9998292504580528e-05, + "loss": 0.5112, + "step": 265 + }, + { + "epoch": 0.005993274214492625, + "grad_norm": 1.8470498323440552, + "learning_rate": 1.999822746484672e-05, + "loss": 0.8324, + "step": 270 + }, + { + "epoch": 0.0061042607740202665, + "grad_norm": 1.656162142753601, + "learning_rate": 1.999816120956195e-05, + "loss": 0.7627, + "step": 275 + }, + { + "epoch": 0.006215247333547907, + "grad_norm": 1.5918843746185303, + "learning_rate": 1.9998093738734267e-05, + "loss": 0.5311, + "step": 280 + }, + { + "epoch": 0.006326233893075549, + "grad_norm": 1.779537558555603, + "learning_rate": 1.999802505237188e-05, + "loss": 0.8021, + "step": 285 + }, + { + "epoch": 0.00643722045260319, + "grad_norm": 1.4962438344955444, + "learning_rate": 1.9997955150483142e-05, + "loss": 0.6775, + "step": 290 + }, + { + "epoch": 0.0065482070121308305, + "grad_norm": 1.7058830261230469, + "learning_rate": 1.9997884033076542e-05, + "loss": 0.6195, + "step": 295 + }, + { + "epoch": 0.006659193571658472, + "grad_norm": 1.3699376583099365, + "learning_rate": 1.9997811700160735e-05, + "loss": 0.5338, + "step": 300 + }, + { + "epoch": 0.006770180131186113, + "grad_norm": 20.352764129638672, + "learning_rate": 1.999773815174451e-05, + "loss": 0.7159, + "step": 305 + }, + { + "epoch": 0.006881166690713755, + "grad_norm": 1.6667920351028442, + "learning_rate": 1.999766338783681e-05, + "loss": 0.6633, + "step": 310 + }, + { + "epoch": 0.006992153250241395, + "grad_norm": 1.438814401626587, + "learning_rate": 1.9997587408446725e-05, + "loss": 0.6731, + "step": 315 + }, + { + "epoch": 0.007103139809769037, + "grad_norm": 1.7060329914093018, + "learning_rate": 1.999751021358349e-05, + "loss": 0.8527, + "step": 320 + }, + { + "epoch": 0.007214126369296678, + "grad_norm": 1.8527767658233643, + "learning_rate": 1.9997431803256496e-05, + "loss": 0.673, + "step": 325 + }, + { + "epoch": 0.0073251129288243195, + "grad_norm": 2.1131327152252197, + "learning_rate": 1.9997352177475274e-05, + "loss": 0.6467, + "step": 330 + }, + { + "epoch": 0.00743609948835196, + "grad_norm": 1.617082118988037, + "learning_rate": 1.99972713362495e-05, + "loss": 0.4799, + "step": 335 + }, + { + "epoch": 0.007547086047879602, + "grad_norm": 1.6008981466293335, + "learning_rate": 1.9997189279589003e-05, + "loss": 0.5518, + "step": 340 + }, + { + "epoch": 0.007658072607407243, + "grad_norm": 1.9231702089309692, + "learning_rate": 1.9997106007503765e-05, + "loss": 0.5805, + "step": 345 + }, + { + "epoch": 0.007769059166934884, + "grad_norm": 2.0241827964782715, + "learning_rate": 1.9997021520003903e-05, + "loss": 0.806, + "step": 350 + }, + { + "epoch": 0.007880045726462526, + "grad_norm": 1.3821715116500854, + "learning_rate": 1.9996935817099695e-05, + "loss": 0.7182, + "step": 355 + }, + { + "epoch": 0.007991032285990166, + "grad_norm": 1.4263622760772705, + "learning_rate": 1.9996848898801554e-05, + "loss": 0.7668, + "step": 360 + }, + { + "epoch": 0.008102018845517808, + "grad_norm": 1.726434588432312, + "learning_rate": 1.9996760765120052e-05, + "loss": 0.7493, + "step": 365 + }, + { + "epoch": 0.00821300540504545, + "grad_norm": 1.2283848524093628, + "learning_rate": 1.9996671416065906e-05, + "loss": 0.6539, + "step": 370 + }, + { + "epoch": 0.008323991964573091, + "grad_norm": 1.572723627090454, + "learning_rate": 1.999658085164997e-05, + "loss": 0.6009, + "step": 375 + }, + { + "epoch": 0.00843497852410073, + "grad_norm": 1.8220628499984741, + "learning_rate": 1.9996489071883265e-05, + "loss": 0.6171, + "step": 380 + }, + { + "epoch": 0.008545965083628372, + "grad_norm": 3.054668664932251, + "learning_rate": 1.9996396076776943e-05, + "loss": 0.7583, + "step": 385 + }, + { + "epoch": 0.008656951643156014, + "grad_norm": 10.986349105834961, + "learning_rate": 1.999630186634231e-05, + "loss": 0.6993, + "step": 390 + }, + { + "epoch": 0.008767938202683656, + "grad_norm": 1.3439058065414429, + "learning_rate": 1.999620644059082e-05, + "loss": 0.593, + "step": 395 + }, + { + "epoch": 0.008878924762211296, + "grad_norm": 1.6584223508834839, + "learning_rate": 1.999610979953408e-05, + "loss": 0.7222, + "step": 400 + }, + { + "epoch": 0.008989911321738937, + "grad_norm": 1.4647761583328247, + "learning_rate": 1.999601194318383e-05, + "loss": 0.7219, + "step": 405 + }, + { + "epoch": 0.009100897881266579, + "grad_norm": 1.648807168006897, + "learning_rate": 1.9995912871551975e-05, + "loss": 0.5759, + "step": 410 + }, + { + "epoch": 0.00921188444079422, + "grad_norm": 1.1908730268478394, + "learning_rate": 1.9995812584650555e-05, + "loss": 0.4634, + "step": 415 + }, + { + "epoch": 0.00932287100032186, + "grad_norm": 1.2616108655929565, + "learning_rate": 1.9995711082491765e-05, + "loss": 0.6941, + "step": 420 + }, + { + "epoch": 0.009433857559849502, + "grad_norm": 1.5647404193878174, + "learning_rate": 1.9995608365087945e-05, + "loss": 0.6583, + "step": 425 + }, + { + "epoch": 0.009544844119377144, + "grad_norm": 1.7183318138122559, + "learning_rate": 1.9995504432451583e-05, + "loss": 0.7446, + "step": 430 + }, + { + "epoch": 0.009655830678904785, + "grad_norm": 1.254790186882019, + "learning_rate": 1.9995399284595314e-05, + "loss": 0.7101, + "step": 435 + }, + { + "epoch": 0.009766817238432425, + "grad_norm": 1.774034023284912, + "learning_rate": 1.999529292153192e-05, + "loss": 0.7042, + "step": 440 + }, + { + "epoch": 0.009877803797960067, + "grad_norm": 1.9237412214279175, + "learning_rate": 1.9995185343274336e-05, + "loss": 0.69, + "step": 445 + }, + { + "epoch": 0.009988790357487709, + "grad_norm": 1.4360047578811646, + "learning_rate": 1.9995076549835638e-05, + "loss": 0.6877, + "step": 450 + }, + { + "epoch": 0.010099776917015349, + "grad_norm": 1.3835448026657104, + "learning_rate": 1.9994966541229057e-05, + "loss": 0.7267, + "step": 455 + }, + { + "epoch": 0.01021076347654299, + "grad_norm": 1.4885082244873047, + "learning_rate": 1.9994855317467963e-05, + "loss": 0.5813, + "step": 460 + }, + { + "epoch": 0.010321750036070632, + "grad_norm": 1.4832953214645386, + "learning_rate": 1.9994742878565878e-05, + "loss": 0.6522, + "step": 465 + }, + { + "epoch": 0.010432736595598274, + "grad_norm": 1.3984161615371704, + "learning_rate": 1.9994629224536477e-05, + "loss": 0.6394, + "step": 470 + }, + { + "epoch": 0.010543723155125913, + "grad_norm": 1.5427662134170532, + "learning_rate": 1.999451435539357e-05, + "loss": 0.6314, + "step": 475 + }, + { + "epoch": 0.010654709714653555, + "grad_norm": 3.5436573028564453, + "learning_rate": 1.999439827115113e-05, + "loss": 0.7408, + "step": 480 + }, + { + "epoch": 0.010765696274181197, + "grad_norm": 1.8555324077606201, + "learning_rate": 1.9994280971823267e-05, + "loss": 0.661, + "step": 485 + }, + { + "epoch": 0.010876682833708838, + "grad_norm": 1.6472523212432861, + "learning_rate": 1.9994162457424238e-05, + "loss": 0.4967, + "step": 490 + }, + { + "epoch": 0.010987669393236478, + "grad_norm": 1.7284140586853027, + "learning_rate": 1.999404272796846e-05, + "loss": 0.6819, + "step": 495 + }, + { + "epoch": 0.01109865595276412, + "grad_norm": 2.7859838008880615, + "learning_rate": 1.999392178347048e-05, + "loss": 0.7583, + "step": 500 + }, + { + "epoch": 0.011209642512291762, + "grad_norm": 2.0837457180023193, + "learning_rate": 1.999379962394501e-05, + "loss": 0.6279, + "step": 505 + }, + { + "epoch": 0.011320629071819403, + "grad_norm": 2.999171733856201, + "learning_rate": 1.9993676249406895e-05, + "loss": 0.6355, + "step": 510 + }, + { + "epoch": 0.011431615631347043, + "grad_norm": 2.121840238571167, + "learning_rate": 1.9993551659871138e-05, + "loss": 0.742, + "step": 515 + }, + { + "epoch": 0.011542602190874685, + "grad_norm": 1.540635585784912, + "learning_rate": 1.9993425855352887e-05, + "loss": 0.6726, + "step": 520 + }, + { + "epoch": 0.011653588750402327, + "grad_norm": 1.9277762174606323, + "learning_rate": 1.999329883586744e-05, + "loss": 0.608, + "step": 525 + }, + { + "epoch": 0.011764575309929968, + "grad_norm": 1.9572395086288452, + "learning_rate": 1.9993170601430233e-05, + "loss": 0.657, + "step": 530 + }, + { + "epoch": 0.011875561869457608, + "grad_norm": 1.517907977104187, + "learning_rate": 1.9993041152056856e-05, + "loss": 0.5259, + "step": 535 + }, + { + "epoch": 0.01198654842898525, + "grad_norm": 1.538671612739563, + "learning_rate": 1.9992910487763052e-05, + "loss": 0.7345, + "step": 540 + }, + { + "epoch": 0.012097534988512891, + "grad_norm": 1.4196572303771973, + "learning_rate": 1.99927786085647e-05, + "loss": 0.6745, + "step": 545 + }, + { + "epoch": 0.012208521548040533, + "grad_norm": 1.7517229318618774, + "learning_rate": 1.9992645514477843e-05, + "loss": 0.5735, + "step": 550 + }, + { + "epoch": 0.012319508107568173, + "grad_norm": 2.0889716148376465, + "learning_rate": 1.9992511205518656e-05, + "loss": 0.6405, + "step": 555 + }, + { + "epoch": 0.012430494667095815, + "grad_norm": 1.3219823837280273, + "learning_rate": 1.999237568170347e-05, + "loss": 0.6927, + "step": 560 + }, + { + "epoch": 0.012541481226623456, + "grad_norm": 1.0619243383407593, + "learning_rate": 1.999223894304876e-05, + "loss": 0.6309, + "step": 565 + }, + { + "epoch": 0.012652467786151098, + "grad_norm": 1.5480470657348633, + "learning_rate": 1.999210098957115e-05, + "loss": 0.5383, + "step": 570 + }, + { + "epoch": 0.012763454345678738, + "grad_norm": 1.854567289352417, + "learning_rate": 1.9991961821287412e-05, + "loss": 0.6096, + "step": 575 + }, + { + "epoch": 0.01287444090520638, + "grad_norm": 1.3984990119934082, + "learning_rate": 1.999182143821447e-05, + "loss": 0.5172, + "step": 580 + }, + { + "epoch": 0.012985427464734021, + "grad_norm": 1.5796399116516113, + "learning_rate": 1.9991679840369383e-05, + "loss": 0.6999, + "step": 585 + }, + { + "epoch": 0.013096414024261661, + "grad_norm": 1.6039270162582397, + "learning_rate": 1.999153702776937e-05, + "loss": 0.7429, + "step": 590 + }, + { + "epoch": 0.013207400583789303, + "grad_norm": 1.6594629287719727, + "learning_rate": 1.9991393000431798e-05, + "loss": 0.5898, + "step": 595 + }, + { + "epoch": 0.013318387143316944, + "grad_norm": 1.4121723175048828, + "learning_rate": 1.999124775837417e-05, + "loss": 0.9492, + "step": 600 + }, + { + "epoch": 0.013429373702844586, + "grad_norm": 1.264452576637268, + "learning_rate": 1.999110130161415e-05, + "loss": 0.5776, + "step": 605 + }, + { + "epoch": 0.013540360262372226, + "grad_norm": 1.4184213876724243, + "learning_rate": 1.999095363016954e-05, + "loss": 0.6565, + "step": 610 + }, + { + "epoch": 0.013651346821899868, + "grad_norm": 1.5732219219207764, + "learning_rate": 1.9990804744058294e-05, + "loss": 0.491, + "step": 615 + }, + { + "epoch": 0.01376233338142751, + "grad_norm": 2.139857053756714, + "learning_rate": 1.9990654643298514e-05, + "loss": 0.608, + "step": 620 + }, + { + "epoch": 0.013873319940955151, + "grad_norm": 1.4825429916381836, + "learning_rate": 1.9990503327908452e-05, + "loss": 0.6254, + "step": 625 + }, + { + "epoch": 0.01398430650048279, + "grad_norm": 1.615256905555725, + "learning_rate": 1.9990350797906497e-05, + "loss": 0.628, + "step": 630 + }, + { + "epoch": 0.014095293060010432, + "grad_norm": 1.2488242387771606, + "learning_rate": 1.99901970533112e-05, + "loss": 0.6429, + "step": 635 + }, + { + "epoch": 0.014206279619538074, + "grad_norm": 3.8135104179382324, + "learning_rate": 1.9990042094141246e-05, + "loss": 0.5945, + "step": 640 + }, + { + "epoch": 0.014317266179065716, + "grad_norm": 1.429972767829895, + "learning_rate": 1.9989885920415483e-05, + "loss": 0.7532, + "step": 645 + }, + { + "epoch": 0.014428252738593356, + "grad_norm": 1.3050944805145264, + "learning_rate": 1.998972853215289e-05, + "loss": 0.5597, + "step": 650 + }, + { + "epoch": 0.014539239298120997, + "grad_norm": 1.4591888189315796, + "learning_rate": 1.9989569929372604e-05, + "loss": 0.6986, + "step": 655 + }, + { + "epoch": 0.014650225857648639, + "grad_norm": 1.4118572473526, + "learning_rate": 1.9989410112093914e-05, + "loss": 0.71, + "step": 660 + }, + { + "epoch": 0.01476121241717628, + "grad_norm": 1.1846052408218384, + "learning_rate": 1.9989249080336236e-05, + "loss": 0.6567, + "step": 665 + }, + { + "epoch": 0.01487219897670392, + "grad_norm": 1.5836447477340698, + "learning_rate": 1.9989086834119164e-05, + "loss": 0.7658, + "step": 670 + }, + { + "epoch": 0.014983185536231562, + "grad_norm": 1.3657773733139038, + "learning_rate": 1.998892337346241e-05, + "loss": 0.4862, + "step": 675 + }, + { + "epoch": 0.015094172095759204, + "grad_norm": 1.6087766885757446, + "learning_rate": 1.9988758698385854e-05, + "loss": 0.5732, + "step": 680 + }, + { + "epoch": 0.015205158655286845, + "grad_norm": 1.1223222017288208, + "learning_rate": 1.9988592808909514e-05, + "loss": 0.6215, + "step": 685 + }, + { + "epoch": 0.015316145214814485, + "grad_norm": 1.4434326887130737, + "learning_rate": 1.998842570505356e-05, + "loss": 0.6406, + "step": 690 + }, + { + "epoch": 0.015427131774342127, + "grad_norm": 1.4812527894973755, + "learning_rate": 1.9988257386838313e-05, + "loss": 0.7588, + "step": 695 + }, + { + "epoch": 0.015538118333869769, + "grad_norm": 1.7070807218551636, + "learning_rate": 1.9988087854284224e-05, + "loss": 0.6366, + "step": 700 + }, + { + "epoch": 0.01564910489339741, + "grad_norm": 1.4044088125228882, + "learning_rate": 1.9987917107411915e-05, + "loss": 0.6937, + "step": 705 + }, + { + "epoch": 0.015760091452925052, + "grad_norm": 1.299124836921692, + "learning_rate": 1.998774514624214e-05, + "loss": 0.5945, + "step": 710 + }, + { + "epoch": 0.015871078012452692, + "grad_norm": 1.1433302164077759, + "learning_rate": 1.9987571970795807e-05, + "loss": 0.6471, + "step": 715 + }, + { + "epoch": 0.015982064571980332, + "grad_norm": 1.9760017395019531, + "learning_rate": 1.9987397581093966e-05, + "loss": 0.6213, + "step": 720 + }, + { + "epoch": 0.016093051131507975, + "grad_norm": 1.4171857833862305, + "learning_rate": 1.9987221977157826e-05, + "loss": 0.7268, + "step": 725 + }, + { + "epoch": 0.016204037691035615, + "grad_norm": 1.2093497514724731, + "learning_rate": 1.998704515900873e-05, + "loss": 0.6168, + "step": 730 + }, + { + "epoch": 0.016315024250563255, + "grad_norm": 1.9509270191192627, + "learning_rate": 1.998686712666818e-05, + "loss": 0.6815, + "step": 735 + }, + { + "epoch": 0.0164260108100909, + "grad_norm": 1.7262238264083862, + "learning_rate": 1.9986687880157815e-05, + "loss": 0.6855, + "step": 740 + }, + { + "epoch": 0.01653699736961854, + "grad_norm": 1.6272399425506592, + "learning_rate": 1.9986507419499435e-05, + "loss": 0.4647, + "step": 745 + }, + { + "epoch": 0.016647983929146182, + "grad_norm": 1.5864605903625488, + "learning_rate": 1.998632574471497e-05, + "loss": 0.5712, + "step": 750 + }, + { + "epoch": 0.01675897048867382, + "grad_norm": 2.9724490642547607, + "learning_rate": 1.9986142855826515e-05, + "loss": 0.618, + "step": 755 + }, + { + "epoch": 0.01686995704820146, + "grad_norm": 1.1995495557785034, + "learning_rate": 1.9985958752856304e-05, + "loss": 0.4329, + "step": 760 + }, + { + "epoch": 0.016980943607729105, + "grad_norm": 1.5253748893737793, + "learning_rate": 1.9985773435826716e-05, + "loss": 0.416, + "step": 765 + }, + { + "epoch": 0.017091930167256745, + "grad_norm": 1.8420664072036743, + "learning_rate": 1.9985586904760285e-05, + "loss": 0.6585, + "step": 770 + }, + { + "epoch": 0.017202916726784385, + "grad_norm": 1.2143425941467285, + "learning_rate": 1.9985399159679684e-05, + "loss": 0.5646, + "step": 775 + }, + { + "epoch": 0.017313903286312028, + "grad_norm": 1.7242611646652222, + "learning_rate": 1.9985210200607743e-05, + "loss": 0.6403, + "step": 780 + }, + { + "epoch": 0.017424889845839668, + "grad_norm": 1.7307857275009155, + "learning_rate": 1.9985020027567433e-05, + "loss": 0.651, + "step": 785 + }, + { + "epoch": 0.01753587640536731, + "grad_norm": 1.5188896656036377, + "learning_rate": 1.998482864058188e-05, + "loss": 0.6383, + "step": 790 + }, + { + "epoch": 0.01764686296489495, + "grad_norm": 1.3000458478927612, + "learning_rate": 1.9984636039674342e-05, + "loss": 0.5734, + "step": 795 + }, + { + "epoch": 0.01775784952442259, + "grad_norm": 2.1906466484069824, + "learning_rate": 1.998444222486824e-05, + "loss": 0.6683, + "step": 800 + }, + { + "epoch": 0.017868836083950235, + "grad_norm": 1.3485846519470215, + "learning_rate": 1.998424719618714e-05, + "loss": 0.5797, + "step": 805 + }, + { + "epoch": 0.017979822643477875, + "grad_norm": 1.6266671419143677, + "learning_rate": 1.998405095365475e-05, + "loss": 0.5214, + "step": 810 + }, + { + "epoch": 0.018090809203005515, + "grad_norm": 1.570114016532898, + "learning_rate": 1.998385349729493e-05, + "loss": 0.6316, + "step": 815 + }, + { + "epoch": 0.018201795762533158, + "grad_norm": 1.1716421842575073, + "learning_rate": 1.9983654827131685e-05, + "loss": 0.6731, + "step": 820 + }, + { + "epoch": 0.018312782322060798, + "grad_norm": 0.9852983951568604, + "learning_rate": 1.9983454943189168e-05, + "loss": 0.4708, + "step": 825 + }, + { + "epoch": 0.01842376888158844, + "grad_norm": 1.6441328525543213, + "learning_rate": 1.9983253845491676e-05, + "loss": 0.5872, + "step": 830 + }, + { + "epoch": 0.01853475544111608, + "grad_norm": 1.401671290397644, + "learning_rate": 1.998305153406367e-05, + "loss": 0.4897, + "step": 835 + }, + { + "epoch": 0.01864574200064372, + "grad_norm": 1.5671908855438232, + "learning_rate": 1.9982848008929736e-05, + "loss": 0.7187, + "step": 840 + }, + { + "epoch": 0.018756728560171364, + "grad_norm": 1.1266714334487915, + "learning_rate": 1.9982643270114617e-05, + "loss": 0.6468, + "step": 845 + }, + { + "epoch": 0.018867715119699004, + "grad_norm": 1.074792504310608, + "learning_rate": 1.9982437317643218e-05, + "loss": 0.7326, + "step": 850 + }, + { + "epoch": 0.018978701679226644, + "grad_norm": 1.2255818843841553, + "learning_rate": 1.998223015154056e-05, + "loss": 0.6069, + "step": 855 + }, + { + "epoch": 0.019089688238754288, + "grad_norm": 1.5439560413360596, + "learning_rate": 1.9982021771831845e-05, + "loss": 0.7673, + "step": 860 + }, + { + "epoch": 0.019200674798281928, + "grad_norm": 1.186937928199768, + "learning_rate": 1.9981812178542394e-05, + "loss": 0.5751, + "step": 865 + }, + { + "epoch": 0.01931166135780957, + "grad_norm": 6.929201602935791, + "learning_rate": 1.9981601371697693e-05, + "loss": 0.5142, + "step": 870 + }, + { + "epoch": 0.01942264791733721, + "grad_norm": 1.579282522201538, + "learning_rate": 1.998138935132338e-05, + "loss": 0.7182, + "step": 875 + }, + { + "epoch": 0.01953363447686485, + "grad_norm": 1.0298939943313599, + "learning_rate": 1.998117611744522e-05, + "loss": 0.5654, + "step": 880 + }, + { + "epoch": 0.019644621036392494, + "grad_norm": 1.312228798866272, + "learning_rate": 1.9980961670089144e-05, + "loss": 0.5333, + "step": 885 + }, + { + "epoch": 0.019755607595920134, + "grad_norm": 1.3584381341934204, + "learning_rate": 1.998074600928122e-05, + "loss": 0.5383, + "step": 890 + }, + { + "epoch": 0.019866594155447774, + "grad_norm": 1.1420618295669556, + "learning_rate": 1.998052913504767e-05, + "loss": 0.6038, + "step": 895 + }, + { + "epoch": 0.019977580714975417, + "grad_norm": 1.7165864706039429, + "learning_rate": 1.998031104741486e-05, + "loss": 0.6515, + "step": 900 + }, + { + "epoch": 0.020088567274503057, + "grad_norm": 1.3264087438583374, + "learning_rate": 1.9980091746409303e-05, + "loss": 0.6116, + "step": 905 + }, + { + "epoch": 0.020199553834030697, + "grad_norm": 1.443520188331604, + "learning_rate": 1.9979871232057665e-05, + "loss": 0.5549, + "step": 910 + }, + { + "epoch": 0.02031054039355834, + "grad_norm": 1.1463545560836792, + "learning_rate": 1.997964950438675e-05, + "loss": 0.6164, + "step": 915 + }, + { + "epoch": 0.02042152695308598, + "grad_norm": 1.5313615798950195, + "learning_rate": 1.997942656342352e-05, + "loss": 0.6918, + "step": 920 + }, + { + "epoch": 0.020532513512613624, + "grad_norm": 1.35427725315094, + "learning_rate": 1.9979202409195073e-05, + "loss": 0.5805, + "step": 925 + }, + { + "epoch": 0.020643500072141264, + "grad_norm": 1.8106743097305298, + "learning_rate": 1.9978977041728665e-05, + "loss": 0.7505, + "step": 930 + }, + { + "epoch": 0.020754486631668904, + "grad_norm": 1.3848850727081299, + "learning_rate": 1.9978750461051698e-05, + "loss": 0.8439, + "step": 935 + }, + { + "epoch": 0.020865473191196547, + "grad_norm": 0.8859273195266724, + "learning_rate": 1.9978522667191714e-05, + "loss": 0.7039, + "step": 940 + }, + { + "epoch": 0.020976459750724187, + "grad_norm": 1.45253324508667, + "learning_rate": 1.997829366017641e-05, + "loss": 0.5935, + "step": 945 + }, + { + "epoch": 0.021087446310251827, + "grad_norm": 1.2368254661560059, + "learning_rate": 1.997806344003363e-05, + "loss": 0.585, + "step": 950 + }, + { + "epoch": 0.02119843286977947, + "grad_norm": 1.277491807937622, + "learning_rate": 1.997783200679136e-05, + "loss": 0.6367, + "step": 955 + }, + { + "epoch": 0.02130941942930711, + "grad_norm": 1.7970563173294067, + "learning_rate": 1.997759936047773e-05, + "loss": 0.4719, + "step": 960 + }, + { + "epoch": 0.021420405988834754, + "grad_norm": 2.2373642921447754, + "learning_rate": 1.997736550112104e-05, + "loss": 0.5275, + "step": 965 + }, + { + "epoch": 0.021531392548362394, + "grad_norm": 1.474535346031189, + "learning_rate": 1.9977130428749715e-05, + "loss": 0.5877, + "step": 970 + }, + { + "epoch": 0.021642379107890033, + "grad_norm": 1.587221384048462, + "learning_rate": 1.9976894143392326e-05, + "loss": 0.617, + "step": 975 + }, + { + "epoch": 0.021753365667417677, + "grad_norm": 1.6356884241104126, + "learning_rate": 1.9976656645077613e-05, + "loss": 0.6357, + "step": 980 + }, + { + "epoch": 0.021864352226945317, + "grad_norm": 1.512257695198059, + "learning_rate": 1.997641793383444e-05, + "loss": 0.5766, + "step": 985 + }, + { + "epoch": 0.021975338786472957, + "grad_norm": 1.3856985569000244, + "learning_rate": 1.9976178009691836e-05, + "loss": 0.5846, + "step": 990 + }, + { + "epoch": 0.0220863253460006, + "grad_norm": 1.6915297508239746, + "learning_rate": 1.997593687267897e-05, + "loss": 0.608, + "step": 995 + }, + { + "epoch": 0.02219731190552824, + "grad_norm": 1.709142804145813, + "learning_rate": 1.997569452282515e-05, + "loss": 0.6902, + "step": 1000 + }, + { + "epoch": 0.022308298465055883, + "grad_norm": 1.3087146282196045, + "learning_rate": 1.9975450960159847e-05, + "loss": 0.648, + "step": 1005 + }, + { + "epoch": 0.022419285024583523, + "grad_norm": 1.3179740905761719, + "learning_rate": 1.9975206184712673e-05, + "loss": 0.6586, + "step": 1010 + }, + { + "epoch": 0.022530271584111163, + "grad_norm": 1.5364478826522827, + "learning_rate": 1.9974960196513383e-05, + "loss": 0.6554, + "step": 1015 + }, + { + "epoch": 0.022641258143638807, + "grad_norm": 1.292578935623169, + "learning_rate": 1.9974712995591887e-05, + "loss": 0.6711, + "step": 1020 + }, + { + "epoch": 0.022752244703166447, + "grad_norm": 1.5487346649169922, + "learning_rate": 1.997446458197824e-05, + "loss": 0.7997, + "step": 1025 + }, + { + "epoch": 0.022863231262694086, + "grad_norm": 1.3361338376998901, + "learning_rate": 1.9974214955702637e-05, + "loss": 0.712, + "step": 1030 + }, + { + "epoch": 0.02297421782222173, + "grad_norm": 1.3260635137557983, + "learning_rate": 1.9973964116795432e-05, + "loss": 0.643, + "step": 1035 + }, + { + "epoch": 0.02308520438174937, + "grad_norm": 1.4649790525436401, + "learning_rate": 1.997371206528712e-05, + "loss": 0.6557, + "step": 1040 + }, + { + "epoch": 0.02319619094127701, + "grad_norm": 1.4138814210891724, + "learning_rate": 1.9973458801208342e-05, + "loss": 0.6153, + "step": 1045 + }, + { + "epoch": 0.023307177500804653, + "grad_norm": 1.391961932182312, + "learning_rate": 1.9973204324589895e-05, + "loss": 0.7188, + "step": 1050 + }, + { + "epoch": 0.023418164060332293, + "grad_norm": 1.024633765220642, + "learning_rate": 1.9972948635462712e-05, + "loss": 0.6313, + "step": 1055 + }, + { + "epoch": 0.023529150619859936, + "grad_norm": 1.5817911624908447, + "learning_rate": 1.997269173385788e-05, + "loss": 0.6091, + "step": 1060 + }, + { + "epoch": 0.023640137179387576, + "grad_norm": 2.2141048908233643, + "learning_rate": 1.9972433619806634e-05, + "loss": 0.7489, + "step": 1065 + }, + { + "epoch": 0.023751123738915216, + "grad_norm": 1.731985330581665, + "learning_rate": 1.9972174293340355e-05, + "loss": 0.4683, + "step": 1070 + }, + { + "epoch": 0.02386211029844286, + "grad_norm": 1.7920399904251099, + "learning_rate": 1.997191375449057e-05, + "loss": 0.6375, + "step": 1075 + }, + { + "epoch": 0.0239730968579705, + "grad_norm": 1.3139867782592773, + "learning_rate": 1.9971652003288947e-05, + "loss": 0.5733, + "step": 1080 + }, + { + "epoch": 0.02408408341749814, + "grad_norm": 2.353349208831787, + "learning_rate": 1.9971389039767323e-05, + "loss": 0.6415, + "step": 1085 + }, + { + "epoch": 0.024195069977025783, + "grad_norm": 1.2308602333068848, + "learning_rate": 1.997112486395766e-05, + "loss": 0.4836, + "step": 1090 + }, + { + "epoch": 0.024306056536553423, + "grad_norm": 1.1983168125152588, + "learning_rate": 1.997085947589207e-05, + "loss": 0.5186, + "step": 1095 + }, + { + "epoch": 0.024417043096081066, + "grad_norm": 1.446942687034607, + "learning_rate": 1.9970592875602833e-05, + "loss": 0.6611, + "step": 1100 + }, + { + "epoch": 0.024528029655608706, + "grad_norm": 2.764981746673584, + "learning_rate": 1.9970325063122348e-05, + "loss": 0.7114, + "step": 1105 + }, + { + "epoch": 0.024639016215136346, + "grad_norm": 1.2632938623428345, + "learning_rate": 1.9970056038483184e-05, + "loss": 0.5719, + "step": 1110 + }, + { + "epoch": 0.02475000277466399, + "grad_norm": 1.2802735567092896, + "learning_rate": 1.996978580171804e-05, + "loss": 0.5928, + "step": 1115 + }, + { + "epoch": 0.02486098933419163, + "grad_norm": 0.9999619722366333, + "learning_rate": 1.9969514352859774e-05, + "loss": 0.5643, + "step": 1120 + }, + { + "epoch": 0.02497197589371927, + "grad_norm": 1.2582635879516602, + "learning_rate": 1.996924169194139e-05, + "loss": 0.5307, + "step": 1125 + }, + { + "epoch": 0.025082962453246913, + "grad_norm": 1.2746455669403076, + "learning_rate": 1.9968967818996036e-05, + "loss": 0.6455, + "step": 1130 + }, + { + "epoch": 0.025193949012774552, + "grad_norm": 1.2235496044158936, + "learning_rate": 1.9968692734057006e-05, + "loss": 0.7033, + "step": 1135 + }, + { + "epoch": 0.025304935572302196, + "grad_norm": 1.1782463788986206, + "learning_rate": 1.9968416437157743e-05, + "loss": 0.4876, + "step": 1140 + }, + { + "epoch": 0.025415922131829836, + "grad_norm": 1.56901216506958, + "learning_rate": 1.9968138928331847e-05, + "loss": 0.5141, + "step": 1145 + }, + { + "epoch": 0.025526908691357476, + "grad_norm": 1.7434866428375244, + "learning_rate": 1.9967860207613047e-05, + "loss": 0.5785, + "step": 1150 + }, + { + "epoch": 0.02563789525088512, + "grad_norm": 1.423734188079834, + "learning_rate": 1.9967580275035234e-05, + "loss": 0.5057, + "step": 1155 + }, + { + "epoch": 0.02574888181041276, + "grad_norm": 1.0542765855789185, + "learning_rate": 1.996729913063244e-05, + "loss": 0.6828, + "step": 1160 + }, + { + "epoch": 0.0258598683699404, + "grad_norm": 1.0192484855651855, + "learning_rate": 1.9967016774438847e-05, + "loss": 0.5117, + "step": 1165 + }, + { + "epoch": 0.025970854929468042, + "grad_norm": 1.1470212936401367, + "learning_rate": 1.9966733206488777e-05, + "loss": 0.4174, + "step": 1170 + }, + { + "epoch": 0.026081841488995682, + "grad_norm": 2.4811060428619385, + "learning_rate": 1.996644842681671e-05, + "loss": 0.4642, + "step": 1175 + }, + { + "epoch": 0.026192828048523322, + "grad_norm": 1.4627830982208252, + "learning_rate": 1.996616243545727e-05, + "loss": 0.7485, + "step": 1180 + }, + { + "epoch": 0.026303814608050965, + "grad_norm": 1.440354347229004, + "learning_rate": 1.9965875232445227e-05, + "loss": 0.5197, + "step": 1185 + }, + { + "epoch": 0.026414801167578605, + "grad_norm": 1.37411630153656, + "learning_rate": 1.9965586817815494e-05, + "loss": 0.7286, + "step": 1190 + }, + { + "epoch": 0.02652578772710625, + "grad_norm": 1.1004072427749634, + "learning_rate": 1.996529719160314e-05, + "loss": 0.6062, + "step": 1195 + }, + { + "epoch": 0.02663677428663389, + "grad_norm": 1.3563573360443115, + "learning_rate": 1.996500635384337e-05, + "loss": 0.5376, + "step": 1200 + }, + { + "epoch": 0.02674776084616153, + "grad_norm": 1.093072533607483, + "learning_rate": 1.996471430457155e-05, + "loss": 0.6083, + "step": 1205 + }, + { + "epoch": 0.026858747405689172, + "grad_norm": 1.385481357574463, + "learning_rate": 1.9964421043823186e-05, + "loss": 0.5489, + "step": 1210 + }, + { + "epoch": 0.026969733965216812, + "grad_norm": 1.5628325939178467, + "learning_rate": 1.9964126571633925e-05, + "loss": 0.5826, + "step": 1215 + }, + { + "epoch": 0.027080720524744452, + "grad_norm": 1.501821517944336, + "learning_rate": 1.9963830888039576e-05, + "loss": 0.6989, + "step": 1220 + }, + { + "epoch": 0.027191707084272095, + "grad_norm": 0.900786817073822, + "learning_rate": 1.996353399307608e-05, + "loss": 0.6027, + "step": 1225 + }, + { + "epoch": 0.027302693643799735, + "grad_norm": 1.074896216392517, + "learning_rate": 1.996323588677954e-05, + "loss": 0.5475, + "step": 1230 + }, + { + "epoch": 0.02741368020332738, + "grad_norm": 1.3888300657272339, + "learning_rate": 1.9962936569186195e-05, + "loss": 0.5201, + "step": 1235 + }, + { + "epoch": 0.02752466676285502, + "grad_norm": 1.707251787185669, + "learning_rate": 1.9962636040332432e-05, + "loss": 0.7034, + "step": 1240 + }, + { + "epoch": 0.02763565332238266, + "grad_norm": 1.8080699443817139, + "learning_rate": 1.9962334300254796e-05, + "loss": 0.7599, + "step": 1245 + }, + { + "epoch": 0.027746639881910302, + "grad_norm": 1.0971603393554688, + "learning_rate": 1.9962031348989962e-05, + "loss": 0.587, + "step": 1250 + }, + { + "epoch": 0.02785762644143794, + "grad_norm": 1.1529098749160767, + "learning_rate": 1.9961727186574768e-05, + "loss": 0.5845, + "step": 1255 + }, + { + "epoch": 0.02796861300096558, + "grad_norm": 1.1753336191177368, + "learning_rate": 1.9961421813046193e-05, + "loss": 0.4196, + "step": 1260 + }, + { + "epoch": 0.028079599560493225, + "grad_norm": 1.2487094402313232, + "learning_rate": 1.9961115228441363e-05, + "loss": 0.6387, + "step": 1265 + }, + { + "epoch": 0.028190586120020865, + "grad_norm": 1.2633018493652344, + "learning_rate": 1.9960807432797545e-05, + "loss": 0.5321, + "step": 1270 + }, + { + "epoch": 0.02830157267954851, + "grad_norm": 1.3093974590301514, + "learning_rate": 1.996049842615217e-05, + "loss": 0.4955, + "step": 1275 + }, + { + "epoch": 0.028412559239076148, + "grad_norm": 1.301692247390747, + "learning_rate": 1.99601882085428e-05, + "loss": 0.7267, + "step": 1280 + }, + { + "epoch": 0.028523545798603788, + "grad_norm": 1.0108011960983276, + "learning_rate": 1.995987678000715e-05, + "loss": 0.5799, + "step": 1285 + }, + { + "epoch": 0.02863453235813143, + "grad_norm": 1.1895384788513184, + "learning_rate": 1.9959564140583088e-05, + "loss": 0.5349, + "step": 1290 + }, + { + "epoch": 0.02874551891765907, + "grad_norm": 1.179839015007019, + "learning_rate": 1.9959250290308617e-05, + "loss": 0.6611, + "step": 1295 + }, + { + "epoch": 0.02885650547718671, + "grad_norm": 1.1100138425827026, + "learning_rate": 1.99589352292219e-05, + "loss": 0.4495, + "step": 1300 + }, + { + "epoch": 0.028967492036714355, + "grad_norm": 1.2159394025802612, + "learning_rate": 1.9958618957361233e-05, + "loss": 0.6009, + "step": 1305 + }, + { + "epoch": 0.029078478596241995, + "grad_norm": 0.953895092010498, + "learning_rate": 1.995830147476507e-05, + "loss": 0.5071, + "step": 1310 + }, + { + "epoch": 0.029189465155769638, + "grad_norm": 1.41812002658844, + "learning_rate": 1.9957982781472016e-05, + "loss": 0.603, + "step": 1315 + }, + { + "epoch": 0.029300451715297278, + "grad_norm": 1.746551752090454, + "learning_rate": 1.995766287752081e-05, + "loss": 0.7819, + "step": 1320 + }, + { + "epoch": 0.029411438274824918, + "grad_norm": 1.238605260848999, + "learning_rate": 1.9957341762950346e-05, + "loss": 0.6801, + "step": 1325 + }, + { + "epoch": 0.02952242483435256, + "grad_norm": 1.1259163618087769, + "learning_rate": 1.9957019437799666e-05, + "loss": 0.6514, + "step": 1330 + }, + { + "epoch": 0.0296334113938802, + "grad_norm": 1.6120734214782715, + "learning_rate": 1.9956695902107956e-05, + "loss": 0.5272, + "step": 1335 + }, + { + "epoch": 0.02974439795340784, + "grad_norm": 1.0889034271240234, + "learning_rate": 1.9956371155914552e-05, + "loss": 0.6868, + "step": 1340 + }, + { + "epoch": 0.029855384512935484, + "grad_norm": 1.2904348373413086, + "learning_rate": 1.9956045199258927e-05, + "loss": 0.6264, + "step": 1345 + }, + { + "epoch": 0.029966371072463124, + "grad_norm": 1.2338993549346924, + "learning_rate": 1.9955718032180725e-05, + "loss": 0.5667, + "step": 1350 + }, + { + "epoch": 0.030077357631990764, + "grad_norm": 1.6639360189437866, + "learning_rate": 1.995538965471971e-05, + "loss": 0.5418, + "step": 1355 + }, + { + "epoch": 0.030188344191518408, + "grad_norm": 1.203410267829895, + "learning_rate": 1.995506006691581e-05, + "loss": 0.4512, + "step": 1360 + }, + { + "epoch": 0.030299330751046048, + "grad_norm": 1.343341588973999, + "learning_rate": 1.995472926880909e-05, + "loss": 0.499, + "step": 1365 + }, + { + "epoch": 0.03041031731057369, + "grad_norm": 1.2672549486160278, + "learning_rate": 1.9954397260439777e-05, + "loss": 0.6129, + "step": 1370 + }, + { + "epoch": 0.03052130387010133, + "grad_norm": 1.2296212911605835, + "learning_rate": 1.9954064041848223e-05, + "loss": 0.607, + "step": 1375 + }, + { + "epoch": 0.03063229042962897, + "grad_norm": 1.239903211593628, + "learning_rate": 1.995372961307495e-05, + "loss": 0.5965, + "step": 1380 + }, + { + "epoch": 0.030743276989156614, + "grad_norm": 1.1556373834609985, + "learning_rate": 1.995339397416061e-05, + "loss": 0.5994, + "step": 1385 + }, + { + "epoch": 0.030854263548684254, + "grad_norm": 0.9935384392738342, + "learning_rate": 1.9953057125146017e-05, + "loss": 0.4611, + "step": 1390 + }, + { + "epoch": 0.030965250108211894, + "grad_norm": 0.9856865406036377, + "learning_rate": 1.9952719066072115e-05, + "loss": 0.4357, + "step": 1395 + }, + { + "epoch": 0.031076236667739537, + "grad_norm": 1.1673684120178223, + "learning_rate": 1.9952379796980007e-05, + "loss": 0.5264, + "step": 1400 + }, + { + "epoch": 0.031187223227267177, + "grad_norm": 1.4866957664489746, + "learning_rate": 1.995203931791094e-05, + "loss": 0.6887, + "step": 1405 + }, + { + "epoch": 0.03129820978679482, + "grad_norm": 1.1869754791259766, + "learning_rate": 1.9951697628906316e-05, + "loss": 0.6099, + "step": 1410 + }, + { + "epoch": 0.03140919634632246, + "grad_norm": 1.7387142181396484, + "learning_rate": 1.9951354730007662e-05, + "loss": 0.7121, + "step": 1415 + }, + { + "epoch": 0.031520182905850104, + "grad_norm": 1.1650878190994263, + "learning_rate": 1.9951010621256678e-05, + "loss": 0.6531, + "step": 1420 + }, + { + "epoch": 0.031631169465377744, + "grad_norm": 1.3697797060012817, + "learning_rate": 1.9950665302695195e-05, + "loss": 0.7252, + "step": 1425 + }, + { + "epoch": 0.031742156024905384, + "grad_norm": 1.2272151708602905, + "learning_rate": 1.9950318774365195e-05, + "loss": 0.5068, + "step": 1430 + }, + { + "epoch": 0.031853142584433024, + "grad_norm": 1.00801420211792, + "learning_rate": 1.9949971036308814e-05, + "loss": 0.6587, + "step": 1435 + }, + { + "epoch": 0.031964129143960664, + "grad_norm": 0.9060214757919312, + "learning_rate": 1.9949622088568323e-05, + "loss": 0.7408, + "step": 1440 + }, + { + "epoch": 0.03207511570348831, + "grad_norm": 1.053296685218811, + "learning_rate": 1.994927193118614e-05, + "loss": 0.678, + "step": 1445 + }, + { + "epoch": 0.03218610226301595, + "grad_norm": 1.3420132398605347, + "learning_rate": 1.994892056420485e-05, + "loss": 0.7067, + "step": 1450 + }, + { + "epoch": 0.03229708882254359, + "grad_norm": 1.5862083435058594, + "learning_rate": 1.994856798766716e-05, + "loss": 0.6518, + "step": 1455 + }, + { + "epoch": 0.03240807538207123, + "grad_norm": 1.2782789468765259, + "learning_rate": 1.994821420161594e-05, + "loss": 0.4185, + "step": 1460 + }, + { + "epoch": 0.03251906194159887, + "grad_norm": 2.185882568359375, + "learning_rate": 1.9947859206094202e-05, + "loss": 0.6727, + "step": 1465 + }, + { + "epoch": 0.03263004850112651, + "grad_norm": 1.0432374477386475, + "learning_rate": 1.9947503001145104e-05, + "loss": 0.4949, + "step": 1470 + }, + { + "epoch": 0.03274103506065416, + "grad_norm": 1.2504695653915405, + "learning_rate": 1.9947145586811955e-05, + "loss": 0.6297, + "step": 1475 + }, + { + "epoch": 0.0328520216201818, + "grad_norm": 1.5974503755569458, + "learning_rate": 1.99467869631382e-05, + "loss": 0.5971, + "step": 1480 + }, + { + "epoch": 0.03296300817970944, + "grad_norm": 1.4650466442108154, + "learning_rate": 1.9946427130167446e-05, + "loss": 0.564, + "step": 1485 + }, + { + "epoch": 0.03307399473923708, + "grad_norm": 1.6814020872116089, + "learning_rate": 1.9946066087943442e-05, + "loss": 0.509, + "step": 1490 + }, + { + "epoch": 0.03318498129876472, + "grad_norm": 1.4288887977600098, + "learning_rate": 1.994570383651008e-05, + "loss": 0.5716, + "step": 1495 + }, + { + "epoch": 0.033295967858292363, + "grad_norm": 0.8717050552368164, + "learning_rate": 1.99453403759114e-05, + "loss": 0.5378, + "step": 1500 + }, + { + "epoch": 0.03340695441782, + "grad_norm": 1.4430837631225586, + "learning_rate": 1.994497570619159e-05, + "loss": 0.6293, + "step": 1505 + }, + { + "epoch": 0.03351794097734764, + "grad_norm": 1.6973576545715332, + "learning_rate": 1.9944609827394986e-05, + "loss": 0.6011, + "step": 1510 + }, + { + "epoch": 0.03362892753687528, + "grad_norm": 1.0696516036987305, + "learning_rate": 1.9944242739566072e-05, + "loss": 0.5927, + "step": 1515 + }, + { + "epoch": 0.03373991409640292, + "grad_norm": 1.1888644695281982, + "learning_rate": 1.9943874442749478e-05, + "loss": 0.6889, + "step": 1520 + }, + { + "epoch": 0.03385090065593057, + "grad_norm": 1.1972169876098633, + "learning_rate": 1.9943504936989978e-05, + "loss": 0.6074, + "step": 1525 + }, + { + "epoch": 0.03396188721545821, + "grad_norm": 1.407692313194275, + "learning_rate": 1.9943134222332493e-05, + "loss": 0.6392, + "step": 1530 + }, + { + "epoch": 0.03407287377498585, + "grad_norm": 1.3288302421569824, + "learning_rate": 1.9942762298822095e-05, + "loss": 0.5331, + "step": 1535 + }, + { + "epoch": 0.03418386033451349, + "grad_norm": 1.2641630172729492, + "learning_rate": 1.9942389166504005e-05, + "loss": 0.5805, + "step": 1540 + }, + { + "epoch": 0.03429484689404113, + "grad_norm": 1.5590423345565796, + "learning_rate": 1.9942014825423583e-05, + "loss": 0.5848, + "step": 1545 + }, + { + "epoch": 0.03440583345356877, + "grad_norm": 1.3845590353012085, + "learning_rate": 1.9941639275626343e-05, + "loss": 0.6635, + "step": 1550 + }, + { + "epoch": 0.034516820013096416, + "grad_norm": 1.493355393409729, + "learning_rate": 1.994126251715794e-05, + "loss": 0.494, + "step": 1555 + }, + { + "epoch": 0.034627806572624056, + "grad_norm": 1.3604371547698975, + "learning_rate": 1.9940884550064182e-05, + "loss": 0.6372, + "step": 1560 + }, + { + "epoch": 0.034738793132151696, + "grad_norm": 1.330060601234436, + "learning_rate": 1.994050537439102e-05, + "loss": 0.6928, + "step": 1565 + }, + { + "epoch": 0.034849779691679336, + "grad_norm": 1.2125601768493652, + "learning_rate": 1.994012499018455e-05, + "loss": 0.6707, + "step": 1570 + }, + { + "epoch": 0.034960766251206976, + "grad_norm": 1.265838623046875, + "learning_rate": 1.993974339749102e-05, + "loss": 0.4771, + "step": 1575 + }, + { + "epoch": 0.03507175281073462, + "grad_norm": 1.1168771982192993, + "learning_rate": 1.9939360596356824e-05, + "loss": 0.628, + "step": 1580 + }, + { + "epoch": 0.03518273937026226, + "grad_norm": 1.2629988193511963, + "learning_rate": 1.9938976586828503e-05, + "loss": 0.766, + "step": 1585 + }, + { + "epoch": 0.0352937259297899, + "grad_norm": 1.5460317134857178, + "learning_rate": 1.993859136895274e-05, + "loss": 0.6064, + "step": 1590 + }, + { + "epoch": 0.03540471248931754, + "grad_norm": 1.1441147327423096, + "learning_rate": 1.9938204942776367e-05, + "loss": 0.5786, + "step": 1595 + }, + { + "epoch": 0.03551569904884518, + "grad_norm": 1.3351715803146362, + "learning_rate": 1.993781730834637e-05, + "loss": 0.6855, + "step": 1600 + }, + { + "epoch": 0.03562668560837283, + "grad_norm": 1.2413687705993652, + "learning_rate": 1.9937428465709875e-05, + "loss": 0.7534, + "step": 1605 + }, + { + "epoch": 0.03573767216790047, + "grad_norm": 1.6053916215896606, + "learning_rate": 1.993703841491415e-05, + "loss": 0.6457, + "step": 1610 + }, + { + "epoch": 0.03584865872742811, + "grad_norm": 1.6114028692245483, + "learning_rate": 1.9936647156006623e-05, + "loss": 0.5664, + "step": 1615 + }, + { + "epoch": 0.03595964528695575, + "grad_norm": 1.053146243095398, + "learning_rate": 1.9936254689034863e-05, + "loss": 0.6224, + "step": 1620 + }, + { + "epoch": 0.03607063184648339, + "grad_norm": 1.386884093284607, + "learning_rate": 1.9935861014046578e-05, + "loss": 0.677, + "step": 1625 + }, + { + "epoch": 0.03618161840601103, + "grad_norm": 1.376198410987854, + "learning_rate": 1.993546613108963e-05, + "loss": 0.8053, + "step": 1630 + }, + { + "epoch": 0.036292604965538676, + "grad_norm": 1.3916345834732056, + "learning_rate": 1.9935070040212038e-05, + "loss": 0.6456, + "step": 1635 + }, + { + "epoch": 0.036403591525066316, + "grad_norm": 1.714755654335022, + "learning_rate": 1.9934672741461946e-05, + "loss": 0.579, + "step": 1640 + }, + { + "epoch": 0.036514578084593956, + "grad_norm": 1.1382675170898438, + "learning_rate": 1.993427423488766e-05, + "loss": 0.5205, + "step": 1645 + }, + { + "epoch": 0.036625564644121596, + "grad_norm": 3.2711870670318604, + "learning_rate": 1.993387452053763e-05, + "loss": 0.7955, + "step": 1650 + }, + { + "epoch": 0.036736551203649236, + "grad_norm": 1.1714918613433838, + "learning_rate": 1.9933473598460454e-05, + "loss": 0.6621, + "step": 1655 + }, + { + "epoch": 0.03684753776317688, + "grad_norm": 1.2545034885406494, + "learning_rate": 1.993307146870487e-05, + "loss": 0.6243, + "step": 1660 + }, + { + "epoch": 0.03695852432270452, + "grad_norm": 1.188735008239746, + "learning_rate": 1.993266813131977e-05, + "loss": 0.7094, + "step": 1665 + }, + { + "epoch": 0.03706951088223216, + "grad_norm": 1.543319582939148, + "learning_rate": 1.993226358635419e-05, + "loss": 0.6565, + "step": 1670 + }, + { + "epoch": 0.0371804974417598, + "grad_norm": 0.9534319043159485, + "learning_rate": 1.9931857833857313e-05, + "loss": 0.4431, + "step": 1675 + }, + { + "epoch": 0.03729148400128744, + "grad_norm": 1.7737338542938232, + "learning_rate": 1.993145087387847e-05, + "loss": 0.735, + "step": 1680 + }, + { + "epoch": 0.03740247056081508, + "grad_norm": 1.04007089138031, + "learning_rate": 1.993104270646714e-05, + "loss": 0.5504, + "step": 1685 + }, + { + "epoch": 0.03751345712034273, + "grad_norm": 1.4022916555404663, + "learning_rate": 1.993063333167294e-05, + "loss": 0.5101, + "step": 1690 + }, + { + "epoch": 0.03762444367987037, + "grad_norm": 1.1499395370483398, + "learning_rate": 1.9930222749545643e-05, + "loss": 0.5378, + "step": 1695 + }, + { + "epoch": 0.03773543023939801, + "grad_norm": 1.1989136934280396, + "learning_rate": 1.992981096013517e-05, + "loss": 0.7049, + "step": 1700 + }, + { + "epoch": 0.03784641679892565, + "grad_norm": 1.572718620300293, + "learning_rate": 1.9929397963491583e-05, + "loss": 0.6205, + "step": 1705 + }, + { + "epoch": 0.03795740335845329, + "grad_norm": 1.2384471893310547, + "learning_rate": 1.9928983759665092e-05, + "loss": 0.6183, + "step": 1710 + }, + { + "epoch": 0.038068389917980935, + "grad_norm": 1.4257042407989502, + "learning_rate": 1.9928568348706053e-05, + "loss": 0.5604, + "step": 1715 + }, + { + "epoch": 0.038179376477508575, + "grad_norm": 0.9981622695922852, + "learning_rate": 1.9928151730664975e-05, + "loss": 0.656, + "step": 1720 + }, + { + "epoch": 0.038290363037036215, + "grad_norm": 1.3974087238311768, + "learning_rate": 1.9927733905592505e-05, + "loss": 0.7721, + "step": 1725 + }, + { + "epoch": 0.038401349596563855, + "grad_norm": 1.5321563482284546, + "learning_rate": 1.992731487353944e-05, + "loss": 0.6978, + "step": 1730 + }, + { + "epoch": 0.038512336156091495, + "grad_norm": 1.3335909843444824, + "learning_rate": 1.9926894634556726e-05, + "loss": 0.651, + "step": 1735 + }, + { + "epoch": 0.03862332271561914, + "grad_norm": 1.35151207447052, + "learning_rate": 1.992647318869546e-05, + "loss": 0.5383, + "step": 1740 + }, + { + "epoch": 0.03873430927514678, + "grad_norm": 1.6774109601974487, + "learning_rate": 1.992605053600687e-05, + "loss": 0.6962, + "step": 1745 + }, + { + "epoch": 0.03884529583467442, + "grad_norm": 0.9621152877807617, + "learning_rate": 1.992562667654234e-05, + "loss": 0.5274, + "step": 1750 + }, + { + "epoch": 0.03895628239420206, + "grad_norm": 2.0539510250091553, + "learning_rate": 1.9925201610353415e-05, + "loss": 0.7013, + "step": 1755 + }, + { + "epoch": 0.0390672689537297, + "grad_norm": 1.0954846143722534, + "learning_rate": 1.992477533749176e-05, + "loss": 0.5413, + "step": 1760 + }, + { + "epoch": 0.03917825551325734, + "grad_norm": 1.2433022260665894, + "learning_rate": 1.992434785800921e-05, + "loss": 0.5258, + "step": 1765 + }, + { + "epoch": 0.03928924207278499, + "grad_norm": 1.1122106313705444, + "learning_rate": 1.992391917195773e-05, + "loss": 0.5136, + "step": 1770 + }, + { + "epoch": 0.03940022863231263, + "grad_norm": 1.3719093799591064, + "learning_rate": 1.9923489279389433e-05, + "loss": 0.5683, + "step": 1775 + }, + { + "epoch": 0.03951121519184027, + "grad_norm": 1.1773172616958618, + "learning_rate": 1.9923058180356595e-05, + "loss": 0.5897, + "step": 1780 + }, + { + "epoch": 0.03962220175136791, + "grad_norm": 1.156784176826477, + "learning_rate": 1.9922625874911624e-05, + "loss": 0.4755, + "step": 1785 + }, + { + "epoch": 0.03973318831089555, + "grad_norm": 1.1960108280181885, + "learning_rate": 1.9922192363107075e-05, + "loss": 0.7606, + "step": 1790 + }, + { + "epoch": 0.039844174870423195, + "grad_norm": 1.7010750770568848, + "learning_rate": 1.9921757644995656e-05, + "loss": 0.5427, + "step": 1795 + }, + { + "epoch": 0.039955161429950835, + "grad_norm": 1.354822039604187, + "learning_rate": 1.9921321720630216e-05, + "loss": 0.5386, + "step": 1800 + }, + { + "epoch": 0.040066147989478475, + "grad_norm": 1.4349123239517212, + "learning_rate": 1.9920884590063755e-05, + "loss": 0.6347, + "step": 1805 + }, + { + "epoch": 0.040177134549006115, + "grad_norm": 1.2014262676239014, + "learning_rate": 1.9920446253349417e-05, + "loss": 0.7894, + "step": 1810 + }, + { + "epoch": 0.040288121108533755, + "grad_norm": 1.3427788019180298, + "learning_rate": 1.9920006710540495e-05, + "loss": 0.6631, + "step": 1815 + }, + { + "epoch": 0.040399107668061394, + "grad_norm": 1.188988447189331, + "learning_rate": 1.9919565961690426e-05, + "loss": 0.598, + "step": 1820 + }, + { + "epoch": 0.04051009422758904, + "grad_norm": 1.388492226600647, + "learning_rate": 1.9919124006852794e-05, + "loss": 0.6441, + "step": 1825 + }, + { + "epoch": 0.04062108078711668, + "grad_norm": 1.0690385103225708, + "learning_rate": 1.9918680846081334e-05, + "loss": 0.6082, + "step": 1830 + }, + { + "epoch": 0.04073206734664432, + "grad_norm": 1.3798823356628418, + "learning_rate": 1.991823647942992e-05, + "loss": 0.7001, + "step": 1835 + }, + { + "epoch": 0.04084305390617196, + "grad_norm": 1.3722220659255981, + "learning_rate": 1.991779090695258e-05, + "loss": 0.5362, + "step": 1840 + }, + { + "epoch": 0.0409540404656996, + "grad_norm": 1.292083501815796, + "learning_rate": 1.991734412870348e-05, + "loss": 0.508, + "step": 1845 + }, + { + "epoch": 0.04106502702522725, + "grad_norm": 1.1517761945724487, + "learning_rate": 1.9916896144736943e-05, + "loss": 0.3987, + "step": 1850 + }, + { + "epoch": 0.04117601358475489, + "grad_norm": 1.3085436820983887, + "learning_rate": 1.991644695510743e-05, + "loss": 0.748, + "step": 1855 + }, + { + "epoch": 0.04128700014428253, + "grad_norm": 1.3864504098892212, + "learning_rate": 1.9915996559869553e-05, + "loss": 0.7465, + "step": 1860 + }, + { + "epoch": 0.04139798670381017, + "grad_norm": 1.2560830116271973, + "learning_rate": 1.9915544959078072e-05, + "loss": 0.5718, + "step": 1865 + }, + { + "epoch": 0.04150897326333781, + "grad_norm": 1.312229871749878, + "learning_rate": 1.9915092152787888e-05, + "loss": 0.5943, + "step": 1870 + }, + { + "epoch": 0.041619959822865454, + "grad_norm": 1.272802710533142, + "learning_rate": 1.9914638141054053e-05, + "loss": 0.5184, + "step": 1875 + }, + { + "epoch": 0.041730946382393094, + "grad_norm": 1.5974937677383423, + "learning_rate": 1.9914182923931766e-05, + "loss": 0.6155, + "step": 1880 + }, + { + "epoch": 0.041841932941920734, + "grad_norm": 1.1618620157241821, + "learning_rate": 1.9913726501476366e-05, + "loss": 0.5379, + "step": 1885 + }, + { + "epoch": 0.041952919501448374, + "grad_norm": 0.9664960503578186, + "learning_rate": 1.9913268873743342e-05, + "loss": 0.6048, + "step": 1890 + }, + { + "epoch": 0.042063906060976014, + "grad_norm": 1.270060420036316, + "learning_rate": 1.991281004078834e-05, + "loss": 0.59, + "step": 1895 + }, + { + "epoch": 0.042174892620503654, + "grad_norm": 1.2572544813156128, + "learning_rate": 1.9912350002667137e-05, + "loss": 0.7689, + "step": 1900 + }, + { + "epoch": 0.0422858791800313, + "grad_norm": 1.559762954711914, + "learning_rate": 1.9911888759435665e-05, + "loss": 0.5408, + "step": 1905 + }, + { + "epoch": 0.04239686573955894, + "grad_norm": 1.2439285516738892, + "learning_rate": 1.991142631115e-05, + "loss": 0.5589, + "step": 1910 + }, + { + "epoch": 0.04250785229908658, + "grad_norm": 1.8407187461853027, + "learning_rate": 1.9910962657866366e-05, + "loss": 0.4827, + "step": 1915 + }, + { + "epoch": 0.04261883885861422, + "grad_norm": 1.6191024780273438, + "learning_rate": 1.9910497799641126e-05, + "loss": 0.5419, + "step": 1920 + }, + { + "epoch": 0.04272982541814186, + "grad_norm": 1.4953969717025757, + "learning_rate": 1.9910031736530803e-05, + "loss": 0.5792, + "step": 1925 + }, + { + "epoch": 0.04284081197766951, + "grad_norm": 1.3562936782836914, + "learning_rate": 1.990956446859206e-05, + "loss": 0.5718, + "step": 1930 + }, + { + "epoch": 0.04295179853719715, + "grad_norm": 1.5445222854614258, + "learning_rate": 1.9909095995881697e-05, + "loss": 0.5163, + "step": 1935 + }, + { + "epoch": 0.04306278509672479, + "grad_norm": 1.349182367324829, + "learning_rate": 1.990862631845668e-05, + "loss": 0.6125, + "step": 1940 + }, + { + "epoch": 0.04317377165625243, + "grad_norm": 1.4996899366378784, + "learning_rate": 1.9908155436374102e-05, + "loss": 0.5883, + "step": 1945 + }, + { + "epoch": 0.04328475821578007, + "grad_norm": 1.2809373140335083, + "learning_rate": 1.990768334969122e-05, + "loss": 0.5395, + "step": 1950 + }, + { + "epoch": 0.04339574477530771, + "grad_norm": 2.493131399154663, + "learning_rate": 1.990721005846542e-05, + "loss": 0.5445, + "step": 1955 + }, + { + "epoch": 0.043506731334835354, + "grad_norm": 0.9056739211082458, + "learning_rate": 1.9906735562754253e-05, + "loss": 0.5177, + "step": 1960 + }, + { + "epoch": 0.043617717894362994, + "grad_norm": 1.4022185802459717, + "learning_rate": 1.9906259862615396e-05, + "loss": 0.6715, + "step": 1965 + }, + { + "epoch": 0.043728704453890634, + "grad_norm": 1.3620365858078003, + "learning_rate": 1.990578295810669e-05, + "loss": 0.6178, + "step": 1970 + }, + { + "epoch": 0.043839691013418274, + "grad_norm": 1.3010473251342773, + "learning_rate": 1.9905304849286114e-05, + "loss": 0.6329, + "step": 1975 + }, + { + "epoch": 0.04395067757294591, + "grad_norm": 1.640214443206787, + "learning_rate": 1.9904825536211793e-05, + "loss": 0.6614, + "step": 1980 + }, + { + "epoch": 0.04406166413247356, + "grad_norm": 1.3884638547897339, + "learning_rate": 1.9904345018942e-05, + "loss": 0.6241, + "step": 1985 + }, + { + "epoch": 0.0441726506920012, + "grad_norm": 1.1741259098052979, + "learning_rate": 1.990386329753516e-05, + "loss": 0.5398, + "step": 1990 + }, + { + "epoch": 0.04428363725152884, + "grad_norm": 1.1659259796142578, + "learning_rate": 1.9903380372049832e-05, + "loss": 0.6726, + "step": 1995 + }, + { + "epoch": 0.04439462381105648, + "grad_norm": 1.649901270866394, + "learning_rate": 1.990289624254473e-05, + "loss": 0.5993, + "step": 2000 + }, + { + "epoch": 0.04450561037058412, + "grad_norm": 1.0309951305389404, + "learning_rate": 1.990241090907872e-05, + "loss": 0.6359, + "step": 2005 + }, + { + "epoch": 0.04461659693011177, + "grad_norm": 1.2367216348648071, + "learning_rate": 1.99019243717108e-05, + "loss": 0.6339, + "step": 2010 + }, + { + "epoch": 0.04472758348963941, + "grad_norm": 2.2584967613220215, + "learning_rate": 1.9901436630500122e-05, + "loss": 0.7368, + "step": 2015 + }, + { + "epoch": 0.04483857004916705, + "grad_norm": 1.4870028495788574, + "learning_rate": 1.9900947685505983e-05, + "loss": 0.5566, + "step": 2020 + }, + { + "epoch": 0.04494955660869469, + "grad_norm": 1.9198880195617676, + "learning_rate": 1.9900457536787834e-05, + "loss": 0.582, + "step": 2025 + }, + { + "epoch": 0.045060543168222326, + "grad_norm": 1.3042656183242798, + "learning_rate": 1.9899966184405255e-05, + "loss": 0.552, + "step": 2030 + }, + { + "epoch": 0.045171529727749966, + "grad_norm": 1.6364234685897827, + "learning_rate": 1.9899473628417997e-05, + "loss": 0.6122, + "step": 2035 + }, + { + "epoch": 0.04528251628727761, + "grad_norm": 1.215649962425232, + "learning_rate": 1.9898979868885933e-05, + "loss": 0.4283, + "step": 2040 + }, + { + "epoch": 0.04539350284680525, + "grad_norm": 1.1958799362182617, + "learning_rate": 1.9898484905869095e-05, + "loss": 0.6035, + "step": 2045 + }, + { + "epoch": 0.04550448940633289, + "grad_norm": 1.2653090953826904, + "learning_rate": 1.989798873942766e-05, + "loss": 0.6591, + "step": 2050 + }, + { + "epoch": 0.04561547596586053, + "grad_norm": 1.3977992534637451, + "learning_rate": 1.9897491369621945e-05, + "loss": 0.5056, + "step": 2055 + }, + { + "epoch": 0.04572646252538817, + "grad_norm": 1.7142751216888428, + "learning_rate": 1.9896992796512427e-05, + "loss": 0.7221, + "step": 2060 + }, + { + "epoch": 0.04583744908491582, + "grad_norm": 1.326062560081482, + "learning_rate": 1.9896493020159715e-05, + "loss": 0.5673, + "step": 2065 + }, + { + "epoch": 0.04594843564444346, + "grad_norm": 1.3358633518218994, + "learning_rate": 1.9895992040624573e-05, + "loss": 0.4271, + "step": 2070 + }, + { + "epoch": 0.0460594222039711, + "grad_norm": 1.0794224739074707, + "learning_rate": 1.9895489857967908e-05, + "loss": 0.5734, + "step": 2075 + }, + { + "epoch": 0.04617040876349874, + "grad_norm": 1.3193761110305786, + "learning_rate": 1.989498647225077e-05, + "loss": 0.4369, + "step": 2080 + }, + { + "epoch": 0.04628139532302638, + "grad_norm": 1.1502362489700317, + "learning_rate": 1.9894481883534364e-05, + "loss": 0.5416, + "step": 2085 + }, + { + "epoch": 0.04639238188255402, + "grad_norm": 1.6109846830368042, + "learning_rate": 1.9893976091880033e-05, + "loss": 0.5072, + "step": 2090 + }, + { + "epoch": 0.046503368442081666, + "grad_norm": 1.4186127185821533, + "learning_rate": 1.989346909734927e-05, + "loss": 0.4665, + "step": 2095 + }, + { + "epoch": 0.046614355001609306, + "grad_norm": 1.2119293212890625, + "learning_rate": 1.9892960900003716e-05, + "loss": 0.5227, + "step": 2100 + }, + { + "epoch": 0.046725341561136946, + "grad_norm": 1.1812269687652588, + "learning_rate": 1.9892451499905153e-05, + "loss": 0.5563, + "step": 2105 + }, + { + "epoch": 0.046836328120664586, + "grad_norm": 1.3973302841186523, + "learning_rate": 1.9891940897115513e-05, + "loss": 0.4548, + "step": 2110 + }, + { + "epoch": 0.046947314680192226, + "grad_norm": 2.288679361343384, + "learning_rate": 1.9891429091696873e-05, + "loss": 0.4953, + "step": 2115 + }, + { + "epoch": 0.04705830123971987, + "grad_norm": 1.3049923181533813, + "learning_rate": 1.9890916083711463e-05, + "loss": 0.5927, + "step": 2120 + }, + { + "epoch": 0.04716928779924751, + "grad_norm": 1.1998517513275146, + "learning_rate": 1.9890401873221642e-05, + "loss": 0.6343, + "step": 2125 + }, + { + "epoch": 0.04728027435877515, + "grad_norm": 1.413918375968933, + "learning_rate": 1.988988646028993e-05, + "loss": 0.5395, + "step": 2130 + }, + { + "epoch": 0.04739126091830279, + "grad_norm": 1.0503027439117432, + "learning_rate": 1.9889369844978996e-05, + "loss": 0.6966, + "step": 2135 + }, + { + "epoch": 0.04750224747783043, + "grad_norm": 0.8282071352005005, + "learning_rate": 1.9888852027351636e-05, + "loss": 0.535, + "step": 2140 + }, + { + "epoch": 0.04761323403735808, + "grad_norm": 3.530345916748047, + "learning_rate": 1.9888333007470815e-05, + "loss": 0.6562, + "step": 2145 + }, + { + "epoch": 0.04772422059688572, + "grad_norm": 1.287862777709961, + "learning_rate": 1.988781278539963e-05, + "loss": 0.749, + "step": 2150 + }, + { + "epoch": 0.04783520715641336, + "grad_norm": 1.1997264623641968, + "learning_rate": 1.9887291361201328e-05, + "loss": 0.7251, + "step": 2155 + }, + { + "epoch": 0.047946193715941, + "grad_norm": 1.6142421960830688, + "learning_rate": 1.9886768734939297e-05, + "loss": 0.5778, + "step": 2160 + }, + { + "epoch": 0.04805718027546864, + "grad_norm": 1.3144336938858032, + "learning_rate": 1.9886244906677087e-05, + "loss": 0.6244, + "step": 2165 + }, + { + "epoch": 0.04816816683499628, + "grad_norm": 1.6684969663619995, + "learning_rate": 1.9885719876478374e-05, + "loss": 0.6855, + "step": 2170 + }, + { + "epoch": 0.048279153394523926, + "grad_norm": 1.1297353506088257, + "learning_rate": 1.9885193644406994e-05, + "loss": 0.5669, + "step": 2175 + }, + { + "epoch": 0.048390139954051566, + "grad_norm": 13.155599594116211, + "learning_rate": 1.988466621052692e-05, + "loss": 0.6243, + "step": 2180 + }, + { + "epoch": 0.048501126513579206, + "grad_norm": 1.2790757417678833, + "learning_rate": 1.988413757490228e-05, + "loss": 0.499, + "step": 2185 + }, + { + "epoch": 0.048612113073106845, + "grad_norm": 3.179291248321533, + "learning_rate": 1.9883607737597344e-05, + "loss": 0.6489, + "step": 2190 + }, + { + "epoch": 0.048723099632634485, + "grad_norm": 1.96470046043396, + "learning_rate": 1.9883076698676523e-05, + "loss": 0.6279, + "step": 2195 + }, + { + "epoch": 0.04883408619216213, + "grad_norm": 2.6033077239990234, + "learning_rate": 1.9882544458204386e-05, + "loss": 0.5934, + "step": 2200 + }, + { + "epoch": 0.04894507275168977, + "grad_norm": 1.5848404169082642, + "learning_rate": 1.988201101624564e-05, + "loss": 0.5393, + "step": 2205 + }, + { + "epoch": 0.04905605931121741, + "grad_norm": 1.7415422201156616, + "learning_rate": 1.988147637286513e-05, + "loss": 0.7068, + "step": 2210 + }, + { + "epoch": 0.04916704587074505, + "grad_norm": 1.6085535287857056, + "learning_rate": 1.9880940528127866e-05, + "loss": 0.6785, + "step": 2215 + }, + { + "epoch": 0.04927803243027269, + "grad_norm": 1.283326506614685, + "learning_rate": 1.9880403482098985e-05, + "loss": 0.4254, + "step": 2220 + }, + { + "epoch": 0.04938901898980033, + "grad_norm": 1.446057915687561, + "learning_rate": 1.9879865234843795e-05, + "loss": 0.5926, + "step": 2225 + }, + { + "epoch": 0.04950000554932798, + "grad_norm": 1.6062350273132324, + "learning_rate": 1.9879325786427716e-05, + "loss": 0.7388, + "step": 2230 + }, + { + "epoch": 0.04961099210885562, + "grad_norm": 1.3095510005950928, + "learning_rate": 1.9878785136916343e-05, + "loss": 0.6358, + "step": 2235 + }, + { + "epoch": 0.04972197866838326, + "grad_norm": 1.1560978889465332, + "learning_rate": 1.9878243286375405e-05, + "loss": 0.5217, + "step": 2240 + }, + { + "epoch": 0.0498329652279109, + "grad_norm": 1.2814440727233887, + "learning_rate": 1.9877700234870775e-05, + "loss": 0.6681, + "step": 2245 + }, + { + "epoch": 0.04994395178743854, + "grad_norm": 1.244834065437317, + "learning_rate": 1.9877155982468478e-05, + "loss": 0.6368, + "step": 2250 + }, + { + "epoch": 0.050054938346966185, + "grad_norm": 1.2957700490951538, + "learning_rate": 1.9876610529234686e-05, + "loss": 0.6633, + "step": 2255 + }, + { + "epoch": 0.050165924906493825, + "grad_norm": 1.300630807876587, + "learning_rate": 1.987606387523571e-05, + "loss": 0.5377, + "step": 2260 + }, + { + "epoch": 0.050276911466021465, + "grad_norm": 1.4381990432739258, + "learning_rate": 1.9875516020538e-05, + "loss": 0.4867, + "step": 2265 + }, + { + "epoch": 0.050387898025549105, + "grad_norm": 1.7026110887527466, + "learning_rate": 1.987496696520818e-05, + "loss": 0.6134, + "step": 2270 + }, + { + "epoch": 0.050498884585076745, + "grad_norm": 1.3532987833023071, + "learning_rate": 1.9874416709312994e-05, + "loss": 0.8066, + "step": 2275 + }, + { + "epoch": 0.05060987114460439, + "grad_norm": 1.2888226509094238, + "learning_rate": 1.987386525291934e-05, + "loss": 0.5613, + "step": 2280 + }, + { + "epoch": 0.05072085770413203, + "grad_norm": 1.1642128229141235, + "learning_rate": 1.9873312596094264e-05, + "loss": 0.5735, + "step": 2285 + }, + { + "epoch": 0.05083184426365967, + "grad_norm": 1.2660547494888306, + "learning_rate": 1.9872758738904952e-05, + "loss": 0.6068, + "step": 2290 + }, + { + "epoch": 0.05094283082318731, + "grad_norm": 1.2771880626678467, + "learning_rate": 1.9872203681418745e-05, + "loss": 0.575, + "step": 2295 + }, + { + "epoch": 0.05105381738271495, + "grad_norm": 0.9723901152610779, + "learning_rate": 1.9871647423703126e-05, + "loss": 0.5135, + "step": 2300 + }, + { + "epoch": 0.05116480394224259, + "grad_norm": 1.1216762065887451, + "learning_rate": 1.9871089965825713e-05, + "loss": 0.5082, + "step": 2305 + }, + { + "epoch": 0.05127579050177024, + "grad_norm": 1.0608837604522705, + "learning_rate": 1.987053130785429e-05, + "loss": 0.4207, + "step": 2310 + }, + { + "epoch": 0.05138677706129788, + "grad_norm": 1.3452649116516113, + "learning_rate": 1.9869971449856778e-05, + "loss": 0.6087, + "step": 2315 + }, + { + "epoch": 0.05149776362082552, + "grad_norm": 1.0215181112289429, + "learning_rate": 1.9869410391901237e-05, + "loss": 0.4663, + "step": 2320 + }, + { + "epoch": 0.05160875018035316, + "grad_norm": 1.0570729970932007, + "learning_rate": 1.986884813405588e-05, + "loss": 0.5788, + "step": 2325 + }, + { + "epoch": 0.0517197367398808, + "grad_norm": 1.1869689226150513, + "learning_rate": 1.986828467638906e-05, + "loss": 0.6118, + "step": 2330 + }, + { + "epoch": 0.051830723299408445, + "grad_norm": 0.9934154748916626, + "learning_rate": 1.9867720018969287e-05, + "loss": 0.7029, + "step": 2335 + }, + { + "epoch": 0.051941709858936085, + "grad_norm": 1.4993884563446045, + "learning_rate": 1.9867154161865207e-05, + "loss": 0.6805, + "step": 2340 + }, + { + "epoch": 0.052052696418463724, + "grad_norm": 1.1254603862762451, + "learning_rate": 1.9866587105145617e-05, + "loss": 0.4247, + "step": 2345 + }, + { + "epoch": 0.052163682977991364, + "grad_norm": 1.353786587715149, + "learning_rate": 1.986601884887946e-05, + "loss": 0.6416, + "step": 2350 + }, + { + "epoch": 0.052274669537519004, + "grad_norm": 1.1726566553115845, + "learning_rate": 1.9865449393135816e-05, + "loss": 0.7834, + "step": 2355 + }, + { + "epoch": 0.052385656097046644, + "grad_norm": 1.455482006072998, + "learning_rate": 1.986487873798392e-05, + "loss": 0.6141, + "step": 2360 + }, + { + "epoch": 0.05249664265657429, + "grad_norm": 2.1738498210906982, + "learning_rate": 1.9864306883493154e-05, + "loss": 0.5314, + "step": 2365 + }, + { + "epoch": 0.05260762921610193, + "grad_norm": 1.433218002319336, + "learning_rate": 1.986373382973304e-05, + "loss": 0.5529, + "step": 2370 + }, + { + "epoch": 0.05271861577562957, + "grad_norm": 1.2258576154708862, + "learning_rate": 1.9863159576773243e-05, + "loss": 0.5788, + "step": 2375 + }, + { + "epoch": 0.05282960233515721, + "grad_norm": 1.0413432121276855, + "learning_rate": 1.9862584124683587e-05, + "loss": 0.5304, + "step": 2380 + }, + { + "epoch": 0.05294058889468485, + "grad_norm": 1.2066630125045776, + "learning_rate": 1.9862007473534026e-05, + "loss": 0.5745, + "step": 2385 + }, + { + "epoch": 0.0530515754542125, + "grad_norm": 0.985165536403656, + "learning_rate": 1.9861429623394676e-05, + "loss": 0.5299, + "step": 2390 + }, + { + "epoch": 0.05316256201374014, + "grad_norm": 1.5206027030944824, + "learning_rate": 1.986085057433578e-05, + "loss": 0.3735, + "step": 2395 + }, + { + "epoch": 0.05327354857326778, + "grad_norm": 1.7408087253570557, + "learning_rate": 1.9860270326427743e-05, + "loss": 0.6242, + "step": 2400 + }, + { + "epoch": 0.05338453513279542, + "grad_norm": 1.4238530397415161, + "learning_rate": 1.985968887974111e-05, + "loss": 0.764, + "step": 2405 + }, + { + "epoch": 0.05349552169232306, + "grad_norm": 1.4702534675598145, + "learning_rate": 1.985910623434657e-05, + "loss": 0.5784, + "step": 2410 + }, + { + "epoch": 0.053606508251850704, + "grad_norm": 1.1298567056655884, + "learning_rate": 1.985852239031496e-05, + "loss": 0.6189, + "step": 2415 + }, + { + "epoch": 0.053717494811378344, + "grad_norm": 0.9069703221321106, + "learning_rate": 1.985793734771726e-05, + "loss": 0.5496, + "step": 2420 + }, + { + "epoch": 0.053828481370905984, + "grad_norm": 1.1809360980987549, + "learning_rate": 1.9857351106624595e-05, + "loss": 0.3049, + "step": 2425 + }, + { + "epoch": 0.053939467930433624, + "grad_norm": 2.4820363521575928, + "learning_rate": 1.9856763667108243e-05, + "loss": 0.484, + "step": 2430 + }, + { + "epoch": 0.054050454489961264, + "grad_norm": 1.0644019842147827, + "learning_rate": 1.9856175029239624e-05, + "loss": 0.5912, + "step": 2435 + }, + { + "epoch": 0.054161441049488904, + "grad_norm": 1.3724855184555054, + "learning_rate": 1.9855585193090297e-05, + "loss": 0.4491, + "step": 2440 + }, + { + "epoch": 0.05427242760901655, + "grad_norm": 1.0048718452453613, + "learning_rate": 1.9854994158731978e-05, + "loss": 0.4543, + "step": 2445 + }, + { + "epoch": 0.05438341416854419, + "grad_norm": 1.835546612739563, + "learning_rate": 1.9854401926236518e-05, + "loss": 0.5102, + "step": 2450 + }, + { + "epoch": 0.05449440072807183, + "grad_norm": 1.3178610801696777, + "learning_rate": 1.985380849567592e-05, + "loss": 0.6391, + "step": 2455 + }, + { + "epoch": 0.05460538728759947, + "grad_norm": 1.2502537965774536, + "learning_rate": 1.9853213867122333e-05, + "loss": 0.4346, + "step": 2460 + }, + { + "epoch": 0.05471637384712711, + "grad_norm": 1.0760316848754883, + "learning_rate": 1.985261804064805e-05, + "loss": 0.5103, + "step": 2465 + }, + { + "epoch": 0.05482736040665476, + "grad_norm": 0.797814130783081, + "learning_rate": 1.985202101632551e-05, + "loss": 0.4749, + "step": 2470 + }, + { + "epoch": 0.0549383469661824, + "grad_norm": 1.1549592018127441, + "learning_rate": 1.9851422794227295e-05, + "loss": 0.5325, + "step": 2475 + }, + { + "epoch": 0.05504933352571004, + "grad_norm": 1.5827157497406006, + "learning_rate": 1.9850823374426136e-05, + "loss": 0.5886, + "step": 2480 + }, + { + "epoch": 0.05516032008523768, + "grad_norm": 1.293227195739746, + "learning_rate": 1.985022275699491e-05, + "loss": 0.6185, + "step": 2485 + }, + { + "epoch": 0.05527130664476532, + "grad_norm": 1.5905836820602417, + "learning_rate": 1.984962094200663e-05, + "loss": 0.6508, + "step": 2490 + }, + { + "epoch": 0.05538229320429296, + "grad_norm": 1.8819011449813843, + "learning_rate": 1.9849017929534474e-05, + "loss": 0.7041, + "step": 2495 + }, + { + "epoch": 0.055493279763820604, + "grad_norm": 1.5715081691741943, + "learning_rate": 1.9848413719651745e-05, + "loss": 0.5766, + "step": 2500 + }, + { + "epoch": 0.05560426632334824, + "grad_norm": 1.348044514656067, + "learning_rate": 1.984780831243191e-05, + "loss": 0.751, + "step": 2505 + }, + { + "epoch": 0.05571525288287588, + "grad_norm": 1.3613786697387695, + "learning_rate": 1.9847201707948567e-05, + "loss": 0.5741, + "step": 2510 + }, + { + "epoch": 0.05582623944240352, + "grad_norm": 1.4741175174713135, + "learning_rate": 1.9846593906275463e-05, + "loss": 0.5456, + "step": 2515 + }, + { + "epoch": 0.05593722600193116, + "grad_norm": 1.2884457111358643, + "learning_rate": 1.9845984907486494e-05, + "loss": 0.6757, + "step": 2520 + }, + { + "epoch": 0.05604821256145881, + "grad_norm": 1.2677499055862427, + "learning_rate": 1.9845374711655703e-05, + "loss": 0.6415, + "step": 2525 + }, + { + "epoch": 0.05615919912098645, + "grad_norm": 1.4850881099700928, + "learning_rate": 1.9844763318857275e-05, + "loss": 0.6886, + "step": 2530 + }, + { + "epoch": 0.05627018568051409, + "grad_norm": 1.1189000606536865, + "learning_rate": 1.9844150729165536e-05, + "loss": 0.5813, + "step": 2535 + }, + { + "epoch": 0.05638117224004173, + "grad_norm": 1.2875266075134277, + "learning_rate": 1.9843536942654967e-05, + "loss": 0.4793, + "step": 2540 + }, + { + "epoch": 0.05649215879956937, + "grad_norm": 1.367203950881958, + "learning_rate": 1.984292195940019e-05, + "loss": 0.4914, + "step": 2545 + }, + { + "epoch": 0.05660314535909702, + "grad_norm": 1.3838480710983276, + "learning_rate": 1.984230577947597e-05, + "loss": 0.6269, + "step": 2550 + }, + { + "epoch": 0.056714131918624656, + "grad_norm": 1.0736616849899292, + "learning_rate": 1.9841688402957223e-05, + "loss": 0.7574, + "step": 2555 + }, + { + "epoch": 0.056825118478152296, + "grad_norm": 1.273168683052063, + "learning_rate": 1.9841069829919006e-05, + "loss": 0.5862, + "step": 2560 + }, + { + "epoch": 0.056936105037679936, + "grad_norm": 1.210391640663147, + "learning_rate": 1.9840450060436523e-05, + "loss": 0.5604, + "step": 2565 + }, + { + "epoch": 0.057047091597207576, + "grad_norm": 1.031866431236267, + "learning_rate": 1.9839829094585125e-05, + "loss": 0.5356, + "step": 2570 + }, + { + "epoch": 0.057158078156735216, + "grad_norm": 1.0833243131637573, + "learning_rate": 1.9839206932440307e-05, + "loss": 0.5123, + "step": 2575 + }, + { + "epoch": 0.05726906471626286, + "grad_norm": 1.3849217891693115, + "learning_rate": 1.983858357407771e-05, + "loss": 0.6217, + "step": 2580 + }, + { + "epoch": 0.0573800512757905, + "grad_norm": 1.1831570863723755, + "learning_rate": 1.983795901957311e-05, + "loss": 0.704, + "step": 2585 + }, + { + "epoch": 0.05749103783531814, + "grad_norm": 1.2546558380126953, + "learning_rate": 1.9837333269002452e-05, + "loss": 0.5062, + "step": 2590 + }, + { + "epoch": 0.05760202439484578, + "grad_norm": 1.8262821435928345, + "learning_rate": 1.9836706322441806e-05, + "loss": 0.7465, + "step": 2595 + }, + { + "epoch": 0.05771301095437342, + "grad_norm": 1.107417345046997, + "learning_rate": 1.9836078179967394e-05, + "loss": 0.5504, + "step": 2600 + }, + { + "epoch": 0.05782399751390107, + "grad_norm": 1.3223316669464111, + "learning_rate": 1.983544884165559e-05, + "loss": 0.6295, + "step": 2605 + }, + { + "epoch": 0.05793498407342871, + "grad_norm": 1.1613926887512207, + "learning_rate": 1.9834818307582896e-05, + "loss": 0.5908, + "step": 2610 + }, + { + "epoch": 0.05804597063295635, + "grad_norm": 1.026995301246643, + "learning_rate": 1.9834186577825977e-05, + "loss": 0.5483, + "step": 2615 + }, + { + "epoch": 0.05815695719248399, + "grad_norm": 1.147704005241394, + "learning_rate": 1.9833553652461636e-05, + "loss": 0.5194, + "step": 2620 + }, + { + "epoch": 0.05826794375201163, + "grad_norm": 1.3523058891296387, + "learning_rate": 1.9832919531566822e-05, + "loss": 0.6718, + "step": 2625 + }, + { + "epoch": 0.058378930311539276, + "grad_norm": 1.3443208932876587, + "learning_rate": 1.9832284215218623e-05, + "loss": 0.7098, + "step": 2630 + }, + { + "epoch": 0.058489916871066916, + "grad_norm": 1.56340491771698, + "learning_rate": 1.9831647703494287e-05, + "loss": 0.6716, + "step": 2635 + }, + { + "epoch": 0.058600903430594556, + "grad_norm": 1.7553986310958862, + "learning_rate": 1.9831009996471197e-05, + "loss": 0.5632, + "step": 2640 + }, + { + "epoch": 0.058711889990122196, + "grad_norm": 1.7398900985717773, + "learning_rate": 1.9830371094226882e-05, + "loss": 0.5898, + "step": 2645 + }, + { + "epoch": 0.058822876549649836, + "grad_norm": 1.50764799118042, + "learning_rate": 1.982973099683902e-05, + "loss": 0.6524, + "step": 2650 + }, + { + "epoch": 0.058933863109177476, + "grad_norm": 1.5842028856277466, + "learning_rate": 1.9829089704385426e-05, + "loss": 0.5745, + "step": 2655 + }, + { + "epoch": 0.05904484966870512, + "grad_norm": 1.5821632146835327, + "learning_rate": 1.982844721694407e-05, + "loss": 0.7652, + "step": 2660 + }, + { + "epoch": 0.05915583622823276, + "grad_norm": 2.011687755584717, + "learning_rate": 1.982780353459307e-05, + "loss": 0.5884, + "step": 2665 + }, + { + "epoch": 0.0592668227877604, + "grad_norm": 1.7357864379882812, + "learning_rate": 1.9827158657410667e-05, + "loss": 0.6314, + "step": 2670 + }, + { + "epoch": 0.05937780934728804, + "grad_norm": 1.1531578302383423, + "learning_rate": 1.982651258547528e-05, + "loss": 0.5567, + "step": 2675 + }, + { + "epoch": 0.05948879590681568, + "grad_norm": 1.2788385152816772, + "learning_rate": 1.982586531886544e-05, + "loss": 0.4528, + "step": 2680 + }, + { + "epoch": 0.05959978246634333, + "grad_norm": 1.8556158542633057, + "learning_rate": 1.9825216857659855e-05, + "loss": 0.5766, + "step": 2685 + }, + { + "epoch": 0.05971076902587097, + "grad_norm": 1.9483803510665894, + "learning_rate": 1.9824567201937354e-05, + "loss": 0.584, + "step": 2690 + }, + { + "epoch": 0.05982175558539861, + "grad_norm": 1.6182191371917725, + "learning_rate": 1.9823916351776922e-05, + "loss": 0.6361, + "step": 2695 + }, + { + "epoch": 0.05993274214492625, + "grad_norm": 1.289291501045227, + "learning_rate": 1.9823264307257683e-05, + "loss": 0.4522, + "step": 2700 + }, + { + "epoch": 0.06004372870445389, + "grad_norm": 1.3541606664657593, + "learning_rate": 1.9822611068458916e-05, + "loss": 0.7097, + "step": 2705 + }, + { + "epoch": 0.06015471526398153, + "grad_norm": 2.0710556507110596, + "learning_rate": 1.982195663546004e-05, + "loss": 0.5523, + "step": 2710 + }, + { + "epoch": 0.060265701823509175, + "grad_norm": 1.5621989965438843, + "learning_rate": 1.9821301008340614e-05, + "loss": 0.5443, + "step": 2715 + }, + { + "epoch": 0.060376688383036815, + "grad_norm": 1.361464500427246, + "learning_rate": 1.9820644187180354e-05, + "loss": 0.7128, + "step": 2720 + }, + { + "epoch": 0.060487674942564455, + "grad_norm": 1.134830355644226, + "learning_rate": 1.9819986172059105e-05, + "loss": 0.6254, + "step": 2725 + }, + { + "epoch": 0.060598661502092095, + "grad_norm": 1.1800482273101807, + "learning_rate": 1.981932696305687e-05, + "loss": 0.5438, + "step": 2730 + }, + { + "epoch": 0.060709648061619735, + "grad_norm": 1.2705154418945312, + "learning_rate": 1.98186665602538e-05, + "loss": 0.5399, + "step": 2735 + }, + { + "epoch": 0.06082063462114738, + "grad_norm": 1.5200409889221191, + "learning_rate": 1.9818004963730174e-05, + "loss": 0.6409, + "step": 2740 + }, + { + "epoch": 0.06093162118067502, + "grad_norm": 1.0456277132034302, + "learning_rate": 1.9817342173566435e-05, + "loss": 0.5863, + "step": 2745 + }, + { + "epoch": 0.06104260774020266, + "grad_norm": 1.1642462015151978, + "learning_rate": 1.9816678189843156e-05, + "loss": 0.5681, + "step": 2750 + }, + { + "epoch": 0.0611535942997303, + "grad_norm": 1.3238295316696167, + "learning_rate": 1.9816013012641066e-05, + "loss": 0.7381, + "step": 2755 + }, + { + "epoch": 0.06126458085925794, + "grad_norm": 1.229519009590149, + "learning_rate": 1.9815346642041032e-05, + "loss": 0.8392, + "step": 2760 + }, + { + "epoch": 0.06137556741878559, + "grad_norm": 1.1470119953155518, + "learning_rate": 1.9814679078124076e-05, + "loss": 0.6714, + "step": 2765 + }, + { + "epoch": 0.06148655397831323, + "grad_norm": 1.3214852809906006, + "learning_rate": 1.9814010320971353e-05, + "loss": 0.5478, + "step": 2770 + }, + { + "epoch": 0.06159754053784087, + "grad_norm": 1.2812148332595825, + "learning_rate": 1.9813340370664167e-05, + "loss": 0.6498, + "step": 2775 + }, + { + "epoch": 0.06170852709736851, + "grad_norm": 1.038292646408081, + "learning_rate": 1.981266922728397e-05, + "loss": 0.4434, + "step": 2780 + }, + { + "epoch": 0.06181951365689615, + "grad_norm": 0.957730233669281, + "learning_rate": 1.981199689091236e-05, + "loss": 0.5538, + "step": 2785 + }, + { + "epoch": 0.06193050021642379, + "grad_norm": 1.2136549949645996, + "learning_rate": 1.981132336163107e-05, + "loss": 0.5467, + "step": 2790 + }, + { + "epoch": 0.062041486775951435, + "grad_norm": 1.0682697296142578, + "learning_rate": 1.9810648639521996e-05, + "loss": 0.4941, + "step": 2795 + }, + { + "epoch": 0.062152473335479075, + "grad_norm": 1.0867773294448853, + "learning_rate": 1.9809972724667158e-05, + "loss": 0.5495, + "step": 2800 + }, + { + "epoch": 0.062263459895006715, + "grad_norm": 0.9544880986213684, + "learning_rate": 1.980929561714874e-05, + "loss": 0.5601, + "step": 2805 + }, + { + "epoch": 0.062374446454534355, + "grad_norm": 3.496631145477295, + "learning_rate": 1.9808617317049055e-05, + "loss": 0.6168, + "step": 2810 + }, + { + "epoch": 0.062485433014061995, + "grad_norm": 1.2551007270812988, + "learning_rate": 1.9807937824450576e-05, + "loss": 0.3769, + "step": 2815 + }, + { + "epoch": 0.06259641957358963, + "grad_norm": 1.3092126846313477, + "learning_rate": 1.9807257139435906e-05, + "loss": 0.6537, + "step": 2820 + }, + { + "epoch": 0.06270740613311727, + "grad_norm": 1.0793254375457764, + "learning_rate": 1.9806575262087806e-05, + "loss": 0.686, + "step": 2825 + }, + { + "epoch": 0.06281839269264491, + "grad_norm": 1.1246548891067505, + "learning_rate": 1.9805892192489177e-05, + "loss": 0.5737, + "step": 2830 + }, + { + "epoch": 0.06292937925217257, + "grad_norm": 1.2988542318344116, + "learning_rate": 1.9805207930723056e-05, + "loss": 0.6251, + "step": 2835 + }, + { + "epoch": 0.06304036581170021, + "grad_norm": 1.405332326889038, + "learning_rate": 1.9804522476872644e-05, + "loss": 0.5313, + "step": 2840 + }, + { + "epoch": 0.06315135237122785, + "grad_norm": 1.222214937210083, + "learning_rate": 1.9803835831021264e-05, + "loss": 0.4689, + "step": 2845 + }, + { + "epoch": 0.06326233893075549, + "grad_norm": 1.2090331315994263, + "learning_rate": 1.980314799325241e-05, + "loss": 0.6677, + "step": 2850 + }, + { + "epoch": 0.06337332549028313, + "grad_norm": 1.2813634872436523, + "learning_rate": 1.9802458963649696e-05, + "loss": 0.4276, + "step": 2855 + }, + { + "epoch": 0.06348431204981077, + "grad_norm": 1.3058404922485352, + "learning_rate": 1.98017687422969e-05, + "loss": 0.5755, + "step": 2860 + }, + { + "epoch": 0.06359529860933841, + "grad_norm": 1.4632521867752075, + "learning_rate": 1.9801077329277932e-05, + "loss": 0.4694, + "step": 2865 + }, + { + "epoch": 0.06370628516886605, + "grad_norm": 1.2117215394973755, + "learning_rate": 1.980038472467685e-05, + "loss": 0.6069, + "step": 2870 + }, + { + "epoch": 0.06381727172839369, + "grad_norm": 0.6046699285507202, + "learning_rate": 1.9799690928577865e-05, + "loss": 0.4746, + "step": 2875 + }, + { + "epoch": 0.06392825828792133, + "grad_norm": 1.6112840175628662, + "learning_rate": 1.9798995941065318e-05, + "loss": 0.4418, + "step": 2880 + }, + { + "epoch": 0.06403924484744897, + "grad_norm": 1.112791895866394, + "learning_rate": 1.9798299762223713e-05, + "loss": 0.7471, + "step": 2885 + }, + { + "epoch": 0.06415023140697662, + "grad_norm": 2.807286262512207, + "learning_rate": 1.9797602392137678e-05, + "loss": 0.4693, + "step": 2890 + }, + { + "epoch": 0.06426121796650426, + "grad_norm": 1.076808214187622, + "learning_rate": 1.9796903830892008e-05, + "loss": 0.479, + "step": 2895 + }, + { + "epoch": 0.0643722045260319, + "grad_norm": 0.9873941540718079, + "learning_rate": 1.9796204078571623e-05, + "loss": 0.6405, + "step": 2900 + }, + { + "epoch": 0.06448319108555954, + "grad_norm": 1.0882630348205566, + "learning_rate": 1.97955031352616e-05, + "loss": 0.5057, + "step": 2905 + }, + { + "epoch": 0.06459417764508718, + "grad_norm": 1.2589354515075684, + "learning_rate": 1.9794801001047158e-05, + "loss": 0.5496, + "step": 2910 + }, + { + "epoch": 0.06470516420461482, + "grad_norm": 1.3534345626831055, + "learning_rate": 1.979409767601366e-05, + "loss": 0.6252, + "step": 2915 + }, + { + "epoch": 0.06481615076414246, + "grad_norm": 1.2870107889175415, + "learning_rate": 1.9793393160246613e-05, + "loss": 0.572, + "step": 2920 + }, + { + "epoch": 0.0649271373236701, + "grad_norm": 1.279741883277893, + "learning_rate": 1.9792687453831673e-05, + "loss": 0.5784, + "step": 2925 + }, + { + "epoch": 0.06503812388319774, + "grad_norm": 1.2085089683532715, + "learning_rate": 1.979198055685463e-05, + "loss": 0.5717, + "step": 2930 + }, + { + "epoch": 0.06514911044272538, + "grad_norm": 1.5447397232055664, + "learning_rate": 1.9791272469401432e-05, + "loss": 0.6508, + "step": 2935 + }, + { + "epoch": 0.06526009700225302, + "grad_norm": 1.1712005138397217, + "learning_rate": 1.9790563191558167e-05, + "loss": 0.5406, + "step": 2940 + }, + { + "epoch": 0.06537108356178067, + "grad_norm": 1.8229413032531738, + "learning_rate": 1.978985272341106e-05, + "loss": 0.5619, + "step": 2945 + }, + { + "epoch": 0.06548207012130831, + "grad_norm": 3.0190622806549072, + "learning_rate": 1.9789141065046495e-05, + "loss": 0.5988, + "step": 2950 + }, + { + "epoch": 0.06559305668083595, + "grad_norm": 1.4421160221099854, + "learning_rate": 1.9788428216550988e-05, + "loss": 0.6442, + "step": 2955 + }, + { + "epoch": 0.0657040432403636, + "grad_norm": 1.2881371974945068, + "learning_rate": 1.9787714178011206e-05, + "loss": 0.5356, + "step": 2960 + }, + { + "epoch": 0.06581502979989123, + "grad_norm": 1.5065562725067139, + "learning_rate": 1.978699894951396e-05, + "loss": 0.5315, + "step": 2965 + }, + { + "epoch": 0.06592601635941887, + "grad_norm": 1.547042965888977, + "learning_rate": 1.9786282531146207e-05, + "loss": 0.5213, + "step": 2970 + }, + { + "epoch": 0.06603700291894651, + "grad_norm": 1.183430790901184, + "learning_rate": 1.9785564922995042e-05, + "loss": 0.4528, + "step": 2975 + }, + { + "epoch": 0.06614798947847415, + "grad_norm": 0.9143704771995544, + "learning_rate": 1.9784846125147712e-05, + "loss": 0.6407, + "step": 2980 + }, + { + "epoch": 0.0662589760380018, + "grad_norm": 0.9499570727348328, + "learning_rate": 1.9784126137691606e-05, + "loss": 0.2765, + "step": 2985 + }, + { + "epoch": 0.06636996259752943, + "grad_norm": 1.2335505485534668, + "learning_rate": 1.9783404960714258e-05, + "loss": 0.6352, + "step": 2990 + }, + { + "epoch": 0.06648094915705709, + "grad_norm": 1.1898120641708374, + "learning_rate": 1.9782682594303348e-05, + "loss": 0.4826, + "step": 2995 + }, + { + "epoch": 0.06659193571658473, + "grad_norm": 1.3998817205429077, + "learning_rate": 1.9781959038546693e-05, + "loss": 0.5894, + "step": 3000 + }, + { + "epoch": 0.06670292227611237, + "grad_norm": 1.0510846376419067, + "learning_rate": 1.9781234293532264e-05, + "loss": 0.6697, + "step": 3005 + }, + { + "epoch": 0.06681390883564, + "grad_norm": 1.3648271560668945, + "learning_rate": 1.9780508359348175e-05, + "loss": 0.6685, + "step": 3010 + }, + { + "epoch": 0.06692489539516765, + "grad_norm": 1.1597286462783813, + "learning_rate": 1.9779781236082683e-05, + "loss": 0.589, + "step": 3015 + }, + { + "epoch": 0.06703588195469529, + "grad_norm": 1.1021475791931152, + "learning_rate": 1.9779052923824186e-05, + "loss": 0.6973, + "step": 3020 + }, + { + "epoch": 0.06714686851422293, + "grad_norm": 1.321478247642517, + "learning_rate": 1.977832342266123e-05, + "loss": 0.6676, + "step": 3025 + }, + { + "epoch": 0.06725785507375057, + "grad_norm": 1.3852922916412354, + "learning_rate": 1.9777592732682507e-05, + "loss": 0.579, + "step": 3030 + }, + { + "epoch": 0.0673688416332782, + "grad_norm": 1.547518253326416, + "learning_rate": 1.977686085397685e-05, + "loss": 0.5779, + "step": 3035 + }, + { + "epoch": 0.06747982819280585, + "grad_norm": 0.9579359889030457, + "learning_rate": 1.977612778663324e-05, + "loss": 0.5756, + "step": 3040 + }, + { + "epoch": 0.06759081475233349, + "grad_norm": 1.5292962789535522, + "learning_rate": 1.9775393530740797e-05, + "loss": 0.6468, + "step": 3045 + }, + { + "epoch": 0.06770180131186114, + "grad_norm": 1.042737364768982, + "learning_rate": 1.97746580863888e-05, + "loss": 0.5905, + "step": 3050 + }, + { + "epoch": 0.06781278787138878, + "grad_norm": 1.162990927696228, + "learning_rate": 1.9773921453666647e-05, + "loss": 0.5959, + "step": 3055 + }, + { + "epoch": 0.06792377443091642, + "grad_norm": 1.2333893775939941, + "learning_rate": 1.9773183632663907e-05, + "loss": 0.6628, + "step": 3060 + }, + { + "epoch": 0.06803476099044406, + "grad_norm": 0.8473436236381531, + "learning_rate": 1.9772444623470277e-05, + "loss": 0.5039, + "step": 3065 + }, + { + "epoch": 0.0681457475499717, + "grad_norm": 1.0550405979156494, + "learning_rate": 1.9771704426175605e-05, + "loss": 0.5449, + "step": 3070 + }, + { + "epoch": 0.06825673410949934, + "grad_norm": 1.0942840576171875, + "learning_rate": 1.9770963040869878e-05, + "loss": 0.5346, + "step": 3075 + }, + { + "epoch": 0.06836772066902698, + "grad_norm": 1.232731819152832, + "learning_rate": 1.9770220467643235e-05, + "loss": 0.47, + "step": 3080 + }, + { + "epoch": 0.06847870722855462, + "grad_norm": 1.2322814464569092, + "learning_rate": 1.9769476706585956e-05, + "loss": 0.4938, + "step": 3085 + }, + { + "epoch": 0.06858969378808226, + "grad_norm": 1.3374191522598267, + "learning_rate": 1.9768731757788462e-05, + "loss": 0.755, + "step": 3090 + }, + { + "epoch": 0.0687006803476099, + "grad_norm": 0.9323263168334961, + "learning_rate": 1.976798562134132e-05, + "loss": 0.5455, + "step": 3095 + }, + { + "epoch": 0.06881166690713754, + "grad_norm": 1.150692105293274, + "learning_rate": 1.976723829733525e-05, + "loss": 0.5345, + "step": 3100 + }, + { + "epoch": 0.06892265346666519, + "grad_norm": 1.3430273532867432, + "learning_rate": 1.97664897858611e-05, + "loss": 0.6931, + "step": 3105 + }, + { + "epoch": 0.06903364002619283, + "grad_norm": 1.2106941938400269, + "learning_rate": 1.976574008700988e-05, + "loss": 0.4909, + "step": 3110 + }, + { + "epoch": 0.06914462658572047, + "grad_norm": 1.5072520971298218, + "learning_rate": 1.976498920087273e-05, + "loss": 0.5841, + "step": 3115 + }, + { + "epoch": 0.06925561314524811, + "grad_norm": 1.5390129089355469, + "learning_rate": 1.9764237127540943e-05, + "loss": 0.6413, + "step": 3120 + }, + { + "epoch": 0.06936659970477575, + "grad_norm": 1.5383436679840088, + "learning_rate": 1.976348386710595e-05, + "loss": 0.5086, + "step": 3125 + }, + { + "epoch": 0.06947758626430339, + "grad_norm": 1.0449554920196533, + "learning_rate": 1.9762729419659335e-05, + "loss": 0.4917, + "step": 3130 + }, + { + "epoch": 0.06958857282383103, + "grad_norm": 1.008422613143921, + "learning_rate": 1.9761973785292822e-05, + "loss": 0.5789, + "step": 3135 + }, + { + "epoch": 0.06969955938335867, + "grad_norm": 1.426741600036621, + "learning_rate": 1.976121696409827e-05, + "loss": 0.4162, + "step": 3140 + }, + { + "epoch": 0.06981054594288631, + "grad_norm": 1.1669197082519531, + "learning_rate": 1.9760458956167698e-05, + "loss": 0.6258, + "step": 3145 + }, + { + "epoch": 0.06992153250241395, + "grad_norm": 1.39824378490448, + "learning_rate": 1.975969976159326e-05, + "loss": 0.7105, + "step": 3150 + }, + { + "epoch": 0.07003251906194159, + "grad_norm": 1.38515043258667, + "learning_rate": 1.975893938046726e-05, + "loss": 0.5529, + "step": 3155 + }, + { + "epoch": 0.07014350562146925, + "grad_norm": 1.5153920650482178, + "learning_rate": 1.9758177812882134e-05, + "loss": 0.7217, + "step": 3160 + }, + { + "epoch": 0.07025449218099689, + "grad_norm": 1.4021879434585571, + "learning_rate": 1.9757415058930477e-05, + "loss": 0.4648, + "step": 3165 + }, + { + "epoch": 0.07036547874052453, + "grad_norm": 1.093927264213562, + "learning_rate": 1.9756651118705023e-05, + "loss": 0.3276, + "step": 3170 + }, + { + "epoch": 0.07047646530005217, + "grad_norm": 1.226275086402893, + "learning_rate": 1.9755885992298648e-05, + "loss": 0.6317, + "step": 3175 + }, + { + "epoch": 0.0705874518595798, + "grad_norm": 1.1503514051437378, + "learning_rate": 1.975511967980437e-05, + "loss": 0.5181, + "step": 3180 + }, + { + "epoch": 0.07069843841910745, + "grad_norm": 1.0098612308502197, + "learning_rate": 1.9754352181315358e-05, + "loss": 0.542, + "step": 3185 + }, + { + "epoch": 0.07080942497863509, + "grad_norm": 1.1693663597106934, + "learning_rate": 1.975358349692492e-05, + "loss": 0.6094, + "step": 3190 + }, + { + "epoch": 0.07092041153816273, + "grad_norm": 1.1020994186401367, + "learning_rate": 1.9752813626726512e-05, + "loss": 0.6664, + "step": 3195 + }, + { + "epoch": 0.07103139809769037, + "grad_norm": 0.7710314393043518, + "learning_rate": 1.9752042570813733e-05, + "loss": 0.4683, + "step": 3200 + }, + { + "epoch": 0.071142384657218, + "grad_norm": 1.5390743017196655, + "learning_rate": 1.9751270329280324e-05, + "loss": 0.653, + "step": 3205 + }, + { + "epoch": 0.07125337121674566, + "grad_norm": 1.0856455564498901, + "learning_rate": 1.9750496902220172e-05, + "loss": 0.6865, + "step": 3210 + }, + { + "epoch": 0.0713643577762733, + "grad_norm": 1.365512490272522, + "learning_rate": 1.9749722289727303e-05, + "loss": 0.5476, + "step": 3215 + }, + { + "epoch": 0.07147534433580094, + "grad_norm": 1.1265281438827515, + "learning_rate": 1.97489464918959e-05, + "loss": 0.5567, + "step": 3220 + }, + { + "epoch": 0.07158633089532858, + "grad_norm": 2.017101287841797, + "learning_rate": 1.974816950882028e-05, + "loss": 0.4984, + "step": 3225 + }, + { + "epoch": 0.07169731745485622, + "grad_norm": 1.2441028356552124, + "learning_rate": 1.97473913405949e-05, + "loss": 0.5496, + "step": 3230 + }, + { + "epoch": 0.07180830401438386, + "grad_norm": 1.2447353601455688, + "learning_rate": 1.9746611987314375e-05, + "loss": 0.4673, + "step": 3235 + }, + { + "epoch": 0.0719192905739115, + "grad_norm": 0.9409399628639221, + "learning_rate": 1.9745831449073448e-05, + "loss": 0.4932, + "step": 3240 + }, + { + "epoch": 0.07203027713343914, + "grad_norm": 1.3958779573440552, + "learning_rate": 1.974504972596702e-05, + "loss": 0.6988, + "step": 3245 + }, + { + "epoch": 0.07214126369296678, + "grad_norm": 1.4717780351638794, + "learning_rate": 1.9744266818090127e-05, + "loss": 0.6872, + "step": 3250 + }, + { + "epoch": 0.07225225025249442, + "grad_norm": 1.0255435705184937, + "learning_rate": 1.9743482725537956e-05, + "loss": 0.4785, + "step": 3255 + }, + { + "epoch": 0.07236323681202206, + "grad_norm": 1.3051496744155884, + "learning_rate": 1.9742697448405834e-05, + "loss": 0.5958, + "step": 3260 + }, + { + "epoch": 0.07247422337154971, + "grad_norm": 1.1969037055969238, + "learning_rate": 1.974191098678923e-05, + "loss": 0.473, + "step": 3265 + }, + { + "epoch": 0.07258520993107735, + "grad_norm": 1.1456329822540283, + "learning_rate": 1.9741123340783756e-05, + "loss": 0.5494, + "step": 3270 + }, + { + "epoch": 0.07269619649060499, + "grad_norm": 1.8885293006896973, + "learning_rate": 1.974033451048518e-05, + "loss": 0.4941, + "step": 3275 + }, + { + "epoch": 0.07280718305013263, + "grad_norm": 1.1382877826690674, + "learning_rate": 1.97395444959894e-05, + "loss": 0.5873, + "step": 3280 + }, + { + "epoch": 0.07291816960966027, + "grad_norm": 0.9775564670562744, + "learning_rate": 1.973875329739246e-05, + "loss": 0.5017, + "step": 3285 + }, + { + "epoch": 0.07302915616918791, + "grad_norm": 1.51008141040802, + "learning_rate": 1.9737960914790562e-05, + "loss": 0.5956, + "step": 3290 + }, + { + "epoch": 0.07314014272871555, + "grad_norm": 1.453376293182373, + "learning_rate": 1.973716734828003e-05, + "loss": 0.7438, + "step": 3295 + }, + { + "epoch": 0.07325112928824319, + "grad_norm": 1.4281340837478638, + "learning_rate": 1.973637259795735e-05, + "loss": 0.5661, + "step": 3300 + }, + { + "epoch": 0.07336211584777083, + "grad_norm": 1.2644604444503784, + "learning_rate": 1.9735576663919138e-05, + "loss": 0.7236, + "step": 3305 + }, + { + "epoch": 0.07347310240729847, + "grad_norm": 1.2021888494491577, + "learning_rate": 1.973477954626217e-05, + "loss": 0.5634, + "step": 3310 + }, + { + "epoch": 0.07358408896682611, + "grad_norm": 1.2512317895889282, + "learning_rate": 1.9733981245083355e-05, + "loss": 0.4663, + "step": 3315 + }, + { + "epoch": 0.07369507552635376, + "grad_norm": 1.073941946029663, + "learning_rate": 1.973318176047974e-05, + "loss": 0.6247, + "step": 3320 + }, + { + "epoch": 0.0738060620858814, + "grad_norm": 1.1235560178756714, + "learning_rate": 1.973238109254853e-05, + "loss": 0.6401, + "step": 3325 + }, + { + "epoch": 0.07391704864540904, + "grad_norm": 1.138898491859436, + "learning_rate": 1.9731579241387068e-05, + "loss": 0.5697, + "step": 3330 + }, + { + "epoch": 0.07402803520493668, + "grad_norm": 1.557690143585205, + "learning_rate": 1.9730776207092842e-05, + "loss": 0.5626, + "step": 3335 + }, + { + "epoch": 0.07413902176446432, + "grad_norm": 1.0606776475906372, + "learning_rate": 1.9729971989763474e-05, + "loss": 0.5082, + "step": 3340 + }, + { + "epoch": 0.07425000832399196, + "grad_norm": 1.6427574157714844, + "learning_rate": 1.9729166589496748e-05, + "loss": 0.5541, + "step": 3345 + }, + { + "epoch": 0.0743609948835196, + "grad_norm": 1.224596619606018, + "learning_rate": 1.9728360006390575e-05, + "loss": 0.4651, + "step": 3350 + }, + { + "epoch": 0.07447198144304724, + "grad_norm": 1.2702159881591797, + "learning_rate": 1.9727552240543018e-05, + "loss": 0.5905, + "step": 3355 + }, + { + "epoch": 0.07458296800257488, + "grad_norm": 0.9899827241897583, + "learning_rate": 1.972674329205228e-05, + "loss": 0.5382, + "step": 3360 + }, + { + "epoch": 0.07469395456210252, + "grad_norm": 1.2235766649246216, + "learning_rate": 1.972593316101672e-05, + "loss": 0.53, + "step": 3365 + }, + { + "epoch": 0.07480494112163016, + "grad_norm": 1.7147908210754395, + "learning_rate": 1.972512184753482e-05, + "loss": 0.4942, + "step": 3370 + }, + { + "epoch": 0.07491592768115782, + "grad_norm": 1.137715458869934, + "learning_rate": 1.9724309351705225e-05, + "loss": 0.6969, + "step": 3375 + }, + { + "epoch": 0.07502691424068546, + "grad_norm": 1.1159820556640625, + "learning_rate": 1.972349567362671e-05, + "loss": 0.7205, + "step": 3380 + }, + { + "epoch": 0.0751379008002131, + "grad_norm": 1.1172508001327515, + "learning_rate": 1.97226808133982e-05, + "loss": 0.6243, + "step": 3385 + }, + { + "epoch": 0.07524888735974074, + "grad_norm": 1.3358166217803955, + "learning_rate": 1.9721864771118764e-05, + "loss": 0.542, + "step": 3390 + }, + { + "epoch": 0.07535987391926838, + "grad_norm": 1.0049517154693604, + "learning_rate": 1.9721047546887617e-05, + "loss": 0.6114, + "step": 3395 + }, + { + "epoch": 0.07547086047879602, + "grad_norm": 1.2963844537734985, + "learning_rate": 1.972022914080411e-05, + "loss": 0.4937, + "step": 3400 + }, + { + "epoch": 0.07558184703832366, + "grad_norm": 1.0634539127349854, + "learning_rate": 1.9719409552967744e-05, + "loss": 0.3353, + "step": 3405 + }, + { + "epoch": 0.0756928335978513, + "grad_norm": 0.957071840763092, + "learning_rate": 1.9718588783478156e-05, + "loss": 0.7273, + "step": 3410 + }, + { + "epoch": 0.07580382015737894, + "grad_norm": 1.2983900308609009, + "learning_rate": 1.971776683243514e-05, + "loss": 0.6289, + "step": 3415 + }, + { + "epoch": 0.07591480671690658, + "grad_norm": 0.9925946593284607, + "learning_rate": 1.9716943699938624e-05, + "loss": 0.5156, + "step": 3420 + }, + { + "epoch": 0.07602579327643422, + "grad_norm": 1.1367206573486328, + "learning_rate": 1.971611938608868e-05, + "loss": 0.5861, + "step": 3425 + }, + { + "epoch": 0.07613677983596187, + "grad_norm": 1.0216517448425293, + "learning_rate": 1.971529389098553e-05, + "loss": 0.6049, + "step": 3430 + }, + { + "epoch": 0.07624776639548951, + "grad_norm": 1.3960784673690796, + "learning_rate": 1.971446721472953e-05, + "loss": 0.7103, + "step": 3435 + }, + { + "epoch": 0.07635875295501715, + "grad_norm": 1.090830683708191, + "learning_rate": 1.9713639357421182e-05, + "loss": 0.5972, + "step": 3440 + }, + { + "epoch": 0.07646973951454479, + "grad_norm": 1.1981265544891357, + "learning_rate": 1.971281031916114e-05, + "loss": 0.5033, + "step": 3445 + }, + { + "epoch": 0.07658072607407243, + "grad_norm": 1.483801007270813, + "learning_rate": 1.9711980100050196e-05, + "loss": 0.445, + "step": 3450 + }, + { + "epoch": 0.07669171263360007, + "grad_norm": 1.3446348905563354, + "learning_rate": 1.971114870018928e-05, + "loss": 0.6291, + "step": 3455 + }, + { + "epoch": 0.07680269919312771, + "grad_norm": 1.2881728410720825, + "learning_rate": 1.9710316119679474e-05, + "loss": 0.4737, + "step": 3460 + }, + { + "epoch": 0.07691368575265535, + "grad_norm": 1.2318109273910522, + "learning_rate": 1.9709482358622002e-05, + "loss": 0.4223, + "step": 3465 + }, + { + "epoch": 0.07702467231218299, + "grad_norm": 1.3753089904785156, + "learning_rate": 1.9708647417118225e-05, + "loss": 0.4553, + "step": 3470 + }, + { + "epoch": 0.07713565887171063, + "grad_norm": 2.218982219696045, + "learning_rate": 1.9707811295269656e-05, + "loss": 0.6495, + "step": 3475 + }, + { + "epoch": 0.07724664543123828, + "grad_norm": 1.3118571043014526, + "learning_rate": 1.9706973993177948e-05, + "loss": 0.507, + "step": 3480 + }, + { + "epoch": 0.07735763199076592, + "grad_norm": 1.707163691520691, + "learning_rate": 1.9706135510944894e-05, + "loss": 0.6421, + "step": 3485 + }, + { + "epoch": 0.07746861855029356, + "grad_norm": 1.357856035232544, + "learning_rate": 1.9705295848672443e-05, + "loss": 0.7232, + "step": 3490 + }, + { + "epoch": 0.0775796051098212, + "grad_norm": 0.8372821807861328, + "learning_rate": 1.9704455006462666e-05, + "loss": 0.6043, + "step": 3495 + }, + { + "epoch": 0.07769059166934884, + "grad_norm": 1.6511869430541992, + "learning_rate": 1.9703612984417797e-05, + "loss": 0.6053, + "step": 3500 + }, + { + "epoch": 0.07780157822887648, + "grad_norm": 1.1607153415679932, + "learning_rate": 1.9702769782640204e-05, + "loss": 0.5912, + "step": 3505 + }, + { + "epoch": 0.07791256478840412, + "grad_norm": 1.3973238468170166, + "learning_rate": 1.9701925401232406e-05, + "loss": 0.5424, + "step": 3510 + }, + { + "epoch": 0.07802355134793176, + "grad_norm": 1.4439544677734375, + "learning_rate": 1.970107984029705e-05, + "loss": 0.7179, + "step": 3515 + }, + { + "epoch": 0.0781345379074594, + "grad_norm": 1.0689724683761597, + "learning_rate": 1.9700233099936944e-05, + "loss": 0.6357, + "step": 3520 + }, + { + "epoch": 0.07824552446698704, + "grad_norm": 1.298864483833313, + "learning_rate": 1.9699385180255027e-05, + "loss": 0.6049, + "step": 3525 + }, + { + "epoch": 0.07835651102651468, + "grad_norm": 0.9402378797531128, + "learning_rate": 1.969853608135439e-05, + "loss": 0.5692, + "step": 3530 + }, + { + "epoch": 0.07846749758604234, + "grad_norm": 1.4564235210418701, + "learning_rate": 1.9697685803338267e-05, + "loss": 0.5447, + "step": 3535 + }, + { + "epoch": 0.07857848414556998, + "grad_norm": 1.2981603145599365, + "learning_rate": 1.9696834346310024e-05, + "loss": 0.5633, + "step": 3540 + }, + { + "epoch": 0.07868947070509762, + "grad_norm": 0.8520748019218445, + "learning_rate": 1.969598171037318e-05, + "loss": 0.5538, + "step": 3545 + }, + { + "epoch": 0.07880045726462526, + "grad_norm": 1.0657398700714111, + "learning_rate": 1.9695127895631403e-05, + "loss": 0.6826, + "step": 3550 + }, + { + "epoch": 0.0789114438241529, + "grad_norm": 1.444388747215271, + "learning_rate": 1.9694272902188486e-05, + "loss": 0.5651, + "step": 3555 + }, + { + "epoch": 0.07902243038368054, + "grad_norm": 1.5944229364395142, + "learning_rate": 1.9693416730148388e-05, + "loss": 0.5611, + "step": 3560 + }, + { + "epoch": 0.07913341694320818, + "grad_norm": 1.327964425086975, + "learning_rate": 1.969255937961519e-05, + "loss": 0.6285, + "step": 3565 + }, + { + "epoch": 0.07924440350273582, + "grad_norm": 1.199369192123413, + "learning_rate": 1.9691700850693126e-05, + "loss": 0.5577, + "step": 3570 + }, + { + "epoch": 0.07935539006226346, + "grad_norm": 1.0459039211273193, + "learning_rate": 1.9690841143486575e-05, + "loss": 0.5764, + "step": 3575 + }, + { + "epoch": 0.0794663766217911, + "grad_norm": 1.2481250762939453, + "learning_rate": 1.9689980258100065e-05, + "loss": 0.6886, + "step": 3580 + }, + { + "epoch": 0.07957736318131874, + "grad_norm": 1.0888584852218628, + "learning_rate": 1.9689118194638248e-05, + "loss": 0.5258, + "step": 3585 + }, + { + "epoch": 0.07968834974084639, + "grad_norm": 1.121347188949585, + "learning_rate": 1.9688254953205935e-05, + "loss": 0.531, + "step": 3590 + }, + { + "epoch": 0.07979933630037403, + "grad_norm": 0.9125674962997437, + "learning_rate": 1.968739053390808e-05, + "loss": 0.5368, + "step": 3595 + }, + { + "epoch": 0.07991032285990167, + "grad_norm": 1.4685077667236328, + "learning_rate": 1.968652493684977e-05, + "loss": 0.6108, + "step": 3600 + }, + { + "epoch": 0.08002130941942931, + "grad_norm": 1.0957640409469604, + "learning_rate": 1.9685658162136247e-05, + "loss": 0.6653, + "step": 3605 + }, + { + "epoch": 0.08013229597895695, + "grad_norm": 1.6297954320907593, + "learning_rate": 1.9684790209872885e-05, + "loss": 0.4481, + "step": 3610 + }, + { + "epoch": 0.08024328253848459, + "grad_norm": 0.9564645886421204, + "learning_rate": 1.9683921080165215e-05, + "loss": 0.523, + "step": 3615 + }, + { + "epoch": 0.08035426909801223, + "grad_norm": 0.9796757102012634, + "learning_rate": 1.9683050773118892e-05, + "loss": 0.7131, + "step": 3620 + }, + { + "epoch": 0.08046525565753987, + "grad_norm": 0.8716163039207458, + "learning_rate": 1.968217928883973e-05, + "loss": 0.463, + "step": 3625 + }, + { + "epoch": 0.08057624221706751, + "grad_norm": 1.0294742584228516, + "learning_rate": 1.9681306627433683e-05, + "loss": 0.6369, + "step": 3630 + }, + { + "epoch": 0.08068722877659515, + "grad_norm": 1.519837498664856, + "learning_rate": 1.9680432789006846e-05, + "loss": 0.5785, + "step": 3635 + }, + { + "epoch": 0.08079821533612279, + "grad_norm": 0.9069929718971252, + "learning_rate": 1.9679557773665457e-05, + "loss": 0.6427, + "step": 3640 + }, + { + "epoch": 0.08090920189565044, + "grad_norm": 1.3364211320877075, + "learning_rate": 1.9678681581515894e-05, + "loss": 0.5294, + "step": 3645 + }, + { + "epoch": 0.08102018845517808, + "grad_norm": 1.2609714269638062, + "learning_rate": 1.9677804212664685e-05, + "loss": 0.5018, + "step": 3650 + }, + { + "epoch": 0.08113117501470572, + "grad_norm": 1.5361000299453735, + "learning_rate": 1.9676925667218498e-05, + "loss": 0.5415, + "step": 3655 + }, + { + "epoch": 0.08124216157423336, + "grad_norm": 1.722002625465393, + "learning_rate": 1.967604594528414e-05, + "loss": 0.5294, + "step": 3660 + }, + { + "epoch": 0.081353148133761, + "grad_norm": 1.276304006576538, + "learning_rate": 1.967516504696857e-05, + "loss": 0.6848, + "step": 3665 + }, + { + "epoch": 0.08146413469328864, + "grad_norm": 1.476733922958374, + "learning_rate": 1.9674282972378878e-05, + "loss": 0.388, + "step": 3670 + }, + { + "epoch": 0.08157512125281628, + "grad_norm": 1.4457347393035889, + "learning_rate": 1.9673399721622307e-05, + "loss": 0.4752, + "step": 3675 + }, + { + "epoch": 0.08168610781234392, + "grad_norm": 1.381885290145874, + "learning_rate": 1.9672515294806242e-05, + "loss": 0.4878, + "step": 3680 + }, + { + "epoch": 0.08179709437187156, + "grad_norm": 1.1364555358886719, + "learning_rate": 1.9671629692038206e-05, + "loss": 0.4585, + "step": 3685 + }, + { + "epoch": 0.0819080809313992, + "grad_norm": 1.1215564012527466, + "learning_rate": 1.967074291342587e-05, + "loss": 0.5158, + "step": 3690 + }, + { + "epoch": 0.08201906749092684, + "grad_norm": 1.0232094526290894, + "learning_rate": 1.9669854959077043e-05, + "loss": 0.6409, + "step": 3695 + }, + { + "epoch": 0.0821300540504545, + "grad_norm": 1.3816514015197754, + "learning_rate": 1.966896582909968e-05, + "loss": 0.5302, + "step": 3700 + }, + { + "epoch": 0.08224104060998214, + "grad_norm": 1.197551965713501, + "learning_rate": 1.9668075523601877e-05, + "loss": 0.5766, + "step": 3705 + }, + { + "epoch": 0.08235202716950978, + "grad_norm": 1.2883515357971191, + "learning_rate": 1.9667184042691877e-05, + "loss": 0.4869, + "step": 3710 + }, + { + "epoch": 0.08246301372903742, + "grad_norm": 1.4811487197875977, + "learning_rate": 1.966629138647806e-05, + "loss": 0.5379, + "step": 3715 + }, + { + "epoch": 0.08257400028856506, + "grad_norm": 1.2199665307998657, + "learning_rate": 1.966539755506896e-05, + "loss": 0.5752, + "step": 3720 + }, + { + "epoch": 0.0826849868480927, + "grad_norm": 1.278082251548767, + "learning_rate": 1.9664502548573234e-05, + "loss": 0.3786, + "step": 3725 + }, + { + "epoch": 0.08279597340762034, + "grad_norm": 1.4458006620407104, + "learning_rate": 1.9663606367099704e-05, + "loss": 0.4773, + "step": 3730 + }, + { + "epoch": 0.08290695996714798, + "grad_norm": 1.4444694519042969, + "learning_rate": 1.966270901075732e-05, + "loss": 0.6588, + "step": 3735 + }, + { + "epoch": 0.08301794652667562, + "grad_norm": 0.9793612360954285, + "learning_rate": 1.9661810479655184e-05, + "loss": 0.5976, + "step": 3740 + }, + { + "epoch": 0.08312893308620325, + "grad_norm": 1.261999249458313, + "learning_rate": 1.9660910773902532e-05, + "loss": 0.6603, + "step": 3745 + }, + { + "epoch": 0.08323991964573091, + "grad_norm": 1.2012444734573364, + "learning_rate": 1.9660009893608744e-05, + "loss": 0.7029, + "step": 3750 + }, + { + "epoch": 0.08335090620525855, + "grad_norm": 1.2707288265228271, + "learning_rate": 1.9659107838883354e-05, + "loss": 0.5535, + "step": 3755 + }, + { + "epoch": 0.08346189276478619, + "grad_norm": 1.252837061882019, + "learning_rate": 1.9658204609836026e-05, + "loss": 0.6405, + "step": 3760 + }, + { + "epoch": 0.08357287932431383, + "grad_norm": 1.2707806825637817, + "learning_rate": 1.9657300206576573e-05, + "loss": 0.5725, + "step": 3765 + }, + { + "epoch": 0.08368386588384147, + "grad_norm": 1.62176513671875, + "learning_rate": 1.965639462921495e-05, + "loss": 0.5944, + "step": 3770 + }, + { + "epoch": 0.08379485244336911, + "grad_norm": 1.136454701423645, + "learning_rate": 1.965548787786125e-05, + "loss": 0.6375, + "step": 3775 + }, + { + "epoch": 0.08390583900289675, + "grad_norm": 1.5717942714691162, + "learning_rate": 1.965457995262572e-05, + "loss": 0.5957, + "step": 3780 + }, + { + "epoch": 0.08401682556242439, + "grad_norm": 0.764004111289978, + "learning_rate": 1.9653670853618733e-05, + "loss": 0.4561, + "step": 3785 + }, + { + "epoch": 0.08412781212195203, + "grad_norm": 2.068720579147339, + "learning_rate": 1.9652760580950825e-05, + "loss": 0.4438, + "step": 3790 + }, + { + "epoch": 0.08423879868147967, + "grad_norm": 0.9934617877006531, + "learning_rate": 1.9651849134732653e-05, + "loss": 0.6083, + "step": 3795 + }, + { + "epoch": 0.08434978524100731, + "grad_norm": 1.0511016845703125, + "learning_rate": 1.965093651507504e-05, + "loss": 0.4666, + "step": 3800 + }, + { + "epoch": 0.08446077180053496, + "grad_norm": 1.0432274341583252, + "learning_rate": 1.9650022722088927e-05, + "loss": 0.5368, + "step": 3805 + }, + { + "epoch": 0.0845717583600626, + "grad_norm": 1.2682653665542603, + "learning_rate": 1.9649107755885416e-05, + "loss": 0.4956, + "step": 3810 + }, + { + "epoch": 0.08468274491959024, + "grad_norm": 0.9422754645347595, + "learning_rate": 1.9648191616575745e-05, + "loss": 0.5083, + "step": 3815 + }, + { + "epoch": 0.08479373147911788, + "grad_norm": 1.4312782287597656, + "learning_rate": 1.9647274304271297e-05, + "loss": 0.4671, + "step": 3820 + }, + { + "epoch": 0.08490471803864552, + "grad_norm": 0.8657413125038147, + "learning_rate": 1.964635581908359e-05, + "loss": 0.4275, + "step": 3825 + }, + { + "epoch": 0.08501570459817316, + "grad_norm": 1.6890164613723755, + "learning_rate": 1.9645436161124293e-05, + "loss": 0.6362, + "step": 3830 + }, + { + "epoch": 0.0851266911577008, + "grad_norm": 0.9134665131568909, + "learning_rate": 1.9644515330505218e-05, + "loss": 0.6467, + "step": 3835 + }, + { + "epoch": 0.08523767771722844, + "grad_norm": 1.2863361835479736, + "learning_rate": 1.9643593327338312e-05, + "loss": 0.4728, + "step": 3840 + }, + { + "epoch": 0.08534866427675608, + "grad_norm": 1.3430936336517334, + "learning_rate": 1.9642670151735675e-05, + "loss": 0.7618, + "step": 3845 + }, + { + "epoch": 0.08545965083628372, + "grad_norm": 1.1958832740783691, + "learning_rate": 1.964174580380954e-05, + "loss": 0.612, + "step": 3850 + }, + { + "epoch": 0.08557063739581136, + "grad_norm": 1.1422450542449951, + "learning_rate": 1.9640820283672284e-05, + "loss": 0.5366, + "step": 3855 + }, + { + "epoch": 0.08568162395533901, + "grad_norm": 1.3729885816574097, + "learning_rate": 1.963989359143643e-05, + "loss": 0.6063, + "step": 3860 + }, + { + "epoch": 0.08579261051486665, + "grad_norm": 0.8676313161849976, + "learning_rate": 1.9638965727214645e-05, + "loss": 0.4358, + "step": 3865 + }, + { + "epoch": 0.0859035970743943, + "grad_norm": 1.2360765933990479, + "learning_rate": 1.9638036691119734e-05, + "loss": 0.4915, + "step": 3870 + }, + { + "epoch": 0.08601458363392193, + "grad_norm": 0.9501989483833313, + "learning_rate": 1.963710648326464e-05, + "loss": 0.5179, + "step": 3875 + }, + { + "epoch": 0.08612557019344957, + "grad_norm": 1.4758225679397583, + "learning_rate": 1.9636175103762466e-05, + "loss": 0.5411, + "step": 3880 + }, + { + "epoch": 0.08623655675297721, + "grad_norm": 1.0443713665008545, + "learning_rate": 1.963524255272644e-05, + "loss": 0.487, + "step": 3885 + }, + { + "epoch": 0.08634754331250485, + "grad_norm": 1.2120336294174194, + "learning_rate": 1.9634308830269936e-05, + "loss": 0.6933, + "step": 3890 + }, + { + "epoch": 0.0864585298720325, + "grad_norm": 1.120863676071167, + "learning_rate": 1.9633373936506478e-05, + "loss": 0.5542, + "step": 3895 + }, + { + "epoch": 0.08656951643156013, + "grad_norm": 1.210645079612732, + "learning_rate": 1.963243787154972e-05, + "loss": 0.612, + "step": 3900 + }, + { + "epoch": 0.08668050299108777, + "grad_norm": 1.8348194360733032, + "learning_rate": 1.9631500635513475e-05, + "loss": 0.4468, + "step": 3905 + }, + { + "epoch": 0.08679148955061541, + "grad_norm": 1.100720763206482, + "learning_rate": 1.9630562228511682e-05, + "loss": 0.5213, + "step": 3910 + }, + { + "epoch": 0.08690247611014307, + "grad_norm": 1.0100977420806885, + "learning_rate": 1.9629622650658434e-05, + "loss": 0.539, + "step": 3915 + }, + { + "epoch": 0.08701346266967071, + "grad_norm": 1.1190155744552612, + "learning_rate": 1.9628681902067956e-05, + "loss": 0.4504, + "step": 3920 + }, + { + "epoch": 0.08712444922919835, + "grad_norm": 0.9896363019943237, + "learning_rate": 1.9627739982854625e-05, + "loss": 0.5673, + "step": 3925 + }, + { + "epoch": 0.08723543578872599, + "grad_norm": 0.9710890054702759, + "learning_rate": 1.9626796893132955e-05, + "loss": 0.4689, + "step": 3930 + }, + { + "epoch": 0.08734642234825363, + "grad_norm": 1.296876311302185, + "learning_rate": 1.9625852633017608e-05, + "loss": 0.6207, + "step": 3935 + }, + { + "epoch": 0.08745740890778127, + "grad_norm": 1.5721899271011353, + "learning_rate": 1.9624907202623378e-05, + "loss": 0.6262, + "step": 3940 + }, + { + "epoch": 0.08756839546730891, + "grad_norm": 1.4297751188278198, + "learning_rate": 1.9623960602065213e-05, + "loss": 0.4816, + "step": 3945 + }, + { + "epoch": 0.08767938202683655, + "grad_norm": 1.1989187002182007, + "learning_rate": 1.962301283145819e-05, + "loss": 0.6097, + "step": 3950 + }, + { + "epoch": 0.08779036858636419, + "grad_norm": 1.1087952852249146, + "learning_rate": 1.9622063890917543e-05, + "loss": 0.5414, + "step": 3955 + }, + { + "epoch": 0.08790135514589183, + "grad_norm": 1.1103991270065308, + "learning_rate": 1.9621113780558635e-05, + "loss": 0.6886, + "step": 3960 + }, + { + "epoch": 0.08801234170541947, + "grad_norm": 1.356634259223938, + "learning_rate": 1.962016250049698e-05, + "loss": 0.5059, + "step": 3965 + }, + { + "epoch": 0.08812332826494712, + "grad_norm": 1.3991785049438477, + "learning_rate": 1.9619210050848236e-05, + "loss": 0.5072, + "step": 3970 + }, + { + "epoch": 0.08823431482447476, + "grad_norm": 1.5779005289077759, + "learning_rate": 1.961825643172819e-05, + "loss": 0.7137, + "step": 3975 + }, + { + "epoch": 0.0883453013840024, + "grad_norm": 1.3491464853286743, + "learning_rate": 1.9617301643252787e-05, + "loss": 0.8226, + "step": 3980 + }, + { + "epoch": 0.08845628794353004, + "grad_norm": 1.3575505018234253, + "learning_rate": 1.9616345685538107e-05, + "loss": 0.5308, + "step": 3985 + }, + { + "epoch": 0.08856727450305768, + "grad_norm": 1.6040680408477783, + "learning_rate": 1.9615388558700363e-05, + "loss": 0.414, + "step": 3990 + }, + { + "epoch": 0.08867826106258532, + "grad_norm": 1.4553582668304443, + "learning_rate": 1.961443026285593e-05, + "loss": 0.7059, + "step": 3995 + }, + { + "epoch": 0.08878924762211296, + "grad_norm": 1.59603750705719, + "learning_rate": 1.9613470798121307e-05, + "loss": 0.6761, + "step": 4000 + }, + { + "epoch": 0.0889002341816406, + "grad_norm": 1.192799687385559, + "learning_rate": 1.9612510164613145e-05, + "loss": 0.5735, + "step": 4005 + }, + { + "epoch": 0.08901122074116824, + "grad_norm": 1.0729697942733765, + "learning_rate": 1.961154836244824e-05, + "loss": 0.6648, + "step": 4010 + }, + { + "epoch": 0.08912220730069588, + "grad_norm": 1.526694893836975, + "learning_rate": 1.9610585391743516e-05, + "loss": 0.6473, + "step": 4015 + }, + { + "epoch": 0.08923319386022353, + "grad_norm": 1.1947791576385498, + "learning_rate": 1.960962125261605e-05, + "loss": 0.6093, + "step": 4020 + }, + { + "epoch": 0.08934418041975117, + "grad_norm": 1.1378854513168335, + "learning_rate": 1.960865594518306e-05, + "loss": 0.6483, + "step": 4025 + }, + { + "epoch": 0.08945516697927881, + "grad_norm": 1.3792065382003784, + "learning_rate": 1.9607689469561908e-05, + "loss": 0.8147, + "step": 4030 + }, + { + "epoch": 0.08956615353880645, + "grad_norm": 1.592315673828125, + "learning_rate": 1.960672182587009e-05, + "loss": 0.6142, + "step": 4035 + }, + { + "epoch": 0.0896771400983341, + "grad_norm": 1.6274422407150269, + "learning_rate": 1.9605753014225254e-05, + "loss": 0.8193, + "step": 4040 + }, + { + "epoch": 0.08978812665786173, + "grad_norm": 1.5865644216537476, + "learning_rate": 1.9604783034745182e-05, + "loss": 0.6778, + "step": 4045 + }, + { + "epoch": 0.08989911321738937, + "grad_norm": 1.2761083841323853, + "learning_rate": 1.9603811887547797e-05, + "loss": 0.6685, + "step": 4050 + }, + { + "epoch": 0.09001009977691701, + "grad_norm": 1.147282600402832, + "learning_rate": 1.960283957275117e-05, + "loss": 0.5395, + "step": 4055 + }, + { + "epoch": 0.09012108633644465, + "grad_norm": 1.1976675987243652, + "learning_rate": 1.9601866090473517e-05, + "loss": 0.5897, + "step": 4060 + }, + { + "epoch": 0.09023207289597229, + "grad_norm": 1.3472319841384888, + "learning_rate": 1.9600891440833187e-05, + "loss": 0.5855, + "step": 4065 + }, + { + "epoch": 0.09034305945549993, + "grad_norm": 0.9857687950134277, + "learning_rate": 1.9599915623948673e-05, + "loss": 0.6031, + "step": 4070 + }, + { + "epoch": 0.09045404601502759, + "grad_norm": 1.6844218969345093, + "learning_rate": 1.9598938639938613e-05, + "loss": 0.6718, + "step": 4075 + }, + { + "epoch": 0.09056503257455523, + "grad_norm": 0.9948323965072632, + "learning_rate": 1.9597960488921785e-05, + "loss": 0.4619, + "step": 4080 + }, + { + "epoch": 0.09067601913408287, + "grad_norm": 0.9627229571342468, + "learning_rate": 1.959698117101711e-05, + "loss": 0.43, + "step": 4085 + }, + { + "epoch": 0.0907870056936105, + "grad_norm": 0.9202539324760437, + "learning_rate": 1.959600068634365e-05, + "loss": 0.5921, + "step": 4090 + }, + { + "epoch": 0.09089799225313815, + "grad_norm": 1.9609408378601074, + "learning_rate": 1.959501903502061e-05, + "loss": 0.6661, + "step": 4095 + }, + { + "epoch": 0.09100897881266579, + "grad_norm": 1.3210110664367676, + "learning_rate": 1.9594036217167336e-05, + "loss": 0.5077, + "step": 4100 + }, + { + "epoch": 0.09111996537219343, + "grad_norm": 1.2981141805648804, + "learning_rate": 1.9593052232903312e-05, + "loss": 0.5604, + "step": 4105 + }, + { + "epoch": 0.09123095193172107, + "grad_norm": 0.9609360098838806, + "learning_rate": 1.9592067082348172e-05, + "loss": 0.5489, + "step": 4110 + }, + { + "epoch": 0.0913419384912487, + "grad_norm": 1.3565384149551392, + "learning_rate": 1.9591080765621685e-05, + "loss": 0.6582, + "step": 4115 + }, + { + "epoch": 0.09145292505077635, + "grad_norm": 1.7355798482894897, + "learning_rate": 1.9590093282843764e-05, + "loss": 0.5174, + "step": 4120 + }, + { + "epoch": 0.09156391161030399, + "grad_norm": 1.0244948863983154, + "learning_rate": 1.9589104634134465e-05, + "loss": 0.5201, + "step": 4125 + }, + { + "epoch": 0.09167489816983164, + "grad_norm": 1.2683207988739014, + "learning_rate": 1.9588114819613984e-05, + "loss": 0.6843, + "step": 4130 + }, + { + "epoch": 0.09178588472935928, + "grad_norm": 1.62088942527771, + "learning_rate": 1.9587123839402658e-05, + "loss": 0.7301, + "step": 4135 + }, + { + "epoch": 0.09189687128888692, + "grad_norm": 1.758149266242981, + "learning_rate": 1.9586131693620973e-05, + "loss": 0.5188, + "step": 4140 + }, + { + "epoch": 0.09200785784841456, + "grad_norm": 1.5698381662368774, + "learning_rate": 1.9585138382389538e-05, + "loss": 0.5827, + "step": 4145 + }, + { + "epoch": 0.0921188444079422, + "grad_norm": 1.6577712297439575, + "learning_rate": 1.9584143905829128e-05, + "loss": 0.6137, + "step": 4150 + }, + { + "epoch": 0.09222983096746984, + "grad_norm": 1.1802431344985962, + "learning_rate": 1.9583148264060648e-05, + "loss": 0.6676, + "step": 4155 + }, + { + "epoch": 0.09234081752699748, + "grad_norm": 1.1364336013793945, + "learning_rate": 1.9582151457205135e-05, + "loss": 0.4683, + "step": 4160 + }, + { + "epoch": 0.09245180408652512, + "grad_norm": 0.9396209716796875, + "learning_rate": 1.958115348538379e-05, + "loss": 0.5817, + "step": 4165 + }, + { + "epoch": 0.09256279064605276, + "grad_norm": 1.372672438621521, + "learning_rate": 1.9580154348717935e-05, + "loss": 0.567, + "step": 4170 + }, + { + "epoch": 0.0926737772055804, + "grad_norm": 1.1881585121154785, + "learning_rate": 1.9579154047329045e-05, + "loss": 0.5306, + "step": 4175 + }, + { + "epoch": 0.09278476376510804, + "grad_norm": 1.2390387058258057, + "learning_rate": 1.9578152581338726e-05, + "loss": 0.5603, + "step": 4180 + }, + { + "epoch": 0.09289575032463569, + "grad_norm": 1.2957234382629395, + "learning_rate": 1.9577149950868744e-05, + "loss": 0.4922, + "step": 4185 + }, + { + "epoch": 0.09300673688416333, + "grad_norm": 1.6602461338043213, + "learning_rate": 1.957614615604099e-05, + "loss": 0.5974, + "step": 4190 + }, + { + "epoch": 0.09311772344369097, + "grad_norm": 1.2627344131469727, + "learning_rate": 1.9575141196977502e-05, + "loss": 0.6923, + "step": 4195 + }, + { + "epoch": 0.09322871000321861, + "grad_norm": 1.0888328552246094, + "learning_rate": 1.957413507380046e-05, + "loss": 0.614, + "step": 4200 + }, + { + "epoch": 0.09333969656274625, + "grad_norm": 1.0514628887176514, + "learning_rate": 1.9573127786632184e-05, + "loss": 0.5061, + "step": 4205 + }, + { + "epoch": 0.09345068312227389, + "grad_norm": 1.1055153608322144, + "learning_rate": 1.9572119335595135e-05, + "loss": 0.4432, + "step": 4210 + }, + { + "epoch": 0.09356166968180153, + "grad_norm": 1.2091655731201172, + "learning_rate": 1.9571109720811924e-05, + "loss": 0.6272, + "step": 4215 + }, + { + "epoch": 0.09367265624132917, + "grad_norm": 1.4013484716415405, + "learning_rate": 1.957009894240529e-05, + "loss": 0.5315, + "step": 4220 + }, + { + "epoch": 0.09378364280085681, + "grad_norm": 1.4002299308776855, + "learning_rate": 1.9569087000498123e-05, + "loss": 0.5467, + "step": 4225 + }, + { + "epoch": 0.09389462936038445, + "grad_norm": 1.728796124458313, + "learning_rate": 1.956807389521345e-05, + "loss": 0.6269, + "step": 4230 + }, + { + "epoch": 0.0940056159199121, + "grad_norm": 1.7954356670379639, + "learning_rate": 1.956705962667444e-05, + "loss": 0.4745, + "step": 4235 + }, + { + "epoch": 0.09411660247943975, + "grad_norm": 0.8677371740341187, + "learning_rate": 1.956604419500441e-05, + "loss": 0.6557, + "step": 4240 + }, + { + "epoch": 0.09422758903896739, + "grad_norm": 1.9286059141159058, + "learning_rate": 1.9565027600326806e-05, + "loss": 0.5488, + "step": 4245 + }, + { + "epoch": 0.09433857559849503, + "grad_norm": 1.1704398393630981, + "learning_rate": 1.9564009842765225e-05, + "loss": 0.4972, + "step": 4250 + }, + { + "epoch": 0.09444956215802267, + "grad_norm": 1.3009753227233887, + "learning_rate": 1.9562990922443404e-05, + "loss": 0.5304, + "step": 4255 + }, + { + "epoch": 0.0945605487175503, + "grad_norm": 0.9224164485931396, + "learning_rate": 1.9561970839485218e-05, + "loss": 0.5462, + "step": 4260 + }, + { + "epoch": 0.09467153527707795, + "grad_norm": 1.1949145793914795, + "learning_rate": 1.9560949594014684e-05, + "loss": 0.5034, + "step": 4265 + }, + { + "epoch": 0.09478252183660558, + "grad_norm": 1.560428261756897, + "learning_rate": 1.9559927186155967e-05, + "loss": 0.6407, + "step": 4270 + }, + { + "epoch": 0.09489350839613322, + "grad_norm": 1.1759483814239502, + "learning_rate": 1.955890361603336e-05, + "loss": 0.5272, + "step": 4275 + }, + { + "epoch": 0.09500449495566086, + "grad_norm": 1.8360085487365723, + "learning_rate": 1.9557878883771312e-05, + "loss": 0.5146, + "step": 4280 + }, + { + "epoch": 0.0951154815151885, + "grad_norm": 1.4351441860198975, + "learning_rate": 1.9556852989494403e-05, + "loss": 0.5396, + "step": 4285 + }, + { + "epoch": 0.09522646807471616, + "grad_norm": 1.1562596559524536, + "learning_rate": 1.9555825933327358e-05, + "loss": 0.4966, + "step": 4290 + }, + { + "epoch": 0.0953374546342438, + "grad_norm": 1.140037178993225, + "learning_rate": 1.9554797715395045e-05, + "loss": 0.5588, + "step": 4295 + }, + { + "epoch": 0.09544844119377144, + "grad_norm": 1.1700094938278198, + "learning_rate": 1.955376833582247e-05, + "loss": 0.4458, + "step": 4300 + }, + { + "epoch": 0.09555942775329908, + "grad_norm": 1.3324131965637207, + "learning_rate": 1.955273779473478e-05, + "loss": 0.5194, + "step": 4305 + }, + { + "epoch": 0.09567041431282672, + "grad_norm": 1.4174566268920898, + "learning_rate": 1.9551706092257268e-05, + "loss": 0.5176, + "step": 4310 + }, + { + "epoch": 0.09578140087235436, + "grad_norm": 1.7331970930099487, + "learning_rate": 1.9550673228515366e-05, + "loss": 0.5411, + "step": 4315 + }, + { + "epoch": 0.095892387431882, + "grad_norm": 1.608151912689209, + "learning_rate": 1.954963920363464e-05, + "loss": 0.5237, + "step": 4320 + }, + { + "epoch": 0.09600337399140964, + "grad_norm": 1.16243314743042, + "learning_rate": 1.9548604017740806e-05, + "loss": 0.6231, + "step": 4325 + }, + { + "epoch": 0.09611436055093728, + "grad_norm": 1.1721985340118408, + "learning_rate": 1.954756767095972e-05, + "loss": 0.6126, + "step": 4330 + }, + { + "epoch": 0.09622534711046492, + "grad_norm": 1.2511540651321411, + "learning_rate": 1.9546530163417374e-05, + "loss": 0.6161, + "step": 4335 + }, + { + "epoch": 0.09633633366999256, + "grad_norm": 1.3616172075271606, + "learning_rate": 1.9545491495239913e-05, + "loss": 0.6517, + "step": 4340 + }, + { + "epoch": 0.09644732022952021, + "grad_norm": 1.3992600440979004, + "learning_rate": 1.9544451666553603e-05, + "loss": 0.5141, + "step": 4345 + }, + { + "epoch": 0.09655830678904785, + "grad_norm": 1.0629327297210693, + "learning_rate": 1.9543410677484873e-05, + "loss": 0.6008, + "step": 4350 + }, + { + "epoch": 0.09666929334857549, + "grad_norm": 1.3158684968948364, + "learning_rate": 1.9542368528160276e-05, + "loss": 0.5928, + "step": 4355 + }, + { + "epoch": 0.09678027990810313, + "grad_norm": 1.9079759120941162, + "learning_rate": 1.954132521870652e-05, + "loss": 0.4011, + "step": 4360 + }, + { + "epoch": 0.09689126646763077, + "grad_norm": 1.2915329933166504, + "learning_rate": 1.954028074925044e-05, + "loss": 0.4968, + "step": 4365 + }, + { + "epoch": 0.09700225302715841, + "grad_norm": 0.9710379838943481, + "learning_rate": 1.9539235119919025e-05, + "loss": 0.4912, + "step": 4370 + }, + { + "epoch": 0.09711323958668605, + "grad_norm": 1.1526905298233032, + "learning_rate": 1.9538188330839393e-05, + "loss": 0.4315, + "step": 4375 + }, + { + "epoch": 0.09722422614621369, + "grad_norm": 1.2658145427703857, + "learning_rate": 1.953714038213881e-05, + "loss": 0.664, + "step": 4380 + }, + { + "epoch": 0.09733521270574133, + "grad_norm": 1.0740182399749756, + "learning_rate": 1.953609127394469e-05, + "loss": 0.4432, + "step": 4385 + }, + { + "epoch": 0.09744619926526897, + "grad_norm": 1.336108922958374, + "learning_rate": 1.9535041006384572e-05, + "loss": 0.5297, + "step": 4390 + }, + { + "epoch": 0.09755718582479661, + "grad_norm": 1.3726085424423218, + "learning_rate": 1.953398957958615e-05, + "loss": 0.6767, + "step": 4395 + }, + { + "epoch": 0.09766817238432426, + "grad_norm": 1.1402976512908936, + "learning_rate": 1.9532936993677243e-05, + "loss": 0.5696, + "step": 4400 + }, + { + "epoch": 0.0977791589438519, + "grad_norm": 1.2232763767242432, + "learning_rate": 1.9531883248785833e-05, + "loss": 0.672, + "step": 4405 + }, + { + "epoch": 0.09789014550337954, + "grad_norm": 1.2305266857147217, + "learning_rate": 1.9530828345040022e-05, + "loss": 0.5717, + "step": 4410 + }, + { + "epoch": 0.09800113206290718, + "grad_norm": 0.9083310961723328, + "learning_rate": 1.9529772282568064e-05, + "loss": 0.6324, + "step": 4415 + }, + { + "epoch": 0.09811211862243482, + "grad_norm": 0.9469552636146545, + "learning_rate": 1.9528715061498355e-05, + "loss": 0.4787, + "step": 4420 + }, + { + "epoch": 0.09822310518196246, + "grad_norm": 1.191145896911621, + "learning_rate": 1.9527656681959425e-05, + "loss": 0.5145, + "step": 4425 + }, + { + "epoch": 0.0983340917414901, + "grad_norm": 1.0690206289291382, + "learning_rate": 1.9526597144079947e-05, + "loss": 0.3813, + "step": 4430 + }, + { + "epoch": 0.09844507830101774, + "grad_norm": 0.9900261163711548, + "learning_rate": 1.952553644798874e-05, + "loss": 0.5721, + "step": 4435 + }, + { + "epoch": 0.09855606486054538, + "grad_norm": 1.2516738176345825, + "learning_rate": 1.9524474593814756e-05, + "loss": 0.5769, + "step": 4440 + }, + { + "epoch": 0.09866705142007302, + "grad_norm": 1.4266631603240967, + "learning_rate": 1.9523411581687097e-05, + "loss": 0.4992, + "step": 4445 + }, + { + "epoch": 0.09877803797960066, + "grad_norm": 1.3769086599349976, + "learning_rate": 1.952234741173499e-05, + "loss": 0.5517, + "step": 4450 + }, + { + "epoch": 0.09888902453912832, + "grad_norm": 1.4339829683303833, + "learning_rate": 1.9521282084087823e-05, + "loss": 0.605, + "step": 4455 + }, + { + "epoch": 0.09900001109865596, + "grad_norm": 1.2619751691818237, + "learning_rate": 1.9520215598875117e-05, + "loss": 0.7132, + "step": 4460 + }, + { + "epoch": 0.0991109976581836, + "grad_norm": 1.1638000011444092, + "learning_rate": 1.9519147956226522e-05, + "loss": 0.4482, + "step": 4465 + }, + { + "epoch": 0.09922198421771124, + "grad_norm": 1.5981131792068481, + "learning_rate": 1.9518079156271844e-05, + "loss": 0.6933, + "step": 4470 + }, + { + "epoch": 0.09933297077723888, + "grad_norm": 1.3889281749725342, + "learning_rate": 1.9517009199141025e-05, + "loss": 0.553, + "step": 4475 + }, + { + "epoch": 0.09944395733676652, + "grad_norm": 0.9670382738113403, + "learning_rate": 1.9515938084964145e-05, + "loss": 0.5892, + "step": 4480 + }, + { + "epoch": 0.09955494389629416, + "grad_norm": 1.1507785320281982, + "learning_rate": 1.9514865813871427e-05, + "loss": 0.4446, + "step": 4485 + }, + { + "epoch": 0.0996659304558218, + "grad_norm": 1.9023184776306152, + "learning_rate": 1.9513792385993233e-05, + "loss": 0.5241, + "step": 4490 + }, + { + "epoch": 0.09977691701534944, + "grad_norm": 1.3594270944595337, + "learning_rate": 1.9512717801460064e-05, + "loss": 0.5864, + "step": 4495 + }, + { + "epoch": 0.09988790357487708, + "grad_norm": 1.3646306991577148, + "learning_rate": 1.9511642060402573e-05, + "loss": 0.6054, + "step": 4500 + }, + { + "epoch": 0.09999889013440473, + "grad_norm": 1.2112932205200195, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.6177, + "step": 4505 + }, + { + "epoch": 0.10010987669393237, + "grad_norm": 1.1686182022094727, + "learning_rate": 1.9509487109237887e-05, + "loss": 0.4071, + "step": 4510 + }, + { + "epoch": 0.10022086325346001, + "grad_norm": 1.53573739528656, + "learning_rate": 1.9508407899392682e-05, + "loss": 0.6475, + "step": 4515 + }, + { + "epoch": 0.10033184981298765, + "grad_norm": 1.5155771970748901, + "learning_rate": 1.9507327533547137e-05, + "loss": 0.6208, + "step": 4520 + }, + { + "epoch": 0.10044283637251529, + "grad_norm": 1.6997084617614746, + "learning_rate": 1.9506246011832595e-05, + "loss": 0.5137, + "step": 4525 + }, + { + "epoch": 0.10055382293204293, + "grad_norm": 1.683323621749878, + "learning_rate": 1.9505163334380547e-05, + "loss": 0.5684, + "step": 4530 + }, + { + "epoch": 0.10066480949157057, + "grad_norm": 1.1800998449325562, + "learning_rate": 1.9504079501322614e-05, + "loss": 0.4743, + "step": 4535 + }, + { + "epoch": 0.10077579605109821, + "grad_norm": 3.659878969192505, + "learning_rate": 1.9502994512790568e-05, + "loss": 0.51, + "step": 4540 + }, + { + "epoch": 0.10088678261062585, + "grad_norm": 3.420764446258545, + "learning_rate": 1.9501908368916326e-05, + "loss": 0.5603, + "step": 4545 + }, + { + "epoch": 0.10099776917015349, + "grad_norm": 1.8362491130828857, + "learning_rate": 1.950082106983193e-05, + "loss": 0.455, + "step": 4550 + }, + { + "epoch": 0.10110875572968113, + "grad_norm": 1.11005699634552, + "learning_rate": 1.9499732615669567e-05, + "loss": 0.3984, + "step": 4555 + }, + { + "epoch": 0.10121974228920878, + "grad_norm": 1.2431269884109497, + "learning_rate": 1.9498643006561576e-05, + "loss": 0.5654, + "step": 4560 + }, + { + "epoch": 0.10133072884873642, + "grad_norm": 1.6340370178222656, + "learning_rate": 1.9497552242640424e-05, + "loss": 0.4444, + "step": 4565 + }, + { + "epoch": 0.10144171540826406, + "grad_norm": 0.8741961121559143, + "learning_rate": 1.949646032403872e-05, + "loss": 0.6445, + "step": 4570 + }, + { + "epoch": 0.1015527019677917, + "grad_norm": 1.056175708770752, + "learning_rate": 1.949536725088922e-05, + "loss": 0.7095, + "step": 4575 + }, + { + "epoch": 0.10166368852731934, + "grad_norm": 1.3246148824691772, + "learning_rate": 1.9494273023324814e-05, + "loss": 0.4938, + "step": 4580 + }, + { + "epoch": 0.10177467508684698, + "grad_norm": 1.1658315658569336, + "learning_rate": 1.9493177641478538e-05, + "loss": 0.6678, + "step": 4585 + }, + { + "epoch": 0.10188566164637462, + "grad_norm": 1.5370222330093384, + "learning_rate": 1.949208110548356e-05, + "loss": 0.5387, + "step": 4590 + }, + { + "epoch": 0.10199664820590226, + "grad_norm": 1.4635233879089355, + "learning_rate": 1.9490983415473198e-05, + "loss": 0.5722, + "step": 4595 + }, + { + "epoch": 0.1021076347654299, + "grad_norm": 1.1367441415786743, + "learning_rate": 1.9489884571580896e-05, + "loss": 0.643, + "step": 4600 + }, + { + "epoch": 0.10221862132495754, + "grad_norm": 1.5496591329574585, + "learning_rate": 1.948878457394026e-05, + "loss": 0.5476, + "step": 4605 + }, + { + "epoch": 0.10232960788448518, + "grad_norm": 0.7282701134681702, + "learning_rate": 1.9487683422685016e-05, + "loss": 0.4071, + "step": 4610 + }, + { + "epoch": 0.10244059444401284, + "grad_norm": 1.3425761461257935, + "learning_rate": 1.9486581117949042e-05, + "loss": 0.5461, + "step": 4615 + }, + { + "epoch": 0.10255158100354048, + "grad_norm": 1.3550171852111816, + "learning_rate": 1.9485477659866352e-05, + "loss": 0.5746, + "step": 4620 + }, + { + "epoch": 0.10266256756306812, + "grad_norm": 1.4267661571502686, + "learning_rate": 1.9484373048571097e-05, + "loss": 0.728, + "step": 4625 + }, + { + "epoch": 0.10277355412259576, + "grad_norm": 0.9552726745605469, + "learning_rate": 1.948326728419758e-05, + "loss": 0.6333, + "step": 4630 + }, + { + "epoch": 0.1028845406821234, + "grad_norm": 0.883658766746521, + "learning_rate": 1.9482160366880224e-05, + "loss": 0.5297, + "step": 4635 + }, + { + "epoch": 0.10299552724165104, + "grad_norm": 1.2870208024978638, + "learning_rate": 1.9481052296753617e-05, + "loss": 0.5575, + "step": 4640 + }, + { + "epoch": 0.10310651380117868, + "grad_norm": 1.1301369667053223, + "learning_rate": 1.9479943073952462e-05, + "loss": 0.5955, + "step": 4645 + }, + { + "epoch": 0.10321750036070632, + "grad_norm": 1.2876123189926147, + "learning_rate": 1.947883269861163e-05, + "loss": 0.6565, + "step": 4650 + }, + { + "epoch": 0.10332848692023396, + "grad_norm": 1.3741137981414795, + "learning_rate": 1.94777211708661e-05, + "loss": 0.4552, + "step": 4655 + }, + { + "epoch": 0.1034394734797616, + "grad_norm": 1.6406534910202026, + "learning_rate": 1.9476608490851023e-05, + "loss": 0.6859, + "step": 4660 + }, + { + "epoch": 0.10355046003928924, + "grad_norm": 1.4455381631851196, + "learning_rate": 1.9475494658701663e-05, + "loss": 0.5057, + "step": 4665 + }, + { + "epoch": 0.10366144659881689, + "grad_norm": 1.0080723762512207, + "learning_rate": 1.9474379674553445e-05, + "loss": 0.5527, + "step": 4670 + }, + { + "epoch": 0.10377243315834453, + "grad_norm": 1.3177790641784668, + "learning_rate": 1.9473263538541916e-05, + "loss": 0.7814, + "step": 4675 + }, + { + "epoch": 0.10388341971787217, + "grad_norm": 0.9700146317481995, + "learning_rate": 1.9472146250802778e-05, + "loss": 0.5032, + "step": 4680 + }, + { + "epoch": 0.10399440627739981, + "grad_norm": 3.535909414291382, + "learning_rate": 1.947102781147187e-05, + "loss": 0.4977, + "step": 4685 + }, + { + "epoch": 0.10410539283692745, + "grad_norm": 1.0130237340927124, + "learning_rate": 1.9469908220685158e-05, + "loss": 0.4246, + "step": 4690 + }, + { + "epoch": 0.10421637939645509, + "grad_norm": 1.2062435150146484, + "learning_rate": 1.9468787478578765e-05, + "loss": 0.5761, + "step": 4695 + }, + { + "epoch": 0.10432736595598273, + "grad_norm": 1.0337311029434204, + "learning_rate": 1.946766558528895e-05, + "loss": 0.5987, + "step": 4700 + }, + { + "epoch": 0.10443835251551037, + "grad_norm": 1.33479642868042, + "learning_rate": 1.9466542540952105e-05, + "loss": 0.426, + "step": 4705 + }, + { + "epoch": 0.10454933907503801, + "grad_norm": 0.9896667003631592, + "learning_rate": 1.9465418345704762e-05, + "loss": 0.5159, + "step": 4710 + }, + { + "epoch": 0.10466032563456565, + "grad_norm": 1.5902118682861328, + "learning_rate": 1.9464292999683603e-05, + "loss": 0.6882, + "step": 4715 + }, + { + "epoch": 0.10477131219409329, + "grad_norm": 1.0780678987503052, + "learning_rate": 1.9463166503025444e-05, + "loss": 0.5413, + "step": 4720 + }, + { + "epoch": 0.10488229875362094, + "grad_norm": 1.0648760795593262, + "learning_rate": 1.9462038855867238e-05, + "loss": 0.5374, + "step": 4725 + }, + { + "epoch": 0.10499328531314858, + "grad_norm": 1.0896109342575073, + "learning_rate": 1.9460910058346082e-05, + "loss": 0.5873, + "step": 4730 + }, + { + "epoch": 0.10510427187267622, + "grad_norm": 1.179976224899292, + "learning_rate": 1.9459780110599204e-05, + "loss": 0.5956, + "step": 4735 + }, + { + "epoch": 0.10521525843220386, + "grad_norm": 1.718687891960144, + "learning_rate": 1.945864901276399e-05, + "loss": 0.6551, + "step": 4740 + }, + { + "epoch": 0.1053262449917315, + "grad_norm": 1.0115402936935425, + "learning_rate": 1.9457516764977954e-05, + "loss": 0.5906, + "step": 4745 + }, + { + "epoch": 0.10543723155125914, + "grad_norm": 1.2545442581176758, + "learning_rate": 1.9456383367378744e-05, + "loss": 0.5814, + "step": 4750 + }, + { + "epoch": 0.10554821811078678, + "grad_norm": 0.939911425113678, + "learning_rate": 1.945524882010416e-05, + "loss": 0.5721, + "step": 4755 + }, + { + "epoch": 0.10565920467031442, + "grad_norm": 1.1681970357894897, + "learning_rate": 1.9454113123292133e-05, + "loss": 0.6557, + "step": 4760 + }, + { + "epoch": 0.10577019122984206, + "grad_norm": 1.1056734323501587, + "learning_rate": 1.9452976277080743e-05, + "loss": 0.622, + "step": 4765 + }, + { + "epoch": 0.1058811777893697, + "grad_norm": 1.0342731475830078, + "learning_rate": 1.94518382816082e-05, + "loss": 0.5997, + "step": 4770 + }, + { + "epoch": 0.10599216434889736, + "grad_norm": 1.1820440292358398, + "learning_rate": 1.9450699137012852e-05, + "loss": 0.5982, + "step": 4775 + }, + { + "epoch": 0.106103150908425, + "grad_norm": 1.011205792427063, + "learning_rate": 1.9449558843433202e-05, + "loss": 0.4623, + "step": 4780 + }, + { + "epoch": 0.10621413746795264, + "grad_norm": 1.4157917499542236, + "learning_rate": 1.944841740100788e-05, + "loss": 0.5038, + "step": 4785 + }, + { + "epoch": 0.10632512402748028, + "grad_norm": 1.3615347146987915, + "learning_rate": 1.9447274809875653e-05, + "loss": 0.6751, + "step": 4790 + }, + { + "epoch": 0.10643611058700791, + "grad_norm": 1.102033257484436, + "learning_rate": 1.944613107017544e-05, + "loss": 0.5384, + "step": 4795 + }, + { + "epoch": 0.10654709714653555, + "grad_norm": 1.1616215705871582, + "learning_rate": 1.9444986182046296e-05, + "loss": 0.642, + "step": 4800 + }, + { + "epoch": 0.1066580837060632, + "grad_norm": 1.0611157417297363, + "learning_rate": 1.9443840145627408e-05, + "loss": 0.6964, + "step": 4805 + }, + { + "epoch": 0.10676907026559083, + "grad_norm": 1.1561607122421265, + "learning_rate": 1.9442692961058104e-05, + "loss": 0.5766, + "step": 4810 + }, + { + "epoch": 0.10688005682511847, + "grad_norm": 1.2140254974365234, + "learning_rate": 1.9441544628477858e-05, + "loss": 0.5897, + "step": 4815 + }, + { + "epoch": 0.10699104338464611, + "grad_norm": 1.4890080690383911, + "learning_rate": 1.9440395148026283e-05, + "loss": 0.6208, + "step": 4820 + }, + { + "epoch": 0.10710202994417375, + "grad_norm": 1.263723373413086, + "learning_rate": 1.9439244519843123e-05, + "loss": 0.4529, + "step": 4825 + }, + { + "epoch": 0.10721301650370141, + "grad_norm": 1.4165737628936768, + "learning_rate": 1.943809274406827e-05, + "loss": 0.6019, + "step": 4830 + }, + { + "epoch": 0.10732400306322905, + "grad_norm": 0.9435001611709595, + "learning_rate": 1.9436939820841757e-05, + "loss": 0.7244, + "step": 4835 + }, + { + "epoch": 0.10743498962275669, + "grad_norm": 1.3421839475631714, + "learning_rate": 1.943578575030375e-05, + "loss": 0.6204, + "step": 4840 + }, + { + "epoch": 0.10754597618228433, + "grad_norm": 0.8362665176391602, + "learning_rate": 1.9434630532594555e-05, + "loss": 0.4819, + "step": 4845 + }, + { + "epoch": 0.10765696274181197, + "grad_norm": 1.0521749258041382, + "learning_rate": 1.9433474167854624e-05, + "loss": 0.6576, + "step": 4850 + }, + { + "epoch": 0.10776794930133961, + "grad_norm": 0.8353959321975708, + "learning_rate": 1.943231665622454e-05, + "loss": 0.7751, + "step": 4855 + }, + { + "epoch": 0.10787893586086725, + "grad_norm": 0.8259255290031433, + "learning_rate": 1.943115799784503e-05, + "loss": 0.4372, + "step": 4860 + }, + { + "epoch": 0.10798992242039489, + "grad_norm": 1.1912258863449097, + "learning_rate": 1.9429998192856957e-05, + "loss": 0.407, + "step": 4865 + }, + { + "epoch": 0.10810090897992253, + "grad_norm": 0.8998766541481018, + "learning_rate": 1.9428837241401334e-05, + "loss": 0.5316, + "step": 4870 + }, + { + "epoch": 0.10821189553945017, + "grad_norm": 1.0864561796188354, + "learning_rate": 1.94276751436193e-05, + "loss": 0.6402, + "step": 4875 + }, + { + "epoch": 0.10832288209897781, + "grad_norm": 1.6458591222763062, + "learning_rate": 1.942651189965214e-05, + "loss": 0.4848, + "step": 4880 + }, + { + "epoch": 0.10843386865850546, + "grad_norm": 1.5769037008285522, + "learning_rate": 1.9425347509641276e-05, + "loss": 0.5189, + "step": 4885 + }, + { + "epoch": 0.1085448552180331, + "grad_norm": 1.1521767377853394, + "learning_rate": 1.9424181973728274e-05, + "loss": 0.5762, + "step": 4890 + }, + { + "epoch": 0.10865584177756074, + "grad_norm": 1.4296083450317383, + "learning_rate": 1.9423015292054834e-05, + "loss": 0.6333, + "step": 4895 + }, + { + "epoch": 0.10876682833708838, + "grad_norm": 1.4438923597335815, + "learning_rate": 1.9421847464762793e-05, + "loss": 0.7679, + "step": 4900 + }, + { + "epoch": 0.10887781489661602, + "grad_norm": 0.8514357805252075, + "learning_rate": 1.942067849199414e-05, + "loss": 0.5387, + "step": 4905 + }, + { + "epoch": 0.10898880145614366, + "grad_norm": 0.9357392191886902, + "learning_rate": 1.941950837389099e-05, + "loss": 0.5576, + "step": 4910 + }, + { + "epoch": 0.1090997880156713, + "grad_norm": 1.3870668411254883, + "learning_rate": 1.94183371105956e-05, + "loss": 0.5933, + "step": 4915 + }, + { + "epoch": 0.10921077457519894, + "grad_norm": 1.166416883468628, + "learning_rate": 1.9417164702250374e-05, + "loss": 0.5293, + "step": 4920 + }, + { + "epoch": 0.10932176113472658, + "grad_norm": 1.2793128490447998, + "learning_rate": 1.9415991148997843e-05, + "loss": 0.6781, + "step": 4925 + }, + { + "epoch": 0.10943274769425422, + "grad_norm": 1.0564727783203125, + "learning_rate": 1.9414816450980686e-05, + "loss": 0.6231, + "step": 4930 + }, + { + "epoch": 0.10954373425378186, + "grad_norm": 1.6393249034881592, + "learning_rate": 1.9413640608341725e-05, + "loss": 0.7114, + "step": 4935 + }, + { + "epoch": 0.10965472081330951, + "grad_norm": 1.1309027671813965, + "learning_rate": 1.9412463621223904e-05, + "loss": 0.4825, + "step": 4940 + }, + { + "epoch": 0.10976570737283715, + "grad_norm": 1.133995771408081, + "learning_rate": 1.9411285489770328e-05, + "loss": 0.5351, + "step": 4945 + }, + { + "epoch": 0.1098766939323648, + "grad_norm": 0.8685352802276611, + "learning_rate": 1.941010621412422e-05, + "loss": 0.499, + "step": 4950 + }, + { + "epoch": 0.10998768049189243, + "grad_norm": 1.7553036212921143, + "learning_rate": 1.9408925794428964e-05, + "loss": 0.5974, + "step": 4955 + }, + { + "epoch": 0.11009866705142007, + "grad_norm": 1.2436516284942627, + "learning_rate": 1.940774423082806e-05, + "loss": 0.5045, + "step": 4960 + }, + { + "epoch": 0.11020965361094771, + "grad_norm": 1.2464370727539062, + "learning_rate": 1.9406561523465164e-05, + "loss": 0.5085, + "step": 4965 + }, + { + "epoch": 0.11032064017047535, + "grad_norm": 1.2938464879989624, + "learning_rate": 1.9405377672484068e-05, + "loss": 0.4118, + "step": 4970 + }, + { + "epoch": 0.110431626730003, + "grad_norm": 1.4582078456878662, + "learning_rate": 1.9404192678028693e-05, + "loss": 0.7055, + "step": 4975 + }, + { + "epoch": 0.11054261328953063, + "grad_norm": 1.1133296489715576, + "learning_rate": 1.9403006540243113e-05, + "loss": 0.5686, + "step": 4980 + }, + { + "epoch": 0.11065359984905827, + "grad_norm": 1.1432008743286133, + "learning_rate": 1.9401819259271537e-05, + "loss": 0.5702, + "step": 4985 + }, + { + "epoch": 0.11076458640858591, + "grad_norm": 0.8886599540710449, + "learning_rate": 1.9400630835258302e-05, + "loss": 0.7432, + "step": 4990 + }, + { + "epoch": 0.11087557296811357, + "grad_norm": 1.0646532773971558, + "learning_rate": 1.93994412683479e-05, + "loss": 0.5704, + "step": 4995 + }, + { + "epoch": 0.11098655952764121, + "grad_norm": 1.7483314275741577, + "learning_rate": 1.939825055868495e-05, + "loss": 0.4863, + "step": 5000 + }, + { + "epoch": 0.11109754608716885, + "grad_norm": 1.3594627380371094, + "learning_rate": 1.939705870641422e-05, + "loss": 0.4865, + "step": 5005 + }, + { + "epoch": 0.11120853264669649, + "grad_norm": 1.101257085800171, + "learning_rate": 1.9395865711680605e-05, + "loss": 0.4982, + "step": 5010 + }, + { + "epoch": 0.11131951920622413, + "grad_norm": 1.4391264915466309, + "learning_rate": 1.9394671574629147e-05, + "loss": 0.4815, + "step": 5015 + }, + { + "epoch": 0.11143050576575177, + "grad_norm": 1.1944100856781006, + "learning_rate": 1.9393476295405028e-05, + "loss": 0.5674, + "step": 5020 + }, + { + "epoch": 0.1115414923252794, + "grad_norm": 1.2506046295166016, + "learning_rate": 1.9392279874153563e-05, + "loss": 0.4759, + "step": 5025 + }, + { + "epoch": 0.11165247888480705, + "grad_norm": 1.2169004678726196, + "learning_rate": 1.9391082311020214e-05, + "loss": 0.5196, + "step": 5030 + }, + { + "epoch": 0.11176346544433469, + "grad_norm": 1.5347559452056885, + "learning_rate": 1.938988360615057e-05, + "loss": 0.5685, + "step": 5035 + }, + { + "epoch": 0.11187445200386233, + "grad_norm": 1.0205435752868652, + "learning_rate": 1.9388683759690365e-05, + "loss": 0.4319, + "step": 5040 + }, + { + "epoch": 0.11198543856338998, + "grad_norm": 1.3055386543273926, + "learning_rate": 1.938748277178548e-05, + "loss": 0.4752, + "step": 5045 + }, + { + "epoch": 0.11209642512291762, + "grad_norm": 1.2094687223434448, + "learning_rate": 1.938628064258192e-05, + "loss": 0.5108, + "step": 5050 + }, + { + "epoch": 0.11220741168244526, + "grad_norm": 1.497977614402771, + "learning_rate": 1.938507737222584e-05, + "loss": 0.6327, + "step": 5055 + }, + { + "epoch": 0.1123183982419729, + "grad_norm": 1.4827046394348145, + "learning_rate": 1.938387296086353e-05, + "loss": 0.7167, + "step": 5060 + }, + { + "epoch": 0.11242938480150054, + "grad_norm": 1.149739146232605, + "learning_rate": 1.9382667408641413e-05, + "loss": 0.4916, + "step": 5065 + }, + { + "epoch": 0.11254037136102818, + "grad_norm": 1.4573441743850708, + "learning_rate": 1.9381460715706064e-05, + "loss": 0.5497, + "step": 5070 + }, + { + "epoch": 0.11265135792055582, + "grad_norm": 1.1646956205368042, + "learning_rate": 1.938025288220418e-05, + "loss": 0.6231, + "step": 5075 + }, + { + "epoch": 0.11276234448008346, + "grad_norm": 1.2401013374328613, + "learning_rate": 1.937904390828261e-05, + "loss": 0.5464, + "step": 5080 + }, + { + "epoch": 0.1128733310396111, + "grad_norm": 0.9036456346511841, + "learning_rate": 1.937783379408834e-05, + "loss": 0.5749, + "step": 5085 + }, + { + "epoch": 0.11298431759913874, + "grad_norm": 1.4067643880844116, + "learning_rate": 1.9376622539768487e-05, + "loss": 0.4189, + "step": 5090 + }, + { + "epoch": 0.11309530415866638, + "grad_norm": 1.0962282419204712, + "learning_rate": 1.9375410145470307e-05, + "loss": 0.5467, + "step": 5095 + }, + { + "epoch": 0.11320629071819403, + "grad_norm": 1.381117343902588, + "learning_rate": 1.9374196611341212e-05, + "loss": 0.5089, + "step": 5100 + }, + { + "epoch": 0.11331727727772167, + "grad_norm": 1.3301773071289062, + "learning_rate": 1.9372981937528728e-05, + "loss": 0.7172, + "step": 5105 + }, + { + "epoch": 0.11342826383724931, + "grad_norm": 1.3714052438735962, + "learning_rate": 1.9371766124180532e-05, + "loss": 0.6936, + "step": 5110 + }, + { + "epoch": 0.11353925039677695, + "grad_norm": 1.2375777959823608, + "learning_rate": 1.9370549171444443e-05, + "loss": 0.5489, + "step": 5115 + }, + { + "epoch": 0.11365023695630459, + "grad_norm": 1.1200803518295288, + "learning_rate": 1.9369331079468413e-05, + "loss": 0.5341, + "step": 5120 + }, + { + "epoch": 0.11376122351583223, + "grad_norm": 1.386127233505249, + "learning_rate": 1.936811184840053e-05, + "loss": 0.3938, + "step": 5125 + }, + { + "epoch": 0.11387221007535987, + "grad_norm": 1.595672607421875, + "learning_rate": 1.9366891478389034e-05, + "loss": 0.5444, + "step": 5130 + }, + { + "epoch": 0.11398319663488751, + "grad_norm": 0.7065406441688538, + "learning_rate": 1.936566996958228e-05, + "loss": 0.4774, + "step": 5135 + }, + { + "epoch": 0.11409418319441515, + "grad_norm": 1.3757734298706055, + "learning_rate": 1.9364447322128784e-05, + "loss": 0.594, + "step": 5140 + }, + { + "epoch": 0.11420516975394279, + "grad_norm": 1.257797122001648, + "learning_rate": 1.9363223536177186e-05, + "loss": 0.6493, + "step": 5145 + }, + { + "epoch": 0.11431615631347043, + "grad_norm": 1.1501637697219849, + "learning_rate": 1.9361998611876272e-05, + "loss": 0.4744, + "step": 5150 + }, + { + "epoch": 0.11442714287299809, + "grad_norm": 1.1146260499954224, + "learning_rate": 1.9360772549374968e-05, + "loss": 0.4714, + "step": 5155 + }, + { + "epoch": 0.11453812943252573, + "grad_norm": 1.1389085054397583, + "learning_rate": 1.9359545348822326e-05, + "loss": 0.602, + "step": 5160 + }, + { + "epoch": 0.11464911599205337, + "grad_norm": 1.3476996421813965, + "learning_rate": 1.935831701036755e-05, + "loss": 0.677, + "step": 5165 + }, + { + "epoch": 0.114760102551581, + "grad_norm": 1.0346547365188599, + "learning_rate": 1.9357087534159982e-05, + "loss": 0.7114, + "step": 5170 + }, + { + "epoch": 0.11487108911110865, + "grad_norm": 1.2413575649261475, + "learning_rate": 1.9355856920349092e-05, + "loss": 0.6657, + "step": 5175 + }, + { + "epoch": 0.11498207567063629, + "grad_norm": 1.1822632551193237, + "learning_rate": 1.9354625169084494e-05, + "loss": 0.5644, + "step": 5180 + }, + { + "epoch": 0.11509306223016393, + "grad_norm": 1.2111988067626953, + "learning_rate": 1.9353392280515938e-05, + "loss": 0.505, + "step": 5185 + }, + { + "epoch": 0.11520404878969157, + "grad_norm": 1.4630290269851685, + "learning_rate": 1.935215825479332e-05, + "loss": 0.4985, + "step": 5190 + }, + { + "epoch": 0.1153150353492192, + "grad_norm": 1.3057409524917603, + "learning_rate": 1.9350923092066668e-05, + "loss": 0.541, + "step": 5195 + }, + { + "epoch": 0.11542602190874685, + "grad_norm": 1.3751634359359741, + "learning_rate": 1.9349686792486143e-05, + "loss": 0.6219, + "step": 5200 + }, + { + "epoch": 0.11553700846827449, + "grad_norm": 0.8946861028671265, + "learning_rate": 1.9348449356202054e-05, + "loss": 0.4943, + "step": 5205 + }, + { + "epoch": 0.11564799502780214, + "grad_norm": 1.3821775913238525, + "learning_rate": 1.9347210783364846e-05, + "loss": 0.5903, + "step": 5210 + }, + { + "epoch": 0.11575898158732978, + "grad_norm": 1.5298506021499634, + "learning_rate": 1.93459710741251e-05, + "loss": 0.6664, + "step": 5215 + }, + { + "epoch": 0.11586996814685742, + "grad_norm": 1.3290419578552246, + "learning_rate": 1.9344730228633535e-05, + "loss": 0.681, + "step": 5220 + }, + { + "epoch": 0.11598095470638506, + "grad_norm": 0.906023383140564, + "learning_rate": 1.934348824704101e-05, + "loss": 0.4962, + "step": 5225 + }, + { + "epoch": 0.1160919412659127, + "grad_norm": 1.286221981048584, + "learning_rate": 1.9342245129498516e-05, + "loss": 0.6054, + "step": 5230 + }, + { + "epoch": 0.11620292782544034, + "grad_norm": 1.4231698513031006, + "learning_rate": 1.9341000876157193e-05, + "loss": 0.487, + "step": 5235 + }, + { + "epoch": 0.11631391438496798, + "grad_norm": 1.2439308166503906, + "learning_rate": 1.933975548716831e-05, + "loss": 0.5635, + "step": 5240 + }, + { + "epoch": 0.11642490094449562, + "grad_norm": 1.6440762281417847, + "learning_rate": 1.9338508962683278e-05, + "loss": 0.6192, + "step": 5245 + }, + { + "epoch": 0.11653588750402326, + "grad_norm": 1.43458092212677, + "learning_rate": 1.9337261302853644e-05, + "loss": 0.6437, + "step": 5250 + }, + { + "epoch": 0.1166468740635509, + "grad_norm": 1.1676011085510254, + "learning_rate": 1.9336012507831097e-05, + "loss": 0.4349, + "step": 5255 + }, + { + "epoch": 0.11675786062307855, + "grad_norm": 1.3660240173339844, + "learning_rate": 1.9334762577767458e-05, + "loss": 0.4539, + "step": 5260 + }, + { + "epoch": 0.11686884718260619, + "grad_norm": 1.2643660306930542, + "learning_rate": 1.9333511512814692e-05, + "loss": 0.6076, + "step": 5265 + }, + { + "epoch": 0.11697983374213383, + "grad_norm": 0.8496934175491333, + "learning_rate": 1.93322593131249e-05, + "loss": 0.6067, + "step": 5270 + }, + { + "epoch": 0.11709082030166147, + "grad_norm": 0.9153058528900146, + "learning_rate": 1.933100597885032e-05, + "loss": 0.4675, + "step": 5275 + }, + { + "epoch": 0.11720180686118911, + "grad_norm": 1.0530915260314941, + "learning_rate": 1.932975151014332e-05, + "loss": 0.5721, + "step": 5280 + }, + { + "epoch": 0.11731279342071675, + "grad_norm": 1.0556403398513794, + "learning_rate": 1.932849590715643e-05, + "loss": 0.5501, + "step": 5285 + }, + { + "epoch": 0.11742377998024439, + "grad_norm": 1.5842150449752808, + "learning_rate": 1.9327239170042288e-05, + "loss": 0.3836, + "step": 5290 + }, + { + "epoch": 0.11753476653977203, + "grad_norm": 0.7407785654067993, + "learning_rate": 1.9325981298953688e-05, + "loss": 0.5094, + "step": 5295 + }, + { + "epoch": 0.11764575309929967, + "grad_norm": 1.1622848510742188, + "learning_rate": 1.932472229404356e-05, + "loss": 0.5122, + "step": 5300 + }, + { + "epoch": 0.11775673965882731, + "grad_norm": 1.4510892629623413, + "learning_rate": 1.9323462155464967e-05, + "loss": 0.6759, + "step": 5305 + }, + { + "epoch": 0.11786772621835495, + "grad_norm": 1.4540632963180542, + "learning_rate": 1.9322200883371118e-05, + "loss": 0.575, + "step": 5310 + }, + { + "epoch": 0.1179787127778826, + "grad_norm": 1.0026957988739014, + "learning_rate": 1.9320938477915346e-05, + "loss": 0.5546, + "step": 5315 + }, + { + "epoch": 0.11808969933741024, + "grad_norm": 1.5292295217514038, + "learning_rate": 1.931967493925113e-05, + "loss": 0.4083, + "step": 5320 + }, + { + "epoch": 0.11820068589693788, + "grad_norm": 1.1803737878799438, + "learning_rate": 1.9318410267532096e-05, + "loss": 0.581, + "step": 5325 + }, + { + "epoch": 0.11831167245646552, + "grad_norm": 0.9704664945602417, + "learning_rate": 1.9317144462911992e-05, + "loss": 0.5058, + "step": 5330 + }, + { + "epoch": 0.11842265901599316, + "grad_norm": 0.9553489089012146, + "learning_rate": 1.9315877525544712e-05, + "loss": 0.486, + "step": 5335 + }, + { + "epoch": 0.1185336455755208, + "grad_norm": 1.0737321376800537, + "learning_rate": 1.9314609455584285e-05, + "loss": 0.5671, + "step": 5340 + }, + { + "epoch": 0.11864463213504844, + "grad_norm": 1.0113078355789185, + "learning_rate": 1.931334025318488e-05, + "loss": 0.4365, + "step": 5345 + }, + { + "epoch": 0.11875561869457608, + "grad_norm": 1.2628003358840942, + "learning_rate": 1.93120699185008e-05, + "loss": 0.5765, + "step": 5350 + }, + { + "epoch": 0.11886660525410372, + "grad_norm": 1.0682296752929688, + "learning_rate": 1.9310798451686488e-05, + "loss": 0.7266, + "step": 5355 + }, + { + "epoch": 0.11897759181363136, + "grad_norm": 1.291569471359253, + "learning_rate": 1.9309525852896533e-05, + "loss": 0.4749, + "step": 5360 + }, + { + "epoch": 0.119088578373159, + "grad_norm": 1.0546334981918335, + "learning_rate": 1.9308252122285643e-05, + "loss": 0.353, + "step": 5365 + }, + { + "epoch": 0.11919956493268666, + "grad_norm": 1.0950523614883423, + "learning_rate": 1.9306977260008676e-05, + "loss": 0.5275, + "step": 5370 + }, + { + "epoch": 0.1193105514922143, + "grad_norm": 1.3744865655899048, + "learning_rate": 1.9305701266220626e-05, + "loss": 0.552, + "step": 5375 + }, + { + "epoch": 0.11942153805174194, + "grad_norm": 1.0956076383590698, + "learning_rate": 1.9304424141076627e-05, + "loss": 0.5896, + "step": 5380 + }, + { + "epoch": 0.11953252461126958, + "grad_norm": 1.3505736589431763, + "learning_rate": 1.9303145884731946e-05, + "loss": 0.5449, + "step": 5385 + }, + { + "epoch": 0.11964351117079722, + "grad_norm": 1.0760904550552368, + "learning_rate": 1.9301866497341984e-05, + "loss": 0.5097, + "step": 5390 + }, + { + "epoch": 0.11975449773032486, + "grad_norm": 1.2607932090759277, + "learning_rate": 1.9300585979062295e-05, + "loss": 0.5546, + "step": 5395 + }, + { + "epoch": 0.1198654842898525, + "grad_norm": 1.0783039331436157, + "learning_rate": 1.9299304330048554e-05, + "loss": 0.5221, + "step": 5400 + }, + { + "epoch": 0.11997647084938014, + "grad_norm": 1.6365145444869995, + "learning_rate": 1.929802155045658e-05, + "loss": 0.5797, + "step": 5405 + }, + { + "epoch": 0.12008745740890778, + "grad_norm": 1.2198350429534912, + "learning_rate": 1.9296737640442325e-05, + "loss": 0.5849, + "step": 5410 + }, + { + "epoch": 0.12019844396843542, + "grad_norm": 1.1546142101287842, + "learning_rate": 1.929545260016189e-05, + "loss": 0.5482, + "step": 5415 + }, + { + "epoch": 0.12030943052796306, + "grad_norm": 1.031053900718689, + "learning_rate": 1.92941664297715e-05, + "loss": 0.6086, + "step": 5420 + }, + { + "epoch": 0.12042041708749071, + "grad_norm": 1.257300615310669, + "learning_rate": 1.9292879129427528e-05, + "loss": 0.5569, + "step": 5425 + }, + { + "epoch": 0.12053140364701835, + "grad_norm": 1.3408029079437256, + "learning_rate": 1.9291590699286474e-05, + "loss": 0.5987, + "step": 5430 + }, + { + "epoch": 0.12064239020654599, + "grad_norm": 1.0354056358337402, + "learning_rate": 1.9290301139504988e-05, + "loss": 0.5358, + "step": 5435 + }, + { + "epoch": 0.12075337676607363, + "grad_norm": 1.4962389469146729, + "learning_rate": 1.9289010450239843e-05, + "loss": 0.5749, + "step": 5440 + }, + { + "epoch": 0.12086436332560127, + "grad_norm": 0.8980013132095337, + "learning_rate": 1.9287718631647964e-05, + "loss": 0.4667, + "step": 5445 + }, + { + "epoch": 0.12097534988512891, + "grad_norm": 1.0994272232055664, + "learning_rate": 1.9286425683886403e-05, + "loss": 0.453, + "step": 5450 + }, + { + "epoch": 0.12108633644465655, + "grad_norm": 1.1135613918304443, + "learning_rate": 1.928513160711235e-05, + "loss": 0.5495, + "step": 5455 + }, + { + "epoch": 0.12119732300418419, + "grad_norm": 1.1412851810455322, + "learning_rate": 1.9283836401483132e-05, + "loss": 0.4949, + "step": 5460 + }, + { + "epoch": 0.12130830956371183, + "grad_norm": 1.3352820873260498, + "learning_rate": 1.9282540067156224e-05, + "loss": 0.6287, + "step": 5465 + }, + { + "epoch": 0.12141929612323947, + "grad_norm": 1.2998685836791992, + "learning_rate": 1.9281242604289228e-05, + "loss": 0.4789, + "step": 5470 + }, + { + "epoch": 0.12153028268276711, + "grad_norm": 0.9999222755432129, + "learning_rate": 1.927994401303988e-05, + "loss": 0.4718, + "step": 5475 + }, + { + "epoch": 0.12164126924229476, + "grad_norm": 1.091381549835205, + "learning_rate": 1.9278644293566064e-05, + "loss": 0.5984, + "step": 5480 + }, + { + "epoch": 0.1217522558018224, + "grad_norm": 1.0779790878295898, + "learning_rate": 1.9277343446025788e-05, + "loss": 0.5483, + "step": 5485 + }, + { + "epoch": 0.12186324236135004, + "grad_norm": 1.052999496459961, + "learning_rate": 1.9276041470577213e-05, + "loss": 0.5924, + "step": 5490 + }, + { + "epoch": 0.12197422892087768, + "grad_norm": 1.0639392137527466, + "learning_rate": 1.9274738367378627e-05, + "loss": 0.5566, + "step": 5495 + }, + { + "epoch": 0.12208521548040532, + "grad_norm": 1.0299577713012695, + "learning_rate": 1.927343413658845e-05, + "loss": 0.4247, + "step": 5500 + }, + { + "epoch": 0.12219620203993296, + "grad_norm": 0.8614324331283569, + "learning_rate": 1.9272128778365258e-05, + "loss": 0.7018, + "step": 5505 + }, + { + "epoch": 0.1223071885994606, + "grad_norm": 1.3008124828338623, + "learning_rate": 1.9270822292867742e-05, + "loss": 0.5816, + "step": 5510 + }, + { + "epoch": 0.12241817515898824, + "grad_norm": 0.9771310687065125, + "learning_rate": 1.9269514680254742e-05, + "loss": 0.5727, + "step": 5515 + }, + { + "epoch": 0.12252916171851588, + "grad_norm": 0.9506747126579285, + "learning_rate": 1.9268205940685236e-05, + "loss": 0.5689, + "step": 5520 + }, + { + "epoch": 0.12264014827804352, + "grad_norm": 0.9057345986366272, + "learning_rate": 1.9266896074318335e-05, + "loss": 0.5966, + "step": 5525 + }, + { + "epoch": 0.12275113483757118, + "grad_norm": 1.8075003623962402, + "learning_rate": 1.926558508131329e-05, + "loss": 0.5467, + "step": 5530 + }, + { + "epoch": 0.12286212139709882, + "grad_norm": 1.395269751548767, + "learning_rate": 1.9264272961829484e-05, + "loss": 0.4531, + "step": 5535 + }, + { + "epoch": 0.12297310795662646, + "grad_norm": 0.9755896329879761, + "learning_rate": 1.926295971602644e-05, + "loss": 0.4991, + "step": 5540 + }, + { + "epoch": 0.1230840945161541, + "grad_norm": 1.3618781566619873, + "learning_rate": 1.926164534406382e-05, + "loss": 0.4559, + "step": 5545 + }, + { + "epoch": 0.12319508107568174, + "grad_norm": 1.4191176891326904, + "learning_rate": 1.926032984610142e-05, + "loss": 0.5268, + "step": 5550 + }, + { + "epoch": 0.12330606763520938, + "grad_norm": 1.3266234397888184, + "learning_rate": 1.9259013222299174e-05, + "loss": 0.6116, + "step": 5555 + }, + { + "epoch": 0.12341705419473702, + "grad_norm": 1.6718363761901855, + "learning_rate": 1.9257695472817152e-05, + "loss": 0.4408, + "step": 5560 + }, + { + "epoch": 0.12352804075426466, + "grad_norm": 1.0698461532592773, + "learning_rate": 1.9256376597815565e-05, + "loss": 0.3938, + "step": 5565 + }, + { + "epoch": 0.1236390273137923, + "grad_norm": 0.9428433179855347, + "learning_rate": 1.9255056597454755e-05, + "loss": 0.6155, + "step": 5570 + }, + { + "epoch": 0.12375001387331994, + "grad_norm": 1.1381133794784546, + "learning_rate": 1.9253735471895198e-05, + "loss": 0.5286, + "step": 5575 + }, + { + "epoch": 0.12386100043284758, + "grad_norm": 0.7855455279350281, + "learning_rate": 1.925241322129752e-05, + "loss": 0.578, + "step": 5580 + }, + { + "epoch": 0.12397198699237523, + "grad_norm": 0.7438721060752869, + "learning_rate": 1.9251089845822472e-05, + "loss": 0.452, + "step": 5585 + }, + { + "epoch": 0.12408297355190287, + "grad_norm": 1.136475682258606, + "learning_rate": 1.9249765345630948e-05, + "loss": 0.469, + "step": 5590 + }, + { + "epoch": 0.12419396011143051, + "grad_norm": 1.2477785348892212, + "learning_rate": 1.9248439720883975e-05, + "loss": 0.4939, + "step": 5595 + }, + { + "epoch": 0.12430494667095815, + "grad_norm": 1.3942158222198486, + "learning_rate": 1.9247112971742713e-05, + "loss": 0.5784, + "step": 5600 + }, + { + "epoch": 0.12441593323048579, + "grad_norm": 1.3450487852096558, + "learning_rate": 1.9245785098368474e-05, + "loss": 0.5385, + "step": 5605 + }, + { + "epoch": 0.12452691979001343, + "grad_norm": 0.8705973029136658, + "learning_rate": 1.924445610092269e-05, + "loss": 0.3492, + "step": 5610 + }, + { + "epoch": 0.12463790634954107, + "grad_norm": 2.236539602279663, + "learning_rate": 1.9243125979566933e-05, + "loss": 0.634, + "step": 5615 + }, + { + "epoch": 0.12474889290906871, + "grad_norm": 1.088524580001831, + "learning_rate": 1.924179473446292e-05, + "loss": 0.4404, + "step": 5620 + }, + { + "epoch": 0.12485987946859635, + "grad_norm": 1.0636274814605713, + "learning_rate": 1.9240462365772495e-05, + "loss": 0.3987, + "step": 5625 + }, + { + "epoch": 0.12497086602812399, + "grad_norm": 1.0778905153274536, + "learning_rate": 1.923912887365765e-05, + "loss": 0.6523, + "step": 5630 + }, + { + "epoch": 0.12508185258765164, + "grad_norm": 1.0517983436584473, + "learning_rate": 1.9237794258280503e-05, + "loss": 0.5392, + "step": 5635 + }, + { + "epoch": 0.12519283914717927, + "grad_norm": 1.3265271186828613, + "learning_rate": 1.923645851980331e-05, + "loss": 0.7403, + "step": 5640 + }, + { + "epoch": 0.12530382570670692, + "grad_norm": 1.2673416137695312, + "learning_rate": 1.9235121658388463e-05, + "loss": 0.364, + "step": 5645 + }, + { + "epoch": 0.12541481226623455, + "grad_norm": 1.2772183418273926, + "learning_rate": 1.9233783674198502e-05, + "loss": 0.6218, + "step": 5650 + }, + { + "epoch": 0.1255257988257622, + "grad_norm": 0.9975202083587646, + "learning_rate": 1.9232444567396088e-05, + "loss": 0.4819, + "step": 5655 + }, + { + "epoch": 0.12563678538528983, + "grad_norm": 1.3848180770874023, + "learning_rate": 1.9231104338144027e-05, + "loss": 0.6272, + "step": 5660 + }, + { + "epoch": 0.12574777194481748, + "grad_norm": 1.2946040630340576, + "learning_rate": 1.9229762986605257e-05, + "loss": 0.6036, + "step": 5665 + }, + { + "epoch": 0.12585875850434514, + "grad_norm": 1.101122498512268, + "learning_rate": 1.922842051294286e-05, + "loss": 0.5324, + "step": 5670 + }, + { + "epoch": 0.12596974506387276, + "grad_norm": 1.2069895267486572, + "learning_rate": 1.9227076917320045e-05, + "loss": 0.5448, + "step": 5675 + }, + { + "epoch": 0.12608073162340042, + "grad_norm": 0.9136874675750732, + "learning_rate": 1.9225732199900164e-05, + "loss": 0.5141, + "step": 5680 + }, + { + "epoch": 0.12619171818292804, + "grad_norm": 1.1953130960464478, + "learning_rate": 1.92243863608467e-05, + "loss": 0.5794, + "step": 5685 + }, + { + "epoch": 0.1263027047424557, + "grad_norm": 0.9920978546142578, + "learning_rate": 1.9223039400323284e-05, + "loss": 0.6553, + "step": 5690 + }, + { + "epoch": 0.12641369130198332, + "grad_norm": 1.174495816230774, + "learning_rate": 1.9221691318493666e-05, + "loss": 0.4533, + "step": 5695 + }, + { + "epoch": 0.12652467786151098, + "grad_norm": 1.5274707078933716, + "learning_rate": 1.9220342115521746e-05, + "loss": 0.6146, + "step": 5700 + }, + { + "epoch": 0.1266356644210386, + "grad_norm": 1.2732661962509155, + "learning_rate": 1.9218991791571553e-05, + "loss": 0.4841, + "step": 5705 + }, + { + "epoch": 0.12674665098056626, + "grad_norm": 1.6695985794067383, + "learning_rate": 1.921764034680726e-05, + "loss": 0.5141, + "step": 5710 + }, + { + "epoch": 0.12685763754009388, + "grad_norm": 0.9126291275024414, + "learning_rate": 1.9216287781393165e-05, + "loss": 0.6414, + "step": 5715 + }, + { + "epoch": 0.12696862409962154, + "grad_norm": 1.1736328601837158, + "learning_rate": 1.9214934095493706e-05, + "loss": 0.5365, + "step": 5720 + }, + { + "epoch": 0.1270796106591492, + "grad_norm": 1.4496265649795532, + "learning_rate": 1.921357928927347e-05, + "loss": 0.6288, + "step": 5725 + }, + { + "epoch": 0.12719059721867682, + "grad_norm": 1.6686094999313354, + "learning_rate": 1.921222336289716e-05, + "loss": 0.5421, + "step": 5730 + }, + { + "epoch": 0.12730158377820447, + "grad_norm": 1.0650641918182373, + "learning_rate": 1.921086631652963e-05, + "loss": 0.5287, + "step": 5735 + }, + { + "epoch": 0.1274125703377321, + "grad_norm": 1.0161991119384766, + "learning_rate": 1.9209508150335864e-05, + "loss": 0.4341, + "step": 5740 + }, + { + "epoch": 0.12752355689725975, + "grad_norm": 1.4244519472122192, + "learning_rate": 1.9208148864480987e-05, + "loss": 0.5102, + "step": 5745 + }, + { + "epoch": 0.12763454345678737, + "grad_norm": 1.0328584909439087, + "learning_rate": 1.920678845913025e-05, + "loss": 0.2526, + "step": 5750 + }, + { + "epoch": 0.12774553001631503, + "grad_norm": 1.2607382535934448, + "learning_rate": 1.9205426934449047e-05, + "loss": 0.5606, + "step": 5755 + }, + { + "epoch": 0.12785651657584265, + "grad_norm": 1.7482237815856934, + "learning_rate": 1.9204064290602912e-05, + "loss": 0.4606, + "step": 5760 + }, + { + "epoch": 0.1279675031353703, + "grad_norm": 1.1892881393432617, + "learning_rate": 1.920270052775751e-05, + "loss": 0.4634, + "step": 5765 + }, + { + "epoch": 0.12807848969489793, + "grad_norm": 1.5729913711547852, + "learning_rate": 1.920133564607864e-05, + "loss": 0.6063, + "step": 5770 + }, + { + "epoch": 0.1281894762544256, + "grad_norm": 1.244259238243103, + "learning_rate": 1.9199969645732238e-05, + "loss": 0.5447, + "step": 5775 + }, + { + "epoch": 0.12830046281395324, + "grad_norm": 1.4631567001342773, + "learning_rate": 1.9198602526884388e-05, + "loss": 0.5313, + "step": 5780 + }, + { + "epoch": 0.12841144937348087, + "grad_norm": 1.0688591003417969, + "learning_rate": 1.9197234289701286e-05, + "loss": 0.5582, + "step": 5785 + }, + { + "epoch": 0.12852243593300852, + "grad_norm": 1.647247552871704, + "learning_rate": 1.9195864934349286e-05, + "loss": 0.5988, + "step": 5790 + }, + { + "epoch": 0.12863342249253615, + "grad_norm": 1.6146548986434937, + "learning_rate": 1.919449446099487e-05, + "loss": 0.5862, + "step": 5795 + }, + { + "epoch": 0.1287444090520638, + "grad_norm": 1.2010599374771118, + "learning_rate": 1.919312286980465e-05, + "loss": 0.6445, + "step": 5800 + }, + { + "epoch": 0.12885539561159143, + "grad_norm": 1.0428661108016968, + "learning_rate": 1.9191750160945382e-05, + "loss": 0.4673, + "step": 5805 + }, + { + "epoch": 0.12896638217111908, + "grad_norm": 0.9652555584907532, + "learning_rate": 1.9190376334583963e-05, + "loss": 0.4984, + "step": 5810 + }, + { + "epoch": 0.1290773687306467, + "grad_norm": 1.1768306493759155, + "learning_rate": 1.9189001390887404e-05, + "loss": 0.5078, + "step": 5815 + }, + { + "epoch": 0.12918835529017436, + "grad_norm": 1.0328947305679321, + "learning_rate": 1.918762533002288e-05, + "loss": 0.5983, + "step": 5820 + }, + { + "epoch": 0.129299341849702, + "grad_norm": 1.2240079641342163, + "learning_rate": 1.9186248152157676e-05, + "loss": 0.6412, + "step": 5825 + }, + { + "epoch": 0.12941032840922964, + "grad_norm": 1.1072537899017334, + "learning_rate": 1.9184869857459233e-05, + "loss": 0.45, + "step": 5830 + }, + { + "epoch": 0.1295213149687573, + "grad_norm": 1.5246691703796387, + "learning_rate": 1.9183490446095116e-05, + "loss": 0.6749, + "step": 5835 + }, + { + "epoch": 0.12963230152828492, + "grad_norm": 1.4271918535232544, + "learning_rate": 1.9182109918233024e-05, + "loss": 0.626, + "step": 5840 + }, + { + "epoch": 0.12974328808781257, + "grad_norm": 1.091291904449463, + "learning_rate": 1.918072827404081e-05, + "loss": 0.4738, + "step": 5845 + }, + { + "epoch": 0.1298542746473402, + "grad_norm": 1.253570795059204, + "learning_rate": 1.9179345513686442e-05, + "loss": 0.6955, + "step": 5850 + }, + { + "epoch": 0.12996526120686785, + "grad_norm": 0.9824383854866028, + "learning_rate": 1.9177961637338027e-05, + "loss": 0.6052, + "step": 5855 + }, + { + "epoch": 0.13007624776639548, + "grad_norm": 1.1693501472473145, + "learning_rate": 1.9176576645163816e-05, + "loss": 0.5972, + "step": 5860 + }, + { + "epoch": 0.13018723432592313, + "grad_norm": 1.0172988176345825, + "learning_rate": 1.9175190537332198e-05, + "loss": 0.4938, + "step": 5865 + }, + { + "epoch": 0.13029822088545076, + "grad_norm": 1.4785795211791992, + "learning_rate": 1.9173803314011682e-05, + "loss": 0.5899, + "step": 5870 + }, + { + "epoch": 0.13040920744497841, + "grad_norm": 1.265539526939392, + "learning_rate": 1.9172414975370925e-05, + "loss": 0.5751, + "step": 5875 + }, + { + "epoch": 0.13052019400450604, + "grad_norm": 1.1638941764831543, + "learning_rate": 1.917102552157872e-05, + "loss": 0.6387, + "step": 5880 + }, + { + "epoch": 0.1306311805640337, + "grad_norm": 1.3803809881210327, + "learning_rate": 1.9169634952803988e-05, + "loss": 0.5157, + "step": 5885 + }, + { + "epoch": 0.13074216712356135, + "grad_norm": 1.2643953561782837, + "learning_rate": 1.916824326921579e-05, + "loss": 0.4576, + "step": 5890 + }, + { + "epoch": 0.13085315368308897, + "grad_norm": 1.0698553323745728, + "learning_rate": 1.9166850470983323e-05, + "loss": 0.5362, + "step": 5895 + }, + { + "epoch": 0.13096414024261663, + "grad_norm": 1.2042971849441528, + "learning_rate": 1.916545655827592e-05, + "loss": 0.5263, + "step": 5900 + }, + { + "epoch": 0.13107512680214425, + "grad_norm": 1.6081792116165161, + "learning_rate": 1.9164061531263047e-05, + "loss": 0.4717, + "step": 5905 + }, + { + "epoch": 0.1311861133616719, + "grad_norm": 1.3085589408874512, + "learning_rate": 1.9162665390114305e-05, + "loss": 0.4643, + "step": 5910 + }, + { + "epoch": 0.13129709992119953, + "grad_norm": 1.3212761878967285, + "learning_rate": 1.916126813499944e-05, + "loss": 0.5564, + "step": 5915 + }, + { + "epoch": 0.1314080864807272, + "grad_norm": 0.9593100547790527, + "learning_rate": 1.9159869766088315e-05, + "loss": 0.37, + "step": 5920 + }, + { + "epoch": 0.1315190730402548, + "grad_norm": 1.0401902198791504, + "learning_rate": 1.9158470283550944e-05, + "loss": 0.6124, + "step": 5925 + }, + { + "epoch": 0.13163005959978247, + "grad_norm": 1.2800883054733276, + "learning_rate": 1.915706968755747e-05, + "loss": 0.4894, + "step": 5930 + }, + { + "epoch": 0.13174104615931012, + "grad_norm": 1.1242331266403198, + "learning_rate": 1.9155667978278175e-05, + "loss": 0.5963, + "step": 5935 + }, + { + "epoch": 0.13185203271883775, + "grad_norm": 1.1818042993545532, + "learning_rate": 1.9154265155883473e-05, + "loss": 0.4622, + "step": 5940 + }, + { + "epoch": 0.1319630192783654, + "grad_norm": 0.9774150848388672, + "learning_rate": 1.9152861220543918e-05, + "loss": 0.5023, + "step": 5945 + }, + { + "epoch": 0.13207400583789303, + "grad_norm": 0.8884637951850891, + "learning_rate": 1.9151456172430186e-05, + "loss": 0.3321, + "step": 5950 + }, + { + "epoch": 0.13218499239742068, + "grad_norm": 1.5701459646224976, + "learning_rate": 1.9150050011713105e-05, + "loss": 0.5133, + "step": 5955 + }, + { + "epoch": 0.1322959789569483, + "grad_norm": 0.9315885901451111, + "learning_rate": 1.9148642738563636e-05, + "loss": 0.5302, + "step": 5960 + }, + { + "epoch": 0.13240696551647596, + "grad_norm": 1.5833088159561157, + "learning_rate": 1.9147234353152862e-05, + "loss": 0.5988, + "step": 5965 + }, + { + "epoch": 0.1325179520760036, + "grad_norm": 1.3734363317489624, + "learning_rate": 1.914582485565201e-05, + "loss": 0.5986, + "step": 5970 + }, + { + "epoch": 0.13262893863553124, + "grad_norm": 1.3130841255187988, + "learning_rate": 1.9144414246232448e-05, + "loss": 0.4492, + "step": 5975 + }, + { + "epoch": 0.13273992519505887, + "grad_norm": 1.4003158807754517, + "learning_rate": 1.914300252506567e-05, + "loss": 0.3576, + "step": 5980 + }, + { + "epoch": 0.13285091175458652, + "grad_norm": 0.9939400553703308, + "learning_rate": 1.9141589692323304e-05, + "loss": 0.413, + "step": 5985 + }, + { + "epoch": 0.13296189831411417, + "grad_norm": 1.0590107440948486, + "learning_rate": 1.9140175748177126e-05, + "loss": 0.5614, + "step": 5990 + }, + { + "epoch": 0.1330728848736418, + "grad_norm": 1.2037274837493896, + "learning_rate": 1.9138760692799033e-05, + "loss": 0.4923, + "step": 5995 + }, + { + "epoch": 0.13318387143316945, + "grad_norm": 1.8287067413330078, + "learning_rate": 1.9137344526361064e-05, + "loss": 0.4642, + "step": 6000 + }, + { + "epoch": 0.13329485799269708, + "grad_norm": 1.330986499786377, + "learning_rate": 1.9135927249035393e-05, + "loss": 0.5365, + "step": 6005 + }, + { + "epoch": 0.13340584455222473, + "grad_norm": 0.9285293817520142, + "learning_rate": 1.9134508860994323e-05, + "loss": 0.4484, + "step": 6010 + }, + { + "epoch": 0.13351683111175236, + "grad_norm": 0.8908197283744812, + "learning_rate": 1.9133089362410305e-05, + "loss": 0.4485, + "step": 6015 + }, + { + "epoch": 0.13362781767128, + "grad_norm": 1.3462244272232056, + "learning_rate": 1.9131668753455906e-05, + "loss": 0.5788, + "step": 6020 + }, + { + "epoch": 0.13373880423080764, + "grad_norm": 0.9598202109336853, + "learning_rate": 1.9130247034303852e-05, + "loss": 0.4366, + "step": 6025 + }, + { + "epoch": 0.1338497907903353, + "grad_norm": 1.2950756549835205, + "learning_rate": 1.912882420512698e-05, + "loss": 0.4562, + "step": 6030 + }, + { + "epoch": 0.13396077734986292, + "grad_norm": 1.2489832639694214, + "learning_rate": 1.912740026609828e-05, + "loss": 0.5912, + "step": 6035 + }, + { + "epoch": 0.13407176390939057, + "grad_norm": 1.3526968955993652, + "learning_rate": 1.9125975217390865e-05, + "loss": 0.4472, + "step": 6040 + }, + { + "epoch": 0.13418275046891823, + "grad_norm": 0.9325576424598694, + "learning_rate": 1.9124549059177988e-05, + "loss": 0.4222, + "step": 6045 + }, + { + "epoch": 0.13429373702844585, + "grad_norm": 0.9659907817840576, + "learning_rate": 1.912312179163304e-05, + "loss": 0.4955, + "step": 6050 + }, + { + "epoch": 0.1344047235879735, + "grad_norm": 0.9910063147544861, + "learning_rate": 1.912169341492954e-05, + "loss": 0.6083, + "step": 6055 + }, + { + "epoch": 0.13451571014750113, + "grad_norm": 1.6210241317749023, + "learning_rate": 1.9120263929241147e-05, + "loss": 0.5013, + "step": 6060 + }, + { + "epoch": 0.1346266967070288, + "grad_norm": 1.1847623586654663, + "learning_rate": 1.911883333474165e-05, + "loss": 0.6173, + "step": 6065 + }, + { + "epoch": 0.1347376832665564, + "grad_norm": 1.136588454246521, + "learning_rate": 1.9117401631604978e-05, + "loss": 0.6391, + "step": 6070 + }, + { + "epoch": 0.13484866982608407, + "grad_norm": 1.2647699117660522, + "learning_rate": 1.9115968820005197e-05, + "loss": 0.5167, + "step": 6075 + }, + { + "epoch": 0.1349596563856117, + "grad_norm": 1.39508056640625, + "learning_rate": 1.9114534900116496e-05, + "loss": 0.5092, + "step": 6080 + }, + { + "epoch": 0.13507064294513935, + "grad_norm": 1.0421615839004517, + "learning_rate": 1.9113099872113212e-05, + "loss": 0.5522, + "step": 6085 + }, + { + "epoch": 0.13518162950466697, + "grad_norm": 1.1894711256027222, + "learning_rate": 1.9111663736169806e-05, + "loss": 0.5877, + "step": 6090 + }, + { + "epoch": 0.13529261606419463, + "grad_norm": 1.133510708808899, + "learning_rate": 1.9110226492460886e-05, + "loss": 0.3857, + "step": 6095 + }, + { + "epoch": 0.13540360262372228, + "grad_norm": 1.1787534952163696, + "learning_rate": 1.9108788141161178e-05, + "loss": 0.5668, + "step": 6100 + }, + { + "epoch": 0.1355145891832499, + "grad_norm": 0.940147340297699, + "learning_rate": 1.9107348682445556e-05, + "loss": 0.4328, + "step": 6105 + }, + { + "epoch": 0.13562557574277756, + "grad_norm": 1.5695501565933228, + "learning_rate": 1.910590811648903e-05, + "loss": 0.5116, + "step": 6110 + }, + { + "epoch": 0.13573656230230519, + "grad_norm": 0.8534506559371948, + "learning_rate": 1.910446644346673e-05, + "loss": 0.5906, + "step": 6115 + }, + { + "epoch": 0.13584754886183284, + "grad_norm": 1.2275046110153198, + "learning_rate": 1.910302366355393e-05, + "loss": 0.6306, + "step": 6120 + }, + { + "epoch": 0.13595853542136047, + "grad_norm": 1.118098497390747, + "learning_rate": 1.910157977692605e-05, + "loss": 0.4972, + "step": 6125 + }, + { + "epoch": 0.13606952198088812, + "grad_norm": 1.447643518447876, + "learning_rate": 1.910013478375862e-05, + "loss": 0.5456, + "step": 6130 + }, + { + "epoch": 0.13618050854041575, + "grad_norm": 1.6203237771987915, + "learning_rate": 1.9098688684227324e-05, + "loss": 0.491, + "step": 6135 + }, + { + "epoch": 0.1362914950999434, + "grad_norm": 1.125411033630371, + "learning_rate": 1.9097241478507973e-05, + "loss": 0.4878, + "step": 6140 + }, + { + "epoch": 0.13640248165947103, + "grad_norm": 1.341929316520691, + "learning_rate": 1.9095793166776513e-05, + "loss": 0.4489, + "step": 6145 + }, + { + "epoch": 0.13651346821899868, + "grad_norm": 0.8598207235336304, + "learning_rate": 1.909434374920902e-05, + "loss": 0.4814, + "step": 6150 + }, + { + "epoch": 0.13662445477852633, + "grad_norm": 1.4603968858718872, + "learning_rate": 1.909289322598172e-05, + "loss": 0.6108, + "step": 6155 + }, + { + "epoch": 0.13673544133805396, + "grad_norm": 1.2341361045837402, + "learning_rate": 1.9091441597270955e-05, + "loss": 0.5515, + "step": 6160 + }, + { + "epoch": 0.1368464278975816, + "grad_norm": 0.923833429813385, + "learning_rate": 1.908998886325321e-05, + "loss": 0.3516, + "step": 6165 + }, + { + "epoch": 0.13695741445710924, + "grad_norm": 0.9731249213218689, + "learning_rate": 1.9088535024105105e-05, + "loss": 0.414, + "step": 6170 + }, + { + "epoch": 0.1370684010166369, + "grad_norm": 1.5650506019592285, + "learning_rate": 1.9087080080003394e-05, + "loss": 0.6774, + "step": 6175 + }, + { + "epoch": 0.13717938757616452, + "grad_norm": 1.0991238355636597, + "learning_rate": 1.908562403112496e-05, + "loss": 0.4536, + "step": 6180 + }, + { + "epoch": 0.13729037413569217, + "grad_norm": 1.1593619585037231, + "learning_rate": 1.9084166877646825e-05, + "loss": 0.5965, + "step": 6185 + }, + { + "epoch": 0.1374013606952198, + "grad_norm": 1.6231162548065186, + "learning_rate": 1.908270861974615e-05, + "loss": 0.6396, + "step": 6190 + }, + { + "epoch": 0.13751234725474745, + "grad_norm": 1.1264643669128418, + "learning_rate": 1.9081249257600226e-05, + "loss": 0.5087, + "step": 6195 + }, + { + "epoch": 0.13762333381427508, + "grad_norm": 1.6626774072647095, + "learning_rate": 1.9079788791386468e-05, + "loss": 0.4835, + "step": 6200 + }, + { + "epoch": 0.13773432037380273, + "grad_norm": 1.7249454259872437, + "learning_rate": 1.907832722128244e-05, + "loss": 0.5835, + "step": 6205 + }, + { + "epoch": 0.13784530693333039, + "grad_norm": 0.7329836487770081, + "learning_rate": 1.9076864547465836e-05, + "loss": 0.6058, + "step": 6210 + }, + { + "epoch": 0.137956293492858, + "grad_norm": 1.4405927658081055, + "learning_rate": 1.9075400770114482e-05, + "loss": 0.7529, + "step": 6215 + }, + { + "epoch": 0.13806728005238567, + "grad_norm": 1.5921906232833862, + "learning_rate": 1.9073935889406343e-05, + "loss": 0.4808, + "step": 6220 + }, + { + "epoch": 0.1381782666119133, + "grad_norm": 1.237058162689209, + "learning_rate": 1.90724699055195e-05, + "loss": 0.5102, + "step": 6225 + }, + { + "epoch": 0.13828925317144095, + "grad_norm": 1.3936007022857666, + "learning_rate": 1.9071002818632203e-05, + "loss": 0.6837, + "step": 6230 + }, + { + "epoch": 0.13840023973096857, + "grad_norm": 1.1461912393569946, + "learning_rate": 1.9069534628922797e-05, + "loss": 0.4168, + "step": 6235 + }, + { + "epoch": 0.13851122629049623, + "grad_norm": 1.069848656654358, + "learning_rate": 1.906806533656979e-05, + "loss": 0.4901, + "step": 6240 + }, + { + "epoch": 0.13862221285002385, + "grad_norm": 1.3110460042953491, + "learning_rate": 1.906659494175182e-05, + "loss": 0.6338, + "step": 6245 + }, + { + "epoch": 0.1387331994095515, + "grad_norm": 1.3842331171035767, + "learning_rate": 1.9065123444647633e-05, + "loss": 0.4519, + "step": 6250 + }, + { + "epoch": 0.13884418596907913, + "grad_norm": 1.457643747329712, + "learning_rate": 1.9063650845436143e-05, + "loss": 0.4812, + "step": 6255 + }, + { + "epoch": 0.13895517252860679, + "grad_norm": 0.7833005785942078, + "learning_rate": 1.906217714429638e-05, + "loss": 0.4751, + "step": 6260 + }, + { + "epoch": 0.13906615908813444, + "grad_norm": 1.5647377967834473, + "learning_rate": 1.9060702341407516e-05, + "loss": 0.5807, + "step": 6265 + }, + { + "epoch": 0.13917714564766206, + "grad_norm": 0.883370041847229, + "learning_rate": 1.9059226436948844e-05, + "loss": 0.4939, + "step": 6270 + }, + { + "epoch": 0.13928813220718972, + "grad_norm": 0.9018153548240662, + "learning_rate": 1.9057749431099807e-05, + "loss": 0.4014, + "step": 6275 + }, + { + "epoch": 0.13939911876671734, + "grad_norm": 0.9330711364746094, + "learning_rate": 1.905627132403997e-05, + "loss": 0.5366, + "step": 6280 + }, + { + "epoch": 0.139510105326245, + "grad_norm": 1.2406800985336304, + "learning_rate": 1.9054792115949033e-05, + "loss": 0.4553, + "step": 6285 + }, + { + "epoch": 0.13962109188577262, + "grad_norm": 1.1809135675430298, + "learning_rate": 1.9053311807006845e-05, + "loss": 0.6236, + "step": 6290 + }, + { + "epoch": 0.13973207844530028, + "grad_norm": 1.25314462184906, + "learning_rate": 1.9051830397393366e-05, + "loss": 0.6285, + "step": 6295 + }, + { + "epoch": 0.1398430650048279, + "grad_norm": 1.5022709369659424, + "learning_rate": 1.9050347887288708e-05, + "loss": 0.4595, + "step": 6300 + }, + { + "epoch": 0.13995405156435556, + "grad_norm": 0.9192030429840088, + "learning_rate": 1.9048864276873103e-05, + "loss": 0.5529, + "step": 6305 + }, + { + "epoch": 0.14006503812388318, + "grad_norm": 1.0889347791671753, + "learning_rate": 1.904737956632693e-05, + "loss": 0.5217, + "step": 6310 + }, + { + "epoch": 0.14017602468341084, + "grad_norm": 1.2225440740585327, + "learning_rate": 1.9045893755830688e-05, + "loss": 0.5005, + "step": 6315 + }, + { + "epoch": 0.1402870112429385, + "grad_norm": 1.1170283555984497, + "learning_rate": 1.9044406845565025e-05, + "loss": 0.5876, + "step": 6320 + }, + { + "epoch": 0.14039799780246612, + "grad_norm": 1.2859587669372559, + "learning_rate": 1.9042918835710708e-05, + "loss": 0.6355, + "step": 6325 + }, + { + "epoch": 0.14050898436199377, + "grad_norm": 1.228201985359192, + "learning_rate": 1.9041429726448645e-05, + "loss": 0.6205, + "step": 6330 + }, + { + "epoch": 0.1406199709215214, + "grad_norm": 1.601422667503357, + "learning_rate": 1.9039939517959882e-05, + "loss": 0.6231, + "step": 6335 + }, + { + "epoch": 0.14073095748104905, + "grad_norm": 1.0911506414413452, + "learning_rate": 1.9038448210425588e-05, + "loss": 0.6034, + "step": 6340 + }, + { + "epoch": 0.14084194404057668, + "grad_norm": 1.59726083278656, + "learning_rate": 1.9036955804027073e-05, + "loss": 0.6577, + "step": 6345 + }, + { + "epoch": 0.14095293060010433, + "grad_norm": 1.157368779182434, + "learning_rate": 1.903546229894578e-05, + "loss": 0.6007, + "step": 6350 + }, + { + "epoch": 0.14106391715963196, + "grad_norm": 1.2398027181625366, + "learning_rate": 1.9033967695363283e-05, + "loss": 0.5543, + "step": 6355 + }, + { + "epoch": 0.1411749037191596, + "grad_norm": 1.2566787004470825, + "learning_rate": 1.903247199346129e-05, + "loss": 0.5806, + "step": 6360 + }, + { + "epoch": 0.14128589027868724, + "grad_norm": 1.6907583475112915, + "learning_rate": 1.9030975193421647e-05, + "loss": 0.5773, + "step": 6365 + }, + { + "epoch": 0.1413968768382149, + "grad_norm": 1.620954155921936, + "learning_rate": 1.9029477295426324e-05, + "loss": 0.5538, + "step": 6370 + }, + { + "epoch": 0.14150786339774254, + "grad_norm": 1.1079115867614746, + "learning_rate": 1.9027978299657436e-05, + "loss": 0.5525, + "step": 6375 + }, + { + "epoch": 0.14161884995727017, + "grad_norm": 1.0491001605987549, + "learning_rate": 1.9026478206297224e-05, + "loss": 0.4703, + "step": 6380 + }, + { + "epoch": 0.14172983651679782, + "grad_norm": 3.4920918941497803, + "learning_rate": 1.9024977015528064e-05, + "loss": 0.6538, + "step": 6385 + }, + { + "epoch": 0.14184082307632545, + "grad_norm": 1.1903600692749023, + "learning_rate": 1.9023474727532466e-05, + "loss": 0.4085, + "step": 6390 + }, + { + "epoch": 0.1419518096358531, + "grad_norm": 1.0123170614242554, + "learning_rate": 1.9021971342493072e-05, + "loss": 0.4996, + "step": 6395 + }, + { + "epoch": 0.14206279619538073, + "grad_norm": 0.8986278176307678, + "learning_rate": 1.9020466860592663e-05, + "loss": 0.5676, + "step": 6400 + }, + { + "epoch": 0.14217378275490838, + "grad_norm": 1.3018028736114502, + "learning_rate": 1.901896128201414e-05, + "loss": 0.504, + "step": 6405 + }, + { + "epoch": 0.142284769314436, + "grad_norm": 1.483491063117981, + "learning_rate": 1.9017454606940557e-05, + "loss": 0.6871, + "step": 6410 + }, + { + "epoch": 0.14239575587396366, + "grad_norm": 1.6943657398223877, + "learning_rate": 1.9015946835555083e-05, + "loss": 0.4479, + "step": 6415 + }, + { + "epoch": 0.14250674243349132, + "grad_norm": 1.2507892847061157, + "learning_rate": 1.9014437968041026e-05, + "loss": 0.5551, + "step": 6420 + }, + { + "epoch": 0.14261772899301894, + "grad_norm": 0.8554363250732422, + "learning_rate": 1.9012928004581837e-05, + "loss": 0.6889, + "step": 6425 + }, + { + "epoch": 0.1427287155525466, + "grad_norm": 1.5508023500442505, + "learning_rate": 1.9011416945361088e-05, + "loss": 0.6158, + "step": 6430 + }, + { + "epoch": 0.14283970211207422, + "grad_norm": 0.8269447684288025, + "learning_rate": 1.9009904790562487e-05, + "loss": 0.5652, + "step": 6435 + }, + { + "epoch": 0.14295068867160188, + "grad_norm": 0.9781570434570312, + "learning_rate": 1.900839154036988e-05, + "loss": 0.5546, + "step": 6440 + }, + { + "epoch": 0.1430616752311295, + "grad_norm": 0.9161341190338135, + "learning_rate": 1.900687719496724e-05, + "loss": 0.514, + "step": 6445 + }, + { + "epoch": 0.14317266179065716, + "grad_norm": 1.2776657342910767, + "learning_rate": 1.9005361754538677e-05, + "loss": 0.5783, + "step": 6450 + }, + { + "epoch": 0.14328364835018478, + "grad_norm": 1.1995465755462646, + "learning_rate": 1.9003845219268436e-05, + "loss": 0.4091, + "step": 6455 + }, + { + "epoch": 0.14339463490971244, + "grad_norm": 1.4316763877868652, + "learning_rate": 1.900232758934089e-05, + "loss": 0.5167, + "step": 6460 + }, + { + "epoch": 0.14350562146924006, + "grad_norm": 1.3804174661636353, + "learning_rate": 1.9000808864940543e-05, + "loss": 0.4177, + "step": 6465 + }, + { + "epoch": 0.14361660802876772, + "grad_norm": 1.1941735744476318, + "learning_rate": 1.8999289046252044e-05, + "loss": 0.591, + "step": 6470 + }, + { + "epoch": 0.14372759458829537, + "grad_norm": 1.327609658241272, + "learning_rate": 1.8997768133460163e-05, + "loss": 0.4782, + "step": 6475 + }, + { + "epoch": 0.143838581147823, + "grad_norm": 1.247695803642273, + "learning_rate": 1.899624612674981e-05, + "loss": 0.6462, + "step": 6480 + }, + { + "epoch": 0.14394956770735065, + "grad_norm": 0.884784460067749, + "learning_rate": 1.8994723026306024e-05, + "loss": 0.568, + "step": 6485 + }, + { + "epoch": 0.14406055426687828, + "grad_norm": 1.2470338344573975, + "learning_rate": 1.899319883231398e-05, + "loss": 0.4931, + "step": 6490 + }, + { + "epoch": 0.14417154082640593, + "grad_norm": 1.626511573791504, + "learning_rate": 1.8991673544958975e-05, + "loss": 0.4637, + "step": 6495 + }, + { + "epoch": 0.14428252738593356, + "grad_norm": 1.8598002195358276, + "learning_rate": 1.899014716442646e-05, + "loss": 0.6426, + "step": 6500 + }, + { + "epoch": 0.1443935139454612, + "grad_norm": 1.0883430242538452, + "learning_rate": 1.8988619690902005e-05, + "loss": 0.5175, + "step": 6505 + }, + { + "epoch": 0.14450450050498884, + "grad_norm": 1.258001685142517, + "learning_rate": 1.8987091124571315e-05, + "loss": 0.4383, + "step": 6510 + }, + { + "epoch": 0.1446154870645165, + "grad_norm": 0.7219902276992798, + "learning_rate": 1.8985561465620225e-05, + "loss": 0.4931, + "step": 6515 + }, + { + "epoch": 0.14472647362404412, + "grad_norm": 0.8572632074356079, + "learning_rate": 1.8984030714234704e-05, + "loss": 0.6104, + "step": 6520 + }, + { + "epoch": 0.14483746018357177, + "grad_norm": 1.0302547216415405, + "learning_rate": 1.8982498870600864e-05, + "loss": 0.5779, + "step": 6525 + }, + { + "epoch": 0.14494844674309942, + "grad_norm": 1.225276231765747, + "learning_rate": 1.8980965934904932e-05, + "loss": 0.4816, + "step": 6530 + }, + { + "epoch": 0.14505943330262705, + "grad_norm": 1.3861422538757324, + "learning_rate": 1.8979431907333282e-05, + "loss": 0.4285, + "step": 6535 + }, + { + "epoch": 0.1451704198621547, + "grad_norm": 1.709928035736084, + "learning_rate": 1.8977896788072416e-05, + "loss": 0.5852, + "step": 6540 + }, + { + "epoch": 0.14528140642168233, + "grad_norm": 1.502562403678894, + "learning_rate": 1.897636057730897e-05, + "loss": 0.5503, + "step": 6545 + }, + { + "epoch": 0.14539239298120998, + "grad_norm": 0.9345400929450989, + "learning_rate": 1.897482327522971e-05, + "loss": 0.4356, + "step": 6550 + }, + { + "epoch": 0.1455033795407376, + "grad_norm": 0.9455825686454773, + "learning_rate": 1.897328488202153e-05, + "loss": 0.4978, + "step": 6555 + }, + { + "epoch": 0.14561436610026526, + "grad_norm": 1.2369747161865234, + "learning_rate": 1.8971745397871473e-05, + "loss": 0.6224, + "step": 6560 + }, + { + "epoch": 0.1457253526597929, + "grad_norm": 0.9470257759094238, + "learning_rate": 1.89702048229667e-05, + "loss": 0.4977, + "step": 6565 + }, + { + "epoch": 0.14583633921932054, + "grad_norm": 2.130319595336914, + "learning_rate": 1.8968663157494503e-05, + "loss": 0.5318, + "step": 6570 + }, + { + "epoch": 0.14594732577884817, + "grad_norm": 1.1602154970169067, + "learning_rate": 1.8967120401642324e-05, + "loss": 0.5466, + "step": 6575 + }, + { + "epoch": 0.14605831233837582, + "grad_norm": 1.5695816278457642, + "learning_rate": 1.8965576555597717e-05, + "loss": 0.5732, + "step": 6580 + }, + { + "epoch": 0.14616929889790348, + "grad_norm": 1.0248501300811768, + "learning_rate": 1.896403161954838e-05, + "loss": 0.404, + "step": 6585 + }, + { + "epoch": 0.1462802854574311, + "grad_norm": 1.1695456504821777, + "learning_rate": 1.896248559368214e-05, + "loss": 0.5657, + "step": 6590 + }, + { + "epoch": 0.14639127201695876, + "grad_norm": 0.8363463878631592, + "learning_rate": 1.8960938478186962e-05, + "loss": 0.5698, + "step": 6595 + }, + { + "epoch": 0.14650225857648638, + "grad_norm": 1.3290842771530151, + "learning_rate": 1.8959390273250938e-05, + "loss": 0.5124, + "step": 6600 + }, + { + "epoch": 0.14661324513601404, + "grad_norm": 1.2733120918273926, + "learning_rate": 1.895784097906229e-05, + "loss": 0.5128, + "step": 6605 + }, + { + "epoch": 0.14672423169554166, + "grad_norm": 1.1366569995880127, + "learning_rate": 1.8956290595809378e-05, + "loss": 0.4937, + "step": 6610 + }, + { + "epoch": 0.14683521825506932, + "grad_norm": 1.2039333581924438, + "learning_rate": 1.895473912368069e-05, + "loss": 0.5817, + "step": 6615 + }, + { + "epoch": 0.14694620481459694, + "grad_norm": 0.963930606842041, + "learning_rate": 1.8953186562864857e-05, + "loss": 0.5287, + "step": 6620 + }, + { + "epoch": 0.1470571913741246, + "grad_norm": 1.2638262510299683, + "learning_rate": 1.8951632913550625e-05, + "loss": 0.6006, + "step": 6625 + }, + { + "epoch": 0.14716817793365222, + "grad_norm": 1.433885097503662, + "learning_rate": 1.8950078175926886e-05, + "loss": 0.5561, + "step": 6630 + }, + { + "epoch": 0.14727916449317988, + "grad_norm": 1.211980938911438, + "learning_rate": 1.8948522350182655e-05, + "loss": 0.5017, + "step": 6635 + }, + { + "epoch": 0.14739015105270753, + "grad_norm": 0.7199383974075317, + "learning_rate": 1.8946965436507094e-05, + "loss": 0.5482, + "step": 6640 + }, + { + "epoch": 0.14750113761223516, + "grad_norm": 1.4644221067428589, + "learning_rate": 1.8945407435089477e-05, + "loss": 0.5931, + "step": 6645 + }, + { + "epoch": 0.1476121241717628, + "grad_norm": 0.8318582773208618, + "learning_rate": 1.8943848346119225e-05, + "loss": 0.4987, + "step": 6650 + }, + { + "epoch": 0.14772311073129044, + "grad_norm": 0.7859492301940918, + "learning_rate": 1.8942288169785884e-05, + "loss": 0.485, + "step": 6655 + }, + { + "epoch": 0.1478340972908181, + "grad_norm": 0.8700778484344482, + "learning_rate": 1.8940726906279142e-05, + "loss": 0.4495, + "step": 6660 + }, + { + "epoch": 0.14794508385034572, + "grad_norm": 1.5189528465270996, + "learning_rate": 1.8939164555788805e-05, + "loss": 0.8257, + "step": 6665 + }, + { + "epoch": 0.14805607040987337, + "grad_norm": 0.9018239974975586, + "learning_rate": 1.893760111850482e-05, + "loss": 0.4749, + "step": 6670 + }, + { + "epoch": 0.148167056969401, + "grad_norm": 0.8321853876113892, + "learning_rate": 1.893603659461727e-05, + "loss": 0.5725, + "step": 6675 + }, + { + "epoch": 0.14827804352892865, + "grad_norm": 1.372806191444397, + "learning_rate": 1.8934470984316352e-05, + "loss": 0.5039, + "step": 6680 + }, + { + "epoch": 0.14838903008845628, + "grad_norm": 1.2954940795898438, + "learning_rate": 1.893290428779242e-05, + "loss": 0.5461, + "step": 6685 + }, + { + "epoch": 0.14850001664798393, + "grad_norm": 1.6831293106079102, + "learning_rate": 1.8931336505235947e-05, + "loss": 0.5783, + "step": 6690 + }, + { + "epoch": 0.14861100320751158, + "grad_norm": 1.0016860961914062, + "learning_rate": 1.892976763683753e-05, + "loss": 0.5389, + "step": 6695 + }, + { + "epoch": 0.1487219897670392, + "grad_norm": 1.2121920585632324, + "learning_rate": 1.8928197682787914e-05, + "loss": 0.5567, + "step": 6700 + }, + { + "epoch": 0.14883297632656686, + "grad_norm": 1.14114511013031, + "learning_rate": 1.8926626643277966e-05, + "loss": 0.4992, + "step": 6705 + }, + { + "epoch": 0.1489439628860945, + "grad_norm": 1.1122453212738037, + "learning_rate": 1.892505451849869e-05, + "loss": 0.614, + "step": 6710 + }, + { + "epoch": 0.14905494944562214, + "grad_norm": 1.7418065071105957, + "learning_rate": 1.8923481308641216e-05, + "loss": 0.5832, + "step": 6715 + }, + { + "epoch": 0.14916593600514977, + "grad_norm": 1.6564451456069946, + "learning_rate": 1.892190701389681e-05, + "loss": 0.4955, + "step": 6720 + }, + { + "epoch": 0.14927692256467742, + "grad_norm": 1.2826859951019287, + "learning_rate": 1.8920331634456874e-05, + "loss": 0.5107, + "step": 6725 + }, + { + "epoch": 0.14938790912420505, + "grad_norm": 1.092296838760376, + "learning_rate": 1.8918755170512932e-05, + "loss": 0.5884, + "step": 6730 + }, + { + "epoch": 0.1494988956837327, + "grad_norm": 1.3704930543899536, + "learning_rate": 1.8917177622256647e-05, + "loss": 0.4831, + "step": 6735 + }, + { + "epoch": 0.14960988224326033, + "grad_norm": 1.782360553741455, + "learning_rate": 1.8915598989879816e-05, + "loss": 0.6033, + "step": 6740 + }, + { + "epoch": 0.14972086880278798, + "grad_norm": 1.6985117197036743, + "learning_rate": 1.891401927357436e-05, + "loss": 0.644, + "step": 6745 + }, + { + "epoch": 0.14983185536231564, + "grad_norm": 1.4931162595748901, + "learning_rate": 1.8912438473532335e-05, + "loss": 0.5186, + "step": 6750 + }, + { + "epoch": 0.14994284192184326, + "grad_norm": 0.8564159870147705, + "learning_rate": 1.891085658994593e-05, + "loss": 0.5087, + "step": 6755 + }, + { + "epoch": 0.15005382848137092, + "grad_norm": 1.126528024673462, + "learning_rate": 1.8909273623007466e-05, + "loss": 0.4766, + "step": 6760 + }, + { + "epoch": 0.15016481504089854, + "grad_norm": 0.8299586772918701, + "learning_rate": 1.8907689572909394e-05, + "loss": 0.5409, + "step": 6765 + }, + { + "epoch": 0.1502758016004262, + "grad_norm": 1.3447434902191162, + "learning_rate": 1.8906104439844297e-05, + "loss": 0.5607, + "step": 6770 + }, + { + "epoch": 0.15038678815995382, + "grad_norm": 1.2309798002243042, + "learning_rate": 1.8904518224004894e-05, + "loss": 0.6018, + "step": 6775 + }, + { + "epoch": 0.15049777471948148, + "grad_norm": 0.9505862593650818, + "learning_rate": 1.8902930925584025e-05, + "loss": 0.4819, + "step": 6780 + }, + { + "epoch": 0.1506087612790091, + "grad_norm": 1.3469936847686768, + "learning_rate": 1.8901342544774674e-05, + "loss": 0.5222, + "step": 6785 + }, + { + "epoch": 0.15071974783853676, + "grad_norm": 1.1433323621749878, + "learning_rate": 1.8899753081769948e-05, + "loss": 0.4524, + "step": 6790 + }, + { + "epoch": 0.15083073439806438, + "grad_norm": 1.206285834312439, + "learning_rate": 1.8898162536763092e-05, + "loss": 0.5026, + "step": 6795 + }, + { + "epoch": 0.15094172095759203, + "grad_norm": 1.9741181135177612, + "learning_rate": 1.8896570909947477e-05, + "loss": 0.3818, + "step": 6800 + }, + { + "epoch": 0.1510527075171197, + "grad_norm": 1.362396478652954, + "learning_rate": 1.8894978201516603e-05, + "loss": 0.7034, + "step": 6805 + }, + { + "epoch": 0.15116369407664731, + "grad_norm": 1.2455583810806274, + "learning_rate": 1.8893384411664115e-05, + "loss": 0.4963, + "step": 6810 + }, + { + "epoch": 0.15127468063617497, + "grad_norm": 1.1156567335128784, + "learning_rate": 1.8891789540583777e-05, + "loss": 0.4454, + "step": 6815 + }, + { + "epoch": 0.1513856671957026, + "grad_norm": 1.0928586721420288, + "learning_rate": 1.8890193588469484e-05, + "loss": 0.5486, + "step": 6820 + }, + { + "epoch": 0.15149665375523025, + "grad_norm": 1.0376923084259033, + "learning_rate": 1.888859655551527e-05, + "loss": 0.4751, + "step": 6825 + }, + { + "epoch": 0.15160764031475787, + "grad_norm": 1.1090154647827148, + "learning_rate": 1.8886998441915298e-05, + "loss": 0.3916, + "step": 6830 + }, + { + "epoch": 0.15171862687428553, + "grad_norm": 1.081882119178772, + "learning_rate": 1.888539924786386e-05, + "loss": 0.4398, + "step": 6835 + }, + { + "epoch": 0.15182961343381315, + "grad_norm": 2.7401556968688965, + "learning_rate": 1.888379897355538e-05, + "loss": 0.56, + "step": 6840 + }, + { + "epoch": 0.1519405999933408, + "grad_norm": 0.9954392313957214, + "learning_rate": 1.8882197619184417e-05, + "loss": 0.4477, + "step": 6845 + }, + { + "epoch": 0.15205158655286843, + "grad_norm": 3.0417439937591553, + "learning_rate": 1.8880595184945653e-05, + "loss": 0.6061, + "step": 6850 + }, + { + "epoch": 0.1521625731123961, + "grad_norm": 1.5988818407058716, + "learning_rate": 1.8878991671033913e-05, + "loss": 0.5861, + "step": 6855 + }, + { + "epoch": 0.15227355967192374, + "grad_norm": 1.2149866819381714, + "learning_rate": 1.8877387077644143e-05, + "loss": 0.4352, + "step": 6860 + }, + { + "epoch": 0.15238454623145137, + "grad_norm": 1.172912359237671, + "learning_rate": 1.8875781404971424e-05, + "loss": 0.5736, + "step": 6865 + }, + { + "epoch": 0.15249553279097902, + "grad_norm": 1.0310333967208862, + "learning_rate": 1.8874174653210967e-05, + "loss": 0.4938, + "step": 6870 + }, + { + "epoch": 0.15260651935050665, + "grad_norm": 2.8296666145324707, + "learning_rate": 1.887256682255812e-05, + "loss": 0.5292, + "step": 6875 + }, + { + "epoch": 0.1527175059100343, + "grad_norm": 1.6240949630737305, + "learning_rate": 1.8870957913208354e-05, + "loss": 0.6376, + "step": 6880 + }, + { + "epoch": 0.15282849246956193, + "grad_norm": 2.6802828311920166, + "learning_rate": 1.8869347925357275e-05, + "loss": 0.4638, + "step": 6885 + }, + { + "epoch": 0.15293947902908958, + "grad_norm": 1.5533713102340698, + "learning_rate": 1.886773685920062e-05, + "loss": 0.44, + "step": 6890 + }, + { + "epoch": 0.1530504655886172, + "grad_norm": 2.4309866428375244, + "learning_rate": 1.886612471493426e-05, + "loss": 0.6499, + "step": 6895 + }, + { + "epoch": 0.15316145214814486, + "grad_norm": 1.664516568183899, + "learning_rate": 1.886451149275419e-05, + "loss": 0.4546, + "step": 6900 + }, + { + "epoch": 0.1532724387076725, + "grad_norm": 1.5281312465667725, + "learning_rate": 1.8862897192856545e-05, + "loss": 0.5696, + "step": 6905 + }, + { + "epoch": 0.15338342526720014, + "grad_norm": 1.1347835063934326, + "learning_rate": 1.8861281815437578e-05, + "loss": 0.5733, + "step": 6910 + }, + { + "epoch": 0.1534944118267278, + "grad_norm": 1.2266205549240112, + "learning_rate": 1.885966536069369e-05, + "loss": 0.6103, + "step": 6915 + }, + { + "epoch": 0.15360539838625542, + "grad_norm": 1.4315506219863892, + "learning_rate": 1.88580478288214e-05, + "loss": 0.6006, + "step": 6920 + }, + { + "epoch": 0.15371638494578307, + "grad_norm": 2.4873852729797363, + "learning_rate": 1.8856429220017364e-05, + "loss": 0.5671, + "step": 6925 + }, + { + "epoch": 0.1538273715053107, + "grad_norm": 1.1945925951004028, + "learning_rate": 1.885480953447836e-05, + "loss": 0.57, + "step": 6930 + }, + { + "epoch": 0.15393835806483835, + "grad_norm": 1.2514160871505737, + "learning_rate": 1.8853188772401316e-05, + "loss": 0.6649, + "step": 6935 + }, + { + "epoch": 0.15404934462436598, + "grad_norm": 1.9401576519012451, + "learning_rate": 1.8851566933983266e-05, + "loss": 0.5246, + "step": 6940 + }, + { + "epoch": 0.15416033118389363, + "grad_norm": 1.5348749160766602, + "learning_rate": 1.88499440194214e-05, + "loss": 0.6484, + "step": 6945 + }, + { + "epoch": 0.15427131774342126, + "grad_norm": 1.4494093656539917, + "learning_rate": 1.8848320028913017e-05, + "loss": 0.6021, + "step": 6950 + }, + { + "epoch": 0.15438230430294891, + "grad_norm": 1.3363004922866821, + "learning_rate": 1.8846694962655564e-05, + "loss": 0.7334, + "step": 6955 + }, + { + "epoch": 0.15449329086247657, + "grad_norm": 1.1602723598480225, + "learning_rate": 1.88450688208466e-05, + "loss": 0.5157, + "step": 6960 + }, + { + "epoch": 0.1546042774220042, + "grad_norm": 1.3268187046051025, + "learning_rate": 1.884344160368384e-05, + "loss": 0.4829, + "step": 6965 + }, + { + "epoch": 0.15471526398153185, + "grad_norm": 1.1891200542449951, + "learning_rate": 1.8841813311365105e-05, + "loss": 0.6283, + "step": 6970 + }, + { + "epoch": 0.15482625054105947, + "grad_norm": 1.3743573427200317, + "learning_rate": 1.884018394408836e-05, + "loss": 0.5518, + "step": 6975 + }, + { + "epoch": 0.15493723710058713, + "grad_norm": 1.314160704612732, + "learning_rate": 1.88385535020517e-05, + "loss": 0.4715, + "step": 6980 + }, + { + "epoch": 0.15504822366011475, + "grad_norm": 0.9060925245285034, + "learning_rate": 1.8836921985453347e-05, + "loss": 0.5421, + "step": 6985 + }, + { + "epoch": 0.1551592102196424, + "grad_norm": 0.9292426705360413, + "learning_rate": 1.8835289394491655e-05, + "loss": 0.6992, + "step": 6990 + }, + { + "epoch": 0.15527019677917003, + "grad_norm": 1.238547682762146, + "learning_rate": 1.883365572936511e-05, + "loss": 0.438, + "step": 6995 + }, + { + "epoch": 0.1553811833386977, + "grad_norm": 1.2419242858886719, + "learning_rate": 1.883202099027233e-05, + "loss": 0.6075, + "step": 7000 + }, + { + "epoch": 0.1554921698982253, + "grad_norm": 1.0661461353302002, + "learning_rate": 1.8830385177412054e-05, + "loss": 0.4628, + "step": 7005 + }, + { + "epoch": 0.15560315645775297, + "grad_norm": 1.3712948560714722, + "learning_rate": 1.8828748290983166e-05, + "loss": 0.6168, + "step": 7010 + }, + { + "epoch": 0.15571414301728062, + "grad_norm": 1.3899610042572021, + "learning_rate": 1.8827110331184667e-05, + "loss": 0.596, + "step": 7015 + }, + { + "epoch": 0.15582512957680825, + "grad_norm": 1.5190640687942505, + "learning_rate": 1.88254712982157e-05, + "loss": 0.592, + "step": 7020 + }, + { + "epoch": 0.1559361161363359, + "grad_norm": 1.1492244005203247, + "learning_rate": 1.8823831192275533e-05, + "loss": 0.5391, + "step": 7025 + }, + { + "epoch": 0.15604710269586353, + "grad_norm": 1.307806134223938, + "learning_rate": 1.8822190013563562e-05, + "loss": 0.4242, + "step": 7030 + }, + { + "epoch": 0.15615808925539118, + "grad_norm": 1.2493864297866821, + "learning_rate": 1.882054776227931e-05, + "loss": 0.5422, + "step": 7035 + }, + { + "epoch": 0.1562690758149188, + "grad_norm": 1.2601972818374634, + "learning_rate": 1.881890443862245e-05, + "loss": 0.5582, + "step": 7040 + }, + { + "epoch": 0.15638006237444646, + "grad_norm": 0.8862437605857849, + "learning_rate": 1.8817260042792763e-05, + "loss": 0.4275, + "step": 7045 + }, + { + "epoch": 0.1564910489339741, + "grad_norm": 1.2929998636245728, + "learning_rate": 1.881561457499017e-05, + "loss": 0.4504, + "step": 7050 + }, + { + "epoch": 0.15660203549350174, + "grad_norm": 2.1380231380462646, + "learning_rate": 1.881396803541472e-05, + "loss": 0.5916, + "step": 7055 + }, + { + "epoch": 0.15671302205302937, + "grad_norm": 1.1888693571090698, + "learning_rate": 1.88123204242666e-05, + "loss": 0.5793, + "step": 7060 + }, + { + "epoch": 0.15682400861255702, + "grad_norm": 1.548318862915039, + "learning_rate": 1.8810671741746115e-05, + "loss": 0.505, + "step": 7065 + }, + { + "epoch": 0.15693499517208467, + "grad_norm": 1.4964656829833984, + "learning_rate": 1.8809021988053707e-05, + "loss": 0.3319, + "step": 7070 + }, + { + "epoch": 0.1570459817316123, + "grad_norm": 1.2946470975875854, + "learning_rate": 1.8807371163389955e-05, + "loss": 0.6479, + "step": 7075 + }, + { + "epoch": 0.15715696829113995, + "grad_norm": 1.5597033500671387, + "learning_rate": 1.880571926795555e-05, + "loss": 0.4953, + "step": 7080 + }, + { + "epoch": 0.15726795485066758, + "grad_norm": 0.9103341102600098, + "learning_rate": 1.8804066301951324e-05, + "loss": 0.5956, + "step": 7085 + }, + { + "epoch": 0.15737894141019523, + "grad_norm": 0.9429594874382019, + "learning_rate": 1.880241226557825e-05, + "loss": 0.5973, + "step": 7090 + }, + { + "epoch": 0.15748992796972286, + "grad_norm": 1.2466639280319214, + "learning_rate": 1.880075715903741e-05, + "loss": 0.5657, + "step": 7095 + }, + { + "epoch": 0.1576009145292505, + "grad_norm": 1.170981764793396, + "learning_rate": 1.8799100982530034e-05, + "loss": 0.6455, + "step": 7100 + }, + { + "epoch": 0.15771190108877814, + "grad_norm": 1.7617019414901733, + "learning_rate": 1.879744373625747e-05, + "loss": 0.5598, + "step": 7105 + }, + { + "epoch": 0.1578228876483058, + "grad_norm": 0.8788993954658508, + "learning_rate": 1.8795785420421198e-05, + "loss": 0.4858, + "step": 7110 + }, + { + "epoch": 0.15793387420783342, + "grad_norm": 0.8216165900230408, + "learning_rate": 1.8794126035222833e-05, + "loss": 0.4852, + "step": 7115 + }, + { + "epoch": 0.15804486076736107, + "grad_norm": 1.0266494750976562, + "learning_rate": 1.879246558086412e-05, + "loss": 0.5108, + "step": 7120 + }, + { + "epoch": 0.15815584732688873, + "grad_norm": 1.1185437440872192, + "learning_rate": 1.879080405754693e-05, + "loss": 0.3881, + "step": 7125 + }, + { + "epoch": 0.15826683388641635, + "grad_norm": 1.2968014478683472, + "learning_rate": 1.8789141465473263e-05, + "loss": 0.4948, + "step": 7130 + }, + { + "epoch": 0.158377820445944, + "grad_norm": 0.8087155818939209, + "learning_rate": 1.8787477804845255e-05, + "loss": 0.5251, + "step": 7135 + }, + { + "epoch": 0.15848880700547163, + "grad_norm": 1.3040249347686768, + "learning_rate": 1.8785813075865164e-05, + "loss": 0.7207, + "step": 7140 + }, + { + "epoch": 0.1585997935649993, + "grad_norm": 1.23062002658844, + "learning_rate": 1.8784147278735386e-05, + "loss": 0.4471, + "step": 7145 + }, + { + "epoch": 0.1587107801245269, + "grad_norm": 1.129224181175232, + "learning_rate": 1.878248041365844e-05, + "loss": 0.4338, + "step": 7150 + }, + { + "epoch": 0.15882176668405457, + "grad_norm": 2.8109686374664307, + "learning_rate": 1.878081248083698e-05, + "loss": 0.568, + "step": 7155 + }, + { + "epoch": 0.1589327532435822, + "grad_norm": 1.2243515253067017, + "learning_rate": 1.8779143480473787e-05, + "loss": 0.4675, + "step": 7160 + }, + { + "epoch": 0.15904373980310985, + "grad_norm": 1.5151373147964478, + "learning_rate": 1.8777473412771777e-05, + "loss": 0.5826, + "step": 7165 + }, + { + "epoch": 0.15915472636263747, + "grad_norm": 0.8092393279075623, + "learning_rate": 1.877580227793398e-05, + "loss": 0.6009, + "step": 7170 + }, + { + "epoch": 0.15926571292216513, + "grad_norm": 1.2688989639282227, + "learning_rate": 1.8774130076163575e-05, + "loss": 0.4677, + "step": 7175 + }, + { + "epoch": 0.15937669948169278, + "grad_norm": 1.2532013654708862, + "learning_rate": 1.877245680766387e-05, + "loss": 0.3996, + "step": 7180 + }, + { + "epoch": 0.1594876860412204, + "grad_norm": 1.0712350606918335, + "learning_rate": 1.8770782472638276e-05, + "loss": 0.4464, + "step": 7185 + }, + { + "epoch": 0.15959867260074806, + "grad_norm": 1.1912342309951782, + "learning_rate": 1.8769107071290367e-05, + "loss": 0.497, + "step": 7190 + }, + { + "epoch": 0.15970965916027569, + "grad_norm": 1.4670413732528687, + "learning_rate": 1.8767430603823833e-05, + "loss": 0.5811, + "step": 7195 + }, + { + "epoch": 0.15982064571980334, + "grad_norm": 1.3657194375991821, + "learning_rate": 1.8765753070442486e-05, + "loss": 0.5604, + "step": 7200 + }, + { + "epoch": 0.15993163227933097, + "grad_norm": 1.1779944896697998, + "learning_rate": 1.8764074471350282e-05, + "loss": 0.5235, + "step": 7205 + }, + { + "epoch": 0.16004261883885862, + "grad_norm": 1.0121064186096191, + "learning_rate": 1.8762394806751295e-05, + "loss": 0.4274, + "step": 7210 + }, + { + "epoch": 0.16015360539838625, + "grad_norm": 1.2211484909057617, + "learning_rate": 1.8760714076849734e-05, + "loss": 0.5711, + "step": 7215 + }, + { + "epoch": 0.1602645919579139, + "grad_norm": 1.132825493812561, + "learning_rate": 1.8759032281849937e-05, + "loss": 0.5821, + "step": 7220 + }, + { + "epoch": 0.16037557851744152, + "grad_norm": 1.1440246105194092, + "learning_rate": 1.875734942195637e-05, + "loss": 0.5773, + "step": 7225 + }, + { + "epoch": 0.16048656507696918, + "grad_norm": 1.1753207445144653, + "learning_rate": 1.8755665497373628e-05, + "loss": 0.5393, + "step": 7230 + }, + { + "epoch": 0.16059755163649683, + "grad_norm": 1.1559597253799438, + "learning_rate": 1.8753980508306442e-05, + "loss": 0.4608, + "step": 7235 + }, + { + "epoch": 0.16070853819602446, + "grad_norm": 1.5974079370498657, + "learning_rate": 1.8752294454959665e-05, + "loss": 0.4511, + "step": 7240 + }, + { + "epoch": 0.1608195247555521, + "grad_norm": 0.7395541667938232, + "learning_rate": 1.875060733753828e-05, + "loss": 0.2952, + "step": 7245 + }, + { + "epoch": 0.16093051131507974, + "grad_norm": 1.159072756767273, + "learning_rate": 1.8748919156247402e-05, + "loss": 0.5593, + "step": 7250 + }, + { + "epoch": 0.1610414978746074, + "grad_norm": 2.472195625305176, + "learning_rate": 1.8747229911292273e-05, + "loss": 0.5588, + "step": 7255 + }, + { + "epoch": 0.16115248443413502, + "grad_norm": 1.0677604675292969, + "learning_rate": 1.874553960287827e-05, + "loss": 0.3486, + "step": 7260 + }, + { + "epoch": 0.16126347099366267, + "grad_norm": 1.0419409275054932, + "learning_rate": 1.8743848231210894e-05, + "loss": 0.521, + "step": 7265 + }, + { + "epoch": 0.1613744575531903, + "grad_norm": 1.076755404472351, + "learning_rate": 1.874215579649577e-05, + "loss": 0.6069, + "step": 7270 + }, + { + "epoch": 0.16148544411271795, + "grad_norm": 0.9147228598594666, + "learning_rate": 1.8740462298938666e-05, + "loss": 0.5681, + "step": 7275 + }, + { + "epoch": 0.16159643067224558, + "grad_norm": 1.19989812374115, + "learning_rate": 1.8738767738745467e-05, + "loss": 0.7323, + "step": 7280 + }, + { + "epoch": 0.16170741723177323, + "grad_norm": 1.425941824913025, + "learning_rate": 1.8737072116122194e-05, + "loss": 0.5968, + "step": 7285 + }, + { + "epoch": 0.16181840379130089, + "grad_norm": 1.1933568716049194, + "learning_rate": 1.8735375431275e-05, + "loss": 0.4999, + "step": 7290 + }, + { + "epoch": 0.1619293903508285, + "grad_norm": 1.1296350955963135, + "learning_rate": 1.873367768441015e-05, + "loss": 0.4927, + "step": 7295 + }, + { + "epoch": 0.16204037691035617, + "grad_norm": 1.342839002609253, + "learning_rate": 1.8731978875734062e-05, + "loss": 0.5207, + "step": 7300 + }, + { + "epoch": 0.1621513634698838, + "grad_norm": 1.3544893264770508, + "learning_rate": 1.8730279005453264e-05, + "loss": 0.4706, + "step": 7305 + }, + { + "epoch": 0.16226235002941145, + "grad_norm": 1.0300699472427368, + "learning_rate": 1.8728578073774427e-05, + "loss": 0.4753, + "step": 7310 + }, + { + "epoch": 0.16237333658893907, + "grad_norm": 1.0216705799102783, + "learning_rate": 1.8726876080904338e-05, + "loss": 0.5686, + "step": 7315 + }, + { + "epoch": 0.16248432314846672, + "grad_norm": 1.362499713897705, + "learning_rate": 1.8725173027049927e-05, + "loss": 0.5845, + "step": 7320 + }, + { + "epoch": 0.16259530970799435, + "grad_norm": 1.2206189632415771, + "learning_rate": 1.8723468912418233e-05, + "loss": 0.5174, + "step": 7325 + }, + { + "epoch": 0.162706296267522, + "grad_norm": 1.0743138790130615, + "learning_rate": 1.8721763737216453e-05, + "loss": 0.5493, + "step": 7330 + }, + { + "epoch": 0.16281728282704963, + "grad_norm": 1.1450023651123047, + "learning_rate": 1.8720057501651885e-05, + "loss": 0.4913, + "step": 7335 + }, + { + "epoch": 0.16292826938657728, + "grad_norm": 1.2171339988708496, + "learning_rate": 1.8718350205931975e-05, + "loss": 0.4827, + "step": 7340 + }, + { + "epoch": 0.16303925594610494, + "grad_norm": 1.5790528059005737, + "learning_rate": 1.871664185026428e-05, + "loss": 0.4709, + "step": 7345 + }, + { + "epoch": 0.16315024250563256, + "grad_norm": 1.3697818517684937, + "learning_rate": 1.8714932434856507e-05, + "loss": 0.3998, + "step": 7350 + }, + { + "epoch": 0.16326122906516022, + "grad_norm": 1.1617748737335205, + "learning_rate": 1.8713221959916472e-05, + "loss": 0.5325, + "step": 7355 + }, + { + "epoch": 0.16337221562468784, + "grad_norm": 1.0637340545654297, + "learning_rate": 1.8711510425652134e-05, + "loss": 0.5762, + "step": 7360 + }, + { + "epoch": 0.1634832021842155, + "grad_norm": 1.0662593841552734, + "learning_rate": 1.8709797832271575e-05, + "loss": 0.4272, + "step": 7365 + }, + { + "epoch": 0.16359418874374312, + "grad_norm": 1.1800469160079956, + "learning_rate": 1.870808417998301e-05, + "loss": 0.6615, + "step": 7370 + }, + { + "epoch": 0.16370517530327078, + "grad_norm": 1.1630209684371948, + "learning_rate": 1.870636946899477e-05, + "loss": 0.4881, + "step": 7375 + }, + { + "epoch": 0.1638161618627984, + "grad_norm": 1.7020683288574219, + "learning_rate": 1.8704653699515328e-05, + "loss": 0.527, + "step": 7380 + }, + { + "epoch": 0.16392714842232606, + "grad_norm": 1.147692322731018, + "learning_rate": 1.8702936871753284e-05, + "loss": 0.4046, + "step": 7385 + }, + { + "epoch": 0.16403813498185368, + "grad_norm": 1.619726538658142, + "learning_rate": 1.8701218985917364e-05, + "loss": 0.7949, + "step": 7390 + }, + { + "epoch": 0.16414912154138134, + "grad_norm": 1.1566890478134155, + "learning_rate": 1.8699500042216423e-05, + "loss": 0.766, + "step": 7395 + }, + { + "epoch": 0.164260108100909, + "grad_norm": 1.1725993156433105, + "learning_rate": 1.869778004085944e-05, + "loss": 0.3988, + "step": 7400 + }, + { + "epoch": 0.16437109466043662, + "grad_norm": 0.9177306294441223, + "learning_rate": 1.8696058982055532e-05, + "loss": 0.4696, + "step": 7405 + }, + { + "epoch": 0.16448208121996427, + "grad_norm": 1.2283203601837158, + "learning_rate": 1.8694336866013932e-05, + "loss": 0.5214, + "step": 7410 + }, + { + "epoch": 0.1645930677794919, + "grad_norm": 1.3590775728225708, + "learning_rate": 1.869261369294402e-05, + "loss": 0.4894, + "step": 7415 + }, + { + "epoch": 0.16470405433901955, + "grad_norm": 1.28773832321167, + "learning_rate": 1.8690889463055285e-05, + "loss": 0.5813, + "step": 7420 + }, + { + "epoch": 0.16481504089854718, + "grad_norm": 1.0126246213912964, + "learning_rate": 1.868916417655736e-05, + "loss": 0.4549, + "step": 7425 + }, + { + "epoch": 0.16492602745807483, + "grad_norm": 1.1124584674835205, + "learning_rate": 1.8687437833659986e-05, + "loss": 0.5777, + "step": 7430 + }, + { + "epoch": 0.16503701401760246, + "grad_norm": 1.226993441581726, + "learning_rate": 1.8685710434573066e-05, + "loss": 0.5007, + "step": 7435 + }, + { + "epoch": 0.1651480005771301, + "grad_norm": 1.3269422054290771, + "learning_rate": 1.8683981979506597e-05, + "loss": 0.4307, + "step": 7440 + }, + { + "epoch": 0.16525898713665776, + "grad_norm": 1.6435010433197021, + "learning_rate": 1.868225246867072e-05, + "loss": 0.5969, + "step": 7445 + }, + { + "epoch": 0.1653699736961854, + "grad_norm": 1.360838770866394, + "learning_rate": 1.868052190227571e-05, + "loss": 0.4791, + "step": 7450 + }, + { + "epoch": 0.16548096025571304, + "grad_norm": 1.045859932899475, + "learning_rate": 1.8678790280531956e-05, + "loss": 0.5986, + "step": 7455 + }, + { + "epoch": 0.16559194681524067, + "grad_norm": 1.305838942527771, + "learning_rate": 1.867705760364999e-05, + "loss": 0.4359, + "step": 7460 + }, + { + "epoch": 0.16570293337476832, + "grad_norm": 0.9243895411491394, + "learning_rate": 1.8675323871840462e-05, + "loss": 0.4628, + "step": 7465 + }, + { + "epoch": 0.16581391993429595, + "grad_norm": 1.281638264656067, + "learning_rate": 1.8673589085314145e-05, + "loss": 0.4752, + "step": 7470 + }, + { + "epoch": 0.1659249064938236, + "grad_norm": 1.163022756576538, + "learning_rate": 1.8671853244281962e-05, + "loss": 0.6009, + "step": 7475 + }, + { + "epoch": 0.16603589305335123, + "grad_norm": 1.445329189300537, + "learning_rate": 1.8670116348954945e-05, + "loss": 0.364, + "step": 7480 + }, + { + "epoch": 0.16614687961287888, + "grad_norm": 1.2617442607879639, + "learning_rate": 1.8668378399544254e-05, + "loss": 0.644, + "step": 7485 + }, + { + "epoch": 0.1662578661724065, + "grad_norm": 0.9319959282875061, + "learning_rate": 1.866663939626119e-05, + "loss": 0.493, + "step": 7490 + }, + { + "epoch": 0.16636885273193416, + "grad_norm": 0.986315906047821, + "learning_rate": 1.866489933931718e-05, + "loss": 0.4185, + "step": 7495 + }, + { + "epoch": 0.16647983929146182, + "grad_norm": 0.9533819556236267, + "learning_rate": 1.8663158228923762e-05, + "loss": 0.5356, + "step": 7500 + }, + { + "epoch": 0.16659082585098944, + "grad_norm": 1.27620267868042, + "learning_rate": 1.8661416065292624e-05, + "loss": 0.5966, + "step": 7505 + }, + { + "epoch": 0.1667018124105171, + "grad_norm": 1.3088321685791016, + "learning_rate": 1.8659672848635568e-05, + "loss": 0.5771, + "step": 7510 + }, + { + "epoch": 0.16681279897004472, + "grad_norm": 0.8196889758110046, + "learning_rate": 1.865792857916453e-05, + "loss": 0.3497, + "step": 7515 + }, + { + "epoch": 0.16692378552957238, + "grad_norm": 1.3427016735076904, + "learning_rate": 1.8656183257091572e-05, + "loss": 0.6173, + "step": 7520 + }, + { + "epoch": 0.1670347720891, + "grad_norm": 1.2403075695037842, + "learning_rate": 1.865443688262888e-05, + "loss": 0.5343, + "step": 7525 + }, + { + "epoch": 0.16714575864862766, + "grad_norm": 1.0151913166046143, + "learning_rate": 1.8652689455988784e-05, + "loss": 0.3825, + "step": 7530 + }, + { + "epoch": 0.16725674520815528, + "grad_norm": 1.1205071210861206, + "learning_rate": 1.865094097738372e-05, + "loss": 0.5593, + "step": 7535 + }, + { + "epoch": 0.16736773176768294, + "grad_norm": 1.0770914554595947, + "learning_rate": 1.864919144702626e-05, + "loss": 0.4712, + "step": 7540 + }, + { + "epoch": 0.16747871832721056, + "grad_norm": 1.0981290340423584, + "learning_rate": 1.8647440865129115e-05, + "loss": 0.526, + "step": 7545 + }, + { + "epoch": 0.16758970488673822, + "grad_norm": 1.3834798336029053, + "learning_rate": 1.8645689231905112e-05, + "loss": 0.5559, + "step": 7550 + }, + { + "epoch": 0.16770069144626587, + "grad_norm": 1.1029125452041626, + "learning_rate": 1.8643936547567205e-05, + "loss": 0.6514, + "step": 7555 + }, + { + "epoch": 0.1678116780057935, + "grad_norm": 1.110398530960083, + "learning_rate": 1.8642182812328483e-05, + "loss": 0.6274, + "step": 7560 + }, + { + "epoch": 0.16792266456532115, + "grad_norm": 1.2129943370819092, + "learning_rate": 1.8640428026402158e-05, + "loss": 0.6039, + "step": 7565 + }, + { + "epoch": 0.16803365112484878, + "grad_norm": 1.3205411434173584, + "learning_rate": 1.863867219000157e-05, + "loss": 0.5309, + "step": 7570 + }, + { + "epoch": 0.16814463768437643, + "grad_norm": 0.8426812291145325, + "learning_rate": 1.8636915303340193e-05, + "loss": 0.4231, + "step": 7575 + }, + { + "epoch": 0.16825562424390406, + "grad_norm": 1.1089075803756714, + "learning_rate": 1.8635157366631614e-05, + "loss": 0.5547, + "step": 7580 + }, + { + "epoch": 0.1683666108034317, + "grad_norm": 1.6381417512893677, + "learning_rate": 1.8633398380089567e-05, + "loss": 0.4485, + "step": 7585 + }, + { + "epoch": 0.16847759736295934, + "grad_norm": 1.5178996324539185, + "learning_rate": 1.86316383439279e-05, + "loss": 0.4218, + "step": 7590 + }, + { + "epoch": 0.168588583922487, + "grad_norm": 1.508217692375183, + "learning_rate": 1.8629877258360587e-05, + "loss": 0.5992, + "step": 7595 + }, + { + "epoch": 0.16869957048201462, + "grad_norm": 1.512963056564331, + "learning_rate": 1.862811512360174e-05, + "loss": 0.4963, + "step": 7600 + }, + { + "epoch": 0.16881055704154227, + "grad_norm": 0.9588901996612549, + "learning_rate": 1.8626351939865594e-05, + "loss": 0.5764, + "step": 7605 + }, + { + "epoch": 0.16892154360106992, + "grad_norm": 1.1882424354553223, + "learning_rate": 1.862458770736651e-05, + "loss": 0.4822, + "step": 7610 + }, + { + "epoch": 0.16903253016059755, + "grad_norm": 0.9214734435081482, + "learning_rate": 1.8622822426318978e-05, + "loss": 0.474, + "step": 7615 + }, + { + "epoch": 0.1691435167201252, + "grad_norm": 0.8436447381973267, + "learning_rate": 1.862105609693761e-05, + "loss": 0.5628, + "step": 7620 + }, + { + "epoch": 0.16925450327965283, + "grad_norm": 1.0661908388137817, + "learning_rate": 1.8619288719437158e-05, + "loss": 0.5611, + "step": 7625 + }, + { + "epoch": 0.16936548983918048, + "grad_norm": 1.3087199926376343, + "learning_rate": 1.861752029403249e-05, + "loss": 0.529, + "step": 7630 + }, + { + "epoch": 0.1694764763987081, + "grad_norm": 1.2095814943313599, + "learning_rate": 1.8615750820938605e-05, + "loss": 0.5776, + "step": 7635 + }, + { + "epoch": 0.16958746295823576, + "grad_norm": 1.132745623588562, + "learning_rate": 1.861398030037063e-05, + "loss": 0.5001, + "step": 7640 + }, + { + "epoch": 0.1696984495177634, + "grad_norm": 1.1624484062194824, + "learning_rate": 1.8612208732543823e-05, + "loss": 0.6071, + "step": 7645 + }, + { + "epoch": 0.16980943607729104, + "grad_norm": 2.140655517578125, + "learning_rate": 1.8610436117673557e-05, + "loss": 0.4338, + "step": 7650 + }, + { + "epoch": 0.16992042263681867, + "grad_norm": 1.5044407844543457, + "learning_rate": 1.8608662455975345e-05, + "loss": 0.5192, + "step": 7655 + }, + { + "epoch": 0.17003140919634632, + "grad_norm": 0.6824148297309875, + "learning_rate": 1.8606887747664823e-05, + "loss": 0.4568, + "step": 7660 + }, + { + "epoch": 0.17014239575587398, + "grad_norm": 1.387648105621338, + "learning_rate": 1.8605111992957757e-05, + "loss": 0.4171, + "step": 7665 + }, + { + "epoch": 0.1702533823154016, + "grad_norm": 1.3381781578063965, + "learning_rate": 1.860333519207003e-05, + "loss": 0.4753, + "step": 7670 + }, + { + "epoch": 0.17036436887492926, + "grad_norm": 1.1267739534378052, + "learning_rate": 1.8601557345217667e-05, + "loss": 0.597, + "step": 7675 + }, + { + "epoch": 0.17047535543445688, + "grad_norm": 1.2873826026916504, + "learning_rate": 1.8599778452616806e-05, + "loss": 0.4989, + "step": 7680 + }, + { + "epoch": 0.17058634199398454, + "grad_norm": 0.9000810980796814, + "learning_rate": 1.8597998514483724e-05, + "loss": 0.4732, + "step": 7685 + }, + { + "epoch": 0.17069732855351216, + "grad_norm": 0.9879721403121948, + "learning_rate": 1.859621753103482e-05, + "loss": 0.535, + "step": 7690 + }, + { + "epoch": 0.17080831511303982, + "grad_norm": 0.6511263251304626, + "learning_rate": 1.8594435502486618e-05, + "loss": 0.3743, + "step": 7695 + }, + { + "epoch": 0.17091930167256744, + "grad_norm": 1.4531750679016113, + "learning_rate": 1.859265242905577e-05, + "loss": 0.6158, + "step": 7700 + }, + { + "epoch": 0.1710302882320951, + "grad_norm": 0.7871989011764526, + "learning_rate": 1.8590868310959054e-05, + "loss": 0.7176, + "step": 7705 + }, + { + "epoch": 0.17114127479162272, + "grad_norm": 0.882152795791626, + "learning_rate": 1.8589083148413384e-05, + "loss": 0.339, + "step": 7710 + }, + { + "epoch": 0.17125226135115038, + "grad_norm": 0.832227349281311, + "learning_rate": 1.8587296941635787e-05, + "loss": 0.5013, + "step": 7715 + }, + { + "epoch": 0.17136324791067803, + "grad_norm": 1.1753653287887573, + "learning_rate": 1.858550969084343e-05, + "loss": 0.343, + "step": 7720 + }, + { + "epoch": 0.17147423447020566, + "grad_norm": 1.0593284368515015, + "learning_rate": 1.8583721396253597e-05, + "loss": 0.5349, + "step": 7725 + }, + { + "epoch": 0.1715852210297333, + "grad_norm": 1.1809357404708862, + "learning_rate": 1.8581932058083705e-05, + "loss": 0.5084, + "step": 7730 + }, + { + "epoch": 0.17169620758926094, + "grad_norm": 1.158944010734558, + "learning_rate": 1.8580141676551298e-05, + "loss": 0.4514, + "step": 7735 + }, + { + "epoch": 0.1718071941487886, + "grad_norm": 1.3387936353683472, + "learning_rate": 1.8578350251874037e-05, + "loss": 0.5451, + "step": 7740 + }, + { + "epoch": 0.17191818070831622, + "grad_norm": 1.0001437664031982, + "learning_rate": 1.8576557784269724e-05, + "loss": 0.467, + "step": 7745 + }, + { + "epoch": 0.17202916726784387, + "grad_norm": 1.3797436952590942, + "learning_rate": 1.8574764273956278e-05, + "loss": 0.5427, + "step": 7750 + }, + { + "epoch": 0.1721401538273715, + "grad_norm": 1.282599925994873, + "learning_rate": 1.857296972115175e-05, + "loss": 0.5228, + "step": 7755 + }, + { + "epoch": 0.17225114038689915, + "grad_norm": 1.4311726093292236, + "learning_rate": 1.8571174126074313e-05, + "loss": 0.5726, + "step": 7760 + }, + { + "epoch": 0.17236212694642677, + "grad_norm": 1.367200493812561, + "learning_rate": 1.8569377488942273e-05, + "loss": 0.5565, + "step": 7765 + }, + { + "epoch": 0.17247311350595443, + "grad_norm": 1.4041142463684082, + "learning_rate": 1.856757980997406e-05, + "loss": 0.4476, + "step": 7770 + }, + { + "epoch": 0.17258410006548208, + "grad_norm": 1.3798754215240479, + "learning_rate": 1.8565781089388223e-05, + "loss": 0.5945, + "step": 7775 + }, + { + "epoch": 0.1726950866250097, + "grad_norm": 1.224753737449646, + "learning_rate": 1.856398132740345e-05, + "loss": 0.4528, + "step": 7780 + }, + { + "epoch": 0.17280607318453736, + "grad_norm": 1.2565453052520752, + "learning_rate": 1.856218052423855e-05, + "loss": 0.4699, + "step": 7785 + }, + { + "epoch": 0.172917059744065, + "grad_norm": 1.1906086206436157, + "learning_rate": 1.8560378680112453e-05, + "loss": 0.5168, + "step": 7790 + }, + { + "epoch": 0.17302804630359264, + "grad_norm": 1.0104856491088867, + "learning_rate": 1.855857579524423e-05, + "loss": 0.5057, + "step": 7795 + }, + { + "epoch": 0.17313903286312027, + "grad_norm": 1.1050394773483276, + "learning_rate": 1.855677186985306e-05, + "loss": 0.5146, + "step": 7800 + }, + { + "epoch": 0.17325001942264792, + "grad_norm": 1.3951995372772217, + "learning_rate": 1.855496690415827e-05, + "loss": 0.446, + "step": 7805 + }, + { + "epoch": 0.17336100598217555, + "grad_norm": 1.1135098934173584, + "learning_rate": 1.8553160898379286e-05, + "loss": 0.5184, + "step": 7810 + }, + { + "epoch": 0.1734719925417032, + "grad_norm": 1.2136260271072388, + "learning_rate": 1.8551353852735693e-05, + "loss": 0.5071, + "step": 7815 + }, + { + "epoch": 0.17358297910123083, + "grad_norm": 1.019028663635254, + "learning_rate": 1.8549545767447174e-05, + "loss": 0.6108, + "step": 7820 + }, + { + "epoch": 0.17369396566075848, + "grad_norm": 1.4090794324874878, + "learning_rate": 1.8547736642733554e-05, + "loss": 0.3684, + "step": 7825 + }, + { + "epoch": 0.17380495222028614, + "grad_norm": 0.944814920425415, + "learning_rate": 1.854592647881478e-05, + "loss": 0.4649, + "step": 7830 + }, + { + "epoch": 0.17391593877981376, + "grad_norm": 1.4081575870513916, + "learning_rate": 1.8544115275910925e-05, + "loss": 0.5574, + "step": 7835 + }, + { + "epoch": 0.17402692533934142, + "grad_norm": 1.8278511762619019, + "learning_rate": 1.854230303424219e-05, + "loss": 0.5835, + "step": 7840 + }, + { + "epoch": 0.17413791189886904, + "grad_norm": 1.2721360921859741, + "learning_rate": 1.8540489754028902e-05, + "loss": 0.4323, + "step": 7845 + }, + { + "epoch": 0.1742488984583967, + "grad_norm": 1.2163597345352173, + "learning_rate": 1.8538675435491515e-05, + "loss": 0.4292, + "step": 7850 + }, + { + "epoch": 0.17435988501792432, + "grad_norm": 1.269092321395874, + "learning_rate": 1.8536860078850598e-05, + "loss": 0.3953, + "step": 7855 + }, + { + "epoch": 0.17447087157745197, + "grad_norm": 1.0180573463439941, + "learning_rate": 1.853504368432687e-05, + "loss": 0.3855, + "step": 7860 + }, + { + "epoch": 0.1745818581369796, + "grad_norm": 0.8039788603782654, + "learning_rate": 1.8533226252141148e-05, + "loss": 0.5043, + "step": 7865 + }, + { + "epoch": 0.17469284469650725, + "grad_norm": 1.2647064924240112, + "learning_rate": 1.85314077825144e-05, + "loss": 0.6227, + "step": 7870 + }, + { + "epoch": 0.17480383125603488, + "grad_norm": 1.075291395187378, + "learning_rate": 1.8529588275667706e-05, + "loss": 0.4935, + "step": 7875 + }, + { + "epoch": 0.17491481781556253, + "grad_norm": 1.4665888547897339, + "learning_rate": 1.8527767731822276e-05, + "loss": 0.7483, + "step": 7880 + }, + { + "epoch": 0.1750258043750902, + "grad_norm": 1.266965627670288, + "learning_rate": 1.8525946151199444e-05, + "loss": 0.5532, + "step": 7885 + }, + { + "epoch": 0.17513679093461781, + "grad_norm": 1.1272302865982056, + "learning_rate": 1.8524123534020674e-05, + "loss": 0.343, + "step": 7890 + }, + { + "epoch": 0.17524777749414547, + "grad_norm": 1.407818078994751, + "learning_rate": 1.852229988050755e-05, + "loss": 0.6022, + "step": 7895 + }, + { + "epoch": 0.1753587640536731, + "grad_norm": 1.155456304550171, + "learning_rate": 1.852047519088179e-05, + "loss": 0.411, + "step": 7900 + }, + { + "epoch": 0.17546975061320075, + "grad_norm": 1.0109899044036865, + "learning_rate": 1.851864946536523e-05, + "loss": 0.5448, + "step": 7905 + }, + { + "epoch": 0.17558073717272837, + "grad_norm": 1.2911863327026367, + "learning_rate": 1.851682270417984e-05, + "loss": 0.4231, + "step": 7910 + }, + { + "epoch": 0.17569172373225603, + "grad_norm": 1.4397233724594116, + "learning_rate": 1.8514994907547707e-05, + "loss": 0.4755, + "step": 7915 + }, + { + "epoch": 0.17580271029178365, + "grad_norm": 1.1584182977676392, + "learning_rate": 1.8513166075691052e-05, + "loss": 0.4597, + "step": 7920 + }, + { + "epoch": 0.1759136968513113, + "grad_norm": 1.5332622528076172, + "learning_rate": 1.8511336208832214e-05, + "loss": 0.5899, + "step": 7925 + }, + { + "epoch": 0.17602468341083893, + "grad_norm": 1.0838502645492554, + "learning_rate": 1.8509505307193666e-05, + "loss": 0.573, + "step": 7930 + }, + { + "epoch": 0.1761356699703666, + "grad_norm": 1.9301401376724243, + "learning_rate": 1.8507673370998e-05, + "loss": 0.5122, + "step": 7935 + }, + { + "epoch": 0.17624665652989424, + "grad_norm": 1.3569397926330566, + "learning_rate": 1.850584040046794e-05, + "loss": 0.5364, + "step": 7940 + }, + { + "epoch": 0.17635764308942187, + "grad_norm": 1.0816103219985962, + "learning_rate": 1.850400639582633e-05, + "loss": 0.661, + "step": 7945 + }, + { + "epoch": 0.17646862964894952, + "grad_norm": 0.9141400456428528, + "learning_rate": 1.8502171357296144e-05, + "loss": 0.4285, + "step": 7950 + }, + { + "epoch": 0.17657961620847715, + "grad_norm": 0.9878506064414978, + "learning_rate": 1.8500335285100477e-05, + "loss": 0.4673, + "step": 7955 + }, + { + "epoch": 0.1766906027680048, + "grad_norm": 1.0910661220550537, + "learning_rate": 1.849849817946255e-05, + "loss": 0.577, + "step": 7960 + }, + { + "epoch": 0.17680158932753243, + "grad_norm": 1.1815053224563599, + "learning_rate": 1.8496660040605722e-05, + "loss": 0.487, + "step": 7965 + }, + { + "epoch": 0.17691257588706008, + "grad_norm": 1.0612215995788574, + "learning_rate": 1.849482086875346e-05, + "loss": 0.5338, + "step": 7970 + }, + { + "epoch": 0.1770235624465877, + "grad_norm": 1.564182162284851, + "learning_rate": 1.8492980664129368e-05, + "loss": 0.4815, + "step": 7975 + }, + { + "epoch": 0.17713454900611536, + "grad_norm": 1.1242620944976807, + "learning_rate": 1.849113942695717e-05, + "loss": 0.4546, + "step": 7980 + }, + { + "epoch": 0.17724553556564301, + "grad_norm": 1.010068655014038, + "learning_rate": 1.8489297157460712e-05, + "loss": 0.4586, + "step": 7985 + }, + { + "epoch": 0.17735652212517064, + "grad_norm": 1.251988172531128, + "learning_rate": 1.848745385586398e-05, + "loss": 0.5537, + "step": 7990 + }, + { + "epoch": 0.1774675086846983, + "grad_norm": 1.282753825187683, + "learning_rate": 1.8485609522391073e-05, + "loss": 0.491, + "step": 7995 + }, + { + "epoch": 0.17757849524422592, + "grad_norm": 1.0400362014770508, + "learning_rate": 1.8483764157266218e-05, + "loss": 0.5087, + "step": 8000 + }, + { + "epoch": 0.17768948180375357, + "grad_norm": 2.066152334213257, + "learning_rate": 1.848191776071377e-05, + "loss": 0.4898, + "step": 8005 + }, + { + "epoch": 0.1778004683632812, + "grad_norm": 0.9378485083580017, + "learning_rate": 1.8480070332958207e-05, + "loss": 0.4535, + "step": 8010 + }, + { + "epoch": 0.17791145492280885, + "grad_norm": 1.3070062398910522, + "learning_rate": 1.847822187422413e-05, + "loss": 0.4423, + "step": 8015 + }, + { + "epoch": 0.17802244148233648, + "grad_norm": 1.1166318655014038, + "learning_rate": 1.8476372384736278e-05, + "loss": 0.4741, + "step": 8020 + }, + { + "epoch": 0.17813342804186413, + "grad_norm": 1.023356556892395, + "learning_rate": 1.847452186471949e-05, + "loss": 0.5005, + "step": 8025 + }, + { + "epoch": 0.17824441460139176, + "grad_norm": 1.0579627752304077, + "learning_rate": 1.8472670314398763e-05, + "loss": 0.5418, + "step": 8030 + }, + { + "epoch": 0.1783554011609194, + "grad_norm": 1.3721271753311157, + "learning_rate": 1.847081773399919e-05, + "loss": 0.5112, + "step": 8035 + }, + { + "epoch": 0.17846638772044707, + "grad_norm": 1.1013174057006836, + "learning_rate": 1.8468964123746008e-05, + "loss": 0.4563, + "step": 8040 + }, + { + "epoch": 0.1785773742799747, + "grad_norm": 1.1645931005477905, + "learning_rate": 1.846710948386457e-05, + "loss": 0.4572, + "step": 8045 + }, + { + "epoch": 0.17868836083950235, + "grad_norm": 1.0203478336334229, + "learning_rate": 1.8465253814580356e-05, + "loss": 0.4665, + "step": 8050 + }, + { + "epoch": 0.17879934739902997, + "grad_norm": 1.235601782798767, + "learning_rate": 1.8463397116118976e-05, + "loss": 0.5339, + "step": 8055 + }, + { + "epoch": 0.17891033395855763, + "grad_norm": 0.7012696266174316, + "learning_rate": 1.8461539388706156e-05, + "loss": 0.4055, + "step": 8060 + }, + { + "epoch": 0.17902132051808525, + "grad_norm": 0.9720817804336548, + "learning_rate": 1.8459680632567757e-05, + "loss": 0.4616, + "step": 8065 + }, + { + "epoch": 0.1791323070776129, + "grad_norm": 1.1411256790161133, + "learning_rate": 1.8457820847929755e-05, + "loss": 0.6752, + "step": 8070 + }, + { + "epoch": 0.17924329363714053, + "grad_norm": 0.9367457032203674, + "learning_rate": 1.845596003501826e-05, + "loss": 0.4181, + "step": 8075 + }, + { + "epoch": 0.1793542801966682, + "grad_norm": 1.1422454118728638, + "learning_rate": 1.845409819405951e-05, + "loss": 0.4357, + "step": 8080 + }, + { + "epoch": 0.1794652667561958, + "grad_norm": 1.0749188661575317, + "learning_rate": 1.8452235325279847e-05, + "loss": 0.5333, + "step": 8085 + }, + { + "epoch": 0.17957625331572347, + "grad_norm": 1.013289213180542, + "learning_rate": 1.845037142890576e-05, + "loss": 0.5039, + "step": 8090 + }, + { + "epoch": 0.17968723987525112, + "grad_norm": 1.0286688804626465, + "learning_rate": 1.8448506505163858e-05, + "loss": 0.549, + "step": 8095 + }, + { + "epoch": 0.17979822643477875, + "grad_norm": 1.7231297492980957, + "learning_rate": 1.844664055428087e-05, + "loss": 0.6414, + "step": 8100 + }, + { + "epoch": 0.1799092129943064, + "grad_norm": 1.311028242111206, + "learning_rate": 1.8444773576483647e-05, + "loss": 0.5827, + "step": 8105 + }, + { + "epoch": 0.18002019955383403, + "grad_norm": 1.5582536458969116, + "learning_rate": 1.844290557199918e-05, + "loss": 0.7342, + "step": 8110 + }, + { + "epoch": 0.18013118611336168, + "grad_norm": 1.0582084655761719, + "learning_rate": 1.8441036541054564e-05, + "loss": 0.5613, + "step": 8115 + }, + { + "epoch": 0.1802421726728893, + "grad_norm": 1.0997862815856934, + "learning_rate": 1.8439166483877032e-05, + "loss": 0.5699, + "step": 8120 + }, + { + "epoch": 0.18035315923241696, + "grad_norm": 0.9655759334564209, + "learning_rate": 1.843729540069395e-05, + "loss": 0.3168, + "step": 8125 + }, + { + "epoch": 0.18046414579194459, + "grad_norm": 1.0601000785827637, + "learning_rate": 1.8435423291732783e-05, + "loss": 0.4598, + "step": 8130 + }, + { + "epoch": 0.18057513235147224, + "grad_norm": 1.1877996921539307, + "learning_rate": 1.8433550157221145e-05, + "loss": 0.5095, + "step": 8135 + }, + { + "epoch": 0.18068611891099987, + "grad_norm": 1.2203222513198853, + "learning_rate": 1.8431675997386764e-05, + "loss": 0.4699, + "step": 8140 + }, + { + "epoch": 0.18079710547052752, + "grad_norm": 2.0087506771087646, + "learning_rate": 1.842980081245749e-05, + "loss": 0.5853, + "step": 8145 + }, + { + "epoch": 0.18090809203005517, + "grad_norm": 1.2762497663497925, + "learning_rate": 1.8427924602661305e-05, + "loss": 0.4663, + "step": 8150 + }, + { + "epoch": 0.1810190785895828, + "grad_norm": 0.9818155765533447, + "learning_rate": 1.842604736822631e-05, + "loss": 0.472, + "step": 8155 + }, + { + "epoch": 0.18113006514911045, + "grad_norm": 1.3976755142211914, + "learning_rate": 1.842416910938074e-05, + "loss": 0.3402, + "step": 8160 + }, + { + "epoch": 0.18124105170863808, + "grad_norm": 1.2582039833068848, + "learning_rate": 1.842228982635294e-05, + "loss": 0.4522, + "step": 8165 + }, + { + "epoch": 0.18135203826816573, + "grad_norm": 3.259056568145752, + "learning_rate": 1.842040951937139e-05, + "loss": 0.4833, + "step": 8170 + }, + { + "epoch": 0.18146302482769336, + "grad_norm": 0.9759329557418823, + "learning_rate": 1.841852818866469e-05, + "loss": 0.3992, + "step": 8175 + }, + { + "epoch": 0.181574011387221, + "grad_norm": 1.2619388103485107, + "learning_rate": 1.8416645834461564e-05, + "loss": 0.5316, + "step": 8180 + }, + { + "epoch": 0.18168499794674864, + "grad_norm": 0.9483252167701721, + "learning_rate": 1.8414762456990868e-05, + "loss": 0.4188, + "step": 8185 + }, + { + "epoch": 0.1817959845062763, + "grad_norm": 0.9606418013572693, + "learning_rate": 1.8412878056481567e-05, + "loss": 0.5083, + "step": 8190 + }, + { + "epoch": 0.18190697106580392, + "grad_norm": 1.6246932744979858, + "learning_rate": 1.841099263316277e-05, + "loss": 0.4212, + "step": 8195 + }, + { + "epoch": 0.18201795762533157, + "grad_norm": 0.8691072463989258, + "learning_rate": 1.84091061872637e-05, + "loss": 0.4484, + "step": 8200 + }, + { + "epoch": 0.18212894418485923, + "grad_norm": 1.4100844860076904, + "learning_rate": 1.84072187190137e-05, + "loss": 0.65, + "step": 8205 + }, + { + "epoch": 0.18223993074438685, + "grad_norm": 1.1791785955429077, + "learning_rate": 1.8405330228642246e-05, + "loss": 0.4917, + "step": 8210 + }, + { + "epoch": 0.1823509173039145, + "grad_norm": 1.2096881866455078, + "learning_rate": 1.840344071637893e-05, + "loss": 0.617, + "step": 8215 + }, + { + "epoch": 0.18246190386344213, + "grad_norm": 1.1849205493927002, + "learning_rate": 1.8401550182453475e-05, + "loss": 0.4585, + "step": 8220 + }, + { + "epoch": 0.18257289042296979, + "grad_norm": 1.3436826467514038, + "learning_rate": 1.839965862709572e-05, + "loss": 0.3399, + "step": 8225 + }, + { + "epoch": 0.1826838769824974, + "grad_norm": 1.2477469444274902, + "learning_rate": 1.8397766050535648e-05, + "loss": 0.565, + "step": 8230 + }, + { + "epoch": 0.18279486354202507, + "grad_norm": 1.2094818353652954, + "learning_rate": 1.839587245300334e-05, + "loss": 0.536, + "step": 8235 + }, + { + "epoch": 0.1829058501015527, + "grad_norm": 1.6377440690994263, + "learning_rate": 1.8393977834729012e-05, + "loss": 0.5195, + "step": 8240 + }, + { + "epoch": 0.18301683666108035, + "grad_norm": 1.2369369268417358, + "learning_rate": 1.8392082195943017e-05, + "loss": 0.7542, + "step": 8245 + }, + { + "epoch": 0.18312782322060797, + "grad_norm": 1.154937744140625, + "learning_rate": 1.8390185536875812e-05, + "loss": 0.4717, + "step": 8250 + }, + { + "epoch": 0.18323880978013563, + "grad_norm": 1.0476173162460327, + "learning_rate": 1.8388287857757986e-05, + "loss": 0.5616, + "step": 8255 + }, + { + "epoch": 0.18334979633966328, + "grad_norm": 1.2952115535736084, + "learning_rate": 1.8386389158820254e-05, + "loss": 0.491, + "step": 8260 + }, + { + "epoch": 0.1834607828991909, + "grad_norm": 1.501037359237671, + "learning_rate": 1.8384489440293455e-05, + "loss": 0.5717, + "step": 8265 + }, + { + "epoch": 0.18357176945871856, + "grad_norm": 1.3697893619537354, + "learning_rate": 1.838258870240855e-05, + "loss": 0.5399, + "step": 8270 + }, + { + "epoch": 0.18368275601824618, + "grad_norm": 1.2369073629379272, + "learning_rate": 1.838068694539662e-05, + "loss": 0.5484, + "step": 8275 + }, + { + "epoch": 0.18379374257777384, + "grad_norm": 1.4244351387023926, + "learning_rate": 1.8378784169488884e-05, + "loss": 0.5621, + "step": 8280 + }, + { + "epoch": 0.18390472913730146, + "grad_norm": 1.5416419506072998, + "learning_rate": 1.8376880374916666e-05, + "loss": 0.4107, + "step": 8285 + }, + { + "epoch": 0.18401571569682912, + "grad_norm": 0.9979259967803955, + "learning_rate": 1.8374975561911426e-05, + "loss": 0.5422, + "step": 8290 + }, + { + "epoch": 0.18412670225635674, + "grad_norm": 1.306522250175476, + "learning_rate": 1.8373069730704743e-05, + "loss": 0.4937, + "step": 8295 + }, + { + "epoch": 0.1842376888158844, + "grad_norm": 1.3945233821868896, + "learning_rate": 1.8371162881528324e-05, + "loss": 0.7341, + "step": 8300 + }, + { + "epoch": 0.18434867537541202, + "grad_norm": 1.2541005611419678, + "learning_rate": 1.8369255014613996e-05, + "loss": 0.4913, + "step": 8305 + }, + { + "epoch": 0.18445966193493968, + "grad_norm": 1.2350218296051025, + "learning_rate": 1.8367346130193713e-05, + "loss": 0.5077, + "step": 8310 + }, + { + "epoch": 0.18457064849446733, + "grad_norm": 1.4759026765823364, + "learning_rate": 1.836543622849955e-05, + "loss": 0.5094, + "step": 8315 + }, + { + "epoch": 0.18468163505399496, + "grad_norm": 1.2139075994491577, + "learning_rate": 1.8363525309763703e-05, + "loss": 0.3951, + "step": 8320 + }, + { + "epoch": 0.1847926216135226, + "grad_norm": 1.1291913986206055, + "learning_rate": 1.83616133742185e-05, + "loss": 0.4726, + "step": 8325 + }, + { + "epoch": 0.18490360817305024, + "grad_norm": 1.0854945182800293, + "learning_rate": 1.8359700422096385e-05, + "loss": 0.5243, + "step": 8330 + }, + { + "epoch": 0.1850145947325779, + "grad_norm": 1.083794116973877, + "learning_rate": 1.8357786453629932e-05, + "loss": 0.4193, + "step": 8335 + }, + { + "epoch": 0.18512558129210552, + "grad_norm": 0.9918060898780823, + "learning_rate": 1.8355871469051825e-05, + "loss": 0.2593, + "step": 8340 + }, + { + "epoch": 0.18523656785163317, + "grad_norm": 1.0464586019515991, + "learning_rate": 1.8353955468594894e-05, + "loss": 0.6056, + "step": 8345 + }, + { + "epoch": 0.1853475544111608, + "grad_norm": 1.2094873189926147, + "learning_rate": 1.8352038452492075e-05, + "loss": 0.5531, + "step": 8350 + }, + { + "epoch": 0.18545854097068845, + "grad_norm": 1.3338085412979126, + "learning_rate": 1.8350120420976426e-05, + "loss": 0.4514, + "step": 8355 + }, + { + "epoch": 0.18556952753021608, + "grad_norm": 1.1383191347122192, + "learning_rate": 1.8348201374281146e-05, + "loss": 0.5114, + "step": 8360 + }, + { + "epoch": 0.18568051408974373, + "grad_norm": 1.254416584968567, + "learning_rate": 1.8346281312639534e-05, + "loss": 0.6463, + "step": 8365 + }, + { + "epoch": 0.18579150064927138, + "grad_norm": 1.138234257698059, + "learning_rate": 1.834436023628504e-05, + "loss": 0.3798, + "step": 8370 + }, + { + "epoch": 0.185902487208799, + "grad_norm": 1.7597484588623047, + "learning_rate": 1.8342438145451207e-05, + "loss": 0.4756, + "step": 8375 + }, + { + "epoch": 0.18601347376832666, + "grad_norm": 0.9508825540542603, + "learning_rate": 1.8340515040371724e-05, + "loss": 0.446, + "step": 8380 + }, + { + "epoch": 0.1861244603278543, + "grad_norm": 1.4030269384384155, + "learning_rate": 1.8338590921280396e-05, + "loss": 0.688, + "step": 8385 + }, + { + "epoch": 0.18623544688738194, + "grad_norm": 1.4024935960769653, + "learning_rate": 1.8336665788411147e-05, + "loss": 0.4606, + "step": 8390 + }, + { + "epoch": 0.18634643344690957, + "grad_norm": 1.5650089979171753, + "learning_rate": 1.833473964199803e-05, + "loss": 0.4514, + "step": 8395 + }, + { + "epoch": 0.18645742000643722, + "grad_norm": 1.0336397886276245, + "learning_rate": 1.833281248227522e-05, + "loss": 0.5878, + "step": 8400 + }, + { + "epoch": 0.18656840656596485, + "grad_norm": 0.9039632081985474, + "learning_rate": 1.8330884309477017e-05, + "loss": 0.6019, + "step": 8405 + }, + { + "epoch": 0.1866793931254925, + "grad_norm": 1.5472464561462402, + "learning_rate": 1.8328955123837837e-05, + "loss": 0.5514, + "step": 8410 + }, + { + "epoch": 0.18679037968502013, + "grad_norm": 0.9944090843200684, + "learning_rate": 1.8327024925592226e-05, + "loss": 0.5208, + "step": 8415 + }, + { + "epoch": 0.18690136624454778, + "grad_norm": 1.2867696285247803, + "learning_rate": 1.8325093714974852e-05, + "loss": 0.4037, + "step": 8420 + }, + { + "epoch": 0.18701235280407544, + "grad_norm": 1.4909098148345947, + "learning_rate": 1.8323161492220506e-05, + "loss": 0.5026, + "step": 8425 + }, + { + "epoch": 0.18712333936360306, + "grad_norm": 1.0257625579833984, + "learning_rate": 1.8321228257564098e-05, + "loss": 0.5233, + "step": 8430 + }, + { + "epoch": 0.18723432592313072, + "grad_norm": 1.1350610256195068, + "learning_rate": 1.8319294011240662e-05, + "loss": 0.6545, + "step": 8435 + }, + { + "epoch": 0.18734531248265834, + "grad_norm": 1.3230243921279907, + "learning_rate": 1.8317358753485365e-05, + "loss": 0.5876, + "step": 8440 + }, + { + "epoch": 0.187456299042186, + "grad_norm": 1.0084996223449707, + "learning_rate": 1.8315422484533486e-05, + "loss": 0.5972, + "step": 8445 + }, + { + "epoch": 0.18756728560171362, + "grad_norm": 0.8630398511886597, + "learning_rate": 1.8313485204620428e-05, + "loss": 0.4348, + "step": 8450 + }, + { + "epoch": 0.18767827216124128, + "grad_norm": 1.212876558303833, + "learning_rate": 1.8311546913981718e-05, + "loss": 0.5198, + "step": 8455 + }, + { + "epoch": 0.1877892587207689, + "grad_norm": 1.1673882007598877, + "learning_rate": 1.830960761285301e-05, + "loss": 0.5926, + "step": 8460 + }, + { + "epoch": 0.18790024528029656, + "grad_norm": 1.3223752975463867, + "learning_rate": 1.830766730147008e-05, + "loss": 0.5681, + "step": 8465 + }, + { + "epoch": 0.1880112318398242, + "grad_norm": 1.0794979333877563, + "learning_rate": 1.8305725980068814e-05, + "loss": 0.4693, + "step": 8470 + }, + { + "epoch": 0.18812221839935184, + "grad_norm": 0.965379536151886, + "learning_rate": 1.8303783648885245e-05, + "loss": 0.4256, + "step": 8475 + }, + { + "epoch": 0.1882332049588795, + "grad_norm": 1.4646849632263184, + "learning_rate": 1.8301840308155507e-05, + "loss": 0.4721, + "step": 8480 + }, + { + "epoch": 0.18834419151840712, + "grad_norm": 0.721495509147644, + "learning_rate": 1.8299895958115867e-05, + "loss": 0.4394, + "step": 8485 + }, + { + "epoch": 0.18845517807793477, + "grad_norm": 1.1117589473724365, + "learning_rate": 1.8297950599002713e-05, + "loss": 0.6224, + "step": 8490 + }, + { + "epoch": 0.1885661646374624, + "grad_norm": 1.2575618028640747, + "learning_rate": 1.829600423105255e-05, + "loss": 0.5526, + "step": 8495 + }, + { + "epoch": 0.18867715119699005, + "grad_norm": 0.9530577659606934, + "learning_rate": 1.829405685450202e-05, + "loss": 0.6299, + "step": 8500 + }, + { + "epoch": 0.18878813775651768, + "grad_norm": 1.205317735671997, + "learning_rate": 1.829210846958787e-05, + "loss": 0.5405, + "step": 8505 + }, + { + "epoch": 0.18889912431604533, + "grad_norm": 1.6385353803634644, + "learning_rate": 1.8290159076546985e-05, + "loss": 0.414, + "step": 8510 + }, + { + "epoch": 0.18901011087557296, + "grad_norm": 0.900580883026123, + "learning_rate": 1.8288208675616363e-05, + "loss": 0.5026, + "step": 8515 + }, + { + "epoch": 0.1891210974351006, + "grad_norm": 1.253099799156189, + "learning_rate": 1.8286257267033124e-05, + "loss": 0.4916, + "step": 8520 + }, + { + "epoch": 0.18923208399462826, + "grad_norm": 1.2847565412521362, + "learning_rate": 1.828430485103452e-05, + "loss": 0.6544, + "step": 8525 + }, + { + "epoch": 0.1893430705541559, + "grad_norm": 1.2562917470932007, + "learning_rate": 1.8282351427857906e-05, + "loss": 0.3892, + "step": 8530 + }, + { + "epoch": 0.18945405711368354, + "grad_norm": 1.5068742036819458, + "learning_rate": 1.828039699774079e-05, + "loss": 0.4453, + "step": 8535 + }, + { + "epoch": 0.18956504367321117, + "grad_norm": 1.0743848085403442, + "learning_rate": 1.827844156092078e-05, + "loss": 0.5158, + "step": 8540 + }, + { + "epoch": 0.18967603023273882, + "grad_norm": 1.0758819580078125, + "learning_rate": 1.8276485117635603e-05, + "loss": 0.6229, + "step": 8545 + }, + { + "epoch": 0.18978701679226645, + "grad_norm": 1.0991002321243286, + "learning_rate": 1.8274527668123126e-05, + "loss": 0.5309, + "step": 8550 + }, + { + "epoch": 0.1898980033517941, + "grad_norm": 1.1984364986419678, + "learning_rate": 1.8272569212621323e-05, + "loss": 0.4739, + "step": 8555 + }, + { + "epoch": 0.19000898991132173, + "grad_norm": 1.335144281387329, + "learning_rate": 1.82706097513683e-05, + "loss": 0.5791, + "step": 8560 + }, + { + "epoch": 0.19011997647084938, + "grad_norm": 0.9494961500167847, + "learning_rate": 1.826864928460228e-05, + "loss": 0.4352, + "step": 8565 + }, + { + "epoch": 0.190230963030377, + "grad_norm": 1.3024598360061646, + "learning_rate": 1.8266687812561614e-05, + "loss": 0.4986, + "step": 8570 + }, + { + "epoch": 0.19034194958990466, + "grad_norm": 1.7000435590744019, + "learning_rate": 1.8264725335484766e-05, + "loss": 0.5095, + "step": 8575 + }, + { + "epoch": 0.19045293614943232, + "grad_norm": 1.4852977991104126, + "learning_rate": 1.826276185361033e-05, + "loss": 0.5957, + "step": 8580 + }, + { + "epoch": 0.19056392270895994, + "grad_norm": 1.6019827127456665, + "learning_rate": 1.826079736717702e-05, + "loss": 0.4828, + "step": 8585 + }, + { + "epoch": 0.1906749092684876, + "grad_norm": 1.0514659881591797, + "learning_rate": 1.825883187642367e-05, + "loss": 0.4597, + "step": 8590 + }, + { + "epoch": 0.19078589582801522, + "grad_norm": 1.223839282989502, + "learning_rate": 1.825686538158924e-05, + "loss": 0.708, + "step": 8595 + }, + { + "epoch": 0.19089688238754288, + "grad_norm": 1.0082738399505615, + "learning_rate": 1.8254897882912804e-05, + "loss": 0.4521, + "step": 8600 + }, + { + "epoch": 0.1910078689470705, + "grad_norm": 1.421229600906372, + "learning_rate": 1.825292938063357e-05, + "loss": 0.6726, + "step": 8605 + }, + { + "epoch": 0.19111885550659816, + "grad_norm": 1.07749605178833, + "learning_rate": 1.8250959874990862e-05, + "loss": 0.5096, + "step": 8610 + }, + { + "epoch": 0.19122984206612578, + "grad_norm": 1.0139070749282837, + "learning_rate": 1.824898936622412e-05, + "loss": 0.5871, + "step": 8615 + }, + { + "epoch": 0.19134082862565344, + "grad_norm": 1.6385419368743896, + "learning_rate": 1.824701785457292e-05, + "loss": 0.577, + "step": 8620 + }, + { + "epoch": 0.19145181518518106, + "grad_norm": 1.14400053024292, + "learning_rate": 1.8245045340276945e-05, + "loss": 0.6675, + "step": 8625 + }, + { + "epoch": 0.19156280174470872, + "grad_norm": 1.2335268259048462, + "learning_rate": 1.8243071823576012e-05, + "loss": 0.5573, + "step": 8630 + }, + { + "epoch": 0.19167378830423637, + "grad_norm": 1.0109930038452148, + "learning_rate": 1.824109730471005e-05, + "loss": 0.5114, + "step": 8635 + }, + { + "epoch": 0.191784774863764, + "grad_norm": 1.3708138465881348, + "learning_rate": 1.8239121783919117e-05, + "loss": 0.4552, + "step": 8640 + }, + { + "epoch": 0.19189576142329165, + "grad_norm": 1.415886640548706, + "learning_rate": 1.823714526144339e-05, + "loss": 0.6616, + "step": 8645 + }, + { + "epoch": 0.19200674798281928, + "grad_norm": 1.3275566101074219, + "learning_rate": 1.8235167737523162e-05, + "loss": 0.4096, + "step": 8650 + }, + { + "epoch": 0.19211773454234693, + "grad_norm": 1.259953260421753, + "learning_rate": 1.823318921239886e-05, + "loss": 0.6252, + "step": 8655 + }, + { + "epoch": 0.19222872110187456, + "grad_norm": 1.3756461143493652, + "learning_rate": 1.823120968631103e-05, + "loss": 0.4376, + "step": 8660 + }, + { + "epoch": 0.1923397076614022, + "grad_norm": 1.2732354402542114, + "learning_rate": 1.8229229159500333e-05, + "loss": 0.4112, + "step": 8665 + }, + { + "epoch": 0.19245069422092984, + "grad_norm": 0.9623165130615234, + "learning_rate": 1.822724763220755e-05, + "loss": 0.589, + "step": 8670 + }, + { + "epoch": 0.1925616807804575, + "grad_norm": 0.9481779336929321, + "learning_rate": 1.822526510467359e-05, + "loss": 0.2917, + "step": 8675 + }, + { + "epoch": 0.19267266733998512, + "grad_norm": 0.9726129174232483, + "learning_rate": 1.822328157713949e-05, + "loss": 0.4917, + "step": 8680 + }, + { + "epoch": 0.19278365389951277, + "grad_norm": 1.8361598253250122, + "learning_rate": 1.8221297049846388e-05, + "loss": 0.6041, + "step": 8685 + }, + { + "epoch": 0.19289464045904042, + "grad_norm": 1.1872971057891846, + "learning_rate": 1.8219311523035568e-05, + "loss": 0.3696, + "step": 8690 + }, + { + "epoch": 0.19300562701856805, + "grad_norm": 1.2972650527954102, + "learning_rate": 1.8217324996948416e-05, + "loss": 0.4888, + "step": 8695 + }, + { + "epoch": 0.1931166135780957, + "grad_norm": 1.157384991645813, + "learning_rate": 1.821533747182645e-05, + "loss": 0.4873, + "step": 8700 + }, + { + "epoch": 0.19322760013762333, + "grad_norm": 0.9881042242050171, + "learning_rate": 1.8213348947911304e-05, + "loss": 0.4809, + "step": 8705 + }, + { + "epoch": 0.19333858669715098, + "grad_norm": 0.7585896253585815, + "learning_rate": 1.8211359425444742e-05, + "loss": 0.3606, + "step": 8710 + }, + { + "epoch": 0.1934495732566786, + "grad_norm": 0.9092647433280945, + "learning_rate": 1.8209368904668638e-05, + "loss": 0.4519, + "step": 8715 + }, + { + "epoch": 0.19356055981620626, + "grad_norm": 1.3298051357269287, + "learning_rate": 1.8207377385824997e-05, + "loss": 0.5164, + "step": 8720 + }, + { + "epoch": 0.1936715463757339, + "grad_norm": 0.9970882534980774, + "learning_rate": 1.8205384869155937e-05, + "loss": 0.5339, + "step": 8725 + }, + { + "epoch": 0.19378253293526154, + "grad_norm": 1.05615234375, + "learning_rate": 1.8203391354903703e-05, + "loss": 0.5448, + "step": 8730 + }, + { + "epoch": 0.19389351949478917, + "grad_norm": 1.287419319152832, + "learning_rate": 1.8201396843310658e-05, + "loss": 0.5216, + "step": 8735 + }, + { + "epoch": 0.19400450605431682, + "grad_norm": 0.873881459236145, + "learning_rate": 1.8199401334619295e-05, + "loss": 0.546, + "step": 8740 + }, + { + "epoch": 0.19411549261384448, + "grad_norm": 0.8294119238853455, + "learning_rate": 1.8197404829072214e-05, + "loss": 0.5422, + "step": 8745 + }, + { + "epoch": 0.1942264791733721, + "grad_norm": 1.0979552268981934, + "learning_rate": 1.8195407326912144e-05, + "loss": 0.4362, + "step": 8750 + }, + { + "epoch": 0.19433746573289976, + "grad_norm": 1.397440791130066, + "learning_rate": 1.819340882838194e-05, + "loss": 0.5967, + "step": 8755 + }, + { + "epoch": 0.19444845229242738, + "grad_norm": 1.3883029222488403, + "learning_rate": 1.819140933372457e-05, + "loss": 0.695, + "step": 8760 + }, + { + "epoch": 0.19455943885195504, + "grad_norm": 1.1809195280075073, + "learning_rate": 1.818940884318312e-05, + "loss": 0.5256, + "step": 8765 + }, + { + "epoch": 0.19467042541148266, + "grad_norm": 1.1587636470794678, + "learning_rate": 1.818740735700081e-05, + "loss": 0.5528, + "step": 8770 + }, + { + "epoch": 0.19478141197101032, + "grad_norm": 1.2209142446517944, + "learning_rate": 1.818540487542097e-05, + "loss": 0.3887, + "step": 8775 + }, + { + "epoch": 0.19489239853053794, + "grad_norm": 1.2413638830184937, + "learning_rate": 1.818340139868706e-05, + "loss": 0.6362, + "step": 8780 + }, + { + "epoch": 0.1950033850900656, + "grad_norm": 0.9728131294250488, + "learning_rate": 1.818139692704265e-05, + "loss": 0.3275, + "step": 8785 + }, + { + "epoch": 0.19511437164959322, + "grad_norm": 0.8453243970870972, + "learning_rate": 1.8179391460731445e-05, + "loss": 0.504, + "step": 8790 + }, + { + "epoch": 0.19522535820912088, + "grad_norm": 1.1394267082214355, + "learning_rate": 1.8177384999997258e-05, + "loss": 0.4273, + "step": 8795 + }, + { + "epoch": 0.19533634476864853, + "grad_norm": 1.447892665863037, + "learning_rate": 1.817537754508402e-05, + "loss": 0.5201, + "step": 8800 + }, + { + "epoch": 0.19544733132817615, + "grad_norm": 1.434080719947815, + "learning_rate": 1.8173369096235804e-05, + "loss": 0.5835, + "step": 8805 + }, + { + "epoch": 0.1955583178877038, + "grad_norm": 1.1676700115203857, + "learning_rate": 1.8171359653696784e-05, + "loss": 0.7007, + "step": 8810 + }, + { + "epoch": 0.19566930444723143, + "grad_norm": 0.8635067343711853, + "learning_rate": 1.8169349217711262e-05, + "loss": 0.4438, + "step": 8815 + }, + { + "epoch": 0.1957802910067591, + "grad_norm": 1.5453852415084839, + "learning_rate": 1.8167337788523654e-05, + "loss": 0.5355, + "step": 8820 + }, + { + "epoch": 0.19589127756628671, + "grad_norm": 1.5389227867126465, + "learning_rate": 1.8165325366378516e-05, + "loss": 0.3978, + "step": 8825 + }, + { + "epoch": 0.19600226412581437, + "grad_norm": 0.9028152823448181, + "learning_rate": 1.8163311951520505e-05, + "loss": 0.5507, + "step": 8830 + }, + { + "epoch": 0.196113250685342, + "grad_norm": 1.4843121767044067, + "learning_rate": 1.81612975441944e-05, + "loss": 0.459, + "step": 8835 + }, + { + "epoch": 0.19622423724486965, + "grad_norm": 1.0722191333770752, + "learning_rate": 1.815928214464511e-05, + "loss": 0.4453, + "step": 8840 + }, + { + "epoch": 0.19633522380439727, + "grad_norm": 1.3621715307235718, + "learning_rate": 1.8157265753117665e-05, + "loss": 0.4542, + "step": 8845 + }, + { + "epoch": 0.19644621036392493, + "grad_norm": 1.274541974067688, + "learning_rate": 1.8155248369857207e-05, + "loss": 0.459, + "step": 8850 + }, + { + "epoch": 0.19655719692345258, + "grad_norm": 0.9879027009010315, + "learning_rate": 1.8153229995109e-05, + "loss": 0.4796, + "step": 8855 + }, + { + "epoch": 0.1966681834829802, + "grad_norm": 1.4747463464736938, + "learning_rate": 1.8151210629118435e-05, + "loss": 0.5642, + "step": 8860 + }, + { + "epoch": 0.19677917004250786, + "grad_norm": 1.3962020874023438, + "learning_rate": 1.814919027213102e-05, + "loss": 0.4157, + "step": 8865 + }, + { + "epoch": 0.1968901566020355, + "grad_norm": 0.8555780053138733, + "learning_rate": 1.814716892439238e-05, + "loss": 0.5106, + "step": 8870 + }, + { + "epoch": 0.19700114316156314, + "grad_norm": 1.1619625091552734, + "learning_rate": 1.8145146586148266e-05, + "loss": 0.3707, + "step": 8875 + }, + { + "epoch": 0.19711212972109077, + "grad_norm": 0.9830631017684937, + "learning_rate": 1.8143123257644548e-05, + "loss": 0.4867, + "step": 8880 + }, + { + "epoch": 0.19722311628061842, + "grad_norm": 0.9953300356864929, + "learning_rate": 1.8141098939127214e-05, + "loss": 0.5274, + "step": 8885 + }, + { + "epoch": 0.19733410284014605, + "grad_norm": 1.415956735610962, + "learning_rate": 1.8139073630842373e-05, + "loss": 0.4372, + "step": 8890 + }, + { + "epoch": 0.1974450893996737, + "grad_norm": 0.6621501445770264, + "learning_rate": 1.8137047333036256e-05, + "loss": 0.4938, + "step": 8895 + }, + { + "epoch": 0.19755607595920133, + "grad_norm": 1.6966521739959717, + "learning_rate": 1.8135020045955217e-05, + "loss": 0.5423, + "step": 8900 + }, + { + "epoch": 0.19766706251872898, + "grad_norm": 1.799787998199463, + "learning_rate": 1.8132991769845717e-05, + "loss": 0.5886, + "step": 8905 + }, + { + "epoch": 0.19777804907825663, + "grad_norm": 1.1175798177719116, + "learning_rate": 1.813096250495436e-05, + "loss": 0.4745, + "step": 8910 + }, + { + "epoch": 0.19788903563778426, + "grad_norm": 1.0063509941101074, + "learning_rate": 1.812893225152785e-05, + "loss": 0.5334, + "step": 8915 + }, + { + "epoch": 0.19800002219731191, + "grad_norm": 0.8781239986419678, + "learning_rate": 1.8126901009813016e-05, + "loss": 0.4526, + "step": 8920 + }, + { + "epoch": 0.19811100875683954, + "grad_norm": 1.2490911483764648, + "learning_rate": 1.8124868780056814e-05, + "loss": 0.5334, + "step": 8925 + }, + { + "epoch": 0.1982219953163672, + "grad_norm": 1.2448034286499023, + "learning_rate": 1.8122835562506314e-05, + "loss": 0.4468, + "step": 8930 + }, + { + "epoch": 0.19833298187589482, + "grad_norm": 1.0076313018798828, + "learning_rate": 1.812080135740871e-05, + "loss": 0.4711, + "step": 8935 + }, + { + "epoch": 0.19844396843542247, + "grad_norm": 1.3471753597259521, + "learning_rate": 1.811876616501131e-05, + "loss": 0.5166, + "step": 8940 + }, + { + "epoch": 0.1985549549949501, + "grad_norm": 1.514022946357727, + "learning_rate": 1.811672998556155e-05, + "loss": 0.5828, + "step": 8945 + }, + { + "epoch": 0.19866594155447775, + "grad_norm": 1.2273184061050415, + "learning_rate": 1.811469281930698e-05, + "loss": 0.4267, + "step": 8950 + }, + { + "epoch": 0.19877692811400538, + "grad_norm": 0.7829887270927429, + "learning_rate": 1.811265466649527e-05, + "loss": 0.4773, + "step": 8955 + }, + { + "epoch": 0.19888791467353303, + "grad_norm": 2.0429697036743164, + "learning_rate": 1.8110615527374212e-05, + "loss": 0.5285, + "step": 8960 + }, + { + "epoch": 0.1989989012330607, + "grad_norm": 1.5543320178985596, + "learning_rate": 1.810857540219172e-05, + "loss": 0.5802, + "step": 8965 + }, + { + "epoch": 0.1991098877925883, + "grad_norm": 1.1182522773742676, + "learning_rate": 1.8106534291195826e-05, + "loss": 0.3691, + "step": 8970 + }, + { + "epoch": 0.19922087435211597, + "grad_norm": 1.3988370895385742, + "learning_rate": 1.810449219463468e-05, + "loss": 0.3581, + "step": 8975 + }, + { + "epoch": 0.1993318609116436, + "grad_norm": 1.0593804121017456, + "learning_rate": 1.8102449112756554e-05, + "loss": 0.4253, + "step": 8980 + }, + { + "epoch": 0.19944284747117125, + "grad_norm": 0.8948121666908264, + "learning_rate": 1.8100405045809836e-05, + "loss": 0.4184, + "step": 8985 + }, + { + "epoch": 0.19955383403069887, + "grad_norm": 1.1369259357452393, + "learning_rate": 1.809835999404304e-05, + "loss": 0.4053, + "step": 8990 + }, + { + "epoch": 0.19966482059022653, + "grad_norm": 1.3200005292892456, + "learning_rate": 1.8096313957704795e-05, + "loss": 0.5784, + "step": 8995 + }, + { + "epoch": 0.19977580714975415, + "grad_norm": 1.4808971881866455, + "learning_rate": 1.8094266937043853e-05, + "loss": 0.5999, + "step": 9000 + }, + { + "epoch": 0.1998867937092818, + "grad_norm": 0.8824801445007324, + "learning_rate": 1.8092218932309086e-05, + "loss": 0.4985, + "step": 9005 + }, + { + "epoch": 0.19999778026880946, + "grad_norm": 1.0153508186340332, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.5107, + "step": 9010 + }, + { + "epoch": 0.2001087668283371, + "grad_norm": 1.2240535020828247, + "learning_rate": 1.808811997161414e-05, + "loss": 0.6717, + "step": 9015 + }, + { + "epoch": 0.20021975338786474, + "grad_norm": 1.374133586883545, + "learning_rate": 1.80860690161523e-05, + "loss": 0.6474, + "step": 9020 + }, + { + "epoch": 0.20033073994739237, + "grad_norm": 1.1667531728744507, + "learning_rate": 1.808401707761331e-05, + "loss": 0.4666, + "step": 9025 + }, + { + "epoch": 0.20044172650692002, + "grad_norm": 1.0652427673339844, + "learning_rate": 1.808196415624663e-05, + "loss": 0.426, + "step": 9030 + }, + { + "epoch": 0.20055271306644765, + "grad_norm": 0.9931832551956177, + "learning_rate": 1.807991025230186e-05, + "loss": 0.3811, + "step": 9035 + }, + { + "epoch": 0.2006636996259753, + "grad_norm": 1.2800239324569702, + "learning_rate": 1.8077855366028695e-05, + "loss": 0.4609, + "step": 9040 + }, + { + "epoch": 0.20077468618550293, + "grad_norm": 1.4617905616760254, + "learning_rate": 1.8075799497676967e-05, + "loss": 0.4553, + "step": 9045 + }, + { + "epoch": 0.20088567274503058, + "grad_norm": 1.4740259647369385, + "learning_rate": 1.807374264749662e-05, + "loss": 0.5532, + "step": 9050 + }, + { + "epoch": 0.2009966593045582, + "grad_norm": 0.8939075469970703, + "learning_rate": 1.8071684815737717e-05, + "loss": 0.4947, + "step": 9055 + }, + { + "epoch": 0.20110764586408586, + "grad_norm": 1.3319915533065796, + "learning_rate": 1.806962600265045e-05, + "loss": 0.5024, + "step": 9060 + }, + { + "epoch": 0.2012186324236135, + "grad_norm": 0.9438144564628601, + "learning_rate": 1.8067566208485112e-05, + "loss": 0.4628, + "step": 9065 + }, + { + "epoch": 0.20132961898314114, + "grad_norm": 1.2884577512741089, + "learning_rate": 1.8065505433492135e-05, + "loss": 0.5252, + "step": 9070 + }, + { + "epoch": 0.2014406055426688, + "grad_norm": 0.9813497066497803, + "learning_rate": 1.8063443677922052e-05, + "loss": 0.5909, + "step": 9075 + }, + { + "epoch": 0.20155159210219642, + "grad_norm": 1.1486997604370117, + "learning_rate": 1.8061380942025532e-05, + "loss": 0.3774, + "step": 9080 + }, + { + "epoch": 0.20166257866172407, + "grad_norm": 1.1693344116210938, + "learning_rate": 1.8059317226053353e-05, + "loss": 0.495, + "step": 9085 + }, + { + "epoch": 0.2017735652212517, + "grad_norm": 1.0905404090881348, + "learning_rate": 1.8057252530256414e-05, + "loss": 0.4009, + "step": 9090 + }, + { + "epoch": 0.20188455178077935, + "grad_norm": 0.8068780303001404, + "learning_rate": 1.8055186854885733e-05, + "loss": 0.4361, + "step": 9095 + }, + { + "epoch": 0.20199553834030698, + "grad_norm": 0.8546350598335266, + "learning_rate": 1.8053120200192452e-05, + "loss": 0.6194, + "step": 9100 + }, + { + "epoch": 0.20210652489983463, + "grad_norm": 1.1348129510879517, + "learning_rate": 1.8051052566427824e-05, + "loss": 0.5497, + "step": 9105 + }, + { + "epoch": 0.20221751145936226, + "grad_norm": 1.852582573890686, + "learning_rate": 1.8048983953843226e-05, + "loss": 0.5829, + "step": 9110 + }, + { + "epoch": 0.2023284980188899, + "grad_norm": 1.4869229793548584, + "learning_rate": 1.8046914362690153e-05, + "loss": 0.4338, + "step": 9115 + }, + { + "epoch": 0.20243948457841757, + "grad_norm": 1.2775733470916748, + "learning_rate": 1.804484379322022e-05, + "loss": 0.5854, + "step": 9120 + }, + { + "epoch": 0.2025504711379452, + "grad_norm": 0.8230618834495544, + "learning_rate": 1.804277224568516e-05, + "loss": 0.4142, + "step": 9125 + }, + { + "epoch": 0.20266145769747285, + "grad_norm": 1.7515913248062134, + "learning_rate": 1.8040699720336817e-05, + "loss": 0.5701, + "step": 9130 + }, + { + "epoch": 0.20277244425700047, + "grad_norm": 1.0117206573486328, + "learning_rate": 1.8038626217427176e-05, + "loss": 0.4278, + "step": 9135 + }, + { + "epoch": 0.20288343081652813, + "grad_norm": 1.2293572425842285, + "learning_rate": 1.8036551737208314e-05, + "loss": 0.6121, + "step": 9140 + }, + { + "epoch": 0.20299441737605575, + "grad_norm": 0.8745748400688171, + "learning_rate": 1.803447627993245e-05, + "loss": 0.5783, + "step": 9145 + }, + { + "epoch": 0.2031054039355834, + "grad_norm": 1.1734461784362793, + "learning_rate": 1.8032399845851896e-05, + "loss": 0.6365, + "step": 9150 + }, + { + "epoch": 0.20321639049511103, + "grad_norm": 1.4227900505065918, + "learning_rate": 1.8030322435219117e-05, + "loss": 0.4876, + "step": 9155 + }, + { + "epoch": 0.20332737705463869, + "grad_norm": 0.7563652396202087, + "learning_rate": 1.8028244048286663e-05, + "loss": 0.4521, + "step": 9160 + }, + { + "epoch": 0.2034383636141663, + "grad_norm": 1.1069258451461792, + "learning_rate": 1.8026164685307224e-05, + "loss": 0.583, + "step": 9165 + }, + { + "epoch": 0.20354935017369397, + "grad_norm": 1.4049474000930786, + "learning_rate": 1.8024084346533598e-05, + "loss": 0.4168, + "step": 9170 + }, + { + "epoch": 0.20366033673322162, + "grad_norm": 1.0486278533935547, + "learning_rate": 1.802200303221871e-05, + "loss": 0.3907, + "step": 9175 + }, + { + "epoch": 0.20377132329274925, + "grad_norm": 2.22003173828125, + "learning_rate": 1.8019920742615596e-05, + "loss": 0.4856, + "step": 9180 + }, + { + "epoch": 0.2038823098522769, + "grad_norm": 1.4012434482574463, + "learning_rate": 1.8017837477977416e-05, + "loss": 0.5096, + "step": 9185 + }, + { + "epoch": 0.20399329641180453, + "grad_norm": 0.994441568851471, + "learning_rate": 1.8015753238557444e-05, + "loss": 0.4237, + "step": 9190 + }, + { + "epoch": 0.20410428297133218, + "grad_norm": 1.2336442470550537, + "learning_rate": 1.8013668024609078e-05, + "loss": 0.479, + "step": 9195 + }, + { + "epoch": 0.2042152695308598, + "grad_norm": 0.9091594815254211, + "learning_rate": 1.8011581836385828e-05, + "loss": 0.5212, + "step": 9200 + }, + { + "epoch": 0.20432625609038746, + "grad_norm": 1.4611879587173462, + "learning_rate": 1.8009494674141327e-05, + "loss": 0.5236, + "step": 9205 + }, + { + "epoch": 0.20443724264991509, + "grad_norm": 1.1127291917800903, + "learning_rate": 1.800740653812932e-05, + "loss": 0.5092, + "step": 9210 + }, + { + "epoch": 0.20454822920944274, + "grad_norm": 1.1224825382232666, + "learning_rate": 1.8005317428603687e-05, + "loss": 0.4901, + "step": 9215 + }, + { + "epoch": 0.20465921576897037, + "grad_norm": 1.2642055749893188, + "learning_rate": 1.8003227345818407e-05, + "loss": 0.6672, + "step": 9220 + }, + { + "epoch": 0.20477020232849802, + "grad_norm": 1.1066789627075195, + "learning_rate": 1.800113629002759e-05, + "loss": 0.4579, + "step": 9225 + }, + { + "epoch": 0.20488118888802567, + "grad_norm": 0.9728310108184814, + "learning_rate": 1.7999044261485453e-05, + "loss": 0.3675, + "step": 9230 + }, + { + "epoch": 0.2049921754475533, + "grad_norm": 1.205916404724121, + "learning_rate": 1.799695126044634e-05, + "loss": 0.4924, + "step": 9235 + }, + { + "epoch": 0.20510316200708095, + "grad_norm": 1.0212023258209229, + "learning_rate": 1.799485728716472e-05, + "loss": 0.4738, + "step": 9240 + }, + { + "epoch": 0.20521414856660858, + "grad_norm": 1.249037742614746, + "learning_rate": 1.7992762341895157e-05, + "loss": 0.4826, + "step": 9245 + }, + { + "epoch": 0.20532513512613623, + "grad_norm": 1.789096713066101, + "learning_rate": 1.7990666424892354e-05, + "loss": 0.4677, + "step": 9250 + }, + { + "epoch": 0.20543612168566386, + "grad_norm": 1.6068724393844604, + "learning_rate": 1.798856953641113e-05, + "loss": 0.4278, + "step": 9255 + }, + { + "epoch": 0.2055471082451915, + "grad_norm": 1.5494028329849243, + "learning_rate": 1.798647167670641e-05, + "loss": 0.4621, + "step": 9260 + }, + { + "epoch": 0.20565809480471914, + "grad_norm": 0.8551934957504272, + "learning_rate": 1.7984372846033252e-05, + "loss": 0.5474, + "step": 9265 + }, + { + "epoch": 0.2057690813642468, + "grad_norm": 1.3189067840576172, + "learning_rate": 1.7982273044646817e-05, + "loss": 0.4884, + "step": 9270 + }, + { + "epoch": 0.20588006792377442, + "grad_norm": 1.0660463571548462, + "learning_rate": 1.7980172272802398e-05, + "loss": 0.5418, + "step": 9275 + }, + { + "epoch": 0.20599105448330207, + "grad_norm": 1.1619466543197632, + "learning_rate": 1.7978070530755393e-05, + "loss": 0.4406, + "step": 9280 + }, + { + "epoch": 0.20610204104282973, + "grad_norm": 1.0269361734390259, + "learning_rate": 1.7975967818761334e-05, + "loss": 0.4529, + "step": 9285 + }, + { + "epoch": 0.20621302760235735, + "grad_norm": 1.061600923538208, + "learning_rate": 1.7973864137075856e-05, + "loss": 0.4159, + "step": 9290 + }, + { + "epoch": 0.206324014161885, + "grad_norm": 1.8789093494415283, + "learning_rate": 1.797175948595472e-05, + "loss": 0.5636, + "step": 9295 + }, + { + "epoch": 0.20643500072141263, + "grad_norm": 1.2407394647598267, + "learning_rate": 1.7969653865653794e-05, + "loss": 0.5232, + "step": 9300 + }, + { + "epoch": 0.20654598728094029, + "grad_norm": 1.0408992767333984, + "learning_rate": 1.7967547276429086e-05, + "loss": 0.5879, + "step": 9305 + }, + { + "epoch": 0.2066569738404679, + "grad_norm": 1.20352041721344, + "learning_rate": 1.79654397185367e-05, + "loss": 0.4962, + "step": 9310 + }, + { + "epoch": 0.20676796039999557, + "grad_norm": 1.2517502307891846, + "learning_rate": 1.7963331192232863e-05, + "loss": 0.3936, + "step": 9315 + }, + { + "epoch": 0.2068789469595232, + "grad_norm": 1.6825429201126099, + "learning_rate": 1.7961221697773932e-05, + "loss": 0.5146, + "step": 9320 + }, + { + "epoch": 0.20698993351905084, + "grad_norm": 1.1055762767791748, + "learning_rate": 1.7959111235416364e-05, + "loss": 0.5971, + "step": 9325 + }, + { + "epoch": 0.20710092007857847, + "grad_norm": 1.408706784248352, + "learning_rate": 1.7956999805416746e-05, + "loss": 0.4739, + "step": 9330 + }, + { + "epoch": 0.20721190663810612, + "grad_norm": 1.5727156400680542, + "learning_rate": 1.7954887408031777e-05, + "loss": 0.4651, + "step": 9335 + }, + { + "epoch": 0.20732289319763378, + "grad_norm": 1.158872365951538, + "learning_rate": 1.7952774043518273e-05, + "loss": 0.4464, + "step": 9340 + }, + { + "epoch": 0.2074338797571614, + "grad_norm": 1.1882591247558594, + "learning_rate": 1.7950659712133178e-05, + "loss": 0.5427, + "step": 9345 + }, + { + "epoch": 0.20754486631668906, + "grad_norm": 0.9776345491409302, + "learning_rate": 1.7948544414133534e-05, + "loss": 0.5132, + "step": 9350 + }, + { + "epoch": 0.20765585287621668, + "grad_norm": 1.3551688194274902, + "learning_rate": 1.794642814977652e-05, + "loss": 0.5469, + "step": 9355 + }, + { + "epoch": 0.20776683943574434, + "grad_norm": 1.2750028371810913, + "learning_rate": 1.794431091931942e-05, + "loss": 0.4309, + "step": 9360 + }, + { + "epoch": 0.20787782599527196, + "grad_norm": 1.3404597043991089, + "learning_rate": 1.7942192723019643e-05, + "loss": 0.6556, + "step": 9365 + }, + { + "epoch": 0.20798881255479962, + "grad_norm": 1.260688066482544, + "learning_rate": 1.7940073561134713e-05, + "loss": 0.5327, + "step": 9370 + }, + { + "epoch": 0.20809979911432724, + "grad_norm": 1.068941354751587, + "learning_rate": 1.7937953433922265e-05, + "loss": 0.5154, + "step": 9375 + }, + { + "epoch": 0.2082107856738549, + "grad_norm": 0.9720483422279358, + "learning_rate": 1.793583234164006e-05, + "loss": 0.3935, + "step": 9380 + }, + { + "epoch": 0.20832177223338252, + "grad_norm": 1.0248860120773315, + "learning_rate": 1.793371028454598e-05, + "loss": 0.6002, + "step": 9385 + }, + { + "epoch": 0.20843275879291018, + "grad_norm": 1.149561882019043, + "learning_rate": 1.7931587262898004e-05, + "loss": 0.6452, + "step": 9390 + }, + { + "epoch": 0.20854374535243783, + "grad_norm": 1.3280117511749268, + "learning_rate": 1.792946327695425e-05, + "loss": 0.575, + "step": 9395 + }, + { + "epoch": 0.20865473191196546, + "grad_norm": 1.1656508445739746, + "learning_rate": 1.7927338326972947e-05, + "loss": 0.4884, + "step": 9400 + }, + { + "epoch": 0.2087657184714931, + "grad_norm": 1.3832805156707764, + "learning_rate": 1.7925212413212435e-05, + "loss": 0.4673, + "step": 9405 + }, + { + "epoch": 0.20887670503102074, + "grad_norm": 1.300185203552246, + "learning_rate": 1.7923085535931176e-05, + "loss": 0.4559, + "step": 9410 + }, + { + "epoch": 0.2089876915905484, + "grad_norm": 1.452893614768982, + "learning_rate": 1.792095769538775e-05, + "loss": 0.6083, + "step": 9415 + }, + { + "epoch": 0.20909867815007602, + "grad_norm": 1.1308960914611816, + "learning_rate": 1.7918828891840853e-05, + "loss": 0.3727, + "step": 9420 + }, + { + "epoch": 0.20920966470960367, + "grad_norm": 1.103422999382019, + "learning_rate": 1.79166991255493e-05, + "loss": 0.3363, + "step": 9425 + }, + { + "epoch": 0.2093206512691313, + "grad_norm": 1.3565524816513062, + "learning_rate": 1.791456839677201e-05, + "loss": 0.5528, + "step": 9430 + }, + { + "epoch": 0.20943163782865895, + "grad_norm": 0.6596664786338806, + "learning_rate": 1.7912436705768045e-05, + "loss": 0.423, + "step": 9435 + }, + { + "epoch": 0.20954262438818658, + "grad_norm": 1.1548718214035034, + "learning_rate": 1.7910304052796558e-05, + "loss": 0.5097, + "step": 9440 + }, + { + "epoch": 0.20965361094771423, + "grad_norm": 1.3421484231948853, + "learning_rate": 1.7908170438116835e-05, + "loss": 0.4689, + "step": 9445 + }, + { + "epoch": 0.20976459750724188, + "grad_norm": 0.8850771188735962, + "learning_rate": 1.790603586198827e-05, + "loss": 0.4149, + "step": 9450 + }, + { + "epoch": 0.2098755840667695, + "grad_norm": 1.0380505323410034, + "learning_rate": 1.790390032467038e-05, + "loss": 0.5725, + "step": 9455 + }, + { + "epoch": 0.20998657062629716, + "grad_norm": 1.1664137840270996, + "learning_rate": 1.7901763826422797e-05, + "loss": 0.5316, + "step": 9460 + }, + { + "epoch": 0.2100975571858248, + "grad_norm": 1.3658771514892578, + "learning_rate": 1.7899626367505266e-05, + "loss": 0.4926, + "step": 9465 + }, + { + "epoch": 0.21020854374535244, + "grad_norm": 1.3364830017089844, + "learning_rate": 1.789748794817766e-05, + "loss": 0.599, + "step": 9470 + }, + { + "epoch": 0.21031953030488007, + "grad_norm": 1.1915723085403442, + "learning_rate": 1.7895348568699953e-05, + "loss": 0.5151, + "step": 9475 + }, + { + "epoch": 0.21043051686440772, + "grad_norm": 1.026668667793274, + "learning_rate": 1.7893208229332245e-05, + "loss": 0.5308, + "step": 9480 + }, + { + "epoch": 0.21054150342393535, + "grad_norm": 1.2627990245819092, + "learning_rate": 1.789106693033475e-05, + "loss": 0.5808, + "step": 9485 + }, + { + "epoch": 0.210652489983463, + "grad_norm": 1.0648430585861206, + "learning_rate": 1.7888924671967808e-05, + "loss": 0.4118, + "step": 9490 + }, + { + "epoch": 0.21076347654299066, + "grad_norm": 1.0836087465286255, + "learning_rate": 1.7886781454491856e-05, + "loss": 0.637, + "step": 9495 + }, + { + "epoch": 0.21087446310251828, + "grad_norm": 0.976684033870697, + "learning_rate": 1.788463727816747e-05, + "loss": 0.3573, + "step": 9500 + }, + { + "epoch": 0.21098544966204594, + "grad_norm": 1.099665641784668, + "learning_rate": 1.7882492143255323e-05, + "loss": 0.5833, + "step": 9505 + }, + { + "epoch": 0.21109643622157356, + "grad_norm": 1.015887975692749, + "learning_rate": 1.788034605001622e-05, + "loss": 0.49, + "step": 9510 + }, + { + "epoch": 0.21120742278110122, + "grad_norm": 1.1677489280700684, + "learning_rate": 1.7878198998711068e-05, + "loss": 0.2998, + "step": 9515 + }, + { + "epoch": 0.21131840934062884, + "grad_norm": 1.1017683744430542, + "learning_rate": 1.7876050989600908e-05, + "loss": 0.4565, + "step": 9520 + }, + { + "epoch": 0.2114293959001565, + "grad_norm": 1.3350118398666382, + "learning_rate": 1.7873902022946882e-05, + "loss": 0.3332, + "step": 9525 + }, + { + "epoch": 0.21154038245968412, + "grad_norm": 1.3715494871139526, + "learning_rate": 1.7871752099010256e-05, + "loss": 0.4485, + "step": 9530 + }, + { + "epoch": 0.21165136901921178, + "grad_norm": 1.1493537425994873, + "learning_rate": 1.7869601218052405e-05, + "loss": 0.4861, + "step": 9535 + }, + { + "epoch": 0.2117623555787394, + "grad_norm": 1.5250881910324097, + "learning_rate": 1.7867449380334834e-05, + "loss": 0.625, + "step": 9540 + }, + { + "epoch": 0.21187334213826706, + "grad_norm": 1.3963158130645752, + "learning_rate": 1.786529658611915e-05, + "loss": 0.546, + "step": 9545 + }, + { + "epoch": 0.2119843286977947, + "grad_norm": 1.4127143621444702, + "learning_rate": 1.786314283566709e-05, + "loss": 0.5782, + "step": 9550 + }, + { + "epoch": 0.21209531525732234, + "grad_norm": 1.5299954414367676, + "learning_rate": 1.786098812924049e-05, + "loss": 0.6186, + "step": 9555 + }, + { + "epoch": 0.21220630181685, + "grad_norm": 1.4167520999908447, + "learning_rate": 1.785883246710132e-05, + "loss": 0.4006, + "step": 9560 + }, + { + "epoch": 0.21231728837637762, + "grad_norm": 0.9981635212898254, + "learning_rate": 1.7856675849511657e-05, + "loss": 0.5463, + "step": 9565 + }, + { + "epoch": 0.21242827493590527, + "grad_norm": 1.4359947443008423, + "learning_rate": 1.785451827673369e-05, + "loss": 0.5462, + "step": 9570 + }, + { + "epoch": 0.2125392614954329, + "grad_norm": 1.2355036735534668, + "learning_rate": 1.7852359749029734e-05, + "loss": 0.5466, + "step": 9575 + }, + { + "epoch": 0.21265024805496055, + "grad_norm": 1.7637877464294434, + "learning_rate": 1.7850200266662212e-05, + "loss": 0.5638, + "step": 9580 + }, + { + "epoch": 0.21276123461448818, + "grad_norm": 1.4671874046325684, + "learning_rate": 1.7848039829893672e-05, + "loss": 0.4638, + "step": 9585 + }, + { + "epoch": 0.21287222117401583, + "grad_norm": 1.0217411518096924, + "learning_rate": 1.784587843898677e-05, + "loss": 0.6062, + "step": 9590 + }, + { + "epoch": 0.21298320773354346, + "grad_norm": 1.0460752248764038, + "learning_rate": 1.784371609420428e-05, + "loss": 0.5773, + "step": 9595 + }, + { + "epoch": 0.2130941942930711, + "grad_norm": 1.8018296957015991, + "learning_rate": 1.7841552795809095e-05, + "loss": 0.4543, + "step": 9600 + }, + { + "epoch": 0.21320518085259876, + "grad_norm": 0.9428887367248535, + "learning_rate": 1.7839388544064215e-05, + "loss": 0.4272, + "step": 9605 + }, + { + "epoch": 0.2133161674121264, + "grad_norm": 0.9210885763168335, + "learning_rate": 1.7837223339232767e-05, + "loss": 0.5215, + "step": 9610 + }, + { + "epoch": 0.21342715397165404, + "grad_norm": 1.9459435939788818, + "learning_rate": 1.7835057181577996e-05, + "loss": 0.6155, + "step": 9615 + }, + { + "epoch": 0.21353814053118167, + "grad_norm": 1.2487971782684326, + "learning_rate": 1.7832890071363243e-05, + "loss": 0.5127, + "step": 9620 + }, + { + "epoch": 0.21364912709070932, + "grad_norm": 0.7779967784881592, + "learning_rate": 1.7830722008851988e-05, + "loss": 0.4642, + "step": 9625 + }, + { + "epoch": 0.21376011365023695, + "grad_norm": 1.3998373746871948, + "learning_rate": 1.7828552994307812e-05, + "loss": 0.6303, + "step": 9630 + }, + { + "epoch": 0.2138711002097646, + "grad_norm": 1.4580212831497192, + "learning_rate": 1.7826383027994415e-05, + "loss": 0.4693, + "step": 9635 + }, + { + "epoch": 0.21398208676929223, + "grad_norm": 1.251250147819519, + "learning_rate": 1.7824212110175623e-05, + "loss": 0.4477, + "step": 9640 + }, + { + "epoch": 0.21409307332881988, + "grad_norm": 1.3137695789337158, + "learning_rate": 1.7822040241115358e-05, + "loss": 0.4949, + "step": 9645 + }, + { + "epoch": 0.2142040598883475, + "grad_norm": 1.1955955028533936, + "learning_rate": 1.7819867421077678e-05, + "loss": 0.4285, + "step": 9650 + }, + { + "epoch": 0.21431504644787516, + "grad_norm": 0.9138591289520264, + "learning_rate": 1.781769365032674e-05, + "loss": 0.5015, + "step": 9655 + }, + { + "epoch": 0.21442603300740282, + "grad_norm": 1.0733075141906738, + "learning_rate": 1.7815518929126827e-05, + "loss": 0.5615, + "step": 9660 + }, + { + "epoch": 0.21453701956693044, + "grad_norm": 1.475874423980713, + "learning_rate": 1.7813343257742333e-05, + "loss": 0.4636, + "step": 9665 + }, + { + "epoch": 0.2146480061264581, + "grad_norm": 1.453596830368042, + "learning_rate": 1.7811166636437775e-05, + "loss": 0.4062, + "step": 9670 + }, + { + "epoch": 0.21475899268598572, + "grad_norm": 1.1406927108764648, + "learning_rate": 1.7808989065477766e-05, + "loss": 0.6224, + "step": 9675 + }, + { + "epoch": 0.21486997924551338, + "grad_norm": 1.306417465209961, + "learning_rate": 1.780681054512706e-05, + "loss": 0.6121, + "step": 9680 + }, + { + "epoch": 0.214980965805041, + "grad_norm": 1.2868911027908325, + "learning_rate": 1.780463107565051e-05, + "loss": 0.5233, + "step": 9685 + }, + { + "epoch": 0.21509195236456866, + "grad_norm": 1.5072758197784424, + "learning_rate": 1.7802450657313086e-05, + "loss": 0.3335, + "step": 9690 + }, + { + "epoch": 0.21520293892409628, + "grad_norm": 1.284999966621399, + "learning_rate": 1.780026929037988e-05, + "loss": 0.4095, + "step": 9695 + }, + { + "epoch": 0.21531392548362394, + "grad_norm": 1.141349196434021, + "learning_rate": 1.7798086975116096e-05, + "loss": 0.4452, + "step": 9700 + }, + { + "epoch": 0.21542491204315156, + "grad_norm": 1.745187759399414, + "learning_rate": 1.7795903711787046e-05, + "loss": 0.5534, + "step": 9705 + }, + { + "epoch": 0.21553589860267922, + "grad_norm": 1.3602417707443237, + "learning_rate": 1.779371950065817e-05, + "loss": 0.5419, + "step": 9710 + }, + { + "epoch": 0.21564688516220687, + "grad_norm": 1.2203381061553955, + "learning_rate": 1.7791534341995018e-05, + "loss": 0.5834, + "step": 9715 + }, + { + "epoch": 0.2157578717217345, + "grad_norm": 1.0616192817687988, + "learning_rate": 1.7789348236063245e-05, + "loss": 0.4124, + "step": 9720 + }, + { + "epoch": 0.21586885828126215, + "grad_norm": 1.2460315227508545, + "learning_rate": 1.7787161183128643e-05, + "loss": 0.5519, + "step": 9725 + }, + { + "epoch": 0.21597984484078978, + "grad_norm": 1.378010869026184, + "learning_rate": 1.7784973183457097e-05, + "loss": 0.5782, + "step": 9730 + }, + { + "epoch": 0.21609083140031743, + "grad_norm": 1.131535291671753, + "learning_rate": 1.778278423731462e-05, + "loss": 0.5816, + "step": 9735 + }, + { + "epoch": 0.21620181795984506, + "grad_norm": 1.2070375680923462, + "learning_rate": 1.778059434496734e-05, + "loss": 0.3977, + "step": 9740 + }, + { + "epoch": 0.2163128045193727, + "grad_norm": 1.092206358909607, + "learning_rate": 1.7778403506681493e-05, + "loss": 0.4687, + "step": 9745 + }, + { + "epoch": 0.21642379107890033, + "grad_norm": 1.1560908555984497, + "learning_rate": 1.7776211722723437e-05, + "loss": 0.4611, + "step": 9750 + }, + { + "epoch": 0.216534777638428, + "grad_norm": 0.817129909992218, + "learning_rate": 1.7774018993359633e-05, + "loss": 0.4608, + "step": 9755 + }, + { + "epoch": 0.21664576419795561, + "grad_norm": 1.1447831392288208, + "learning_rate": 1.7771825318856676e-05, + "loss": 0.5585, + "step": 9760 + }, + { + "epoch": 0.21675675075748327, + "grad_norm": 1.21500825881958, + "learning_rate": 1.776963069948126e-05, + "loss": 0.485, + "step": 9765 + }, + { + "epoch": 0.21686773731701092, + "grad_norm": 0.931507408618927, + "learning_rate": 1.77674351355002e-05, + "loss": 0.4524, + "step": 9770 + }, + { + "epoch": 0.21697872387653855, + "grad_norm": 1.2368406057357788, + "learning_rate": 1.7765238627180424e-05, + "loss": 0.5286, + "step": 9775 + }, + { + "epoch": 0.2170897104360662, + "grad_norm": 1.8320626020431519, + "learning_rate": 1.7763041174788984e-05, + "loss": 0.5937, + "step": 9780 + }, + { + "epoch": 0.21720069699559383, + "grad_norm": 0.9292985796928406, + "learning_rate": 1.7760842778593027e-05, + "loss": 0.4569, + "step": 9785 + }, + { + "epoch": 0.21731168355512148, + "grad_norm": 1.2462888956069946, + "learning_rate": 1.7758643438859836e-05, + "loss": 0.5901, + "step": 9790 + }, + { + "epoch": 0.2174226701146491, + "grad_norm": 1.6621441841125488, + "learning_rate": 1.7756443155856796e-05, + "loss": 0.6449, + "step": 9795 + }, + { + "epoch": 0.21753365667417676, + "grad_norm": 0.8483054041862488, + "learning_rate": 1.7754241929851413e-05, + "loss": 0.5905, + "step": 9800 + }, + { + "epoch": 0.2176446432337044, + "grad_norm": 0.9630061388015747, + "learning_rate": 1.77520397611113e-05, + "loss": 0.3271, + "step": 9805 + }, + { + "epoch": 0.21775562979323204, + "grad_norm": 0.8938013315200806, + "learning_rate": 1.7749836649904192e-05, + "loss": 0.4271, + "step": 9810 + }, + { + "epoch": 0.21786661635275967, + "grad_norm": 1.181822657585144, + "learning_rate": 1.7747632596497932e-05, + "loss": 0.537, + "step": 9815 + }, + { + "epoch": 0.21797760291228732, + "grad_norm": 0.8262178897857666, + "learning_rate": 1.7745427601160487e-05, + "loss": 0.5994, + "step": 9820 + }, + { + "epoch": 0.21808858947181498, + "grad_norm": 1.1472630500793457, + "learning_rate": 1.7743221664159927e-05, + "loss": 0.5166, + "step": 9825 + }, + { + "epoch": 0.2181995760313426, + "grad_norm": 1.1295238733291626, + "learning_rate": 1.774101478576445e-05, + "loss": 0.6013, + "step": 9830 + }, + { + "epoch": 0.21831056259087026, + "grad_norm": 1.3434393405914307, + "learning_rate": 1.7738806966242355e-05, + "loss": 0.5125, + "step": 9835 + }, + { + "epoch": 0.21842154915039788, + "grad_norm": 0.963297426700592, + "learning_rate": 1.7736598205862064e-05, + "loss": 0.6378, + "step": 9840 + }, + { + "epoch": 0.21853253570992554, + "grad_norm": 1.5868712663650513, + "learning_rate": 1.773438850489211e-05, + "loss": 0.4859, + "step": 9845 + }, + { + "epoch": 0.21864352226945316, + "grad_norm": 1.6960935592651367, + "learning_rate": 1.7732177863601135e-05, + "loss": 0.4339, + "step": 9850 + }, + { + "epoch": 0.21875450882898081, + "grad_norm": 1.0453115701675415, + "learning_rate": 1.7729966282257912e-05, + "loss": 0.4424, + "step": 9855 + }, + { + "epoch": 0.21886549538850844, + "grad_norm": 0.9868830442428589, + "learning_rate": 1.7727753761131312e-05, + "loss": 0.4167, + "step": 9860 + }, + { + "epoch": 0.2189764819480361, + "grad_norm": 1.3542011976242065, + "learning_rate": 1.7725540300490326e-05, + "loss": 0.4964, + "step": 9865 + }, + { + "epoch": 0.21908746850756372, + "grad_norm": 0.8640686273574829, + "learning_rate": 1.7723325900604063e-05, + "loss": 0.4559, + "step": 9870 + }, + { + "epoch": 0.21919845506709137, + "grad_norm": 1.0085946321487427, + "learning_rate": 1.7721110561741737e-05, + "loss": 0.3331, + "step": 9875 + }, + { + "epoch": 0.21930944162661903, + "grad_norm": 1.2680177688598633, + "learning_rate": 1.7718894284172684e-05, + "loss": 0.5908, + "step": 9880 + }, + { + "epoch": 0.21942042818614665, + "grad_norm": 1.1593209505081177, + "learning_rate": 1.771667706816635e-05, + "loss": 0.6322, + "step": 9885 + }, + { + "epoch": 0.2195314147456743, + "grad_norm": 1.3372281789779663, + "learning_rate": 1.7714458913992297e-05, + "loss": 0.5001, + "step": 9890 + }, + { + "epoch": 0.21964240130520193, + "grad_norm": 1.2422916889190674, + "learning_rate": 1.7712239821920202e-05, + "loss": 0.499, + "step": 9895 + }, + { + "epoch": 0.2197533878647296, + "grad_norm": 1.1619045734405518, + "learning_rate": 1.7710019792219856e-05, + "loss": 0.5293, + "step": 9900 + }, + { + "epoch": 0.21986437442425721, + "grad_norm": 0.9704828262329102, + "learning_rate": 1.7707798825161155e-05, + "loss": 0.4543, + "step": 9905 + }, + { + "epoch": 0.21997536098378487, + "grad_norm": 1.4115480184555054, + "learning_rate": 1.770557692101413e-05, + "loss": 0.3108, + "step": 9910 + }, + { + "epoch": 0.2200863475433125, + "grad_norm": 0.9354934692382812, + "learning_rate": 1.77033540800489e-05, + "loss": 0.5608, + "step": 9915 + }, + { + "epoch": 0.22019733410284015, + "grad_norm": 1.1399041414260864, + "learning_rate": 1.770113030253572e-05, + "loss": 0.4043, + "step": 9920 + }, + { + "epoch": 0.22030832066236777, + "grad_norm": 1.5980947017669678, + "learning_rate": 1.7698905588744946e-05, + "loss": 0.4469, + "step": 9925 + }, + { + "epoch": 0.22041930722189543, + "grad_norm": 1.166395664215088, + "learning_rate": 1.769667993894705e-05, + "loss": 0.4852, + "step": 9930 + }, + { + "epoch": 0.22053029378142308, + "grad_norm": 0.9159113168716431, + "learning_rate": 1.7694453353412618e-05, + "loss": 0.6554, + "step": 9935 + }, + { + "epoch": 0.2206412803409507, + "grad_norm": 1.1559749841690063, + "learning_rate": 1.7692225832412354e-05, + "loss": 0.4948, + "step": 9940 + }, + { + "epoch": 0.22075226690047836, + "grad_norm": 1.3015704154968262, + "learning_rate": 1.768999737621707e-05, + "loss": 0.7356, + "step": 9945 + }, + { + "epoch": 0.220863253460006, + "grad_norm": 2.7410922050476074, + "learning_rate": 1.7687767985097695e-05, + "loss": 0.4758, + "step": 9950 + }, + { + "epoch": 0.22097424001953364, + "grad_norm": 1.1932713985443115, + "learning_rate": 1.7685537659325272e-05, + "loss": 0.4975, + "step": 9955 + }, + { + "epoch": 0.22108522657906127, + "grad_norm": 1.03585946559906, + "learning_rate": 1.768330639917095e-05, + "loss": 0.4919, + "step": 9960 + }, + { + "epoch": 0.22119621313858892, + "grad_norm": 1.0269675254821777, + "learning_rate": 1.7681074204906013e-05, + "loss": 0.5063, + "step": 9965 + }, + { + "epoch": 0.22130719969811655, + "grad_norm": 1.6854188442230225, + "learning_rate": 1.767884107680183e-05, + "loss": 0.5432, + "step": 9970 + }, + { + "epoch": 0.2214181862576442, + "grad_norm": 1.463315725326538, + "learning_rate": 1.7676607015129904e-05, + "loss": 0.6055, + "step": 9975 + }, + { + "epoch": 0.22152917281717183, + "grad_norm": 1.0149184465408325, + "learning_rate": 1.767437202016184e-05, + "loss": 0.3995, + "step": 9980 + }, + { + "epoch": 0.22164015937669948, + "grad_norm": 1.0655211210250854, + "learning_rate": 1.767213609216936e-05, + "loss": 0.4974, + "step": 9985 + }, + { + "epoch": 0.22175114593622713, + "grad_norm": 0.9974083304405212, + "learning_rate": 1.766989923142431e-05, + "loss": 0.4742, + "step": 9990 + }, + { + "epoch": 0.22186213249575476, + "grad_norm": 1.0866353511810303, + "learning_rate": 1.7667661438198635e-05, + "loss": 0.5871, + "step": 9995 + }, + { + "epoch": 0.22197311905528241, + "grad_norm": 0.9190327525138855, + "learning_rate": 1.7665422712764394e-05, + "loss": 0.4662, + "step": 10000 + }, + { + "epoch": 0.22208410561481004, + "grad_norm": 1.1974272727966309, + "learning_rate": 1.766318305539377e-05, + "loss": 0.5835, + "step": 10005 + }, + { + "epoch": 0.2221950921743377, + "grad_norm": 1.0719481706619263, + "learning_rate": 1.766094246635905e-05, + "loss": 0.4291, + "step": 10010 + }, + { + "epoch": 0.22230607873386532, + "grad_norm": 0.9069860577583313, + "learning_rate": 1.7658700945932637e-05, + "loss": 0.4985, + "step": 10015 + }, + { + "epoch": 0.22241706529339297, + "grad_norm": 1.5030319690704346, + "learning_rate": 1.7656458494387047e-05, + "loss": 0.4545, + "step": 10020 + }, + { + "epoch": 0.2225280518529206, + "grad_norm": 1.5551732778549194, + "learning_rate": 1.7654215111994912e-05, + "loss": 0.454, + "step": 10025 + }, + { + "epoch": 0.22263903841244825, + "grad_norm": 0.9678833484649658, + "learning_rate": 1.7651970799028976e-05, + "loss": 0.4953, + "step": 10030 + }, + { + "epoch": 0.2227500249719759, + "grad_norm": 1.0727559328079224, + "learning_rate": 1.764972555576209e-05, + "loss": 0.6616, + "step": 10035 + }, + { + "epoch": 0.22286101153150353, + "grad_norm": 1.8546661138534546, + "learning_rate": 1.7647479382467227e-05, + "loss": 0.6262, + "step": 10040 + }, + { + "epoch": 0.2229719980910312, + "grad_norm": 1.2231254577636719, + "learning_rate": 1.764523227941747e-05, + "loss": 0.56, + "step": 10045 + }, + { + "epoch": 0.2230829846505588, + "grad_norm": 0.9730484485626221, + "learning_rate": 1.764298424688601e-05, + "loss": 0.5865, + "step": 10050 + }, + { + "epoch": 0.22319397121008647, + "grad_norm": 1.9160155057907104, + "learning_rate": 1.764073528514616e-05, + "loss": 0.6857, + "step": 10055 + }, + { + "epoch": 0.2233049577696141, + "grad_norm": 0.9943063259124756, + "learning_rate": 1.7638485394471337e-05, + "loss": 0.4019, + "step": 10060 + }, + { + "epoch": 0.22341594432914175, + "grad_norm": 1.3601323366165161, + "learning_rate": 1.7636234575135082e-05, + "loss": 0.3918, + "step": 10065 + }, + { + "epoch": 0.22352693088866937, + "grad_norm": 1.2443233728408813, + "learning_rate": 1.763398282741103e-05, + "loss": 0.5831, + "step": 10070 + }, + { + "epoch": 0.22363791744819703, + "grad_norm": 0.9492146968841553, + "learning_rate": 1.7631730151572952e-05, + "loss": 0.4517, + "step": 10075 + }, + { + "epoch": 0.22374890400772465, + "grad_norm": 1.3512073755264282, + "learning_rate": 1.7629476547894716e-05, + "loss": 0.5045, + "step": 10080 + }, + { + "epoch": 0.2238598905672523, + "grad_norm": 1.1920839548110962, + "learning_rate": 1.7627222016650313e-05, + "loss": 0.6147, + "step": 10085 + }, + { + "epoch": 0.22397087712677996, + "grad_norm": 1.264012098312378, + "learning_rate": 1.7624966558113833e-05, + "loss": 0.4869, + "step": 10090 + }, + { + "epoch": 0.2240818636863076, + "grad_norm": 1.0589563846588135, + "learning_rate": 1.762271017255949e-05, + "loss": 0.4367, + "step": 10095 + }, + { + "epoch": 0.22419285024583524, + "grad_norm": 1.3212761878967285, + "learning_rate": 1.7620452860261614e-05, + "loss": 0.488, + "step": 10100 + }, + { + "epoch": 0.22430383680536287, + "grad_norm": 1.3312410116195679, + "learning_rate": 1.761819462149463e-05, + "loss": 0.4991, + "step": 10105 + }, + { + "epoch": 0.22441482336489052, + "grad_norm": 1.1420173645019531, + "learning_rate": 1.7615935456533094e-05, + "loss": 0.4348, + "step": 10110 + }, + { + "epoch": 0.22452580992441815, + "grad_norm": 1.0570411682128906, + "learning_rate": 1.761367536565167e-05, + "loss": 0.4494, + "step": 10115 + }, + { + "epoch": 0.2246367964839458, + "grad_norm": 1.0421570539474487, + "learning_rate": 1.7611414349125128e-05, + "loss": 0.4582, + "step": 10120 + }, + { + "epoch": 0.22474778304347343, + "grad_norm": 0.9665072560310364, + "learning_rate": 1.7609152407228358e-05, + "loss": 0.5182, + "step": 10125 + }, + { + "epoch": 0.22485876960300108, + "grad_norm": 1.0663903951644897, + "learning_rate": 1.7606889540236352e-05, + "loss": 0.4897, + "step": 10130 + }, + { + "epoch": 0.2249697561625287, + "grad_norm": 1.3296186923980713, + "learning_rate": 1.7604625748424237e-05, + "loss": 0.5323, + "step": 10135 + }, + { + "epoch": 0.22508074272205636, + "grad_norm": 1.2479513883590698, + "learning_rate": 1.760236103206722e-05, + "loss": 0.5191, + "step": 10140 + }, + { + "epoch": 0.225191729281584, + "grad_norm": 1.2304879426956177, + "learning_rate": 1.7600095391440645e-05, + "loss": 0.4828, + "step": 10145 + }, + { + "epoch": 0.22530271584111164, + "grad_norm": 1.3983750343322754, + "learning_rate": 1.7597828826819966e-05, + "loss": 0.5423, + "step": 10150 + }, + { + "epoch": 0.2254137024006393, + "grad_norm": 0.6361042261123657, + "learning_rate": 1.7595561338480733e-05, + "loss": 0.3157, + "step": 10155 + }, + { + "epoch": 0.22552468896016692, + "grad_norm": 0.9633803367614746, + "learning_rate": 1.759329292669863e-05, + "loss": 0.4553, + "step": 10160 + }, + { + "epoch": 0.22563567551969457, + "grad_norm": 1.2229855060577393, + "learning_rate": 1.7591023591749436e-05, + "loss": 0.4053, + "step": 10165 + }, + { + "epoch": 0.2257466620792222, + "grad_norm": 1.512622356414795, + "learning_rate": 1.7588753333909053e-05, + "loss": 0.3636, + "step": 10170 + }, + { + "epoch": 0.22585764863874985, + "grad_norm": 1.1153056621551514, + "learning_rate": 1.7586482153453492e-05, + "loss": 0.6119, + "step": 10175 + }, + { + "epoch": 0.22596863519827748, + "grad_norm": 1.1370549201965332, + "learning_rate": 1.7584210050658873e-05, + "loss": 0.6283, + "step": 10180 + }, + { + "epoch": 0.22607962175780513, + "grad_norm": 1.1045695543289185, + "learning_rate": 1.7581937025801433e-05, + "loss": 0.2812, + "step": 10185 + }, + { + "epoch": 0.22619060831733276, + "grad_norm": 1.1070390939712524, + "learning_rate": 1.757966307915752e-05, + "loss": 0.4798, + "step": 10190 + }, + { + "epoch": 0.2263015948768604, + "grad_norm": 1.2510517835617065, + "learning_rate": 1.7577388211003584e-05, + "loss": 0.4774, + "step": 10195 + }, + { + "epoch": 0.22641258143638807, + "grad_norm": 1.1064683198928833, + "learning_rate": 1.7575112421616203e-05, + "loss": 0.4421, + "step": 10200 + }, + { + "epoch": 0.2265235679959157, + "grad_norm": 1.3662172555923462, + "learning_rate": 1.757283571127206e-05, + "loss": 0.5346, + "step": 10205 + }, + { + "epoch": 0.22663455455544335, + "grad_norm": 1.113659143447876, + "learning_rate": 1.7570558080247945e-05, + "loss": 0.4849, + "step": 10210 + }, + { + "epoch": 0.22674554111497097, + "grad_norm": 1.5465953350067139, + "learning_rate": 1.7568279528820774e-05, + "loss": 0.5242, + "step": 10215 + }, + { + "epoch": 0.22685652767449863, + "grad_norm": 0.9810808897018433, + "learning_rate": 1.756600005726756e-05, + "loss": 0.4816, + "step": 10220 + }, + { + "epoch": 0.22696751423402625, + "grad_norm": 1.075408935546875, + "learning_rate": 1.7563719665865425e-05, + "loss": 0.4855, + "step": 10225 + }, + { + "epoch": 0.2270785007935539, + "grad_norm": 1.4035685062408447, + "learning_rate": 1.7561438354891628e-05, + "loss": 0.5537, + "step": 10230 + }, + { + "epoch": 0.22718948735308153, + "grad_norm": 0.9641634821891785, + "learning_rate": 1.7559156124623513e-05, + "loss": 0.414, + "step": 10235 + }, + { + "epoch": 0.22730047391260919, + "grad_norm": 1.221092700958252, + "learning_rate": 1.7556872975338545e-05, + "loss": 0.483, + "step": 10240 + }, + { + "epoch": 0.2274114604721368, + "grad_norm": 1.4603421688079834, + "learning_rate": 1.755458890731431e-05, + "loss": 0.3984, + "step": 10245 + }, + { + "epoch": 0.22752244703166447, + "grad_norm": 1.086022138595581, + "learning_rate": 1.755230392082849e-05, + "loss": 0.499, + "step": 10250 + }, + { + "epoch": 0.22763343359119212, + "grad_norm": 1.0802093744277954, + "learning_rate": 1.7550018016158884e-05, + "loss": 0.5606, + "step": 10255 + }, + { + "epoch": 0.22774442015071975, + "grad_norm": 1.469793677330017, + "learning_rate": 1.754773119358341e-05, + "loss": 0.4977, + "step": 10260 + }, + { + "epoch": 0.2278554067102474, + "grad_norm": 1.184227705001831, + "learning_rate": 1.754544345338009e-05, + "loss": 0.4903, + "step": 10265 + }, + { + "epoch": 0.22796639326977503, + "grad_norm": 1.258518099784851, + "learning_rate": 1.754315479582706e-05, + "loss": 0.5834, + "step": 10270 + }, + { + "epoch": 0.22807737982930268, + "grad_norm": 1.2839202880859375, + "learning_rate": 1.7540865221202574e-05, + "loss": 0.6525, + "step": 10275 + }, + { + "epoch": 0.2281883663888303, + "grad_norm": 1.084051489830017, + "learning_rate": 1.7538574729784977e-05, + "loss": 0.4572, + "step": 10280 + }, + { + "epoch": 0.22829935294835796, + "grad_norm": 1.2756541967391968, + "learning_rate": 1.753628332185275e-05, + "loss": 0.4312, + "step": 10285 + }, + { + "epoch": 0.22841033950788558, + "grad_norm": 1.2992979288101196, + "learning_rate": 1.7533990997684473e-05, + "loss": 0.3662, + "step": 10290 + }, + { + "epoch": 0.22852132606741324, + "grad_norm": 1.274697184562683, + "learning_rate": 1.7531697757558833e-05, + "loss": 0.5757, + "step": 10295 + }, + { + "epoch": 0.22863231262694086, + "grad_norm": 1.0342131853103638, + "learning_rate": 1.7529403601754645e-05, + "loss": 0.3428, + "step": 10300 + }, + { + "epoch": 0.22874329918646852, + "grad_norm": 1.0127534866333008, + "learning_rate": 1.7527108530550815e-05, + "loss": 0.2972, + "step": 10305 + }, + { + "epoch": 0.22885428574599617, + "grad_norm": 1.185622215270996, + "learning_rate": 1.752481254422637e-05, + "loss": 0.4231, + "step": 10310 + }, + { + "epoch": 0.2289652723055238, + "grad_norm": 1.262616753578186, + "learning_rate": 1.752251564306046e-05, + "loss": 0.5794, + "step": 10315 + }, + { + "epoch": 0.22907625886505145, + "grad_norm": 1.0973269939422607, + "learning_rate": 1.752021782733232e-05, + "loss": 0.4948, + "step": 10320 + }, + { + "epoch": 0.22918724542457908, + "grad_norm": 0.6889248490333557, + "learning_rate": 1.7517919097321323e-05, + "loss": 0.6251, + "step": 10325 + }, + { + "epoch": 0.22929823198410673, + "grad_norm": 1.0883080959320068, + "learning_rate": 1.751561945330693e-05, + "loss": 0.7084, + "step": 10330 + }, + { + "epoch": 0.22940921854363436, + "grad_norm": 0.9828934669494629, + "learning_rate": 1.7513318895568734e-05, + "loss": 0.495, + "step": 10335 + }, + { + "epoch": 0.229520205103162, + "grad_norm": 1.5232964754104614, + "learning_rate": 1.7511017424386423e-05, + "loss": 0.5004, + "step": 10340 + }, + { + "epoch": 0.22963119166268964, + "grad_norm": 1.1135151386260986, + "learning_rate": 1.7508715040039805e-05, + "loss": 0.4907, + "step": 10345 + }, + { + "epoch": 0.2297421782222173, + "grad_norm": 1.2410166263580322, + "learning_rate": 1.750641174280879e-05, + "loss": 0.5564, + "step": 10350 + }, + { + "epoch": 0.22985316478174492, + "grad_norm": 1.2214136123657227, + "learning_rate": 1.750410753297341e-05, + "loss": 0.6031, + "step": 10355 + }, + { + "epoch": 0.22996415134127257, + "grad_norm": 1.2132198810577393, + "learning_rate": 1.750180241081381e-05, + "loss": 0.5265, + "step": 10360 + }, + { + "epoch": 0.23007513790080023, + "grad_norm": 1.0986378192901611, + "learning_rate": 1.7499496376610225e-05, + "loss": 0.5982, + "step": 10365 + }, + { + "epoch": 0.23018612446032785, + "grad_norm": 1.078561544418335, + "learning_rate": 1.7497189430643025e-05, + "loss": 0.374, + "step": 10370 + }, + { + "epoch": 0.2302971110198555, + "grad_norm": 1.3416097164154053, + "learning_rate": 1.749488157319268e-05, + "loss": 0.4589, + "step": 10375 + }, + { + "epoch": 0.23040809757938313, + "grad_norm": 1.3649656772613525, + "learning_rate": 1.7492572804539763e-05, + "loss": 0.4703, + "step": 10380 + }, + { + "epoch": 0.23051908413891078, + "grad_norm": 1.029783844947815, + "learning_rate": 1.7490263124964976e-05, + "loss": 0.4931, + "step": 10385 + }, + { + "epoch": 0.2306300706984384, + "grad_norm": 0.9972415566444397, + "learning_rate": 1.7487952534749116e-05, + "loss": 0.4425, + "step": 10390 + }, + { + "epoch": 0.23074105725796606, + "grad_norm": 1.2904571294784546, + "learning_rate": 1.7485641034173103e-05, + "loss": 0.4746, + "step": 10395 + }, + { + "epoch": 0.2308520438174937, + "grad_norm": 1.2893401384353638, + "learning_rate": 1.748332862351796e-05, + "loss": 0.5975, + "step": 10400 + }, + { + "epoch": 0.23096303037702134, + "grad_norm": 1.3970812559127808, + "learning_rate": 1.7481015303064816e-05, + "loss": 0.4489, + "step": 10405 + }, + { + "epoch": 0.23107401693654897, + "grad_norm": 1.0396674871444702, + "learning_rate": 1.747870107309492e-05, + "loss": 0.5176, + "step": 10410 + }, + { + "epoch": 0.23118500349607662, + "grad_norm": 0.7239720225334167, + "learning_rate": 1.7476385933889633e-05, + "loss": 0.4588, + "step": 10415 + }, + { + "epoch": 0.23129599005560428, + "grad_norm": 1.1714787483215332, + "learning_rate": 1.7474069885730414e-05, + "loss": 0.5548, + "step": 10420 + }, + { + "epoch": 0.2314069766151319, + "grad_norm": 1.6496156454086304, + "learning_rate": 1.7471752928898847e-05, + "loss": 0.6559, + "step": 10425 + }, + { + "epoch": 0.23151796317465956, + "grad_norm": 1.0505659580230713, + "learning_rate": 1.7469435063676615e-05, + "loss": 0.5648, + "step": 10430 + }, + { + "epoch": 0.23162894973418718, + "grad_norm": 0.9392535090446472, + "learning_rate": 1.746711629034552e-05, + "loss": 0.6279, + "step": 10435 + }, + { + "epoch": 0.23173993629371484, + "grad_norm": 1.05547034740448, + "learning_rate": 1.746479660918747e-05, + "loss": 0.511, + "step": 10440 + }, + { + "epoch": 0.23185092285324246, + "grad_norm": 0.9287381768226624, + "learning_rate": 1.7462476020484484e-05, + "loss": 0.4376, + "step": 10445 + }, + { + "epoch": 0.23196190941277012, + "grad_norm": 1.5444824695587158, + "learning_rate": 1.7460154524518688e-05, + "loss": 0.4187, + "step": 10450 + }, + { + "epoch": 0.23207289597229774, + "grad_norm": 1.0833466053009033, + "learning_rate": 1.7457832121572323e-05, + "loss": 0.5985, + "step": 10455 + }, + { + "epoch": 0.2321838825318254, + "grad_norm": 1.059387445449829, + "learning_rate": 1.7455508811927746e-05, + "loss": 0.3834, + "step": 10460 + }, + { + "epoch": 0.23229486909135302, + "grad_norm": 1.185230016708374, + "learning_rate": 1.7453184595867404e-05, + "loss": 0.5965, + "step": 10465 + }, + { + "epoch": 0.23240585565088068, + "grad_norm": 0.722567081451416, + "learning_rate": 1.7450859473673882e-05, + "loss": 0.5079, + "step": 10470 + }, + { + "epoch": 0.23251684221040833, + "grad_norm": 1.3505364656448364, + "learning_rate": 1.744853344562985e-05, + "loss": 0.4602, + "step": 10475 + }, + { + "epoch": 0.23262782876993596, + "grad_norm": 1.1881821155548096, + "learning_rate": 1.7446206512018103e-05, + "loss": 0.3281, + "step": 10480 + }, + { + "epoch": 0.2327388153294636, + "grad_norm": 1.5479166507720947, + "learning_rate": 1.744387867312154e-05, + "loss": 0.531, + "step": 10485 + }, + { + "epoch": 0.23284980188899124, + "grad_norm": 1.6832934617996216, + "learning_rate": 1.7441549929223173e-05, + "loss": 0.6222, + "step": 10490 + }, + { + "epoch": 0.2329607884485189, + "grad_norm": 1.35410475730896, + "learning_rate": 1.743922028060612e-05, + "loss": 0.545, + "step": 10495 + }, + { + "epoch": 0.23307177500804652, + "grad_norm": 1.073199987411499, + "learning_rate": 1.743688972755362e-05, + "loss": 0.5057, + "step": 10500 + }, + { + "epoch": 0.23318276156757417, + "grad_norm": 1.2246018648147583, + "learning_rate": 1.7434558270349006e-05, + "loss": 0.386, + "step": 10505 + }, + { + "epoch": 0.2332937481271018, + "grad_norm": 1.15493643283844, + "learning_rate": 1.743222590927573e-05, + "loss": 0.6794, + "step": 10510 + }, + { + "epoch": 0.23340473468662945, + "grad_norm": 1.451859712600708, + "learning_rate": 1.7429892644617354e-05, + "loss": 0.5025, + "step": 10515 + }, + { + "epoch": 0.2335157212461571, + "grad_norm": 1.6235580444335938, + "learning_rate": 1.742755847665755e-05, + "loss": 0.4602, + "step": 10520 + }, + { + "epoch": 0.23362670780568473, + "grad_norm": 1.356918454170227, + "learning_rate": 1.7425223405680098e-05, + "loss": 0.4022, + "step": 10525 + }, + { + "epoch": 0.23373769436521238, + "grad_norm": 1.1652427911758423, + "learning_rate": 1.742288743196888e-05, + "loss": 0.5255, + "step": 10530 + }, + { + "epoch": 0.23384868092474, + "grad_norm": 1.1124521493911743, + "learning_rate": 1.7420550555807906e-05, + "loss": 0.462, + "step": 10535 + }, + { + "epoch": 0.23395966748426766, + "grad_norm": 1.566153645515442, + "learning_rate": 1.741821277748128e-05, + "loss": 0.4599, + "step": 10540 + }, + { + "epoch": 0.2340706540437953, + "grad_norm": 1.2258182764053345, + "learning_rate": 1.741587409727323e-05, + "loss": 0.457, + "step": 10545 + }, + { + "epoch": 0.23418164060332294, + "grad_norm": 1.0857577323913574, + "learning_rate": 1.7413534515468075e-05, + "loss": 0.5158, + "step": 10550 + }, + { + "epoch": 0.23429262716285057, + "grad_norm": 0.9319967031478882, + "learning_rate": 1.7411194032350252e-05, + "loss": 0.454, + "step": 10555 + }, + { + "epoch": 0.23440361372237822, + "grad_norm": 1.503821849822998, + "learning_rate": 1.7408852648204317e-05, + "loss": 0.5528, + "step": 10560 + }, + { + "epoch": 0.23451460028190585, + "grad_norm": 0.9573317766189575, + "learning_rate": 1.7406510363314922e-05, + "loss": 0.5481, + "step": 10565 + }, + { + "epoch": 0.2346255868414335, + "grad_norm": 1.5251328945159912, + "learning_rate": 1.740416717796684e-05, + "loss": 0.4372, + "step": 10570 + }, + { + "epoch": 0.23473657340096116, + "grad_norm": 0.8173489570617676, + "learning_rate": 1.7401823092444945e-05, + "loss": 0.3856, + "step": 10575 + }, + { + "epoch": 0.23484755996048878, + "grad_norm": 1.7951514720916748, + "learning_rate": 1.739947810703422e-05, + "loss": 0.5434, + "step": 10580 + }, + { + "epoch": 0.23495854652001644, + "grad_norm": 0.8231841325759888, + "learning_rate": 1.739713222201976e-05, + "loss": 0.5926, + "step": 10585 + }, + { + "epoch": 0.23506953307954406, + "grad_norm": 1.47267746925354, + "learning_rate": 1.739478543768678e-05, + "loss": 0.4497, + "step": 10590 + }, + { + "epoch": 0.23518051963907172, + "grad_norm": 1.2468472719192505, + "learning_rate": 1.7392437754320577e-05, + "loss": 0.437, + "step": 10595 + }, + { + "epoch": 0.23529150619859934, + "grad_norm": 1.423754334449768, + "learning_rate": 1.7390089172206594e-05, + "loss": 0.5477, + "step": 10600 + }, + { + "epoch": 0.235402492758127, + "grad_norm": 1.2863191366195679, + "learning_rate": 1.7387739691630346e-05, + "loss": 0.4375, + "step": 10605 + }, + { + "epoch": 0.23551347931765462, + "grad_norm": 1.250435471534729, + "learning_rate": 1.738538931287749e-05, + "loss": 0.6207, + "step": 10610 + }, + { + "epoch": 0.23562446587718228, + "grad_norm": 1.0339642763137817, + "learning_rate": 1.7383038036233762e-05, + "loss": 0.5425, + "step": 10615 + }, + { + "epoch": 0.2357354524367099, + "grad_norm": 1.1191165447235107, + "learning_rate": 1.7380685861985037e-05, + "loss": 0.4339, + "step": 10620 + }, + { + "epoch": 0.23584643899623756, + "grad_norm": 1.4968637228012085, + "learning_rate": 1.7378332790417275e-05, + "loss": 0.4038, + "step": 10625 + }, + { + "epoch": 0.2359574255557652, + "grad_norm": 1.324038028717041, + "learning_rate": 1.7375978821816557e-05, + "loss": 0.4687, + "step": 10630 + }, + { + "epoch": 0.23606841211529284, + "grad_norm": 1.475159764289856, + "learning_rate": 1.737362395646907e-05, + "loss": 0.5988, + "step": 10635 + }, + { + "epoch": 0.2361793986748205, + "grad_norm": 1.462770700454712, + "learning_rate": 1.7371268194661114e-05, + "loss": 0.5812, + "step": 10640 + }, + { + "epoch": 0.23629038523434812, + "grad_norm": 1.8520547151565552, + "learning_rate": 1.7368911536679092e-05, + "loss": 0.4466, + "step": 10645 + }, + { + "epoch": 0.23640137179387577, + "grad_norm": 1.1543737649917603, + "learning_rate": 1.736655398280952e-05, + "loss": 0.4219, + "step": 10650 + }, + { + "epoch": 0.2365123583534034, + "grad_norm": 1.0772531032562256, + "learning_rate": 1.7364195533339017e-05, + "loss": 0.6008, + "step": 10655 + }, + { + "epoch": 0.23662334491293105, + "grad_norm": 1.693520426750183, + "learning_rate": 1.736183618855432e-05, + "loss": 0.5874, + "step": 10660 + }, + { + "epoch": 0.23673433147245868, + "grad_norm": 0.9491427540779114, + "learning_rate": 1.735947594874227e-05, + "loss": 0.4984, + "step": 10665 + }, + { + "epoch": 0.23684531803198633, + "grad_norm": 1.327712893486023, + "learning_rate": 1.7357114814189812e-05, + "loss": 0.4518, + "step": 10670 + }, + { + "epoch": 0.23695630459151396, + "grad_norm": 1.220430850982666, + "learning_rate": 1.735475278518401e-05, + "loss": 0.3435, + "step": 10675 + }, + { + "epoch": 0.2370672911510416, + "grad_norm": 1.355727195739746, + "learning_rate": 1.7352389862012034e-05, + "loss": 0.5366, + "step": 10680 + }, + { + "epoch": 0.23717827771056926, + "grad_norm": 0.9620851278305054, + "learning_rate": 1.7350026044961155e-05, + "loss": 0.2839, + "step": 10685 + }, + { + "epoch": 0.2372892642700969, + "grad_norm": 1.1203703880310059, + "learning_rate": 1.734766133431876e-05, + "loss": 0.4745, + "step": 10690 + }, + { + "epoch": 0.23740025082962454, + "grad_norm": 1.1614298820495605, + "learning_rate": 1.734529573037234e-05, + "loss": 0.5457, + "step": 10695 + }, + { + "epoch": 0.23751123738915217, + "grad_norm": 1.0189214944839478, + "learning_rate": 1.73429292334095e-05, + "loss": 0.4508, + "step": 10700 + }, + { + "epoch": 0.23762222394867982, + "grad_norm": 1.196600317955017, + "learning_rate": 1.734056184371795e-05, + "loss": 0.4637, + "step": 10705 + }, + { + "epoch": 0.23773321050820745, + "grad_norm": 1.19878089427948, + "learning_rate": 1.7338193561585507e-05, + "loss": 0.4069, + "step": 10710 + }, + { + "epoch": 0.2378441970677351, + "grad_norm": 1.5032724142074585, + "learning_rate": 1.7335824387300106e-05, + "loss": 0.3725, + "step": 10715 + }, + { + "epoch": 0.23795518362726273, + "grad_norm": 1.6843661069869995, + "learning_rate": 1.7333454321149777e-05, + "loss": 0.4016, + "step": 10720 + }, + { + "epoch": 0.23806617018679038, + "grad_norm": 1.3617744445800781, + "learning_rate": 1.7331083363422665e-05, + "loss": 0.5763, + "step": 10725 + }, + { + "epoch": 0.238177156746318, + "grad_norm": 1.376902461051941, + "learning_rate": 1.7328711514407025e-05, + "loss": 0.3615, + "step": 10730 + }, + { + "epoch": 0.23828814330584566, + "grad_norm": 1.3558409214019775, + "learning_rate": 1.732633877439122e-05, + "loss": 0.5211, + "step": 10735 + }, + { + "epoch": 0.23839912986537332, + "grad_norm": 1.5150203704833984, + "learning_rate": 1.7323965143663713e-05, + "loss": 0.5227, + "step": 10740 + }, + { + "epoch": 0.23851011642490094, + "grad_norm": 0.9877330660820007, + "learning_rate": 1.7321590622513088e-05, + "loss": 0.4526, + "step": 10745 + }, + { + "epoch": 0.2386211029844286, + "grad_norm": 0.9728530049324036, + "learning_rate": 1.731921521122803e-05, + "loss": 0.4788, + "step": 10750 + }, + { + "epoch": 0.23873208954395622, + "grad_norm": 1.4340505599975586, + "learning_rate": 1.7316838910097332e-05, + "loss": 0.4781, + "step": 10755 + }, + { + "epoch": 0.23884307610348388, + "grad_norm": 0.9711979031562805, + "learning_rate": 1.7314461719409902e-05, + "loss": 0.427, + "step": 10760 + }, + { + "epoch": 0.2389540626630115, + "grad_norm": 1.3422971963882446, + "learning_rate": 1.7312083639454743e-05, + "loss": 0.5741, + "step": 10765 + }, + { + "epoch": 0.23906504922253916, + "grad_norm": 0.8390370607376099, + "learning_rate": 1.730970467052098e-05, + "loss": 0.4605, + "step": 10770 + }, + { + "epoch": 0.23917603578206678, + "grad_norm": 1.2565598487854004, + "learning_rate": 1.7307324812897836e-05, + "loss": 0.4963, + "step": 10775 + }, + { + "epoch": 0.23928702234159444, + "grad_norm": 1.854711890220642, + "learning_rate": 1.730494406687465e-05, + "loss": 0.3802, + "step": 10780 + }, + { + "epoch": 0.23939800890112206, + "grad_norm": 1.0575193166732788, + "learning_rate": 1.7302562432740864e-05, + "loss": 0.5513, + "step": 10785 + }, + { + "epoch": 0.23950899546064972, + "grad_norm": 1.5515862703323364, + "learning_rate": 1.7300179910786027e-05, + "loss": 0.6286, + "step": 10790 + }, + { + "epoch": 0.23961998202017737, + "grad_norm": 1.8107975721359253, + "learning_rate": 1.72977965012998e-05, + "loss": 0.5424, + "step": 10795 + }, + { + "epoch": 0.239730968579705, + "grad_norm": 1.4742685556411743, + "learning_rate": 1.7295412204571945e-05, + "loss": 0.4547, + "step": 10800 + }, + { + "epoch": 0.23984195513923265, + "grad_norm": 1.0533596277236938, + "learning_rate": 1.7293027020892348e-05, + "loss": 0.4738, + "step": 10805 + }, + { + "epoch": 0.23995294169876027, + "grad_norm": 1.5168896913528442, + "learning_rate": 1.7290640950550985e-05, + "loss": 0.4324, + "step": 10810 + }, + { + "epoch": 0.24006392825828793, + "grad_norm": 1.048251748085022, + "learning_rate": 1.7288253993837936e-05, + "loss": 0.5434, + "step": 10815 + }, + { + "epoch": 0.24017491481781555, + "grad_norm": 0.968961238861084, + "learning_rate": 1.7285866151043417e-05, + "loss": 0.2887, + "step": 10820 + }, + { + "epoch": 0.2402859013773432, + "grad_norm": 1.3361490964889526, + "learning_rate": 1.728347742245773e-05, + "loss": 0.5679, + "step": 10825 + }, + { + "epoch": 0.24039688793687083, + "grad_norm": 1.0251753330230713, + "learning_rate": 1.7281087808371278e-05, + "loss": 0.5579, + "step": 10830 + }, + { + "epoch": 0.2405078744963985, + "grad_norm": 1.205301284790039, + "learning_rate": 1.727869730907459e-05, + "loss": 0.554, + "step": 10835 + }, + { + "epoch": 0.24061886105592611, + "grad_norm": 1.1939102411270142, + "learning_rate": 1.7276305924858297e-05, + "loss": 0.443, + "step": 10840 + }, + { + "epoch": 0.24072984761545377, + "grad_norm": 1.080856204032898, + "learning_rate": 1.727391365601313e-05, + "loss": 0.3908, + "step": 10845 + }, + { + "epoch": 0.24084083417498142, + "grad_norm": 1.2553077936172485, + "learning_rate": 1.727152050282994e-05, + "loss": 0.4487, + "step": 10850 + }, + { + "epoch": 0.24095182073450905, + "grad_norm": 1.3635225296020508, + "learning_rate": 1.7269126465599667e-05, + "loss": 0.4693, + "step": 10855 + }, + { + "epoch": 0.2410628072940367, + "grad_norm": 1.2379069328308105, + "learning_rate": 1.726673154461338e-05, + "loss": 0.5879, + "step": 10860 + }, + { + "epoch": 0.24117379385356433, + "grad_norm": 1.0275567770004272, + "learning_rate": 1.7264335740162244e-05, + "loss": 0.7738, + "step": 10865 + }, + { + "epoch": 0.24128478041309198, + "grad_norm": 1.408523678779602, + "learning_rate": 1.726193905253753e-05, + "loss": 0.4684, + "step": 10870 + }, + { + "epoch": 0.2413957669726196, + "grad_norm": 0.8828668594360352, + "learning_rate": 1.7259541482030623e-05, + "loss": 0.473, + "step": 10875 + }, + { + "epoch": 0.24150675353214726, + "grad_norm": 1.1700650453567505, + "learning_rate": 1.7257143028933004e-05, + "loss": 0.4779, + "step": 10880 + }, + { + "epoch": 0.2416177400916749, + "grad_norm": 1.1337982416152954, + "learning_rate": 1.7254743693536276e-05, + "loss": 0.5127, + "step": 10885 + }, + { + "epoch": 0.24172872665120254, + "grad_norm": 1.749471664428711, + "learning_rate": 1.7252343476132143e-05, + "loss": 0.4679, + "step": 10890 + }, + { + "epoch": 0.24183971321073017, + "grad_norm": 1.214697241783142, + "learning_rate": 1.724994237701241e-05, + "loss": 0.5016, + "step": 10895 + }, + { + "epoch": 0.24195069977025782, + "grad_norm": 1.2082881927490234, + "learning_rate": 1.7247540396469e-05, + "loss": 0.6128, + "step": 10900 + }, + { + "epoch": 0.24206168632978547, + "grad_norm": 1.2025545835494995, + "learning_rate": 1.7245137534793933e-05, + "loss": 0.4832, + "step": 10905 + }, + { + "epoch": 0.2421726728893131, + "grad_norm": 1.2197436094284058, + "learning_rate": 1.7242733792279342e-05, + "loss": 0.5686, + "step": 10910 + }, + { + "epoch": 0.24228365944884075, + "grad_norm": 1.3093212842941284, + "learning_rate": 1.7240329169217468e-05, + "loss": 0.5299, + "step": 10915 + }, + { + "epoch": 0.24239464600836838, + "grad_norm": 1.033205270767212, + "learning_rate": 1.7237923665900656e-05, + "loss": 0.5039, + "step": 10920 + }, + { + "epoch": 0.24250563256789603, + "grad_norm": 1.2853460311889648, + "learning_rate": 1.723551728262136e-05, + "loss": 0.4971, + "step": 10925 + }, + { + "epoch": 0.24261661912742366, + "grad_norm": 1.6411842107772827, + "learning_rate": 1.723311001967214e-05, + "loss": 0.4512, + "step": 10930 + }, + { + "epoch": 0.24272760568695131, + "grad_norm": 1.0631998777389526, + "learning_rate": 1.7230701877345658e-05, + "loss": 0.4894, + "step": 10935 + }, + { + "epoch": 0.24283859224647894, + "grad_norm": 1.1609796285629272, + "learning_rate": 1.722829285593469e-05, + "loss": 0.5161, + "step": 10940 + }, + { + "epoch": 0.2429495788060066, + "grad_norm": 1.4562970399856567, + "learning_rate": 1.7225882955732124e-05, + "loss": 0.6736, + "step": 10945 + }, + { + "epoch": 0.24306056536553422, + "grad_norm": 1.154300332069397, + "learning_rate": 1.722347217703094e-05, + "loss": 0.6721, + "step": 10950 + }, + { + "epoch": 0.24317155192506187, + "grad_norm": 1.2888139486312866, + "learning_rate": 1.722106052012423e-05, + "loss": 0.4758, + "step": 10955 + }, + { + "epoch": 0.24328253848458953, + "grad_norm": 1.2322896718978882, + "learning_rate": 1.7218647985305204e-05, + "loss": 0.6146, + "step": 10960 + }, + { + "epoch": 0.24339352504411715, + "grad_norm": 1.266477108001709, + "learning_rate": 1.7216234572867165e-05, + "loss": 0.4997, + "step": 10965 + }, + { + "epoch": 0.2435045116036448, + "grad_norm": 1.1659526824951172, + "learning_rate": 1.7213820283103526e-05, + "loss": 0.4684, + "step": 10970 + }, + { + "epoch": 0.24361549816317243, + "grad_norm": 1.1297948360443115, + "learning_rate": 1.7211405116307815e-05, + "loss": 0.4648, + "step": 10975 + }, + { + "epoch": 0.2437264847227001, + "grad_norm": 1.3273589611053467, + "learning_rate": 1.720898907277365e-05, + "loss": 0.5179, + "step": 10980 + }, + { + "epoch": 0.2438374712822277, + "grad_norm": 0.9976261258125305, + "learning_rate": 1.720657215279477e-05, + "loss": 0.5025, + "step": 10985 + }, + { + "epoch": 0.24394845784175537, + "grad_norm": 0.9172224998474121, + "learning_rate": 1.7204154356665023e-05, + "loss": 0.5558, + "step": 10990 + }, + { + "epoch": 0.244059444401283, + "grad_norm": 1.1094406843185425, + "learning_rate": 1.7201735684678348e-05, + "loss": 0.5454, + "step": 10995 + }, + { + "epoch": 0.24417043096081065, + "grad_norm": 1.3654953241348267, + "learning_rate": 1.7199316137128797e-05, + "loss": 0.5252, + "step": 11000 + }, + { + "epoch": 0.24428141752033827, + "grad_norm": 1.7020251750946045, + "learning_rate": 1.7196895714310536e-05, + "loss": 0.4773, + "step": 11005 + }, + { + "epoch": 0.24439240407986593, + "grad_norm": 1.4368350505828857, + "learning_rate": 1.7194474416517832e-05, + "loss": 0.3861, + "step": 11010 + }, + { + "epoch": 0.24450339063939358, + "grad_norm": 1.4007991552352905, + "learning_rate": 1.719205224404506e-05, + "loss": 0.4601, + "step": 11015 + }, + { + "epoch": 0.2446143771989212, + "grad_norm": 1.3285207748413086, + "learning_rate": 1.718962919718669e-05, + "loss": 0.4993, + "step": 11020 + }, + { + "epoch": 0.24472536375844886, + "grad_norm": 1.0307044982910156, + "learning_rate": 1.7187205276237316e-05, + "loss": 0.3924, + "step": 11025 + }, + { + "epoch": 0.2448363503179765, + "grad_norm": 0.9056999683380127, + "learning_rate": 1.718478048149163e-05, + "loss": 0.4292, + "step": 11030 + }, + { + "epoch": 0.24494733687750414, + "grad_norm": 0.9134597778320312, + "learning_rate": 1.718235481324443e-05, + "loss": 0.3722, + "step": 11035 + }, + { + "epoch": 0.24505832343703177, + "grad_norm": 1.4173113107681274, + "learning_rate": 1.7179928271790617e-05, + "loss": 0.5344, + "step": 11040 + }, + { + "epoch": 0.24516930999655942, + "grad_norm": 1.6890437602996826, + "learning_rate": 1.7177500857425207e-05, + "loss": 0.4563, + "step": 11045 + }, + { + "epoch": 0.24528029655608705, + "grad_norm": 1.0443166494369507, + "learning_rate": 1.717507257044331e-05, + "loss": 0.4444, + "step": 11050 + }, + { + "epoch": 0.2453912831156147, + "grad_norm": 1.5586121082305908, + "learning_rate": 1.717264341114016e-05, + "loss": 0.5696, + "step": 11055 + }, + { + "epoch": 0.24550226967514235, + "grad_norm": 0.7577741742134094, + "learning_rate": 1.7170213379811077e-05, + "loss": 0.361, + "step": 11060 + }, + { + "epoch": 0.24561325623466998, + "grad_norm": 1.1730875968933105, + "learning_rate": 1.7167782476751494e-05, + "loss": 0.4033, + "step": 11065 + }, + { + "epoch": 0.24572424279419763, + "grad_norm": 2.1000208854675293, + "learning_rate": 1.716535070225696e-05, + "loss": 0.4252, + "step": 11070 + }, + { + "epoch": 0.24583522935372526, + "grad_norm": 1.5296849012374878, + "learning_rate": 1.7162918056623116e-05, + "loss": 0.4881, + "step": 11075 + }, + { + "epoch": 0.2459462159132529, + "grad_norm": 1.5113800764083862, + "learning_rate": 1.716048454014572e-05, + "loss": 0.5569, + "step": 11080 + }, + { + "epoch": 0.24605720247278054, + "grad_norm": 1.1769105195999146, + "learning_rate": 1.7158050153120623e-05, + "loss": 0.4193, + "step": 11085 + }, + { + "epoch": 0.2461681890323082, + "grad_norm": 1.3997236490249634, + "learning_rate": 1.71556148958438e-05, + "loss": 0.5501, + "step": 11090 + }, + { + "epoch": 0.24627917559183582, + "grad_norm": 1.0395276546478271, + "learning_rate": 1.7153178768611317e-05, + "loss": 0.3598, + "step": 11095 + }, + { + "epoch": 0.24639016215136347, + "grad_norm": 0.9971006512641907, + "learning_rate": 1.7150741771719345e-05, + "loss": 0.5354, + "step": 11100 + }, + { + "epoch": 0.2465011487108911, + "grad_norm": 1.4031944274902344, + "learning_rate": 1.714830390546417e-05, + "loss": 0.4695, + "step": 11105 + }, + { + "epoch": 0.24661213527041875, + "grad_norm": 0.9989936947822571, + "learning_rate": 1.7145865170142186e-05, + "loss": 0.4709, + "step": 11110 + }, + { + "epoch": 0.2467231218299464, + "grad_norm": 1.3992575407028198, + "learning_rate": 1.7143425566049873e-05, + "loss": 0.5232, + "step": 11115 + }, + { + "epoch": 0.24683410838947403, + "grad_norm": 1.3434380292892456, + "learning_rate": 1.714098509348384e-05, + "loss": 0.6072, + "step": 11120 + }, + { + "epoch": 0.2469450949490017, + "grad_norm": 1.2431334257125854, + "learning_rate": 1.7138543752740785e-05, + "loss": 0.4156, + "step": 11125 + }, + { + "epoch": 0.2470560815085293, + "grad_norm": 1.1508046388626099, + "learning_rate": 1.7136101544117526e-05, + "loss": 0.4603, + "step": 11130 + }, + { + "epoch": 0.24716706806805697, + "grad_norm": 1.0870983600616455, + "learning_rate": 1.7133658467910968e-05, + "loss": 0.4438, + "step": 11135 + }, + { + "epoch": 0.2472780546275846, + "grad_norm": 1.3615703582763672, + "learning_rate": 1.7131214524418146e-05, + "loss": 0.5001, + "step": 11140 + }, + { + "epoch": 0.24738904118711225, + "grad_norm": 1.087088942527771, + "learning_rate": 1.7128769713936173e-05, + "loss": 0.4491, + "step": 11145 + }, + { + "epoch": 0.24750002774663987, + "grad_norm": 1.6181623935699463, + "learning_rate": 1.712632403676229e-05, + "loss": 0.4658, + "step": 11150 + }, + { + "epoch": 0.24761101430616753, + "grad_norm": 1.419737696647644, + "learning_rate": 1.7123877493193825e-05, + "loss": 0.4683, + "step": 11155 + }, + { + "epoch": 0.24772200086569515, + "grad_norm": 1.0363826751708984, + "learning_rate": 1.7121430083528227e-05, + "loss": 0.4743, + "step": 11160 + }, + { + "epoch": 0.2478329874252228, + "grad_norm": 1.0170842409133911, + "learning_rate": 1.7118981808063043e-05, + "loss": 0.3609, + "step": 11165 + }, + { + "epoch": 0.24794397398475046, + "grad_norm": 1.1048935651779175, + "learning_rate": 1.7116532667095928e-05, + "loss": 0.4537, + "step": 11170 + }, + { + "epoch": 0.24805496054427809, + "grad_norm": 1.2576793432235718, + "learning_rate": 1.711408266092464e-05, + "loss": 0.4805, + "step": 11175 + }, + { + "epoch": 0.24816594710380574, + "grad_norm": 1.1600898504257202, + "learning_rate": 1.7111631789847038e-05, + "loss": 0.4923, + "step": 11180 + }, + { + "epoch": 0.24827693366333337, + "grad_norm": 1.6441643238067627, + "learning_rate": 1.7109180054161093e-05, + "loss": 0.5338, + "step": 11185 + }, + { + "epoch": 0.24838792022286102, + "grad_norm": 0.9537017345428467, + "learning_rate": 1.710672745416488e-05, + "loss": 0.6072, + "step": 11190 + }, + { + "epoch": 0.24849890678238865, + "grad_norm": 0.8803986310958862, + "learning_rate": 1.710427399015658e-05, + "loss": 0.5011, + "step": 11195 + }, + { + "epoch": 0.2486098933419163, + "grad_norm": 1.057271957397461, + "learning_rate": 1.710181966243447e-05, + "loss": 0.33, + "step": 11200 + }, + { + "epoch": 0.24872087990144393, + "grad_norm": 0.9839698672294617, + "learning_rate": 1.7099364471296947e-05, + "loss": 0.5186, + "step": 11205 + }, + { + "epoch": 0.24883186646097158, + "grad_norm": 1.279700517654419, + "learning_rate": 1.70969084170425e-05, + "loss": 0.5572, + "step": 11210 + }, + { + "epoch": 0.2489428530204992, + "grad_norm": 1.7117375135421753, + "learning_rate": 1.7094451499969725e-05, + "loss": 0.4079, + "step": 11215 + }, + { + "epoch": 0.24905383958002686, + "grad_norm": 1.1009594202041626, + "learning_rate": 1.7091993720377336e-05, + "loss": 0.4195, + "step": 11220 + }, + { + "epoch": 0.2491648261395545, + "grad_norm": 1.4836571216583252, + "learning_rate": 1.708953507856413e-05, + "loss": 0.4876, + "step": 11225 + }, + { + "epoch": 0.24927581269908214, + "grad_norm": 1.6875817775726318, + "learning_rate": 1.708707557482903e-05, + "loss": 0.4789, + "step": 11230 + }, + { + "epoch": 0.2493867992586098, + "grad_norm": 1.110202670097351, + "learning_rate": 1.7084615209471045e-05, + "loss": 0.4394, + "step": 11235 + }, + { + "epoch": 0.24949778581813742, + "grad_norm": 1.2237122058868408, + "learning_rate": 1.7082153982789305e-05, + "loss": 0.4599, + "step": 11240 + }, + { + "epoch": 0.24960877237766507, + "grad_norm": 1.3197574615478516, + "learning_rate": 1.7079691895083036e-05, + "loss": 0.4665, + "step": 11245 + }, + { + "epoch": 0.2497197589371927, + "grad_norm": 1.1246968507766724, + "learning_rate": 1.7077228946651567e-05, + "loss": 0.5569, + "step": 11250 + }, + { + "epoch": 0.24983074549672035, + "grad_norm": 1.3298134803771973, + "learning_rate": 1.7074765137794343e-05, + "loss": 0.5649, + "step": 11255 + }, + { + "epoch": 0.24994173205624798, + "grad_norm": 1.000502109527588, + "learning_rate": 1.7072300468810896e-05, + "loss": 0.568, + "step": 11260 + }, + { + "epoch": 0.2500527186157756, + "grad_norm": 1.1524144411087036, + "learning_rate": 1.7069834940000878e-05, + "loss": 0.4625, + "step": 11265 + }, + { + "epoch": 0.2501637051753033, + "grad_norm": 0.940650224685669, + "learning_rate": 1.706736855166404e-05, + "loss": 0.6018, + "step": 11270 + }, + { + "epoch": 0.2502746917348309, + "grad_norm": 1.473748803138733, + "learning_rate": 1.7064901304100233e-05, + "loss": 0.5434, + "step": 11275 + }, + { + "epoch": 0.25038567829435854, + "grad_norm": 1.3512476682662964, + "learning_rate": 1.706243319760942e-05, + "loss": 0.438, + "step": 11280 + }, + { + "epoch": 0.2504966648538862, + "grad_norm": 1.119840145111084, + "learning_rate": 1.7059964232491666e-05, + "loss": 0.5395, + "step": 11285 + }, + { + "epoch": 0.25060765141341385, + "grad_norm": 2.1528420448303223, + "learning_rate": 1.7057494409047136e-05, + "loss": 0.4493, + "step": 11290 + }, + { + "epoch": 0.25071863797294147, + "grad_norm": 1.6110953092575073, + "learning_rate": 1.7055023727576106e-05, + "loss": 0.4385, + "step": 11295 + }, + { + "epoch": 0.2508296245324691, + "grad_norm": 1.4127633571624756, + "learning_rate": 1.7052552188378954e-05, + "loss": 0.5768, + "step": 11300 + }, + { + "epoch": 0.2509406110919968, + "grad_norm": 1.075862169265747, + "learning_rate": 1.7050079791756157e-05, + "loss": 0.4207, + "step": 11305 + }, + { + "epoch": 0.2510515976515244, + "grad_norm": 1.2337510585784912, + "learning_rate": 1.70476065380083e-05, + "loss": 0.3383, + "step": 11310 + }, + { + "epoch": 0.25116258421105203, + "grad_norm": 1.2561253309249878, + "learning_rate": 1.704513242743608e-05, + "loss": 0.3997, + "step": 11315 + }, + { + "epoch": 0.25127357077057966, + "grad_norm": 1.2801952362060547, + "learning_rate": 1.7042657460340283e-05, + "loss": 0.4344, + "step": 11320 + }, + { + "epoch": 0.25138455733010734, + "grad_norm": 1.136273980140686, + "learning_rate": 1.7040181637021812e-05, + "loss": 0.4397, + "step": 11325 + }, + { + "epoch": 0.25149554388963496, + "grad_norm": 1.1857062578201294, + "learning_rate": 1.7037704957781674e-05, + "loss": 0.4559, + "step": 11330 + }, + { + "epoch": 0.2516065304491626, + "grad_norm": 1.4461593627929688, + "learning_rate": 1.7035227422920965e-05, + "loss": 0.4814, + "step": 11335 + }, + { + "epoch": 0.2517175170086903, + "grad_norm": 1.012604832649231, + "learning_rate": 1.7032749032740904e-05, + "loss": 0.5578, + "step": 11340 + }, + { + "epoch": 0.2518285035682179, + "grad_norm": 1.427864670753479, + "learning_rate": 1.7030269787542798e-05, + "loss": 0.5969, + "step": 11345 + }, + { + "epoch": 0.2519394901277455, + "grad_norm": 0.997259259223938, + "learning_rate": 1.702778968762807e-05, + "loss": 0.4033, + "step": 11350 + }, + { + "epoch": 0.25205047668727315, + "grad_norm": 1.1704946756362915, + "learning_rate": 1.702530873329824e-05, + "loss": 0.6685, + "step": 11355 + }, + { + "epoch": 0.25216146324680083, + "grad_norm": 1.036297082901001, + "learning_rate": 1.702282692485494e-05, + "loss": 0.3092, + "step": 11360 + }, + { + "epoch": 0.25227244980632846, + "grad_norm": 1.4215106964111328, + "learning_rate": 1.702034426259989e-05, + "loss": 0.3612, + "step": 11365 + }, + { + "epoch": 0.2523834363658561, + "grad_norm": 1.420967698097229, + "learning_rate": 1.7017860746834932e-05, + "loss": 0.5245, + "step": 11370 + }, + { + "epoch": 0.2524944229253837, + "grad_norm": 1.196954607963562, + "learning_rate": 1.7015376377861998e-05, + "loss": 0.5223, + "step": 11375 + }, + { + "epoch": 0.2526054094849114, + "grad_norm": 1.2215323448181152, + "learning_rate": 1.7012891155983133e-05, + "loss": 0.3851, + "step": 11380 + }, + { + "epoch": 0.252716396044439, + "grad_norm": 1.6690142154693604, + "learning_rate": 1.701040508150048e-05, + "loss": 0.5762, + "step": 11385 + }, + { + "epoch": 0.25282738260396664, + "grad_norm": 1.261279582977295, + "learning_rate": 1.7007918154716286e-05, + "loss": 0.505, + "step": 11390 + }, + { + "epoch": 0.2529383691634943, + "grad_norm": 1.0459377765655518, + "learning_rate": 1.700543037593291e-05, + "loss": 0.4814, + "step": 11395 + }, + { + "epoch": 0.25304935572302195, + "grad_norm": 1.0348485708236694, + "learning_rate": 1.7002941745452804e-05, + "loss": 0.345, + "step": 11400 + }, + { + "epoch": 0.2531603422825496, + "grad_norm": 1.0944418907165527, + "learning_rate": 1.7000452263578523e-05, + "loss": 0.4802, + "step": 11405 + }, + { + "epoch": 0.2532713288420772, + "grad_norm": 1.30760657787323, + "learning_rate": 1.6997961930612733e-05, + "loss": 0.439, + "step": 11410 + }, + { + "epoch": 0.2533823154016049, + "grad_norm": 1.5078095197677612, + "learning_rate": 1.6995470746858204e-05, + "loss": 0.6, + "step": 11415 + }, + { + "epoch": 0.2534933019611325, + "grad_norm": 1.358195185661316, + "learning_rate": 1.6992978712617802e-05, + "loss": 0.6164, + "step": 11420 + }, + { + "epoch": 0.25360428852066014, + "grad_norm": 0.7815698981285095, + "learning_rate": 1.69904858281945e-05, + "loss": 0.464, + "step": 11425 + }, + { + "epoch": 0.25371527508018776, + "grad_norm": 0.9305065274238586, + "learning_rate": 1.6987992093891375e-05, + "loss": 0.4365, + "step": 11430 + }, + { + "epoch": 0.25382626163971544, + "grad_norm": 0.8990150094032288, + "learning_rate": 1.6985497510011606e-05, + "loss": 0.4459, + "step": 11435 + }, + { + "epoch": 0.25393724819924307, + "grad_norm": 1.0652459859848022, + "learning_rate": 1.698300207685848e-05, + "loss": 0.3107, + "step": 11440 + }, + { + "epoch": 0.2540482347587707, + "grad_norm": 0.9058518409729004, + "learning_rate": 1.698050579473538e-05, + "loss": 0.4161, + "step": 11445 + }, + { + "epoch": 0.2541592213182984, + "grad_norm": 1.298906922340393, + "learning_rate": 1.6978008663945794e-05, + "loss": 0.5682, + "step": 11450 + }, + { + "epoch": 0.254270207877826, + "grad_norm": 1.582791805267334, + "learning_rate": 1.6975510684793318e-05, + "loss": 0.6219, + "step": 11455 + }, + { + "epoch": 0.25438119443735363, + "grad_norm": 0.9926007986068726, + "learning_rate": 1.697301185758165e-05, + "loss": 0.4707, + "step": 11460 + }, + { + "epoch": 0.25449218099688126, + "grad_norm": 1.685634732246399, + "learning_rate": 1.697051218261458e-05, + "loss": 0.4601, + "step": 11465 + }, + { + "epoch": 0.25460316755640894, + "grad_norm": 1.1291474103927612, + "learning_rate": 1.696801166019602e-05, + "loss": 0.5155, + "step": 11470 + }, + { + "epoch": 0.25471415411593656, + "grad_norm": 0.8793083429336548, + "learning_rate": 1.6965510290629973e-05, + "loss": 0.3604, + "step": 11475 + }, + { + "epoch": 0.2548251406754642, + "grad_norm": 1.654395341873169, + "learning_rate": 1.6963008074220542e-05, + "loss": 0.5149, + "step": 11480 + }, + { + "epoch": 0.2549361272349918, + "grad_norm": 1.1508978605270386, + "learning_rate": 1.696050501127194e-05, + "loss": 0.4138, + "step": 11485 + }, + { + "epoch": 0.2550471137945195, + "grad_norm": 1.3007711172103882, + "learning_rate": 1.6958001102088485e-05, + "loss": 0.5267, + "step": 11490 + }, + { + "epoch": 0.2551581003540471, + "grad_norm": 1.4527742862701416, + "learning_rate": 1.6955496346974595e-05, + "loss": 0.2901, + "step": 11495 + }, + { + "epoch": 0.25526908691357475, + "grad_norm": 0.9203034043312073, + "learning_rate": 1.695299074623478e-05, + "loss": 0.4912, + "step": 11500 + }, + { + "epoch": 0.25538007347310243, + "grad_norm": 1.4323278665542603, + "learning_rate": 1.6950484300173676e-05, + "loss": 0.4698, + "step": 11505 + }, + { + "epoch": 0.25549106003263006, + "grad_norm": 1.0388230085372925, + "learning_rate": 1.6947977009095994e-05, + "loss": 0.458, + "step": 11510 + }, + { + "epoch": 0.2556020465921577, + "grad_norm": 1.0585956573486328, + "learning_rate": 1.694546887330657e-05, + "loss": 0.575, + "step": 11515 + }, + { + "epoch": 0.2557130331516853, + "grad_norm": 1.583940863609314, + "learning_rate": 1.6942959893110335e-05, + "loss": 0.4435, + "step": 11520 + }, + { + "epoch": 0.255824019711213, + "grad_norm": 1.193070650100708, + "learning_rate": 1.694045006881232e-05, + "loss": 0.4773, + "step": 11525 + }, + { + "epoch": 0.2559350062707406, + "grad_norm": 1.2020277976989746, + "learning_rate": 1.6937939400717663e-05, + "loss": 0.377, + "step": 11530 + }, + { + "epoch": 0.25604599283026824, + "grad_norm": 1.052960753440857, + "learning_rate": 1.69354278891316e-05, + "loss": 0.7132, + "step": 11535 + }, + { + "epoch": 0.25615697938979587, + "grad_norm": 1.1283267736434937, + "learning_rate": 1.693291553435948e-05, + "loss": 0.4637, + "step": 11540 + }, + { + "epoch": 0.25626796594932355, + "grad_norm": 1.459197998046875, + "learning_rate": 1.6930402336706735e-05, + "loss": 0.4351, + "step": 11545 + }, + { + "epoch": 0.2563789525088512, + "grad_norm": 1.013622760772705, + "learning_rate": 1.6927888296478918e-05, + "loss": 0.486, + "step": 11550 + }, + { + "epoch": 0.2564899390683788, + "grad_norm": 0.8087404370307922, + "learning_rate": 1.6925373413981673e-05, + "loss": 0.4736, + "step": 11555 + }, + { + "epoch": 0.2566009256279065, + "grad_norm": 0.8877795338630676, + "learning_rate": 1.692285768952076e-05, + "loss": 0.4087, + "step": 11560 + }, + { + "epoch": 0.2567119121874341, + "grad_norm": 0.8639494776725769, + "learning_rate": 1.692034112340202e-05, + "loss": 0.4806, + "step": 11565 + }, + { + "epoch": 0.25682289874696174, + "grad_norm": 0.7945966720581055, + "learning_rate": 1.691782371593142e-05, + "loss": 0.5977, + "step": 11570 + }, + { + "epoch": 0.25693388530648936, + "grad_norm": 1.2927353382110596, + "learning_rate": 1.6915305467415014e-05, + "loss": 0.5544, + "step": 11575 + }, + { + "epoch": 0.25704487186601704, + "grad_norm": 0.9172951579093933, + "learning_rate": 1.6912786378158957e-05, + "loss": 0.6539, + "step": 11580 + }, + { + "epoch": 0.25715585842554467, + "grad_norm": 1.2856504917144775, + "learning_rate": 1.691026644846952e-05, + "loss": 0.4804, + "step": 11585 + }, + { + "epoch": 0.2572668449850723, + "grad_norm": 1.0011804103851318, + "learning_rate": 1.6907745678653064e-05, + "loss": 0.4702, + "step": 11590 + }, + { + "epoch": 0.2573778315445999, + "grad_norm": 1.5778934955596924, + "learning_rate": 1.690522406901605e-05, + "loss": 0.4763, + "step": 11595 + }, + { + "epoch": 0.2574888181041276, + "grad_norm": 1.3447763919830322, + "learning_rate": 1.6902701619865056e-05, + "loss": 0.547, + "step": 11600 + }, + { + "epoch": 0.25759980466365523, + "grad_norm": 1.4077445268630981, + "learning_rate": 1.690017833150675e-05, + "loss": 0.4881, + "step": 11605 + }, + { + "epoch": 0.25771079122318286, + "grad_norm": 0.993047297000885, + "learning_rate": 1.6897654204247897e-05, + "loss": 0.6155, + "step": 11610 + }, + { + "epoch": 0.25782177778271054, + "grad_norm": 0.7152522802352905, + "learning_rate": 1.6895129238395386e-05, + "loss": 0.3392, + "step": 11615 + }, + { + "epoch": 0.25793276434223816, + "grad_norm": 1.6415187120437622, + "learning_rate": 1.6892603434256184e-05, + "loss": 0.4007, + "step": 11620 + }, + { + "epoch": 0.2580437509017658, + "grad_norm": 1.0869807004928589, + "learning_rate": 1.6890076792137373e-05, + "loss": 0.55, + "step": 11625 + }, + { + "epoch": 0.2581547374612934, + "grad_norm": 1.0225802659988403, + "learning_rate": 1.688754931234613e-05, + "loss": 0.3417, + "step": 11630 + }, + { + "epoch": 0.2582657240208211, + "grad_norm": 1.0578125715255737, + "learning_rate": 1.6885020995189743e-05, + "loss": 0.4709, + "step": 11635 + }, + { + "epoch": 0.2583767105803487, + "grad_norm": 1.2576895952224731, + "learning_rate": 1.6882491840975593e-05, + "loss": 0.504, + "step": 11640 + }, + { + "epoch": 0.25848769713987635, + "grad_norm": 1.2828280925750732, + "learning_rate": 1.6879961850011174e-05, + "loss": 0.4219, + "step": 11645 + }, + { + "epoch": 0.258598683699404, + "grad_norm": 1.12638521194458, + "learning_rate": 1.6877431022604057e-05, + "loss": 0.556, + "step": 11650 + }, + { + "epoch": 0.25870967025893166, + "grad_norm": 1.4913954734802246, + "learning_rate": 1.6874899359061946e-05, + "loss": 0.6028, + "step": 11655 + }, + { + "epoch": 0.2588206568184593, + "grad_norm": 1.3855817317962646, + "learning_rate": 1.687236685969263e-05, + "loss": 0.5162, + "step": 11660 + }, + { + "epoch": 0.2589316433779869, + "grad_norm": 1.228534460067749, + "learning_rate": 1.6869833524803995e-05, + "loss": 0.6225, + "step": 11665 + }, + { + "epoch": 0.2590426299375146, + "grad_norm": 1.5325289964675903, + "learning_rate": 1.686729935470404e-05, + "loss": 0.5332, + "step": 11670 + }, + { + "epoch": 0.2591536164970422, + "grad_norm": 1.2924435138702393, + "learning_rate": 1.6864764349700866e-05, + "loss": 0.4786, + "step": 11675 + }, + { + "epoch": 0.25926460305656984, + "grad_norm": 1.0833882093429565, + "learning_rate": 1.6862228510102657e-05, + "loss": 0.5727, + "step": 11680 + }, + { + "epoch": 0.25937558961609747, + "grad_norm": 1.1746231317520142, + "learning_rate": 1.6859691836217725e-05, + "loss": 0.2722, + "step": 11685 + }, + { + "epoch": 0.25948657617562515, + "grad_norm": 5.168801784515381, + "learning_rate": 1.6857154328354463e-05, + "loss": 0.6013, + "step": 11690 + }, + { + "epoch": 0.2595975627351528, + "grad_norm": 1.4809683561325073, + "learning_rate": 1.6854615986821377e-05, + "loss": 0.5135, + "step": 11695 + }, + { + "epoch": 0.2597085492946804, + "grad_norm": 1.0130324363708496, + "learning_rate": 1.6852076811927066e-05, + "loss": 0.3937, + "step": 11700 + }, + { + "epoch": 0.25981953585420803, + "grad_norm": 1.0422590970993042, + "learning_rate": 1.6849536803980238e-05, + "loss": 0.4183, + "step": 11705 + }, + { + "epoch": 0.2599305224137357, + "grad_norm": 1.2698982954025269, + "learning_rate": 1.6846995963289696e-05, + "loss": 0.4465, + "step": 11710 + }, + { + "epoch": 0.26004150897326334, + "grad_norm": 1.3013725280761719, + "learning_rate": 1.684445429016435e-05, + "loss": 0.5689, + "step": 11715 + }, + { + "epoch": 0.26015249553279096, + "grad_norm": 1.5303215980529785, + "learning_rate": 1.68419117849132e-05, + "loss": 0.4574, + "step": 11720 + }, + { + "epoch": 0.26026348209231864, + "grad_norm": 1.2155689001083374, + "learning_rate": 1.6839368447845366e-05, + "loss": 0.3824, + "step": 11725 + }, + { + "epoch": 0.26037446865184627, + "grad_norm": 1.172390341758728, + "learning_rate": 1.6836824279270053e-05, + "loss": 0.379, + "step": 11730 + }, + { + "epoch": 0.2604854552113739, + "grad_norm": 1.3164101839065552, + "learning_rate": 1.683427927949657e-05, + "loss": 0.4448, + "step": 11735 + }, + { + "epoch": 0.2605964417709015, + "grad_norm": 1.1558310985565186, + "learning_rate": 1.6831733448834336e-05, + "loss": 0.521, + "step": 11740 + }, + { + "epoch": 0.2607074283304292, + "grad_norm": 1.3138155937194824, + "learning_rate": 1.682918678759286e-05, + "loss": 0.5066, + "step": 11745 + }, + { + "epoch": 0.26081841488995683, + "grad_norm": 1.0508272647857666, + "learning_rate": 1.682663929608176e-05, + "loss": 0.3492, + "step": 11750 + }, + { + "epoch": 0.26092940144948445, + "grad_norm": 0.9779253005981445, + "learning_rate": 1.6824090974610742e-05, + "loss": 0.3875, + "step": 11755 + }, + { + "epoch": 0.2610403880090121, + "grad_norm": 1.0405664443969727, + "learning_rate": 1.6821541823489636e-05, + "loss": 0.4794, + "step": 11760 + }, + { + "epoch": 0.26115137456853976, + "grad_norm": 1.8929942846298218, + "learning_rate": 1.6818991843028353e-05, + "loss": 0.5157, + "step": 11765 + }, + { + "epoch": 0.2612623611280674, + "grad_norm": 1.7605929374694824, + "learning_rate": 1.681644103353691e-05, + "loss": 0.3625, + "step": 11770 + }, + { + "epoch": 0.261373347687595, + "grad_norm": 1.064549207687378, + "learning_rate": 1.6813889395325423e-05, + "loss": 0.4564, + "step": 11775 + }, + { + "epoch": 0.2614843342471227, + "grad_norm": 0.8532626032829285, + "learning_rate": 1.681133692870412e-05, + "loss": 0.4323, + "step": 11780 + }, + { + "epoch": 0.2615953208066503, + "grad_norm": 1.594598650932312, + "learning_rate": 1.6808783633983315e-05, + "loss": 0.4004, + "step": 11785 + }, + { + "epoch": 0.26170630736617795, + "grad_norm": 1.5274829864501953, + "learning_rate": 1.680622951147343e-05, + "loss": 0.3958, + "step": 11790 + }, + { + "epoch": 0.2618172939257056, + "grad_norm": 1.0514847040176392, + "learning_rate": 1.6803674561484987e-05, + "loss": 0.4269, + "step": 11795 + }, + { + "epoch": 0.26192828048523326, + "grad_norm": 1.3296819925308228, + "learning_rate": 1.680111878432861e-05, + "loss": 0.6155, + "step": 11800 + }, + { + "epoch": 0.2620392670447609, + "grad_norm": 0.8670737147331238, + "learning_rate": 1.679856218031502e-05, + "loss": 0.4943, + "step": 11805 + }, + { + "epoch": 0.2621502536042885, + "grad_norm": 0.9139857888221741, + "learning_rate": 1.6796004749755043e-05, + "loss": 0.4629, + "step": 11810 + }, + { + "epoch": 0.2622612401638162, + "grad_norm": 1.5310050249099731, + "learning_rate": 1.6793446492959596e-05, + "loss": 0.3929, + "step": 11815 + }, + { + "epoch": 0.2623722267233438, + "grad_norm": 1.3357696533203125, + "learning_rate": 1.679088741023971e-05, + "loss": 0.4999, + "step": 11820 + }, + { + "epoch": 0.26248321328287144, + "grad_norm": 1.1147435903549194, + "learning_rate": 1.6788327501906507e-05, + "loss": 0.6116, + "step": 11825 + }, + { + "epoch": 0.26259419984239907, + "grad_norm": 1.315943717956543, + "learning_rate": 1.678576676827121e-05, + "loss": 0.3868, + "step": 11830 + }, + { + "epoch": 0.26270518640192675, + "grad_norm": 0.9401881098747253, + "learning_rate": 1.678320520964515e-05, + "loss": 0.5025, + "step": 11835 + }, + { + "epoch": 0.2628161729614544, + "grad_norm": 1.2512296438217163, + "learning_rate": 1.678064282633975e-05, + "loss": 0.4544, + "step": 11840 + }, + { + "epoch": 0.262927159520982, + "grad_norm": 0.9268614649772644, + "learning_rate": 1.6778079618666536e-05, + "loss": 0.4752, + "step": 11845 + }, + { + "epoch": 0.2630381460805096, + "grad_norm": 1.4356162548065186, + "learning_rate": 1.6775515586937135e-05, + "loss": 0.6672, + "step": 11850 + }, + { + "epoch": 0.2631491326400373, + "grad_norm": 1.5182090997695923, + "learning_rate": 1.677295073146327e-05, + "loss": 0.6735, + "step": 11855 + }, + { + "epoch": 0.26326011919956493, + "grad_norm": 1.3008809089660645, + "learning_rate": 1.677038505255677e-05, + "loss": 0.5279, + "step": 11860 + }, + { + "epoch": 0.26337110575909256, + "grad_norm": 0.9265232086181641, + "learning_rate": 1.6767818550529564e-05, + "loss": 0.3631, + "step": 11865 + }, + { + "epoch": 0.26348209231862024, + "grad_norm": 1.1864442825317383, + "learning_rate": 1.676525122569367e-05, + "loss": 0.4952, + "step": 11870 + }, + { + "epoch": 0.26359307887814787, + "grad_norm": 1.2192386388778687, + "learning_rate": 1.676268307836123e-05, + "loss": 0.5234, + "step": 11875 + }, + { + "epoch": 0.2637040654376755, + "grad_norm": 0.9491934180259705, + "learning_rate": 1.6760114108844453e-05, + "loss": 0.5122, + "step": 11880 + }, + { + "epoch": 0.2638150519972031, + "grad_norm": 1.0653088092803955, + "learning_rate": 1.6757544317455677e-05, + "loss": 0.5234, + "step": 11885 + }, + { + "epoch": 0.2639260385567308, + "grad_norm": 0.9259722828865051, + "learning_rate": 1.6754973704507325e-05, + "loss": 0.5128, + "step": 11890 + }, + { + "epoch": 0.26403702511625843, + "grad_norm": 1.3802366256713867, + "learning_rate": 1.6752402270311928e-05, + "loss": 0.4486, + "step": 11895 + }, + { + "epoch": 0.26414801167578605, + "grad_norm": 1.1017757654190063, + "learning_rate": 1.6749830015182106e-05, + "loss": 0.5004, + "step": 11900 + }, + { + "epoch": 0.2642589982353137, + "grad_norm": 1.4015121459960938, + "learning_rate": 1.674725693943059e-05, + "loss": 0.3029, + "step": 11905 + }, + { + "epoch": 0.26436998479484136, + "grad_norm": 1.651033878326416, + "learning_rate": 1.6744683043370204e-05, + "loss": 0.514, + "step": 11910 + }, + { + "epoch": 0.264480971354369, + "grad_norm": 1.0948195457458496, + "learning_rate": 1.6742108327313872e-05, + "loss": 0.6031, + "step": 11915 + }, + { + "epoch": 0.2645919579138966, + "grad_norm": 0.9787044525146484, + "learning_rate": 1.673953279157462e-05, + "loss": 0.5408, + "step": 11920 + }, + { + "epoch": 0.2647029444734243, + "grad_norm": 1.5225952863693237, + "learning_rate": 1.6736956436465573e-05, + "loss": 0.5068, + "step": 11925 + }, + { + "epoch": 0.2648139310329519, + "grad_norm": 0.8884637951850891, + "learning_rate": 1.6734379262299957e-05, + "loss": 0.4416, + "step": 11930 + }, + { + "epoch": 0.26492491759247955, + "grad_norm": 1.1771692037582397, + "learning_rate": 1.6731801269391098e-05, + "loss": 0.518, + "step": 11935 + }, + { + "epoch": 0.2650359041520072, + "grad_norm": 0.838868260383606, + "learning_rate": 1.672922245805242e-05, + "loss": 0.5394, + "step": 11940 + }, + { + "epoch": 0.26514689071153486, + "grad_norm": 1.4811780452728271, + "learning_rate": 1.6726642828597436e-05, + "loss": 0.5489, + "step": 11945 + }, + { + "epoch": 0.2652578772710625, + "grad_norm": 0.8113346099853516, + "learning_rate": 1.672406238133978e-05, + "loss": 0.4482, + "step": 11950 + }, + { + "epoch": 0.2653688638305901, + "grad_norm": 1.0179511308670044, + "learning_rate": 1.672148111659317e-05, + "loss": 0.382, + "step": 11955 + }, + { + "epoch": 0.26547985039011773, + "grad_norm": 1.4388052225112915, + "learning_rate": 1.671889903467143e-05, + "loss": 0.3759, + "step": 11960 + }, + { + "epoch": 0.2655908369496454, + "grad_norm": 1.3310468196868896, + "learning_rate": 1.6716316135888478e-05, + "loss": 0.3242, + "step": 11965 + }, + { + "epoch": 0.26570182350917304, + "grad_norm": 1.3747960329055786, + "learning_rate": 1.6713732420558333e-05, + "loss": 0.4654, + "step": 11970 + }, + { + "epoch": 0.26581281006870067, + "grad_norm": 1.1084215641021729, + "learning_rate": 1.6711147888995117e-05, + "loss": 0.4765, + "step": 11975 + }, + { + "epoch": 0.26592379662822835, + "grad_norm": 1.0055367946624756, + "learning_rate": 1.670856254151305e-05, + "loss": 0.4503, + "step": 11980 + }, + { + "epoch": 0.266034783187756, + "grad_norm": 1.1023660898208618, + "learning_rate": 1.6705976378426447e-05, + "loss": 0.4315, + "step": 11985 + }, + { + "epoch": 0.2661457697472836, + "grad_norm": 1.4151127338409424, + "learning_rate": 1.6703389400049724e-05, + "loss": 0.4301, + "step": 11990 + }, + { + "epoch": 0.2662567563068112, + "grad_norm": 1.0657737255096436, + "learning_rate": 1.67008016066974e-05, + "loss": 0.4067, + "step": 11995 + }, + { + "epoch": 0.2663677428663389, + "grad_norm": 1.3572338819503784, + "learning_rate": 1.669821299868409e-05, + "loss": 0.504, + "step": 12000 + }, + { + "epoch": 0.26647872942586653, + "grad_norm": 1.3516322374343872, + "learning_rate": 1.669562357632451e-05, + "loss": 0.3876, + "step": 12005 + }, + { + "epoch": 0.26658971598539416, + "grad_norm": 1.1288280487060547, + "learning_rate": 1.669303333993347e-05, + "loss": 0.5649, + "step": 12010 + }, + { + "epoch": 0.2667007025449218, + "grad_norm": 1.294328212738037, + "learning_rate": 1.6690442289825882e-05, + "loss": 0.6067, + "step": 12015 + }, + { + "epoch": 0.26681168910444947, + "grad_norm": 0.8820907473564148, + "learning_rate": 1.6687850426316758e-05, + "loss": 0.4497, + "step": 12020 + }, + { + "epoch": 0.2669226756639771, + "grad_norm": 0.8859009742736816, + "learning_rate": 1.668525774972121e-05, + "loss": 0.4719, + "step": 12025 + }, + { + "epoch": 0.2670336622235047, + "grad_norm": 1.128180742263794, + "learning_rate": 1.6682664260354445e-05, + "loss": 0.5357, + "step": 12030 + }, + { + "epoch": 0.2671446487830324, + "grad_norm": 1.1565715074539185, + "learning_rate": 1.6680069958531772e-05, + "loss": 0.4663, + "step": 12035 + }, + { + "epoch": 0.26725563534256, + "grad_norm": 1.0250076055526733, + "learning_rate": 1.6677474844568593e-05, + "loss": 0.5358, + "step": 12040 + }, + { + "epoch": 0.26736662190208765, + "grad_norm": 1.1731986999511719, + "learning_rate": 1.667487891878042e-05, + "loss": 0.4901, + "step": 12045 + }, + { + "epoch": 0.2674776084616153, + "grad_norm": 1.184151291847229, + "learning_rate": 1.667228218148285e-05, + "loss": 0.3815, + "step": 12050 + }, + { + "epoch": 0.26758859502114296, + "grad_norm": 1.1311311721801758, + "learning_rate": 1.6669684632991594e-05, + "loss": 0.5596, + "step": 12055 + }, + { + "epoch": 0.2676995815806706, + "grad_norm": 1.0905286073684692, + "learning_rate": 1.6667086273622447e-05, + "loss": 0.4146, + "step": 12060 + }, + { + "epoch": 0.2678105681401982, + "grad_norm": 1.4904059171676636, + "learning_rate": 1.666448710369131e-05, + "loss": 0.6837, + "step": 12065 + }, + { + "epoch": 0.26792155469972584, + "grad_norm": 1.134896159172058, + "learning_rate": 1.6661887123514183e-05, + "loss": 0.3944, + "step": 12070 + }, + { + "epoch": 0.2680325412592535, + "grad_norm": 1.0507289171218872, + "learning_rate": 1.665928633340716e-05, + "loss": 0.3522, + "step": 12075 + }, + { + "epoch": 0.26814352781878115, + "grad_norm": 1.3701528310775757, + "learning_rate": 1.6656684733686443e-05, + "loss": 0.4823, + "step": 12080 + }, + { + "epoch": 0.2682545143783088, + "grad_norm": 1.105454683303833, + "learning_rate": 1.6654082324668316e-05, + "loss": 0.4494, + "step": 12085 + }, + { + "epoch": 0.26836550093783645, + "grad_norm": 1.7847298383712769, + "learning_rate": 1.6651479106669177e-05, + "loss": 0.3987, + "step": 12090 + }, + { + "epoch": 0.2684764874973641, + "grad_norm": 1.2652286291122437, + "learning_rate": 1.6648875080005515e-05, + "loss": 0.5751, + "step": 12095 + }, + { + "epoch": 0.2685874740568917, + "grad_norm": 1.2714149951934814, + "learning_rate": 1.664627024499392e-05, + "loss": 0.5911, + "step": 12100 + }, + { + "epoch": 0.26869846061641933, + "grad_norm": 1.3231234550476074, + "learning_rate": 1.664366460195108e-05, + "loss": 0.5036, + "step": 12105 + }, + { + "epoch": 0.268809447175947, + "grad_norm": 1.1678462028503418, + "learning_rate": 1.6641058151193776e-05, + "loss": 0.5526, + "step": 12110 + }, + { + "epoch": 0.26892043373547464, + "grad_norm": 1.274032711982727, + "learning_rate": 1.6638450893038895e-05, + "loss": 0.5519, + "step": 12115 + }, + { + "epoch": 0.26903142029500227, + "grad_norm": 1.2381004095077515, + "learning_rate": 1.663584282780342e-05, + "loss": 0.4028, + "step": 12120 + }, + { + "epoch": 0.2691424068545299, + "grad_norm": 1.223659873008728, + "learning_rate": 1.6633233955804428e-05, + "loss": 0.4646, + "step": 12125 + }, + { + "epoch": 0.2692533934140576, + "grad_norm": 1.3925018310546875, + "learning_rate": 1.66306242773591e-05, + "loss": 0.3454, + "step": 12130 + }, + { + "epoch": 0.2693643799735852, + "grad_norm": 1.2066676616668701, + "learning_rate": 1.6628013792784705e-05, + "loss": 0.5239, + "step": 12135 + }, + { + "epoch": 0.2694753665331128, + "grad_norm": 1.4494436979293823, + "learning_rate": 1.6625402502398623e-05, + "loss": 0.4883, + "step": 12140 + }, + { + "epoch": 0.2695863530926405, + "grad_norm": 1.3394190073013306, + "learning_rate": 1.6622790406518327e-05, + "loss": 0.508, + "step": 12145 + }, + { + "epoch": 0.26969733965216813, + "grad_norm": 0.7215554714202881, + "learning_rate": 1.6620177505461383e-05, + "loss": 0.4276, + "step": 12150 + }, + { + "epoch": 0.26980832621169576, + "grad_norm": 1.1991218328475952, + "learning_rate": 1.6617563799545462e-05, + "loss": 0.3537, + "step": 12155 + }, + { + "epoch": 0.2699193127712234, + "grad_norm": 0.8841428756713867, + "learning_rate": 1.6614949289088323e-05, + "loss": 0.3806, + "step": 12160 + }, + { + "epoch": 0.27003029933075107, + "grad_norm": 1.0124881267547607, + "learning_rate": 1.661233397440784e-05, + "loss": 0.6561, + "step": 12165 + }, + { + "epoch": 0.2701412858902787, + "grad_norm": 1.4342435598373413, + "learning_rate": 1.6609717855821965e-05, + "loss": 0.3026, + "step": 12170 + }, + { + "epoch": 0.2702522724498063, + "grad_norm": 1.2713871002197266, + "learning_rate": 1.6607100933648763e-05, + "loss": 0.489, + "step": 12175 + }, + { + "epoch": 0.27036325900933394, + "grad_norm": 1.5005031824111938, + "learning_rate": 1.6604483208206387e-05, + "loss": 0.4706, + "step": 12180 + }, + { + "epoch": 0.2704742455688616, + "grad_norm": 1.0889896154403687, + "learning_rate": 1.6601864679813088e-05, + "loss": 0.4989, + "step": 12185 + }, + { + "epoch": 0.27058523212838925, + "grad_norm": 1.6298508644104004, + "learning_rate": 1.659924534878723e-05, + "loss": 0.4919, + "step": 12190 + }, + { + "epoch": 0.2706962186879169, + "grad_norm": 1.2517374753952026, + "learning_rate": 1.659662521544725e-05, + "loss": 0.4966, + "step": 12195 + }, + { + "epoch": 0.27080720524744456, + "grad_norm": 1.2407162189483643, + "learning_rate": 1.6594004280111697e-05, + "loss": 0.6297, + "step": 12200 + }, + { + "epoch": 0.2709181918069722, + "grad_norm": 1.2246013879776, + "learning_rate": 1.6591382543099222e-05, + "loss": 0.5794, + "step": 12205 + }, + { + "epoch": 0.2710291783664998, + "grad_norm": 1.1958905458450317, + "learning_rate": 1.6588760004728565e-05, + "loss": 0.4974, + "step": 12210 + }, + { + "epoch": 0.27114016492602744, + "grad_norm": 1.1367417573928833, + "learning_rate": 1.658613666531856e-05, + "loss": 0.4294, + "step": 12215 + }, + { + "epoch": 0.2712511514855551, + "grad_norm": 0.8926473259925842, + "learning_rate": 1.6583512525188146e-05, + "loss": 0.4302, + "step": 12220 + }, + { + "epoch": 0.27136213804508275, + "grad_norm": 1.131288766860962, + "learning_rate": 1.658088758465636e-05, + "loss": 0.528, + "step": 12225 + }, + { + "epoch": 0.27147312460461037, + "grad_norm": 1.3164432048797607, + "learning_rate": 1.6578261844042335e-05, + "loss": 0.6973, + "step": 12230 + }, + { + "epoch": 0.271584111164138, + "grad_norm": 1.7050387859344482, + "learning_rate": 1.6575635303665296e-05, + "loss": 0.4737, + "step": 12235 + }, + { + "epoch": 0.2716950977236657, + "grad_norm": 1.5529253482818604, + "learning_rate": 1.657300796384457e-05, + "loss": 0.5328, + "step": 12240 + }, + { + "epoch": 0.2718060842831933, + "grad_norm": 0.7697571516036987, + "learning_rate": 1.6570379824899576e-05, + "loss": 0.6543, + "step": 12245 + }, + { + "epoch": 0.27191707084272093, + "grad_norm": 0.7909753322601318, + "learning_rate": 1.656775088714984e-05, + "loss": 0.4149, + "step": 12250 + }, + { + "epoch": 0.2720280574022486, + "grad_norm": 1.0096886157989502, + "learning_rate": 1.656512115091498e-05, + "loss": 0.4909, + "step": 12255 + }, + { + "epoch": 0.27213904396177624, + "grad_norm": 0.891508162021637, + "learning_rate": 1.6562490616514705e-05, + "loss": 0.3828, + "step": 12260 + }, + { + "epoch": 0.27225003052130387, + "grad_norm": 2.050764560699463, + "learning_rate": 1.6559859284268833e-05, + "loss": 0.5661, + "step": 12265 + }, + { + "epoch": 0.2723610170808315, + "grad_norm": 1.347254991531372, + "learning_rate": 1.6557227154497266e-05, + "loss": 0.5759, + "step": 12270 + }, + { + "epoch": 0.2724720036403592, + "grad_norm": 0.8337706327438354, + "learning_rate": 1.6554594227520015e-05, + "loss": 0.5092, + "step": 12275 + }, + { + "epoch": 0.2725829901998868, + "grad_norm": 1.2662127017974854, + "learning_rate": 1.6551960503657182e-05, + "loss": 0.4536, + "step": 12280 + }, + { + "epoch": 0.2726939767594144, + "grad_norm": 1.0730518102645874, + "learning_rate": 1.654932598322896e-05, + "loss": 0.4121, + "step": 12285 + }, + { + "epoch": 0.27280496331894205, + "grad_norm": 1.139347791671753, + "learning_rate": 1.6546690666555652e-05, + "loss": 0.4884, + "step": 12290 + }, + { + "epoch": 0.27291594987846973, + "grad_norm": 1.3560361862182617, + "learning_rate": 1.654405455395765e-05, + "loss": 0.4048, + "step": 12295 + }, + { + "epoch": 0.27302693643799736, + "grad_norm": 0.7781074643135071, + "learning_rate": 1.654141764575544e-05, + "loss": 0.3949, + "step": 12300 + }, + { + "epoch": 0.273137922997525, + "grad_norm": 0.5743057131767273, + "learning_rate": 1.6538779942269613e-05, + "loss": 0.3579, + "step": 12305 + }, + { + "epoch": 0.27324890955705267, + "grad_norm": 1.3679219484329224, + "learning_rate": 1.6536141443820844e-05, + "loss": 0.557, + "step": 12310 + }, + { + "epoch": 0.2733598961165803, + "grad_norm": 1.3961282968521118, + "learning_rate": 1.6533502150729925e-05, + "loss": 0.4722, + "step": 12315 + }, + { + "epoch": 0.2734708826761079, + "grad_norm": 1.3438667058944702, + "learning_rate": 1.6530862063317726e-05, + "loss": 0.465, + "step": 12320 + }, + { + "epoch": 0.27358186923563554, + "grad_norm": 1.1423835754394531, + "learning_rate": 1.6528221181905217e-05, + "loss": 0.5002, + "step": 12325 + }, + { + "epoch": 0.2736928557951632, + "grad_norm": 0.8907164335250854, + "learning_rate": 1.6525579506813472e-05, + "loss": 0.3263, + "step": 12330 + }, + { + "epoch": 0.27380384235469085, + "grad_norm": 1.622192144393921, + "learning_rate": 1.652293703836366e-05, + "loss": 0.5461, + "step": 12335 + }, + { + "epoch": 0.2739148289142185, + "grad_norm": 0.7274703979492188, + "learning_rate": 1.6520293776877033e-05, + "loss": 0.2692, + "step": 12340 + }, + { + "epoch": 0.2740258154737461, + "grad_norm": 1.012320876121521, + "learning_rate": 1.6517649722674958e-05, + "loss": 0.4936, + "step": 12345 + }, + { + "epoch": 0.2741368020332738, + "grad_norm": 1.2902557849884033, + "learning_rate": 1.6515004876078887e-05, + "loss": 0.5255, + "step": 12350 + }, + { + "epoch": 0.2742477885928014, + "grad_norm": 1.0321226119995117, + "learning_rate": 1.6512359237410375e-05, + "loss": 0.37, + "step": 12355 + }, + { + "epoch": 0.27435877515232904, + "grad_norm": 1.5266025066375732, + "learning_rate": 1.650971280699107e-05, + "loss": 0.4362, + "step": 12360 + }, + { + "epoch": 0.2744697617118567, + "grad_norm": 1.1383706331253052, + "learning_rate": 1.6507065585142707e-05, + "loss": 0.3219, + "step": 12365 + }, + { + "epoch": 0.27458074827138435, + "grad_norm": 1.4877493381500244, + "learning_rate": 1.6504417572187138e-05, + "loss": 0.6639, + "step": 12370 + }, + { + "epoch": 0.27469173483091197, + "grad_norm": 1.5238726139068604, + "learning_rate": 1.6501768768446292e-05, + "loss": 0.5665, + "step": 12375 + }, + { + "epoch": 0.2748027213904396, + "grad_norm": 0.9921168684959412, + "learning_rate": 1.6499119174242207e-05, + "loss": 0.4326, + "step": 12380 + }, + { + "epoch": 0.2749137079499673, + "grad_norm": 1.0095458030700684, + "learning_rate": 1.649646878989701e-05, + "loss": 0.3943, + "step": 12385 + }, + { + "epoch": 0.2750246945094949, + "grad_norm": 1.2044517993927002, + "learning_rate": 1.649381761573292e-05, + "loss": 0.4818, + "step": 12390 + }, + { + "epoch": 0.27513568106902253, + "grad_norm": 0.9758161306381226, + "learning_rate": 1.6491165652072268e-05, + "loss": 0.4202, + "step": 12395 + }, + { + "epoch": 0.27524666762855016, + "grad_norm": 1.5961153507232666, + "learning_rate": 1.648851289923746e-05, + "loss": 0.4546, + "step": 12400 + }, + { + "epoch": 0.27535765418807784, + "grad_norm": 1.5342501401901245, + "learning_rate": 1.648585935755102e-05, + "loss": 0.5894, + "step": 12405 + }, + { + "epoch": 0.27546864074760546, + "grad_norm": 1.1912126541137695, + "learning_rate": 1.648320502733555e-05, + "loss": 0.4693, + "step": 12410 + }, + { + "epoch": 0.2755796273071331, + "grad_norm": 1.8135324716567993, + "learning_rate": 1.6480549908913756e-05, + "loss": 0.3781, + "step": 12415 + }, + { + "epoch": 0.27569061386666077, + "grad_norm": 1.4575508832931519, + "learning_rate": 1.6477894002608435e-05, + "loss": 0.331, + "step": 12420 + }, + { + "epoch": 0.2758016004261884, + "grad_norm": 1.269692063331604, + "learning_rate": 1.647523730874249e-05, + "loss": 0.4155, + "step": 12425 + }, + { + "epoch": 0.275912586985716, + "grad_norm": 1.1125141382217407, + "learning_rate": 1.6472579827638906e-05, + "loss": 0.4862, + "step": 12430 + }, + { + "epoch": 0.27602357354524365, + "grad_norm": 1.6777321100234985, + "learning_rate": 1.6469921559620777e-05, + "loss": 0.3672, + "step": 12435 + }, + { + "epoch": 0.27613456010477133, + "grad_norm": 1.0261355638504028, + "learning_rate": 1.6467262505011282e-05, + "loss": 0.3913, + "step": 12440 + }, + { + "epoch": 0.27624554666429896, + "grad_norm": 1.3293694257736206, + "learning_rate": 1.64646026641337e-05, + "loss": 0.5825, + "step": 12445 + }, + { + "epoch": 0.2763565332238266, + "grad_norm": 1.1149784326553345, + "learning_rate": 1.6461942037311406e-05, + "loss": 0.348, + "step": 12450 + }, + { + "epoch": 0.2764675197833542, + "grad_norm": 0.995347261428833, + "learning_rate": 1.6459280624867876e-05, + "loss": 0.421, + "step": 12455 + }, + { + "epoch": 0.2765785063428819, + "grad_norm": 0.7128069400787354, + "learning_rate": 1.6456618427126664e-05, + "loss": 0.4621, + "step": 12460 + }, + { + "epoch": 0.2766894929024095, + "grad_norm": 1.2790063619613647, + "learning_rate": 1.645395544441144e-05, + "loss": 0.554, + "step": 12465 + }, + { + "epoch": 0.27680047946193714, + "grad_norm": 1.239193320274353, + "learning_rate": 1.645129167704596e-05, + "loss": 0.3666, + "step": 12470 + }, + { + "epoch": 0.2769114660214648, + "grad_norm": 1.1735997200012207, + "learning_rate": 1.644862712535407e-05, + "loss": 0.6032, + "step": 12475 + }, + { + "epoch": 0.27702245258099245, + "grad_norm": 1.070043683052063, + "learning_rate": 1.6445961789659724e-05, + "loss": 0.5279, + "step": 12480 + }, + { + "epoch": 0.2771334391405201, + "grad_norm": 2.237879991531372, + "learning_rate": 1.644329567028696e-05, + "loss": 0.5118, + "step": 12485 + }, + { + "epoch": 0.2772444257000477, + "grad_norm": 0.990967333316803, + "learning_rate": 1.644062876755992e-05, + "loss": 0.5334, + "step": 12490 + }, + { + "epoch": 0.2773554122595754, + "grad_norm": 1.605442762374878, + "learning_rate": 1.6437961081802835e-05, + "loss": 0.5114, + "step": 12495 + }, + { + "epoch": 0.277466398819103, + "grad_norm": 1.337741494178772, + "learning_rate": 1.643529261334003e-05, + "loss": 0.6015, + "step": 12500 + }, + { + "epoch": 0.27757738537863064, + "grad_norm": 1.5959599018096924, + "learning_rate": 1.643262336249593e-05, + "loss": 0.5041, + "step": 12505 + }, + { + "epoch": 0.27768837193815826, + "grad_norm": 1.140795350074768, + "learning_rate": 1.642995332959506e-05, + "loss": 0.5169, + "step": 12510 + }, + { + "epoch": 0.27779935849768594, + "grad_norm": 1.01712965965271, + "learning_rate": 1.6427282514962027e-05, + "loss": 0.3743, + "step": 12515 + }, + { + "epoch": 0.27791034505721357, + "grad_norm": 1.3541737794876099, + "learning_rate": 1.642461091892154e-05, + "loss": 0.595, + "step": 12520 + }, + { + "epoch": 0.2780213316167412, + "grad_norm": 0.8120675683021545, + "learning_rate": 1.64219385417984e-05, + "loss": 0.4762, + "step": 12525 + }, + { + "epoch": 0.2781323181762689, + "grad_norm": 1.5176632404327393, + "learning_rate": 1.6419265383917515e-05, + "loss": 0.5956, + "step": 12530 + }, + { + "epoch": 0.2782433047357965, + "grad_norm": 2.3387813568115234, + "learning_rate": 1.641659144560387e-05, + "loss": 0.4821, + "step": 12535 + }, + { + "epoch": 0.27835429129532413, + "grad_norm": 1.152773380279541, + "learning_rate": 1.6413916727182562e-05, + "loss": 0.4751, + "step": 12540 + }, + { + "epoch": 0.27846527785485176, + "grad_norm": 0.9320288300514221, + "learning_rate": 1.6411241228978764e-05, + "loss": 0.4971, + "step": 12545 + }, + { + "epoch": 0.27857626441437944, + "grad_norm": 1.1640987396240234, + "learning_rate": 1.640856495131776e-05, + "loss": 0.4763, + "step": 12550 + }, + { + "epoch": 0.27868725097390706, + "grad_norm": 1.433164119720459, + "learning_rate": 1.6405887894524925e-05, + "loss": 0.4243, + "step": 12555 + }, + { + "epoch": 0.2787982375334347, + "grad_norm": 1.2848843336105347, + "learning_rate": 1.640321005892572e-05, + "loss": 0.4844, + "step": 12560 + }, + { + "epoch": 0.2789092240929623, + "grad_norm": 1.153416633605957, + "learning_rate": 1.640053144484571e-05, + "loss": 0.5273, + "step": 12565 + }, + { + "epoch": 0.27902021065249, + "grad_norm": 1.2133926153182983, + "learning_rate": 1.6397852052610554e-05, + "loss": 0.6296, + "step": 12570 + }, + { + "epoch": 0.2791311972120176, + "grad_norm": 1.1986991167068481, + "learning_rate": 1.6395171882546002e-05, + "loss": 0.5515, + "step": 12575 + }, + { + "epoch": 0.27924218377154525, + "grad_norm": 0.9386587142944336, + "learning_rate": 1.63924909349779e-05, + "loss": 0.5135, + "step": 12580 + }, + { + "epoch": 0.27935317033107293, + "grad_norm": 0.9406764507293701, + "learning_rate": 1.6389809210232193e-05, + "loss": 0.4616, + "step": 12585 + }, + { + "epoch": 0.27946415689060056, + "grad_norm": 1.2825262546539307, + "learning_rate": 1.6387126708634905e-05, + "loss": 0.5227, + "step": 12590 + }, + { + "epoch": 0.2795751434501282, + "grad_norm": 0.9845243096351624, + "learning_rate": 1.6384443430512176e-05, + "loss": 0.5347, + "step": 12595 + }, + { + "epoch": 0.2796861300096558, + "grad_norm": 1.4052627086639404, + "learning_rate": 1.638175937619023e-05, + "loss": 0.3989, + "step": 12600 + }, + { + "epoch": 0.2797971165691835, + "grad_norm": 1.7123472690582275, + "learning_rate": 1.6379074545995374e-05, + "loss": 0.5085, + "step": 12605 + }, + { + "epoch": 0.2799081031287111, + "grad_norm": 0.9626907706260681, + "learning_rate": 1.6376388940254034e-05, + "loss": 0.5472, + "step": 12610 + }, + { + "epoch": 0.28001908968823874, + "grad_norm": 1.1217817068099976, + "learning_rate": 1.6373702559292712e-05, + "loss": 0.4536, + "step": 12615 + }, + { + "epoch": 0.28013007624776637, + "grad_norm": 1.1611618995666504, + "learning_rate": 1.6371015403438006e-05, + "loss": 0.4647, + "step": 12620 + }, + { + "epoch": 0.28024106280729405, + "grad_norm": 1.1444199085235596, + "learning_rate": 1.6368327473016613e-05, + "loss": 0.3815, + "step": 12625 + }, + { + "epoch": 0.2803520493668217, + "grad_norm": 1.4538722038269043, + "learning_rate": 1.6365638768355325e-05, + "loss": 0.4752, + "step": 12630 + }, + { + "epoch": 0.2804630359263493, + "grad_norm": 1.1787018775939941, + "learning_rate": 1.6362949289781026e-05, + "loss": 0.4783, + "step": 12635 + }, + { + "epoch": 0.280574022485877, + "grad_norm": 1.4149473905563354, + "learning_rate": 1.6360259037620688e-05, + "loss": 0.4009, + "step": 12640 + }, + { + "epoch": 0.2806850090454046, + "grad_norm": 1.4412766695022583, + "learning_rate": 1.635756801220139e-05, + "loss": 0.5886, + "step": 12645 + }, + { + "epoch": 0.28079599560493224, + "grad_norm": 1.1926918029785156, + "learning_rate": 1.6354876213850296e-05, + "loss": 0.4492, + "step": 12650 + }, + { + "epoch": 0.28090698216445986, + "grad_norm": 1.1497153043746948, + "learning_rate": 1.6352183642894662e-05, + "loss": 0.4405, + "step": 12655 + }, + { + "epoch": 0.28101796872398754, + "grad_norm": 1.1040529012680054, + "learning_rate": 1.6349490299661845e-05, + "loss": 0.4971, + "step": 12660 + }, + { + "epoch": 0.28112895528351517, + "grad_norm": 1.356378197669983, + "learning_rate": 1.6346796184479293e-05, + "loss": 0.4433, + "step": 12665 + }, + { + "epoch": 0.2812399418430428, + "grad_norm": 1.1610051393508911, + "learning_rate": 1.6344101297674545e-05, + "loss": 0.5633, + "step": 12670 + }, + { + "epoch": 0.2813509284025704, + "grad_norm": 1.1507076025009155, + "learning_rate": 1.6341405639575235e-05, + "loss": 0.4373, + "step": 12675 + }, + { + "epoch": 0.2814619149620981, + "grad_norm": 1.0662920475006104, + "learning_rate": 1.6338709210509098e-05, + "loss": 0.6102, + "step": 12680 + }, + { + "epoch": 0.28157290152162573, + "grad_norm": 1.0761206150054932, + "learning_rate": 1.6336012010803953e-05, + "loss": 0.664, + "step": 12685 + }, + { + "epoch": 0.28168388808115336, + "grad_norm": 0.75835782289505, + "learning_rate": 1.6333314040787716e-05, + "loss": 0.3878, + "step": 12690 + }, + { + "epoch": 0.28179487464068104, + "grad_norm": 1.2015706300735474, + "learning_rate": 1.6330615300788403e-05, + "loss": 0.4494, + "step": 12695 + }, + { + "epoch": 0.28190586120020866, + "grad_norm": 1.2543123960494995, + "learning_rate": 1.6327915791134107e-05, + "loss": 0.4978, + "step": 12700 + }, + { + "epoch": 0.2820168477597363, + "grad_norm": 1.263872504234314, + "learning_rate": 1.6325215512153035e-05, + "loss": 0.6351, + "step": 12705 + }, + { + "epoch": 0.2821278343192639, + "grad_norm": 1.4159135818481445, + "learning_rate": 1.6322514464173472e-05, + "loss": 0.4785, + "step": 12710 + }, + { + "epoch": 0.2822388208787916, + "grad_norm": 1.1585519313812256, + "learning_rate": 1.6319812647523805e-05, + "loss": 0.513, + "step": 12715 + }, + { + "epoch": 0.2823498074383192, + "grad_norm": 1.630321979522705, + "learning_rate": 1.631711006253251e-05, + "loss": 0.5437, + "step": 12720 + }, + { + "epoch": 0.28246079399784685, + "grad_norm": 1.5868574380874634, + "learning_rate": 1.6314406709528164e-05, + "loss": 0.3609, + "step": 12725 + }, + { + "epoch": 0.2825717805573745, + "grad_norm": 1.2006999254226685, + "learning_rate": 1.6311702588839423e-05, + "loss": 0.4274, + "step": 12730 + }, + { + "epoch": 0.28268276711690216, + "grad_norm": 1.2211520671844482, + "learning_rate": 1.630899770079505e-05, + "loss": 0.4321, + "step": 12735 + }, + { + "epoch": 0.2827937536764298, + "grad_norm": 0.8724921941757202, + "learning_rate": 1.6306292045723894e-05, + "loss": 0.3865, + "step": 12740 + }, + { + "epoch": 0.2829047402359574, + "grad_norm": 1.4219202995300293, + "learning_rate": 1.6303585623954904e-05, + "loss": 0.5281, + "step": 12745 + }, + { + "epoch": 0.2830157267954851, + "grad_norm": 1.7321420907974243, + "learning_rate": 1.6300878435817115e-05, + "loss": 0.5838, + "step": 12750 + }, + { + "epoch": 0.2831267133550127, + "grad_norm": 1.188020944595337, + "learning_rate": 1.629817048163965e-05, + "loss": 0.6092, + "step": 12755 + }, + { + "epoch": 0.28323769991454034, + "grad_norm": 1.03322434425354, + "learning_rate": 1.629546176175175e-05, + "loss": 0.3896, + "step": 12760 + }, + { + "epoch": 0.28334868647406797, + "grad_norm": 1.1465661525726318, + "learning_rate": 1.6292752276482714e-05, + "loss": 0.4183, + "step": 12765 + }, + { + "epoch": 0.28345967303359565, + "grad_norm": 1.3475068807601929, + "learning_rate": 1.6290042026161964e-05, + "loss": 0.4569, + "step": 12770 + }, + { + "epoch": 0.2835706595931233, + "grad_norm": 0.868467390537262, + "learning_rate": 1.6287331011119002e-05, + "loss": 0.3988, + "step": 12775 + }, + { + "epoch": 0.2836816461526509, + "grad_norm": 1.208096981048584, + "learning_rate": 1.6284619231683418e-05, + "loss": 0.417, + "step": 12780 + }, + { + "epoch": 0.2837926327121785, + "grad_norm": 1.2644482851028442, + "learning_rate": 1.6281906688184905e-05, + "loss": 0.4912, + "step": 12785 + }, + { + "epoch": 0.2839036192717062, + "grad_norm": 1.1698830127716064, + "learning_rate": 1.6279193380953247e-05, + "loss": 0.4079, + "step": 12790 + }, + { + "epoch": 0.28401460583123384, + "grad_norm": 1.3247534036636353, + "learning_rate": 1.6276479310318315e-05, + "loss": 0.5211, + "step": 12795 + }, + { + "epoch": 0.28412559239076146, + "grad_norm": 1.4367425441741943, + "learning_rate": 1.627376447661008e-05, + "loss": 0.4516, + "step": 12800 + }, + { + "epoch": 0.28423657895028914, + "grad_norm": 1.5465408563613892, + "learning_rate": 1.62710488801586e-05, + "loss": 0.4393, + "step": 12805 + }, + { + "epoch": 0.28434756550981677, + "grad_norm": 1.4768105745315552, + "learning_rate": 1.626833252129403e-05, + "loss": 0.4672, + "step": 12810 + }, + { + "epoch": 0.2844585520693444, + "grad_norm": 0.8354441523551941, + "learning_rate": 1.626561540034661e-05, + "loss": 0.5801, + "step": 12815 + }, + { + "epoch": 0.284569538628872, + "grad_norm": 1.3228951692581177, + "learning_rate": 1.6262897517646684e-05, + "loss": 0.52, + "step": 12820 + }, + { + "epoch": 0.2846805251883997, + "grad_norm": 1.2378075122833252, + "learning_rate": 1.6260178873524682e-05, + "loss": 0.5024, + "step": 12825 + }, + { + "epoch": 0.28479151174792733, + "grad_norm": 0.9964748024940491, + "learning_rate": 1.625745946831113e-05, + "loss": 0.4219, + "step": 12830 + }, + { + "epoch": 0.28490249830745495, + "grad_norm": 1.212092638015747, + "learning_rate": 1.625473930233664e-05, + "loss": 0.6788, + "step": 12835 + }, + { + "epoch": 0.28501348486698264, + "grad_norm": 1.2740265130996704, + "learning_rate": 1.6252018375931923e-05, + "loss": 0.5355, + "step": 12840 + }, + { + "epoch": 0.28512447142651026, + "grad_norm": 1.424972653388977, + "learning_rate": 1.624929668942778e-05, + "loss": 0.4398, + "step": 12845 + }, + { + "epoch": 0.2852354579860379, + "grad_norm": 1.238060474395752, + "learning_rate": 1.62465742431551e-05, + "loss": 0.6256, + "step": 12850 + }, + { + "epoch": 0.2853464445455655, + "grad_norm": 1.0929948091506958, + "learning_rate": 1.624385103744488e-05, + "loss": 0.5098, + "step": 12855 + }, + { + "epoch": 0.2854574311050932, + "grad_norm": 1.0573160648345947, + "learning_rate": 1.6241127072628186e-05, + "loss": 0.4568, + "step": 12860 + }, + { + "epoch": 0.2855684176646208, + "grad_norm": 0.8821727633476257, + "learning_rate": 1.623840234903619e-05, + "loss": 0.4309, + "step": 12865 + }, + { + "epoch": 0.28567940422414845, + "grad_norm": 1.5702420473098755, + "learning_rate": 1.623567686700017e-05, + "loss": 0.7043, + "step": 12870 + }, + { + "epoch": 0.2857903907836761, + "grad_norm": 1.2420892715454102, + "learning_rate": 1.6232950626851458e-05, + "loss": 0.3862, + "step": 12875 + }, + { + "epoch": 0.28590137734320376, + "grad_norm": 1.5035573244094849, + "learning_rate": 1.6230223628921518e-05, + "loss": 0.4591, + "step": 12880 + }, + { + "epoch": 0.2860123639027314, + "grad_norm": 1.1416951417922974, + "learning_rate": 1.6227495873541883e-05, + "loss": 0.4764, + "step": 12885 + }, + { + "epoch": 0.286123350462259, + "grad_norm": 3.584693193435669, + "learning_rate": 1.6224767361044186e-05, + "loss": 0.416, + "step": 12890 + }, + { + "epoch": 0.2862343370217867, + "grad_norm": 1.1235774755477905, + "learning_rate": 1.6222038091760145e-05, + "loss": 0.474, + "step": 12895 + }, + { + "epoch": 0.2863453235813143, + "grad_norm": 0.9305126667022705, + "learning_rate": 1.6219308066021584e-05, + "loss": 0.4715, + "step": 12900 + }, + { + "epoch": 0.28645631014084194, + "grad_norm": 1.4266836643218994, + "learning_rate": 1.6216577284160408e-05, + "loss": 0.4006, + "step": 12905 + }, + { + "epoch": 0.28656729670036957, + "grad_norm": 1.0521955490112305, + "learning_rate": 1.6213845746508612e-05, + "loss": 0.6069, + "step": 12910 + }, + { + "epoch": 0.28667828325989725, + "grad_norm": 1.033524990081787, + "learning_rate": 1.621111345339829e-05, + "loss": 0.4645, + "step": 12915 + }, + { + "epoch": 0.2867892698194249, + "grad_norm": 1.3503462076187134, + "learning_rate": 1.6208380405161623e-05, + "loss": 0.5717, + "step": 12920 + }, + { + "epoch": 0.2869002563789525, + "grad_norm": 1.0602374076843262, + "learning_rate": 1.6205646602130893e-05, + "loss": 0.3502, + "step": 12925 + }, + { + "epoch": 0.2870112429384801, + "grad_norm": 1.0799397230148315, + "learning_rate": 1.6202912044638453e-05, + "loss": 0.4389, + "step": 12930 + }, + { + "epoch": 0.2871222294980078, + "grad_norm": 1.3221521377563477, + "learning_rate": 1.6200176733016775e-05, + "loss": 0.5155, + "step": 12935 + }, + { + "epoch": 0.28723321605753543, + "grad_norm": 1.3143270015716553, + "learning_rate": 1.6197440667598404e-05, + "loss": 0.6275, + "step": 12940 + }, + { + "epoch": 0.28734420261706306, + "grad_norm": 0.879645049571991, + "learning_rate": 1.619470384871598e-05, + "loss": 0.539, + "step": 12945 + }, + { + "epoch": 0.28745518917659074, + "grad_norm": 1.2223107814788818, + "learning_rate": 1.6191966276702235e-05, + "loss": 0.648, + "step": 12950 + }, + { + "epoch": 0.28756617573611837, + "grad_norm": 1.7858378887176514, + "learning_rate": 1.618922795189e-05, + "loss": 0.5519, + "step": 12955 + }, + { + "epoch": 0.287677162295646, + "grad_norm": 1.4419081211090088, + "learning_rate": 1.6186488874612186e-05, + "loss": 0.4212, + "step": 12960 + }, + { + "epoch": 0.2877881488551736, + "grad_norm": 1.1229172945022583, + "learning_rate": 1.6183749045201804e-05, + "loss": 0.3266, + "step": 12965 + }, + { + "epoch": 0.2878991354147013, + "grad_norm": 1.1010665893554688, + "learning_rate": 1.6181008463991948e-05, + "loss": 0.4354, + "step": 12970 + }, + { + "epoch": 0.2880101219742289, + "grad_norm": 1.0741912126541138, + "learning_rate": 1.6178267131315816e-05, + "loss": 0.5179, + "step": 12975 + }, + { + "epoch": 0.28812110853375655, + "grad_norm": 1.205074667930603, + "learning_rate": 1.6175525047506686e-05, + "loss": 0.5594, + "step": 12980 + }, + { + "epoch": 0.2882320950932842, + "grad_norm": 1.9375433921813965, + "learning_rate": 1.617278221289793e-05, + "loss": 0.4357, + "step": 12985 + }, + { + "epoch": 0.28834308165281186, + "grad_norm": 1.1160500049591064, + "learning_rate": 1.6170038627823016e-05, + "loss": 0.4056, + "step": 12990 + }, + { + "epoch": 0.2884540682123395, + "grad_norm": 1.011465311050415, + "learning_rate": 1.6167294292615498e-05, + "loss": 0.4658, + "step": 12995 + }, + { + "epoch": 0.2885650547718671, + "grad_norm": 1.1458672285079956, + "learning_rate": 1.6164549207609024e-05, + "loss": 0.4624, + "step": 13000 + }, + { + "epoch": 0.2886760413313948, + "grad_norm": 1.1138023138046265, + "learning_rate": 1.616180337313733e-05, + "loss": 0.5013, + "step": 13005 + }, + { + "epoch": 0.2887870278909224, + "grad_norm": 1.3170747756958008, + "learning_rate": 1.615905678953425e-05, + "loss": 0.4251, + "step": 13010 + }, + { + "epoch": 0.28889801445045005, + "grad_norm": 1.0966858863830566, + "learning_rate": 1.6156309457133698e-05, + "loss": 0.5265, + "step": 13015 + }, + { + "epoch": 0.2890090010099777, + "grad_norm": 1.7116811275482178, + "learning_rate": 1.615356137626969e-05, + "loss": 0.4759, + "step": 13020 + }, + { + "epoch": 0.28911998756950535, + "grad_norm": 0.9442894458770752, + "learning_rate": 1.615081254727633e-05, + "loss": 0.3931, + "step": 13025 + }, + { + "epoch": 0.289230974129033, + "grad_norm": 1.521214485168457, + "learning_rate": 1.6148062970487804e-05, + "loss": 0.5104, + "step": 13030 + }, + { + "epoch": 0.2893419606885606, + "grad_norm": 1.3783694505691528, + "learning_rate": 1.6145312646238406e-05, + "loss": 0.5016, + "step": 13035 + }, + { + "epoch": 0.28945294724808823, + "grad_norm": 1.7637170553207397, + "learning_rate": 1.6142561574862505e-05, + "loss": 0.4152, + "step": 13040 + }, + { + "epoch": 0.2895639338076159, + "grad_norm": 1.185720443725586, + "learning_rate": 1.6139809756694565e-05, + "loss": 0.4615, + "step": 13045 + }, + { + "epoch": 0.28967492036714354, + "grad_norm": 1.0681581497192383, + "learning_rate": 1.6137057192069146e-05, + "loss": 0.5975, + "step": 13050 + }, + { + "epoch": 0.28978590692667117, + "grad_norm": 1.145227313041687, + "learning_rate": 1.61343038813209e-05, + "loss": 0.7614, + "step": 13055 + }, + { + "epoch": 0.28989689348619885, + "grad_norm": 2.2884984016418457, + "learning_rate": 1.6131549824784557e-05, + "loss": 0.503, + "step": 13060 + }, + { + "epoch": 0.2900078800457265, + "grad_norm": 1.2518259286880493, + "learning_rate": 1.6128795022794954e-05, + "loss": 0.4066, + "step": 13065 + }, + { + "epoch": 0.2901188666052541, + "grad_norm": 1.4038550853729248, + "learning_rate": 1.6126039475687006e-05, + "loss": 0.4795, + "step": 13070 + }, + { + "epoch": 0.2902298531647817, + "grad_norm": 1.1985700130462646, + "learning_rate": 1.612328318379572e-05, + "loss": 0.4906, + "step": 13075 + }, + { + "epoch": 0.2903408397243094, + "grad_norm": 1.0185683965682983, + "learning_rate": 1.6120526147456202e-05, + "loss": 0.3498, + "step": 13080 + }, + { + "epoch": 0.29045182628383703, + "grad_norm": 1.2648568153381348, + "learning_rate": 1.611776836700364e-05, + "loss": 0.5293, + "step": 13085 + }, + { + "epoch": 0.29056281284336466, + "grad_norm": 1.1476002931594849, + "learning_rate": 1.6115009842773322e-05, + "loss": 0.6391, + "step": 13090 + }, + { + "epoch": 0.2906737994028923, + "grad_norm": 1.592239260673523, + "learning_rate": 1.611225057510061e-05, + "loss": 0.5028, + "step": 13095 + }, + { + "epoch": 0.29078478596241997, + "grad_norm": 1.238527536392212, + "learning_rate": 1.6109490564320974e-05, + "loss": 0.3906, + "step": 13100 + }, + { + "epoch": 0.2908957725219476, + "grad_norm": 1.1238560676574707, + "learning_rate": 1.6106729810769968e-05, + "loss": 0.4653, + "step": 13105 + }, + { + "epoch": 0.2910067590814752, + "grad_norm": 1.2527645826339722, + "learning_rate": 1.610396831478323e-05, + "loss": 0.4405, + "step": 13110 + }, + { + "epoch": 0.2911177456410029, + "grad_norm": 1.5676945447921753, + "learning_rate": 1.6101206076696496e-05, + "loss": 0.4006, + "step": 13115 + }, + { + "epoch": 0.2912287322005305, + "grad_norm": 1.342868447303772, + "learning_rate": 1.609844309684559e-05, + "loss": 0.5548, + "step": 13120 + }, + { + "epoch": 0.29133971876005815, + "grad_norm": 1.2602522373199463, + "learning_rate": 1.609567937556642e-05, + "loss": 0.4057, + "step": 13125 + }, + { + "epoch": 0.2914507053195858, + "grad_norm": 1.3834385871887207, + "learning_rate": 1.6092914913194997e-05, + "loss": 0.5533, + "step": 13130 + }, + { + "epoch": 0.29156169187911346, + "grad_norm": 1.2576740980148315, + "learning_rate": 1.6090149710067412e-05, + "loss": 0.5351, + "step": 13135 + }, + { + "epoch": 0.2916726784386411, + "grad_norm": 1.4116957187652588, + "learning_rate": 1.608738376651985e-05, + "loss": 0.3774, + "step": 13140 + }, + { + "epoch": 0.2917836649981687, + "grad_norm": 1.215957760810852, + "learning_rate": 1.608461708288859e-05, + "loss": 0.3787, + "step": 13145 + }, + { + "epoch": 0.29189465155769634, + "grad_norm": 1.041744351387024, + "learning_rate": 1.608184965950999e-05, + "loss": 0.4878, + "step": 13150 + }, + { + "epoch": 0.292005638117224, + "grad_norm": 0.9204108119010925, + "learning_rate": 1.60790814967205e-05, + "loss": 0.5201, + "step": 13155 + }, + { + "epoch": 0.29211662467675165, + "grad_norm": 0.9996805787086487, + "learning_rate": 1.6076312594856673e-05, + "loss": 0.3782, + "step": 13160 + }, + { + "epoch": 0.29222761123627927, + "grad_norm": 0.9885636568069458, + "learning_rate": 1.6073542954255137e-05, + "loss": 0.5069, + "step": 13165 + }, + { + "epoch": 0.29233859779580695, + "grad_norm": 0.8402591347694397, + "learning_rate": 1.6070772575252623e-05, + "loss": 0.3521, + "step": 13170 + }, + { + "epoch": 0.2924495843553346, + "grad_norm": 2.6965503692626953, + "learning_rate": 1.6068001458185934e-05, + "loss": 0.2871, + "step": 13175 + }, + { + "epoch": 0.2925605709148622, + "grad_norm": 1.0516703128814697, + "learning_rate": 1.606522960339198e-05, + "loss": 0.5634, + "step": 13180 + }, + { + "epoch": 0.29267155747438983, + "grad_norm": 1.8383911848068237, + "learning_rate": 1.6062457011207753e-05, + "loss": 0.5725, + "step": 13185 + }, + { + "epoch": 0.2927825440339175, + "grad_norm": 1.5198522806167603, + "learning_rate": 1.6059683681970334e-05, + "loss": 0.3936, + "step": 13190 + }, + { + "epoch": 0.29289353059344514, + "grad_norm": 1.1313891410827637, + "learning_rate": 1.6056909616016895e-05, + "loss": 0.5721, + "step": 13195 + }, + { + "epoch": 0.29300451715297277, + "grad_norm": 1.2808656692504883, + "learning_rate": 1.6054134813684697e-05, + "loss": 0.5572, + "step": 13200 + }, + { + "epoch": 0.2931155037125004, + "grad_norm": 1.470443844795227, + "learning_rate": 1.6051359275311093e-05, + "loss": 0.589, + "step": 13205 + }, + { + "epoch": 0.2932264902720281, + "grad_norm": 1.6670235395431519, + "learning_rate": 1.6048583001233525e-05, + "loss": 0.5116, + "step": 13210 + }, + { + "epoch": 0.2933374768315557, + "grad_norm": 1.2965604066848755, + "learning_rate": 1.6045805991789518e-05, + "loss": 0.3875, + "step": 13215 + }, + { + "epoch": 0.2934484633910833, + "grad_norm": 1.1231025457382202, + "learning_rate": 1.6043028247316696e-05, + "loss": 0.3527, + "step": 13220 + }, + { + "epoch": 0.293559449950611, + "grad_norm": 1.045440912246704, + "learning_rate": 1.6040249768152767e-05, + "loss": 0.4948, + "step": 13225 + }, + { + "epoch": 0.29367043651013863, + "grad_norm": 1.2011396884918213, + "learning_rate": 1.603747055463553e-05, + "loss": 0.4802, + "step": 13230 + }, + { + "epoch": 0.29378142306966626, + "grad_norm": 1.2609519958496094, + "learning_rate": 1.603469060710287e-05, + "loss": 0.5255, + "step": 13235 + }, + { + "epoch": 0.2938924096291939, + "grad_norm": 1.356063961982727, + "learning_rate": 1.603190992589276e-05, + "loss": 0.4837, + "step": 13240 + }, + { + "epoch": 0.29400339618872157, + "grad_norm": 1.5714696645736694, + "learning_rate": 1.6029128511343276e-05, + "loss": 0.5208, + "step": 13245 + }, + { + "epoch": 0.2941143827482492, + "grad_norm": 1.3724098205566406, + "learning_rate": 1.6026346363792565e-05, + "loss": 0.6037, + "step": 13250 + }, + { + "epoch": 0.2942253693077768, + "grad_norm": 1.2276989221572876, + "learning_rate": 1.6023563483578874e-05, + "loss": 0.5841, + "step": 13255 + }, + { + "epoch": 0.29433635586730444, + "grad_norm": 1.3937835693359375, + "learning_rate": 1.6020779871040538e-05, + "loss": 0.4431, + "step": 13260 + }, + { + "epoch": 0.2944473424268321, + "grad_norm": 0.9244763851165771, + "learning_rate": 1.6017995526515976e-05, + "loss": 0.4486, + "step": 13265 + }, + { + "epoch": 0.29455832898635975, + "grad_norm": 1.135107159614563, + "learning_rate": 1.60152104503437e-05, + "loss": 0.5151, + "step": 13270 + }, + { + "epoch": 0.2946693155458874, + "grad_norm": 1.617790937423706, + "learning_rate": 1.6012424642862315e-05, + "loss": 0.3665, + "step": 13275 + }, + { + "epoch": 0.29478030210541506, + "grad_norm": 1.2885212898254395, + "learning_rate": 1.6009638104410503e-05, + "loss": 0.5053, + "step": 13280 + }, + { + "epoch": 0.2948912886649427, + "grad_norm": 0.9920433759689331, + "learning_rate": 1.6006850835327044e-05, + "loss": 0.4176, + "step": 13285 + }, + { + "epoch": 0.2950022752244703, + "grad_norm": 1.2480064630508423, + "learning_rate": 1.600406283595081e-05, + "loss": 0.5412, + "step": 13290 + }, + { + "epoch": 0.29511326178399794, + "grad_norm": 1.3954321146011353, + "learning_rate": 1.600127410662075e-05, + "loss": 0.461, + "step": 13295 + }, + { + "epoch": 0.2952242483435256, + "grad_norm": 1.1513477563858032, + "learning_rate": 1.599848464767591e-05, + "loss": 0.5022, + "step": 13300 + }, + { + "epoch": 0.29533523490305325, + "grad_norm": 1.1079208850860596, + "learning_rate": 1.599569445945542e-05, + "loss": 0.7639, + "step": 13305 + }, + { + "epoch": 0.29544622146258087, + "grad_norm": 1.159168004989624, + "learning_rate": 1.599290354229851e-05, + "loss": 0.5514, + "step": 13310 + }, + { + "epoch": 0.2955572080221085, + "grad_norm": 0.9496512413024902, + "learning_rate": 1.5990111896544488e-05, + "loss": 0.4599, + "step": 13315 + }, + { + "epoch": 0.2956681945816362, + "grad_norm": 1.3379199504852295, + "learning_rate": 1.598731952253275e-05, + "loss": 0.44, + "step": 13320 + }, + { + "epoch": 0.2957791811411638, + "grad_norm": 0.7901808619499207, + "learning_rate": 1.5984526420602782e-05, + "loss": 0.322, + "step": 13325 + }, + { + "epoch": 0.29589016770069143, + "grad_norm": 1.442884922027588, + "learning_rate": 1.5981732591094164e-05, + "loss": 0.4746, + "step": 13330 + }, + { + "epoch": 0.2960011542602191, + "grad_norm": 1.3721030950546265, + "learning_rate": 1.5978938034346557e-05, + "loss": 0.4736, + "step": 13335 + }, + { + "epoch": 0.29611214081974674, + "grad_norm": 1.2049025297164917, + "learning_rate": 1.597614275069972e-05, + "loss": 0.6011, + "step": 13340 + }, + { + "epoch": 0.29622312737927436, + "grad_norm": 1.2768781185150146, + "learning_rate": 1.5973346740493486e-05, + "loss": 0.4977, + "step": 13345 + }, + { + "epoch": 0.296334113938802, + "grad_norm": 1.061110258102417, + "learning_rate": 1.597055000406779e-05, + "loss": 0.4888, + "step": 13350 + }, + { + "epoch": 0.2964451004983297, + "grad_norm": 1.1296803951263428, + "learning_rate": 1.5967752541762648e-05, + "loss": 0.377, + "step": 13355 + }, + { + "epoch": 0.2965560870578573, + "grad_norm": 1.1014056205749512, + "learning_rate": 1.5964954353918163e-05, + "loss": 0.5997, + "step": 13360 + }, + { + "epoch": 0.2966670736173849, + "grad_norm": 0.9890385270118713, + "learning_rate": 1.5962155440874535e-05, + "loss": 0.5013, + "step": 13365 + }, + { + "epoch": 0.29677806017691255, + "grad_norm": 1.223296880722046, + "learning_rate": 1.5959355802972044e-05, + "loss": 0.4773, + "step": 13370 + }, + { + "epoch": 0.29688904673644023, + "grad_norm": 0.9817351698875427, + "learning_rate": 1.595655544055106e-05, + "loss": 0.4686, + "step": 13375 + }, + { + "epoch": 0.29700003329596786, + "grad_norm": 1.1673156023025513, + "learning_rate": 1.5953754353952043e-05, + "loss": 0.3406, + "step": 13380 + }, + { + "epoch": 0.2971110198554955, + "grad_norm": 1.035629391670227, + "learning_rate": 1.595095254351554e-05, + "loss": 0.3881, + "step": 13385 + }, + { + "epoch": 0.29722200641502317, + "grad_norm": 1.143067479133606, + "learning_rate": 1.5948150009582183e-05, + "loss": 0.5803, + "step": 13390 + }, + { + "epoch": 0.2973329929745508, + "grad_norm": 1.0391279458999634, + "learning_rate": 1.5945346752492697e-05, + "loss": 0.4273, + "step": 13395 + }, + { + "epoch": 0.2974439795340784, + "grad_norm": 1.1800872087478638, + "learning_rate": 1.5942542772587893e-05, + "loss": 0.4803, + "step": 13400 + }, + { + "epoch": 0.29755496609360604, + "grad_norm": 1.7651307582855225, + "learning_rate": 1.5939738070208667e-05, + "loss": 0.4946, + "step": 13405 + }, + { + "epoch": 0.2976659526531337, + "grad_norm": 1.0853071212768555, + "learning_rate": 1.5936932645696005e-05, + "loss": 0.3652, + "step": 13410 + }, + { + "epoch": 0.29777693921266135, + "grad_norm": 1.0994185209274292, + "learning_rate": 1.5934126499390986e-05, + "loss": 0.4054, + "step": 13415 + }, + { + "epoch": 0.297887925772189, + "grad_norm": 0.9863249659538269, + "learning_rate": 1.593131963163477e-05, + "loss": 0.4278, + "step": 13420 + }, + { + "epoch": 0.2979989123317166, + "grad_norm": 0.9838482141494751, + "learning_rate": 1.59285120427686e-05, + "loss": 0.4615, + "step": 13425 + }, + { + "epoch": 0.2981098988912443, + "grad_norm": 1.1166720390319824, + "learning_rate": 1.5925703733133823e-05, + "loss": 0.5157, + "step": 13430 + }, + { + "epoch": 0.2982208854507719, + "grad_norm": 1.2304351329803467, + "learning_rate": 1.5922894703071858e-05, + "loss": 0.5007, + "step": 13435 + }, + { + "epoch": 0.29833187201029954, + "grad_norm": 1.102946162223816, + "learning_rate": 1.592008495292422e-05, + "loss": 0.4565, + "step": 13440 + }, + { + "epoch": 0.2984428585698272, + "grad_norm": 1.1732909679412842, + "learning_rate": 1.5917274483032505e-05, + "loss": 0.3954, + "step": 13445 + }, + { + "epoch": 0.29855384512935484, + "grad_norm": 1.0969271659851074, + "learning_rate": 1.5914463293738402e-05, + "loss": 0.4611, + "step": 13450 + }, + { + "epoch": 0.29866483168888247, + "grad_norm": 0.8274635672569275, + "learning_rate": 1.5911651385383692e-05, + "loss": 0.3072, + "step": 13455 + }, + { + "epoch": 0.2987758182484101, + "grad_norm": 1.5720192193984985, + "learning_rate": 1.5908838758310234e-05, + "loss": 0.5685, + "step": 13460 + }, + { + "epoch": 0.2988868048079378, + "grad_norm": 0.8137319684028625, + "learning_rate": 1.590602541285997e-05, + "loss": 0.4301, + "step": 13465 + }, + { + "epoch": 0.2989977913674654, + "grad_norm": 1.040312647819519, + "learning_rate": 1.590321134937495e-05, + "loss": 0.5921, + "step": 13470 + }, + { + "epoch": 0.29910877792699303, + "grad_norm": 1.202492117881775, + "learning_rate": 1.5900396568197287e-05, + "loss": 0.4665, + "step": 13475 + }, + { + "epoch": 0.29921976448652066, + "grad_norm": 1.6085517406463623, + "learning_rate": 1.58975810696692e-05, + "loss": 0.4096, + "step": 13480 + }, + { + "epoch": 0.29933075104604834, + "grad_norm": 0.8945106267929077, + "learning_rate": 1.5894764854132985e-05, + "loss": 0.4441, + "step": 13485 + }, + { + "epoch": 0.29944173760557596, + "grad_norm": 1.0774484872817993, + "learning_rate": 1.5891947921931027e-05, + "loss": 0.4949, + "step": 13490 + }, + { + "epoch": 0.2995527241651036, + "grad_norm": 0.8812748789787292, + "learning_rate": 1.5889130273405805e-05, + "loss": 0.5329, + "step": 13495 + }, + { + "epoch": 0.29966371072463127, + "grad_norm": 1.6877503395080566, + "learning_rate": 1.588631190889987e-05, + "loss": 0.3313, + "step": 13500 + }, + { + "epoch": 0.2997746972841589, + "grad_norm": 1.2463515996932983, + "learning_rate": 1.5883492828755876e-05, + "loss": 0.3487, + "step": 13505 + }, + { + "epoch": 0.2998856838436865, + "grad_norm": 1.5215736627578735, + "learning_rate": 1.5880673033316555e-05, + "loss": 0.4009, + "step": 13510 + }, + { + "epoch": 0.29999667040321415, + "grad_norm": 1.2792853116989136, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.4295, + "step": 13515 + }, + { + "epoch": 0.30010765696274183, + "grad_norm": 1.0436149835586548, + "learning_rate": 1.587503129792331e-05, + "loss": 0.4025, + "step": 13520 + }, + { + "epoch": 0.30021864352226946, + "grad_norm": 0.9356552958488464, + "learning_rate": 1.5872209358655286e-05, + "loss": 0.2919, + "step": 13525 + }, + { + "epoch": 0.3003296300817971, + "grad_norm": 1.2793176174163818, + "learning_rate": 1.5869386705463742e-05, + "loss": 0.3657, + "step": 13530 + }, + { + "epoch": 0.3004406166413247, + "grad_norm": 1.219204306602478, + "learning_rate": 1.586656333869185e-05, + "loss": 0.5315, + "step": 13535 + }, + { + "epoch": 0.3005516032008524, + "grad_norm": 1.4818681478500366, + "learning_rate": 1.5863739258682858e-05, + "loss": 0.4937, + "step": 13540 + }, + { + "epoch": 0.30066258976038, + "grad_norm": 1.1443345546722412, + "learning_rate": 1.5860914465780112e-05, + "loss": 0.4912, + "step": 13545 + }, + { + "epoch": 0.30077357631990764, + "grad_norm": 1.6327272653579712, + "learning_rate": 1.5858088960327043e-05, + "loss": 0.3595, + "step": 13550 + }, + { + "epoch": 0.3008845628794353, + "grad_norm": 1.1001689434051514, + "learning_rate": 1.5855262742667165e-05, + "loss": 0.526, + "step": 13555 + }, + { + "epoch": 0.30099554943896295, + "grad_norm": 0.9390714168548584, + "learning_rate": 1.585243581314408e-05, + "loss": 0.5033, + "step": 13560 + }, + { + "epoch": 0.3011065359984906, + "grad_norm": 1.3417963981628418, + "learning_rate": 1.5849608172101472e-05, + "loss": 0.4682, + "step": 13565 + }, + { + "epoch": 0.3012175225580182, + "grad_norm": 0.9747509360313416, + "learning_rate": 1.5846779819883127e-05, + "loss": 0.6363, + "step": 13570 + }, + { + "epoch": 0.3013285091175459, + "grad_norm": 1.1777548789978027, + "learning_rate": 1.58439507568329e-05, + "loss": 0.4182, + "step": 13575 + }, + { + "epoch": 0.3014394956770735, + "grad_norm": 1.6634676456451416, + "learning_rate": 1.5841120983294732e-05, + "loss": 0.4847, + "step": 13580 + }, + { + "epoch": 0.30155048223660114, + "grad_norm": 1.0142072439193726, + "learning_rate": 1.583829049961267e-05, + "loss": 0.4785, + "step": 13585 + }, + { + "epoch": 0.30166146879612876, + "grad_norm": 1.115145206451416, + "learning_rate": 1.5835459306130828e-05, + "loss": 0.5335, + "step": 13590 + }, + { + "epoch": 0.30177245535565644, + "grad_norm": 1.2043323516845703, + "learning_rate": 1.5832627403193414e-05, + "loss": 0.5331, + "step": 13595 + }, + { + "epoch": 0.30188344191518407, + "grad_norm": 0.8347172737121582, + "learning_rate": 1.5829794791144723e-05, + "loss": 0.4027, + "step": 13600 + }, + { + "epoch": 0.3019944284747117, + "grad_norm": 0.7217290997505188, + "learning_rate": 1.582696147032913e-05, + "loss": 0.4824, + "step": 13605 + }, + { + "epoch": 0.3021054150342394, + "grad_norm": 0.854871928691864, + "learning_rate": 1.5824127441091107e-05, + "loss": 0.3582, + "step": 13610 + }, + { + "epoch": 0.302216401593767, + "grad_norm": 1.1389976739883423, + "learning_rate": 1.58212927037752e-05, + "loss": 0.3694, + "step": 13615 + }, + { + "epoch": 0.30232738815329463, + "grad_norm": 1.5510504245758057, + "learning_rate": 1.5818457258726048e-05, + "loss": 0.5034, + "step": 13620 + }, + { + "epoch": 0.30243837471282226, + "grad_norm": 1.252467155456543, + "learning_rate": 1.5815621106288377e-05, + "loss": 0.4467, + "step": 13625 + }, + { + "epoch": 0.30254936127234994, + "grad_norm": 1.3585219383239746, + "learning_rate": 1.5812784246806998e-05, + "loss": 0.3812, + "step": 13630 + }, + { + "epoch": 0.30266034783187756, + "grad_norm": 1.4387974739074707, + "learning_rate": 1.5809946680626804e-05, + "loss": 0.5139, + "step": 13635 + }, + { + "epoch": 0.3027713343914052, + "grad_norm": 1.2116645574569702, + "learning_rate": 1.5807108408092778e-05, + "loss": 0.4336, + "step": 13640 + }, + { + "epoch": 0.3028823209509328, + "grad_norm": 0.818150520324707, + "learning_rate": 1.5804269429549983e-05, + "loss": 0.444, + "step": 13645 + }, + { + "epoch": 0.3029933075104605, + "grad_norm": 0.7917804718017578, + "learning_rate": 1.5801429745343583e-05, + "loss": 0.3267, + "step": 13650 + }, + { + "epoch": 0.3031042940699881, + "grad_norm": 2.8502650260925293, + "learning_rate": 1.5798589355818807e-05, + "loss": 0.4336, + "step": 13655 + }, + { + "epoch": 0.30321528062951575, + "grad_norm": 0.9612025618553162, + "learning_rate": 1.5795748261320984e-05, + "loss": 0.5098, + "step": 13660 + }, + { + "epoch": 0.30332626718904343, + "grad_norm": 0.6825042366981506, + "learning_rate": 1.5792906462195524e-05, + "loss": 0.4194, + "step": 13665 + }, + { + "epoch": 0.30343725374857106, + "grad_norm": 1.3684370517730713, + "learning_rate": 1.579006395878793e-05, + "loss": 0.63, + "step": 13670 + }, + { + "epoch": 0.3035482403080987, + "grad_norm": 1.1626808643341064, + "learning_rate": 1.5787220751443773e-05, + "loss": 0.4685, + "step": 13675 + }, + { + "epoch": 0.3036592268676263, + "grad_norm": 1.3281182050704956, + "learning_rate": 1.5784376840508725e-05, + "loss": 0.5736, + "step": 13680 + }, + { + "epoch": 0.303770213427154, + "grad_norm": 1.0269482135772705, + "learning_rate": 1.5781532226328544e-05, + "loss": 0.3554, + "step": 13685 + }, + { + "epoch": 0.3038811999866816, + "grad_norm": 1.3451710939407349, + "learning_rate": 1.5778686909249062e-05, + "loss": 0.4192, + "step": 13690 + }, + { + "epoch": 0.30399218654620924, + "grad_norm": 1.0704500675201416, + "learning_rate": 1.577584088961621e-05, + "loss": 0.3632, + "step": 13695 + }, + { + "epoch": 0.30410317310573687, + "grad_norm": 1.3877739906311035, + "learning_rate": 1.5772994167775986e-05, + "loss": 0.5938, + "step": 13700 + }, + { + "epoch": 0.30421415966526455, + "grad_norm": 1.3011753559112549, + "learning_rate": 1.57701467440745e-05, + "loss": 0.5205, + "step": 13705 + }, + { + "epoch": 0.3043251462247922, + "grad_norm": 2.0894100666046143, + "learning_rate": 1.576729861885792e-05, + "loss": 0.4538, + "step": 13710 + }, + { + "epoch": 0.3044361327843198, + "grad_norm": 1.0299859046936035, + "learning_rate": 1.5764449792472518e-05, + "loss": 0.4944, + "step": 13715 + }, + { + "epoch": 0.3045471193438475, + "grad_norm": 1.1739716529846191, + "learning_rate": 1.576160026526464e-05, + "loss": 0.4768, + "step": 13720 + }, + { + "epoch": 0.3046581059033751, + "grad_norm": 0.999975860118866, + "learning_rate": 1.5758750037580726e-05, + "loss": 0.4753, + "step": 13725 + }, + { + "epoch": 0.30476909246290274, + "grad_norm": 1.3396409749984741, + "learning_rate": 1.5755899109767298e-05, + "loss": 0.6012, + "step": 13730 + }, + { + "epoch": 0.30488007902243036, + "grad_norm": 0.9428462982177734, + "learning_rate": 1.5753047482170956e-05, + "loss": 0.4727, + "step": 13735 + }, + { + "epoch": 0.30499106558195804, + "grad_norm": 1.2645313739776611, + "learning_rate": 1.5750195155138394e-05, + "loss": 0.4488, + "step": 13740 + }, + { + "epoch": 0.30510205214148567, + "grad_norm": 0.9989715814590454, + "learning_rate": 1.5747342129016395e-05, + "loss": 0.3837, + "step": 13745 + }, + { + "epoch": 0.3052130387010133, + "grad_norm": 1.5138322114944458, + "learning_rate": 1.574448840415181e-05, + "loss": 0.5546, + "step": 13750 + }, + { + "epoch": 0.3053240252605409, + "grad_norm": 1.6559644937515259, + "learning_rate": 1.5741633980891596e-05, + "loss": 0.5383, + "step": 13755 + }, + { + "epoch": 0.3054350118200686, + "grad_norm": 0.8841571807861328, + "learning_rate": 1.5738778859582776e-05, + "loss": 0.3331, + "step": 13760 + }, + { + "epoch": 0.30554599837959623, + "grad_norm": 1.1933643817901611, + "learning_rate": 1.5735923040572467e-05, + "loss": 0.5135, + "step": 13765 + }, + { + "epoch": 0.30565698493912385, + "grad_norm": 1.1483535766601562, + "learning_rate": 1.5733066524207875e-05, + "loss": 0.3668, + "step": 13770 + }, + { + "epoch": 0.30576797149865154, + "grad_norm": 1.5365492105484009, + "learning_rate": 1.573020931083628e-05, + "loss": 0.3984, + "step": 13775 + }, + { + "epoch": 0.30587895805817916, + "grad_norm": 1.2241919040679932, + "learning_rate": 1.5727351400805054e-05, + "loss": 0.4336, + "step": 13780 + }, + { + "epoch": 0.3059899446177068, + "grad_norm": 1.256589651107788, + "learning_rate": 1.572449279446165e-05, + "loss": 0.5923, + "step": 13785 + }, + { + "epoch": 0.3061009311772344, + "grad_norm": 0.9442324638366699, + "learning_rate": 1.572163349215362e-05, + "loss": 0.3944, + "step": 13790 + }, + { + "epoch": 0.3062119177367621, + "grad_norm": 1.2820990085601807, + "learning_rate": 1.5718773494228572e-05, + "loss": 0.6538, + "step": 13795 + }, + { + "epoch": 0.3063229042962897, + "grad_norm": 1.3526954650878906, + "learning_rate": 1.5715912801034223e-05, + "loss": 0.5573, + "step": 13800 + }, + { + "epoch": 0.30643389085581735, + "grad_norm": 2.695969343185425, + "learning_rate": 1.5713051412918363e-05, + "loss": 0.4623, + "step": 13805 + }, + { + "epoch": 0.306544877415345, + "grad_norm": 1.3611737489700317, + "learning_rate": 1.5710189330228873e-05, + "loss": 0.369, + "step": 13810 + }, + { + "epoch": 0.30665586397487266, + "grad_norm": 1.6916635036468506, + "learning_rate": 1.5707326553313714e-05, + "loss": 0.5734, + "step": 13815 + }, + { + "epoch": 0.3067668505344003, + "grad_norm": 1.3984599113464355, + "learning_rate": 1.570446308252094e-05, + "loss": 0.4552, + "step": 13820 + }, + { + "epoch": 0.3068778370939279, + "grad_norm": 1.157272219657898, + "learning_rate": 1.5701598918198667e-05, + "loss": 0.6662, + "step": 13825 + }, + { + "epoch": 0.3069888236534556, + "grad_norm": 1.7061251401901245, + "learning_rate": 1.5698734060695127e-05, + "loss": 0.5608, + "step": 13830 + }, + { + "epoch": 0.3070998102129832, + "grad_norm": 1.0720163583755493, + "learning_rate": 1.5695868510358607e-05, + "loss": 0.6503, + "step": 13835 + }, + { + "epoch": 0.30721079677251084, + "grad_norm": 0.9545885324478149, + "learning_rate": 1.5693002267537497e-05, + "loss": 0.509, + "step": 13840 + }, + { + "epoch": 0.30732178333203847, + "grad_norm": 1.1842597723007202, + "learning_rate": 1.5690135332580266e-05, + "loss": 0.2632, + "step": 13845 + }, + { + "epoch": 0.30743276989156615, + "grad_norm": 1.174399733543396, + "learning_rate": 1.5687267705835463e-05, + "loss": 0.342, + "step": 13850 + }, + { + "epoch": 0.3075437564510938, + "grad_norm": 1.443686842918396, + "learning_rate": 1.5684399387651725e-05, + "loss": 0.5161, + "step": 13855 + }, + { + "epoch": 0.3076547430106214, + "grad_norm": 0.8438587784767151, + "learning_rate": 1.5681530378377777e-05, + "loss": 0.3586, + "step": 13860 + }, + { + "epoch": 0.3077657295701491, + "grad_norm": 0.9586126208305359, + "learning_rate": 1.5678660678362416e-05, + "loss": 0.4851, + "step": 13865 + }, + { + "epoch": 0.3078767161296767, + "grad_norm": 1.103760838508606, + "learning_rate": 1.5675790287954535e-05, + "loss": 0.5001, + "step": 13870 + }, + { + "epoch": 0.30798770268920433, + "grad_norm": 1.0907628536224365, + "learning_rate": 1.5672919207503108e-05, + "loss": 0.4651, + "step": 13875 + }, + { + "epoch": 0.30809868924873196, + "grad_norm": 0.6905645132064819, + "learning_rate": 1.5670047437357188e-05, + "loss": 0.4394, + "step": 13880 + }, + { + "epoch": 0.30820967580825964, + "grad_norm": 1.0371253490447998, + "learning_rate": 1.5667174977865917e-05, + "loss": 0.6306, + "step": 13885 + }, + { + "epoch": 0.30832066236778727, + "grad_norm": 1.486870288848877, + "learning_rate": 1.5664301829378515e-05, + "loss": 0.3986, + "step": 13890 + }, + { + "epoch": 0.3084316489273149, + "grad_norm": 1.2162060737609863, + "learning_rate": 1.5661427992244297e-05, + "loss": 0.6083, + "step": 13895 + }, + { + "epoch": 0.3085426354868425, + "grad_norm": 1.528130292892456, + "learning_rate": 1.5658553466812652e-05, + "loss": 0.5284, + "step": 13900 + }, + { + "epoch": 0.3086536220463702, + "grad_norm": 1.2805726528167725, + "learning_rate": 1.565567825343305e-05, + "loss": 0.3335, + "step": 13905 + }, + { + "epoch": 0.30876460860589783, + "grad_norm": 1.0317713022232056, + "learning_rate": 1.5652802352455057e-05, + "loss": 0.5244, + "step": 13910 + }, + { + "epoch": 0.30887559516542545, + "grad_norm": 1.0753859281539917, + "learning_rate": 1.564992576422831e-05, + "loss": 0.4692, + "step": 13915 + }, + { + "epoch": 0.30898658172495314, + "grad_norm": 1.336464762687683, + "learning_rate": 1.5647048489102535e-05, + "loss": 0.4959, + "step": 13920 + }, + { + "epoch": 0.30909756828448076, + "grad_norm": 1.2710378170013428, + "learning_rate": 1.5644170527427545e-05, + "loss": 0.377, + "step": 13925 + }, + { + "epoch": 0.3092085548440084, + "grad_norm": 1.192382574081421, + "learning_rate": 1.5641291879553233e-05, + "loss": 0.4898, + "step": 13930 + }, + { + "epoch": 0.309319541403536, + "grad_norm": 1.1654630899429321, + "learning_rate": 1.5638412545829575e-05, + "loss": 0.4206, + "step": 13935 + }, + { + "epoch": 0.3094305279630637, + "grad_norm": 1.0210871696472168, + "learning_rate": 1.5635532526606625e-05, + "loss": 0.5103, + "step": 13940 + }, + { + "epoch": 0.3095415145225913, + "grad_norm": 1.240159034729004, + "learning_rate": 1.5632651822234533e-05, + "loss": 0.465, + "step": 13945 + }, + { + "epoch": 0.30965250108211895, + "grad_norm": 1.2842824459075928, + "learning_rate": 1.5629770433063523e-05, + "loss": 0.413, + "step": 13950 + }, + { + "epoch": 0.3097634876416466, + "grad_norm": 1.2681161165237427, + "learning_rate": 1.5626888359443905e-05, + "loss": 0.5637, + "step": 13955 + }, + { + "epoch": 0.30987447420117425, + "grad_norm": 1.4573901891708374, + "learning_rate": 1.5624005601726068e-05, + "loss": 0.5249, + "step": 13960 + }, + { + "epoch": 0.3099854607607019, + "grad_norm": 1.5680787563323975, + "learning_rate": 1.5621122160260496e-05, + "loss": 0.5479, + "step": 13965 + }, + { + "epoch": 0.3100964473202295, + "grad_norm": 1.434630036354065, + "learning_rate": 1.561823803539774e-05, + "loss": 0.5073, + "step": 13970 + }, + { + "epoch": 0.3102074338797572, + "grad_norm": 1.25895094871521, + "learning_rate": 1.561535322748845e-05, + "loss": 0.412, + "step": 13975 + }, + { + "epoch": 0.3103184204392848, + "grad_norm": 1.2706303596496582, + "learning_rate": 1.5612467736883343e-05, + "loss": 0.5886, + "step": 13980 + }, + { + "epoch": 0.31042940699881244, + "grad_norm": 1.2274819612503052, + "learning_rate": 1.560958156393323e-05, + "loss": 0.4487, + "step": 13985 + }, + { + "epoch": 0.31054039355834007, + "grad_norm": 0.989825427532196, + "learning_rate": 1.5606694708989007e-05, + "loss": 0.4792, + "step": 13990 + }, + { + "epoch": 0.31065138011786775, + "grad_norm": 1.1437995433807373, + "learning_rate": 1.5603807172401644e-05, + "loss": 0.4597, + "step": 13995 + }, + { + "epoch": 0.3107623666773954, + "grad_norm": 1.5155508518218994, + "learning_rate": 1.5600918954522198e-05, + "loss": 0.5427, + "step": 14000 + }, + { + "epoch": 0.310873353236923, + "grad_norm": 1.084276556968689, + "learning_rate": 1.559803005570181e-05, + "loss": 0.512, + "step": 14005 + }, + { + "epoch": 0.3109843397964506, + "grad_norm": 0.9007726311683655, + "learning_rate": 1.55951404762917e-05, + "loss": 0.4989, + "step": 14010 + }, + { + "epoch": 0.3110953263559783, + "grad_norm": 1.1757463216781616, + "learning_rate": 1.559225021664318e-05, + "loss": 0.4425, + "step": 14015 + }, + { + "epoch": 0.31120631291550593, + "grad_norm": 2.251450538635254, + "learning_rate": 1.558935927710763e-05, + "loss": 0.5534, + "step": 14020 + }, + { + "epoch": 0.31131729947503356, + "grad_norm": 1.3141568899154663, + "learning_rate": 1.5586467658036526e-05, + "loss": 0.5211, + "step": 14025 + }, + { + "epoch": 0.31142828603456124, + "grad_norm": 1.1067291498184204, + "learning_rate": 1.558357535978142e-05, + "loss": 0.4725, + "step": 14030 + }, + { + "epoch": 0.31153927259408887, + "grad_norm": 1.132810115814209, + "learning_rate": 1.5580682382693947e-05, + "loss": 0.4732, + "step": 14035 + }, + { + "epoch": 0.3116502591536165, + "grad_norm": 0.9449180364608765, + "learning_rate": 1.5577788727125824e-05, + "loss": 0.4662, + "step": 14040 + }, + { + "epoch": 0.3117612457131441, + "grad_norm": 1.2642930746078491, + "learning_rate": 1.5574894393428856e-05, + "loss": 0.5886, + "step": 14045 + }, + { + "epoch": 0.3118722322726718, + "grad_norm": 1.315994381904602, + "learning_rate": 1.5571999381954925e-05, + "loss": 0.4998, + "step": 14050 + }, + { + "epoch": 0.3119832188321994, + "grad_norm": 1.6780071258544922, + "learning_rate": 1.5569103693055996e-05, + "loss": 0.3815, + "step": 14055 + }, + { + "epoch": 0.31209420539172705, + "grad_norm": 0.7860261797904968, + "learning_rate": 1.5566207327084116e-05, + "loss": 0.3847, + "step": 14060 + }, + { + "epoch": 0.3122051919512547, + "grad_norm": 1.3429057598114014, + "learning_rate": 1.556331028439142e-05, + "loss": 0.4024, + "step": 14065 + }, + { + "epoch": 0.31231617851078236, + "grad_norm": 1.730366587638855, + "learning_rate": 1.5560412565330115e-05, + "loss": 0.3026, + "step": 14070 + }, + { + "epoch": 0.31242716507031, + "grad_norm": 1.083951711654663, + "learning_rate": 1.5557514170252497e-05, + "loss": 0.4883, + "step": 14075 + }, + { + "epoch": 0.3125381516298376, + "grad_norm": 1.4283391237258911, + "learning_rate": 1.5554615099510945e-05, + "loss": 0.4442, + "step": 14080 + }, + { + "epoch": 0.3126491381893653, + "grad_norm": 1.7201144695281982, + "learning_rate": 1.5551715353457918e-05, + "loss": 0.5614, + "step": 14085 + }, + { + "epoch": 0.3127601247488929, + "grad_norm": 1.1675817966461182, + "learning_rate": 1.5548814932445958e-05, + "loss": 0.3333, + "step": 14090 + }, + { + "epoch": 0.31287111130842055, + "grad_norm": 1.1178109645843506, + "learning_rate": 1.554591383682769e-05, + "loss": 0.5362, + "step": 14095 + }, + { + "epoch": 0.3129820978679482, + "grad_norm": 0.872790515422821, + "learning_rate": 1.5543012066955816e-05, + "loss": 0.3317, + "step": 14100 + }, + { + "epoch": 0.31309308442747585, + "grad_norm": 1.2365795373916626, + "learning_rate": 1.5540109623183127e-05, + "loss": 0.4653, + "step": 14105 + }, + { + "epoch": 0.3132040709870035, + "grad_norm": 1.199913501739502, + "learning_rate": 1.5537206505862486e-05, + "loss": 0.5698, + "step": 14110 + }, + { + "epoch": 0.3133150575465311, + "grad_norm": 1.396148443222046, + "learning_rate": 1.553430271534685e-05, + "loss": 0.5484, + "step": 14115 + }, + { + "epoch": 0.31342604410605873, + "grad_norm": 0.9446309208869934, + "learning_rate": 1.553139825198925e-05, + "loss": 0.5779, + "step": 14120 + }, + { + "epoch": 0.3135370306655864, + "grad_norm": 1.3726730346679688, + "learning_rate": 1.55284931161428e-05, + "loss": 0.3918, + "step": 14125 + }, + { + "epoch": 0.31364801722511404, + "grad_norm": 1.2780450582504272, + "learning_rate": 1.55255873081607e-05, + "loss": 0.3834, + "step": 14130 + }, + { + "epoch": 0.31375900378464167, + "grad_norm": 1.1577582359313965, + "learning_rate": 1.5522680828396225e-05, + "loss": 0.3983, + "step": 14135 + }, + { + "epoch": 0.31386999034416935, + "grad_norm": 1.1583194732666016, + "learning_rate": 1.551977367720274e-05, + "loss": 0.4249, + "step": 14140 + }, + { + "epoch": 0.313980976903697, + "grad_norm": 0.9807047247886658, + "learning_rate": 1.551686585493368e-05, + "loss": 0.4428, + "step": 14145 + }, + { + "epoch": 0.3140919634632246, + "grad_norm": 1.1752185821533203, + "learning_rate": 1.5513957361942572e-05, + "loss": 0.3854, + "step": 14150 + }, + { + "epoch": 0.3142029500227522, + "grad_norm": 1.083425760269165, + "learning_rate": 1.551104819858302e-05, + "loss": 0.4976, + "step": 14155 + }, + { + "epoch": 0.3143139365822799, + "grad_norm": 1.3348230123519897, + "learning_rate": 1.550813836520871e-05, + "loss": 0.4052, + "step": 14160 + }, + { + "epoch": 0.31442492314180753, + "grad_norm": 1.4912233352661133, + "learning_rate": 1.5505227862173416e-05, + "loss": 0.4185, + "step": 14165 + }, + { + "epoch": 0.31453590970133516, + "grad_norm": 1.264873743057251, + "learning_rate": 1.5502316689830977e-05, + "loss": 0.4244, + "step": 14170 + }, + { + "epoch": 0.3146468962608628, + "grad_norm": 1.6147271394729614, + "learning_rate": 1.5499404848535323e-05, + "loss": 0.6673, + "step": 14175 + }, + { + "epoch": 0.31475788282039047, + "grad_norm": 0.8825428485870361, + "learning_rate": 1.549649233864048e-05, + "loss": 0.5818, + "step": 14180 + }, + { + "epoch": 0.3148688693799181, + "grad_norm": 1.313036561012268, + "learning_rate": 1.549357916050053e-05, + "loss": 0.5293, + "step": 14185 + }, + { + "epoch": 0.3149798559394457, + "grad_norm": 1.228447437286377, + "learning_rate": 1.5490665314469647e-05, + "loss": 0.6636, + "step": 14190 + }, + { + "epoch": 0.3150908424989734, + "grad_norm": 0.9629234671592712, + "learning_rate": 1.5487750800902094e-05, + "loss": 0.4886, + "step": 14195 + }, + { + "epoch": 0.315201829058501, + "grad_norm": 0.9011967182159424, + "learning_rate": 1.5484835620152198e-05, + "loss": 0.4487, + "step": 14200 + }, + { + "epoch": 0.31531281561802865, + "grad_norm": 0.8641685843467712, + "learning_rate": 1.5481919772574384e-05, + "loss": 0.518, + "step": 14205 + }, + { + "epoch": 0.3154238021775563, + "grad_norm": 0.7143692374229431, + "learning_rate": 1.547900325852315e-05, + "loss": 0.3314, + "step": 14210 + }, + { + "epoch": 0.31553478873708396, + "grad_norm": 1.0830315351486206, + "learning_rate": 1.5476086078353073e-05, + "loss": 0.6533, + "step": 14215 + }, + { + "epoch": 0.3156457752966116, + "grad_norm": 1.2461847066879272, + "learning_rate": 1.547316823241882e-05, + "loss": 0.5277, + "step": 14220 + }, + { + "epoch": 0.3157567618561392, + "grad_norm": 1.260133147239685, + "learning_rate": 1.5470249721075123e-05, + "loss": 0.3431, + "step": 14225 + }, + { + "epoch": 0.31586774841566684, + "grad_norm": 1.3078408241271973, + "learning_rate": 1.5467330544676814e-05, + "loss": 0.3828, + "step": 14230 + }, + { + "epoch": 0.3159787349751945, + "grad_norm": 1.5358682870864868, + "learning_rate": 1.5464410703578788e-05, + "loss": 0.5465, + "step": 14235 + }, + { + "epoch": 0.31608972153472215, + "grad_norm": 1.4887757301330566, + "learning_rate": 1.5461490198136043e-05, + "loss": 0.4876, + "step": 14240 + }, + { + "epoch": 0.31620070809424977, + "grad_norm": 1.0123707056045532, + "learning_rate": 1.5458569028703632e-05, + "loss": 0.5419, + "step": 14245 + }, + { + "epoch": 0.31631169465377745, + "grad_norm": 1.4290823936462402, + "learning_rate": 1.5455647195636706e-05, + "loss": 0.5567, + "step": 14250 + }, + { + "epoch": 0.3164226812133051, + "grad_norm": 1.0080236196517944, + "learning_rate": 1.5452724699290494e-05, + "loss": 0.4064, + "step": 14255 + }, + { + "epoch": 0.3165336677728327, + "grad_norm": 1.5305603742599487, + "learning_rate": 1.5449801540020294e-05, + "loss": 0.3978, + "step": 14260 + }, + { + "epoch": 0.31664465433236033, + "grad_norm": 1.0586655139923096, + "learning_rate": 1.5446877718181502e-05, + "loss": 0.3429, + "step": 14265 + }, + { + "epoch": 0.316755640891888, + "grad_norm": 1.0884453058242798, + "learning_rate": 1.5443953234129588e-05, + "loss": 0.5679, + "step": 14270 + }, + { + "epoch": 0.31686662745141564, + "grad_norm": 1.7376161813735962, + "learning_rate": 1.5441028088220094e-05, + "loss": 0.4128, + "step": 14275 + }, + { + "epoch": 0.31697761401094326, + "grad_norm": 1.216302752494812, + "learning_rate": 1.5438102280808653e-05, + "loss": 0.4103, + "step": 14280 + }, + { + "epoch": 0.3170886005704709, + "grad_norm": 1.2784180641174316, + "learning_rate": 1.5435175812250975e-05, + "loss": 0.567, + "step": 14285 + }, + { + "epoch": 0.3171995871299986, + "grad_norm": 1.0405032634735107, + "learning_rate": 1.543224868290285e-05, + "loss": 0.4103, + "step": 14290 + }, + { + "epoch": 0.3173105736895262, + "grad_norm": 0.7695423364639282, + "learning_rate": 1.542932089312015e-05, + "loss": 0.4365, + "step": 14295 + }, + { + "epoch": 0.3174215602490538, + "grad_norm": 1.6393778324127197, + "learning_rate": 1.5426392443258823e-05, + "loss": 0.5539, + "step": 14300 + }, + { + "epoch": 0.3175325468085815, + "grad_norm": 1.1428499221801758, + "learning_rate": 1.54234633336749e-05, + "loss": 0.3819, + "step": 14305 + }, + { + "epoch": 0.31764353336810913, + "grad_norm": 2.0584399700164795, + "learning_rate": 1.5420533564724495e-05, + "loss": 0.5535, + "step": 14310 + }, + { + "epoch": 0.31775451992763676, + "grad_norm": 0.7380337715148926, + "learning_rate": 1.5417603136763797e-05, + "loss": 0.524, + "step": 14315 + }, + { + "epoch": 0.3178655064871644, + "grad_norm": 1.0620688199996948, + "learning_rate": 1.5414672050149084e-05, + "loss": 0.4992, + "step": 14320 + }, + { + "epoch": 0.31797649304669207, + "grad_norm": 1.6038435697555542, + "learning_rate": 1.5411740305236698e-05, + "loss": 0.5696, + "step": 14325 + }, + { + "epoch": 0.3180874796062197, + "grad_norm": 1.363045573234558, + "learning_rate": 1.5408807902383074e-05, + "loss": 0.6078, + "step": 14330 + }, + { + "epoch": 0.3181984661657473, + "grad_norm": 1.191996693611145, + "learning_rate": 1.540587484194473e-05, + "loss": 0.5009, + "step": 14335 + }, + { + "epoch": 0.31830945272527494, + "grad_norm": 1.5661468505859375, + "learning_rate": 1.540294112427825e-05, + "loss": 0.3821, + "step": 14340 + }, + { + "epoch": 0.3184204392848026, + "grad_norm": 0.9835692644119263, + "learning_rate": 1.5400006749740305e-05, + "loss": 0.4166, + "step": 14345 + }, + { + "epoch": 0.31853142584433025, + "grad_norm": 1.3363474607467651, + "learning_rate": 1.539707171868765e-05, + "loss": 0.349, + "step": 14350 + }, + { + "epoch": 0.3186424124038579, + "grad_norm": 1.2180577516555786, + "learning_rate": 1.539413603147712e-05, + "loss": 0.4327, + "step": 14355 + }, + { + "epoch": 0.31875339896338556, + "grad_norm": 1.1217113733291626, + "learning_rate": 1.539119968846562e-05, + "loss": 0.4633, + "step": 14360 + }, + { + "epoch": 0.3188643855229132, + "grad_norm": 1.0430909395217896, + "learning_rate": 1.538826269001014e-05, + "loss": 0.5603, + "step": 14365 + }, + { + "epoch": 0.3189753720824408, + "grad_norm": 1.2054392099380493, + "learning_rate": 1.538532503646776e-05, + "loss": 0.6168, + "step": 14370 + }, + { + "epoch": 0.31908635864196844, + "grad_norm": 1.1659835577011108, + "learning_rate": 1.5382386728195616e-05, + "loss": 0.4366, + "step": 14375 + }, + { + "epoch": 0.3191973452014961, + "grad_norm": 1.2844847440719604, + "learning_rate": 1.537944776555095e-05, + "loss": 0.5524, + "step": 14380 + }, + { + "epoch": 0.31930833176102374, + "grad_norm": 0.9078483581542969, + "learning_rate": 1.537650814889106e-05, + "loss": 0.4722, + "step": 14385 + }, + { + "epoch": 0.31941931832055137, + "grad_norm": 1.4450150728225708, + "learning_rate": 1.5373567878573345e-05, + "loss": 0.3376, + "step": 14390 + }, + { + "epoch": 0.319530304880079, + "grad_norm": 1.5254474878311157, + "learning_rate": 1.5370626954955268e-05, + "loss": 0.3074, + "step": 14395 + }, + { + "epoch": 0.3196412914396067, + "grad_norm": 1.281997561454773, + "learning_rate": 1.5367685378394376e-05, + "loss": 0.5479, + "step": 14400 + }, + { + "epoch": 0.3197522779991343, + "grad_norm": 1.493674874305725, + "learning_rate": 1.53647431492483e-05, + "loss": 0.3875, + "step": 14405 + }, + { + "epoch": 0.31986326455866193, + "grad_norm": 1.1070647239685059, + "learning_rate": 1.536180026787474e-05, + "loss": 0.3339, + "step": 14410 + }, + { + "epoch": 0.3199742511181896, + "grad_norm": 2.0354061126708984, + "learning_rate": 1.5358856734631488e-05, + "loss": 0.4553, + "step": 14415 + }, + { + "epoch": 0.32008523767771724, + "grad_norm": 1.0139501094818115, + "learning_rate": 1.5355912549876408e-05, + "loss": 0.5503, + "step": 14420 + }, + { + "epoch": 0.32019622423724486, + "grad_norm": 1.764954924583435, + "learning_rate": 1.5352967713967442e-05, + "loss": 0.4297, + "step": 14425 + }, + { + "epoch": 0.3203072107967725, + "grad_norm": 1.7344118356704712, + "learning_rate": 1.5350022227262613e-05, + "loss": 0.4172, + "step": 14430 + }, + { + "epoch": 0.32041819735630017, + "grad_norm": 0.7399135828018188, + "learning_rate": 1.5347076090120025e-05, + "loss": 0.5299, + "step": 14435 + }, + { + "epoch": 0.3205291839158278, + "grad_norm": 1.514784812927246, + "learning_rate": 1.5344129302897857e-05, + "loss": 0.6035, + "step": 14440 + }, + { + "epoch": 0.3206401704753554, + "grad_norm": 2.186124801635742, + "learning_rate": 1.5341181865954372e-05, + "loss": 0.3716, + "step": 14445 + }, + { + "epoch": 0.32075115703488305, + "grad_norm": 1.0691059827804565, + "learning_rate": 1.533823377964791e-05, + "loss": 0.4006, + "step": 14450 + }, + { + "epoch": 0.32086214359441073, + "grad_norm": 1.0893067121505737, + "learning_rate": 1.5335285044336887e-05, + "loss": 0.4, + "step": 14455 + }, + { + "epoch": 0.32097313015393836, + "grad_norm": 1.1478650569915771, + "learning_rate": 1.53323356603798e-05, + "loss": 0.4683, + "step": 14460 + }, + { + "epoch": 0.321084116713466, + "grad_norm": 1.1583954095840454, + "learning_rate": 1.5329385628135227e-05, + "loss": 0.5464, + "step": 14465 + }, + { + "epoch": 0.32119510327299367, + "grad_norm": 1.4609736204147339, + "learning_rate": 1.5326434947961825e-05, + "loss": 0.4552, + "step": 14470 + }, + { + "epoch": 0.3213060898325213, + "grad_norm": 1.2604814767837524, + "learning_rate": 1.5323483620218324e-05, + "loss": 0.3968, + "step": 14475 + }, + { + "epoch": 0.3214170763920489, + "grad_norm": 0.8882197737693787, + "learning_rate": 1.5320531645263538e-05, + "loss": 0.5652, + "step": 14480 + }, + { + "epoch": 0.32152806295157654, + "grad_norm": 1.5108416080474854, + "learning_rate": 1.5317579023456355e-05, + "loss": 0.3902, + "step": 14485 + }, + { + "epoch": 0.3216390495111042, + "grad_norm": 1.3611268997192383, + "learning_rate": 1.5314625755155753e-05, + "loss": 0.401, + "step": 14490 + }, + { + "epoch": 0.32175003607063185, + "grad_norm": 1.0313974618911743, + "learning_rate": 1.5311671840720775e-05, + "loss": 0.2965, + "step": 14495 + }, + { + "epoch": 0.3218610226301595, + "grad_norm": 0.9944019913673401, + "learning_rate": 1.5308717280510547e-05, + "loss": 0.5465, + "step": 14500 + }, + { + "epoch": 0.3219720091896871, + "grad_norm": 1.3820387125015259, + "learning_rate": 1.5305762074884276e-05, + "loss": 0.4807, + "step": 14505 + }, + { + "epoch": 0.3220829957492148, + "grad_norm": 1.0571720600128174, + "learning_rate": 1.5302806224201247e-05, + "loss": 0.5615, + "step": 14510 + }, + { + "epoch": 0.3221939823087424, + "grad_norm": 1.5374435186386108, + "learning_rate": 1.529984972882082e-05, + "loss": 0.5059, + "step": 14515 + }, + { + "epoch": 0.32230496886827004, + "grad_norm": 1.101186752319336, + "learning_rate": 1.529689258910244e-05, + "loss": 0.3979, + "step": 14520 + }, + { + "epoch": 0.3224159554277977, + "grad_norm": 1.292052984237671, + "learning_rate": 1.529393480540562e-05, + "loss": 0.442, + "step": 14525 + }, + { + "epoch": 0.32252694198732534, + "grad_norm": 0.9494848251342773, + "learning_rate": 1.5290976378089962e-05, + "loss": 0.4632, + "step": 14530 + }, + { + "epoch": 0.32263792854685297, + "grad_norm": 1.0273785591125488, + "learning_rate": 1.5288017307515142e-05, + "loss": 0.2587, + "step": 14535 + }, + { + "epoch": 0.3227489151063806, + "grad_norm": 1.0562678575515747, + "learning_rate": 1.5285057594040912e-05, + "loss": 0.4169, + "step": 14540 + }, + { + "epoch": 0.3228599016659083, + "grad_norm": 3.380312204360962, + "learning_rate": 1.5282097238027106e-05, + "loss": 0.3716, + "step": 14545 + }, + { + "epoch": 0.3229708882254359, + "grad_norm": 1.0881527662277222, + "learning_rate": 1.527913623983363e-05, + "loss": 0.5425, + "step": 14550 + }, + { + "epoch": 0.32308187478496353, + "grad_norm": 1.6390355825424194, + "learning_rate": 1.5276174599820476e-05, + "loss": 0.6704, + "step": 14555 + }, + { + "epoch": 0.32319286134449116, + "grad_norm": 2.2281415462493896, + "learning_rate": 1.527321231834771e-05, + "loss": 0.4503, + "step": 14560 + }, + { + "epoch": 0.32330384790401884, + "grad_norm": 0.8207435011863708, + "learning_rate": 1.5270249395775473e-05, + "loss": 0.357, + "step": 14565 + }, + { + "epoch": 0.32341483446354646, + "grad_norm": 1.023201823234558, + "learning_rate": 1.526728583246399e-05, + "loss": 0.436, + "step": 14570 + }, + { + "epoch": 0.3235258210230741, + "grad_norm": 0.9416097402572632, + "learning_rate": 1.526432162877356e-05, + "loss": 0.5039, + "step": 14575 + }, + { + "epoch": 0.32363680758260177, + "grad_norm": 1.2997766733169556, + "learning_rate": 1.526135678506456e-05, + "loss": 0.437, + "step": 14580 + }, + { + "epoch": 0.3237477941421294, + "grad_norm": 1.6290065050125122, + "learning_rate": 1.5258391301697452e-05, + "loss": 0.4522, + "step": 14585 + }, + { + "epoch": 0.323858780701657, + "grad_norm": 1.3406147956848145, + "learning_rate": 1.5255425179032763e-05, + "loss": 0.4906, + "step": 14590 + }, + { + "epoch": 0.32396976726118465, + "grad_norm": 1.1843397617340088, + "learning_rate": 1.5252458417431106e-05, + "loss": 0.5041, + "step": 14595 + }, + { + "epoch": 0.32408075382071233, + "grad_norm": 1.036899209022522, + "learning_rate": 1.5249491017253166e-05, + "loss": 0.5141, + "step": 14600 + }, + { + "epoch": 0.32419174038023996, + "grad_norm": 1.191863775253296, + "learning_rate": 1.524652297885972e-05, + "loss": 0.6475, + "step": 14605 + }, + { + "epoch": 0.3243027269397676, + "grad_norm": 1.0216777324676514, + "learning_rate": 1.52435543026116e-05, + "loss": 0.4453, + "step": 14610 + }, + { + "epoch": 0.3244137134992952, + "grad_norm": 1.1279789209365845, + "learning_rate": 1.5240584988869738e-05, + "loss": 0.3538, + "step": 14615 + }, + { + "epoch": 0.3245247000588229, + "grad_norm": 1.2324609756469727, + "learning_rate": 1.5237615037995129e-05, + "loss": 0.2331, + "step": 14620 + }, + { + "epoch": 0.3246356866183505, + "grad_norm": 0.8681323528289795, + "learning_rate": 1.5234644450348848e-05, + "loss": 0.3599, + "step": 14625 + }, + { + "epoch": 0.32474667317787814, + "grad_norm": 0.8179816603660583, + "learning_rate": 1.5231673226292048e-05, + "loss": 0.4646, + "step": 14630 + }, + { + "epoch": 0.3248576597374058, + "grad_norm": 0.84742271900177, + "learning_rate": 1.5228701366185963e-05, + "loss": 0.466, + "step": 14635 + }, + { + "epoch": 0.32496864629693345, + "grad_norm": 1.1196081638336182, + "learning_rate": 1.5225728870391902e-05, + "loss": 0.6054, + "step": 14640 + }, + { + "epoch": 0.3250796328564611, + "grad_norm": 0.9078447818756104, + "learning_rate": 1.522275573927125e-05, + "loss": 0.4472, + "step": 14645 + }, + { + "epoch": 0.3251906194159887, + "grad_norm": 1.1502723693847656, + "learning_rate": 1.5219781973185477e-05, + "loss": 0.4347, + "step": 14650 + }, + { + "epoch": 0.3253016059755164, + "grad_norm": 0.9937777519226074, + "learning_rate": 1.521680757249611e-05, + "loss": 0.5165, + "step": 14655 + }, + { + "epoch": 0.325412592535044, + "grad_norm": 1.1031843423843384, + "learning_rate": 1.5213832537564778e-05, + "loss": 0.6251, + "step": 14660 + }, + { + "epoch": 0.32552357909457164, + "grad_norm": 1.3701297044754028, + "learning_rate": 1.5210856868753173e-05, + "loss": 0.6917, + "step": 14665 + }, + { + "epoch": 0.32563456565409926, + "grad_norm": 0.9994934797286987, + "learning_rate": 1.5207880566423064e-05, + "loss": 0.4558, + "step": 14670 + }, + { + "epoch": 0.32574555221362694, + "grad_norm": 1.2605825662612915, + "learning_rate": 1.5204903630936301e-05, + "loss": 0.365, + "step": 14675 + }, + { + "epoch": 0.32585653877315457, + "grad_norm": 1.5471420288085938, + "learning_rate": 1.5201926062654812e-05, + "loss": 0.3223, + "step": 14680 + }, + { + "epoch": 0.3259675253326822, + "grad_norm": 1.0369585752487183, + "learning_rate": 1.5198947861940596e-05, + "loss": 0.4513, + "step": 14685 + }, + { + "epoch": 0.3260785118922099, + "grad_norm": 1.1226929426193237, + "learning_rate": 1.5195969029155735e-05, + "loss": 0.5279, + "step": 14690 + }, + { + "epoch": 0.3261894984517375, + "grad_norm": 2.179290294647217, + "learning_rate": 1.5192989564662388e-05, + "loss": 0.5105, + "step": 14695 + }, + { + "epoch": 0.32630048501126513, + "grad_norm": 1.558292269706726, + "learning_rate": 1.5190009468822782e-05, + "loss": 0.6155, + "step": 14700 + }, + { + "epoch": 0.32641147157079275, + "grad_norm": 0.9227891564369202, + "learning_rate": 1.5187028741999234e-05, + "loss": 0.5226, + "step": 14705 + }, + { + "epoch": 0.32652245813032044, + "grad_norm": 1.2568672895431519, + "learning_rate": 1.5184047384554128e-05, + "loss": 0.3348, + "step": 14710 + }, + { + "epoch": 0.32663344468984806, + "grad_norm": 0.7160055637359619, + "learning_rate": 1.5181065396849924e-05, + "loss": 0.4504, + "step": 14715 + }, + { + "epoch": 0.3267444312493757, + "grad_norm": 0.9879772663116455, + "learning_rate": 1.5178082779249166e-05, + "loss": 0.4325, + "step": 14720 + }, + { + "epoch": 0.3268554178089033, + "grad_norm": 0.9792838096618652, + "learning_rate": 1.5175099532114468e-05, + "loss": 0.451, + "step": 14725 + }, + { + "epoch": 0.326966404368431, + "grad_norm": 1.3058290481567383, + "learning_rate": 1.5172115655808527e-05, + "loss": 0.5641, + "step": 14730 + }, + { + "epoch": 0.3270773909279586, + "grad_norm": 1.0594699382781982, + "learning_rate": 1.5169131150694112e-05, + "loss": 0.4265, + "step": 14735 + }, + { + "epoch": 0.32718837748748625, + "grad_norm": 1.0784043073654175, + "learning_rate": 1.5166146017134063e-05, + "loss": 0.4794, + "step": 14740 + }, + { + "epoch": 0.32729936404701393, + "grad_norm": 1.2410465478897095, + "learning_rate": 1.5163160255491312e-05, + "loss": 0.5655, + "step": 14745 + }, + { + "epoch": 0.32741035060654156, + "grad_norm": 1.521320104598999, + "learning_rate": 1.5160173866128848e-05, + "loss": 0.5817, + "step": 14750 + }, + { + "epoch": 0.3275213371660692, + "grad_norm": 1.2554552555084229, + "learning_rate": 1.5157186849409755e-05, + "loss": 0.4564, + "step": 14755 + }, + { + "epoch": 0.3276323237255968, + "grad_norm": 1.2104326486587524, + "learning_rate": 1.515419920569718e-05, + "loss": 0.6301, + "step": 14760 + }, + { + "epoch": 0.3277433102851245, + "grad_norm": 1.1501840353012085, + "learning_rate": 1.5151210935354352e-05, + "loss": 0.5422, + "step": 14765 + }, + { + "epoch": 0.3278542968446521, + "grad_norm": 1.3156615495681763, + "learning_rate": 1.5148222038744571e-05, + "loss": 0.6681, + "step": 14770 + }, + { + "epoch": 0.32796528340417974, + "grad_norm": 1.120080590248108, + "learning_rate": 1.5145232516231226e-05, + "loss": 0.4336, + "step": 14775 + }, + { + "epoch": 0.32807626996370737, + "grad_norm": 1.1407369375228882, + "learning_rate": 1.5142242368177762e-05, + "loss": 0.6098, + "step": 14780 + }, + { + "epoch": 0.32818725652323505, + "grad_norm": 0.8843163251876831, + "learning_rate": 1.5139251594947721e-05, + "loss": 0.4413, + "step": 14785 + }, + { + "epoch": 0.3282982430827627, + "grad_norm": 1.3494120836257935, + "learning_rate": 1.5136260196904704e-05, + "loss": 0.3554, + "step": 14790 + }, + { + "epoch": 0.3284092296422903, + "grad_norm": 1.1606281995773315, + "learning_rate": 1.5133268174412399e-05, + "loss": 0.3569, + "step": 14795 + }, + { + "epoch": 0.328520216201818, + "grad_norm": 1.158096194267273, + "learning_rate": 1.5130275527834566e-05, + "loss": 0.606, + "step": 14800 + }, + { + "epoch": 0.3286312027613456, + "grad_norm": 1.2391105890274048, + "learning_rate": 1.5127282257535037e-05, + "loss": 0.3439, + "step": 14805 + }, + { + "epoch": 0.32874218932087323, + "grad_norm": 1.2935023307800293, + "learning_rate": 1.5124288363877728e-05, + "loss": 0.4074, + "step": 14810 + }, + { + "epoch": 0.32885317588040086, + "grad_norm": 1.0960984230041504, + "learning_rate": 1.5121293847226626e-05, + "loss": 0.5407, + "step": 14815 + }, + { + "epoch": 0.32896416243992854, + "grad_norm": 1.383482575416565, + "learning_rate": 1.5118298707945794e-05, + "loss": 0.4033, + "step": 14820 + }, + { + "epoch": 0.32907514899945617, + "grad_norm": 1.285130262374878, + "learning_rate": 1.5115302946399368e-05, + "loss": 0.5377, + "step": 14825 + }, + { + "epoch": 0.3291861355589838, + "grad_norm": 1.1885371208190918, + "learning_rate": 1.5112306562951569e-05, + "loss": 0.4775, + "step": 14830 + }, + { + "epoch": 0.3292971221185114, + "grad_norm": 0.8962212204933167, + "learning_rate": 1.510930955796668e-05, + "loss": 0.2948, + "step": 14835 + }, + { + "epoch": 0.3294081086780391, + "grad_norm": 1.2280473709106445, + "learning_rate": 1.510631193180907e-05, + "loss": 0.4923, + "step": 14840 + }, + { + "epoch": 0.32951909523756673, + "grad_norm": 1.0373233556747437, + "learning_rate": 1.5103313684843184e-05, + "loss": 0.6092, + "step": 14845 + }, + { + "epoch": 0.32963008179709435, + "grad_norm": 1.2166337966918945, + "learning_rate": 1.5100314817433535e-05, + "loss": 0.624, + "step": 14850 + }, + { + "epoch": 0.32974106835662204, + "grad_norm": 1.1595144271850586, + "learning_rate": 1.5097315329944711e-05, + "loss": 0.5713, + "step": 14855 + }, + { + "epoch": 0.32985205491614966, + "grad_norm": 1.23126220703125, + "learning_rate": 1.5094315222741388e-05, + "loss": 0.6027, + "step": 14860 + }, + { + "epoch": 0.3299630414756773, + "grad_norm": 0.8551965951919556, + "learning_rate": 1.5091314496188303e-05, + "loss": 0.4671, + "step": 14865 + }, + { + "epoch": 0.3300740280352049, + "grad_norm": 1.1392122507095337, + "learning_rate": 1.5088313150650278e-05, + "loss": 0.426, + "step": 14870 + }, + { + "epoch": 0.3301850145947326, + "grad_norm": 1.1065609455108643, + "learning_rate": 1.5085311186492206e-05, + "loss": 0.3221, + "step": 14875 + }, + { + "epoch": 0.3302960011542602, + "grad_norm": 2.416872262954712, + "learning_rate": 1.5082308604079054e-05, + "loss": 0.3214, + "step": 14880 + }, + { + "epoch": 0.33040698771378785, + "grad_norm": 0.9736378788948059, + "learning_rate": 1.5079305403775866e-05, + "loss": 0.483, + "step": 14885 + }, + { + "epoch": 0.33051797427331553, + "grad_norm": 1.625980019569397, + "learning_rate": 1.5076301585947763e-05, + "loss": 0.4877, + "step": 14890 + }, + { + "epoch": 0.33062896083284316, + "grad_norm": 1.5329805612564087, + "learning_rate": 1.5073297150959935e-05, + "loss": 0.4652, + "step": 14895 + }, + { + "epoch": 0.3307399473923708, + "grad_norm": 0.7575278282165527, + "learning_rate": 1.5070292099177656e-05, + "loss": 0.5156, + "step": 14900 + }, + { + "epoch": 0.3308509339518984, + "grad_norm": 0.901342511177063, + "learning_rate": 1.506728643096627e-05, + "loss": 0.471, + "step": 14905 + }, + { + "epoch": 0.3309619205114261, + "grad_norm": 1.2140843868255615, + "learning_rate": 1.506428014669119e-05, + "loss": 0.4416, + "step": 14910 + }, + { + "epoch": 0.3310729070709537, + "grad_norm": 0.9275099635124207, + "learning_rate": 1.5061273246717918e-05, + "loss": 0.4653, + "step": 14915 + }, + { + "epoch": 0.33118389363048134, + "grad_norm": 1.206339716911316, + "learning_rate": 1.5058265731412017e-05, + "loss": 0.5535, + "step": 14920 + }, + { + "epoch": 0.33129488019000897, + "grad_norm": 1.1294726133346558, + "learning_rate": 1.5055257601139132e-05, + "loss": 0.2508, + "step": 14925 + }, + { + "epoch": 0.33140586674953665, + "grad_norm": 1.9953529834747314, + "learning_rate": 1.5052248856264982e-05, + "loss": 0.5702, + "step": 14930 + }, + { + "epoch": 0.3315168533090643, + "grad_norm": 1.2275333404541016, + "learning_rate": 1.504923949715536e-05, + "loss": 0.5817, + "step": 14935 + }, + { + "epoch": 0.3316278398685919, + "grad_norm": 0.9535608887672424, + "learning_rate": 1.5046229524176132e-05, + "loss": 0.4339, + "step": 14940 + }, + { + "epoch": 0.3317388264281196, + "grad_norm": 1.269870400428772, + "learning_rate": 1.5043218937693245e-05, + "loss": 0.5943, + "step": 14945 + }, + { + "epoch": 0.3318498129876472, + "grad_norm": 1.8588420152664185, + "learning_rate": 1.5040207738072714e-05, + "loss": 0.4375, + "step": 14950 + }, + { + "epoch": 0.33196079954717483, + "grad_norm": 1.0522677898406982, + "learning_rate": 1.5037195925680626e-05, + "loss": 0.3727, + "step": 14955 + }, + { + "epoch": 0.33207178610670246, + "grad_norm": 1.5340313911437988, + "learning_rate": 1.5034183500883153e-05, + "loss": 0.4697, + "step": 14960 + }, + { + "epoch": 0.33218277266623014, + "grad_norm": 0.8807029128074646, + "learning_rate": 1.5031170464046532e-05, + "loss": 0.437, + "step": 14965 + }, + { + "epoch": 0.33229375922575777, + "grad_norm": 1.765564203262329, + "learning_rate": 1.5028156815537083e-05, + "loss": 0.3982, + "step": 14970 + }, + { + "epoch": 0.3324047457852854, + "grad_norm": 1.0525873899459839, + "learning_rate": 1.5025142555721189e-05, + "loss": 0.5576, + "step": 14975 + }, + { + "epoch": 0.332515732344813, + "grad_norm": 1.1160537004470825, + "learning_rate": 1.5022127684965316e-05, + "loss": 0.5933, + "step": 14980 + }, + { + "epoch": 0.3326267189043407, + "grad_norm": 1.0868256092071533, + "learning_rate": 1.5019112203636002e-05, + "loss": 0.5242, + "step": 14985 + }, + { + "epoch": 0.3327377054638683, + "grad_norm": 0.989216685295105, + "learning_rate": 1.5016096112099858e-05, + "loss": 0.6588, + "step": 14990 + }, + { + "epoch": 0.33284869202339595, + "grad_norm": 1.5227062702178955, + "learning_rate": 1.501307941072357e-05, + "loss": 0.4789, + "step": 14995 + }, + { + "epoch": 0.33295967858292363, + "grad_norm": 0.7849462628364563, + "learning_rate": 1.5010062099873904e-05, + "loss": 0.4131, + "step": 15000 + }, + { + "epoch": 0.33307066514245126, + "grad_norm": 1.0590498447418213, + "learning_rate": 1.5007044179917686e-05, + "loss": 0.5614, + "step": 15005 + }, + { + "epoch": 0.3331816517019789, + "grad_norm": 1.6085760593414307, + "learning_rate": 1.5004025651221833e-05, + "loss": 0.4746, + "step": 15010 + }, + { + "epoch": 0.3332926382615065, + "grad_norm": 1.1510807275772095, + "learning_rate": 1.5001006514153323e-05, + "loss": 0.5765, + "step": 15015 + }, + { + "epoch": 0.3334036248210342, + "grad_norm": 1.6156750917434692, + "learning_rate": 1.4997986769079212e-05, + "loss": 0.4793, + "step": 15020 + }, + { + "epoch": 0.3335146113805618, + "grad_norm": 1.2351129055023193, + "learning_rate": 1.499496641636663e-05, + "loss": 0.3932, + "step": 15025 + }, + { + "epoch": 0.33362559794008945, + "grad_norm": 0.9428719282150269, + "learning_rate": 1.4991945456382784e-05, + "loss": 0.5519, + "step": 15030 + }, + { + "epoch": 0.3337365844996171, + "grad_norm": 1.2243869304656982, + "learning_rate": 1.4988923889494952e-05, + "loss": 0.2783, + "step": 15035 + }, + { + "epoch": 0.33384757105914475, + "grad_norm": 1.2313982248306274, + "learning_rate": 1.4985901716070486e-05, + "loss": 0.5672, + "step": 15040 + }, + { + "epoch": 0.3339585576186724, + "grad_norm": 0.9632226228713989, + "learning_rate": 1.4982878936476808e-05, + "loss": 0.4697, + "step": 15045 + }, + { + "epoch": 0.3340695441782, + "grad_norm": 1.2731530666351318, + "learning_rate": 1.4979855551081422e-05, + "loss": 0.4888, + "step": 15050 + }, + { + "epoch": 0.3341805307377277, + "grad_norm": 1.4409986734390259, + "learning_rate": 1.4976831560251901e-05, + "loss": 0.3715, + "step": 15055 + }, + { + "epoch": 0.3342915172972553, + "grad_norm": 0.7494924068450928, + "learning_rate": 1.4973806964355886e-05, + "loss": 0.3139, + "step": 15060 + }, + { + "epoch": 0.33440250385678294, + "grad_norm": 1.039964199066162, + "learning_rate": 1.4970781763761105e-05, + "loss": 0.3249, + "step": 15065 + }, + { + "epoch": 0.33451349041631057, + "grad_norm": 1.3432955741882324, + "learning_rate": 1.4967755958835346e-05, + "loss": 0.4206, + "step": 15070 + }, + { + "epoch": 0.33462447697583825, + "grad_norm": 1.152287483215332, + "learning_rate": 1.4964729549946477e-05, + "loss": 0.5159, + "step": 15075 + }, + { + "epoch": 0.3347354635353659, + "grad_norm": 1.3000831604003906, + "learning_rate": 1.4961702537462439e-05, + "loss": 0.5009, + "step": 15080 + }, + { + "epoch": 0.3348464500948935, + "grad_norm": 1.213396668434143, + "learning_rate": 1.4958674921751248e-05, + "loss": 0.3251, + "step": 15085 + }, + { + "epoch": 0.3349574366544211, + "grad_norm": 1.6604156494140625, + "learning_rate": 1.495564670318099e-05, + "loss": 0.6201, + "step": 15090 + }, + { + "epoch": 0.3350684232139488, + "grad_norm": 0.8826848864555359, + "learning_rate": 1.4952617882119826e-05, + "loss": 0.5158, + "step": 15095 + }, + { + "epoch": 0.33517940977347643, + "grad_norm": 1.5868339538574219, + "learning_rate": 1.4949588458935994e-05, + "loss": 0.4477, + "step": 15100 + }, + { + "epoch": 0.33529039633300406, + "grad_norm": 2.079285144805908, + "learning_rate": 1.4946558433997792e-05, + "loss": 0.5307, + "step": 15105 + }, + { + "epoch": 0.33540138289253174, + "grad_norm": 1.3494343757629395, + "learning_rate": 1.4943527807673604e-05, + "loss": 0.4364, + "step": 15110 + }, + { + "epoch": 0.33551236945205937, + "grad_norm": 1.2706613540649414, + "learning_rate": 1.4940496580331884e-05, + "loss": 0.3495, + "step": 15115 + }, + { + "epoch": 0.335623356011587, + "grad_norm": 1.4336494207382202, + "learning_rate": 1.4937464752341163e-05, + "loss": 0.5021, + "step": 15120 + }, + { + "epoch": 0.3357343425711146, + "grad_norm": 1.0443059206008911, + "learning_rate": 1.4934432324070033e-05, + "loss": 0.5054, + "step": 15125 + }, + { + "epoch": 0.3358453291306423, + "grad_norm": 1.4348981380462646, + "learning_rate": 1.4931399295887172e-05, + "loss": 0.585, + "step": 15130 + }, + { + "epoch": 0.3359563156901699, + "grad_norm": 1.1642272472381592, + "learning_rate": 1.4928365668161322e-05, + "loss": 0.5025, + "step": 15135 + }, + { + "epoch": 0.33606730224969755, + "grad_norm": 1.3693106174468994, + "learning_rate": 1.4925331441261303e-05, + "loss": 0.5125, + "step": 15140 + }, + { + "epoch": 0.3361782888092252, + "grad_norm": 1.2210627794265747, + "learning_rate": 1.4922296615556007e-05, + "loss": 0.4841, + "step": 15145 + }, + { + "epoch": 0.33628927536875286, + "grad_norm": 0.6630014181137085, + "learning_rate": 1.4919261191414394e-05, + "loss": 0.4659, + "step": 15150 + }, + { + "epoch": 0.3364002619282805, + "grad_norm": 1.1604273319244385, + "learning_rate": 1.4916225169205505e-05, + "loss": 0.5551, + "step": 15155 + }, + { + "epoch": 0.3365112484878081, + "grad_norm": 0.9893293976783752, + "learning_rate": 1.4913188549298447e-05, + "loss": 0.541, + "step": 15160 + }, + { + "epoch": 0.3366222350473358, + "grad_norm": 1.03878915309906, + "learning_rate": 1.4910151332062404e-05, + "loss": 0.4045, + "step": 15165 + }, + { + "epoch": 0.3367332216068634, + "grad_norm": 1.2045263051986694, + "learning_rate": 1.4907113517866629e-05, + "loss": 0.556, + "step": 15170 + }, + { + "epoch": 0.33684420816639105, + "grad_norm": 1.4657323360443115, + "learning_rate": 1.4904075107080448e-05, + "loss": 0.4307, + "step": 15175 + }, + { + "epoch": 0.33695519472591867, + "grad_norm": 1.265890121459961, + "learning_rate": 1.4901036100073265e-05, + "loss": 0.5265, + "step": 15180 + }, + { + "epoch": 0.33706618128544635, + "grad_norm": 1.269054651260376, + "learning_rate": 1.4897996497214548e-05, + "loss": 0.3963, + "step": 15185 + }, + { + "epoch": 0.337177167844974, + "grad_norm": 1.269852876663208, + "learning_rate": 1.4894956298873844e-05, + "loss": 0.4867, + "step": 15190 + }, + { + "epoch": 0.3372881544045016, + "grad_norm": 1.182908296585083, + "learning_rate": 1.4891915505420768e-05, + "loss": 0.3747, + "step": 15195 + }, + { + "epoch": 0.33739914096402923, + "grad_norm": 1.119896650314331, + "learning_rate": 1.4888874117225013e-05, + "loss": 0.3979, + "step": 15200 + }, + { + "epoch": 0.3375101275235569, + "grad_norm": 1.0146291255950928, + "learning_rate": 1.488583213465634e-05, + "loss": 0.457, + "step": 15205 + }, + { + "epoch": 0.33762111408308454, + "grad_norm": 0.9620803594589233, + "learning_rate": 1.4882789558084578e-05, + "loss": 0.6078, + "step": 15210 + }, + { + "epoch": 0.33773210064261217, + "grad_norm": 1.4261409044265747, + "learning_rate": 1.487974638787964e-05, + "loss": 0.449, + "step": 15215 + }, + { + "epoch": 0.33784308720213985, + "grad_norm": 1.0503255128860474, + "learning_rate": 1.48767026244115e-05, + "loss": 0.3487, + "step": 15220 + }, + { + "epoch": 0.3379540737616675, + "grad_norm": 1.3295342922210693, + "learning_rate": 1.487365826805021e-05, + "loss": 0.5331, + "step": 15225 + }, + { + "epoch": 0.3380650603211951, + "grad_norm": 1.5140436887741089, + "learning_rate": 1.4870613319165894e-05, + "loss": 0.61, + "step": 15230 + }, + { + "epoch": 0.3381760468807227, + "grad_norm": 1.2723734378814697, + "learning_rate": 1.4867567778128744e-05, + "loss": 0.532, + "step": 15235 + }, + { + "epoch": 0.3382870334402504, + "grad_norm": 0.8640614151954651, + "learning_rate": 1.4864521645309031e-05, + "loss": 0.4707, + "step": 15240 + }, + { + "epoch": 0.33839801999977803, + "grad_norm": 1.4589073657989502, + "learning_rate": 1.4861474921077088e-05, + "loss": 0.5183, + "step": 15245 + }, + { + "epoch": 0.33850900655930566, + "grad_norm": 0.8808674812316895, + "learning_rate": 1.485842760580333e-05, + "loss": 0.6101, + "step": 15250 + }, + { + "epoch": 0.3386199931188333, + "grad_norm": 1.252490520477295, + "learning_rate": 1.4855379699858236e-05, + "loss": 0.3703, + "step": 15255 + }, + { + "epoch": 0.33873097967836097, + "grad_norm": 1.2436802387237549, + "learning_rate": 1.4852331203612363e-05, + "loss": 0.3798, + "step": 15260 + }, + { + "epoch": 0.3388419662378886, + "grad_norm": 1.3613600730895996, + "learning_rate": 1.4849282117436335e-05, + "loss": 0.2952, + "step": 15265 + }, + { + "epoch": 0.3389529527974162, + "grad_norm": 1.2618591785430908, + "learning_rate": 1.4846232441700849e-05, + "loss": 0.5093, + "step": 15270 + }, + { + "epoch": 0.3390639393569439, + "grad_norm": 1.2628250122070312, + "learning_rate": 1.4843182176776679e-05, + "loss": 0.4677, + "step": 15275 + }, + { + "epoch": 0.3391749259164715, + "grad_norm": 1.2692986726760864, + "learning_rate": 1.4840131323034661e-05, + "loss": 0.5347, + "step": 15280 + }, + { + "epoch": 0.33928591247599915, + "grad_norm": 0.9842135906219482, + "learning_rate": 1.4837079880845711e-05, + "loss": 0.5112, + "step": 15285 + }, + { + "epoch": 0.3393968990355268, + "grad_norm": 1.024045467376709, + "learning_rate": 1.4834027850580809e-05, + "loss": 0.4785, + "step": 15290 + }, + { + "epoch": 0.33950788559505446, + "grad_norm": 1.2909910678863525, + "learning_rate": 1.4830975232611013e-05, + "loss": 0.4522, + "step": 15295 + }, + { + "epoch": 0.3396188721545821, + "grad_norm": 1.6360613107681274, + "learning_rate": 1.482792202730745e-05, + "loss": 0.487, + "step": 15300 + }, + { + "epoch": 0.3397298587141097, + "grad_norm": 0.9143996834754944, + "learning_rate": 1.482486823504132e-05, + "loss": 0.3799, + "step": 15305 + }, + { + "epoch": 0.33984084527363734, + "grad_norm": 1.7492982149124146, + "learning_rate": 1.4821813856183891e-05, + "loss": 0.5018, + "step": 15310 + }, + { + "epoch": 0.339951831833165, + "grad_norm": 1.0792031288146973, + "learning_rate": 1.4818758891106504e-05, + "loss": 0.3132, + "step": 15315 + }, + { + "epoch": 0.34006281839269265, + "grad_norm": 1.0121617317199707, + "learning_rate": 1.4815703340180572e-05, + "loss": 0.3193, + "step": 15320 + }, + { + "epoch": 0.34017380495222027, + "grad_norm": 1.20917546749115, + "learning_rate": 1.4812647203777578e-05, + "loss": 0.3868, + "step": 15325 + }, + { + "epoch": 0.34028479151174795, + "grad_norm": 2.5683934688568115, + "learning_rate": 1.4809590482269078e-05, + "loss": 0.4244, + "step": 15330 + }, + { + "epoch": 0.3403957780712756, + "grad_norm": 1.3045350313186646, + "learning_rate": 1.4806533176026696e-05, + "loss": 0.6908, + "step": 15335 + }, + { + "epoch": 0.3405067646308032, + "grad_norm": 1.223612904548645, + "learning_rate": 1.480347528542213e-05, + "loss": 0.3019, + "step": 15340 + }, + { + "epoch": 0.34061775119033083, + "grad_norm": 1.101333498954773, + "learning_rate": 1.4800416810827151e-05, + "loss": 0.6385, + "step": 15345 + }, + { + "epoch": 0.3407287377498585, + "grad_norm": 1.2106317281723022, + "learning_rate": 1.4797357752613594e-05, + "loss": 0.3609, + "step": 15350 + }, + { + "epoch": 0.34083972430938614, + "grad_norm": 0.8643994927406311, + "learning_rate": 1.4794298111153374e-05, + "loss": 0.4171, + "step": 15355 + }, + { + "epoch": 0.34095071086891376, + "grad_norm": 1.0502965450286865, + "learning_rate": 1.4791237886818464e-05, + "loss": 0.3617, + "step": 15360 + }, + { + "epoch": 0.3410616974284414, + "grad_norm": 0.955426812171936, + "learning_rate": 1.4788177079980919e-05, + "loss": 0.3525, + "step": 15365 + }, + { + "epoch": 0.34117268398796907, + "grad_norm": 0.9375084638595581, + "learning_rate": 1.4785115691012866e-05, + "loss": 0.6322, + "step": 15370 + }, + { + "epoch": 0.3412836705474967, + "grad_norm": 0.9736770391464233, + "learning_rate": 1.4782053720286496e-05, + "loss": 0.3913, + "step": 15375 + }, + { + "epoch": 0.3413946571070243, + "grad_norm": 1.5599290132522583, + "learning_rate": 1.477899116817407e-05, + "loss": 0.4838, + "step": 15380 + }, + { + "epoch": 0.341505643666552, + "grad_norm": 1.8174604177474976, + "learning_rate": 1.4775928035047928e-05, + "loss": 0.5936, + "step": 15385 + }, + { + "epoch": 0.34161663022607963, + "grad_norm": 1.320636510848999, + "learning_rate": 1.477286432128047e-05, + "loss": 0.4545, + "step": 15390 + }, + { + "epoch": 0.34172761678560726, + "grad_norm": 0.8202735781669617, + "learning_rate": 1.4769800027244175e-05, + "loss": 0.5133, + "step": 15395 + }, + { + "epoch": 0.3418386033451349, + "grad_norm": 0.8367480039596558, + "learning_rate": 1.476673515331159e-05, + "loss": 0.4065, + "step": 15400 + }, + { + "epoch": 0.34194958990466257, + "grad_norm": 1.1306264400482178, + "learning_rate": 1.4763669699855334e-05, + "loss": 0.4893, + "step": 15405 + }, + { + "epoch": 0.3420605764641902, + "grad_norm": 1.102778434753418, + "learning_rate": 1.4760603667248087e-05, + "loss": 0.3346, + "step": 15410 + }, + { + "epoch": 0.3421715630237178, + "grad_norm": 1.1968376636505127, + "learning_rate": 1.4757537055862616e-05, + "loss": 0.4742, + "step": 15415 + }, + { + "epoch": 0.34228254958324544, + "grad_norm": 1.0032037496566772, + "learning_rate": 1.4754469866071745e-05, + "loss": 0.4358, + "step": 15420 + }, + { + "epoch": 0.3423935361427731, + "grad_norm": 1.210485816001892, + "learning_rate": 1.4751402098248373e-05, + "loss": 0.4164, + "step": 15425 + }, + { + "epoch": 0.34250452270230075, + "grad_norm": 1.4902434349060059, + "learning_rate": 1.4748333752765467e-05, + "loss": 0.4337, + "step": 15430 + }, + { + "epoch": 0.3426155092618284, + "grad_norm": 1.163905382156372, + "learning_rate": 1.4745264829996071e-05, + "loss": 0.5235, + "step": 15435 + }, + { + "epoch": 0.34272649582135606, + "grad_norm": 1.2259149551391602, + "learning_rate": 1.4742195330313287e-05, + "loss": 0.2444, + "step": 15440 + }, + { + "epoch": 0.3428374823808837, + "grad_norm": 1.1223088502883911, + "learning_rate": 1.4739125254090303e-05, + "loss": 0.5696, + "step": 15445 + }, + { + "epoch": 0.3429484689404113, + "grad_norm": 1.2788912057876587, + "learning_rate": 1.4736054601700361e-05, + "loss": 0.4696, + "step": 15450 + }, + { + "epoch": 0.34305945549993894, + "grad_norm": 1.5745298862457275, + "learning_rate": 1.4732983373516784e-05, + "loss": 0.399, + "step": 15455 + }, + { + "epoch": 0.3431704420594666, + "grad_norm": 0.7371529340744019, + "learning_rate": 1.4729911569912965e-05, + "loss": 0.5272, + "step": 15460 + }, + { + "epoch": 0.34328142861899424, + "grad_norm": 0.6537275910377502, + "learning_rate": 1.4726839191262358e-05, + "loss": 0.4604, + "step": 15465 + }, + { + "epoch": 0.34339241517852187, + "grad_norm": 1.454412579536438, + "learning_rate": 1.4723766237938495e-05, + "loss": 0.4575, + "step": 15470 + }, + { + "epoch": 0.3435034017380495, + "grad_norm": 1.6370235681533813, + "learning_rate": 1.4720692710314975e-05, + "loss": 0.4841, + "step": 15475 + }, + { + "epoch": 0.3436143882975772, + "grad_norm": 2.204667568206787, + "learning_rate": 1.4717618608765465e-05, + "loss": 0.6183, + "step": 15480 + }, + { + "epoch": 0.3437253748571048, + "grad_norm": 1.4957976341247559, + "learning_rate": 1.471454393366371e-05, + "loss": 0.6387, + "step": 15485 + }, + { + "epoch": 0.34383636141663243, + "grad_norm": 1.743932843208313, + "learning_rate": 1.4711468685383515e-05, + "loss": 0.4208, + "step": 15490 + }, + { + "epoch": 0.3439473479761601, + "grad_norm": 1.1519712209701538, + "learning_rate": 1.4708392864298754e-05, + "loss": 0.5323, + "step": 15495 + }, + { + "epoch": 0.34405833453568774, + "grad_norm": 1.2590768337249756, + "learning_rate": 1.4705316470783384e-05, + "loss": 0.2691, + "step": 15500 + }, + { + "epoch": 0.34416932109521536, + "grad_norm": 1.0717517137527466, + "learning_rate": 1.4702239505211414e-05, + "loss": 0.5341, + "step": 15505 + }, + { + "epoch": 0.344280307654743, + "grad_norm": 1.0902793407440186, + "learning_rate": 1.4699161967956936e-05, + "loss": 0.5581, + "step": 15510 + }, + { + "epoch": 0.34439129421427067, + "grad_norm": 1.8656681776046753, + "learning_rate": 1.4696083859394107e-05, + "loss": 0.6056, + "step": 15515 + }, + { + "epoch": 0.3445022807737983, + "grad_norm": 0.9933974742889404, + "learning_rate": 1.4693005179897154e-05, + "loss": 0.4728, + "step": 15520 + }, + { + "epoch": 0.3446132673333259, + "grad_norm": 1.2458628416061401, + "learning_rate": 1.4689925929840367e-05, + "loss": 0.5591, + "step": 15525 + }, + { + "epoch": 0.34472425389285355, + "grad_norm": 0.7363001704216003, + "learning_rate": 1.4686846109598114e-05, + "loss": 0.4925, + "step": 15530 + }, + { + "epoch": 0.34483524045238123, + "grad_norm": 1.4438176155090332, + "learning_rate": 1.4683765719544832e-05, + "loss": 0.6418, + "step": 15535 + }, + { + "epoch": 0.34494622701190886, + "grad_norm": 0.6764224767684937, + "learning_rate": 1.468068476005502e-05, + "loss": 0.4647, + "step": 15540 + }, + { + "epoch": 0.3450572135714365, + "grad_norm": 1.0960233211517334, + "learning_rate": 1.4677603231503254e-05, + "loss": 0.443, + "step": 15545 + }, + { + "epoch": 0.34516820013096416, + "grad_norm": 1.5263903141021729, + "learning_rate": 1.4674521134264174e-05, + "loss": 0.4602, + "step": 15550 + }, + { + "epoch": 0.3452791866904918, + "grad_norm": 1.222199559211731, + "learning_rate": 1.467143846871249e-05, + "loss": 0.4396, + "step": 15555 + }, + { + "epoch": 0.3453901732500194, + "grad_norm": 0.9828495979309082, + "learning_rate": 1.4668355235222986e-05, + "loss": 0.4504, + "step": 15560 + }, + { + "epoch": 0.34550115980954704, + "grad_norm": 1.0530000925064087, + "learning_rate": 1.4665271434170507e-05, + "loss": 0.5981, + "step": 15565 + }, + { + "epoch": 0.3456121463690747, + "grad_norm": 1.3413766622543335, + "learning_rate": 1.4662187065929976e-05, + "loss": 0.4722, + "step": 15570 + }, + { + "epoch": 0.34572313292860235, + "grad_norm": 1.097169041633606, + "learning_rate": 1.4659102130876374e-05, + "loss": 0.5017, + "step": 15575 + }, + { + "epoch": 0.34583411948813, + "grad_norm": 1.3739992380142212, + "learning_rate": 1.4656016629384762e-05, + "loss": 0.4514, + "step": 15580 + }, + { + "epoch": 0.3459451060476576, + "grad_norm": 1.134070634841919, + "learning_rate": 1.4652930561830263e-05, + "loss": 0.4381, + "step": 15585 + }, + { + "epoch": 0.3460560926071853, + "grad_norm": 1.2391133308410645, + "learning_rate": 1.464984392858807e-05, + "loss": 0.4815, + "step": 15590 + }, + { + "epoch": 0.3461670791667129, + "grad_norm": 1.32672917842865, + "learning_rate": 1.464675673003345e-05, + "loss": 0.4924, + "step": 15595 + }, + { + "epoch": 0.34627806572624054, + "grad_norm": 1.2489418983459473, + "learning_rate": 1.4643668966541726e-05, + "loss": 0.5202, + "step": 15600 + }, + { + "epoch": 0.3463890522857682, + "grad_norm": 1.1635617017745972, + "learning_rate": 1.4640580638488306e-05, + "loss": 0.4877, + "step": 15605 + }, + { + "epoch": 0.34650003884529584, + "grad_norm": 1.0924367904663086, + "learning_rate": 1.4637491746248653e-05, + "loss": 0.3345, + "step": 15610 + }, + { + "epoch": 0.34661102540482347, + "grad_norm": 1.1278338432312012, + "learning_rate": 1.4634402290198306e-05, + "loss": 0.4457, + "step": 15615 + }, + { + "epoch": 0.3467220119643511, + "grad_norm": 0.7023510336875916, + "learning_rate": 1.4631312270712875e-05, + "loss": 0.5518, + "step": 15620 + }, + { + "epoch": 0.3468329985238788, + "grad_norm": 1.3885475397109985, + "learning_rate": 1.462822168816803e-05, + "loss": 0.3658, + "step": 15625 + }, + { + "epoch": 0.3469439850834064, + "grad_norm": 1.0028605461120605, + "learning_rate": 1.4625130542939512e-05, + "loss": 0.3945, + "step": 15630 + }, + { + "epoch": 0.34705497164293403, + "grad_norm": 1.6290814876556396, + "learning_rate": 1.4622038835403135e-05, + "loss": 0.6108, + "step": 15635 + }, + { + "epoch": 0.34716595820246166, + "grad_norm": 1.0613174438476562, + "learning_rate": 1.4618946565934775e-05, + "loss": 0.4755, + "step": 15640 + }, + { + "epoch": 0.34727694476198934, + "grad_norm": 1.064847469329834, + "learning_rate": 1.4615853734910386e-05, + "loss": 0.3752, + "step": 15645 + }, + { + "epoch": 0.34738793132151696, + "grad_norm": 1.3954349756240845, + "learning_rate": 1.4612760342705979e-05, + "loss": 0.5212, + "step": 15650 + }, + { + "epoch": 0.3474989178810446, + "grad_norm": 1.5847194194793701, + "learning_rate": 1.4609666389697638e-05, + "loss": 0.4606, + "step": 15655 + }, + { + "epoch": 0.34760990444057227, + "grad_norm": 1.386644721031189, + "learning_rate": 1.4606571876261517e-05, + "loss": 0.4912, + "step": 15660 + }, + { + "epoch": 0.3477208910000999, + "grad_norm": 1.5342525243759155, + "learning_rate": 1.4603476802773839e-05, + "loss": 0.4848, + "step": 15665 + }, + { + "epoch": 0.3478318775596275, + "grad_norm": 1.0431338548660278, + "learning_rate": 1.4600381169610888e-05, + "loss": 0.4559, + "step": 15670 + }, + { + "epoch": 0.34794286411915515, + "grad_norm": 1.5182932615280151, + "learning_rate": 1.4597284977149022e-05, + "loss": 0.5943, + "step": 15675 + }, + { + "epoch": 0.34805385067868283, + "grad_norm": 1.3352900743484497, + "learning_rate": 1.4594188225764667e-05, + "loss": 0.5782, + "step": 15680 + }, + { + "epoch": 0.34816483723821046, + "grad_norm": 1.3885362148284912, + "learning_rate": 1.4591090915834319e-05, + "loss": 0.3971, + "step": 15685 + }, + { + "epoch": 0.3482758237977381, + "grad_norm": 1.206699252128601, + "learning_rate": 1.458799304773453e-05, + "loss": 0.4441, + "step": 15690 + }, + { + "epoch": 0.3483868103572657, + "grad_norm": 2.033198833465576, + "learning_rate": 1.4584894621841937e-05, + "loss": 0.3343, + "step": 15695 + }, + { + "epoch": 0.3484977969167934, + "grad_norm": 1.1778301000595093, + "learning_rate": 1.4581795638533227e-05, + "loss": 0.5982, + "step": 15700 + }, + { + "epoch": 0.348608783476321, + "grad_norm": 1.5190293788909912, + "learning_rate": 1.4578696098185175e-05, + "loss": 0.6049, + "step": 15705 + }, + { + "epoch": 0.34871977003584864, + "grad_norm": 1.171130657196045, + "learning_rate": 1.4575596001174605e-05, + "loss": 0.5663, + "step": 15710 + }, + { + "epoch": 0.3488307565953763, + "grad_norm": 2.257117509841919, + "learning_rate": 1.457249534787842e-05, + "loss": 0.5115, + "step": 15715 + }, + { + "epoch": 0.34894174315490395, + "grad_norm": 1.0101255178451538, + "learning_rate": 1.4569394138673583e-05, + "loss": 0.388, + "step": 15720 + }, + { + "epoch": 0.3490527297144316, + "grad_norm": 2.2193045616149902, + "learning_rate": 1.4566292373937133e-05, + "loss": 0.5509, + "step": 15725 + }, + { + "epoch": 0.3491637162739592, + "grad_norm": 0.6599202752113342, + "learning_rate": 1.4563190054046168e-05, + "loss": 0.6527, + "step": 15730 + }, + { + "epoch": 0.3492747028334869, + "grad_norm": 1.3797396421432495, + "learning_rate": 1.4560087179377862e-05, + "loss": 0.7525, + "step": 15735 + }, + { + "epoch": 0.3493856893930145, + "grad_norm": 1.2089630365371704, + "learning_rate": 1.455698375030945e-05, + "loss": 0.4175, + "step": 15740 + }, + { + "epoch": 0.34949667595254214, + "grad_norm": 1.3355190753936768, + "learning_rate": 1.4553879767218238e-05, + "loss": 0.4175, + "step": 15745 + }, + { + "epoch": 0.34960766251206976, + "grad_norm": 0.9695683121681213, + "learning_rate": 1.4550775230481593e-05, + "loss": 0.5127, + "step": 15750 + }, + { + "epoch": 0.34971864907159744, + "grad_norm": 1.308209776878357, + "learning_rate": 1.454767014047696e-05, + "loss": 0.4467, + "step": 15755 + }, + { + "epoch": 0.34982963563112507, + "grad_norm": 1.0755424499511719, + "learning_rate": 1.454456449758184e-05, + "loss": 0.3588, + "step": 15760 + }, + { + "epoch": 0.3499406221906527, + "grad_norm": 1.162462830543518, + "learning_rate": 1.4541458302173815e-05, + "loss": 0.4285, + "step": 15765 + }, + { + "epoch": 0.3500516087501804, + "grad_norm": 1.1130118370056152, + "learning_rate": 1.4538351554630517e-05, + "loss": 0.3654, + "step": 15770 + }, + { + "epoch": 0.350162595309708, + "grad_norm": 1.501671314239502, + "learning_rate": 1.4535244255329657e-05, + "loss": 0.4821, + "step": 15775 + }, + { + "epoch": 0.35027358186923563, + "grad_norm": 1.2213767766952515, + "learning_rate": 1.4532136404649012e-05, + "loss": 0.4606, + "step": 15780 + }, + { + "epoch": 0.35038456842876325, + "grad_norm": 1.1781730651855469, + "learning_rate": 1.4529028002966424e-05, + "loss": 0.4491, + "step": 15785 + }, + { + "epoch": 0.35049555498829094, + "grad_norm": 1.2890111207962036, + "learning_rate": 1.4525919050659798e-05, + "loss": 0.4341, + "step": 15790 + }, + { + "epoch": 0.35060654154781856, + "grad_norm": 1.1150555610656738, + "learning_rate": 1.4522809548107113e-05, + "loss": 0.4862, + "step": 15795 + }, + { + "epoch": 0.3507175281073462, + "grad_norm": 1.4423096179962158, + "learning_rate": 1.4519699495686413e-05, + "loss": 0.5472, + "step": 15800 + }, + { + "epoch": 0.3508285146668738, + "grad_norm": 0.8447060585021973, + "learning_rate": 1.4516588893775805e-05, + "loss": 0.4617, + "step": 15805 + }, + { + "epoch": 0.3509395012264015, + "grad_norm": 1.3163111209869385, + "learning_rate": 1.4513477742753465e-05, + "loss": 0.4202, + "step": 15810 + }, + { + "epoch": 0.3510504877859291, + "grad_norm": 1.0188243389129639, + "learning_rate": 1.4510366042997638e-05, + "loss": 0.5308, + "step": 15815 + }, + { + "epoch": 0.35116147434545675, + "grad_norm": 0.8433900475502014, + "learning_rate": 1.4507253794886638e-05, + "loss": 0.436, + "step": 15820 + }, + { + "epoch": 0.35127246090498443, + "grad_norm": 1.1491411924362183, + "learning_rate": 1.4504140998798834e-05, + "loss": 0.5348, + "step": 15825 + }, + { + "epoch": 0.35138344746451206, + "grad_norm": 1.0281944274902344, + "learning_rate": 1.4501027655112675e-05, + "loss": 0.3657, + "step": 15830 + }, + { + "epoch": 0.3514944340240397, + "grad_norm": 1.3137558698654175, + "learning_rate": 1.449791376420667e-05, + "loss": 0.5407, + "step": 15835 + }, + { + "epoch": 0.3516054205835673, + "grad_norm": 1.239391803741455, + "learning_rate": 1.4494799326459393e-05, + "loss": 0.6412, + "step": 15840 + }, + { + "epoch": 0.351716407143095, + "grad_norm": 1.202092170715332, + "learning_rate": 1.4491684342249485e-05, + "loss": 0.4441, + "step": 15845 + }, + { + "epoch": 0.3518273937026226, + "grad_norm": 1.3718825578689575, + "learning_rate": 1.4488568811955663e-05, + "loss": 0.4353, + "step": 15850 + }, + { + "epoch": 0.35193838026215024, + "grad_norm": 1.1037505865097046, + "learning_rate": 1.4485452735956698e-05, + "loss": 0.4914, + "step": 15855 + }, + { + "epoch": 0.35204936682167787, + "grad_norm": 1.0239901542663574, + "learning_rate": 1.4482336114631432e-05, + "loss": 0.315, + "step": 15860 + }, + { + "epoch": 0.35216035338120555, + "grad_norm": 0.8123399019241333, + "learning_rate": 1.4479218948358773e-05, + "loss": 0.4516, + "step": 15865 + }, + { + "epoch": 0.3522713399407332, + "grad_norm": 0.8140292167663574, + "learning_rate": 1.4476101237517694e-05, + "loss": 0.3005, + "step": 15870 + }, + { + "epoch": 0.3523823265002608, + "grad_norm": 1.1909701824188232, + "learning_rate": 1.4472982982487242e-05, + "loss": 0.2595, + "step": 15875 + }, + { + "epoch": 0.3524933130597885, + "grad_norm": 1.754469394683838, + "learning_rate": 1.4469864183646521e-05, + "loss": 0.4105, + "step": 15880 + }, + { + "epoch": 0.3526042996193161, + "grad_norm": 0.990523099899292, + "learning_rate": 1.44667448413747e-05, + "loss": 0.4436, + "step": 15885 + }, + { + "epoch": 0.35271528617884373, + "grad_norm": 1.1670485734939575, + "learning_rate": 1.4463624956051023e-05, + "loss": 0.6312, + "step": 15890 + }, + { + "epoch": 0.35282627273837136, + "grad_norm": 1.3339301347732544, + "learning_rate": 1.4460504528054796e-05, + "loss": 0.3657, + "step": 15895 + }, + { + "epoch": 0.35293725929789904, + "grad_norm": 1.3710170984268188, + "learning_rate": 1.4457383557765385e-05, + "loss": 0.5359, + "step": 15900 + }, + { + "epoch": 0.35304824585742667, + "grad_norm": 1.1399394273757935, + "learning_rate": 1.445426204556223e-05, + "loss": 0.6212, + "step": 15905 + }, + { + "epoch": 0.3531592324169543, + "grad_norm": 1.5016330480575562, + "learning_rate": 1.4451139991824833e-05, + "loss": 0.5507, + "step": 15910 + }, + { + "epoch": 0.353270218976482, + "grad_norm": 0.6789126396179199, + "learning_rate": 1.4448017396932767e-05, + "loss": 0.4242, + "step": 15915 + }, + { + "epoch": 0.3533812055360096, + "grad_norm": 1.0379139184951782, + "learning_rate": 1.444489426126566e-05, + "loss": 0.4029, + "step": 15920 + }, + { + "epoch": 0.3534921920955372, + "grad_norm": 1.112060546875, + "learning_rate": 1.444177058520322e-05, + "loss": 0.4358, + "step": 15925 + }, + { + "epoch": 0.35360317865506485, + "grad_norm": 1.1852864027023315, + "learning_rate": 1.4438646369125205e-05, + "loss": 0.4653, + "step": 15930 + }, + { + "epoch": 0.35371416521459254, + "grad_norm": 1.0151760578155518, + "learning_rate": 1.443552161341145e-05, + "loss": 0.5065, + "step": 15935 + }, + { + "epoch": 0.35382515177412016, + "grad_norm": 0.7877997756004333, + "learning_rate": 1.4432396318441855e-05, + "loss": 0.512, + "step": 15940 + }, + { + "epoch": 0.3539361383336478, + "grad_norm": 1.031214952468872, + "learning_rate": 1.442927048459638e-05, + "loss": 0.3855, + "step": 15945 + }, + { + "epoch": 0.3540471248931754, + "grad_norm": 1.2018985748291016, + "learning_rate": 1.4426144112255057e-05, + "loss": 0.5145, + "step": 15950 + }, + { + "epoch": 0.3541581114527031, + "grad_norm": 1.386949062347412, + "learning_rate": 1.442301720179797e-05, + "loss": 0.512, + "step": 15955 + }, + { + "epoch": 0.3542690980122307, + "grad_norm": 1.0878076553344727, + "learning_rate": 1.441988975360529e-05, + "loss": 0.5614, + "step": 15960 + }, + { + "epoch": 0.35438008457175835, + "grad_norm": 1.7431411743164062, + "learning_rate": 1.4416761768057237e-05, + "loss": 0.3427, + "step": 15965 + }, + { + "epoch": 0.35449107113128603, + "grad_norm": 1.3923200368881226, + "learning_rate": 1.44136332455341e-05, + "loss": 0.6949, + "step": 15970 + }, + { + "epoch": 0.35460205769081365, + "grad_norm": 1.394872784614563, + "learning_rate": 1.4410504186416237e-05, + "loss": 0.3061, + "step": 15975 + }, + { + "epoch": 0.3547130442503413, + "grad_norm": 1.056589126586914, + "learning_rate": 1.4407374591084064e-05, + "loss": 0.3958, + "step": 15980 + }, + { + "epoch": 0.3548240308098689, + "grad_norm": 1.1440049409866333, + "learning_rate": 1.4404244459918069e-05, + "loss": 0.4387, + "step": 15985 + }, + { + "epoch": 0.3549350173693966, + "grad_norm": 0.9853442907333374, + "learning_rate": 1.4401113793298804e-05, + "loss": 0.4085, + "step": 15990 + }, + { + "epoch": 0.3550460039289242, + "grad_norm": 1.08008873462677, + "learning_rate": 1.4397982591606887e-05, + "loss": 0.4988, + "step": 15995 + }, + { + "epoch": 0.35515699048845184, + "grad_norm": 1.3674938678741455, + "learning_rate": 1.4394850855222995e-05, + "loss": 0.338, + "step": 16000 + }, + { + "epoch": 0.35526797704797947, + "grad_norm": 1.0351570844650269, + "learning_rate": 1.4391718584527871e-05, + "loss": 0.4379, + "step": 16005 + }, + { + "epoch": 0.35537896360750715, + "grad_norm": 1.1538983583450317, + "learning_rate": 1.4388585779902336e-05, + "loss": 0.5366, + "step": 16010 + }, + { + "epoch": 0.3554899501670348, + "grad_norm": 1.7466696500778198, + "learning_rate": 1.4385452441727257e-05, + "loss": 0.595, + "step": 16015 + }, + { + "epoch": 0.3556009367265624, + "grad_norm": 1.2843554019927979, + "learning_rate": 1.4382318570383578e-05, + "loss": 0.4549, + "step": 16020 + }, + { + "epoch": 0.3557119232860901, + "grad_norm": 1.1115984916687012, + "learning_rate": 1.4379184166252304e-05, + "loss": 0.5201, + "step": 16025 + }, + { + "epoch": 0.3558229098456177, + "grad_norm": 1.5879791975021362, + "learning_rate": 1.4376049229714509e-05, + "loss": 0.3875, + "step": 16030 + }, + { + "epoch": 0.35593389640514533, + "grad_norm": 1.3459078073501587, + "learning_rate": 1.4372913761151321e-05, + "loss": 0.7422, + "step": 16035 + }, + { + "epoch": 0.35604488296467296, + "grad_norm": 2.2278897762298584, + "learning_rate": 1.4369777760943944e-05, + "loss": 0.3918, + "step": 16040 + }, + { + "epoch": 0.35615586952420064, + "grad_norm": 1.5131078958511353, + "learning_rate": 1.4366641229473644e-05, + "loss": 0.3198, + "step": 16045 + }, + { + "epoch": 0.35626685608372827, + "grad_norm": 1.5506442785263062, + "learning_rate": 1.4363504167121747e-05, + "loss": 0.4538, + "step": 16050 + }, + { + "epoch": 0.3563778426432559, + "grad_norm": 1.6898088455200195, + "learning_rate": 1.4360366574269647e-05, + "loss": 0.564, + "step": 16055 + }, + { + "epoch": 0.3564888292027835, + "grad_norm": 1.203200340270996, + "learning_rate": 1.4357228451298802e-05, + "loss": 0.4952, + "step": 16060 + }, + { + "epoch": 0.3565998157623112, + "grad_norm": 1.6116600036621094, + "learning_rate": 1.4354089798590735e-05, + "loss": 0.5373, + "step": 16065 + }, + { + "epoch": 0.3567108023218388, + "grad_norm": 0.919315755367279, + "learning_rate": 1.4350950616527032e-05, + "loss": 0.427, + "step": 16070 + }, + { + "epoch": 0.35682178888136645, + "grad_norm": 1.246697187423706, + "learning_rate": 1.4347810905489348e-05, + "loss": 0.4758, + "step": 16075 + }, + { + "epoch": 0.35693277544089413, + "grad_norm": 0.885575532913208, + "learning_rate": 1.4344670665859393e-05, + "loss": 0.4152, + "step": 16080 + }, + { + "epoch": 0.35704376200042176, + "grad_norm": 1.086563229560852, + "learning_rate": 1.434152989801895e-05, + "loss": 0.506, + "step": 16085 + }, + { + "epoch": 0.3571547485599494, + "grad_norm": 1.346968412399292, + "learning_rate": 1.4338388602349862e-05, + "loss": 0.5173, + "step": 16090 + }, + { + "epoch": 0.357265735119477, + "grad_norm": 1.4068158864974976, + "learning_rate": 1.433524677923404e-05, + "loss": 0.481, + "step": 16095 + }, + { + "epoch": 0.3573767216790047, + "grad_norm": 1.0199639797210693, + "learning_rate": 1.4332104429053449e-05, + "loss": 0.4408, + "step": 16100 + }, + { + "epoch": 0.3574877082385323, + "grad_norm": 1.341516375541687, + "learning_rate": 1.4328961552190132e-05, + "loss": 0.3486, + "step": 16105 + }, + { + "epoch": 0.35759869479805995, + "grad_norm": 0.6724761724472046, + "learning_rate": 1.432581814902619e-05, + "loss": 0.2141, + "step": 16110 + }, + { + "epoch": 0.35770968135758757, + "grad_norm": 1.1561756134033203, + "learning_rate": 1.4322674219943783e-05, + "loss": 0.4822, + "step": 16115 + }, + { + "epoch": 0.35782066791711525, + "grad_norm": 1.3430947065353394, + "learning_rate": 1.4319529765325144e-05, + "loss": 0.4901, + "step": 16120 + }, + { + "epoch": 0.3579316544766429, + "grad_norm": 0.9767167568206787, + "learning_rate": 1.431638478555256e-05, + "loss": 0.5472, + "step": 16125 + }, + { + "epoch": 0.3580426410361705, + "grad_norm": 1.5211210250854492, + "learning_rate": 1.4313239281008388e-05, + "loss": 0.6586, + "step": 16130 + }, + { + "epoch": 0.3581536275956982, + "grad_norm": 1.7379239797592163, + "learning_rate": 1.4310093252075055e-05, + "loss": 0.4366, + "step": 16135 + }, + { + "epoch": 0.3582646141552258, + "grad_norm": 0.8871414661407471, + "learning_rate": 1.4306946699135038e-05, + "loss": 0.4634, + "step": 16140 + }, + { + "epoch": 0.35837560071475344, + "grad_norm": 1.0361415147781372, + "learning_rate": 1.4303799622570884e-05, + "loss": 0.4303, + "step": 16145 + }, + { + "epoch": 0.35848658727428107, + "grad_norm": 1.4143927097320557, + "learning_rate": 1.4300652022765207e-05, + "loss": 0.5521, + "step": 16150 + }, + { + "epoch": 0.35859757383380875, + "grad_norm": 1.8394910097122192, + "learning_rate": 1.4297503900100678e-05, + "loss": 0.5932, + "step": 16155 + }, + { + "epoch": 0.3587085603933364, + "grad_norm": 1.1367939710617065, + "learning_rate": 1.429435525496004e-05, + "loss": 0.4967, + "step": 16160 + }, + { + "epoch": 0.358819546952864, + "grad_norm": 0.9071010947227478, + "learning_rate": 1.429120608772609e-05, + "loss": 0.5516, + "step": 16165 + }, + { + "epoch": 0.3589305335123916, + "grad_norm": 1.2656437158584595, + "learning_rate": 1.4288056398781698e-05, + "loss": 0.632, + "step": 16170 + }, + { + "epoch": 0.3590415200719193, + "grad_norm": 0.85821133852005, + "learning_rate": 1.428490618850979e-05, + "loss": 0.4313, + "step": 16175 + }, + { + "epoch": 0.35915250663144693, + "grad_norm": 0.7876311540603638, + "learning_rate": 1.4281755457293359e-05, + "loss": 0.3428, + "step": 16180 + }, + { + "epoch": 0.35926349319097456, + "grad_norm": 1.28394615650177, + "learning_rate": 1.4278604205515453e-05, + "loss": 0.5349, + "step": 16185 + }, + { + "epoch": 0.35937447975050224, + "grad_norm": 1.3520690202713013, + "learning_rate": 1.4275452433559202e-05, + "loss": 0.454, + "step": 16190 + }, + { + "epoch": 0.35948546631002987, + "grad_norm": 1.587080717086792, + "learning_rate": 1.4272300141807782e-05, + "loss": 0.469, + "step": 16195 + }, + { + "epoch": 0.3595964528695575, + "grad_norm": 1.1184484958648682, + "learning_rate": 1.426914733064444e-05, + "loss": 0.3818, + "step": 16200 + }, + { + "epoch": 0.3597074394290851, + "grad_norm": 1.2177501916885376, + "learning_rate": 1.4265994000452484e-05, + "loss": 0.5151, + "step": 16205 + }, + { + "epoch": 0.3598184259886128, + "grad_norm": 1.274096965789795, + "learning_rate": 1.426284015161528e-05, + "loss": 0.5162, + "step": 16210 + }, + { + "epoch": 0.3599294125481404, + "grad_norm": 1.4250680208206177, + "learning_rate": 1.425968578451627e-05, + "loss": 0.4129, + "step": 16215 + }, + { + "epoch": 0.36004039910766805, + "grad_norm": 1.0498861074447632, + "learning_rate": 1.4256530899538948e-05, + "loss": 0.5664, + "step": 16220 + }, + { + "epoch": 0.3601513856671957, + "grad_norm": 1.3277438879013062, + "learning_rate": 1.4253375497066875e-05, + "loss": 0.43, + "step": 16225 + }, + { + "epoch": 0.36026237222672336, + "grad_norm": 1.4574748277664185, + "learning_rate": 1.4250219577483673e-05, + "loss": 0.5163, + "step": 16230 + }, + { + "epoch": 0.360373358786251, + "grad_norm": 1.4078586101531982, + "learning_rate": 1.424706314117303e-05, + "loss": 0.422, + "step": 16235 + }, + { + "epoch": 0.3604843453457786, + "grad_norm": 1.536993384361267, + "learning_rate": 1.4243906188518691e-05, + "loss": 0.4067, + "step": 16240 + }, + { + "epoch": 0.3605953319053063, + "grad_norm": 1.1171602010726929, + "learning_rate": 1.4240748719904471e-05, + "loss": 0.5737, + "step": 16245 + }, + { + "epoch": 0.3607063184648339, + "grad_norm": 1.253387689590454, + "learning_rate": 1.4237590735714246e-05, + "loss": 0.3864, + "step": 16250 + }, + { + "epoch": 0.36081730502436155, + "grad_norm": 1.6348520517349243, + "learning_rate": 1.4234432236331951e-05, + "loss": 0.4945, + "step": 16255 + }, + { + "epoch": 0.36092829158388917, + "grad_norm": 0.8581211566925049, + "learning_rate": 1.4231273222141587e-05, + "loss": 0.5606, + "step": 16260 + }, + { + "epoch": 0.36103927814341685, + "grad_norm": 1.2018139362335205, + "learning_rate": 1.4228113693527212e-05, + "loss": 0.3867, + "step": 16265 + }, + { + "epoch": 0.3611502647029445, + "grad_norm": 0.9835402369499207, + "learning_rate": 1.4224953650872958e-05, + "loss": 0.5186, + "step": 16270 + }, + { + "epoch": 0.3612612512624721, + "grad_norm": 1.4427926540374756, + "learning_rate": 1.4221793094563006e-05, + "loss": 0.5518, + "step": 16275 + }, + { + "epoch": 0.36137223782199973, + "grad_norm": 0.8743858933448792, + "learning_rate": 1.421863202498161e-05, + "loss": 0.4663, + "step": 16280 + }, + { + "epoch": 0.3614832243815274, + "grad_norm": 2.119997978210449, + "learning_rate": 1.4215470442513077e-05, + "loss": 0.3596, + "step": 16285 + }, + { + "epoch": 0.36159421094105504, + "grad_norm": 1.464104413986206, + "learning_rate": 1.4212308347541787e-05, + "loss": 0.4975, + "step": 16290 + }, + { + "epoch": 0.36170519750058266, + "grad_norm": 1.2174603939056396, + "learning_rate": 1.4209145740452175e-05, + "loss": 0.4806, + "step": 16295 + }, + { + "epoch": 0.36181618406011035, + "grad_norm": 1.1920909881591797, + "learning_rate": 1.4205982621628742e-05, + "loss": 0.4288, + "step": 16300 + }, + { + "epoch": 0.361927170619638, + "grad_norm": 1.8685282468795776, + "learning_rate": 1.4202818991456047e-05, + "loss": 0.381, + "step": 16305 + }, + { + "epoch": 0.3620381571791656, + "grad_norm": 1.1709849834442139, + "learning_rate": 1.4199654850318713e-05, + "loss": 0.4441, + "step": 16310 + }, + { + "epoch": 0.3621491437386932, + "grad_norm": 1.5978822708129883, + "learning_rate": 1.4196490198601426e-05, + "loss": 0.4736, + "step": 16315 + }, + { + "epoch": 0.3622601302982209, + "grad_norm": 1.6112041473388672, + "learning_rate": 1.419332503668894e-05, + "loss": 0.4517, + "step": 16320 + }, + { + "epoch": 0.36237111685774853, + "grad_norm": 1.0208383798599243, + "learning_rate": 1.4190159364966053e-05, + "loss": 0.4732, + "step": 16325 + }, + { + "epoch": 0.36248210341727616, + "grad_norm": 0.9899531006813049, + "learning_rate": 1.4186993183817643e-05, + "loss": 0.5406, + "step": 16330 + }, + { + "epoch": 0.3625930899768038, + "grad_norm": 1.1739122867584229, + "learning_rate": 1.4183826493628647e-05, + "loss": 0.5429, + "step": 16335 + }, + { + "epoch": 0.36270407653633147, + "grad_norm": 1.1442511081695557, + "learning_rate": 1.4180659294784058e-05, + "loss": 0.4618, + "step": 16340 + }, + { + "epoch": 0.3628150630958591, + "grad_norm": 1.8962513208389282, + "learning_rate": 1.4177491587668933e-05, + "loss": 0.4247, + "step": 16345 + }, + { + "epoch": 0.3629260496553867, + "grad_norm": 1.5891103744506836, + "learning_rate": 1.4174323372668387e-05, + "loss": 0.5651, + "step": 16350 + }, + { + "epoch": 0.3630370362149144, + "grad_norm": 1.5759340524673462, + "learning_rate": 1.4171154650167606e-05, + "loss": 0.4986, + "step": 16355 + }, + { + "epoch": 0.363148022774442, + "grad_norm": 1.0296753644943237, + "learning_rate": 1.4167985420551836e-05, + "loss": 0.4137, + "step": 16360 + }, + { + "epoch": 0.36325900933396965, + "grad_norm": 1.6463236808776855, + "learning_rate": 1.4164815684206372e-05, + "loss": 0.4071, + "step": 16365 + }, + { + "epoch": 0.3633699958934973, + "grad_norm": 0.9370015263557434, + "learning_rate": 1.4161645441516588e-05, + "loss": 0.2914, + "step": 16370 + }, + { + "epoch": 0.36348098245302496, + "grad_norm": 1.150620460510254, + "learning_rate": 1.4158474692867907e-05, + "loss": 0.4136, + "step": 16375 + }, + { + "epoch": 0.3635919690125526, + "grad_norm": 1.129516363143921, + "learning_rate": 1.4155303438645818e-05, + "loss": 0.4786, + "step": 16380 + }, + { + "epoch": 0.3637029555720802, + "grad_norm": 1.2091377973556519, + "learning_rate": 1.4152131679235872e-05, + "loss": 0.5573, + "step": 16385 + }, + { + "epoch": 0.36381394213160784, + "grad_norm": 1.0632437467575073, + "learning_rate": 1.4148959415023687e-05, + "loss": 0.3591, + "step": 16390 + }, + { + "epoch": 0.3639249286911355, + "grad_norm": 0.9506019949913025, + "learning_rate": 1.4145786646394926e-05, + "loss": 0.4244, + "step": 16395 + }, + { + "epoch": 0.36403591525066314, + "grad_norm": 1.217382550239563, + "learning_rate": 1.414261337373533e-05, + "loss": 0.3737, + "step": 16400 + }, + { + "epoch": 0.36414690181019077, + "grad_norm": 1.446199655532837, + "learning_rate": 1.4139439597430693e-05, + "loss": 0.3826, + "step": 16405 + }, + { + "epoch": 0.36425788836971845, + "grad_norm": 1.6045228242874146, + "learning_rate": 1.4136265317866874e-05, + "loss": 0.6589, + "step": 16410 + }, + { + "epoch": 0.3643688749292461, + "grad_norm": 1.313880443572998, + "learning_rate": 1.4133090535429788e-05, + "loss": 0.5849, + "step": 16415 + }, + { + "epoch": 0.3644798614887737, + "grad_norm": 1.2983330488204956, + "learning_rate": 1.4129915250505418e-05, + "loss": 0.709, + "step": 16420 + }, + { + "epoch": 0.36459084804830133, + "grad_norm": 4.271213054656982, + "learning_rate": 1.41267394634798e-05, + "loss": 0.4601, + "step": 16425 + }, + { + "epoch": 0.364701834607829, + "grad_norm": 0.890982985496521, + "learning_rate": 1.4123563174739036e-05, + "loss": 0.5058, + "step": 16430 + }, + { + "epoch": 0.36481282116735664, + "grad_norm": 0.7690356969833374, + "learning_rate": 1.4120386384669294e-05, + "loss": 0.4065, + "step": 16435 + }, + { + "epoch": 0.36492380772688426, + "grad_norm": 0.7626538276672363, + "learning_rate": 1.4117209093656791e-05, + "loss": 0.4242, + "step": 16440 + }, + { + "epoch": 0.3650347942864119, + "grad_norm": 1.2675507068634033, + "learning_rate": 1.4114031302087818e-05, + "loss": 0.4668, + "step": 16445 + }, + { + "epoch": 0.36514578084593957, + "grad_norm": 1.6725234985351562, + "learning_rate": 1.4110853010348717e-05, + "loss": 0.519, + "step": 16450 + }, + { + "epoch": 0.3652567674054672, + "grad_norm": 1.8293321132659912, + "learning_rate": 1.4107674218825893e-05, + "loss": 0.5037, + "step": 16455 + }, + { + "epoch": 0.3653677539649948, + "grad_norm": 1.29794180393219, + "learning_rate": 1.4104494927905812e-05, + "loss": 0.4575, + "step": 16460 + }, + { + "epoch": 0.3654787405245225, + "grad_norm": 1.054591417312622, + "learning_rate": 1.4101315137975002e-05, + "loss": 0.5259, + "step": 16465 + }, + { + "epoch": 0.36558972708405013, + "grad_norm": 1.571337103843689, + "learning_rate": 1.4098134849420055e-05, + "loss": 0.4236, + "step": 16470 + }, + { + "epoch": 0.36570071364357776, + "grad_norm": 0.854489803314209, + "learning_rate": 1.4094954062627614e-05, + "loss": 0.384, + "step": 16475 + }, + { + "epoch": 0.3658117002031054, + "grad_norm": 1.1105810403823853, + "learning_rate": 1.4091772777984396e-05, + "loss": 0.4972, + "step": 16480 + }, + { + "epoch": 0.36592268676263306, + "grad_norm": 1.42030930519104, + "learning_rate": 1.4088590995877165e-05, + "loss": 0.4479, + "step": 16485 + }, + { + "epoch": 0.3660336733221607, + "grad_norm": 1.3728995323181152, + "learning_rate": 1.408540871669275e-05, + "loss": 0.5138, + "step": 16490 + }, + { + "epoch": 0.3661446598816883, + "grad_norm": 0.9788410067558289, + "learning_rate": 1.4082225940818047e-05, + "loss": 0.4445, + "step": 16495 + }, + { + "epoch": 0.36625564644121594, + "grad_norm": 1.1084668636322021, + "learning_rate": 1.4079042668640002e-05, + "loss": 0.4348, + "step": 16500 + }, + { + "epoch": 0.3663666330007436, + "grad_norm": 1.8628714084625244, + "learning_rate": 1.4075858900545631e-05, + "loss": 0.4036, + "step": 16505 + }, + { + "epoch": 0.36647761956027125, + "grad_norm": 1.1906403303146362, + "learning_rate": 1.4072674636922009e-05, + "loss": 0.376, + "step": 16510 + }, + { + "epoch": 0.3665886061197989, + "grad_norm": 1.1637814044952393, + "learning_rate": 1.4069489878156258e-05, + "loss": 0.4762, + "step": 16515 + }, + { + "epoch": 0.36669959267932656, + "grad_norm": 0.9485587477684021, + "learning_rate": 1.4066304624635576e-05, + "loss": 0.4694, + "step": 16520 + }, + { + "epoch": 0.3668105792388542, + "grad_norm": 1.3874183893203735, + "learning_rate": 1.4063118876747217e-05, + "loss": 0.5263, + "step": 16525 + }, + { + "epoch": 0.3669215657983818, + "grad_norm": 1.4688796997070312, + "learning_rate": 1.405993263487849e-05, + "loss": 0.4912, + "step": 16530 + }, + { + "epoch": 0.36703255235790944, + "grad_norm": 1.0320883989334106, + "learning_rate": 1.405674589941677e-05, + "loss": 0.4507, + "step": 16535 + }, + { + "epoch": 0.3671435389174371, + "grad_norm": 1.0644927024841309, + "learning_rate": 1.405355867074949e-05, + "loss": 0.5375, + "step": 16540 + }, + { + "epoch": 0.36725452547696474, + "grad_norm": 1.249047875404358, + "learning_rate": 1.405037094926414e-05, + "loss": 0.297, + "step": 16545 + }, + { + "epoch": 0.36736551203649237, + "grad_norm": 1.9036270380020142, + "learning_rate": 1.4047182735348273e-05, + "loss": 0.5561, + "step": 16550 + }, + { + "epoch": 0.36747649859602, + "grad_norm": 2.7875096797943115, + "learning_rate": 1.40439940293895e-05, + "loss": 0.5732, + "step": 16555 + }, + { + "epoch": 0.3675874851555477, + "grad_norm": 1.2983033657073975, + "learning_rate": 1.40408048317755e-05, + "loss": 0.3889, + "step": 16560 + }, + { + "epoch": 0.3676984717150753, + "grad_norm": 1.0702457427978516, + "learning_rate": 1.4037615142894e-05, + "loss": 0.5595, + "step": 16565 + }, + { + "epoch": 0.36780945827460293, + "grad_norm": 1.1802325248718262, + "learning_rate": 1.4034424963132792e-05, + "loss": 0.3399, + "step": 16570 + }, + { + "epoch": 0.3679204448341306, + "grad_norm": 1.1893055438995361, + "learning_rate": 1.4031234292879726e-05, + "loss": 0.379, + "step": 16575 + }, + { + "epoch": 0.36803143139365824, + "grad_norm": 1.0848939418792725, + "learning_rate": 1.402804313252271e-05, + "loss": 0.4896, + "step": 16580 + }, + { + "epoch": 0.36814241795318586, + "grad_norm": 1.2174549102783203, + "learning_rate": 1.4024851482449726e-05, + "loss": 0.5846, + "step": 16585 + }, + { + "epoch": 0.3682534045127135, + "grad_norm": 1.244012713432312, + "learning_rate": 1.4021659343048795e-05, + "loss": 0.4235, + "step": 16590 + }, + { + "epoch": 0.36836439107224117, + "grad_norm": 0.7126348614692688, + "learning_rate": 1.4018466714708008e-05, + "loss": 0.4914, + "step": 16595 + }, + { + "epoch": 0.3684753776317688, + "grad_norm": 1.1667653322219849, + "learning_rate": 1.4015273597815516e-05, + "loss": 0.4292, + "step": 16600 + }, + { + "epoch": 0.3685863641912964, + "grad_norm": 0.7720460295677185, + "learning_rate": 1.4012079992759521e-05, + "loss": 0.415, + "step": 16605 + }, + { + "epoch": 0.36869735075082405, + "grad_norm": 1.1298047304153442, + "learning_rate": 1.4008885899928301e-05, + "loss": 0.4099, + "step": 16610 + }, + { + "epoch": 0.36880833731035173, + "grad_norm": 1.6862759590148926, + "learning_rate": 1.4005691319710178e-05, + "loss": 0.471, + "step": 16615 + }, + { + "epoch": 0.36891932386987936, + "grad_norm": 0.9013825058937073, + "learning_rate": 1.4002496252493538e-05, + "loss": 0.4378, + "step": 16620 + }, + { + "epoch": 0.369030310429407, + "grad_norm": 1.32625412940979, + "learning_rate": 1.3999300698666827e-05, + "loss": 0.447, + "step": 16625 + }, + { + "epoch": 0.36914129698893466, + "grad_norm": 1.1559810638427734, + "learning_rate": 1.399610465861855e-05, + "loss": 0.3937, + "step": 16630 + }, + { + "epoch": 0.3692522835484623, + "grad_norm": 1.547504186630249, + "learning_rate": 1.3992908132737269e-05, + "loss": 0.3431, + "step": 16635 + }, + { + "epoch": 0.3693632701079899, + "grad_norm": 0.8603490591049194, + "learning_rate": 1.3989711121411608e-05, + "loss": 0.3781, + "step": 16640 + }, + { + "epoch": 0.36947425666751754, + "grad_norm": 1.6657804250717163, + "learning_rate": 1.3986513625030251e-05, + "loss": 0.4678, + "step": 16645 + }, + { + "epoch": 0.3695852432270452, + "grad_norm": 0.9877973198890686, + "learning_rate": 1.3983315643981938e-05, + "loss": 0.4872, + "step": 16650 + }, + { + "epoch": 0.36969622978657285, + "grad_norm": 2.1902594566345215, + "learning_rate": 1.3980117178655466e-05, + "loss": 0.5882, + "step": 16655 + }, + { + "epoch": 0.3698072163461005, + "grad_norm": 0.9402801394462585, + "learning_rate": 1.3976918229439698e-05, + "loss": 0.4461, + "step": 16660 + }, + { + "epoch": 0.3699182029056281, + "grad_norm": 0.9539381861686707, + "learning_rate": 1.3973718796723546e-05, + "loss": 0.338, + "step": 16665 + }, + { + "epoch": 0.3700291894651558, + "grad_norm": 0.9745631814002991, + "learning_rate": 1.3970518880895992e-05, + "loss": 0.5081, + "step": 16670 + }, + { + "epoch": 0.3701401760246834, + "grad_norm": 1.3075889348983765, + "learning_rate": 1.3967318482346066e-05, + "loss": 0.4365, + "step": 16675 + }, + { + "epoch": 0.37025116258421104, + "grad_norm": 1.23758864402771, + "learning_rate": 1.3964117601462865e-05, + "loss": 0.4368, + "step": 16680 + }, + { + "epoch": 0.3703621491437387, + "grad_norm": 1.430646300315857, + "learning_rate": 1.3960916238635542e-05, + "loss": 0.3723, + "step": 16685 + }, + { + "epoch": 0.37047313570326634, + "grad_norm": 1.2952028512954712, + "learning_rate": 1.3957714394253305e-05, + "loss": 0.2682, + "step": 16690 + }, + { + "epoch": 0.37058412226279397, + "grad_norm": 1.1494914293289185, + "learning_rate": 1.3954512068705425e-05, + "loss": 0.397, + "step": 16695 + }, + { + "epoch": 0.3706951088223216, + "grad_norm": 1.0773718357086182, + "learning_rate": 1.3951309262381231e-05, + "loss": 0.3331, + "step": 16700 + }, + { + "epoch": 0.3708060953818493, + "grad_norm": 1.6925926208496094, + "learning_rate": 1.3948105975670113e-05, + "loss": 0.4808, + "step": 16705 + }, + { + "epoch": 0.3709170819413769, + "grad_norm": 0.9642808437347412, + "learning_rate": 1.3944902208961507e-05, + "loss": 0.4677, + "step": 16710 + }, + { + "epoch": 0.37102806850090453, + "grad_norm": 0.9606306552886963, + "learning_rate": 1.394169796264492e-05, + "loss": 0.4367, + "step": 16715 + }, + { + "epoch": 0.37113905506043215, + "grad_norm": 1.0823974609375, + "learning_rate": 1.3938493237109914e-05, + "loss": 0.4162, + "step": 16720 + }, + { + "epoch": 0.37125004161995984, + "grad_norm": 2.175527811050415, + "learning_rate": 1.3935288032746108e-05, + "loss": 0.4954, + "step": 16725 + }, + { + "epoch": 0.37136102817948746, + "grad_norm": 1.030967116355896, + "learning_rate": 1.3932082349943184e-05, + "loss": 0.6295, + "step": 16730 + }, + { + "epoch": 0.3714720147390151, + "grad_norm": 1.0239731073379517, + "learning_rate": 1.3928876189090874e-05, + "loss": 0.4318, + "step": 16735 + }, + { + "epoch": 0.37158300129854277, + "grad_norm": 1.2536979913711548, + "learning_rate": 1.3925669550578973e-05, + "loss": 0.3785, + "step": 16740 + }, + { + "epoch": 0.3716939878580704, + "grad_norm": 1.4033725261688232, + "learning_rate": 1.3922462434797335e-05, + "loss": 0.5818, + "step": 16745 + }, + { + "epoch": 0.371804974417598, + "grad_norm": 1.1884605884552002, + "learning_rate": 1.3919254842135865e-05, + "loss": 0.4141, + "step": 16750 + }, + { + "epoch": 0.37191596097712565, + "grad_norm": 1.4534409046173096, + "learning_rate": 1.3916046772984539e-05, + "loss": 0.4574, + "step": 16755 + }, + { + "epoch": 0.37202694753665333, + "grad_norm": 1.5618983507156372, + "learning_rate": 1.391283822773338e-05, + "loss": 0.4347, + "step": 16760 + }, + { + "epoch": 0.37213793409618096, + "grad_norm": 1.5037851333618164, + "learning_rate": 1.390962920677247e-05, + "loss": 0.3729, + "step": 16765 + }, + { + "epoch": 0.3722489206557086, + "grad_norm": 0.8089569807052612, + "learning_rate": 1.3906419710491954e-05, + "loss": 0.4113, + "step": 16770 + }, + { + "epoch": 0.3723599072152362, + "grad_norm": 0.9394587278366089, + "learning_rate": 1.390320973928203e-05, + "loss": 0.4543, + "step": 16775 + }, + { + "epoch": 0.3724708937747639, + "grad_norm": 1.4900579452514648, + "learning_rate": 1.389999929353296e-05, + "loss": 0.6413, + "step": 16780 + }, + { + "epoch": 0.3725818803342915, + "grad_norm": 1.5743154287338257, + "learning_rate": 1.3896788373635053e-05, + "loss": 0.4143, + "step": 16785 + }, + { + "epoch": 0.37269286689381914, + "grad_norm": 1.6529701948165894, + "learning_rate": 1.3893576979978683e-05, + "loss": 0.3333, + "step": 16790 + }, + { + "epoch": 0.3728038534533468, + "grad_norm": 1.025791883468628, + "learning_rate": 1.3890365112954282e-05, + "loss": 0.4461, + "step": 16795 + }, + { + "epoch": 0.37291484001287445, + "grad_norm": 1.5343742370605469, + "learning_rate": 1.3887152772952339e-05, + "loss": 0.386, + "step": 16800 + }, + { + "epoch": 0.3730258265724021, + "grad_norm": 1.3995214700698853, + "learning_rate": 1.3883939960363397e-05, + "loss": 0.5093, + "step": 16805 + }, + { + "epoch": 0.3731368131319297, + "grad_norm": 2.0946216583251953, + "learning_rate": 1.3880726675578063e-05, + "loss": 0.3601, + "step": 16810 + }, + { + "epoch": 0.3732477996914574, + "grad_norm": 1.2544093132019043, + "learning_rate": 1.387751291898699e-05, + "loss": 0.4319, + "step": 16815 + }, + { + "epoch": 0.373358786250985, + "grad_norm": 1.000132441520691, + "learning_rate": 1.3874298690980904e-05, + "loss": 0.3898, + "step": 16820 + }, + { + "epoch": 0.37346977281051263, + "grad_norm": 1.495603322982788, + "learning_rate": 1.3871083991950576e-05, + "loss": 0.4117, + "step": 16825 + }, + { + "epoch": 0.37358075937004026, + "grad_norm": 0.9588593244552612, + "learning_rate": 1.3867868822286838e-05, + "loss": 0.4232, + "step": 16830 + }, + { + "epoch": 0.37369174592956794, + "grad_norm": 1.1229437589645386, + "learning_rate": 1.386465318238058e-05, + "loss": 0.3853, + "step": 16835 + }, + { + "epoch": 0.37380273248909557, + "grad_norm": 1.3521358966827393, + "learning_rate": 1.3861437072622752e-05, + "loss": 0.4987, + "step": 16840 + }, + { + "epoch": 0.3739137190486232, + "grad_norm": 0.968280017375946, + "learning_rate": 1.3858220493404353e-05, + "loss": 0.3644, + "step": 16845 + }, + { + "epoch": 0.3740247056081509, + "grad_norm": 1.8695068359375, + "learning_rate": 1.3855003445116446e-05, + "loss": 0.5406, + "step": 16850 + }, + { + "epoch": 0.3741356921676785, + "grad_norm": 1.0490186214447021, + "learning_rate": 1.385178592815015e-05, + "loss": 0.4572, + "step": 16855 + }, + { + "epoch": 0.37424667872720613, + "grad_norm": 0.7611112594604492, + "learning_rate": 1.3848567942896636e-05, + "loss": 0.4575, + "step": 16860 + }, + { + "epoch": 0.37435766528673375, + "grad_norm": 1.1203293800354004, + "learning_rate": 1.384534948974714e-05, + "loss": 0.4189, + "step": 16865 + }, + { + "epoch": 0.37446865184626144, + "grad_norm": 1.5158090591430664, + "learning_rate": 1.3842130569092951e-05, + "loss": 0.5554, + "step": 16870 + }, + { + "epoch": 0.37457963840578906, + "grad_norm": 1.3477773666381836, + "learning_rate": 1.383891118132541e-05, + "loss": 0.5403, + "step": 16875 + }, + { + "epoch": 0.3746906249653167, + "grad_norm": 1.0920310020446777, + "learning_rate": 1.3835691326835925e-05, + "loss": 0.3967, + "step": 16880 + }, + { + "epoch": 0.3748016115248443, + "grad_norm": 0.9676805138587952, + "learning_rate": 1.383247100601595e-05, + "loss": 0.4951, + "step": 16885 + }, + { + "epoch": 0.374912598084372, + "grad_norm": 1.1773241758346558, + "learning_rate": 1.3829250219257007e-05, + "loss": 0.4838, + "step": 16890 + }, + { + "epoch": 0.3750235846438996, + "grad_norm": 1.895973801612854, + "learning_rate": 1.3826028966950662e-05, + "loss": 0.6721, + "step": 16895 + }, + { + "epoch": 0.37513457120342725, + "grad_norm": 1.123016357421875, + "learning_rate": 1.3822807249488545e-05, + "loss": 0.5281, + "step": 16900 + }, + { + "epoch": 0.37524555776295493, + "grad_norm": 1.2969212532043457, + "learning_rate": 1.3819585067262347e-05, + "loss": 0.4464, + "step": 16905 + }, + { + "epoch": 0.37535654432248255, + "grad_norm": 1.220132827758789, + "learning_rate": 1.3816362420663805e-05, + "loss": 0.4504, + "step": 16910 + }, + { + "epoch": 0.3754675308820102, + "grad_norm": 1.846871018409729, + "learning_rate": 1.3813139310084715e-05, + "loss": 0.5145, + "step": 16915 + }, + { + "epoch": 0.3755785174415378, + "grad_norm": 1.588097095489502, + "learning_rate": 1.3809915735916942e-05, + "loss": 0.5535, + "step": 16920 + }, + { + "epoch": 0.3756895040010655, + "grad_norm": 0.79483962059021, + "learning_rate": 1.3806691698552388e-05, + "loss": 0.5354, + "step": 16925 + }, + { + "epoch": 0.3758004905605931, + "grad_norm": 1.1435658931732178, + "learning_rate": 1.3803467198383025e-05, + "loss": 0.5275, + "step": 16930 + }, + { + "epoch": 0.37591147712012074, + "grad_norm": 1.563844919204712, + "learning_rate": 1.3800242235800876e-05, + "loss": 0.3377, + "step": 16935 + }, + { + "epoch": 0.3760224636796484, + "grad_norm": 1.347374439239502, + "learning_rate": 1.3797016811198018e-05, + "loss": 0.312, + "step": 16940 + }, + { + "epoch": 0.37613345023917605, + "grad_norm": 1.2091847658157349, + "learning_rate": 1.3793790924966593e-05, + "loss": 0.5454, + "step": 16945 + }, + { + "epoch": 0.3762444367987037, + "grad_norm": 1.4304174184799194, + "learning_rate": 1.3790564577498791e-05, + "loss": 0.3816, + "step": 16950 + }, + { + "epoch": 0.3763554233582313, + "grad_norm": 0.9188779592514038, + "learning_rate": 1.3787337769186859e-05, + "loss": 0.4192, + "step": 16955 + }, + { + "epoch": 0.376466409917759, + "grad_norm": 1.6021742820739746, + "learning_rate": 1.3784110500423104e-05, + "loss": 0.5217, + "step": 16960 + }, + { + "epoch": 0.3765773964772866, + "grad_norm": 1.2071577310562134, + "learning_rate": 1.3780882771599886e-05, + "loss": 0.524, + "step": 16965 + }, + { + "epoch": 0.37668838303681423, + "grad_norm": 0.8562250733375549, + "learning_rate": 1.3777654583109618e-05, + "loss": 0.4608, + "step": 16970 + }, + { + "epoch": 0.37679936959634186, + "grad_norm": 1.6524200439453125, + "learning_rate": 1.3774425935344775e-05, + "loss": 0.5862, + "step": 16975 + }, + { + "epoch": 0.37691035615586954, + "grad_norm": 1.103670597076416, + "learning_rate": 1.3771196828697888e-05, + "loss": 0.3822, + "step": 16980 + }, + { + "epoch": 0.37702134271539717, + "grad_norm": 0.7237701416015625, + "learning_rate": 1.3767967263561538e-05, + "loss": 0.3177, + "step": 16985 + }, + { + "epoch": 0.3771323292749248, + "grad_norm": 0.7688775062561035, + "learning_rate": 1.3764737240328363e-05, + "loss": 0.5407, + "step": 16990 + }, + { + "epoch": 0.3772433158344525, + "grad_norm": 1.1897283792495728, + "learning_rate": 1.3761506759391061e-05, + "loss": 0.5392, + "step": 16995 + }, + { + "epoch": 0.3773543023939801, + "grad_norm": 1.4582651853561401, + "learning_rate": 1.3758275821142382e-05, + "loss": 0.4387, + "step": 17000 + }, + { + "epoch": 0.3774652889535077, + "grad_norm": 1.1028008460998535, + "learning_rate": 1.3755044425975132e-05, + "loss": 0.4452, + "step": 17005 + }, + { + "epoch": 0.37757627551303535, + "grad_norm": 1.4182307720184326, + "learning_rate": 1.3751812574282177e-05, + "loss": 0.4905, + "step": 17010 + }, + { + "epoch": 0.37768726207256303, + "grad_norm": 1.1496057510375977, + "learning_rate": 1.374858026645643e-05, + "loss": 0.4954, + "step": 17015 + }, + { + "epoch": 0.37779824863209066, + "grad_norm": 1.2465380430221558, + "learning_rate": 1.3745347502890866e-05, + "loss": 0.5762, + "step": 17020 + }, + { + "epoch": 0.3779092351916183, + "grad_norm": 1.1184829473495483, + "learning_rate": 1.3742114283978514e-05, + "loss": 0.4166, + "step": 17025 + }, + { + "epoch": 0.3780202217511459, + "grad_norm": 0.7921366691589355, + "learning_rate": 1.373888061011246e-05, + "loss": 0.3554, + "step": 17030 + }, + { + "epoch": 0.3781312083106736, + "grad_norm": 0.7676562666893005, + "learning_rate": 1.3735646481685836e-05, + "loss": 0.3768, + "step": 17035 + }, + { + "epoch": 0.3782421948702012, + "grad_norm": 2.179152488708496, + "learning_rate": 1.3732411899091844e-05, + "loss": 0.4078, + "step": 17040 + }, + { + "epoch": 0.37835318142972885, + "grad_norm": 1.5713518857955933, + "learning_rate": 1.372917686272373e-05, + "loss": 0.5496, + "step": 17045 + }, + { + "epoch": 0.37846416798925653, + "grad_norm": 2.061424732208252, + "learning_rate": 1.3725941372974801e-05, + "loss": 0.5781, + "step": 17050 + }, + { + "epoch": 0.37857515454878415, + "grad_norm": 0.8337643146514893, + "learning_rate": 1.3722705430238414e-05, + "loss": 0.5363, + "step": 17055 + }, + { + "epoch": 0.3786861411083118, + "grad_norm": 1.2876572608947754, + "learning_rate": 1.3719469034907984e-05, + "loss": 0.5306, + "step": 17060 + }, + { + "epoch": 0.3787971276678394, + "grad_norm": 1.8232654333114624, + "learning_rate": 1.3716232187376985e-05, + "loss": 0.5426, + "step": 17065 + }, + { + "epoch": 0.3789081142273671, + "grad_norm": 1.371899127960205, + "learning_rate": 1.371299488803894e-05, + "loss": 0.6546, + "step": 17070 + }, + { + "epoch": 0.3790191007868947, + "grad_norm": 1.1025066375732422, + "learning_rate": 1.370975713728743e-05, + "loss": 0.4045, + "step": 17075 + }, + { + "epoch": 0.37913008734642234, + "grad_norm": 1.0858376026153564, + "learning_rate": 1.3706518935516087e-05, + "loss": 0.4726, + "step": 17080 + }, + { + "epoch": 0.37924107390594997, + "grad_norm": 1.4351253509521484, + "learning_rate": 1.3703280283118601e-05, + "loss": 0.3277, + "step": 17085 + }, + { + "epoch": 0.37935206046547765, + "grad_norm": 0.8693327903747559, + "learning_rate": 1.3700041180488721e-05, + "loss": 0.5173, + "step": 17090 + }, + { + "epoch": 0.3794630470250053, + "grad_norm": 0.8443187475204468, + "learning_rate": 1.3696801628020243e-05, + "loss": 0.3616, + "step": 17095 + }, + { + "epoch": 0.3795740335845329, + "grad_norm": 1.3002277612686157, + "learning_rate": 1.3693561626107021e-05, + "loss": 0.5085, + "step": 17100 + }, + { + "epoch": 0.3796850201440606, + "grad_norm": 1.221724271774292, + "learning_rate": 1.3690321175142965e-05, + "loss": 0.3993, + "step": 17105 + }, + { + "epoch": 0.3797960067035882, + "grad_norm": 0.9888561964035034, + "learning_rate": 1.3687080275522034e-05, + "loss": 0.4385, + "step": 17110 + }, + { + "epoch": 0.37990699326311583, + "grad_norm": 1.6944843530654907, + "learning_rate": 1.3683838927638249e-05, + "loss": 0.4659, + "step": 17115 + }, + { + "epoch": 0.38001797982264346, + "grad_norm": 0.8631107211112976, + "learning_rate": 1.368059713188568e-05, + "loss": 0.5218, + "step": 17120 + }, + { + "epoch": 0.38012896638217114, + "grad_norm": 1.0087155103683472, + "learning_rate": 1.3677354888658458e-05, + "loss": 0.5457, + "step": 17125 + }, + { + "epoch": 0.38023995294169877, + "grad_norm": 1.0191081762313843, + "learning_rate": 1.3674112198350764e-05, + "loss": 0.3824, + "step": 17130 + }, + { + "epoch": 0.3803509395012264, + "grad_norm": 1.5957056283950806, + "learning_rate": 1.3670869061356829e-05, + "loss": 0.4322, + "step": 17135 + }, + { + "epoch": 0.380461926060754, + "grad_norm": 1.5974568128585815, + "learning_rate": 1.3667625478070945e-05, + "loss": 0.3819, + "step": 17140 + }, + { + "epoch": 0.3805729126202817, + "grad_norm": 0.8775105476379395, + "learning_rate": 1.3664381448887454e-05, + "loss": 0.5077, + "step": 17145 + }, + { + "epoch": 0.3806838991798093, + "grad_norm": 1.3796273469924927, + "learning_rate": 1.3661136974200757e-05, + "loss": 0.3658, + "step": 17150 + }, + { + "epoch": 0.38079488573933695, + "grad_norm": 1.0044777393341064, + "learning_rate": 1.3657892054405304e-05, + "loss": 0.4779, + "step": 17155 + }, + { + "epoch": 0.38090587229886463, + "grad_norm": 0.881597101688385, + "learning_rate": 1.3654646689895605e-05, + "loss": 0.4481, + "step": 17160 + }, + { + "epoch": 0.38101685885839226, + "grad_norm": 1.505136489868164, + "learning_rate": 1.3651400881066217e-05, + "loss": 0.6237, + "step": 17165 + }, + { + "epoch": 0.3811278454179199, + "grad_norm": 0.7536532282829285, + "learning_rate": 1.3648154628311754e-05, + "loss": 0.5762, + "step": 17170 + }, + { + "epoch": 0.3812388319774475, + "grad_norm": 1.034916639328003, + "learning_rate": 1.3644907932026887e-05, + "loss": 0.3791, + "step": 17175 + }, + { + "epoch": 0.3813498185369752, + "grad_norm": 1.6484735012054443, + "learning_rate": 1.364166079260634e-05, + "loss": 0.2924, + "step": 17180 + }, + { + "epoch": 0.3814608050965028, + "grad_norm": 0.9013214111328125, + "learning_rate": 1.3638413210444885e-05, + "loss": 0.357, + "step": 17185 + }, + { + "epoch": 0.38157179165603045, + "grad_norm": 1.3303017616271973, + "learning_rate": 1.3635165185937354e-05, + "loss": 0.5829, + "step": 17190 + }, + { + "epoch": 0.38168277821555807, + "grad_norm": 0.9839069247245789, + "learning_rate": 1.3631916719478633e-05, + "loss": 0.5365, + "step": 17195 + }, + { + "epoch": 0.38179376477508575, + "grad_norm": 1.1940916776657104, + "learning_rate": 1.3628667811463654e-05, + "loss": 0.3673, + "step": 17200 + }, + { + "epoch": 0.3819047513346134, + "grad_norm": 1.1215497255325317, + "learning_rate": 1.3625418462287414e-05, + "loss": 0.3505, + "step": 17205 + }, + { + "epoch": 0.382015737894141, + "grad_norm": 1.1891454458236694, + "learning_rate": 1.3622168672344957e-05, + "loss": 0.6437, + "step": 17210 + }, + { + "epoch": 0.3821267244536687, + "grad_norm": 1.0986990928649902, + "learning_rate": 1.361891844203138e-05, + "loss": 0.427, + "step": 17215 + }, + { + "epoch": 0.3822377110131963, + "grad_norm": 1.3629257678985596, + "learning_rate": 1.3615667771741836e-05, + "loss": 0.4027, + "step": 17220 + }, + { + "epoch": 0.38234869757272394, + "grad_norm": 1.3671947717666626, + "learning_rate": 1.3612416661871532e-05, + "loss": 0.4503, + "step": 17225 + }, + { + "epoch": 0.38245968413225157, + "grad_norm": 1.2490761280059814, + "learning_rate": 1.3609165112815721e-05, + "loss": 0.3434, + "step": 17230 + }, + { + "epoch": 0.38257067069177925, + "grad_norm": 1.1291104555130005, + "learning_rate": 1.3605913124969723e-05, + "loss": 0.4443, + "step": 17235 + }, + { + "epoch": 0.3826816572513069, + "grad_norm": 1.559004306793213, + "learning_rate": 1.36026606987289e-05, + "loss": 0.5403, + "step": 17240 + }, + { + "epoch": 0.3827926438108345, + "grad_norm": 1.1736834049224854, + "learning_rate": 1.3599407834488672e-05, + "loss": 0.4011, + "step": 17245 + }, + { + "epoch": 0.3829036303703621, + "grad_norm": 2.812455654144287, + "learning_rate": 1.359615453264451e-05, + "loss": 0.5226, + "step": 17250 + }, + { + "epoch": 0.3830146169298898, + "grad_norm": 1.8055833578109741, + "learning_rate": 1.3592900793591941e-05, + "loss": 0.4333, + "step": 17255 + }, + { + "epoch": 0.38312560348941743, + "grad_norm": 1.520922064781189, + "learning_rate": 1.3589646617726545e-05, + "loss": 0.5924, + "step": 17260 + }, + { + "epoch": 0.38323659004894506, + "grad_norm": 1.0311449766159058, + "learning_rate": 1.358639200544395e-05, + "loss": 0.4824, + "step": 17265 + }, + { + "epoch": 0.38334757660847274, + "grad_norm": 1.1037408113479614, + "learning_rate": 1.3583136957139841e-05, + "loss": 0.4822, + "step": 17270 + }, + { + "epoch": 0.38345856316800037, + "grad_norm": 1.3447061777114868, + "learning_rate": 1.3579881473209962e-05, + "loss": 0.4643, + "step": 17275 + }, + { + "epoch": 0.383569549727528, + "grad_norm": 1.1121246814727783, + "learning_rate": 1.3576625554050095e-05, + "loss": 0.4628, + "step": 17280 + }, + { + "epoch": 0.3836805362870556, + "grad_norm": 0.7411977052688599, + "learning_rate": 1.357336920005609e-05, + "loss": 0.4589, + "step": 17285 + }, + { + "epoch": 0.3837915228465833, + "grad_norm": 1.5663127899169922, + "learning_rate": 1.3570112411623843e-05, + "loss": 0.4138, + "step": 17290 + }, + { + "epoch": 0.3839025094061109, + "grad_norm": 1.0521498918533325, + "learning_rate": 1.3566855189149302e-05, + "loss": 0.4277, + "step": 17295 + }, + { + "epoch": 0.38401349596563855, + "grad_norm": 1.4922767877578735, + "learning_rate": 1.3563597533028467e-05, + "loss": 0.473, + "step": 17300 + }, + { + "epoch": 0.3841244825251662, + "grad_norm": 0.9321845173835754, + "learning_rate": 1.35603394436574e-05, + "loss": 0.5097, + "step": 17305 + }, + { + "epoch": 0.38423546908469386, + "grad_norm": 1.0778831243515015, + "learning_rate": 1.35570809214322e-05, + "loss": 0.4739, + "step": 17310 + }, + { + "epoch": 0.3843464556442215, + "grad_norm": 1.1382580995559692, + "learning_rate": 1.355382196674903e-05, + "loss": 0.4179, + "step": 17315 + }, + { + "epoch": 0.3844574422037491, + "grad_norm": 1.2955654859542847, + "learning_rate": 1.3550562580004108e-05, + "loss": 0.2823, + "step": 17320 + }, + { + "epoch": 0.3845684287632768, + "grad_norm": 1.0938900709152222, + "learning_rate": 1.3547302761593693e-05, + "loss": 0.3228, + "step": 17325 + }, + { + "epoch": 0.3846794153228044, + "grad_norm": 1.351600170135498, + "learning_rate": 1.3544042511914104e-05, + "loss": 0.6427, + "step": 17330 + }, + { + "epoch": 0.38479040188233204, + "grad_norm": 0.9713222980499268, + "learning_rate": 1.3540781831361713e-05, + "loss": 0.3738, + "step": 17335 + }, + { + "epoch": 0.38490138844185967, + "grad_norm": 1.0599182844161987, + "learning_rate": 1.3537520720332943e-05, + "loss": 0.4398, + "step": 17340 + }, + { + "epoch": 0.38501237500138735, + "grad_norm": 0.9963597059249878, + "learning_rate": 1.3534259179224265e-05, + "loss": 0.3719, + "step": 17345 + }, + { + "epoch": 0.385123361560915, + "grad_norm": 1.0552200078964233, + "learning_rate": 1.3530997208432211e-05, + "loss": 0.3478, + "step": 17350 + }, + { + "epoch": 0.3852343481204426, + "grad_norm": 1.0920829772949219, + "learning_rate": 1.3527734808353356e-05, + "loss": 0.4186, + "step": 17355 + }, + { + "epoch": 0.38534533467997023, + "grad_norm": 1.1607277393341064, + "learning_rate": 1.3524471979384339e-05, + "loss": 0.469, + "step": 17360 + }, + { + "epoch": 0.3854563212394979, + "grad_norm": 1.2528059482574463, + "learning_rate": 1.3521208721921836e-05, + "loss": 0.4884, + "step": 17365 + }, + { + "epoch": 0.38556730779902554, + "grad_norm": 1.1077717542648315, + "learning_rate": 1.3517945036362585e-05, + "loss": 0.5902, + "step": 17370 + }, + { + "epoch": 0.38567829435855316, + "grad_norm": 0.9106099605560303, + "learning_rate": 1.3514680923103373e-05, + "loss": 0.5072, + "step": 17375 + }, + { + "epoch": 0.38578928091808085, + "grad_norm": 1.0978457927703857, + "learning_rate": 1.3511416382541048e-05, + "loss": 0.4089, + "step": 17380 + }, + { + "epoch": 0.38590026747760847, + "grad_norm": 0.7796788215637207, + "learning_rate": 1.350815141507249e-05, + "loss": 0.4714, + "step": 17385 + }, + { + "epoch": 0.3860112540371361, + "grad_norm": 0.7110152244567871, + "learning_rate": 1.3504886021094653e-05, + "loss": 0.4504, + "step": 17390 + }, + { + "epoch": 0.3861222405966637, + "grad_norm": 1.5071749687194824, + "learning_rate": 1.3501620201004524e-05, + "loss": 0.5551, + "step": 17395 + }, + { + "epoch": 0.3862332271561914, + "grad_norm": 1.5758322477340698, + "learning_rate": 1.3498353955199157e-05, + "loss": 0.4305, + "step": 17400 + }, + { + "epoch": 0.38634421371571903, + "grad_norm": 1.0656967163085938, + "learning_rate": 1.3495087284075648e-05, + "loss": 0.5435, + "step": 17405 + }, + { + "epoch": 0.38645520027524666, + "grad_norm": 1.3301317691802979, + "learning_rate": 1.3491820188031148e-05, + "loss": 0.4687, + "step": 17410 + }, + { + "epoch": 0.3865661868347743, + "grad_norm": 1.1281219720840454, + "learning_rate": 1.348855266746286e-05, + "loss": 0.4022, + "step": 17415 + }, + { + "epoch": 0.38667717339430197, + "grad_norm": 1.6176022291183472, + "learning_rate": 1.348528472276804e-05, + "loss": 0.482, + "step": 17420 + }, + { + "epoch": 0.3867881599538296, + "grad_norm": 1.332358717918396, + "learning_rate": 1.348201635434399e-05, + "loss": 0.4974, + "step": 17425 + }, + { + "epoch": 0.3868991465133572, + "grad_norm": 0.8697658777236938, + "learning_rate": 1.3478747562588068e-05, + "loss": 0.5393, + "step": 17430 + }, + { + "epoch": 0.3870101330728849, + "grad_norm": 1.16774582862854, + "learning_rate": 1.3475478347897688e-05, + "loss": 0.6004, + "step": 17435 + }, + { + "epoch": 0.3871211196324125, + "grad_norm": 1.2171958684921265, + "learning_rate": 1.3472208710670305e-05, + "loss": 0.3667, + "step": 17440 + }, + { + "epoch": 0.38723210619194015, + "grad_norm": 1.1921159029006958, + "learning_rate": 1.3468938651303431e-05, + "loss": 0.488, + "step": 17445 + }, + { + "epoch": 0.3873430927514678, + "grad_norm": 0.9221675395965576, + "learning_rate": 1.3465668170194633e-05, + "loss": 0.5209, + "step": 17450 + }, + { + "epoch": 0.38745407931099546, + "grad_norm": 1.400107502937317, + "learning_rate": 1.3462397267741516e-05, + "loss": 0.6247, + "step": 17455 + }, + { + "epoch": 0.3875650658705231, + "grad_norm": 1.6427634954452515, + "learning_rate": 1.3459125944341755e-05, + "loss": 0.3778, + "step": 17460 + }, + { + "epoch": 0.3876760524300507, + "grad_norm": 1.0119582414627075, + "learning_rate": 1.3455854200393064e-05, + "loss": 0.4266, + "step": 17465 + }, + { + "epoch": 0.38778703898957834, + "grad_norm": 1.327309489250183, + "learning_rate": 1.3452582036293205e-05, + "loss": 0.5284, + "step": 17470 + }, + { + "epoch": 0.387898025549106, + "grad_norm": 1.0754491090774536, + "learning_rate": 1.3449309452440004e-05, + "loss": 0.5902, + "step": 17475 + }, + { + "epoch": 0.38800901210863364, + "grad_norm": 1.4490890502929688, + "learning_rate": 1.3446036449231328e-05, + "loss": 0.5134, + "step": 17480 + }, + { + "epoch": 0.38811999866816127, + "grad_norm": 0.7128272652626038, + "learning_rate": 1.3442763027065096e-05, + "loss": 0.5247, + "step": 17485 + }, + { + "epoch": 0.38823098522768895, + "grad_norm": 0.9914596676826477, + "learning_rate": 1.3439489186339283e-05, + "loss": 0.3952, + "step": 17490 + }, + { + "epoch": 0.3883419717872166, + "grad_norm": 1.220529556274414, + "learning_rate": 1.3436214927451907e-05, + "loss": 0.4257, + "step": 17495 + }, + { + "epoch": 0.3884529583467442, + "grad_norm": 1.3346257209777832, + "learning_rate": 1.3432940250801047e-05, + "loss": 0.4396, + "step": 17500 + }, + { + "epoch": 0.38856394490627183, + "grad_norm": 1.3106735944747925, + "learning_rate": 1.3429665156784825e-05, + "loss": 0.4336, + "step": 17505 + }, + { + "epoch": 0.3886749314657995, + "grad_norm": 1.7288190126419067, + "learning_rate": 1.3426389645801415e-05, + "loss": 0.3259, + "step": 17510 + }, + { + "epoch": 0.38878591802532714, + "grad_norm": 1.962158441543579, + "learning_rate": 1.3423113718249044e-05, + "loss": 0.422, + "step": 17515 + }, + { + "epoch": 0.38889690458485476, + "grad_norm": 0.9909548163414001, + "learning_rate": 1.3419837374525986e-05, + "loss": 0.4822, + "step": 17520 + }, + { + "epoch": 0.3890078911443824, + "grad_norm": 1.0369338989257812, + "learning_rate": 1.3416560615030567e-05, + "loss": 0.4333, + "step": 17525 + }, + { + "epoch": 0.38911887770391007, + "grad_norm": 0.9148257970809937, + "learning_rate": 1.3413283440161168e-05, + "loss": 0.386, + "step": 17530 + }, + { + "epoch": 0.3892298642634377, + "grad_norm": 1.404958724975586, + "learning_rate": 1.3410005850316219e-05, + "loss": 0.4468, + "step": 17535 + }, + { + "epoch": 0.3893408508229653, + "grad_norm": 1.2354094982147217, + "learning_rate": 1.340672784589419e-05, + "loss": 0.4362, + "step": 17540 + }, + { + "epoch": 0.389451837382493, + "grad_norm": 0.912862241268158, + "learning_rate": 1.3403449427293617e-05, + "loss": 0.7274, + "step": 17545 + }, + { + "epoch": 0.38956282394202063, + "grad_norm": 1.044878602027893, + "learning_rate": 1.3400170594913078e-05, + "loss": 0.3184, + "step": 17550 + }, + { + "epoch": 0.38967381050154826, + "grad_norm": 1.0582435131072998, + "learning_rate": 1.33968913491512e-05, + "loss": 0.3822, + "step": 17555 + }, + { + "epoch": 0.3897847970610759, + "grad_norm": 1.0423102378845215, + "learning_rate": 1.3393611690406665e-05, + "loss": 0.4363, + "step": 17560 + }, + { + "epoch": 0.38989578362060356, + "grad_norm": 1.4456026554107666, + "learning_rate": 1.3390331619078198e-05, + "loss": 0.4593, + "step": 17565 + }, + { + "epoch": 0.3900067701801312, + "grad_norm": 1.5314327478408813, + "learning_rate": 1.3387051135564588e-05, + "loss": 0.3941, + "step": 17570 + }, + { + "epoch": 0.3901177567396588, + "grad_norm": 1.1662486791610718, + "learning_rate": 1.338377024026466e-05, + "loss": 0.424, + "step": 17575 + }, + { + "epoch": 0.39022874329918644, + "grad_norm": 1.3233836889266968, + "learning_rate": 1.3380488933577294e-05, + "loss": 0.4227, + "step": 17580 + }, + { + "epoch": 0.3903397298587141, + "grad_norm": 0.7573814988136292, + "learning_rate": 1.337720721590142e-05, + "loss": 0.3918, + "step": 17585 + }, + { + "epoch": 0.39045071641824175, + "grad_norm": 1.548936367034912, + "learning_rate": 1.3373925087636017e-05, + "loss": 0.5082, + "step": 17590 + }, + { + "epoch": 0.3905617029777694, + "grad_norm": 0.9623432755470276, + "learning_rate": 1.3370642549180117e-05, + "loss": 0.3315, + "step": 17595 + }, + { + "epoch": 0.39067268953729706, + "grad_norm": 0.876105546951294, + "learning_rate": 1.3367359600932803e-05, + "loss": 0.4066, + "step": 17600 + }, + { + "epoch": 0.3907836760968247, + "grad_norm": 1.3079777956008911, + "learning_rate": 1.3364076243293203e-05, + "loss": 0.5509, + "step": 17605 + }, + { + "epoch": 0.3908946626563523, + "grad_norm": 1.271996021270752, + "learning_rate": 1.3360792476660494e-05, + "loss": 0.429, + "step": 17610 + }, + { + "epoch": 0.39100564921587994, + "grad_norm": 0.795352578163147, + "learning_rate": 1.3357508301433905e-05, + "loss": 0.3334, + "step": 17615 + }, + { + "epoch": 0.3911166357754076, + "grad_norm": 1.3621175289154053, + "learning_rate": 1.335422371801272e-05, + "loss": 0.5516, + "step": 17620 + }, + { + "epoch": 0.39122762233493524, + "grad_norm": 1.1170103549957275, + "learning_rate": 1.3350938726796261e-05, + "loss": 0.3054, + "step": 17625 + }, + { + "epoch": 0.39133860889446287, + "grad_norm": 1.2189823389053345, + "learning_rate": 1.3347653328183912e-05, + "loss": 0.5544, + "step": 17630 + }, + { + "epoch": 0.3914495954539905, + "grad_norm": 1.1982585191726685, + "learning_rate": 1.3344367522575098e-05, + "loss": 0.3228, + "step": 17635 + }, + { + "epoch": 0.3915605820135182, + "grad_norm": 1.4040985107421875, + "learning_rate": 1.3341081310369297e-05, + "loss": 0.5198, + "step": 17640 + }, + { + "epoch": 0.3916715685730458, + "grad_norm": 1.0702539682388306, + "learning_rate": 1.3337794691966034e-05, + "loss": 0.3367, + "step": 17645 + }, + { + "epoch": 0.39178255513257343, + "grad_norm": 1.5340244770050049, + "learning_rate": 1.3334507667764892e-05, + "loss": 0.5923, + "step": 17650 + }, + { + "epoch": 0.3918935416921011, + "grad_norm": 1.4048302173614502, + "learning_rate": 1.3331220238165485e-05, + "loss": 0.5435, + "step": 17655 + }, + { + "epoch": 0.39200452825162874, + "grad_norm": 1.0287718772888184, + "learning_rate": 1.3327932403567493e-05, + "loss": 0.5771, + "step": 17660 + }, + { + "epoch": 0.39211551481115636, + "grad_norm": 1.8984160423278809, + "learning_rate": 1.3324644164370643e-05, + "loss": 0.4725, + "step": 17665 + }, + { + "epoch": 0.392226501370684, + "grad_norm": 1.6385459899902344, + "learning_rate": 1.3321355520974708e-05, + "loss": 0.5825, + "step": 17670 + }, + { + "epoch": 0.39233748793021167, + "grad_norm": 1.5618573427200317, + "learning_rate": 1.3318066473779504e-05, + "loss": 0.4333, + "step": 17675 + }, + { + "epoch": 0.3924484744897393, + "grad_norm": 1.079800009727478, + "learning_rate": 1.3314777023184907e-05, + "loss": 0.4186, + "step": 17680 + }, + { + "epoch": 0.3925594610492669, + "grad_norm": 0.9325678944587708, + "learning_rate": 1.3311487169590835e-05, + "loss": 0.496, + "step": 17685 + }, + { + "epoch": 0.39267044760879455, + "grad_norm": 1.4722448587417603, + "learning_rate": 1.330819691339726e-05, + "loss": 0.4277, + "step": 17690 + }, + { + "epoch": 0.39278143416832223, + "grad_norm": 0.955633282661438, + "learning_rate": 1.33049062550042e-05, + "loss": 0.4529, + "step": 17695 + }, + { + "epoch": 0.39289242072784986, + "grad_norm": 1.27122962474823, + "learning_rate": 1.330161519481172e-05, + "loss": 0.462, + "step": 17700 + }, + { + "epoch": 0.3930034072873775, + "grad_norm": 0.7992278933525085, + "learning_rate": 1.3298323733219938e-05, + "loss": 0.5928, + "step": 17705 + }, + { + "epoch": 0.39311439384690516, + "grad_norm": 1.091391921043396, + "learning_rate": 1.3295031870629016e-05, + "loss": 0.5228, + "step": 17710 + }, + { + "epoch": 0.3932253804064328, + "grad_norm": 1.3356877565383911, + "learning_rate": 1.3291739607439173e-05, + "loss": 0.4639, + "step": 17715 + }, + { + "epoch": 0.3933363669659604, + "grad_norm": 1.7986055612564087, + "learning_rate": 1.3288446944050668e-05, + "loss": 0.484, + "step": 17720 + }, + { + "epoch": 0.39344735352548804, + "grad_norm": 1.2770730257034302, + "learning_rate": 1.328515388086381e-05, + "loss": 0.5049, + "step": 17725 + }, + { + "epoch": 0.3935583400850157, + "grad_norm": 1.0809767246246338, + "learning_rate": 1.3281860418278962e-05, + "loss": 0.438, + "step": 17730 + }, + { + "epoch": 0.39366932664454335, + "grad_norm": 1.3970037698745728, + "learning_rate": 1.327856655669653e-05, + "loss": 0.4304, + "step": 17735 + }, + { + "epoch": 0.393780313204071, + "grad_norm": 1.239540934562683, + "learning_rate": 1.3275272296516973e-05, + "loss": 0.5295, + "step": 17740 + }, + { + "epoch": 0.3938912997635986, + "grad_norm": 2.00631046295166, + "learning_rate": 1.3271977638140794e-05, + "loss": 0.395, + "step": 17745 + }, + { + "epoch": 0.3940022863231263, + "grad_norm": 0.9444000124931335, + "learning_rate": 1.3268682581968547e-05, + "loss": 0.4573, + "step": 17750 + }, + { + "epoch": 0.3941132728826539, + "grad_norm": 1.2916285991668701, + "learning_rate": 1.3265387128400833e-05, + "loss": 0.4818, + "step": 17755 + }, + { + "epoch": 0.39422425944218153, + "grad_norm": 0.8935806751251221, + "learning_rate": 1.3262091277838304e-05, + "loss": 0.4236, + "step": 17760 + }, + { + "epoch": 0.3943352460017092, + "grad_norm": 1.304722547531128, + "learning_rate": 1.325879503068166e-05, + "loss": 0.5102, + "step": 17765 + }, + { + "epoch": 0.39444623256123684, + "grad_norm": 1.0897523164749146, + "learning_rate": 1.3255498387331643e-05, + "loss": 0.4259, + "step": 17770 + }, + { + "epoch": 0.39455721912076447, + "grad_norm": 0.8314265608787537, + "learning_rate": 1.325220134818905e-05, + "loss": 0.4265, + "step": 17775 + }, + { + "epoch": 0.3946682056802921, + "grad_norm": 1.312457799911499, + "learning_rate": 1.3248903913654726e-05, + "loss": 0.5147, + "step": 17780 + }, + { + "epoch": 0.3947791922398198, + "grad_norm": 1.258514165878296, + "learning_rate": 1.3245606084129559e-05, + "loss": 0.352, + "step": 17785 + }, + { + "epoch": 0.3948901787993474, + "grad_norm": 1.0305403470993042, + "learning_rate": 1.3242307860014487e-05, + "loss": 0.3029, + "step": 17790 + }, + { + "epoch": 0.39500116535887503, + "grad_norm": 1.2700746059417725, + "learning_rate": 1.32390092417105e-05, + "loss": 0.3088, + "step": 17795 + }, + { + "epoch": 0.39511215191840265, + "grad_norm": 1.1628782749176025, + "learning_rate": 1.3235710229618635e-05, + "loss": 0.5507, + "step": 17800 + }, + { + "epoch": 0.39522313847793034, + "grad_norm": 1.2276886701583862, + "learning_rate": 1.3232410824139969e-05, + "loss": 0.4989, + "step": 17805 + }, + { + "epoch": 0.39533412503745796, + "grad_norm": 1.2096561193466187, + "learning_rate": 1.3229111025675639e-05, + "loss": 0.425, + "step": 17810 + }, + { + "epoch": 0.3954451115969856, + "grad_norm": 1.6849702596664429, + "learning_rate": 1.3225810834626817e-05, + "loss": 0.411, + "step": 17815 + }, + { + "epoch": 0.39555609815651327, + "grad_norm": 0.8754458427429199, + "learning_rate": 1.3222510251394732e-05, + "loss": 0.5221, + "step": 17820 + }, + { + "epoch": 0.3956670847160409, + "grad_norm": 1.1738617420196533, + "learning_rate": 1.3219209276380657e-05, + "loss": 0.3602, + "step": 17825 + }, + { + "epoch": 0.3957780712755685, + "grad_norm": 1.1277354955673218, + "learning_rate": 1.3215907909985919e-05, + "loss": 0.5628, + "step": 17830 + }, + { + "epoch": 0.39588905783509615, + "grad_norm": 1.6044995784759521, + "learning_rate": 1.321260615261188e-05, + "loss": 0.4562, + "step": 17835 + }, + { + "epoch": 0.39600004439462383, + "grad_norm": 1.0552541017532349, + "learning_rate": 1.320930400465996e-05, + "loss": 0.433, + "step": 17840 + }, + { + "epoch": 0.39611103095415146, + "grad_norm": 0.8654063940048218, + "learning_rate": 1.3206001466531624e-05, + "loss": 0.2673, + "step": 17845 + }, + { + "epoch": 0.3962220175136791, + "grad_norm": 1.105943202972412, + "learning_rate": 1.3202698538628376e-05, + "loss": 0.417, + "step": 17850 + }, + { + "epoch": 0.3963330040732067, + "grad_norm": 1.0926660299301147, + "learning_rate": 1.3199395221351785e-05, + "loss": 0.4506, + "step": 17855 + }, + { + "epoch": 0.3964439906327344, + "grad_norm": 1.0280393362045288, + "learning_rate": 1.3196091515103454e-05, + "loss": 0.4719, + "step": 17860 + }, + { + "epoch": 0.396554977192262, + "grad_norm": 1.2806470394134521, + "learning_rate": 1.3192787420285034e-05, + "loss": 0.5129, + "step": 17865 + }, + { + "epoch": 0.39666596375178964, + "grad_norm": 0.8270732760429382, + "learning_rate": 1.3189482937298225e-05, + "loss": 0.5978, + "step": 17870 + }, + { + "epoch": 0.3967769503113173, + "grad_norm": 0.8961861729621887, + "learning_rate": 1.3186178066544781e-05, + "loss": 0.3647, + "step": 17875 + }, + { + "epoch": 0.39688793687084495, + "grad_norm": 0.9055027961730957, + "learning_rate": 1.3182872808426492e-05, + "loss": 0.4148, + "step": 17880 + }, + { + "epoch": 0.3969989234303726, + "grad_norm": 1.5910625457763672, + "learning_rate": 1.3179567163345201e-05, + "loss": 0.4073, + "step": 17885 + }, + { + "epoch": 0.3971099099899002, + "grad_norm": 1.038095235824585, + "learning_rate": 1.3176261131702796e-05, + "loss": 0.4585, + "step": 17890 + }, + { + "epoch": 0.3972208965494279, + "grad_norm": 1.1451902389526367, + "learning_rate": 1.3172954713901218e-05, + "loss": 0.5159, + "step": 17895 + }, + { + "epoch": 0.3973318831089555, + "grad_norm": 0.8876758813858032, + "learning_rate": 1.3169647910342447e-05, + "loss": 0.4819, + "step": 17900 + }, + { + "epoch": 0.39744286966848313, + "grad_norm": 1.7914245128631592, + "learning_rate": 1.316634072142851e-05, + "loss": 0.4021, + "step": 17905 + }, + { + "epoch": 0.39755385622801076, + "grad_norm": 0.9543294310569763, + "learning_rate": 1.316303314756149e-05, + "loss": 0.4622, + "step": 17910 + }, + { + "epoch": 0.39766484278753844, + "grad_norm": 1.5023553371429443, + "learning_rate": 1.3159725189143506e-05, + "loss": 0.7284, + "step": 17915 + }, + { + "epoch": 0.39777582934706607, + "grad_norm": 1.5158809423446655, + "learning_rate": 1.3156416846576732e-05, + "loss": 0.3533, + "step": 17920 + }, + { + "epoch": 0.3978868159065937, + "grad_norm": 1.2066943645477295, + "learning_rate": 1.3153108120263384e-05, + "loss": 0.6102, + "step": 17925 + }, + { + "epoch": 0.3979978024661214, + "grad_norm": 1.2224715948104858, + "learning_rate": 1.3149799010605726e-05, + "loss": 0.6959, + "step": 17930 + }, + { + "epoch": 0.398108789025649, + "grad_norm": 1.9479608535766602, + "learning_rate": 1.3146489518006065e-05, + "loss": 0.5941, + "step": 17935 + }, + { + "epoch": 0.3982197755851766, + "grad_norm": 1.061508059501648, + "learning_rate": 1.314317964286676e-05, + "loss": 0.5794, + "step": 17940 + }, + { + "epoch": 0.39833076214470425, + "grad_norm": 1.1498526334762573, + "learning_rate": 1.313986938559022e-05, + "loss": 0.5621, + "step": 17945 + }, + { + "epoch": 0.39844174870423194, + "grad_norm": 1.051237940788269, + "learning_rate": 1.3136558746578888e-05, + "loss": 0.3175, + "step": 17950 + }, + { + "epoch": 0.39855273526375956, + "grad_norm": 0.903063952922821, + "learning_rate": 1.3133247726235263e-05, + "loss": 0.4213, + "step": 17955 + }, + { + "epoch": 0.3986637218232872, + "grad_norm": 1.0225557088851929, + "learning_rate": 1.312993632496189e-05, + "loss": 0.3809, + "step": 17960 + }, + { + "epoch": 0.39877470838281487, + "grad_norm": 2.102311372756958, + "learning_rate": 1.3126624543161351e-05, + "loss": 0.3249, + "step": 17965 + }, + { + "epoch": 0.3988856949423425, + "grad_norm": 1.8216239213943481, + "learning_rate": 1.3123312381236287e-05, + "loss": 0.5173, + "step": 17970 + }, + { + "epoch": 0.3989966815018701, + "grad_norm": 1.2820842266082764, + "learning_rate": 1.311999983958938e-05, + "loss": 0.2883, + "step": 17975 + }, + { + "epoch": 0.39910766806139775, + "grad_norm": 1.3804726600646973, + "learning_rate": 1.3116686918623356e-05, + "loss": 0.5963, + "step": 17980 + }, + { + "epoch": 0.39921865462092543, + "grad_norm": 1.2053264379501343, + "learning_rate": 1.3113373618740988e-05, + "loss": 0.4112, + "step": 17985 + }, + { + "epoch": 0.39932964118045305, + "grad_norm": 0.9135293364524841, + "learning_rate": 1.3110059940345096e-05, + "loss": 0.4919, + "step": 17990 + }, + { + "epoch": 0.3994406277399807, + "grad_norm": 1.1950745582580566, + "learning_rate": 1.3106745883838549e-05, + "loss": 0.5597, + "step": 17995 + }, + { + "epoch": 0.3995516142995083, + "grad_norm": 1.2758936882019043, + "learning_rate": 1.3103431449624257e-05, + "loss": 0.432, + "step": 18000 + }, + { + "epoch": 0.399662600859036, + "grad_norm": 1.616534948348999, + "learning_rate": 1.3100116638105177e-05, + "loss": 0.5755, + "step": 18005 + }, + { + "epoch": 0.3997735874185636, + "grad_norm": 1.3022828102111816, + "learning_rate": 1.3096801449684314e-05, + "loss": 0.4205, + "step": 18010 + }, + { + "epoch": 0.39988457397809124, + "grad_norm": 1.1768077611923218, + "learning_rate": 1.3093485884764714e-05, + "loss": 0.4766, + "step": 18015 + }, + { + "epoch": 0.3999955605376189, + "grad_norm": 0.7835399508476257, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.4221, + "step": 18020 + }, + { + "epoch": 0.40010654709714655, + "grad_norm": 1.2957817316055298, + "learning_rate": 1.308685362704174e-05, + "loss": 0.4314, + "step": 18025 + }, + { + "epoch": 0.4002175336566742, + "grad_norm": 1.2330552339553833, + "learning_rate": 1.3083536935044695e-05, + "loss": 0.4486, + "step": 18030 + }, + { + "epoch": 0.4003285202162018, + "grad_norm": 1.5200679302215576, + "learning_rate": 1.3080219868161565e-05, + "loss": 0.408, + "step": 18035 + }, + { + "epoch": 0.4004395067757295, + "grad_norm": 0.9305362105369568, + "learning_rate": 1.3076902426795639e-05, + "loss": 0.6147, + "step": 18040 + }, + { + "epoch": 0.4005504933352571, + "grad_norm": 0.6245304942131042, + "learning_rate": 1.3073584611350234e-05, + "loss": 0.3921, + "step": 18045 + }, + { + "epoch": 0.40066147989478473, + "grad_norm": 1.8916888236999512, + "learning_rate": 1.3070266422228717e-05, + "loss": 0.588, + "step": 18050 + }, + { + "epoch": 0.40077246645431236, + "grad_norm": 1.5298385620117188, + "learning_rate": 1.3066947859834507e-05, + "loss": 0.5068, + "step": 18055 + }, + { + "epoch": 0.40088345301384004, + "grad_norm": 1.188046932220459, + "learning_rate": 1.3063628924571061e-05, + "loss": 0.5606, + "step": 18060 + }, + { + "epoch": 0.40099443957336767, + "grad_norm": 1.1807680130004883, + "learning_rate": 1.3060309616841887e-05, + "loss": 0.4235, + "step": 18065 + }, + { + "epoch": 0.4011054261328953, + "grad_norm": 1.1999351978302002, + "learning_rate": 1.3056989937050532e-05, + "loss": 0.4043, + "step": 18070 + }, + { + "epoch": 0.401216412692423, + "grad_norm": 0.7646277546882629, + "learning_rate": 1.3053669885600592e-05, + "loss": 0.513, + "step": 18075 + }, + { + "epoch": 0.4013273992519506, + "grad_norm": 1.1868324279785156, + "learning_rate": 1.3050349462895711e-05, + "loss": 0.2457, + "step": 18080 + }, + { + "epoch": 0.4014383858114782, + "grad_norm": 1.2941502332687378, + "learning_rate": 1.3047028669339569e-05, + "loss": 0.4342, + "step": 18085 + }, + { + "epoch": 0.40154937237100585, + "grad_norm": 0.8198782205581665, + "learning_rate": 1.3043707505335905e-05, + "loss": 0.3848, + "step": 18090 + }, + { + "epoch": 0.40166035893053353, + "grad_norm": 1.302763819694519, + "learning_rate": 1.3040385971288486e-05, + "loss": 0.4415, + "step": 18095 + }, + { + "epoch": 0.40177134549006116, + "grad_norm": 1.5501524209976196, + "learning_rate": 1.3037064067601141e-05, + "loss": 0.4466, + "step": 18100 + }, + { + "epoch": 0.4018823320495888, + "grad_norm": 0.9922233819961548, + "learning_rate": 1.3033741794677728e-05, + "loss": 0.5519, + "step": 18105 + }, + { + "epoch": 0.4019933186091164, + "grad_norm": 0.9999378323554993, + "learning_rate": 1.3030419152922165e-05, + "loss": 0.316, + "step": 18110 + }, + { + "epoch": 0.4021043051686441, + "grad_norm": 1.383109211921692, + "learning_rate": 1.3027096142738404e-05, + "loss": 0.4103, + "step": 18115 + }, + { + "epoch": 0.4022152917281717, + "grad_norm": 1.4409019947052002, + "learning_rate": 1.3023772764530446e-05, + "loss": 0.4338, + "step": 18120 + }, + { + "epoch": 0.40232627828769935, + "grad_norm": 0.8448358774185181, + "learning_rate": 1.3020449018702337e-05, + "loss": 0.468, + "step": 18125 + }, + { + "epoch": 0.402437264847227, + "grad_norm": 1.57224702835083, + "learning_rate": 1.3017124905658164e-05, + "loss": 0.5316, + "step": 18130 + }, + { + "epoch": 0.40254825140675465, + "grad_norm": 1.8472278118133545, + "learning_rate": 1.3013800425802066e-05, + "loss": 0.4976, + "step": 18135 + }, + { + "epoch": 0.4026592379662823, + "grad_norm": 1.0904327630996704, + "learning_rate": 1.301047557953822e-05, + "loss": 0.5188, + "step": 18140 + }, + { + "epoch": 0.4027702245258099, + "grad_norm": 1.1760843992233276, + "learning_rate": 1.300715036727085e-05, + "loss": 0.6897, + "step": 18145 + }, + { + "epoch": 0.4028812110853376, + "grad_norm": 1.4732872247695923, + "learning_rate": 1.300382478940422e-05, + "loss": 0.2838, + "step": 18150 + }, + { + "epoch": 0.4029921976448652, + "grad_norm": 1.4299676418304443, + "learning_rate": 1.3000498846342646e-05, + "loss": 0.5486, + "step": 18155 + }, + { + "epoch": 0.40310318420439284, + "grad_norm": 1.4404714107513428, + "learning_rate": 1.2997172538490486e-05, + "loss": 0.48, + "step": 18160 + }, + { + "epoch": 0.40321417076392047, + "grad_norm": 1.078039526939392, + "learning_rate": 1.2993845866252142e-05, + "loss": 0.3799, + "step": 18165 + }, + { + "epoch": 0.40332515732344815, + "grad_norm": 1.5176726579666138, + "learning_rate": 1.2990518830032057e-05, + "loss": 0.3537, + "step": 18170 + }, + { + "epoch": 0.4034361438829758, + "grad_norm": 0.7993068695068359, + "learning_rate": 1.2987191430234724e-05, + "loss": 0.3949, + "step": 18175 + }, + { + "epoch": 0.4035471304425034, + "grad_norm": 0.994655430316925, + "learning_rate": 1.2983863667264673e-05, + "loss": 0.4176, + "step": 18180 + }, + { + "epoch": 0.4036581170020311, + "grad_norm": 1.150771975517273, + "learning_rate": 1.2980535541526487e-05, + "loss": 0.5571, + "step": 18185 + }, + { + "epoch": 0.4037691035615587, + "grad_norm": 1.3434624671936035, + "learning_rate": 1.2977207053424781e-05, + "loss": 0.7217, + "step": 18190 + }, + { + "epoch": 0.40388009012108633, + "grad_norm": 0.6132928133010864, + "learning_rate": 1.297387820336423e-05, + "loss": 0.4322, + "step": 18195 + }, + { + "epoch": 0.40399107668061396, + "grad_norm": 1.3462475538253784, + "learning_rate": 1.2970548991749538e-05, + "loss": 0.4018, + "step": 18200 + }, + { + "epoch": 0.40410206324014164, + "grad_norm": 1.393395185470581, + "learning_rate": 1.2967219418985466e-05, + "loss": 0.4862, + "step": 18205 + }, + { + "epoch": 0.40421304979966927, + "grad_norm": 1.1231212615966797, + "learning_rate": 1.2963889485476806e-05, + "loss": 0.3355, + "step": 18210 + }, + { + "epoch": 0.4043240363591969, + "grad_norm": 1.0703603029251099, + "learning_rate": 1.2960559191628403e-05, + "loss": 0.5421, + "step": 18215 + }, + { + "epoch": 0.4044350229187245, + "grad_norm": 1.6870838403701782, + "learning_rate": 1.2957228537845139e-05, + "loss": 0.5258, + "step": 18220 + }, + { + "epoch": 0.4045460094782522, + "grad_norm": 1.1466280221939087, + "learning_rate": 1.295389752453195e-05, + "loss": 0.4478, + "step": 18225 + }, + { + "epoch": 0.4046569960377798, + "grad_norm": 1.7697092294692993, + "learning_rate": 1.2950566152093808e-05, + "loss": 0.4538, + "step": 18230 + }, + { + "epoch": 0.40476798259730745, + "grad_norm": 1.9452863931655884, + "learning_rate": 1.2947234420935729e-05, + "loss": 0.4496, + "step": 18235 + }, + { + "epoch": 0.40487896915683513, + "grad_norm": 1.3969025611877441, + "learning_rate": 1.2943902331462771e-05, + "loss": 0.5327, + "step": 18240 + }, + { + "epoch": 0.40498995571636276, + "grad_norm": 1.274922251701355, + "learning_rate": 1.2940569884080044e-05, + "loss": 0.359, + "step": 18245 + }, + { + "epoch": 0.4051009422758904, + "grad_norm": 1.0482635498046875, + "learning_rate": 1.2937237079192688e-05, + "loss": 0.462, + "step": 18250 + }, + { + "epoch": 0.405211928835418, + "grad_norm": 0.9442659020423889, + "learning_rate": 1.2933903917205902e-05, + "loss": 0.3367, + "step": 18255 + }, + { + "epoch": 0.4053229153949457, + "grad_norm": 0.9624006748199463, + "learning_rate": 1.2930570398524915e-05, + "loss": 0.3738, + "step": 18260 + }, + { + "epoch": 0.4054339019544733, + "grad_norm": 1.4392725229263306, + "learning_rate": 1.2927236523555009e-05, + "loss": 0.346, + "step": 18265 + }, + { + "epoch": 0.40554488851400095, + "grad_norm": 1.205470323562622, + "learning_rate": 1.2923902292701502e-05, + "loss": 0.354, + "step": 18270 + }, + { + "epoch": 0.40565587507352857, + "grad_norm": 0.8593617677688599, + "learning_rate": 1.292056770636976e-05, + "loss": 0.5462, + "step": 18275 + }, + { + "epoch": 0.40576686163305625, + "grad_norm": 1.7613037824630737, + "learning_rate": 1.2917232764965194e-05, + "loss": 0.5203, + "step": 18280 + }, + { + "epoch": 0.4058778481925839, + "grad_norm": 1.4268548488616943, + "learning_rate": 1.2913897468893249e-05, + "loss": 0.565, + "step": 18285 + }, + { + "epoch": 0.4059888347521115, + "grad_norm": 1.5129190683364868, + "learning_rate": 1.2910561818559421e-05, + "loss": 0.5035, + "step": 18290 + }, + { + "epoch": 0.4060998213116392, + "grad_norm": 1.219570517539978, + "learning_rate": 1.2907225814369254e-05, + "loss": 0.418, + "step": 18295 + }, + { + "epoch": 0.4062108078711668, + "grad_norm": 1.4354437589645386, + "learning_rate": 1.2903889456728315e-05, + "loss": 0.3389, + "step": 18300 + }, + { + "epoch": 0.40632179443069444, + "grad_norm": 1.1195069551467896, + "learning_rate": 1.290055274604224e-05, + "loss": 0.5354, + "step": 18305 + }, + { + "epoch": 0.40643278099022206, + "grad_norm": 0.6017734408378601, + "learning_rate": 1.289721568271669e-05, + "loss": 0.5803, + "step": 18310 + }, + { + "epoch": 0.40654376754974975, + "grad_norm": 1.248761773109436, + "learning_rate": 1.2893878267157373e-05, + "loss": 0.5133, + "step": 18315 + }, + { + "epoch": 0.40665475410927737, + "grad_norm": 1.238337755203247, + "learning_rate": 1.2890540499770041e-05, + "loss": 0.6132, + "step": 18320 + }, + { + "epoch": 0.406765740668805, + "grad_norm": 1.2553455829620361, + "learning_rate": 1.2887202380960491e-05, + "loss": 0.3653, + "step": 18325 + }, + { + "epoch": 0.4068767272283326, + "grad_norm": 1.3628250360488892, + "learning_rate": 1.2883863911134553e-05, + "loss": 0.4313, + "step": 18330 + }, + { + "epoch": 0.4069877137878603, + "grad_norm": 1.0972529649734497, + "learning_rate": 1.2880525090698118e-05, + "loss": 0.3519, + "step": 18335 + }, + { + "epoch": 0.40709870034738793, + "grad_norm": 1.4026445150375366, + "learning_rate": 1.2877185920057101e-05, + "loss": 0.4353, + "step": 18340 + }, + { + "epoch": 0.40720968690691556, + "grad_norm": 1.2224009037017822, + "learning_rate": 1.2873846399617469e-05, + "loss": 0.3717, + "step": 18345 + }, + { + "epoch": 0.40732067346644324, + "grad_norm": 1.2186386585235596, + "learning_rate": 1.2870506529785232e-05, + "loss": 0.4258, + "step": 18350 + }, + { + "epoch": 0.40743166002597087, + "grad_norm": 1.1470417976379395, + "learning_rate": 1.2867166310966437e-05, + "loss": 0.5244, + "step": 18355 + }, + { + "epoch": 0.4075426465854985, + "grad_norm": 1.1212327480316162, + "learning_rate": 1.2863825743567174e-05, + "loss": 0.3101, + "step": 18360 + }, + { + "epoch": 0.4076536331450261, + "grad_norm": 1.2823984622955322, + "learning_rate": 1.2860484827993584e-05, + "loss": 0.4485, + "step": 18365 + }, + { + "epoch": 0.4077646197045538, + "grad_norm": 0.9786013960838318, + "learning_rate": 1.2857143564651845e-05, + "loss": 0.3765, + "step": 18370 + }, + { + "epoch": 0.4078756062640814, + "grad_norm": 0.9708194732666016, + "learning_rate": 1.2853801953948171e-05, + "loss": 0.3235, + "step": 18375 + }, + { + "epoch": 0.40798659282360905, + "grad_norm": 1.119699478149414, + "learning_rate": 1.2850459996288826e-05, + "loss": 0.5239, + "step": 18380 + }, + { + "epoch": 0.4080975793831367, + "grad_norm": 1.5546756982803345, + "learning_rate": 1.2847117692080115e-05, + "loss": 0.549, + "step": 18385 + }, + { + "epoch": 0.40820856594266436, + "grad_norm": 1.2498154640197754, + "learning_rate": 1.2843775041728384e-05, + "loss": 0.4397, + "step": 18390 + }, + { + "epoch": 0.408319552502192, + "grad_norm": 1.4244163036346436, + "learning_rate": 1.2840432045640018e-05, + "loss": 0.3831, + "step": 18395 + }, + { + "epoch": 0.4084305390617196, + "grad_norm": 1.7624845504760742, + "learning_rate": 1.2837088704221454e-05, + "loss": 0.3975, + "step": 18400 + }, + { + "epoch": 0.4085415256212473, + "grad_norm": 1.2871631383895874, + "learning_rate": 1.2833745017879158e-05, + "loss": 0.3667, + "step": 18405 + }, + { + "epoch": 0.4086525121807749, + "grad_norm": 1.5793527364730835, + "learning_rate": 1.2830400987019646e-05, + "loss": 0.5073, + "step": 18410 + }, + { + "epoch": 0.40876349874030254, + "grad_norm": 1.3336777687072754, + "learning_rate": 1.2827056612049473e-05, + "loss": 0.3303, + "step": 18415 + }, + { + "epoch": 0.40887448529983017, + "grad_norm": 1.7279059886932373, + "learning_rate": 1.2823711893375242e-05, + "loss": 0.4877, + "step": 18420 + }, + { + "epoch": 0.40898547185935785, + "grad_norm": 1.0418862104415894, + "learning_rate": 1.2820366831403587e-05, + "loss": 0.5104, + "step": 18425 + }, + { + "epoch": 0.4090964584188855, + "grad_norm": 1.9648549556732178, + "learning_rate": 1.2817021426541191e-05, + "loss": 0.3704, + "step": 18430 + }, + { + "epoch": 0.4092074449784131, + "grad_norm": 1.1906790733337402, + "learning_rate": 1.2813675679194779e-05, + "loss": 0.323, + "step": 18435 + }, + { + "epoch": 0.40931843153794073, + "grad_norm": 1.1181881427764893, + "learning_rate": 1.2810329589771114e-05, + "loss": 0.5711, + "step": 18440 + }, + { + "epoch": 0.4094294180974684, + "grad_norm": 1.0041084289550781, + "learning_rate": 1.2806983158677e-05, + "loss": 0.319, + "step": 18445 + }, + { + "epoch": 0.40954040465699604, + "grad_norm": 1.1120296716690063, + "learning_rate": 1.2803636386319288e-05, + "loss": 0.4846, + "step": 18450 + }, + { + "epoch": 0.40965139121652366, + "grad_norm": 0.7871121168136597, + "learning_rate": 1.2800289273104869e-05, + "loss": 0.3841, + "step": 18455 + }, + { + "epoch": 0.40976237777605135, + "grad_norm": 1.2689319849014282, + "learning_rate": 1.2796941819440671e-05, + "loss": 0.2925, + "step": 18460 + }, + { + "epoch": 0.40987336433557897, + "grad_norm": 1.0140430927276611, + "learning_rate": 1.2793594025733668e-05, + "loss": 0.3438, + "step": 18465 + }, + { + "epoch": 0.4099843508951066, + "grad_norm": 0.9305757880210876, + "learning_rate": 1.2790245892390868e-05, + "loss": 0.2941, + "step": 18470 + }, + { + "epoch": 0.4100953374546342, + "grad_norm": 1.158919334411621, + "learning_rate": 1.2786897419819335e-05, + "loss": 0.4539, + "step": 18475 + }, + { + "epoch": 0.4102063240141619, + "grad_norm": 0.8390790224075317, + "learning_rate": 1.278354860842616e-05, + "loss": 0.3339, + "step": 18480 + }, + { + "epoch": 0.41031731057368953, + "grad_norm": 1.2888376712799072, + "learning_rate": 1.2780199458618478e-05, + "loss": 0.2988, + "step": 18485 + }, + { + "epoch": 0.41042829713321716, + "grad_norm": 1.410506248474121, + "learning_rate": 1.2776849970803472e-05, + "loss": 0.4381, + "step": 18490 + }, + { + "epoch": 0.4105392836927448, + "grad_norm": 0.8905935883522034, + "learning_rate": 1.2773500145388358e-05, + "loss": 0.4926, + "step": 18495 + }, + { + "epoch": 0.41065027025227246, + "grad_norm": 1.7534258365631104, + "learning_rate": 1.2770149982780398e-05, + "loss": 0.4428, + "step": 18500 + }, + { + "epoch": 0.4107612568118001, + "grad_norm": 1.1103402376174927, + "learning_rate": 1.2766799483386894e-05, + "loss": 0.3413, + "step": 18505 + }, + { + "epoch": 0.4108722433713277, + "grad_norm": 1.3265504837036133, + "learning_rate": 1.276344864761519e-05, + "loss": 0.3296, + "step": 18510 + }, + { + "epoch": 0.4109832299308554, + "grad_norm": 1.7852246761322021, + "learning_rate": 1.2760097475872661e-05, + "loss": 0.4809, + "step": 18515 + }, + { + "epoch": 0.411094216490383, + "grad_norm": 1.636186957359314, + "learning_rate": 1.2756745968566743e-05, + "loss": 0.3488, + "step": 18520 + }, + { + "epoch": 0.41120520304991065, + "grad_norm": 0.9312105774879456, + "learning_rate": 1.2753394126104894e-05, + "loss": 0.5638, + "step": 18525 + }, + { + "epoch": 0.4113161896094383, + "grad_norm": 1.4691455364227295, + "learning_rate": 1.2750041948894621e-05, + "loss": 0.4399, + "step": 18530 + }, + { + "epoch": 0.41142717616896596, + "grad_norm": 1.0564097166061401, + "learning_rate": 1.274668943734347e-05, + "loss": 0.4582, + "step": 18535 + }, + { + "epoch": 0.4115381627284936, + "grad_norm": 1.2533094882965088, + "learning_rate": 1.274333659185903e-05, + "loss": 0.349, + "step": 18540 + }, + { + "epoch": 0.4116491492880212, + "grad_norm": 1.1874221563339233, + "learning_rate": 1.2739983412848926e-05, + "loss": 0.4179, + "step": 18545 + }, + { + "epoch": 0.41176013584754884, + "grad_norm": 1.1301754713058472, + "learning_rate": 1.2736629900720832e-05, + "loss": 0.3151, + "step": 18550 + }, + { + "epoch": 0.4118711224070765, + "grad_norm": 1.4599363803863525, + "learning_rate": 1.2733276055882446e-05, + "loss": 0.4141, + "step": 18555 + }, + { + "epoch": 0.41198210896660414, + "grad_norm": 1.0648486614227295, + "learning_rate": 1.272992187874153e-05, + "loss": 0.5459, + "step": 18560 + }, + { + "epoch": 0.41209309552613177, + "grad_norm": 1.3196961879730225, + "learning_rate": 1.2726567369705864e-05, + "loss": 0.4486, + "step": 18565 + }, + { + "epoch": 0.41220408208565945, + "grad_norm": 1.1008305549621582, + "learning_rate": 1.2723212529183285e-05, + "loss": 0.4476, + "step": 18570 + }, + { + "epoch": 0.4123150686451871, + "grad_norm": 1.0683090686798096, + "learning_rate": 1.2719857357581656e-05, + "loss": 0.4345, + "step": 18575 + }, + { + "epoch": 0.4124260552047147, + "grad_norm": 0.8868213295936584, + "learning_rate": 1.2716501855308892e-05, + "loss": 0.3925, + "step": 18580 + }, + { + "epoch": 0.41253704176424233, + "grad_norm": 1.6421164274215698, + "learning_rate": 1.2713146022772943e-05, + "loss": 0.3936, + "step": 18585 + }, + { + "epoch": 0.41264802832377, + "grad_norm": 1.1328929662704468, + "learning_rate": 1.27097898603818e-05, + "loss": 0.4957, + "step": 18590 + }, + { + "epoch": 0.41275901488329764, + "grad_norm": 0.95980304479599, + "learning_rate": 1.2706433368543494e-05, + "loss": 0.3633, + "step": 18595 + }, + { + "epoch": 0.41287000144282526, + "grad_norm": 0.9733328819274902, + "learning_rate": 1.2703076547666096e-05, + "loss": 0.3381, + "step": 18600 + }, + { + "epoch": 0.4129809880023529, + "grad_norm": 1.4620132446289062, + "learning_rate": 1.2699719398157715e-05, + "loss": 0.5159, + "step": 18605 + }, + { + "epoch": 0.41309197456188057, + "grad_norm": 1.0303751230239868, + "learning_rate": 1.2696361920426505e-05, + "loss": 0.4639, + "step": 18610 + }, + { + "epoch": 0.4132029611214082, + "grad_norm": 1.0312970876693726, + "learning_rate": 1.2693004114880654e-05, + "loss": 0.3479, + "step": 18615 + }, + { + "epoch": 0.4133139476809358, + "grad_norm": 1.142751693725586, + "learning_rate": 1.2689645981928395e-05, + "loss": 0.3001, + "step": 18620 + }, + { + "epoch": 0.4134249342404635, + "grad_norm": 0.9720245003700256, + "learning_rate": 1.2686287521978e-05, + "loss": 0.4678, + "step": 18625 + }, + { + "epoch": 0.41353592079999113, + "grad_norm": 0.8710252642631531, + "learning_rate": 1.2682928735437776e-05, + "loss": 0.4558, + "step": 18630 + }, + { + "epoch": 0.41364690735951876, + "grad_norm": 1.5070589780807495, + "learning_rate": 1.2679569622716075e-05, + "loss": 0.4395, + "step": 18635 + }, + { + "epoch": 0.4137578939190464, + "grad_norm": 1.492600917816162, + "learning_rate": 1.2676210184221285e-05, + "loss": 0.5579, + "step": 18640 + }, + { + "epoch": 0.41386888047857406, + "grad_norm": 0.7719084024429321, + "learning_rate": 1.2672850420361837e-05, + "loss": 0.4032, + "step": 18645 + }, + { + "epoch": 0.4139798670381017, + "grad_norm": 1.246576189994812, + "learning_rate": 1.2669490331546198e-05, + "loss": 0.6183, + "step": 18650 + }, + { + "epoch": 0.4140908535976293, + "grad_norm": 1.1218833923339844, + "learning_rate": 1.2666129918182876e-05, + "loss": 0.4348, + "step": 18655 + }, + { + "epoch": 0.41420184015715694, + "grad_norm": 0.9847343564033508, + "learning_rate": 1.2662769180680424e-05, + "loss": 0.4673, + "step": 18660 + }, + { + "epoch": 0.4143128267166846, + "grad_norm": 1.0954967737197876, + "learning_rate": 1.2659408119447422e-05, + "loss": 0.4823, + "step": 18665 + }, + { + "epoch": 0.41442381327621225, + "grad_norm": 1.2822248935699463, + "learning_rate": 1.2656046734892498e-05, + "loss": 0.5339, + "step": 18670 + }, + { + "epoch": 0.4145347998357399, + "grad_norm": 1.1070445775985718, + "learning_rate": 1.2652685027424324e-05, + "loss": 0.4468, + "step": 18675 + }, + { + "epoch": 0.41464578639526756, + "grad_norm": 1.1634217500686646, + "learning_rate": 1.2649322997451599e-05, + "loss": 0.5181, + "step": 18680 + }, + { + "epoch": 0.4147567729547952, + "grad_norm": 0.889788031578064, + "learning_rate": 1.2645960645383069e-05, + "loss": 0.3611, + "step": 18685 + }, + { + "epoch": 0.4148677595143228, + "grad_norm": 1.3153811693191528, + "learning_rate": 1.2642597971627518e-05, + "loss": 0.4127, + "step": 18690 + }, + { + "epoch": 0.41497874607385044, + "grad_norm": 0.9526509046554565, + "learning_rate": 1.2639234976593766e-05, + "loss": 0.474, + "step": 18695 + }, + { + "epoch": 0.4150897326333781, + "grad_norm": 1.2924917936325073, + "learning_rate": 1.2635871660690677e-05, + "loss": 0.4069, + "step": 18700 + }, + { + "epoch": 0.41520071919290574, + "grad_norm": 1.0292452573776245, + "learning_rate": 1.2632508024327152e-05, + "loss": 0.3366, + "step": 18705 + }, + { + "epoch": 0.41531170575243337, + "grad_norm": 1.0939360857009888, + "learning_rate": 1.2629144067912133e-05, + "loss": 0.4079, + "step": 18710 + }, + { + "epoch": 0.415422692311961, + "grad_norm": 1.4312463998794556, + "learning_rate": 1.2625779791854593e-05, + "loss": 0.4504, + "step": 18715 + }, + { + "epoch": 0.4155336788714887, + "grad_norm": 1.6537981033325195, + "learning_rate": 1.2622415196563554e-05, + "loss": 0.4369, + "step": 18720 + }, + { + "epoch": 0.4156446654310163, + "grad_norm": 1.350915789604187, + "learning_rate": 1.2619050282448067e-05, + "loss": 0.532, + "step": 18725 + }, + { + "epoch": 0.41575565199054393, + "grad_norm": 1.0188899040222168, + "learning_rate": 1.2615685049917233e-05, + "loss": 0.4303, + "step": 18730 + }, + { + "epoch": 0.4158666385500716, + "grad_norm": 1.0881189107894897, + "learning_rate": 1.2612319499380183e-05, + "loss": 0.2445, + "step": 18735 + }, + { + "epoch": 0.41597762510959924, + "grad_norm": 1.1773778200149536, + "learning_rate": 1.260895363124609e-05, + "loss": 0.6159, + "step": 18740 + }, + { + "epoch": 0.41608861166912686, + "grad_norm": 1.1648191213607788, + "learning_rate": 1.2605587445924164e-05, + "loss": 0.4181, + "step": 18745 + }, + { + "epoch": 0.4161995982286545, + "grad_norm": 1.302767276763916, + "learning_rate": 1.2602220943823654e-05, + "loss": 0.5449, + "step": 18750 + }, + { + "epoch": 0.41631058478818217, + "grad_norm": 0.9167510271072388, + "learning_rate": 1.2598854125353847e-05, + "loss": 0.3292, + "step": 18755 + }, + { + "epoch": 0.4164215713477098, + "grad_norm": 1.0387072563171387, + "learning_rate": 1.2595486990924075e-05, + "loss": 0.3367, + "step": 18760 + }, + { + "epoch": 0.4165325579072374, + "grad_norm": 1.5445338487625122, + "learning_rate": 1.2592119540943697e-05, + "loss": 0.4267, + "step": 18765 + }, + { + "epoch": 0.41664354446676505, + "grad_norm": 1.1272823810577393, + "learning_rate": 1.258875177582212e-05, + "loss": 0.4504, + "step": 18770 + }, + { + "epoch": 0.41675453102629273, + "grad_norm": 0.8140206933021545, + "learning_rate": 1.2585383695968782e-05, + "loss": 0.4006, + "step": 18775 + }, + { + "epoch": 0.41686551758582036, + "grad_norm": 1.2129508256912231, + "learning_rate": 1.2582015301793167e-05, + "loss": 0.5976, + "step": 18780 + }, + { + "epoch": 0.416976504145348, + "grad_norm": 1.2662301063537598, + "learning_rate": 1.2578646593704786e-05, + "loss": 0.4862, + "step": 18785 + }, + { + "epoch": 0.41708749070487566, + "grad_norm": 1.3142824172973633, + "learning_rate": 1.2575277572113205e-05, + "loss": 0.5799, + "step": 18790 + }, + { + "epoch": 0.4171984772644033, + "grad_norm": 1.0552418231964111, + "learning_rate": 1.2571908237428012e-05, + "loss": 0.3674, + "step": 18795 + }, + { + "epoch": 0.4173094638239309, + "grad_norm": 0.9380001425743103, + "learning_rate": 1.2568538590058844e-05, + "loss": 0.3359, + "step": 18800 + }, + { + "epoch": 0.41742045038345854, + "grad_norm": 1.519612193107605, + "learning_rate": 1.2565168630415366e-05, + "loss": 0.4007, + "step": 18805 + }, + { + "epoch": 0.4175314369429862, + "grad_norm": 1.1087913513183594, + "learning_rate": 1.2561798358907287e-05, + "loss": 0.3141, + "step": 18810 + }, + { + "epoch": 0.41764242350251385, + "grad_norm": 1.1452858448028564, + "learning_rate": 1.2558427775944357e-05, + "loss": 0.5899, + "step": 18815 + }, + { + "epoch": 0.4177534100620415, + "grad_norm": 1.290701150894165, + "learning_rate": 1.2555056881936359e-05, + "loss": 0.3658, + "step": 18820 + }, + { + "epoch": 0.4178643966215691, + "grad_norm": 1.1495695114135742, + "learning_rate": 1.2551685677293112e-05, + "loss": 0.392, + "step": 18825 + }, + { + "epoch": 0.4179753831810968, + "grad_norm": 1.0973929166793823, + "learning_rate": 1.2548314162424481e-05, + "loss": 0.4206, + "step": 18830 + }, + { + "epoch": 0.4180863697406244, + "grad_norm": 1.074574589729309, + "learning_rate": 1.254494233774036e-05, + "loss": 0.4358, + "step": 18835 + }, + { + "epoch": 0.41819735630015203, + "grad_norm": 1.424739956855774, + "learning_rate": 1.2541570203650681e-05, + "loss": 0.473, + "step": 18840 + }, + { + "epoch": 0.4183083428596797, + "grad_norm": 1.4399970769882202, + "learning_rate": 1.2538197760565425e-05, + "loss": 0.4795, + "step": 18845 + }, + { + "epoch": 0.41841932941920734, + "grad_norm": 1.6367219686508179, + "learning_rate": 1.2534825008894595e-05, + "loss": 0.4966, + "step": 18850 + }, + { + "epoch": 0.41853031597873497, + "grad_norm": 1.3322153091430664, + "learning_rate": 1.2531451949048243e-05, + "loss": 0.6716, + "step": 18855 + }, + { + "epoch": 0.4186413025382626, + "grad_norm": 1.3036226034164429, + "learning_rate": 1.2528078581436454e-05, + "loss": 0.5527, + "step": 18860 + }, + { + "epoch": 0.4187522890977903, + "grad_norm": 0.9684581756591797, + "learning_rate": 1.2524704906469347e-05, + "loss": 0.38, + "step": 18865 + }, + { + "epoch": 0.4188632756573179, + "grad_norm": 0.9556499719619751, + "learning_rate": 1.2521330924557087e-05, + "loss": 0.366, + "step": 18870 + }, + { + "epoch": 0.4189742622168455, + "grad_norm": 0.9147199392318726, + "learning_rate": 1.2517956636109867e-05, + "loss": 0.4608, + "step": 18875 + }, + { + "epoch": 0.41908524877637315, + "grad_norm": 1.5343172550201416, + "learning_rate": 1.2514582041537926e-05, + "loss": 0.417, + "step": 18880 + }, + { + "epoch": 0.41919623533590084, + "grad_norm": 1.2957946062088013, + "learning_rate": 1.2511207141251532e-05, + "loss": 0.6602, + "step": 18885 + }, + { + "epoch": 0.41930722189542846, + "grad_norm": 1.3479381799697876, + "learning_rate": 1.2507831935660995e-05, + "loss": 0.3779, + "step": 18890 + }, + { + "epoch": 0.4194182084549561, + "grad_norm": 1.4515689611434937, + "learning_rate": 1.2504456425176662e-05, + "loss": 0.5103, + "step": 18895 + }, + { + "epoch": 0.41952919501448377, + "grad_norm": 1.2327100038528442, + "learning_rate": 1.2501080610208915e-05, + "loss": 0.5223, + "step": 18900 + }, + { + "epoch": 0.4196401815740114, + "grad_norm": 1.5770394802093506, + "learning_rate": 1.2497704491168178e-05, + "loss": 0.4282, + "step": 18905 + }, + { + "epoch": 0.419751168133539, + "grad_norm": 1.4935261011123657, + "learning_rate": 1.2494328068464907e-05, + "loss": 0.4712, + "step": 18910 + }, + { + "epoch": 0.41986215469306665, + "grad_norm": 1.3158011436462402, + "learning_rate": 1.2490951342509592e-05, + "loss": 0.3454, + "step": 18915 + }, + { + "epoch": 0.41997314125259433, + "grad_norm": 0.869740903377533, + "learning_rate": 1.2487574313712766e-05, + "loss": 0.4122, + "step": 18920 + }, + { + "epoch": 0.42008412781212195, + "grad_norm": 0.9241218566894531, + "learning_rate": 1.2484196982484997e-05, + "loss": 0.3949, + "step": 18925 + }, + { + "epoch": 0.4201951143716496, + "grad_norm": 1.3600034713745117, + "learning_rate": 1.2480819349236895e-05, + "loss": 0.5165, + "step": 18930 + }, + { + "epoch": 0.4203061009311772, + "grad_norm": 1.205753207206726, + "learning_rate": 1.2477441414379093e-05, + "loss": 0.5086, + "step": 18935 + }, + { + "epoch": 0.4204170874907049, + "grad_norm": 1.6294773817062378, + "learning_rate": 1.2474063178322274e-05, + "loss": 0.3416, + "step": 18940 + }, + { + "epoch": 0.4205280740502325, + "grad_norm": 1.817699909210205, + "learning_rate": 1.247068464147715e-05, + "loss": 0.3767, + "step": 18945 + }, + { + "epoch": 0.42063906060976014, + "grad_norm": 1.5021382570266724, + "learning_rate": 1.2467305804254472e-05, + "loss": 0.3796, + "step": 18950 + }, + { + "epoch": 0.4207500471692878, + "grad_norm": 1.0630805492401123, + "learning_rate": 1.2463926667065031e-05, + "loss": 0.5853, + "step": 18955 + }, + { + "epoch": 0.42086103372881545, + "grad_norm": 0.8353287577629089, + "learning_rate": 1.246054723031965e-05, + "loss": 0.2753, + "step": 18960 + }, + { + "epoch": 0.4209720202883431, + "grad_norm": 0.8969990611076355, + "learning_rate": 1.2457167494429187e-05, + "loss": 0.4887, + "step": 18965 + }, + { + "epoch": 0.4210830068478707, + "grad_norm": 1.1911489963531494, + "learning_rate": 1.2453787459804543e-05, + "loss": 0.4958, + "step": 18970 + }, + { + "epoch": 0.4211939934073984, + "grad_norm": 0.843773365020752, + "learning_rate": 1.2450407126856648e-05, + "loss": 0.4585, + "step": 18975 + }, + { + "epoch": 0.421304979966926, + "grad_norm": 0.9726853370666504, + "learning_rate": 1.2447026495996469e-05, + "loss": 0.2542, + "step": 18980 + }, + { + "epoch": 0.42141596652645363, + "grad_norm": 0.9821330904960632, + "learning_rate": 1.2443645567635018e-05, + "loss": 0.3909, + "step": 18985 + }, + { + "epoch": 0.4215269530859813, + "grad_norm": 1.09699285030365, + "learning_rate": 1.2440264342183335e-05, + "loss": 0.351, + "step": 18990 + }, + { + "epoch": 0.42163793964550894, + "grad_norm": 1.2256817817687988, + "learning_rate": 1.2436882820052498e-05, + "loss": 0.4693, + "step": 18995 + }, + { + "epoch": 0.42174892620503657, + "grad_norm": 1.5253905057907104, + "learning_rate": 1.2433501001653618e-05, + "loss": 0.451, + "step": 19000 + }, + { + "epoch": 0.4218599127645642, + "grad_norm": 1.5205445289611816, + "learning_rate": 1.243011888739785e-05, + "loss": 0.3386, + "step": 19005 + }, + { + "epoch": 0.4219708993240919, + "grad_norm": 1.7538121938705444, + "learning_rate": 1.2426736477696378e-05, + "loss": 0.5199, + "step": 19010 + }, + { + "epoch": 0.4220818858836195, + "grad_norm": 0.802723228931427, + "learning_rate": 1.2423353772960421e-05, + "loss": 0.4092, + "step": 19015 + }, + { + "epoch": 0.4221928724431471, + "grad_norm": 1.3640711307525635, + "learning_rate": 1.2419970773601241e-05, + "loss": 0.4255, + "step": 19020 + }, + { + "epoch": 0.42230385900267475, + "grad_norm": 0.8367384076118469, + "learning_rate": 1.241658748003013e-05, + "loss": 0.3952, + "step": 19025 + }, + { + "epoch": 0.42241484556220243, + "grad_norm": 1.389653205871582, + "learning_rate": 1.241320389265842e-05, + "loss": 0.3961, + "step": 19030 + }, + { + "epoch": 0.42252583212173006, + "grad_norm": 1.5635536909103394, + "learning_rate": 1.240982001189747e-05, + "loss": 0.4853, + "step": 19035 + }, + { + "epoch": 0.4226368186812577, + "grad_norm": 1.2197387218475342, + "learning_rate": 1.2406435838158686e-05, + "loss": 0.478, + "step": 19040 + }, + { + "epoch": 0.42274780524078537, + "grad_norm": 0.9387150406837463, + "learning_rate": 1.2403051371853502e-05, + "loss": 0.4831, + "step": 19045 + }, + { + "epoch": 0.422858791800313, + "grad_norm": 1.2293328046798706, + "learning_rate": 1.2399666613393396e-05, + "loss": 0.4918, + "step": 19050 + }, + { + "epoch": 0.4229697783598406, + "grad_norm": 1.0011482238769531, + "learning_rate": 1.2396281563189867e-05, + "loss": 0.5591, + "step": 19055 + }, + { + "epoch": 0.42308076491936825, + "grad_norm": 1.3648799657821655, + "learning_rate": 1.2392896221654465e-05, + "loss": 0.5343, + "step": 19060 + }, + { + "epoch": 0.42319175147889593, + "grad_norm": 1.37348210811615, + "learning_rate": 1.238951058919876e-05, + "loss": 0.6581, + "step": 19065 + }, + { + "epoch": 0.42330273803842355, + "grad_norm": 1.1474547386169434, + "learning_rate": 1.2386124666234377e-05, + "loss": 0.6939, + "step": 19070 + }, + { + "epoch": 0.4234137245979512, + "grad_norm": 1.3844075202941895, + "learning_rate": 1.2382738453172957e-05, + "loss": 0.4332, + "step": 19075 + }, + { + "epoch": 0.4235247111574788, + "grad_norm": 1.0813393592834473, + "learning_rate": 1.2379351950426188e-05, + "loss": 0.4339, + "step": 19080 + }, + { + "epoch": 0.4236356977170065, + "grad_norm": 0.8828607797622681, + "learning_rate": 1.2375965158405789e-05, + "loss": 0.3954, + "step": 19085 + }, + { + "epoch": 0.4237466842765341, + "grad_norm": 1.3992273807525635, + "learning_rate": 1.2372578077523514e-05, + "loss": 0.4668, + "step": 19090 + }, + { + "epoch": 0.42385767083606174, + "grad_norm": 1.2180794477462769, + "learning_rate": 1.2369190708191151e-05, + "loss": 0.5227, + "step": 19095 + }, + { + "epoch": 0.4239686573955894, + "grad_norm": 1.0402294397354126, + "learning_rate": 1.2365803050820531e-05, + "loss": 0.4719, + "step": 19100 + }, + { + "epoch": 0.42407964395511705, + "grad_norm": 1.536978006362915, + "learning_rate": 1.2362415105823509e-05, + "loss": 0.5169, + "step": 19105 + }, + { + "epoch": 0.4241906305146447, + "grad_norm": 1.119295358657837, + "learning_rate": 1.2359026873611981e-05, + "loss": 0.5108, + "step": 19110 + }, + { + "epoch": 0.4243016170741723, + "grad_norm": 0.9652313590049744, + "learning_rate": 1.2355638354597878e-05, + "loss": 0.4447, + "step": 19115 + }, + { + "epoch": 0.4244126036337, + "grad_norm": 1.04342520236969, + "learning_rate": 1.2352249549193165e-05, + "loss": 0.4986, + "step": 19120 + }, + { + "epoch": 0.4245235901932276, + "grad_norm": 1.2141095399856567, + "learning_rate": 1.234886045780984e-05, + "loss": 0.5623, + "step": 19125 + }, + { + "epoch": 0.42463457675275523, + "grad_norm": 1.7910752296447754, + "learning_rate": 1.2345471080859937e-05, + "loss": 0.469, + "step": 19130 + }, + { + "epoch": 0.42474556331228286, + "grad_norm": 1.4534924030303955, + "learning_rate": 1.2342081418755525e-05, + "loss": 0.4874, + "step": 19135 + }, + { + "epoch": 0.42485654987181054, + "grad_norm": 0.9207220673561096, + "learning_rate": 1.233869147190871e-05, + "loss": 0.3644, + "step": 19140 + }, + { + "epoch": 0.42496753643133817, + "grad_norm": 0.8127537369728088, + "learning_rate": 1.233530124073163e-05, + "loss": 0.4096, + "step": 19145 + }, + { + "epoch": 0.4250785229908658, + "grad_norm": 1.811837911605835, + "learning_rate": 1.2331910725636455e-05, + "loss": 0.5663, + "step": 19150 + }, + { + "epoch": 0.4251895095503935, + "grad_norm": 1.11298668384552, + "learning_rate": 1.2328519927035396e-05, + "loss": 0.3821, + "step": 19155 + }, + { + "epoch": 0.4253004961099211, + "grad_norm": 1.6024359464645386, + "learning_rate": 1.2325128845340696e-05, + "loss": 0.5324, + "step": 19160 + }, + { + "epoch": 0.4254114826694487, + "grad_norm": 1.2056422233581543, + "learning_rate": 1.2321737480964625e-05, + "loss": 0.3337, + "step": 19165 + }, + { + "epoch": 0.42552246922897635, + "grad_norm": 0.8898048400878906, + "learning_rate": 1.2318345834319501e-05, + "loss": 0.4795, + "step": 19170 + }, + { + "epoch": 0.42563345578850403, + "grad_norm": 1.0445302724838257, + "learning_rate": 1.2314953905817662e-05, + "loss": 0.4654, + "step": 19175 + }, + { + "epoch": 0.42574444234803166, + "grad_norm": 1.4014710187911987, + "learning_rate": 1.231156169587149e-05, + "loss": 0.5228, + "step": 19180 + }, + { + "epoch": 0.4258554289075593, + "grad_norm": 1.1380759477615356, + "learning_rate": 1.2308169204893403e-05, + "loss": 0.4072, + "step": 19185 + }, + { + "epoch": 0.4259664154670869, + "grad_norm": 1.5649347305297852, + "learning_rate": 1.2304776433295844e-05, + "loss": 0.2593, + "step": 19190 + }, + { + "epoch": 0.4260774020266146, + "grad_norm": 1.3048160076141357, + "learning_rate": 1.2301383381491297e-05, + "loss": 0.3555, + "step": 19195 + }, + { + "epoch": 0.4261883885861422, + "grad_norm": 1.260620355606079, + "learning_rate": 1.2297990049892274e-05, + "loss": 0.3316, + "step": 19200 + }, + { + "epoch": 0.42629937514566985, + "grad_norm": 0.98924720287323, + "learning_rate": 1.2294596438911328e-05, + "loss": 0.6583, + "step": 19205 + }, + { + "epoch": 0.4264103617051975, + "grad_norm": 1.6030290126800537, + "learning_rate": 1.2291202548961042e-05, + "loss": 0.4335, + "step": 19210 + }, + { + "epoch": 0.42652134826472515, + "grad_norm": 1.4659713506698608, + "learning_rate": 1.2287808380454038e-05, + "loss": 0.3872, + "step": 19215 + }, + { + "epoch": 0.4266323348242528, + "grad_norm": 1.2156850099563599, + "learning_rate": 1.2284413933802961e-05, + "loss": 0.3163, + "step": 19220 + }, + { + "epoch": 0.4267433213837804, + "grad_norm": 1.2371267080307007, + "learning_rate": 1.2281019209420502e-05, + "loss": 0.4625, + "step": 19225 + }, + { + "epoch": 0.4268543079433081, + "grad_norm": 1.0696591138839722, + "learning_rate": 1.2277624207719373e-05, + "loss": 0.3984, + "step": 19230 + }, + { + "epoch": 0.4269652945028357, + "grad_norm": 1.0581618547439575, + "learning_rate": 1.2274228929112336e-05, + "loss": 0.4227, + "step": 19235 + }, + { + "epoch": 0.42707628106236334, + "grad_norm": 0.9675800800323486, + "learning_rate": 1.227083337401217e-05, + "loss": 0.6256, + "step": 19240 + }, + { + "epoch": 0.42718726762189096, + "grad_norm": 1.111681342124939, + "learning_rate": 1.22674375428317e-05, + "loss": 0.4658, + "step": 19245 + }, + { + "epoch": 0.42729825418141865, + "grad_norm": 1.5244219303131104, + "learning_rate": 1.2264041435983776e-05, + "loss": 0.4315, + "step": 19250 + }, + { + "epoch": 0.4274092407409463, + "grad_norm": 0.9702281355857849, + "learning_rate": 1.2260645053881288e-05, + "loss": 0.4759, + "step": 19255 + }, + { + "epoch": 0.4275202273004739, + "grad_norm": 0.8706420063972473, + "learning_rate": 1.2257248396937156e-05, + "loss": 0.457, + "step": 19260 + }, + { + "epoch": 0.4276312138600016, + "grad_norm": 1.9086284637451172, + "learning_rate": 1.2253851465564333e-05, + "loss": 0.5152, + "step": 19265 + }, + { + "epoch": 0.4277422004195292, + "grad_norm": 1.522344946861267, + "learning_rate": 1.2250454260175809e-05, + "loss": 0.6024, + "step": 19270 + }, + { + "epoch": 0.42785318697905683, + "grad_norm": 1.0553314685821533, + "learning_rate": 1.2247056781184604e-05, + "loss": 0.3522, + "step": 19275 + }, + { + "epoch": 0.42796417353858446, + "grad_norm": 1.4162936210632324, + "learning_rate": 1.2243659029003769e-05, + "loss": 0.3912, + "step": 19280 + }, + { + "epoch": 0.42807516009811214, + "grad_norm": 1.8890293836593628, + "learning_rate": 1.2240261004046397e-05, + "loss": 0.4187, + "step": 19285 + }, + { + "epoch": 0.42818614665763977, + "grad_norm": 1.1863138675689697, + "learning_rate": 1.2236862706725603e-05, + "loss": 0.3151, + "step": 19290 + }, + { + "epoch": 0.4282971332171674, + "grad_norm": 0.9996054172515869, + "learning_rate": 1.2233464137454542e-05, + "loss": 0.4022, + "step": 19295 + }, + { + "epoch": 0.428408119776695, + "grad_norm": 2.290318489074707, + "learning_rate": 1.2230065296646406e-05, + "loss": 0.4952, + "step": 19300 + }, + { + "epoch": 0.4285191063362227, + "grad_norm": 0.921788215637207, + "learning_rate": 1.2226666184714409e-05, + "loss": 0.3769, + "step": 19305 + }, + { + "epoch": 0.4286300928957503, + "grad_norm": 1.0003505945205688, + "learning_rate": 1.2223266802071802e-05, + "loss": 0.379, + "step": 19310 + }, + { + "epoch": 0.42874107945527795, + "grad_norm": 1.5152891874313354, + "learning_rate": 1.2219867149131876e-05, + "loss": 0.5854, + "step": 19315 + }, + { + "epoch": 0.42885206601480563, + "grad_norm": 1.3230036497116089, + "learning_rate": 1.2216467226307944e-05, + "loss": 0.4412, + "step": 19320 + }, + { + "epoch": 0.42896305257433326, + "grad_norm": 1.2009028196334839, + "learning_rate": 1.2213067034013363e-05, + "loss": 0.3158, + "step": 19325 + }, + { + "epoch": 0.4290740391338609, + "grad_norm": 1.3071898221969604, + "learning_rate": 1.2209666572661515e-05, + "loss": 0.4378, + "step": 19330 + }, + { + "epoch": 0.4291850256933885, + "grad_norm": 0.9393930435180664, + "learning_rate": 1.2206265842665814e-05, + "loss": 0.4835, + "step": 19335 + }, + { + "epoch": 0.4292960122529162, + "grad_norm": 1.3003790378570557, + "learning_rate": 1.220286484443971e-05, + "loss": 0.4946, + "step": 19340 + }, + { + "epoch": 0.4294069988124438, + "grad_norm": 1.2015130519866943, + "learning_rate": 1.2199463578396688e-05, + "loss": 0.4431, + "step": 19345 + }, + { + "epoch": 0.42951798537197144, + "grad_norm": 1.0020374059677124, + "learning_rate": 1.2196062044950259e-05, + "loss": 0.2633, + "step": 19350 + }, + { + "epoch": 0.42962897193149907, + "grad_norm": 0.7587225437164307, + "learning_rate": 1.2192660244513971e-05, + "loss": 0.4256, + "step": 19355 + }, + { + "epoch": 0.42973995849102675, + "grad_norm": 1.2856824398040771, + "learning_rate": 1.2189258177501406e-05, + "loss": 0.4935, + "step": 19360 + }, + { + "epoch": 0.4298509450505544, + "grad_norm": 0.6996091604232788, + "learning_rate": 1.2185855844326174e-05, + "loss": 0.45, + "step": 19365 + }, + { + "epoch": 0.429961931610082, + "grad_norm": 1.1408183574676514, + "learning_rate": 1.218245324540192e-05, + "loss": 0.493, + "step": 19370 + }, + { + "epoch": 0.4300729181696097, + "grad_norm": 1.2416564226150513, + "learning_rate": 1.2179050381142319e-05, + "loss": 0.6317, + "step": 19375 + }, + { + "epoch": 0.4301839047291373, + "grad_norm": 1.2081810235977173, + "learning_rate": 1.217564725196108e-05, + "loss": 0.3698, + "step": 19380 + }, + { + "epoch": 0.43029489128866494, + "grad_norm": 1.1831772327423096, + "learning_rate": 1.2172243858271944e-05, + "loss": 0.5561, + "step": 19385 + }, + { + "epoch": 0.43040587784819256, + "grad_norm": 1.0908524990081787, + "learning_rate": 1.2168840200488686e-05, + "loss": 0.5915, + "step": 19390 + }, + { + "epoch": 0.43051686440772025, + "grad_norm": 1.0368895530700684, + "learning_rate": 1.216543627902511e-05, + "loss": 0.6316, + "step": 19395 + }, + { + "epoch": 0.43062785096724787, + "grad_norm": 1.4189674854278564, + "learning_rate": 1.2162032094295052e-05, + "loss": 0.4226, + "step": 19400 + }, + { + "epoch": 0.4307388375267755, + "grad_norm": 0.9197772741317749, + "learning_rate": 1.2158627646712384e-05, + "loss": 0.3057, + "step": 19405 + }, + { + "epoch": 0.4308498240863031, + "grad_norm": 1.11489737033844, + "learning_rate": 1.2155222936691007e-05, + "loss": 0.4414, + "step": 19410 + }, + { + "epoch": 0.4309608106458308, + "grad_norm": 0.9230841398239136, + "learning_rate": 1.2151817964644852e-05, + "loss": 0.3631, + "step": 19415 + }, + { + "epoch": 0.43107179720535843, + "grad_norm": 1.5710375308990479, + "learning_rate": 1.2148412730987887e-05, + "loss": 0.446, + "step": 19420 + }, + { + "epoch": 0.43118278376488606, + "grad_norm": 0.9980664849281311, + "learning_rate": 1.2145007236134108e-05, + "loss": 0.1992, + "step": 19425 + }, + { + "epoch": 0.43129377032441374, + "grad_norm": 1.850558876991272, + "learning_rate": 1.214160148049754e-05, + "loss": 0.3733, + "step": 19430 + }, + { + "epoch": 0.43140475688394136, + "grad_norm": 1.4530669450759888, + "learning_rate": 1.2138195464492246e-05, + "loss": 0.5507, + "step": 19435 + }, + { + "epoch": 0.431515743443469, + "grad_norm": 1.877859115600586, + "learning_rate": 1.2134789188532322e-05, + "loss": 0.4828, + "step": 19440 + }, + { + "epoch": 0.4316267300029966, + "grad_norm": 0.936596691608429, + "learning_rate": 1.2131382653031887e-05, + "loss": 0.6039, + "step": 19445 + }, + { + "epoch": 0.4317377165625243, + "grad_norm": 1.5196179151535034, + "learning_rate": 1.2127975858405096e-05, + "loss": 0.5006, + "step": 19450 + }, + { + "epoch": 0.4318487031220519, + "grad_norm": 1.0282752513885498, + "learning_rate": 1.2124568805066137e-05, + "loss": 0.4285, + "step": 19455 + }, + { + "epoch": 0.43195968968157955, + "grad_norm": 0.9904384016990662, + "learning_rate": 1.2121161493429225e-05, + "loss": 0.3089, + "step": 19460 + }, + { + "epoch": 0.4320706762411072, + "grad_norm": 1.1516846418380737, + "learning_rate": 1.2117753923908617e-05, + "loss": 0.4541, + "step": 19465 + }, + { + "epoch": 0.43218166280063486, + "grad_norm": 1.0985053777694702, + "learning_rate": 1.211434609691859e-05, + "loss": 0.5971, + "step": 19470 + }, + { + "epoch": 0.4322926493601625, + "grad_norm": 1.2207547426223755, + "learning_rate": 1.2110938012873453e-05, + "loss": 0.545, + "step": 19475 + }, + { + "epoch": 0.4324036359196901, + "grad_norm": 1.536851406097412, + "learning_rate": 1.2107529672187552e-05, + "loss": 0.5405, + "step": 19480 + }, + { + "epoch": 0.4325146224792178, + "grad_norm": 1.8614027500152588, + "learning_rate": 1.2104121075275263e-05, + "loss": 0.5477, + "step": 19485 + }, + { + "epoch": 0.4326256090387454, + "grad_norm": 1.1792525053024292, + "learning_rate": 1.210071222255099e-05, + "loss": 0.5541, + "step": 19490 + }, + { + "epoch": 0.43273659559827304, + "grad_norm": 1.6490572690963745, + "learning_rate": 1.2097303114429169e-05, + "loss": 0.4629, + "step": 19495 + }, + { + "epoch": 0.43284758215780067, + "grad_norm": 1.219548225402832, + "learning_rate": 1.209389375132427e-05, + "loss": 0.4757, + "step": 19500 + }, + { + "epoch": 0.43295856871732835, + "grad_norm": 1.2101728916168213, + "learning_rate": 1.2090484133650791e-05, + "loss": 0.3978, + "step": 19505 + }, + { + "epoch": 0.433069555276856, + "grad_norm": 0.8404216766357422, + "learning_rate": 1.2087074261823261e-05, + "loss": 0.5797, + "step": 19510 + }, + { + "epoch": 0.4331805418363836, + "grad_norm": 1.5160115957260132, + "learning_rate": 1.2083664136256243e-05, + "loss": 0.4582, + "step": 19515 + }, + { + "epoch": 0.43329152839591123, + "grad_norm": 2.2496602535247803, + "learning_rate": 1.2080253757364327e-05, + "loss": 0.352, + "step": 19520 + }, + { + "epoch": 0.4334025149554389, + "grad_norm": 0.9950528740882874, + "learning_rate": 1.2076843125562135e-05, + "loss": 0.3788, + "step": 19525 + }, + { + "epoch": 0.43351350151496654, + "grad_norm": 0.9879639148712158, + "learning_rate": 1.2073432241264322e-05, + "loss": 0.4802, + "step": 19530 + }, + { + "epoch": 0.43362448807449416, + "grad_norm": 1.0998682975769043, + "learning_rate": 1.2070021104885571e-05, + "loss": 0.4783, + "step": 19535 + }, + { + "epoch": 0.43373547463402184, + "grad_norm": 1.5225579738616943, + "learning_rate": 1.2066609716840595e-05, + "loss": 0.2778, + "step": 19540 + }, + { + "epoch": 0.43384646119354947, + "grad_norm": 1.4201195240020752, + "learning_rate": 1.206319807754414e-05, + "loss": 0.521, + "step": 19545 + }, + { + "epoch": 0.4339574477530771, + "grad_norm": 1.3159350156784058, + "learning_rate": 1.2059786187410984e-05, + "loss": 0.4117, + "step": 19550 + }, + { + "epoch": 0.4340684343126047, + "grad_norm": 1.2883739471435547, + "learning_rate": 1.2056374046855932e-05, + "loss": 0.3565, + "step": 19555 + }, + { + "epoch": 0.4341794208721324, + "grad_norm": 1.8150126934051514, + "learning_rate": 1.205296165629382e-05, + "loss": 0.4719, + "step": 19560 + }, + { + "epoch": 0.43429040743166003, + "grad_norm": 1.3383867740631104, + "learning_rate": 1.2049549016139513e-05, + "loss": 0.3684, + "step": 19565 + }, + { + "epoch": 0.43440139399118766, + "grad_norm": 1.4191855192184448, + "learning_rate": 1.2046136126807913e-05, + "loss": 0.3516, + "step": 19570 + }, + { + "epoch": 0.4345123805507153, + "grad_norm": 1.0860313177108765, + "learning_rate": 1.204272298871394e-05, + "loss": 0.4765, + "step": 19575 + }, + { + "epoch": 0.43462336711024296, + "grad_norm": 0.9528011679649353, + "learning_rate": 1.203930960227256e-05, + "loss": 0.6358, + "step": 19580 + }, + { + "epoch": 0.4347343536697706, + "grad_norm": 1.3005439043045044, + "learning_rate": 1.203589596789876e-05, + "loss": 0.3419, + "step": 19585 + }, + { + "epoch": 0.4348453402292982, + "grad_norm": 1.0321930646896362, + "learning_rate": 1.203248208600755e-05, + "loss": 0.3879, + "step": 19590 + }, + { + "epoch": 0.4349563267888259, + "grad_norm": 1.3699491024017334, + "learning_rate": 1.2029067957013992e-05, + "loss": 0.4039, + "step": 19595 + }, + { + "epoch": 0.4350673133483535, + "grad_norm": 1.516921877861023, + "learning_rate": 1.2025653581333149e-05, + "loss": 0.4654, + "step": 19600 + }, + { + "epoch": 0.43517829990788115, + "grad_norm": 1.036165714263916, + "learning_rate": 1.2022238959380142e-05, + "loss": 0.3697, + "step": 19605 + }, + { + "epoch": 0.4352892864674088, + "grad_norm": 1.4566125869750977, + "learning_rate": 1.2018824091570103e-05, + "loss": 0.3274, + "step": 19610 + }, + { + "epoch": 0.43540027302693646, + "grad_norm": 1.4865602254867554, + "learning_rate": 1.2015408978318201e-05, + "loss": 0.4777, + "step": 19615 + }, + { + "epoch": 0.4355112595864641, + "grad_norm": 0.8841688632965088, + "learning_rate": 1.2011993620039637e-05, + "loss": 0.5436, + "step": 19620 + }, + { + "epoch": 0.4356222461459917, + "grad_norm": 1.1306862831115723, + "learning_rate": 1.2008578017149634e-05, + "loss": 0.571, + "step": 19625 + }, + { + "epoch": 0.43573323270551934, + "grad_norm": 1.3157212734222412, + "learning_rate": 1.2005162170063454e-05, + "loss": 0.5216, + "step": 19630 + }, + { + "epoch": 0.435844219265047, + "grad_norm": 0.6145315170288086, + "learning_rate": 1.2001746079196381e-05, + "loss": 0.4309, + "step": 19635 + }, + { + "epoch": 0.43595520582457464, + "grad_norm": 1.297216534614563, + "learning_rate": 1.1998329744963733e-05, + "loss": 0.483, + "step": 19640 + }, + { + "epoch": 0.43606619238410227, + "grad_norm": 1.5347627401351929, + "learning_rate": 1.1994913167780857e-05, + "loss": 0.4924, + "step": 19645 + }, + { + "epoch": 0.43617717894362995, + "grad_norm": 1.5418304204940796, + "learning_rate": 1.1991496348063127e-05, + "loss": 0.4475, + "step": 19650 + }, + { + "epoch": 0.4362881655031576, + "grad_norm": 1.326873540878296, + "learning_rate": 1.1988079286225954e-05, + "loss": 0.4511, + "step": 19655 + }, + { + "epoch": 0.4363991520626852, + "grad_norm": 1.0645344257354736, + "learning_rate": 1.1984661982684763e-05, + "loss": 0.4377, + "step": 19660 + }, + { + "epoch": 0.43651013862221283, + "grad_norm": 1.4390782117843628, + "learning_rate": 1.1981244437855027e-05, + "loss": 0.3424, + "step": 19665 + }, + { + "epoch": 0.4366211251817405, + "grad_norm": 1.2534408569335938, + "learning_rate": 1.1977826652152235e-05, + "loss": 0.4035, + "step": 19670 + }, + { + "epoch": 0.43673211174126814, + "grad_norm": 0.9360421895980835, + "learning_rate": 1.1974408625991916e-05, + "loss": 0.347, + "step": 19675 + }, + { + "epoch": 0.43684309830079576, + "grad_norm": 1.5838125944137573, + "learning_rate": 1.1970990359789616e-05, + "loss": 0.4984, + "step": 19680 + }, + { + "epoch": 0.4369540848603234, + "grad_norm": 1.7336652278900146, + "learning_rate": 1.1967571853960916e-05, + "loss": 0.4712, + "step": 19685 + }, + { + "epoch": 0.43706507141985107, + "grad_norm": 1.5626726150512695, + "learning_rate": 1.196415310892143e-05, + "loss": 0.2976, + "step": 19690 + }, + { + "epoch": 0.4371760579793787, + "grad_norm": 1.5406701564788818, + "learning_rate": 1.1960734125086797e-05, + "loss": 0.4693, + "step": 19695 + }, + { + "epoch": 0.4372870445389063, + "grad_norm": 0.7011148929595947, + "learning_rate": 1.1957314902872686e-05, + "loss": 0.5277, + "step": 19700 + }, + { + "epoch": 0.437398031098434, + "grad_norm": 1.4581482410430908, + "learning_rate": 1.1953895442694789e-05, + "loss": 0.4809, + "step": 19705 + }, + { + "epoch": 0.43750901765796163, + "grad_norm": 1.0448225736618042, + "learning_rate": 1.1950475744968842e-05, + "loss": 0.457, + "step": 19710 + }, + { + "epoch": 0.43762000421748926, + "grad_norm": 0.9020761251449585, + "learning_rate": 1.1947055810110591e-05, + "loss": 0.3995, + "step": 19715 + }, + { + "epoch": 0.4377309907770169, + "grad_norm": 1.1282302141189575, + "learning_rate": 1.1943635638535827e-05, + "loss": 0.5794, + "step": 19720 + }, + { + "epoch": 0.43784197733654456, + "grad_norm": 1.2756569385528564, + "learning_rate": 1.1940215230660362e-05, + "loss": 0.641, + "step": 19725 + }, + { + "epoch": 0.4379529638960722, + "grad_norm": 1.2321871519088745, + "learning_rate": 1.1936794586900033e-05, + "loss": 0.4981, + "step": 19730 + }, + { + "epoch": 0.4380639504555998, + "grad_norm": 1.5373114347457886, + "learning_rate": 1.1933373707670714e-05, + "loss": 0.417, + "step": 19735 + }, + { + "epoch": 0.43817493701512744, + "grad_norm": 0.9480398297309875, + "learning_rate": 1.1929952593388307e-05, + "loss": 0.4449, + "step": 19740 + }, + { + "epoch": 0.4382859235746551, + "grad_norm": 1.2366368770599365, + "learning_rate": 1.1926531244468733e-05, + "loss": 0.3991, + "step": 19745 + }, + { + "epoch": 0.43839691013418275, + "grad_norm": 0.8997697234153748, + "learning_rate": 1.1923109661327954e-05, + "loss": 0.4469, + "step": 19750 + }, + { + "epoch": 0.4385078966937104, + "grad_norm": 1.6303012371063232, + "learning_rate": 1.191968784438195e-05, + "loss": 0.4095, + "step": 19755 + }, + { + "epoch": 0.43861888325323806, + "grad_norm": 1.4117064476013184, + "learning_rate": 1.1916265794046738e-05, + "loss": 0.4293, + "step": 19760 + }, + { + "epoch": 0.4387298698127657, + "grad_norm": 0.8353791832923889, + "learning_rate": 1.1912843510738355e-05, + "loss": 0.4011, + "step": 19765 + }, + { + "epoch": 0.4388408563722933, + "grad_norm": 1.3969166278839111, + "learning_rate": 1.1909420994872871e-05, + "loss": 0.3568, + "step": 19770 + }, + { + "epoch": 0.43895184293182093, + "grad_norm": 1.2692325115203857, + "learning_rate": 1.190599824686639e-05, + "loss": 0.3129, + "step": 19775 + }, + { + "epoch": 0.4390628294913486, + "grad_norm": 0.765633761882782, + "learning_rate": 1.1902575267135035e-05, + "loss": 0.4564, + "step": 19780 + }, + { + "epoch": 0.43917381605087624, + "grad_norm": 0.9388693571090698, + "learning_rate": 1.1899152056094958e-05, + "loss": 0.4518, + "step": 19785 + }, + { + "epoch": 0.43928480261040387, + "grad_norm": 1.4808331727981567, + "learning_rate": 1.1895728614162343e-05, + "loss": 0.4618, + "step": 19790 + }, + { + "epoch": 0.4393957891699315, + "grad_norm": 0.7243686318397522, + "learning_rate": 1.1892304941753403e-05, + "loss": 0.388, + "step": 19795 + }, + { + "epoch": 0.4395067757294592, + "grad_norm": 1.2276579141616821, + "learning_rate": 1.188888103928437e-05, + "loss": 0.3931, + "step": 19800 + }, + { + "epoch": 0.4396177622889868, + "grad_norm": 1.0823590755462646, + "learning_rate": 1.1885456907171517e-05, + "loss": 0.5161, + "step": 19805 + }, + { + "epoch": 0.43972874884851443, + "grad_norm": 0.8903690576553345, + "learning_rate": 1.188203254583114e-05, + "loss": 0.3907, + "step": 19810 + }, + { + "epoch": 0.4398397354080421, + "grad_norm": 1.7568098306655884, + "learning_rate": 1.1878607955679555e-05, + "loss": 0.5345, + "step": 19815 + }, + { + "epoch": 0.43995072196756974, + "grad_norm": 1.1826956272125244, + "learning_rate": 1.1875183137133114e-05, + "loss": 0.4775, + "step": 19820 + }, + { + "epoch": 0.44006170852709736, + "grad_norm": 1.2964848279953003, + "learning_rate": 1.1871758090608199e-05, + "loss": 0.3452, + "step": 19825 + }, + { + "epoch": 0.440172695086625, + "grad_norm": 0.9392102360725403, + "learning_rate": 1.1868332816521208e-05, + "loss": 0.3947, + "step": 19830 + }, + { + "epoch": 0.44028368164615267, + "grad_norm": 0.9794422388076782, + "learning_rate": 1.1864907315288585e-05, + "loss": 0.4547, + "step": 19835 + }, + { + "epoch": 0.4403946682056803, + "grad_norm": 0.8995769023895264, + "learning_rate": 1.1861481587326782e-05, + "loss": 0.5318, + "step": 19840 + }, + { + "epoch": 0.4405056547652079, + "grad_norm": 1.2317299842834473, + "learning_rate": 1.1858055633052292e-05, + "loss": 0.4632, + "step": 19845 + }, + { + "epoch": 0.44061664132473555, + "grad_norm": 1.25813627243042, + "learning_rate": 1.1854629452881628e-05, + "loss": 0.488, + "step": 19850 + }, + { + "epoch": 0.44072762788426323, + "grad_norm": 1.1181604862213135, + "learning_rate": 1.1851203047231337e-05, + "loss": 0.3962, + "step": 19855 + }, + { + "epoch": 0.44083861444379085, + "grad_norm": 1.2032124996185303, + "learning_rate": 1.1847776416517987e-05, + "loss": 0.4725, + "step": 19860 + }, + { + "epoch": 0.4409496010033185, + "grad_norm": 0.9831379652023315, + "learning_rate": 1.1844349561158176e-05, + "loss": 0.3965, + "step": 19865 + }, + { + "epoch": 0.44106058756284616, + "grad_norm": 1.1643702983856201, + "learning_rate": 1.1840922481568531e-05, + "loss": 0.4862, + "step": 19870 + }, + { + "epoch": 0.4411715741223738, + "grad_norm": 1.2106220722198486, + "learning_rate": 1.1837495178165706e-05, + "loss": 0.3527, + "step": 19875 + }, + { + "epoch": 0.4412825606819014, + "grad_norm": 1.1105222702026367, + "learning_rate": 1.1834067651366379e-05, + "loss": 0.5157, + "step": 19880 + }, + { + "epoch": 0.44139354724142904, + "grad_norm": 1.241958737373352, + "learning_rate": 1.1830639901587256e-05, + "loss": 0.4843, + "step": 19885 + }, + { + "epoch": 0.4415045338009567, + "grad_norm": 1.510998249053955, + "learning_rate": 1.1827211929245075e-05, + "loss": 0.4317, + "step": 19890 + }, + { + "epoch": 0.44161552036048435, + "grad_norm": 0.9848299622535706, + "learning_rate": 1.1823783734756598e-05, + "loss": 0.367, + "step": 19895 + }, + { + "epoch": 0.441726506920012, + "grad_norm": 0.8628853559494019, + "learning_rate": 1.1820355318538608e-05, + "loss": 0.5189, + "step": 19900 + }, + { + "epoch": 0.4418374934795396, + "grad_norm": 1.2922097444534302, + "learning_rate": 1.1816926681007925e-05, + "loss": 0.4615, + "step": 19905 + }, + { + "epoch": 0.4419484800390673, + "grad_norm": 1.526689887046814, + "learning_rate": 1.1813497822581388e-05, + "loss": 0.4063, + "step": 19910 + }, + { + "epoch": 0.4420594665985949, + "grad_norm": 2.21928334236145, + "learning_rate": 1.1810068743675866e-05, + "loss": 0.3148, + "step": 19915 + }, + { + "epoch": 0.44217045315812253, + "grad_norm": 1.0265175104141235, + "learning_rate": 1.180663944470826e-05, + "loss": 0.5255, + "step": 19920 + }, + { + "epoch": 0.4422814397176502, + "grad_norm": 1.005786418914795, + "learning_rate": 1.1803209926095489e-05, + "loss": 0.4243, + "step": 19925 + }, + { + "epoch": 0.44239242627717784, + "grad_norm": 1.3304482698440552, + "learning_rate": 1.1799780188254504e-05, + "loss": 0.5117, + "step": 19930 + }, + { + "epoch": 0.44250341283670547, + "grad_norm": 1.0991795063018799, + "learning_rate": 1.1796350231602278e-05, + "loss": 0.3571, + "step": 19935 + }, + { + "epoch": 0.4426143993962331, + "grad_norm": 0.973331868648529, + "learning_rate": 1.1792920056555812e-05, + "loss": 0.328, + "step": 19940 + }, + { + "epoch": 0.4427253859557608, + "grad_norm": 0.9038109183311462, + "learning_rate": 1.1789489663532142e-05, + "loss": 0.4416, + "step": 19945 + }, + { + "epoch": 0.4428363725152884, + "grad_norm": 1.6230500936508179, + "learning_rate": 1.178605905294832e-05, + "loss": 0.5181, + "step": 19950 + }, + { + "epoch": 0.442947359074816, + "grad_norm": 1.0690028667449951, + "learning_rate": 1.1782628225221429e-05, + "loss": 0.2601, + "step": 19955 + }, + { + "epoch": 0.44305834563434365, + "grad_norm": 1.025299072265625, + "learning_rate": 1.1779197180768575e-05, + "loss": 0.4718, + "step": 19960 + }, + { + "epoch": 0.44316933219387133, + "grad_norm": 1.301763653755188, + "learning_rate": 1.1775765920006898e-05, + "loss": 0.4844, + "step": 19965 + }, + { + "epoch": 0.44328031875339896, + "grad_norm": 0.6717504262924194, + "learning_rate": 1.177233444335355e-05, + "loss": 0.3417, + "step": 19970 + }, + { + "epoch": 0.4433913053129266, + "grad_norm": 0.9575620889663696, + "learning_rate": 1.176890275122573e-05, + "loss": 0.4368, + "step": 19975 + }, + { + "epoch": 0.44350229187245427, + "grad_norm": 1.4361292123794556, + "learning_rate": 1.1765470844040645e-05, + "loss": 0.339, + "step": 19980 + }, + { + "epoch": 0.4436132784319819, + "grad_norm": 1.0805773735046387, + "learning_rate": 1.1762038722215533e-05, + "loss": 0.4149, + "step": 19985 + }, + { + "epoch": 0.4437242649915095, + "grad_norm": 0.904903769493103, + "learning_rate": 1.1758606386167666e-05, + "loss": 0.3387, + "step": 19990 + }, + { + "epoch": 0.44383525155103715, + "grad_norm": 1.5252553224563599, + "learning_rate": 1.1755173836314331e-05, + "loss": 0.4768, + "step": 19995 + }, + { + "epoch": 0.44394623811056483, + "grad_norm": 0.7814958691596985, + "learning_rate": 1.1751741073072846e-05, + "loss": 0.4594, + "step": 20000 + }, + { + "epoch": 0.44405722467009245, + "grad_norm": 0.8147719502449036, + "learning_rate": 1.174830809686056e-05, + "loss": 0.4328, + "step": 20005 + }, + { + "epoch": 0.4441682112296201, + "grad_norm": 1.363770604133606, + "learning_rate": 1.1744874908094835e-05, + "loss": 0.4362, + "step": 20010 + }, + { + "epoch": 0.44427919778914776, + "grad_norm": 1.544438362121582, + "learning_rate": 1.1741441507193073e-05, + "loss": 0.6222, + "step": 20015 + }, + { + "epoch": 0.4443901843486754, + "grad_norm": 1.065306305885315, + "learning_rate": 1.1738007894572691e-05, + "loss": 0.3821, + "step": 20020 + }, + { + "epoch": 0.444501170908203, + "grad_norm": 1.0254193544387817, + "learning_rate": 1.1734574070651137e-05, + "loss": 0.3301, + "step": 20025 + }, + { + "epoch": 0.44461215746773064, + "grad_norm": 0.7435585856437683, + "learning_rate": 1.1731140035845887e-05, + "loss": 0.3025, + "step": 20030 + }, + { + "epoch": 0.4447231440272583, + "grad_norm": 0.9800282716751099, + "learning_rate": 1.1727705790574437e-05, + "loss": 0.319, + "step": 20035 + }, + { + "epoch": 0.44483413058678595, + "grad_norm": 1.4175928831100464, + "learning_rate": 1.1724271335254312e-05, + "loss": 0.5204, + "step": 20040 + }, + { + "epoch": 0.4449451171463136, + "grad_norm": 1.1740878820419312, + "learning_rate": 1.172083667030306e-05, + "loss": 0.4353, + "step": 20045 + }, + { + "epoch": 0.4450561037058412, + "grad_norm": 2.3309788703918457, + "learning_rate": 1.1717401796138256e-05, + "loss": 0.3666, + "step": 20050 + }, + { + "epoch": 0.4451670902653689, + "grad_norm": 1.2700037956237793, + "learning_rate": 1.17139667131775e-05, + "loss": 0.5611, + "step": 20055 + }, + { + "epoch": 0.4452780768248965, + "grad_norm": 0.7723988890647888, + "learning_rate": 1.1710531421838422e-05, + "loss": 0.333, + "step": 20060 + }, + { + "epoch": 0.44538906338442413, + "grad_norm": 1.2870498895645142, + "learning_rate": 1.170709592253867e-05, + "loss": 0.3527, + "step": 20065 + }, + { + "epoch": 0.4455000499439518, + "grad_norm": 1.2130166292190552, + "learning_rate": 1.170366021569592e-05, + "loss": 0.4413, + "step": 20070 + }, + { + "epoch": 0.44561103650347944, + "grad_norm": 0.8243342041969299, + "learning_rate": 1.1700224301727877e-05, + "loss": 0.5166, + "step": 20075 + }, + { + "epoch": 0.44572202306300707, + "grad_norm": 1.8635729551315308, + "learning_rate": 1.1696788181052263e-05, + "loss": 0.5611, + "step": 20080 + }, + { + "epoch": 0.4458330096225347, + "grad_norm": 0.9250847697257996, + "learning_rate": 1.1693351854086833e-05, + "loss": 0.3723, + "step": 20085 + }, + { + "epoch": 0.4459439961820624, + "grad_norm": 1.3857841491699219, + "learning_rate": 1.1689915321249364e-05, + "loss": 0.5136, + "step": 20090 + }, + { + "epoch": 0.44605498274159, + "grad_norm": 1.1505711078643799, + "learning_rate": 1.1686478582957657e-05, + "loss": 0.4465, + "step": 20095 + }, + { + "epoch": 0.4461659693011176, + "grad_norm": 1.2375531196594238, + "learning_rate": 1.168304163962954e-05, + "loss": 0.5294, + "step": 20100 + }, + { + "epoch": 0.44627695586064525, + "grad_norm": 1.0499058961868286, + "learning_rate": 1.1679604491682865e-05, + "loss": 0.5367, + "step": 20105 + }, + { + "epoch": 0.44638794242017293, + "grad_norm": 0.9685129523277283, + "learning_rate": 1.167616713953551e-05, + "loss": 0.4059, + "step": 20110 + }, + { + "epoch": 0.44649892897970056, + "grad_norm": 1.0699036121368408, + "learning_rate": 1.1672729583605376e-05, + "loss": 0.4751, + "step": 20115 + }, + { + "epoch": 0.4466099155392282, + "grad_norm": 1.2520678043365479, + "learning_rate": 1.1669291824310388e-05, + "loss": 0.4419, + "step": 20120 + }, + { + "epoch": 0.44672090209875587, + "grad_norm": 1.1786726713180542, + "learning_rate": 1.16658538620685e-05, + "loss": 0.512, + "step": 20125 + }, + { + "epoch": 0.4468318886582835, + "grad_norm": 0.972960352897644, + "learning_rate": 1.1662415697297685e-05, + "loss": 0.4704, + "step": 20130 + }, + { + "epoch": 0.4469428752178111, + "grad_norm": 1.3304414749145508, + "learning_rate": 1.1658977330415943e-05, + "loss": 0.2956, + "step": 20135 + }, + { + "epoch": 0.44705386177733875, + "grad_norm": 1.6226036548614502, + "learning_rate": 1.16555387618413e-05, + "loss": 0.6865, + "step": 20140 + }, + { + "epoch": 0.4471648483368664, + "grad_norm": 1.115662932395935, + "learning_rate": 1.165209999199181e-05, + "loss": 0.4262, + "step": 20145 + }, + { + "epoch": 0.44727583489639405, + "grad_norm": 1.3321551084518433, + "learning_rate": 1.1648661021285544e-05, + "loss": 0.4129, + "step": 20150 + }, + { + "epoch": 0.4473868214559217, + "grad_norm": 0.812921941280365, + "learning_rate": 1.1645221850140598e-05, + "loss": 0.563, + "step": 20155 + }, + { + "epoch": 0.4474978080154493, + "grad_norm": 1.030159831047058, + "learning_rate": 1.1641782478975099e-05, + "loss": 0.4808, + "step": 20160 + }, + { + "epoch": 0.447608794574977, + "grad_norm": 0.8396998047828674, + "learning_rate": 1.1638342908207191e-05, + "loss": 0.481, + "step": 20165 + }, + { + "epoch": 0.4477197811345046, + "grad_norm": 0.9397024512290955, + "learning_rate": 1.1634903138255045e-05, + "loss": 0.4517, + "step": 20170 + }, + { + "epoch": 0.44783076769403224, + "grad_norm": 1.6640818119049072, + "learning_rate": 1.163146316953686e-05, + "loss": 0.4182, + "step": 20175 + }, + { + "epoch": 0.4479417542535599, + "grad_norm": 1.1481603384017944, + "learning_rate": 1.1628023002470857e-05, + "loss": 0.5254, + "step": 20180 + }, + { + "epoch": 0.44805274081308755, + "grad_norm": 1.1192456483840942, + "learning_rate": 1.1624582637475274e-05, + "loss": 0.4221, + "step": 20185 + }, + { + "epoch": 0.4481637273726152, + "grad_norm": 1.560524582862854, + "learning_rate": 1.1621142074968385e-05, + "loss": 0.4385, + "step": 20190 + }, + { + "epoch": 0.4482747139321428, + "grad_norm": 1.255010724067688, + "learning_rate": 1.1617701315368478e-05, + "loss": 0.4787, + "step": 20195 + }, + { + "epoch": 0.4483857004916705, + "grad_norm": 0.8947778344154358, + "learning_rate": 1.1614260359093869e-05, + "loss": 0.5235, + "step": 20200 + }, + { + "epoch": 0.4484966870511981, + "grad_norm": 1.2940188646316528, + "learning_rate": 1.1610819206562902e-05, + "loss": 0.4674, + "step": 20205 + }, + { + "epoch": 0.44860767361072573, + "grad_norm": 1.851446270942688, + "learning_rate": 1.1607377858193938e-05, + "loss": 0.3948, + "step": 20210 + }, + { + "epoch": 0.44871866017025336, + "grad_norm": 0.9747987389564514, + "learning_rate": 1.1603936314405365e-05, + "loss": 0.3417, + "step": 20215 + }, + { + "epoch": 0.44882964672978104, + "grad_norm": 1.1763579845428467, + "learning_rate": 1.160049457561559e-05, + "loss": 0.4035, + "step": 20220 + }, + { + "epoch": 0.44894063328930867, + "grad_norm": 1.1478216648101807, + "learning_rate": 1.1597052642243054e-05, + "loss": 0.6614, + "step": 20225 + }, + { + "epoch": 0.4490516198488363, + "grad_norm": 1.6329827308654785, + "learning_rate": 1.1593610514706217e-05, + "loss": 0.4021, + "step": 20230 + }, + { + "epoch": 0.449162606408364, + "grad_norm": 1.0295683145523071, + "learning_rate": 1.1590168193423557e-05, + "loss": 0.5769, + "step": 20235 + }, + { + "epoch": 0.4492735929678916, + "grad_norm": 1.0577813386917114, + "learning_rate": 1.158672567881358e-05, + "loss": 0.5333, + "step": 20240 + }, + { + "epoch": 0.4493845795274192, + "grad_norm": 1.0659865140914917, + "learning_rate": 1.1583282971294816e-05, + "loss": 0.468, + "step": 20245 + }, + { + "epoch": 0.44949556608694685, + "grad_norm": 1.8417197465896606, + "learning_rate": 1.1579840071285817e-05, + "loss": 0.4453, + "step": 20250 + }, + { + "epoch": 0.44960655264647453, + "grad_norm": 1.0317349433898926, + "learning_rate": 1.1576396979205162e-05, + "loss": 0.3537, + "step": 20255 + }, + { + "epoch": 0.44971753920600216, + "grad_norm": 0.8817946910858154, + "learning_rate": 1.1572953695471449e-05, + "loss": 0.4846, + "step": 20260 + }, + { + "epoch": 0.4498285257655298, + "grad_norm": 1.245755672454834, + "learning_rate": 1.1569510220503304e-05, + "loss": 0.5292, + "step": 20265 + }, + { + "epoch": 0.4499395123250574, + "grad_norm": 1.0409419536590576, + "learning_rate": 1.1566066554719366e-05, + "loss": 0.2456, + "step": 20270 + }, + { + "epoch": 0.4500504988845851, + "grad_norm": 1.1645506620407104, + "learning_rate": 1.156262269853831e-05, + "loss": 0.3571, + "step": 20275 + }, + { + "epoch": 0.4501614854441127, + "grad_norm": 1.2852460145950317, + "learning_rate": 1.1559178652378826e-05, + "loss": 0.503, + "step": 20280 + }, + { + "epoch": 0.45027247200364034, + "grad_norm": 1.27859628200531, + "learning_rate": 1.1555734416659632e-05, + "loss": 0.4348, + "step": 20285 + }, + { + "epoch": 0.450383458563168, + "grad_norm": 1.7184007167816162, + "learning_rate": 1.1552289991799466e-05, + "loss": 0.4694, + "step": 20290 + }, + { + "epoch": 0.45049444512269565, + "grad_norm": 1.8372256755828857, + "learning_rate": 1.1548845378217086e-05, + "loss": 0.426, + "step": 20295 + }, + { + "epoch": 0.4506054316822233, + "grad_norm": 1.3854246139526367, + "learning_rate": 1.154540057633128e-05, + "loss": 0.4741, + "step": 20300 + }, + { + "epoch": 0.4507164182417509, + "grad_norm": 1.5537136793136597, + "learning_rate": 1.1541955586560855e-05, + "loss": 0.5976, + "step": 20305 + }, + { + "epoch": 0.4508274048012786, + "grad_norm": 0.862808346748352, + "learning_rate": 1.1538510409324642e-05, + "loss": 0.4937, + "step": 20310 + }, + { + "epoch": 0.4509383913608062, + "grad_norm": 1.1472190618515015, + "learning_rate": 1.1535065045041492e-05, + "loss": 0.3002, + "step": 20315 + }, + { + "epoch": 0.45104937792033384, + "grad_norm": 0.9795900583267212, + "learning_rate": 1.1531619494130283e-05, + "loss": 0.3339, + "step": 20320 + }, + { + "epoch": 0.45116036447986146, + "grad_norm": 1.3006950616836548, + "learning_rate": 1.1528173757009913e-05, + "loss": 0.5872, + "step": 20325 + }, + { + "epoch": 0.45127135103938915, + "grad_norm": 1.2611806392669678, + "learning_rate": 1.1524727834099304e-05, + "loss": 0.4502, + "step": 20330 + }, + { + "epoch": 0.45138233759891677, + "grad_norm": 0.8779106736183167, + "learning_rate": 1.1521281725817393e-05, + "loss": 0.3637, + "step": 20335 + }, + { + "epoch": 0.4514933241584444, + "grad_norm": 1.3904417753219604, + "learning_rate": 1.1517835432583156e-05, + "loss": 0.455, + "step": 20340 + }, + { + "epoch": 0.4516043107179721, + "grad_norm": 1.4688048362731934, + "learning_rate": 1.1514388954815576e-05, + "loss": 0.559, + "step": 20345 + }, + { + "epoch": 0.4517152972774997, + "grad_norm": 1.2480225563049316, + "learning_rate": 1.1510942292933667e-05, + "loss": 0.3412, + "step": 20350 + }, + { + "epoch": 0.45182628383702733, + "grad_norm": 1.2750505208969116, + "learning_rate": 1.1507495447356462e-05, + "loss": 0.3771, + "step": 20355 + }, + { + "epoch": 0.45193727039655496, + "grad_norm": 0.7525187730789185, + "learning_rate": 1.1504048418503016e-05, + "loss": 0.4345, + "step": 20360 + }, + { + "epoch": 0.45204825695608264, + "grad_norm": 1.280582308769226, + "learning_rate": 1.1500601206792405e-05, + "loss": 0.3903, + "step": 20365 + }, + { + "epoch": 0.45215924351561027, + "grad_norm": 1.211562156677246, + "learning_rate": 1.1497153812643735e-05, + "loss": 0.3751, + "step": 20370 + }, + { + "epoch": 0.4522702300751379, + "grad_norm": 1.9452505111694336, + "learning_rate": 1.1493706236476123e-05, + "loss": 0.4946, + "step": 20375 + }, + { + "epoch": 0.4523812166346655, + "grad_norm": 0.9842869639396667, + "learning_rate": 1.1490258478708718e-05, + "loss": 0.3214, + "step": 20380 + }, + { + "epoch": 0.4524922031941932, + "grad_norm": 0.8979201912879944, + "learning_rate": 1.1486810539760684e-05, + "loss": 0.4406, + "step": 20385 + }, + { + "epoch": 0.4526031897537208, + "grad_norm": 1.5928150415420532, + "learning_rate": 1.1483362420051211e-05, + "loss": 0.4085, + "step": 20390 + }, + { + "epoch": 0.45271417631324845, + "grad_norm": 1.6551522016525269, + "learning_rate": 1.1479914119999508e-05, + "loss": 0.3635, + "step": 20395 + }, + { + "epoch": 0.45282516287277613, + "grad_norm": 1.5658823251724243, + "learning_rate": 1.1476465640024814e-05, + "loss": 0.4973, + "step": 20400 + }, + { + "epoch": 0.45293614943230376, + "grad_norm": 1.3159431219100952, + "learning_rate": 1.1473016980546377e-05, + "loss": 0.4794, + "step": 20405 + }, + { + "epoch": 0.4530471359918314, + "grad_norm": 2.1857190132141113, + "learning_rate": 1.1469568141983476e-05, + "loss": 0.4942, + "step": 20410 + }, + { + "epoch": 0.453158122551359, + "grad_norm": 1.0008069276809692, + "learning_rate": 1.1466119124755407e-05, + "loss": 0.4464, + "step": 20415 + }, + { + "epoch": 0.4532691091108867, + "grad_norm": 1.4019452333450317, + "learning_rate": 1.1462669929281491e-05, + "loss": 0.1733, + "step": 20420 + }, + { + "epoch": 0.4533800956704143, + "grad_norm": 1.0772136449813843, + "learning_rate": 1.1459220555981075e-05, + "loss": 0.4846, + "step": 20425 + }, + { + "epoch": 0.45349108222994194, + "grad_norm": 1.3797098398208618, + "learning_rate": 1.1455771005273516e-05, + "loss": 0.461, + "step": 20430 + }, + { + "epoch": 0.45360206878946957, + "grad_norm": 1.1715055704116821, + "learning_rate": 1.1452321277578197e-05, + "loss": 0.4189, + "step": 20435 + }, + { + "epoch": 0.45371305534899725, + "grad_norm": 1.3069055080413818, + "learning_rate": 1.1448871373314532e-05, + "loss": 0.5456, + "step": 20440 + }, + { + "epoch": 0.4538240419085249, + "grad_norm": 2.144012928009033, + "learning_rate": 1.1445421292901943e-05, + "loss": 0.4793, + "step": 20445 + }, + { + "epoch": 0.4539350284680525, + "grad_norm": 0.875789225101471, + "learning_rate": 1.144197103675988e-05, + "loss": 0.4647, + "step": 20450 + }, + { + "epoch": 0.4540460150275802, + "grad_norm": 1.1556012630462646, + "learning_rate": 1.1438520605307815e-05, + "loss": 0.3527, + "step": 20455 + }, + { + "epoch": 0.4541570015871078, + "grad_norm": 0.8506287932395935, + "learning_rate": 1.1435069998965239e-05, + "loss": 0.2872, + "step": 20460 + }, + { + "epoch": 0.45426798814663544, + "grad_norm": 0.9600818157196045, + "learning_rate": 1.1431619218151666e-05, + "loss": 0.4843, + "step": 20465 + }, + { + "epoch": 0.45437897470616306, + "grad_norm": 0.8915436267852783, + "learning_rate": 1.142816826328663e-05, + "loss": 0.5575, + "step": 20470 + }, + { + "epoch": 0.45448996126569075, + "grad_norm": 1.268429160118103, + "learning_rate": 1.1424717134789685e-05, + "loss": 0.3821, + "step": 20475 + }, + { + "epoch": 0.45460094782521837, + "grad_norm": 1.1212326288223267, + "learning_rate": 1.142126583308041e-05, + "loss": 0.3212, + "step": 20480 + }, + { + "epoch": 0.454711934384746, + "grad_norm": 1.3861826658248901, + "learning_rate": 1.14178143585784e-05, + "loss": 0.4671, + "step": 20485 + }, + { + "epoch": 0.4548229209442736, + "grad_norm": 1.224517822265625, + "learning_rate": 1.1414362711703277e-05, + "loss": 0.4713, + "step": 20490 + }, + { + "epoch": 0.4549339075038013, + "grad_norm": 1.119930624961853, + "learning_rate": 1.141091089287468e-05, + "loss": 0.4724, + "step": 20495 + }, + { + "epoch": 0.45504489406332893, + "grad_norm": 1.015840768814087, + "learning_rate": 1.1407458902512268e-05, + "loss": 0.4609, + "step": 20500 + }, + { + "epoch": 0.45515588062285656, + "grad_norm": 0.9772009253501892, + "learning_rate": 1.1404006741035718e-05, + "loss": 0.4207, + "step": 20505 + }, + { + "epoch": 0.45526686718238424, + "grad_norm": 0.799676775932312, + "learning_rate": 1.1400554408864741e-05, + "loss": 0.3784, + "step": 20510 + }, + { + "epoch": 0.45537785374191186, + "grad_norm": 1.0177497863769531, + "learning_rate": 1.1397101906419056e-05, + "loss": 0.5354, + "step": 20515 + }, + { + "epoch": 0.4554888403014395, + "grad_norm": 1.6851955652236938, + "learning_rate": 1.1393649234118407e-05, + "loss": 0.4534, + "step": 20520 + }, + { + "epoch": 0.4555998268609671, + "grad_norm": 1.011048436164856, + "learning_rate": 1.139019639238256e-05, + "loss": 0.4355, + "step": 20525 + }, + { + "epoch": 0.4557108134204948, + "grad_norm": 1.0171626806259155, + "learning_rate": 1.1386743381631296e-05, + "loss": 0.556, + "step": 20530 + }, + { + "epoch": 0.4558217999800224, + "grad_norm": 1.0538846254348755, + "learning_rate": 1.1383290202284418e-05, + "loss": 0.5416, + "step": 20535 + }, + { + "epoch": 0.45593278653955005, + "grad_norm": 1.385033369064331, + "learning_rate": 1.1379836854761761e-05, + "loss": 0.5533, + "step": 20540 + }, + { + "epoch": 0.4560437730990777, + "grad_norm": 0.8482803702354431, + "learning_rate": 1.1376383339483165e-05, + "loss": 0.2547, + "step": 20545 + }, + { + "epoch": 0.45615475965860536, + "grad_norm": 1.0015593767166138, + "learning_rate": 1.1372929656868501e-05, + "loss": 0.4572, + "step": 20550 + }, + { + "epoch": 0.456265746218133, + "grad_norm": 1.6447019577026367, + "learning_rate": 1.1369475807337653e-05, + "loss": 0.3652, + "step": 20555 + }, + { + "epoch": 0.4563767327776606, + "grad_norm": 1.3405704498291016, + "learning_rate": 1.136602179131053e-05, + "loss": 0.3431, + "step": 20560 + }, + { + "epoch": 0.4564877193371883, + "grad_norm": 0.7782473564147949, + "learning_rate": 1.1362567609207056e-05, + "loss": 0.393, + "step": 20565 + }, + { + "epoch": 0.4565987058967159, + "grad_norm": 0.8959970474243164, + "learning_rate": 1.1359113261447183e-05, + "loss": 0.5545, + "step": 20570 + }, + { + "epoch": 0.45670969245624354, + "grad_norm": 1.2282785177230835, + "learning_rate": 1.135565874845088e-05, + "loss": 0.5127, + "step": 20575 + }, + { + "epoch": 0.45682067901577117, + "grad_norm": 1.4772942066192627, + "learning_rate": 1.135220407063813e-05, + "loss": 0.5955, + "step": 20580 + }, + { + "epoch": 0.45693166557529885, + "grad_norm": 1.02876615524292, + "learning_rate": 1.1348749228428946e-05, + "loss": 0.5075, + "step": 20585 + }, + { + "epoch": 0.4570426521348265, + "grad_norm": 1.2462966442108154, + "learning_rate": 1.1345294222243351e-05, + "loss": 0.2885, + "step": 20590 + }, + { + "epoch": 0.4571536386943541, + "grad_norm": 1.0734065771102905, + "learning_rate": 1.13418390525014e-05, + "loss": 0.4402, + "step": 20595 + }, + { + "epoch": 0.45726462525388173, + "grad_norm": 1.3125464916229248, + "learning_rate": 1.1338383719623156e-05, + "loss": 0.4559, + "step": 20600 + }, + { + "epoch": 0.4573756118134094, + "grad_norm": 1.280454397201538, + "learning_rate": 1.1334928224028707e-05, + "loss": 0.4138, + "step": 20605 + }, + { + "epoch": 0.45748659837293704, + "grad_norm": 1.2182717323303223, + "learning_rate": 1.1331472566138162e-05, + "loss": 0.4755, + "step": 20610 + }, + { + "epoch": 0.45759758493246466, + "grad_norm": 1.3797804117202759, + "learning_rate": 1.1328016746371648e-05, + "loss": 0.4237, + "step": 20615 + }, + { + "epoch": 0.45770857149199234, + "grad_norm": 0.990795910358429, + "learning_rate": 1.1324560765149312e-05, + "loss": 0.4135, + "step": 20620 + }, + { + "epoch": 0.45781955805151997, + "grad_norm": 1.1758495569229126, + "learning_rate": 1.1321104622891321e-05, + "loss": 0.5617, + "step": 20625 + }, + { + "epoch": 0.4579305446110476, + "grad_norm": 0.8563075661659241, + "learning_rate": 1.131764832001786e-05, + "loss": 0.442, + "step": 20630 + }, + { + "epoch": 0.4580415311705752, + "grad_norm": 1.0942500829696655, + "learning_rate": 1.1314191856949134e-05, + "loss": 0.4125, + "step": 20635 + }, + { + "epoch": 0.4581525177301029, + "grad_norm": 1.0212494134902954, + "learning_rate": 1.1310735234105372e-05, + "loss": 0.5584, + "step": 20640 + }, + { + "epoch": 0.45826350428963053, + "grad_norm": 0.9054468274116516, + "learning_rate": 1.1307278451906817e-05, + "loss": 0.6214, + "step": 20645 + }, + { + "epoch": 0.45837449084915816, + "grad_norm": 1.5153844356536865, + "learning_rate": 1.1303821510773728e-05, + "loss": 0.5178, + "step": 20650 + }, + { + "epoch": 0.4584854774086858, + "grad_norm": 1.2968688011169434, + "learning_rate": 1.1300364411126395e-05, + "loss": 0.3395, + "step": 20655 + }, + { + "epoch": 0.45859646396821346, + "grad_norm": 1.2937582731246948, + "learning_rate": 1.1296907153385115e-05, + "loss": 0.4076, + "step": 20660 + }, + { + "epoch": 0.4587074505277411, + "grad_norm": 1.468260645866394, + "learning_rate": 1.1293449737970217e-05, + "loss": 0.3309, + "step": 20665 + }, + { + "epoch": 0.4588184370872687, + "grad_norm": 1.098471999168396, + "learning_rate": 1.1289992165302036e-05, + "loss": 0.4811, + "step": 20670 + }, + { + "epoch": 0.4589294236467964, + "grad_norm": 2.470288038253784, + "learning_rate": 1.128653443580093e-05, + "loss": 0.4609, + "step": 20675 + }, + { + "epoch": 0.459040410206324, + "grad_norm": 1.0436688661575317, + "learning_rate": 1.1283076549887286e-05, + "loss": 0.4925, + "step": 20680 + }, + { + "epoch": 0.45915139676585165, + "grad_norm": 1.34546959400177, + "learning_rate": 1.1279618507981498e-05, + "loss": 0.5932, + "step": 20685 + }, + { + "epoch": 0.4592623833253793, + "grad_norm": 1.2342839241027832, + "learning_rate": 1.1276160310503984e-05, + "loss": 0.3635, + "step": 20690 + }, + { + "epoch": 0.45937336988490696, + "grad_norm": 0.7874952554702759, + "learning_rate": 1.1272701957875178e-05, + "loss": 0.3375, + "step": 20695 + }, + { + "epoch": 0.4594843564444346, + "grad_norm": 0.848264217376709, + "learning_rate": 1.1269243450515537e-05, + "loss": 0.3849, + "step": 20700 + }, + { + "epoch": 0.4595953430039622, + "grad_norm": 1.1894251108169556, + "learning_rate": 1.1265784788845534e-05, + "loss": 0.464, + "step": 20705 + }, + { + "epoch": 0.45970632956348983, + "grad_norm": 1.3765366077423096, + "learning_rate": 1.126232597328566e-05, + "loss": 0.4449, + "step": 20710 + }, + { + "epoch": 0.4598173161230175, + "grad_norm": 1.2355886697769165, + "learning_rate": 1.1258867004256428e-05, + "loss": 0.2947, + "step": 20715 + }, + { + "epoch": 0.45992830268254514, + "grad_norm": 0.8822180032730103, + "learning_rate": 1.1255407882178368e-05, + "loss": 0.4502, + "step": 20720 + }, + { + "epoch": 0.46003928924207277, + "grad_norm": 1.4964523315429688, + "learning_rate": 1.1251948607472029e-05, + "loss": 0.5549, + "step": 20725 + }, + { + "epoch": 0.46015027580160045, + "grad_norm": 1.5759141445159912, + "learning_rate": 1.1248489180557977e-05, + "loss": 0.6028, + "step": 20730 + }, + { + "epoch": 0.4602612623611281, + "grad_norm": 1.3286418914794922, + "learning_rate": 1.1245029601856798e-05, + "loss": 0.3963, + "step": 20735 + }, + { + "epoch": 0.4603722489206557, + "grad_norm": 1.1409542560577393, + "learning_rate": 1.1241569871789096e-05, + "loss": 0.3983, + "step": 20740 + }, + { + "epoch": 0.46048323548018333, + "grad_norm": 1.1353600025177002, + "learning_rate": 1.123810999077549e-05, + "loss": 0.491, + "step": 20745 + }, + { + "epoch": 0.460594222039711, + "grad_norm": 1.1560803651809692, + "learning_rate": 1.1234649959236625e-05, + "loss": 0.6121, + "step": 20750 + }, + { + "epoch": 0.46070520859923864, + "grad_norm": 1.4681437015533447, + "learning_rate": 1.123118977759316e-05, + "loss": 0.5314, + "step": 20755 + }, + { + "epoch": 0.46081619515876626, + "grad_norm": 1.4478296041488647, + "learning_rate": 1.1227729446265766e-05, + "loss": 0.4961, + "step": 20760 + }, + { + "epoch": 0.4609271817182939, + "grad_norm": 1.1718122959136963, + "learning_rate": 1.1224268965675149e-05, + "loss": 0.3873, + "step": 20765 + }, + { + "epoch": 0.46103816827782157, + "grad_norm": 1.1868184804916382, + "learning_rate": 1.1220808336242015e-05, + "loss": 0.5384, + "step": 20770 + }, + { + "epoch": 0.4611491548373492, + "grad_norm": 1.2744981050491333, + "learning_rate": 1.1217347558387098e-05, + "loss": 0.3696, + "step": 20775 + }, + { + "epoch": 0.4612601413968768, + "grad_norm": 1.1455271244049072, + "learning_rate": 1.1213886632531147e-05, + "loss": 0.4095, + "step": 20780 + }, + { + "epoch": 0.4613711279564045, + "grad_norm": 1.3924636840820312, + "learning_rate": 1.121042555909493e-05, + "loss": 0.4655, + "step": 20785 + }, + { + "epoch": 0.46148211451593213, + "grad_norm": 1.2234874963760376, + "learning_rate": 1.120696433849923e-05, + "loss": 0.4949, + "step": 20790 + }, + { + "epoch": 0.46159310107545976, + "grad_norm": 1.050239086151123, + "learning_rate": 1.1203502971164859e-05, + "loss": 0.4225, + "step": 20795 + }, + { + "epoch": 0.4617040876349874, + "grad_norm": 1.2952483892440796, + "learning_rate": 1.1200041457512632e-05, + "loss": 0.592, + "step": 20800 + }, + { + "epoch": 0.46181507419451506, + "grad_norm": 1.2631264925003052, + "learning_rate": 1.1196579797963386e-05, + "loss": 0.3866, + "step": 20805 + }, + { + "epoch": 0.4619260607540427, + "grad_norm": 1.1427369117736816, + "learning_rate": 1.1193117992937985e-05, + "loss": 0.3557, + "step": 20810 + }, + { + "epoch": 0.4620370473135703, + "grad_norm": 1.1873159408569336, + "learning_rate": 1.11896560428573e-05, + "loss": 0.4925, + "step": 20815 + }, + { + "epoch": 0.46214803387309794, + "grad_norm": 1.403205156326294, + "learning_rate": 1.1186193948142219e-05, + "loss": 0.5569, + "step": 20820 + }, + { + "epoch": 0.4622590204326256, + "grad_norm": 1.2612415552139282, + "learning_rate": 1.1182731709213658e-05, + "loss": 0.5531, + "step": 20825 + }, + { + "epoch": 0.46237000699215325, + "grad_norm": 1.758579969406128, + "learning_rate": 1.1179269326492543e-05, + "loss": 0.514, + "step": 20830 + }, + { + "epoch": 0.4624809935516809, + "grad_norm": 1.4168121814727783, + "learning_rate": 1.117580680039982e-05, + "loss": 0.4506, + "step": 20835 + }, + { + "epoch": 0.46259198011120856, + "grad_norm": 1.189832091331482, + "learning_rate": 1.1172344131356447e-05, + "loss": 0.4042, + "step": 20840 + }, + { + "epoch": 0.4627029666707362, + "grad_norm": 0.9151319861412048, + "learning_rate": 1.1168881319783407e-05, + "loss": 0.4344, + "step": 20845 + }, + { + "epoch": 0.4628139532302638, + "grad_norm": 0.8878720998764038, + "learning_rate": 1.1165418366101696e-05, + "loss": 0.2991, + "step": 20850 + }, + { + "epoch": 0.46292493978979143, + "grad_norm": 1.068393349647522, + "learning_rate": 1.116195527073233e-05, + "loss": 0.4902, + "step": 20855 + }, + { + "epoch": 0.4630359263493191, + "grad_norm": 0.8120904564857483, + "learning_rate": 1.1158492034096337e-05, + "loss": 0.444, + "step": 20860 + }, + { + "epoch": 0.46314691290884674, + "grad_norm": 1.1619365215301514, + "learning_rate": 1.1155028656614768e-05, + "loss": 0.3862, + "step": 20865 + }, + { + "epoch": 0.46325789946837437, + "grad_norm": 1.2095001935958862, + "learning_rate": 1.1151565138708688e-05, + "loss": 0.4603, + "step": 20870 + }, + { + "epoch": 0.463368886027902, + "grad_norm": 1.2450065612792969, + "learning_rate": 1.1148101480799181e-05, + "loss": 0.4101, + "step": 20875 + }, + { + "epoch": 0.4634798725874297, + "grad_norm": 2.0599148273468018, + "learning_rate": 1.1144637683307346e-05, + "loss": 0.3749, + "step": 20880 + }, + { + "epoch": 0.4635908591469573, + "grad_norm": 0.8745701909065247, + "learning_rate": 1.1141173746654304e-05, + "loss": 0.398, + "step": 20885 + }, + { + "epoch": 0.4637018457064849, + "grad_norm": 1.383885383605957, + "learning_rate": 1.1137709671261181e-05, + "loss": 0.4914, + "step": 20890 + }, + { + "epoch": 0.4638128322660126, + "grad_norm": 1.2333803176879883, + "learning_rate": 1.1134245457549133e-05, + "loss": 0.3697, + "step": 20895 + }, + { + "epoch": 0.46392381882554024, + "grad_norm": 1.6844825744628906, + "learning_rate": 1.1130781105939325e-05, + "loss": 0.5081, + "step": 20900 + }, + { + "epoch": 0.46403480538506786, + "grad_norm": 0.9865526556968689, + "learning_rate": 1.1127316616852943e-05, + "loss": 0.3997, + "step": 20905 + }, + { + "epoch": 0.4641457919445955, + "grad_norm": 1.1580348014831543, + "learning_rate": 1.112385199071119e-05, + "loss": 0.4482, + "step": 20910 + }, + { + "epoch": 0.46425677850412317, + "grad_norm": 0.8300138711929321, + "learning_rate": 1.112038722793528e-05, + "loss": 0.5253, + "step": 20915 + }, + { + "epoch": 0.4643677650636508, + "grad_norm": 1.238762378692627, + "learning_rate": 1.1116922328946447e-05, + "loss": 0.481, + "step": 20920 + }, + { + "epoch": 0.4644787516231784, + "grad_norm": 1.333196997642517, + "learning_rate": 1.1113457294165944e-05, + "loss": 0.3692, + "step": 20925 + }, + { + "epoch": 0.46458973818270605, + "grad_norm": 1.5084500312805176, + "learning_rate": 1.1109992124015036e-05, + "loss": 0.4737, + "step": 20930 + }, + { + "epoch": 0.46470072474223373, + "grad_norm": 1.103947401046753, + "learning_rate": 1.1106526818915008e-05, + "loss": 0.5106, + "step": 20935 + }, + { + "epoch": 0.46481171130176135, + "grad_norm": 1.341492772102356, + "learning_rate": 1.1103061379287163e-05, + "loss": 0.4256, + "step": 20940 + }, + { + "epoch": 0.464922697861289, + "grad_norm": 1.0959070920944214, + "learning_rate": 1.1099595805552815e-05, + "loss": 0.3839, + "step": 20945 + }, + { + "epoch": 0.46503368442081666, + "grad_norm": 1.2406764030456543, + "learning_rate": 1.1096130098133296e-05, + "loss": 0.4045, + "step": 20950 + }, + { + "epoch": 0.4651446709803443, + "grad_norm": 1.3345435857772827, + "learning_rate": 1.1092664257449955e-05, + "loss": 0.3243, + "step": 20955 + }, + { + "epoch": 0.4652556575398719, + "grad_norm": 1.7433699369430542, + "learning_rate": 1.1089198283924155e-05, + "loss": 0.466, + "step": 20960 + }, + { + "epoch": 0.46536664409939954, + "grad_norm": 1.3542841672897339, + "learning_rate": 1.1085732177977284e-05, + "loss": 0.3902, + "step": 20965 + }, + { + "epoch": 0.4654776306589272, + "grad_norm": 1.3806394338607788, + "learning_rate": 1.1082265940030736e-05, + "loss": 0.4725, + "step": 20970 + }, + { + "epoch": 0.46558861721845485, + "grad_norm": 1.1428102254867554, + "learning_rate": 1.1078799570505925e-05, + "loss": 0.3389, + "step": 20975 + }, + { + "epoch": 0.4656996037779825, + "grad_norm": 1.1814501285552979, + "learning_rate": 1.107533306982428e-05, + "loss": 0.3445, + "step": 20980 + }, + { + "epoch": 0.4658105903375101, + "grad_norm": 1.3944542407989502, + "learning_rate": 1.1071866438407245e-05, + "loss": 0.5482, + "step": 20985 + }, + { + "epoch": 0.4659215768970378, + "grad_norm": 0.8299020528793335, + "learning_rate": 1.1068399676676286e-05, + "loss": 0.4399, + "step": 20990 + }, + { + "epoch": 0.4660325634565654, + "grad_norm": 1.0346838235855103, + "learning_rate": 1.1064932785052877e-05, + "loss": 0.3264, + "step": 20995 + }, + { + "epoch": 0.46614355001609303, + "grad_norm": 1.1892578601837158, + "learning_rate": 1.106146576395851e-05, + "loss": 0.5251, + "step": 21000 + }, + { + "epoch": 0.4662545365756207, + "grad_norm": 1.5619806051254272, + "learning_rate": 1.1057998613814695e-05, + "loss": 0.3387, + "step": 21005 + }, + { + "epoch": 0.46636552313514834, + "grad_norm": 1.4932775497436523, + "learning_rate": 1.105453133504296e-05, + "loss": 0.5554, + "step": 21010 + }, + { + "epoch": 0.46647650969467597, + "grad_norm": 1.0520886182785034, + "learning_rate": 1.1051063928064836e-05, + "loss": 0.4444, + "step": 21015 + }, + { + "epoch": 0.4665874962542036, + "grad_norm": 1.2631596326828003, + "learning_rate": 1.104759639330189e-05, + "loss": 0.3723, + "step": 21020 + }, + { + "epoch": 0.4666984828137313, + "grad_norm": 1.4202393293380737, + "learning_rate": 1.1044128731175686e-05, + "loss": 0.4709, + "step": 21025 + }, + { + "epoch": 0.4668094693732589, + "grad_norm": 1.4321917295455933, + "learning_rate": 1.1040660942107813e-05, + "loss": 0.4961, + "step": 21030 + }, + { + "epoch": 0.4669204559327865, + "grad_norm": 1.6301039457321167, + "learning_rate": 1.1037193026519872e-05, + "loss": 0.3466, + "step": 21035 + }, + { + "epoch": 0.4670314424923142, + "grad_norm": 0.9708942174911499, + "learning_rate": 1.1033724984833483e-05, + "loss": 0.5608, + "step": 21040 + }, + { + "epoch": 0.46714242905184183, + "grad_norm": 1.3298207521438599, + "learning_rate": 1.1030256817470272e-05, + "loss": 0.5311, + "step": 21045 + }, + { + "epoch": 0.46725341561136946, + "grad_norm": 0.9955623149871826, + "learning_rate": 1.1026788524851897e-05, + "loss": 0.4202, + "step": 21050 + }, + { + "epoch": 0.4673644021708971, + "grad_norm": 0.8595684766769409, + "learning_rate": 1.1023320107400017e-05, + "loss": 0.3733, + "step": 21055 + }, + { + "epoch": 0.46747538873042477, + "grad_norm": 1.570600152015686, + "learning_rate": 1.101985156553631e-05, + "loss": 0.5551, + "step": 21060 + }, + { + "epoch": 0.4675863752899524, + "grad_norm": 0.6361602544784546, + "learning_rate": 1.101638289968247e-05, + "loss": 0.5578, + "step": 21065 + }, + { + "epoch": 0.46769736184948, + "grad_norm": 1.6245297193527222, + "learning_rate": 1.1012914110260202e-05, + "loss": 0.5088, + "step": 21070 + }, + { + "epoch": 0.46780834840900765, + "grad_norm": 1.0813826322555542, + "learning_rate": 1.1009445197691237e-05, + "loss": 0.4791, + "step": 21075 + }, + { + "epoch": 0.4679193349685353, + "grad_norm": 0.7604579329490662, + "learning_rate": 1.1005976162397309e-05, + "loss": 0.3741, + "step": 21080 + }, + { + "epoch": 0.46803032152806295, + "grad_norm": 1.3884882926940918, + "learning_rate": 1.1002507004800174e-05, + "loss": 0.5016, + "step": 21085 + }, + { + "epoch": 0.4681413080875906, + "grad_norm": 1.620674967765808, + "learning_rate": 1.09990377253216e-05, + "loss": 0.5009, + "step": 21090 + }, + { + "epoch": 0.46825229464711826, + "grad_norm": 0.7146098017692566, + "learning_rate": 1.099556832438337e-05, + "loss": 0.4999, + "step": 21095 + }, + { + "epoch": 0.4683632812066459, + "grad_norm": 0.9891538023948669, + "learning_rate": 1.0992098802407281e-05, + "loss": 0.4181, + "step": 21100 + }, + { + "epoch": 0.4684742677661735, + "grad_norm": 1.9593673944473267, + "learning_rate": 1.0988629159815147e-05, + "loss": 0.4987, + "step": 21105 + }, + { + "epoch": 0.46858525432570114, + "grad_norm": 0.9993635416030884, + "learning_rate": 1.0985159397028795e-05, + "loss": 0.2836, + "step": 21110 + }, + { + "epoch": 0.4686962408852288, + "grad_norm": 0.8817034959793091, + "learning_rate": 1.0981689514470069e-05, + "loss": 0.4039, + "step": 21115 + }, + { + "epoch": 0.46880722744475645, + "grad_norm": 1.5623383522033691, + "learning_rate": 1.0978219512560824e-05, + "loss": 0.3283, + "step": 21120 + }, + { + "epoch": 0.4689182140042841, + "grad_norm": 1.112033724784851, + "learning_rate": 1.097474939172293e-05, + "loss": 0.3188, + "step": 21125 + }, + { + "epoch": 0.4690292005638117, + "grad_norm": 0.9545212984085083, + "learning_rate": 1.0971279152378271e-05, + "loss": 0.4523, + "step": 21130 + }, + { + "epoch": 0.4691401871233394, + "grad_norm": 1.1092740297317505, + "learning_rate": 1.0967808794948756e-05, + "loss": 0.36, + "step": 21135 + }, + { + "epoch": 0.469251173682867, + "grad_norm": 1.4138292074203491, + "learning_rate": 1.0964338319856288e-05, + "loss": 0.3877, + "step": 21140 + }, + { + "epoch": 0.46936216024239463, + "grad_norm": 1.7056288719177246, + "learning_rate": 1.0960867727522806e-05, + "loss": 0.4093, + "step": 21145 + }, + { + "epoch": 0.4694731468019223, + "grad_norm": 0.9393314123153687, + "learning_rate": 1.0957397018370247e-05, + "loss": 0.4628, + "step": 21150 + }, + { + "epoch": 0.46958413336144994, + "grad_norm": 0.9642075300216675, + "learning_rate": 1.0953926192820565e-05, + "loss": 0.5429, + "step": 21155 + }, + { + "epoch": 0.46969511992097757, + "grad_norm": 1.1112103462219238, + "learning_rate": 1.0950455251295739e-05, + "loss": 0.4087, + "step": 21160 + }, + { + "epoch": 0.4698061064805052, + "grad_norm": 1.0092273950576782, + "learning_rate": 1.0946984194217753e-05, + "loss": 0.4188, + "step": 21165 + }, + { + "epoch": 0.4699170930400329, + "grad_norm": 1.1835898160934448, + "learning_rate": 1.0943513022008603e-05, + "loss": 0.4485, + "step": 21170 + }, + { + "epoch": 0.4700280795995605, + "grad_norm": 1.524971604347229, + "learning_rate": 1.0940041735090305e-05, + "loss": 0.4301, + "step": 21175 + }, + { + "epoch": 0.4701390661590881, + "grad_norm": 1.383873701095581, + "learning_rate": 1.0936570333884885e-05, + "loss": 0.3277, + "step": 21180 + }, + { + "epoch": 0.47025005271861575, + "grad_norm": 1.8832981586456299, + "learning_rate": 1.0933098818814383e-05, + "loss": 0.4364, + "step": 21185 + }, + { + "epoch": 0.47036103927814343, + "grad_norm": 1.0231654644012451, + "learning_rate": 1.0929627190300858e-05, + "loss": 0.4135, + "step": 21190 + }, + { + "epoch": 0.47047202583767106, + "grad_norm": 1.4642794132232666, + "learning_rate": 1.0926155448766376e-05, + "loss": 0.3931, + "step": 21195 + }, + { + "epoch": 0.4705830123971987, + "grad_norm": 0.6988185048103333, + "learning_rate": 1.092268359463302e-05, + "loss": 0.3946, + "step": 21200 + }, + { + "epoch": 0.47069399895672637, + "grad_norm": 1.163567304611206, + "learning_rate": 1.091921162832289e-05, + "loss": 0.3978, + "step": 21205 + }, + { + "epoch": 0.470804985516254, + "grad_norm": 1.002905249595642, + "learning_rate": 1.0915739550258091e-05, + "loss": 0.3521, + "step": 21210 + }, + { + "epoch": 0.4709159720757816, + "grad_norm": 0.9412310719490051, + "learning_rate": 1.0912267360860747e-05, + "loss": 0.5536, + "step": 21215 + }, + { + "epoch": 0.47102695863530925, + "grad_norm": 1.228294014930725, + "learning_rate": 1.0908795060552997e-05, + "loss": 0.3347, + "step": 21220 + }, + { + "epoch": 0.4711379451948369, + "grad_norm": 1.4175302982330322, + "learning_rate": 1.0905322649756992e-05, + "loss": 0.4958, + "step": 21225 + }, + { + "epoch": 0.47124893175436455, + "grad_norm": 0.6760334968566895, + "learning_rate": 1.0901850128894899e-05, + "loss": 0.4364, + "step": 21230 + }, + { + "epoch": 0.4713599183138922, + "grad_norm": 1.2598497867584229, + "learning_rate": 1.089837749838889e-05, + "loss": 0.4319, + "step": 21235 + }, + { + "epoch": 0.4714709048734198, + "grad_norm": 1.8593010902404785, + "learning_rate": 1.0894904758661155e-05, + "loss": 0.3682, + "step": 21240 + }, + { + "epoch": 0.4715818914329475, + "grad_norm": 1.5405446290969849, + "learning_rate": 1.0891431910133905e-05, + "loss": 0.4079, + "step": 21245 + }, + { + "epoch": 0.4716928779924751, + "grad_norm": 1.7913031578063965, + "learning_rate": 1.0887958953229349e-05, + "loss": 0.4243, + "step": 21250 + }, + { + "epoch": 0.47180386455200274, + "grad_norm": 0.7782161831855774, + "learning_rate": 1.0884485888369725e-05, + "loss": 0.3811, + "step": 21255 + }, + { + "epoch": 0.4719148511115304, + "grad_norm": 0.8864994049072266, + "learning_rate": 1.0881012715977271e-05, + "loss": 0.5465, + "step": 21260 + }, + { + "epoch": 0.47202583767105805, + "grad_norm": 1.6385595798492432, + "learning_rate": 1.0877539436474245e-05, + "loss": 0.3958, + "step": 21265 + }, + { + "epoch": 0.47213682423058567, + "grad_norm": 0.8608049750328064, + "learning_rate": 1.0874066050282917e-05, + "loss": 0.4109, + "step": 21270 + }, + { + "epoch": 0.4722478107901133, + "grad_norm": 1.0551360845565796, + "learning_rate": 1.087059255782557e-05, + "loss": 0.6298, + "step": 21275 + }, + { + "epoch": 0.472358797349641, + "grad_norm": 1.1309964656829834, + "learning_rate": 1.08671189595245e-05, + "loss": 0.3551, + "step": 21280 + }, + { + "epoch": 0.4724697839091686, + "grad_norm": 1.741068959236145, + "learning_rate": 1.0863645255802017e-05, + "loss": 0.4292, + "step": 21285 + }, + { + "epoch": 0.47258077046869623, + "grad_norm": 1.0462664365768433, + "learning_rate": 1.086017144708044e-05, + "loss": 0.5168, + "step": 21290 + }, + { + "epoch": 0.47269175702822386, + "grad_norm": 1.3243409395217896, + "learning_rate": 1.0856697533782102e-05, + "loss": 0.4631, + "step": 21295 + }, + { + "epoch": 0.47280274358775154, + "grad_norm": 1.2467161417007446, + "learning_rate": 1.085322351632935e-05, + "loss": 0.4451, + "step": 21300 + }, + { + "epoch": 0.47291373014727917, + "grad_norm": 1.0557029247283936, + "learning_rate": 1.0849749395144544e-05, + "loss": 0.3586, + "step": 21305 + }, + { + "epoch": 0.4730247167068068, + "grad_norm": 0.7982147336006165, + "learning_rate": 1.084627517065006e-05, + "loss": 0.3789, + "step": 21310 + }, + { + "epoch": 0.4731357032663345, + "grad_norm": 1.3924659490585327, + "learning_rate": 1.0842800843268274e-05, + "loss": 0.4243, + "step": 21315 + }, + { + "epoch": 0.4732466898258621, + "grad_norm": 0.9715703725814819, + "learning_rate": 1.0839326413421593e-05, + "loss": 0.3626, + "step": 21320 + }, + { + "epoch": 0.4733576763853897, + "grad_norm": 1.034337043762207, + "learning_rate": 1.0835851881532418e-05, + "loss": 0.4058, + "step": 21325 + }, + { + "epoch": 0.47346866294491735, + "grad_norm": 1.2233444452285767, + "learning_rate": 1.0832377248023175e-05, + "loss": 0.3141, + "step": 21330 + }, + { + "epoch": 0.47357964950444503, + "grad_norm": 1.4177346229553223, + "learning_rate": 1.0828902513316299e-05, + "loss": 0.4295, + "step": 21335 + }, + { + "epoch": 0.47369063606397266, + "grad_norm": 1.0765271186828613, + "learning_rate": 1.0825427677834235e-05, + "loss": 0.3157, + "step": 21340 + }, + { + "epoch": 0.4738016226235003, + "grad_norm": 0.633413553237915, + "learning_rate": 1.0821952741999443e-05, + "loss": 0.3753, + "step": 21345 + }, + { + "epoch": 0.4739126091830279, + "grad_norm": 2.0716490745544434, + "learning_rate": 1.0818477706234394e-05, + "loss": 0.4344, + "step": 21350 + }, + { + "epoch": 0.4740235957425556, + "grad_norm": 0.8376625180244446, + "learning_rate": 1.0815002570961568e-05, + "loss": 0.5318, + "step": 21355 + }, + { + "epoch": 0.4741345823020832, + "grad_norm": 0.8807407021522522, + "learning_rate": 1.0811527336603465e-05, + "loss": 0.3093, + "step": 21360 + }, + { + "epoch": 0.47424556886161084, + "grad_norm": 1.2454125881195068, + "learning_rate": 1.0808052003582588e-05, + "loss": 0.4386, + "step": 21365 + }, + { + "epoch": 0.4743565554211385, + "grad_norm": 1.1970107555389404, + "learning_rate": 1.080457657232146e-05, + "loss": 0.2683, + "step": 21370 + }, + { + "epoch": 0.47446754198066615, + "grad_norm": 0.8811781406402588, + "learning_rate": 1.0801101043242607e-05, + "loss": 0.377, + "step": 21375 + }, + { + "epoch": 0.4745785285401938, + "grad_norm": 1.0804111957550049, + "learning_rate": 1.079762541676858e-05, + "loss": 0.4174, + "step": 21380 + }, + { + "epoch": 0.4746895150997214, + "grad_norm": 1.7538986206054688, + "learning_rate": 1.0794149693321927e-05, + "loss": 0.5051, + "step": 21385 + }, + { + "epoch": 0.4748005016592491, + "grad_norm": 1.358725905418396, + "learning_rate": 1.0790673873325219e-05, + "loss": 0.659, + "step": 21390 + }, + { + "epoch": 0.4749114882187767, + "grad_norm": 1.6165276765823364, + "learning_rate": 1.0787197957201035e-05, + "loss": 0.306, + "step": 21395 + }, + { + "epoch": 0.47502247477830434, + "grad_norm": 1.8659031391143799, + "learning_rate": 1.0783721945371962e-05, + "loss": 0.4602, + "step": 21400 + }, + { + "epoch": 0.47513346133783196, + "grad_norm": 1.331539511680603, + "learning_rate": 1.0780245838260602e-05, + "loss": 0.3587, + "step": 21405 + }, + { + "epoch": 0.47524444789735965, + "grad_norm": 1.6114097833633423, + "learning_rate": 1.0776769636289568e-05, + "loss": 0.3664, + "step": 21410 + }, + { + "epoch": 0.47535543445688727, + "grad_norm": 1.5674644708633423, + "learning_rate": 1.077329333988149e-05, + "loss": 0.3443, + "step": 21415 + }, + { + "epoch": 0.4754664210164149, + "grad_norm": 1.9007574319839478, + "learning_rate": 1.0769816949459002e-05, + "loss": 0.2939, + "step": 21420 + }, + { + "epoch": 0.4755774075759426, + "grad_norm": 1.640196681022644, + "learning_rate": 1.0766340465444749e-05, + "loss": 0.3501, + "step": 21425 + }, + { + "epoch": 0.4756883941354702, + "grad_norm": 1.8425315618515015, + "learning_rate": 1.0762863888261392e-05, + "loss": 0.4537, + "step": 21430 + }, + { + "epoch": 0.47579938069499783, + "grad_norm": 1.562214970588684, + "learning_rate": 1.0759387218331606e-05, + "loss": 0.3477, + "step": 21435 + }, + { + "epoch": 0.47591036725452546, + "grad_norm": 0.9589049816131592, + "learning_rate": 1.0755910456078062e-05, + "loss": 0.3283, + "step": 21440 + }, + { + "epoch": 0.47602135381405314, + "grad_norm": 1.2559269666671753, + "learning_rate": 1.0752433601923465e-05, + "loss": 0.3608, + "step": 21445 + }, + { + "epoch": 0.47613234037358076, + "grad_norm": 1.0764013528823853, + "learning_rate": 1.0748956656290512e-05, + "loss": 0.5934, + "step": 21450 + }, + { + "epoch": 0.4762433269331084, + "grad_norm": 1.1700280904769897, + "learning_rate": 1.0745479619601923e-05, + "loss": 0.3546, + "step": 21455 + }, + { + "epoch": 0.476354313492636, + "grad_norm": 1.494066834449768, + "learning_rate": 1.074200249228042e-05, + "loss": 0.476, + "step": 21460 + }, + { + "epoch": 0.4764653000521637, + "grad_norm": 1.316925287246704, + "learning_rate": 1.073852527474874e-05, + "loss": 0.3571, + "step": 21465 + }, + { + "epoch": 0.4765762866116913, + "grad_norm": 1.3482223749160767, + "learning_rate": 1.0735047967429638e-05, + "loss": 0.3626, + "step": 21470 + }, + { + "epoch": 0.47668727317121895, + "grad_norm": 0.9858172535896301, + "learning_rate": 1.0731570570745869e-05, + "loss": 0.4228, + "step": 21475 + }, + { + "epoch": 0.47679825973074663, + "grad_norm": 1.263492226600647, + "learning_rate": 1.0728093085120202e-05, + "loss": 0.3674, + "step": 21480 + }, + { + "epoch": 0.47690924629027426, + "grad_norm": 1.7882106304168701, + "learning_rate": 1.072461551097542e-05, + "loss": 0.291, + "step": 21485 + }, + { + "epoch": 0.4770202328498019, + "grad_norm": 1.1271024942398071, + "learning_rate": 1.0721137848734316e-05, + "loss": 0.483, + "step": 21490 + }, + { + "epoch": 0.4771312194093295, + "grad_norm": 1.754003643989563, + "learning_rate": 1.0717660098819685e-05, + "loss": 0.4653, + "step": 21495 + }, + { + "epoch": 0.4772422059688572, + "grad_norm": 1.011060357093811, + "learning_rate": 1.0714182261654351e-05, + "loss": 0.4512, + "step": 21500 + }, + { + "epoch": 0.4773531925283848, + "grad_norm": 1.6048130989074707, + "learning_rate": 1.0710704337661131e-05, + "loss": 0.2906, + "step": 21505 + }, + { + "epoch": 0.47746417908791244, + "grad_norm": 0.7402877807617188, + "learning_rate": 1.0707226327262862e-05, + "loss": 0.516, + "step": 21510 + }, + { + "epoch": 0.47757516564744007, + "grad_norm": 1.0845234394073486, + "learning_rate": 1.070374823088239e-05, + "loss": 0.2873, + "step": 21515 + }, + { + "epoch": 0.47768615220696775, + "grad_norm": 1.0958060026168823, + "learning_rate": 1.0700270048942568e-05, + "loss": 0.2853, + "step": 21520 + }, + { + "epoch": 0.4777971387664954, + "grad_norm": 1.5488417148590088, + "learning_rate": 1.0696791781866255e-05, + "loss": 0.459, + "step": 21525 + }, + { + "epoch": 0.477908125326023, + "grad_norm": 1.8981561660766602, + "learning_rate": 1.0693313430076343e-05, + "loss": 0.6263, + "step": 21530 + }, + { + "epoch": 0.4780191118855507, + "grad_norm": 1.2021952867507935, + "learning_rate": 1.0689834993995705e-05, + "loss": 0.4644, + "step": 21535 + }, + { + "epoch": 0.4781300984450783, + "grad_norm": 1.1099729537963867, + "learning_rate": 1.0686356474047242e-05, + "loss": 0.4083, + "step": 21540 + }, + { + "epoch": 0.47824108500460594, + "grad_norm": 1.1983321905136108, + "learning_rate": 1.068287787065386e-05, + "loss": 0.4784, + "step": 21545 + }, + { + "epoch": 0.47835207156413356, + "grad_norm": 1.0222690105438232, + "learning_rate": 1.0679399184238477e-05, + "loss": 0.4406, + "step": 21550 + }, + { + "epoch": 0.47846305812366124, + "grad_norm": 1.0962550640106201, + "learning_rate": 1.0675920415224021e-05, + "loss": 0.4584, + "step": 21555 + }, + { + "epoch": 0.47857404468318887, + "grad_norm": 1.4673035144805908, + "learning_rate": 1.0672441564033429e-05, + "loss": 0.3869, + "step": 21560 + }, + { + "epoch": 0.4786850312427165, + "grad_norm": 1.1756876707077026, + "learning_rate": 1.0668962631089646e-05, + "loss": 0.4112, + "step": 21565 + }, + { + "epoch": 0.4787960178022441, + "grad_norm": 1.4724787473678589, + "learning_rate": 1.0665483616815628e-05, + "loss": 0.4768, + "step": 21570 + }, + { + "epoch": 0.4789070043617718, + "grad_norm": 1.4865951538085938, + "learning_rate": 1.0662004521634346e-05, + "loss": 0.2881, + "step": 21575 + }, + { + "epoch": 0.47901799092129943, + "grad_norm": 2.133817434310913, + "learning_rate": 1.065852534596877e-05, + "loss": 0.4258, + "step": 21580 + }, + { + "epoch": 0.47912897748082706, + "grad_norm": 1.590226411819458, + "learning_rate": 1.0655046090241895e-05, + "loss": 0.4986, + "step": 21585 + }, + { + "epoch": 0.47923996404035474, + "grad_norm": 1.794185996055603, + "learning_rate": 1.0651566754876715e-05, + "loss": 0.5605, + "step": 21590 + }, + { + "epoch": 0.47935095059988236, + "grad_norm": 1.2741948366165161, + "learning_rate": 1.0648087340296232e-05, + "loss": 0.508, + "step": 21595 + }, + { + "epoch": 0.47946193715941, + "grad_norm": 2.1430916786193848, + "learning_rate": 1.0644607846923462e-05, + "loss": 0.3775, + "step": 21600 + }, + { + "epoch": 0.4795729237189376, + "grad_norm": 1.19858980178833, + "learning_rate": 1.0641128275181433e-05, + "loss": 0.4721, + "step": 21605 + }, + { + "epoch": 0.4796839102784653, + "grad_norm": 1.104694128036499, + "learning_rate": 1.063764862549318e-05, + "loss": 0.4713, + "step": 21610 + }, + { + "epoch": 0.4797948968379929, + "grad_norm": 1.2688933610916138, + "learning_rate": 1.0634168898281745e-05, + "loss": 0.3581, + "step": 21615 + }, + { + "epoch": 0.47990588339752055, + "grad_norm": 0.7898721098899841, + "learning_rate": 1.0630689093970182e-05, + "loss": 0.5328, + "step": 21620 + }, + { + "epoch": 0.4800168699570482, + "grad_norm": 1.2295295000076294, + "learning_rate": 1.0627209212981552e-05, + "loss": 0.4056, + "step": 21625 + }, + { + "epoch": 0.48012785651657586, + "grad_norm": 1.2840009927749634, + "learning_rate": 1.0623729255738932e-05, + "loss": 0.4107, + "step": 21630 + }, + { + "epoch": 0.4802388430761035, + "grad_norm": 0.8035621643066406, + "learning_rate": 1.0620249222665398e-05, + "loss": 0.3382, + "step": 21635 + }, + { + "epoch": 0.4803498296356311, + "grad_norm": 1.6547472476959229, + "learning_rate": 1.0616769114184044e-05, + "loss": 0.3411, + "step": 21640 + }, + { + "epoch": 0.4804608161951588, + "grad_norm": 0.7649518251419067, + "learning_rate": 1.0613288930717974e-05, + "loss": 0.4424, + "step": 21645 + }, + { + "epoch": 0.4805718027546864, + "grad_norm": 1.1245989799499512, + "learning_rate": 1.060980867269029e-05, + "loss": 0.5485, + "step": 21650 + }, + { + "epoch": 0.48068278931421404, + "grad_norm": 0.8971288800239563, + "learning_rate": 1.0606328340524113e-05, + "loss": 0.4177, + "step": 21655 + }, + { + "epoch": 0.48079377587374167, + "grad_norm": 1.5661104917526245, + "learning_rate": 1.0602847934642568e-05, + "loss": 0.3255, + "step": 21660 + }, + { + "epoch": 0.48090476243326935, + "grad_norm": 0.85600745677948, + "learning_rate": 1.0599367455468793e-05, + "loss": 0.4899, + "step": 21665 + }, + { + "epoch": 0.481015748992797, + "grad_norm": 0.8572877049446106, + "learning_rate": 1.0595886903425934e-05, + "loss": 0.4959, + "step": 21670 + }, + { + "epoch": 0.4811267355523246, + "grad_norm": 1.0525004863739014, + "learning_rate": 1.0592406278937143e-05, + "loss": 0.392, + "step": 21675 + }, + { + "epoch": 0.48123772211185223, + "grad_norm": 1.3853223323822021, + "learning_rate": 1.0588925582425585e-05, + "loss": 0.3902, + "step": 21680 + }, + { + "epoch": 0.4813487086713799, + "grad_norm": 1.3365157842636108, + "learning_rate": 1.058544481431443e-05, + "loss": 0.433, + "step": 21685 + }, + { + "epoch": 0.48145969523090754, + "grad_norm": 1.196568489074707, + "learning_rate": 1.0581963975026856e-05, + "loss": 0.4933, + "step": 21690 + }, + { + "epoch": 0.48157068179043516, + "grad_norm": 1.1724721193313599, + "learning_rate": 1.0578483064986054e-05, + "loss": 0.5655, + "step": 21695 + }, + { + "epoch": 0.48168166834996284, + "grad_norm": 1.0338935852050781, + "learning_rate": 1.057500208461522e-05, + "loss": 0.4333, + "step": 21700 + }, + { + "epoch": 0.48179265490949047, + "grad_norm": 1.549454689025879, + "learning_rate": 1.0571521034337565e-05, + "loss": 0.6056, + "step": 21705 + }, + { + "epoch": 0.4819036414690181, + "grad_norm": 1.3995952606201172, + "learning_rate": 1.0568039914576296e-05, + "loss": 0.5526, + "step": 21710 + }, + { + "epoch": 0.4820146280285457, + "grad_norm": 1.5786106586456299, + "learning_rate": 1.0564558725754642e-05, + "loss": 0.5402, + "step": 21715 + }, + { + "epoch": 0.4821256145880734, + "grad_norm": 1.1411170959472656, + "learning_rate": 1.0561077468295828e-05, + "loss": 0.3592, + "step": 21720 + }, + { + "epoch": 0.48223660114760103, + "grad_norm": 1.2534387111663818, + "learning_rate": 1.0557596142623099e-05, + "loss": 0.4262, + "step": 21725 + }, + { + "epoch": 0.48234758770712866, + "grad_norm": 1.2995420694351196, + "learning_rate": 1.05541147491597e-05, + "loss": 0.3863, + "step": 21730 + }, + { + "epoch": 0.4824585742666563, + "grad_norm": 1.1867161989212036, + "learning_rate": 1.0550633288328891e-05, + "loss": 0.393, + "step": 21735 + }, + { + "epoch": 0.48256956082618396, + "grad_norm": 1.3839069604873657, + "learning_rate": 1.0547151760553932e-05, + "loss": 0.5031, + "step": 21740 + }, + { + "epoch": 0.4826805473857116, + "grad_norm": 1.813341498374939, + "learning_rate": 1.0543670166258095e-05, + "loss": 0.3658, + "step": 21745 + }, + { + "epoch": 0.4827915339452392, + "grad_norm": 0.6340671181678772, + "learning_rate": 1.0540188505864664e-05, + "loss": 0.4365, + "step": 21750 + }, + { + "epoch": 0.4829025205047669, + "grad_norm": 1.045729398727417, + "learning_rate": 1.0536706779796925e-05, + "loss": 0.4977, + "step": 21755 + }, + { + "epoch": 0.4830135070642945, + "grad_norm": 0.9235963225364685, + "learning_rate": 1.0533224988478176e-05, + "loss": 0.5887, + "step": 21760 + }, + { + "epoch": 0.48312449362382215, + "grad_norm": 1.3279688358306885, + "learning_rate": 1.0529743132331725e-05, + "loss": 0.3984, + "step": 21765 + }, + { + "epoch": 0.4832354801833498, + "grad_norm": 1.1080505847930908, + "learning_rate": 1.0526261211780877e-05, + "loss": 0.3861, + "step": 21770 + }, + { + "epoch": 0.48334646674287746, + "grad_norm": 1.468857765197754, + "learning_rate": 1.0522779227248956e-05, + "loss": 0.5409, + "step": 21775 + }, + { + "epoch": 0.4834574533024051, + "grad_norm": 1.0246763229370117, + "learning_rate": 1.0519297179159289e-05, + "loss": 0.3904, + "step": 21780 + }, + { + "epoch": 0.4835684398619327, + "grad_norm": 0.8548372387886047, + "learning_rate": 1.0515815067935213e-05, + "loss": 0.4115, + "step": 21785 + }, + { + "epoch": 0.48367942642146033, + "grad_norm": 3.396918535232544, + "learning_rate": 1.0512332894000072e-05, + "loss": 0.4362, + "step": 21790 + }, + { + "epoch": 0.483790412980988, + "grad_norm": 0.9646040797233582, + "learning_rate": 1.0508850657777217e-05, + "loss": 0.4026, + "step": 21795 + }, + { + "epoch": 0.48390139954051564, + "grad_norm": 1.2373651266098022, + "learning_rate": 1.0505368359690002e-05, + "loss": 0.3865, + "step": 21800 + }, + { + "epoch": 0.48401238610004327, + "grad_norm": 1.5203875303268433, + "learning_rate": 1.0501886000161799e-05, + "loss": 0.611, + "step": 21805 + }, + { + "epoch": 0.48412337265957095, + "grad_norm": 0.7656844854354858, + "learning_rate": 1.0498403579615978e-05, + "loss": 0.5243, + "step": 21810 + }, + { + "epoch": 0.4842343592190986, + "grad_norm": 1.5859229564666748, + "learning_rate": 1.0494921098475923e-05, + "loss": 0.4365, + "step": 21815 + }, + { + "epoch": 0.4843453457786262, + "grad_norm": 1.1974952220916748, + "learning_rate": 1.0491438557165022e-05, + "loss": 0.3849, + "step": 21820 + }, + { + "epoch": 0.48445633233815383, + "grad_norm": 1.2102409601211548, + "learning_rate": 1.0487955956106666e-05, + "loss": 0.4075, + "step": 21825 + }, + { + "epoch": 0.4845673188976815, + "grad_norm": 1.5311321020126343, + "learning_rate": 1.0484473295724264e-05, + "loss": 0.4312, + "step": 21830 + }, + { + "epoch": 0.48467830545720914, + "grad_norm": 1.7898871898651123, + "learning_rate": 1.0480990576441223e-05, + "loss": 0.4472, + "step": 21835 + }, + { + "epoch": 0.48478929201673676, + "grad_norm": 1.4970860481262207, + "learning_rate": 1.047750779868096e-05, + "loss": 0.3975, + "step": 21840 + }, + { + "epoch": 0.4849002785762644, + "grad_norm": 1.2085856199264526, + "learning_rate": 1.04740249628669e-05, + "loss": 0.3633, + "step": 21845 + }, + { + "epoch": 0.48501126513579207, + "grad_norm": 1.2464152574539185, + "learning_rate": 1.0470542069422475e-05, + "loss": 0.4639, + "step": 21850 + }, + { + "epoch": 0.4851222516953197, + "grad_norm": 0.94156414270401, + "learning_rate": 1.0467059118771126e-05, + "loss": 0.363, + "step": 21855 + }, + { + "epoch": 0.4852332382548473, + "grad_norm": 1.0619620084762573, + "learning_rate": 1.0463576111336293e-05, + "loss": 0.3719, + "step": 21860 + }, + { + "epoch": 0.485344224814375, + "grad_norm": 1.4810116291046143, + "learning_rate": 1.046009304754143e-05, + "loss": 0.27, + "step": 21865 + }, + { + "epoch": 0.48545521137390263, + "grad_norm": 1.1808918714523315, + "learning_rate": 1.0456609927809997e-05, + "loss": 0.361, + "step": 21870 + }, + { + "epoch": 0.48556619793343025, + "grad_norm": 0.9369193315505981, + "learning_rate": 1.0453126752565463e-05, + "loss": 0.4352, + "step": 21875 + }, + { + "epoch": 0.4856771844929579, + "grad_norm": 1.1464390754699707, + "learning_rate": 1.0449643522231296e-05, + "loss": 0.5644, + "step": 21880 + }, + { + "epoch": 0.48578817105248556, + "grad_norm": 1.6998164653778076, + "learning_rate": 1.044616023723098e-05, + "loss": 0.5686, + "step": 21885 + }, + { + "epoch": 0.4858991576120132, + "grad_norm": 0.9991883635520935, + "learning_rate": 1.0442676897987995e-05, + "loss": 0.3721, + "step": 21890 + }, + { + "epoch": 0.4860101441715408, + "grad_norm": 1.3870561122894287, + "learning_rate": 1.043919350492584e-05, + "loss": 0.3989, + "step": 21895 + }, + { + "epoch": 0.48612113073106844, + "grad_norm": 0.7130857110023499, + "learning_rate": 1.0435710058468011e-05, + "loss": 0.5284, + "step": 21900 + }, + { + "epoch": 0.4862321172905961, + "grad_norm": 1.0134477615356445, + "learning_rate": 1.0432226559038018e-05, + "loss": 0.3538, + "step": 21905 + }, + { + "epoch": 0.48634310385012375, + "grad_norm": 1.3501771688461304, + "learning_rate": 1.0428743007059366e-05, + "loss": 0.3259, + "step": 21910 + }, + { + "epoch": 0.4864540904096514, + "grad_norm": 1.179742455482483, + "learning_rate": 1.0425259402955577e-05, + "loss": 0.4111, + "step": 21915 + }, + { + "epoch": 0.48656507696917906, + "grad_norm": 0.9390612244606018, + "learning_rate": 1.0421775747150174e-05, + "loss": 0.3326, + "step": 21920 + }, + { + "epoch": 0.4866760635287067, + "grad_norm": 1.1103237867355347, + "learning_rate": 1.0418292040066693e-05, + "loss": 0.4458, + "step": 21925 + }, + { + "epoch": 0.4867870500882343, + "grad_norm": 1.2311724424362183, + "learning_rate": 1.0414808282128668e-05, + "loss": 0.3293, + "step": 21930 + }, + { + "epoch": 0.48689803664776193, + "grad_norm": 1.5785497426986694, + "learning_rate": 1.0411324473759643e-05, + "loss": 0.4245, + "step": 21935 + }, + { + "epoch": 0.4870090232072896, + "grad_norm": 1.5965907573699951, + "learning_rate": 1.0407840615383168e-05, + "loss": 0.3961, + "step": 21940 + }, + { + "epoch": 0.48712000976681724, + "grad_norm": 1.7988016605377197, + "learning_rate": 1.04043567074228e-05, + "loss": 0.4447, + "step": 21945 + }, + { + "epoch": 0.48723099632634487, + "grad_norm": 0.9706549048423767, + "learning_rate": 1.0400872750302095e-05, + "loss": 0.4787, + "step": 21950 + }, + { + "epoch": 0.4873419828858725, + "grad_norm": 1.2334511280059814, + "learning_rate": 1.039738874444463e-05, + "loss": 0.4045, + "step": 21955 + }, + { + "epoch": 0.4874529694454002, + "grad_norm": 1.216166377067566, + "learning_rate": 1.0393904690273974e-05, + "loss": 0.2541, + "step": 21960 + }, + { + "epoch": 0.4875639560049278, + "grad_norm": 0.8982766270637512, + "learning_rate": 1.0390420588213708e-05, + "loss": 0.4672, + "step": 21965 + }, + { + "epoch": 0.4876749425644554, + "grad_norm": 1.6147069931030273, + "learning_rate": 1.0386936438687414e-05, + "loss": 0.4673, + "step": 21970 + }, + { + "epoch": 0.4877859291239831, + "grad_norm": 1.2869994640350342, + "learning_rate": 1.0383452242118686e-05, + "loss": 0.4443, + "step": 21975 + }, + { + "epoch": 0.48789691568351073, + "grad_norm": 1.865621566772461, + "learning_rate": 1.037996799893112e-05, + "loss": 0.3578, + "step": 21980 + }, + { + "epoch": 0.48800790224303836, + "grad_norm": 1.2217044830322266, + "learning_rate": 1.0376483709548319e-05, + "loss": 0.4546, + "step": 21985 + }, + { + "epoch": 0.488118888802566, + "grad_norm": 1.1836082935333252, + "learning_rate": 1.0372999374393893e-05, + "loss": 0.4257, + "step": 21990 + }, + { + "epoch": 0.48822987536209367, + "grad_norm": 0.815844714641571, + "learning_rate": 1.0369514993891451e-05, + "loss": 0.372, + "step": 21995 + }, + { + "epoch": 0.4883408619216213, + "grad_norm": 1.2024153470993042, + "learning_rate": 1.0366030568464618e-05, + "loss": 0.4395, + "step": 22000 + }, + { + "epoch": 0.4884518484811489, + "grad_norm": 1.3669815063476562, + "learning_rate": 1.0362546098537012e-05, + "loss": 0.442, + "step": 22005 + }, + { + "epoch": 0.48856283504067655, + "grad_norm": 0.7903822064399719, + "learning_rate": 1.035906158453227e-05, + "loss": 0.3806, + "step": 22010 + }, + { + "epoch": 0.48867382160020423, + "grad_norm": 1.359349012374878, + "learning_rate": 1.0355577026874026e-05, + "loss": 0.4904, + "step": 22015 + }, + { + "epoch": 0.48878480815973185, + "grad_norm": 1.09807288646698, + "learning_rate": 1.0352092425985919e-05, + "loss": 0.5047, + "step": 22020 + }, + { + "epoch": 0.4888957947192595, + "grad_norm": 1.374467134475708, + "learning_rate": 1.0348607782291595e-05, + "loss": 0.4666, + "step": 22025 + }, + { + "epoch": 0.48900678127878716, + "grad_norm": 0.9287629127502441, + "learning_rate": 1.0345123096214706e-05, + "loss": 0.3533, + "step": 22030 + }, + { + "epoch": 0.4891177678383148, + "grad_norm": 1.2469364404678345, + "learning_rate": 1.0341638368178905e-05, + "loss": 0.4242, + "step": 22035 + }, + { + "epoch": 0.4892287543978424, + "grad_norm": 1.1202641725540161, + "learning_rate": 1.0338153598607862e-05, + "loss": 0.4986, + "step": 22040 + }, + { + "epoch": 0.48933974095737004, + "grad_norm": 1.2364095449447632, + "learning_rate": 1.0334668787925237e-05, + "loss": 0.4598, + "step": 22045 + }, + { + "epoch": 0.4894507275168977, + "grad_norm": 1.5487759113311768, + "learning_rate": 1.0331183936554703e-05, + "loss": 0.4866, + "step": 22050 + }, + { + "epoch": 0.48956171407642535, + "grad_norm": 1.3600040674209595, + "learning_rate": 1.0327699044919936e-05, + "loss": 0.4836, + "step": 22055 + }, + { + "epoch": 0.489672700635953, + "grad_norm": 1.1588906049728394, + "learning_rate": 1.0324214113444619e-05, + "loss": 0.5456, + "step": 22060 + }, + { + "epoch": 0.48978368719548065, + "grad_norm": 1.1482707262039185, + "learning_rate": 1.0320729142552437e-05, + "loss": 0.331, + "step": 22065 + }, + { + "epoch": 0.4898946737550083, + "grad_norm": 1.0912998914718628, + "learning_rate": 1.0317244132667081e-05, + "loss": 0.3431, + "step": 22070 + }, + { + "epoch": 0.4900056603145359, + "grad_norm": 1.4887139797210693, + "learning_rate": 1.031375908421225e-05, + "loss": 0.5176, + "step": 22075 + }, + { + "epoch": 0.49011664687406353, + "grad_norm": 1.220666527748108, + "learning_rate": 1.0310273997611639e-05, + "loss": 0.4706, + "step": 22080 + }, + { + "epoch": 0.4902276334335912, + "grad_norm": 1.1723802089691162, + "learning_rate": 1.0306788873288958e-05, + "loss": 0.3419, + "step": 22085 + }, + { + "epoch": 0.49033861999311884, + "grad_norm": 0.8831339478492737, + "learning_rate": 1.0303303711667914e-05, + "loss": 0.1974, + "step": 22090 + }, + { + "epoch": 0.49044960655264647, + "grad_norm": 0.974607527256012, + "learning_rate": 1.0299818513172221e-05, + "loss": 0.4939, + "step": 22095 + }, + { + "epoch": 0.4905605931121741, + "grad_norm": 1.4574120044708252, + "learning_rate": 1.0296333278225599e-05, + "loss": 0.4942, + "step": 22100 + }, + { + "epoch": 0.4906715796717018, + "grad_norm": 1.623812198638916, + "learning_rate": 1.0292848007251773e-05, + "loss": 0.3821, + "step": 22105 + }, + { + "epoch": 0.4907825662312294, + "grad_norm": 1.2309707403182983, + "learning_rate": 1.0289362700674466e-05, + "loss": 0.473, + "step": 22110 + }, + { + "epoch": 0.490893552790757, + "grad_norm": 1.5308640003204346, + "learning_rate": 1.0285877358917414e-05, + "loss": 0.3446, + "step": 22115 + }, + { + "epoch": 0.4910045393502847, + "grad_norm": 1.2372568845748901, + "learning_rate": 1.0282391982404347e-05, + "loss": 0.5556, + "step": 22120 + }, + { + "epoch": 0.49111552590981233, + "grad_norm": 1.0952364206314087, + "learning_rate": 1.0278906571559012e-05, + "loss": 0.452, + "step": 22125 + }, + { + "epoch": 0.49122651246933996, + "grad_norm": 1.801071286201477, + "learning_rate": 1.0275421126805154e-05, + "loss": 0.4498, + "step": 22130 + }, + { + "epoch": 0.4913374990288676, + "grad_norm": 1.126573085784912, + "learning_rate": 1.0271935648566517e-05, + "loss": 0.2875, + "step": 22135 + }, + { + "epoch": 0.49144848558839527, + "grad_norm": 0.9358459711074829, + "learning_rate": 1.0268450137266856e-05, + "loss": 0.4407, + "step": 22140 + }, + { + "epoch": 0.4915594721479229, + "grad_norm": 0.8276937007904053, + "learning_rate": 1.0264964593329928e-05, + "loss": 0.4608, + "step": 22145 + }, + { + "epoch": 0.4916704587074505, + "grad_norm": 0.99130779504776, + "learning_rate": 1.0261479017179494e-05, + "loss": 0.466, + "step": 22150 + }, + { + "epoch": 0.49178144526697815, + "grad_norm": 1.2398957014083862, + "learning_rate": 1.0257993409239318e-05, + "loss": 0.3074, + "step": 22155 + }, + { + "epoch": 0.4918924318265058, + "grad_norm": 1.6642937660217285, + "learning_rate": 1.0254507769933166e-05, + "loss": 0.3289, + "step": 22160 + }, + { + "epoch": 0.49200341838603345, + "grad_norm": 0.7789919376373291, + "learning_rate": 1.0251022099684815e-05, + "loss": 0.5229, + "step": 22165 + }, + { + "epoch": 0.4921144049455611, + "grad_norm": 2.206631660461426, + "learning_rate": 1.024753639891804e-05, + "loss": 0.4176, + "step": 22170 + }, + { + "epoch": 0.49222539150508876, + "grad_norm": 1.417659044265747, + "learning_rate": 1.0244050668056617e-05, + "loss": 0.3517, + "step": 22175 + }, + { + "epoch": 0.4923363780646164, + "grad_norm": 1.23202645778656, + "learning_rate": 1.0240564907524335e-05, + "loss": 0.5035, + "step": 22180 + }, + { + "epoch": 0.492447364624144, + "grad_norm": 1.1205260753631592, + "learning_rate": 1.0237079117744977e-05, + "loss": 0.4148, + "step": 22185 + }, + { + "epoch": 0.49255835118367164, + "grad_norm": 0.7706506848335266, + "learning_rate": 1.0233593299142336e-05, + "loss": 0.3719, + "step": 22190 + }, + { + "epoch": 0.4926693377431993, + "grad_norm": 1.1921621561050415, + "learning_rate": 1.0230107452140203e-05, + "loss": 0.4742, + "step": 22195 + }, + { + "epoch": 0.49278032430272695, + "grad_norm": 1.1580519676208496, + "learning_rate": 1.0226621577162377e-05, + "loss": 0.4158, + "step": 22200 + }, + { + "epoch": 0.4928913108622546, + "grad_norm": 1.3027304410934448, + "learning_rate": 1.0223135674632663e-05, + "loss": 0.4033, + "step": 22205 + }, + { + "epoch": 0.4930022974217822, + "grad_norm": 1.076047420501709, + "learning_rate": 1.021964974497486e-05, + "loss": 0.5984, + "step": 22210 + }, + { + "epoch": 0.4931132839813099, + "grad_norm": 1.8300280570983887, + "learning_rate": 1.021616378861278e-05, + "loss": 0.3093, + "step": 22215 + }, + { + "epoch": 0.4932242705408375, + "grad_norm": 0.9792448282241821, + "learning_rate": 1.021267780597023e-05, + "loss": 0.3822, + "step": 22220 + }, + { + "epoch": 0.49333525710036513, + "grad_norm": 1.382432222366333, + "learning_rate": 1.0209191797471026e-05, + "loss": 0.3343, + "step": 22225 + }, + { + "epoch": 0.4934462436598928, + "grad_norm": 1.1926004886627197, + "learning_rate": 1.0205705763538985e-05, + "loss": 0.4739, + "step": 22230 + }, + { + "epoch": 0.49355723021942044, + "grad_norm": 1.1565606594085693, + "learning_rate": 1.0202219704597924e-05, + "loss": 0.4212, + "step": 22235 + }, + { + "epoch": 0.49366821677894807, + "grad_norm": 0.8615053296089172, + "learning_rate": 1.0198733621071672e-05, + "loss": 0.5037, + "step": 22240 + }, + { + "epoch": 0.4937792033384757, + "grad_norm": 1.8336201906204224, + "learning_rate": 1.0195247513384054e-05, + "loss": 0.582, + "step": 22245 + }, + { + "epoch": 0.4938901898980034, + "grad_norm": 1.6957383155822754, + "learning_rate": 1.0191761381958897e-05, + "loss": 0.4026, + "step": 22250 + }, + { + "epoch": 0.494001176457531, + "grad_norm": 1.1199365854263306, + "learning_rate": 1.0188275227220034e-05, + "loss": 0.4577, + "step": 22255 + }, + { + "epoch": 0.4941121630170586, + "grad_norm": 1.173117756843567, + "learning_rate": 1.01847890495913e-05, + "loss": 0.266, + "step": 22260 + }, + { + "epoch": 0.49422314957658625, + "grad_norm": 0.8515411615371704, + "learning_rate": 1.0181302849496535e-05, + "loss": 0.3795, + "step": 22265 + }, + { + "epoch": 0.49433413613611393, + "grad_norm": 1.5342164039611816, + "learning_rate": 1.0177816627359575e-05, + "loss": 0.4661, + "step": 22270 + }, + { + "epoch": 0.49444512269564156, + "grad_norm": 1.0877432823181152, + "learning_rate": 1.0174330383604266e-05, + "loss": 0.3531, + "step": 22275 + }, + { + "epoch": 0.4945561092551692, + "grad_norm": 0.9727147817611694, + "learning_rate": 1.0170844118654457e-05, + "loss": 0.4371, + "step": 22280 + }, + { + "epoch": 0.49466709581469687, + "grad_norm": 0.9545711874961853, + "learning_rate": 1.0167357832933988e-05, + "loss": 0.4294, + "step": 22285 + }, + { + "epoch": 0.4947780823742245, + "grad_norm": 0.8435078263282776, + "learning_rate": 1.0163871526866718e-05, + "loss": 0.5171, + "step": 22290 + }, + { + "epoch": 0.4948890689337521, + "grad_norm": 1.1975470781326294, + "learning_rate": 1.0160385200876497e-05, + "loss": 0.442, + "step": 22295 + }, + { + "epoch": 0.49500005549327974, + "grad_norm": 1.1052860021591187, + "learning_rate": 1.0156898855387183e-05, + "loss": 0.3691, + "step": 22300 + }, + { + "epoch": 0.4951110420528074, + "grad_norm": 0.8964700698852539, + "learning_rate": 1.015341249082263e-05, + "loss": 0.3977, + "step": 22305 + }, + { + "epoch": 0.49522202861233505, + "grad_norm": 1.4109879732131958, + "learning_rate": 1.0149926107606702e-05, + "loss": 0.5525, + "step": 22310 + }, + { + "epoch": 0.4953330151718627, + "grad_norm": 1.1375197172164917, + "learning_rate": 1.0146439706163259e-05, + "loss": 0.507, + "step": 22315 + }, + { + "epoch": 0.4954440017313903, + "grad_norm": 0.9993934035301208, + "learning_rate": 1.0142953286916166e-05, + "loss": 0.3335, + "step": 22320 + }, + { + "epoch": 0.495554988290918, + "grad_norm": 1.6546269655227661, + "learning_rate": 1.0139466850289296e-05, + "loss": 0.3553, + "step": 22325 + }, + { + "epoch": 0.4956659748504456, + "grad_norm": 1.108886957168579, + "learning_rate": 1.0135980396706513e-05, + "loss": 0.4506, + "step": 22330 + }, + { + "epoch": 0.49577696140997324, + "grad_norm": 1.3343003988265991, + "learning_rate": 1.0132493926591688e-05, + "loss": 0.417, + "step": 22335 + }, + { + "epoch": 0.4958879479695009, + "grad_norm": 1.0910420417785645, + "learning_rate": 1.0129007440368699e-05, + "loss": 0.372, + "step": 22340 + }, + { + "epoch": 0.49599893452902855, + "grad_norm": 1.4987506866455078, + "learning_rate": 1.0125520938461415e-05, + "loss": 0.4328, + "step": 22345 + }, + { + "epoch": 0.49610992108855617, + "grad_norm": 1.265060305595398, + "learning_rate": 1.0122034421293719e-05, + "loss": 0.4198, + "step": 22350 + }, + { + "epoch": 0.4962209076480838, + "grad_norm": 1.5524022579193115, + "learning_rate": 1.0118547889289485e-05, + "loss": 0.4624, + "step": 22355 + }, + { + "epoch": 0.4963318942076115, + "grad_norm": 0.7765639424324036, + "learning_rate": 1.0115061342872597e-05, + "loss": 0.438, + "step": 22360 + }, + { + "epoch": 0.4964428807671391, + "grad_norm": 0.9765235781669617, + "learning_rate": 1.0111574782466935e-05, + "loss": 0.4708, + "step": 22365 + }, + { + "epoch": 0.49655386732666673, + "grad_norm": 1.64158034324646, + "learning_rate": 1.0108088208496385e-05, + "loss": 0.5551, + "step": 22370 + }, + { + "epoch": 0.49666485388619436, + "grad_norm": 1.1808514595031738, + "learning_rate": 1.0104601621384835e-05, + "loss": 0.3945, + "step": 22375 + }, + { + "epoch": 0.49677584044572204, + "grad_norm": 1.438317894935608, + "learning_rate": 1.0101115021556172e-05, + "loss": 0.4569, + "step": 22380 + }, + { + "epoch": 0.49688682700524966, + "grad_norm": 1.202447533607483, + "learning_rate": 1.0097628409434281e-05, + "loss": 0.4311, + "step": 22385 + }, + { + "epoch": 0.4969978135647773, + "grad_norm": 1.1385836601257324, + "learning_rate": 1.0094141785443057e-05, + "loss": 0.2666, + "step": 22390 + }, + { + "epoch": 0.497108800124305, + "grad_norm": 1.2178776264190674, + "learning_rate": 1.0090655150006389e-05, + "loss": 0.5786, + "step": 22395 + }, + { + "epoch": 0.4972197866838326, + "grad_norm": 1.0078877210617065, + "learning_rate": 1.0087168503548173e-05, + "loss": 0.3878, + "step": 22400 + }, + { + "epoch": 0.4973307732433602, + "grad_norm": 1.3112704753875732, + "learning_rate": 1.00836818464923e-05, + "loss": 0.4361, + "step": 22405 + }, + { + "epoch": 0.49744175980288785, + "grad_norm": 0.9972655773162842, + "learning_rate": 1.0080195179262673e-05, + "loss": 0.3988, + "step": 22410 + }, + { + "epoch": 0.49755274636241553, + "grad_norm": 0.8976503014564514, + "learning_rate": 1.0076708502283184e-05, + "loss": 0.4958, + "step": 22415 + }, + { + "epoch": 0.49766373292194316, + "grad_norm": 1.2193833589553833, + "learning_rate": 1.0073221815977732e-05, + "loss": 0.2802, + "step": 22420 + }, + { + "epoch": 0.4977747194814708, + "grad_norm": 1.1301801204681396, + "learning_rate": 1.0069735120770217e-05, + "loss": 0.402, + "step": 22425 + }, + { + "epoch": 0.4978857060409984, + "grad_norm": 1.3754868507385254, + "learning_rate": 1.0066248417084539e-05, + "loss": 0.3872, + "step": 22430 + }, + { + "epoch": 0.4979966926005261, + "grad_norm": 1.152056336402893, + "learning_rate": 1.0062761705344601e-05, + "loss": 0.4077, + "step": 22435 + }, + { + "epoch": 0.4981076791600537, + "grad_norm": 1.1381980180740356, + "learning_rate": 1.0059274985974305e-05, + "loss": 0.4486, + "step": 22440 + }, + { + "epoch": 0.49821866571958134, + "grad_norm": 0.9616106748580933, + "learning_rate": 1.0055788259397559e-05, + "loss": 0.4419, + "step": 22445 + }, + { + "epoch": 0.498329652279109, + "grad_norm": 1.1387845277786255, + "learning_rate": 1.005230152603826e-05, + "loss": 0.411, + "step": 22450 + }, + { + "epoch": 0.49844063883863665, + "grad_norm": 1.3904991149902344, + "learning_rate": 1.0048814786320319e-05, + "loss": 0.4552, + "step": 22455 + }, + { + "epoch": 0.4985516253981643, + "grad_norm": 1.3449314832687378, + "learning_rate": 1.0045328040667638e-05, + "loss": 0.4975, + "step": 22460 + }, + { + "epoch": 0.4986626119576919, + "grad_norm": 1.5593739748001099, + "learning_rate": 1.0041841289504125e-05, + "loss": 0.4867, + "step": 22465 + }, + { + "epoch": 0.4987735985172196, + "grad_norm": 0.911896824836731, + "learning_rate": 1.003835453325369e-05, + "loss": 0.3393, + "step": 22470 + }, + { + "epoch": 0.4988845850767472, + "grad_norm": 1.7970991134643555, + "learning_rate": 1.003486777234024e-05, + "loss": 0.4731, + "step": 22475 + }, + { + "epoch": 0.49899557163627484, + "grad_norm": 1.274659514427185, + "learning_rate": 1.0031381007187681e-05, + "loss": 0.4363, + "step": 22480 + }, + { + "epoch": 0.49910655819580246, + "grad_norm": 1.4955745935440063, + "learning_rate": 1.0027894238219923e-05, + "loss": 0.3854, + "step": 22485 + }, + { + "epoch": 0.49921754475533014, + "grad_norm": 1.678066611289978, + "learning_rate": 1.0024407465860881e-05, + "loss": 0.5047, + "step": 22490 + }, + { + "epoch": 0.49932853131485777, + "grad_norm": 1.081360101699829, + "learning_rate": 1.002092069053446e-05, + "loss": 0.4052, + "step": 22495 + }, + { + "epoch": 0.4994395178743854, + "grad_norm": 0.9500686526298523, + "learning_rate": 1.0017433912664572e-05, + "loss": 0.3784, + "step": 22500 + }, + { + "epoch": 0.4995505044339131, + "grad_norm": 1.5203646421432495, + "learning_rate": 1.0013947132675125e-05, + "loss": 0.4019, + "step": 22505 + }, + { + "epoch": 0.4996614909934407, + "grad_norm": 1.4040472507476807, + "learning_rate": 1.0010460350990037e-05, + "loss": 0.3283, + "step": 22510 + }, + { + "epoch": 0.49977247755296833, + "grad_norm": 1.0540430545806885, + "learning_rate": 1.0006973568033209e-05, + "loss": 0.4228, + "step": 22515 + }, + { + "epoch": 0.49988346411249596, + "grad_norm": 0.9384956359863281, + "learning_rate": 1.000348678422856e-05, + "loss": 0.4279, + "step": 22520 + }, + { + "epoch": 0.49999445067202364, + "grad_norm": 1.4751801490783691, + "learning_rate": 1e-05, + "loss": 0.5231, + "step": 22525 + }, + { + "epoch": 0.5001054372315512, + "grad_norm": 1.00909423828125, + "learning_rate": 9.996513215771439e-06, + "loss": 0.3545, + "step": 22530 + }, + { + "epoch": 0.5002164237910789, + "grad_norm": 1.2268149852752686, + "learning_rate": 9.993026431966793e-06, + "loss": 0.4632, + "step": 22535 + }, + { + "epoch": 0.5003274103506066, + "grad_norm": 1.1826709508895874, + "learning_rate": 9.989539649009968e-06, + "loss": 0.3902, + "step": 22540 + }, + { + "epoch": 0.5004383969101341, + "grad_norm": 1.2434492111206055, + "learning_rate": 9.986052867324878e-06, + "loss": 0.3739, + "step": 22545 + }, + { + "epoch": 0.5005493834696618, + "grad_norm": 1.4091449975967407, + "learning_rate": 9.982566087335431e-06, + "loss": 0.59, + "step": 22550 + }, + { + "epoch": 0.5006603700291895, + "grad_norm": 0.9223693609237671, + "learning_rate": 9.97907930946554e-06, + "loss": 0.3983, + "step": 22555 + }, + { + "epoch": 0.5007713565887171, + "grad_norm": 1.0859547853469849, + "learning_rate": 9.975592534139122e-06, + "loss": 0.4797, + "step": 22560 + }, + { + "epoch": 0.5008823431482448, + "grad_norm": 2.139925479888916, + "learning_rate": 9.972105761780077e-06, + "loss": 0.7093, + "step": 22565 + }, + { + "epoch": 0.5009933297077724, + "grad_norm": 1.3303583860397339, + "learning_rate": 9.968618992812324e-06, + "loss": 0.3643, + "step": 22570 + }, + { + "epoch": 0.5011043162673, + "grad_norm": 0.7934449315071106, + "learning_rate": 9.965132227659764e-06, + "loss": 0.3783, + "step": 22575 + }, + { + "epoch": 0.5012153028268277, + "grad_norm": 1.7177858352661133, + "learning_rate": 9.961645466746314e-06, + "loss": 0.4049, + "step": 22580 + }, + { + "epoch": 0.5013262893863553, + "grad_norm": 1.3820732831954956, + "learning_rate": 9.958158710495877e-06, + "loss": 0.3094, + "step": 22585 + }, + { + "epoch": 0.5014372759458829, + "grad_norm": 1.27067232131958, + "learning_rate": 9.954671959332366e-06, + "loss": 0.5818, + "step": 22590 + }, + { + "epoch": 0.5015482625054106, + "grad_norm": 0.9377214312553406, + "learning_rate": 9.951185213679686e-06, + "loss": 0.439, + "step": 22595 + }, + { + "epoch": 0.5016592490649382, + "grad_norm": 1.2688758373260498, + "learning_rate": 9.947698473961744e-06, + "loss": 0.4648, + "step": 22600 + }, + { + "epoch": 0.5017702356244659, + "grad_norm": 1.2199465036392212, + "learning_rate": 9.944211740602445e-06, + "loss": 0.3057, + "step": 22605 + }, + { + "epoch": 0.5018812221839936, + "grad_norm": 1.1889398097991943, + "learning_rate": 9.940725014025696e-06, + "loss": 0.4765, + "step": 22610 + }, + { + "epoch": 0.5019922087435211, + "grad_norm": 1.2021925449371338, + "learning_rate": 9.937238294655399e-06, + "loss": 0.2815, + "step": 22615 + }, + { + "epoch": 0.5021031953030488, + "grad_norm": 1.5429582595825195, + "learning_rate": 9.933751582915464e-06, + "loss": 0.262, + "step": 22620 + }, + { + "epoch": 0.5022141818625765, + "grad_norm": 1.4658925533294678, + "learning_rate": 9.930264879229785e-06, + "loss": 0.4531, + "step": 22625 + }, + { + "epoch": 0.5023251684221041, + "grad_norm": 1.2303109169006348, + "learning_rate": 9.926778184022273e-06, + "loss": 0.3372, + "step": 22630 + }, + { + "epoch": 0.5024361549816317, + "grad_norm": 1.1760908365249634, + "learning_rate": 9.92329149771682e-06, + "loss": 0.3905, + "step": 22635 + }, + { + "epoch": 0.5025471415411593, + "grad_norm": 1.3378584384918213, + "learning_rate": 9.919804820737328e-06, + "loss": 0.2739, + "step": 22640 + }, + { + "epoch": 0.502658128100687, + "grad_norm": 1.1107934713363647, + "learning_rate": 9.916318153507701e-06, + "loss": 0.3587, + "step": 22645 + }, + { + "epoch": 0.5027691146602147, + "grad_norm": 1.070551872253418, + "learning_rate": 9.912831496451829e-06, + "loss": 0.3671, + "step": 22650 + }, + { + "epoch": 0.5028801012197422, + "grad_norm": 1.170880675315857, + "learning_rate": 9.909344849993616e-06, + "loss": 0.3604, + "step": 22655 + }, + { + "epoch": 0.5029910877792699, + "grad_norm": 0.8394003510475159, + "learning_rate": 9.905858214556947e-06, + "loss": 0.5054, + "step": 22660 + }, + { + "epoch": 0.5031020743387976, + "grad_norm": 1.767619013786316, + "learning_rate": 9.902371590565724e-06, + "loss": 0.3654, + "step": 22665 + }, + { + "epoch": 0.5032130608983252, + "grad_norm": 1.0583374500274658, + "learning_rate": 9.898884978443833e-06, + "loss": 0.4577, + "step": 22670 + }, + { + "epoch": 0.5033240474578529, + "grad_norm": 1.1993390321731567, + "learning_rate": 9.895398378615165e-06, + "loss": 0.3257, + "step": 22675 + }, + { + "epoch": 0.5034350340173805, + "grad_norm": 1.3034275770187378, + "learning_rate": 9.891911791503618e-06, + "loss": 0.4339, + "step": 22680 + }, + { + "epoch": 0.5035460205769081, + "grad_norm": 1.5261765718460083, + "learning_rate": 9.888425217533067e-06, + "loss": 0.3689, + "step": 22685 + }, + { + "epoch": 0.5036570071364358, + "grad_norm": 1.1259262561798096, + "learning_rate": 9.88493865712741e-06, + "loss": 0.4287, + "step": 22690 + }, + { + "epoch": 0.5037679936959634, + "grad_norm": 1.1270091533660889, + "learning_rate": 9.881452110710519e-06, + "loss": 0.4501, + "step": 22695 + }, + { + "epoch": 0.503878980255491, + "grad_norm": 1.2836631536483765, + "learning_rate": 9.877965578706286e-06, + "loss": 0.4244, + "step": 22700 + }, + { + "epoch": 0.5039899668150187, + "grad_norm": 1.300953984260559, + "learning_rate": 9.874479061538588e-06, + "loss": 0.2626, + "step": 22705 + }, + { + "epoch": 0.5041009533745463, + "grad_norm": 1.2757043838500977, + "learning_rate": 9.870992559631304e-06, + "loss": 0.514, + "step": 22710 + }, + { + "epoch": 0.504211939934074, + "grad_norm": 1.0749907493591309, + "learning_rate": 9.867506073408313e-06, + "loss": 0.4767, + "step": 22715 + }, + { + "epoch": 0.5043229264936017, + "grad_norm": 1.3691682815551758, + "learning_rate": 9.86401960329349e-06, + "loss": 0.5215, + "step": 22720 + }, + { + "epoch": 0.5044339130531292, + "grad_norm": 1.7371306419372559, + "learning_rate": 9.860533149710705e-06, + "loss": 0.5877, + "step": 22725 + }, + { + "epoch": 0.5045448996126569, + "grad_norm": 1.204622745513916, + "learning_rate": 9.857046713083836e-06, + "loss": 0.434, + "step": 22730 + }, + { + "epoch": 0.5046558861721846, + "grad_norm": 0.9504795074462891, + "learning_rate": 9.853560293836743e-06, + "loss": 0.4497, + "step": 22735 + }, + { + "epoch": 0.5047668727317122, + "grad_norm": 1.0582622289657593, + "learning_rate": 9.850073892393303e-06, + "loss": 0.3307, + "step": 22740 + }, + { + "epoch": 0.5048778592912398, + "grad_norm": 1.0019503831863403, + "learning_rate": 9.846587509177374e-06, + "loss": 0.3662, + "step": 22745 + }, + { + "epoch": 0.5049888458507674, + "grad_norm": 1.3375298976898193, + "learning_rate": 9.843101144612824e-06, + "loss": 0.4374, + "step": 22750 + }, + { + "epoch": 0.5050998324102951, + "grad_norm": 1.2192867994308472, + "learning_rate": 9.839614799123507e-06, + "loss": 0.5075, + "step": 22755 + }, + { + "epoch": 0.5052108189698228, + "grad_norm": 1.1226353645324707, + "learning_rate": 9.836128473133282e-06, + "loss": 0.4266, + "step": 22760 + }, + { + "epoch": 0.5053218055293504, + "grad_norm": 1.436882495880127, + "learning_rate": 9.832642167066015e-06, + "loss": 0.4404, + "step": 22765 + }, + { + "epoch": 0.505432792088878, + "grad_norm": 0.9391142725944519, + "learning_rate": 9.829155881345546e-06, + "loss": 0.3448, + "step": 22770 + }, + { + "epoch": 0.5055437786484057, + "grad_norm": 1.5629217624664307, + "learning_rate": 9.825669616395737e-06, + "loss": 0.3353, + "step": 22775 + }, + { + "epoch": 0.5056547652079333, + "grad_norm": 1.2283124923706055, + "learning_rate": 9.822183372640426e-06, + "loss": 0.4115, + "step": 22780 + }, + { + "epoch": 0.505765751767461, + "grad_norm": 1.7312462329864502, + "learning_rate": 9.818697150503467e-06, + "loss": 0.5001, + "step": 22785 + }, + { + "epoch": 0.5058767383269887, + "grad_norm": 1.1549898386001587, + "learning_rate": 9.815210950408703e-06, + "loss": 0.4316, + "step": 22790 + }, + { + "epoch": 0.5059877248865162, + "grad_norm": 1.2889511585235596, + "learning_rate": 9.811724772779968e-06, + "loss": 0.3246, + "step": 22795 + }, + { + "epoch": 0.5060987114460439, + "grad_norm": 1.1245282888412476, + "learning_rate": 9.808238618041108e-06, + "loss": 0.4711, + "step": 22800 + }, + { + "epoch": 0.5062096980055715, + "grad_norm": 1.3970367908477783, + "learning_rate": 9.804752486615947e-06, + "loss": 0.4878, + "step": 22805 + }, + { + "epoch": 0.5063206845650992, + "grad_norm": 0.7514325380325317, + "learning_rate": 9.801266378928326e-06, + "loss": 0.3886, + "step": 22810 + }, + { + "epoch": 0.5064316711246268, + "grad_norm": 0.8837243914604187, + "learning_rate": 9.797780295402078e-06, + "loss": 0.3596, + "step": 22815 + }, + { + "epoch": 0.5065426576841544, + "grad_norm": 1.0351483821868896, + "learning_rate": 9.79429423646102e-06, + "loss": 0.2448, + "step": 22820 + }, + { + "epoch": 0.5066536442436821, + "grad_norm": 1.0443217754364014, + "learning_rate": 9.790808202528977e-06, + "loss": 0.484, + "step": 22825 + }, + { + "epoch": 0.5067646308032098, + "grad_norm": 1.6663671731948853, + "learning_rate": 9.787322194029773e-06, + "loss": 0.4237, + "step": 22830 + }, + { + "epoch": 0.5068756173627373, + "grad_norm": 1.1216480731964111, + "learning_rate": 9.783836211387224e-06, + "loss": 0.429, + "step": 22835 + }, + { + "epoch": 0.506986603922265, + "grad_norm": 0.8126741647720337, + "learning_rate": 9.780350255025143e-06, + "loss": 0.6123, + "step": 22840 + }, + { + "epoch": 0.5070975904817927, + "grad_norm": 1.0116031169891357, + "learning_rate": 9.776864325367338e-06, + "loss": 0.3102, + "step": 22845 + }, + { + "epoch": 0.5072085770413203, + "grad_norm": 1.1866459846496582, + "learning_rate": 9.773378422837624e-06, + "loss": 0.4162, + "step": 22850 + }, + { + "epoch": 0.507319563600848, + "grad_norm": 1.530235767364502, + "learning_rate": 9.7698925478598e-06, + "loss": 0.4966, + "step": 22855 + }, + { + "epoch": 0.5074305501603755, + "grad_norm": 1.1207528114318848, + "learning_rate": 9.76640670085767e-06, + "loss": 0.3392, + "step": 22860 + }, + { + "epoch": 0.5075415367199032, + "grad_norm": 0.8439724445343018, + "learning_rate": 9.762920882255026e-06, + "loss": 0.4822, + "step": 22865 + }, + { + "epoch": 0.5076525232794309, + "grad_norm": 1.509404182434082, + "learning_rate": 9.759435092475667e-06, + "loss": 0.4871, + "step": 22870 + }, + { + "epoch": 0.5077635098389585, + "grad_norm": 0.8720496892929077, + "learning_rate": 9.755949331943386e-06, + "loss": 0.3836, + "step": 22875 + }, + { + "epoch": 0.5078744963984861, + "grad_norm": 1.2868167161941528, + "learning_rate": 9.752463601081963e-06, + "loss": 0.3545, + "step": 22880 + }, + { + "epoch": 0.5079854829580138, + "grad_norm": 1.5097538232803345, + "learning_rate": 9.748977900315189e-06, + "loss": 0.5467, + "step": 22885 + }, + { + "epoch": 0.5080964695175414, + "grad_norm": 1.5139718055725098, + "learning_rate": 9.745492230066835e-06, + "loss": 0.3364, + "step": 22890 + }, + { + "epoch": 0.5082074560770691, + "grad_norm": 0.7690618634223938, + "learning_rate": 9.742006590760683e-06, + "loss": 0.2862, + "step": 22895 + }, + { + "epoch": 0.5083184426365968, + "grad_norm": 1.22612464427948, + "learning_rate": 9.738520982820509e-06, + "loss": 0.3642, + "step": 22900 + }, + { + "epoch": 0.5084294291961243, + "grad_norm": 1.3420028686523438, + "learning_rate": 9.735035406670072e-06, + "loss": 0.4987, + "step": 22905 + }, + { + "epoch": 0.508540415755652, + "grad_norm": 1.1621700525283813, + "learning_rate": 9.731549862733147e-06, + "loss": 0.4764, + "step": 22910 + }, + { + "epoch": 0.5086514023151796, + "grad_norm": 1.2968989610671997, + "learning_rate": 9.728064351433484e-06, + "loss": 0.4678, + "step": 22915 + }, + { + "epoch": 0.5087623888747073, + "grad_norm": 1.6852279901504517, + "learning_rate": 9.72457887319485e-06, + "loss": 0.4159, + "step": 22920 + }, + { + "epoch": 0.5088733754342349, + "grad_norm": 0.8907633423805237, + "learning_rate": 9.72109342844099e-06, + "loss": 0.4015, + "step": 22925 + }, + { + "epoch": 0.5089843619937625, + "grad_norm": 1.4889419078826904, + "learning_rate": 9.717608017595653e-06, + "loss": 0.3983, + "step": 22930 + }, + { + "epoch": 0.5090953485532902, + "grad_norm": 1.1271584033966064, + "learning_rate": 9.714122641082593e-06, + "loss": 0.5151, + "step": 22935 + }, + { + "epoch": 0.5092063351128179, + "grad_norm": 0.8067299723625183, + "learning_rate": 9.710637299325537e-06, + "loss": 0.4571, + "step": 22940 + }, + { + "epoch": 0.5093173216723454, + "grad_norm": 1.6813609600067139, + "learning_rate": 9.707151992748232e-06, + "loss": 0.5164, + "step": 22945 + }, + { + "epoch": 0.5094283082318731, + "grad_norm": 1.0879247188568115, + "learning_rate": 9.703666721774403e-06, + "loss": 0.4272, + "step": 22950 + }, + { + "epoch": 0.5095392947914008, + "grad_norm": 1.1162388324737549, + "learning_rate": 9.70018148682778e-06, + "loss": 0.4953, + "step": 22955 + }, + { + "epoch": 0.5096502813509284, + "grad_norm": 0.8649173378944397, + "learning_rate": 9.69669628833209e-06, + "loss": 0.3846, + "step": 22960 + }, + { + "epoch": 0.5097612679104561, + "grad_norm": 1.5884920358657837, + "learning_rate": 9.693211126711046e-06, + "loss": 0.2942, + "step": 22965 + }, + { + "epoch": 0.5098722544699836, + "grad_norm": 1.2138025760650635, + "learning_rate": 9.689726002388363e-06, + "loss": 0.4575, + "step": 22970 + }, + { + "epoch": 0.5099832410295113, + "grad_norm": 1.2881455421447754, + "learning_rate": 9.686240915787753e-06, + "loss": 0.4061, + "step": 22975 + }, + { + "epoch": 0.510094227589039, + "grad_norm": 1.0300076007843018, + "learning_rate": 9.682755867332919e-06, + "loss": 0.2838, + "step": 22980 + }, + { + "epoch": 0.5102052141485666, + "grad_norm": 1.7775558233261108, + "learning_rate": 9.679270857447567e-06, + "loss": 0.4665, + "step": 22985 + }, + { + "epoch": 0.5103162007080942, + "grad_norm": 0.9555680751800537, + "learning_rate": 9.675785886555383e-06, + "loss": 0.5005, + "step": 22990 + }, + { + "epoch": 0.5104271872676219, + "grad_norm": 1.1331521272659302, + "learning_rate": 9.672300955080067e-06, + "loss": 0.3951, + "step": 22995 + }, + { + "epoch": 0.5105381738271495, + "grad_norm": 0.889107346534729, + "learning_rate": 9.6688160634453e-06, + "loss": 0.3451, + "step": 23000 + }, + { + "epoch": 0.5106491603866772, + "grad_norm": 3.3628461360931396, + "learning_rate": 9.665331212074768e-06, + "loss": 0.3267, + "step": 23005 + }, + { + "epoch": 0.5107601469462049, + "grad_norm": 1.115371584892273, + "learning_rate": 9.661846401392141e-06, + "loss": 0.3528, + "step": 23010 + }, + { + "epoch": 0.5108711335057324, + "grad_norm": 1.2750897407531738, + "learning_rate": 9.658361631821095e-06, + "loss": 0.4573, + "step": 23015 + }, + { + "epoch": 0.5109821200652601, + "grad_norm": 1.5250240564346313, + "learning_rate": 9.654876903785299e-06, + "loss": 0.5291, + "step": 23020 + }, + { + "epoch": 0.5110931066247877, + "grad_norm": 1.0960813760757446, + "learning_rate": 9.651392217708408e-06, + "loss": 0.3362, + "step": 23025 + }, + { + "epoch": 0.5112040931843154, + "grad_norm": 1.3788139820098877, + "learning_rate": 9.647907574014086e-06, + "loss": 0.4561, + "step": 23030 + }, + { + "epoch": 0.511315079743843, + "grad_norm": 0.8054192662239075, + "learning_rate": 9.644422973125977e-06, + "loss": 0.401, + "step": 23035 + }, + { + "epoch": 0.5114260663033706, + "grad_norm": 1.1127040386199951, + "learning_rate": 9.64093841546773e-06, + "loss": 0.4227, + "step": 23040 + }, + { + "epoch": 0.5115370528628983, + "grad_norm": 1.0898431539535522, + "learning_rate": 9.637453901462992e-06, + "loss": 0.4615, + "step": 23045 + }, + { + "epoch": 0.511648039422426, + "grad_norm": 1.0687729120254517, + "learning_rate": 9.633969431535385e-06, + "loss": 0.4972, + "step": 23050 + }, + { + "epoch": 0.5117590259819536, + "grad_norm": 1.4021871089935303, + "learning_rate": 9.630485006108554e-06, + "loss": 0.4042, + "step": 23055 + }, + { + "epoch": 0.5118700125414812, + "grad_norm": 1.0496760606765747, + "learning_rate": 9.627000625606112e-06, + "loss": 0.4764, + "step": 23060 + }, + { + "epoch": 0.5119809991010089, + "grad_norm": 1.0399717092514038, + "learning_rate": 9.623516290451683e-06, + "loss": 0.3277, + "step": 23065 + }, + { + "epoch": 0.5120919856605365, + "grad_norm": 1.9745243787765503, + "learning_rate": 9.620032001068883e-06, + "loss": 0.4506, + "step": 23070 + }, + { + "epoch": 0.5122029722200642, + "grad_norm": 1.5703458786010742, + "learning_rate": 9.616547757881317e-06, + "loss": 0.5289, + "step": 23075 + }, + { + "epoch": 0.5123139587795917, + "grad_norm": 1.2810927629470825, + "learning_rate": 9.61306356131259e-06, + "loss": 0.4078, + "step": 23080 + }, + { + "epoch": 0.5124249453391194, + "grad_norm": 1.4439598321914673, + "learning_rate": 9.609579411786297e-06, + "loss": 0.445, + "step": 23085 + }, + { + "epoch": 0.5125359318986471, + "grad_norm": 1.3506888151168823, + "learning_rate": 9.60609530972603e-06, + "loss": 0.4064, + "step": 23090 + }, + { + "epoch": 0.5126469184581747, + "grad_norm": 1.1572383642196655, + "learning_rate": 9.602611255555372e-06, + "loss": 0.5398, + "step": 23095 + }, + { + "epoch": 0.5127579050177024, + "grad_norm": 1.0995047092437744, + "learning_rate": 9.599127249697905e-06, + "loss": 0.4621, + "step": 23100 + }, + { + "epoch": 0.51286889157723, + "grad_norm": 1.1626933813095093, + "learning_rate": 9.595643292577206e-06, + "loss": 0.3887, + "step": 23105 + }, + { + "epoch": 0.5129798781367576, + "grad_norm": 0.9370729327201843, + "learning_rate": 9.592159384616835e-06, + "loss": 0.3973, + "step": 23110 + }, + { + "epoch": 0.5130908646962853, + "grad_norm": 2.0881588459014893, + "learning_rate": 9.588675526240362e-06, + "loss": 0.4405, + "step": 23115 + }, + { + "epoch": 0.513201851255813, + "grad_norm": 1.0966885089874268, + "learning_rate": 9.585191717871336e-06, + "loss": 0.3349, + "step": 23120 + }, + { + "epoch": 0.5133128378153405, + "grad_norm": 1.0974148511886597, + "learning_rate": 9.581707959933309e-06, + "loss": 0.3571, + "step": 23125 + }, + { + "epoch": 0.5134238243748682, + "grad_norm": 1.4036043882369995, + "learning_rate": 9.57822425284983e-06, + "loss": 0.4259, + "step": 23130 + }, + { + "epoch": 0.5135348109343958, + "grad_norm": 1.3437319993972778, + "learning_rate": 9.574740597044427e-06, + "loss": 0.5187, + "step": 23135 + }, + { + "epoch": 0.5136457974939235, + "grad_norm": 0.8622636795043945, + "learning_rate": 9.571256992940641e-06, + "loss": 0.4853, + "step": 23140 + }, + { + "epoch": 0.5137567840534512, + "grad_norm": 1.7558482885360718, + "learning_rate": 9.567773440961988e-06, + "loss": 0.4349, + "step": 23145 + }, + { + "epoch": 0.5138677706129787, + "grad_norm": 1.3710284233093262, + "learning_rate": 9.564289941531987e-06, + "loss": 0.4344, + "step": 23150 + }, + { + "epoch": 0.5139787571725064, + "grad_norm": 1.8275679349899292, + "learning_rate": 9.560806495074162e-06, + "loss": 0.336, + "step": 23155 + }, + { + "epoch": 0.5140897437320341, + "grad_norm": 1.0730940103530884, + "learning_rate": 9.557323102012005e-06, + "loss": 0.5358, + "step": 23160 + }, + { + "epoch": 0.5142007302915617, + "grad_norm": 1.1426434516906738, + "learning_rate": 9.553839762769025e-06, + "loss": 0.3901, + "step": 23165 + }, + { + "epoch": 0.5143117168510893, + "grad_norm": 1.0768260955810547, + "learning_rate": 9.550356477768706e-06, + "loss": 0.3633, + "step": 23170 + }, + { + "epoch": 0.514422703410617, + "grad_norm": 1.2068915367126465, + "learning_rate": 9.546873247434542e-06, + "loss": 0.3501, + "step": 23175 + }, + { + "epoch": 0.5145336899701446, + "grad_norm": 1.3856827020645142, + "learning_rate": 9.543390072190006e-06, + "loss": 0.2788, + "step": 23180 + }, + { + "epoch": 0.5146446765296723, + "grad_norm": 1.6092084646224976, + "learning_rate": 9.539906952458572e-06, + "loss": 0.5391, + "step": 23185 + }, + { + "epoch": 0.5147556630891998, + "grad_norm": 0.6838003396987915, + "learning_rate": 9.536423888663712e-06, + "loss": 0.3466, + "step": 23190 + }, + { + "epoch": 0.5148666496487275, + "grad_norm": 2.4816436767578125, + "learning_rate": 9.532940881228878e-06, + "loss": 0.5389, + "step": 23195 + }, + { + "epoch": 0.5149776362082552, + "grad_norm": 1.3245611190795898, + "learning_rate": 9.52945793057753e-06, + "loss": 0.5037, + "step": 23200 + }, + { + "epoch": 0.5150886227677828, + "grad_norm": 1.5784523487091064, + "learning_rate": 9.525975037133101e-06, + "loss": 0.4671, + "step": 23205 + }, + { + "epoch": 0.5151996093273105, + "grad_norm": 1.3718194961547852, + "learning_rate": 9.522492201319042e-06, + "loss": 0.3642, + "step": 23210 + }, + { + "epoch": 0.5153105958868381, + "grad_norm": 0.9248862266540527, + "learning_rate": 9.51900942355878e-06, + "loss": 0.3951, + "step": 23215 + }, + { + "epoch": 0.5154215824463657, + "grad_norm": 1.1761683225631714, + "learning_rate": 9.515526704275739e-06, + "loss": 0.4326, + "step": 23220 + }, + { + "epoch": 0.5155325690058934, + "grad_norm": 1.9514886140823364, + "learning_rate": 9.512044043893336e-06, + "loss": 0.3304, + "step": 23225 + }, + { + "epoch": 0.5156435555654211, + "grad_norm": 1.2721954584121704, + "learning_rate": 9.508561442834982e-06, + "loss": 0.4436, + "step": 23230 + }, + { + "epoch": 0.5157545421249486, + "grad_norm": 1.279233455657959, + "learning_rate": 9.505078901524077e-06, + "loss": 0.4154, + "step": 23235 + }, + { + "epoch": 0.5158655286844763, + "grad_norm": 1.6182982921600342, + "learning_rate": 9.501596420384024e-06, + "loss": 0.4688, + "step": 23240 + }, + { + "epoch": 0.5159765152440039, + "grad_norm": 1.204156517982483, + "learning_rate": 9.498113999838203e-06, + "loss": 0.4464, + "step": 23245 + }, + { + "epoch": 0.5160875018035316, + "grad_norm": 1.3307151794433594, + "learning_rate": 9.494631640310001e-06, + "loss": 0.3903, + "step": 23250 + }, + { + "epoch": 0.5161984883630593, + "grad_norm": 1.3102271556854248, + "learning_rate": 9.491149342222787e-06, + "loss": 0.4727, + "step": 23255 + }, + { + "epoch": 0.5163094749225868, + "grad_norm": 0.9829373955726624, + "learning_rate": 9.48766710599993e-06, + "loss": 0.4298, + "step": 23260 + }, + { + "epoch": 0.5164204614821145, + "grad_norm": 0.9249544739723206, + "learning_rate": 9.48418493206479e-06, + "loss": 0.4166, + "step": 23265 + }, + { + "epoch": 0.5165314480416422, + "grad_norm": 1.1289701461791992, + "learning_rate": 9.480702820840713e-06, + "loss": 0.3981, + "step": 23270 + }, + { + "epoch": 0.5166424346011698, + "grad_norm": 1.086345911026001, + "learning_rate": 9.477220772751049e-06, + "loss": 0.3643, + "step": 23275 + }, + { + "epoch": 0.5167534211606974, + "grad_norm": 1.0040391683578491, + "learning_rate": 9.473738788219128e-06, + "loss": 0.5242, + "step": 23280 + }, + { + "epoch": 0.5168644077202251, + "grad_norm": 1.2268530130386353, + "learning_rate": 9.470256867668282e-06, + "loss": 0.5265, + "step": 23285 + }, + { + "epoch": 0.5169753942797527, + "grad_norm": 1.5280417203903198, + "learning_rate": 9.466775011521825e-06, + "loss": 0.5185, + "step": 23290 + }, + { + "epoch": 0.5170863808392804, + "grad_norm": 1.1195710897445679, + "learning_rate": 9.463293220203075e-06, + "loss": 0.2102, + "step": 23295 + }, + { + "epoch": 0.517197367398808, + "grad_norm": 1.3801215887069702, + "learning_rate": 9.459811494135339e-06, + "loss": 0.3567, + "step": 23300 + }, + { + "epoch": 0.5173083539583356, + "grad_norm": 1.0789028406143188, + "learning_rate": 9.456329833741907e-06, + "loss": 0.4993, + "step": 23305 + }, + { + "epoch": 0.5174193405178633, + "grad_norm": 1.283454179763794, + "learning_rate": 9.452848239446074e-06, + "loss": 0.688, + "step": 23310 + }, + { + "epoch": 0.5175303270773909, + "grad_norm": 0.9683428406715393, + "learning_rate": 9.449366711671112e-06, + "loss": 0.4205, + "step": 23315 + }, + { + "epoch": 0.5176413136369186, + "grad_norm": 1.541322112083435, + "learning_rate": 9.445885250840301e-06, + "loss": 0.344, + "step": 23320 + }, + { + "epoch": 0.5177523001964462, + "grad_norm": 1.2456687688827515, + "learning_rate": 9.442403857376903e-06, + "loss": 0.4871, + "step": 23325 + }, + { + "epoch": 0.5178632867559738, + "grad_norm": 1.0860142707824707, + "learning_rate": 9.438922531704174e-06, + "loss": 0.53, + "step": 23330 + }, + { + "epoch": 0.5179742733155015, + "grad_norm": 1.7298486232757568, + "learning_rate": 9.435441274245363e-06, + "loss": 0.4706, + "step": 23335 + }, + { + "epoch": 0.5180852598750292, + "grad_norm": 1.22835373878479, + "learning_rate": 9.431960085423707e-06, + "loss": 0.6449, + "step": 23340 + }, + { + "epoch": 0.5181962464345568, + "grad_norm": 1.4939440488815308, + "learning_rate": 9.428478965662437e-06, + "loss": 0.4136, + "step": 23345 + }, + { + "epoch": 0.5183072329940844, + "grad_norm": 0.8647384643554688, + "learning_rate": 9.424997915384781e-06, + "loss": 0.3932, + "step": 23350 + }, + { + "epoch": 0.518418219553612, + "grad_norm": 1.2243454456329346, + "learning_rate": 9.421516935013948e-06, + "loss": 0.4535, + "step": 23355 + }, + { + "epoch": 0.5185292061131397, + "grad_norm": 0.7740560173988342, + "learning_rate": 9.418036024973149e-06, + "loss": 0.362, + "step": 23360 + }, + { + "epoch": 0.5186401926726674, + "grad_norm": 0.9760211110115051, + "learning_rate": 9.414555185685573e-06, + "loss": 0.3387, + "step": 23365 + }, + { + "epoch": 0.5187511792321949, + "grad_norm": 0.9076090455055237, + "learning_rate": 9.41107441757442e-06, + "loss": 0.4295, + "step": 23370 + }, + { + "epoch": 0.5188621657917226, + "grad_norm": 1.1980648040771484, + "learning_rate": 9.407593721062858e-06, + "loss": 0.4772, + "step": 23375 + }, + { + "epoch": 0.5189731523512503, + "grad_norm": 1.1833443641662598, + "learning_rate": 9.404113096574066e-06, + "loss": 0.4988, + "step": 23380 + }, + { + "epoch": 0.5190841389107779, + "grad_norm": 1.2384533882141113, + "learning_rate": 9.40063254453121e-06, + "loss": 0.4169, + "step": 23385 + }, + { + "epoch": 0.5191951254703056, + "grad_norm": 1.1089009046554565, + "learning_rate": 9.397152065357434e-06, + "loss": 0.4497, + "step": 23390 + }, + { + "epoch": 0.5193061120298332, + "grad_norm": 1.2220206260681152, + "learning_rate": 9.393671659475894e-06, + "loss": 0.4402, + "step": 23395 + }, + { + "epoch": 0.5194170985893608, + "grad_norm": 1.0658073425292969, + "learning_rate": 9.390191327309713e-06, + "loss": 0.366, + "step": 23400 + }, + { + "epoch": 0.5195280851488885, + "grad_norm": 1.9952574968338013, + "learning_rate": 9.386711069282028e-06, + "loss": 0.4334, + "step": 23405 + }, + { + "epoch": 0.5196390717084161, + "grad_norm": 1.4178756475448608, + "learning_rate": 9.383230885815957e-06, + "loss": 0.4913, + "step": 23410 + }, + { + "epoch": 0.5197500582679437, + "grad_norm": 0.7791183590888977, + "learning_rate": 9.379750777334603e-06, + "loss": 0.4209, + "step": 23415 + }, + { + "epoch": 0.5198610448274714, + "grad_norm": 1.835775375366211, + "learning_rate": 9.376270744261073e-06, + "loss": 0.5712, + "step": 23420 + }, + { + "epoch": 0.519972031386999, + "grad_norm": 0.8781411051750183, + "learning_rate": 9.37279078701845e-06, + "loss": 0.5048, + "step": 23425 + }, + { + "epoch": 0.5200830179465267, + "grad_norm": 1.235450267791748, + "learning_rate": 9.369310906029823e-06, + "loss": 0.5186, + "step": 23430 + }, + { + "epoch": 0.5201940045060544, + "grad_norm": 1.0174843072891235, + "learning_rate": 9.36583110171826e-06, + "loss": 0.4388, + "step": 23435 + }, + { + "epoch": 0.5203049910655819, + "grad_norm": 0.8445138335227966, + "learning_rate": 9.362351374506823e-06, + "loss": 0.5178, + "step": 23440 + }, + { + "epoch": 0.5204159776251096, + "grad_norm": 1.0303021669387817, + "learning_rate": 9.358871724818568e-06, + "loss": 0.3982, + "step": 23445 + }, + { + "epoch": 0.5205269641846373, + "grad_norm": 1.1887991428375244, + "learning_rate": 9.355392153076541e-06, + "loss": 0.3977, + "step": 23450 + }, + { + "epoch": 0.5206379507441649, + "grad_norm": 1.1209839582443237, + "learning_rate": 9.351912659703773e-06, + "loss": 0.3891, + "step": 23455 + }, + { + "epoch": 0.5207489373036925, + "grad_norm": 1.235081672668457, + "learning_rate": 9.34843324512329e-06, + "loss": 0.3527, + "step": 23460 + }, + { + "epoch": 0.5208599238632201, + "grad_norm": 0.8877482414245605, + "learning_rate": 9.344953909758106e-06, + "loss": 0.3534, + "step": 23465 + }, + { + "epoch": 0.5209709104227478, + "grad_norm": 1.4008976221084595, + "learning_rate": 9.341474654031231e-06, + "loss": 0.5007, + "step": 23470 + }, + { + "epoch": 0.5210818969822755, + "grad_norm": 1.8766608238220215, + "learning_rate": 9.337995478365657e-06, + "loss": 0.5887, + "step": 23475 + }, + { + "epoch": 0.521192883541803, + "grad_norm": 0.7073838710784912, + "learning_rate": 9.334516383184377e-06, + "loss": 0.4969, + "step": 23480 + }, + { + "epoch": 0.5213038701013307, + "grad_norm": 1.209763526916504, + "learning_rate": 9.331037368910359e-06, + "loss": 0.5199, + "step": 23485 + }, + { + "epoch": 0.5214148566608584, + "grad_norm": 1.488824725151062, + "learning_rate": 9.327558435966573e-06, + "loss": 0.3552, + "step": 23490 + }, + { + "epoch": 0.521525843220386, + "grad_norm": 1.5087478160858154, + "learning_rate": 9.324079584775982e-06, + "loss": 0.4104, + "step": 23495 + }, + { + "epoch": 0.5216368297799137, + "grad_norm": 1.2469892501831055, + "learning_rate": 9.320600815761523e-06, + "loss": 0.4061, + "step": 23500 + }, + { + "epoch": 0.5217478163394413, + "grad_norm": 1.4992282390594482, + "learning_rate": 9.317122129346144e-06, + "loss": 0.3954, + "step": 23505 + }, + { + "epoch": 0.5218588028989689, + "grad_norm": 1.198625922203064, + "learning_rate": 9.313643525952762e-06, + "loss": 0.5065, + "step": 23510 + }, + { + "epoch": 0.5219697894584966, + "grad_norm": 1.102835774421692, + "learning_rate": 9.310165006004297e-06, + "loss": 0.6921, + "step": 23515 + }, + { + "epoch": 0.5220807760180242, + "grad_norm": 1.4361196756362915, + "learning_rate": 9.306686569923662e-06, + "loss": 0.4449, + "step": 23520 + }, + { + "epoch": 0.5221917625775518, + "grad_norm": 1.5560322999954224, + "learning_rate": 9.303208218133745e-06, + "loss": 0.4447, + "step": 23525 + }, + { + "epoch": 0.5223027491370795, + "grad_norm": 1.9868260622024536, + "learning_rate": 9.299729951057439e-06, + "loss": 0.4669, + "step": 23530 + }, + { + "epoch": 0.5224137356966071, + "grad_norm": 1.1894471645355225, + "learning_rate": 9.296251769117614e-06, + "loss": 0.3816, + "step": 23535 + }, + { + "epoch": 0.5225247222561348, + "grad_norm": 1.239475965499878, + "learning_rate": 9.292773672737143e-06, + "loss": 0.2769, + "step": 23540 + }, + { + "epoch": 0.5226357088156625, + "grad_norm": 1.0350260734558105, + "learning_rate": 9.28929566233887e-06, + "loss": 0.3005, + "step": 23545 + }, + { + "epoch": 0.52274669537519, + "grad_norm": 0.9946759343147278, + "learning_rate": 9.28581773834565e-06, + "loss": 0.5116, + "step": 23550 + }, + { + "epoch": 0.5228576819347177, + "grad_norm": 0.9383373856544495, + "learning_rate": 9.282339901180317e-06, + "loss": 0.3541, + "step": 23555 + }, + { + "epoch": 0.5229686684942454, + "grad_norm": 0.6263040900230408, + "learning_rate": 9.278862151265688e-06, + "loss": 0.4045, + "step": 23560 + }, + { + "epoch": 0.523079655053773, + "grad_norm": 0.7122305631637573, + "learning_rate": 9.275384489024586e-06, + "loss": 0.5649, + "step": 23565 + }, + { + "epoch": 0.5231906416133006, + "grad_norm": 0.9679470062255859, + "learning_rate": 9.271906914879802e-06, + "loss": 0.5078, + "step": 23570 + }, + { + "epoch": 0.5233016281728283, + "grad_norm": 1.3679300546646118, + "learning_rate": 9.268429429254134e-06, + "loss": 0.3559, + "step": 23575 + }, + { + "epoch": 0.5234126147323559, + "grad_norm": 1.5585976839065552, + "learning_rate": 9.264952032570364e-06, + "loss": 0.4081, + "step": 23580 + }, + { + "epoch": 0.5235236012918836, + "grad_norm": 1.3559560775756836, + "learning_rate": 9.261474725251261e-06, + "loss": 0.5403, + "step": 23585 + }, + { + "epoch": 0.5236345878514111, + "grad_norm": 1.2353577613830566, + "learning_rate": 9.257997507719585e-06, + "loss": 0.5122, + "step": 23590 + }, + { + "epoch": 0.5237455744109388, + "grad_norm": 1.4342765808105469, + "learning_rate": 9.254520380398083e-06, + "loss": 0.4143, + "step": 23595 + }, + { + "epoch": 0.5238565609704665, + "grad_norm": 1.2309331893920898, + "learning_rate": 9.25104334370949e-06, + "loss": 0.4069, + "step": 23600 + }, + { + "epoch": 0.5239675475299941, + "grad_norm": 1.0727909803390503, + "learning_rate": 9.24756639807654e-06, + "loss": 0.438, + "step": 23605 + }, + { + "epoch": 0.5240785340895218, + "grad_norm": 0.7829542756080627, + "learning_rate": 9.24408954392194e-06, + "loss": 0.4547, + "step": 23610 + }, + { + "epoch": 0.5241895206490494, + "grad_norm": 1.0878058671951294, + "learning_rate": 9.240612781668401e-06, + "loss": 0.3775, + "step": 23615 + }, + { + "epoch": 0.524300507208577, + "grad_norm": 0.8925349116325378, + "learning_rate": 9.23713611173861e-06, + "loss": 0.3224, + "step": 23620 + }, + { + "epoch": 0.5244114937681047, + "grad_norm": 1.7960782051086426, + "learning_rate": 9.233659534555257e-06, + "loss": 0.387, + "step": 23625 + }, + { + "epoch": 0.5245224803276324, + "grad_norm": 1.554244041442871, + "learning_rate": 9.230183050541001e-06, + "loss": 0.4478, + "step": 23630 + }, + { + "epoch": 0.52463346688716, + "grad_norm": 1.7812159061431885, + "learning_rate": 9.226706660118511e-06, + "loss": 0.4356, + "step": 23635 + }, + { + "epoch": 0.5247444534466876, + "grad_norm": 1.4758063554763794, + "learning_rate": 9.223230363710434e-06, + "loss": 0.388, + "step": 23640 + }, + { + "epoch": 0.5248554400062152, + "grad_norm": 1.4203014373779297, + "learning_rate": 9.2197541617394e-06, + "loss": 0.5469, + "step": 23645 + }, + { + "epoch": 0.5249664265657429, + "grad_norm": 1.188704252243042, + "learning_rate": 9.216278054628043e-06, + "loss": 0.3986, + "step": 23650 + }, + { + "epoch": 0.5250774131252706, + "grad_norm": 1.4540413618087769, + "learning_rate": 9.21280204279897e-06, + "loss": 0.2821, + "step": 23655 + }, + { + "epoch": 0.5251883996847981, + "grad_norm": 1.3421919345855713, + "learning_rate": 9.209326126674781e-06, + "loss": 0.5352, + "step": 23660 + }, + { + "epoch": 0.5252993862443258, + "grad_norm": 0.8097754716873169, + "learning_rate": 9.205850306678076e-06, + "loss": 0.5054, + "step": 23665 + }, + { + "epoch": 0.5254103728038535, + "grad_norm": 1.0205293893814087, + "learning_rate": 9.202374583231423e-06, + "loss": 0.4522, + "step": 23670 + }, + { + "epoch": 0.5255213593633811, + "grad_norm": 1.4187626838684082, + "learning_rate": 9.198898956757396e-06, + "loss": 0.4245, + "step": 23675 + }, + { + "epoch": 0.5256323459229088, + "grad_norm": 1.542188048362732, + "learning_rate": 9.195423427678544e-06, + "loss": 0.4067, + "step": 23680 + }, + { + "epoch": 0.5257433324824364, + "grad_norm": 1.1413275003433228, + "learning_rate": 9.191947996417415e-06, + "loss": 0.3822, + "step": 23685 + }, + { + "epoch": 0.525854319041964, + "grad_norm": 1.4354232549667358, + "learning_rate": 9.18847266339654e-06, + "loss": 0.3561, + "step": 23690 + }, + { + "epoch": 0.5259653056014917, + "grad_norm": 1.5432720184326172, + "learning_rate": 9.184997429038435e-06, + "loss": 0.4168, + "step": 23695 + }, + { + "epoch": 0.5260762921610193, + "grad_norm": 1.0400127172470093, + "learning_rate": 9.18152229376561e-06, + "loss": 0.2103, + "step": 23700 + }, + { + "epoch": 0.5261872787205469, + "grad_norm": 1.6075176000595093, + "learning_rate": 9.17804725800056e-06, + "loss": 0.4054, + "step": 23705 + }, + { + "epoch": 0.5262982652800746, + "grad_norm": 1.1011056900024414, + "learning_rate": 9.174572322165766e-06, + "loss": 0.3888, + "step": 23710 + }, + { + "epoch": 0.5264092518396022, + "grad_norm": 1.4790599346160889, + "learning_rate": 9.171097486683703e-06, + "loss": 0.2997, + "step": 23715 + }, + { + "epoch": 0.5265202383991299, + "grad_norm": 0.9740801453590393, + "learning_rate": 9.167622751976825e-06, + "loss": 0.4052, + "step": 23720 + }, + { + "epoch": 0.5266312249586576, + "grad_norm": 1.1656138896942139, + "learning_rate": 9.164148118467585e-06, + "loss": 0.3117, + "step": 23725 + }, + { + "epoch": 0.5267422115181851, + "grad_norm": 1.1819837093353271, + "learning_rate": 9.16067358657841e-06, + "loss": 0.4508, + "step": 23730 + }, + { + "epoch": 0.5268531980777128, + "grad_norm": 1.0367882251739502, + "learning_rate": 9.157199156731729e-06, + "loss": 0.4348, + "step": 23735 + }, + { + "epoch": 0.5269641846372405, + "grad_norm": 1.3061459064483643, + "learning_rate": 9.153724829349944e-06, + "loss": 0.5079, + "step": 23740 + }, + { + "epoch": 0.527075171196768, + "grad_norm": 1.1266971826553345, + "learning_rate": 9.150250604855454e-06, + "loss": 0.4737, + "step": 23745 + }, + { + "epoch": 0.5271861577562957, + "grad_norm": 1.4855886697769165, + "learning_rate": 9.146776483670654e-06, + "loss": 0.4979, + "step": 23750 + }, + { + "epoch": 0.5272971443158233, + "grad_norm": 1.0671223402023315, + "learning_rate": 9.1433024662179e-06, + "loss": 0.4063, + "step": 23755 + }, + { + "epoch": 0.527408130875351, + "grad_norm": 1.0044794082641602, + "learning_rate": 9.139828552919566e-06, + "loss": 0.3773, + "step": 23760 + }, + { + "epoch": 0.5275191174348787, + "grad_norm": 1.251092791557312, + "learning_rate": 9.136354744197984e-06, + "loss": 0.4365, + "step": 23765 + }, + { + "epoch": 0.5276301039944062, + "grad_norm": 1.231126308441162, + "learning_rate": 9.132881040475498e-06, + "loss": 0.2866, + "step": 23770 + }, + { + "epoch": 0.5277410905539339, + "grad_norm": 1.3552109003067017, + "learning_rate": 9.129407442174431e-06, + "loss": 0.3533, + "step": 23775 + }, + { + "epoch": 0.5278520771134616, + "grad_norm": 1.3020673990249634, + "learning_rate": 9.125933949717083e-06, + "loss": 0.4461, + "step": 23780 + }, + { + "epoch": 0.5279630636729892, + "grad_norm": 1.9571168422698975, + "learning_rate": 9.12246056352576e-06, + "loss": 0.4935, + "step": 23785 + }, + { + "epoch": 0.5280740502325169, + "grad_norm": 1.401061773300171, + "learning_rate": 9.118987284022732e-06, + "loss": 0.2573, + "step": 23790 + }, + { + "epoch": 0.5281850367920445, + "grad_norm": 1.519338846206665, + "learning_rate": 9.115514111630282e-06, + "loss": 0.3944, + "step": 23795 + }, + { + "epoch": 0.5282960233515721, + "grad_norm": 1.5136542320251465, + "learning_rate": 9.112041046770653e-06, + "loss": 0.3774, + "step": 23800 + }, + { + "epoch": 0.5284070099110998, + "grad_norm": 1.6715372800827026, + "learning_rate": 9.108568089866096e-06, + "loss": 0.3998, + "step": 23805 + }, + { + "epoch": 0.5285179964706274, + "grad_norm": 1.1824544668197632, + "learning_rate": 9.105095241338846e-06, + "loss": 0.4635, + "step": 23810 + }, + { + "epoch": 0.528628983030155, + "grad_norm": 1.4660857915878296, + "learning_rate": 9.101622501611112e-06, + "loss": 0.4204, + "step": 23815 + }, + { + "epoch": 0.5287399695896827, + "grad_norm": 2.133476734161377, + "learning_rate": 9.098149871105104e-06, + "loss": 0.2596, + "step": 23820 + }, + { + "epoch": 0.5288509561492103, + "grad_norm": 1.5634102821350098, + "learning_rate": 9.09467735024301e-06, + "loss": 0.4877, + "step": 23825 + }, + { + "epoch": 0.528961942708738, + "grad_norm": 1.0197075605392456, + "learning_rate": 9.091204939447003e-06, + "loss": 0.5129, + "step": 23830 + }, + { + "epoch": 0.5290729292682657, + "grad_norm": 0.8712895512580872, + "learning_rate": 9.087732639139256e-06, + "loss": 0.4002, + "step": 23835 + }, + { + "epoch": 0.5291839158277932, + "grad_norm": 1.3366756439208984, + "learning_rate": 9.084260449741912e-06, + "loss": 0.4197, + "step": 23840 + }, + { + "epoch": 0.5292949023873209, + "grad_norm": 1.2106136083602905, + "learning_rate": 9.080788371677115e-06, + "loss": 0.3646, + "step": 23845 + }, + { + "epoch": 0.5294058889468486, + "grad_norm": 1.1375362873077393, + "learning_rate": 9.07731640536698e-06, + "loss": 0.5453, + "step": 23850 + }, + { + "epoch": 0.5295168755063762, + "grad_norm": 1.8555535078048706, + "learning_rate": 9.073844551233624e-06, + "loss": 0.3916, + "step": 23855 + }, + { + "epoch": 0.5296278620659038, + "grad_norm": 0.942084550857544, + "learning_rate": 9.070372809699146e-06, + "loss": 0.4782, + "step": 23860 + }, + { + "epoch": 0.5297388486254314, + "grad_norm": 0.90847247838974, + "learning_rate": 9.066901181185618e-06, + "loss": 0.3912, + "step": 23865 + }, + { + "epoch": 0.5298498351849591, + "grad_norm": 1.3108025789260864, + "learning_rate": 9.06342966611512e-06, + "loss": 0.4641, + "step": 23870 + }, + { + "epoch": 0.5299608217444868, + "grad_norm": 1.1245001554489136, + "learning_rate": 9.059958264909698e-06, + "loss": 0.3802, + "step": 23875 + }, + { + "epoch": 0.5300718083040143, + "grad_norm": 1.2725918292999268, + "learning_rate": 9.0564869779914e-06, + "loss": 0.4285, + "step": 23880 + }, + { + "epoch": 0.530182794863542, + "grad_norm": 1.2415249347686768, + "learning_rate": 9.053015805782249e-06, + "loss": 0.385, + "step": 23885 + }, + { + "epoch": 0.5302937814230697, + "grad_norm": 1.4148523807525635, + "learning_rate": 9.04954474870426e-06, + "loss": 0.4127, + "step": 23890 + }, + { + "epoch": 0.5304047679825973, + "grad_norm": 0.6649506092071533, + "learning_rate": 9.046073807179436e-06, + "loss": 0.3126, + "step": 23895 + }, + { + "epoch": 0.530515754542125, + "grad_norm": 1.101078748703003, + "learning_rate": 9.042602981629756e-06, + "loss": 0.2615, + "step": 23900 + }, + { + "epoch": 0.5306267411016526, + "grad_norm": 1.0729953050613403, + "learning_rate": 9.0391322724772e-06, + "loss": 0.4776, + "step": 23905 + }, + { + "epoch": 0.5307377276611802, + "grad_norm": 1.1954920291900635, + "learning_rate": 9.035661680143713e-06, + "loss": 0.3606, + "step": 23910 + }, + { + "epoch": 0.5308487142207079, + "grad_norm": 1.2097233533859253, + "learning_rate": 9.032191205051245e-06, + "loss": 0.52, + "step": 23915 + }, + { + "epoch": 0.5309597007802355, + "grad_norm": 1.2129782438278198, + "learning_rate": 9.028720847621732e-06, + "loss": 0.5183, + "step": 23920 + }, + { + "epoch": 0.5310706873397631, + "grad_norm": 0.9085155725479126, + "learning_rate": 9.025250608277075e-06, + "loss": 0.4627, + "step": 23925 + }, + { + "epoch": 0.5311816738992908, + "grad_norm": 0.8985617756843567, + "learning_rate": 9.021780487439183e-06, + "loss": 0.4049, + "step": 23930 + }, + { + "epoch": 0.5312926604588184, + "grad_norm": 1.4433565139770508, + "learning_rate": 9.018310485529935e-06, + "loss": 0.5358, + "step": 23935 + }, + { + "epoch": 0.5314036470183461, + "grad_norm": 0.9419113397598267, + "learning_rate": 9.014840602971208e-06, + "loss": 0.3272, + "step": 23940 + }, + { + "epoch": 0.5315146335778738, + "grad_norm": 1.5108088254928589, + "learning_rate": 9.011370840184855e-06, + "loss": 0.4735, + "step": 23945 + }, + { + "epoch": 0.5316256201374013, + "grad_norm": 1.1959902048110962, + "learning_rate": 9.007901197592722e-06, + "loss": 0.4795, + "step": 23950 + }, + { + "epoch": 0.531736606696929, + "grad_norm": 1.501643419265747, + "learning_rate": 9.004431675616634e-06, + "loss": 0.4664, + "step": 23955 + }, + { + "epoch": 0.5318475932564567, + "grad_norm": 0.9131843447685242, + "learning_rate": 9.000962274678402e-06, + "loss": 0.5461, + "step": 23960 + }, + { + "epoch": 0.5319585798159843, + "grad_norm": 1.1181492805480957, + "learning_rate": 8.997492995199828e-06, + "loss": 0.4563, + "step": 23965 + }, + { + "epoch": 0.532069566375512, + "grad_norm": 1.3199212551116943, + "learning_rate": 8.994023837602694e-06, + "loss": 0.3744, + "step": 23970 + }, + { + "epoch": 0.5321805529350395, + "grad_norm": 1.2438446283340454, + "learning_rate": 8.990554802308765e-06, + "loss": 0.5131, + "step": 23975 + }, + { + "epoch": 0.5322915394945672, + "grad_norm": 1.3528581857681274, + "learning_rate": 8.987085889739801e-06, + "loss": 0.3848, + "step": 23980 + }, + { + "epoch": 0.5324025260540949, + "grad_norm": 0.8180687427520752, + "learning_rate": 8.983617100317534e-06, + "loss": 0.3982, + "step": 23985 + }, + { + "epoch": 0.5325135126136225, + "grad_norm": 0.9565178751945496, + "learning_rate": 8.980148434463695e-06, + "loss": 0.4377, + "step": 23990 + }, + { + "epoch": 0.5326244991731501, + "grad_norm": 1.3885096311569214, + "learning_rate": 8.976679892599987e-06, + "loss": 0.3927, + "step": 23995 + }, + { + "epoch": 0.5327354857326778, + "grad_norm": 1.039081335067749, + "learning_rate": 8.973211475148103e-06, + "loss": 0.4103, + "step": 24000 + }, + { + "epoch": 0.5328464722922054, + "grad_norm": 1.3395166397094727, + "learning_rate": 8.96974318252973e-06, + "loss": 0.3978, + "step": 24005 + }, + { + "epoch": 0.5329574588517331, + "grad_norm": 1.1190773248672485, + "learning_rate": 8.96627501516652e-06, + "loss": 0.42, + "step": 24010 + }, + { + "epoch": 0.5330684454112607, + "grad_norm": 1.1331000328063965, + "learning_rate": 8.962806973480133e-06, + "loss": 0.477, + "step": 24015 + }, + { + "epoch": 0.5331794319707883, + "grad_norm": 1.137101650238037, + "learning_rate": 8.95933905789219e-06, + "loss": 0.4234, + "step": 24020 + }, + { + "epoch": 0.533290418530316, + "grad_norm": 0.9587161540985107, + "learning_rate": 8.955871268824316e-06, + "loss": 0.3629, + "step": 24025 + }, + { + "epoch": 0.5334014050898436, + "grad_norm": 1.3958914279937744, + "learning_rate": 8.952403606698113e-06, + "loss": 0.3726, + "step": 24030 + }, + { + "epoch": 0.5335123916493713, + "grad_norm": 0.9416199326515198, + "learning_rate": 8.948936071935164e-06, + "loss": 0.4429, + "step": 24035 + }, + { + "epoch": 0.5336233782088989, + "grad_norm": 1.9556666612625122, + "learning_rate": 8.945468664957046e-06, + "loss": 0.4545, + "step": 24040 + }, + { + "epoch": 0.5337343647684265, + "grad_norm": 1.3999742269515991, + "learning_rate": 8.942001386185307e-06, + "loss": 0.5151, + "step": 24045 + }, + { + "epoch": 0.5338453513279542, + "grad_norm": 1.2332944869995117, + "learning_rate": 8.938534236041495e-06, + "loss": 0.3034, + "step": 24050 + }, + { + "epoch": 0.5339563378874819, + "grad_norm": 2.9347760677337646, + "learning_rate": 8.935067214947126e-06, + "loss": 0.3782, + "step": 24055 + }, + { + "epoch": 0.5340673244470094, + "grad_norm": 1.1664717197418213, + "learning_rate": 8.931600323323717e-06, + "loss": 0.4993, + "step": 24060 + }, + { + "epoch": 0.5341783110065371, + "grad_norm": 1.102626919746399, + "learning_rate": 8.928133561592756e-06, + "loss": 0.4186, + "step": 24065 + }, + { + "epoch": 0.5342892975660648, + "grad_norm": 1.6264910697937012, + "learning_rate": 8.924666930175722e-06, + "loss": 0.332, + "step": 24070 + }, + { + "epoch": 0.5344002841255924, + "grad_norm": 1.000712513923645, + "learning_rate": 8.921200429494079e-06, + "loss": 0.488, + "step": 24075 + }, + { + "epoch": 0.53451127068512, + "grad_norm": 0.8100029826164246, + "learning_rate": 8.917734059969266e-06, + "loss": 0.3343, + "step": 24080 + }, + { + "epoch": 0.5346222572446476, + "grad_norm": 0.926163911819458, + "learning_rate": 8.914267822022716e-06, + "loss": 0.2985, + "step": 24085 + }, + { + "epoch": 0.5347332438041753, + "grad_norm": 1.284947395324707, + "learning_rate": 8.910801716075847e-06, + "loss": 0.3884, + "step": 24090 + }, + { + "epoch": 0.534844230363703, + "grad_norm": 0.9958680272102356, + "learning_rate": 8.90733574255005e-06, + "loss": 0.5513, + "step": 24095 + }, + { + "epoch": 0.5349552169232306, + "grad_norm": 1.1211278438568115, + "learning_rate": 8.90386990186671e-06, + "loss": 0.3841, + "step": 24100 + }, + { + "epoch": 0.5350662034827582, + "grad_norm": 1.7093493938446045, + "learning_rate": 8.900404194447189e-06, + "loss": 0.5569, + "step": 24105 + }, + { + "epoch": 0.5351771900422859, + "grad_norm": 0.7877238988876343, + "learning_rate": 8.896938620712837e-06, + "loss": 0.3767, + "step": 24110 + }, + { + "epoch": 0.5352881766018135, + "grad_norm": 0.9894497394561768, + "learning_rate": 8.893473181084993e-06, + "loss": 0.5499, + "step": 24115 + }, + { + "epoch": 0.5353991631613412, + "grad_norm": 1.3464109897613525, + "learning_rate": 8.890007875984966e-06, + "loss": 0.2911, + "step": 24120 + }, + { + "epoch": 0.5355101497208689, + "grad_norm": 0.7942631244659424, + "learning_rate": 8.88654270583406e-06, + "loss": 0.5159, + "step": 24125 + }, + { + "epoch": 0.5356211362803964, + "grad_norm": 1.2688034772872925, + "learning_rate": 8.883077671053557e-06, + "loss": 0.4691, + "step": 24130 + }, + { + "epoch": 0.5357321228399241, + "grad_norm": 1.5835150480270386, + "learning_rate": 8.879612772064726e-06, + "loss": 0.364, + "step": 24135 + }, + { + "epoch": 0.5358431093994517, + "grad_norm": 1.22607421875, + "learning_rate": 8.876148009288813e-06, + "loss": 0.47, + "step": 24140 + }, + { + "epoch": 0.5359540959589794, + "grad_norm": 1.0492281913757324, + "learning_rate": 8.872683383147055e-06, + "loss": 0.3973, + "step": 24145 + }, + { + "epoch": 0.536065082518507, + "grad_norm": 1.0997753143310547, + "learning_rate": 8.869218894060678e-06, + "loss": 0.3275, + "step": 24150 + }, + { + "epoch": 0.5361760690780346, + "grad_norm": 1.5366569757461548, + "learning_rate": 8.865754542450868e-06, + "loss": 0.3678, + "step": 24155 + }, + { + "epoch": 0.5362870556375623, + "grad_norm": 1.67007315158844, + "learning_rate": 8.862290328738822e-06, + "loss": 0.3455, + "step": 24160 + }, + { + "epoch": 0.53639804219709, + "grad_norm": 1.1318846940994263, + "learning_rate": 8.8588262533457e-06, + "loss": 0.4769, + "step": 24165 + }, + { + "epoch": 0.5365090287566175, + "grad_norm": 0.8410813808441162, + "learning_rate": 8.855362316692654e-06, + "loss": 0.36, + "step": 24170 + }, + { + "epoch": 0.5366200153161452, + "grad_norm": 1.8174649477005005, + "learning_rate": 8.85189851920082e-06, + "loss": 0.395, + "step": 24175 + }, + { + "epoch": 0.5367310018756729, + "grad_norm": 0.8335534930229187, + "learning_rate": 8.848434861291313e-06, + "loss": 0.4751, + "step": 24180 + }, + { + "epoch": 0.5368419884352005, + "grad_norm": 1.2102363109588623, + "learning_rate": 8.844971343385237e-06, + "loss": 0.3887, + "step": 24185 + }, + { + "epoch": 0.5369529749947282, + "grad_norm": 1.066596269607544, + "learning_rate": 8.841507965903666e-06, + "loss": 0.3254, + "step": 24190 + }, + { + "epoch": 0.5370639615542557, + "grad_norm": 0.9416897892951965, + "learning_rate": 8.838044729267674e-06, + "loss": 0.4878, + "step": 24195 + }, + { + "epoch": 0.5371749481137834, + "grad_norm": 0.8740838170051575, + "learning_rate": 8.834581633898307e-06, + "loss": 0.2981, + "step": 24200 + }, + { + "epoch": 0.5372859346733111, + "grad_norm": 1.625891089439392, + "learning_rate": 8.831118680216597e-06, + "loss": 0.5089, + "step": 24205 + }, + { + "epoch": 0.5373969212328387, + "grad_norm": 0.9911015033721924, + "learning_rate": 8.827655868643557e-06, + "loss": 0.5279, + "step": 24210 + }, + { + "epoch": 0.5375079077923663, + "grad_norm": 1.2692562341690063, + "learning_rate": 8.824193199600184e-06, + "loss": 0.4197, + "step": 24215 + }, + { + "epoch": 0.537618894351894, + "grad_norm": 1.1671028137207031, + "learning_rate": 8.820730673507459e-06, + "loss": 0.4664, + "step": 24220 + }, + { + "epoch": 0.5377298809114216, + "grad_norm": 1.1915336847305298, + "learning_rate": 8.817268290786343e-06, + "loss": 0.3918, + "step": 24225 + }, + { + "epoch": 0.5378408674709493, + "grad_norm": 1.9181326627731323, + "learning_rate": 8.813806051857781e-06, + "loss": 0.4549, + "step": 24230 + }, + { + "epoch": 0.537951854030477, + "grad_norm": 1.3655357360839844, + "learning_rate": 8.810343957142706e-06, + "loss": 0.4827, + "step": 24235 + }, + { + "epoch": 0.5380628405900045, + "grad_norm": 0.8995305299758911, + "learning_rate": 8.806882007062016e-06, + "loss": 0.4682, + "step": 24240 + }, + { + "epoch": 0.5381738271495322, + "grad_norm": 1.4401637315750122, + "learning_rate": 8.803420202036617e-06, + "loss": 0.4811, + "step": 24245 + }, + { + "epoch": 0.5382848137090598, + "grad_norm": 1.2994670867919922, + "learning_rate": 8.799958542487371e-06, + "loss": 0.4077, + "step": 24250 + }, + { + "epoch": 0.5383958002685875, + "grad_norm": 1.2455394268035889, + "learning_rate": 8.79649702883514e-06, + "loss": 0.3975, + "step": 24255 + }, + { + "epoch": 0.5385067868281151, + "grad_norm": 1.3992613554000854, + "learning_rate": 8.793035661500771e-06, + "loss": 0.3986, + "step": 24260 + }, + { + "epoch": 0.5386177733876427, + "grad_norm": 1.7506078481674194, + "learning_rate": 8.789574440905073e-06, + "loss": 0.441, + "step": 24265 + }, + { + "epoch": 0.5387287599471704, + "grad_norm": 0.9146484732627869, + "learning_rate": 8.786113367468859e-06, + "loss": 0.3456, + "step": 24270 + }, + { + "epoch": 0.5388397465066981, + "grad_norm": 1.160560965538025, + "learning_rate": 8.782652441612906e-06, + "loss": 0.3486, + "step": 24275 + }, + { + "epoch": 0.5389507330662257, + "grad_norm": 1.0188573598861694, + "learning_rate": 8.779191663757987e-06, + "loss": 0.3588, + "step": 24280 + }, + { + "epoch": 0.5390617196257533, + "grad_norm": 0.8397150635719299, + "learning_rate": 8.775731034324854e-06, + "loss": 0.5183, + "step": 24285 + }, + { + "epoch": 0.539172706185281, + "grad_norm": 1.5384353399276733, + "learning_rate": 8.772270553734234e-06, + "loss": 0.3716, + "step": 24290 + }, + { + "epoch": 0.5392836927448086, + "grad_norm": 0.579929769039154, + "learning_rate": 8.768810222406846e-06, + "loss": 0.4645, + "step": 24295 + }, + { + "epoch": 0.5393946793043363, + "grad_norm": 1.4148974418640137, + "learning_rate": 8.765350040763377e-06, + "loss": 0.471, + "step": 24300 + }, + { + "epoch": 0.5395056658638638, + "grad_norm": 0.8738304972648621, + "learning_rate": 8.761890009224515e-06, + "loss": 0.4906, + "step": 24305 + }, + { + "epoch": 0.5396166524233915, + "grad_norm": 1.0829598903656006, + "learning_rate": 8.758430128210908e-06, + "loss": 0.3976, + "step": 24310 + }, + { + "epoch": 0.5397276389829192, + "grad_norm": 1.7203539609909058, + "learning_rate": 8.754970398143205e-06, + "loss": 0.5641, + "step": 24315 + }, + { + "epoch": 0.5398386255424468, + "grad_norm": 1.3489069938659668, + "learning_rate": 8.751510819442025e-06, + "loss": 0.4008, + "step": 24320 + }, + { + "epoch": 0.5399496121019745, + "grad_norm": 1.8932770490646362, + "learning_rate": 8.748051392527973e-06, + "loss": 0.6124, + "step": 24325 + }, + { + "epoch": 0.5400605986615021, + "grad_norm": 1.372402310371399, + "learning_rate": 8.744592117821633e-06, + "loss": 0.4919, + "step": 24330 + }, + { + "epoch": 0.5401715852210297, + "grad_norm": 0.9137066602706909, + "learning_rate": 8.741132995743573e-06, + "loss": 0.3805, + "step": 24335 + }, + { + "epoch": 0.5402825717805574, + "grad_norm": 0.8052771091461182, + "learning_rate": 8.737674026714342e-06, + "loss": 0.4316, + "step": 24340 + }, + { + "epoch": 0.5403935583400851, + "grad_norm": 1.2825324535369873, + "learning_rate": 8.734215211154471e-06, + "loss": 0.2739, + "step": 24345 + }, + { + "epoch": 0.5405045448996126, + "grad_norm": 1.194736361503601, + "learning_rate": 8.730756549484465e-06, + "loss": 0.3092, + "step": 24350 + }, + { + "epoch": 0.5406155314591403, + "grad_norm": 1.0905753374099731, + "learning_rate": 8.727298042124827e-06, + "loss": 0.4223, + "step": 24355 + }, + { + "epoch": 0.5407265180186679, + "grad_norm": 1.0970087051391602, + "learning_rate": 8.723839689496018e-06, + "loss": 0.5198, + "step": 24360 + }, + { + "epoch": 0.5408375045781956, + "grad_norm": 1.4189403057098389, + "learning_rate": 8.720381492018501e-06, + "loss": 0.4348, + "step": 24365 + }, + { + "epoch": 0.5409484911377233, + "grad_norm": 1.2112865447998047, + "learning_rate": 8.716923450112716e-06, + "loss": 0.4805, + "step": 24370 + }, + { + "epoch": 0.5410594776972508, + "grad_norm": 0.8020737767219543, + "learning_rate": 8.713465564199071e-06, + "loss": 0.504, + "step": 24375 + }, + { + "epoch": 0.5411704642567785, + "grad_norm": 1.3366249799728394, + "learning_rate": 8.71000783469797e-06, + "loss": 0.3047, + "step": 24380 + }, + { + "epoch": 0.5412814508163062, + "grad_norm": 1.6268224716186523, + "learning_rate": 8.706550262029787e-06, + "loss": 0.5713, + "step": 24385 + }, + { + "epoch": 0.5413924373758338, + "grad_norm": 1.1691818237304688, + "learning_rate": 8.703092846614889e-06, + "loss": 0.4409, + "step": 24390 + }, + { + "epoch": 0.5415034239353614, + "grad_norm": 1.7712862491607666, + "learning_rate": 8.69963558887361e-06, + "loss": 0.4349, + "step": 24395 + }, + { + "epoch": 0.5416144104948891, + "grad_norm": 1.100163221359253, + "learning_rate": 8.696178489226274e-06, + "loss": 0.6371, + "step": 24400 + }, + { + "epoch": 0.5417253970544167, + "grad_norm": 1.5967742204666138, + "learning_rate": 8.692721548093188e-06, + "loss": 0.3944, + "step": 24405 + }, + { + "epoch": 0.5418363836139444, + "grad_norm": 1.168125867843628, + "learning_rate": 8.68926476589463e-06, + "loss": 0.5811, + "step": 24410 + }, + { + "epoch": 0.5419473701734719, + "grad_norm": 1.3047908544540405, + "learning_rate": 8.685808143050869e-06, + "loss": 0.4678, + "step": 24415 + }, + { + "epoch": 0.5420583567329996, + "grad_norm": 1.6398917436599731, + "learning_rate": 8.682351679982142e-06, + "loss": 0.4024, + "step": 24420 + }, + { + "epoch": 0.5421693432925273, + "grad_norm": 1.2872521877288818, + "learning_rate": 8.67889537710868e-06, + "loss": 0.4321, + "step": 24425 + }, + { + "epoch": 0.5422803298520549, + "grad_norm": 1.3213415145874023, + "learning_rate": 8.67543923485069e-06, + "loss": 0.5972, + "step": 24430 + }, + { + "epoch": 0.5423913164115826, + "grad_norm": 1.914083480834961, + "learning_rate": 8.671983253628354e-06, + "loss": 0.5109, + "step": 24435 + }, + { + "epoch": 0.5425023029711102, + "grad_norm": 1.3763399124145508, + "learning_rate": 8.66852743386184e-06, + "loss": 0.3901, + "step": 24440 + }, + { + "epoch": 0.5426132895306378, + "grad_norm": 1.1899441480636597, + "learning_rate": 8.665071775971296e-06, + "loss": 0.4049, + "step": 24445 + }, + { + "epoch": 0.5427242760901655, + "grad_norm": 1.1333017349243164, + "learning_rate": 8.661616280376846e-06, + "loss": 0.555, + "step": 24450 + }, + { + "epoch": 0.5428352626496932, + "grad_norm": 1.456155776977539, + "learning_rate": 8.658160947498604e-06, + "loss": 0.3794, + "step": 24455 + }, + { + "epoch": 0.5429462492092207, + "grad_norm": 1.4425901174545288, + "learning_rate": 8.654705777756649e-06, + "loss": 0.5578, + "step": 24460 + }, + { + "epoch": 0.5430572357687484, + "grad_norm": 1.1383377313613892, + "learning_rate": 8.65125077157106e-06, + "loss": 0.456, + "step": 24465 + }, + { + "epoch": 0.543168222328276, + "grad_norm": 1.1584062576293945, + "learning_rate": 8.647795929361873e-06, + "loss": 0.3621, + "step": 24470 + }, + { + "epoch": 0.5432792088878037, + "grad_norm": 1.4009616374969482, + "learning_rate": 8.644341251549126e-06, + "loss": 0.4397, + "step": 24475 + }, + { + "epoch": 0.5433901954473314, + "grad_norm": 1.229703426361084, + "learning_rate": 8.64088673855282e-06, + "loss": 0.6272, + "step": 24480 + }, + { + "epoch": 0.5435011820068589, + "grad_norm": 1.0358545780181885, + "learning_rate": 8.637432390792945e-06, + "loss": 0.4596, + "step": 24485 + }, + { + "epoch": 0.5436121685663866, + "grad_norm": 0.9032576084136963, + "learning_rate": 8.633978208689476e-06, + "loss": 0.4377, + "step": 24490 + }, + { + "epoch": 0.5437231551259143, + "grad_norm": 1.1318433284759521, + "learning_rate": 8.630524192662349e-06, + "loss": 0.3985, + "step": 24495 + }, + { + "epoch": 0.5438341416854419, + "grad_norm": 1.161857008934021, + "learning_rate": 8.627070343131502e-06, + "loss": 0.4137, + "step": 24500 + }, + { + "epoch": 0.5439451282449695, + "grad_norm": 1.4461456537246704, + "learning_rate": 8.623616660516836e-06, + "loss": 0.3999, + "step": 24505 + }, + { + "epoch": 0.5440561148044972, + "grad_norm": 1.6744439601898193, + "learning_rate": 8.620163145238239e-06, + "loss": 0.5026, + "step": 24510 + }, + { + "epoch": 0.5441671013640248, + "grad_norm": 1.600347638130188, + "learning_rate": 8.616709797715585e-06, + "loss": 0.5446, + "step": 24515 + }, + { + "epoch": 0.5442780879235525, + "grad_norm": 1.0115714073181152, + "learning_rate": 8.613256618368708e-06, + "loss": 0.3249, + "step": 24520 + }, + { + "epoch": 0.54438907448308, + "grad_norm": 1.2178457975387573, + "learning_rate": 8.609803607617447e-06, + "loss": 0.5456, + "step": 24525 + }, + { + "epoch": 0.5445000610426077, + "grad_norm": 1.0085428953170776, + "learning_rate": 8.606350765881595e-06, + "loss": 0.5988, + "step": 24530 + }, + { + "epoch": 0.5446110476021354, + "grad_norm": 1.0458776950836182, + "learning_rate": 8.602898093580943e-06, + "loss": 0.4891, + "step": 24535 + }, + { + "epoch": 0.544722034161663, + "grad_norm": 1.2219630479812622, + "learning_rate": 8.599445591135262e-06, + "loss": 0.4179, + "step": 24540 + }, + { + "epoch": 0.5448330207211907, + "grad_norm": 1.1068999767303467, + "learning_rate": 8.595993258964282e-06, + "loss": 0.3851, + "step": 24545 + }, + { + "epoch": 0.5449440072807183, + "grad_norm": 1.4395174980163574, + "learning_rate": 8.592541097487737e-06, + "loss": 0.3795, + "step": 24550 + }, + { + "epoch": 0.5450549938402459, + "grad_norm": 0.8915678858757019, + "learning_rate": 8.589089107125323e-06, + "loss": 0.4981, + "step": 24555 + }, + { + "epoch": 0.5451659803997736, + "grad_norm": 1.1956037282943726, + "learning_rate": 8.585637288296724e-06, + "loss": 0.549, + "step": 24560 + }, + { + "epoch": 0.5452769669593013, + "grad_norm": 1.6773871183395386, + "learning_rate": 8.582185641421601e-06, + "loss": 0.5066, + "step": 24565 + }, + { + "epoch": 0.5453879535188288, + "grad_norm": 1.8565099239349365, + "learning_rate": 8.578734166919592e-06, + "loss": 0.438, + "step": 24570 + }, + { + "epoch": 0.5454989400783565, + "grad_norm": 1.4970033168792725, + "learning_rate": 8.575282865210316e-06, + "loss": 0.4107, + "step": 24575 + }, + { + "epoch": 0.5456099266378841, + "grad_norm": 0.9346946477890015, + "learning_rate": 8.571831736713373e-06, + "loss": 0.3067, + "step": 24580 + }, + { + "epoch": 0.5457209131974118, + "grad_norm": 1.2216112613677979, + "learning_rate": 8.568380781848337e-06, + "loss": 0.3981, + "step": 24585 + }, + { + "epoch": 0.5458318997569395, + "grad_norm": 0.9755334854125977, + "learning_rate": 8.564930001034763e-06, + "loss": 0.2897, + "step": 24590 + }, + { + "epoch": 0.545942886316467, + "grad_norm": 1.269582748413086, + "learning_rate": 8.561479394692187e-06, + "loss": 0.4555, + "step": 24595 + }, + { + "epoch": 0.5460538728759947, + "grad_norm": 1.3922683000564575, + "learning_rate": 8.558028963240123e-06, + "loss": 0.4091, + "step": 24600 + }, + { + "epoch": 0.5461648594355224, + "grad_norm": 1.26840078830719, + "learning_rate": 8.55457870709806e-06, + "loss": 0.5134, + "step": 24605 + }, + { + "epoch": 0.54627584599505, + "grad_norm": 1.013315200805664, + "learning_rate": 8.551128626685473e-06, + "loss": 0.2049, + "step": 24610 + }, + { + "epoch": 0.5463868325545777, + "grad_norm": 1.287469506263733, + "learning_rate": 8.547678722421805e-06, + "loss": 0.3325, + "step": 24615 + }, + { + "epoch": 0.5464978191141053, + "grad_norm": 1.38962984085083, + "learning_rate": 8.544228994726486e-06, + "loss": 0.4033, + "step": 24620 + }, + { + "epoch": 0.5466088056736329, + "grad_norm": 1.2414928674697876, + "learning_rate": 8.540779444018929e-06, + "loss": 0.4281, + "step": 24625 + }, + { + "epoch": 0.5467197922331606, + "grad_norm": 1.0304001569747925, + "learning_rate": 8.537330070718509e-06, + "loss": 0.5581, + "step": 24630 + }, + { + "epoch": 0.5468307787926882, + "grad_norm": 0.9019938111305237, + "learning_rate": 8.533880875244596e-06, + "loss": 0.4139, + "step": 24635 + }, + { + "epoch": 0.5469417653522158, + "grad_norm": 1.1788930892944336, + "learning_rate": 8.530431858016527e-06, + "loss": 0.5491, + "step": 24640 + }, + { + "epoch": 0.5470527519117435, + "grad_norm": 1.1308344602584839, + "learning_rate": 8.526983019453624e-06, + "loss": 0.4609, + "step": 24645 + }, + { + "epoch": 0.5471637384712711, + "grad_norm": 0.87598717212677, + "learning_rate": 8.52353435997519e-06, + "loss": 0.3522, + "step": 24650 + }, + { + "epoch": 0.5472747250307988, + "grad_norm": 1.2333834171295166, + "learning_rate": 8.52008588000049e-06, + "loss": 0.5089, + "step": 24655 + }, + { + "epoch": 0.5473857115903265, + "grad_norm": 2.247903347015381, + "learning_rate": 8.516637579948792e-06, + "loss": 0.3212, + "step": 24660 + }, + { + "epoch": 0.547496698149854, + "grad_norm": 1.563804030418396, + "learning_rate": 8.513189460239318e-06, + "loss": 0.4435, + "step": 24665 + }, + { + "epoch": 0.5476076847093817, + "grad_norm": 1.2471294403076172, + "learning_rate": 8.509741521291287e-06, + "loss": 0.2912, + "step": 24670 + }, + { + "epoch": 0.5477186712689094, + "grad_norm": 1.4435144662857056, + "learning_rate": 8.506293763523879e-06, + "loss": 0.4713, + "step": 24675 + }, + { + "epoch": 0.547829657828437, + "grad_norm": 0.9990353584289551, + "learning_rate": 8.502846187356268e-06, + "loss": 0.4863, + "step": 24680 + }, + { + "epoch": 0.5479406443879646, + "grad_norm": 1.8638103008270264, + "learning_rate": 8.499398793207596e-06, + "loss": 0.532, + "step": 24685 + }, + { + "epoch": 0.5480516309474922, + "grad_norm": 1.349532961845398, + "learning_rate": 8.495951581496989e-06, + "loss": 0.4164, + "step": 24690 + }, + { + "epoch": 0.5481626175070199, + "grad_norm": 1.3890430927276611, + "learning_rate": 8.492504552643541e-06, + "loss": 0.4659, + "step": 24695 + }, + { + "epoch": 0.5482736040665476, + "grad_norm": 0.9236255288124084, + "learning_rate": 8.489057707066335e-06, + "loss": 0.3011, + "step": 24700 + }, + { + "epoch": 0.5483845906260751, + "grad_norm": 0.7612236738204956, + "learning_rate": 8.485611045184424e-06, + "loss": 0.3232, + "step": 24705 + }, + { + "epoch": 0.5484955771856028, + "grad_norm": 1.6489447355270386, + "learning_rate": 8.482164567416848e-06, + "loss": 0.5851, + "step": 24710 + }, + { + "epoch": 0.5486065637451305, + "grad_norm": 1.068479299545288, + "learning_rate": 8.478718274182608e-06, + "loss": 0.4415, + "step": 24715 + }, + { + "epoch": 0.5487175503046581, + "grad_norm": 1.324263334274292, + "learning_rate": 8.475272165900703e-06, + "loss": 0.3814, + "step": 24720 + }, + { + "epoch": 0.5488285368641858, + "grad_norm": 1.062963604927063, + "learning_rate": 8.47182624299009e-06, + "loss": 0.5722, + "step": 24725 + }, + { + "epoch": 0.5489395234237134, + "grad_norm": 1.2563762664794922, + "learning_rate": 8.468380505869718e-06, + "loss": 0.4247, + "step": 24730 + }, + { + "epoch": 0.549050509983241, + "grad_norm": 1.1818538904190063, + "learning_rate": 8.46493495495851e-06, + "loss": 0.4213, + "step": 24735 + }, + { + "epoch": 0.5491614965427687, + "grad_norm": 1.265612006187439, + "learning_rate": 8.46148959067536e-06, + "loss": 0.3626, + "step": 24740 + }, + { + "epoch": 0.5492724831022963, + "grad_norm": 1.954348087310791, + "learning_rate": 8.458044413439148e-06, + "loss": 0.5429, + "step": 24745 + }, + { + "epoch": 0.5493834696618239, + "grad_norm": 0.8584548234939575, + "learning_rate": 8.454599423668722e-06, + "loss": 0.3159, + "step": 24750 + }, + { + "epoch": 0.5494944562213516, + "grad_norm": 1.0633316040039062, + "learning_rate": 8.45115462178292e-06, + "loss": 0.3613, + "step": 24755 + }, + { + "epoch": 0.5496054427808792, + "grad_norm": 1.023334264755249, + "learning_rate": 8.447710008200539e-06, + "loss": 0.3953, + "step": 24760 + }, + { + "epoch": 0.5497164293404069, + "grad_norm": 0.8943719267845154, + "learning_rate": 8.444265583340368e-06, + "loss": 0.5067, + "step": 24765 + }, + { + "epoch": 0.5498274158999346, + "grad_norm": 0.8232437968254089, + "learning_rate": 8.440821347621176e-06, + "loss": 0.2908, + "step": 24770 + }, + { + "epoch": 0.5499384024594621, + "grad_norm": 1.4632898569107056, + "learning_rate": 8.437377301461691e-06, + "loss": 0.4248, + "step": 24775 + }, + { + "epoch": 0.5500493890189898, + "grad_norm": 0.9818194508552551, + "learning_rate": 8.433933445280637e-06, + "loss": 0.5157, + "step": 24780 + }, + { + "epoch": 0.5501603755785175, + "grad_norm": 1.169661283493042, + "learning_rate": 8.4304897794967e-06, + "loss": 0.4002, + "step": 24785 + }, + { + "epoch": 0.5502713621380451, + "grad_norm": 1.7490116357803345, + "learning_rate": 8.42704630452855e-06, + "loss": 0.4003, + "step": 24790 + }, + { + "epoch": 0.5503823486975727, + "grad_norm": 1.6208126544952393, + "learning_rate": 8.42360302079484e-06, + "loss": 0.4971, + "step": 24795 + }, + { + "epoch": 0.5504933352571003, + "grad_norm": 1.2586880922317505, + "learning_rate": 8.420159928714183e-06, + "loss": 0.3776, + "step": 24800 + }, + { + "epoch": 0.550604321816628, + "grad_norm": 1.1551201343536377, + "learning_rate": 8.416717028705188e-06, + "loss": 0.459, + "step": 24805 + }, + { + "epoch": 0.5507153083761557, + "grad_norm": 1.0400720834732056, + "learning_rate": 8.413274321186423e-06, + "loss": 0.4561, + "step": 24810 + }, + { + "epoch": 0.5508262949356832, + "grad_norm": 1.4724727869033813, + "learning_rate": 8.409831806576446e-06, + "loss": 0.3669, + "step": 24815 + }, + { + "epoch": 0.5509372814952109, + "grad_norm": 2.1651415824890137, + "learning_rate": 8.406389485293786e-06, + "loss": 0.4854, + "step": 24820 + }, + { + "epoch": 0.5510482680547386, + "grad_norm": 1.3159157037734985, + "learning_rate": 8.402947357756946e-06, + "loss": 0.4823, + "step": 24825 + }, + { + "epoch": 0.5511592546142662, + "grad_norm": 1.1210155487060547, + "learning_rate": 8.399505424384415e-06, + "loss": 0.443, + "step": 24830 + }, + { + "epoch": 0.5512702411737939, + "grad_norm": 0.9298530220985413, + "learning_rate": 8.39606368559464e-06, + "loss": 0.3296, + "step": 24835 + }, + { + "epoch": 0.5513812277333215, + "grad_norm": 1.265432357788086, + "learning_rate": 8.392622141806068e-06, + "loss": 0.4532, + "step": 24840 + }, + { + "epoch": 0.5514922142928491, + "grad_norm": 0.821119487285614, + "learning_rate": 8.389180793437101e-06, + "loss": 0.3021, + "step": 24845 + }, + { + "epoch": 0.5516032008523768, + "grad_norm": 0.9582390785217285, + "learning_rate": 8.385739640906131e-06, + "loss": 0.2717, + "step": 24850 + }, + { + "epoch": 0.5517141874119044, + "grad_norm": 1.3956959247589111, + "learning_rate": 8.382298684631528e-06, + "loss": 0.3915, + "step": 24855 + }, + { + "epoch": 0.551825173971432, + "grad_norm": 1.9195780754089355, + "learning_rate": 8.378857925031616e-06, + "loss": 0.4014, + "step": 24860 + }, + { + "epoch": 0.5519361605309597, + "grad_norm": 1.3991817235946655, + "learning_rate": 8.375417362524729e-06, + "loss": 0.3306, + "step": 24865 + }, + { + "epoch": 0.5520471470904873, + "grad_norm": 1.4770393371582031, + "learning_rate": 8.371976997529145e-06, + "loss": 0.552, + "step": 24870 + }, + { + "epoch": 0.552158133650015, + "grad_norm": 1.588649034500122, + "learning_rate": 8.36853683046314e-06, + "loss": 0.565, + "step": 24875 + }, + { + "epoch": 0.5522691202095427, + "grad_norm": 1.552659273147583, + "learning_rate": 8.365096861744957e-06, + "loss": 0.5495, + "step": 24880 + }, + { + "epoch": 0.5523801067690702, + "grad_norm": 1.6279903650283813, + "learning_rate": 8.361657091792812e-06, + "loss": 0.5373, + "step": 24885 + }, + { + "epoch": 0.5524910933285979, + "grad_norm": 0.727022647857666, + "learning_rate": 8.358217521024906e-06, + "loss": 0.4088, + "step": 24890 + }, + { + "epoch": 0.5526020798881256, + "grad_norm": 0.8844607472419739, + "learning_rate": 8.354778149859403e-06, + "loss": 0.5545, + "step": 24895 + }, + { + "epoch": 0.5527130664476532, + "grad_norm": 0.9243135452270508, + "learning_rate": 8.351338978714458e-06, + "loss": 0.3905, + "step": 24900 + }, + { + "epoch": 0.5528240530071808, + "grad_norm": 0.919755756855011, + "learning_rate": 8.347900008008194e-06, + "loss": 0.564, + "step": 24905 + }, + { + "epoch": 0.5529350395667084, + "grad_norm": 1.214043140411377, + "learning_rate": 8.3444612381587e-06, + "loss": 0.384, + "step": 24910 + }, + { + "epoch": 0.5530460261262361, + "grad_norm": 1.4756897687911987, + "learning_rate": 8.341022669584062e-06, + "loss": 0.6571, + "step": 24915 + }, + { + "epoch": 0.5531570126857638, + "grad_norm": 1.5681345462799072, + "learning_rate": 8.33758430270232e-06, + "loss": 0.379, + "step": 24920 + }, + { + "epoch": 0.5532679992452914, + "grad_norm": 0.8127152323722839, + "learning_rate": 8.334146137931507e-06, + "loss": 0.5053, + "step": 24925 + }, + { + "epoch": 0.553378985804819, + "grad_norm": 2.043302536010742, + "learning_rate": 8.330708175689614e-06, + "loss": 0.469, + "step": 24930 + }, + { + "epoch": 0.5534899723643467, + "grad_norm": 1.2709745168685913, + "learning_rate": 8.327270416394628e-06, + "loss": 0.4525, + "step": 24935 + }, + { + "epoch": 0.5536009589238743, + "grad_norm": 1.3546684980392456, + "learning_rate": 8.323832860464492e-06, + "loss": 0.3388, + "step": 24940 + }, + { + "epoch": 0.553711945483402, + "grad_norm": 1.3516091108322144, + "learning_rate": 8.320395508317137e-06, + "loss": 0.4119, + "step": 24945 + }, + { + "epoch": 0.5538229320429296, + "grad_norm": 1.1552678346633911, + "learning_rate": 8.316958360370462e-06, + "loss": 0.3053, + "step": 24950 + }, + { + "epoch": 0.5539339186024572, + "grad_norm": 1.4363343715667725, + "learning_rate": 8.313521417042347e-06, + "loss": 0.4092, + "step": 24955 + }, + { + "epoch": 0.5540449051619849, + "grad_norm": 1.314101219177246, + "learning_rate": 8.310084678750638e-06, + "loss": 0.3924, + "step": 24960 + }, + { + "epoch": 0.5541558917215125, + "grad_norm": 1.3979650735855103, + "learning_rate": 8.306648145913173e-06, + "loss": 0.3895, + "step": 24965 + }, + { + "epoch": 0.5542668782810402, + "grad_norm": 1.220910906791687, + "learning_rate": 8.303211818947739e-06, + "loss": 0.5069, + "step": 24970 + }, + { + "epoch": 0.5543778648405678, + "grad_norm": 0.6868875622749329, + "learning_rate": 8.299775698272128e-06, + "loss": 0.3418, + "step": 24975 + }, + { + "epoch": 0.5544888514000954, + "grad_norm": 1.1837313175201416, + "learning_rate": 8.296339784304081e-06, + "loss": 0.3593, + "step": 24980 + }, + { + "epoch": 0.5545998379596231, + "grad_norm": 1.2916624546051025, + "learning_rate": 8.292904077461332e-06, + "loss": 0.2936, + "step": 24985 + }, + { + "epoch": 0.5547108245191508, + "grad_norm": 0.7648731470108032, + "learning_rate": 8.289468578161581e-06, + "loss": 0.428, + "step": 24990 + }, + { + "epoch": 0.5548218110786783, + "grad_norm": 1.3653825521469116, + "learning_rate": 8.2860332868225e-06, + "loss": 0.3027, + "step": 24995 + }, + { + "epoch": 0.554932797638206, + "grad_norm": 1.024545669555664, + "learning_rate": 8.282598203861749e-06, + "loss": 0.4062, + "step": 25000 + }, + { + "epoch": 0.5550437841977337, + "grad_norm": 1.0971325635910034, + "learning_rate": 8.279163329696944e-06, + "loss": 0.341, + "step": 25005 + }, + { + "epoch": 0.5551547707572613, + "grad_norm": 1.2084906101226807, + "learning_rate": 8.275728664745693e-06, + "loss": 0.4603, + "step": 25010 + }, + { + "epoch": 0.555265757316789, + "grad_norm": 1.4418290853500366, + "learning_rate": 8.272294209425566e-06, + "loss": 0.4499, + "step": 25015 + }, + { + "epoch": 0.5553767438763165, + "grad_norm": 1.2812556028366089, + "learning_rate": 8.268859964154113e-06, + "loss": 0.4002, + "step": 25020 + }, + { + "epoch": 0.5554877304358442, + "grad_norm": 1.1037013530731201, + "learning_rate": 8.265425929348867e-06, + "loss": 0.3524, + "step": 25025 + }, + { + "epoch": 0.5555987169953719, + "grad_norm": 1.4665725231170654, + "learning_rate": 8.261992105427312e-06, + "loss": 0.4524, + "step": 25030 + }, + { + "epoch": 0.5557097035548995, + "grad_norm": 1.751202940940857, + "learning_rate": 8.258558492806932e-06, + "loss": 0.527, + "step": 25035 + }, + { + "epoch": 0.5558206901144271, + "grad_norm": 1.0393388271331787, + "learning_rate": 8.255125091905167e-06, + "loss": 0.2466, + "step": 25040 + }, + { + "epoch": 0.5559316766739548, + "grad_norm": 1.357147455215454, + "learning_rate": 8.251691903139445e-06, + "loss": 0.577, + "step": 25045 + }, + { + "epoch": 0.5560426632334824, + "grad_norm": 1.1873313188552856, + "learning_rate": 8.248258926927157e-06, + "loss": 0.5742, + "step": 25050 + }, + { + "epoch": 0.5561536497930101, + "grad_norm": 1.418981671333313, + "learning_rate": 8.244826163685672e-06, + "loss": 0.3359, + "step": 25055 + }, + { + "epoch": 0.5562646363525378, + "grad_norm": 0.8048796057701111, + "learning_rate": 8.241393613832337e-06, + "loss": 0.3413, + "step": 25060 + }, + { + "epoch": 0.5563756229120653, + "grad_norm": 1.4124388694763184, + "learning_rate": 8.237961277784468e-06, + "loss": 0.4055, + "step": 25065 + }, + { + "epoch": 0.556486609471593, + "grad_norm": 1.5513757467269897, + "learning_rate": 8.234529155959357e-06, + "loss": 0.508, + "step": 25070 + }, + { + "epoch": 0.5565975960311206, + "grad_norm": 0.8069791793823242, + "learning_rate": 8.231097248774273e-06, + "loss": 0.353, + "step": 25075 + }, + { + "epoch": 0.5567085825906483, + "grad_norm": 0.7790321707725525, + "learning_rate": 8.22766555664645e-06, + "loss": 0.3161, + "step": 25080 + }, + { + "epoch": 0.5568195691501759, + "grad_norm": 1.2847238779067993, + "learning_rate": 8.224234079993107e-06, + "loss": 0.376, + "step": 25085 + }, + { + "epoch": 0.5569305557097035, + "grad_norm": 1.4938470125198364, + "learning_rate": 8.220802819231427e-06, + "loss": 0.4056, + "step": 25090 + }, + { + "epoch": 0.5570415422692312, + "grad_norm": 1.4761202335357666, + "learning_rate": 8.217371774778575e-06, + "loss": 0.6105, + "step": 25095 + }, + { + "epoch": 0.5571525288287589, + "grad_norm": 1.031489372253418, + "learning_rate": 8.213940947051682e-06, + "loss": 0.4533, + "step": 25100 + }, + { + "epoch": 0.5572635153882864, + "grad_norm": 1.3615245819091797, + "learning_rate": 8.21051033646786e-06, + "loss": 0.4889, + "step": 25105 + }, + { + "epoch": 0.5573745019478141, + "grad_norm": 1.0796312093734741, + "learning_rate": 8.207079943444191e-06, + "loss": 0.3195, + "step": 25110 + }, + { + "epoch": 0.5574854885073418, + "grad_norm": 1.623977541923523, + "learning_rate": 8.203649768397725e-06, + "loss": 0.4583, + "step": 25115 + }, + { + "epoch": 0.5575964750668694, + "grad_norm": 1.6875170469284058, + "learning_rate": 8.200219811745502e-06, + "loss": 0.4174, + "step": 25120 + }, + { + "epoch": 0.5577074616263971, + "grad_norm": 1.4233814477920532, + "learning_rate": 8.196790073904514e-06, + "loss": 0.548, + "step": 25125 + }, + { + "epoch": 0.5578184481859246, + "grad_norm": 1.2150535583496094, + "learning_rate": 8.193360555291741e-06, + "loss": 0.6045, + "step": 25130 + }, + { + "epoch": 0.5579294347454523, + "grad_norm": 1.5399670600891113, + "learning_rate": 8.189931256324136e-06, + "loss": 0.3197, + "step": 25135 + }, + { + "epoch": 0.55804042130498, + "grad_norm": 1.1746127605438232, + "learning_rate": 8.186502177418614e-06, + "loss": 0.4447, + "step": 25140 + }, + { + "epoch": 0.5581514078645076, + "grad_norm": 1.565848708152771, + "learning_rate": 8.18307331899208e-06, + "loss": 0.2881, + "step": 25145 + }, + { + "epoch": 0.5582623944240352, + "grad_norm": 1.0995526313781738, + "learning_rate": 8.179644681461395e-06, + "loss": 0.6268, + "step": 25150 + }, + { + "epoch": 0.5583733809835629, + "grad_norm": 1.115090012550354, + "learning_rate": 8.176216265243404e-06, + "loss": 0.4727, + "step": 25155 + }, + { + "epoch": 0.5584843675430905, + "grad_norm": 1.7932703495025635, + "learning_rate": 8.172788070754927e-06, + "loss": 0.4323, + "step": 25160 + }, + { + "epoch": 0.5585953541026182, + "grad_norm": 1.0676770210266113, + "learning_rate": 8.169360098412744e-06, + "loss": 0.4214, + "step": 25165 + }, + { + "epoch": 0.5587063406621459, + "grad_norm": 1.1755608320236206, + "learning_rate": 8.165932348633625e-06, + "loss": 0.4027, + "step": 25170 + }, + { + "epoch": 0.5588173272216734, + "grad_norm": 1.457756519317627, + "learning_rate": 8.162504821834296e-06, + "loss": 0.4976, + "step": 25175 + }, + { + "epoch": 0.5589283137812011, + "grad_norm": 0.9333487749099731, + "learning_rate": 8.159077518431474e-06, + "loss": 0.3416, + "step": 25180 + }, + { + "epoch": 0.5590393003407287, + "grad_norm": 1.0539964437484741, + "learning_rate": 8.155650438841826e-06, + "loss": 0.3759, + "step": 25185 + }, + { + "epoch": 0.5591502869002564, + "grad_norm": 1.611405849456787, + "learning_rate": 8.152223583482018e-06, + "loss": 0.3181, + "step": 25190 + }, + { + "epoch": 0.559261273459784, + "grad_norm": 1.2849960327148438, + "learning_rate": 8.148796952768668e-06, + "loss": 0.3716, + "step": 25195 + }, + { + "epoch": 0.5593722600193116, + "grad_norm": 1.1448283195495605, + "learning_rate": 8.145370547118374e-06, + "loss": 0.3764, + "step": 25200 + }, + { + "epoch": 0.5594832465788393, + "grad_norm": 1.0560680627822876, + "learning_rate": 8.141944366947711e-06, + "loss": 0.4771, + "step": 25205 + }, + { + "epoch": 0.559594233138367, + "grad_norm": 0.9812813401222229, + "learning_rate": 8.138518412673221e-06, + "loss": 0.2162, + "step": 25210 + }, + { + "epoch": 0.5597052196978946, + "grad_norm": 1.6056973934173584, + "learning_rate": 8.135092684711417e-06, + "loss": 0.4784, + "step": 25215 + }, + { + "epoch": 0.5598162062574222, + "grad_norm": 0.9755131006240845, + "learning_rate": 8.131667183478793e-06, + "loss": 0.3506, + "step": 25220 + }, + { + "epoch": 0.5599271928169499, + "grad_norm": 0.9014113545417786, + "learning_rate": 8.128241909391803e-06, + "loss": 0.3755, + "step": 25225 + }, + { + "epoch": 0.5600381793764775, + "grad_norm": 1.5927777290344238, + "learning_rate": 8.12481686286689e-06, + "loss": 0.5215, + "step": 25230 + }, + { + "epoch": 0.5601491659360052, + "grad_norm": 1.598618745803833, + "learning_rate": 8.121392044320448e-06, + "loss": 0.5198, + "step": 25235 + }, + { + "epoch": 0.5602601524955327, + "grad_norm": 1.4176770448684692, + "learning_rate": 8.117967454168862e-06, + "loss": 0.4328, + "step": 25240 + }, + { + "epoch": 0.5603711390550604, + "grad_norm": 1.2567905187606812, + "learning_rate": 8.114543092828485e-06, + "loss": 0.3619, + "step": 25245 + }, + { + "epoch": 0.5604821256145881, + "grad_norm": 1.7040385007858276, + "learning_rate": 8.11111896071563e-06, + "loss": 0.3801, + "step": 25250 + }, + { + "epoch": 0.5605931121741157, + "grad_norm": 1.7464607954025269, + "learning_rate": 8.107695058246603e-06, + "loss": 0.5361, + "step": 25255 + }, + { + "epoch": 0.5607040987336434, + "grad_norm": 1.2700963020324707, + "learning_rate": 8.104271385837658e-06, + "loss": 0.4105, + "step": 25260 + }, + { + "epoch": 0.560815085293171, + "grad_norm": 1.1427721977233887, + "learning_rate": 8.100847943905047e-06, + "loss": 0.3996, + "step": 25265 + }, + { + "epoch": 0.5609260718526986, + "grad_norm": 1.2108415365219116, + "learning_rate": 8.09742473286497e-06, + "loss": 0.4433, + "step": 25270 + }, + { + "epoch": 0.5610370584122263, + "grad_norm": 0.8741481900215149, + "learning_rate": 8.09400175313361e-06, + "loss": 0.5403, + "step": 25275 + }, + { + "epoch": 0.561148044971754, + "grad_norm": 1.054548740386963, + "learning_rate": 8.09057900512713e-06, + "loss": 0.447, + "step": 25280 + }, + { + "epoch": 0.5612590315312815, + "grad_norm": 1.8012455701828003, + "learning_rate": 8.087156489261647e-06, + "loss": 0.644, + "step": 25285 + }, + { + "epoch": 0.5613700180908092, + "grad_norm": 1.0386770963668823, + "learning_rate": 8.08373420595327e-06, + "loss": 0.4729, + "step": 25290 + }, + { + "epoch": 0.5614810046503368, + "grad_norm": 1.0843547582626343, + "learning_rate": 8.080312155618053e-06, + "loss": 0.389, + "step": 25295 + }, + { + "epoch": 0.5615919912098645, + "grad_norm": 1.1872903108596802, + "learning_rate": 8.07689033867205e-06, + "loss": 0.3114, + "step": 25300 + }, + { + "epoch": 0.5617029777693922, + "grad_norm": 1.244188666343689, + "learning_rate": 8.073468755531269e-06, + "loss": 0.4731, + "step": 25305 + }, + { + "epoch": 0.5618139643289197, + "grad_norm": 1.497534990310669, + "learning_rate": 8.070047406611696e-06, + "loss": 0.3794, + "step": 25310 + }, + { + "epoch": 0.5619249508884474, + "grad_norm": 0.9751390814781189, + "learning_rate": 8.066626292329288e-06, + "loss": 0.4799, + "step": 25315 + }, + { + "epoch": 0.5620359374479751, + "grad_norm": 2.0736711025238037, + "learning_rate": 8.063205413099969e-06, + "loss": 0.3642, + "step": 25320 + }, + { + "epoch": 0.5621469240075027, + "grad_norm": 0.9690294861793518, + "learning_rate": 8.05978476933964e-06, + "loss": 0.4201, + "step": 25325 + }, + { + "epoch": 0.5622579105670303, + "grad_norm": 1.3464387655258179, + "learning_rate": 8.056364361464176e-06, + "loss": 0.3445, + "step": 25330 + }, + { + "epoch": 0.562368897126558, + "grad_norm": 1.2715051174163818, + "learning_rate": 8.052944189889409e-06, + "loss": 0.3768, + "step": 25335 + }, + { + "epoch": 0.5624798836860856, + "grad_norm": 1.1041648387908936, + "learning_rate": 8.049524255031163e-06, + "loss": 0.4955, + "step": 25340 + }, + { + "epoch": 0.5625908702456133, + "grad_norm": 1.3899486064910889, + "learning_rate": 8.046104557305212e-06, + "loss": 0.3873, + "step": 25345 + }, + { + "epoch": 0.5627018568051408, + "grad_norm": 1.2393910884857178, + "learning_rate": 8.04268509712732e-06, + "loss": 0.4267, + "step": 25350 + }, + { + "epoch": 0.5628128433646685, + "grad_norm": 1.4444127082824707, + "learning_rate": 8.039265874913206e-06, + "loss": 0.3685, + "step": 25355 + }, + { + "epoch": 0.5629238299241962, + "grad_norm": 1.5184959173202515, + "learning_rate": 8.035846891078572e-06, + "loss": 0.248, + "step": 25360 + }, + { + "epoch": 0.5630348164837238, + "grad_norm": 1.041031002998352, + "learning_rate": 8.032428146039088e-06, + "loss": 0.3472, + "step": 25365 + }, + { + "epoch": 0.5631458030432515, + "grad_norm": 1.2190139293670654, + "learning_rate": 8.029009640210389e-06, + "loss": 0.2602, + "step": 25370 + }, + { + "epoch": 0.5632567896027791, + "grad_norm": 0.991255521774292, + "learning_rate": 8.02559137400809e-06, + "loss": 0.435, + "step": 25375 + }, + { + "epoch": 0.5633677761623067, + "grad_norm": 2.327293634414673, + "learning_rate": 8.022173347847766e-06, + "loss": 0.3849, + "step": 25380 + }, + { + "epoch": 0.5634787627218344, + "grad_norm": 1.307950735092163, + "learning_rate": 8.018755562144975e-06, + "loss": 0.4234, + "step": 25385 + }, + { + "epoch": 0.5635897492813621, + "grad_norm": 0.7813902497291565, + "learning_rate": 8.01533801731524e-06, + "loss": 0.3237, + "step": 25390 + }, + { + "epoch": 0.5637007358408896, + "grad_norm": 1.0847278833389282, + "learning_rate": 8.011920713774051e-06, + "loss": 0.3685, + "step": 25395 + }, + { + "epoch": 0.5638117224004173, + "grad_norm": 0.9492262601852417, + "learning_rate": 8.008503651936876e-06, + "loss": 0.4609, + "step": 25400 + }, + { + "epoch": 0.5639227089599449, + "grad_norm": 1.9693423509597778, + "learning_rate": 8.005086832219145e-06, + "loss": 0.4801, + "step": 25405 + }, + { + "epoch": 0.5640336955194726, + "grad_norm": 1.6223005056381226, + "learning_rate": 8.001670255036268e-06, + "loss": 0.3899, + "step": 25410 + }, + { + "epoch": 0.5641446820790003, + "grad_norm": 1.0886939764022827, + "learning_rate": 7.99825392080362e-06, + "loss": 0.4616, + "step": 25415 + }, + { + "epoch": 0.5642556686385278, + "grad_norm": 1.0888535976409912, + "learning_rate": 7.994837829936549e-06, + "loss": 0.4002, + "step": 25420 + }, + { + "epoch": 0.5643666551980555, + "grad_norm": 1.322079062461853, + "learning_rate": 7.991421982850367e-06, + "loss": 0.4154, + "step": 25425 + }, + { + "epoch": 0.5644776417575832, + "grad_norm": 1.256758451461792, + "learning_rate": 7.988006379960366e-06, + "loss": 0.533, + "step": 25430 + }, + { + "epoch": 0.5645886283171108, + "grad_norm": 1.1814241409301758, + "learning_rate": 7.9845910216818e-06, + "loss": 0.574, + "step": 25435 + }, + { + "epoch": 0.5646996148766384, + "grad_norm": 1.4561266899108887, + "learning_rate": 7.9811759084299e-06, + "loss": 0.4807, + "step": 25440 + }, + { + "epoch": 0.5648106014361661, + "grad_norm": 1.137708306312561, + "learning_rate": 7.97776104061986e-06, + "loss": 0.5643, + "step": 25445 + }, + { + "epoch": 0.5649215879956937, + "grad_norm": 0.908723771572113, + "learning_rate": 7.974346418666854e-06, + "loss": 0.4804, + "step": 25450 + }, + { + "epoch": 0.5650325745552214, + "grad_norm": 2.1305763721466064, + "learning_rate": 7.970932042986013e-06, + "loss": 0.5224, + "step": 25455 + }, + { + "epoch": 0.565143561114749, + "grad_norm": 1.0807911157608032, + "learning_rate": 7.967517913992453e-06, + "loss": 0.4506, + "step": 25460 + }, + { + "epoch": 0.5652545476742766, + "grad_norm": 1.67317533493042, + "learning_rate": 7.964104032101246e-06, + "loss": 0.4928, + "step": 25465 + }, + { + "epoch": 0.5653655342338043, + "grad_norm": 1.2551069259643555, + "learning_rate": 7.96069039772744e-06, + "loss": 0.4584, + "step": 25470 + }, + { + "epoch": 0.5654765207933319, + "grad_norm": 1.3132424354553223, + "learning_rate": 7.957277011286063e-06, + "loss": 0.4113, + "step": 25475 + }, + { + "epoch": 0.5655875073528596, + "grad_norm": 1.1066019535064697, + "learning_rate": 7.953863873192092e-06, + "loss": 0.4033, + "step": 25480 + }, + { + "epoch": 0.5656984939123872, + "grad_norm": 1.9208440780639648, + "learning_rate": 7.95045098386049e-06, + "loss": 0.4711, + "step": 25485 + }, + { + "epoch": 0.5658094804719148, + "grad_norm": 0.8026219606399536, + "learning_rate": 7.947038343706184e-06, + "loss": 0.3814, + "step": 25490 + }, + { + "epoch": 0.5659204670314425, + "grad_norm": 1.6581600904464722, + "learning_rate": 7.943625953144068e-06, + "loss": 0.3491, + "step": 25495 + }, + { + "epoch": 0.5660314535909702, + "grad_norm": 0.9231352806091309, + "learning_rate": 7.940213812589018e-06, + "loss": 0.453, + "step": 25500 + }, + { + "epoch": 0.5661424401504977, + "grad_norm": 1.7178947925567627, + "learning_rate": 7.93680192245586e-06, + "loss": 0.4339, + "step": 25505 + }, + { + "epoch": 0.5662534267100254, + "grad_norm": 1.2096792459487915, + "learning_rate": 7.93339028315941e-06, + "loss": 0.3569, + "step": 25510 + }, + { + "epoch": 0.566364413269553, + "grad_norm": 0.9888404607772827, + "learning_rate": 7.929978895114432e-06, + "loss": 0.406, + "step": 25515 + }, + { + "epoch": 0.5664753998290807, + "grad_norm": 1.1972068548202515, + "learning_rate": 7.926567758735683e-06, + "loss": 0.4937, + "step": 25520 + }, + { + "epoch": 0.5665863863886084, + "grad_norm": 1.4918951988220215, + "learning_rate": 7.923156874437867e-06, + "loss": 0.4326, + "step": 25525 + }, + { + "epoch": 0.5666973729481359, + "grad_norm": 1.0668883323669434, + "learning_rate": 7.919746242635675e-06, + "loss": 0.1992, + "step": 25530 + }, + { + "epoch": 0.5668083595076636, + "grad_norm": 1.5377274751663208, + "learning_rate": 7.91633586374376e-06, + "loss": 0.3767, + "step": 25535 + }, + { + "epoch": 0.5669193460671913, + "grad_norm": 1.5494482517242432, + "learning_rate": 7.91292573817674e-06, + "loss": 0.3608, + "step": 25540 + }, + { + "epoch": 0.5670303326267189, + "grad_norm": 1.1204816102981567, + "learning_rate": 7.909515866349214e-06, + "loss": 0.442, + "step": 25545 + }, + { + "epoch": 0.5671413191862466, + "grad_norm": 1.6153379678726196, + "learning_rate": 7.906106248675733e-06, + "loss": 0.3891, + "step": 25550 + }, + { + "epoch": 0.5672523057457742, + "grad_norm": 1.0235836505889893, + "learning_rate": 7.902696885570833e-06, + "loss": 0.4467, + "step": 25555 + }, + { + "epoch": 0.5673632923053018, + "grad_norm": 1.214629888534546, + "learning_rate": 7.899287777449015e-06, + "loss": 0.4197, + "step": 25560 + }, + { + "epoch": 0.5674742788648295, + "grad_norm": 1.6324222087860107, + "learning_rate": 7.89587892472474e-06, + "loss": 0.2196, + "step": 25565 + }, + { + "epoch": 0.567585265424357, + "grad_norm": 1.2325080633163452, + "learning_rate": 7.892470327812451e-06, + "loss": 0.4715, + "step": 25570 + }, + { + "epoch": 0.5676962519838847, + "grad_norm": 1.5055568218231201, + "learning_rate": 7.889061987126549e-06, + "loss": 0.444, + "step": 25575 + }, + { + "epoch": 0.5678072385434124, + "grad_norm": 1.1074658632278442, + "learning_rate": 7.885653903081412e-06, + "loss": 0.495, + "step": 25580 + }, + { + "epoch": 0.56791822510294, + "grad_norm": 0.9098854660987854, + "learning_rate": 7.882246076091385e-06, + "loss": 0.4111, + "step": 25585 + }, + { + "epoch": 0.5680292116624677, + "grad_norm": 0.773410975933075, + "learning_rate": 7.878838506570774e-06, + "loss": 0.3386, + "step": 25590 + }, + { + "epoch": 0.5681401982219954, + "grad_norm": 0.5425692796707153, + "learning_rate": 7.875431194933868e-06, + "loss": 0.3496, + "step": 25595 + }, + { + "epoch": 0.5682511847815229, + "grad_norm": 1.6536232233047485, + "learning_rate": 7.872024141594907e-06, + "loss": 0.3964, + "step": 25600 + }, + { + "epoch": 0.5683621713410506, + "grad_norm": 0.9511072039604187, + "learning_rate": 7.86861734696812e-06, + "loss": 0.5014, + "step": 25605 + }, + { + "epoch": 0.5684731579005783, + "grad_norm": 1.6080678701400757, + "learning_rate": 7.865210811467682e-06, + "loss": 0.5391, + "step": 25610 + }, + { + "epoch": 0.5685841444601059, + "grad_norm": 1.163704514503479, + "learning_rate": 7.861804535507754e-06, + "loss": 0.3368, + "step": 25615 + }, + { + "epoch": 0.5686951310196335, + "grad_norm": 1.0903295278549194, + "learning_rate": 7.858398519502464e-06, + "loss": 0.649, + "step": 25620 + }, + { + "epoch": 0.5688061175791612, + "grad_norm": 1.1664574146270752, + "learning_rate": 7.854992763865896e-06, + "loss": 0.3017, + "step": 25625 + }, + { + "epoch": 0.5689171041386888, + "grad_norm": 0.8933138251304626, + "learning_rate": 7.851587269012118e-06, + "loss": 0.4096, + "step": 25630 + }, + { + "epoch": 0.5690280906982165, + "grad_norm": 1.3496272563934326, + "learning_rate": 7.848182035355151e-06, + "loss": 0.5146, + "step": 25635 + }, + { + "epoch": 0.569139077257744, + "grad_norm": 1.5749702453613281, + "learning_rate": 7.844777063308994e-06, + "loss": 0.4096, + "step": 25640 + }, + { + "epoch": 0.5692500638172717, + "grad_norm": 1.6230230331420898, + "learning_rate": 7.84137235328762e-06, + "loss": 0.3783, + "step": 25645 + }, + { + "epoch": 0.5693610503767994, + "grad_norm": 0.7861883640289307, + "learning_rate": 7.83796790570495e-06, + "loss": 0.4608, + "step": 25650 + }, + { + "epoch": 0.569472036936327, + "grad_norm": 2.24285888671875, + "learning_rate": 7.834563720974895e-06, + "loss": 0.4869, + "step": 25655 + }, + { + "epoch": 0.5695830234958547, + "grad_norm": 1.4274076223373413, + "learning_rate": 7.831159799511316e-06, + "loss": 0.2404, + "step": 25660 + }, + { + "epoch": 0.5696940100553823, + "grad_norm": 1.2562860250473022, + "learning_rate": 7.82775614172806e-06, + "loss": 0.426, + "step": 25665 + }, + { + "epoch": 0.5698049966149099, + "grad_norm": 1.2366671562194824, + "learning_rate": 7.824352748038924e-06, + "loss": 0.5719, + "step": 25670 + }, + { + "epoch": 0.5699159831744376, + "grad_norm": 1.4575527906417847, + "learning_rate": 7.820949618857685e-06, + "loss": 0.4365, + "step": 25675 + }, + { + "epoch": 0.5700269697339653, + "grad_norm": 1.5027035474777222, + "learning_rate": 7.817546754598085e-06, + "loss": 0.4668, + "step": 25680 + }, + { + "epoch": 0.5701379562934928, + "grad_norm": 1.3077846765518188, + "learning_rate": 7.814144155673828e-06, + "loss": 0.3834, + "step": 25685 + }, + { + "epoch": 0.5702489428530205, + "grad_norm": 1.1927096843719482, + "learning_rate": 7.810741822498596e-06, + "loss": 0.4916, + "step": 25690 + }, + { + "epoch": 0.5703599294125481, + "grad_norm": 1.380219578742981, + "learning_rate": 7.80733975548603e-06, + "loss": 0.3316, + "step": 25695 + }, + { + "epoch": 0.5704709159720758, + "grad_norm": 0.9102444052696228, + "learning_rate": 7.803937955049743e-06, + "loss": 0.3905, + "step": 25700 + }, + { + "epoch": 0.5705819025316035, + "grad_norm": 0.704429566860199, + "learning_rate": 7.800536421603317e-06, + "loss": 0.4945, + "step": 25705 + }, + { + "epoch": 0.570692889091131, + "grad_norm": 0.7455553412437439, + "learning_rate": 7.797135155560292e-06, + "loss": 0.5326, + "step": 25710 + }, + { + "epoch": 0.5708038756506587, + "grad_norm": 0.8614157438278198, + "learning_rate": 7.793734157334192e-06, + "loss": 0.5087, + "step": 25715 + }, + { + "epoch": 0.5709148622101864, + "grad_norm": 1.1032811403274536, + "learning_rate": 7.790333427338489e-06, + "loss": 0.4716, + "step": 25720 + }, + { + "epoch": 0.571025848769714, + "grad_norm": 0.9284597039222717, + "learning_rate": 7.786932965986638e-06, + "loss": 0.5088, + "step": 25725 + }, + { + "epoch": 0.5711368353292416, + "grad_norm": 1.2898480892181396, + "learning_rate": 7.783532773692058e-06, + "loss": 0.4708, + "step": 25730 + }, + { + "epoch": 0.5712478218887693, + "grad_norm": 0.7317522764205933, + "learning_rate": 7.780132850868126e-06, + "loss": 0.3739, + "step": 25735 + }, + { + "epoch": 0.5713588084482969, + "grad_norm": 0.6803992986679077, + "learning_rate": 7.776733197928203e-06, + "loss": 0.534, + "step": 25740 + }, + { + "epoch": 0.5714697950078246, + "grad_norm": 0.9768413305282593, + "learning_rate": 7.773333815285596e-06, + "loss": 0.4836, + "step": 25745 + }, + { + "epoch": 0.5715807815673521, + "grad_norm": 1.4873603582382202, + "learning_rate": 7.769934703353594e-06, + "loss": 0.3999, + "step": 25750 + }, + { + "epoch": 0.5716917681268798, + "grad_norm": 1.1292402744293213, + "learning_rate": 7.76653586254546e-06, + "loss": 0.3039, + "step": 25755 + }, + { + "epoch": 0.5718027546864075, + "grad_norm": 1.3553985357284546, + "learning_rate": 7.763137293274399e-06, + "loss": 0.3609, + "step": 25760 + }, + { + "epoch": 0.5719137412459351, + "grad_norm": 1.3553942441940308, + "learning_rate": 7.759738995953608e-06, + "loss": 0.5826, + "step": 25765 + }, + { + "epoch": 0.5720247278054628, + "grad_norm": 0.895066499710083, + "learning_rate": 7.756340970996233e-06, + "loss": 0.5257, + "step": 25770 + }, + { + "epoch": 0.5721357143649904, + "grad_norm": 1.4519301652908325, + "learning_rate": 7.752943218815401e-06, + "loss": 0.387, + "step": 25775 + }, + { + "epoch": 0.572246700924518, + "grad_norm": 3.0451765060424805, + "learning_rate": 7.749545739824193e-06, + "loss": 0.4637, + "step": 25780 + }, + { + "epoch": 0.5723576874840457, + "grad_norm": 1.217422366142273, + "learning_rate": 7.746148534435668e-06, + "loss": 0.2863, + "step": 25785 + }, + { + "epoch": 0.5724686740435734, + "grad_norm": 1.5442456007003784, + "learning_rate": 7.742751603062848e-06, + "loss": 0.3668, + "step": 25790 + }, + { + "epoch": 0.572579660603101, + "grad_norm": 1.1498961448669434, + "learning_rate": 7.739354946118714e-06, + "loss": 0.3538, + "step": 25795 + }, + { + "epoch": 0.5726906471626286, + "grad_norm": 1.1722846031188965, + "learning_rate": 7.735958564016228e-06, + "loss": 0.4353, + "step": 25800 + }, + { + "epoch": 0.5728016337221562, + "grad_norm": 1.1115599870681763, + "learning_rate": 7.732562457168304e-06, + "loss": 0.5448, + "step": 25805 + }, + { + "epoch": 0.5729126202816839, + "grad_norm": 1.3585245609283447, + "learning_rate": 7.729166625987834e-06, + "loss": 0.2745, + "step": 25810 + }, + { + "epoch": 0.5730236068412116, + "grad_norm": 1.2924643754959106, + "learning_rate": 7.725771070887668e-06, + "loss": 0.423, + "step": 25815 + }, + { + "epoch": 0.5731345934007391, + "grad_norm": 1.2379727363586426, + "learning_rate": 7.722375792280628e-06, + "loss": 0.4376, + "step": 25820 + }, + { + "epoch": 0.5732455799602668, + "grad_norm": 1.567542552947998, + "learning_rate": 7.718980790579503e-06, + "loss": 0.4743, + "step": 25825 + }, + { + "epoch": 0.5733565665197945, + "grad_norm": 1.2755229473114014, + "learning_rate": 7.71558606619704e-06, + "loss": 0.542, + "step": 25830 + }, + { + "epoch": 0.5734675530793221, + "grad_norm": 0.7536981701850891, + "learning_rate": 7.712191619545962e-06, + "loss": 0.4327, + "step": 25835 + }, + { + "epoch": 0.5735785396388497, + "grad_norm": 0.9931631088256836, + "learning_rate": 7.70879745103896e-06, + "loss": 0.3605, + "step": 25840 + }, + { + "epoch": 0.5736895261983774, + "grad_norm": 1.2342039346694946, + "learning_rate": 7.705403561088672e-06, + "loss": 0.3614, + "step": 25845 + }, + { + "epoch": 0.573800512757905, + "grad_norm": 1.337133526802063, + "learning_rate": 7.702009950107729e-06, + "loss": 0.4008, + "step": 25850 + }, + { + "epoch": 0.5739114993174327, + "grad_norm": 0.7864338755607605, + "learning_rate": 7.698616618508706e-06, + "loss": 0.4875, + "step": 25855 + }, + { + "epoch": 0.5740224858769603, + "grad_norm": 1.7323805093765259, + "learning_rate": 7.695223566704156e-06, + "loss": 0.278, + "step": 25860 + }, + { + "epoch": 0.5741334724364879, + "grad_norm": 2.2891860008239746, + "learning_rate": 7.691830795106599e-06, + "loss": 0.4967, + "step": 25865 + }, + { + "epoch": 0.5742444589960156, + "grad_norm": 1.6649677753448486, + "learning_rate": 7.68843830412851e-06, + "loss": 0.3181, + "step": 25870 + }, + { + "epoch": 0.5743554455555432, + "grad_norm": 1.1986839771270752, + "learning_rate": 7.685046094182343e-06, + "loss": 0.4243, + "step": 25875 + }, + { + "epoch": 0.5744664321150709, + "grad_norm": 1.3807337284088135, + "learning_rate": 7.681654165680504e-06, + "loss": 0.5999, + "step": 25880 + }, + { + "epoch": 0.5745774186745986, + "grad_norm": 1.0951980352401733, + "learning_rate": 7.67826251903538e-06, + "loss": 0.4205, + "step": 25885 + }, + { + "epoch": 0.5746884052341261, + "grad_norm": 1.0187554359436035, + "learning_rate": 7.67487115465931e-06, + "loss": 0.4065, + "step": 25890 + }, + { + "epoch": 0.5747993917936538, + "grad_norm": 1.6819921731948853, + "learning_rate": 7.671480072964605e-06, + "loss": 0.4441, + "step": 25895 + }, + { + "epoch": 0.5749103783531815, + "grad_norm": 0.9652168154716492, + "learning_rate": 7.668089274363548e-06, + "loss": 0.3745, + "step": 25900 + }, + { + "epoch": 0.575021364912709, + "grad_norm": 1.1394695043563843, + "learning_rate": 7.664698759268371e-06, + "loss": 0.5128, + "step": 25905 + }, + { + "epoch": 0.5751323514722367, + "grad_norm": 1.1738479137420654, + "learning_rate": 7.661308528091294e-06, + "loss": 0.4227, + "step": 25910 + }, + { + "epoch": 0.5752433380317643, + "grad_norm": 0.7554298043251038, + "learning_rate": 7.657918581244477e-06, + "loss": 0.407, + "step": 25915 + }, + { + "epoch": 0.575354324591292, + "grad_norm": 1.571569800376892, + "learning_rate": 7.654528919140067e-06, + "loss": 0.4258, + "step": 25920 + }, + { + "epoch": 0.5754653111508197, + "grad_norm": 1.7037241458892822, + "learning_rate": 7.651139542190164e-06, + "loss": 0.5274, + "step": 25925 + }, + { + "epoch": 0.5755762977103472, + "grad_norm": 1.4097694158554077, + "learning_rate": 7.647750450806838e-06, + "loss": 0.3991, + "step": 25930 + }, + { + "epoch": 0.5756872842698749, + "grad_norm": 1.1634163856506348, + "learning_rate": 7.644361645402124e-06, + "loss": 0.3743, + "step": 25935 + }, + { + "epoch": 0.5757982708294026, + "grad_norm": 0.6868215799331665, + "learning_rate": 7.64097312638802e-06, + "loss": 0.5497, + "step": 25940 + }, + { + "epoch": 0.5759092573889302, + "grad_norm": 1.4639344215393066, + "learning_rate": 7.637584894176491e-06, + "loss": 0.5139, + "step": 25945 + }, + { + "epoch": 0.5760202439484579, + "grad_norm": 1.0975699424743652, + "learning_rate": 7.634196949179472e-06, + "loss": 0.452, + "step": 25950 + }, + { + "epoch": 0.5761312305079855, + "grad_norm": 0.7695174217224121, + "learning_rate": 7.630809291808848e-06, + "loss": 0.3719, + "step": 25955 + }, + { + "epoch": 0.5762422170675131, + "grad_norm": 1.2575563192367554, + "learning_rate": 7.6274219224764905e-06, + "loss": 0.3246, + "step": 25960 + }, + { + "epoch": 0.5763532036270408, + "grad_norm": 1.0166893005371094, + "learning_rate": 7.624034841594212e-06, + "loss": 0.401, + "step": 25965 + }, + { + "epoch": 0.5764641901865684, + "grad_norm": 1.1126619577407837, + "learning_rate": 7.620648049573815e-06, + "loss": 0.4612, + "step": 25970 + }, + { + "epoch": 0.576575176746096, + "grad_norm": 0.6467011570930481, + "learning_rate": 7.617261546827045e-06, + "loss": 0.4013, + "step": 25975 + }, + { + "epoch": 0.5766861633056237, + "grad_norm": 1.3136359453201294, + "learning_rate": 7.613875333765623e-06, + "loss": 0.37, + "step": 25980 + }, + { + "epoch": 0.5767971498651513, + "grad_norm": 1.0311453342437744, + "learning_rate": 7.610489410801242e-06, + "loss": 0.3963, + "step": 25985 + }, + { + "epoch": 0.576908136424679, + "grad_norm": 1.1149797439575195, + "learning_rate": 7.607103778345538e-06, + "loss": 0.4246, + "step": 25990 + }, + { + "epoch": 0.5770191229842067, + "grad_norm": 0.9374891519546509, + "learning_rate": 7.603718436810137e-06, + "loss": 0.4815, + "step": 25995 + }, + { + "epoch": 0.5771301095437342, + "grad_norm": 1.7929052114486694, + "learning_rate": 7.600333386606607e-06, + "loss": 0.4759, + "step": 26000 + }, + { + "epoch": 0.5772410961032619, + "grad_norm": 1.3042231798171997, + "learning_rate": 7.596948628146498e-06, + "loss": 0.4221, + "step": 26005 + }, + { + "epoch": 0.5773520826627896, + "grad_norm": 0.7556316256523132, + "learning_rate": 7.593564161841318e-06, + "loss": 0.3902, + "step": 26010 + }, + { + "epoch": 0.5774630692223172, + "grad_norm": 1.3005378246307373, + "learning_rate": 7.590179988102533e-06, + "loss": 0.4377, + "step": 26015 + }, + { + "epoch": 0.5775740557818448, + "grad_norm": 0.8551700115203857, + "learning_rate": 7.586796107341587e-06, + "loss": 0.471, + "step": 26020 + }, + { + "epoch": 0.5776850423413724, + "grad_norm": 0.9651265740394592, + "learning_rate": 7.5834125199698725e-06, + "loss": 0.5191, + "step": 26025 + }, + { + "epoch": 0.5777960289009001, + "grad_norm": 1.8282874822616577, + "learning_rate": 7.580029226398762e-06, + "loss": 0.4154, + "step": 26030 + }, + { + "epoch": 0.5779070154604278, + "grad_norm": 1.118324875831604, + "learning_rate": 7.5766462270395815e-06, + "loss": 0.2921, + "step": 26035 + }, + { + "epoch": 0.5780180020199553, + "grad_norm": 1.2642825841903687, + "learning_rate": 7.573263522303627e-06, + "loss": 0.4322, + "step": 26040 + }, + { + "epoch": 0.578128988579483, + "grad_norm": 1.2556785345077515, + "learning_rate": 7.569881112602153e-06, + "loss": 0.2382, + "step": 26045 + }, + { + "epoch": 0.5782399751390107, + "grad_norm": 0.7829084992408752, + "learning_rate": 7.566498998346384e-06, + "loss": 0.4101, + "step": 26050 + }, + { + "epoch": 0.5783509616985383, + "grad_norm": 1.2316957712173462, + "learning_rate": 7.563117179947506e-06, + "loss": 0.4872, + "step": 26055 + }, + { + "epoch": 0.578461948258066, + "grad_norm": 1.0883417129516602, + "learning_rate": 7.559735657816667e-06, + "loss": 0.4142, + "step": 26060 + }, + { + "epoch": 0.5785729348175936, + "grad_norm": 1.3580420017242432, + "learning_rate": 7.5563544323649815e-06, + "loss": 0.4207, + "step": 26065 + }, + { + "epoch": 0.5786839213771212, + "grad_norm": 1.2232900857925415, + "learning_rate": 7.552973504003534e-06, + "loss": 0.5603, + "step": 26070 + }, + { + "epoch": 0.5787949079366489, + "grad_norm": 2.7719151973724365, + "learning_rate": 7.5495928731433565e-06, + "loss": 0.4197, + "step": 26075 + }, + { + "epoch": 0.5789058944961765, + "grad_norm": 1.237747073173523, + "learning_rate": 7.546212540195463e-06, + "loss": 0.4251, + "step": 26080 + }, + { + "epoch": 0.5790168810557041, + "grad_norm": 1.1366463899612427, + "learning_rate": 7.542832505570815e-06, + "loss": 0.5521, + "step": 26085 + }, + { + "epoch": 0.5791278676152318, + "grad_norm": 1.3835487365722656, + "learning_rate": 7.539452769680351e-06, + "loss": 0.47, + "step": 26090 + }, + { + "epoch": 0.5792388541747594, + "grad_norm": 0.8140641450881958, + "learning_rate": 7.536073332934972e-06, + "loss": 0.5301, + "step": 26095 + }, + { + "epoch": 0.5793498407342871, + "grad_norm": 1.0987571477890015, + "learning_rate": 7.532694195745529e-06, + "loss": 0.3106, + "step": 26100 + }, + { + "epoch": 0.5794608272938148, + "grad_norm": 0.8439056277275085, + "learning_rate": 7.529315358522855e-06, + "loss": 0.3396, + "step": 26105 + }, + { + "epoch": 0.5795718138533423, + "grad_norm": 1.406260371208191, + "learning_rate": 7.5259368216777296e-06, + "loss": 0.4741, + "step": 26110 + }, + { + "epoch": 0.57968280041287, + "grad_norm": 1.0653043985366821, + "learning_rate": 7.522558585620907e-06, + "loss": 0.3736, + "step": 26115 + }, + { + "epoch": 0.5797937869723977, + "grad_norm": 1.3564051389694214, + "learning_rate": 7.51918065076311e-06, + "loss": 0.4633, + "step": 26120 + }, + { + "epoch": 0.5799047735319253, + "grad_norm": 1.2771178483963013, + "learning_rate": 7.515803017515002e-06, + "loss": 0.4959, + "step": 26125 + }, + { + "epoch": 0.580015760091453, + "grad_norm": 1.5175601243972778, + "learning_rate": 7.512425686287237e-06, + "loss": 0.4971, + "step": 26130 + }, + { + "epoch": 0.5801267466509805, + "grad_norm": 1.8545664548873901, + "learning_rate": 7.509048657490411e-06, + "loss": 0.4164, + "step": 26135 + }, + { + "epoch": 0.5802377332105082, + "grad_norm": 1.141951084136963, + "learning_rate": 7.505671931535099e-06, + "loss": 0.3089, + "step": 26140 + }, + { + "epoch": 0.5803487197700359, + "grad_norm": 1.5077598094940186, + "learning_rate": 7.502295508831824e-06, + "loss": 0.3922, + "step": 26145 + }, + { + "epoch": 0.5804597063295635, + "grad_norm": 1.1959303617477417, + "learning_rate": 7.498919389791084e-06, + "loss": 0.4636, + "step": 26150 + }, + { + "epoch": 0.5805706928890911, + "grad_norm": 1.1563524007797241, + "learning_rate": 7.495543574823341e-06, + "loss": 0.4562, + "step": 26155 + }, + { + "epoch": 0.5806816794486188, + "grad_norm": 1.376280426979065, + "learning_rate": 7.492168064339007e-06, + "loss": 0.444, + "step": 26160 + }, + { + "epoch": 0.5807926660081464, + "grad_norm": 1.0079575777053833, + "learning_rate": 7.488792858748473e-06, + "loss": 0.3961, + "step": 26165 + }, + { + "epoch": 0.5809036525676741, + "grad_norm": 1.5137135982513428, + "learning_rate": 7.485417958462078e-06, + "loss": 0.5764, + "step": 26170 + }, + { + "epoch": 0.5810146391272017, + "grad_norm": 1.3020367622375488, + "learning_rate": 7.4820433638901355e-06, + "loss": 0.3482, + "step": 26175 + }, + { + "epoch": 0.5811256256867293, + "grad_norm": 1.1549218893051147, + "learning_rate": 7.478669075442917e-06, + "loss": 0.2838, + "step": 26180 + }, + { + "epoch": 0.581236612246257, + "grad_norm": 1.3579342365264893, + "learning_rate": 7.475295093530655e-06, + "loss": 0.4663, + "step": 26185 + }, + { + "epoch": 0.5813475988057846, + "grad_norm": 0.9994341135025024, + "learning_rate": 7.4719214185635505e-06, + "loss": 0.3361, + "step": 26190 + }, + { + "epoch": 0.5814585853653123, + "grad_norm": 0.8745097517967224, + "learning_rate": 7.4685480509517594e-06, + "loss": 0.2108, + "step": 26195 + }, + { + "epoch": 0.5815695719248399, + "grad_norm": 0.8067927360534668, + "learning_rate": 7.465174991105405e-06, + "loss": 0.3894, + "step": 26200 + }, + { + "epoch": 0.5816805584843675, + "grad_norm": 1.0950978994369507, + "learning_rate": 7.461802239434579e-06, + "loss": 0.4599, + "step": 26205 + }, + { + "epoch": 0.5817915450438952, + "grad_norm": 1.183995246887207, + "learning_rate": 7.4584297963493195e-06, + "loss": 0.3717, + "step": 26210 + }, + { + "epoch": 0.5819025316034229, + "grad_norm": 1.009460687637329, + "learning_rate": 7.455057662259645e-06, + "loss": 0.4878, + "step": 26215 + }, + { + "epoch": 0.5820135181629504, + "grad_norm": 0.9349616765975952, + "learning_rate": 7.451685837575522e-06, + "loss": 0.4331, + "step": 26220 + }, + { + "epoch": 0.5821245047224781, + "grad_norm": 1.2902761697769165, + "learning_rate": 7.448314322706891e-06, + "loss": 0.4988, + "step": 26225 + }, + { + "epoch": 0.5822354912820058, + "grad_norm": 1.7355648279190063, + "learning_rate": 7.444943118063645e-06, + "loss": 0.5101, + "step": 26230 + }, + { + "epoch": 0.5823464778415334, + "grad_norm": 0.7856650352478027, + "learning_rate": 7.441572224055644e-06, + "loss": 0.3294, + "step": 26235 + }, + { + "epoch": 0.582457464401061, + "grad_norm": 1.197662353515625, + "learning_rate": 7.438201641092716e-06, + "loss": 0.3985, + "step": 26240 + }, + { + "epoch": 0.5825684509605886, + "grad_norm": 0.7614005208015442, + "learning_rate": 7.4348313695846375e-06, + "loss": 0.3994, + "step": 26245 + }, + { + "epoch": 0.5826794375201163, + "grad_norm": 1.2452610731124878, + "learning_rate": 7.431461409941162e-06, + "loss": 0.3622, + "step": 26250 + }, + { + "epoch": 0.582790424079644, + "grad_norm": 1.1531463861465454, + "learning_rate": 7.428091762571989e-06, + "loss": 0.4351, + "step": 26255 + }, + { + "epoch": 0.5829014106391716, + "grad_norm": 1.2325093746185303, + "learning_rate": 7.424722427886795e-06, + "loss": 0.4805, + "step": 26260 + }, + { + "epoch": 0.5830123971986992, + "grad_norm": 1.3518481254577637, + "learning_rate": 7.421353406295215e-06, + "loss": 0.5129, + "step": 26265 + }, + { + "epoch": 0.5831233837582269, + "grad_norm": 1.5780023336410522, + "learning_rate": 7.417984698206837e-06, + "loss": 0.4087, + "step": 26270 + }, + { + "epoch": 0.5832343703177545, + "grad_norm": 1.3883979320526123, + "learning_rate": 7.414616304031223e-06, + "loss": 0.4416, + "step": 26275 + }, + { + "epoch": 0.5833453568772822, + "grad_norm": 1.5068031549453735, + "learning_rate": 7.411248224177884e-06, + "loss": 0.3407, + "step": 26280 + }, + { + "epoch": 0.5834563434368099, + "grad_norm": 1.6327471733093262, + "learning_rate": 7.407880459056306e-06, + "loss": 0.4364, + "step": 26285 + }, + { + "epoch": 0.5835673299963374, + "grad_norm": 1.1161293983459473, + "learning_rate": 7.404513009075929e-06, + "loss": 0.4822, + "step": 26290 + }, + { + "epoch": 0.5836783165558651, + "grad_norm": 1.0726834535598755, + "learning_rate": 7.401145874646154e-06, + "loss": 0.5733, + "step": 26295 + }, + { + "epoch": 0.5837893031153927, + "grad_norm": 1.6274734735488892, + "learning_rate": 7.39777905617635e-06, + "loss": 0.5559, + "step": 26300 + }, + { + "epoch": 0.5839002896749204, + "grad_norm": 1.3102668523788452, + "learning_rate": 7.39441255407584e-06, + "loss": 0.4602, + "step": 26305 + }, + { + "epoch": 0.584011276234448, + "grad_norm": 1.2450367212295532, + "learning_rate": 7.391046368753913e-06, + "loss": 0.6068, + "step": 26310 + }, + { + "epoch": 0.5841222627939756, + "grad_norm": 1.6839604377746582, + "learning_rate": 7.387680500619819e-06, + "loss": 0.4881, + "step": 26315 + }, + { + "epoch": 0.5842332493535033, + "grad_norm": 1.215613603591919, + "learning_rate": 7.384314950082767e-06, + "loss": 0.4433, + "step": 26320 + }, + { + "epoch": 0.584344235913031, + "grad_norm": 1.1325950622558594, + "learning_rate": 7.380949717551935e-06, + "loss": 0.4274, + "step": 26325 + }, + { + "epoch": 0.5844552224725585, + "grad_norm": 1.5983508825302124, + "learning_rate": 7.3775848034364485e-06, + "loss": 0.5021, + "step": 26330 + }, + { + "epoch": 0.5845662090320862, + "grad_norm": 0.9946767091751099, + "learning_rate": 7.37422020814541e-06, + "loss": 0.3135, + "step": 26335 + }, + { + "epoch": 0.5846771955916139, + "grad_norm": 1.2925642728805542, + "learning_rate": 7.37085593208787e-06, + "loss": 0.4129, + "step": 26340 + }, + { + "epoch": 0.5847881821511415, + "grad_norm": 1.2564030885696411, + "learning_rate": 7.367491975672847e-06, + "loss": 0.3438, + "step": 26345 + }, + { + "epoch": 0.5848991687106692, + "grad_norm": 1.0621161460876465, + "learning_rate": 7.364128339309326e-06, + "loss": 0.4698, + "step": 26350 + }, + { + "epoch": 0.5850101552701967, + "grad_norm": 1.0059372186660767, + "learning_rate": 7.360765023406237e-06, + "loss": 0.4562, + "step": 26355 + }, + { + "epoch": 0.5851211418297244, + "grad_norm": 1.0170514583587646, + "learning_rate": 7.357402028372489e-06, + "loss": 0.3451, + "step": 26360 + }, + { + "epoch": 0.5852321283892521, + "grad_norm": 0.9726334810256958, + "learning_rate": 7.354039354616935e-06, + "loss": 0.3805, + "step": 26365 + }, + { + "epoch": 0.5853431149487797, + "grad_norm": 1.5828025341033936, + "learning_rate": 7.350677002548403e-06, + "loss": 0.4252, + "step": 26370 + }, + { + "epoch": 0.5854541015083073, + "grad_norm": 1.3867665529251099, + "learning_rate": 7.347314972575681e-06, + "loss": 0.4395, + "step": 26375 + }, + { + "epoch": 0.585565088067835, + "grad_norm": 1.0094491243362427, + "learning_rate": 7.343953265107502e-06, + "loss": 0.5013, + "step": 26380 + }, + { + "epoch": 0.5856760746273626, + "grad_norm": 1.0547956228256226, + "learning_rate": 7.340591880552583e-06, + "loss": 0.4529, + "step": 26385 + }, + { + "epoch": 0.5857870611868903, + "grad_norm": 1.1832739114761353, + "learning_rate": 7.33723081931958e-06, + "loss": 0.2863, + "step": 26390 + }, + { + "epoch": 0.585898047746418, + "grad_norm": 0.9504315257072449, + "learning_rate": 7.333870081817127e-06, + "loss": 0.4596, + "step": 26395 + }, + { + "epoch": 0.5860090343059455, + "grad_norm": 1.5113601684570312, + "learning_rate": 7.330509668453805e-06, + "loss": 0.3081, + "step": 26400 + }, + { + "epoch": 0.5861200208654732, + "grad_norm": 0.8444293737411499, + "learning_rate": 7.3271495796381666e-06, + "loss": 0.3587, + "step": 26405 + }, + { + "epoch": 0.5862310074250008, + "grad_norm": 2.0980753898620605, + "learning_rate": 7.323789815778718e-06, + "loss": 0.3862, + "step": 26410 + }, + { + "epoch": 0.5863419939845285, + "grad_norm": 0.6181811690330505, + "learning_rate": 7.320430377283928e-06, + "loss": 0.2972, + "step": 26415 + }, + { + "epoch": 0.5864529805440561, + "grad_norm": 1.166435718536377, + "learning_rate": 7.317071264562226e-06, + "loss": 0.5111, + "step": 26420 + }, + { + "epoch": 0.5865639671035837, + "grad_norm": 1.4199657440185547, + "learning_rate": 7.313712478022002e-06, + "loss": 0.3927, + "step": 26425 + }, + { + "epoch": 0.5866749536631114, + "grad_norm": 1.1458617448806763, + "learning_rate": 7.310354018071604e-06, + "loss": 0.423, + "step": 26430 + }, + { + "epoch": 0.5867859402226391, + "grad_norm": 0.5648505091667175, + "learning_rate": 7.306995885119349e-06, + "loss": 0.5081, + "step": 26435 + }, + { + "epoch": 0.5868969267821667, + "grad_norm": 0.9345181584358215, + "learning_rate": 7.303638079573498e-06, + "loss": 0.3507, + "step": 26440 + }, + { + "epoch": 0.5870079133416943, + "grad_norm": 1.25766122341156, + "learning_rate": 7.300280601842289e-06, + "loss": 0.4505, + "step": 26445 + }, + { + "epoch": 0.587118899901222, + "grad_norm": 1.497269868850708, + "learning_rate": 7.296923452333908e-06, + "loss": 0.4878, + "step": 26450 + }, + { + "epoch": 0.5872298864607496, + "grad_norm": 1.4120640754699707, + "learning_rate": 7.293566631456508e-06, + "loss": 0.4281, + "step": 26455 + }, + { + "epoch": 0.5873408730202773, + "grad_norm": 1.3898377418518066, + "learning_rate": 7.290210139618203e-06, + "loss": 0.3614, + "step": 26460 + }, + { + "epoch": 0.5874518595798048, + "grad_norm": 1.8048535585403442, + "learning_rate": 7.286853977227059e-06, + "loss": 0.4069, + "step": 26465 + }, + { + "epoch": 0.5875628461393325, + "grad_norm": 1.8236231803894043, + "learning_rate": 7.283498144691113e-06, + "loss": 0.3288, + "step": 26470 + }, + { + "epoch": 0.5876738326988602, + "grad_norm": 0.779463529586792, + "learning_rate": 7.2801426424183465e-06, + "loss": 0.4077, + "step": 26475 + }, + { + "epoch": 0.5877848192583878, + "grad_norm": 0.8819475769996643, + "learning_rate": 7.276787470816722e-06, + "loss": 0.4519, + "step": 26480 + }, + { + "epoch": 0.5878958058179155, + "grad_norm": 1.0383362770080566, + "learning_rate": 7.273432630294139e-06, + "loss": 0.3105, + "step": 26485 + }, + { + "epoch": 0.5880067923774431, + "grad_norm": 1.1080772876739502, + "learning_rate": 7.270078121258471e-06, + "loss": 0.3799, + "step": 26490 + }, + { + "epoch": 0.5881177789369707, + "grad_norm": 2.0721535682678223, + "learning_rate": 7.266723944117555e-06, + "loss": 0.3617, + "step": 26495 + }, + { + "epoch": 0.5882287654964984, + "grad_norm": 0.8842170238494873, + "learning_rate": 7.263370099279173e-06, + "loss": 0.3916, + "step": 26500 + }, + { + "epoch": 0.5883397520560261, + "grad_norm": 3.7884020805358887, + "learning_rate": 7.260016587151078e-06, + "loss": 0.3228, + "step": 26505 + }, + { + "epoch": 0.5884507386155536, + "grad_norm": 1.390315055847168, + "learning_rate": 7.256663408140974e-06, + "loss": 0.5956, + "step": 26510 + }, + { + "epoch": 0.5885617251750813, + "grad_norm": 0.9888474345207214, + "learning_rate": 7.253310562656531e-06, + "loss": 0.3319, + "step": 26515 + }, + { + "epoch": 0.5886727117346089, + "grad_norm": 1.2436730861663818, + "learning_rate": 7.249958051105383e-06, + "loss": 0.639, + "step": 26520 + }, + { + "epoch": 0.5887836982941366, + "grad_norm": 0.937451183795929, + "learning_rate": 7.246605873895109e-06, + "loss": 0.3847, + "step": 26525 + }, + { + "epoch": 0.5888946848536643, + "grad_norm": 0.7500023245811462, + "learning_rate": 7.243254031433262e-06, + "loss": 0.4246, + "step": 26530 + }, + { + "epoch": 0.5890056714131918, + "grad_norm": 1.2583023309707642, + "learning_rate": 7.2399025241273406e-06, + "loss": 0.4899, + "step": 26535 + }, + { + "epoch": 0.5891166579727195, + "grad_norm": 1.3657556772232056, + "learning_rate": 7.2365513523848156e-06, + "loss": 0.4028, + "step": 26540 + }, + { + "epoch": 0.5892276445322472, + "grad_norm": 1.4807183742523193, + "learning_rate": 7.233200516613109e-06, + "loss": 0.3921, + "step": 26545 + }, + { + "epoch": 0.5893386310917748, + "grad_norm": 1.4240939617156982, + "learning_rate": 7.2298500172196054e-06, + "loss": 0.3524, + "step": 26550 + }, + { + "epoch": 0.5894496176513024, + "grad_norm": 1.1841248273849487, + "learning_rate": 7.226499854611646e-06, + "loss": 0.4088, + "step": 26555 + }, + { + "epoch": 0.5895606042108301, + "grad_norm": 1.4342392683029175, + "learning_rate": 7.223150029196532e-06, + "loss": 0.4176, + "step": 26560 + }, + { + "epoch": 0.5896715907703577, + "grad_norm": 1.3130017518997192, + "learning_rate": 7.219800541381526e-06, + "loss": 0.4723, + "step": 26565 + }, + { + "epoch": 0.5897825773298854, + "grad_norm": 1.9191733598709106, + "learning_rate": 7.216451391573844e-06, + "loss": 0.3313, + "step": 26570 + }, + { + "epoch": 0.5898935638894129, + "grad_norm": 1.1599457263946533, + "learning_rate": 7.213102580180666e-06, + "loss": 0.4146, + "step": 26575 + }, + { + "epoch": 0.5900045504489406, + "grad_norm": 1.4810781478881836, + "learning_rate": 7.209754107609132e-06, + "loss": 0.4848, + "step": 26580 + }, + { + "epoch": 0.5901155370084683, + "grad_norm": 0.3967765271663666, + "learning_rate": 7.206405974266335e-06, + "loss": 0.297, + "step": 26585 + }, + { + "epoch": 0.5902265235679959, + "grad_norm": 1.0816624164581299, + "learning_rate": 7.203058180559332e-06, + "loss": 0.472, + "step": 26590 + }, + { + "epoch": 0.5903375101275236, + "grad_norm": 1.3460806608200073, + "learning_rate": 7.199710726895133e-06, + "loss": 0.5931, + "step": 26595 + }, + { + "epoch": 0.5904484966870512, + "grad_norm": 1.2487653493881226, + "learning_rate": 7.19636361368071e-06, + "loss": 0.454, + "step": 26600 + }, + { + "epoch": 0.5905594832465788, + "grad_norm": 1.078736424446106, + "learning_rate": 7.1930168413230025e-06, + "loss": 0.3498, + "step": 26605 + }, + { + "epoch": 0.5906704698061065, + "grad_norm": 1.2618157863616943, + "learning_rate": 7.189670410228889e-06, + "loss": 0.4946, + "step": 26610 + }, + { + "epoch": 0.5907814563656342, + "grad_norm": 0.9752281308174133, + "learning_rate": 7.186324320805226e-06, + "loss": 0.3529, + "step": 26615 + }, + { + "epoch": 0.5908924429251617, + "grad_norm": 0.8908410668373108, + "learning_rate": 7.182978573458811e-06, + "loss": 0.4021, + "step": 26620 + }, + { + "epoch": 0.5910034294846894, + "grad_norm": 1.5097835063934326, + "learning_rate": 7.1796331685964136e-06, + "loss": 0.3922, + "step": 26625 + }, + { + "epoch": 0.591114416044217, + "grad_norm": 1.578121304512024, + "learning_rate": 7.176288106624761e-06, + "loss": 0.2987, + "step": 26630 + }, + { + "epoch": 0.5912254026037447, + "grad_norm": 0.7530714273452759, + "learning_rate": 7.172943387950526e-06, + "loss": 0.3963, + "step": 26635 + }, + { + "epoch": 0.5913363891632724, + "grad_norm": 2.377854585647583, + "learning_rate": 7.169599012980359e-06, + "loss": 0.4138, + "step": 26640 + }, + { + "epoch": 0.5914473757227999, + "grad_norm": 0.8900545239448547, + "learning_rate": 7.166254982120845e-06, + "loss": 0.4248, + "step": 26645 + }, + { + "epoch": 0.5915583622823276, + "grad_norm": 1.3987830877304077, + "learning_rate": 7.162911295778552e-06, + "loss": 0.2741, + "step": 26650 + }, + { + "epoch": 0.5916693488418553, + "grad_norm": 1.3931851387023926, + "learning_rate": 7.159567954359983e-06, + "loss": 0.4107, + "step": 26655 + }, + { + "epoch": 0.5917803354013829, + "grad_norm": 0.9386917948722839, + "learning_rate": 7.15622495827162e-06, + "loss": 0.5291, + "step": 26660 + }, + { + "epoch": 0.5918913219609105, + "grad_norm": 0.991595983505249, + "learning_rate": 7.152882307919888e-06, + "loss": 0.4553, + "step": 26665 + }, + { + "epoch": 0.5920023085204382, + "grad_norm": 1.2766691446304321, + "learning_rate": 7.149540003711178e-06, + "loss": 0.6014, + "step": 26670 + }, + { + "epoch": 0.5921132950799658, + "grad_norm": 1.4079058170318604, + "learning_rate": 7.1461980460518335e-06, + "loss": 0.3876, + "step": 26675 + }, + { + "epoch": 0.5922242816394935, + "grad_norm": 2.7287020683288574, + "learning_rate": 7.142856435348159e-06, + "loss": 0.4923, + "step": 26680 + }, + { + "epoch": 0.592335268199021, + "grad_norm": 1.208175539970398, + "learning_rate": 7.139515172006416e-06, + "loss": 0.365, + "step": 26685 + }, + { + "epoch": 0.5924462547585487, + "grad_norm": 2.4387660026550293, + "learning_rate": 7.136174256432828e-06, + "loss": 0.5239, + "step": 26690 + }, + { + "epoch": 0.5925572413180764, + "grad_norm": 0.8286358714103699, + "learning_rate": 7.132833689033567e-06, + "loss": 0.449, + "step": 26695 + }, + { + "epoch": 0.592668227877604, + "grad_norm": 1.651318073272705, + "learning_rate": 7.129493470214775e-06, + "loss": 0.4033, + "step": 26700 + }, + { + "epoch": 0.5927792144371317, + "grad_norm": 0.869933009147644, + "learning_rate": 7.126153600382533e-06, + "loss": 0.3771, + "step": 26705 + }, + { + "epoch": 0.5928902009966593, + "grad_norm": 1.569030523300171, + "learning_rate": 7.122814079942899e-06, + "loss": 0.4554, + "step": 26710 + }, + { + "epoch": 0.5930011875561869, + "grad_norm": 0.9556633234024048, + "learning_rate": 7.119474909301886e-06, + "loss": 0.4001, + "step": 26715 + }, + { + "epoch": 0.5931121741157146, + "grad_norm": 1.1097278594970703, + "learning_rate": 7.1161360888654466e-06, + "loss": 0.4616, + "step": 26720 + }, + { + "epoch": 0.5932231606752423, + "grad_norm": 1.8948615789413452, + "learning_rate": 7.112797619039516e-06, + "loss": 0.3212, + "step": 26725 + }, + { + "epoch": 0.5933341472347698, + "grad_norm": 1.9563379287719727, + "learning_rate": 7.109459500229961e-06, + "loss": 0.3579, + "step": 26730 + }, + { + "epoch": 0.5934451337942975, + "grad_norm": 1.5273131132125854, + "learning_rate": 7.106121732842633e-06, + "loss": 0.4067, + "step": 26735 + }, + { + "epoch": 0.5935561203538251, + "grad_norm": 1.0542635917663574, + "learning_rate": 7.102784317283314e-06, + "loss": 0.5198, + "step": 26740 + }, + { + "epoch": 0.5936671069133528, + "grad_norm": 0.851750910282135, + "learning_rate": 7.09944725395776e-06, + "loss": 0.3944, + "step": 26745 + }, + { + "epoch": 0.5937780934728805, + "grad_norm": 1.0166863203048706, + "learning_rate": 7.096110543271686e-06, + "loss": 0.5288, + "step": 26750 + }, + { + "epoch": 0.593889080032408, + "grad_norm": 1.1966583728790283, + "learning_rate": 7.09277418563075e-06, + "loss": 0.421, + "step": 26755 + }, + { + "epoch": 0.5940000665919357, + "grad_norm": 0.9916155338287354, + "learning_rate": 7.089438181440582e-06, + "loss": 0.5078, + "step": 26760 + }, + { + "epoch": 0.5941110531514634, + "grad_norm": 0.8691633939743042, + "learning_rate": 7.086102531106755e-06, + "loss": 0.4073, + "step": 26765 + }, + { + "epoch": 0.594222039710991, + "grad_norm": 1.0698593854904175, + "learning_rate": 7.082767235034809e-06, + "loss": 0.3515, + "step": 26770 + }, + { + "epoch": 0.5943330262705186, + "grad_norm": 0.8931669592857361, + "learning_rate": 7.079432293630244e-06, + "loss": 0.4208, + "step": 26775 + }, + { + "epoch": 0.5944440128300463, + "grad_norm": 1.258162260055542, + "learning_rate": 7.0760977072985005e-06, + "loss": 0.4374, + "step": 26780 + }, + { + "epoch": 0.5945549993895739, + "grad_norm": 0.9825992584228516, + "learning_rate": 7.072763476444997e-06, + "loss": 0.4695, + "step": 26785 + }, + { + "epoch": 0.5946659859491016, + "grad_norm": 1.4318424463272095, + "learning_rate": 7.069429601475088e-06, + "loss": 0.3576, + "step": 26790 + }, + { + "epoch": 0.5947769725086292, + "grad_norm": 0.6295336484909058, + "learning_rate": 7.066096082794102e-06, + "loss": 0.3883, + "step": 26795 + }, + { + "epoch": 0.5948879590681568, + "grad_norm": 1.4859687089920044, + "learning_rate": 7.0627629208073144e-06, + "loss": 0.3997, + "step": 26800 + }, + { + "epoch": 0.5949989456276845, + "grad_norm": 0.9783358573913574, + "learning_rate": 7.0594301159199606e-06, + "loss": 0.3838, + "step": 26805 + }, + { + "epoch": 0.5951099321872121, + "grad_norm": 0.9241087436676025, + "learning_rate": 7.056097668537232e-06, + "loss": 0.4825, + "step": 26810 + }, + { + "epoch": 0.5952209187467398, + "grad_norm": 0.8490179777145386, + "learning_rate": 7.052765579064273e-06, + "loss": 0.3689, + "step": 26815 + }, + { + "epoch": 0.5953319053062675, + "grad_norm": 1.491245985031128, + "learning_rate": 7.049433847906194e-06, + "loss": 0.4896, + "step": 26820 + }, + { + "epoch": 0.595442891865795, + "grad_norm": 1.1904845237731934, + "learning_rate": 7.046102475468051e-06, + "loss": 0.2843, + "step": 26825 + }, + { + "epoch": 0.5955538784253227, + "grad_norm": 0.8467894792556763, + "learning_rate": 7.04277146215486e-06, + "loss": 0.4668, + "step": 26830 + }, + { + "epoch": 0.5956648649848504, + "grad_norm": 1.1466150283813477, + "learning_rate": 7.039440808371602e-06, + "loss": 0.4891, + "step": 26835 + }, + { + "epoch": 0.595775851544378, + "grad_norm": 0.8960450291633606, + "learning_rate": 7.036110514523197e-06, + "loss": 0.3969, + "step": 26840 + }, + { + "epoch": 0.5958868381039056, + "grad_norm": 1.353668212890625, + "learning_rate": 7.03278058101454e-06, + "loss": 0.4128, + "step": 26845 + }, + { + "epoch": 0.5959978246634332, + "grad_norm": 1.8645679950714111, + "learning_rate": 7.029451008250463e-06, + "loss": 0.3549, + "step": 26850 + }, + { + "epoch": 0.5961088112229609, + "grad_norm": 1.427398681640625, + "learning_rate": 7.026121796635772e-06, + "loss": 0.3455, + "step": 26855 + }, + { + "epoch": 0.5962197977824886, + "grad_norm": 0.7911583781242371, + "learning_rate": 7.022792946575222e-06, + "loss": 0.4177, + "step": 26860 + }, + { + "epoch": 0.5963307843420161, + "grad_norm": 1.1445287466049194, + "learning_rate": 7.019464458473518e-06, + "loss": 0.6095, + "step": 26865 + }, + { + "epoch": 0.5964417709015438, + "grad_norm": 0.8763558268547058, + "learning_rate": 7.016136332735332e-06, + "loss": 0.4996, + "step": 26870 + }, + { + "epoch": 0.5965527574610715, + "grad_norm": 1.2170817852020264, + "learning_rate": 7.012808569765279e-06, + "loss": 0.3265, + "step": 26875 + }, + { + "epoch": 0.5966637440205991, + "grad_norm": 1.1268202066421509, + "learning_rate": 7.009481169967943e-06, + "loss": 0.5045, + "step": 26880 + }, + { + "epoch": 0.5967747305801268, + "grad_norm": 1.6648198366165161, + "learning_rate": 7.006154133747861e-06, + "loss": 0.455, + "step": 26885 + }, + { + "epoch": 0.5968857171396544, + "grad_norm": 1.6047817468643188, + "learning_rate": 7.002827461509514e-06, + "loss": 0.266, + "step": 26890 + }, + { + "epoch": 0.596996703699182, + "grad_norm": 0.7733944058418274, + "learning_rate": 6.999501153657358e-06, + "loss": 0.3569, + "step": 26895 + }, + { + "epoch": 0.5971076902587097, + "grad_norm": 1.3187754154205322, + "learning_rate": 6.996175210595784e-06, + "loss": 0.2942, + "step": 26900 + }, + { + "epoch": 0.5972186768182373, + "grad_norm": 1.2069423198699951, + "learning_rate": 6.992849632729157e-06, + "loss": 0.4206, + "step": 26905 + }, + { + "epoch": 0.5973296633777649, + "grad_norm": 1.752915620803833, + "learning_rate": 6.989524420461784e-06, + "loss": 0.4684, + "step": 26910 + }, + { + "epoch": 0.5974406499372926, + "grad_norm": 1.001297950744629, + "learning_rate": 6.986199574197936e-06, + "loss": 0.4267, + "step": 26915 + }, + { + "epoch": 0.5975516364968202, + "grad_norm": 1.145909070968628, + "learning_rate": 6.982875094341838e-06, + "loss": 0.3435, + "step": 26920 + }, + { + "epoch": 0.5976626230563479, + "grad_norm": 1.3165746927261353, + "learning_rate": 6.979550981297666e-06, + "loss": 0.3841, + "step": 26925 + }, + { + "epoch": 0.5977736096158756, + "grad_norm": 1.3009177446365356, + "learning_rate": 6.976227235469557e-06, + "loss": 0.4375, + "step": 26930 + }, + { + "epoch": 0.5978845961754031, + "grad_norm": 1.1894065141677856, + "learning_rate": 6.972903857261599e-06, + "loss": 0.4092, + "step": 26935 + }, + { + "epoch": 0.5979955827349308, + "grad_norm": 0.9577630758285522, + "learning_rate": 6.969580847077836e-06, + "loss": 0.3938, + "step": 26940 + }, + { + "epoch": 0.5981065692944585, + "grad_norm": 0.8460142612457275, + "learning_rate": 6.966258205322274e-06, + "loss": 0.2974, + "step": 26945 + }, + { + "epoch": 0.5982175558539861, + "grad_norm": 1.2991278171539307, + "learning_rate": 6.962935932398862e-06, + "loss": 0.3644, + "step": 26950 + }, + { + "epoch": 0.5983285424135137, + "grad_norm": 0.9580723643302917, + "learning_rate": 6.959614028711517e-06, + "loss": 0.2362, + "step": 26955 + }, + { + "epoch": 0.5984395289730413, + "grad_norm": 0.9842463731765747, + "learning_rate": 6.956292494664098e-06, + "loss": 0.4265, + "step": 26960 + }, + { + "epoch": 0.598550515532569, + "grad_norm": 1.0971413850784302, + "learning_rate": 6.952971330660429e-06, + "loss": 0.3913, + "step": 26965 + }, + { + "epoch": 0.5986615020920967, + "grad_norm": 1.3643193244934082, + "learning_rate": 6.949650537104292e-06, + "loss": 0.4197, + "step": 26970 + }, + { + "epoch": 0.5987724886516242, + "grad_norm": 1.7623727321624756, + "learning_rate": 6.946330114399409e-06, + "loss": 0.4682, + "step": 26975 + }, + { + "epoch": 0.5988834752111519, + "grad_norm": 0.7582911849021912, + "learning_rate": 6.943010062949471e-06, + "loss": 0.3793, + "step": 26980 + }, + { + "epoch": 0.5989944617706796, + "grad_norm": 1.4799220561981201, + "learning_rate": 6.939690383158115e-06, + "loss": 0.3809, + "step": 26985 + }, + { + "epoch": 0.5991054483302072, + "grad_norm": 1.0741524696350098, + "learning_rate": 6.936371075428943e-06, + "loss": 0.3723, + "step": 26990 + }, + { + "epoch": 0.5992164348897349, + "grad_norm": 1.4326813220977783, + "learning_rate": 6.933052140165496e-06, + "loss": 0.5451, + "step": 26995 + }, + { + "epoch": 0.5993274214492625, + "grad_norm": 1.158925175666809, + "learning_rate": 6.9297335777712845e-06, + "loss": 0.4122, + "step": 27000 + }, + { + "epoch": 0.5994384080087901, + "grad_norm": 1.623766303062439, + "learning_rate": 6.926415388649772e-06, + "loss": 0.4549, + "step": 27005 + }, + { + "epoch": 0.5995493945683178, + "grad_norm": 0.7449182271957397, + "learning_rate": 6.923097573204365e-06, + "loss": 0.3833, + "step": 27010 + }, + { + "epoch": 0.5996603811278454, + "grad_norm": 1.0153058767318726, + "learning_rate": 6.919780131838438e-06, + "loss": 0.445, + "step": 27015 + }, + { + "epoch": 0.599771367687373, + "grad_norm": 1.8542829751968384, + "learning_rate": 6.91646306495531e-06, + "loss": 0.4383, + "step": 27020 + }, + { + "epoch": 0.5998823542469007, + "grad_norm": 1.3858826160430908, + "learning_rate": 6.913146372958263e-06, + "loss": 0.331, + "step": 27025 + }, + { + "epoch": 0.5999933408064283, + "grad_norm": 1.2759438753128052, + "learning_rate": 6.909830056250527e-06, + "loss": 0.334, + "step": 27030 + }, + { + "epoch": 0.600104327365956, + "grad_norm": 1.3611114025115967, + "learning_rate": 6.90651411523529e-06, + "loss": 0.3268, + "step": 27035 + }, + { + "epoch": 0.6002153139254837, + "grad_norm": 1.6142209768295288, + "learning_rate": 6.90319855031569e-06, + "loss": 0.407, + "step": 27040 + }, + { + "epoch": 0.6003263004850112, + "grad_norm": 1.0603861808776855, + "learning_rate": 6.899883361894827e-06, + "loss": 0.3074, + "step": 27045 + }, + { + "epoch": 0.6004372870445389, + "grad_norm": 1.0023366212844849, + "learning_rate": 6.896568550375744e-06, + "loss": 0.4157, + "step": 27050 + }, + { + "epoch": 0.6005482736040666, + "grad_norm": 1.9315853118896484, + "learning_rate": 6.893254116161454e-06, + "loss": 0.3693, + "step": 27055 + }, + { + "epoch": 0.6006592601635942, + "grad_norm": 0.5876302123069763, + "learning_rate": 6.889940059654905e-06, + "loss": 0.2404, + "step": 27060 + }, + { + "epoch": 0.6007702467231218, + "grad_norm": 1.1511262655258179, + "learning_rate": 6.886626381259016e-06, + "loss": 0.302, + "step": 27065 + }, + { + "epoch": 0.6008812332826494, + "grad_norm": 0.9467554092407227, + "learning_rate": 6.883313081376647e-06, + "loss": 0.5602, + "step": 27070 + }, + { + "epoch": 0.6009922198421771, + "grad_norm": 1.7370318174362183, + "learning_rate": 6.8800001604106246e-06, + "loss": 0.4617, + "step": 27075 + }, + { + "epoch": 0.6011032064017048, + "grad_norm": 2.1454873085021973, + "learning_rate": 6.876687618763716e-06, + "loss": 0.415, + "step": 27080 + }, + { + "epoch": 0.6012141929612324, + "grad_norm": 1.8225048780441284, + "learning_rate": 6.873375456838652e-06, + "loss": 0.4102, + "step": 27085 + }, + { + "epoch": 0.60132517952076, + "grad_norm": 1.0236406326293945, + "learning_rate": 6.870063675038117e-06, + "loss": 0.349, + "step": 27090 + }, + { + "epoch": 0.6014361660802877, + "grad_norm": 0.8470390439033508, + "learning_rate": 6.8667522737647395e-06, + "loss": 0.5173, + "step": 27095 + }, + { + "epoch": 0.6015471526398153, + "grad_norm": 1.1244659423828125, + "learning_rate": 6.863441253421117e-06, + "loss": 0.4413, + "step": 27100 + }, + { + "epoch": 0.601658139199343, + "grad_norm": 1.3510165214538574, + "learning_rate": 6.860130614409784e-06, + "loss": 0.4308, + "step": 27105 + }, + { + "epoch": 0.6017691257588706, + "grad_norm": 1.0276979207992554, + "learning_rate": 6.856820357133239e-06, + "loss": 0.5655, + "step": 27110 + }, + { + "epoch": 0.6018801123183982, + "grad_norm": 0.9559340476989746, + "learning_rate": 6.853510481993939e-06, + "loss": 0.3937, + "step": 27115 + }, + { + "epoch": 0.6019910988779259, + "grad_norm": 1.581098198890686, + "learning_rate": 6.850200989394278e-06, + "loss": 0.3655, + "step": 27120 + }, + { + "epoch": 0.6021020854374535, + "grad_norm": 0.9266625642776489, + "learning_rate": 6.846891879736622e-06, + "loss": 0.4016, + "step": 27125 + }, + { + "epoch": 0.6022130719969812, + "grad_norm": 1.1542038917541504, + "learning_rate": 6.84358315342327e-06, + "loss": 0.4539, + "step": 27130 + }, + { + "epoch": 0.6023240585565088, + "grad_norm": 1.1682536602020264, + "learning_rate": 6.840274810856493e-06, + "loss": 0.4021, + "step": 27135 + }, + { + "epoch": 0.6024350451160364, + "grad_norm": 1.201403021812439, + "learning_rate": 6.836966852438514e-06, + "loss": 0.396, + "step": 27140 + }, + { + "epoch": 0.6025460316755641, + "grad_norm": 1.5981374979019165, + "learning_rate": 6.833659278571491e-06, + "loss": 0.4464, + "step": 27145 + }, + { + "epoch": 0.6026570182350918, + "grad_norm": 1.264754056930542, + "learning_rate": 6.830352089657557e-06, + "loss": 0.4714, + "step": 27150 + }, + { + "epoch": 0.6027680047946193, + "grad_norm": 1.026802659034729, + "learning_rate": 6.827045286098784e-06, + "loss": 0.5208, + "step": 27155 + }, + { + "epoch": 0.602878991354147, + "grad_norm": 1.1696611642837524, + "learning_rate": 6.823738868297207e-06, + "loss": 0.4067, + "step": 27160 + }, + { + "epoch": 0.6029899779136747, + "grad_norm": 1.1774004697799683, + "learning_rate": 6.820432836654802e-06, + "loss": 0.3677, + "step": 27165 + }, + { + "epoch": 0.6031009644732023, + "grad_norm": 1.192420482635498, + "learning_rate": 6.817127191573511e-06, + "loss": 0.3367, + "step": 27170 + }, + { + "epoch": 0.60321195103273, + "grad_norm": 0.8283929824829102, + "learning_rate": 6.813821933455222e-06, + "loss": 0.4364, + "step": 27175 + }, + { + "epoch": 0.6033229375922575, + "grad_norm": 1.1864694356918335, + "learning_rate": 6.810517062701776e-06, + "loss": 0.4281, + "step": 27180 + }, + { + "epoch": 0.6034339241517852, + "grad_norm": 1.2099741697311401, + "learning_rate": 6.80721257971497e-06, + "loss": 0.4737, + "step": 27185 + }, + { + "epoch": 0.6035449107113129, + "grad_norm": 1.3351482152938843, + "learning_rate": 6.80390848489655e-06, + "loss": 0.5003, + "step": 27190 + }, + { + "epoch": 0.6036558972708405, + "grad_norm": 1.6452354192733765, + "learning_rate": 6.800604778648216e-06, + "loss": 0.4329, + "step": 27195 + }, + { + "epoch": 0.6037668838303681, + "grad_norm": 1.4642529487609863, + "learning_rate": 6.797301461371626e-06, + "loss": 0.4045, + "step": 27200 + }, + { + "epoch": 0.6038778703898958, + "grad_norm": 1.5834846496582031, + "learning_rate": 6.79399853346838e-06, + "loss": 0.3889, + "step": 27205 + }, + { + "epoch": 0.6039888569494234, + "grad_norm": 1.0351002216339111, + "learning_rate": 6.790695995340044e-06, + "loss": 0.301, + "step": 27210 + }, + { + "epoch": 0.6040998435089511, + "grad_norm": 0.9682518243789673, + "learning_rate": 6.787393847388122e-06, + "loss": 0.2879, + "step": 27215 + }, + { + "epoch": 0.6042108300684788, + "grad_norm": 1.2477974891662598, + "learning_rate": 6.784092090014083e-06, + "loss": 0.4827, + "step": 27220 + }, + { + "epoch": 0.6043218166280063, + "grad_norm": 0.8728592991828918, + "learning_rate": 6.7807907236193436e-06, + "loss": 0.4047, + "step": 27225 + }, + { + "epoch": 0.604432803187534, + "grad_norm": 1.7829116582870483, + "learning_rate": 6.777489748605271e-06, + "loss": 0.4008, + "step": 27230 + }, + { + "epoch": 0.6045437897470616, + "grad_norm": 1.3560502529144287, + "learning_rate": 6.774189165373188e-06, + "loss": 0.3258, + "step": 27235 + }, + { + "epoch": 0.6046547763065893, + "grad_norm": 0.855907142162323, + "learning_rate": 6.770888974324365e-06, + "loss": 0.4336, + "step": 27240 + }, + { + "epoch": 0.6047657628661169, + "grad_norm": 1.0651112794876099, + "learning_rate": 6.767589175860032e-06, + "loss": 0.4607, + "step": 27245 + }, + { + "epoch": 0.6048767494256445, + "grad_norm": 1.178040862083435, + "learning_rate": 6.7642897703813695e-06, + "loss": 0.3974, + "step": 27250 + }, + { + "epoch": 0.6049877359851722, + "grad_norm": 1.5252591371536255, + "learning_rate": 6.7609907582895005e-06, + "loss": 0.4059, + "step": 27255 + }, + { + "epoch": 0.6050987225446999, + "grad_norm": 1.074078917503357, + "learning_rate": 6.757692139985517e-06, + "loss": 0.4739, + "step": 27260 + }, + { + "epoch": 0.6052097091042274, + "grad_norm": 1.1060175895690918, + "learning_rate": 6.754393915870445e-06, + "loss": 0.3433, + "step": 27265 + }, + { + "epoch": 0.6053206956637551, + "grad_norm": 0.9834054112434387, + "learning_rate": 6.751096086345279e-06, + "loss": 0.3567, + "step": 27270 + }, + { + "epoch": 0.6054316822232828, + "grad_norm": 1.5103505849838257, + "learning_rate": 6.747798651810953e-06, + "loss": 0.4983, + "step": 27275 + }, + { + "epoch": 0.6055426687828104, + "grad_norm": 1.1536871194839478, + "learning_rate": 6.74450161266836e-06, + "loss": 0.3651, + "step": 27280 + }, + { + "epoch": 0.6056536553423381, + "grad_norm": 1.0520485639572144, + "learning_rate": 6.741204969318343e-06, + "loss": 0.4183, + "step": 27285 + }, + { + "epoch": 0.6057646419018656, + "grad_norm": 1.0779529809951782, + "learning_rate": 6.7379087221616965e-06, + "loss": 0.4373, + "step": 27290 + }, + { + "epoch": 0.6058756284613933, + "grad_norm": 1.3279848098754883, + "learning_rate": 6.734612871599169e-06, + "loss": 0.4022, + "step": 27295 + }, + { + "epoch": 0.605986615020921, + "grad_norm": 1.6855266094207764, + "learning_rate": 6.731317418031456e-06, + "loss": 0.4741, + "step": 27300 + }, + { + "epoch": 0.6060976015804486, + "grad_norm": 1.8417173624038696, + "learning_rate": 6.728022361859208e-06, + "loss": 0.3657, + "step": 27305 + }, + { + "epoch": 0.6062085881399762, + "grad_norm": 0.34610217809677124, + "learning_rate": 6.72472770348303e-06, + "loss": 0.3711, + "step": 27310 + }, + { + "epoch": 0.6063195746995039, + "grad_norm": 0.9657518863677979, + "learning_rate": 6.721433443303471e-06, + "loss": 0.4311, + "step": 27315 + }, + { + "epoch": 0.6064305612590315, + "grad_norm": 1.4196455478668213, + "learning_rate": 6.7181395817210415e-06, + "loss": 0.4459, + "step": 27320 + }, + { + "epoch": 0.6065415478185592, + "grad_norm": 0.5634430646896362, + "learning_rate": 6.714846119136192e-06, + "loss": 0.4148, + "step": 27325 + }, + { + "epoch": 0.6066525343780869, + "grad_norm": 0.9256958365440369, + "learning_rate": 6.711553055949333e-06, + "loss": 0.4587, + "step": 27330 + }, + { + "epoch": 0.6067635209376144, + "grad_norm": 1.0506219863891602, + "learning_rate": 6.70826039256083e-06, + "loss": 0.3011, + "step": 27335 + }, + { + "epoch": 0.6068745074971421, + "grad_norm": 0.6727412939071655, + "learning_rate": 6.7049681293709836e-06, + "loss": 0.4911, + "step": 27340 + }, + { + "epoch": 0.6069854940566697, + "grad_norm": 1.085914969444275, + "learning_rate": 6.701676266780066e-06, + "loss": 0.3931, + "step": 27345 + }, + { + "epoch": 0.6070964806161974, + "grad_norm": 2.1826765537261963, + "learning_rate": 6.698384805188283e-06, + "loss": 0.3863, + "step": 27350 + }, + { + "epoch": 0.607207467175725, + "grad_norm": 0.9371523261070251, + "learning_rate": 6.695093744995806e-06, + "loss": 0.4066, + "step": 27355 + }, + { + "epoch": 0.6073184537352526, + "grad_norm": 0.7349005937576294, + "learning_rate": 6.6918030866027415e-06, + "loss": 0.3822, + "step": 27360 + }, + { + "epoch": 0.6074294402947803, + "grad_norm": 0.9338274002075195, + "learning_rate": 6.688512830409167e-06, + "loss": 0.3196, + "step": 27365 + }, + { + "epoch": 0.607540426854308, + "grad_norm": 1.0718753337860107, + "learning_rate": 6.6852229768150976e-06, + "loss": 0.3626, + "step": 27370 + }, + { + "epoch": 0.6076514134138356, + "grad_norm": 1.2759748697280884, + "learning_rate": 6.681933526220499e-06, + "loss": 0.3641, + "step": 27375 + }, + { + "epoch": 0.6077623999733632, + "grad_norm": 1.4225863218307495, + "learning_rate": 6.678644479025298e-06, + "loss": 0.3547, + "step": 27380 + }, + { + "epoch": 0.6078733865328909, + "grad_norm": 0.8354029059410095, + "learning_rate": 6.675355835629358e-06, + "loss": 0.4423, + "step": 27385 + }, + { + "epoch": 0.6079843730924185, + "grad_norm": 1.19036865234375, + "learning_rate": 6.672067596432506e-06, + "loss": 0.5164, + "step": 27390 + }, + { + "epoch": 0.6080953596519462, + "grad_norm": 1.3374768495559692, + "learning_rate": 6.668779761834518e-06, + "loss": 0.5144, + "step": 27395 + }, + { + "epoch": 0.6082063462114737, + "grad_norm": 1.3295682668685913, + "learning_rate": 6.665492332235111e-06, + "loss": 0.4183, + "step": 27400 + }, + { + "epoch": 0.6083173327710014, + "grad_norm": 1.1211193799972534, + "learning_rate": 6.6622053080339666e-06, + "loss": 0.4292, + "step": 27405 + }, + { + "epoch": 0.6084283193305291, + "grad_norm": 1.4494692087173462, + "learning_rate": 6.658918689630706e-06, + "loss": 0.4577, + "step": 27410 + }, + { + "epoch": 0.6085393058900567, + "grad_norm": 0.9899317026138306, + "learning_rate": 6.6556324774249025e-06, + "loss": 0.2767, + "step": 27415 + }, + { + "epoch": 0.6086502924495844, + "grad_norm": 1.7873963117599487, + "learning_rate": 6.652346671816092e-06, + "loss": 0.5098, + "step": 27420 + }, + { + "epoch": 0.608761279009112, + "grad_norm": 0.9044897556304932, + "learning_rate": 6.649061273203741e-06, + "loss": 0.329, + "step": 27425 + }, + { + "epoch": 0.6088722655686396, + "grad_norm": 1.0911178588867188, + "learning_rate": 6.645776281987286e-06, + "loss": 0.2071, + "step": 27430 + }, + { + "epoch": 0.6089832521281673, + "grad_norm": 1.748152256011963, + "learning_rate": 6.642491698566098e-06, + "loss": 0.4249, + "step": 27435 + }, + { + "epoch": 0.609094238687695, + "grad_norm": 1.0939466953277588, + "learning_rate": 6.639207523339512e-06, + "loss": 0.3096, + "step": 27440 + }, + { + "epoch": 0.6092052252472225, + "grad_norm": 1.5443814992904663, + "learning_rate": 6.635923756706801e-06, + "loss": 0.4344, + "step": 27445 + }, + { + "epoch": 0.6093162118067502, + "grad_norm": 0.6851821541786194, + "learning_rate": 6.632640399067197e-06, + "loss": 0.3742, + "step": 27450 + }, + { + "epoch": 0.6094271983662778, + "grad_norm": 0.9900246262550354, + "learning_rate": 6.629357450819885e-06, + "loss": 0.5785, + "step": 27455 + }, + { + "epoch": 0.6095381849258055, + "grad_norm": 1.3923543691635132, + "learning_rate": 6.626074912363985e-06, + "loss": 0.3563, + "step": 27460 + }, + { + "epoch": 0.6096491714853332, + "grad_norm": 1.3146992921829224, + "learning_rate": 6.622792784098586e-06, + "loss": 0.2434, + "step": 27465 + }, + { + "epoch": 0.6097601580448607, + "grad_norm": 0.7010953426361084, + "learning_rate": 6.61951106642271e-06, + "loss": 0.3297, + "step": 27470 + }, + { + "epoch": 0.6098711446043884, + "grad_norm": 0.9393205642700195, + "learning_rate": 6.616229759735342e-06, + "loss": 0.3292, + "step": 27475 + }, + { + "epoch": 0.6099821311639161, + "grad_norm": 1.6617045402526855, + "learning_rate": 6.612948864435415e-06, + "loss": 0.4001, + "step": 27480 + }, + { + "epoch": 0.6100931177234437, + "grad_norm": 1.3801769018173218, + "learning_rate": 6.609668380921801e-06, + "loss": 0.3048, + "step": 27485 + }, + { + "epoch": 0.6102041042829713, + "grad_norm": 2.336073398590088, + "learning_rate": 6.6063883095933405e-06, + "loss": 0.4305, + "step": 27490 + }, + { + "epoch": 0.610315090842499, + "grad_norm": 1.2219001054763794, + "learning_rate": 6.603108650848802e-06, + "loss": 0.4475, + "step": 27495 + }, + { + "epoch": 0.6104260774020266, + "grad_norm": 1.3967128992080688, + "learning_rate": 6.599829405086924e-06, + "loss": 0.3382, + "step": 27500 + }, + { + "epoch": 0.6105370639615543, + "grad_norm": 0.959255039691925, + "learning_rate": 6.596550572706386e-06, + "loss": 0.4155, + "step": 27505 + }, + { + "epoch": 0.6106480505210818, + "grad_norm": 1.337578296661377, + "learning_rate": 6.593272154105811e-06, + "loss": 0.564, + "step": 27510 + }, + { + "epoch": 0.6107590370806095, + "grad_norm": 1.1137542724609375, + "learning_rate": 6.589994149683787e-06, + "loss": 0.5698, + "step": 27515 + }, + { + "epoch": 0.6108700236401372, + "grad_norm": 1.5539424419403076, + "learning_rate": 6.586716559838832e-06, + "loss": 0.4567, + "step": 27520 + }, + { + "epoch": 0.6109810101996648, + "grad_norm": 0.9463354349136353, + "learning_rate": 6.583439384969437e-06, + "loss": 0.5832, + "step": 27525 + }, + { + "epoch": 0.6110919967591925, + "grad_norm": 2.1490676403045654, + "learning_rate": 6.580162625474018e-06, + "loss": 0.5059, + "step": 27530 + }, + { + "epoch": 0.6112029833187201, + "grad_norm": 1.1205565929412842, + "learning_rate": 6.57688628175096e-06, + "loss": 0.4286, + "step": 27535 + }, + { + "epoch": 0.6113139698782477, + "grad_norm": 1.2223470211029053, + "learning_rate": 6.573610354198587e-06, + "loss": 0.4103, + "step": 27540 + }, + { + "epoch": 0.6114249564377754, + "grad_norm": 1.0583183765411377, + "learning_rate": 6.5703348432151784e-06, + "loss": 0.4206, + "step": 27545 + }, + { + "epoch": 0.6115359429973031, + "grad_norm": 1.509799599647522, + "learning_rate": 6.567059749198954e-06, + "loss": 0.3284, + "step": 27550 + }, + { + "epoch": 0.6116469295568306, + "grad_norm": 0.9263482093811035, + "learning_rate": 6.5637850725480945e-06, + "loss": 0.4801, + "step": 27555 + }, + { + "epoch": 0.6117579161163583, + "grad_norm": 0.9162907600402832, + "learning_rate": 6.560510813660719e-06, + "loss": 0.5102, + "step": 27560 + }, + { + "epoch": 0.6118689026758859, + "grad_norm": 1.2033370733261108, + "learning_rate": 6.557236972934907e-06, + "loss": 0.3699, + "step": 27565 + }, + { + "epoch": 0.6119798892354136, + "grad_norm": 1.1790424585342407, + "learning_rate": 6.5539635507686735e-06, + "loss": 0.5855, + "step": 27570 + }, + { + "epoch": 0.6120908757949413, + "grad_norm": 1.3403029441833496, + "learning_rate": 6.55069054756e-06, + "loss": 0.4091, + "step": 27575 + }, + { + "epoch": 0.6122018623544688, + "grad_norm": 0.9827760457992554, + "learning_rate": 6.547417963706797e-06, + "loss": 0.4158, + "step": 27580 + }, + { + "epoch": 0.6123128489139965, + "grad_norm": 0.9233430624008179, + "learning_rate": 6.544145799606938e-06, + "loss": 0.3699, + "step": 27585 + }, + { + "epoch": 0.6124238354735242, + "grad_norm": 0.7052626609802246, + "learning_rate": 6.540874055658249e-06, + "loss": 0.2455, + "step": 27590 + }, + { + "epoch": 0.6125348220330518, + "grad_norm": 1.3502213954925537, + "learning_rate": 6.537602732258485e-06, + "loss": 0.497, + "step": 27595 + }, + { + "epoch": 0.6126458085925794, + "grad_norm": 1.0900074243545532, + "learning_rate": 6.534331829805373e-06, + "loss": 0.4483, + "step": 27600 + }, + { + "epoch": 0.6127567951521071, + "grad_norm": 1.11318838596344, + "learning_rate": 6.53106134869657e-06, + "loss": 0.3535, + "step": 27605 + }, + { + "epoch": 0.6128677817116347, + "grad_norm": 0.7351371049880981, + "learning_rate": 6.527791289329699e-06, + "loss": 0.3415, + "step": 27610 + }, + { + "epoch": 0.6129787682711624, + "grad_norm": 1.8652602434158325, + "learning_rate": 6.524521652102315e-06, + "loss": 0.4084, + "step": 27615 + }, + { + "epoch": 0.61308975483069, + "grad_norm": 0.9447879195213318, + "learning_rate": 6.5212524374119315e-06, + "loss": 0.3778, + "step": 27620 + }, + { + "epoch": 0.6132007413902176, + "grad_norm": 2.6146841049194336, + "learning_rate": 6.517983645656014e-06, + "loss": 0.463, + "step": 27625 + }, + { + "epoch": 0.6133117279497453, + "grad_norm": 1.130544900894165, + "learning_rate": 6.514715277231963e-06, + "loss": 0.5154, + "step": 27630 + }, + { + "epoch": 0.6134227145092729, + "grad_norm": 0.8905532956123352, + "learning_rate": 6.5114473325371445e-06, + "loss": 0.444, + "step": 27635 + }, + { + "epoch": 0.6135337010688006, + "grad_norm": 1.8041813373565674, + "learning_rate": 6.508179811968855e-06, + "loss": 0.4868, + "step": 27640 + }, + { + "epoch": 0.6136446876283282, + "grad_norm": 0.8101031184196472, + "learning_rate": 6.504912715924355e-06, + "loss": 0.6036, + "step": 27645 + }, + { + "epoch": 0.6137556741878558, + "grad_norm": 1.235679268836975, + "learning_rate": 6.501646044800847e-06, + "loss": 0.5193, + "step": 27650 + }, + { + "epoch": 0.6138666607473835, + "grad_norm": 0.6435564160346985, + "learning_rate": 6.498379798995478e-06, + "loss": 0.3598, + "step": 27655 + }, + { + "epoch": 0.6139776473069112, + "grad_norm": 0.9466953277587891, + "learning_rate": 6.495113978905351e-06, + "loss": 0.3198, + "step": 27660 + }, + { + "epoch": 0.6140886338664387, + "grad_norm": 1.6178840398788452, + "learning_rate": 6.4918485849275116e-06, + "loss": 0.4214, + "step": 27665 + }, + { + "epoch": 0.6141996204259664, + "grad_norm": 1.2968192100524902, + "learning_rate": 6.488583617458955e-06, + "loss": 0.3609, + "step": 27670 + }, + { + "epoch": 0.6143106069854941, + "grad_norm": 1.3874645233154297, + "learning_rate": 6.485319076896628e-06, + "loss": 0.4386, + "step": 27675 + }, + { + "epoch": 0.6144215935450217, + "grad_norm": 0.7128292322158813, + "learning_rate": 6.482054963637416e-06, + "loss": 0.5341, + "step": 27680 + }, + { + "epoch": 0.6145325801045494, + "grad_norm": 1.555774211883545, + "learning_rate": 6.478791278078169e-06, + "loss": 0.4607, + "step": 27685 + }, + { + "epoch": 0.6146435666640769, + "grad_norm": 1.5064046382904053, + "learning_rate": 6.475528020615665e-06, + "loss": 0.4431, + "step": 27690 + }, + { + "epoch": 0.6147545532236046, + "grad_norm": 1.5725188255310059, + "learning_rate": 6.472265191646647e-06, + "loss": 0.4654, + "step": 27695 + }, + { + "epoch": 0.6148655397831323, + "grad_norm": 1.4065055847167969, + "learning_rate": 6.469002791567792e-06, + "loss": 0.5809, + "step": 27700 + }, + { + "epoch": 0.6149765263426599, + "grad_norm": 1.1861436367034912, + "learning_rate": 6.4657408207757365e-06, + "loss": 0.3904, + "step": 27705 + }, + { + "epoch": 0.6150875129021875, + "grad_norm": 1.070533037185669, + "learning_rate": 6.4624792796670624e-06, + "loss": 0.3536, + "step": 27710 + }, + { + "epoch": 0.6151984994617152, + "grad_norm": 1.5751197338104248, + "learning_rate": 6.459218168638291e-06, + "loss": 0.5824, + "step": 27715 + }, + { + "epoch": 0.6153094860212428, + "grad_norm": 0.9858487248420715, + "learning_rate": 6.4559574880859015e-06, + "loss": 0.4948, + "step": 27720 + }, + { + "epoch": 0.6154204725807705, + "grad_norm": 1.114532709121704, + "learning_rate": 6.452697238406311e-06, + "loss": 0.5044, + "step": 27725 + }, + { + "epoch": 0.6155314591402982, + "grad_norm": 1.7676235437393188, + "learning_rate": 6.449437419995894e-06, + "loss": 0.3233, + "step": 27730 + }, + { + "epoch": 0.6156424456998257, + "grad_norm": 1.1269664764404297, + "learning_rate": 6.446178033250973e-06, + "loss": 0.3921, + "step": 27735 + }, + { + "epoch": 0.6157534322593534, + "grad_norm": 1.5101771354675293, + "learning_rate": 6.442919078567803e-06, + "loss": 0.4236, + "step": 27740 + }, + { + "epoch": 0.615864418818881, + "grad_norm": 0.6449528336524963, + "learning_rate": 6.439660556342606e-06, + "loss": 0.3422, + "step": 27745 + }, + { + "epoch": 0.6159754053784087, + "grad_norm": 1.5987694263458252, + "learning_rate": 6.436402466971534e-06, + "loss": 0.4795, + "step": 27750 + }, + { + "epoch": 0.6160863919379364, + "grad_norm": 1.3472490310668945, + "learning_rate": 6.4331448108507e-06, + "loss": 0.5204, + "step": 27755 + }, + { + "epoch": 0.6161973784974639, + "grad_norm": 1.8186256885528564, + "learning_rate": 6.42988758837616e-06, + "loss": 0.3725, + "step": 27760 + }, + { + "epoch": 0.6163083650569916, + "grad_norm": 0.9955191016197205, + "learning_rate": 6.426630799943911e-06, + "loss": 0.3704, + "step": 27765 + }, + { + "epoch": 0.6164193516165193, + "grad_norm": 1.6446115970611572, + "learning_rate": 6.423374445949908e-06, + "loss": 0.5087, + "step": 27770 + }, + { + "epoch": 0.6165303381760469, + "grad_norm": 0.7298675179481506, + "learning_rate": 6.420118526790041e-06, + "loss": 0.4841, + "step": 27775 + }, + { + "epoch": 0.6166413247355745, + "grad_norm": 3.0948612689971924, + "learning_rate": 6.416863042860162e-06, + "loss": 0.6477, + "step": 27780 + }, + { + "epoch": 0.6167523112951022, + "grad_norm": 0.7705976366996765, + "learning_rate": 6.4136079945560524e-06, + "loss": 0.3469, + "step": 27785 + }, + { + "epoch": 0.6168632978546298, + "grad_norm": 1.0038697719573975, + "learning_rate": 6.410353382273458e-06, + "loss": 0.4518, + "step": 27790 + }, + { + "epoch": 0.6169742844141575, + "grad_norm": 1.2131325006484985, + "learning_rate": 6.4070992064080606e-06, + "loss": 0.4135, + "step": 27795 + }, + { + "epoch": 0.617085270973685, + "grad_norm": 1.26372230052948, + "learning_rate": 6.4038454673554915e-06, + "loss": 0.2634, + "step": 27800 + }, + { + "epoch": 0.6171962575332127, + "grad_norm": 1.0183274745941162, + "learning_rate": 6.4005921655113305e-06, + "loss": 0.3646, + "step": 27805 + }, + { + "epoch": 0.6173072440927404, + "grad_norm": 1.1538991928100586, + "learning_rate": 6.397339301271103e-06, + "loss": 0.4179, + "step": 27810 + }, + { + "epoch": 0.617418230652268, + "grad_norm": 0.9333328604698181, + "learning_rate": 6.3940868750302774e-06, + "loss": 0.2912, + "step": 27815 + }, + { + "epoch": 0.6175292172117957, + "grad_norm": 1.7044806480407715, + "learning_rate": 6.39083488718428e-06, + "loss": 0.2643, + "step": 27820 + }, + { + "epoch": 0.6176402037713233, + "grad_norm": 1.2348930835723877, + "learning_rate": 6.387583338128471e-06, + "loss": 0.4042, + "step": 27825 + }, + { + "epoch": 0.6177511903308509, + "grad_norm": 1.1704115867614746, + "learning_rate": 6.384332228258168e-06, + "loss": 0.4946, + "step": 27830 + }, + { + "epoch": 0.6178621768903786, + "grad_norm": 1.7322683334350586, + "learning_rate": 6.3810815579686225e-06, + "loss": 0.4124, + "step": 27835 + }, + { + "epoch": 0.6179731634499063, + "grad_norm": 1.5869555473327637, + "learning_rate": 6.377831327655043e-06, + "loss": 0.4499, + "step": 27840 + }, + { + "epoch": 0.6180841500094338, + "grad_norm": 0.9546343684196472, + "learning_rate": 6.374581537712588e-06, + "loss": 0.4926, + "step": 27845 + }, + { + "epoch": 0.6181951365689615, + "grad_norm": 1.1583161354064941, + "learning_rate": 6.371332188536347e-06, + "loss": 0.4863, + "step": 27850 + }, + { + "epoch": 0.6183061231284891, + "grad_norm": 0.680205225944519, + "learning_rate": 6.368083280521372e-06, + "loss": 0.3792, + "step": 27855 + }, + { + "epoch": 0.6184171096880168, + "grad_norm": 1.0736056566238403, + "learning_rate": 6.364834814062648e-06, + "loss": 0.5166, + "step": 27860 + }, + { + "epoch": 0.6185280962475445, + "grad_norm": 1.0722342729568481, + "learning_rate": 6.361586789555121e-06, + "loss": 0.337, + "step": 27865 + }, + { + "epoch": 0.618639082807072, + "grad_norm": 1.02963125705719, + "learning_rate": 6.358339207393663e-06, + "loss": 0.1989, + "step": 27870 + }, + { + "epoch": 0.6187500693665997, + "grad_norm": 0.7019577026367188, + "learning_rate": 6.3550920679731134e-06, + "loss": 0.492, + "step": 27875 + }, + { + "epoch": 0.6188610559261274, + "grad_norm": 0.9072814583778381, + "learning_rate": 6.35184537168825e-06, + "loss": 0.5211, + "step": 27880 + }, + { + "epoch": 0.618972042485655, + "grad_norm": 1.3982828855514526, + "learning_rate": 6.348599118933786e-06, + "loss": 0.4369, + "step": 27885 + }, + { + "epoch": 0.6190830290451826, + "grad_norm": 2.1749801635742188, + "learning_rate": 6.3453533101044e-06, + "loss": 0.3617, + "step": 27890 + }, + { + "epoch": 0.6191940156047103, + "grad_norm": 1.2338169813156128, + "learning_rate": 6.342107945594698e-06, + "loss": 0.5012, + "step": 27895 + }, + { + "epoch": 0.6193050021642379, + "grad_norm": 1.5822906494140625, + "learning_rate": 6.3388630257992455e-06, + "loss": 0.4334, + "step": 27900 + }, + { + "epoch": 0.6194159887237656, + "grad_norm": 1.246267318725586, + "learning_rate": 6.335618551112548e-06, + "loss": 0.327, + "step": 27905 + }, + { + "epoch": 0.6195269752832931, + "grad_norm": 1.0501974821090698, + "learning_rate": 6.332374521929059e-06, + "loss": 0.4558, + "step": 27910 + }, + { + "epoch": 0.6196379618428208, + "grad_norm": 1.8456707000732422, + "learning_rate": 6.3291309386431744e-06, + "loss": 0.3494, + "step": 27915 + }, + { + "epoch": 0.6197489484023485, + "grad_norm": 1.3171080350875854, + "learning_rate": 6.32588780164924e-06, + "loss": 0.2822, + "step": 27920 + }, + { + "epoch": 0.6198599349618761, + "grad_norm": 1.881778597831726, + "learning_rate": 6.322645111341541e-06, + "loss": 0.4102, + "step": 27925 + }, + { + "epoch": 0.6199709215214038, + "grad_norm": 0.7797800898551941, + "learning_rate": 6.319402868114321e-06, + "loss": 0.3568, + "step": 27930 + }, + { + "epoch": 0.6200819080809314, + "grad_norm": 1.3908886909484863, + "learning_rate": 6.3161610723617525e-06, + "loss": 0.5446, + "step": 27935 + }, + { + "epoch": 0.620192894640459, + "grad_norm": 0.75746089220047, + "learning_rate": 6.3129197244779715e-06, + "loss": 0.3518, + "step": 27940 + }, + { + "epoch": 0.6203038811999867, + "grad_norm": 0.9157155156135559, + "learning_rate": 6.309678824857039e-06, + "loss": 0.3124, + "step": 27945 + }, + { + "epoch": 0.6204148677595144, + "grad_norm": 1.303605556488037, + "learning_rate": 6.306438373892985e-06, + "loss": 0.4871, + "step": 27950 + }, + { + "epoch": 0.620525854319042, + "grad_norm": 1.2030810117721558, + "learning_rate": 6.30319837197976e-06, + "loss": 0.2725, + "step": 27955 + }, + { + "epoch": 0.6206368408785696, + "grad_norm": 1.1164860725402832, + "learning_rate": 6.2999588195112806e-06, + "loss": 0.4663, + "step": 27960 + }, + { + "epoch": 0.6207478274380972, + "grad_norm": 1.380496859550476, + "learning_rate": 6.296719716881401e-06, + "loss": 0.4018, + "step": 27965 + }, + { + "epoch": 0.6208588139976249, + "grad_norm": 1.0364583730697632, + "learning_rate": 6.293481064483915e-06, + "loss": 0.4486, + "step": 27970 + }, + { + "epoch": 0.6209698005571526, + "grad_norm": 1.5763059854507446, + "learning_rate": 6.290242862712576e-06, + "loss": 0.4054, + "step": 27975 + }, + { + "epoch": 0.6210807871166801, + "grad_norm": 0.7757239937782288, + "learning_rate": 6.287005111961062e-06, + "loss": 0.3067, + "step": 27980 + }, + { + "epoch": 0.6211917736762078, + "grad_norm": 1.8326990604400635, + "learning_rate": 6.283767812623016e-06, + "loss": 0.4763, + "step": 27985 + }, + { + "epoch": 0.6213027602357355, + "grad_norm": 1.3120774030685425, + "learning_rate": 6.280530965092019e-06, + "loss": 0.3022, + "step": 27990 + }, + { + "epoch": 0.6214137467952631, + "grad_norm": 1.223878264427185, + "learning_rate": 6.2772945697615895e-06, + "loss": 0.442, + "step": 27995 + }, + { + "epoch": 0.6215247333547907, + "grad_norm": 1.3699520826339722, + "learning_rate": 6.274058627025205e-06, + "loss": 0.4172, + "step": 28000 + }, + { + "epoch": 0.6216357199143184, + "grad_norm": 0.9021301865577698, + "learning_rate": 6.270823137276271e-06, + "loss": 0.3973, + "step": 28005 + }, + { + "epoch": 0.621746706473846, + "grad_norm": 1.7066882848739624, + "learning_rate": 6.267588100908159e-06, + "loss": 0.4331, + "step": 28010 + }, + { + "epoch": 0.6218576930333737, + "grad_norm": 1.2832542657852173, + "learning_rate": 6.264353518314166e-06, + "loss": 0.4259, + "step": 28015 + }, + { + "epoch": 0.6219686795929013, + "grad_norm": 1.3464566469192505, + "learning_rate": 6.261119389887545e-06, + "loss": 0.5368, + "step": 28020 + }, + { + "epoch": 0.6220796661524289, + "grad_norm": 1.6499924659729004, + "learning_rate": 6.257885716021488e-06, + "loss": 0.5052, + "step": 28025 + }, + { + "epoch": 0.6221906527119566, + "grad_norm": 1.5775763988494873, + "learning_rate": 6.254652497109136e-06, + "loss": 0.4077, + "step": 28030 + }, + { + "epoch": 0.6223016392714842, + "grad_norm": 1.848100185394287, + "learning_rate": 6.251419733543572e-06, + "loss": 0.5273, + "step": 28035 + }, + { + "epoch": 0.6224126258310119, + "grad_norm": 1.6376484632492065, + "learning_rate": 6.248187425717827e-06, + "loss": 0.3518, + "step": 28040 + }, + { + "epoch": 0.6225236123905395, + "grad_norm": 1.0268816947937012, + "learning_rate": 6.244955574024867e-06, + "loss": 0.4048, + "step": 28045 + }, + { + "epoch": 0.6226345989500671, + "grad_norm": 1.2212193012237549, + "learning_rate": 6.241724178857621e-06, + "loss": 0.5282, + "step": 28050 + }, + { + "epoch": 0.6227455855095948, + "grad_norm": 1.729117751121521, + "learning_rate": 6.23849324060894e-06, + "loss": 0.5671, + "step": 28055 + }, + { + "epoch": 0.6228565720691225, + "grad_norm": 1.5485546588897705, + "learning_rate": 6.235262759671641e-06, + "loss": 0.4374, + "step": 28060 + }, + { + "epoch": 0.62296755862865, + "grad_norm": 0.8996152877807617, + "learning_rate": 6.232032736438465e-06, + "loss": 0.4608, + "step": 28065 + }, + { + "epoch": 0.6230785451881777, + "grad_norm": 1.1960076093673706, + "learning_rate": 6.228803171302112e-06, + "loss": 0.3264, + "step": 28070 + }, + { + "epoch": 0.6231895317477053, + "grad_norm": 1.4368724822998047, + "learning_rate": 6.225574064655227e-06, + "loss": 0.3819, + "step": 28075 + }, + { + "epoch": 0.623300518307233, + "grad_norm": 1.8827331066131592, + "learning_rate": 6.222345416890383e-06, + "loss": 0.3901, + "step": 28080 + }, + { + "epoch": 0.6234115048667607, + "grad_norm": 1.6428965330123901, + "learning_rate": 6.21911722840012e-06, + "loss": 0.5045, + "step": 28085 + }, + { + "epoch": 0.6235224914262882, + "grad_norm": 0.8102307319641113, + "learning_rate": 6.215889499576898e-06, + "loss": 0.4805, + "step": 28090 + }, + { + "epoch": 0.6236334779858159, + "grad_norm": 0.8937715888023376, + "learning_rate": 6.212662230813141e-06, + "loss": 0.4285, + "step": 28095 + }, + { + "epoch": 0.6237444645453436, + "grad_norm": 1.6061286926269531, + "learning_rate": 6.2094354225012124e-06, + "loss": 0.484, + "step": 28100 + }, + { + "epoch": 0.6238554511048712, + "grad_norm": 0.9164207577705383, + "learning_rate": 6.206209075033408e-06, + "loss": 0.4131, + "step": 28105 + }, + { + "epoch": 0.6239664376643989, + "grad_norm": 0.8834840059280396, + "learning_rate": 6.202983188801985e-06, + "loss": 0.3085, + "step": 28110 + }, + { + "epoch": 0.6240774242239265, + "grad_norm": 1.0660358667373657, + "learning_rate": 6.199757764199128e-06, + "loss": 0.393, + "step": 28115 + }, + { + "epoch": 0.6241884107834541, + "grad_norm": 2.5209226608276367, + "learning_rate": 6.196532801616981e-06, + "loss": 0.4907, + "step": 28120 + }, + { + "epoch": 0.6242993973429818, + "grad_norm": 0.8467637896537781, + "learning_rate": 6.193308301447616e-06, + "loss": 0.4862, + "step": 28125 + }, + { + "epoch": 0.6244103839025094, + "grad_norm": 1.031442403793335, + "learning_rate": 6.190084264083061e-06, + "loss": 0.4154, + "step": 28130 + }, + { + "epoch": 0.624521370462037, + "grad_norm": 0.9046064615249634, + "learning_rate": 6.186860689915286e-06, + "loss": 0.3292, + "step": 28135 + }, + { + "epoch": 0.6246323570215647, + "grad_norm": 0.8271287679672241, + "learning_rate": 6.183637579336199e-06, + "loss": 0.4532, + "step": 28140 + }, + { + "epoch": 0.6247433435810923, + "grad_norm": 0.8403948545455933, + "learning_rate": 6.180414932737659e-06, + "loss": 0.3869, + "step": 28145 + }, + { + "epoch": 0.62485433014062, + "grad_norm": 1.2737436294555664, + "learning_rate": 6.177192750511456e-06, + "loss": 0.475, + "step": 28150 + }, + { + "epoch": 0.6249653167001477, + "grad_norm": 1.0544812679290771, + "learning_rate": 6.173971033049342e-06, + "loss": 0.511, + "step": 28155 + }, + { + "epoch": 0.6250763032596752, + "grad_norm": 0.8840219974517822, + "learning_rate": 6.170749780742998e-06, + "loss": 0.3926, + "step": 28160 + }, + { + "epoch": 0.6251872898192029, + "grad_norm": 1.263260006904602, + "learning_rate": 6.167528993984051e-06, + "loss": 0.4556, + "step": 28165 + }, + { + "epoch": 0.6252982763787306, + "grad_norm": 1.5352908372879028, + "learning_rate": 6.164308673164078e-06, + "loss": 0.4933, + "step": 28170 + }, + { + "epoch": 0.6254092629382582, + "grad_norm": 0.9619081616401672, + "learning_rate": 6.161088818674592e-06, + "loss": 0.5107, + "step": 28175 + }, + { + "epoch": 0.6255202494977858, + "grad_norm": 0.8135992884635925, + "learning_rate": 6.1578694309070505e-06, + "loss": 0.3525, + "step": 28180 + }, + { + "epoch": 0.6256312360573134, + "grad_norm": 1.5407414436340332, + "learning_rate": 6.154650510252862e-06, + "loss": 0.5522, + "step": 28185 + }, + { + "epoch": 0.6257422226168411, + "grad_norm": 0.9878733158111572, + "learning_rate": 6.151432057103366e-06, + "loss": 0.4503, + "step": 28190 + }, + { + "epoch": 0.6258532091763688, + "grad_norm": 0.9099797606468201, + "learning_rate": 6.148214071849855e-06, + "loss": 0.381, + "step": 28195 + }, + { + "epoch": 0.6259641957358963, + "grad_norm": 1.2787854671478271, + "learning_rate": 6.144996554883556e-06, + "loss": 0.3622, + "step": 28200 + }, + { + "epoch": 0.626075182295424, + "grad_norm": 1.287074089050293, + "learning_rate": 6.141779506595651e-06, + "loss": 0.4097, + "step": 28205 + }, + { + "epoch": 0.6261861688549517, + "grad_norm": 1.020998239517212, + "learning_rate": 6.138562927377251e-06, + "loss": 0.3028, + "step": 28210 + }, + { + "epoch": 0.6262971554144793, + "grad_norm": 0.7408778667449951, + "learning_rate": 6.135346817619419e-06, + "loss": 0.3358, + "step": 28215 + }, + { + "epoch": 0.626408141974007, + "grad_norm": 0.669231653213501, + "learning_rate": 6.132131177713165e-06, + "loss": 0.4524, + "step": 28220 + }, + { + "epoch": 0.6265191285335346, + "grad_norm": 0.7824404239654541, + "learning_rate": 6.1289160080494256e-06, + "loss": 0.421, + "step": 28225 + }, + { + "epoch": 0.6266301150930622, + "grad_norm": 1.124798059463501, + "learning_rate": 6.125701309019101e-06, + "loss": 0.2571, + "step": 28230 + }, + { + "epoch": 0.6267411016525899, + "grad_norm": 1.0454397201538086, + "learning_rate": 6.122487081013011e-06, + "loss": 0.4768, + "step": 28235 + }, + { + "epoch": 0.6268520882121175, + "grad_norm": 1.7600301504135132, + "learning_rate": 6.1192733244219395e-06, + "loss": 0.4956, + "step": 28240 + }, + { + "epoch": 0.6269630747716451, + "grad_norm": 1.7674944400787354, + "learning_rate": 6.1160600396366064e-06, + "loss": 0.4959, + "step": 28245 + }, + { + "epoch": 0.6270740613311728, + "grad_norm": 1.1841814517974854, + "learning_rate": 6.112847227047662e-06, + "loss": 0.4012, + "step": 28250 + }, + { + "epoch": 0.6271850478907004, + "grad_norm": 0.8376629948616028, + "learning_rate": 6.109634887045721e-06, + "loss": 0.3011, + "step": 28255 + }, + { + "epoch": 0.6272960344502281, + "grad_norm": 0.7868844270706177, + "learning_rate": 6.1064230200213196e-06, + "loss": 0.2649, + "step": 28260 + }, + { + "epoch": 0.6274070210097558, + "grad_norm": 1.001660943031311, + "learning_rate": 6.103211626364951e-06, + "loss": 0.374, + "step": 28265 + }, + { + "epoch": 0.6275180075692833, + "grad_norm": 0.9349133968353271, + "learning_rate": 6.1000007064670445e-06, + "loss": 0.5471, + "step": 28270 + }, + { + "epoch": 0.627628994128811, + "grad_norm": 1.4925248622894287, + "learning_rate": 6.096790260717971e-06, + "loss": 0.3443, + "step": 28275 + }, + { + "epoch": 0.6277399806883387, + "grad_norm": 0.9076890349388123, + "learning_rate": 6.093580289508047e-06, + "loss": 0.4022, + "step": 28280 + }, + { + "epoch": 0.6278509672478663, + "grad_norm": 0.7487587332725525, + "learning_rate": 6.090370793227531e-06, + "loss": 0.3874, + "step": 28285 + }, + { + "epoch": 0.627961953807394, + "grad_norm": 1.1190788745880127, + "learning_rate": 6.087161772266623e-06, + "loss": 0.2739, + "step": 28290 + }, + { + "epoch": 0.6280729403669215, + "grad_norm": 1.0714386701583862, + "learning_rate": 6.083953227015463e-06, + "loss": 0.4865, + "step": 28295 + }, + { + "epoch": 0.6281839269264492, + "grad_norm": 1.3088133335113525, + "learning_rate": 6.080745157864135e-06, + "loss": 0.4213, + "step": 28300 + }, + { + "epoch": 0.6282949134859769, + "grad_norm": 1.300983190536499, + "learning_rate": 6.07753756520267e-06, + "loss": 0.4912, + "step": 28305 + }, + { + "epoch": 0.6284059000455045, + "grad_norm": 0.9377284049987793, + "learning_rate": 6.074330449421029e-06, + "loss": 0.3563, + "step": 28310 + }, + { + "epoch": 0.6285168866050321, + "grad_norm": 1.3748418092727661, + "learning_rate": 6.071123810909131e-06, + "loss": 0.5195, + "step": 28315 + }, + { + "epoch": 0.6286278731645598, + "grad_norm": 1.298658847808838, + "learning_rate": 6.067917650056818e-06, + "loss": 0.4443, + "step": 28320 + }, + { + "epoch": 0.6287388597240874, + "grad_norm": 1.233130931854248, + "learning_rate": 6.064711967253891e-06, + "loss": 0.4408, + "step": 28325 + }, + { + "epoch": 0.6288498462836151, + "grad_norm": 2.0042026042938232, + "learning_rate": 6.06150676289009e-06, + "loss": 0.6012, + "step": 28330 + }, + { + "epoch": 0.6289608328431427, + "grad_norm": 1.11312997341156, + "learning_rate": 6.058302037355084e-06, + "loss": 0.3594, + "step": 28335 + }, + { + "epoch": 0.6290718194026703, + "grad_norm": 1.4051095247268677, + "learning_rate": 6.055097791038499e-06, + "loss": 0.35, + "step": 28340 + }, + { + "epoch": 0.629182805962198, + "grad_norm": 1.0128973722457886, + "learning_rate": 6.051894024329892e-06, + "loss": 0.366, + "step": 28345 + }, + { + "epoch": 0.6292937925217256, + "grad_norm": 0.9693575501441956, + "learning_rate": 6.048690737618768e-06, + "loss": 0.4185, + "step": 28350 + }, + { + "epoch": 0.6294047790812533, + "grad_norm": 0.9025382995605469, + "learning_rate": 6.0454879312945755e-06, + "loss": 0.3497, + "step": 28355 + }, + { + "epoch": 0.6295157656407809, + "grad_norm": 1.336121916770935, + "learning_rate": 6.042285605746696e-06, + "loss": 0.4256, + "step": 28360 + }, + { + "epoch": 0.6296267522003085, + "grad_norm": 1.1140364408493042, + "learning_rate": 6.0390837613644615e-06, + "loss": 0.2469, + "step": 28365 + }, + { + "epoch": 0.6297377387598362, + "grad_norm": 1.1654084920883179, + "learning_rate": 6.035882398537137e-06, + "loss": 0.4281, + "step": 28370 + }, + { + "epoch": 0.6298487253193639, + "grad_norm": 1.7681803703308105, + "learning_rate": 6.032681517653938e-06, + "loss": 0.5058, + "step": 28375 + }, + { + "epoch": 0.6299597118788914, + "grad_norm": 0.9355083107948303, + "learning_rate": 6.0294811191040125e-06, + "loss": 0.4615, + "step": 28380 + }, + { + "epoch": 0.6300706984384191, + "grad_norm": 1.3876092433929443, + "learning_rate": 6.026281203276456e-06, + "loss": 0.4634, + "step": 28385 + }, + { + "epoch": 0.6301816849979468, + "grad_norm": 1.6106153726577759, + "learning_rate": 6.023081770560307e-06, + "loss": 0.4213, + "step": 28390 + }, + { + "epoch": 0.6302926715574744, + "grad_norm": 0.8123453259468079, + "learning_rate": 6.019882821344536e-06, + "loss": 0.4301, + "step": 28395 + }, + { + "epoch": 0.630403658117002, + "grad_norm": 1.408504843711853, + "learning_rate": 6.016684356018066e-06, + "loss": 0.6509, + "step": 28400 + }, + { + "epoch": 0.6305146446765296, + "grad_norm": 1.090542197227478, + "learning_rate": 6.01348637496975e-06, + "loss": 0.403, + "step": 28405 + }, + { + "epoch": 0.6306256312360573, + "grad_norm": 1.3139768838882446, + "learning_rate": 6.010288878588393e-06, + "loss": 0.3842, + "step": 28410 + }, + { + "epoch": 0.630736617795585, + "grad_norm": 0.8315967321395874, + "learning_rate": 6.007091867262735e-06, + "loss": 0.4683, + "step": 28415 + }, + { + "epoch": 0.6308476043551126, + "grad_norm": 1.1125186681747437, + "learning_rate": 6.003895341381454e-06, + "loss": 0.4052, + "step": 28420 + }, + { + "epoch": 0.6309585909146402, + "grad_norm": 1.2615845203399658, + "learning_rate": 6.000699301333177e-06, + "loss": 0.4602, + "step": 28425 + }, + { + "epoch": 0.6310695774741679, + "grad_norm": 1.5077705383300781, + "learning_rate": 5.997503747506465e-06, + "loss": 0.3776, + "step": 28430 + }, + { + "epoch": 0.6311805640336955, + "grad_norm": 0.9946312308311462, + "learning_rate": 5.994308680289822e-06, + "loss": 0.5199, + "step": 28435 + }, + { + "epoch": 0.6312915505932232, + "grad_norm": 1.2056828737258911, + "learning_rate": 5.991114100071701e-06, + "loss": 0.3758, + "step": 28440 + }, + { + "epoch": 0.6314025371527509, + "grad_norm": 1.2828701734542847, + "learning_rate": 5.987920007240478e-06, + "loss": 0.5064, + "step": 28445 + }, + { + "epoch": 0.6315135237122784, + "grad_norm": 1.6678309440612793, + "learning_rate": 5.98472640218449e-06, + "loss": 0.4126, + "step": 28450 + }, + { + "epoch": 0.6316245102718061, + "grad_norm": 1.8954167366027832, + "learning_rate": 5.981533285291995e-06, + "loss": 0.5003, + "step": 28455 + }, + { + "epoch": 0.6317354968313337, + "grad_norm": 1.7117716073989868, + "learning_rate": 5.9783406569512105e-06, + "loss": 0.3651, + "step": 28460 + }, + { + "epoch": 0.6318464833908614, + "grad_norm": 0.7891132831573486, + "learning_rate": 5.975148517550278e-06, + "loss": 0.2982, + "step": 28465 + }, + { + "epoch": 0.631957469950389, + "grad_norm": 1.4389150142669678, + "learning_rate": 5.971956867477289e-06, + "loss": 0.5224, + "step": 28470 + }, + { + "epoch": 0.6320684565099166, + "grad_norm": 1.39657461643219, + "learning_rate": 5.96876570712028e-06, + "loss": 0.472, + "step": 28475 + }, + { + "epoch": 0.6321794430694443, + "grad_norm": 1.6805533170700073, + "learning_rate": 5.965575036867212e-06, + "loss": 0.505, + "step": 28480 + }, + { + "epoch": 0.632290429628972, + "grad_norm": 1.7978190183639526, + "learning_rate": 5.962384857106005e-06, + "loss": 0.3385, + "step": 28485 + }, + { + "epoch": 0.6324014161884995, + "grad_norm": 1.0432755947113037, + "learning_rate": 5.9591951682245034e-06, + "loss": 0.5049, + "step": 28490 + }, + { + "epoch": 0.6325124027480272, + "grad_norm": 1.0674772262573242, + "learning_rate": 5.956005970610499e-06, + "loss": 0.4788, + "step": 28495 + }, + { + "epoch": 0.6326233893075549, + "grad_norm": 1.2680522203445435, + "learning_rate": 5.952817264651732e-06, + "loss": 0.4034, + "step": 28500 + }, + { + "epoch": 0.6327343758670825, + "grad_norm": 1.232718586921692, + "learning_rate": 5.949629050735863e-06, + "loss": 0.5217, + "step": 28505 + }, + { + "epoch": 0.6328453624266102, + "grad_norm": 0.9667626023292542, + "learning_rate": 5.946441329250517e-06, + "loss": 0.3919, + "step": 28510 + }, + { + "epoch": 0.6329563489861377, + "grad_norm": 1.1591609716415405, + "learning_rate": 5.9432541005832324e-06, + "loss": 0.4037, + "step": 28515 + }, + { + "epoch": 0.6330673355456654, + "grad_norm": 1.1515146493911743, + "learning_rate": 5.940067365121512e-06, + "loss": 0.3504, + "step": 28520 + }, + { + "epoch": 0.6331783221051931, + "grad_norm": 1.1757744550704956, + "learning_rate": 5.936881123252787e-06, + "loss": 0.4657, + "step": 28525 + }, + { + "epoch": 0.6332893086647207, + "grad_norm": 0.8316757082939148, + "learning_rate": 5.933695375364425e-06, + "loss": 0.4254, + "step": 28530 + }, + { + "epoch": 0.6334002952242483, + "grad_norm": 1.2674232721328735, + "learning_rate": 5.930510121843746e-06, + "loss": 0.4108, + "step": 28535 + }, + { + "epoch": 0.633511281783776, + "grad_norm": 0.9032902121543884, + "learning_rate": 5.927325363077996e-06, + "loss": 0.5447, + "step": 28540 + }, + { + "epoch": 0.6336222683433036, + "grad_norm": 1.034072995185852, + "learning_rate": 5.924141099454368e-06, + "loss": 0.4106, + "step": 28545 + }, + { + "epoch": 0.6337332549028313, + "grad_norm": 0.8327234983444214, + "learning_rate": 5.92095733136e-06, + "loss": 0.3196, + "step": 28550 + }, + { + "epoch": 0.633844241462359, + "grad_norm": 0.9522653818130493, + "learning_rate": 5.917774059181956e-06, + "loss": 0.3871, + "step": 28555 + }, + { + "epoch": 0.6339552280218865, + "grad_norm": 1.6604143381118774, + "learning_rate": 5.9145912833072535e-06, + "loss": 0.3178, + "step": 28560 + }, + { + "epoch": 0.6340662145814142, + "grad_norm": 0.8660258650779724, + "learning_rate": 5.911409004122839e-06, + "loss": 0.1758, + "step": 28565 + }, + { + "epoch": 0.6341772011409418, + "grad_norm": 1.1857221126556396, + "learning_rate": 5.90822722201561e-06, + "loss": 0.4051, + "step": 28570 + }, + { + "epoch": 0.6342881877004695, + "grad_norm": 1.7750307321548462, + "learning_rate": 5.9050459373723865e-06, + "loss": 0.4322, + "step": 28575 + }, + { + "epoch": 0.6343991742599971, + "grad_norm": 1.1627682447433472, + "learning_rate": 5.901865150579946e-06, + "loss": 0.39, + "step": 28580 + }, + { + "epoch": 0.6345101608195247, + "grad_norm": 1.180820107460022, + "learning_rate": 5.898684862025001e-06, + "loss": 0.2816, + "step": 28585 + }, + { + "epoch": 0.6346211473790524, + "grad_norm": 1.285776138305664, + "learning_rate": 5.895505072094191e-06, + "loss": 0.4874, + "step": 28590 + }, + { + "epoch": 0.6347321339385801, + "grad_norm": 0.9625634551048279, + "learning_rate": 5.892325781174113e-06, + "loss": 0.3683, + "step": 28595 + }, + { + "epoch": 0.6348431204981076, + "grad_norm": 0.9896165728569031, + "learning_rate": 5.889146989651286e-06, + "loss": 0.3098, + "step": 28600 + }, + { + "epoch": 0.6349541070576353, + "grad_norm": 1.09481680393219, + "learning_rate": 5.885968697912181e-06, + "loss": 0.3626, + "step": 28605 + }, + { + "epoch": 0.635065093617163, + "grad_norm": 1.0791270732879639, + "learning_rate": 5.88279090634321e-06, + "loss": 0.4849, + "step": 28610 + }, + { + "epoch": 0.6351760801766906, + "grad_norm": 0.7296260595321655, + "learning_rate": 5.879613615330708e-06, + "loss": 0.323, + "step": 28615 + }, + { + "epoch": 0.6352870667362183, + "grad_norm": 1.3319703340530396, + "learning_rate": 5.876436825260967e-06, + "loss": 0.4253, + "step": 28620 + }, + { + "epoch": 0.6353980532957458, + "grad_norm": 1.4313448667526245, + "learning_rate": 5.873260536520205e-06, + "loss": 0.5015, + "step": 28625 + }, + { + "epoch": 0.6355090398552735, + "grad_norm": 1.895022988319397, + "learning_rate": 5.870084749494586e-06, + "loss": 0.3823, + "step": 28630 + }, + { + "epoch": 0.6356200264148012, + "grad_norm": 1.3738524913787842, + "learning_rate": 5.866909464570215e-06, + "loss": 0.4114, + "step": 28635 + }, + { + "epoch": 0.6357310129743288, + "grad_norm": 0.9916070103645325, + "learning_rate": 5.863734682133129e-06, + "loss": 0.4405, + "step": 28640 + }, + { + "epoch": 0.6358419995338565, + "grad_norm": 1.1903655529022217, + "learning_rate": 5.860560402569308e-06, + "loss": 0.4088, + "step": 28645 + }, + { + "epoch": 0.6359529860933841, + "grad_norm": 1.4209836721420288, + "learning_rate": 5.857386626264673e-06, + "loss": 0.4614, + "step": 28650 + }, + { + "epoch": 0.6360639726529117, + "grad_norm": 1.390576720237732, + "learning_rate": 5.854213353605076e-06, + "loss": 0.3247, + "step": 28655 + }, + { + "epoch": 0.6361749592124394, + "grad_norm": 1.4189549684524536, + "learning_rate": 5.8510405849763175e-06, + "loss": 0.3953, + "step": 28660 + }, + { + "epoch": 0.6362859457719671, + "grad_norm": 1.592666506767273, + "learning_rate": 5.847868320764128e-06, + "loss": 0.3987, + "step": 28665 + }, + { + "epoch": 0.6363969323314946, + "grad_norm": 1.0677518844604492, + "learning_rate": 5.844696561354186e-06, + "loss": 0.2998, + "step": 28670 + }, + { + "epoch": 0.6365079188910223, + "grad_norm": 1.2202013731002808, + "learning_rate": 5.841525307132097e-06, + "loss": 0.4315, + "step": 28675 + }, + { + "epoch": 0.6366189054505499, + "grad_norm": 1.3202593326568604, + "learning_rate": 5.838354558483418e-06, + "loss": 0.4257, + "step": 28680 + }, + { + "epoch": 0.6367298920100776, + "grad_norm": 1.6365586519241333, + "learning_rate": 5.8351843157936305e-06, + "loss": 0.3262, + "step": 28685 + }, + { + "epoch": 0.6368408785696053, + "grad_norm": 1.246142029762268, + "learning_rate": 5.832014579448167e-06, + "loss": 0.4667, + "step": 28690 + }, + { + "epoch": 0.6369518651291328, + "grad_norm": 1.7104053497314453, + "learning_rate": 5.828845349832396e-06, + "loss": 0.5282, + "step": 28695 + }, + { + "epoch": 0.6370628516886605, + "grad_norm": 1.6780058145523071, + "learning_rate": 5.825676627331614e-06, + "loss": 0.543, + "step": 28700 + }, + { + "epoch": 0.6371738382481882, + "grad_norm": 1.1392569541931152, + "learning_rate": 5.822508412331074e-06, + "loss": 0.3653, + "step": 28705 + }, + { + "epoch": 0.6372848248077158, + "grad_norm": 1.0654574632644653, + "learning_rate": 5.819340705215946e-06, + "loss": 0.3235, + "step": 28710 + }, + { + "epoch": 0.6373958113672434, + "grad_norm": 1.223675012588501, + "learning_rate": 5.816173506371352e-06, + "loss": 0.2812, + "step": 28715 + }, + { + "epoch": 0.6375067979267711, + "grad_norm": 1.6331626176834106, + "learning_rate": 5.813006816182358e-06, + "loss": 0.4586, + "step": 28720 + }, + { + "epoch": 0.6376177844862987, + "grad_norm": 2.492760419845581, + "learning_rate": 5.80984063503395e-06, + "loss": 0.4431, + "step": 28725 + }, + { + "epoch": 0.6377287710458264, + "grad_norm": 1.1395188570022583, + "learning_rate": 5.8066749633110675e-06, + "loss": 0.3516, + "step": 28730 + }, + { + "epoch": 0.6378397576053539, + "grad_norm": 0.9498520493507385, + "learning_rate": 5.803509801398575e-06, + "loss": 0.289, + "step": 28735 + }, + { + "epoch": 0.6379507441648816, + "grad_norm": 1.5028700828552246, + "learning_rate": 5.800345149681293e-06, + "loss": 0.4283, + "step": 28740 + }, + { + "epoch": 0.6380617307244093, + "grad_norm": 1.2373186349868774, + "learning_rate": 5.797181008543958e-06, + "loss": 0.4102, + "step": 28745 + }, + { + "epoch": 0.6381727172839369, + "grad_norm": 0.6199971437454224, + "learning_rate": 5.79401737837126e-06, + "loss": 0.1848, + "step": 28750 + }, + { + "epoch": 0.6382837038434646, + "grad_norm": 1.6247268915176392, + "learning_rate": 5.790854259547827e-06, + "loss": 0.4643, + "step": 28755 + }, + { + "epoch": 0.6383946904029922, + "grad_norm": 0.8452243804931641, + "learning_rate": 5.787691652458214e-06, + "loss": 0.4384, + "step": 28760 + }, + { + "epoch": 0.6385056769625198, + "grad_norm": 1.413805365562439, + "learning_rate": 5.784529557486927e-06, + "loss": 0.5771, + "step": 28765 + }, + { + "epoch": 0.6386166635220475, + "grad_norm": 1.5562477111816406, + "learning_rate": 5.781367975018395e-06, + "loss": 0.3771, + "step": 28770 + }, + { + "epoch": 0.6387276500815752, + "grad_norm": 1.1243140697479248, + "learning_rate": 5.778206905436996e-06, + "loss": 0.4371, + "step": 28775 + }, + { + "epoch": 0.6388386366411027, + "grad_norm": 0.9571521282196045, + "learning_rate": 5.775046349127046e-06, + "loss": 0.3678, + "step": 28780 + }, + { + "epoch": 0.6389496232006304, + "grad_norm": 0.8463472127914429, + "learning_rate": 5.771886306472788e-06, + "loss": 0.4502, + "step": 28785 + }, + { + "epoch": 0.639060609760158, + "grad_norm": 1.0742371082305908, + "learning_rate": 5.768726777858417e-06, + "loss": 0.3888, + "step": 28790 + }, + { + "epoch": 0.6391715963196857, + "grad_norm": 1.8010051250457764, + "learning_rate": 5.765567763668049e-06, + "loss": 0.4078, + "step": 28795 + }, + { + "epoch": 0.6392825828792134, + "grad_norm": 1.1428115367889404, + "learning_rate": 5.762409264285752e-06, + "loss": 0.3807, + "step": 28800 + }, + { + "epoch": 0.6393935694387409, + "grad_norm": 0.9962915182113647, + "learning_rate": 5.759251280095529e-06, + "loss": 0.5052, + "step": 28805 + }, + { + "epoch": 0.6395045559982686, + "grad_norm": 1.4247018098831177, + "learning_rate": 5.756093811481309e-06, + "loss": 0.555, + "step": 28810 + }, + { + "epoch": 0.6396155425577963, + "grad_norm": 1.413870930671692, + "learning_rate": 5.7529368588269745e-06, + "loss": 0.4547, + "step": 28815 + }, + { + "epoch": 0.6397265291173239, + "grad_norm": 1.667945146560669, + "learning_rate": 5.7497804225163275e-06, + "loss": 0.3261, + "step": 28820 + }, + { + "epoch": 0.6398375156768515, + "grad_norm": 1.1330857276916504, + "learning_rate": 5.746624502933128e-06, + "loss": 0.3848, + "step": 28825 + }, + { + "epoch": 0.6399485022363792, + "grad_norm": 0.8767269849777222, + "learning_rate": 5.743469100461052e-06, + "loss": 0.1863, + "step": 28830 + }, + { + "epoch": 0.6400594887959068, + "grad_norm": 1.1383758783340454, + "learning_rate": 5.740314215483733e-06, + "loss": 0.3286, + "step": 28835 + }, + { + "epoch": 0.6401704753554345, + "grad_norm": 0.9295101165771484, + "learning_rate": 5.7371598483847214e-06, + "loss": 0.4101, + "step": 28840 + }, + { + "epoch": 0.640281461914962, + "grad_norm": 0.9914706945419312, + "learning_rate": 5.734005999547522e-06, + "loss": 0.3852, + "step": 28845 + }, + { + "epoch": 0.6403924484744897, + "grad_norm": 0.812248945236206, + "learning_rate": 5.730852669355562e-06, + "loss": 0.4883, + "step": 28850 + }, + { + "epoch": 0.6405034350340174, + "grad_norm": 2.2017431259155273, + "learning_rate": 5.727699858192222e-06, + "loss": 0.5746, + "step": 28855 + }, + { + "epoch": 0.640614421593545, + "grad_norm": 1.232469916343689, + "learning_rate": 5.7245475664408e-06, + "loss": 0.2837, + "step": 28860 + }, + { + "epoch": 0.6407254081530727, + "grad_norm": 0.9875843524932861, + "learning_rate": 5.721395794484551e-06, + "loss": 0.5556, + "step": 28865 + }, + { + "epoch": 0.6408363947126003, + "grad_norm": 0.9763182997703552, + "learning_rate": 5.718244542706648e-06, + "loss": 0.5721, + "step": 28870 + }, + { + "epoch": 0.6409473812721279, + "grad_norm": 0.9100670218467712, + "learning_rate": 5.715093811490216e-06, + "loss": 0.4426, + "step": 28875 + }, + { + "epoch": 0.6410583678316556, + "grad_norm": 1.6159127950668335, + "learning_rate": 5.711943601218306e-06, + "loss": 0.3191, + "step": 28880 + }, + { + "epoch": 0.6411693543911833, + "grad_norm": 1.117018699645996, + "learning_rate": 5.708793912273911e-06, + "loss": 0.3323, + "step": 28885 + }, + { + "epoch": 0.6412803409507108, + "grad_norm": 1.3021478652954102, + "learning_rate": 5.705644745039965e-06, + "loss": 0.4529, + "step": 28890 + }, + { + "epoch": 0.6413913275102385, + "grad_norm": 0.9184788465499878, + "learning_rate": 5.702496099899324e-06, + "loss": 0.4596, + "step": 28895 + }, + { + "epoch": 0.6415023140697661, + "grad_norm": 0.7418227195739746, + "learning_rate": 5.699347977234799e-06, + "loss": 0.3201, + "step": 28900 + }, + { + "epoch": 0.6416133006292938, + "grad_norm": 0.9961920380592346, + "learning_rate": 5.696200377429119e-06, + "loss": 0.4405, + "step": 28905 + }, + { + "epoch": 0.6417242871888215, + "grad_norm": 0.6618875861167908, + "learning_rate": 5.693053300864968e-06, + "loss": 0.4857, + "step": 28910 + }, + { + "epoch": 0.641835273748349, + "grad_norm": 0.9733692407608032, + "learning_rate": 5.6899067479249485e-06, + "loss": 0.3799, + "step": 28915 + }, + { + "epoch": 0.6419462603078767, + "grad_norm": 1.4784406423568726, + "learning_rate": 5.686760718991611e-06, + "loss": 0.4574, + "step": 28920 + }, + { + "epoch": 0.6420572468674044, + "grad_norm": 1.488350510597229, + "learning_rate": 5.683615214447445e-06, + "loss": 0.3339, + "step": 28925 + }, + { + "epoch": 0.642168233426932, + "grad_norm": 0.9722474813461304, + "learning_rate": 5.680470234674859e-06, + "loss": 0.3292, + "step": 28930 + }, + { + "epoch": 0.6422792199864596, + "grad_norm": 0.7445114850997925, + "learning_rate": 5.677325780056221e-06, + "loss": 0.37, + "step": 28935 + }, + { + "epoch": 0.6423902065459873, + "grad_norm": 1.06137216091156, + "learning_rate": 5.6741818509738124e-06, + "loss": 0.4396, + "step": 28940 + }, + { + "epoch": 0.6425011931055149, + "grad_norm": 0.7704484462738037, + "learning_rate": 5.6710384478098675e-06, + "loss": 0.3744, + "step": 28945 + }, + { + "epoch": 0.6426121796650426, + "grad_norm": 2.215862274169922, + "learning_rate": 5.667895570946554e-06, + "loss": 0.5016, + "step": 28950 + }, + { + "epoch": 0.6427231662245702, + "grad_norm": 1.1816397905349731, + "learning_rate": 5.664753220765964e-06, + "loss": 0.3079, + "step": 28955 + }, + { + "epoch": 0.6428341527840978, + "grad_norm": 1.280444860458374, + "learning_rate": 5.661611397650142e-06, + "loss": 0.4385, + "step": 28960 + }, + { + "epoch": 0.6429451393436255, + "grad_norm": 1.6912497282028198, + "learning_rate": 5.658470101981053e-06, + "loss": 0.3826, + "step": 28965 + }, + { + "epoch": 0.6430561259031531, + "grad_norm": 1.6810418367385864, + "learning_rate": 5.655329334140608e-06, + "loss": 0.3615, + "step": 28970 + }, + { + "epoch": 0.6431671124626808, + "grad_norm": 1.3643230199813843, + "learning_rate": 5.652189094510656e-06, + "loss": 0.3644, + "step": 28975 + }, + { + "epoch": 0.6432780990222084, + "grad_norm": 1.0266542434692383, + "learning_rate": 5.6490493834729685e-06, + "loss": 0.396, + "step": 28980 + }, + { + "epoch": 0.643389085581736, + "grad_norm": 1.3530707359313965, + "learning_rate": 5.645910201409268e-06, + "loss": 0.3436, + "step": 28985 + }, + { + "epoch": 0.6435000721412637, + "grad_norm": 0.9574394226074219, + "learning_rate": 5.6427715487012e-06, + "loss": 0.2981, + "step": 28990 + }, + { + "epoch": 0.6436110587007914, + "grad_norm": 1.4541423320770264, + "learning_rate": 5.639633425730357e-06, + "loss": 0.3871, + "step": 28995 + }, + { + "epoch": 0.643722045260319, + "grad_norm": 0.9210712909698486, + "learning_rate": 5.636495832878257e-06, + "loss": 0.5351, + "step": 29000 + }, + { + "epoch": 0.6438330318198466, + "grad_norm": 0.9838594794273376, + "learning_rate": 5.633358770526357e-06, + "loss": 0.3142, + "step": 29005 + }, + { + "epoch": 0.6439440183793742, + "grad_norm": 1.0086106061935425, + "learning_rate": 5.630222239056058e-06, + "loss": 0.3137, + "step": 29010 + }, + { + "epoch": 0.6440550049389019, + "grad_norm": 1.020031452178955, + "learning_rate": 5.6270862388486806e-06, + "loss": 0.3719, + "step": 29015 + }, + { + "epoch": 0.6441659914984296, + "grad_norm": 1.0523961782455444, + "learning_rate": 5.623950770285496e-06, + "loss": 0.4242, + "step": 29020 + }, + { + "epoch": 0.6442769780579571, + "grad_norm": 0.6960680484771729, + "learning_rate": 5.620815833747697e-06, + "loss": 0.3194, + "step": 29025 + }, + { + "epoch": 0.6443879646174848, + "grad_norm": 1.0132477283477783, + "learning_rate": 5.617681429616421e-06, + "loss": 0.4739, + "step": 29030 + }, + { + "epoch": 0.6444989511770125, + "grad_norm": 1.0914894342422485, + "learning_rate": 5.614547558272745e-06, + "loss": 0.39, + "step": 29035 + }, + { + "epoch": 0.6446099377365401, + "grad_norm": 1.1537106037139893, + "learning_rate": 5.611414220097665e-06, + "loss": 0.4725, + "step": 29040 + }, + { + "epoch": 0.6447209242960678, + "grad_norm": 1.0115886926651, + "learning_rate": 5.6082814154721296e-06, + "loss": 0.3318, + "step": 29045 + }, + { + "epoch": 0.6448319108555954, + "grad_norm": 1.0585254430770874, + "learning_rate": 5.6051491447770065e-06, + "loss": 0.4163, + "step": 29050 + }, + { + "epoch": 0.644942897415123, + "grad_norm": 1.8216021060943604, + "learning_rate": 5.602017408393113e-06, + "loss": 0.5211, + "step": 29055 + }, + { + "epoch": 0.6450538839746507, + "grad_norm": 1.7823377847671509, + "learning_rate": 5.598886206701195e-06, + "loss": 0.3622, + "step": 29060 + }, + { + "epoch": 0.6451648705341783, + "grad_norm": 1.1798115968704224, + "learning_rate": 5.59575554008193e-06, + "loss": 0.2396, + "step": 29065 + }, + { + "epoch": 0.6452758570937059, + "grad_norm": 1.2847548723220825, + "learning_rate": 5.592625408915939e-06, + "loss": 0.4482, + "step": 29070 + }, + { + "epoch": 0.6453868436532336, + "grad_norm": 1.348514437675476, + "learning_rate": 5.589495813583765e-06, + "loss": 0.4592, + "step": 29075 + }, + { + "epoch": 0.6454978302127612, + "grad_norm": 2.274677276611328, + "learning_rate": 5.586366754465903e-06, + "loss": 0.4655, + "step": 29080 + }, + { + "epoch": 0.6456088167722889, + "grad_norm": 1.4002689123153687, + "learning_rate": 5.583238231942765e-06, + "loss": 0.3983, + "step": 29085 + }, + { + "epoch": 0.6457198033318166, + "grad_norm": 1.5544315576553345, + "learning_rate": 5.580110246394712e-06, + "loss": 0.3103, + "step": 29090 + }, + { + "epoch": 0.6458307898913441, + "grad_norm": 2.2772841453552246, + "learning_rate": 5.576982798202031e-06, + "loss": 0.3768, + "step": 29095 + }, + { + "epoch": 0.6459417764508718, + "grad_norm": 1.393365740776062, + "learning_rate": 5.57385588774495e-06, + "loss": 0.4128, + "step": 29100 + }, + { + "epoch": 0.6460527630103995, + "grad_norm": 1.1971317529678345, + "learning_rate": 5.5707295154036225e-06, + "loss": 0.4362, + "step": 29105 + }, + { + "epoch": 0.6461637495699271, + "grad_norm": 0.9054796695709229, + "learning_rate": 5.567603681558149e-06, + "loss": 0.4237, + "step": 29110 + }, + { + "epoch": 0.6462747361294547, + "grad_norm": 1.1730214357376099, + "learning_rate": 5.564478386588552e-06, + "loss": 0.3831, + "step": 29115 + }, + { + "epoch": 0.6463857226889823, + "grad_norm": 1.0970360040664673, + "learning_rate": 5.561353630874802e-06, + "loss": 0.4006, + "step": 29120 + }, + { + "epoch": 0.64649670924851, + "grad_norm": 1.798247218132019, + "learning_rate": 5.558229414796785e-06, + "loss": 0.3468, + "step": 29125 + }, + { + "epoch": 0.6466076958080377, + "grad_norm": 1.501258134841919, + "learning_rate": 5.555105738734345e-06, + "loss": 0.4938, + "step": 29130 + }, + { + "epoch": 0.6467186823675652, + "grad_norm": 1.220799207687378, + "learning_rate": 5.5519826030672375e-06, + "loss": 0.4188, + "step": 29135 + }, + { + "epoch": 0.6468296689270929, + "grad_norm": 1.2013776302337646, + "learning_rate": 5.548860008175167e-06, + "loss": 0.3992, + "step": 29140 + }, + { + "epoch": 0.6469406554866206, + "grad_norm": 0.6283308863639832, + "learning_rate": 5.545737954437774e-06, + "loss": 0.2735, + "step": 29145 + }, + { + "epoch": 0.6470516420461482, + "grad_norm": 1.1020041704177856, + "learning_rate": 5.542616442234618e-06, + "loss": 0.3537, + "step": 29150 + }, + { + "epoch": 0.6471626286056759, + "grad_norm": 0.9292207956314087, + "learning_rate": 5.53949547194521e-06, + "loss": 0.487, + "step": 29155 + }, + { + "epoch": 0.6472736151652035, + "grad_norm": 1.0567008256912231, + "learning_rate": 5.536375043948979e-06, + "loss": 0.453, + "step": 29160 + }, + { + "epoch": 0.6473846017247311, + "grad_norm": 1.5478672981262207, + "learning_rate": 5.533255158625304e-06, + "loss": 0.5066, + "step": 29165 + }, + { + "epoch": 0.6474955882842588, + "grad_norm": 1.153273344039917, + "learning_rate": 5.530135816353484e-06, + "loss": 0.372, + "step": 29170 + }, + { + "epoch": 0.6476065748437864, + "grad_norm": 1.4719431400299072, + "learning_rate": 5.527017017512759e-06, + "loss": 0.5028, + "step": 29175 + }, + { + "epoch": 0.647717561403314, + "grad_norm": 1.5583471059799194, + "learning_rate": 5.5238987624823075e-06, + "loss": 0.4037, + "step": 29180 + }, + { + "epoch": 0.6478285479628417, + "grad_norm": 1.2767226696014404, + "learning_rate": 5.52078105164123e-06, + "loss": 0.4889, + "step": 29185 + }, + { + "epoch": 0.6479395345223693, + "grad_norm": 1.0163935422897339, + "learning_rate": 5.5176638853685736e-06, + "loss": 0.4455, + "step": 29190 + }, + { + "epoch": 0.648050521081897, + "grad_norm": 1.9621665477752686, + "learning_rate": 5.514547264043305e-06, + "loss": 0.319, + "step": 29195 + }, + { + "epoch": 0.6481615076414247, + "grad_norm": 1.4272702932357788, + "learning_rate": 5.5114311880443374e-06, + "loss": 0.4779, + "step": 29200 + }, + { + "epoch": 0.6482724942009522, + "grad_norm": 1.5791912078857422, + "learning_rate": 5.508315657750516e-06, + "loss": 0.4263, + "step": 29205 + }, + { + "epoch": 0.6483834807604799, + "grad_norm": 1.9292148351669312, + "learning_rate": 5.505200673540609e-06, + "loss": 0.3609, + "step": 29210 + }, + { + "epoch": 0.6484944673200076, + "grad_norm": 0.8313891887664795, + "learning_rate": 5.502086235793336e-06, + "loss": 0.3403, + "step": 29215 + }, + { + "epoch": 0.6486054538795352, + "grad_norm": 1.1140210628509521, + "learning_rate": 5.498972344887328e-06, + "loss": 0.4964, + "step": 29220 + }, + { + "epoch": 0.6487164404390628, + "grad_norm": 1.0608550310134888, + "learning_rate": 5.495859001201166e-06, + "loss": 0.3615, + "step": 29225 + }, + { + "epoch": 0.6488274269985904, + "grad_norm": 1.2314826250076294, + "learning_rate": 5.492746205113367e-06, + "loss": 0.4928, + "step": 29230 + }, + { + "epoch": 0.6489384135581181, + "grad_norm": 1.6340162754058838, + "learning_rate": 5.489633957002362e-06, + "loss": 0.2452, + "step": 29235 + }, + { + "epoch": 0.6490494001176458, + "grad_norm": 1.148646354675293, + "learning_rate": 5.486522257246538e-06, + "loss": 0.3765, + "step": 29240 + }, + { + "epoch": 0.6491603866771734, + "grad_norm": 1.5189963579177856, + "learning_rate": 5.483411106224199e-06, + "loss": 0.4144, + "step": 29245 + }, + { + "epoch": 0.649271373236701, + "grad_norm": 1.792006254196167, + "learning_rate": 5.480300504313593e-06, + "loss": 0.2704, + "step": 29250 + }, + { + "epoch": 0.6493823597962287, + "grad_norm": 0.776985764503479, + "learning_rate": 5.47719045189289e-06, + "loss": 0.4363, + "step": 29255 + }, + { + "epoch": 0.6494933463557563, + "grad_norm": 1.555522084236145, + "learning_rate": 5.474080949340203e-06, + "loss": 0.2998, + "step": 29260 + }, + { + "epoch": 0.649604332915284, + "grad_norm": 1.0401331186294556, + "learning_rate": 5.47097199703358e-06, + "loss": 0.4357, + "step": 29265 + }, + { + "epoch": 0.6497153194748116, + "grad_norm": 1.3011021614074707, + "learning_rate": 5.467863595350988e-06, + "loss": 0.4046, + "step": 29270 + }, + { + "epoch": 0.6498263060343392, + "grad_norm": 0.8262555003166199, + "learning_rate": 5.4647557446703446e-06, + "loss": 0.4975, + "step": 29275 + }, + { + "epoch": 0.6499372925938669, + "grad_norm": 0.6609529852867126, + "learning_rate": 5.461648445369485e-06, + "loss": 0.3821, + "step": 29280 + }, + { + "epoch": 0.6500482791533945, + "grad_norm": 1.106066346168518, + "learning_rate": 5.458541697826185e-06, + "loss": 0.3786, + "step": 29285 + }, + { + "epoch": 0.6501592657129222, + "grad_norm": 1.384567141532898, + "learning_rate": 5.4554355024181596e-06, + "loss": 0.4237, + "step": 29290 + }, + { + "epoch": 0.6502702522724498, + "grad_norm": 1.3681679964065552, + "learning_rate": 5.45232985952304e-06, + "loss": 0.2711, + "step": 29295 + }, + { + "epoch": 0.6503812388319774, + "grad_norm": 0.8765036463737488, + "learning_rate": 5.4492247695184085e-06, + "loss": 0.336, + "step": 29300 + }, + { + "epoch": 0.6504922253915051, + "grad_norm": 1.1128723621368408, + "learning_rate": 5.446120232781764e-06, + "loss": 0.5218, + "step": 29305 + }, + { + "epoch": 0.6506032119510328, + "grad_norm": 0.899052619934082, + "learning_rate": 5.443016249690552e-06, + "loss": 0.3567, + "step": 29310 + }, + { + "epoch": 0.6507141985105603, + "grad_norm": 0.8907622694969177, + "learning_rate": 5.439912820622139e-06, + "loss": 0.3628, + "step": 29315 + }, + { + "epoch": 0.650825185070088, + "grad_norm": 0.8252400755882263, + "learning_rate": 5.436809945953835e-06, + "loss": 0.412, + "step": 29320 + }, + { + "epoch": 0.6509361716296157, + "grad_norm": 1.7461494207382202, + "learning_rate": 5.43370762606287e-06, + "loss": 0.4964, + "step": 29325 + }, + { + "epoch": 0.6510471581891433, + "grad_norm": 1.049271583557129, + "learning_rate": 5.430605861326421e-06, + "loss": 0.3316, + "step": 29330 + }, + { + "epoch": 0.651158144748671, + "grad_norm": 1.3792396783828735, + "learning_rate": 5.4275046521215844e-06, + "loss": 0.5365, + "step": 29335 + }, + { + "epoch": 0.6512691313081985, + "grad_norm": 1.2729400396347046, + "learning_rate": 5.4244039988254e-06, + "loss": 0.4268, + "step": 29340 + }, + { + "epoch": 0.6513801178677262, + "grad_norm": 1.1661580801010132, + "learning_rate": 5.4213039018148285e-06, + "loss": 0.3322, + "step": 29345 + }, + { + "epoch": 0.6514911044272539, + "grad_norm": 0.8208044767379761, + "learning_rate": 5.418204361466777e-06, + "loss": 0.416, + "step": 29350 + }, + { + "epoch": 0.6516020909867815, + "grad_norm": 0.8626852631568909, + "learning_rate": 5.415105378158069e-06, + "loss": 0.3311, + "step": 29355 + }, + { + "epoch": 0.6517130775463091, + "grad_norm": 1.2224349975585938, + "learning_rate": 5.412006952265476e-06, + "loss": 0.3479, + "step": 29360 + }, + { + "epoch": 0.6518240641058368, + "grad_norm": 0.9776822924613953, + "learning_rate": 5.408909084165688e-06, + "loss": 0.5522, + "step": 29365 + }, + { + "epoch": 0.6519350506653644, + "grad_norm": 1.2499409914016724, + "learning_rate": 5.405811774235334e-06, + "loss": 0.5577, + "step": 29370 + }, + { + "epoch": 0.6520460372248921, + "grad_norm": 1.3289955854415894, + "learning_rate": 5.402715022850983e-06, + "loss": 0.3796, + "step": 29375 + }, + { + "epoch": 0.6521570237844198, + "grad_norm": 1.8029693365097046, + "learning_rate": 5.3996188303891175e-06, + "loss": 0.4136, + "step": 29380 + }, + { + "epoch": 0.6522680103439473, + "grad_norm": 1.359915852546692, + "learning_rate": 5.396523197226169e-06, + "loss": 0.3164, + "step": 29385 + }, + { + "epoch": 0.652378996903475, + "grad_norm": 0.7354469299316406, + "learning_rate": 5.393428123738487e-06, + "loss": 0.5063, + "step": 29390 + }, + { + "epoch": 0.6524899834630026, + "grad_norm": 1.1061774492263794, + "learning_rate": 5.390333610302365e-06, + "loss": 0.383, + "step": 29395 + }, + { + "epoch": 0.6526009700225303, + "grad_norm": 1.4231762886047363, + "learning_rate": 5.387239657294028e-06, + "loss": 0.4511, + "step": 29400 + }, + { + "epoch": 0.6527119565820579, + "grad_norm": 1.2462953329086304, + "learning_rate": 5.384146265089618e-06, + "loss": 0.3371, + "step": 29405 + }, + { + "epoch": 0.6528229431415855, + "grad_norm": 1.5884225368499756, + "learning_rate": 5.381053434065229e-06, + "loss": 0.3727, + "step": 29410 + }, + { + "epoch": 0.6529339297011132, + "grad_norm": 0.7938171625137329, + "learning_rate": 5.3779611645968696e-06, + "loss": 0.2553, + "step": 29415 + }, + { + "epoch": 0.6530449162606409, + "grad_norm": 0.7266703844070435, + "learning_rate": 5.374869457060494e-06, + "loss": 0.3257, + "step": 29420 + }, + { + "epoch": 0.6531559028201684, + "grad_norm": 1.5917963981628418, + "learning_rate": 5.371778311831974e-06, + "loss": 0.4968, + "step": 29425 + }, + { + "epoch": 0.6532668893796961, + "grad_norm": 0.8227339386940002, + "learning_rate": 5.368687729287125e-06, + "loss": 0.4034, + "step": 29430 + }, + { + "epoch": 0.6533778759392238, + "grad_norm": 1.4282219409942627, + "learning_rate": 5.3655977098016955e-06, + "loss": 0.5062, + "step": 29435 + }, + { + "epoch": 0.6534888624987514, + "grad_norm": 1.2380449771881104, + "learning_rate": 5.362508253751349e-06, + "loss": 0.4836, + "step": 29440 + }, + { + "epoch": 0.6535998490582791, + "grad_norm": 1.3846499919891357, + "learning_rate": 5.3594193615116995e-06, + "loss": 0.4116, + "step": 29445 + }, + { + "epoch": 0.6537108356178066, + "grad_norm": 1.1998333930969238, + "learning_rate": 5.356331033458276e-06, + "loss": 0.4964, + "step": 29450 + }, + { + "epoch": 0.6538218221773343, + "grad_norm": 0.9864094257354736, + "learning_rate": 5.353243269966553e-06, + "loss": 0.4673, + "step": 29455 + }, + { + "epoch": 0.653932808736862, + "grad_norm": 1.1273391246795654, + "learning_rate": 5.350156071411933e-06, + "loss": 0.3415, + "step": 29460 + }, + { + "epoch": 0.6540437952963896, + "grad_norm": 1.0939232110977173, + "learning_rate": 5.347069438169739e-06, + "loss": 0.2618, + "step": 29465 + }, + { + "epoch": 0.6541547818559172, + "grad_norm": 1.1240630149841309, + "learning_rate": 5.343983370615242e-06, + "loss": 0.2867, + "step": 29470 + }, + { + "epoch": 0.6542657684154449, + "grad_norm": 0.9784446358680725, + "learning_rate": 5.340897869123629e-06, + "loss": 0.3292, + "step": 29475 + }, + { + "epoch": 0.6543767549749725, + "grad_norm": 1.2527360916137695, + "learning_rate": 5.3378129340700256e-06, + "loss": 0.3011, + "step": 29480 + }, + { + "epoch": 0.6544877415345002, + "grad_norm": 1.3284943103790283, + "learning_rate": 5.334728565829495e-06, + "loss": 0.5744, + "step": 29485 + }, + { + "epoch": 0.6545987280940279, + "grad_norm": 0.7942169308662415, + "learning_rate": 5.331644764777016e-06, + "loss": 0.413, + "step": 29490 + }, + { + "epoch": 0.6547097146535554, + "grad_norm": 1.040993094444275, + "learning_rate": 5.328561531287513e-06, + "loss": 0.2812, + "step": 29495 + }, + { + "epoch": 0.6548207012130831, + "grad_norm": 1.427320957183838, + "learning_rate": 5.325478865735829e-06, + "loss": 0.4896, + "step": 29500 + }, + { + "epoch": 0.6549316877726107, + "grad_norm": 1.2465837001800537, + "learning_rate": 5.32239676849675e-06, + "loss": 0.5024, + "step": 29505 + }, + { + "epoch": 0.6550426743321384, + "grad_norm": 1.5954735279083252, + "learning_rate": 5.319315239944982e-06, + "loss": 0.4399, + "step": 29510 + }, + { + "epoch": 0.655153660891666, + "grad_norm": 0.9695634841918945, + "learning_rate": 5.316234280455168e-06, + "loss": 0.3623, + "step": 29515 + }, + { + "epoch": 0.6552646474511936, + "grad_norm": 1.064803957939148, + "learning_rate": 5.313153890401888e-06, + "loss": 0.4928, + "step": 29520 + }, + { + "epoch": 0.6553756340107213, + "grad_norm": 0.8261331915855408, + "learning_rate": 5.310074070159634e-06, + "loss": 0.2688, + "step": 29525 + }, + { + "epoch": 0.655486620570249, + "grad_norm": 1.123943567276001, + "learning_rate": 5.30699482010285e-06, + "loss": 0.2963, + "step": 29530 + }, + { + "epoch": 0.6555976071297765, + "grad_norm": 1.4670149087905884, + "learning_rate": 5.303916140605893e-06, + "loss": 0.3191, + "step": 29535 + }, + { + "epoch": 0.6557085936893042, + "grad_norm": 1.5768437385559082, + "learning_rate": 5.300838032043061e-06, + "loss": 0.5026, + "step": 29540 + }, + { + "epoch": 0.6558195802488319, + "grad_norm": 0.774504542350769, + "learning_rate": 5.297760494788586e-06, + "loss": 0.357, + "step": 29545 + }, + { + "epoch": 0.6559305668083595, + "grad_norm": 1.2776967287063599, + "learning_rate": 5.294683529216616e-06, + "loss": 0.5908, + "step": 29550 + }, + { + "epoch": 0.6560415533678872, + "grad_norm": 1.3623162508010864, + "learning_rate": 5.291607135701246e-06, + "loss": 0.3338, + "step": 29555 + }, + { + "epoch": 0.6561525399274147, + "grad_norm": 0.9033370614051819, + "learning_rate": 5.288531314616488e-06, + "loss": 0.3197, + "step": 29560 + }, + { + "epoch": 0.6562635264869424, + "grad_norm": 1.1467854976654053, + "learning_rate": 5.285456066336292e-06, + "loss": 0.2737, + "step": 29565 + }, + { + "epoch": 0.6563745130464701, + "grad_norm": 1.61380136013031, + "learning_rate": 5.2823813912345345e-06, + "loss": 0.3388, + "step": 29570 + }, + { + "epoch": 0.6564854996059977, + "grad_norm": 0.3687746822834015, + "learning_rate": 5.2793072896850295e-06, + "loss": 0.4065, + "step": 29575 + }, + { + "epoch": 0.6565964861655254, + "grad_norm": 1.5772136449813843, + "learning_rate": 5.276233762061507e-06, + "loss": 0.4353, + "step": 29580 + }, + { + "epoch": 0.656707472725053, + "grad_norm": 1.5397409200668335, + "learning_rate": 5.273160808737647e-06, + "loss": 0.3549, + "step": 29585 + }, + { + "epoch": 0.6568184592845806, + "grad_norm": 1.4259532690048218, + "learning_rate": 5.270088430087039e-06, + "loss": 0.3488, + "step": 29590 + }, + { + "epoch": 0.6569294458441083, + "grad_norm": 1.0682930946350098, + "learning_rate": 5.267016626483219e-06, + "loss": 0.4933, + "step": 29595 + }, + { + "epoch": 0.657040432403636, + "grad_norm": 1.2446573972702026, + "learning_rate": 5.263945398299642e-06, + "loss": 0.3519, + "step": 29600 + }, + { + "epoch": 0.6571514189631635, + "grad_norm": 1.5529674291610718, + "learning_rate": 5.260874745909704e-06, + "loss": 0.4463, + "step": 29605 + }, + { + "epoch": 0.6572624055226912, + "grad_norm": 1.337281584739685, + "learning_rate": 5.2578046696867165e-06, + "loss": 0.4352, + "step": 29610 + }, + { + "epoch": 0.6573733920822188, + "grad_norm": 1.5599809885025024, + "learning_rate": 5.254735170003937e-06, + "loss": 0.321, + "step": 29615 + }, + { + "epoch": 0.6574843786417465, + "grad_norm": 1.1805211305618286, + "learning_rate": 5.251666247234537e-06, + "loss": 0.3249, + "step": 29620 + }, + { + "epoch": 0.6575953652012742, + "grad_norm": 1.4224441051483154, + "learning_rate": 5.248597901751631e-06, + "loss": 0.5011, + "step": 29625 + }, + { + "epoch": 0.6577063517608017, + "grad_norm": 1.2685762643814087, + "learning_rate": 5.245530133928259e-06, + "loss": 0.2142, + "step": 29630 + }, + { + "epoch": 0.6578173383203294, + "grad_norm": 1.105307936668396, + "learning_rate": 5.242462944137385e-06, + "loss": 0.4682, + "step": 29635 + }, + { + "epoch": 0.6579283248798571, + "grad_norm": 0.9354825019836426, + "learning_rate": 5.239396332751916e-06, + "loss": 0.2499, + "step": 29640 + }, + { + "epoch": 0.6580393114393847, + "grad_norm": 0.9176279306411743, + "learning_rate": 5.236330300144669e-06, + "loss": 0.6083, + "step": 29645 + }, + { + "epoch": 0.6581502979989123, + "grad_norm": 1.7297823429107666, + "learning_rate": 5.233264846688409e-06, + "loss": 0.3017, + "step": 29650 + }, + { + "epoch": 0.65826128455844, + "grad_norm": 1.1669957637786865, + "learning_rate": 5.230199972755828e-06, + "loss": 0.4691, + "step": 29655 + }, + { + "epoch": 0.6583722711179676, + "grad_norm": 1.277626633644104, + "learning_rate": 5.227135678719531e-06, + "loss": 0.395, + "step": 29660 + }, + { + "epoch": 0.6584832576774953, + "grad_norm": 1.1681313514709473, + "learning_rate": 5.224071964952078e-06, + "loss": 0.3731, + "step": 29665 + }, + { + "epoch": 0.6585942442370228, + "grad_norm": 1.8174617290496826, + "learning_rate": 5.221008831825931e-06, + "loss": 0.4484, + "step": 29670 + }, + { + "epoch": 0.6587052307965505, + "grad_norm": 0.9886231422424316, + "learning_rate": 5.2179462797135095e-06, + "loss": 0.3804, + "step": 29675 + }, + { + "epoch": 0.6588162173560782, + "grad_norm": 1.7291259765625, + "learning_rate": 5.214884308987136e-06, + "loss": 0.4318, + "step": 29680 + }, + { + "epoch": 0.6589272039156058, + "grad_norm": 1.1902395486831665, + "learning_rate": 5.211822920019081e-06, + "loss": 0.3705, + "step": 29685 + }, + { + "epoch": 0.6590381904751335, + "grad_norm": 1.5060759782791138, + "learning_rate": 5.2087621131815404e-06, + "loss": 0.4569, + "step": 29690 + }, + { + "epoch": 0.6591491770346611, + "grad_norm": 1.078432321548462, + "learning_rate": 5.205701888846631e-06, + "loss": 0.3947, + "step": 29695 + }, + { + "epoch": 0.6592601635941887, + "grad_norm": 0.9717170000076294, + "learning_rate": 5.202642247386409e-06, + "loss": 0.4034, + "step": 29700 + }, + { + "epoch": 0.6593711501537164, + "grad_norm": 1.2641414403915405, + "learning_rate": 5.199583189172851e-06, + "loss": 0.5615, + "step": 29705 + }, + { + "epoch": 0.6594821367132441, + "grad_norm": 1.0129261016845703, + "learning_rate": 5.1965247145778685e-06, + "loss": 0.3732, + "step": 29710 + }, + { + "epoch": 0.6595931232727716, + "grad_norm": 1.067817211151123, + "learning_rate": 5.193466823973307e-06, + "loss": 0.3279, + "step": 29715 + }, + { + "epoch": 0.6597041098322993, + "grad_norm": 1.3536492586135864, + "learning_rate": 5.190409517730924e-06, + "loss": 0.4702, + "step": 29720 + }, + { + "epoch": 0.659815096391827, + "grad_norm": 1.6192893981933594, + "learning_rate": 5.1873527962224266e-06, + "loss": 0.4683, + "step": 29725 + }, + { + "epoch": 0.6599260829513546, + "grad_norm": 0.9836245775222778, + "learning_rate": 5.184296659819431e-06, + "loss": 0.3923, + "step": 29730 + }, + { + "epoch": 0.6600370695108823, + "grad_norm": 1.0475212335586548, + "learning_rate": 5.181241108893498e-06, + "loss": 0.4763, + "step": 29735 + }, + { + "epoch": 0.6601480560704098, + "grad_norm": 0.9123460054397583, + "learning_rate": 5.178186143816113e-06, + "loss": 0.4424, + "step": 29740 + }, + { + "epoch": 0.6602590426299375, + "grad_norm": 2.083481788635254, + "learning_rate": 5.175131764958681e-06, + "loss": 0.3078, + "step": 29745 + }, + { + "epoch": 0.6603700291894652, + "grad_norm": 1.2369792461395264, + "learning_rate": 5.172077972692553e-06, + "loss": 0.3764, + "step": 29750 + }, + { + "epoch": 0.6604810157489928, + "grad_norm": 1.2039142847061157, + "learning_rate": 5.169024767388989e-06, + "loss": 0.3861, + "step": 29755 + }, + { + "epoch": 0.6605920023085204, + "grad_norm": 1.1370278596878052, + "learning_rate": 5.1659721494191964e-06, + "loss": 0.4308, + "step": 29760 + }, + { + "epoch": 0.6607029888680481, + "grad_norm": 1.3318910598754883, + "learning_rate": 5.162920119154293e-06, + "loss": 0.3658, + "step": 29765 + }, + { + "epoch": 0.6608139754275757, + "grad_norm": 1.2840640544891357, + "learning_rate": 5.1598686769653395e-06, + "loss": 0.3036, + "step": 29770 + }, + { + "epoch": 0.6609249619871034, + "grad_norm": 1.0112098455429077, + "learning_rate": 5.156817823223323e-06, + "loss": 0.4429, + "step": 29775 + }, + { + "epoch": 0.6610359485466311, + "grad_norm": 0.8643494844436646, + "learning_rate": 5.15376755829915e-06, + "loss": 0.4527, + "step": 29780 + }, + { + "epoch": 0.6611469351061586, + "grad_norm": 1.8895761966705322, + "learning_rate": 5.150717882563668e-06, + "loss": 0.4424, + "step": 29785 + }, + { + "epoch": 0.6612579216656863, + "grad_norm": 1.0386717319488525, + "learning_rate": 5.147668796387639e-06, + "loss": 0.4194, + "step": 29790 + }, + { + "epoch": 0.6613689082252139, + "grad_norm": 0.8823903203010559, + "learning_rate": 5.144620300141763e-06, + "loss": 0.2653, + "step": 29795 + }, + { + "epoch": 0.6614798947847416, + "grad_norm": 1.2153304815292358, + "learning_rate": 5.141572394196672e-06, + "loss": 0.2755, + "step": 29800 + }, + { + "epoch": 0.6615908813442692, + "grad_norm": 1.1399264335632324, + "learning_rate": 5.1385250789229116e-06, + "loss": 0.4392, + "step": 29805 + }, + { + "epoch": 0.6617018679037968, + "grad_norm": 1.8947248458862305, + "learning_rate": 5.1354783546909725e-06, + "loss": 0.3716, + "step": 29810 + }, + { + "epoch": 0.6618128544633245, + "grad_norm": 1.0885425806045532, + "learning_rate": 5.132432221871256e-06, + "loss": 0.4363, + "step": 29815 + }, + { + "epoch": 0.6619238410228522, + "grad_norm": 2.46814227104187, + "learning_rate": 5.1293866808341084e-06, + "loss": 0.5248, + "step": 29820 + }, + { + "epoch": 0.6620348275823797, + "grad_norm": 0.9989572167396545, + "learning_rate": 5.126341731949791e-06, + "loss": 0.371, + "step": 29825 + }, + { + "epoch": 0.6621458141419074, + "grad_norm": 0.8480471968650818, + "learning_rate": 5.123297375588503e-06, + "loss": 0.4898, + "step": 29830 + }, + { + "epoch": 0.6622568007014351, + "grad_norm": 1.4133449792861938, + "learning_rate": 5.120253612120363e-06, + "loss": 0.3531, + "step": 29835 + }, + { + "epoch": 0.6623677872609627, + "grad_norm": 1.0268564224243164, + "learning_rate": 5.117210441915426e-06, + "loss": 0.4025, + "step": 29840 + }, + { + "epoch": 0.6624787738204904, + "grad_norm": 0.7333199977874756, + "learning_rate": 5.114167865343664e-06, + "loss": 0.3128, + "step": 29845 + }, + { + "epoch": 0.6625897603800179, + "grad_norm": 1.1543068885803223, + "learning_rate": 5.1111258827749925e-06, + "loss": 0.4092, + "step": 29850 + }, + { + "epoch": 0.6627007469395456, + "grad_norm": 1.7395691871643066, + "learning_rate": 5.108084494579235e-06, + "loss": 0.5329, + "step": 29855 + }, + { + "epoch": 0.6628117334990733, + "grad_norm": 1.5491636991500854, + "learning_rate": 5.1050437011261624e-06, + "loss": 0.306, + "step": 29860 + }, + { + "epoch": 0.6629227200586009, + "grad_norm": 0.8757246136665344, + "learning_rate": 5.102003502785456e-06, + "loss": 0.3236, + "step": 29865 + }, + { + "epoch": 0.6630337066181285, + "grad_norm": 1.34422767162323, + "learning_rate": 5.098963899926741e-06, + "loss": 0.5338, + "step": 29870 + }, + { + "epoch": 0.6631446931776562, + "grad_norm": 0.969270646572113, + "learning_rate": 5.095924892919556e-06, + "loss": 0.3907, + "step": 29875 + }, + { + "epoch": 0.6632556797371838, + "grad_norm": 2.467583417892456, + "learning_rate": 5.0928864821333745e-06, + "loss": 0.3063, + "step": 29880 + }, + { + "epoch": 0.6633666662967115, + "grad_norm": 1.0197619199752808, + "learning_rate": 5.0898486679376e-06, + "loss": 0.4827, + "step": 29885 + }, + { + "epoch": 0.6634776528562392, + "grad_norm": 1.4381027221679688, + "learning_rate": 5.086811450701554e-06, + "loss": 0.4524, + "step": 29890 + }, + { + "epoch": 0.6635886394157667, + "grad_norm": 1.236048698425293, + "learning_rate": 5.083774830794499e-06, + "loss": 0.3053, + "step": 29895 + }, + { + "epoch": 0.6636996259752944, + "grad_norm": 1.6473312377929688, + "learning_rate": 5.080738808585608e-06, + "loss": 0.3745, + "step": 29900 + }, + { + "epoch": 0.663810612534822, + "grad_norm": 0.7598027586936951, + "learning_rate": 5.077703384443995e-06, + "loss": 0.5744, + "step": 29905 + }, + { + "epoch": 0.6639215990943497, + "grad_norm": 1.2780729532241821, + "learning_rate": 5.0746685587387e-06, + "loss": 0.3281, + "step": 29910 + }, + { + "epoch": 0.6640325856538773, + "grad_norm": 1.2586196660995483, + "learning_rate": 5.0716343318386795e-06, + "loss": 0.4889, + "step": 29915 + }, + { + "epoch": 0.6641435722134049, + "grad_norm": 0.7518161535263062, + "learning_rate": 5.068600704112832e-06, + "loss": 0.2159, + "step": 29920 + }, + { + "epoch": 0.6642545587729326, + "grad_norm": 1.3559149503707886, + "learning_rate": 5.065567675929968e-06, + "loss": 0.4216, + "step": 29925 + }, + { + "epoch": 0.6643655453324603, + "grad_norm": 1.2818373441696167, + "learning_rate": 5.062535247658838e-06, + "loss": 0.3533, + "step": 29930 + }, + { + "epoch": 0.6644765318919879, + "grad_norm": 1.5211478471755981, + "learning_rate": 5.059503419668117e-06, + "loss": 0.4355, + "step": 29935 + }, + { + "epoch": 0.6645875184515155, + "grad_norm": 2.1579790115356445, + "learning_rate": 5.056472192326398e-06, + "loss": 0.3259, + "step": 29940 + }, + { + "epoch": 0.6646985050110432, + "grad_norm": 0.821919858455658, + "learning_rate": 5.053441566002214e-06, + "loss": 0.3938, + "step": 29945 + }, + { + "epoch": 0.6648094915705708, + "grad_norm": 1.3272850513458252, + "learning_rate": 5.0504115410640105e-06, + "loss": 0.3472, + "step": 29950 + }, + { + "epoch": 0.6649204781300985, + "grad_norm": 1.5676440000534058, + "learning_rate": 5.047382117880178e-06, + "loss": 0.4516, + "step": 29955 + }, + { + "epoch": 0.665031464689626, + "grad_norm": 1.3177131414413452, + "learning_rate": 5.044353296819011e-06, + "loss": 0.452, + "step": 29960 + }, + { + "epoch": 0.6651424512491537, + "grad_norm": 1.7546396255493164, + "learning_rate": 5.0413250782487524e-06, + "loss": 0.4441, + "step": 29965 + }, + { + "epoch": 0.6652534378086814, + "grad_norm": 0.9149209856987, + "learning_rate": 5.0382974625375635e-06, + "loss": 0.1941, + "step": 29970 + }, + { + "epoch": 0.665364424368209, + "grad_norm": 1.5751773118972778, + "learning_rate": 5.035270450053526e-06, + "loss": 0.6836, + "step": 29975 + }, + { + "epoch": 0.6654754109277367, + "grad_norm": 1.3157830238342285, + "learning_rate": 5.03224404116466e-06, + "loss": 0.4885, + "step": 29980 + }, + { + "epoch": 0.6655863974872643, + "grad_norm": 1.436733365058899, + "learning_rate": 5.029218236238899e-06, + "loss": 0.4401, + "step": 29985 + }, + { + "epoch": 0.6656973840467919, + "grad_norm": 2.5588650703430176, + "learning_rate": 5.026193035644113e-06, + "loss": 0.4649, + "step": 29990 + }, + { + "epoch": 0.6658083706063196, + "grad_norm": 1.2236137390136719, + "learning_rate": 5.023168439748103e-06, + "loss": 0.3297, + "step": 29995 + }, + { + "epoch": 0.6659193571658473, + "grad_norm": 1.1968693733215332, + "learning_rate": 5.020144448918578e-06, + "loss": 0.3276, + "step": 30000 + }, + { + "epoch": 0.6660303437253748, + "grad_norm": 1.0230461359024048, + "learning_rate": 5.017121063523194e-06, + "loss": 0.4819, + "step": 30005 + }, + { + "epoch": 0.6661413302849025, + "grad_norm": 1.333814024925232, + "learning_rate": 5.014098283929516e-06, + "loss": 0.3443, + "step": 30010 + }, + { + "epoch": 0.6662523168444301, + "grad_norm": 1.472735047340393, + "learning_rate": 5.011076110505047e-06, + "loss": 0.3886, + "step": 30015 + }, + { + "epoch": 0.6663633034039578, + "grad_norm": 0.960411012172699, + "learning_rate": 5.0080545436172155e-06, + "loss": 0.3675, + "step": 30020 + }, + { + "epoch": 0.6664742899634855, + "grad_norm": 1.6177083253860474, + "learning_rate": 5.00503358363337e-06, + "loss": 0.3223, + "step": 30025 + }, + { + "epoch": 0.666585276523013, + "grad_norm": 1.1531990766525269, + "learning_rate": 5.0020132309207905e-06, + "loss": 0.3309, + "step": 30030 + }, + { + "epoch": 0.6666962630825407, + "grad_norm": 0.785301685333252, + "learning_rate": 4.998993485846678e-06, + "loss": 0.3819, + "step": 30035 + }, + { + "epoch": 0.6668072496420684, + "grad_norm": 1.5275787115097046, + "learning_rate": 4.99597434877817e-06, + "loss": 0.4338, + "step": 30040 + }, + { + "epoch": 0.666918236201596, + "grad_norm": 2.108081579208374, + "learning_rate": 4.9929558200823135e-06, + "loss": 0.405, + "step": 30045 + }, + { + "epoch": 0.6670292227611236, + "grad_norm": 1.3582932949066162, + "learning_rate": 4.989937900126096e-06, + "loss": 0.4276, + "step": 30050 + }, + { + "epoch": 0.6671402093206513, + "grad_norm": 1.2807767391204834, + "learning_rate": 4.9869205892764306e-06, + "loss": 0.5332, + "step": 30055 + }, + { + "epoch": 0.6672511958801789, + "grad_norm": 1.020280122756958, + "learning_rate": 4.983903887900144e-06, + "loss": 0.3643, + "step": 30060 + }, + { + "epoch": 0.6673621824397066, + "grad_norm": 1.1606618165969849, + "learning_rate": 4.9808877963640025e-06, + "loss": 0.3958, + "step": 30065 + }, + { + "epoch": 0.6674731689992341, + "grad_norm": 0.8008097410202026, + "learning_rate": 4.977872315034687e-06, + "loss": 0.4277, + "step": 30070 + }, + { + "epoch": 0.6675841555587618, + "grad_norm": 1.4365850687026978, + "learning_rate": 4.974857444278816e-06, + "loss": 0.4398, + "step": 30075 + }, + { + "epoch": 0.6676951421182895, + "grad_norm": 1.1749114990234375, + "learning_rate": 4.97184318446292e-06, + "loss": 0.4682, + "step": 30080 + }, + { + "epoch": 0.6678061286778171, + "grad_norm": 1.1037579774856567, + "learning_rate": 4.96882953595347e-06, + "loss": 0.2849, + "step": 30085 + }, + { + "epoch": 0.6679171152373448, + "grad_norm": 1.5772374868392944, + "learning_rate": 4.965816499116849e-06, + "loss": 0.559, + "step": 30090 + }, + { + "epoch": 0.6680281017968724, + "grad_norm": 1.2721027135849, + "learning_rate": 4.9628040743193775e-06, + "loss": 0.5381, + "step": 30095 + }, + { + "epoch": 0.6681390883564, + "grad_norm": 1.6205239295959473, + "learning_rate": 4.9597922619272894e-06, + "loss": 0.4568, + "step": 30100 + }, + { + "epoch": 0.6682500749159277, + "grad_norm": 1.4295194149017334, + "learning_rate": 4.956781062306759e-06, + "loss": 0.3646, + "step": 30105 + }, + { + "epoch": 0.6683610614754554, + "grad_norm": 0.6828015446662903, + "learning_rate": 4.9537704758238705e-06, + "loss": 0.4187, + "step": 30110 + }, + { + "epoch": 0.668472048034983, + "grad_norm": 1.455264925956726, + "learning_rate": 4.950760502844646e-06, + "loss": 0.4342, + "step": 30115 + }, + { + "epoch": 0.6685830345945106, + "grad_norm": 0.7783439755439758, + "learning_rate": 4.947751143735022e-06, + "loss": 0.3632, + "step": 30120 + }, + { + "epoch": 0.6686940211540382, + "grad_norm": 1.1136553287506104, + "learning_rate": 4.9447423988608744e-06, + "loss": 0.3266, + "step": 30125 + }, + { + "epoch": 0.6688050077135659, + "grad_norm": 1.053621530532837, + "learning_rate": 4.941734268587987e-06, + "loss": 0.4097, + "step": 30130 + }, + { + "epoch": 0.6689159942730936, + "grad_norm": 1.3171651363372803, + "learning_rate": 4.938726753282085e-06, + "loss": 0.5856, + "step": 30135 + }, + { + "epoch": 0.6690269808326211, + "grad_norm": 1.715846061706543, + "learning_rate": 4.935719853308814e-06, + "loss": 0.5022, + "step": 30140 + }, + { + "epoch": 0.6691379673921488, + "grad_norm": 1.1749157905578613, + "learning_rate": 4.932713569033734e-06, + "loss": 0.399, + "step": 30145 + }, + { + "epoch": 0.6692489539516765, + "grad_norm": 1.3666599988937378, + "learning_rate": 4.929707900822348e-06, + "loss": 0.4338, + "step": 30150 + }, + { + "epoch": 0.6693599405112041, + "grad_norm": 1.496510624885559, + "learning_rate": 4.926702849040067e-06, + "loss": 0.4298, + "step": 30155 + }, + { + "epoch": 0.6694709270707317, + "grad_norm": 0.9879202842712402, + "learning_rate": 4.923698414052239e-06, + "loss": 0.355, + "step": 30160 + }, + { + "epoch": 0.6695819136302594, + "grad_norm": 0.870068371295929, + "learning_rate": 4.920694596224137e-06, + "loss": 0.407, + "step": 30165 + }, + { + "epoch": 0.669692900189787, + "grad_norm": 1.1352063417434692, + "learning_rate": 4.917691395920948e-06, + "loss": 0.4668, + "step": 30170 + }, + { + "epoch": 0.6698038867493147, + "grad_norm": 1.1441079378128052, + "learning_rate": 4.914688813507798e-06, + "loss": 0.4633, + "step": 30175 + }, + { + "epoch": 0.6699148733088423, + "grad_norm": 1.7710208892822266, + "learning_rate": 4.911686849349723e-06, + "loss": 0.3025, + "step": 30180 + }, + { + "epoch": 0.6700258598683699, + "grad_norm": 1.5889769792556763, + "learning_rate": 4.908685503811696e-06, + "loss": 0.4249, + "step": 30185 + }, + { + "epoch": 0.6701368464278976, + "grad_norm": 1.0026971101760864, + "learning_rate": 4.905684777258616e-06, + "loss": 0.3464, + "step": 30190 + }, + { + "epoch": 0.6702478329874252, + "grad_norm": 1.486332654953003, + "learning_rate": 4.90268467005529e-06, + "loss": 0.4357, + "step": 30195 + }, + { + "epoch": 0.6703588195469529, + "grad_norm": 1.3142849206924438, + "learning_rate": 4.899685182566472e-06, + "loss": 0.2584, + "step": 30200 + }, + { + "epoch": 0.6704698061064805, + "grad_norm": 1.6392567157745361, + "learning_rate": 4.896686315156819e-06, + "loss": 0.5574, + "step": 30205 + }, + { + "epoch": 0.6705807926660081, + "grad_norm": 1.4760245084762573, + "learning_rate": 4.893688068190933e-06, + "loss": 0.3984, + "step": 30210 + }, + { + "epoch": 0.6706917792255358, + "grad_norm": 0.6990063786506653, + "learning_rate": 4.890690442033323e-06, + "loss": 0.3127, + "step": 30215 + }, + { + "epoch": 0.6708027657850635, + "grad_norm": 0.9935014843940735, + "learning_rate": 4.887693437048433e-06, + "loss": 0.4458, + "step": 30220 + }, + { + "epoch": 0.670913752344591, + "grad_norm": 1.9423924684524536, + "learning_rate": 4.884697053600635e-06, + "loss": 0.5444, + "step": 30225 + }, + { + "epoch": 0.6710247389041187, + "grad_norm": 0.797426164150238, + "learning_rate": 4.881701292054209e-06, + "loss": 0.2913, + "step": 30230 + }, + { + "epoch": 0.6711357254636463, + "grad_norm": 1.5976841449737549, + "learning_rate": 4.878706152773377e-06, + "loss": 0.5609, + "step": 30235 + }, + { + "epoch": 0.671246712023174, + "grad_norm": 1.7722688913345337, + "learning_rate": 4.8757116361222735e-06, + "loss": 0.3951, + "step": 30240 + }, + { + "epoch": 0.6713576985827017, + "grad_norm": 1.9161877632141113, + "learning_rate": 4.872717742464963e-06, + "loss": 0.4617, + "step": 30245 + }, + { + "epoch": 0.6714686851422292, + "grad_norm": 0.7850003242492676, + "learning_rate": 4.869724472165438e-06, + "loss": 0.3621, + "step": 30250 + }, + { + "epoch": 0.6715796717017569, + "grad_norm": 1.3206015825271606, + "learning_rate": 4.866731825587602e-06, + "loss": 0.2995, + "step": 30255 + }, + { + "epoch": 0.6716906582612846, + "grad_norm": 0.9397589564323425, + "learning_rate": 4.863739803095299e-06, + "loss": 0.3133, + "step": 30260 + }, + { + "epoch": 0.6718016448208122, + "grad_norm": 1.0778348445892334, + "learning_rate": 4.8607484050522815e-06, + "loss": 0.3775, + "step": 30265 + }, + { + "epoch": 0.6719126313803399, + "grad_norm": 0.6114378571510315, + "learning_rate": 4.8577576318222365e-06, + "loss": 0.3259, + "step": 30270 + }, + { + "epoch": 0.6720236179398675, + "grad_norm": 2.020146608352661, + "learning_rate": 4.854767483768776e-06, + "loss": 0.2728, + "step": 30275 + }, + { + "epoch": 0.6721346044993951, + "grad_norm": 1.1776325702667236, + "learning_rate": 4.851777961255427e-06, + "loss": 0.5557, + "step": 30280 + }, + { + "epoch": 0.6722455910589228, + "grad_norm": 1.3874727487564087, + "learning_rate": 4.84878906464565e-06, + "loss": 0.3102, + "step": 30285 + }, + { + "epoch": 0.6723565776184504, + "grad_norm": 0.9559499621391296, + "learning_rate": 4.845800794302821e-06, + "loss": 0.4892, + "step": 30290 + }, + { + "epoch": 0.672467564177978, + "grad_norm": 2.322970151901245, + "learning_rate": 4.842813150590247e-06, + "loss": 0.4686, + "step": 30295 + }, + { + "epoch": 0.6725785507375057, + "grad_norm": 0.8557878732681274, + "learning_rate": 4.839826133871152e-06, + "loss": 0.3854, + "step": 30300 + }, + { + "epoch": 0.6726895372970333, + "grad_norm": 0.7575249671936035, + "learning_rate": 4.836839744508693e-06, + "loss": 0.3209, + "step": 30305 + }, + { + "epoch": 0.672800523856561, + "grad_norm": 1.4156855344772339, + "learning_rate": 4.8338539828659384e-06, + "loss": 0.5021, + "step": 30310 + }, + { + "epoch": 0.6729115104160887, + "grad_norm": 1.846325159072876, + "learning_rate": 4.830868849305894e-06, + "loss": 0.3873, + "step": 30315 + }, + { + "epoch": 0.6730224969756162, + "grad_norm": 0.8236002326011658, + "learning_rate": 4.827884344191474e-06, + "loss": 0.4415, + "step": 30320 + }, + { + "epoch": 0.6731334835351439, + "grad_norm": 0.9851003289222717, + "learning_rate": 4.824900467885536e-06, + "loss": 0.4356, + "step": 30325 + }, + { + "epoch": 0.6732444700946716, + "grad_norm": 0.9721106290817261, + "learning_rate": 4.821917220750838e-06, + "loss": 0.5158, + "step": 30330 + }, + { + "epoch": 0.6733554566541992, + "grad_norm": 0.8726085424423218, + "learning_rate": 4.818934603150082e-06, + "loss": 0.3236, + "step": 30335 + }, + { + "epoch": 0.6734664432137268, + "grad_norm": 1.2427781820297241, + "learning_rate": 4.815952615445878e-06, + "loss": 0.2958, + "step": 30340 + }, + { + "epoch": 0.6735774297732544, + "grad_norm": 1.540948510169983, + "learning_rate": 4.8129712580007725e-06, + "loss": 0.45, + "step": 30345 + }, + { + "epoch": 0.6736884163327821, + "grad_norm": 1.114790916442871, + "learning_rate": 4.809990531177221e-06, + "loss": 0.2742, + "step": 30350 + }, + { + "epoch": 0.6737994028923098, + "grad_norm": 1.4274752140045166, + "learning_rate": 4.807010435337616e-06, + "loss": 0.4615, + "step": 30355 + }, + { + "epoch": 0.6739103894518373, + "grad_norm": 0.9064056277275085, + "learning_rate": 4.804030970844269e-06, + "loss": 0.3181, + "step": 30360 + }, + { + "epoch": 0.674021376011365, + "grad_norm": 1.431104063987732, + "learning_rate": 4.801052138059408e-06, + "loss": 0.2833, + "step": 30365 + }, + { + "epoch": 0.6741323625708927, + "grad_norm": 1.2696224451065063, + "learning_rate": 4.798073937345194e-06, + "loss": 0.2398, + "step": 30370 + }, + { + "epoch": 0.6742433491304203, + "grad_norm": 1.2749775648117065, + "learning_rate": 4.795096369063703e-06, + "loss": 0.4965, + "step": 30375 + }, + { + "epoch": 0.674354335689948, + "grad_norm": 1.1306232213974, + "learning_rate": 4.792119433576943e-06, + "loss": 0.3975, + "step": 30380 + }, + { + "epoch": 0.6744653222494756, + "grad_norm": 0.717216432094574, + "learning_rate": 4.789143131246832e-06, + "loss": 0.5037, + "step": 30385 + }, + { + "epoch": 0.6745763088090032, + "grad_norm": 1.363181710243225, + "learning_rate": 4.786167462435224e-06, + "loss": 0.4823, + "step": 30390 + }, + { + "epoch": 0.6746872953685309, + "grad_norm": 1.6704401969909668, + "learning_rate": 4.783192427503893e-06, + "loss": 0.2458, + "step": 30395 + }, + { + "epoch": 0.6747982819280585, + "grad_norm": 1.2120403051376343, + "learning_rate": 4.780218026814527e-06, + "loss": 0.4551, + "step": 30400 + }, + { + "epoch": 0.6749092684875861, + "grad_norm": 0.7928174734115601, + "learning_rate": 4.777244260728751e-06, + "loss": 0.3698, + "step": 30405 + }, + { + "epoch": 0.6750202550471138, + "grad_norm": 1.511757493019104, + "learning_rate": 4.7742711296081e-06, + "loss": 0.4478, + "step": 30410 + }, + { + "epoch": 0.6751312416066414, + "grad_norm": 1.3559863567352295, + "learning_rate": 4.771298633814038e-06, + "loss": 0.3981, + "step": 30415 + }, + { + "epoch": 0.6752422281661691, + "grad_norm": 1.2549550533294678, + "learning_rate": 4.768326773707956e-06, + "loss": 0.399, + "step": 30420 + }, + { + "epoch": 0.6753532147256968, + "grad_norm": 2.3049769401550293, + "learning_rate": 4.765355549651156e-06, + "loss": 0.3148, + "step": 30425 + }, + { + "epoch": 0.6754642012852243, + "grad_norm": 1.6337782144546509, + "learning_rate": 4.762384962004877e-06, + "loss": 0.3418, + "step": 30430 + }, + { + "epoch": 0.675575187844752, + "grad_norm": 1.4367377758026123, + "learning_rate": 4.7594150111302635e-06, + "loss": 0.3908, + "step": 30435 + }, + { + "epoch": 0.6756861744042797, + "grad_norm": 0.9786241054534912, + "learning_rate": 4.7564456973883984e-06, + "loss": 0.4951, + "step": 30440 + }, + { + "epoch": 0.6757971609638073, + "grad_norm": 1.0460742712020874, + "learning_rate": 4.753477021140284e-06, + "loss": 0.333, + "step": 30445 + }, + { + "epoch": 0.675908147523335, + "grad_norm": 1.367433786392212, + "learning_rate": 4.7505089827468335e-06, + "loss": 0.3857, + "step": 30450 + }, + { + "epoch": 0.6760191340828625, + "grad_norm": 0.8574208617210388, + "learning_rate": 4.747541582568899e-06, + "loss": 0.3719, + "step": 30455 + }, + { + "epoch": 0.6761301206423902, + "grad_norm": 0.9311047792434692, + "learning_rate": 4.74457482096724e-06, + "loss": 0.4381, + "step": 30460 + }, + { + "epoch": 0.6762411072019179, + "grad_norm": 0.7453902959823608, + "learning_rate": 4.741608698302552e-06, + "loss": 0.3637, + "step": 30465 + }, + { + "epoch": 0.6763520937614454, + "grad_norm": 1.4380278587341309, + "learning_rate": 4.73864321493544e-06, + "loss": 0.3262, + "step": 30470 + }, + { + "epoch": 0.6764630803209731, + "grad_norm": 1.0542434453964233, + "learning_rate": 4.7356783712264405e-06, + "loss": 0.452, + "step": 30475 + }, + { + "epoch": 0.6765740668805008, + "grad_norm": 1.6459670066833496, + "learning_rate": 4.732714167536014e-06, + "loss": 0.4058, + "step": 30480 + }, + { + "epoch": 0.6766850534400284, + "grad_norm": 1.1828376054763794, + "learning_rate": 4.72975060422453e-06, + "loss": 0.3055, + "step": 30485 + }, + { + "epoch": 0.6767960399995561, + "grad_norm": 0.7649294137954712, + "learning_rate": 4.7267876816522966e-06, + "loss": 0.3618, + "step": 30490 + }, + { + "epoch": 0.6769070265590837, + "grad_norm": 1.0336248874664307, + "learning_rate": 4.723825400179527e-06, + "loss": 0.464, + "step": 30495 + }, + { + "epoch": 0.6770180131186113, + "grad_norm": 1.0955878496170044, + "learning_rate": 4.720863760166371e-06, + "loss": 0.4166, + "step": 30500 + }, + { + "epoch": 0.677128999678139, + "grad_norm": 1.1246545314788818, + "learning_rate": 4.717902761972898e-06, + "loss": 0.3503, + "step": 30505 + }, + { + "epoch": 0.6772399862376666, + "grad_norm": 1.230867624282837, + "learning_rate": 4.714942405959088e-06, + "loss": 0.2898, + "step": 30510 + }, + { + "epoch": 0.6773509727971943, + "grad_norm": 1.5435923337936401, + "learning_rate": 4.71198269248486e-06, + "loss": 0.4346, + "step": 30515 + }, + { + "epoch": 0.6774619593567219, + "grad_norm": 0.9442398548126221, + "learning_rate": 4.709023621910037e-06, + "loss": 0.5059, + "step": 30520 + }, + { + "epoch": 0.6775729459162495, + "grad_norm": 1.648020625114441, + "learning_rate": 4.706065194594378e-06, + "loss": 0.5491, + "step": 30525 + }, + { + "epoch": 0.6776839324757772, + "grad_norm": 1.0996757745742798, + "learning_rate": 4.703107410897563e-06, + "loss": 0.471, + "step": 30530 + }, + { + "epoch": 0.6777949190353049, + "grad_norm": 1.7918943166732788, + "learning_rate": 4.700150271179179e-06, + "loss": 0.2379, + "step": 30535 + }, + { + "epoch": 0.6779059055948324, + "grad_norm": 1.1911612749099731, + "learning_rate": 4.697193775798755e-06, + "loss": 0.3775, + "step": 30540 + }, + { + "epoch": 0.6780168921543601, + "grad_norm": 1.0882309675216675, + "learning_rate": 4.694237925115724e-06, + "loss": 0.5021, + "step": 30545 + }, + { + "epoch": 0.6781278787138878, + "grad_norm": 1.1152145862579346, + "learning_rate": 4.691282719489456e-06, + "loss": 0.4538, + "step": 30550 + }, + { + "epoch": 0.6782388652734154, + "grad_norm": 0.7347661852836609, + "learning_rate": 4.688328159279228e-06, + "loss": 0.5172, + "step": 30555 + }, + { + "epoch": 0.678349851832943, + "grad_norm": 1.4539144039154053, + "learning_rate": 4.68537424484425e-06, + "loss": 0.4374, + "step": 30560 + }, + { + "epoch": 0.6784608383924706, + "grad_norm": 1.5657992362976074, + "learning_rate": 4.6824209765436445e-06, + "loss": 0.3194, + "step": 30565 + }, + { + "epoch": 0.6785718249519983, + "grad_norm": 1.41443932056427, + "learning_rate": 4.679468354736467e-06, + "loss": 0.4329, + "step": 30570 + }, + { + "epoch": 0.678682811511526, + "grad_norm": 1.2728972434997559, + "learning_rate": 4.6765163797816795e-06, + "loss": 0.422, + "step": 30575 + }, + { + "epoch": 0.6787937980710536, + "grad_norm": 0.8089253306388855, + "learning_rate": 4.673565052038181e-06, + "loss": 0.5511, + "step": 30580 + }, + { + "epoch": 0.6789047846305812, + "grad_norm": 1.1690912246704102, + "learning_rate": 4.670614371864775e-06, + "loss": 0.4561, + "step": 30585 + }, + { + "epoch": 0.6790157711901089, + "grad_norm": 1.8266934156417847, + "learning_rate": 4.667664339620206e-06, + "loss": 0.3425, + "step": 30590 + }, + { + "epoch": 0.6791267577496365, + "grad_norm": 1.1937004327774048, + "learning_rate": 4.664714955663118e-06, + "loss": 0.3204, + "step": 30595 + }, + { + "epoch": 0.6792377443091642, + "grad_norm": 0.8955053687095642, + "learning_rate": 4.661766220352098e-06, + "loss": 0.4385, + "step": 30600 + }, + { + "epoch": 0.6793487308686919, + "grad_norm": 1.1877119541168213, + "learning_rate": 4.6588181340456315e-06, + "loss": 0.2442, + "step": 30605 + }, + { + "epoch": 0.6794597174282194, + "grad_norm": 1.1351243257522583, + "learning_rate": 4.655870697102145e-06, + "loss": 0.5295, + "step": 30610 + }, + { + "epoch": 0.6795707039877471, + "grad_norm": 0.9337393641471863, + "learning_rate": 4.65292390987998e-06, + "loss": 0.3629, + "step": 30615 + }, + { + "epoch": 0.6796816905472747, + "grad_norm": 1.265181303024292, + "learning_rate": 4.649977772737389e-06, + "loss": 0.3897, + "step": 30620 + }, + { + "epoch": 0.6797926771068024, + "grad_norm": 1.2284386157989502, + "learning_rate": 4.647032286032563e-06, + "loss": 0.2598, + "step": 30625 + }, + { + "epoch": 0.67990366366633, + "grad_norm": 1.3195282220840454, + "learning_rate": 4.644087450123594e-06, + "loss": 0.3161, + "step": 30630 + }, + { + "epoch": 0.6800146502258576, + "grad_norm": 1.4926730394363403, + "learning_rate": 4.641143265368515e-06, + "loss": 0.3973, + "step": 30635 + }, + { + "epoch": 0.6801256367853853, + "grad_norm": 3.5536727905273438, + "learning_rate": 4.638199732125261e-06, + "loss": 0.3454, + "step": 30640 + }, + { + "epoch": 0.680236623344913, + "grad_norm": 1.021014928817749, + "learning_rate": 4.635256850751702e-06, + "loss": 0.2805, + "step": 30645 + }, + { + "epoch": 0.6803476099044405, + "grad_norm": 1.885343074798584, + "learning_rate": 4.632314621605627e-06, + "loss": 0.4288, + "step": 30650 + }, + { + "epoch": 0.6804585964639682, + "grad_norm": 1.2568762302398682, + "learning_rate": 4.629373045044735e-06, + "loss": 0.4563, + "step": 30655 + }, + { + "epoch": 0.6805695830234959, + "grad_norm": 1.048861026763916, + "learning_rate": 4.626432121426659e-06, + "loss": 0.3522, + "step": 30660 + }, + { + "epoch": 0.6806805695830235, + "grad_norm": 1.5068540573120117, + "learning_rate": 4.623491851108942e-06, + "loss": 0.2918, + "step": 30665 + }, + { + "epoch": 0.6807915561425512, + "grad_norm": 1.1883301734924316, + "learning_rate": 4.620552234449052e-06, + "loss": 0.363, + "step": 30670 + }, + { + "epoch": 0.6809025427020787, + "grad_norm": 1.6479636430740356, + "learning_rate": 4.6176132718043866e-06, + "loss": 0.481, + "step": 30675 + }, + { + "epoch": 0.6810135292616064, + "grad_norm": 0.9919922351837158, + "learning_rate": 4.614674963532244e-06, + "loss": 0.37, + "step": 30680 + }, + { + "epoch": 0.6811245158211341, + "grad_norm": 1.2481129169464111, + "learning_rate": 4.611737309989861e-06, + "loss": 0.434, + "step": 30685 + }, + { + "epoch": 0.6812355023806617, + "grad_norm": 1.077807068824768, + "learning_rate": 4.608800311534383e-06, + "loss": 0.4506, + "step": 30690 + }, + { + "epoch": 0.6813464889401893, + "grad_norm": 0.9274584650993347, + "learning_rate": 4.60586396852288e-06, + "loss": 0.4033, + "step": 30695 + }, + { + "epoch": 0.681457475499717, + "grad_norm": 2.024184226989746, + "learning_rate": 4.602928281312351e-06, + "loss": 0.3622, + "step": 30700 + }, + { + "epoch": 0.6815684620592446, + "grad_norm": 1.019030213356018, + "learning_rate": 4.599993250259697e-06, + "loss": 0.3212, + "step": 30705 + }, + { + "epoch": 0.6816794486187723, + "grad_norm": 1.0451632738113403, + "learning_rate": 4.597058875721756e-06, + "loss": 0.2417, + "step": 30710 + }, + { + "epoch": 0.6817904351783, + "grad_norm": 0.8908951878547668, + "learning_rate": 4.594125158055275e-06, + "loss": 0.5534, + "step": 30715 + }, + { + "epoch": 0.6819014217378275, + "grad_norm": 1.572871446609497, + "learning_rate": 4.59119209761693e-06, + "loss": 0.5749, + "step": 30720 + }, + { + "epoch": 0.6820124082973552, + "grad_norm": 1.3295234441757202, + "learning_rate": 4.588259694763307e-06, + "loss": 0.4703, + "step": 30725 + }, + { + "epoch": 0.6821233948568828, + "grad_norm": 1.2267779111862183, + "learning_rate": 4.5853279498509196e-06, + "loss": 0.2847, + "step": 30730 + }, + { + "epoch": 0.6822343814164105, + "grad_norm": 1.4097859859466553, + "learning_rate": 4.582396863236205e-06, + "loss": 0.2772, + "step": 30735 + }, + { + "epoch": 0.6823453679759381, + "grad_norm": 1.4641203880310059, + "learning_rate": 4.579466435275506e-06, + "loss": 0.3562, + "step": 30740 + }, + { + "epoch": 0.6824563545354657, + "grad_norm": 1.4551869630813599, + "learning_rate": 4.576536666325103e-06, + "loss": 0.4807, + "step": 30745 + }, + { + "epoch": 0.6825673410949934, + "grad_norm": 1.5356292724609375, + "learning_rate": 4.57360755674118e-06, + "loss": 0.3482, + "step": 30750 + }, + { + "epoch": 0.6826783276545211, + "grad_norm": 1.2408578395843506, + "learning_rate": 4.570679106879852e-06, + "loss": 0.3142, + "step": 30755 + }, + { + "epoch": 0.6827893142140486, + "grad_norm": 1.8057538270950317, + "learning_rate": 4.567751317097152e-06, + "loss": 0.4854, + "step": 30760 + }, + { + "epoch": 0.6829003007735763, + "grad_norm": 1.3727633953094482, + "learning_rate": 4.564824187749025e-06, + "loss": 0.4088, + "step": 30765 + }, + { + "epoch": 0.683011287333104, + "grad_norm": 1.9439268112182617, + "learning_rate": 4.561897719191349e-06, + "loss": 0.3903, + "step": 30770 + }, + { + "epoch": 0.6831222738926316, + "grad_norm": 1.17835533618927, + "learning_rate": 4.558971911779908e-06, + "loss": 0.535, + "step": 30775 + }, + { + "epoch": 0.6832332604521593, + "grad_norm": 1.093266487121582, + "learning_rate": 4.556046765870413e-06, + "loss": 0.5418, + "step": 30780 + }, + { + "epoch": 0.6833442470116868, + "grad_norm": 1.1753915548324585, + "learning_rate": 4.5531222818184984e-06, + "loss": 0.3903, + "step": 30785 + }, + { + "epoch": 0.6834552335712145, + "grad_norm": 1.338931679725647, + "learning_rate": 4.550198459979706e-06, + "loss": 0.4379, + "step": 30790 + }, + { + "epoch": 0.6835662201307422, + "grad_norm": 1.7753686904907227, + "learning_rate": 4.547275300709511e-06, + "loss": 0.3737, + "step": 30795 + }, + { + "epoch": 0.6836772066902698, + "grad_norm": 0.8611935973167419, + "learning_rate": 4.544352804363294e-06, + "loss": 0.5054, + "step": 30800 + }, + { + "epoch": 0.6837881932497974, + "grad_norm": 1.2256321907043457, + "learning_rate": 4.54143097129637e-06, + "loss": 0.3536, + "step": 30805 + }, + { + "epoch": 0.6838991798093251, + "grad_norm": 1.5533641576766968, + "learning_rate": 4.5385098018639585e-06, + "loss": 0.6279, + "step": 30810 + }, + { + "epoch": 0.6840101663688527, + "grad_norm": 1.3186376094818115, + "learning_rate": 4.535589296421212e-06, + "loss": 0.4289, + "step": 30815 + }, + { + "epoch": 0.6841211529283804, + "grad_norm": 1.0984452962875366, + "learning_rate": 4.5326694553231885e-06, + "loss": 0.3821, + "step": 30820 + }, + { + "epoch": 0.6842321394879081, + "grad_norm": 1.0532371997833252, + "learning_rate": 4.529750278924882e-06, + "loss": 0.3864, + "step": 30825 + }, + { + "epoch": 0.6843431260474356, + "grad_norm": 1.2279421091079712, + "learning_rate": 4.526831767581186e-06, + "loss": 0.4095, + "step": 30830 + }, + { + "epoch": 0.6844541126069633, + "grad_norm": 0.8931465744972229, + "learning_rate": 4.5239139216469316e-06, + "loss": 0.4348, + "step": 30835 + }, + { + "epoch": 0.6845650991664909, + "grad_norm": 0.9575490355491638, + "learning_rate": 4.5209967414768545e-06, + "loss": 0.4169, + "step": 30840 + }, + { + "epoch": 0.6846760857260186, + "grad_norm": 1.4071946144104004, + "learning_rate": 4.518080227425621e-06, + "loss": 0.4897, + "step": 30845 + }, + { + "epoch": 0.6847870722855462, + "grad_norm": 0.7204203605651855, + "learning_rate": 4.515164379847806e-06, + "loss": 0.3523, + "step": 30850 + }, + { + "epoch": 0.6848980588450738, + "grad_norm": 1.372045636177063, + "learning_rate": 4.512249199097914e-06, + "loss": 0.5325, + "step": 30855 + }, + { + "epoch": 0.6850090454046015, + "grad_norm": 1.1533608436584473, + "learning_rate": 4.509334685530357e-06, + "loss": 0.4836, + "step": 30860 + }, + { + "epoch": 0.6851200319641292, + "grad_norm": 1.05873703956604, + "learning_rate": 4.506420839499474e-06, + "loss": 0.5, + "step": 30865 + }, + { + "epoch": 0.6852310185236568, + "grad_norm": 3.0505340099334717, + "learning_rate": 4.503507661359524e-06, + "loss": 0.4532, + "step": 30870 + }, + { + "epoch": 0.6853420050831844, + "grad_norm": 0.9786636233329773, + "learning_rate": 4.500595151464676e-06, + "loss": 0.2672, + "step": 30875 + }, + { + "epoch": 0.6854529916427121, + "grad_norm": 0.9993107914924622, + "learning_rate": 4.49768331016903e-06, + "loss": 0.4078, + "step": 30880 + }, + { + "epoch": 0.6855639782022397, + "grad_norm": 1.8332587480545044, + "learning_rate": 4.4947721378265896e-06, + "loss": 0.3187, + "step": 30885 + }, + { + "epoch": 0.6856749647617674, + "grad_norm": 0.5899979472160339, + "learning_rate": 4.491861634791294e-06, + "loss": 0.4247, + "step": 30890 + }, + { + "epoch": 0.6857859513212949, + "grad_norm": 1.7995885610580444, + "learning_rate": 4.488951801416983e-06, + "loss": 0.3427, + "step": 30895 + }, + { + "epoch": 0.6858969378808226, + "grad_norm": 1.1848108768463135, + "learning_rate": 4.4860426380574295e-06, + "loss": 0.4209, + "step": 30900 + }, + { + "epoch": 0.6860079244403503, + "grad_norm": 1.3083794116973877, + "learning_rate": 4.483134145066324e-06, + "loss": 0.3509, + "step": 30905 + }, + { + "epoch": 0.6861189109998779, + "grad_norm": 1.1152405738830566, + "learning_rate": 4.480226322797263e-06, + "loss": 0.3331, + "step": 30910 + }, + { + "epoch": 0.6862298975594056, + "grad_norm": 1.3240209817886353, + "learning_rate": 4.4773191716037774e-06, + "loss": 0.4114, + "step": 30915 + }, + { + "epoch": 0.6863408841189332, + "grad_norm": 0.7910842895507812, + "learning_rate": 4.474412691839302e-06, + "loss": 0.3904, + "step": 30920 + }, + { + "epoch": 0.6864518706784608, + "grad_norm": 1.6772698163986206, + "learning_rate": 4.471506883857201e-06, + "loss": 0.3804, + "step": 30925 + }, + { + "epoch": 0.6865628572379885, + "grad_norm": 0.955270528793335, + "learning_rate": 4.468601748010755e-06, + "loss": 0.3493, + "step": 30930 + }, + { + "epoch": 0.6866738437975162, + "grad_norm": 1.2743643522262573, + "learning_rate": 4.465697284653153e-06, + "loss": 0.4314, + "step": 30935 + }, + { + "epoch": 0.6867848303570437, + "grad_norm": 1.24860680103302, + "learning_rate": 4.4627934941375185e-06, + "loss": 0.3745, + "step": 30940 + }, + { + "epoch": 0.6868958169165714, + "grad_norm": 0.7497403025627136, + "learning_rate": 4.459890376816878e-06, + "loss": 0.4471, + "step": 30945 + }, + { + "epoch": 0.687006803476099, + "grad_norm": 1.4058095216751099, + "learning_rate": 4.456987933044185e-06, + "loss": 0.2944, + "step": 30950 + }, + { + "epoch": 0.6871177900356267, + "grad_norm": 1.4874502420425415, + "learning_rate": 4.454086163172312e-06, + "loss": 0.2856, + "step": 30955 + }, + { + "epoch": 0.6872287765951544, + "grad_norm": 0.9197191596031189, + "learning_rate": 4.45118506755404e-06, + "loss": 0.3419, + "step": 30960 + }, + { + "epoch": 0.6873397631546819, + "grad_norm": 1.030403733253479, + "learning_rate": 4.448284646542084e-06, + "loss": 0.3863, + "step": 30965 + }, + { + "epoch": 0.6874507497142096, + "grad_norm": 1.351831078529358, + "learning_rate": 4.445384900489056e-06, + "loss": 0.277, + "step": 30970 + }, + { + "epoch": 0.6875617362737373, + "grad_norm": 1.2920376062393188, + "learning_rate": 4.442485829747507e-06, + "loss": 0.3688, + "step": 30975 + }, + { + "epoch": 0.6876727228332649, + "grad_norm": 1.0285531282424927, + "learning_rate": 4.4395874346698885e-06, + "loss": 0.3805, + "step": 30980 + }, + { + "epoch": 0.6877837093927925, + "grad_norm": 1.517197608947754, + "learning_rate": 4.436689715608583e-06, + "loss": 0.4708, + "step": 30985 + }, + { + "epoch": 0.6878946959523202, + "grad_norm": 1.133748173713684, + "learning_rate": 4.433792672915886e-06, + "loss": 0.3964, + "step": 30990 + }, + { + "epoch": 0.6880056825118478, + "grad_norm": 1.029487133026123, + "learning_rate": 4.430896306944006e-06, + "loss": 0.3344, + "step": 30995 + }, + { + "epoch": 0.6881166690713755, + "grad_norm": 0.752305805683136, + "learning_rate": 4.428000618045078e-06, + "loss": 0.434, + "step": 31000 + }, + { + "epoch": 0.688227655630903, + "grad_norm": 1.6484147310256958, + "learning_rate": 4.425105606571145e-06, + "loss": 0.3976, + "step": 31005 + }, + { + "epoch": 0.6883386421904307, + "grad_norm": 1.2067406177520752, + "learning_rate": 4.422211272874175e-06, + "loss": 0.3293, + "step": 31010 + }, + { + "epoch": 0.6884496287499584, + "grad_norm": 1.2860989570617676, + "learning_rate": 4.419317617306056e-06, + "loss": 0.3547, + "step": 31015 + }, + { + "epoch": 0.688560615309486, + "grad_norm": 1.2995693683624268, + "learning_rate": 4.416424640218582e-06, + "loss": 0.3672, + "step": 31020 + }, + { + "epoch": 0.6886716018690137, + "grad_norm": 1.966412901878357, + "learning_rate": 4.413532341963477e-06, + "loss": 0.6209, + "step": 31025 + }, + { + "epoch": 0.6887825884285413, + "grad_norm": 1.75698983669281, + "learning_rate": 4.410640722892371e-06, + "loss": 0.3503, + "step": 31030 + }, + { + "epoch": 0.6888935749880689, + "grad_norm": 0.894189178943634, + "learning_rate": 4.40774978335682e-06, + "loss": 0.4089, + "step": 31035 + }, + { + "epoch": 0.6890045615475966, + "grad_norm": 0.8134837746620178, + "learning_rate": 4.404859523708301e-06, + "loss": 0.2775, + "step": 31040 + }, + { + "epoch": 0.6891155481071243, + "grad_norm": 1.6088848114013672, + "learning_rate": 4.40196994429819e-06, + "loss": 0.459, + "step": 31045 + }, + { + "epoch": 0.6892265346666518, + "grad_norm": 1.016860008239746, + "learning_rate": 4.399081045477804e-06, + "loss": 0.4462, + "step": 31050 + }, + { + "epoch": 0.6893375212261795, + "grad_norm": 1.5716601610183716, + "learning_rate": 4.396192827598357e-06, + "loss": 0.804, + "step": 31055 + }, + { + "epoch": 0.6894485077857071, + "grad_norm": 0.7367437481880188, + "learning_rate": 4.393305291010995e-06, + "loss": 0.4039, + "step": 31060 + }, + { + "epoch": 0.6895594943452348, + "grad_norm": 1.4050825834274292, + "learning_rate": 4.39041843606677e-06, + "loss": 0.3258, + "step": 31065 + }, + { + "epoch": 0.6896704809047625, + "grad_norm": 1.144169569015503, + "learning_rate": 4.387532263116662e-06, + "loss": 0.2019, + "step": 31070 + }, + { + "epoch": 0.68978146746429, + "grad_norm": 1.2167930603027344, + "learning_rate": 4.384646772511554e-06, + "loss": 0.357, + "step": 31075 + }, + { + "epoch": 0.6898924540238177, + "grad_norm": 1.3271721601486206, + "learning_rate": 4.381761964602264e-06, + "loss": 0.5452, + "step": 31080 + }, + { + "epoch": 0.6900034405833454, + "grad_norm": 0.8778615593910217, + "learning_rate": 4.3788778397395075e-06, + "loss": 0.4604, + "step": 31085 + }, + { + "epoch": 0.690114427142873, + "grad_norm": 1.1979990005493164, + "learning_rate": 4.375994398273935e-06, + "loss": 0.3149, + "step": 31090 + }, + { + "epoch": 0.6902254137024006, + "grad_norm": 1.1055999994277954, + "learning_rate": 4.3731116405560996e-06, + "loss": 0.5063, + "step": 31095 + }, + { + "epoch": 0.6903364002619283, + "grad_norm": 1.1913548707962036, + "learning_rate": 4.370229566936482e-06, + "loss": 0.5055, + "step": 31100 + }, + { + "epoch": 0.6904473868214559, + "grad_norm": 1.7680208683013916, + "learning_rate": 4.36734817776547e-06, + "loss": 0.5424, + "step": 31105 + }, + { + "epoch": 0.6905583733809836, + "grad_norm": 1.0807723999023438, + "learning_rate": 4.36446747339338e-06, + "loss": 0.335, + "step": 31110 + }, + { + "epoch": 0.6906693599405112, + "grad_norm": 1.183213472366333, + "learning_rate": 4.361587454170431e-06, + "loss": 0.5329, + "step": 31115 + }, + { + "epoch": 0.6907803465000388, + "grad_norm": 1.3761305809020996, + "learning_rate": 4.3587081204467685e-06, + "loss": 0.3529, + "step": 31120 + }, + { + "epoch": 0.6908913330595665, + "grad_norm": 1.7719734907150269, + "learning_rate": 4.355829472572457e-06, + "loss": 0.4814, + "step": 31125 + }, + { + "epoch": 0.6910023196190941, + "grad_norm": 0.8037557601928711, + "learning_rate": 4.352951510897466e-06, + "loss": 0.497, + "step": 31130 + }, + { + "epoch": 0.6911133061786218, + "grad_norm": 1.0056993961334229, + "learning_rate": 4.350074235771695e-06, + "loss": 0.4597, + "step": 31135 + }, + { + "epoch": 0.6912242927381494, + "grad_norm": 1.2936779260635376, + "learning_rate": 4.347197647544947e-06, + "loss": 0.3356, + "step": 31140 + }, + { + "epoch": 0.691335279297677, + "grad_norm": 0.9828980565071106, + "learning_rate": 4.3443217465669505e-06, + "loss": 0.3686, + "step": 31145 + }, + { + "epoch": 0.6914462658572047, + "grad_norm": 1.1120939254760742, + "learning_rate": 4.3414465331873524e-06, + "loss": 0.4776, + "step": 31150 + }, + { + "epoch": 0.6915572524167324, + "grad_norm": 1.0941098928451538, + "learning_rate": 4.338572007755703e-06, + "loss": 0.3537, + "step": 31155 + }, + { + "epoch": 0.69166823897626, + "grad_norm": 1.1693826913833618, + "learning_rate": 4.335698170621487e-06, + "loss": 0.3013, + "step": 31160 + }, + { + "epoch": 0.6917792255357876, + "grad_norm": 1.5390578508377075, + "learning_rate": 4.332825022134086e-06, + "loss": 0.5256, + "step": 31165 + }, + { + "epoch": 0.6918902120953152, + "grad_norm": 1.020473837852478, + "learning_rate": 4.329952562642816e-06, + "loss": 0.4088, + "step": 31170 + }, + { + "epoch": 0.6920011986548429, + "grad_norm": 1.4370976686477661, + "learning_rate": 4.327080792496895e-06, + "loss": 0.3802, + "step": 31175 + }, + { + "epoch": 0.6921121852143706, + "grad_norm": 1.3019744157791138, + "learning_rate": 4.324209712045465e-06, + "loss": 0.5318, + "step": 31180 + }, + { + "epoch": 0.6922231717738981, + "grad_norm": 1.436402678489685, + "learning_rate": 4.321339321637587e-06, + "loss": 0.4362, + "step": 31185 + }, + { + "epoch": 0.6923341583334258, + "grad_norm": 1.1188828945159912, + "learning_rate": 4.318469621622226e-06, + "loss": 0.5995, + "step": 31190 + }, + { + "epoch": 0.6924451448929535, + "grad_norm": 1.3164883852005005, + "learning_rate": 4.315600612348278e-06, + "loss": 0.4404, + "step": 31195 + }, + { + "epoch": 0.6925561314524811, + "grad_norm": 1.9575704336166382, + "learning_rate": 4.3127322941645385e-06, + "loss": 0.4189, + "step": 31200 + }, + { + "epoch": 0.6926671180120088, + "grad_norm": 1.4906368255615234, + "learning_rate": 4.309864667419735e-06, + "loss": 0.4129, + "step": 31205 + }, + { + "epoch": 0.6927781045715364, + "grad_norm": 0.632857620716095, + "learning_rate": 4.306997732462505e-06, + "loss": 0.2905, + "step": 31210 + }, + { + "epoch": 0.692889091131064, + "grad_norm": 1.4873570203781128, + "learning_rate": 4.304131489641393e-06, + "loss": 0.2703, + "step": 31215 + }, + { + "epoch": 0.6930000776905917, + "grad_norm": 1.2380766868591309, + "learning_rate": 4.301265939304877e-06, + "loss": 0.4143, + "step": 31220 + }, + { + "epoch": 0.6931110642501193, + "grad_norm": 1.487708568572998, + "learning_rate": 4.298401081801332e-06, + "loss": 0.3418, + "step": 31225 + }, + { + "epoch": 0.6932220508096469, + "grad_norm": 1.3513848781585693, + "learning_rate": 4.295536917479062e-06, + "loss": 0.3183, + "step": 31230 + }, + { + "epoch": 0.6933330373691746, + "grad_norm": 1.166556715965271, + "learning_rate": 4.292673446686285e-06, + "loss": 0.4073, + "step": 31235 + }, + { + "epoch": 0.6934440239287022, + "grad_norm": 1.4981805086135864, + "learning_rate": 4.2898106697711266e-06, + "loss": 0.5024, + "step": 31240 + }, + { + "epoch": 0.6935550104882299, + "grad_norm": 1.0821495056152344, + "learning_rate": 4.286948587081639e-06, + "loss": 0.3748, + "step": 31245 + }, + { + "epoch": 0.6936659970477576, + "grad_norm": 0.773173987865448, + "learning_rate": 4.284087198965781e-06, + "loss": 0.4054, + "step": 31250 + }, + { + "epoch": 0.6937769836072851, + "grad_norm": 1.5648664236068726, + "learning_rate": 4.281226505771433e-06, + "loss": 0.4982, + "step": 31255 + }, + { + "epoch": 0.6938879701668128, + "grad_norm": 1.5265167951583862, + "learning_rate": 4.278366507846384e-06, + "loss": 0.4065, + "step": 31260 + }, + { + "epoch": 0.6939989567263405, + "grad_norm": 1.2185603380203247, + "learning_rate": 4.275507205538348e-06, + "loss": 0.3195, + "step": 31265 + }, + { + "epoch": 0.6941099432858681, + "grad_norm": 0.8565247058868408, + "learning_rate": 4.272648599194948e-06, + "loss": 0.3741, + "step": 31270 + }, + { + "epoch": 0.6942209298453957, + "grad_norm": 1.493923544883728, + "learning_rate": 4.269790689163722e-06, + "loss": 0.5898, + "step": 31275 + }, + { + "epoch": 0.6943319164049233, + "grad_norm": 0.9452623724937439, + "learning_rate": 4.2669334757921284e-06, + "loss": 0.4826, + "step": 31280 + }, + { + "epoch": 0.694442902964451, + "grad_norm": 1.0773069858551025, + "learning_rate": 4.2640769594275335e-06, + "loss": 0.3968, + "step": 31285 + }, + { + "epoch": 0.6945538895239787, + "grad_norm": 1.870210886001587, + "learning_rate": 4.261221140417228e-06, + "loss": 0.3045, + "step": 31290 + }, + { + "epoch": 0.6946648760835062, + "grad_norm": 1.6059725284576416, + "learning_rate": 4.258366019108405e-06, + "loss": 0.4136, + "step": 31295 + }, + { + "epoch": 0.6947758626430339, + "grad_norm": 1.3067309856414795, + "learning_rate": 4.255511595848191e-06, + "loss": 0.3463, + "step": 31300 + }, + { + "epoch": 0.6948868492025616, + "grad_norm": 1.3735544681549072, + "learning_rate": 4.2526578709836075e-06, + "loss": 0.5428, + "step": 31305 + }, + { + "epoch": 0.6949978357620892, + "grad_norm": 1.036281704902649, + "learning_rate": 4.2498048448616084e-06, + "loss": 0.2687, + "step": 31310 + }, + { + "epoch": 0.6951088223216169, + "grad_norm": 1.331540584564209, + "learning_rate": 4.2469525178290485e-06, + "loss": 0.4058, + "step": 31315 + }, + { + "epoch": 0.6952198088811445, + "grad_norm": 1.4195505380630493, + "learning_rate": 4.24410089023271e-06, + "loss": 0.3653, + "step": 31320 + }, + { + "epoch": 0.6953307954406721, + "grad_norm": 1.5275979042053223, + "learning_rate": 4.241249962419278e-06, + "loss": 0.5446, + "step": 31325 + }, + { + "epoch": 0.6954417820001998, + "grad_norm": 1.3449691534042358, + "learning_rate": 4.238399734735365e-06, + "loss": 0.351, + "step": 31330 + }, + { + "epoch": 0.6955527685597274, + "grad_norm": 1.099867343902588, + "learning_rate": 4.235550207527488e-06, + "loss": 0.5494, + "step": 31335 + }, + { + "epoch": 0.695663755119255, + "grad_norm": 0.9885258078575134, + "learning_rate": 4.2327013811420855e-06, + "loss": 0.5013, + "step": 31340 + }, + { + "epoch": 0.6957747416787827, + "grad_norm": 1.9952067136764526, + "learning_rate": 4.229853255925506e-06, + "loss": 0.4587, + "step": 31345 + }, + { + "epoch": 0.6958857282383103, + "grad_norm": 1.0655864477157593, + "learning_rate": 4.2270058322240134e-06, + "loss": 0.4163, + "step": 31350 + }, + { + "epoch": 0.695996714797838, + "grad_norm": 1.3053919076919556, + "learning_rate": 4.224159110383797e-06, + "loss": 0.6184, + "step": 31355 + }, + { + "epoch": 0.6961077013573657, + "grad_norm": 1.7516372203826904, + "learning_rate": 4.221313090750939e-06, + "loss": 0.5149, + "step": 31360 + }, + { + "epoch": 0.6962186879168932, + "grad_norm": 0.7757472991943359, + "learning_rate": 4.218467773671461e-06, + "loss": 0.2833, + "step": 31365 + }, + { + "epoch": 0.6963296744764209, + "grad_norm": 0.9427791237831116, + "learning_rate": 4.215623159491276e-06, + "loss": 0.4318, + "step": 31370 + }, + { + "epoch": 0.6964406610359486, + "grad_norm": 1.348248839378357, + "learning_rate": 4.212779248556229e-06, + "loss": 0.3954, + "step": 31375 + }, + { + "epoch": 0.6965516475954762, + "grad_norm": 1.0021616220474243, + "learning_rate": 4.209936041212076e-06, + "loss": 0.2566, + "step": 31380 + }, + { + "epoch": 0.6966626341550038, + "grad_norm": 1.6703379154205322, + "learning_rate": 4.207093537804476e-06, + "loss": 0.6691, + "step": 31385 + }, + { + "epoch": 0.6967736207145314, + "grad_norm": 0.5511424541473389, + "learning_rate": 4.20425173867902e-06, + "loss": 0.3375, + "step": 31390 + }, + { + "epoch": 0.6968846072740591, + "grad_norm": 1.0685218572616577, + "learning_rate": 4.201410644181197e-06, + "loss": 0.3661, + "step": 31395 + }, + { + "epoch": 0.6969955938335868, + "grad_norm": 1.3645395040512085, + "learning_rate": 4.19857025465642e-06, + "loss": 0.3223, + "step": 31400 + }, + { + "epoch": 0.6971065803931144, + "grad_norm": 1.014066457748413, + "learning_rate": 4.195730570450019e-06, + "loss": 0.3221, + "step": 31405 + }, + { + "epoch": 0.697217566952642, + "grad_norm": 1.2080676555633545, + "learning_rate": 4.1928915919072254e-06, + "loss": 0.4337, + "step": 31410 + }, + { + "epoch": 0.6973285535121697, + "grad_norm": 1.6252745389938354, + "learning_rate": 4.190053319373201e-06, + "loss": 0.6076, + "step": 31415 + }, + { + "epoch": 0.6974395400716973, + "grad_norm": 1.463215708732605, + "learning_rate": 4.187215753193004e-06, + "loss": 0.3474, + "step": 31420 + }, + { + "epoch": 0.697550526631225, + "grad_norm": 1.4297840595245361, + "learning_rate": 4.184378893711626e-06, + "loss": 0.5259, + "step": 31425 + }, + { + "epoch": 0.6976615131907526, + "grad_norm": 1.3269591331481934, + "learning_rate": 4.181542741273954e-06, + "loss": 0.4864, + "step": 31430 + }, + { + "epoch": 0.6977724997502802, + "grad_norm": 1.156131386756897, + "learning_rate": 4.178707296224802e-06, + "loss": 0.502, + "step": 31435 + }, + { + "epoch": 0.6978834863098079, + "grad_norm": 1.3167673349380493, + "learning_rate": 4.175872558908898e-06, + "loss": 0.3402, + "step": 31440 + }, + { + "epoch": 0.6979944728693355, + "grad_norm": 0.9266581535339355, + "learning_rate": 4.173038529670871e-06, + "loss": 0.3809, + "step": 31445 + }, + { + "epoch": 0.6981054594288632, + "grad_norm": 0.9850370287895203, + "learning_rate": 4.170205208855281e-06, + "loss": 0.4619, + "step": 31450 + }, + { + "epoch": 0.6982164459883908, + "grad_norm": 1.2709016799926758, + "learning_rate": 4.167372596806587e-06, + "loss": 0.2927, + "step": 31455 + }, + { + "epoch": 0.6983274325479184, + "grad_norm": 0.5235553979873657, + "learning_rate": 4.1645406938691725e-06, + "loss": 0.3464, + "step": 31460 + }, + { + "epoch": 0.6984384191074461, + "grad_norm": 1.417148470878601, + "learning_rate": 4.161709500387332e-06, + "loss": 0.3279, + "step": 31465 + }, + { + "epoch": 0.6985494056669738, + "grad_norm": 1.2142740488052368, + "learning_rate": 4.158879016705267e-06, + "loss": 0.4259, + "step": 31470 + }, + { + "epoch": 0.6986603922265013, + "grad_norm": 0.8356972336769104, + "learning_rate": 4.156049243167105e-06, + "loss": 0.44, + "step": 31475 + }, + { + "epoch": 0.698771378786029, + "grad_norm": 1.6601563692092896, + "learning_rate": 4.153220180116874e-06, + "loss": 0.3932, + "step": 31480 + }, + { + "epoch": 0.6988823653455567, + "grad_norm": 1.4118906259536743, + "learning_rate": 4.150391827898524e-06, + "loss": 0.3759, + "step": 31485 + }, + { + "epoch": 0.6989933519050843, + "grad_norm": 1.6847814321517944, + "learning_rate": 4.147564186855923e-06, + "loss": 0.3568, + "step": 31490 + }, + { + "epoch": 0.699104338464612, + "grad_norm": 1.3887543678283691, + "learning_rate": 4.144737257332835e-06, + "loss": 0.3413, + "step": 31495 + }, + { + "epoch": 0.6992153250241395, + "grad_norm": 1.4843730926513672, + "learning_rate": 4.141911039672959e-06, + "loss": 0.4609, + "step": 31500 + }, + { + "epoch": 0.6993263115836672, + "grad_norm": 1.0380373001098633, + "learning_rate": 4.139085534219887e-06, + "loss": 0.47, + "step": 31505 + }, + { + "epoch": 0.6994372981431949, + "grad_norm": 1.1411293745040894, + "learning_rate": 4.1362607413171455e-06, + "loss": 0.4828, + "step": 31510 + }, + { + "epoch": 0.6995482847027225, + "grad_norm": 1.434410572052002, + "learning_rate": 4.133436661308153e-06, + "loss": 0.3755, + "step": 31515 + }, + { + "epoch": 0.6996592712622501, + "grad_norm": 1.2564046382904053, + "learning_rate": 4.130613294536257e-06, + "loss": 0.3389, + "step": 31520 + }, + { + "epoch": 0.6997702578217778, + "grad_norm": 1.2875218391418457, + "learning_rate": 4.127790641344715e-06, + "loss": 0.4025, + "step": 31525 + }, + { + "epoch": 0.6998812443813054, + "grad_norm": 1.2832059860229492, + "learning_rate": 4.124968702076689e-06, + "loss": 0.5064, + "step": 31530 + }, + { + "epoch": 0.6999922309408331, + "grad_norm": 1.0163439512252808, + "learning_rate": 4.12214747707527e-06, + "loss": 0.3183, + "step": 31535 + }, + { + "epoch": 0.7001032175003608, + "grad_norm": 1.3421192169189453, + "learning_rate": 4.119326966683443e-06, + "loss": 0.3161, + "step": 31540 + }, + { + "epoch": 0.7002142040598883, + "grad_norm": 0.8114428520202637, + "learning_rate": 4.116507171244125e-06, + "loss": 0.3758, + "step": 31545 + }, + { + "epoch": 0.700325190619416, + "grad_norm": 1.2384015321731567, + "learning_rate": 4.1136880911001305e-06, + "loss": 0.4125, + "step": 31550 + }, + { + "epoch": 0.7004361771789436, + "grad_norm": 1.3213547468185425, + "learning_rate": 4.1108697265942e-06, + "loss": 0.3732, + "step": 31555 + }, + { + "epoch": 0.7005471637384713, + "grad_norm": 1.2653157711029053, + "learning_rate": 4.108052078068974e-06, + "loss": 0.4037, + "step": 31560 + }, + { + "epoch": 0.7006581502979989, + "grad_norm": 0.8571048378944397, + "learning_rate": 4.1052351458670195e-06, + "loss": 0.3677, + "step": 31565 + }, + { + "epoch": 0.7007691368575265, + "grad_norm": 1.0218335390090942, + "learning_rate": 4.1024189303308025e-06, + "loss": 0.276, + "step": 31570 + }, + { + "epoch": 0.7008801234170542, + "grad_norm": 2.0458157062530518, + "learning_rate": 4.099603431802718e-06, + "loss": 0.3352, + "step": 31575 + }, + { + "epoch": 0.7009911099765819, + "grad_norm": 0.4724593162536621, + "learning_rate": 4.096788650625056e-06, + "loss": 0.3873, + "step": 31580 + }, + { + "epoch": 0.7011020965361094, + "grad_norm": 0.9088649749755859, + "learning_rate": 4.0939745871400335e-06, + "loss": 0.3816, + "step": 31585 + }, + { + "epoch": 0.7012130830956371, + "grad_norm": 1.1194968223571777, + "learning_rate": 4.091161241689771e-06, + "loss": 0.3933, + "step": 31590 + }, + { + "epoch": 0.7013240696551648, + "grad_norm": 1.356024146080017, + "learning_rate": 4.088348614616313e-06, + "loss": 0.4728, + "step": 31595 + }, + { + "epoch": 0.7014350562146924, + "grad_norm": 1.4687360525131226, + "learning_rate": 4.085536706261599e-06, + "loss": 0.4513, + "step": 31600 + }, + { + "epoch": 0.7015460427742201, + "grad_norm": 1.7567170858383179, + "learning_rate": 4.0827255169674985e-06, + "loss": 0.2785, + "step": 31605 + }, + { + "epoch": 0.7016570293337476, + "grad_norm": 1.7780139446258545, + "learning_rate": 4.079915047075786e-06, + "loss": 0.5441, + "step": 31610 + }, + { + "epoch": 0.7017680158932753, + "grad_norm": 1.035780906677246, + "learning_rate": 4.077105296928146e-06, + "loss": 0.4645, + "step": 31615 + }, + { + "epoch": 0.701879002452803, + "grad_norm": 1.972312092781067, + "learning_rate": 4.0742962668661826e-06, + "loss": 0.5715, + "step": 31620 + }, + { + "epoch": 0.7019899890123306, + "grad_norm": 1.128519892692566, + "learning_rate": 4.071487957231403e-06, + "loss": 0.4997, + "step": 31625 + }, + { + "epoch": 0.7021009755718582, + "grad_norm": 1.760668396949768, + "learning_rate": 4.068680368365234e-06, + "loss": 0.3383, + "step": 31630 + }, + { + "epoch": 0.7022119621313859, + "grad_norm": 1.2317535877227783, + "learning_rate": 4.065873500609018e-06, + "loss": 0.5581, + "step": 31635 + }, + { + "epoch": 0.7023229486909135, + "grad_norm": 1.2062588930130005, + "learning_rate": 4.063067354303997e-06, + "loss": 0.2875, + "step": 31640 + }, + { + "epoch": 0.7024339352504412, + "grad_norm": 1.362601637840271, + "learning_rate": 4.060261929791338e-06, + "loss": 0.4041, + "step": 31645 + }, + { + "epoch": 0.7025449218099689, + "grad_norm": 0.7741042971611023, + "learning_rate": 4.057457227412112e-06, + "loss": 0.2976, + "step": 31650 + }, + { + "epoch": 0.7026559083694964, + "grad_norm": 1.195422887802124, + "learning_rate": 4.054653247507304e-06, + "loss": 0.4414, + "step": 31655 + }, + { + "epoch": 0.7027668949290241, + "grad_norm": 1.0758905410766602, + "learning_rate": 4.0518499904178195e-06, + "loss": 0.4778, + "step": 31660 + }, + { + "epoch": 0.7028778814885517, + "grad_norm": 0.9068155288696289, + "learning_rate": 4.049047456484463e-06, + "loss": 0.3964, + "step": 31665 + }, + { + "epoch": 0.7029888680480794, + "grad_norm": 1.274548888206482, + "learning_rate": 4.046245646047961e-06, + "loss": 0.5064, + "step": 31670 + }, + { + "epoch": 0.703099854607607, + "grad_norm": 1.3194080591201782, + "learning_rate": 4.0434445594489415e-06, + "loss": 0.5299, + "step": 31675 + }, + { + "epoch": 0.7032108411671346, + "grad_norm": 0.789570152759552, + "learning_rate": 4.04064419702796e-06, + "loss": 0.2611, + "step": 31680 + }, + { + "epoch": 0.7033218277266623, + "grad_norm": 0.9924457669258118, + "learning_rate": 4.037844559125468e-06, + "loss": 0.4622, + "step": 31685 + }, + { + "epoch": 0.70343281428619, + "grad_norm": 1.1734859943389893, + "learning_rate": 4.035045646081838e-06, + "loss": 0.4851, + "step": 31690 + }, + { + "epoch": 0.7035438008457175, + "grad_norm": 1.1078006029129028, + "learning_rate": 4.032247458237357e-06, + "loss": 0.2741, + "step": 31695 + }, + { + "epoch": 0.7036547874052452, + "grad_norm": 0.924501895904541, + "learning_rate": 4.029449995932213e-06, + "loss": 0.3489, + "step": 31700 + }, + { + "epoch": 0.7037657739647729, + "grad_norm": 1.0132527351379395, + "learning_rate": 4.026653259506518e-06, + "loss": 0.2577, + "step": 31705 + }, + { + "epoch": 0.7038767605243005, + "grad_norm": 0.8732908368110657, + "learning_rate": 4.023857249300283e-06, + "loss": 0.3736, + "step": 31710 + }, + { + "epoch": 0.7039877470838282, + "grad_norm": 0.8892420530319214, + "learning_rate": 4.021061965653441e-06, + "loss": 0.4148, + "step": 31715 + }, + { + "epoch": 0.7040987336433557, + "grad_norm": 0.8567600250244141, + "learning_rate": 4.018267408905838e-06, + "loss": 0.4137, + "step": 31720 + }, + { + "epoch": 0.7042097202028834, + "grad_norm": 1.7506471872329712, + "learning_rate": 4.015473579397218e-06, + "loss": 0.4998, + "step": 31725 + }, + { + "epoch": 0.7043207067624111, + "grad_norm": 1.1711151599884033, + "learning_rate": 4.012680477467254e-06, + "loss": 0.5172, + "step": 31730 + }, + { + "epoch": 0.7044316933219387, + "grad_norm": 0.9681052565574646, + "learning_rate": 4.009888103455512e-06, + "loss": 0.4736, + "step": 31735 + }, + { + "epoch": 0.7045426798814663, + "grad_norm": 0.9805378317832947, + "learning_rate": 4.007096457701487e-06, + "loss": 0.4674, + "step": 31740 + }, + { + "epoch": 0.704653666440994, + "grad_norm": 1.5319339036941528, + "learning_rate": 4.004305540544579e-06, + "loss": 0.3196, + "step": 31745 + }, + { + "epoch": 0.7047646530005216, + "grad_norm": 1.4754489660263062, + "learning_rate": 4.001515352324091e-06, + "loss": 0.5432, + "step": 31750 + }, + { + "epoch": 0.7048756395600493, + "grad_norm": 1.703891396522522, + "learning_rate": 3.998725893379254e-06, + "loss": 0.4281, + "step": 31755 + }, + { + "epoch": 0.704986626119577, + "grad_norm": 0.9801030158996582, + "learning_rate": 3.995937164049192e-06, + "loss": 0.3976, + "step": 31760 + }, + { + "epoch": 0.7050976126791045, + "grad_norm": 0.8765810132026672, + "learning_rate": 3.993149164672957e-06, + "loss": 0.4155, + "step": 31765 + }, + { + "epoch": 0.7052085992386322, + "grad_norm": 1.5806583166122437, + "learning_rate": 3.990361895589499e-06, + "loss": 0.4737, + "step": 31770 + }, + { + "epoch": 0.7053195857981599, + "grad_norm": 0.9778581857681274, + "learning_rate": 3.987575357137685e-06, + "loss": 0.251, + "step": 31775 + }, + { + "epoch": 0.7054305723576875, + "grad_norm": 0.9157988429069519, + "learning_rate": 3.984789549656299e-06, + "loss": 0.4607, + "step": 31780 + }, + { + "epoch": 0.7055415589172152, + "grad_norm": 0.6539163589477539, + "learning_rate": 3.982004473484022e-06, + "loss": 0.3835, + "step": 31785 + }, + { + "epoch": 0.7056525454767427, + "grad_norm": 1.8265697956085205, + "learning_rate": 3.979220128959463e-06, + "loss": 0.4101, + "step": 31790 + }, + { + "epoch": 0.7057635320362704, + "grad_norm": 1.7749618291854858, + "learning_rate": 3.976436516421125e-06, + "loss": 0.4697, + "step": 31795 + }, + { + "epoch": 0.7058745185957981, + "grad_norm": 1.127279281616211, + "learning_rate": 3.973653636207437e-06, + "loss": 0.4854, + "step": 31800 + }, + { + "epoch": 0.7059855051553257, + "grad_norm": 1.56232750415802, + "learning_rate": 3.970871488656727e-06, + "loss": 0.3458, + "step": 31805 + }, + { + "epoch": 0.7060964917148533, + "grad_norm": 1.2407728433609009, + "learning_rate": 3.968090074107242e-06, + "loss": 0.359, + "step": 31810 + }, + { + "epoch": 0.706207478274381, + "grad_norm": 0.9445065259933472, + "learning_rate": 3.965309392897135e-06, + "loss": 0.3034, + "step": 31815 + }, + { + "epoch": 0.7063184648339086, + "grad_norm": 0.8194674849510193, + "learning_rate": 3.9625294453644755e-06, + "loss": 0.2554, + "step": 31820 + }, + { + "epoch": 0.7064294513934363, + "grad_norm": 1.5787767171859741, + "learning_rate": 3.9597502318472356e-06, + "loss": 0.4429, + "step": 31825 + }, + { + "epoch": 0.706540437952964, + "grad_norm": 1.8735803365707397, + "learning_rate": 3.956971752683309e-06, + "loss": 0.4748, + "step": 31830 + }, + { + "epoch": 0.7066514245124915, + "grad_norm": 1.4561665058135986, + "learning_rate": 3.954194008210485e-06, + "loss": 0.3635, + "step": 31835 + }, + { + "epoch": 0.7067624110720192, + "grad_norm": 1.3835692405700684, + "learning_rate": 3.951416998766481e-06, + "loss": 0.4811, + "step": 31840 + }, + { + "epoch": 0.7068733976315468, + "grad_norm": 1.6009646654129028, + "learning_rate": 3.94864072468891e-06, + "loss": 0.216, + "step": 31845 + }, + { + "epoch": 0.7069843841910745, + "grad_norm": 1.3886228799819946, + "learning_rate": 3.945865186315308e-06, + "loss": 0.193, + "step": 31850 + }, + { + "epoch": 0.7070953707506021, + "grad_norm": 1.8874969482421875, + "learning_rate": 3.9430903839831104e-06, + "loss": 0.4452, + "step": 31855 + }, + { + "epoch": 0.7072063573101297, + "grad_norm": 1.0513811111450195, + "learning_rate": 3.9403163180296685e-06, + "loss": 0.474, + "step": 31860 + }, + { + "epoch": 0.7073173438696574, + "grad_norm": 0.8507716655731201, + "learning_rate": 3.937542988792251e-06, + "loss": 0.2627, + "step": 31865 + }, + { + "epoch": 0.7074283304291851, + "grad_norm": 0.38229304552078247, + "learning_rate": 3.934770396608022e-06, + "loss": 0.3952, + "step": 31870 + }, + { + "epoch": 0.7075393169887126, + "grad_norm": 1.3319016695022583, + "learning_rate": 3.931998541814069e-06, + "loss": 0.4774, + "step": 31875 + }, + { + "epoch": 0.7076503035482403, + "grad_norm": 1.4282726049423218, + "learning_rate": 3.92922742474738e-06, + "loss": 0.5038, + "step": 31880 + }, + { + "epoch": 0.707761290107768, + "grad_norm": 1.7863751649856567, + "learning_rate": 3.926457045744862e-06, + "loss": 0.4292, + "step": 31885 + }, + { + "epoch": 0.7078722766672956, + "grad_norm": 0.9300488829612732, + "learning_rate": 3.923687405143329e-06, + "loss": 0.5032, + "step": 31890 + }, + { + "epoch": 0.7079832632268233, + "grad_norm": 1.699332594871521, + "learning_rate": 3.9209185032795004e-06, + "loss": 0.4435, + "step": 31895 + }, + { + "epoch": 0.7080942497863508, + "grad_norm": 1.6415492296218872, + "learning_rate": 3.918150340490015e-06, + "loss": 0.4167, + "step": 31900 + }, + { + "epoch": 0.7082052363458785, + "grad_norm": 0.8185513615608215, + "learning_rate": 3.915382917111412e-06, + "loss": 0.3837, + "step": 31905 + }, + { + "epoch": 0.7083162229054062, + "grad_norm": 1.4475336074829102, + "learning_rate": 3.912616233480148e-06, + "loss": 0.2266, + "step": 31910 + }, + { + "epoch": 0.7084272094649338, + "grad_norm": 1.804500937461853, + "learning_rate": 3.909850289932589e-06, + "loss": 0.4735, + "step": 31915 + }, + { + "epoch": 0.7085381960244614, + "grad_norm": 1.0260645151138306, + "learning_rate": 3.907085086805005e-06, + "loss": 0.4578, + "step": 31920 + }, + { + "epoch": 0.7086491825839891, + "grad_norm": 1.2490384578704834, + "learning_rate": 3.904320624433584e-06, + "loss": 0.3645, + "step": 31925 + }, + { + "epoch": 0.7087601691435167, + "grad_norm": 1.592247486114502, + "learning_rate": 3.901556903154415e-06, + "loss": 0.3791, + "step": 31930 + }, + { + "epoch": 0.7088711557030444, + "grad_norm": 1.0241756439208984, + "learning_rate": 3.89879392330351e-06, + "loss": 0.423, + "step": 31935 + }, + { + "epoch": 0.7089821422625721, + "grad_norm": 1.805743932723999, + "learning_rate": 3.896031685216774e-06, + "loss": 0.5112, + "step": 31940 + }, + { + "epoch": 0.7090931288220996, + "grad_norm": 1.302553653717041, + "learning_rate": 3.893270189230033e-06, + "loss": 0.4258, + "step": 31945 + }, + { + "epoch": 0.7092041153816273, + "grad_norm": 1.4034984111785889, + "learning_rate": 3.890509435679026e-06, + "loss": 0.5204, + "step": 31950 + }, + { + "epoch": 0.7093151019411549, + "grad_norm": 1.0881425142288208, + "learning_rate": 3.8877494248993895e-06, + "loss": 0.4758, + "step": 31955 + }, + { + "epoch": 0.7094260885006826, + "grad_norm": 1.1435565948486328, + "learning_rate": 3.884990157226683e-06, + "loss": 0.312, + "step": 31960 + }, + { + "epoch": 0.7095370750602102, + "grad_norm": 1.5867775678634644, + "learning_rate": 3.882231632996361e-06, + "loss": 0.4257, + "step": 31965 + }, + { + "epoch": 0.7096480616197378, + "grad_norm": 1.1169955730438232, + "learning_rate": 3.879473852543799e-06, + "loss": 0.5656, + "step": 31970 + }, + { + "epoch": 0.7097590481792655, + "grad_norm": 1.6792324781417847, + "learning_rate": 3.876716816204284e-06, + "loss": 0.4188, + "step": 31975 + }, + { + "epoch": 0.7098700347387932, + "grad_norm": 1.1958585977554321, + "learning_rate": 3.873960524312997e-06, + "loss": 0.3275, + "step": 31980 + }, + { + "epoch": 0.7099810212983207, + "grad_norm": 0.9273353219032288, + "learning_rate": 3.87120497720505e-06, + "loss": 0.2232, + "step": 31985 + }, + { + "epoch": 0.7100920078578484, + "grad_norm": 1.430031180381775, + "learning_rate": 3.8684501752154425e-06, + "loss": 0.4956, + "step": 31990 + }, + { + "epoch": 0.7102029944173761, + "grad_norm": 1.3313651084899902, + "learning_rate": 3.8656961186791e-06, + "loss": 0.5053, + "step": 31995 + }, + { + "epoch": 0.7103139809769037, + "grad_norm": 1.2391557693481445, + "learning_rate": 3.862942807930854e-06, + "loss": 0.5513, + "step": 32000 + }, + { + "epoch": 0.7104249675364314, + "grad_norm": 1.1085586547851562, + "learning_rate": 3.860190243305435e-06, + "loss": 0.516, + "step": 32005 + }, + { + "epoch": 0.7105359540959589, + "grad_norm": 1.0961024761199951, + "learning_rate": 3.857438425137499e-06, + "loss": 0.419, + "step": 32010 + }, + { + "epoch": 0.7106469406554866, + "grad_norm": 1.7764806747436523, + "learning_rate": 3.854687353761596e-06, + "loss": 0.3715, + "step": 32015 + }, + { + "epoch": 0.7107579272150143, + "grad_norm": 0.960142970085144, + "learning_rate": 3.851937029512197e-06, + "loss": 0.4334, + "step": 32020 + }, + { + "epoch": 0.7108689137745419, + "grad_norm": 1.5467451810836792, + "learning_rate": 3.849187452723672e-06, + "loss": 0.3228, + "step": 32025 + }, + { + "epoch": 0.7109799003340695, + "grad_norm": 0.7945905327796936, + "learning_rate": 3.846438623730309e-06, + "loss": 0.3627, + "step": 32030 + }, + { + "epoch": 0.7110908868935972, + "grad_norm": 1.7557940483093262, + "learning_rate": 3.843690542866303e-06, + "loss": 0.3534, + "step": 32035 + }, + { + "epoch": 0.7112018734531248, + "grad_norm": 0.9791882634162903, + "learning_rate": 3.840943210465751e-06, + "loss": 0.3775, + "step": 32040 + }, + { + "epoch": 0.7113128600126525, + "grad_norm": 1.2559431791305542, + "learning_rate": 3.83819662686267e-06, + "loss": 0.3808, + "step": 32045 + }, + { + "epoch": 0.7114238465721802, + "grad_norm": 1.400604248046875, + "learning_rate": 3.835450792390977e-06, + "loss": 0.3739, + "step": 32050 + }, + { + "epoch": 0.7115348331317077, + "grad_norm": 1.0400586128234863, + "learning_rate": 3.832705707384504e-06, + "loss": 0.3604, + "step": 32055 + }, + { + "epoch": 0.7116458196912354, + "grad_norm": 1.6382838487625122, + "learning_rate": 3.829961372176985e-06, + "loss": 0.2549, + "step": 32060 + }, + { + "epoch": 0.711756806250763, + "grad_norm": 1.6023155450820923, + "learning_rate": 3.827217787102072e-06, + "loss": 0.3423, + "step": 32065 + }, + { + "epoch": 0.7118677928102907, + "grad_norm": 4.825948238372803, + "learning_rate": 3.8244749524933155e-06, + "loss": 0.3798, + "step": 32070 + }, + { + "epoch": 0.7119787793698183, + "grad_norm": 1.3694977760314941, + "learning_rate": 3.821732868684187e-06, + "loss": 0.4026, + "step": 32075 + }, + { + "epoch": 0.7120897659293459, + "grad_norm": 1.321022629737854, + "learning_rate": 3.8189915360080536e-06, + "loss": 0.4889, + "step": 32080 + }, + { + "epoch": 0.7122007524888736, + "grad_norm": 1.2078578472137451, + "learning_rate": 3.8162509547982015e-06, + "loss": 0.4411, + "step": 32085 + }, + { + "epoch": 0.7123117390484013, + "grad_norm": 1.4181798696517944, + "learning_rate": 3.8135111253878166e-06, + "loss": 0.4319, + "step": 32090 + }, + { + "epoch": 0.7124227256079289, + "grad_norm": 1.0716606378555298, + "learning_rate": 3.8107720481100053e-06, + "loss": 0.3785, + "step": 32095 + }, + { + "epoch": 0.7125337121674565, + "grad_norm": 2.094991445541382, + "learning_rate": 3.808033723297767e-06, + "loss": 0.53, + "step": 32100 + }, + { + "epoch": 0.7126446987269842, + "grad_norm": 1.6106618642807007, + "learning_rate": 3.805296151284027e-06, + "loss": 0.4087, + "step": 32105 + }, + { + "epoch": 0.7127556852865118, + "grad_norm": 1.186604619026184, + "learning_rate": 3.802559332401601e-06, + "loss": 0.5331, + "step": 32110 + }, + { + "epoch": 0.7128666718460395, + "grad_norm": 0.5464258193969727, + "learning_rate": 3.799823266983227e-06, + "loss": 0.3207, + "step": 32115 + }, + { + "epoch": 0.712977658405567, + "grad_norm": 1.6606342792510986, + "learning_rate": 3.79708795536155e-06, + "loss": 0.4592, + "step": 32120 + }, + { + "epoch": 0.7130886449650947, + "grad_norm": 1.799006462097168, + "learning_rate": 3.794353397869113e-06, + "loss": 0.2939, + "step": 32125 + }, + { + "epoch": 0.7131996315246224, + "grad_norm": 0.9671017527580261, + "learning_rate": 3.7916195948383817e-06, + "loss": 0.2851, + "step": 32130 + }, + { + "epoch": 0.71331061808415, + "grad_norm": 1.8571856021881104, + "learning_rate": 3.7888865466017144e-06, + "loss": 0.3613, + "step": 32135 + }, + { + "epoch": 0.7134216046436777, + "grad_norm": 0.34334808588027954, + "learning_rate": 3.7861542534913907e-06, + "loss": 0.2196, + "step": 32140 + }, + { + "epoch": 0.7135325912032053, + "grad_norm": 0.8765976428985596, + "learning_rate": 3.7834227158395964e-06, + "loss": 0.4297, + "step": 32145 + }, + { + "epoch": 0.7136435777627329, + "grad_norm": 1.156449317932129, + "learning_rate": 3.7806919339784166e-06, + "loss": 0.3554, + "step": 32150 + }, + { + "epoch": 0.7137545643222606, + "grad_norm": 0.9955435395240784, + "learning_rate": 3.777961908239857e-06, + "loss": 0.6089, + "step": 32155 + }, + { + "epoch": 0.7138655508817883, + "grad_norm": 2.179382801055908, + "learning_rate": 3.775232638955818e-06, + "loss": 0.2496, + "step": 32160 + }, + { + "epoch": 0.7139765374413158, + "grad_norm": 0.9458391070365906, + "learning_rate": 3.7725041264581184e-06, + "loss": 0.3709, + "step": 32165 + }, + { + "epoch": 0.7140875240008435, + "grad_norm": 1.672772765159607, + "learning_rate": 3.769776371078485e-06, + "loss": 0.3334, + "step": 32170 + }, + { + "epoch": 0.7141985105603711, + "grad_norm": 1.3009119033813477, + "learning_rate": 3.7670493731485424e-06, + "loss": 0.393, + "step": 32175 + }, + { + "epoch": 0.7143094971198988, + "grad_norm": 1.1832098960876465, + "learning_rate": 3.7643231329998366e-06, + "loss": 0.1881, + "step": 32180 + }, + { + "epoch": 0.7144204836794265, + "grad_norm": 0.9928916692733765, + "learning_rate": 3.7615976509638086e-06, + "loss": 0.4001, + "step": 32185 + }, + { + "epoch": 0.714531470238954, + "grad_norm": 0.813753604888916, + "learning_rate": 3.7588729273718194e-06, + "loss": 0.3298, + "step": 32190 + }, + { + "epoch": 0.7146424567984817, + "grad_norm": 0.9435087442398071, + "learning_rate": 3.756148962555125e-06, + "loss": 0.4893, + "step": 32195 + }, + { + "epoch": 0.7147534433580094, + "grad_norm": 1.2980031967163086, + "learning_rate": 3.7534257568448995e-06, + "loss": 0.4273, + "step": 32200 + }, + { + "epoch": 0.714864429917537, + "grad_norm": 1.0607577562332153, + "learning_rate": 3.7507033105722244e-06, + "loss": 0.3802, + "step": 32205 + }, + { + "epoch": 0.7149754164770646, + "grad_norm": 1.1984096765518188, + "learning_rate": 3.7479816240680788e-06, + "loss": 0.4617, + "step": 32210 + }, + { + "epoch": 0.7150864030365923, + "grad_norm": 1.4583815336227417, + "learning_rate": 3.7452606976633644e-06, + "loss": 0.3474, + "step": 32215 + }, + { + "epoch": 0.7151973895961199, + "grad_norm": 1.0872869491577148, + "learning_rate": 3.742540531688873e-06, + "loss": 0.4781, + "step": 32220 + }, + { + "epoch": 0.7153083761556476, + "grad_norm": 0.5375029444694519, + "learning_rate": 3.739821126475318e-06, + "loss": 0.4053, + "step": 32225 + }, + { + "epoch": 0.7154193627151751, + "grad_norm": 0.8395071625709534, + "learning_rate": 3.7371024823533187e-06, + "loss": 0.4364, + "step": 32230 + }, + { + "epoch": 0.7155303492747028, + "grad_norm": 0.849446713924408, + "learning_rate": 3.7343845996533922e-06, + "loss": 0.4214, + "step": 32235 + }, + { + "epoch": 0.7156413358342305, + "grad_norm": 1.2325609922409058, + "learning_rate": 3.731667478705976e-06, + "loss": 0.5089, + "step": 32240 + }, + { + "epoch": 0.7157523223937581, + "grad_norm": 1.1667782068252563, + "learning_rate": 3.728951119841403e-06, + "loss": 0.626, + "step": 32245 + }, + { + "epoch": 0.7158633089532858, + "grad_norm": 1.1611791849136353, + "learning_rate": 3.7262355233899204e-06, + "loss": 0.4001, + "step": 32250 + }, + { + "epoch": 0.7159742955128134, + "grad_norm": 1.2110391855239868, + "learning_rate": 3.7235206896816858e-06, + "loss": 0.5051, + "step": 32255 + }, + { + "epoch": 0.716085282072341, + "grad_norm": 1.197821021080017, + "learning_rate": 3.720806619046753e-06, + "loss": 0.3824, + "step": 32260 + }, + { + "epoch": 0.7161962686318687, + "grad_norm": 1.2906665802001953, + "learning_rate": 3.718093311815095e-06, + "loss": 0.4665, + "step": 32265 + }, + { + "epoch": 0.7163072551913964, + "grad_norm": 1.2199299335479736, + "learning_rate": 3.715380768316582e-06, + "loss": 0.3052, + "step": 32270 + }, + { + "epoch": 0.716418241750924, + "grad_norm": 1.3464202880859375, + "learning_rate": 3.7126689888810017e-06, + "loss": 0.463, + "step": 32275 + }, + { + "epoch": 0.7165292283104516, + "grad_norm": 1.0648338794708252, + "learning_rate": 3.7099579738380366e-06, + "loss": 0.3672, + "step": 32280 + }, + { + "epoch": 0.7166402148699792, + "grad_norm": 1.0626704692840576, + "learning_rate": 3.7072477235172875e-06, + "loss": 0.3367, + "step": 32285 + }, + { + "epoch": 0.7167512014295069, + "grad_norm": 1.1101748943328857, + "learning_rate": 3.704538238248254e-06, + "loss": 0.3515, + "step": 32290 + }, + { + "epoch": 0.7168621879890346, + "grad_norm": 1.251792311668396, + "learning_rate": 3.7018295183603515e-06, + "loss": 0.474, + "step": 32295 + }, + { + "epoch": 0.7169731745485621, + "grad_norm": 1.3575869798660278, + "learning_rate": 3.6991215641828903e-06, + "loss": 0.3501, + "step": 32300 + }, + { + "epoch": 0.7170841611080898, + "grad_norm": 1.2677310705184937, + "learning_rate": 3.696414376045101e-06, + "loss": 0.4466, + "step": 32305 + }, + { + "epoch": 0.7171951476676175, + "grad_norm": 1.0338740348815918, + "learning_rate": 3.693707954276108e-06, + "loss": 0.3098, + "step": 32310 + }, + { + "epoch": 0.7173061342271451, + "grad_norm": 2.0934510231018066, + "learning_rate": 3.6910022992049556e-06, + "loss": 0.3614, + "step": 32315 + }, + { + "epoch": 0.7174171207866727, + "grad_norm": 1.3029704093933105, + "learning_rate": 3.688297411160581e-06, + "loss": 0.4913, + "step": 32320 + }, + { + "epoch": 0.7175281073462004, + "grad_norm": 1.1890994310379028, + "learning_rate": 3.6855932904718426e-06, + "loss": 0.3007, + "step": 32325 + }, + { + "epoch": 0.717639093905728, + "grad_norm": 1.3361420631408691, + "learning_rate": 3.6828899374674933e-06, + "loss": 0.3256, + "step": 32330 + }, + { + "epoch": 0.7177500804652557, + "grad_norm": 1.8069764375686646, + "learning_rate": 3.680187352476198e-06, + "loss": 0.3186, + "step": 32335 + }, + { + "epoch": 0.7178610670247833, + "grad_norm": 1.300967812538147, + "learning_rate": 3.6774855358265327e-06, + "loss": 0.3302, + "step": 32340 + }, + { + "epoch": 0.7179720535843109, + "grad_norm": 1.4119071960449219, + "learning_rate": 3.6747844878469695e-06, + "loss": 0.4967, + "step": 32345 + }, + { + "epoch": 0.7180830401438386, + "grad_norm": 1.0396215915679932, + "learning_rate": 3.672084208865898e-06, + "loss": 0.5122, + "step": 32350 + }, + { + "epoch": 0.7181940267033662, + "grad_norm": 1.1072118282318115, + "learning_rate": 3.6693846992116024e-06, + "loss": 0.3959, + "step": 32355 + }, + { + "epoch": 0.7183050132628939, + "grad_norm": 1.172977089881897, + "learning_rate": 3.6666859592122885e-06, + "loss": 0.2945, + "step": 32360 + }, + { + "epoch": 0.7184159998224215, + "grad_norm": 1.402540683746338, + "learning_rate": 3.663987989196051e-06, + "loss": 0.4883, + "step": 32365 + }, + { + "epoch": 0.7185269863819491, + "grad_norm": 0.952644944190979, + "learning_rate": 3.6612907894909042e-06, + "loss": 0.2974, + "step": 32370 + }, + { + "epoch": 0.7186379729414768, + "grad_norm": 1.0303863286972046, + "learning_rate": 3.6585943604247687e-06, + "loss": 0.4951, + "step": 32375 + }, + { + "epoch": 0.7187489595010045, + "grad_norm": 0.9419283270835876, + "learning_rate": 3.65589870232546e-06, + "loss": 0.3822, + "step": 32380 + }, + { + "epoch": 0.718859946060532, + "grad_norm": 1.5716652870178223, + "learning_rate": 3.653203815520714e-06, + "loss": 0.5331, + "step": 32385 + }, + { + "epoch": 0.7189709326200597, + "grad_norm": 1.2303674221038818, + "learning_rate": 3.6505097003381585e-06, + "loss": 0.4361, + "step": 32390 + }, + { + "epoch": 0.7190819191795873, + "grad_norm": 2.1736645698547363, + "learning_rate": 3.6478163571053404e-06, + "loss": 0.4631, + "step": 32395 + }, + { + "epoch": 0.719192905739115, + "grad_norm": 1.194144368171692, + "learning_rate": 3.645123786149708e-06, + "loss": 0.5266, + "step": 32400 + }, + { + "epoch": 0.7193038922986427, + "grad_norm": 1.0125007629394531, + "learning_rate": 3.642431987798611e-06, + "loss": 0.4056, + "step": 32405 + }, + { + "epoch": 0.7194148788581702, + "grad_norm": 1.3125933408737183, + "learning_rate": 3.6397409623793147e-06, + "loss": 0.3606, + "step": 32410 + }, + { + "epoch": 0.7195258654176979, + "grad_norm": 1.499477744102478, + "learning_rate": 3.6370507102189767e-06, + "loss": 0.4938, + "step": 32415 + }, + { + "epoch": 0.7196368519772256, + "grad_norm": 1.4758806228637695, + "learning_rate": 3.634361231644675e-06, + "loss": 0.4187, + "step": 32420 + }, + { + "epoch": 0.7197478385367532, + "grad_norm": 1.444891095161438, + "learning_rate": 3.6316725269833887e-06, + "loss": 0.4928, + "step": 32425 + }, + { + "epoch": 0.7198588250962809, + "grad_norm": 1.2352997064590454, + "learning_rate": 3.628984596561996e-06, + "loss": 0.387, + "step": 32430 + }, + { + "epoch": 0.7199698116558085, + "grad_norm": 1.1362451314926147, + "learning_rate": 3.6262974407072928e-06, + "loss": 0.3679, + "step": 32435 + }, + { + "epoch": 0.7200807982153361, + "grad_norm": 1.1246973276138306, + "learning_rate": 3.6236110597459674e-06, + "loss": 0.5203, + "step": 32440 + }, + { + "epoch": 0.7201917847748638, + "grad_norm": 0.9550732970237732, + "learning_rate": 3.620925454004628e-06, + "loss": 0.5526, + "step": 32445 + }, + { + "epoch": 0.7203027713343914, + "grad_norm": 1.293099045753479, + "learning_rate": 3.6182406238097745e-06, + "loss": 0.5008, + "step": 32450 + }, + { + "epoch": 0.720413757893919, + "grad_norm": 1.2851907014846802, + "learning_rate": 3.6155565694878237e-06, + "loss": 0.4955, + "step": 32455 + }, + { + "epoch": 0.7205247444534467, + "grad_norm": 1.3594646453857422, + "learning_rate": 3.6128732913650966e-06, + "loss": 0.4474, + "step": 32460 + }, + { + "epoch": 0.7206357310129743, + "grad_norm": 1.6697702407836914, + "learning_rate": 3.61019078976781e-06, + "loss": 0.3925, + "step": 32465 + }, + { + "epoch": 0.720746717572502, + "grad_norm": 1.8893567323684692, + "learning_rate": 3.607509065022101e-06, + "loss": 0.4252, + "step": 32470 + }, + { + "epoch": 0.7208577041320297, + "grad_norm": 1.0012366771697998, + "learning_rate": 3.604828117453999e-06, + "loss": 0.4189, + "step": 32475 + }, + { + "epoch": 0.7209686906915572, + "grad_norm": 1.221556544303894, + "learning_rate": 3.602147947389446e-06, + "loss": 0.4251, + "step": 32480 + }, + { + "epoch": 0.7210796772510849, + "grad_norm": 1.0607415437698364, + "learning_rate": 3.5994685551542917e-06, + "loss": 0.5034, + "step": 32485 + }, + { + "epoch": 0.7211906638106126, + "grad_norm": 0.768079936504364, + "learning_rate": 3.5967899410742812e-06, + "loss": 0.3316, + "step": 32490 + }, + { + "epoch": 0.7213016503701402, + "grad_norm": 0.9735240340232849, + "learning_rate": 3.5941121054750794e-06, + "loss": 0.4515, + "step": 32495 + }, + { + "epoch": 0.7214126369296678, + "grad_norm": 1.3301113843917847, + "learning_rate": 3.5914350486822403e-06, + "loss": 0.2459, + "step": 32500 + }, + { + "epoch": 0.7215236234891954, + "grad_norm": 1.4246984720230103, + "learning_rate": 3.5887587710212346e-06, + "loss": 0.3347, + "step": 32505 + }, + { + "epoch": 0.7216346100487231, + "grad_norm": 0.778403103351593, + "learning_rate": 3.58608327281744e-06, + "loss": 0.3325, + "step": 32510 + }, + { + "epoch": 0.7217455966082508, + "grad_norm": 1.7449480295181274, + "learning_rate": 3.5834085543961274e-06, + "loss": 0.5193, + "step": 32515 + }, + { + "epoch": 0.7218565831677783, + "grad_norm": 1.47147798538208, + "learning_rate": 3.5807346160824863e-06, + "loss": 0.3736, + "step": 32520 + }, + { + "epoch": 0.721967569727306, + "grad_norm": 1.3669594526290894, + "learning_rate": 3.5780614582015983e-06, + "loss": 0.4206, + "step": 32525 + }, + { + "epoch": 0.7220785562868337, + "grad_norm": 1.250982403755188, + "learning_rate": 3.5753890810784643e-06, + "loss": 0.3494, + "step": 32530 + }, + { + "epoch": 0.7221895428463613, + "grad_norm": 2.3692402839660645, + "learning_rate": 3.5727174850379766e-06, + "loss": 0.3719, + "step": 32535 + }, + { + "epoch": 0.722300529405889, + "grad_norm": 0.8750824928283691, + "learning_rate": 3.5700466704049442e-06, + "loss": 0.4329, + "step": 32540 + }, + { + "epoch": 0.7224115159654166, + "grad_norm": 1.0827548503875732, + "learning_rate": 3.5673766375040695e-06, + "loss": 0.3744, + "step": 32545 + }, + { + "epoch": 0.7225225025249442, + "grad_norm": 1.5364011526107788, + "learning_rate": 3.5647073866599736e-06, + "loss": 0.4377, + "step": 32550 + }, + { + "epoch": 0.7226334890844719, + "grad_norm": 2.185579776763916, + "learning_rate": 3.562038918197168e-06, + "loss": 0.3987, + "step": 32555 + }, + { + "epoch": 0.7227444756439995, + "grad_norm": 1.3342119455337524, + "learning_rate": 3.559371232440083e-06, + "loss": 0.4989, + "step": 32560 + }, + { + "epoch": 0.7228554622035271, + "grad_norm": 1.243114948272705, + "learning_rate": 3.55670432971304e-06, + "loss": 0.2273, + "step": 32565 + }, + { + "epoch": 0.7229664487630548, + "grad_norm": 1.3207345008850098, + "learning_rate": 3.5540382103402795e-06, + "loss": 0.5867, + "step": 32570 + }, + { + "epoch": 0.7230774353225824, + "grad_norm": 1.442366600036621, + "learning_rate": 3.551372874645931e-06, + "loss": 0.3574, + "step": 32575 + }, + { + "epoch": 0.7231884218821101, + "grad_norm": 1.133528232574463, + "learning_rate": 3.5487083229540453e-06, + "loss": 0.3784, + "step": 32580 + }, + { + "epoch": 0.7232994084416378, + "grad_norm": 0.8515037894248962, + "learning_rate": 3.5460445555885612e-06, + "loss": 0.3293, + "step": 32585 + }, + { + "epoch": 0.7234103950011653, + "grad_norm": 0.9453320503234863, + "learning_rate": 3.5433815728733366e-06, + "loss": 0.3655, + "step": 32590 + }, + { + "epoch": 0.723521381560693, + "grad_norm": 1.4416009187698364, + "learning_rate": 3.540719375132129e-06, + "loss": 0.4351, + "step": 32595 + }, + { + "epoch": 0.7236323681202207, + "grad_norm": 0.6182461977005005, + "learning_rate": 3.538057962688595e-06, + "loss": 0.3649, + "step": 32600 + }, + { + "epoch": 0.7237433546797483, + "grad_norm": 0.5999155640602112, + "learning_rate": 3.535397335866304e-06, + "loss": 0.307, + "step": 32605 + }, + { + "epoch": 0.723854341239276, + "grad_norm": 0.9559226036071777, + "learning_rate": 3.5327374949887216e-06, + "loss": 0.45, + "step": 32610 + }, + { + "epoch": 0.7239653277988035, + "grad_norm": 1.6150270700454712, + "learning_rate": 3.5300784403792256e-06, + "loss": 0.4936, + "step": 32615 + }, + { + "epoch": 0.7240763143583312, + "grad_norm": 1.1039475202560425, + "learning_rate": 3.5274201723610967e-06, + "loss": 0.3893, + "step": 32620 + }, + { + "epoch": 0.7241873009178589, + "grad_norm": 1.903681993484497, + "learning_rate": 3.524762691257513e-06, + "loss": 0.5153, + "step": 32625 + }, + { + "epoch": 0.7242982874773864, + "grad_norm": 0.9269314408302307, + "learning_rate": 3.5221059973915683e-06, + "loss": 0.5013, + "step": 32630 + }, + { + "epoch": 0.7244092740369141, + "grad_norm": 1.893462896347046, + "learning_rate": 3.5194500910862485e-06, + "loss": 0.3573, + "step": 32635 + }, + { + "epoch": 0.7245202605964418, + "grad_norm": 1.3693534135818481, + "learning_rate": 3.5167949726644545e-06, + "loss": 0.4122, + "step": 32640 + }, + { + "epoch": 0.7246312471559694, + "grad_norm": 1.0553785562515259, + "learning_rate": 3.5141406424489823e-06, + "loss": 0.3246, + "step": 32645 + }, + { + "epoch": 0.7247422337154971, + "grad_norm": 0.5601930618286133, + "learning_rate": 3.5114871007625397e-06, + "loss": 0.1944, + "step": 32650 + }, + { + "epoch": 0.7248532202750247, + "grad_norm": 0.5924381613731384, + "learning_rate": 3.5088343479277365e-06, + "loss": 0.4126, + "step": 32655 + }, + { + "epoch": 0.7249642068345523, + "grad_norm": 1.915968418121338, + "learning_rate": 3.506182384267082e-06, + "loss": 0.411, + "step": 32660 + }, + { + "epoch": 0.72507519339408, + "grad_norm": 1.2195994853973389, + "learning_rate": 3.503531210102996e-06, + "loss": 0.4658, + "step": 32665 + }, + { + "epoch": 0.7251861799536076, + "grad_norm": 1.0181869268417358, + "learning_rate": 3.5008808257577955e-06, + "loss": 0.4259, + "step": 32670 + }, + { + "epoch": 0.7252971665131352, + "grad_norm": 1.091694712638855, + "learning_rate": 3.498231231553708e-06, + "loss": 0.5182, + "step": 32675 + }, + { + "epoch": 0.7254081530726629, + "grad_norm": 1.29982590675354, + "learning_rate": 3.4955824278128657e-06, + "loss": 0.4597, + "step": 32680 + }, + { + "epoch": 0.7255191396321905, + "grad_norm": 1.1753360033035278, + "learning_rate": 3.492934414857294e-06, + "loss": 0.3575, + "step": 32685 + }, + { + "epoch": 0.7256301261917182, + "grad_norm": 1.8706086874008179, + "learning_rate": 3.4902871930089365e-06, + "loss": 0.5171, + "step": 32690 + }, + { + "epoch": 0.7257411127512459, + "grad_norm": 0.8465336561203003, + "learning_rate": 3.487640762589627e-06, + "loss": 0.3946, + "step": 32695 + }, + { + "epoch": 0.7258520993107734, + "grad_norm": 1.5317870378494263, + "learning_rate": 3.484995123921112e-06, + "loss": 0.4923, + "step": 32700 + }, + { + "epoch": 0.7259630858703011, + "grad_norm": 1.4120965003967285, + "learning_rate": 3.482350277325045e-06, + "loss": 0.472, + "step": 32705 + }, + { + "epoch": 0.7260740724298288, + "grad_norm": 1.5469073057174683, + "learning_rate": 3.479706223122968e-06, + "loss": 0.5002, + "step": 32710 + }, + { + "epoch": 0.7261850589893564, + "grad_norm": 1.0146976709365845, + "learning_rate": 3.477062961636346e-06, + "loss": 0.3272, + "step": 32715 + }, + { + "epoch": 0.726296045548884, + "grad_norm": 1.6725594997406006, + "learning_rate": 3.474420493186528e-06, + "loss": 0.278, + "step": 32720 + }, + { + "epoch": 0.7264070321084116, + "grad_norm": 1.9448822736740112, + "learning_rate": 3.4717788180947855e-06, + "loss": 0.2973, + "step": 32725 + }, + { + "epoch": 0.7265180186679393, + "grad_norm": 1.0481772422790527, + "learning_rate": 3.4691379366822765e-06, + "loss": 0.2087, + "step": 32730 + }, + { + "epoch": 0.726629005227467, + "grad_norm": 1.0238654613494873, + "learning_rate": 3.466497849270075e-06, + "loss": 0.4737, + "step": 32735 + }, + { + "epoch": 0.7267399917869946, + "grad_norm": 1.1857450008392334, + "learning_rate": 3.463858556179156e-06, + "loss": 0.3494, + "step": 32740 + }, + { + "epoch": 0.7268509783465222, + "grad_norm": 1.1542832851409912, + "learning_rate": 3.46122005773039e-06, + "loss": 0.3563, + "step": 32745 + }, + { + "epoch": 0.7269619649060499, + "grad_norm": 0.7841120362281799, + "learning_rate": 3.458582354244564e-06, + "loss": 0.2948, + "step": 32750 + }, + { + "epoch": 0.7270729514655775, + "grad_norm": 1.2427359819412231, + "learning_rate": 3.4559454460423535e-06, + "loss": 0.2064, + "step": 32755 + }, + { + "epoch": 0.7271839380251052, + "grad_norm": 2.157987594604492, + "learning_rate": 3.453309333444349e-06, + "loss": 0.3572, + "step": 32760 + }, + { + "epoch": 0.7272949245846329, + "grad_norm": 1.419998049736023, + "learning_rate": 3.450674016771042e-06, + "loss": 0.4259, + "step": 32765 + }, + { + "epoch": 0.7274059111441604, + "grad_norm": 1.2902168035507202, + "learning_rate": 3.448039496342821e-06, + "loss": 0.3996, + "step": 32770 + }, + { + "epoch": 0.7275168977036881, + "grad_norm": 1.4809598922729492, + "learning_rate": 3.445405772479987e-06, + "loss": 0.3526, + "step": 32775 + }, + { + "epoch": 0.7276278842632157, + "grad_norm": 1.3631724119186401, + "learning_rate": 3.4427728455027343e-06, + "loss": 0.3453, + "step": 32780 + }, + { + "epoch": 0.7277388708227434, + "grad_norm": 1.4944729804992676, + "learning_rate": 3.4401407157311706e-06, + "loss": 0.3023, + "step": 32785 + }, + { + "epoch": 0.727849857382271, + "grad_norm": 1.2906522750854492, + "learning_rate": 3.4375093834852956e-06, + "loss": 0.4278, + "step": 32790 + }, + { + "epoch": 0.7279608439417986, + "grad_norm": 1.471267580986023, + "learning_rate": 3.4348788490850236e-06, + "loss": 0.2057, + "step": 32795 + }, + { + "epoch": 0.7280718305013263, + "grad_norm": 1.5331370830535889, + "learning_rate": 3.4322491128501613e-06, + "loss": 0.3482, + "step": 32800 + }, + { + "epoch": 0.728182817060854, + "grad_norm": 1.3465474843978882, + "learning_rate": 3.429620175100428e-06, + "loss": 0.5177, + "step": 32805 + }, + { + "epoch": 0.7282938036203815, + "grad_norm": 0.895503580570221, + "learning_rate": 3.4269920361554342e-06, + "loss": 0.3538, + "step": 32810 + }, + { + "epoch": 0.7284047901799092, + "grad_norm": 1.9255492687225342, + "learning_rate": 3.424364696334709e-06, + "loss": 0.4316, + "step": 32815 + }, + { + "epoch": 0.7285157767394369, + "grad_norm": 1.0728861093521118, + "learning_rate": 3.421738155957668e-06, + "loss": 0.411, + "step": 32820 + }, + { + "epoch": 0.7286267632989645, + "grad_norm": 1.22779381275177, + "learning_rate": 3.419112415343643e-06, + "loss": 0.3588, + "step": 32825 + }, + { + "epoch": 0.7287377498584922, + "grad_norm": 1.1258268356323242, + "learning_rate": 3.416487474811856e-06, + "loss": 0.3582, + "step": 32830 + }, + { + "epoch": 0.7288487364180197, + "grad_norm": 1.030422568321228, + "learning_rate": 3.4138633346814463e-06, + "loss": 0.5491, + "step": 32835 + }, + { + "epoch": 0.7289597229775474, + "grad_norm": 1.0262194871902466, + "learning_rate": 3.4112399952714414e-06, + "loss": 0.3523, + "step": 32840 + }, + { + "epoch": 0.7290707095370751, + "grad_norm": 1.7964067459106445, + "learning_rate": 3.4086174569007802e-06, + "loss": 0.4073, + "step": 32845 + }, + { + "epoch": 0.7291816960966027, + "grad_norm": 1.8312066793441772, + "learning_rate": 3.4059957198883067e-06, + "loss": 0.5042, + "step": 32850 + }, + { + "epoch": 0.7292926826561303, + "grad_norm": 0.892008900642395, + "learning_rate": 3.403374784552754e-06, + "loss": 0.3988, + "step": 32855 + }, + { + "epoch": 0.729403669215658, + "grad_norm": 1.1622059345245361, + "learning_rate": 3.4007546512127764e-06, + "loss": 0.3452, + "step": 32860 + }, + { + "epoch": 0.7295146557751856, + "grad_norm": 0.48767390847206116, + "learning_rate": 3.3981353201869126e-06, + "loss": 0.254, + "step": 32865 + }, + { + "epoch": 0.7296256423347133, + "grad_norm": 1.3434008359909058, + "learning_rate": 3.395516791793616e-06, + "loss": 0.3915, + "step": 32870 + }, + { + "epoch": 0.729736628894241, + "grad_norm": 1.2038825750350952, + "learning_rate": 3.3928990663512416e-06, + "loss": 0.5588, + "step": 32875 + }, + { + "epoch": 0.7298476154537685, + "grad_norm": 1.5192091464996338, + "learning_rate": 3.3902821441780366e-06, + "loss": 0.249, + "step": 32880 + }, + { + "epoch": 0.7299586020132962, + "grad_norm": 0.9307333827018738, + "learning_rate": 3.3876660255921646e-06, + "loss": 0.4169, + "step": 32885 + }, + { + "epoch": 0.7300695885728238, + "grad_norm": 1.7946795225143433, + "learning_rate": 3.385050710911677e-06, + "loss": 0.615, + "step": 32890 + }, + { + "epoch": 0.7301805751323515, + "grad_norm": 1.5713242292404175, + "learning_rate": 3.382436200454543e-06, + "loss": 0.4895, + "step": 32895 + }, + { + "epoch": 0.7302915616918791, + "grad_norm": 1.2957006692886353, + "learning_rate": 3.3798224945386192e-06, + "loss": 0.4357, + "step": 32900 + }, + { + "epoch": 0.7304025482514067, + "grad_norm": 1.557170033454895, + "learning_rate": 3.377209593481674e-06, + "loss": 0.3959, + "step": 32905 + }, + { + "epoch": 0.7305135348109344, + "grad_norm": 1.1738910675048828, + "learning_rate": 3.3745974976013785e-06, + "loss": 0.4669, + "step": 32910 + }, + { + "epoch": 0.7306245213704621, + "grad_norm": 1.196847915649414, + "learning_rate": 3.3719862072152964e-06, + "loss": 0.353, + "step": 32915 + }, + { + "epoch": 0.7307355079299896, + "grad_norm": 2.7891581058502197, + "learning_rate": 3.369375722640905e-06, + "loss": 0.3949, + "step": 32920 + }, + { + "epoch": 0.7308464944895173, + "grad_norm": 1.387994408607483, + "learning_rate": 3.366766044195574e-06, + "loss": 0.3498, + "step": 32925 + }, + { + "epoch": 0.730957481049045, + "grad_norm": 1.214930534362793, + "learning_rate": 3.3641571721965802e-06, + "loss": 0.4307, + "step": 32930 + }, + { + "epoch": 0.7310684676085726, + "grad_norm": 1.103918194770813, + "learning_rate": 3.3615491069611062e-06, + "loss": 0.2793, + "step": 32935 + }, + { + "epoch": 0.7311794541681003, + "grad_norm": 0.9579012989997864, + "learning_rate": 3.358941848806224e-06, + "loss": 0.4801, + "step": 32940 + }, + { + "epoch": 0.7312904407276278, + "grad_norm": 1.5926653146743774, + "learning_rate": 3.3563353980489244e-06, + "loss": 0.4244, + "step": 32945 + }, + { + "epoch": 0.7314014272871555, + "grad_norm": 0.8913977742195129, + "learning_rate": 3.353729755006081e-06, + "loss": 0.3668, + "step": 32950 + }, + { + "epoch": 0.7315124138466832, + "grad_norm": 0.952574610710144, + "learning_rate": 3.351124919994485e-06, + "loss": 0.3864, + "step": 32955 + }, + { + "epoch": 0.7316234004062108, + "grad_norm": 0.9258487224578857, + "learning_rate": 3.3485208933308253e-06, + "loss": 0.3548, + "step": 32960 + }, + { + "epoch": 0.7317343869657384, + "grad_norm": 1.7011442184448242, + "learning_rate": 3.3459176753316857e-06, + "loss": 0.3893, + "step": 32965 + }, + { + "epoch": 0.7318453735252661, + "grad_norm": 1.0624943971633911, + "learning_rate": 3.3433152663135614e-06, + "loss": 0.442, + "step": 32970 + }, + { + "epoch": 0.7319563600847937, + "grad_norm": 1.2660802602767944, + "learning_rate": 3.3407136665928395e-06, + "loss": 0.3552, + "step": 32975 + }, + { + "epoch": 0.7320673466443214, + "grad_norm": 1.497206449508667, + "learning_rate": 3.338112876485821e-06, + "loss": 0.3593, + "step": 32980 + }, + { + "epoch": 0.7321783332038491, + "grad_norm": 0.6715766787528992, + "learning_rate": 3.3355128963086913e-06, + "loss": 0.2989, + "step": 32985 + }, + { + "epoch": 0.7322893197633766, + "grad_norm": 0.7257074117660522, + "learning_rate": 3.3329137263775534e-06, + "loss": 0.3485, + "step": 32990 + }, + { + "epoch": 0.7324003063229043, + "grad_norm": 0.8963240385055542, + "learning_rate": 3.3303153670084086e-06, + "loss": 0.3927, + "step": 32995 + }, + { + "epoch": 0.7325112928824319, + "grad_norm": 0.8797928690910339, + "learning_rate": 3.32771781851715e-06, + "loss": 0.4239, + "step": 33000 + }, + { + "epoch": 0.7326222794419596, + "grad_norm": 0.583372175693512, + "learning_rate": 3.3251210812195843e-06, + "loss": 0.3269, + "step": 33005 + }, + { + "epoch": 0.7327332660014872, + "grad_norm": 1.7786016464233398, + "learning_rate": 3.32252515543141e-06, + "loss": 0.4063, + "step": 33010 + }, + { + "epoch": 0.7328442525610148, + "grad_norm": 1.127631425857544, + "learning_rate": 3.319930041468231e-06, + "loss": 0.5401, + "step": 33015 + }, + { + "epoch": 0.7329552391205425, + "grad_norm": 1.1388825178146362, + "learning_rate": 3.3173357396455587e-06, + "loss": 0.3118, + "step": 33020 + }, + { + "epoch": 0.7330662256800702, + "grad_norm": 0.7228020429611206, + "learning_rate": 3.314742250278792e-06, + "loss": 0.3522, + "step": 33025 + }, + { + "epoch": 0.7331772122395978, + "grad_norm": 1.0773768424987793, + "learning_rate": 3.3121495736832445e-06, + "loss": 0.4434, + "step": 33030 + }, + { + "epoch": 0.7332881987991254, + "grad_norm": 1.291050910949707, + "learning_rate": 3.3095577101741192e-06, + "loss": 0.3147, + "step": 33035 + }, + { + "epoch": 0.7333991853586531, + "grad_norm": 1.4104204177856445, + "learning_rate": 3.306966660066534e-06, + "loss": 0.4284, + "step": 33040 + }, + { + "epoch": 0.7335101719181807, + "grad_norm": 1.1272939443588257, + "learning_rate": 3.3043764236754916e-06, + "loss": 0.4002, + "step": 33045 + }, + { + "epoch": 0.7336211584777084, + "grad_norm": 1.4456478357315063, + "learning_rate": 3.3017870013159116e-06, + "loss": 0.4438, + "step": 33050 + }, + { + "epoch": 0.7337321450372359, + "grad_norm": 1.3265424966812134, + "learning_rate": 3.2991983933025997e-06, + "loss": 0.3728, + "step": 33055 + }, + { + "epoch": 0.7338431315967636, + "grad_norm": 1.2507997751235962, + "learning_rate": 3.2966105999502786e-06, + "loss": 0.3079, + "step": 33060 + }, + { + "epoch": 0.7339541181562913, + "grad_norm": 0.9457982778549194, + "learning_rate": 3.2940236215735554e-06, + "loss": 0.2878, + "step": 33065 + }, + { + "epoch": 0.7340651047158189, + "grad_norm": 1.8294035196304321, + "learning_rate": 3.2914374584869547e-06, + "loss": 0.3948, + "step": 33070 + }, + { + "epoch": 0.7341760912753466, + "grad_norm": 0.5478452444076538, + "learning_rate": 3.2888521110048844e-06, + "loss": 0.4006, + "step": 33075 + }, + { + "epoch": 0.7342870778348742, + "grad_norm": 1.1786937713623047, + "learning_rate": 3.286267579441671e-06, + "loss": 0.4344, + "step": 33080 + }, + { + "epoch": 0.7343980643944018, + "grad_norm": 0.8153396844863892, + "learning_rate": 3.2836838641115266e-06, + "loss": 0.4621, + "step": 33085 + }, + { + "epoch": 0.7345090509539295, + "grad_norm": 1.1007554531097412, + "learning_rate": 3.2811009653285753e-06, + "loss": 0.4171, + "step": 33090 + }, + { + "epoch": 0.7346200375134572, + "grad_norm": 1.808050274848938, + "learning_rate": 3.2785188834068325e-06, + "loss": 0.3717, + "step": 33095 + }, + { + "epoch": 0.7347310240729847, + "grad_norm": 1.3318843841552734, + "learning_rate": 3.275937618660221e-06, + "loss": 0.4008, + "step": 33100 + }, + { + "epoch": 0.7348420106325124, + "grad_norm": 0.7449874877929688, + "learning_rate": 3.273357171402567e-06, + "loss": 0.3371, + "step": 33105 + }, + { + "epoch": 0.73495299719204, + "grad_norm": 1.0369887351989746, + "learning_rate": 3.270777541947586e-06, + "loss": 0.4029, + "step": 33110 + }, + { + "epoch": 0.7350639837515677, + "grad_norm": 0.887763261795044, + "learning_rate": 3.268198730608906e-06, + "loss": 0.3888, + "step": 33115 + }, + { + "epoch": 0.7351749703110954, + "grad_norm": 0.9809614419937134, + "learning_rate": 3.265620737700044e-06, + "loss": 0.4863, + "step": 33120 + }, + { + "epoch": 0.7352859568706229, + "grad_norm": 1.8363839387893677, + "learning_rate": 3.2630435635344283e-06, + "loss": 0.4954, + "step": 33125 + }, + { + "epoch": 0.7353969434301506, + "grad_norm": 1.0824159383773804, + "learning_rate": 3.260467208425384e-06, + "loss": 0.3013, + "step": 33130 + }, + { + "epoch": 0.7355079299896783, + "grad_norm": 1.8370351791381836, + "learning_rate": 3.257891672686132e-06, + "loss": 0.2907, + "step": 33135 + }, + { + "epoch": 0.7356189165492059, + "grad_norm": 1.621275782585144, + "learning_rate": 3.2553169566298017e-06, + "loss": 0.4826, + "step": 33140 + }, + { + "epoch": 0.7357299031087335, + "grad_norm": 1.2516789436340332, + "learning_rate": 3.2527430605694134e-06, + "loss": 0.3353, + "step": 33145 + }, + { + "epoch": 0.7358408896682612, + "grad_norm": 1.3625743389129639, + "learning_rate": 3.250169984817897e-06, + "loss": 0.2859, + "step": 33150 + }, + { + "epoch": 0.7359518762277888, + "grad_norm": 1.683611512184143, + "learning_rate": 3.2475977296880747e-06, + "loss": 0.5756, + "step": 33155 + }, + { + "epoch": 0.7360628627873165, + "grad_norm": 0.8776915669441223, + "learning_rate": 3.2450262954926746e-06, + "loss": 0.3158, + "step": 33160 + }, + { + "epoch": 0.736173849346844, + "grad_norm": 0.9705437421798706, + "learning_rate": 3.2424556825443252e-06, + "loss": 0.418, + "step": 33165 + }, + { + "epoch": 0.7362848359063717, + "grad_norm": 1.0836580991744995, + "learning_rate": 3.2398858911555486e-06, + "loss": 0.3985, + "step": 33170 + }, + { + "epoch": 0.7363958224658994, + "grad_norm": 1.2586573362350464, + "learning_rate": 3.237316921638777e-06, + "loss": 0.3557, + "step": 33175 + }, + { + "epoch": 0.736506809025427, + "grad_norm": 0.9279100298881531, + "learning_rate": 3.23474877430633e-06, + "loss": 0.3226, + "step": 33180 + }, + { + "epoch": 0.7366177955849547, + "grad_norm": 1.4810030460357666, + "learning_rate": 3.2321814494704384e-06, + "loss": 0.3564, + "step": 33185 + }, + { + "epoch": 0.7367287821444823, + "grad_norm": 0.5548216104507446, + "learning_rate": 3.2296149474432325e-06, + "loss": 0.3007, + "step": 33190 + }, + { + "epoch": 0.7368397687040099, + "grad_norm": 0.4443269968032837, + "learning_rate": 3.2270492685367315e-06, + "loss": 0.4224, + "step": 33195 + }, + { + "epoch": 0.7369507552635376, + "grad_norm": 1.2632858753204346, + "learning_rate": 3.2244844130628684e-06, + "loss": 0.4178, + "step": 33200 + }, + { + "epoch": 0.7370617418230653, + "grad_norm": 1.9343825578689575, + "learning_rate": 3.2219203813334643e-06, + "loss": 0.3557, + "step": 33205 + }, + { + "epoch": 0.7371727283825928, + "grad_norm": 1.1359671354293823, + "learning_rate": 3.2193571736602482e-06, + "loss": 0.4858, + "step": 33210 + }, + { + "epoch": 0.7372837149421205, + "grad_norm": 1.0493714809417725, + "learning_rate": 3.2167947903548503e-06, + "loss": 0.5146, + "step": 33215 + }, + { + "epoch": 0.7373947015016481, + "grad_norm": 1.2342268228530884, + "learning_rate": 3.2142332317287884e-06, + "loss": 0.4715, + "step": 33220 + }, + { + "epoch": 0.7375056880611758, + "grad_norm": 1.3913966417312622, + "learning_rate": 3.2116724980934964e-06, + "loss": 0.2505, + "step": 33225 + }, + { + "epoch": 0.7376166746207035, + "grad_norm": 1.4098409414291382, + "learning_rate": 3.2091125897602927e-06, + "loss": 0.447, + "step": 33230 + }, + { + "epoch": 0.737727661180231, + "grad_norm": 0.5722621083259583, + "learning_rate": 3.2065535070404085e-06, + "loss": 0.4938, + "step": 33235 + }, + { + "epoch": 0.7378386477397587, + "grad_norm": 1.978156328201294, + "learning_rate": 3.2039952502449624e-06, + "loss": 0.3224, + "step": 33240 + }, + { + "epoch": 0.7379496342992864, + "grad_norm": 1.9196401834487915, + "learning_rate": 3.2014378196849803e-06, + "loss": 0.4027, + "step": 33245 + }, + { + "epoch": 0.738060620858814, + "grad_norm": 1.0648365020751953, + "learning_rate": 3.1988812156713923e-06, + "loss": 0.3652, + "step": 33250 + }, + { + "epoch": 0.7381716074183416, + "grad_norm": 1.3732563257217407, + "learning_rate": 3.1963254385150133e-06, + "loss": 0.3899, + "step": 33255 + }, + { + "epoch": 0.7382825939778693, + "grad_norm": 1.086742877960205, + "learning_rate": 3.193770488526573e-06, + "loss": 0.2905, + "step": 33260 + }, + { + "epoch": 0.7383935805373969, + "grad_norm": 1.2955430746078491, + "learning_rate": 3.1912163660166873e-06, + "loss": 0.4638, + "step": 33265 + }, + { + "epoch": 0.7385045670969246, + "grad_norm": 0.7780910134315491, + "learning_rate": 3.18866307129588e-06, + "loss": 0.4211, + "step": 33270 + }, + { + "epoch": 0.7386155536564522, + "grad_norm": 2.104405403137207, + "learning_rate": 3.1861106046745773e-06, + "loss": 0.4243, + "step": 33275 + }, + { + "epoch": 0.7387265402159798, + "grad_norm": 1.490172028541565, + "learning_rate": 3.183558966463092e-06, + "loss": 0.4406, + "step": 33280 + }, + { + "epoch": 0.7388375267755075, + "grad_norm": 0.8644965291023254, + "learning_rate": 3.18100815697165e-06, + "loss": 0.1803, + "step": 33285 + }, + { + "epoch": 0.7389485133350351, + "grad_norm": 0.9825798869132996, + "learning_rate": 3.178458176510367e-06, + "loss": 0.3847, + "step": 33290 + }, + { + "epoch": 0.7390594998945628, + "grad_norm": 0.8776482343673706, + "learning_rate": 3.1759090253892578e-06, + "loss": 0.4212, + "step": 33295 + }, + { + "epoch": 0.7391704864540904, + "grad_norm": 1.527934193611145, + "learning_rate": 3.1733607039182467e-06, + "loss": 0.3646, + "step": 33300 + }, + { + "epoch": 0.739281473013618, + "grad_norm": 1.00637686252594, + "learning_rate": 3.170813212407143e-06, + "loss": 0.4107, + "step": 33305 + }, + { + "epoch": 0.7393924595731457, + "grad_norm": 1.2116798162460327, + "learning_rate": 3.1682665511656696e-06, + "loss": 0.373, + "step": 33310 + }, + { + "epoch": 0.7395034461326734, + "grad_norm": 1.912797451019287, + "learning_rate": 3.1657207205034326e-06, + "loss": 0.412, + "step": 33315 + }, + { + "epoch": 0.739614432692201, + "grad_norm": 0.8689662218093872, + "learning_rate": 3.163175720729954e-06, + "loss": 0.3351, + "step": 33320 + }, + { + "epoch": 0.7397254192517286, + "grad_norm": 1.001035451889038, + "learning_rate": 3.1606315521546394e-06, + "loss": 0.4791, + "step": 33325 + }, + { + "epoch": 0.7398364058112562, + "grad_norm": 1.1467145681381226, + "learning_rate": 3.158088215086802e-06, + "loss": 0.2781, + "step": 33330 + }, + { + "epoch": 0.7399473923707839, + "grad_norm": 1.4117281436920166, + "learning_rate": 3.155545709835658e-06, + "loss": 0.2971, + "step": 33335 + }, + { + "epoch": 0.7400583789303116, + "grad_norm": 2.128990888595581, + "learning_rate": 3.153004036710308e-06, + "loss": 0.4511, + "step": 33340 + }, + { + "epoch": 0.7401693654898391, + "grad_norm": 1.1109024286270142, + "learning_rate": 3.1504631960197673e-06, + "loss": 0.3601, + "step": 33345 + }, + { + "epoch": 0.7402803520493668, + "grad_norm": 0.9951190948486328, + "learning_rate": 3.147923188072938e-06, + "loss": 0.3177, + "step": 33350 + }, + { + "epoch": 0.7403913386088945, + "grad_norm": 0.598811686038971, + "learning_rate": 3.145384013178625e-06, + "loss": 0.3142, + "step": 33355 + }, + { + "epoch": 0.7405023251684221, + "grad_norm": 1.0349372625350952, + "learning_rate": 3.1428456716455403e-06, + "loss": 0.4896, + "step": 33360 + }, + { + "epoch": 0.7406133117279498, + "grad_norm": 0.933586835861206, + "learning_rate": 3.1403081637822776e-06, + "loss": 0.4462, + "step": 33365 + }, + { + "epoch": 0.7407242982874774, + "grad_norm": 0.9369057416915894, + "learning_rate": 3.1377714898973468e-06, + "loss": 0.2947, + "step": 33370 + }, + { + "epoch": 0.740835284847005, + "grad_norm": 0.9451455473899841, + "learning_rate": 3.13523565029914e-06, + "loss": 0.3599, + "step": 33375 + }, + { + "epoch": 0.7409462714065327, + "grad_norm": 1.2992775440216064, + "learning_rate": 3.1327006452959595e-06, + "loss": 0.3006, + "step": 33380 + }, + { + "epoch": 0.7410572579660603, + "grad_norm": 1.442800521850586, + "learning_rate": 3.1301664751960082e-06, + "loss": 0.4617, + "step": 33385 + }, + { + "epoch": 0.7411682445255879, + "grad_norm": 1.0337368249893188, + "learning_rate": 3.1276331403073733e-06, + "loss": 0.5779, + "step": 33390 + }, + { + "epoch": 0.7412792310851156, + "grad_norm": 1.5513899326324463, + "learning_rate": 3.1251006409380557e-06, + "loss": 0.3455, + "step": 33395 + }, + { + "epoch": 0.7413902176446432, + "grad_norm": 0.8598487973213196, + "learning_rate": 3.1225689773959434e-06, + "loss": 0.2977, + "step": 33400 + }, + { + "epoch": 0.7415012042041709, + "grad_norm": 2.030599594116211, + "learning_rate": 3.120038149988832e-06, + "loss": 0.308, + "step": 33405 + }, + { + "epoch": 0.7416121907636986, + "grad_norm": 1.0822179317474365, + "learning_rate": 3.1175081590244063e-06, + "loss": 0.4726, + "step": 33410 + }, + { + "epoch": 0.7417231773232261, + "grad_norm": 0.7643406391143799, + "learning_rate": 3.1149790048102568e-06, + "loss": 0.4757, + "step": 33415 + }, + { + "epoch": 0.7418341638827538, + "grad_norm": 0.6753915548324585, + "learning_rate": 3.112450687653872e-06, + "loss": 0.2489, + "step": 33420 + }, + { + "epoch": 0.7419451504422815, + "grad_norm": 1.1946618556976318, + "learning_rate": 3.1099232078626294e-06, + "loss": 0.4191, + "step": 33425 + }, + { + "epoch": 0.7420561370018091, + "grad_norm": 0.712417721748352, + "learning_rate": 3.107396565743821e-06, + "loss": 0.3746, + "step": 33430 + }, + { + "epoch": 0.7421671235613367, + "grad_norm": 0.8652156591415405, + "learning_rate": 3.104870761604617e-06, + "loss": 0.4765, + "step": 33435 + }, + { + "epoch": 0.7422781101208643, + "grad_norm": 0.856709361076355, + "learning_rate": 3.102345795752102e-06, + "loss": 0.2438, + "step": 33440 + }, + { + "epoch": 0.742389096680392, + "grad_norm": 2.4367215633392334, + "learning_rate": 3.099821668493256e-06, + "loss": 0.4003, + "step": 33445 + }, + { + "epoch": 0.7425000832399197, + "grad_norm": 1.0651171207427979, + "learning_rate": 3.0972983801349464e-06, + "loss": 0.4672, + "step": 33450 + }, + { + "epoch": 0.7426110697994472, + "grad_norm": 0.8328818082809448, + "learning_rate": 3.094775930983953e-06, + "loss": 0.3191, + "step": 33455 + }, + { + "epoch": 0.7427220563589749, + "grad_norm": 1.5253877639770508, + "learning_rate": 3.0922543213469403e-06, + "loss": 0.4132, + "step": 33460 + }, + { + "epoch": 0.7428330429185026, + "grad_norm": 1.3908597230911255, + "learning_rate": 3.0897335515304803e-06, + "loss": 0.4988, + "step": 33465 + }, + { + "epoch": 0.7429440294780302, + "grad_norm": 1.2378095388412476, + "learning_rate": 3.087213621841044e-06, + "loss": 0.4961, + "step": 33470 + }, + { + "epoch": 0.7430550160375579, + "grad_norm": 0.9367465972900391, + "learning_rate": 3.0846945325849884e-06, + "loss": 0.5043, + "step": 33475 + }, + { + "epoch": 0.7431660025970855, + "grad_norm": 1.15652596950531, + "learning_rate": 3.082176284068582e-06, + "loss": 0.4179, + "step": 33480 + }, + { + "epoch": 0.7432769891566131, + "grad_norm": 1.3893247842788696, + "learning_rate": 3.0796588765979797e-06, + "loss": 0.2992, + "step": 33485 + }, + { + "epoch": 0.7433879757161408, + "grad_norm": 0.5285239219665527, + "learning_rate": 3.0771423104792454e-06, + "loss": 0.3843, + "step": 33490 + }, + { + "epoch": 0.7434989622756684, + "grad_norm": 1.2001694440841675, + "learning_rate": 3.074626586018328e-06, + "loss": 0.4564, + "step": 33495 + }, + { + "epoch": 0.743609948835196, + "grad_norm": 1.9519604444503784, + "learning_rate": 3.0721117035210845e-06, + "loss": 0.53, + "step": 33500 + }, + { + "epoch": 0.7437209353947237, + "grad_norm": 0.930395245552063, + "learning_rate": 3.069597663293269e-06, + "loss": 0.5216, + "step": 33505 + }, + { + "epoch": 0.7438319219542513, + "grad_norm": 1.4803013801574707, + "learning_rate": 3.067084465640523e-06, + "loss": 0.265, + "step": 33510 + }, + { + "epoch": 0.743942908513779, + "grad_norm": 1.587602138519287, + "learning_rate": 3.0645721108684003e-06, + "loss": 0.3635, + "step": 33515 + }, + { + "epoch": 0.7440538950733067, + "grad_norm": 1.2842870950698853, + "learning_rate": 3.062060599282337e-06, + "loss": 0.3305, + "step": 33520 + }, + { + "epoch": 0.7441648816328342, + "grad_norm": 1.3928651809692383, + "learning_rate": 3.059549931187682e-06, + "loss": 0.5274, + "step": 33525 + }, + { + "epoch": 0.7442758681923619, + "grad_norm": 1.439814567565918, + "learning_rate": 3.057040106889666e-06, + "loss": 0.3928, + "step": 33530 + }, + { + "epoch": 0.7443868547518896, + "grad_norm": 0.9000222086906433, + "learning_rate": 3.054531126693433e-06, + "loss": 0.428, + "step": 33535 + }, + { + "epoch": 0.7444978413114172, + "grad_norm": 1.4337481260299683, + "learning_rate": 3.052022990904009e-06, + "loss": 0.4436, + "step": 33540 + }, + { + "epoch": 0.7446088278709448, + "grad_norm": 1.6256216764450073, + "learning_rate": 3.0495156998263307e-06, + "loss": 0.3843, + "step": 33545 + }, + { + "epoch": 0.7447198144304724, + "grad_norm": 0.9762675762176514, + "learning_rate": 3.047009253765221e-06, + "loss": 0.4457, + "step": 33550 + }, + { + "epoch": 0.7448308009900001, + "grad_norm": 1.7330665588378906, + "learning_rate": 3.04450365302541e-06, + "loss": 0.4577, + "step": 33555 + }, + { + "epoch": 0.7449417875495278, + "grad_norm": 2.0685441493988037, + "learning_rate": 3.0419988979115146e-06, + "loss": 0.3594, + "step": 33560 + }, + { + "epoch": 0.7450527741090553, + "grad_norm": 1.9236466884613037, + "learning_rate": 3.0394949887280624e-06, + "loss": 0.2987, + "step": 33565 + }, + { + "epoch": 0.745163760668583, + "grad_norm": 1.2355669736862183, + "learning_rate": 3.036991925779461e-06, + "loss": 0.288, + "step": 33570 + }, + { + "epoch": 0.7452747472281107, + "grad_norm": 0.5316757559776306, + "learning_rate": 3.0344897093700333e-06, + "loss": 0.2921, + "step": 33575 + }, + { + "epoch": 0.7453857337876383, + "grad_norm": 1.079461693763733, + "learning_rate": 3.031988339803983e-06, + "loss": 0.4051, + "step": 33580 + }, + { + "epoch": 0.745496720347166, + "grad_norm": 0.6526192426681519, + "learning_rate": 3.0294878173854213e-06, + "loss": 0.2468, + "step": 33585 + }, + { + "epoch": 0.7456077069066936, + "grad_norm": 1.38423752784729, + "learning_rate": 3.0269881424183567e-06, + "loss": 0.4037, + "step": 33590 + }, + { + "epoch": 0.7457186934662212, + "grad_norm": 1.5122541189193726, + "learning_rate": 3.0244893152066844e-06, + "loss": 0.3324, + "step": 33595 + }, + { + "epoch": 0.7458296800257489, + "grad_norm": 1.4059768915176392, + "learning_rate": 3.021991336054211e-06, + "loss": 0.3236, + "step": 33600 + }, + { + "epoch": 0.7459406665852765, + "grad_norm": 1.2875112295150757, + "learning_rate": 3.0194942052646246e-06, + "loss": 0.441, + "step": 33605 + }, + { + "epoch": 0.7460516531448041, + "grad_norm": 1.081081748008728, + "learning_rate": 3.0169979231415225e-06, + "loss": 0.3409, + "step": 33610 + }, + { + "epoch": 0.7461626397043318, + "grad_norm": 0.8130430579185486, + "learning_rate": 3.014502489988397e-06, + "loss": 0.3626, + "step": 33615 + }, + { + "epoch": 0.7462736262638594, + "grad_norm": 0.7352177500724792, + "learning_rate": 3.0120079061086284e-06, + "loss": 0.3773, + "step": 33620 + }, + { + "epoch": 0.7463846128233871, + "grad_norm": 1.0624009370803833, + "learning_rate": 3.0095141718055055e-06, + "loss": 0.3875, + "step": 33625 + }, + { + "epoch": 0.7464955993829148, + "grad_norm": 0.5698983073234558, + "learning_rate": 3.007021287382201e-06, + "loss": 0.4605, + "step": 33630 + }, + { + "epoch": 0.7466065859424423, + "grad_norm": 1.0034706592559814, + "learning_rate": 3.004529253141797e-06, + "loss": 0.306, + "step": 33635 + }, + { + "epoch": 0.74671757250197, + "grad_norm": 1.4835110902786255, + "learning_rate": 3.0020380693872687e-06, + "loss": 0.5002, + "step": 33640 + }, + { + "epoch": 0.7468285590614977, + "grad_norm": 0.8922720551490784, + "learning_rate": 2.9995477364214787e-06, + "loss": 0.3605, + "step": 33645 + }, + { + "epoch": 0.7469395456210253, + "grad_norm": 2.575284719467163, + "learning_rate": 2.9970582545472015e-06, + "loss": 0.4445, + "step": 33650 + }, + { + "epoch": 0.747050532180553, + "grad_norm": 1.1334099769592285, + "learning_rate": 2.9945696240670905e-06, + "loss": 0.3478, + "step": 33655 + }, + { + "epoch": 0.7471615187400805, + "grad_norm": 1.2259379625320435, + "learning_rate": 2.992081845283715e-06, + "loss": 0.2761, + "step": 33660 + }, + { + "epoch": 0.7472725052996082, + "grad_norm": 0.957604706287384, + "learning_rate": 2.9895949184995234e-06, + "loss": 0.39, + "step": 33665 + }, + { + "epoch": 0.7473834918591359, + "grad_norm": 2.265244483947754, + "learning_rate": 2.9871088440168696e-06, + "loss": 0.3312, + "step": 33670 + }, + { + "epoch": 0.7474944784186635, + "grad_norm": 1.3157933950424194, + "learning_rate": 2.9846236221380055e-06, + "loss": 0.3522, + "step": 33675 + }, + { + "epoch": 0.7476054649781911, + "grad_norm": 1.5606595277786255, + "learning_rate": 2.9821392531650717e-06, + "loss": 0.3771, + "step": 33680 + }, + { + "epoch": 0.7477164515377188, + "grad_norm": 1.7389330863952637, + "learning_rate": 2.9796557374001145e-06, + "loss": 0.3752, + "step": 33685 + }, + { + "epoch": 0.7478274380972464, + "grad_norm": 1.3296996355056763, + "learning_rate": 2.9771730751450645e-06, + "loss": 0.3257, + "step": 33690 + }, + { + "epoch": 0.7479384246567741, + "grad_norm": 1.2829476594924927, + "learning_rate": 2.974691266701759e-06, + "loss": 0.4326, + "step": 33695 + }, + { + "epoch": 0.7480494112163018, + "grad_norm": 1.3557809591293335, + "learning_rate": 2.9722103123719324e-06, + "loss": 0.368, + "step": 33700 + }, + { + "epoch": 0.7481603977758293, + "grad_norm": 1.0110480785369873, + "learning_rate": 2.9697302124572034e-06, + "loss": 0.2584, + "step": 33705 + }, + { + "epoch": 0.748271384335357, + "grad_norm": 1.133504867553711, + "learning_rate": 2.967250967259101e-06, + "loss": 0.4974, + "step": 33710 + }, + { + "epoch": 0.7483823708948846, + "grad_norm": 1.103011131286621, + "learning_rate": 2.9647725770790357e-06, + "loss": 0.5845, + "step": 33715 + }, + { + "epoch": 0.7484933574544123, + "grad_norm": 0.8758270144462585, + "learning_rate": 2.962295042218327e-06, + "loss": 0.3712, + "step": 33720 + }, + { + "epoch": 0.7486043440139399, + "grad_norm": 1.137786626815796, + "learning_rate": 2.9598183629781875e-06, + "loss": 0.4716, + "step": 33725 + }, + { + "epoch": 0.7487153305734675, + "grad_norm": 0.7884349822998047, + "learning_rate": 2.9573425396597166e-06, + "loss": 0.4341, + "step": 33730 + }, + { + "epoch": 0.7488263171329952, + "grad_norm": 1.3493396043777466, + "learning_rate": 2.954867572563924e-06, + "loss": 0.532, + "step": 33735 + }, + { + "epoch": 0.7489373036925229, + "grad_norm": 0.6397843956947327, + "learning_rate": 2.9523934619917017e-06, + "loss": 0.4425, + "step": 33740 + }, + { + "epoch": 0.7490482902520504, + "grad_norm": 0.7703794836997986, + "learning_rate": 2.9499202082438493e-06, + "loss": 0.3525, + "step": 33745 + }, + { + "epoch": 0.7491592768115781, + "grad_norm": 1.2579290866851807, + "learning_rate": 2.9474478116210503e-06, + "loss": 0.2793, + "step": 33750 + }, + { + "epoch": 0.7492702633711058, + "grad_norm": 0.9212802052497864, + "learning_rate": 2.944976272423895e-06, + "loss": 0.3236, + "step": 33755 + }, + { + "epoch": 0.7493812499306334, + "grad_norm": 1.295153260231018, + "learning_rate": 2.9425055909528654e-06, + "loss": 0.3656, + "step": 33760 + }, + { + "epoch": 0.7494922364901611, + "grad_norm": 1.2512061595916748, + "learning_rate": 2.940035767508336e-06, + "loss": 0.3759, + "step": 33765 + }, + { + "epoch": 0.7496032230496886, + "grad_norm": 1.540828824043274, + "learning_rate": 2.9375668023905823e-06, + "loss": 0.5052, + "step": 33770 + }, + { + "epoch": 0.7497142096092163, + "grad_norm": 1.717550277709961, + "learning_rate": 2.9350986958997685e-06, + "loss": 0.3682, + "step": 33775 + }, + { + "epoch": 0.749825196168744, + "grad_norm": 0.9938486814498901, + "learning_rate": 2.932631448335964e-06, + "loss": 0.3751, + "step": 33780 + }, + { + "epoch": 0.7499361827282716, + "grad_norm": 0.6627386212348938, + "learning_rate": 2.9301650599991227e-06, + "loss": 0.3568, + "step": 33785 + }, + { + "epoch": 0.7500471692877992, + "grad_norm": 1.0992333889007568, + "learning_rate": 2.9276995311891078e-06, + "loss": 0.4247, + "step": 33790 + }, + { + "epoch": 0.7501581558473269, + "grad_norm": 1.372779369354248, + "learning_rate": 2.9252348622056605e-06, + "loss": 0.2754, + "step": 33795 + }, + { + "epoch": 0.7502691424068545, + "grad_norm": 1.7185051441192627, + "learning_rate": 2.9227710533484356e-06, + "loss": 0.3183, + "step": 33800 + }, + { + "epoch": 0.7503801289663822, + "grad_norm": 0.9331538677215576, + "learning_rate": 2.920308104916967e-06, + "loss": 0.2455, + "step": 33805 + }, + { + "epoch": 0.7504911155259099, + "grad_norm": 1.317017674446106, + "learning_rate": 2.9178460172106992e-06, + "loss": 0.4605, + "step": 33810 + }, + { + "epoch": 0.7506021020854374, + "grad_norm": 1.4651226997375488, + "learning_rate": 2.915384790528958e-06, + "loss": 0.4375, + "step": 33815 + }, + { + "epoch": 0.7507130886449651, + "grad_norm": 1.4238923788070679, + "learning_rate": 2.9129244251709766e-06, + "loss": 0.3796, + "step": 33820 + }, + { + "epoch": 0.7508240752044928, + "grad_norm": 1.2364790439605713, + "learning_rate": 2.9104649214358726e-06, + "loss": 0.3157, + "step": 33825 + }, + { + "epoch": 0.7509350617640204, + "grad_norm": 1.8082529306411743, + "learning_rate": 2.908006279622667e-06, + "loss": 0.3494, + "step": 33830 + }, + { + "epoch": 0.751046048323548, + "grad_norm": 1.0577956438064575, + "learning_rate": 2.9055485000302765e-06, + "loss": 0.4521, + "step": 33835 + }, + { + "epoch": 0.7511570348830756, + "grad_norm": 1.3225284814834595, + "learning_rate": 2.9030915829575034e-06, + "loss": 0.3558, + "step": 33840 + }, + { + "epoch": 0.7512680214426033, + "grad_norm": 1.5698176622390747, + "learning_rate": 2.9006355287030576e-06, + "loss": 0.3399, + "step": 33845 + }, + { + "epoch": 0.751379008002131, + "grad_norm": 0.7082293033599854, + "learning_rate": 2.89818033756553e-06, + "loss": 0.3808, + "step": 33850 + }, + { + "epoch": 0.7514899945616585, + "grad_norm": 0.8936519622802734, + "learning_rate": 2.895726009843425e-06, + "loss": 0.3318, + "step": 33855 + }, + { + "epoch": 0.7516009811211862, + "grad_norm": 1.417548656463623, + "learning_rate": 2.893272545835121e-06, + "loss": 0.5136, + "step": 33860 + }, + { + "epoch": 0.7517119676807139, + "grad_norm": 1.2555241584777832, + "learning_rate": 2.8908199458389075e-06, + "loss": 0.4926, + "step": 33865 + }, + { + "epoch": 0.7518229542402415, + "grad_norm": 1.7509957551956177, + "learning_rate": 2.8883682101529655e-06, + "loss": 0.2201, + "step": 33870 + }, + { + "epoch": 0.7519339407997692, + "grad_norm": 1.6181340217590332, + "learning_rate": 2.8859173390753627e-06, + "loss": 0.3649, + "step": 33875 + }, + { + "epoch": 0.7520449273592968, + "grad_norm": 1.0291986465454102, + "learning_rate": 2.883467332904074e-06, + "loss": 0.3769, + "step": 33880 + }, + { + "epoch": 0.7521559139188244, + "grad_norm": 1.4332181215286255, + "learning_rate": 2.8810181919369574e-06, + "loss": 0.3164, + "step": 33885 + }, + { + "epoch": 0.7522669004783521, + "grad_norm": 1.5019235610961914, + "learning_rate": 2.878569916471774e-06, + "loss": 0.2946, + "step": 33890 + }, + { + "epoch": 0.7523778870378797, + "grad_norm": 1.932149887084961, + "learning_rate": 2.8761225068061793e-06, + "loss": 0.4063, + "step": 33895 + }, + { + "epoch": 0.7524888735974073, + "grad_norm": 1.2563505172729492, + "learning_rate": 2.8736759632377154e-06, + "loss": 0.394, + "step": 33900 + }, + { + "epoch": 0.752599860156935, + "grad_norm": 0.8643641471862793, + "learning_rate": 2.871230286063832e-06, + "loss": 0.3462, + "step": 33905 + }, + { + "epoch": 0.7527108467164626, + "grad_norm": 1.325053095817566, + "learning_rate": 2.8687854755818577e-06, + "loss": 0.4378, + "step": 33910 + }, + { + "epoch": 0.7528218332759903, + "grad_norm": 0.9804831147193909, + "learning_rate": 2.866341532089031e-06, + "loss": 0.4027, + "step": 33915 + }, + { + "epoch": 0.752932819835518, + "grad_norm": 0.7422093749046326, + "learning_rate": 2.8638984558824777e-06, + "loss": 0.3409, + "step": 33920 + }, + { + "epoch": 0.7530438063950455, + "grad_norm": 1.1952039003372192, + "learning_rate": 2.8614562472592156e-06, + "loss": 0.4869, + "step": 33925 + }, + { + "epoch": 0.7531547929545732, + "grad_norm": 1.3097890615463257, + "learning_rate": 2.8590149065161655e-06, + "loss": 0.4153, + "step": 33930 + }, + { + "epoch": 0.7532657795141009, + "grad_norm": 1.0756065845489502, + "learning_rate": 2.85657443395013e-06, + "loss": 0.279, + "step": 33935 + }, + { + "epoch": 0.7533767660736285, + "grad_norm": 0.832936704158783, + "learning_rate": 2.8541348298578207e-06, + "loss": 0.4459, + "step": 33940 + }, + { + "epoch": 0.7534877526331561, + "grad_norm": 1.2226253747940063, + "learning_rate": 2.8516960945358307e-06, + "loss": 0.4666, + "step": 33945 + }, + { + "epoch": 0.7535987391926837, + "grad_norm": 0.6219229102134705, + "learning_rate": 2.849258228280656e-06, + "loss": 0.1928, + "step": 33950 + }, + { + "epoch": 0.7537097257522114, + "grad_norm": 0.9533223509788513, + "learning_rate": 2.846821231388688e-06, + "loss": 0.2028, + "step": 33955 + }, + { + "epoch": 0.7538207123117391, + "grad_norm": 1.187782883644104, + "learning_rate": 2.8443851041561996e-06, + "loss": 0.4268, + "step": 33960 + }, + { + "epoch": 0.7539316988712667, + "grad_norm": 1.5726901292800903, + "learning_rate": 2.841949846879377e-06, + "loss": 0.4969, + "step": 33965 + }, + { + "epoch": 0.7540426854307943, + "grad_norm": 1.2846169471740723, + "learning_rate": 2.839515459854283e-06, + "loss": 0.4282, + "step": 33970 + }, + { + "epoch": 0.754153671990322, + "grad_norm": 1.3724855184555054, + "learning_rate": 2.8370819433768837e-06, + "loss": 0.2923, + "step": 33975 + }, + { + "epoch": 0.7542646585498496, + "grad_norm": 0.7159687876701355, + "learning_rate": 2.834649297743043e-06, + "loss": 0.4449, + "step": 33980 + }, + { + "epoch": 0.7543756451093773, + "grad_norm": 0.85330730676651, + "learning_rate": 2.832217523248507e-06, + "loss": 0.2278, + "step": 33985 + }, + { + "epoch": 0.754486631668905, + "grad_norm": 1.3416775465011597, + "learning_rate": 2.829786620188928e-06, + "loss": 0.5682, + "step": 33990 + }, + { + "epoch": 0.7545976182284325, + "grad_norm": 1.798352599143982, + "learning_rate": 2.827356588859842e-06, + "loss": 0.4677, + "step": 33995 + }, + { + "epoch": 0.7547086047879602, + "grad_norm": 1.3195164203643799, + "learning_rate": 2.8249274295566863e-06, + "loss": 0.3804, + "step": 34000 + }, + { + "epoch": 0.7548195913474878, + "grad_norm": 1.8729616403579712, + "learning_rate": 2.822499142574795e-06, + "loss": 0.4065, + "step": 34005 + }, + { + "epoch": 0.7549305779070155, + "grad_norm": 1.4058563709259033, + "learning_rate": 2.8200717282093813e-06, + "loss": 0.2909, + "step": 34010 + }, + { + "epoch": 0.7550415644665431, + "grad_norm": 1.6897121667861938, + "learning_rate": 2.817645186755572e-06, + "loss": 0.3288, + "step": 34015 + }, + { + "epoch": 0.7551525510260707, + "grad_norm": 1.0009219646453857, + "learning_rate": 2.8152195185083697e-06, + "loss": 0.4974, + "step": 34020 + }, + { + "epoch": 0.7552635375855984, + "grad_norm": 0.9866798520088196, + "learning_rate": 2.812794723762685e-06, + "loss": 0.3526, + "step": 34025 + }, + { + "epoch": 0.7553745241451261, + "grad_norm": 2.1823205947875977, + "learning_rate": 2.8103708028133113e-06, + "loss": 0.4714, + "step": 34030 + }, + { + "epoch": 0.7554855107046536, + "grad_norm": 1.2664459943771362, + "learning_rate": 2.807947755954946e-06, + "loss": 0.3761, + "step": 34035 + }, + { + "epoch": 0.7555964972641813, + "grad_norm": 0.8264433145523071, + "learning_rate": 2.8055255834821695e-06, + "loss": 0.3104, + "step": 34040 + }, + { + "epoch": 0.755707483823709, + "grad_norm": 0.9597057700157166, + "learning_rate": 2.8031042856894663e-06, + "loss": 0.4608, + "step": 34045 + }, + { + "epoch": 0.7558184703832366, + "grad_norm": 1.1200875043869019, + "learning_rate": 2.8006838628712054e-06, + "loss": 0.3461, + "step": 34050 + }, + { + "epoch": 0.7559294569427643, + "grad_norm": 0.4323360025882721, + "learning_rate": 2.798264315321658e-06, + "loss": 0.4009, + "step": 34055 + }, + { + "epoch": 0.7560404435022918, + "grad_norm": 1.4403953552246094, + "learning_rate": 2.79584564333498e-06, + "loss": 0.3895, + "step": 34060 + }, + { + "epoch": 0.7561514300618195, + "grad_norm": 1.3160420656204224, + "learning_rate": 2.793427847205231e-06, + "loss": 0.3576, + "step": 34065 + }, + { + "epoch": 0.7562624166213472, + "grad_norm": 1.1515971422195435, + "learning_rate": 2.791010927226353e-06, + "loss": 0.52, + "step": 34070 + }, + { + "epoch": 0.7563734031808748, + "grad_norm": 1.2929975986480713, + "learning_rate": 2.7885948836921916e-06, + "loss": 0.4144, + "step": 34075 + }, + { + "epoch": 0.7564843897404024, + "grad_norm": 1.1152117252349854, + "learning_rate": 2.7861797168964753e-06, + "loss": 0.392, + "step": 34080 + }, + { + "epoch": 0.7565953762999301, + "grad_norm": 0.6945995092391968, + "learning_rate": 2.783765427132837e-06, + "loss": 0.3099, + "step": 34085 + }, + { + "epoch": 0.7567063628594577, + "grad_norm": 0.6502373218536377, + "learning_rate": 2.781352014694799e-06, + "loss": 0.414, + "step": 34090 + }, + { + "epoch": 0.7568173494189854, + "grad_norm": 1.6204890012741089, + "learning_rate": 2.7789394798757706e-06, + "loss": 0.4609, + "step": 34095 + }, + { + "epoch": 0.7569283359785131, + "grad_norm": 1.5944551229476929, + "learning_rate": 2.776527822969066e-06, + "loss": 0.2841, + "step": 34100 + }, + { + "epoch": 0.7570393225380406, + "grad_norm": 1.1407469511032104, + "learning_rate": 2.77411704426788e-06, + "loss": 0.4497, + "step": 34105 + }, + { + "epoch": 0.7571503090975683, + "grad_norm": 1.1005648374557495, + "learning_rate": 2.771707144065313e-06, + "loss": 0.3722, + "step": 34110 + }, + { + "epoch": 0.7572612956570959, + "grad_norm": 1.2683881521224976, + "learning_rate": 2.769298122654347e-06, + "loss": 0.3378, + "step": 34115 + }, + { + "epoch": 0.7573722822166236, + "grad_norm": 1.8083860874176025, + "learning_rate": 2.7668899803278646e-06, + "loss": 0.6097, + "step": 34120 + }, + { + "epoch": 0.7574832687761512, + "grad_norm": 1.6240153312683105, + "learning_rate": 2.764482717378644e-06, + "loss": 0.3813, + "step": 34125 + }, + { + "epoch": 0.7575942553356788, + "grad_norm": 1.3337078094482422, + "learning_rate": 2.7620763340993452e-06, + "loss": 0.3571, + "step": 34130 + }, + { + "epoch": 0.7577052418952065, + "grad_norm": 1.110442042350769, + "learning_rate": 2.7596708307825347e-06, + "loss": 0.3753, + "step": 34135 + }, + { + "epoch": 0.7578162284547342, + "grad_norm": 0.9959350824356079, + "learning_rate": 2.757266207720659e-06, + "loss": 0.4788, + "step": 34140 + }, + { + "epoch": 0.7579272150142617, + "grad_norm": 1.6347358226776123, + "learning_rate": 2.7548624652060672e-06, + "loss": 0.2915, + "step": 34145 + }, + { + "epoch": 0.7580382015737894, + "grad_norm": 1.3603200912475586, + "learning_rate": 2.7524596035310037e-06, + "loss": 0.383, + "step": 34150 + }, + { + "epoch": 0.7581491881333171, + "grad_norm": 1.4851304292678833, + "learning_rate": 2.7500576229875895e-06, + "loss": 0.4998, + "step": 34155 + }, + { + "epoch": 0.7582601746928447, + "grad_norm": 1.3021334409713745, + "learning_rate": 2.7476565238678597e-06, + "loss": 0.5353, + "step": 34160 + }, + { + "epoch": 0.7583711612523724, + "grad_norm": 0.8948301672935486, + "learning_rate": 2.7452563064637238e-06, + "loss": 0.3798, + "step": 34165 + }, + { + "epoch": 0.7584821478118999, + "grad_norm": 1.1484118700027466, + "learning_rate": 2.742856971066996e-06, + "loss": 0.3008, + "step": 34170 + }, + { + "epoch": 0.7585931343714276, + "grad_norm": 1.1975305080413818, + "learning_rate": 2.7404585179693822e-06, + "loss": 0.3417, + "step": 34175 + }, + { + "epoch": 0.7587041209309553, + "grad_norm": 1.2970155477523804, + "learning_rate": 2.738060947462472e-06, + "loss": 0.5035, + "step": 34180 + }, + { + "epoch": 0.7588151074904829, + "grad_norm": 1.2377759218215942, + "learning_rate": 2.7356642598377604e-06, + "loss": 0.3494, + "step": 34185 + }, + { + "epoch": 0.7589260940500105, + "grad_norm": 1.0850483179092407, + "learning_rate": 2.7332684553866216e-06, + "loss": 0.4834, + "step": 34190 + }, + { + "epoch": 0.7590370806095382, + "grad_norm": 1.1056970357894897, + "learning_rate": 2.730873534400337e-06, + "loss": 0.4547, + "step": 34195 + }, + { + "epoch": 0.7591480671690658, + "grad_norm": 1.525825023651123, + "learning_rate": 2.728479497170066e-06, + "loss": 0.3762, + "step": 34200 + }, + { + "epoch": 0.7592590537285935, + "grad_norm": 1.0932745933532715, + "learning_rate": 2.726086343986871e-06, + "loss": 0.3346, + "step": 34205 + }, + { + "epoch": 0.7593700402881212, + "grad_norm": 1.8926199674606323, + "learning_rate": 2.723694075141706e-06, + "loss": 0.4164, + "step": 34210 + }, + { + "epoch": 0.7594810268476487, + "grad_norm": 1.1629252433776855, + "learning_rate": 2.7213026909254105e-06, + "loss": 0.388, + "step": 34215 + }, + { + "epoch": 0.7595920134071764, + "grad_norm": 0.8054100871086121, + "learning_rate": 2.7189121916287252e-06, + "loss": 0.2926, + "step": 34220 + }, + { + "epoch": 0.759702999966704, + "grad_norm": 1.3957380056381226, + "learning_rate": 2.7165225775422745e-06, + "loss": 0.3635, + "step": 34225 + }, + { + "epoch": 0.7598139865262317, + "grad_norm": 1.75041925907135, + "learning_rate": 2.7141338489565818e-06, + "loss": 0.3999, + "step": 34230 + }, + { + "epoch": 0.7599249730857593, + "grad_norm": 1.5254703760147095, + "learning_rate": 2.7117460061620624e-06, + "loss": 0.4837, + "step": 34235 + }, + { + "epoch": 0.7600359596452869, + "grad_norm": 1.7756999731063843, + "learning_rate": 2.7093590494490196e-06, + "loss": 0.5549, + "step": 34240 + }, + { + "epoch": 0.7601469462048146, + "grad_norm": 1.2557058334350586, + "learning_rate": 2.706972979107655e-06, + "loss": 0.3825, + "step": 34245 + }, + { + "epoch": 0.7602579327643423, + "grad_norm": 0.9370241761207581, + "learning_rate": 2.704587795428053e-06, + "loss": 0.2929, + "step": 34250 + }, + { + "epoch": 0.7603689193238699, + "grad_norm": 2.2293343544006348, + "learning_rate": 2.702203498700201e-06, + "loss": 0.2746, + "step": 34255 + }, + { + "epoch": 0.7604799058833975, + "grad_norm": 1.6066590547561646, + "learning_rate": 2.699820089213975e-06, + "loss": 0.3987, + "step": 34260 + }, + { + "epoch": 0.7605908924429252, + "grad_norm": 1.2838315963745117, + "learning_rate": 2.697437567259137e-06, + "loss": 0.3886, + "step": 34265 + }, + { + "epoch": 0.7607018790024528, + "grad_norm": 1.2681989669799805, + "learning_rate": 2.695055933125351e-06, + "loss": 0.4308, + "step": 34270 + }, + { + "epoch": 0.7608128655619805, + "grad_norm": 0.8416971564292908, + "learning_rate": 2.692675187102164e-06, + "loss": 0.4893, + "step": 34275 + }, + { + "epoch": 0.760923852121508, + "grad_norm": 0.8352290987968445, + "learning_rate": 2.6902953294790223e-06, + "loss": 0.2652, + "step": 34280 + }, + { + "epoch": 0.7610348386810357, + "grad_norm": 1.274704098701477, + "learning_rate": 2.6879163605452573e-06, + "loss": 0.3504, + "step": 34285 + }, + { + "epoch": 0.7611458252405634, + "grad_norm": 0.6752955913543701, + "learning_rate": 2.685538280590102e-06, + "loss": 0.275, + "step": 34290 + }, + { + "epoch": 0.761256811800091, + "grad_norm": 0.892386794090271, + "learning_rate": 2.6831610899026718e-06, + "loss": 0.4368, + "step": 34295 + }, + { + "epoch": 0.7613677983596187, + "grad_norm": 1.7363653182983398, + "learning_rate": 2.680784788771974e-06, + "loss": 0.4177, + "step": 34300 + }, + { + "epoch": 0.7614787849191463, + "grad_norm": 1.6341389417648315, + "learning_rate": 2.678409377486918e-06, + "loss": 0.3493, + "step": 34305 + }, + { + "epoch": 0.7615897714786739, + "grad_norm": 1.1072078943252563, + "learning_rate": 2.6760348563362912e-06, + "loss": 0.3427, + "step": 34310 + }, + { + "epoch": 0.7617007580382016, + "grad_norm": 1.033800482749939, + "learning_rate": 2.6736612256087848e-06, + "loss": 0.5172, + "step": 34315 + }, + { + "epoch": 0.7618117445977293, + "grad_norm": 1.2220630645751953, + "learning_rate": 2.6712884855929788e-06, + "loss": 0.2853, + "step": 34320 + }, + { + "epoch": 0.7619227311572568, + "grad_norm": 1.1428450345993042, + "learning_rate": 2.668916636577338e-06, + "loss": 0.3438, + "step": 34325 + }, + { + "epoch": 0.7620337177167845, + "grad_norm": 1.0615078210830688, + "learning_rate": 2.6665456788502276e-06, + "loss": 0.5136, + "step": 34330 + }, + { + "epoch": 0.7621447042763121, + "grad_norm": 1.0006989240646362, + "learning_rate": 2.6641756126998964e-06, + "loss": 0.3704, + "step": 34335 + }, + { + "epoch": 0.7622556908358398, + "grad_norm": 1.4435336589813232, + "learning_rate": 2.6618064384144925e-06, + "loss": 0.3761, + "step": 34340 + }, + { + "epoch": 0.7623666773953675, + "grad_norm": 1.9680712223052979, + "learning_rate": 2.6594381562820537e-06, + "loss": 0.4304, + "step": 34345 + }, + { + "epoch": 0.762477663954895, + "grad_norm": 2.007110834121704, + "learning_rate": 2.6570707665905026e-06, + "loss": 0.4103, + "step": 34350 + }, + { + "epoch": 0.7625886505144227, + "grad_norm": 1.2605485916137695, + "learning_rate": 2.654704269627665e-06, + "loss": 0.3096, + "step": 34355 + }, + { + "epoch": 0.7626996370739504, + "grad_norm": 1.315887451171875, + "learning_rate": 2.6523386656812444e-06, + "loss": 0.4719, + "step": 34360 + }, + { + "epoch": 0.762810623633478, + "grad_norm": 2.135716676712036, + "learning_rate": 2.6499739550388505e-06, + "loss": 0.3175, + "step": 34365 + }, + { + "epoch": 0.7629216101930056, + "grad_norm": 0.8498549461364746, + "learning_rate": 2.647610137987969e-06, + "loss": 0.2739, + "step": 34370 + }, + { + "epoch": 0.7630325967525333, + "grad_norm": 0.4586606025695801, + "learning_rate": 2.64524721481599e-06, + "loss": 0.297, + "step": 34375 + }, + { + "epoch": 0.7631435833120609, + "grad_norm": 0.8612726330757141, + "learning_rate": 2.6428851858101913e-06, + "loss": 0.4003, + "step": 34380 + }, + { + "epoch": 0.7632545698715886, + "grad_norm": 1.0840905904769897, + "learning_rate": 2.6405240512577344e-06, + "loss": 0.4075, + "step": 34385 + }, + { + "epoch": 0.7633655564311161, + "grad_norm": 1.3596967458724976, + "learning_rate": 2.638163811445685e-06, + "loss": 0.414, + "step": 34390 + }, + { + "epoch": 0.7634765429906438, + "grad_norm": 1.3669801950454712, + "learning_rate": 2.635804466660986e-06, + "loss": 0.3763, + "step": 34395 + }, + { + "epoch": 0.7635875295501715, + "grad_norm": 1.8525625467300415, + "learning_rate": 2.633446017190484e-06, + "loss": 0.2287, + "step": 34400 + }, + { + "epoch": 0.7636985161096991, + "grad_norm": 0.44575735926628113, + "learning_rate": 2.631088463320911e-06, + "loss": 0.4444, + "step": 34405 + }, + { + "epoch": 0.7638095026692268, + "grad_norm": 0.9332336187362671, + "learning_rate": 2.6287318053388877e-06, + "loss": 0.2368, + "step": 34410 + }, + { + "epoch": 0.7639204892287544, + "grad_norm": 1.2206798791885376, + "learning_rate": 2.6263760435309317e-06, + "loss": 0.4207, + "step": 34415 + }, + { + "epoch": 0.764031475788282, + "grad_norm": 1.5997384786605835, + "learning_rate": 2.624021178183446e-06, + "loss": 0.3723, + "step": 34420 + }, + { + "epoch": 0.7641424623478097, + "grad_norm": 1.4401050806045532, + "learning_rate": 2.6216672095827267e-06, + "loss": 0.4989, + "step": 34425 + }, + { + "epoch": 0.7642534489073374, + "grad_norm": 1.4782438278198242, + "learning_rate": 2.6193141380149665e-06, + "loss": 0.2464, + "step": 34430 + }, + { + "epoch": 0.7643644354668649, + "grad_norm": 1.1002386808395386, + "learning_rate": 2.616961963766237e-06, + "loss": 0.3429, + "step": 34435 + }, + { + "epoch": 0.7644754220263926, + "grad_norm": 0.8980961441993713, + "learning_rate": 2.614610687122515e-06, + "loss": 0.4068, + "step": 34440 + }, + { + "epoch": 0.7645864085859202, + "grad_norm": 0.9613227248191833, + "learning_rate": 2.612260308369654e-06, + "loss": 0.436, + "step": 34445 + }, + { + "epoch": 0.7646973951454479, + "grad_norm": 1.1408544778823853, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.3591, + "step": 34450 + }, + { + "epoch": 0.7648083817049756, + "grad_norm": 0.9857153296470642, + "learning_rate": 2.607562245679421e-06, + "loss": 0.3469, + "step": 34455 + }, + { + "epoch": 0.7649193682645031, + "grad_norm": 1.092454195022583, + "learning_rate": 2.605214562313222e-06, + "loss": 0.4233, + "step": 34460 + }, + { + "epoch": 0.7650303548240308, + "grad_norm": 1.2938358783721924, + "learning_rate": 2.602867777980239e-06, + "loss": 0.4979, + "step": 34465 + }, + { + "epoch": 0.7651413413835585, + "grad_norm": 1.9508321285247803, + "learning_rate": 2.6005218929657816e-06, + "loss": 0.5095, + "step": 34470 + }, + { + "epoch": 0.7652523279430861, + "grad_norm": 1.2797309160232544, + "learning_rate": 2.598176907555058e-06, + "loss": 0.3765, + "step": 34475 + }, + { + "epoch": 0.7653633145026137, + "grad_norm": 1.148760437965393, + "learning_rate": 2.5958328220331597e-06, + "loss": 0.3695, + "step": 34480 + }, + { + "epoch": 0.7654743010621414, + "grad_norm": 0.8093987703323364, + "learning_rate": 2.593489636685076e-06, + "loss": 0.3263, + "step": 34485 + }, + { + "epoch": 0.765585287621669, + "grad_norm": 0.899083137512207, + "learning_rate": 2.5911473517956854e-06, + "loss": 0.4253, + "step": 34490 + }, + { + "epoch": 0.7656962741811967, + "grad_norm": 1.1059026718139648, + "learning_rate": 2.588805967649749e-06, + "loss": 0.3879, + "step": 34495 + }, + { + "epoch": 0.7658072607407242, + "grad_norm": 1.5844160318374634, + "learning_rate": 2.58646548453193e-06, + "loss": 0.4274, + "step": 34500 + }, + { + "epoch": 0.7659182473002519, + "grad_norm": 1.7420692443847656, + "learning_rate": 2.584125902726773e-06, + "loss": 0.3555, + "step": 34505 + }, + { + "epoch": 0.7660292338597796, + "grad_norm": 1.1865463256835938, + "learning_rate": 2.58178722251872e-06, + "loss": 0.4135, + "step": 34510 + }, + { + "epoch": 0.7661402204193072, + "grad_norm": 1.4520313739776611, + "learning_rate": 2.579449444192095e-06, + "loss": 0.5398, + "step": 34515 + }, + { + "epoch": 0.7662512069788349, + "grad_norm": 1.1610015630722046, + "learning_rate": 2.5771125680311227e-06, + "loss": 0.4224, + "step": 34520 + }, + { + "epoch": 0.7663621935383625, + "grad_norm": 1.9998167753219604, + "learning_rate": 2.5747765943199065e-06, + "loss": 0.3173, + "step": 34525 + }, + { + "epoch": 0.7664731800978901, + "grad_norm": 1.0180845260620117, + "learning_rate": 2.572441523342454e-06, + "loss": 0.3311, + "step": 34530 + }, + { + "epoch": 0.7665841666574178, + "grad_norm": 1.30564546585083, + "learning_rate": 2.5701073553826474e-06, + "loss": 0.408, + "step": 34535 + }, + { + "epoch": 0.7666951532169455, + "grad_norm": 1.6283055543899536, + "learning_rate": 2.5677740907242733e-06, + "loss": 0.4124, + "step": 34540 + }, + { + "epoch": 0.766806139776473, + "grad_norm": 1.1568033695220947, + "learning_rate": 2.565441729650997e-06, + "loss": 0.2596, + "step": 34545 + }, + { + "epoch": 0.7669171263360007, + "grad_norm": 0.9474804401397705, + "learning_rate": 2.5631102724463843e-06, + "loss": 0.407, + "step": 34550 + }, + { + "epoch": 0.7670281128955283, + "grad_norm": 1.356563687324524, + "learning_rate": 2.56077971939388e-06, + "loss": 0.4522, + "step": 34555 + }, + { + "epoch": 0.767139099455056, + "grad_norm": 1.5708154439926147, + "learning_rate": 2.5584500707768314e-06, + "loss": 0.4666, + "step": 34560 + }, + { + "epoch": 0.7672500860145837, + "grad_norm": 0.9258326292037964, + "learning_rate": 2.5561213268784634e-06, + "loss": 0.3345, + "step": 34565 + }, + { + "epoch": 0.7673610725741112, + "grad_norm": 1.1713894605636597, + "learning_rate": 2.5537934879818994e-06, + "loss": 0.3471, + "step": 34570 + }, + { + "epoch": 0.7674720591336389, + "grad_norm": 2.1963934898376465, + "learning_rate": 2.5514665543701535e-06, + "loss": 0.4385, + "step": 34575 + }, + { + "epoch": 0.7675830456931666, + "grad_norm": 1.4954826831817627, + "learning_rate": 2.5491405263261205e-06, + "loss": 0.3973, + "step": 34580 + }, + { + "epoch": 0.7676940322526942, + "grad_norm": 0.7373457551002502, + "learning_rate": 2.546815404132598e-06, + "loss": 0.2641, + "step": 34585 + }, + { + "epoch": 0.7678050188122219, + "grad_norm": 0.7391909956932068, + "learning_rate": 2.544491188072258e-06, + "loss": 0.2995, + "step": 34590 + }, + { + "epoch": 0.7679160053717495, + "grad_norm": 1.2540416717529297, + "learning_rate": 2.5421678784276772e-06, + "loss": 0.3834, + "step": 34595 + }, + { + "epoch": 0.7680269919312771, + "grad_norm": 1.5087982416152954, + "learning_rate": 2.539845475481316e-06, + "loss": 0.4047, + "step": 34600 + }, + { + "epoch": 0.7681379784908048, + "grad_norm": 0.9404889345169067, + "learning_rate": 2.537523979515519e-06, + "loss": 0.4289, + "step": 34605 + }, + { + "epoch": 0.7682489650503324, + "grad_norm": 1.318764090538025, + "learning_rate": 2.535203390812534e-06, + "loss": 0.3573, + "step": 34610 + }, + { + "epoch": 0.76835995160986, + "grad_norm": 0.981238603591919, + "learning_rate": 2.532883709654481e-06, + "loss": 0.2748, + "step": 34615 + }, + { + "epoch": 0.7684709381693877, + "grad_norm": 1.5967328548431396, + "learning_rate": 2.5305649363233885e-06, + "loss": 0.3664, + "step": 34620 + }, + { + "epoch": 0.7685819247289153, + "grad_norm": 1.0770137310028076, + "learning_rate": 2.5282470711011564e-06, + "loss": 0.3657, + "step": 34625 + }, + { + "epoch": 0.768692911288443, + "grad_norm": 1.1292991638183594, + "learning_rate": 2.525930114269587e-06, + "loss": 0.4482, + "step": 34630 + }, + { + "epoch": 0.7688038978479707, + "grad_norm": 0.9217401742935181, + "learning_rate": 2.523614066110371e-06, + "loss": 0.5845, + "step": 34635 + }, + { + "epoch": 0.7689148844074982, + "grad_norm": 1.4295508861541748, + "learning_rate": 2.5212989269050814e-06, + "loss": 0.467, + "step": 34640 + }, + { + "epoch": 0.7690258709670259, + "grad_norm": 1.5857596397399902, + "learning_rate": 2.5189846969351882e-06, + "loss": 0.3764, + "step": 34645 + }, + { + "epoch": 0.7691368575265536, + "grad_norm": 0.6380545496940613, + "learning_rate": 2.516671376482045e-06, + "loss": 0.4617, + "step": 34650 + }, + { + "epoch": 0.7692478440860812, + "grad_norm": 1.4921594858169556, + "learning_rate": 2.5143589658268974e-06, + "loss": 0.3563, + "step": 34655 + }, + { + "epoch": 0.7693588306456088, + "grad_norm": 0.9895206689834595, + "learning_rate": 2.5120474652508843e-06, + "loss": 0.3998, + "step": 34660 + }, + { + "epoch": 0.7694698172051364, + "grad_norm": 0.9758628010749817, + "learning_rate": 2.509736875035026e-06, + "loss": 0.4492, + "step": 34665 + }, + { + "epoch": 0.7695808037646641, + "grad_norm": 1.2848100662231445, + "learning_rate": 2.5074271954602404e-06, + "loss": 0.4602, + "step": 34670 + }, + { + "epoch": 0.7696917903241918, + "grad_norm": 1.179426670074463, + "learning_rate": 2.5051184268073246e-06, + "loss": 0.4438, + "step": 34675 + }, + { + "epoch": 0.7698027768837193, + "grad_norm": 1.0144343376159668, + "learning_rate": 2.502810569356976e-06, + "loss": 0.2947, + "step": 34680 + }, + { + "epoch": 0.769913763443247, + "grad_norm": 2.486269235610962, + "learning_rate": 2.5005036233897763e-06, + "loss": 0.3491, + "step": 34685 + }, + { + "epoch": 0.7700247500027747, + "grad_norm": 1.8819888830184937, + "learning_rate": 2.498197589186193e-06, + "loss": 0.416, + "step": 34690 + }, + { + "epoch": 0.7701357365623023, + "grad_norm": 1.1190801858901978, + "learning_rate": 2.4958924670265905e-06, + "loss": 0.5151, + "step": 34695 + }, + { + "epoch": 0.77024672312183, + "grad_norm": 0.8236473798751831, + "learning_rate": 2.4935882571912107e-06, + "loss": 0.3966, + "step": 34700 + }, + { + "epoch": 0.7703577096813576, + "grad_norm": 1.3798598051071167, + "learning_rate": 2.4912849599602007e-06, + "loss": 0.3029, + "step": 34705 + }, + { + "epoch": 0.7704686962408852, + "grad_norm": 1.6030954122543335, + "learning_rate": 2.4889825756135786e-06, + "loss": 0.515, + "step": 34710 + }, + { + "epoch": 0.7705796828004129, + "grad_norm": 0.8956512808799744, + "learning_rate": 2.4866811044312667e-06, + "loss": 0.4533, + "step": 34715 + }, + { + "epoch": 0.7706906693599405, + "grad_norm": 1.3995407819747925, + "learning_rate": 2.4843805466930706e-06, + "loss": 0.4383, + "step": 34720 + }, + { + "epoch": 0.7708016559194681, + "grad_norm": 0.7382140159606934, + "learning_rate": 2.4820809026786787e-06, + "loss": 0.3899, + "step": 34725 + }, + { + "epoch": 0.7709126424789958, + "grad_norm": 1.231865406036377, + "learning_rate": 2.4797821726676806e-06, + "loss": 0.2791, + "step": 34730 + }, + { + "epoch": 0.7710236290385234, + "grad_norm": 1.6912078857421875, + "learning_rate": 2.4774843569395425e-06, + "loss": 0.3465, + "step": 34735 + }, + { + "epoch": 0.7711346155980511, + "grad_norm": 1.707521677017212, + "learning_rate": 2.4751874557736278e-06, + "loss": 0.6154, + "step": 34740 + }, + { + "epoch": 0.7712456021575788, + "grad_norm": 0.8998667001724243, + "learning_rate": 2.472891469449188e-06, + "loss": 0.3077, + "step": 34745 + }, + { + "epoch": 0.7713565887171063, + "grad_norm": 1.4956085681915283, + "learning_rate": 2.4705963982453575e-06, + "loss": 0.424, + "step": 34750 + }, + { + "epoch": 0.771467575276634, + "grad_norm": 1.070346474647522, + "learning_rate": 2.4683022424411674e-06, + "loss": 0.412, + "step": 34755 + }, + { + "epoch": 0.7715785618361617, + "grad_norm": 0.7793159484863281, + "learning_rate": 2.466009002315529e-06, + "loss": 0.4079, + "step": 34760 + }, + { + "epoch": 0.7716895483956893, + "grad_norm": 0.7783902287483215, + "learning_rate": 2.463716678147251e-06, + "loss": 0.4412, + "step": 34765 + }, + { + "epoch": 0.7718005349552169, + "grad_norm": 1.293110966682434, + "learning_rate": 2.4614252702150234e-06, + "loss": 0.5132, + "step": 34770 + }, + { + "epoch": 0.7719115215147445, + "grad_norm": 0.8567259311676025, + "learning_rate": 2.4591347787974307e-06, + "loss": 0.3509, + "step": 34775 + }, + { + "epoch": 0.7720225080742722, + "grad_norm": 1.187793254852295, + "learning_rate": 2.4568452041729383e-06, + "loss": 0.436, + "step": 34780 + }, + { + "epoch": 0.7721334946337999, + "grad_norm": 1.8018677234649658, + "learning_rate": 2.4545565466199115e-06, + "loss": 0.4194, + "step": 34785 + }, + { + "epoch": 0.7722444811933274, + "grad_norm": 2.446875810623169, + "learning_rate": 2.4522688064165923e-06, + "loss": 0.2922, + "step": 34790 + }, + { + "epoch": 0.7723554677528551, + "grad_norm": 1.2683809995651245, + "learning_rate": 2.44998198384112e-06, + "loss": 0.3612, + "step": 34795 + }, + { + "epoch": 0.7724664543123828, + "grad_norm": 1.1614607572555542, + "learning_rate": 2.4476960791715154e-06, + "loss": 0.3466, + "step": 34800 + }, + { + "epoch": 0.7725774408719104, + "grad_norm": 1.0021891593933105, + "learning_rate": 2.4454110926856955e-06, + "loss": 0.4664, + "step": 34805 + }, + { + "epoch": 0.7726884274314381, + "grad_norm": 1.3776967525482178, + "learning_rate": 2.443127024661456e-06, + "loss": 0.5079, + "step": 34810 + }, + { + "epoch": 0.7727994139909657, + "grad_norm": 0.9636569619178772, + "learning_rate": 2.4408438753764918e-06, + "loss": 0.3864, + "step": 34815 + }, + { + "epoch": 0.7729104005504933, + "grad_norm": 0.6618173122406006, + "learning_rate": 2.438561645108375e-06, + "loss": 0.4138, + "step": 34820 + }, + { + "epoch": 0.773021387110021, + "grad_norm": 1.2725285291671753, + "learning_rate": 2.4362803341345744e-06, + "loss": 0.4558, + "step": 34825 + }, + { + "epoch": 0.7731323736695486, + "grad_norm": 1.3965803384780884, + "learning_rate": 2.4339999427324467e-06, + "loss": 0.4042, + "step": 34830 + }, + { + "epoch": 0.7732433602290762, + "grad_norm": 1.222990870475769, + "learning_rate": 2.4317204711792286e-06, + "loss": 0.3529, + "step": 34835 + }, + { + "epoch": 0.7733543467886039, + "grad_norm": 1.7446800470352173, + "learning_rate": 2.429441919752057e-06, + "loss": 0.4438, + "step": 34840 + }, + { + "epoch": 0.7734653333481315, + "grad_norm": 0.9652834534645081, + "learning_rate": 2.4271642887279434e-06, + "loss": 0.2622, + "step": 34845 + }, + { + "epoch": 0.7735763199076592, + "grad_norm": 0.9489607810974121, + "learning_rate": 2.424887578383799e-06, + "loss": 0.4876, + "step": 34850 + }, + { + "epoch": 0.7736873064671869, + "grad_norm": 1.6105998754501343, + "learning_rate": 2.4226117889964206e-06, + "loss": 0.3108, + "step": 34855 + }, + { + "epoch": 0.7737982930267144, + "grad_norm": 1.1085927486419678, + "learning_rate": 2.4203369208424853e-06, + "loss": 0.3468, + "step": 34860 + }, + { + "epoch": 0.7739092795862421, + "grad_norm": 1.0164639949798584, + "learning_rate": 2.4180629741985707e-06, + "loss": 0.4253, + "step": 34865 + }, + { + "epoch": 0.7740202661457698, + "grad_norm": 1.6611783504486084, + "learning_rate": 2.4157899493411274e-06, + "loss": 0.3143, + "step": 34870 + }, + { + "epoch": 0.7741312527052974, + "grad_norm": 1.1299711465835571, + "learning_rate": 2.4135178465465103e-06, + "loss": 0.4837, + "step": 34875 + }, + { + "epoch": 0.774242239264825, + "grad_norm": 0.9263424873352051, + "learning_rate": 2.411246666090947e-06, + "loss": 0.4062, + "step": 34880 + }, + { + "epoch": 0.7743532258243526, + "grad_norm": 1.4223159551620483, + "learning_rate": 2.408976408250564e-06, + "loss": 0.4009, + "step": 34885 + }, + { + "epoch": 0.7744642123838803, + "grad_norm": 0.8300201892852783, + "learning_rate": 2.4067070733013742e-06, + "loss": 0.4451, + "step": 34890 + }, + { + "epoch": 0.774575198943408, + "grad_norm": 1.0428576469421387, + "learning_rate": 2.4044386615192682e-06, + "loss": 0.3676, + "step": 34895 + }, + { + "epoch": 0.7746861855029356, + "grad_norm": 0.931232750415802, + "learning_rate": 2.4021711731800402e-06, + "loss": 0.4564, + "step": 34900 + }, + { + "epoch": 0.7747971720624632, + "grad_norm": 1.0562961101531982, + "learning_rate": 2.3999046085593567e-06, + "loss": 0.3875, + "step": 34905 + }, + { + "epoch": 0.7749081586219909, + "grad_norm": 1.2111852169036865, + "learning_rate": 2.397638967932783e-06, + "loss": 0.3402, + "step": 34910 + }, + { + "epoch": 0.7750191451815185, + "grad_norm": 1.5517101287841797, + "learning_rate": 2.3953742515757684e-06, + "loss": 0.367, + "step": 34915 + }, + { + "epoch": 0.7751301317410462, + "grad_norm": 0.7939963340759277, + "learning_rate": 2.3931104597636467e-06, + "loss": 0.3447, + "step": 34920 + }, + { + "epoch": 0.7752411183005739, + "grad_norm": 1.2602977752685547, + "learning_rate": 2.3908475927716456e-06, + "loss": 0.3851, + "step": 34925 + }, + { + "epoch": 0.7753521048601014, + "grad_norm": 0.8524995446205139, + "learning_rate": 2.388585650874873e-06, + "loss": 0.3435, + "step": 34930 + }, + { + "epoch": 0.7754630914196291, + "grad_norm": 1.17127525806427, + "learning_rate": 2.3863246343483306e-06, + "loss": 0.4395, + "step": 34935 + }, + { + "epoch": 0.7755740779791567, + "grad_norm": 0.8931377530097961, + "learning_rate": 2.384064543466906e-06, + "loss": 0.3894, + "step": 34940 + }, + { + "epoch": 0.7756850645386844, + "grad_norm": 1.257630467414856, + "learning_rate": 2.3818053785053717e-06, + "loss": 0.3443, + "step": 34945 + }, + { + "epoch": 0.775796051098212, + "grad_norm": 1.4244590997695923, + "learning_rate": 2.379547139738392e-06, + "loss": 0.4465, + "step": 34950 + }, + { + "epoch": 0.7759070376577396, + "grad_norm": 1.145723581314087, + "learning_rate": 2.377289827440511e-06, + "loss": 0.4198, + "step": 34955 + }, + { + "epoch": 0.7760180242172673, + "grad_norm": 0.8434739112854004, + "learning_rate": 2.3750334418861707e-06, + "loss": 0.3091, + "step": 34960 + }, + { + "epoch": 0.776129010776795, + "grad_norm": 1.065153956413269, + "learning_rate": 2.37277798334969e-06, + "loss": 0.4492, + "step": 34965 + }, + { + "epoch": 0.7762399973363225, + "grad_norm": 0.8990171551704407, + "learning_rate": 2.3705234521052823e-06, + "loss": 0.2161, + "step": 34970 + }, + { + "epoch": 0.7763509838958502, + "grad_norm": 1.2386252880096436, + "learning_rate": 2.3682698484270496e-06, + "loss": 0.3579, + "step": 34975 + }, + { + "epoch": 0.7764619704553779, + "grad_norm": 1.7970882654190063, + "learning_rate": 2.3660171725889703e-06, + "loss": 0.4856, + "step": 34980 + }, + { + "epoch": 0.7765729570149055, + "grad_norm": 1.218748927116394, + "learning_rate": 2.363765424864923e-06, + "loss": 0.4453, + "step": 34985 + }, + { + "epoch": 0.7766839435744332, + "grad_norm": 0.9332694411277771, + "learning_rate": 2.361514605528663e-06, + "loss": 0.4588, + "step": 34990 + }, + { + "epoch": 0.7767949301339607, + "grad_norm": 1.112435221672058, + "learning_rate": 2.3592647148538407e-06, + "loss": 0.4827, + "step": 34995 + }, + { + "epoch": 0.7769059166934884, + "grad_norm": 0.8251892924308777, + "learning_rate": 2.3570157531139915e-06, + "loss": 0.3976, + "step": 35000 + }, + { + "epoch": 0.7770169032530161, + "grad_norm": 1.2258474826812744, + "learning_rate": 2.3547677205825313e-06, + "loss": 0.4968, + "step": 35005 + }, + { + "epoch": 0.7771278898125437, + "grad_norm": 1.9798097610473633, + "learning_rate": 2.352520617532774e-06, + "loss": 0.4555, + "step": 35010 + }, + { + "epoch": 0.7772388763720713, + "grad_norm": 2.0439610481262207, + "learning_rate": 2.350274444237911e-06, + "loss": 0.2519, + "step": 35015 + }, + { + "epoch": 0.777349862931599, + "grad_norm": 1.3134821653366089, + "learning_rate": 2.3480292009710282e-06, + "loss": 0.4432, + "step": 35020 + }, + { + "epoch": 0.7774608494911266, + "grad_norm": 0.9833825826644897, + "learning_rate": 2.345784888005088e-06, + "loss": 0.4134, + "step": 35025 + }, + { + "epoch": 0.7775718360506543, + "grad_norm": 1.082362174987793, + "learning_rate": 2.3435415056129564e-06, + "loss": 0.4299, + "step": 35030 + }, + { + "epoch": 0.777682822610182, + "grad_norm": 1.2098311185836792, + "learning_rate": 2.3412990540673663e-06, + "loss": 0.3032, + "step": 35035 + }, + { + "epoch": 0.7777938091697095, + "grad_norm": 1.1878077983856201, + "learning_rate": 2.3390575336409547e-06, + "loss": 0.2941, + "step": 35040 + }, + { + "epoch": 0.7779047957292372, + "grad_norm": 0.8910760879516602, + "learning_rate": 2.3368169446062328e-06, + "loss": 0.4637, + "step": 35045 + }, + { + "epoch": 0.7780157822887648, + "grad_norm": 1.0687915086746216, + "learning_rate": 2.334577287235609e-06, + "loss": 0.4609, + "step": 35050 + }, + { + "epoch": 0.7781267688482925, + "grad_norm": 1.5093152523040771, + "learning_rate": 2.3323385618013682e-06, + "loss": 0.4844, + "step": 35055 + }, + { + "epoch": 0.7782377554078201, + "grad_norm": 1.6555454730987549, + "learning_rate": 2.3301007685756925e-06, + "loss": 0.4811, + "step": 35060 + }, + { + "epoch": 0.7783487419673477, + "grad_norm": 0.8645807504653931, + "learning_rate": 2.3278639078306397e-06, + "loss": 0.2037, + "step": 35065 + }, + { + "epoch": 0.7784597285268754, + "grad_norm": 1.1185818910598755, + "learning_rate": 2.3256279798381664e-06, + "loss": 0.3339, + "step": 35070 + }, + { + "epoch": 0.7785707150864031, + "grad_norm": 1.836459994316101, + "learning_rate": 2.323392984870101e-06, + "loss": 0.5161, + "step": 35075 + }, + { + "epoch": 0.7786817016459306, + "grad_norm": 0.5720643997192383, + "learning_rate": 2.3211589231981723e-06, + "loss": 0.3593, + "step": 35080 + }, + { + "epoch": 0.7787926882054583, + "grad_norm": 1.4042539596557617, + "learning_rate": 2.3189257950939915e-06, + "loss": 0.4549, + "step": 35085 + }, + { + "epoch": 0.778903674764986, + "grad_norm": 1.7690019607543945, + "learning_rate": 2.3166936008290486e-06, + "loss": 0.3279, + "step": 35090 + }, + { + "epoch": 0.7790146613245136, + "grad_norm": 3.0548744201660156, + "learning_rate": 2.3144623406747335e-06, + "loss": 0.3572, + "step": 35095 + }, + { + "epoch": 0.7791256478840413, + "grad_norm": 1.4523555040359497, + "learning_rate": 2.312232014902309e-06, + "loss": 0.5051, + "step": 35100 + }, + { + "epoch": 0.7792366344435688, + "grad_norm": 1.3138129711151123, + "learning_rate": 2.310002623782933e-06, + "loss": 0.5978, + "step": 35105 + }, + { + "epoch": 0.7793476210030965, + "grad_norm": 1.1594641208648682, + "learning_rate": 2.307774167587651e-06, + "loss": 0.3501, + "step": 35110 + }, + { + "epoch": 0.7794586075626242, + "grad_norm": 1.2157979011535645, + "learning_rate": 2.3055466465873845e-06, + "loss": 0.4071, + "step": 35115 + }, + { + "epoch": 0.7795695941221518, + "grad_norm": 0.8386463522911072, + "learning_rate": 2.303320061052955e-06, + "loss": 0.4692, + "step": 35120 + }, + { + "epoch": 0.7796805806816794, + "grad_norm": 1.299099087715149, + "learning_rate": 2.301094411255057e-06, + "loss": 0.3723, + "step": 35125 + }, + { + "epoch": 0.7797915672412071, + "grad_norm": 1.3308439254760742, + "learning_rate": 2.2988696974642797e-06, + "loss": 0.4427, + "step": 35130 + }, + { + "epoch": 0.7799025538007347, + "grad_norm": 0.8495783805847168, + "learning_rate": 2.2966459199511002e-06, + "loss": 0.3364, + "step": 35135 + }, + { + "epoch": 0.7800135403602624, + "grad_norm": 1.6486972570419312, + "learning_rate": 2.2944230789858723e-06, + "loss": 0.3838, + "step": 35140 + }, + { + "epoch": 0.7801245269197901, + "grad_norm": 0.9651938080787659, + "learning_rate": 2.292201174838846e-06, + "loss": 0.3624, + "step": 35145 + }, + { + "epoch": 0.7802355134793176, + "grad_norm": 0.7932876944541931, + "learning_rate": 2.2899802077801482e-06, + "loss": 0.3292, + "step": 35150 + }, + { + "epoch": 0.7803465000388453, + "grad_norm": 1.7208325862884521, + "learning_rate": 2.2877601780798033e-06, + "loss": 0.3253, + "step": 35155 + }, + { + "epoch": 0.7804574865983729, + "grad_norm": 1.4360439777374268, + "learning_rate": 2.2855410860077065e-06, + "loss": 0.3437, + "step": 35160 + }, + { + "epoch": 0.7805684731579006, + "grad_norm": 1.357067346572876, + "learning_rate": 2.2833229318336537e-06, + "loss": 0.3154, + "step": 35165 + }, + { + "epoch": 0.7806794597174282, + "grad_norm": 1.123355507850647, + "learning_rate": 2.281105715827321e-06, + "loss": 0.4595, + "step": 35170 + }, + { + "epoch": 0.7807904462769558, + "grad_norm": 3.049179792404175, + "learning_rate": 2.278889438258266e-06, + "loss": 0.4447, + "step": 35175 + }, + { + "epoch": 0.7809014328364835, + "grad_norm": 0.8946663737297058, + "learning_rate": 2.2766740993959404e-06, + "loss": 0.3773, + "step": 35180 + }, + { + "epoch": 0.7810124193960112, + "grad_norm": 1.7017005681991577, + "learning_rate": 2.2744596995096733e-06, + "loss": 0.4379, + "step": 35185 + }, + { + "epoch": 0.7811234059555388, + "grad_norm": 1.0567814111709595, + "learning_rate": 2.272246238868687e-06, + "loss": 0.485, + "step": 35190 + }, + { + "epoch": 0.7812343925150664, + "grad_norm": 1.122822642326355, + "learning_rate": 2.2700337177420895e-06, + "loss": 0.6283, + "step": 35195 + }, + { + "epoch": 0.7813453790745941, + "grad_norm": 1.1927663087844849, + "learning_rate": 2.267822136398864e-06, + "loss": 0.4545, + "step": 35200 + }, + { + "epoch": 0.7814563656341217, + "grad_norm": 1.7498384714126587, + "learning_rate": 2.2656114951078957e-06, + "loss": 0.2906, + "step": 35205 + }, + { + "epoch": 0.7815673521936494, + "grad_norm": 1.5318573713302612, + "learning_rate": 2.263401794137938e-06, + "loss": 0.5141, + "step": 35210 + }, + { + "epoch": 0.7816783387531769, + "grad_norm": 0.884480357170105, + "learning_rate": 2.261193033757645e-06, + "loss": 0.5263, + "step": 35215 + }, + { + "epoch": 0.7817893253127046, + "grad_norm": 1.082189679145813, + "learning_rate": 2.2589852142355516e-06, + "loss": 0.4145, + "step": 35220 + }, + { + "epoch": 0.7819003118722323, + "grad_norm": 1.2107759714126587, + "learning_rate": 2.256778335840072e-06, + "loss": 0.4267, + "step": 35225 + }, + { + "epoch": 0.7820112984317599, + "grad_norm": 0.9035611152648926, + "learning_rate": 2.2545723988395164e-06, + "loss": 0.3775, + "step": 35230 + }, + { + "epoch": 0.7821222849912876, + "grad_norm": 0.9946116209030151, + "learning_rate": 2.2523674035020693e-06, + "loss": 0.3925, + "step": 35235 + }, + { + "epoch": 0.7822332715508152, + "grad_norm": 0.7790983319282532, + "learning_rate": 2.250163350095812e-06, + "loss": 0.3855, + "step": 35240 + }, + { + "epoch": 0.7823442581103428, + "grad_norm": 1.2134404182434082, + "learning_rate": 2.2479602388887013e-06, + "loss": 0.2843, + "step": 35245 + }, + { + "epoch": 0.7824552446698705, + "grad_norm": 1.49185049533844, + "learning_rate": 2.245758070148587e-06, + "loss": 0.3673, + "step": 35250 + }, + { + "epoch": 0.7825662312293982, + "grad_norm": 0.7419693470001221, + "learning_rate": 2.2435568441432034e-06, + "loss": 0.3048, + "step": 35255 + }, + { + "epoch": 0.7826772177889257, + "grad_norm": 1.4088214635849, + "learning_rate": 2.241356561140162e-06, + "loss": 0.3416, + "step": 35260 + }, + { + "epoch": 0.7827882043484534, + "grad_norm": 1.0905264616012573, + "learning_rate": 2.2391572214069725e-06, + "loss": 0.292, + "step": 35265 + }, + { + "epoch": 0.782899190907981, + "grad_norm": 1.7834142446517944, + "learning_rate": 2.2369588252110175e-06, + "loss": 0.3821, + "step": 35270 + }, + { + "epoch": 0.7830101774675087, + "grad_norm": 0.674659788608551, + "learning_rate": 2.234761372819577e-06, + "loss": 0.2597, + "step": 35275 + }, + { + "epoch": 0.7831211640270364, + "grad_norm": 1.2916734218597412, + "learning_rate": 2.232564864499802e-06, + "loss": 0.4008, + "step": 35280 + }, + { + "epoch": 0.7832321505865639, + "grad_norm": 0.8347023725509644, + "learning_rate": 2.2303693005187445e-06, + "loss": 0.3105, + "step": 35285 + }, + { + "epoch": 0.7833431371460916, + "grad_norm": 0.6374974846839905, + "learning_rate": 2.228174681143327e-06, + "loss": 0.3454, + "step": 35290 + }, + { + "epoch": 0.7834541237056193, + "grad_norm": 0.8511948585510254, + "learning_rate": 2.22598100664037e-06, + "loss": 0.2354, + "step": 35295 + }, + { + "epoch": 0.7835651102651469, + "grad_norm": 1.0302071571350098, + "learning_rate": 2.223788277276567e-06, + "loss": 0.3177, + "step": 35300 + }, + { + "epoch": 0.7836760968246745, + "grad_norm": 1.5168780088424683, + "learning_rate": 2.2215964933185097e-06, + "loss": 0.4536, + "step": 35305 + }, + { + "epoch": 0.7837870833842022, + "grad_norm": 0.7155570387840271, + "learning_rate": 2.2194056550326605e-06, + "loss": 0.505, + "step": 35310 + }, + { + "epoch": 0.7838980699437298, + "grad_norm": 1.4938472509384155, + "learning_rate": 2.217215762685381e-06, + "loss": 0.4783, + "step": 35315 + }, + { + "epoch": 0.7840090565032575, + "grad_norm": 1.1102867126464844, + "learning_rate": 2.2150268165429035e-06, + "loss": 0.1669, + "step": 35320 + }, + { + "epoch": 0.784120043062785, + "grad_norm": 0.8757681250572205, + "learning_rate": 2.212838816871361e-06, + "loss": 0.3824, + "step": 35325 + }, + { + "epoch": 0.7842310296223127, + "grad_norm": 1.2006827592849731, + "learning_rate": 2.2106517639367552e-06, + "loss": 0.5301, + "step": 35330 + }, + { + "epoch": 0.7843420161818404, + "grad_norm": 1.375402808189392, + "learning_rate": 2.208465658004986e-06, + "loss": 0.4124, + "step": 35335 + }, + { + "epoch": 0.784453002741368, + "grad_norm": 1.2572929859161377, + "learning_rate": 2.206280499341833e-06, + "loss": 0.4598, + "step": 35340 + }, + { + "epoch": 0.7845639893008957, + "grad_norm": 0.8071463108062744, + "learning_rate": 2.204096288212956e-06, + "loss": 0.2912, + "step": 35345 + }, + { + "epoch": 0.7846749758604233, + "grad_norm": 0.8503620028495789, + "learning_rate": 2.2019130248839092e-06, + "loss": 0.4762, + "step": 35350 + }, + { + "epoch": 0.7847859624199509, + "grad_norm": 0.8228245377540588, + "learning_rate": 2.1997307096201228e-06, + "loss": 0.2941, + "step": 35355 + }, + { + "epoch": 0.7848969489794786, + "grad_norm": 1.9255098104476929, + "learning_rate": 2.1975493426869155e-06, + "loss": 0.4365, + "step": 35360 + }, + { + "epoch": 0.7850079355390063, + "grad_norm": 1.1783138513565063, + "learning_rate": 2.195368924349495e-06, + "loss": 0.5566, + "step": 35365 + }, + { + "epoch": 0.7851189220985338, + "grad_norm": 1.8189682960510254, + "learning_rate": 2.1931894548729425e-06, + "loss": 0.3299, + "step": 35370 + }, + { + "epoch": 0.7852299086580615, + "grad_norm": 1.408636212348938, + "learning_rate": 2.1910109345222377e-06, + "loss": 0.3573, + "step": 35375 + }, + { + "epoch": 0.7853408952175891, + "grad_norm": 1.0059270858764648, + "learning_rate": 2.1888333635622305e-06, + "loss": 0.3375, + "step": 35380 + }, + { + "epoch": 0.7854518817771168, + "grad_norm": 1.2585111856460571, + "learning_rate": 2.1866567422576667e-06, + "loss": 0.4384, + "step": 35385 + }, + { + "epoch": 0.7855628683366445, + "grad_norm": 1.1098383665084839, + "learning_rate": 2.1844810708731755e-06, + "loss": 0.4599, + "step": 35390 + }, + { + "epoch": 0.785673854896172, + "grad_norm": 1.0346242189407349, + "learning_rate": 2.182306349673261e-06, + "loss": 0.4093, + "step": 35395 + }, + { + "epoch": 0.7857848414556997, + "grad_norm": 0.9923444390296936, + "learning_rate": 2.180132578922326e-06, + "loss": 0.3588, + "step": 35400 + }, + { + "epoch": 0.7858958280152274, + "grad_norm": 0.7084795236587524, + "learning_rate": 2.1779597588846426e-06, + "loss": 0.2708, + "step": 35405 + }, + { + "epoch": 0.786006814574755, + "grad_norm": 0.9460107088088989, + "learning_rate": 2.175787889824381e-06, + "loss": 0.434, + "step": 35410 + }, + { + "epoch": 0.7861178011342826, + "grad_norm": 1.1864134073257446, + "learning_rate": 2.1736169720055853e-06, + "loss": 0.3858, + "step": 35415 + }, + { + "epoch": 0.7862287876938103, + "grad_norm": 1.140289068222046, + "learning_rate": 2.17144700569219e-06, + "loss": 0.3212, + "step": 35420 + }, + { + "epoch": 0.7863397742533379, + "grad_norm": 1.0600495338439941, + "learning_rate": 2.1692779911480156e-06, + "loss": 0.3844, + "step": 35425 + }, + { + "epoch": 0.7864507608128656, + "grad_norm": 2.1447246074676514, + "learning_rate": 2.167109928636759e-06, + "loss": 0.4349, + "step": 35430 + }, + { + "epoch": 0.7865617473723931, + "grad_norm": 0.8802706003189087, + "learning_rate": 2.16494281842201e-06, + "loss": 0.3403, + "step": 35435 + }, + { + "epoch": 0.7866727339319208, + "grad_norm": 0.8940117359161377, + "learning_rate": 2.162776660767233e-06, + "loss": 0.5823, + "step": 35440 + }, + { + "epoch": 0.7867837204914485, + "grad_norm": 1.7119009494781494, + "learning_rate": 2.160611455935786e-06, + "loss": 0.4244, + "step": 35445 + }, + { + "epoch": 0.7868947070509761, + "grad_norm": 1.043269157409668, + "learning_rate": 2.15844720419091e-06, + "loss": 0.3863, + "step": 35450 + }, + { + "epoch": 0.7870056936105038, + "grad_norm": 1.3950551748275757, + "learning_rate": 2.1562839057957218e-06, + "loss": 0.4137, + "step": 35455 + }, + { + "epoch": 0.7871166801700314, + "grad_norm": 1.2941358089447021, + "learning_rate": 2.154121561013233e-06, + "loss": 0.4987, + "step": 35460 + }, + { + "epoch": 0.787227666729559, + "grad_norm": 0.9239951372146606, + "learning_rate": 2.1519601701063285e-06, + "loss": 0.4469, + "step": 35465 + }, + { + "epoch": 0.7873386532890867, + "grad_norm": 1.5648642778396606, + "learning_rate": 2.149799733337786e-06, + "loss": 0.3853, + "step": 35470 + }, + { + "epoch": 0.7874496398486144, + "grad_norm": 1.719732403755188, + "learning_rate": 2.1476402509702687e-06, + "loss": 0.3693, + "step": 35475 + }, + { + "epoch": 0.787560626408142, + "grad_norm": 1.4296544790267944, + "learning_rate": 2.1454817232663117e-06, + "loss": 0.3076, + "step": 35480 + }, + { + "epoch": 0.7876716129676696, + "grad_norm": 1.3797460794448853, + "learning_rate": 2.1433241504883463e-06, + "loss": 0.477, + "step": 35485 + }, + { + "epoch": 0.7877825995271972, + "grad_norm": 1.2195448875427246, + "learning_rate": 2.1411675328986802e-06, + "loss": 0.3835, + "step": 35490 + }, + { + "epoch": 0.7878935860867249, + "grad_norm": 1.3560128211975098, + "learning_rate": 2.139011870759511e-06, + "loss": 0.3778, + "step": 35495 + }, + { + "epoch": 0.7880045726462526, + "grad_norm": 1.120679259300232, + "learning_rate": 2.1368571643329118e-06, + "loss": 0.4081, + "step": 35500 + }, + { + "epoch": 0.7881155592057801, + "grad_norm": 1.4044411182403564, + "learning_rate": 2.134703413880851e-06, + "loss": 0.5043, + "step": 35505 + }, + { + "epoch": 0.7882265457653078, + "grad_norm": 1.3516502380371094, + "learning_rate": 2.132550619665168e-06, + "loss": 0.5035, + "step": 35510 + }, + { + "epoch": 0.7883375323248355, + "grad_norm": 1.0625264644622803, + "learning_rate": 2.130398781947598e-06, + "loss": 0.2935, + "step": 35515 + }, + { + "epoch": 0.7884485188843631, + "grad_norm": 1.0616987943649292, + "learning_rate": 2.128247900989748e-06, + "loss": 0.429, + "step": 35520 + }, + { + "epoch": 0.7885595054438908, + "grad_norm": 1.1464285850524902, + "learning_rate": 2.126097977053122e-06, + "loss": 0.4682, + "step": 35525 + }, + { + "epoch": 0.7886704920034184, + "grad_norm": 1.5215123891830444, + "learning_rate": 2.1239490103990946e-06, + "loss": 0.4505, + "step": 35530 + }, + { + "epoch": 0.788781478562946, + "grad_norm": 1.3544058799743652, + "learning_rate": 2.1218010012889347e-06, + "loss": 0.4219, + "step": 35535 + }, + { + "epoch": 0.7888924651224737, + "grad_norm": 1.2148548364639282, + "learning_rate": 2.1196539499837842e-06, + "loss": 0.3494, + "step": 35540 + }, + { + "epoch": 0.7890034516820013, + "grad_norm": 1.4827840328216553, + "learning_rate": 2.1175078567446815e-06, + "loss": 0.2958, + "step": 35545 + }, + { + "epoch": 0.7891144382415289, + "grad_norm": 1.3042678833007812, + "learning_rate": 2.1153627218325346e-06, + "loss": 0.3881, + "step": 35550 + }, + { + "epoch": 0.7892254248010566, + "grad_norm": 1.068055272102356, + "learning_rate": 2.1132185455081446e-06, + "loss": 0.5395, + "step": 35555 + }, + { + "epoch": 0.7893364113605842, + "grad_norm": 1.0008199214935303, + "learning_rate": 2.1110753280321973e-06, + "loss": 0.3095, + "step": 35560 + }, + { + "epoch": 0.7894473979201119, + "grad_norm": 1.100245714187622, + "learning_rate": 2.1089330696652498e-06, + "loss": 0.3576, + "step": 35565 + }, + { + "epoch": 0.7895583844796396, + "grad_norm": 2.6166369915008545, + "learning_rate": 2.10679177066776e-06, + "loss": 0.3863, + "step": 35570 + }, + { + "epoch": 0.7896693710391671, + "grad_norm": 1.4175260066986084, + "learning_rate": 2.104651431300051e-06, + "loss": 0.2881, + "step": 35575 + }, + { + "epoch": 0.7897803575986948, + "grad_norm": 1.2780652046203613, + "learning_rate": 2.102512051822344e-06, + "loss": 0.4031, + "step": 35580 + }, + { + "epoch": 0.7898913441582225, + "grad_norm": 1.2362151145935059, + "learning_rate": 2.1003736324947345e-06, + "loss": 0.4989, + "step": 35585 + }, + { + "epoch": 0.7900023307177501, + "grad_norm": 1.7409553527832031, + "learning_rate": 2.098236173577205e-06, + "loss": 0.377, + "step": 35590 + }, + { + "epoch": 0.7901133172772777, + "grad_norm": 0.9475632905960083, + "learning_rate": 2.0960996753296236e-06, + "loss": 0.4269, + "step": 35595 + }, + { + "epoch": 0.7902243038368053, + "grad_norm": 0.9686654210090637, + "learning_rate": 2.0939641380117326e-06, + "loss": 0.5675, + "step": 35600 + }, + { + "epoch": 0.790335290396333, + "grad_norm": 1.8504366874694824, + "learning_rate": 2.0918295618831708e-06, + "loss": 0.4839, + "step": 35605 + }, + { + "epoch": 0.7904462769558607, + "grad_norm": 1.4183422327041626, + "learning_rate": 2.089695947203445e-06, + "loss": 0.2792, + "step": 35610 + }, + { + "epoch": 0.7905572635153882, + "grad_norm": 1.3813730478286743, + "learning_rate": 2.087563294231958e-06, + "loss": 0.2715, + "step": 35615 + }, + { + "epoch": 0.7906682500749159, + "grad_norm": 0.8090710639953613, + "learning_rate": 2.085431603227992e-06, + "loss": 0.3649, + "step": 35620 + }, + { + "epoch": 0.7907792366344436, + "grad_norm": 1.393152117729187, + "learning_rate": 2.0833008744507054e-06, + "loss": 0.4255, + "step": 35625 + }, + { + "epoch": 0.7908902231939712, + "grad_norm": 1.1389708518981934, + "learning_rate": 2.08117110815915e-06, + "loss": 0.354, + "step": 35630 + }, + { + "epoch": 0.7910012097534989, + "grad_norm": 1.4676787853240967, + "learning_rate": 2.079042304612252e-06, + "loss": 0.4842, + "step": 35635 + }, + { + "epoch": 0.7911121963130265, + "grad_norm": 1.428437352180481, + "learning_rate": 2.0769144640688256e-06, + "loss": 0.2615, + "step": 35640 + }, + { + "epoch": 0.7912231828725541, + "grad_norm": 1.4401766061782837, + "learning_rate": 2.074787586787569e-06, + "loss": 0.3501, + "step": 35645 + }, + { + "epoch": 0.7913341694320818, + "grad_norm": 1.6893134117126465, + "learning_rate": 2.0726616730270554e-06, + "loss": 0.3072, + "step": 35650 + }, + { + "epoch": 0.7914451559916094, + "grad_norm": 1.5719894170761108, + "learning_rate": 2.070536723045752e-06, + "loss": 0.5454, + "step": 35655 + }, + { + "epoch": 0.791556142551137, + "grad_norm": 0.9744945168495178, + "learning_rate": 2.068412737101998e-06, + "loss": 0.3632, + "step": 35660 + }, + { + "epoch": 0.7916671291106647, + "grad_norm": 1.5975704193115234, + "learning_rate": 2.0662897154540263e-06, + "loss": 0.3699, + "step": 35665 + }, + { + "epoch": 0.7917781156701923, + "grad_norm": 0.754593551158905, + "learning_rate": 2.06416765835994e-06, + "loss": 0.2998, + "step": 35670 + }, + { + "epoch": 0.79188910222972, + "grad_norm": 0.7576720118522644, + "learning_rate": 2.0620465660777357e-06, + "loss": 0.3634, + "step": 35675 + }, + { + "epoch": 0.7920000887892477, + "grad_norm": 0.9368253946304321, + "learning_rate": 2.0599264388652907e-06, + "loss": 0.4255, + "step": 35680 + }, + { + "epoch": 0.7921110753487752, + "grad_norm": 0.8757548332214355, + "learning_rate": 2.057807276980357e-06, + "loss": 0.2294, + "step": 35685 + }, + { + "epoch": 0.7922220619083029, + "grad_norm": 1.1793277263641357, + "learning_rate": 2.055689080680582e-06, + "loss": 0.3232, + "step": 35690 + }, + { + "epoch": 0.7923330484678306, + "grad_norm": 1.2925007343292236, + "learning_rate": 2.0535718502234823e-06, + "loss": 0.4537, + "step": 35695 + }, + { + "epoch": 0.7924440350273582, + "grad_norm": 1.0207651853561401, + "learning_rate": 2.0514555858664663e-06, + "loss": 0.4051, + "step": 35700 + }, + { + "epoch": 0.7925550215868858, + "grad_norm": 1.2424222230911255, + "learning_rate": 2.0493402878668266e-06, + "loss": 0.3735, + "step": 35705 + }, + { + "epoch": 0.7926660081464134, + "grad_norm": 1.308300256729126, + "learning_rate": 2.0472259564817265e-06, + "loss": 0.5448, + "step": 35710 + }, + { + "epoch": 0.7927769947059411, + "grad_norm": 1.087999701499939, + "learning_rate": 2.045112591968227e-06, + "loss": 0.3331, + "step": 35715 + }, + { + "epoch": 0.7928879812654688, + "grad_norm": 1.1336021423339844, + "learning_rate": 2.0430001945832557e-06, + "loss": 0.4715, + "step": 35720 + }, + { + "epoch": 0.7929989678249963, + "grad_norm": 1.1399098634719849, + "learning_rate": 2.0408887645836363e-06, + "loss": 0.2863, + "step": 35725 + }, + { + "epoch": 0.793109954384524, + "grad_norm": 2.4566142559051514, + "learning_rate": 2.03877830222607e-06, + "loss": 0.5767, + "step": 35730 + }, + { + "epoch": 0.7932209409440517, + "grad_norm": 1.3776051998138428, + "learning_rate": 2.036668807767136e-06, + "loss": 0.4465, + "step": 35735 + }, + { + "epoch": 0.7933319275035793, + "grad_norm": 0.8232426047325134, + "learning_rate": 2.0345602814633035e-06, + "loss": 0.2845, + "step": 35740 + }, + { + "epoch": 0.793442914063107, + "grad_norm": 0.9974232316017151, + "learning_rate": 2.0324527235709148e-06, + "loss": 0.3041, + "step": 35745 + }, + { + "epoch": 0.7935539006226346, + "grad_norm": 1.5925170183181763, + "learning_rate": 2.0303461343462062e-06, + "loss": 0.5265, + "step": 35750 + }, + { + "epoch": 0.7936648871821622, + "grad_norm": 1.1427022218704224, + "learning_rate": 2.028240514045284e-06, + "loss": 0.2463, + "step": 35755 + }, + { + "epoch": 0.7937758737416899, + "grad_norm": 0.8311687111854553, + "learning_rate": 2.0261358629241466e-06, + "loss": 0.5457, + "step": 35760 + }, + { + "epoch": 0.7938868603012175, + "grad_norm": 1.539837121963501, + "learning_rate": 2.024032181238668e-06, + "loss": 0.3874, + "step": 35765 + }, + { + "epoch": 0.7939978468607451, + "grad_norm": 1.208118200302124, + "learning_rate": 2.021929469244608e-06, + "loss": 0.3899, + "step": 35770 + }, + { + "epoch": 0.7941088334202728, + "grad_norm": 0.8833453059196472, + "learning_rate": 2.019827727197605e-06, + "loss": 0.4849, + "step": 35775 + }, + { + "epoch": 0.7942198199798004, + "grad_norm": 1.987878680229187, + "learning_rate": 2.0177269553531863e-06, + "loss": 0.2693, + "step": 35780 + }, + { + "epoch": 0.7943308065393281, + "grad_norm": 1.4484559297561646, + "learning_rate": 2.0156271539667517e-06, + "loss": 0.3459, + "step": 35785 + }, + { + "epoch": 0.7944417930988558, + "grad_norm": 1.3147300481796265, + "learning_rate": 2.013528323293592e-06, + "loss": 0.2581, + "step": 35790 + }, + { + "epoch": 0.7945527796583833, + "grad_norm": 1.0331820249557495, + "learning_rate": 2.0114304635888717e-06, + "loss": 0.2734, + "step": 35795 + }, + { + "epoch": 0.794663766217911, + "grad_norm": 1.2817115783691406, + "learning_rate": 2.009333575107647e-06, + "loss": 0.3658, + "step": 35800 + }, + { + "epoch": 0.7947747527774387, + "grad_norm": 1.2282030582427979, + "learning_rate": 2.0072376581048445e-06, + "loss": 0.3646, + "step": 35805 + }, + { + "epoch": 0.7948857393369663, + "grad_norm": 1.2596557140350342, + "learning_rate": 2.005142712835283e-06, + "loss": 0.4325, + "step": 35810 + }, + { + "epoch": 0.794996725896494, + "grad_norm": 1.4031026363372803, + "learning_rate": 2.0030487395536593e-06, + "loss": 0.4099, + "step": 35815 + }, + { + "epoch": 0.7951077124560215, + "grad_norm": 0.8215189576148987, + "learning_rate": 2.0009557385145485e-06, + "loss": 0.3786, + "step": 35820 + }, + { + "epoch": 0.7952186990155492, + "grad_norm": 0.9108057022094727, + "learning_rate": 1.998863709972414e-06, + "loss": 0.3869, + "step": 35825 + }, + { + "epoch": 0.7953296855750769, + "grad_norm": 0.8067658543586731, + "learning_rate": 1.9967726541815935e-06, + "loss": 0.5273, + "step": 35830 + }, + { + "epoch": 0.7954406721346045, + "grad_norm": 0.9091742038726807, + "learning_rate": 1.994682571396316e-06, + "loss": 0.4029, + "step": 35835 + }, + { + "epoch": 0.7955516586941321, + "grad_norm": 1.6972312927246094, + "learning_rate": 1.99259346187068e-06, + "loss": 0.4327, + "step": 35840 + }, + { + "epoch": 0.7956626452536598, + "grad_norm": 0.9916061162948608, + "learning_rate": 1.990505325858677e-06, + "loss": 0.5104, + "step": 35845 + }, + { + "epoch": 0.7957736318131874, + "grad_norm": 0.9939529299736023, + "learning_rate": 1.9884181636141775e-06, + "loss": 0.3834, + "step": 35850 + }, + { + "epoch": 0.7958846183727151, + "grad_norm": 1.5403352975845337, + "learning_rate": 1.986331975390926e-06, + "loss": 0.3761, + "step": 35855 + }, + { + "epoch": 0.7959956049322428, + "grad_norm": 1.4493037462234497, + "learning_rate": 1.98424676144256e-06, + "loss": 0.4491, + "step": 35860 + }, + { + "epoch": 0.7961065914917703, + "grad_norm": 1.16340172290802, + "learning_rate": 1.982162522022587e-06, + "loss": 0.3466, + "step": 35865 + }, + { + "epoch": 0.796217578051298, + "grad_norm": 1.4656342267990112, + "learning_rate": 1.980079257384405e-06, + "loss": 0.4166, + "step": 35870 + }, + { + "epoch": 0.7963285646108257, + "grad_norm": 1.0838220119476318, + "learning_rate": 1.9779969677812927e-06, + "loss": 0.58, + "step": 35875 + }, + { + "epoch": 0.7964395511703533, + "grad_norm": 1.0586246252059937, + "learning_rate": 1.975915653466404e-06, + "loss": 0.302, + "step": 35880 + }, + { + "epoch": 0.7965505377298809, + "grad_norm": 1.5136168003082275, + "learning_rate": 1.9738353146927802e-06, + "loss": 0.3081, + "step": 35885 + }, + { + "epoch": 0.7966615242894085, + "grad_norm": 0.7670632004737854, + "learning_rate": 1.97175595171334e-06, + "loss": 0.3214, + "step": 35890 + }, + { + "epoch": 0.7967725108489362, + "grad_norm": 0.9023318886756897, + "learning_rate": 1.969677564780885e-06, + "loss": 0.3999, + "step": 35895 + }, + { + "epoch": 0.7968834974084639, + "grad_norm": 1.1757646799087524, + "learning_rate": 1.9676001541481037e-06, + "loss": 0.4676, + "step": 35900 + }, + { + "epoch": 0.7969944839679914, + "grad_norm": 1.283319115638733, + "learning_rate": 1.965523720067555e-06, + "loss": 0.3683, + "step": 35905 + }, + { + "epoch": 0.7971054705275191, + "grad_norm": 1.4499841928482056, + "learning_rate": 1.9634482627916883e-06, + "loss": 0.3305, + "step": 35910 + }, + { + "epoch": 0.7972164570870468, + "grad_norm": 0.9750193357467651, + "learning_rate": 1.9613737825728276e-06, + "loss": 0.4215, + "step": 35915 + }, + { + "epoch": 0.7973274436465744, + "grad_norm": 1.2049821615219116, + "learning_rate": 1.9593002796631856e-06, + "loss": 0.4339, + "step": 35920 + }, + { + "epoch": 0.7974384302061021, + "grad_norm": 0.8911698460578918, + "learning_rate": 1.9572277543148453e-06, + "loss": 0.4305, + "step": 35925 + }, + { + "epoch": 0.7975494167656297, + "grad_norm": 1.4148341417312622, + "learning_rate": 1.9551562067797824e-06, + "loss": 0.3123, + "step": 35930 + }, + { + "epoch": 0.7976604033251573, + "grad_norm": 2.4391891956329346, + "learning_rate": 1.9530856373098496e-06, + "loss": 0.3437, + "step": 35935 + }, + { + "epoch": 0.797771389884685, + "grad_norm": 2.3069112300872803, + "learning_rate": 1.951016046156776e-06, + "loss": 0.3807, + "step": 35940 + }, + { + "epoch": 0.7978823764442126, + "grad_norm": 2.131568193435669, + "learning_rate": 1.9489474335721793e-06, + "loss": 0.3841, + "step": 35945 + }, + { + "epoch": 0.7979933630037402, + "grad_norm": 0.7713155150413513, + "learning_rate": 1.9468797998075494e-06, + "loss": 0.325, + "step": 35950 + }, + { + "epoch": 0.7981043495632679, + "grad_norm": 0.8977245092391968, + "learning_rate": 1.944813145114266e-06, + "loss": 0.3944, + "step": 35955 + }, + { + "epoch": 0.7982153361227955, + "grad_norm": 1.6785709857940674, + "learning_rate": 1.942747469743589e-06, + "loss": 0.4234, + "step": 35960 + }, + { + "epoch": 0.7983263226823232, + "grad_norm": 1.0968211889266968, + "learning_rate": 1.9406827739466482e-06, + "loss": 0.4327, + "step": 35965 + }, + { + "epoch": 0.7984373092418509, + "grad_norm": 1.513850450515747, + "learning_rate": 1.9386190579744703e-06, + "loss": 0.3457, + "step": 35970 + }, + { + "epoch": 0.7985482958013784, + "grad_norm": 1.183782935142517, + "learning_rate": 1.9365563220779494e-06, + "loss": 0.2454, + "step": 35975 + }, + { + "epoch": 0.7986592823609061, + "grad_norm": 0.9433440566062927, + "learning_rate": 1.9344945665078672e-06, + "loss": 0.4454, + "step": 35980 + }, + { + "epoch": 0.7987702689204338, + "grad_norm": 0.9357894062995911, + "learning_rate": 1.9324337915148895e-06, + "loss": 0.3218, + "step": 35985 + }, + { + "epoch": 0.7988812554799614, + "grad_norm": 1.7372990846633911, + "learning_rate": 1.930373997349553e-06, + "loss": 0.3769, + "step": 35990 + }, + { + "epoch": 0.798992242039489, + "grad_norm": 1.4230307340621948, + "learning_rate": 1.928315184262284e-06, + "loss": 0.3372, + "step": 35995 + }, + { + "epoch": 0.7991032285990166, + "grad_norm": 1.1874383687973022, + "learning_rate": 1.926257352503381e-06, + "loss": 0.4081, + "step": 36000 + }, + { + "epoch": 0.7992142151585443, + "grad_norm": 0.7755220532417297, + "learning_rate": 1.924200502323036e-06, + "loss": 0.3184, + "step": 36005 + }, + { + "epoch": 0.799325201718072, + "grad_norm": 1.1535636186599731, + "learning_rate": 1.922144633971307e-06, + "loss": 0.4537, + "step": 36010 + }, + { + "epoch": 0.7994361882775995, + "grad_norm": 0.8753936886787415, + "learning_rate": 1.920089747698144e-06, + "loss": 0.2761, + "step": 36015 + }, + { + "epoch": 0.7995471748371272, + "grad_norm": 0.8846555948257446, + "learning_rate": 1.9180358437533695e-06, + "loss": 0.3525, + "step": 36020 + }, + { + "epoch": 0.7996581613966549, + "grad_norm": 1.0711405277252197, + "learning_rate": 1.9159829223866956e-06, + "loss": 0.5587, + "step": 36025 + }, + { + "epoch": 0.7997691479561825, + "grad_norm": 1.5819116830825806, + "learning_rate": 1.913930983847703e-06, + "loss": 0.2664, + "step": 36030 + }, + { + "epoch": 0.7998801345157102, + "grad_norm": 1.0081666707992554, + "learning_rate": 1.911880028385866e-06, + "loss": 0.4461, + "step": 36035 + }, + { + "epoch": 0.7999911210752378, + "grad_norm": 1.3341885805130005, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.4054, + "step": 36040 + }, + { + "epoch": 0.8001021076347654, + "grad_norm": 1.4828771352767944, + "learning_rate": 1.907781067690919e-06, + "loss": 0.3104, + "step": 36045 + }, + { + "epoch": 0.8002130941942931, + "grad_norm": 1.285024881362915, + "learning_rate": 1.9057330629561476e-06, + "loss": 0.4829, + "step": 36050 + }, + { + "epoch": 0.8003240807538207, + "grad_norm": 1.3645647764205933, + "learning_rate": 1.9036860422952076e-06, + "loss": 0.435, + "step": 36055 + }, + { + "epoch": 0.8004350673133483, + "grad_norm": 0.6595139503479004, + "learning_rate": 1.9016400059569629e-06, + "loss": 0.2595, + "step": 36060 + }, + { + "epoch": 0.800546053872876, + "grad_norm": 1.4788298606872559, + "learning_rate": 1.899594954190166e-06, + "loss": 0.3523, + "step": 36065 + }, + { + "epoch": 0.8006570404324036, + "grad_norm": 2.774763822555542, + "learning_rate": 1.89755088724345e-06, + "loss": 0.5137, + "step": 36070 + }, + { + "epoch": 0.8007680269919313, + "grad_norm": 1.1998732089996338, + "learning_rate": 1.895507805365322e-06, + "loss": 0.3658, + "step": 36075 + }, + { + "epoch": 0.800879013551459, + "grad_norm": 1.142486333847046, + "learning_rate": 1.8934657088041763e-06, + "loss": 0.4606, + "step": 36080 + }, + { + "epoch": 0.8009900001109865, + "grad_norm": 1.127084732055664, + "learning_rate": 1.8914245978082812e-06, + "loss": 0.456, + "step": 36085 + }, + { + "epoch": 0.8011009866705142, + "grad_norm": 1.3502229452133179, + "learning_rate": 1.8893844726257914e-06, + "loss": 0.4376, + "step": 36090 + }, + { + "epoch": 0.8012119732300419, + "grad_norm": 1.8028035163879395, + "learning_rate": 1.8873453335047342e-06, + "loss": 0.4925, + "step": 36095 + }, + { + "epoch": 0.8013229597895695, + "grad_norm": 1.4846652746200562, + "learning_rate": 1.8853071806930235e-06, + "loss": 0.4125, + "step": 36100 + }, + { + "epoch": 0.8014339463490971, + "grad_norm": 0.7727453708648682, + "learning_rate": 1.883270014438453e-06, + "loss": 0.4342, + "step": 36105 + }, + { + "epoch": 0.8015449329086247, + "grad_norm": 1.8496626615524292, + "learning_rate": 1.8812338349886905e-06, + "loss": 0.375, + "step": 36110 + }, + { + "epoch": 0.8016559194681524, + "grad_norm": 0.7790250778198242, + "learning_rate": 1.8791986425912935e-06, + "loss": 0.3095, + "step": 36115 + }, + { + "epoch": 0.8017669060276801, + "grad_norm": 1.1624494791030884, + "learning_rate": 1.877164437493687e-06, + "loss": 0.3837, + "step": 36120 + }, + { + "epoch": 0.8018778925872077, + "grad_norm": 1.23572838306427, + "learning_rate": 1.875131219943187e-06, + "loss": 0.5883, + "step": 36125 + }, + { + "epoch": 0.8019888791467353, + "grad_norm": 1.4058516025543213, + "learning_rate": 1.8730989901869868e-06, + "loss": 0.2798, + "step": 36130 + }, + { + "epoch": 0.802099865706263, + "grad_norm": 1.2532075643539429, + "learning_rate": 1.871067748472154e-06, + "loss": 0.4864, + "step": 36135 + }, + { + "epoch": 0.8022108522657906, + "grad_norm": 0.9430113434791565, + "learning_rate": 1.8690374950456436e-06, + "loss": 0.2935, + "step": 36140 + }, + { + "epoch": 0.8023218388253183, + "grad_norm": 1.6031843423843384, + "learning_rate": 1.8670082301542835e-06, + "loss": 0.3808, + "step": 36145 + }, + { + "epoch": 0.802432825384846, + "grad_norm": 1.3810571432113647, + "learning_rate": 1.8649799540447865e-06, + "loss": 0.3689, + "step": 36150 + }, + { + "epoch": 0.8025438119443735, + "grad_norm": 1.1026618480682373, + "learning_rate": 1.8629526669637465e-06, + "loss": 0.3452, + "step": 36155 + }, + { + "epoch": 0.8026547985039012, + "grad_norm": 1.6488431692123413, + "learning_rate": 1.8609263691576285e-06, + "loss": 0.4822, + "step": 36160 + }, + { + "epoch": 0.8027657850634288, + "grad_norm": 1.0668485164642334, + "learning_rate": 1.8589010608727897e-06, + "loss": 0.494, + "step": 36165 + }, + { + "epoch": 0.8028767716229565, + "grad_norm": 0.8742284178733826, + "learning_rate": 1.8568767423554545e-06, + "loss": 0.2853, + "step": 36170 + }, + { + "epoch": 0.8029877581824841, + "grad_norm": 1.244224190711975, + "learning_rate": 1.854853413851737e-06, + "loss": 0.3572, + "step": 36175 + }, + { + "epoch": 0.8030987447420117, + "grad_norm": 1.4055209159851074, + "learning_rate": 1.8528310756076217e-06, + "loss": 0.3008, + "step": 36180 + }, + { + "epoch": 0.8032097313015394, + "grad_norm": 1.29067862033844, + "learning_rate": 1.8508097278689819e-06, + "loss": 0.35, + "step": 36185 + }, + { + "epoch": 0.8033207178610671, + "grad_norm": 1.2676900625228882, + "learning_rate": 1.8487893708815675e-06, + "loss": 0.3111, + "step": 36190 + }, + { + "epoch": 0.8034317044205946, + "grad_norm": 1.301653504371643, + "learning_rate": 1.8467700048910008e-06, + "loss": 0.386, + "step": 36195 + }, + { + "epoch": 0.8035426909801223, + "grad_norm": 1.308471918106079, + "learning_rate": 1.844751630142797e-06, + "loss": 0.4339, + "step": 36200 + }, + { + "epoch": 0.80365367753965, + "grad_norm": 1.5464400053024292, + "learning_rate": 1.842734246882336e-06, + "loss": 0.4505, + "step": 36205 + }, + { + "epoch": 0.8037646640991776, + "grad_norm": 0.9431395530700684, + "learning_rate": 1.8407178553548876e-06, + "loss": 0.3395, + "step": 36210 + }, + { + "epoch": 0.8038756506587053, + "grad_norm": 1.0454790592193604, + "learning_rate": 1.8387024558056022e-06, + "loss": 0.3386, + "step": 36215 + }, + { + "epoch": 0.8039866372182328, + "grad_norm": 0.7487240433692932, + "learning_rate": 1.8366880484794969e-06, + "loss": 0.231, + "step": 36220 + }, + { + "epoch": 0.8040976237777605, + "grad_norm": 1.3686524629592896, + "learning_rate": 1.834674633621485e-06, + "loss": 0.4947, + "step": 36225 + }, + { + "epoch": 0.8042086103372882, + "grad_norm": 1.4362846612930298, + "learning_rate": 1.8326622114763437e-06, + "loss": 0.3767, + "step": 36230 + }, + { + "epoch": 0.8043195968968158, + "grad_norm": 0.6831071972846985, + "learning_rate": 1.8306507822887398e-06, + "loss": 0.3878, + "step": 36235 + }, + { + "epoch": 0.8044305834563434, + "grad_norm": 0.9789371490478516, + "learning_rate": 1.8286403463032187e-06, + "loss": 0.3105, + "step": 36240 + }, + { + "epoch": 0.8045415700158711, + "grad_norm": 1.684880256652832, + "learning_rate": 1.826630903764196e-06, + "loss": 0.3869, + "step": 36245 + }, + { + "epoch": 0.8046525565753987, + "grad_norm": 0.9941912293434143, + "learning_rate": 1.8246224549159808e-06, + "loss": 0.3643, + "step": 36250 + }, + { + "epoch": 0.8047635431349264, + "grad_norm": 1.7709214687347412, + "learning_rate": 1.8226150000027464e-06, + "loss": 0.4625, + "step": 36255 + }, + { + "epoch": 0.804874529694454, + "grad_norm": 0.734659731388092, + "learning_rate": 1.8206085392685568e-06, + "loss": 0.3929, + "step": 36260 + }, + { + "epoch": 0.8049855162539816, + "grad_norm": 1.0187081098556519, + "learning_rate": 1.818603072957349e-06, + "loss": 0.3132, + "step": 36265 + }, + { + "epoch": 0.8050965028135093, + "grad_norm": 1.8822877407073975, + "learning_rate": 1.8165986013129423e-06, + "loss": 0.2947, + "step": 36270 + }, + { + "epoch": 0.8052074893730369, + "grad_norm": 0.9087295532226562, + "learning_rate": 1.8145951245790305e-06, + "loss": 0.3782, + "step": 36275 + }, + { + "epoch": 0.8053184759325646, + "grad_norm": 1.003465175628662, + "learning_rate": 1.8125926429991946e-06, + "loss": 0.3677, + "step": 36280 + }, + { + "epoch": 0.8054294624920922, + "grad_norm": 1.4164167642593384, + "learning_rate": 1.8105911568168832e-06, + "loss": 0.4365, + "step": 36285 + }, + { + "epoch": 0.8055404490516198, + "grad_norm": 1.4984828233718872, + "learning_rate": 1.808590666275437e-06, + "loss": 0.4024, + "step": 36290 + }, + { + "epoch": 0.8056514356111475, + "grad_norm": 0.8606235980987549, + "learning_rate": 1.8065911716180639e-06, + "loss": 0.5163, + "step": 36295 + }, + { + "epoch": 0.8057624221706752, + "grad_norm": 1.0989290475845337, + "learning_rate": 1.8045926730878594e-06, + "loss": 0.5101, + "step": 36300 + }, + { + "epoch": 0.8058734087302027, + "grad_norm": 1.3459844589233398, + "learning_rate": 1.80259517092779e-06, + "loss": 0.4239, + "step": 36305 + }, + { + "epoch": 0.8059843952897304, + "grad_norm": 1.6788448095321655, + "learning_rate": 1.8005986653807107e-06, + "loss": 0.189, + "step": 36310 + }, + { + "epoch": 0.8060953818492581, + "grad_norm": 1.3815594911575317, + "learning_rate": 1.7986031566893436e-06, + "loss": 0.385, + "step": 36315 + }, + { + "epoch": 0.8062063684087857, + "grad_norm": 1.7086074352264404, + "learning_rate": 1.7966086450962994e-06, + "loss": 0.3657, + "step": 36320 + }, + { + "epoch": 0.8063173549683134, + "grad_norm": 1.1373320817947388, + "learning_rate": 1.7946151308440675e-06, + "loss": 0.4686, + "step": 36325 + }, + { + "epoch": 0.8064283415278409, + "grad_norm": 1.21728515625, + "learning_rate": 1.7926226141750058e-06, + "loss": 0.237, + "step": 36330 + }, + { + "epoch": 0.8065393280873686, + "grad_norm": 1.0383799076080322, + "learning_rate": 1.7906310953313645e-06, + "loss": 0.416, + "step": 36335 + }, + { + "epoch": 0.8066503146468963, + "grad_norm": 0.803288459777832, + "learning_rate": 1.7886405745552605e-06, + "loss": 0.3733, + "step": 36340 + }, + { + "epoch": 0.8067613012064239, + "grad_norm": 1.2933439016342163, + "learning_rate": 1.7866510520886982e-06, + "loss": 0.3213, + "step": 36345 + }, + { + "epoch": 0.8068722877659515, + "grad_norm": 1.3617157936096191, + "learning_rate": 1.784662528173553e-06, + "loss": 0.3961, + "step": 36350 + }, + { + "epoch": 0.8069832743254792, + "grad_norm": 1.522436261177063, + "learning_rate": 1.782675003051586e-06, + "loss": 0.3179, + "step": 36355 + }, + { + "epoch": 0.8070942608850068, + "grad_norm": 0.7228170037269592, + "learning_rate": 1.7806884769644362e-06, + "loss": 0.3856, + "step": 36360 + }, + { + "epoch": 0.8072052474445345, + "grad_norm": 2.0226802825927734, + "learning_rate": 1.7787029501536123e-06, + "loss": 0.3473, + "step": 36365 + }, + { + "epoch": 0.8073162340040622, + "grad_norm": 1.0841708183288574, + "learning_rate": 1.776718422860515e-06, + "loss": 0.4584, + "step": 36370 + }, + { + "epoch": 0.8074272205635897, + "grad_norm": 0.9569538831710815, + "learning_rate": 1.774734895326411e-06, + "loss": 0.3909, + "step": 36375 + }, + { + "epoch": 0.8075382071231174, + "grad_norm": 1.6677311658859253, + "learning_rate": 1.772752367792452e-06, + "loss": 0.4026, + "step": 36380 + }, + { + "epoch": 0.807649193682645, + "grad_norm": 1.107144832611084, + "learning_rate": 1.7707708404996715e-06, + "loss": 0.3025, + "step": 36385 + }, + { + "epoch": 0.8077601802421727, + "grad_norm": 0.5250940322875977, + "learning_rate": 1.7687903136889705e-06, + "loss": 0.3486, + "step": 36390 + }, + { + "epoch": 0.8078711668017003, + "grad_norm": 1.1956672668457031, + "learning_rate": 1.7668107876011397e-06, + "loss": 0.2646, + "step": 36395 + }, + { + "epoch": 0.8079821533612279, + "grad_norm": 1.6405327320098877, + "learning_rate": 1.76483226247684e-06, + "loss": 0.3972, + "step": 36400 + }, + { + "epoch": 0.8080931399207556, + "grad_norm": 2.861914873123169, + "learning_rate": 1.7628547385566152e-06, + "loss": 0.3234, + "step": 36405 + }, + { + "epoch": 0.8082041264802833, + "grad_norm": 0.755532443523407, + "learning_rate": 1.760878216080888e-06, + "loss": 0.4652, + "step": 36410 + }, + { + "epoch": 0.8083151130398109, + "grad_norm": 1.5427390336990356, + "learning_rate": 1.7589026952899524e-06, + "loss": 0.2767, + "step": 36415 + }, + { + "epoch": 0.8084260995993385, + "grad_norm": 1.0020582675933838, + "learning_rate": 1.756928176423992e-06, + "loss": 0.3635, + "step": 36420 + }, + { + "epoch": 0.8085370861588662, + "grad_norm": 1.156604528427124, + "learning_rate": 1.754954659723056e-06, + "loss": 0.4422, + "step": 36425 + }, + { + "epoch": 0.8086480727183938, + "grad_norm": 1.1243420839309692, + "learning_rate": 1.7529821454270835e-06, + "loss": 0.4141, + "step": 36430 + }, + { + "epoch": 0.8087590592779215, + "grad_norm": 1.0397528409957886, + "learning_rate": 1.7510106337758802e-06, + "loss": 0.4895, + "step": 36435 + }, + { + "epoch": 0.808870045837449, + "grad_norm": 1.0990221500396729, + "learning_rate": 1.7490401250091404e-06, + "loss": 0.4166, + "step": 36440 + }, + { + "epoch": 0.8089810323969767, + "grad_norm": 1.041306734085083, + "learning_rate": 1.7470706193664322e-06, + "loss": 0.4988, + "step": 36445 + }, + { + "epoch": 0.8090920189565044, + "grad_norm": 1.0351485013961792, + "learning_rate": 1.7451021170871974e-06, + "loss": 0.3052, + "step": 36450 + }, + { + "epoch": 0.809203005516032, + "grad_norm": 0.9568383097648621, + "learning_rate": 1.7431346184107645e-06, + "loss": 0.5788, + "step": 36455 + }, + { + "epoch": 0.8093139920755597, + "grad_norm": 0.883678138256073, + "learning_rate": 1.7411681235763323e-06, + "loss": 0.3323, + "step": 36460 + }, + { + "epoch": 0.8094249786350873, + "grad_norm": 0.6918429732322693, + "learning_rate": 1.7392026328229804e-06, + "loss": 0.2976, + "step": 36465 + }, + { + "epoch": 0.8095359651946149, + "grad_norm": 1.1069185733795166, + "learning_rate": 1.7372381463896703e-06, + "loss": 0.4608, + "step": 36470 + }, + { + "epoch": 0.8096469517541426, + "grad_norm": 1.3932291269302368, + "learning_rate": 1.735274664515233e-06, + "loss": 0.4438, + "step": 36475 + }, + { + "epoch": 0.8097579383136703, + "grad_norm": 0.6072965264320374, + "learning_rate": 1.7333121874383875e-06, + "loss": 0.377, + "step": 36480 + }, + { + "epoch": 0.8098689248731978, + "grad_norm": 1.8820232152938843, + "learning_rate": 1.7313507153977183e-06, + "loss": 0.4978, + "step": 36485 + }, + { + "epoch": 0.8099799114327255, + "grad_norm": 1.3516621589660645, + "learning_rate": 1.7293902486317016e-06, + "loss": 0.454, + "step": 36490 + }, + { + "epoch": 0.8100908979922531, + "grad_norm": 0.7736095786094666, + "learning_rate": 1.7274307873786777e-06, + "loss": 0.4383, + "step": 36495 + }, + { + "epoch": 0.8102018845517808, + "grad_norm": 1.2522380352020264, + "learning_rate": 1.7254723318768785e-06, + "loss": 0.3762, + "step": 36500 + }, + { + "epoch": 0.8103128711113085, + "grad_norm": 1.4323714971542358, + "learning_rate": 1.7235148823643987e-06, + "loss": 0.377, + "step": 36505 + }, + { + "epoch": 0.810423857670836, + "grad_norm": 0.9815630316734314, + "learning_rate": 1.721558439079225e-06, + "loss": 0.2898, + "step": 36510 + }, + { + "epoch": 0.8105348442303637, + "grad_norm": 1.7804838418960571, + "learning_rate": 1.7196030022592102e-06, + "loss": 0.3777, + "step": 36515 + }, + { + "epoch": 0.8106458307898914, + "grad_norm": 1.0361943244934082, + "learning_rate": 1.7176485721420943e-06, + "loss": 0.4349, + "step": 36520 + }, + { + "epoch": 0.810756817349419, + "grad_norm": 1.1842056512832642, + "learning_rate": 1.7156951489654872e-06, + "loss": 0.3378, + "step": 36525 + }, + { + "epoch": 0.8108678039089466, + "grad_norm": 1.0792927742004395, + "learning_rate": 1.713742732966881e-06, + "loss": 0.3917, + "step": 36530 + }, + { + "epoch": 0.8109787904684743, + "grad_norm": 1.4058988094329834, + "learning_rate": 1.7117913243836415e-06, + "loss": 0.3641, + "step": 36535 + }, + { + "epoch": 0.8110897770280019, + "grad_norm": 1.1576017141342163, + "learning_rate": 1.70984092345302e-06, + "loss": 0.4311, + "step": 36540 + }, + { + "epoch": 0.8112007635875296, + "grad_norm": 0.43751659989356995, + "learning_rate": 1.7078915304121323e-06, + "loss": 0.3346, + "step": 36545 + }, + { + "epoch": 0.8113117501470571, + "grad_norm": 1.3185070753097534, + "learning_rate": 1.7059431454979825e-06, + "loss": 0.3505, + "step": 36550 + }, + { + "epoch": 0.8114227367065848, + "grad_norm": 2.0647809505462646, + "learning_rate": 1.7039957689474517e-06, + "loss": 0.4157, + "step": 36555 + }, + { + "epoch": 0.8115337232661125, + "grad_norm": 1.7556878328323364, + "learning_rate": 1.7020494009972909e-06, + "loss": 0.3379, + "step": 36560 + }, + { + "epoch": 0.8116447098256401, + "grad_norm": 1.018389105796814, + "learning_rate": 1.7001040418841364e-06, + "loss": 0.3126, + "step": 36565 + }, + { + "epoch": 0.8117556963851678, + "grad_norm": 1.5789307355880737, + "learning_rate": 1.6981596918444953e-06, + "loss": 0.4389, + "step": 36570 + }, + { + "epoch": 0.8118666829446954, + "grad_norm": 1.1881147623062134, + "learning_rate": 1.6962163511147557e-06, + "loss": 0.4891, + "step": 36575 + }, + { + "epoch": 0.811977669504223, + "grad_norm": 0.9965287446975708, + "learning_rate": 1.6942740199311858e-06, + "loss": 0.4181, + "step": 36580 + }, + { + "epoch": 0.8120886560637507, + "grad_norm": 1.7986716032028198, + "learning_rate": 1.6923326985299238e-06, + "loss": 0.3028, + "step": 36585 + }, + { + "epoch": 0.8121996426232784, + "grad_norm": 1.148728370666504, + "learning_rate": 1.6903923871469917e-06, + "loss": 0.1677, + "step": 36590 + }, + { + "epoch": 0.8123106291828059, + "grad_norm": 1.5958105325698853, + "learning_rate": 1.6884530860182835e-06, + "loss": 0.4435, + "step": 36595 + }, + { + "epoch": 0.8124216157423336, + "grad_norm": 0.8438432812690735, + "learning_rate": 1.6865147953795746e-06, + "loss": 0.4816, + "step": 36600 + }, + { + "epoch": 0.8125326023018612, + "grad_norm": 1.058948278427124, + "learning_rate": 1.6845775154665167e-06, + "loss": 0.415, + "step": 36605 + }, + { + "epoch": 0.8126435888613889, + "grad_norm": 1.3056766986846924, + "learning_rate": 1.6826412465146357e-06, + "loss": 0.3453, + "step": 36610 + }, + { + "epoch": 0.8127545754209166, + "grad_norm": 1.4271174669265747, + "learning_rate": 1.6807059887593392e-06, + "loss": 0.3211, + "step": 36615 + }, + { + "epoch": 0.8128655619804441, + "grad_norm": 0.9875726103782654, + "learning_rate": 1.6787717424359061e-06, + "loss": 0.4205, + "step": 36620 + }, + { + "epoch": 0.8129765485399718, + "grad_norm": 1.2538775205612183, + "learning_rate": 1.676838507779499e-06, + "loss": 0.6146, + "step": 36625 + }, + { + "epoch": 0.8130875350994995, + "grad_norm": 0.8289942145347595, + "learning_rate": 1.6749062850251508e-06, + "loss": 0.4664, + "step": 36630 + }, + { + "epoch": 0.8131985216590271, + "grad_norm": 0.9525663256645203, + "learning_rate": 1.6729750744077755e-06, + "loss": 0.4705, + "step": 36635 + }, + { + "epoch": 0.8133095082185547, + "grad_norm": 0.8990917801856995, + "learning_rate": 1.6710448761621667e-06, + "loss": 0.422, + "step": 36640 + }, + { + "epoch": 0.8134204947780824, + "grad_norm": 0.651301920413971, + "learning_rate": 1.6691156905229865e-06, + "loss": 0.4145, + "step": 36645 + }, + { + "epoch": 0.81353148133761, + "grad_norm": 0.46783456206321716, + "learning_rate": 1.6671875177247833e-06, + "loss": 0.302, + "step": 36650 + }, + { + "epoch": 0.8136424678971377, + "grad_norm": 1.1785321235656738, + "learning_rate": 1.6652603580019733e-06, + "loss": 0.3042, + "step": 36655 + }, + { + "epoch": 0.8137534544566652, + "grad_norm": 1.3166145086288452, + "learning_rate": 1.6633342115888562e-06, + "loss": 0.3713, + "step": 36660 + }, + { + "epoch": 0.8138644410161929, + "grad_norm": 1.2056806087493896, + "learning_rate": 1.6614090787196091e-06, + "loss": 0.5188, + "step": 36665 + }, + { + "epoch": 0.8139754275757206, + "grad_norm": 1.5675289630889893, + "learning_rate": 1.6594849596282781e-06, + "loss": 0.4461, + "step": 36670 + }, + { + "epoch": 0.8140864141352482, + "grad_norm": 0.9947167038917542, + "learning_rate": 1.657561854548797e-06, + "loss": 0.4885, + "step": 36675 + }, + { + "epoch": 0.8141974006947759, + "grad_norm": 1.071495771408081, + "learning_rate": 1.6556397637149646e-06, + "loss": 0.4027, + "step": 36680 + }, + { + "epoch": 0.8143083872543035, + "grad_norm": 1.2620582580566406, + "learning_rate": 1.6537186873604638e-06, + "loss": 0.3634, + "step": 36685 + }, + { + "epoch": 0.8144193738138311, + "grad_norm": 1.3768949508666992, + "learning_rate": 1.6517986257188578e-06, + "loss": 0.4012, + "step": 36690 + }, + { + "epoch": 0.8145303603733588, + "grad_norm": 1.1447209119796753, + "learning_rate": 1.6498795790235734e-06, + "loss": 0.3399, + "step": 36695 + }, + { + "epoch": 0.8146413469328865, + "grad_norm": 1.545142412185669, + "learning_rate": 1.6479615475079291e-06, + "loss": 0.4296, + "step": 36700 + }, + { + "epoch": 0.814752333492414, + "grad_norm": 1.6830742359161377, + "learning_rate": 1.6460445314051065e-06, + "loss": 0.2984, + "step": 36705 + }, + { + "epoch": 0.8148633200519417, + "grad_norm": 1.2970561981201172, + "learning_rate": 1.6441285309481746e-06, + "loss": 0.5463, + "step": 36710 + }, + { + "epoch": 0.8149743066114693, + "grad_norm": 0.8436324596405029, + "learning_rate": 1.6422135463700705e-06, + "loss": 0.2658, + "step": 36715 + }, + { + "epoch": 0.815085293170997, + "grad_norm": 1.074008584022522, + "learning_rate": 1.6402995779036146e-06, + "loss": 0.3757, + "step": 36720 + }, + { + "epoch": 0.8151962797305247, + "grad_norm": 1.4187140464782715, + "learning_rate": 1.6383866257815007e-06, + "loss": 0.3975, + "step": 36725 + }, + { + "epoch": 0.8153072662900522, + "grad_norm": 1.1671775579452515, + "learning_rate": 1.6364746902362972e-06, + "loss": 0.4162, + "step": 36730 + }, + { + "epoch": 0.8154182528495799, + "grad_norm": 1.0524426698684692, + "learning_rate": 1.6345637715004524e-06, + "loss": 0.4194, + "step": 36735 + }, + { + "epoch": 0.8155292394091076, + "grad_norm": 1.8277971744537354, + "learning_rate": 1.6326538698062878e-06, + "loss": 0.4083, + "step": 36740 + }, + { + "epoch": 0.8156402259686352, + "grad_norm": 0.9649227857589722, + "learning_rate": 1.6307449853860058e-06, + "loss": 0.3563, + "step": 36745 + }, + { + "epoch": 0.8157512125281629, + "grad_norm": 1.5071200132369995, + "learning_rate": 1.6288371184716779e-06, + "loss": 0.3864, + "step": 36750 + }, + { + "epoch": 0.8158621990876905, + "grad_norm": 0.7660090923309326, + "learning_rate": 1.6269302692952605e-06, + "loss": 0.2733, + "step": 36755 + }, + { + "epoch": 0.8159731856472181, + "grad_norm": 1.075348138809204, + "learning_rate": 1.625024438088577e-06, + "loss": 0.3427, + "step": 36760 + }, + { + "epoch": 0.8160841722067458, + "grad_norm": 0.3273741602897644, + "learning_rate": 1.623119625083338e-06, + "loss": 0.3185, + "step": 36765 + }, + { + "epoch": 0.8161951587662734, + "grad_norm": 1.2251849174499512, + "learning_rate": 1.6212158305111192e-06, + "loss": 0.4443, + "step": 36770 + }, + { + "epoch": 0.816306145325801, + "grad_norm": 1.5640612840652466, + "learning_rate": 1.6193130546033808e-06, + "loss": 0.5205, + "step": 36775 + }, + { + "epoch": 0.8164171318853287, + "grad_norm": 1.0030092000961304, + "learning_rate": 1.6174112975914524e-06, + "loss": 0.5655, + "step": 36780 + }, + { + "epoch": 0.8165281184448563, + "grad_norm": 1.602996587753296, + "learning_rate": 1.615510559706548e-06, + "loss": 0.3742, + "step": 36785 + }, + { + "epoch": 0.816639105004384, + "grad_norm": 1.4866420030593872, + "learning_rate": 1.613610841179748e-06, + "loss": 0.3605, + "step": 36790 + }, + { + "epoch": 0.8167500915639117, + "grad_norm": 1.5150279998779297, + "learning_rate": 1.611712142242019e-06, + "loss": 0.4498, + "step": 36795 + }, + { + "epoch": 0.8168610781234392, + "grad_norm": 1.7357852458953857, + "learning_rate": 1.6098144631241918e-06, + "loss": 0.3914, + "step": 36800 + }, + { + "epoch": 0.8169720646829669, + "grad_norm": 0.8778465986251831, + "learning_rate": 1.6079178040569853e-06, + "loss": 0.3118, + "step": 36805 + }, + { + "epoch": 0.8170830512424946, + "grad_norm": 1.068134069442749, + "learning_rate": 1.6060221652709885e-06, + "loss": 0.5221, + "step": 36810 + }, + { + "epoch": 0.8171940378020222, + "grad_norm": 1.6891485452651978, + "learning_rate": 1.6041275469966645e-06, + "loss": 0.3636, + "step": 36815 + }, + { + "epoch": 0.8173050243615498, + "grad_norm": 1.3472400903701782, + "learning_rate": 1.602233949464357e-06, + "loss": 0.3646, + "step": 36820 + }, + { + "epoch": 0.8174160109210774, + "grad_norm": 1.4167143106460571, + "learning_rate": 1.6003413729042804e-06, + "loss": 0.3788, + "step": 36825 + }, + { + "epoch": 0.8175269974806051, + "grad_norm": 1.301668643951416, + "learning_rate": 1.5984498175465292e-06, + "loss": 0.4136, + "step": 36830 + }, + { + "epoch": 0.8176379840401328, + "grad_norm": 1.205451488494873, + "learning_rate": 1.596559283621074e-06, + "loss": 0.3997, + "step": 36835 + }, + { + "epoch": 0.8177489705996603, + "grad_norm": 0.7667840719223022, + "learning_rate": 1.5946697713577574e-06, + "loss": 0.3981, + "step": 36840 + }, + { + "epoch": 0.817859957159188, + "grad_norm": 1.0957651138305664, + "learning_rate": 1.592781280986302e-06, + "loss": 0.3886, + "step": 36845 + }, + { + "epoch": 0.8179709437187157, + "grad_norm": 1.3430111408233643, + "learning_rate": 1.5908938127363004e-06, + "loss": 0.5454, + "step": 36850 + }, + { + "epoch": 0.8180819302782433, + "grad_norm": 1.6858025789260864, + "learning_rate": 1.5890073668372275e-06, + "loss": 0.4572, + "step": 36855 + }, + { + "epoch": 0.818192916837771, + "grad_norm": 0.7941359877586365, + "learning_rate": 1.5871219435184325e-06, + "loss": 0.2364, + "step": 36860 + }, + { + "epoch": 0.8183039033972986, + "grad_norm": 1.369829773902893, + "learning_rate": 1.585237543009136e-06, + "loss": 0.4684, + "step": 36865 + }, + { + "epoch": 0.8184148899568262, + "grad_norm": 1.1318687200546265, + "learning_rate": 1.5833541655384387e-06, + "loss": 0.3791, + "step": 36870 + }, + { + "epoch": 0.8185258765163539, + "grad_norm": 1.6038217544555664, + "learning_rate": 1.5814718113353134e-06, + "loss": 0.4192, + "step": 36875 + }, + { + "epoch": 0.8186368630758815, + "grad_norm": 1.0083684921264648, + "learning_rate": 1.5795904806286144e-06, + "loss": 0.2792, + "step": 36880 + }, + { + "epoch": 0.8187478496354091, + "grad_norm": 1.6192983388900757, + "learning_rate": 1.5777101736470623e-06, + "loss": 0.4293, + "step": 36885 + }, + { + "epoch": 0.8188588361949368, + "grad_norm": 0.9264241456985474, + "learning_rate": 1.575830890619261e-06, + "loss": 0.384, + "step": 36890 + }, + { + "epoch": 0.8189698227544644, + "grad_norm": 1.500458002090454, + "learning_rate": 1.5739526317736897e-06, + "loss": 0.4064, + "step": 36895 + }, + { + "epoch": 0.8190808093139921, + "grad_norm": 1.7399510145187378, + "learning_rate": 1.572075397338696e-06, + "loss": 0.4525, + "step": 36900 + }, + { + "epoch": 0.8191917958735198, + "grad_norm": 1.5163832902908325, + "learning_rate": 1.5701991875425137e-06, + "loss": 0.3745, + "step": 36905 + }, + { + "epoch": 0.8193027824330473, + "grad_norm": 1.2246482372283936, + "learning_rate": 1.5683240026132395e-06, + "loss": 0.42, + "step": 36910 + }, + { + "epoch": 0.819413768992575, + "grad_norm": 0.7396544218063354, + "learning_rate": 1.5664498427788554e-06, + "loss": 0.3279, + "step": 36915 + }, + { + "epoch": 0.8195247555521027, + "grad_norm": 1.3506535291671753, + "learning_rate": 1.5645767082672192e-06, + "loss": 0.4584, + "step": 36920 + }, + { + "epoch": 0.8196357421116303, + "grad_norm": 1.3335028886795044, + "learning_rate": 1.562704599306053e-06, + "loss": 0.4633, + "step": 36925 + }, + { + "epoch": 0.8197467286711579, + "grad_norm": 0.8453715443611145, + "learning_rate": 1.5608335161229682e-06, + "loss": 0.325, + "step": 36930 + }, + { + "epoch": 0.8198577152306855, + "grad_norm": 0.909500241279602, + "learning_rate": 1.5589634589454383e-06, + "loss": 0.3557, + "step": 36935 + }, + { + "epoch": 0.8199687017902132, + "grad_norm": 1.10624361038208, + "learning_rate": 1.5570944280008227e-06, + "loss": 0.3512, + "step": 36940 + }, + { + "epoch": 0.8200796883497409, + "grad_norm": 1.1543127298355103, + "learning_rate": 1.5552264235163538e-06, + "loss": 0.3086, + "step": 36945 + }, + { + "epoch": 0.8201906749092684, + "grad_norm": 1.129363775253296, + "learning_rate": 1.5533594457191326e-06, + "loss": 0.4401, + "step": 36950 + }, + { + "epoch": 0.8203016614687961, + "grad_norm": 0.8860126733779907, + "learning_rate": 1.5514934948361437e-06, + "loss": 0.3586, + "step": 36955 + }, + { + "epoch": 0.8204126480283238, + "grad_norm": 1.5794183015823364, + "learning_rate": 1.5496285710942393e-06, + "loss": 0.3656, + "step": 36960 + }, + { + "epoch": 0.8205236345878514, + "grad_norm": 1.1547932624816895, + "learning_rate": 1.5477646747201559e-06, + "loss": 0.3935, + "step": 36965 + }, + { + "epoch": 0.8206346211473791, + "grad_norm": 1.3218568563461304, + "learning_rate": 1.545901805940494e-06, + "loss": 0.4343, + "step": 36970 + }, + { + "epoch": 0.8207456077069067, + "grad_norm": 1.116638422012329, + "learning_rate": 1.5440399649817384e-06, + "loss": 0.4022, + "step": 36975 + }, + { + "epoch": 0.8208565942664343, + "grad_norm": 1.830057978630066, + "learning_rate": 1.5421791520702468e-06, + "loss": 0.4358, + "step": 36980 + }, + { + "epoch": 0.820967580825962, + "grad_norm": 1.1473445892333984, + "learning_rate": 1.540319367432246e-06, + "loss": 0.2289, + "step": 36985 + }, + { + "epoch": 0.8210785673854896, + "grad_norm": 1.3270286321640015, + "learning_rate": 1.538460611293847e-06, + "loss": 0.3231, + "step": 36990 + }, + { + "epoch": 0.8211895539450172, + "grad_norm": 0.9333168268203735, + "learning_rate": 1.5366028838810265e-06, + "loss": 0.2854, + "step": 36995 + }, + { + "epoch": 0.8213005405045449, + "grad_norm": 1.4975849390029907, + "learning_rate": 1.5347461854196466e-06, + "loss": 0.5078, + "step": 37000 + }, + { + "epoch": 0.8214115270640725, + "grad_norm": 1.1126400232315063, + "learning_rate": 1.5328905161354324e-06, + "loss": 0.4876, + "step": 37005 + }, + { + "epoch": 0.8215225136236002, + "grad_norm": 0.8321356177330017, + "learning_rate": 1.5310358762539957e-06, + "loss": 0.375, + "step": 37010 + }, + { + "epoch": 0.8216335001831279, + "grad_norm": 2.101627826690674, + "learning_rate": 1.5291822660008116e-06, + "loss": 0.416, + "step": 37015 + }, + { + "epoch": 0.8217444867426554, + "grad_norm": 1.0231560468673706, + "learning_rate": 1.527329685601241e-06, + "loss": 0.4258, + "step": 37020 + }, + { + "epoch": 0.8218554733021831, + "grad_norm": 0.7642114162445068, + "learning_rate": 1.5254781352805092e-06, + "loss": 0.2772, + "step": 37025 + }, + { + "epoch": 0.8219664598617108, + "grad_norm": 1.4178411960601807, + "learning_rate": 1.5236276152637275e-06, + "loss": 0.3743, + "step": 37030 + }, + { + "epoch": 0.8220774464212384, + "grad_norm": 1.4414584636688232, + "learning_rate": 1.52177812577587e-06, + "loss": 0.4376, + "step": 37035 + }, + { + "epoch": 0.822188432980766, + "grad_norm": 1.2922779321670532, + "learning_rate": 1.5199296670417973e-06, + "loss": 0.3136, + "step": 37040 + }, + { + "epoch": 0.8222994195402936, + "grad_norm": 0.901348352432251, + "learning_rate": 1.5180822392862327e-06, + "loss": 0.331, + "step": 37045 + }, + { + "epoch": 0.8224104060998213, + "grad_norm": 1.0336065292358398, + "learning_rate": 1.5162358427337853e-06, + "loss": 0.5121, + "step": 37050 + }, + { + "epoch": 0.822521392659349, + "grad_norm": 1.231096625328064, + "learning_rate": 1.5143904776089302e-06, + "loss": 0.3965, + "step": 37055 + }, + { + "epoch": 0.8226323792188766, + "grad_norm": 1.0460683107376099, + "learning_rate": 1.5125461441360223e-06, + "loss": 0.3874, + "step": 37060 + }, + { + "epoch": 0.8227433657784042, + "grad_norm": 1.1694492101669312, + "learning_rate": 1.5107028425392923e-06, + "loss": 0.499, + "step": 37065 + }, + { + "epoch": 0.8228543523379319, + "grad_norm": 1.3804575204849243, + "learning_rate": 1.5088605730428362e-06, + "loss": 0.3613, + "step": 37070 + }, + { + "epoch": 0.8229653388974595, + "grad_norm": 0.6450167894363403, + "learning_rate": 1.5070193358706375e-06, + "loss": 0.3518, + "step": 37075 + }, + { + "epoch": 0.8230763254569872, + "grad_norm": 0.9184905290603638, + "learning_rate": 1.5051791312465425e-06, + "loss": 0.2628, + "step": 37080 + }, + { + "epoch": 0.8231873120165148, + "grad_norm": 0.5770201683044434, + "learning_rate": 1.5033399593942789e-06, + "loss": 0.2979, + "step": 37085 + }, + { + "epoch": 0.8232982985760424, + "grad_norm": 1.383227825164795, + "learning_rate": 1.5015018205374498e-06, + "loss": 0.4115, + "step": 37090 + }, + { + "epoch": 0.8234092851355701, + "grad_norm": 1.2248444557189941, + "learning_rate": 1.4996647148995258e-06, + "loss": 0.3667, + "step": 37095 + }, + { + "epoch": 0.8235202716950977, + "grad_norm": 1.2955788373947144, + "learning_rate": 1.4978286427038602e-06, + "loss": 0.5362, + "step": 37100 + }, + { + "epoch": 0.8236312582546254, + "grad_norm": 1.5456572771072388, + "learning_rate": 1.495993604173671e-06, + "loss": 0.4109, + "step": 37105 + }, + { + "epoch": 0.823742244814153, + "grad_norm": 1.211588978767395, + "learning_rate": 1.49415959953206e-06, + "loss": 0.4457, + "step": 37110 + }, + { + "epoch": 0.8238532313736806, + "grad_norm": 1.0468361377716064, + "learning_rate": 1.4923266290020011e-06, + "loss": 0.2724, + "step": 37115 + }, + { + "epoch": 0.8239642179332083, + "grad_norm": 1.3627668619155884, + "learning_rate": 1.4904946928063347e-06, + "loss": 0.3655, + "step": 37120 + }, + { + "epoch": 0.824075204492736, + "grad_norm": 1.1421093940734863, + "learning_rate": 1.4886637911677882e-06, + "loss": 0.3082, + "step": 37125 + }, + { + "epoch": 0.8241861910522635, + "grad_norm": 0.8973304629325867, + "learning_rate": 1.4868339243089503e-06, + "loss": 0.3593, + "step": 37130 + }, + { + "epoch": 0.8242971776117912, + "grad_norm": 0.8185334205627441, + "learning_rate": 1.4850050924522953e-06, + "loss": 0.43, + "step": 37135 + }, + { + "epoch": 0.8244081641713189, + "grad_norm": 1.235817313194275, + "learning_rate": 1.4831772958201618e-06, + "loss": 0.4316, + "step": 37140 + }, + { + "epoch": 0.8245191507308465, + "grad_norm": 0.8884466886520386, + "learning_rate": 1.4813505346347701e-06, + "loss": 0.3893, + "step": 37145 + }, + { + "epoch": 0.8246301372903742, + "grad_norm": 0.9675939679145813, + "learning_rate": 1.4795248091182124e-06, + "loss": 0.4503, + "step": 37150 + }, + { + "epoch": 0.8247411238499017, + "grad_norm": 0.9958243370056152, + "learning_rate": 1.4777001194924512e-06, + "loss": 0.296, + "step": 37155 + }, + { + "epoch": 0.8248521104094294, + "grad_norm": 1.5261372327804565, + "learning_rate": 1.4758764659793302e-06, + "loss": 0.335, + "step": 37160 + }, + { + "epoch": 0.8249630969689571, + "grad_norm": 1.335727572441101, + "learning_rate": 1.4740538488005584e-06, + "loss": 0.4203, + "step": 37165 + }, + { + "epoch": 0.8250740835284847, + "grad_norm": 0.9879004955291748, + "learning_rate": 1.4722322681777257e-06, + "loss": 0.4227, + "step": 37170 + }, + { + "epoch": 0.8251850700880123, + "grad_norm": 1.6839021444320679, + "learning_rate": 1.4704117243322969e-06, + "loss": 0.4345, + "step": 37175 + }, + { + "epoch": 0.82529605664754, + "grad_norm": 1.8196085691452026, + "learning_rate": 1.4685922174856015e-06, + "loss": 0.3528, + "step": 37180 + }, + { + "epoch": 0.8254070432070676, + "grad_norm": 1.3297110795974731, + "learning_rate": 1.466773747858854e-06, + "loss": 0.5426, + "step": 37185 + }, + { + "epoch": 0.8255180297665953, + "grad_norm": 1.045060157775879, + "learning_rate": 1.464956315673135e-06, + "loss": 0.3207, + "step": 37190 + }, + { + "epoch": 0.825629016326123, + "grad_norm": 1.4947199821472168, + "learning_rate": 1.4631399211494023e-06, + "loss": 0.5682, + "step": 37195 + }, + { + "epoch": 0.8257400028856505, + "grad_norm": 1.687486171722412, + "learning_rate": 1.4613245645084894e-06, + "loss": 0.322, + "step": 37200 + }, + { + "epoch": 0.8258509894451782, + "grad_norm": 1.0408520698547363, + "learning_rate": 1.4595102459710987e-06, + "loss": 0.5531, + "step": 37205 + }, + { + "epoch": 0.8259619760047058, + "grad_norm": 1.493778944015503, + "learning_rate": 1.45769696575781e-06, + "loss": 0.4164, + "step": 37210 + }, + { + "epoch": 0.8260729625642335, + "grad_norm": 0.9806472659111023, + "learning_rate": 1.455884724089075e-06, + "loss": 0.3899, + "step": 37215 + }, + { + "epoch": 0.8261839491237611, + "grad_norm": 1.0444679260253906, + "learning_rate": 1.454073521185222e-06, + "loss": 0.4276, + "step": 37220 + }, + { + "epoch": 0.8262949356832887, + "grad_norm": 1.407180666923523, + "learning_rate": 1.452263357266447e-06, + "loss": 0.3518, + "step": 37225 + }, + { + "epoch": 0.8264059222428164, + "grad_norm": 1.4814797639846802, + "learning_rate": 1.450454232552826e-06, + "loss": 0.3628, + "step": 37230 + }, + { + "epoch": 0.8265169088023441, + "grad_norm": 1.703507661819458, + "learning_rate": 1.4486461472643088e-06, + "loss": 0.4244, + "step": 37235 + }, + { + "epoch": 0.8266278953618716, + "grad_norm": 0.640510618686676, + "learning_rate": 1.4468391016207129e-06, + "loss": 0.3671, + "step": 37240 + }, + { + "epoch": 0.8267388819213993, + "grad_norm": 1.0508774518966675, + "learning_rate": 1.4450330958417348e-06, + "loss": 0.3553, + "step": 37245 + }, + { + "epoch": 0.826849868480927, + "grad_norm": 1.3584023714065552, + "learning_rate": 1.4432281301469397e-06, + "loss": 0.3616, + "step": 37250 + }, + { + "epoch": 0.8269608550404546, + "grad_norm": 1.5076439380645752, + "learning_rate": 1.4414242047557747e-06, + "loss": 0.2058, + "step": 37255 + }, + { + "epoch": 0.8270718415999823, + "grad_norm": 0.9351935386657715, + "learning_rate": 1.4396213198875485e-06, + "loss": 0.4854, + "step": 37260 + }, + { + "epoch": 0.8271828281595098, + "grad_norm": 1.1554254293441772, + "learning_rate": 1.437819475761455e-06, + "loss": 0.5623, + "step": 37265 + }, + { + "epoch": 0.8272938147190375, + "grad_norm": 1.1598232984542847, + "learning_rate": 1.4360186725965518e-06, + "loss": 0.5052, + "step": 37270 + }, + { + "epoch": 0.8274048012785652, + "grad_norm": 1.4718127250671387, + "learning_rate": 1.434218910611781e-06, + "loss": 0.3818, + "step": 37275 + }, + { + "epoch": 0.8275157878380928, + "grad_norm": 0.9634243249893188, + "learning_rate": 1.4324201900259438e-06, + "loss": 0.3148, + "step": 37280 + }, + { + "epoch": 0.8276267743976204, + "grad_norm": 1.3595343828201294, + "learning_rate": 1.4306225110577288e-06, + "loss": 0.4535, + "step": 37285 + }, + { + "epoch": 0.8277377609571481, + "grad_norm": 0.9844816327095032, + "learning_rate": 1.4288258739256877e-06, + "loss": 0.2749, + "step": 37290 + }, + { + "epoch": 0.8278487475166757, + "grad_norm": 2.4494025707244873, + "learning_rate": 1.4270302788482537e-06, + "loss": 0.4749, + "step": 37295 + }, + { + "epoch": 0.8279597340762034, + "grad_norm": 1.2348016500473022, + "learning_rate": 1.4252357260437244e-06, + "loss": 0.4774, + "step": 37300 + }, + { + "epoch": 0.8280707206357311, + "grad_norm": 0.7983580827713013, + "learning_rate": 1.4234422157302808e-06, + "loss": 0.2663, + "step": 37305 + }, + { + "epoch": 0.8281817071952586, + "grad_norm": 1.6383472681045532, + "learning_rate": 1.4216497481259662e-06, + "loss": 0.2833, + "step": 37310 + }, + { + "epoch": 0.8282926937547863, + "grad_norm": 1.3078876733779907, + "learning_rate": 1.4198583234487052e-06, + "loss": 0.3705, + "step": 37315 + }, + { + "epoch": 0.8284036803143139, + "grad_norm": 0.9692357182502747, + "learning_rate": 1.4180679419162968e-06, + "loss": 0.3132, + "step": 37320 + }, + { + "epoch": 0.8285146668738416, + "grad_norm": 1.1948881149291992, + "learning_rate": 1.4162786037464038e-06, + "loss": 0.3414, + "step": 37325 + }, + { + "epoch": 0.8286256534333692, + "grad_norm": 2.3686370849609375, + "learning_rate": 1.414490309156573e-06, + "loss": 0.4313, + "step": 37330 + }, + { + "epoch": 0.8287366399928968, + "grad_norm": 1.1939934492111206, + "learning_rate": 1.4127030583642143e-06, + "loss": 0.4808, + "step": 37335 + }, + { + "epoch": 0.8288476265524245, + "grad_norm": 1.2011733055114746, + "learning_rate": 1.410916851586619e-06, + "loss": 0.418, + "step": 37340 + }, + { + "epoch": 0.8289586131119522, + "grad_norm": 1.4537737369537354, + "learning_rate": 1.409131689040949e-06, + "loss": 0.4427, + "step": 37345 + }, + { + "epoch": 0.8290695996714798, + "grad_norm": 0.9123027920722961, + "learning_rate": 1.407347570944234e-06, + "loss": 0.4614, + "step": 37350 + }, + { + "epoch": 0.8291805862310074, + "grad_norm": 1.0160402059555054, + "learning_rate": 1.4055644975133875e-06, + "loss": 0.443, + "step": 37355 + }, + { + "epoch": 0.8292915727905351, + "grad_norm": 1.07515287399292, + "learning_rate": 1.4037824689651825e-06, + "loss": 0.3687, + "step": 37360 + }, + { + "epoch": 0.8294025593500627, + "grad_norm": 2.9584853649139404, + "learning_rate": 1.4020014855162755e-06, + "loss": 0.3932, + "step": 37365 + }, + { + "epoch": 0.8295135459095904, + "grad_norm": 0.940378725528717, + "learning_rate": 1.4002215473831948e-06, + "loss": 0.4168, + "step": 37370 + }, + { + "epoch": 0.8296245324691179, + "grad_norm": 1.1896016597747803, + "learning_rate": 1.3984426547823348e-06, + "loss": 0.454, + "step": 37375 + }, + { + "epoch": 0.8297355190286456, + "grad_norm": 1.081487774848938, + "learning_rate": 1.3966648079299717e-06, + "loss": 0.2407, + "step": 37380 + }, + { + "epoch": 0.8298465055881733, + "grad_norm": 1.4853719472885132, + "learning_rate": 1.3948880070422455e-06, + "loss": 0.3657, + "step": 37385 + }, + { + "epoch": 0.8299574921477009, + "grad_norm": 0.9414169788360596, + "learning_rate": 1.3931122523351792e-06, + "loss": 0.443, + "step": 37390 + }, + { + "epoch": 0.8300684787072286, + "grad_norm": 0.7520697712898254, + "learning_rate": 1.3913375440246569e-06, + "loss": 0.3519, + "step": 37395 + }, + { + "epoch": 0.8301794652667562, + "grad_norm": 0.8525213599205017, + "learning_rate": 1.3895638823264447e-06, + "loss": 0.2664, + "step": 37400 + }, + { + "epoch": 0.8302904518262838, + "grad_norm": 5.166147708892822, + "learning_rate": 1.387791267456181e-06, + "loss": 0.3077, + "step": 37405 + }, + { + "epoch": 0.8304014383858115, + "grad_norm": 0.8263622522354126, + "learning_rate": 1.3860196996293696e-06, + "loss": 0.4258, + "step": 37410 + }, + { + "epoch": 0.8305124249453392, + "grad_norm": 1.5888712406158447, + "learning_rate": 1.3842491790613966e-06, + "loss": 0.5111, + "step": 37415 + }, + { + "epoch": 0.8306234115048667, + "grad_norm": 1.092028021812439, + "learning_rate": 1.382479705967511e-06, + "loss": 0.3655, + "step": 37420 + }, + { + "epoch": 0.8307343980643944, + "grad_norm": 1.4173848628997803, + "learning_rate": 1.380711280562841e-06, + "loss": 0.4612, + "step": 37425 + }, + { + "epoch": 0.830845384623922, + "grad_norm": 0.5405141711235046, + "learning_rate": 1.3789439030623896e-06, + "loss": 0.2213, + "step": 37430 + }, + { + "epoch": 0.8309563711834497, + "grad_norm": 1.3125280141830444, + "learning_rate": 1.3771775736810244e-06, + "loss": 0.3059, + "step": 37435 + }, + { + "epoch": 0.8310673577429774, + "grad_norm": 1.3626099824905396, + "learning_rate": 1.3754122926334922e-06, + "loss": 0.3351, + "step": 37440 + }, + { + "epoch": 0.8311783443025049, + "grad_norm": 0.6582348942756653, + "learning_rate": 1.3736480601344071e-06, + "loss": 0.288, + "step": 37445 + }, + { + "epoch": 0.8312893308620326, + "grad_norm": 1.2736144065856934, + "learning_rate": 1.3718848763982596e-06, + "loss": 0.4283, + "step": 37450 + }, + { + "epoch": 0.8314003174215603, + "grad_norm": 0.8599841594696045, + "learning_rate": 1.3701227416394146e-06, + "loss": 0.3242, + "step": 37455 + }, + { + "epoch": 0.8315113039810879, + "grad_norm": 1.3656071424484253, + "learning_rate": 1.3683616560721036e-06, + "loss": 0.3931, + "step": 37460 + }, + { + "epoch": 0.8316222905406155, + "grad_norm": 1.123210072517395, + "learning_rate": 1.3666016199104349e-06, + "loss": 0.3652, + "step": 37465 + }, + { + "epoch": 0.8317332771001432, + "grad_norm": 0.8573991656303406, + "learning_rate": 1.364842633368385e-06, + "loss": 0.4622, + "step": 37470 + }, + { + "epoch": 0.8318442636596708, + "grad_norm": 1.0115129947662354, + "learning_rate": 1.363084696659811e-06, + "loss": 0.3035, + "step": 37475 + }, + { + "epoch": 0.8319552502191985, + "grad_norm": 1.4920989274978638, + "learning_rate": 1.3613278099984305e-06, + "loss": 0.2896, + "step": 37480 + }, + { + "epoch": 0.832066236778726, + "grad_norm": 1.930938959121704, + "learning_rate": 1.3595719735978451e-06, + "loss": 0.2667, + "step": 37485 + }, + { + "epoch": 0.8321772233382537, + "grad_norm": 1.1663861274719238, + "learning_rate": 1.3578171876715196e-06, + "loss": 0.3067, + "step": 37490 + }, + { + "epoch": 0.8322882098977814, + "grad_norm": 1.9616607427597046, + "learning_rate": 1.3560634524327987e-06, + "loss": 0.4342, + "step": 37495 + }, + { + "epoch": 0.832399196457309, + "grad_norm": 0.8635976910591125, + "learning_rate": 1.354310768094892e-06, + "loss": 0.3984, + "step": 37500 + }, + { + "epoch": 0.8325101830168367, + "grad_norm": 1.164693832397461, + "learning_rate": 1.352559134870889e-06, + "loss": 0.5515, + "step": 37505 + }, + { + "epoch": 0.8326211695763643, + "grad_norm": 1.0937644243240356, + "learning_rate": 1.3508085529737425e-06, + "loss": 0.3391, + "step": 37510 + }, + { + "epoch": 0.8327321561358919, + "grad_norm": 2.0383598804473877, + "learning_rate": 1.349059022616287e-06, + "loss": 0.5429, + "step": 37515 + }, + { + "epoch": 0.8328431426954196, + "grad_norm": 1.3086220026016235, + "learning_rate": 1.34731054401122e-06, + "loss": 0.2063, + "step": 37520 + }, + { + "epoch": 0.8329541292549473, + "grad_norm": 1.2310152053833008, + "learning_rate": 1.3455631173711214e-06, + "loss": 0.2309, + "step": 37525 + }, + { + "epoch": 0.8330651158144748, + "grad_norm": 1.0935806035995483, + "learning_rate": 1.3438167429084315e-06, + "loss": 0.4173, + "step": 37530 + }, + { + "epoch": 0.8331761023740025, + "grad_norm": 1.3279857635498047, + "learning_rate": 1.3420714208354713e-06, + "loss": 0.4536, + "step": 37535 + }, + { + "epoch": 0.8332870889335301, + "grad_norm": 0.9432138800621033, + "learning_rate": 1.3403271513644334e-06, + "loss": 0.5366, + "step": 37540 + }, + { + "epoch": 0.8333980754930578, + "grad_norm": 1.0061371326446533, + "learning_rate": 1.338583934707377e-06, + "loss": 0.4347, + "step": 37545 + }, + { + "epoch": 0.8335090620525855, + "grad_norm": 2.5035080909729004, + "learning_rate": 1.3368417710762394e-06, + "loss": 0.3435, + "step": 37550 + }, + { + "epoch": 0.833620048612113, + "grad_norm": 0.8789152503013611, + "learning_rate": 1.335100660682822e-06, + "loss": 0.3996, + "step": 37555 + }, + { + "epoch": 0.8337310351716407, + "grad_norm": 0.9698935151100159, + "learning_rate": 1.3333606037388102e-06, + "loss": 0.2393, + "step": 37560 + }, + { + "epoch": 0.8338420217311684, + "grad_norm": 0.7996877431869507, + "learning_rate": 1.331621600455747e-06, + "loss": 0.3614, + "step": 37565 + }, + { + "epoch": 0.833953008290696, + "grad_norm": 2.1126487255096436, + "learning_rate": 1.3298836510450597e-06, + "loss": 0.3795, + "step": 37570 + }, + { + "epoch": 0.8340639948502236, + "grad_norm": 2.0766563415527344, + "learning_rate": 1.3281467557180416e-06, + "loss": 0.5423, + "step": 37575 + }, + { + "epoch": 0.8341749814097513, + "grad_norm": 1.3232349157333374, + "learning_rate": 1.3264109146858562e-06, + "loss": 0.3806, + "step": 37580 + }, + { + "epoch": 0.8342859679692789, + "grad_norm": 1.0379611253738403, + "learning_rate": 1.3246761281595454e-06, + "loss": 0.3126, + "step": 37585 + }, + { + "epoch": 0.8343969545288066, + "grad_norm": 0.6964349746704102, + "learning_rate": 1.3229423963500132e-06, + "loss": 0.3547, + "step": 37590 + }, + { + "epoch": 0.8345079410883341, + "grad_norm": 0.9996328949928284, + "learning_rate": 1.321209719468044e-06, + "loss": 0.4493, + "step": 37595 + }, + { + "epoch": 0.8346189276478618, + "grad_norm": 1.6771372556686401, + "learning_rate": 1.3194780977242927e-06, + "loss": 0.5017, + "step": 37600 + }, + { + "epoch": 0.8347299142073895, + "grad_norm": 1.0705715417861938, + "learning_rate": 1.317747531329281e-06, + "loss": 0.2607, + "step": 37605 + }, + { + "epoch": 0.8348409007669171, + "grad_norm": 1.3443447351455688, + "learning_rate": 1.316018020493408e-06, + "loss": 0.4812, + "step": 37610 + }, + { + "epoch": 0.8349518873264448, + "grad_norm": 1.5578769445419312, + "learning_rate": 1.3142895654269372e-06, + "loss": 0.4023, + "step": 37615 + }, + { + "epoch": 0.8350628738859724, + "grad_norm": 1.4091074466705322, + "learning_rate": 1.3125621663400123e-06, + "loss": 0.5718, + "step": 37620 + }, + { + "epoch": 0.8351738604455, + "grad_norm": 1.1248959302902222, + "learning_rate": 1.3108358234426455e-06, + "loss": 0.3817, + "step": 37625 + }, + { + "epoch": 0.8352848470050277, + "grad_norm": 1.4390947818756104, + "learning_rate": 1.3091105369447166e-06, + "loss": 0.328, + "step": 37630 + }, + { + "epoch": 0.8353958335645554, + "grad_norm": 1.4196460247039795, + "learning_rate": 1.3073863070559833e-06, + "loss": 0.3957, + "step": 37635 + }, + { + "epoch": 0.835506820124083, + "grad_norm": 2.1124303340911865, + "learning_rate": 1.3056631339860682e-06, + "loss": 0.4624, + "step": 37640 + }, + { + "epoch": 0.8356178066836106, + "grad_norm": 1.3274662494659424, + "learning_rate": 1.3039410179444734e-06, + "loss": 0.413, + "step": 37645 + }, + { + "epoch": 0.8357287932431382, + "grad_norm": 1.625228762626648, + "learning_rate": 1.3022199591405616e-06, + "loss": 0.3787, + "step": 37650 + }, + { + "epoch": 0.8358397798026659, + "grad_norm": 0.7213447690010071, + "learning_rate": 1.3004999577835786e-06, + "loss": 0.3415, + "step": 37655 + }, + { + "epoch": 0.8359507663621936, + "grad_norm": 0.8313508629798889, + "learning_rate": 1.2987810140826362e-06, + "loss": 0.4463, + "step": 37660 + }, + { + "epoch": 0.8360617529217211, + "grad_norm": 1.5393515825271606, + "learning_rate": 1.297063128246715e-06, + "loss": 0.3254, + "step": 37665 + }, + { + "epoch": 0.8361727394812488, + "grad_norm": 0.8955177068710327, + "learning_rate": 1.2953463004846722e-06, + "loss": 0.4756, + "step": 37670 + }, + { + "epoch": 0.8362837260407765, + "grad_norm": 0.8730138540267944, + "learning_rate": 1.2936305310052322e-06, + "loss": 0.2815, + "step": 37675 + }, + { + "epoch": 0.8363947126003041, + "grad_norm": 1.8725333213806152, + "learning_rate": 1.2919158200169924e-06, + "loss": 0.3197, + "step": 37680 + }, + { + "epoch": 0.8365056991598318, + "grad_norm": 1.0760202407836914, + "learning_rate": 1.290202167728426e-06, + "loss": 0.2608, + "step": 37685 + }, + { + "epoch": 0.8366166857193594, + "grad_norm": 1.7121400833129883, + "learning_rate": 1.2884895743478664e-06, + "loss": 0.4663, + "step": 37690 + }, + { + "epoch": 0.836727672278887, + "grad_norm": 1.1320337057113647, + "learning_rate": 1.2867780400835307e-06, + "loss": 0.4716, + "step": 37695 + }, + { + "epoch": 0.8368386588384147, + "grad_norm": 0.8330530524253845, + "learning_rate": 1.2850675651434962e-06, + "loss": 0.3685, + "step": 37700 + }, + { + "epoch": 0.8369496453979423, + "grad_norm": 0.9803527593612671, + "learning_rate": 1.2833581497357205e-06, + "loss": 0.3924, + "step": 37705 + }, + { + "epoch": 0.8370606319574699, + "grad_norm": 1.5887954235076904, + "learning_rate": 1.2816497940680294e-06, + "loss": 0.3234, + "step": 37710 + }, + { + "epoch": 0.8371716185169976, + "grad_norm": 1.1901915073394775, + "learning_rate": 1.2799424983481145e-06, + "loss": 0.393, + "step": 37715 + }, + { + "epoch": 0.8372826050765252, + "grad_norm": 1.1848924160003662, + "learning_rate": 1.2782362627835488e-06, + "loss": 0.4492, + "step": 37720 + }, + { + "epoch": 0.8373935916360529, + "grad_norm": 0.7305712103843689, + "learning_rate": 1.2765310875817649e-06, + "loss": 0.3837, + "step": 37725 + }, + { + "epoch": 0.8375045781955806, + "grad_norm": 1.7613849639892578, + "learning_rate": 1.2748269729500784e-06, + "loss": 0.5601, + "step": 37730 + }, + { + "epoch": 0.8376155647551081, + "grad_norm": 1.5227969884872437, + "learning_rate": 1.2731239190956635e-06, + "loss": 0.3904, + "step": 37735 + }, + { + "epoch": 0.8377265513146358, + "grad_norm": 1.541243076324463, + "learning_rate": 1.2714219262255777e-06, + "loss": 0.2467, + "step": 37740 + }, + { + "epoch": 0.8378375378741635, + "grad_norm": 0.8089504241943359, + "learning_rate": 1.2697209945467382e-06, + "loss": 0.4567, + "step": 37745 + }, + { + "epoch": 0.837948524433691, + "grad_norm": 1.0389708280563354, + "learning_rate": 1.2680211242659425e-06, + "loss": 0.2745, + "step": 37750 + }, + { + "epoch": 0.8380595109932187, + "grad_norm": 0.951557457447052, + "learning_rate": 1.266322315589853e-06, + "loss": 0.4042, + "step": 37755 + }, + { + "epoch": 0.8381704975527463, + "grad_norm": 1.3259742259979248, + "learning_rate": 1.264624568725007e-06, + "loss": 0.353, + "step": 37760 + }, + { + "epoch": 0.838281484112274, + "grad_norm": 0.8747276067733765, + "learning_rate": 1.262927883877808e-06, + "loss": 0.2623, + "step": 37765 + }, + { + "epoch": 0.8383924706718017, + "grad_norm": 2.2142369747161865, + "learning_rate": 1.261232261254537e-06, + "loss": 0.3924, + "step": 37770 + }, + { + "epoch": 0.8385034572313292, + "grad_norm": 1.2620859146118164, + "learning_rate": 1.2595377010613375e-06, + "loss": 0.4454, + "step": 37775 + }, + { + "epoch": 0.8386144437908569, + "grad_norm": 1.658715844154358, + "learning_rate": 1.2578442035042338e-06, + "loss": 0.3923, + "step": 37780 + }, + { + "epoch": 0.8387254303503846, + "grad_norm": 1.0938314199447632, + "learning_rate": 1.2561517687891112e-06, + "loss": 0.4819, + "step": 37785 + }, + { + "epoch": 0.8388364169099122, + "grad_norm": 1.25962233543396, + "learning_rate": 1.2544603971217318e-06, + "loss": 0.3127, + "step": 37790 + }, + { + "epoch": 0.8389474034694399, + "grad_norm": 1.0734721422195435, + "learning_rate": 1.2527700887077289e-06, + "loss": 0.203, + "step": 37795 + }, + { + "epoch": 0.8390583900289675, + "grad_norm": 1.505988597869873, + "learning_rate": 1.2510808437526e-06, + "loss": 0.5183, + "step": 37800 + }, + { + "epoch": 0.8391693765884951, + "grad_norm": 1.4044837951660156, + "learning_rate": 1.2493926624617237e-06, + "loss": 0.2907, + "step": 37805 + }, + { + "epoch": 0.8392803631480228, + "grad_norm": 0.949424147605896, + "learning_rate": 1.2477055450403374e-06, + "loss": 0.4513, + "step": 37810 + }, + { + "epoch": 0.8393913497075504, + "grad_norm": 1.0142112970352173, + "learning_rate": 1.2460194916935587e-06, + "loss": 0.4101, + "step": 37815 + }, + { + "epoch": 0.839502336267078, + "grad_norm": 1.2752467393875122, + "learning_rate": 1.2443345026263731e-06, + "loss": 0.5054, + "step": 37820 + }, + { + "epoch": 0.8396133228266057, + "grad_norm": 1.5303524732589722, + "learning_rate": 1.2426505780436326e-06, + "loss": 0.5011, + "step": 37825 + }, + { + "epoch": 0.8397243093861333, + "grad_norm": 1.807246208190918, + "learning_rate": 1.2409677181500668e-06, + "loss": 0.2896, + "step": 37830 + }, + { + "epoch": 0.839835295945661, + "grad_norm": 0.9525343179702759, + "learning_rate": 1.2392859231502685e-06, + "loss": 0.3024, + "step": 37835 + }, + { + "epoch": 0.8399462825051887, + "grad_norm": 1.0603681802749634, + "learning_rate": 1.2376051932487087e-06, + "loss": 0.4048, + "step": 37840 + }, + { + "epoch": 0.8400572690647162, + "grad_norm": 1.1888055801391602, + "learning_rate": 1.2359255286497195e-06, + "loss": 0.4145, + "step": 37845 + }, + { + "epoch": 0.8401682556242439, + "grad_norm": 0.9829010963439941, + "learning_rate": 1.2342469295575133e-06, + "loss": 0.4637, + "step": 37850 + }, + { + "epoch": 0.8402792421837716, + "grad_norm": 1.1636102199554443, + "learning_rate": 1.2325693961761697e-06, + "loss": 0.3748, + "step": 37855 + }, + { + "epoch": 0.8403902287432992, + "grad_norm": 1.3435735702514648, + "learning_rate": 1.2308929287096327e-06, + "loss": 0.3393, + "step": 37860 + }, + { + "epoch": 0.8405012153028268, + "grad_norm": 0.8890703320503235, + "learning_rate": 1.2292175273617258e-06, + "loss": 0.3868, + "step": 37865 + }, + { + "epoch": 0.8406122018623544, + "grad_norm": 1.4685038328170776, + "learning_rate": 1.2275431923361358e-06, + "loss": 0.4727, + "step": 37870 + }, + { + "epoch": 0.8407231884218821, + "grad_norm": 1.1564308404922485, + "learning_rate": 1.2258699238364224e-06, + "loss": 0.3571, + "step": 37875 + }, + { + "epoch": 0.8408341749814098, + "grad_norm": 0.8358651399612427, + "learning_rate": 1.2241977220660206e-06, + "loss": 0.3261, + "step": 37880 + }, + { + "epoch": 0.8409451615409373, + "grad_norm": 1.0563150644302368, + "learning_rate": 1.2225265872282266e-06, + "loss": 0.4553, + "step": 37885 + }, + { + "epoch": 0.841056148100465, + "grad_norm": 1.1132227182388306, + "learning_rate": 1.2208565195262145e-06, + "loss": 0.2361, + "step": 37890 + }, + { + "epoch": 0.8411671346599927, + "grad_norm": 1.431659460067749, + "learning_rate": 1.2191875191630209e-06, + "loss": 0.3593, + "step": 37895 + }, + { + "epoch": 0.8412781212195203, + "grad_norm": 1.1609866619110107, + "learning_rate": 1.21751958634156e-06, + "loss": 0.333, + "step": 37900 + }, + { + "epoch": 0.841389107779048, + "grad_norm": 1.087960958480835, + "learning_rate": 1.2158527212646165e-06, + "loss": 0.3605, + "step": 37905 + }, + { + "epoch": 0.8415000943385756, + "grad_norm": 1.426638126373291, + "learning_rate": 1.214186924134838e-06, + "loss": 0.4866, + "step": 37910 + }, + { + "epoch": 0.8416110808981032, + "grad_norm": 3.363008499145508, + "learning_rate": 1.2125221951547495e-06, + "loss": 0.4721, + "step": 37915 + }, + { + "epoch": 0.8417220674576309, + "grad_norm": 1.3448435068130493, + "learning_rate": 1.2108585345267387e-06, + "loss": 0.3232, + "step": 37920 + }, + { + "epoch": 0.8418330540171586, + "grad_norm": 1.0982211828231812, + "learning_rate": 1.209195942453073e-06, + "loss": 0.3121, + "step": 37925 + }, + { + "epoch": 0.8419440405766861, + "grad_norm": 1.3792861700057983, + "learning_rate": 1.2075344191358818e-06, + "loss": 0.2409, + "step": 37930 + }, + { + "epoch": 0.8420550271362138, + "grad_norm": 1.3266873359680176, + "learning_rate": 1.2058739647771667e-06, + "loss": 0.2559, + "step": 37935 + }, + { + "epoch": 0.8421660136957414, + "grad_norm": 0.886651873588562, + "learning_rate": 1.204214579578804e-06, + "loss": 0.3027, + "step": 37940 + }, + { + "epoch": 0.8422770002552691, + "grad_norm": 0.8672597408294678, + "learning_rate": 1.2025562637425326e-06, + "loss": 0.4554, + "step": 37945 + }, + { + "epoch": 0.8423879868147968, + "grad_norm": 1.3894035816192627, + "learning_rate": 1.2008990174699685e-06, + "loss": 0.3483, + "step": 37950 + }, + { + "epoch": 0.8424989733743243, + "grad_norm": 0.844137966632843, + "learning_rate": 1.1992428409625901e-06, + "loss": 0.2818, + "step": 37955 + }, + { + "epoch": 0.842609959933852, + "grad_norm": 1.4286129474639893, + "learning_rate": 1.1975877344217501e-06, + "loss": 0.3868, + "step": 37960 + }, + { + "epoch": 0.8427209464933797, + "grad_norm": 0.6080193519592285, + "learning_rate": 1.1959336980486757e-06, + "loss": 0.4645, + "step": 37965 + }, + { + "epoch": 0.8428319330529073, + "grad_norm": 0.7501819133758545, + "learning_rate": 1.194280732044454e-06, + "loss": 0.3131, + "step": 37970 + }, + { + "epoch": 0.842942919612435, + "grad_norm": 1.4122878313064575, + "learning_rate": 1.1926288366100503e-06, + "loss": 0.3991, + "step": 37975 + }, + { + "epoch": 0.8430539061719626, + "grad_norm": 1.70725679397583, + "learning_rate": 1.1909780119462922e-06, + "loss": 0.4149, + "step": 37980 + }, + { + "epoch": 0.8431648927314902, + "grad_norm": 0.9734331369400024, + "learning_rate": 1.1893282582538867e-06, + "loss": 0.307, + "step": 37985 + }, + { + "epoch": 0.8432758792910179, + "grad_norm": 1.3157240152359009, + "learning_rate": 1.1876795757334014e-06, + "loss": 0.3889, + "step": 37990 + }, + { + "epoch": 0.8433868658505455, + "grad_norm": 1.4884155988693237, + "learning_rate": 1.1860319645852814e-06, + "loss": 0.4665, + "step": 37995 + }, + { + "epoch": 0.8434978524100731, + "grad_norm": 1.281921625137329, + "learning_rate": 1.184385425009832e-06, + "loss": 0.4709, + "step": 38000 + }, + { + "epoch": 0.8436088389696008, + "grad_norm": 1.284393310546875, + "learning_rate": 1.1827399572072407e-06, + "loss": 0.3922, + "step": 38005 + }, + { + "epoch": 0.8437198255291284, + "grad_norm": 1.273414134979248, + "learning_rate": 1.1810955613775521e-06, + "loss": 0.3056, + "step": 38010 + }, + { + "epoch": 0.8438308120886561, + "grad_norm": 1.4604092836380005, + "learning_rate": 1.1794522377206907e-06, + "loss": 0.3527, + "step": 38015 + }, + { + "epoch": 0.8439417986481837, + "grad_norm": 0.8474437594413757, + "learning_rate": 1.1778099864364422e-06, + "loss": 0.4289, + "step": 38020 + }, + { + "epoch": 0.8440527852077113, + "grad_norm": 1.360791802406311, + "learning_rate": 1.1761688077244703e-06, + "loss": 0.2715, + "step": 38025 + }, + { + "epoch": 0.844163771767239, + "grad_norm": 1.2367948293685913, + "learning_rate": 1.174528701784301e-06, + "loss": 0.5403, + "step": 38030 + }, + { + "epoch": 0.8442747583267667, + "grad_norm": 1.419376015663147, + "learning_rate": 1.1728896688153347e-06, + "loss": 0.5606, + "step": 38035 + }, + { + "epoch": 0.8443857448862943, + "grad_norm": 1.1906652450561523, + "learning_rate": 1.1712517090168373e-06, + "loss": 0.4601, + "step": 38040 + }, + { + "epoch": 0.8444967314458219, + "grad_norm": 1.702460765838623, + "learning_rate": 1.1696148225879467e-06, + "loss": 0.4131, + "step": 38045 + }, + { + "epoch": 0.8446077180053495, + "grad_norm": 0.9393609166145325, + "learning_rate": 1.1679790097276744e-06, + "loss": 0.3865, + "step": 38050 + }, + { + "epoch": 0.8447187045648772, + "grad_norm": 1.3559194803237915, + "learning_rate": 1.1663442706348915e-06, + "loss": 0.4174, + "step": 38055 + }, + { + "epoch": 0.8448296911244049, + "grad_norm": 2.1292991638183594, + "learning_rate": 1.164710605508348e-06, + "loss": 0.4488, + "step": 38060 + }, + { + "epoch": 0.8449406776839324, + "grad_norm": 1.1899800300598145, + "learning_rate": 1.1630780145466558e-06, + "loss": 0.5035, + "step": 38065 + }, + { + "epoch": 0.8450516642434601, + "grad_norm": 1.1950067281723022, + "learning_rate": 1.161446497948302e-06, + "loss": 0.3659, + "step": 38070 + }, + { + "epoch": 0.8451626508029878, + "grad_norm": 1.067469596862793, + "learning_rate": 1.1598160559116423e-06, + "loss": 0.2644, + "step": 38075 + }, + { + "epoch": 0.8452736373625154, + "grad_norm": 0.9901965856552124, + "learning_rate": 1.158186688634898e-06, + "loss": 0.3783, + "step": 38080 + }, + { + "epoch": 0.845384623922043, + "grad_norm": 0.9443339109420776, + "learning_rate": 1.156558396316164e-06, + "loss": 0.4083, + "step": 38085 + }, + { + "epoch": 0.8454956104815707, + "grad_norm": 1.2968891859054565, + "learning_rate": 1.1549311791534e-06, + "loss": 0.4318, + "step": 38090 + }, + { + "epoch": 0.8456065970410983, + "grad_norm": 1.3473148345947266, + "learning_rate": 1.153305037344441e-06, + "loss": 0.4181, + "step": 38095 + }, + { + "epoch": 0.845717583600626, + "grad_norm": 1.1904234886169434, + "learning_rate": 1.1516799710869841e-06, + "loss": 0.3757, + "step": 38100 + }, + { + "epoch": 0.8458285701601536, + "grad_norm": 1.0559203624725342, + "learning_rate": 1.1500559805786016e-06, + "loss": 0.3172, + "step": 38105 + }, + { + "epoch": 0.8459395567196812, + "grad_norm": 0.5630988478660583, + "learning_rate": 1.1484330660167342e-06, + "loss": 0.3873, + "step": 38110 + }, + { + "epoch": 0.8460505432792089, + "grad_norm": 0.8863928914070129, + "learning_rate": 1.1468112275986864e-06, + "loss": 0.3741, + "step": 38115 + }, + { + "epoch": 0.8461615298387365, + "grad_norm": 0.9940023422241211, + "learning_rate": 1.1451904655216417e-06, + "loss": 0.3822, + "step": 38120 + }, + { + "epoch": 0.8462725163982642, + "grad_norm": 1.2200144529342651, + "learning_rate": 1.14357077998264e-06, + "loss": 0.3438, + "step": 38125 + }, + { + "epoch": 0.8463835029577919, + "grad_norm": 0.8880993127822876, + "learning_rate": 1.1419521711786018e-06, + "loss": 0.377, + "step": 38130 + }, + { + "epoch": 0.8464944895173194, + "grad_norm": 0.7141773700714111, + "learning_rate": 1.140334639306312e-06, + "loss": 0.3334, + "step": 38135 + }, + { + "epoch": 0.8466054760768471, + "grad_norm": 1.2401177883148193, + "learning_rate": 1.138718184562423e-06, + "loss": 0.5925, + "step": 38140 + }, + { + "epoch": 0.8467164626363748, + "grad_norm": 2.065546751022339, + "learning_rate": 1.1371028071434599e-06, + "loss": 0.4347, + "step": 38145 + }, + { + "epoch": 0.8468274491959024, + "grad_norm": 1.3787494897842407, + "learning_rate": 1.135488507245811e-06, + "loss": 0.3479, + "step": 38150 + }, + { + "epoch": 0.84693843575543, + "grad_norm": 2.0585546493530273, + "learning_rate": 1.1338752850657409e-06, + "loss": 0.4585, + "step": 38155 + }, + { + "epoch": 0.8470494223149576, + "grad_norm": 1.160445213317871, + "learning_rate": 1.132263140799381e-06, + "loss": 0.3359, + "step": 38160 + }, + { + "epoch": 0.8471604088744853, + "grad_norm": 1.3824493885040283, + "learning_rate": 1.1306520746427263e-06, + "loss": 0.489, + "step": 38165 + }, + { + "epoch": 0.847271395434013, + "grad_norm": 1.5092135667800903, + "learning_rate": 1.1290420867916496e-06, + "loss": 0.3813, + "step": 38170 + }, + { + "epoch": 0.8473823819935405, + "grad_norm": 1.6400960683822632, + "learning_rate": 1.1274331774418822e-06, + "loss": 0.3731, + "step": 38175 + }, + { + "epoch": 0.8474933685530682, + "grad_norm": 0.9592393636703491, + "learning_rate": 1.1258253467890346e-06, + "loss": 0.4253, + "step": 38180 + }, + { + "epoch": 0.8476043551125959, + "grad_norm": 1.5121009349822998, + "learning_rate": 1.1242185950285778e-06, + "loss": 0.5113, + "step": 38185 + }, + { + "epoch": 0.8477153416721235, + "grad_norm": 0.8395439982414246, + "learning_rate": 1.122612922355858e-06, + "loss": 0.4075, + "step": 38190 + }, + { + "epoch": 0.8478263282316512, + "grad_norm": 0.9275767803192139, + "learning_rate": 1.1210083289660877e-06, + "loss": 0.4377, + "step": 38195 + }, + { + "epoch": 0.8479373147911788, + "grad_norm": 1.0100282430648804, + "learning_rate": 1.1194048150543457e-06, + "loss": 0.499, + "step": 38200 + }, + { + "epoch": 0.8480483013507064, + "grad_norm": 2.3323545455932617, + "learning_rate": 1.1178023808155846e-06, + "loss": 0.5272, + "step": 38205 + }, + { + "epoch": 0.8481592879102341, + "grad_norm": 2.287036418914795, + "learning_rate": 1.11620102644462e-06, + "loss": 0.5176, + "step": 38210 + }, + { + "epoch": 0.8482702744697617, + "grad_norm": 1.46099853515625, + "learning_rate": 1.11460075213614e-06, + "loss": 0.4802, + "step": 38215 + }, + { + "epoch": 0.8483812610292893, + "grad_norm": 1.0827932357788086, + "learning_rate": 1.1130015580847032e-06, + "loss": 0.5283, + "step": 38220 + }, + { + "epoch": 0.848492247588817, + "grad_norm": 1.2606589794158936, + "learning_rate": 1.111403444484731e-06, + "loss": 0.3741, + "step": 38225 + }, + { + "epoch": 0.8486032341483446, + "grad_norm": 1.5086512565612793, + "learning_rate": 1.1098064115305196e-06, + "loss": 0.3827, + "step": 38230 + }, + { + "epoch": 0.8487142207078723, + "grad_norm": 1.9647616147994995, + "learning_rate": 1.1082104594162269e-06, + "loss": 0.4122, + "step": 38235 + }, + { + "epoch": 0.8488252072674, + "grad_norm": 0.9889862537384033, + "learning_rate": 1.1066155883358877e-06, + "loss": 0.3284, + "step": 38240 + }, + { + "epoch": 0.8489361938269275, + "grad_norm": 1.1836154460906982, + "learning_rate": 1.1050217984833978e-06, + "loss": 0.4974, + "step": 38245 + }, + { + "epoch": 0.8490471803864552, + "grad_norm": 1.180612325668335, + "learning_rate": 1.1034290900525279e-06, + "loss": 0.585, + "step": 38250 + }, + { + "epoch": 0.8491581669459829, + "grad_norm": 1.3655891418457031, + "learning_rate": 1.1018374632369111e-06, + "loss": 0.3475, + "step": 38255 + }, + { + "epoch": 0.8492691535055105, + "grad_norm": 0.8092775940895081, + "learning_rate": 1.1002469182300546e-06, + "loss": 0.2952, + "step": 38260 + }, + { + "epoch": 0.8493801400650381, + "grad_norm": 1.331992268562317, + "learning_rate": 1.0986574552253282e-06, + "loss": 0.4558, + "step": 38265 + }, + { + "epoch": 0.8494911266245657, + "grad_norm": 1.3252182006835938, + "learning_rate": 1.0970690744159784e-06, + "loss": 0.2301, + "step": 38270 + }, + { + "epoch": 0.8496021131840934, + "grad_norm": 0.5097599029541016, + "learning_rate": 1.0954817759951098e-06, + "loss": 0.3065, + "step": 38275 + }, + { + "epoch": 0.8497130997436211, + "grad_norm": 1.2783838510513306, + "learning_rate": 1.0938955601557056e-06, + "loss": 0.3088, + "step": 38280 + }, + { + "epoch": 0.8498240863031487, + "grad_norm": 1.0956603288650513, + "learning_rate": 1.092310427090608e-06, + "loss": 0.3869, + "step": 38285 + }, + { + "epoch": 0.8499350728626763, + "grad_norm": 1.4360817670822144, + "learning_rate": 1.0907263769925381e-06, + "loss": 0.5542, + "step": 38290 + }, + { + "epoch": 0.850046059422204, + "grad_norm": 1.2553974390029907, + "learning_rate": 1.089143410054072e-06, + "loss": 0.3406, + "step": 38295 + }, + { + "epoch": 0.8501570459817316, + "grad_norm": 1.1232776641845703, + "learning_rate": 1.0875615264676665e-06, + "loss": 0.5081, + "step": 38300 + }, + { + "epoch": 0.8502680325412593, + "grad_norm": 1.687680959701538, + "learning_rate": 1.0859807264256428e-06, + "loss": 0.3252, + "step": 38305 + }, + { + "epoch": 0.850379019100787, + "grad_norm": 2.0268588066101074, + "learning_rate": 1.084401010120185e-06, + "loss": 0.2039, + "step": 38310 + }, + { + "epoch": 0.8504900056603145, + "grad_norm": 1.1189128160476685, + "learning_rate": 1.0828223777433534e-06, + "loss": 0.4119, + "step": 38315 + }, + { + "epoch": 0.8506009922198422, + "grad_norm": 1.8909300565719604, + "learning_rate": 1.0812448294870692e-06, + "loss": 0.5072, + "step": 38320 + }, + { + "epoch": 0.8507119787793698, + "grad_norm": 1.4315944910049438, + "learning_rate": 1.0796683655431272e-06, + "loss": 0.3991, + "step": 38325 + }, + { + "epoch": 0.8508229653388975, + "grad_norm": 1.3107892274856567, + "learning_rate": 1.078092986103192e-06, + "loss": 0.3847, + "step": 38330 + }, + { + "epoch": 0.8509339518984251, + "grad_norm": 1.8488363027572632, + "learning_rate": 1.0765186913587866e-06, + "loss": 0.2773, + "step": 38335 + }, + { + "epoch": 0.8510449384579527, + "grad_norm": 1.318145513534546, + "learning_rate": 1.074945481501314e-06, + "loss": 0.401, + "step": 38340 + }, + { + "epoch": 0.8511559250174804, + "grad_norm": 0.9807799458503723, + "learning_rate": 1.0733733567220362e-06, + "loss": 0.3081, + "step": 38345 + }, + { + "epoch": 0.8512669115770081, + "grad_norm": 1.1080820560455322, + "learning_rate": 1.0718023172120895e-06, + "loss": 0.2787, + "step": 38350 + }, + { + "epoch": 0.8513778981365356, + "grad_norm": 2.1662795543670654, + "learning_rate": 1.0702323631624723e-06, + "loss": 0.3918, + "step": 38355 + }, + { + "epoch": 0.8514888846960633, + "grad_norm": 1.3688949346542358, + "learning_rate": 1.068663494764056e-06, + "loss": 0.3132, + "step": 38360 + }, + { + "epoch": 0.851599871255591, + "grad_norm": 0.9906854033470154, + "learning_rate": 1.067095712207581e-06, + "loss": 0.2986, + "step": 38365 + }, + { + "epoch": 0.8517108578151186, + "grad_norm": 1.1726410388946533, + "learning_rate": 1.0655290156836485e-06, + "loss": 0.3745, + "step": 38370 + }, + { + "epoch": 0.8518218443746463, + "grad_norm": 0.900324285030365, + "learning_rate": 1.0639634053827363e-06, + "loss": 0.3802, + "step": 38375 + }, + { + "epoch": 0.8519328309341738, + "grad_norm": 1.3714967966079712, + "learning_rate": 1.0623988814951812e-06, + "loss": 0.2792, + "step": 38380 + }, + { + "epoch": 0.8520438174937015, + "grad_norm": 0.9182549118995667, + "learning_rate": 1.0608354442111968e-06, + "loss": 0.4102, + "step": 38385 + }, + { + "epoch": 0.8521548040532292, + "grad_norm": 1.286138892173767, + "learning_rate": 1.0592730937208618e-06, + "loss": 0.2673, + "step": 38390 + }, + { + "epoch": 0.8522657906127568, + "grad_norm": 1.5091959238052368, + "learning_rate": 1.0577118302141166e-06, + "loss": 0.4845, + "step": 38395 + }, + { + "epoch": 0.8523767771722844, + "grad_norm": 1.234449863433838, + "learning_rate": 1.0561516538807792e-06, + "loss": 0.2381, + "step": 38400 + }, + { + "epoch": 0.8524877637318121, + "grad_norm": 2.3573572635650635, + "learning_rate": 1.054592564910526e-06, + "loss": 0.3428, + "step": 38405 + }, + { + "epoch": 0.8525987502913397, + "grad_norm": 1.1645255088806152, + "learning_rate": 1.0530345634929084e-06, + "loss": 0.4389, + "step": 38410 + }, + { + "epoch": 0.8527097368508674, + "grad_norm": 1.2179172039031982, + "learning_rate": 1.0514776498173452e-06, + "loss": 0.2181, + "step": 38415 + }, + { + "epoch": 0.852820723410395, + "grad_norm": 1.5778353214263916, + "learning_rate": 1.0499218240731157e-06, + "loss": 0.5121, + "step": 38420 + }, + { + "epoch": 0.8529317099699226, + "grad_norm": 1.748390555381775, + "learning_rate": 1.0483670864493777e-06, + "loss": 0.3426, + "step": 38425 + }, + { + "epoch": 0.8530426965294503, + "grad_norm": 1.0926501750946045, + "learning_rate": 1.0468134371351445e-06, + "loss": 0.4044, + "step": 38430 + }, + { + "epoch": 0.8531536830889779, + "grad_norm": 0.5647231340408325, + "learning_rate": 1.0452608763193095e-06, + "loss": 0.3543, + "step": 38435 + }, + { + "epoch": 0.8532646696485056, + "grad_norm": 0.8507322669029236, + "learning_rate": 1.0437094041906238e-06, + "loss": 0.3336, + "step": 38440 + }, + { + "epoch": 0.8533756562080332, + "grad_norm": 1.3485618829727173, + "learning_rate": 1.0421590209377107e-06, + "loss": 0.419, + "step": 38445 + }, + { + "epoch": 0.8534866427675608, + "grad_norm": 2.003201961517334, + "learning_rate": 1.0406097267490644e-06, + "loss": 0.455, + "step": 38450 + }, + { + "epoch": 0.8535976293270885, + "grad_norm": 1.314960241317749, + "learning_rate": 1.0390615218130383e-06, + "loss": 0.4184, + "step": 38455 + }, + { + "epoch": 0.8537086158866162, + "grad_norm": 1.501952052116394, + "learning_rate": 1.037514406317861e-06, + "loss": 0.345, + "step": 38460 + }, + { + "epoch": 0.8538196024461437, + "grad_norm": 1.1983728408813477, + "learning_rate": 1.0359683804516219e-06, + "loss": 0.368, + "step": 38465 + }, + { + "epoch": 0.8539305890056714, + "grad_norm": 0.7054364085197449, + "learning_rate": 1.0344234444022872e-06, + "loss": 0.4098, + "step": 38470 + }, + { + "epoch": 0.8540415755651991, + "grad_norm": 0.7602006196975708, + "learning_rate": 1.032879598357679e-06, + "loss": 0.3234, + "step": 38475 + }, + { + "epoch": 0.8541525621247267, + "grad_norm": 1.3115113973617554, + "learning_rate": 1.0313368425054983e-06, + "loss": 0.4597, + "step": 38480 + }, + { + "epoch": 0.8542635486842544, + "grad_norm": 1.0144224166870117, + "learning_rate": 1.0297951770333037e-06, + "loss": 0.1787, + "step": 38485 + }, + { + "epoch": 0.8543745352437819, + "grad_norm": 1.2411842346191406, + "learning_rate": 1.02825460212853e-06, + "loss": 0.5086, + "step": 38490 + }, + { + "epoch": 0.8544855218033096, + "grad_norm": 1.0099564790725708, + "learning_rate": 1.0267151179784706e-06, + "loss": 0.4363, + "step": 38495 + }, + { + "epoch": 0.8545965083628373, + "grad_norm": 1.160954475402832, + "learning_rate": 1.025176724770295e-06, + "loss": 0.2975, + "step": 38500 + }, + { + "epoch": 0.8547074949223649, + "grad_norm": 1.771726369857788, + "learning_rate": 1.0236394226910329e-06, + "loss": 0.3246, + "step": 38505 + }, + { + "epoch": 0.8548184814818925, + "grad_norm": 1.7871524095535278, + "learning_rate": 1.0221032119275864e-06, + "loss": 0.4353, + "step": 38510 + }, + { + "epoch": 0.8549294680414202, + "grad_norm": 0.8769774436950684, + "learning_rate": 1.0205680926667194e-06, + "loss": 0.4317, + "step": 38515 + }, + { + "epoch": 0.8550404546009478, + "grad_norm": 1.4010024070739746, + "learning_rate": 1.0190340650950726e-06, + "loss": 0.2572, + "step": 38520 + }, + { + "epoch": 0.8551514411604755, + "grad_norm": 1.1399223804473877, + "learning_rate": 1.017501129399141e-06, + "loss": 0.3274, + "step": 38525 + }, + { + "epoch": 0.8552624277200032, + "grad_norm": 1.0759845972061157, + "learning_rate": 1.015969285765297e-06, + "loss": 0.3051, + "step": 38530 + }, + { + "epoch": 0.8553734142795307, + "grad_norm": 1.2395176887512207, + "learning_rate": 1.0144385343797801e-06, + "loss": 0.3784, + "step": 38535 + }, + { + "epoch": 0.8554844008390584, + "grad_norm": 2.213679790496826, + "learning_rate": 1.0129088754286886e-06, + "loss": 0.4289, + "step": 38540 + }, + { + "epoch": 0.855595387398586, + "grad_norm": 0.756289005279541, + "learning_rate": 1.011380309097998e-06, + "loss": 0.2351, + "step": 38545 + }, + { + "epoch": 0.8557063739581137, + "grad_norm": 0.951809823513031, + "learning_rate": 1.0098528355735414e-06, + "loss": 0.2969, + "step": 38550 + }, + { + "epoch": 0.8558173605176413, + "grad_norm": 0.8504477143287659, + "learning_rate": 1.0083264550410266e-06, + "loss": 0.4501, + "step": 38555 + }, + { + "epoch": 0.8559283470771689, + "grad_norm": 1.6416808366775513, + "learning_rate": 1.0068011676860278e-06, + "loss": 0.43, + "step": 38560 + }, + { + "epoch": 0.8560393336366966, + "grad_norm": 1.2744433879852295, + "learning_rate": 1.0052769736939793e-06, + "loss": 0.4125, + "step": 38565 + }, + { + "epoch": 0.8561503201962243, + "grad_norm": 0.7917124032974243, + "learning_rate": 1.003753873250194e-06, + "loss": 0.4791, + "step": 38570 + }, + { + "epoch": 0.8562613067557518, + "grad_norm": 1.0418739318847656, + "learning_rate": 1.002231866539839e-06, + "loss": 0.3657, + "step": 38575 + }, + { + "epoch": 0.8563722933152795, + "grad_norm": 1.2840015888214111, + "learning_rate": 1.0007109537479564e-06, + "loss": 0.3881, + "step": 38580 + }, + { + "epoch": 0.8564832798748072, + "grad_norm": 1.4934595823287964, + "learning_rate": 9.991911350594586e-07, + "loss": 0.3519, + "step": 38585 + }, + { + "epoch": 0.8565942664343348, + "grad_norm": 1.3786568641662598, + "learning_rate": 9.976724106591128e-07, + "loss": 0.3896, + "step": 38590 + }, + { + "epoch": 0.8567052529938625, + "grad_norm": 1.0313783884048462, + "learning_rate": 9.961547807315664e-07, + "loss": 0.2541, + "step": 38595 + }, + { + "epoch": 0.85681623955339, + "grad_norm": 1.1191176176071167, + "learning_rate": 9.946382454613235e-07, + "loss": 0.5072, + "step": 38600 + }, + { + "epoch": 0.8569272261129177, + "grad_norm": 1.5176087617874146, + "learning_rate": 9.931228050327623e-07, + "loss": 0.4345, + "step": 38605 + }, + { + "epoch": 0.8570382126724454, + "grad_norm": 0.8821091651916504, + "learning_rate": 9.916084596301223e-07, + "loss": 0.4797, + "step": 38610 + }, + { + "epoch": 0.857149199231973, + "grad_norm": 1.2651344537734985, + "learning_rate": 9.90095209437515e-07, + "loss": 0.4668, + "step": 38615 + }, + { + "epoch": 0.8572601857915007, + "grad_norm": 1.8215205669403076, + "learning_rate": 9.885830546389151e-07, + "loss": 0.5254, + "step": 38620 + }, + { + "epoch": 0.8573711723510283, + "grad_norm": 1.7566852569580078, + "learning_rate": 9.870719954181651e-07, + "loss": 0.4328, + "step": 38625 + }, + { + "epoch": 0.8574821589105559, + "grad_norm": 1.0727424621582031, + "learning_rate": 9.855620319589766e-07, + "loss": 0.2483, + "step": 38630 + }, + { + "epoch": 0.8575931454700836, + "grad_norm": 1.2513246536254883, + "learning_rate": 9.840531644449214e-07, + "loss": 0.4956, + "step": 38635 + }, + { + "epoch": 0.8577041320296113, + "grad_norm": 1.3015859127044678, + "learning_rate": 9.825453930594464e-07, + "loss": 0.2856, + "step": 38640 + }, + { + "epoch": 0.8578151185891388, + "grad_norm": 1.3504365682601929, + "learning_rate": 9.810387179858616e-07, + "loss": 0.4944, + "step": 38645 + }, + { + "epoch": 0.8579261051486665, + "grad_norm": 0.6242560744285583, + "learning_rate": 9.795331394073404e-07, + "loss": 0.3423, + "step": 38650 + }, + { + "epoch": 0.8580370917081941, + "grad_norm": 1.6594665050506592, + "learning_rate": 9.780286575069298e-07, + "loss": 0.2734, + "step": 38655 + }, + { + "epoch": 0.8581480782677218, + "grad_norm": 1.602768063545227, + "learning_rate": 9.765252724675356e-07, + "loss": 0.5262, + "step": 38660 + }, + { + "epoch": 0.8582590648272495, + "grad_norm": 1.629512071609497, + "learning_rate": 9.750229844719362e-07, + "loss": 0.4276, + "step": 38665 + }, + { + "epoch": 0.858370051386777, + "grad_norm": 1.0607631206512451, + "learning_rate": 9.735217937027774e-07, + "loss": 0.2883, + "step": 38670 + }, + { + "epoch": 0.8584810379463047, + "grad_norm": 1.34122896194458, + "learning_rate": 9.720217003425648e-07, + "loss": 0.3398, + "step": 38675 + }, + { + "epoch": 0.8585920245058324, + "grad_norm": 0.9040728211402893, + "learning_rate": 9.705227045736777e-07, + "loss": 0.4113, + "step": 38680 + }, + { + "epoch": 0.85870301106536, + "grad_norm": 0.8782297968864441, + "learning_rate": 9.69024806578356e-07, + "loss": 0.389, + "step": 38685 + }, + { + "epoch": 0.8588139976248876, + "grad_norm": 1.0483667850494385, + "learning_rate": 9.675280065387117e-07, + "loss": 0.4214, + "step": 38690 + }, + { + "epoch": 0.8589249841844153, + "grad_norm": 1.4147143363952637, + "learning_rate": 9.660323046367193e-07, + "loss": 0.3655, + "step": 38695 + }, + { + "epoch": 0.8590359707439429, + "grad_norm": 1.479657530784607, + "learning_rate": 9.645377010542212e-07, + "loss": 0.3764, + "step": 38700 + }, + { + "epoch": 0.8591469573034706, + "grad_norm": 2.3263349533081055, + "learning_rate": 9.630441959729286e-07, + "loss": 0.4449, + "step": 38705 + }, + { + "epoch": 0.8592579438629981, + "grad_norm": 1.145320177078247, + "learning_rate": 9.61551789574413e-07, + "loss": 0.292, + "step": 38710 + }, + { + "epoch": 0.8593689304225258, + "grad_norm": 1.9598171710968018, + "learning_rate": 9.600604820401205e-07, + "loss": 0.3929, + "step": 38715 + }, + { + "epoch": 0.8594799169820535, + "grad_norm": 0.999853789806366, + "learning_rate": 9.585702735513546e-07, + "loss": 0.2936, + "step": 38720 + }, + { + "epoch": 0.8595909035415811, + "grad_norm": 0.9991888403892517, + "learning_rate": 9.570811642892952e-07, + "loss": 0.3136, + "step": 38725 + }, + { + "epoch": 0.8597018901011088, + "grad_norm": 1.3448801040649414, + "learning_rate": 9.555931544349772e-07, + "loss": 0.3744, + "step": 38730 + }, + { + "epoch": 0.8598128766606364, + "grad_norm": 1.1844966411590576, + "learning_rate": 9.54106244169314e-07, + "loss": 0.4347, + "step": 38735 + }, + { + "epoch": 0.859923863220164, + "grad_norm": 1.6972241401672363, + "learning_rate": 9.526204336730727e-07, + "loss": 0.3532, + "step": 38740 + }, + { + "epoch": 0.8600348497796917, + "grad_norm": 1.1772350072860718, + "learning_rate": 9.511357231268992e-07, + "loss": 0.3568, + "step": 38745 + }, + { + "epoch": 0.8601458363392194, + "grad_norm": 1.950363278388977, + "learning_rate": 9.496521127112956e-07, + "loss": 0.3482, + "step": 38750 + }, + { + "epoch": 0.8602568228987469, + "grad_norm": 0.92097008228302, + "learning_rate": 9.481696026066367e-07, + "loss": 0.2089, + "step": 38755 + }, + { + "epoch": 0.8603678094582746, + "grad_norm": 1.196353793144226, + "learning_rate": 9.466881929931582e-07, + "loss": 0.5417, + "step": 38760 + }, + { + "epoch": 0.8604787960178022, + "grad_norm": 1.7988780736923218, + "learning_rate": 9.452078840509693e-07, + "loss": 0.2771, + "step": 38765 + }, + { + "epoch": 0.8605897825773299, + "grad_norm": 1.4759079217910767, + "learning_rate": 9.43728675960035e-07, + "loss": 0.4198, + "step": 38770 + }, + { + "epoch": 0.8607007691368576, + "grad_norm": 1.9383196830749512, + "learning_rate": 9.422505689001993e-07, + "loss": 0.539, + "step": 38775 + }, + { + "epoch": 0.8608117556963851, + "grad_norm": 0.9065805673599243, + "learning_rate": 9.407735630511594e-07, + "loss": 0.4382, + "step": 38780 + }, + { + "epoch": 0.8609227422559128, + "grad_norm": 0.8202492594718933, + "learning_rate": 9.392976585924885e-07, + "loss": 0.4727, + "step": 38785 + }, + { + "epoch": 0.8610337288154405, + "grad_norm": 1.5601189136505127, + "learning_rate": 9.378228557036217e-07, + "loss": 0.497, + "step": 38790 + }, + { + "epoch": 0.8611447153749681, + "grad_norm": 0.7903019785881042, + "learning_rate": 9.363491545638592e-07, + "loss": 0.362, + "step": 38795 + }, + { + "epoch": 0.8612557019344957, + "grad_norm": 0.5757574439048767, + "learning_rate": 9.348765553523697e-07, + "loss": 0.4147, + "step": 38800 + }, + { + "epoch": 0.8613666884940234, + "grad_norm": 0.9555240273475647, + "learning_rate": 9.334050582481857e-07, + "loss": 0.3742, + "step": 38805 + }, + { + "epoch": 0.861477675053551, + "grad_norm": 0.8194215297698975, + "learning_rate": 9.319346634302084e-07, + "loss": 0.2755, + "step": 38810 + }, + { + "epoch": 0.8615886616130787, + "grad_norm": 1.1851353645324707, + "learning_rate": 9.304653710772038e-07, + "loss": 0.3956, + "step": 38815 + }, + { + "epoch": 0.8616996481726062, + "grad_norm": 1.3655849695205688, + "learning_rate": 9.289971813678001e-07, + "loss": 0.5262, + "step": 38820 + }, + { + "epoch": 0.8618106347321339, + "grad_norm": 1.137489676475525, + "learning_rate": 9.275300944804999e-07, + "loss": 0.3999, + "step": 38825 + }, + { + "epoch": 0.8619216212916616, + "grad_norm": 1.7556220293045044, + "learning_rate": 9.260641105936618e-07, + "loss": 0.5324, + "step": 38830 + }, + { + "epoch": 0.8620326078511892, + "grad_norm": 1.0561107397079468, + "learning_rate": 9.245992298855177e-07, + "loss": 0.3947, + "step": 38835 + }, + { + "epoch": 0.8621435944107169, + "grad_norm": 0.8867374658584595, + "learning_rate": 9.231354525341652e-07, + "loss": 0.4675, + "step": 38840 + }, + { + "epoch": 0.8622545809702445, + "grad_norm": 1.4009345769882202, + "learning_rate": 9.216727787175605e-07, + "loss": 0.3704, + "step": 38845 + }, + { + "epoch": 0.8623655675297721, + "grad_norm": 1.6466008424758911, + "learning_rate": 9.202112086135351e-07, + "loss": 0.356, + "step": 38850 + }, + { + "epoch": 0.8624765540892998, + "grad_norm": 1.4668787717819214, + "learning_rate": 9.187507423997777e-07, + "loss": 0.3356, + "step": 38855 + }, + { + "epoch": 0.8625875406488275, + "grad_norm": 1.3413883447647095, + "learning_rate": 9.172913802538508e-07, + "loss": 0.3936, + "step": 38860 + }, + { + "epoch": 0.862698527208355, + "grad_norm": 0.8753415942192078, + "learning_rate": 9.158331223531747e-07, + "loss": 0.2515, + "step": 38865 + }, + { + "epoch": 0.8628095137678827, + "grad_norm": 0.8159883618354797, + "learning_rate": 9.143759688750419e-07, + "loss": 0.3006, + "step": 38870 + }, + { + "epoch": 0.8629205003274103, + "grad_norm": 1.0602672100067139, + "learning_rate": 9.129199199966099e-07, + "loss": 0.4705, + "step": 38875 + }, + { + "epoch": 0.863031486886938, + "grad_norm": 0.6323870420455933, + "learning_rate": 9.114649758948967e-07, + "loss": 0.4241, + "step": 38880 + }, + { + "epoch": 0.8631424734464657, + "grad_norm": 1.046294093132019, + "learning_rate": 9.100111367467923e-07, + "loss": 0.3208, + "step": 38885 + }, + { + "epoch": 0.8632534600059932, + "grad_norm": 0.940803587436676, + "learning_rate": 9.085584027290472e-07, + "loss": 0.358, + "step": 38890 + }, + { + "epoch": 0.8633644465655209, + "grad_norm": 1.8877294063568115, + "learning_rate": 9.071067740182815e-07, + "loss": 0.2846, + "step": 38895 + }, + { + "epoch": 0.8634754331250486, + "grad_norm": 1.3504570722579956, + "learning_rate": 9.056562507909805e-07, + "loss": 0.3206, + "step": 38900 + }, + { + "epoch": 0.8635864196845762, + "grad_norm": 1.4714055061340332, + "learning_rate": 9.042068332234899e-07, + "loss": 0.5496, + "step": 38905 + }, + { + "epoch": 0.8636974062441038, + "grad_norm": 1.0898418426513672, + "learning_rate": 9.027585214920298e-07, + "loss": 0.4504, + "step": 38910 + }, + { + "epoch": 0.8638083928036315, + "grad_norm": 1.6247862577438354, + "learning_rate": 9.013113157726771e-07, + "loss": 0.3826, + "step": 38915 + }, + { + "epoch": 0.8639193793631591, + "grad_norm": 1.524096131324768, + "learning_rate": 8.998652162413801e-07, + "loss": 0.3156, + "step": 38920 + }, + { + "epoch": 0.8640303659226868, + "grad_norm": 1.3673064708709717, + "learning_rate": 8.984202230739536e-07, + "loss": 0.3662, + "step": 38925 + }, + { + "epoch": 0.8641413524822144, + "grad_norm": 1.0908459424972534, + "learning_rate": 8.969763364460682e-07, + "loss": 0.4091, + "step": 38930 + }, + { + "epoch": 0.864252339041742, + "grad_norm": 1.3286373615264893, + "learning_rate": 8.955335565332734e-07, + "loss": 0.4986, + "step": 38935 + }, + { + "epoch": 0.8643633256012697, + "grad_norm": 1.986507534980774, + "learning_rate": 8.940918835109735e-07, + "loss": 0.5036, + "step": 38940 + }, + { + "epoch": 0.8644743121607973, + "grad_norm": 0.984667956829071, + "learning_rate": 8.926513175544448e-07, + "loss": 0.4329, + "step": 38945 + }, + { + "epoch": 0.864585298720325, + "grad_norm": 0.7082566022872925, + "learning_rate": 8.91211858838823e-07, + "loss": 0.4022, + "step": 38950 + }, + { + "epoch": 0.8646962852798526, + "grad_norm": 1.6974024772644043, + "learning_rate": 8.897735075391156e-07, + "loss": 0.4663, + "step": 38955 + }, + { + "epoch": 0.8648072718393802, + "grad_norm": 1.3038138151168823, + "learning_rate": 8.88336263830194e-07, + "loss": 0.4664, + "step": 38960 + }, + { + "epoch": 0.8649182583989079, + "grad_norm": 1.2903189659118652, + "learning_rate": 8.869001278867884e-07, + "loss": 0.4917, + "step": 38965 + }, + { + "epoch": 0.8650292449584356, + "grad_norm": 1.3441555500030518, + "learning_rate": 8.854650998835046e-07, + "loss": 0.3813, + "step": 38970 + }, + { + "epoch": 0.8651402315179632, + "grad_norm": 1.2959764003753662, + "learning_rate": 8.840311799948042e-07, + "loss": 0.551, + "step": 38975 + }, + { + "epoch": 0.8652512180774908, + "grad_norm": 0.7261309027671814, + "learning_rate": 8.825983683950223e-07, + "loss": 0.318, + "step": 38980 + }, + { + "epoch": 0.8653622046370184, + "grad_norm": 1.0499922037124634, + "learning_rate": 8.811666652583517e-07, + "loss": 0.482, + "step": 38985 + }, + { + "epoch": 0.8654731911965461, + "grad_norm": 1.1763601303100586, + "learning_rate": 8.797360707588576e-07, + "loss": 0.5707, + "step": 38990 + }, + { + "epoch": 0.8655841777560738, + "grad_norm": 0.9862807393074036, + "learning_rate": 8.783065850704631e-07, + "loss": 0.4042, + "step": 38995 + }, + { + "epoch": 0.8656951643156013, + "grad_norm": 0.8914280533790588, + "learning_rate": 8.768782083669647e-07, + "loss": 0.4066, + "step": 39000 + }, + { + "epoch": 0.865806150875129, + "grad_norm": 1.3279800415039062, + "learning_rate": 8.754509408220146e-07, + "loss": 0.5931, + "step": 39005 + }, + { + "epoch": 0.8659171374346567, + "grad_norm": 1.4973981380462646, + "learning_rate": 8.740247826091397e-07, + "loss": 0.4384, + "step": 39010 + }, + { + "epoch": 0.8660281239941843, + "grad_norm": 1.2501366138458252, + "learning_rate": 8.725997339017233e-07, + "loss": 0.3055, + "step": 39015 + }, + { + "epoch": 0.866139110553712, + "grad_norm": 1.1514090299606323, + "learning_rate": 8.711757948730227e-07, + "loss": 0.4909, + "step": 39020 + }, + { + "epoch": 0.8662500971132396, + "grad_norm": 1.296980857849121, + "learning_rate": 8.697529656961512e-07, + "loss": 0.4126, + "step": 39025 + }, + { + "epoch": 0.8663610836727672, + "grad_norm": 0.9663153290748596, + "learning_rate": 8.683312465440952e-07, + "loss": 0.3933, + "step": 39030 + }, + { + "epoch": 0.8664720702322949, + "grad_norm": 1.295825719833374, + "learning_rate": 8.669106375896996e-07, + "loss": 0.3961, + "step": 39035 + }, + { + "epoch": 0.8665830567918225, + "grad_norm": 1.4817475080490112, + "learning_rate": 8.654911390056786e-07, + "loss": 0.5709, + "step": 39040 + }, + { + "epoch": 0.8666940433513501, + "grad_norm": 1.2556267976760864, + "learning_rate": 8.640727509646119e-07, + "loss": 0.4379, + "step": 39045 + }, + { + "epoch": 0.8668050299108778, + "grad_norm": 0.8033402562141418, + "learning_rate": 8.626554736389393e-07, + "loss": 0.399, + "step": 39050 + }, + { + "epoch": 0.8669160164704054, + "grad_norm": 0.9582207798957825, + "learning_rate": 8.612393072009706e-07, + "loss": 0.2177, + "step": 39055 + }, + { + "epoch": 0.8670270030299331, + "grad_norm": 0.9874535799026489, + "learning_rate": 8.598242518228773e-07, + "loss": 0.5628, + "step": 39060 + }, + { + "epoch": 0.8671379895894608, + "grad_norm": 1.2723040580749512, + "learning_rate": 8.58410307676697e-07, + "loss": 0.4486, + "step": 39065 + }, + { + "epoch": 0.8672489761489883, + "grad_norm": 1.10847008228302, + "learning_rate": 8.569974749343357e-07, + "loss": 0.441, + "step": 39070 + }, + { + "epoch": 0.867359962708516, + "grad_norm": 0.913597822189331, + "learning_rate": 8.555857537675549e-07, + "loss": 0.3791, + "step": 39075 + }, + { + "epoch": 0.8674709492680437, + "grad_norm": 1.1242194175720215, + "learning_rate": 8.541751443479928e-07, + "loss": 0.3358, + "step": 39080 + }, + { + "epoch": 0.8675819358275713, + "grad_norm": 1.259722113609314, + "learning_rate": 8.527656468471423e-07, + "loss": 0.4287, + "step": 39085 + }, + { + "epoch": 0.8676929223870989, + "grad_norm": 0.994596004486084, + "learning_rate": 8.513572614363674e-07, + "loss": 0.5339, + "step": 39090 + }, + { + "epoch": 0.8678039089466265, + "grad_norm": 1.569893717765808, + "learning_rate": 8.499499882868955e-07, + "loss": 0.3643, + "step": 39095 + }, + { + "epoch": 0.8679148955061542, + "grad_norm": 1.5473707914352417, + "learning_rate": 8.485438275698154e-07, + "loss": 0.433, + "step": 39100 + }, + { + "epoch": 0.8680258820656819, + "grad_norm": 1.753111720085144, + "learning_rate": 8.47138779456087e-07, + "loss": 0.3161, + "step": 39105 + }, + { + "epoch": 0.8681368686252094, + "grad_norm": 0.9786942601203918, + "learning_rate": 8.457348441165281e-07, + "loss": 0.4065, + "step": 39110 + }, + { + "epoch": 0.8682478551847371, + "grad_norm": 1.395298957824707, + "learning_rate": 8.443320217218254e-07, + "loss": 0.4125, + "step": 39115 + }, + { + "epoch": 0.8683588417442648, + "grad_norm": 1.0039939880371094, + "learning_rate": 8.429303124425315e-07, + "loss": 0.4161, + "step": 39120 + }, + { + "epoch": 0.8684698283037924, + "grad_norm": 1.6132234334945679, + "learning_rate": 8.415297164490577e-07, + "loss": 0.417, + "step": 39125 + }, + { + "epoch": 0.8685808148633201, + "grad_norm": 1.0659582614898682, + "learning_rate": 8.40130233911689e-07, + "loss": 0.3051, + "step": 39130 + }, + { + "epoch": 0.8686918014228477, + "grad_norm": 0.9651395678520203, + "learning_rate": 8.387318650005638e-07, + "loss": 0.4814, + "step": 39135 + }, + { + "epoch": 0.8688027879823753, + "grad_norm": 0.6835569143295288, + "learning_rate": 8.373346098856961e-07, + "loss": 0.3561, + "step": 39140 + }, + { + "epoch": 0.868913774541903, + "grad_norm": 1.12702214717865, + "learning_rate": 8.359384687369554e-07, + "loss": 0.4681, + "step": 39145 + }, + { + "epoch": 0.8690247611014306, + "grad_norm": 0.813792884349823, + "learning_rate": 8.345434417240816e-07, + "loss": 0.3715, + "step": 39150 + }, + { + "epoch": 0.8691357476609582, + "grad_norm": 0.6443630456924438, + "learning_rate": 8.331495290166791e-07, + "loss": 0.3748, + "step": 39155 + }, + { + "epoch": 0.8692467342204859, + "grad_norm": 1.2461071014404297, + "learning_rate": 8.317567307842123e-07, + "loss": 0.467, + "step": 39160 + }, + { + "epoch": 0.8693577207800135, + "grad_norm": 1.268563151359558, + "learning_rate": 8.303650471960157e-07, + "loss": 0.3786, + "step": 39165 + }, + { + "epoch": 0.8694687073395412, + "grad_norm": 1.2659401893615723, + "learning_rate": 8.289744784212827e-07, + "loss": 0.4257, + "step": 39170 + }, + { + "epoch": 0.8695796938990689, + "grad_norm": 1.381678819656372, + "learning_rate": 8.275850246290762e-07, + "loss": 0.4126, + "step": 39175 + }, + { + "epoch": 0.8696906804585964, + "grad_norm": 1.0406432151794434, + "learning_rate": 8.261966859883208e-07, + "loss": 0.4649, + "step": 39180 + }, + { + "epoch": 0.8698016670181241, + "grad_norm": 1.262363076210022, + "learning_rate": 8.24809462667805e-07, + "loss": 0.3267, + "step": 39185 + }, + { + "epoch": 0.8699126535776518, + "grad_norm": 1.6432816982269287, + "learning_rate": 8.234233548361847e-07, + "loss": 0.296, + "step": 39190 + }, + { + "epoch": 0.8700236401371794, + "grad_norm": 0.5735118389129639, + "learning_rate": 8.220383626619755e-07, + "loss": 0.2755, + "step": 39195 + }, + { + "epoch": 0.870134626696707, + "grad_norm": 0.5151550769805908, + "learning_rate": 8.206544863135612e-07, + "loss": 0.3769, + "step": 39200 + }, + { + "epoch": 0.8702456132562346, + "grad_norm": 1.830293893814087, + "learning_rate": 8.19271725959192e-07, + "loss": 0.3203, + "step": 39205 + }, + { + "epoch": 0.8703565998157623, + "grad_norm": 1.3666870594024658, + "learning_rate": 8.178900817669744e-07, + "loss": 0.3906, + "step": 39210 + }, + { + "epoch": 0.87046758637529, + "grad_norm": 1.4511600732803345, + "learning_rate": 8.165095539048884e-07, + "loss": 0.4858, + "step": 39215 + }, + { + "epoch": 0.8705785729348176, + "grad_norm": 1.1984554529190063, + "learning_rate": 8.151301425407699e-07, + "loss": 0.3846, + "step": 39220 + }, + { + "epoch": 0.8706895594943452, + "grad_norm": 0.7747187614440918, + "learning_rate": 8.137518478423256e-07, + "loss": 0.3232, + "step": 39225 + }, + { + "epoch": 0.8708005460538729, + "grad_norm": 1.4917807579040527, + "learning_rate": 8.123746699771229e-07, + "loss": 0.2611, + "step": 39230 + }, + { + "epoch": 0.8709115326134005, + "grad_norm": 1.4434287548065186, + "learning_rate": 8.109986091125965e-07, + "loss": 0.3076, + "step": 39235 + }, + { + "epoch": 0.8710225191729282, + "grad_norm": 1.3788912296295166, + "learning_rate": 8.096236654160394e-07, + "loss": 0.2503, + "step": 39240 + }, + { + "epoch": 0.8711335057324558, + "grad_norm": 1.336180329322815, + "learning_rate": 8.082498390546178e-07, + "loss": 0.5352, + "step": 39245 + }, + { + "epoch": 0.8712444922919834, + "grad_norm": 1.1848613023757935, + "learning_rate": 8.068771301953515e-07, + "loss": 0.5586, + "step": 39250 + }, + { + "epoch": 0.8713554788515111, + "grad_norm": 1.0970314741134644, + "learning_rate": 8.055055390051336e-07, + "loss": 0.2981, + "step": 39255 + }, + { + "epoch": 0.8714664654110387, + "grad_norm": 1.1610400676727295, + "learning_rate": 8.041350656507152e-07, + "loss": 0.3591, + "step": 39260 + }, + { + "epoch": 0.8715774519705664, + "grad_norm": 1.0197187662124634, + "learning_rate": 8.027657102987163e-07, + "loss": 0.3865, + "step": 39265 + }, + { + "epoch": 0.871688438530094, + "grad_norm": 0.8454812169075012, + "learning_rate": 8.01397473115616e-07, + "loss": 0.2558, + "step": 39270 + }, + { + "epoch": 0.8717994250896216, + "grad_norm": 1.5173369646072388, + "learning_rate": 8.000303542677635e-07, + "loss": 0.3468, + "step": 39275 + }, + { + "epoch": 0.8719104116491493, + "grad_norm": 1.057705283164978, + "learning_rate": 7.986643539213634e-07, + "loss": 0.3252, + "step": 39280 + }, + { + "epoch": 0.872021398208677, + "grad_norm": 0.6779467463493347, + "learning_rate": 7.97299472242492e-07, + "loss": 0.459, + "step": 39285 + }, + { + "epoch": 0.8721323847682045, + "grad_norm": 0.7705076336860657, + "learning_rate": 7.959357093970899e-07, + "loss": 0.3955, + "step": 39290 + }, + { + "epoch": 0.8722433713277322, + "grad_norm": 0.6741588115692139, + "learning_rate": 7.945730655509543e-07, + "loss": 0.3321, + "step": 39295 + }, + { + "epoch": 0.8723543578872599, + "grad_norm": 1.1833090782165527, + "learning_rate": 7.932115408697549e-07, + "loss": 0.35, + "step": 39300 + }, + { + "epoch": 0.8724653444467875, + "grad_norm": 1.345542073249817, + "learning_rate": 7.918511355190173e-07, + "loss": 0.4641, + "step": 39305 + }, + { + "epoch": 0.8725763310063152, + "grad_norm": 1.673676609992981, + "learning_rate": 7.904918496641379e-07, + "loss": 0.3948, + "step": 39310 + }, + { + "epoch": 0.8726873175658427, + "grad_norm": 3.6500816345214844, + "learning_rate": 7.891336834703722e-07, + "loss": 0.4197, + "step": 39315 + }, + { + "epoch": 0.8727983041253704, + "grad_norm": 1.4190216064453125, + "learning_rate": 7.877766371028417e-07, + "loss": 0.455, + "step": 39320 + }, + { + "epoch": 0.8729092906848981, + "grad_norm": 2.2108571529388428, + "learning_rate": 7.864207107265342e-07, + "loss": 0.421, + "step": 39325 + }, + { + "epoch": 0.8730202772444257, + "grad_norm": 1.1898353099822998, + "learning_rate": 7.850659045062958e-07, + "loss": 0.466, + "step": 39330 + }, + { + "epoch": 0.8731312638039533, + "grad_norm": 1.808774709701538, + "learning_rate": 7.837122186068414e-07, + "loss": 0.357, + "step": 39335 + }, + { + "epoch": 0.873242250363481, + "grad_norm": 0.5657018423080444, + "learning_rate": 7.823596531927447e-07, + "loss": 0.3987, + "step": 39340 + }, + { + "epoch": 0.8733532369230086, + "grad_norm": 1.325925588607788, + "learning_rate": 7.810082084284476e-07, + "loss": 0.4304, + "step": 39345 + }, + { + "epoch": 0.8734642234825363, + "grad_norm": 0.9428622722625732, + "learning_rate": 7.796578844782554e-07, + "loss": 0.3992, + "step": 39350 + }, + { + "epoch": 0.873575210042064, + "grad_norm": 0.9125751256942749, + "learning_rate": 7.783086815063346e-07, + "loss": 0.4402, + "step": 39355 + }, + { + "epoch": 0.8736861966015915, + "grad_norm": 1.4098918437957764, + "learning_rate": 7.769605996767182e-07, + "loss": 0.4035, + "step": 39360 + }, + { + "epoch": 0.8737971831611192, + "grad_norm": 1.6082866191864014, + "learning_rate": 7.756136391532998e-07, + "loss": 0.3726, + "step": 39365 + }, + { + "epoch": 0.8739081697206468, + "grad_norm": 0.6247255802154541, + "learning_rate": 7.742678000998372e-07, + "loss": 0.4634, + "step": 39370 + }, + { + "epoch": 0.8740191562801745, + "grad_norm": 1.5552891492843628, + "learning_rate": 7.729230826799583e-07, + "loss": 0.443, + "step": 39375 + }, + { + "epoch": 0.8741301428397021, + "grad_norm": 1.285137414932251, + "learning_rate": 7.715794870571425e-07, + "loss": 0.3397, + "step": 39380 + }, + { + "epoch": 0.8742411293992297, + "grad_norm": 1.1897644996643066, + "learning_rate": 7.702370133947457e-07, + "loss": 0.3414, + "step": 39385 + }, + { + "epoch": 0.8743521159587574, + "grad_norm": 1.2517973184585571, + "learning_rate": 7.688956618559762e-07, + "loss": 0.1984, + "step": 39390 + }, + { + "epoch": 0.8744631025182851, + "grad_norm": 1.8219321966171265, + "learning_rate": 7.675554326039158e-07, + "loss": 0.4278, + "step": 39395 + }, + { + "epoch": 0.8745740890778126, + "grad_norm": 0.8656405806541443, + "learning_rate": 7.66216325801501e-07, + "loss": 0.4873, + "step": 39400 + }, + { + "epoch": 0.8746850756373403, + "grad_norm": 1.532086730003357, + "learning_rate": 7.648783416115369e-07, + "loss": 0.4338, + "step": 39405 + }, + { + "epoch": 0.874796062196868, + "grad_norm": 0.9961473941802979, + "learning_rate": 7.635414801966934e-07, + "loss": 0.3696, + "step": 39410 + }, + { + "epoch": 0.8749070487563956, + "grad_norm": 1.5317226648330688, + "learning_rate": 7.622057417194995e-07, + "loss": 0.346, + "step": 39415 + }, + { + "epoch": 0.8750180353159233, + "grad_norm": 1.653637409210205, + "learning_rate": 7.608711263423507e-07, + "loss": 0.4753, + "step": 39420 + }, + { + "epoch": 0.8751290218754508, + "grad_norm": 0.7220828533172607, + "learning_rate": 7.595376342275041e-07, + "loss": 0.3362, + "step": 39425 + }, + { + "epoch": 0.8752400084349785, + "grad_norm": 2.434098958969116, + "learning_rate": 7.582052655370809e-07, + "loss": 0.4546, + "step": 39430 + }, + { + "epoch": 0.8753509949945062, + "grad_norm": 1.1487387418746948, + "learning_rate": 7.568740204330693e-07, + "loss": 0.2843, + "step": 39435 + }, + { + "epoch": 0.8754619815540338, + "grad_norm": 1.684936285018921, + "learning_rate": 7.555438990773134e-07, + "loss": 0.3151, + "step": 39440 + }, + { + "epoch": 0.8755729681135614, + "grad_norm": 0.7425820231437683, + "learning_rate": 7.542149016315292e-07, + "loss": 0.3909, + "step": 39445 + }, + { + "epoch": 0.8756839546730891, + "grad_norm": 1.5853488445281982, + "learning_rate": 7.528870282572864e-07, + "loss": 0.263, + "step": 39450 + }, + { + "epoch": 0.8757949412326167, + "grad_norm": 1.7768914699554443, + "learning_rate": 7.515602791160281e-07, + "loss": 0.4729, + "step": 39455 + }, + { + "epoch": 0.8759059277921444, + "grad_norm": 1.2601383924484253, + "learning_rate": 7.502346543690531e-07, + "loss": 0.3555, + "step": 39460 + }, + { + "epoch": 0.8760169143516721, + "grad_norm": 1.3894011974334717, + "learning_rate": 7.48910154177529e-07, + "loss": 0.3625, + "step": 39465 + }, + { + "epoch": 0.8761279009111996, + "grad_norm": 0.9332782626152039, + "learning_rate": 7.475867787024815e-07, + "loss": 0.3834, + "step": 39470 + }, + { + "epoch": 0.8762388874707273, + "grad_norm": 2.1381521224975586, + "learning_rate": 7.462645281048043e-07, + "loss": 0.2857, + "step": 39475 + }, + { + "epoch": 0.8763498740302549, + "grad_norm": 0.9826429486274719, + "learning_rate": 7.449434025452496e-07, + "loss": 0.4026, + "step": 39480 + }, + { + "epoch": 0.8764608605897826, + "grad_norm": 1.0933654308319092, + "learning_rate": 7.43623402184438e-07, + "loss": 0.4764, + "step": 39485 + }, + { + "epoch": 0.8765718471493102, + "grad_norm": 0.9672232270240784, + "learning_rate": 7.423045271828489e-07, + "loss": 0.3762, + "step": 39490 + }, + { + "epoch": 0.8766828337088378, + "grad_norm": 0.4871096909046173, + "learning_rate": 7.409867777008295e-07, + "loss": 0.4321, + "step": 39495 + }, + { + "epoch": 0.8767938202683655, + "grad_norm": 1.155806541442871, + "learning_rate": 7.396701538985829e-07, + "loss": 0.3544, + "step": 39500 + }, + { + "epoch": 0.8769048068278932, + "grad_norm": 1.418287992477417, + "learning_rate": 7.383546559361843e-07, + "loss": 0.3252, + "step": 39505 + }, + { + "epoch": 0.8770157933874208, + "grad_norm": 0.6687827706336975, + "learning_rate": 7.370402839735635e-07, + "loss": 0.4073, + "step": 39510 + }, + { + "epoch": 0.8771267799469484, + "grad_norm": 1.328175663948059, + "learning_rate": 7.357270381705195e-07, + "loss": 0.3662, + "step": 39515 + }, + { + "epoch": 0.8772377665064761, + "grad_norm": 1.973333477973938, + "learning_rate": 7.344149186867133e-07, + "loss": 0.3604, + "step": 39520 + }, + { + "epoch": 0.8773487530660037, + "grad_norm": 1.0833680629730225, + "learning_rate": 7.331039256816664e-07, + "loss": 0.5241, + "step": 39525 + }, + { + "epoch": 0.8774597396255314, + "grad_norm": 1.6844494342803955, + "learning_rate": 7.317940593147665e-07, + "loss": 0.4995, + "step": 39530 + }, + { + "epoch": 0.8775707261850589, + "grad_norm": 0.6150209307670593, + "learning_rate": 7.3048531974526e-07, + "loss": 0.3504, + "step": 39535 + }, + { + "epoch": 0.8776817127445866, + "grad_norm": 1.4000403881072998, + "learning_rate": 7.291777071322614e-07, + "loss": 0.2883, + "step": 39540 + }, + { + "epoch": 0.8777926993041143, + "grad_norm": 1.3559322357177734, + "learning_rate": 7.278712216347461e-07, + "loss": 0.2883, + "step": 39545 + }, + { + "epoch": 0.8779036858636419, + "grad_norm": 1.0683934688568115, + "learning_rate": 7.265658634115502e-07, + "loss": 0.3154, + "step": 39550 + }, + { + "epoch": 0.8780146724231696, + "grad_norm": 0.9396424293518066, + "learning_rate": 7.25261632621378e-07, + "loss": 0.3369, + "step": 39555 + }, + { + "epoch": 0.8781256589826972, + "grad_norm": 1.2671051025390625, + "learning_rate": 7.239585294227891e-07, + "loss": 0.3504, + "step": 39560 + }, + { + "epoch": 0.8782366455422248, + "grad_norm": 1.2282811403274536, + "learning_rate": 7.226565539742148e-07, + "loss": 0.4722, + "step": 39565 + }, + { + "epoch": 0.8783476321017525, + "grad_norm": 1.1958354711532593, + "learning_rate": 7.213557064339405e-07, + "loss": 0.4123, + "step": 39570 + }, + { + "epoch": 0.8784586186612802, + "grad_norm": 1.2750756740570068, + "learning_rate": 7.20055986960122e-07, + "loss": 0.2987, + "step": 39575 + }, + { + "epoch": 0.8785696052208077, + "grad_norm": 0.874458372592926, + "learning_rate": 7.187573957107751e-07, + "loss": 0.3363, + "step": 39580 + }, + { + "epoch": 0.8786805917803354, + "grad_norm": 0.7889037132263184, + "learning_rate": 7.174599328437759e-07, + "loss": 0.593, + "step": 39585 + }, + { + "epoch": 0.878791578339863, + "grad_norm": 1.0096137523651123, + "learning_rate": 7.161635985168691e-07, + "loss": 0.3501, + "step": 39590 + }, + { + "epoch": 0.8789025648993907, + "grad_norm": 1.4937151670455933, + "learning_rate": 7.148683928876544e-07, + "loss": 0.3758, + "step": 39595 + }, + { + "epoch": 0.8790135514589184, + "grad_norm": 0.5935758948326111, + "learning_rate": 7.135743161136e-07, + "loss": 0.3388, + "step": 39600 + }, + { + "epoch": 0.8791245380184459, + "grad_norm": 2.038949489593506, + "learning_rate": 7.12281368352038e-07, + "loss": 0.3533, + "step": 39605 + }, + { + "epoch": 0.8792355245779736, + "grad_norm": 0.6185315847396851, + "learning_rate": 7.109895497601571e-07, + "loss": 0.2537, + "step": 39610 + }, + { + "epoch": 0.8793465111375013, + "grad_norm": 1.051365613937378, + "learning_rate": 7.09698860495015e-07, + "loss": 0.3532, + "step": 39615 + }, + { + "epoch": 0.8794574976970289, + "grad_norm": 1.2595382928848267, + "learning_rate": 7.084093007135274e-07, + "loss": 0.4788, + "step": 39620 + }, + { + "epoch": 0.8795684842565565, + "grad_norm": 1.1612679958343506, + "learning_rate": 7.071208705724742e-07, + "loss": 0.5306, + "step": 39625 + }, + { + "epoch": 0.8796794708160842, + "grad_norm": 1.0881028175354004, + "learning_rate": 7.058335702285024e-07, + "loss": 0.366, + "step": 39630 + }, + { + "epoch": 0.8797904573756118, + "grad_norm": 1.3085839748382568, + "learning_rate": 7.045473998381136e-07, + "loss": 0.2082, + "step": 39635 + }, + { + "epoch": 0.8799014439351395, + "grad_norm": 1.039229154586792, + "learning_rate": 7.032623595576782e-07, + "loss": 0.4965, + "step": 39640 + }, + { + "epoch": 0.880012430494667, + "grad_norm": 1.720349907875061, + "learning_rate": 7.019784495434246e-07, + "loss": 0.4264, + "step": 39645 + }, + { + "epoch": 0.8801234170541947, + "grad_norm": 1.182712435722351, + "learning_rate": 7.0069566995145e-07, + "loss": 0.5752, + "step": 39650 + }, + { + "epoch": 0.8802344036137224, + "grad_norm": 1.3283008337020874, + "learning_rate": 6.994140209377065e-07, + "loss": 0.4175, + "step": 39655 + }, + { + "epoch": 0.88034539017325, + "grad_norm": 0.8335916996002197, + "learning_rate": 6.981335026580149e-07, + "loss": 0.3787, + "step": 39660 + }, + { + "epoch": 0.8804563767327777, + "grad_norm": 1.896451711654663, + "learning_rate": 6.968541152680575e-07, + "loss": 0.4486, + "step": 39665 + }, + { + "epoch": 0.8805673632923053, + "grad_norm": 0.5865107774734497, + "learning_rate": 6.955758589233741e-07, + "loss": 0.4943, + "step": 39670 + }, + { + "epoch": 0.8806783498518329, + "grad_norm": 2.0769338607788086, + "learning_rate": 6.942987337793761e-07, + "loss": 0.3804, + "step": 39675 + }, + { + "epoch": 0.8807893364113606, + "grad_norm": 1.878135323524475, + "learning_rate": 6.93022739991327e-07, + "loss": 0.3271, + "step": 39680 + }, + { + "epoch": 0.8809003229708883, + "grad_norm": 0.7034597992897034, + "learning_rate": 6.917478777143593e-07, + "loss": 0.3703, + "step": 39685 + }, + { + "epoch": 0.8810113095304158, + "grad_norm": 1.7691963911056519, + "learning_rate": 6.904741471034692e-07, + "loss": 0.5127, + "step": 39690 + }, + { + "epoch": 0.8811222960899435, + "grad_norm": 0.9068023562431335, + "learning_rate": 6.892015483135095e-07, + "loss": 0.3244, + "step": 39695 + }, + { + "epoch": 0.8812332826494711, + "grad_norm": 0.825576663017273, + "learning_rate": 6.879300814992007e-07, + "loss": 0.4179, + "step": 39700 + }, + { + "epoch": 0.8813442692089988, + "grad_norm": 2.7324061393737793, + "learning_rate": 6.866597468151204e-07, + "loss": 0.3272, + "step": 39705 + }, + { + "epoch": 0.8814552557685265, + "grad_norm": 1.538509726524353, + "learning_rate": 6.853905444157161e-07, + "loss": 0.4511, + "step": 39710 + }, + { + "epoch": 0.881566242328054, + "grad_norm": 1.2030854225158691, + "learning_rate": 6.841224744552888e-07, + "loss": 0.4274, + "step": 39715 + }, + { + "epoch": 0.8816772288875817, + "grad_norm": 1.1677093505859375, + "learning_rate": 6.828555370880085e-07, + "loss": 0.4939, + "step": 39720 + }, + { + "epoch": 0.8817882154471094, + "grad_norm": 1.0679088830947876, + "learning_rate": 6.815897324679044e-07, + "loss": 0.4146, + "step": 39725 + }, + { + "epoch": 0.881899202006637, + "grad_norm": 0.8127391338348389, + "learning_rate": 6.80325060748871e-07, + "loss": 0.3293, + "step": 39730 + }, + { + "epoch": 0.8820101885661646, + "grad_norm": 1.3469727039337158, + "learning_rate": 6.790615220846586e-07, + "loss": 0.4906, + "step": 39735 + }, + { + "epoch": 0.8821211751256923, + "grad_norm": 1.0445455312728882, + "learning_rate": 6.777991166288877e-07, + "loss": 0.2626, + "step": 39740 + }, + { + "epoch": 0.8822321616852199, + "grad_norm": 1.063741683959961, + "learning_rate": 6.765378445350346e-07, + "loss": 0.2856, + "step": 39745 + }, + { + "epoch": 0.8823431482447476, + "grad_norm": 0.642105758190155, + "learning_rate": 6.752777059564431e-07, + "loss": 0.3201, + "step": 39750 + }, + { + "epoch": 0.8824541348042751, + "grad_norm": 1.2013835906982422, + "learning_rate": 6.74018701046315e-07, + "loss": 0.323, + "step": 39755 + }, + { + "epoch": 0.8825651213638028, + "grad_norm": 1.887306571006775, + "learning_rate": 6.727608299577171e-07, + "loss": 0.3755, + "step": 39760 + }, + { + "epoch": 0.8826761079233305, + "grad_norm": 1.15422523021698, + "learning_rate": 6.715040928435746e-07, + "loss": 0.4409, + "step": 39765 + }, + { + "epoch": 0.8827870944828581, + "grad_norm": 1.1066185235977173, + "learning_rate": 6.702484898566797e-07, + "loss": 0.3627, + "step": 39770 + }, + { + "epoch": 0.8828980810423858, + "grad_norm": 0.8012137413024902, + "learning_rate": 6.689940211496848e-07, + "loss": 0.4263, + "step": 39775 + }, + { + "epoch": 0.8830090676019134, + "grad_norm": 0.863029956817627, + "learning_rate": 6.677406868751013e-07, + "loss": 0.424, + "step": 39780 + }, + { + "epoch": 0.883120054161441, + "grad_norm": 1.1876236200332642, + "learning_rate": 6.664884871853095e-07, + "loss": 0.2413, + "step": 39785 + }, + { + "epoch": 0.8832310407209687, + "grad_norm": 1.074812412261963, + "learning_rate": 6.652374222325441e-07, + "loss": 0.3695, + "step": 39790 + }, + { + "epoch": 0.8833420272804964, + "grad_norm": 1.769379734992981, + "learning_rate": 6.639874921689049e-07, + "loss": 0.3624, + "step": 39795 + }, + { + "epoch": 0.883453013840024, + "grad_norm": 1.0724666118621826, + "learning_rate": 6.627386971463589e-07, + "loss": 0.5713, + "step": 39800 + }, + { + "epoch": 0.8835640003995516, + "grad_norm": 1.0125855207443237, + "learning_rate": 6.614910373167249e-07, + "loss": 0.4042, + "step": 39805 + }, + { + "epoch": 0.8836749869590792, + "grad_norm": 0.8673639297485352, + "learning_rate": 6.602445128316937e-07, + "loss": 0.37, + "step": 39810 + }, + { + "epoch": 0.8837859735186069, + "grad_norm": 0.8695536851882935, + "learning_rate": 6.589991238428095e-07, + "loss": 0.3782, + "step": 39815 + }, + { + "epoch": 0.8838969600781346, + "grad_norm": 1.2568012475967407, + "learning_rate": 6.577548705014869e-07, + "loss": 0.3347, + "step": 39820 + }, + { + "epoch": 0.8840079466376621, + "grad_norm": 1.2032254934310913, + "learning_rate": 6.565117529589937e-07, + "loss": 0.2547, + "step": 39825 + }, + { + "epoch": 0.8841189331971898, + "grad_norm": 0.8552618622779846, + "learning_rate": 6.552697713664658e-07, + "loss": 0.3563, + "step": 39830 + }, + { + "epoch": 0.8842299197567175, + "grad_norm": 1.2995036840438843, + "learning_rate": 6.540289258749011e-07, + "loss": 0.4856, + "step": 39835 + }, + { + "epoch": 0.8843409063162451, + "grad_norm": 2.986711263656616, + "learning_rate": 6.527892166351535e-07, + "loss": 0.3476, + "step": 39840 + }, + { + "epoch": 0.8844518928757727, + "grad_norm": 1.8609137535095215, + "learning_rate": 6.515506437979469e-07, + "loss": 0.3801, + "step": 39845 + }, + { + "epoch": 0.8845628794353004, + "grad_norm": 1.7628650665283203, + "learning_rate": 6.503132075138596e-07, + "loss": 0.332, + "step": 39850 + }, + { + "epoch": 0.884673865994828, + "grad_norm": 1.6240646839141846, + "learning_rate": 6.490769079333359e-07, + "loss": 0.4001, + "step": 39855 + }, + { + "epoch": 0.8847848525543557, + "grad_norm": 1.0301650762557983, + "learning_rate": 6.478417452066821e-07, + "loss": 0.3662, + "step": 39860 + }, + { + "epoch": 0.8848958391138833, + "grad_norm": 1.349965214729309, + "learning_rate": 6.466077194840637e-07, + "loss": 0.3889, + "step": 39865 + }, + { + "epoch": 0.8850068256734109, + "grad_norm": 1.5930453538894653, + "learning_rate": 6.453748309155105e-07, + "loss": 0.5254, + "step": 39870 + }, + { + "epoch": 0.8851178122329386, + "grad_norm": 0.877350389957428, + "learning_rate": 6.441430796509107e-07, + "loss": 0.3623, + "step": 39875 + }, + { + "epoch": 0.8852287987924662, + "grad_norm": 1.5074859857559204, + "learning_rate": 6.429124658400188e-07, + "loss": 0.39, + "step": 39880 + }, + { + "epoch": 0.8853397853519939, + "grad_norm": 0.7831709980964661, + "learning_rate": 6.416829896324495e-07, + "loss": 0.4656, + "step": 39885 + }, + { + "epoch": 0.8854507719115216, + "grad_norm": 1.270965576171875, + "learning_rate": 6.404546511776755e-07, + "loss": 0.4003, + "step": 39890 + }, + { + "epoch": 0.8855617584710491, + "grad_norm": 1.269120693206787, + "learning_rate": 6.392274506250374e-07, + "loss": 0.4082, + "step": 39895 + }, + { + "epoch": 0.8856727450305768, + "grad_norm": 1.6134681701660156, + "learning_rate": 6.380013881237302e-07, + "loss": 0.5901, + "step": 39900 + }, + { + "epoch": 0.8857837315901045, + "grad_norm": 0.9332907199859619, + "learning_rate": 6.36776463822818e-07, + "loss": 0.3308, + "step": 39905 + }, + { + "epoch": 0.885894718149632, + "grad_norm": 2.3193821907043457, + "learning_rate": 6.355526778712195e-07, + "loss": 0.5457, + "step": 39910 + }, + { + "epoch": 0.8860057047091597, + "grad_norm": 0.9229357838630676, + "learning_rate": 6.343300304177214e-07, + "loss": 0.3415, + "step": 39915 + }, + { + "epoch": 0.8861166912686873, + "grad_norm": 0.9777660369873047, + "learning_rate": 6.331085216109701e-07, + "loss": 0.4594, + "step": 39920 + }, + { + "epoch": 0.886227677828215, + "grad_norm": 1.237738847732544, + "learning_rate": 6.318881515994679e-07, + "loss": 0.3276, + "step": 39925 + }, + { + "epoch": 0.8863386643877427, + "grad_norm": 1.1996231079101562, + "learning_rate": 6.306689205315885e-07, + "loss": 0.5043, + "step": 39930 + }, + { + "epoch": 0.8864496509472702, + "grad_norm": 1.0719642639160156, + "learning_rate": 6.294508285555567e-07, + "loss": 0.3521, + "step": 39935 + }, + { + "epoch": 0.8865606375067979, + "grad_norm": 0.7645555734634399, + "learning_rate": 6.282338758194684e-07, + "loss": 0.3757, + "step": 39940 + }, + { + "epoch": 0.8866716240663256, + "grad_norm": 1.6293964385986328, + "learning_rate": 6.270180624712751e-07, + "loss": 0.4791, + "step": 39945 + }, + { + "epoch": 0.8867826106258532, + "grad_norm": 1.2439913749694824, + "learning_rate": 6.258033886587911e-07, + "loss": 0.375, + "step": 39950 + }, + { + "epoch": 0.8868935971853809, + "grad_norm": 1.689984917640686, + "learning_rate": 6.245898545296924e-07, + "loss": 0.4247, + "step": 39955 + }, + { + "epoch": 0.8870045837449085, + "grad_norm": 1.1024510860443115, + "learning_rate": 6.233774602315157e-07, + "loss": 0.2614, + "step": 39960 + }, + { + "epoch": 0.8871155703044361, + "grad_norm": 1.1792956590652466, + "learning_rate": 6.221662059116629e-07, + "loss": 0.3694, + "step": 39965 + }, + { + "epoch": 0.8872265568639638, + "grad_norm": 1.1179901361465454, + "learning_rate": 6.209560917173896e-07, + "loss": 0.3596, + "step": 39970 + }, + { + "epoch": 0.8873375434234915, + "grad_norm": 1.654262900352478, + "learning_rate": 6.197471177958214e-07, + "loss": 0.3389, + "step": 39975 + }, + { + "epoch": 0.887448529983019, + "grad_norm": 1.3884893655776978, + "learning_rate": 6.185392842939386e-07, + "loss": 0.499, + "step": 39980 + }, + { + "epoch": 0.8875595165425467, + "grad_norm": 1.1281630992889404, + "learning_rate": 6.173325913585882e-07, + "loss": 0.2934, + "step": 39985 + }, + { + "epoch": 0.8876705031020743, + "grad_norm": 1.5607919692993164, + "learning_rate": 6.161270391364726e-07, + "loss": 0.4504, + "step": 39990 + }, + { + "epoch": 0.887781489661602, + "grad_norm": 1.0889188051223755, + "learning_rate": 6.149226277741616e-07, + "loss": 0.3168, + "step": 39995 + }, + { + "epoch": 0.8878924762211297, + "grad_norm": 1.2045074701309204, + "learning_rate": 6.137193574180811e-07, + "loss": 0.3516, + "step": 40000 + }, + { + "epoch": 0.8880034627806572, + "grad_norm": 1.0601027011871338, + "learning_rate": 6.125172282145242e-07, + "loss": 0.4217, + "step": 40005 + }, + { + "epoch": 0.8881144493401849, + "grad_norm": 0.6195953488349915, + "learning_rate": 6.113162403096374e-07, + "loss": 0.398, + "step": 40010 + }, + { + "epoch": 0.8882254358997126, + "grad_norm": 0.8855378031730652, + "learning_rate": 6.101163938494359e-07, + "loss": 0.3358, + "step": 40015 + }, + { + "epoch": 0.8883364224592402, + "grad_norm": 1.527886152267456, + "learning_rate": 6.08917688979791e-07, + "loss": 0.3484, + "step": 40020 + }, + { + "epoch": 0.8884474090187678, + "grad_norm": 1.2936069965362549, + "learning_rate": 6.077201258464383e-07, + "loss": 0.263, + "step": 40025 + }, + { + "epoch": 0.8885583955782955, + "grad_norm": 1.3309067487716675, + "learning_rate": 6.065237045949757e-07, + "loss": 0.419, + "step": 40030 + }, + { + "epoch": 0.8886693821378231, + "grad_norm": 1.2705096006393433, + "learning_rate": 6.053284253708547e-07, + "loss": 0.4035, + "step": 40035 + }, + { + "epoch": 0.8887803686973508, + "grad_norm": 2.917478561401367, + "learning_rate": 6.04134288319399e-07, + "loss": 0.5202, + "step": 40040 + }, + { + "epoch": 0.8888913552568783, + "grad_norm": 0.9519320726394653, + "learning_rate": 6.029412935857837e-07, + "loss": 0.4442, + "step": 40045 + }, + { + "epoch": 0.889002341816406, + "grad_norm": 0.9434588551521301, + "learning_rate": 6.017494413150504e-07, + "loss": 0.4018, + "step": 40050 + }, + { + "epoch": 0.8891133283759337, + "grad_norm": 0.7919776439666748, + "learning_rate": 6.005587316521022e-07, + "loss": 0.4546, + "step": 40055 + }, + { + "epoch": 0.8892243149354613, + "grad_norm": 1.9365350008010864, + "learning_rate": 5.993691647416988e-07, + "loss": 0.4293, + "step": 40060 + }, + { + "epoch": 0.889335301494989, + "grad_norm": 1.0461664199829102, + "learning_rate": 5.981807407284668e-07, + "loss": 0.3527, + "step": 40065 + }, + { + "epoch": 0.8894462880545166, + "grad_norm": 0.9392116069793701, + "learning_rate": 5.969934597568872e-07, + "loss": 0.4463, + "step": 40070 + }, + { + "epoch": 0.8895572746140442, + "grad_norm": 0.9327507615089417, + "learning_rate": 5.958073219713089e-07, + "loss": 0.3969, + "step": 40075 + }, + { + "epoch": 0.8896682611735719, + "grad_norm": 1.5740067958831787, + "learning_rate": 5.946223275159369e-07, + "loss": 0.6056, + "step": 40080 + }, + { + "epoch": 0.8897792477330996, + "grad_norm": 1.093630075454712, + "learning_rate": 5.934384765348378e-07, + "loss": 0.3468, + "step": 40085 + }, + { + "epoch": 0.8898902342926271, + "grad_norm": 0.9992703199386597, + "learning_rate": 5.922557691719432e-07, + "loss": 0.2102, + "step": 40090 + }, + { + "epoch": 0.8900012208521548, + "grad_norm": 1.1181128025054932, + "learning_rate": 5.910742055710394e-07, + "loss": 0.2883, + "step": 40095 + }, + { + "epoch": 0.8901122074116824, + "grad_norm": 0.8682889342308044, + "learning_rate": 5.898937858757814e-07, + "loss": 0.3134, + "step": 40100 + }, + { + "epoch": 0.8902231939712101, + "grad_norm": 1.079490065574646, + "learning_rate": 5.887145102296754e-07, + "loss": 0.4043, + "step": 40105 + }, + { + "epoch": 0.8903341805307378, + "grad_norm": 1.373574137687683, + "learning_rate": 5.875363787760957e-07, + "loss": 0.3318, + "step": 40110 + }, + { + "epoch": 0.8904451670902653, + "grad_norm": 1.2262922525405884, + "learning_rate": 5.863593916582788e-07, + "loss": 0.3128, + "step": 40115 + }, + { + "epoch": 0.890556153649793, + "grad_norm": 0.48372411727905273, + "learning_rate": 5.851835490193136e-07, + "loss": 0.3235, + "step": 40120 + }, + { + "epoch": 0.8906671402093207, + "grad_norm": 1.1640418767929077, + "learning_rate": 5.840088510021602e-07, + "loss": 0.4542, + "step": 40125 + }, + { + "epoch": 0.8907781267688483, + "grad_norm": 0.7971268892288208, + "learning_rate": 5.828352977496299e-07, + "loss": 0.4302, + "step": 40130 + }, + { + "epoch": 0.890889113328376, + "grad_norm": 1.0506165027618408, + "learning_rate": 5.816628894044018e-07, + "loss": 0.3768, + "step": 40135 + }, + { + "epoch": 0.8910000998879036, + "grad_norm": 1.1588351726531982, + "learning_rate": 5.804916261090132e-07, + "loss": 0.2719, + "step": 40140 + }, + { + "epoch": 0.8911110864474312, + "grad_norm": 1.5512523651123047, + "learning_rate": 5.793215080058623e-07, + "loss": 0.317, + "step": 40145 + }, + { + "epoch": 0.8912220730069589, + "grad_norm": 1.2580381631851196, + "learning_rate": 5.781525352372086e-07, + "loss": 0.3762, + "step": 40150 + }, + { + "epoch": 0.8913330595664865, + "grad_norm": 1.8583688735961914, + "learning_rate": 5.769847079451696e-07, + "loss": 0.2636, + "step": 40155 + }, + { + "epoch": 0.8914440461260141, + "grad_norm": 1.5035998821258545, + "learning_rate": 5.758180262717283e-07, + "loss": 0.4279, + "step": 40160 + }, + { + "epoch": 0.8915550326855418, + "grad_norm": 1.1198701858520508, + "learning_rate": 5.746524903587247e-07, + "loss": 0.4272, + "step": 40165 + }, + { + "epoch": 0.8916660192450694, + "grad_norm": 1.359174132347107, + "learning_rate": 5.73488100347861e-07, + "loss": 0.416, + "step": 40170 + }, + { + "epoch": 0.8917770058045971, + "grad_norm": 1.6771479845046997, + "learning_rate": 5.723248563807016e-07, + "loss": 0.41, + "step": 40175 + }, + { + "epoch": 0.8918879923641247, + "grad_norm": 2.4248905181884766, + "learning_rate": 5.711627585986667e-07, + "loss": 0.4567, + "step": 40180 + }, + { + "epoch": 0.8919989789236523, + "grad_norm": 0.9985633492469788, + "learning_rate": 5.700018071430424e-07, + "loss": 0.3216, + "step": 40185 + }, + { + "epoch": 0.89210996548318, + "grad_norm": 0.6354491114616394, + "learning_rate": 5.688420021549724e-07, + "loss": 0.3433, + "step": 40190 + }, + { + "epoch": 0.8922209520427077, + "grad_norm": 1.0853493213653564, + "learning_rate": 5.676833437754614e-07, + "loss": 0.4728, + "step": 40195 + }, + { + "epoch": 0.8923319386022353, + "grad_norm": 0.627981960773468, + "learning_rate": 5.66525832145377e-07, + "loss": 0.4467, + "step": 40200 + }, + { + "epoch": 0.8924429251617629, + "grad_norm": 0.9562311172485352, + "learning_rate": 5.653694674054444e-07, + "loss": 0.3171, + "step": 40205 + }, + { + "epoch": 0.8925539117212905, + "grad_norm": 0.48237770795822144, + "learning_rate": 5.642142496962511e-07, + "loss": 0.3237, + "step": 40210 + }, + { + "epoch": 0.8926648982808182, + "grad_norm": 0.9646100401878357, + "learning_rate": 5.630601791582424e-07, + "loss": 0.2887, + "step": 40215 + }, + { + "epoch": 0.8927758848403459, + "grad_norm": 1.7289109230041504, + "learning_rate": 5.619072559317307e-07, + "loss": 0.4087, + "step": 40220 + }, + { + "epoch": 0.8928868713998734, + "grad_norm": 1.0684244632720947, + "learning_rate": 5.607554801568794e-07, + "loss": 0.4546, + "step": 40225 + }, + { + "epoch": 0.8929978579594011, + "grad_norm": 1.2578312158584595, + "learning_rate": 5.596048519737218e-07, + "loss": 0.353, + "step": 40230 + }, + { + "epoch": 0.8931088445189288, + "grad_norm": 0.9892885088920593, + "learning_rate": 5.584553715221442e-07, + "loss": 0.4642, + "step": 40235 + }, + { + "epoch": 0.8932198310784564, + "grad_norm": 0.9489505887031555, + "learning_rate": 5.573070389419e-07, + "loss": 0.3711, + "step": 40240 + }, + { + "epoch": 0.893330817637984, + "grad_norm": 1.6525774002075195, + "learning_rate": 5.561598543725954e-07, + "loss": 0.4137, + "step": 40245 + }, + { + "epoch": 0.8934418041975117, + "grad_norm": 1.2819950580596924, + "learning_rate": 5.550138179537057e-07, + "loss": 0.3999, + "step": 40250 + }, + { + "epoch": 0.8935527907570393, + "grad_norm": 0.81032794713974, + "learning_rate": 5.53868929824558e-07, + "loss": 0.2751, + "step": 40255 + }, + { + "epoch": 0.893663777316567, + "grad_norm": 1.0968986749649048, + "learning_rate": 5.52725190124348e-07, + "loss": 0.4661, + "step": 40260 + }, + { + "epoch": 0.8937747638760946, + "grad_norm": 1.1706798076629639, + "learning_rate": 5.515825989921242e-07, + "loss": 0.3541, + "step": 40265 + }, + { + "epoch": 0.8938857504356222, + "grad_norm": 1.1918624639511108, + "learning_rate": 5.50441156566801e-07, + "loss": 0.4062, + "step": 40270 + }, + { + "epoch": 0.8939967369951499, + "grad_norm": 1.0423552989959717, + "learning_rate": 5.493008629871499e-07, + "loss": 0.319, + "step": 40275 + }, + { + "epoch": 0.8941077235546775, + "grad_norm": 1.1794507503509521, + "learning_rate": 5.481617183918053e-07, + "loss": 0.2986, + "step": 40280 + }, + { + "epoch": 0.8942187101142052, + "grad_norm": 1.3555355072021484, + "learning_rate": 5.470237229192599e-07, + "loss": 0.4681, + "step": 40285 + }, + { + "epoch": 0.8943296966737329, + "grad_norm": 1.5522536039352417, + "learning_rate": 5.458868767078673e-07, + "loss": 0.4119, + "step": 40290 + }, + { + "epoch": 0.8944406832332604, + "grad_norm": 1.1344321966171265, + "learning_rate": 5.447511798958427e-07, + "loss": 0.3222, + "step": 40295 + }, + { + "epoch": 0.8945516697927881, + "grad_norm": 1.0988397598266602, + "learning_rate": 5.436166326212577e-07, + "loss": 0.4429, + "step": 40300 + }, + { + "epoch": 0.8946626563523158, + "grad_norm": 1.102994680404663, + "learning_rate": 5.424832350220477e-07, + "loss": 0.5143, + "step": 40305 + }, + { + "epoch": 0.8947736429118434, + "grad_norm": 0.7307510375976562, + "learning_rate": 5.413509872360101e-07, + "loss": 0.3052, + "step": 40310 + }, + { + "epoch": 0.894884629471371, + "grad_norm": 1.411036491394043, + "learning_rate": 5.40219889400796e-07, + "loss": 0.3787, + "step": 40315 + }, + { + "epoch": 0.8949956160308986, + "grad_norm": 0.9514254331588745, + "learning_rate": 5.390899416539231e-07, + "loss": 0.3885, + "step": 40320 + }, + { + "epoch": 0.8951066025904263, + "grad_norm": 1.9836772680282593, + "learning_rate": 5.379611441327647e-07, + "loss": 0.4674, + "step": 40325 + }, + { + "epoch": 0.895217589149954, + "grad_norm": 1.1093319654464722, + "learning_rate": 5.368334969745592e-07, + "loss": 0.4865, + "step": 40330 + }, + { + "epoch": 0.8953285757094815, + "grad_norm": 1.0533522367477417, + "learning_rate": 5.357070003163978e-07, + "loss": 0.5483, + "step": 40335 + }, + { + "epoch": 0.8954395622690092, + "grad_norm": 0.6712168455123901, + "learning_rate": 5.345816542952387e-07, + "loss": 0.3108, + "step": 40340 + }, + { + "epoch": 0.8955505488285369, + "grad_norm": 1.435325026512146, + "learning_rate": 5.334574590478992e-07, + "loss": 0.3496, + "step": 40345 + }, + { + "epoch": 0.8956615353880645, + "grad_norm": 0.8815795183181763, + "learning_rate": 5.32334414711052e-07, + "loss": 0.3578, + "step": 40350 + }, + { + "epoch": 0.8957725219475922, + "grad_norm": 1.6313246488571167, + "learning_rate": 5.31212521421236e-07, + "loss": 0.3128, + "step": 40355 + }, + { + "epoch": 0.8958835085071198, + "grad_norm": 1.9099875688552856, + "learning_rate": 5.300917793148441e-07, + "loss": 0.3975, + "step": 40360 + }, + { + "epoch": 0.8959944950666474, + "grad_norm": 1.2062870264053345, + "learning_rate": 5.28972188528134e-07, + "loss": 0.3287, + "step": 40365 + }, + { + "epoch": 0.8961054816261751, + "grad_norm": 1.5065999031066895, + "learning_rate": 5.278537491972236e-07, + "loss": 0.4504, + "step": 40370 + }, + { + "epoch": 0.8962164681857027, + "grad_norm": 1.3372350931167603, + "learning_rate": 5.267364614580861e-07, + "loss": 0.3728, + "step": 40375 + }, + { + "epoch": 0.8963274547452303, + "grad_norm": 1.2467929124832153, + "learning_rate": 5.256203254465597e-07, + "loss": 0.3721, + "step": 40380 + }, + { + "epoch": 0.896438441304758, + "grad_norm": 1.1268647909164429, + "learning_rate": 5.24505341298338e-07, + "loss": 0.3695, + "step": 40385 + }, + { + "epoch": 0.8965494278642856, + "grad_norm": 0.6092180013656616, + "learning_rate": 5.233915091489794e-07, + "loss": 0.3494, + "step": 40390 + }, + { + "epoch": 0.8966604144238133, + "grad_norm": 1.103093147277832, + "learning_rate": 5.222788291338998e-07, + "loss": 0.4334, + "step": 40395 + }, + { + "epoch": 0.896771400983341, + "grad_norm": 2.1130781173706055, + "learning_rate": 5.211673013883733e-07, + "loss": 0.3672, + "step": 40400 + }, + { + "epoch": 0.8968823875428685, + "grad_norm": 1.6719002723693848, + "learning_rate": 5.200569260475374e-07, + "loss": 0.5602, + "step": 40405 + }, + { + "epoch": 0.8969933741023962, + "grad_norm": 1.2825921773910522, + "learning_rate": 5.189477032463864e-07, + "loss": 0.3651, + "step": 40410 + }, + { + "epoch": 0.8971043606619239, + "grad_norm": 1.1338067054748535, + "learning_rate": 5.178396331197766e-07, + "loss": 0.1951, + "step": 40415 + }, + { + "epoch": 0.8972153472214515, + "grad_norm": 1.777740478515625, + "learning_rate": 5.167327158024249e-07, + "loss": 0.493, + "step": 40420 + }, + { + "epoch": 0.8973263337809791, + "grad_norm": 1.1521728038787842, + "learning_rate": 5.156269514289036e-07, + "loss": 0.4128, + "step": 40425 + }, + { + "epoch": 0.8974373203405067, + "grad_norm": 1.4409505128860474, + "learning_rate": 5.145223401336507e-07, + "loss": 0.4833, + "step": 40430 + }, + { + "epoch": 0.8975483069000344, + "grad_norm": 1.0183593034744263, + "learning_rate": 5.134188820509589e-07, + "loss": 0.2831, + "step": 40435 + }, + { + "epoch": 0.8976592934595621, + "grad_norm": 0.7380016446113586, + "learning_rate": 5.123165773149852e-07, + "loss": 0.3006, + "step": 40440 + }, + { + "epoch": 0.8977702800190897, + "grad_norm": 1.1958386898040771, + "learning_rate": 5.112154260597413e-07, + "loss": 0.4989, + "step": 40445 + }, + { + "epoch": 0.8978812665786173, + "grad_norm": 1.2959365844726562, + "learning_rate": 5.101154284191035e-07, + "loss": 0.3115, + "step": 40450 + }, + { + "epoch": 0.897992253138145, + "grad_norm": 0.7626371383666992, + "learning_rate": 5.090165845268058e-07, + "loss": 0.3649, + "step": 40455 + }, + { + "epoch": 0.8981032396976726, + "grad_norm": 1.715620517730713, + "learning_rate": 5.079188945164426e-07, + "loss": 0.4139, + "step": 40460 + }, + { + "epoch": 0.8982142262572003, + "grad_norm": 1.511171817779541, + "learning_rate": 5.068223585214637e-07, + "loss": 0.5195, + "step": 40465 + }, + { + "epoch": 0.898325212816728, + "grad_norm": 1.423026204109192, + "learning_rate": 5.057269766751871e-07, + "loss": 0.4019, + "step": 40470 + }, + { + "epoch": 0.8984361993762555, + "grad_norm": 2.012284278869629, + "learning_rate": 5.046327491107816e-07, + "loss": 0.3232, + "step": 40475 + }, + { + "epoch": 0.8985471859357832, + "grad_norm": 1.1970844268798828, + "learning_rate": 5.035396759612831e-07, + "loss": 0.3283, + "step": 40480 + }, + { + "epoch": 0.8986581724953108, + "grad_norm": 1.5900375843048096, + "learning_rate": 5.024477573595799e-07, + "loss": 0.4416, + "step": 40485 + }, + { + "epoch": 0.8987691590548385, + "grad_norm": 1.5092462301254272, + "learning_rate": 5.013569934384277e-07, + "loss": 0.5254, + "step": 40490 + }, + { + "epoch": 0.8988801456143661, + "grad_norm": 1.6395747661590576, + "learning_rate": 5.002673843304351e-07, + "loss": 0.5007, + "step": 40495 + }, + { + "epoch": 0.8989911321738937, + "grad_norm": 2.0644948482513428, + "learning_rate": 4.99178930168075e-07, + "loss": 0.3698, + "step": 40500 + }, + { + "epoch": 0.8991021187334214, + "grad_norm": 1.0750409364700317, + "learning_rate": 4.98091631083678e-07, + "loss": 0.4849, + "step": 40505 + }, + { + "epoch": 0.8992131052929491, + "grad_norm": 0.9656897783279419, + "learning_rate": 4.970054872094321e-07, + "loss": 0.412, + "step": 40510 + }, + { + "epoch": 0.8993240918524766, + "grad_norm": 1.18376624584198, + "learning_rate": 4.959204986773903e-07, + "loss": 0.3912, + "step": 40515 + }, + { + "epoch": 0.8994350784120043, + "grad_norm": 1.7295215129852295, + "learning_rate": 4.948366656194581e-07, + "loss": 0.4733, + "step": 40520 + }, + { + "epoch": 0.899546064971532, + "grad_norm": 1.199609637260437, + "learning_rate": 4.93753988167408e-07, + "loss": 0.4633, + "step": 40525 + }, + { + "epoch": 0.8996570515310596, + "grad_norm": 1.0033913850784302, + "learning_rate": 4.926724664528648e-07, + "loss": 0.3885, + "step": 40530 + }, + { + "epoch": 0.8997680380905873, + "grad_norm": 0.852364182472229, + "learning_rate": 4.915921006073187e-07, + "loss": 0.2825, + "step": 40535 + }, + { + "epoch": 0.8998790246501148, + "grad_norm": 1.3140791654586792, + "learning_rate": 4.90512890762117e-07, + "loss": 0.3215, + "step": 40540 + }, + { + "epoch": 0.8999900112096425, + "grad_norm": 1.3916743993759155, + "learning_rate": 4.894348370484648e-07, + "loss": 0.2732, + "step": 40545 + }, + { + "epoch": 0.9001009977691702, + "grad_norm": 0.7842644453048706, + "learning_rate": 4.883579395974303e-07, + "loss": 0.3304, + "step": 40550 + }, + { + "epoch": 0.9002119843286978, + "grad_norm": 1.6732553243637085, + "learning_rate": 4.872821985399368e-07, + "loss": 0.3536, + "step": 40555 + }, + { + "epoch": 0.9003229708882254, + "grad_norm": 1.2899658679962158, + "learning_rate": 4.862076140067695e-07, + "loss": 0.3919, + "step": 40560 + }, + { + "epoch": 0.9004339574477531, + "grad_norm": 1.014262080192566, + "learning_rate": 4.851341861285764e-07, + "loss": 0.2967, + "step": 40565 + }, + { + "epoch": 0.9005449440072807, + "grad_norm": 1.1751197576522827, + "learning_rate": 4.840619150358561e-07, + "loss": 0.3216, + "step": 40570 + }, + { + "epoch": 0.9006559305668084, + "grad_norm": 0.9085738062858582, + "learning_rate": 4.829908008589768e-07, + "loss": 0.4418, + "step": 40575 + }, + { + "epoch": 0.900766917126336, + "grad_norm": 1.2819691896438599, + "learning_rate": 4.819208437281553e-07, + "loss": 0.4594, + "step": 40580 + }, + { + "epoch": 0.9008779036858636, + "grad_norm": 0.861376941204071, + "learning_rate": 4.808520437734776e-07, + "loss": 0.3508, + "step": 40585 + }, + { + "epoch": 0.9009888902453913, + "grad_norm": 1.106539249420166, + "learning_rate": 4.797844011248854e-07, + "loss": 0.3888, + "step": 40590 + }, + { + "epoch": 0.9010998768049189, + "grad_norm": 1.338828444480896, + "learning_rate": 4.787179159121758e-07, + "loss": 0.3314, + "step": 40595 + }, + { + "epoch": 0.9012108633644466, + "grad_norm": 0.7250569462776184, + "learning_rate": 4.776525882650107e-07, + "loss": 0.4059, + "step": 40600 + }, + { + "epoch": 0.9013218499239742, + "grad_norm": 1.144252061843872, + "learning_rate": 4.765884183129077e-07, + "loss": 0.534, + "step": 40605 + }, + { + "epoch": 0.9014328364835018, + "grad_norm": 1.1259852647781372, + "learning_rate": 4.7552540618524656e-07, + "loss": 0.4234, + "step": 40610 + }, + { + "epoch": 0.9015438230430295, + "grad_norm": 0.9830173254013062, + "learning_rate": 4.7446355201126283e-07, + "loss": 0.2786, + "step": 40615 + }, + { + "epoch": 0.9016548096025572, + "grad_norm": 1.644303798675537, + "learning_rate": 4.734028559200543e-07, + "loss": 0.3073, + "step": 40620 + }, + { + "epoch": 0.9017657961620847, + "grad_norm": 1.8261100053787231, + "learning_rate": 4.72343318040579e-07, + "loss": 0.4793, + "step": 40625 + }, + { + "epoch": 0.9018767827216124, + "grad_norm": 1.2500735521316528, + "learning_rate": 4.7128493850164715e-07, + "loss": 0.2982, + "step": 40630 + }, + { + "epoch": 0.9019877692811401, + "grad_norm": 1.7409520149230957, + "learning_rate": 4.7022771743193807e-07, + "loss": 0.3677, + "step": 40635 + }, + { + "epoch": 0.9020987558406677, + "grad_norm": 1.2236367464065552, + "learning_rate": 4.6917165495998006e-07, + "loss": 0.4784, + "step": 40640 + }, + { + "epoch": 0.9022097424001954, + "grad_norm": 0.9592740535736084, + "learning_rate": 4.681167512141693e-07, + "loss": 0.314, + "step": 40645 + }, + { + "epoch": 0.9023207289597229, + "grad_norm": 1.1023225784301758, + "learning_rate": 4.6706300632275767e-07, + "loss": 0.3069, + "step": 40650 + }, + { + "epoch": 0.9024317155192506, + "grad_norm": 2.137260913848877, + "learning_rate": 4.660104204138538e-07, + "loss": 0.3771, + "step": 40655 + }, + { + "epoch": 0.9025427020787783, + "grad_norm": 1.2477887868881226, + "learning_rate": 4.6495899361542974e-07, + "loss": 0.5641, + "step": 40660 + }, + { + "epoch": 0.9026536886383059, + "grad_norm": 1.1454601287841797, + "learning_rate": 4.63908726055311e-07, + "loss": 0.4877, + "step": 40665 + }, + { + "epoch": 0.9027646751978335, + "grad_norm": 1.7181624174118042, + "learning_rate": 4.628596178611888e-07, + "loss": 0.3043, + "step": 40670 + }, + { + "epoch": 0.9028756617573612, + "grad_norm": 1.327402114868164, + "learning_rate": 4.6181166916061004e-07, + "loss": 0.3787, + "step": 40675 + }, + { + "epoch": 0.9029866483168888, + "grad_norm": 1.639182209968567, + "learning_rate": 4.607648800809783e-07, + "loss": 0.312, + "step": 40680 + }, + { + "epoch": 0.9030976348764165, + "grad_norm": 0.5940906405448914, + "learning_rate": 4.597192507495618e-07, + "loss": 0.3517, + "step": 40685 + }, + { + "epoch": 0.9032086214359442, + "grad_norm": 1.1408140659332275, + "learning_rate": 4.586747812934822e-07, + "loss": 0.3383, + "step": 40690 + }, + { + "epoch": 0.9033196079954717, + "grad_norm": 0.9758588075637817, + "learning_rate": 4.576314718397246e-07, + "loss": 0.4833, + "step": 40695 + }, + { + "epoch": 0.9034305945549994, + "grad_norm": 2.091536521911621, + "learning_rate": 4.5658932251512856e-07, + "loss": 0.278, + "step": 40700 + }, + { + "epoch": 0.903541581114527, + "grad_norm": 1.074259638786316, + "learning_rate": 4.555483334463984e-07, + "loss": 0.396, + "step": 40705 + }, + { + "epoch": 0.9036525676740547, + "grad_norm": 0.7181487083435059, + "learning_rate": 4.5450850476009056e-07, + "loss": 0.3388, + "step": 40710 + }, + { + "epoch": 0.9037635542335823, + "grad_norm": 1.2724895477294922, + "learning_rate": 4.534698365826273e-07, + "loss": 0.3813, + "step": 40715 + }, + { + "epoch": 0.9038745407931099, + "grad_norm": 1.474095106124878, + "learning_rate": 4.524323290402832e-07, + "loss": 0.1913, + "step": 40720 + }, + { + "epoch": 0.9039855273526376, + "grad_norm": 1.1835238933563232, + "learning_rate": 4.5139598225919845e-07, + "loss": 0.2618, + "step": 40725 + }, + { + "epoch": 0.9040965139121653, + "grad_norm": 0.8935763239860535, + "learning_rate": 4.503607963653644e-07, + "loss": 0.2087, + "step": 40730 + }, + { + "epoch": 0.9042075004716928, + "grad_norm": 1.2680327892303467, + "learning_rate": 4.4932677148463943e-07, + "loss": 0.431, + "step": 40735 + }, + { + "epoch": 0.9043184870312205, + "grad_norm": 0.9342009425163269, + "learning_rate": 4.4829390774273395e-07, + "loss": 0.4155, + "step": 40740 + }, + { + "epoch": 0.9044294735907482, + "grad_norm": 0.9703999161720276, + "learning_rate": 4.47262205265222e-07, + "loss": 0.572, + "step": 40745 + }, + { + "epoch": 0.9045404601502758, + "grad_norm": 0.8090248107910156, + "learning_rate": 4.4623166417753217e-07, + "loss": 0.4299, + "step": 40750 + }, + { + "epoch": 0.9046514467098035, + "grad_norm": 1.2748615741729736, + "learning_rate": 4.452022846049564e-07, + "loss": 0.4863, + "step": 40755 + }, + { + "epoch": 0.904762433269331, + "grad_norm": 1.3430474996566772, + "learning_rate": 4.441740666726446e-07, + "loss": 0.3988, + "step": 40760 + }, + { + "epoch": 0.9048734198288587, + "grad_norm": 1.024856448173523, + "learning_rate": 4.4314701050559905e-07, + "loss": 0.3997, + "step": 40765 + }, + { + "epoch": 0.9049844063883864, + "grad_norm": 0.7125641703605652, + "learning_rate": 4.421211162286909e-07, + "loss": 0.2898, + "step": 40770 + }, + { + "epoch": 0.905095392947914, + "grad_norm": 1.450653076171875, + "learning_rate": 4.410963839666416e-07, + "loss": 0.2538, + "step": 40775 + }, + { + "epoch": 0.9052063795074416, + "grad_norm": 1.2268130779266357, + "learning_rate": 4.40072813844038e-07, + "loss": 0.3693, + "step": 40780 + }, + { + "epoch": 0.9053173660669693, + "grad_norm": 1.2509618997573853, + "learning_rate": 4.3905040598531737e-07, + "loss": 0.6456, + "step": 40785 + }, + { + "epoch": 0.9054283526264969, + "grad_norm": 1.0354459285736084, + "learning_rate": 4.3802916051478463e-07, + "loss": 0.2563, + "step": 40790 + }, + { + "epoch": 0.9055393391860246, + "grad_norm": 1.0008553266525269, + "learning_rate": 4.370090775565983e-07, + "loss": 0.4748, + "step": 40795 + }, + { + "epoch": 0.9056503257455523, + "grad_norm": 0.9893780946731567, + "learning_rate": 4.359901572347758e-07, + "loss": 0.339, + "step": 40800 + }, + { + "epoch": 0.9057613123050798, + "grad_norm": 1.1091101169586182, + "learning_rate": 4.349723996731969e-07, + "loss": 0.4297, + "step": 40805 + }, + { + "epoch": 0.9058722988646075, + "grad_norm": 0.9167705178260803, + "learning_rate": 4.3395580499559276e-07, + "loss": 0.3564, + "step": 40810 + }, + { + "epoch": 0.9059832854241351, + "grad_norm": 1.1292577981948853, + "learning_rate": 4.3294037332555996e-07, + "loss": 0.3398, + "step": 40815 + }, + { + "epoch": 0.9060942719836628, + "grad_norm": 1.3591548204421997, + "learning_rate": 4.3192610478655197e-07, + "loss": 0.3313, + "step": 40820 + }, + { + "epoch": 0.9062052585431905, + "grad_norm": 0.561805009841919, + "learning_rate": 4.3091299950187905e-07, + "loss": 0.3893, + "step": 40825 + }, + { + "epoch": 0.906316245102718, + "grad_norm": 0.9161701798439026, + "learning_rate": 4.2990105759471266e-07, + "loss": 0.4769, + "step": 40830 + }, + { + "epoch": 0.9064272316622457, + "grad_norm": 1.3063408136367798, + "learning_rate": 4.288902791880778e-07, + "loss": 0.6025, + "step": 40835 + }, + { + "epoch": 0.9065382182217734, + "grad_norm": 0.502839207649231, + "learning_rate": 4.27880664404865e-07, + "loss": 0.3625, + "step": 40840 + }, + { + "epoch": 0.906649204781301, + "grad_norm": 1.1110161542892456, + "learning_rate": 4.268722133678194e-07, + "loss": 0.3522, + "step": 40845 + }, + { + "epoch": 0.9067601913408286, + "grad_norm": 2.8650565147399902, + "learning_rate": 4.258649261995429e-07, + "loss": 0.3195, + "step": 40850 + }, + { + "epoch": 0.9068711779003563, + "grad_norm": 0.9162212610244751, + "learning_rate": 4.248588030225009e-07, + "loss": 0.374, + "step": 40855 + }, + { + "epoch": 0.9069821644598839, + "grad_norm": 1.5972285270690918, + "learning_rate": 4.2385384395901117e-07, + "loss": 0.2971, + "step": 40860 + }, + { + "epoch": 0.9070931510194116, + "grad_norm": 1.1207962036132812, + "learning_rate": 4.2285004913125704e-07, + "loss": 0.3178, + "step": 40865 + }, + { + "epoch": 0.9072041375789391, + "grad_norm": 1.0846506357192993, + "learning_rate": 4.218474186612731e-07, + "loss": 0.3724, + "step": 40870 + }, + { + "epoch": 0.9073151241384668, + "grad_norm": 1.6842200756072998, + "learning_rate": 4.208459526709585e-07, + "loss": 0.4633, + "step": 40875 + }, + { + "epoch": 0.9074261106979945, + "grad_norm": 1.2570232152938843, + "learning_rate": 4.19845651282067e-07, + "loss": 0.3244, + "step": 40880 + }, + { + "epoch": 0.9075370972575221, + "grad_norm": 0.7616875767707825, + "learning_rate": 4.188465146162113e-07, + "loss": 0.4373, + "step": 40885 + }, + { + "epoch": 0.9076480838170498, + "grad_norm": 1.3283637762069702, + "learning_rate": 4.178485427948642e-07, + "loss": 0.2744, + "step": 40890 + }, + { + "epoch": 0.9077590703765774, + "grad_norm": 1.1224035024642944, + "learning_rate": 4.168517359393542e-07, + "loss": 0.2493, + "step": 40895 + }, + { + "epoch": 0.907870056936105, + "grad_norm": 1.5765423774719238, + "learning_rate": 4.1585609417087115e-07, + "loss": 0.3856, + "step": 40900 + }, + { + "epoch": 0.9079810434956327, + "grad_norm": 0.6863722801208496, + "learning_rate": 4.148616176104625e-07, + "loss": 0.2628, + "step": 40905 + }, + { + "epoch": 0.9080920300551604, + "grad_norm": 1.0331417322158813, + "learning_rate": 4.1386830637903166e-07, + "loss": 0.3926, + "step": 40910 + }, + { + "epoch": 0.9082030166146879, + "grad_norm": 1.2012207508087158, + "learning_rate": 4.12876160597343e-07, + "loss": 0.4389, + "step": 40915 + }, + { + "epoch": 0.9083140031742156, + "grad_norm": 0.9674161672592163, + "learning_rate": 4.11885180386018e-07, + "loss": 0.5956, + "step": 40920 + }, + { + "epoch": 0.9084249897337432, + "grad_norm": 1.5293216705322266, + "learning_rate": 4.1089536586553567e-07, + "loss": 0.419, + "step": 40925 + }, + { + "epoch": 0.9085359762932709, + "grad_norm": 1.6467329263687134, + "learning_rate": 4.0990671715623655e-07, + "loss": 0.3022, + "step": 40930 + }, + { + "epoch": 0.9086469628527986, + "grad_norm": 1.517439842224121, + "learning_rate": 4.0891923437831547e-07, + "loss": 0.4942, + "step": 40935 + }, + { + "epoch": 0.9087579494123261, + "grad_norm": 1.8788321018218994, + "learning_rate": 4.079329176518287e-07, + "loss": 0.3285, + "step": 40940 + }, + { + "epoch": 0.9088689359718538, + "grad_norm": 1.032759428024292, + "learning_rate": 4.0694776709668815e-07, + "loss": 0.3184, + "step": 40945 + }, + { + "epoch": 0.9089799225313815, + "grad_norm": 1.741289496421814, + "learning_rate": 4.059637828326657e-07, + "loss": 0.2962, + "step": 40950 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.205772042274475, + "learning_rate": 4.049809649793901e-07, + "loss": 0.3886, + "step": 40955 + }, + { + "epoch": 0.9092018956504367, + "grad_norm": 1.3863246440887451, + "learning_rate": 4.039993136563503e-07, + "loss": 0.2554, + "step": 40960 + }, + { + "epoch": 0.9093128822099644, + "grad_norm": 1.532273530960083, + "learning_rate": 4.030188289828907e-07, + "loss": 0.3074, + "step": 40965 + }, + { + "epoch": 0.909423868769492, + "grad_norm": 2.732170581817627, + "learning_rate": 4.02039511078216e-07, + "loss": 0.3315, + "step": 40970 + }, + { + "epoch": 0.9095348553290197, + "grad_norm": 1.513314127922058, + "learning_rate": 4.0106136006138885e-07, + "loss": 0.4582, + "step": 40975 + }, + { + "epoch": 0.9096458418885472, + "grad_norm": 2.2053635120391846, + "learning_rate": 4.0008437605132957e-07, + "loss": 0.359, + "step": 40980 + }, + { + "epoch": 0.9097568284480749, + "grad_norm": 1.4839268922805786, + "learning_rate": 3.9910855916681535e-07, + "loss": 0.3554, + "step": 40985 + }, + { + "epoch": 0.9098678150076026, + "grad_norm": 0.9925002455711365, + "learning_rate": 3.981339095264847e-07, + "loss": 0.1972, + "step": 40990 + }, + { + "epoch": 0.9099788015671302, + "grad_norm": 1.1903655529022217, + "learning_rate": 3.9716042724883054e-07, + "loss": 0.5049, + "step": 40995 + }, + { + "epoch": 0.9100897881266579, + "grad_norm": 1.2450945377349854, + "learning_rate": 3.96188112452206e-07, + "loss": 0.3352, + "step": 41000 + }, + { + "epoch": 0.9102007746861855, + "grad_norm": 1.1269035339355469, + "learning_rate": 3.952169652548221e-07, + "loss": 0.3713, + "step": 41005 + }, + { + "epoch": 0.9103117612457131, + "grad_norm": 1.070256233215332, + "learning_rate": 3.9424698577474773e-07, + "loss": 0.3097, + "step": 41010 + }, + { + "epoch": 0.9104227478052408, + "grad_norm": 1.2544130086898804, + "learning_rate": 3.9327817412990963e-07, + "loss": 0.3488, + "step": 41015 + }, + { + "epoch": 0.9105337343647685, + "grad_norm": 1.162050485610962, + "learning_rate": 3.9231053043809255e-07, + "loss": 0.3, + "step": 41020 + }, + { + "epoch": 0.910644720924296, + "grad_norm": 0.7748410105705261, + "learning_rate": 3.9134405481694115e-07, + "loss": 0.2911, + "step": 41025 + }, + { + "epoch": 0.9107557074838237, + "grad_norm": 1.0052975416183472, + "learning_rate": 3.9037874738395266e-07, + "loss": 0.3194, + "step": 41030 + }, + { + "epoch": 0.9108666940433513, + "grad_norm": 1.3958162069320679, + "learning_rate": 3.894146082564887e-07, + "loss": 0.2533, + "step": 41035 + }, + { + "epoch": 0.910977680602879, + "grad_norm": 1.427446961402893, + "learning_rate": 3.884516375517644e-07, + "loss": 0.4574, + "step": 41040 + }, + { + "epoch": 0.9110886671624067, + "grad_norm": 1.627304196357727, + "learning_rate": 3.87489835386855e-07, + "loss": 0.295, + "step": 41045 + }, + { + "epoch": 0.9111996537219342, + "grad_norm": 1.1479729413986206, + "learning_rate": 3.865292018786959e-07, + "loss": 0.2985, + "step": 41050 + }, + { + "epoch": 0.9113106402814619, + "grad_norm": 1.2646464109420776, + "learning_rate": 3.8556973714407366e-07, + "loss": 0.3134, + "step": 41055 + }, + { + "epoch": 0.9114216268409896, + "grad_norm": 1.2283658981323242, + "learning_rate": 3.846114412996393e-07, + "loss": 0.3191, + "step": 41060 + }, + { + "epoch": 0.9115326134005172, + "grad_norm": 1.0735255479812622, + "learning_rate": 3.836543144618976e-07, + "loss": 0.3153, + "step": 41065 + }, + { + "epoch": 0.9116435999600448, + "grad_norm": 1.5772662162780762, + "learning_rate": 3.826983567472131e-07, + "loss": 0.3514, + "step": 41070 + }, + { + "epoch": 0.9117545865195725, + "grad_norm": 1.1826890707015991, + "learning_rate": 3.817435682718096e-07, + "loss": 0.4131, + "step": 41075 + }, + { + "epoch": 0.9118655730791001, + "grad_norm": 1.1610921621322632, + "learning_rate": 3.8078994915176526e-07, + "loss": 0.506, + "step": 41080 + }, + { + "epoch": 0.9119765596386278, + "grad_norm": 0.877407431602478, + "learning_rate": 3.7983749950301965e-07, + "loss": 0.3275, + "step": 41085 + }, + { + "epoch": 0.9120875461981554, + "grad_norm": 1.2805155515670776, + "learning_rate": 3.788862194413667e-07, + "loss": 0.4192, + "step": 41090 + }, + { + "epoch": 0.912198532757683, + "grad_norm": 0.8939440250396729, + "learning_rate": 3.779361090824596e-07, + "loss": 0.3885, + "step": 41095 + }, + { + "epoch": 0.9123095193172107, + "grad_norm": 1.0157679319381714, + "learning_rate": 3.769871685418114e-07, + "loss": 0.3205, + "step": 41100 + }, + { + "epoch": 0.9124205058767383, + "grad_norm": 1.2212369441986084, + "learning_rate": 3.760393979347898e-07, + "loss": 0.33, + "step": 41105 + }, + { + "epoch": 0.912531492436266, + "grad_norm": 1.968551516532898, + "learning_rate": 3.7509279737662276e-07, + "loss": 0.2711, + "step": 41110 + }, + { + "epoch": 0.9126424789957936, + "grad_norm": 1.060575008392334, + "learning_rate": 3.741473669823936e-07, + "loss": 0.3427, + "step": 41115 + }, + { + "epoch": 0.9127534655553212, + "grad_norm": 1.3390034437179565, + "learning_rate": 3.7320310686704497e-07, + "loss": 0.3595, + "step": 41120 + }, + { + "epoch": 0.9128644521148489, + "grad_norm": 1.1845945119857788, + "learning_rate": 3.722600171453761e-07, + "loss": 0.4191, + "step": 41125 + }, + { + "epoch": 0.9129754386743766, + "grad_norm": 0.7942050099372864, + "learning_rate": 3.713180979320463e-07, + "loss": 0.3634, + "step": 41130 + }, + { + "epoch": 0.9130864252339042, + "grad_norm": 1.055257797241211, + "learning_rate": 3.7037734934156967e-07, + "loss": 0.4133, + "step": 41135 + }, + { + "epoch": 0.9131974117934318, + "grad_norm": 0.9896296858787537, + "learning_rate": 3.6943777148831907e-07, + "loss": 0.4523, + "step": 41140 + }, + { + "epoch": 0.9133083983529594, + "grad_norm": 1.3810358047485352, + "learning_rate": 3.684993644865276e-07, + "loss": 0.5251, + "step": 41145 + }, + { + "epoch": 0.9134193849124871, + "grad_norm": 1.073945164680481, + "learning_rate": 3.6756212845028063e-07, + "loss": 0.4773, + "step": 41150 + }, + { + "epoch": 0.9135303714720148, + "grad_norm": 1.60965096950531, + "learning_rate": 3.666260634935248e-07, + "loss": 0.3653, + "step": 41155 + }, + { + "epoch": 0.9136413580315423, + "grad_norm": 1.5314462184906006, + "learning_rate": 3.656911697300658e-07, + "loss": 0.3599, + "step": 41160 + }, + { + "epoch": 0.91375234459107, + "grad_norm": 0.9583492279052734, + "learning_rate": 3.6475744727356157e-07, + "loss": 0.3483, + "step": 41165 + }, + { + "epoch": 0.9138633311505977, + "grad_norm": 1.8657221794128418, + "learning_rate": 3.6382489623753457e-07, + "loss": 0.3817, + "step": 41170 + }, + { + "epoch": 0.9139743177101253, + "grad_norm": 0.9618664979934692, + "learning_rate": 3.628935167353587e-07, + "loss": 0.2427, + "step": 41175 + }, + { + "epoch": 0.914085304269653, + "grad_norm": 1.2513151168823242, + "learning_rate": 3.6196330888026764e-07, + "loss": 0.4282, + "step": 41180 + }, + { + "epoch": 0.9141962908291806, + "grad_norm": 1.128990888595581, + "learning_rate": 3.6103427278535665e-07, + "loss": 0.4141, + "step": 41185 + }, + { + "epoch": 0.9143072773887082, + "grad_norm": 0.7350396513938904, + "learning_rate": 3.601064085635697e-07, + "loss": 0.2665, + "step": 41190 + }, + { + "epoch": 0.9144182639482359, + "grad_norm": 1.0335029363632202, + "learning_rate": 3.5917971632771773e-07, + "loss": 0.4452, + "step": 41195 + }, + { + "epoch": 0.9145292505077635, + "grad_norm": 1.4310749769210815, + "learning_rate": 3.5825419619046176e-07, + "loss": 0.2763, + "step": 41200 + }, + { + "epoch": 0.9146402370672911, + "grad_norm": 1.1692556142807007, + "learning_rate": 3.573298482643262e-07, + "loss": 0.4406, + "step": 41205 + }, + { + "epoch": 0.9147512236268188, + "grad_norm": 1.2683709859848022, + "learning_rate": 3.564066726616866e-07, + "loss": 0.4272, + "step": 41210 + }, + { + "epoch": 0.9148622101863464, + "grad_norm": 1.0134937763214111, + "learning_rate": 3.5548466949478445e-07, + "loss": 0.1871, + "step": 41215 + }, + { + "epoch": 0.9149731967458741, + "grad_norm": 1.2475417852401733, + "learning_rate": 3.545638388757078e-07, + "loss": 0.3767, + "step": 41220 + }, + { + "epoch": 0.9150841833054018, + "grad_norm": 1.4357985258102417, + "learning_rate": 3.5364418091641374e-07, + "loss": 0.4015, + "step": 41225 + }, + { + "epoch": 0.9151951698649293, + "grad_norm": 0.8965421319007874, + "learning_rate": 3.5272569572870727e-07, + "loss": 0.3689, + "step": 41230 + }, + { + "epoch": 0.915306156424457, + "grad_norm": 1.8212131261825562, + "learning_rate": 3.5180838342425803e-07, + "loss": 0.4141, + "step": 41235 + }, + { + "epoch": 0.9154171429839847, + "grad_norm": 1.3277180194854736, + "learning_rate": 3.508922441145857e-07, + "loss": 0.1806, + "step": 41240 + }, + { + "epoch": 0.9155281295435123, + "grad_norm": 0.8737463355064392, + "learning_rate": 3.499772779110766e-07, + "loss": 0.291, + "step": 41245 + }, + { + "epoch": 0.9156391161030399, + "grad_norm": 0.8136932253837585, + "learning_rate": 3.490634849249641e-07, + "loss": 0.4348, + "step": 41250 + }, + { + "epoch": 0.9157501026625675, + "grad_norm": 1.7655400037765503, + "learning_rate": 3.4815086526734707e-07, + "loss": 0.4404, + "step": 41255 + }, + { + "epoch": 0.9158610892220952, + "grad_norm": 1.2167655229568481, + "learning_rate": 3.4723941904917793e-07, + "loss": 0.3053, + "step": 41260 + }, + { + "epoch": 0.9159720757816229, + "grad_norm": 1.5969772338867188, + "learning_rate": 3.463291463812668e-07, + "loss": 0.3528, + "step": 41265 + }, + { + "epoch": 0.9160830623411504, + "grad_norm": 1.587766408920288, + "learning_rate": 3.4542004737428304e-07, + "loss": 0.34, + "step": 41270 + }, + { + "epoch": 0.9161940489006781, + "grad_norm": 2.0868446826934814, + "learning_rate": 3.4451212213875043e-07, + "loss": 0.2913, + "step": 41275 + }, + { + "epoch": 0.9163050354602058, + "grad_norm": 1.0023850202560425, + "learning_rate": 3.436053707850528e-07, + "loss": 0.32, + "step": 41280 + }, + { + "epoch": 0.9164160220197334, + "grad_norm": 3.024143934249878, + "learning_rate": 3.4269979342342776e-07, + "loss": 0.3368, + "step": 41285 + }, + { + "epoch": 0.9165270085792611, + "grad_norm": 1.2240880727767944, + "learning_rate": 3.417953901639759e-07, + "loss": 0.2696, + "step": 41290 + }, + { + "epoch": 0.9166379951387887, + "grad_norm": 1.0215989351272583, + "learning_rate": 3.408921611166471e-07, + "loss": 0.329, + "step": 41295 + }, + { + "epoch": 0.9167489816983163, + "grad_norm": 1.2344481945037842, + "learning_rate": 3.399901063912558e-07, + "loss": 0.4592, + "step": 41300 + }, + { + "epoch": 0.916859968257844, + "grad_norm": 1.5032986402511597, + "learning_rate": 3.390892260974721e-07, + "loss": 0.5914, + "step": 41305 + }, + { + "epoch": 0.9169709548173716, + "grad_norm": 1.5667091608047485, + "learning_rate": 3.381895203448182e-07, + "loss": 0.4586, + "step": 41310 + }, + { + "epoch": 0.9170819413768992, + "grad_norm": 1.53379225730896, + "learning_rate": 3.372909892426801e-07, + "loss": 0.3376, + "step": 41315 + }, + { + "epoch": 0.9171929279364269, + "grad_norm": 1.1867161989212036, + "learning_rate": 3.363936329002959e-07, + "loss": 0.4196, + "step": 41320 + }, + { + "epoch": 0.9173039144959545, + "grad_norm": 1.3190207481384277, + "learning_rate": 3.3549745142676614e-07, + "loss": 0.2724, + "step": 41325 + }, + { + "epoch": 0.9174149010554822, + "grad_norm": 1.7920886278152466, + "learning_rate": 3.3460244493104365e-07, + "loss": 0.2532, + "step": 41330 + }, + { + "epoch": 0.9175258876150099, + "grad_norm": 1.1654620170593262, + "learning_rate": 3.3370861352194026e-07, + "loss": 0.3344, + "step": 41335 + }, + { + "epoch": 0.9176368741745374, + "grad_norm": 0.8272110223770142, + "learning_rate": 3.328159573081258e-07, + "loss": 0.4368, + "step": 41340 + }, + { + "epoch": 0.9177478607340651, + "grad_norm": 0.7133996486663818, + "learning_rate": 3.319244763981255e-07, + "loss": 0.3415, + "step": 41345 + }, + { + "epoch": 0.9178588472935928, + "grad_norm": 0.7516515851020813, + "learning_rate": 3.3103417090032287e-07, + "loss": 0.4044, + "step": 41350 + }, + { + "epoch": 0.9179698338531204, + "grad_norm": 0.8074901103973389, + "learning_rate": 3.3014504092296006e-07, + "loss": 0.2569, + "step": 41355 + }, + { + "epoch": 0.918080820412648, + "grad_norm": 0.8953653573989868, + "learning_rate": 3.292570865741318e-07, + "loss": 0.5623, + "step": 41360 + }, + { + "epoch": 0.9181918069721756, + "grad_norm": 2.03694748878479, + "learning_rate": 3.2837030796179503e-07, + "loss": 0.5488, + "step": 41365 + }, + { + "epoch": 0.9183027935317033, + "grad_norm": 1.2388249635696411, + "learning_rate": 3.274847051937591e-07, + "loss": 0.4, + "step": 41370 + }, + { + "epoch": 0.918413780091231, + "grad_norm": 0.5898800492286682, + "learning_rate": 3.2660027837769446e-07, + "loss": 0.3577, + "step": 41375 + }, + { + "epoch": 0.9185247666507586, + "grad_norm": 0.9966252446174622, + "learning_rate": 3.257170276211241e-07, + "loss": 0.3671, + "step": 41380 + }, + { + "epoch": 0.9186357532102862, + "grad_norm": 0.8349930047988892, + "learning_rate": 3.248349530314332e-07, + "loss": 0.4678, + "step": 41385 + }, + { + "epoch": 0.9187467397698139, + "grad_norm": 1.071436882019043, + "learning_rate": 3.239540547158626e-07, + "loss": 0.4007, + "step": 41390 + }, + { + "epoch": 0.9188577263293415, + "grad_norm": 0.9444680213928223, + "learning_rate": 3.2307433278150445e-07, + "loss": 0.3086, + "step": 41395 + }, + { + "epoch": 0.9189687128888692, + "grad_norm": 1.670590877532959, + "learning_rate": 3.2219578733531655e-07, + "loss": 0.2858, + "step": 41400 + }, + { + "epoch": 0.9190796994483968, + "grad_norm": 1.7639275789260864, + "learning_rate": 3.213184184841078e-07, + "loss": 0.4105, + "step": 41405 + }, + { + "epoch": 0.9191906860079244, + "grad_norm": 1.3352954387664795, + "learning_rate": 3.2044222633454525e-07, + "loss": 0.4735, + "step": 41410 + }, + { + "epoch": 0.9193016725674521, + "grad_norm": 0.9515475034713745, + "learning_rate": 3.195672109931558e-07, + "loss": 0.2875, + "step": 41415 + }, + { + "epoch": 0.9194126591269797, + "grad_norm": 1.4635220766067505, + "learning_rate": 3.186933725663166e-07, + "loss": 0.3685, + "step": 41420 + }, + { + "epoch": 0.9195236456865074, + "grad_norm": 1.1768747568130493, + "learning_rate": 3.1782071116027156e-07, + "loss": 0.4472, + "step": 41425 + }, + { + "epoch": 0.919634632246035, + "grad_norm": 1.2579537630081177, + "learning_rate": 3.169492268811103e-07, + "loss": 0.2951, + "step": 41430 + }, + { + "epoch": 0.9197456188055626, + "grad_norm": 1.3200901746749878, + "learning_rate": 3.160789198347891e-07, + "loss": 0.4369, + "step": 41435 + }, + { + "epoch": 0.9198566053650903, + "grad_norm": 1.4001281261444092, + "learning_rate": 3.152097901271156e-07, + "loss": 0.3694, + "step": 41440 + }, + { + "epoch": 0.919967591924618, + "grad_norm": 1.1858893632888794, + "learning_rate": 3.1434183786375416e-07, + "loss": 0.5221, + "step": 41445 + }, + { + "epoch": 0.9200785784841455, + "grad_norm": 0.99980628490448, + "learning_rate": 3.1347506315023036e-07, + "loss": 0.3342, + "step": 41450 + }, + { + "epoch": 0.9201895650436732, + "grad_norm": 1.8966047763824463, + "learning_rate": 3.12609466091921e-07, + "loss": 0.3557, + "step": 41455 + }, + { + "epoch": 0.9203005516032009, + "grad_norm": 1.1274603605270386, + "learning_rate": 3.117450467940653e-07, + "loss": 0.3224, + "step": 41460 + }, + { + "epoch": 0.9204115381627285, + "grad_norm": 1.2839633226394653, + "learning_rate": 3.108818053617557e-07, + "loss": 0.3317, + "step": 41465 + }, + { + "epoch": 0.9205225247222562, + "grad_norm": 0.5879912972450256, + "learning_rate": 3.1001974189993845e-07, + "loss": 0.3073, + "step": 41470 + }, + { + "epoch": 0.9206335112817837, + "grad_norm": 1.3780027627944946, + "learning_rate": 3.0915885651342624e-07, + "loss": 0.5207, + "step": 41475 + }, + { + "epoch": 0.9207444978413114, + "grad_norm": 1.2077401876449585, + "learning_rate": 3.0829914930687767e-07, + "loss": 0.3862, + "step": 41480 + }, + { + "epoch": 0.9208554844008391, + "grad_norm": 1.1930087804794312, + "learning_rate": 3.0744062038481573e-07, + "loss": 0.3539, + "step": 41485 + }, + { + "epoch": 0.9209664709603667, + "grad_norm": 1.0110259056091309, + "learning_rate": 3.065832698516169e-07, + "loss": 0.4615, + "step": 41490 + }, + { + "epoch": 0.9210774575198943, + "grad_norm": 0.975702702999115, + "learning_rate": 3.0572709781151344e-07, + "loss": 0.3641, + "step": 41495 + }, + { + "epoch": 0.921188444079422, + "grad_norm": 0.9686071872711182, + "learning_rate": 3.0487210436859985e-07, + "loss": 0.4104, + "step": 41500 + }, + { + "epoch": 0.9212994306389496, + "grad_norm": 1.383853554725647, + "learning_rate": 3.0401828962681955e-07, + "loss": 0.4135, + "step": 41505 + }, + { + "epoch": 0.9214104171984773, + "grad_norm": 1.5861616134643555, + "learning_rate": 3.031656536899785e-07, + "loss": 0.5269, + "step": 41510 + }, + { + "epoch": 0.921521403758005, + "grad_norm": 2.3221077919006348, + "learning_rate": 3.023141966617349e-07, + "loss": 0.384, + "step": 41515 + }, + { + "epoch": 0.9216323903175325, + "grad_norm": 1.989410638809204, + "learning_rate": 3.0146391864560917e-07, + "loss": 0.5662, + "step": 41520 + }, + { + "epoch": 0.9217433768770602, + "grad_norm": 1.0644265413284302, + "learning_rate": 3.006148197449743e-07, + "loss": 0.5402, + "step": 41525 + }, + { + "epoch": 0.9218543634365878, + "grad_norm": 1.6977014541625977, + "learning_rate": 2.9976690006305877e-07, + "loss": 0.336, + "step": 41530 + }, + { + "epoch": 0.9219653499961155, + "grad_norm": 1.9462661743164062, + "learning_rate": 2.989201597029534e-07, + "loss": 0.4512, + "step": 41535 + }, + { + "epoch": 0.9220763365556431, + "grad_norm": 1.5462268590927124, + "learning_rate": 2.9807459876759817e-07, + "loss": 0.4801, + "step": 41540 + }, + { + "epoch": 0.9221873231151707, + "grad_norm": 1.7612812519073486, + "learning_rate": 2.972302173597974e-07, + "loss": 0.5732, + "step": 41545 + }, + { + "epoch": 0.9222983096746984, + "grad_norm": 1.0951625108718872, + "learning_rate": 2.963870155822046e-07, + "loss": 0.4075, + "step": 41550 + }, + { + "epoch": 0.9224092962342261, + "grad_norm": 1.443281888961792, + "learning_rate": 2.955449935373356e-07, + "loss": 0.2509, + "step": 41555 + }, + { + "epoch": 0.9225202827937536, + "grad_norm": 0.3948124349117279, + "learning_rate": 2.947041513275606e-07, + "loss": 0.3246, + "step": 41560 + }, + { + "epoch": 0.9226312693532813, + "grad_norm": 1.4538019895553589, + "learning_rate": 2.938644890551057e-07, + "loss": 0.4642, + "step": 41565 + }, + { + "epoch": 0.922742255912809, + "grad_norm": 1.6253353357315063, + "learning_rate": 2.9302600682205473e-07, + "loss": 0.4874, + "step": 41570 + }, + { + "epoch": 0.9228532424723366, + "grad_norm": 1.0660721063613892, + "learning_rate": 2.921887047303462e-07, + "loss": 0.4034, + "step": 41575 + }, + { + "epoch": 0.9229642290318643, + "grad_norm": 1.0873395204544067, + "learning_rate": 2.9135258288177757e-07, + "loss": 0.4524, + "step": 41580 + }, + { + "epoch": 0.9230752155913918, + "grad_norm": 1.0553845167160034, + "learning_rate": 2.9051764137800195e-07, + "loss": 0.5371, + "step": 41585 + }, + { + "epoch": 0.9231862021509195, + "grad_norm": 1.0221503973007202, + "learning_rate": 2.896838803205282e-07, + "loss": 0.4071, + "step": 41590 + }, + { + "epoch": 0.9232971887104472, + "grad_norm": 0.7316520810127258, + "learning_rate": 2.8885129981072403e-07, + "loss": 0.3178, + "step": 41595 + }, + { + "epoch": 0.9234081752699748, + "grad_norm": 1.321444034576416, + "learning_rate": 2.880198999498074e-07, + "loss": 0.3049, + "step": 41600 + }, + { + "epoch": 0.9235191618295024, + "grad_norm": 1.0900516510009766, + "learning_rate": 2.871896808388608e-07, + "loss": 0.4049, + "step": 41605 + }, + { + "epoch": 0.9236301483890301, + "grad_norm": 1.1955881118774414, + "learning_rate": 2.863606425788201e-07, + "loss": 0.2784, + "step": 41610 + }, + { + "epoch": 0.9237411349485577, + "grad_norm": 1.5617562532424927, + "learning_rate": 2.855327852704737e-07, + "loss": 0.4223, + "step": 41615 + }, + { + "epoch": 0.9238521215080854, + "grad_norm": 1.5575618743896484, + "learning_rate": 2.847061090144732e-07, + "loss": 0.5853, + "step": 41620 + }, + { + "epoch": 0.9239631080676131, + "grad_norm": 0.6329960227012634, + "learning_rate": 2.838806139113204e-07, + "loss": 0.3444, + "step": 41625 + }, + { + "epoch": 0.9240740946271406, + "grad_norm": 1.4175231456756592, + "learning_rate": 2.830563000613773e-07, + "loss": 0.4397, + "step": 41630 + }, + { + "epoch": 0.9241850811866683, + "grad_norm": 0.7703967690467834, + "learning_rate": 2.822331675648604e-07, + "loss": 0.3143, + "step": 41635 + }, + { + "epoch": 0.9242960677461959, + "grad_norm": 0.6176158785820007, + "learning_rate": 2.8141121652184413e-07, + "loss": 0.2804, + "step": 41640 + }, + { + "epoch": 0.9244070543057236, + "grad_norm": 0.8418532609939575, + "learning_rate": 2.8059044703226066e-07, + "loss": 0.3661, + "step": 41645 + }, + { + "epoch": 0.9245180408652512, + "grad_norm": 1.2525098323822021, + "learning_rate": 2.7977085919589253e-07, + "loss": 0.3764, + "step": 41650 + }, + { + "epoch": 0.9246290274247788, + "grad_norm": 2.123753547668457, + "learning_rate": 2.789524531123844e-07, + "loss": 0.5501, + "step": 41655 + }, + { + "epoch": 0.9247400139843065, + "grad_norm": 1.645007848739624, + "learning_rate": 2.781352288812356e-07, + "loss": 0.447, + "step": 41660 + }, + { + "epoch": 0.9248510005438342, + "grad_norm": 1.2815983295440674, + "learning_rate": 2.773191866017999e-07, + "loss": 0.555, + "step": 41665 + }, + { + "epoch": 0.9249619871033617, + "grad_norm": 1.142004132270813, + "learning_rate": 2.765043263732914e-07, + "loss": 0.3564, + "step": 41670 + }, + { + "epoch": 0.9250729736628894, + "grad_norm": 1.6616061925888062, + "learning_rate": 2.7569064829477633e-07, + "loss": 0.449, + "step": 41675 + }, + { + "epoch": 0.9251839602224171, + "grad_norm": 0.8066574335098267, + "learning_rate": 2.7487815246518e-07, + "loss": 0.3865, + "step": 41680 + }, + { + "epoch": 0.9252949467819447, + "grad_norm": 1.1873153448104858, + "learning_rate": 2.7406683898328125e-07, + "loss": 0.2746, + "step": 41685 + }, + { + "epoch": 0.9254059333414724, + "grad_norm": 1.0035192966461182, + "learning_rate": 2.7325670794771887e-07, + "loss": 0.2461, + "step": 41690 + }, + { + "epoch": 0.9255169199009999, + "grad_norm": 0.9507527351379395, + "learning_rate": 2.724477594569852e-07, + "loss": 0.4659, + "step": 41695 + }, + { + "epoch": 0.9256279064605276, + "grad_norm": 0.8072733879089355, + "learning_rate": 2.716399936094294e-07, + "loss": 0.3659, + "step": 41700 + }, + { + "epoch": 0.9257388930200553, + "grad_norm": 1.3433468341827393, + "learning_rate": 2.708334105032551e-07, + "loss": 0.5524, + "step": 41705 + }, + { + "epoch": 0.9258498795795829, + "grad_norm": 1.2383370399475098, + "learning_rate": 2.7002801023652715e-07, + "loss": 0.2805, + "step": 41710 + }, + { + "epoch": 0.9259608661391105, + "grad_norm": 0.6843294501304626, + "learning_rate": 2.692237929071617e-07, + "loss": 0.4555, + "step": 41715 + }, + { + "epoch": 0.9260718526986382, + "grad_norm": 1.0873684883117676, + "learning_rate": 2.684207586129328e-07, + "loss": 0.4586, + "step": 41720 + }, + { + "epoch": 0.9261828392581658, + "grad_norm": 0.7387077212333679, + "learning_rate": 2.676189074514712e-07, + "loss": 0.2936, + "step": 41725 + }, + { + "epoch": 0.9262938258176935, + "grad_norm": 1.1081559658050537, + "learning_rate": 2.6681823952026343e-07, + "loss": 0.3782, + "step": 41730 + }, + { + "epoch": 0.9264048123772212, + "grad_norm": 0.7738690376281738, + "learning_rate": 2.660187549166504e-07, + "loss": 0.3472, + "step": 41735 + }, + { + "epoch": 0.9265157989367487, + "grad_norm": 0.8234603404998779, + "learning_rate": 2.6522045373783223e-07, + "loss": 0.3328, + "step": 41740 + }, + { + "epoch": 0.9266267854962764, + "grad_norm": 1.5815513134002686, + "learning_rate": 2.6442333608086344e-07, + "loss": 0.3223, + "step": 41745 + }, + { + "epoch": 0.926737772055804, + "grad_norm": 1.6585302352905273, + "learning_rate": 2.6362740204265434e-07, + "loss": 0.5519, + "step": 41750 + }, + { + "epoch": 0.9268487586153317, + "grad_norm": 1.0758693218231201, + "learning_rate": 2.628326517199731e-07, + "loss": 0.3138, + "step": 41755 + }, + { + "epoch": 0.9269597451748594, + "grad_norm": 2.1765646934509277, + "learning_rate": 2.6203908520944234e-07, + "loss": 0.36, + "step": 41760 + }, + { + "epoch": 0.9270707317343869, + "grad_norm": 1.0552098751068115, + "learning_rate": 2.6124670260754046e-07, + "loss": 0.3435, + "step": 41765 + }, + { + "epoch": 0.9271817182939146, + "grad_norm": 1.592545986175537, + "learning_rate": 2.6045550401060383e-07, + "loss": 0.4286, + "step": 41770 + }, + { + "epoch": 0.9272927048534423, + "grad_norm": 1.1659295558929443, + "learning_rate": 2.5966548951482206e-07, + "loss": 0.4308, + "step": 41775 + }, + { + "epoch": 0.9274036914129699, + "grad_norm": 0.8281853199005127, + "learning_rate": 2.58876659216245e-07, + "loss": 0.361, + "step": 41780 + }, + { + "epoch": 0.9275146779724975, + "grad_norm": 0.868346095085144, + "learning_rate": 2.5808901321077274e-07, + "loss": 0.3813, + "step": 41785 + }, + { + "epoch": 0.9276256645320252, + "grad_norm": 1.7827457189559937, + "learning_rate": 2.573025515941685e-07, + "loss": 0.5315, + "step": 41790 + }, + { + "epoch": 0.9277366510915528, + "grad_norm": 0.8157330751419067, + "learning_rate": 2.5651727446204364e-07, + "loss": 0.3371, + "step": 41795 + }, + { + "epoch": 0.9278476376510805, + "grad_norm": 1.5221593379974365, + "learning_rate": 2.5573318190987186e-07, + "loss": 0.3624, + "step": 41800 + }, + { + "epoch": 0.927958624210608, + "grad_norm": 1.3963264226913452, + "learning_rate": 2.549502740329812e-07, + "loss": 0.3376, + "step": 41805 + }, + { + "epoch": 0.9280696107701357, + "grad_norm": 1.3674664497375488, + "learning_rate": 2.5416855092655345e-07, + "loss": 0.4054, + "step": 41810 + }, + { + "epoch": 0.9281805973296634, + "grad_norm": 1.2279380559921265, + "learning_rate": 2.533880126856281e-07, + "loss": 0.3089, + "step": 41815 + }, + { + "epoch": 0.928291583889191, + "grad_norm": 0.9081880450248718, + "learning_rate": 2.5260865940510027e-07, + "loss": 0.3848, + "step": 41820 + }, + { + "epoch": 0.9284025704487187, + "grad_norm": 1.3663815259933472, + "learning_rate": 2.51830491179722e-07, + "loss": 0.3177, + "step": 41825 + }, + { + "epoch": 0.9285135570082463, + "grad_norm": 1.113681674003601, + "learning_rate": 2.510535081040999e-07, + "loss": 0.3978, + "step": 41830 + }, + { + "epoch": 0.9286245435677739, + "grad_norm": 1.3013014793395996, + "learning_rate": 2.502777102726961e-07, + "loss": 0.3886, + "step": 41835 + }, + { + "epoch": 0.9287355301273016, + "grad_norm": 1.1549102067947388, + "learning_rate": 2.4950309777983074e-07, + "loss": 0.2329, + "step": 41840 + }, + { + "epoch": 0.9288465166868293, + "grad_norm": 1.560652256011963, + "learning_rate": 2.4872967071967736e-07, + "loss": 0.4282, + "step": 41845 + }, + { + "epoch": 0.9289575032463568, + "grad_norm": 1.8950015306472778, + "learning_rate": 2.479574291862685e-07, + "loss": 0.4252, + "step": 41850 + }, + { + "epoch": 0.9290684898058845, + "grad_norm": 1.6299108266830444, + "learning_rate": 2.4718637327348915e-07, + "loss": 0.4783, + "step": 41855 + }, + { + "epoch": 0.9291794763654121, + "grad_norm": 1.442406415939331, + "learning_rate": 2.4641650307508203e-07, + "loss": 0.3578, + "step": 41860 + }, + { + "epoch": 0.9292904629249398, + "grad_norm": 1.77949059009552, + "learning_rate": 2.456478186846456e-07, + "loss": 0.4453, + "step": 41865 + }, + { + "epoch": 0.9294014494844675, + "grad_norm": 1.6224652528762817, + "learning_rate": 2.44880320195634e-07, + "loss": 0.4472, + "step": 41870 + }, + { + "epoch": 0.929512436043995, + "grad_norm": 1.282048225402832, + "learning_rate": 2.4411400770135705e-07, + "loss": 0.3831, + "step": 41875 + }, + { + "epoch": 0.9296234226035227, + "grad_norm": 1.2435473203659058, + "learning_rate": 2.4334888129497915e-07, + "loss": 0.4324, + "step": 41880 + }, + { + "epoch": 0.9297344091630504, + "grad_norm": 1.9706709384918213, + "learning_rate": 2.425849410695236e-07, + "loss": 0.3785, + "step": 41885 + }, + { + "epoch": 0.929845395722578, + "grad_norm": 1.8323307037353516, + "learning_rate": 2.418221871178683e-07, + "loss": 0.3998, + "step": 41890 + }, + { + "epoch": 0.9299563822821056, + "grad_norm": 0.8738846778869629, + "learning_rate": 2.4106061953274363e-07, + "loss": 0.1959, + "step": 41895 + }, + { + "epoch": 0.9300673688416333, + "grad_norm": 1.2526519298553467, + "learning_rate": 2.40300238406741e-07, + "loss": 0.4916, + "step": 41900 + }, + { + "epoch": 0.9301783554011609, + "grad_norm": 1.3293545246124268, + "learning_rate": 2.3954104383230316e-07, + "loss": 0.3191, + "step": 41905 + }, + { + "epoch": 0.9302893419606886, + "grad_norm": 1.7949703931808472, + "learning_rate": 2.387830359017318e-07, + "loss": 0.4433, + "step": 41910 + }, + { + "epoch": 0.9304003285202161, + "grad_norm": 1.6385241746902466, + "learning_rate": 2.38026214707181e-07, + "loss": 0.2801, + "step": 41915 + }, + { + "epoch": 0.9305113150797438, + "grad_norm": 1.4388099908828735, + "learning_rate": 2.3727058034066497e-07, + "loss": 0.4695, + "step": 41920 + }, + { + "epoch": 0.9306223016392715, + "grad_norm": 1.20249342918396, + "learning_rate": 2.3651613289405019e-07, + "loss": 0.4453, + "step": 41925 + }, + { + "epoch": 0.9307332881987991, + "grad_norm": 1.9365683794021606, + "learning_rate": 2.3576287245905883e-07, + "loss": 0.352, + "step": 41930 + }, + { + "epoch": 0.9308442747583268, + "grad_norm": 0.9989916086196899, + "learning_rate": 2.350107991272721e-07, + "loss": 0.4682, + "step": 41935 + }, + { + "epoch": 0.9309552613178544, + "grad_norm": 1.4359912872314453, + "learning_rate": 2.3425991299012241e-07, + "loss": 0.3343, + "step": 41940 + }, + { + "epoch": 0.931066247877382, + "grad_norm": 1.1598297357559204, + "learning_rate": 2.335102141389012e-07, + "loss": 0.3479, + "step": 41945 + }, + { + "epoch": 0.9311772344369097, + "grad_norm": 1.731669306755066, + "learning_rate": 2.327617026647533e-07, + "loss": 0.4167, + "step": 41950 + }, + { + "epoch": 0.9312882209964374, + "grad_norm": 1.6868865489959717, + "learning_rate": 2.3201437865868148e-07, + "loss": 0.3485, + "step": 41955 + }, + { + "epoch": 0.931399207555965, + "grad_norm": 0.9299235939979553, + "learning_rate": 2.3126824221154199e-07, + "loss": 0.3797, + "step": 41960 + }, + { + "epoch": 0.9315101941154926, + "grad_norm": 1.5356875658035278, + "learning_rate": 2.3052329341404777e-07, + "loss": 0.2368, + "step": 41965 + }, + { + "epoch": 0.9316211806750202, + "grad_norm": 1.2975677251815796, + "learning_rate": 2.2977953235676642e-07, + "loss": 0.4181, + "step": 41970 + }, + { + "epoch": 0.9317321672345479, + "grad_norm": 0.5879126191139221, + "learning_rate": 2.2903695913012448e-07, + "loss": 0.1615, + "step": 41975 + }, + { + "epoch": 0.9318431537940756, + "grad_norm": 1.8237122297286987, + "learning_rate": 2.282955738243986e-07, + "loss": 0.2599, + "step": 41980 + }, + { + "epoch": 0.9319541403536031, + "grad_norm": 0.8162139058113098, + "learning_rate": 2.2755537652972558e-07, + "loss": 0.3054, + "step": 41985 + }, + { + "epoch": 0.9320651269131308, + "grad_norm": 1.2494746446609497, + "learning_rate": 2.2681636733609457e-07, + "loss": 0.3428, + "step": 41990 + }, + { + "epoch": 0.9321761134726585, + "grad_norm": 0.41989076137542725, + "learning_rate": 2.260785463333548e-07, + "loss": 0.3983, + "step": 41995 + }, + { + "epoch": 0.9322871000321861, + "grad_norm": 0.9777684807777405, + "learning_rate": 2.253419136112045e-07, + "loss": 0.3267, + "step": 42000 + }, + { + "epoch": 0.9323980865917137, + "grad_norm": 1.2072054147720337, + "learning_rate": 2.2460646925920315e-07, + "loss": 0.4389, + "step": 42005 + }, + { + "epoch": 0.9325090731512414, + "grad_norm": 1.2655891180038452, + "learning_rate": 2.238722133667637e-07, + "loss": 0.4491, + "step": 42010 + }, + { + "epoch": 0.932620059710769, + "grad_norm": 1.3673769235610962, + "learning_rate": 2.2313914602315245e-07, + "loss": 0.3805, + "step": 42015 + }, + { + "epoch": 0.9327310462702967, + "grad_norm": 1.3959051370620728, + "learning_rate": 2.2240726731749707e-07, + "loss": 0.3748, + "step": 42020 + }, + { + "epoch": 0.9328420328298244, + "grad_norm": 1.0409404039382935, + "learning_rate": 2.21676577338773e-07, + "loss": 0.3834, + "step": 42025 + }, + { + "epoch": 0.9329530193893519, + "grad_norm": 1.689075231552124, + "learning_rate": 2.20947076175817e-07, + "loss": 0.2738, + "step": 42030 + }, + { + "epoch": 0.9330640059488796, + "grad_norm": 0.9436668157577515, + "learning_rate": 2.2021876391731922e-07, + "loss": 0.5671, + "step": 42035 + }, + { + "epoch": 0.9331749925084072, + "grad_norm": 1.3573400974273682, + "learning_rate": 2.1949164065182548e-07, + "loss": 0.4767, + "step": 42040 + }, + { + "epoch": 0.9332859790679349, + "grad_norm": 0.9591343998908997, + "learning_rate": 2.187657064677362e-07, + "loss": 0.2737, + "step": 42045 + }, + { + "epoch": 0.9333969656274625, + "grad_norm": 0.7570490837097168, + "learning_rate": 2.1804096145330856e-07, + "loss": 0.4176, + "step": 42050 + }, + { + "epoch": 0.9335079521869901, + "grad_norm": 1.4021674394607544, + "learning_rate": 2.1731740569665538e-07, + "loss": 0.4511, + "step": 42055 + }, + { + "epoch": 0.9336189387465178, + "grad_norm": 1.2997996807098389, + "learning_rate": 2.165950392857441e-07, + "loss": 0.3495, + "step": 42060 + }, + { + "epoch": 0.9337299253060455, + "grad_norm": 1.744541049003601, + "learning_rate": 2.1587386230839558e-07, + "loss": 0.3737, + "step": 42065 + }, + { + "epoch": 0.933840911865573, + "grad_norm": 1.6680949926376343, + "learning_rate": 2.1515387485228968e-07, + "loss": 0.3786, + "step": 42070 + }, + { + "epoch": 0.9339518984251007, + "grad_norm": 2.040933847427368, + "learning_rate": 2.1443507700495968e-07, + "loss": 0.313, + "step": 42075 + }, + { + "epoch": 0.9340628849846284, + "grad_norm": 1.7324872016906738, + "learning_rate": 2.1371746885379575e-07, + "loss": 0.4825, + "step": 42080 + }, + { + "epoch": 0.934173871544156, + "grad_norm": 1.2083805799484253, + "learning_rate": 2.1300105048604136e-07, + "loss": 0.473, + "step": 42085 + }, + { + "epoch": 0.9342848581036837, + "grad_norm": 1.1241426467895508, + "learning_rate": 2.1228582198879467e-07, + "loss": 0.3938, + "step": 42090 + }, + { + "epoch": 0.9343958446632112, + "grad_norm": 1.544179916381836, + "learning_rate": 2.1157178344901385e-07, + "loss": 0.4666, + "step": 42095 + }, + { + "epoch": 0.9345068312227389, + "grad_norm": 0.7302343249320984, + "learning_rate": 2.1085893495350729e-07, + "loss": 0.4213, + "step": 42100 + }, + { + "epoch": 0.9346178177822666, + "grad_norm": 1.4219212532043457, + "learning_rate": 2.101472765889412e-07, + "loss": 0.4218, + "step": 42105 + }, + { + "epoch": 0.9347288043417942, + "grad_norm": 1.3587608337402344, + "learning_rate": 2.0943680844183633e-07, + "loss": 0.4437, + "step": 42110 + }, + { + "epoch": 0.9348397909013219, + "grad_norm": 0.9834860563278198, + "learning_rate": 2.087275305985692e-07, + "loss": 0.3333, + "step": 42115 + }, + { + "epoch": 0.9349507774608495, + "grad_norm": 1.191887617111206, + "learning_rate": 2.0801944314537192e-07, + "loss": 0.3715, + "step": 42120 + }, + { + "epoch": 0.9350617640203771, + "grad_norm": 1.459552526473999, + "learning_rate": 2.0731254616833008e-07, + "loss": 0.3668, + "step": 42125 + }, + { + "epoch": 0.9351727505799048, + "grad_norm": 0.7708008885383606, + "learning_rate": 2.0660683975338826e-07, + "loss": 0.3064, + "step": 42130 + }, + { + "epoch": 0.9352837371394325, + "grad_norm": 1.4403290748596191, + "learning_rate": 2.0590232398634114e-07, + "loss": 0.4275, + "step": 42135 + }, + { + "epoch": 0.93539472369896, + "grad_norm": 1.713077187538147, + "learning_rate": 2.0519899895284245e-07, + "loss": 0.3068, + "step": 42140 + }, + { + "epoch": 0.9355057102584877, + "grad_norm": 1.082633137702942, + "learning_rate": 2.0449686473840157e-07, + "loss": 0.3676, + "step": 42145 + }, + { + "epoch": 0.9356166968180153, + "grad_norm": 1.344906210899353, + "learning_rate": 2.0379592142837912e-07, + "loss": 0.3333, + "step": 42150 + }, + { + "epoch": 0.935727683377543, + "grad_norm": 0.6421421766281128, + "learning_rate": 2.0309616910799579e-07, + "loss": 0.3735, + "step": 42155 + }, + { + "epoch": 0.9358386699370707, + "grad_norm": 1.8165996074676514, + "learning_rate": 2.0239760786232355e-07, + "loss": 0.3604, + "step": 42160 + }, + { + "epoch": 0.9359496564965982, + "grad_norm": 1.6352641582489014, + "learning_rate": 2.017002377762911e-07, + "loss": 0.399, + "step": 42165 + }, + { + "epoch": 0.9360606430561259, + "grad_norm": 1.11709463596344, + "learning_rate": 2.010040589346829e-07, + "loss": 0.388, + "step": 42170 + }, + { + "epoch": 0.9361716296156536, + "grad_norm": 1.6944082975387573, + "learning_rate": 2.0030907142213784e-07, + "loss": 0.4607, + "step": 42175 + }, + { + "epoch": 0.9362826161751812, + "grad_norm": 1.1754425764083862, + "learning_rate": 1.996152753231506e-07, + "loss": 0.4429, + "step": 42180 + }, + { + "epoch": 0.9363936027347088, + "grad_norm": 0.9020445942878723, + "learning_rate": 1.989226707220704e-07, + "loss": 0.284, + "step": 42185 + }, + { + "epoch": 0.9365045892942365, + "grad_norm": 1.2843596935272217, + "learning_rate": 1.9823125770310204e-07, + "loss": 0.4293, + "step": 42190 + }, + { + "epoch": 0.9366155758537641, + "grad_norm": 1.1910959482192993, + "learning_rate": 1.9754103635030385e-07, + "loss": 0.5196, + "step": 42195 + }, + { + "epoch": 0.9367265624132918, + "grad_norm": 1.0260785818099976, + "learning_rate": 1.968520067475921e-07, + "loss": 0.3814, + "step": 42200 + }, + { + "epoch": 0.9368375489728193, + "grad_norm": 1.587637186050415, + "learning_rate": 1.9616416897873525e-07, + "loss": 0.3861, + "step": 42205 + }, + { + "epoch": 0.936948535532347, + "grad_norm": 1.1606448888778687, + "learning_rate": 1.9547752312735978e-07, + "loss": 0.4401, + "step": 42210 + }, + { + "epoch": 0.9370595220918747, + "grad_norm": 1.0315425395965576, + "learning_rate": 1.947920692769456e-07, + "loss": 0.4936, + "step": 42215 + }, + { + "epoch": 0.9371705086514023, + "grad_norm": 1.2617237567901611, + "learning_rate": 1.9410780751082714e-07, + "loss": 0.3557, + "step": 42220 + }, + { + "epoch": 0.93728149521093, + "grad_norm": 1.15226411819458, + "learning_rate": 1.9342473791219561e-07, + "loss": 0.3832, + "step": 42225 + }, + { + "epoch": 0.9373924817704576, + "grad_norm": 0.784106433391571, + "learning_rate": 1.9274286056409574e-07, + "loss": 0.5419, + "step": 42230 + }, + { + "epoch": 0.9375034683299852, + "grad_norm": 0.9966132044792175, + "learning_rate": 1.9206217554942675e-07, + "loss": 0.431, + "step": 42235 + }, + { + "epoch": 0.9376144548895129, + "grad_norm": 1.2382640838623047, + "learning_rate": 1.9138268295094687e-07, + "loss": 0.3532, + "step": 42240 + }, + { + "epoch": 0.9377254414490406, + "grad_norm": 1.321143627166748, + "learning_rate": 1.9070438285126337e-07, + "loss": 0.3812, + "step": 42245 + }, + { + "epoch": 0.9378364280085681, + "grad_norm": 2.318057060241699, + "learning_rate": 1.9002727533284358e-07, + "loss": 0.1845, + "step": 42250 + }, + { + "epoch": 0.9379474145680958, + "grad_norm": 1.220383882522583, + "learning_rate": 1.893513604780073e-07, + "loss": 0.314, + "step": 42255 + }, + { + "epoch": 0.9380584011276234, + "grad_norm": 1.221579909324646, + "learning_rate": 1.8867663836893092e-07, + "loss": 0.3582, + "step": 42260 + }, + { + "epoch": 0.9381693876871511, + "grad_norm": 1.1214572191238403, + "learning_rate": 1.880031090876444e-07, + "loss": 0.4177, + "step": 42265 + }, + { + "epoch": 0.9382803742466788, + "grad_norm": 1.2276601791381836, + "learning_rate": 1.8733077271603117e-07, + "loss": 0.3543, + "step": 42270 + }, + { + "epoch": 0.9383913608062063, + "grad_norm": 1.359606385231018, + "learning_rate": 1.8665962933583581e-07, + "loss": 0.5302, + "step": 42275 + }, + { + "epoch": 0.938502347365734, + "grad_norm": 1.5073870420455933, + "learning_rate": 1.8598967902864974e-07, + "loss": 0.4673, + "step": 42280 + }, + { + "epoch": 0.9386133339252617, + "grad_norm": 1.1110531091690063, + "learning_rate": 1.8532092187592553e-07, + "loss": 0.4733, + "step": 42285 + }, + { + "epoch": 0.9387243204847893, + "grad_norm": 1.0862751007080078, + "learning_rate": 1.8465335795896822e-07, + "loss": 0.4716, + "step": 42290 + }, + { + "epoch": 0.938835307044317, + "grad_norm": 1.0596460103988647, + "learning_rate": 1.8398698735893616e-07, + "loss": 0.3851, + "step": 42295 + }, + { + "epoch": 0.9389462936038446, + "grad_norm": 1.0239298343658447, + "learning_rate": 1.833218101568468e-07, + "loss": 0.3476, + "step": 42300 + }, + { + "epoch": 0.9390572801633722, + "grad_norm": 0.8641639947891235, + "learning_rate": 1.8265782643356877e-07, + "loss": 0.3246, + "step": 42305 + }, + { + "epoch": 0.9391682667228999, + "grad_norm": 1.0871738195419312, + "learning_rate": 1.8199503626982751e-07, + "loss": 0.3363, + "step": 42310 + }, + { + "epoch": 0.9392792532824275, + "grad_norm": 1.2380642890930176, + "learning_rate": 1.81333439746203e-07, + "loss": 0.4041, + "step": 42315 + }, + { + "epoch": 0.9393902398419551, + "grad_norm": 1.2620341777801514, + "learning_rate": 1.8067303694312975e-07, + "loss": 0.4553, + "step": 42320 + }, + { + "epoch": 0.9395012264014828, + "grad_norm": 2.0406928062438965, + "learning_rate": 1.8001382794089695e-07, + "loss": 0.3012, + "step": 42325 + }, + { + "epoch": 0.9396122129610104, + "grad_norm": 1.631622076034546, + "learning_rate": 1.793558128196493e-07, + "loss": 0.4641, + "step": 42330 + }, + { + "epoch": 0.9397231995205381, + "grad_norm": 1.1621760129928589, + "learning_rate": 1.786989916593862e-07, + "loss": 0.3177, + "step": 42335 + }, + { + "epoch": 0.9398341860800657, + "grad_norm": 1.408253788948059, + "learning_rate": 1.780433645399615e-07, + "loss": 0.4708, + "step": 42340 + }, + { + "epoch": 0.9399451726395933, + "grad_norm": 0.9971133470535278, + "learning_rate": 1.773889315410837e-07, + "loss": 0.3845, + "step": 42345 + }, + { + "epoch": 0.940056159199121, + "grad_norm": 0.8766626715660095, + "learning_rate": 1.7673569274231805e-07, + "loss": 0.3939, + "step": 42350 + }, + { + "epoch": 0.9401671457586487, + "grad_norm": 1.2505395412445068, + "learning_rate": 1.76083648223081e-07, + "loss": 0.5388, + "step": 42355 + }, + { + "epoch": 0.9402781323181763, + "grad_norm": 1.0482748746871948, + "learning_rate": 1.754327980626491e-07, + "loss": 0.4699, + "step": 42360 + }, + { + "epoch": 0.9403891188777039, + "grad_norm": 0.8574467301368713, + "learning_rate": 1.7478314234014693e-07, + "loss": 0.4789, + "step": 42365 + }, + { + "epoch": 0.9405001054372315, + "grad_norm": 1.6569310426712036, + "learning_rate": 1.74134681134559e-07, + "loss": 0.3876, + "step": 42370 + }, + { + "epoch": 0.9406110919967592, + "grad_norm": 0.793308675289154, + "learning_rate": 1.734874145247245e-07, + "loss": 0.2943, + "step": 42375 + }, + { + "epoch": 0.9407220785562869, + "grad_norm": 1.8770109415054321, + "learning_rate": 1.728413425893327e-07, + "loss": 0.313, + "step": 42380 + }, + { + "epoch": 0.9408330651158144, + "grad_norm": 0.7877984642982483, + "learning_rate": 1.7219646540693414e-07, + "loss": 0.3232, + "step": 42385 + }, + { + "epoch": 0.9409440516753421, + "grad_norm": 1.603267788887024, + "learning_rate": 1.7155278305592938e-07, + "loss": 0.3759, + "step": 42390 + }, + { + "epoch": 0.9410550382348698, + "grad_norm": 0.926700234413147, + "learning_rate": 1.7091029561457473e-07, + "loss": 0.252, + "step": 42395 + }, + { + "epoch": 0.9411660247943974, + "grad_norm": 1.335512638092041, + "learning_rate": 1.7026900316098217e-07, + "loss": 0.3411, + "step": 42400 + }, + { + "epoch": 0.941277011353925, + "grad_norm": 1.0387749671936035, + "learning_rate": 1.696289057731182e-07, + "loss": 0.2486, + "step": 42405 + }, + { + "epoch": 0.9413879979134527, + "grad_norm": 1.5329840183258057, + "learning_rate": 1.689900035288028e-07, + "loss": 0.5884, + "step": 42410 + }, + { + "epoch": 0.9414989844729803, + "grad_norm": 1.8901313543319702, + "learning_rate": 1.6835229650571272e-07, + "loss": 0.3843, + "step": 42415 + }, + { + "epoch": 0.941609971032508, + "grad_norm": 1.305861473083496, + "learning_rate": 1.6771578478137818e-07, + "loss": 0.4051, + "step": 42420 + }, + { + "epoch": 0.9417209575920356, + "grad_norm": 1.112451434135437, + "learning_rate": 1.6708046843318172e-07, + "loss": 0.2721, + "step": 42425 + }, + { + "epoch": 0.9418319441515632, + "grad_norm": 0.6873188018798828, + "learning_rate": 1.6644634753836598e-07, + "loss": 0.2981, + "step": 42430 + }, + { + "epoch": 0.9419429307110909, + "grad_norm": 1.1935917139053345, + "learning_rate": 1.6581342217402485e-07, + "loss": 0.2242, + "step": 42435 + }, + { + "epoch": 0.9420539172706185, + "grad_norm": 1.4765454530715942, + "learning_rate": 1.6518169241710568e-07, + "loss": 0.4643, + "step": 42440 + }, + { + "epoch": 0.9421649038301462, + "grad_norm": 1.2573107481002808, + "learning_rate": 1.645511583444126e-07, + "loss": 0.5275, + "step": 42445 + }, + { + "epoch": 0.9422758903896739, + "grad_norm": 2.344226837158203, + "learning_rate": 1.6392182003260427e-07, + "loss": 0.395, + "step": 42450 + }, + { + "epoch": 0.9423868769492014, + "grad_norm": 2.1406664848327637, + "learning_rate": 1.6329367755819503e-07, + "loss": 0.48, + "step": 42455 + }, + { + "epoch": 0.9424978635087291, + "grad_norm": 2.3771812915802, + "learning_rate": 1.6266673099754825e-07, + "loss": 0.2588, + "step": 42460 + }, + { + "epoch": 0.9426088500682568, + "grad_norm": 1.4632244110107422, + "learning_rate": 1.6204098042689076e-07, + "loss": 0.4524, + "step": 42465 + }, + { + "epoch": 0.9427198366277844, + "grad_norm": 1.293175458908081, + "learning_rate": 1.6141642592229611e-07, + "loss": 0.4124, + "step": 42470 + }, + { + "epoch": 0.942830823187312, + "grad_norm": 1.3106547594070435, + "learning_rate": 1.6079306755969582e-07, + "loss": 0.4071, + "step": 42475 + }, + { + "epoch": 0.9429418097468396, + "grad_norm": 0.9079194068908691, + "learning_rate": 1.601709054148759e-07, + "loss": 0.2636, + "step": 42480 + }, + { + "epoch": 0.9430527963063673, + "grad_norm": 0.9949849247932434, + "learning_rate": 1.595499395634781e-07, + "loss": 0.4341, + "step": 42485 + }, + { + "epoch": 0.943163782865895, + "grad_norm": 1.311824917793274, + "learning_rate": 1.5893017008099532e-07, + "loss": 0.325, + "step": 42490 + }, + { + "epoch": 0.9432747694254225, + "grad_norm": 1.0263524055480957, + "learning_rate": 1.5831159704277955e-07, + "loss": 0.3093, + "step": 42495 + }, + { + "epoch": 0.9433857559849502, + "grad_norm": 1.7075735330581665, + "learning_rate": 1.5769422052403172e-07, + "loss": 0.3251, + "step": 42500 + }, + { + "epoch": 0.9434967425444779, + "grad_norm": 1.4891036748886108, + "learning_rate": 1.5707804059981403e-07, + "loss": 0.3352, + "step": 42505 + }, + { + "epoch": 0.9436077291040055, + "grad_norm": 0.8496419191360474, + "learning_rate": 1.5646305734503541e-07, + "loss": 0.3912, + "step": 42510 + }, + { + "epoch": 0.9437187156635332, + "grad_norm": 0.9281029105186462, + "learning_rate": 1.558492708344661e-07, + "loss": 0.299, + "step": 42515 + }, + { + "epoch": 0.9438297022230608, + "grad_norm": 0.8617515563964844, + "learning_rate": 1.552366811427286e-07, + "loss": 0.4132, + "step": 42520 + }, + { + "epoch": 0.9439406887825884, + "grad_norm": 1.8958934545516968, + "learning_rate": 1.546252883442978e-07, + "loss": 0.4692, + "step": 42525 + }, + { + "epoch": 0.9440516753421161, + "grad_norm": 1.008283257484436, + "learning_rate": 1.5401509251350643e-07, + "loss": 0.3475, + "step": 42530 + }, + { + "epoch": 0.9441626619016437, + "grad_norm": 1.0553314685821533, + "learning_rate": 1.5340609372453852e-07, + "loss": 0.2833, + "step": 42535 + }, + { + "epoch": 0.9442736484611713, + "grad_norm": 0.5733352899551392, + "learning_rate": 1.5279829205143482e-07, + "loss": 0.3802, + "step": 42540 + }, + { + "epoch": 0.944384635020699, + "grad_norm": 1.4484621286392212, + "learning_rate": 1.5219168756809067e-07, + "loss": 0.4502, + "step": 42545 + }, + { + "epoch": 0.9444956215802266, + "grad_norm": 2.533099412918091, + "learning_rate": 1.5158628034825373e-07, + "loss": 0.3167, + "step": 42550 + }, + { + "epoch": 0.9446066081397543, + "grad_norm": 1.320468783378601, + "learning_rate": 1.509820704655285e-07, + "loss": 0.4806, + "step": 42555 + }, + { + "epoch": 0.944717594699282, + "grad_norm": 1.3541382551193237, + "learning_rate": 1.5037905799337062e-07, + "loss": 0.4546, + "step": 42560 + }, + { + "epoch": 0.9448285812588095, + "grad_norm": 0.7527419924736023, + "learning_rate": 1.4977724300509367e-07, + "loss": 0.3497, + "step": 42565 + }, + { + "epoch": 0.9449395678183372, + "grad_norm": 1.0765899419784546, + "learning_rate": 1.491766255738658e-07, + "loss": 0.4331, + "step": 42570 + }, + { + "epoch": 0.9450505543778649, + "grad_norm": 0.9651219248771667, + "learning_rate": 1.485772057727064e-07, + "loss": 0.2916, + "step": 42575 + }, + { + "epoch": 0.9451615409373925, + "grad_norm": 0.8204355239868164, + "learning_rate": 1.479789836744916e-07, + "loss": 0.4547, + "step": 42580 + }, + { + "epoch": 0.9452725274969201, + "grad_norm": 0.870439350605011, + "learning_rate": 1.4738195935194987e-07, + "loss": 0.3729, + "step": 42585 + }, + { + "epoch": 0.9453835140564477, + "grad_norm": 0.8403009176254272, + "learning_rate": 1.4678613287766653e-07, + "loss": 0.2963, + "step": 42590 + }, + { + "epoch": 0.9454945006159754, + "grad_norm": 0.7220903635025024, + "learning_rate": 1.4619150432408025e-07, + "loss": 0.3095, + "step": 42595 + }, + { + "epoch": 0.9456054871755031, + "grad_norm": 1.0451745986938477, + "learning_rate": 1.4559807376348324e-07, + "loss": 0.4031, + "step": 42600 + }, + { + "epoch": 0.9457164737350306, + "grad_norm": 1.8504866361618042, + "learning_rate": 1.450058412680244e-07, + "loss": 0.4052, + "step": 42605 + }, + { + "epoch": 0.9458274602945583, + "grad_norm": 1.066005825996399, + "learning_rate": 1.4441480690970399e-07, + "loss": 0.3125, + "step": 42610 + }, + { + "epoch": 0.945938446854086, + "grad_norm": 1.2941941022872925, + "learning_rate": 1.4382497076037783e-07, + "loss": 0.4924, + "step": 42615 + }, + { + "epoch": 0.9460494334136136, + "grad_norm": 1.210545539855957, + "learning_rate": 1.432363328917574e-07, + "loss": 0.4056, + "step": 42620 + }, + { + "epoch": 0.9461604199731413, + "grad_norm": 1.2813447713851929, + "learning_rate": 1.4264889337540556e-07, + "loss": 0.4493, + "step": 42625 + }, + { + "epoch": 0.946271406532669, + "grad_norm": 1.060917854309082, + "learning_rate": 1.4206265228274285e-07, + "loss": 0.2476, + "step": 42630 + }, + { + "epoch": 0.9463823930921965, + "grad_norm": 1.3904987573623657, + "learning_rate": 1.4147760968504232e-07, + "loss": 0.3694, + "step": 42635 + }, + { + "epoch": 0.9464933796517242, + "grad_norm": 1.623928427696228, + "learning_rate": 1.4089376565343037e-07, + "loss": 0.4551, + "step": 42640 + }, + { + "epoch": 0.9466043662112518, + "grad_norm": 1.5032771825790405, + "learning_rate": 1.4031112025889028e-07, + "loss": 0.2393, + "step": 42645 + }, + { + "epoch": 0.9467153527707795, + "grad_norm": 1.5255825519561768, + "learning_rate": 1.397296735722564e-07, + "loss": 0.4987, + "step": 42650 + }, + { + "epoch": 0.9468263393303071, + "grad_norm": 1.5206665992736816, + "learning_rate": 1.391494256642212e-07, + "loss": 0.4196, + "step": 42655 + }, + { + "epoch": 0.9469373258898347, + "grad_norm": 1.3917713165283203, + "learning_rate": 1.3857037660532703e-07, + "loss": 0.3997, + "step": 42660 + }, + { + "epoch": 0.9470483124493624, + "grad_norm": 1.3763508796691895, + "learning_rate": 1.3799252646597428e-07, + "loss": 0.2943, + "step": 42665 + }, + { + "epoch": 0.9471592990088901, + "grad_norm": 0.9568051695823669, + "learning_rate": 1.3741587531641566e-07, + "loss": 0.2781, + "step": 42670 + }, + { + "epoch": 0.9472702855684176, + "grad_norm": 1.4659420251846313, + "learning_rate": 1.368404232267584e-07, + "loss": 0.3767, + "step": 42675 + }, + { + "epoch": 0.9473812721279453, + "grad_norm": 0.5646659135818481, + "learning_rate": 1.362661702669632e-07, + "loss": 0.3299, + "step": 42680 + }, + { + "epoch": 0.947492258687473, + "grad_norm": 1.633173942565918, + "learning_rate": 1.3569311650684757e-07, + "loss": 0.5553, + "step": 42685 + }, + { + "epoch": 0.9476032452470006, + "grad_norm": 1.2178746461868286, + "learning_rate": 1.351212620160802e-07, + "loss": 0.6447, + "step": 42690 + }, + { + "epoch": 0.9477142318065283, + "grad_norm": 0.9479925036430359, + "learning_rate": 1.3455060686418663e-07, + "loss": 0.3073, + "step": 42695 + }, + { + "epoch": 0.9478252183660558, + "grad_norm": 0.6817415356636047, + "learning_rate": 1.3398115112054243e-07, + "loss": 0.3668, + "step": 42700 + }, + { + "epoch": 0.9479362049255835, + "grad_norm": 1.6585414409637451, + "learning_rate": 1.3341289485438336e-07, + "loss": 0.3634, + "step": 42705 + }, + { + "epoch": 0.9480471914851112, + "grad_norm": 1.1411597728729248, + "learning_rate": 1.32845838134793e-07, + "loss": 0.3535, + "step": 42710 + }, + { + "epoch": 0.9481581780446388, + "grad_norm": 1.1587713956832886, + "learning_rate": 1.3227998103071516e-07, + "loss": 0.4131, + "step": 42715 + }, + { + "epoch": 0.9482691646041664, + "grad_norm": 3.019202709197998, + "learning_rate": 1.3171532361094252e-07, + "loss": 0.4121, + "step": 42720 + }, + { + "epoch": 0.9483801511636941, + "grad_norm": 1.5659027099609375, + "learning_rate": 1.3115186594412576e-07, + "loss": 0.4875, + "step": 42725 + }, + { + "epoch": 0.9484911377232217, + "grad_norm": 1.1905382871627808, + "learning_rate": 1.3058960809876676e-07, + "loss": 0.3339, + "step": 42730 + }, + { + "epoch": 0.9486021242827494, + "grad_norm": 1.0344101190567017, + "learning_rate": 1.3002855014322413e-07, + "loss": 0.2769, + "step": 42735 + }, + { + "epoch": 0.948713110842277, + "grad_norm": 0.7989633083343506, + "learning_rate": 1.2946869214570888e-07, + "loss": 0.3921, + "step": 42740 + }, + { + "epoch": 0.9488240974018046, + "grad_norm": 1.0567229986190796, + "learning_rate": 1.2891003417428772e-07, + "loss": 0.4605, + "step": 42745 + }, + { + "epoch": 0.9489350839613323, + "grad_norm": 0.9074198007583618, + "learning_rate": 1.2835257629687847e-07, + "loss": 0.3401, + "step": 42750 + }, + { + "epoch": 0.9490460705208599, + "grad_norm": 1.9332501888275146, + "learning_rate": 1.2779631858125696e-07, + "loss": 0.2172, + "step": 42755 + }, + { + "epoch": 0.9491570570803876, + "grad_norm": 0.7035365700721741, + "learning_rate": 1.272412610950502e-07, + "loss": 0.4515, + "step": 42760 + }, + { + "epoch": 0.9492680436399152, + "grad_norm": 1.057877540588379, + "learning_rate": 1.2668740390573975e-07, + "loss": 0.4552, + "step": 42765 + }, + { + "epoch": 0.9493790301994428, + "grad_norm": 0.7197465896606445, + "learning_rate": 1.2613474708066175e-07, + "loss": 0.3718, + "step": 42770 + }, + { + "epoch": 0.9494900167589705, + "grad_norm": 0.8760359883308411, + "learning_rate": 1.2558329068700803e-07, + "loss": 0.2546, + "step": 42775 + }, + { + "epoch": 0.9496010033184982, + "grad_norm": 1.2623904943466187, + "learning_rate": 1.250330347918205e-07, + "loss": 0.4187, + "step": 42780 + }, + { + "epoch": 0.9497119898780257, + "grad_norm": 1.0229982137680054, + "learning_rate": 1.244839794620001e-07, + "loss": 0.4495, + "step": 42785 + }, + { + "epoch": 0.9498229764375534, + "grad_norm": 0.8693443536758423, + "learning_rate": 1.2393612476429562e-07, + "loss": 0.2962, + "step": 42790 + }, + { + "epoch": 0.9499339629970811, + "grad_norm": 1.0552219152450562, + "learning_rate": 1.2338947076531603e-07, + "loss": 0.4832, + "step": 42795 + }, + { + "epoch": 0.9500449495566087, + "grad_norm": 1.2997872829437256, + "learning_rate": 1.2284401753152153e-07, + "loss": 0.4576, + "step": 42800 + }, + { + "epoch": 0.9501559361161364, + "grad_norm": 1.9434880018234253, + "learning_rate": 1.222997651292257e-07, + "loss": 0.4407, + "step": 42805 + }, + { + "epoch": 0.9502669226756639, + "grad_norm": 1.5652992725372314, + "learning_rate": 1.2175671362459785e-07, + "loss": 0.3417, + "step": 42810 + }, + { + "epoch": 0.9503779092351916, + "grad_norm": 1.203678011894226, + "learning_rate": 1.2121486308365848e-07, + "loss": 0.3064, + "step": 42815 + }, + { + "epoch": 0.9504888957947193, + "grad_norm": 0.8892753720283508, + "learning_rate": 1.2067421357228605e-07, + "loss": 0.3789, + "step": 42820 + }, + { + "epoch": 0.9505998823542469, + "grad_norm": 1.4720929861068726, + "learning_rate": 1.2013476515621014e-07, + "loss": 0.3041, + "step": 42825 + }, + { + "epoch": 0.9507108689137745, + "grad_norm": 0.9997137188911438, + "learning_rate": 1.195965179010139e-07, + "loss": 0.2007, + "step": 42830 + }, + { + "epoch": 0.9508218554733022, + "grad_norm": 1.6503864526748657, + "learning_rate": 1.1905947187213717e-07, + "loss": 0.412, + "step": 42835 + }, + { + "epoch": 0.9509328420328298, + "grad_norm": 1.0847293138504028, + "learning_rate": 1.185236271348722e-07, + "loss": 0.5052, + "step": 42840 + }, + { + "epoch": 0.9510438285923575, + "grad_norm": 1.8282418251037598, + "learning_rate": 1.1798898375436463e-07, + "loss": 0.438, + "step": 42845 + }, + { + "epoch": 0.9511548151518852, + "grad_norm": 1.196332573890686, + "learning_rate": 1.1745554179561469e-07, + "loss": 0.4065, + "step": 42850 + }, + { + "epoch": 0.9512658017114127, + "grad_norm": 0.569940447807312, + "learning_rate": 1.1692330132347607e-07, + "loss": 0.3161, + "step": 42855 + }, + { + "epoch": 0.9513767882709404, + "grad_norm": 1.002369999885559, + "learning_rate": 1.1639226240265811e-07, + "loss": 0.4974, + "step": 42860 + }, + { + "epoch": 0.951487774830468, + "grad_norm": 1.0855084657669067, + "learning_rate": 1.1586242509772139e-07, + "loss": 0.2977, + "step": 42865 + }, + { + "epoch": 0.9515987613899957, + "grad_norm": 1.2213571071624756, + "learning_rate": 1.1533378947308216e-07, + "loss": 0.4363, + "step": 42870 + }, + { + "epoch": 0.9517097479495233, + "grad_norm": 1.488828420639038, + "learning_rate": 1.1480635559301012e-07, + "loss": 0.4341, + "step": 42875 + }, + { + "epoch": 0.9518207345090509, + "grad_norm": 0.6738657355308533, + "learning_rate": 1.1428012352162843e-07, + "loss": 0.3108, + "step": 42880 + }, + { + "epoch": 0.9519317210685786, + "grad_norm": 1.1900697946548462, + "learning_rate": 1.137550933229159e-07, + "loss": 0.4509, + "step": 42885 + }, + { + "epoch": 0.9520427076281063, + "grad_norm": 1.0502153635025024, + "learning_rate": 1.132312650607037e-07, + "loss": 0.32, + "step": 42890 + }, + { + "epoch": 0.9521536941876338, + "grad_norm": 0.9347466826438904, + "learning_rate": 1.1270863879867644e-07, + "loss": 0.3837, + "step": 42895 + }, + { + "epoch": 0.9522646807471615, + "grad_norm": 1.7153974771499634, + "learning_rate": 1.1218721460037218e-07, + "loss": 0.5206, + "step": 42900 + }, + { + "epoch": 0.9523756673066892, + "grad_norm": 1.7464630603790283, + "learning_rate": 1.1166699252918578e-07, + "loss": 0.1663, + "step": 42905 + }, + { + "epoch": 0.9524866538662168, + "grad_norm": 1.6750175952911377, + "learning_rate": 1.1114797264836441e-07, + "loss": 0.4082, + "step": 42910 + }, + { + "epoch": 0.9525976404257445, + "grad_norm": 1.6528713703155518, + "learning_rate": 1.1063015502100761e-07, + "loss": 0.4874, + "step": 42915 + }, + { + "epoch": 0.952708626985272, + "grad_norm": 2.113939046859741, + "learning_rate": 1.1011353971007055e-07, + "loss": 0.4054, + "step": 42920 + }, + { + "epoch": 0.9528196135447997, + "grad_norm": 1.6014636754989624, + "learning_rate": 1.0959812677835968e-07, + "loss": 0.3204, + "step": 42925 + }, + { + "epoch": 0.9529306001043274, + "grad_norm": 1.902674913406372, + "learning_rate": 1.0908391628854042e-07, + "loss": 0.5774, + "step": 42930 + }, + { + "epoch": 0.953041586663855, + "grad_norm": 1.0486432313919067, + "learning_rate": 1.0857090830312612e-07, + "loss": 0.3855, + "step": 42935 + }, + { + "epoch": 0.9531525732233826, + "grad_norm": 1.0042614936828613, + "learning_rate": 1.0805910288448795e-07, + "loss": 0.2789, + "step": 42940 + }, + { + "epoch": 0.9532635597829103, + "grad_norm": 0.6804906129837036, + "learning_rate": 1.0754850009484841e-07, + "loss": 0.3862, + "step": 42945 + }, + { + "epoch": 0.9533745463424379, + "grad_norm": 1.6858739852905273, + "learning_rate": 1.070390999962867e-07, + "loss": 0.3043, + "step": 42950 + }, + { + "epoch": 0.9534855329019656, + "grad_norm": 1.2860480546951294, + "learning_rate": 1.0653090265073218e-07, + "loss": 0.3668, + "step": 42955 + }, + { + "epoch": 0.9535965194614933, + "grad_norm": 1.269942283630371, + "learning_rate": 1.0602390811996988e-07, + "loss": 0.44, + "step": 42960 + }, + { + "epoch": 0.9537075060210208, + "grad_norm": 1.5455029010772705, + "learning_rate": 1.0551811646563936e-07, + "loss": 0.4378, + "step": 42965 + }, + { + "epoch": 0.9538184925805485, + "grad_norm": 0.7402146458625793, + "learning_rate": 1.0501352774923368e-07, + "loss": 0.2378, + "step": 42970 + }, + { + "epoch": 0.9539294791400761, + "grad_norm": 1.076019287109375, + "learning_rate": 1.0451014203209708e-07, + "loss": 0.3095, + "step": 42975 + }, + { + "epoch": 0.9540404656996038, + "grad_norm": 1.0999020338058472, + "learning_rate": 1.0400795937543062e-07, + "loss": 0.398, + "step": 42980 + }, + { + "epoch": 0.9541514522591314, + "grad_norm": 1.187849521636963, + "learning_rate": 1.0350697984028768e-07, + "loss": 0.4443, + "step": 42985 + }, + { + "epoch": 0.954262438818659, + "grad_norm": 1.3116439580917358, + "learning_rate": 1.030072034875762e-07, + "loss": 0.327, + "step": 42990 + }, + { + "epoch": 0.9543734253781867, + "grad_norm": 1.2499772310256958, + "learning_rate": 1.0250863037805647e-07, + "loss": 0.3766, + "step": 42995 + }, + { + "epoch": 0.9544844119377144, + "grad_norm": 2.4240808486938477, + "learning_rate": 1.0201126057234445e-07, + "loss": 0.3844, + "step": 43000 + }, + { + "epoch": 0.954595398497242, + "grad_norm": 0.8385963439941406, + "learning_rate": 1.0151509413090843e-07, + "loss": 0.2594, + "step": 43005 + }, + { + "epoch": 0.9547063850567696, + "grad_norm": 0.9707930684089661, + "learning_rate": 1.0102013111406905e-07, + "loss": 0.2967, + "step": 43010 + }, + { + "epoch": 0.9548173716162973, + "grad_norm": 0.9702897071838379, + "learning_rate": 1.0052637158200484e-07, + "loss": 0.2644, + "step": 43015 + }, + { + "epoch": 0.9549283581758249, + "grad_norm": 1.0392128229141235, + "learning_rate": 1.0003381559474335e-07, + "loss": 0.3998, + "step": 43020 + }, + { + "epoch": 0.9550393447353526, + "grad_norm": 2.132612705230713, + "learning_rate": 9.954246321216887e-08, + "loss": 0.5521, + "step": 43025 + }, + { + "epoch": 0.9551503312948801, + "grad_norm": 1.4161665439605713, + "learning_rate": 9.905231449401809e-08, + "loss": 0.4327, + "step": 43030 + }, + { + "epoch": 0.9552613178544078, + "grad_norm": 1.1672700643539429, + "learning_rate": 9.856336949988221e-08, + "loss": 0.4846, + "step": 43035 + }, + { + "epoch": 0.9553723044139355, + "grad_norm": 1.3782694339752197, + "learning_rate": 9.80756282892048e-08, + "loss": 0.4533, + "step": 43040 + }, + { + "epoch": 0.9554832909734631, + "grad_norm": 1.3480278253555298, + "learning_rate": 9.758909092128289e-08, + "loss": 0.4974, + "step": 43045 + }, + { + "epoch": 0.9555942775329908, + "grad_norm": 0.8139400482177734, + "learning_rate": 9.710375745527023e-08, + "loss": 0.3991, + "step": 43050 + }, + { + "epoch": 0.9557052640925184, + "grad_norm": 1.030752182006836, + "learning_rate": 9.661962795017189e-08, + "loss": 0.4059, + "step": 43055 + }, + { + "epoch": 0.955816250652046, + "grad_norm": 1.4513559341430664, + "learning_rate": 9.613670246484408e-08, + "loss": 0.4464, + "step": 43060 + }, + { + "epoch": 0.9559272372115737, + "grad_norm": 2.3847713470458984, + "learning_rate": 9.565498105800208e-08, + "loss": 0.2873, + "step": 43065 + }, + { + "epoch": 0.9560382237711014, + "grad_norm": 1.3109573125839233, + "learning_rate": 9.517446378821016e-08, + "loss": 0.4918, + "step": 43070 + }, + { + "epoch": 0.9561492103306289, + "grad_norm": 1.3552931547164917, + "learning_rate": 9.469515071388935e-08, + "loss": 0.352, + "step": 43075 + }, + { + "epoch": 0.9562601968901566, + "grad_norm": 1.0919387340545654, + "learning_rate": 9.421704189331193e-08, + "loss": 0.404, + "step": 43080 + }, + { + "epoch": 0.9563711834496842, + "grad_norm": 2.247760534286499, + "learning_rate": 9.374013738460586e-08, + "loss": 0.3161, + "step": 43085 + }, + { + "epoch": 0.9564821700092119, + "grad_norm": 2.0436818599700928, + "learning_rate": 9.32644372457503e-08, + "loss": 0.3053, + "step": 43090 + }, + { + "epoch": 0.9565931565687396, + "grad_norm": 2.8228936195373535, + "learning_rate": 9.278994153457898e-08, + "loss": 0.3766, + "step": 43095 + }, + { + "epoch": 0.9567041431282671, + "grad_norm": 1.2718102931976318, + "learning_rate": 9.231665030878134e-08, + "loss": 0.4939, + "step": 43100 + }, + { + "epoch": 0.9568151296877948, + "grad_norm": 1.6868155002593994, + "learning_rate": 9.184456362589799e-08, + "loss": 0.3679, + "step": 43105 + }, + { + "epoch": 0.9569261162473225, + "grad_norm": 0.9345059990882874, + "learning_rate": 9.137368154332304e-08, + "loss": 0.3236, + "step": 43110 + }, + { + "epoch": 0.9570371028068501, + "grad_norm": 2.1842598915100098, + "learning_rate": 9.090400411830403e-08, + "loss": 0.5163, + "step": 43115 + }, + { + "epoch": 0.9571480893663777, + "grad_norm": 0.8142204880714417, + "learning_rate": 9.043553140794414e-08, + "loss": 0.3814, + "step": 43120 + }, + { + "epoch": 0.9572590759259054, + "grad_norm": 0.9440129995346069, + "learning_rate": 8.996826346919896e-08, + "loss": 0.3837, + "step": 43125 + }, + { + "epoch": 0.957370062485433, + "grad_norm": 1.2841126918792725, + "learning_rate": 8.950220035887636e-08, + "loss": 0.3815, + "step": 43130 + }, + { + "epoch": 0.9574810490449607, + "grad_norm": 0.7791603207588196, + "learning_rate": 8.903734213363769e-08, + "loss": 0.2432, + "step": 43135 + }, + { + "epoch": 0.9575920356044882, + "grad_norm": 1.2277582883834839, + "learning_rate": 8.857368885000217e-08, + "loss": 0.4522, + "step": 43140 + }, + { + "epoch": 0.9577030221640159, + "grad_norm": 1.1917132139205933, + "learning_rate": 8.811124056433584e-08, + "loss": 0.4469, + "step": 43145 + }, + { + "epoch": 0.9578140087235436, + "grad_norm": 0.8144562244415283, + "learning_rate": 8.764999733286372e-08, + "loss": 0.3648, + "step": 43150 + }, + { + "epoch": 0.9579249952830712, + "grad_norm": 0.9421244263648987, + "learning_rate": 8.718995921166096e-08, + "loss": 0.3401, + "step": 43155 + }, + { + "epoch": 0.9580359818425989, + "grad_norm": 0.9399101138114929, + "learning_rate": 8.673112625665725e-08, + "loss": 0.2573, + "step": 43160 + }, + { + "epoch": 0.9581469684021265, + "grad_norm": 0.9864373803138733, + "learning_rate": 8.627349852363798e-08, + "loss": 0.3559, + "step": 43165 + }, + { + "epoch": 0.9582579549616541, + "grad_norm": 0.36230021715164185, + "learning_rate": 8.581707606823863e-08, + "loss": 0.3169, + "step": 43170 + }, + { + "epoch": 0.9583689415211818, + "grad_norm": 1.521687388420105, + "learning_rate": 8.536185894594928e-08, + "loss": 0.4052, + "step": 43175 + }, + { + "epoch": 0.9584799280807095, + "grad_norm": 1.3936021327972412, + "learning_rate": 8.490784721211454e-08, + "loss": 0.3974, + "step": 43180 + }, + { + "epoch": 0.958590914640237, + "grad_norm": 1.47688889503479, + "learning_rate": 8.445504092193024e-08, + "loss": 0.1766, + "step": 43185 + }, + { + "epoch": 0.9587019011997647, + "grad_norm": 1.2802175283432007, + "learning_rate": 8.400344013044793e-08, + "loss": 0.3253, + "step": 43190 + }, + { + "epoch": 0.9588128877592923, + "grad_norm": 1.3018136024475098, + "learning_rate": 8.355304489257254e-08, + "loss": 0.3899, + "step": 43195 + }, + { + "epoch": 0.95892387431882, + "grad_norm": 1.7593441009521484, + "learning_rate": 8.31038552630603e-08, + "loss": 0.4278, + "step": 43200 + }, + { + "epoch": 0.9590348608783477, + "grad_norm": 0.9603082537651062, + "learning_rate": 8.265587129652308e-08, + "loss": 0.4065, + "step": 43205 + }, + { + "epoch": 0.9591458474378752, + "grad_norm": 1.1551685333251953, + "learning_rate": 8.220909304742397e-08, + "loss": 0.4648, + "step": 43210 + }, + { + "epoch": 0.9592568339974029, + "grad_norm": 1.3292489051818848, + "learning_rate": 8.176352057008174e-08, + "loss": 0.3998, + "step": 43215 + }, + { + "epoch": 0.9593678205569306, + "grad_norm": 0.9885017275810242, + "learning_rate": 8.131915391866752e-08, + "loss": 0.2783, + "step": 43220 + }, + { + "epoch": 0.9594788071164582, + "grad_norm": 0.9839476346969604, + "learning_rate": 8.087599314720696e-08, + "loss": 0.2947, + "step": 43225 + }, + { + "epoch": 0.9595897936759858, + "grad_norm": 2.1047043800354004, + "learning_rate": 8.043403830957586e-08, + "loss": 0.4337, + "step": 43230 + }, + { + "epoch": 0.9597007802355135, + "grad_norm": 1.219032645225525, + "learning_rate": 7.99932894595079e-08, + "loss": 0.3295, + "step": 43235 + }, + { + "epoch": 0.9598117667950411, + "grad_norm": 1.1567282676696777, + "learning_rate": 7.95537466505858e-08, + "loss": 0.2751, + "step": 43240 + }, + { + "epoch": 0.9599227533545688, + "grad_norm": 0.9363585114479065, + "learning_rate": 7.911540993624789e-08, + "loss": 0.3384, + "step": 43245 + }, + { + "epoch": 0.9600337399140964, + "grad_norm": 1.3244959115982056, + "learning_rate": 7.867827936978711e-08, + "loss": 0.324, + "step": 43250 + }, + { + "epoch": 0.960144726473624, + "grad_norm": 1.3957321643829346, + "learning_rate": 7.824235500434762e-08, + "loss": 0.5184, + "step": 43255 + }, + { + "epoch": 0.9602557130331517, + "grad_norm": 1.3671963214874268, + "learning_rate": 7.780763689292814e-08, + "loss": 0.4634, + "step": 43260 + }, + { + "epoch": 0.9603666995926793, + "grad_norm": 0.523896336555481, + "learning_rate": 7.73741250883786e-08, + "loss": 0.2899, + "step": 43265 + }, + { + "epoch": 0.960477686152207, + "grad_norm": 0.655168354511261, + "learning_rate": 7.694181964340574e-08, + "loss": 0.289, + "step": 43270 + }, + { + "epoch": 0.9605886727117346, + "grad_norm": 2.0806264877319336, + "learning_rate": 7.651072061056752e-08, + "loss": 0.3962, + "step": 43275 + }, + { + "epoch": 0.9606996592712622, + "grad_norm": 0.921379566192627, + "learning_rate": 7.608082804227424e-08, + "loss": 0.3671, + "step": 43280 + }, + { + "epoch": 0.9608106458307899, + "grad_norm": 1.1921287775039673, + "learning_rate": 7.565214199079185e-08, + "loss": 0.382, + "step": 43285 + }, + { + "epoch": 0.9609216323903176, + "grad_norm": 0.6474278569221497, + "learning_rate": 7.522466250823867e-08, + "loss": 0.2865, + "step": 43290 + }, + { + "epoch": 0.9610326189498452, + "grad_norm": 1.0548124313354492, + "learning_rate": 7.479838964658648e-08, + "loss": 0.4609, + "step": 43295 + }, + { + "epoch": 0.9611436055093728, + "grad_norm": 1.597723126411438, + "learning_rate": 7.437332345765825e-08, + "loss": 0.4392, + "step": 43300 + }, + { + "epoch": 0.9612545920689004, + "grad_norm": 0.8780395984649658, + "learning_rate": 7.394946399313374e-08, + "loss": 0.2615, + "step": 43305 + }, + { + "epoch": 0.9613655786284281, + "grad_norm": 1.7681487798690796, + "learning_rate": 7.352681130454398e-08, + "loss": 0.4348, + "step": 43310 + }, + { + "epoch": 0.9614765651879558, + "grad_norm": 1.4514552354812622, + "learning_rate": 7.310536544327452e-08, + "loss": 0.3698, + "step": 43315 + }, + { + "epoch": 0.9615875517474833, + "grad_norm": 1.1815019845962524, + "learning_rate": 7.268512646056213e-08, + "loss": 0.3915, + "step": 43320 + }, + { + "epoch": 0.961698538307011, + "grad_norm": 1.2830376625061035, + "learning_rate": 7.226609440749821e-08, + "loss": 0.3866, + "step": 43325 + }, + { + "epoch": 0.9618095248665387, + "grad_norm": 1.001721978187561, + "learning_rate": 7.184826933502642e-08, + "loss": 0.2697, + "step": 43330 + }, + { + "epoch": 0.9619205114260663, + "grad_norm": 1.2729138135910034, + "learning_rate": 7.143165129394725e-08, + "loss": 0.6466, + "step": 43335 + }, + { + "epoch": 0.962031497985594, + "grad_norm": 0.32785171270370483, + "learning_rate": 7.101624033490906e-08, + "loss": 0.3248, + "step": 43340 + }, + { + "epoch": 0.9621424845451216, + "grad_norm": 0.7534942030906677, + "learning_rate": 7.060203650841813e-08, + "loss": 0.3856, + "step": 43345 + }, + { + "epoch": 0.9622534711046492, + "grad_norm": 1.3082144260406494, + "learning_rate": 7.018903986483083e-08, + "loss": 0.3323, + "step": 43350 + }, + { + "epoch": 0.9623644576641769, + "grad_norm": 1.194825530052185, + "learning_rate": 6.977725045435702e-08, + "loss": 0.3502, + "step": 43355 + }, + { + "epoch": 0.9624754442237045, + "grad_norm": 1.8124589920043945, + "learning_rate": 6.936666832706329e-08, + "loss": 0.1988, + "step": 43360 + }, + { + "epoch": 0.9625864307832321, + "grad_norm": 1.005189061164856, + "learning_rate": 6.895729353286418e-08, + "loss": 0.4766, + "step": 43365 + }, + { + "epoch": 0.9626974173427598, + "grad_norm": 1.0299992561340332, + "learning_rate": 6.854912612153097e-08, + "loss": 0.4319, + "step": 43370 + }, + { + "epoch": 0.9628084039022874, + "grad_norm": 1.143924355506897, + "learning_rate": 6.814216614268843e-08, + "loss": 0.4616, + "step": 43375 + }, + { + "epoch": 0.9629193904618151, + "grad_norm": 1.1192833185195923, + "learning_rate": 6.773641364581141e-08, + "loss": 0.4629, + "step": 43380 + }, + { + "epoch": 0.9630303770213428, + "grad_norm": 1.9731518030166626, + "learning_rate": 6.733186868023156e-08, + "loss": 0.3022, + "step": 43385 + }, + { + "epoch": 0.9631413635808703, + "grad_norm": 1.9478931427001953, + "learning_rate": 6.692853129513177e-08, + "loss": 0.503, + "step": 43390 + }, + { + "epoch": 0.963252350140398, + "grad_norm": 1.0108568668365479, + "learning_rate": 6.652640153954836e-08, + "loss": 0.4192, + "step": 43395 + }, + { + "epoch": 0.9633633366999257, + "grad_norm": 1.6158348321914673, + "learning_rate": 6.612547946237003e-08, + "loss": 0.4582, + "step": 43400 + }, + { + "epoch": 0.9634743232594533, + "grad_norm": 1.3156096935272217, + "learning_rate": 6.572576511234108e-08, + "loss": 0.3058, + "step": 43405 + }, + { + "epoch": 0.9635853098189809, + "grad_norm": 0.9295489192008972, + "learning_rate": 6.5327258538056e-08, + "loss": 0.3073, + "step": 43410 + }, + { + "epoch": 0.9636962963785085, + "grad_norm": 1.3966041803359985, + "learning_rate": 6.492995978796379e-08, + "loss": 0.4236, + "step": 43415 + }, + { + "epoch": 0.9638072829380362, + "grad_norm": 1.1565799713134766, + "learning_rate": 6.453386891036917e-08, + "loss": 0.266, + "step": 43420 + }, + { + "epoch": 0.9639182694975639, + "grad_norm": 1.27523934841156, + "learning_rate": 6.413898595342472e-08, + "loss": 0.3808, + "step": 43425 + }, + { + "epoch": 0.9640292560570914, + "grad_norm": 1.3923174142837524, + "learning_rate": 6.374531096514091e-08, + "loss": 0.4183, + "step": 43430 + }, + { + "epoch": 0.9641402426166191, + "grad_norm": 1.050148367881775, + "learning_rate": 6.335284399337726e-08, + "loss": 0.4404, + "step": 43435 + }, + { + "epoch": 0.9642512291761468, + "grad_norm": 0.7960019111633301, + "learning_rate": 6.296158508585115e-08, + "loss": 0.353, + "step": 43440 + }, + { + "epoch": 0.9643622157356744, + "grad_norm": 0.8833103179931641, + "learning_rate": 6.257153429012897e-08, + "loss": 0.424, + "step": 43445 + }, + { + "epoch": 0.9644732022952021, + "grad_norm": 1.2397501468658447, + "learning_rate": 6.218269165363166e-08, + "loss": 0.4319, + "step": 43450 + }, + { + "epoch": 0.9645841888547297, + "grad_norm": 1.011025309562683, + "learning_rate": 6.179505722363367e-08, + "loss": 0.336, + "step": 43455 + }, + { + "epoch": 0.9646951754142573, + "grad_norm": 0.9096837043762207, + "learning_rate": 6.140863104726391e-08, + "loss": 0.3744, + "step": 43460 + }, + { + "epoch": 0.964806161973785, + "grad_norm": 1.0249688625335693, + "learning_rate": 6.10234131715004e-08, + "loss": 0.4071, + "step": 43465 + }, + { + "epoch": 0.9649171485333126, + "grad_norm": 1.0655795335769653, + "learning_rate": 6.063940364317677e-08, + "loss": 0.5957, + "step": 43470 + }, + { + "epoch": 0.9650281350928402, + "grad_norm": 1.1619106531143188, + "learning_rate": 6.025660250898124e-08, + "loss": 0.4282, + "step": 43475 + }, + { + "epoch": 0.9651391216523679, + "grad_norm": 0.9888255000114441, + "learning_rate": 5.987500981545325e-08, + "loss": 0.3985, + "step": 43480 + }, + { + "epoch": 0.9652501082118955, + "grad_norm": 1.5950113534927368, + "learning_rate": 5.9494625608984555e-08, + "loss": 0.3052, + "step": 43485 + }, + { + "epoch": 0.9653610947714232, + "grad_norm": 1.6013449430465698, + "learning_rate": 5.9115449935821526e-08, + "loss": 0.2657, + "step": 43490 + }, + { + "epoch": 0.9654720813309509, + "grad_norm": 1.5253368616104126, + "learning_rate": 5.873748284206171e-08, + "loss": 0.4415, + "step": 43495 + }, + { + "epoch": 0.9655830678904784, + "grad_norm": 1.13545823097229, + "learning_rate": 5.836072437365947e-08, + "loss": 0.4705, + "step": 43500 + }, + { + "epoch": 0.9656940544500061, + "grad_norm": 1.5528640747070312, + "learning_rate": 5.798517457641817e-08, + "loss": 0.3194, + "step": 43505 + }, + { + "epoch": 0.9658050410095338, + "grad_norm": 1.22503662109375, + "learning_rate": 5.7610833495996833e-08, + "loss": 0.3924, + "step": 43510 + }, + { + "epoch": 0.9659160275690614, + "grad_norm": 1.2438831329345703, + "learning_rate": 5.7237701177906836e-08, + "loss": 0.3593, + "step": 43515 + }, + { + "epoch": 0.966027014128589, + "grad_norm": 1.5987977981567383, + "learning_rate": 5.686577766751078e-08, + "loss": 0.2376, + "step": 43520 + }, + { + "epoch": 0.9661380006881166, + "grad_norm": 1.195892333984375, + "learning_rate": 5.649506301002583e-08, + "loss": 0.4138, + "step": 43525 + }, + { + "epoch": 0.9662489872476443, + "grad_norm": 1.0989891290664673, + "learning_rate": 5.612555725052482e-08, + "loss": 0.3425, + "step": 43530 + }, + { + "epoch": 0.966359973807172, + "grad_norm": 1.1684006452560425, + "learning_rate": 5.5757260433928485e-08, + "loss": 0.5131, + "step": 43535 + }, + { + "epoch": 0.9664709603666995, + "grad_norm": 1.0840319395065308, + "learning_rate": 5.539017260501545e-08, + "loss": 0.4428, + "step": 43540 + }, + { + "epoch": 0.9665819469262272, + "grad_norm": 1.027111530303955, + "learning_rate": 5.502429380841223e-08, + "loss": 0.4958, + "step": 43545 + }, + { + "epoch": 0.9666929334857549, + "grad_norm": 1.1387192010879517, + "learning_rate": 5.465962408860326e-08, + "loss": 0.4502, + "step": 43550 + }, + { + "epoch": 0.9668039200452825, + "grad_norm": 1.6442123651504517, + "learning_rate": 5.429616348992195e-08, + "loss": 0.3364, + "step": 43555 + }, + { + "epoch": 0.9669149066048102, + "grad_norm": 1.353895902633667, + "learning_rate": 5.393391205655851e-08, + "loss": 0.3276, + "step": 43560 + }, + { + "epoch": 0.9670258931643378, + "grad_norm": 1.4506542682647705, + "learning_rate": 5.357286983255439e-08, + "loss": 0.4487, + "step": 43565 + }, + { + "epoch": 0.9671368797238654, + "grad_norm": 0.8998032808303833, + "learning_rate": 5.3213036861801125e-08, + "loss": 0.3369, + "step": 43570 + }, + { + "epoch": 0.9672478662833931, + "grad_norm": 2.011275291442871, + "learning_rate": 5.285441318804929e-08, + "loss": 0.3461, + "step": 43575 + }, + { + "epoch": 0.9673588528429207, + "grad_norm": 0.7465802431106567, + "learning_rate": 5.249699885489734e-08, + "loss": 0.3433, + "step": 43580 + }, + { + "epoch": 0.9674698394024484, + "grad_norm": 0.9246371388435364, + "learning_rate": 5.2140793905799405e-08, + "loss": 0.4466, + "step": 43585 + }, + { + "epoch": 0.967580825961976, + "grad_norm": 0.8375202417373657, + "learning_rate": 5.178579838406084e-08, + "loss": 0.3347, + "step": 43590 + }, + { + "epoch": 0.9676918125215036, + "grad_norm": 0.8115552663803101, + "learning_rate": 5.143201233284156e-08, + "loss": 0.3079, + "step": 43595 + }, + { + "epoch": 0.9678027990810313, + "grad_norm": 0.6515522599220276, + "learning_rate": 5.1079435795152735e-08, + "loss": 0.3216, + "step": 43600 + }, + { + "epoch": 0.967913785640559, + "grad_norm": 0.7174926400184631, + "learning_rate": 5.072806881386005e-08, + "loss": 0.2816, + "step": 43605 + }, + { + "epoch": 0.9680247722000865, + "grad_norm": 1.0490044355392456, + "learning_rate": 5.037791143168158e-08, + "loss": 0.2679, + "step": 43610 + }, + { + "epoch": 0.9681357587596142, + "grad_norm": 1.0273045301437378, + "learning_rate": 5.0028963691188813e-08, + "loss": 0.3042, + "step": 43615 + }, + { + "epoch": 0.9682467453191419, + "grad_norm": 1.9209235906600952, + "learning_rate": 4.9681225634804484e-08, + "loss": 0.5211, + "step": 43620 + }, + { + "epoch": 0.9683577318786695, + "grad_norm": 1.4809397459030151, + "learning_rate": 4.933469730480589e-08, + "loss": 0.4624, + "step": 43625 + }, + { + "epoch": 0.9684687184381972, + "grad_norm": 1.1139112710952759, + "learning_rate": 4.8989378743322654e-08, + "loss": 0.289, + "step": 43630 + }, + { + "epoch": 0.9685797049977247, + "grad_norm": 1.4791672229766846, + "learning_rate": 4.8645269992337875e-08, + "loss": 0.4627, + "step": 43635 + }, + { + "epoch": 0.9686906915572524, + "grad_norm": 1.950037956237793, + "learning_rate": 4.830237109368696e-08, + "loss": 0.4771, + "step": 43640 + }, + { + "epoch": 0.9688016781167801, + "grad_norm": 1.644313097000122, + "learning_rate": 4.79606820890588e-08, + "loss": 0.3903, + "step": 43645 + }, + { + "epoch": 0.9689126646763077, + "grad_norm": 1.6992888450622559, + "learning_rate": 4.762020301999459e-08, + "loss": 0.3884, + "step": 43650 + }, + { + "epoch": 0.9690236512358353, + "grad_norm": 1.1619939804077148, + "learning_rate": 4.7280933927886795e-08, + "loss": 0.3372, + "step": 43655 + }, + { + "epoch": 0.969134637795363, + "grad_norm": 1.012537956237793, + "learning_rate": 4.694287485398574e-08, + "loss": 0.4419, + "step": 43660 + }, + { + "epoch": 0.9692456243548906, + "grad_norm": 1.0167009830474854, + "learning_rate": 4.660602583938967e-08, + "loss": 0.6154, + "step": 43665 + }, + { + "epoch": 0.9693566109144183, + "grad_norm": 2.161961317062378, + "learning_rate": 4.6270386925051366e-08, + "loss": 0.3254, + "step": 43670 + }, + { + "epoch": 0.969467597473946, + "grad_norm": 0.96853107213974, + "learning_rate": 4.59359581517782e-08, + "loss": 0.1922, + "step": 43675 + }, + { + "epoch": 0.9695785840334735, + "grad_norm": 0.6753823161125183, + "learning_rate": 4.560273956022654e-08, + "loss": 0.5552, + "step": 43680 + }, + { + "epoch": 0.9696895705930012, + "grad_norm": 1.753062129020691, + "learning_rate": 4.527073119091063e-08, + "loss": 0.3512, + "step": 43685 + }, + { + "epoch": 0.9698005571525288, + "grad_norm": 1.2516409158706665, + "learning_rate": 4.4939933084192646e-08, + "loss": 0.392, + "step": 43690 + }, + { + "epoch": 0.9699115437120565, + "grad_norm": 1.4465858936309814, + "learning_rate": 4.461034528029151e-08, + "loss": 0.4693, + "step": 43695 + }, + { + "epoch": 0.9700225302715841, + "grad_norm": 0.8663288950920105, + "learning_rate": 4.42819678192774e-08, + "loss": 0.4261, + "step": 43700 + }, + { + "epoch": 0.9701335168311117, + "grad_norm": 0.7888398766517639, + "learning_rate": 4.395480074107172e-08, + "loss": 0.4786, + "step": 43705 + }, + { + "epoch": 0.9702445033906394, + "grad_norm": 0.7504032850265503, + "learning_rate": 4.362884408545154e-08, + "loss": 0.2153, + "step": 43710 + }, + { + "epoch": 0.9703554899501671, + "grad_norm": 1.000174641609192, + "learning_rate": 4.330409789204515e-08, + "loss": 0.3833, + "step": 43715 + }, + { + "epoch": 0.9704664765096946, + "grad_norm": 1.6008414030075073, + "learning_rate": 4.298056220033542e-08, + "loss": 0.1503, + "step": 43720 + }, + { + "epoch": 0.9705774630692223, + "grad_norm": 1.3198015689849854, + "learning_rate": 4.2658237049655325e-08, + "loss": 0.3803, + "step": 43725 + }, + { + "epoch": 0.97068844962875, + "grad_norm": 0.7263050079345703, + "learning_rate": 4.2337122479191304e-08, + "loss": 0.2082, + "step": 43730 + }, + { + "epoch": 0.9707994361882776, + "grad_norm": 1.561052918434143, + "learning_rate": 4.201721852798657e-08, + "loss": 0.4365, + "step": 43735 + }, + { + "epoch": 0.9709104227478053, + "grad_norm": 0.8215240836143494, + "learning_rate": 4.169852523493001e-08, + "loss": 0.2649, + "step": 43740 + }, + { + "epoch": 0.9710214093073328, + "grad_norm": 0.9629057049751282, + "learning_rate": 4.138104263877063e-08, + "loss": 0.4287, + "step": 43745 + }, + { + "epoch": 0.9711323958668605, + "grad_norm": 1.085941195487976, + "learning_rate": 4.106477077810422e-08, + "loss": 0.3626, + "step": 43750 + }, + { + "epoch": 0.9712433824263882, + "grad_norm": 1.2793713808059692, + "learning_rate": 4.0749709691383365e-08, + "loss": 0.232, + "step": 43755 + }, + { + "epoch": 0.9713543689859158, + "grad_norm": 1.0775913000106812, + "learning_rate": 4.043585941691297e-08, + "loss": 0.3176, + "step": 43760 + }, + { + "epoch": 0.9714653555454434, + "grad_norm": 1.6739836931228638, + "learning_rate": 4.012321999284918e-08, + "loss": 0.3226, + "step": 43765 + }, + { + "epoch": 0.9715763421049711, + "grad_norm": 0.8687372207641602, + "learning_rate": 3.981179145720049e-08, + "loss": 0.4759, + "step": 43770 + }, + { + "epoch": 0.9716873286644987, + "grad_norm": 0.6673811078071594, + "learning_rate": 3.950157384783104e-08, + "loss": 0.2595, + "step": 43775 + }, + { + "epoch": 0.9717983152240264, + "grad_norm": 0.8460555672645569, + "learning_rate": 3.9192567202455125e-08, + "loss": 0.3479, + "step": 43780 + }, + { + "epoch": 0.9719093017835541, + "grad_norm": 1.246845006942749, + "learning_rate": 3.8884771558640454e-08, + "loss": 0.3967, + "step": 43785 + }, + { + "epoch": 0.9720202883430816, + "grad_norm": 3.1427626609802246, + "learning_rate": 3.8578186953808216e-08, + "loss": 0.3284, + "step": 43790 + }, + { + "epoch": 0.9721312749026093, + "grad_norm": 1.5429573059082031, + "learning_rate": 3.827281342523304e-08, + "loss": 0.2478, + "step": 43795 + }, + { + "epoch": 0.9722422614621369, + "grad_norm": 1.5764998197555542, + "learning_rate": 3.796865101003966e-08, + "loss": 0.4668, + "step": 43800 + }, + { + "epoch": 0.9723532480216646, + "grad_norm": 1.108802318572998, + "learning_rate": 3.76656997452074e-08, + "loss": 0.4174, + "step": 43805 + }, + { + "epoch": 0.9724642345811922, + "grad_norm": 1.0935370922088623, + "learning_rate": 3.7363959667569006e-08, + "loss": 0.403, + "step": 43810 + }, + { + "epoch": 0.9725752211407198, + "grad_norm": 1.443190336227417, + "learning_rate": 3.706343081380737e-08, + "loss": 0.3971, + "step": 43815 + }, + { + "epoch": 0.9726862077002475, + "grad_norm": 2.337955951690674, + "learning_rate": 3.676411322046103e-08, + "loss": 0.3461, + "step": 43820 + }, + { + "epoch": 0.9727971942597752, + "grad_norm": 2.3962912559509277, + "learning_rate": 3.6466006923919775e-08, + "loss": 0.446, + "step": 43825 + }, + { + "epoch": 0.9729081808193027, + "grad_norm": 1.3454636335372925, + "learning_rate": 3.6169111960426826e-08, + "loss": 0.3366, + "step": 43830 + }, + { + "epoch": 0.9730191673788304, + "grad_norm": 1.3569358587265015, + "learning_rate": 3.587342836607666e-08, + "loss": 0.3145, + "step": 43835 + }, + { + "epoch": 0.9731301539383581, + "grad_norm": 0.7849341034889221, + "learning_rate": 3.557895617681717e-08, + "loss": 0.3135, + "step": 43840 + }, + { + "epoch": 0.9732411404978857, + "grad_norm": 1.1286523342132568, + "learning_rate": 3.528569542845084e-08, + "loss": 0.3887, + "step": 43845 + }, + { + "epoch": 0.9733521270574134, + "grad_norm": 1.1259015798568726, + "learning_rate": 3.499364615663137e-08, + "loss": 0.344, + "step": 43850 + }, + { + "epoch": 0.9734631136169409, + "grad_norm": 1.3279014825820923, + "learning_rate": 3.4702808396863683e-08, + "loss": 0.3563, + "step": 43855 + }, + { + "epoch": 0.9735741001764686, + "grad_norm": 2.132629871368408, + "learning_rate": 3.4413182184507285e-08, + "loss": 0.4919, + "step": 43860 + }, + { + "epoch": 0.9736850867359963, + "grad_norm": 1.5616865158081055, + "learning_rate": 3.412476755477401e-08, + "loss": 0.4435, + "step": 43865 + }, + { + "epoch": 0.9737960732955239, + "grad_norm": 0.9031458497047424, + "learning_rate": 3.383756454272913e-08, + "loss": 0.462, + "step": 43870 + }, + { + "epoch": 0.9739070598550515, + "grad_norm": 1.6827720403671265, + "learning_rate": 3.355157318328916e-08, + "loss": 0.2801, + "step": 43875 + }, + { + "epoch": 0.9740180464145792, + "grad_norm": 1.1727105379104614, + "learning_rate": 3.326679351122408e-08, + "loss": 0.4307, + "step": 43880 + }, + { + "epoch": 0.9741290329741068, + "grad_norm": 1.3294233083724976, + "learning_rate": 3.2983225561156185e-08, + "loss": 0.4035, + "step": 43885 + }, + { + "epoch": 0.9742400195336345, + "grad_norm": 1.5523672103881836, + "learning_rate": 3.270086936756123e-08, + "loss": 0.3738, + "step": 43890 + }, + { + "epoch": 0.9743510060931622, + "grad_norm": 1.3038649559020996, + "learning_rate": 3.24197249647662e-08, + "loss": 0.2539, + "step": 43895 + }, + { + "epoch": 0.9744619926526897, + "grad_norm": 1.1056405305862427, + "learning_rate": 3.213979238695375e-08, + "loss": 0.3263, + "step": 43900 + }, + { + "epoch": 0.9745729792122174, + "grad_norm": 1.8236504793167114, + "learning_rate": 3.186107166815444e-08, + "loss": 0.4064, + "step": 43905 + }, + { + "epoch": 0.974683965771745, + "grad_norm": 1.126415729522705, + "learning_rate": 3.158356284225561e-08, + "loss": 0.2778, + "step": 43910 + }, + { + "epoch": 0.9747949523312727, + "grad_norm": 1.2793995141983032, + "learning_rate": 3.1307265942996935e-08, + "loss": 0.4504, + "step": 43915 + }, + { + "epoch": 0.9749059388908003, + "grad_norm": 1.346092939376831, + "learning_rate": 3.1032181003967096e-08, + "loss": 0.3949, + "step": 43920 + }, + { + "epoch": 0.9750169254503279, + "grad_norm": 0.5175896286964417, + "learning_rate": 3.075830805861157e-08, + "loss": 0.3089, + "step": 43925 + }, + { + "epoch": 0.9751279120098556, + "grad_norm": 1.5366896390914917, + "learning_rate": 3.048564714022706e-08, + "loss": 0.4121, + "step": 43930 + }, + { + "epoch": 0.9752388985693833, + "grad_norm": 0.8148269057273865, + "learning_rate": 3.021419828196259e-08, + "loss": 0.39, + "step": 43935 + }, + { + "epoch": 0.9753498851289109, + "grad_norm": 1.4114397764205933, + "learning_rate": 2.994396151681955e-08, + "loss": 0.4164, + "step": 43940 + }, + { + "epoch": 0.9754608716884385, + "grad_norm": 1.029274344444275, + "learning_rate": 2.967493687765277e-08, + "loss": 0.292, + "step": 43945 + }, + { + "epoch": 0.9755718582479662, + "grad_norm": 1.5301095247268677, + "learning_rate": 2.9407124397169418e-08, + "loss": 0.4802, + "step": 43950 + }, + { + "epoch": 0.9756828448074938, + "grad_norm": 1.4264928102493286, + "learning_rate": 2.9140524107929e-08, + "loss": 0.3826, + "step": 43955 + }, + { + "epoch": 0.9757938313670215, + "grad_norm": 0.8320451974868774, + "learning_rate": 2.8875136042343378e-08, + "loss": 0.4036, + "step": 43960 + }, + { + "epoch": 0.975904817926549, + "grad_norm": 1.3311158418655396, + "learning_rate": 2.8610960232678952e-08, + "loss": 0.5306, + "step": 43965 + }, + { + "epoch": 0.9760158044860767, + "grad_norm": 1.6904776096343994, + "learning_rate": 2.8347996711052262e-08, + "loss": 0.5003, + "step": 43970 + }, + { + "epoch": 0.9761267910456044, + "grad_norm": 0.4445568919181824, + "learning_rate": 2.8086245509434397e-08, + "loss": 0.332, + "step": 43975 + }, + { + "epoch": 0.976237777605132, + "grad_norm": 1.2849273681640625, + "learning_rate": 2.7825706659646568e-08, + "loss": 0.504, + "step": 43980 + }, + { + "epoch": 0.9763487641646597, + "grad_norm": 1.234660267829895, + "learning_rate": 2.756638019336566e-08, + "loss": 0.3669, + "step": 43985 + }, + { + "epoch": 0.9764597507241873, + "grad_norm": 1.010250210762024, + "learning_rate": 2.7308266142119788e-08, + "loss": 0.2806, + "step": 43990 + }, + { + "epoch": 0.9765707372837149, + "grad_norm": 1.0096099376678467, + "learning_rate": 2.7051364537288293e-08, + "loss": 0.3195, + "step": 43995 + }, + { + "epoch": 0.9766817238432426, + "grad_norm": 1.6283340454101562, + "learning_rate": 2.679567541010619e-08, + "loss": 0.374, + "step": 44000 + }, + { + "epoch": 0.9767927104027703, + "grad_norm": 1.0632973909378052, + "learning_rate": 2.6541198791657506e-08, + "loss": 0.2042, + "step": 44005 + }, + { + "epoch": 0.9769036969622978, + "grad_norm": 1.6411769390106201, + "learning_rate": 2.6287934712881936e-08, + "loss": 0.3868, + "step": 44010 + }, + { + "epoch": 0.9770146835218255, + "grad_norm": 0.9020516872406006, + "learning_rate": 2.603588320456929e-08, + "loss": 0.3887, + "step": 44015 + }, + { + "epoch": 0.9771256700813531, + "grad_norm": 1.113605260848999, + "learning_rate": 2.5785044297365057e-08, + "loss": 0.4919, + "step": 44020 + }, + { + "epoch": 0.9772366566408808, + "grad_norm": 1.0201387405395508, + "learning_rate": 2.5535418021763735e-08, + "loss": 0.3106, + "step": 44025 + }, + { + "epoch": 0.9773476432004085, + "grad_norm": 1.0621434450149536, + "learning_rate": 2.528700440811438e-08, + "loss": 0.4039, + "step": 44030 + }, + { + "epoch": 0.977458629759936, + "grad_norm": 1.047764539718628, + "learning_rate": 2.5039803486618385e-08, + "loss": 0.3628, + "step": 44035 + }, + { + "epoch": 0.9775696163194637, + "grad_norm": 1.3255995512008667, + "learning_rate": 2.479381528732949e-08, + "loss": 0.2393, + "step": 44040 + }, + { + "epoch": 0.9776806028789914, + "grad_norm": 1.75392484664917, + "learning_rate": 2.4549039840154887e-08, + "loss": 0.4483, + "step": 44045 + }, + { + "epoch": 0.977791589438519, + "grad_norm": 0.8042294383049011, + "learning_rate": 2.4305477174852986e-08, + "loss": 0.3084, + "step": 44050 + }, + { + "epoch": 0.9779025759980466, + "grad_norm": 0.8410343527793884, + "learning_rate": 2.4063127321034552e-08, + "loss": 0.2697, + "step": 44055 + }, + { + "epoch": 0.9780135625575743, + "grad_norm": 1.6293293237686157, + "learning_rate": 2.38219903081649e-08, + "loss": 0.3805, + "step": 44060 + }, + { + "epoch": 0.9781245491171019, + "grad_norm": 0.9448833465576172, + "learning_rate": 2.358206616555947e-08, + "loss": 0.4176, + "step": 44065 + }, + { + "epoch": 0.9782355356766296, + "grad_norm": 0.8377218842506409, + "learning_rate": 2.3343354922389372e-08, + "loss": 0.4437, + "step": 44070 + }, + { + "epoch": 0.9783465222361573, + "grad_norm": 1.2574573755264282, + "learning_rate": 2.3105856607674727e-08, + "loss": 0.441, + "step": 44075 + }, + { + "epoch": 0.9784575087956848, + "grad_norm": 1.2281770706176758, + "learning_rate": 2.2869571250289102e-08, + "loss": 0.3629, + "step": 44080 + }, + { + "epoch": 0.9785684953552125, + "grad_norm": 1.4200760126113892, + "learning_rate": 2.263449887896174e-08, + "loss": 0.4407, + "step": 44085 + }, + { + "epoch": 0.9786794819147401, + "grad_norm": 1.2259193658828735, + "learning_rate": 2.2400639522269786e-08, + "loss": 0.4167, + "step": 44090 + }, + { + "epoch": 0.9787904684742678, + "grad_norm": 0.6913291215896606, + "learning_rate": 2.2167993208644932e-08, + "loss": 0.2671, + "step": 44095 + }, + { + "epoch": 0.9789014550337954, + "grad_norm": 1.290332555770874, + "learning_rate": 2.193655996637345e-08, + "loss": 0.3919, + "step": 44100 + }, + { + "epoch": 0.979012441593323, + "grad_norm": 0.9615741968154907, + "learning_rate": 2.1706339823591716e-08, + "loss": 0.5159, + "step": 44105 + }, + { + "epoch": 0.9791234281528507, + "grad_norm": 1.1563535928726196, + "learning_rate": 2.1477332808287342e-08, + "loss": 0.3505, + "step": 44110 + }, + { + "epoch": 0.9792344147123784, + "grad_norm": 1.0211172103881836, + "learning_rate": 2.1249538948304726e-08, + "loss": 0.4348, + "step": 44115 + }, + { + "epoch": 0.979345401271906, + "grad_norm": 0.7108829617500305, + "learning_rate": 2.1022958271336158e-08, + "loss": 0.4062, + "step": 44120 + }, + { + "epoch": 0.9794563878314336, + "grad_norm": 0.9567662477493286, + "learning_rate": 2.07975908049296e-08, + "loss": 0.5, + "step": 44125 + }, + { + "epoch": 0.9795673743909613, + "grad_norm": 1.2178698778152466, + "learning_rate": 2.0573436576484253e-08, + "loss": 0.3449, + "step": 44130 + }, + { + "epoch": 0.9796783609504889, + "grad_norm": 1.179904580116272, + "learning_rate": 2.0350495613252753e-08, + "loss": 0.2882, + "step": 44135 + }, + { + "epoch": 0.9797893475100166, + "grad_norm": 1.3387186527252197, + "learning_rate": 2.0128767942337868e-08, + "loss": 0.2806, + "step": 44140 + }, + { + "epoch": 0.9799003340695441, + "grad_norm": 0.8245904445648193, + "learning_rate": 1.9908253590698033e-08, + "loss": 0.3059, + "step": 44145 + }, + { + "epoch": 0.9800113206290718, + "grad_norm": 1.3168443441390991, + "learning_rate": 1.9688952585141808e-08, + "loss": 0.3215, + "step": 44150 + }, + { + "epoch": 0.9801223071885995, + "grad_norm": 1.4170719385147095, + "learning_rate": 1.9470864952331192e-08, + "loss": 0.4385, + "step": 44155 + }, + { + "epoch": 0.9802332937481271, + "grad_norm": 1.0151528120040894, + "learning_rate": 1.9253990718781646e-08, + "loss": 0.4276, + "step": 44160 + }, + { + "epoch": 0.9803442803076547, + "grad_norm": 2.1504173278808594, + "learning_rate": 1.9038329910858743e-08, + "loss": 0.4582, + "step": 44165 + }, + { + "epoch": 0.9804552668671824, + "grad_norm": 2.0559544563293457, + "learning_rate": 1.8823882554781513e-08, + "loss": 0.3243, + "step": 44170 + }, + { + "epoch": 0.98056625342671, + "grad_norm": 2.0051698684692383, + "learning_rate": 1.8610648676622432e-08, + "loss": 0.5498, + "step": 44175 + }, + { + "epoch": 0.9806772399862377, + "grad_norm": 0.8001129627227783, + "learning_rate": 1.839862830230632e-08, + "loss": 0.4211, + "step": 44180 + }, + { + "epoch": 0.9807882265457654, + "grad_norm": 1.5729435682296753, + "learning_rate": 1.8187821457609222e-08, + "loss": 0.3743, + "step": 44185 + }, + { + "epoch": 0.9808992131052929, + "grad_norm": 1.2498583793640137, + "learning_rate": 1.7978228168160638e-08, + "loss": 0.3745, + "step": 44190 + }, + { + "epoch": 0.9810101996648206, + "grad_norm": 0.9071432948112488, + "learning_rate": 1.7769848459441296e-08, + "loss": 0.3781, + "step": 44195 + }, + { + "epoch": 0.9811211862243482, + "grad_norm": 0.6762136220932007, + "learning_rate": 1.7562682356786488e-08, + "loss": 0.4706, + "step": 44200 + }, + { + "epoch": 0.9812321727838759, + "grad_norm": 1.1027727127075195, + "learning_rate": 1.7356729885381617e-08, + "loss": 0.3397, + "step": 44205 + }, + { + "epoch": 0.9813431593434035, + "grad_norm": 1.1394597291946411, + "learning_rate": 1.715199107026666e-08, + "loss": 0.3597, + "step": 44210 + }, + { + "epoch": 0.9814541459029311, + "grad_norm": 1.1961270570755005, + "learning_rate": 1.6948465936332815e-08, + "loss": 0.3961, + "step": 44215 + }, + { + "epoch": 0.9815651324624588, + "grad_norm": 0.7435837388038635, + "learning_rate": 1.674615450832362e-08, + "loss": 0.3602, + "step": 44220 + }, + { + "epoch": 0.9816761190219865, + "grad_norm": 1.2395622730255127, + "learning_rate": 1.6545056810836068e-08, + "loss": 0.3737, + "step": 44225 + }, + { + "epoch": 0.981787105581514, + "grad_norm": 1.4732521772384644, + "learning_rate": 1.634517286831949e-08, + "loss": 0.3207, + "step": 44230 + }, + { + "epoch": 0.9818980921410417, + "grad_norm": 1.865546464920044, + "learning_rate": 1.6146502705072233e-08, + "loss": 0.3945, + "step": 44235 + }, + { + "epoch": 0.9820090787005694, + "grad_norm": 0.9982540607452393, + "learning_rate": 1.594904634525163e-08, + "loss": 0.2663, + "step": 44240 + }, + { + "epoch": 0.982120065260097, + "grad_norm": 1.148383617401123, + "learning_rate": 1.5752803812860706e-08, + "loss": 0.2313, + "step": 44245 + }, + { + "epoch": 0.9822310518196247, + "grad_norm": 1.4810702800750732, + "learning_rate": 1.5557775131760376e-08, + "loss": 0.3491, + "step": 44250 + }, + { + "epoch": 0.9823420383791522, + "grad_norm": 1.1810661554336548, + "learning_rate": 1.5363960325660565e-08, + "loss": 0.3289, + "step": 44255 + }, + { + "epoch": 0.9824530249386799, + "grad_norm": 1.225584864616394, + "learning_rate": 1.5171359418123533e-08, + "loss": 0.5736, + "step": 44260 + }, + { + "epoch": 0.9825640114982076, + "grad_norm": 1.2184268236160278, + "learning_rate": 1.4979972432567213e-08, + "loss": 0.4315, + "step": 44265 + }, + { + "epoch": 0.9826749980577352, + "grad_norm": 2.8620071411132812, + "learning_rate": 1.4789799392258553e-08, + "loss": 0.3292, + "step": 44270 + }, + { + "epoch": 0.9827859846172629, + "grad_norm": 1.2940585613250732, + "learning_rate": 1.4600840320317945e-08, + "loss": 0.5029, + "step": 44275 + }, + { + "epoch": 0.9828969711767905, + "grad_norm": 0.979594349861145, + "learning_rate": 1.4413095239719233e-08, + "loss": 0.2596, + "step": 44280 + }, + { + "epoch": 0.9830079577363181, + "grad_norm": 0.9143130779266357, + "learning_rate": 1.4226564173286383e-08, + "loss": 0.3334, + "step": 44285 + }, + { + "epoch": 0.9831189442958458, + "grad_norm": 1.4164605140686035, + "learning_rate": 1.4041247143699033e-08, + "loss": 0.3318, + "step": 44290 + }, + { + "epoch": 0.9832299308553735, + "grad_norm": 1.3115427494049072, + "learning_rate": 1.3857144173485827e-08, + "loss": 0.447, + "step": 44295 + }, + { + "epoch": 0.983340917414901, + "grad_norm": 1.3506934642791748, + "learning_rate": 1.3674255285031079e-08, + "loss": 0.4044, + "step": 44300 + }, + { + "epoch": 0.9834519039744287, + "grad_norm": 1.1205554008483887, + "learning_rate": 1.349258050056812e-08, + "loss": 0.363, + "step": 44305 + }, + { + "epoch": 0.9835628905339563, + "grad_norm": 0.9037481546401978, + "learning_rate": 1.3312119842184834e-08, + "loss": 0.3079, + "step": 44310 + }, + { + "epoch": 0.983673877093484, + "grad_norm": 1.1315191984176636, + "learning_rate": 1.313287333182256e-08, + "loss": 0.3683, + "step": 44315 + }, + { + "epoch": 0.9837848636530117, + "grad_norm": 1.7846201658248901, + "learning_rate": 1.2954840991270535e-08, + "loss": 0.2396, + "step": 44320 + }, + { + "epoch": 0.9838958502125392, + "grad_norm": 1.5754671096801758, + "learning_rate": 1.2778022842175886e-08, + "loss": 0.2761, + "step": 44325 + }, + { + "epoch": 0.9840068367720669, + "grad_norm": 1.4894391298294067, + "learning_rate": 1.2602418906034753e-08, + "loss": 0.3827, + "step": 44330 + }, + { + "epoch": 0.9841178233315946, + "grad_norm": 2.369821786880493, + "learning_rate": 1.2428029204195612e-08, + "loss": 0.4618, + "step": 44335 + }, + { + "epoch": 0.9842288098911222, + "grad_norm": 1.513770341873169, + "learning_rate": 1.2254853757862617e-08, + "loss": 0.336, + "step": 44340 + }, + { + "epoch": 0.9843397964506498, + "grad_norm": 0.6345555186271667, + "learning_rate": 1.2082892588086704e-08, + "loss": 0.3276, + "step": 44345 + }, + { + "epoch": 0.9844507830101775, + "grad_norm": 0.9394330978393555, + "learning_rate": 1.1912145715775591e-08, + "loss": 0.4819, + "step": 44350 + }, + { + "epoch": 0.9845617695697051, + "grad_norm": 1.2701035737991333, + "learning_rate": 1.1742613161689342e-08, + "loss": 0.3594, + "step": 44355 + }, + { + "epoch": 0.9846727561292328, + "grad_norm": 1.3909372091293335, + "learning_rate": 1.1574294946438136e-08, + "loss": 0.3869, + "step": 44360 + }, + { + "epoch": 0.9847837426887603, + "grad_norm": 1.4137762784957886, + "learning_rate": 1.1407191090485604e-08, + "loss": 0.4051, + "step": 44365 + }, + { + "epoch": 0.984894729248288, + "grad_norm": 0.9500555396080017, + "learning_rate": 1.1241301614147715e-08, + "loss": 0.3502, + "step": 44370 + }, + { + "epoch": 0.9850057158078157, + "grad_norm": 0.7241771221160889, + "learning_rate": 1.1076626537591672e-08, + "loss": 0.3031, + "step": 44375 + }, + { + "epoch": 0.9851167023673433, + "grad_norm": 0.9816907048225403, + "learning_rate": 1.0913165880840348e-08, + "loss": 0.2453, + "step": 44380 + }, + { + "epoch": 0.985227688926871, + "grad_norm": 1.0117193460464478, + "learning_rate": 1.0750919663764514e-08, + "loss": 0.3752, + "step": 44385 + }, + { + "epoch": 0.9853386754863986, + "grad_norm": 0.9182025194168091, + "learning_rate": 1.0589887906090612e-08, + "loss": 0.4162, + "step": 44390 + }, + { + "epoch": 0.9854496620459262, + "grad_norm": 1.2671328783035278, + "learning_rate": 1.0430070627397425e-08, + "loss": 0.4278, + "step": 44395 + }, + { + "epoch": 0.9855606486054539, + "grad_norm": 1.280246376991272, + "learning_rate": 1.0271467847112748e-08, + "loss": 0.3933, + "step": 44400 + }, + { + "epoch": 0.9856716351649816, + "grad_norm": 0.9924276471138, + "learning_rate": 1.0114079584520042e-08, + "loss": 0.2724, + "step": 44405 + }, + { + "epoch": 0.9857826217245091, + "grad_norm": 0.8611973524093628, + "learning_rate": 9.957905858755112e-09, + "loss": 0.4114, + "step": 44410 + }, + { + "epoch": 0.9858936082840368, + "grad_norm": 1.7133750915527344, + "learning_rate": 9.802946688802772e-09, + "loss": 0.3201, + "step": 44415 + }, + { + "epoch": 0.9860045948435644, + "grad_norm": 1.1161009073257446, + "learning_rate": 9.649202093504616e-09, + "loss": 0.4083, + "step": 44420 + }, + { + "epoch": 0.9861155814030921, + "grad_norm": 1.7523982524871826, + "learning_rate": 9.49667209155014e-09, + "loss": 0.3819, + "step": 44425 + }, + { + "epoch": 0.9862265679626198, + "grad_norm": 1.6116431951522827, + "learning_rate": 9.34535670148562e-09, + "loss": 0.3923, + "step": 44430 + }, + { + "epoch": 0.9863375545221473, + "grad_norm": 1.6228337287902832, + "learning_rate": 9.195255941707448e-09, + "loss": 0.3135, + "step": 44435 + }, + { + "epoch": 0.986448541081675, + "grad_norm": 1.1930763721466064, + "learning_rate": 9.046369830462143e-09, + "loss": 0.226, + "step": 44440 + }, + { + "epoch": 0.9865595276412027, + "grad_norm": 2.828937292098999, + "learning_rate": 8.898698385853e-09, + "loss": 0.4682, + "step": 44445 + }, + { + "epoch": 0.9866705142007303, + "grad_norm": 0.6722959280014038, + "learning_rate": 8.752241625831215e-09, + "loss": 0.2786, + "step": 44450 + }, + { + "epoch": 0.986781500760258, + "grad_norm": 2.1818439960479736, + "learning_rate": 8.606999568204766e-09, + "loss": 0.4206, + "step": 44455 + }, + { + "epoch": 0.9868924873197856, + "grad_norm": 0.8703639507293701, + "learning_rate": 8.46297223063064e-09, + "loss": 0.4662, + "step": 44460 + }, + { + "epoch": 0.9870034738793132, + "grad_norm": 1.0020828247070312, + "learning_rate": 8.320159630620384e-09, + "loss": 0.3988, + "step": 44465 + }, + { + "epoch": 0.9871144604388409, + "grad_norm": 1.252456784248352, + "learning_rate": 8.178561785534556e-09, + "loss": 0.3913, + "step": 44470 + }, + { + "epoch": 0.9872254469983684, + "grad_norm": 1.3738957643508911, + "learning_rate": 8.038178712589384e-09, + "loss": 0.5101, + "step": 44475 + }, + { + "epoch": 0.9873364335578961, + "grad_norm": 1.1012595891952515, + "learning_rate": 7.899010428852328e-09, + "loss": 0.3748, + "step": 44480 + }, + { + "epoch": 0.9874474201174238, + "grad_norm": 1.94729745388031, + "learning_rate": 7.761056951242074e-09, + "loss": 0.2961, + "step": 44485 + }, + { + "epoch": 0.9875584066769514, + "grad_norm": 0.995019793510437, + "learning_rate": 7.624318296530763e-09, + "loss": 0.271, + "step": 44490 + }, + { + "epoch": 0.9876693932364791, + "grad_norm": 1.0689935684204102, + "learning_rate": 7.488794481343986e-09, + "loss": 0.518, + "step": 44495 + }, + { + "epoch": 0.9877803797960067, + "grad_norm": 1.081696629524231, + "learning_rate": 7.354485522157451e-09, + "loss": 0.3497, + "step": 44500 + }, + { + "epoch": 0.9878913663555343, + "grad_norm": 1.0555349588394165, + "learning_rate": 7.221391435299208e-09, + "loss": 0.487, + "step": 44505 + }, + { + "epoch": 0.988002352915062, + "grad_norm": 1.3464195728302002, + "learning_rate": 7.089512236950758e-09, + "loss": 0.5904, + "step": 44510 + }, + { + "epoch": 0.9881133394745897, + "grad_norm": 1.7165731191635132, + "learning_rate": 6.958847943147051e-09, + "loss": 0.4417, + "step": 44515 + }, + { + "epoch": 0.9882243260341173, + "grad_norm": 0.8013442754745483, + "learning_rate": 6.82939856977094e-09, + "loss": 0.3522, + "step": 44520 + }, + { + "epoch": 0.9883353125936449, + "grad_norm": 1.3391999006271362, + "learning_rate": 6.701164132563165e-09, + "loss": 0.2826, + "step": 44525 + }, + { + "epoch": 0.9884462991531725, + "grad_norm": 1.4739177227020264, + "learning_rate": 6.574144647112368e-09, + "loss": 0.3699, + "step": 44530 + }, + { + "epoch": 0.9885572857127002, + "grad_norm": 0.9099801182746887, + "learning_rate": 6.448340128861752e-09, + "loss": 0.3221, + "step": 44535 + }, + { + "epoch": 0.9886682722722279, + "grad_norm": 2.0303006172180176, + "learning_rate": 6.323750593106859e-09, + "loss": 0.4449, + "step": 44540 + }, + { + "epoch": 0.9887792588317554, + "grad_norm": 1.530608892440796, + "learning_rate": 6.200376054993351e-09, + "loss": 0.3188, + "step": 44545 + }, + { + "epoch": 0.9888902453912831, + "grad_norm": 2.018537998199463, + "learning_rate": 6.078216529522563e-09, + "loss": 0.4305, + "step": 44550 + }, + { + "epoch": 0.9890012319508108, + "grad_norm": 0.8686156272888184, + "learning_rate": 5.957272031543726e-09, + "loss": 0.2856, + "step": 44555 + }, + { + "epoch": 0.9891122185103384, + "grad_norm": 0.7620587348937988, + "learning_rate": 5.837542575763966e-09, + "loss": 0.4426, + "step": 44560 + }, + { + "epoch": 0.989223205069866, + "grad_norm": 4.526071071624756, + "learning_rate": 5.719028176737196e-09, + "loss": 0.4328, + "step": 44565 + }, + { + "epoch": 0.9893341916293937, + "grad_norm": 0.9952146410942078, + "learning_rate": 5.60172884887189e-09, + "loss": 0.2975, + "step": 44570 + }, + { + "epoch": 0.9894451781889213, + "grad_norm": 1.9656871557235718, + "learning_rate": 5.485644606431084e-09, + "loss": 0.286, + "step": 44575 + }, + { + "epoch": 0.989556164748449, + "grad_norm": 1.245468258857727, + "learning_rate": 5.3707754635257126e-09, + "loss": 0.4315, + "step": 44580 + }, + { + "epoch": 0.9896671513079766, + "grad_norm": 1.3038861751556396, + "learning_rate": 5.257121434122381e-09, + "loss": 0.31, + "step": 44585 + }, + { + "epoch": 0.9897781378675042, + "grad_norm": 1.346030831336975, + "learning_rate": 5.144682532038925e-09, + "loss": 0.3789, + "step": 44590 + }, + { + "epoch": 0.9898891244270319, + "grad_norm": 1.2573784589767456, + "learning_rate": 5.03345877094441e-09, + "loss": 0.3522, + "step": 44595 + }, + { + "epoch": 0.9900001109865595, + "grad_norm": 1.692915678024292, + "learning_rate": 4.923450164361354e-09, + "loss": 0.4208, + "step": 44600 + }, + { + "epoch": 0.9901110975460872, + "grad_norm": 1.0562539100646973, + "learning_rate": 4.814656725664613e-09, + "loss": 0.358, + "step": 44605 + }, + { + "epoch": 0.9902220841056149, + "grad_norm": 1.2630434036254883, + "learning_rate": 4.707078468080273e-09, + "loss": 0.4486, + "step": 44610 + }, + { + "epoch": 0.9903330706651424, + "grad_norm": 1.063307762145996, + "learning_rate": 4.600715404687872e-09, + "loss": 0.387, + "step": 44615 + }, + { + "epoch": 0.9904440572246701, + "grad_norm": 0.9800068736076355, + "learning_rate": 4.495567548419288e-09, + "loss": 0.371, + "step": 44620 + }, + { + "epoch": 0.9905550437841978, + "grad_norm": 1.1773452758789062, + "learning_rate": 4.39163491205652e-09, + "loss": 0.3439, + "step": 44625 + }, + { + "epoch": 0.9906660303437254, + "grad_norm": 0.8336577415466309, + "learning_rate": 4.288917508236124e-09, + "loss": 0.5058, + "step": 44630 + }, + { + "epoch": 0.990777016903253, + "grad_norm": 1.85921049118042, + "learning_rate": 4.187415349445889e-09, + "loss": 0.4798, + "step": 44635 + }, + { + "epoch": 0.9908880034627806, + "grad_norm": 2.1528615951538086, + "learning_rate": 4.087128448027056e-09, + "loss": 0.3445, + "step": 44640 + }, + { + "epoch": 0.9909989900223083, + "grad_norm": 1.1345372200012207, + "learning_rate": 3.988056816170982e-09, + "loss": 0.318, + "step": 44645 + }, + { + "epoch": 0.991109976581836, + "grad_norm": 1.681210994720459, + "learning_rate": 3.890200465923588e-09, + "loss": 0.5825, + "step": 44650 + }, + { + "epoch": 0.9912209631413635, + "grad_norm": 0.6857728362083435, + "learning_rate": 3.793559409180913e-09, + "loss": 0.3354, + "step": 44655 + }, + { + "epoch": 0.9913319497008912, + "grad_norm": 1.3109638690948486, + "learning_rate": 3.6981336576924487e-09, + "loss": 0.362, + "step": 44660 + }, + { + "epoch": 0.9914429362604189, + "grad_norm": 1.4599089622497559, + "learning_rate": 3.603923223060024e-09, + "loss": 0.3748, + "step": 44665 + }, + { + "epoch": 0.9915539228199465, + "grad_norm": 2.070918560028076, + "learning_rate": 3.5109281167367004e-09, + "loss": 0.436, + "step": 44670 + }, + { + "epoch": 0.9916649093794742, + "grad_norm": 1.6659787893295288, + "learning_rate": 3.4191483500300995e-09, + "loss": 0.3908, + "step": 44675 + }, + { + "epoch": 0.9917758959390018, + "grad_norm": 0.8800778985023499, + "learning_rate": 3.328583934096852e-09, + "loss": 0.4847, + "step": 44680 + }, + { + "epoch": 0.9918868824985294, + "grad_norm": 2.3744845390319824, + "learning_rate": 3.23923487994815e-09, + "loss": 0.3263, + "step": 44685 + }, + { + "epoch": 0.9919978690580571, + "grad_norm": 1.5989969968795776, + "learning_rate": 3.151101198446416e-09, + "loss": 0.3522, + "step": 44690 + }, + { + "epoch": 0.9921088556175847, + "grad_norm": 1.0988764762878418, + "learning_rate": 3.064182900307522e-09, + "loss": 0.3976, + "step": 44695 + }, + { + "epoch": 0.9922198421771123, + "grad_norm": 0.8881332874298096, + "learning_rate": 2.978479996098571e-09, + "loss": 0.2979, + "step": 44700 + }, + { + "epoch": 0.99233082873664, + "grad_norm": 1.5848913192749023, + "learning_rate": 2.8939924962378964e-09, + "loss": 0.362, + "step": 44705 + }, + { + "epoch": 0.9924418152961676, + "grad_norm": 2.511681079864502, + "learning_rate": 2.810720410998391e-09, + "loss": 0.434, + "step": 44710 + }, + { + "epoch": 0.9925528018556953, + "grad_norm": 1.503347635269165, + "learning_rate": 2.728663750503069e-09, + "loss": 0.3451, + "step": 44715 + }, + { + "epoch": 0.992663788415223, + "grad_norm": 0.9345759749412537, + "learning_rate": 2.647822524729504e-09, + "loss": 0.2029, + "step": 44720 + }, + { + "epoch": 0.9927747749747505, + "grad_norm": 1.4155058860778809, + "learning_rate": 2.568196743504281e-09, + "loss": 0.3551, + "step": 44725 + }, + { + "epoch": 0.9928857615342782, + "grad_norm": 1.0107218027114868, + "learning_rate": 2.489786416508544e-09, + "loss": 0.228, + "step": 44730 + }, + { + "epoch": 0.9929967480938059, + "grad_norm": 1.2654509544372559, + "learning_rate": 2.4125915532757782e-09, + "loss": 0.5571, + "step": 44735 + }, + { + "epoch": 0.9931077346533335, + "grad_norm": 1.0148260593414307, + "learning_rate": 2.336612163191809e-09, + "loss": 0.2945, + "step": 44740 + }, + { + "epoch": 0.9932187212128611, + "grad_norm": 1.9776355028152466, + "learning_rate": 2.2618482554925825e-09, + "loss": 0.2915, + "step": 44745 + }, + { + "epoch": 0.9933297077723887, + "grad_norm": 0.6568176746368408, + "learning_rate": 2.1882998392674936e-09, + "loss": 0.3109, + "step": 44750 + }, + { + "epoch": 0.9934406943319164, + "grad_norm": 1.109466314315796, + "learning_rate": 2.1159669234593893e-09, + "loss": 0.5572, + "step": 44755 + }, + { + "epoch": 0.9935516808914441, + "grad_norm": 0.8876851201057434, + "learning_rate": 2.044849516861236e-09, + "loss": 0.3924, + "step": 44760 + }, + { + "epoch": 0.9936626674509716, + "grad_norm": 0.8593242168426514, + "learning_rate": 1.97494762811945e-09, + "loss": 0.54, + "step": 44765 + }, + { + "epoch": 0.9937736540104993, + "grad_norm": 1.1869981288909912, + "learning_rate": 1.9062612657338996e-09, + "loss": 0.3661, + "step": 44770 + }, + { + "epoch": 0.993884640570027, + "grad_norm": 2.151790142059326, + "learning_rate": 1.8387904380534615e-09, + "loss": 0.4096, + "step": 44775 + }, + { + "epoch": 0.9939956271295546, + "grad_norm": 0.9031875133514404, + "learning_rate": 1.7725351532815738e-09, + "loss": 0.2741, + "step": 44780 + }, + { + "epoch": 0.9941066136890823, + "grad_norm": 1.4749810695648193, + "learning_rate": 1.7074954194729044e-09, + "loss": 0.4858, + "step": 44785 + }, + { + "epoch": 0.99421760024861, + "grad_norm": 0.9121118187904358, + "learning_rate": 1.6436712445366821e-09, + "loss": 0.3698, + "step": 44790 + }, + { + "epoch": 0.9943285868081375, + "grad_norm": 1.0880600214004517, + "learning_rate": 1.5810626362300351e-09, + "loss": 0.3834, + "step": 44795 + }, + { + "epoch": 0.9944395733676652, + "grad_norm": 1.0860681533813477, + "learning_rate": 1.519669602165763e-09, + "loss": 0.3456, + "step": 44800 + }, + { + "epoch": 0.9945505599271928, + "grad_norm": 1.1477147340774536, + "learning_rate": 1.4594921498078951e-09, + "loss": 0.3126, + "step": 44805 + }, + { + "epoch": 0.9946615464867204, + "grad_norm": 1.5204180479049683, + "learning_rate": 1.4005302864716908e-09, + "loss": 0.2647, + "step": 44810 + }, + { + "epoch": 0.9947725330462481, + "grad_norm": 1.7552012205123901, + "learning_rate": 1.3427840193280805e-09, + "loss": 0.4337, + "step": 44815 + }, + { + "epoch": 0.9948835196057757, + "grad_norm": 0.8465442061424255, + "learning_rate": 1.2862533553947843e-09, + "loss": 0.4007, + "step": 44820 + }, + { + "epoch": 0.9949945061653034, + "grad_norm": 0.7114987373352051, + "learning_rate": 1.2309383015451926e-09, + "loss": 0.3397, + "step": 44825 + }, + { + "epoch": 0.9951054927248311, + "grad_norm": 0.8507906794548035, + "learning_rate": 1.1768388645061468e-09, + "loss": 0.3564, + "step": 44830 + }, + { + "epoch": 0.9952164792843586, + "grad_norm": 1.5379858016967773, + "learning_rate": 1.1239550508523877e-09, + "loss": 0.5228, + "step": 44835 + }, + { + "epoch": 0.9953274658438863, + "grad_norm": 0.9246373176574707, + "learning_rate": 1.0722868670154374e-09, + "loss": 0.3035, + "step": 44840 + }, + { + "epoch": 0.995438452403414, + "grad_norm": 1.0635490417480469, + "learning_rate": 1.0218343192758273e-09, + "loss": 0.3683, + "step": 44845 + }, + { + "epoch": 0.9955494389629416, + "grad_norm": 1.466962218284607, + "learning_rate": 9.725974137675399e-10, + "loss": 0.3935, + "step": 44850 + }, + { + "epoch": 0.9956604255224693, + "grad_norm": 1.0246407985687256, + "learning_rate": 9.245761564768973e-10, + "loss": 0.3812, + "step": 44855 + }, + { + "epoch": 0.9957714120819968, + "grad_norm": 1.632717490196228, + "learning_rate": 8.777705532414526e-10, + "loss": 0.3514, + "step": 44860 + }, + { + "epoch": 0.9958823986415245, + "grad_norm": 0.7847806215286255, + "learning_rate": 8.321806097522089e-10, + "loss": 0.4794, + "step": 44865 + }, + { + "epoch": 0.9959933852010522, + "grad_norm": 0.9914886951446533, + "learning_rate": 7.878063315525097e-10, + "loss": 0.4997, + "step": 44870 + }, + { + "epoch": 0.9961043717605798, + "grad_norm": 0.9847691655158997, + "learning_rate": 7.446477240358185e-10, + "loss": 0.2787, + "step": 44875 + }, + { + "epoch": 0.9962153583201074, + "grad_norm": 1.0501551628112793, + "learning_rate": 7.027047924512698e-10, + "loss": 0.2494, + "step": 44880 + }, + { + "epoch": 0.9963263448796351, + "grad_norm": 0.8864679336547852, + "learning_rate": 6.619775418958974e-10, + "loss": 0.3905, + "step": 44885 + }, + { + "epoch": 0.9964373314391627, + "grad_norm": 0.9408095479011536, + "learning_rate": 6.224659773212959e-10, + "loss": 0.4379, + "step": 44890 + }, + { + "epoch": 0.9965483179986904, + "grad_norm": 2.961080551147461, + "learning_rate": 5.841701035336212e-10, + "loss": 0.4349, + "step": 44895 + }, + { + "epoch": 0.996659304558218, + "grad_norm": 1.2695547342300415, + "learning_rate": 5.470899251858175e-10, + "loss": 0.364, + "step": 44900 + }, + { + "epoch": 0.9967702911177456, + "grad_norm": 0.9183524250984192, + "learning_rate": 5.11225446787611e-10, + "loss": 0.3824, + "step": 44905 + }, + { + "epoch": 0.9968812776772733, + "grad_norm": 1.1266758441925049, + "learning_rate": 4.765766726999577e-10, + "loss": 0.2879, + "step": 44910 + }, + { + "epoch": 0.9969922642368009, + "grad_norm": 0.5046502947807312, + "learning_rate": 4.4314360713282324e-10, + "loss": 0.3201, + "step": 44915 + }, + { + "epoch": 0.9971032507963286, + "grad_norm": 1.2100203037261963, + "learning_rate": 4.109262541529546e-10, + "loss": 0.4993, + "step": 44920 + }, + { + "epoch": 0.9972142373558562, + "grad_norm": 0.8196183443069458, + "learning_rate": 3.7992461767721865e-10, + "loss": 0.535, + "step": 44925 + }, + { + "epoch": 0.9973252239153838, + "grad_norm": 1.5104304552078247, + "learning_rate": 3.501387014737123e-10, + "loss": 0.3876, + "step": 44930 + }, + { + "epoch": 0.9974362104749115, + "grad_norm": 2.42695689201355, + "learning_rate": 3.2156850916398307e-10, + "loss": 0.4088, + "step": 44935 + }, + { + "epoch": 0.9975471970344392, + "grad_norm": 1.9051953554153442, + "learning_rate": 2.942140442219188e-10, + "loss": 0.3739, + "step": 44940 + }, + { + "epoch": 0.9976581835939667, + "grad_norm": 1.0544376373291016, + "learning_rate": 2.680753099726374e-10, + "loss": 0.4976, + "step": 44945 + }, + { + "epoch": 0.9977691701534944, + "grad_norm": 1.1928341388702393, + "learning_rate": 2.4315230959359726e-10, + "loss": 0.3354, + "step": 44950 + }, + { + "epoch": 0.9978801567130221, + "grad_norm": 1.284271001815796, + "learning_rate": 2.194450461168174e-10, + "loss": 0.4687, + "step": 44955 + }, + { + "epoch": 0.9979911432725497, + "grad_norm": 0.9410303235054016, + "learning_rate": 1.9695352242221633e-10, + "loss": 0.2556, + "step": 44960 + }, + { + "epoch": 0.9981021298320774, + "grad_norm": 1.4326661825180054, + "learning_rate": 1.7567774124649384e-10, + "loss": 0.3306, + "step": 44965 + }, + { + "epoch": 0.9982131163916049, + "grad_norm": 0.7842845916748047, + "learning_rate": 1.5561770517424913e-10, + "loss": 0.4147, + "step": 44970 + }, + { + "epoch": 0.9983241029511326, + "grad_norm": 1.3309615850448608, + "learning_rate": 1.3677341664464216e-10, + "loss": 0.4497, + "step": 44975 + }, + { + "epoch": 0.9984350895106603, + "grad_norm": 0.7155956625938416, + "learning_rate": 1.191448779502835e-10, + "loss": 0.3195, + "step": 44980 + }, + { + "epoch": 0.9985460760701879, + "grad_norm": 1.2971090078353882, + "learning_rate": 1.0273209123279338e-10, + "loss": 0.3304, + "step": 44985 + }, + { + "epoch": 0.9986570626297155, + "grad_norm": 0.8447107076644897, + "learning_rate": 8.753505848724253e-11, + "loss": 0.3095, + "step": 44990 + }, + { + "epoch": 0.9987680491892432, + "grad_norm": 0.9911041855812073, + "learning_rate": 7.355378156326254e-11, + "loss": 0.4211, + "step": 44995 + }, + { + "epoch": 0.9988790357487708, + "grad_norm": 0.7930455207824707, + "learning_rate": 6.078826215949462e-11, + "loss": 0.3887, + "step": 45000 + }, + { + "epoch": 0.9989900223082985, + "grad_norm": 1.4207168817520142, + "learning_rate": 4.923850182692036e-11, + "loss": 0.4148, + "step": 45005 + }, + { + "epoch": 0.9991010088678262, + "grad_norm": 0.814357340335846, + "learning_rate": 3.8904501971082086e-11, + "loss": 0.3109, + "step": 45010 + }, + { + "epoch": 0.9992119954273537, + "grad_norm": 1.4594573974609375, + "learning_rate": 2.978626384875227e-11, + "loss": 0.3686, + "step": 45015 + }, + { + "epoch": 0.9993229819868814, + "grad_norm": 2.3122453689575195, + "learning_rate": 2.188378856682327e-11, + "loss": 0.333, + "step": 45020 + }, + { + "epoch": 0.999433968546409, + "grad_norm": 1.6867766380310059, + "learning_rate": 1.5197077087858448e-11, + "loss": 0.4699, + "step": 45025 + }, + { + "epoch": 0.9995449551059367, + "grad_norm": 1.980859637260437, + "learning_rate": 9.726130222320607e-12, + "loss": 0.4905, + "step": 45030 + }, + { + "epoch": 0.9996559416654643, + "grad_norm": 1.486547827720642, + "learning_rate": 5.4709486385640106e-12, + "loss": 0.4427, + "step": 45035 + }, + { + "epoch": 0.9997669282249919, + "grad_norm": 1.1338590383529663, + "learning_rate": 2.4315328517321435e-12, + "loss": 0.4851, + "step": 45040 + }, + { + "epoch": 0.9998779147845196, + "grad_norm": 1.1397794485092163, + "learning_rate": 6.078832315292716e-13, + "loss": 0.4461, + "step": 45045 + }, + { + "epoch": 0.9999889013440473, + "grad_norm": 0.7539506554603577, + "learning_rate": 0.0, + "loss": 0.4099, + "step": 45050 + }, + { + "epoch": 0.9999889013440473, + "step": 45050, + "total_flos": 2.0927971544599167e+19, + "train_loss": 0.45276333124470897, + "train_runtime": 95784.0198, + "train_samples_per_second": 11.288, + "train_steps_per_second": 0.47 + } + ], + "logging_steps": 5, + "max_steps": 45050, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 15000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0927971544599167e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}