{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1, "global_step": 55170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045314482508609753, "grad_norm": 6.016256332397461, "learning_rate": 4.5314482508609754e-08, "loss": 2.0466, "step": 5 }, { "epoch": 0.0009062896501721951, "grad_norm": 5.982686996459961, "learning_rate": 9.062896501721951e-08, "loss": 2.0423, "step": 10 }, { "epoch": 0.0013594344752582926, "grad_norm": 5.980902671813965, "learning_rate": 1.3594344752582926e-07, "loss": 2.0842, "step": 15 }, { "epoch": 0.0018125793003443901, "grad_norm": 6.1512451171875, "learning_rate": 1.8125793003443901e-07, "loss": 2.0961, "step": 20 }, { "epoch": 0.0022657241254304874, "grad_norm": 6.2417402267456055, "learning_rate": 2.2657241254304874e-07, "loss": 2.0391, "step": 25 }, { "epoch": 0.0027188689505165853, "grad_norm": 5.98213005065918, "learning_rate": 2.718868950516585e-07, "loss": 2.0236, "step": 30 }, { "epoch": 0.0031720137756026828, "grad_norm": 5.963215351104736, "learning_rate": 3.172013775602683e-07, "loss": 2.0253, "step": 35 }, { "epoch": 0.0036251586006887803, "grad_norm": 5.272254943847656, "learning_rate": 3.6251586006887803e-07, "loss": 2.0246, "step": 40 }, { "epoch": 0.004078303425774877, "grad_norm": 4.858317852020264, "learning_rate": 4.0783034257748773e-07, "loss": 2.0098, "step": 45 }, { "epoch": 0.004531448250860975, "grad_norm": 4.720812797546387, "learning_rate": 4.531448250860975e-07, "loss": 1.9805, "step": 50 }, { "epoch": 0.004984593075947072, "grad_norm": 4.4687089920043945, "learning_rate": 4.984593075947073e-07, "loss": 1.9451, "step": 55 }, { "epoch": 0.005437737901033171, "grad_norm": 4.439191818237305, "learning_rate": 5.43773790103317e-07, "loss": 1.8901, "step": 60 }, { "epoch": 0.005890882726119268, "grad_norm": 3.890505075454712, "learning_rate": 5.890882726119268e-07, "loss": 1.8052, "step": 65 }, { "epoch": 0.0063440275512053656, "grad_norm": 4.592777252197266, "learning_rate": 6.344027551205366e-07, "loss": 1.7827, "step": 70 }, { "epoch": 0.006797172376291463, "grad_norm": 3.524919033050537, "learning_rate": 6.797172376291463e-07, "loss": 1.7426, "step": 75 }, { "epoch": 0.0072503172013775605, "grad_norm": 3.805680751800537, "learning_rate": 7.250317201377561e-07, "loss": 1.644, "step": 80 }, { "epoch": 0.007703462026463658, "grad_norm": 3.735938549041748, "learning_rate": 7.703462026463658e-07, "loss": 1.4852, "step": 85 }, { "epoch": 0.008156606851549755, "grad_norm": 3.9681382179260254, "learning_rate": 8.156606851549755e-07, "loss": 1.4147, "step": 90 }, { "epoch": 0.008609751676635852, "grad_norm": 4.521117687225342, "learning_rate": 8.609751676635852e-07, "loss": 1.3303, "step": 95 }, { "epoch": 0.00906289650172195, "grad_norm": 5.397397041320801, "learning_rate": 9.06289650172195e-07, "loss": 1.2474, "step": 100 }, { "epoch": 0.009516041326808047, "grad_norm": 3.708193063735962, "learning_rate": 9.516041326808047e-07, "loss": 1.1166, "step": 105 }, { "epoch": 0.009969186151894145, "grad_norm": 2.971996784210205, "learning_rate": 9.969186151894146e-07, "loss": 1.0466, "step": 110 }, { "epoch": 0.010422330976980244, "grad_norm": 2.70930814743042, "learning_rate": 1.0422330976980244e-06, "loss": 1.0226, "step": 115 }, { "epoch": 0.010875475802066341, "grad_norm": 1.9328722953796387, "learning_rate": 1.087547580206634e-06, "loss": 0.9078, "step": 120 }, { "epoch": 0.011328620627152439, "grad_norm": 1.5543678998947144, "learning_rate": 1.132862062715244e-06, "loss": 0.8414, "step": 125 }, { "epoch": 0.011781765452238536, "grad_norm": 1.383331060409546, "learning_rate": 1.1781765452238536e-06, "loss": 0.8423, "step": 130 }, { "epoch": 0.012234910277324634, "grad_norm": 1.320696473121643, "learning_rate": 1.2234910277324635e-06, "loss": 0.8314, "step": 135 }, { "epoch": 0.012688055102410731, "grad_norm": 1.1713027954101562, "learning_rate": 1.268805510241073e-06, "loss": 0.7441, "step": 140 }, { "epoch": 0.013141199927496829, "grad_norm": 1.2312129735946655, "learning_rate": 1.314119992749683e-06, "loss": 0.8356, "step": 145 }, { "epoch": 0.013594344752582926, "grad_norm": 1.2196614742279053, "learning_rate": 1.3594344752582926e-06, "loss": 0.7761, "step": 150 }, { "epoch": 0.014047489577669024, "grad_norm": 1.1641188859939575, "learning_rate": 1.4047489577669025e-06, "loss": 0.8079, "step": 155 }, { "epoch": 0.014500634402755121, "grad_norm": 1.2945092916488647, "learning_rate": 1.4500634402755121e-06, "loss": 0.7565, "step": 160 }, { "epoch": 0.014953779227841218, "grad_norm": 1.0843696594238281, "learning_rate": 1.495377922784122e-06, "loss": 0.7777, "step": 165 }, { "epoch": 0.015406924052927316, "grad_norm": 1.1917531490325928, "learning_rate": 1.5406924052927316e-06, "loss": 0.8231, "step": 170 }, { "epoch": 0.015860068878013413, "grad_norm": 1.2114207744598389, "learning_rate": 1.5860068878013415e-06, "loss": 0.7994, "step": 175 }, { "epoch": 0.01631321370309951, "grad_norm": 1.1651109457015991, "learning_rate": 1.631321370309951e-06, "loss": 0.8378, "step": 180 }, { "epoch": 0.01676635852818561, "grad_norm": 1.0738365650177002, "learning_rate": 1.676635852818561e-06, "loss": 0.7823, "step": 185 }, { "epoch": 0.017219503353271704, "grad_norm": 1.0971013307571411, "learning_rate": 1.7219503353271704e-06, "loss": 0.8539, "step": 190 }, { "epoch": 0.017672648178357803, "grad_norm": 1.0240015983581543, "learning_rate": 1.7672648178357805e-06, "loss": 0.8346, "step": 195 }, { "epoch": 0.0181257930034439, "grad_norm": 1.121253490447998, "learning_rate": 1.81257930034439e-06, "loss": 0.781, "step": 200 }, { "epoch": 0.01857893782853, "grad_norm": 1.0049034357070923, "learning_rate": 1.857893782853e-06, "loss": 0.8102, "step": 205 }, { "epoch": 0.019032082653616094, "grad_norm": 1.016873836517334, "learning_rate": 1.9032082653616094e-06, "loss": 0.8293, "step": 210 }, { "epoch": 0.019485227478702193, "grad_norm": 1.0028071403503418, "learning_rate": 1.9485227478702193e-06, "loss": 0.7287, "step": 215 }, { "epoch": 0.01993837230378829, "grad_norm": 0.9992745518684387, "learning_rate": 1.993837230378829e-06, "loss": 0.8068, "step": 220 }, { "epoch": 0.020391517128874388, "grad_norm": 1.03682279586792, "learning_rate": 2.039151712887439e-06, "loss": 0.7913, "step": 225 }, { "epoch": 0.020844661953960487, "grad_norm": 1.020959734916687, "learning_rate": 2.084466195396049e-06, "loss": 0.7745, "step": 230 }, { "epoch": 0.021297806779046583, "grad_norm": 1.0782688856124878, "learning_rate": 2.1297806779046583e-06, "loss": 0.8429, "step": 235 }, { "epoch": 0.021750951604132682, "grad_norm": 0.9844406247138977, "learning_rate": 2.175095160413268e-06, "loss": 0.819, "step": 240 }, { "epoch": 0.022204096429218778, "grad_norm": 1.0113593339920044, "learning_rate": 2.220409642921878e-06, "loss": 0.7552, "step": 245 }, { "epoch": 0.022657241254304877, "grad_norm": 0.9811721444129944, "learning_rate": 2.265724125430488e-06, "loss": 0.819, "step": 250 }, { "epoch": 0.023110386079390973, "grad_norm": 1.0651823282241821, "learning_rate": 2.3110386079390973e-06, "loss": 0.7643, "step": 255 }, { "epoch": 0.023563530904477072, "grad_norm": 1.0943059921264648, "learning_rate": 2.356353090447707e-06, "loss": 0.7751, "step": 260 }, { "epoch": 0.024016675729563168, "grad_norm": 1.018865704536438, "learning_rate": 2.401667572956317e-06, "loss": 0.788, "step": 265 }, { "epoch": 0.024469820554649267, "grad_norm": 0.9829012751579285, "learning_rate": 2.446982055464927e-06, "loss": 0.7916, "step": 270 }, { "epoch": 0.024922965379735363, "grad_norm": 1.088053584098816, "learning_rate": 2.4922965379735363e-06, "loss": 0.7481, "step": 275 }, { "epoch": 0.025376110204821462, "grad_norm": 1.0996031761169434, "learning_rate": 2.537611020482146e-06, "loss": 0.8487, "step": 280 }, { "epoch": 0.025829255029907558, "grad_norm": 1.0826600790023804, "learning_rate": 2.582925502990756e-06, "loss": 0.7727, "step": 285 }, { "epoch": 0.026282399854993657, "grad_norm": 1.0131170749664307, "learning_rate": 2.628239985499366e-06, "loss": 0.7742, "step": 290 }, { "epoch": 0.026735544680079753, "grad_norm": 1.0506353378295898, "learning_rate": 2.6735544680079754e-06, "loss": 0.7308, "step": 295 }, { "epoch": 0.027188689505165852, "grad_norm": 1.0438340902328491, "learning_rate": 2.7188689505165852e-06, "loss": 0.7443, "step": 300 }, { "epoch": 0.027641834330251948, "grad_norm": 0.9550973176956177, "learning_rate": 2.764183433025195e-06, "loss": 0.7872, "step": 305 }, { "epoch": 0.028094979155338047, "grad_norm": 1.0069187879562378, "learning_rate": 2.809497915533805e-06, "loss": 0.7229, "step": 310 }, { "epoch": 0.028548123980424143, "grad_norm": 1.0021281242370605, "learning_rate": 2.8548123980424144e-06, "loss": 0.7909, "step": 315 }, { "epoch": 0.029001268805510242, "grad_norm": 1.1729037761688232, "learning_rate": 2.9001268805510242e-06, "loss": 0.7961, "step": 320 }, { "epoch": 0.029454413630596338, "grad_norm": 0.9805673360824585, "learning_rate": 2.945441363059634e-06, "loss": 0.7576, "step": 325 }, { "epoch": 0.029907558455682437, "grad_norm": 0.9241052865982056, "learning_rate": 2.990755845568244e-06, "loss": 0.7235, "step": 330 }, { "epoch": 0.030360703280768533, "grad_norm": 0.9695248007774353, "learning_rate": 3.0360703280768534e-06, "loss": 0.8201, "step": 335 }, { "epoch": 0.030813848105854632, "grad_norm": 0.9739518761634827, "learning_rate": 3.0813848105854632e-06, "loss": 0.7737, "step": 340 }, { "epoch": 0.03126699293094073, "grad_norm": 0.946783721446991, "learning_rate": 3.126699293094073e-06, "loss": 0.7629, "step": 345 }, { "epoch": 0.03172013775602683, "grad_norm": 0.955342710018158, "learning_rate": 3.172013775602683e-06, "loss": 0.7674, "step": 350 }, { "epoch": 0.032173282581112926, "grad_norm": 0.9450662136077881, "learning_rate": 3.217328258111293e-06, "loss": 0.7363, "step": 355 }, { "epoch": 0.03262642740619902, "grad_norm": 0.8807371854782104, "learning_rate": 3.262642740619902e-06, "loss": 0.7366, "step": 360 }, { "epoch": 0.03307957223128512, "grad_norm": 0.9874439239501953, "learning_rate": 3.307957223128512e-06, "loss": 0.7575, "step": 365 }, { "epoch": 0.03353271705637122, "grad_norm": 1.1180672645568848, "learning_rate": 3.353271705637122e-06, "loss": 0.7507, "step": 370 }, { "epoch": 0.033985861881457316, "grad_norm": 0.9600545167922974, "learning_rate": 3.398586188145732e-06, "loss": 0.7464, "step": 375 }, { "epoch": 0.03443900670654341, "grad_norm": 0.8992711901664734, "learning_rate": 3.443900670654341e-06, "loss": 0.7917, "step": 380 }, { "epoch": 0.03489215153162951, "grad_norm": 0.9252201914787292, "learning_rate": 3.4892151531629507e-06, "loss": 0.6995, "step": 385 }, { "epoch": 0.03534529635671561, "grad_norm": 0.8849993348121643, "learning_rate": 3.534529635671561e-06, "loss": 0.7234, "step": 390 }, { "epoch": 0.035798441181801706, "grad_norm": 0.9057847261428833, "learning_rate": 3.579844118180171e-06, "loss": 0.792, "step": 395 }, { "epoch": 0.0362515860068878, "grad_norm": 1.0374603271484375, "learning_rate": 3.62515860068878e-06, "loss": 0.7823, "step": 400 }, { "epoch": 0.0367047308319739, "grad_norm": 0.9031121134757996, "learning_rate": 3.6704730831973897e-06, "loss": 0.701, "step": 405 }, { "epoch": 0.03715787565706, "grad_norm": 0.8856092691421509, "learning_rate": 3.715787565706e-06, "loss": 0.7649, "step": 410 }, { "epoch": 0.037611020482146096, "grad_norm": 0.8307746648788452, "learning_rate": 3.76110204821461e-06, "loss": 0.7613, "step": 415 }, { "epoch": 0.03806416530723219, "grad_norm": 0.8942095637321472, "learning_rate": 3.806416530723219e-06, "loss": 0.7164, "step": 420 }, { "epoch": 0.03851731013231829, "grad_norm": 0.8829682469367981, "learning_rate": 3.851731013231829e-06, "loss": 0.7673, "step": 425 }, { "epoch": 0.03897045495740439, "grad_norm": 0.9050325751304626, "learning_rate": 3.897045495740439e-06, "loss": 0.6989, "step": 430 }, { "epoch": 0.039423599782490486, "grad_norm": 0.9627199172973633, "learning_rate": 3.942359978249049e-06, "loss": 0.7012, "step": 435 }, { "epoch": 0.03987674460757658, "grad_norm": 1.0455825328826904, "learning_rate": 3.987674460757658e-06, "loss": 0.739, "step": 440 }, { "epoch": 0.04032988943266268, "grad_norm": 0.8705432415008545, "learning_rate": 4.032988943266268e-06, "loss": 0.7435, "step": 445 }, { "epoch": 0.040783034257748776, "grad_norm": 0.9525200724601746, "learning_rate": 4.078303425774878e-06, "loss": 0.7996, "step": 450 }, { "epoch": 0.041236179082834876, "grad_norm": 0.8773642778396606, "learning_rate": 4.1236179082834875e-06, "loss": 0.745, "step": 455 }, { "epoch": 0.041689323907920975, "grad_norm": 0.9019294381141663, "learning_rate": 4.168932390792098e-06, "loss": 0.7441, "step": 460 }, { "epoch": 0.04214246873300707, "grad_norm": 0.8837187886238098, "learning_rate": 4.214246873300707e-06, "loss": 0.7147, "step": 465 }, { "epoch": 0.042595613558093166, "grad_norm": 0.8710213899612427, "learning_rate": 4.259561355809317e-06, "loss": 0.7575, "step": 470 }, { "epoch": 0.043048758383179266, "grad_norm": 0.931779682636261, "learning_rate": 4.304875838317927e-06, "loss": 0.7653, "step": 475 }, { "epoch": 0.043501903208265365, "grad_norm": 1.0216498374938965, "learning_rate": 4.350190320826536e-06, "loss": 0.7395, "step": 480 }, { "epoch": 0.04395504803335146, "grad_norm": 0.9698764681816101, "learning_rate": 4.395504803335146e-06, "loss": 0.775, "step": 485 }, { "epoch": 0.044408192858437556, "grad_norm": 0.8843251466751099, "learning_rate": 4.440819285843756e-06, "loss": 0.7328, "step": 490 }, { "epoch": 0.044861337683523655, "grad_norm": 0.8967944979667664, "learning_rate": 4.4861337683523655e-06, "loss": 0.7718, "step": 495 }, { "epoch": 0.045314482508609755, "grad_norm": 0.8908171057701111, "learning_rate": 4.531448250860976e-06, "loss": 0.7395, "step": 500 }, { "epoch": 0.04576762733369585, "grad_norm": 0.8921871781349182, "learning_rate": 4.576762733369585e-06, "loss": 0.8052, "step": 505 }, { "epoch": 0.046220772158781946, "grad_norm": 0.8373174667358398, "learning_rate": 4.622077215878195e-06, "loss": 0.7748, "step": 510 }, { "epoch": 0.046673916983868045, "grad_norm": 1.3748780488967896, "learning_rate": 4.667391698386805e-06, "loss": 0.787, "step": 515 }, { "epoch": 0.047127061808954145, "grad_norm": 0.9219179153442383, "learning_rate": 4.712706180895414e-06, "loss": 0.7978, "step": 520 }, { "epoch": 0.04758020663404024, "grad_norm": 0.8481162190437317, "learning_rate": 4.758020663404024e-06, "loss": 0.7548, "step": 525 }, { "epoch": 0.048033351459126336, "grad_norm": 0.9015586972236633, "learning_rate": 4.803335145912634e-06, "loss": 0.733, "step": 530 }, { "epoch": 0.048486496284212435, "grad_norm": 0.8575695157051086, "learning_rate": 4.8486496284212435e-06, "loss": 0.7092, "step": 535 }, { "epoch": 0.048939641109298535, "grad_norm": 0.8544911742210388, "learning_rate": 4.893964110929854e-06, "loss": 0.6949, "step": 540 }, { "epoch": 0.04939278593438463, "grad_norm": 1.1108269691467285, "learning_rate": 4.939278593438463e-06, "loss": 0.7047, "step": 545 }, { "epoch": 0.049845930759470726, "grad_norm": 1.0313535928726196, "learning_rate": 4.984593075947073e-06, "loss": 0.7542, "step": 550 }, { "epoch": 0.050299075584556825, "grad_norm": 0.9121854305267334, "learning_rate": 5.029907558455683e-06, "loss": 0.7044, "step": 555 }, { "epoch": 0.050752220409642924, "grad_norm": 0.9126025438308716, "learning_rate": 5.075222040964292e-06, "loss": 0.6822, "step": 560 }, { "epoch": 0.05120536523472902, "grad_norm": 0.8374198079109192, "learning_rate": 5.120536523472902e-06, "loss": 0.7415, "step": 565 }, { "epoch": 0.051658510059815116, "grad_norm": 1.0105656385421753, "learning_rate": 5.165851005981512e-06, "loss": 0.7443, "step": 570 }, { "epoch": 0.052111654884901215, "grad_norm": 0.9275915026664734, "learning_rate": 5.2111654884901216e-06, "loss": 0.7264, "step": 575 }, { "epoch": 0.052564799709987314, "grad_norm": 0.9037122130393982, "learning_rate": 5.256479970998732e-06, "loss": 0.7682, "step": 580 }, { "epoch": 0.05301794453507341, "grad_norm": 0.9672852158546448, "learning_rate": 5.301794453507341e-06, "loss": 0.8038, "step": 585 }, { "epoch": 0.053471089360159506, "grad_norm": 0.8340831995010376, "learning_rate": 5.347108936015951e-06, "loss": 0.6893, "step": 590 }, { "epoch": 0.053924234185245605, "grad_norm": 1.0558521747589111, "learning_rate": 5.392423418524561e-06, "loss": 0.6954, "step": 595 }, { "epoch": 0.054377379010331704, "grad_norm": 0.8389281630516052, "learning_rate": 5.4377379010331704e-06, "loss": 0.7224, "step": 600 }, { "epoch": 0.054830523835417797, "grad_norm": 0.8278689384460449, "learning_rate": 5.48305238354178e-06, "loss": 0.7304, "step": 605 }, { "epoch": 0.055283668660503896, "grad_norm": 0.841869592666626, "learning_rate": 5.52836686605039e-06, "loss": 0.6777, "step": 610 }, { "epoch": 0.055736813485589995, "grad_norm": 0.8396051526069641, "learning_rate": 5.573681348559e-06, "loss": 0.7532, "step": 615 }, { "epoch": 0.056189958310676094, "grad_norm": 0.864200234413147, "learning_rate": 5.61899583106761e-06, "loss": 0.7263, "step": 620 }, { "epoch": 0.056643103135762186, "grad_norm": 0.8598406314849854, "learning_rate": 5.664310313576219e-06, "loss": 0.693, "step": 625 }, { "epoch": 0.057096247960848286, "grad_norm": 0.8663750290870667, "learning_rate": 5.709624796084829e-06, "loss": 0.7152, "step": 630 }, { "epoch": 0.057549392785934385, "grad_norm": 0.826008677482605, "learning_rate": 5.754939278593439e-06, "loss": 0.6902, "step": 635 }, { "epoch": 0.058002537611020484, "grad_norm": 0.8137087225914001, "learning_rate": 5.8002537611020485e-06, "loss": 0.7208, "step": 640 }, { "epoch": 0.058455682436106576, "grad_norm": 0.8402184844017029, "learning_rate": 5.845568243610658e-06, "loss": 0.6905, "step": 645 }, { "epoch": 0.058908827261192676, "grad_norm": 0.8646835088729858, "learning_rate": 5.890882726119268e-06, "loss": 0.7185, "step": 650 }, { "epoch": 0.059361972086278775, "grad_norm": 0.8679500222206116, "learning_rate": 5.936197208627878e-06, "loss": 0.7172, "step": 655 }, { "epoch": 0.059815116911364874, "grad_norm": 0.831421971321106, "learning_rate": 5.981511691136488e-06, "loss": 0.7133, "step": 660 }, { "epoch": 0.06026826173645097, "grad_norm": 0.8598697781562805, "learning_rate": 6.026826173645097e-06, "loss": 0.7239, "step": 665 }, { "epoch": 0.060721406561537065, "grad_norm": 0.9271496534347534, "learning_rate": 6.072140656153707e-06, "loss": 0.758, "step": 670 }, { "epoch": 0.061174551386623165, "grad_norm": 0.8864847421646118, "learning_rate": 6.117455138662317e-06, "loss": 0.7561, "step": 675 }, { "epoch": 0.061627696211709264, "grad_norm": 0.8525952696800232, "learning_rate": 6.1627696211709265e-06, "loss": 0.7387, "step": 680 }, { "epoch": 0.06208084103679536, "grad_norm": 0.843408465385437, "learning_rate": 6.208084103679537e-06, "loss": 0.6876, "step": 685 }, { "epoch": 0.06253398586188146, "grad_norm": 0.778634250164032, "learning_rate": 6.253398586188146e-06, "loss": 0.7652, "step": 690 }, { "epoch": 0.06298713068696755, "grad_norm": 0.8166111707687378, "learning_rate": 6.298713068696756e-06, "loss": 0.727, "step": 695 }, { "epoch": 0.06344027551205365, "grad_norm": 0.893463134765625, "learning_rate": 6.344027551205366e-06, "loss": 0.7544, "step": 700 }, { "epoch": 0.06389342033713975, "grad_norm": 0.8252425789833069, "learning_rate": 6.389342033713975e-06, "loss": 0.7047, "step": 705 }, { "epoch": 0.06434656516222585, "grad_norm": 0.8584256768226624, "learning_rate": 6.434656516222586e-06, "loss": 0.749, "step": 710 }, { "epoch": 0.06479970998731195, "grad_norm": 0.8807448148727417, "learning_rate": 6.479970998731195e-06, "loss": 0.7092, "step": 715 }, { "epoch": 0.06525285481239804, "grad_norm": 0.9361736178398132, "learning_rate": 6.525285481239804e-06, "loss": 0.6921, "step": 720 }, { "epoch": 0.06570599963748414, "grad_norm": 0.8284675478935242, "learning_rate": 6.570599963748414e-06, "loss": 0.7941, "step": 725 }, { "epoch": 0.06615914446257024, "grad_norm": 0.8188536763191223, "learning_rate": 6.615914446257024e-06, "loss": 0.7282, "step": 730 }, { "epoch": 0.06661228928765633, "grad_norm": 0.808755099773407, "learning_rate": 6.661228928765634e-06, "loss": 0.7541, "step": 735 }, { "epoch": 0.06706543411274243, "grad_norm": 0.8192352652549744, "learning_rate": 6.706543411274244e-06, "loss": 0.7114, "step": 740 }, { "epoch": 0.06751857893782853, "grad_norm": 0.8349238038063049, "learning_rate": 6.751857893782853e-06, "loss": 0.7089, "step": 745 }, { "epoch": 0.06797172376291463, "grad_norm": 0.8056740164756775, "learning_rate": 6.797172376291464e-06, "loss": 0.7149, "step": 750 }, { "epoch": 0.06842486858800073, "grad_norm": 0.8399401307106018, "learning_rate": 6.842486858800073e-06, "loss": 0.7105, "step": 755 }, { "epoch": 0.06887801341308682, "grad_norm": 0.8043131828308105, "learning_rate": 6.887801341308682e-06, "loss": 0.708, "step": 760 }, { "epoch": 0.06933115823817292, "grad_norm": 0.8481742739677429, "learning_rate": 6.933115823817292e-06, "loss": 0.6962, "step": 765 }, { "epoch": 0.06978430306325901, "grad_norm": 0.9020221829414368, "learning_rate": 6.9784303063259014e-06, "loss": 0.6827, "step": 770 }, { "epoch": 0.07023744788834511, "grad_norm": 0.8233721852302551, "learning_rate": 7.023744788834512e-06, "loss": 0.7756, "step": 775 }, { "epoch": 0.07069059271343121, "grad_norm": 0.8479670882225037, "learning_rate": 7.069059271343122e-06, "loss": 0.6993, "step": 780 }, { "epoch": 0.07114373753851731, "grad_norm": 0.8186155557632446, "learning_rate": 7.1143737538517314e-06, "loss": 0.7379, "step": 785 }, { "epoch": 0.07159688236360341, "grad_norm": 0.844886839389801, "learning_rate": 7.159688236360342e-06, "loss": 0.6996, "step": 790 }, { "epoch": 0.07205002718868951, "grad_norm": 0.9060750603675842, "learning_rate": 7.205002718868951e-06, "loss": 0.7455, "step": 795 }, { "epoch": 0.0725031720137756, "grad_norm": 0.8928006887435913, "learning_rate": 7.25031720137756e-06, "loss": 0.7677, "step": 800 }, { "epoch": 0.0729563168388617, "grad_norm": 0.8424704074859619, "learning_rate": 7.29563168388617e-06, "loss": 0.7082, "step": 805 }, { "epoch": 0.0734094616639478, "grad_norm": 0.8681677579879761, "learning_rate": 7.3409461663947795e-06, "loss": 0.769, "step": 810 }, { "epoch": 0.0738626064890339, "grad_norm": 0.8004975914955139, "learning_rate": 7.38626064890339e-06, "loss": 0.6865, "step": 815 }, { "epoch": 0.07431575131412, "grad_norm": 0.8493906259536743, "learning_rate": 7.431575131412e-06, "loss": 0.7011, "step": 820 }, { "epoch": 0.07476889613920609, "grad_norm": 0.8899917602539062, "learning_rate": 7.4768896139206095e-06, "loss": 0.7289, "step": 825 }, { "epoch": 0.07522204096429219, "grad_norm": 0.7924049496650696, "learning_rate": 7.52220409642922e-06, "loss": 0.723, "step": 830 }, { "epoch": 0.07567518578937829, "grad_norm": 0.8311818838119507, "learning_rate": 7.567518578937829e-06, "loss": 0.7007, "step": 835 }, { "epoch": 0.07612833061446438, "grad_norm": 0.8383309841156006, "learning_rate": 7.612833061446438e-06, "loss": 0.7522, "step": 840 }, { "epoch": 0.07658147543955048, "grad_norm": 0.7895633578300476, "learning_rate": 7.658147543955047e-06, "loss": 0.7078, "step": 845 }, { "epoch": 0.07703462026463657, "grad_norm": 0.8557552695274353, "learning_rate": 7.703462026463658e-06, "loss": 0.6873, "step": 850 }, { "epoch": 0.07748776508972267, "grad_norm": 0.7973926067352295, "learning_rate": 7.748776508972268e-06, "loss": 0.6716, "step": 855 }, { "epoch": 0.07794090991480877, "grad_norm": 0.8357936143875122, "learning_rate": 7.794090991480877e-06, "loss": 0.737, "step": 860 }, { "epoch": 0.07839405473989487, "grad_norm": 0.8027513027191162, "learning_rate": 7.839405473989488e-06, "loss": 0.72, "step": 865 }, { "epoch": 0.07884719956498097, "grad_norm": 0.7950075268745422, "learning_rate": 7.884719956498098e-06, "loss": 0.7171, "step": 870 }, { "epoch": 0.07930034439006707, "grad_norm": 0.8145079612731934, "learning_rate": 7.930034439006707e-06, "loss": 0.758, "step": 875 }, { "epoch": 0.07975348921515316, "grad_norm": 0.8201032280921936, "learning_rate": 7.975348921515317e-06, "loss": 0.7644, "step": 880 }, { "epoch": 0.08020663404023926, "grad_norm": 0.8337165117263794, "learning_rate": 8.020663404023926e-06, "loss": 0.6787, "step": 885 }, { "epoch": 0.08065977886532535, "grad_norm": 0.8285698890686035, "learning_rate": 8.065977886532536e-06, "loss": 0.7729, "step": 890 }, { "epoch": 0.08111292369041145, "grad_norm": 0.8201051950454712, "learning_rate": 8.111292369041145e-06, "loss": 0.6927, "step": 895 }, { "epoch": 0.08156606851549755, "grad_norm": 0.8015955090522766, "learning_rate": 8.156606851549756e-06, "loss": 0.7738, "step": 900 }, { "epoch": 0.08201921334058365, "grad_norm": 0.8296622633934021, "learning_rate": 8.201921334058366e-06, "loss": 0.7084, "step": 905 }, { "epoch": 0.08247235816566975, "grad_norm": 0.8089708685874939, "learning_rate": 8.247235816566975e-06, "loss": 0.7066, "step": 910 }, { "epoch": 0.08292550299075585, "grad_norm": 0.8962370157241821, "learning_rate": 8.292550299075586e-06, "loss": 0.7199, "step": 915 }, { "epoch": 0.08337864781584195, "grad_norm": 0.872968316078186, "learning_rate": 8.337864781584196e-06, "loss": 0.7163, "step": 920 }, { "epoch": 0.08383179264092804, "grad_norm": 0.8795778751373291, "learning_rate": 8.383179264092803e-06, "loss": 0.7094, "step": 925 }, { "epoch": 0.08428493746601413, "grad_norm": 0.8178483843803406, "learning_rate": 8.428493746601414e-06, "loss": 0.7275, "step": 930 }, { "epoch": 0.08473808229110023, "grad_norm": 1.9018560647964478, "learning_rate": 8.473808229110024e-06, "loss": 0.7098, "step": 935 }, { "epoch": 0.08519122711618633, "grad_norm": 0.8590347170829773, "learning_rate": 8.519122711618633e-06, "loss": 0.7242, "step": 940 }, { "epoch": 0.08564437194127243, "grad_norm": 0.8075234293937683, "learning_rate": 8.564437194127244e-06, "loss": 0.6683, "step": 945 }, { "epoch": 0.08609751676635853, "grad_norm": 0.8303983807563782, "learning_rate": 8.609751676635854e-06, "loss": 0.6918, "step": 950 }, { "epoch": 0.08655066159144463, "grad_norm": 0.8645445108413696, "learning_rate": 8.655066159144463e-06, "loss": 0.6714, "step": 955 }, { "epoch": 0.08700380641653073, "grad_norm": 0.7864328026771545, "learning_rate": 8.700380641653073e-06, "loss": 0.74, "step": 960 }, { "epoch": 0.08745695124161681, "grad_norm": 0.8204703330993652, "learning_rate": 8.745695124161682e-06, "loss": 0.7409, "step": 965 }, { "epoch": 0.08791009606670291, "grad_norm": 0.7802404165267944, "learning_rate": 8.791009606670292e-06, "loss": 0.7075, "step": 970 }, { "epoch": 0.08836324089178901, "grad_norm": 0.8112988471984863, "learning_rate": 8.836324089178901e-06, "loss": 0.6942, "step": 975 }, { "epoch": 0.08881638571687511, "grad_norm": 1.0347669124603271, "learning_rate": 8.881638571687512e-06, "loss": 0.75, "step": 980 }, { "epoch": 0.08926953054196121, "grad_norm": 0.7999122738838196, "learning_rate": 8.926953054196122e-06, "loss": 0.7255, "step": 985 }, { "epoch": 0.08972267536704731, "grad_norm": 0.9503291845321655, "learning_rate": 8.972267536704731e-06, "loss": 0.6582, "step": 990 }, { "epoch": 0.09017582019213341, "grad_norm": 0.8389723896980286, "learning_rate": 9.017582019213342e-06, "loss": 0.7294, "step": 995 }, { "epoch": 0.09062896501721951, "grad_norm": 0.7861019372940063, "learning_rate": 9.062896501721952e-06, "loss": 0.6602, "step": 1000 }, { "epoch": 0.0910821098423056, "grad_norm": 1.0304704904556274, "learning_rate": 9.10821098423056e-06, "loss": 0.759, "step": 1005 }, { "epoch": 0.0915352546673917, "grad_norm": 0.8281311392784119, "learning_rate": 9.15352546673917e-06, "loss": 0.6891, "step": 1010 }, { "epoch": 0.0919883994924778, "grad_norm": 0.8454942107200623, "learning_rate": 9.19883994924778e-06, "loss": 0.6637, "step": 1015 }, { "epoch": 0.09244154431756389, "grad_norm": 0.7659911513328552, "learning_rate": 9.24415443175639e-06, "loss": 0.673, "step": 1020 }, { "epoch": 0.09289468914264999, "grad_norm": 0.8115429878234863, "learning_rate": 9.289468914264999e-06, "loss": 0.6966, "step": 1025 }, { "epoch": 0.09334783396773609, "grad_norm": 0.7945654988288879, "learning_rate": 9.33478339677361e-06, "loss": 0.6837, "step": 1030 }, { "epoch": 0.09380097879282219, "grad_norm": 0.8040125966072083, "learning_rate": 9.38009787928222e-06, "loss": 0.6811, "step": 1035 }, { "epoch": 0.09425412361790829, "grad_norm": 0.8210301399230957, "learning_rate": 9.425412361790829e-06, "loss": 0.728, "step": 1040 }, { "epoch": 0.09470726844299437, "grad_norm": 0.8699106574058533, "learning_rate": 9.470726844299438e-06, "loss": 0.7637, "step": 1045 }, { "epoch": 0.09516041326808047, "grad_norm": 0.8630550503730774, "learning_rate": 9.516041326808048e-06, "loss": 0.7219, "step": 1050 }, { "epoch": 0.09561355809316657, "grad_norm": 0.7842612862586975, "learning_rate": 9.561355809316657e-06, "loss": 0.7424, "step": 1055 }, { "epoch": 0.09606670291825267, "grad_norm": 0.908790111541748, "learning_rate": 9.606670291825268e-06, "loss": 0.6852, "step": 1060 }, { "epoch": 0.09651984774333877, "grad_norm": 0.8228508830070496, "learning_rate": 9.651984774333878e-06, "loss": 0.707, "step": 1065 }, { "epoch": 0.09697299256842487, "grad_norm": 0.7951065897941589, "learning_rate": 9.697299256842487e-06, "loss": 0.6414, "step": 1070 }, { "epoch": 0.09742613739351097, "grad_norm": 0.8280823230743408, "learning_rate": 9.742613739351098e-06, "loss": 0.7103, "step": 1075 }, { "epoch": 0.09787928221859707, "grad_norm": 0.9675266146659851, "learning_rate": 9.787928221859708e-06, "loss": 0.6897, "step": 1080 }, { "epoch": 0.09833242704368315, "grad_norm": 0.870352029800415, "learning_rate": 9.833242704368315e-06, "loss": 0.6616, "step": 1085 }, { "epoch": 0.09878557186876925, "grad_norm": 0.8623203635215759, "learning_rate": 9.878557186876927e-06, "loss": 0.7365, "step": 1090 }, { "epoch": 0.09923871669385535, "grad_norm": 0.856452465057373, "learning_rate": 9.923871669385536e-06, "loss": 0.6826, "step": 1095 }, { "epoch": 0.09969186151894145, "grad_norm": 0.8867284059524536, "learning_rate": 9.969186151894145e-06, "loss": 0.7391, "step": 1100 }, { "epoch": 0.10014500634402755, "grad_norm": 0.9644516110420227, "learning_rate": 1.0014500634402755e-05, "loss": 0.6597, "step": 1105 }, { "epoch": 0.10059815116911365, "grad_norm": 0.871374785900116, "learning_rate": 1.0059815116911366e-05, "loss": 0.7315, "step": 1110 }, { "epoch": 0.10105129599419975, "grad_norm": 0.8495467305183411, "learning_rate": 1.0105129599419975e-05, "loss": 0.6956, "step": 1115 }, { "epoch": 0.10150444081928585, "grad_norm": 0.928229033946991, "learning_rate": 1.0150444081928585e-05, "loss": 0.7906, "step": 1120 }, { "epoch": 0.10195758564437195, "grad_norm": 0.9027310013771057, "learning_rate": 1.0195758564437196e-05, "loss": 0.6798, "step": 1125 }, { "epoch": 0.10241073046945803, "grad_norm": 0.8270677328109741, "learning_rate": 1.0241073046945804e-05, "loss": 0.6996, "step": 1130 }, { "epoch": 0.10286387529454413, "grad_norm": 0.8922069072723389, "learning_rate": 1.0286387529454413e-05, "loss": 0.7574, "step": 1135 }, { "epoch": 0.10331702011963023, "grad_norm": 0.8729934096336365, "learning_rate": 1.0331702011963024e-05, "loss": 0.6687, "step": 1140 }, { "epoch": 0.10377016494471633, "grad_norm": 0.8629111051559448, "learning_rate": 1.0377016494471634e-05, "loss": 0.6683, "step": 1145 }, { "epoch": 0.10422330976980243, "grad_norm": 0.880828320980072, "learning_rate": 1.0422330976980243e-05, "loss": 0.6834, "step": 1150 }, { "epoch": 0.10467645459488853, "grad_norm": 0.8722450733184814, "learning_rate": 1.0467645459488854e-05, "loss": 0.7097, "step": 1155 }, { "epoch": 0.10512959941997463, "grad_norm": 0.8644553422927856, "learning_rate": 1.0512959941997464e-05, "loss": 0.6649, "step": 1160 }, { "epoch": 0.10558274424506073, "grad_norm": 0.8034363985061646, "learning_rate": 1.0558274424506073e-05, "loss": 0.715, "step": 1165 }, { "epoch": 0.10603588907014681, "grad_norm": 0.7831945419311523, "learning_rate": 1.0603588907014683e-05, "loss": 0.6864, "step": 1170 }, { "epoch": 0.10648903389523291, "grad_norm": 0.802034854888916, "learning_rate": 1.0648903389523292e-05, "loss": 0.6515, "step": 1175 }, { "epoch": 0.10694217872031901, "grad_norm": 0.843821108341217, "learning_rate": 1.0694217872031901e-05, "loss": 0.7239, "step": 1180 }, { "epoch": 0.10739532354540511, "grad_norm": 0.9052399396896362, "learning_rate": 1.0739532354540511e-05, "loss": 0.6914, "step": 1185 }, { "epoch": 0.10784846837049121, "grad_norm": 0.7857725620269775, "learning_rate": 1.0784846837049122e-05, "loss": 0.6206, "step": 1190 }, { "epoch": 0.10830161319557731, "grad_norm": 0.7904521822929382, "learning_rate": 1.0830161319557731e-05, "loss": 0.6761, "step": 1195 }, { "epoch": 0.10875475802066341, "grad_norm": 0.8491748571395874, "learning_rate": 1.0875475802066341e-05, "loss": 0.7236, "step": 1200 }, { "epoch": 0.10920790284574951, "grad_norm": 0.8582770824432373, "learning_rate": 1.0920790284574952e-05, "loss": 0.6794, "step": 1205 }, { "epoch": 0.10966104767083559, "grad_norm": 0.8070040345191956, "learning_rate": 1.096610476708356e-05, "loss": 0.7248, "step": 1210 }, { "epoch": 0.11011419249592169, "grad_norm": 0.8263701796531677, "learning_rate": 1.101141924959217e-05, "loss": 0.6466, "step": 1215 }, { "epoch": 0.11056733732100779, "grad_norm": 0.8869950175285339, "learning_rate": 1.105673373210078e-05, "loss": 0.7632, "step": 1220 }, { "epoch": 0.11102048214609389, "grad_norm": 0.8123072385787964, "learning_rate": 1.110204821460939e-05, "loss": 0.7326, "step": 1225 }, { "epoch": 0.11147362697117999, "grad_norm": 0.777942955493927, "learning_rate": 1.1147362697118e-05, "loss": 0.7138, "step": 1230 }, { "epoch": 0.11192677179626609, "grad_norm": 0.8463612198829651, "learning_rate": 1.1192677179626609e-05, "loss": 0.6903, "step": 1235 }, { "epoch": 0.11237991662135219, "grad_norm": 0.8077281713485718, "learning_rate": 1.123799166213522e-05, "loss": 0.7165, "step": 1240 }, { "epoch": 0.11283306144643829, "grad_norm": 0.7659412622451782, "learning_rate": 1.128330614464383e-05, "loss": 0.6864, "step": 1245 }, { "epoch": 0.11328620627152437, "grad_norm": 0.8571773767471313, "learning_rate": 1.1328620627152439e-05, "loss": 0.6856, "step": 1250 }, { "epoch": 0.11373935109661047, "grad_norm": 1.1303999423980713, "learning_rate": 1.1373935109661048e-05, "loss": 0.7138, "step": 1255 }, { "epoch": 0.11419249592169657, "grad_norm": 0.7828883528709412, "learning_rate": 1.1419249592169658e-05, "loss": 0.7062, "step": 1260 }, { "epoch": 0.11464564074678267, "grad_norm": 0.894838809967041, "learning_rate": 1.1464564074678267e-05, "loss": 0.6738, "step": 1265 }, { "epoch": 0.11509878557186877, "grad_norm": 0.8426092863082886, "learning_rate": 1.1509878557186878e-05, "loss": 0.7285, "step": 1270 }, { "epoch": 0.11555193039695487, "grad_norm": 0.8109918236732483, "learning_rate": 1.1555193039695488e-05, "loss": 0.7345, "step": 1275 }, { "epoch": 0.11600507522204097, "grad_norm": 0.7923703789710999, "learning_rate": 1.1600507522204097e-05, "loss": 0.7194, "step": 1280 }, { "epoch": 0.11645822004712707, "grad_norm": 0.80158931016922, "learning_rate": 1.1645822004712708e-05, "loss": 0.6773, "step": 1285 }, { "epoch": 0.11691136487221315, "grad_norm": 0.834099531173706, "learning_rate": 1.1691136487221316e-05, "loss": 0.7239, "step": 1290 }, { "epoch": 0.11736450969729925, "grad_norm": 0.8286436796188354, "learning_rate": 1.1736450969729925e-05, "loss": 0.6945, "step": 1295 }, { "epoch": 0.11781765452238535, "grad_norm": 0.861746609210968, "learning_rate": 1.1781765452238536e-05, "loss": 0.7007, "step": 1300 }, { "epoch": 0.11827079934747145, "grad_norm": 0.8473659753799438, "learning_rate": 1.1827079934747146e-05, "loss": 0.6644, "step": 1305 }, { "epoch": 0.11872394417255755, "grad_norm": 0.8250194191932678, "learning_rate": 1.1872394417255755e-05, "loss": 0.7013, "step": 1310 }, { "epoch": 0.11917708899764365, "grad_norm": 0.9342251420021057, "learning_rate": 1.1917708899764365e-05, "loss": 0.6747, "step": 1315 }, { "epoch": 0.11963023382272975, "grad_norm": 1.0025978088378906, "learning_rate": 1.1963023382272976e-05, "loss": 0.6526, "step": 1320 }, { "epoch": 0.12008337864781585, "grad_norm": 0.8143936991691589, "learning_rate": 1.2008337864781585e-05, "loss": 0.7158, "step": 1325 }, { "epoch": 0.12053652347290195, "grad_norm": 0.8678363561630249, "learning_rate": 1.2053652347290195e-05, "loss": 0.6278, "step": 1330 }, { "epoch": 0.12098966829798803, "grad_norm": 0.9283420443534851, "learning_rate": 1.2098966829798804e-05, "loss": 0.6565, "step": 1335 }, { "epoch": 0.12144281312307413, "grad_norm": 0.8425723314285278, "learning_rate": 1.2144281312307414e-05, "loss": 0.6658, "step": 1340 }, { "epoch": 0.12189595794816023, "grad_norm": 0.8260273933410645, "learning_rate": 1.2189595794816023e-05, "loss": 0.6326, "step": 1345 }, { "epoch": 0.12234910277324633, "grad_norm": 0.8832162022590637, "learning_rate": 1.2234910277324634e-05, "loss": 0.7082, "step": 1350 }, { "epoch": 0.12280224759833243, "grad_norm": 0.8422349095344543, "learning_rate": 1.2280224759833244e-05, "loss": 0.7072, "step": 1355 }, { "epoch": 0.12325539242341853, "grad_norm": 0.8603054285049438, "learning_rate": 1.2325539242341853e-05, "loss": 0.6957, "step": 1360 }, { "epoch": 0.12370853724850463, "grad_norm": 0.8631654977798462, "learning_rate": 1.2370853724850464e-05, "loss": 0.6542, "step": 1365 }, { "epoch": 0.12416168207359073, "grad_norm": 0.831926703453064, "learning_rate": 1.2416168207359074e-05, "loss": 0.7483, "step": 1370 }, { "epoch": 0.12461482689867681, "grad_norm": 0.8884423971176147, "learning_rate": 1.2461482689867681e-05, "loss": 0.7087, "step": 1375 }, { "epoch": 0.1250679717237629, "grad_norm": 0.807146430015564, "learning_rate": 1.2506797172376292e-05, "loss": 0.6663, "step": 1380 }, { "epoch": 0.125521116548849, "grad_norm": 0.7867916226387024, "learning_rate": 1.2552111654884902e-05, "loss": 0.6909, "step": 1385 }, { "epoch": 0.1259742613739351, "grad_norm": 0.8012793660163879, "learning_rate": 1.2597426137393511e-05, "loss": 0.7204, "step": 1390 }, { "epoch": 0.1264274061990212, "grad_norm": 0.8625927567481995, "learning_rate": 1.264274061990212e-05, "loss": 0.7112, "step": 1395 }, { "epoch": 0.1268805510241073, "grad_norm": 0.8163318037986755, "learning_rate": 1.2688055102410732e-05, "loss": 0.7394, "step": 1400 }, { "epoch": 0.1273336958491934, "grad_norm": 0.8041713237762451, "learning_rate": 1.2733369584919341e-05, "loss": 0.688, "step": 1405 }, { "epoch": 0.1277868406742795, "grad_norm": 0.809386134147644, "learning_rate": 1.277868406742795e-05, "loss": 0.6038, "step": 1410 }, { "epoch": 0.1282399854993656, "grad_norm": 0.8432759046554565, "learning_rate": 1.2823998549936562e-05, "loss": 0.6672, "step": 1415 }, { "epoch": 0.1286931303244517, "grad_norm": 0.7927317023277283, "learning_rate": 1.2869313032445171e-05, "loss": 0.6755, "step": 1420 }, { "epoch": 0.1291462751495378, "grad_norm": 0.8721144199371338, "learning_rate": 1.291462751495378e-05, "loss": 0.646, "step": 1425 }, { "epoch": 0.1295994199746239, "grad_norm": 0.8450779914855957, "learning_rate": 1.295994199746239e-05, "loss": 0.6957, "step": 1430 }, { "epoch": 0.13005256479970997, "grad_norm": 0.8845952153205872, "learning_rate": 1.3005256479970998e-05, "loss": 0.7797, "step": 1435 }, { "epoch": 0.13050570962479607, "grad_norm": 0.8951588273048401, "learning_rate": 1.3050570962479607e-05, "loss": 0.7118, "step": 1440 }, { "epoch": 0.13095885444988217, "grad_norm": 0.8125964403152466, "learning_rate": 1.3095885444988218e-05, "loss": 0.6387, "step": 1445 }, { "epoch": 0.13141199927496827, "grad_norm": 0.7962906360626221, "learning_rate": 1.3141199927496828e-05, "loss": 0.7394, "step": 1450 }, { "epoch": 0.13186514410005437, "grad_norm": 0.7706547975540161, "learning_rate": 1.3186514410005437e-05, "loss": 0.6472, "step": 1455 }, { "epoch": 0.13231828892514047, "grad_norm": 0.9268766045570374, "learning_rate": 1.3231828892514048e-05, "loss": 0.7254, "step": 1460 }, { "epoch": 0.13277143375022657, "grad_norm": 0.77468341588974, "learning_rate": 1.3277143375022658e-05, "loss": 0.753, "step": 1465 }, { "epoch": 0.13322457857531267, "grad_norm": 0.8258037567138672, "learning_rate": 1.3322457857531267e-05, "loss": 0.6409, "step": 1470 }, { "epoch": 0.13367772340039877, "grad_norm": 0.8306136131286621, "learning_rate": 1.3367772340039877e-05, "loss": 0.7044, "step": 1475 }, { "epoch": 0.13413086822548487, "grad_norm": 0.8698281049728394, "learning_rate": 1.3413086822548488e-05, "loss": 0.7299, "step": 1480 }, { "epoch": 0.13458401305057097, "grad_norm": 0.8370408415794373, "learning_rate": 1.3458401305057097e-05, "loss": 0.6838, "step": 1485 }, { "epoch": 0.13503715787565707, "grad_norm": 0.8126932382583618, "learning_rate": 1.3503715787565707e-05, "loss": 0.7403, "step": 1490 }, { "epoch": 0.13549030270074316, "grad_norm": 0.8393591642379761, "learning_rate": 1.3549030270074318e-05, "loss": 0.6698, "step": 1495 }, { "epoch": 0.13594344752582926, "grad_norm": 0.788479208946228, "learning_rate": 1.3594344752582927e-05, "loss": 0.6677, "step": 1500 }, { "epoch": 0.13639659235091536, "grad_norm": 0.8721023201942444, "learning_rate": 1.3639659235091537e-05, "loss": 0.7107, "step": 1505 }, { "epoch": 0.13684973717600146, "grad_norm": 0.866582453250885, "learning_rate": 1.3684973717600146e-05, "loss": 0.6557, "step": 1510 }, { "epoch": 0.13730288200108753, "grad_norm": 0.7527275681495667, "learning_rate": 1.3730288200108754e-05, "loss": 0.6085, "step": 1515 }, { "epoch": 0.13775602682617363, "grad_norm": 0.7928276658058167, "learning_rate": 1.3775602682617363e-05, "loss": 0.6852, "step": 1520 }, { "epoch": 0.13820917165125973, "grad_norm": 0.8544141054153442, "learning_rate": 1.3820917165125975e-05, "loss": 0.6635, "step": 1525 }, { "epoch": 0.13866231647634583, "grad_norm": 0.86512291431427, "learning_rate": 1.3866231647634584e-05, "loss": 0.6998, "step": 1530 }, { "epoch": 0.13911546130143193, "grad_norm": 0.8498688340187073, "learning_rate": 1.3911546130143193e-05, "loss": 0.686, "step": 1535 }, { "epoch": 0.13956860612651803, "grad_norm": 0.849513828754425, "learning_rate": 1.3956860612651803e-05, "loss": 0.6651, "step": 1540 }, { "epoch": 0.14002175095160413, "grad_norm": 0.8536824584007263, "learning_rate": 1.4002175095160414e-05, "loss": 0.7105, "step": 1545 }, { "epoch": 0.14047489577669023, "grad_norm": 0.763559103012085, "learning_rate": 1.4047489577669023e-05, "loss": 0.6369, "step": 1550 }, { "epoch": 0.14092804060177633, "grad_norm": 0.8098294138908386, "learning_rate": 1.4092804060177633e-05, "loss": 0.676, "step": 1555 }, { "epoch": 0.14138118542686243, "grad_norm": 0.7392351031303406, "learning_rate": 1.4138118542686244e-05, "loss": 0.6318, "step": 1560 }, { "epoch": 0.14183433025194853, "grad_norm": 0.9397588968276978, "learning_rate": 1.4183433025194853e-05, "loss": 0.7295, "step": 1565 }, { "epoch": 0.14228747507703463, "grad_norm": 1.0900477170944214, "learning_rate": 1.4228747507703463e-05, "loss": 0.6503, "step": 1570 }, { "epoch": 0.14274061990212072, "grad_norm": 0.8149482011795044, "learning_rate": 1.4274061990212072e-05, "loss": 0.7014, "step": 1575 }, { "epoch": 0.14319376472720682, "grad_norm": 0.9425377249717712, "learning_rate": 1.4319376472720683e-05, "loss": 0.7155, "step": 1580 }, { "epoch": 0.14364690955229292, "grad_norm": 0.8183274269104004, "learning_rate": 1.4364690955229293e-05, "loss": 0.7489, "step": 1585 }, { "epoch": 0.14410005437737902, "grad_norm": 0.8271051645278931, "learning_rate": 1.4410005437737902e-05, "loss": 0.626, "step": 1590 }, { "epoch": 0.14455319920246512, "grad_norm": 0.7812680006027222, "learning_rate": 1.4455319920246513e-05, "loss": 0.6537, "step": 1595 }, { "epoch": 0.1450063440275512, "grad_norm": 0.7534462213516235, "learning_rate": 1.450063440275512e-05, "loss": 0.6525, "step": 1600 }, { "epoch": 0.1454594888526373, "grad_norm": 0.8145058751106262, "learning_rate": 1.454594888526373e-05, "loss": 0.6796, "step": 1605 }, { "epoch": 0.1459126336777234, "grad_norm": 0.7670394778251648, "learning_rate": 1.459126336777234e-05, "loss": 0.6606, "step": 1610 }, { "epoch": 0.1463657785028095, "grad_norm": 0.8289769887924194, "learning_rate": 1.463657785028095e-05, "loss": 0.6922, "step": 1615 }, { "epoch": 0.1468189233278956, "grad_norm": 0.8377647995948792, "learning_rate": 1.4681892332789559e-05, "loss": 0.7284, "step": 1620 }, { "epoch": 0.1472720681529817, "grad_norm": 0.8459876775741577, "learning_rate": 1.472720681529817e-05, "loss": 0.6463, "step": 1625 }, { "epoch": 0.1477252129780678, "grad_norm": 0.841952383518219, "learning_rate": 1.477252129780678e-05, "loss": 0.6844, "step": 1630 }, { "epoch": 0.1481783578031539, "grad_norm": 0.8334453105926514, "learning_rate": 1.4817835780315389e-05, "loss": 0.6899, "step": 1635 }, { "epoch": 0.14863150262824, "grad_norm": 0.7981679439544678, "learning_rate": 1.4863150262824e-05, "loss": 0.6672, "step": 1640 }, { "epoch": 0.14908464745332609, "grad_norm": 0.7844497561454773, "learning_rate": 1.490846474533261e-05, "loss": 0.6743, "step": 1645 }, { "epoch": 0.14953779227841218, "grad_norm": 0.8604995012283325, "learning_rate": 1.4953779227841219e-05, "loss": 0.6756, "step": 1650 }, { "epoch": 0.14999093710349828, "grad_norm": 0.8262831568717957, "learning_rate": 1.4999093710349828e-05, "loss": 0.6864, "step": 1655 }, { "epoch": 0.15044408192858438, "grad_norm": 3.950866222381592, "learning_rate": 1.504440819285844e-05, "loss": 0.6847, "step": 1660 }, { "epoch": 0.15089722675367048, "grad_norm": 1.9823105335235596, "learning_rate": 1.5089722675367049e-05, "loss": 0.698, "step": 1665 }, { "epoch": 0.15135037157875658, "grad_norm": 1.605940818786621, "learning_rate": 1.5135037157875658e-05, "loss": 0.6799, "step": 1670 }, { "epoch": 0.15180351640384268, "grad_norm": 2.6171300411224365, "learning_rate": 1.518035164038427e-05, "loss": 0.718, "step": 1675 }, { "epoch": 0.15225666122892875, "grad_norm": 0.9881243705749512, "learning_rate": 1.5225666122892876e-05, "loss": 0.618, "step": 1680 }, { "epoch": 0.15270980605401485, "grad_norm": 1.0057508945465088, "learning_rate": 1.5270980605401487e-05, "loss": 0.668, "step": 1685 }, { "epoch": 0.15316295087910095, "grad_norm": 1.169883370399475, "learning_rate": 1.5316295087910094e-05, "loss": 0.6651, "step": 1690 }, { "epoch": 0.15361609570418705, "grad_norm": 0.9097902774810791, "learning_rate": 1.5361609570418706e-05, "loss": 0.6522, "step": 1695 }, { "epoch": 0.15406924052927315, "grad_norm": 0.7885249257087708, "learning_rate": 1.5406924052927317e-05, "loss": 0.657, "step": 1700 }, { "epoch": 0.15452238535435925, "grad_norm": 0.8311389088630676, "learning_rate": 1.5452238535435924e-05, "loss": 0.7007, "step": 1705 }, { "epoch": 0.15497553017944535, "grad_norm": 0.7960989475250244, "learning_rate": 1.5497553017944536e-05, "loss": 0.7597, "step": 1710 }, { "epoch": 0.15542867500453145, "grad_norm": 0.8414075374603271, "learning_rate": 1.5542867500453147e-05, "loss": 0.6934, "step": 1715 }, { "epoch": 0.15588181982961755, "grad_norm": 0.8374921083450317, "learning_rate": 1.5588181982961754e-05, "loss": 0.6637, "step": 1720 }, { "epoch": 0.15633496465470365, "grad_norm": 0.7916471362113953, "learning_rate": 1.5633496465470366e-05, "loss": 0.7401, "step": 1725 }, { "epoch": 0.15678810947978974, "grad_norm": 0.8318490386009216, "learning_rate": 1.5678810947978977e-05, "loss": 0.6768, "step": 1730 }, { "epoch": 0.15724125430487584, "grad_norm": 0.8583362102508545, "learning_rate": 1.5724125430487584e-05, "loss": 0.6767, "step": 1735 }, { "epoch": 0.15769439912996194, "grad_norm": 0.8266200423240662, "learning_rate": 1.5769439912996196e-05, "loss": 0.7085, "step": 1740 }, { "epoch": 0.15814754395504804, "grad_norm": 0.8578665256500244, "learning_rate": 1.5814754395504803e-05, "loss": 0.6483, "step": 1745 }, { "epoch": 0.15860068878013414, "grad_norm": 1.1986628770828247, "learning_rate": 1.5860068878013414e-05, "loss": 0.6927, "step": 1750 }, { "epoch": 0.15905383360522024, "grad_norm": 0.8988423943519592, "learning_rate": 1.5905383360522026e-05, "loss": 0.6996, "step": 1755 }, { "epoch": 0.1595069784303063, "grad_norm": 0.7995740175247192, "learning_rate": 1.5950697843030633e-05, "loss": 0.6893, "step": 1760 }, { "epoch": 0.1599601232553924, "grad_norm": 0.8319813013076782, "learning_rate": 1.599601232553924e-05, "loss": 0.6728, "step": 1765 }, { "epoch": 0.1604132680804785, "grad_norm": 0.735465407371521, "learning_rate": 1.6041326808047852e-05, "loss": 0.6279, "step": 1770 }, { "epoch": 0.1608664129055646, "grad_norm": 0.8623532652854919, "learning_rate": 1.6086641290556463e-05, "loss": 0.6746, "step": 1775 }, { "epoch": 0.1613195577306507, "grad_norm": 0.8520218133926392, "learning_rate": 1.613195577306507e-05, "loss": 0.679, "step": 1780 }, { "epoch": 0.1617727025557368, "grad_norm": 0.8694140911102295, "learning_rate": 1.6177270255573682e-05, "loss": 0.698, "step": 1785 }, { "epoch": 0.1622258473808229, "grad_norm": 0.8009285926818848, "learning_rate": 1.622258473808229e-05, "loss": 0.6306, "step": 1790 }, { "epoch": 0.162678992205909, "grad_norm": 0.8537169098854065, "learning_rate": 1.62678992205909e-05, "loss": 0.6638, "step": 1795 }, { "epoch": 0.1631321370309951, "grad_norm": 1.1869014501571655, "learning_rate": 1.6313213703099512e-05, "loss": 0.6927, "step": 1800 }, { "epoch": 0.1635852818560812, "grad_norm": 0.88529372215271, "learning_rate": 1.635852818560812e-05, "loss": 0.7285, "step": 1805 }, { "epoch": 0.1640384266811673, "grad_norm": 0.8358108401298523, "learning_rate": 1.640384266811673e-05, "loss": 0.6722, "step": 1810 }, { "epoch": 0.1644915715062534, "grad_norm": 0.7710966467857361, "learning_rate": 1.6449157150625342e-05, "loss": 0.6735, "step": 1815 }, { "epoch": 0.1649447163313395, "grad_norm": 0.768605649471283, "learning_rate": 1.649447163313395e-05, "loss": 0.6758, "step": 1820 }, { "epoch": 0.1653978611564256, "grad_norm": 0.8077301979064941, "learning_rate": 1.653978611564256e-05, "loss": 0.6495, "step": 1825 }, { "epoch": 0.1658510059815117, "grad_norm": 0.8487488627433777, "learning_rate": 1.6585100598151172e-05, "loss": 0.6821, "step": 1830 }, { "epoch": 0.1663041508065978, "grad_norm": 0.8074855804443359, "learning_rate": 1.663041508065978e-05, "loss": 0.6645, "step": 1835 }, { "epoch": 0.1667572956316839, "grad_norm": 0.8097678422927856, "learning_rate": 1.667572956316839e-05, "loss": 0.7145, "step": 1840 }, { "epoch": 0.16721044045676997, "grad_norm": 0.7902653217315674, "learning_rate": 1.6721044045677e-05, "loss": 0.7261, "step": 1845 }, { "epoch": 0.16766358528185607, "grad_norm": 0.816898763179779, "learning_rate": 1.6766358528185607e-05, "loss": 0.6783, "step": 1850 }, { "epoch": 0.16811673010694217, "grad_norm": 0.8633898496627808, "learning_rate": 1.6811673010694218e-05, "loss": 0.6685, "step": 1855 }, { "epoch": 0.16856987493202827, "grad_norm": 0.8558787703514099, "learning_rate": 1.685698749320283e-05, "loss": 0.6683, "step": 1860 }, { "epoch": 0.16902301975711437, "grad_norm": 0.9175195097923279, "learning_rate": 1.6902301975711437e-05, "loss": 0.7185, "step": 1865 }, { "epoch": 0.16947616458220047, "grad_norm": 0.7801957726478577, "learning_rate": 1.6947616458220048e-05, "loss": 0.6955, "step": 1870 }, { "epoch": 0.16992930940728657, "grad_norm": 0.8339425921440125, "learning_rate": 1.699293094072866e-05, "loss": 0.7593, "step": 1875 }, { "epoch": 0.17038245423237267, "grad_norm": 0.860041081905365, "learning_rate": 1.7038245423237267e-05, "loss": 0.6856, "step": 1880 }, { "epoch": 0.17083559905745876, "grad_norm": 0.7761180996894836, "learning_rate": 1.7083559905745878e-05, "loss": 0.6767, "step": 1885 }, { "epoch": 0.17128874388254486, "grad_norm": 0.9165372252464294, "learning_rate": 1.712887438825449e-05, "loss": 0.6763, "step": 1890 }, { "epoch": 0.17174188870763096, "grad_norm": 0.7656795382499695, "learning_rate": 1.7174188870763097e-05, "loss": 0.5976, "step": 1895 }, { "epoch": 0.17219503353271706, "grad_norm": 0.8346091508865356, "learning_rate": 1.7219503353271708e-05, "loss": 0.637, "step": 1900 }, { "epoch": 0.17264817835780316, "grad_norm": 0.7921444177627563, "learning_rate": 1.7264817835780315e-05, "loss": 0.6575, "step": 1905 }, { "epoch": 0.17310132318288926, "grad_norm": 0.8487653136253357, "learning_rate": 1.7310132318288927e-05, "loss": 0.6583, "step": 1910 }, { "epoch": 0.17355446800797536, "grad_norm": 0.7881762981414795, "learning_rate": 1.7355446800797538e-05, "loss": 0.6481, "step": 1915 }, { "epoch": 0.17400761283306146, "grad_norm": 0.7814942598342896, "learning_rate": 1.7400761283306145e-05, "loss": 0.6066, "step": 1920 }, { "epoch": 0.17446075765814753, "grad_norm": 0.8867872357368469, "learning_rate": 1.7446075765814753e-05, "loss": 0.6501, "step": 1925 }, { "epoch": 0.17491390248323363, "grad_norm": 0.7526772022247314, "learning_rate": 1.7491390248323364e-05, "loss": 0.6898, "step": 1930 }, { "epoch": 0.17536704730831973, "grad_norm": 0.8044338822364807, "learning_rate": 1.7536704730831975e-05, "loss": 0.6661, "step": 1935 }, { "epoch": 0.17582019213340583, "grad_norm": 0.8792513608932495, "learning_rate": 1.7582019213340583e-05, "loss": 0.5947, "step": 1940 }, { "epoch": 0.17627333695849193, "grad_norm": 0.8302608728408813, "learning_rate": 1.7627333695849194e-05, "loss": 0.7264, "step": 1945 }, { "epoch": 0.17672648178357803, "grad_norm": 0.7998339533805847, "learning_rate": 1.7672648178357802e-05, "loss": 0.6409, "step": 1950 }, { "epoch": 0.17717962660866413, "grad_norm": 0.8677413463592529, "learning_rate": 1.7717962660866413e-05, "loss": 0.7041, "step": 1955 }, { "epoch": 0.17763277143375023, "grad_norm": 0.8164024353027344, "learning_rate": 1.7763277143375024e-05, "loss": 0.6824, "step": 1960 }, { "epoch": 0.17808591625883632, "grad_norm": 0.8649614453315735, "learning_rate": 1.7808591625883632e-05, "loss": 0.6739, "step": 1965 }, { "epoch": 0.17853906108392242, "grad_norm": 0.8390263915061951, "learning_rate": 1.7853906108392243e-05, "loss": 0.6962, "step": 1970 }, { "epoch": 0.17899220590900852, "grad_norm": 0.8899409770965576, "learning_rate": 1.7899220590900854e-05, "loss": 0.6974, "step": 1975 }, { "epoch": 0.17944535073409462, "grad_norm": 0.7759222984313965, "learning_rate": 1.7944535073409462e-05, "loss": 0.6547, "step": 1980 }, { "epoch": 0.17989849555918072, "grad_norm": 0.8078128695487976, "learning_rate": 1.7989849555918073e-05, "loss": 0.6848, "step": 1985 }, { "epoch": 0.18035164038426682, "grad_norm": 0.847313642501831, "learning_rate": 1.8035164038426684e-05, "loss": 0.6753, "step": 1990 }, { "epoch": 0.18080478520935292, "grad_norm": 0.8318511843681335, "learning_rate": 1.8080478520935292e-05, "loss": 0.6588, "step": 1995 }, { "epoch": 0.18125793003443902, "grad_norm": 0.7961201667785645, "learning_rate": 1.8125793003443903e-05, "loss": 0.6922, "step": 2000 }, { "epoch": 0.18171107485952512, "grad_norm": 0.8698632717132568, "learning_rate": 1.8171107485952514e-05, "loss": 0.7078, "step": 2005 }, { "epoch": 0.1821642196846112, "grad_norm": 0.837388813495636, "learning_rate": 1.821642196846112e-05, "loss": 0.7123, "step": 2010 }, { "epoch": 0.1826173645096973, "grad_norm": 0.854392409324646, "learning_rate": 1.826173645096973e-05, "loss": 0.6455, "step": 2015 }, { "epoch": 0.1830705093347834, "grad_norm": 0.8289536237716675, "learning_rate": 1.830705093347834e-05, "loss": 0.6258, "step": 2020 }, { "epoch": 0.1835236541598695, "grad_norm": 0.9089272022247314, "learning_rate": 1.835236541598695e-05, "loss": 0.6891, "step": 2025 }, { "epoch": 0.1839767989849556, "grad_norm": 0.8296128511428833, "learning_rate": 1.839767989849556e-05, "loss": 0.7223, "step": 2030 }, { "epoch": 0.18442994381004169, "grad_norm": 0.8543399572372437, "learning_rate": 1.844299438100417e-05, "loss": 0.6624, "step": 2035 }, { "epoch": 0.18488308863512778, "grad_norm": 0.8296340107917786, "learning_rate": 1.848830886351278e-05, "loss": 0.6916, "step": 2040 }, { "epoch": 0.18533623346021388, "grad_norm": 0.8519600629806519, "learning_rate": 1.853362334602139e-05, "loss": 0.655, "step": 2045 }, { "epoch": 0.18578937828529998, "grad_norm": 0.7761092185974121, "learning_rate": 1.8578937828529998e-05, "loss": 0.6507, "step": 2050 }, { "epoch": 0.18624252311038608, "grad_norm": 0.8311357498168945, "learning_rate": 1.862425231103861e-05, "loss": 0.7785, "step": 2055 }, { "epoch": 0.18669566793547218, "grad_norm": 0.8090501427650452, "learning_rate": 1.866956679354722e-05, "loss": 0.6927, "step": 2060 }, { "epoch": 0.18714881276055828, "grad_norm": 0.8516384959220886, "learning_rate": 1.8714881276055828e-05, "loss": 0.6987, "step": 2065 }, { "epoch": 0.18760195758564438, "grad_norm": 0.814505398273468, "learning_rate": 1.876019575856444e-05, "loss": 0.7071, "step": 2070 }, { "epoch": 0.18805510241073048, "grad_norm": 0.7856454253196716, "learning_rate": 1.880551024107305e-05, "loss": 0.6392, "step": 2075 }, { "epoch": 0.18850824723581658, "grad_norm": 0.891367495059967, "learning_rate": 1.8850824723581658e-05, "loss": 0.6804, "step": 2080 }, { "epoch": 0.18896139206090268, "grad_norm": 0.7776215672492981, "learning_rate": 1.889613920609027e-05, "loss": 0.6113, "step": 2085 }, { "epoch": 0.18941453688598875, "grad_norm": 0.8168394565582275, "learning_rate": 1.8941453688598876e-05, "loss": 0.7383, "step": 2090 }, { "epoch": 0.18986768171107485, "grad_norm": 0.8793442845344543, "learning_rate": 1.8986768171107484e-05, "loss": 0.6655, "step": 2095 }, { "epoch": 0.19032082653616095, "grad_norm": 0.7807128429412842, "learning_rate": 1.9032082653616095e-05, "loss": 0.6681, "step": 2100 }, { "epoch": 0.19077397136124705, "grad_norm": 0.7932562828063965, "learning_rate": 1.9077397136124706e-05, "loss": 0.671, "step": 2105 }, { "epoch": 0.19122711618633315, "grad_norm": 0.8020330667495728, "learning_rate": 1.9122711618633314e-05, "loss": 0.6519, "step": 2110 }, { "epoch": 0.19168026101141925, "grad_norm": 0.8838668465614319, "learning_rate": 1.9168026101141925e-05, "loss": 0.704, "step": 2115 }, { "epoch": 0.19213340583650534, "grad_norm": 0.8233444690704346, "learning_rate": 1.9213340583650536e-05, "loss": 0.6952, "step": 2120 }, { "epoch": 0.19258655066159144, "grad_norm": 0.8177528977394104, "learning_rate": 1.9258655066159144e-05, "loss": 0.6486, "step": 2125 }, { "epoch": 0.19303969548667754, "grad_norm": 0.8346889615058899, "learning_rate": 1.9303969548667755e-05, "loss": 0.6339, "step": 2130 }, { "epoch": 0.19349284031176364, "grad_norm": 0.8191184401512146, "learning_rate": 1.9349284031176366e-05, "loss": 0.6898, "step": 2135 }, { "epoch": 0.19394598513684974, "grad_norm": 0.8271493315696716, "learning_rate": 1.9394598513684974e-05, "loss": 0.6592, "step": 2140 }, { "epoch": 0.19439912996193584, "grad_norm": 0.7820581197738647, "learning_rate": 1.9439912996193585e-05, "loss": 0.6443, "step": 2145 }, { "epoch": 0.19485227478702194, "grad_norm": 0.8239832520484924, "learning_rate": 1.9485227478702196e-05, "loss": 0.5964, "step": 2150 }, { "epoch": 0.19530541961210804, "grad_norm": 0.8213889002799988, "learning_rate": 1.9530541961210804e-05, "loss": 0.6864, "step": 2155 }, { "epoch": 0.19575856443719414, "grad_norm": 0.8874015808105469, "learning_rate": 1.9575856443719415e-05, "loss": 0.6678, "step": 2160 }, { "epoch": 0.19621170926228024, "grad_norm": 0.8178291320800781, "learning_rate": 1.9621170926228023e-05, "loss": 0.6589, "step": 2165 }, { "epoch": 0.1966648540873663, "grad_norm": 0.815295934677124, "learning_rate": 1.966648540873663e-05, "loss": 0.6716, "step": 2170 }, { "epoch": 0.1971179989124524, "grad_norm": 0.8000981211662292, "learning_rate": 1.9711799891245242e-05, "loss": 0.6188, "step": 2175 }, { "epoch": 0.1975711437375385, "grad_norm": 0.8327875137329102, "learning_rate": 1.9757114373753853e-05, "loss": 0.6462, "step": 2180 }, { "epoch": 0.1980242885626246, "grad_norm": 0.8812274932861328, "learning_rate": 1.980242885626246e-05, "loss": 0.6833, "step": 2185 }, { "epoch": 0.1984774333877107, "grad_norm": 0.974230945110321, "learning_rate": 1.9847743338771072e-05, "loss": 0.748, "step": 2190 }, { "epoch": 0.1989305782127968, "grad_norm": 0.8152954578399658, "learning_rate": 1.9893057821279683e-05, "loss": 0.6706, "step": 2195 }, { "epoch": 0.1993837230378829, "grad_norm": 0.8329400420188904, "learning_rate": 1.993837230378829e-05, "loss": 0.704, "step": 2200 }, { "epoch": 0.199836867862969, "grad_norm": 0.905837893486023, "learning_rate": 1.9983686786296902e-05, "loss": 0.681, "step": 2205 }, { "epoch": 0.2002900126880551, "grad_norm": 0.7923001646995544, "learning_rate": 2.002900126880551e-05, "loss": 0.6891, "step": 2210 }, { "epoch": 0.2007431575131412, "grad_norm": 0.8375571966171265, "learning_rate": 2.007431575131412e-05, "loss": 0.6155, "step": 2215 }, { "epoch": 0.2011963023382273, "grad_norm": 0.8712363243103027, "learning_rate": 2.0119630233822732e-05, "loss": 0.6152, "step": 2220 }, { "epoch": 0.2016494471633134, "grad_norm": 0.7995284795761108, "learning_rate": 2.016494471633134e-05, "loss": 0.6256, "step": 2225 }, { "epoch": 0.2021025919883995, "grad_norm": 0.8751293420791626, "learning_rate": 2.021025919883995e-05, "loss": 0.6841, "step": 2230 }, { "epoch": 0.2025557368134856, "grad_norm": 0.7974370718002319, "learning_rate": 2.0255573681348562e-05, "loss": 0.6541, "step": 2235 }, { "epoch": 0.2030088816385717, "grad_norm": 0.7895273566246033, "learning_rate": 2.030088816385717e-05, "loss": 0.6654, "step": 2240 }, { "epoch": 0.2034620264636578, "grad_norm": 0.8452236652374268, "learning_rate": 2.034620264636578e-05, "loss": 0.6662, "step": 2245 }, { "epoch": 0.2039151712887439, "grad_norm": 0.8037408590316772, "learning_rate": 2.0391517128874392e-05, "loss": 0.6145, "step": 2250 }, { "epoch": 0.20436831611382997, "grad_norm": 0.7836667895317078, "learning_rate": 2.0436831611382996e-05, "loss": 0.6881, "step": 2255 }, { "epoch": 0.20482146093891607, "grad_norm": 0.820275068283081, "learning_rate": 2.0482146093891607e-05, "loss": 0.7245, "step": 2260 }, { "epoch": 0.20527460576400217, "grad_norm": 0.8529037237167358, "learning_rate": 2.052746057640022e-05, "loss": 0.699, "step": 2265 }, { "epoch": 0.20572775058908827, "grad_norm": 0.913713812828064, "learning_rate": 2.0572775058908826e-05, "loss": 0.6979, "step": 2270 }, { "epoch": 0.20618089541417436, "grad_norm": 0.8235358595848083, "learning_rate": 2.0618089541417437e-05, "loss": 0.673, "step": 2275 }, { "epoch": 0.20663404023926046, "grad_norm": 0.8327574133872986, "learning_rate": 2.066340402392605e-05, "loss": 0.6282, "step": 2280 }, { "epoch": 0.20708718506434656, "grad_norm": 0.8417721390724182, "learning_rate": 2.0708718506434656e-05, "loss": 0.6946, "step": 2285 }, { "epoch": 0.20754032988943266, "grad_norm": 0.8163365125656128, "learning_rate": 2.0754032988943267e-05, "loss": 0.6179, "step": 2290 }, { "epoch": 0.20799347471451876, "grad_norm": 0.9812320470809937, "learning_rate": 2.079934747145188e-05, "loss": 0.7318, "step": 2295 }, { "epoch": 0.20844661953960486, "grad_norm": 0.9201845526695251, "learning_rate": 2.0844661953960486e-05, "loss": 0.6071, "step": 2300 }, { "epoch": 0.20889976436469096, "grad_norm": 0.856987476348877, "learning_rate": 2.0889976436469097e-05, "loss": 0.6398, "step": 2305 }, { "epoch": 0.20935290918977706, "grad_norm": 0.8207604885101318, "learning_rate": 2.093529091897771e-05, "loss": 0.6793, "step": 2310 }, { "epoch": 0.20980605401486316, "grad_norm": 0.8458103537559509, "learning_rate": 2.0980605401486316e-05, "loss": 0.6602, "step": 2315 }, { "epoch": 0.21025919883994926, "grad_norm": 0.8802046179771423, "learning_rate": 2.1025919883994927e-05, "loss": 0.6893, "step": 2320 }, { "epoch": 0.21071234366503536, "grad_norm": 0.872003972530365, "learning_rate": 2.1071234366503535e-05, "loss": 0.6647, "step": 2325 }, { "epoch": 0.21116548849012146, "grad_norm": 0.8256244659423828, "learning_rate": 2.1116548849012146e-05, "loss": 0.6542, "step": 2330 }, { "epoch": 0.21161863331520753, "grad_norm": 0.8336780071258545, "learning_rate": 2.1161863331520754e-05, "loss": 0.6402, "step": 2335 }, { "epoch": 0.21207177814029363, "grad_norm": 0.8634117841720581, "learning_rate": 2.1207177814029365e-05, "loss": 0.7179, "step": 2340 }, { "epoch": 0.21252492296537973, "grad_norm": 0.8113746643066406, "learning_rate": 2.1252492296537973e-05, "loss": 0.6451, "step": 2345 }, { "epoch": 0.21297806779046582, "grad_norm": 0.8746495246887207, "learning_rate": 2.1297806779046584e-05, "loss": 0.6215, "step": 2350 }, { "epoch": 0.21343121261555192, "grad_norm": 0.8088000416755676, "learning_rate": 2.1343121261555192e-05, "loss": 0.651, "step": 2355 }, { "epoch": 0.21388435744063802, "grad_norm": 0.8345638513565063, "learning_rate": 2.1388435744063803e-05, "loss": 0.6916, "step": 2360 }, { "epoch": 0.21433750226572412, "grad_norm": 0.8498806357383728, "learning_rate": 2.1433750226572414e-05, "loss": 0.6809, "step": 2365 }, { "epoch": 0.21479064709081022, "grad_norm": 0.8483204245567322, "learning_rate": 2.1479064709081022e-05, "loss": 0.6761, "step": 2370 }, { "epoch": 0.21524379191589632, "grad_norm": 0.9153007864952087, "learning_rate": 2.1524379191589633e-05, "loss": 0.6852, "step": 2375 }, { "epoch": 0.21569693674098242, "grad_norm": 0.8338358402252197, "learning_rate": 2.1569693674098244e-05, "loss": 0.6557, "step": 2380 }, { "epoch": 0.21615008156606852, "grad_norm": 0.8001590967178345, "learning_rate": 2.1615008156606852e-05, "loss": 0.6049, "step": 2385 }, { "epoch": 0.21660322639115462, "grad_norm": 0.8286995887756348, "learning_rate": 2.1660322639115463e-05, "loss": 0.6256, "step": 2390 }, { "epoch": 0.21705637121624072, "grad_norm": 0.9919258952140808, "learning_rate": 2.1705637121624074e-05, "loss": 0.5963, "step": 2395 }, { "epoch": 0.21750951604132682, "grad_norm": 0.7804303169250488, "learning_rate": 2.1750951604132682e-05, "loss": 0.6505, "step": 2400 }, { "epoch": 0.21796266086641292, "grad_norm": 0.804158627986908, "learning_rate": 2.1796266086641293e-05, "loss": 0.6689, "step": 2405 }, { "epoch": 0.21841580569149902, "grad_norm": 0.7539385557174683, "learning_rate": 2.1841580569149904e-05, "loss": 0.6759, "step": 2410 }, { "epoch": 0.2188689505165851, "grad_norm": 0.7887430191040039, "learning_rate": 2.188689505165851e-05, "loss": 0.6494, "step": 2415 }, { "epoch": 0.21932209534167119, "grad_norm": 0.7964594960212708, "learning_rate": 2.193220953416712e-05, "loss": 0.7002, "step": 2420 }, { "epoch": 0.21977524016675729, "grad_norm": 0.8938416838645935, "learning_rate": 2.197752401667573e-05, "loss": 0.7084, "step": 2425 }, { "epoch": 0.22022838499184338, "grad_norm": 0.8822945356369019, "learning_rate": 2.202283849918434e-05, "loss": 0.6, "step": 2430 }, { "epoch": 0.22068152981692948, "grad_norm": 0.8902570605278015, "learning_rate": 2.206815298169295e-05, "loss": 0.6404, "step": 2435 }, { "epoch": 0.22113467464201558, "grad_norm": 0.8197152018547058, "learning_rate": 2.211346746420156e-05, "loss": 0.6246, "step": 2440 }, { "epoch": 0.22158781946710168, "grad_norm": 1.4695484638214111, "learning_rate": 2.215878194671017e-05, "loss": 0.6689, "step": 2445 }, { "epoch": 0.22204096429218778, "grad_norm": 0.8586527705192566, "learning_rate": 2.220409642921878e-05, "loss": 0.6667, "step": 2450 }, { "epoch": 0.22249410911727388, "grad_norm": 0.8077982664108276, "learning_rate": 2.224941091172739e-05, "loss": 0.6364, "step": 2455 }, { "epoch": 0.22294725394235998, "grad_norm": 0.8835175037384033, "learning_rate": 2.2294725394236e-05, "loss": 0.6998, "step": 2460 }, { "epoch": 0.22340039876744608, "grad_norm": 0.7806249856948853, "learning_rate": 2.234003987674461e-05, "loss": 0.6991, "step": 2465 }, { "epoch": 0.22385354359253218, "grad_norm": 0.8322763442993164, "learning_rate": 2.2385354359253217e-05, "loss": 0.6782, "step": 2470 }, { "epoch": 0.22430668841761828, "grad_norm": 0.8225110173225403, "learning_rate": 2.243066884176183e-05, "loss": 0.6019, "step": 2475 }, { "epoch": 0.22475983324270438, "grad_norm": 0.8543949127197266, "learning_rate": 2.247598332427044e-05, "loss": 0.6279, "step": 2480 }, { "epoch": 0.22521297806779048, "grad_norm": 0.8157056570053101, "learning_rate": 2.2521297806779047e-05, "loss": 0.6568, "step": 2485 }, { "epoch": 0.22566612289287658, "grad_norm": 0.8464009761810303, "learning_rate": 2.256661228928766e-05, "loss": 0.6411, "step": 2490 }, { "epoch": 0.22611926771796267, "grad_norm": 0.9009828567504883, "learning_rate": 2.261192677179627e-05, "loss": 0.6667, "step": 2495 }, { "epoch": 0.22657241254304875, "grad_norm": 0.8398420214653015, "learning_rate": 2.2657241254304877e-05, "loss": 0.6475, "step": 2500 }, { "epoch": 0.22702555736813484, "grad_norm": 0.8426862955093384, "learning_rate": 2.2702555736813485e-05, "loss": 0.6827, "step": 2505 }, { "epoch": 0.22747870219322094, "grad_norm": 0.8399500846862793, "learning_rate": 2.2747870219322096e-05, "loss": 0.6845, "step": 2510 }, { "epoch": 0.22793184701830704, "grad_norm": 0.8694862127304077, "learning_rate": 2.2793184701830704e-05, "loss": 0.668, "step": 2515 }, { "epoch": 0.22838499184339314, "grad_norm": 0.8356603384017944, "learning_rate": 2.2838499184339315e-05, "loss": 0.6624, "step": 2520 }, { "epoch": 0.22883813666847924, "grad_norm": 0.8476899266242981, "learning_rate": 2.2883813666847926e-05, "loss": 0.6183, "step": 2525 }, { "epoch": 0.22929128149356534, "grad_norm": 0.84376060962677, "learning_rate": 2.2929128149356534e-05, "loss": 0.6367, "step": 2530 }, { "epoch": 0.22974442631865144, "grad_norm": 0.7703520655632019, "learning_rate": 2.2974442631865145e-05, "loss": 0.6151, "step": 2535 }, { "epoch": 0.23019757114373754, "grad_norm": 0.9314011931419373, "learning_rate": 2.3019757114373756e-05, "loss": 0.6617, "step": 2540 }, { "epoch": 0.23065071596882364, "grad_norm": 0.8471437692642212, "learning_rate": 2.3065071596882364e-05, "loss": 0.6456, "step": 2545 }, { "epoch": 0.23110386079390974, "grad_norm": 0.8297092914581299, "learning_rate": 2.3110386079390975e-05, "loss": 0.6278, "step": 2550 }, { "epoch": 0.23155700561899584, "grad_norm": 0.8798389434814453, "learning_rate": 2.3155700561899586e-05, "loss": 0.6569, "step": 2555 }, { "epoch": 0.23201015044408194, "grad_norm": 0.8150957226753235, "learning_rate": 2.3201015044408194e-05, "loss": 0.7267, "step": 2560 }, { "epoch": 0.23246329526916804, "grad_norm": 0.8452844023704529, "learning_rate": 2.3246329526916805e-05, "loss": 0.6333, "step": 2565 }, { "epoch": 0.23291644009425413, "grad_norm": 0.8448356986045837, "learning_rate": 2.3291644009425416e-05, "loss": 0.6872, "step": 2570 }, { "epoch": 0.23336958491934023, "grad_norm": 0.801586925983429, "learning_rate": 2.3336958491934024e-05, "loss": 0.7287, "step": 2575 }, { "epoch": 0.2338227297444263, "grad_norm": 0.7937217354774475, "learning_rate": 2.338227297444263e-05, "loss": 0.6679, "step": 2580 }, { "epoch": 0.2342758745695124, "grad_norm": 0.9758570194244385, "learning_rate": 2.3427587456951243e-05, "loss": 0.6212, "step": 2585 }, { "epoch": 0.2347290193945985, "grad_norm": 0.8108507394790649, "learning_rate": 2.347290193945985e-05, "loss": 0.6578, "step": 2590 }, { "epoch": 0.2351821642196846, "grad_norm": 0.8137131333351135, "learning_rate": 2.351821642196846e-05, "loss": 0.6489, "step": 2595 }, { "epoch": 0.2356353090447707, "grad_norm": 0.8310118913650513, "learning_rate": 2.3563530904477073e-05, "loss": 0.6511, "step": 2600 }, { "epoch": 0.2360884538698568, "grad_norm": 0.7360436320304871, "learning_rate": 2.360884538698568e-05, "loss": 0.6424, "step": 2605 }, { "epoch": 0.2365415986949429, "grad_norm": 0.7743526101112366, "learning_rate": 2.365415986949429e-05, "loss": 0.6982, "step": 2610 }, { "epoch": 0.236994743520029, "grad_norm": 0.8738293647766113, "learning_rate": 2.3699474352002903e-05, "loss": 0.6617, "step": 2615 }, { "epoch": 0.2374478883451151, "grad_norm": 0.8241714835166931, "learning_rate": 2.374478883451151e-05, "loss": 0.6787, "step": 2620 }, { "epoch": 0.2379010331702012, "grad_norm": 0.8266722559928894, "learning_rate": 2.379010331702012e-05, "loss": 0.6879, "step": 2625 }, { "epoch": 0.2383541779952873, "grad_norm": 0.8683573603630066, "learning_rate": 2.383541779952873e-05, "loss": 0.6518, "step": 2630 }, { "epoch": 0.2388073228203734, "grad_norm": 0.7869330644607544, "learning_rate": 2.388073228203734e-05, "loss": 0.666, "step": 2635 }, { "epoch": 0.2392604676454595, "grad_norm": 0.841697096824646, "learning_rate": 2.392604676454595e-05, "loss": 0.6113, "step": 2640 }, { "epoch": 0.2397136124705456, "grad_norm": 0.7877386808395386, "learning_rate": 2.397136124705456e-05, "loss": 0.5946, "step": 2645 }, { "epoch": 0.2401667572956317, "grad_norm": 0.8938409686088562, "learning_rate": 2.401667572956317e-05, "loss": 0.6303, "step": 2650 }, { "epoch": 0.2406199021207178, "grad_norm": 0.818673849105835, "learning_rate": 2.406199021207178e-05, "loss": 0.6378, "step": 2655 }, { "epoch": 0.2410730469458039, "grad_norm": 0.8725866079330444, "learning_rate": 2.410730469458039e-05, "loss": 0.6815, "step": 2660 }, { "epoch": 0.24152619177088996, "grad_norm": 0.7914195656776428, "learning_rate": 2.4152619177088997e-05, "loss": 0.6495, "step": 2665 }, { "epoch": 0.24197933659597606, "grad_norm": 0.8092657327651978, "learning_rate": 2.4197933659597608e-05, "loss": 0.6617, "step": 2670 }, { "epoch": 0.24243248142106216, "grad_norm": 0.8270645141601562, "learning_rate": 2.4243248142106216e-05, "loss": 0.6518, "step": 2675 }, { "epoch": 0.24288562624614826, "grad_norm": 0.7582054734230042, "learning_rate": 2.4288562624614827e-05, "loss": 0.6633, "step": 2680 }, { "epoch": 0.24333877107123436, "grad_norm": 1.052982211112976, "learning_rate": 2.4333877107123438e-05, "loss": 0.6543, "step": 2685 }, { "epoch": 0.24379191589632046, "grad_norm": 0.834418773651123, "learning_rate": 2.4379191589632046e-05, "loss": 0.6885, "step": 2690 }, { "epoch": 0.24424506072140656, "grad_norm": 0.9168856739997864, "learning_rate": 2.4424506072140657e-05, "loss": 0.7048, "step": 2695 }, { "epoch": 0.24469820554649266, "grad_norm": 0.8269613981246948, "learning_rate": 2.4469820554649268e-05, "loss": 0.692, "step": 2700 }, { "epoch": 0.24515135037157876, "grad_norm": 0.8005282878875732, "learning_rate": 2.4515135037157876e-05, "loss": 0.665, "step": 2705 }, { "epoch": 0.24560449519666486, "grad_norm": 0.8649964928627014, "learning_rate": 2.4560449519666487e-05, "loss": 0.6792, "step": 2710 }, { "epoch": 0.24605764002175096, "grad_norm": 0.8045487999916077, "learning_rate": 2.4605764002175098e-05, "loss": 0.6469, "step": 2715 }, { "epoch": 0.24651078484683706, "grad_norm": 0.8137491345405579, "learning_rate": 2.4651078484683706e-05, "loss": 0.6667, "step": 2720 }, { "epoch": 0.24696392967192315, "grad_norm": 0.8273957371711731, "learning_rate": 2.4696392967192317e-05, "loss": 0.6613, "step": 2725 }, { "epoch": 0.24741707449700925, "grad_norm": 0.8856151700019836, "learning_rate": 2.4741707449700928e-05, "loss": 0.6238, "step": 2730 }, { "epoch": 0.24787021932209535, "grad_norm": 0.864784836769104, "learning_rate": 2.4787021932209536e-05, "loss": 0.6733, "step": 2735 }, { "epoch": 0.24832336414718145, "grad_norm": 0.8557373881340027, "learning_rate": 2.4832336414718147e-05, "loss": 0.6136, "step": 2740 }, { "epoch": 0.24877650897226752, "grad_norm": 0.8231181502342224, "learning_rate": 2.4877650897226755e-05, "loss": 0.6906, "step": 2745 }, { "epoch": 0.24922965379735362, "grad_norm": 0.8005005121231079, "learning_rate": 2.4922965379735363e-05, "loss": 0.6656, "step": 2750 }, { "epoch": 0.24968279862243972, "grad_norm": 0.8577089905738831, "learning_rate": 2.4968279862243974e-05, "loss": 0.6313, "step": 2755 }, { "epoch": 0.2501359434475258, "grad_norm": 0.823160707950592, "learning_rate": 2.5013594344752585e-05, "loss": 0.6887, "step": 2760 }, { "epoch": 0.25058908827261195, "grad_norm": 0.7780691981315613, "learning_rate": 2.5058908827261196e-05, "loss": 0.6306, "step": 2765 }, { "epoch": 0.251042233097698, "grad_norm": 0.8333734273910522, "learning_rate": 2.5104223309769804e-05, "loss": 0.6643, "step": 2770 }, { "epoch": 0.25149537792278415, "grad_norm": 0.8704510927200317, "learning_rate": 2.5149537792278415e-05, "loss": 0.6258, "step": 2775 }, { "epoch": 0.2519485227478702, "grad_norm": 0.8648356795310974, "learning_rate": 2.5194852274787023e-05, "loss": 0.6823, "step": 2780 }, { "epoch": 0.2524016675729563, "grad_norm": 0.8057388067245483, "learning_rate": 2.524016675729563e-05, "loss": 0.6569, "step": 2785 }, { "epoch": 0.2528548123980424, "grad_norm": 0.7853803634643555, "learning_rate": 2.528548123980424e-05, "loss": 0.6742, "step": 2790 }, { "epoch": 0.2533079572231285, "grad_norm": 0.8535770177841187, "learning_rate": 2.533079572231285e-05, "loss": 0.6722, "step": 2795 }, { "epoch": 0.2537611020482146, "grad_norm": 0.8812286257743835, "learning_rate": 2.5376110204821464e-05, "loss": 0.621, "step": 2800 }, { "epoch": 0.2542142468733007, "grad_norm": 0.8805758357048035, "learning_rate": 2.542142468733007e-05, "loss": 0.5898, "step": 2805 }, { "epoch": 0.2546673916983868, "grad_norm": 0.860139012336731, "learning_rate": 2.5466739169838683e-05, "loss": 0.6797, "step": 2810 }, { "epoch": 0.2551205365234729, "grad_norm": 0.7791703343391418, "learning_rate": 2.551205365234729e-05, "loss": 0.6361, "step": 2815 }, { "epoch": 0.255573681348559, "grad_norm": 0.850843608379364, "learning_rate": 2.55573681348559e-05, "loss": 0.6521, "step": 2820 }, { "epoch": 0.2560268261736451, "grad_norm": 0.8939759135246277, "learning_rate": 2.560268261736451e-05, "loss": 0.642, "step": 2825 }, { "epoch": 0.2564799709987312, "grad_norm": 0.9206168055534363, "learning_rate": 2.5647997099873124e-05, "loss": 0.6545, "step": 2830 }, { "epoch": 0.2569331158238173, "grad_norm": 0.8003260493278503, "learning_rate": 2.5693311582381728e-05, "loss": 0.6632, "step": 2835 }, { "epoch": 0.2573862606489034, "grad_norm": 0.8521022796630859, "learning_rate": 2.5738626064890343e-05, "loss": 0.5961, "step": 2840 }, { "epoch": 0.2578394054739895, "grad_norm": 0.7925294637680054, "learning_rate": 2.578394054739895e-05, "loss": 0.6175, "step": 2845 }, { "epoch": 0.2582925502990756, "grad_norm": 0.8526947498321533, "learning_rate": 2.582925502990756e-05, "loss": 0.6686, "step": 2850 }, { "epoch": 0.2587456951241617, "grad_norm": 0.8025302886962891, "learning_rate": 2.587456951241617e-05, "loss": 0.6325, "step": 2855 }, { "epoch": 0.2591988399492478, "grad_norm": 0.8288617730140686, "learning_rate": 2.591988399492478e-05, "loss": 0.6878, "step": 2860 }, { "epoch": 0.2596519847743339, "grad_norm": 0.9675948619842529, "learning_rate": 2.5965198477433388e-05, "loss": 0.6946, "step": 2865 }, { "epoch": 0.26010512959941995, "grad_norm": 0.8804870247840881, "learning_rate": 2.6010512959941996e-05, "loss": 0.6414, "step": 2870 }, { "epoch": 0.2605582744245061, "grad_norm": 0.85069340467453, "learning_rate": 2.605582744245061e-05, "loss": 0.6924, "step": 2875 }, { "epoch": 0.26101141924959215, "grad_norm": 0.849820613861084, "learning_rate": 2.6101141924959215e-05, "loss": 0.6678, "step": 2880 }, { "epoch": 0.2614645640746783, "grad_norm": 0.8781740665435791, "learning_rate": 2.614645640746783e-05, "loss": 0.7116, "step": 2885 }, { "epoch": 0.26191770889976435, "grad_norm": 0.8902778029441833, "learning_rate": 2.6191770889976437e-05, "loss": 0.6122, "step": 2890 }, { "epoch": 0.2623708537248505, "grad_norm": 0.8102195262908936, "learning_rate": 2.6237085372485048e-05, "loss": 0.6352, "step": 2895 }, { "epoch": 0.26282399854993654, "grad_norm": 0.8478800058364868, "learning_rate": 2.6282399854993656e-05, "loss": 0.7248, "step": 2900 }, { "epoch": 0.26327714337502267, "grad_norm": 0.8797401189804077, "learning_rate": 2.6327714337502267e-05, "loss": 0.6687, "step": 2905 }, { "epoch": 0.26373028820010874, "grad_norm": 0.8777120113372803, "learning_rate": 2.6373028820010875e-05, "loss": 0.6409, "step": 2910 }, { "epoch": 0.26418343302519487, "grad_norm": 1.1650631427764893, "learning_rate": 2.641834330251949e-05, "loss": 0.6306, "step": 2915 }, { "epoch": 0.26463657785028094, "grad_norm": 0.8136563897132874, "learning_rate": 2.6463657785028097e-05, "loss": 0.5846, "step": 2920 }, { "epoch": 0.26508972267536707, "grad_norm": 0.8808854818344116, "learning_rate": 2.6508972267536708e-05, "loss": 0.6175, "step": 2925 }, { "epoch": 0.26554286750045314, "grad_norm": 0.8315426111221313, "learning_rate": 2.6554286750045316e-05, "loss": 0.7147, "step": 2930 }, { "epoch": 0.26599601232553927, "grad_norm": 0.8252599835395813, "learning_rate": 2.6599601232553927e-05, "loss": 0.6108, "step": 2935 }, { "epoch": 0.26644915715062534, "grad_norm": 0.7700263857841492, "learning_rate": 2.6644915715062535e-05, "loss": 0.5666, "step": 2940 }, { "epoch": 0.26690230197571146, "grad_norm": 0.8237745761871338, "learning_rate": 2.669023019757115e-05, "loss": 0.7214, "step": 2945 }, { "epoch": 0.26735544680079754, "grad_norm": 0.8432348370552063, "learning_rate": 2.6735544680079754e-05, "loss": 0.7115, "step": 2950 }, { "epoch": 0.2678085916258836, "grad_norm": 0.8238469362258911, "learning_rate": 2.678085916258836e-05, "loss": 0.6649, "step": 2955 }, { "epoch": 0.26826173645096973, "grad_norm": 0.8553200960159302, "learning_rate": 2.6826173645096976e-05, "loss": 0.643, "step": 2960 }, { "epoch": 0.2687148812760558, "grad_norm": 0.792320728302002, "learning_rate": 2.687148812760558e-05, "loss": 0.6454, "step": 2965 }, { "epoch": 0.26916802610114193, "grad_norm": 0.7937535643577576, "learning_rate": 2.6916802610114195e-05, "loss": 0.6093, "step": 2970 }, { "epoch": 0.269621170926228, "grad_norm": 0.9111419320106506, "learning_rate": 2.6962117092622802e-05, "loss": 0.6277, "step": 2975 }, { "epoch": 0.27007431575131413, "grad_norm": 0.830481767654419, "learning_rate": 2.7007431575131414e-05, "loss": 0.5856, "step": 2980 }, { "epoch": 0.2705274605764002, "grad_norm": 0.8587421774864197, "learning_rate": 2.705274605764002e-05, "loss": 0.6151, "step": 2985 }, { "epoch": 0.27098060540148633, "grad_norm": 0.8051192760467529, "learning_rate": 2.7098060540148636e-05, "loss": 0.576, "step": 2990 }, { "epoch": 0.2714337502265724, "grad_norm": 0.986095130443573, "learning_rate": 2.714337502265724e-05, "loss": 0.6143, "step": 2995 }, { "epoch": 0.27188689505165853, "grad_norm": 0.9789834022521973, "learning_rate": 2.7188689505165855e-05, "loss": 0.6428, "step": 3000 }, { "epoch": 0.2723400398767446, "grad_norm": 0.8478959798812866, "learning_rate": 2.7234003987674462e-05, "loss": 0.6291, "step": 3005 }, { "epoch": 0.2727931847018307, "grad_norm": 0.791316568851471, "learning_rate": 2.7279318470183074e-05, "loss": 0.5905, "step": 3010 }, { "epoch": 0.2732463295269168, "grad_norm": 0.8786819577217102, "learning_rate": 2.732463295269168e-05, "loss": 0.6846, "step": 3015 }, { "epoch": 0.2736994743520029, "grad_norm": 0.8403626084327698, "learning_rate": 2.7369947435200292e-05, "loss": 0.6568, "step": 3020 }, { "epoch": 0.274152619177089, "grad_norm": 0.7959733605384827, "learning_rate": 2.74152619177089e-05, "loss": 0.6219, "step": 3025 }, { "epoch": 0.27460576400217507, "grad_norm": 0.8133234977722168, "learning_rate": 2.7460576400217508e-05, "loss": 0.6831, "step": 3030 }, { "epoch": 0.2750589088272612, "grad_norm": 0.8181875348091125, "learning_rate": 2.7505890882726122e-05, "loss": 0.6876, "step": 3035 }, { "epoch": 0.27551205365234727, "grad_norm": 0.9131973385810852, "learning_rate": 2.7551205365234727e-05, "loss": 0.6494, "step": 3040 }, { "epoch": 0.2759651984774334, "grad_norm": 0.8100696206092834, "learning_rate": 2.759651984774334e-05, "loss": 0.694, "step": 3045 }, { "epoch": 0.27641834330251946, "grad_norm": 0.9069897532463074, "learning_rate": 2.764183433025195e-05, "loss": 0.6032, "step": 3050 }, { "epoch": 0.2768714881276056, "grad_norm": 0.8337202668190002, "learning_rate": 2.768714881276056e-05, "loss": 0.6183, "step": 3055 }, { "epoch": 0.27732463295269166, "grad_norm": 0.9358482956886292, "learning_rate": 2.7732463295269168e-05, "loss": 0.6494, "step": 3060 }, { "epoch": 0.2777777777777778, "grad_norm": 0.7746974229812622, "learning_rate": 2.777777777777778e-05, "loss": 0.6484, "step": 3065 }, { "epoch": 0.27823092260286386, "grad_norm": 0.8657786250114441, "learning_rate": 2.7823092260286387e-05, "loss": 0.6899, "step": 3070 }, { "epoch": 0.27868406742795, "grad_norm": 0.8454146981239319, "learning_rate": 2.7868406742795e-05, "loss": 0.7241, "step": 3075 }, { "epoch": 0.27913721225303606, "grad_norm": 0.7200838923454285, "learning_rate": 2.7913721225303606e-05, "loss": 0.599, "step": 3080 }, { "epoch": 0.2795903570781222, "grad_norm": 0.8424976468086243, "learning_rate": 2.795903570781222e-05, "loss": 0.6066, "step": 3085 }, { "epoch": 0.28004350190320826, "grad_norm": 0.8442434668540955, "learning_rate": 2.8004350190320828e-05, "loss": 0.686, "step": 3090 }, { "epoch": 0.2804966467282944, "grad_norm": 0.8483108282089233, "learning_rate": 2.804966467282944e-05, "loss": 0.6331, "step": 3095 }, { "epoch": 0.28094979155338046, "grad_norm": 0.856548011302948, "learning_rate": 2.8094979155338047e-05, "loss": 0.6027, "step": 3100 }, { "epoch": 0.2814029363784666, "grad_norm": 0.7788861393928528, "learning_rate": 2.814029363784666e-05, "loss": 0.6528, "step": 3105 }, { "epoch": 0.28185608120355266, "grad_norm": 0.8453931212425232, "learning_rate": 2.8185608120355266e-05, "loss": 0.6748, "step": 3110 }, { "epoch": 0.2823092260286387, "grad_norm": 0.8613355755805969, "learning_rate": 2.8230922602863873e-05, "loss": 0.6292, "step": 3115 }, { "epoch": 0.28276237085372485, "grad_norm": 0.8871157765388489, "learning_rate": 2.8276237085372488e-05, "loss": 0.6689, "step": 3120 }, { "epoch": 0.2832155156788109, "grad_norm": 0.814266562461853, "learning_rate": 2.8321551567881092e-05, "loss": 0.5757, "step": 3125 }, { "epoch": 0.28366866050389705, "grad_norm": 0.9020169377326965, "learning_rate": 2.8366866050389707e-05, "loss": 0.644, "step": 3130 }, { "epoch": 0.2841218053289831, "grad_norm": 0.8244061470031738, "learning_rate": 2.8412180532898315e-05, "loss": 0.674, "step": 3135 }, { "epoch": 0.28457495015406925, "grad_norm": 0.7528793215751648, "learning_rate": 2.8457495015406926e-05, "loss": 0.6711, "step": 3140 }, { "epoch": 0.2850280949791553, "grad_norm": 0.798069953918457, "learning_rate": 2.8502809497915533e-05, "loss": 0.6686, "step": 3145 }, { "epoch": 0.28548123980424145, "grad_norm": 0.8990511894226074, "learning_rate": 2.8548123980424145e-05, "loss": 0.6357, "step": 3150 }, { "epoch": 0.2859343846293275, "grad_norm": 0.8208374977111816, "learning_rate": 2.8593438462932752e-05, "loss": 0.6087, "step": 3155 }, { "epoch": 0.28638752945441365, "grad_norm": 0.8061433434486389, "learning_rate": 2.8638752945441367e-05, "loss": 0.6332, "step": 3160 }, { "epoch": 0.2868406742794997, "grad_norm": 0.87767493724823, "learning_rate": 2.8684067427949975e-05, "loss": 0.5857, "step": 3165 }, { "epoch": 0.28729381910458585, "grad_norm": 0.8736397624015808, "learning_rate": 2.8729381910458586e-05, "loss": 0.5992, "step": 3170 }, { "epoch": 0.2877469639296719, "grad_norm": 0.804164707660675, "learning_rate": 2.8774696392967193e-05, "loss": 0.6735, "step": 3175 }, { "epoch": 0.28820010875475804, "grad_norm": 0.7984071373939514, "learning_rate": 2.8820010875475805e-05, "loss": 0.6238, "step": 3180 }, { "epoch": 0.2886532535798441, "grad_norm": 0.9303390979766846, "learning_rate": 2.8865325357984412e-05, "loss": 0.6701, "step": 3185 }, { "epoch": 0.28910639840493024, "grad_norm": 0.8673346638679504, "learning_rate": 2.8910639840493027e-05, "loss": 0.6131, "step": 3190 }, { "epoch": 0.2895595432300163, "grad_norm": 1.0721712112426758, "learning_rate": 2.895595432300163e-05, "loss": 0.6068, "step": 3195 }, { "epoch": 0.2900126880551024, "grad_norm": 0.8383752107620239, "learning_rate": 2.900126880551024e-05, "loss": 0.6665, "step": 3200 }, { "epoch": 0.2904658328801885, "grad_norm": 0.8784968256950378, "learning_rate": 2.9046583288018853e-05, "loss": 0.5902, "step": 3205 }, { "epoch": 0.2909189777052746, "grad_norm": 0.7875505089759827, "learning_rate": 2.909189777052746e-05, "loss": 0.6094, "step": 3210 }, { "epoch": 0.2913721225303607, "grad_norm": 0.8445430994033813, "learning_rate": 2.9137212253036072e-05, "loss": 0.6045, "step": 3215 }, { "epoch": 0.2918252673554468, "grad_norm": 0.8291459083557129, "learning_rate": 2.918252673554468e-05, "loss": 0.6762, "step": 3220 }, { "epoch": 0.2922784121805329, "grad_norm": 0.7236889600753784, "learning_rate": 2.922784121805329e-05, "loss": 0.5804, "step": 3225 }, { "epoch": 0.292731557005619, "grad_norm": 0.9010696411132812, "learning_rate": 2.92731557005619e-05, "loss": 0.6559, "step": 3230 }, { "epoch": 0.2931847018307051, "grad_norm": 0.8579304814338684, "learning_rate": 2.9318470183070513e-05, "loss": 0.6829, "step": 3235 }, { "epoch": 0.2936378466557912, "grad_norm": 0.8056133985519409, "learning_rate": 2.9363784665579118e-05, "loss": 0.6425, "step": 3240 }, { "epoch": 0.2940909914808773, "grad_norm": 0.8115129470825195, "learning_rate": 2.9409099148087732e-05, "loss": 0.6627, "step": 3245 }, { "epoch": 0.2945441363059634, "grad_norm": 0.8319226503372192, "learning_rate": 2.945441363059634e-05, "loss": 0.6328, "step": 3250 }, { "epoch": 0.2949972811310495, "grad_norm": 0.9491512775421143, "learning_rate": 2.949972811310495e-05, "loss": 0.663, "step": 3255 }, { "epoch": 0.2954504259561356, "grad_norm": 0.8371458649635315, "learning_rate": 2.954504259561356e-05, "loss": 0.6324, "step": 3260 }, { "epoch": 0.2959035707812217, "grad_norm": 0.8413903117179871, "learning_rate": 2.959035707812217e-05, "loss": 0.6603, "step": 3265 }, { "epoch": 0.2963567156063078, "grad_norm": 0.936238706111908, "learning_rate": 2.9635671560630778e-05, "loss": 0.6513, "step": 3270 }, { "epoch": 0.29680986043139385, "grad_norm": 0.9652707576751709, "learning_rate": 2.9680986043139386e-05, "loss": 0.589, "step": 3275 }, { "epoch": 0.29726300525648, "grad_norm": 0.8055279850959778, "learning_rate": 2.9726300525648e-05, "loss": 0.6018, "step": 3280 }, { "epoch": 0.29771615008156604, "grad_norm": 0.8390842080116272, "learning_rate": 2.9771615008156604e-05, "loss": 0.6849, "step": 3285 }, { "epoch": 0.29816929490665217, "grad_norm": 0.8698114156723022, "learning_rate": 2.981692949066522e-05, "loss": 0.6702, "step": 3290 }, { "epoch": 0.29862243973173824, "grad_norm": 0.8313329219818115, "learning_rate": 2.9862243973173827e-05, "loss": 0.6122, "step": 3295 }, { "epoch": 0.29907558455682437, "grad_norm": 0.7872563004493713, "learning_rate": 2.9907558455682438e-05, "loss": 0.5822, "step": 3300 }, { "epoch": 0.29952872938191044, "grad_norm": 0.9620490670204163, "learning_rate": 2.9952872938191046e-05, "loss": 0.6591, "step": 3305 }, { "epoch": 0.29998187420699657, "grad_norm": 0.8685542941093445, "learning_rate": 2.9998187420699657e-05, "loss": 0.6265, "step": 3310 }, { "epoch": 0.30043501903208264, "grad_norm": 0.8274585604667664, "learning_rate": 3.0043501903208264e-05, "loss": 0.6162, "step": 3315 }, { "epoch": 0.30088816385716877, "grad_norm": 0.8368684649467468, "learning_rate": 3.008881638571688e-05, "loss": 0.6404, "step": 3320 }, { "epoch": 0.30134130868225484, "grad_norm": 1.0007960796356201, "learning_rate": 3.0134130868225487e-05, "loss": 0.629, "step": 3325 }, { "epoch": 0.30179445350734097, "grad_norm": 0.8654600977897644, "learning_rate": 3.0179445350734098e-05, "loss": 0.6588, "step": 3330 }, { "epoch": 0.30224759833242704, "grad_norm": 0.824070155620575, "learning_rate": 3.0224759833242706e-05, "loss": 0.6698, "step": 3335 }, { "epoch": 0.30270074315751316, "grad_norm": 0.871516764163971, "learning_rate": 3.0270074315751317e-05, "loss": 0.6048, "step": 3340 }, { "epoch": 0.30315388798259923, "grad_norm": 0.8736647367477417, "learning_rate": 3.0315388798259924e-05, "loss": 0.6291, "step": 3345 }, { "epoch": 0.30360703280768536, "grad_norm": 0.784710705280304, "learning_rate": 3.036070328076854e-05, "loss": 0.6702, "step": 3350 }, { "epoch": 0.30406017763277143, "grad_norm": 0.8888075351715088, "learning_rate": 3.0406017763277143e-05, "loss": 0.6215, "step": 3355 }, { "epoch": 0.3045133224578575, "grad_norm": 0.8123961091041565, "learning_rate": 3.045133224578575e-05, "loss": 0.6002, "step": 3360 }, { "epoch": 0.30496646728294363, "grad_norm": 0.8412827849388123, "learning_rate": 3.0496646728294366e-05, "loss": 0.6224, "step": 3365 }, { "epoch": 0.3054196121080297, "grad_norm": 0.8708664178848267, "learning_rate": 3.054196121080297e-05, "loss": 0.6707, "step": 3370 }, { "epoch": 0.30587275693311583, "grad_norm": 0.8351549506187439, "learning_rate": 3.058727569331159e-05, "loss": 0.6528, "step": 3375 }, { "epoch": 0.3063259017582019, "grad_norm": 0.8377884030342102, "learning_rate": 3.063259017582019e-05, "loss": 0.6545, "step": 3380 }, { "epoch": 0.30677904658328803, "grad_norm": 0.7602893114089966, "learning_rate": 3.06779046583288e-05, "loss": 0.5988, "step": 3385 }, { "epoch": 0.3072321914083741, "grad_norm": 0.7977021932601929, "learning_rate": 3.072321914083741e-05, "loss": 0.6238, "step": 3390 }, { "epoch": 0.3076853362334602, "grad_norm": 0.8824435472488403, "learning_rate": 3.0768533623346026e-05, "loss": 0.6777, "step": 3395 }, { "epoch": 0.3081384810585463, "grad_norm": 0.9607120752334595, "learning_rate": 3.081384810585463e-05, "loss": 0.6827, "step": 3400 }, { "epoch": 0.3085916258836324, "grad_norm": 0.8034214973449707, "learning_rate": 3.085916258836324e-05, "loss": 0.6879, "step": 3405 }, { "epoch": 0.3090447707087185, "grad_norm": 0.8144966959953308, "learning_rate": 3.090447707087185e-05, "loss": 0.6649, "step": 3410 }, { "epoch": 0.3094979155338046, "grad_norm": 0.8070133924484253, "learning_rate": 3.094979155338046e-05, "loss": 0.6284, "step": 3415 }, { "epoch": 0.3099510603588907, "grad_norm": 0.8349020481109619, "learning_rate": 3.099510603588907e-05, "loss": 0.6319, "step": 3420 }, { "epoch": 0.3104042051839768, "grad_norm": 0.8011594414710999, "learning_rate": 3.1040420518397686e-05, "loss": 0.6673, "step": 3425 }, { "epoch": 0.3108573500090629, "grad_norm": 0.9199284315109253, "learning_rate": 3.108573500090629e-05, "loss": 0.5846, "step": 3430 }, { "epoch": 0.311310494834149, "grad_norm": 0.85769122838974, "learning_rate": 3.11310494834149e-05, "loss": 0.651, "step": 3435 }, { "epoch": 0.3117636396592351, "grad_norm": 0.8905433416366577, "learning_rate": 3.117636396592351e-05, "loss": 0.703, "step": 3440 }, { "epoch": 0.31221678448432116, "grad_norm": 0.8247604370117188, "learning_rate": 3.1221678448432117e-05, "loss": 0.633, "step": 3445 }, { "epoch": 0.3126699293094073, "grad_norm": 0.9443382024765015, "learning_rate": 3.126699293094073e-05, "loss": 0.623, "step": 3450 }, { "epoch": 0.31312307413449336, "grad_norm": 0.9014633893966675, "learning_rate": 3.131230741344934e-05, "loss": 0.7325, "step": 3455 }, { "epoch": 0.3135762189595795, "grad_norm": 0.8866268992424011, "learning_rate": 3.135762189595795e-05, "loss": 0.6099, "step": 3460 }, { "epoch": 0.31402936378466556, "grad_norm": 0.8426503539085388, "learning_rate": 3.1402936378466554e-05, "loss": 0.5808, "step": 3465 }, { "epoch": 0.3144825086097517, "grad_norm": 0.7842848896980286, "learning_rate": 3.144825086097517e-05, "loss": 0.6181, "step": 3470 }, { "epoch": 0.31493565343483776, "grad_norm": 0.9732040762901306, "learning_rate": 3.1493565343483777e-05, "loss": 0.6258, "step": 3475 }, { "epoch": 0.3153887982599239, "grad_norm": 0.8460723161697388, "learning_rate": 3.153887982599239e-05, "loss": 0.6647, "step": 3480 }, { "epoch": 0.31584194308500996, "grad_norm": 0.8441756367683411, "learning_rate": 3.1584194308501e-05, "loss": 0.621, "step": 3485 }, { "epoch": 0.3162950879100961, "grad_norm": 0.8549978137016296, "learning_rate": 3.1629508791009607e-05, "loss": 0.6478, "step": 3490 }, { "epoch": 0.31674823273518216, "grad_norm": 0.8248720765113831, "learning_rate": 3.1674823273518214e-05, "loss": 0.6208, "step": 3495 }, { "epoch": 0.3172013775602683, "grad_norm": 0.8307811617851257, "learning_rate": 3.172013775602683e-05, "loss": 0.657, "step": 3500 }, { "epoch": 0.31765452238535435, "grad_norm": 0.8718565106391907, "learning_rate": 3.1765452238535437e-05, "loss": 0.5573, "step": 3505 }, { "epoch": 0.3181076672104405, "grad_norm": 0.9114343523979187, "learning_rate": 3.181076672104405e-05, "loss": 0.6236, "step": 3510 }, { "epoch": 0.31856081203552655, "grad_norm": 0.7902658581733704, "learning_rate": 3.185608120355266e-05, "loss": 0.6273, "step": 3515 }, { "epoch": 0.3190139568606126, "grad_norm": 0.8523035645484924, "learning_rate": 3.1901395686061267e-05, "loss": 0.6118, "step": 3520 }, { "epoch": 0.31946710168569875, "grad_norm": 0.8429620862007141, "learning_rate": 3.1946710168569874e-05, "loss": 0.6174, "step": 3525 }, { "epoch": 0.3199202465107848, "grad_norm": 0.8206061720848083, "learning_rate": 3.199202465107848e-05, "loss": 0.652, "step": 3530 }, { "epoch": 0.32037339133587095, "grad_norm": 0.8167045712471008, "learning_rate": 3.2037339133587097e-05, "loss": 0.6234, "step": 3535 }, { "epoch": 0.320826536160957, "grad_norm": 0.8826706409454346, "learning_rate": 3.2082653616095704e-05, "loss": 0.6494, "step": 3540 }, { "epoch": 0.32127968098604315, "grad_norm": 0.8240894079208374, "learning_rate": 3.212796809860432e-05, "loss": 0.6518, "step": 3545 }, { "epoch": 0.3217328258111292, "grad_norm": 0.8108679056167603, "learning_rate": 3.2173282581112927e-05, "loss": 0.5751, "step": 3550 }, { "epoch": 0.32218597063621535, "grad_norm": 0.994523823261261, "learning_rate": 3.2218597063621534e-05, "loss": 0.6095, "step": 3555 }, { "epoch": 0.3226391154613014, "grad_norm": 0.9363687634468079, "learning_rate": 3.226391154613014e-05, "loss": 0.5749, "step": 3560 }, { "epoch": 0.32309226028638754, "grad_norm": 0.8055035471916199, "learning_rate": 3.2309226028638757e-05, "loss": 0.6372, "step": 3565 }, { "epoch": 0.3235454051114736, "grad_norm": 0.9263759851455688, "learning_rate": 3.2354540511147364e-05, "loss": 0.6117, "step": 3570 }, { "epoch": 0.32399854993655974, "grad_norm": 0.7871035933494568, "learning_rate": 3.239985499365598e-05, "loss": 0.6166, "step": 3575 }, { "epoch": 0.3244516947616458, "grad_norm": 0.822509229183197, "learning_rate": 3.244516947616458e-05, "loss": 0.6011, "step": 3580 }, { "epoch": 0.32490483958673194, "grad_norm": 0.89741450548172, "learning_rate": 3.2490483958673194e-05, "loss": 0.665, "step": 3585 }, { "epoch": 0.325357984411818, "grad_norm": 0.9035495519638062, "learning_rate": 3.25357984411818e-05, "loss": 0.6148, "step": 3590 }, { "epoch": 0.32581112923690414, "grad_norm": 0.8666448593139648, "learning_rate": 3.2581112923690417e-05, "loss": 0.6696, "step": 3595 }, { "epoch": 0.3262642740619902, "grad_norm": 0.8046849370002747, "learning_rate": 3.2626427406199024e-05, "loss": 0.6321, "step": 3600 }, { "epoch": 0.3267174188870763, "grad_norm": 0.822550356388092, "learning_rate": 3.267174188870763e-05, "loss": 0.6337, "step": 3605 }, { "epoch": 0.3271705637121624, "grad_norm": 0.8738881945610046, "learning_rate": 3.271705637121624e-05, "loss": 0.6225, "step": 3610 }, { "epoch": 0.3276237085372485, "grad_norm": 0.8612209558486938, "learning_rate": 3.276237085372485e-05, "loss": 0.692, "step": 3615 }, { "epoch": 0.3280768533623346, "grad_norm": 0.8282519578933716, "learning_rate": 3.280768533623346e-05, "loss": 0.6224, "step": 3620 }, { "epoch": 0.3285299981874207, "grad_norm": 0.8017866611480713, "learning_rate": 3.285299981874207e-05, "loss": 0.629, "step": 3625 }, { "epoch": 0.3289831430125068, "grad_norm": 0.8641547560691833, "learning_rate": 3.2898314301250684e-05, "loss": 0.6285, "step": 3630 }, { "epoch": 0.3294362878375929, "grad_norm": 0.7908748388290405, "learning_rate": 3.294362878375929e-05, "loss": 0.6508, "step": 3635 }, { "epoch": 0.329889432662679, "grad_norm": 0.727245569229126, "learning_rate": 3.29889432662679e-05, "loss": 0.6291, "step": 3640 }, { "epoch": 0.3303425774877651, "grad_norm": 0.8771240711212158, "learning_rate": 3.303425774877651e-05, "loss": 0.6515, "step": 3645 }, { "epoch": 0.3307957223128512, "grad_norm": 0.8563861846923828, "learning_rate": 3.307957223128512e-05, "loss": 0.5952, "step": 3650 }, { "epoch": 0.3312488671379373, "grad_norm": 0.9378343820571899, "learning_rate": 3.312488671379373e-05, "loss": 0.6535, "step": 3655 }, { "epoch": 0.3317020119630234, "grad_norm": 0.7621697187423706, "learning_rate": 3.3170201196302344e-05, "loss": 0.5951, "step": 3660 }, { "epoch": 0.3321551567881095, "grad_norm": 0.8767490983009338, "learning_rate": 3.321551567881095e-05, "loss": 0.5734, "step": 3665 }, { "epoch": 0.3326083016131956, "grad_norm": 0.8542325496673584, "learning_rate": 3.326083016131956e-05, "loss": 0.6138, "step": 3670 }, { "epoch": 0.33306144643828167, "grad_norm": 0.8509350419044495, "learning_rate": 3.330614464382817e-05, "loss": 0.6467, "step": 3675 }, { "epoch": 0.3335145912633678, "grad_norm": 0.8304462432861328, "learning_rate": 3.335145912633678e-05, "loss": 0.6227, "step": 3680 }, { "epoch": 0.33396773608845387, "grad_norm": 0.8240889310836792, "learning_rate": 3.339677360884539e-05, "loss": 0.6555, "step": 3685 }, { "epoch": 0.33442088091353994, "grad_norm": 0.9349605441093445, "learning_rate": 3.3442088091354e-05, "loss": 0.5972, "step": 3690 }, { "epoch": 0.33487402573862607, "grad_norm": 0.8495287299156189, "learning_rate": 3.3487402573862605e-05, "loss": 0.6442, "step": 3695 }, { "epoch": 0.33532717056371214, "grad_norm": 0.8439125418663025, "learning_rate": 3.353271705637121e-05, "loss": 0.6827, "step": 3700 }, { "epoch": 0.33578031538879827, "grad_norm": 0.8770002126693726, "learning_rate": 3.357803153887983e-05, "loss": 0.6171, "step": 3705 }, { "epoch": 0.33623346021388434, "grad_norm": 0.8713083863258362, "learning_rate": 3.3623346021388435e-05, "loss": 0.5895, "step": 3710 }, { "epoch": 0.33668660503897047, "grad_norm": 0.9182153940200806, "learning_rate": 3.366866050389705e-05, "loss": 0.6113, "step": 3715 }, { "epoch": 0.33713974986405654, "grad_norm": 0.8559891581535339, "learning_rate": 3.371397498640566e-05, "loss": 0.6247, "step": 3720 }, { "epoch": 0.33759289468914266, "grad_norm": 0.8426746726036072, "learning_rate": 3.3759289468914265e-05, "loss": 0.6131, "step": 3725 }, { "epoch": 0.33804603951422874, "grad_norm": 1.239707112312317, "learning_rate": 3.380460395142287e-05, "loss": 0.6756, "step": 3730 }, { "epoch": 0.33849918433931486, "grad_norm": 0.7899940013885498, "learning_rate": 3.384991843393149e-05, "loss": 0.6218, "step": 3735 }, { "epoch": 0.33895232916440093, "grad_norm": 0.7868574857711792, "learning_rate": 3.3895232916440095e-05, "loss": 0.6076, "step": 3740 }, { "epoch": 0.33940547398948706, "grad_norm": 0.8488652110099792, "learning_rate": 3.394054739894871e-05, "loss": 0.6495, "step": 3745 }, { "epoch": 0.33985861881457313, "grad_norm": 0.8551833629608154, "learning_rate": 3.398586188145732e-05, "loss": 0.6318, "step": 3750 }, { "epoch": 0.34031176363965926, "grad_norm": 0.7660235166549683, "learning_rate": 3.4031176363965925e-05, "loss": 0.6597, "step": 3755 }, { "epoch": 0.34076490846474533, "grad_norm": 0.8442707061767578, "learning_rate": 3.407649084647453e-05, "loss": 0.629, "step": 3760 }, { "epoch": 0.34121805328983146, "grad_norm": 0.802200436592102, "learning_rate": 3.412180532898315e-05, "loss": 0.6373, "step": 3765 }, { "epoch": 0.34167119811491753, "grad_norm": 0.8070580363273621, "learning_rate": 3.4167119811491755e-05, "loss": 0.6557, "step": 3770 }, { "epoch": 0.3421243429400036, "grad_norm": 0.8050976991653442, "learning_rate": 3.421243429400036e-05, "loss": 0.5603, "step": 3775 }, { "epoch": 0.3425774877650897, "grad_norm": 0.925885021686554, "learning_rate": 3.425774877650898e-05, "loss": 0.6388, "step": 3780 }, { "epoch": 0.3430306325901758, "grad_norm": 0.8761688470840454, "learning_rate": 3.430306325901758e-05, "loss": 0.6122, "step": 3785 }, { "epoch": 0.3434837774152619, "grad_norm": 0.924302875995636, "learning_rate": 3.434837774152619e-05, "loss": 0.6167, "step": 3790 }, { "epoch": 0.343936922240348, "grad_norm": 0.7997691035270691, "learning_rate": 3.43936922240348e-05, "loss": 0.6259, "step": 3795 }, { "epoch": 0.3443900670654341, "grad_norm": 0.895696222782135, "learning_rate": 3.4439006706543415e-05, "loss": 0.6064, "step": 3800 }, { "epoch": 0.3448432118905202, "grad_norm": 0.7984565496444702, "learning_rate": 3.448432118905202e-05, "loss": 0.5863, "step": 3805 }, { "epoch": 0.3452963567156063, "grad_norm": 0.8953773379325867, "learning_rate": 3.452963567156063e-05, "loss": 0.6418, "step": 3810 }, { "epoch": 0.3457495015406924, "grad_norm": 0.9657151103019714, "learning_rate": 3.457495015406924e-05, "loss": 0.6526, "step": 3815 }, { "epoch": 0.3462026463657785, "grad_norm": 0.829946756362915, "learning_rate": 3.462026463657785e-05, "loss": 0.6283, "step": 3820 }, { "epoch": 0.3466557911908646, "grad_norm": 0.8669992685317993, "learning_rate": 3.466557911908646e-05, "loss": 0.629, "step": 3825 }, { "epoch": 0.3471089360159507, "grad_norm": 0.8672708868980408, "learning_rate": 3.4710893601595075e-05, "loss": 0.6033, "step": 3830 }, { "epoch": 0.3475620808410368, "grad_norm": 0.7882630228996277, "learning_rate": 3.475620808410368e-05, "loss": 0.6086, "step": 3835 }, { "epoch": 0.3480152256661229, "grad_norm": 0.9506611824035645, "learning_rate": 3.480152256661229e-05, "loss": 0.6125, "step": 3840 }, { "epoch": 0.348468370491209, "grad_norm": 0.8114120364189148, "learning_rate": 3.48468370491209e-05, "loss": 0.6102, "step": 3845 }, { "epoch": 0.34892151531629506, "grad_norm": 0.9222038984298706, "learning_rate": 3.4892151531629506e-05, "loss": 0.6408, "step": 3850 }, { "epoch": 0.3493746601413812, "grad_norm": 0.8297349810600281, "learning_rate": 3.493746601413812e-05, "loss": 0.6347, "step": 3855 }, { "epoch": 0.34982780496646726, "grad_norm": 0.8003364205360413, "learning_rate": 3.498278049664673e-05, "loss": 0.5499, "step": 3860 }, { "epoch": 0.3502809497915534, "grad_norm": 0.8151943683624268, "learning_rate": 3.502809497915534e-05, "loss": 0.6429, "step": 3865 }, { "epoch": 0.35073409461663946, "grad_norm": 0.997219979763031, "learning_rate": 3.507340946166395e-05, "loss": 0.578, "step": 3870 }, { "epoch": 0.3511872394417256, "grad_norm": 0.7695597410202026, "learning_rate": 3.511872394417256e-05, "loss": 0.577, "step": 3875 }, { "epoch": 0.35164038426681166, "grad_norm": 0.810575544834137, "learning_rate": 3.5164038426681166e-05, "loss": 0.6208, "step": 3880 }, { "epoch": 0.3520935290918978, "grad_norm": 0.9196922183036804, "learning_rate": 3.520935290918978e-05, "loss": 0.6164, "step": 3885 }, { "epoch": 0.35254667391698385, "grad_norm": 0.8807030916213989, "learning_rate": 3.525466739169839e-05, "loss": 0.6022, "step": 3890 }, { "epoch": 0.35299981874207, "grad_norm": 0.755882203578949, "learning_rate": 3.5299981874207e-05, "loss": 0.6634, "step": 3895 }, { "epoch": 0.35345296356715605, "grad_norm": 0.9151172041893005, "learning_rate": 3.5345296356715604e-05, "loss": 0.6109, "step": 3900 }, { "epoch": 0.3539061083922422, "grad_norm": 0.9122495055198669, "learning_rate": 3.539061083922422e-05, "loss": 0.6613, "step": 3905 }, { "epoch": 0.35435925321732825, "grad_norm": 0.8455286622047424, "learning_rate": 3.5435925321732826e-05, "loss": 0.6541, "step": 3910 }, { "epoch": 0.3548123980424144, "grad_norm": 0.9664796590805054, "learning_rate": 3.548123980424144e-05, "loss": 0.6165, "step": 3915 }, { "epoch": 0.35526554286750045, "grad_norm": 0.814849853515625, "learning_rate": 3.552655428675005e-05, "loss": 0.6553, "step": 3920 }, { "epoch": 0.3557186876925866, "grad_norm": 0.8316401839256287, "learning_rate": 3.5571868769258656e-05, "loss": 0.6078, "step": 3925 }, { "epoch": 0.35617183251767265, "grad_norm": 1.122188925743103, "learning_rate": 3.5617183251767264e-05, "loss": 0.6175, "step": 3930 }, { "epoch": 0.3566249773427587, "grad_norm": 0.8320538997650146, "learning_rate": 3.566249773427587e-05, "loss": 0.6455, "step": 3935 }, { "epoch": 0.35707812216784485, "grad_norm": 0.8629981875419617, "learning_rate": 3.5707812216784486e-05, "loss": 0.5711, "step": 3940 }, { "epoch": 0.3575312669929309, "grad_norm": 0.824882984161377, "learning_rate": 3.5753126699293094e-05, "loss": 0.6379, "step": 3945 }, { "epoch": 0.35798441181801705, "grad_norm": 0.82892245054245, "learning_rate": 3.579844118180171e-05, "loss": 0.6517, "step": 3950 }, { "epoch": 0.3584375566431031, "grad_norm": 0.853236734867096, "learning_rate": 3.5843755664310316e-05, "loss": 0.6141, "step": 3955 }, { "epoch": 0.35889070146818924, "grad_norm": 0.9237259030342102, "learning_rate": 3.5889070146818924e-05, "loss": 0.6365, "step": 3960 }, { "epoch": 0.3593438462932753, "grad_norm": 0.8446763157844543, "learning_rate": 3.593438462932753e-05, "loss": 0.6099, "step": 3965 }, { "epoch": 0.35979699111836144, "grad_norm": 0.7580060958862305, "learning_rate": 3.5979699111836146e-05, "loss": 0.6341, "step": 3970 }, { "epoch": 0.3602501359434475, "grad_norm": 0.8104161620140076, "learning_rate": 3.6025013594344754e-05, "loss": 0.6324, "step": 3975 }, { "epoch": 0.36070328076853364, "grad_norm": 0.8339964151382446, "learning_rate": 3.607032807685337e-05, "loss": 0.6163, "step": 3980 }, { "epoch": 0.3611564255936197, "grad_norm": 0.8520172834396362, "learning_rate": 3.6115642559361976e-05, "loss": 0.6047, "step": 3985 }, { "epoch": 0.36160957041870584, "grad_norm": 0.8737123608589172, "learning_rate": 3.6160957041870584e-05, "loss": 0.6488, "step": 3990 }, { "epoch": 0.3620627152437919, "grad_norm": 0.8606485724449158, "learning_rate": 3.620627152437919e-05, "loss": 0.6345, "step": 3995 }, { "epoch": 0.36251586006887804, "grad_norm": 0.8314768671989441, "learning_rate": 3.6251586006887806e-05, "loss": 0.6217, "step": 4000 }, { "epoch": 0.3629690048939641, "grad_norm": 0.8592543601989746, "learning_rate": 3.6296900489396414e-05, "loss": 0.6561, "step": 4005 }, { "epoch": 0.36342214971905024, "grad_norm": 0.8402546048164368, "learning_rate": 3.634221497190503e-05, "loss": 0.5423, "step": 4010 }, { "epoch": 0.3638752945441363, "grad_norm": 0.8401572704315186, "learning_rate": 3.638752945441363e-05, "loss": 0.6226, "step": 4015 }, { "epoch": 0.3643284393692224, "grad_norm": 0.871745228767395, "learning_rate": 3.643284393692224e-05, "loss": 0.6227, "step": 4020 }, { "epoch": 0.3647815841943085, "grad_norm": 0.9879257082939148, "learning_rate": 3.647815841943085e-05, "loss": 0.6616, "step": 4025 }, { "epoch": 0.3652347290193946, "grad_norm": 0.9459033012390137, "learning_rate": 3.652347290193946e-05, "loss": 0.5928, "step": 4030 }, { "epoch": 0.3656878738444807, "grad_norm": 1.1753588914871216, "learning_rate": 3.6568787384448074e-05, "loss": 0.6364, "step": 4035 }, { "epoch": 0.3661410186695668, "grad_norm": 0.9474778771400452, "learning_rate": 3.661410186695668e-05, "loss": 0.6115, "step": 4040 }, { "epoch": 0.3665941634946529, "grad_norm": 0.830950915813446, "learning_rate": 3.665941634946529e-05, "loss": 0.6169, "step": 4045 }, { "epoch": 0.367047308319739, "grad_norm": 0.8554236888885498, "learning_rate": 3.67047308319739e-05, "loss": 0.657, "step": 4050 }, { "epoch": 0.3675004531448251, "grad_norm": 0.8651173114776611, "learning_rate": 3.675004531448251e-05, "loss": 0.6378, "step": 4055 }, { "epoch": 0.3679535979699112, "grad_norm": 0.8263232111930847, "learning_rate": 3.679535979699112e-05, "loss": 0.6034, "step": 4060 }, { "epoch": 0.3684067427949973, "grad_norm": 0.8563245534896851, "learning_rate": 3.6840674279499734e-05, "loss": 0.569, "step": 4065 }, { "epoch": 0.36885988762008337, "grad_norm": 0.8512355089187622, "learning_rate": 3.688598876200834e-05, "loss": 0.5467, "step": 4070 }, { "epoch": 0.3693130324451695, "grad_norm": 0.9142612814903259, "learning_rate": 3.693130324451695e-05, "loss": 0.6083, "step": 4075 }, { "epoch": 0.36976617727025557, "grad_norm": 0.779013454914093, "learning_rate": 3.697661772702556e-05, "loss": 0.5816, "step": 4080 }, { "epoch": 0.3702193220953417, "grad_norm": 0.9490751028060913, "learning_rate": 3.702193220953417e-05, "loss": 0.6621, "step": 4085 }, { "epoch": 0.37067246692042777, "grad_norm": 0.908134937286377, "learning_rate": 3.706724669204278e-05, "loss": 0.6275, "step": 4090 }, { "epoch": 0.37112561174551384, "grad_norm": 0.8399788737297058, "learning_rate": 3.711256117455139e-05, "loss": 0.5763, "step": 4095 }, { "epoch": 0.37157875657059997, "grad_norm": 0.8892530798912048, "learning_rate": 3.7157875657059995e-05, "loss": 0.5967, "step": 4100 }, { "epoch": 0.37203190139568604, "grad_norm": 0.8776654601097107, "learning_rate": 3.72031901395686e-05, "loss": 0.6095, "step": 4105 }, { "epoch": 0.37248504622077216, "grad_norm": 0.8939406871795654, "learning_rate": 3.724850462207722e-05, "loss": 0.6674, "step": 4110 }, { "epoch": 0.37293819104585824, "grad_norm": 0.8172447681427002, "learning_rate": 3.7293819104585825e-05, "loss": 0.5985, "step": 4115 }, { "epoch": 0.37339133587094436, "grad_norm": 0.8461565971374512, "learning_rate": 3.733913358709444e-05, "loss": 0.5875, "step": 4120 }, { "epoch": 0.37384448069603043, "grad_norm": 0.7875018119812012, "learning_rate": 3.738444806960305e-05, "loss": 0.5705, "step": 4125 }, { "epoch": 0.37429762552111656, "grad_norm": 0.756347119808197, "learning_rate": 3.7429762552111655e-05, "loss": 0.6415, "step": 4130 }, { "epoch": 0.37475077034620263, "grad_norm": 0.8405019640922546, "learning_rate": 3.747507703462026e-05, "loss": 0.6237, "step": 4135 }, { "epoch": 0.37520391517128876, "grad_norm": 1.448639988899231, "learning_rate": 3.752039151712888e-05, "loss": 0.6419, "step": 4140 }, { "epoch": 0.37565705999637483, "grad_norm": 0.7804273366928101, "learning_rate": 3.7565705999637485e-05, "loss": 0.6491, "step": 4145 }, { "epoch": 0.37611020482146096, "grad_norm": 0.8836449384689331, "learning_rate": 3.76110204821461e-05, "loss": 0.6481, "step": 4150 }, { "epoch": 0.37656334964654703, "grad_norm": 0.8356822729110718, "learning_rate": 3.765633496465471e-05, "loss": 0.6535, "step": 4155 }, { "epoch": 0.37701649447163316, "grad_norm": 0.8545291423797607, "learning_rate": 3.7701649447163315e-05, "loss": 0.606, "step": 4160 }, { "epoch": 0.37746963929671923, "grad_norm": 0.8803985714912415, "learning_rate": 3.774696392967192e-05, "loss": 0.6934, "step": 4165 }, { "epoch": 0.37792278412180536, "grad_norm": 0.806046187877655, "learning_rate": 3.779227841218054e-05, "loss": 0.5966, "step": 4170 }, { "epoch": 0.3783759289468914, "grad_norm": 0.9095320105552673, "learning_rate": 3.7837592894689145e-05, "loss": 0.6256, "step": 4175 }, { "epoch": 0.3788290737719775, "grad_norm": 0.8412052989006042, "learning_rate": 3.788290737719775e-05, "loss": 0.6191, "step": 4180 }, { "epoch": 0.3792822185970636, "grad_norm": 0.8159540295600891, "learning_rate": 3.792822185970637e-05, "loss": 0.5995, "step": 4185 }, { "epoch": 0.3797353634221497, "grad_norm": 0.8626527786254883, "learning_rate": 3.797353634221497e-05, "loss": 0.6429, "step": 4190 }, { "epoch": 0.3801885082472358, "grad_norm": 0.8852330446243286, "learning_rate": 3.801885082472358e-05, "loss": 0.6315, "step": 4195 }, { "epoch": 0.3806416530723219, "grad_norm": 0.8414595127105713, "learning_rate": 3.806416530723219e-05, "loss": 0.6432, "step": 4200 }, { "epoch": 0.381094797897408, "grad_norm": 0.9344000220298767, "learning_rate": 3.8109479789740805e-05, "loss": 0.6511, "step": 4205 }, { "epoch": 0.3815479427224941, "grad_norm": 0.8870944380760193, "learning_rate": 3.815479427224941e-05, "loss": 0.6691, "step": 4210 }, { "epoch": 0.3820010875475802, "grad_norm": 0.9136339426040649, "learning_rate": 3.820010875475802e-05, "loss": 0.6527, "step": 4215 }, { "epoch": 0.3824542323726663, "grad_norm": 0.7984883189201355, "learning_rate": 3.824542323726663e-05, "loss": 0.6102, "step": 4220 }, { "epoch": 0.3829073771977524, "grad_norm": 1.0190143585205078, "learning_rate": 3.829073771977524e-05, "loss": 0.7041, "step": 4225 }, { "epoch": 0.3833605220228385, "grad_norm": 0.8659690022468567, "learning_rate": 3.833605220228385e-05, "loss": 0.6257, "step": 4230 }, { "epoch": 0.3838136668479246, "grad_norm": 0.8881471157073975, "learning_rate": 3.8381366684792465e-05, "loss": 0.5937, "step": 4235 }, { "epoch": 0.3842668116730107, "grad_norm": 0.8113963603973389, "learning_rate": 3.842668116730107e-05, "loss": 0.6338, "step": 4240 }, { "epoch": 0.3847199564980968, "grad_norm": 0.9216580390930176, "learning_rate": 3.847199564980968e-05, "loss": 0.6116, "step": 4245 }, { "epoch": 0.3851731013231829, "grad_norm": 0.8018492460250854, "learning_rate": 3.851731013231829e-05, "loss": 0.6106, "step": 4250 }, { "epoch": 0.385626246148269, "grad_norm": 0.8481138348579407, "learning_rate": 3.85626246148269e-05, "loss": 0.6238, "step": 4255 }, { "epoch": 0.3860793909733551, "grad_norm": 0.8587308526039124, "learning_rate": 3.860793909733551e-05, "loss": 0.6114, "step": 4260 }, { "epoch": 0.38653253579844116, "grad_norm": 0.8172160983085632, "learning_rate": 3.865325357984412e-05, "loss": 0.5995, "step": 4265 }, { "epoch": 0.3869856806235273, "grad_norm": 0.8167363405227661, "learning_rate": 3.869856806235273e-05, "loss": 0.6485, "step": 4270 }, { "epoch": 0.38743882544861336, "grad_norm": 0.82893967628479, "learning_rate": 3.874388254486134e-05, "loss": 0.7045, "step": 4275 }, { "epoch": 0.3878919702736995, "grad_norm": 0.8731433749198914, "learning_rate": 3.878919702736995e-05, "loss": 0.6055, "step": 4280 }, { "epoch": 0.38834511509878555, "grad_norm": 0.8846164345741272, "learning_rate": 3.8834511509878556e-05, "loss": 0.6315, "step": 4285 }, { "epoch": 0.3887982599238717, "grad_norm": 0.8147419691085815, "learning_rate": 3.887982599238717e-05, "loss": 0.6167, "step": 4290 }, { "epoch": 0.38925140474895775, "grad_norm": 0.7580723762512207, "learning_rate": 3.892514047489578e-05, "loss": 0.6867, "step": 4295 }, { "epoch": 0.3897045495740439, "grad_norm": 0.8042551875114441, "learning_rate": 3.897045495740439e-05, "loss": 0.6622, "step": 4300 }, { "epoch": 0.39015769439912995, "grad_norm": 0.9773116111755371, "learning_rate": 3.9015769439912994e-05, "loss": 0.6291, "step": 4305 }, { "epoch": 0.3906108392242161, "grad_norm": 0.8704566955566406, "learning_rate": 3.906108392242161e-05, "loss": 0.6597, "step": 4310 }, { "epoch": 0.39106398404930215, "grad_norm": 0.8484933972358704, "learning_rate": 3.9106398404930216e-05, "loss": 0.6592, "step": 4315 }, { "epoch": 0.3915171288743883, "grad_norm": 0.7966654896736145, "learning_rate": 3.915171288743883e-05, "loss": 0.5749, "step": 4320 }, { "epoch": 0.39197027369947435, "grad_norm": 0.8534024953842163, "learning_rate": 3.919702736994744e-05, "loss": 0.6228, "step": 4325 }, { "epoch": 0.3924234185245605, "grad_norm": 0.87668776512146, "learning_rate": 3.9242341852456046e-05, "loss": 0.5904, "step": 4330 }, { "epoch": 0.39287656334964655, "grad_norm": 0.8803106546401978, "learning_rate": 3.9287656334964654e-05, "loss": 0.6, "step": 4335 }, { "epoch": 0.3933297081747326, "grad_norm": 0.9035614728927612, "learning_rate": 3.933297081747326e-05, "loss": 0.5747, "step": 4340 }, { "epoch": 0.39378285299981874, "grad_norm": 0.9362914562225342, "learning_rate": 3.9378285299981876e-05, "loss": 0.5947, "step": 4345 }, { "epoch": 0.3942359978249048, "grad_norm": 0.9116201996803284, "learning_rate": 3.9423599782490484e-05, "loss": 0.6979, "step": 4350 }, { "epoch": 0.39468914264999094, "grad_norm": 0.8652161359786987, "learning_rate": 3.94689142649991e-05, "loss": 0.6509, "step": 4355 }, { "epoch": 0.395142287475077, "grad_norm": 0.8443554043769836, "learning_rate": 3.9514228747507706e-05, "loss": 0.6173, "step": 4360 }, { "epoch": 0.39559543230016314, "grad_norm": 0.878594696521759, "learning_rate": 3.9559543230016314e-05, "loss": 0.6217, "step": 4365 }, { "epoch": 0.3960485771252492, "grad_norm": 0.8109238743782043, "learning_rate": 3.960485771252492e-05, "loss": 0.6291, "step": 4370 }, { "epoch": 0.39650172195033534, "grad_norm": 0.8894370794296265, "learning_rate": 3.9650172195033536e-05, "loss": 0.5769, "step": 4375 }, { "epoch": 0.3969548667754214, "grad_norm": 0.908823549747467, "learning_rate": 3.9695486677542144e-05, "loss": 0.6203, "step": 4380 }, { "epoch": 0.39740801160050754, "grad_norm": 0.9628437161445618, "learning_rate": 3.974080116005076e-05, "loss": 0.5975, "step": 4385 }, { "epoch": 0.3978611564255936, "grad_norm": 0.910093367099762, "learning_rate": 3.9786115642559366e-05, "loss": 0.614, "step": 4390 }, { "epoch": 0.39831430125067974, "grad_norm": 0.8174340128898621, "learning_rate": 3.9831430125067974e-05, "loss": 0.6045, "step": 4395 }, { "epoch": 0.3987674460757658, "grad_norm": 0.8270817995071411, "learning_rate": 3.987674460757658e-05, "loss": 0.6418, "step": 4400 }, { "epoch": 0.39922059090085193, "grad_norm": 0.8116051554679871, "learning_rate": 3.9922059090085196e-05, "loss": 0.5576, "step": 4405 }, { "epoch": 0.399673735725938, "grad_norm": 0.87581866979599, "learning_rate": 3.9967373572593804e-05, "loss": 0.6816, "step": 4410 }, { "epoch": 0.40012688055102413, "grad_norm": 0.8656613826751709, "learning_rate": 4.001268805510242e-05, "loss": 0.6175, "step": 4415 }, { "epoch": 0.4005800253761102, "grad_norm": 0.8540873527526855, "learning_rate": 4.005800253761102e-05, "loss": 0.6217, "step": 4420 }, { "epoch": 0.4010331702011963, "grad_norm": 0.905489444732666, "learning_rate": 4.010331702011963e-05, "loss": 0.6372, "step": 4425 }, { "epoch": 0.4014863150262824, "grad_norm": 0.9472097754478455, "learning_rate": 4.014863150262824e-05, "loss": 0.6197, "step": 4430 }, { "epoch": 0.4019394598513685, "grad_norm": 0.9029079675674438, "learning_rate": 4.019394598513685e-05, "loss": 0.6056, "step": 4435 }, { "epoch": 0.4023926046764546, "grad_norm": 0.9068440794944763, "learning_rate": 4.0239260467645464e-05, "loss": 0.6205, "step": 4440 }, { "epoch": 0.4028457495015407, "grad_norm": 0.8218876719474792, "learning_rate": 4.028457495015407e-05, "loss": 0.6081, "step": 4445 }, { "epoch": 0.4032988943266268, "grad_norm": 0.9573096632957458, "learning_rate": 4.032988943266268e-05, "loss": 0.597, "step": 4450 }, { "epoch": 0.40375203915171287, "grad_norm": 0.8687506318092346, "learning_rate": 4.037520391517129e-05, "loss": 0.6179, "step": 4455 }, { "epoch": 0.404205183976799, "grad_norm": 0.8991154432296753, "learning_rate": 4.04205183976799e-05, "loss": 0.6155, "step": 4460 }, { "epoch": 0.40465832880188507, "grad_norm": 0.8197643160820007, "learning_rate": 4.046583288018851e-05, "loss": 0.6244, "step": 4465 }, { "epoch": 0.4051114736269712, "grad_norm": 0.8702784180641174, "learning_rate": 4.0511147362697124e-05, "loss": 0.6315, "step": 4470 }, { "epoch": 0.40556461845205727, "grad_norm": 0.8400620222091675, "learning_rate": 4.055646184520573e-05, "loss": 0.5949, "step": 4475 }, { "epoch": 0.4060177632771434, "grad_norm": 0.9202621579170227, "learning_rate": 4.060177632771434e-05, "loss": 0.6192, "step": 4480 }, { "epoch": 0.40647090810222947, "grad_norm": 0.8841713666915894, "learning_rate": 4.064709081022295e-05, "loss": 0.5942, "step": 4485 }, { "epoch": 0.4069240529273156, "grad_norm": 0.9809640645980835, "learning_rate": 4.069240529273156e-05, "loss": 0.6363, "step": 4490 }, { "epoch": 0.40737719775240167, "grad_norm": 0.8903448581695557, "learning_rate": 4.073771977524017e-05, "loss": 0.6377, "step": 4495 }, { "epoch": 0.4078303425774878, "grad_norm": 0.8761560916900635, "learning_rate": 4.0783034257748784e-05, "loss": 0.6221, "step": 4500 }, { "epoch": 0.40828348740257386, "grad_norm": 0.9182726144790649, "learning_rate": 4.082834874025739e-05, "loss": 0.6225, "step": 4505 }, { "epoch": 0.40873663222765994, "grad_norm": 0.868641197681427, "learning_rate": 4.087366322276599e-05, "loss": 0.6485, "step": 4510 }, { "epoch": 0.40918977705274606, "grad_norm": 0.8424393534660339, "learning_rate": 4.091897770527461e-05, "loss": 0.5447, "step": 4515 }, { "epoch": 0.40964292187783213, "grad_norm": 0.8521050214767456, "learning_rate": 4.0964292187783215e-05, "loss": 0.6375, "step": 4520 }, { "epoch": 0.41009606670291826, "grad_norm": 0.8830313682556152, "learning_rate": 4.100960667029183e-05, "loss": 0.6284, "step": 4525 }, { "epoch": 0.41054921152800433, "grad_norm": 0.8200377225875854, "learning_rate": 4.105492115280044e-05, "loss": 0.6366, "step": 4530 }, { "epoch": 0.41100235635309046, "grad_norm": 0.8146795034408569, "learning_rate": 4.1100235635309045e-05, "loss": 0.601, "step": 4535 }, { "epoch": 0.41145550117817653, "grad_norm": 0.8612405061721802, "learning_rate": 4.114555011781765e-05, "loss": 0.6239, "step": 4540 }, { "epoch": 0.41190864600326266, "grad_norm": 0.9371121525764465, "learning_rate": 4.119086460032627e-05, "loss": 0.6088, "step": 4545 }, { "epoch": 0.41236179082834873, "grad_norm": 0.9215017557144165, "learning_rate": 4.1236179082834875e-05, "loss": 0.6383, "step": 4550 }, { "epoch": 0.41281493565343486, "grad_norm": 1.012731909751892, "learning_rate": 4.128149356534349e-05, "loss": 0.6224, "step": 4555 }, { "epoch": 0.4132680804785209, "grad_norm": 0.9442242383956909, "learning_rate": 4.13268080478521e-05, "loss": 0.5939, "step": 4560 }, { "epoch": 0.41372122530360705, "grad_norm": 0.8331873416900635, "learning_rate": 4.1372122530360705e-05, "loss": 0.6435, "step": 4565 }, { "epoch": 0.4141743701286931, "grad_norm": 0.9116935729980469, "learning_rate": 4.141743701286931e-05, "loss": 0.657, "step": 4570 }, { "epoch": 0.41462751495377925, "grad_norm": 0.7919728755950928, "learning_rate": 4.146275149537793e-05, "loss": 0.5739, "step": 4575 }, { "epoch": 0.4150806597788653, "grad_norm": 0.8448849320411682, "learning_rate": 4.1508065977886535e-05, "loss": 0.5642, "step": 4580 }, { "epoch": 0.4155338046039514, "grad_norm": 0.9357454180717468, "learning_rate": 4.155338046039514e-05, "loss": 0.6383, "step": 4585 }, { "epoch": 0.4159869494290375, "grad_norm": 0.9842563271522522, "learning_rate": 4.159869494290376e-05, "loss": 0.6035, "step": 4590 }, { "epoch": 0.4164400942541236, "grad_norm": 0.8557567000389099, "learning_rate": 4.164400942541236e-05, "loss": 0.5767, "step": 4595 }, { "epoch": 0.4168932390792097, "grad_norm": 0.8311106562614441, "learning_rate": 4.168932390792097e-05, "loss": 0.6205, "step": 4600 }, { "epoch": 0.4173463839042958, "grad_norm": 0.8790229558944702, "learning_rate": 4.173463839042958e-05, "loss": 0.6055, "step": 4605 }, { "epoch": 0.4177995287293819, "grad_norm": 0.7752683162689209, "learning_rate": 4.1779952872938195e-05, "loss": 0.6187, "step": 4610 }, { "epoch": 0.418252673554468, "grad_norm": 0.806203305721283, "learning_rate": 4.18252673554468e-05, "loss": 0.6175, "step": 4615 }, { "epoch": 0.4187058183795541, "grad_norm": 1.003536581993103, "learning_rate": 4.187058183795542e-05, "loss": 0.6251, "step": 4620 }, { "epoch": 0.4191589632046402, "grad_norm": 0.867901086807251, "learning_rate": 4.191589632046402e-05, "loss": 0.549, "step": 4625 }, { "epoch": 0.4196121080297263, "grad_norm": 0.9190835952758789, "learning_rate": 4.196121080297263e-05, "loss": 0.6254, "step": 4630 }, { "epoch": 0.4200652528548124, "grad_norm": 0.7516510486602783, "learning_rate": 4.200652528548124e-05, "loss": 0.5376, "step": 4635 }, { "epoch": 0.4205183976798985, "grad_norm": 0.8714063763618469, "learning_rate": 4.2051839767989855e-05, "loss": 0.6057, "step": 4640 }, { "epoch": 0.4209715425049846, "grad_norm": 0.8157809376716614, "learning_rate": 4.209715425049846e-05, "loss": 0.5695, "step": 4645 }, { "epoch": 0.4214246873300707, "grad_norm": 0.8693602085113525, "learning_rate": 4.214246873300707e-05, "loss": 0.5985, "step": 4650 }, { "epoch": 0.4218778321551568, "grad_norm": 0.9411785006523132, "learning_rate": 4.218778321551568e-05, "loss": 0.5784, "step": 4655 }, { "epoch": 0.4223309769802429, "grad_norm": 0.8352392315864563, "learning_rate": 4.223309769802429e-05, "loss": 0.5977, "step": 4660 }, { "epoch": 0.422784121805329, "grad_norm": 0.9492303133010864, "learning_rate": 4.22784121805329e-05, "loss": 0.6206, "step": 4665 }, { "epoch": 0.42323726663041505, "grad_norm": 0.8049758672714233, "learning_rate": 4.232372666304151e-05, "loss": 0.5566, "step": 4670 }, { "epoch": 0.4236904114555012, "grad_norm": 0.8151426911354065, "learning_rate": 4.236904114555012e-05, "loss": 0.6488, "step": 4675 }, { "epoch": 0.42414355628058725, "grad_norm": 0.8530765175819397, "learning_rate": 4.241435562805873e-05, "loss": 0.5952, "step": 4680 }, { "epoch": 0.4245967011056734, "grad_norm": 0.8182388544082642, "learning_rate": 4.245967011056734e-05, "loss": 0.6204, "step": 4685 }, { "epoch": 0.42504984593075945, "grad_norm": 0.8514456748962402, "learning_rate": 4.2504984593075946e-05, "loss": 0.6192, "step": 4690 }, { "epoch": 0.4255029907558456, "grad_norm": 0.8044938445091248, "learning_rate": 4.255029907558456e-05, "loss": 0.5695, "step": 4695 }, { "epoch": 0.42595613558093165, "grad_norm": 0.9315749406814575, "learning_rate": 4.259561355809317e-05, "loss": 0.5939, "step": 4700 }, { "epoch": 0.4264092804060178, "grad_norm": 0.8418103456497192, "learning_rate": 4.264092804060178e-05, "loss": 0.653, "step": 4705 }, { "epoch": 0.42686242523110385, "grad_norm": 0.7374321222305298, "learning_rate": 4.2686242523110384e-05, "loss": 0.5953, "step": 4710 }, { "epoch": 0.42731557005619, "grad_norm": 0.9278823733329773, "learning_rate": 4.2731557005619e-05, "loss": 0.6069, "step": 4715 }, { "epoch": 0.42776871488127605, "grad_norm": 0.8587461113929749, "learning_rate": 4.2776871488127606e-05, "loss": 0.6518, "step": 4720 }, { "epoch": 0.4282218597063622, "grad_norm": 0.8244873285293579, "learning_rate": 4.282218597063622e-05, "loss": 0.5975, "step": 4725 }, { "epoch": 0.42867500453144824, "grad_norm": 0.8861063122749329, "learning_rate": 4.286750045314483e-05, "loss": 0.6267, "step": 4730 }, { "epoch": 0.42912814935653437, "grad_norm": 0.8752411007881165, "learning_rate": 4.291281493565344e-05, "loss": 0.5831, "step": 4735 }, { "epoch": 0.42958129418162044, "grad_norm": 0.8664864897727966, "learning_rate": 4.2958129418162044e-05, "loss": 0.6085, "step": 4740 }, { "epoch": 0.43003443900670657, "grad_norm": 0.8660111427307129, "learning_rate": 4.300344390067066e-05, "loss": 0.581, "step": 4745 }, { "epoch": 0.43048758383179264, "grad_norm": 0.8549082279205322, "learning_rate": 4.3048758383179266e-05, "loss": 0.6517, "step": 4750 }, { "epoch": 0.4309407286568787, "grad_norm": 0.7691596746444702, "learning_rate": 4.3094072865687874e-05, "loss": 0.5955, "step": 4755 }, { "epoch": 0.43139387348196484, "grad_norm": 0.804476797580719, "learning_rate": 4.313938734819649e-05, "loss": 0.6414, "step": 4760 }, { "epoch": 0.4318470183070509, "grad_norm": 0.8431987762451172, "learning_rate": 4.3184701830705096e-05, "loss": 0.5994, "step": 4765 }, { "epoch": 0.43230016313213704, "grad_norm": 0.9571689963340759, "learning_rate": 4.3230016313213704e-05, "loss": 0.5711, "step": 4770 }, { "epoch": 0.4327533079572231, "grad_norm": 0.8741220235824585, "learning_rate": 4.327533079572231e-05, "loss": 0.6075, "step": 4775 }, { "epoch": 0.43320645278230924, "grad_norm": 0.8766109347343445, "learning_rate": 4.3320645278230926e-05, "loss": 0.5914, "step": 4780 }, { "epoch": 0.4336595976073953, "grad_norm": 0.8470813632011414, "learning_rate": 4.3365959760739534e-05, "loss": 0.5795, "step": 4785 }, { "epoch": 0.43411274243248144, "grad_norm": 0.8004540205001831, "learning_rate": 4.341127424324815e-05, "loss": 0.6132, "step": 4790 }, { "epoch": 0.4345658872575675, "grad_norm": 0.8047029972076416, "learning_rate": 4.3456588725756756e-05, "loss": 0.6297, "step": 4795 }, { "epoch": 0.43501903208265363, "grad_norm": 0.8621811270713806, "learning_rate": 4.3501903208265364e-05, "loss": 0.6086, "step": 4800 }, { "epoch": 0.4354721769077397, "grad_norm": 0.8615626692771912, "learning_rate": 4.354721769077397e-05, "loss": 0.6212, "step": 4805 }, { "epoch": 0.43592532173282583, "grad_norm": 0.7915815711021423, "learning_rate": 4.3592532173282586e-05, "loss": 0.6437, "step": 4810 }, { "epoch": 0.4363784665579119, "grad_norm": 0.8361389636993408, "learning_rate": 4.3637846655791194e-05, "loss": 0.6287, "step": 4815 }, { "epoch": 0.43683161138299803, "grad_norm": 0.8498762249946594, "learning_rate": 4.368316113829981e-05, "loss": 0.5596, "step": 4820 }, { "epoch": 0.4372847562080841, "grad_norm": 0.8479225039482117, "learning_rate": 4.372847562080841e-05, "loss": 0.7046, "step": 4825 }, { "epoch": 0.4377379010331702, "grad_norm": 0.8685916662216187, "learning_rate": 4.377379010331702e-05, "loss": 0.5431, "step": 4830 }, { "epoch": 0.4381910458582563, "grad_norm": 0.785413920879364, "learning_rate": 4.381910458582563e-05, "loss": 0.6239, "step": 4835 }, { "epoch": 0.43864419068334237, "grad_norm": 0.8103176951408386, "learning_rate": 4.386441906833424e-05, "loss": 0.6398, "step": 4840 }, { "epoch": 0.4390973355084285, "grad_norm": 0.8587669134140015, "learning_rate": 4.3909733550842854e-05, "loss": 0.579, "step": 4845 }, { "epoch": 0.43955048033351457, "grad_norm": 0.8799600005149841, "learning_rate": 4.395504803335146e-05, "loss": 0.6356, "step": 4850 }, { "epoch": 0.4400036251586007, "grad_norm": 0.8734298348426819, "learning_rate": 4.400036251586007e-05, "loss": 0.6092, "step": 4855 }, { "epoch": 0.44045676998368677, "grad_norm": 0.9608545303344727, "learning_rate": 4.404567699836868e-05, "loss": 0.6091, "step": 4860 }, { "epoch": 0.4409099148087729, "grad_norm": 0.7731572389602661, "learning_rate": 4.409099148087729e-05, "loss": 0.6008, "step": 4865 }, { "epoch": 0.44136305963385897, "grad_norm": 0.8789125680923462, "learning_rate": 4.41363059633859e-05, "loss": 0.6345, "step": 4870 }, { "epoch": 0.4418162044589451, "grad_norm": 0.8592774271965027, "learning_rate": 4.4181620445894514e-05, "loss": 0.5906, "step": 4875 }, { "epoch": 0.44226934928403117, "grad_norm": 0.8593401908874512, "learning_rate": 4.422693492840312e-05, "loss": 0.7202, "step": 4880 }, { "epoch": 0.4427224941091173, "grad_norm": 0.8052661418914795, "learning_rate": 4.427224941091173e-05, "loss": 0.5913, "step": 4885 }, { "epoch": 0.44317563893420336, "grad_norm": 0.8460658192634583, "learning_rate": 4.431756389342034e-05, "loss": 0.6106, "step": 4890 }, { "epoch": 0.4436287837592895, "grad_norm": 0.96129310131073, "learning_rate": 4.436287837592895e-05, "loss": 0.6331, "step": 4895 }, { "epoch": 0.44408192858437556, "grad_norm": 0.8187257647514343, "learning_rate": 4.440819285843756e-05, "loss": 0.5642, "step": 4900 }, { "epoch": 0.4445350734094617, "grad_norm": 0.8920066356658936, "learning_rate": 4.4453507340946174e-05, "loss": 0.6017, "step": 4905 }, { "epoch": 0.44498821823454776, "grad_norm": 0.8708632588386536, "learning_rate": 4.449882182345478e-05, "loss": 0.6098, "step": 4910 }, { "epoch": 0.44544136305963383, "grad_norm": 0.7902442216873169, "learning_rate": 4.454413630596338e-05, "loss": 0.5888, "step": 4915 }, { "epoch": 0.44589450788471996, "grad_norm": 0.9075696468353271, "learning_rate": 4.4589450788472e-05, "loss": 0.6196, "step": 4920 }, { "epoch": 0.44634765270980603, "grad_norm": 0.8837315440177917, "learning_rate": 4.4634765270980605e-05, "loss": 0.6572, "step": 4925 }, { "epoch": 0.44680079753489216, "grad_norm": 0.8626925349235535, "learning_rate": 4.468007975348922e-05, "loss": 0.6595, "step": 4930 }, { "epoch": 0.44725394235997823, "grad_norm": 0.8091461658477783, "learning_rate": 4.472539423599783e-05, "loss": 0.6492, "step": 4935 }, { "epoch": 0.44770708718506436, "grad_norm": 0.8392575979232788, "learning_rate": 4.4770708718506435e-05, "loss": 0.6049, "step": 4940 }, { "epoch": 0.44816023201015043, "grad_norm": 0.8546628355979919, "learning_rate": 4.481602320101504e-05, "loss": 0.629, "step": 4945 }, { "epoch": 0.44861337683523655, "grad_norm": 0.8820680975914001, "learning_rate": 4.486133768352366e-05, "loss": 0.6257, "step": 4950 }, { "epoch": 0.4490665216603226, "grad_norm": 0.8732356429100037, "learning_rate": 4.4906652166032265e-05, "loss": 0.6168, "step": 4955 }, { "epoch": 0.44951966648540875, "grad_norm": 0.8816845417022705, "learning_rate": 4.495196664854088e-05, "loss": 0.5989, "step": 4960 }, { "epoch": 0.4499728113104948, "grad_norm": 0.8748577833175659, "learning_rate": 4.499728113104949e-05, "loss": 0.5931, "step": 4965 }, { "epoch": 0.45042595613558095, "grad_norm": 0.8213568329811096, "learning_rate": 4.5042595613558095e-05, "loss": 0.6154, "step": 4970 }, { "epoch": 0.450879100960667, "grad_norm": 0.8327969312667847, "learning_rate": 4.50879100960667e-05, "loss": 0.6664, "step": 4975 }, { "epoch": 0.45133224578575315, "grad_norm": 0.8225820660591125, "learning_rate": 4.513322457857532e-05, "loss": 0.5805, "step": 4980 }, { "epoch": 0.4517853906108392, "grad_norm": 0.7981924414634705, "learning_rate": 4.5178539061083925e-05, "loss": 0.5586, "step": 4985 }, { "epoch": 0.45223853543592535, "grad_norm": 0.9155174493789673, "learning_rate": 4.522385354359254e-05, "loss": 0.5718, "step": 4990 }, { "epoch": 0.4526916802610114, "grad_norm": 0.8580183386802673, "learning_rate": 4.526916802610115e-05, "loss": 0.6137, "step": 4995 }, { "epoch": 0.4531448250860975, "grad_norm": 0.810133159160614, "learning_rate": 4.5314482508609755e-05, "loss": 0.6148, "step": 5000 }, { "epoch": 0.4535979699111836, "grad_norm": 0.7960261106491089, "learning_rate": 4.535979699111836e-05, "loss": 0.5618, "step": 5005 }, { "epoch": 0.4540511147362697, "grad_norm": 0.916415274143219, "learning_rate": 4.540511147362697e-05, "loss": 0.6271, "step": 5010 }, { "epoch": 0.4545042595613558, "grad_norm": 0.8154698610305786, "learning_rate": 4.5450425956135585e-05, "loss": 0.6144, "step": 5015 }, { "epoch": 0.4549574043864419, "grad_norm": 0.8983984589576721, "learning_rate": 4.549574043864419e-05, "loss": 0.6377, "step": 5020 }, { "epoch": 0.455410549211528, "grad_norm": 0.8542330265045166, "learning_rate": 4.554105492115281e-05, "loss": 0.601, "step": 5025 }, { "epoch": 0.4558636940366141, "grad_norm": 0.8440463542938232, "learning_rate": 4.558636940366141e-05, "loss": 0.6212, "step": 5030 }, { "epoch": 0.4563168388617002, "grad_norm": 0.7781584858894348, "learning_rate": 4.563168388617002e-05, "loss": 0.6158, "step": 5035 }, { "epoch": 0.4567699836867863, "grad_norm": 0.8707416653633118, "learning_rate": 4.567699836867863e-05, "loss": 0.6376, "step": 5040 }, { "epoch": 0.4572231285118724, "grad_norm": 0.8440870642662048, "learning_rate": 4.5722312851187245e-05, "loss": 0.5622, "step": 5045 }, { "epoch": 0.4576762733369585, "grad_norm": 0.8626463413238525, "learning_rate": 4.576762733369585e-05, "loss": 0.5789, "step": 5050 }, { "epoch": 0.4581294181620446, "grad_norm": 0.8652251362800598, "learning_rate": 4.581294181620446e-05, "loss": 0.5638, "step": 5055 }, { "epoch": 0.4585825629871307, "grad_norm": 0.8482958674430847, "learning_rate": 4.585825629871307e-05, "loss": 0.6207, "step": 5060 }, { "epoch": 0.4590357078122168, "grad_norm": 0.9748696088790894, "learning_rate": 4.590357078122168e-05, "loss": 0.5542, "step": 5065 }, { "epoch": 0.4594888526373029, "grad_norm": 0.8938618302345276, "learning_rate": 4.594888526373029e-05, "loss": 0.5996, "step": 5070 }, { "epoch": 0.45994199746238895, "grad_norm": 0.8535104990005493, "learning_rate": 4.59941997462389e-05, "loss": 0.604, "step": 5075 }, { "epoch": 0.4603951422874751, "grad_norm": 0.8376330137252808, "learning_rate": 4.603951422874751e-05, "loss": 0.6037, "step": 5080 }, { "epoch": 0.46084828711256115, "grad_norm": 0.8220360279083252, "learning_rate": 4.608482871125612e-05, "loss": 0.6254, "step": 5085 }, { "epoch": 0.4613014319376473, "grad_norm": 0.8601440191268921, "learning_rate": 4.613014319376473e-05, "loss": 0.5947, "step": 5090 }, { "epoch": 0.46175457676273335, "grad_norm": 0.8620185256004333, "learning_rate": 4.6175457676273336e-05, "loss": 0.589, "step": 5095 }, { "epoch": 0.4622077215878195, "grad_norm": 0.8637043237686157, "learning_rate": 4.622077215878195e-05, "loss": 0.5657, "step": 5100 }, { "epoch": 0.46266086641290555, "grad_norm": 0.8352770209312439, "learning_rate": 4.626608664129056e-05, "loss": 0.6278, "step": 5105 }, { "epoch": 0.4631140112379917, "grad_norm": 0.7788306474685669, "learning_rate": 4.631140112379917e-05, "loss": 0.6379, "step": 5110 }, { "epoch": 0.46356715606307775, "grad_norm": 0.9257190227508545, "learning_rate": 4.635671560630778e-05, "loss": 0.6254, "step": 5115 }, { "epoch": 0.4640203008881639, "grad_norm": 0.9837411642074585, "learning_rate": 4.640203008881639e-05, "loss": 0.5712, "step": 5120 }, { "epoch": 0.46447344571324994, "grad_norm": 0.7918562889099121, "learning_rate": 4.6447344571324996e-05, "loss": 0.5861, "step": 5125 }, { "epoch": 0.46492659053833607, "grad_norm": 0.9284143447875977, "learning_rate": 4.649265905383361e-05, "loss": 0.5972, "step": 5130 }, { "epoch": 0.46537973536342214, "grad_norm": 0.8866010308265686, "learning_rate": 4.653797353634222e-05, "loss": 0.6295, "step": 5135 }, { "epoch": 0.46583288018850827, "grad_norm": 0.8158244490623474, "learning_rate": 4.658328801885083e-05, "loss": 0.5847, "step": 5140 }, { "epoch": 0.46628602501359434, "grad_norm": 0.8422221541404724, "learning_rate": 4.662860250135943e-05, "loss": 0.5778, "step": 5145 }, { "epoch": 0.46673916983868047, "grad_norm": 0.8687641024589539, "learning_rate": 4.667391698386805e-05, "loss": 0.5827, "step": 5150 }, { "epoch": 0.46719231466376654, "grad_norm": 0.8478764295578003, "learning_rate": 4.6719231466376656e-05, "loss": 0.6045, "step": 5155 }, { "epoch": 0.4676454594888526, "grad_norm": 0.8261911869049072, "learning_rate": 4.676454594888526e-05, "loss": 0.659, "step": 5160 }, { "epoch": 0.46809860431393874, "grad_norm": 0.8436740040779114, "learning_rate": 4.680986043139388e-05, "loss": 0.6405, "step": 5165 }, { "epoch": 0.4685517491390248, "grad_norm": 0.8475112915039062, "learning_rate": 4.6855174913902486e-05, "loss": 0.6224, "step": 5170 }, { "epoch": 0.46900489396411094, "grad_norm": 0.8916735053062439, "learning_rate": 4.690048939641109e-05, "loss": 0.6047, "step": 5175 }, { "epoch": 0.469458038789197, "grad_norm": 0.8411452770233154, "learning_rate": 4.69458038789197e-05, "loss": 0.5892, "step": 5180 }, { "epoch": 0.46991118361428313, "grad_norm": 0.8895859122276306, "learning_rate": 4.6991118361428316e-05, "loss": 0.5851, "step": 5185 }, { "epoch": 0.4703643284393692, "grad_norm": 0.8122456073760986, "learning_rate": 4.703643284393692e-05, "loss": 0.605, "step": 5190 }, { "epoch": 0.47081747326445533, "grad_norm": 0.8013954162597656, "learning_rate": 4.708174732644554e-05, "loss": 0.6187, "step": 5195 }, { "epoch": 0.4712706180895414, "grad_norm": 0.8626060485839844, "learning_rate": 4.7127061808954146e-05, "loss": 0.6208, "step": 5200 }, { "epoch": 0.47172376291462753, "grad_norm": 0.8390354514122009, "learning_rate": 4.717237629146275e-05, "loss": 0.6426, "step": 5205 }, { "epoch": 0.4721769077397136, "grad_norm": 0.8646942377090454, "learning_rate": 4.721769077397136e-05, "loss": 0.5879, "step": 5210 }, { "epoch": 0.47263005256479973, "grad_norm": 0.8431336879730225, "learning_rate": 4.7263005256479976e-05, "loss": 0.5827, "step": 5215 }, { "epoch": 0.4730831973898858, "grad_norm": 0.8599961400032043, "learning_rate": 4.730831973898858e-05, "loss": 0.6056, "step": 5220 }, { "epoch": 0.47353634221497193, "grad_norm": 0.8949995636940002, "learning_rate": 4.73536342214972e-05, "loss": 0.5937, "step": 5225 }, { "epoch": 0.473989487040058, "grad_norm": 0.8081187605857849, "learning_rate": 4.7398948704005806e-05, "loss": 0.5643, "step": 5230 }, { "epoch": 0.4744426318651441, "grad_norm": 0.9280082583427429, "learning_rate": 4.744426318651441e-05, "loss": 0.5825, "step": 5235 }, { "epoch": 0.4748957766902302, "grad_norm": 0.9056142568588257, "learning_rate": 4.748957766902302e-05, "loss": 0.6049, "step": 5240 }, { "epoch": 0.47534892151531627, "grad_norm": 0.7779013514518738, "learning_rate": 4.753489215153163e-05, "loss": 0.5779, "step": 5245 }, { "epoch": 0.4758020663404024, "grad_norm": 0.766521155834198, "learning_rate": 4.758020663404024e-05, "loss": 0.5231, "step": 5250 }, { "epoch": 0.47625521116548847, "grad_norm": 0.8302894234657288, "learning_rate": 4.762552111654885e-05, "loss": 0.6227, "step": 5255 }, { "epoch": 0.4767083559905746, "grad_norm": 0.816490113735199, "learning_rate": 4.767083559905746e-05, "loss": 0.6194, "step": 5260 }, { "epoch": 0.47716150081566067, "grad_norm": 0.8809311389923096, "learning_rate": 4.7716150081566066e-05, "loss": 0.6039, "step": 5265 }, { "epoch": 0.4776146456407468, "grad_norm": 0.8490098118782043, "learning_rate": 4.776146456407468e-05, "loss": 0.5842, "step": 5270 }, { "epoch": 0.47806779046583286, "grad_norm": 0.8886969685554504, "learning_rate": 4.780677904658329e-05, "loss": 0.6128, "step": 5275 }, { "epoch": 0.478520935290919, "grad_norm": 0.8730165362358093, "learning_rate": 4.78520935290919e-05, "loss": 0.5909, "step": 5280 }, { "epoch": 0.47897408011600506, "grad_norm": 0.8638790249824524, "learning_rate": 4.789740801160051e-05, "loss": 0.5764, "step": 5285 }, { "epoch": 0.4794272249410912, "grad_norm": 0.8518521189689636, "learning_rate": 4.794272249410912e-05, "loss": 0.6025, "step": 5290 }, { "epoch": 0.47988036976617726, "grad_norm": 0.9028800129890442, "learning_rate": 4.7988036976617726e-05, "loss": 0.6508, "step": 5295 }, { "epoch": 0.4803335145912634, "grad_norm": 0.8406249284744263, "learning_rate": 4.803335145912634e-05, "loss": 0.6669, "step": 5300 }, { "epoch": 0.48078665941634946, "grad_norm": 0.8483386635780334, "learning_rate": 4.807866594163495e-05, "loss": 0.5959, "step": 5305 }, { "epoch": 0.4812398042414356, "grad_norm": 0.8183832168579102, "learning_rate": 4.812398042414356e-05, "loss": 0.6253, "step": 5310 }, { "epoch": 0.48169294906652166, "grad_norm": 0.8910837769508362, "learning_rate": 4.816929490665217e-05, "loss": 0.6559, "step": 5315 }, { "epoch": 0.4821460938916078, "grad_norm": 0.9538207054138184, "learning_rate": 4.821460938916078e-05, "loss": 0.6939, "step": 5320 }, { "epoch": 0.48259923871669386, "grad_norm": 0.9343950748443604, "learning_rate": 4.8259923871669386e-05, "loss": 0.6285, "step": 5325 }, { "epoch": 0.48305238354177993, "grad_norm": 0.790796160697937, "learning_rate": 4.8305238354177994e-05, "loss": 0.6371, "step": 5330 }, { "epoch": 0.48350552836686606, "grad_norm": 0.7822690606117249, "learning_rate": 4.835055283668661e-05, "loss": 0.5849, "step": 5335 }, { "epoch": 0.4839586731919521, "grad_norm": 0.9607090950012207, "learning_rate": 4.8395867319195217e-05, "loss": 0.597, "step": 5340 }, { "epoch": 0.48441181801703825, "grad_norm": 0.783545732498169, "learning_rate": 4.844118180170383e-05, "loss": 0.6185, "step": 5345 }, { "epoch": 0.4848649628421243, "grad_norm": 1.0387641191482544, "learning_rate": 4.848649628421243e-05, "loss": 0.6294, "step": 5350 }, { "epoch": 0.48531810766721045, "grad_norm": 0.7918679714202881, "learning_rate": 4.8531810766721047e-05, "loss": 0.6082, "step": 5355 }, { "epoch": 0.4857712524922965, "grad_norm": 0.8185076117515564, "learning_rate": 4.8577125249229654e-05, "loss": 0.5346, "step": 5360 }, { "epoch": 0.48622439731738265, "grad_norm": 0.862173318862915, "learning_rate": 4.862243973173827e-05, "loss": 0.5648, "step": 5365 }, { "epoch": 0.4866775421424687, "grad_norm": 1.1841217279434204, "learning_rate": 4.8667754214246877e-05, "loss": 0.5818, "step": 5370 }, { "epoch": 0.48713068696755485, "grad_norm": 0.8780586123466492, "learning_rate": 4.8713068696755484e-05, "loss": 0.5849, "step": 5375 }, { "epoch": 0.4875838317926409, "grad_norm": 0.8988229036331177, "learning_rate": 4.875838317926409e-05, "loss": 0.6316, "step": 5380 }, { "epoch": 0.48803697661772705, "grad_norm": 0.8094955086708069, "learning_rate": 4.8803697661772707e-05, "loss": 0.5909, "step": 5385 }, { "epoch": 0.4884901214428131, "grad_norm": 0.8456873297691345, "learning_rate": 4.8849012144281314e-05, "loss": 0.6411, "step": 5390 }, { "epoch": 0.48894326626789925, "grad_norm": 0.8935300707817078, "learning_rate": 4.889432662678993e-05, "loss": 0.574, "step": 5395 }, { "epoch": 0.4893964110929853, "grad_norm": 0.7869353890419006, "learning_rate": 4.8939641109298537e-05, "loss": 0.5833, "step": 5400 }, { "epoch": 0.4898495559180714, "grad_norm": 0.7913839221000671, "learning_rate": 4.8984955591807144e-05, "loss": 0.5854, "step": 5405 }, { "epoch": 0.4903027007431575, "grad_norm": 0.8617231845855713, "learning_rate": 4.903027007431575e-05, "loss": 0.587, "step": 5410 }, { "epoch": 0.4907558455682436, "grad_norm": 0.8475949168205261, "learning_rate": 4.907558455682436e-05, "loss": 0.5998, "step": 5415 }, { "epoch": 0.4912089903933297, "grad_norm": 0.8913253545761108, "learning_rate": 4.9120899039332974e-05, "loss": 0.581, "step": 5420 }, { "epoch": 0.4916621352184158, "grad_norm": 0.9296994209289551, "learning_rate": 4.916621352184158e-05, "loss": 0.639, "step": 5425 }, { "epoch": 0.4921152800435019, "grad_norm": 0.8625933527946472, "learning_rate": 4.9211528004350197e-05, "loss": 0.5504, "step": 5430 }, { "epoch": 0.492568424868588, "grad_norm": 0.8670766949653625, "learning_rate": 4.92568424868588e-05, "loss": 0.5931, "step": 5435 }, { "epoch": 0.4930215696936741, "grad_norm": 0.8745831251144409, "learning_rate": 4.930215696936741e-05, "loss": 0.5689, "step": 5440 }, { "epoch": 0.4934747145187602, "grad_norm": 1.0381951332092285, "learning_rate": 4.934747145187602e-05, "loss": 0.5646, "step": 5445 }, { "epoch": 0.4939278593438463, "grad_norm": 0.8845943808555603, "learning_rate": 4.9392785934384634e-05, "loss": 0.5858, "step": 5450 }, { "epoch": 0.4943810041689324, "grad_norm": 0.8778969049453735, "learning_rate": 4.943810041689324e-05, "loss": 0.5568, "step": 5455 }, { "epoch": 0.4948341489940185, "grad_norm": 1.0536367893218994, "learning_rate": 4.9483414899401857e-05, "loss": 0.6091, "step": 5460 }, { "epoch": 0.4952872938191046, "grad_norm": 0.8409548997879028, "learning_rate": 4.952872938191046e-05, "loss": 0.6264, "step": 5465 }, { "epoch": 0.4957404386441907, "grad_norm": 0.9089795351028442, "learning_rate": 4.957404386441907e-05, "loss": 0.6548, "step": 5470 }, { "epoch": 0.4961935834692768, "grad_norm": 0.9334297776222229, "learning_rate": 4.961935834692768e-05, "loss": 0.5564, "step": 5475 }, { "epoch": 0.4966467282943629, "grad_norm": 0.8065686821937561, "learning_rate": 4.9664672829436294e-05, "loss": 0.5666, "step": 5480 }, { "epoch": 0.497099873119449, "grad_norm": 0.9021301865577698, "learning_rate": 4.97099873119449e-05, "loss": 0.6039, "step": 5485 }, { "epoch": 0.49755301794453505, "grad_norm": 0.7650969624519348, "learning_rate": 4.975530179445351e-05, "loss": 0.5941, "step": 5490 }, { "epoch": 0.4980061627696212, "grad_norm": 0.8701999187469482, "learning_rate": 4.980061627696212e-05, "loss": 0.6007, "step": 5495 }, { "epoch": 0.49845930759470725, "grad_norm": 0.8476364016532898, "learning_rate": 4.9845930759470725e-05, "loss": 0.6532, "step": 5500 }, { "epoch": 0.4989124524197934, "grad_norm": 0.8889872431755066, "learning_rate": 4.989124524197934e-05, "loss": 0.6644, "step": 5505 }, { "epoch": 0.49936559724487944, "grad_norm": 0.8035936951637268, "learning_rate": 4.993655972448795e-05, "loss": 0.6052, "step": 5510 }, { "epoch": 0.49981874206996557, "grad_norm": 0.8362823724746704, "learning_rate": 4.998187420699656e-05, "loss": 0.5465, "step": 5515 }, { "epoch": 0.5002718868950516, "grad_norm": 0.8491589426994324, "learning_rate": 4.999999954963848e-05, "loss": 0.6231, "step": 5520 }, { "epoch": 0.5007250317201377, "grad_norm": 0.8064247369766235, "learning_rate": 4.9999996797429225e-05, "loss": 0.5569, "step": 5525 }, { "epoch": 0.5011781765452239, "grad_norm": 0.8516063094139099, "learning_rate": 4.999999154321184e-05, "loss": 0.6228, "step": 5530 }, { "epoch": 0.50163132137031, "grad_norm": 0.8811013698577881, "learning_rate": 4.999998378698686e-05, "loss": 0.5782, "step": 5535 }, { "epoch": 0.502084466195396, "grad_norm": 0.8853281140327454, "learning_rate": 4.999997352875504e-05, "loss": 0.5762, "step": 5540 }, { "epoch": 0.5025376110204821, "grad_norm": 0.8219696879386902, "learning_rate": 4.999996076851742e-05, "loss": 0.5591, "step": 5545 }, { "epoch": 0.5029907558455683, "grad_norm": 0.8655716776847839, "learning_rate": 4.999994550627528e-05, "loss": 0.6236, "step": 5550 }, { "epoch": 0.5034439006706544, "grad_norm": 0.9596734046936035, "learning_rate": 4.999992774203014e-05, "loss": 0.5925, "step": 5555 }, { "epoch": 0.5038970454957404, "grad_norm": 0.8622197508811951, "learning_rate": 4.9999907475783775e-05, "loss": 0.5892, "step": 5560 }, { "epoch": 0.5043501903208265, "grad_norm": 0.944914698600769, "learning_rate": 4.999988470753823e-05, "loss": 0.6329, "step": 5565 }, { "epoch": 0.5048033351459126, "grad_norm": 0.8559314012527466, "learning_rate": 4.999985943729576e-05, "loss": 0.5905, "step": 5570 }, { "epoch": 0.5052564799709988, "grad_norm": 0.8623883724212646, "learning_rate": 4.99998316650589e-05, "loss": 0.5884, "step": 5575 }, { "epoch": 0.5057096247960848, "grad_norm": 0.8426478505134583, "learning_rate": 4.999980139083046e-05, "loss": 0.6039, "step": 5580 }, { "epoch": 0.5061627696211709, "grad_norm": 0.8923940062522888, "learning_rate": 4.999976861461343e-05, "loss": 0.5723, "step": 5585 }, { "epoch": 0.506615914446257, "grad_norm": 0.8235048651695251, "learning_rate": 4.99997333364111e-05, "loss": 0.5877, "step": 5590 }, { "epoch": 0.5070690592713432, "grad_norm": 0.8632950782775879, "learning_rate": 4.999969555622702e-05, "loss": 0.5919, "step": 5595 }, { "epoch": 0.5075222040964292, "grad_norm": 0.8995289206504822, "learning_rate": 4.9999655274064956e-05, "loss": 0.5559, "step": 5600 }, { "epoch": 0.5079753489215153, "grad_norm": 0.9000346660614014, "learning_rate": 4.999961248992894e-05, "loss": 0.5523, "step": 5605 }, { "epoch": 0.5084284937466014, "grad_norm": 0.8365309834480286, "learning_rate": 4.999956720382325e-05, "loss": 0.5973, "step": 5610 }, { "epoch": 0.5088816385716876, "grad_norm": 0.8668230175971985, "learning_rate": 4.999951941575243e-05, "loss": 0.5504, "step": 5615 }, { "epoch": 0.5093347833967736, "grad_norm": 0.8893197774887085, "learning_rate": 4.999946912572124e-05, "loss": 0.6444, "step": 5620 }, { "epoch": 0.5097879282218597, "grad_norm": 0.7699057459831238, "learning_rate": 4.999941633373475e-05, "loss": 0.5476, "step": 5625 }, { "epoch": 0.5102410730469458, "grad_norm": 0.8616582155227661, "learning_rate": 4.999936103979821e-05, "loss": 0.5858, "step": 5630 }, { "epoch": 0.510694217872032, "grad_norm": 0.8783174157142639, "learning_rate": 4.9999303243917166e-05, "loss": 0.5818, "step": 5635 }, { "epoch": 0.511147362697118, "grad_norm": 0.8656919598579407, "learning_rate": 4.999924294609741e-05, "loss": 0.5712, "step": 5640 }, { "epoch": 0.5116005075222041, "grad_norm": 0.9824750423431396, "learning_rate": 4.999918014634496e-05, "loss": 0.5951, "step": 5645 }, { "epoch": 0.5120536523472902, "grad_norm": 0.856519341468811, "learning_rate": 4.999911484466612e-05, "loss": 0.6153, "step": 5650 }, { "epoch": 0.5125067971723762, "grad_norm": 0.8151008486747742, "learning_rate": 4.9999047041067405e-05, "loss": 0.6666, "step": 5655 }, { "epoch": 0.5129599419974624, "grad_norm": 0.8736802935600281, "learning_rate": 4.999897673555561e-05, "loss": 0.5806, "step": 5660 }, { "epoch": 0.5134130868225485, "grad_norm": 0.8187179565429688, "learning_rate": 4.999890392813778e-05, "loss": 0.535, "step": 5665 }, { "epoch": 0.5138662316476346, "grad_norm": 0.8786453604698181, "learning_rate": 4.999882861882119e-05, "loss": 0.5922, "step": 5670 }, { "epoch": 0.5143193764727206, "grad_norm": 0.9016815423965454, "learning_rate": 4.9998750807613385e-05, "loss": 0.5769, "step": 5675 }, { "epoch": 0.5147725212978068, "grad_norm": 0.8084769248962402, "learning_rate": 4.9998670494522145e-05, "loss": 0.5884, "step": 5680 }, { "epoch": 0.5152256661228929, "grad_norm": 0.9098870754241943, "learning_rate": 4.9998587679555516e-05, "loss": 0.5702, "step": 5685 }, { "epoch": 0.515678810947979, "grad_norm": 1.3101688623428345, "learning_rate": 4.999850236272178e-05, "loss": 0.6406, "step": 5690 }, { "epoch": 0.516131955773065, "grad_norm": 0.8431967496871948, "learning_rate": 4.999841454402947e-05, "loss": 0.5859, "step": 5695 }, { "epoch": 0.5165851005981512, "grad_norm": 0.8763777017593384, "learning_rate": 4.9998324223487384e-05, "loss": 0.6403, "step": 5700 }, { "epoch": 0.5170382454232373, "grad_norm": 0.8838351368904114, "learning_rate": 4.999823140110457e-05, "loss": 0.6084, "step": 5705 }, { "epoch": 0.5174913902483234, "grad_norm": 0.8258557319641113, "learning_rate": 4.9998136076890287e-05, "loss": 0.5725, "step": 5710 }, { "epoch": 0.5179445350734094, "grad_norm": 0.8770052194595337, "learning_rate": 4.999803825085411e-05, "loss": 0.6459, "step": 5715 }, { "epoch": 0.5183976798984956, "grad_norm": 0.8392895460128784, "learning_rate": 4.999793792300581e-05, "loss": 0.5362, "step": 5720 }, { "epoch": 0.5188508247235817, "grad_norm": 0.9525279402732849, "learning_rate": 4.9997835093355425e-05, "loss": 0.6164, "step": 5725 }, { "epoch": 0.5193039695486678, "grad_norm": 0.8650209307670593, "learning_rate": 4.999772976191327e-05, "loss": 0.6066, "step": 5730 }, { "epoch": 0.5197571143737538, "grad_norm": 0.7899057269096375, "learning_rate": 4.999762192868985e-05, "loss": 0.6005, "step": 5735 }, { "epoch": 0.5202102591988399, "grad_norm": 0.9101588726043701, "learning_rate": 4.999751159369599e-05, "loss": 0.5939, "step": 5740 }, { "epoch": 0.5206634040239261, "grad_norm": 0.828790009021759, "learning_rate": 4.999739875694272e-05, "loss": 0.5426, "step": 5745 }, { "epoch": 0.5211165488490122, "grad_norm": 0.8141089081764221, "learning_rate": 4.999728341844133e-05, "loss": 0.5401, "step": 5750 }, { "epoch": 0.5215696936740982, "grad_norm": 0.9027454257011414, "learning_rate": 4.9997165578203366e-05, "loss": 0.5651, "step": 5755 }, { "epoch": 0.5220228384991843, "grad_norm": 0.8990687131881714, "learning_rate": 4.999704523624061e-05, "loss": 0.5584, "step": 5760 }, { "epoch": 0.5224759833242705, "grad_norm": 0.7872946262359619, "learning_rate": 4.9996922392565127e-05, "loss": 0.6255, "step": 5765 }, { "epoch": 0.5229291281493565, "grad_norm": 0.835645318031311, "learning_rate": 4.9996797047189195e-05, "loss": 0.6544, "step": 5770 }, { "epoch": 0.5233822729744426, "grad_norm": 0.9866094589233398, "learning_rate": 4.999666920012537e-05, "loss": 0.5855, "step": 5775 }, { "epoch": 0.5238354177995287, "grad_norm": 0.8303639888763428, "learning_rate": 4.9996538851386445e-05, "loss": 0.5544, "step": 5780 }, { "epoch": 0.5242885626246149, "grad_norm": 0.8310614228248596, "learning_rate": 4.999640600098545e-05, "loss": 0.548, "step": 5785 }, { "epoch": 0.524741707449701, "grad_norm": 0.8468443155288696, "learning_rate": 4.99962706489357e-05, "loss": 0.5389, "step": 5790 }, { "epoch": 0.525194852274787, "grad_norm": 0.8765844106674194, "learning_rate": 4.999613279525073e-05, "loss": 0.63, "step": 5795 }, { "epoch": 0.5256479970998731, "grad_norm": 0.9006084203720093, "learning_rate": 4.999599243994434e-05, "loss": 0.5455, "step": 5800 }, { "epoch": 0.5261011419249593, "grad_norm": 0.892507016658783, "learning_rate": 4.999584958303058e-05, "loss": 0.5839, "step": 5805 }, { "epoch": 0.5265542867500453, "grad_norm": 0.910627543926239, "learning_rate": 4.9995704224523746e-05, "loss": 0.563, "step": 5810 }, { "epoch": 0.5270074315751314, "grad_norm": 0.8630537390708923, "learning_rate": 4.999555636443838e-05, "loss": 0.5409, "step": 5815 }, { "epoch": 0.5274605764002175, "grad_norm": 0.9128010272979736, "learning_rate": 4.999540600278928e-05, "loss": 0.6064, "step": 5820 }, { "epoch": 0.5279137212253036, "grad_norm": 0.8538975715637207, "learning_rate": 4.9995253139591494e-05, "loss": 0.5871, "step": 5825 }, { "epoch": 0.5283668660503897, "grad_norm": 0.8190693855285645, "learning_rate": 4.9995097774860336e-05, "loss": 0.6213, "step": 5830 }, { "epoch": 0.5288200108754758, "grad_norm": 0.8676832318305969, "learning_rate": 4.999493990861134e-05, "loss": 0.5822, "step": 5835 }, { "epoch": 0.5292731557005619, "grad_norm": 0.9935728311538696, "learning_rate": 4.9994779540860305e-05, "loss": 0.6379, "step": 5840 }, { "epoch": 0.529726300525648, "grad_norm": 0.9372619390487671, "learning_rate": 4.999461667162328e-05, "loss": 0.5805, "step": 5845 }, { "epoch": 0.5301794453507341, "grad_norm": 0.9070656299591064, "learning_rate": 4.9994451300916576e-05, "loss": 0.6075, "step": 5850 }, { "epoch": 0.5306325901758202, "grad_norm": 0.8118924498558044, "learning_rate": 4.999428342875673e-05, "loss": 0.6097, "step": 5855 }, { "epoch": 0.5310857350009063, "grad_norm": 0.8510047197341919, "learning_rate": 4.999411305516055e-05, "loss": 0.6332, "step": 5860 }, { "epoch": 0.5315388798259923, "grad_norm": 0.907687783241272, "learning_rate": 4.9993940180145096e-05, "loss": 0.5623, "step": 5865 }, { "epoch": 0.5319920246510785, "grad_norm": 0.8047569394111633, "learning_rate": 4.9993764803727646e-05, "loss": 0.6055, "step": 5870 }, { "epoch": 0.5324451694761646, "grad_norm": 0.7944129705429077, "learning_rate": 4.9993586925925776e-05, "loss": 0.6259, "step": 5875 }, { "epoch": 0.5328983143012507, "grad_norm": 0.8003072142601013, "learning_rate": 4.999340654675728e-05, "loss": 0.5463, "step": 5880 }, { "epoch": 0.5333514591263367, "grad_norm": 0.8919734954833984, "learning_rate": 4.99932236662402e-05, "loss": 0.5898, "step": 5885 }, { "epoch": 0.5338046039514229, "grad_norm": 0.8510867357254028, "learning_rate": 4.999303828439285e-05, "loss": 0.5255, "step": 5890 }, { "epoch": 0.534257748776509, "grad_norm": 0.8551084995269775, "learning_rate": 4.999285040123377e-05, "loss": 0.6083, "step": 5895 }, { "epoch": 0.5347108936015951, "grad_norm": 0.8382150530815125, "learning_rate": 4.9992660016781787e-05, "loss": 0.6604, "step": 5900 }, { "epoch": 0.5351640384266811, "grad_norm": 0.9382196068763733, "learning_rate": 4.999246713105593e-05, "loss": 0.5987, "step": 5905 }, { "epoch": 0.5356171832517672, "grad_norm": 0.8777171969413757, "learning_rate": 4.9992271744075516e-05, "loss": 0.6134, "step": 5910 }, { "epoch": 0.5360703280768534, "grad_norm": 0.957981288433075, "learning_rate": 4.99920738558601e-05, "loss": 0.5818, "step": 5915 }, { "epoch": 0.5365234729019395, "grad_norm": 0.8498092889785767, "learning_rate": 4.9991873466429485e-05, "loss": 0.5639, "step": 5920 }, { "epoch": 0.5369766177270255, "grad_norm": 0.8964778780937195, "learning_rate": 4.999167057580372e-05, "loss": 0.5738, "step": 5925 }, { "epoch": 0.5374297625521116, "grad_norm": 0.8540729880332947, "learning_rate": 4.9991465184003124e-05, "loss": 0.5691, "step": 5930 }, { "epoch": 0.5378829073771978, "grad_norm": 0.9101712107658386, "learning_rate": 4.999125729104824e-05, "loss": 0.6362, "step": 5935 }, { "epoch": 0.5383360522022839, "grad_norm": 0.7789832353591919, "learning_rate": 4.999104689695987e-05, "loss": 0.5846, "step": 5940 }, { "epoch": 0.5387891970273699, "grad_norm": 1.0083667039871216, "learning_rate": 4.999083400175909e-05, "loss": 0.6331, "step": 5945 }, { "epoch": 0.539242341852456, "grad_norm": 0.98734050989151, "learning_rate": 4.9990618605467185e-05, "loss": 0.5758, "step": 5950 }, { "epoch": 0.5396954866775422, "grad_norm": 0.8564559817314148, "learning_rate": 4.9990400708105736e-05, "loss": 0.6092, "step": 5955 }, { "epoch": 0.5401486315026283, "grad_norm": 0.8410665988922119, "learning_rate": 4.999018030969652e-05, "loss": 0.5736, "step": 5960 }, { "epoch": 0.5406017763277143, "grad_norm": 0.842379629611969, "learning_rate": 4.998995741026162e-05, "loss": 0.5004, "step": 5965 }, { "epoch": 0.5410549211528004, "grad_norm": 0.8551698327064514, "learning_rate": 4.9989732009823333e-05, "loss": 0.5566, "step": 5970 }, { "epoch": 0.5415080659778866, "grad_norm": 1.0128360986709595, "learning_rate": 4.9989504108404216e-05, "loss": 0.6054, "step": 5975 }, { "epoch": 0.5419612108029727, "grad_norm": 0.8333370089530945, "learning_rate": 4.998927370602708e-05, "loss": 0.5647, "step": 5980 }, { "epoch": 0.5424143556280587, "grad_norm": 0.8570535778999329, "learning_rate": 4.998904080271498e-05, "loss": 0.6451, "step": 5985 }, { "epoch": 0.5428675004531448, "grad_norm": 0.9289223551750183, "learning_rate": 4.998880539849124e-05, "loss": 0.5825, "step": 5990 }, { "epoch": 0.5433206452782309, "grad_norm": 0.7858771085739136, "learning_rate": 4.99885674933794e-05, "loss": 0.5567, "step": 5995 }, { "epoch": 0.5437737901033171, "grad_norm": 0.8155767917633057, "learning_rate": 4.998832708740329e-05, "loss": 0.5669, "step": 6000 }, { "epoch": 0.5442269349284031, "grad_norm": 0.8651280999183655, "learning_rate": 4.998808418058694e-05, "loss": 0.5534, "step": 6005 }, { "epoch": 0.5446800797534892, "grad_norm": 0.910690188407898, "learning_rate": 4.9987838772954685e-05, "loss": 0.562, "step": 6010 }, { "epoch": 0.5451332245785753, "grad_norm": 0.8744031190872192, "learning_rate": 4.9987590864531084e-05, "loss": 0.5668, "step": 6015 }, { "epoch": 0.5455863694036615, "grad_norm": 0.9651488661766052, "learning_rate": 4.9987340455340936e-05, "loss": 0.6117, "step": 6020 }, { "epoch": 0.5460395142287475, "grad_norm": 0.9159373641014099, "learning_rate": 4.998708754540931e-05, "loss": 0.5616, "step": 6025 }, { "epoch": 0.5464926590538336, "grad_norm": 0.8833006620407104, "learning_rate": 4.998683213476152e-05, "loss": 0.5881, "step": 6030 }, { "epoch": 0.5469458038789197, "grad_norm": 0.8279474377632141, "learning_rate": 4.9986574223423116e-05, "loss": 0.6776, "step": 6035 }, { "epoch": 0.5473989487040058, "grad_norm": 2.9651682376861572, "learning_rate": 4.9986313811419916e-05, "loss": 0.553, "step": 6040 }, { "epoch": 0.5478520935290919, "grad_norm": 0.8801607489585876, "learning_rate": 4.998605089877799e-05, "loss": 0.5424, "step": 6045 }, { "epoch": 0.548305238354178, "grad_norm": 0.8130127787590027, "learning_rate": 4.998578548552364e-05, "loss": 0.511, "step": 6050 }, { "epoch": 0.5487583831792641, "grad_norm": 0.8061729669570923, "learning_rate": 4.998551757168344e-05, "loss": 0.5556, "step": 6055 }, { "epoch": 0.5492115280043501, "grad_norm": 0.8385651707649231, "learning_rate": 4.9985247157284185e-05, "loss": 0.6102, "step": 6060 }, { "epoch": 0.5496646728294363, "grad_norm": 0.8107746839523315, "learning_rate": 4.9984974242352944e-05, "loss": 0.5717, "step": 6065 }, { "epoch": 0.5501178176545224, "grad_norm": 1.1813582181930542, "learning_rate": 4.9984698826917044e-05, "loss": 0.5881, "step": 6070 }, { "epoch": 0.5505709624796085, "grad_norm": 0.9847273230552673, "learning_rate": 4.998442091100404e-05, "loss": 0.5362, "step": 6075 }, { "epoch": 0.5510241073046945, "grad_norm": 0.8744961023330688, "learning_rate": 4.998414049464174e-05, "loss": 0.5893, "step": 6080 }, { "epoch": 0.5514772521297807, "grad_norm": 0.9320786595344543, "learning_rate": 4.998385757785822e-05, "loss": 0.5909, "step": 6085 }, { "epoch": 0.5519303969548668, "grad_norm": 0.8727133274078369, "learning_rate": 4.998357216068178e-05, "loss": 0.6466, "step": 6090 }, { "epoch": 0.5523835417799529, "grad_norm": 0.9109986424446106, "learning_rate": 4.998328424314099e-05, "loss": 0.5728, "step": 6095 }, { "epoch": 0.5528366866050389, "grad_norm": 0.8084288239479065, "learning_rate": 4.998299382526468e-05, "loss": 0.6242, "step": 6100 }, { "epoch": 0.5532898314301251, "grad_norm": 0.8539305329322815, "learning_rate": 4.998270090708189e-05, "loss": 0.5485, "step": 6105 }, { "epoch": 0.5537429762552112, "grad_norm": 0.8443488478660583, "learning_rate": 4.998240548862196e-05, "loss": 0.5333, "step": 6110 }, { "epoch": 0.5541961210802973, "grad_norm": 0.8738698363304138, "learning_rate": 4.998210756991444e-05, "loss": 0.6857, "step": 6115 }, { "epoch": 0.5546492659053833, "grad_norm": 0.8073911070823669, "learning_rate": 4.998180715098914e-05, "loss": 0.5561, "step": 6120 }, { "epoch": 0.5551024107304695, "grad_norm": 0.8360193371772766, "learning_rate": 4.9981504231876144e-05, "loss": 0.563, "step": 6125 }, { "epoch": 0.5555555555555556, "grad_norm": 0.9441671371459961, "learning_rate": 4.998119881260576e-05, "loss": 0.5947, "step": 6130 }, { "epoch": 0.5560087003806417, "grad_norm": 0.8213287591934204, "learning_rate": 4.998089089320854e-05, "loss": 0.5614, "step": 6135 }, { "epoch": 0.5564618452057277, "grad_norm": 0.8576318025588989, "learning_rate": 4.998058047371533e-05, "loss": 0.5871, "step": 6140 }, { "epoch": 0.5569149900308138, "grad_norm": 0.7936515808105469, "learning_rate": 4.998026755415718e-05, "loss": 0.5343, "step": 6145 }, { "epoch": 0.5573681348559, "grad_norm": 0.8731726408004761, "learning_rate": 4.99799521345654e-05, "loss": 0.6062, "step": 6150 }, { "epoch": 0.557821279680986, "grad_norm": 0.844716489315033, "learning_rate": 4.997963421497157e-05, "loss": 0.5943, "step": 6155 }, { "epoch": 0.5582744245060721, "grad_norm": 0.8034050464630127, "learning_rate": 4.997931379540751e-05, "loss": 0.5716, "step": 6160 }, { "epoch": 0.5587275693311582, "grad_norm": 0.7732856273651123, "learning_rate": 4.9978990875905275e-05, "loss": 0.5965, "step": 6165 }, { "epoch": 0.5591807141562444, "grad_norm": 0.9358978271484375, "learning_rate": 4.9978665456497186e-05, "loss": 0.5824, "step": 6170 }, { "epoch": 0.5596338589813304, "grad_norm": 0.7660355567932129, "learning_rate": 4.997833753721582e-05, "loss": 0.5695, "step": 6175 }, { "epoch": 0.5600870038064165, "grad_norm": 0.8438649773597717, "learning_rate": 4.9978007118093985e-05, "loss": 0.6476, "step": 6180 }, { "epoch": 0.5605401486315026, "grad_norm": 0.9251261353492737, "learning_rate": 4.997767419916476e-05, "loss": 0.6448, "step": 6185 }, { "epoch": 0.5609932934565888, "grad_norm": 0.905017614364624, "learning_rate": 4.997733878046145e-05, "loss": 0.5736, "step": 6190 }, { "epoch": 0.5614464382816748, "grad_norm": 0.9597222805023193, "learning_rate": 4.997700086201763e-05, "loss": 0.565, "step": 6195 }, { "epoch": 0.5618995831067609, "grad_norm": 0.8659042119979858, "learning_rate": 4.997666044386713e-05, "loss": 0.5375, "step": 6200 }, { "epoch": 0.562352727931847, "grad_norm": 0.8243951201438904, "learning_rate": 4.9976317526044e-05, "loss": 0.6075, "step": 6205 }, { "epoch": 0.5628058727569332, "grad_norm": 0.8462655544281006, "learning_rate": 4.997597210858258e-05, "loss": 0.6045, "step": 6210 }, { "epoch": 0.5632590175820192, "grad_norm": 0.7989419102668762, "learning_rate": 4.997562419151742e-05, "loss": 0.541, "step": 6215 }, { "epoch": 0.5637121624071053, "grad_norm": 0.8749746084213257, "learning_rate": 4.997527377488335e-05, "loss": 0.5766, "step": 6220 }, { "epoch": 0.5641653072321914, "grad_norm": 0.9899231195449829, "learning_rate": 4.997492085871543e-05, "loss": 0.6053, "step": 6225 }, { "epoch": 0.5646184520572775, "grad_norm": 0.828517735004425, "learning_rate": 4.9974565443049e-05, "loss": 0.6262, "step": 6230 }, { "epoch": 0.5650715968823636, "grad_norm": 0.897578775882721, "learning_rate": 4.9974207527919606e-05, "loss": 0.606, "step": 6235 }, { "epoch": 0.5655247417074497, "grad_norm": 0.8317257761955261, "learning_rate": 4.997384711336309e-05, "loss": 0.6457, "step": 6240 }, { "epoch": 0.5659778865325358, "grad_norm": 0.8604244589805603, "learning_rate": 4.997348419941551e-05, "loss": 0.5845, "step": 6245 }, { "epoch": 0.5664310313576219, "grad_norm": 0.8945031762123108, "learning_rate": 4.9973118786113186e-05, "loss": 0.5893, "step": 6250 }, { "epoch": 0.566884176182708, "grad_norm": 0.9049243927001953, "learning_rate": 4.997275087349269e-05, "loss": 0.6046, "step": 6255 }, { "epoch": 0.5673373210077941, "grad_norm": 0.9405499696731567, "learning_rate": 4.997238046159085e-05, "loss": 0.6147, "step": 6260 }, { "epoch": 0.5677904658328802, "grad_norm": 0.8378004431724548, "learning_rate": 4.9972007550444735e-05, "loss": 0.5468, "step": 6265 }, { "epoch": 0.5682436106579662, "grad_norm": 0.8704628348350525, "learning_rate": 4.997163214009165e-05, "loss": 0.5531, "step": 6270 }, { "epoch": 0.5686967554830524, "grad_norm": 0.9143903851509094, "learning_rate": 4.9971254230569186e-05, "loss": 0.6378, "step": 6275 }, { "epoch": 0.5691499003081385, "grad_norm": 0.7574027180671692, "learning_rate": 4.997087382191516e-05, "loss": 0.6091, "step": 6280 }, { "epoch": 0.5696030451332246, "grad_norm": 0.7693238258361816, "learning_rate": 4.997049091416763e-05, "loss": 0.5695, "step": 6285 }, { "epoch": 0.5700561899583106, "grad_norm": 0.8315666317939758, "learning_rate": 4.997010550736494e-05, "loss": 0.5535, "step": 6290 }, { "epoch": 0.5705093347833968, "grad_norm": 0.823498010635376, "learning_rate": 4.9969717601545645e-05, "loss": 0.6172, "step": 6295 }, { "epoch": 0.5709624796084829, "grad_norm": 0.8946885466575623, "learning_rate": 4.996932719674857e-05, "loss": 0.6293, "step": 6300 }, { "epoch": 0.571415624433569, "grad_norm": 0.8482374548912048, "learning_rate": 4.9968934293012794e-05, "loss": 0.5797, "step": 6305 }, { "epoch": 0.571868769258655, "grad_norm": 0.7996541857719421, "learning_rate": 4.996853889037763e-05, "loss": 0.5837, "step": 6310 }, { "epoch": 0.5723219140837411, "grad_norm": 0.8814182877540588, "learning_rate": 4.996814098888265e-05, "loss": 0.5817, "step": 6315 }, { "epoch": 0.5727750589088273, "grad_norm": 0.9630857110023499, "learning_rate": 4.996774058856768e-05, "loss": 0.5757, "step": 6320 }, { "epoch": 0.5732282037339134, "grad_norm": 0.8072431683540344, "learning_rate": 4.996733768947279e-05, "loss": 0.626, "step": 6325 }, { "epoch": 0.5736813485589994, "grad_norm": 0.8341922163963318, "learning_rate": 4.9966932291638314e-05, "loss": 0.5898, "step": 6330 }, { "epoch": 0.5741344933840855, "grad_norm": 1.0413392782211304, "learning_rate": 4.996652439510481e-05, "loss": 0.5818, "step": 6335 }, { "epoch": 0.5745876382091717, "grad_norm": 0.7707074284553528, "learning_rate": 4.9966113999913095e-05, "loss": 0.5524, "step": 6340 }, { "epoch": 0.5750407830342578, "grad_norm": 0.8470082879066467, "learning_rate": 4.996570110610426e-05, "loss": 0.6145, "step": 6345 }, { "epoch": 0.5754939278593438, "grad_norm": 0.8214190602302551, "learning_rate": 4.996528571371961e-05, "loss": 0.6135, "step": 6350 }, { "epoch": 0.5759470726844299, "grad_norm": 0.8805350065231323, "learning_rate": 4.996486782280074e-05, "loss": 0.5416, "step": 6355 }, { "epoch": 0.5764002175095161, "grad_norm": 0.8886336088180542, "learning_rate": 4.996444743338945e-05, "loss": 0.5587, "step": 6360 }, { "epoch": 0.5768533623346022, "grad_norm": 0.9130271077156067, "learning_rate": 4.996402454552782e-05, "loss": 0.5564, "step": 6365 }, { "epoch": 0.5773065071596882, "grad_norm": 0.9343716502189636, "learning_rate": 4.996359915925818e-05, "loss": 0.6012, "step": 6370 }, { "epoch": 0.5777596519847743, "grad_norm": 0.7984656095504761, "learning_rate": 4.9963171274623095e-05, "loss": 0.5698, "step": 6375 }, { "epoch": 0.5782127968098605, "grad_norm": 0.9484997987747192, "learning_rate": 4.996274089166539e-05, "loss": 0.5666, "step": 6380 }, { "epoch": 0.5786659416349466, "grad_norm": 0.9021596908569336, "learning_rate": 4.996230801042814e-05, "loss": 0.5849, "step": 6385 }, { "epoch": 0.5791190864600326, "grad_norm": 0.9284070134162903, "learning_rate": 4.9961872630954656e-05, "loss": 0.5627, "step": 6390 }, { "epoch": 0.5795722312851187, "grad_norm": 0.989865779876709, "learning_rate": 4.996143475328853e-05, "loss": 0.6737, "step": 6395 }, { "epoch": 0.5800253761102048, "grad_norm": 0.8471856713294983, "learning_rate": 4.9960994377473574e-05, "loss": 0.5573, "step": 6400 }, { "epoch": 0.580478520935291, "grad_norm": 0.8236604928970337, "learning_rate": 4.996055150355387e-05, "loss": 0.4934, "step": 6405 }, { "epoch": 0.580931665760377, "grad_norm": 0.7831682562828064, "learning_rate": 4.996010613157371e-05, "loss": 0.5656, "step": 6410 }, { "epoch": 0.5813848105854631, "grad_norm": 0.8547714948654175, "learning_rate": 4.9959658261577713e-05, "loss": 0.5296, "step": 6415 }, { "epoch": 0.5818379554105492, "grad_norm": 0.8361198306083679, "learning_rate": 4.995920789361067e-05, "loss": 0.4935, "step": 6420 }, { "epoch": 0.5822911002356354, "grad_norm": 0.8527016043663025, "learning_rate": 4.995875502771767e-05, "loss": 0.5243, "step": 6425 }, { "epoch": 0.5827442450607214, "grad_norm": 0.9205556511878967, "learning_rate": 4.995829966394403e-05, "loss": 0.5593, "step": 6430 }, { "epoch": 0.5831973898858075, "grad_norm": 0.8904383182525635, "learning_rate": 4.9957841802335314e-05, "loss": 0.556, "step": 6435 }, { "epoch": 0.5836505347108936, "grad_norm": 0.9114331603050232, "learning_rate": 4.995738144293736e-05, "loss": 0.595, "step": 6440 }, { "epoch": 0.5841036795359797, "grad_norm": 1.1150643825531006, "learning_rate": 4.995691858579623e-05, "loss": 0.565, "step": 6445 }, { "epoch": 0.5845568243610658, "grad_norm": 0.8210616707801819, "learning_rate": 4.995645323095826e-05, "loss": 0.5816, "step": 6450 }, { "epoch": 0.5850099691861519, "grad_norm": 0.8190375566482544, "learning_rate": 4.995598537847001e-05, "loss": 0.5517, "step": 6455 }, { "epoch": 0.585463114011238, "grad_norm": 0.9511589407920837, "learning_rate": 4.9955515028378295e-05, "loss": 0.5357, "step": 6460 }, { "epoch": 0.5859162588363241, "grad_norm": 0.880591869354248, "learning_rate": 4.995504218073021e-05, "loss": 0.5786, "step": 6465 }, { "epoch": 0.5863694036614102, "grad_norm": 0.9036387801170349, "learning_rate": 4.995456683557307e-05, "loss": 0.5968, "step": 6470 }, { "epoch": 0.5868225484864963, "grad_norm": 0.915861964225769, "learning_rate": 4.995408899295444e-05, "loss": 0.5204, "step": 6475 }, { "epoch": 0.5872756933115824, "grad_norm": 0.8525407910346985, "learning_rate": 4.9953608652922145e-05, "loss": 0.5056, "step": 6480 }, { "epoch": 0.5877288381366684, "grad_norm": 0.8027551174163818, "learning_rate": 4.995312581552427e-05, "loss": 0.6093, "step": 6485 }, { "epoch": 0.5881819829617546, "grad_norm": 0.8742019534111023, "learning_rate": 4.995264048080912e-05, "loss": 0.6126, "step": 6490 }, { "epoch": 0.5886351277868407, "grad_norm": 0.8148497343063354, "learning_rate": 4.995215264882528e-05, "loss": 0.6044, "step": 6495 }, { "epoch": 0.5890882726119268, "grad_norm": 0.8175822496414185, "learning_rate": 4.995166231962157e-05, "loss": 0.6256, "step": 6500 }, { "epoch": 0.5895414174370128, "grad_norm": 0.8195366859436035, "learning_rate": 4.995116949324705e-05, "loss": 0.5095, "step": 6505 }, { "epoch": 0.589994562262099, "grad_norm": 0.9268137216567993, "learning_rate": 4.9950674169751066e-05, "loss": 0.6217, "step": 6510 }, { "epoch": 0.5904477070871851, "grad_norm": 0.8471976518630981, "learning_rate": 4.995017634918316e-05, "loss": 0.5464, "step": 6515 }, { "epoch": 0.5909008519122712, "grad_norm": 0.9009429216384888, "learning_rate": 4.9949676031593185e-05, "loss": 0.5967, "step": 6520 }, { "epoch": 0.5913539967373572, "grad_norm": 0.8808012008666992, "learning_rate": 4.99491732170312e-05, "loss": 0.5642, "step": 6525 }, { "epoch": 0.5918071415624434, "grad_norm": 0.8684262633323669, "learning_rate": 4.9948667905547516e-05, "loss": 0.5897, "step": 6530 }, { "epoch": 0.5922602863875295, "grad_norm": 0.9343585968017578, "learning_rate": 4.9948160097192715e-05, "loss": 0.6068, "step": 6535 }, { "epoch": 0.5927134312126155, "grad_norm": 0.780870795249939, "learning_rate": 4.994764979201762e-05, "loss": 0.5808, "step": 6540 }, { "epoch": 0.5931665760377016, "grad_norm": 0.8186470866203308, "learning_rate": 4.99471369900733e-05, "loss": 0.5817, "step": 6545 }, { "epoch": 0.5936197208627877, "grad_norm": 0.7913421988487244, "learning_rate": 4.994662169141108e-05, "loss": 0.5739, "step": 6550 }, { "epoch": 0.5940728656878739, "grad_norm": 0.8863900303840637, "learning_rate": 4.994610389608252e-05, "loss": 0.5635, "step": 6555 }, { "epoch": 0.59452601051296, "grad_norm": 0.8980908989906311, "learning_rate": 4.994558360413946e-05, "loss": 0.6235, "step": 6560 }, { "epoch": 0.594979155338046, "grad_norm": 0.9709354043006897, "learning_rate": 4.9945060815633945e-05, "loss": 0.6254, "step": 6565 }, { "epoch": 0.5954323001631321, "grad_norm": 0.8629058003425598, "learning_rate": 4.994453553061832e-05, "loss": 0.5424, "step": 6570 }, { "epoch": 0.5958854449882183, "grad_norm": 0.8002598881721497, "learning_rate": 4.994400774914515e-05, "loss": 0.5123, "step": 6575 }, { "epoch": 0.5963385898133043, "grad_norm": 0.8849971890449524, "learning_rate": 4.994347747126725e-05, "loss": 0.5873, "step": 6580 }, { "epoch": 0.5967917346383904, "grad_norm": 0.8414378762245178, "learning_rate": 4.994294469703768e-05, "loss": 0.5911, "step": 6585 }, { "epoch": 0.5972448794634765, "grad_norm": 0.8607784509658813, "learning_rate": 4.994240942650979e-05, "loss": 0.562, "step": 6590 }, { "epoch": 0.5976980242885627, "grad_norm": 0.8543559312820435, "learning_rate": 4.994187165973713e-05, "loss": 0.5771, "step": 6595 }, { "epoch": 0.5981511691136487, "grad_norm": 0.8695547580718994, "learning_rate": 4.9941331396773516e-05, "loss": 0.6034, "step": 6600 }, { "epoch": 0.5986043139387348, "grad_norm": 0.8681554794311523, "learning_rate": 4.994078863767302e-05, "loss": 0.5666, "step": 6605 }, { "epoch": 0.5990574587638209, "grad_norm": 0.9288983941078186, "learning_rate": 4.994024338248998e-05, "loss": 0.5886, "step": 6610 }, { "epoch": 0.5995106035889071, "grad_norm": 0.8713579773902893, "learning_rate": 4.993969563127894e-05, "loss": 0.5855, "step": 6615 }, { "epoch": 0.5999637484139931, "grad_norm": 0.9214992523193359, "learning_rate": 4.9939145384094734e-05, "loss": 0.6021, "step": 6620 }, { "epoch": 0.6004168932390792, "grad_norm": 0.8197630643844604, "learning_rate": 4.9938592640992435e-05, "loss": 0.5185, "step": 6625 }, { "epoch": 0.6008700380641653, "grad_norm": 0.85880047082901, "learning_rate": 4.9938037402027346e-05, "loss": 0.5725, "step": 6630 }, { "epoch": 0.6013231828892514, "grad_norm": 0.8879333734512329, "learning_rate": 4.9937479667255046e-05, "loss": 0.5639, "step": 6635 }, { "epoch": 0.6017763277143375, "grad_norm": 0.8843153715133667, "learning_rate": 4.9936919436731345e-05, "loss": 0.563, "step": 6640 }, { "epoch": 0.6022294725394236, "grad_norm": 0.9157406687736511, "learning_rate": 4.993635671051232e-05, "loss": 0.5653, "step": 6645 }, { "epoch": 0.6026826173645097, "grad_norm": 0.9097387194633484, "learning_rate": 4.993579148865429e-05, "loss": 0.551, "step": 6650 }, { "epoch": 0.6031357621895957, "grad_norm": 0.8540716767311096, "learning_rate": 4.9935223771213814e-05, "loss": 0.5376, "step": 6655 }, { "epoch": 0.6035889070146819, "grad_norm": 0.9346873164176941, "learning_rate": 4.9934653558247716e-05, "loss": 0.5621, "step": 6660 }, { "epoch": 0.604042051839768, "grad_norm": 0.7676681280136108, "learning_rate": 4.993408084981306e-05, "loss": 0.5625, "step": 6665 }, { "epoch": 0.6044951966648541, "grad_norm": 1.0233432054519653, "learning_rate": 4.993350564596716e-05, "loss": 0.5269, "step": 6670 }, { "epoch": 0.6049483414899401, "grad_norm": 0.849890947341919, "learning_rate": 4.9932927946767585e-05, "loss": 0.5547, "step": 6675 }, { "epoch": 0.6054014863150263, "grad_norm": 0.8642697930335999, "learning_rate": 4.993234775227216e-05, "loss": 0.5463, "step": 6680 }, { "epoch": 0.6058546311401124, "grad_norm": 0.8940352201461792, "learning_rate": 4.9931765062538936e-05, "loss": 0.5437, "step": 6685 }, { "epoch": 0.6063077759651985, "grad_norm": 0.8582884073257446, "learning_rate": 4.993117987762624e-05, "loss": 0.5416, "step": 6690 }, { "epoch": 0.6067609207902845, "grad_norm": 0.8087893724441528, "learning_rate": 4.993059219759264e-05, "loss": 0.5419, "step": 6695 }, { "epoch": 0.6072140656153707, "grad_norm": 0.8779772520065308, "learning_rate": 4.9930002022496934e-05, "loss": 0.5742, "step": 6700 }, { "epoch": 0.6076672104404568, "grad_norm": 0.7973787784576416, "learning_rate": 4.9929409352398206e-05, "loss": 0.547, "step": 6705 }, { "epoch": 0.6081203552655429, "grad_norm": 0.8978163003921509, "learning_rate": 4.992881418735577e-05, "loss": 0.5736, "step": 6710 }, { "epoch": 0.6085735000906289, "grad_norm": 0.9105599522590637, "learning_rate": 4.992821652742916e-05, "loss": 0.5318, "step": 6715 }, { "epoch": 0.609026644915715, "grad_norm": 0.898473858833313, "learning_rate": 4.992761637267823e-05, "loss": 0.6156, "step": 6720 }, { "epoch": 0.6094797897408012, "grad_norm": 0.8775843381881714, "learning_rate": 4.9927013723163026e-05, "loss": 0.5771, "step": 6725 }, { "epoch": 0.6099329345658873, "grad_norm": 0.8274092674255371, "learning_rate": 4.9926408578943854e-05, "loss": 0.5871, "step": 6730 }, { "epoch": 0.6103860793909733, "grad_norm": 0.914863646030426, "learning_rate": 4.9925800940081294e-05, "loss": 0.5466, "step": 6735 }, { "epoch": 0.6108392242160594, "grad_norm": 0.8215007781982422, "learning_rate": 4.9925190806636144e-05, "loss": 0.6125, "step": 6740 }, { "epoch": 0.6112923690411456, "grad_norm": 0.8392397165298462, "learning_rate": 4.992457817866947e-05, "loss": 0.5593, "step": 6745 }, { "epoch": 0.6117455138662317, "grad_norm": 0.8775562644004822, "learning_rate": 4.992396305624259e-05, "loss": 0.552, "step": 6750 }, { "epoch": 0.6121986586913177, "grad_norm": 0.8794897794723511, "learning_rate": 4.9923345439417064e-05, "loss": 0.5093, "step": 6755 }, { "epoch": 0.6126518035164038, "grad_norm": 0.8509501218795776, "learning_rate": 4.9922725328254694e-05, "loss": 0.4927, "step": 6760 }, { "epoch": 0.61310494834149, "grad_norm": 0.8217429518699646, "learning_rate": 4.992210272281756e-05, "loss": 0.6052, "step": 6765 }, { "epoch": 0.6135580931665761, "grad_norm": 0.884706974029541, "learning_rate": 4.992147762316794e-05, "loss": 0.5948, "step": 6770 }, { "epoch": 0.6140112379916621, "grad_norm": 0.839270293712616, "learning_rate": 4.9920850029368436e-05, "loss": 0.5827, "step": 6775 }, { "epoch": 0.6144643828167482, "grad_norm": 0.8579437732696533, "learning_rate": 4.9920219941481826e-05, "loss": 0.5586, "step": 6780 }, { "epoch": 0.6149175276418344, "grad_norm": 0.7880479693412781, "learning_rate": 4.9919587359571185e-05, "loss": 0.5356, "step": 6785 }, { "epoch": 0.6153706724669205, "grad_norm": 0.7755650877952576, "learning_rate": 4.991895228369982e-05, "loss": 0.5055, "step": 6790 }, { "epoch": 0.6158238172920065, "grad_norm": 0.807279646396637, "learning_rate": 4.991831471393127e-05, "loss": 0.5703, "step": 6795 }, { "epoch": 0.6162769621170926, "grad_norm": 0.9164748787879944, "learning_rate": 4.991767465032937e-05, "loss": 0.5758, "step": 6800 }, { "epoch": 0.6167301069421787, "grad_norm": 0.9369795918464661, "learning_rate": 4.991703209295817e-05, "loss": 0.57, "step": 6805 }, { "epoch": 0.6171832517672649, "grad_norm": 0.8669858574867249, "learning_rate": 4.991638704188197e-05, "loss": 0.5321, "step": 6810 }, { "epoch": 0.6176363965923509, "grad_norm": 0.924586832523346, "learning_rate": 4.991573949716534e-05, "loss": 0.6, "step": 6815 }, { "epoch": 0.618089541417437, "grad_norm": 0.7919301390647888, "learning_rate": 4.991508945887307e-05, "loss": 0.6101, "step": 6820 }, { "epoch": 0.6185426862425231, "grad_norm": 0.8779674172401428, "learning_rate": 4.991443692707023e-05, "loss": 0.5825, "step": 6825 }, { "epoch": 0.6189958310676092, "grad_norm": 1.016897201538086, "learning_rate": 4.991378190182212e-05, "loss": 0.6115, "step": 6830 }, { "epoch": 0.6194489758926953, "grad_norm": 0.9096161127090454, "learning_rate": 4.991312438319429e-05, "loss": 0.6058, "step": 6835 }, { "epoch": 0.6199021207177814, "grad_norm": 0.7844492197036743, "learning_rate": 4.991246437125256e-05, "loss": 0.5312, "step": 6840 }, { "epoch": 0.6203552655428675, "grad_norm": 0.9149840474128723, "learning_rate": 4.991180186606296e-05, "loss": 0.5736, "step": 6845 }, { "epoch": 0.6208084103679536, "grad_norm": 0.8260831236839294, "learning_rate": 4.991113686769181e-05, "loss": 0.6085, "step": 6850 }, { "epoch": 0.6212615551930397, "grad_norm": 0.8137637972831726, "learning_rate": 4.991046937620567e-05, "loss": 0.5538, "step": 6855 }, { "epoch": 0.6217147000181258, "grad_norm": 0.9874271154403687, "learning_rate": 4.990979939167133e-05, "loss": 0.5996, "step": 6860 }, { "epoch": 0.6221678448432119, "grad_norm": 0.8838761448860168, "learning_rate": 4.990912691415585e-05, "loss": 0.5488, "step": 6865 }, { "epoch": 0.622620989668298, "grad_norm": 0.8533156514167786, "learning_rate": 4.990845194372652e-05, "loss": 0.5762, "step": 6870 }, { "epoch": 0.6230741344933841, "grad_norm": 0.8561303615570068, "learning_rate": 4.99077744804509e-05, "loss": 0.6073, "step": 6875 }, { "epoch": 0.6235272793184702, "grad_norm": 0.7812048196792603, "learning_rate": 4.990709452439679e-05, "loss": 0.5832, "step": 6880 }, { "epoch": 0.6239804241435563, "grad_norm": 0.81035977602005, "learning_rate": 4.990641207563225e-05, "loss": 0.5401, "step": 6885 }, { "epoch": 0.6244335689686423, "grad_norm": 0.8408607244491577, "learning_rate": 4.9905727134225557e-05, "loss": 0.5094, "step": 6890 }, { "epoch": 0.6248867137937285, "grad_norm": 0.8422982096672058, "learning_rate": 4.990503970024528e-05, "loss": 0.6141, "step": 6895 }, { "epoch": 0.6253398586188146, "grad_norm": 0.8354423642158508, "learning_rate": 4.99043497737602e-05, "loss": 0.5434, "step": 6900 }, { "epoch": 0.6257930034439007, "grad_norm": 0.8520204424858093, "learning_rate": 4.990365735483939e-05, "loss": 0.5249, "step": 6905 }, { "epoch": 0.6262461482689867, "grad_norm": 0.8098015189170837, "learning_rate": 4.9902962443552126e-05, "loss": 0.5059, "step": 6910 }, { "epoch": 0.6266992930940729, "grad_norm": 0.8633955717086792, "learning_rate": 4.9902265039967964e-05, "loss": 0.5331, "step": 6915 }, { "epoch": 0.627152437919159, "grad_norm": 0.8242766261100769, "learning_rate": 4.99015651441567e-05, "loss": 0.53, "step": 6920 }, { "epoch": 0.627605582744245, "grad_norm": 0.8483250737190247, "learning_rate": 4.9900862756188384e-05, "loss": 0.5517, "step": 6925 }, { "epoch": 0.6280587275693311, "grad_norm": 0.8873424530029297, "learning_rate": 4.990015787613329e-05, "loss": 0.4947, "step": 6930 }, { "epoch": 0.6285118723944173, "grad_norm": 1.0606542825698853, "learning_rate": 4.989945050406199e-05, "loss": 0.5845, "step": 6935 }, { "epoch": 0.6289650172195034, "grad_norm": 0.8241831064224243, "learning_rate": 4.989874064004526e-05, "loss": 0.4939, "step": 6940 }, { "epoch": 0.6294181620445894, "grad_norm": 0.7517449259757996, "learning_rate": 4.9898028284154155e-05, "loss": 0.5837, "step": 6945 }, { "epoch": 0.6298713068696755, "grad_norm": 0.8297584056854248, "learning_rate": 4.989731343645996e-05, "loss": 0.523, "step": 6950 }, { "epoch": 0.6303244516947617, "grad_norm": 0.8464785218238831, "learning_rate": 4.989659609703422e-05, "loss": 0.5361, "step": 6955 }, { "epoch": 0.6307775965198478, "grad_norm": 0.8313464522361755, "learning_rate": 4.989587626594873e-05, "loss": 0.553, "step": 6960 }, { "epoch": 0.6312307413449338, "grad_norm": 0.7791213393211365, "learning_rate": 4.989515394327552e-05, "loss": 0.5291, "step": 6965 }, { "epoch": 0.6316838861700199, "grad_norm": 0.971520185470581, "learning_rate": 4.989442912908689e-05, "loss": 0.5774, "step": 6970 }, { "epoch": 0.632137030995106, "grad_norm": 0.8708187937736511, "learning_rate": 4.989370182345538e-05, "loss": 0.5587, "step": 6975 }, { "epoch": 0.6325901758201922, "grad_norm": 0.8335626125335693, "learning_rate": 4.9892972026453776e-05, "loss": 0.5897, "step": 6980 }, { "epoch": 0.6330433206452782, "grad_norm": 0.8852291703224182, "learning_rate": 4.989223973815511e-05, "loss": 0.5279, "step": 6985 }, { "epoch": 0.6334964654703643, "grad_norm": 0.8380326628684998, "learning_rate": 4.989150495863268e-05, "loss": 0.5838, "step": 6990 }, { "epoch": 0.6339496102954504, "grad_norm": 0.88983553647995, "learning_rate": 4.989076768796003e-05, "loss": 0.5232, "step": 6995 }, { "epoch": 0.6344027551205366, "grad_norm": 0.825962483882904, "learning_rate": 4.989002792621092e-05, "loss": 0.5399, "step": 7000 }, { "epoch": 0.6348558999456226, "grad_norm": 0.8607646822929382, "learning_rate": 4.9889285673459405e-05, "loss": 0.5816, "step": 7005 }, { "epoch": 0.6353090447707087, "grad_norm": 0.7781727910041809, "learning_rate": 4.988854092977977e-05, "loss": 0.5477, "step": 7010 }, { "epoch": 0.6357621895957948, "grad_norm": 0.8745645880699158, "learning_rate": 4.988779369524654e-05, "loss": 0.5509, "step": 7015 }, { "epoch": 0.636215334420881, "grad_norm": 0.970353364944458, "learning_rate": 4.988704396993451e-05, "loss": 0.5379, "step": 7020 }, { "epoch": 0.636668479245967, "grad_norm": 0.8139991164207458, "learning_rate": 4.98862917539187e-05, "loss": 0.5411, "step": 7025 }, { "epoch": 0.6371216240710531, "grad_norm": 0.8960239887237549, "learning_rate": 4.98855370472744e-05, "loss": 0.5169, "step": 7030 }, { "epoch": 0.6375747688961392, "grad_norm": 0.9172124266624451, "learning_rate": 4.9884779850077146e-05, "loss": 0.5599, "step": 7035 }, { "epoch": 0.6380279137212252, "grad_norm": 0.8401117920875549, "learning_rate": 4.988402016240271e-05, "loss": 0.5509, "step": 7040 }, { "epoch": 0.6384810585463114, "grad_norm": 0.8293363451957703, "learning_rate": 4.988325798432712e-05, "loss": 0.5518, "step": 7045 }, { "epoch": 0.6389342033713975, "grad_norm": 0.8921065330505371, "learning_rate": 4.988249331592666e-05, "loss": 0.5875, "step": 7050 }, { "epoch": 0.6393873481964836, "grad_norm": 0.8582276105880737, "learning_rate": 4.988172615727786e-05, "loss": 0.5691, "step": 7055 }, { "epoch": 0.6398404930215696, "grad_norm": 0.8065584301948547, "learning_rate": 4.988095650845749e-05, "loss": 0.5285, "step": 7060 }, { "epoch": 0.6402936378466558, "grad_norm": 0.8339735865592957, "learning_rate": 4.9880184369542586e-05, "loss": 0.5271, "step": 7065 }, { "epoch": 0.6407467826717419, "grad_norm": 0.765916645526886, "learning_rate": 4.987940974061041e-05, "loss": 0.55, "step": 7070 }, { "epoch": 0.641199927496828, "grad_norm": 0.9416107535362244, "learning_rate": 4.987863262173852e-05, "loss": 0.5754, "step": 7075 }, { "epoch": 0.641653072321914, "grad_norm": 0.8296758532524109, "learning_rate": 4.987785301300465e-05, "loss": 0.5669, "step": 7080 }, { "epoch": 0.6421062171470002, "grad_norm": 0.9372495412826538, "learning_rate": 4.987707091448684e-05, "loss": 0.6511, "step": 7085 }, { "epoch": 0.6425593619720863, "grad_norm": 0.8965403437614441, "learning_rate": 4.9876286326263367e-05, "loss": 0.5129, "step": 7090 }, { "epoch": 0.6430125067971724, "grad_norm": 0.9660283327102661, "learning_rate": 4.987549924841275e-05, "loss": 0.5223, "step": 7095 }, { "epoch": 0.6434656516222584, "grad_norm": 0.8972845077514648, "learning_rate": 4.9874709681013754e-05, "loss": 0.5624, "step": 7100 }, { "epoch": 0.6439187964473446, "grad_norm": 0.8372393846511841, "learning_rate": 4.9873917624145406e-05, "loss": 0.6004, "step": 7105 }, { "epoch": 0.6443719412724307, "grad_norm": 0.8859729766845703, "learning_rate": 4.9873123077886975e-05, "loss": 0.5463, "step": 7110 }, { "epoch": 0.6448250860975168, "grad_norm": 0.8248586654663086, "learning_rate": 4.9872326042317974e-05, "loss": 0.5554, "step": 7115 }, { "epoch": 0.6452782309226028, "grad_norm": 0.8535829186439514, "learning_rate": 4.987152651751818e-05, "loss": 0.6044, "step": 7120 }, { "epoch": 0.6457313757476889, "grad_norm": 0.8958288431167603, "learning_rate": 4.98707245035676e-05, "loss": 0.5483, "step": 7125 }, { "epoch": 0.6461845205727751, "grad_norm": 0.8280274868011475, "learning_rate": 4.986992000054651e-05, "loss": 0.5975, "step": 7130 }, { "epoch": 0.6466376653978612, "grad_norm": 0.870841920375824, "learning_rate": 4.986911300853542e-05, "loss": 0.5246, "step": 7135 }, { "epoch": 0.6470908102229472, "grad_norm": 0.9460265636444092, "learning_rate": 4.986830352761508e-05, "loss": 0.5712, "step": 7140 }, { "epoch": 0.6475439550480333, "grad_norm": 0.8494751453399658, "learning_rate": 4.986749155786653e-05, "loss": 0.6367, "step": 7145 }, { "epoch": 0.6479970998731195, "grad_norm": 0.8472420573234558, "learning_rate": 4.9866677099371004e-05, "loss": 0.5381, "step": 7150 }, { "epoch": 0.6484502446982056, "grad_norm": 0.7588494420051575, "learning_rate": 4.986586015221004e-05, "loss": 0.5172, "step": 7155 }, { "epoch": 0.6489033895232916, "grad_norm": 0.8194259405136108, "learning_rate": 4.986504071646538e-05, "loss": 0.5848, "step": 7160 }, { "epoch": 0.6493565343483777, "grad_norm": 0.8564736247062683, "learning_rate": 4.9864218792219045e-05, "loss": 0.5463, "step": 7165 }, { "epoch": 0.6498096791734639, "grad_norm": 0.9176914095878601, "learning_rate": 4.9863394379553286e-05, "loss": 0.5474, "step": 7170 }, { "epoch": 0.65026282399855, "grad_norm": 0.9599884152412415, "learning_rate": 4.986256747855061e-05, "loss": 0.5807, "step": 7175 }, { "epoch": 0.650715968823636, "grad_norm": 0.8484050631523132, "learning_rate": 4.986173808929378e-05, "loss": 0.5228, "step": 7180 }, { "epoch": 0.6511691136487221, "grad_norm": 0.894356906414032, "learning_rate": 4.98609062118658e-05, "loss": 0.5557, "step": 7185 }, { "epoch": 0.6516222584738083, "grad_norm": 0.8620449900627136, "learning_rate": 4.9860071846349915e-05, "loss": 0.5115, "step": 7190 }, { "epoch": 0.6520754032988944, "grad_norm": 1.0637134313583374, "learning_rate": 4.985923499282964e-05, "loss": 0.6285, "step": 7195 }, { "epoch": 0.6525285481239804, "grad_norm": 0.9059747457504272, "learning_rate": 4.9858395651388716e-05, "loss": 0.5826, "step": 7200 }, { "epoch": 0.6529816929490665, "grad_norm": 0.8058026432991028, "learning_rate": 4.985755382211116e-05, "loss": 0.5793, "step": 7205 }, { "epoch": 0.6534348377741526, "grad_norm": 0.7815763354301453, "learning_rate": 4.985670950508121e-05, "loss": 0.5164, "step": 7210 }, { "epoch": 0.6538879825992387, "grad_norm": 0.8104599118232727, "learning_rate": 4.985586270038337e-05, "loss": 0.5302, "step": 7215 }, { "epoch": 0.6543411274243248, "grad_norm": 0.8938913941383362, "learning_rate": 4.985501340810239e-05, "loss": 0.5646, "step": 7220 }, { "epoch": 0.6547942722494109, "grad_norm": 0.8695154190063477, "learning_rate": 4.985416162832327e-05, "loss": 0.4902, "step": 7225 }, { "epoch": 0.655247417074497, "grad_norm": 0.8517284989356995, "learning_rate": 4.985330736113124e-05, "loss": 0.5284, "step": 7230 }, { "epoch": 0.6557005618995831, "grad_norm": 0.7923974394798279, "learning_rate": 4.985245060661182e-05, "loss": 0.5548, "step": 7235 }, { "epoch": 0.6561537067246692, "grad_norm": 0.8878098130226135, "learning_rate": 4.985159136485074e-05, "loss": 0.5335, "step": 7240 }, { "epoch": 0.6566068515497553, "grad_norm": 0.8980861902236938, "learning_rate": 4.985072963593399e-05, "loss": 0.5934, "step": 7245 }, { "epoch": 0.6570599963748414, "grad_norm": 1.007890224456787, "learning_rate": 4.984986541994782e-05, "loss": 0.5893, "step": 7250 }, { "epoch": 0.6575131411999275, "grad_norm": 0.8520410060882568, "learning_rate": 4.984899871697872e-05, "loss": 0.5216, "step": 7255 }, { "epoch": 0.6579662860250136, "grad_norm": 0.8556172251701355, "learning_rate": 4.984812952711342e-05, "loss": 0.5454, "step": 7260 }, { "epoch": 0.6584194308500997, "grad_norm": 0.8773563504219055, "learning_rate": 4.984725785043892e-05, "loss": 0.6591, "step": 7265 }, { "epoch": 0.6588725756751858, "grad_norm": 0.8674762845039368, "learning_rate": 4.984638368704246e-05, "loss": 0.5396, "step": 7270 }, { "epoch": 0.6593257205002719, "grad_norm": 0.8886318206787109, "learning_rate": 4.984550703701152e-05, "loss": 0.5352, "step": 7275 }, { "epoch": 0.659778865325358, "grad_norm": 0.8625372052192688, "learning_rate": 4.9844627900433835e-05, "loss": 0.5902, "step": 7280 }, { "epoch": 0.6602320101504441, "grad_norm": 0.850202202796936, "learning_rate": 4.98437462773974e-05, "loss": 0.5763, "step": 7285 }, { "epoch": 0.6606851549755302, "grad_norm": 0.8192672729492188, "learning_rate": 4.9842862167990424e-05, "loss": 0.5707, "step": 7290 }, { "epoch": 0.6611382998006162, "grad_norm": 0.8149262070655823, "learning_rate": 4.9841975572301416e-05, "loss": 0.521, "step": 7295 }, { "epoch": 0.6615914446257024, "grad_norm": 0.9168290495872498, "learning_rate": 4.984108649041909e-05, "loss": 0.5673, "step": 7300 }, { "epoch": 0.6620445894507885, "grad_norm": 0.8526946306228638, "learning_rate": 4.9840194922432427e-05, "loss": 0.5825, "step": 7305 }, { "epoch": 0.6624977342758746, "grad_norm": 0.8222017884254456, "learning_rate": 4.983930086843066e-05, "loss": 0.5505, "step": 7310 }, { "epoch": 0.6629508791009606, "grad_norm": 0.9359790682792664, "learning_rate": 4.983840432850327e-05, "loss": 0.5558, "step": 7315 }, { "epoch": 0.6634040239260468, "grad_norm": 0.8627228736877441, "learning_rate": 4.9837505302739976e-05, "loss": 0.5619, "step": 7320 }, { "epoch": 0.6638571687511329, "grad_norm": 0.9661971926689148, "learning_rate": 4.9836603791230754e-05, "loss": 0.5788, "step": 7325 }, { "epoch": 0.664310313576219, "grad_norm": 0.8417542576789856, "learning_rate": 4.983569979406583e-05, "loss": 0.5545, "step": 7330 }, { "epoch": 0.664763458401305, "grad_norm": 0.9259768128395081, "learning_rate": 4.983479331133567e-05, "loss": 0.5751, "step": 7335 }, { "epoch": 0.6652166032263912, "grad_norm": 0.873826801776886, "learning_rate": 4.983388434313101e-05, "loss": 0.6195, "step": 7340 }, { "epoch": 0.6656697480514773, "grad_norm": 0.7890362739562988, "learning_rate": 4.98329728895428e-05, "loss": 0.589, "step": 7345 }, { "epoch": 0.6661228928765633, "grad_norm": 0.8883289098739624, "learning_rate": 4.983205895066228e-05, "loss": 0.5226, "step": 7350 }, { "epoch": 0.6665760377016494, "grad_norm": 0.8650939464569092, "learning_rate": 4.983114252658089e-05, "loss": 0.5802, "step": 7355 }, { "epoch": 0.6670291825267356, "grad_norm": 0.77011638879776, "learning_rate": 4.983022361739037e-05, "loss": 0.4683, "step": 7360 }, { "epoch": 0.6674823273518217, "grad_norm": 0.8307809829711914, "learning_rate": 4.982930222318267e-05, "loss": 0.4825, "step": 7365 }, { "epoch": 0.6679354721769077, "grad_norm": 0.8753910660743713, "learning_rate": 4.9828378344050016e-05, "loss": 0.5874, "step": 7370 }, { "epoch": 0.6683886170019938, "grad_norm": 0.7897202968597412, "learning_rate": 4.982745198008487e-05, "loss": 0.5497, "step": 7375 }, { "epoch": 0.6688417618270799, "grad_norm": 0.8410820364952087, "learning_rate": 4.982652313137993e-05, "loss": 0.5636, "step": 7380 }, { "epoch": 0.6692949066521661, "grad_norm": 0.8031216859817505, "learning_rate": 4.982559179802816e-05, "loss": 0.5631, "step": 7385 }, { "epoch": 0.6697480514772521, "grad_norm": 0.8360608816146851, "learning_rate": 4.982465798012278e-05, "loss": 0.5511, "step": 7390 }, { "epoch": 0.6702011963023382, "grad_norm": 0.949295163154602, "learning_rate": 4.982372167775723e-05, "loss": 0.5178, "step": 7395 }, { "epoch": 0.6706543411274243, "grad_norm": 0.8872372508049011, "learning_rate": 4.982278289102522e-05, "loss": 0.4931, "step": 7400 }, { "epoch": 0.6711074859525105, "grad_norm": 0.8021174669265747, "learning_rate": 4.9821841620020714e-05, "loss": 0.5354, "step": 7405 }, { "epoch": 0.6715606307775965, "grad_norm": 0.8133381605148315, "learning_rate": 4.9820897864837906e-05, "loss": 0.5164, "step": 7410 }, { "epoch": 0.6720137756026826, "grad_norm": 0.8343533277511597, "learning_rate": 4.981995162557125e-05, "loss": 0.5622, "step": 7415 }, { "epoch": 0.6724669204277687, "grad_norm": 0.9505667090415955, "learning_rate": 4.981900290231544e-05, "loss": 0.5415, "step": 7420 }, { "epoch": 0.6729200652528549, "grad_norm": 0.9523220658302307, "learning_rate": 4.981805169516544e-05, "loss": 0.5531, "step": 7425 }, { "epoch": 0.6733732100779409, "grad_norm": 0.816318690776825, "learning_rate": 4.9817098004216424e-05, "loss": 0.5146, "step": 7430 }, { "epoch": 0.673826354903027, "grad_norm": 1.0005453824996948, "learning_rate": 4.981614182956385e-05, "loss": 0.5708, "step": 7435 }, { "epoch": 0.6742794997281131, "grad_norm": 0.9033327698707581, "learning_rate": 4.981518317130342e-05, "loss": 0.5874, "step": 7440 }, { "epoch": 0.6747326445531993, "grad_norm": 0.8631378412246704, "learning_rate": 4.981422202953107e-05, "loss": 0.5085, "step": 7445 }, { "epoch": 0.6751857893782853, "grad_norm": 0.8004341125488281, "learning_rate": 4.981325840434298e-05, "loss": 0.5298, "step": 7450 }, { "epoch": 0.6756389342033714, "grad_norm": 0.7629010081291199, "learning_rate": 4.981229229583561e-05, "loss": 0.5247, "step": 7455 }, { "epoch": 0.6760920790284575, "grad_norm": 0.8460348844528198, "learning_rate": 4.9811323704105636e-05, "loss": 0.5082, "step": 7460 }, { "epoch": 0.6765452238535435, "grad_norm": 0.8838338255882263, "learning_rate": 4.9810352629250004e-05, "loss": 0.5699, "step": 7465 }, { "epoch": 0.6769983686786297, "grad_norm": 0.8321059942245483, "learning_rate": 4.9809379071365894e-05, "loss": 0.5502, "step": 7470 }, { "epoch": 0.6774515135037158, "grad_norm": 0.9196350574493408, "learning_rate": 4.9808403030550734e-05, "loss": 0.5638, "step": 7475 }, { "epoch": 0.6779046583288019, "grad_norm": 0.8407566547393799, "learning_rate": 4.980742450690221e-05, "loss": 0.5785, "step": 7480 }, { "epoch": 0.6783578031538879, "grad_norm": 0.7795018553733826, "learning_rate": 4.980644350051826e-05, "loss": 0.5111, "step": 7485 }, { "epoch": 0.6788109479789741, "grad_norm": 0.8450911641120911, "learning_rate": 4.9805460011497056e-05, "loss": 0.548, "step": 7490 }, { "epoch": 0.6792640928040602, "grad_norm": 0.8913778066635132, "learning_rate": 4.980447403993703e-05, "loss": 0.5201, "step": 7495 }, { "epoch": 0.6797172376291463, "grad_norm": 0.8530367612838745, "learning_rate": 4.980348558593686e-05, "loss": 0.5116, "step": 7500 }, { "epoch": 0.6801703824542323, "grad_norm": 0.868003785610199, "learning_rate": 4.980249464959546e-05, "loss": 0.5868, "step": 7505 }, { "epoch": 0.6806235272793185, "grad_norm": 0.7942471504211426, "learning_rate": 4.980150123101202e-05, "loss": 0.5147, "step": 7510 }, { "epoch": 0.6810766721044046, "grad_norm": 0.8009514808654785, "learning_rate": 4.980050533028595e-05, "loss": 0.5835, "step": 7515 }, { "epoch": 0.6815298169294907, "grad_norm": 1.0409746170043945, "learning_rate": 4.979950694751692e-05, "loss": 0.5482, "step": 7520 }, { "epoch": 0.6819829617545767, "grad_norm": 0.8060599565505981, "learning_rate": 4.9798506082804854e-05, "loss": 0.4706, "step": 7525 }, { "epoch": 0.6824361065796629, "grad_norm": 0.9345548152923584, "learning_rate": 4.979750273624991e-05, "loss": 0.5327, "step": 7530 }, { "epoch": 0.682889251404749, "grad_norm": 0.893804132938385, "learning_rate": 4.979649690795252e-05, "loss": 0.5489, "step": 7535 }, { "epoch": 0.6833423962298351, "grad_norm": 0.8983023762702942, "learning_rate": 4.9795488598013326e-05, "loss": 0.5664, "step": 7540 }, { "epoch": 0.6837955410549211, "grad_norm": 0.9585543274879456, "learning_rate": 4.9794477806533256e-05, "loss": 0.5267, "step": 7545 }, { "epoch": 0.6842486858800072, "grad_norm": 0.9736562967300415, "learning_rate": 4.979346453361346e-05, "loss": 0.6063, "step": 7550 }, { "epoch": 0.6847018307050934, "grad_norm": 0.8751676082611084, "learning_rate": 4.979244877935536e-05, "loss": 0.5832, "step": 7555 }, { "epoch": 0.6851549755301795, "grad_norm": 0.8590703010559082, "learning_rate": 4.97914305438606e-05, "loss": 0.547, "step": 7560 }, { "epoch": 0.6856081203552655, "grad_norm": 0.8469277620315552, "learning_rate": 4.9790409827231094e-05, "loss": 0.5453, "step": 7565 }, { "epoch": 0.6860612651803516, "grad_norm": 0.7933079600334167, "learning_rate": 4.978938662956899e-05, "loss": 0.5344, "step": 7570 }, { "epoch": 0.6865144100054378, "grad_norm": 0.7902665138244629, "learning_rate": 4.9788360950976693e-05, "loss": 0.5441, "step": 7575 }, { "epoch": 0.6869675548305239, "grad_norm": 0.8273674249649048, "learning_rate": 4.9787332791556854e-05, "loss": 0.521, "step": 7580 }, { "epoch": 0.6874206996556099, "grad_norm": 0.8916956782341003, "learning_rate": 4.9786302151412364e-05, "loss": 0.5518, "step": 7585 }, { "epoch": 0.687873844480696, "grad_norm": 0.9259384274482727, "learning_rate": 4.978526903064638e-05, "loss": 0.5767, "step": 7590 }, { "epoch": 0.6883269893057822, "grad_norm": 0.8779903054237366, "learning_rate": 4.9784233429362296e-05, "loss": 0.6178, "step": 7595 }, { "epoch": 0.6887801341308682, "grad_norm": 0.9286255836486816, "learning_rate": 4.9783195347663746e-05, "loss": 0.5635, "step": 7600 }, { "epoch": 0.6892332789559543, "grad_norm": 0.9129353761672974, "learning_rate": 4.9782154785654636e-05, "loss": 0.4995, "step": 7605 }, { "epoch": 0.6896864237810404, "grad_norm": 0.8289165496826172, "learning_rate": 4.978111174343908e-05, "loss": 0.5637, "step": 7610 }, { "epoch": 0.6901395686061265, "grad_norm": 0.8590943217277527, "learning_rate": 4.97800662211215e-05, "loss": 0.4978, "step": 7615 }, { "epoch": 0.6905927134312126, "grad_norm": 0.8474560379981995, "learning_rate": 4.977901821880651e-05, "loss": 0.5501, "step": 7620 }, { "epoch": 0.6910458582562987, "grad_norm": 0.8773858547210693, "learning_rate": 4.9777967736598994e-05, "loss": 0.5497, "step": 7625 }, { "epoch": 0.6914990030813848, "grad_norm": 0.8400627374649048, "learning_rate": 4.97769147746041e-05, "loss": 0.5307, "step": 7630 }, { "epoch": 0.6919521479064709, "grad_norm": 0.8624570965766907, "learning_rate": 4.97758593329272e-05, "loss": 0.5234, "step": 7635 }, { "epoch": 0.692405292731557, "grad_norm": 0.8376266360282898, "learning_rate": 4.977480141167392e-05, "loss": 0.5381, "step": 7640 }, { "epoch": 0.6928584375566431, "grad_norm": 0.9270439147949219, "learning_rate": 4.977374101095014e-05, "loss": 0.541, "step": 7645 }, { "epoch": 0.6933115823817292, "grad_norm": 0.830281674861908, "learning_rate": 4.977267813086198e-05, "loss": 0.4886, "step": 7650 }, { "epoch": 0.6937647272068153, "grad_norm": 0.830754280090332, "learning_rate": 4.977161277151582e-05, "loss": 0.4834, "step": 7655 }, { "epoch": 0.6942178720319014, "grad_norm": 0.8735975027084351, "learning_rate": 4.9770544933018285e-05, "loss": 0.5533, "step": 7660 }, { "epoch": 0.6946710168569875, "grad_norm": 0.8793995976448059, "learning_rate": 4.976947461547624e-05, "loss": 0.528, "step": 7665 }, { "epoch": 0.6951241616820736, "grad_norm": 0.7623860239982605, "learning_rate": 4.976840181899679e-05, "loss": 0.5132, "step": 7670 }, { "epoch": 0.6955773065071597, "grad_norm": 0.8828871846199036, "learning_rate": 4.9767326543687326e-05, "loss": 0.5595, "step": 7675 }, { "epoch": 0.6960304513322458, "grad_norm": 0.8367555737495422, "learning_rate": 4.9766248789655446e-05, "loss": 0.5589, "step": 7680 }, { "epoch": 0.6964835961573319, "grad_norm": 0.8051501512527466, "learning_rate": 4.976516855700901e-05, "loss": 0.5002, "step": 7685 }, { "epoch": 0.696936740982418, "grad_norm": 0.8389200568199158, "learning_rate": 4.976408584585613e-05, "loss": 0.5183, "step": 7690 }, { "epoch": 0.697389885807504, "grad_norm": 0.8495132327079773, "learning_rate": 4.976300065630518e-05, "loss": 0.5249, "step": 7695 }, { "epoch": 0.6978430306325901, "grad_norm": 0.8546156883239746, "learning_rate": 4.9761912988464744e-05, "loss": 0.5383, "step": 7700 }, { "epoch": 0.6982961754576763, "grad_norm": 0.8183867931365967, "learning_rate": 4.976082284244369e-05, "loss": 0.5471, "step": 7705 }, { "epoch": 0.6987493202827624, "grad_norm": 0.8336789011955261, "learning_rate": 4.975973021835111e-05, "loss": 0.5716, "step": 7710 }, { "epoch": 0.6992024651078484, "grad_norm": 0.9625382423400879, "learning_rate": 4.975863511629637e-05, "loss": 0.497, "step": 7715 }, { "epoch": 0.6996556099329345, "grad_norm": 0.8011309504508972, "learning_rate": 4.975753753638904e-05, "loss": 0.5291, "step": 7720 }, { "epoch": 0.7001087547580207, "grad_norm": 1.0297538042068481, "learning_rate": 4.9756437478739e-05, "loss": 0.4908, "step": 7725 }, { "epoch": 0.7005618995831068, "grad_norm": 0.9109827876091003, "learning_rate": 4.975533494345632e-05, "loss": 0.5303, "step": 7730 }, { "epoch": 0.7010150444081928, "grad_norm": 0.8551725745201111, "learning_rate": 4.9754229930651353e-05, "loss": 0.5279, "step": 7735 }, { "epoch": 0.7014681892332789, "grad_norm": 0.914047122001648, "learning_rate": 4.9753122440434686e-05, "loss": 0.5357, "step": 7740 }, { "epoch": 0.7019213340583651, "grad_norm": 0.8863340020179749, "learning_rate": 4.975201247291716e-05, "loss": 0.4701, "step": 7745 }, { "epoch": 0.7023744788834512, "grad_norm": 0.8372417092323303, "learning_rate": 4.9750900028209855e-05, "loss": 0.5705, "step": 7750 }, { "epoch": 0.7028276237085372, "grad_norm": 0.756743848323822, "learning_rate": 4.974978510642411e-05, "loss": 0.5035, "step": 7755 }, { "epoch": 0.7032807685336233, "grad_norm": 0.8581302762031555, "learning_rate": 4.97486677076715e-05, "loss": 0.496, "step": 7760 }, { "epoch": 0.7037339133587095, "grad_norm": 0.7791547775268555, "learning_rate": 4.974754783206387e-05, "loss": 0.5818, "step": 7765 }, { "epoch": 0.7041870581837956, "grad_norm": 0.9035072922706604, "learning_rate": 4.974642547971328e-05, "loss": 0.515, "step": 7770 }, { "epoch": 0.7046402030088816, "grad_norm": 0.9459173083305359, "learning_rate": 4.9745300650732065e-05, "loss": 0.5461, "step": 7775 }, { "epoch": 0.7050933478339677, "grad_norm": 0.8764398097991943, "learning_rate": 4.974417334523279e-05, "loss": 0.5591, "step": 7780 }, { "epoch": 0.7055464926590538, "grad_norm": 0.9404100179672241, "learning_rate": 4.9743043563328286e-05, "loss": 0.6037, "step": 7785 }, { "epoch": 0.70599963748414, "grad_norm": 0.8161552548408508, "learning_rate": 4.9741911305131614e-05, "loss": 0.5116, "step": 7790 }, { "epoch": 0.706452782309226, "grad_norm": 0.9042869210243225, "learning_rate": 4.97407765707561e-05, "loss": 0.5131, "step": 7795 }, { "epoch": 0.7069059271343121, "grad_norm": 0.8872236609458923, "learning_rate": 4.973963936031531e-05, "loss": 0.5201, "step": 7800 }, { "epoch": 0.7073590719593982, "grad_norm": 0.7998805642127991, "learning_rate": 4.973849967392303e-05, "loss": 0.5002, "step": 7805 }, { "epoch": 0.7078122167844844, "grad_norm": 0.8634191751480103, "learning_rate": 4.9737357511693354e-05, "loss": 0.5201, "step": 7810 }, { "epoch": 0.7082653616095704, "grad_norm": 0.8923665285110474, "learning_rate": 4.973621287374057e-05, "loss": 0.5871, "step": 7815 }, { "epoch": 0.7087185064346565, "grad_norm": 0.7961644530296326, "learning_rate": 4.973506576017925e-05, "loss": 0.5575, "step": 7820 }, { "epoch": 0.7091716512597426, "grad_norm": 0.7715955376625061, "learning_rate": 4.9733916171124174e-05, "loss": 0.5196, "step": 7825 }, { "epoch": 0.7096247960848288, "grad_norm": 0.954255998134613, "learning_rate": 4.973276410669041e-05, "loss": 0.4941, "step": 7830 }, { "epoch": 0.7100779409099148, "grad_norm": 0.8572391271591187, "learning_rate": 4.973160956699325e-05, "loss": 0.5548, "step": 7835 }, { "epoch": 0.7105310857350009, "grad_norm": 0.8598835468292236, "learning_rate": 4.9730452552148255e-05, "loss": 0.5799, "step": 7840 }, { "epoch": 0.710984230560087, "grad_norm": 1.0412951707839966, "learning_rate": 4.97292930622712e-05, "loss": 0.5324, "step": 7845 }, { "epoch": 0.7114373753851732, "grad_norm": 0.8342775106430054, "learning_rate": 4.972813109747814e-05, "loss": 0.5395, "step": 7850 }, { "epoch": 0.7118905202102592, "grad_norm": 0.8473647832870483, "learning_rate": 4.9726966657885354e-05, "loss": 0.5145, "step": 7855 }, { "epoch": 0.7123436650353453, "grad_norm": 0.8617616891860962, "learning_rate": 4.972579974360939e-05, "loss": 0.5059, "step": 7860 }, { "epoch": 0.7127968098604314, "grad_norm": 0.7816449403762817, "learning_rate": 4.9724630354767035e-05, "loss": 0.553, "step": 7865 }, { "epoch": 0.7132499546855174, "grad_norm": 0.8827611804008484, "learning_rate": 4.972345849147531e-05, "loss": 0.539, "step": 7870 }, { "epoch": 0.7137030995106036, "grad_norm": 0.8769249320030212, "learning_rate": 4.972228415385149e-05, "loss": 0.5488, "step": 7875 }, { "epoch": 0.7141562443356897, "grad_norm": 0.9290351867675781, "learning_rate": 4.972110734201313e-05, "loss": 0.5335, "step": 7880 }, { "epoch": 0.7146093891607758, "grad_norm": 0.8275288939476013, "learning_rate": 4.971992805607799e-05, "loss": 0.5275, "step": 7885 }, { "epoch": 0.7150625339858618, "grad_norm": 0.8112239241600037, "learning_rate": 4.971874629616409e-05, "loss": 0.4961, "step": 7890 }, { "epoch": 0.715515678810948, "grad_norm": 0.7761585116386414, "learning_rate": 4.97175620623897e-05, "loss": 0.494, "step": 7895 }, { "epoch": 0.7159688236360341, "grad_norm": 0.8695656657218933, "learning_rate": 4.9716375354873346e-05, "loss": 0.5548, "step": 7900 }, { "epoch": 0.7164219684611202, "grad_norm": 0.8039553761482239, "learning_rate": 4.971518617373379e-05, "loss": 0.5379, "step": 7905 }, { "epoch": 0.7168751132862062, "grad_norm": 0.7869178652763367, "learning_rate": 4.971399451909005e-05, "loss": 0.5227, "step": 7910 }, { "epoch": 0.7173282581112924, "grad_norm": 0.980396032333374, "learning_rate": 4.9712800391061386e-05, "loss": 0.5372, "step": 7915 }, { "epoch": 0.7177814029363785, "grad_norm": 0.8161850571632385, "learning_rate": 4.97116037897673e-05, "loss": 0.5109, "step": 7920 }, { "epoch": 0.7182345477614646, "grad_norm": 0.8855746388435364, "learning_rate": 4.971040471532755e-05, "loss": 0.5537, "step": 7925 }, { "epoch": 0.7186876925865506, "grad_norm": 0.8045728802680969, "learning_rate": 4.9709203167862154e-05, "loss": 0.5443, "step": 7930 }, { "epoch": 0.7191408374116368, "grad_norm": 0.9321965575218201, "learning_rate": 4.970799914749134e-05, "loss": 0.5349, "step": 7935 }, { "epoch": 0.7195939822367229, "grad_norm": 0.8378075957298279, "learning_rate": 4.970679265433562e-05, "loss": 0.581, "step": 7940 }, { "epoch": 0.720047127061809, "grad_norm": 0.864031195640564, "learning_rate": 4.9705583688515746e-05, "loss": 0.5811, "step": 7945 }, { "epoch": 0.720500271886895, "grad_norm": 0.8519191741943359, "learning_rate": 4.9704372250152704e-05, "loss": 0.4696, "step": 7950 }, { "epoch": 0.7209534167119811, "grad_norm": 0.8849290609359741, "learning_rate": 4.9703158339367735e-05, "loss": 0.5643, "step": 7955 }, { "epoch": 0.7214065615370673, "grad_norm": 0.905796229839325, "learning_rate": 4.9701941956282327e-05, "loss": 0.4974, "step": 7960 }, { "epoch": 0.7218597063621534, "grad_norm": 0.8540452718734741, "learning_rate": 4.970072310101821e-05, "loss": 0.5411, "step": 7965 }, { "epoch": 0.7223128511872394, "grad_norm": 0.7565069198608398, "learning_rate": 4.969950177369739e-05, "loss": 0.5461, "step": 7970 }, { "epoch": 0.7227659960123255, "grad_norm": 0.837223470211029, "learning_rate": 4.969827797444207e-05, "loss": 0.5333, "step": 7975 }, { "epoch": 0.7232191408374117, "grad_norm": 0.8839082717895508, "learning_rate": 4.9697051703374745e-05, "loss": 0.5971, "step": 7980 }, { "epoch": 0.7236722856624977, "grad_norm": 0.8532863855361938, "learning_rate": 4.969582296061814e-05, "loss": 0.5165, "step": 7985 }, { "epoch": 0.7241254304875838, "grad_norm": 0.9258517622947693, "learning_rate": 4.969459174629523e-05, "loss": 0.5113, "step": 7990 }, { "epoch": 0.7245785753126699, "grad_norm": 0.8743969798088074, "learning_rate": 4.969335806052922e-05, "loss": 0.5009, "step": 7995 }, { "epoch": 0.7250317201377561, "grad_norm": 0.8566511273384094, "learning_rate": 4.969212190344359e-05, "loss": 0.551, "step": 8000 }, { "epoch": 0.7254848649628421, "grad_norm": 0.9295993447303772, "learning_rate": 4.969088327516205e-05, "loss": 0.5276, "step": 8005 }, { "epoch": 0.7259380097879282, "grad_norm": 0.8874592781066895, "learning_rate": 4.9689642175808574e-05, "loss": 0.5349, "step": 8010 }, { "epoch": 0.7263911546130143, "grad_norm": 0.8837982416152954, "learning_rate": 4.968839860550735e-05, "loss": 0.5263, "step": 8015 }, { "epoch": 0.7268442994381005, "grad_norm": 0.8414391279220581, "learning_rate": 4.968715256438285e-05, "loss": 0.4857, "step": 8020 }, { "epoch": 0.7272974442631865, "grad_norm": 0.8411561846733093, "learning_rate": 4.9685904052559786e-05, "loss": 0.5003, "step": 8025 }, { "epoch": 0.7277505890882726, "grad_norm": 0.846743643283844, "learning_rate": 4.968465307016309e-05, "loss": 0.5027, "step": 8030 }, { "epoch": 0.7282037339133587, "grad_norm": 0.9518309831619263, "learning_rate": 4.9683399617317974e-05, "loss": 0.4287, "step": 8035 }, { "epoch": 0.7286568787384448, "grad_norm": 0.8041695952415466, "learning_rate": 4.9682143694149884e-05, "loss": 0.4677, "step": 8040 }, { "epoch": 0.7291100235635309, "grad_norm": 0.8582595586776733, "learning_rate": 4.96808853007845e-05, "loss": 0.512, "step": 8045 }, { "epoch": 0.729563168388617, "grad_norm": 0.8442491292953491, "learning_rate": 4.967962443734778e-05, "loss": 0.5491, "step": 8050 }, { "epoch": 0.7300163132137031, "grad_norm": 0.8829084634780884, "learning_rate": 4.9678361103965897e-05, "loss": 0.5349, "step": 8055 }, { "epoch": 0.7304694580387892, "grad_norm": 0.992107093334198, "learning_rate": 4.96770953007653e-05, "loss": 0.4989, "step": 8060 }, { "epoch": 0.7309226028638753, "grad_norm": 0.8447121381759644, "learning_rate": 4.967582702787266e-05, "loss": 0.5298, "step": 8065 }, { "epoch": 0.7313757476889614, "grad_norm": 0.9114962220191956, "learning_rate": 4.967455628541491e-05, "loss": 0.5481, "step": 8070 }, { "epoch": 0.7318288925140475, "grad_norm": 0.8169251084327698, "learning_rate": 4.967328307351922e-05, "loss": 0.5446, "step": 8075 }, { "epoch": 0.7322820373391336, "grad_norm": 0.8745860457420349, "learning_rate": 4.967200739231302e-05, "loss": 0.5147, "step": 8080 }, { "epoch": 0.7327351821642197, "grad_norm": 0.9586148262023926, "learning_rate": 4.9670729241923985e-05, "loss": 0.6091, "step": 8085 }, { "epoch": 0.7331883269893058, "grad_norm": 0.7546008229255676, "learning_rate": 4.966944862248002e-05, "loss": 0.568, "step": 8090 }, { "epoch": 0.7336414718143919, "grad_norm": 0.8955003023147583, "learning_rate": 4.966816553410931e-05, "loss": 0.5176, "step": 8095 }, { "epoch": 0.734094616639478, "grad_norm": 0.8849169015884399, "learning_rate": 4.966687997694024e-05, "loss": 0.5621, "step": 8100 }, { "epoch": 0.734547761464564, "grad_norm": 0.9254324436187744, "learning_rate": 4.966559195110149e-05, "loss": 0.5414, "step": 8105 }, { "epoch": 0.7350009062896502, "grad_norm": 0.8411292433738708, "learning_rate": 4.966430145672196e-05, "loss": 0.5258, "step": 8110 }, { "epoch": 0.7354540511147363, "grad_norm": 0.9090816974639893, "learning_rate": 4.96630084939308e-05, "loss": 0.515, "step": 8115 }, { "epoch": 0.7359071959398223, "grad_norm": 0.8130317330360413, "learning_rate": 4.9661713062857415e-05, "loss": 0.5535, "step": 8120 }, { "epoch": 0.7363603407649084, "grad_norm": 0.8106669187545776, "learning_rate": 4.966041516363145e-05, "loss": 0.5266, "step": 8125 }, { "epoch": 0.7368134855899946, "grad_norm": 0.8304873108863831, "learning_rate": 4.96591147963828e-05, "loss": 0.5036, "step": 8130 }, { "epoch": 0.7372666304150807, "grad_norm": 0.8079414367675781, "learning_rate": 4.96578119612416e-05, "loss": 0.5539, "step": 8135 }, { "epoch": 0.7377197752401667, "grad_norm": 0.8058449625968933, "learning_rate": 4.965650665833825e-05, "loss": 0.5413, "step": 8140 }, { "epoch": 0.7381729200652528, "grad_norm": 0.7574976682662964, "learning_rate": 4.9655198887803374e-05, "loss": 0.5305, "step": 8145 }, { "epoch": 0.738626064890339, "grad_norm": 0.9189540147781372, "learning_rate": 4.9653888649767854e-05, "loss": 0.5397, "step": 8150 }, { "epoch": 0.7390792097154251, "grad_norm": 0.8642148375511169, "learning_rate": 4.965257594436283e-05, "loss": 0.5386, "step": 8155 }, { "epoch": 0.7395323545405111, "grad_norm": 0.8127231001853943, "learning_rate": 4.965126077171968e-05, "loss": 0.5754, "step": 8160 }, { "epoch": 0.7399854993655972, "grad_norm": 0.862087607383728, "learning_rate": 4.964994313197e-05, "loss": 0.5494, "step": 8165 }, { "epoch": 0.7404386441906834, "grad_norm": 0.9989303350448608, "learning_rate": 4.964862302524569e-05, "loss": 0.5124, "step": 8170 }, { "epoch": 0.7408917890157695, "grad_norm": 0.7793373465538025, "learning_rate": 4.964730045167886e-05, "loss": 0.5358, "step": 8175 }, { "epoch": 0.7413449338408555, "grad_norm": 0.8962302207946777, "learning_rate": 4.964597541140186e-05, "loss": 0.5831, "step": 8180 }, { "epoch": 0.7417980786659416, "grad_norm": 0.8830583095550537, "learning_rate": 4.96446479045473e-05, "loss": 0.5501, "step": 8185 }, { "epoch": 0.7422512234910277, "grad_norm": 0.9637821316719055, "learning_rate": 4.9643317931248066e-05, "loss": 0.6259, "step": 8190 }, { "epoch": 0.7427043683161139, "grad_norm": 0.8800966739654541, "learning_rate": 4.964198549163723e-05, "loss": 0.5623, "step": 8195 }, { "epoch": 0.7431575131411999, "grad_norm": 0.807065486907959, "learning_rate": 4.964065058584817e-05, "loss": 0.5477, "step": 8200 }, { "epoch": 0.743610657966286, "grad_norm": 0.8508424758911133, "learning_rate": 4.963931321401445e-05, "loss": 0.506, "step": 8205 }, { "epoch": 0.7440638027913721, "grad_norm": 0.8123301267623901, "learning_rate": 4.9637973376269946e-05, "loss": 0.5352, "step": 8210 }, { "epoch": 0.7445169476164583, "grad_norm": 0.831058919429779, "learning_rate": 4.9636631072748736e-05, "loss": 0.5168, "step": 8215 }, { "epoch": 0.7449700924415443, "grad_norm": 0.9233681559562683, "learning_rate": 4.963528630358516e-05, "loss": 0.5053, "step": 8220 }, { "epoch": 0.7454232372666304, "grad_norm": 0.9537581205368042, "learning_rate": 4.963393906891381e-05, "loss": 0.5606, "step": 8225 }, { "epoch": 0.7458763820917165, "grad_norm": 0.8025029897689819, "learning_rate": 4.96325893688695e-05, "loss": 0.4896, "step": 8230 }, { "epoch": 0.7463295269168027, "grad_norm": 1.0490643978118896, "learning_rate": 4.963123720358732e-05, "loss": 0.5303, "step": 8235 }, { "epoch": 0.7467826717418887, "grad_norm": 0.9005363583564758, "learning_rate": 4.962988257320259e-05, "loss": 0.5241, "step": 8240 }, { "epoch": 0.7472358165669748, "grad_norm": 0.8612938523292542, "learning_rate": 4.9628525477850896e-05, "loss": 0.5309, "step": 8245 }, { "epoch": 0.7476889613920609, "grad_norm": 0.790421724319458, "learning_rate": 4.962716591766804e-05, "loss": 0.5597, "step": 8250 }, { "epoch": 0.748142106217147, "grad_norm": 0.7976511716842651, "learning_rate": 4.962580389279009e-05, "loss": 0.5538, "step": 8255 }, { "epoch": 0.7485952510422331, "grad_norm": 0.875232458114624, "learning_rate": 4.9624439403353375e-05, "loss": 0.5737, "step": 8260 }, { "epoch": 0.7490483958673192, "grad_norm": 0.9434481263160706, "learning_rate": 4.962307244949443e-05, "loss": 0.595, "step": 8265 }, { "epoch": 0.7495015406924053, "grad_norm": 0.8055761456489563, "learning_rate": 4.962170303135007e-05, "loss": 0.4465, "step": 8270 }, { "epoch": 0.7499546855174913, "grad_norm": 0.8560803532600403, "learning_rate": 4.9620331149057356e-05, "loss": 0.5092, "step": 8275 }, { "epoch": 0.7504078303425775, "grad_norm": 0.9509853720664978, "learning_rate": 4.961895680275357e-05, "loss": 0.5437, "step": 8280 }, { "epoch": 0.7508609751676636, "grad_norm": 0.8844623565673828, "learning_rate": 4.961757999257626e-05, "loss": 0.5607, "step": 8285 }, { "epoch": 0.7513141199927497, "grad_norm": 0.8548365235328674, "learning_rate": 4.961620071866323e-05, "loss": 0.5072, "step": 8290 }, { "epoch": 0.7517672648178357, "grad_norm": 0.8828372955322266, "learning_rate": 4.961481898115251e-05, "loss": 0.542, "step": 8295 }, { "epoch": 0.7522204096429219, "grad_norm": 0.8250105977058411, "learning_rate": 4.961343478018239e-05, "loss": 0.5364, "step": 8300 }, { "epoch": 0.752673554468008, "grad_norm": 0.9041432738304138, "learning_rate": 4.961204811589138e-05, "loss": 0.545, "step": 8305 }, { "epoch": 0.7531266992930941, "grad_norm": 0.8543074131011963, "learning_rate": 4.96106589884183e-05, "loss": 0.5347, "step": 8310 }, { "epoch": 0.7535798441181801, "grad_norm": 0.8434504270553589, "learning_rate": 4.9609267397902125e-05, "loss": 0.5006, "step": 8315 }, { "epoch": 0.7540329889432663, "grad_norm": 0.824393093585968, "learning_rate": 4.960787334448216e-05, "loss": 0.6049, "step": 8320 }, { "epoch": 0.7544861337683524, "grad_norm": 1.0090327262878418, "learning_rate": 4.9606476828297915e-05, "loss": 0.545, "step": 8325 }, { "epoch": 0.7549392785934385, "grad_norm": 0.774476945400238, "learning_rate": 4.960507784948915e-05, "loss": 0.4936, "step": 8330 }, { "epoch": 0.7553924234185245, "grad_norm": 0.930637001991272, "learning_rate": 4.960367640819587e-05, "loss": 0.5915, "step": 8335 }, { "epoch": 0.7558455682436107, "grad_norm": 0.8475789427757263, "learning_rate": 4.9602272504558344e-05, "loss": 0.5446, "step": 8340 }, { "epoch": 0.7562987130686968, "grad_norm": 0.9202598333358765, "learning_rate": 4.960086613871706e-05, "loss": 0.5429, "step": 8345 }, { "epoch": 0.7567518578937829, "grad_norm": 0.9212363958358765, "learning_rate": 4.9599457310812784e-05, "loss": 0.5726, "step": 8350 }, { "epoch": 0.7572050027188689, "grad_norm": 0.8868643641471863, "learning_rate": 4.95980460209865e-05, "loss": 0.5711, "step": 8355 }, { "epoch": 0.757658147543955, "grad_norm": 0.882646381855011, "learning_rate": 4.959663226937946e-05, "loss": 0.5458, "step": 8360 }, { "epoch": 0.7581112923690412, "grad_norm": 0.8536713719367981, "learning_rate": 4.959521605613315e-05, "loss": 0.5588, "step": 8365 }, { "epoch": 0.7585644371941273, "grad_norm": 0.8826894164085388, "learning_rate": 4.9593797381389295e-05, "loss": 0.467, "step": 8370 }, { "epoch": 0.7590175820192133, "grad_norm": 0.7876854538917542, "learning_rate": 4.959237624528988e-05, "loss": 0.5268, "step": 8375 }, { "epoch": 0.7594707268442994, "grad_norm": 0.7925274968147278, "learning_rate": 4.959095264797715e-05, "loss": 0.5302, "step": 8380 }, { "epoch": 0.7599238716693856, "grad_norm": 0.8320027589797974, "learning_rate": 4.958952658959356e-05, "loss": 0.4549, "step": 8385 }, { "epoch": 0.7603770164944716, "grad_norm": 0.7916707396507263, "learning_rate": 4.958809807028184e-05, "loss": 0.5164, "step": 8390 }, { "epoch": 0.7608301613195577, "grad_norm": 1.1235424280166626, "learning_rate": 4.958666709018495e-05, "loss": 0.5065, "step": 8395 }, { "epoch": 0.7612833061446438, "grad_norm": 0.8274693489074707, "learning_rate": 4.9585233649446105e-05, "loss": 0.5105, "step": 8400 }, { "epoch": 0.76173645096973, "grad_norm": 0.8179918527603149, "learning_rate": 4.958379774820876e-05, "loss": 0.5199, "step": 8405 }, { "epoch": 0.762189595794816, "grad_norm": 0.8861761689186096, "learning_rate": 4.958235938661663e-05, "loss": 0.5291, "step": 8410 }, { "epoch": 0.7626427406199021, "grad_norm": 0.8456570506095886, "learning_rate": 4.958091856481367e-05, "loss": 0.5065, "step": 8415 }, { "epoch": 0.7630958854449882, "grad_norm": 0.942874550819397, "learning_rate": 4.9579475282944065e-05, "loss": 0.5485, "step": 8420 }, { "epoch": 0.7635490302700744, "grad_norm": 0.8712714910507202, "learning_rate": 4.957802954115226e-05, "loss": 0.5762, "step": 8425 }, { "epoch": 0.7640021750951604, "grad_norm": 0.9191599488258362, "learning_rate": 4.957658133958295e-05, "loss": 0.5455, "step": 8430 }, { "epoch": 0.7644553199202465, "grad_norm": 0.8521578311920166, "learning_rate": 4.957513067838108e-05, "loss": 0.5479, "step": 8435 }, { "epoch": 0.7649084647453326, "grad_norm": 0.8745684623718262, "learning_rate": 4.9573677557691825e-05, "loss": 0.5549, "step": 8440 }, { "epoch": 0.7653616095704187, "grad_norm": 0.9694643616676331, "learning_rate": 4.9572221977660603e-05, "loss": 0.5668, "step": 8445 }, { "epoch": 0.7658147543955048, "grad_norm": 0.964215099811554, "learning_rate": 4.957076393843311e-05, "loss": 0.5323, "step": 8450 }, { "epoch": 0.7662678992205909, "grad_norm": 0.8520615100860596, "learning_rate": 4.9569303440155245e-05, "loss": 0.5483, "step": 8455 }, { "epoch": 0.766721044045677, "grad_norm": 0.7835680246353149, "learning_rate": 4.9567840482973185e-05, "loss": 0.5654, "step": 8460 }, { "epoch": 0.767174188870763, "grad_norm": 0.8880181312561035, "learning_rate": 4.956637506703335e-05, "loss": 0.5162, "step": 8465 }, { "epoch": 0.7676273336958492, "grad_norm": 0.893972635269165, "learning_rate": 4.95649071924824e-05, "loss": 0.5079, "step": 8470 }, { "epoch": 0.7680804785209353, "grad_norm": 0.8670584559440613, "learning_rate": 4.956343685946723e-05, "loss": 0.5466, "step": 8475 }, { "epoch": 0.7685336233460214, "grad_norm": 0.819282591342926, "learning_rate": 4.9561964068134984e-05, "loss": 0.522, "step": 8480 }, { "epoch": 0.7689867681711074, "grad_norm": 0.8770890235900879, "learning_rate": 4.9560488818633086e-05, "loss": 0.4719, "step": 8485 }, { "epoch": 0.7694399129961936, "grad_norm": 0.8794461488723755, "learning_rate": 4.9559011111109155e-05, "loss": 0.5106, "step": 8490 }, { "epoch": 0.7698930578212797, "grad_norm": 0.8694079518318176, "learning_rate": 4.9557530945711096e-05, "loss": 0.4701, "step": 8495 }, { "epoch": 0.7703462026463658, "grad_norm": 0.8235492706298828, "learning_rate": 4.9556048322587025e-05, "loss": 0.5079, "step": 8500 }, { "epoch": 0.7707993474714518, "grad_norm": 0.8837901949882507, "learning_rate": 4.9554563241885356e-05, "loss": 0.5959, "step": 8505 }, { "epoch": 0.771252492296538, "grad_norm": 0.8400229215621948, "learning_rate": 4.955307570375468e-05, "loss": 0.5661, "step": 8510 }, { "epoch": 0.7717056371216241, "grad_norm": 0.8738158345222473, "learning_rate": 4.955158570834389e-05, "loss": 0.4789, "step": 8515 }, { "epoch": 0.7721587819467102, "grad_norm": 0.9011485576629639, "learning_rate": 4.955009325580211e-05, "loss": 0.5234, "step": 8520 }, { "epoch": 0.7726119267717962, "grad_norm": 0.9313690662384033, "learning_rate": 4.95485983462787e-05, "loss": 0.5424, "step": 8525 }, { "epoch": 0.7730650715968823, "grad_norm": 0.846605122089386, "learning_rate": 4.954710097992327e-05, "loss": 0.5341, "step": 8530 }, { "epoch": 0.7735182164219685, "grad_norm": 0.8677839636802673, "learning_rate": 4.9545601156885665e-05, "loss": 0.5144, "step": 8535 }, { "epoch": 0.7739713612470546, "grad_norm": 0.8349720239639282, "learning_rate": 4.9544098877316e-05, "loss": 0.5258, "step": 8540 }, { "epoch": 0.7744245060721406, "grad_norm": 0.7498482465744019, "learning_rate": 4.954259414136463e-05, "loss": 0.5161, "step": 8545 }, { "epoch": 0.7748776508972267, "grad_norm": 0.9704023599624634, "learning_rate": 4.954108694918213e-05, "loss": 0.499, "step": 8550 }, { "epoch": 0.7753307957223129, "grad_norm": 0.7899861931800842, "learning_rate": 4.9539577300919364e-05, "loss": 0.4839, "step": 8555 }, { "epoch": 0.775783940547399, "grad_norm": 0.8610770106315613, "learning_rate": 4.9538065196727404e-05, "loss": 0.4735, "step": 8560 }, { "epoch": 0.776237085372485, "grad_norm": 0.8037115931510925, "learning_rate": 4.9536550636757587e-05, "loss": 0.5233, "step": 8565 }, { "epoch": 0.7766902301975711, "grad_norm": 1.3418909311294556, "learning_rate": 4.953503362116148e-05, "loss": 0.5133, "step": 8570 }, { "epoch": 0.7771433750226573, "grad_norm": 0.8323001861572266, "learning_rate": 4.953351415009091e-05, "loss": 0.5382, "step": 8575 }, { "epoch": 0.7775965198477434, "grad_norm": 0.799590528011322, "learning_rate": 4.9531992223697957e-05, "loss": 0.553, "step": 8580 }, { "epoch": 0.7780496646728294, "grad_norm": 0.8661311268806458, "learning_rate": 4.9530467842134934e-05, "loss": 0.4868, "step": 8585 }, { "epoch": 0.7785028094979155, "grad_norm": 0.7976498007774353, "learning_rate": 4.9528941005554386e-05, "loss": 0.5192, "step": 8590 }, { "epoch": 0.7789559543230016, "grad_norm": 1.1042238473892212, "learning_rate": 4.952741171410914e-05, "loss": 0.5072, "step": 8595 }, { "epoch": 0.7794090991480878, "grad_norm": 0.8510773777961731, "learning_rate": 4.9525879967952236e-05, "loss": 0.5786, "step": 8600 }, { "epoch": 0.7798622439731738, "grad_norm": 0.8245643973350525, "learning_rate": 4.952434576723697e-05, "loss": 0.5334, "step": 8605 }, { "epoch": 0.7803153887982599, "grad_norm": 0.8323460817337036, "learning_rate": 4.952280911211688e-05, "loss": 0.4803, "step": 8610 }, { "epoch": 0.780768533623346, "grad_norm": 0.8146399259567261, "learning_rate": 4.9521270002745784e-05, "loss": 0.4947, "step": 8615 }, { "epoch": 0.7812216784484322, "grad_norm": 0.9076191782951355, "learning_rate": 4.951972843927769e-05, "loss": 0.4549, "step": 8620 }, { "epoch": 0.7816748232735182, "grad_norm": 0.780083954334259, "learning_rate": 4.951818442186688e-05, "loss": 0.5183, "step": 8625 }, { "epoch": 0.7821279680986043, "grad_norm": 1.0922757387161255, "learning_rate": 4.951663795066789e-05, "loss": 0.5109, "step": 8630 }, { "epoch": 0.7825811129236904, "grad_norm": 0.8137921690940857, "learning_rate": 4.951508902583548e-05, "loss": 0.5046, "step": 8635 }, { "epoch": 0.7830342577487766, "grad_norm": 0.7788081169128418, "learning_rate": 4.951353764752468e-05, "loss": 0.4913, "step": 8640 }, { "epoch": 0.7834874025738626, "grad_norm": 0.8292219042778015, "learning_rate": 4.951198381589074e-05, "loss": 0.4829, "step": 8645 }, { "epoch": 0.7839405473989487, "grad_norm": 0.8182724714279175, "learning_rate": 4.951042753108918e-05, "loss": 0.5195, "step": 8650 }, { "epoch": 0.7843936922240348, "grad_norm": 0.8620957136154175, "learning_rate": 4.950886879327574e-05, "loss": 0.4863, "step": 8655 }, { "epoch": 0.784846837049121, "grad_norm": 0.8774337768554688, "learning_rate": 4.9507307602606424e-05, "loss": 0.5133, "step": 8660 }, { "epoch": 0.785299981874207, "grad_norm": 0.8785188794136047, "learning_rate": 4.9505743959237484e-05, "loss": 0.5793, "step": 8665 }, { "epoch": 0.7857531266992931, "grad_norm": 0.8777759075164795, "learning_rate": 4.95041778633254e-05, "loss": 0.5111, "step": 8670 }, { "epoch": 0.7862062715243792, "grad_norm": 0.896644651889801, "learning_rate": 4.9502609315026916e-05, "loss": 0.5257, "step": 8675 }, { "epoch": 0.7866594163494652, "grad_norm": 0.7867894768714905, "learning_rate": 4.9501038314499007e-05, "loss": 0.5207, "step": 8680 }, { "epoch": 0.7871125611745514, "grad_norm": 0.8629199862480164, "learning_rate": 4.94994648618989e-05, "loss": 0.4957, "step": 8685 }, { "epoch": 0.7875657059996375, "grad_norm": 0.9972370266914368, "learning_rate": 4.949788895738407e-05, "loss": 0.5084, "step": 8690 }, { "epoch": 0.7880188508247236, "grad_norm": 0.8132585287094116, "learning_rate": 4.949631060111223e-05, "loss": 0.5375, "step": 8695 }, { "epoch": 0.7884719956498096, "grad_norm": 0.9183336496353149, "learning_rate": 4.9494729793241346e-05, "loss": 0.4962, "step": 8700 }, { "epoch": 0.7889251404748958, "grad_norm": 0.8888422846794128, "learning_rate": 4.9493146533929625e-05, "loss": 0.4583, "step": 8705 }, { "epoch": 0.7893782852999819, "grad_norm": 0.8431023359298706, "learning_rate": 4.9491560823335516e-05, "loss": 0.5429, "step": 8710 }, { "epoch": 0.789831430125068, "grad_norm": 0.8581605553627014, "learning_rate": 4.948997266161772e-05, "loss": 0.4846, "step": 8715 }, { "epoch": 0.790284574950154, "grad_norm": 0.764604389667511, "learning_rate": 4.9488382048935177e-05, "loss": 0.5133, "step": 8720 }, { "epoch": 0.7907377197752402, "grad_norm": 0.8544548153877258, "learning_rate": 4.9486788985447085e-05, "loss": 0.49, "step": 8725 }, { "epoch": 0.7911908646003263, "grad_norm": 0.8586869239807129, "learning_rate": 4.9485193471312875e-05, "loss": 0.5962, "step": 8730 }, { "epoch": 0.7916440094254124, "grad_norm": 0.7741138339042664, "learning_rate": 4.9483595506692224e-05, "loss": 0.5264, "step": 8735 }, { "epoch": 0.7920971542504984, "grad_norm": 0.8118835687637329, "learning_rate": 4.948199509174506e-05, "loss": 0.5394, "step": 8740 }, { "epoch": 0.7925502990755846, "grad_norm": 0.7739230990409851, "learning_rate": 4.9480392226631545e-05, "loss": 0.4968, "step": 8745 }, { "epoch": 0.7930034439006707, "grad_norm": 0.8436456322669983, "learning_rate": 4.947878691151211e-05, "loss": 0.4516, "step": 8750 }, { "epoch": 0.7934565887257568, "grad_norm": 0.8840537667274475, "learning_rate": 4.947717914654739e-05, "loss": 0.558, "step": 8755 }, { "epoch": 0.7939097335508428, "grad_norm": 0.7717965245246887, "learning_rate": 4.947556893189832e-05, "loss": 0.5363, "step": 8760 }, { "epoch": 0.7943628783759289, "grad_norm": 0.8463494181632996, "learning_rate": 4.9473956267726034e-05, "loss": 0.5137, "step": 8765 }, { "epoch": 0.7948160232010151, "grad_norm": 0.8278518319129944, "learning_rate": 4.947234115419193e-05, "loss": 0.5151, "step": 8770 }, { "epoch": 0.7952691680261011, "grad_norm": 0.8587587475776672, "learning_rate": 4.947072359145766e-05, "loss": 0.5386, "step": 8775 }, { "epoch": 0.7957223128511872, "grad_norm": 0.7893475890159607, "learning_rate": 4.946910357968508e-05, "loss": 0.4647, "step": 8780 }, { "epoch": 0.7961754576762733, "grad_norm": 0.8578081727027893, "learning_rate": 4.946748111903636e-05, "loss": 0.4927, "step": 8785 }, { "epoch": 0.7966286025013595, "grad_norm": 0.9062371253967285, "learning_rate": 4.9465856209673855e-05, "loss": 0.5533, "step": 8790 }, { "epoch": 0.7970817473264455, "grad_norm": 0.9160646200180054, "learning_rate": 4.946422885176019e-05, "loss": 0.5098, "step": 8795 }, { "epoch": 0.7975348921515316, "grad_norm": 0.8768150806427002, "learning_rate": 4.9462599045458226e-05, "loss": 0.5009, "step": 8800 }, { "epoch": 0.7979880369766177, "grad_norm": 0.9328088760375977, "learning_rate": 4.946096679093108e-05, "loss": 0.543, "step": 8805 }, { "epoch": 0.7984411818017039, "grad_norm": 0.8809916973114014, "learning_rate": 4.945933208834211e-05, "loss": 0.5128, "step": 8810 }, { "epoch": 0.7988943266267899, "grad_norm": 0.8559165596961975, "learning_rate": 4.945769493785491e-05, "loss": 0.5033, "step": 8815 }, { "epoch": 0.799347471451876, "grad_norm": 0.8785814046859741, "learning_rate": 4.945605533963334e-05, "loss": 0.5371, "step": 8820 }, { "epoch": 0.7998006162769621, "grad_norm": 0.9341546297073364, "learning_rate": 4.94544132938415e-05, "loss": 0.501, "step": 8825 }, { "epoch": 0.8002537611020483, "grad_norm": 0.8224701285362244, "learning_rate": 4.945276880064369e-05, "loss": 0.4946, "step": 8830 }, { "epoch": 0.8007069059271343, "grad_norm": 0.7934675812721252, "learning_rate": 4.9451121860204515e-05, "loss": 0.5322, "step": 8835 }, { "epoch": 0.8011600507522204, "grad_norm": 0.8405020833015442, "learning_rate": 4.9449472472688794e-05, "loss": 0.4882, "step": 8840 }, { "epoch": 0.8016131955773065, "grad_norm": 0.7944738268852234, "learning_rate": 4.944782063826161e-05, "loss": 0.5025, "step": 8845 }, { "epoch": 0.8020663404023926, "grad_norm": 0.8250653147697449, "learning_rate": 4.9446166357088275e-05, "loss": 0.5043, "step": 8850 }, { "epoch": 0.8025194852274787, "grad_norm": 0.8939177393913269, "learning_rate": 4.944450962933435e-05, "loss": 0.5062, "step": 8855 }, { "epoch": 0.8029726300525648, "grad_norm": 0.84300297498703, "learning_rate": 4.9442850455165626e-05, "loss": 0.4556, "step": 8860 }, { "epoch": 0.8034257748776509, "grad_norm": 0.7546675205230713, "learning_rate": 4.944118883474817e-05, "loss": 0.477, "step": 8865 }, { "epoch": 0.803878919702737, "grad_norm": 1.0608553886413574, "learning_rate": 4.943952476824827e-05, "loss": 0.52, "step": 8870 }, { "epoch": 0.8043320645278231, "grad_norm": 0.986639142036438, "learning_rate": 4.943785825583247e-05, "loss": 0.5458, "step": 8875 }, { "epoch": 0.8047852093529092, "grad_norm": 0.836441695690155, "learning_rate": 4.943618929766756e-05, "loss": 0.5307, "step": 8880 }, { "epoch": 0.8052383541779953, "grad_norm": 0.8917811512947083, "learning_rate": 4.943451789392056e-05, "loss": 0.5406, "step": 8885 }, { "epoch": 0.8056914990030813, "grad_norm": 0.8580133318901062, "learning_rate": 4.943284404475875e-05, "loss": 0.5128, "step": 8890 }, { "epoch": 0.8061446438281675, "grad_norm": 0.9079362154006958, "learning_rate": 4.943116775034966e-05, "loss": 0.5375, "step": 8895 }, { "epoch": 0.8065977886532536, "grad_norm": 0.7908406257629395, "learning_rate": 4.942948901086103e-05, "loss": 0.4902, "step": 8900 }, { "epoch": 0.8070509334783397, "grad_norm": 0.9029373526573181, "learning_rate": 4.942780782646088e-05, "loss": 0.5387, "step": 8905 }, { "epoch": 0.8075040783034257, "grad_norm": 0.8428201079368591, "learning_rate": 4.942612419731747e-05, "loss": 0.5067, "step": 8910 }, { "epoch": 0.8079572231285119, "grad_norm": 0.8759452700614929, "learning_rate": 4.942443812359929e-05, "loss": 0.5138, "step": 8915 }, { "epoch": 0.808410367953598, "grad_norm": 0.8549659848213196, "learning_rate": 4.942274960547508e-05, "loss": 0.5083, "step": 8920 }, { "epoch": 0.8088635127786841, "grad_norm": 0.8198952674865723, "learning_rate": 4.942105864311385e-05, "loss": 0.4699, "step": 8925 }, { "epoch": 0.8093166576037701, "grad_norm": 0.8221438527107239, "learning_rate": 4.94193652366848e-05, "loss": 0.4791, "step": 8930 }, { "epoch": 0.8097698024288562, "grad_norm": 0.873609721660614, "learning_rate": 4.941766938635744e-05, "loss": 0.4951, "step": 8935 }, { "epoch": 0.8102229472539424, "grad_norm": 0.792964518070221, "learning_rate": 4.941597109230145e-05, "loss": 0.4807, "step": 8940 }, { "epoch": 0.8106760920790285, "grad_norm": 0.9364765882492065, "learning_rate": 4.941427035468683e-05, "loss": 0.529, "step": 8945 }, { "epoch": 0.8111292369041145, "grad_norm": 0.8903587460517883, "learning_rate": 4.9412567173683786e-05, "loss": 0.5397, "step": 8950 }, { "epoch": 0.8115823817292006, "grad_norm": 0.8264960050582886, "learning_rate": 4.941086154946276e-05, "loss": 0.4853, "step": 8955 }, { "epoch": 0.8120355265542868, "grad_norm": 0.8628368377685547, "learning_rate": 4.940915348219446e-05, "loss": 0.4766, "step": 8960 }, { "epoch": 0.8124886713793729, "grad_norm": 0.8610336184501648, "learning_rate": 4.9407442972049825e-05, "loss": 0.5284, "step": 8965 }, { "epoch": 0.8129418162044589, "grad_norm": 0.8773778676986694, "learning_rate": 4.9405730019200044e-05, "loss": 0.4788, "step": 8970 }, { "epoch": 0.813394961029545, "grad_norm": 0.8605122566223145, "learning_rate": 4.9404014623816565e-05, "loss": 0.4468, "step": 8975 }, { "epoch": 0.8138481058546312, "grad_norm": 0.9718349575996399, "learning_rate": 4.9402296786071046e-05, "loss": 0.5221, "step": 8980 }, { "epoch": 0.8143012506797173, "grad_norm": 0.783787727355957, "learning_rate": 4.940057650613541e-05, "loss": 0.4917, "step": 8985 }, { "epoch": 0.8147543955048033, "grad_norm": 1.0435853004455566, "learning_rate": 4.939885378418183e-05, "loss": 0.5225, "step": 8990 }, { "epoch": 0.8152075403298894, "grad_norm": 0.9150357246398926, "learning_rate": 4.939712862038272e-05, "loss": 0.5192, "step": 8995 }, { "epoch": 0.8156606851549756, "grad_norm": 0.8357282280921936, "learning_rate": 4.939540101491072e-05, "loss": 0.5142, "step": 9000 }, { "epoch": 0.8161138299800617, "grad_norm": 0.8742620348930359, "learning_rate": 4.9393670967938754e-05, "loss": 0.5349, "step": 9005 }, { "epoch": 0.8165669748051477, "grad_norm": 0.9042479991912842, "learning_rate": 4.939193847963994e-05, "loss": 0.4881, "step": 9010 }, { "epoch": 0.8170201196302338, "grad_norm": 0.8478610515594482, "learning_rate": 4.9390203550187684e-05, "loss": 0.5334, "step": 9015 }, { "epoch": 0.8174732644553199, "grad_norm": 0.8159575462341309, "learning_rate": 4.938846617975561e-05, "loss": 0.5222, "step": 9020 }, { "epoch": 0.817926409280406, "grad_norm": 0.8768303394317627, "learning_rate": 4.93867263685176e-05, "loss": 0.4767, "step": 9025 }, { "epoch": 0.8183795541054921, "grad_norm": 0.8353102803230286, "learning_rate": 4.9384984116647765e-05, "loss": 0.5099, "step": 9030 }, { "epoch": 0.8188326989305782, "grad_norm": 0.7804176211357117, "learning_rate": 4.938323942432048e-05, "loss": 0.4952, "step": 9035 }, { "epoch": 0.8192858437556643, "grad_norm": 0.8231666684150696, "learning_rate": 4.938149229171035e-05, "loss": 0.4835, "step": 9040 }, { "epoch": 0.8197389885807504, "grad_norm": 0.8514224886894226, "learning_rate": 4.937974271899223e-05, "loss": 0.4938, "step": 9045 }, { "epoch": 0.8201921334058365, "grad_norm": 0.845439612865448, "learning_rate": 4.937799070634121e-05, "loss": 0.4871, "step": 9050 }, { "epoch": 0.8206452782309226, "grad_norm": 0.8601774573326111, "learning_rate": 4.9376236253932646e-05, "loss": 0.4932, "step": 9055 }, { "epoch": 0.8210984230560087, "grad_norm": 0.7799217104911804, "learning_rate": 4.9374479361942114e-05, "loss": 0.4925, "step": 9060 }, { "epoch": 0.8215515678810948, "grad_norm": 0.8471131920814514, "learning_rate": 4.937272003054545e-05, "loss": 0.4923, "step": 9065 }, { "epoch": 0.8220047127061809, "grad_norm": 0.8406532406806946, "learning_rate": 4.937095825991872e-05, "loss": 0.5098, "step": 9070 }, { "epoch": 0.822457857531267, "grad_norm": 0.9166792631149292, "learning_rate": 4.9369194050238254e-05, "loss": 0.5034, "step": 9075 }, { "epoch": 0.8229110023563531, "grad_norm": 0.8713884949684143, "learning_rate": 4.936742740168061e-05, "loss": 0.5454, "step": 9080 }, { "epoch": 0.8233641471814392, "grad_norm": 0.8342157602310181, "learning_rate": 4.93656583144226e-05, "loss": 0.4877, "step": 9085 }, { "epoch": 0.8238172920065253, "grad_norm": 0.8216428160667419, "learning_rate": 4.936388678864126e-05, "loss": 0.5142, "step": 9090 }, { "epoch": 0.8242704368316114, "grad_norm": 0.8523399829864502, "learning_rate": 4.93621128245139e-05, "loss": 0.4988, "step": 9095 }, { "epoch": 0.8247235816566975, "grad_norm": 0.7696032524108887, "learning_rate": 4.936033642221805e-05, "loss": 0.438, "step": 9100 }, { "epoch": 0.8251767264817835, "grad_norm": 0.8295176029205322, "learning_rate": 4.935855758193149e-05, "loss": 0.47, "step": 9105 }, { "epoch": 0.8256298713068697, "grad_norm": 0.8185020685195923, "learning_rate": 4.9356776303832266e-05, "loss": 0.5265, "step": 9110 }, { "epoch": 0.8260830161319558, "grad_norm": 0.8378824591636658, "learning_rate": 4.935499258809862e-05, "loss": 0.5187, "step": 9115 }, { "epoch": 0.8265361609570419, "grad_norm": 0.8950664401054382, "learning_rate": 4.935320643490909e-05, "loss": 0.491, "step": 9120 }, { "epoch": 0.8269893057821279, "grad_norm": 0.8816161751747131, "learning_rate": 4.9351417844442425e-05, "loss": 0.4832, "step": 9125 }, { "epoch": 0.8274424506072141, "grad_norm": 0.851898193359375, "learning_rate": 4.9349626816877636e-05, "loss": 0.5826, "step": 9130 }, { "epoch": 0.8278955954323002, "grad_norm": 0.9158554673194885, "learning_rate": 4.934783335239396e-05, "loss": 0.5086, "step": 9135 }, { "epoch": 0.8283487402573863, "grad_norm": 0.8508226275444031, "learning_rate": 4.934603745117089e-05, "loss": 0.5091, "step": 9140 }, { "epoch": 0.8288018850824723, "grad_norm": 0.8803194165229797, "learning_rate": 4.934423911338817e-05, "loss": 0.5236, "step": 9145 }, { "epoch": 0.8292550299075585, "grad_norm": 0.7688875794410706, "learning_rate": 4.934243833922576e-05, "loss": 0.4923, "step": 9150 }, { "epoch": 0.8297081747326446, "grad_norm": 0.8089540600776672, "learning_rate": 4.93406351288639e-05, "loss": 0.4767, "step": 9155 }, { "epoch": 0.8301613195577306, "grad_norm": 0.8597229719161987, "learning_rate": 4.933882948248304e-05, "loss": 0.5298, "step": 9160 }, { "epoch": 0.8306144643828167, "grad_norm": 0.8330439925193787, "learning_rate": 4.9337021400263904e-05, "loss": 0.5119, "step": 9165 }, { "epoch": 0.8310676092079028, "grad_norm": 0.9068859219551086, "learning_rate": 4.933521088238744e-05, "loss": 0.5049, "step": 9170 }, { "epoch": 0.831520754032989, "grad_norm": 0.8117282390594482, "learning_rate": 4.933339792903483e-05, "loss": 0.5099, "step": 9175 }, { "epoch": 0.831973898858075, "grad_norm": 0.9052037000656128, "learning_rate": 4.933158254038754e-05, "loss": 0.4749, "step": 9180 }, { "epoch": 0.8324270436831611, "grad_norm": 0.9249728322029114, "learning_rate": 4.932976471662725e-05, "loss": 0.4741, "step": 9185 }, { "epoch": 0.8328801885082472, "grad_norm": 0.9686796069145203, "learning_rate": 4.9327944457935874e-05, "loss": 0.4732, "step": 9190 }, { "epoch": 0.8333333333333334, "grad_norm": 1.1313916444778442, "learning_rate": 4.9326121764495596e-05, "loss": 0.4986, "step": 9195 }, { "epoch": 0.8337864781584194, "grad_norm": 0.8197826743125916, "learning_rate": 4.9324296636488834e-05, "loss": 0.5081, "step": 9200 }, { "epoch": 0.8342396229835055, "grad_norm": 0.7703066468238831, "learning_rate": 4.9322469074098235e-05, "loss": 0.5493, "step": 9205 }, { "epoch": 0.8346927678085916, "grad_norm": 0.9050767421722412, "learning_rate": 4.93206390775067e-05, "loss": 0.5284, "step": 9210 }, { "epoch": 0.8351459126336778, "grad_norm": 0.8193638920783997, "learning_rate": 4.9318806646897394e-05, "loss": 0.5311, "step": 9215 }, { "epoch": 0.8355990574587638, "grad_norm": 0.9421545267105103, "learning_rate": 4.93169717824537e-05, "loss": 0.5013, "step": 9220 }, { "epoch": 0.8360522022838499, "grad_norm": 0.9001126885414124, "learning_rate": 4.931513448435925e-05, "loss": 0.4887, "step": 9225 }, { "epoch": 0.836505347108936, "grad_norm": 0.8063634037971497, "learning_rate": 4.9313294752797914e-05, "loss": 0.499, "step": 9230 }, { "epoch": 0.8369584919340222, "grad_norm": 0.8355640172958374, "learning_rate": 4.9311452587953825e-05, "loss": 0.5284, "step": 9235 }, { "epoch": 0.8374116367591082, "grad_norm": 0.8325961232185364, "learning_rate": 4.930960799001134e-05, "loss": 0.4905, "step": 9240 }, { "epoch": 0.8378647815841943, "grad_norm": 0.8556712865829468, "learning_rate": 4.930776095915508e-05, "loss": 0.488, "step": 9245 }, { "epoch": 0.8383179264092804, "grad_norm": 0.9721340537071228, "learning_rate": 4.9305911495569875e-05, "loss": 0.5221, "step": 9250 }, { "epoch": 0.8387710712343664, "grad_norm": 0.7795276641845703, "learning_rate": 4.9304059599440835e-05, "loss": 0.5423, "step": 9255 }, { "epoch": 0.8392242160594526, "grad_norm": 0.9102662801742554, "learning_rate": 4.930220527095329e-05, "loss": 0.5245, "step": 9260 }, { "epoch": 0.8396773608845387, "grad_norm": 0.916848361492157, "learning_rate": 4.930034851029283e-05, "loss": 0.5146, "step": 9265 }, { "epoch": 0.8401305057096248, "grad_norm": 0.8739010691642761, "learning_rate": 4.929848931764527e-05, "loss": 0.4891, "step": 9270 }, { "epoch": 0.8405836505347108, "grad_norm": 0.8948177099227905, "learning_rate": 4.929662769319669e-05, "loss": 0.5057, "step": 9275 }, { "epoch": 0.841036795359797, "grad_norm": 0.918620765209198, "learning_rate": 4.92947636371334e-05, "loss": 0.4714, "step": 9280 }, { "epoch": 0.8414899401848831, "grad_norm": 0.8902973532676697, "learning_rate": 4.9292897149641945e-05, "loss": 0.5386, "step": 9285 }, { "epoch": 0.8419430850099692, "grad_norm": 0.9036000370979309, "learning_rate": 4.929102823090914e-05, "loss": 0.5007, "step": 9290 }, { "epoch": 0.8423962298350552, "grad_norm": 0.8957160711288452, "learning_rate": 4.9289156881122014e-05, "loss": 0.4976, "step": 9295 }, { "epoch": 0.8428493746601414, "grad_norm": 0.8212020993232727, "learning_rate": 4.928728310046785e-05, "loss": 0.5363, "step": 9300 }, { "epoch": 0.8433025194852275, "grad_norm": 0.8099269270896912, "learning_rate": 4.928540688913419e-05, "loss": 0.5618, "step": 9305 }, { "epoch": 0.8437556643103136, "grad_norm": 0.9357674717903137, "learning_rate": 4.92835282473088e-05, "loss": 0.4863, "step": 9310 }, { "epoch": 0.8442088091353996, "grad_norm": 0.8348541855812073, "learning_rate": 4.928164717517969e-05, "loss": 0.4854, "step": 9315 }, { "epoch": 0.8446619539604858, "grad_norm": 0.842549741268158, "learning_rate": 4.927976367293512e-05, "loss": 0.4724, "step": 9320 }, { "epoch": 0.8451150987855719, "grad_norm": 0.866948664188385, "learning_rate": 4.92778777407636e-05, "loss": 0.4847, "step": 9325 }, { "epoch": 0.845568243610658, "grad_norm": 0.8411350250244141, "learning_rate": 4.927598937885387e-05, "loss": 0.5371, "step": 9330 }, { "epoch": 0.846021388435744, "grad_norm": 0.8255221247673035, "learning_rate": 4.927409858739491e-05, "loss": 0.5075, "step": 9335 }, { "epoch": 0.8464745332608301, "grad_norm": 1.009853482246399, "learning_rate": 4.9272205366575964e-05, "loss": 0.5642, "step": 9340 }, { "epoch": 0.8469276780859163, "grad_norm": 0.8802311420440674, "learning_rate": 4.927030971658649e-05, "loss": 0.4624, "step": 9345 }, { "epoch": 0.8473808229110024, "grad_norm": 0.8816494941711426, "learning_rate": 4.9268411637616215e-05, "loss": 0.4991, "step": 9350 }, { "epoch": 0.8478339677360884, "grad_norm": 0.8448815941810608, "learning_rate": 4.926651112985511e-05, "loss": 0.4802, "step": 9355 }, { "epoch": 0.8482871125611745, "grad_norm": 0.8428962230682373, "learning_rate": 4.9264608193493365e-05, "loss": 0.5005, "step": 9360 }, { "epoch": 0.8487402573862607, "grad_norm": 0.8612008094787598, "learning_rate": 4.926270282872143e-05, "loss": 0.4917, "step": 9365 }, { "epoch": 0.8491934022113468, "grad_norm": 0.9170850515365601, "learning_rate": 4.926079503572999e-05, "loss": 0.4718, "step": 9370 }, { "epoch": 0.8496465470364328, "grad_norm": 0.8747850060462952, "learning_rate": 4.925888481470998e-05, "loss": 0.5219, "step": 9375 }, { "epoch": 0.8500996918615189, "grad_norm": 0.8926866054534912, "learning_rate": 4.925697216585258e-05, "loss": 0.4897, "step": 9380 }, { "epoch": 0.8505528366866051, "grad_norm": 0.9315779805183411, "learning_rate": 4.925505708934921e-05, "loss": 0.495, "step": 9385 }, { "epoch": 0.8510059815116912, "grad_norm": 0.8862330317497253, "learning_rate": 4.9253139585391515e-05, "loss": 0.5116, "step": 9390 }, { "epoch": 0.8514591263367772, "grad_norm": 0.7122061848640442, "learning_rate": 4.925121965417142e-05, "loss": 0.4259, "step": 9395 }, { "epoch": 0.8519122711618633, "grad_norm": 0.8328790664672852, "learning_rate": 4.924929729588106e-05, "loss": 0.4401, "step": 9400 }, { "epoch": 0.8523654159869495, "grad_norm": 0.8846346735954285, "learning_rate": 4.924737251071283e-05, "loss": 0.5143, "step": 9405 }, { "epoch": 0.8528185608120356, "grad_norm": 0.842960000038147, "learning_rate": 4.9245445298859364e-05, "loss": 0.4304, "step": 9410 }, { "epoch": 0.8532717056371216, "grad_norm": 0.812855064868927, "learning_rate": 4.924351566051354e-05, "loss": 0.4796, "step": 9415 }, { "epoch": 0.8537248504622077, "grad_norm": 0.8919089436531067, "learning_rate": 4.924158359586847e-05, "loss": 0.4849, "step": 9420 }, { "epoch": 0.8541779952872938, "grad_norm": 0.8520292043685913, "learning_rate": 4.9239649105117517e-05, "loss": 0.4767, "step": 9425 }, { "epoch": 0.85463114011238, "grad_norm": 0.8885392546653748, "learning_rate": 4.923771218845429e-05, "loss": 0.4869, "step": 9430 }, { "epoch": 0.855084284937466, "grad_norm": 0.8036805391311646, "learning_rate": 4.923577284607263e-05, "loss": 0.4851, "step": 9435 }, { "epoch": 0.8555374297625521, "grad_norm": 0.9063959717750549, "learning_rate": 4.9233831078166634e-05, "loss": 0.5227, "step": 9440 }, { "epoch": 0.8559905745876382, "grad_norm": 0.8788178563117981, "learning_rate": 4.923188688493063e-05, "loss": 0.4893, "step": 9445 }, { "epoch": 0.8564437194127243, "grad_norm": 0.8280020356178284, "learning_rate": 4.922994026655919e-05, "loss": 0.5132, "step": 9450 }, { "epoch": 0.8568968642378104, "grad_norm": 0.8646061420440674, "learning_rate": 4.922799122324715e-05, "loss": 0.5142, "step": 9455 }, { "epoch": 0.8573500090628965, "grad_norm": 0.8835160136222839, "learning_rate": 4.9226039755189544e-05, "loss": 0.5064, "step": 9460 }, { "epoch": 0.8578031538879826, "grad_norm": 0.8357526659965515, "learning_rate": 4.92240858625817e-05, "loss": 0.5206, "step": 9465 }, { "epoch": 0.8582562987130687, "grad_norm": 0.8656480312347412, "learning_rate": 4.922212954561915e-05, "loss": 0.4786, "step": 9470 }, { "epoch": 0.8587094435381548, "grad_norm": 0.8946352601051331, "learning_rate": 4.922017080449769e-05, "loss": 0.5153, "step": 9475 }, { "epoch": 0.8591625883632409, "grad_norm": 0.8420366644859314, "learning_rate": 4.9218209639413334e-05, "loss": 0.4862, "step": 9480 }, { "epoch": 0.859615733188327, "grad_norm": 0.8374010920524597, "learning_rate": 4.921624605056239e-05, "loss": 0.5122, "step": 9485 }, { "epoch": 0.8600688780134131, "grad_norm": 0.8315761089324951, "learning_rate": 4.9214280038141344e-05, "loss": 0.4733, "step": 9490 }, { "epoch": 0.8605220228384992, "grad_norm": 0.8426036834716797, "learning_rate": 4.9212311602346975e-05, "loss": 0.4815, "step": 9495 }, { "epoch": 0.8609751676635853, "grad_norm": 0.8081281185150146, "learning_rate": 4.921034074337626e-05, "loss": 0.4612, "step": 9500 }, { "epoch": 0.8614283124886714, "grad_norm": 0.8833835124969482, "learning_rate": 4.9208367461426474e-05, "loss": 0.4954, "step": 9505 }, { "epoch": 0.8618814573137574, "grad_norm": 0.8463374376296997, "learning_rate": 4.920639175669508e-05, "loss": 0.4482, "step": 9510 }, { "epoch": 0.8623346021388436, "grad_norm": 0.808097779750824, "learning_rate": 4.920441362937982e-05, "loss": 0.4625, "step": 9515 }, { "epoch": 0.8627877469639297, "grad_norm": 0.912240743637085, "learning_rate": 4.920243307967866e-05, "loss": 0.4769, "step": 9520 }, { "epoch": 0.8632408917890158, "grad_norm": 0.9288632273674011, "learning_rate": 4.920045010778982e-05, "loss": 0.5071, "step": 9525 }, { "epoch": 0.8636940366141018, "grad_norm": 0.8797445297241211, "learning_rate": 4.9198464713911746e-05, "loss": 0.5065, "step": 9530 }, { "epoch": 0.864147181439188, "grad_norm": 0.9494373798370361, "learning_rate": 4.9196476898243146e-05, "loss": 0.4531, "step": 9535 }, { "epoch": 0.8646003262642741, "grad_norm": 0.835588812828064, "learning_rate": 4.919448666098296e-05, "loss": 0.4916, "step": 9540 }, { "epoch": 0.8650534710893601, "grad_norm": 0.9695026874542236, "learning_rate": 4.919249400233037e-05, "loss": 0.5491, "step": 9545 }, { "epoch": 0.8655066159144462, "grad_norm": 0.9092223048210144, "learning_rate": 4.9190498922484796e-05, "loss": 0.5077, "step": 9550 }, { "epoch": 0.8659597607395324, "grad_norm": 0.8238818645477295, "learning_rate": 4.9188501421645914e-05, "loss": 0.516, "step": 9555 }, { "epoch": 0.8664129055646185, "grad_norm": 0.8678950667381287, "learning_rate": 4.9186501500013635e-05, "loss": 0.4838, "step": 9560 }, { "epoch": 0.8668660503897045, "grad_norm": 0.893283486366272, "learning_rate": 4.918449915778811e-05, "loss": 0.4847, "step": 9565 }, { "epoch": 0.8673191952147906, "grad_norm": 0.8486025333404541, "learning_rate": 4.918249439516974e-05, "loss": 0.4252, "step": 9570 }, { "epoch": 0.8677723400398768, "grad_norm": 0.805827796459198, "learning_rate": 4.918048721235915e-05, "loss": 0.4758, "step": 9575 }, { "epoch": 0.8682254848649629, "grad_norm": 0.7814825177192688, "learning_rate": 4.917847760955722e-05, "loss": 0.482, "step": 9580 }, { "epoch": 0.8686786296900489, "grad_norm": 0.7981892824172974, "learning_rate": 4.9176465586965084e-05, "loss": 0.474, "step": 9585 }, { "epoch": 0.869131774515135, "grad_norm": 0.8092257976531982, "learning_rate": 4.9174451144784095e-05, "loss": 0.4639, "step": 9590 }, { "epoch": 0.8695849193402211, "grad_norm": 0.8129842281341553, "learning_rate": 4.917243428321587e-05, "loss": 0.5302, "step": 9595 }, { "epoch": 0.8700380641653073, "grad_norm": 0.8447667360305786, "learning_rate": 4.9170415002462244e-05, "loss": 0.4701, "step": 9600 }, { "epoch": 0.8704912089903933, "grad_norm": 0.7875558137893677, "learning_rate": 4.9168393302725316e-05, "loss": 0.4356, "step": 9605 }, { "epoch": 0.8709443538154794, "grad_norm": 0.8897076845169067, "learning_rate": 4.916636918420742e-05, "loss": 0.4761, "step": 9610 }, { "epoch": 0.8713974986405655, "grad_norm": 0.765748918056488, "learning_rate": 4.916434264711112e-05, "loss": 0.4885, "step": 9615 }, { "epoch": 0.8718506434656517, "grad_norm": 0.8717777132987976, "learning_rate": 4.916231369163924e-05, "loss": 0.4918, "step": 9620 }, { "epoch": 0.8723037882907377, "grad_norm": 0.8543215394020081, "learning_rate": 4.9160282317994845e-05, "loss": 0.4962, "step": 9625 }, { "epoch": 0.8727569331158238, "grad_norm": 0.8479017019271851, "learning_rate": 4.915824852638122e-05, "loss": 0.5714, "step": 9630 }, { "epoch": 0.8732100779409099, "grad_norm": 0.8589567542076111, "learning_rate": 4.915621231700192e-05, "loss": 0.5527, "step": 9635 }, { "epoch": 0.8736632227659961, "grad_norm": 0.7907124757766724, "learning_rate": 4.9154173690060724e-05, "loss": 0.5153, "step": 9640 }, { "epoch": 0.8741163675910821, "grad_norm": 0.9135311245918274, "learning_rate": 4.9152132645761664e-05, "loss": 0.5419, "step": 9645 }, { "epoch": 0.8745695124161682, "grad_norm": 0.8193404674530029, "learning_rate": 4.9150089184308995e-05, "loss": 0.5078, "step": 9650 }, { "epoch": 0.8750226572412543, "grad_norm": 0.8277804255485535, "learning_rate": 4.9148043305907245e-05, "loss": 0.4726, "step": 9655 }, { "epoch": 0.8754758020663403, "grad_norm": 0.8614281415939331, "learning_rate": 4.914599501076115e-05, "loss": 0.5307, "step": 9660 }, { "epoch": 0.8759289468914265, "grad_norm": 0.8606155514717102, "learning_rate": 4.9143944299075715e-05, "loss": 0.455, "step": 9665 }, { "epoch": 0.8763820917165126, "grad_norm": 0.9004210829734802, "learning_rate": 4.9141891171056176e-05, "loss": 0.4924, "step": 9670 }, { "epoch": 0.8768352365415987, "grad_norm": 0.908113420009613, "learning_rate": 4.9139835626908006e-05, "loss": 0.5212, "step": 9675 }, { "epoch": 0.8772883813666847, "grad_norm": 0.9058215022087097, "learning_rate": 4.913777766683692e-05, "loss": 0.4935, "step": 9680 }, { "epoch": 0.8777415261917709, "grad_norm": 0.8783618807792664, "learning_rate": 4.913571729104889e-05, "loss": 0.4931, "step": 9685 }, { "epoch": 0.878194671016857, "grad_norm": 0.8827531337738037, "learning_rate": 4.913365449975012e-05, "loss": 0.5055, "step": 9690 }, { "epoch": 0.8786478158419431, "grad_norm": 0.8467762470245361, "learning_rate": 4.913158929314704e-05, "loss": 0.4865, "step": 9695 }, { "epoch": 0.8791009606670291, "grad_norm": 0.8755585551261902, "learning_rate": 4.9129521671446346e-05, "loss": 0.4857, "step": 9700 }, { "epoch": 0.8795541054921153, "grad_norm": 0.8243511319160461, "learning_rate": 4.912745163485497e-05, "loss": 0.4973, "step": 9705 }, { "epoch": 0.8800072503172014, "grad_norm": 0.8796558976173401, "learning_rate": 4.912537918358008e-05, "loss": 0.5069, "step": 9710 }, { "epoch": 0.8804603951422875, "grad_norm": 0.8879361152648926, "learning_rate": 4.912330431782909e-05, "loss": 0.4785, "step": 9715 }, { "epoch": 0.8809135399673735, "grad_norm": 0.8765604496002197, "learning_rate": 4.9121227037809634e-05, "loss": 0.5077, "step": 9720 }, { "epoch": 0.8813666847924597, "grad_norm": 0.8996294736862183, "learning_rate": 4.911914734372963e-05, "loss": 0.4999, "step": 9725 }, { "epoch": 0.8818198296175458, "grad_norm": 0.8524369597434998, "learning_rate": 4.911706523579721e-05, "loss": 0.5262, "step": 9730 }, { "epoch": 0.8822729744426319, "grad_norm": 0.8642301559448242, "learning_rate": 4.911498071422075e-05, "loss": 0.4961, "step": 9735 }, { "epoch": 0.8827261192677179, "grad_norm": 0.8350131511688232, "learning_rate": 4.911289377920886e-05, "loss": 0.486, "step": 9740 }, { "epoch": 0.883179264092804, "grad_norm": 1.0574283599853516, "learning_rate": 4.911080443097041e-05, "loss": 0.4983, "step": 9745 }, { "epoch": 0.8836324089178902, "grad_norm": 0.8227438926696777, "learning_rate": 4.910871266971451e-05, "loss": 0.4816, "step": 9750 }, { "epoch": 0.8840855537429763, "grad_norm": 0.8885517120361328, "learning_rate": 4.910661849565049e-05, "loss": 0.5342, "step": 9755 }, { "epoch": 0.8845386985680623, "grad_norm": 0.8717201352119446, "learning_rate": 4.910452190898794e-05, "loss": 0.4834, "step": 9760 }, { "epoch": 0.8849918433931484, "grad_norm": 0.8783742785453796, "learning_rate": 4.910242290993668e-05, "loss": 0.5076, "step": 9765 }, { "epoch": 0.8854449882182346, "grad_norm": 0.830932080745697, "learning_rate": 4.910032149870681e-05, "loss": 0.5514, "step": 9770 }, { "epoch": 0.8858981330433207, "grad_norm": 1.0343940258026123, "learning_rate": 4.909821767550861e-05, "loss": 0.5079, "step": 9775 }, { "epoch": 0.8863512778684067, "grad_norm": 0.8330262899398804, "learning_rate": 4.9096111440552626e-05, "loss": 0.449, "step": 9780 }, { "epoch": 0.8868044226934928, "grad_norm": 0.8082001805305481, "learning_rate": 4.9094002794049674e-05, "loss": 0.4878, "step": 9785 }, { "epoch": 0.887257567518579, "grad_norm": 0.8082283139228821, "learning_rate": 4.9091891736210774e-05, "loss": 0.4757, "step": 9790 }, { "epoch": 0.887710712343665, "grad_norm": 0.9227048754692078, "learning_rate": 4.908977826724721e-05, "loss": 0.5256, "step": 9795 }, { "epoch": 0.8881638571687511, "grad_norm": 0.9090992212295532, "learning_rate": 4.908766238737048e-05, "loss": 0.4373, "step": 9800 }, { "epoch": 0.8886170019938372, "grad_norm": 0.8277629017829895, "learning_rate": 4.908554409679237e-05, "loss": 0.4985, "step": 9805 }, { "epoch": 0.8890701468189234, "grad_norm": 0.773097813129425, "learning_rate": 4.9083423395724856e-05, "loss": 0.4649, "step": 9810 }, { "epoch": 0.8895232916440095, "grad_norm": 0.7961937189102173, "learning_rate": 4.908130028438019e-05, "loss": 0.5592, "step": 9815 }, { "epoch": 0.8899764364690955, "grad_norm": 0.7779892683029175, "learning_rate": 4.9079174762970855e-05, "loss": 0.5153, "step": 9820 }, { "epoch": 0.8904295812941816, "grad_norm": 0.8714814186096191, "learning_rate": 4.907704683170956e-05, "loss": 0.4639, "step": 9825 }, { "epoch": 0.8908827261192677, "grad_norm": 0.9044687151908875, "learning_rate": 4.907491649080929e-05, "loss": 0.5151, "step": 9830 }, { "epoch": 0.8913358709443538, "grad_norm": 0.9223561882972717, "learning_rate": 4.907278374048323e-05, "loss": 0.4946, "step": 9835 }, { "epoch": 0.8917890157694399, "grad_norm": 0.8112915754318237, "learning_rate": 4.907064858094485e-05, "loss": 0.4627, "step": 9840 }, { "epoch": 0.892242160594526, "grad_norm": 0.9020683765411377, "learning_rate": 4.90685110124078e-05, "loss": 0.5629, "step": 9845 }, { "epoch": 0.8926953054196121, "grad_norm": 0.807222306728363, "learning_rate": 4.9066371035086045e-05, "loss": 0.442, "step": 9850 }, { "epoch": 0.8931484502446982, "grad_norm": 0.9394431710243225, "learning_rate": 4.9064228649193746e-05, "loss": 0.5771, "step": 9855 }, { "epoch": 0.8936015950697843, "grad_norm": 0.819263756275177, "learning_rate": 4.90620838549453e-05, "loss": 0.5414, "step": 9860 }, { "epoch": 0.8940547398948704, "grad_norm": 0.8212104439735413, "learning_rate": 4.9059936652555375e-05, "loss": 0.4738, "step": 9865 }, { "epoch": 0.8945078847199565, "grad_norm": 0.8486361503601074, "learning_rate": 4.9057787042238844e-05, "loss": 0.4729, "step": 9870 }, { "epoch": 0.8949610295450426, "grad_norm": 0.838636040687561, "learning_rate": 4.905563502421086e-05, "loss": 0.4582, "step": 9875 }, { "epoch": 0.8954141743701287, "grad_norm": 0.8074139356613159, "learning_rate": 4.9053480598686796e-05, "loss": 0.4415, "step": 9880 }, { "epoch": 0.8958673191952148, "grad_norm": 0.8868325352668762, "learning_rate": 4.905132376588225e-05, "loss": 0.5045, "step": 9885 }, { "epoch": 0.8963204640203009, "grad_norm": 0.8633578419685364, "learning_rate": 4.9049164526013104e-05, "loss": 0.5121, "step": 9890 }, { "epoch": 0.896773608845387, "grad_norm": 0.8426816463470459, "learning_rate": 4.9047002879295435e-05, "loss": 0.4959, "step": 9895 }, { "epoch": 0.8972267536704731, "grad_norm": 0.8705111742019653, "learning_rate": 4.9044838825945593e-05, "loss": 0.4956, "step": 9900 }, { "epoch": 0.8976798984955592, "grad_norm": 0.9335348606109619, "learning_rate": 4.904267236618015e-05, "loss": 0.4895, "step": 9905 }, { "epoch": 0.8981330433206453, "grad_norm": 0.79926997423172, "learning_rate": 4.904050350021593e-05, "loss": 0.4692, "step": 9910 }, { "epoch": 0.8985861881457313, "grad_norm": 0.7301221489906311, "learning_rate": 4.903833222826999e-05, "loss": 0.4319, "step": 9915 }, { "epoch": 0.8990393329708175, "grad_norm": 0.8650585412979126, "learning_rate": 4.903615855055964e-05, "loss": 0.4946, "step": 9920 }, { "epoch": 0.8994924777959036, "grad_norm": 0.8980997800827026, "learning_rate": 4.903398246730241e-05, "loss": 0.5388, "step": 9925 }, { "epoch": 0.8999456226209896, "grad_norm": 0.7930150032043457, "learning_rate": 4.903180397871609e-05, "loss": 0.43, "step": 9930 }, { "epoch": 0.9003987674460757, "grad_norm": 1.0339504480361938, "learning_rate": 4.9029623085018715e-05, "loss": 0.487, "step": 9935 }, { "epoch": 0.9008519122711619, "grad_norm": 0.8717787265777588, "learning_rate": 4.902743978642853e-05, "loss": 0.4549, "step": 9940 }, { "epoch": 0.901305057096248, "grad_norm": 0.8501968383789062, "learning_rate": 4.9025254083164053e-05, "loss": 0.4689, "step": 9945 }, { "epoch": 0.901758201921334, "grad_norm": 0.879324197769165, "learning_rate": 4.9023065975444015e-05, "loss": 0.4502, "step": 9950 }, { "epoch": 0.9022113467464201, "grad_norm": 1.0190086364746094, "learning_rate": 4.9020875463487425e-05, "loss": 0.5047, "step": 9955 }, { "epoch": 0.9026644915715063, "grad_norm": 0.8390746116638184, "learning_rate": 4.9018682547513494e-05, "loss": 0.4931, "step": 9960 }, { "epoch": 0.9031176363965924, "grad_norm": 0.9022707343101501, "learning_rate": 4.901648722774169e-05, "loss": 0.5397, "step": 9965 }, { "epoch": 0.9035707812216784, "grad_norm": 0.8293024897575378, "learning_rate": 4.901428950439173e-05, "loss": 0.4671, "step": 9970 }, { "epoch": 0.9040239260467645, "grad_norm": 0.9402429461479187, "learning_rate": 4.9012089377683556e-05, "loss": 0.482, "step": 9975 }, { "epoch": 0.9044770708718507, "grad_norm": 0.8583147525787354, "learning_rate": 4.900988684783736e-05, "loss": 0.441, "step": 9980 }, { "epoch": 0.9049302156969368, "grad_norm": 0.8074634075164795, "learning_rate": 4.9007681915073576e-05, "loss": 0.4593, "step": 9985 }, { "epoch": 0.9053833605220228, "grad_norm": 0.8689502477645874, "learning_rate": 4.900547457961287e-05, "loss": 0.4968, "step": 9990 }, { "epoch": 0.9058365053471089, "grad_norm": 0.8653568625450134, "learning_rate": 4.900326484167615e-05, "loss": 0.4994, "step": 9995 }, { "epoch": 0.906289650172195, "grad_norm": 0.932055652141571, "learning_rate": 4.9001052701484575e-05, "loss": 0.4844, "step": 10000 }, { "epoch": 0.9067427949972812, "grad_norm": 0.9027488231658936, "learning_rate": 4.899883815925953e-05, "loss": 0.4851, "step": 10005 }, { "epoch": 0.9071959398223672, "grad_norm": 1.0366978645324707, "learning_rate": 4.899662121522264e-05, "loss": 0.4673, "step": 10010 }, { "epoch": 0.9076490846474533, "grad_norm": 0.8394487500190735, "learning_rate": 4.899440186959579e-05, "loss": 0.4898, "step": 10015 }, { "epoch": 0.9081022294725394, "grad_norm": 0.837236225605011, "learning_rate": 4.8992180122601094e-05, "loss": 0.4946, "step": 10020 }, { "epoch": 0.9085553742976256, "grad_norm": 0.8169035911560059, "learning_rate": 4.8989955974460896e-05, "loss": 0.471, "step": 10025 }, { "epoch": 0.9090085191227116, "grad_norm": 0.8784216642379761, "learning_rate": 4.8987729425397806e-05, "loss": 0.5177, "step": 10030 }, { "epoch": 0.9094616639477977, "grad_norm": 0.8895712494850159, "learning_rate": 4.8985500475634635e-05, "loss": 0.4965, "step": 10035 }, { "epoch": 0.9099148087728838, "grad_norm": 0.934988260269165, "learning_rate": 4.898326912539447e-05, "loss": 0.4656, "step": 10040 }, { "epoch": 0.91036795359797, "grad_norm": 0.8188620805740356, "learning_rate": 4.898103537490063e-05, "loss": 0.4952, "step": 10045 }, { "epoch": 0.910821098423056, "grad_norm": 0.8991656303405762, "learning_rate": 4.897879922437666e-05, "loss": 0.4674, "step": 10050 }, { "epoch": 0.9112742432481421, "grad_norm": 0.8106249570846558, "learning_rate": 4.897656067404635e-05, "loss": 0.5098, "step": 10055 }, { "epoch": 0.9117273880732282, "grad_norm": 0.829221248626709, "learning_rate": 4.897431972413375e-05, "loss": 0.5059, "step": 10060 }, { "epoch": 0.9121805328983144, "grad_norm": 0.7718456983566284, "learning_rate": 4.8972076374863124e-05, "loss": 0.4728, "step": 10065 }, { "epoch": 0.9126336777234004, "grad_norm": 0.9091936945915222, "learning_rate": 4.896983062645899e-05, "loss": 0.5382, "step": 10070 }, { "epoch": 0.9130868225484865, "grad_norm": 0.8871400952339172, "learning_rate": 4.896758247914611e-05, "loss": 0.4729, "step": 10075 }, { "epoch": 0.9135399673735726, "grad_norm": 0.8252334594726562, "learning_rate": 4.896533193314947e-05, "loss": 0.4504, "step": 10080 }, { "epoch": 0.9139931121986586, "grad_norm": 0.9001058340072632, "learning_rate": 4.896307898869431e-05, "loss": 0.461, "step": 10085 }, { "epoch": 0.9144462570237448, "grad_norm": 0.7907036542892456, "learning_rate": 4.896082364600609e-05, "loss": 0.4969, "step": 10090 }, { "epoch": 0.9148994018488309, "grad_norm": 0.8196050524711609, "learning_rate": 4.895856590531056e-05, "loss": 0.4947, "step": 10095 }, { "epoch": 0.915352546673917, "grad_norm": 0.9437698125839233, "learning_rate": 4.895630576683364e-05, "loss": 0.5177, "step": 10100 }, { "epoch": 0.915805691499003, "grad_norm": 0.8023747205734253, "learning_rate": 4.895404323080155e-05, "loss": 0.5113, "step": 10105 }, { "epoch": 0.9162588363240892, "grad_norm": 0.9388362765312195, "learning_rate": 4.895177829744071e-05, "loss": 0.4774, "step": 10110 }, { "epoch": 0.9167119811491753, "grad_norm": 1.0606410503387451, "learning_rate": 4.894951096697781e-05, "loss": 0.5062, "step": 10115 }, { "epoch": 0.9171651259742614, "grad_norm": 0.8735558390617371, "learning_rate": 4.8947241239639746e-05, "loss": 0.4809, "step": 10120 }, { "epoch": 0.9176182707993474, "grad_norm": 0.9541451334953308, "learning_rate": 4.8944969115653684e-05, "loss": 0.435, "step": 10125 }, { "epoch": 0.9180714156244336, "grad_norm": 0.7837846279144287, "learning_rate": 4.894269459524702e-05, "loss": 0.4972, "step": 10130 }, { "epoch": 0.9185245604495197, "grad_norm": 0.8609964847564697, "learning_rate": 4.8940417678647395e-05, "loss": 0.5031, "step": 10135 }, { "epoch": 0.9189777052746058, "grad_norm": 0.8507171869277954, "learning_rate": 4.893813836608266e-05, "loss": 0.4894, "step": 10140 }, { "epoch": 0.9194308500996918, "grad_norm": 0.8470519185066223, "learning_rate": 4.893585665778096e-05, "loss": 0.4901, "step": 10145 }, { "epoch": 0.9198839949247779, "grad_norm": 0.8887928128242493, "learning_rate": 4.893357255397063e-05, "loss": 0.455, "step": 10150 }, { "epoch": 0.9203371397498641, "grad_norm": 0.9443104267120361, "learning_rate": 4.8931286054880266e-05, "loss": 0.4986, "step": 10155 }, { "epoch": 0.9207902845749502, "grad_norm": 0.8472976088523865, "learning_rate": 4.89289971607387e-05, "loss": 0.438, "step": 10160 }, { "epoch": 0.9212434294000362, "grad_norm": 0.8846810460090637, "learning_rate": 4.892670587177501e-05, "loss": 0.4766, "step": 10165 }, { "epoch": 0.9216965742251223, "grad_norm": 0.7471703290939331, "learning_rate": 4.8924412188218515e-05, "loss": 0.44, "step": 10170 }, { "epoch": 0.9221497190502085, "grad_norm": 0.9249391555786133, "learning_rate": 4.8922116110298754e-05, "loss": 0.521, "step": 10175 }, { "epoch": 0.9226028638752946, "grad_norm": 0.8743047118186951, "learning_rate": 4.891981763824554e-05, "loss": 0.4375, "step": 10180 }, { "epoch": 0.9230560087003806, "grad_norm": 0.9827556014060974, "learning_rate": 4.8917516772288874e-05, "loss": 0.4953, "step": 10185 }, { "epoch": 0.9235091535254667, "grad_norm": 0.8894075751304626, "learning_rate": 4.8915213512659045e-05, "loss": 0.482, "step": 10190 }, { "epoch": 0.9239622983505529, "grad_norm": 0.7805470824241638, "learning_rate": 4.891290785958657e-05, "loss": 0.4714, "step": 10195 }, { "epoch": 0.924415443175639, "grad_norm": 0.8717172741889954, "learning_rate": 4.891059981330219e-05, "loss": 0.4866, "step": 10200 }, { "epoch": 0.924868588000725, "grad_norm": 0.8268107771873474, "learning_rate": 4.89082893740369e-05, "loss": 0.444, "step": 10205 }, { "epoch": 0.9253217328258111, "grad_norm": 0.8305187225341797, "learning_rate": 4.890597654202192e-05, "loss": 0.5086, "step": 10210 }, { "epoch": 0.9257748776508973, "grad_norm": 0.8542505502700806, "learning_rate": 4.890366131748873e-05, "loss": 0.4628, "step": 10215 }, { "epoch": 0.9262280224759833, "grad_norm": 0.9021068811416626, "learning_rate": 4.890134370066903e-05, "loss": 0.4317, "step": 10220 }, { "epoch": 0.9266811673010694, "grad_norm": 0.9579981565475464, "learning_rate": 4.889902369179479e-05, "loss": 0.4918, "step": 10225 }, { "epoch": 0.9271343121261555, "grad_norm": 0.8470531105995178, "learning_rate": 4.8896701291098165e-05, "loss": 0.4994, "step": 10230 }, { "epoch": 0.9275874569512416, "grad_norm": 0.8731473684310913, "learning_rate": 4.8894376498811596e-05, "loss": 0.47, "step": 10235 }, { "epoch": 0.9280406017763277, "grad_norm": 0.8073639273643494, "learning_rate": 4.889204931516775e-05, "loss": 0.447, "step": 10240 }, { "epoch": 0.9284937466014138, "grad_norm": 0.8339231610298157, "learning_rate": 4.888971974039954e-05, "loss": 0.4778, "step": 10245 }, { "epoch": 0.9289468914264999, "grad_norm": 0.80567866563797, "learning_rate": 4.8887387774740095e-05, "loss": 0.4942, "step": 10250 }, { "epoch": 0.929400036251586, "grad_norm": 0.8922431468963623, "learning_rate": 4.888505341842281e-05, "loss": 0.5096, "step": 10255 }, { "epoch": 0.9298531810766721, "grad_norm": 0.7807848453521729, "learning_rate": 4.888271667168131e-05, "loss": 0.4881, "step": 10260 }, { "epoch": 0.9303063259017582, "grad_norm": 0.857519268989563, "learning_rate": 4.8880377534749434e-05, "loss": 0.5007, "step": 10265 }, { "epoch": 0.9307594707268443, "grad_norm": 0.7641361951828003, "learning_rate": 4.887803600786131e-05, "loss": 0.5101, "step": 10270 }, { "epoch": 0.9312126155519304, "grad_norm": 0.7894142270088196, "learning_rate": 4.887569209125128e-05, "loss": 0.4994, "step": 10275 }, { "epoch": 0.9316657603770165, "grad_norm": 0.8313911557197571, "learning_rate": 4.88733457851539e-05, "loss": 0.4269, "step": 10280 }, { "epoch": 0.9321189052021026, "grad_norm": 0.8807212710380554, "learning_rate": 4.887099708980402e-05, "loss": 0.4328, "step": 10285 }, { "epoch": 0.9325720500271887, "grad_norm": 0.9407925605773926, "learning_rate": 4.886864600543667e-05, "loss": 0.4638, "step": 10290 }, { "epoch": 0.9330251948522748, "grad_norm": 0.9441057443618774, "learning_rate": 4.886629253228716e-05, "loss": 0.5588, "step": 10295 }, { "epoch": 0.9334783396773609, "grad_norm": 0.8102371692657471, "learning_rate": 4.886393667059103e-05, "loss": 0.4931, "step": 10300 }, { "epoch": 0.933931484502447, "grad_norm": 0.8396726846694946, "learning_rate": 4.8861578420584045e-05, "loss": 0.4637, "step": 10305 }, { "epoch": 0.9343846293275331, "grad_norm": 0.8664460182189941, "learning_rate": 4.8859217782502225e-05, "loss": 0.4796, "step": 10310 }, { "epoch": 0.9348377741526191, "grad_norm": 0.8229756355285645, "learning_rate": 4.8856854756581826e-05, "loss": 0.4861, "step": 10315 }, { "epoch": 0.9352909189777052, "grad_norm": 0.8942013382911682, "learning_rate": 4.885448934305934e-05, "loss": 0.4145, "step": 10320 }, { "epoch": 0.9357440638027914, "grad_norm": 0.8724278807640076, "learning_rate": 4.885212154217149e-05, "loss": 0.4515, "step": 10325 }, { "epoch": 0.9361972086278775, "grad_norm": 0.8591696619987488, "learning_rate": 4.884975135415526e-05, "loss": 0.4773, "step": 10330 }, { "epoch": 0.9366503534529635, "grad_norm": 0.8400585055351257, "learning_rate": 4.884737877924786e-05, "loss": 0.4226, "step": 10335 }, { "epoch": 0.9371034982780496, "grad_norm": 0.9599235653877258, "learning_rate": 4.884500381768671e-05, "loss": 0.5179, "step": 10340 }, { "epoch": 0.9375566431031358, "grad_norm": 0.7889439463615417, "learning_rate": 4.884262646970953e-05, "loss": 0.4601, "step": 10345 }, { "epoch": 0.9380097879282219, "grad_norm": 0.8779420852661133, "learning_rate": 4.884024673555423e-05, "loss": 0.4277, "step": 10350 }, { "epoch": 0.9384629327533079, "grad_norm": 0.9300277829170227, "learning_rate": 4.883786461545898e-05, "loss": 0.4781, "step": 10355 }, { "epoch": 0.938916077578394, "grad_norm": 0.8328245878219604, "learning_rate": 4.883548010966218e-05, "loss": 0.4989, "step": 10360 }, { "epoch": 0.9393692224034802, "grad_norm": 0.7910520434379578, "learning_rate": 4.883309321840247e-05, "loss": 0.4895, "step": 10365 }, { "epoch": 0.9398223672285663, "grad_norm": 0.8679208755493164, "learning_rate": 4.883070394191873e-05, "loss": 0.4489, "step": 10370 }, { "epoch": 0.9402755120536523, "grad_norm": 0.7814086079597473, "learning_rate": 4.8828312280450096e-05, "loss": 0.4328, "step": 10375 }, { "epoch": 0.9407286568787384, "grad_norm": 0.8289539813995361, "learning_rate": 4.88259182342359e-05, "loss": 0.5259, "step": 10380 }, { "epoch": 0.9411818017038246, "grad_norm": 0.8698672652244568, "learning_rate": 4.882352180351576e-05, "loss": 0.4765, "step": 10385 }, { "epoch": 0.9416349465289107, "grad_norm": 0.9381403923034668, "learning_rate": 4.88211229885295e-05, "loss": 0.5044, "step": 10390 }, { "epoch": 0.9420880913539967, "grad_norm": 0.879417896270752, "learning_rate": 4.8818721789517206e-05, "loss": 0.4904, "step": 10395 }, { "epoch": 0.9425412361790828, "grad_norm": 0.7859774231910706, "learning_rate": 4.881631820671918e-05, "loss": 0.4188, "step": 10400 }, { "epoch": 0.9429943810041689, "grad_norm": 0.912597119808197, "learning_rate": 4.881391224037598e-05, "loss": 0.4651, "step": 10405 }, { "epoch": 0.9434475258292551, "grad_norm": 0.7326025366783142, "learning_rate": 4.881150389072839e-05, "loss": 0.4514, "step": 10410 }, { "epoch": 0.9439006706543411, "grad_norm": 0.8964463472366333, "learning_rate": 4.880909315801744e-05, "loss": 0.4578, "step": 10415 }, { "epoch": 0.9443538154794272, "grad_norm": 0.858917236328125, "learning_rate": 4.88066800424844e-05, "loss": 0.4924, "step": 10420 }, { "epoch": 0.9448069603045133, "grad_norm": 0.8831143379211426, "learning_rate": 4.880426454437077e-05, "loss": 0.449, "step": 10425 }, { "epoch": 0.9452601051295995, "grad_norm": 0.9432497024536133, "learning_rate": 4.88018466639183e-05, "loss": 0.5056, "step": 10430 }, { "epoch": 0.9457132499546855, "grad_norm": 0.8289841413497925, "learning_rate": 4.8799426401368964e-05, "loss": 0.4347, "step": 10435 }, { "epoch": 0.9461663947797716, "grad_norm": 0.9051375985145569, "learning_rate": 4.8797003756964995e-05, "loss": 0.4953, "step": 10440 }, { "epoch": 0.9466195396048577, "grad_norm": 0.8641471266746521, "learning_rate": 4.8794578730948846e-05, "loss": 0.4287, "step": 10445 }, { "epoch": 0.9470726844299439, "grad_norm": 0.937078595161438, "learning_rate": 4.879215132356321e-05, "loss": 0.4882, "step": 10450 }, { "epoch": 0.9475258292550299, "grad_norm": 0.8335527777671814, "learning_rate": 4.878972153505102e-05, "loss": 0.5213, "step": 10455 }, { "epoch": 0.947978974080116, "grad_norm": 0.8322834372520447, "learning_rate": 4.878728936565547e-05, "loss": 0.4916, "step": 10460 }, { "epoch": 0.9484321189052021, "grad_norm": 0.8968382477760315, "learning_rate": 4.878485481561996e-05, "loss": 0.4694, "step": 10465 }, { "epoch": 0.9488852637302883, "grad_norm": 0.8000396490097046, "learning_rate": 4.878241788518813e-05, "loss": 0.4994, "step": 10470 }, { "epoch": 0.9493384085553743, "grad_norm": 0.8609491586685181, "learning_rate": 4.877997857460388e-05, "loss": 0.4573, "step": 10475 }, { "epoch": 0.9497915533804604, "grad_norm": 0.8666964769363403, "learning_rate": 4.8777536884111344e-05, "loss": 0.5212, "step": 10480 }, { "epoch": 0.9502446982055465, "grad_norm": 0.8340434432029724, "learning_rate": 4.8775092813954865e-05, "loss": 0.4785, "step": 10485 }, { "epoch": 0.9506978430306325, "grad_norm": 0.8853246569633484, "learning_rate": 4.877264636437907e-05, "loss": 0.5305, "step": 10490 }, { "epoch": 0.9511509878557187, "grad_norm": 0.8149592876434326, "learning_rate": 4.877019753562878e-05, "loss": 0.4961, "step": 10495 }, { "epoch": 0.9516041326808048, "grad_norm": 0.8426259756088257, "learning_rate": 4.876774632794909e-05, "loss": 0.4908, "step": 10500 }, { "epoch": 0.9520572775058909, "grad_norm": 0.8251957297325134, "learning_rate": 4.8765292741585325e-05, "loss": 0.4561, "step": 10505 }, { "epoch": 0.9525104223309769, "grad_norm": 0.9247187972068787, "learning_rate": 4.8762836776783015e-05, "loss": 0.4624, "step": 10510 }, { "epoch": 0.9529635671560631, "grad_norm": 0.7649874687194824, "learning_rate": 4.876037843378797e-05, "loss": 0.4737, "step": 10515 }, { "epoch": 0.9534167119811492, "grad_norm": 0.8509097695350647, "learning_rate": 4.875791771284622e-05, "loss": 0.4776, "step": 10520 }, { "epoch": 0.9538698568062353, "grad_norm": 0.9223915338516235, "learning_rate": 4.875545461420403e-05, "loss": 0.4998, "step": 10525 }, { "epoch": 0.9543230016313213, "grad_norm": 0.831737220287323, "learning_rate": 4.875298913810792e-05, "loss": 0.453, "step": 10530 }, { "epoch": 0.9547761464564075, "grad_norm": 0.8448391556739807, "learning_rate": 4.875052128480462e-05, "loss": 0.4998, "step": 10535 }, { "epoch": 0.9552292912814936, "grad_norm": 0.8813838958740234, "learning_rate": 4.874805105454113e-05, "loss": 0.5116, "step": 10540 }, { "epoch": 0.9556824361065797, "grad_norm": 0.8470022082328796, "learning_rate": 4.874557844756466e-05, "loss": 0.443, "step": 10545 }, { "epoch": 0.9561355809316657, "grad_norm": 0.8595629930496216, "learning_rate": 4.874310346412266e-05, "loss": 0.4732, "step": 10550 }, { "epoch": 0.9565887257567519, "grad_norm": 0.8448654413223267, "learning_rate": 4.874062610446285e-05, "loss": 0.4681, "step": 10555 }, { "epoch": 0.957041870581838, "grad_norm": 0.8277295231819153, "learning_rate": 4.873814636883315e-05, "loss": 0.4366, "step": 10560 }, { "epoch": 0.957495015406924, "grad_norm": 0.9070938229560852, "learning_rate": 4.8735664257481736e-05, "loss": 0.5012, "step": 10565 }, { "epoch": 0.9579481602320101, "grad_norm": 0.8514173626899719, "learning_rate": 4.873317977065702e-05, "loss": 0.4533, "step": 10570 }, { "epoch": 0.9584013050570962, "grad_norm": 0.9580140113830566, "learning_rate": 4.873069290860766e-05, "loss": 0.4472, "step": 10575 }, { "epoch": 0.9588544498821824, "grad_norm": 0.8417937159538269, "learning_rate": 4.872820367158252e-05, "loss": 0.4739, "step": 10580 }, { "epoch": 0.9593075947072685, "grad_norm": 0.8720872402191162, "learning_rate": 4.872571205983073e-05, "loss": 0.4678, "step": 10585 }, { "epoch": 0.9597607395323545, "grad_norm": 0.8765698075294495, "learning_rate": 4.8723218073601665e-05, "loss": 0.4996, "step": 10590 }, { "epoch": 0.9602138843574406, "grad_norm": 0.8655943870544434, "learning_rate": 4.8720721713144916e-05, "loss": 0.4769, "step": 10595 }, { "epoch": 0.9606670291825268, "grad_norm": 0.8656120896339417, "learning_rate": 4.8718222978710315e-05, "loss": 0.4651, "step": 10600 }, { "epoch": 0.9611201740076128, "grad_norm": 0.8455989956855774, "learning_rate": 4.8715721870547947e-05, "loss": 0.4864, "step": 10605 }, { "epoch": 0.9615733188326989, "grad_norm": 0.838371753692627, "learning_rate": 4.871321838890811e-05, "loss": 0.4377, "step": 10610 }, { "epoch": 0.962026463657785, "grad_norm": 1.0478646755218506, "learning_rate": 4.871071253404137e-05, "loss": 0.478, "step": 10615 }, { "epoch": 0.9624796084828712, "grad_norm": 0.8721421957015991, "learning_rate": 4.8708204306198494e-05, "loss": 0.5016, "step": 10620 }, { "epoch": 0.9629327533079572, "grad_norm": 0.8017870187759399, "learning_rate": 4.870569370563052e-05, "loss": 0.469, "step": 10625 }, { "epoch": 0.9633858981330433, "grad_norm": 0.9344682693481445, "learning_rate": 4.87031807325887e-05, "loss": 0.5082, "step": 10630 }, { "epoch": 0.9638390429581294, "grad_norm": 0.9241188168525696, "learning_rate": 4.870066538732455e-05, "loss": 0.5113, "step": 10635 }, { "epoch": 0.9642921877832156, "grad_norm": 0.920012891292572, "learning_rate": 4.869814767008979e-05, "loss": 0.4794, "step": 10640 }, { "epoch": 0.9647453326083016, "grad_norm": 0.9098497033119202, "learning_rate": 4.8695627581136405e-05, "loss": 0.444, "step": 10645 }, { "epoch": 0.9651984774333877, "grad_norm": 0.7900912165641785, "learning_rate": 4.86931051207166e-05, "loss": 0.476, "step": 10650 }, { "epoch": 0.9656516222584738, "grad_norm": 1.0324937105178833, "learning_rate": 4.869058028908282e-05, "loss": 0.4986, "step": 10655 }, { "epoch": 0.9661047670835599, "grad_norm": 0.8832917213439941, "learning_rate": 4.868805308648776e-05, "loss": 0.4551, "step": 10660 }, { "epoch": 0.966557911908646, "grad_norm": 0.8860809803009033, "learning_rate": 4.868552351318434e-05, "loss": 0.5183, "step": 10665 }, { "epoch": 0.9670110567337321, "grad_norm": 0.8516433835029602, "learning_rate": 4.8682991569425715e-05, "loss": 0.4849, "step": 10670 }, { "epoch": 0.9674642015588182, "grad_norm": 0.7975735068321228, "learning_rate": 4.86804572554653e-05, "loss": 0.5203, "step": 10675 }, { "epoch": 0.9679173463839043, "grad_norm": 0.8499065041542053, "learning_rate": 4.86779205715567e-05, "loss": 0.4293, "step": 10680 }, { "epoch": 0.9683704912089904, "grad_norm": 0.8117836713790894, "learning_rate": 4.867538151795381e-05, "loss": 0.4804, "step": 10685 }, { "epoch": 0.9688236360340765, "grad_norm": 0.8265647292137146, "learning_rate": 4.867284009491074e-05, "loss": 0.5339, "step": 10690 }, { "epoch": 0.9692767808591626, "grad_norm": 0.7872313857078552, "learning_rate": 4.8670296302681826e-05, "loss": 0.4782, "step": 10695 }, { "epoch": 0.9697299256842487, "grad_norm": 0.9611385464668274, "learning_rate": 4.866775014152166e-05, "loss": 0.5349, "step": 10700 }, { "epoch": 0.9701830705093348, "grad_norm": 0.8951845169067383, "learning_rate": 4.8665201611685065e-05, "loss": 0.4891, "step": 10705 }, { "epoch": 0.9706362153344209, "grad_norm": 0.7915012836456299, "learning_rate": 4.866265071342708e-05, "loss": 0.4609, "step": 10710 }, { "epoch": 0.971089360159507, "grad_norm": 0.8891786336898804, "learning_rate": 4.8660097447003014e-05, "loss": 0.5045, "step": 10715 }, { "epoch": 0.971542504984593, "grad_norm": 0.8181620836257935, "learning_rate": 4.8657541812668406e-05, "loss": 0.4884, "step": 10720 }, { "epoch": 0.9719956498096791, "grad_norm": 0.9329288005828857, "learning_rate": 4.865498381067901e-05, "loss": 0.4743, "step": 10725 }, { "epoch": 0.9724487946347653, "grad_norm": 0.838973879814148, "learning_rate": 4.865242344129084e-05, "loss": 0.4633, "step": 10730 }, { "epoch": 0.9729019394598514, "grad_norm": 0.8490776419639587, "learning_rate": 4.864986070476013e-05, "loss": 0.4015, "step": 10735 }, { "epoch": 0.9733550842849374, "grad_norm": 0.8633219599723816, "learning_rate": 4.864729560134338e-05, "loss": 0.4584, "step": 10740 }, { "epoch": 0.9738082291100235, "grad_norm": 0.7898775935173035, "learning_rate": 4.864472813129728e-05, "loss": 0.5138, "step": 10745 }, { "epoch": 0.9742613739351097, "grad_norm": 0.8582158088684082, "learning_rate": 4.86421582948788e-05, "loss": 0.5021, "step": 10750 }, { "epoch": 0.9747145187601958, "grad_norm": 0.8034811615943909, "learning_rate": 4.863958609234513e-05, "loss": 0.4525, "step": 10755 }, { "epoch": 0.9751676635852818, "grad_norm": 0.8600065112113953, "learning_rate": 4.863701152395368e-05, "loss": 0.4197, "step": 10760 }, { "epoch": 0.9756208084103679, "grad_norm": 0.9820786118507385, "learning_rate": 4.863443458996213e-05, "loss": 0.5257, "step": 10765 }, { "epoch": 0.9760739532354541, "grad_norm": 0.8377594947814941, "learning_rate": 4.863185529062838e-05, "loss": 0.521, "step": 10770 }, { "epoch": 0.9765270980605402, "grad_norm": 0.9231745004653931, "learning_rate": 4.862927362621056e-05, "loss": 0.4988, "step": 10775 }, { "epoch": 0.9769802428856262, "grad_norm": 0.9231213927268982, "learning_rate": 4.862668959696706e-05, "loss": 0.5313, "step": 10780 }, { "epoch": 0.9774333877107123, "grad_norm": 0.7613572478294373, "learning_rate": 4.8624103203156464e-05, "loss": 0.4574, "step": 10785 }, { "epoch": 0.9778865325357985, "grad_norm": 0.9260156154632568, "learning_rate": 4.8621514445037636e-05, "loss": 0.4376, "step": 10790 }, { "epoch": 0.9783396773608846, "grad_norm": 0.7443624138832092, "learning_rate": 4.861892332286966e-05, "loss": 0.3928, "step": 10795 }, { "epoch": 0.9787928221859706, "grad_norm": 0.9198125600814819, "learning_rate": 4.861632983691185e-05, "loss": 0.4388, "step": 10800 }, { "epoch": 0.9792459670110567, "grad_norm": 0.8805493116378784, "learning_rate": 4.861373398742377e-05, "loss": 0.4626, "step": 10805 }, { "epoch": 0.9796991118361428, "grad_norm": 0.8860712051391602, "learning_rate": 4.8611135774665205e-05, "loss": 0.4591, "step": 10810 }, { "epoch": 0.980152256661229, "grad_norm": 0.9787001609802246, "learning_rate": 4.86085351988962e-05, "loss": 0.483, "step": 10815 }, { "epoch": 0.980605401486315, "grad_norm": 0.841128945350647, "learning_rate": 4.860593226037701e-05, "loss": 0.4516, "step": 10820 }, { "epoch": 0.9810585463114011, "grad_norm": 0.9302271604537964, "learning_rate": 4.860332695936813e-05, "loss": 0.4387, "step": 10825 }, { "epoch": 0.9815116911364872, "grad_norm": 0.8225229382514954, "learning_rate": 4.860071929613032e-05, "loss": 0.4383, "step": 10830 }, { "epoch": 0.9819648359615734, "grad_norm": 0.8514480590820312, "learning_rate": 4.859810927092453e-05, "loss": 0.4617, "step": 10835 }, { "epoch": 0.9824179807866594, "grad_norm": 1.0080634355545044, "learning_rate": 4.8595496884011995e-05, "loss": 0.5405, "step": 10840 }, { "epoch": 0.9828711256117455, "grad_norm": 0.7957746982574463, "learning_rate": 4.8592882135654154e-05, "loss": 0.4404, "step": 10845 }, { "epoch": 0.9833242704368316, "grad_norm": 0.9404016137123108, "learning_rate": 4.859026502611269e-05, "loss": 0.4985, "step": 10850 }, { "epoch": 0.9837774152619178, "grad_norm": 1.5236347913742065, "learning_rate": 4.858764555564954e-05, "loss": 0.4631, "step": 10855 }, { "epoch": 0.9842305600870038, "grad_norm": 0.8580194115638733, "learning_rate": 4.8585023724526836e-05, "loss": 0.4508, "step": 10860 }, { "epoch": 0.9846837049120899, "grad_norm": 0.9230474233627319, "learning_rate": 4.8582399533006996e-05, "loss": 0.4862, "step": 10865 }, { "epoch": 0.985136849737176, "grad_norm": 0.8768299221992493, "learning_rate": 4.857977298135263e-05, "loss": 0.4433, "step": 10870 }, { "epoch": 0.9855899945622622, "grad_norm": 0.8682053685188293, "learning_rate": 4.857714406982661e-05, "loss": 0.4886, "step": 10875 }, { "epoch": 0.9860431393873482, "grad_norm": 0.8093960881233215, "learning_rate": 4.857451279869205e-05, "loss": 0.4627, "step": 10880 }, { "epoch": 0.9864962842124343, "grad_norm": 0.8257168531417847, "learning_rate": 4.857187916821228e-05, "loss": 0.4649, "step": 10885 }, { "epoch": 0.9869494290375204, "grad_norm": 0.8424094319343567, "learning_rate": 4.856924317865087e-05, "loss": 0.474, "step": 10890 }, { "epoch": 0.9874025738626064, "grad_norm": 0.8923273086547852, "learning_rate": 4.856660483027164e-05, "loss": 0.5152, "step": 10895 }, { "epoch": 0.9878557186876926, "grad_norm": 0.8180555105209351, "learning_rate": 4.856396412333862e-05, "loss": 0.469, "step": 10900 }, { "epoch": 0.9883088635127787, "grad_norm": 0.9470937252044678, "learning_rate": 4.856132105811611e-05, "loss": 0.5015, "step": 10905 }, { "epoch": 0.9887620083378648, "grad_norm": 0.9024142026901245, "learning_rate": 4.855867563486863e-05, "loss": 0.4647, "step": 10910 }, { "epoch": 0.9892151531629508, "grad_norm": 0.980538547039032, "learning_rate": 4.855602785386092e-05, "loss": 0.4606, "step": 10915 }, { "epoch": 0.989668297988037, "grad_norm": 0.7945879101753235, "learning_rate": 4.855337771535798e-05, "loss": 0.5337, "step": 10920 }, { "epoch": 0.9901214428131231, "grad_norm": 0.7858673334121704, "learning_rate": 4.855072521962505e-05, "loss": 0.4458, "step": 10925 }, { "epoch": 0.9905745876382092, "grad_norm": 0.9537668228149414, "learning_rate": 4.8548070366927566e-05, "loss": 0.4725, "step": 10930 }, { "epoch": 0.9910277324632952, "grad_norm": 0.860791027545929, "learning_rate": 4.854541315753124e-05, "loss": 0.4373, "step": 10935 }, { "epoch": 0.9914808772883814, "grad_norm": 0.9023060202598572, "learning_rate": 4.8542753591702004e-05, "loss": 0.4322, "step": 10940 }, { "epoch": 0.9919340221134675, "grad_norm": 0.874427080154419, "learning_rate": 4.854009166970603e-05, "loss": 0.4214, "step": 10945 }, { "epoch": 0.9923871669385536, "grad_norm": 0.8927114009857178, "learning_rate": 4.8537427391809734e-05, "loss": 0.4848, "step": 10950 }, { "epoch": 0.9928403117636396, "grad_norm": 0.8406829237937927, "learning_rate": 4.853476075827974e-05, "loss": 0.4778, "step": 10955 }, { "epoch": 0.9932934565887258, "grad_norm": 0.8052718639373779, "learning_rate": 4.8532091769382934e-05, "loss": 0.4772, "step": 10960 }, { "epoch": 0.9937466014138119, "grad_norm": 0.9485329985618591, "learning_rate": 4.852942042538643e-05, "loss": 0.4383, "step": 10965 }, { "epoch": 0.994199746238898, "grad_norm": 0.8760508894920349, "learning_rate": 4.852674672655757e-05, "loss": 0.4035, "step": 10970 }, { "epoch": 0.994652891063984, "grad_norm": 0.858680784702301, "learning_rate": 4.8524070673163954e-05, "loss": 0.506, "step": 10975 }, { "epoch": 0.9951060358890701, "grad_norm": 0.8291572332382202, "learning_rate": 4.8521392265473384e-05, "loss": 0.4993, "step": 10980 }, { "epoch": 0.9955591807141563, "grad_norm": 0.8586686849594116, "learning_rate": 4.851871150375393e-05, "loss": 0.4179, "step": 10985 }, { "epoch": 0.9960123255392423, "grad_norm": 0.8748261332511902, "learning_rate": 4.851602838827387e-05, "loss": 0.4378, "step": 10990 }, { "epoch": 0.9964654703643284, "grad_norm": 0.7874118685722351, "learning_rate": 4.8513342919301754e-05, "loss": 0.4518, "step": 10995 }, { "epoch": 0.9969186151894145, "grad_norm": 0.868488609790802, "learning_rate": 4.851065509710632e-05, "loss": 0.4105, "step": 11000 }, { "epoch": 0.9973717600145007, "grad_norm": 0.857492208480835, "learning_rate": 4.8507964921956574e-05, "loss": 0.4643, "step": 11005 }, { "epoch": 0.9978249048395867, "grad_norm": 0.8727580904960632, "learning_rate": 4.850527239412176e-05, "loss": 0.4604, "step": 11010 }, { "epoch": 0.9982780496646728, "grad_norm": 0.8552215099334717, "learning_rate": 4.850257751387133e-05, "loss": 0.4059, "step": 11015 }, { "epoch": 0.9987311944897589, "grad_norm": 1.0097440481185913, "learning_rate": 4.8499880281475e-05, "loss": 0.4927, "step": 11020 }, { "epoch": 0.9991843393148451, "grad_norm": 0.827957808971405, "learning_rate": 4.849718069720272e-05, "loss": 0.4783, "step": 11025 }, { "epoch": 0.9996374841399311, "grad_norm": 0.7865667343139648, "learning_rate": 4.849447876132463e-05, "loss": 0.4758, "step": 11030 }, { "epoch": 1.0000906289650173, "grad_norm": 0.8410733342170715, "learning_rate": 4.849177447411118e-05, "loss": 0.4163, "step": 11035 }, { "epoch": 1.0005437737901033, "grad_norm": 0.8223056197166443, "learning_rate": 4.848906783583299e-05, "loss": 0.3629, "step": 11040 }, { "epoch": 1.0009969186151895, "grad_norm": 0.837211012840271, "learning_rate": 4.848635884676096e-05, "loss": 0.3572, "step": 11045 }, { "epoch": 1.0014500634402754, "grad_norm": 0.8567138910293579, "learning_rate": 4.848364750716619e-05, "loss": 0.3925, "step": 11050 }, { "epoch": 1.0019032082653616, "grad_norm": 0.7841707468032837, "learning_rate": 4.848093381732004e-05, "loss": 0.4188, "step": 11055 }, { "epoch": 1.0023563530904478, "grad_norm": 0.8800122737884521, "learning_rate": 4.8478217777494096e-05, "loss": 0.3892, "step": 11060 }, { "epoch": 1.0028094979155338, "grad_norm": 0.8288499712944031, "learning_rate": 4.8475499387960176e-05, "loss": 0.3663, "step": 11065 }, { "epoch": 1.00326264274062, "grad_norm": 0.9530485272407532, "learning_rate": 4.847277864899035e-05, "loss": 0.3988, "step": 11070 }, { "epoch": 1.003715787565706, "grad_norm": 0.9598199129104614, "learning_rate": 4.8470055560856895e-05, "loss": 0.3633, "step": 11075 }, { "epoch": 1.004168932390792, "grad_norm": 0.9651179313659668, "learning_rate": 4.846733012383235e-05, "loss": 0.3842, "step": 11080 }, { "epoch": 1.0046220772158783, "grad_norm": 0.8962307572364807, "learning_rate": 4.846460233818947e-05, "loss": 0.3839, "step": 11085 }, { "epoch": 1.0050752220409642, "grad_norm": 0.8873730301856995, "learning_rate": 4.8461872204201254e-05, "loss": 0.3411, "step": 11090 }, { "epoch": 1.0055283668660504, "grad_norm": 0.7967630624771118, "learning_rate": 4.8459139722140946e-05, "loss": 0.353, "step": 11095 }, { "epoch": 1.0059815116911366, "grad_norm": 0.900016725063324, "learning_rate": 4.845640489228199e-05, "loss": 0.3632, "step": 11100 }, { "epoch": 1.0064346565162225, "grad_norm": 0.9437655806541443, "learning_rate": 4.845366771489811e-05, "loss": 0.4347, "step": 11105 }, { "epoch": 1.0068878013413087, "grad_norm": 0.8177016973495483, "learning_rate": 4.845092819026324e-05, "loss": 0.419, "step": 11110 }, { "epoch": 1.0073409461663947, "grad_norm": 0.9831915497779846, "learning_rate": 4.844818631865155e-05, "loss": 0.4111, "step": 11115 }, { "epoch": 1.0077940909914809, "grad_norm": 0.8957890272140503, "learning_rate": 4.844544210033743e-05, "loss": 0.4082, "step": 11120 }, { "epoch": 1.008247235816567, "grad_norm": 0.9060913324356079, "learning_rate": 4.8442695535595564e-05, "loss": 0.3824, "step": 11125 }, { "epoch": 1.008700380641653, "grad_norm": 0.8914282917976379, "learning_rate": 4.843994662470079e-05, "loss": 0.4111, "step": 11130 }, { "epoch": 1.0091535254667392, "grad_norm": 0.9007201194763184, "learning_rate": 4.843719536792823e-05, "loss": 0.4065, "step": 11135 }, { "epoch": 1.0096066702918252, "grad_norm": 0.8823778629302979, "learning_rate": 4.843444176555324e-05, "loss": 0.4039, "step": 11140 }, { "epoch": 1.0100598151169113, "grad_norm": 0.865009605884552, "learning_rate": 4.84316858178514e-05, "loss": 0.3762, "step": 11145 }, { "epoch": 1.0105129599419975, "grad_norm": 1.0092908143997192, "learning_rate": 4.842892752509852e-05, "loss": 0.45, "step": 11150 }, { "epoch": 1.0109661047670835, "grad_norm": 0.8628846406936646, "learning_rate": 4.8426166887570644e-05, "loss": 0.4107, "step": 11155 }, { "epoch": 1.0114192495921697, "grad_norm": 0.8745213150978088, "learning_rate": 4.8423403905544075e-05, "loss": 0.4037, "step": 11160 }, { "epoch": 1.0118723944172558, "grad_norm": 0.859372079372406, "learning_rate": 4.842063857929533e-05, "loss": 0.337, "step": 11165 }, { "epoch": 1.0123255392423418, "grad_norm": 0.864588737487793, "learning_rate": 4.841787090910115e-05, "loss": 0.3541, "step": 11170 }, { "epoch": 1.012778684067428, "grad_norm": 0.9542399048805237, "learning_rate": 4.841510089523854e-05, "loss": 0.3835, "step": 11175 }, { "epoch": 1.013231828892514, "grad_norm": 0.9830279350280762, "learning_rate": 4.841232853798471e-05, "loss": 0.3591, "step": 11180 }, { "epoch": 1.0136849737176001, "grad_norm": 0.9310513734817505, "learning_rate": 4.840955383761713e-05, "loss": 0.3769, "step": 11185 }, { "epoch": 1.0141381185426863, "grad_norm": 0.8323154449462891, "learning_rate": 4.840677679441348e-05, "loss": 0.3827, "step": 11190 }, { "epoch": 1.0145912633677723, "grad_norm": 0.9117293357849121, "learning_rate": 4.84039974086517e-05, "loss": 0.4625, "step": 11195 }, { "epoch": 1.0150444081928585, "grad_norm": 0.8517593145370483, "learning_rate": 4.8401215680609954e-05, "loss": 0.4299, "step": 11200 }, { "epoch": 1.0154975530179446, "grad_norm": 0.858699381351471, "learning_rate": 4.839843161056662e-05, "loss": 0.3675, "step": 11205 }, { "epoch": 1.0159506978430306, "grad_norm": 0.8622615337371826, "learning_rate": 4.839564519880035e-05, "loss": 0.3586, "step": 11210 }, { "epoch": 1.0164038426681168, "grad_norm": 0.813484251499176, "learning_rate": 4.839285644559e-05, "loss": 0.4114, "step": 11215 }, { "epoch": 1.0168569874932027, "grad_norm": 0.8032749891281128, "learning_rate": 4.839006535121466e-05, "loss": 0.3269, "step": 11220 }, { "epoch": 1.017310132318289, "grad_norm": 0.8725521564483643, "learning_rate": 4.8387271915953684e-05, "loss": 0.4197, "step": 11225 }, { "epoch": 1.0177632771433751, "grad_norm": 0.9518267512321472, "learning_rate": 4.8384476140086624e-05, "loss": 0.3987, "step": 11230 }, { "epoch": 1.018216421968461, "grad_norm": 0.9219575524330139, "learning_rate": 4.8381678023893286e-05, "loss": 0.4086, "step": 11235 }, { "epoch": 1.0186695667935473, "grad_norm": 0.8866197466850281, "learning_rate": 4.8378877567653714e-05, "loss": 0.4368, "step": 11240 }, { "epoch": 1.0191227116186332, "grad_norm": 0.8417726159095764, "learning_rate": 4.837607477164816e-05, "loss": 0.3809, "step": 11245 }, { "epoch": 1.0195758564437194, "grad_norm": 0.9434330463409424, "learning_rate": 4.837326963615715e-05, "loss": 0.3324, "step": 11250 }, { "epoch": 1.0200290012688056, "grad_norm": 0.8656006455421448, "learning_rate": 4.83704621614614e-05, "loss": 0.3876, "step": 11255 }, { "epoch": 1.0204821460938915, "grad_norm": 0.8193944692611694, "learning_rate": 4.8367652347841916e-05, "loss": 0.3964, "step": 11260 }, { "epoch": 1.0209352909189777, "grad_norm": 0.9196531772613525, "learning_rate": 4.836484019557988e-05, "loss": 0.3934, "step": 11265 }, { "epoch": 1.021388435744064, "grad_norm": 0.9573341608047485, "learning_rate": 4.836202570495673e-05, "loss": 0.4208, "step": 11270 }, { "epoch": 1.0218415805691499, "grad_norm": 0.8293805122375488, "learning_rate": 4.8359208876254145e-05, "loss": 0.3629, "step": 11275 }, { "epoch": 1.022294725394236, "grad_norm": 0.8242719173431396, "learning_rate": 4.835638970975405e-05, "loss": 0.3437, "step": 11280 }, { "epoch": 1.022747870219322, "grad_norm": 0.9053919315338135, "learning_rate": 4.835356820573857e-05, "loss": 0.3669, "step": 11285 }, { "epoch": 1.0232010150444082, "grad_norm": 0.9786067008972168, "learning_rate": 4.83507443644901e-05, "loss": 0.3824, "step": 11290 }, { "epoch": 1.0236541598694944, "grad_norm": 0.8926152586936951, "learning_rate": 4.8347918186291244e-05, "loss": 0.3839, "step": 11295 }, { "epoch": 1.0241073046945803, "grad_norm": 0.8443888425827026, "learning_rate": 4.8345089671424825e-05, "loss": 0.3567, "step": 11300 }, { "epoch": 1.0245604495196665, "grad_norm": 0.8735445141792297, "learning_rate": 4.834225882017395e-05, "loss": 0.3976, "step": 11305 }, { "epoch": 1.0250135943447525, "grad_norm": 0.7852501273155212, "learning_rate": 4.833942563282192e-05, "loss": 0.3415, "step": 11310 }, { "epoch": 1.0254667391698387, "grad_norm": 0.8664860129356384, "learning_rate": 4.8336590109652284e-05, "loss": 0.3743, "step": 11315 }, { "epoch": 1.0259198839949248, "grad_norm": 0.8781003952026367, "learning_rate": 4.833375225094882e-05, "loss": 0.4247, "step": 11320 }, { "epoch": 1.0263730288200108, "grad_norm": 0.8646047115325928, "learning_rate": 4.833091205699555e-05, "loss": 0.4159, "step": 11325 }, { "epoch": 1.026826173645097, "grad_norm": 0.8683573007583618, "learning_rate": 4.8328069528076705e-05, "loss": 0.4062, "step": 11330 }, { "epoch": 1.0272793184701832, "grad_norm": 0.8266617655754089, "learning_rate": 4.8325224664476776e-05, "loss": 0.4056, "step": 11335 }, { "epoch": 1.0277324632952691, "grad_norm": 0.9354825019836426, "learning_rate": 4.832237746648047e-05, "loss": 0.3781, "step": 11340 }, { "epoch": 1.0281856081203553, "grad_norm": 0.853651225566864, "learning_rate": 4.8319527934372755e-05, "loss": 0.3934, "step": 11345 }, { "epoch": 1.0286387529454413, "grad_norm": 0.993410587310791, "learning_rate": 4.8316676068438804e-05, "loss": 0.4052, "step": 11350 }, { "epoch": 1.0290918977705275, "grad_norm": 0.8427106142044067, "learning_rate": 4.831382186896402e-05, "loss": 0.3583, "step": 11355 }, { "epoch": 1.0295450425956136, "grad_norm": 0.8709532618522644, "learning_rate": 4.831096533623407e-05, "loss": 0.3778, "step": 11360 }, { "epoch": 1.0299981874206996, "grad_norm": 1.0800107717514038, "learning_rate": 4.830810647053482e-05, "loss": 0.4032, "step": 11365 }, { "epoch": 1.0304513322457858, "grad_norm": 0.8718567490577698, "learning_rate": 4.83052452721524e-05, "loss": 0.3601, "step": 11370 }, { "epoch": 1.0309044770708717, "grad_norm": 0.9303900599479675, "learning_rate": 4.830238174137316e-05, "loss": 0.3686, "step": 11375 }, { "epoch": 1.031357621895958, "grad_norm": 0.9199053645133972, "learning_rate": 4.829951587848367e-05, "loss": 0.3804, "step": 11380 }, { "epoch": 1.031810766721044, "grad_norm": 0.9119997620582581, "learning_rate": 4.829664768377076e-05, "loss": 0.3724, "step": 11385 }, { "epoch": 1.03226391154613, "grad_norm": 0.8266474008560181, "learning_rate": 4.829377715752147e-05, "loss": 0.3369, "step": 11390 }, { "epoch": 1.0327170563712162, "grad_norm": 0.9619772434234619, "learning_rate": 4.829090430002309e-05, "loss": 0.4272, "step": 11395 }, { "epoch": 1.0331702011963024, "grad_norm": 0.8494915962219238, "learning_rate": 4.828802911156313e-05, "loss": 0.4064, "step": 11400 }, { "epoch": 1.0336233460213884, "grad_norm": 0.9000316858291626, "learning_rate": 4.8285151592429354e-05, "loss": 0.384, "step": 11405 }, { "epoch": 1.0340764908464746, "grad_norm": 0.8638260960578918, "learning_rate": 4.8282271742909726e-05, "loss": 0.4248, "step": 11410 }, { "epoch": 1.0345296356715605, "grad_norm": 0.944977343082428, "learning_rate": 4.8279389563292476e-05, "loss": 0.3978, "step": 11415 }, { "epoch": 1.0349827804966467, "grad_norm": 0.8762547373771667, "learning_rate": 4.827650505386604e-05, "loss": 0.3338, "step": 11420 }, { "epoch": 1.035435925321733, "grad_norm": 0.9312855005264282, "learning_rate": 4.827361821491912e-05, "loss": 0.3687, "step": 11425 }, { "epoch": 1.0358890701468189, "grad_norm": 0.860047459602356, "learning_rate": 4.827072904674062e-05, "loss": 0.4216, "step": 11430 }, { "epoch": 1.036342214971905, "grad_norm": 0.9106894135475159, "learning_rate": 4.826783754961969e-05, "loss": 0.3548, "step": 11435 }, { "epoch": 1.0367953597969912, "grad_norm": 0.8559234738349915, "learning_rate": 4.8264943723845704e-05, "loss": 0.3964, "step": 11440 }, { "epoch": 1.0372485046220772, "grad_norm": 0.8821597695350647, "learning_rate": 4.826204756970829e-05, "loss": 0.3634, "step": 11445 }, { "epoch": 1.0377016494471634, "grad_norm": 0.8507845401763916, "learning_rate": 4.82591490874973e-05, "loss": 0.4216, "step": 11450 }, { "epoch": 1.0381547942722493, "grad_norm": 0.9436402320861816, "learning_rate": 4.82562482775028e-05, "loss": 0.4205, "step": 11455 }, { "epoch": 1.0386079390973355, "grad_norm": 0.825844407081604, "learning_rate": 4.8253345140015106e-05, "loss": 0.3186, "step": 11460 }, { "epoch": 1.0390610839224217, "grad_norm": 0.8917832374572754, "learning_rate": 4.8250439675324774e-05, "loss": 0.3246, "step": 11465 }, { "epoch": 1.0395142287475077, "grad_norm": 0.9416117072105408, "learning_rate": 4.8247531883722586e-05, "loss": 0.3877, "step": 11470 }, { "epoch": 1.0399673735725938, "grad_norm": 0.8523006439208984, "learning_rate": 4.8244621765499534e-05, "loss": 0.3846, "step": 11475 }, { "epoch": 1.0404205183976798, "grad_norm": 0.7984223961830139, "learning_rate": 4.824170932094689e-05, "loss": 0.3834, "step": 11480 }, { "epoch": 1.040873663222766, "grad_norm": 0.8202054500579834, "learning_rate": 4.8238794550356113e-05, "loss": 0.3716, "step": 11485 }, { "epoch": 1.0413268080478522, "grad_norm": 1.4405378103256226, "learning_rate": 4.823587745401893e-05, "loss": 0.3833, "step": 11490 }, { "epoch": 1.0417799528729381, "grad_norm": 0.9704737663269043, "learning_rate": 4.823295803222727e-05, "loss": 0.3874, "step": 11495 }, { "epoch": 1.0422330976980243, "grad_norm": 0.8832210898399353, "learning_rate": 4.823003628527332e-05, "loss": 0.3899, "step": 11500 }, { "epoch": 1.0426862425231105, "grad_norm": 0.9238032102584839, "learning_rate": 4.8227112213449474e-05, "loss": 0.3893, "step": 11505 }, { "epoch": 1.0431393873481964, "grad_norm": 0.8613232970237732, "learning_rate": 4.82241858170484e-05, "loss": 0.3607, "step": 11510 }, { "epoch": 1.0435925321732826, "grad_norm": 0.8094808459281921, "learning_rate": 4.822125709636295e-05, "loss": 0.3608, "step": 11515 }, { "epoch": 1.0440456769983686, "grad_norm": 0.8205888271331787, "learning_rate": 4.8218326051686245e-05, "loss": 0.418, "step": 11520 }, { "epoch": 1.0444988218234548, "grad_norm": 0.7794011831283569, "learning_rate": 4.821539268331162e-05, "loss": 0.3767, "step": 11525 }, { "epoch": 1.044951966648541, "grad_norm": 1.0011320114135742, "learning_rate": 4.821245699153264e-05, "loss": 0.4301, "step": 11530 }, { "epoch": 1.045405111473627, "grad_norm": 0.9233134984970093, "learning_rate": 4.820951897664312e-05, "loss": 0.3859, "step": 11535 }, { "epoch": 1.045858256298713, "grad_norm": 0.8324272036552429, "learning_rate": 4.82065786389371e-05, "loss": 0.3599, "step": 11540 }, { "epoch": 1.0463114011237993, "grad_norm": 0.819955050945282, "learning_rate": 4.8203635978708836e-05, "loss": 0.385, "step": 11545 }, { "epoch": 1.0467645459488852, "grad_norm": 0.8608436584472656, "learning_rate": 4.820069099625285e-05, "loss": 0.418, "step": 11550 }, { "epoch": 1.0472176907739714, "grad_norm": 0.8648677468299866, "learning_rate": 4.819774369186386e-05, "loss": 0.4099, "step": 11555 }, { "epoch": 1.0476708355990574, "grad_norm": 0.8743899464607239, "learning_rate": 4.819479406583685e-05, "loss": 0.3425, "step": 11560 }, { "epoch": 1.0481239804241436, "grad_norm": 0.9441232085227966, "learning_rate": 4.819184211846699e-05, "loss": 0.3865, "step": 11565 }, { "epoch": 1.0485771252492297, "grad_norm": 0.7767312526702881, "learning_rate": 4.818888785004974e-05, "loss": 0.3945, "step": 11570 }, { "epoch": 1.0490302700743157, "grad_norm": 0.8846515417098999, "learning_rate": 4.8185931260880754e-05, "loss": 0.3785, "step": 11575 }, { "epoch": 1.049483414899402, "grad_norm": 1.0922164916992188, "learning_rate": 4.8182972351255936e-05, "loss": 0.3676, "step": 11580 }, { "epoch": 1.0499365597244879, "grad_norm": 0.8235259652137756, "learning_rate": 4.81800111214714e-05, "loss": 0.3582, "step": 11585 }, { "epoch": 1.050389704549574, "grad_norm": 0.8353856801986694, "learning_rate": 4.817704757182352e-05, "loss": 0.4236, "step": 11590 }, { "epoch": 1.0508428493746602, "grad_norm": 0.8744451403617859, "learning_rate": 4.8174081702608874e-05, "loss": 0.3802, "step": 11595 }, { "epoch": 1.0512959941997462, "grad_norm": 0.8958879709243774, "learning_rate": 4.817111351412431e-05, "loss": 0.3962, "step": 11600 }, { "epoch": 1.0517491390248324, "grad_norm": 0.9290004968643188, "learning_rate": 4.816814300666687e-05, "loss": 0.3709, "step": 11605 }, { "epoch": 1.0522022838499185, "grad_norm": 0.8870899081230164, "learning_rate": 4.816517018053383e-05, "loss": 0.4062, "step": 11610 }, { "epoch": 1.0526554286750045, "grad_norm": 0.9146875739097595, "learning_rate": 4.816219503602275e-05, "loss": 0.3537, "step": 11615 }, { "epoch": 1.0531085735000907, "grad_norm": 0.9371978044509888, "learning_rate": 4.815921757343135e-05, "loss": 0.3779, "step": 11620 }, { "epoch": 1.0535617183251766, "grad_norm": 1.0470609664916992, "learning_rate": 4.815623779305763e-05, "loss": 0.3709, "step": 11625 }, { "epoch": 1.0540148631502628, "grad_norm": 0.9105995893478394, "learning_rate": 4.81532556951998e-05, "loss": 0.4173, "step": 11630 }, { "epoch": 1.054468007975349, "grad_norm": 0.9247618317604065, "learning_rate": 4.815027128015631e-05, "loss": 0.3661, "step": 11635 }, { "epoch": 1.054921152800435, "grad_norm": 0.8429310321807861, "learning_rate": 4.8147284548225855e-05, "loss": 0.3768, "step": 11640 }, { "epoch": 1.0553742976255212, "grad_norm": 0.8351632952690125, "learning_rate": 4.814429549970732e-05, "loss": 0.3877, "step": 11645 }, { "epoch": 1.0558274424506071, "grad_norm": 0.9019758701324463, "learning_rate": 4.814130413489989e-05, "loss": 0.3844, "step": 11650 }, { "epoch": 1.0562805872756933, "grad_norm": 1.004144310951233, "learning_rate": 4.813831045410291e-05, "loss": 0.3879, "step": 11655 }, { "epoch": 1.0567337321007795, "grad_norm": 0.9554693698883057, "learning_rate": 4.8135314457615985e-05, "loss": 0.4074, "step": 11660 }, { "epoch": 1.0571868769258654, "grad_norm": 0.8696164488792419, "learning_rate": 4.813231614573899e-05, "loss": 0.3565, "step": 11665 }, { "epoch": 1.0576400217509516, "grad_norm": 0.9411433339118958, "learning_rate": 4.812931551877196e-05, "loss": 0.3742, "step": 11670 }, { "epoch": 1.0580931665760378, "grad_norm": 0.9257156252861023, "learning_rate": 4.8126312577015215e-05, "loss": 0.4146, "step": 11675 }, { "epoch": 1.0585463114011238, "grad_norm": 0.8431351184844971, "learning_rate": 4.8123307320769296e-05, "loss": 0.3573, "step": 11680 }, { "epoch": 1.05899945622621, "grad_norm": 0.8067407608032227, "learning_rate": 4.812029975033496e-05, "loss": 0.3687, "step": 11685 }, { "epoch": 1.059452601051296, "grad_norm": 0.9565578103065491, "learning_rate": 4.8117289866013205e-05, "loss": 0.3893, "step": 11690 }, { "epoch": 1.059905745876382, "grad_norm": 0.9116023182868958, "learning_rate": 4.8114277668105265e-05, "loss": 0.4417, "step": 11695 }, { "epoch": 1.0603588907014683, "grad_norm": 0.9045963883399963, "learning_rate": 4.811126315691261e-05, "loss": 0.387, "step": 11700 }, { "epoch": 1.0608120355265542, "grad_norm": 0.8564643263816833, "learning_rate": 4.810824633273693e-05, "loss": 0.3544, "step": 11705 }, { "epoch": 1.0612651803516404, "grad_norm": 0.8843787908554077, "learning_rate": 4.810522719588013e-05, "loss": 0.35, "step": 11710 }, { "epoch": 1.0617183251767264, "grad_norm": 0.922376275062561, "learning_rate": 4.810220574664439e-05, "loss": 0.4082, "step": 11715 }, { "epoch": 1.0621714700018126, "grad_norm": 0.9693898558616638, "learning_rate": 4.8099181985332086e-05, "loss": 0.4045, "step": 11720 }, { "epoch": 1.0626246148268987, "grad_norm": 0.8809909224510193, "learning_rate": 4.8096155912245843e-05, "loss": 0.409, "step": 11725 }, { "epoch": 1.0630777596519847, "grad_norm": 0.8975785374641418, "learning_rate": 4.809312752768851e-05, "loss": 0.3508, "step": 11730 }, { "epoch": 1.0635309044770709, "grad_norm": 0.8918411731719971, "learning_rate": 4.809009683196316e-05, "loss": 0.3877, "step": 11735 }, { "epoch": 1.063984049302157, "grad_norm": 0.8303594589233398, "learning_rate": 4.8087063825373123e-05, "loss": 0.3382, "step": 11740 }, { "epoch": 1.064437194127243, "grad_norm": 0.871291995048523, "learning_rate": 4.808402850822193e-05, "loss": 0.3715, "step": 11745 }, { "epoch": 1.0648903389523292, "grad_norm": 0.9061521887779236, "learning_rate": 4.808099088081336e-05, "loss": 0.3882, "step": 11750 }, { "epoch": 1.0653434837774152, "grad_norm": 0.9531295299530029, "learning_rate": 4.8077950943451425e-05, "loss": 0.3383, "step": 11755 }, { "epoch": 1.0657966286025014, "grad_norm": 0.8682747483253479, "learning_rate": 4.807490869644035e-05, "loss": 0.3794, "step": 11760 }, { "epoch": 1.0662497734275875, "grad_norm": 0.8166399002075195, "learning_rate": 4.807186414008461e-05, "loss": 0.4075, "step": 11765 }, { "epoch": 1.0667029182526735, "grad_norm": 0.9295274019241333, "learning_rate": 4.806881727468892e-05, "loss": 0.3631, "step": 11770 }, { "epoch": 1.0671560630777597, "grad_norm": 0.7843268513679504, "learning_rate": 4.806576810055818e-05, "loss": 0.3916, "step": 11775 }, { "epoch": 1.0676092079028456, "grad_norm": 0.85649174451828, "learning_rate": 4.806271661799759e-05, "loss": 0.3445, "step": 11780 }, { "epoch": 1.0680623527279318, "grad_norm": 0.9056670069694519, "learning_rate": 4.8059662827312515e-05, "loss": 0.3206, "step": 11785 }, { "epoch": 1.068515497553018, "grad_norm": 1.9693435430526733, "learning_rate": 4.8056606728808594e-05, "loss": 0.3833, "step": 11790 }, { "epoch": 1.068968642378104, "grad_norm": 1.4295130968093872, "learning_rate": 4.8053548322791674e-05, "loss": 0.3689, "step": 11795 }, { "epoch": 1.0694217872031901, "grad_norm": 0.8218747973442078, "learning_rate": 4.805048760956784e-05, "loss": 0.3651, "step": 11800 }, { "epoch": 1.0698749320282763, "grad_norm": 1.340010404586792, "learning_rate": 4.804742458944343e-05, "loss": 0.4066, "step": 11805 }, { "epoch": 1.0703280768533623, "grad_norm": 0.8193607330322266, "learning_rate": 4.804435926272496e-05, "loss": 0.3613, "step": 11810 }, { "epoch": 1.0707812216784485, "grad_norm": 0.8133839964866638, "learning_rate": 4.8041291629719224e-05, "loss": 0.3742, "step": 11815 }, { "epoch": 1.0712343665035344, "grad_norm": 0.8026869297027588, "learning_rate": 4.803822169073324e-05, "loss": 0.4002, "step": 11820 }, { "epoch": 1.0716875113286206, "grad_norm": 0.8777545690536499, "learning_rate": 4.803514944607423e-05, "loss": 0.4027, "step": 11825 }, { "epoch": 1.0721406561537068, "grad_norm": 0.8388559222221375, "learning_rate": 4.803207489604969e-05, "loss": 0.3371, "step": 11830 }, { "epoch": 1.0725938009787928, "grad_norm": 0.8332975506782532, "learning_rate": 4.80289980409673e-05, "loss": 0.3482, "step": 11835 }, { "epoch": 1.073046945803879, "grad_norm": 0.9200919270515442, "learning_rate": 4.8025918881135015e-05, "loss": 0.4161, "step": 11840 }, { "epoch": 1.0735000906289651, "grad_norm": 0.9279763102531433, "learning_rate": 4.802283741686097e-05, "loss": 0.3802, "step": 11845 }, { "epoch": 1.073953235454051, "grad_norm": 0.8895385265350342, "learning_rate": 4.801975364845358e-05, "loss": 0.3435, "step": 11850 }, { "epoch": 1.0744063802791373, "grad_norm": 0.9661354422569275, "learning_rate": 4.801666757622146e-05, "loss": 0.3455, "step": 11855 }, { "epoch": 1.0748595251042232, "grad_norm": 0.7938678860664368, "learning_rate": 4.801357920047347e-05, "loss": 0.343, "step": 11860 }, { "epoch": 1.0753126699293094, "grad_norm": 0.8133737444877625, "learning_rate": 4.801048852151869e-05, "loss": 0.3759, "step": 11865 }, { "epoch": 1.0757658147543956, "grad_norm": 0.8439207077026367, "learning_rate": 4.8007395539666446e-05, "loss": 0.4457, "step": 11870 }, { "epoch": 1.0762189595794815, "grad_norm": 0.885903537273407, "learning_rate": 4.800430025522628e-05, "loss": 0.3583, "step": 11875 }, { "epoch": 1.0766721044045677, "grad_norm": 1.0330981016159058, "learning_rate": 4.8001202668507956e-05, "loss": 0.3768, "step": 11880 }, { "epoch": 1.077125249229654, "grad_norm": 0.9431635141372681, "learning_rate": 4.7998102779821495e-05, "loss": 0.3768, "step": 11885 }, { "epoch": 1.0775783940547399, "grad_norm": 0.828884482383728, "learning_rate": 4.799500058947715e-05, "loss": 0.3684, "step": 11890 }, { "epoch": 1.078031538879826, "grad_norm": 1.0689743757247925, "learning_rate": 4.799189609778535e-05, "loss": 0.4081, "step": 11895 }, { "epoch": 1.078484683704912, "grad_norm": 0.8281779885292053, "learning_rate": 4.7988789305056836e-05, "loss": 0.366, "step": 11900 }, { "epoch": 1.0789378285299982, "grad_norm": 0.7839416861534119, "learning_rate": 4.79856802116025e-05, "loss": 0.3589, "step": 11905 }, { "epoch": 1.0793909733550844, "grad_norm": 0.8299369215965271, "learning_rate": 4.798256881773353e-05, "loss": 0.3699, "step": 11910 }, { "epoch": 1.0798441181801703, "grad_norm": 0.8094996809959412, "learning_rate": 4.79794551237613e-05, "loss": 0.378, "step": 11915 }, { "epoch": 1.0802972630052565, "grad_norm": 0.8401751518249512, "learning_rate": 4.7976339129997436e-05, "loss": 0.365, "step": 11920 }, { "epoch": 1.0807504078303425, "grad_norm": 0.8263360857963562, "learning_rate": 4.797322083675379e-05, "loss": 0.3584, "step": 11925 }, { "epoch": 1.0812035526554287, "grad_norm": 0.9266029000282288, "learning_rate": 4.797010024434242e-05, "loss": 0.399, "step": 11930 }, { "epoch": 1.0816566974805149, "grad_norm": 0.8544906377792358, "learning_rate": 4.796697735307566e-05, "loss": 0.4266, "step": 11935 }, { "epoch": 1.0821098423056008, "grad_norm": 0.8921790719032288, "learning_rate": 4.7963852163266046e-05, "loss": 0.377, "step": 11940 }, { "epoch": 1.082562987130687, "grad_norm": 0.9536526203155518, "learning_rate": 4.796072467522634e-05, "loss": 0.4664, "step": 11945 }, { "epoch": 1.0830161319557732, "grad_norm": 0.9257422685623169, "learning_rate": 4.795759488926955e-05, "loss": 0.4271, "step": 11950 }, { "epoch": 1.0834692767808591, "grad_norm": 0.8489125967025757, "learning_rate": 4.79544628057089e-05, "loss": 0.3859, "step": 11955 }, { "epoch": 1.0839224216059453, "grad_norm": 0.8841333985328674, "learning_rate": 4.7951328424857855e-05, "loss": 0.3749, "step": 11960 }, { "epoch": 1.0843755664310313, "grad_norm": 0.7131010890007019, "learning_rate": 4.79481917470301e-05, "loss": 0.3787, "step": 11965 }, { "epoch": 1.0848287112561175, "grad_norm": 1.0298123359680176, "learning_rate": 4.7945052772539555e-05, "loss": 0.4161, "step": 11970 }, { "epoch": 1.0852818560812036, "grad_norm": 0.8943709135055542, "learning_rate": 4.794191150170038e-05, "loss": 0.3948, "step": 11975 }, { "epoch": 1.0857350009062896, "grad_norm": 0.8781269788742065, "learning_rate": 4.793876793482694e-05, "loss": 0.4168, "step": 11980 }, { "epoch": 1.0861881457313758, "grad_norm": 0.810387909412384, "learning_rate": 4.7935622072233854e-05, "loss": 0.3943, "step": 11985 }, { "epoch": 1.0866412905564617, "grad_norm": 0.93533855676651, "learning_rate": 4.7932473914235954e-05, "loss": 0.3865, "step": 11990 }, { "epoch": 1.087094435381548, "grad_norm": 0.8254369497299194, "learning_rate": 4.7929323461148316e-05, "loss": 0.3907, "step": 11995 }, { "epoch": 1.0875475802066341, "grad_norm": 0.7867270708084106, "learning_rate": 4.7926170713286235e-05, "loss": 0.3568, "step": 12000 }, { "epoch": 1.08800072503172, "grad_norm": 0.8847482204437256, "learning_rate": 4.792301567096524e-05, "loss": 0.4061, "step": 12005 }, { "epoch": 1.0884538698568063, "grad_norm": 0.9020704030990601, "learning_rate": 4.791985833450108e-05, "loss": 0.4087, "step": 12010 }, { "epoch": 1.0889070146818924, "grad_norm": 0.8900327682495117, "learning_rate": 4.791669870420976e-05, "loss": 0.3717, "step": 12015 }, { "epoch": 1.0893601595069784, "grad_norm": 0.9060009717941284, "learning_rate": 4.791353678040748e-05, "loss": 0.4056, "step": 12020 }, { "epoch": 1.0898133043320646, "grad_norm": 0.9394340515136719, "learning_rate": 4.7910372563410685e-05, "loss": 0.3552, "step": 12025 }, { "epoch": 1.0902664491571505, "grad_norm": 0.9177825450897217, "learning_rate": 4.790720605353608e-05, "loss": 0.4032, "step": 12030 }, { "epoch": 1.0907195939822367, "grad_norm": 0.8897939324378967, "learning_rate": 4.7904037251100534e-05, "loss": 0.3287, "step": 12035 }, { "epoch": 1.091172738807323, "grad_norm": 0.8449847102165222, "learning_rate": 4.7900866156421196e-05, "loss": 0.3907, "step": 12040 }, { "epoch": 1.0916258836324089, "grad_norm": 0.8823729157447815, "learning_rate": 4.789769276981544e-05, "loss": 0.3893, "step": 12045 }, { "epoch": 1.092079028457495, "grad_norm": 0.9840012192726135, "learning_rate": 4.789451709160085e-05, "loss": 0.3628, "step": 12050 }, { "epoch": 1.092532173282581, "grad_norm": 0.9818935394287109, "learning_rate": 4.789133912209524e-05, "loss": 0.3694, "step": 12055 }, { "epoch": 1.0929853181076672, "grad_norm": 0.921527624130249, "learning_rate": 4.7888158861616685e-05, "loss": 0.4091, "step": 12060 }, { "epoch": 1.0934384629327534, "grad_norm": 0.8205445408821106, "learning_rate": 4.7884976310483445e-05, "loss": 0.3721, "step": 12065 }, { "epoch": 1.0938916077578393, "grad_norm": 0.8688151836395264, "learning_rate": 4.7881791469014044e-05, "loss": 0.364, "step": 12070 }, { "epoch": 1.0943447525829255, "grad_norm": 0.8729259967803955, "learning_rate": 4.787860433752722e-05, "loss": 0.4049, "step": 12075 }, { "epoch": 1.0947978974080117, "grad_norm": 0.8697648644447327, "learning_rate": 4.787541491634193e-05, "loss": 0.3312, "step": 12080 }, { "epoch": 1.0952510422330977, "grad_norm": 0.9078481793403625, "learning_rate": 4.7872223205777386e-05, "loss": 0.383, "step": 12085 }, { "epoch": 1.0957041870581838, "grad_norm": 0.9512729048728943, "learning_rate": 4.786902920615302e-05, "loss": 0.3665, "step": 12090 }, { "epoch": 1.0961573318832698, "grad_norm": 0.9763415455818176, "learning_rate": 4.7865832917788475e-05, "loss": 0.3875, "step": 12095 }, { "epoch": 1.096610476708356, "grad_norm": 0.8274351358413696, "learning_rate": 4.786263434100364e-05, "loss": 0.3668, "step": 12100 }, { "epoch": 1.0970636215334422, "grad_norm": 0.8993649482727051, "learning_rate": 4.785943347611863e-05, "loss": 0.353, "step": 12105 }, { "epoch": 1.0975167663585281, "grad_norm": 0.8935574293136597, "learning_rate": 4.785623032345379e-05, "loss": 0.3416, "step": 12110 }, { "epoch": 1.0979699111836143, "grad_norm": 0.8752820491790771, "learning_rate": 4.78530248833297e-05, "loss": 0.4217, "step": 12115 }, { "epoch": 1.0984230560087003, "grad_norm": 0.8197797536849976, "learning_rate": 4.7849817156067146e-05, "loss": 0.3589, "step": 12120 }, { "epoch": 1.0988762008337865, "grad_norm": 0.8450929522514343, "learning_rate": 4.784660714198717e-05, "loss": 0.3814, "step": 12125 }, { "epoch": 1.0993293456588726, "grad_norm": 1.1304266452789307, "learning_rate": 4.784339484141103e-05, "loss": 0.3782, "step": 12130 }, { "epoch": 1.0997824904839586, "grad_norm": 0.8613160848617554, "learning_rate": 4.784018025466021e-05, "loss": 0.4156, "step": 12135 }, { "epoch": 1.1002356353090448, "grad_norm": 0.8506388664245605, "learning_rate": 4.783696338205642e-05, "loss": 0.3877, "step": 12140 }, { "epoch": 1.100688780134131, "grad_norm": 0.9041993021965027, "learning_rate": 4.7833744223921626e-05, "loss": 0.3857, "step": 12145 }, { "epoch": 1.101141924959217, "grad_norm": 0.846179187297821, "learning_rate": 4.783052278057799e-05, "loss": 0.3138, "step": 12150 }, { "epoch": 1.101595069784303, "grad_norm": 1.0347062349319458, "learning_rate": 4.7827299052347916e-05, "loss": 0.4026, "step": 12155 }, { "epoch": 1.102048214609389, "grad_norm": 0.8746593594551086, "learning_rate": 4.7824073039554034e-05, "loss": 0.4006, "step": 12160 }, { "epoch": 1.1025013594344752, "grad_norm": 0.9393975734710693, "learning_rate": 4.7820844742519214e-05, "loss": 0.3725, "step": 12165 }, { "epoch": 1.1029545042595614, "grad_norm": 0.8854675889015198, "learning_rate": 4.7817614161566526e-05, "loss": 0.3804, "step": 12170 }, { "epoch": 1.1034076490846474, "grad_norm": 0.905401885509491, "learning_rate": 4.781438129701931e-05, "loss": 0.3738, "step": 12175 }, { "epoch": 1.1038607939097336, "grad_norm": 0.9174976944923401, "learning_rate": 4.781114614920109e-05, "loss": 0.4056, "step": 12180 }, { "epoch": 1.1043139387348195, "grad_norm": 0.8193023800849915, "learning_rate": 4.7807908718435655e-05, "loss": 0.3465, "step": 12185 }, { "epoch": 1.1047670835599057, "grad_norm": 0.8688977956771851, "learning_rate": 4.780466900504701e-05, "loss": 0.3595, "step": 12190 }, { "epoch": 1.105220228384992, "grad_norm": 0.8545559644699097, "learning_rate": 4.7801427009359375e-05, "loss": 0.4569, "step": 12195 }, { "epoch": 1.1056733732100779, "grad_norm": 0.85065758228302, "learning_rate": 4.7798182731697224e-05, "loss": 0.3737, "step": 12200 }, { "epoch": 1.106126518035164, "grad_norm": 0.9723955392837524, "learning_rate": 4.7794936172385236e-05, "loss": 0.4005, "step": 12205 }, { "epoch": 1.1065796628602502, "grad_norm": 0.8753993511199951, "learning_rate": 4.779168733174833e-05, "loss": 0.3922, "step": 12210 }, { "epoch": 1.1070328076853362, "grad_norm": 0.8171776533126831, "learning_rate": 4.7788436210111655e-05, "loss": 0.3547, "step": 12215 }, { "epoch": 1.1074859525104224, "grad_norm": 1.0204066038131714, "learning_rate": 4.778518280780057e-05, "loss": 0.408, "step": 12220 }, { "epoch": 1.1079390973355083, "grad_norm": 0.8369526863098145, "learning_rate": 4.77819271251407e-05, "loss": 0.3489, "step": 12225 }, { "epoch": 1.1083922421605945, "grad_norm": 0.8603342175483704, "learning_rate": 4.777866916245785e-05, "loss": 0.3448, "step": 12230 }, { "epoch": 1.1088453869856807, "grad_norm": 0.8271458148956299, "learning_rate": 4.77754089200781e-05, "loss": 0.3598, "step": 12235 }, { "epoch": 1.1092985318107667, "grad_norm": 0.8638849854469299, "learning_rate": 4.7772146398327715e-05, "loss": 0.3707, "step": 12240 }, { "epoch": 1.1097516766358528, "grad_norm": 0.8214361071586609, "learning_rate": 4.776888159753323e-05, "loss": 0.3534, "step": 12245 }, { "epoch": 1.110204821460939, "grad_norm": 0.8788526654243469, "learning_rate": 4.776561451802137e-05, "loss": 0.3848, "step": 12250 }, { "epoch": 1.110657966286025, "grad_norm": 0.9039310216903687, "learning_rate": 4.776234516011912e-05, "loss": 0.3229, "step": 12255 }, { "epoch": 1.1111111111111112, "grad_norm": 0.8558334112167358, "learning_rate": 4.775907352415367e-05, "loss": 0.3653, "step": 12260 }, { "epoch": 1.1115642559361971, "grad_norm": 0.9235883355140686, "learning_rate": 4.775579961045244e-05, "loss": 0.3896, "step": 12265 }, { "epoch": 1.1120174007612833, "grad_norm": 0.8583235740661621, "learning_rate": 4.7752523419343105e-05, "loss": 0.3926, "step": 12270 }, { "epoch": 1.1124705455863695, "grad_norm": 0.8435277938842773, "learning_rate": 4.7749244951153524e-05, "loss": 0.3917, "step": 12275 }, { "epoch": 1.1129236904114554, "grad_norm": 0.9307669401168823, "learning_rate": 4.7745964206211826e-05, "loss": 0.3683, "step": 12280 }, { "epoch": 1.1133768352365416, "grad_norm": 0.8063194155693054, "learning_rate": 4.774268118484634e-05, "loss": 0.3425, "step": 12285 }, { "epoch": 1.1138299800616278, "grad_norm": 0.8330150246620178, "learning_rate": 4.773939588738563e-05, "loss": 0.3245, "step": 12290 }, { "epoch": 1.1142831248867138, "grad_norm": 0.7677237391471863, "learning_rate": 4.773610831415849e-05, "loss": 0.3488, "step": 12295 }, { "epoch": 1.1147362697118, "grad_norm": 0.9374913573265076, "learning_rate": 4.773281846549395e-05, "loss": 0.4422, "step": 12300 }, { "epoch": 1.115189414536886, "grad_norm": 0.8380866646766663, "learning_rate": 4.7729526341721255e-05, "loss": 0.3206, "step": 12305 }, { "epoch": 1.115642559361972, "grad_norm": 1.1505810022354126, "learning_rate": 4.772623194316987e-05, "loss": 0.3352, "step": 12310 }, { "epoch": 1.1160957041870583, "grad_norm": 0.7891635298728943, "learning_rate": 4.772293527016952e-05, "loss": 0.3562, "step": 12315 }, { "epoch": 1.1165488490121442, "grad_norm": 0.8458313345909119, "learning_rate": 4.771963632305012e-05, "loss": 0.4038, "step": 12320 }, { "epoch": 1.1170019938372304, "grad_norm": 0.8393685817718506, "learning_rate": 4.771633510214184e-05, "loss": 0.3751, "step": 12325 }, { "epoch": 1.1174551386623164, "grad_norm": 0.8728351593017578, "learning_rate": 4.7713031607775054e-05, "loss": 0.3714, "step": 12330 }, { "epoch": 1.1179082834874026, "grad_norm": 0.8654003143310547, "learning_rate": 4.77097258402804e-05, "loss": 0.3553, "step": 12335 }, { "epoch": 1.1183614283124887, "grad_norm": 0.8946516513824463, "learning_rate": 4.77064177999887e-05, "loss": 0.3434, "step": 12340 }, { "epoch": 1.1188145731375747, "grad_norm": 0.8350505828857422, "learning_rate": 4.7703107487231024e-05, "loss": 0.3355, "step": 12345 }, { "epoch": 1.119267717962661, "grad_norm": 0.8645466566085815, "learning_rate": 4.7699794902338694e-05, "loss": 0.3667, "step": 12350 }, { "epoch": 1.119720862787747, "grad_norm": 0.8766921162605286, "learning_rate": 4.76964800456432e-05, "loss": 0.3752, "step": 12355 }, { "epoch": 1.120174007612833, "grad_norm": 0.8849995136260986, "learning_rate": 4.769316291747632e-05, "loss": 0.3353, "step": 12360 }, { "epoch": 1.1206271524379192, "grad_norm": 0.8263131976127625, "learning_rate": 4.7689843518170016e-05, "loss": 0.3524, "step": 12365 }, { "epoch": 1.1210802972630052, "grad_norm": 0.9259498715400696, "learning_rate": 4.768652184805651e-05, "loss": 0.3756, "step": 12370 }, { "epoch": 1.1215334420880914, "grad_norm": 0.9731711745262146, "learning_rate": 4.768319790746822e-05, "loss": 0.3704, "step": 12375 }, { "epoch": 1.1219865869131775, "grad_norm": 0.9117513298988342, "learning_rate": 4.767987169673782e-05, "loss": 0.3817, "step": 12380 }, { "epoch": 1.1224397317382635, "grad_norm": 0.8928380012512207, "learning_rate": 4.7676543216198195e-05, "loss": 0.4112, "step": 12385 }, { "epoch": 1.1228928765633497, "grad_norm": 0.8607878684997559, "learning_rate": 4.767321246618246e-05, "loss": 0.4034, "step": 12390 }, { "epoch": 1.1233460213884356, "grad_norm": 0.9016765356063843, "learning_rate": 4.7669879447023954e-05, "loss": 0.3848, "step": 12395 }, { "epoch": 1.1237991662135218, "grad_norm": 0.9272328615188599, "learning_rate": 4.766654415905625e-05, "loss": 0.406, "step": 12400 }, { "epoch": 1.124252311038608, "grad_norm": 0.8690624237060547, "learning_rate": 4.7663206602613145e-05, "loss": 0.3413, "step": 12405 }, { "epoch": 1.124705455863694, "grad_norm": 0.8095653653144836, "learning_rate": 4.7659866778028664e-05, "loss": 0.3537, "step": 12410 }, { "epoch": 1.1251586006887802, "grad_norm": 0.8870428204536438, "learning_rate": 4.765652468563705e-05, "loss": 0.3593, "step": 12415 }, { "epoch": 1.1256117455138663, "grad_norm": 1.0329029560089111, "learning_rate": 4.7653180325772787e-05, "loss": 0.3464, "step": 12420 }, { "epoch": 1.1260648903389523, "grad_norm": 0.8576248288154602, "learning_rate": 4.7649833698770586e-05, "loss": 0.3293, "step": 12425 }, { "epoch": 1.1265180351640385, "grad_norm": 0.8618151545524597, "learning_rate": 4.764648480496536e-05, "loss": 0.418, "step": 12430 }, { "epoch": 1.1269711799891244, "grad_norm": 0.8988131284713745, "learning_rate": 4.764313364469229e-05, "loss": 0.4383, "step": 12435 }, { "epoch": 1.1274243248142106, "grad_norm": 0.8617266416549683, "learning_rate": 4.763978021828674e-05, "loss": 0.3695, "step": 12440 }, { "epoch": 1.1278774696392968, "grad_norm": 0.9226356148719788, "learning_rate": 4.7636424526084346e-05, "loss": 0.3703, "step": 12445 }, { "epoch": 1.1283306144643828, "grad_norm": 0.800165057182312, "learning_rate": 4.7633066568420924e-05, "loss": 0.3493, "step": 12450 }, { "epoch": 1.128783759289469, "grad_norm": 0.8169365525245667, "learning_rate": 4.762970634563255e-05, "loss": 0.3557, "step": 12455 }, { "epoch": 1.129236904114555, "grad_norm": 0.8253119587898254, "learning_rate": 4.762634385805551e-05, "loss": 0.3821, "step": 12460 }, { "epoch": 1.129690048939641, "grad_norm": 0.9205767512321472, "learning_rate": 4.7622979106026337e-05, "loss": 0.4083, "step": 12465 }, { "epoch": 1.1301431937647273, "grad_norm": 0.9325867891311646, "learning_rate": 4.761961208988176e-05, "loss": 0.3851, "step": 12470 }, { "epoch": 1.1305963385898132, "grad_norm": 0.882795512676239, "learning_rate": 4.7616242809958764e-05, "loss": 0.365, "step": 12475 }, { "epoch": 1.1310494834148994, "grad_norm": 0.9397836327552795, "learning_rate": 4.761287126659454e-05, "loss": 0.4121, "step": 12480 }, { "epoch": 1.1315026282399856, "grad_norm": 0.8765484094619751, "learning_rate": 4.7609497460126516e-05, "loss": 0.3893, "step": 12485 }, { "epoch": 1.1319557730650716, "grad_norm": 0.8145151734352112, "learning_rate": 4.7606121390892345e-05, "loss": 0.3558, "step": 12490 }, { "epoch": 1.1324089178901577, "grad_norm": 1.0528340339660645, "learning_rate": 4.7602743059229905e-05, "loss": 0.405, "step": 12495 }, { "epoch": 1.1328620627152437, "grad_norm": 0.8213949203491211, "learning_rate": 4.759936246547729e-05, "loss": 0.335, "step": 12500 }, { "epoch": 1.1333152075403299, "grad_norm": 0.8573718070983887, "learning_rate": 4.759597960997284e-05, "loss": 0.3423, "step": 12505 }, { "epoch": 1.133768352365416, "grad_norm": 0.9128398299217224, "learning_rate": 4.759259449305512e-05, "loss": 0.3671, "step": 12510 }, { "epoch": 1.134221497190502, "grad_norm": 0.8570360541343689, "learning_rate": 4.758920711506291e-05, "loss": 0.3416, "step": 12515 }, { "epoch": 1.1346746420155882, "grad_norm": 0.896602988243103, "learning_rate": 4.7585817476335206e-05, "loss": 0.3843, "step": 12520 }, { "epoch": 1.1351277868406742, "grad_norm": 0.9044479727745056, "learning_rate": 4.7582425577211255e-05, "loss": 0.3503, "step": 12525 }, { "epoch": 1.1355809316657604, "grad_norm": 0.8191632032394409, "learning_rate": 4.757903141803051e-05, "loss": 0.388, "step": 12530 }, { "epoch": 1.1360340764908465, "grad_norm": 0.8204656839370728, "learning_rate": 4.757563499913268e-05, "loss": 0.3698, "step": 12535 }, { "epoch": 1.1364872213159325, "grad_norm": 0.9161546230316162, "learning_rate": 4.757223632085766e-05, "loss": 0.3624, "step": 12540 }, { "epoch": 1.1369403661410187, "grad_norm": 1.0139867067337036, "learning_rate": 4.7568835383545606e-05, "loss": 0.362, "step": 12545 }, { "epoch": 1.1373935109661049, "grad_norm": 0.8622676730155945, "learning_rate": 4.7565432187536874e-05, "loss": 0.3706, "step": 12550 }, { "epoch": 1.1378466557911908, "grad_norm": 0.8620196580886841, "learning_rate": 4.756202673317206e-05, "loss": 0.368, "step": 12555 }, { "epoch": 1.138299800616277, "grad_norm": 0.904996931552887, "learning_rate": 4.755861902079198e-05, "loss": 0.3688, "step": 12560 }, { "epoch": 1.138752945441363, "grad_norm": 0.8248980641365051, "learning_rate": 4.755520905073769e-05, "loss": 0.3524, "step": 12565 }, { "epoch": 1.1392060902664491, "grad_norm": 0.8425131440162659, "learning_rate": 4.7551796823350445e-05, "loss": 0.3571, "step": 12570 }, { "epoch": 1.1396592350915353, "grad_norm": 0.8184306025505066, "learning_rate": 4.754838233897176e-05, "loss": 0.3303, "step": 12575 }, { "epoch": 1.1401123799166213, "grad_norm": 0.9195936322212219, "learning_rate": 4.754496559794335e-05, "loss": 0.3445, "step": 12580 }, { "epoch": 1.1405655247417075, "grad_norm": 0.9354201555252075, "learning_rate": 4.754154660060715e-05, "loss": 0.3433, "step": 12585 }, { "epoch": 1.1410186695667934, "grad_norm": 0.9600435495376587, "learning_rate": 4.753812534730535e-05, "loss": 0.3674, "step": 12590 }, { "epoch": 1.1414718143918796, "grad_norm": 0.8305388689041138, "learning_rate": 4.753470183838035e-05, "loss": 0.3528, "step": 12595 }, { "epoch": 1.1419249592169658, "grad_norm": 0.8568893074989319, "learning_rate": 4.753127607417476e-05, "loss": 0.3737, "step": 12600 }, { "epoch": 1.1423781040420518, "grad_norm": 0.8718670606613159, "learning_rate": 4.752784805503145e-05, "loss": 0.3905, "step": 12605 }, { "epoch": 1.142831248867138, "grad_norm": 0.8785691261291504, "learning_rate": 4.75244177812935e-05, "loss": 0.3389, "step": 12610 }, { "epoch": 1.1432843936922241, "grad_norm": 0.8774757385253906, "learning_rate": 4.7520985253304193e-05, "loss": 0.3805, "step": 12615 }, { "epoch": 1.14373753851731, "grad_norm": 0.8838241100311279, "learning_rate": 4.7517550471407077e-05, "loss": 0.3591, "step": 12620 }, { "epoch": 1.1441906833423963, "grad_norm": 0.8273633718490601, "learning_rate": 4.75141134359459e-05, "loss": 0.3929, "step": 12625 }, { "epoch": 1.1446438281674824, "grad_norm": 0.9108178019523621, "learning_rate": 4.7510674147264634e-05, "loss": 0.3993, "step": 12630 }, { "epoch": 1.1450969729925684, "grad_norm": 0.8808887004852295, "learning_rate": 4.7507232605707486e-05, "loss": 0.3609, "step": 12635 }, { "epoch": 1.1455501178176546, "grad_norm": 0.8417958617210388, "learning_rate": 4.750378881161889e-05, "loss": 0.3602, "step": 12640 }, { "epoch": 1.1460032626427405, "grad_norm": 0.933230459690094, "learning_rate": 4.750034276534351e-05, "loss": 0.3318, "step": 12645 }, { "epoch": 1.1464564074678267, "grad_norm": 0.832571804523468, "learning_rate": 4.749689446722622e-05, "loss": 0.3592, "step": 12650 }, { "epoch": 1.1469095522929127, "grad_norm": 0.8155151009559631, "learning_rate": 4.7493443917612116e-05, "loss": 0.3924, "step": 12655 }, { "epoch": 1.1473626971179989, "grad_norm": 0.8070470690727234, "learning_rate": 4.7489991116846545e-05, "loss": 0.3666, "step": 12660 }, { "epoch": 1.147815841943085, "grad_norm": 0.8249188661575317, "learning_rate": 4.748653606527507e-05, "loss": 0.3446, "step": 12665 }, { "epoch": 1.148268986768171, "grad_norm": 1.0814698934555054, "learning_rate": 4.748307876324345e-05, "loss": 0.4658, "step": 12670 }, { "epoch": 1.1487221315932572, "grad_norm": 1.0020496845245361, "learning_rate": 4.747961921109772e-05, "loss": 0.3695, "step": 12675 }, { "epoch": 1.1491752764183434, "grad_norm": 0.9263580441474915, "learning_rate": 4.747615740918409e-05, "loss": 0.3755, "step": 12680 }, { "epoch": 1.1496284212434293, "grad_norm": 0.7415669560432434, "learning_rate": 4.747269335784903e-05, "loss": 0.3784, "step": 12685 }, { "epoch": 1.1500815660685155, "grad_norm": 0.9694579243659973, "learning_rate": 4.7469227057439225e-05, "loss": 0.3306, "step": 12690 }, { "epoch": 1.1505347108936017, "grad_norm": 0.8700313568115234, "learning_rate": 4.746575850830158e-05, "loss": 0.3645, "step": 12695 }, { "epoch": 1.1509878557186877, "grad_norm": 0.861260712146759, "learning_rate": 4.7462287710783224e-05, "loss": 0.3726, "step": 12700 }, { "epoch": 1.1514410005437739, "grad_norm": 0.9360377192497253, "learning_rate": 4.745881466523153e-05, "loss": 0.4231, "step": 12705 }, { "epoch": 1.1518941453688598, "grad_norm": 0.9154558181762695, "learning_rate": 4.745533937199407e-05, "loss": 0.3994, "step": 12710 }, { "epoch": 1.152347290193946, "grad_norm": 0.8606727719306946, "learning_rate": 4.7451861831418646e-05, "loss": 0.3403, "step": 12715 }, { "epoch": 1.1528004350190322, "grad_norm": 0.8460464477539062, "learning_rate": 4.74483820438533e-05, "loss": 0.3296, "step": 12720 }, { "epoch": 1.1532535798441181, "grad_norm": 1.029278039932251, "learning_rate": 4.74449000096463e-05, "loss": 0.4013, "step": 12725 }, { "epoch": 1.1537067246692043, "grad_norm": 0.7998257875442505, "learning_rate": 4.744141572914612e-05, "loss": 0.3621, "step": 12730 }, { "epoch": 1.1541598694942903, "grad_norm": 0.7970107793807983, "learning_rate": 4.743792920270146e-05, "loss": 0.3517, "step": 12735 }, { "epoch": 1.1546130143193765, "grad_norm": 0.930261492729187, "learning_rate": 4.743444043066126e-05, "loss": 0.3574, "step": 12740 }, { "epoch": 1.1550661591444626, "grad_norm": 0.9984077215194702, "learning_rate": 4.743094941337468e-05, "loss": 0.373, "step": 12745 }, { "epoch": 1.1555193039695486, "grad_norm": 0.8238091468811035, "learning_rate": 4.7427456151191104e-05, "loss": 0.3326, "step": 12750 }, { "epoch": 1.1559724487946348, "grad_norm": 0.7804591655731201, "learning_rate": 4.742396064446013e-05, "loss": 0.3594, "step": 12755 }, { "epoch": 1.156425593619721, "grad_norm": 0.9249730110168457, "learning_rate": 4.7420462893531594e-05, "loss": 0.3562, "step": 12760 }, { "epoch": 1.156878738444807, "grad_norm": 0.9592832326889038, "learning_rate": 4.741696289875556e-05, "loss": 0.3835, "step": 12765 }, { "epoch": 1.1573318832698931, "grad_norm": 0.8419387340545654, "learning_rate": 4.741346066048229e-05, "loss": 0.3733, "step": 12770 }, { "epoch": 1.157785028094979, "grad_norm": 0.8701534867286682, "learning_rate": 4.7409956179062306e-05, "loss": 0.3216, "step": 12775 }, { "epoch": 1.1582381729200653, "grad_norm": 0.9257303476333618, "learning_rate": 4.740644945484633e-05, "loss": 0.3979, "step": 12780 }, { "epoch": 1.1586913177451514, "grad_norm": 0.9289520382881165, "learning_rate": 4.740294048818532e-05, "loss": 0.3896, "step": 12785 }, { "epoch": 1.1591444625702374, "grad_norm": 0.9034426808357239, "learning_rate": 4.7399429279430455e-05, "loss": 0.3981, "step": 12790 }, { "epoch": 1.1595976073953236, "grad_norm": 0.7484148144721985, "learning_rate": 4.7395915828933126e-05, "loss": 0.3521, "step": 12795 }, { "epoch": 1.1600507522204095, "grad_norm": 0.7977918982505798, "learning_rate": 4.739240013704498e-05, "loss": 0.394, "step": 12800 }, { "epoch": 1.1605038970454957, "grad_norm": 0.8916045427322388, "learning_rate": 4.738888220411785e-05, "loss": 0.3712, "step": 12805 }, { "epoch": 1.160957041870582, "grad_norm": 0.8566712737083435, "learning_rate": 4.738536203050383e-05, "loss": 0.3697, "step": 12810 }, { "epoch": 1.1614101866956679, "grad_norm": 0.9661340713500977, "learning_rate": 4.73818396165552e-05, "loss": 0.3428, "step": 12815 }, { "epoch": 1.161863331520754, "grad_norm": 0.8759592175483704, "learning_rate": 4.73783149626245e-05, "loss": 0.3144, "step": 12820 }, { "epoch": 1.1623164763458402, "grad_norm": 1.089042067527771, "learning_rate": 4.737478806906448e-05, "loss": 0.3787, "step": 12825 }, { "epoch": 1.1627696211709262, "grad_norm": 0.8472829461097717, "learning_rate": 4.73712589362281e-05, "loss": 0.3739, "step": 12830 }, { "epoch": 1.1632227659960124, "grad_norm": 0.948634684085846, "learning_rate": 4.736772756446856e-05, "loss": 0.4255, "step": 12835 }, { "epoch": 1.1636759108210983, "grad_norm": 0.8113477826118469, "learning_rate": 4.736419395413929e-05, "loss": 0.3487, "step": 12840 }, { "epoch": 1.1641290556461845, "grad_norm": 0.889181911945343, "learning_rate": 4.7360658105593926e-05, "loss": 0.384, "step": 12845 }, { "epoch": 1.1645822004712707, "grad_norm": 0.9676528573036194, "learning_rate": 4.735712001918634e-05, "loss": 0.4177, "step": 12850 }, { "epoch": 1.1650353452963567, "grad_norm": 0.8750907778739929, "learning_rate": 4.735357969527062e-05, "loss": 0.3715, "step": 12855 }, { "epoch": 1.1654884901214428, "grad_norm": 0.8837761878967285, "learning_rate": 4.7350037134201095e-05, "loss": 0.3344, "step": 12860 }, { "epoch": 1.1659416349465288, "grad_norm": 0.9547573924064636, "learning_rate": 4.73464923363323e-05, "loss": 0.4096, "step": 12865 }, { "epoch": 1.166394779771615, "grad_norm": 0.7854652404785156, "learning_rate": 4.7342945302018996e-05, "loss": 0.3164, "step": 12870 }, { "epoch": 1.1668479245967012, "grad_norm": 0.8755490779876709, "learning_rate": 4.733939603161617e-05, "loss": 0.3383, "step": 12875 }, { "epoch": 1.1673010694217871, "grad_norm": 0.8194869756698608, "learning_rate": 4.7335844525479036e-05, "loss": 0.3338, "step": 12880 }, { "epoch": 1.1677542142468733, "grad_norm": 0.7724260091781616, "learning_rate": 4.733229078396303e-05, "loss": 0.305, "step": 12885 }, { "epoch": 1.1682073590719595, "grad_norm": 0.9192975759506226, "learning_rate": 4.7328734807423816e-05, "loss": 0.3747, "step": 12890 }, { "epoch": 1.1686605038970455, "grad_norm": 0.8448793292045593, "learning_rate": 4.732517659621727e-05, "loss": 0.3413, "step": 12895 }, { "epoch": 1.1691136487221316, "grad_norm": 0.8694965839385986, "learning_rate": 4.732161615069951e-05, "loss": 0.3358, "step": 12900 }, { "epoch": 1.1695667935472176, "grad_norm": 0.8982720375061035, "learning_rate": 4.731805347122685e-05, "loss": 0.357, "step": 12905 }, { "epoch": 1.1700199383723038, "grad_norm": 0.9621032476425171, "learning_rate": 4.731448855815587e-05, "loss": 0.4099, "step": 12910 }, { "epoch": 1.17047308319739, "grad_norm": 1.023514986038208, "learning_rate": 4.731092141184332e-05, "loss": 0.4048, "step": 12915 }, { "epoch": 1.170926228022476, "grad_norm": 0.8091338872909546, "learning_rate": 4.730735203264621e-05, "loss": 0.3368, "step": 12920 }, { "epoch": 1.171379372847562, "grad_norm": 0.9507421255111694, "learning_rate": 4.730378042092178e-05, "loss": 0.3758, "step": 12925 }, { "epoch": 1.171832517672648, "grad_norm": 0.8779581189155579, "learning_rate": 4.7300206577027454e-05, "loss": 0.3347, "step": 12930 }, { "epoch": 1.1722856624977342, "grad_norm": 0.8899171948432922, "learning_rate": 4.729663050132093e-05, "loss": 0.4372, "step": 12935 }, { "epoch": 1.1727388073228204, "grad_norm": 0.9396481513977051, "learning_rate": 4.729305219416007e-05, "loss": 0.3771, "step": 12940 }, { "epoch": 1.1731919521479064, "grad_norm": 0.8969680666923523, "learning_rate": 4.7289471655903015e-05, "loss": 0.3662, "step": 12945 }, { "epoch": 1.1736450969729926, "grad_norm": 0.8652588725090027, "learning_rate": 4.728588888690811e-05, "loss": 0.3842, "step": 12950 }, { "epoch": 1.1740982417980788, "grad_norm": 0.9562860131263733, "learning_rate": 4.728230388753391e-05, "loss": 0.4077, "step": 12955 }, { "epoch": 1.1745513866231647, "grad_norm": 0.7674459218978882, "learning_rate": 4.72787166581392e-05, "loss": 0.3743, "step": 12960 }, { "epoch": 1.175004531448251, "grad_norm": 0.8841195702552795, "learning_rate": 4.727512719908299e-05, "loss": 0.34, "step": 12965 }, { "epoch": 1.175457676273337, "grad_norm": 0.9892132878303528, "learning_rate": 4.727153551072453e-05, "loss": 0.3913, "step": 12970 }, { "epoch": 1.175910821098423, "grad_norm": 0.8305217027664185, "learning_rate": 4.726794159342326e-05, "loss": 0.3522, "step": 12975 }, { "epoch": 1.1763639659235092, "grad_norm": 0.8575117588043213, "learning_rate": 4.7264345447538874e-05, "loss": 0.3478, "step": 12980 }, { "epoch": 1.1768171107485952, "grad_norm": 0.8899431824684143, "learning_rate": 4.7260747073431266e-05, "loss": 0.3778, "step": 12985 }, { "epoch": 1.1772702555736814, "grad_norm": 0.908649742603302, "learning_rate": 4.725714647146057e-05, "loss": 0.354, "step": 12990 }, { "epoch": 1.1777234003987673, "grad_norm": 0.9989773631095886, "learning_rate": 4.7253543641987116e-05, "loss": 0.3454, "step": 12995 }, { "epoch": 1.1781765452238535, "grad_norm": 1.7134169340133667, "learning_rate": 4.724993858537151e-05, "loss": 0.3754, "step": 13000 }, { "epoch": 1.1786296900489397, "grad_norm": 0.7933313250541687, "learning_rate": 4.724633130197452e-05, "loss": 0.3807, "step": 13005 }, { "epoch": 1.1790828348740257, "grad_norm": 0.8458194136619568, "learning_rate": 4.724272179215718e-05, "loss": 0.3724, "step": 13010 }, { "epoch": 1.1795359796991118, "grad_norm": 0.9159793257713318, "learning_rate": 4.723911005628071e-05, "loss": 0.4024, "step": 13015 }, { "epoch": 1.179989124524198, "grad_norm": 0.9399170875549316, "learning_rate": 4.723549609470659e-05, "loss": 0.3614, "step": 13020 }, { "epoch": 1.180442269349284, "grad_norm": 0.8951082825660706, "learning_rate": 4.7231879907796516e-05, "loss": 0.3722, "step": 13025 }, { "epoch": 1.1808954141743702, "grad_norm": 0.8849614262580872, "learning_rate": 4.7228261495912376e-05, "loss": 0.3857, "step": 13030 }, { "epoch": 1.1813485589994563, "grad_norm": 0.9353001713752747, "learning_rate": 4.7224640859416316e-05, "loss": 0.3725, "step": 13035 }, { "epoch": 1.1818017038245423, "grad_norm": 0.8581328988075256, "learning_rate": 4.722101799867068e-05, "loss": 0.3171, "step": 13040 }, { "epoch": 1.1822548486496285, "grad_norm": 0.9152098894119263, "learning_rate": 4.7217392914038055e-05, "loss": 0.3595, "step": 13045 }, { "epoch": 1.1827079934747144, "grad_norm": 0.9139869213104248, "learning_rate": 4.721376560588124e-05, "loss": 0.4268, "step": 13050 }, { "epoch": 1.1831611382998006, "grad_norm": 0.8527369499206543, "learning_rate": 4.721013607456325e-05, "loss": 0.3306, "step": 13055 }, { "epoch": 1.1836142831248866, "grad_norm": 0.8524812459945679, "learning_rate": 4.720650432044733e-05, "loss": 0.3559, "step": 13060 }, { "epoch": 1.1840674279499728, "grad_norm": 0.9454774260520935, "learning_rate": 4.7202870343896955e-05, "loss": 0.4564, "step": 13065 }, { "epoch": 1.184520572775059, "grad_norm": 0.9070358872413635, "learning_rate": 4.719923414527581e-05, "loss": 0.3751, "step": 13070 }, { "epoch": 1.184973717600145, "grad_norm": 0.8548275232315063, "learning_rate": 4.7195595724947804e-05, "loss": 0.3611, "step": 13075 }, { "epoch": 1.185426862425231, "grad_norm": 0.8261054754257202, "learning_rate": 4.719195508327708e-05, "loss": 0.3504, "step": 13080 }, { "epoch": 1.1858800072503173, "grad_norm": 0.7893258333206177, "learning_rate": 4.718831222062798e-05, "loss": 0.3765, "step": 13085 }, { "epoch": 1.1863331520754032, "grad_norm": 0.9026743769645691, "learning_rate": 4.71846671373651e-05, "loss": 0.3272, "step": 13090 }, { "epoch": 1.1867862969004894, "grad_norm": 0.9332479238510132, "learning_rate": 4.7181019833853236e-05, "loss": 0.3826, "step": 13095 }, { "epoch": 1.1872394417255756, "grad_norm": 0.9124051928520203, "learning_rate": 4.71773703104574e-05, "loss": 0.3891, "step": 13100 }, { "epoch": 1.1876925865506616, "grad_norm": 0.8366103172302246, "learning_rate": 4.717371856754286e-05, "loss": 0.376, "step": 13105 }, { "epoch": 1.1881457313757477, "grad_norm": 0.8556443452835083, "learning_rate": 4.717006460547506e-05, "loss": 0.323, "step": 13110 }, { "epoch": 1.1885988762008337, "grad_norm": 0.8990413546562195, "learning_rate": 4.71664084246197e-05, "loss": 0.3515, "step": 13115 }, { "epoch": 1.18905202102592, "grad_norm": 0.8274251818656921, "learning_rate": 4.7162750025342696e-05, "loss": 0.3438, "step": 13120 }, { "epoch": 1.189505165851006, "grad_norm": 0.7876518964767456, "learning_rate": 4.715908940801017e-05, "loss": 0.3584, "step": 13125 }, { "epoch": 1.189958310676092, "grad_norm": 0.8457061052322388, "learning_rate": 4.7155426572988485e-05, "loss": 0.4013, "step": 13130 }, { "epoch": 1.1904114555011782, "grad_norm": 0.9037103056907654, "learning_rate": 4.715176152064422e-05, "loss": 0.363, "step": 13135 }, { "epoch": 1.1908646003262642, "grad_norm": 0.9322301149368286, "learning_rate": 4.714809425134418e-05, "loss": 0.3874, "step": 13140 }, { "epoch": 1.1913177451513504, "grad_norm": 1.020405888557434, "learning_rate": 4.7144424765455375e-05, "loss": 0.3547, "step": 13145 }, { "epoch": 1.1917708899764365, "grad_norm": 0.9936540126800537, "learning_rate": 4.714075306334505e-05, "loss": 0.397, "step": 13150 }, { "epoch": 1.1922240348015225, "grad_norm": 0.8645361661911011, "learning_rate": 4.7137079145380685e-05, "loss": 0.366, "step": 13155 }, { "epoch": 1.1926771796266087, "grad_norm": 0.8835740089416504, "learning_rate": 4.713340301192995e-05, "loss": 0.3281, "step": 13160 }, { "epoch": 1.1931303244516949, "grad_norm": 0.9154490232467651, "learning_rate": 4.712972466336076e-05, "loss": 0.3685, "step": 13165 }, { "epoch": 1.1935834692767808, "grad_norm": 0.9254873991012573, "learning_rate": 4.712604410004124e-05, "loss": 0.3587, "step": 13170 }, { "epoch": 1.194036614101867, "grad_norm": 0.9319527745246887, "learning_rate": 4.712236132233975e-05, "loss": 0.3536, "step": 13175 }, { "epoch": 1.194489758926953, "grad_norm": 0.7813329100608826, "learning_rate": 4.7118676330624865e-05, "loss": 0.347, "step": 13180 }, { "epoch": 1.1949429037520392, "grad_norm": 0.8400955200195312, "learning_rate": 4.7114989125265374e-05, "loss": 0.3848, "step": 13185 }, { "epoch": 1.1953960485771253, "grad_norm": 0.8181741237640381, "learning_rate": 4.7111299706630285e-05, "loss": 0.3715, "step": 13190 }, { "epoch": 1.1958491934022113, "grad_norm": 0.8785662055015564, "learning_rate": 4.710760807508886e-05, "loss": 0.3571, "step": 13195 }, { "epoch": 1.1963023382272975, "grad_norm": 0.8908258080482483, "learning_rate": 4.7103914231010536e-05, "loss": 0.3494, "step": 13200 }, { "epoch": 1.1967554830523834, "grad_norm": 0.782060980796814, "learning_rate": 4.710021817476501e-05, "loss": 0.3436, "step": 13205 }, { "epoch": 1.1972086278774696, "grad_norm": 0.7922201156616211, "learning_rate": 4.709651990672217e-05, "loss": 0.3293, "step": 13210 }, { "epoch": 1.1976617727025558, "grad_norm": 0.8423681259155273, "learning_rate": 4.7092819427252156e-05, "loss": 0.358, "step": 13215 }, { "epoch": 1.1981149175276418, "grad_norm": 0.8921973705291748, "learning_rate": 4.70891167367253e-05, "loss": 0.3452, "step": 13220 }, { "epoch": 1.198568062352728, "grad_norm": 1.004164457321167, "learning_rate": 4.7085411835512186e-05, "loss": 0.4, "step": 13225 }, { "epoch": 1.1990212071778141, "grad_norm": 0.9212868213653564, "learning_rate": 4.7081704723983574e-05, "loss": 0.3921, "step": 13230 }, { "epoch": 1.1994743520029, "grad_norm": 0.9857001304626465, "learning_rate": 4.707799540251049e-05, "loss": 0.3425, "step": 13235 }, { "epoch": 1.1999274968279863, "grad_norm": 0.8310409188270569, "learning_rate": 4.707428387146417e-05, "loss": 0.3311, "step": 13240 }, { "epoch": 1.2003806416530722, "grad_norm": 0.9143227338790894, "learning_rate": 4.707057013121605e-05, "loss": 0.3337, "step": 13245 }, { "epoch": 1.2008337864781584, "grad_norm": 0.9061553478240967, "learning_rate": 4.706685418213782e-05, "loss": 0.3664, "step": 13250 }, { "epoch": 1.2012869313032446, "grad_norm": 0.8561981320381165, "learning_rate": 4.706313602460136e-05, "loss": 0.3616, "step": 13255 }, { "epoch": 1.2017400761283306, "grad_norm": 0.9415317177772522, "learning_rate": 4.7059415658978785e-05, "loss": 0.3483, "step": 13260 }, { "epoch": 1.2021932209534167, "grad_norm": 0.8463327884674072, "learning_rate": 4.7055693085642436e-05, "loss": 0.3889, "step": 13265 }, { "epoch": 1.2026463657785027, "grad_norm": 0.8747805953025818, "learning_rate": 4.705196830496486e-05, "loss": 0.3801, "step": 13270 }, { "epoch": 1.2030995106035889, "grad_norm": 0.918551504611969, "learning_rate": 4.704824131731885e-05, "loss": 0.3767, "step": 13275 }, { "epoch": 1.203552655428675, "grad_norm": 0.9512204527854919, "learning_rate": 4.70445121230774e-05, "loss": 0.3596, "step": 13280 }, { "epoch": 1.204005800253761, "grad_norm": 0.8055136203765869, "learning_rate": 4.704078072261371e-05, "loss": 0.3151, "step": 13285 }, { "epoch": 1.2044589450788472, "grad_norm": 0.8840947151184082, "learning_rate": 4.703704711630125e-05, "loss": 0.3303, "step": 13290 }, { "epoch": 1.2049120899039334, "grad_norm": 0.8637556433677673, "learning_rate": 4.7033311304513656e-05, "loss": 0.362, "step": 13295 }, { "epoch": 1.2053652347290194, "grad_norm": 0.8652904033660889, "learning_rate": 4.702957328762482e-05, "loss": 0.3262, "step": 13300 }, { "epoch": 1.2058183795541055, "grad_norm": 0.8439725041389465, "learning_rate": 4.702583306600884e-05, "loss": 0.397, "step": 13305 }, { "epoch": 1.2062715243791915, "grad_norm": 0.9081153869628906, "learning_rate": 4.702209064004004e-05, "loss": 0.3786, "step": 13310 }, { "epoch": 1.2067246692042777, "grad_norm": 0.83958500623703, "learning_rate": 4.7018346010092976e-05, "loss": 0.3117, "step": 13315 }, { "epoch": 1.2071778140293639, "grad_norm": 0.7645622491836548, "learning_rate": 4.701459917654239e-05, "loss": 0.306, "step": 13320 }, { "epoch": 1.2076309588544498, "grad_norm": 0.9324590563774109, "learning_rate": 4.701085013976327e-05, "loss": 0.4167, "step": 13325 }, { "epoch": 1.208084103679536, "grad_norm": 0.8954794406890869, "learning_rate": 4.7007098900130834e-05, "loss": 0.3945, "step": 13330 }, { "epoch": 1.208537248504622, "grad_norm": 0.8988186717033386, "learning_rate": 4.7003345458020504e-05, "loss": 0.3703, "step": 13335 }, { "epoch": 1.2089903933297081, "grad_norm": 1.0693849325180054, "learning_rate": 4.6999589813807924e-05, "loss": 0.3276, "step": 13340 }, { "epoch": 1.2094435381547943, "grad_norm": 0.9266483783721924, "learning_rate": 4.699583196786895e-05, "loss": 0.3788, "step": 13345 }, { "epoch": 1.2098966829798803, "grad_norm": 0.900558590888977, "learning_rate": 4.699207192057968e-05, "loss": 0.4183, "step": 13350 }, { "epoch": 1.2103498278049665, "grad_norm": 0.9199671745300293, "learning_rate": 4.698830967231642e-05, "loss": 0.3846, "step": 13355 }, { "epoch": 1.2108029726300527, "grad_norm": 0.874121367931366, "learning_rate": 4.698454522345569e-05, "loss": 0.3433, "step": 13360 }, { "epoch": 1.2112561174551386, "grad_norm": 0.9190245866775513, "learning_rate": 4.698077857437425e-05, "loss": 0.3981, "step": 13365 }, { "epoch": 1.2117092622802248, "grad_norm": 0.83365398645401, "learning_rate": 4.697700972544905e-05, "loss": 0.3596, "step": 13370 }, { "epoch": 1.212162407105311, "grad_norm": 0.8803371787071228, "learning_rate": 4.69732386770573e-05, "loss": 0.364, "step": 13375 }, { "epoch": 1.212615551930397, "grad_norm": 0.9137833714485168, "learning_rate": 4.696946542957639e-05, "loss": 0.3892, "step": 13380 }, { "epoch": 1.2130686967554831, "grad_norm": 0.9301764965057373, "learning_rate": 4.696568998338394e-05, "loss": 0.3552, "step": 13385 }, { "epoch": 1.213521841580569, "grad_norm": 0.8175264596939087, "learning_rate": 4.696191233885783e-05, "loss": 0.3775, "step": 13390 }, { "epoch": 1.2139749864056553, "grad_norm": 0.8779416084289551, "learning_rate": 4.6958132496376094e-05, "loss": 0.3906, "step": 13395 }, { "epoch": 1.2144281312307412, "grad_norm": 0.9981866478919983, "learning_rate": 4.695435045631704e-05, "loss": 0.3669, "step": 13400 }, { "epoch": 1.2148812760558274, "grad_norm": 0.890963613986969, "learning_rate": 4.695056621905918e-05, "loss": 0.3588, "step": 13405 }, { "epoch": 1.2153344208809136, "grad_norm": 0.89928138256073, "learning_rate": 4.694677978498122e-05, "loss": 0.3585, "step": 13410 }, { "epoch": 1.2157875657059996, "grad_norm": 0.9013669490814209, "learning_rate": 4.6942991154462126e-05, "loss": 0.3423, "step": 13415 }, { "epoch": 1.2162407105310857, "grad_norm": 0.8476746082305908, "learning_rate": 4.693920032788106e-05, "loss": 0.3188, "step": 13420 }, { "epoch": 1.216693855356172, "grad_norm": 0.9592295289039612, "learning_rate": 4.6935407305617406e-05, "loss": 0.39, "step": 13425 }, { "epoch": 1.2171470001812579, "grad_norm": 0.8207860589027405, "learning_rate": 4.693161208805077e-05, "loss": 0.3965, "step": 13430 }, { "epoch": 1.217600145006344, "grad_norm": 0.9021629095077515, "learning_rate": 4.6927814675560986e-05, "loss": 0.3571, "step": 13435 }, { "epoch": 1.2180532898314302, "grad_norm": 1.0023094415664673, "learning_rate": 4.69240150685281e-05, "loss": 0.3868, "step": 13440 }, { "epoch": 1.2185064346565162, "grad_norm": 0.7954906225204468, "learning_rate": 4.692021326733237e-05, "loss": 0.3879, "step": 13445 }, { "epoch": 1.2189595794816024, "grad_norm": 0.8150657415390015, "learning_rate": 4.691640927235429e-05, "loss": 0.3538, "step": 13450 }, { "epoch": 1.2194127243066883, "grad_norm": 0.8615502119064331, "learning_rate": 4.691260308397456e-05, "loss": 0.3874, "step": 13455 }, { "epoch": 1.2198658691317745, "grad_norm": 0.893435537815094, "learning_rate": 4.69087947025741e-05, "loss": 0.3302, "step": 13460 }, { "epoch": 1.2203190139568607, "grad_norm": 0.8075352907180786, "learning_rate": 4.690498412853407e-05, "loss": 0.3317, "step": 13465 }, { "epoch": 1.2207721587819467, "grad_norm": 0.9292077422142029, "learning_rate": 4.690117136223581e-05, "loss": 0.3822, "step": 13470 }, { "epoch": 1.2212253036070329, "grad_norm": 0.8118329644203186, "learning_rate": 4.689735640406093e-05, "loss": 0.3678, "step": 13475 }, { "epoch": 1.2216784484321188, "grad_norm": 0.9188785552978516, "learning_rate": 4.689353925439121e-05, "loss": 0.3938, "step": 13480 }, { "epoch": 1.222131593257205, "grad_norm": 0.8794747591018677, "learning_rate": 4.6889719913608685e-05, "loss": 0.3824, "step": 13485 }, { "epoch": 1.2225847380822912, "grad_norm": 0.9200755953788757, "learning_rate": 4.6885898382095586e-05, "loss": 0.328, "step": 13490 }, { "epoch": 1.2230378829073771, "grad_norm": 1.5757144689559937, "learning_rate": 4.6882074660234385e-05, "loss": 0.363, "step": 13495 }, { "epoch": 1.2234910277324633, "grad_norm": 0.8735378980636597, "learning_rate": 4.687824874840775e-05, "loss": 0.3902, "step": 13500 }, { "epoch": 1.2239441725575495, "grad_norm": 0.8835929036140442, "learning_rate": 4.687442064699858e-05, "loss": 0.3565, "step": 13505 }, { "epoch": 1.2243973173826355, "grad_norm": 0.8760011196136475, "learning_rate": 4.687059035639001e-05, "loss": 0.3557, "step": 13510 }, { "epoch": 1.2248504622077216, "grad_norm": 0.9388099312782288, "learning_rate": 4.686675787696535e-05, "loss": 0.3596, "step": 13515 }, { "epoch": 1.2253036070328076, "grad_norm": 1.1097822189331055, "learning_rate": 4.686292320910818e-05, "loss": 0.3516, "step": 13520 }, { "epoch": 1.2257567518578938, "grad_norm": 0.9518733024597168, "learning_rate": 4.6859086353202265e-05, "loss": 0.34, "step": 13525 }, { "epoch": 1.22620989668298, "grad_norm": 0.8441634774208069, "learning_rate": 4.6855247309631586e-05, "loss": 0.3299, "step": 13530 }, { "epoch": 1.226663041508066, "grad_norm": 0.8427156805992126, "learning_rate": 4.685140607878038e-05, "loss": 0.3603, "step": 13535 }, { "epoch": 1.2271161863331521, "grad_norm": 0.9393357038497925, "learning_rate": 4.684756266103306e-05, "loss": 0.3666, "step": 13540 }, { "epoch": 1.227569331158238, "grad_norm": 0.8499166965484619, "learning_rate": 4.684371705677429e-05, "loss": 0.3106, "step": 13545 }, { "epoch": 1.2280224759833243, "grad_norm": 0.8859207034111023, "learning_rate": 4.683986926638893e-05, "loss": 0.3556, "step": 13550 }, { "epoch": 1.2284756208084104, "grad_norm": 0.9124035835266113, "learning_rate": 4.6836019290262066e-05, "loss": 0.4211, "step": 13555 }, { "epoch": 1.2289287656334964, "grad_norm": 0.9021211862564087, "learning_rate": 4.683216712877901e-05, "loss": 0.3567, "step": 13560 }, { "epoch": 1.2293819104585826, "grad_norm": 0.9572153687477112, "learning_rate": 4.6828312782325295e-05, "loss": 0.3757, "step": 13565 }, { "epoch": 1.2298350552836688, "grad_norm": 0.9392530918121338, "learning_rate": 4.6824456251286654e-05, "loss": 0.3318, "step": 13570 }, { "epoch": 1.2302882001087547, "grad_norm": 0.8396379947662354, "learning_rate": 4.682059753604905e-05, "loss": 0.3314, "step": 13575 }, { "epoch": 1.230741344933841, "grad_norm": 0.9473565220832825, "learning_rate": 4.6816736636998674e-05, "loss": 0.355, "step": 13580 }, { "epoch": 1.2311944897589269, "grad_norm": 0.928432822227478, "learning_rate": 4.6812873554521916e-05, "loss": 0.3872, "step": 13585 }, { "epoch": 1.231647634584013, "grad_norm": 1.003257155418396, "learning_rate": 4.68090082890054e-05, "loss": 0.4074, "step": 13590 }, { "epoch": 1.2321007794090992, "grad_norm": 0.9163318872451782, "learning_rate": 4.680514084083596e-05, "loss": 0.4049, "step": 13595 }, { "epoch": 1.2325539242341852, "grad_norm": 0.9541494846343994, "learning_rate": 4.6801271210400656e-05, "loss": 0.3695, "step": 13600 }, { "epoch": 1.2330070690592714, "grad_norm": 0.8519158363342285, "learning_rate": 4.679739939808676e-05, "loss": 0.3427, "step": 13605 }, { "epoch": 1.2334602138843573, "grad_norm": 0.8442367911338806, "learning_rate": 4.679352540428176e-05, "loss": 0.3933, "step": 13610 }, { "epoch": 1.2339133587094435, "grad_norm": 0.9326508641242981, "learning_rate": 4.6789649229373375e-05, "loss": 0.3888, "step": 13615 }, { "epoch": 1.2343665035345297, "grad_norm": 0.8382309675216675, "learning_rate": 4.6785770873749525e-05, "loss": 0.3142, "step": 13620 }, { "epoch": 1.2348196483596157, "grad_norm": 0.899158239364624, "learning_rate": 4.6781890337798365e-05, "loss": 0.3558, "step": 13625 }, { "epoch": 1.2352727931847018, "grad_norm": 0.853508710861206, "learning_rate": 4.677800762190826e-05, "loss": 0.3401, "step": 13630 }, { "epoch": 1.235725938009788, "grad_norm": 0.8736655116081238, "learning_rate": 4.677412272646778e-05, "loss": 0.3529, "step": 13635 }, { "epoch": 1.236179082834874, "grad_norm": 0.9835892915725708, "learning_rate": 4.677023565186574e-05, "loss": 0.3512, "step": 13640 }, { "epoch": 1.2366322276599602, "grad_norm": 0.9591622352600098, "learning_rate": 4.6766346398491155e-05, "loss": 0.314, "step": 13645 }, { "epoch": 1.2370853724850461, "grad_norm": 0.7807872295379639, "learning_rate": 4.676245496673327e-05, "loss": 0.303, "step": 13650 }, { "epoch": 1.2375385173101323, "grad_norm": 0.7984666228294373, "learning_rate": 4.675856135698153e-05, "loss": 0.3188, "step": 13655 }, { "epoch": 1.2379916621352185, "grad_norm": 0.917048454284668, "learning_rate": 4.6754665569625624e-05, "loss": 0.3716, "step": 13660 }, { "epoch": 1.2384448069603045, "grad_norm": 0.8537184596061707, "learning_rate": 4.6750767605055424e-05, "loss": 0.3982, "step": 13665 }, { "epoch": 1.2388979517853906, "grad_norm": 1.077243685722351, "learning_rate": 4.6746867463661046e-05, "loss": 0.3578, "step": 13670 }, { "epoch": 1.2393510966104766, "grad_norm": 0.8508317470550537, "learning_rate": 4.674296514583283e-05, "loss": 0.3328, "step": 13675 }, { "epoch": 1.2398042414355628, "grad_norm": 0.8149635195732117, "learning_rate": 4.6739060651961305e-05, "loss": 0.366, "step": 13680 }, { "epoch": 1.240257386260649, "grad_norm": 0.8814164400100708, "learning_rate": 4.673515398243724e-05, "loss": 0.3357, "step": 13685 }, { "epoch": 1.240710531085735, "grad_norm": 0.8786739706993103, "learning_rate": 4.673124513765162e-05, "loss": 0.3421, "step": 13690 }, { "epoch": 1.241163675910821, "grad_norm": 0.9083707332611084, "learning_rate": 4.672733411799565e-05, "loss": 0.374, "step": 13695 }, { "epoch": 1.2416168207359073, "grad_norm": 0.9063785076141357, "learning_rate": 4.672342092386073e-05, "loss": 0.3418, "step": 13700 }, { "epoch": 1.2420699655609932, "grad_norm": 0.8512806296348572, "learning_rate": 4.671950555563849e-05, "loss": 0.3064, "step": 13705 }, { "epoch": 1.2425231103860794, "grad_norm": 0.8109384179115295, "learning_rate": 4.67155880137208e-05, "loss": 0.3381, "step": 13710 }, { "epoch": 1.2429762552111654, "grad_norm": 0.8487366437911987, "learning_rate": 4.6711668298499724e-05, "loss": 0.3518, "step": 13715 }, { "epoch": 1.2434294000362516, "grad_norm": 0.8660186529159546, "learning_rate": 4.670774641036754e-05, "loss": 0.3956, "step": 13720 }, { "epoch": 1.2438825448613378, "grad_norm": 0.9857766032218933, "learning_rate": 4.6703822349716755e-05, "loss": 0.3299, "step": 13725 }, { "epoch": 1.2443356896864237, "grad_norm": 0.8789328932762146, "learning_rate": 4.66998961169401e-05, "loss": 0.345, "step": 13730 }, { "epoch": 1.24478883451151, "grad_norm": 0.9250821471214294, "learning_rate": 4.66959677124305e-05, "loss": 0.3543, "step": 13735 }, { "epoch": 1.2452419793365959, "grad_norm": 0.9009773135185242, "learning_rate": 4.6692037136581116e-05, "loss": 0.3415, "step": 13740 }, { "epoch": 1.245695124161682, "grad_norm": 0.8725888133049011, "learning_rate": 4.6688104389785327e-05, "loss": 0.3952, "step": 13745 }, { "epoch": 1.2461482689867682, "grad_norm": 0.9033427834510803, "learning_rate": 4.668416947243671e-05, "loss": 0.404, "step": 13750 }, { "epoch": 1.2466014138118542, "grad_norm": 0.8681004047393799, "learning_rate": 4.6680232384929097e-05, "loss": 0.3667, "step": 13755 }, { "epoch": 1.2470545586369404, "grad_norm": 0.7713662981987, "learning_rate": 4.667629312765649e-05, "loss": 0.3202, "step": 13760 }, { "epoch": 1.2475077034620266, "grad_norm": 0.9773503541946411, "learning_rate": 4.667235170101314e-05, "loss": 0.3697, "step": 13765 }, { "epoch": 1.2479608482871125, "grad_norm": 0.8444396257400513, "learning_rate": 4.66684081053935e-05, "loss": 0.3498, "step": 13770 }, { "epoch": 1.2484139931121987, "grad_norm": 0.7740346193313599, "learning_rate": 4.6664462341192253e-05, "loss": 0.3397, "step": 13775 }, { "epoch": 1.2488671379372849, "grad_norm": 0.9154688715934753, "learning_rate": 4.66605144088043e-05, "loss": 0.3335, "step": 13780 }, { "epoch": 1.2493202827623708, "grad_norm": 0.8173655271530151, "learning_rate": 4.665656430862473e-05, "loss": 0.3254, "step": 13785 }, { "epoch": 1.249773427587457, "grad_norm": 0.8054863214492798, "learning_rate": 4.665261204104889e-05, "loss": 0.354, "step": 13790 }, { "epoch": 1.250226572412543, "grad_norm": 0.8824300765991211, "learning_rate": 4.6648657606472326e-05, "loss": 0.329, "step": 13795 }, { "epoch": 1.2506797172376292, "grad_norm": 0.922743022441864, "learning_rate": 4.664470100529078e-05, "loss": 0.3895, "step": 13800 }, { "epoch": 1.2511328620627151, "grad_norm": 0.7964768409729004, "learning_rate": 4.664074223790024e-05, "loss": 0.3602, "step": 13805 }, { "epoch": 1.2515860068878013, "grad_norm": 0.8710575103759766, "learning_rate": 4.6636781304696896e-05, "loss": 0.3801, "step": 13810 }, { "epoch": 1.2520391517128875, "grad_norm": 0.9176396727561951, "learning_rate": 4.663281820607718e-05, "loss": 0.4102, "step": 13815 }, { "epoch": 1.2524922965379734, "grad_norm": 0.8913403749465942, "learning_rate": 4.6628852942437696e-05, "loss": 0.325, "step": 13820 }, { "epoch": 1.2529454413630596, "grad_norm": 0.8522996306419373, "learning_rate": 4.66248855141753e-05, "loss": 0.3206, "step": 13825 }, { "epoch": 1.2533985861881458, "grad_norm": 0.8705560564994812, "learning_rate": 4.662091592168705e-05, "loss": 0.3697, "step": 13830 }, { "epoch": 1.2538517310132318, "grad_norm": 0.9132054448127747, "learning_rate": 4.6616944165370225e-05, "loss": 0.3553, "step": 13835 }, { "epoch": 1.254304875838318, "grad_norm": 0.906783401966095, "learning_rate": 4.6612970245622316e-05, "loss": 0.3827, "step": 13840 }, { "epoch": 1.2547580206634041, "grad_norm": 0.8006032705307007, "learning_rate": 4.6608994162841045e-05, "loss": 0.3562, "step": 13845 }, { "epoch": 1.25521116548849, "grad_norm": 0.8942027688026428, "learning_rate": 4.660501591742433e-05, "loss": 0.3794, "step": 13850 }, { "epoch": 1.2556643103135763, "grad_norm": 0.9287647008895874, "learning_rate": 4.660103550977032e-05, "loss": 0.3574, "step": 13855 }, { "epoch": 1.2561174551386622, "grad_norm": 0.9597514867782593, "learning_rate": 4.6597052940277373e-05, "loss": 0.3641, "step": 13860 }, { "epoch": 1.2565705999637484, "grad_norm": 1.0452021360397339, "learning_rate": 4.659306820934407e-05, "loss": 0.4062, "step": 13865 }, { "epoch": 1.2570237447888344, "grad_norm": 0.8948825597763062, "learning_rate": 4.65890813173692e-05, "loss": 0.3925, "step": 13870 }, { "epoch": 1.2574768896139206, "grad_norm": 0.8573547005653381, "learning_rate": 4.6585092264751776e-05, "loss": 0.3375, "step": 13875 }, { "epoch": 1.2579300344390068, "grad_norm": 0.8968073725700378, "learning_rate": 4.658110105189101e-05, "loss": 0.3473, "step": 13880 }, { "epoch": 1.2583831792640927, "grad_norm": 0.8481261134147644, "learning_rate": 4.6577107679186366e-05, "loss": 0.3851, "step": 13885 }, { "epoch": 1.258836324089179, "grad_norm": 0.9151260256767273, "learning_rate": 4.657311214703749e-05, "loss": 0.3426, "step": 13890 }, { "epoch": 1.259289468914265, "grad_norm": 0.9986085891723633, "learning_rate": 4.656911445584425e-05, "loss": 0.371, "step": 13895 }, { "epoch": 1.259742613739351, "grad_norm": 0.8411430716514587, "learning_rate": 4.656511460600675e-05, "loss": 0.3849, "step": 13900 }, { "epoch": 1.2601957585644372, "grad_norm": 0.7476597428321838, "learning_rate": 4.656111259792529e-05, "loss": 0.3398, "step": 13905 }, { "epoch": 1.2606489033895234, "grad_norm": 1.0574496984481812, "learning_rate": 4.655710843200039e-05, "loss": 0.3783, "step": 13910 }, { "epoch": 1.2611020482146094, "grad_norm": 0.7744396328926086, "learning_rate": 4.6553102108632794e-05, "loss": 0.3993, "step": 13915 }, { "epoch": 1.2615551930396955, "grad_norm": 0.948942244052887, "learning_rate": 4.654909362822346e-05, "loss": 0.3655, "step": 13920 }, { "epoch": 1.2620083378647815, "grad_norm": 0.918473482131958, "learning_rate": 4.654508299117354e-05, "loss": 0.3693, "step": 13925 }, { "epoch": 1.2624614826898677, "grad_norm": 1.000537633895874, "learning_rate": 4.654107019788443e-05, "loss": 0.3838, "step": 13930 }, { "epoch": 1.2629146275149536, "grad_norm": 0.8265812993049622, "learning_rate": 4.6537055248757736e-05, "loss": 0.3208, "step": 13935 }, { "epoch": 1.2633677723400398, "grad_norm": 0.90170818567276, "learning_rate": 4.653303814419527e-05, "loss": 0.3472, "step": 13940 }, { "epoch": 1.263820917165126, "grad_norm": 0.9489778280258179, "learning_rate": 4.652901888459907e-05, "loss": 0.3293, "step": 13945 }, { "epoch": 1.264274061990212, "grad_norm": 0.904049813747406, "learning_rate": 4.6524997470371386e-05, "loss": 0.3308, "step": 13950 }, { "epoch": 1.2647272068152982, "grad_norm": 0.9129140973091125, "learning_rate": 4.6520973901914675e-05, "loss": 0.4269, "step": 13955 }, { "epoch": 1.2651803516403843, "grad_norm": 0.9050818681716919, "learning_rate": 4.651694817963162e-05, "loss": 0.3752, "step": 13960 }, { "epoch": 1.2656334964654703, "grad_norm": 0.8247373700141907, "learning_rate": 4.651292030392512e-05, "loss": 0.326, "step": 13965 }, { "epoch": 1.2660866412905565, "grad_norm": 0.8677919507026672, "learning_rate": 4.650889027519828e-05, "loss": 0.3704, "step": 13970 }, { "epoch": 1.2665397861156427, "grad_norm": 0.7688205242156982, "learning_rate": 4.650485809385443e-05, "loss": 0.3234, "step": 13975 }, { "epoch": 1.2669929309407286, "grad_norm": 0.9047220349311829, "learning_rate": 4.6500823760297116e-05, "loss": 0.3695, "step": 13980 }, { "epoch": 1.2674460757658148, "grad_norm": 0.910365104675293, "learning_rate": 4.6496787274930086e-05, "loss": 0.4197, "step": 13985 }, { "epoch": 1.267899220590901, "grad_norm": 0.8794249892234802, "learning_rate": 4.649274863815733e-05, "loss": 0.3377, "step": 13990 }, { "epoch": 1.268352365415987, "grad_norm": 0.9869941473007202, "learning_rate": 4.6488707850383e-05, "loss": 0.3998, "step": 13995 }, { "epoch": 1.268805510241073, "grad_norm": 0.9556425213813782, "learning_rate": 4.648466491201154e-05, "loss": 0.4218, "step": 14000 }, { "epoch": 1.269258655066159, "grad_norm": 0.814842700958252, "learning_rate": 4.648061982344756e-05, "loss": 0.3964, "step": 14005 }, { "epoch": 1.2697117998912453, "grad_norm": 0.8377740979194641, "learning_rate": 4.647657258509588e-05, "loss": 0.343, "step": 14010 }, { "epoch": 1.2701649447163312, "grad_norm": 0.8789991140365601, "learning_rate": 4.647252319736156e-05, "loss": 0.3711, "step": 14015 }, { "epoch": 1.2706180895414174, "grad_norm": 1.1742985248565674, "learning_rate": 4.646847166064985e-05, "loss": 0.3592, "step": 14020 }, { "epoch": 1.2710712343665036, "grad_norm": 0.9342051148414612, "learning_rate": 4.646441797536625e-05, "loss": 0.3708, "step": 14025 }, { "epoch": 1.2715243791915896, "grad_norm": 0.8685064315795898, "learning_rate": 4.646036214191643e-05, "loss": 0.3657, "step": 14030 }, { "epoch": 1.2719775240166757, "grad_norm": 0.8385773301124573, "learning_rate": 4.645630416070632e-05, "loss": 0.3319, "step": 14035 }, { "epoch": 1.272430668841762, "grad_norm": 0.9072621464729309, "learning_rate": 4.6452244032142037e-05, "loss": 0.3357, "step": 14040 }, { "epoch": 1.2728838136668479, "grad_norm": 0.922272801399231, "learning_rate": 4.6448181756629915e-05, "loss": 0.3354, "step": 14045 }, { "epoch": 1.273336958491934, "grad_norm": 0.8923614025115967, "learning_rate": 4.644411733457651e-05, "loss": 0.3387, "step": 14050 }, { "epoch": 1.2737901033170203, "grad_norm": 0.8554062247276306, "learning_rate": 4.64400507663886e-05, "loss": 0.3786, "step": 14055 }, { "epoch": 1.2742432481421062, "grad_norm": 0.8848999738693237, "learning_rate": 4.643598205247316e-05, "loss": 0.3962, "step": 14060 }, { "epoch": 1.2746963929671924, "grad_norm": 0.9032531976699829, "learning_rate": 4.643191119323739e-05, "loss": 0.3801, "step": 14065 }, { "epoch": 1.2751495377922784, "grad_norm": 0.8463643789291382, "learning_rate": 4.64278381890887e-05, "loss": 0.3477, "step": 14070 }, { "epoch": 1.2756026826173645, "grad_norm": 1.0339469909667969, "learning_rate": 4.6423763040434723e-05, "loss": 0.3597, "step": 14075 }, { "epoch": 1.2760558274424505, "grad_norm": 0.9444010853767395, "learning_rate": 4.641968574768329e-05, "loss": 0.3452, "step": 14080 }, { "epoch": 1.2765089722675367, "grad_norm": 0.8690407276153564, "learning_rate": 4.641560631124248e-05, "loss": 0.346, "step": 14085 }, { "epoch": 1.2769621170926229, "grad_norm": 0.94745272397995, "learning_rate": 4.6411524731520544e-05, "loss": 0.3622, "step": 14090 }, { "epoch": 1.2774152619177088, "grad_norm": 0.8845673203468323, "learning_rate": 4.640744100892598e-05, "loss": 0.3221, "step": 14095 }, { "epoch": 1.277868406742795, "grad_norm": 0.7815845608711243, "learning_rate": 4.6403355143867476e-05, "loss": 0.3498, "step": 14100 }, { "epoch": 1.2783215515678812, "grad_norm": 0.8527534008026123, "learning_rate": 4.6399267136753955e-05, "loss": 0.3449, "step": 14105 }, { "epoch": 1.2787746963929671, "grad_norm": 0.9282807111740112, "learning_rate": 4.639517698799455e-05, "loss": 0.3541, "step": 14110 }, { "epoch": 1.2792278412180533, "grad_norm": 0.8574861288070679, "learning_rate": 4.639108469799859e-05, "loss": 0.3671, "step": 14115 }, { "epoch": 1.2796809860431395, "grad_norm": 0.8983964920043945, "learning_rate": 4.6386990267175646e-05, "loss": 0.3548, "step": 14120 }, { "epoch": 1.2801341308682255, "grad_norm": 0.812110185623169, "learning_rate": 4.638289369593549e-05, "loss": 0.3365, "step": 14125 }, { "epoch": 1.2805872756933117, "grad_norm": 0.9935036301612854, "learning_rate": 4.63787949846881e-05, "loss": 0.3645, "step": 14130 }, { "epoch": 1.2810404205183976, "grad_norm": 0.8002793192863464, "learning_rate": 4.6374694133843686e-05, "loss": 0.3573, "step": 14135 }, { "epoch": 1.2814935653434838, "grad_norm": 0.9407739043235779, "learning_rate": 4.637059114381266e-05, "loss": 0.3787, "step": 14140 }, { "epoch": 1.2819467101685698, "grad_norm": 0.9142048358917236, "learning_rate": 4.636648601500564e-05, "loss": 0.3648, "step": 14145 }, { "epoch": 1.282399854993656, "grad_norm": 0.8574484586715698, "learning_rate": 4.636237874783348e-05, "loss": 0.3444, "step": 14150 }, { "epoch": 1.2828529998187421, "grad_norm": 0.854861319065094, "learning_rate": 4.635826934270723e-05, "loss": 0.3197, "step": 14155 }, { "epoch": 1.283306144643828, "grad_norm": 0.9940296411514282, "learning_rate": 4.635415780003817e-05, "loss": 0.3942, "step": 14160 }, { "epoch": 1.2837592894689143, "grad_norm": 0.8684219121932983, "learning_rate": 4.6350044120237766e-05, "loss": 0.3465, "step": 14165 }, { "epoch": 1.2842124342940004, "grad_norm": 0.8985313773155212, "learning_rate": 4.634592830371774e-05, "loss": 0.3526, "step": 14170 }, { "epoch": 1.2846655791190864, "grad_norm": 0.9418298006057739, "learning_rate": 4.634181035088999e-05, "loss": 0.3624, "step": 14175 }, { "epoch": 1.2851187239441726, "grad_norm": 0.9922621846199036, "learning_rate": 4.633769026216665e-05, "loss": 0.367, "step": 14180 }, { "epoch": 1.2855718687692588, "grad_norm": 0.946080207824707, "learning_rate": 4.633356803796005e-05, "loss": 0.3812, "step": 14185 }, { "epoch": 1.2860250135943447, "grad_norm": 0.8463732004165649, "learning_rate": 4.632944367868275e-05, "loss": 0.3547, "step": 14190 }, { "epoch": 1.286478158419431, "grad_norm": 0.8833571672439575, "learning_rate": 4.6325317184747517e-05, "loss": 0.3184, "step": 14195 }, { "epoch": 1.2869313032445169, "grad_norm": 0.896632730960846, "learning_rate": 4.632118855656733e-05, "loss": 0.3675, "step": 14200 }, { "epoch": 1.287384448069603, "grad_norm": 0.8771361708641052, "learning_rate": 4.6317057794555394e-05, "loss": 0.3107, "step": 14205 }, { "epoch": 1.287837592894689, "grad_norm": 0.8659237027168274, "learning_rate": 4.63129248991251e-05, "loss": 0.3539, "step": 14210 }, { "epoch": 1.2882907377197752, "grad_norm": 0.8115595579147339, "learning_rate": 4.630878987069007e-05, "loss": 0.31, "step": 14215 }, { "epoch": 1.2887438825448614, "grad_norm": 0.9406973123550415, "learning_rate": 4.630465270966415e-05, "loss": 0.3887, "step": 14220 }, { "epoch": 1.2891970273699473, "grad_norm": 0.9114307165145874, "learning_rate": 4.630051341646139e-05, "loss": 0.3524, "step": 14225 }, { "epoch": 1.2896501721950335, "grad_norm": 0.9687734246253967, "learning_rate": 4.629637199149604e-05, "loss": 0.3586, "step": 14230 }, { "epoch": 1.2901033170201197, "grad_norm": 0.8464292287826538, "learning_rate": 4.629222843518259e-05, "loss": 0.3302, "step": 14235 }, { "epoch": 1.2905564618452057, "grad_norm": 0.8154206871986389, "learning_rate": 4.628808274793572e-05, "loss": 0.327, "step": 14240 }, { "epoch": 1.2910096066702919, "grad_norm": 0.993108868598938, "learning_rate": 4.628393493017033e-05, "loss": 0.3725, "step": 14245 }, { "epoch": 1.291462751495378, "grad_norm": 0.8850388526916504, "learning_rate": 4.627978498230154e-05, "loss": 0.3299, "step": 14250 }, { "epoch": 1.291915896320464, "grad_norm": 0.8207643628120422, "learning_rate": 4.627563290474467e-05, "loss": 0.3471, "step": 14255 }, { "epoch": 1.2923690411455502, "grad_norm": 0.9201274514198303, "learning_rate": 4.627147869791527e-05, "loss": 0.3624, "step": 14260 }, { "epoch": 1.2928221859706361, "grad_norm": 0.8744319677352905, "learning_rate": 4.626732236222909e-05, "loss": 0.3297, "step": 14265 }, { "epoch": 1.2932753307957223, "grad_norm": 0.823434591293335, "learning_rate": 4.626316389810209e-05, "loss": 0.4144, "step": 14270 }, { "epoch": 1.2937284756208083, "grad_norm": 0.926087498664856, "learning_rate": 4.625900330595047e-05, "loss": 0.3127, "step": 14275 }, { "epoch": 1.2941816204458945, "grad_norm": 0.9005059003829956, "learning_rate": 4.625484058619061e-05, "loss": 0.3426, "step": 14280 }, { "epoch": 1.2946347652709806, "grad_norm": 0.867497444152832, "learning_rate": 4.625067573923911e-05, "loss": 0.3383, "step": 14285 }, { "epoch": 1.2950879100960666, "grad_norm": 0.8796989917755127, "learning_rate": 4.6246508765512816e-05, "loss": 0.3496, "step": 14290 }, { "epoch": 1.2955410549211528, "grad_norm": 0.92581707239151, "learning_rate": 4.624233966542873e-05, "loss": 0.395, "step": 14295 }, { "epoch": 1.295994199746239, "grad_norm": 0.9106953144073486, "learning_rate": 4.6238168439404114e-05, "loss": 0.3777, "step": 14300 }, { "epoch": 1.296447344571325, "grad_norm": 0.8517726063728333, "learning_rate": 4.623399508785642e-05, "loss": 0.3189, "step": 14305 }, { "epoch": 1.2969004893964111, "grad_norm": 0.8089507818222046, "learning_rate": 4.6229819611203316e-05, "loss": 0.3443, "step": 14310 }, { "epoch": 1.2973536342214973, "grad_norm": 0.8163309097290039, "learning_rate": 4.622564200986269e-05, "loss": 0.3112, "step": 14315 }, { "epoch": 1.2978067790465833, "grad_norm": 0.945347249507904, "learning_rate": 4.6221462284252636e-05, "loss": 0.3511, "step": 14320 }, { "epoch": 1.2982599238716694, "grad_norm": 0.9936633706092834, "learning_rate": 4.6217280434791465e-05, "loss": 0.3531, "step": 14325 }, { "epoch": 1.2987130686967554, "grad_norm": 0.891598105430603, "learning_rate": 4.62130964618977e-05, "loss": 0.3612, "step": 14330 }, { "epoch": 1.2991662135218416, "grad_norm": 0.9222522974014282, "learning_rate": 4.6208910365990066e-05, "loss": 0.3903, "step": 14335 }, { "epoch": 1.2996193583469275, "grad_norm": 0.938677191734314, "learning_rate": 4.6204722147487506e-05, "loss": 0.3946, "step": 14340 }, { "epoch": 1.3000725031720137, "grad_norm": 0.8548560738563538, "learning_rate": 4.620053180680919e-05, "loss": 0.3698, "step": 14345 }, { "epoch": 1.3005256479971, "grad_norm": 0.9374920129776001, "learning_rate": 4.619633934437448e-05, "loss": 0.3328, "step": 14350 }, { "epoch": 1.3009787928221859, "grad_norm": 1.0045626163482666, "learning_rate": 4.6192144760602974e-05, "loss": 0.3516, "step": 14355 }, { "epoch": 1.301431937647272, "grad_norm": 0.9512208700180054, "learning_rate": 4.618794805591445e-05, "loss": 0.3334, "step": 14360 }, { "epoch": 1.3018850824723582, "grad_norm": 0.9541460871696472, "learning_rate": 4.6183749230728926e-05, "loss": 0.4041, "step": 14365 }, { "epoch": 1.3023382272974442, "grad_norm": 0.796757161617279, "learning_rate": 4.617954828546661e-05, "loss": 0.3721, "step": 14370 }, { "epoch": 1.3027913721225304, "grad_norm": 0.9321326613426208, "learning_rate": 4.6175345220547946e-05, "loss": 0.373, "step": 14375 }, { "epoch": 1.3032445169476166, "grad_norm": 0.8941385746002197, "learning_rate": 4.6171140036393575e-05, "loss": 0.3867, "step": 14380 }, { "epoch": 1.3036976617727025, "grad_norm": 0.8674700856208801, "learning_rate": 4.616693273342435e-05, "loss": 0.3716, "step": 14385 }, { "epoch": 1.3041508065977887, "grad_norm": 0.9495554566383362, "learning_rate": 4.616272331206134e-05, "loss": 0.3209, "step": 14390 }, { "epoch": 1.3046039514228749, "grad_norm": 0.762588620185852, "learning_rate": 4.615851177272583e-05, "loss": 0.3484, "step": 14395 }, { "epoch": 1.3050570962479608, "grad_norm": 0.9182942509651184, "learning_rate": 4.615429811583931e-05, "loss": 0.3964, "step": 14400 }, { "epoch": 1.305510241073047, "grad_norm": 0.9747132062911987, "learning_rate": 4.615008234182348e-05, "loss": 0.3474, "step": 14405 }, { "epoch": 1.305963385898133, "grad_norm": 0.9050168395042419, "learning_rate": 4.614586445110026e-05, "loss": 0.3419, "step": 14410 }, { "epoch": 1.3064165307232192, "grad_norm": 0.8362680077552795, "learning_rate": 4.614164444409178e-05, "loss": 0.3463, "step": 14415 }, { "epoch": 1.3068696755483051, "grad_norm": 0.9084765911102295, "learning_rate": 4.613742232122038e-05, "loss": 0.3429, "step": 14420 }, { "epoch": 1.3073228203733913, "grad_norm": 0.9550070762634277, "learning_rate": 4.61331980829086e-05, "loss": 0.309, "step": 14425 }, { "epoch": 1.3077759651984775, "grad_norm": 0.8294469118118286, "learning_rate": 4.612897172957922e-05, "loss": 0.3284, "step": 14430 }, { "epoch": 1.3082291100235635, "grad_norm": 0.8624558448791504, "learning_rate": 4.61247432616552e-05, "loss": 0.3881, "step": 14435 }, { "epoch": 1.3086822548486496, "grad_norm": 0.841999351978302, "learning_rate": 4.6120512679559734e-05, "loss": 0.393, "step": 14440 }, { "epoch": 1.3091353996737358, "grad_norm": 0.9616760611534119, "learning_rate": 4.611627998371622e-05, "loss": 0.3367, "step": 14445 }, { "epoch": 1.3095885444988218, "grad_norm": 0.9512012004852295, "learning_rate": 4.611204517454827e-05, "loss": 0.3377, "step": 14450 }, { "epoch": 1.310041689323908, "grad_norm": 0.9001579284667969, "learning_rate": 4.61078082524797e-05, "loss": 0.3249, "step": 14455 }, { "epoch": 1.3104948341489941, "grad_norm": 0.8646469116210938, "learning_rate": 4.610356921793454e-05, "loss": 0.3579, "step": 14460 }, { "epoch": 1.31094797897408, "grad_norm": 0.9017072916030884, "learning_rate": 4.6099328071337045e-05, "loss": 0.3484, "step": 14465 }, { "epoch": 1.3114011237991663, "grad_norm": 0.7983731627464294, "learning_rate": 4.6095084813111664e-05, "loss": 0.3243, "step": 14470 }, { "epoch": 1.3118542686242523, "grad_norm": 0.8506336808204651, "learning_rate": 4.6090839443683064e-05, "loss": 0.3023, "step": 14475 }, { "epoch": 1.3123074134493384, "grad_norm": 0.8595249652862549, "learning_rate": 4.608659196347612e-05, "loss": 0.3557, "step": 14480 }, { "epoch": 1.3127605582744244, "grad_norm": 0.835127592086792, "learning_rate": 4.6082342372915924e-05, "loss": 0.307, "step": 14485 }, { "epoch": 1.3132137030995106, "grad_norm": 0.805833637714386, "learning_rate": 4.607809067242778e-05, "loss": 0.3462, "step": 14490 }, { "epoch": 1.3136668479245968, "grad_norm": 0.9030977487564087, "learning_rate": 4.6073836862437195e-05, "loss": 0.3524, "step": 14495 }, { "epoch": 1.3141199927496827, "grad_norm": 0.8895474076271057, "learning_rate": 4.606958094336988e-05, "loss": 0.3385, "step": 14500 }, { "epoch": 1.314573137574769, "grad_norm": 1.0330729484558105, "learning_rate": 4.60653229156518e-05, "loss": 0.3711, "step": 14505 }, { "epoch": 1.315026282399855, "grad_norm": 0.8633441925048828, "learning_rate": 4.6061062779709076e-05, "loss": 0.3645, "step": 14510 }, { "epoch": 1.315479427224941, "grad_norm": 0.7821888327598572, "learning_rate": 4.605680053596806e-05, "loss": 0.3375, "step": 14515 }, { "epoch": 1.3159325720500272, "grad_norm": 0.8921559453010559, "learning_rate": 4.605253618485534e-05, "loss": 0.3792, "step": 14520 }, { "epoch": 1.3163857168751134, "grad_norm": 0.8971678614616394, "learning_rate": 4.604826972679768e-05, "loss": 0.3272, "step": 14525 }, { "epoch": 1.3168388617001994, "grad_norm": 0.9633116722106934, "learning_rate": 4.604400116222206e-05, "loss": 0.3671, "step": 14530 }, { "epoch": 1.3172920065252856, "grad_norm": 0.8350028395652771, "learning_rate": 4.60397304915557e-05, "loss": 0.3439, "step": 14535 }, { "epoch": 1.3177451513503715, "grad_norm": 0.9385808706283569, "learning_rate": 4.6035457715225993e-05, "loss": 0.3668, "step": 14540 }, { "epoch": 1.3181982961754577, "grad_norm": 0.7728114724159241, "learning_rate": 4.603118283366057e-05, "loss": 0.294, "step": 14545 }, { "epoch": 1.3186514410005437, "grad_norm": 0.9415702819824219, "learning_rate": 4.602690584728726e-05, "loss": 0.3494, "step": 14550 }, { "epoch": 1.3191045858256298, "grad_norm": 0.881949245929718, "learning_rate": 4.6022626756534104e-05, "loss": 0.2976, "step": 14555 }, { "epoch": 1.319557730650716, "grad_norm": 0.7938547134399414, "learning_rate": 4.601834556182936e-05, "loss": 0.3674, "step": 14560 }, { "epoch": 1.320010875475802, "grad_norm": 0.7558797597885132, "learning_rate": 4.6014062263601484e-05, "loss": 0.371, "step": 14565 }, { "epoch": 1.3204640203008882, "grad_norm": 0.9073057770729065, "learning_rate": 4.600977686227915e-05, "loss": 0.361, "step": 14570 }, { "epoch": 1.3209171651259743, "grad_norm": 1.118634819984436, "learning_rate": 4.6005489358291245e-05, "loss": 0.325, "step": 14575 }, { "epoch": 1.3213703099510603, "grad_norm": 0.8616956472396851, "learning_rate": 4.600119975206687e-05, "loss": 0.3008, "step": 14580 }, { "epoch": 1.3218234547761465, "grad_norm": 0.850596010684967, "learning_rate": 4.599690804403532e-05, "loss": 0.3418, "step": 14585 }, { "epoch": 1.3222765996012327, "grad_norm": 0.8723049759864807, "learning_rate": 4.5992614234626116e-05, "loss": 0.3013, "step": 14590 }, { "epoch": 1.3227297444263186, "grad_norm": 0.882343590259552, "learning_rate": 4.5988318324268984e-05, "loss": 0.3527, "step": 14595 }, { "epoch": 1.3231828892514048, "grad_norm": 0.8299558162689209, "learning_rate": 4.598402031339386e-05, "loss": 0.3206, "step": 14600 }, { "epoch": 1.3236360340764908, "grad_norm": 0.8458487391471863, "learning_rate": 4.5979720202430884e-05, "loss": 0.3883, "step": 14605 }, { "epoch": 1.324089178901577, "grad_norm": 0.8977141380310059, "learning_rate": 4.597541799181042e-05, "loss": 0.3281, "step": 14610 }, { "epoch": 1.324542323726663, "grad_norm": 0.8349534273147583, "learning_rate": 4.597111368196304e-05, "loss": 0.3668, "step": 14615 }, { "epoch": 1.324995468551749, "grad_norm": 0.7856200933456421, "learning_rate": 4.59668072733195e-05, "loss": 0.3077, "step": 14620 }, { "epoch": 1.3254486133768353, "grad_norm": 0.8700851798057556, "learning_rate": 4.596249876631081e-05, "loss": 0.3687, "step": 14625 }, { "epoch": 1.3259017582019212, "grad_norm": 0.8498971462249756, "learning_rate": 4.5958188161368156e-05, "loss": 0.3475, "step": 14630 }, { "epoch": 1.3263549030270074, "grad_norm": 0.8755861520767212, "learning_rate": 4.595387545892294e-05, "loss": 0.378, "step": 14635 }, { "epoch": 1.3268080478520936, "grad_norm": 0.8118677139282227, "learning_rate": 4.594956065940679e-05, "loss": 0.3316, "step": 14640 }, { "epoch": 1.3272611926771796, "grad_norm": 0.8503636121749878, "learning_rate": 4.594524376325152e-05, "loss": 0.3038, "step": 14645 }, { "epoch": 1.3277143375022658, "grad_norm": 0.9408687353134155, "learning_rate": 4.594092477088918e-05, "loss": 0.3285, "step": 14650 }, { "epoch": 1.328167482327352, "grad_norm": 0.8851175308227539, "learning_rate": 4.593660368275201e-05, "loss": 0.3123, "step": 14655 }, { "epoch": 1.328620627152438, "grad_norm": 0.9827463626861572, "learning_rate": 4.593228049927247e-05, "loss": 0.3613, "step": 14660 }, { "epoch": 1.329073771977524, "grad_norm": 0.8996395468711853, "learning_rate": 4.592795522088321e-05, "loss": 0.3257, "step": 14665 }, { "epoch": 1.32952691680261, "grad_norm": 0.8909155130386353, "learning_rate": 4.592362784801713e-05, "loss": 0.3812, "step": 14670 }, { "epoch": 1.3299800616276962, "grad_norm": 1.08168625831604, "learning_rate": 4.5919298381107296e-05, "loss": 0.3563, "step": 14675 }, { "epoch": 1.3304332064527822, "grad_norm": 0.8058910965919495, "learning_rate": 4.5914966820587003e-05, "loss": 0.3564, "step": 14680 }, { "epoch": 1.3308863512778684, "grad_norm": 0.8723969459533691, "learning_rate": 4.5910633166889774e-05, "loss": 0.3447, "step": 14685 }, { "epoch": 1.3313394961029545, "grad_norm": 0.9162372350692749, "learning_rate": 4.5906297420449296e-05, "loss": 0.3662, "step": 14690 }, { "epoch": 1.3317926409280405, "grad_norm": 0.8726950883865356, "learning_rate": 4.590195958169951e-05, "loss": 0.3378, "step": 14695 }, { "epoch": 1.3322457857531267, "grad_norm": 0.8875930905342102, "learning_rate": 4.5897619651074536e-05, "loss": 0.3147, "step": 14700 }, { "epoch": 1.3326989305782129, "grad_norm": 1.5111559629440308, "learning_rate": 4.589327762900873e-05, "loss": 0.3385, "step": 14705 }, { "epoch": 1.3331520754032988, "grad_norm": 1.2072172164916992, "learning_rate": 4.5888933515936636e-05, "loss": 0.4054, "step": 14710 }, { "epoch": 1.333605220228385, "grad_norm": 1.0824451446533203, "learning_rate": 4.5884587312293006e-05, "loss": 0.404, "step": 14715 }, { "epoch": 1.3340583650534712, "grad_norm": 0.8696940541267395, "learning_rate": 4.588023901851283e-05, "loss": 0.3424, "step": 14720 }, { "epoch": 1.3345115098785572, "grad_norm": 0.877903938293457, "learning_rate": 4.5875888635031266e-05, "loss": 0.3443, "step": 14725 }, { "epoch": 1.3349646547036433, "grad_norm": 0.8330527544021606, "learning_rate": 4.5871536162283715e-05, "loss": 0.3109, "step": 14730 }, { "epoch": 1.3354177995287293, "grad_norm": 0.8135043978691101, "learning_rate": 4.5867181600705766e-05, "loss": 0.3451, "step": 14735 }, { "epoch": 1.3358709443538155, "grad_norm": 0.8404101729393005, "learning_rate": 4.586282495073323e-05, "loss": 0.3526, "step": 14740 }, { "epoch": 1.3363240891789014, "grad_norm": 0.8774439692497253, "learning_rate": 4.585846621280212e-05, "loss": 0.3046, "step": 14745 }, { "epoch": 1.3367772340039876, "grad_norm": 0.8983399868011475, "learning_rate": 4.5854105387348654e-05, "loss": 0.3548, "step": 14750 }, { "epoch": 1.3372303788290738, "grad_norm": 0.7701455950737, "learning_rate": 4.584974247480928e-05, "loss": 0.3364, "step": 14755 }, { "epoch": 1.3376835236541598, "grad_norm": 0.9522407054901123, "learning_rate": 4.5845377475620625e-05, "loss": 0.3426, "step": 14760 }, { "epoch": 1.338136668479246, "grad_norm": 0.9503135681152344, "learning_rate": 4.584101039021955e-05, "loss": 0.3642, "step": 14765 }, { "epoch": 1.3385898133043321, "grad_norm": 0.7875221371650696, "learning_rate": 4.5836641219043106e-05, "loss": 0.2995, "step": 14770 }, { "epoch": 1.339042958129418, "grad_norm": 0.832079291343689, "learning_rate": 4.583226996252857e-05, "loss": 0.34, "step": 14775 }, { "epoch": 1.3394961029545043, "grad_norm": 0.9088258147239685, "learning_rate": 4.582789662111341e-05, "loss": 0.3872, "step": 14780 }, { "epoch": 1.3399492477795905, "grad_norm": 0.8907753229141235, "learning_rate": 4.582352119523532e-05, "loss": 0.3621, "step": 14785 }, { "epoch": 1.3404023926046764, "grad_norm": 1.0131417512893677, "learning_rate": 4.581914368533218e-05, "loss": 0.361, "step": 14790 }, { "epoch": 1.3408555374297626, "grad_norm": 0.9131868481636047, "learning_rate": 4.581476409184211e-05, "loss": 0.3249, "step": 14795 }, { "epoch": 1.3413086822548488, "grad_norm": 0.9454854726791382, "learning_rate": 4.581038241520341e-05, "loss": 0.3721, "step": 14800 }, { "epoch": 1.3417618270799347, "grad_norm": 0.8406082987785339, "learning_rate": 4.5805998655854605e-05, "loss": 0.3255, "step": 14805 }, { "epoch": 1.342214971905021, "grad_norm": 0.9302612543106079, "learning_rate": 4.580161281423442e-05, "loss": 0.3673, "step": 14810 }, { "epoch": 1.3426681167301069, "grad_norm": 0.9466236233711243, "learning_rate": 4.5797224890781794e-05, "loss": 0.3396, "step": 14815 }, { "epoch": 1.343121261555193, "grad_norm": 0.8285138010978699, "learning_rate": 4.579283488593586e-05, "loss": 0.3558, "step": 14820 }, { "epoch": 1.343574406380279, "grad_norm": 0.9399087429046631, "learning_rate": 4.5788442800136e-05, "loss": 0.3608, "step": 14825 }, { "epoch": 1.3440275512053652, "grad_norm": 0.9171890020370483, "learning_rate": 4.578404863382174e-05, "loss": 0.3555, "step": 14830 }, { "epoch": 1.3444806960304514, "grad_norm": 1.1541216373443604, "learning_rate": 4.5779652387432884e-05, "loss": 0.3962, "step": 14835 }, { "epoch": 1.3449338408555374, "grad_norm": 0.7760249972343445, "learning_rate": 4.5775254061409377e-05, "loss": 0.3401, "step": 14840 }, { "epoch": 1.3453869856806235, "grad_norm": 0.7992055416107178, "learning_rate": 4.577085365619143e-05, "loss": 0.3023, "step": 14845 }, { "epoch": 1.3458401305057097, "grad_norm": 0.869691014289856, "learning_rate": 4.576645117221942e-05, "loss": 0.3502, "step": 14850 }, { "epoch": 1.3462932753307957, "grad_norm": 0.8052191138267517, "learning_rate": 4.5762046609933964e-05, "loss": 0.3333, "step": 14855 }, { "epoch": 1.3467464201558819, "grad_norm": 0.8059237599372864, "learning_rate": 4.575763996977586e-05, "loss": 0.3067, "step": 14860 }, { "epoch": 1.347199564980968, "grad_norm": 0.8126056790351868, "learning_rate": 4.575323125218612e-05, "loss": 0.3584, "step": 14865 }, { "epoch": 1.347652709806054, "grad_norm": 0.873244047164917, "learning_rate": 4.5748820457606e-05, "loss": 0.3739, "step": 14870 }, { "epoch": 1.3481058546311402, "grad_norm": 0.8804246783256531, "learning_rate": 4.57444075864769e-05, "loss": 0.3288, "step": 14875 }, { "epoch": 1.3485589994562261, "grad_norm": 0.841936469078064, "learning_rate": 4.573999263924049e-05, "loss": 0.3614, "step": 14880 }, { "epoch": 1.3490121442813123, "grad_norm": 0.8625938296318054, "learning_rate": 4.573557561633859e-05, "loss": 0.3482, "step": 14885 }, { "epoch": 1.3494652891063983, "grad_norm": 0.8231288194656372, "learning_rate": 4.5731156518213286e-05, "loss": 0.2988, "step": 14890 }, { "epoch": 1.3499184339314845, "grad_norm": 1.0073546171188354, "learning_rate": 4.5726735345306824e-05, "loss": 0.3885, "step": 14895 }, { "epoch": 1.3503715787565707, "grad_norm": 0.8522156476974487, "learning_rate": 4.572231209806168e-05, "loss": 0.347, "step": 14900 }, { "epoch": 1.3508247235816566, "grad_norm": 1.1476324796676636, "learning_rate": 4.571788677692054e-05, "loss": 0.3576, "step": 14905 }, { "epoch": 1.3512778684067428, "grad_norm": 0.7969544529914856, "learning_rate": 4.571345938232629e-05, "loss": 0.3513, "step": 14910 }, { "epoch": 1.351731013231829, "grad_norm": 0.7748422026634216, "learning_rate": 4.570902991472201e-05, "loss": 0.3721, "step": 14915 }, { "epoch": 1.352184158056915, "grad_norm": 0.8651939034461975, "learning_rate": 4.570459837455103e-05, "loss": 0.3213, "step": 14920 }, { "epoch": 1.3526373028820011, "grad_norm": 0.9691091775894165, "learning_rate": 4.5700164762256836e-05, "loss": 0.3704, "step": 14925 }, { "epoch": 1.3530904477070873, "grad_norm": 1.002585768699646, "learning_rate": 4.5695729078283154e-05, "loss": 0.3712, "step": 14930 }, { "epoch": 1.3535435925321733, "grad_norm": 0.8582854270935059, "learning_rate": 4.569129132307392e-05, "loss": 0.3438, "step": 14935 }, { "epoch": 1.3539967373572595, "grad_norm": 0.832726776599884, "learning_rate": 4.5686851497073246e-05, "loss": 0.3161, "step": 14940 }, { "epoch": 1.3544498821823454, "grad_norm": 0.9749166965484619, "learning_rate": 4.568240960072548e-05, "loss": 0.3547, "step": 14945 }, { "epoch": 1.3549030270074316, "grad_norm": 0.9483156800270081, "learning_rate": 4.567796563447518e-05, "loss": 0.3306, "step": 14950 }, { "epoch": 1.3553561718325176, "grad_norm": 0.9979045987129211, "learning_rate": 4.567351959876708e-05, "loss": 0.3604, "step": 14955 }, { "epoch": 1.3558093166576037, "grad_norm": 0.8372693657875061, "learning_rate": 4.5669071494046156e-05, "loss": 0.3337, "step": 14960 }, { "epoch": 1.35626246148269, "grad_norm": 0.8888668417930603, "learning_rate": 4.5664621320757574e-05, "loss": 0.3333, "step": 14965 }, { "epoch": 1.3567156063077759, "grad_norm": 0.8535961508750916, "learning_rate": 4.5660169079346694e-05, "loss": 0.3186, "step": 14970 }, { "epoch": 1.357168751132862, "grad_norm": 0.9456357955932617, "learning_rate": 4.565571477025911e-05, "loss": 0.3262, "step": 14975 }, { "epoch": 1.3576218959579482, "grad_norm": 0.8757704496383667, "learning_rate": 4.565125839394061e-05, "loss": 0.2861, "step": 14980 }, { "epoch": 1.3580750407830342, "grad_norm": 0.9118072390556335, "learning_rate": 4.5646799950837196e-05, "loss": 0.4003, "step": 14985 }, { "epoch": 1.3585281856081204, "grad_norm": 0.8691385984420776, "learning_rate": 4.564233944139506e-05, "loss": 0.3494, "step": 14990 }, { "epoch": 1.3589813304332066, "grad_norm": 0.8160786032676697, "learning_rate": 4.56378768660606e-05, "loss": 0.3541, "step": 14995 }, { "epoch": 1.3594344752582925, "grad_norm": 0.8494128584861755, "learning_rate": 4.5633412225280456e-05, "loss": 0.3488, "step": 15000 }, { "epoch": 1.3598876200833787, "grad_norm": 0.7658330798149109, "learning_rate": 4.562894551950145e-05, "loss": 0.3334, "step": 15005 }, { "epoch": 1.3603407649084647, "grad_norm": 0.8783830404281616, "learning_rate": 4.56244767491706e-05, "loss": 0.3452, "step": 15010 }, { "epoch": 1.3607939097335509, "grad_norm": 0.95050448179245, "learning_rate": 4.562000591473513e-05, "loss": 0.3961, "step": 15015 }, { "epoch": 1.3612470545586368, "grad_norm": 0.8375329971313477, "learning_rate": 4.561553301664251e-05, "loss": 0.3258, "step": 15020 }, { "epoch": 1.361700199383723, "grad_norm": 0.9973564743995667, "learning_rate": 4.5611058055340375e-05, "loss": 0.3015, "step": 15025 }, { "epoch": 1.3621533442088092, "grad_norm": 1.0421749353408813, "learning_rate": 4.560658103127658e-05, "loss": 0.3403, "step": 15030 }, { "epoch": 1.3626064890338951, "grad_norm": 0.8927331566810608, "learning_rate": 4.56021019448992e-05, "loss": 0.3742, "step": 15035 }, { "epoch": 1.3630596338589813, "grad_norm": 0.8288667798042297, "learning_rate": 4.559762079665648e-05, "loss": 0.3087, "step": 15040 }, { "epoch": 1.3635127786840675, "grad_norm": 0.8474732637405396, "learning_rate": 4.559313758699691e-05, "loss": 0.3093, "step": 15045 }, { "epoch": 1.3639659235091535, "grad_norm": 0.9104608297348022, "learning_rate": 4.558865231636917e-05, "loss": 0.339, "step": 15050 }, { "epoch": 1.3644190683342396, "grad_norm": 0.8801599740982056, "learning_rate": 4.5584164985222156e-05, "loss": 0.3408, "step": 15055 }, { "epoch": 1.3648722131593258, "grad_norm": 0.867707371711731, "learning_rate": 4.557967559400494e-05, "loss": 0.3291, "step": 15060 }, { "epoch": 1.3653253579844118, "grad_norm": 0.7847603559494019, "learning_rate": 4.557518414316684e-05, "loss": 0.3301, "step": 15065 }, { "epoch": 1.365778502809498, "grad_norm": 0.8774462342262268, "learning_rate": 4.5570690633157345e-05, "loss": 0.4029, "step": 15070 }, { "epoch": 1.366231647634584, "grad_norm": 0.9366100430488586, "learning_rate": 4.5566195064426184e-05, "loss": 0.3479, "step": 15075 }, { "epoch": 1.3666847924596701, "grad_norm": 0.8166449666023254, "learning_rate": 4.556169743742327e-05, "loss": 0.3185, "step": 15080 }, { "epoch": 1.367137937284756, "grad_norm": 0.8409197926521301, "learning_rate": 4.5557197752598725e-05, "loss": 0.3579, "step": 15085 }, { "epoch": 1.3675910821098423, "grad_norm": 0.8337535858154297, "learning_rate": 4.555269601040288e-05, "loss": 0.3718, "step": 15090 }, { "epoch": 1.3680442269349284, "grad_norm": 0.9237117171287537, "learning_rate": 4.554819221128627e-05, "loss": 0.3497, "step": 15095 }, { "epoch": 1.3684973717600144, "grad_norm": 0.839246928691864, "learning_rate": 4.554368635569963e-05, "loss": 0.3038, "step": 15100 }, { "epoch": 1.3689505165851006, "grad_norm": 0.8174630999565125, "learning_rate": 4.5539178444093925e-05, "loss": 0.3833, "step": 15105 }, { "epoch": 1.3694036614101868, "grad_norm": 0.9241131544113159, "learning_rate": 4.55346684769203e-05, "loss": 0.3628, "step": 15110 }, { "epoch": 1.3698568062352727, "grad_norm": 0.9116911888122559, "learning_rate": 4.55301564546301e-05, "loss": 0.3253, "step": 15115 }, { "epoch": 1.370309951060359, "grad_norm": 0.8060349822044373, "learning_rate": 4.5525642377674906e-05, "loss": 0.3077, "step": 15120 }, { "epoch": 1.370763095885445, "grad_norm": 0.9460508227348328, "learning_rate": 4.552112624650649e-05, "loss": 0.3746, "step": 15125 }, { "epoch": 1.371216240710531, "grad_norm": 0.8948659896850586, "learning_rate": 4.5516608061576814e-05, "loss": 0.3414, "step": 15130 }, { "epoch": 1.3716693855356172, "grad_norm": 0.9848813414573669, "learning_rate": 4.551208782333808e-05, "loss": 0.4049, "step": 15135 }, { "epoch": 1.3721225303607034, "grad_norm": 0.8754732608795166, "learning_rate": 4.5507565532242655e-05, "loss": 0.3446, "step": 15140 }, { "epoch": 1.3725756751857894, "grad_norm": 0.9265137910842896, "learning_rate": 4.550304118874313e-05, "loss": 0.3507, "step": 15145 }, { "epoch": 1.3730288200108753, "grad_norm": 0.970954954624176, "learning_rate": 4.549851479329232e-05, "loss": 0.3402, "step": 15150 }, { "epoch": 1.3734819648359615, "grad_norm": 0.9262577295303345, "learning_rate": 4.5493986346343224e-05, "loss": 0.354, "step": 15155 }, { "epoch": 1.3739351096610477, "grad_norm": 0.8158589005470276, "learning_rate": 4.548945584834904e-05, "loss": 0.2924, "step": 15160 }, { "epoch": 1.3743882544861337, "grad_norm": 0.9188200235366821, "learning_rate": 4.5484923299763197e-05, "loss": 0.3549, "step": 15165 }, { "epoch": 1.3748413993112198, "grad_norm": 0.9387962818145752, "learning_rate": 4.54803887010393e-05, "loss": 0.3504, "step": 15170 }, { "epoch": 1.375294544136306, "grad_norm": 0.8335390090942383, "learning_rate": 4.5475852052631176e-05, "loss": 0.3096, "step": 15175 }, { "epoch": 1.375747688961392, "grad_norm": 0.8593229651451111, "learning_rate": 4.547131335499286e-05, "loss": 0.3522, "step": 15180 }, { "epoch": 1.3762008337864782, "grad_norm": 1.0017249584197998, "learning_rate": 4.5466772608578576e-05, "loss": 0.3292, "step": 15185 }, { "epoch": 1.3766539786115644, "grad_norm": 0.9307664036750793, "learning_rate": 4.546222981384278e-05, "loss": 0.3528, "step": 15190 }, { "epoch": 1.3771071234366503, "grad_norm": 0.8073698282241821, "learning_rate": 4.5457684971240097e-05, "loss": 0.3381, "step": 15195 }, { "epoch": 1.3775602682617365, "grad_norm": 0.880437433719635, "learning_rate": 4.5453138081225395e-05, "loss": 0.3993, "step": 15200 }, { "epoch": 1.3780134130868227, "grad_norm": 0.8769826292991638, "learning_rate": 4.5448589144253716e-05, "loss": 0.3571, "step": 15205 }, { "epoch": 1.3784665579119086, "grad_norm": 0.9147416353225708, "learning_rate": 4.544403816078032e-05, "loss": 0.3111, "step": 15210 }, { "epoch": 1.3789197027369948, "grad_norm": 0.9179064035415649, "learning_rate": 4.543948513126068e-05, "loss": 0.3629, "step": 15215 }, { "epoch": 1.3793728475620808, "grad_norm": 0.8123385310173035, "learning_rate": 4.543493005615046e-05, "loss": 0.3247, "step": 15220 }, { "epoch": 1.379825992387167, "grad_norm": 0.8965974450111389, "learning_rate": 4.543037293590552e-05, "loss": 0.3081, "step": 15225 }, { "epoch": 1.380279137212253, "grad_norm": 0.9525113105773926, "learning_rate": 4.542581377098196e-05, "loss": 0.3226, "step": 15230 }, { "epoch": 1.380732282037339, "grad_norm": 0.8892359733581543, "learning_rate": 4.542125256183605e-05, "loss": 0.3325, "step": 15235 }, { "epoch": 1.3811854268624253, "grad_norm": 0.8768373727798462, "learning_rate": 4.5416689308924285e-05, "loss": 0.3575, "step": 15240 }, { "epoch": 1.3816385716875113, "grad_norm": 0.8223785758018494, "learning_rate": 4.5412124012703344e-05, "loss": 0.4057, "step": 15245 }, { "epoch": 1.3820917165125974, "grad_norm": 0.9497567415237427, "learning_rate": 4.540755667363014e-05, "loss": 0.3279, "step": 15250 }, { "epoch": 1.3825448613376836, "grad_norm": 1.1738145351409912, "learning_rate": 4.540298729216176e-05, "loss": 0.3196, "step": 15255 }, { "epoch": 1.3829980061627696, "grad_norm": 0.8642224073410034, "learning_rate": 4.539841586875551e-05, "loss": 0.3221, "step": 15260 }, { "epoch": 1.3834511509878558, "grad_norm": 0.8885906934738159, "learning_rate": 4.5393842403868915e-05, "loss": 0.3604, "step": 15265 }, { "epoch": 1.383904295812942, "grad_norm": 0.9149910807609558, "learning_rate": 4.538926689795967e-05, "loss": 0.3549, "step": 15270 }, { "epoch": 1.384357440638028, "grad_norm": 0.8977124691009521, "learning_rate": 4.53846893514857e-05, "loss": 0.2979, "step": 15275 }, { "epoch": 1.384810585463114, "grad_norm": 0.7707200050354004, "learning_rate": 4.5380109764905135e-05, "loss": 0.3083, "step": 15280 }, { "epoch": 1.3852637302882, "grad_norm": 0.8910935521125793, "learning_rate": 4.537552813867629e-05, "loss": 0.3526, "step": 15285 }, { "epoch": 1.3857168751132862, "grad_norm": 0.8719798922538757, "learning_rate": 4.53709444732577e-05, "loss": 0.3415, "step": 15290 }, { "epoch": 1.3861700199383722, "grad_norm": 0.9085013270378113, "learning_rate": 4.536635876910811e-05, "loss": 0.3773, "step": 15295 }, { "epoch": 1.3866231647634584, "grad_norm": 0.952574610710144, "learning_rate": 4.536177102668645e-05, "loss": 0.3386, "step": 15300 }, { "epoch": 1.3870763095885446, "grad_norm": 0.8637735247612, "learning_rate": 4.5357181246451854e-05, "loss": 0.3868, "step": 15305 }, { "epoch": 1.3875294544136305, "grad_norm": 0.8578862547874451, "learning_rate": 4.535258942886368e-05, "loss": 0.3634, "step": 15310 }, { "epoch": 1.3879825992387167, "grad_norm": 0.8447268009185791, "learning_rate": 4.534799557438147e-05, "loss": 0.3652, "step": 15315 }, { "epoch": 1.3884357440638029, "grad_norm": 0.7103087902069092, "learning_rate": 4.534339968346499e-05, "loss": 0.338, "step": 15320 }, { "epoch": 1.3888888888888888, "grad_norm": 0.8862382173538208, "learning_rate": 4.533880175657419e-05, "loss": 0.3125, "step": 15325 }, { "epoch": 1.389342033713975, "grad_norm": 0.9453997015953064, "learning_rate": 4.533420179416924e-05, "loss": 0.3526, "step": 15330 }, { "epoch": 1.3897951785390612, "grad_norm": 0.8406849503517151, "learning_rate": 4.53295997967105e-05, "loss": 0.3113, "step": 15335 }, { "epoch": 1.3902483233641472, "grad_norm": 0.8796119689941406, "learning_rate": 4.532499576465853e-05, "loss": 0.3222, "step": 15340 }, { "epoch": 1.3907014681892333, "grad_norm": 0.8767834305763245, "learning_rate": 4.532038969847412e-05, "loss": 0.3604, "step": 15345 }, { "epoch": 1.3911546130143193, "grad_norm": 0.9242461323738098, "learning_rate": 4.531578159861824e-05, "loss": 0.3551, "step": 15350 }, { "epoch": 1.3916077578394055, "grad_norm": 0.7900345921516418, "learning_rate": 4.531117146555207e-05, "loss": 0.2935, "step": 15355 }, { "epoch": 1.3920609026644915, "grad_norm": 0.8589555621147156, "learning_rate": 4.530655929973699e-05, "loss": 0.3302, "step": 15360 }, { "epoch": 1.3925140474895776, "grad_norm": 0.8538352847099304, "learning_rate": 4.53019451016346e-05, "loss": 0.381, "step": 15365 }, { "epoch": 1.3929671923146638, "grad_norm": 0.9293335676193237, "learning_rate": 4.5297328871706665e-05, "loss": 0.3536, "step": 15370 }, { "epoch": 1.3934203371397498, "grad_norm": 0.8154127597808838, "learning_rate": 4.529271061041521e-05, "loss": 0.3223, "step": 15375 }, { "epoch": 1.393873481964836, "grad_norm": 0.8543274998664856, "learning_rate": 4.528809031822241e-05, "loss": 0.3472, "step": 15380 }, { "epoch": 1.3943266267899221, "grad_norm": 0.9126324653625488, "learning_rate": 4.528346799559068e-05, "loss": 0.3534, "step": 15385 }, { "epoch": 1.394779771615008, "grad_norm": 0.9617660045623779, "learning_rate": 4.5278843642982604e-05, "loss": 0.3383, "step": 15390 }, { "epoch": 1.3952329164400943, "grad_norm": 0.8337819576263428, "learning_rate": 4.527421726086101e-05, "loss": 0.3098, "step": 15395 }, { "epoch": 1.3956860612651805, "grad_norm": 0.8312467932701111, "learning_rate": 4.5269588849688895e-05, "loss": 0.3411, "step": 15400 }, { "epoch": 1.3961392060902664, "grad_norm": 0.8592232465744019, "learning_rate": 4.526495840992948e-05, "loss": 0.3203, "step": 15405 }, { "epoch": 1.3965923509153526, "grad_norm": 0.8788986802101135, "learning_rate": 4.526032594204617e-05, "loss": 0.3336, "step": 15410 }, { "epoch": 1.3970454957404386, "grad_norm": 0.9104689955711365, "learning_rate": 4.52556914465026e-05, "loss": 0.3661, "step": 15415 }, { "epoch": 1.3974986405655248, "grad_norm": 0.8508967161178589, "learning_rate": 4.525105492376257e-05, "loss": 0.3464, "step": 15420 }, { "epoch": 1.3979517853906107, "grad_norm": 0.8681458830833435, "learning_rate": 4.524641637429013e-05, "loss": 0.3149, "step": 15425 }, { "epoch": 1.398404930215697, "grad_norm": 0.8286123871803284, "learning_rate": 4.524177579854949e-05, "loss": 0.3226, "step": 15430 }, { "epoch": 1.398858075040783, "grad_norm": 0.9019888639450073, "learning_rate": 4.523713319700508e-05, "loss": 0.3596, "step": 15435 }, { "epoch": 1.399311219865869, "grad_norm": 0.9020208120346069, "learning_rate": 4.5232488570121544e-05, "loss": 0.3791, "step": 15440 }, { "epoch": 1.3997643646909552, "grad_norm": 0.8438413739204407, "learning_rate": 4.522784191836371e-05, "loss": 0.352, "step": 15445 }, { "epoch": 1.4002175095160414, "grad_norm": 0.8149915337562561, "learning_rate": 4.522319324219663e-05, "loss": 0.3095, "step": 15450 }, { "epoch": 1.4006706543411274, "grad_norm": 0.9477730989456177, "learning_rate": 4.521854254208552e-05, "loss": 0.3336, "step": 15455 }, { "epoch": 1.4011237991662135, "grad_norm": 0.8279924392700195, "learning_rate": 4.521388981849584e-05, "loss": 0.3621, "step": 15460 }, { "epoch": 1.4015769439912997, "grad_norm": 0.9039889574050903, "learning_rate": 4.520923507189323e-05, "loss": 0.3287, "step": 15465 }, { "epoch": 1.4020300888163857, "grad_norm": 0.8835041522979736, "learning_rate": 4.520457830274355e-05, "loss": 0.313, "step": 15470 }, { "epoch": 1.4024832336414719, "grad_norm": 0.9529926776885986, "learning_rate": 4.519991951151284e-05, "loss": 0.3488, "step": 15475 }, { "epoch": 1.4029363784665578, "grad_norm": 0.8838729858398438, "learning_rate": 4.519525869866736e-05, "loss": 0.3731, "step": 15480 }, { "epoch": 1.403389523291644, "grad_norm": 0.8688704967498779, "learning_rate": 4.5190595864673554e-05, "loss": 0.3558, "step": 15485 }, { "epoch": 1.40384266811673, "grad_norm": 0.9496243000030518, "learning_rate": 4.51859310099981e-05, "loss": 0.3488, "step": 15490 }, { "epoch": 1.4042958129418162, "grad_norm": 0.8735055923461914, "learning_rate": 4.518126413510784e-05, "loss": 0.3053, "step": 15495 }, { "epoch": 1.4047489577669023, "grad_norm": 0.8721928596496582, "learning_rate": 4.5176595240469845e-05, "loss": 0.3307, "step": 15500 }, { "epoch": 1.4052021025919883, "grad_norm": 0.9051746129989624, "learning_rate": 4.517192432655137e-05, "loss": 0.3139, "step": 15505 }, { "epoch": 1.4056552474170745, "grad_norm": 0.9803463220596313, "learning_rate": 4.51672513938199e-05, "loss": 0.3441, "step": 15510 }, { "epoch": 1.4061083922421607, "grad_norm": 0.8440713882446289, "learning_rate": 4.5162576442743086e-05, "loss": 0.3089, "step": 15515 }, { "epoch": 1.4065615370672466, "grad_norm": 0.8972756266593933, "learning_rate": 4.5157899473788815e-05, "loss": 0.3812, "step": 15520 }, { "epoch": 1.4070146818923328, "grad_norm": 0.926794171333313, "learning_rate": 4.515322048742514e-05, "loss": 0.375, "step": 15525 }, { "epoch": 1.407467826717419, "grad_norm": 0.8561320304870605, "learning_rate": 4.5148539484120356e-05, "loss": 0.341, "step": 15530 }, { "epoch": 1.407920971542505, "grad_norm": 0.9178066849708557, "learning_rate": 4.514385646434292e-05, "loss": 0.3197, "step": 15535 }, { "epoch": 1.4083741163675911, "grad_norm": 0.8962686061859131, "learning_rate": 4.513917142856152e-05, "loss": 0.2954, "step": 15540 }, { "epoch": 1.4088272611926773, "grad_norm": 0.96378493309021, "learning_rate": 4.513448437724504e-05, "loss": 0.3042, "step": 15545 }, { "epoch": 1.4092804060177633, "grad_norm": 0.7547777891159058, "learning_rate": 4.512979531086256e-05, "loss": 0.3104, "step": 15550 }, { "epoch": 1.4097335508428492, "grad_norm": 0.873738169670105, "learning_rate": 4.512510422988335e-05, "loss": 0.2944, "step": 15555 }, { "epoch": 1.4101866956679354, "grad_norm": 0.9460462331771851, "learning_rate": 4.512041113477691e-05, "loss": 0.3625, "step": 15560 }, { "epoch": 1.4106398404930216, "grad_norm": 0.9259429574012756, "learning_rate": 4.511571602601292e-05, "loss": 0.359, "step": 15565 }, { "epoch": 1.4110929853181076, "grad_norm": 0.9153011441230774, "learning_rate": 4.5111018904061275e-05, "loss": 0.3481, "step": 15570 }, { "epoch": 1.4115461301431937, "grad_norm": 0.8469756841659546, "learning_rate": 4.5106319769392055e-05, "loss": 0.3399, "step": 15575 }, { "epoch": 1.41199927496828, "grad_norm": 0.8937860727310181, "learning_rate": 4.510161862247556e-05, "loss": 0.297, "step": 15580 }, { "epoch": 1.4124524197933659, "grad_norm": 0.8644063472747803, "learning_rate": 4.509691546378227e-05, "loss": 0.3382, "step": 15585 }, { "epoch": 1.412905564618452, "grad_norm": 0.8380452990531921, "learning_rate": 4.50922102937829e-05, "loss": 0.3384, "step": 15590 }, { "epoch": 1.4133587094435383, "grad_norm": 0.9441470503807068, "learning_rate": 4.508750311294832e-05, "loss": 0.3401, "step": 15595 }, { "epoch": 1.4138118542686242, "grad_norm": 0.9296892881393433, "learning_rate": 4.5082793921749645e-05, "loss": 0.2971, "step": 15600 }, { "epoch": 1.4142649990937104, "grad_norm": 0.8790765404701233, "learning_rate": 4.5078082720658154e-05, "loss": 0.4048, "step": 15605 }, { "epoch": 1.4147181439187966, "grad_norm": 0.7654474973678589, "learning_rate": 4.507336951014537e-05, "loss": 0.3339, "step": 15610 }, { "epoch": 1.4151712887438825, "grad_norm": 0.7816233038902283, "learning_rate": 4.506865429068297e-05, "loss": 0.2616, "step": 15615 }, { "epoch": 1.4156244335689687, "grad_norm": 0.913131594657898, "learning_rate": 4.506393706274287e-05, "loss": 0.3564, "step": 15620 }, { "epoch": 1.4160775783940547, "grad_norm": 0.8793874979019165, "learning_rate": 4.5059217826797164e-05, "loss": 0.3389, "step": 15625 }, { "epoch": 1.4165307232191409, "grad_norm": 0.9020580649375916, "learning_rate": 4.505449658331816e-05, "loss": 0.3324, "step": 15630 }, { "epoch": 1.4169838680442268, "grad_norm": 0.8857312202453613, "learning_rate": 4.504977333277836e-05, "loss": 0.3583, "step": 15635 }, { "epoch": 1.417437012869313, "grad_norm": 0.845656156539917, "learning_rate": 4.5045048075650455e-05, "loss": 0.2992, "step": 15640 }, { "epoch": 1.4178901576943992, "grad_norm": 0.868704617023468, "learning_rate": 4.5040320812407377e-05, "loss": 0.2828, "step": 15645 }, { "epoch": 1.4183433025194851, "grad_norm": 0.8852822780609131, "learning_rate": 4.5035591543522206e-05, "loss": 0.3205, "step": 15650 }, { "epoch": 1.4187964473445713, "grad_norm": 0.9117987751960754, "learning_rate": 4.5030860269468265e-05, "loss": 0.3406, "step": 15655 }, { "epoch": 1.4192495921696575, "grad_norm": 0.8493071794509888, "learning_rate": 4.502612699071906e-05, "loss": 0.3151, "step": 15660 }, { "epoch": 1.4197027369947435, "grad_norm": 0.7977601289749146, "learning_rate": 4.502139170774829e-05, "loss": 0.3483, "step": 15665 }, { "epoch": 1.4201558818198297, "grad_norm": 0.9074074029922485, "learning_rate": 4.5016654421029876e-05, "loss": 0.3455, "step": 15670 }, { "epoch": 1.4206090266449158, "grad_norm": 0.8420115113258362, "learning_rate": 4.501191513103792e-05, "loss": 0.3171, "step": 15675 }, { "epoch": 1.4210621714700018, "grad_norm": 0.7460442781448364, "learning_rate": 4.5007173838246726e-05, "loss": 0.2729, "step": 15680 }, { "epoch": 1.421515316295088, "grad_norm": 0.7744186520576477, "learning_rate": 4.500243054313081e-05, "loss": 0.3939, "step": 15685 }, { "epoch": 1.421968461120174, "grad_norm": 0.8361969590187073, "learning_rate": 4.4997685246164895e-05, "loss": 0.34, "step": 15690 }, { "epoch": 1.4224216059452601, "grad_norm": 0.8267629146575928, "learning_rate": 4.499293794782387e-05, "loss": 0.3134, "step": 15695 }, { "epoch": 1.422874750770346, "grad_norm": 0.9905667901039124, "learning_rate": 4.498818864858286e-05, "loss": 0.3759, "step": 15700 }, { "epoch": 1.4233278955954323, "grad_norm": 0.835564911365509, "learning_rate": 4.498343734891717e-05, "loss": 0.3763, "step": 15705 }, { "epoch": 1.4237810404205185, "grad_norm": 0.973638653755188, "learning_rate": 4.4978684049302325e-05, "loss": 0.3371, "step": 15710 }, { "epoch": 1.4242341852456044, "grad_norm": 0.9285940527915955, "learning_rate": 4.4973928750214015e-05, "loss": 0.3354, "step": 15715 }, { "epoch": 1.4246873300706906, "grad_norm": 0.8687307238578796, "learning_rate": 4.4969171452128166e-05, "loss": 0.3072, "step": 15720 }, { "epoch": 1.4251404748957768, "grad_norm": 0.9285119771957397, "learning_rate": 4.49644121555209e-05, "loss": 0.334, "step": 15725 }, { "epoch": 1.4255936197208627, "grad_norm": 0.8089897036552429, "learning_rate": 4.4959650860868504e-05, "loss": 0.3925, "step": 15730 }, { "epoch": 1.426046764545949, "grad_norm": 0.9755212664604187, "learning_rate": 4.4954887568647495e-05, "loss": 0.3563, "step": 15735 }, { "epoch": 1.426499909371035, "grad_norm": 1.8900717496871948, "learning_rate": 4.495012227933461e-05, "loss": 0.3248, "step": 15740 }, { "epoch": 1.426953054196121, "grad_norm": 0.9558421969413757, "learning_rate": 4.494535499340673e-05, "loss": 0.3628, "step": 15745 }, { "epoch": 1.4274061990212072, "grad_norm": 0.8774139881134033, "learning_rate": 4.494058571134099e-05, "loss": 0.3568, "step": 15750 }, { "epoch": 1.4278593438462932, "grad_norm": 0.9311321973800659, "learning_rate": 4.493581443361469e-05, "loss": 0.3217, "step": 15755 }, { "epoch": 1.4283124886713794, "grad_norm": 0.8893566727638245, "learning_rate": 4.493104116070533e-05, "loss": 0.341, "step": 15760 }, { "epoch": 1.4287656334964653, "grad_norm": 0.9505446553230286, "learning_rate": 4.492626589309064e-05, "loss": 0.3839, "step": 15765 }, { "epoch": 1.4292187783215515, "grad_norm": 0.839041531085968, "learning_rate": 4.4921488631248526e-05, "loss": 0.3881, "step": 15770 }, { "epoch": 1.4296719231466377, "grad_norm": 0.8152316808700562, "learning_rate": 4.4916709375657094e-05, "loss": 0.3532, "step": 15775 }, { "epoch": 1.4301250679717237, "grad_norm": 0.8173640966415405, "learning_rate": 4.4911928126794655e-05, "loss": 0.288, "step": 15780 }, { "epoch": 1.4305782127968099, "grad_norm": 1.0459697246551514, "learning_rate": 4.4907144885139716e-05, "loss": 0.3521, "step": 15785 }, { "epoch": 1.431031357621896, "grad_norm": 0.9177184104919434, "learning_rate": 4.490235965117099e-05, "loss": 0.3204, "step": 15790 }, { "epoch": 1.431484502446982, "grad_norm": 0.8543307185173035, "learning_rate": 4.489757242536738e-05, "loss": 0.2948, "step": 15795 }, { "epoch": 1.4319376472720682, "grad_norm": 0.8607534766197205, "learning_rate": 4.4892783208207995e-05, "loss": 0.2985, "step": 15800 }, { "epoch": 1.4323907920971544, "grad_norm": 0.9438170194625854, "learning_rate": 4.4887992000172143e-05, "loss": 0.3755, "step": 15805 }, { "epoch": 1.4328439369222403, "grad_norm": 0.900154173374176, "learning_rate": 4.488319880173933e-05, "loss": 0.3315, "step": 15810 }, { "epoch": 1.4332970817473265, "grad_norm": 0.9761333465576172, "learning_rate": 4.487840361338925e-05, "loss": 0.3325, "step": 15815 }, { "epoch": 1.4337502265724125, "grad_norm": 0.8877332210540771, "learning_rate": 4.487360643560183e-05, "loss": 0.3553, "step": 15820 }, { "epoch": 1.4342033713974986, "grad_norm": 0.8985492587089539, "learning_rate": 4.4868807268857154e-05, "loss": 0.3115, "step": 15825 }, { "epoch": 1.4346565162225846, "grad_norm": 0.9778724908828735, "learning_rate": 4.486400611363553e-05, "loss": 0.3209, "step": 15830 }, { "epoch": 1.4351096610476708, "grad_norm": 0.914187490940094, "learning_rate": 4.4859202970417456e-05, "loss": 0.3263, "step": 15835 }, { "epoch": 1.435562805872757, "grad_norm": 0.8824253082275391, "learning_rate": 4.4854397839683636e-05, "loss": 0.2932, "step": 15840 }, { "epoch": 1.436015950697843, "grad_norm": 0.9009947180747986, "learning_rate": 4.484959072191497e-05, "loss": 0.3437, "step": 15845 }, { "epoch": 1.4364690955229291, "grad_norm": 0.8069299459457397, "learning_rate": 4.4844781617592555e-05, "loss": 0.2992, "step": 15850 }, { "epoch": 1.4369222403480153, "grad_norm": 0.8011388182640076, "learning_rate": 4.483997052719769e-05, "loss": 0.3539, "step": 15855 }, { "epoch": 1.4373753851731013, "grad_norm": 0.8709663152694702, "learning_rate": 4.4835157451211865e-05, "loss": 0.2899, "step": 15860 }, { "epoch": 1.4378285299981874, "grad_norm": 0.8475409746170044, "learning_rate": 4.483034239011678e-05, "loss": 0.3043, "step": 15865 }, { "epoch": 1.4382816748232736, "grad_norm": 0.8384758234024048, "learning_rate": 4.4825525344394314e-05, "loss": 0.3644, "step": 15870 }, { "epoch": 1.4387348196483596, "grad_norm": 0.8600175380706787, "learning_rate": 4.482070631452658e-05, "loss": 0.3401, "step": 15875 }, { "epoch": 1.4391879644734458, "grad_norm": 0.9233013987541199, "learning_rate": 4.481588530099585e-05, "loss": 0.3128, "step": 15880 }, { "epoch": 1.4396411092985317, "grad_norm": 0.9198874235153198, "learning_rate": 4.481106230428463e-05, "loss": 0.3164, "step": 15885 }, { "epoch": 1.440094254123618, "grad_norm": 0.8296319842338562, "learning_rate": 4.480623732487559e-05, "loss": 0.3109, "step": 15890 }, { "epoch": 1.4405473989487039, "grad_norm": 0.9698281288146973, "learning_rate": 4.4801410363251626e-05, "loss": 0.3366, "step": 15895 }, { "epoch": 1.44100054377379, "grad_norm": 0.8146260976791382, "learning_rate": 4.479658141989582e-05, "loss": 0.308, "step": 15900 }, { "epoch": 1.4414536885988762, "grad_norm": 0.9424358010292053, "learning_rate": 4.4791750495291454e-05, "loss": 0.3493, "step": 15905 }, { "epoch": 1.4419068334239622, "grad_norm": 0.8169508576393127, "learning_rate": 4.478691758992201e-05, "loss": 0.3157, "step": 15910 }, { "epoch": 1.4423599782490484, "grad_norm": 0.9039985537528992, "learning_rate": 4.478208270427116e-05, "loss": 0.3748, "step": 15915 }, { "epoch": 1.4428131230741346, "grad_norm": 0.877204179763794, "learning_rate": 4.477724583882278e-05, "loss": 0.3116, "step": 15920 }, { "epoch": 1.4432662678992205, "grad_norm": 0.9837504625320435, "learning_rate": 4.4772406994060955e-05, "loss": 0.3751, "step": 15925 }, { "epoch": 1.4437194127243067, "grad_norm": 0.8390685319900513, "learning_rate": 4.4767566170469956e-05, "loss": 0.3122, "step": 15930 }, { "epoch": 1.4441725575493929, "grad_norm": 0.926455020904541, "learning_rate": 4.476272336853425e-05, "loss": 0.3373, "step": 15935 }, { "epoch": 1.4446257023744788, "grad_norm": 0.8612189292907715, "learning_rate": 4.475787858873851e-05, "loss": 0.334, "step": 15940 }, { "epoch": 1.445078847199565, "grad_norm": 0.8817455768585205, "learning_rate": 4.4753031831567604e-05, "loss": 0.3266, "step": 15945 }, { "epoch": 1.4455319920246512, "grad_norm": 0.9319576025009155, "learning_rate": 4.474818309750659e-05, "loss": 0.3702, "step": 15950 }, { "epoch": 1.4459851368497372, "grad_norm": 0.7946282625198364, "learning_rate": 4.474333238704074e-05, "loss": 0.3154, "step": 15955 }, { "epoch": 1.4464382816748234, "grad_norm": 0.880961537361145, "learning_rate": 4.473847970065551e-05, "loss": 0.3387, "step": 15960 }, { "epoch": 1.4468914264999093, "grad_norm": 0.9235846996307373, "learning_rate": 4.473362503883656e-05, "loss": 0.3287, "step": 15965 }, { "epoch": 1.4473445713249955, "grad_norm": 0.8958224654197693, "learning_rate": 4.4728768402069745e-05, "loss": 0.2917, "step": 15970 }, { "epoch": 1.4477977161500815, "grad_norm": 0.9621612429618835, "learning_rate": 4.472390979084111e-05, "loss": 0.2907, "step": 15975 }, { "epoch": 1.4482508609751676, "grad_norm": 0.8714088201522827, "learning_rate": 4.4719049205636923e-05, "loss": 0.3604, "step": 15980 }, { "epoch": 1.4487040058002538, "grad_norm": 0.885924756526947, "learning_rate": 4.471418664694362e-05, "loss": 0.3411, "step": 15985 }, { "epoch": 1.4491571506253398, "grad_norm": 0.905998945236206, "learning_rate": 4.470932211524787e-05, "loss": 0.3279, "step": 15990 }, { "epoch": 1.449610295450426, "grad_norm": 0.887562096118927, "learning_rate": 4.470445561103648e-05, "loss": 0.3182, "step": 15995 }, { "epoch": 1.4500634402755121, "grad_norm": 0.8351467847824097, "learning_rate": 4.469958713479652e-05, "loss": 0.3402, "step": 16000 }, { "epoch": 1.450516585100598, "grad_norm": 0.9611178636550903, "learning_rate": 4.469471668701522e-05, "loss": 0.3152, "step": 16005 }, { "epoch": 1.4509697299256843, "grad_norm": 0.8947104811668396, "learning_rate": 4.468984426818002e-05, "loss": 0.3559, "step": 16010 }, { "epoch": 1.4514228747507705, "grad_norm": 0.8748965859413147, "learning_rate": 4.4684969878778534e-05, "loss": 0.3322, "step": 16015 }, { "epoch": 1.4518760195758564, "grad_norm": 1.0739569664001465, "learning_rate": 4.468009351929863e-05, "loss": 0.3069, "step": 16020 }, { "epoch": 1.4523291644009426, "grad_norm": 0.8463712334632874, "learning_rate": 4.4675215190228296e-05, "loss": 0.3284, "step": 16025 }, { "epoch": 1.4527823092260286, "grad_norm": 0.8762927651405334, "learning_rate": 4.467033489205578e-05, "loss": 0.3142, "step": 16030 }, { "epoch": 1.4532354540511148, "grad_norm": 0.9167442321777344, "learning_rate": 4.46654526252695e-05, "loss": 0.298, "step": 16035 }, { "epoch": 1.4536885988762007, "grad_norm": 0.8412947654724121, "learning_rate": 4.466056839035807e-05, "loss": 0.3864, "step": 16040 }, { "epoch": 1.454141743701287, "grad_norm": 0.8310391902923584, "learning_rate": 4.465568218781032e-05, "loss": 0.2938, "step": 16045 }, { "epoch": 1.454594888526373, "grad_norm": 0.9998140335083008, "learning_rate": 4.4650794018115236e-05, "loss": 0.3079, "step": 16050 }, { "epoch": 1.455048033351459, "grad_norm": 0.8466027975082397, "learning_rate": 4.464590388176206e-05, "loss": 0.3563, "step": 16055 }, { "epoch": 1.4555011781765452, "grad_norm": 0.7999390959739685, "learning_rate": 4.464101177924017e-05, "loss": 0.338, "step": 16060 }, { "epoch": 1.4559543230016314, "grad_norm": 0.9268151521682739, "learning_rate": 4.463611771103918e-05, "loss": 0.3237, "step": 16065 }, { "epoch": 1.4564074678267174, "grad_norm": 0.8639420866966248, "learning_rate": 4.463122167764891e-05, "loss": 0.3386, "step": 16070 }, { "epoch": 1.4568606126518036, "grad_norm": 1.017209529876709, "learning_rate": 4.462632367955932e-05, "loss": 0.3317, "step": 16075 }, { "epoch": 1.4573137574768897, "grad_norm": 0.9332828521728516, "learning_rate": 4.4621423717260624e-05, "loss": 0.3515, "step": 16080 }, { "epoch": 1.4577669023019757, "grad_norm": 0.8548428416252136, "learning_rate": 4.461652179124322e-05, "loss": 0.3299, "step": 16085 }, { "epoch": 1.4582200471270619, "grad_norm": 0.8331112861633301, "learning_rate": 4.461161790199767e-05, "loss": 0.3686, "step": 16090 }, { "epoch": 1.4586731919521478, "grad_norm": 0.8921917080879211, "learning_rate": 4.4606712050014775e-05, "loss": 0.3445, "step": 16095 }, { "epoch": 1.459126336777234, "grad_norm": 0.9255713820457458, "learning_rate": 4.4601804235785514e-05, "loss": 0.3722, "step": 16100 }, { "epoch": 1.45957948160232, "grad_norm": 0.8583682775497437, "learning_rate": 4.459689445980105e-05, "loss": 0.3452, "step": 16105 }, { "epoch": 1.4600326264274062, "grad_norm": 0.8613516688346863, "learning_rate": 4.4591982722552775e-05, "loss": 0.388, "step": 16110 }, { "epoch": 1.4604857712524923, "grad_norm": 1.062615990638733, "learning_rate": 4.4587069024532236e-05, "loss": 0.3538, "step": 16115 }, { "epoch": 1.4609389160775783, "grad_norm": 0.8037470579147339, "learning_rate": 4.4582153366231215e-05, "loss": 0.3137, "step": 16120 }, { "epoch": 1.4613920609026645, "grad_norm": 0.8761669397354126, "learning_rate": 4.457723574814167e-05, "loss": 0.321, "step": 16125 }, { "epoch": 1.4618452057277507, "grad_norm": 0.9870803952217102, "learning_rate": 4.4572316170755734e-05, "loss": 0.3318, "step": 16130 }, { "epoch": 1.4622983505528366, "grad_norm": 0.9859821796417236, "learning_rate": 4.45673946345658e-05, "loss": 0.3389, "step": 16135 }, { "epoch": 1.4627514953779228, "grad_norm": 0.9904748201370239, "learning_rate": 4.4562471140064385e-05, "loss": 0.324, "step": 16140 }, { "epoch": 1.463204640203009, "grad_norm": 0.8890429139137268, "learning_rate": 4.4557545687744246e-05, "loss": 0.3522, "step": 16145 }, { "epoch": 1.463657785028095, "grad_norm": 0.8229525685310364, "learning_rate": 4.4552618278098325e-05, "loss": 0.3454, "step": 16150 }, { "epoch": 1.4641109298531811, "grad_norm": 0.9146429896354675, "learning_rate": 4.454768891161976e-05, "loss": 0.3125, "step": 16155 }, { "epoch": 1.464564074678267, "grad_norm": 0.8579217195510864, "learning_rate": 4.4542757588801874e-05, "loss": 0.301, "step": 16160 }, { "epoch": 1.4650172195033533, "grad_norm": 1.0170505046844482, "learning_rate": 4.4537824310138205e-05, "loss": 0.3282, "step": 16165 }, { "epoch": 1.4654703643284392, "grad_norm": 0.8582420349121094, "learning_rate": 4.453288907612248e-05, "loss": 0.3133, "step": 16170 }, { "epoch": 1.4659235091535254, "grad_norm": 0.9170921444892883, "learning_rate": 4.45279518872486e-05, "loss": 0.3379, "step": 16175 }, { "epoch": 1.4663766539786116, "grad_norm": 0.9352853298187256, "learning_rate": 4.452301274401071e-05, "loss": 0.3485, "step": 16180 }, { "epoch": 1.4668297988036976, "grad_norm": 0.875583291053772, "learning_rate": 4.4518071646903094e-05, "loss": 0.3401, "step": 16185 }, { "epoch": 1.4672829436287838, "grad_norm": 0.8831033110618591, "learning_rate": 4.451312859642027e-05, "loss": 0.3229, "step": 16190 }, { "epoch": 1.46773608845387, "grad_norm": 1.0743290185928345, "learning_rate": 4.450818359305694e-05, "loss": 0.3811, "step": 16195 }, { "epoch": 1.468189233278956, "grad_norm": 0.8578141927719116, "learning_rate": 4.4503236637308e-05, "loss": 0.3655, "step": 16200 }, { "epoch": 1.468642378104042, "grad_norm": 0.8375040888786316, "learning_rate": 4.449828772966855e-05, "loss": 0.3172, "step": 16205 }, { "epoch": 1.4690955229291283, "grad_norm": 0.9390548467636108, "learning_rate": 4.449333687063387e-05, "loss": 0.3092, "step": 16210 }, { "epoch": 1.4695486677542142, "grad_norm": 0.912736177444458, "learning_rate": 4.448838406069944e-05, "loss": 0.3118, "step": 16215 }, { "epoch": 1.4700018125793004, "grad_norm": 0.8888869285583496, "learning_rate": 4.4483429300360954e-05, "loss": 0.3142, "step": 16220 }, { "epoch": 1.4704549574043864, "grad_norm": 0.8601214289665222, "learning_rate": 4.4478472590114276e-05, "loss": 0.2787, "step": 16225 }, { "epoch": 1.4709081022294725, "grad_norm": 0.8638772368431091, "learning_rate": 4.4473513930455477e-05, "loss": 0.3325, "step": 16230 }, { "epoch": 1.4713612470545585, "grad_norm": 0.8229684829711914, "learning_rate": 4.4468553321880815e-05, "loss": 0.2917, "step": 16235 }, { "epoch": 1.4718143918796447, "grad_norm": 0.8202263116836548, "learning_rate": 4.446359076488676e-05, "loss": 0.3095, "step": 16240 }, { "epoch": 1.4722675367047309, "grad_norm": 0.927639365196228, "learning_rate": 4.445862625996996e-05, "loss": 0.2686, "step": 16245 }, { "epoch": 1.4727206815298168, "grad_norm": 0.9009503126144409, "learning_rate": 4.4453659807627266e-05, "loss": 0.353, "step": 16250 }, { "epoch": 1.473173826354903, "grad_norm": 0.8713672161102295, "learning_rate": 4.444869140835572e-05, "loss": 0.3041, "step": 16255 }, { "epoch": 1.4736269711799892, "grad_norm": 0.9004248380661011, "learning_rate": 4.444372106265257e-05, "loss": 0.2988, "step": 16260 }, { "epoch": 1.4740801160050752, "grad_norm": 0.8278849720954895, "learning_rate": 4.443874877101524e-05, "loss": 0.3578, "step": 16265 }, { "epoch": 1.4745332608301613, "grad_norm": 0.8071872591972351, "learning_rate": 4.443377453394135e-05, "loss": 0.373, "step": 16270 }, { "epoch": 1.4749864056552475, "grad_norm": 0.8965317606925964, "learning_rate": 4.4428798351928744e-05, "loss": 0.3429, "step": 16275 }, { "epoch": 1.4754395504803335, "grad_norm": 0.8471928834915161, "learning_rate": 4.4423820225475435e-05, "loss": 0.3318, "step": 16280 }, { "epoch": 1.4758926953054197, "grad_norm": 0.9302198886871338, "learning_rate": 4.4418840155079635e-05, "loss": 0.339, "step": 16285 }, { "epoch": 1.4763458401305056, "grad_norm": 0.8837357759475708, "learning_rate": 4.441385814123974e-05, "loss": 0.3886, "step": 16290 }, { "epoch": 1.4767989849555918, "grad_norm": 0.8406584858894348, "learning_rate": 4.440887418445436e-05, "loss": 0.3068, "step": 16295 }, { "epoch": 1.4772521297806778, "grad_norm": 0.8617344498634338, "learning_rate": 4.4403888285222296e-05, "loss": 0.3299, "step": 16300 }, { "epoch": 1.477705274605764, "grad_norm": 0.8395094275474548, "learning_rate": 4.439890044404252e-05, "loss": 0.3262, "step": 16305 }, { "epoch": 1.4781584194308501, "grad_norm": 0.8789792060852051, "learning_rate": 4.439391066141424e-05, "loss": 0.2972, "step": 16310 }, { "epoch": 1.478611564255936, "grad_norm": 0.9660056233406067, "learning_rate": 4.438891893783682e-05, "loss": 0.3676, "step": 16315 }, { "epoch": 1.4790647090810223, "grad_norm": 0.9812306761741638, "learning_rate": 4.4383925273809847e-05, "loss": 0.3803, "step": 16320 }, { "epoch": 1.4795178539061085, "grad_norm": 0.9735592603683472, "learning_rate": 4.437892966983307e-05, "loss": 0.3405, "step": 16325 }, { "epoch": 1.4799709987311944, "grad_norm": 0.8526990413665771, "learning_rate": 4.437393212640647e-05, "loss": 0.3475, "step": 16330 }, { "epoch": 1.4804241435562806, "grad_norm": 0.8899300694465637, "learning_rate": 4.436893264403019e-05, "loss": 0.3251, "step": 16335 }, { "epoch": 1.4808772883813668, "grad_norm": 0.8965761661529541, "learning_rate": 4.436393122320458e-05, "loss": 0.2951, "step": 16340 }, { "epoch": 1.4813304332064527, "grad_norm": 0.9336226582527161, "learning_rate": 4.435892786443019e-05, "loss": 0.3185, "step": 16345 }, { "epoch": 1.481783578031539, "grad_norm": 0.8406663537025452, "learning_rate": 4.435392256820775e-05, "loss": 0.3085, "step": 16350 }, { "epoch": 1.482236722856625, "grad_norm": 0.843023419380188, "learning_rate": 4.4348915335038196e-05, "loss": 0.3089, "step": 16355 }, { "epoch": 1.482689867681711, "grad_norm": 0.8891059160232544, "learning_rate": 4.4343906165422665e-05, "loss": 0.335, "step": 16360 }, { "epoch": 1.4831430125067973, "grad_norm": 0.8593640923500061, "learning_rate": 4.433889505986246e-05, "loss": 0.3193, "step": 16365 }, { "epoch": 1.4835961573318832, "grad_norm": 0.8993439674377441, "learning_rate": 4.43338820188591e-05, "loss": 0.3327, "step": 16370 }, { "epoch": 1.4840493021569694, "grad_norm": 0.8503761291503906, "learning_rate": 4.432886704291429e-05, "loss": 0.3531, "step": 16375 }, { "epoch": 1.4845024469820554, "grad_norm": 0.9606332182884216, "learning_rate": 4.4323850132529934e-05, "loss": 0.2944, "step": 16380 }, { "epoch": 1.4849555918071415, "grad_norm": 0.8548496961593628, "learning_rate": 4.4318831288208115e-05, "loss": 0.2957, "step": 16385 }, { "epoch": 1.4854087366322277, "grad_norm": 0.8917195796966553, "learning_rate": 4.431381051045115e-05, "loss": 0.3134, "step": 16390 }, { "epoch": 1.4858618814573137, "grad_norm": 0.9505776762962341, "learning_rate": 4.4308787799761485e-05, "loss": 0.3097, "step": 16395 }, { "epoch": 1.4863150262823999, "grad_norm": 0.9188403487205505, "learning_rate": 4.430376315664181e-05, "loss": 0.3279, "step": 16400 }, { "epoch": 1.486768171107486, "grad_norm": 0.8815337419509888, "learning_rate": 4.4298736581595e-05, "loss": 0.3098, "step": 16405 }, { "epoch": 1.487221315932572, "grad_norm": 0.8372187614440918, "learning_rate": 4.42937080751241e-05, "loss": 0.3434, "step": 16410 }, { "epoch": 1.4876744607576582, "grad_norm": 0.8718562722206116, "learning_rate": 4.428867763773238e-05, "loss": 0.3224, "step": 16415 }, { "epoch": 1.4881276055827444, "grad_norm": 0.911737859249115, "learning_rate": 4.428364526992328e-05, "loss": 0.3458, "step": 16420 }, { "epoch": 1.4885807504078303, "grad_norm": 0.9692966341972351, "learning_rate": 4.427861097220044e-05, "loss": 0.3322, "step": 16425 }, { "epoch": 1.4890338952329165, "grad_norm": 0.8757699728012085, "learning_rate": 4.42735747450677e-05, "loss": 0.3392, "step": 16430 }, { "epoch": 1.4894870400580025, "grad_norm": 0.949142336845398, "learning_rate": 4.4268536589029087e-05, "loss": 0.3287, "step": 16435 }, { "epoch": 1.4899401848830887, "grad_norm": 0.8817501068115234, "learning_rate": 4.426349650458881e-05, "loss": 0.3217, "step": 16440 }, { "epoch": 1.4903933297081746, "grad_norm": 0.9576331377029419, "learning_rate": 4.42584544922513e-05, "loss": 0.328, "step": 16445 }, { "epoch": 1.4908464745332608, "grad_norm": 0.8747189044952393, "learning_rate": 4.425341055252115e-05, "loss": 0.3322, "step": 16450 }, { "epoch": 1.491299619358347, "grad_norm": 0.8452276587486267, "learning_rate": 4.424836468590317e-05, "loss": 0.2987, "step": 16455 }, { "epoch": 1.491752764183433, "grad_norm": 0.8514413833618164, "learning_rate": 4.424331689290233e-05, "loss": 0.3159, "step": 16460 }, { "epoch": 1.4922059090085191, "grad_norm": 0.876915454864502, "learning_rate": 4.4238267174023837e-05, "loss": 0.2778, "step": 16465 }, { "epoch": 1.4926590538336053, "grad_norm": 0.9552947282791138, "learning_rate": 4.423321552977307e-05, "loss": 0.3471, "step": 16470 }, { "epoch": 1.4931121986586913, "grad_norm": 0.8758605718612671, "learning_rate": 4.422816196065558e-05, "loss": 0.3243, "step": 16475 }, { "epoch": 1.4935653434837775, "grad_norm": 0.9883660078048706, "learning_rate": 4.422310646717714e-05, "loss": 0.3797, "step": 16480 }, { "epoch": 1.4940184883088636, "grad_norm": 0.8535401225090027, "learning_rate": 4.421804904984371e-05, "loss": 0.3219, "step": 16485 }, { "epoch": 1.4944716331339496, "grad_norm": 0.9459010362625122, "learning_rate": 4.421298970916144e-05, "loss": 0.3438, "step": 16490 }, { "epoch": 1.4949247779590358, "grad_norm": 1.0566688776016235, "learning_rate": 4.420792844563666e-05, "loss": 0.3505, "step": 16495 }, { "epoch": 1.4953779227841217, "grad_norm": 0.9175550937652588, "learning_rate": 4.420286525977591e-05, "loss": 0.3554, "step": 16500 }, { "epoch": 1.495831067609208, "grad_norm": 0.8189864158630371, "learning_rate": 4.419780015208591e-05, "loss": 0.3116, "step": 16505 }, { "epoch": 1.4962842124342939, "grad_norm": 0.9402145743370056, "learning_rate": 4.419273312307358e-05, "loss": 0.3282, "step": 16510 }, { "epoch": 1.49673735725938, "grad_norm": 0.8571115732192993, "learning_rate": 4.418766417324603e-05, "loss": 0.2791, "step": 16515 }, { "epoch": 1.4971905020844662, "grad_norm": 0.838041365146637, "learning_rate": 4.418259330311058e-05, "loss": 0.3433, "step": 16520 }, { "epoch": 1.4976436469095522, "grad_norm": 0.8882701992988586, "learning_rate": 4.417752051317469e-05, "loss": 0.3502, "step": 16525 }, { "epoch": 1.4980967917346384, "grad_norm": 1.0503344535827637, "learning_rate": 4.417244580394607e-05, "loss": 0.3273, "step": 16530 }, { "epoch": 1.4985499365597246, "grad_norm": 0.9523491263389587, "learning_rate": 4.4167369175932594e-05, "loss": 0.3061, "step": 16535 }, { "epoch": 1.4990030813848105, "grad_norm": 0.8217817544937134, "learning_rate": 4.416229062964232e-05, "loss": 0.2851, "step": 16540 }, { "epoch": 1.4994562262098967, "grad_norm": 0.9434483051300049, "learning_rate": 4.4157210165583535e-05, "loss": 0.3922, "step": 16545 }, { "epoch": 1.499909371034983, "grad_norm": 0.9641083478927612, "learning_rate": 4.4152127784264676e-05, "loss": 0.3414, "step": 16550 }, { "epoch": 1.5003625158600689, "grad_norm": 0.8322111964225769, "learning_rate": 4.41470434861944e-05, "loss": 0.2907, "step": 16555 }, { "epoch": 1.5008156606851548, "grad_norm": 0.8505094051361084, "learning_rate": 4.414195727188153e-05, "loss": 0.2997, "step": 16560 }, { "epoch": 1.5012688055102412, "grad_norm": 0.9184703230857849, "learning_rate": 4.413686914183511e-05, "loss": 0.3609, "step": 16565 }, { "epoch": 1.5017219503353272, "grad_norm": 0.8251420855522156, "learning_rate": 4.413177909656435e-05, "loss": 0.2827, "step": 16570 }, { "epoch": 1.5021750951604131, "grad_norm": 0.7987921833992004, "learning_rate": 4.412668713657868e-05, "loss": 0.2976, "step": 16575 }, { "epoch": 1.5026282399854993, "grad_norm": 0.8203641772270203, "learning_rate": 4.412159326238769e-05, "loss": 0.3424, "step": 16580 }, { "epoch": 1.5030813848105855, "grad_norm": 0.8569257855415344, "learning_rate": 4.4116497474501184e-05, "loss": 0.3099, "step": 16585 }, { "epoch": 1.5035345296356715, "grad_norm": 0.844150722026825, "learning_rate": 4.4111399773429155e-05, "loss": 0.38, "step": 16590 }, { "epoch": 1.5039876744607577, "grad_norm": 0.8691183924674988, "learning_rate": 4.4106300159681765e-05, "loss": 0.2866, "step": 16595 }, { "epoch": 1.5044408192858438, "grad_norm": 0.9741607904434204, "learning_rate": 4.4101198633769396e-05, "loss": 0.3249, "step": 16600 }, { "epoch": 1.5048939641109298, "grad_norm": 0.9396120309829712, "learning_rate": 4.409609519620261e-05, "loss": 0.3564, "step": 16605 }, { "epoch": 1.505347108936016, "grad_norm": 0.7792648077011108, "learning_rate": 4.409098984749217e-05, "loss": 0.2891, "step": 16610 }, { "epoch": 1.5058002537611022, "grad_norm": 0.8615527153015137, "learning_rate": 4.408588258814901e-05, "loss": 0.3381, "step": 16615 }, { "epoch": 1.5062533985861881, "grad_norm": 0.9389621019363403, "learning_rate": 4.4080773418684265e-05, "loss": 0.2808, "step": 16620 }, { "epoch": 1.506706543411274, "grad_norm": 0.9936752915382385, "learning_rate": 4.4075662339609266e-05, "loss": 0.2844, "step": 16625 }, { "epoch": 1.5071596882363605, "grad_norm": 0.7607190012931824, "learning_rate": 4.407054935143554e-05, "loss": 0.2795, "step": 16630 }, { "epoch": 1.5076128330614464, "grad_norm": 0.7438227534294128, "learning_rate": 4.4065434454674784e-05, "loss": 0.3402, "step": 16635 }, { "epoch": 1.5080659778865324, "grad_norm": 0.8970581293106079, "learning_rate": 4.40603176498389e-05, "loss": 0.3462, "step": 16640 }, { "epoch": 1.5085191227116186, "grad_norm": 0.9115033149719238, "learning_rate": 4.405519893743998e-05, "loss": 0.3237, "step": 16645 }, { "epoch": 1.5089722675367048, "grad_norm": 0.8780820369720459, "learning_rate": 4.405007831799031e-05, "loss": 0.3258, "step": 16650 }, { "epoch": 1.5094254123617907, "grad_norm": 0.9339021444320679, "learning_rate": 4.404495579200236e-05, "loss": 0.3397, "step": 16655 }, { "epoch": 1.509878557186877, "grad_norm": 0.8274199366569519, "learning_rate": 4.40398313599888e-05, "loss": 0.2829, "step": 16660 }, { "epoch": 1.510331702011963, "grad_norm": 0.9540591835975647, "learning_rate": 4.403470502246249e-05, "loss": 0.314, "step": 16665 }, { "epoch": 1.510784846837049, "grad_norm": 0.9711896777153015, "learning_rate": 4.4029576779936456e-05, "loss": 0.3334, "step": 16670 }, { "epoch": 1.5112379916621352, "grad_norm": 0.7909640669822693, "learning_rate": 4.402444663292394e-05, "loss": 0.3129, "step": 16675 }, { "epoch": 1.5116911364872214, "grad_norm": 0.824843168258667, "learning_rate": 4.4019314581938386e-05, "loss": 0.3277, "step": 16680 }, { "epoch": 1.5121442813123074, "grad_norm": 1.025460124015808, "learning_rate": 4.401418062749339e-05, "loss": 0.3108, "step": 16685 }, { "epoch": 1.5125974261373936, "grad_norm": 0.9290443658828735, "learning_rate": 4.400904477010278e-05, "loss": 0.3511, "step": 16690 }, { "epoch": 1.5130505709624797, "grad_norm": 0.8760688900947571, "learning_rate": 4.4003907010280534e-05, "loss": 0.3488, "step": 16695 }, { "epoch": 1.5135037157875657, "grad_norm": 0.8707441687583923, "learning_rate": 4.399876734854085e-05, "loss": 0.3291, "step": 16700 }, { "epoch": 1.5139568606126517, "grad_norm": 0.9163396954536438, "learning_rate": 4.399362578539811e-05, "loss": 0.3319, "step": 16705 }, { "epoch": 1.5144100054377378, "grad_norm": 0.7590965032577515, "learning_rate": 4.398848232136687e-05, "loss": 0.3399, "step": 16710 }, { "epoch": 1.514863150262824, "grad_norm": 0.7996467351913452, "learning_rate": 4.398333695696192e-05, "loss": 0.3059, "step": 16715 }, { "epoch": 1.51531629508791, "grad_norm": 0.924118161201477, "learning_rate": 4.397818969269817e-05, "loss": 0.3277, "step": 16720 }, { "epoch": 1.5157694399129962, "grad_norm": 0.89335036277771, "learning_rate": 4.39730405290908e-05, "loss": 0.3226, "step": 16725 }, { "epoch": 1.5162225847380824, "grad_norm": 0.8390341997146606, "learning_rate": 4.396788946665511e-05, "loss": 0.3439, "step": 16730 }, { "epoch": 1.5166757295631683, "grad_norm": 0.9174239635467529, "learning_rate": 4.396273650590663e-05, "loss": 0.3056, "step": 16735 }, { "epoch": 1.5171288743882545, "grad_norm": 0.8130680918693542, "learning_rate": 4.3957581647361066e-05, "loss": 0.3521, "step": 16740 }, { "epoch": 1.5175820192133407, "grad_norm": 0.9275286197662354, "learning_rate": 4.3952424891534325e-05, "loss": 0.3136, "step": 16745 }, { "epoch": 1.5180351640384266, "grad_norm": 0.910183846950531, "learning_rate": 4.39472662389425e-05, "loss": 0.3007, "step": 16750 }, { "epoch": 1.5184883088635128, "grad_norm": 0.9117527008056641, "learning_rate": 4.394210569010185e-05, "loss": 0.3228, "step": 16755 }, { "epoch": 1.518941453688599, "grad_norm": 0.9314092397689819, "learning_rate": 4.393694324552887e-05, "loss": 0.3454, "step": 16760 }, { "epoch": 1.519394598513685, "grad_norm": 0.8119666576385498, "learning_rate": 4.3931778905740216e-05, "loss": 0.3289, "step": 16765 }, { "epoch": 1.519847743338771, "grad_norm": 0.8956707119941711, "learning_rate": 4.392661267125272e-05, "loss": 0.2714, "step": 16770 }, { "epoch": 1.5203008881638573, "grad_norm": 0.8850768208503723, "learning_rate": 4.392144454258343e-05, "loss": 0.3217, "step": 16775 }, { "epoch": 1.5207540329889433, "grad_norm": 1.021035075187683, "learning_rate": 4.3916274520249575e-05, "loss": 0.3359, "step": 16780 }, { "epoch": 1.5212071778140293, "grad_norm": 0.9658727049827576, "learning_rate": 4.3911102604768574e-05, "loss": 0.4008, "step": 16785 }, { "epoch": 1.5216603226391154, "grad_norm": 0.8194105625152588, "learning_rate": 4.390592879665802e-05, "loss": 0.3205, "step": 16790 }, { "epoch": 1.5221134674642016, "grad_norm": 0.8324549198150635, "learning_rate": 4.3900753096435734e-05, "loss": 0.2826, "step": 16795 }, { "epoch": 1.5225666122892876, "grad_norm": 0.908935546875, "learning_rate": 4.389557550461968e-05, "loss": 0.3463, "step": 16800 }, { "epoch": 1.5230197571143738, "grad_norm": 0.787485659122467, "learning_rate": 4.389039602172804e-05, "loss": 0.2937, "step": 16805 }, { "epoch": 1.52347290193946, "grad_norm": 0.8178339004516602, "learning_rate": 4.38852146482792e-05, "loss": 0.3256, "step": 16810 }, { "epoch": 1.523926046764546, "grad_norm": 0.8291727900505066, "learning_rate": 4.388003138479167e-05, "loss": 0.3446, "step": 16815 }, { "epoch": 1.524379191589632, "grad_norm": 0.8015860319137573, "learning_rate": 4.3874846231784225e-05, "loss": 0.3281, "step": 16820 }, { "epoch": 1.5248323364147183, "grad_norm": 0.8130186796188354, "learning_rate": 4.386965918977579e-05, "loss": 0.3185, "step": 16825 }, { "epoch": 1.5252854812398042, "grad_norm": 0.8767794370651245, "learning_rate": 4.3864470259285484e-05, "loss": 0.3182, "step": 16830 }, { "epoch": 1.5257386260648902, "grad_norm": 0.8263176679611206, "learning_rate": 4.385927944083261e-05, "loss": 0.3097, "step": 16835 }, { "epoch": 1.5261917708899766, "grad_norm": 0.9067090153694153, "learning_rate": 4.385408673493669e-05, "loss": 0.3134, "step": 16840 }, { "epoch": 1.5266449157150626, "grad_norm": 0.9445196390151978, "learning_rate": 4.384889214211738e-05, "loss": 0.3519, "step": 16845 }, { "epoch": 1.5270980605401485, "grad_norm": 0.9353730082511902, "learning_rate": 4.3843695662894586e-05, "loss": 0.3183, "step": 16850 }, { "epoch": 1.5275512053652347, "grad_norm": 0.954073965549469, "learning_rate": 4.383849729778835e-05, "loss": 0.3621, "step": 16855 }, { "epoch": 1.5280043501903209, "grad_norm": 0.8924869298934937, "learning_rate": 4.3833297047318944e-05, "loss": 0.294, "step": 16860 }, { "epoch": 1.5284574950154068, "grad_norm": 0.8494390845298767, "learning_rate": 4.38280949120068e-05, "loss": 0.3253, "step": 16865 }, { "epoch": 1.528910639840493, "grad_norm": 0.8728858828544617, "learning_rate": 4.382289089237255e-05, "loss": 0.3422, "step": 16870 }, { "epoch": 1.5293637846655792, "grad_norm": 0.8005915880203247, "learning_rate": 4.3817684988937016e-05, "loss": 0.3313, "step": 16875 }, { "epoch": 1.5298169294906652, "grad_norm": 0.8793318271636963, "learning_rate": 4.3812477202221206e-05, "loss": 0.2962, "step": 16880 }, { "epoch": 1.5302700743157513, "grad_norm": 0.9239771366119385, "learning_rate": 4.380726753274632e-05, "loss": 0.3644, "step": 16885 }, { "epoch": 1.5307232191408375, "grad_norm": 0.8735407590866089, "learning_rate": 4.380205598103374e-05, "loss": 0.3494, "step": 16890 }, { "epoch": 1.5311763639659235, "grad_norm": 0.8048292398452759, "learning_rate": 4.379684254760504e-05, "loss": 0.3068, "step": 16895 }, { "epoch": 1.5316295087910095, "grad_norm": 0.7478331923484802, "learning_rate": 4.379162723298199e-05, "loss": 0.2892, "step": 16900 }, { "epoch": 1.5320826536160959, "grad_norm": 1.178605556488037, "learning_rate": 4.3786410037686534e-05, "loss": 0.3465, "step": 16905 }, { "epoch": 1.5325357984411818, "grad_norm": 0.8216167092323303, "learning_rate": 4.378119096224081e-05, "loss": 0.3197, "step": 16910 }, { "epoch": 1.5329889432662678, "grad_norm": 0.8639706373214722, "learning_rate": 4.3775970007167146e-05, "loss": 0.2846, "step": 16915 }, { "epoch": 1.533442088091354, "grad_norm": 0.8658016324043274, "learning_rate": 4.377074717298805e-05, "loss": 0.3232, "step": 16920 }, { "epoch": 1.5338952329164401, "grad_norm": 0.8472742438316345, "learning_rate": 4.376552246022623e-05, "loss": 0.3171, "step": 16925 }, { "epoch": 1.534348377741526, "grad_norm": 0.8074241876602173, "learning_rate": 4.3760295869404586e-05, "loss": 0.3166, "step": 16930 }, { "epoch": 1.5348015225666123, "grad_norm": 0.8179529905319214, "learning_rate": 4.375506740104619e-05, "loss": 0.2646, "step": 16935 }, { "epoch": 1.5352546673916985, "grad_norm": 0.8430346846580505, "learning_rate": 4.3749837055674304e-05, "loss": 0.3029, "step": 16940 }, { "epoch": 1.5357078122167844, "grad_norm": 0.9465624690055847, "learning_rate": 4.374460483381239e-05, "loss": 0.3625, "step": 16945 }, { "epoch": 1.5361609570418706, "grad_norm": 0.9336488246917725, "learning_rate": 4.373937073598408e-05, "loss": 0.3262, "step": 16950 }, { "epoch": 1.5366141018669568, "grad_norm": 0.8974557518959045, "learning_rate": 4.373413476271322e-05, "loss": 0.3022, "step": 16955 }, { "epoch": 1.5370672466920428, "grad_norm": 1.0039215087890625, "learning_rate": 4.372889691452382e-05, "loss": 0.3146, "step": 16960 }, { "epoch": 1.5375203915171287, "grad_norm": 0.8281719088554382, "learning_rate": 4.3723657191940083e-05, "loss": 0.3368, "step": 16965 }, { "epoch": 1.5379735363422151, "grad_norm": 0.8318036794662476, "learning_rate": 4.371841559548641e-05, "loss": 0.2931, "step": 16970 }, { "epoch": 1.538426681167301, "grad_norm": 0.9552793502807617, "learning_rate": 4.371317212568736e-05, "loss": 0.3157, "step": 16975 }, { "epoch": 1.538879825992387, "grad_norm": 0.8462011218070984, "learning_rate": 4.370792678306773e-05, "loss": 0.3101, "step": 16980 }, { "epoch": 1.5393329708174732, "grad_norm": 0.8346260190010071, "learning_rate": 4.3702679568152466e-05, "loss": 0.3138, "step": 16985 }, { "epoch": 1.5397861156425594, "grad_norm": 0.9434829354286194, "learning_rate": 4.369743048146671e-05, "loss": 0.3599, "step": 16990 }, { "epoch": 1.5402392604676454, "grad_norm": 0.8829513192176819, "learning_rate": 4.369217952353578e-05, "loss": 0.3188, "step": 16995 }, { "epoch": 1.5406924052927315, "grad_norm": 0.8973281979560852, "learning_rate": 4.368692669488521e-05, "loss": 0.3105, "step": 17000 }, { "epoch": 1.5411455501178177, "grad_norm": 0.7946110367774963, "learning_rate": 4.36816719960407e-05, "loss": 0.3033, "step": 17005 }, { "epoch": 1.5415986949429037, "grad_norm": 0.9422683119773865, "learning_rate": 4.367641542752814e-05, "loss": 0.3615, "step": 17010 }, { "epoch": 1.5420518397679899, "grad_norm": 1.0310269594192505, "learning_rate": 4.3671156989873605e-05, "loss": 0.2868, "step": 17015 }, { "epoch": 1.542504984593076, "grad_norm": 0.850303053855896, "learning_rate": 4.366589668360338e-05, "loss": 0.3018, "step": 17020 }, { "epoch": 1.542958129418162, "grad_norm": 0.8689785599708557, "learning_rate": 4.366063450924389e-05, "loss": 0.3135, "step": 17025 }, { "epoch": 1.543411274243248, "grad_norm": 0.8805714249610901, "learning_rate": 4.36553704673218e-05, "loss": 0.325, "step": 17030 }, { "epoch": 1.5438644190683344, "grad_norm": 0.8789202570915222, "learning_rate": 4.3650104558363924e-05, "loss": 0.3293, "step": 17035 }, { "epoch": 1.5443175638934203, "grad_norm": 0.837662935256958, "learning_rate": 4.364483678289728e-05, "loss": 0.29, "step": 17040 }, { "epoch": 1.5447707087185063, "grad_norm": 0.8155145645141602, "learning_rate": 4.3639567141449065e-05, "loss": 0.2827, "step": 17045 }, { "epoch": 1.5452238535435925, "grad_norm": 0.9367856383323669, "learning_rate": 4.363429563454667e-05, "loss": 0.3, "step": 17050 }, { "epoch": 1.5456769983686787, "grad_norm": 0.9272308945655823, "learning_rate": 4.3629022262717676e-05, "loss": 0.3807, "step": 17055 }, { "epoch": 1.5461301431937646, "grad_norm": 0.8485404253005981, "learning_rate": 4.362374702648983e-05, "loss": 0.3635, "step": 17060 }, { "epoch": 1.5465832880188508, "grad_norm": 0.8587406873703003, "learning_rate": 4.361846992639109e-05, "loss": 0.2886, "step": 17065 }, { "epoch": 1.547036432843937, "grad_norm": 0.9226687550544739, "learning_rate": 4.361319096294958e-05, "loss": 0.3472, "step": 17070 }, { "epoch": 1.547489577669023, "grad_norm": 0.8939982056617737, "learning_rate": 4.360791013669363e-05, "loss": 0.3561, "step": 17075 }, { "epoch": 1.5479427224941091, "grad_norm": 0.80282062292099, "learning_rate": 4.360262744815174e-05, "loss": 0.3456, "step": 17080 }, { "epoch": 1.5483958673191953, "grad_norm": 0.7844919562339783, "learning_rate": 4.359734289785261e-05, "loss": 0.303, "step": 17085 }, { "epoch": 1.5488490121442813, "grad_norm": 0.8648966550827026, "learning_rate": 4.359205648632512e-05, "loss": 0.3143, "step": 17090 }, { "epoch": 1.5493021569693675, "grad_norm": 0.831176221370697, "learning_rate": 4.358676821409832e-05, "loss": 0.3207, "step": 17095 }, { "epoch": 1.5497553017944536, "grad_norm": 0.8092813491821289, "learning_rate": 4.3581478081701474e-05, "loss": 0.3079, "step": 17100 }, { "epoch": 1.5502084466195396, "grad_norm": 0.7268885374069214, "learning_rate": 4.357618608966403e-05, "loss": 0.2891, "step": 17105 }, { "epoch": 1.5506615914446256, "grad_norm": 0.8993212580680847, "learning_rate": 4.3570892238515606e-05, "loss": 0.3347, "step": 17110 }, { "epoch": 1.5511147362697117, "grad_norm": 0.7736019492149353, "learning_rate": 4.3565596528786e-05, "loss": 0.3265, "step": 17115 }, { "epoch": 1.551567881094798, "grad_norm": 0.8949097990989685, "learning_rate": 4.356029896100521e-05, "loss": 0.316, "step": 17120 }, { "epoch": 1.552021025919884, "grad_norm": 0.9289581775665283, "learning_rate": 4.3554999535703436e-05, "loss": 0.3676, "step": 17125 }, { "epoch": 1.55247417074497, "grad_norm": 0.8606915473937988, "learning_rate": 4.3549698253411037e-05, "loss": 0.2842, "step": 17130 }, { "epoch": 1.5529273155700563, "grad_norm": 0.9461026191711426, "learning_rate": 4.354439511465856e-05, "loss": 0.3197, "step": 17135 }, { "epoch": 1.5533804603951422, "grad_norm": 0.8128757476806641, "learning_rate": 4.353909011997675e-05, "loss": 0.2725, "step": 17140 }, { "epoch": 1.5538336052202284, "grad_norm": 0.9130784869194031, "learning_rate": 4.3533783269896545e-05, "loss": 0.3505, "step": 17145 }, { "epoch": 1.5542867500453146, "grad_norm": 0.8968615531921387, "learning_rate": 4.352847456494903e-05, "loss": 0.3993, "step": 17150 }, { "epoch": 1.5547398948704005, "grad_norm": 0.8755431771278381, "learning_rate": 4.352316400566553e-05, "loss": 0.335, "step": 17155 }, { "epoch": 1.5551930396954867, "grad_norm": 1.0983177423477173, "learning_rate": 4.3517851592577506e-05, "loss": 0.3284, "step": 17160 }, { "epoch": 1.555646184520573, "grad_norm": 0.8288461565971375, "learning_rate": 4.351253732621664e-05, "loss": 0.2977, "step": 17165 }, { "epoch": 1.5560993293456589, "grad_norm": 0.9183955788612366, "learning_rate": 4.350722120711478e-05, "loss": 0.345, "step": 17170 }, { "epoch": 1.5565524741707448, "grad_norm": 0.8405925035476685, "learning_rate": 4.350190323580396e-05, "loss": 0.2974, "step": 17175 }, { "epoch": 1.5570056189958312, "grad_norm": 0.8974467515945435, "learning_rate": 4.3496583412816414e-05, "loss": 0.3392, "step": 17180 }, { "epoch": 1.5574587638209172, "grad_norm": 0.8332897424697876, "learning_rate": 4.349126173868455e-05, "loss": 0.3145, "step": 17185 }, { "epoch": 1.5579119086460032, "grad_norm": 0.9360265135765076, "learning_rate": 4.348593821394096e-05, "loss": 0.3018, "step": 17190 }, { "epoch": 1.5583650534710893, "grad_norm": 0.8886414170265198, "learning_rate": 4.348061283911842e-05, "loss": 0.3131, "step": 17195 }, { "epoch": 1.5588181982961755, "grad_norm": 0.9816023111343384, "learning_rate": 4.3475285614749895e-05, "loss": 0.3317, "step": 17200 }, { "epoch": 1.5592713431212615, "grad_norm": 0.9204791188240051, "learning_rate": 4.3469956541368554e-05, "loss": 0.3185, "step": 17205 }, { "epoch": 1.5597244879463477, "grad_norm": 0.9188042283058167, "learning_rate": 4.3464625619507706e-05, "loss": 0.3357, "step": 17210 }, { "epoch": 1.5601776327714338, "grad_norm": 0.9981820583343506, "learning_rate": 4.345929284970088e-05, "loss": 0.351, "step": 17215 }, { "epoch": 1.5606307775965198, "grad_norm": 0.9189842343330383, "learning_rate": 4.345395823248179e-05, "loss": 0.3157, "step": 17220 }, { "epoch": 1.561083922421606, "grad_norm": 0.9693995714187622, "learning_rate": 4.344862176838433e-05, "loss": 0.3611, "step": 17225 }, { "epoch": 1.5615370672466922, "grad_norm": 0.8076170682907104, "learning_rate": 4.344328345794256e-05, "loss": 0.2953, "step": 17230 }, { "epoch": 1.5619902120717781, "grad_norm": 0.9646245241165161, "learning_rate": 4.343794330169075e-05, "loss": 0.2932, "step": 17235 }, { "epoch": 1.562443356896864, "grad_norm": 0.8387430906295776, "learning_rate": 4.343260130016335e-05, "loss": 0.296, "step": 17240 }, { "epoch": 1.5628965017219505, "grad_norm": 0.9376871585845947, "learning_rate": 4.3427257453894964e-05, "loss": 0.3564, "step": 17245 }, { "epoch": 1.5633496465470365, "grad_norm": 0.9160321354866028, "learning_rate": 4.342191176342043e-05, "loss": 0.332, "step": 17250 }, { "epoch": 1.5638027913721224, "grad_norm": 0.8078805208206177, "learning_rate": 4.341656422927474e-05, "loss": 0.3068, "step": 17255 }, { "epoch": 1.5642559361972086, "grad_norm": 0.8762774467468262, "learning_rate": 4.341121485199308e-05, "loss": 0.315, "step": 17260 }, { "epoch": 1.5647090810222948, "grad_norm": 0.8939506411552429, "learning_rate": 4.340586363211081e-05, "loss": 0.3024, "step": 17265 }, { "epoch": 1.5651622258473807, "grad_norm": 0.9131081104278564, "learning_rate": 4.340051057016348e-05, "loss": 0.3146, "step": 17270 }, { "epoch": 1.565615370672467, "grad_norm": 0.8572666645050049, "learning_rate": 4.339515566668684e-05, "loss": 0.3007, "step": 17275 }, { "epoch": 1.566068515497553, "grad_norm": 0.8303747773170471, "learning_rate": 4.3389798922216795e-05, "loss": 0.2835, "step": 17280 }, { "epoch": 1.566521660322639, "grad_norm": 0.906228244304657, "learning_rate": 4.338444033728946e-05, "loss": 0.352, "step": 17285 }, { "epoch": 1.5669748051477252, "grad_norm": 0.9005520939826965, "learning_rate": 4.3379079912441125e-05, "loss": 0.3528, "step": 17290 }, { "epoch": 1.5674279499728114, "grad_norm": 0.8414116501808167, "learning_rate": 4.3373717648208254e-05, "loss": 0.2938, "step": 17295 }, { "epoch": 1.5678810947978974, "grad_norm": 0.9385018944740295, "learning_rate": 4.336835354512751e-05, "loss": 0.369, "step": 17300 }, { "epoch": 1.5683342396229834, "grad_norm": 0.8854381442070007, "learning_rate": 4.336298760373574e-05, "loss": 0.3573, "step": 17305 }, { "epoch": 1.5687873844480698, "grad_norm": 0.8372178077697754, "learning_rate": 4.335761982456996e-05, "loss": 0.3136, "step": 17310 }, { "epoch": 1.5692405292731557, "grad_norm": 1.082476258277893, "learning_rate": 4.335225020816738e-05, "loss": 0.3341, "step": 17315 }, { "epoch": 1.5696936740982417, "grad_norm": 0.8238819241523743, "learning_rate": 4.334687875506539e-05, "loss": 0.339, "step": 17320 }, { "epoch": 1.5701468189233279, "grad_norm": 0.805468738079071, "learning_rate": 4.334150546580158e-05, "loss": 0.3341, "step": 17325 }, { "epoch": 1.570599963748414, "grad_norm": 0.8247341513633728, "learning_rate": 4.33361303409137e-05, "loss": 0.3029, "step": 17330 }, { "epoch": 1.5710531085735, "grad_norm": 0.8731282949447632, "learning_rate": 4.33307533809397e-05, "loss": 0.2887, "step": 17335 }, { "epoch": 1.5715062533985862, "grad_norm": 1.1330907344818115, "learning_rate": 4.33253745864177e-05, "loss": 0.3643, "step": 17340 }, { "epoch": 1.5719593982236724, "grad_norm": 1.0221962928771973, "learning_rate": 4.331999395788602e-05, "loss": 0.3212, "step": 17345 }, { "epoch": 1.5724125430487583, "grad_norm": 0.7916745543479919, "learning_rate": 4.331461149588315e-05, "loss": 0.3147, "step": 17350 }, { "epoch": 1.5728656878738445, "grad_norm": 0.8000069856643677, "learning_rate": 4.3309227200947775e-05, "loss": 0.3555, "step": 17355 }, { "epoch": 1.5733188326989307, "grad_norm": 0.8988903164863586, "learning_rate": 4.3303841073618746e-05, "loss": 0.3137, "step": 17360 }, { "epoch": 1.5737719775240167, "grad_norm": 0.8684631586074829, "learning_rate": 4.329845311443511e-05, "loss": 0.3424, "step": 17365 }, { "epoch": 1.5742251223491026, "grad_norm": 0.9315049648284912, "learning_rate": 4.329306332393611e-05, "loss": 0.3738, "step": 17370 }, { "epoch": 1.574678267174189, "grad_norm": 1.0162602663040161, "learning_rate": 4.3287671702661145e-05, "loss": 0.3351, "step": 17375 }, { "epoch": 1.575131411999275, "grad_norm": 0.948407769203186, "learning_rate": 4.328227825114981e-05, "loss": 0.279, "step": 17380 }, { "epoch": 1.575584556824361, "grad_norm": 0.8442184925079346, "learning_rate": 4.32768829699419e-05, "loss": 0.3233, "step": 17385 }, { "epoch": 1.5760377016494471, "grad_norm": 0.9138738512992859, "learning_rate": 4.3271485859577356e-05, "loss": 0.3041, "step": 17390 }, { "epoch": 1.5764908464745333, "grad_norm": 0.8257153034210205, "learning_rate": 4.3266086920596326e-05, "loss": 0.3319, "step": 17395 }, { "epoch": 1.5769439912996193, "grad_norm": 0.770224392414093, "learning_rate": 4.326068615353915e-05, "loss": 0.2779, "step": 17400 }, { "epoch": 1.5773971361247054, "grad_norm": 0.8436338305473328, "learning_rate": 4.325528355894633e-05, "loss": 0.3071, "step": 17405 }, { "epoch": 1.5778502809497916, "grad_norm": 0.7987277507781982, "learning_rate": 4.324987913735855e-05, "loss": 0.3193, "step": 17410 }, { "epoch": 1.5783034257748776, "grad_norm": 0.8879241347312927, "learning_rate": 4.324447288931671e-05, "loss": 0.3181, "step": 17415 }, { "epoch": 1.5787565705999638, "grad_norm": 0.9419184923171997, "learning_rate": 4.323906481536185e-05, "loss": 0.3225, "step": 17420 }, { "epoch": 1.57920971542505, "grad_norm": 0.7770799994468689, "learning_rate": 4.323365491603521e-05, "loss": 0.3404, "step": 17425 }, { "epoch": 1.579662860250136, "grad_norm": 0.831650972366333, "learning_rate": 4.3228243191878236e-05, "loss": 0.3073, "step": 17430 }, { "epoch": 1.580116005075222, "grad_norm": 0.8528594970703125, "learning_rate": 4.3222829643432513e-05, "loss": 0.3481, "step": 17435 }, { "epoch": 1.5805691499003083, "grad_norm": 0.8838956952095032, "learning_rate": 4.321741427123984e-05, "loss": 0.3136, "step": 17440 }, { "epoch": 1.5810222947253942, "grad_norm": 0.8785305619239807, "learning_rate": 4.321199707584219e-05, "loss": 0.2987, "step": 17445 }, { "epoch": 1.5814754395504802, "grad_norm": 0.887760579586029, "learning_rate": 4.320657805778171e-05, "loss": 0.3015, "step": 17450 }, { "epoch": 1.5819285843755664, "grad_norm": 1.1134121417999268, "learning_rate": 4.320115721760075e-05, "loss": 0.3031, "step": 17455 }, { "epoch": 1.5823817292006526, "grad_norm": 0.9334794878959656, "learning_rate": 4.319573455584182e-05, "loss": 0.3572, "step": 17460 }, { "epoch": 1.5828348740257385, "grad_norm": 0.876079797744751, "learning_rate": 4.3190310073047626e-05, "loss": 0.364, "step": 17465 }, { "epoch": 1.5832880188508247, "grad_norm": 0.9007444381713867, "learning_rate": 4.318488376976105e-05, "loss": 0.343, "step": 17470 }, { "epoch": 1.583741163675911, "grad_norm": 0.8411834239959717, "learning_rate": 4.317945564652516e-05, "loss": 0.265, "step": 17475 }, { "epoch": 1.5841943085009969, "grad_norm": 0.8737295269966125, "learning_rate": 4.3174025703883206e-05, "loss": 0.3462, "step": 17480 }, { "epoch": 1.584647453326083, "grad_norm": 0.9137975573539734, "learning_rate": 4.316859394237861e-05, "loss": 0.3164, "step": 17485 }, { "epoch": 1.5851005981511692, "grad_norm": 0.9213309288024902, "learning_rate": 4.316316036255499e-05, "loss": 0.3154, "step": 17490 }, { "epoch": 1.5855537429762552, "grad_norm": 0.8998512625694275, "learning_rate": 4.315772496495615e-05, "loss": 0.2971, "step": 17495 }, { "epoch": 1.5860068878013414, "grad_norm": 0.8421633839607239, "learning_rate": 4.3152287750126044e-05, "loss": 0.3485, "step": 17500 }, { "epoch": 1.5864600326264275, "grad_norm": 0.8268222212791443, "learning_rate": 4.314684871860886e-05, "loss": 0.2683, "step": 17505 }, { "epoch": 1.5869131774515135, "grad_norm": 0.7997350692749023, "learning_rate": 4.31414078709489e-05, "loss": 0.2825, "step": 17510 }, { "epoch": 1.5873663222765995, "grad_norm": 0.8593233823776245, "learning_rate": 4.313596520769072e-05, "loss": 0.287, "step": 17515 }, { "epoch": 1.5878194671016856, "grad_norm": 1.0086101293563843, "learning_rate": 4.313052072937901e-05, "loss": 0.3214, "step": 17520 }, { "epoch": 1.5882726119267718, "grad_norm": 0.9454305171966553, "learning_rate": 4.312507443655867e-05, "loss": 0.2896, "step": 17525 }, { "epoch": 1.5887257567518578, "grad_norm": 0.9215148687362671, "learning_rate": 4.311962632977473e-05, "loss": 0.3626, "step": 17530 }, { "epoch": 1.589178901576944, "grad_norm": 0.8396368622779846, "learning_rate": 4.311417640957247e-05, "loss": 0.3388, "step": 17535 }, { "epoch": 1.5896320464020302, "grad_norm": 0.9638393521308899, "learning_rate": 4.310872467649732e-05, "loss": 0.3474, "step": 17540 }, { "epoch": 1.5900851912271161, "grad_norm": 0.8310582637786865, "learning_rate": 4.310327113109487e-05, "loss": 0.3079, "step": 17545 }, { "epoch": 1.5905383360522023, "grad_norm": 0.9427390098571777, "learning_rate": 4.309781577391093e-05, "loss": 0.3034, "step": 17550 }, { "epoch": 1.5909914808772885, "grad_norm": 0.8294682502746582, "learning_rate": 4.309235860549148e-05, "loss": 0.2812, "step": 17555 }, { "epoch": 1.5914446257023744, "grad_norm": 0.819551944732666, "learning_rate": 4.3086899626382646e-05, "loss": 0.3055, "step": 17560 }, { "epoch": 1.5918977705274606, "grad_norm": 0.9524248838424683, "learning_rate": 4.3081438837130794e-05, "loss": 0.3059, "step": 17565 }, { "epoch": 1.5923509153525468, "grad_norm": 1.1098201274871826, "learning_rate": 4.3075976238282437e-05, "loss": 0.3204, "step": 17570 }, { "epoch": 1.5928040601776328, "grad_norm": 0.843657910823822, "learning_rate": 4.3070511830384254e-05, "loss": 0.2819, "step": 17575 }, { "epoch": 1.5932572050027187, "grad_norm": 0.8747349381446838, "learning_rate": 4.3065045613983146e-05, "loss": 0.3213, "step": 17580 }, { "epoch": 1.5937103498278051, "grad_norm": 0.8661729693412781, "learning_rate": 4.305957758962617e-05, "loss": 0.3134, "step": 17585 }, { "epoch": 1.594163494652891, "grad_norm": 0.8032749891281128, "learning_rate": 4.305410775786055e-05, "loss": 0.2927, "step": 17590 }, { "epoch": 1.594616639477977, "grad_norm": 0.8918287754058838, "learning_rate": 4.304863611923373e-05, "loss": 0.3229, "step": 17595 }, { "epoch": 1.5950697843030632, "grad_norm": 0.854421854019165, "learning_rate": 4.304316267429331e-05, "loss": 0.3311, "step": 17600 }, { "epoch": 1.5955229291281494, "grad_norm": 0.8735854625701904, "learning_rate": 4.3037687423587056e-05, "loss": 0.3306, "step": 17605 }, { "epoch": 1.5959760739532354, "grad_norm": 0.9500280618667603, "learning_rate": 4.3032210367662964e-05, "loss": 0.3233, "step": 17610 }, { "epoch": 1.5964292187783216, "grad_norm": 0.8315480351448059, "learning_rate": 4.302673150706915e-05, "loss": 0.3209, "step": 17615 }, { "epoch": 1.5968823636034077, "grad_norm": 0.8196550011634827, "learning_rate": 4.302125084235397e-05, "loss": 0.3257, "step": 17620 }, { "epoch": 1.5973355084284937, "grad_norm": 0.8490954041481018, "learning_rate": 4.301576837406589e-05, "loss": 0.2715, "step": 17625 }, { "epoch": 1.5977886532535799, "grad_norm": 0.9668064117431641, "learning_rate": 4.301028410275364e-05, "loss": 0.3385, "step": 17630 }, { "epoch": 1.598241798078666, "grad_norm": 0.8431141972541809, "learning_rate": 4.300479802896607e-05, "loss": 0.2884, "step": 17635 }, { "epoch": 1.598694942903752, "grad_norm": 0.859464704990387, "learning_rate": 4.299931015325221e-05, "loss": 0.3034, "step": 17640 }, { "epoch": 1.599148087728838, "grad_norm": 0.9753414392471313, "learning_rate": 4.299382047616131e-05, "loss": 0.3354, "step": 17645 }, { "epoch": 1.5996012325539244, "grad_norm": 0.8501152992248535, "learning_rate": 4.2988328998242775e-05, "loss": 0.3132, "step": 17650 }, { "epoch": 1.6000543773790104, "grad_norm": 0.8417093753814697, "learning_rate": 4.29828357200462e-05, "loss": 0.3227, "step": 17655 }, { "epoch": 1.6005075222040963, "grad_norm": 0.8198684453964233, "learning_rate": 4.297734064212135e-05, "loss": 0.3161, "step": 17660 }, { "epoch": 1.6009606670291825, "grad_norm": 0.9300566911697388, "learning_rate": 4.2971843765018154e-05, "loss": 0.3137, "step": 17665 }, { "epoch": 1.6014138118542687, "grad_norm": 1.287636160850525, "learning_rate": 4.296634508928677e-05, "loss": 0.2827, "step": 17670 }, { "epoch": 1.6018669566793546, "grad_norm": 0.8925343155860901, "learning_rate": 4.2960844615477495e-05, "loss": 0.3114, "step": 17675 }, { "epoch": 1.6023201015044408, "grad_norm": 0.8287371397018433, "learning_rate": 4.295534234414081e-05, "loss": 0.2665, "step": 17680 }, { "epoch": 1.602773246329527, "grad_norm": 0.9273688793182373, "learning_rate": 4.2949838275827406e-05, "loss": 0.3452, "step": 17685 }, { "epoch": 1.603226391154613, "grad_norm": 0.8219596743583679, "learning_rate": 4.294433241108812e-05, "loss": 0.2656, "step": 17690 }, { "epoch": 1.6036795359796991, "grad_norm": 0.8805015087127686, "learning_rate": 4.293882475047397e-05, "loss": 0.327, "step": 17695 }, { "epoch": 1.6041326808047853, "grad_norm": 0.9608417749404907, "learning_rate": 4.2933315294536184e-05, "loss": 0.3327, "step": 17700 }, { "epoch": 1.6045858256298713, "grad_norm": 0.9672837257385254, "learning_rate": 4.292780404382614e-05, "loss": 0.2812, "step": 17705 }, { "epoch": 1.6050389704549572, "grad_norm": 0.8432943820953369, "learning_rate": 4.29222909988954e-05, "loss": 0.3144, "step": 17710 }, { "epoch": 1.6054921152800437, "grad_norm": 0.8654338121414185, "learning_rate": 4.291677616029572e-05, "loss": 0.2839, "step": 17715 }, { "epoch": 1.6059452601051296, "grad_norm": 0.8509992957115173, "learning_rate": 4.291125952857903e-05, "loss": 0.3167, "step": 17720 }, { "epoch": 1.6063984049302156, "grad_norm": 0.8729085326194763, "learning_rate": 4.290574110429743e-05, "loss": 0.2816, "step": 17725 }, { "epoch": 1.6068515497553018, "grad_norm": 0.848524808883667, "learning_rate": 4.290022088800321e-05, "loss": 0.3104, "step": 17730 }, { "epoch": 1.607304694580388, "grad_norm": 0.9206980466842651, "learning_rate": 4.289469888024882e-05, "loss": 0.2974, "step": 17735 }, { "epoch": 1.607757839405474, "grad_norm": 0.8582622408866882, "learning_rate": 4.288917508158693e-05, "loss": 0.2998, "step": 17740 }, { "epoch": 1.60821098423056, "grad_norm": 0.8574740886688232, "learning_rate": 4.2883649492570345e-05, "loss": 0.3424, "step": 17745 }, { "epoch": 1.6086641290556463, "grad_norm": 1.0434455871582031, "learning_rate": 4.287812211375207e-05, "loss": 0.3517, "step": 17750 }, { "epoch": 1.6091172738807322, "grad_norm": 0.8015867471694946, "learning_rate": 4.2872592945685294e-05, "loss": 0.2659, "step": 17755 }, { "epoch": 1.6095704187058184, "grad_norm": 0.8549805283546448, "learning_rate": 4.2867061988923375e-05, "loss": 0.2941, "step": 17760 }, { "epoch": 1.6100235635309046, "grad_norm": 0.9038550853729248, "learning_rate": 4.286152924401985e-05, "loss": 0.2737, "step": 17765 }, { "epoch": 1.6104767083559905, "grad_norm": 0.8220217227935791, "learning_rate": 4.2855994711528446e-05, "loss": 0.2953, "step": 17770 }, { "epoch": 1.6109298531810765, "grad_norm": 0.8133675456047058, "learning_rate": 4.2850458392003044e-05, "loss": 0.3169, "step": 17775 }, { "epoch": 1.611382998006163, "grad_norm": 0.9393328428268433, "learning_rate": 4.2844920285997736e-05, "loss": 0.3425, "step": 17780 }, { "epoch": 1.6118361428312489, "grad_norm": 0.8572909832000732, "learning_rate": 4.283938039406678e-05, "loss": 0.3191, "step": 17785 }, { "epoch": 1.6122892876563348, "grad_norm": 0.9603254199028015, "learning_rate": 4.283383871676459e-05, "loss": 0.3562, "step": 17790 }, { "epoch": 1.612742432481421, "grad_norm": 0.8593929409980774, "learning_rate": 4.282829525464581e-05, "loss": 0.3001, "step": 17795 }, { "epoch": 1.6131955773065072, "grad_norm": 0.9429061412811279, "learning_rate": 4.282275000826521e-05, "loss": 0.3064, "step": 17800 }, { "epoch": 1.6136487221315932, "grad_norm": 0.8559083342552185, "learning_rate": 4.281720297817775e-05, "loss": 0.2686, "step": 17805 }, { "epoch": 1.6141018669566793, "grad_norm": 0.8677086234092712, "learning_rate": 4.281165416493861e-05, "loss": 0.2521, "step": 17810 }, { "epoch": 1.6145550117817655, "grad_norm": 0.8968037366867065, "learning_rate": 4.280610356910309e-05, "loss": 0.2646, "step": 17815 }, { "epoch": 1.6150081566068515, "grad_norm": 0.9510029554367065, "learning_rate": 4.2800551191226704e-05, "loss": 0.3454, "step": 17820 }, { "epoch": 1.6154613014319377, "grad_norm": 0.9829952716827393, "learning_rate": 4.279499703186515e-05, "loss": 0.3107, "step": 17825 }, { "epoch": 1.6159144462570239, "grad_norm": 0.8650780916213989, "learning_rate": 4.278944109157426e-05, "loss": 0.3219, "step": 17830 }, { "epoch": 1.6163675910821098, "grad_norm": 0.8582170605659485, "learning_rate": 4.2783883370910106e-05, "loss": 0.2868, "step": 17835 }, { "epoch": 1.616820735907196, "grad_norm": 0.8813605308532715, "learning_rate": 4.2778323870428885e-05, "loss": 0.2826, "step": 17840 }, { "epoch": 1.6172738807322822, "grad_norm": 0.7781887054443359, "learning_rate": 4.2772762590687e-05, "loss": 0.3068, "step": 17845 }, { "epoch": 1.6177270255573681, "grad_norm": 0.9411242604255676, "learning_rate": 4.276719953224103e-05, "loss": 0.3475, "step": 17850 }, { "epoch": 1.618180170382454, "grad_norm": 0.8418288230895996, "learning_rate": 4.276163469564772e-05, "loss": 0.3325, "step": 17855 }, { "epoch": 1.6186333152075403, "grad_norm": 0.7243488430976868, "learning_rate": 4.275606808146402e-05, "loss": 0.2756, "step": 17860 }, { "epoch": 1.6190864600326265, "grad_norm": 0.8463386297225952, "learning_rate": 4.275049969024701e-05, "loss": 0.2567, "step": 17865 }, { "epoch": 1.6195396048577124, "grad_norm": 0.8861500024795532, "learning_rate": 4.274492952255399e-05, "loss": 0.3301, "step": 17870 }, { "epoch": 1.6199927496827986, "grad_norm": 0.8181281685829163, "learning_rate": 4.273935757894243e-05, "loss": 0.3302, "step": 17875 }, { "epoch": 1.6204458945078848, "grad_norm": 1.0009722709655762, "learning_rate": 4.273378385996997e-05, "loss": 0.2834, "step": 17880 }, { "epoch": 1.6208990393329707, "grad_norm": 0.8798965811729431, "learning_rate": 4.272820836619442e-05, "loss": 0.3411, "step": 17885 }, { "epoch": 1.621352184158057, "grad_norm": 0.905396044254303, "learning_rate": 4.272263109817378e-05, "loss": 0.2848, "step": 17890 }, { "epoch": 1.6218053289831431, "grad_norm": 0.979878306388855, "learning_rate": 4.271705205646623e-05, "loss": 0.3157, "step": 17895 }, { "epoch": 1.622258473808229, "grad_norm": 0.8875058889389038, "learning_rate": 4.271147124163013e-05, "loss": 0.3291, "step": 17900 }, { "epoch": 1.6227116186333153, "grad_norm": 0.982925534248352, "learning_rate": 4.2705888654224e-05, "loss": 0.3054, "step": 17905 }, { "epoch": 1.6231647634584014, "grad_norm": 0.8640679717063904, "learning_rate": 4.2700304294806544e-05, "loss": 0.3175, "step": 17910 }, { "epoch": 1.6236179082834874, "grad_norm": 0.8753639459609985, "learning_rate": 4.269471816393664e-05, "loss": 0.2904, "step": 17915 }, { "epoch": 1.6240710531085734, "grad_norm": 0.7976831197738647, "learning_rate": 4.268913026217338e-05, "loss": 0.3024, "step": 17920 }, { "epoch": 1.6245241979336598, "grad_norm": 0.8053048849105835, "learning_rate": 4.268354059007597e-05, "loss": 0.262, "step": 17925 }, { "epoch": 1.6249773427587457, "grad_norm": 0.8668192028999329, "learning_rate": 4.2677949148203845e-05, "loss": 0.3073, "step": 17930 }, { "epoch": 1.6254304875838317, "grad_norm": 0.9027950763702393, "learning_rate": 4.2672355937116594e-05, "loss": 0.3013, "step": 17935 }, { "epoch": 1.6258836324089179, "grad_norm": 0.9046555757522583, "learning_rate": 4.2666760957373985e-05, "loss": 0.279, "step": 17940 }, { "epoch": 1.626336777234004, "grad_norm": 0.9391236901283264, "learning_rate": 4.266116420953597e-05, "loss": 0.3345, "step": 17945 }, { "epoch": 1.62678992205909, "grad_norm": 0.878691554069519, "learning_rate": 4.2655565694162667e-05, "loss": 0.3034, "step": 17950 }, { "epoch": 1.6272430668841762, "grad_norm": 0.7997892498970032, "learning_rate": 4.264996541181437e-05, "loss": 0.3004, "step": 17955 }, { "epoch": 1.6276962117092624, "grad_norm": 0.8349008560180664, "learning_rate": 4.264436336305159e-05, "loss": 0.3085, "step": 17960 }, { "epoch": 1.6281493565343483, "grad_norm": 0.9459183812141418, "learning_rate": 4.2638759548434945e-05, "loss": 0.295, "step": 17965 }, { "epoch": 1.6286025013594345, "grad_norm": 0.8377007842063904, "learning_rate": 4.2633153968525283e-05, "loss": 0.3162, "step": 17970 }, { "epoch": 1.6290556461845207, "grad_norm": 0.8854864835739136, "learning_rate": 4.262754662388362e-05, "loss": 0.3133, "step": 17975 }, { "epoch": 1.6295087910096067, "grad_norm": 0.8317819237709045, "learning_rate": 4.262193751507113e-05, "loss": 0.2867, "step": 17980 }, { "epoch": 1.6299619358346926, "grad_norm": 0.9010564088821411, "learning_rate": 4.261632664264916e-05, "loss": 0.2815, "step": 17985 }, { "epoch": 1.630415080659779, "grad_norm": 0.9163343906402588, "learning_rate": 4.261071400717929e-05, "loss": 0.3208, "step": 17990 }, { "epoch": 1.630868225484865, "grad_norm": 0.8114281892776489, "learning_rate": 4.260509960922319e-05, "loss": 0.2661, "step": 17995 }, { "epoch": 1.631321370309951, "grad_norm": 0.9968792200088501, "learning_rate": 4.259948344934278e-05, "loss": 0.3289, "step": 18000 }, { "epoch": 1.6317745151350371, "grad_norm": 0.8059724569320679, "learning_rate": 4.259386552810011e-05, "loss": 0.2731, "step": 18005 }, { "epoch": 1.6322276599601233, "grad_norm": 0.854312539100647, "learning_rate": 4.2588245846057426e-05, "loss": 0.2887, "step": 18010 }, { "epoch": 1.6326808047852093, "grad_norm": 0.9443269371986389, "learning_rate": 4.2582624403777164e-05, "loss": 0.3431, "step": 18015 }, { "epoch": 1.6331339496102955, "grad_norm": 1.0057246685028076, "learning_rate": 4.2577001201821895e-05, "loss": 0.3169, "step": 18020 }, { "epoch": 1.6335870944353816, "grad_norm": 0.8380560278892517, "learning_rate": 4.2571376240754416e-05, "loss": 0.2819, "step": 18025 }, { "epoch": 1.6340402392604676, "grad_norm": 0.9425898790359497, "learning_rate": 4.2565749521137657e-05, "loss": 0.3419, "step": 18030 }, { "epoch": 1.6344933840855538, "grad_norm": 0.8912044763565063, "learning_rate": 4.256012104353475e-05, "loss": 0.2771, "step": 18035 }, { "epoch": 1.63494652891064, "grad_norm": 0.8718425631523132, "learning_rate": 4.255449080850899e-05, "loss": 0.3238, "step": 18040 }, { "epoch": 1.635399673735726, "grad_norm": 0.9145520329475403, "learning_rate": 4.254885881662386e-05, "loss": 0.301, "step": 18045 }, { "epoch": 1.6358528185608119, "grad_norm": 0.8443177938461304, "learning_rate": 4.2543225068443004e-05, "loss": 0.2973, "step": 18050 }, { "epoch": 1.6363059633858983, "grad_norm": 1.066307544708252, "learning_rate": 4.253758956453025e-05, "loss": 0.3103, "step": 18055 }, { "epoch": 1.6367591082109842, "grad_norm": 1.025628924369812, "learning_rate": 4.253195230544961e-05, "loss": 0.2983, "step": 18060 }, { "epoch": 1.6372122530360702, "grad_norm": 0.9385077357292175, "learning_rate": 4.252631329176525e-05, "loss": 0.3062, "step": 18065 }, { "epoch": 1.6376653978611564, "grad_norm": 0.8046302199363708, "learning_rate": 4.2520672524041535e-05, "loss": 0.3149, "step": 18070 }, { "epoch": 1.6381185426862426, "grad_norm": 0.9071001410484314, "learning_rate": 4.251503000284299e-05, "loss": 0.3229, "step": 18075 }, { "epoch": 1.6385716875113285, "grad_norm": 0.8217174410820007, "learning_rate": 4.2509385728734316e-05, "loss": 0.2849, "step": 18080 }, { "epoch": 1.6390248323364147, "grad_norm": 0.8497008085250854, "learning_rate": 4.250373970228041e-05, "loss": 0.3222, "step": 18085 }, { "epoch": 1.639477977161501, "grad_norm": 0.8140535354614258, "learning_rate": 4.2498091924046315e-05, "loss": 0.2805, "step": 18090 }, { "epoch": 1.6399311219865869, "grad_norm": 0.98907470703125, "learning_rate": 4.249244239459725e-05, "loss": 0.293, "step": 18095 }, { "epoch": 1.640384266811673, "grad_norm": 0.8944506645202637, "learning_rate": 4.248679111449866e-05, "loss": 0.2905, "step": 18100 }, { "epoch": 1.6408374116367592, "grad_norm": 0.8617191314697266, "learning_rate": 4.2481138084316085e-05, "loss": 0.323, "step": 18105 }, { "epoch": 1.6412905564618452, "grad_norm": 0.8548178672790527, "learning_rate": 4.247548330461532e-05, "loss": 0.3554, "step": 18110 }, { "epoch": 1.6417437012869311, "grad_norm": 0.8252630829811096, "learning_rate": 4.2469826775962264e-05, "loss": 0.3134, "step": 18115 }, { "epoch": 1.6421968461120175, "grad_norm": 0.8495299816131592, "learning_rate": 4.246416849892304e-05, "loss": 0.3115, "step": 18120 }, { "epoch": 1.6426499909371035, "grad_norm": 1.0190260410308838, "learning_rate": 4.245850847406393e-05, "loss": 0.316, "step": 18125 }, { "epoch": 1.6431031357621895, "grad_norm": 0.8713011145591736, "learning_rate": 4.245284670195139e-05, "loss": 0.2937, "step": 18130 }, { "epoch": 1.6435562805872757, "grad_norm": 0.7465673089027405, "learning_rate": 4.244718318315205e-05, "loss": 0.2822, "step": 18135 }, { "epoch": 1.6440094254123618, "grad_norm": 0.8521396517753601, "learning_rate": 4.2441517918232724e-05, "loss": 0.2638, "step": 18140 }, { "epoch": 1.6444625702374478, "grad_norm": 0.9209028482437134, "learning_rate": 4.2435850907760386e-05, "loss": 0.3641, "step": 18145 }, { "epoch": 1.644915715062534, "grad_norm": 0.7811183333396912, "learning_rate": 4.243018215230219e-05, "loss": 0.2979, "step": 18150 }, { "epoch": 1.6453688598876202, "grad_norm": 0.8021581768989563, "learning_rate": 4.242451165242548e-05, "loss": 0.3031, "step": 18155 }, { "epoch": 1.6458220047127061, "grad_norm": 1.0553148984909058, "learning_rate": 4.2418839408697755e-05, "loss": 0.3248, "step": 18160 }, { "epoch": 1.6462751495377923, "grad_norm": 0.8295159339904785, "learning_rate": 4.2413165421686686e-05, "loss": 0.3232, "step": 18165 }, { "epoch": 1.6467282943628785, "grad_norm": 0.829942524433136, "learning_rate": 4.240748969196014e-05, "loss": 0.2718, "step": 18170 }, { "epoch": 1.6471814391879644, "grad_norm": 0.8269074559211731, "learning_rate": 4.240181222008613e-05, "loss": 0.2914, "step": 18175 }, { "epoch": 1.6476345840130504, "grad_norm": 0.8044787645339966, "learning_rate": 4.239613300663289e-05, "loss": 0.3013, "step": 18180 }, { "epoch": 1.6480877288381368, "grad_norm": 0.9263767600059509, "learning_rate": 4.239045205216877e-05, "loss": 0.3354, "step": 18185 }, { "epoch": 1.6485408736632228, "grad_norm": 0.9121374487876892, "learning_rate": 4.238476935726232e-05, "loss": 0.3203, "step": 18190 }, { "epoch": 1.6489940184883087, "grad_norm": 0.8215614557266235, "learning_rate": 4.237908492248229e-05, "loss": 0.3025, "step": 18195 }, { "epoch": 1.649447163313395, "grad_norm": 0.8174506425857544, "learning_rate": 4.237339874839755e-05, "loss": 0.2822, "step": 18200 }, { "epoch": 1.649900308138481, "grad_norm": 0.8329336643218994, "learning_rate": 4.2367710835577204e-05, "loss": 0.2604, "step": 18205 }, { "epoch": 1.650353452963567, "grad_norm": 0.8645308613777161, "learning_rate": 4.2362021184590485e-05, "loss": 0.2827, "step": 18210 }, { "epoch": 1.6508065977886532, "grad_norm": 0.8629558086395264, "learning_rate": 4.235632979600681e-05, "loss": 0.3013, "step": 18215 }, { "epoch": 1.6512597426137394, "grad_norm": 0.8683720827102661, "learning_rate": 4.2350636670395786e-05, "loss": 0.2688, "step": 18220 }, { "epoch": 1.6517128874388254, "grad_norm": 0.8225610256195068, "learning_rate": 4.2344941808327174e-05, "loss": 0.3092, "step": 18225 }, { "epoch": 1.6521660322639116, "grad_norm": 0.8328880667686462, "learning_rate": 4.2339245210370925e-05, "loss": 0.313, "step": 18230 }, { "epoch": 1.6526191770889977, "grad_norm": 0.8666002750396729, "learning_rate": 4.233354687709716e-05, "loss": 0.3148, "step": 18235 }, { "epoch": 1.6530723219140837, "grad_norm": 0.8772616982460022, "learning_rate": 4.232784680907616e-05, "loss": 0.2659, "step": 18240 }, { "epoch": 1.65352546673917, "grad_norm": 0.8115444183349609, "learning_rate": 4.232214500687839e-05, "loss": 0.2982, "step": 18245 }, { "epoch": 1.653978611564256, "grad_norm": 0.783485472202301, "learning_rate": 4.23164414710745e-05, "loss": 0.2919, "step": 18250 }, { "epoch": 1.654431756389342, "grad_norm": 0.9689051508903503, "learning_rate": 4.231073620223528e-05, "loss": 0.3121, "step": 18255 }, { "epoch": 1.654884901214428, "grad_norm": 0.872133195400238, "learning_rate": 4.2305029200931734e-05, "loss": 0.2838, "step": 18260 }, { "epoch": 1.6553380460395142, "grad_norm": 0.8478687405586243, "learning_rate": 4.229932046773503e-05, "loss": 0.32, "step": 18265 }, { "epoch": 1.6557911908646004, "grad_norm": 0.8728999495506287, "learning_rate": 4.229361000321647e-05, "loss": 0.3495, "step": 18270 }, { "epoch": 1.6562443356896863, "grad_norm": 0.8873351812362671, "learning_rate": 4.228789780794757e-05, "loss": 0.2961, "step": 18275 }, { "epoch": 1.6566974805147725, "grad_norm": 0.9640417098999023, "learning_rate": 4.2282183882500026e-05, "loss": 0.317, "step": 18280 }, { "epoch": 1.6571506253398587, "grad_norm": 1.0031400918960571, "learning_rate": 4.227646822744567e-05, "loss": 0.3148, "step": 18285 }, { "epoch": 1.6576037701649446, "grad_norm": 0.7663844227790833, "learning_rate": 4.2270750843356545e-05, "loss": 0.2825, "step": 18290 }, { "epoch": 1.6580569149900308, "grad_norm": 0.9184001684188843, "learning_rate": 4.226503173080483e-05, "loss": 0.3301, "step": 18295 }, { "epoch": 1.658510059815117, "grad_norm": 0.8592252135276794, "learning_rate": 4.2259310890362895e-05, "loss": 0.2964, "step": 18300 }, { "epoch": 1.658963204640203, "grad_norm": 0.8657112717628479, "learning_rate": 4.22535883226033e-05, "loss": 0.2865, "step": 18305 }, { "epoch": 1.6594163494652892, "grad_norm": 0.8804430961608887, "learning_rate": 4.224786402809875e-05, "loss": 0.3301, "step": 18310 }, { "epoch": 1.6598694942903753, "grad_norm": 0.8623974919319153, "learning_rate": 4.224213800742214e-05, "loss": 0.2942, "step": 18315 }, { "epoch": 1.6603226391154613, "grad_norm": 0.8495715856552124, "learning_rate": 4.223641026114653e-05, "loss": 0.2781, "step": 18320 }, { "epoch": 1.6607757839405473, "grad_norm": 1.0739666223526, "learning_rate": 4.223068078984515e-05, "loss": 0.3502, "step": 18325 }, { "epoch": 1.6612289287656337, "grad_norm": 0.9496556520462036, "learning_rate": 4.222494959409141e-05, "loss": 0.3139, "step": 18330 }, { "epoch": 1.6616820735907196, "grad_norm": 0.8686436414718628, "learning_rate": 4.22192166744589e-05, "loss": 0.2971, "step": 18335 }, { "epoch": 1.6621352184158056, "grad_norm": 1.0432953834533691, "learning_rate": 4.221348203152136e-05, "loss": 0.2518, "step": 18340 }, { "epoch": 1.6625883632408918, "grad_norm": 0.8642166256904602, "learning_rate": 4.220774566585272e-05, "loss": 0.314, "step": 18345 }, { "epoch": 1.663041508065978, "grad_norm": 0.794090211391449, "learning_rate": 4.2202007578027066e-05, "loss": 0.2978, "step": 18350 }, { "epoch": 1.663494652891064, "grad_norm": 0.7875705361366272, "learning_rate": 4.2196267768618694e-05, "loss": 0.3154, "step": 18355 }, { "epoch": 1.66394779771615, "grad_norm": 0.8370786905288696, "learning_rate": 4.219052623820203e-05, "loss": 0.2889, "step": 18360 }, { "epoch": 1.6644009425412363, "grad_norm": 0.9009444713592529, "learning_rate": 4.218478298735168e-05, "loss": 0.2845, "step": 18365 }, { "epoch": 1.6648540873663222, "grad_norm": 0.8835994601249695, "learning_rate": 4.217903801664245e-05, "loss": 0.2773, "step": 18370 }, { "epoch": 1.6653072321914084, "grad_norm": 0.8914021253585815, "learning_rate": 4.217329132664928e-05, "loss": 0.3013, "step": 18375 }, { "epoch": 1.6657603770164946, "grad_norm": 1.0042939186096191, "learning_rate": 4.21675429179473e-05, "loss": 0.3208, "step": 18380 }, { "epoch": 1.6662135218415806, "grad_norm": 0.9096315503120422, "learning_rate": 4.216179279111184e-05, "loss": 0.264, "step": 18385 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9380147457122803, "learning_rate": 4.215604094671835e-05, "loss": 0.277, "step": 18390 }, { "epoch": 1.667119811491753, "grad_norm": 0.9131767153739929, "learning_rate": 4.2150287385342474e-05, "loss": 0.3012, "step": 18395 }, { "epoch": 1.6675729563168389, "grad_norm": 0.9215840101242065, "learning_rate": 4.214453210756004e-05, "loss": 0.3187, "step": 18400 }, { "epoch": 1.6680261011419248, "grad_norm": 1.0419434309005737, "learning_rate": 4.213877511394704e-05, "loss": 0.3484, "step": 18405 }, { "epoch": 1.668479245967011, "grad_norm": 0.9115170836448669, "learning_rate": 4.213301640507964e-05, "loss": 0.3622, "step": 18410 }, { "epoch": 1.6689323907920972, "grad_norm": 0.7822936773300171, "learning_rate": 4.212725598153416e-05, "loss": 0.2925, "step": 18415 }, { "epoch": 1.6693855356171832, "grad_norm": 1.0561422109603882, "learning_rate": 4.212149384388711e-05, "loss": 0.3187, "step": 18420 }, { "epoch": 1.6698386804422694, "grad_norm": 0.8649877905845642, "learning_rate": 4.211572999271517e-05, "loss": 0.2725, "step": 18425 }, { "epoch": 1.6702918252673555, "grad_norm": 0.8040804266929626, "learning_rate": 4.2109964428595185e-05, "loss": 0.2823, "step": 18430 }, { "epoch": 1.6707449700924415, "grad_norm": 0.9421810507774353, "learning_rate": 4.2104197152104174e-05, "loss": 0.3568, "step": 18435 }, { "epoch": 1.6711981149175277, "grad_norm": 0.821999192237854, "learning_rate": 4.2098428163819336e-05, "loss": 0.2776, "step": 18440 }, { "epoch": 1.6716512597426139, "grad_norm": 0.9485285878181458, "learning_rate": 4.209265746431802e-05, "loss": 0.3025, "step": 18445 }, { "epoch": 1.6721044045676998, "grad_norm": 0.8184763193130493, "learning_rate": 4.208688505417777e-05, "loss": 0.2729, "step": 18450 }, { "epoch": 1.6725575493927858, "grad_norm": 0.8645181655883789, "learning_rate": 4.208111093397629e-05, "loss": 0.2576, "step": 18455 }, { "epoch": 1.6730106942178722, "grad_norm": 0.9486141204833984, "learning_rate": 4.2075335104291456e-05, "loss": 0.3445, "step": 18460 }, { "epoch": 1.6734638390429581, "grad_norm": 0.8946083784103394, "learning_rate": 4.206955756570131e-05, "loss": 0.2715, "step": 18465 }, { "epoch": 1.673916983868044, "grad_norm": 0.8228843808174133, "learning_rate": 4.2063778318784064e-05, "loss": 0.2974, "step": 18470 }, { "epoch": 1.6743701286931303, "grad_norm": 0.7943463325500488, "learning_rate": 4.205799736411813e-05, "loss": 0.2913, "step": 18475 }, { "epoch": 1.6748232735182165, "grad_norm": 0.8572854399681091, "learning_rate": 4.2052214702282046e-05, "loss": 0.2441, "step": 18480 }, { "epoch": 1.6752764183433024, "grad_norm": 0.797808825969696, "learning_rate": 4.2046430333854546e-05, "loss": 0.2825, "step": 18485 }, { "epoch": 1.6757295631683886, "grad_norm": 0.7740611433982849, "learning_rate": 4.204064425941454e-05, "loss": 0.2847, "step": 18490 }, { "epoch": 1.6761827079934748, "grad_norm": 0.9661316275596619, "learning_rate": 4.2034856479541096e-05, "loss": 0.314, "step": 18495 }, { "epoch": 1.6766358528185608, "grad_norm": 0.7914071083068848, "learning_rate": 4.202906699481345e-05, "loss": 0.2687, "step": 18500 }, { "epoch": 1.677088997643647, "grad_norm": 0.7816911935806274, "learning_rate": 4.202327580581104e-05, "loss": 0.2786, "step": 18505 }, { "epoch": 1.6775421424687331, "grad_norm": 0.8675497770309448, "learning_rate": 4.2017482913113416e-05, "loss": 0.2689, "step": 18510 }, { "epoch": 1.677995287293819, "grad_norm": 0.8646499514579773, "learning_rate": 4.2011688317300355e-05, "loss": 0.3135, "step": 18515 }, { "epoch": 1.678448432118905, "grad_norm": 0.9190946221351624, "learning_rate": 4.2005892018951775e-05, "loss": 0.295, "step": 18520 }, { "epoch": 1.6789015769439914, "grad_norm": 0.8021218776702881, "learning_rate": 4.200009401864777e-05, "loss": 0.3178, "step": 18525 }, { "epoch": 1.6793547217690774, "grad_norm": 0.8156978487968445, "learning_rate": 4.199429431696862e-05, "loss": 0.3738, "step": 18530 }, { "epoch": 1.6798078665941634, "grad_norm": 0.9481425881385803, "learning_rate": 4.1988492914494735e-05, "loss": 0.311, "step": 18535 }, { "epoch": 1.6802610114192496, "grad_norm": 0.8373022079467773, "learning_rate": 4.198268981180675e-05, "loss": 0.3624, "step": 18540 }, { "epoch": 1.6807141562443357, "grad_norm": 0.838717520236969, "learning_rate": 4.1976885009485414e-05, "loss": 0.3318, "step": 18545 }, { "epoch": 1.6811673010694217, "grad_norm": 0.9083749651908875, "learning_rate": 4.19710785081117e-05, "loss": 0.3049, "step": 18550 }, { "epoch": 1.6816204458945079, "grad_norm": 1.0058990716934204, "learning_rate": 4.1965270308266705e-05, "loss": 0.3117, "step": 18555 }, { "epoch": 1.682073590719594, "grad_norm": 0.8042169213294983, "learning_rate": 4.195946041053172e-05, "loss": 0.2947, "step": 18560 }, { "epoch": 1.68252673554468, "grad_norm": 0.8743519186973572, "learning_rate": 4.195364881548821e-05, "loss": 0.3328, "step": 18565 }, { "epoch": 1.6829798803697662, "grad_norm": 0.9902830719947815, "learning_rate": 4.1947835523717796e-05, "loss": 0.3195, "step": 18570 }, { "epoch": 1.6834330251948524, "grad_norm": 0.972085177898407, "learning_rate": 4.1942020535802265e-05, "loss": 0.2994, "step": 18575 }, { "epoch": 1.6838861700199383, "grad_norm": 0.8222377300262451, "learning_rate": 4.1936203852323605e-05, "loss": 0.2879, "step": 18580 }, { "epoch": 1.6843393148450243, "grad_norm": 0.9415888786315918, "learning_rate": 4.193038547386392e-05, "loss": 0.2972, "step": 18585 }, { "epoch": 1.6847924596701107, "grad_norm": 0.8585674166679382, "learning_rate": 4.192456540100554e-05, "loss": 0.2667, "step": 18590 }, { "epoch": 1.6852456044951967, "grad_norm": 0.9641662836074829, "learning_rate": 4.1918743634330946e-05, "loss": 0.3358, "step": 18595 }, { "epoch": 1.6856987493202826, "grad_norm": 0.8970014452934265, "learning_rate": 4.1912920174422764e-05, "loss": 0.3064, "step": 18600 }, { "epoch": 1.6861518941453688, "grad_norm": 0.8863723278045654, "learning_rate": 4.190709502186381e-05, "loss": 0.3241, "step": 18605 }, { "epoch": 1.686605038970455, "grad_norm": 0.8746827244758606, "learning_rate": 4.190126817723707e-05, "loss": 0.2765, "step": 18610 }, { "epoch": 1.687058183795541, "grad_norm": 0.77330082654953, "learning_rate": 4.1895439641125694e-05, "loss": 0.2688, "step": 18615 }, { "epoch": 1.6875113286206271, "grad_norm": 0.9148644804954529, "learning_rate": 4.1889609414113016e-05, "loss": 0.3266, "step": 18620 }, { "epoch": 1.6879644734457133, "grad_norm": 0.9076082706451416, "learning_rate": 4.188377749678251e-05, "loss": 0.2934, "step": 18625 }, { "epoch": 1.6884176182707993, "grad_norm": 1.0126285552978516, "learning_rate": 4.187794388971785e-05, "loss": 0.2943, "step": 18630 }, { "epoch": 1.6888707630958855, "grad_norm": 0.9398326873779297, "learning_rate": 4.187210859350286e-05, "loss": 0.317, "step": 18635 }, { "epoch": 1.6893239079209716, "grad_norm": 0.8708258867263794, "learning_rate": 4.186627160872153e-05, "loss": 0.2951, "step": 18640 }, { "epoch": 1.6897770527460576, "grad_norm": 0.8405221104621887, "learning_rate": 4.186043293595805e-05, "loss": 0.3319, "step": 18645 }, { "epoch": 1.6902301975711438, "grad_norm": 0.7793798446655273, "learning_rate": 4.185459257579673e-05, "loss": 0.2855, "step": 18650 }, { "epoch": 1.69068334239623, "grad_norm": 0.9983845949172974, "learning_rate": 4.184875052882209e-05, "loss": 0.3315, "step": 18655 }, { "epoch": 1.691136487221316, "grad_norm": 0.8673156499862671, "learning_rate": 4.184290679561881e-05, "loss": 0.303, "step": 18660 }, { "epoch": 1.691589632046402, "grad_norm": 0.8267568945884705, "learning_rate": 4.183706137677171e-05, "loss": 0.2798, "step": 18665 }, { "epoch": 1.692042776871488, "grad_norm": 0.853584349155426, "learning_rate": 4.183121427286583e-05, "loss": 0.309, "step": 18670 }, { "epoch": 1.6924959216965743, "grad_norm": 0.9345765113830566, "learning_rate": 4.1825365484486324e-05, "loss": 0.2901, "step": 18675 }, { "epoch": 1.6929490665216602, "grad_norm": 0.7988394498825073, "learning_rate": 4.1819515012218555e-05, "loss": 0.3193, "step": 18680 }, { "epoch": 1.6934022113467464, "grad_norm": 0.9034197926521301, "learning_rate": 4.181366285664804e-05, "loss": 0.2978, "step": 18685 }, { "epoch": 1.6938553561718326, "grad_norm": 0.9465492963790894, "learning_rate": 4.1807809018360463e-05, "loss": 0.2797, "step": 18690 }, { "epoch": 1.6943085009969185, "grad_norm": 0.8373128771781921, "learning_rate": 4.180195349794168e-05, "loss": 0.2877, "step": 18695 }, { "epoch": 1.6947616458220047, "grad_norm": 0.8976441025733948, "learning_rate": 4.1796096295977707e-05, "loss": 0.304, "step": 18700 }, { "epoch": 1.695214790647091, "grad_norm": 0.8093327283859253, "learning_rate": 4.179023741305474e-05, "loss": 0.2563, "step": 18705 }, { "epoch": 1.6956679354721769, "grad_norm": 0.7470993399620056, "learning_rate": 4.178437684975913e-05, "loss": 0.2853, "step": 18710 }, { "epoch": 1.696121080297263, "grad_norm": 0.8157191872596741, "learning_rate": 4.177851460667742e-05, "loss": 0.3294, "step": 18715 }, { "epoch": 1.6965742251223492, "grad_norm": 0.914120614528656, "learning_rate": 4.1772650684396296e-05, "loss": 0.313, "step": 18720 }, { "epoch": 1.6970273699474352, "grad_norm": 1.0030455589294434, "learning_rate": 4.1766785083502616e-05, "loss": 0.3318, "step": 18725 }, { "epoch": 1.6974805147725212, "grad_norm": 0.8775113224983215, "learning_rate": 4.176091780458342e-05, "loss": 0.2921, "step": 18730 }, { "epoch": 1.6979336595976076, "grad_norm": 0.8386179208755493, "learning_rate": 4.175504884822591e-05, "loss": 0.2742, "step": 18735 }, { "epoch": 1.6983868044226935, "grad_norm": 0.8221515417098999, "learning_rate": 4.174917821501744e-05, "loss": 0.2785, "step": 18740 }, { "epoch": 1.6988399492477795, "grad_norm": 0.8763789534568787, "learning_rate": 4.174330590554555e-05, "loss": 0.3287, "step": 18745 }, { "epoch": 1.6992930940728657, "grad_norm": 0.8485104441642761, "learning_rate": 4.173743192039795e-05, "loss": 0.2912, "step": 18750 }, { "epoch": 1.6997462388979518, "grad_norm": 0.9541552066802979, "learning_rate": 4.17315562601625e-05, "loss": 0.2878, "step": 18755 }, { "epoch": 1.7001993837230378, "grad_norm": 0.9225139021873474, "learning_rate": 4.172567892542725e-05, "loss": 0.2907, "step": 18760 }, { "epoch": 1.700652528548124, "grad_norm": 0.9268158078193665, "learning_rate": 4.17197999167804e-05, "loss": 0.3189, "step": 18765 }, { "epoch": 1.7011056733732102, "grad_norm": 0.7458837032318115, "learning_rate": 4.171391923481031e-05, "loss": 0.2857, "step": 18770 }, { "epoch": 1.7015588181982961, "grad_norm": 0.8177514672279358, "learning_rate": 4.1708036880105545e-05, "loss": 0.286, "step": 18775 }, { "epoch": 1.7020119630233823, "grad_norm": 0.8526171445846558, "learning_rate": 4.1702152853254795e-05, "loss": 0.3527, "step": 18780 }, { "epoch": 1.7024651078484685, "grad_norm": 0.7738702297210693, "learning_rate": 4.1696267154846946e-05, "loss": 0.2714, "step": 18785 }, { "epoch": 1.7029182526735545, "grad_norm": 0.8340524435043335, "learning_rate": 4.169037978547103e-05, "loss": 0.313, "step": 18790 }, { "epoch": 1.7033713974986404, "grad_norm": 0.8270505666732788, "learning_rate": 4.168449074571626e-05, "loss": 0.3113, "step": 18795 }, { "epoch": 1.7038245423237268, "grad_norm": 0.8140398859977722, "learning_rate": 4.167860003617203e-05, "loss": 0.3065, "step": 18800 }, { "epoch": 1.7042776871488128, "grad_norm": 1.0027519464492798, "learning_rate": 4.167270765742786e-05, "loss": 0.2851, "step": 18805 }, { "epoch": 1.7047308319738987, "grad_norm": 0.9964253306388855, "learning_rate": 4.1666813610073476e-05, "loss": 0.3258, "step": 18810 }, { "epoch": 1.705183976798985, "grad_norm": 0.8346615433692932, "learning_rate": 4.166091789469875e-05, "loss": 0.3091, "step": 18815 }, { "epoch": 1.705637121624071, "grad_norm": 0.8645350337028503, "learning_rate": 4.1655020511893724e-05, "loss": 0.3182, "step": 18820 }, { "epoch": 1.706090266449157, "grad_norm": 0.8952618837356567, "learning_rate": 4.164912146224862e-05, "loss": 0.2973, "step": 18825 }, { "epoch": 1.7065434112742432, "grad_norm": 0.8741565346717834, "learning_rate": 4.1643220746353814e-05, "loss": 0.3406, "step": 18830 }, { "epoch": 1.7069965560993294, "grad_norm": 0.9108509421348572, "learning_rate": 4.163731836479984e-05, "loss": 0.3119, "step": 18835 }, { "epoch": 1.7074497009244154, "grad_norm": 0.965908944606781, "learning_rate": 4.163141431817743e-05, "loss": 0.3203, "step": 18840 }, { "epoch": 1.7079028457495016, "grad_norm": 0.9032286405563354, "learning_rate": 4.162550860707744e-05, "loss": 0.3016, "step": 18845 }, { "epoch": 1.7083559905745878, "grad_norm": 0.7919615507125854, "learning_rate": 4.1619601232090944e-05, "loss": 0.3178, "step": 18850 }, { "epoch": 1.7088091353996737, "grad_norm": 0.9177213311195374, "learning_rate": 4.161369219380914e-05, "loss": 0.2767, "step": 18855 }, { "epoch": 1.7092622802247597, "grad_norm": 0.8551110625267029, "learning_rate": 4.1607781492823386e-05, "loss": 0.3199, "step": 18860 }, { "epoch": 1.709715425049846, "grad_norm": 0.8097763061523438, "learning_rate": 4.160186912972526e-05, "loss": 0.3606, "step": 18865 }, { "epoch": 1.710168569874932, "grad_norm": 0.8300080299377441, "learning_rate": 4.159595510510645e-05, "loss": 0.2765, "step": 18870 }, { "epoch": 1.710621714700018, "grad_norm": 0.9425070285797119, "learning_rate": 4.159003941955886e-05, "loss": 0.2436, "step": 18875 }, { "epoch": 1.7110748595251042, "grad_norm": 0.8222600221633911, "learning_rate": 4.1584122073674506e-05, "loss": 0.2329, "step": 18880 }, { "epoch": 1.7115280043501904, "grad_norm": 0.8131742477416992, "learning_rate": 4.1578203068045595e-05, "loss": 0.2562, "step": 18885 }, { "epoch": 1.7119811491752763, "grad_norm": 0.8033243417739868, "learning_rate": 4.157228240326453e-05, "loss": 0.2914, "step": 18890 }, { "epoch": 1.7124342940003625, "grad_norm": 0.8778545260429382, "learning_rate": 4.1566360079923847e-05, "loss": 0.2736, "step": 18895 }, { "epoch": 1.7128874388254487, "grad_norm": 0.9314966797828674, "learning_rate": 4.156043609861623e-05, "loss": 0.3152, "step": 18900 }, { "epoch": 1.7133405836505347, "grad_norm": 1.0026484727859497, "learning_rate": 4.1554510459934585e-05, "loss": 0.2911, "step": 18905 }, { "epoch": 1.7137937284756208, "grad_norm": 0.7808854579925537, "learning_rate": 4.1548583164471936e-05, "loss": 0.2697, "step": 18910 }, { "epoch": 1.714246873300707, "grad_norm": 0.8665362000465393, "learning_rate": 4.154265421282147e-05, "loss": 0.263, "step": 18915 }, { "epoch": 1.714700018125793, "grad_norm": 0.9673632979393005, "learning_rate": 4.15367236055766e-05, "loss": 0.284, "step": 18920 }, { "epoch": 1.715153162950879, "grad_norm": 0.8337221145629883, "learning_rate": 4.1530791343330833e-05, "loss": 0.29, "step": 18925 }, { "epoch": 1.7156063077759653, "grad_norm": 0.8264334797859192, "learning_rate": 4.152485742667788e-05, "loss": 0.2817, "step": 18930 }, { "epoch": 1.7160594526010513, "grad_norm": 0.966547429561615, "learning_rate": 4.151892185621161e-05, "loss": 0.3339, "step": 18935 }, { "epoch": 1.7165125974261373, "grad_norm": 0.8891755938529968, "learning_rate": 4.1512984632526065e-05, "loss": 0.2979, "step": 18940 }, { "epoch": 1.7169657422512234, "grad_norm": 0.8468058109283447, "learning_rate": 4.150704575621542e-05, "loss": 0.3051, "step": 18945 }, { "epoch": 1.7174188870763096, "grad_norm": 0.8013687133789062, "learning_rate": 4.150110522787407e-05, "loss": 0.2729, "step": 18950 }, { "epoch": 1.7178720319013956, "grad_norm": 0.8057905435562134, "learning_rate": 4.149516304809652e-05, "loss": 0.2793, "step": 18955 }, { "epoch": 1.7183251767264818, "grad_norm": 0.9327337741851807, "learning_rate": 4.148921921747748e-05, "loss": 0.2807, "step": 18960 }, { "epoch": 1.718778321551568, "grad_norm": 0.8352736234664917, "learning_rate": 4.1483273736611804e-05, "loss": 0.2666, "step": 18965 }, { "epoch": 1.719231466376654, "grad_norm": 1.2352648973464966, "learning_rate": 4.147732660609452e-05, "loss": 0.33, "step": 18970 }, { "epoch": 1.71968461120174, "grad_norm": 0.9205837845802307, "learning_rate": 4.147137782652083e-05, "loss": 0.2763, "step": 18975 }, { "epoch": 1.7201377560268263, "grad_norm": 0.8354939222335815, "learning_rate": 4.1465427398486066e-05, "loss": 0.2558, "step": 18980 }, { "epoch": 1.7205909008519122, "grad_norm": 0.9263020753860474, "learning_rate": 4.145947532258577e-05, "loss": 0.2626, "step": 18985 }, { "epoch": 1.7210440456769984, "grad_norm": 0.8566945195198059, "learning_rate": 4.145352159941562e-05, "loss": 0.334, "step": 18990 }, { "epoch": 1.7214971905020846, "grad_norm": 0.838605523109436, "learning_rate": 4.1447566229571455e-05, "loss": 0.2941, "step": 18995 }, { "epoch": 1.7219503353271706, "grad_norm": 0.8952492475509644, "learning_rate": 4.1441609213649313e-05, "loss": 0.2564, "step": 19000 }, { "epoch": 1.7224034801522565, "grad_norm": 0.7641306519508362, "learning_rate": 4.143565055224536e-05, "loss": 0.2829, "step": 19005 }, { "epoch": 1.7228566249773427, "grad_norm": 0.8753764629364014, "learning_rate": 4.1429690245955945e-05, "loss": 0.2702, "step": 19010 }, { "epoch": 1.723309769802429, "grad_norm": 0.7539977431297302, "learning_rate": 4.142372829537757e-05, "loss": 0.3079, "step": 19015 }, { "epoch": 1.7237629146275149, "grad_norm": 0.9514847993850708, "learning_rate": 4.141776470110692e-05, "loss": 0.3563, "step": 19020 }, { "epoch": 1.724216059452601, "grad_norm": 0.8034759163856506, "learning_rate": 4.141179946374082e-05, "loss": 0.3065, "step": 19025 }, { "epoch": 1.7246692042776872, "grad_norm": 0.8706058263778687, "learning_rate": 4.1405832583876294e-05, "loss": 0.294, "step": 19030 }, { "epoch": 1.7251223491027732, "grad_norm": 0.8641671538352966, "learning_rate": 4.139986406211049e-05, "loss": 0.2654, "step": 19035 }, { "epoch": 1.7255754939278594, "grad_norm": 0.824171245098114, "learning_rate": 4.139389389904075e-05, "loss": 0.2776, "step": 19040 }, { "epoch": 1.7260286387529455, "grad_norm": 0.9619285464286804, "learning_rate": 4.1387922095264566e-05, "loss": 0.3244, "step": 19045 }, { "epoch": 1.7264817835780315, "grad_norm": 0.9202738404273987, "learning_rate": 4.13819486513796e-05, "loss": 0.2965, "step": 19050 }, { "epoch": 1.7269349284031177, "grad_norm": 0.8614444732666016, "learning_rate": 4.137597356798367e-05, "loss": 0.2437, "step": 19055 }, { "epoch": 1.7273880732282039, "grad_norm": 0.7983126640319824, "learning_rate": 4.1369996845674775e-05, "loss": 0.2761, "step": 19060 }, { "epoch": 1.7278412180532898, "grad_norm": 0.8885471224784851, "learning_rate": 4.136401848505106e-05, "loss": 0.297, "step": 19065 }, { "epoch": 1.7282943628783758, "grad_norm": 0.8788526654243469, "learning_rate": 4.135803848671084e-05, "loss": 0.2747, "step": 19070 }, { "epoch": 1.728747507703462, "grad_norm": 0.9045390486717224, "learning_rate": 4.13520568512526e-05, "loss": 0.3007, "step": 19075 }, { "epoch": 1.7292006525285482, "grad_norm": 0.9322999715805054, "learning_rate": 4.134607357927498e-05, "loss": 0.3061, "step": 19080 }, { "epoch": 1.7296537973536341, "grad_norm": 0.8420912623405457, "learning_rate": 4.13400886713768e-05, "loss": 0.2898, "step": 19085 }, { "epoch": 1.7301069421787203, "grad_norm": 0.9027439951896667, "learning_rate": 4.133410212815701e-05, "loss": 0.3405, "step": 19090 }, { "epoch": 1.7305600870038065, "grad_norm": 0.8862640857696533, "learning_rate": 4.1328113950214764e-05, "loss": 0.2665, "step": 19095 }, { "epoch": 1.7310132318288924, "grad_norm": 0.8229019641876221, "learning_rate": 4.1322124138149354e-05, "loss": 0.3209, "step": 19100 }, { "epoch": 1.7314663766539786, "grad_norm": 0.8016148805618286, "learning_rate": 4.131613269256024e-05, "loss": 0.3179, "step": 19105 }, { "epoch": 1.7319195214790648, "grad_norm": 0.8752793073654175, "learning_rate": 4.131013961404706e-05, "loss": 0.3168, "step": 19110 }, { "epoch": 1.7323726663041508, "grad_norm": 0.8042363524436951, "learning_rate": 4.1304144903209585e-05, "loss": 0.2521, "step": 19115 }, { "epoch": 1.732825811129237, "grad_norm": 0.8657643795013428, "learning_rate": 4.1298148560647775e-05, "loss": 0.2942, "step": 19120 }, { "epoch": 1.7332789559543231, "grad_norm": 2.757603406906128, "learning_rate": 4.129215058696175e-05, "loss": 0.295, "step": 19125 }, { "epoch": 1.733732100779409, "grad_norm": 0.895244836807251, "learning_rate": 4.1286150982751784e-05, "loss": 0.3055, "step": 19130 }, { "epoch": 1.734185245604495, "grad_norm": 0.9114812016487122, "learning_rate": 4.128014974861832e-05, "loss": 0.3085, "step": 19135 }, { "epoch": 1.7346383904295815, "grad_norm": 0.869724690914154, "learning_rate": 4.127414688516198e-05, "loss": 0.3321, "step": 19140 }, { "epoch": 1.7350915352546674, "grad_norm": 0.843235194683075, "learning_rate": 4.1268142392983504e-05, "loss": 0.2993, "step": 19145 }, { "epoch": 1.7355446800797534, "grad_norm": 0.7203092575073242, "learning_rate": 4.1262136272683844e-05, "loss": 0.2544, "step": 19150 }, { "epoch": 1.7359978249048396, "grad_norm": 0.9166197180747986, "learning_rate": 4.125612852486409e-05, "loss": 0.2987, "step": 19155 }, { "epoch": 1.7364509697299257, "grad_norm": 0.9192290902137756, "learning_rate": 4.125011915012549e-05, "loss": 0.2832, "step": 19160 }, { "epoch": 1.7369041145550117, "grad_norm": 0.8300483226776123, "learning_rate": 4.124410814906948e-05, "loss": 0.3421, "step": 19165 }, { "epoch": 1.7373572593800979, "grad_norm": 0.9587914347648621, "learning_rate": 4.1238095522297626e-05, "loss": 0.3152, "step": 19170 }, { "epoch": 1.737810404205184, "grad_norm": 0.9005248546600342, "learning_rate": 4.123208127041169e-05, "loss": 0.3027, "step": 19175 }, { "epoch": 1.73826354903027, "grad_norm": 0.8289730548858643, "learning_rate": 4.1226065394013577e-05, "loss": 0.3067, "step": 19180 }, { "epoch": 1.7387166938553562, "grad_norm": 0.8641492128372192, "learning_rate": 4.122004789370534e-05, "loss": 0.3355, "step": 19185 }, { "epoch": 1.7391698386804424, "grad_norm": 0.8801600933074951, "learning_rate": 4.121402877008924e-05, "loss": 0.2671, "step": 19190 }, { "epoch": 1.7396229835055284, "grad_norm": 0.9612187147140503, "learning_rate": 4.120800802376766e-05, "loss": 0.2906, "step": 19195 }, { "epoch": 1.7400761283306143, "grad_norm": 0.8512090444564819, "learning_rate": 4.120198565534315e-05, "loss": 0.3201, "step": 19200 }, { "epoch": 1.7405292731557007, "grad_norm": 0.9186554551124573, "learning_rate": 4.119596166541844e-05, "loss": 0.3326, "step": 19205 }, { "epoch": 1.7409824179807867, "grad_norm": 0.9068862199783325, "learning_rate": 4.118993605459641e-05, "loss": 0.3089, "step": 19210 }, { "epoch": 1.7414355628058726, "grad_norm": 0.8158524632453918, "learning_rate": 4.118390882348011e-05, "loss": 0.3639, "step": 19215 }, { "epoch": 1.7418887076309588, "grad_norm": 0.8807220458984375, "learning_rate": 4.117787997267275e-05, "loss": 0.264, "step": 19220 }, { "epoch": 1.742341852456045, "grad_norm": 0.8771553039550781, "learning_rate": 4.117184950277769e-05, "loss": 0.2856, "step": 19225 }, { "epoch": 1.742794997281131, "grad_norm": 0.9085263609886169, "learning_rate": 4.116581741439845e-05, "loss": 0.2796, "step": 19230 }, { "epoch": 1.7432481421062171, "grad_norm": 0.8890429139137268, "learning_rate": 4.115978370813875e-05, "loss": 0.2758, "step": 19235 }, { "epoch": 1.7437012869313033, "grad_norm": 0.8065839409828186, "learning_rate": 4.115374838460244e-05, "loss": 0.2836, "step": 19240 }, { "epoch": 1.7441544317563893, "grad_norm": 0.8478485941886902, "learning_rate": 4.114771144439352e-05, "loss": 0.2613, "step": 19245 }, { "epoch": 1.7446075765814755, "grad_norm": 0.8940823078155518, "learning_rate": 4.114167288811619e-05, "loss": 0.2977, "step": 19250 }, { "epoch": 1.7450607214065617, "grad_norm": 0.8990182876586914, "learning_rate": 4.1135632716374775e-05, "loss": 0.2898, "step": 19255 }, { "epoch": 1.7455138662316476, "grad_norm": 0.8463213443756104, "learning_rate": 4.112959092977378e-05, "loss": 0.2586, "step": 19260 }, { "epoch": 1.7459670110567336, "grad_norm": 0.8546347618103027, "learning_rate": 4.112354752891788e-05, "loss": 0.2756, "step": 19265 }, { "epoch": 1.74642015588182, "grad_norm": 0.7959683537483215, "learning_rate": 4.111750251441189e-05, "loss": 0.2693, "step": 19270 }, { "epoch": 1.746873300706906, "grad_norm": 1.044423222541809, "learning_rate": 4.1111455886860793e-05, "loss": 0.2765, "step": 19275 }, { "epoch": 1.747326445531992, "grad_norm": 0.8735219240188599, "learning_rate": 4.110540764686975e-05, "loss": 0.3048, "step": 19280 }, { "epoch": 1.747779590357078, "grad_norm": 0.8124662637710571, "learning_rate": 4.109935779504406e-05, "loss": 0.268, "step": 19285 }, { "epoch": 1.7482327351821643, "grad_norm": 0.8907108306884766, "learning_rate": 4.1093306331989215e-05, "loss": 0.2537, "step": 19290 }, { "epoch": 1.7486858800072502, "grad_norm": 1.2577956914901733, "learning_rate": 4.1087253258310815e-05, "loss": 0.3442, "step": 19295 }, { "epoch": 1.7491390248323364, "grad_norm": 0.8160061240196228, "learning_rate": 4.108119857461468e-05, "loss": 0.3193, "step": 19300 }, { "epoch": 1.7495921696574226, "grad_norm": 0.9688681364059448, "learning_rate": 4.107514228150675e-05, "loss": 0.311, "step": 19305 }, { "epoch": 1.7500453144825086, "grad_norm": 0.9194340109825134, "learning_rate": 4.1069084379593146e-05, "loss": 0.3103, "step": 19310 }, { "epoch": 1.7504984593075947, "grad_norm": 0.8364298343658447, "learning_rate": 4.106302486948015e-05, "loss": 0.2653, "step": 19315 }, { "epoch": 1.750951604132681, "grad_norm": 0.9961747527122498, "learning_rate": 4.1056963751774194e-05, "loss": 0.3038, "step": 19320 }, { "epoch": 1.7514047489577669, "grad_norm": 0.8632218241691589, "learning_rate": 4.105090102708187e-05, "loss": 0.3324, "step": 19325 }, { "epoch": 1.7518578937828528, "grad_norm": 0.8014705181121826, "learning_rate": 4.104483669600995e-05, "loss": 0.2769, "step": 19330 }, { "epoch": 1.7523110386079392, "grad_norm": 0.89472496509552, "learning_rate": 4.1038770759165356e-05, "loss": 0.307, "step": 19335 }, { "epoch": 1.7527641834330252, "grad_norm": 0.8721039891242981, "learning_rate": 4.103270321715516e-05, "loss": 0.3068, "step": 19340 }, { "epoch": 1.7532173282581112, "grad_norm": 0.8872756958007812, "learning_rate": 4.10266340705866e-05, "loss": 0.2881, "step": 19345 }, { "epoch": 1.7536704730831973, "grad_norm": 0.8691482543945312, "learning_rate": 4.102056332006708e-05, "loss": 0.2646, "step": 19350 }, { "epoch": 1.7541236179082835, "grad_norm": 0.8699742555618286, "learning_rate": 4.1014490966204185e-05, "loss": 0.2619, "step": 19355 }, { "epoch": 1.7545767627333695, "grad_norm": 0.8756722807884216, "learning_rate": 4.100841700960561e-05, "loss": 0.2994, "step": 19360 }, { "epoch": 1.7550299075584557, "grad_norm": 0.8235260248184204, "learning_rate": 4.100234145087925e-05, "loss": 0.2862, "step": 19365 }, { "epoch": 1.7554830523835419, "grad_norm": 0.8953651189804077, "learning_rate": 4.0996264290633146e-05, "loss": 0.2821, "step": 19370 }, { "epoch": 1.7559361972086278, "grad_norm": 0.8399690985679626, "learning_rate": 4.09901855294755e-05, "loss": 0.2818, "step": 19375 }, { "epoch": 1.756389342033714, "grad_norm": 0.9054051637649536, "learning_rate": 4.098410516801469e-05, "loss": 0.3166, "step": 19380 }, { "epoch": 1.7568424868588002, "grad_norm": 0.795894980430603, "learning_rate": 4.097802320685923e-05, "loss": 0.2684, "step": 19385 }, { "epoch": 1.7572956316838861, "grad_norm": 0.8400864601135254, "learning_rate": 4.09719396466178e-05, "loss": 0.2598, "step": 19390 }, { "epoch": 1.7577487765089723, "grad_norm": 0.9609578251838684, "learning_rate": 4.096585448789926e-05, "loss": 0.3812, "step": 19395 }, { "epoch": 1.7582019213340585, "grad_norm": 0.8947471380233765, "learning_rate": 4.09597677313126e-05, "loss": 0.2896, "step": 19400 }, { "epoch": 1.7586550661591445, "grad_norm": 0.9471061825752258, "learning_rate": 4.095367937746699e-05, "loss": 0.2598, "step": 19405 }, { "epoch": 1.7591082109842304, "grad_norm": 0.7485555410385132, "learning_rate": 4.0947589426971755e-05, "loss": 0.2548, "step": 19410 }, { "epoch": 1.7595613558093166, "grad_norm": 0.8224881291389465, "learning_rate": 4.094149788043638e-05, "loss": 0.2902, "step": 19415 }, { "epoch": 1.7600145006344028, "grad_norm": 0.894382894039154, "learning_rate": 4.093540473847051e-05, "loss": 0.3126, "step": 19420 }, { "epoch": 1.7604676454594888, "grad_norm": 0.8527883291244507, "learning_rate": 4.092931000168394e-05, "loss": 0.2821, "step": 19425 }, { "epoch": 1.760920790284575, "grad_norm": 0.9927448034286499, "learning_rate": 4.092321367068664e-05, "loss": 0.3415, "step": 19430 }, { "epoch": 1.7613739351096611, "grad_norm": 0.8739977478981018, "learning_rate": 4.091711574608874e-05, "loss": 0.3233, "step": 19435 }, { "epoch": 1.761827079934747, "grad_norm": 0.9428513646125793, "learning_rate": 4.091101622850051e-05, "loss": 0.305, "step": 19440 }, { "epoch": 1.7622802247598333, "grad_norm": 0.8301167488098145, "learning_rate": 4.0904915118532405e-05, "loss": 0.28, "step": 19445 }, { "epoch": 1.7627333695849194, "grad_norm": 0.845911979675293, "learning_rate": 4.0898812416795005e-05, "loss": 0.2338, "step": 19450 }, { "epoch": 1.7631865144100054, "grad_norm": 0.8963679075241089, "learning_rate": 4.08927081238991e-05, "loss": 0.3105, "step": 19455 }, { "epoch": 1.7636396592350916, "grad_norm": 0.9004165530204773, "learning_rate": 4.0886602240455586e-05, "loss": 0.2915, "step": 19460 }, { "epoch": 1.7640928040601778, "grad_norm": 0.8657984137535095, "learning_rate": 4.0880494767075544e-05, "loss": 0.2672, "step": 19465 }, { "epoch": 1.7645459488852637, "grad_norm": 0.9474828243255615, "learning_rate": 4.087438570437022e-05, "loss": 0.2819, "step": 19470 }, { "epoch": 1.7649990937103497, "grad_norm": 0.9007276296615601, "learning_rate": 4.086827505295101e-05, "loss": 0.2983, "step": 19475 }, { "epoch": 1.765452238535436, "grad_norm": 0.86380535364151, "learning_rate": 4.086216281342947e-05, "loss": 0.31, "step": 19480 }, { "epoch": 1.765905383360522, "grad_norm": 0.7730414867401123, "learning_rate": 4.0856048986417316e-05, "loss": 0.3144, "step": 19485 }, { "epoch": 1.766358528185608, "grad_norm": 0.7939407229423523, "learning_rate": 4.084993357252641e-05, "loss": 0.2803, "step": 19490 }, { "epoch": 1.7668116730106942, "grad_norm": 0.8385398983955383, "learning_rate": 4.08438165723688e-05, "loss": 0.257, "step": 19495 }, { "epoch": 1.7672648178357804, "grad_norm": 0.7685638070106506, "learning_rate": 4.083769798655668e-05, "loss": 0.2748, "step": 19500 }, { "epoch": 1.7677179626608663, "grad_norm": 0.8656795620918274, "learning_rate": 4.083157781570238e-05, "loss": 0.2496, "step": 19505 }, { "epoch": 1.7681711074859525, "grad_norm": 0.986110270023346, "learning_rate": 4.082545606041842e-05, "loss": 0.2792, "step": 19510 }, { "epoch": 1.7686242523110387, "grad_norm": 0.8804507851600647, "learning_rate": 4.0819332721317474e-05, "loss": 0.317, "step": 19515 }, { "epoch": 1.7690773971361247, "grad_norm": 0.868642270565033, "learning_rate": 4.0813207799012356e-05, "loss": 0.2652, "step": 19520 }, { "epoch": 1.7695305419612108, "grad_norm": 0.8954640030860901, "learning_rate": 4.0807081294116056e-05, "loss": 0.2947, "step": 19525 }, { "epoch": 1.769983686786297, "grad_norm": 0.9527991414070129, "learning_rate": 4.080095320724172e-05, "loss": 0.293, "step": 19530 }, { "epoch": 1.770436831611383, "grad_norm": 0.8522401452064514, "learning_rate": 4.0794823539002634e-05, "loss": 0.2741, "step": 19535 }, { "epoch": 1.770889976436469, "grad_norm": 0.949901282787323, "learning_rate": 4.0788692290012286e-05, "loss": 0.2763, "step": 19540 }, { "epoch": 1.7713431212615554, "grad_norm": 0.7536320686340332, "learning_rate": 4.078255946088426e-05, "loss": 0.3407, "step": 19545 }, { "epoch": 1.7717962660866413, "grad_norm": 0.8844178915023804, "learning_rate": 4.077642505223236e-05, "loss": 0.2914, "step": 19550 }, { "epoch": 1.7722494109117273, "grad_norm": 0.8629109263420105, "learning_rate": 4.0770289064670506e-05, "loss": 0.2766, "step": 19555 }, { "epoch": 1.7727025557368135, "grad_norm": 0.8532384634017944, "learning_rate": 4.076415149881279e-05, "loss": 0.2849, "step": 19560 }, { "epoch": 1.7731557005618996, "grad_norm": 0.9166068434715271, "learning_rate": 4.075801235527346e-05, "loss": 0.2799, "step": 19565 }, { "epoch": 1.7736088453869856, "grad_norm": 0.8155298829078674, "learning_rate": 4.0751871634666926e-05, "loss": 0.2913, "step": 19570 }, { "epoch": 1.7740619902120718, "grad_norm": 0.9319400191307068, "learning_rate": 4.074572933760776e-05, "loss": 0.2627, "step": 19575 }, { "epoch": 1.774515135037158, "grad_norm": 0.8003904819488525, "learning_rate": 4.073958546471067e-05, "loss": 0.3383, "step": 19580 }, { "epoch": 1.774968279862244, "grad_norm": 0.8449223041534424, "learning_rate": 4.073344001659056e-05, "loss": 0.2779, "step": 19585 }, { "epoch": 1.77542142468733, "grad_norm": 0.9731044173240662, "learning_rate": 4.072729299386244e-05, "loss": 0.3158, "step": 19590 }, { "epoch": 1.7758745695124163, "grad_norm": 0.8624690175056458, "learning_rate": 4.0721144397141534e-05, "loss": 0.2968, "step": 19595 }, { "epoch": 1.7763277143375023, "grad_norm": 0.89542156457901, "learning_rate": 4.0714994227043166e-05, "loss": 0.2903, "step": 19600 }, { "epoch": 1.7767808591625882, "grad_norm": 0.8328409194946289, "learning_rate": 4.070884248418287e-05, "loss": 0.3032, "step": 19605 }, { "epoch": 1.7772340039876746, "grad_norm": 0.9559199810028076, "learning_rate": 4.0702689169176314e-05, "loss": 0.3324, "step": 19610 }, { "epoch": 1.7776871488127606, "grad_norm": 0.9410372972488403, "learning_rate": 4.0696534282639315e-05, "loss": 0.2969, "step": 19615 }, { "epoch": 1.7781402936378465, "grad_norm": 0.8242523670196533, "learning_rate": 4.0690377825187855e-05, "loss": 0.2488, "step": 19620 }, { "epoch": 1.7785934384629327, "grad_norm": 0.837493360042572, "learning_rate": 4.068421979743808e-05, "loss": 0.3108, "step": 19625 }, { "epoch": 1.779046583288019, "grad_norm": 0.8175445199012756, "learning_rate": 4.067806020000629e-05, "loss": 0.2647, "step": 19630 }, { "epoch": 1.7794997281131049, "grad_norm": 0.7757850885391235, "learning_rate": 4.067189903350893e-05, "loss": 0.2738, "step": 19635 }, { "epoch": 1.779952872938191, "grad_norm": 0.9167371988296509, "learning_rate": 4.066573629856262e-05, "loss": 0.2507, "step": 19640 }, { "epoch": 1.7804060177632772, "grad_norm": 0.869949460029602, "learning_rate": 4.0659571995784123e-05, "loss": 0.2768, "step": 19645 }, { "epoch": 1.7808591625883632, "grad_norm": 0.7779558300971985, "learning_rate": 4.065340612579036e-05, "loss": 0.2893, "step": 19650 }, { "epoch": 1.7813123074134494, "grad_norm": 0.8805973529815674, "learning_rate": 4.064723868919843e-05, "loss": 0.3138, "step": 19655 }, { "epoch": 1.7817654522385356, "grad_norm": 0.8412729501724243, "learning_rate": 4.064106968662556e-05, "loss": 0.2842, "step": 19660 }, { "epoch": 1.7822185970636215, "grad_norm": 0.7931427359580994, "learning_rate": 4.0634899118689154e-05, "loss": 0.3125, "step": 19665 }, { "epoch": 1.7826717418887075, "grad_norm": 0.8866362571716309, "learning_rate": 4.062872698600675e-05, "loss": 0.3126, "step": 19670 }, { "epoch": 1.7831248867137939, "grad_norm": 0.7980650663375854, "learning_rate": 4.062255328919607e-05, "loss": 0.2628, "step": 19675 }, { "epoch": 1.7835780315388798, "grad_norm": 0.9059445858001709, "learning_rate": 4.061637802887497e-05, "loss": 0.2934, "step": 19680 }, { "epoch": 1.7840311763639658, "grad_norm": 0.9297138452529907, "learning_rate": 4.0610201205661486e-05, "loss": 0.2955, "step": 19685 }, { "epoch": 1.784484321189052, "grad_norm": 0.9054891467094421, "learning_rate": 4.060402282017379e-05, "loss": 0.298, "step": 19690 }, { "epoch": 1.7849374660141382, "grad_norm": 0.8344095349311829, "learning_rate": 4.0597842873030206e-05, "loss": 0.2542, "step": 19695 }, { "epoch": 1.7853906108392241, "grad_norm": 0.8501378893852234, "learning_rate": 4.059166136484924e-05, "loss": 0.2727, "step": 19700 }, { "epoch": 1.7858437556643103, "grad_norm": 0.8530305027961731, "learning_rate": 4.058547829624953e-05, "loss": 0.2777, "step": 19705 }, { "epoch": 1.7862969004893965, "grad_norm": 0.9148552417755127, "learning_rate": 4.0579293667849884e-05, "loss": 0.3522, "step": 19710 }, { "epoch": 1.7867500453144824, "grad_norm": 0.8254474997520447, "learning_rate": 4.0573107480269265e-05, "loss": 0.255, "step": 19715 }, { "epoch": 1.7872031901395686, "grad_norm": 0.9540396928787231, "learning_rate": 4.056691973412679e-05, "loss": 0.2714, "step": 19720 }, { "epoch": 1.7876563349646548, "grad_norm": 0.8593186140060425, "learning_rate": 4.0560730430041705e-05, "loss": 0.3018, "step": 19725 }, { "epoch": 1.7881094797897408, "grad_norm": 0.8867223858833313, "learning_rate": 4.055453956863348e-05, "loss": 0.309, "step": 19730 }, { "epoch": 1.7885626246148267, "grad_norm": 0.8222487568855286, "learning_rate": 4.0548347150521666e-05, "loss": 0.3183, "step": 19735 }, { "epoch": 1.7890157694399131, "grad_norm": 0.8349538445472717, "learning_rate": 4.054215317632601e-05, "loss": 0.2877, "step": 19740 }, { "epoch": 1.789468914264999, "grad_norm": 0.9321218729019165, "learning_rate": 4.053595764666642e-05, "loss": 0.2649, "step": 19745 }, { "epoch": 1.789922059090085, "grad_norm": 0.8236182928085327, "learning_rate": 4.052976056216293e-05, "loss": 0.2655, "step": 19750 }, { "epoch": 1.7903752039151712, "grad_norm": 0.8528435230255127, "learning_rate": 4.052356192343576e-05, "loss": 0.2854, "step": 19755 }, { "epoch": 1.7908283487402574, "grad_norm": 0.8420412540435791, "learning_rate": 4.051736173110525e-05, "loss": 0.2746, "step": 19760 }, { "epoch": 1.7912814935653434, "grad_norm": 0.8575788140296936, "learning_rate": 4.051115998579195e-05, "loss": 0.2937, "step": 19765 }, { "epoch": 1.7917346383904296, "grad_norm": 0.9486353993415833, "learning_rate": 4.05049566881165e-05, "loss": 0.3132, "step": 19770 }, { "epoch": 1.7921877832155158, "grad_norm": 0.836327850818634, "learning_rate": 4.0498751838699754e-05, "loss": 0.276, "step": 19775 }, { "epoch": 1.7926409280406017, "grad_norm": 0.9049779176712036, "learning_rate": 4.0492545438162685e-05, "loss": 0.2619, "step": 19780 }, { "epoch": 1.793094072865688, "grad_norm": 0.9477555155754089, "learning_rate": 4.048633748712643e-05, "loss": 0.3198, "step": 19785 }, { "epoch": 1.793547217690774, "grad_norm": 0.9112554788589478, "learning_rate": 4.048012798621228e-05, "loss": 0.3191, "step": 19790 }, { "epoch": 1.79400036251586, "grad_norm": 0.8543556928634644, "learning_rate": 4.0473916936041686e-05, "loss": 0.2942, "step": 19795 }, { "epoch": 1.7944535073409462, "grad_norm": 0.9214422702789307, "learning_rate": 4.0467704337236264e-05, "loss": 0.2484, "step": 19800 }, { "epoch": 1.7949066521660324, "grad_norm": 0.8629183173179626, "learning_rate": 4.046149019041775e-05, "loss": 0.2908, "step": 19805 }, { "epoch": 1.7953597969911184, "grad_norm": 0.873383641242981, "learning_rate": 4.045527449620808e-05, "loss": 0.2668, "step": 19810 }, { "epoch": 1.7958129418162043, "grad_norm": 0.9253928661346436, "learning_rate": 4.044905725522931e-05, "loss": 0.268, "step": 19815 }, { "epoch": 1.7962660866412905, "grad_norm": 0.9320300817489624, "learning_rate": 4.0442838468103674e-05, "loss": 0.3051, "step": 19820 }, { "epoch": 1.7967192314663767, "grad_norm": 0.9781197309494019, "learning_rate": 4.043661813545354e-05, "loss": 0.3212, "step": 19825 }, { "epoch": 1.7971723762914626, "grad_norm": 0.7362196445465088, "learning_rate": 4.0430396257901435e-05, "loss": 0.2688, "step": 19830 }, { "epoch": 1.7976255211165488, "grad_norm": 0.8454239368438721, "learning_rate": 4.0424172836070066e-05, "loss": 0.3023, "step": 19835 }, { "epoch": 1.798078665941635, "grad_norm": 0.8720277547836304, "learning_rate": 4.0417947870582266e-05, "loss": 0.2698, "step": 19840 }, { "epoch": 1.798531810766721, "grad_norm": 0.9214737415313721, "learning_rate": 4.041172136206103e-05, "loss": 0.2763, "step": 19845 }, { "epoch": 1.7989849555918072, "grad_norm": 0.8881752490997314, "learning_rate": 4.040549331112951e-05, "loss": 0.2576, "step": 19850 }, { "epoch": 1.7994381004168933, "grad_norm": 0.9463508725166321, "learning_rate": 4.039926371841101e-05, "loss": 0.2795, "step": 19855 }, { "epoch": 1.7998912452419793, "grad_norm": 0.7982717156410217, "learning_rate": 4.039303258452899e-05, "loss": 0.2389, "step": 19860 }, { "epoch": 1.8003443900670655, "grad_norm": 0.8630609512329102, "learning_rate": 4.038679991010707e-05, "loss": 0.2485, "step": 19865 }, { "epoch": 1.8007975348921517, "grad_norm": 0.9026185274124146, "learning_rate": 4.038056569576901e-05, "loss": 0.3014, "step": 19870 }, { "epoch": 1.8012506797172376, "grad_norm": 0.8731305599212646, "learning_rate": 4.037432994213874e-05, "loss": 0.2973, "step": 19875 }, { "epoch": 1.8017038245423236, "grad_norm": 0.7154305577278137, "learning_rate": 4.0368092649840325e-05, "loss": 0.2547, "step": 19880 }, { "epoch": 1.80215696936741, "grad_norm": 1.050809383392334, "learning_rate": 4.036185381949801e-05, "loss": 0.3292, "step": 19885 }, { "epoch": 1.802610114192496, "grad_norm": 0.8531519174575806, "learning_rate": 4.035561345173616e-05, "loss": 0.3002, "step": 19890 }, { "epoch": 1.803063259017582, "grad_norm": 0.7816852927207947, "learning_rate": 4.034937154717934e-05, "loss": 0.267, "step": 19895 }, { "epoch": 1.803516403842668, "grad_norm": 0.8747511506080627, "learning_rate": 4.034312810645222e-05, "loss": 0.2896, "step": 19900 }, { "epoch": 1.8039695486677543, "grad_norm": 0.8207461833953857, "learning_rate": 4.033688313017965e-05, "loss": 0.2771, "step": 19905 }, { "epoch": 1.8044226934928402, "grad_norm": 0.8870819211006165, "learning_rate": 4.033063661898663e-05, "loss": 0.2985, "step": 19910 }, { "epoch": 1.8048758383179264, "grad_norm": 0.8634806275367737, "learning_rate": 4.032438857349832e-05, "loss": 0.2618, "step": 19915 }, { "epoch": 1.8053289831430126, "grad_norm": 0.9131994843482971, "learning_rate": 4.031813899434002e-05, "loss": 0.2815, "step": 19920 }, { "epoch": 1.8057821279680986, "grad_norm": 0.8653879761695862, "learning_rate": 4.031188788213719e-05, "loss": 0.2866, "step": 19925 }, { "epoch": 1.8062352727931847, "grad_norm": 0.7925137281417847, "learning_rate": 4.030563523751544e-05, "loss": 0.2806, "step": 19930 }, { "epoch": 1.806688417618271, "grad_norm": 0.7431967854499817, "learning_rate": 4.029938106110055e-05, "loss": 0.2784, "step": 19935 }, { "epoch": 1.8071415624433569, "grad_norm": 0.9711384773254395, "learning_rate": 4.029312535351843e-05, "loss": 0.3062, "step": 19940 }, { "epoch": 1.8075947072684428, "grad_norm": 0.8710916042327881, "learning_rate": 4.028686811539515e-05, "loss": 0.315, "step": 19945 }, { "epoch": 1.8080478520935293, "grad_norm": 0.996192991733551, "learning_rate": 4.028060934735694e-05, "loss": 0.278, "step": 19950 }, { "epoch": 1.8085009969186152, "grad_norm": 0.8347010612487793, "learning_rate": 4.027434905003019e-05, "loss": 0.2381, "step": 19955 }, { "epoch": 1.8089541417437012, "grad_norm": 0.9478174448013306, "learning_rate": 4.026808722404142e-05, "loss": 0.3071, "step": 19960 }, { "epoch": 1.8094072865687874, "grad_norm": 0.951339066028595, "learning_rate": 4.026182387001731e-05, "loss": 0.2825, "step": 19965 }, { "epoch": 1.8098604313938735, "grad_norm": 0.8074339628219604, "learning_rate": 4.0255558988584716e-05, "loss": 0.2994, "step": 19970 }, { "epoch": 1.8103135762189595, "grad_norm": 0.88193279504776, "learning_rate": 4.0249292580370623e-05, "loss": 0.2514, "step": 19975 }, { "epoch": 1.8107667210440457, "grad_norm": 1.0997896194458008, "learning_rate": 4.024302464600217e-05, "loss": 0.2764, "step": 19980 }, { "epoch": 1.8112198658691319, "grad_norm": 0.8447313904762268, "learning_rate": 4.023675518610666e-05, "loss": 0.2944, "step": 19985 }, { "epoch": 1.8116730106942178, "grad_norm": 0.7245734333992004, "learning_rate": 4.023048420131154e-05, "loss": 0.2601, "step": 19990 }, { "epoch": 1.812126155519304, "grad_norm": 0.9647153615951538, "learning_rate": 4.0224211692244415e-05, "loss": 0.3049, "step": 19995 }, { "epoch": 1.8125793003443902, "grad_norm": 0.9666344523429871, "learning_rate": 4.021793765953303e-05, "loss": 0.263, "step": 20000 }, { "epoch": 1.8130324451694761, "grad_norm": 0.9517194032669067, "learning_rate": 4.02116621038053e-05, "loss": 0.2862, "step": 20005 }, { "epoch": 1.813485589994562, "grad_norm": 0.8090932369232178, "learning_rate": 4.020538502568929e-05, "loss": 0.2669, "step": 20010 }, { "epoch": 1.8139387348196485, "grad_norm": 0.9002370834350586, "learning_rate": 4.01991064258132e-05, "loss": 0.2859, "step": 20015 }, { "epoch": 1.8143918796447345, "grad_norm": 0.8839691877365112, "learning_rate": 4.0192826304805406e-05, "loss": 0.2723, "step": 20020 }, { "epoch": 1.8148450244698204, "grad_norm": 0.8844120502471924, "learning_rate": 4.018654466329442e-05, "loss": 0.3344, "step": 20025 }, { "epoch": 1.8152981692949066, "grad_norm": 0.8383800387382507, "learning_rate": 4.018026150190891e-05, "loss": 0.2561, "step": 20030 }, { "epoch": 1.8157513141199928, "grad_norm": 0.8484800457954407, "learning_rate": 4.017397682127769e-05, "loss": 0.3276, "step": 20035 }, { "epoch": 1.8162044589450788, "grad_norm": 0.9023807644844055, "learning_rate": 4.0167690622029755e-05, "loss": 0.2831, "step": 20040 }, { "epoch": 1.816657603770165, "grad_norm": 0.7771007418632507, "learning_rate": 4.01614029047942e-05, "loss": 0.2847, "step": 20045 }, { "epoch": 1.8171107485952511, "grad_norm": 0.8680269718170166, "learning_rate": 4.0155113670200325e-05, "loss": 0.3253, "step": 20050 }, { "epoch": 1.817563893420337, "grad_norm": 0.8731369972229004, "learning_rate": 4.014882291887755e-05, "loss": 0.3058, "step": 20055 }, { "epoch": 1.8180170382454233, "grad_norm": 0.7756339311599731, "learning_rate": 4.014253065145546e-05, "loss": 0.274, "step": 20060 }, { "epoch": 1.8184701830705094, "grad_norm": 1.014328956604004, "learning_rate": 4.013623686856377e-05, "loss": 0.2681, "step": 20065 }, { "epoch": 1.8189233278955954, "grad_norm": 0.8735911846160889, "learning_rate": 4.0129941570832385e-05, "loss": 0.2747, "step": 20070 }, { "epoch": 1.8193764727206814, "grad_norm": 0.9076184630393982, "learning_rate": 4.012364475889133e-05, "loss": 0.2923, "step": 20075 }, { "epoch": 1.8198296175457678, "grad_norm": 0.8892605304718018, "learning_rate": 4.01173464333708e-05, "loss": 0.2757, "step": 20080 }, { "epoch": 1.8202827623708537, "grad_norm": 0.8838591575622559, "learning_rate": 4.011104659490113e-05, "loss": 0.2978, "step": 20085 }, { "epoch": 1.8207359071959397, "grad_norm": 0.9002130031585693, "learning_rate": 4.010474524411281e-05, "loss": 0.2944, "step": 20090 }, { "epoch": 1.8211890520210259, "grad_norm": 0.8307431936264038, "learning_rate": 4.009844238163647e-05, "loss": 0.2693, "step": 20095 }, { "epoch": 1.821642196846112, "grad_norm": 0.9040517210960388, "learning_rate": 4.009213800810292e-05, "loss": 0.2742, "step": 20100 }, { "epoch": 1.822095341671198, "grad_norm": 0.7505114078521729, "learning_rate": 4.008583212414309e-05, "loss": 0.2534, "step": 20105 }, { "epoch": 1.8225484864962842, "grad_norm": 0.8935396671295166, "learning_rate": 4.007952473038809e-05, "loss": 0.3015, "step": 20110 }, { "epoch": 1.8230016313213704, "grad_norm": 0.9573212265968323, "learning_rate": 4.0073215827469145e-05, "loss": 0.2825, "step": 20115 }, { "epoch": 1.8234547761464563, "grad_norm": 1.0180537700653076, "learning_rate": 4.006690541601767e-05, "loss": 0.3118, "step": 20120 }, { "epoch": 1.8239079209715425, "grad_norm": 0.8490697145462036, "learning_rate": 4.006059349666522e-05, "loss": 0.2372, "step": 20125 }, { "epoch": 1.8243610657966287, "grad_norm": 0.8722063899040222, "learning_rate": 4.005428007004347e-05, "loss": 0.297, "step": 20130 }, { "epoch": 1.8248142106217147, "grad_norm": 0.8399658203125, "learning_rate": 4.0047965136784285e-05, "loss": 0.2377, "step": 20135 }, { "epoch": 1.8252673554468006, "grad_norm": 1.0009249448776245, "learning_rate": 4.004164869751965e-05, "loss": 0.2696, "step": 20140 }, { "epoch": 1.825720500271887, "grad_norm": 0.8745613098144531, "learning_rate": 4.003533075288175e-05, "loss": 0.2863, "step": 20145 }, { "epoch": 1.826173645096973, "grad_norm": 0.8723413944244385, "learning_rate": 4.002901130350285e-05, "loss": 0.2586, "step": 20150 }, { "epoch": 1.826626789922059, "grad_norm": 0.8947932720184326, "learning_rate": 4.0022690350015425e-05, "loss": 0.2779, "step": 20155 }, { "epoch": 1.8270799347471451, "grad_norm": 0.7663194537162781, "learning_rate": 4.0016367893052066e-05, "loss": 0.3337, "step": 20160 }, { "epoch": 1.8275330795722313, "grad_norm": 0.919926106929779, "learning_rate": 4.0010043933245535e-05, "loss": 0.2712, "step": 20165 }, { "epoch": 1.8279862243973173, "grad_norm": 0.9051507711410522, "learning_rate": 4.000371847122873e-05, "loss": 0.3322, "step": 20170 }, { "epoch": 1.8284393692224035, "grad_norm": 0.953317403793335, "learning_rate": 3.9997391507634705e-05, "loss": 0.2891, "step": 20175 }, { "epoch": 1.8288925140474896, "grad_norm": 0.9112091660499573, "learning_rate": 3.999106304309668e-05, "loss": 0.2929, "step": 20180 }, { "epoch": 1.8293456588725756, "grad_norm": 0.909342885017395, "learning_rate": 3.998473307824799e-05, "loss": 0.3239, "step": 20185 }, { "epoch": 1.8297988036976618, "grad_norm": 0.8533919453620911, "learning_rate": 3.9978401613722146e-05, "loss": 0.2964, "step": 20190 }, { "epoch": 1.830251948522748, "grad_norm": 0.9508411884307861, "learning_rate": 3.997206865015282e-05, "loss": 0.3062, "step": 20195 }, { "epoch": 1.830705093347834, "grad_norm": 0.9700195789337158, "learning_rate": 3.9965734188173786e-05, "loss": 0.2953, "step": 20200 }, { "epoch": 1.8311582381729201, "grad_norm": 0.846572756767273, "learning_rate": 3.995939822841902e-05, "loss": 0.2311, "step": 20205 }, { "epoch": 1.8316113829980063, "grad_norm": 1.0039548873901367, "learning_rate": 3.995306077152262e-05, "loss": 0.2921, "step": 20210 }, { "epoch": 1.8320645278230923, "grad_norm": 0.8227837681770325, "learning_rate": 3.994672181811884e-05, "loss": 0.2829, "step": 20215 }, { "epoch": 1.8325176726481782, "grad_norm": 0.7754899263381958, "learning_rate": 3.994038136884209e-05, "loss": 0.2659, "step": 20220 }, { "epoch": 1.8329708174732644, "grad_norm": 1.0213215351104736, "learning_rate": 3.9934039424326936e-05, "loss": 0.2829, "step": 20225 }, { "epoch": 1.8334239622983506, "grad_norm": 0.8166643381118774, "learning_rate": 3.9927695985208057e-05, "loss": 0.2428, "step": 20230 }, { "epoch": 1.8338771071234365, "grad_norm": 0.9038397669792175, "learning_rate": 3.9921351052120315e-05, "loss": 0.2994, "step": 20235 }, { "epoch": 1.8343302519485227, "grad_norm": 0.8829077482223511, "learning_rate": 3.9915004625698716e-05, "loss": 0.3154, "step": 20240 }, { "epoch": 1.834783396773609, "grad_norm": 0.7198551297187805, "learning_rate": 3.990865670657842e-05, "loss": 0.243, "step": 20245 }, { "epoch": 1.8352365415986949, "grad_norm": 0.9305186867713928, "learning_rate": 3.9902307295394716e-05, "loss": 0.2958, "step": 20250 }, { "epoch": 1.835689686423781, "grad_norm": 0.85170978307724, "learning_rate": 3.9895956392783054e-05, "loss": 0.3177, "step": 20255 }, { "epoch": 1.8361428312488672, "grad_norm": 0.9899299740791321, "learning_rate": 3.9889603999379046e-05, "loss": 0.3115, "step": 20260 }, { "epoch": 1.8365959760739532, "grad_norm": 0.8306777477264404, "learning_rate": 3.9883250115818435e-05, "loss": 0.2884, "step": 20265 }, { "epoch": 1.8370491208990394, "grad_norm": 1.0299568176269531, "learning_rate": 3.9876894742737124e-05, "loss": 0.32, "step": 20270 }, { "epoch": 1.8375022657241256, "grad_norm": 0.7788583636283875, "learning_rate": 3.9870537880771156e-05, "loss": 0.2475, "step": 20275 }, { "epoch": 1.8379554105492115, "grad_norm": 1.0517712831497192, "learning_rate": 3.986417953055673e-05, "loss": 0.2937, "step": 20280 }, { "epoch": 1.8384085553742975, "grad_norm": 0.8863223791122437, "learning_rate": 3.985781969273019e-05, "loss": 0.3079, "step": 20285 }, { "epoch": 1.8388617001993839, "grad_norm": 0.8132122159004211, "learning_rate": 3.985145836792804e-05, "loss": 0.2911, "step": 20290 }, { "epoch": 1.8393148450244698, "grad_norm": 0.9436118006706238, "learning_rate": 3.9845095556786905e-05, "loss": 0.2841, "step": 20295 }, { "epoch": 1.8397679898495558, "grad_norm": 0.851689338684082, "learning_rate": 3.98387312599436e-05, "loss": 0.2618, "step": 20300 }, { "epoch": 1.840221134674642, "grad_norm": 0.8547185063362122, "learning_rate": 3.9832365478035044e-05, "loss": 0.2711, "step": 20305 }, { "epoch": 1.8406742794997282, "grad_norm": 0.8707652688026428, "learning_rate": 3.982599821169834e-05, "loss": 0.2512, "step": 20310 }, { "epoch": 1.8411274243248141, "grad_norm": 1.1908934116363525, "learning_rate": 3.981962946157073e-05, "loss": 0.2934, "step": 20315 }, { "epoch": 1.8415805691499003, "grad_norm": 0.751724362373352, "learning_rate": 3.9813259228289584e-05, "loss": 0.241, "step": 20320 }, { "epoch": 1.8420337139749865, "grad_norm": 0.8465730547904968, "learning_rate": 3.980688751249245e-05, "loss": 0.248, "step": 20325 }, { "epoch": 1.8424868588000725, "grad_norm": 0.8862453103065491, "learning_rate": 3.9800514314817004e-05, "loss": 0.2429, "step": 20330 }, { "epoch": 1.8429400036251586, "grad_norm": 0.8160443902015686, "learning_rate": 3.9794139635901094e-05, "loss": 0.3214, "step": 20335 }, { "epoch": 1.8433931484502448, "grad_norm": 1.0881675481796265, "learning_rate": 3.978776347638268e-05, "loss": 0.3007, "step": 20340 }, { "epoch": 1.8438462932753308, "grad_norm": 0.8682482242584229, "learning_rate": 3.9781385836899896e-05, "loss": 0.3113, "step": 20345 }, { "epoch": 1.8442994381004167, "grad_norm": 0.9329793453216553, "learning_rate": 3.977500671809103e-05, "loss": 0.2872, "step": 20350 }, { "epoch": 1.8447525829255031, "grad_norm": 0.8478608727455139, "learning_rate": 3.976862612059449e-05, "loss": 0.2525, "step": 20355 }, { "epoch": 1.845205727750589, "grad_norm": 0.8346651196479797, "learning_rate": 3.9762244045048855e-05, "loss": 0.2919, "step": 20360 }, { "epoch": 1.845658872575675, "grad_norm": 1.7158546447753906, "learning_rate": 3.9755860492092844e-05, "loss": 0.2702, "step": 20365 }, { "epoch": 1.8461120174007613, "grad_norm": 0.9023807048797607, "learning_rate": 3.974947546236533e-05, "loss": 0.3228, "step": 20370 }, { "epoch": 1.8465651622258474, "grad_norm": 0.9114800691604614, "learning_rate": 3.9743088956505325e-05, "loss": 0.3127, "step": 20375 }, { "epoch": 1.8470183070509334, "grad_norm": 0.8807557225227356, "learning_rate": 3.973670097515199e-05, "loss": 0.2859, "step": 20380 }, { "epoch": 1.8474714518760196, "grad_norm": 0.8904132843017578, "learning_rate": 3.973031151894464e-05, "loss": 0.2843, "step": 20385 }, { "epoch": 1.8479245967011058, "grad_norm": 1.0146998167037964, "learning_rate": 3.972392058852274e-05, "loss": 0.3488, "step": 20390 }, { "epoch": 1.8483777415261917, "grad_norm": 0.9070367217063904, "learning_rate": 3.971752818452589e-05, "loss": 0.288, "step": 20395 }, { "epoch": 1.848830886351278, "grad_norm": 1.0332039594650269, "learning_rate": 3.9711134307593835e-05, "loss": 0.3054, "step": 20400 }, { "epoch": 1.849284031176364, "grad_norm": 0.9376007914543152, "learning_rate": 3.9704738958366494e-05, "loss": 0.3096, "step": 20405 }, { "epoch": 1.84973717600145, "grad_norm": 0.8421182036399841, "learning_rate": 3.9698342137483904e-05, "loss": 0.2824, "step": 20410 }, { "epoch": 1.850190320826536, "grad_norm": 0.8601294159889221, "learning_rate": 3.969194384558626e-05, "loss": 0.2708, "step": 20415 }, { "epoch": 1.8506434656516224, "grad_norm": 0.8094217777252197, "learning_rate": 3.968554408331392e-05, "loss": 0.2374, "step": 20420 }, { "epoch": 1.8510966104767084, "grad_norm": 0.8615491390228271, "learning_rate": 3.967914285130735e-05, "loss": 0.251, "step": 20425 }, { "epoch": 1.8515497553017943, "grad_norm": 0.8358895182609558, "learning_rate": 3.967274015020721e-05, "loss": 0.2516, "step": 20430 }, { "epoch": 1.8520029001268805, "grad_norm": 0.8720216155052185, "learning_rate": 3.966633598065428e-05, "loss": 0.2802, "step": 20435 }, { "epoch": 1.8524560449519667, "grad_norm": 0.8846692442893982, "learning_rate": 3.9659930343289475e-05, "loss": 0.2828, "step": 20440 }, { "epoch": 1.8529091897770527, "grad_norm": 0.8846976161003113, "learning_rate": 3.9653523238753894e-05, "loss": 0.284, "step": 20445 }, { "epoch": 1.8533623346021388, "grad_norm": 0.8691480159759521, "learning_rate": 3.9647114667688744e-05, "loss": 0.2476, "step": 20450 }, { "epoch": 1.853815479427225, "grad_norm": 0.8949333429336548, "learning_rate": 3.964070463073542e-05, "loss": 0.3244, "step": 20455 }, { "epoch": 1.854268624252311, "grad_norm": 0.8440203666687012, "learning_rate": 3.963429312853542e-05, "loss": 0.2451, "step": 20460 }, { "epoch": 1.8547217690773972, "grad_norm": 0.8547397255897522, "learning_rate": 3.962788016173041e-05, "loss": 0.2598, "step": 20465 }, { "epoch": 1.8551749139024833, "grad_norm": 0.7755733132362366, "learning_rate": 3.9621465730962224e-05, "loss": 0.2563, "step": 20470 }, { "epoch": 1.8556280587275693, "grad_norm": 0.8429813981056213, "learning_rate": 3.96150498368728e-05, "loss": 0.2513, "step": 20475 }, { "epoch": 1.8560812035526553, "grad_norm": 0.868789792060852, "learning_rate": 3.9608632480104244e-05, "loss": 0.2828, "step": 20480 }, { "epoch": 1.8565343483777417, "grad_norm": 0.8351235389709473, "learning_rate": 3.96022136612988e-05, "loss": 0.2679, "step": 20485 }, { "epoch": 1.8569874932028276, "grad_norm": 0.9149982333183289, "learning_rate": 3.959579338109889e-05, "loss": 0.2664, "step": 20490 }, { "epoch": 1.8574406380279136, "grad_norm": 0.9217008948326111, "learning_rate": 3.958937164014705e-05, "loss": 0.2596, "step": 20495 }, { "epoch": 1.8578937828529998, "grad_norm": 0.8014796376228333, "learning_rate": 3.958294843908596e-05, "loss": 0.2294, "step": 20500 }, { "epoch": 1.858346927678086, "grad_norm": 0.8753756284713745, "learning_rate": 3.957652377855845e-05, "loss": 0.2682, "step": 20505 }, { "epoch": 1.858800072503172, "grad_norm": 0.7866687178611755, "learning_rate": 3.957009765920752e-05, "loss": 0.2554, "step": 20510 }, { "epoch": 1.859253217328258, "grad_norm": 0.8576914072036743, "learning_rate": 3.95636700816763e-05, "loss": 0.263, "step": 20515 }, { "epoch": 1.8597063621533443, "grad_norm": 0.9665506482124329, "learning_rate": 3.955724104660804e-05, "loss": 0.3174, "step": 20520 }, { "epoch": 1.8601595069784302, "grad_norm": 0.8578290939331055, "learning_rate": 3.955081055464618e-05, "loss": 0.315, "step": 20525 }, { "epoch": 1.8606126518035164, "grad_norm": 0.925279974937439, "learning_rate": 3.954437860643428e-05, "loss": 0.3007, "step": 20530 }, { "epoch": 1.8610657966286026, "grad_norm": 0.7922590374946594, "learning_rate": 3.953794520261605e-05, "loss": 0.2465, "step": 20535 }, { "epoch": 1.8615189414536886, "grad_norm": 0.9376883506774902, "learning_rate": 3.953151034383535e-05, "loss": 0.2997, "step": 20540 }, { "epoch": 1.8619720862787748, "grad_norm": 0.8954723477363586, "learning_rate": 3.952507403073619e-05, "loss": 0.2508, "step": 20545 }, { "epoch": 1.862425231103861, "grad_norm": 0.9007991552352905, "learning_rate": 3.951863626396271e-05, "loss": 0.2737, "step": 20550 }, { "epoch": 1.862878375928947, "grad_norm": 0.9154739379882812, "learning_rate": 3.951219704415919e-05, "loss": 0.2713, "step": 20555 }, { "epoch": 1.8633315207540329, "grad_norm": 0.798803985118866, "learning_rate": 3.9505756371970104e-05, "loss": 0.2671, "step": 20560 }, { "epoch": 1.863784665579119, "grad_norm": 0.877993106842041, "learning_rate": 3.949931424804001e-05, "loss": 0.3831, "step": 20565 }, { "epoch": 1.8642378104042052, "grad_norm": 0.9358622431755066, "learning_rate": 3.949287067301364e-05, "loss": 0.2678, "step": 20570 }, { "epoch": 1.8646909552292912, "grad_norm": 0.8859714269638062, "learning_rate": 3.9486425647535875e-05, "loss": 0.2817, "step": 20575 }, { "epoch": 1.8651441000543774, "grad_norm": 0.9405844807624817, "learning_rate": 3.9479979172251737e-05, "loss": 0.3113, "step": 20580 }, { "epoch": 1.8655972448794635, "grad_norm": 0.7971060276031494, "learning_rate": 3.9473531247806386e-05, "loss": 0.265, "step": 20585 }, { "epoch": 1.8660503897045495, "grad_norm": 1.0778932571411133, "learning_rate": 3.9467081874845133e-05, "loss": 0.2833, "step": 20590 }, { "epoch": 1.8665035345296357, "grad_norm": 0.9588818550109863, "learning_rate": 3.946063105401344e-05, "loss": 0.2894, "step": 20595 }, { "epoch": 1.8669566793547219, "grad_norm": 0.7621418237686157, "learning_rate": 3.94541787859569e-05, "loss": 0.2461, "step": 20600 }, { "epoch": 1.8674098241798078, "grad_norm": 1.0622766017913818, "learning_rate": 3.9447725071321265e-05, "loss": 0.3401, "step": 20605 }, { "epoch": 1.867862969004894, "grad_norm": 0.8737813830375671, "learning_rate": 3.944126991075241e-05, "loss": 0.2638, "step": 20610 }, { "epoch": 1.8683161138299802, "grad_norm": 0.7578558921813965, "learning_rate": 3.943481330489639e-05, "loss": 0.2406, "step": 20615 }, { "epoch": 1.8687692586550662, "grad_norm": 0.846162736415863, "learning_rate": 3.942835525439937e-05, "loss": 0.2463, "step": 20620 }, { "epoch": 1.8692224034801521, "grad_norm": 0.8193262815475464, "learning_rate": 3.9421895759907676e-05, "loss": 0.2492, "step": 20625 }, { "epoch": 1.8696755483052383, "grad_norm": 0.8201340436935425, "learning_rate": 3.9415434822067785e-05, "loss": 0.2661, "step": 20630 }, { "epoch": 1.8701286931303245, "grad_norm": 0.9027456045150757, "learning_rate": 3.94089724415263e-05, "loss": 0.3316, "step": 20635 }, { "epoch": 1.8705818379554104, "grad_norm": 0.946780264377594, "learning_rate": 3.940250861892999e-05, "loss": 0.2742, "step": 20640 }, { "epoch": 1.8710349827804966, "grad_norm": 0.975418746471405, "learning_rate": 3.939604335492575e-05, "loss": 0.2783, "step": 20645 }, { "epoch": 1.8714881276055828, "grad_norm": 0.8164170384407043, "learning_rate": 3.938957665016061e-05, "loss": 0.2911, "step": 20650 }, { "epoch": 1.8719412724306688, "grad_norm": 0.8287278413772583, "learning_rate": 3.9383108505281785e-05, "loss": 0.2949, "step": 20655 }, { "epoch": 1.872394417255755, "grad_norm": 0.8698907494544983, "learning_rate": 3.93766389209366e-05, "loss": 0.2945, "step": 20660 }, { "epoch": 1.8728475620808411, "grad_norm": 0.7188710570335388, "learning_rate": 3.9370167897772526e-05, "loss": 0.2538, "step": 20665 }, { "epoch": 1.873300706905927, "grad_norm": 0.9105436205863953, "learning_rate": 3.936369543643719e-05, "loss": 0.3055, "step": 20670 }, { "epoch": 1.8737538517310133, "grad_norm": 0.78200763463974, "learning_rate": 3.935722153757836e-05, "loss": 0.2882, "step": 20675 }, { "epoch": 1.8742069965560995, "grad_norm": 0.8860959410667419, "learning_rate": 3.935074620184395e-05, "loss": 0.3175, "step": 20680 }, { "epoch": 1.8746601413811854, "grad_norm": 0.9462512731552124, "learning_rate": 3.9344269429882005e-05, "loss": 0.2854, "step": 20685 }, { "epoch": 1.8751132862062714, "grad_norm": 0.8516097068786621, "learning_rate": 3.9337791222340724e-05, "loss": 0.2383, "step": 20690 }, { "epoch": 1.8755664310313578, "grad_norm": 0.8335115313529968, "learning_rate": 3.933131157986846e-05, "loss": 0.2818, "step": 20695 }, { "epoch": 1.8760195758564437, "grad_norm": 0.9449013471603394, "learning_rate": 3.932483050311367e-05, "loss": 0.267, "step": 20700 }, { "epoch": 1.8764727206815297, "grad_norm": 0.863536536693573, "learning_rate": 3.931834799272502e-05, "loss": 0.2443, "step": 20705 }, { "epoch": 1.8769258655066159, "grad_norm": 0.797860324382782, "learning_rate": 3.931186404935126e-05, "loss": 0.2868, "step": 20710 }, { "epoch": 1.877379010331702, "grad_norm": 0.8666322231292725, "learning_rate": 3.93053786736413e-05, "loss": 0.3208, "step": 20715 }, { "epoch": 1.877832155156788, "grad_norm": 0.9123872518539429, "learning_rate": 3.929889186624421e-05, "loss": 0.2841, "step": 20720 }, { "epoch": 1.8782852999818742, "grad_norm": 0.8751884698867798, "learning_rate": 3.9292403627809184e-05, "loss": 0.2925, "step": 20725 }, { "epoch": 1.8787384448069604, "grad_norm": 0.8152556419372559, "learning_rate": 3.9285913958985575e-05, "loss": 0.2718, "step": 20730 }, { "epoch": 1.8791915896320464, "grad_norm": 0.8924704194068909, "learning_rate": 3.9279422860422873e-05, "loss": 0.3035, "step": 20735 }, { "epoch": 1.8796447344571325, "grad_norm": 0.9323400855064392, "learning_rate": 3.927293033277069e-05, "loss": 0.2853, "step": 20740 }, { "epoch": 1.8800978792822187, "grad_norm": 1.1076909303665161, "learning_rate": 3.926643637667883e-05, "loss": 0.3174, "step": 20745 }, { "epoch": 1.8805510241073047, "grad_norm": 0.95304274559021, "learning_rate": 3.925994099279718e-05, "loss": 0.2769, "step": 20750 }, { "epoch": 1.8810041689323906, "grad_norm": 0.9414074420928955, "learning_rate": 3.9253444181775825e-05, "loss": 0.2484, "step": 20755 }, { "epoch": 1.881457313757477, "grad_norm": 0.8457611203193665, "learning_rate": 3.924694594426496e-05, "loss": 0.2753, "step": 20760 }, { "epoch": 1.881910458582563, "grad_norm": 0.8074051737785339, "learning_rate": 3.924044628091492e-05, "loss": 0.2628, "step": 20765 }, { "epoch": 1.882363603407649, "grad_norm": 0.8996312618255615, "learning_rate": 3.923394519237621e-05, "loss": 0.2348, "step": 20770 }, { "epoch": 1.8828167482327351, "grad_norm": 0.9394465088844299, "learning_rate": 3.9227442679299456e-05, "loss": 0.3025, "step": 20775 }, { "epoch": 1.8832698930578213, "grad_norm": 0.9290368556976318, "learning_rate": 3.922093874233542e-05, "loss": 0.2793, "step": 20780 }, { "epoch": 1.8837230378829073, "grad_norm": 0.91438227891922, "learning_rate": 3.921443338213503e-05, "loss": 0.2445, "step": 20785 }, { "epoch": 1.8841761827079935, "grad_norm": 0.9106050133705139, "learning_rate": 3.920792659934934e-05, "loss": 0.2794, "step": 20790 }, { "epoch": 1.8846293275330797, "grad_norm": 0.8292180895805359, "learning_rate": 3.920141839462955e-05, "loss": 0.2919, "step": 20795 }, { "epoch": 1.8850824723581656, "grad_norm": 0.8484019637107849, "learning_rate": 3.919490876862701e-05, "loss": 0.2463, "step": 20800 }, { "epoch": 1.8855356171832518, "grad_norm": 0.9194732904434204, "learning_rate": 3.918839772199321e-05, "loss": 0.2942, "step": 20805 }, { "epoch": 1.885988762008338, "grad_norm": 0.9573848843574524, "learning_rate": 3.918188525537976e-05, "loss": 0.321, "step": 20810 }, { "epoch": 1.886441906833424, "grad_norm": 0.9217650294303894, "learning_rate": 3.917537136943844e-05, "loss": 0.3038, "step": 20815 }, { "epoch": 1.88689505165851, "grad_norm": 0.9007180333137512, "learning_rate": 3.9168856064821165e-05, "loss": 0.2806, "step": 20820 }, { "epoch": 1.8873481964835963, "grad_norm": 0.8060653805732727, "learning_rate": 3.916233934217998e-05, "loss": 0.2995, "step": 20825 }, { "epoch": 1.8878013413086823, "grad_norm": 1.0170707702636719, "learning_rate": 3.915582120216709e-05, "loss": 0.2529, "step": 20830 }, { "epoch": 1.8882544861337682, "grad_norm": 0.9275903701782227, "learning_rate": 3.914930164543483e-05, "loss": 0.2714, "step": 20835 }, { "epoch": 1.8887076309588544, "grad_norm": 0.9100077748298645, "learning_rate": 3.914278067263567e-05, "loss": 0.3166, "step": 20840 }, { "epoch": 1.8891607757839406, "grad_norm": 0.8966882228851318, "learning_rate": 3.913625828442225e-05, "loss": 0.265, "step": 20845 }, { "epoch": 1.8896139206090266, "grad_norm": 0.8255658745765686, "learning_rate": 3.912973448144732e-05, "loss": 0.2508, "step": 20850 }, { "epoch": 1.8900670654341127, "grad_norm": 0.9210665822029114, "learning_rate": 3.912320926436379e-05, "loss": 0.2615, "step": 20855 }, { "epoch": 1.890520210259199, "grad_norm": 0.8272840976715088, "learning_rate": 3.911668263382469e-05, "loss": 0.2982, "step": 20860 }, { "epoch": 1.8909733550842849, "grad_norm": 0.9117411971092224, "learning_rate": 3.9110154590483227e-05, "loss": 0.3026, "step": 20865 }, { "epoch": 1.891426499909371, "grad_norm": 0.8693245053291321, "learning_rate": 3.910362513499273e-05, "loss": 0.2749, "step": 20870 }, { "epoch": 1.8918796447344572, "grad_norm": 0.8996984362602234, "learning_rate": 3.909709426800665e-05, "loss": 0.2602, "step": 20875 }, { "epoch": 1.8923327895595432, "grad_norm": 0.7953909635543823, "learning_rate": 3.909056199017861e-05, "loss": 0.2734, "step": 20880 }, { "epoch": 1.8927859343846292, "grad_norm": 0.8729183077812195, "learning_rate": 3.9084028302162365e-05, "loss": 0.3063, "step": 20885 }, { "epoch": 1.8932390792097156, "grad_norm": 0.9081895351409912, "learning_rate": 3.9077493204611814e-05, "loss": 0.2796, "step": 20890 }, { "epoch": 1.8936922240348015, "grad_norm": 0.8315554261207581, "learning_rate": 3.907095669818098e-05, "loss": 0.2632, "step": 20895 }, { "epoch": 1.8941453688598875, "grad_norm": 0.8203907608985901, "learning_rate": 3.906441878352404e-05, "loss": 0.2585, "step": 20900 }, { "epoch": 1.8945985136849737, "grad_norm": 0.8424674272537231, "learning_rate": 3.905787946129531e-05, "loss": 0.2998, "step": 20905 }, { "epoch": 1.8950516585100599, "grad_norm": 0.8731051087379456, "learning_rate": 3.905133873214926e-05, "loss": 0.2631, "step": 20910 }, { "epoch": 1.8955048033351458, "grad_norm": 0.9519386887550354, "learning_rate": 3.9044796596740476e-05, "loss": 0.2646, "step": 20915 }, { "epoch": 1.895957948160232, "grad_norm": 0.8071016669273376, "learning_rate": 3.903825305572371e-05, "loss": 0.2444, "step": 20920 }, { "epoch": 1.8964110929853182, "grad_norm": 0.7914605140686035, "learning_rate": 3.9031708109753816e-05, "loss": 0.3175, "step": 20925 }, { "epoch": 1.8968642378104041, "grad_norm": 0.9856507182121277, "learning_rate": 3.902516175948583e-05, "loss": 0.3102, "step": 20930 }, { "epoch": 1.8973173826354903, "grad_norm": 0.8986512422561646, "learning_rate": 3.9018614005574925e-05, "loss": 0.238, "step": 20935 }, { "epoch": 1.8977705274605765, "grad_norm": 0.8883002400398254, "learning_rate": 3.901206484867639e-05, "loss": 0.2801, "step": 20940 }, { "epoch": 1.8982236722856625, "grad_norm": 0.7771424055099487, "learning_rate": 3.900551428944567e-05, "loss": 0.2361, "step": 20945 }, { "epoch": 1.8986768171107486, "grad_norm": 0.938355028629303, "learning_rate": 3.899896232853834e-05, "loss": 0.3066, "step": 20950 }, { "epoch": 1.8991299619358348, "grad_norm": 0.9057039618492126, "learning_rate": 3.8992408966610125e-05, "loss": 0.3001, "step": 20955 }, { "epoch": 1.8995831067609208, "grad_norm": 1.0237829685211182, "learning_rate": 3.89858542043169e-05, "loss": 0.3321, "step": 20960 }, { "epoch": 1.9000362515860068, "grad_norm": 0.8941649198532104, "learning_rate": 3.897929804231466e-05, "loss": 0.2574, "step": 20965 }, { "epoch": 1.900489396411093, "grad_norm": 0.9414231181144714, "learning_rate": 3.897274048125953e-05, "loss": 0.2595, "step": 20970 }, { "epoch": 1.9009425412361791, "grad_norm": 0.9318041205406189, "learning_rate": 3.896618152180782e-05, "loss": 0.2461, "step": 20975 }, { "epoch": 1.901395686061265, "grad_norm": 0.8759263157844543, "learning_rate": 3.8959621164615943e-05, "loss": 0.2936, "step": 20980 }, { "epoch": 1.9018488308863513, "grad_norm": 0.7867467999458313, "learning_rate": 3.895305941034046e-05, "loss": 0.2596, "step": 20985 }, { "epoch": 1.9023019757114374, "grad_norm": 0.8195303082466125, "learning_rate": 3.8946496259638075e-05, "loss": 0.2636, "step": 20990 }, { "epoch": 1.9027551205365234, "grad_norm": 0.9366142749786377, "learning_rate": 3.8939931713165625e-05, "loss": 0.2486, "step": 20995 }, { "epoch": 1.9032082653616096, "grad_norm": 0.8392592072486877, "learning_rate": 3.89333657715801e-05, "loss": 0.273, "step": 21000 }, { "epoch": 1.9036614101866958, "grad_norm": 1.0151642560958862, "learning_rate": 3.892679843553862e-05, "loss": 0.2535, "step": 21005 }, { "epoch": 1.9041145550117817, "grad_norm": 0.9476789236068726, "learning_rate": 3.892022970569844e-05, "loss": 0.3051, "step": 21010 }, { "epoch": 1.904567699836868, "grad_norm": 0.8374054431915283, "learning_rate": 3.8913659582716974e-05, "loss": 0.2454, "step": 21015 }, { "epoch": 1.905020844661954, "grad_norm": 0.9183117747306824, "learning_rate": 3.890708806725174e-05, "loss": 0.2449, "step": 21020 }, { "epoch": 1.90547398948704, "grad_norm": 1.0080939531326294, "learning_rate": 3.8900515159960436e-05, "loss": 0.2879, "step": 21025 }, { "epoch": 1.905927134312126, "grad_norm": 0.8043929934501648, "learning_rate": 3.889394086150088e-05, "loss": 0.2616, "step": 21030 }, { "epoch": 1.9063802791372124, "grad_norm": 0.7798545360565186, "learning_rate": 3.8887365172531024e-05, "loss": 0.2811, "step": 21035 }, { "epoch": 1.9068334239622984, "grad_norm": 0.9028066396713257, "learning_rate": 3.888078809370897e-05, "loss": 0.32, "step": 21040 }, { "epoch": 1.9072865687873843, "grad_norm": 0.9516180157661438, "learning_rate": 3.887420962569295e-05, "loss": 0.3064, "step": 21045 }, { "epoch": 1.9077397136124705, "grad_norm": 0.89799565076828, "learning_rate": 3.8867629769141335e-05, "loss": 0.2813, "step": 21050 }, { "epoch": 1.9081928584375567, "grad_norm": 0.9164626598358154, "learning_rate": 3.886104852471265e-05, "loss": 0.2781, "step": 21055 }, { "epoch": 1.9086460032626427, "grad_norm": 0.8080704808235168, "learning_rate": 3.885446589306554e-05, "loss": 0.2589, "step": 21060 }, { "epoch": 1.9090991480877288, "grad_norm": 0.9191581606864929, "learning_rate": 3.88478818748588e-05, "loss": 0.2656, "step": 21065 }, { "epoch": 1.909552292912815, "grad_norm": 0.8886386156082153, "learning_rate": 3.884129647075136e-05, "loss": 0.2228, "step": 21070 }, { "epoch": 1.910005437737901, "grad_norm": 0.900184690952301, "learning_rate": 3.8834709681402296e-05, "loss": 0.2672, "step": 21075 }, { "epoch": 1.9104585825629872, "grad_norm": 0.8703711032867432, "learning_rate": 3.882812150747081e-05, "loss": 0.3137, "step": 21080 }, { "epoch": 1.9109117273880734, "grad_norm": 1.0084333419799805, "learning_rate": 3.882153194961624e-05, "loss": 0.2913, "step": 21085 }, { "epoch": 1.9113648722131593, "grad_norm": 0.9035248756408691, "learning_rate": 3.8814941008498084e-05, "loss": 0.3132, "step": 21090 }, { "epoch": 1.9118180170382453, "grad_norm": 0.9683552384376526, "learning_rate": 3.8808348684775963e-05, "loss": 0.3001, "step": 21095 }, { "epoch": 1.9122711618633317, "grad_norm": 0.935458779335022, "learning_rate": 3.880175497910964e-05, "loss": 0.2847, "step": 21100 }, { "epoch": 1.9127243066884176, "grad_norm": 0.957992672920227, "learning_rate": 3.8795159892159006e-05, "loss": 0.3039, "step": 21105 }, { "epoch": 1.9131774515135036, "grad_norm": 0.9725937247276306, "learning_rate": 3.878856342458411e-05, "loss": 0.3328, "step": 21110 }, { "epoch": 1.9136305963385898, "grad_norm": 0.9646616578102112, "learning_rate": 3.8781965577045125e-05, "loss": 0.3138, "step": 21115 }, { "epoch": 1.914083741163676, "grad_norm": 0.8681191205978394, "learning_rate": 3.877536635020237e-05, "loss": 0.281, "step": 21120 }, { "epoch": 1.914536885988762, "grad_norm": 0.9655329585075378, "learning_rate": 3.876876574471629e-05, "loss": 0.256, "step": 21125 }, { "epoch": 1.914990030813848, "grad_norm": 0.9164480566978455, "learning_rate": 3.876216376124748e-05, "loss": 0.3013, "step": 21130 }, { "epoch": 1.9154431756389343, "grad_norm": 0.8691954612731934, "learning_rate": 3.875556040045667e-05, "loss": 0.2534, "step": 21135 }, { "epoch": 1.9158963204640203, "grad_norm": 0.793373167514801, "learning_rate": 3.874895566300473e-05, "loss": 0.2641, "step": 21140 }, { "epoch": 1.9163494652891064, "grad_norm": 0.840736448764801, "learning_rate": 3.874234954955265e-05, "loss": 0.28, "step": 21145 }, { "epoch": 1.9168026101141926, "grad_norm": 0.9494770169258118, "learning_rate": 3.873574206076158e-05, "loss": 0.2599, "step": 21150 }, { "epoch": 1.9172557549392786, "grad_norm": 0.9168447852134705, "learning_rate": 3.872913319729281e-05, "loss": 0.2704, "step": 21155 }, { "epoch": 1.9177088997643645, "grad_norm": 0.8198391199111938, "learning_rate": 3.872252295980774e-05, "loss": 0.254, "step": 21160 }, { "epoch": 1.918162044589451, "grad_norm": 0.8060178160667419, "learning_rate": 3.871591134896794e-05, "loss": 0.2402, "step": 21165 }, { "epoch": 1.918615189414537, "grad_norm": 0.8606326580047607, "learning_rate": 3.87092983654351e-05, "loss": 0.2505, "step": 21170 }, { "epoch": 1.9190683342396229, "grad_norm": 1.0147732496261597, "learning_rate": 3.870268400987104e-05, "loss": 0.2987, "step": 21175 }, { "epoch": 1.919521479064709, "grad_norm": 0.9552265405654907, "learning_rate": 3.8696068282937725e-05, "loss": 0.3676, "step": 21180 }, { "epoch": 1.9199746238897952, "grad_norm": 0.9221466183662415, "learning_rate": 3.8689451185297275e-05, "loss": 0.272, "step": 21185 }, { "epoch": 1.9204277687148812, "grad_norm": 0.7865853309631348, "learning_rate": 3.868283271761192e-05, "loss": 0.2594, "step": 21190 }, { "epoch": 1.9208809135399674, "grad_norm": 0.9014759063720703, "learning_rate": 3.867621288054404e-05, "loss": 0.2565, "step": 21195 }, { "epoch": 1.9213340583650536, "grad_norm": 0.8447257876396179, "learning_rate": 3.866959167475616e-05, "loss": 0.2618, "step": 21200 }, { "epoch": 1.9217872031901395, "grad_norm": 0.9326667785644531, "learning_rate": 3.8662969100910905e-05, "loss": 0.3082, "step": 21205 }, { "epoch": 1.9222403480152257, "grad_norm": 0.8898334503173828, "learning_rate": 3.86563451596711e-05, "loss": 0.3032, "step": 21210 }, { "epoch": 1.9226934928403119, "grad_norm": 0.7552017569541931, "learning_rate": 3.8649719851699654e-05, "loss": 0.2501, "step": 21215 }, { "epoch": 1.9231466376653978, "grad_norm": 0.9330799579620361, "learning_rate": 3.864309317765962e-05, "loss": 0.2536, "step": 21220 }, { "epoch": 1.9235997824904838, "grad_norm": 0.9023211598396301, "learning_rate": 3.863646513821422e-05, "loss": 0.2626, "step": 21225 }, { "epoch": 1.9240529273155702, "grad_norm": 0.8993731737136841, "learning_rate": 3.8629835734026775e-05, "loss": 0.2754, "step": 21230 }, { "epoch": 1.9245060721406562, "grad_norm": 0.8544058203697205, "learning_rate": 3.862320496576076e-05, "loss": 0.2837, "step": 21235 }, { "epoch": 1.9249592169657421, "grad_norm": 0.834213137626648, "learning_rate": 3.861657283407979e-05, "loss": 0.2843, "step": 21240 }, { "epoch": 1.9254123617908283, "grad_norm": 0.9211512804031372, "learning_rate": 3.8609939339647606e-05, "loss": 0.2393, "step": 21245 }, { "epoch": 1.9258655066159145, "grad_norm": 0.9167578816413879, "learning_rate": 3.86033044831281e-05, "loss": 0.2551, "step": 21250 }, { "epoch": 1.9263186514410005, "grad_norm": 0.7963224649429321, "learning_rate": 3.859666826518526e-05, "loss": 0.2244, "step": 21255 }, { "epoch": 1.9267717962660866, "grad_norm": 1.0784181356430054, "learning_rate": 3.859003068648329e-05, "loss": 0.2495, "step": 21260 }, { "epoch": 1.9272249410911728, "grad_norm": 0.9211447238922119, "learning_rate": 3.858339174768644e-05, "loss": 0.2695, "step": 21265 }, { "epoch": 1.9276780859162588, "grad_norm": 0.9020233154296875, "learning_rate": 3.857675144945916e-05, "loss": 0.2296, "step": 21270 }, { "epoch": 1.928131230741345, "grad_norm": 0.9752929210662842, "learning_rate": 3.8570109792466e-05, "loss": 0.2961, "step": 21275 }, { "epoch": 1.9285843755664311, "grad_norm": 0.8476923704147339, "learning_rate": 3.856346677737167e-05, "loss": 0.2521, "step": 21280 }, { "epoch": 1.929037520391517, "grad_norm": 0.9896304607391357, "learning_rate": 3.8556822404840986e-05, "loss": 0.3067, "step": 21285 }, { "epoch": 1.929490665216603, "grad_norm": 0.8633641600608826, "learning_rate": 3.8550176675538954e-05, "loss": 0.2843, "step": 21290 }, { "epoch": 1.9299438100416895, "grad_norm": 0.8576300740242004, "learning_rate": 3.854352959013065e-05, "loss": 0.2549, "step": 21295 }, { "epoch": 1.9303969548667754, "grad_norm": 0.9159168601036072, "learning_rate": 3.853688114928131e-05, "loss": 0.228, "step": 21300 }, { "epoch": 1.9308500996918614, "grad_norm": 0.9765849709510803, "learning_rate": 3.853023135365635e-05, "loss": 0.2326, "step": 21305 }, { "epoch": 1.9313032445169476, "grad_norm": 0.9774726629257202, "learning_rate": 3.852358020392125e-05, "loss": 0.2633, "step": 21310 }, { "epoch": 1.9317563893420338, "grad_norm": 0.7491434812545776, "learning_rate": 3.851692770074168e-05, "loss": 0.2345, "step": 21315 }, { "epoch": 1.9322095341671197, "grad_norm": 0.9328328371047974, "learning_rate": 3.851027384478341e-05, "loss": 0.2938, "step": 21320 }, { "epoch": 1.932662678992206, "grad_norm": 0.9466821551322937, "learning_rate": 3.850361863671237e-05, "loss": 0.2928, "step": 21325 }, { "epoch": 1.933115823817292, "grad_norm": 0.8598039150238037, "learning_rate": 3.8496962077194616e-05, "loss": 0.2741, "step": 21330 }, { "epoch": 1.933568968642378, "grad_norm": 0.803435206413269, "learning_rate": 3.849030416689633e-05, "loss": 0.2569, "step": 21335 }, { "epoch": 1.9340221134674642, "grad_norm": 0.9513557553291321, "learning_rate": 3.848364490648384e-05, "loss": 0.2936, "step": 21340 }, { "epoch": 1.9344752582925504, "grad_norm": 0.8765847682952881, "learning_rate": 3.847698429662361e-05, "loss": 0.3073, "step": 21345 }, { "epoch": 1.9349284031176364, "grad_norm": 0.8391371369361877, "learning_rate": 3.847032233798224e-05, "loss": 0.244, "step": 21350 }, { "epoch": 1.9353815479427225, "grad_norm": 0.8734466433525085, "learning_rate": 3.8463659031226456e-05, "loss": 0.2755, "step": 21355 }, { "epoch": 1.9358346927678087, "grad_norm": 0.9092640280723572, "learning_rate": 3.845699437702312e-05, "loss": 0.2743, "step": 21360 }, { "epoch": 1.9362878375928947, "grad_norm": 0.8083379864692688, "learning_rate": 3.8450328376039244e-05, "loss": 0.2494, "step": 21365 }, { "epoch": 1.9367409824179806, "grad_norm": 0.7312109470367432, "learning_rate": 3.844366102894195e-05, "loss": 0.2539, "step": 21370 }, { "epoch": 1.9371941272430668, "grad_norm": 0.8622061610221863, "learning_rate": 3.843699233639853e-05, "loss": 0.2379, "step": 21375 }, { "epoch": 1.937647272068153, "grad_norm": 0.8152210712432861, "learning_rate": 3.843032229907636e-05, "loss": 0.2915, "step": 21380 }, { "epoch": 1.938100416893239, "grad_norm": 0.9268614053726196, "learning_rate": 3.8423650917642995e-05, "loss": 0.2761, "step": 21385 }, { "epoch": 1.9385535617183252, "grad_norm": 0.8639741539955139, "learning_rate": 3.8416978192766106e-05, "loss": 0.2954, "step": 21390 }, { "epoch": 1.9390067065434113, "grad_norm": 0.9332370758056641, "learning_rate": 3.8410304125113514e-05, "loss": 0.303, "step": 21395 }, { "epoch": 1.9394598513684973, "grad_norm": 0.847697377204895, "learning_rate": 3.840362871535315e-05, "loss": 0.3082, "step": 21400 }, { "epoch": 1.9399129961935835, "grad_norm": 0.9309500455856323, "learning_rate": 3.839695196415309e-05, "loss": 0.2619, "step": 21405 }, { "epoch": 1.9403661410186697, "grad_norm": 0.8357970118522644, "learning_rate": 3.839027387218155e-05, "loss": 0.2298, "step": 21410 }, { "epoch": 1.9408192858437556, "grad_norm": 0.8847268223762512, "learning_rate": 3.8383594440106874e-05, "loss": 0.2692, "step": 21415 }, { "epoch": 1.9412724306688418, "grad_norm": 0.8363723754882812, "learning_rate": 3.837691366859753e-05, "loss": 0.2439, "step": 21420 }, { "epoch": 1.941725575493928, "grad_norm": 0.8409842848777771, "learning_rate": 3.8370231558322166e-05, "loss": 0.2542, "step": 21425 }, { "epoch": 1.942178720319014, "grad_norm": 1.0650726556777954, "learning_rate": 3.8363548109949495e-05, "loss": 0.2782, "step": 21430 }, { "epoch": 1.9426318651441, "grad_norm": 0.8644388318061829, "learning_rate": 3.835686332414842e-05, "loss": 0.2751, "step": 21435 }, { "epoch": 1.9430850099691863, "grad_norm": 0.8652386665344238, "learning_rate": 3.8350177201587934e-05, "loss": 0.2629, "step": 21440 }, { "epoch": 1.9435381547942723, "grad_norm": 0.7898512482643127, "learning_rate": 3.834348974293721e-05, "loss": 0.3009, "step": 21445 }, { "epoch": 1.9439912996193582, "grad_norm": 0.8267961144447327, "learning_rate": 3.833680094886552e-05, "loss": 0.2804, "step": 21450 }, { "epoch": 1.9444444444444444, "grad_norm": 0.8307373523712158, "learning_rate": 3.8330110820042285e-05, "loss": 0.2701, "step": 21455 }, { "epoch": 1.9448975892695306, "grad_norm": 0.8160120248794556, "learning_rate": 3.832341935713705e-05, "loss": 0.2733, "step": 21460 }, { "epoch": 1.9453507340946166, "grad_norm": 0.885642945766449, "learning_rate": 3.831672656081951e-05, "loss": 0.2635, "step": 21465 }, { "epoch": 1.9458038789197027, "grad_norm": 0.9383153915405273, "learning_rate": 3.831003243175946e-05, "loss": 0.2624, "step": 21470 }, { "epoch": 1.946257023744789, "grad_norm": 0.8619828224182129, "learning_rate": 3.830333697062687e-05, "loss": 0.2774, "step": 21475 }, { "epoch": 1.9467101685698749, "grad_norm": 0.949792742729187, "learning_rate": 3.8296640178091826e-05, "loss": 0.2463, "step": 21480 }, { "epoch": 1.947163313394961, "grad_norm": 0.9081515669822693, "learning_rate": 3.828994205482452e-05, "loss": 0.2382, "step": 21485 }, { "epoch": 1.9476164582200473, "grad_norm": 1.0541276931762695, "learning_rate": 3.828324260149534e-05, "loss": 0.2972, "step": 21490 }, { "epoch": 1.9480696030451332, "grad_norm": 0.7985306978225708, "learning_rate": 3.827654181877473e-05, "loss": 0.2505, "step": 21495 }, { "epoch": 1.9485227478702192, "grad_norm": 0.9561123847961426, "learning_rate": 3.826983970733335e-05, "loss": 0.2389, "step": 21500 }, { "epoch": 1.9489758926953056, "grad_norm": 0.9112632870674133, "learning_rate": 3.8263136267841916e-05, "loss": 0.2845, "step": 21505 }, { "epoch": 1.9494290375203915, "grad_norm": 0.8204839825630188, "learning_rate": 3.825643150097132e-05, "loss": 0.274, "step": 21510 }, { "epoch": 1.9498821823454775, "grad_norm": 0.8912250399589539, "learning_rate": 3.824972540739258e-05, "loss": 0.2684, "step": 21515 }, { "epoch": 1.9503353271705637, "grad_norm": 0.7268580794334412, "learning_rate": 3.824301798777684e-05, "loss": 0.2625, "step": 21520 }, { "epoch": 1.9507884719956499, "grad_norm": 0.9015383720397949, "learning_rate": 3.823630924279539e-05, "loss": 0.3223, "step": 21525 }, { "epoch": 1.9512416168207358, "grad_norm": 0.8586991429328918, "learning_rate": 3.8229599173119624e-05, "loss": 0.254, "step": 21530 }, { "epoch": 1.951694761645822, "grad_norm": 0.9032458662986755, "learning_rate": 3.8222887779421104e-05, "loss": 0.2517, "step": 21535 }, { "epoch": 1.9521479064709082, "grad_norm": 0.7987295389175415, "learning_rate": 3.821617506237151e-05, "loss": 0.2431, "step": 21540 }, { "epoch": 1.9526010512959942, "grad_norm": 0.9121063947677612, "learning_rate": 3.820946102264265e-05, "loss": 0.2778, "step": 21545 }, { "epoch": 1.9530541961210803, "grad_norm": 0.8032146096229553, "learning_rate": 3.820274566090646e-05, "loss": 0.2762, "step": 21550 }, { "epoch": 1.9535073409461665, "grad_norm": 0.892109215259552, "learning_rate": 3.8196028977835016e-05, "loss": 0.2659, "step": 21555 }, { "epoch": 1.9539604857712525, "grad_norm": 0.9306886196136475, "learning_rate": 3.818931097410053e-05, "loss": 0.2515, "step": 21560 }, { "epoch": 1.9544136305963384, "grad_norm": 0.8127644658088684, "learning_rate": 3.818259165037536e-05, "loss": 0.2661, "step": 21565 }, { "epoch": 1.9548667754214248, "grad_norm": 0.8678514361381531, "learning_rate": 3.817587100733195e-05, "loss": 0.2714, "step": 21570 }, { "epoch": 1.9553199202465108, "grad_norm": 0.8508524298667908, "learning_rate": 3.816914904564291e-05, "loss": 0.2409, "step": 21575 }, { "epoch": 1.9557730650715968, "grad_norm": 0.9105180501937866, "learning_rate": 3.816242576598099e-05, "loss": 0.2619, "step": 21580 }, { "epoch": 1.956226209896683, "grad_norm": 0.9797322750091553, "learning_rate": 3.8155701169019056e-05, "loss": 0.2787, "step": 21585 }, { "epoch": 1.9566793547217691, "grad_norm": 0.7900907397270203, "learning_rate": 3.814897525543009e-05, "loss": 0.2527, "step": 21590 }, { "epoch": 1.957132499546855, "grad_norm": 0.8821613788604736, "learning_rate": 3.814224802588724e-05, "loss": 0.2747, "step": 21595 }, { "epoch": 1.9575856443719413, "grad_norm": 0.8693777918815613, "learning_rate": 3.813551948106376e-05, "loss": 0.2691, "step": 21600 }, { "epoch": 1.9580387891970275, "grad_norm": 0.9604732394218445, "learning_rate": 3.812878962163306e-05, "loss": 0.2852, "step": 21605 }, { "epoch": 1.9584919340221134, "grad_norm": 0.9528103470802307, "learning_rate": 3.812205844826865e-05, "loss": 0.2797, "step": 21610 }, { "epoch": 1.9589450788471996, "grad_norm": 0.8633652329444885, "learning_rate": 3.8115325961644195e-05, "loss": 0.2609, "step": 21615 }, { "epoch": 1.9593982236722858, "grad_norm": 0.9486339092254639, "learning_rate": 3.810859216243349e-05, "loss": 0.2492, "step": 21620 }, { "epoch": 1.9598513684973717, "grad_norm": 0.9351524114608765, "learning_rate": 3.8101857051310435e-05, "loss": 0.2755, "step": 21625 }, { "epoch": 1.9603045133224577, "grad_norm": 0.881844699382782, "learning_rate": 3.809512062894911e-05, "loss": 0.2412, "step": 21630 }, { "epoch": 1.960757658147544, "grad_norm": 0.8704222440719604, "learning_rate": 3.8088382896023676e-05, "loss": 0.2665, "step": 21635 }, { "epoch": 1.96121080297263, "grad_norm": 0.9041954278945923, "learning_rate": 3.808164385320847e-05, "loss": 0.2621, "step": 21640 }, { "epoch": 1.961663947797716, "grad_norm": 0.9475952982902527, "learning_rate": 3.807490350117791e-05, "loss": 0.2989, "step": 21645 }, { "epoch": 1.9621170926228022, "grad_norm": 0.8598679900169373, "learning_rate": 3.80681618406066e-05, "loss": 0.2612, "step": 21650 }, { "epoch": 1.9625702374478884, "grad_norm": 0.8319771885871887, "learning_rate": 3.806141887216923e-05, "loss": 0.2414, "step": 21655 }, { "epoch": 1.9630233822729743, "grad_norm": 0.813073456287384, "learning_rate": 3.805467459654065e-05, "loss": 0.2161, "step": 21660 }, { "epoch": 1.9634765270980605, "grad_norm": 0.8330743312835693, "learning_rate": 3.804792901439581e-05, "loss": 0.2539, "step": 21665 }, { "epoch": 1.9639296719231467, "grad_norm": 0.8336201310157776, "learning_rate": 3.804118212640984e-05, "loss": 0.2802, "step": 21670 }, { "epoch": 1.9643828167482327, "grad_norm": 0.8825152516365051, "learning_rate": 3.803443393325794e-05, "loss": 0.2699, "step": 21675 }, { "epoch": 1.9648359615733189, "grad_norm": 0.7969107031822205, "learning_rate": 3.80276844356155e-05, "loss": 0.2587, "step": 21680 }, { "epoch": 1.965289106398405, "grad_norm": 0.9428780674934387, "learning_rate": 3.8020933634157986e-05, "loss": 0.258, "step": 21685 }, { "epoch": 1.965742251223491, "grad_norm": 0.9191794991493225, "learning_rate": 3.8014181529561035e-05, "loss": 0.2689, "step": 21690 }, { "epoch": 1.966195396048577, "grad_norm": 0.9669958353042603, "learning_rate": 3.8007428122500386e-05, "loss": 0.2609, "step": 21695 }, { "epoch": 1.9666485408736634, "grad_norm": 0.9076502323150635, "learning_rate": 3.800067341365196e-05, "loss": 0.2557, "step": 21700 }, { "epoch": 1.9671016856987493, "grad_norm": 0.8764363527297974, "learning_rate": 3.7993917403691724e-05, "loss": 0.252, "step": 21705 }, { "epoch": 1.9675548305238353, "grad_norm": 0.8648235201835632, "learning_rate": 3.7987160093295846e-05, "loss": 0.2311, "step": 21710 }, { "epoch": 1.9680079753489215, "grad_norm": 0.9475594758987427, "learning_rate": 3.7980401483140596e-05, "loss": 0.2613, "step": 21715 }, { "epoch": 1.9684611201740077, "grad_norm": 1.0130733251571655, "learning_rate": 3.797364157390237e-05, "loss": 0.3207, "step": 21720 }, { "epoch": 1.9689142649990936, "grad_norm": 0.9655417203903198, "learning_rate": 3.796688036625772e-05, "loss": 0.225, "step": 21725 }, { "epoch": 1.9693674098241798, "grad_norm": 0.8077081441879272, "learning_rate": 3.79601178608833e-05, "loss": 0.2635, "step": 21730 }, { "epoch": 1.969820554649266, "grad_norm": 0.8204033374786377, "learning_rate": 3.79533540584559e-05, "loss": 0.2639, "step": 21735 }, { "epoch": 1.970273699474352, "grad_norm": 0.9053328633308411, "learning_rate": 3.794658895965244e-05, "loss": 0.2569, "step": 21740 }, { "epoch": 1.9707268442994381, "grad_norm": 0.9054266214370728, "learning_rate": 3.7939822565149983e-05, "loss": 0.2558, "step": 21745 }, { "epoch": 1.9711799891245243, "grad_norm": 0.8773769736289978, "learning_rate": 3.7933054875625715e-05, "loss": 0.2372, "step": 21750 }, { "epoch": 1.9716331339496103, "grad_norm": 0.8786621689796448, "learning_rate": 3.7926285891756946e-05, "loss": 0.2463, "step": 21755 }, { "epoch": 1.9720862787746964, "grad_norm": 0.9707854986190796, "learning_rate": 3.791951561422111e-05, "loss": 0.3152, "step": 21760 }, { "epoch": 1.9725394235997826, "grad_norm": 0.8923788070678711, "learning_rate": 3.791274404369578e-05, "loss": 0.2791, "step": 21765 }, { "epoch": 1.9729925684248686, "grad_norm": 0.8367949724197388, "learning_rate": 3.790597118085867e-05, "loss": 0.2529, "step": 21770 }, { "epoch": 1.9734457132499545, "grad_norm": 0.7095955610275269, "learning_rate": 3.7899197026387593e-05, "loss": 0.2526, "step": 21775 }, { "epoch": 1.9738988580750407, "grad_norm": 0.9424694776535034, "learning_rate": 3.7892421580960526e-05, "loss": 0.2416, "step": 21780 }, { "epoch": 1.974352002900127, "grad_norm": 0.947014331817627, "learning_rate": 3.788564484525554e-05, "loss": 0.2557, "step": 21785 }, { "epoch": 1.9748051477252129, "grad_norm": 0.9425961375236511, "learning_rate": 3.7878866819950866e-05, "loss": 0.2609, "step": 21790 }, { "epoch": 1.975258292550299, "grad_norm": 0.96077561378479, "learning_rate": 3.7872087505724854e-05, "loss": 0.2588, "step": 21795 }, { "epoch": 1.9757114373753852, "grad_norm": 0.8791220188140869, "learning_rate": 3.786530690325596e-05, "loss": 0.27, "step": 21800 }, { "epoch": 1.9761645822004712, "grad_norm": 0.8262759447097778, "learning_rate": 3.7858525013222806e-05, "loss": 0.2554, "step": 21805 }, { "epoch": 1.9766177270255574, "grad_norm": 0.9823401570320129, "learning_rate": 3.7851741836304124e-05, "loss": 0.2623, "step": 21810 }, { "epoch": 1.9770708718506436, "grad_norm": 0.8591639995574951, "learning_rate": 3.7844957373178766e-05, "loss": 0.2458, "step": 21815 }, { "epoch": 1.9775240166757295, "grad_norm": 0.9346791505813599, "learning_rate": 3.783817162452575e-05, "loss": 0.3176, "step": 21820 }, { "epoch": 1.9779771615008157, "grad_norm": 0.8722875118255615, "learning_rate": 3.783138459102416e-05, "loss": 0.2515, "step": 21825 }, { "epoch": 1.9784303063259019, "grad_norm": 0.8496626019477844, "learning_rate": 3.782459627335327e-05, "loss": 0.2705, "step": 21830 }, { "epoch": 1.9788834511509878, "grad_norm": 0.8463780283927917, "learning_rate": 3.7817806672192444e-05, "loss": 0.2457, "step": 21835 }, { "epoch": 1.9793365959760738, "grad_norm": 0.7856522798538208, "learning_rate": 3.78110157882212e-05, "loss": 0.2324, "step": 21840 }, { "epoch": 1.9797897408011602, "grad_norm": 0.77533358335495, "learning_rate": 3.780422362211917e-05, "loss": 0.2882, "step": 21845 }, { "epoch": 1.9802428856262462, "grad_norm": 0.8629361391067505, "learning_rate": 3.7797430174566105e-05, "loss": 0.3115, "step": 21850 }, { "epoch": 1.9806960304513321, "grad_norm": 0.9075189232826233, "learning_rate": 3.779063544624189e-05, "loss": 0.314, "step": 21855 }, { "epoch": 1.9811491752764183, "grad_norm": 0.8439139723777771, "learning_rate": 3.7783839437826576e-05, "loss": 0.2266, "step": 21860 }, { "epoch": 1.9816023201015045, "grad_norm": 0.9844762086868286, "learning_rate": 3.777704215000029e-05, "loss": 0.3094, "step": 21865 }, { "epoch": 1.9820554649265905, "grad_norm": 0.8375369906425476, "learning_rate": 3.77702435834433e-05, "loss": 0.2879, "step": 21870 }, { "epoch": 1.9825086097516766, "grad_norm": 0.8369290828704834, "learning_rate": 3.776344373883601e-05, "loss": 0.2436, "step": 21875 }, { "epoch": 1.9829617545767628, "grad_norm": 0.882300615310669, "learning_rate": 3.775664261685896e-05, "loss": 0.2798, "step": 21880 }, { "epoch": 1.9834148994018488, "grad_norm": 0.9388135075569153, "learning_rate": 3.774984021819279e-05, "loss": 0.2515, "step": 21885 }, { "epoch": 1.983868044226935, "grad_norm": 0.693779468536377, "learning_rate": 3.774303654351832e-05, "loss": 0.207, "step": 21890 }, { "epoch": 1.9843211890520212, "grad_norm": 0.8614654541015625, "learning_rate": 3.7736231593516445e-05, "loss": 0.2505, "step": 21895 }, { "epoch": 1.984774333877107, "grad_norm": 0.7488423585891724, "learning_rate": 3.77294253688682e-05, "loss": 0.2298, "step": 21900 }, { "epoch": 1.985227478702193, "grad_norm": 0.9224866628646851, "learning_rate": 3.772261787025476e-05, "loss": 0.284, "step": 21905 }, { "epoch": 1.9856806235272795, "grad_norm": 0.7713114619255066, "learning_rate": 3.7715809098357425e-05, "loss": 0.2256, "step": 21910 }, { "epoch": 1.9861337683523654, "grad_norm": 0.8383636474609375, "learning_rate": 3.770899905385761e-05, "loss": 0.2182, "step": 21915 }, { "epoch": 1.9865869131774514, "grad_norm": 0.9543099999427795, "learning_rate": 3.770218773743688e-05, "loss": 0.2639, "step": 21920 }, { "epoch": 1.9870400580025376, "grad_norm": 0.8951392769813538, "learning_rate": 3.7695375149776916e-05, "loss": 0.2564, "step": 21925 }, { "epoch": 1.9874932028276238, "grad_norm": 0.8465229272842407, "learning_rate": 3.76885612915595e-05, "loss": 0.2704, "step": 21930 }, { "epoch": 1.9879463476527097, "grad_norm": 0.9542536735534668, "learning_rate": 3.768174616346658e-05, "loss": 0.2658, "step": 21935 }, { "epoch": 1.988399492477796, "grad_norm": 0.8920192718505859, "learning_rate": 3.767492976618023e-05, "loss": 0.2899, "step": 21940 }, { "epoch": 1.988852637302882, "grad_norm": 0.786250650882721, "learning_rate": 3.766811210038262e-05, "loss": 0.2441, "step": 21945 }, { "epoch": 1.989305782127968, "grad_norm": 0.8486946225166321, "learning_rate": 3.766129316675606e-05, "loss": 0.2324, "step": 21950 }, { "epoch": 1.9897589269530542, "grad_norm": 0.8343789577484131, "learning_rate": 3.765447296598301e-05, "loss": 0.2635, "step": 21955 }, { "epoch": 1.9902120717781404, "grad_norm": 0.9202194809913635, "learning_rate": 3.764765149874603e-05, "loss": 0.2811, "step": 21960 }, { "epoch": 1.9906652166032264, "grad_norm": 0.9191534519195557, "learning_rate": 3.76408287657278e-05, "loss": 0.2946, "step": 21965 }, { "epoch": 1.9911183614283123, "grad_norm": 0.8184916377067566, "learning_rate": 3.763400476761115e-05, "loss": 0.2427, "step": 21970 }, { "epoch": 1.9915715062533987, "grad_norm": 0.9413515329360962, "learning_rate": 3.762717950507905e-05, "loss": 0.2552, "step": 21975 }, { "epoch": 1.9920246510784847, "grad_norm": 0.8341882824897766, "learning_rate": 3.7620352978814555e-05, "loss": 0.2365, "step": 21980 }, { "epoch": 1.9924777959035707, "grad_norm": 0.8513680100440979, "learning_rate": 3.761352518950086e-05, "loss": 0.2464, "step": 21985 }, { "epoch": 1.9929309407286568, "grad_norm": 0.8981239795684814, "learning_rate": 3.7606696137821296e-05, "loss": 0.2454, "step": 21990 }, { "epoch": 1.993384085553743, "grad_norm": 0.9956494569778442, "learning_rate": 3.7599865824459326e-05, "loss": 0.2343, "step": 21995 }, { "epoch": 1.993837230378829, "grad_norm": 0.9878427386283875, "learning_rate": 3.759303425009852e-05, "loss": 0.2816, "step": 22000 }, { "epoch": 1.9942903752039152, "grad_norm": 0.9217300415039062, "learning_rate": 3.75862014154226e-05, "loss": 0.2196, "step": 22005 }, { "epoch": 1.9947435200290013, "grad_norm": 0.9011213779449463, "learning_rate": 3.7579367321115385e-05, "loss": 0.2731, "step": 22010 }, { "epoch": 1.9951966648540873, "grad_norm": 0.9163604974746704, "learning_rate": 3.757253196786083e-05, "loss": 0.2318, "step": 22015 }, { "epoch": 1.9956498096791735, "grad_norm": 0.9467377066612244, "learning_rate": 3.7565695356343025e-05, "loss": 0.2632, "step": 22020 }, { "epoch": 1.9961029545042597, "grad_norm": 0.9400582313537598, "learning_rate": 3.755885748724618e-05, "loss": 0.2619, "step": 22025 }, { "epoch": 1.9965560993293456, "grad_norm": 0.9084227681159973, "learning_rate": 3.7552018361254644e-05, "loss": 0.2418, "step": 22030 }, { "epoch": 1.9970092441544316, "grad_norm": 0.8958334922790527, "learning_rate": 3.754517797905286e-05, "loss": 0.2415, "step": 22035 }, { "epoch": 1.997462388979518, "grad_norm": 0.8943895697593689, "learning_rate": 3.753833634132542e-05, "loss": 0.352, "step": 22040 }, { "epoch": 1.997915533804604, "grad_norm": 0.8635402321815491, "learning_rate": 3.753149344875704e-05, "loss": 0.2898, "step": 22045 }, { "epoch": 1.99836867862969, "grad_norm": 0.7699769139289856, "learning_rate": 3.7524649302032566e-05, "loss": 0.2413, "step": 22050 }, { "epoch": 1.998821823454776, "grad_norm": 0.8560754656791687, "learning_rate": 3.751780390183695e-05, "loss": 0.2774, "step": 22055 }, { "epoch": 1.9992749682798623, "grad_norm": 0.8320276141166687, "learning_rate": 3.7510957248855294e-05, "loss": 0.2695, "step": 22060 }, { "epoch": 1.9997281131049482, "grad_norm": 0.9161665439605713, "learning_rate": 3.75041093437728e-05, "loss": 0.2892, "step": 22065 }, { "epoch": 2.0001812579300347, "grad_norm": 0.8348901867866516, "learning_rate": 3.749726018727482e-05, "loss": 0.2054, "step": 22070 }, { "epoch": 2.0006344027551206, "grad_norm": 0.9156001806259155, "learning_rate": 3.749040978004681e-05, "loss": 0.2111, "step": 22075 }, { "epoch": 2.0010875475802066, "grad_norm": 0.9720994830131531, "learning_rate": 3.7483558122774376e-05, "loss": 0.1694, "step": 22080 }, { "epoch": 2.0015406924052925, "grad_norm": 0.8606445789337158, "learning_rate": 3.747670521614322e-05, "loss": 0.2, "step": 22085 }, { "epoch": 2.001993837230379, "grad_norm": 0.9236133694648743, "learning_rate": 3.746985106083919e-05, "loss": 0.1903, "step": 22090 }, { "epoch": 2.002446982055465, "grad_norm": 0.8067532777786255, "learning_rate": 3.746299565754825e-05, "loss": 0.1759, "step": 22095 }, { "epoch": 2.002900126880551, "grad_norm": 0.6811484098434448, "learning_rate": 3.745613900695649e-05, "loss": 0.1465, "step": 22100 }, { "epoch": 2.0033532717056373, "grad_norm": 1.0549582242965698, "learning_rate": 3.744928110975012e-05, "loss": 0.1736, "step": 22105 }, { "epoch": 2.0038064165307232, "grad_norm": 0.8454669713973999, "learning_rate": 3.74424219666155e-05, "loss": 0.1841, "step": 22110 }, { "epoch": 2.004259561355809, "grad_norm": 0.9032926559448242, "learning_rate": 3.743556157823908e-05, "loss": 0.211, "step": 22115 }, { "epoch": 2.0047127061808956, "grad_norm": 0.8657379150390625, "learning_rate": 3.742869994530746e-05, "loss": 0.1721, "step": 22120 }, { "epoch": 2.0051658510059815, "grad_norm": 0.8917607665061951, "learning_rate": 3.742183706850733e-05, "loss": 0.1975, "step": 22125 }, { "epoch": 2.0056189958310675, "grad_norm": 0.8672922849655151, "learning_rate": 3.741497294852556e-05, "loss": 0.1748, "step": 22130 }, { "epoch": 2.006072140656154, "grad_norm": 0.931098461151123, "learning_rate": 3.74081075860491e-05, "loss": 0.1832, "step": 22135 }, { "epoch": 2.00652528548124, "grad_norm": 0.8350840210914612, "learning_rate": 3.7401240981765037e-05, "loss": 0.1607, "step": 22140 }, { "epoch": 2.006978430306326, "grad_norm": 0.9072911143302917, "learning_rate": 3.739437313636058e-05, "loss": 0.1652, "step": 22145 }, { "epoch": 2.007431575131412, "grad_norm": 0.8175344467163086, "learning_rate": 3.738750405052308e-05, "loss": 0.1569, "step": 22150 }, { "epoch": 2.007884719956498, "grad_norm": 0.8878505825996399, "learning_rate": 3.7380633724939974e-05, "loss": 0.1739, "step": 22155 }, { "epoch": 2.008337864781584, "grad_norm": 0.7836102247238159, "learning_rate": 3.7373762160298865e-05, "loss": 0.1844, "step": 22160 }, { "epoch": 2.00879100960667, "grad_norm": 0.8317826986312866, "learning_rate": 3.736688935728746e-05, "loss": 0.1554, "step": 22165 }, { "epoch": 2.0092441544317565, "grad_norm": 0.7742133736610413, "learning_rate": 3.736001531659358e-05, "loss": 0.1859, "step": 22170 }, { "epoch": 2.0096972992568425, "grad_norm": 0.8628591299057007, "learning_rate": 3.73531400389052e-05, "loss": 0.1821, "step": 22175 }, { "epoch": 2.0101504440819284, "grad_norm": 0.87382972240448, "learning_rate": 3.734626352491038e-05, "loss": 0.1703, "step": 22180 }, { "epoch": 2.010603588907015, "grad_norm": 0.7797926068305969, "learning_rate": 3.733938577529733e-05, "loss": 0.1664, "step": 22185 }, { "epoch": 2.011056733732101, "grad_norm": 0.9192909598350525, "learning_rate": 3.733250679075438e-05, "loss": 0.1576, "step": 22190 }, { "epoch": 2.0115098785571868, "grad_norm": 0.9244998693466187, "learning_rate": 3.7325626571969994e-05, "loss": 0.1937, "step": 22195 }, { "epoch": 2.011963023382273, "grad_norm": 0.8081252574920654, "learning_rate": 3.731874511963273e-05, "loss": 0.1349, "step": 22200 }, { "epoch": 2.012416168207359, "grad_norm": 0.7809156179428101, "learning_rate": 3.731186243443128e-05, "loss": 0.1755, "step": 22205 }, { "epoch": 2.012869313032445, "grad_norm": 0.77298504114151, "learning_rate": 3.730497851705448e-05, "loss": 0.1388, "step": 22210 }, { "epoch": 2.013322457857531, "grad_norm": 0.878730058670044, "learning_rate": 3.729809336819128e-05, "loss": 0.1816, "step": 22215 }, { "epoch": 2.0137756026826175, "grad_norm": 0.8137261867523193, "learning_rate": 3.729120698853073e-05, "loss": 0.1714, "step": 22220 }, { "epoch": 2.0142287475077034, "grad_norm": 0.8559881448745728, "learning_rate": 3.7284319378762034e-05, "loss": 0.1712, "step": 22225 }, { "epoch": 2.0146818923327894, "grad_norm": 0.8117128610610962, "learning_rate": 3.7277430539574494e-05, "loss": 0.2054, "step": 22230 }, { "epoch": 2.015135037157876, "grad_norm": 0.8960525989532471, "learning_rate": 3.7270540471657555e-05, "loss": 0.1879, "step": 22235 }, { "epoch": 2.0155881819829617, "grad_norm": 0.691987156867981, "learning_rate": 3.7263649175700784e-05, "loss": 0.185, "step": 22240 }, { "epoch": 2.0160413268080477, "grad_norm": 0.9108364582061768, "learning_rate": 3.7256756652393855e-05, "loss": 0.1938, "step": 22245 }, { "epoch": 2.016494471633134, "grad_norm": 0.9484875798225403, "learning_rate": 3.724986290242658e-05, "loss": 0.1759, "step": 22250 }, { "epoch": 2.01694761645822, "grad_norm": 0.9005046486854553, "learning_rate": 3.724296792648888e-05, "loss": 0.1908, "step": 22255 }, { "epoch": 2.017400761283306, "grad_norm": 1.059119462966919, "learning_rate": 3.723607172527082e-05, "loss": 0.2014, "step": 22260 }, { "epoch": 2.0178539061083924, "grad_norm": 0.8929831981658936, "learning_rate": 3.722917429946255e-05, "loss": 0.1605, "step": 22265 }, { "epoch": 2.0183070509334784, "grad_norm": 0.9903032183647156, "learning_rate": 3.722227564975439e-05, "loss": 0.1822, "step": 22270 }, { "epoch": 2.0187601957585644, "grad_norm": 0.7874348759651184, "learning_rate": 3.721537577683675e-05, "loss": 0.1834, "step": 22275 }, { "epoch": 2.0192133405836503, "grad_norm": 0.7605298757553101, "learning_rate": 3.720847468140018e-05, "loss": 0.1617, "step": 22280 }, { "epoch": 2.0196664854087367, "grad_norm": 0.7781873941421509, "learning_rate": 3.720157236413533e-05, "loss": 0.1688, "step": 22285 }, { "epoch": 2.0201196302338227, "grad_norm": 0.7934197783470154, "learning_rate": 3.719466882573299e-05, "loss": 0.1396, "step": 22290 }, { "epoch": 2.0205727750589086, "grad_norm": 0.8043756484985352, "learning_rate": 3.718776406688408e-05, "loss": 0.1601, "step": 22295 }, { "epoch": 2.021025919883995, "grad_norm": 0.7524570822715759, "learning_rate": 3.718085808827961e-05, "loss": 0.1707, "step": 22300 }, { "epoch": 2.021479064709081, "grad_norm": 0.8645891547203064, "learning_rate": 3.7173950890610756e-05, "loss": 0.1905, "step": 22305 }, { "epoch": 2.021932209534167, "grad_norm": 0.8964389562606812, "learning_rate": 3.716704247456879e-05, "loss": 0.2013, "step": 22310 }, { "epoch": 2.0223853543592534, "grad_norm": 0.8922165632247925, "learning_rate": 3.7160132840845085e-05, "loss": 0.1709, "step": 22315 }, { "epoch": 2.0228384991843393, "grad_norm": 0.8972598910331726, "learning_rate": 3.7153221990131174e-05, "loss": 0.1835, "step": 22320 }, { "epoch": 2.0232916440094253, "grad_norm": 0.8506314754486084, "learning_rate": 3.7146309923118697e-05, "loss": 0.2042, "step": 22325 }, { "epoch": 2.0237447888345117, "grad_norm": 0.896125316619873, "learning_rate": 3.713939664049942e-05, "loss": 0.1639, "step": 22330 }, { "epoch": 2.0241979336595977, "grad_norm": 0.8813570141792297, "learning_rate": 3.713248214296523e-05, "loss": 0.1866, "step": 22335 }, { "epoch": 2.0246510784846836, "grad_norm": 0.7909635305404663, "learning_rate": 3.7125566431208125e-05, "loss": 0.159, "step": 22340 }, { "epoch": 2.0251042233097696, "grad_norm": 0.9119857549667358, "learning_rate": 3.7118649505920224e-05, "loss": 0.172, "step": 22345 }, { "epoch": 2.025557368134856, "grad_norm": 0.812126636505127, "learning_rate": 3.711173136779379e-05, "loss": 0.1664, "step": 22350 }, { "epoch": 2.026010512959942, "grad_norm": 0.9765439033508301, "learning_rate": 3.7104812017521185e-05, "loss": 0.2009, "step": 22355 }, { "epoch": 2.026463657785028, "grad_norm": 0.892051637172699, "learning_rate": 3.70978914557949e-05, "loss": 0.1694, "step": 22360 }, { "epoch": 2.0269168026101143, "grad_norm": 0.8164052963256836, "learning_rate": 3.7090969683307546e-05, "loss": 0.1654, "step": 22365 }, { "epoch": 2.0273699474352003, "grad_norm": 0.8245729207992554, "learning_rate": 3.708404670075186e-05, "loss": 0.1732, "step": 22370 }, { "epoch": 2.0278230922602862, "grad_norm": 0.8732262253761292, "learning_rate": 3.70771225088207e-05, "loss": 0.2103, "step": 22375 }, { "epoch": 2.0282762370853726, "grad_norm": 0.9373519420623779, "learning_rate": 3.707019710820703e-05, "loss": 0.1837, "step": 22380 }, { "epoch": 2.0287293819104586, "grad_norm": 0.7925602197647095, "learning_rate": 3.7063270499603965e-05, "loss": 0.1775, "step": 22385 }, { "epoch": 2.0291825267355446, "grad_norm": 0.8696021437644958, "learning_rate": 3.7056342683704695e-05, "loss": 0.1766, "step": 22390 }, { "epoch": 2.029635671560631, "grad_norm": 0.871673047542572, "learning_rate": 3.704941366120258e-05, "loss": 0.1725, "step": 22395 }, { "epoch": 2.030088816385717, "grad_norm": 0.796028196811676, "learning_rate": 3.7042483432791074e-05, "loss": 0.1763, "step": 22400 }, { "epoch": 2.030541961210803, "grad_norm": 0.8869535326957703, "learning_rate": 3.703555199916375e-05, "loss": 0.2065, "step": 22405 }, { "epoch": 2.0309951060358893, "grad_norm": 0.8056180477142334, "learning_rate": 3.7028619361014324e-05, "loss": 0.1764, "step": 22410 }, { "epoch": 2.0314482508609752, "grad_norm": 0.7686439156532288, "learning_rate": 3.70216855190366e-05, "loss": 0.1745, "step": 22415 }, { "epoch": 2.031901395686061, "grad_norm": 0.9170577526092529, "learning_rate": 3.701475047392452e-05, "loss": 0.2056, "step": 22420 }, { "epoch": 2.032354540511147, "grad_norm": 0.8419462442398071, "learning_rate": 3.7007814226372164e-05, "loss": 0.1819, "step": 22425 }, { "epoch": 2.0328076853362336, "grad_norm": 0.892756998538971, "learning_rate": 3.700087677707369e-05, "loss": 0.185, "step": 22430 }, { "epoch": 2.0332608301613195, "grad_norm": 0.8841712474822998, "learning_rate": 3.699393812672341e-05, "loss": 0.2007, "step": 22435 }, { "epoch": 2.0337139749864055, "grad_norm": 0.892970085144043, "learning_rate": 3.698699827601575e-05, "loss": 0.2086, "step": 22440 }, { "epoch": 2.034167119811492, "grad_norm": 0.8341164588928223, "learning_rate": 3.698005722564526e-05, "loss": 0.1862, "step": 22445 }, { "epoch": 2.034620264636578, "grad_norm": 0.7765936851501465, "learning_rate": 3.6973114976306586e-05, "loss": 0.1685, "step": 22450 }, { "epoch": 2.035073409461664, "grad_norm": 0.9223626255989075, "learning_rate": 3.696617152869451e-05, "loss": 0.1709, "step": 22455 }, { "epoch": 2.0355265542867502, "grad_norm": 0.7542870044708252, "learning_rate": 3.695922688350395e-05, "loss": 0.1813, "step": 22460 }, { "epoch": 2.035979699111836, "grad_norm": 0.9414351582527161, "learning_rate": 3.695228104142992e-05, "loss": 0.2001, "step": 22465 }, { "epoch": 2.036432843936922, "grad_norm": 0.7905782461166382, "learning_rate": 3.6945334003167564e-05, "loss": 0.1819, "step": 22470 }, { "epoch": 2.0368859887620085, "grad_norm": 0.7844231724739075, "learning_rate": 3.693838576941214e-05, "loss": 0.2104, "step": 22475 }, { "epoch": 2.0373391335870945, "grad_norm": 0.9006971120834351, "learning_rate": 3.6931436340859037e-05, "loss": 0.1878, "step": 22480 }, { "epoch": 2.0377922784121805, "grad_norm": 0.9116414189338684, "learning_rate": 3.692448571820374e-05, "loss": 0.167, "step": 22485 }, { "epoch": 2.0382454232372664, "grad_norm": 0.8953067064285278, "learning_rate": 3.691753390214189e-05, "loss": 0.1971, "step": 22490 }, { "epoch": 2.038698568062353, "grad_norm": 0.7997906804084778, "learning_rate": 3.6910580893369225e-05, "loss": 0.2081, "step": 22495 }, { "epoch": 2.039151712887439, "grad_norm": 0.8878154158592224, "learning_rate": 3.690362669258159e-05, "loss": 0.1823, "step": 22500 }, { "epoch": 2.0396048577125248, "grad_norm": 0.7929674386978149, "learning_rate": 3.689667130047497e-05, "loss": 0.1808, "step": 22505 }, { "epoch": 2.040058002537611, "grad_norm": 0.8815019130706787, "learning_rate": 3.688971471774547e-05, "loss": 0.1853, "step": 22510 }, { "epoch": 2.040511147362697, "grad_norm": 0.8102365732192993, "learning_rate": 3.6882756945089295e-05, "loss": 0.167, "step": 22515 }, { "epoch": 2.040964292187783, "grad_norm": 0.9181260466575623, "learning_rate": 3.68757979832028e-05, "loss": 0.1829, "step": 22520 }, { "epoch": 2.0414174370128695, "grad_norm": 1.0807809829711914, "learning_rate": 3.6868837832782414e-05, "loss": 0.1792, "step": 22525 }, { "epoch": 2.0418705818379554, "grad_norm": 0.8072863221168518, "learning_rate": 3.6861876494524724e-05, "loss": 0.1492, "step": 22530 }, { "epoch": 2.0423237266630414, "grad_norm": 1.027867078781128, "learning_rate": 3.6854913969126445e-05, "loss": 0.1865, "step": 22535 }, { "epoch": 2.042776871488128, "grad_norm": 0.7659299969673157, "learning_rate": 3.684795025728435e-05, "loss": 0.1472, "step": 22540 }, { "epoch": 2.0432300163132138, "grad_norm": 0.9479508996009827, "learning_rate": 3.684098535969539e-05, "loss": 0.1903, "step": 22545 }, { "epoch": 2.0436831611382997, "grad_norm": 0.9499671459197998, "learning_rate": 3.6834019277056615e-05, "loss": 0.1877, "step": 22550 }, { "epoch": 2.0441363059633857, "grad_norm": 0.8078908324241638, "learning_rate": 3.682705201006519e-05, "loss": 0.1497, "step": 22555 }, { "epoch": 2.044589450788472, "grad_norm": 0.8687248826026917, "learning_rate": 3.682008355941841e-05, "loss": 0.1526, "step": 22560 }, { "epoch": 2.045042595613558, "grad_norm": 0.8610833883285522, "learning_rate": 3.6813113925813656e-05, "loss": 0.1743, "step": 22565 }, { "epoch": 2.045495740438644, "grad_norm": 0.865368664264679, "learning_rate": 3.680614310994848e-05, "loss": 0.1628, "step": 22570 }, { "epoch": 2.0459488852637304, "grad_norm": 0.8356055617332458, "learning_rate": 3.6799171112520506e-05, "loss": 0.1693, "step": 22575 }, { "epoch": 2.0464020300888164, "grad_norm": 0.8738073110580444, "learning_rate": 3.6792197934227496e-05, "loss": 0.204, "step": 22580 }, { "epoch": 2.0468551749139023, "grad_norm": 1.006347417831421, "learning_rate": 3.678522357576734e-05, "loss": 0.1965, "step": 22585 }, { "epoch": 2.0473083197389887, "grad_norm": 0.8786470890045166, "learning_rate": 3.6778248037838016e-05, "loss": 0.1891, "step": 22590 }, { "epoch": 2.0477614645640747, "grad_norm": 0.7637738585472107, "learning_rate": 3.677127132113764e-05, "loss": 0.1766, "step": 22595 }, { "epoch": 2.0482146093891607, "grad_norm": 0.771630585193634, "learning_rate": 3.676429342636446e-05, "loss": 0.1881, "step": 22600 }, { "epoch": 2.048667754214247, "grad_norm": 0.8699048161506653, "learning_rate": 3.675731435421682e-05, "loss": 0.1872, "step": 22605 }, { "epoch": 2.049120899039333, "grad_norm": 0.903174877166748, "learning_rate": 3.675033410539318e-05, "loss": 0.1681, "step": 22610 }, { "epoch": 2.049574043864419, "grad_norm": 0.8789992332458496, "learning_rate": 3.674335268059213e-05, "loss": 0.1711, "step": 22615 }, { "epoch": 2.050027188689505, "grad_norm": 0.822296142578125, "learning_rate": 3.673637008051237e-05, "loss": 0.1663, "step": 22620 }, { "epoch": 2.0504803335145914, "grad_norm": 0.8558931946754456, "learning_rate": 3.6729386305852724e-05, "loss": 0.166, "step": 22625 }, { "epoch": 2.0509334783396773, "grad_norm": 0.8625283241271973, "learning_rate": 3.672240135731214e-05, "loss": 0.1582, "step": 22630 }, { "epoch": 2.0513866231647633, "grad_norm": 1.0099583864212036, "learning_rate": 3.671541523558967e-05, "loss": 0.1535, "step": 22635 }, { "epoch": 2.0518397679898497, "grad_norm": 0.8172871470451355, "learning_rate": 3.670842794138447e-05, "loss": 0.1738, "step": 22640 }, { "epoch": 2.0522929128149356, "grad_norm": 0.853092610836029, "learning_rate": 3.6701439475395846e-05, "loss": 0.162, "step": 22645 }, { "epoch": 2.0527460576400216, "grad_norm": 0.8478779196739197, "learning_rate": 3.6694449838323205e-05, "loss": 0.1766, "step": 22650 }, { "epoch": 2.053199202465108, "grad_norm": 0.9399266242980957, "learning_rate": 3.6687459030866076e-05, "loss": 0.1798, "step": 22655 }, { "epoch": 2.053652347290194, "grad_norm": 0.7646821737289429, "learning_rate": 3.668046705372409e-05, "loss": 0.169, "step": 22660 }, { "epoch": 2.05410549211528, "grad_norm": 0.901872992515564, "learning_rate": 3.667347390759702e-05, "loss": 0.2202, "step": 22665 }, { "epoch": 2.0545586369403663, "grad_norm": 0.8655814528465271, "learning_rate": 3.666647959318473e-05, "loss": 0.1721, "step": 22670 }, { "epoch": 2.0550117817654523, "grad_norm": 0.8158611059188843, "learning_rate": 3.6659484111187216e-05, "loss": 0.1779, "step": 22675 }, { "epoch": 2.0554649265905383, "grad_norm": 0.9957107901573181, "learning_rate": 3.665248746230459e-05, "loss": 0.1817, "step": 22680 }, { "epoch": 2.055918071415624, "grad_norm": 0.7783240675926208, "learning_rate": 3.664548964723708e-05, "loss": 0.1759, "step": 22685 }, { "epoch": 2.0563712162407106, "grad_norm": 0.8524824380874634, "learning_rate": 3.663849066668504e-05, "loss": 0.16, "step": 22690 }, { "epoch": 2.0568243610657966, "grad_norm": 0.8465549945831299, "learning_rate": 3.663149052134891e-05, "loss": 0.1908, "step": 22695 }, { "epoch": 2.0572775058908825, "grad_norm": 0.8432751297950745, "learning_rate": 3.662448921192928e-05, "loss": 0.1679, "step": 22700 }, { "epoch": 2.057730650715969, "grad_norm": 0.7676463723182678, "learning_rate": 3.661748673912684e-05, "loss": 0.1517, "step": 22705 }, { "epoch": 2.058183795541055, "grad_norm": 1.0123859643936157, "learning_rate": 3.6610483103642394e-05, "loss": 0.1499, "step": 22710 }, { "epoch": 2.058636940366141, "grad_norm": 0.9530651569366455, "learning_rate": 3.660347830617687e-05, "loss": 0.1574, "step": 22715 }, { "epoch": 2.0590900851912273, "grad_norm": 0.9215995669364929, "learning_rate": 3.659647234743133e-05, "loss": 0.1908, "step": 22720 }, { "epoch": 2.0595432300163132, "grad_norm": 0.8075702786445618, "learning_rate": 3.658946522810692e-05, "loss": 0.2158, "step": 22725 }, { "epoch": 2.059996374841399, "grad_norm": 0.8854828476905823, "learning_rate": 3.65824569489049e-05, "loss": 0.1569, "step": 22730 }, { "epoch": 2.0604495196664856, "grad_norm": 0.8988309502601624, "learning_rate": 3.657544751052667e-05, "loss": 0.1525, "step": 22735 }, { "epoch": 2.0609026644915716, "grad_norm": 0.9154642820358276, "learning_rate": 3.6568436913673744e-05, "loss": 0.1924, "step": 22740 }, { "epoch": 2.0613558093166575, "grad_norm": 0.7112711668014526, "learning_rate": 3.656142515904775e-05, "loss": 0.1454, "step": 22745 }, { "epoch": 2.0618089541417435, "grad_norm": 0.8023086786270142, "learning_rate": 3.655441224735041e-05, "loss": 0.1703, "step": 22750 }, { "epoch": 2.06226209896683, "grad_norm": 0.9270704984664917, "learning_rate": 3.654739817928358e-05, "loss": 0.1793, "step": 22755 }, { "epoch": 2.062715243791916, "grad_norm": 0.778030276298523, "learning_rate": 3.654038295554924e-05, "loss": 0.1903, "step": 22760 }, { "epoch": 2.063168388617002, "grad_norm": 0.8919812440872192, "learning_rate": 3.653336657684948e-05, "loss": 0.1594, "step": 22765 }, { "epoch": 2.063621533442088, "grad_norm": 0.7637484669685364, "learning_rate": 3.6526349043886495e-05, "loss": 0.1702, "step": 22770 }, { "epoch": 2.064074678267174, "grad_norm": 0.8229074478149414, "learning_rate": 3.6519330357362594e-05, "loss": 0.1785, "step": 22775 }, { "epoch": 2.06452782309226, "grad_norm": 0.8848685026168823, "learning_rate": 3.651231051798022e-05, "loss": 0.1699, "step": 22780 }, { "epoch": 2.0649809679173465, "grad_norm": 0.9159567952156067, "learning_rate": 3.6505289526441914e-05, "loss": 0.1716, "step": 22785 }, { "epoch": 2.0654341127424325, "grad_norm": 0.8929039239883423, "learning_rate": 3.6498267383450344e-05, "loss": 0.2053, "step": 22790 }, { "epoch": 2.0658872575675185, "grad_norm": 0.7472127079963684, "learning_rate": 3.6491244089708285e-05, "loss": 0.1422, "step": 22795 }, { "epoch": 2.066340402392605, "grad_norm": 0.7749798893928528, "learning_rate": 3.6484219645918646e-05, "loss": 0.1871, "step": 22800 }, { "epoch": 2.066793547217691, "grad_norm": 0.8088188171386719, "learning_rate": 3.647719405278441e-05, "loss": 0.1546, "step": 22805 }, { "epoch": 2.067246692042777, "grad_norm": 0.9294149875640869, "learning_rate": 3.647016731100871e-05, "loss": 0.1788, "step": 22810 }, { "epoch": 2.067699836867863, "grad_norm": 0.8703341484069824, "learning_rate": 3.6463139421294794e-05, "loss": 0.1919, "step": 22815 }, { "epoch": 2.068152981692949, "grad_norm": 0.8896740078926086, "learning_rate": 3.6456110384346017e-05, "loss": 0.1629, "step": 22820 }, { "epoch": 2.068606126518035, "grad_norm": 0.8650333881378174, "learning_rate": 3.644908020086583e-05, "loss": 0.1836, "step": 22825 }, { "epoch": 2.069059271343121, "grad_norm": 0.8821173906326294, "learning_rate": 3.644204887155783e-05, "loss": 0.1739, "step": 22830 }, { "epoch": 2.0695124161682075, "grad_norm": 0.8679720759391785, "learning_rate": 3.643501639712571e-05, "loss": 0.2116, "step": 22835 }, { "epoch": 2.0699655609932934, "grad_norm": 0.8787007927894592, "learning_rate": 3.642798277827328e-05, "loss": 0.1504, "step": 22840 }, { "epoch": 2.0704187058183794, "grad_norm": 0.8184781074523926, "learning_rate": 3.642094801570448e-05, "loss": 0.1563, "step": 22845 }, { "epoch": 2.070871850643466, "grad_norm": 1.005205512046814, "learning_rate": 3.641391211012333e-05, "loss": 0.1934, "step": 22850 }, { "epoch": 2.0713249954685518, "grad_norm": 0.9737535119056702, "learning_rate": 3.6406875062234e-05, "loss": 0.1653, "step": 22855 }, { "epoch": 2.0717781402936377, "grad_norm": 0.8128912448883057, "learning_rate": 3.6399836872740765e-05, "loss": 0.159, "step": 22860 }, { "epoch": 2.072231285118724, "grad_norm": 0.8592696785926819, "learning_rate": 3.6392797542348e-05, "loss": 0.1941, "step": 22865 }, { "epoch": 2.07268442994381, "grad_norm": 0.9716417789459229, "learning_rate": 3.63857570717602e-05, "loss": 0.194, "step": 22870 }, { "epoch": 2.073137574768896, "grad_norm": 0.8976809978485107, "learning_rate": 3.637871546168198e-05, "loss": 0.1822, "step": 22875 }, { "epoch": 2.0735907195939824, "grad_norm": 0.866005003452301, "learning_rate": 3.637167271281808e-05, "loss": 0.198, "step": 22880 }, { "epoch": 2.0740438644190684, "grad_norm": 0.7867100238800049, "learning_rate": 3.636462882587333e-05, "loss": 0.1751, "step": 22885 }, { "epoch": 2.0744970092441544, "grad_norm": 0.8565154075622559, "learning_rate": 3.6357583801552685e-05, "loss": 0.191, "step": 22890 }, { "epoch": 2.0749501540692403, "grad_norm": 0.9041048884391785, "learning_rate": 3.635053764056121e-05, "loss": 0.1544, "step": 22895 }, { "epoch": 2.0754032988943267, "grad_norm": 0.8082165718078613, "learning_rate": 3.634349034360409e-05, "loss": 0.1932, "step": 22900 }, { "epoch": 2.0758564437194127, "grad_norm": 0.9144600033760071, "learning_rate": 3.6336441911386627e-05, "loss": 0.1886, "step": 22905 }, { "epoch": 2.0763095885444987, "grad_norm": 0.8665714859962463, "learning_rate": 3.632939234461422e-05, "loss": 0.1644, "step": 22910 }, { "epoch": 2.076762733369585, "grad_norm": 0.876072883605957, "learning_rate": 3.63223416439924e-05, "loss": 0.15, "step": 22915 }, { "epoch": 2.077215878194671, "grad_norm": 1.0315203666687012, "learning_rate": 3.63152898102268e-05, "loss": 0.2015, "step": 22920 }, { "epoch": 2.077669023019757, "grad_norm": 0.7132622599601746, "learning_rate": 3.6308236844023166e-05, "loss": 0.1703, "step": 22925 }, { "epoch": 2.0781221678448434, "grad_norm": 0.7947543263435364, "learning_rate": 3.630118274608738e-05, "loss": 0.1699, "step": 22930 }, { "epoch": 2.0785753126699293, "grad_norm": 0.9550694227218628, "learning_rate": 3.6294127517125394e-05, "loss": 0.1914, "step": 22935 }, { "epoch": 2.0790284574950153, "grad_norm": 0.9360312223434448, "learning_rate": 3.628707115784331e-05, "loss": 0.1955, "step": 22940 }, { "epoch": 2.0794816023201017, "grad_norm": 0.8941988348960876, "learning_rate": 3.628001366894732e-05, "loss": 0.1778, "step": 22945 }, { "epoch": 2.0799347471451877, "grad_norm": 0.8144424557685852, "learning_rate": 3.627295505114376e-05, "loss": 0.1595, "step": 22950 }, { "epoch": 2.0803878919702736, "grad_norm": 0.7917506694793701, "learning_rate": 3.626589530513904e-05, "loss": 0.1781, "step": 22955 }, { "epoch": 2.0808410367953596, "grad_norm": 0.9675992131233215, "learning_rate": 3.625883443163972e-05, "loss": 0.1771, "step": 22960 }, { "epoch": 2.081294181620446, "grad_norm": 0.885948896408081, "learning_rate": 3.625177243135244e-05, "loss": 0.2109, "step": 22965 }, { "epoch": 2.081747326445532, "grad_norm": 0.9414727091789246, "learning_rate": 3.6244709304983964e-05, "loss": 0.1813, "step": 22970 }, { "epoch": 2.082200471270618, "grad_norm": 0.8814032077789307, "learning_rate": 3.623764505324119e-05, "loss": 0.1677, "step": 22975 }, { "epoch": 2.0826536160957043, "grad_norm": 1.1609623432159424, "learning_rate": 3.623057967683109e-05, "loss": 0.1978, "step": 22980 }, { "epoch": 2.0831067609207903, "grad_norm": 0.7607418298721313, "learning_rate": 3.622351317646079e-05, "loss": 0.1678, "step": 22985 }, { "epoch": 2.0835599057458762, "grad_norm": 0.9396650195121765, "learning_rate": 3.621644555283749e-05, "loss": 0.186, "step": 22990 }, { "epoch": 2.0840130505709626, "grad_norm": 0.9658262729644775, "learning_rate": 3.620937680666853e-05, "loss": 0.1991, "step": 22995 }, { "epoch": 2.0844661953960486, "grad_norm": 0.9671950936317444, "learning_rate": 3.6202306938661347e-05, "loss": 0.1855, "step": 23000 }, { "epoch": 2.0849193402211346, "grad_norm": 0.8541342616081238, "learning_rate": 3.61952359495235e-05, "loss": 0.1614, "step": 23005 }, { "epoch": 2.085372485046221, "grad_norm": 0.7937174439430237, "learning_rate": 3.618816383996266e-05, "loss": 0.1843, "step": 23010 }, { "epoch": 2.085825629871307, "grad_norm": 0.8027775883674622, "learning_rate": 3.618109061068659e-05, "loss": 0.1672, "step": 23015 }, { "epoch": 2.086278774696393, "grad_norm": 0.7914005517959595, "learning_rate": 3.61740162624032e-05, "loss": 0.1576, "step": 23020 }, { "epoch": 2.086731919521479, "grad_norm": 0.854601263999939, "learning_rate": 3.616694079582048e-05, "loss": 0.154, "step": 23025 }, { "epoch": 2.0871850643465653, "grad_norm": 0.9226366281509399, "learning_rate": 3.615986421164656e-05, "loss": 0.1585, "step": 23030 }, { "epoch": 2.087638209171651, "grad_norm": 0.7509725689888, "learning_rate": 3.6152786510589643e-05, "loss": 0.1473, "step": 23035 }, { "epoch": 2.088091353996737, "grad_norm": 0.9036109447479248, "learning_rate": 3.614570769335809e-05, "loss": 0.1726, "step": 23040 }, { "epoch": 2.0885444988218236, "grad_norm": 0.8831096291542053, "learning_rate": 3.613862776066034e-05, "loss": 0.1768, "step": 23045 }, { "epoch": 2.0889976436469095, "grad_norm": 0.8393821120262146, "learning_rate": 3.6131546713204964e-05, "loss": 0.1623, "step": 23050 }, { "epoch": 2.0894507884719955, "grad_norm": 0.9557551741600037, "learning_rate": 3.612446455170063e-05, "loss": 0.1883, "step": 23055 }, { "epoch": 2.089903933297082, "grad_norm": 0.9194515347480774, "learning_rate": 3.6117381276856116e-05, "loss": 0.1795, "step": 23060 }, { "epoch": 2.090357078122168, "grad_norm": 0.8806637525558472, "learning_rate": 3.611029688938033e-05, "loss": 0.2353, "step": 23065 }, { "epoch": 2.090810222947254, "grad_norm": 0.8432883024215698, "learning_rate": 3.610321138998227e-05, "loss": 0.1862, "step": 23070 }, { "epoch": 2.0912633677723402, "grad_norm": 0.9618788361549377, "learning_rate": 3.609612477937107e-05, "loss": 0.1722, "step": 23075 }, { "epoch": 2.091716512597426, "grad_norm": 0.9316142201423645, "learning_rate": 3.6089037058255945e-05, "loss": 0.182, "step": 23080 }, { "epoch": 2.092169657422512, "grad_norm": 0.8722761869430542, "learning_rate": 3.608194822734624e-05, "loss": 0.1807, "step": 23085 }, { "epoch": 2.0926228022475986, "grad_norm": 0.9408997893333435, "learning_rate": 3.6074858287351415e-05, "loss": 0.1908, "step": 23090 }, { "epoch": 2.0930759470726845, "grad_norm": 0.8604814410209656, "learning_rate": 3.606776723898103e-05, "loss": 0.1732, "step": 23095 }, { "epoch": 2.0935290918977705, "grad_norm": 1.0590484142303467, "learning_rate": 3.6060675082944756e-05, "loss": 0.1824, "step": 23100 }, { "epoch": 2.0939822367228564, "grad_norm": 0.8744784593582153, "learning_rate": 3.605358181995238e-05, "loss": 0.1698, "step": 23105 }, { "epoch": 2.094435381547943, "grad_norm": 0.7088775634765625, "learning_rate": 3.60464874507138e-05, "loss": 0.1584, "step": 23110 }, { "epoch": 2.094888526373029, "grad_norm": 0.8046888113021851, "learning_rate": 3.603939197593902e-05, "loss": 0.1867, "step": 23115 }, { "epoch": 2.0953416711981148, "grad_norm": 0.9105606079101562, "learning_rate": 3.603229539633816e-05, "loss": 0.1531, "step": 23120 }, { "epoch": 2.095794816023201, "grad_norm": 0.9043077230453491, "learning_rate": 3.602519771262145e-05, "loss": 0.1733, "step": 23125 }, { "epoch": 2.096247960848287, "grad_norm": 0.8019181489944458, "learning_rate": 3.601809892549922e-05, "loss": 0.1587, "step": 23130 }, { "epoch": 2.096701105673373, "grad_norm": 0.8193879723548889, "learning_rate": 3.601099903568192e-05, "loss": 0.2028, "step": 23135 }, { "epoch": 2.0971542504984595, "grad_norm": 0.7357867360115051, "learning_rate": 3.600389804388013e-05, "loss": 0.1765, "step": 23140 }, { "epoch": 2.0976073953235455, "grad_norm": 0.8132690787315369, "learning_rate": 3.59967959508045e-05, "loss": 0.1948, "step": 23145 }, { "epoch": 2.0980605401486314, "grad_norm": 0.7758851647377014, "learning_rate": 3.5989692757165804e-05, "loss": 0.174, "step": 23150 }, { "epoch": 2.0985136849737174, "grad_norm": 0.8674086928367615, "learning_rate": 3.5982588463674947e-05, "loss": 0.187, "step": 23155 }, { "epoch": 2.098966829798804, "grad_norm": 0.768828272819519, "learning_rate": 3.5975483071042925e-05, "loss": 0.1587, "step": 23160 }, { "epoch": 2.0994199746238897, "grad_norm": 0.9132446050643921, "learning_rate": 3.596837657998085e-05, "loss": 0.1573, "step": 23165 }, { "epoch": 2.0998731194489757, "grad_norm": 0.8886389136314392, "learning_rate": 3.596126899119994e-05, "loss": 0.1937, "step": 23170 }, { "epoch": 2.100326264274062, "grad_norm": 0.8623743057250977, "learning_rate": 3.595416030541152e-05, "loss": 0.1885, "step": 23175 }, { "epoch": 2.100779409099148, "grad_norm": 0.9459007978439331, "learning_rate": 3.5947050523327024e-05, "loss": 0.1831, "step": 23180 }, { "epoch": 2.101232553924234, "grad_norm": 0.8972563743591309, "learning_rate": 3.5939939645658024e-05, "loss": 0.1823, "step": 23185 }, { "epoch": 2.1016856987493204, "grad_norm": 0.7787379622459412, "learning_rate": 3.593282767311617e-05, "loss": 0.1719, "step": 23190 }, { "epoch": 2.1021388435744064, "grad_norm": 0.8476887941360474, "learning_rate": 3.592571460641321e-05, "loss": 0.1582, "step": 23195 }, { "epoch": 2.1025919883994924, "grad_norm": 0.88924640417099, "learning_rate": 3.591860044626104e-05, "loss": 0.1899, "step": 23200 }, { "epoch": 2.1030451332245788, "grad_norm": 0.8700147867202759, "learning_rate": 3.591148519337166e-05, "loss": 0.1815, "step": 23205 }, { "epoch": 2.1034982780496647, "grad_norm": 0.8005215525627136, "learning_rate": 3.590436884845715e-05, "loss": 0.1551, "step": 23210 }, { "epoch": 2.1039514228747507, "grad_norm": 0.9116935133934021, "learning_rate": 3.589725141222972e-05, "loss": 0.1591, "step": 23215 }, { "epoch": 2.104404567699837, "grad_norm": 0.8657578229904175, "learning_rate": 3.5890132885401674e-05, "loss": 0.1455, "step": 23220 }, { "epoch": 2.104857712524923, "grad_norm": 0.9980453848838806, "learning_rate": 3.588301326868545e-05, "loss": 0.1815, "step": 23225 }, { "epoch": 2.105310857350009, "grad_norm": 0.7741807103157043, "learning_rate": 3.587589256279359e-05, "loss": 0.1622, "step": 23230 }, { "epoch": 2.105764002175095, "grad_norm": 0.8536080718040466, "learning_rate": 3.586877076843872e-05, "loss": 0.154, "step": 23235 }, { "epoch": 2.1062171470001814, "grad_norm": 1.0043061971664429, "learning_rate": 3.58616478863336e-05, "loss": 0.1957, "step": 23240 }, { "epoch": 2.1066702918252673, "grad_norm": 0.7782866358757019, "learning_rate": 3.5854523917191084e-05, "loss": 0.1526, "step": 23245 }, { "epoch": 2.1071234366503533, "grad_norm": 0.8742188811302185, "learning_rate": 3.5847398861724144e-05, "loss": 0.1713, "step": 23250 }, { "epoch": 2.1075765814754397, "grad_norm": 0.8874481916427612, "learning_rate": 3.584027272064586e-05, "loss": 0.1579, "step": 23255 }, { "epoch": 2.1080297263005257, "grad_norm": 0.9280371069908142, "learning_rate": 3.583314549466942e-05, "loss": 0.183, "step": 23260 }, { "epoch": 2.1084828711256116, "grad_norm": 0.8097473382949829, "learning_rate": 3.5826017184508115e-05, "loss": 0.1913, "step": 23265 }, { "epoch": 2.108936015950698, "grad_norm": 0.7380501627922058, "learning_rate": 3.581888779087534e-05, "loss": 0.162, "step": 23270 }, { "epoch": 2.109389160775784, "grad_norm": 0.8984990119934082, "learning_rate": 3.5811757314484635e-05, "loss": 0.1919, "step": 23275 }, { "epoch": 2.10984230560087, "grad_norm": 0.9058222770690918, "learning_rate": 3.5804625756049596e-05, "loss": 0.2189, "step": 23280 }, { "epoch": 2.1102954504259563, "grad_norm": 0.939836859703064, "learning_rate": 3.579749311628396e-05, "loss": 0.1896, "step": 23285 }, { "epoch": 2.1107485952510423, "grad_norm": 0.7552431225776672, "learning_rate": 3.579035939590156e-05, "loss": 0.1686, "step": 23290 }, { "epoch": 2.1112017400761283, "grad_norm": 0.8427239060401917, "learning_rate": 3.578322459561634e-05, "loss": 0.1664, "step": 23295 }, { "epoch": 2.1116548849012142, "grad_norm": 0.8812171816825867, "learning_rate": 3.577608871614237e-05, "loss": 0.1738, "step": 23300 }, { "epoch": 2.1121080297263006, "grad_norm": 0.8059263825416565, "learning_rate": 3.576895175819379e-05, "loss": 0.1614, "step": 23305 }, { "epoch": 2.1125611745513866, "grad_norm": 0.8728508353233337, "learning_rate": 3.576181372248488e-05, "loss": 0.1644, "step": 23310 }, { "epoch": 2.1130143193764725, "grad_norm": 0.7795554399490356, "learning_rate": 3.5754674609730013e-05, "loss": 0.1385, "step": 23315 }, { "epoch": 2.113467464201559, "grad_norm": 0.8806779980659485, "learning_rate": 3.5747534420643666e-05, "loss": 0.1813, "step": 23320 }, { "epoch": 2.113920609026645, "grad_norm": 1.0254441499710083, "learning_rate": 3.5740393155940455e-05, "loss": 0.1932, "step": 23325 }, { "epoch": 2.114373753851731, "grad_norm": 0.9017237424850464, "learning_rate": 3.5733250816335066e-05, "loss": 0.1833, "step": 23330 }, { "epoch": 2.1148268986768173, "grad_norm": 0.7965636253356934, "learning_rate": 3.5726107402542294e-05, "loss": 0.1577, "step": 23335 }, { "epoch": 2.1152800435019032, "grad_norm": 0.9013510942459106, "learning_rate": 3.5718962915277075e-05, "loss": 0.196, "step": 23340 }, { "epoch": 2.115733188326989, "grad_norm": 0.863764226436615, "learning_rate": 3.5711817355254426e-05, "loss": 0.1903, "step": 23345 }, { "epoch": 2.1161863331520756, "grad_norm": 0.7915270924568176, "learning_rate": 3.570467072318948e-05, "loss": 0.1783, "step": 23350 }, { "epoch": 2.1166394779771616, "grad_norm": 0.8897678256034851, "learning_rate": 3.569752301979746e-05, "loss": 0.1833, "step": 23355 }, { "epoch": 2.1170926228022475, "grad_norm": 0.8391732573509216, "learning_rate": 3.5690374245793723e-05, "loss": 0.197, "step": 23360 }, { "epoch": 2.1175457676273335, "grad_norm": 0.7807589173316956, "learning_rate": 3.568322440189372e-05, "loss": 0.1447, "step": 23365 }, { "epoch": 2.11799891245242, "grad_norm": 0.8930678367614746, "learning_rate": 3.5676073488813e-05, "loss": 0.1723, "step": 23370 }, { "epoch": 2.118452057277506, "grad_norm": 0.8585749268531799, "learning_rate": 3.566892150726725e-05, "loss": 0.1866, "step": 23375 }, { "epoch": 2.118905202102592, "grad_norm": 0.8251767158508301, "learning_rate": 3.566176845797222e-05, "loss": 0.1756, "step": 23380 }, { "epoch": 2.119358346927678, "grad_norm": 0.8097807765007019, "learning_rate": 3.5654614341643794e-05, "loss": 0.1691, "step": 23385 }, { "epoch": 2.119811491752764, "grad_norm": 0.8227307796478271, "learning_rate": 3.5647459158997966e-05, "loss": 0.1647, "step": 23390 }, { "epoch": 2.12026463657785, "grad_norm": 0.9604357481002808, "learning_rate": 3.564030291075083e-05, "loss": 0.1954, "step": 23395 }, { "epoch": 2.1207177814029365, "grad_norm": 0.9850428700447083, "learning_rate": 3.563314559761858e-05, "loss": 0.1895, "step": 23400 }, { "epoch": 2.1211709262280225, "grad_norm": 0.9485155344009399, "learning_rate": 3.5625987220317524e-05, "loss": 0.1898, "step": 23405 }, { "epoch": 2.1216240710531085, "grad_norm": 0.8527873754501343, "learning_rate": 3.561882777956407e-05, "loss": 0.1618, "step": 23410 }, { "epoch": 2.122077215878195, "grad_norm": 0.8850252032279968, "learning_rate": 3.561166727607474e-05, "loss": 0.1574, "step": 23415 }, { "epoch": 2.122530360703281, "grad_norm": 0.8717895150184631, "learning_rate": 3.560450571056617e-05, "loss": 0.1582, "step": 23420 }, { "epoch": 2.122983505528367, "grad_norm": 0.8322432041168213, "learning_rate": 3.559734308375509e-05, "loss": 0.1655, "step": 23425 }, { "epoch": 2.1234366503534527, "grad_norm": 0.9777104258537292, "learning_rate": 3.559017939635832e-05, "loss": 0.1909, "step": 23430 }, { "epoch": 2.123889795178539, "grad_norm": 0.9760131239891052, "learning_rate": 3.558301464909282e-05, "loss": 0.1877, "step": 23435 }, { "epoch": 2.124342940003625, "grad_norm": 0.8476155996322632, "learning_rate": 3.557584884267564e-05, "loss": 0.1756, "step": 23440 }, { "epoch": 2.124796084828711, "grad_norm": 0.899445652961731, "learning_rate": 3.556868197782392e-05, "loss": 0.1645, "step": 23445 }, { "epoch": 2.1252492296537975, "grad_norm": 0.7808212041854858, "learning_rate": 3.556151405525495e-05, "loss": 0.1833, "step": 23450 }, { "epoch": 2.1257023744788834, "grad_norm": 0.7728453874588013, "learning_rate": 3.5554345075686076e-05, "loss": 0.1514, "step": 23455 }, { "epoch": 2.1261555193039694, "grad_norm": 1.031178593635559, "learning_rate": 3.5547175039834776e-05, "loss": 0.1643, "step": 23460 }, { "epoch": 2.126608664129056, "grad_norm": 0.8526515960693359, "learning_rate": 3.5540003948418634e-05, "loss": 0.1912, "step": 23465 }, { "epoch": 2.1270618089541418, "grad_norm": 0.9735995531082153, "learning_rate": 3.553283180215533e-05, "loss": 0.1948, "step": 23470 }, { "epoch": 2.1275149537792277, "grad_norm": 0.890195369720459, "learning_rate": 3.552565860176267e-05, "loss": 0.196, "step": 23475 }, { "epoch": 2.127968098604314, "grad_norm": 0.8380461931228638, "learning_rate": 3.551848434795853e-05, "loss": 0.157, "step": 23480 }, { "epoch": 2.1284212434294, "grad_norm": 0.8874711394309998, "learning_rate": 3.5511309041460926e-05, "loss": 0.1769, "step": 23485 }, { "epoch": 2.128874388254486, "grad_norm": 0.8012987375259399, "learning_rate": 3.550413268298795e-05, "loss": 0.1711, "step": 23490 }, { "epoch": 2.1293275330795725, "grad_norm": 0.8432744145393372, "learning_rate": 3.5496955273257836e-05, "loss": 0.2098, "step": 23495 }, { "epoch": 2.1297806779046584, "grad_norm": 0.8531911969184875, "learning_rate": 3.548977681298889e-05, "loss": 0.1695, "step": 23500 }, { "epoch": 2.1302338227297444, "grad_norm": 0.7736402153968811, "learning_rate": 3.548259730289952e-05, "loss": 0.1331, "step": 23505 }, { "epoch": 2.1306869675548303, "grad_norm": 1.0175354480743408, "learning_rate": 3.547541674370828e-05, "loss": 0.1638, "step": 23510 }, { "epoch": 2.1311401123799167, "grad_norm": 0.8040491938591003, "learning_rate": 3.5468235136133796e-05, "loss": 0.162, "step": 23515 }, { "epoch": 2.1315932572050027, "grad_norm": 0.8902863264083862, "learning_rate": 3.54610524808948e-05, "loss": 0.1745, "step": 23520 }, { "epoch": 2.1320464020300887, "grad_norm": 0.7758011817932129, "learning_rate": 3.545386877871012e-05, "loss": 0.1778, "step": 23525 }, { "epoch": 2.132499546855175, "grad_norm": 0.7377585768699646, "learning_rate": 3.5446684030298724e-05, "loss": 0.1583, "step": 23530 }, { "epoch": 2.132952691680261, "grad_norm": 0.8118022680282593, "learning_rate": 3.543949823637966e-05, "loss": 0.178, "step": 23535 }, { "epoch": 2.133405836505347, "grad_norm": 0.7868669629096985, "learning_rate": 3.5432311397672095e-05, "loss": 0.1652, "step": 23540 }, { "epoch": 2.1338589813304334, "grad_norm": 0.9359257221221924, "learning_rate": 3.542512351489527e-05, "loss": 0.1769, "step": 23545 }, { "epoch": 2.1343121261555194, "grad_norm": 0.9833459258079529, "learning_rate": 3.541793458876856e-05, "loss": 0.1811, "step": 23550 }, { "epoch": 2.1347652709806053, "grad_norm": 0.8071849942207336, "learning_rate": 3.5410744620011424e-05, "loss": 0.184, "step": 23555 }, { "epoch": 2.1352184158056913, "grad_norm": 0.9581958055496216, "learning_rate": 3.540355360934347e-05, "loss": 0.1709, "step": 23560 }, { "epoch": 2.1356715606307777, "grad_norm": 0.879606306552887, "learning_rate": 3.5396361557484335e-05, "loss": 0.175, "step": 23565 }, { "epoch": 2.1361247054558636, "grad_norm": 0.8730477690696716, "learning_rate": 3.538916846515383e-05, "loss": 0.1647, "step": 23570 }, { "epoch": 2.1365778502809496, "grad_norm": 0.9689947366714478, "learning_rate": 3.5381974333071823e-05, "loss": 0.1707, "step": 23575 }, { "epoch": 2.137030995106036, "grad_norm": 0.8284784555435181, "learning_rate": 3.537477916195832e-05, "loss": 0.1907, "step": 23580 }, { "epoch": 2.137484139931122, "grad_norm": 0.7677625417709351, "learning_rate": 3.5367582952533416e-05, "loss": 0.1487, "step": 23585 }, { "epoch": 2.137937284756208, "grad_norm": 0.9151692390441895, "learning_rate": 3.536038570551731e-05, "loss": 0.1759, "step": 23590 }, { "epoch": 2.1383904295812943, "grad_norm": 0.8831391334533691, "learning_rate": 3.5353187421630284e-05, "loss": 0.1823, "step": 23595 }, { "epoch": 2.1388435744063803, "grad_norm": 0.8651043176651001, "learning_rate": 3.534598810159277e-05, "loss": 0.1574, "step": 23600 }, { "epoch": 2.1392967192314662, "grad_norm": 0.7722187042236328, "learning_rate": 3.5338787746125274e-05, "loss": 0.1644, "step": 23605 }, { "epoch": 2.1397498640565527, "grad_norm": 0.83514803647995, "learning_rate": 3.53315863559484e-05, "loss": 0.2048, "step": 23610 }, { "epoch": 2.1402030088816386, "grad_norm": 0.9983030557632446, "learning_rate": 3.5324383931782866e-05, "loss": 0.1633, "step": 23615 }, { "epoch": 2.1406561537067246, "grad_norm": 0.8326677680015564, "learning_rate": 3.53171804743495e-05, "loss": 0.1774, "step": 23620 }, { "epoch": 2.141109298531811, "grad_norm": 0.8797131776809692, "learning_rate": 3.5309975984369234e-05, "loss": 0.2324, "step": 23625 }, { "epoch": 2.141562443356897, "grad_norm": 0.9459890723228455, "learning_rate": 3.530277046256308e-05, "loss": 0.1616, "step": 23630 }, { "epoch": 2.142015588181983, "grad_norm": 0.8594310283660889, "learning_rate": 3.5295563909652175e-05, "loss": 0.1725, "step": 23635 }, { "epoch": 2.142468733007069, "grad_norm": 0.8012278079986572, "learning_rate": 3.5288356326357755e-05, "loss": 0.1611, "step": 23640 }, { "epoch": 2.1429218778321553, "grad_norm": 0.96346116065979, "learning_rate": 3.5281147713401145e-05, "loss": 0.1948, "step": 23645 }, { "epoch": 2.1433750226572412, "grad_norm": 0.9295298457145691, "learning_rate": 3.527393807150381e-05, "loss": 0.1623, "step": 23650 }, { "epoch": 2.143828167482327, "grad_norm": 0.8010542392730713, "learning_rate": 3.5266727401387277e-05, "loss": 0.2094, "step": 23655 }, { "epoch": 2.1442813123074136, "grad_norm": 0.8931977152824402, "learning_rate": 3.5259515703773185e-05, "loss": 0.1666, "step": 23660 }, { "epoch": 2.1447344571324995, "grad_norm": 0.9364404082298279, "learning_rate": 3.52523029793833e-05, "loss": 0.167, "step": 23665 }, { "epoch": 2.1451876019575855, "grad_norm": 1.0091248750686646, "learning_rate": 3.524508922893947e-05, "loss": 0.1667, "step": 23670 }, { "epoch": 2.145640746782672, "grad_norm": 0.8298370838165283, "learning_rate": 3.523787445316365e-05, "loss": 0.1649, "step": 23675 }, { "epoch": 2.146093891607758, "grad_norm": 0.8590496778488159, "learning_rate": 3.5230658652777894e-05, "loss": 0.212, "step": 23680 }, { "epoch": 2.146547036432844, "grad_norm": 0.7986263036727905, "learning_rate": 3.522344182850436e-05, "loss": 0.1795, "step": 23685 }, { "epoch": 2.1470001812579302, "grad_norm": 0.9594981670379639, "learning_rate": 3.52162239810653e-05, "loss": 0.1929, "step": 23690 }, { "epoch": 2.147453326083016, "grad_norm": 0.8844142556190491, "learning_rate": 3.52090051111831e-05, "loss": 0.1902, "step": 23695 }, { "epoch": 2.147906470908102, "grad_norm": 0.8392266631126404, "learning_rate": 3.520178521958021e-05, "loss": 0.1669, "step": 23700 }, { "epoch": 2.148359615733188, "grad_norm": 0.805792510509491, "learning_rate": 3.519456430697922e-05, "loss": 0.1598, "step": 23705 }, { "epoch": 2.1488127605582745, "grad_norm": 0.8744885325431824, "learning_rate": 3.5187342374102776e-05, "loss": 0.1982, "step": 23710 }, { "epoch": 2.1492659053833605, "grad_norm": 0.9617924690246582, "learning_rate": 3.5180119421673666e-05, "loss": 0.1688, "step": 23715 }, { "epoch": 2.1497190502084464, "grad_norm": 0.9125294089317322, "learning_rate": 3.517289545041476e-05, "loss": 0.1815, "step": 23720 }, { "epoch": 2.150172195033533, "grad_norm": 0.9176315665245056, "learning_rate": 3.516567046104905e-05, "loss": 0.1735, "step": 23725 }, { "epoch": 2.150625339858619, "grad_norm": 0.9010619521141052, "learning_rate": 3.5158444454299595e-05, "loss": 0.1635, "step": 23730 }, { "epoch": 2.1510784846837048, "grad_norm": 0.8534459471702576, "learning_rate": 3.515121743088958e-05, "loss": 0.1787, "step": 23735 }, { "epoch": 2.151531629508791, "grad_norm": 0.7495490312576294, "learning_rate": 3.5143989391542306e-05, "loss": 0.1385, "step": 23740 }, { "epoch": 2.151984774333877, "grad_norm": 0.9055975079536438, "learning_rate": 3.513676033698113e-05, "loss": 0.1805, "step": 23745 }, { "epoch": 2.152437919158963, "grad_norm": 0.8867578506469727, "learning_rate": 3.5129530267929564e-05, "loss": 0.164, "step": 23750 }, { "epoch": 2.1528910639840495, "grad_norm": 0.8132657408714294, "learning_rate": 3.512229918511119e-05, "loss": 0.1617, "step": 23755 }, { "epoch": 2.1533442088091355, "grad_norm": 0.9191045761108398, "learning_rate": 3.5115067089249675e-05, "loss": 0.2089, "step": 23760 }, { "epoch": 2.1537973536342214, "grad_norm": 0.836870551109314, "learning_rate": 3.5107833981068837e-05, "loss": 0.1944, "step": 23765 }, { "epoch": 2.154250498459308, "grad_norm": 0.9400525093078613, "learning_rate": 3.510059986129255e-05, "loss": 0.2003, "step": 23770 }, { "epoch": 2.154703643284394, "grad_norm": 0.9094268679618835, "learning_rate": 3.509336473064482e-05, "loss": 0.1804, "step": 23775 }, { "epoch": 2.1551567881094797, "grad_norm": 0.8429906964302063, "learning_rate": 3.5086128589849735e-05, "loss": 0.1766, "step": 23780 }, { "epoch": 2.1556099329345657, "grad_norm": 0.8488281965255737, "learning_rate": 3.507889143963149e-05, "loss": 0.1767, "step": 23785 }, { "epoch": 2.156063077759652, "grad_norm": 0.8798884749412537, "learning_rate": 3.5071653280714386e-05, "loss": 0.1533, "step": 23790 }, { "epoch": 2.156516222584738, "grad_norm": 1.0401229858398438, "learning_rate": 3.506441411382281e-05, "loss": 0.1583, "step": 23795 }, { "epoch": 2.156969367409824, "grad_norm": 0.8308138251304626, "learning_rate": 3.505717393968127e-05, "loss": 0.1951, "step": 23800 }, { "epoch": 2.1574225122349104, "grad_norm": 0.9827160239219666, "learning_rate": 3.504993275901436e-05, "loss": 0.1575, "step": 23805 }, { "epoch": 2.1578756570599964, "grad_norm": 0.9201845526695251, "learning_rate": 3.5042690572546786e-05, "loss": 0.1727, "step": 23810 }, { "epoch": 2.1583288018850824, "grad_norm": 0.8709570169448853, "learning_rate": 3.503544738100335e-05, "loss": 0.1803, "step": 23815 }, { "epoch": 2.1587819467101688, "grad_norm": 0.940839946269989, "learning_rate": 3.502820318510894e-05, "loss": 0.1568, "step": 23820 }, { "epoch": 2.1592350915352547, "grad_norm": 0.8509741425514221, "learning_rate": 3.502095798558856e-05, "loss": 0.1743, "step": 23825 }, { "epoch": 2.1596882363603407, "grad_norm": 0.8217824101448059, "learning_rate": 3.5013711783167324e-05, "loss": 0.1696, "step": 23830 }, { "epoch": 2.1601413811854266, "grad_norm": 0.8391299247741699, "learning_rate": 3.500646457857043e-05, "loss": 0.1731, "step": 23835 }, { "epoch": 2.160594526010513, "grad_norm": 0.7985739707946777, "learning_rate": 3.499921637252318e-05, "loss": 0.1781, "step": 23840 }, { "epoch": 2.161047670835599, "grad_norm": 0.7292243242263794, "learning_rate": 3.4991967165750974e-05, "loss": 0.193, "step": 23845 }, { "epoch": 2.161500815660685, "grad_norm": 0.8518734574317932, "learning_rate": 3.4984716958979316e-05, "loss": 0.1607, "step": 23850 }, { "epoch": 2.1619539604857714, "grad_norm": 0.7839882969856262, "learning_rate": 3.49774657529338e-05, "loss": 0.1633, "step": 23855 }, { "epoch": 2.1624071053108573, "grad_norm": 0.764332115650177, "learning_rate": 3.497021354834015e-05, "loss": 0.1679, "step": 23860 }, { "epoch": 2.1628602501359433, "grad_norm": 0.7300511598587036, "learning_rate": 3.496296034592415e-05, "loss": 0.1557, "step": 23865 }, { "epoch": 2.1633133949610297, "grad_norm": 0.9077322483062744, "learning_rate": 3.4955706146411716e-05, "loss": 0.1892, "step": 23870 }, { "epoch": 2.1637665397861157, "grad_norm": 0.779822587966919, "learning_rate": 3.4948450950528836e-05, "loss": 0.1455, "step": 23875 }, { "epoch": 2.1642196846112016, "grad_norm": 0.863089382648468, "learning_rate": 3.494119475900163e-05, "loss": 0.2109, "step": 23880 }, { "epoch": 2.164672829436288, "grad_norm": 0.8432854413986206, "learning_rate": 3.4933937572556294e-05, "loss": 0.1543, "step": 23885 }, { "epoch": 2.165125974261374, "grad_norm": 0.9346694946289062, "learning_rate": 3.4926679391919124e-05, "loss": 0.1859, "step": 23890 }, { "epoch": 2.16557911908646, "grad_norm": 0.8846502304077148, "learning_rate": 3.4919420217816526e-05, "loss": 0.1977, "step": 23895 }, { "epoch": 2.1660322639115464, "grad_norm": 0.840729832649231, "learning_rate": 3.4912160050974997e-05, "loss": 0.1655, "step": 23900 }, { "epoch": 2.1664854087366323, "grad_norm": 0.8194918632507324, "learning_rate": 3.490489889212114e-05, "loss": 0.1954, "step": 23905 }, { "epoch": 2.1669385535617183, "grad_norm": 0.8634887933731079, "learning_rate": 3.489763674198165e-05, "loss": 0.1955, "step": 23910 }, { "epoch": 2.1673916983868042, "grad_norm": 0.8654711842536926, "learning_rate": 3.489037360128334e-05, "loss": 0.1736, "step": 23915 }, { "epoch": 2.1678448432118906, "grad_norm": 0.9794725179672241, "learning_rate": 3.4883109470753085e-05, "loss": 0.1688, "step": 23920 }, { "epoch": 2.1682979880369766, "grad_norm": 0.7694689035415649, "learning_rate": 3.48758443511179e-05, "loss": 0.1703, "step": 23925 }, { "epoch": 2.1687511328620626, "grad_norm": 0.7902131080627441, "learning_rate": 3.486857824310487e-05, "loss": 0.14, "step": 23930 }, { "epoch": 2.169204277687149, "grad_norm": 0.8321094512939453, "learning_rate": 3.48613111474412e-05, "loss": 0.1648, "step": 23935 }, { "epoch": 2.169657422512235, "grad_norm": 0.7939566969871521, "learning_rate": 3.485404306485417e-05, "loss": 0.1608, "step": 23940 }, { "epoch": 2.170110567337321, "grad_norm": 0.8204646706581116, "learning_rate": 3.484677399607117e-05, "loss": 0.1805, "step": 23945 }, { "epoch": 2.1705637121624073, "grad_norm": 0.9389485120773315, "learning_rate": 3.483950394181972e-05, "loss": 0.1906, "step": 23950 }, { "epoch": 2.1710168569874932, "grad_norm": 0.9806286096572876, "learning_rate": 3.4832232902827376e-05, "loss": 0.1737, "step": 23955 }, { "epoch": 2.171470001812579, "grad_norm": 0.799985945224762, "learning_rate": 3.482496087982185e-05, "loss": 0.1887, "step": 23960 }, { "epoch": 2.171923146637665, "grad_norm": 0.9356262683868408, "learning_rate": 3.481768787353091e-05, "loss": 0.1727, "step": 23965 }, { "epoch": 2.1723762914627516, "grad_norm": 0.8544461727142334, "learning_rate": 3.481041388468244e-05, "loss": 0.1749, "step": 23970 }, { "epoch": 2.1728294362878375, "grad_norm": 0.8840419054031372, "learning_rate": 3.4803138914004444e-05, "loss": 0.1742, "step": 23975 }, { "epoch": 2.1732825811129235, "grad_norm": 0.8401985168457031, "learning_rate": 3.479586296222499e-05, "loss": 0.1536, "step": 23980 }, { "epoch": 2.17373572593801, "grad_norm": 0.9657076597213745, "learning_rate": 3.478858603007226e-05, "loss": 0.1758, "step": 23985 }, { "epoch": 2.174188870763096, "grad_norm": 0.8144464492797852, "learning_rate": 3.478130811827453e-05, "loss": 0.1355, "step": 23990 }, { "epoch": 2.174642015588182, "grad_norm": 0.8894739747047424, "learning_rate": 3.4774029227560174e-05, "loss": 0.1693, "step": 23995 }, { "epoch": 2.1750951604132682, "grad_norm": 0.8119502663612366, "learning_rate": 3.476674935865767e-05, "loss": 0.1843, "step": 24000 }, { "epoch": 2.175548305238354, "grad_norm": 0.8630030751228333, "learning_rate": 3.4759468512295587e-05, "loss": 0.1606, "step": 24005 }, { "epoch": 2.17600145006344, "grad_norm": 0.9875895380973816, "learning_rate": 3.4752186689202605e-05, "loss": 0.1811, "step": 24010 }, { "epoch": 2.1764545948885266, "grad_norm": 0.828980028629303, "learning_rate": 3.4744903890107474e-05, "loss": 0.1661, "step": 24015 }, { "epoch": 2.1769077397136125, "grad_norm": 0.8652154803276062, "learning_rate": 3.473762011573907e-05, "loss": 0.2072, "step": 24020 }, { "epoch": 2.1773608845386985, "grad_norm": 1.0527147054672241, "learning_rate": 3.473033536682635e-05, "loss": 0.1801, "step": 24025 }, { "epoch": 2.177814029363785, "grad_norm": 0.8713710904121399, "learning_rate": 3.472304964409838e-05, "loss": 0.1607, "step": 24030 }, { "epoch": 2.178267174188871, "grad_norm": 0.9083179235458374, "learning_rate": 3.471576294828432e-05, "loss": 0.1434, "step": 24035 }, { "epoch": 2.178720319013957, "grad_norm": 0.7887172102928162, "learning_rate": 3.470847528011341e-05, "loss": 0.1613, "step": 24040 }, { "epoch": 2.1791734638390428, "grad_norm": 0.667923629283905, "learning_rate": 3.470118664031501e-05, "loss": 0.1673, "step": 24045 }, { "epoch": 2.179626608664129, "grad_norm": 0.974098801612854, "learning_rate": 3.469389702961858e-05, "loss": 0.1729, "step": 24050 }, { "epoch": 2.180079753489215, "grad_norm": 0.8504811525344849, "learning_rate": 3.468660644875366e-05, "loss": 0.1635, "step": 24055 }, { "epoch": 2.180532898314301, "grad_norm": 0.8455040454864502, "learning_rate": 3.467931489844989e-05, "loss": 0.1439, "step": 24060 }, { "epoch": 2.1809860431393875, "grad_norm": 0.7620255351066589, "learning_rate": 3.467202237943701e-05, "loss": 0.1786, "step": 24065 }, { "epoch": 2.1814391879644734, "grad_norm": 0.7021966576576233, "learning_rate": 3.466472889244487e-05, "loss": 0.1484, "step": 24070 }, { "epoch": 2.1818923327895594, "grad_norm": 0.8119468688964844, "learning_rate": 3.465743443820337e-05, "loss": 0.151, "step": 24075 }, { "epoch": 2.182345477614646, "grad_norm": 0.9139662384986877, "learning_rate": 3.465013901744259e-05, "loss": 0.1757, "step": 24080 }, { "epoch": 2.1827986224397318, "grad_norm": 0.7744290232658386, "learning_rate": 3.4642842630892624e-05, "loss": 0.1507, "step": 24085 }, { "epoch": 2.1832517672648177, "grad_norm": 0.9671985507011414, "learning_rate": 3.463554527928372e-05, "loss": 0.1709, "step": 24090 }, { "epoch": 2.183704912089904, "grad_norm": 1.009607195854187, "learning_rate": 3.462824696334618e-05, "loss": 0.1898, "step": 24095 }, { "epoch": 2.18415805691499, "grad_norm": 0.8850642442703247, "learning_rate": 3.4620947683810424e-05, "loss": 0.1633, "step": 24100 }, { "epoch": 2.184611201740076, "grad_norm": 0.9092703461647034, "learning_rate": 3.4613647441406974e-05, "loss": 0.1764, "step": 24105 }, { "epoch": 2.185064346565162, "grad_norm": 0.9517411589622498, "learning_rate": 3.4606346236866435e-05, "loss": 0.182, "step": 24110 }, { "epoch": 2.1855174913902484, "grad_norm": 0.7033929228782654, "learning_rate": 3.4599044070919527e-05, "loss": 0.1594, "step": 24115 }, { "epoch": 2.1859706362153344, "grad_norm": 0.9043415784835815, "learning_rate": 3.459174094429704e-05, "loss": 0.1492, "step": 24120 }, { "epoch": 2.1864237810404203, "grad_norm": 0.8240482807159424, "learning_rate": 3.4584436857729875e-05, "loss": 0.1612, "step": 24125 }, { "epoch": 2.1868769258655067, "grad_norm": 0.9493361115455627, "learning_rate": 3.457713181194903e-05, "loss": 0.1496, "step": 24130 }, { "epoch": 2.1873300706905927, "grad_norm": 0.7881723046302795, "learning_rate": 3.456982580768559e-05, "loss": 0.1808, "step": 24135 }, { "epoch": 2.1877832155156787, "grad_norm": 0.7408466339111328, "learning_rate": 3.456251884567076e-05, "loss": 0.169, "step": 24140 }, { "epoch": 2.188236360340765, "grad_norm": 0.9298791289329529, "learning_rate": 3.455521092663581e-05, "loss": 0.1613, "step": 24145 }, { "epoch": 2.188689505165851, "grad_norm": 0.9202686548233032, "learning_rate": 3.454790205131211e-05, "loss": 0.1753, "step": 24150 }, { "epoch": 2.189142649990937, "grad_norm": 0.9371484518051147, "learning_rate": 3.454059222043115e-05, "loss": 0.1943, "step": 24155 }, { "epoch": 2.1895957948160234, "grad_norm": 0.8277885317802429, "learning_rate": 3.453328143472449e-05, "loss": 0.1627, "step": 24160 }, { "epoch": 2.1900489396411094, "grad_norm": 0.6484752297401428, "learning_rate": 3.452596969492381e-05, "loss": 0.1358, "step": 24165 }, { "epoch": 2.1905020844661953, "grad_norm": 0.825456976890564, "learning_rate": 3.4518657001760854e-05, "loss": 0.1737, "step": 24170 }, { "epoch": 2.1909552292912817, "grad_norm": 0.9882339239120483, "learning_rate": 3.4511343355967494e-05, "loss": 0.168, "step": 24175 }, { "epoch": 2.1914083741163677, "grad_norm": 0.8532353639602661, "learning_rate": 3.4504028758275675e-05, "loss": 0.1795, "step": 24180 }, { "epoch": 2.1918615189414536, "grad_norm": 0.9244552850723267, "learning_rate": 3.4496713209417444e-05, "loss": 0.1941, "step": 24185 }, { "epoch": 2.1923146637665396, "grad_norm": 0.8723586797714233, "learning_rate": 3.448939671012495e-05, "loss": 0.1882, "step": 24190 }, { "epoch": 2.192767808591626, "grad_norm": 0.804898738861084, "learning_rate": 3.448207926113042e-05, "loss": 0.1893, "step": 24195 }, { "epoch": 2.193220953416712, "grad_norm": 0.7745462656021118, "learning_rate": 3.447476086316619e-05, "loss": 0.1786, "step": 24200 }, { "epoch": 2.193674098241798, "grad_norm": 0.8397523760795593, "learning_rate": 3.446744151696469e-05, "loss": 0.1745, "step": 24205 }, { "epoch": 2.1941272430668843, "grad_norm": 0.7951452732086182, "learning_rate": 3.446012122325845e-05, "loss": 0.187, "step": 24210 }, { "epoch": 2.1945803878919703, "grad_norm": 0.7755935788154602, "learning_rate": 3.445279998278007e-05, "loss": 0.176, "step": 24215 }, { "epoch": 2.1950335327170563, "grad_norm": 0.9413190484046936, "learning_rate": 3.4445477796262285e-05, "loss": 0.1884, "step": 24220 }, { "epoch": 2.1954866775421427, "grad_norm": 0.8326770067214966, "learning_rate": 3.443815466443787e-05, "loss": 0.1536, "step": 24225 }, { "epoch": 2.1959398223672286, "grad_norm": 0.7908810973167419, "learning_rate": 3.443083058803977e-05, "loss": 0.1651, "step": 24230 }, { "epoch": 2.1963929671923146, "grad_norm": 0.8643514513969421, "learning_rate": 3.442350556780094e-05, "loss": 0.1874, "step": 24235 }, { "epoch": 2.1968461120174005, "grad_norm": 0.9180721640586853, "learning_rate": 3.441617960445448e-05, "loss": 0.1448, "step": 24240 }, { "epoch": 2.197299256842487, "grad_norm": 0.8893779516220093, "learning_rate": 3.4408852698733596e-05, "loss": 0.2071, "step": 24245 }, { "epoch": 2.197752401667573, "grad_norm": 0.8518044948577881, "learning_rate": 3.440152485137155e-05, "loss": 0.1851, "step": 24250 }, { "epoch": 2.198205546492659, "grad_norm": 0.996691107749939, "learning_rate": 3.4394196063101724e-05, "loss": 0.1609, "step": 24255 }, { "epoch": 2.1986586913177453, "grad_norm": 0.8047477006912231, "learning_rate": 3.4386866334657576e-05, "loss": 0.1399, "step": 24260 }, { "epoch": 2.1991118361428312, "grad_norm": 0.7685557007789612, "learning_rate": 3.437953566677267e-05, "loss": 0.1851, "step": 24265 }, { "epoch": 2.199564980967917, "grad_norm": 0.8844974637031555, "learning_rate": 3.4372204060180654e-05, "loss": 0.195, "step": 24270 }, { "epoch": 2.2000181257930036, "grad_norm": 0.8749129772186279, "learning_rate": 3.43648715156153e-05, "loss": 0.1643, "step": 24275 }, { "epoch": 2.2004712706180896, "grad_norm": 0.8070729970932007, "learning_rate": 3.4357538033810447e-05, "loss": 0.177, "step": 24280 }, { "epoch": 2.2009244154431755, "grad_norm": 0.9195606708526611, "learning_rate": 3.435020361550001e-05, "loss": 0.1671, "step": 24285 }, { "epoch": 2.201377560268262, "grad_norm": 0.8443716168403625, "learning_rate": 3.4342868261418036e-05, "loss": 0.161, "step": 24290 }, { "epoch": 2.201830705093348, "grad_norm": 0.8646785020828247, "learning_rate": 3.433553197229865e-05, "loss": 0.1566, "step": 24295 }, { "epoch": 2.202283849918434, "grad_norm": 0.8057845234870911, "learning_rate": 3.432819474887607e-05, "loss": 0.1729, "step": 24300 }, { "epoch": 2.2027369947435202, "grad_norm": 0.7756083607673645, "learning_rate": 3.432085659188461e-05, "loss": 0.1672, "step": 24305 }, { "epoch": 2.203190139568606, "grad_norm": 0.9727265238761902, "learning_rate": 3.431351750205867e-05, "loss": 0.1843, "step": 24310 }, { "epoch": 2.203643284393692, "grad_norm": 0.8728733062744141, "learning_rate": 3.430617748013274e-05, "loss": 0.1708, "step": 24315 }, { "epoch": 2.204096429218778, "grad_norm": 0.9407969117164612, "learning_rate": 3.4298836526841425e-05, "loss": 0.1787, "step": 24320 }, { "epoch": 2.2045495740438645, "grad_norm": 0.8376643657684326, "learning_rate": 3.4291494642919405e-05, "loss": 0.1844, "step": 24325 }, { "epoch": 2.2050027188689505, "grad_norm": 0.8704820275306702, "learning_rate": 3.4284151829101475e-05, "loss": 0.1671, "step": 24330 }, { "epoch": 2.2054558636940365, "grad_norm": 1.0200046300888062, "learning_rate": 3.427680808612248e-05, "loss": 0.1916, "step": 24335 }, { "epoch": 2.205909008519123, "grad_norm": 0.7988191843032837, "learning_rate": 3.4269463414717394e-05, "loss": 0.1521, "step": 24340 }, { "epoch": 2.206362153344209, "grad_norm": 0.7718919515609741, "learning_rate": 3.4262117815621284e-05, "loss": 0.161, "step": 24345 }, { "epoch": 2.206815298169295, "grad_norm": 0.7871127724647522, "learning_rate": 3.425477128956928e-05, "loss": 0.1568, "step": 24350 }, { "epoch": 2.207268442994381, "grad_norm": 1.0656794309616089, "learning_rate": 3.424742383729665e-05, "loss": 0.1856, "step": 24355 }, { "epoch": 2.207721587819467, "grad_norm": 0.6977935433387756, "learning_rate": 3.4240075459538714e-05, "loss": 0.1684, "step": 24360 }, { "epoch": 2.208174732644553, "grad_norm": 0.8720184564590454, "learning_rate": 3.4232726157030895e-05, "loss": 0.1641, "step": 24365 }, { "epoch": 2.208627877469639, "grad_norm": 0.9111668467521667, "learning_rate": 3.4225375930508726e-05, "loss": 0.1674, "step": 24370 }, { "epoch": 2.2090810222947255, "grad_norm": 0.8335977792739868, "learning_rate": 3.421802478070781e-05, "loss": 0.1614, "step": 24375 }, { "epoch": 2.2095341671198114, "grad_norm": 0.8339298963546753, "learning_rate": 3.421067270836386e-05, "loss": 0.177, "step": 24380 }, { "epoch": 2.2099873119448974, "grad_norm": 0.8543573021888733, "learning_rate": 3.4203319714212676e-05, "loss": 0.1788, "step": 24385 }, { "epoch": 2.210440456769984, "grad_norm": 0.8053746223449707, "learning_rate": 3.419596579899014e-05, "loss": 0.1856, "step": 24390 }, { "epoch": 2.2108936015950698, "grad_norm": 0.8808937072753906, "learning_rate": 3.418861096343224e-05, "loss": 0.1668, "step": 24395 }, { "epoch": 2.2113467464201557, "grad_norm": 0.856513500213623, "learning_rate": 3.418125520827505e-05, "loss": 0.2137, "step": 24400 }, { "epoch": 2.211799891245242, "grad_norm": 0.9785667061805725, "learning_rate": 3.417389853425473e-05, "loss": 0.1747, "step": 24405 }, { "epoch": 2.212253036070328, "grad_norm": 0.7593975067138672, "learning_rate": 3.4166540942107545e-05, "loss": 0.1578, "step": 24410 }, { "epoch": 2.212706180895414, "grad_norm": 0.849456787109375, "learning_rate": 3.415918243256985e-05, "loss": 0.1556, "step": 24415 }, { "epoch": 2.2131593257205004, "grad_norm": 0.8630437254905701, "learning_rate": 3.4151823006378074e-05, "loss": 0.1827, "step": 24420 }, { "epoch": 2.2136124705455864, "grad_norm": 0.850845217704773, "learning_rate": 3.4144462664268764e-05, "loss": 0.1713, "step": 24425 }, { "epoch": 2.2140656153706724, "grad_norm": 0.9636523127555847, "learning_rate": 3.413710140697853e-05, "loss": 0.2022, "step": 24430 }, { "epoch": 2.2145187601957588, "grad_norm": 0.8320059776306152, "learning_rate": 3.41297392352441e-05, "loss": 0.1736, "step": 24435 }, { "epoch": 2.2149719050208447, "grad_norm": 0.8614416122436523, "learning_rate": 3.412237614980229e-05, "loss": 0.1657, "step": 24440 }, { "epoch": 2.2154250498459307, "grad_norm": 0.928465723991394, "learning_rate": 3.411501215138998e-05, "loss": 0.2233, "step": 24445 }, { "epoch": 2.2158781946710167, "grad_norm": 0.7482896447181702, "learning_rate": 3.4107647240744186e-05, "loss": 0.1672, "step": 24450 }, { "epoch": 2.216331339496103, "grad_norm": 0.8038943409919739, "learning_rate": 3.4100281418601966e-05, "loss": 0.1751, "step": 24455 }, { "epoch": 2.216784484321189, "grad_norm": 0.856506884098053, "learning_rate": 3.4092914685700505e-05, "loss": 0.1694, "step": 24460 }, { "epoch": 2.217237629146275, "grad_norm": 0.9498538374900818, "learning_rate": 3.4085547042777074e-05, "loss": 0.1729, "step": 24465 }, { "epoch": 2.2176907739713614, "grad_norm": 0.9131307005882263, "learning_rate": 3.4078178490569026e-05, "loss": 0.1515, "step": 24470 }, { "epoch": 2.2181439187964473, "grad_norm": 0.8737550973892212, "learning_rate": 3.40708090298138e-05, "loss": 0.1483, "step": 24475 }, { "epoch": 2.2185970636215333, "grad_norm": 0.7455196380615234, "learning_rate": 3.4063438661248936e-05, "loss": 0.1523, "step": 24480 }, { "epoch": 2.2190502084466197, "grad_norm": 0.8800937533378601, "learning_rate": 3.405606738561207e-05, "loss": 0.1516, "step": 24485 }, { "epoch": 2.2195033532717057, "grad_norm": 0.9568480253219604, "learning_rate": 3.404869520364092e-05, "loss": 0.1716, "step": 24490 }, { "epoch": 2.2199564980967916, "grad_norm": 0.8137528300285339, "learning_rate": 3.4041322116073305e-05, "loss": 0.1726, "step": 24495 }, { "epoch": 2.220409642921878, "grad_norm": 0.9297825694084167, "learning_rate": 3.40339481236471e-05, "loss": 0.1799, "step": 24500 }, { "epoch": 2.220862787746964, "grad_norm": 1.0067310333251953, "learning_rate": 3.4026573227100324e-05, "loss": 0.1656, "step": 24505 }, { "epoch": 2.22131593257205, "grad_norm": 0.8946861624717712, "learning_rate": 3.401919742717105e-05, "loss": 0.1675, "step": 24510 }, { "epoch": 2.221769077397136, "grad_norm": 0.8517629504203796, "learning_rate": 3.401182072459743e-05, "loss": 0.1822, "step": 24515 }, { "epoch": 2.2222222222222223, "grad_norm": 1.0575491189956665, "learning_rate": 3.400444312011776e-05, "loss": 0.2027, "step": 24520 }, { "epoch": 2.2226753670473083, "grad_norm": 0.8516336679458618, "learning_rate": 3.399706461447038e-05, "loss": 0.1621, "step": 24525 }, { "epoch": 2.2231285118723942, "grad_norm": 0.8385445475578308, "learning_rate": 3.398968520839373e-05, "loss": 0.1785, "step": 24530 }, { "epoch": 2.2235816566974806, "grad_norm": 0.8330908417701721, "learning_rate": 3.398230490262634e-05, "loss": 0.1609, "step": 24535 }, { "epoch": 2.2240348015225666, "grad_norm": 0.9158313870429993, "learning_rate": 3.397492369790685e-05, "loss": 0.1668, "step": 24540 }, { "epoch": 2.2244879463476526, "grad_norm": 0.7499475479125977, "learning_rate": 3.396754159497395e-05, "loss": 0.1586, "step": 24545 }, { "epoch": 2.224941091172739, "grad_norm": 0.8120031356811523, "learning_rate": 3.396015859456646e-05, "loss": 0.1471, "step": 24550 }, { "epoch": 2.225394235997825, "grad_norm": 0.8622608780860901, "learning_rate": 3.395277469742327e-05, "loss": 0.1654, "step": 24555 }, { "epoch": 2.225847380822911, "grad_norm": 0.9358069896697998, "learning_rate": 3.394538990428337e-05, "loss": 0.187, "step": 24560 }, { "epoch": 2.2263005256479973, "grad_norm": 0.7966043949127197, "learning_rate": 3.3938004215885805e-05, "loss": 0.1591, "step": 24565 }, { "epoch": 2.2267536704730833, "grad_norm": 0.8495304584503174, "learning_rate": 3.393061763296976e-05, "loss": 0.1724, "step": 24570 }, { "epoch": 2.227206815298169, "grad_norm": 0.8100224733352661, "learning_rate": 3.39232301562745e-05, "loss": 0.1636, "step": 24575 }, { "epoch": 2.2276599601232556, "grad_norm": 0.8048340082168579, "learning_rate": 3.3915841786539335e-05, "loss": 0.1475, "step": 24580 }, { "epoch": 2.2281131049483416, "grad_norm": 0.8481403589248657, "learning_rate": 3.390845252450372e-05, "loss": 0.1635, "step": 24585 }, { "epoch": 2.2285662497734275, "grad_norm": 0.8053424954414368, "learning_rate": 3.3901062370907165e-05, "loss": 0.2028, "step": 24590 }, { "epoch": 2.2290193945985135, "grad_norm": 0.8730196952819824, "learning_rate": 3.389367132648926e-05, "loss": 0.1752, "step": 24595 }, { "epoch": 2.2294725394236, "grad_norm": 0.8717469573020935, "learning_rate": 3.388627939198974e-05, "loss": 0.1701, "step": 24600 }, { "epoch": 2.229925684248686, "grad_norm": 0.9607825875282288, "learning_rate": 3.387888656814838e-05, "loss": 0.1844, "step": 24605 }, { "epoch": 2.230378829073772, "grad_norm": 0.8095515966415405, "learning_rate": 3.387149285570504e-05, "loss": 0.1764, "step": 24610 }, { "epoch": 2.2308319738988582, "grad_norm": 0.9557685256004333, "learning_rate": 3.386409825539969e-05, "loss": 0.163, "step": 24615 }, { "epoch": 2.231285118723944, "grad_norm": 0.9073939323425293, "learning_rate": 3.3856702767972406e-05, "loss": 0.1754, "step": 24620 }, { "epoch": 2.23173826354903, "grad_norm": 0.9461207985877991, "learning_rate": 3.38493063941633e-05, "loss": 0.2101, "step": 24625 }, { "epoch": 2.2321914083741166, "grad_norm": 0.7828025221824646, "learning_rate": 3.3841909134712625e-05, "loss": 0.1647, "step": 24630 }, { "epoch": 2.2326445531992025, "grad_norm": 0.7314454913139343, "learning_rate": 3.38345109903607e-05, "loss": 0.1645, "step": 24635 }, { "epoch": 2.2330976980242885, "grad_norm": 0.8882225155830383, "learning_rate": 3.382711196184792e-05, "loss": 0.1712, "step": 24640 }, { "epoch": 2.2335508428493744, "grad_norm": 0.8850889801979065, "learning_rate": 3.3819712049914795e-05, "loss": 0.1533, "step": 24645 }, { "epoch": 2.234003987674461, "grad_norm": 0.8763338327407837, "learning_rate": 3.38123112553019e-05, "loss": 0.1638, "step": 24650 }, { "epoch": 2.234457132499547, "grad_norm": 0.7651267647743225, "learning_rate": 3.380490957874992e-05, "loss": 0.1562, "step": 24655 }, { "epoch": 2.2349102773246328, "grad_norm": 0.8198035955429077, "learning_rate": 3.3797507020999616e-05, "loss": 0.174, "step": 24660 }, { "epoch": 2.235363422149719, "grad_norm": 0.7773071527481079, "learning_rate": 3.379010358279183e-05, "loss": 0.174, "step": 24665 }, { "epoch": 2.235816566974805, "grad_norm": 0.8404704928398132, "learning_rate": 3.378269926486751e-05, "loss": 0.1731, "step": 24670 }, { "epoch": 2.236269711799891, "grad_norm": 0.7794957160949707, "learning_rate": 3.377529406796768e-05, "loss": 0.1697, "step": 24675 }, { "epoch": 2.2367228566249775, "grad_norm": 1.0068014860153198, "learning_rate": 3.376788799283345e-05, "loss": 0.2103, "step": 24680 }, { "epoch": 2.2371760014500635, "grad_norm": 0.853679895401001, "learning_rate": 3.376048104020603e-05, "loss": 0.1762, "step": 24685 }, { "epoch": 2.2376291462751494, "grad_norm": 0.878826379776001, "learning_rate": 3.3753073210826704e-05, "loss": 0.154, "step": 24690 }, { "epoch": 2.238082291100236, "grad_norm": 0.791313111782074, "learning_rate": 3.3745664505436844e-05, "loss": 0.1555, "step": 24695 }, { "epoch": 2.238535435925322, "grad_norm": 0.9003859162330627, "learning_rate": 3.373825492477794e-05, "loss": 0.1651, "step": 24700 }, { "epoch": 2.2389885807504077, "grad_norm": 0.9023056626319885, "learning_rate": 3.3730844469591515e-05, "loss": 0.1632, "step": 24705 }, { "epoch": 2.239441725575494, "grad_norm": 0.8221909999847412, "learning_rate": 3.3723433140619226e-05, "loss": 0.1518, "step": 24710 }, { "epoch": 2.23989487040058, "grad_norm": 0.9735109806060791, "learning_rate": 3.371602093860281e-05, "loss": 0.1664, "step": 24715 }, { "epoch": 2.240348015225666, "grad_norm": 0.9066499471664429, "learning_rate": 3.370860786428406e-05, "loss": 0.1498, "step": 24720 }, { "epoch": 2.240801160050752, "grad_norm": 0.8545717000961304, "learning_rate": 3.3701193918404895e-05, "loss": 0.1625, "step": 24725 }, { "epoch": 2.2412543048758384, "grad_norm": 0.8443592190742493, "learning_rate": 3.36937791017073e-05, "loss": 0.1499, "step": 24730 }, { "epoch": 2.2417074497009244, "grad_norm": 0.8670035600662231, "learning_rate": 3.3686363414933355e-05, "loss": 0.1873, "step": 24735 }, { "epoch": 2.2421605945260104, "grad_norm": 0.8183228969573975, "learning_rate": 3.3678946858825224e-05, "loss": 0.185, "step": 24740 }, { "epoch": 2.2426137393510968, "grad_norm": 0.8422229290008545, "learning_rate": 3.367152943412515e-05, "loss": 0.1661, "step": 24745 }, { "epoch": 2.2430668841761827, "grad_norm": 0.8794096112251282, "learning_rate": 3.366411114157549e-05, "loss": 0.1603, "step": 24750 }, { "epoch": 2.2435200290012687, "grad_norm": 0.9984766840934753, "learning_rate": 3.3656691981918656e-05, "loss": 0.1704, "step": 24755 }, { "epoch": 2.243973173826355, "grad_norm": 0.8058167695999146, "learning_rate": 3.3649271955897156e-05, "loss": 0.1717, "step": 24760 }, { "epoch": 2.244426318651441, "grad_norm": 0.9650743007659912, "learning_rate": 3.3641851064253595e-05, "loss": 0.1751, "step": 24765 }, { "epoch": 2.244879463476527, "grad_norm": 0.783987283706665, "learning_rate": 3.3634429307730665e-05, "loss": 0.1494, "step": 24770 }, { "epoch": 2.245332608301613, "grad_norm": 0.8611016273498535, "learning_rate": 3.3627006687071126e-05, "loss": 0.1772, "step": 24775 }, { "epoch": 2.2457857531266994, "grad_norm": 0.7403048872947693, "learning_rate": 3.361958320301784e-05, "loss": 0.1713, "step": 24780 }, { "epoch": 2.2462388979517853, "grad_norm": 0.8305344581604004, "learning_rate": 3.361215885631376e-05, "loss": 0.1756, "step": 24785 }, { "epoch": 2.2466920427768713, "grad_norm": 0.8516370058059692, "learning_rate": 3.36047336477019e-05, "loss": 0.1448, "step": 24790 }, { "epoch": 2.2471451876019577, "grad_norm": 0.9563270211219788, "learning_rate": 3.35973075779254e-05, "loss": 0.1707, "step": 24795 }, { "epoch": 2.2475983324270437, "grad_norm": 0.9579285383224487, "learning_rate": 3.3589880647727444e-05, "loss": 0.1847, "step": 24800 }, { "epoch": 2.2480514772521296, "grad_norm": 0.8070614337921143, "learning_rate": 3.358245285785133e-05, "loss": 0.1742, "step": 24805 }, { "epoch": 2.248504622077216, "grad_norm": 0.8672356605529785, "learning_rate": 3.3575024209040435e-05, "loss": 0.1536, "step": 24810 }, { "epoch": 2.248957766902302, "grad_norm": 0.9221159815788269, "learning_rate": 3.356759470203821e-05, "loss": 0.1582, "step": 24815 }, { "epoch": 2.249410911727388, "grad_norm": 1.0723308324813843, "learning_rate": 3.356016433758822e-05, "loss": 0.1766, "step": 24820 }, { "epoch": 2.2498640565524743, "grad_norm": 0.9169015288352966, "learning_rate": 3.355273311643408e-05, "loss": 0.184, "step": 24825 }, { "epoch": 2.2503172013775603, "grad_norm": 0.9007452726364136, "learning_rate": 3.3545301039319535e-05, "loss": 0.1913, "step": 24830 }, { "epoch": 2.2507703462026463, "grad_norm": 0.8377029895782471, "learning_rate": 3.3537868106988354e-05, "loss": 0.1567, "step": 24835 }, { "epoch": 2.2512234910277327, "grad_norm": 0.8245998620986938, "learning_rate": 3.353043432018446e-05, "loss": 0.162, "step": 24840 }, { "epoch": 2.2516766358528186, "grad_norm": 0.8658782839775085, "learning_rate": 3.352299967965181e-05, "loss": 0.1891, "step": 24845 }, { "epoch": 2.2521297806779046, "grad_norm": 0.7586293816566467, "learning_rate": 3.351556418613447e-05, "loss": 0.1519, "step": 24850 }, { "epoch": 2.252582925502991, "grad_norm": 0.690420389175415, "learning_rate": 3.350812784037659e-05, "loss": 0.1613, "step": 24855 }, { "epoch": 2.253036070328077, "grad_norm": 0.8071103692054749, "learning_rate": 3.35006906431224e-05, "loss": 0.1585, "step": 24860 }, { "epoch": 2.253489215153163, "grad_norm": 0.8516764044761658, "learning_rate": 3.349325259511622e-05, "loss": 0.1631, "step": 24865 }, { "epoch": 2.253942359978249, "grad_norm": 0.7638039588928223, "learning_rate": 3.348581369710244e-05, "loss": 0.1594, "step": 24870 }, { "epoch": 2.2543955048033353, "grad_norm": 0.8963443636894226, "learning_rate": 3.347837394982556e-05, "loss": 0.1571, "step": 24875 }, { "epoch": 2.2548486496284212, "grad_norm": 1.0996809005737305, "learning_rate": 3.3470933354030155e-05, "loss": 0.1642, "step": 24880 }, { "epoch": 2.255301794453507, "grad_norm": 0.8425295948982239, "learning_rate": 3.346349191046087e-05, "loss": 0.1669, "step": 24885 }, { "epoch": 2.2557549392785936, "grad_norm": 0.7755472660064697, "learning_rate": 3.3456049619862465e-05, "loss": 0.1491, "step": 24890 }, { "epoch": 2.2562080841036796, "grad_norm": 0.8447437286376953, "learning_rate": 3.344860648297974e-05, "loss": 0.1658, "step": 24895 }, { "epoch": 2.2566612289287655, "grad_norm": 0.8946755528450012, "learning_rate": 3.344116250055762e-05, "loss": 0.1916, "step": 24900 }, { "epoch": 2.2571143737538515, "grad_norm": 0.8053516745567322, "learning_rate": 3.3433717673341114e-05, "loss": 0.1557, "step": 24905 }, { "epoch": 2.257567518578938, "grad_norm": 0.9044468402862549, "learning_rate": 3.342627200207529e-05, "loss": 0.1719, "step": 24910 }, { "epoch": 2.258020663404024, "grad_norm": 0.8593749403953552, "learning_rate": 3.3418825487505314e-05, "loss": 0.1863, "step": 24915 }, { "epoch": 2.25847380822911, "grad_norm": 0.8274468183517456, "learning_rate": 3.341137813037644e-05, "loss": 0.1626, "step": 24920 }, { "epoch": 2.258926953054196, "grad_norm": 0.7912399172782898, "learning_rate": 3.3403929931434e-05, "loss": 0.1702, "step": 24925 }, { "epoch": 2.259380097879282, "grad_norm": 0.9920771718025208, "learning_rate": 3.3396480891423406e-05, "loss": 0.1392, "step": 24930 }, { "epoch": 2.259833242704368, "grad_norm": 0.7309701442718506, "learning_rate": 3.338903101109017e-05, "loss": 0.1505, "step": 24935 }, { "epoch": 2.2602863875294545, "grad_norm": 0.8528419733047485, "learning_rate": 3.338158029117988e-05, "loss": 0.1591, "step": 24940 }, { "epoch": 2.2607395323545405, "grad_norm": 0.9500712156295776, "learning_rate": 3.3374128732438185e-05, "loss": 0.1674, "step": 24945 }, { "epoch": 2.2611926771796265, "grad_norm": 0.8515398502349854, "learning_rate": 3.336667633561086e-05, "loss": 0.1645, "step": 24950 }, { "epoch": 2.261645822004713, "grad_norm": 0.8067042827606201, "learning_rate": 3.335922310144375e-05, "loss": 0.1588, "step": 24955 }, { "epoch": 2.262098966829799, "grad_norm": 0.7851539850234985, "learning_rate": 3.335176903068276e-05, "loss": 0.1975, "step": 24960 }, { "epoch": 2.262552111654885, "grad_norm": 0.8454164266586304, "learning_rate": 3.3344314124073905e-05, "loss": 0.2019, "step": 24965 }, { "epoch": 2.263005256479971, "grad_norm": 0.9176356792449951, "learning_rate": 3.333685838236327e-05, "loss": 0.1849, "step": 24970 }, { "epoch": 2.263458401305057, "grad_norm": 0.8828552961349487, "learning_rate": 3.332940180629703e-05, "loss": 0.1366, "step": 24975 }, { "epoch": 2.263911546130143, "grad_norm": 0.916559100151062, "learning_rate": 3.332194439662143e-05, "loss": 0.1868, "step": 24980 }, { "epoch": 2.2643646909552295, "grad_norm": 0.9250521063804626, "learning_rate": 3.3314486154082845e-05, "loss": 0.1852, "step": 24985 }, { "epoch": 2.2648178357803155, "grad_norm": 0.8660367131233215, "learning_rate": 3.3307027079427654e-05, "loss": 0.1816, "step": 24990 }, { "epoch": 2.2652709806054014, "grad_norm": 0.8977935314178467, "learning_rate": 3.3299567173402395e-05, "loss": 0.1748, "step": 24995 }, { "epoch": 2.2657241254304874, "grad_norm": 0.8793613314628601, "learning_rate": 3.329210643675364e-05, "loss": 0.1835, "step": 25000 }, { "epoch": 2.266177270255574, "grad_norm": 0.9749943017959595, "learning_rate": 3.3284644870228074e-05, "loss": 0.162, "step": 25005 }, { "epoch": 2.2666304150806598, "grad_norm": 0.7555277347564697, "learning_rate": 3.327718247457245e-05, "loss": 0.1708, "step": 25010 }, { "epoch": 2.2670835599057457, "grad_norm": 0.788521409034729, "learning_rate": 3.326971925053361e-05, "loss": 0.1803, "step": 25015 }, { "epoch": 2.267536704730832, "grad_norm": 0.8184869885444641, "learning_rate": 3.3262255198858466e-05, "loss": 0.1597, "step": 25020 }, { "epoch": 2.267989849555918, "grad_norm": 0.8844038248062134, "learning_rate": 3.3254790320294035e-05, "loss": 0.16, "step": 25025 }, { "epoch": 2.268442994381004, "grad_norm": 0.7829478979110718, "learning_rate": 3.324732461558739e-05, "loss": 0.1804, "step": 25030 }, { "epoch": 2.2688961392060905, "grad_norm": 0.8099678754806519, "learning_rate": 3.323985808548571e-05, "loss": 0.1525, "step": 25035 }, { "epoch": 2.2693492840311764, "grad_norm": 0.8839786648750305, "learning_rate": 3.323239073073625e-05, "loss": 0.1694, "step": 25040 }, { "epoch": 2.2698024288562624, "grad_norm": 0.8597089648246765, "learning_rate": 3.3224922552086345e-05, "loss": 0.1833, "step": 25045 }, { "epoch": 2.2702555736813483, "grad_norm": 0.7611936926841736, "learning_rate": 3.321745355028341e-05, "loss": 0.18, "step": 25050 }, { "epoch": 2.2707087185064347, "grad_norm": 0.8105006814002991, "learning_rate": 3.3209983726074944e-05, "loss": 0.1752, "step": 25055 }, { "epoch": 2.2711618633315207, "grad_norm": 0.9481298327445984, "learning_rate": 3.320251308020853e-05, "loss": 0.1491, "step": 25060 }, { "epoch": 2.2716150081566067, "grad_norm": 0.8542715311050415, "learning_rate": 3.319504161343183e-05, "loss": 0.1716, "step": 25065 }, { "epoch": 2.272068152981693, "grad_norm": 0.8730912804603577, "learning_rate": 3.31875693264926e-05, "loss": 0.1511, "step": 25070 }, { "epoch": 2.272521297806779, "grad_norm": 0.9157565236091614, "learning_rate": 3.3180096220138666e-05, "loss": 0.1647, "step": 25075 }, { "epoch": 2.272974442631865, "grad_norm": 0.9037336111068726, "learning_rate": 3.3172622295117936e-05, "loss": 0.1712, "step": 25080 }, { "epoch": 2.2734275874569514, "grad_norm": 0.7928333878517151, "learning_rate": 3.316514755217841e-05, "loss": 0.1683, "step": 25085 }, { "epoch": 2.2738807322820374, "grad_norm": 0.861378014087677, "learning_rate": 3.315767199206814e-05, "loss": 0.1572, "step": 25090 }, { "epoch": 2.2743338771071233, "grad_norm": 0.9267046451568604, "learning_rate": 3.315019561553531e-05, "loss": 0.1675, "step": 25095 }, { "epoch": 2.2747870219322097, "grad_norm": 0.9024016261100769, "learning_rate": 3.3142718423328154e-05, "loss": 0.2002, "step": 25100 }, { "epoch": 2.2752401667572957, "grad_norm": 0.8242454528808594, "learning_rate": 3.3135240416194975e-05, "loss": 0.1469, "step": 25105 }, { "epoch": 2.2756933115823816, "grad_norm": 0.9134857654571533, "learning_rate": 3.3127761594884186e-05, "loss": 0.1609, "step": 25110 }, { "epoch": 2.276146456407468, "grad_norm": 0.8690265417098999, "learning_rate": 3.3120281960144274e-05, "loss": 0.1658, "step": 25115 }, { "epoch": 2.276599601232554, "grad_norm": 0.8079309463500977, "learning_rate": 3.31128015127238e-05, "loss": 0.1494, "step": 25120 }, { "epoch": 2.27705274605764, "grad_norm": 0.7775449156761169, "learning_rate": 3.3105320253371405e-05, "loss": 0.1572, "step": 25125 }, { "epoch": 2.277505890882726, "grad_norm": 0.8159329891204834, "learning_rate": 3.309783818283581e-05, "loss": 0.1673, "step": 25130 }, { "epoch": 2.2779590357078123, "grad_norm": 0.9426651000976562, "learning_rate": 3.309035530186585e-05, "loss": 0.1499, "step": 25135 }, { "epoch": 2.2784121805328983, "grad_norm": 0.9418271780014038, "learning_rate": 3.308287161121038e-05, "loss": 0.1784, "step": 25140 }, { "epoch": 2.2788653253579843, "grad_norm": 0.7642375826835632, "learning_rate": 3.307538711161839e-05, "loss": 0.1657, "step": 25145 }, { "epoch": 2.2793184701830707, "grad_norm": 0.7840839624404907, "learning_rate": 3.306790180383893e-05, "loss": 0.1845, "step": 25150 }, { "epoch": 2.2797716150081566, "grad_norm": 0.8098362684249878, "learning_rate": 3.306041568862113e-05, "loss": 0.1675, "step": 25155 }, { "epoch": 2.2802247598332426, "grad_norm": 0.8551677465438843, "learning_rate": 3.305292876671421e-05, "loss": 0.1387, "step": 25160 }, { "epoch": 2.280677904658329, "grad_norm": 0.8327226042747498, "learning_rate": 3.3045441038867454e-05, "loss": 0.1679, "step": 25165 }, { "epoch": 2.281131049483415, "grad_norm": 0.7991703748703003, "learning_rate": 3.3037952505830234e-05, "loss": 0.1907, "step": 25170 }, { "epoch": 2.281584194308501, "grad_norm": 0.8267358541488647, "learning_rate": 3.303046316835201e-05, "loss": 0.1834, "step": 25175 }, { "epoch": 2.282037339133587, "grad_norm": 0.7728298902511597, "learning_rate": 3.302297302718232e-05, "loss": 0.1638, "step": 25180 }, { "epoch": 2.2824904839586733, "grad_norm": 0.8495290875434875, "learning_rate": 3.301548208307078e-05, "loss": 0.1762, "step": 25185 }, { "epoch": 2.2829436287837592, "grad_norm": 0.8361557126045227, "learning_rate": 3.300799033676708e-05, "loss": 0.1504, "step": 25190 }, { "epoch": 2.283396773608845, "grad_norm": 0.8694162368774414, "learning_rate": 3.3000497789021e-05, "loss": 0.1832, "step": 25195 }, { "epoch": 2.2838499184339316, "grad_norm": 0.8506320714950562, "learning_rate": 3.299300444058239e-05, "loss": 0.1508, "step": 25200 }, { "epoch": 2.2843030632590176, "grad_norm": 0.8838393688201904, "learning_rate": 3.2985510292201205e-05, "loss": 0.1692, "step": 25205 }, { "epoch": 2.2847562080841035, "grad_norm": 0.8902614712715149, "learning_rate": 3.2978015344627446e-05, "loss": 0.1746, "step": 25210 }, { "epoch": 2.28520935290919, "grad_norm": 0.9402307868003845, "learning_rate": 3.297051959861122e-05, "loss": 0.1993, "step": 25215 }, { "epoch": 2.285662497734276, "grad_norm": 0.9069491624832153, "learning_rate": 3.296302305490269e-05, "loss": 0.1839, "step": 25220 }, { "epoch": 2.286115642559362, "grad_norm": 0.7630391120910645, "learning_rate": 3.295552571425212e-05, "loss": 0.1774, "step": 25225 }, { "epoch": 2.2865687873844482, "grad_norm": 0.7876491546630859, "learning_rate": 3.294802757740985e-05, "loss": 0.1575, "step": 25230 }, { "epoch": 2.287021932209534, "grad_norm": 1.0723638534545898, "learning_rate": 3.29405286451263e-05, "loss": 0.1947, "step": 25235 }, { "epoch": 2.28747507703462, "grad_norm": 0.9060602188110352, "learning_rate": 3.293302891815195e-05, "loss": 0.1456, "step": 25240 }, { "epoch": 2.2879282218597066, "grad_norm": 0.8592933416366577, "learning_rate": 3.292552839723738e-05, "loss": 0.156, "step": 25245 }, { "epoch": 2.2883813666847925, "grad_norm": 0.7692029476165771, "learning_rate": 3.291802708313326e-05, "loss": 0.1621, "step": 25250 }, { "epoch": 2.2888345115098785, "grad_norm": 0.8217832446098328, "learning_rate": 3.2910524976590304e-05, "loss": 0.1651, "step": 25255 }, { "epoch": 2.289287656334965, "grad_norm": 0.8133158087730408, "learning_rate": 3.290302207835934e-05, "loss": 0.2226, "step": 25260 }, { "epoch": 2.289740801160051, "grad_norm": 0.8082952499389648, "learning_rate": 3.2895518389191256e-05, "loss": 0.167, "step": 25265 }, { "epoch": 2.290193945985137, "grad_norm": 0.9509341716766357, "learning_rate": 3.2888013909837015e-05, "loss": 0.1724, "step": 25270 }, { "epoch": 2.2906470908102228, "grad_norm": 0.941184937953949, "learning_rate": 3.288050864104768e-05, "loss": 0.2415, "step": 25275 }, { "epoch": 2.291100235635309, "grad_norm": 0.8359248042106628, "learning_rate": 3.2873002583574375e-05, "loss": 0.1633, "step": 25280 }, { "epoch": 2.291553380460395, "grad_norm": 0.9587634205818176, "learning_rate": 3.2865495738168314e-05, "loss": 0.1841, "step": 25285 }, { "epoch": 2.292006525285481, "grad_norm": 0.7863991856575012, "learning_rate": 3.285798810558077e-05, "loss": 0.1743, "step": 25290 }, { "epoch": 2.2924596701105675, "grad_norm": 0.8641149997711182, "learning_rate": 3.2850479686563134e-05, "loss": 0.1701, "step": 25295 }, { "epoch": 2.2929128149356535, "grad_norm": 0.8787203431129456, "learning_rate": 3.2842970481866834e-05, "loss": 0.1826, "step": 25300 }, { "epoch": 2.2933659597607394, "grad_norm": 0.8874940276145935, "learning_rate": 3.283546049224339e-05, "loss": 0.168, "step": 25305 }, { "epoch": 2.2938191045858254, "grad_norm": 0.7721136808395386, "learning_rate": 3.282794971844442e-05, "loss": 0.17, "step": 25310 }, { "epoch": 2.294272249410912, "grad_norm": 0.885046660900116, "learning_rate": 3.282043816122159e-05, "loss": 0.1779, "step": 25315 }, { "epoch": 2.2947253942359978, "grad_norm": 1.01674485206604, "learning_rate": 3.2812925821326663e-05, "loss": 0.1904, "step": 25320 }, { "epoch": 2.2951785390610837, "grad_norm": 0.9440849423408508, "learning_rate": 3.280541269951148e-05, "loss": 0.1536, "step": 25325 }, { "epoch": 2.29563168388617, "grad_norm": 0.9846826791763306, "learning_rate": 3.2797898796527965e-05, "loss": 0.2001, "step": 25330 }, { "epoch": 2.296084828711256, "grad_norm": 0.8934835195541382, "learning_rate": 3.2790384113128094e-05, "loss": 0.1567, "step": 25335 }, { "epoch": 2.296537973536342, "grad_norm": 0.8207715749740601, "learning_rate": 3.278286865006394e-05, "loss": 0.1554, "step": 25340 }, { "epoch": 2.2969911183614284, "grad_norm": 1.0070024728775024, "learning_rate": 3.2775352408087674e-05, "loss": 0.157, "step": 25345 }, { "epoch": 2.2974442631865144, "grad_norm": 0.8170310258865356, "learning_rate": 3.276783538795151e-05, "loss": 0.1415, "step": 25350 }, { "epoch": 2.2978974080116004, "grad_norm": 0.8400498628616333, "learning_rate": 3.2760317590407746e-05, "loss": 0.1798, "step": 25355 }, { "epoch": 2.2983505528366868, "grad_norm": 0.899379312992096, "learning_rate": 3.2752799016208775e-05, "loss": 0.1705, "step": 25360 }, { "epoch": 2.2988036976617727, "grad_norm": 1.0003962516784668, "learning_rate": 3.274527966610705e-05, "loss": 0.1547, "step": 25365 }, { "epoch": 2.2992568424868587, "grad_norm": 0.8264821767807007, "learning_rate": 3.273775954085513e-05, "loss": 0.1862, "step": 25370 }, { "epoch": 2.299709987311945, "grad_norm": 0.8181108832359314, "learning_rate": 3.2730238641205626e-05, "loss": 0.1694, "step": 25375 }, { "epoch": 2.300163132137031, "grad_norm": 0.7784764170646667, "learning_rate": 3.272271696791122e-05, "loss": 0.1405, "step": 25380 }, { "epoch": 2.300616276962117, "grad_norm": 0.8586056232452393, "learning_rate": 3.271519452172468e-05, "loss": 0.1347, "step": 25385 }, { "epoch": 2.3010694217872034, "grad_norm": 0.7613102197647095, "learning_rate": 3.2707671303398866e-05, "loss": 0.1771, "step": 25390 }, { "epoch": 2.3015225666122894, "grad_norm": 0.740289568901062, "learning_rate": 3.270014731368671e-05, "loss": 0.1649, "step": 25395 }, { "epoch": 2.3019757114373753, "grad_norm": 0.9951416254043579, "learning_rate": 3.2692622553341206e-05, "loss": 0.1685, "step": 25400 }, { "epoch": 2.3024288562624613, "grad_norm": 0.7906646132469177, "learning_rate": 3.268509702311543e-05, "loss": 0.1462, "step": 25405 }, { "epoch": 2.3028820010875477, "grad_norm": 0.8885692954063416, "learning_rate": 3.267757072376255e-05, "loss": 0.1745, "step": 25410 }, { "epoch": 2.3033351459126337, "grad_norm": 0.7378767132759094, "learning_rate": 3.26700436560358e-05, "loss": 0.1536, "step": 25415 }, { "epoch": 2.3037882907377196, "grad_norm": 0.7551643252372742, "learning_rate": 3.266251582068849e-05, "loss": 0.1515, "step": 25420 }, { "epoch": 2.304241435562806, "grad_norm": 0.7930310368537903, "learning_rate": 3.2654987218474e-05, "loss": 0.1645, "step": 25425 }, { "epoch": 2.304694580387892, "grad_norm": 0.8573249578475952, "learning_rate": 3.2647457850145805e-05, "loss": 0.1556, "step": 25430 }, { "epoch": 2.305147725212978, "grad_norm": 0.8390746116638184, "learning_rate": 3.263992771645744e-05, "loss": 0.1523, "step": 25435 }, { "epoch": 2.3056008700380644, "grad_norm": 0.9577966928482056, "learning_rate": 3.2632396818162526e-05, "loss": 0.1727, "step": 25440 }, { "epoch": 2.3060540148631503, "grad_norm": 1.0157217979431152, "learning_rate": 3.262486515601476e-05, "loss": 0.1785, "step": 25445 }, { "epoch": 2.3065071596882363, "grad_norm": 0.8520486354827881, "learning_rate": 3.2617332730767915e-05, "loss": 0.1598, "step": 25450 }, { "epoch": 2.3069603045133222, "grad_norm": 0.829319417476654, "learning_rate": 3.260979954317582e-05, "loss": 0.1666, "step": 25455 }, { "epoch": 2.3074134493384086, "grad_norm": 0.827138364315033, "learning_rate": 3.2602265593992434e-05, "loss": 0.1771, "step": 25460 }, { "epoch": 2.3078665941634946, "grad_norm": 0.933563768863678, "learning_rate": 3.259473088397173e-05, "loss": 0.1913, "step": 25465 }, { "epoch": 2.3083197389885806, "grad_norm": 0.8769569396972656, "learning_rate": 3.258719541386779e-05, "loss": 0.1758, "step": 25470 }, { "epoch": 2.308772883813667, "grad_norm": 0.897752583026886, "learning_rate": 3.257965918443476e-05, "loss": 0.1868, "step": 25475 }, { "epoch": 2.309226028638753, "grad_norm": 0.9197245240211487, "learning_rate": 3.257212219642689e-05, "loss": 0.1587, "step": 25480 }, { "epoch": 2.309679173463839, "grad_norm": 0.8152848482131958, "learning_rate": 3.256458445059847e-05, "loss": 0.1623, "step": 25485 }, { "epoch": 2.3101323182889253, "grad_norm": 0.7948685884475708, "learning_rate": 3.255704594770388e-05, "loss": 0.1527, "step": 25490 }, { "epoch": 2.3105854631140113, "grad_norm": 0.8240807056427002, "learning_rate": 3.254950668849757e-05, "loss": 0.1888, "step": 25495 }, { "epoch": 2.311038607939097, "grad_norm": 0.9622671604156494, "learning_rate": 3.254196667373408e-05, "loss": 0.1751, "step": 25500 }, { "epoch": 2.3114917527641836, "grad_norm": 0.9417314529418945, "learning_rate": 3.253442590416802e-05, "loss": 0.165, "step": 25505 }, { "epoch": 2.3119448975892696, "grad_norm": 0.9629358053207397, "learning_rate": 3.2526884380554066e-05, "loss": 0.1594, "step": 25510 }, { "epoch": 2.3123980424143555, "grad_norm": 0.9763612151145935, "learning_rate": 3.251934210364699e-05, "loss": 0.1567, "step": 25515 }, { "epoch": 2.312851187239442, "grad_norm": 0.8529669642448425, "learning_rate": 3.2511799074201604e-05, "loss": 0.1477, "step": 25520 }, { "epoch": 2.313304332064528, "grad_norm": 0.8668376207351685, "learning_rate": 3.250425529297282e-05, "loss": 0.2077, "step": 25525 }, { "epoch": 2.313757476889614, "grad_norm": 0.7592209577560425, "learning_rate": 3.249671076071564e-05, "loss": 0.1797, "step": 25530 }, { "epoch": 2.3142106217147003, "grad_norm": 0.7430671453475952, "learning_rate": 3.248916547818511e-05, "loss": 0.1548, "step": 25535 }, { "epoch": 2.3146637665397862, "grad_norm": 0.8916156888008118, "learning_rate": 3.248161944613637e-05, "loss": 0.1628, "step": 25540 }, { "epoch": 2.315116911364872, "grad_norm": 1.0505350828170776, "learning_rate": 3.247407266532462e-05, "loss": 0.1977, "step": 25545 }, { "epoch": 2.315570056189958, "grad_norm": 0.7510360479354858, "learning_rate": 3.246652513650515e-05, "loss": 0.1724, "step": 25550 }, { "epoch": 2.3160232010150446, "grad_norm": 0.8923635482788086, "learning_rate": 3.245897686043332e-05, "loss": 0.1788, "step": 25555 }, { "epoch": 2.3164763458401305, "grad_norm": 0.9986546635627747, "learning_rate": 3.245142783786458e-05, "loss": 0.2117, "step": 25560 }, { "epoch": 2.3169294906652165, "grad_norm": 0.8096336722373962, "learning_rate": 3.244387806955441e-05, "loss": 0.1561, "step": 25565 }, { "epoch": 2.317382635490303, "grad_norm": 0.9586970806121826, "learning_rate": 3.243632755625841e-05, "loss": 0.1777, "step": 25570 }, { "epoch": 2.317835780315389, "grad_norm": 0.9466581344604492, "learning_rate": 3.2428776298732235e-05, "loss": 0.1956, "step": 25575 }, { "epoch": 2.318288925140475, "grad_norm": 0.7451836466789246, "learning_rate": 3.242122429773161e-05, "loss": 0.1478, "step": 25580 }, { "epoch": 2.3187420699655608, "grad_norm": 0.7342284321784973, "learning_rate": 3.241367155401235e-05, "loss": 0.1638, "step": 25585 }, { "epoch": 2.319195214790647, "grad_norm": 0.9614751935005188, "learning_rate": 3.240611806833034e-05, "loss": 0.163, "step": 25590 }, { "epoch": 2.319648359615733, "grad_norm": 0.9253520965576172, "learning_rate": 3.239856384144153e-05, "loss": 0.1926, "step": 25595 }, { "epoch": 2.320101504440819, "grad_norm": 0.8138766884803772, "learning_rate": 3.239100887410195e-05, "loss": 0.1742, "step": 25600 }, { "epoch": 2.3205546492659055, "grad_norm": 0.9552575349807739, "learning_rate": 3.23834531670677e-05, "loss": 0.1767, "step": 25605 }, { "epoch": 2.3210077940909914, "grad_norm": 0.8391215801239014, "learning_rate": 3.237589672109496e-05, "loss": 0.1557, "step": 25610 }, { "epoch": 2.3214609389160774, "grad_norm": 0.90244460105896, "learning_rate": 3.2368339536939986e-05, "loss": 0.1854, "step": 25615 }, { "epoch": 2.321914083741164, "grad_norm": 0.7976839542388916, "learning_rate": 3.23607816153591e-05, "loss": 0.1643, "step": 25620 }, { "epoch": 2.3223672285662498, "grad_norm": 0.8397722244262695, "learning_rate": 3.2353222957108694e-05, "loss": 0.1493, "step": 25625 }, { "epoch": 2.3228203733913357, "grad_norm": 0.8545205593109131, "learning_rate": 3.234566356294526e-05, "loss": 0.1805, "step": 25630 }, { "epoch": 2.323273518216422, "grad_norm": 0.7863306999206543, "learning_rate": 3.2338103433625334e-05, "loss": 0.1837, "step": 25635 }, { "epoch": 2.323726663041508, "grad_norm": 0.8309882283210754, "learning_rate": 3.233054256990552e-05, "loss": 0.1578, "step": 25640 }, { "epoch": 2.324179807866594, "grad_norm": 1.0307468175888062, "learning_rate": 3.232298097254254e-05, "loss": 0.1857, "step": 25645 }, { "epoch": 2.3246329526916805, "grad_norm": 0.9211634397506714, "learning_rate": 3.231541864229315e-05, "loss": 0.1948, "step": 25650 }, { "epoch": 2.3250860975167664, "grad_norm": 0.9988476634025574, "learning_rate": 3.2307855579914195e-05, "loss": 0.2034, "step": 25655 }, { "epoch": 2.3255392423418524, "grad_norm": 0.9060786366462708, "learning_rate": 3.2300291786162566e-05, "loss": 0.171, "step": 25660 }, { "epoch": 2.325992387166939, "grad_norm": 0.9042549729347229, "learning_rate": 3.2292727261795276e-05, "loss": 0.1781, "step": 25665 }, { "epoch": 2.3264455319920248, "grad_norm": 0.8653271794319153, "learning_rate": 3.2285162007569374e-05, "loss": 0.1582, "step": 25670 }, { "epoch": 2.3268986768171107, "grad_norm": 0.8791850805282593, "learning_rate": 3.2277596024241995e-05, "loss": 0.1808, "step": 25675 }, { "epoch": 2.3273518216421967, "grad_norm": 0.8202143907546997, "learning_rate": 3.227002931257036e-05, "loss": 0.1525, "step": 25680 }, { "epoch": 2.327804966467283, "grad_norm": 0.8253878951072693, "learning_rate": 3.226246187331171e-05, "loss": 0.1831, "step": 25685 }, { "epoch": 2.328258111292369, "grad_norm": 0.7623640894889832, "learning_rate": 3.225489370722344e-05, "loss": 0.164, "step": 25690 }, { "epoch": 2.328711256117455, "grad_norm": 0.8025678992271423, "learning_rate": 3.224732481506294e-05, "loss": 0.1767, "step": 25695 }, { "epoch": 2.3291644009425414, "grad_norm": 0.7627614140510559, "learning_rate": 3.2239755197587734e-05, "loss": 0.1473, "step": 25700 }, { "epoch": 2.3296175457676274, "grad_norm": 0.7574685215950012, "learning_rate": 3.223218485555538e-05, "loss": 0.1489, "step": 25705 }, { "epoch": 2.3300706905927133, "grad_norm": 0.7953682541847229, "learning_rate": 3.222461378972351e-05, "loss": 0.1482, "step": 25710 }, { "epoch": 2.3305238354177993, "grad_norm": 0.893150806427002, "learning_rate": 3.221704200084986e-05, "loss": 0.194, "step": 25715 }, { "epoch": 2.3309769802428857, "grad_norm": 0.9687781929969788, "learning_rate": 3.22094694896922e-05, "loss": 0.1555, "step": 25720 }, { "epoch": 2.3314301250679716, "grad_norm": 0.8249948024749756, "learning_rate": 3.22018962570084e-05, "loss": 0.1748, "step": 25725 }, { "epoch": 2.3318832698930576, "grad_norm": 0.9590620398521423, "learning_rate": 3.219432230355639e-05, "loss": 0.1514, "step": 25730 }, { "epoch": 2.332336414718144, "grad_norm": 0.7876166701316833, "learning_rate": 3.218674763009417e-05, "loss": 0.1776, "step": 25735 }, { "epoch": 2.33278955954323, "grad_norm": 0.9501940011978149, "learning_rate": 3.217917223737982e-05, "loss": 0.2094, "step": 25740 }, { "epoch": 2.333242704368316, "grad_norm": 0.8333490490913391, "learning_rate": 3.2171596126171485e-05, "loss": 0.1506, "step": 25745 }, { "epoch": 2.3336958491934023, "grad_norm": 1.041430115699768, "learning_rate": 3.216401929722738e-05, "loss": 0.1678, "step": 25750 }, { "epoch": 2.3341489940184883, "grad_norm": 0.8045421838760376, "learning_rate": 3.2156441751305805e-05, "loss": 0.1713, "step": 25755 }, { "epoch": 2.3346021388435743, "grad_norm": 0.9394654035568237, "learning_rate": 3.2148863489165124e-05, "loss": 0.1622, "step": 25760 }, { "epoch": 2.3350552836686607, "grad_norm": 0.7714900970458984, "learning_rate": 3.2141284511563766e-05, "loss": 0.133, "step": 25765 }, { "epoch": 2.3355084284937466, "grad_norm": 0.8088856935501099, "learning_rate": 3.213370481926023e-05, "loss": 0.1693, "step": 25770 }, { "epoch": 2.3359615733188326, "grad_norm": 0.9069876074790955, "learning_rate": 3.212612441301312e-05, "loss": 0.1735, "step": 25775 }, { "epoch": 2.336414718143919, "grad_norm": 0.8106603622436523, "learning_rate": 3.211854329358106e-05, "loss": 0.1566, "step": 25780 }, { "epoch": 2.336867862969005, "grad_norm": 0.8368204832077026, "learning_rate": 3.211096146172279e-05, "loss": 0.1759, "step": 25785 }, { "epoch": 2.337321007794091, "grad_norm": 0.8284101486206055, "learning_rate": 3.2103378918197086e-05, "loss": 0.1734, "step": 25790 }, { "epoch": 2.3377741526191773, "grad_norm": 0.976414144039154, "learning_rate": 3.209579566376282e-05, "loss": 0.1887, "step": 25795 }, { "epoch": 2.3382272974442633, "grad_norm": 0.8621848225593567, "learning_rate": 3.208821169917892e-05, "loss": 0.1705, "step": 25800 }, { "epoch": 2.3386804422693492, "grad_norm": 0.8595209717750549, "learning_rate": 3.20806270252044e-05, "loss": 0.1792, "step": 25805 }, { "epoch": 2.339133587094435, "grad_norm": 0.8131311535835266, "learning_rate": 3.2073041642598335e-05, "loss": 0.1709, "step": 25810 }, { "epoch": 2.3395867319195216, "grad_norm": 0.7853872776031494, "learning_rate": 3.206545555211987e-05, "loss": 0.1784, "step": 25815 }, { "epoch": 2.3400398767446076, "grad_norm": 0.8396316766738892, "learning_rate": 3.205786875452823e-05, "loss": 0.1895, "step": 25820 }, { "epoch": 2.3404930215696935, "grad_norm": 0.8640989661216736, "learning_rate": 3.20502812505827e-05, "loss": 0.173, "step": 25825 }, { "epoch": 2.34094616639478, "grad_norm": 0.7745577096939087, "learning_rate": 3.204269304104263e-05, "loss": 0.1534, "step": 25830 }, { "epoch": 2.341399311219866, "grad_norm": 0.7455109357833862, "learning_rate": 3.203510412666747e-05, "loss": 0.1712, "step": 25835 }, { "epoch": 2.341852456044952, "grad_norm": 0.7491734027862549, "learning_rate": 3.20275145082167e-05, "loss": 0.1923, "step": 25840 }, { "epoch": 2.3423056008700383, "grad_norm": 0.8977418541908264, "learning_rate": 3.201992418644991e-05, "loss": 0.2112, "step": 25845 }, { "epoch": 2.342758745695124, "grad_norm": 0.7500549554824829, "learning_rate": 3.201233316212673e-05, "loss": 0.161, "step": 25850 }, { "epoch": 2.34321189052021, "grad_norm": 0.7455937266349792, "learning_rate": 3.2004741436006877e-05, "loss": 0.1461, "step": 25855 }, { "epoch": 2.343665035345296, "grad_norm": 0.9710594415664673, "learning_rate": 3.199714900885014e-05, "loss": 0.2047, "step": 25860 }, { "epoch": 2.3441181801703825, "grad_norm": 0.7982906103134155, "learning_rate": 3.198955588141635e-05, "loss": 0.1561, "step": 25865 }, { "epoch": 2.3445713249954685, "grad_norm": 0.9135197997093201, "learning_rate": 3.1981962054465454e-05, "loss": 0.1739, "step": 25870 }, { "epoch": 2.3450244698205545, "grad_norm": 0.6893361210823059, "learning_rate": 3.1974367528757444e-05, "loss": 0.1706, "step": 25875 }, { "epoch": 2.345477614645641, "grad_norm": 0.964994490146637, "learning_rate": 3.196677230505236e-05, "loss": 0.1598, "step": 25880 }, { "epoch": 2.345930759470727, "grad_norm": 0.8072537183761597, "learning_rate": 3.195917638411036e-05, "loss": 0.1833, "step": 25885 }, { "epoch": 2.346383904295813, "grad_norm": 0.7816234230995178, "learning_rate": 3.1951579766691625e-05, "loss": 0.1437, "step": 25890 }, { "epoch": 2.346837049120899, "grad_norm": 0.7907162308692932, "learning_rate": 3.194398245355644e-05, "loss": 0.1305, "step": 25895 }, { "epoch": 2.347290193945985, "grad_norm": 1.001846194267273, "learning_rate": 3.193638444546515e-05, "loss": 0.1687, "step": 25900 }, { "epoch": 2.347743338771071, "grad_norm": 0.8323495388031006, "learning_rate": 3.192878574317815e-05, "loss": 0.1815, "step": 25905 }, { "epoch": 2.3481964835961575, "grad_norm": 0.8434582352638245, "learning_rate": 3.1921186347455934e-05, "loss": 0.1707, "step": 25910 }, { "epoch": 2.3486496284212435, "grad_norm": 0.8953823447227478, "learning_rate": 3.1913586259059044e-05, "loss": 0.1957, "step": 25915 }, { "epoch": 2.3491027732463294, "grad_norm": 0.8001226782798767, "learning_rate": 3.190598547874811e-05, "loss": 0.1408, "step": 25920 }, { "epoch": 2.349555918071416, "grad_norm": 0.7877936363220215, "learning_rate": 3.1898384007283815e-05, "loss": 0.1806, "step": 25925 }, { "epoch": 2.350009062896502, "grad_norm": 0.8539075255393982, "learning_rate": 3.189078184542692e-05, "loss": 0.1796, "step": 25930 }, { "epoch": 2.3504622077215878, "grad_norm": 0.8421875834465027, "learning_rate": 3.188317899393824e-05, "loss": 0.1908, "step": 25935 }, { "epoch": 2.350915352546674, "grad_norm": 0.9156140685081482, "learning_rate": 3.1875575453578684e-05, "loss": 0.1784, "step": 25940 }, { "epoch": 2.35136849737176, "grad_norm": 0.8584846258163452, "learning_rate": 3.186797122510921e-05, "loss": 0.1489, "step": 25945 }, { "epoch": 2.351821642196846, "grad_norm": 0.8843679428100586, "learning_rate": 3.186036630929086e-05, "loss": 0.171, "step": 25950 }, { "epoch": 2.352274787021932, "grad_norm": 0.7437861561775208, "learning_rate": 3.1852760706884724e-05, "loss": 0.159, "step": 25955 }, { "epoch": 2.3527279318470184, "grad_norm": 0.7755461931228638, "learning_rate": 3.184515441865199e-05, "loss": 0.1723, "step": 25960 }, { "epoch": 2.3531810766721044, "grad_norm": 0.7608129978179932, "learning_rate": 3.1837547445353874e-05, "loss": 0.1498, "step": 25965 }, { "epoch": 2.3536342214971904, "grad_norm": 0.8276287913322449, "learning_rate": 3.1829939787751705e-05, "loss": 0.1584, "step": 25970 }, { "epoch": 2.3540873663222768, "grad_norm": 0.7853977084159851, "learning_rate": 3.182233144660685e-05, "loss": 0.1687, "step": 25975 }, { "epoch": 2.3545405111473627, "grad_norm": 0.8846850395202637, "learning_rate": 3.181472242268076e-05, "loss": 0.1484, "step": 25980 }, { "epoch": 2.3549936559724487, "grad_norm": 0.7942383289337158, "learning_rate": 3.180711271673495e-05, "loss": 0.1541, "step": 25985 }, { "epoch": 2.3554468007975347, "grad_norm": 1.0373313426971436, "learning_rate": 3.1799502329531e-05, "loss": 0.1691, "step": 25990 }, { "epoch": 2.355899945622621, "grad_norm": 0.8149959444999695, "learning_rate": 3.179189126183054e-05, "loss": 0.1651, "step": 25995 }, { "epoch": 2.356353090447707, "grad_norm": 0.7898527979850769, "learning_rate": 3.1784279514395325e-05, "loss": 0.1681, "step": 26000 }, { "epoch": 2.356806235272793, "grad_norm": 0.825859546661377, "learning_rate": 3.177666708798712e-05, "loss": 0.1474, "step": 26005 }, { "epoch": 2.3572593800978794, "grad_norm": 0.8359223008155823, "learning_rate": 3.176905398336778e-05, "loss": 0.1769, "step": 26010 }, { "epoch": 2.3577125249229653, "grad_norm": 0.873379111289978, "learning_rate": 3.176144020129922e-05, "loss": 0.1784, "step": 26015 }, { "epoch": 2.3581656697480513, "grad_norm": 0.7353900671005249, "learning_rate": 3.175382574254345e-05, "loss": 0.1584, "step": 26020 }, { "epoch": 2.3586188145731377, "grad_norm": 0.823991060256958, "learning_rate": 3.1746210607862514e-05, "loss": 0.1828, "step": 26025 }, { "epoch": 2.3590719593982237, "grad_norm": 0.8772817254066467, "learning_rate": 3.1738594798018544e-05, "loss": 0.1582, "step": 26030 }, { "epoch": 2.3595251042233096, "grad_norm": 0.834918200969696, "learning_rate": 3.173097831377372e-05, "loss": 0.1354, "step": 26035 }, { "epoch": 2.359978249048396, "grad_norm": 0.8892227411270142, "learning_rate": 3.172336115589032e-05, "loss": 0.1896, "step": 26040 }, { "epoch": 2.360431393873482, "grad_norm": 0.8684990406036377, "learning_rate": 3.171574332513065e-05, "loss": 0.1404, "step": 26045 }, { "epoch": 2.360884538698568, "grad_norm": 0.8039185404777527, "learning_rate": 3.1708124822257126e-05, "loss": 0.1852, "step": 26050 }, { "epoch": 2.3613376835236544, "grad_norm": 0.7542555928230286, "learning_rate": 3.1700505648032205e-05, "loss": 0.1585, "step": 26055 }, { "epoch": 2.3617908283487403, "grad_norm": 0.8087906837463379, "learning_rate": 3.169288580321841e-05, "loss": 0.1671, "step": 26060 }, { "epoch": 2.3622439731738263, "grad_norm": 0.8897677063941956, "learning_rate": 3.168526528857834e-05, "loss": 0.1828, "step": 26065 }, { "epoch": 2.3626971179989127, "grad_norm": 0.6358467936515808, "learning_rate": 3.167764410487467e-05, "loss": 0.1354, "step": 26070 }, { "epoch": 2.3631502628239986, "grad_norm": 0.8803695440292358, "learning_rate": 3.1670022252870106e-05, "loss": 0.1837, "step": 26075 }, { "epoch": 2.3636034076490846, "grad_norm": 0.8965404629707336, "learning_rate": 3.166239973332746e-05, "loss": 0.1748, "step": 26080 }, { "epoch": 2.3640565524741706, "grad_norm": 0.7254555821418762, "learning_rate": 3.165477654700961e-05, "loss": 0.1713, "step": 26085 }, { "epoch": 2.364509697299257, "grad_norm": 1.0830533504486084, "learning_rate": 3.164715269467947e-05, "loss": 0.1748, "step": 26090 }, { "epoch": 2.364962842124343, "grad_norm": 0.919523298740387, "learning_rate": 3.163952817710004e-05, "loss": 0.1625, "step": 26095 }, { "epoch": 2.365415986949429, "grad_norm": 0.8290399312973022, "learning_rate": 3.1631902995034374e-05, "loss": 0.1556, "step": 26100 }, { "epoch": 2.3658691317745153, "grad_norm": 0.7364559769630432, "learning_rate": 3.1624277149245625e-05, "loss": 0.1568, "step": 26105 }, { "epoch": 2.3663222765996013, "grad_norm": 0.8699804544448853, "learning_rate": 3.1616650640496975e-05, "loss": 0.1485, "step": 26110 }, { "epoch": 2.366775421424687, "grad_norm": 0.7991268634796143, "learning_rate": 3.16090234695517e-05, "loss": 0.1744, "step": 26115 }, { "epoch": 2.367228566249773, "grad_norm": 0.9795913696289062, "learning_rate": 3.160139563717312e-05, "loss": 0.1868, "step": 26120 }, { "epoch": 2.3676817110748596, "grad_norm": 0.8678423166275024, "learning_rate": 3.1593767144124627e-05, "loss": 0.1858, "step": 26125 }, { "epoch": 2.3681348558999455, "grad_norm": 0.8307703733444214, "learning_rate": 3.1586137991169685e-05, "loss": 0.1895, "step": 26130 }, { "epoch": 2.3685880007250315, "grad_norm": 0.878517746925354, "learning_rate": 3.1578508179071836e-05, "loss": 0.1771, "step": 26135 }, { "epoch": 2.369041145550118, "grad_norm": 0.9127415418624878, "learning_rate": 3.1570877708594665e-05, "loss": 0.1713, "step": 26140 }, { "epoch": 2.369494290375204, "grad_norm": 0.94277423620224, "learning_rate": 3.156324658050183e-05, "loss": 0.1813, "step": 26145 }, { "epoch": 2.36994743520029, "grad_norm": 0.8521183729171753, "learning_rate": 3.155561479555705e-05, "loss": 0.1582, "step": 26150 }, { "epoch": 2.3704005800253762, "grad_norm": 0.7815011739730835, "learning_rate": 3.1547982354524143e-05, "loss": 0.1554, "step": 26155 }, { "epoch": 2.370853724850462, "grad_norm": 0.788204550743103, "learning_rate": 3.1540349258166936e-05, "loss": 0.1405, "step": 26160 }, { "epoch": 2.371306869675548, "grad_norm": 0.7971954345703125, "learning_rate": 3.1532715507249365e-05, "loss": 0.1786, "step": 26165 }, { "epoch": 2.3717600145006346, "grad_norm": 0.8959699273109436, "learning_rate": 3.152508110253542e-05, "loss": 0.2031, "step": 26170 }, { "epoch": 2.3722131593257205, "grad_norm": 0.8524569869041443, "learning_rate": 3.151744604478915e-05, "loss": 0.1747, "step": 26175 }, { "epoch": 2.3726663041508065, "grad_norm": 0.8572075963020325, "learning_rate": 3.1509810334774683e-05, "loss": 0.1886, "step": 26180 }, { "epoch": 2.373119448975893, "grad_norm": 0.8827096223831177, "learning_rate": 3.150217397325618e-05, "loss": 0.158, "step": 26185 }, { "epoch": 2.373572593800979, "grad_norm": 0.9172871708869934, "learning_rate": 3.149453696099793e-05, "loss": 0.1697, "step": 26190 }, { "epoch": 2.374025738626065, "grad_norm": 0.8532930016517639, "learning_rate": 3.14868992987642e-05, "loss": 0.1745, "step": 26195 }, { "epoch": 2.374478883451151, "grad_norm": 0.8339188694953918, "learning_rate": 3.147926098731942e-05, "loss": 0.1448, "step": 26200 }, { "epoch": 2.374932028276237, "grad_norm": 0.8485869765281677, "learning_rate": 3.1471622027428e-05, "loss": 0.1471, "step": 26205 }, { "epoch": 2.375385173101323, "grad_norm": 0.9001031517982483, "learning_rate": 3.146398241985446e-05, "loss": 0.1631, "step": 26210 }, { "epoch": 2.375838317926409, "grad_norm": 0.8473097681999207, "learning_rate": 3.1456342165363377e-05, "loss": 0.1684, "step": 26215 }, { "epoch": 2.3762914627514955, "grad_norm": 0.8403385281562805, "learning_rate": 3.1448701264719384e-05, "loss": 0.1474, "step": 26220 }, { "epoch": 2.3767446075765815, "grad_norm": 0.8877490758895874, "learning_rate": 3.144105971868719e-05, "loss": 0.1763, "step": 26225 }, { "epoch": 2.3771977524016674, "grad_norm": 1.0675311088562012, "learning_rate": 3.143341752803156e-05, "loss": 0.167, "step": 26230 }, { "epoch": 2.377650897226754, "grad_norm": 0.7137593626976013, "learning_rate": 3.1425774693517335e-05, "loss": 0.1395, "step": 26235 }, { "epoch": 2.37810404205184, "grad_norm": 0.7984723448753357, "learning_rate": 3.14181312159094e-05, "loss": 0.1757, "step": 26240 }, { "epoch": 2.3785571868769257, "grad_norm": 0.8765779733657837, "learning_rate": 3.141048709597272e-05, "loss": 0.1766, "step": 26245 }, { "epoch": 2.379010331702012, "grad_norm": 0.8050801753997803, "learning_rate": 3.140284233447233e-05, "loss": 0.1461, "step": 26250 }, { "epoch": 2.379463476527098, "grad_norm": 0.8331514596939087, "learning_rate": 3.139519693217332e-05, "loss": 0.1523, "step": 26255 }, { "epoch": 2.379916621352184, "grad_norm": 0.7045026421546936, "learning_rate": 3.1387550889840834e-05, "loss": 0.1544, "step": 26260 }, { "epoch": 2.38036976617727, "grad_norm": 0.8721777200698853, "learning_rate": 3.1379904208240094e-05, "loss": 0.1792, "step": 26265 }, { "epoch": 2.3808229110023564, "grad_norm": 0.8116551041603088, "learning_rate": 3.137225688813638e-05, "loss": 0.1562, "step": 26270 }, { "epoch": 2.3812760558274424, "grad_norm": 0.7144197821617126, "learning_rate": 3.136460893029505e-05, "loss": 0.1416, "step": 26275 }, { "epoch": 2.3817292006525284, "grad_norm": 0.8619564175605774, "learning_rate": 3.13569603354815e-05, "loss": 0.149, "step": 26280 }, { "epoch": 2.3821823454776148, "grad_norm": 0.7178249955177307, "learning_rate": 3.1349311104461224e-05, "loss": 0.1749, "step": 26285 }, { "epoch": 2.3826354903027007, "grad_norm": 0.8352950811386108, "learning_rate": 3.134166123799973e-05, "loss": 0.155, "step": 26290 }, { "epoch": 2.3830886351277867, "grad_norm": 0.8320109248161316, "learning_rate": 3.133401073686264e-05, "loss": 0.1449, "step": 26295 }, { "epoch": 2.383541779952873, "grad_norm": 0.7055287957191467, "learning_rate": 3.132635960181562e-05, "loss": 0.1673, "step": 26300 }, { "epoch": 2.383994924777959, "grad_norm": 0.9127789735794067, "learning_rate": 3.131870783362439e-05, "loss": 0.1461, "step": 26305 }, { "epoch": 2.384448069603045, "grad_norm": 0.848531186580658, "learning_rate": 3.1311055433054745e-05, "loss": 0.1493, "step": 26310 }, { "epoch": 2.3849012144281314, "grad_norm": 0.8339393734931946, "learning_rate": 3.130340240087253e-05, "loss": 0.1831, "step": 26315 }, { "epoch": 2.3853543592532174, "grad_norm": 0.9323712587356567, "learning_rate": 3.1295748737843675e-05, "loss": 0.1656, "step": 26320 }, { "epoch": 2.3858075040783033, "grad_norm": 0.9369649887084961, "learning_rate": 3.128809444473417e-05, "loss": 0.1873, "step": 26325 }, { "epoch": 2.3862606489033897, "grad_norm": 0.7798925042152405, "learning_rate": 3.128043952231004e-05, "loss": 0.1795, "step": 26330 }, { "epoch": 2.3867137937284757, "grad_norm": 0.8593260049819946, "learning_rate": 3.1272783971337394e-05, "loss": 0.1728, "step": 26335 }, { "epoch": 2.3871669385535617, "grad_norm": 0.7306386232376099, "learning_rate": 3.126512779258242e-05, "loss": 0.1575, "step": 26340 }, { "epoch": 2.387620083378648, "grad_norm": 0.8424440622329712, "learning_rate": 3.1257470986811335e-05, "loss": 0.135, "step": 26345 }, { "epoch": 2.388073228203734, "grad_norm": 0.9517315030097961, "learning_rate": 3.124981355479042e-05, "loss": 0.1599, "step": 26350 }, { "epoch": 2.38852637302882, "grad_norm": 0.6839801669120789, "learning_rate": 3.124215549728607e-05, "loss": 0.1434, "step": 26355 }, { "epoch": 2.388979517853906, "grad_norm": 0.7543492913246155, "learning_rate": 3.123449681506469e-05, "loss": 0.1407, "step": 26360 }, { "epoch": 2.3894326626789923, "grad_norm": 0.8555076718330383, "learning_rate": 3.1226837508892745e-05, "loss": 0.1429, "step": 26365 }, { "epoch": 2.3898858075040783, "grad_norm": 0.8078946471214294, "learning_rate": 3.121917757953681e-05, "loss": 0.1416, "step": 26370 }, { "epoch": 2.3903389523291643, "grad_norm": 1.1264151334762573, "learning_rate": 3.1211517027763474e-05, "loss": 0.199, "step": 26375 }, { "epoch": 2.3907920971542507, "grad_norm": 0.9530636072158813, "learning_rate": 3.120385585433942e-05, "loss": 0.148, "step": 26380 }, { "epoch": 2.3912452419793366, "grad_norm": 0.8422128558158875, "learning_rate": 3.1196194060031375e-05, "loss": 0.1575, "step": 26385 }, { "epoch": 2.3916983868044226, "grad_norm": 0.9023135900497437, "learning_rate": 3.118853164560613e-05, "loss": 0.1812, "step": 26390 }, { "epoch": 2.3921515316295086, "grad_norm": 0.8274620771408081, "learning_rate": 3.118086861183055e-05, "loss": 0.1452, "step": 26395 }, { "epoch": 2.392604676454595, "grad_norm": 0.8647996187210083, "learning_rate": 3.117320495947154e-05, "loss": 0.1403, "step": 26400 }, { "epoch": 2.393057821279681, "grad_norm": 0.9516957402229309, "learning_rate": 3.1165540689296094e-05, "loss": 0.1733, "step": 26405 }, { "epoch": 2.393510966104767, "grad_norm": 0.8606735467910767, "learning_rate": 3.115787580207126e-05, "loss": 0.1725, "step": 26410 }, { "epoch": 2.3939641109298533, "grad_norm": 0.9730439782142639, "learning_rate": 3.115021029856413e-05, "loss": 0.1716, "step": 26415 }, { "epoch": 2.3944172557549392, "grad_norm": 0.867279052734375, "learning_rate": 3.1142544179541876e-05, "loss": 0.1657, "step": 26420 }, { "epoch": 2.394870400580025, "grad_norm": 0.7890639901161194, "learning_rate": 3.1134877445771725e-05, "loss": 0.1536, "step": 26425 }, { "epoch": 2.3953235454051116, "grad_norm": 0.7487570643424988, "learning_rate": 3.112721009802096e-05, "loss": 0.1412, "step": 26430 }, { "epoch": 2.3957766902301976, "grad_norm": 0.7622184753417969, "learning_rate": 3.111954213705693e-05, "loss": 0.1415, "step": 26435 }, { "epoch": 2.3962298350552835, "grad_norm": 0.8595138192176819, "learning_rate": 3.111187356364707e-05, "loss": 0.1731, "step": 26440 }, { "epoch": 2.39668297988037, "grad_norm": 0.8016836643218994, "learning_rate": 3.1104204378558835e-05, "loss": 0.1528, "step": 26445 }, { "epoch": 2.397136124705456, "grad_norm": 0.7937566637992859, "learning_rate": 3.109653458255975e-05, "loss": 0.1628, "step": 26450 }, { "epoch": 2.397589269530542, "grad_norm": 0.8787643909454346, "learning_rate": 3.1088864176417435e-05, "loss": 0.1479, "step": 26455 }, { "epoch": 2.3980424143556283, "grad_norm": 0.8123072385787964, "learning_rate": 3.108119316089953e-05, "loss": 0.166, "step": 26460 }, { "epoch": 2.398495559180714, "grad_norm": 0.7702158689498901, "learning_rate": 3.107352153677376e-05, "loss": 0.1753, "step": 26465 }, { "epoch": 2.3989487040058, "grad_norm": 0.9080550074577332, "learning_rate": 3.106584930480791e-05, "loss": 0.1624, "step": 26470 }, { "epoch": 2.3994018488308866, "grad_norm": 0.835796594619751, "learning_rate": 3.105817646576979e-05, "loss": 0.1747, "step": 26475 }, { "epoch": 2.3998549936559725, "grad_norm": 0.9767827391624451, "learning_rate": 3.1050503020427336e-05, "loss": 0.1559, "step": 26480 }, { "epoch": 2.4003081384810585, "grad_norm": 0.7745000123977661, "learning_rate": 3.104282896954849e-05, "loss": 0.1627, "step": 26485 }, { "epoch": 2.4007612833061445, "grad_norm": 0.8510745763778687, "learning_rate": 3.1035154313901274e-05, "loss": 0.178, "step": 26490 }, { "epoch": 2.401214428131231, "grad_norm": 0.7908599972724915, "learning_rate": 3.1027479054253785e-05, "loss": 0.1679, "step": 26495 }, { "epoch": 2.401667572956317, "grad_norm": 0.8603745698928833, "learning_rate": 3.1019803191374144e-05, "loss": 0.1849, "step": 26500 }, { "epoch": 2.402120717781403, "grad_norm": 0.7469510436058044, "learning_rate": 3.101212672603057e-05, "loss": 0.1924, "step": 26505 }, { "epoch": 2.402573862606489, "grad_norm": 0.8174178600311279, "learning_rate": 3.1004449658991326e-05, "loss": 0.1633, "step": 26510 }, { "epoch": 2.403027007431575, "grad_norm": 1.0247838497161865, "learning_rate": 3.099677199102472e-05, "loss": 0.1741, "step": 26515 }, { "epoch": 2.403480152256661, "grad_norm": 0.7588873505592346, "learning_rate": 3.098909372289914e-05, "loss": 0.1496, "step": 26520 }, { "epoch": 2.403933297081747, "grad_norm": 0.8092746734619141, "learning_rate": 3.098141485538305e-05, "loss": 0.153, "step": 26525 }, { "epoch": 2.4043864419068335, "grad_norm": 0.8593928217887878, "learning_rate": 3.097373538924494e-05, "loss": 0.169, "step": 26530 }, { "epoch": 2.4048395867319194, "grad_norm": 0.8469714522361755, "learning_rate": 3.0966055325253366e-05, "loss": 0.1374, "step": 26535 }, { "epoch": 2.4052927315570054, "grad_norm": 1.0075308084487915, "learning_rate": 3.095837466417696e-05, "loss": 0.1581, "step": 26540 }, { "epoch": 2.405745876382092, "grad_norm": 0.8500180840492249, "learning_rate": 3.09506934067844e-05, "loss": 0.1393, "step": 26545 }, { "epoch": 2.4061990212071778, "grad_norm": 0.8331440091133118, "learning_rate": 3.0943011553844434e-05, "loss": 0.1395, "step": 26550 }, { "epoch": 2.4066521660322637, "grad_norm": 0.8638006448745728, "learning_rate": 3.093532910612587e-05, "loss": 0.1416, "step": 26555 }, { "epoch": 2.40710531085735, "grad_norm": 0.8053168058395386, "learning_rate": 3.092764606439756e-05, "loss": 0.1803, "step": 26560 }, { "epoch": 2.407558455682436, "grad_norm": 0.7308458685874939, "learning_rate": 3.091996242942842e-05, "loss": 0.1234, "step": 26565 }, { "epoch": 2.408011600507522, "grad_norm": 0.8848094344139099, "learning_rate": 3.0912278201987446e-05, "loss": 0.1557, "step": 26570 }, { "epoch": 2.4084647453326085, "grad_norm": 0.7769845724105835, "learning_rate": 3.090459338284367e-05, "loss": 0.1905, "step": 26575 }, { "epoch": 2.4089178901576944, "grad_norm": 0.9252490997314453, "learning_rate": 3.0896907972766184e-05, "loss": 0.1923, "step": 26580 }, { "epoch": 2.4093710349827804, "grad_norm": 0.7855699062347412, "learning_rate": 3.088922197252417e-05, "loss": 0.1627, "step": 26585 }, { "epoch": 2.409824179807867, "grad_norm": 0.9058955311775208, "learning_rate": 3.088153538288682e-05, "loss": 0.1876, "step": 26590 }, { "epoch": 2.4102773246329527, "grad_norm": 0.8022380471229553, "learning_rate": 3.087384820462342e-05, "loss": 0.1553, "step": 26595 }, { "epoch": 2.4107304694580387, "grad_norm": 0.903361439704895, "learning_rate": 3.086616043850331e-05, "loss": 0.1896, "step": 26600 }, { "epoch": 2.411183614283125, "grad_norm": 0.7757391929626465, "learning_rate": 3.085847208529588e-05, "loss": 0.162, "step": 26605 }, { "epoch": 2.411636759108211, "grad_norm": 0.890511155128479, "learning_rate": 3.085078314577058e-05, "loss": 0.1536, "step": 26610 }, { "epoch": 2.412089903933297, "grad_norm": 0.8746675252914429, "learning_rate": 3.0843093620696925e-05, "loss": 0.1836, "step": 26615 }, { "epoch": 2.412543048758383, "grad_norm": 0.629963219165802, "learning_rate": 3.083540351084449e-05, "loss": 0.149, "step": 26620 }, { "epoch": 2.4129961935834694, "grad_norm": 0.7105464339256287, "learning_rate": 3.082771281698289e-05, "loss": 0.1657, "step": 26625 }, { "epoch": 2.4134493384085554, "grad_norm": 0.7575259208679199, "learning_rate": 3.082002153988183e-05, "loss": 0.1554, "step": 26630 }, { "epoch": 2.4139024832336413, "grad_norm": 0.8485234975814819, "learning_rate": 3.081232968031104e-05, "loss": 0.1462, "step": 26635 }, { "epoch": 2.4143556280587277, "grad_norm": 0.7526084780693054, "learning_rate": 3.0804637239040336e-05, "loss": 0.1485, "step": 26640 }, { "epoch": 2.4148087728838137, "grad_norm": 0.8348374962806702, "learning_rate": 3.079694421683957e-05, "loss": 0.1685, "step": 26645 }, { "epoch": 2.4152619177088996, "grad_norm": 0.8244729042053223, "learning_rate": 3.078925061447868e-05, "loss": 0.158, "step": 26650 }, { "epoch": 2.415715062533986, "grad_norm": 0.8386844396591187, "learning_rate": 3.078155643272762e-05, "loss": 0.1591, "step": 26655 }, { "epoch": 2.416168207359072, "grad_norm": 0.9036726355552673, "learning_rate": 3.077386167235644e-05, "loss": 0.1625, "step": 26660 }, { "epoch": 2.416621352184158, "grad_norm": 0.8039996027946472, "learning_rate": 3.076616633413523e-05, "loss": 0.1686, "step": 26665 }, { "epoch": 2.417074497009244, "grad_norm": 0.7850460410118103, "learning_rate": 3.075847041883415e-05, "loss": 0.209, "step": 26670 }, { "epoch": 2.4175276418343303, "grad_norm": 0.8782591223716736, "learning_rate": 3.075077392722339e-05, "loss": 0.1594, "step": 26675 }, { "epoch": 2.4179807866594163, "grad_norm": 0.9745951294898987, "learning_rate": 3.0743076860073245e-05, "loss": 0.1719, "step": 26680 }, { "epoch": 2.4184339314845023, "grad_norm": 0.8087928891181946, "learning_rate": 3.073537921815402e-05, "loss": 0.1685, "step": 26685 }, { "epoch": 2.4188870763095887, "grad_norm": 0.9294443726539612, "learning_rate": 3.072768100223611e-05, "loss": 0.1731, "step": 26690 }, { "epoch": 2.4193402211346746, "grad_norm": 0.9666886329650879, "learning_rate": 3.071998221308995e-05, "loss": 0.1981, "step": 26695 }, { "epoch": 2.4197933659597606, "grad_norm": 0.7568742632865906, "learning_rate": 3.071228285148603e-05, "loss": 0.1446, "step": 26700 }, { "epoch": 2.420246510784847, "grad_norm": 0.8873618245124817, "learning_rate": 3.070458291819491e-05, "loss": 0.1666, "step": 26705 }, { "epoch": 2.420699655609933, "grad_norm": 0.9060069918632507, "learning_rate": 3.06968824139872e-05, "loss": 0.1523, "step": 26710 }, { "epoch": 2.421152800435019, "grad_norm": 0.8445006012916565, "learning_rate": 3.068918133963358e-05, "loss": 0.1466, "step": 26715 }, { "epoch": 2.4216059452601053, "grad_norm": 0.8195416927337646, "learning_rate": 3.068147969590477e-05, "loss": 0.166, "step": 26720 }, { "epoch": 2.4220590900851913, "grad_norm": 0.9217990636825562, "learning_rate": 3.067377748357155e-05, "loss": 0.15, "step": 26725 }, { "epoch": 2.4225122349102772, "grad_norm": 0.8457598686218262, "learning_rate": 3.066607470340476e-05, "loss": 0.1774, "step": 26730 }, { "epoch": 2.4229653797353636, "grad_norm": 0.9809102416038513, "learning_rate": 3.065837135617529e-05, "loss": 0.1516, "step": 26735 }, { "epoch": 2.4234185245604496, "grad_norm": 0.9070588946342468, "learning_rate": 3.065066744265411e-05, "loss": 0.1654, "step": 26740 }, { "epoch": 2.4238716693855356, "grad_norm": 0.8745318651199341, "learning_rate": 3.064296296361223e-05, "loss": 0.1747, "step": 26745 }, { "epoch": 2.424324814210622, "grad_norm": 0.7800479531288147, "learning_rate": 3.06352579198207e-05, "loss": 0.1349, "step": 26750 }, { "epoch": 2.424777959035708, "grad_norm": 0.9677242636680603, "learning_rate": 3.062755231205065e-05, "loss": 0.1893, "step": 26755 }, { "epoch": 2.425231103860794, "grad_norm": 0.9701499938964844, "learning_rate": 3.061984614107326e-05, "loss": 0.1936, "step": 26760 }, { "epoch": 2.42568424868588, "grad_norm": 0.7495046257972717, "learning_rate": 3.061213940765978e-05, "loss": 0.177, "step": 26765 }, { "epoch": 2.4261373935109662, "grad_norm": 0.851309597492218, "learning_rate": 3.060443211258148e-05, "loss": 0.1859, "step": 26770 }, { "epoch": 2.426590538336052, "grad_norm": 0.8945930600166321, "learning_rate": 3.0596724256609726e-05, "loss": 0.1685, "step": 26775 }, { "epoch": 2.427043683161138, "grad_norm": 0.7870331406593323, "learning_rate": 3.058901584051592e-05, "loss": 0.1466, "step": 26780 }, { "epoch": 2.4274968279862246, "grad_norm": 0.789115309715271, "learning_rate": 3.058130686507151e-05, "loss": 0.1573, "step": 26785 }, { "epoch": 2.4279499728113105, "grad_norm": 0.8361273407936096, "learning_rate": 3.057359733104803e-05, "loss": 0.1352, "step": 26790 }, { "epoch": 2.4284031176363965, "grad_norm": 0.8230966925621033, "learning_rate": 3.056588723921704e-05, "loss": 0.1675, "step": 26795 }, { "epoch": 2.4288562624614825, "grad_norm": 0.7509821057319641, "learning_rate": 3.0558176590350166e-05, "loss": 0.1436, "step": 26800 }, { "epoch": 2.429309407286569, "grad_norm": 0.8700202107429504, "learning_rate": 3.0550465385219115e-05, "loss": 0.1348, "step": 26805 }, { "epoch": 2.429762552111655, "grad_norm": 0.9530929327011108, "learning_rate": 3.0542753624595603e-05, "loss": 0.1614, "step": 26810 }, { "epoch": 2.4302156969367408, "grad_norm": 0.7592161297798157, "learning_rate": 3.053504130925144e-05, "loss": 0.1638, "step": 26815 }, { "epoch": 2.430668841761827, "grad_norm": 0.9166494011878967, "learning_rate": 3.052732843995847e-05, "loss": 0.1645, "step": 26820 }, { "epoch": 2.431121986586913, "grad_norm": 0.794445812702179, "learning_rate": 3.0519615017488594e-05, "loss": 0.1499, "step": 26825 }, { "epoch": 2.431575131411999, "grad_norm": 0.8077408075332642, "learning_rate": 3.0511901042613788e-05, "loss": 0.1458, "step": 26830 }, { "epoch": 2.4320282762370855, "grad_norm": 0.8309443593025208, "learning_rate": 3.0504186516106065e-05, "loss": 0.1481, "step": 26835 }, { "epoch": 2.4324814210621715, "grad_norm": 0.8253073692321777, "learning_rate": 3.049647143873749e-05, "loss": 0.1406, "step": 26840 }, { "epoch": 2.4329345658872574, "grad_norm": 0.7604209184646606, "learning_rate": 3.048875581128019e-05, "loss": 0.1318, "step": 26845 }, { "epoch": 2.433387710712344, "grad_norm": 0.7860593199729919, "learning_rate": 3.048103963450636e-05, "loss": 0.145, "step": 26850 }, { "epoch": 2.43384085553743, "grad_norm": 0.8422468900680542, "learning_rate": 3.0473322909188234e-05, "loss": 0.1736, "step": 26855 }, { "epoch": 2.4342940003625158, "grad_norm": 0.8342530727386475, "learning_rate": 3.04656056360981e-05, "loss": 0.1677, "step": 26860 }, { "epoch": 2.434747145187602, "grad_norm": 0.879239022731781, "learning_rate": 3.0457887816008302e-05, "loss": 0.1485, "step": 26865 }, { "epoch": 2.435200290012688, "grad_norm": 0.6687124967575073, "learning_rate": 3.045016944969124e-05, "loss": 0.1712, "step": 26870 }, { "epoch": 2.435653434837774, "grad_norm": 0.8349435329437256, "learning_rate": 3.0442450537919388e-05, "loss": 0.1585, "step": 26875 }, { "epoch": 2.4361065796628605, "grad_norm": 0.8040997385978699, "learning_rate": 3.0434731081465247e-05, "loss": 0.1556, "step": 26880 }, { "epoch": 2.4365597244879464, "grad_norm": 0.8405946493148804, "learning_rate": 3.0427011081101374e-05, "loss": 0.1682, "step": 26885 }, { "epoch": 2.4370128693130324, "grad_norm": 0.8948811888694763, "learning_rate": 3.041929053760041e-05, "loss": 0.1633, "step": 26890 }, { "epoch": 2.4374660141381184, "grad_norm": 0.787964940071106, "learning_rate": 3.0411569451735e-05, "loss": 0.1328, "step": 26895 }, { "epoch": 2.4379191589632048, "grad_norm": 0.8650020360946655, "learning_rate": 3.04038478242779e-05, "loss": 0.161, "step": 26900 }, { "epoch": 2.4383723037882907, "grad_norm": 0.8098362684249878, "learning_rate": 3.0396125656001878e-05, "loss": 0.1632, "step": 26905 }, { "epoch": 2.4388254486133767, "grad_norm": 0.9015963673591614, "learning_rate": 3.0388402947679785e-05, "loss": 0.1962, "step": 26910 }, { "epoch": 2.439278593438463, "grad_norm": 0.7106752991676331, "learning_rate": 3.0380679700084492e-05, "loss": 0.1738, "step": 26915 }, { "epoch": 2.439731738263549, "grad_norm": 0.7828565239906311, "learning_rate": 3.0372955913988966e-05, "loss": 0.1636, "step": 26920 }, { "epoch": 2.440184883088635, "grad_norm": 1.5588349103927612, "learning_rate": 3.0365231590166188e-05, "loss": 0.1822, "step": 26925 }, { "epoch": 2.4406380279137214, "grad_norm": 0.9055042266845703, "learning_rate": 3.0357506729389224e-05, "loss": 0.1392, "step": 26930 }, { "epoch": 2.4410911727388074, "grad_norm": 0.7259253263473511, "learning_rate": 3.0349781332431175e-05, "loss": 0.1789, "step": 26935 }, { "epoch": 2.4415443175638933, "grad_norm": 0.746252179145813, "learning_rate": 3.0342055400065193e-05, "loss": 0.1607, "step": 26940 }, { "epoch": 2.4419974623889793, "grad_norm": 0.7566622495651245, "learning_rate": 3.0334328933064517e-05, "loss": 0.1561, "step": 26945 }, { "epoch": 2.4424506072140657, "grad_norm": 0.975543200969696, "learning_rate": 3.0326601932202383e-05, "loss": 0.164, "step": 26950 }, { "epoch": 2.4429037520391517, "grad_norm": 0.9014024138450623, "learning_rate": 3.0318874398252135e-05, "loss": 0.1595, "step": 26955 }, { "epoch": 2.4433568968642376, "grad_norm": 0.7711353302001953, "learning_rate": 3.0311146331987135e-05, "loss": 0.1723, "step": 26960 }, { "epoch": 2.443810041689324, "grad_norm": 0.8410671353340149, "learning_rate": 3.0303417734180818e-05, "loss": 0.1593, "step": 26965 }, { "epoch": 2.44426318651441, "grad_norm": 0.8137435913085938, "learning_rate": 3.029568860560666e-05, "loss": 0.1602, "step": 26970 }, { "epoch": 2.444716331339496, "grad_norm": 0.7940528988838196, "learning_rate": 3.0287958947038198e-05, "loss": 0.157, "step": 26975 }, { "epoch": 2.4451694761645824, "grad_norm": 0.8973736763000488, "learning_rate": 3.0280228759249012e-05, "loss": 0.1579, "step": 26980 }, { "epoch": 2.4456226209896683, "grad_norm": 0.7954376339912415, "learning_rate": 3.027249804301275e-05, "loss": 0.175, "step": 26985 }, { "epoch": 2.4460757658147543, "grad_norm": 0.9386512637138367, "learning_rate": 3.02647667991031e-05, "loss": 0.1487, "step": 26990 }, { "epoch": 2.4465289106398407, "grad_norm": 0.7826530337333679, "learning_rate": 3.0257035028293812e-05, "loss": 0.1469, "step": 26995 }, { "epoch": 2.4469820554649266, "grad_norm": 0.9379197955131531, "learning_rate": 3.0249302731358686e-05, "loss": 0.1315, "step": 27000 }, { "epoch": 2.4474352002900126, "grad_norm": 0.8328291773796082, "learning_rate": 3.0241569909071555e-05, "loss": 0.1803, "step": 27005 }, { "epoch": 2.447888345115099, "grad_norm": 0.7901774048805237, "learning_rate": 3.0233836562206342e-05, "loss": 0.1589, "step": 27010 }, { "epoch": 2.448341489940185, "grad_norm": 0.8190219402313232, "learning_rate": 3.0226102691536994e-05, "loss": 0.1622, "step": 27015 }, { "epoch": 2.448794634765271, "grad_norm": 0.833406388759613, "learning_rate": 3.021836829783753e-05, "loss": 0.1488, "step": 27020 }, { "epoch": 2.449247779590357, "grad_norm": 0.9442074298858643, "learning_rate": 3.0210633381882004e-05, "loss": 0.1923, "step": 27025 }, { "epoch": 2.4497009244154433, "grad_norm": 0.9175161123275757, "learning_rate": 3.0202897944444515e-05, "loss": 0.168, "step": 27030 }, { "epoch": 2.4501540692405293, "grad_norm": 0.823987603187561, "learning_rate": 3.0195161986299243e-05, "loss": 0.1388, "step": 27035 }, { "epoch": 2.450607214065615, "grad_norm": 0.8874643445014954, "learning_rate": 3.0187425508220412e-05, "loss": 0.1563, "step": 27040 }, { "epoch": 2.4510603588907016, "grad_norm": 0.7197165489196777, "learning_rate": 3.017968851098228e-05, "loss": 0.1603, "step": 27045 }, { "epoch": 2.4515135037157876, "grad_norm": 0.8261886835098267, "learning_rate": 3.0171950995359165e-05, "loss": 0.1335, "step": 27050 }, { "epoch": 2.4519666485408735, "grad_norm": 0.7934787273406982, "learning_rate": 3.0164212962125447e-05, "loss": 0.1387, "step": 27055 }, { "epoch": 2.45241979336596, "grad_norm": 0.7479290962219238, "learning_rate": 3.0156474412055556e-05, "loss": 0.1357, "step": 27060 }, { "epoch": 2.452872938191046, "grad_norm": 0.8653318881988525, "learning_rate": 3.0148735345923957e-05, "loss": 0.1502, "step": 27065 }, { "epoch": 2.453326083016132, "grad_norm": 0.8358133435249329, "learning_rate": 3.0140995764505194e-05, "loss": 0.1447, "step": 27070 }, { "epoch": 2.453779227841218, "grad_norm": 0.814285933971405, "learning_rate": 3.0133255668573827e-05, "loss": 0.1588, "step": 27075 }, { "epoch": 2.4542323726663042, "grad_norm": 0.9553859829902649, "learning_rate": 3.0125515058904497e-05, "loss": 0.1929, "step": 27080 }, { "epoch": 2.45468551749139, "grad_norm": 0.9154797792434692, "learning_rate": 3.011777393627189e-05, "loss": 0.1354, "step": 27085 }, { "epoch": 2.455138662316476, "grad_norm": 0.8017305135726929, "learning_rate": 3.0110032301450734e-05, "loss": 0.1502, "step": 27090 }, { "epoch": 2.4555918071415626, "grad_norm": 0.9190124273300171, "learning_rate": 3.0102290155215824e-05, "loss": 0.1816, "step": 27095 }, { "epoch": 2.4560449519666485, "grad_norm": 0.869240939617157, "learning_rate": 3.0094547498341985e-05, "loss": 0.173, "step": 27100 }, { "epoch": 2.4564980967917345, "grad_norm": 0.7429944276809692, "learning_rate": 3.0086804331604114e-05, "loss": 0.148, "step": 27105 }, { "epoch": 2.456951241616821, "grad_norm": 0.912353515625, "learning_rate": 3.007906065577715e-05, "loss": 0.1607, "step": 27110 }, { "epoch": 2.457404386441907, "grad_norm": 0.8053203225135803, "learning_rate": 3.0071316471636068e-05, "loss": 0.1658, "step": 27115 }, { "epoch": 2.457857531266993, "grad_norm": 0.8519784808158875, "learning_rate": 3.006357177995593e-05, "loss": 0.1521, "step": 27120 }, { "epoch": 2.458310676092079, "grad_norm": 0.8086087703704834, "learning_rate": 3.0055826581511804e-05, "loss": 0.1867, "step": 27125 }, { "epoch": 2.458763820917165, "grad_norm": 0.7921028733253479, "learning_rate": 3.0048080877078856e-05, "loss": 0.1494, "step": 27130 }, { "epoch": 2.459216965742251, "grad_norm": 0.9769750237464905, "learning_rate": 3.0040334667432263e-05, "loss": 0.1559, "step": 27135 }, { "epoch": 2.4596701105673375, "grad_norm": 0.8476482033729553, "learning_rate": 3.0032587953347273e-05, "loss": 0.1699, "step": 27140 }, { "epoch": 2.4601232553924235, "grad_norm": 0.7974772453308105, "learning_rate": 3.002484073559918e-05, "loss": 0.1505, "step": 27145 }, { "epoch": 2.4605764002175095, "grad_norm": 0.8526124954223633, "learning_rate": 3.001709301496332e-05, "loss": 0.1543, "step": 27150 }, { "epoch": 2.461029545042596, "grad_norm": 0.8260271549224854, "learning_rate": 3.0009344792215104e-05, "loss": 0.1704, "step": 27155 }, { "epoch": 2.461482689867682, "grad_norm": 0.6790786385536194, "learning_rate": 3.0001596068129967e-05, "loss": 0.1357, "step": 27160 }, { "epoch": 2.4619358346927678, "grad_norm": 0.7482728958129883, "learning_rate": 2.9993846843483408e-05, "loss": 0.1425, "step": 27165 }, { "epoch": 2.4623889795178537, "grad_norm": 0.9388378262519836, "learning_rate": 2.998609711905096e-05, "loss": 0.179, "step": 27170 }, { "epoch": 2.46284212434294, "grad_norm": 0.8409760594367981, "learning_rate": 2.9978346895608223e-05, "loss": 0.2001, "step": 27175 }, { "epoch": 2.463295269168026, "grad_norm": 0.917353093624115, "learning_rate": 2.997059617393086e-05, "loss": 0.1513, "step": 27180 }, { "epoch": 2.463748413993112, "grad_norm": 0.8340640068054199, "learning_rate": 2.9962844954794543e-05, "loss": 0.1586, "step": 27185 }, { "epoch": 2.4642015588181985, "grad_norm": 0.9238966703414917, "learning_rate": 2.995509323897503e-05, "loss": 0.177, "step": 27190 }, { "epoch": 2.4646547036432844, "grad_norm": 0.8141590356826782, "learning_rate": 2.9947341027248104e-05, "loss": 0.1779, "step": 27195 }, { "epoch": 2.4651078484683704, "grad_norm": 0.9456556439399719, "learning_rate": 2.9939588320389617e-05, "loss": 0.1761, "step": 27200 }, { "epoch": 2.4655609932934563, "grad_norm": 0.936211884021759, "learning_rate": 2.9931835119175465e-05, "loss": 0.1815, "step": 27205 }, { "epoch": 2.4660141381185428, "grad_norm": 0.9524610638618469, "learning_rate": 2.992408142438159e-05, "loss": 0.1425, "step": 27210 }, { "epoch": 2.4664672829436287, "grad_norm": 0.7711324691772461, "learning_rate": 2.9916327236783968e-05, "loss": 0.1703, "step": 27215 }, { "epoch": 2.4669204277687147, "grad_norm": 0.8308486938476562, "learning_rate": 2.9908572557158666e-05, "loss": 0.1474, "step": 27220 }, { "epoch": 2.467373572593801, "grad_norm": 0.836294949054718, "learning_rate": 2.990081738628176e-05, "loss": 0.1323, "step": 27225 }, { "epoch": 2.467826717418887, "grad_norm": 0.7609798312187195, "learning_rate": 2.9893061724929394e-05, "loss": 0.1593, "step": 27230 }, { "epoch": 2.468279862243973, "grad_norm": 0.8237544298171997, "learning_rate": 2.9885305573877758e-05, "loss": 0.1695, "step": 27235 }, { "epoch": 2.4687330070690594, "grad_norm": 0.8617951273918152, "learning_rate": 2.9877548933903082e-05, "loss": 0.1735, "step": 27240 }, { "epoch": 2.4691861518941454, "grad_norm": 0.8106963038444519, "learning_rate": 2.9869791805781665e-05, "loss": 0.1677, "step": 27245 }, { "epoch": 2.4696392967192313, "grad_norm": 0.7611016631126404, "learning_rate": 2.986203419028984e-05, "loss": 0.1441, "step": 27250 }, { "epoch": 2.4700924415443177, "grad_norm": 0.7885866761207581, "learning_rate": 2.9854276088203988e-05, "loss": 0.2021, "step": 27255 }, { "epoch": 2.4705455863694037, "grad_norm": 0.9072457551956177, "learning_rate": 2.984651750030054e-05, "loss": 0.1876, "step": 27260 }, { "epoch": 2.4709987311944897, "grad_norm": 0.9130061268806458, "learning_rate": 2.9838758427355984e-05, "loss": 0.2318, "step": 27265 }, { "epoch": 2.471451876019576, "grad_norm": 0.8598118424415588, "learning_rate": 2.9830998870146854e-05, "loss": 0.1833, "step": 27270 }, { "epoch": 2.471905020844662, "grad_norm": 0.8543215394020081, "learning_rate": 2.982323882944972e-05, "loss": 0.1221, "step": 27275 }, { "epoch": 2.472358165669748, "grad_norm": 0.8965583443641663, "learning_rate": 2.9815478306041212e-05, "loss": 0.173, "step": 27280 }, { "epoch": 2.4728113104948344, "grad_norm": 0.8653157353401184, "learning_rate": 2.9807717300698003e-05, "loss": 0.1654, "step": 27285 }, { "epoch": 2.4732644553199203, "grad_norm": 0.7928777933120728, "learning_rate": 2.9799955814196827e-05, "loss": 0.1606, "step": 27290 }, { "epoch": 2.4737176001450063, "grad_norm": 0.8053364157676697, "learning_rate": 2.9792193847314454e-05, "loss": 0.1832, "step": 27295 }, { "epoch": 2.4741707449700923, "grad_norm": 0.8879052996635437, "learning_rate": 2.97844314008277e-05, "loss": 0.2001, "step": 27300 }, { "epoch": 2.4746238897951787, "grad_norm": 0.9007254242897034, "learning_rate": 2.9776668475513426e-05, "loss": 0.1529, "step": 27305 }, { "epoch": 2.4750770346202646, "grad_norm": 0.8443492650985718, "learning_rate": 2.9768905072148555e-05, "loss": 0.1444, "step": 27310 }, { "epoch": 2.4755301794453506, "grad_norm": 0.9279047846794128, "learning_rate": 2.9761141191510068e-05, "loss": 0.1663, "step": 27315 }, { "epoch": 2.475983324270437, "grad_norm": 0.8388667106628418, "learning_rate": 2.975337683437495e-05, "loss": 0.1945, "step": 27320 }, { "epoch": 2.476436469095523, "grad_norm": 0.8138309717178345, "learning_rate": 2.9745612001520273e-05, "loss": 0.1642, "step": 27325 }, { "epoch": 2.476889613920609, "grad_norm": 0.7896280288696289, "learning_rate": 2.973784669372315e-05, "loss": 0.1928, "step": 27330 }, { "epoch": 2.4773427587456953, "grad_norm": 0.7219569087028503, "learning_rate": 2.9730080911760715e-05, "loss": 0.1452, "step": 27335 }, { "epoch": 2.4777959035707813, "grad_norm": 0.9372833967208862, "learning_rate": 2.9722314656410188e-05, "loss": 0.1567, "step": 27340 }, { "epoch": 2.4782490483958672, "grad_norm": 0.7138280868530273, "learning_rate": 2.9714547928448816e-05, "loss": 0.1799, "step": 27345 }, { "epoch": 2.478702193220953, "grad_norm": 0.8432759046554565, "learning_rate": 2.9706780728653893e-05, "loss": 0.1521, "step": 27350 }, { "epoch": 2.4791553380460396, "grad_norm": 0.9042105078697205, "learning_rate": 2.9699013057802762e-05, "loss": 0.1384, "step": 27355 }, { "epoch": 2.4796084828711256, "grad_norm": 0.861112654209137, "learning_rate": 2.9691244916672816e-05, "loss": 0.1458, "step": 27360 }, { "epoch": 2.4800616276962115, "grad_norm": 0.8759329319000244, "learning_rate": 2.968347630604149e-05, "loss": 0.1716, "step": 27365 }, { "epoch": 2.480514772521298, "grad_norm": 0.8033079504966736, "learning_rate": 2.9675707226686278e-05, "loss": 0.1577, "step": 27370 }, { "epoch": 2.480967917346384, "grad_norm": 0.8347320556640625, "learning_rate": 2.9667937679384712e-05, "loss": 0.1288, "step": 27375 }, { "epoch": 2.48142106217147, "grad_norm": 0.8271902799606323, "learning_rate": 2.9660167664914352e-05, "loss": 0.1871, "step": 27380 }, { "epoch": 2.4818742069965563, "grad_norm": 0.7678508758544922, "learning_rate": 2.9652397184052842e-05, "loss": 0.1488, "step": 27385 }, { "epoch": 2.482327351821642, "grad_norm": 0.7413695454597473, "learning_rate": 2.9644626237577843e-05, "loss": 0.1777, "step": 27390 }, { "epoch": 2.482780496646728, "grad_norm": 0.8184211254119873, "learning_rate": 2.9636854826267093e-05, "loss": 0.1574, "step": 27395 }, { "epoch": 2.4832336414718146, "grad_norm": 0.735659122467041, "learning_rate": 2.9629082950898334e-05, "loss": 0.1417, "step": 27400 }, { "epoch": 2.4836867862969005, "grad_norm": 0.7948119640350342, "learning_rate": 2.962131061224939e-05, "loss": 0.1422, "step": 27405 }, { "epoch": 2.4841399311219865, "grad_norm": 0.8690754771232605, "learning_rate": 2.9613537811098125e-05, "loss": 0.1719, "step": 27410 }, { "epoch": 2.484593075947073, "grad_norm": 0.8310465216636658, "learning_rate": 2.9605764548222433e-05, "loss": 0.1368, "step": 27415 }, { "epoch": 2.485046220772159, "grad_norm": 0.8739113211631775, "learning_rate": 2.9597990824400264e-05, "loss": 0.1625, "step": 27420 }, { "epoch": 2.485499365597245, "grad_norm": 0.8560107946395874, "learning_rate": 2.959021664040961e-05, "loss": 0.1514, "step": 27425 }, { "epoch": 2.485952510422331, "grad_norm": 0.8448775410652161, "learning_rate": 2.9582441997028542e-05, "loss": 0.1485, "step": 27430 }, { "epoch": 2.486405655247417, "grad_norm": 1.0096652507781982, "learning_rate": 2.9574666895035118e-05, "loss": 0.1566, "step": 27435 }, { "epoch": 2.486858800072503, "grad_norm": 0.8533732295036316, "learning_rate": 2.956689133520749e-05, "loss": 0.1569, "step": 27440 }, { "epoch": 2.487311944897589, "grad_norm": 0.884769082069397, "learning_rate": 2.9559115318323832e-05, "loss": 0.1503, "step": 27445 }, { "epoch": 2.4877650897226755, "grad_norm": 0.7399085164070129, "learning_rate": 2.955133884516237e-05, "loss": 0.1565, "step": 27450 }, { "epoch": 2.4882182345477615, "grad_norm": 0.791553258895874, "learning_rate": 2.9543561916501378e-05, "loss": 0.1404, "step": 27455 }, { "epoch": 2.4886713793728474, "grad_norm": 0.8816794753074646, "learning_rate": 2.9535784533119175e-05, "loss": 0.1518, "step": 27460 }, { "epoch": 2.489124524197934, "grad_norm": 0.7219662666320801, "learning_rate": 2.9528006695794126e-05, "loss": 0.1398, "step": 27465 }, { "epoch": 2.48957766902302, "grad_norm": 0.8184975981712341, "learning_rate": 2.9520228405304627e-05, "loss": 0.1303, "step": 27470 }, { "epoch": 2.4900308138481058, "grad_norm": 0.8653312921524048, "learning_rate": 2.9512449662429143e-05, "loss": 0.2019, "step": 27475 }, { "epoch": 2.4904839586731917, "grad_norm": 0.8035889863967896, "learning_rate": 2.9504670467946178e-05, "loss": 0.1434, "step": 27480 }, { "epoch": 2.490937103498278, "grad_norm": 0.9183624982833862, "learning_rate": 2.949689082263427e-05, "loss": 0.1586, "step": 27485 }, { "epoch": 2.491390248323364, "grad_norm": 0.8528918027877808, "learning_rate": 2.948911072727201e-05, "loss": 0.1538, "step": 27490 }, { "epoch": 2.49184339314845, "grad_norm": 0.759531557559967, "learning_rate": 2.9481330182638017e-05, "loss": 0.1544, "step": 27495 }, { "epoch": 2.4922965379735365, "grad_norm": 0.8254613280296326, "learning_rate": 2.947354918951099e-05, "loss": 0.1309, "step": 27500 }, { "epoch": 2.4927496827986224, "grad_norm": 0.9750409722328186, "learning_rate": 2.9465767748669655e-05, "loss": 0.1438, "step": 27505 }, { "epoch": 2.4932028276237084, "grad_norm": 0.9091053605079651, "learning_rate": 2.945798586089277e-05, "loss": 0.1712, "step": 27510 }, { "epoch": 2.4936559724487948, "grad_norm": 0.8776338696479797, "learning_rate": 2.945020352695916e-05, "loss": 0.1616, "step": 27515 }, { "epoch": 2.4941091172738807, "grad_norm": 0.8462930917739868, "learning_rate": 2.9442420747647664e-05, "loss": 0.1682, "step": 27520 }, { "epoch": 2.4945622620989667, "grad_norm": 0.8761798739433289, "learning_rate": 2.9434637523737208e-05, "loss": 0.1725, "step": 27525 }, { "epoch": 2.495015406924053, "grad_norm": 0.8499380350112915, "learning_rate": 2.942685385600672e-05, "loss": 0.145, "step": 27530 }, { "epoch": 2.495468551749139, "grad_norm": 0.9684653878211975, "learning_rate": 2.9419069745235206e-05, "loss": 0.1693, "step": 27535 }, { "epoch": 2.495921696574225, "grad_norm": 0.8126114010810852, "learning_rate": 2.94112851922017e-05, "loss": 0.1391, "step": 27540 }, { "epoch": 2.4963748413993114, "grad_norm": 0.7840297818183899, "learning_rate": 2.9403500197685273e-05, "loss": 0.1313, "step": 27545 }, { "epoch": 2.4968279862243974, "grad_norm": 0.9357712268829346, "learning_rate": 2.939571476246507e-05, "loss": 0.1664, "step": 27550 }, { "epoch": 2.4972811310494833, "grad_norm": 0.8165059685707092, "learning_rate": 2.9387928887320237e-05, "loss": 0.1554, "step": 27555 }, { "epoch": 2.4977342758745698, "grad_norm": 0.8006167411804199, "learning_rate": 2.9380142573030005e-05, "loss": 0.1488, "step": 27560 }, { "epoch": 2.4981874206996557, "grad_norm": 0.9394017457962036, "learning_rate": 2.937235582037362e-05, "loss": 0.1634, "step": 27565 }, { "epoch": 2.4986405655247417, "grad_norm": 0.9133169054985046, "learning_rate": 2.9364568630130384e-05, "loss": 0.1439, "step": 27570 }, { "epoch": 2.4990937103498276, "grad_norm": 0.8702112436294556, "learning_rate": 2.9356781003079654e-05, "loss": 0.1716, "step": 27575 }, { "epoch": 2.499546855174914, "grad_norm": 0.8201572299003601, "learning_rate": 2.9348992940000798e-05, "loss": 0.1408, "step": 27580 }, { "epoch": 2.5, "grad_norm": 0.8479902148246765, "learning_rate": 2.9341204441673266e-05, "loss": 0.1857, "step": 27585 }, { "epoch": 2.500453144825086, "grad_norm": 0.7183383703231812, "learning_rate": 2.9333415508876517e-05, "loss": 0.1504, "step": 27590 }, { "epoch": 2.500906289650172, "grad_norm": 0.8600064516067505, "learning_rate": 2.9325626142390084e-05, "loss": 0.1667, "step": 27595 }, { "epoch": 2.5013594344752583, "grad_norm": 0.704616129398346, "learning_rate": 2.9317836342993528e-05, "loss": 0.1377, "step": 27600 }, { "epoch": 2.5018125793003443, "grad_norm": 0.7733756899833679, "learning_rate": 2.931004611146645e-05, "loss": 0.1426, "step": 27605 }, { "epoch": 2.5022657241254302, "grad_norm": 0.7216371893882751, "learning_rate": 2.930225544858849e-05, "loss": 0.1627, "step": 27610 }, { "epoch": 2.5027188689505167, "grad_norm": 0.8521899580955505, "learning_rate": 2.9294464355139356e-05, "loss": 0.1438, "step": 27615 }, { "epoch": 2.5031720137756026, "grad_norm": 0.7590405344963074, "learning_rate": 2.928667283189878e-05, "loss": 0.15, "step": 27620 }, { "epoch": 2.5036251586006886, "grad_norm": 0.7883206009864807, "learning_rate": 2.9278880879646542e-05, "loss": 0.1433, "step": 27625 }, { "epoch": 2.504078303425775, "grad_norm": 0.8439908027648926, "learning_rate": 2.9271088499162457e-05, "loss": 0.1526, "step": 27630 }, { "epoch": 2.504531448250861, "grad_norm": 0.8357980847358704, "learning_rate": 2.9263295691226384e-05, "loss": 0.1685, "step": 27635 }, { "epoch": 2.504984593075947, "grad_norm": 0.8025311827659607, "learning_rate": 2.925550245661824e-05, "loss": 0.159, "step": 27640 }, { "epoch": 2.5054377379010333, "grad_norm": 0.8852629065513611, "learning_rate": 2.9247708796117973e-05, "loss": 0.1816, "step": 27645 }, { "epoch": 2.5058908827261193, "grad_norm": 0.7248457074165344, "learning_rate": 2.9239914710505577e-05, "loss": 0.1398, "step": 27650 }, { "epoch": 2.5063440275512052, "grad_norm": 0.8366149663925171, "learning_rate": 2.923212020056108e-05, "loss": 0.152, "step": 27655 }, { "epoch": 2.5067971723762916, "grad_norm": 0.8545560836791992, "learning_rate": 2.922432526706456e-05, "loss": 0.1609, "step": 27660 }, { "epoch": 2.5072503172013776, "grad_norm": 0.7863669395446777, "learning_rate": 2.921652991079614e-05, "loss": 0.1563, "step": 27665 }, { "epoch": 2.5077034620264635, "grad_norm": 0.9104048609733582, "learning_rate": 2.9208734132535987e-05, "loss": 0.1497, "step": 27670 }, { "epoch": 2.50815660685155, "grad_norm": 0.8293709754943848, "learning_rate": 2.9200937933064293e-05, "loss": 0.1382, "step": 27675 }, { "epoch": 2.508609751676636, "grad_norm": 0.8561192750930786, "learning_rate": 2.919314131316131e-05, "loss": 0.1567, "step": 27680 }, { "epoch": 2.509062896501722, "grad_norm": 0.8650340437889099, "learning_rate": 2.9185344273607336e-05, "loss": 0.1639, "step": 27685 }, { "epoch": 2.5095160413268083, "grad_norm": 0.8980063199996948, "learning_rate": 2.9177546815182688e-05, "loss": 0.1863, "step": 27690 }, { "epoch": 2.5099691861518942, "grad_norm": 0.8546443581581116, "learning_rate": 2.9169748938667734e-05, "loss": 0.1889, "step": 27695 }, { "epoch": 2.51042233097698, "grad_norm": 0.8447632789611816, "learning_rate": 2.9161950644842906e-05, "loss": 0.1793, "step": 27700 }, { "epoch": 2.5108754758020666, "grad_norm": 0.7497230768203735, "learning_rate": 2.9154151934488644e-05, "loss": 0.1588, "step": 27705 }, { "epoch": 2.5113286206271526, "grad_norm": 0.7590937614440918, "learning_rate": 2.9146352808385458e-05, "loss": 0.1396, "step": 27710 }, { "epoch": 2.5117817654522385, "grad_norm": 0.7605500817298889, "learning_rate": 2.9138553267313885e-05, "loss": 0.1669, "step": 27715 }, { "epoch": 2.5122349102773245, "grad_norm": 0.7945247292518616, "learning_rate": 2.9130753312054494e-05, "loss": 0.1325, "step": 27720 }, { "epoch": 2.512688055102411, "grad_norm": 0.8933020830154419, "learning_rate": 2.9122952943387916e-05, "loss": 0.1428, "step": 27725 }, { "epoch": 2.513141199927497, "grad_norm": 0.8664500117301941, "learning_rate": 2.9115152162094812e-05, "loss": 0.1562, "step": 27730 }, { "epoch": 2.513594344752583, "grad_norm": 0.8451887369155884, "learning_rate": 2.9107350968955895e-05, "loss": 0.1762, "step": 27735 }, { "epoch": 2.5140474895776688, "grad_norm": 0.8627873659133911, "learning_rate": 2.9099549364751898e-05, "loss": 0.147, "step": 27740 }, { "epoch": 2.514500634402755, "grad_norm": 0.9267680644989014, "learning_rate": 2.909174735026362e-05, "loss": 0.1647, "step": 27745 }, { "epoch": 2.514953779227841, "grad_norm": 0.9123027324676514, "learning_rate": 2.9083944926271878e-05, "loss": 0.1491, "step": 27750 }, { "epoch": 2.515406924052927, "grad_norm": 0.7750957608222961, "learning_rate": 2.907614209355755e-05, "loss": 0.1484, "step": 27755 }, { "epoch": 2.5158600688780135, "grad_norm": 0.7602953910827637, "learning_rate": 2.906833885290155e-05, "loss": 0.1314, "step": 27760 }, { "epoch": 2.5163132137030995, "grad_norm": 0.7753039002418518, "learning_rate": 2.9060535205084816e-05, "loss": 0.1527, "step": 27765 }, { "epoch": 2.5167663585281854, "grad_norm": 0.8968260288238525, "learning_rate": 2.9052731150888347e-05, "loss": 0.1596, "step": 27770 }, { "epoch": 2.517219503353272, "grad_norm": 0.7358638644218445, "learning_rate": 2.9044926691093167e-05, "loss": 0.1524, "step": 27775 }, { "epoch": 2.517672648178358, "grad_norm": 0.8248065710067749, "learning_rate": 2.903712182648036e-05, "loss": 0.1354, "step": 27780 }, { "epoch": 2.5181257930034437, "grad_norm": 0.8787135481834412, "learning_rate": 2.9029316557831048e-05, "loss": 0.1418, "step": 27785 }, { "epoch": 2.51857893782853, "grad_norm": 0.8704959154129028, "learning_rate": 2.9021510885926363e-05, "loss": 0.1483, "step": 27790 }, { "epoch": 2.519032082653616, "grad_norm": 0.833407461643219, "learning_rate": 2.9013704811547516e-05, "loss": 0.1627, "step": 27795 }, { "epoch": 2.519485227478702, "grad_norm": 0.7918851375579834, "learning_rate": 2.9005898335475724e-05, "loss": 0.1679, "step": 27800 }, { "epoch": 2.5199383723037885, "grad_norm": 0.8688015937805176, "learning_rate": 2.899809145849227e-05, "loss": 0.1663, "step": 27805 }, { "epoch": 2.5203915171288744, "grad_norm": 0.9398573637008667, "learning_rate": 2.8990284181378484e-05, "loss": 0.1564, "step": 27810 }, { "epoch": 2.5208446619539604, "grad_norm": 0.8742785453796387, "learning_rate": 2.89824765049157e-05, "loss": 0.1395, "step": 27815 }, { "epoch": 2.521297806779047, "grad_norm": 0.8596312999725342, "learning_rate": 2.8974668429885323e-05, "loss": 0.15, "step": 27820 }, { "epoch": 2.5217509516041328, "grad_norm": 0.8127084970474243, "learning_rate": 2.896685995706879e-05, "loss": 0.1338, "step": 27825 }, { "epoch": 2.5222040964292187, "grad_norm": 0.8561649918556213, "learning_rate": 2.8959051087247563e-05, "loss": 0.1378, "step": 27830 }, { "epoch": 2.522657241254305, "grad_norm": 0.9379816651344299, "learning_rate": 2.8951241821203173e-05, "loss": 0.1753, "step": 27835 }, { "epoch": 2.523110386079391, "grad_norm": 0.8215036392211914, "learning_rate": 2.8943432159717165e-05, "loss": 0.161, "step": 27840 }, { "epoch": 2.523563530904477, "grad_norm": 0.7831107974052429, "learning_rate": 2.8935622103571124e-05, "loss": 0.144, "step": 27845 }, { "epoch": 2.524016675729563, "grad_norm": 0.8780638575553894, "learning_rate": 2.8927811653546703e-05, "loss": 0.1437, "step": 27850 }, { "epoch": 2.5244698205546494, "grad_norm": 0.8063316941261292, "learning_rate": 2.8920000810425556e-05, "loss": 0.1627, "step": 27855 }, { "epoch": 2.5249229653797354, "grad_norm": 0.8025145530700684, "learning_rate": 2.891218957498941e-05, "loss": 0.1546, "step": 27860 }, { "epoch": 2.5253761102048213, "grad_norm": 0.8494036793708801, "learning_rate": 2.8904377948020008e-05, "loss": 0.1458, "step": 27865 }, { "epoch": 2.5258292550299073, "grad_norm": 0.6647771596908569, "learning_rate": 2.8896565930299137e-05, "loss": 0.1461, "step": 27870 }, { "epoch": 2.5262823998549937, "grad_norm": 0.7103942036628723, "learning_rate": 2.8888753522608637e-05, "loss": 0.1296, "step": 27875 }, { "epoch": 2.5267355446800797, "grad_norm": 0.8137333989143372, "learning_rate": 2.8880940725730364e-05, "loss": 0.1812, "step": 27880 }, { "epoch": 2.5271886895051656, "grad_norm": 1.0324915647506714, "learning_rate": 2.887312754044623e-05, "loss": 0.1763, "step": 27885 }, { "epoch": 2.527641834330252, "grad_norm": 0.8552796244621277, "learning_rate": 2.8865313967538182e-05, "loss": 0.1734, "step": 27890 }, { "epoch": 2.528094979155338, "grad_norm": 0.8270636796951294, "learning_rate": 2.8857500007788213e-05, "loss": 0.1688, "step": 27895 }, { "epoch": 2.528548123980424, "grad_norm": 0.8379900455474854, "learning_rate": 2.884968566197833e-05, "loss": 0.1377, "step": 27900 }, { "epoch": 2.5290012688055103, "grad_norm": 0.880416214466095, "learning_rate": 2.8841870930890612e-05, "loss": 0.1656, "step": 27905 }, { "epoch": 2.5294544136305963, "grad_norm": 0.7786189913749695, "learning_rate": 2.8834055815307148e-05, "loss": 0.1528, "step": 27910 }, { "epoch": 2.5299075584556823, "grad_norm": 0.7282492518424988, "learning_rate": 2.8826240316010084e-05, "loss": 0.1565, "step": 27915 }, { "epoch": 2.5303607032807687, "grad_norm": 0.6997399926185608, "learning_rate": 2.88184244337816e-05, "loss": 0.1405, "step": 27920 }, { "epoch": 2.5308138481058546, "grad_norm": 0.8619735240936279, "learning_rate": 2.8810608169403903e-05, "loss": 0.1624, "step": 27925 }, { "epoch": 2.5312669929309406, "grad_norm": 0.7896915674209595, "learning_rate": 2.880279152365925e-05, "loss": 0.1587, "step": 27930 }, { "epoch": 2.531720137756027, "grad_norm": 0.8338260650634766, "learning_rate": 2.8794974497329933e-05, "loss": 0.1583, "step": 27935 }, { "epoch": 2.532173282581113, "grad_norm": 0.8787974119186401, "learning_rate": 2.8787157091198293e-05, "loss": 0.1677, "step": 27940 }, { "epoch": 2.532626427406199, "grad_norm": 0.7502824068069458, "learning_rate": 2.877933930604669e-05, "loss": 0.1477, "step": 27945 }, { "epoch": 2.5330795722312853, "grad_norm": 0.8684325218200684, "learning_rate": 2.877152114265753e-05, "loss": 0.1633, "step": 27950 }, { "epoch": 2.5335327170563713, "grad_norm": 0.9228343367576599, "learning_rate": 2.8763702601813263e-05, "loss": 0.1785, "step": 27955 }, { "epoch": 2.5339858618814572, "grad_norm": 0.845976710319519, "learning_rate": 2.875588368429636e-05, "loss": 0.1559, "step": 27960 }, { "epoch": 2.5344390067065437, "grad_norm": 1.0192582607269287, "learning_rate": 2.8748064390889356e-05, "loss": 0.1714, "step": 27965 }, { "epoch": 2.5348921515316296, "grad_norm": 0.773658812046051, "learning_rate": 2.8740244722374793e-05, "loss": 0.1714, "step": 27970 }, { "epoch": 2.5353452963567156, "grad_norm": 0.7777646780014038, "learning_rate": 2.8732424679535274e-05, "loss": 0.1262, "step": 27975 }, { "epoch": 2.535798441181802, "grad_norm": 0.9309254288673401, "learning_rate": 2.8724604263153442e-05, "loss": 0.1748, "step": 27980 }, { "epoch": 2.536251586006888, "grad_norm": 0.7718258500099182, "learning_rate": 2.8716783474011942e-05, "loss": 0.1332, "step": 27985 }, { "epoch": 2.536704730831974, "grad_norm": 0.7710718512535095, "learning_rate": 2.8708962312893505e-05, "loss": 0.1346, "step": 27990 }, { "epoch": 2.53715787565706, "grad_norm": 0.9546815752983093, "learning_rate": 2.870114078058086e-05, "loss": 0.186, "step": 27995 }, { "epoch": 2.537611020482146, "grad_norm": 0.7810234427452087, "learning_rate": 2.86933188778568e-05, "loss": 0.1467, "step": 28000 }, { "epoch": 2.5380641653072322, "grad_norm": 0.9014552235603333, "learning_rate": 2.8685496605504136e-05, "loss": 0.1697, "step": 28005 }, { "epoch": 2.538517310132318, "grad_norm": 0.944490373134613, "learning_rate": 2.8677673964305724e-05, "loss": 0.1563, "step": 28010 }, { "epoch": 2.538970454957404, "grad_norm": 0.9526199102401733, "learning_rate": 2.8669850955044462e-05, "loss": 0.1661, "step": 28015 }, { "epoch": 2.5394235997824905, "grad_norm": 0.7644966244697571, "learning_rate": 2.866202757850327e-05, "loss": 0.1327, "step": 28020 }, { "epoch": 2.5398767446075765, "grad_norm": 0.9123398065567017, "learning_rate": 2.8654203835465126e-05, "loss": 0.142, "step": 28025 }, { "epoch": 2.5403298894326625, "grad_norm": 0.9145048260688782, "learning_rate": 2.8646379726713025e-05, "loss": 0.1564, "step": 28030 }, { "epoch": 2.540783034257749, "grad_norm": 0.8138489723205566, "learning_rate": 2.863855525303001e-05, "loss": 0.1514, "step": 28035 }, { "epoch": 2.541236179082835, "grad_norm": 0.8843850493431091, "learning_rate": 2.8630730415199154e-05, "loss": 0.1406, "step": 28040 }, { "epoch": 2.541689323907921, "grad_norm": 0.8070067167282104, "learning_rate": 2.862290521400357e-05, "loss": 0.1523, "step": 28045 }, { "epoch": 2.542142468733007, "grad_norm": 0.7589446306228638, "learning_rate": 2.8615079650226406e-05, "loss": 0.139, "step": 28050 }, { "epoch": 2.542595613558093, "grad_norm": 0.6867998242378235, "learning_rate": 2.860725372465085e-05, "loss": 0.1279, "step": 28055 }, { "epoch": 2.543048758383179, "grad_norm": 0.8432937264442444, "learning_rate": 2.8599427438060124e-05, "loss": 0.1408, "step": 28060 }, { "epoch": 2.5435019032082655, "grad_norm": 0.8286011815071106, "learning_rate": 2.8591600791237483e-05, "loss": 0.1797, "step": 28065 }, { "epoch": 2.5439550480333515, "grad_norm": 0.8254090547561646, "learning_rate": 2.8583773784966217e-05, "loss": 0.1386, "step": 28070 }, { "epoch": 2.5444081928584374, "grad_norm": 0.7391841411590576, "learning_rate": 2.857594642002966e-05, "loss": 0.1207, "step": 28075 }, { "epoch": 2.544861337683524, "grad_norm": 0.9631227254867554, "learning_rate": 2.856811869721117e-05, "loss": 0.1462, "step": 28080 }, { "epoch": 2.54531448250861, "grad_norm": 0.922681987285614, "learning_rate": 2.8560290617294167e-05, "loss": 0.1573, "step": 28085 }, { "epoch": 2.5457676273336958, "grad_norm": 0.7258879542350769, "learning_rate": 2.8552462181062072e-05, "loss": 0.1433, "step": 28090 }, { "epoch": 2.546220772158782, "grad_norm": 0.845966100692749, "learning_rate": 2.8544633389298363e-05, "loss": 0.135, "step": 28095 }, { "epoch": 2.546673916983868, "grad_norm": 0.8944917321205139, "learning_rate": 2.853680424278654e-05, "loss": 0.1525, "step": 28100 }, { "epoch": 2.547127061808954, "grad_norm": 0.7598668336868286, "learning_rate": 2.852897474231015e-05, "loss": 0.1355, "step": 28105 }, { "epoch": 2.5475802066340405, "grad_norm": 0.7769808769226074, "learning_rate": 2.8521144888652786e-05, "loss": 0.1426, "step": 28110 }, { "epoch": 2.5480333514591265, "grad_norm": 0.825910747051239, "learning_rate": 2.8513314682598047e-05, "loss": 0.1448, "step": 28115 }, { "epoch": 2.5484864962842124, "grad_norm": 0.8459782004356384, "learning_rate": 2.8505484124929584e-05, "loss": 0.1556, "step": 28120 }, { "epoch": 2.5489396411092984, "grad_norm": 0.8247504234313965, "learning_rate": 2.8497653216431082e-05, "loss": 0.1587, "step": 28125 }, { "epoch": 2.549392785934385, "grad_norm": 0.8652893304824829, "learning_rate": 2.8489821957886264e-05, "loss": 0.1393, "step": 28130 }, { "epoch": 2.5498459307594707, "grad_norm": 0.78905189037323, "learning_rate": 2.8481990350078892e-05, "loss": 0.1369, "step": 28135 }, { "epoch": 2.5502990755845567, "grad_norm": 0.8322566151618958, "learning_rate": 2.847415839379275e-05, "loss": 0.1434, "step": 28140 }, { "epoch": 2.5507522204096427, "grad_norm": 0.8913535475730896, "learning_rate": 2.8466326089811652e-05, "loss": 0.1467, "step": 28145 }, { "epoch": 2.551205365234729, "grad_norm": 0.7787355780601501, "learning_rate": 2.845849343891947e-05, "loss": 0.1356, "step": 28150 }, { "epoch": 2.551658510059815, "grad_norm": 0.8507276177406311, "learning_rate": 2.84506604419001e-05, "loss": 0.1418, "step": 28155 }, { "epoch": 2.552111654884901, "grad_norm": 0.8269304633140564, "learning_rate": 2.8442827099537463e-05, "loss": 0.1519, "step": 28160 }, { "epoch": 2.5525647997099874, "grad_norm": 0.8223332762718201, "learning_rate": 2.8434993412615528e-05, "loss": 0.151, "step": 28165 }, { "epoch": 2.5530179445350734, "grad_norm": 1.0145572423934937, "learning_rate": 2.8427159381918285e-05, "loss": 0.1618, "step": 28170 }, { "epoch": 2.5534710893601593, "grad_norm": 0.6681910753250122, "learning_rate": 2.841932500822978e-05, "loss": 0.1446, "step": 28175 }, { "epoch": 2.5539242341852457, "grad_norm": 0.8148435950279236, "learning_rate": 2.841149029233407e-05, "loss": 0.13, "step": 28180 }, { "epoch": 2.5543773790103317, "grad_norm": 0.8287680149078369, "learning_rate": 2.840365523501526e-05, "loss": 0.1537, "step": 28185 }, { "epoch": 2.5548305238354176, "grad_norm": 0.9264797568321228, "learning_rate": 2.8395819837057476e-05, "loss": 0.1583, "step": 28190 }, { "epoch": 2.555283668660504, "grad_norm": 0.7246888875961304, "learning_rate": 2.8387984099244902e-05, "loss": 0.1465, "step": 28195 }, { "epoch": 2.55573681348559, "grad_norm": 0.8716610670089722, "learning_rate": 2.838014802236173e-05, "loss": 0.1634, "step": 28200 }, { "epoch": 2.556189958310676, "grad_norm": 0.8596564531326294, "learning_rate": 2.837231160719221e-05, "loss": 0.1572, "step": 28205 }, { "epoch": 2.5566431031357624, "grad_norm": 0.7548962235450745, "learning_rate": 2.836447485452059e-05, "loss": 0.1435, "step": 28210 }, { "epoch": 2.5570962479608483, "grad_norm": 0.841044545173645, "learning_rate": 2.8356637765131194e-05, "loss": 0.1299, "step": 28215 }, { "epoch": 2.5575493927859343, "grad_norm": 0.872963011264801, "learning_rate": 2.8348800339808363e-05, "loss": 0.1336, "step": 28220 }, { "epoch": 2.5580025376110207, "grad_norm": 0.7381399273872375, "learning_rate": 2.834096257933646e-05, "loss": 0.1562, "step": 28225 }, { "epoch": 2.5584556824361067, "grad_norm": 0.8803527355194092, "learning_rate": 2.833312448449989e-05, "loss": 0.1831, "step": 28230 }, { "epoch": 2.5589088272611926, "grad_norm": 0.8666426539421082, "learning_rate": 2.83252860560831e-05, "loss": 0.1817, "step": 28235 }, { "epoch": 2.559361972086279, "grad_norm": 0.9046549201011658, "learning_rate": 2.8317447294870546e-05, "loss": 0.1496, "step": 28240 }, { "epoch": 2.559815116911365, "grad_norm": 0.8377737402915955, "learning_rate": 2.8309608201646746e-05, "loss": 0.1361, "step": 28245 }, { "epoch": 2.560268261736451, "grad_norm": 0.797372579574585, "learning_rate": 2.8301768777196246e-05, "loss": 0.1446, "step": 28250 }, { "epoch": 2.560721406561537, "grad_norm": 0.8256444334983826, "learning_rate": 2.829392902230361e-05, "loss": 0.1605, "step": 28255 }, { "epoch": 2.5611745513866233, "grad_norm": 0.9193214178085327, "learning_rate": 2.8286088937753443e-05, "loss": 0.1454, "step": 28260 }, { "epoch": 2.5616276962117093, "grad_norm": 0.75620037317276, "learning_rate": 2.8278248524330387e-05, "loss": 0.1409, "step": 28265 }, { "epoch": 2.5620808410367952, "grad_norm": 0.7334786057472229, "learning_rate": 2.8270407782819104e-05, "loss": 0.1468, "step": 28270 }, { "epoch": 2.562533985861881, "grad_norm": 0.7227643728256226, "learning_rate": 2.826256671400431e-05, "loss": 0.1438, "step": 28275 }, { "epoch": 2.5629871306869676, "grad_norm": 0.8973819613456726, "learning_rate": 2.8254725318670733e-05, "loss": 0.1754, "step": 28280 }, { "epoch": 2.5634402755120536, "grad_norm": 0.8132975101470947, "learning_rate": 2.8246883597603146e-05, "loss": 0.1579, "step": 28285 }, { "epoch": 2.5638934203371395, "grad_norm": 0.7188611030578613, "learning_rate": 2.8239041551586355e-05, "loss": 0.1557, "step": 28290 }, { "epoch": 2.564346565162226, "grad_norm": 0.7418354749679565, "learning_rate": 2.8231199181405183e-05, "loss": 0.1416, "step": 28295 }, { "epoch": 2.564799709987312, "grad_norm": 0.9055258631706238, "learning_rate": 2.822335648784451e-05, "loss": 0.1522, "step": 28300 }, { "epoch": 2.565252854812398, "grad_norm": 1.0795905590057373, "learning_rate": 2.821551347168923e-05, "loss": 0.1867, "step": 28305 }, { "epoch": 2.5657059996374842, "grad_norm": 0.7173680067062378, "learning_rate": 2.8207670133724272e-05, "loss": 0.1506, "step": 28310 }, { "epoch": 2.56615914446257, "grad_norm": 0.8210365772247314, "learning_rate": 2.819982647473461e-05, "loss": 0.1286, "step": 28315 }, { "epoch": 2.566612289287656, "grad_norm": 0.8446120619773865, "learning_rate": 2.819198249550522e-05, "loss": 0.1266, "step": 28320 }, { "epoch": 2.5670654341127426, "grad_norm": 0.7794247269630432, "learning_rate": 2.8184138196821154e-05, "loss": 0.1762, "step": 28325 }, { "epoch": 2.5675185789378285, "grad_norm": 0.8318856954574585, "learning_rate": 2.817629357946745e-05, "loss": 0.1618, "step": 28330 }, { "epoch": 2.5679717237629145, "grad_norm": 0.8348758220672607, "learning_rate": 2.8168448644229222e-05, "loss": 0.173, "step": 28335 }, { "epoch": 2.568424868588001, "grad_norm": 0.7461562156677246, "learning_rate": 2.816060339189158e-05, "loss": 0.1444, "step": 28340 }, { "epoch": 2.568878013413087, "grad_norm": 0.7621652483940125, "learning_rate": 2.8152757823239685e-05, "loss": 0.1382, "step": 28345 }, { "epoch": 2.569331158238173, "grad_norm": 0.8112903833389282, "learning_rate": 2.814491193905871e-05, "loss": 0.1643, "step": 28350 }, { "epoch": 2.5697843030632592, "grad_norm": 0.8640956878662109, "learning_rate": 2.813706574013389e-05, "loss": 0.1537, "step": 28355 }, { "epoch": 2.570237447888345, "grad_norm": 0.6623457074165344, "learning_rate": 2.8129219227250476e-05, "loss": 0.1406, "step": 28360 }, { "epoch": 2.570690592713431, "grad_norm": 0.7705540657043457, "learning_rate": 2.8121372401193742e-05, "loss": 0.1329, "step": 28365 }, { "epoch": 2.5711437375385175, "grad_norm": 0.8160174489021301, "learning_rate": 2.8113525262749003e-05, "loss": 0.1298, "step": 28370 }, { "epoch": 2.5715968823636035, "grad_norm": 0.7472044229507446, "learning_rate": 2.81056778127016e-05, "loss": 0.1263, "step": 28375 }, { "epoch": 2.5720500271886895, "grad_norm": 0.7754073739051819, "learning_rate": 2.8097830051836905e-05, "loss": 0.1493, "step": 28380 }, { "epoch": 2.572503172013776, "grad_norm": 0.8505250215530396, "learning_rate": 2.808998198094034e-05, "loss": 0.1443, "step": 28385 }, { "epoch": 2.572956316838862, "grad_norm": 0.7561440467834473, "learning_rate": 2.8082133600797332e-05, "loss": 0.1252, "step": 28390 }, { "epoch": 2.573409461663948, "grad_norm": 0.8956730365753174, "learning_rate": 2.8074284912193355e-05, "loss": 0.1551, "step": 28395 }, { "epoch": 2.5738626064890338, "grad_norm": 0.8016186952590942, "learning_rate": 2.8066435915913902e-05, "loss": 0.1504, "step": 28400 }, { "epoch": 2.5743157513141197, "grad_norm": 0.8190480470657349, "learning_rate": 2.8058586612744502e-05, "loss": 0.1375, "step": 28405 }, { "epoch": 2.574768896139206, "grad_norm": 0.7637748718261719, "learning_rate": 2.8050737003470724e-05, "loss": 0.1924, "step": 28410 }, { "epoch": 2.575222040964292, "grad_norm": 0.7746516466140747, "learning_rate": 2.8042887088878157e-05, "loss": 0.13, "step": 28415 }, { "epoch": 2.575675185789378, "grad_norm": 0.9659831523895264, "learning_rate": 2.8035036869752424e-05, "loss": 0.1654, "step": 28420 }, { "epoch": 2.5761283306144644, "grad_norm": 0.9202244877815247, "learning_rate": 2.8027186346879165e-05, "loss": 0.1713, "step": 28425 }, { "epoch": 2.5765814754395504, "grad_norm": 0.7724209427833557, "learning_rate": 2.801933552104408e-05, "loss": 0.1479, "step": 28430 }, { "epoch": 2.5770346202646364, "grad_norm": 0.7153626680374146, "learning_rate": 2.8011484393032873e-05, "loss": 0.1473, "step": 28435 }, { "epoch": 2.5774877650897228, "grad_norm": 0.9570797681808472, "learning_rate": 2.8003632963631286e-05, "loss": 0.1501, "step": 28440 }, { "epoch": 2.5779409099148087, "grad_norm": 0.8969792723655701, "learning_rate": 2.7995781233625106e-05, "loss": 0.1737, "step": 28445 }, { "epoch": 2.5783940547398947, "grad_norm": 0.85554039478302, "learning_rate": 2.798792920380012e-05, "loss": 0.1565, "step": 28450 }, { "epoch": 2.578847199564981, "grad_norm": 0.8540194630622864, "learning_rate": 2.7980076874942172e-05, "loss": 0.1618, "step": 28455 }, { "epoch": 2.579300344390067, "grad_norm": 0.938929557800293, "learning_rate": 2.797222424783712e-05, "loss": 0.1862, "step": 28460 }, { "epoch": 2.579753489215153, "grad_norm": 0.7623956203460693, "learning_rate": 2.7964371323270866e-05, "loss": 0.1338, "step": 28465 }, { "epoch": 2.5802066340402394, "grad_norm": 0.7451079487800598, "learning_rate": 2.795651810202932e-05, "loss": 0.1818, "step": 28470 }, { "epoch": 2.5806597788653254, "grad_norm": 0.7543916702270508, "learning_rate": 2.7948664584898453e-05, "loss": 0.1321, "step": 28475 }, { "epoch": 2.5811129236904113, "grad_norm": 0.8355231881141663, "learning_rate": 2.7940810772664233e-05, "loss": 0.1305, "step": 28480 }, { "epoch": 2.5815660685154977, "grad_norm": 0.8705804944038391, "learning_rate": 2.7932956666112674e-05, "loss": 0.1346, "step": 28485 }, { "epoch": 2.5820192133405837, "grad_norm": 0.9928923845291138, "learning_rate": 2.7925102266029823e-05, "loss": 0.1638, "step": 28490 }, { "epoch": 2.5824723581656697, "grad_norm": 0.8344063758850098, "learning_rate": 2.791724757320175e-05, "loss": 0.1497, "step": 28495 }, { "epoch": 2.582925502990756, "grad_norm": 0.8884432911872864, "learning_rate": 2.7909392588414553e-05, "loss": 0.1479, "step": 28500 }, { "epoch": 2.583378647815842, "grad_norm": 0.8980674147605896, "learning_rate": 2.7901537312454362e-05, "loss": 0.1367, "step": 28505 }, { "epoch": 2.583831792640928, "grad_norm": 0.9057635068893433, "learning_rate": 2.7893681746107337e-05, "loss": 0.139, "step": 28510 }, { "epoch": 2.5842849374660144, "grad_norm": 0.8511994481086731, "learning_rate": 2.7885825890159662e-05, "loss": 0.1271, "step": 28515 }, { "epoch": 2.5847380822911004, "grad_norm": 0.7397542595863342, "learning_rate": 2.7877969745397547e-05, "loss": 0.1347, "step": 28520 }, { "epoch": 2.5851912271161863, "grad_norm": 0.7489181160926819, "learning_rate": 2.787011331260726e-05, "loss": 0.1332, "step": 28525 }, { "epoch": 2.5856443719412723, "grad_norm": 0.8632702827453613, "learning_rate": 2.7862256592575063e-05, "loss": 0.1619, "step": 28530 }, { "epoch": 2.5860975167663587, "grad_norm": 0.7884047627449036, "learning_rate": 2.785439958608726e-05, "loss": 0.1666, "step": 28535 }, { "epoch": 2.5865506615914446, "grad_norm": 0.6880090832710266, "learning_rate": 2.7846542293930174e-05, "loss": 0.156, "step": 28540 }, { "epoch": 2.5870038064165306, "grad_norm": 0.9081429839134216, "learning_rate": 2.7838684716890168e-05, "loss": 0.1434, "step": 28545 }, { "epoch": 2.5874569512416166, "grad_norm": 0.7481025457382202, "learning_rate": 2.7830826855753645e-05, "loss": 0.142, "step": 28550 }, { "epoch": 2.587910096066703, "grad_norm": 0.9314320087432861, "learning_rate": 2.7822968711307017e-05, "loss": 0.1518, "step": 28555 }, { "epoch": 2.588363240891789, "grad_norm": 0.6285427808761597, "learning_rate": 2.781511028433672e-05, "loss": 0.1454, "step": 28560 }, { "epoch": 2.588816385716875, "grad_norm": 0.8176143169403076, "learning_rate": 2.7807251575629235e-05, "loss": 0.1633, "step": 28565 }, { "epoch": 2.5892695305419613, "grad_norm": 0.7867632508277893, "learning_rate": 2.7799392585971055e-05, "loss": 0.1174, "step": 28570 }, { "epoch": 2.5897226753670473, "grad_norm": 0.8430165648460388, "learning_rate": 2.7791533316148732e-05, "loss": 0.1568, "step": 28575 }, { "epoch": 2.590175820192133, "grad_norm": 0.7544487714767456, "learning_rate": 2.778367376694881e-05, "loss": 0.1367, "step": 28580 }, { "epoch": 2.5906289650172196, "grad_norm": 0.9846712350845337, "learning_rate": 2.7775813939157868e-05, "loss": 0.1747, "step": 28585 }, { "epoch": 2.5910821098423056, "grad_norm": 0.9664506316184998, "learning_rate": 2.7767953833562532e-05, "loss": 0.146, "step": 28590 }, { "epoch": 2.5915352546673915, "grad_norm": 0.7605307102203369, "learning_rate": 2.7760093450949438e-05, "loss": 0.1523, "step": 28595 }, { "epoch": 2.591988399492478, "grad_norm": 0.8218874335289001, "learning_rate": 2.7752232792105265e-05, "loss": 0.1296, "step": 28600 }, { "epoch": 2.592441544317564, "grad_norm": 0.7047369480133057, "learning_rate": 2.7744371857816702e-05, "loss": 0.1368, "step": 28605 }, { "epoch": 2.59289468914265, "grad_norm": 0.8837043642997742, "learning_rate": 2.7736510648870472e-05, "loss": 0.145, "step": 28610 }, { "epoch": 2.5933478339677363, "grad_norm": 0.8276644349098206, "learning_rate": 2.7728649166053333e-05, "loss": 0.1441, "step": 28615 }, { "epoch": 2.5938009787928222, "grad_norm": 0.860105037689209, "learning_rate": 2.7720787410152067e-05, "loss": 0.1364, "step": 28620 }, { "epoch": 2.594254123617908, "grad_norm": 0.8733716011047363, "learning_rate": 2.771292538195347e-05, "loss": 0.1318, "step": 28625 }, { "epoch": 2.5947072684429946, "grad_norm": 0.9157313704490662, "learning_rate": 2.770506308224439e-05, "loss": 0.1895, "step": 28630 }, { "epoch": 2.5951604132680806, "grad_norm": 0.799434244632721, "learning_rate": 2.7697200511811676e-05, "loss": 0.135, "step": 28635 }, { "epoch": 2.5956135580931665, "grad_norm": 0.8216500282287598, "learning_rate": 2.768933767144223e-05, "loss": 0.1431, "step": 28640 }, { "epoch": 2.596066702918253, "grad_norm": 0.7941978573799133, "learning_rate": 2.768147456192296e-05, "loss": 0.1602, "step": 28645 }, { "epoch": 2.596519847743339, "grad_norm": 0.8997818827629089, "learning_rate": 2.7673611184040804e-05, "loss": 0.1618, "step": 28650 }, { "epoch": 2.596972992568425, "grad_norm": 0.8768215775489807, "learning_rate": 2.7665747538582735e-05, "loss": 0.1689, "step": 28655 }, { "epoch": 2.597426137393511, "grad_norm": 0.8659344911575317, "learning_rate": 2.7657883626335757e-05, "loss": 0.1587, "step": 28660 }, { "epoch": 2.597879282218597, "grad_norm": 0.780453085899353, "learning_rate": 2.765001944808689e-05, "loss": 0.1433, "step": 28665 }, { "epoch": 2.598332427043683, "grad_norm": 0.9083179831504822, "learning_rate": 2.7642155004623178e-05, "loss": 0.1755, "step": 28670 }, { "epoch": 2.598785571868769, "grad_norm": 0.8898298740386963, "learning_rate": 2.7634290296731698e-05, "loss": 0.1426, "step": 28675 }, { "epoch": 2.599238716693855, "grad_norm": 0.8212187886238098, "learning_rate": 2.7626425325199552e-05, "loss": 0.1726, "step": 28680 }, { "epoch": 2.5996918615189415, "grad_norm": 0.8712775111198425, "learning_rate": 2.7618560090813872e-05, "loss": 0.1506, "step": 28685 }, { "epoch": 2.6001450063440275, "grad_norm": 0.7868581414222717, "learning_rate": 2.761069459436182e-05, "loss": 0.1605, "step": 28690 }, { "epoch": 2.6005981511691134, "grad_norm": 0.749912440776825, "learning_rate": 2.760282883663057e-05, "loss": 0.1492, "step": 28695 }, { "epoch": 2.6010512959942, "grad_norm": 0.7823379039764404, "learning_rate": 2.759496281840733e-05, "loss": 0.1459, "step": 28700 }, { "epoch": 2.601504440819286, "grad_norm": 0.7669755220413208, "learning_rate": 2.7587096540479324e-05, "loss": 0.167, "step": 28705 }, { "epoch": 2.6019575856443717, "grad_norm": 0.712775707244873, "learning_rate": 2.7579230003633826e-05, "loss": 0.1466, "step": 28710 }, { "epoch": 2.602410730469458, "grad_norm": 0.8616892695426941, "learning_rate": 2.7571363208658117e-05, "loss": 0.165, "step": 28715 }, { "epoch": 2.602863875294544, "grad_norm": 0.8860467672348022, "learning_rate": 2.7563496156339518e-05, "loss": 0.1519, "step": 28720 }, { "epoch": 2.60331702011963, "grad_norm": 0.9058404564857483, "learning_rate": 2.7555628847465348e-05, "loss": 0.147, "step": 28725 }, { "epoch": 2.6037701649447165, "grad_norm": 0.7373528480529785, "learning_rate": 2.754776128282298e-05, "loss": 0.1252, "step": 28730 }, { "epoch": 2.6042233097698024, "grad_norm": 0.7773388624191284, "learning_rate": 2.75398934631998e-05, "loss": 0.1446, "step": 28735 }, { "epoch": 2.6046764545948884, "grad_norm": 0.707069456577301, "learning_rate": 2.7532025389383233e-05, "loss": 0.1119, "step": 28740 }, { "epoch": 2.605129599419975, "grad_norm": 0.8795504570007324, "learning_rate": 2.7524157062160698e-05, "loss": 0.1517, "step": 28745 }, { "epoch": 2.6055827442450608, "grad_norm": 0.7925854325294495, "learning_rate": 2.7516288482319673e-05, "loss": 0.1442, "step": 28750 }, { "epoch": 2.6060358890701467, "grad_norm": 0.8676121830940247, "learning_rate": 2.7508419650647655e-05, "loss": 0.127, "step": 28755 }, { "epoch": 2.606489033895233, "grad_norm": 0.8376127481460571, "learning_rate": 2.7500550567932137e-05, "loss": 0.1653, "step": 28760 }, { "epoch": 2.606942178720319, "grad_norm": 0.7752143144607544, "learning_rate": 2.749268123496068e-05, "loss": 0.1559, "step": 28765 }, { "epoch": 2.607395323545405, "grad_norm": 0.843512237071991, "learning_rate": 2.748481165252084e-05, "loss": 0.1348, "step": 28770 }, { "epoch": 2.6078484683704914, "grad_norm": 0.8904138207435608, "learning_rate": 2.7476941821400216e-05, "loss": 0.1434, "step": 28775 }, { "epoch": 2.6083016131955774, "grad_norm": 0.7635827660560608, "learning_rate": 2.7469071742386416e-05, "loss": 0.1375, "step": 28780 }, { "epoch": 2.6087547580206634, "grad_norm": 0.7220375537872314, "learning_rate": 2.7461201416267078e-05, "loss": 0.1792, "step": 28785 }, { "epoch": 2.6092079028457498, "grad_norm": 0.730597734451294, "learning_rate": 2.7453330843829867e-05, "loss": 0.1359, "step": 28790 }, { "epoch": 2.6096610476708357, "grad_norm": 0.8037050366401672, "learning_rate": 2.7445460025862478e-05, "loss": 0.1572, "step": 28795 }, { "epoch": 2.6101141924959217, "grad_norm": 0.9002293944358826, "learning_rate": 2.7437588963152626e-05, "loss": 0.1326, "step": 28800 }, { "epoch": 2.6105673373210077, "grad_norm": 0.7174371480941772, "learning_rate": 2.7429717656488047e-05, "loss": 0.1382, "step": 28805 }, { "epoch": 2.611020482146094, "grad_norm": 0.7367898225784302, "learning_rate": 2.7421846106656503e-05, "loss": 0.1408, "step": 28810 }, { "epoch": 2.61147362697118, "grad_norm": 0.7611013650894165, "learning_rate": 2.7413974314445777e-05, "loss": 0.1318, "step": 28815 }, { "epoch": 2.611926771796266, "grad_norm": 0.7974714040756226, "learning_rate": 2.7406102280643686e-05, "loss": 0.1543, "step": 28820 }, { "epoch": 2.612379916621352, "grad_norm": 0.9952753186225891, "learning_rate": 2.739823000603807e-05, "loss": 0.1328, "step": 28825 }, { "epoch": 2.6128330614464383, "grad_norm": 0.8162652254104614, "learning_rate": 2.7390357491416786e-05, "loss": 0.1303, "step": 28830 }, { "epoch": 2.6132862062715243, "grad_norm": 0.7548615336418152, "learning_rate": 2.7382484737567715e-05, "loss": 0.1238, "step": 28835 }, { "epoch": 2.6137393510966103, "grad_norm": 0.7816554307937622, "learning_rate": 2.737461174527876e-05, "loss": 0.1241, "step": 28840 }, { "epoch": 2.6141924959216967, "grad_norm": 0.866503119468689, "learning_rate": 2.7366738515337858e-05, "loss": 0.1569, "step": 28845 }, { "epoch": 2.6146456407467826, "grad_norm": 0.786319375038147, "learning_rate": 2.7358865048532973e-05, "loss": 0.1482, "step": 28850 }, { "epoch": 2.6150987855718686, "grad_norm": 0.7493298649787903, "learning_rate": 2.7350991345652077e-05, "loss": 0.1523, "step": 28855 }, { "epoch": 2.615551930396955, "grad_norm": 0.8404673933982849, "learning_rate": 2.7343117407483172e-05, "loss": 0.1624, "step": 28860 }, { "epoch": 2.616005075222041, "grad_norm": 0.9105881452560425, "learning_rate": 2.7335243234814277e-05, "loss": 0.1419, "step": 28865 }, { "epoch": 2.616458220047127, "grad_norm": 0.8727304935455322, "learning_rate": 2.7327368828433453e-05, "loss": 0.1277, "step": 28870 }, { "epoch": 2.6169113648722133, "grad_norm": 0.804707407951355, "learning_rate": 2.7319494189128768e-05, "loss": 0.1606, "step": 28875 }, { "epoch": 2.6173645096972993, "grad_norm": 0.8339704275131226, "learning_rate": 2.731161931768833e-05, "loss": 0.1654, "step": 28880 }, { "epoch": 2.6178176545223852, "grad_norm": 0.7329429388046265, "learning_rate": 2.7303744214900246e-05, "loss": 0.1248, "step": 28885 }, { "epoch": 2.6182707993474716, "grad_norm": 0.9514808058738708, "learning_rate": 2.7295868881552657e-05, "loss": 0.1645, "step": 28890 }, { "epoch": 2.6187239441725576, "grad_norm": 0.7743392586708069, "learning_rate": 2.7287993318433737e-05, "loss": 0.1672, "step": 28895 }, { "epoch": 2.6191770889976436, "grad_norm": 0.8669340014457703, "learning_rate": 2.7280117526331667e-05, "loss": 0.1473, "step": 28900 }, { "epoch": 2.61963023382273, "grad_norm": 0.7745627760887146, "learning_rate": 2.727224150603467e-05, "loss": 0.1407, "step": 28905 }, { "epoch": 2.620083378647816, "grad_norm": 0.8268622159957886, "learning_rate": 2.726436525833097e-05, "loss": 0.1519, "step": 28910 }, { "epoch": 2.620536523472902, "grad_norm": 0.8962649703025818, "learning_rate": 2.7256488784008834e-05, "loss": 0.1159, "step": 28915 }, { "epoch": 2.6209896682979883, "grad_norm": 0.770439088344574, "learning_rate": 2.724861208385654e-05, "loss": 0.1188, "step": 28920 }, { "epoch": 2.6214428131230743, "grad_norm": 0.9898951649665833, "learning_rate": 2.7240735158662377e-05, "loss": 0.1477, "step": 28925 }, { "epoch": 2.62189595794816, "grad_norm": 0.7762460708618164, "learning_rate": 2.7232858009214695e-05, "loss": 0.1413, "step": 28930 }, { "epoch": 2.622349102773246, "grad_norm": 0.8778555393218994, "learning_rate": 2.7224980636301822e-05, "loss": 0.1515, "step": 28935 }, { "epoch": 2.6228022475983326, "grad_norm": 0.7272727489471436, "learning_rate": 2.721710304071214e-05, "loss": 0.124, "step": 28940 }, { "epoch": 2.6232553924234185, "grad_norm": 0.8195849061012268, "learning_rate": 2.7209225223234038e-05, "loss": 0.1554, "step": 28945 }, { "epoch": 2.6237085372485045, "grad_norm": 0.9273531436920166, "learning_rate": 2.720134718465592e-05, "loss": 0.161, "step": 28950 }, { "epoch": 2.6241616820735905, "grad_norm": 0.8442899584770203, "learning_rate": 2.719346892576623e-05, "loss": 0.1537, "step": 28955 }, { "epoch": 2.624614826898677, "grad_norm": 0.8579908013343811, "learning_rate": 2.7185590447353433e-05, "loss": 0.1411, "step": 28960 }, { "epoch": 2.625067971723763, "grad_norm": 0.8084115386009216, "learning_rate": 2.7177711750206008e-05, "loss": 0.167, "step": 28965 }, { "epoch": 2.625521116548849, "grad_norm": 0.7673989534378052, "learning_rate": 2.7169832835112452e-05, "loss": 0.1505, "step": 28970 }, { "epoch": 2.625974261373935, "grad_norm": 0.9304232597351074, "learning_rate": 2.71619537028613e-05, "loss": 0.1685, "step": 28975 }, { "epoch": 2.626427406199021, "grad_norm": 0.9111407995223999, "learning_rate": 2.7154074354241078e-05, "loss": 0.1555, "step": 28980 }, { "epoch": 2.626880551024107, "grad_norm": 0.7605234384536743, "learning_rate": 2.714619479004037e-05, "loss": 0.1161, "step": 28985 }, { "epoch": 2.6273336958491935, "grad_norm": 0.9415959119796753, "learning_rate": 2.7138315011047764e-05, "loss": 0.1395, "step": 28990 }, { "epoch": 2.6277868406742795, "grad_norm": 0.6845701336860657, "learning_rate": 2.713043501805187e-05, "loss": 0.1283, "step": 28995 }, { "epoch": 2.6282399854993654, "grad_norm": 0.702918529510498, "learning_rate": 2.712255481184132e-05, "loss": 0.1392, "step": 29000 }, { "epoch": 2.628693130324452, "grad_norm": 0.9968546628952026, "learning_rate": 2.7114674393204765e-05, "loss": 0.1516, "step": 29005 }, { "epoch": 2.629146275149538, "grad_norm": 0.6771713495254517, "learning_rate": 2.7106793762930877e-05, "loss": 0.1365, "step": 29010 }, { "epoch": 2.6295994199746238, "grad_norm": 0.8159477114677429, "learning_rate": 2.709891292180836e-05, "loss": 0.1336, "step": 29015 }, { "epoch": 2.63005256479971, "grad_norm": 0.9291574358940125, "learning_rate": 2.7091031870625932e-05, "loss": 0.163, "step": 29020 }, { "epoch": 2.630505709624796, "grad_norm": 0.8901233077049255, "learning_rate": 2.7083150610172332e-05, "loss": 0.1558, "step": 29025 }, { "epoch": 2.630958854449882, "grad_norm": 0.8027275204658508, "learning_rate": 2.7075269141236303e-05, "loss": 0.1304, "step": 29030 }, { "epoch": 2.6314119992749685, "grad_norm": 0.861175537109375, "learning_rate": 2.7067387464606637e-05, "loss": 0.1575, "step": 29035 }, { "epoch": 2.6318651441000545, "grad_norm": 0.7264774441719055, "learning_rate": 2.705950558107214e-05, "loss": 0.116, "step": 29040 }, { "epoch": 2.6323182889251404, "grad_norm": 0.828836977481842, "learning_rate": 2.7051623491421634e-05, "loss": 0.1231, "step": 29045 }, { "epoch": 2.632771433750227, "grad_norm": 0.912580668926239, "learning_rate": 2.7043741196443944e-05, "loss": 0.1755, "step": 29050 }, { "epoch": 2.633224578575313, "grad_norm": 0.7469333410263062, "learning_rate": 2.703585869692795e-05, "loss": 0.138, "step": 29055 }, { "epoch": 2.6336777234003987, "grad_norm": 0.741617739200592, "learning_rate": 2.7027975993662534e-05, "loss": 0.1295, "step": 29060 }, { "epoch": 2.6341308682254847, "grad_norm": 0.8309171795845032, "learning_rate": 2.702009308743659e-05, "loss": 0.1547, "step": 29065 }, { "epoch": 2.634584013050571, "grad_norm": 0.6632031798362732, "learning_rate": 2.7012209979039044e-05, "loss": 0.1281, "step": 29070 }, { "epoch": 2.635037157875657, "grad_norm": 0.8175519108772278, "learning_rate": 2.7004326669258844e-05, "loss": 0.1552, "step": 29075 }, { "epoch": 2.635490302700743, "grad_norm": 0.8841816186904907, "learning_rate": 2.6996443158884955e-05, "loss": 0.1555, "step": 29080 }, { "epoch": 2.635943447525829, "grad_norm": 0.9623150825500488, "learning_rate": 2.698855944870636e-05, "loss": 0.1608, "step": 29085 }, { "epoch": 2.6363965923509154, "grad_norm": 0.8978971838951111, "learning_rate": 2.698067553951206e-05, "loss": 0.1489, "step": 29090 }, { "epoch": 2.6368497371760014, "grad_norm": 0.9677059054374695, "learning_rate": 2.6972791432091082e-05, "loss": 0.1187, "step": 29095 }, { "epoch": 2.6373028820010873, "grad_norm": 0.8347237706184387, "learning_rate": 2.696490712723248e-05, "loss": 0.169, "step": 29100 }, { "epoch": 2.6377560268261737, "grad_norm": 0.8112174868583679, "learning_rate": 2.6957022625725305e-05, "loss": 0.163, "step": 29105 }, { "epoch": 2.6382091716512597, "grad_norm": 0.8307750225067139, "learning_rate": 2.694913792835864e-05, "loss": 0.1878, "step": 29110 }, { "epoch": 2.6386623164763456, "grad_norm": 0.8857294917106628, "learning_rate": 2.6941253035921594e-05, "loss": 0.1572, "step": 29115 }, { "epoch": 2.639115461301432, "grad_norm": 0.8128360509872437, "learning_rate": 2.693336794920328e-05, "loss": 0.1249, "step": 29120 }, { "epoch": 2.639568606126518, "grad_norm": 0.7737478613853455, "learning_rate": 2.6925482668992862e-05, "loss": 0.1445, "step": 29125 }, { "epoch": 2.640021750951604, "grad_norm": 0.8019301295280457, "learning_rate": 2.691759719607948e-05, "loss": 0.1501, "step": 29130 }, { "epoch": 2.6404748957766904, "grad_norm": 1.115870714187622, "learning_rate": 2.690971153125233e-05, "loss": 0.1484, "step": 29135 }, { "epoch": 2.6409280406017763, "grad_norm": 0.7954892516136169, "learning_rate": 2.6901825675300597e-05, "loss": 0.1414, "step": 29140 }, { "epoch": 2.6413811854268623, "grad_norm": 0.7142674326896667, "learning_rate": 2.6893939629013497e-05, "loss": 0.1273, "step": 29145 }, { "epoch": 2.6418343302519487, "grad_norm": 0.977094829082489, "learning_rate": 2.6886053393180295e-05, "loss": 0.1472, "step": 29150 }, { "epoch": 2.6422874750770347, "grad_norm": 0.7379287481307983, "learning_rate": 2.687816696859023e-05, "loss": 0.1449, "step": 29155 }, { "epoch": 2.6427406199021206, "grad_norm": 0.7697951197624207, "learning_rate": 2.6870280356032577e-05, "loss": 0.1269, "step": 29160 }, { "epoch": 2.643193764727207, "grad_norm": 0.8334922194480896, "learning_rate": 2.6862393556296633e-05, "loss": 0.1706, "step": 29165 }, { "epoch": 2.643646909552293, "grad_norm": 0.8476558923721313, "learning_rate": 2.6854506570171716e-05, "loss": 0.1788, "step": 29170 }, { "epoch": 2.644100054377379, "grad_norm": 0.9085559248924255, "learning_rate": 2.6846619398447148e-05, "loss": 0.1519, "step": 29175 }, { "epoch": 2.6445531992024653, "grad_norm": 0.9320864081382751, "learning_rate": 2.683873204191229e-05, "loss": 0.1243, "step": 29180 }, { "epoch": 2.6450063440275513, "grad_norm": 0.7725174427032471, "learning_rate": 2.6830844501356512e-05, "loss": 0.1469, "step": 29185 }, { "epoch": 2.6454594888526373, "grad_norm": 0.8143424391746521, "learning_rate": 2.6822956777569185e-05, "loss": 0.1682, "step": 29190 }, { "epoch": 2.6459126336777237, "grad_norm": 0.7774080038070679, "learning_rate": 2.6815068871339743e-05, "loss": 0.194, "step": 29195 }, { "epoch": 2.6463657785028096, "grad_norm": 0.826740026473999, "learning_rate": 2.680718078345758e-05, "loss": 0.1392, "step": 29200 }, { "epoch": 2.6468189233278956, "grad_norm": 0.7124971747398376, "learning_rate": 2.6799292514712165e-05, "loss": 0.1379, "step": 29205 }, { "epoch": 2.6472720681529816, "grad_norm": 0.9854990839958191, "learning_rate": 2.6791404065892946e-05, "loss": 0.1572, "step": 29210 }, { "epoch": 2.647725212978068, "grad_norm": 0.9441239237785339, "learning_rate": 2.6783515437789392e-05, "loss": 0.1479, "step": 29215 }, { "epoch": 2.648178357803154, "grad_norm": 0.776623547077179, "learning_rate": 2.6775626631191018e-05, "loss": 0.1499, "step": 29220 }, { "epoch": 2.64863150262824, "grad_norm": 0.8194611668586731, "learning_rate": 2.6767737646887325e-05, "loss": 0.1671, "step": 29225 }, { "epoch": 2.649084647453326, "grad_norm": 0.8101444840431213, "learning_rate": 2.675984848566786e-05, "loss": 0.1564, "step": 29230 }, { "epoch": 2.6495377922784122, "grad_norm": 0.7390269637107849, "learning_rate": 2.6751959148322152e-05, "loss": 0.1602, "step": 29235 }, { "epoch": 2.649990937103498, "grad_norm": 0.9389563798904419, "learning_rate": 2.674406963563979e-05, "loss": 0.1368, "step": 29240 }, { "epoch": 2.650444081928584, "grad_norm": 0.7928979396820068, "learning_rate": 2.673617994841035e-05, "loss": 0.1551, "step": 29245 }, { "epoch": 2.6508972267536706, "grad_norm": 1.211255669593811, "learning_rate": 2.6728290087423435e-05, "loss": 0.1823, "step": 29250 }, { "epoch": 2.6513503715787565, "grad_norm": 0.7924467921257019, "learning_rate": 2.6720400053468652e-05, "loss": 0.1513, "step": 29255 }, { "epoch": 2.6518035164038425, "grad_norm": 0.8548313975334167, "learning_rate": 2.6712509847335653e-05, "loss": 0.1409, "step": 29260 }, { "epoch": 2.652256661228929, "grad_norm": 0.7501570582389832, "learning_rate": 2.6704619469814096e-05, "loss": 0.1303, "step": 29265 }, { "epoch": 2.652709806054015, "grad_norm": 0.7903860211372375, "learning_rate": 2.6696728921693648e-05, "loss": 0.1314, "step": 29270 }, { "epoch": 2.653162950879101, "grad_norm": 0.6971772909164429, "learning_rate": 2.6688838203763992e-05, "loss": 0.1539, "step": 29275 }, { "epoch": 2.653616095704187, "grad_norm": 0.8969376683235168, "learning_rate": 2.6680947316814835e-05, "loss": 0.1375, "step": 29280 }, { "epoch": 2.654069240529273, "grad_norm": 0.901101291179657, "learning_rate": 2.6673056261635904e-05, "loss": 0.1543, "step": 29285 }, { "epoch": 2.654522385354359, "grad_norm": 0.8564673662185669, "learning_rate": 2.6665165039016937e-05, "loss": 0.149, "step": 29290 }, { "epoch": 2.6549755301794455, "grad_norm": 0.8716834187507629, "learning_rate": 2.6657273649747694e-05, "loss": 0.1363, "step": 29295 }, { "epoch": 2.6554286750045315, "grad_norm": 0.7861490249633789, "learning_rate": 2.664938209461794e-05, "loss": 0.1586, "step": 29300 }, { "epoch": 2.6558818198296175, "grad_norm": 0.7387974262237549, "learning_rate": 2.6641490374417465e-05, "loss": 0.1298, "step": 29305 }, { "epoch": 2.656334964654704, "grad_norm": 0.8838900327682495, "learning_rate": 2.6633598489936077e-05, "loss": 0.1578, "step": 29310 }, { "epoch": 2.65678810947979, "grad_norm": 0.8734526038169861, "learning_rate": 2.662570644196361e-05, "loss": 0.1581, "step": 29315 }, { "epoch": 2.657241254304876, "grad_norm": 0.8682892322540283, "learning_rate": 2.6617814231289888e-05, "loss": 0.1555, "step": 29320 }, { "epoch": 2.657694399129962, "grad_norm": 0.7796434164047241, "learning_rate": 2.660992185870477e-05, "loss": 0.1256, "step": 29325 }, { "epoch": 2.658147543955048, "grad_norm": 0.787380039691925, "learning_rate": 2.660202932499813e-05, "loss": 0.124, "step": 29330 }, { "epoch": 2.658600688780134, "grad_norm": 0.8607234358787537, "learning_rate": 2.6594136630959855e-05, "loss": 0.1509, "step": 29335 }, { "epoch": 2.65905383360522, "grad_norm": 0.8016408085823059, "learning_rate": 2.6586243777379836e-05, "loss": 0.1348, "step": 29340 }, { "epoch": 2.6595069784303065, "grad_norm": 0.7969799637794495, "learning_rate": 2.6578350765048014e-05, "loss": 0.1405, "step": 29345 }, { "epoch": 2.6599601232553924, "grad_norm": 0.7760375142097473, "learning_rate": 2.6570457594754316e-05, "loss": 0.1417, "step": 29350 }, { "epoch": 2.6604132680804784, "grad_norm": 0.7979738712310791, "learning_rate": 2.6562564267288687e-05, "loss": 0.1404, "step": 29355 }, { "epoch": 2.6608664129055644, "grad_norm": 0.8116744160652161, "learning_rate": 2.6554670783441098e-05, "loss": 0.1496, "step": 29360 }, { "epoch": 2.6613195577306508, "grad_norm": 0.7012956142425537, "learning_rate": 2.654677714400153e-05, "loss": 0.1364, "step": 29365 }, { "epoch": 2.6617727025557367, "grad_norm": 0.913429856300354, "learning_rate": 2.653888334975998e-05, "loss": 0.1331, "step": 29370 }, { "epoch": 2.6622258473808227, "grad_norm": 0.766970694065094, "learning_rate": 2.653098940150647e-05, "loss": 0.127, "step": 29375 }, { "epoch": 2.662678992205909, "grad_norm": 0.8803042769432068, "learning_rate": 2.652309530003102e-05, "loss": 0.1538, "step": 29380 }, { "epoch": 2.663132137030995, "grad_norm": 0.8866909146308899, "learning_rate": 2.6515201046123677e-05, "loss": 0.1494, "step": 29385 }, { "epoch": 2.663585281856081, "grad_norm": 0.8418163657188416, "learning_rate": 2.6507306640574503e-05, "loss": 0.1844, "step": 29390 }, { "epoch": 2.6640384266811674, "grad_norm": 0.8089554309844971, "learning_rate": 2.649941208417357e-05, "loss": 0.1345, "step": 29395 }, { "epoch": 2.6644915715062534, "grad_norm": 0.8865895867347717, "learning_rate": 2.6491517377710967e-05, "loss": 0.1405, "step": 29400 }, { "epoch": 2.6649447163313393, "grad_norm": 0.7634585499763489, "learning_rate": 2.64836225219768e-05, "loss": 0.1879, "step": 29405 }, { "epoch": 2.6653978611564257, "grad_norm": 0.8257496953010559, "learning_rate": 2.64757275177612e-05, "loss": 0.1221, "step": 29410 }, { "epoch": 2.6658510059815117, "grad_norm": 0.7718321084976196, "learning_rate": 2.6467832365854284e-05, "loss": 0.1491, "step": 29415 }, { "epoch": 2.6663041508065977, "grad_norm": 0.8359481692314148, "learning_rate": 2.6459937067046197e-05, "loss": 0.1356, "step": 29420 }, { "epoch": 2.666757295631684, "grad_norm": 1.0304820537567139, "learning_rate": 2.6452041622127132e-05, "loss": 0.1606, "step": 29425 }, { "epoch": 2.66721044045677, "grad_norm": 0.8328040242195129, "learning_rate": 2.6444146031887245e-05, "loss": 0.1662, "step": 29430 }, { "epoch": 2.667663585281856, "grad_norm": 0.7152688503265381, "learning_rate": 2.6436250297116738e-05, "loss": 0.1219, "step": 29435 }, { "epoch": 2.6681167301069424, "grad_norm": 0.8300583958625793, "learning_rate": 2.642835441860581e-05, "loss": 0.137, "step": 29440 }, { "epoch": 2.6685698749320284, "grad_norm": 0.8450018167495728, "learning_rate": 2.6420458397144686e-05, "loss": 0.1433, "step": 29445 }, { "epoch": 2.6690230197571143, "grad_norm": 0.7414877414703369, "learning_rate": 2.6412562233523607e-05, "loss": 0.1249, "step": 29450 }, { "epoch": 2.6694761645822007, "grad_norm": 0.8031722903251648, "learning_rate": 2.640466592853283e-05, "loss": 0.1404, "step": 29455 }, { "epoch": 2.6699293094072867, "grad_norm": 0.9497859477996826, "learning_rate": 2.6396769482962614e-05, "loss": 0.149, "step": 29460 }, { "epoch": 2.6703824542323726, "grad_norm": 0.7853332161903381, "learning_rate": 2.6388872897603227e-05, "loss": 0.1247, "step": 29465 }, { "epoch": 2.6708355990574586, "grad_norm": 0.6700757741928101, "learning_rate": 2.638097617324497e-05, "loss": 0.121, "step": 29470 }, { "epoch": 2.671288743882545, "grad_norm": 0.8201432824134827, "learning_rate": 2.6373079310678152e-05, "loss": 0.1563, "step": 29475 }, { "epoch": 2.671741888707631, "grad_norm": 0.9028806090354919, "learning_rate": 2.6365182310693098e-05, "loss": 0.1852, "step": 29480 }, { "epoch": 2.672195033532717, "grad_norm": 0.869293212890625, "learning_rate": 2.6357285174080133e-05, "loss": 0.1332, "step": 29485 }, { "epoch": 2.672648178357803, "grad_norm": 0.8188022375106812, "learning_rate": 2.6349387901629608e-05, "loss": 0.1343, "step": 29490 }, { "epoch": 2.6731013231828893, "grad_norm": 0.6604544520378113, "learning_rate": 2.6341490494131887e-05, "loss": 0.1498, "step": 29495 }, { "epoch": 2.6735544680079752, "grad_norm": 0.7913999557495117, "learning_rate": 2.633359295237734e-05, "loss": 0.1512, "step": 29500 }, { "epoch": 2.674007612833061, "grad_norm": 0.8191317319869995, "learning_rate": 2.6325695277156364e-05, "loss": 0.1522, "step": 29505 }, { "epoch": 2.6744607576581476, "grad_norm": 0.7911943197250366, "learning_rate": 2.631779746925936e-05, "loss": 0.1267, "step": 29510 }, { "epoch": 2.6749139024832336, "grad_norm": 0.8182210922241211, "learning_rate": 2.6309899529476735e-05, "loss": 0.14, "step": 29515 }, { "epoch": 2.6753670473083195, "grad_norm": 0.7976159453392029, "learning_rate": 2.6302001458598924e-05, "loss": 0.1387, "step": 29520 }, { "epoch": 2.675820192133406, "grad_norm": 0.7631782293319702, "learning_rate": 2.629410325741637e-05, "loss": 0.1526, "step": 29525 }, { "epoch": 2.676273336958492, "grad_norm": 0.8931511044502258, "learning_rate": 2.6286204926719517e-05, "loss": 0.1356, "step": 29530 }, { "epoch": 2.676726481783578, "grad_norm": 0.9024404287338257, "learning_rate": 2.627830646729884e-05, "loss": 0.1422, "step": 29535 }, { "epoch": 2.6771796266086643, "grad_norm": 0.7971619963645935, "learning_rate": 2.6270407879944827e-05, "loss": 0.137, "step": 29540 }, { "epoch": 2.6776327714337502, "grad_norm": 0.7027516961097717, "learning_rate": 2.6262509165447957e-05, "loss": 0.1357, "step": 29545 }, { "epoch": 2.678085916258836, "grad_norm": 0.7775990962982178, "learning_rate": 2.6254610324598748e-05, "loss": 0.1352, "step": 29550 }, { "epoch": 2.6785390610839226, "grad_norm": 0.7292483448982239, "learning_rate": 2.624671135818771e-05, "loss": 0.1629, "step": 29555 }, { "epoch": 2.6789922059090086, "grad_norm": 0.9639482498168945, "learning_rate": 2.6238812267005374e-05, "loss": 0.1398, "step": 29560 }, { "epoch": 2.6794453507340945, "grad_norm": 0.9201565384864807, "learning_rate": 2.6230913051842295e-05, "loss": 0.1607, "step": 29565 }, { "epoch": 2.679898495559181, "grad_norm": 0.9590044617652893, "learning_rate": 2.622301371348902e-05, "loss": 0.1613, "step": 29570 }, { "epoch": 2.680351640384267, "grad_norm": 0.7090720534324646, "learning_rate": 2.6215114252736123e-05, "loss": 0.1291, "step": 29575 }, { "epoch": 2.680804785209353, "grad_norm": 0.6811681389808655, "learning_rate": 2.6207214670374174e-05, "loss": 0.1185, "step": 29580 }, { "epoch": 2.6812579300344392, "grad_norm": 0.7346963286399841, "learning_rate": 2.6199314967193773e-05, "loss": 0.1178, "step": 29585 }, { "epoch": 2.681711074859525, "grad_norm": 0.8272832036018372, "learning_rate": 2.6191415143985527e-05, "loss": 0.161, "step": 29590 }, { "epoch": 2.682164219684611, "grad_norm": 0.9182509183883667, "learning_rate": 2.6183515201540053e-05, "loss": 0.1356, "step": 29595 }, { "epoch": 2.6826173645096976, "grad_norm": 0.7759901881217957, "learning_rate": 2.6175615140647976e-05, "loss": 0.1314, "step": 29600 }, { "epoch": 2.6830705093347835, "grad_norm": 0.7342140078544617, "learning_rate": 2.616771496209994e-05, "loss": 0.1441, "step": 29605 }, { "epoch": 2.6835236541598695, "grad_norm": 0.8912898302078247, "learning_rate": 2.615981466668659e-05, "loss": 0.1711, "step": 29610 }, { "epoch": 2.6839767989849554, "grad_norm": 0.725959837436676, "learning_rate": 2.6151914255198594e-05, "loss": 0.1167, "step": 29615 }, { "epoch": 2.684429943810042, "grad_norm": 0.8170228004455566, "learning_rate": 2.6144013728426635e-05, "loss": 0.1462, "step": 29620 }, { "epoch": 2.684883088635128, "grad_norm": 0.8017739057540894, "learning_rate": 2.6136113087161402e-05, "loss": 0.1391, "step": 29625 }, { "epoch": 2.6853362334602138, "grad_norm": 0.875032365322113, "learning_rate": 2.6128212332193574e-05, "loss": 0.1504, "step": 29630 }, { "epoch": 2.6857893782852997, "grad_norm": 0.777814507484436, "learning_rate": 2.6120311464313885e-05, "loss": 0.1328, "step": 29635 }, { "epoch": 2.686242523110386, "grad_norm": 0.8985427618026733, "learning_rate": 2.6112410484313038e-05, "loss": 0.1314, "step": 29640 }, { "epoch": 2.686695667935472, "grad_norm": 0.724873960018158, "learning_rate": 2.6104509392981778e-05, "loss": 0.1544, "step": 29645 }, { "epoch": 2.687148812760558, "grad_norm": 0.7998974919319153, "learning_rate": 2.609660819111085e-05, "loss": 0.1464, "step": 29650 }, { "epoch": 2.6876019575856445, "grad_norm": 0.9418435096740723, "learning_rate": 2.6088706879490992e-05, "loss": 0.16, "step": 29655 }, { "epoch": 2.6880551024107304, "grad_norm": 0.8477809429168701, "learning_rate": 2.6080805458912988e-05, "loss": 0.1466, "step": 29660 }, { "epoch": 2.6885082472358164, "grad_norm": 0.7982146739959717, "learning_rate": 2.607290393016761e-05, "loss": 0.1231, "step": 29665 }, { "epoch": 2.688961392060903, "grad_norm": 0.7853220105171204, "learning_rate": 2.6065002294045642e-05, "loss": 0.1455, "step": 29670 }, { "epoch": 2.6894145368859887, "grad_norm": 0.8957681655883789, "learning_rate": 2.605710055133788e-05, "loss": 0.1347, "step": 29675 }, { "epoch": 2.6898676817110747, "grad_norm": 0.7732312679290771, "learning_rate": 2.604919870283514e-05, "loss": 0.1439, "step": 29680 }, { "epoch": 2.690320826536161, "grad_norm": 0.7573689818382263, "learning_rate": 2.6041296749328242e-05, "loss": 0.1394, "step": 29685 }, { "epoch": 2.690773971361247, "grad_norm": 0.9568281769752502, "learning_rate": 2.6033394691608016e-05, "loss": 0.141, "step": 29690 }, { "epoch": 2.691227116186333, "grad_norm": 0.7916044592857361, "learning_rate": 2.6025492530465295e-05, "loss": 0.1818, "step": 29695 }, { "epoch": 2.6916802610114194, "grad_norm": 0.7443975210189819, "learning_rate": 2.6017590266690928e-05, "loss": 0.1499, "step": 29700 }, { "epoch": 2.6921334058365054, "grad_norm": 0.8689723014831543, "learning_rate": 2.6009687901075798e-05, "loss": 0.1491, "step": 29705 }, { "epoch": 2.6925865506615914, "grad_norm": 1.082648754119873, "learning_rate": 2.6001785434410752e-05, "loss": 0.1526, "step": 29710 }, { "epoch": 2.6930396954866778, "grad_norm": 0.8799446821212769, "learning_rate": 2.5993882867486685e-05, "loss": 0.1207, "step": 29715 }, { "epoch": 2.6934928403117637, "grad_norm": 1.0530542135238647, "learning_rate": 2.5985980201094485e-05, "loss": 0.1604, "step": 29720 }, { "epoch": 2.6939459851368497, "grad_norm": 0.9028748869895935, "learning_rate": 2.5978077436025044e-05, "loss": 0.1475, "step": 29725 }, { "epoch": 2.694399129961936, "grad_norm": 0.873327910900116, "learning_rate": 2.5970174573069295e-05, "loss": 0.166, "step": 29730 }, { "epoch": 2.694852274787022, "grad_norm": 0.9142202734947205, "learning_rate": 2.596227161301814e-05, "loss": 0.1484, "step": 29735 }, { "epoch": 2.695305419612108, "grad_norm": 0.795738697052002, "learning_rate": 2.5954368556662524e-05, "loss": 0.1536, "step": 29740 }, { "epoch": 2.695758564437194, "grad_norm": 1.041056513786316, "learning_rate": 2.594646540479337e-05, "loss": 0.1519, "step": 29745 }, { "epoch": 2.6962117092622804, "grad_norm": 0.8245218992233276, "learning_rate": 2.5938562158201633e-05, "loss": 0.141, "step": 29750 }, { "epoch": 2.6966648540873663, "grad_norm": 0.7874355912208557, "learning_rate": 2.5930658817678293e-05, "loss": 0.1455, "step": 29755 }, { "epoch": 2.6971179989124523, "grad_norm": 0.8320429921150208, "learning_rate": 2.59227553840143e-05, "loss": 0.1341, "step": 29760 }, { "epoch": 2.6975711437375383, "grad_norm": 1.0693703889846802, "learning_rate": 2.591485185800064e-05, "loss": 0.1387, "step": 29765 }, { "epoch": 2.6980242885626247, "grad_norm": 0.8013246059417725, "learning_rate": 2.5906948240428285e-05, "loss": 0.131, "step": 29770 }, { "epoch": 2.6984774333877106, "grad_norm": 1.0098000764846802, "learning_rate": 2.5899044532088246e-05, "loss": 0.1472, "step": 29775 }, { "epoch": 2.6989305782127966, "grad_norm": 0.8639510869979858, "learning_rate": 2.5891140733771536e-05, "loss": 0.1547, "step": 29780 }, { "epoch": 2.699383723037883, "grad_norm": 0.7684406638145447, "learning_rate": 2.5883236846269156e-05, "loss": 0.1197, "step": 29785 }, { "epoch": 2.699836867862969, "grad_norm": 0.8245255947113037, "learning_rate": 2.587533287037213e-05, "loss": 0.1537, "step": 29790 }, { "epoch": 2.700290012688055, "grad_norm": 0.7498013377189636, "learning_rate": 2.5867428806871496e-05, "loss": 0.1326, "step": 29795 }, { "epoch": 2.7007431575131413, "grad_norm": 0.8976531028747559, "learning_rate": 2.5859524656558298e-05, "loss": 0.1467, "step": 29800 }, { "epoch": 2.7011963023382273, "grad_norm": 0.8876221179962158, "learning_rate": 2.5851620420223575e-05, "loss": 0.1225, "step": 29805 }, { "epoch": 2.7016494471633132, "grad_norm": 0.8135111331939697, "learning_rate": 2.5843716098658398e-05, "loss": 0.1619, "step": 29810 }, { "epoch": 2.7021025919883996, "grad_norm": 0.7918768525123596, "learning_rate": 2.583581169265382e-05, "loss": 0.133, "step": 29815 }, { "epoch": 2.7025557368134856, "grad_norm": 0.8114787340164185, "learning_rate": 2.5827907203000936e-05, "loss": 0.1278, "step": 29820 }, { "epoch": 2.7030088816385716, "grad_norm": 0.7973418831825256, "learning_rate": 2.582000263049082e-05, "loss": 0.1327, "step": 29825 }, { "epoch": 2.703462026463658, "grad_norm": 0.7873468995094299, "learning_rate": 2.581209797591455e-05, "loss": 0.136, "step": 29830 }, { "epoch": 2.703915171288744, "grad_norm": 0.897901713848114, "learning_rate": 2.5804193240063252e-05, "loss": 0.1599, "step": 29835 }, { "epoch": 2.70436831611383, "grad_norm": 0.9596134424209595, "learning_rate": 2.5796288423728015e-05, "loss": 0.1568, "step": 29840 }, { "epoch": 2.7048214609389163, "grad_norm": 0.85427325963974, "learning_rate": 2.5788383527699973e-05, "loss": 0.1218, "step": 29845 }, { "epoch": 2.7052746057640022, "grad_norm": 0.8877111077308655, "learning_rate": 2.5780478552770237e-05, "loss": 0.1455, "step": 29850 }, { "epoch": 2.705727750589088, "grad_norm": 0.9057868719100952, "learning_rate": 2.5772573499729934e-05, "loss": 0.2039, "step": 29855 }, { "epoch": 2.7061808954141746, "grad_norm": 0.9703570604324341, "learning_rate": 2.5764668369370228e-05, "loss": 0.1986, "step": 29860 }, { "epoch": 2.7066340402392606, "grad_norm": 0.914042592048645, "learning_rate": 2.5756763162482243e-05, "loss": 0.1474, "step": 29865 }, { "epoch": 2.7070871850643465, "grad_norm": 0.617137610912323, "learning_rate": 2.5748857879857154e-05, "loss": 0.1236, "step": 29870 }, { "epoch": 2.707540329889433, "grad_norm": 0.7640342712402344, "learning_rate": 2.574095252228611e-05, "loss": 0.1491, "step": 29875 }, { "epoch": 2.707993474714519, "grad_norm": 0.8813996911048889, "learning_rate": 2.573304709056029e-05, "loss": 0.1598, "step": 29880 }, { "epoch": 2.708446619539605, "grad_norm": 0.8389626145362854, "learning_rate": 2.5725141585470865e-05, "loss": 0.1427, "step": 29885 }, { "epoch": 2.708899764364691, "grad_norm": 0.8251122236251831, "learning_rate": 2.571723600780902e-05, "loss": 0.1428, "step": 29890 }, { "epoch": 2.709352909189777, "grad_norm": 0.7687849998474121, "learning_rate": 2.5709330358365962e-05, "loss": 0.1325, "step": 29895 }, { "epoch": 2.709806054014863, "grad_norm": 0.8187410831451416, "learning_rate": 2.570142463793288e-05, "loss": 0.1253, "step": 29900 }, { "epoch": 2.710259198839949, "grad_norm": 0.8320968747138977, "learning_rate": 2.569351884730099e-05, "loss": 0.1241, "step": 29905 }, { "epoch": 2.710712343665035, "grad_norm": 0.9316514134407043, "learning_rate": 2.5685612987261488e-05, "loss": 0.1492, "step": 29910 }, { "epoch": 2.7111654884901215, "grad_norm": 0.8289247751235962, "learning_rate": 2.5677707058605605e-05, "loss": 0.1177, "step": 29915 }, { "epoch": 2.7116186333152075, "grad_norm": 0.8829716444015503, "learning_rate": 2.5669801062124577e-05, "loss": 0.1507, "step": 29920 }, { "epoch": 2.7120717781402934, "grad_norm": 0.749948263168335, "learning_rate": 2.5661894998609638e-05, "loss": 0.1364, "step": 29925 }, { "epoch": 2.71252492296538, "grad_norm": 0.8631970286369324, "learning_rate": 2.5653988868852014e-05, "loss": 0.1451, "step": 29930 }, { "epoch": 2.712978067790466, "grad_norm": 0.8438135981559753, "learning_rate": 2.5646082673642964e-05, "loss": 0.1452, "step": 29935 }, { "epoch": 2.7134312126155518, "grad_norm": 0.9611305594444275, "learning_rate": 2.563817641377374e-05, "loss": 0.169, "step": 29940 }, { "epoch": 2.713884357440638, "grad_norm": 0.9501329660415649, "learning_rate": 2.5630270090035613e-05, "loss": 0.1675, "step": 29945 }, { "epoch": 2.714337502265724, "grad_norm": 0.7703759670257568, "learning_rate": 2.5622363703219837e-05, "loss": 0.1644, "step": 29950 }, { "epoch": 2.71479064709081, "grad_norm": 0.8064903616905212, "learning_rate": 2.561445725411769e-05, "loss": 0.1364, "step": 29955 }, { "epoch": 2.7152437919158965, "grad_norm": 0.7928228974342346, "learning_rate": 2.560655074352046e-05, "loss": 0.1259, "step": 29960 }, { "epoch": 2.7156969367409824, "grad_norm": 0.8244730830192566, "learning_rate": 2.5598644172219423e-05, "loss": 0.1358, "step": 29965 }, { "epoch": 2.7161500815660684, "grad_norm": 0.892173707485199, "learning_rate": 2.5590737541005866e-05, "loss": 0.1568, "step": 29970 }, { "epoch": 2.716603226391155, "grad_norm": 0.7332577705383301, "learning_rate": 2.5582830850671107e-05, "loss": 0.1419, "step": 29975 }, { "epoch": 2.7170563712162408, "grad_norm": 0.9396560788154602, "learning_rate": 2.557492410200643e-05, "loss": 0.1565, "step": 29980 }, { "epoch": 2.7175095160413267, "grad_norm": 0.6934871673583984, "learning_rate": 2.5567017295803164e-05, "loss": 0.1444, "step": 29985 }, { "epoch": 2.717962660866413, "grad_norm": 0.7473351955413818, "learning_rate": 2.5559110432852618e-05, "loss": 0.1259, "step": 29990 }, { "epoch": 2.718415805691499, "grad_norm": 0.9547014832496643, "learning_rate": 2.55512035139461e-05, "loss": 0.1565, "step": 29995 }, { "epoch": 2.718868950516585, "grad_norm": 0.8795901536941528, "learning_rate": 2.5543296539874957e-05, "loss": 0.1634, "step": 30000 }, { "epoch": 2.7193220953416715, "grad_norm": 0.8679450750350952, "learning_rate": 2.55353895114305e-05, "loss": 0.1564, "step": 30005 }, { "epoch": 2.7197752401667574, "grad_norm": 0.8913860321044922, "learning_rate": 2.552748242940409e-05, "loss": 0.1392, "step": 30010 }, { "epoch": 2.7202283849918434, "grad_norm": 0.9390507340431213, "learning_rate": 2.551957529458707e-05, "loss": 0.1629, "step": 30015 }, { "epoch": 2.7206815298169293, "grad_norm": 0.8938106298446655, "learning_rate": 2.5511668107770757e-05, "loss": 0.1357, "step": 30020 }, { "epoch": 2.7211346746420157, "grad_norm": 0.8818641304969788, "learning_rate": 2.5503760869746534e-05, "loss": 0.16, "step": 30025 }, { "epoch": 2.7215878194671017, "grad_norm": 0.7395902276039124, "learning_rate": 2.5495853581305758e-05, "loss": 0.136, "step": 30030 }, { "epoch": 2.7220409642921877, "grad_norm": 0.879599392414093, "learning_rate": 2.548794624323979e-05, "loss": 0.1641, "step": 30035 }, { "epoch": 2.7224941091172736, "grad_norm": 0.8824294805526733, "learning_rate": 2.5480038856339993e-05, "loss": 0.1456, "step": 30040 }, { "epoch": 2.72294725394236, "grad_norm": 0.9363076090812683, "learning_rate": 2.5472131421397744e-05, "loss": 0.1452, "step": 30045 }, { "epoch": 2.723400398767446, "grad_norm": 0.8955417275428772, "learning_rate": 2.546422393920442e-05, "loss": 0.1414, "step": 30050 }, { "epoch": 2.723853543592532, "grad_norm": 0.8825808167457581, "learning_rate": 2.545631641055141e-05, "loss": 0.1513, "step": 30055 }, { "epoch": 2.7243066884176184, "grad_norm": 0.9717397689819336, "learning_rate": 2.5448408836230103e-05, "loss": 0.1379, "step": 30060 }, { "epoch": 2.7247598332427043, "grad_norm": 0.8635409474372864, "learning_rate": 2.5440501217031885e-05, "loss": 0.1403, "step": 30065 }, { "epoch": 2.7252129780677903, "grad_norm": 0.8615196347236633, "learning_rate": 2.5432593553748157e-05, "loss": 0.1353, "step": 30070 }, { "epoch": 2.7256661228928767, "grad_norm": 0.8836183547973633, "learning_rate": 2.5424685847170322e-05, "loss": 0.1507, "step": 30075 }, { "epoch": 2.7261192677179626, "grad_norm": 0.7234398722648621, "learning_rate": 2.5416778098089776e-05, "loss": 0.1349, "step": 30080 }, { "epoch": 2.7265724125430486, "grad_norm": 0.7298287153244019, "learning_rate": 2.5408870307297944e-05, "loss": 0.1203, "step": 30085 }, { "epoch": 2.727025557368135, "grad_norm": 0.7856137752532959, "learning_rate": 2.5400962475586237e-05, "loss": 0.1275, "step": 30090 }, { "epoch": 2.727478702193221, "grad_norm": 0.7995075583457947, "learning_rate": 2.5393054603746057e-05, "loss": 0.1349, "step": 30095 }, { "epoch": 2.727931847018307, "grad_norm": 0.9067547917366028, "learning_rate": 2.538514669256885e-05, "loss": 0.126, "step": 30100 }, { "epoch": 2.7283849918433933, "grad_norm": 0.7650130391120911, "learning_rate": 2.5377238742846027e-05, "loss": 0.1381, "step": 30105 }, { "epoch": 2.7288381366684793, "grad_norm": 0.7584020495414734, "learning_rate": 2.536933075536903e-05, "loss": 0.1236, "step": 30110 }, { "epoch": 2.7292912814935653, "grad_norm": 0.948759913444519, "learning_rate": 2.5361422730929278e-05, "loss": 0.1485, "step": 30115 }, { "epoch": 2.7297444263186517, "grad_norm": 0.9027184247970581, "learning_rate": 2.5353514670318224e-05, "loss": 0.1359, "step": 30120 }, { "epoch": 2.7301975711437376, "grad_norm": 0.8182434439659119, "learning_rate": 2.5345606574327295e-05, "loss": 0.1366, "step": 30125 }, { "epoch": 2.7306507159688236, "grad_norm": 0.8146435022354126, "learning_rate": 2.5337698443747948e-05, "loss": 0.1222, "step": 30130 }, { "epoch": 2.73110386079391, "grad_norm": 0.8221654295921326, "learning_rate": 2.5329790279371624e-05, "loss": 0.1383, "step": 30135 }, { "epoch": 2.731557005618996, "grad_norm": 0.9250681400299072, "learning_rate": 2.5321882081989777e-05, "loss": 0.156, "step": 30140 }, { "epoch": 2.732010150444082, "grad_norm": 0.8142967820167542, "learning_rate": 2.5313973852393864e-05, "loss": 0.1169, "step": 30145 }, { "epoch": 2.732463295269168, "grad_norm": 0.7565405368804932, "learning_rate": 2.5306065591375343e-05, "loss": 0.1371, "step": 30150 }, { "epoch": 2.7329164400942543, "grad_norm": 0.7417978644371033, "learning_rate": 2.5298157299725678e-05, "loss": 0.1389, "step": 30155 }, { "epoch": 2.7333695849193402, "grad_norm": 0.7384862303733826, "learning_rate": 2.529024897823632e-05, "loss": 0.1265, "step": 30160 }, { "epoch": 2.733822729744426, "grad_norm": 0.755763053894043, "learning_rate": 2.5282340627698746e-05, "loss": 0.1108, "step": 30165 }, { "epoch": 2.734275874569512, "grad_norm": 0.8233922719955444, "learning_rate": 2.5274432248904434e-05, "loss": 0.1431, "step": 30170 }, { "epoch": 2.7347290193945986, "grad_norm": 0.8995926976203918, "learning_rate": 2.526652384264485e-05, "loss": 0.1383, "step": 30175 }, { "epoch": 2.7351821642196845, "grad_norm": 0.67240971326828, "learning_rate": 2.5258615409711466e-05, "loss": 0.1154, "step": 30180 }, { "epoch": 2.7356353090447705, "grad_norm": 0.8361822366714478, "learning_rate": 2.5250706950895763e-05, "loss": 0.1408, "step": 30185 }, { "epoch": 2.736088453869857, "grad_norm": 0.7465325593948364, "learning_rate": 2.5242798466989226e-05, "loss": 0.1177, "step": 30190 }, { "epoch": 2.736541598694943, "grad_norm": 0.8458765149116516, "learning_rate": 2.5234889958783337e-05, "loss": 0.1462, "step": 30195 }, { "epoch": 2.736994743520029, "grad_norm": 0.7052130699157715, "learning_rate": 2.522698142706958e-05, "loss": 0.1274, "step": 30200 }, { "epoch": 2.737447888345115, "grad_norm": 0.7792151570320129, "learning_rate": 2.521907287263945e-05, "loss": 0.1417, "step": 30205 }, { "epoch": 2.737901033170201, "grad_norm": 0.8911088705062866, "learning_rate": 2.5211164296284422e-05, "loss": 0.1445, "step": 30210 }, { "epoch": 2.738354177995287, "grad_norm": 0.8855786919593811, "learning_rate": 2.5203255698796e-05, "loss": 0.1343, "step": 30215 }, { "epoch": 2.7388073228203735, "grad_norm": 0.9000064134597778, "learning_rate": 2.5195347080965692e-05, "loss": 0.1416, "step": 30220 }, { "epoch": 2.7392604676454595, "grad_norm": 0.673887312412262, "learning_rate": 2.5187438443584977e-05, "loss": 0.1255, "step": 30225 }, { "epoch": 2.7397136124705455, "grad_norm": 0.8011918663978577, "learning_rate": 2.517952978744536e-05, "loss": 0.1322, "step": 30230 }, { "epoch": 2.740166757295632, "grad_norm": 0.7774696350097656, "learning_rate": 2.5171621113338333e-05, "loss": 0.1169, "step": 30235 }, { "epoch": 2.740619902120718, "grad_norm": 0.7032745480537415, "learning_rate": 2.516371242205542e-05, "loss": 0.1126, "step": 30240 }, { "epoch": 2.741073046945804, "grad_norm": 0.8654191493988037, "learning_rate": 2.5155803714388098e-05, "loss": 0.1216, "step": 30245 }, { "epoch": 2.74152619177089, "grad_norm": 0.871894359588623, "learning_rate": 2.5147894991127896e-05, "loss": 0.1334, "step": 30250 }, { "epoch": 2.741979336595976, "grad_norm": 0.8739616274833679, "learning_rate": 2.5139986253066317e-05, "loss": 0.1604, "step": 30255 }, { "epoch": 2.742432481421062, "grad_norm": 0.8700281381607056, "learning_rate": 2.5132077500994855e-05, "loss": 0.1323, "step": 30260 }, { "epoch": 2.7428856262461485, "grad_norm": 0.6817046999931335, "learning_rate": 2.5124168735705044e-05, "loss": 0.1234, "step": 30265 }, { "epoch": 2.7433387710712345, "grad_norm": 0.7299990653991699, "learning_rate": 2.5116259957988375e-05, "loss": 0.1212, "step": 30270 }, { "epoch": 2.7437919158963204, "grad_norm": 0.7896156907081604, "learning_rate": 2.5108351168636373e-05, "loss": 0.1506, "step": 30275 }, { "epoch": 2.744245060721407, "grad_norm": 0.9875864386558533, "learning_rate": 2.5100442368440548e-05, "loss": 0.1306, "step": 30280 }, { "epoch": 2.744698205546493, "grad_norm": 0.7939635515213013, "learning_rate": 2.5092533558192423e-05, "loss": 0.1446, "step": 30285 }, { "epoch": 2.7451513503715788, "grad_norm": 0.8268237709999084, "learning_rate": 2.5084624738683504e-05, "loss": 0.1163, "step": 30290 }, { "epoch": 2.7456044951966647, "grad_norm": 0.7391602993011475, "learning_rate": 2.5076715910705306e-05, "loss": 0.1379, "step": 30295 }, { "epoch": 2.7460576400217507, "grad_norm": 0.7690908312797546, "learning_rate": 2.5068807075049368e-05, "loss": 0.1438, "step": 30300 }, { "epoch": 2.746510784846837, "grad_norm": 0.7071430087089539, "learning_rate": 2.506089823250718e-05, "loss": 0.1388, "step": 30305 }, { "epoch": 2.746963929671923, "grad_norm": 0.8168348073959351, "learning_rate": 2.5052989383870286e-05, "loss": 0.147, "step": 30310 }, { "epoch": 2.747417074497009, "grad_norm": 0.7881221771240234, "learning_rate": 2.50450805299302e-05, "loss": 0.1511, "step": 30315 }, { "epoch": 2.7478702193220954, "grad_norm": 0.8202252388000488, "learning_rate": 2.5037171671478425e-05, "loss": 0.1262, "step": 30320 }, { "epoch": 2.7483233641471814, "grad_norm": 0.8223000764846802, "learning_rate": 2.5029262809306503e-05, "loss": 0.1207, "step": 30325 }, { "epoch": 2.7487765089722673, "grad_norm": 0.7770290970802307, "learning_rate": 2.5021353944205948e-05, "loss": 0.1404, "step": 30330 }, { "epoch": 2.7492296537973537, "grad_norm": 0.8140674233436584, "learning_rate": 2.5013445076968288e-05, "loss": 0.1346, "step": 30335 }, { "epoch": 2.7496827986224397, "grad_norm": 0.7927941083908081, "learning_rate": 2.5005536208385045e-05, "loss": 0.1354, "step": 30340 }, { "epoch": 2.7501359434475257, "grad_norm": 0.9249879121780396, "learning_rate": 2.4997627339247727e-05, "loss": 0.1539, "step": 30345 }, { "epoch": 2.750589088272612, "grad_norm": 0.8228557109832764, "learning_rate": 2.4989718470347866e-05, "loss": 0.1237, "step": 30350 }, { "epoch": 2.751042233097698, "grad_norm": 0.8241555690765381, "learning_rate": 2.4981809602476985e-05, "loss": 0.127, "step": 30355 }, { "epoch": 2.751495377922784, "grad_norm": 0.7933488488197327, "learning_rate": 2.4973900736426604e-05, "loss": 0.1477, "step": 30360 }, { "epoch": 2.7519485227478704, "grad_norm": 0.7306645512580872, "learning_rate": 2.4965991872988252e-05, "loss": 0.1341, "step": 30365 }, { "epoch": 2.7524016675729563, "grad_norm": 0.8929803967475891, "learning_rate": 2.495808301295344e-05, "loss": 0.1191, "step": 30370 }, { "epoch": 2.7528548123980423, "grad_norm": 0.8196352124214172, "learning_rate": 2.4950174157113698e-05, "loss": 0.1283, "step": 30375 }, { "epoch": 2.7533079572231287, "grad_norm": 0.743916928768158, "learning_rate": 2.4942265306260538e-05, "loss": 0.1248, "step": 30380 }, { "epoch": 2.7537611020482147, "grad_norm": 0.7557235956192017, "learning_rate": 2.4934356461185485e-05, "loss": 0.1414, "step": 30385 }, { "epoch": 2.7542142468733006, "grad_norm": 0.7554042339324951, "learning_rate": 2.492644762268007e-05, "loss": 0.149, "step": 30390 }, { "epoch": 2.754667391698387, "grad_norm": 0.8283641934394836, "learning_rate": 2.4918538791535792e-05, "loss": 0.1328, "step": 30395 }, { "epoch": 2.755120536523473, "grad_norm": 0.8836473822593689, "learning_rate": 2.491062996854418e-05, "loss": 0.1285, "step": 30400 }, { "epoch": 2.755573681348559, "grad_norm": 0.9033547043800354, "learning_rate": 2.490272115449675e-05, "loss": 0.1251, "step": 30405 }, { "epoch": 2.7560268261736454, "grad_norm": 0.8812206983566284, "learning_rate": 2.489481235018502e-05, "loss": 0.1521, "step": 30410 }, { "epoch": 2.7564799709987313, "grad_norm": 0.813896119594574, "learning_rate": 2.488690355640052e-05, "loss": 0.1426, "step": 30415 }, { "epoch": 2.7569331158238173, "grad_norm": 0.9732536673545837, "learning_rate": 2.4878994773934733e-05, "loss": 0.1516, "step": 30420 }, { "epoch": 2.7573862606489032, "grad_norm": 0.6948548555374146, "learning_rate": 2.4871086003579204e-05, "loss": 0.1184, "step": 30425 }, { "epoch": 2.7578394054739896, "grad_norm": 0.6077486276626587, "learning_rate": 2.486317724612542e-05, "loss": 0.1285, "step": 30430 }, { "epoch": 2.7582925502990756, "grad_norm": 0.8581481575965881, "learning_rate": 2.4855268502364902e-05, "loss": 0.1376, "step": 30435 }, { "epoch": 2.7587456951241616, "grad_norm": 0.763373851776123, "learning_rate": 2.4847359773089166e-05, "loss": 0.1551, "step": 30440 }, { "epoch": 2.7591988399492475, "grad_norm": 0.9324893951416016, "learning_rate": 2.483945105908971e-05, "loss": 0.1523, "step": 30445 }, { "epoch": 2.759651984774334, "grad_norm": 0.8163465857505798, "learning_rate": 2.483154236115806e-05, "loss": 0.1372, "step": 30450 }, { "epoch": 2.76010512959942, "grad_norm": 0.8414956331253052, "learning_rate": 2.4823633680085695e-05, "loss": 0.1502, "step": 30455 }, { "epoch": 2.760558274424506, "grad_norm": 0.8175309300422668, "learning_rate": 2.4815725016664132e-05, "loss": 0.141, "step": 30460 }, { "epoch": 2.7610114192495923, "grad_norm": 0.6937305331230164, "learning_rate": 2.4807816371684875e-05, "loss": 0.1245, "step": 30465 }, { "epoch": 2.761464564074678, "grad_norm": 0.8196836113929749, "learning_rate": 2.4799907745939418e-05, "loss": 0.1297, "step": 30470 }, { "epoch": 2.761917708899764, "grad_norm": 0.7963302731513977, "learning_rate": 2.479199914021927e-05, "loss": 0.1656, "step": 30475 }, { "epoch": 2.7623708537248506, "grad_norm": 0.8098440170288086, "learning_rate": 2.478409055531591e-05, "loss": 0.1586, "step": 30480 }, { "epoch": 2.7628239985499365, "grad_norm": 0.8017787933349609, "learning_rate": 2.4776181992020837e-05, "loss": 0.1762, "step": 30485 }, { "epoch": 2.7632771433750225, "grad_norm": 0.8600775599479675, "learning_rate": 2.476827345112555e-05, "loss": 0.1587, "step": 30490 }, { "epoch": 2.763730288200109, "grad_norm": 0.8252716660499573, "learning_rate": 2.4760364933421535e-05, "loss": 0.1378, "step": 30495 }, { "epoch": 2.764183433025195, "grad_norm": 0.7853375673294067, "learning_rate": 2.475245643970027e-05, "loss": 0.1401, "step": 30500 }, { "epoch": 2.764636577850281, "grad_norm": 0.7977749705314636, "learning_rate": 2.4744547970753268e-05, "loss": 0.1356, "step": 30505 }, { "epoch": 2.7650897226753672, "grad_norm": 0.9095019102096558, "learning_rate": 2.4736639527371976e-05, "loss": 0.1327, "step": 30510 }, { "epoch": 2.765542867500453, "grad_norm": 0.8262405395507812, "learning_rate": 2.4728731110347896e-05, "loss": 0.1545, "step": 30515 }, { "epoch": 2.765996012325539, "grad_norm": 0.7688859701156616, "learning_rate": 2.4720822720472492e-05, "loss": 0.1138, "step": 30520 }, { "epoch": 2.7664491571506256, "grad_norm": 0.6860023140907288, "learning_rate": 2.4712914358537246e-05, "loss": 0.1345, "step": 30525 }, { "epoch": 2.7669023019757115, "grad_norm": 0.8087032437324524, "learning_rate": 2.4705006025333635e-05, "loss": 0.1583, "step": 30530 }, { "epoch": 2.7673554468007975, "grad_norm": 0.8471735715866089, "learning_rate": 2.469709772165311e-05, "loss": 0.156, "step": 30535 }, { "epoch": 2.767808591625884, "grad_norm": 0.7345898151397705, "learning_rate": 2.468918944828716e-05, "loss": 0.1488, "step": 30540 }, { "epoch": 2.76826173645097, "grad_norm": 0.8231287002563477, "learning_rate": 2.4681281206027224e-05, "loss": 0.1513, "step": 30545 }, { "epoch": 2.768714881276056, "grad_norm": 0.8626927733421326, "learning_rate": 2.4673372995664773e-05, "loss": 0.1309, "step": 30550 }, { "epoch": 2.7691680261011418, "grad_norm": 0.8484632968902588, "learning_rate": 2.4665464817991275e-05, "loss": 0.1194, "step": 30555 }, { "epoch": 2.769621170926228, "grad_norm": 0.8969523906707764, "learning_rate": 2.4657556673798157e-05, "loss": 0.152, "step": 30560 }, { "epoch": 2.770074315751314, "grad_norm": 0.7269735932350159, "learning_rate": 2.4649648563876894e-05, "loss": 0.16, "step": 30565 }, { "epoch": 2.7705274605764, "grad_norm": 0.7969017624855042, "learning_rate": 2.464174048901891e-05, "loss": 0.1406, "step": 30570 }, { "epoch": 2.770980605401486, "grad_norm": 0.8817876577377319, "learning_rate": 2.463383245001566e-05, "loss": 0.1206, "step": 30575 }, { "epoch": 2.7714337502265725, "grad_norm": 0.8675803542137146, "learning_rate": 2.4625924447658596e-05, "loss": 0.1411, "step": 30580 }, { "epoch": 2.7718868950516584, "grad_norm": 0.8293569684028625, "learning_rate": 2.4618016482739128e-05, "loss": 0.1411, "step": 30585 }, { "epoch": 2.7723400398767444, "grad_norm": 0.666471540927887, "learning_rate": 2.4610108556048712e-05, "loss": 0.1257, "step": 30590 }, { "epoch": 2.772793184701831, "grad_norm": 0.7827867269515991, "learning_rate": 2.4602200668378743e-05, "loss": 0.1254, "step": 30595 }, { "epoch": 2.7732463295269167, "grad_norm": 0.7983878254890442, "learning_rate": 2.4594292820520674e-05, "loss": 0.1248, "step": 30600 }, { "epoch": 2.7736994743520027, "grad_norm": 0.7109051942825317, "learning_rate": 2.4586385013265926e-05, "loss": 0.1362, "step": 30605 }, { "epoch": 2.774152619177089, "grad_norm": 0.8261443376541138, "learning_rate": 2.4578477247405895e-05, "loss": 0.12, "step": 30610 }, { "epoch": 2.774605764002175, "grad_norm": 1.0019865036010742, "learning_rate": 2.4570569523732015e-05, "loss": 0.1504, "step": 30615 }, { "epoch": 2.775058908827261, "grad_norm": 0.8021111488342285, "learning_rate": 2.456266184303567e-05, "loss": 0.1367, "step": 30620 }, { "epoch": 2.7755120536523474, "grad_norm": 0.8061633110046387, "learning_rate": 2.4554754206108278e-05, "loss": 0.1104, "step": 30625 }, { "epoch": 2.7759651984774334, "grad_norm": 0.8241289854049683, "learning_rate": 2.4546846613741238e-05, "loss": 0.1424, "step": 30630 }, { "epoch": 2.7764183433025194, "grad_norm": 0.8464207649230957, "learning_rate": 2.4538939066725942e-05, "loss": 0.1243, "step": 30635 }, { "epoch": 2.7768714881276058, "grad_norm": 0.7417780160903931, "learning_rate": 2.4531031565853775e-05, "loss": 0.1176, "step": 30640 }, { "epoch": 2.7773246329526917, "grad_norm": 0.763094425201416, "learning_rate": 2.452312411191614e-05, "loss": 0.1518, "step": 30645 }, { "epoch": 2.7777777777777777, "grad_norm": 0.8753992319107056, "learning_rate": 2.4515216705704395e-05, "loss": 0.1388, "step": 30650 }, { "epoch": 2.778230922602864, "grad_norm": 0.726085364818573, "learning_rate": 2.450730934800993e-05, "loss": 0.1101, "step": 30655 }, { "epoch": 2.77868406742795, "grad_norm": 0.7252955436706543, "learning_rate": 2.4499402039624106e-05, "loss": 0.1217, "step": 30660 }, { "epoch": 2.779137212253036, "grad_norm": 0.7483440637588501, "learning_rate": 2.4491494781338296e-05, "loss": 0.1216, "step": 30665 }, { "epoch": 2.7795903570781224, "grad_norm": 0.6833323836326599, "learning_rate": 2.4483587573943872e-05, "loss": 0.1844, "step": 30670 }, { "epoch": 2.7800435019032084, "grad_norm": 0.8943446278572083, "learning_rate": 2.4475680418232166e-05, "loss": 0.1498, "step": 30675 }, { "epoch": 2.7804966467282943, "grad_norm": 0.8197920918464661, "learning_rate": 2.4467773314994547e-05, "loss": 0.1331, "step": 30680 }, { "epoch": 2.7809497915533807, "grad_norm": 0.7660908699035645, "learning_rate": 2.4459866265022348e-05, "loss": 0.1211, "step": 30685 }, { "epoch": 2.7814029363784667, "grad_norm": 0.7260441184043884, "learning_rate": 2.4451959269106918e-05, "loss": 0.1499, "step": 30690 }, { "epoch": 2.7818560812035527, "grad_norm": 0.8267040848731995, "learning_rate": 2.4444052328039598e-05, "loss": 0.1374, "step": 30695 }, { "epoch": 2.7823092260286386, "grad_norm": 0.69676274061203, "learning_rate": 2.4436145442611698e-05, "loss": 0.106, "step": 30700 }, { "epoch": 2.7827623708537246, "grad_norm": 0.7996678948402405, "learning_rate": 2.4428238613614562e-05, "loss": 0.1227, "step": 30705 }, { "epoch": 2.783215515678811, "grad_norm": 0.6649576425552368, "learning_rate": 2.4420331841839492e-05, "loss": 0.1232, "step": 30710 }, { "epoch": 2.783668660503897, "grad_norm": 0.8300333023071289, "learning_rate": 2.4412425128077804e-05, "loss": 0.1245, "step": 30715 }, { "epoch": 2.784121805328983, "grad_norm": 0.8169664740562439, "learning_rate": 2.4404518473120823e-05, "loss": 0.1705, "step": 30720 }, { "epoch": 2.7845749501540693, "grad_norm": 0.812347412109375, "learning_rate": 2.439661187775982e-05, "loss": 0.1289, "step": 30725 }, { "epoch": 2.7850280949791553, "grad_norm": 0.8350656032562256, "learning_rate": 2.4388705342786112e-05, "loss": 0.1223, "step": 30730 }, { "epoch": 2.7854812398042412, "grad_norm": 0.8543576598167419, "learning_rate": 2.4380798868990972e-05, "loss": 0.1325, "step": 30735 }, { "epoch": 2.7859343846293276, "grad_norm": 0.7428364157676697, "learning_rate": 2.4372892457165694e-05, "loss": 0.1344, "step": 30740 }, { "epoch": 2.7863875294544136, "grad_norm": 0.91456139087677, "learning_rate": 2.436498610810156e-05, "loss": 0.1525, "step": 30745 }, { "epoch": 2.7868406742794996, "grad_norm": 0.8208064436912537, "learning_rate": 2.4357079822589816e-05, "loss": 0.1458, "step": 30750 }, { "epoch": 2.787293819104586, "grad_norm": 0.8473157286643982, "learning_rate": 2.4349173601421748e-05, "loss": 0.1301, "step": 30755 }, { "epoch": 2.787746963929672, "grad_norm": 0.7677980661392212, "learning_rate": 2.434126744538861e-05, "loss": 0.1194, "step": 30760 }, { "epoch": 2.788200108754758, "grad_norm": 0.7512784004211426, "learning_rate": 2.433336135528164e-05, "loss": 0.1262, "step": 30765 }, { "epoch": 2.7886532535798443, "grad_norm": 0.8073645830154419, "learning_rate": 2.4325455331892103e-05, "loss": 0.1198, "step": 30770 }, { "epoch": 2.7891063984049302, "grad_norm": 0.7700027823448181, "learning_rate": 2.4317549376011213e-05, "loss": 0.1479, "step": 30775 }, { "epoch": 2.789559543230016, "grad_norm": 0.8235492706298828, "learning_rate": 2.4309643488430215e-05, "loss": 0.1451, "step": 30780 }, { "epoch": 2.7900126880551026, "grad_norm": 0.7933754920959473, "learning_rate": 2.430173766994034e-05, "loss": 0.1429, "step": 30785 }, { "epoch": 2.7904658328801886, "grad_norm": 0.7235928773880005, "learning_rate": 2.429383192133278e-05, "loss": 0.1135, "step": 30790 }, { "epoch": 2.7909189777052745, "grad_norm": 0.7967099547386169, "learning_rate": 2.4285926243398768e-05, "loss": 0.1599, "step": 30795 }, { "epoch": 2.791372122530361, "grad_norm": 0.8021700978279114, "learning_rate": 2.4278020636929493e-05, "loss": 0.1322, "step": 30800 }, { "epoch": 2.791825267355447, "grad_norm": 0.7510271072387695, "learning_rate": 2.427011510271616e-05, "loss": 0.1578, "step": 30805 }, { "epoch": 2.792278412180533, "grad_norm": 0.6726012825965881, "learning_rate": 2.426220964154996e-05, "loss": 0.1106, "step": 30810 }, { "epoch": 2.7927315570056193, "grad_norm": 0.7403250932693481, "learning_rate": 2.4254304254222062e-05, "loss": 0.1299, "step": 30815 }, { "epoch": 2.793184701830705, "grad_norm": 0.7542359232902527, "learning_rate": 2.424639894152365e-05, "loss": 0.1271, "step": 30820 }, { "epoch": 2.793637846655791, "grad_norm": 0.6614605188369751, "learning_rate": 2.4238493704245882e-05, "loss": 0.1236, "step": 30825 }, { "epoch": 2.794090991480877, "grad_norm": 0.8584026098251343, "learning_rate": 2.423058854317992e-05, "loss": 0.1249, "step": 30830 }, { "epoch": 2.7945441363059635, "grad_norm": 0.8623068332672119, "learning_rate": 2.422268345911693e-05, "loss": 0.1195, "step": 30835 }, { "epoch": 2.7949972811310495, "grad_norm": 0.8578435778617859, "learning_rate": 2.421477845284803e-05, "loss": 0.1127, "step": 30840 }, { "epoch": 2.7954504259561355, "grad_norm": 0.7198463678359985, "learning_rate": 2.420687352516437e-05, "loss": 0.1323, "step": 30845 }, { "epoch": 2.7959035707812214, "grad_norm": 0.8024008274078369, "learning_rate": 2.4198968676857075e-05, "loss": 0.1303, "step": 30850 }, { "epoch": 2.796356715606308, "grad_norm": 0.7284458875656128, "learning_rate": 2.4191063908717266e-05, "loss": 0.132, "step": 30855 }, { "epoch": 2.796809860431394, "grad_norm": 0.7837977409362793, "learning_rate": 2.4183159221536064e-05, "loss": 0.1257, "step": 30860 }, { "epoch": 2.7972630052564798, "grad_norm": 0.7600180506706238, "learning_rate": 2.4175254616104552e-05, "loss": 0.1355, "step": 30865 }, { "epoch": 2.797716150081566, "grad_norm": 0.8317718505859375, "learning_rate": 2.4167350093213843e-05, "loss": 0.1534, "step": 30870 }, { "epoch": 2.798169294906652, "grad_norm": 0.8135343790054321, "learning_rate": 2.415944565365501e-05, "loss": 0.131, "step": 30875 }, { "epoch": 2.798622439731738, "grad_norm": 0.7729604244232178, "learning_rate": 2.415154129821914e-05, "loss": 0.1383, "step": 30880 }, { "epoch": 2.7990755845568245, "grad_norm": 0.7785916328430176, "learning_rate": 2.414363702769732e-05, "loss": 0.1464, "step": 30885 }, { "epoch": 2.7995287293819104, "grad_norm": 0.8411537408828735, "learning_rate": 2.413573284288058e-05, "loss": 0.1223, "step": 30890 }, { "epoch": 2.7999818742069964, "grad_norm": 0.7259188294410706, "learning_rate": 2.4127828744559992e-05, "loss": 0.1413, "step": 30895 }, { "epoch": 2.800435019032083, "grad_norm": 0.823095977306366, "learning_rate": 2.41199247335266e-05, "loss": 0.1285, "step": 30900 }, { "epoch": 2.8008881638571688, "grad_norm": 0.7582665681838989, "learning_rate": 2.4112020810571435e-05, "loss": 0.1389, "step": 30905 }, { "epoch": 2.8013413086822547, "grad_norm": 1.0033949613571167, "learning_rate": 2.4104116976485537e-05, "loss": 0.1585, "step": 30910 }, { "epoch": 2.801794453507341, "grad_norm": 0.788008987903595, "learning_rate": 2.409621323205991e-05, "loss": 0.1414, "step": 30915 }, { "epoch": 2.802247598332427, "grad_norm": 0.6753188967704773, "learning_rate": 2.408830957808556e-05, "loss": 0.1318, "step": 30920 }, { "epoch": 2.802700743157513, "grad_norm": 0.7924475073814392, "learning_rate": 2.4080406015353506e-05, "loss": 0.1236, "step": 30925 }, { "epoch": 2.8031538879825995, "grad_norm": 0.9519688487052917, "learning_rate": 2.407250254465472e-05, "loss": 0.1683, "step": 30930 }, { "epoch": 2.8036070328076854, "grad_norm": 0.7925074100494385, "learning_rate": 2.406459916678021e-05, "loss": 0.1293, "step": 30935 }, { "epoch": 2.8040601776327714, "grad_norm": 0.7918004393577576, "learning_rate": 2.4056695882520913e-05, "loss": 0.1274, "step": 30940 }, { "epoch": 2.804513322457858, "grad_norm": 0.8173108696937561, "learning_rate": 2.4048792692667814e-05, "loss": 0.1254, "step": 30945 }, { "epoch": 2.8049664672829437, "grad_norm": 0.7432035803794861, "learning_rate": 2.404088959801187e-05, "loss": 0.1274, "step": 30950 }, { "epoch": 2.8054196121080297, "grad_norm": 0.7304825782775879, "learning_rate": 2.4032986599344013e-05, "loss": 0.1381, "step": 30955 }, { "epoch": 2.8058727569331157, "grad_norm": 0.8509790897369385, "learning_rate": 2.4025083697455197e-05, "loss": 0.1551, "step": 30960 }, { "epoch": 2.806325901758202, "grad_norm": 0.9121021628379822, "learning_rate": 2.401718089313632e-05, "loss": 0.1379, "step": 30965 }, { "epoch": 2.806779046583288, "grad_norm": 0.8852136135101318, "learning_rate": 2.4009278187178314e-05, "loss": 0.1706, "step": 30970 }, { "epoch": 2.807232191408374, "grad_norm": 0.809470534324646, "learning_rate": 2.4001375580372093e-05, "loss": 0.1208, "step": 30975 }, { "epoch": 2.80768533623346, "grad_norm": 0.8532115817070007, "learning_rate": 2.399347307350853e-05, "loss": 0.1343, "step": 30980 }, { "epoch": 2.8081384810585464, "grad_norm": 0.8601802587509155, "learning_rate": 2.3985570667378527e-05, "loss": 0.1305, "step": 30985 }, { "epoch": 2.8085916258836323, "grad_norm": 0.9746099710464478, "learning_rate": 2.3977668362772947e-05, "loss": 0.1813, "step": 30990 }, { "epoch": 2.8090447707087183, "grad_norm": 0.841117262840271, "learning_rate": 2.3969766160482666e-05, "loss": 0.1662, "step": 30995 }, { "epoch": 2.8094979155338047, "grad_norm": 0.7945386171340942, "learning_rate": 2.3961864061298544e-05, "loss": 0.1436, "step": 31000 }, { "epoch": 2.8099510603588906, "grad_norm": 0.9047985076904297, "learning_rate": 2.395396206601141e-05, "loss": 0.1339, "step": 31005 }, { "epoch": 2.8104042051839766, "grad_norm": 0.8161617517471313, "learning_rate": 2.3946060175412105e-05, "loss": 0.1412, "step": 31010 }, { "epoch": 2.810857350009063, "grad_norm": 0.777119517326355, "learning_rate": 2.393815839029146e-05, "loss": 0.1467, "step": 31015 }, { "epoch": 2.811310494834149, "grad_norm": 0.8463948965072632, "learning_rate": 2.3930256711440275e-05, "loss": 0.1382, "step": 31020 }, { "epoch": 2.811763639659235, "grad_norm": 0.6940619349479675, "learning_rate": 2.392235513964937e-05, "loss": 0.1441, "step": 31025 }, { "epoch": 2.8122167844843213, "grad_norm": 0.7733732461929321, "learning_rate": 2.391445367570952e-05, "loss": 0.1121, "step": 31030 }, { "epoch": 2.8126699293094073, "grad_norm": 0.8257811665534973, "learning_rate": 2.390655232041151e-05, "loss": 0.1135, "step": 31035 }, { "epoch": 2.8131230741344933, "grad_norm": 0.9016638994216919, "learning_rate": 2.3898651074546117e-05, "loss": 0.1147, "step": 31040 }, { "epoch": 2.8135762189595797, "grad_norm": 0.8168627023696899, "learning_rate": 2.3890749938904098e-05, "loss": 0.155, "step": 31045 }, { "epoch": 2.8140293637846656, "grad_norm": 0.7718183994293213, "learning_rate": 2.3882848914276205e-05, "loss": 0.1384, "step": 31050 }, { "epoch": 2.8144825086097516, "grad_norm": 0.7992652654647827, "learning_rate": 2.387494800145316e-05, "loss": 0.1246, "step": 31055 }, { "epoch": 2.814935653434838, "grad_norm": 0.8265255093574524, "learning_rate": 2.3867047201225703e-05, "loss": 0.1557, "step": 31060 }, { "epoch": 2.815388798259924, "grad_norm": 0.811365008354187, "learning_rate": 2.3859146514384548e-05, "loss": 0.1146, "step": 31065 }, { "epoch": 2.81584194308501, "grad_norm": 0.8215813636779785, "learning_rate": 2.385124594172039e-05, "loss": 0.1317, "step": 31070 }, { "epoch": 2.8162950879100963, "grad_norm": 0.7881990075111389, "learning_rate": 2.384334548402394e-05, "loss": 0.1558, "step": 31075 }, { "epoch": 2.8167482327351823, "grad_norm": 0.9793100953102112, "learning_rate": 2.383544514208585e-05, "loss": 0.1267, "step": 31080 }, { "epoch": 2.8172013775602682, "grad_norm": 0.8110396862030029, "learning_rate": 2.3827544916696805e-05, "loss": 0.1309, "step": 31085 }, { "epoch": 2.8176545223853546, "grad_norm": 0.8349232077598572, "learning_rate": 2.381964480864747e-05, "loss": 0.1377, "step": 31090 }, { "epoch": 2.8181076672104406, "grad_norm": 0.9320533871650696, "learning_rate": 2.3811744818728472e-05, "loss": 0.1217, "step": 31095 }, { "epoch": 2.8185608120355266, "grad_norm": 0.8148719668388367, "learning_rate": 2.3803844947730468e-05, "loss": 0.1131, "step": 31100 }, { "epoch": 2.8190139568606125, "grad_norm": 0.953705370426178, "learning_rate": 2.3795945196444054e-05, "loss": 0.1198, "step": 31105 }, { "epoch": 2.8194671016856985, "grad_norm": 0.7211304903030396, "learning_rate": 2.378804556565985e-05, "loss": 0.1194, "step": 31110 }, { "epoch": 2.819920246510785, "grad_norm": 0.7850508093833923, "learning_rate": 2.378014605616846e-05, "loss": 0.1357, "step": 31115 }, { "epoch": 2.820373391335871, "grad_norm": 0.834663987159729, "learning_rate": 2.3772246668760462e-05, "loss": 0.1345, "step": 31120 }, { "epoch": 2.820826536160957, "grad_norm": 0.6681488752365112, "learning_rate": 2.3764347404226444e-05, "loss": 0.1312, "step": 31125 }, { "epoch": 2.821279680986043, "grad_norm": 0.7282131910324097, "learning_rate": 2.3756448263356946e-05, "loss": 0.1136, "step": 31130 }, { "epoch": 2.821732825811129, "grad_norm": 0.9059714674949646, "learning_rate": 2.3748549246942524e-05, "loss": 0.1499, "step": 31135 }, { "epoch": 2.822185970636215, "grad_norm": 0.7167139053344727, "learning_rate": 2.3740650355773724e-05, "loss": 0.1121, "step": 31140 }, { "epoch": 2.8226391154613015, "grad_norm": 0.8615918755531311, "learning_rate": 2.373275159064105e-05, "loss": 0.1275, "step": 31145 }, { "epoch": 2.8230922602863875, "grad_norm": 0.6951270699501038, "learning_rate": 2.3724852952335032e-05, "loss": 0.1183, "step": 31150 }, { "epoch": 2.8235454051114734, "grad_norm": 0.8356525301933289, "learning_rate": 2.3716954441646168e-05, "loss": 0.1303, "step": 31155 }, { "epoch": 2.82399854993656, "grad_norm": 0.7675175666809082, "learning_rate": 2.370905605936493e-05, "loss": 0.1242, "step": 31160 }, { "epoch": 2.824451694761646, "grad_norm": 0.6923123598098755, "learning_rate": 2.3701157806281808e-05, "loss": 0.1375, "step": 31165 }, { "epoch": 2.8249048395867318, "grad_norm": 0.8100854754447937, "learning_rate": 2.3693259683187238e-05, "loss": 0.1151, "step": 31170 }, { "epoch": 2.825357984411818, "grad_norm": 0.8492275476455688, "learning_rate": 2.3685361690871685e-05, "loss": 0.1549, "step": 31175 }, { "epoch": 2.825811129236904, "grad_norm": 0.740302324295044, "learning_rate": 2.367746383012558e-05, "loss": 0.1231, "step": 31180 }, { "epoch": 2.82626427406199, "grad_norm": 0.8371584415435791, "learning_rate": 2.366956610173934e-05, "loss": 0.1241, "step": 31185 }, { "epoch": 2.8267174188870765, "grad_norm": 0.7302920818328857, "learning_rate": 2.3661668506503385e-05, "loss": 0.1246, "step": 31190 }, { "epoch": 2.8271705637121625, "grad_norm": 0.8003329038619995, "learning_rate": 2.3653771045208084e-05, "loss": 0.1449, "step": 31195 }, { "epoch": 2.8276237085372484, "grad_norm": 0.6779359579086304, "learning_rate": 2.3645873718643836e-05, "loss": 0.1196, "step": 31200 }, { "epoch": 2.828076853362335, "grad_norm": 0.9423840641975403, "learning_rate": 2.3637976527601004e-05, "loss": 0.1497, "step": 31205 }, { "epoch": 2.828529998187421, "grad_norm": 0.7289210557937622, "learning_rate": 2.363007947286994e-05, "loss": 0.1363, "step": 31210 }, { "epoch": 2.8289831430125068, "grad_norm": 0.7804199457168579, "learning_rate": 2.3622182555240996e-05, "loss": 0.1142, "step": 31215 }, { "epoch": 2.829436287837593, "grad_norm": 0.9141858816146851, "learning_rate": 2.361428577550448e-05, "loss": 0.1227, "step": 31220 }, { "epoch": 2.829889432662679, "grad_norm": 0.8235105276107788, "learning_rate": 2.3606389134450703e-05, "loss": 0.1094, "step": 31225 }, { "epoch": 2.830342577487765, "grad_norm": 0.7563514113426208, "learning_rate": 2.3598492632869987e-05, "loss": 0.1364, "step": 31230 }, { "epoch": 2.830795722312851, "grad_norm": 0.8281657099723816, "learning_rate": 2.3590596271552593e-05, "loss": 0.1326, "step": 31235 }, { "epoch": 2.8312488671379374, "grad_norm": 0.8240872025489807, "learning_rate": 2.358270005128881e-05, "loss": 0.1433, "step": 31240 }, { "epoch": 2.8317020119630234, "grad_norm": 0.9621156454086304, "learning_rate": 2.3574803972868875e-05, "loss": 0.1505, "step": 31245 }, { "epoch": 2.8321551567881094, "grad_norm": 0.7934697866439819, "learning_rate": 2.3566908037083043e-05, "loss": 0.1777, "step": 31250 }, { "epoch": 2.8326083016131953, "grad_norm": 0.8693579435348511, "learning_rate": 2.3559012244721543e-05, "loss": 0.1377, "step": 31255 }, { "epoch": 2.8330614464382817, "grad_norm": 0.8219706416130066, "learning_rate": 2.3551116596574577e-05, "loss": 0.1267, "step": 31260 }, { "epoch": 2.8335145912633677, "grad_norm": 0.8010072112083435, "learning_rate": 2.3543221093432355e-05, "loss": 0.1361, "step": 31265 }, { "epoch": 2.8339677360884536, "grad_norm": 0.7394925355911255, "learning_rate": 2.353532573608507e-05, "loss": 0.1251, "step": 31270 }, { "epoch": 2.83442088091354, "grad_norm": 0.8328939080238342, "learning_rate": 2.3527430525322864e-05, "loss": 0.1409, "step": 31275 }, { "epoch": 2.834874025738626, "grad_norm": 0.7477438449859619, "learning_rate": 2.351953546193592e-05, "loss": 0.1159, "step": 31280 }, { "epoch": 2.835327170563712, "grad_norm": 0.5979812145233154, "learning_rate": 2.351164054671436e-05, "loss": 0.1099, "step": 31285 }, { "epoch": 2.8357803153887984, "grad_norm": 0.888161838054657, "learning_rate": 2.3503745780448314e-05, "loss": 0.1311, "step": 31290 }, { "epoch": 2.8362334602138843, "grad_norm": 0.7014565467834473, "learning_rate": 2.3495851163927908e-05, "loss": 0.1191, "step": 31295 }, { "epoch": 2.8366866050389703, "grad_norm": 0.8373759388923645, "learning_rate": 2.348795669794322e-05, "loss": 0.1488, "step": 31300 }, { "epoch": 2.8371397498640567, "grad_norm": 0.7297298312187195, "learning_rate": 2.348006238328434e-05, "loss": 0.1276, "step": 31305 }, { "epoch": 2.8375928946891427, "grad_norm": 1.0078805685043335, "learning_rate": 2.3472168220741324e-05, "loss": 0.1614, "step": 31310 }, { "epoch": 2.8380460395142286, "grad_norm": 0.7459198236465454, "learning_rate": 2.346427421110423e-05, "loss": 0.1503, "step": 31315 }, { "epoch": 2.838499184339315, "grad_norm": 0.8642480373382568, "learning_rate": 2.3456380355163103e-05, "loss": 0.1275, "step": 31320 }, { "epoch": 2.838952329164401, "grad_norm": 0.8440015316009521, "learning_rate": 2.344848665370794e-05, "loss": 0.1187, "step": 31325 }, { "epoch": 2.839405473989487, "grad_norm": 0.720894455909729, "learning_rate": 2.3440593107528767e-05, "loss": 0.156, "step": 31330 }, { "epoch": 2.8398586188145734, "grad_norm": 0.6836358904838562, "learning_rate": 2.343269971741555e-05, "loss": 0.1134, "step": 31335 }, { "epoch": 2.8403117636396593, "grad_norm": 0.7534887790679932, "learning_rate": 2.3424806484158282e-05, "loss": 0.113, "step": 31340 }, { "epoch": 2.8407649084647453, "grad_norm": 1.021977186203003, "learning_rate": 2.3416913408546913e-05, "loss": 0.1275, "step": 31345 }, { "epoch": 2.8412180532898317, "grad_norm": 0.9041578769683838, "learning_rate": 2.3409020491371382e-05, "loss": 0.1357, "step": 31350 }, { "epoch": 2.8416711981149176, "grad_norm": 0.7653954029083252, "learning_rate": 2.3401127733421634e-05, "loss": 0.1263, "step": 31355 }, { "epoch": 2.8421243429400036, "grad_norm": 0.7391716837882996, "learning_rate": 2.3393235135487545e-05, "loss": 0.1512, "step": 31360 }, { "epoch": 2.8425774877650896, "grad_norm": 0.7653791308403015, "learning_rate": 2.3385342698359027e-05, "loss": 0.1048, "step": 31365 }, { "epoch": 2.843030632590176, "grad_norm": 0.7905849814414978, "learning_rate": 2.3377450422825967e-05, "loss": 0.1365, "step": 31370 }, { "epoch": 2.843483777415262, "grad_norm": 0.8489323258399963, "learning_rate": 2.336955830967821e-05, "loss": 0.1442, "step": 31375 }, { "epoch": 2.843936922240348, "grad_norm": 0.9363840818405151, "learning_rate": 2.336166635970562e-05, "loss": 0.1322, "step": 31380 }, { "epoch": 2.844390067065434, "grad_norm": 0.8175262212753296, "learning_rate": 2.3353774573698e-05, "loss": 0.131, "step": 31385 }, { "epoch": 2.8448432118905203, "grad_norm": 0.7173205018043518, "learning_rate": 2.334588295244518e-05, "loss": 0.1265, "step": 31390 }, { "epoch": 2.845296356715606, "grad_norm": 0.8378481268882751, "learning_rate": 2.333799149673696e-05, "loss": 0.1131, "step": 31395 }, { "epoch": 2.845749501540692, "grad_norm": 0.9205868244171143, "learning_rate": 2.3330100207363107e-05, "loss": 0.1538, "step": 31400 }, { "epoch": 2.8462026463657786, "grad_norm": 0.914458155632019, "learning_rate": 2.332220908511339e-05, "loss": 0.1202, "step": 31405 }, { "epoch": 2.8466557911908645, "grad_norm": 0.9207897782325745, "learning_rate": 2.331431813077756e-05, "loss": 0.1362, "step": 31410 }, { "epoch": 2.8471089360159505, "grad_norm": 0.8131842017173767, "learning_rate": 2.330642734514534e-05, "loss": 0.1338, "step": 31415 }, { "epoch": 2.847562080841037, "grad_norm": 0.7804493308067322, "learning_rate": 2.3298536729006444e-05, "loss": 0.11, "step": 31420 }, { "epoch": 2.848015225666123, "grad_norm": 0.8332563042640686, "learning_rate": 2.3290646283150564e-05, "loss": 0.1437, "step": 31425 }, { "epoch": 2.848468370491209, "grad_norm": 0.9892388582229614, "learning_rate": 2.3282756008367377e-05, "loss": 0.1308, "step": 31430 }, { "epoch": 2.8489215153162952, "grad_norm": 0.6545665860176086, "learning_rate": 2.3274865905446566e-05, "loss": 0.118, "step": 31435 }, { "epoch": 2.849374660141381, "grad_norm": 0.8179240226745605, "learning_rate": 2.3266975975177744e-05, "loss": 0.1289, "step": 31440 }, { "epoch": 2.849827804966467, "grad_norm": 0.8273254036903381, "learning_rate": 2.325908621835056e-05, "loss": 0.125, "step": 31445 }, { "epoch": 2.8502809497915536, "grad_norm": 0.8356003165245056, "learning_rate": 2.3251196635754606e-05, "loss": 0.1116, "step": 31450 }, { "epoch": 2.8507340946166395, "grad_norm": 0.8196987509727478, "learning_rate": 2.324330722817949e-05, "loss": 0.1401, "step": 31455 }, { "epoch": 2.8511872394417255, "grad_norm": 0.849205732345581, "learning_rate": 2.3235417996414788e-05, "loss": 0.1395, "step": 31460 }, { "epoch": 2.851640384266812, "grad_norm": 0.8656071424484253, "learning_rate": 2.322752894125004e-05, "loss": 0.1431, "step": 31465 }, { "epoch": 2.852093529091898, "grad_norm": 0.7500385642051697, "learning_rate": 2.3219640063474796e-05, "loss": 0.1269, "step": 31470 }, { "epoch": 2.852546673916984, "grad_norm": 0.7574905157089233, "learning_rate": 2.3211751363878574e-05, "loss": 0.1316, "step": 31475 }, { "epoch": 2.85299981874207, "grad_norm": 0.6783806681632996, "learning_rate": 2.320386284325088e-05, "loss": 0.1157, "step": 31480 }, { "epoch": 2.853452963567156, "grad_norm": 0.8983184695243835, "learning_rate": 2.319597450238121e-05, "loss": 0.1495, "step": 31485 }, { "epoch": 2.853906108392242, "grad_norm": 0.7486414909362793, "learning_rate": 2.3188086342059013e-05, "loss": 0.1353, "step": 31490 }, { "epoch": 2.8543592532173285, "grad_norm": 0.8219732046127319, "learning_rate": 2.3180198363073747e-05, "loss": 0.1412, "step": 31495 }, { "epoch": 2.8548123980424145, "grad_norm": 0.6712707877159119, "learning_rate": 2.3172310566214843e-05, "loss": 0.1373, "step": 31500 }, { "epoch": 2.8552655428675005, "grad_norm": 0.8160238265991211, "learning_rate": 2.3164422952271717e-05, "loss": 0.1507, "step": 31505 }, { "epoch": 2.8557186876925864, "grad_norm": 0.7141696214675903, "learning_rate": 2.315653552203377e-05, "loss": 0.1462, "step": 31510 }, { "epoch": 2.8561718325176724, "grad_norm": 0.7554715871810913, "learning_rate": 2.3148648276290362e-05, "loss": 0.1282, "step": 31515 }, { "epoch": 2.8566249773427588, "grad_norm": 0.8042119741439819, "learning_rate": 2.3140761215830873e-05, "loss": 0.1551, "step": 31520 }, { "epoch": 2.8570781221678447, "grad_norm": 0.9982163310050964, "learning_rate": 2.3132874341444614e-05, "loss": 0.1566, "step": 31525 }, { "epoch": 2.8575312669929307, "grad_norm": 0.7682029604911804, "learning_rate": 2.3124987653920925e-05, "loss": 0.1026, "step": 31530 }, { "epoch": 2.857984411818017, "grad_norm": 0.8839163780212402, "learning_rate": 2.311710115404911e-05, "loss": 0.143, "step": 31535 }, { "epoch": 2.858437556643103, "grad_norm": 0.8015140295028687, "learning_rate": 2.3109214842618446e-05, "loss": 0.1264, "step": 31540 }, { "epoch": 2.858890701468189, "grad_norm": 0.8477754592895508, "learning_rate": 2.3101328720418196e-05, "loss": 0.113, "step": 31545 }, { "epoch": 2.8593438462932754, "grad_norm": 0.7432470321655273, "learning_rate": 2.3093442788237622e-05, "loss": 0.1612, "step": 31550 }, { "epoch": 2.8597969911183614, "grad_norm": 0.9613863229751587, "learning_rate": 2.3085557046865927e-05, "loss": 0.1341, "step": 31555 }, { "epoch": 2.8602501359434473, "grad_norm": 0.8650069236755371, "learning_rate": 2.307767149709234e-05, "loss": 0.1196, "step": 31560 }, { "epoch": 2.8607032807685338, "grad_norm": 0.7801662087440491, "learning_rate": 2.3069786139706025e-05, "loss": 0.1133, "step": 31565 }, { "epoch": 2.8611564255936197, "grad_norm": 0.7685104608535767, "learning_rate": 2.306190097549617e-05, "loss": 0.131, "step": 31570 }, { "epoch": 2.8616095704187057, "grad_norm": 0.7377440333366394, "learning_rate": 2.3054016005251933e-05, "loss": 0.1327, "step": 31575 }, { "epoch": 2.862062715243792, "grad_norm": 0.8075966835021973, "learning_rate": 2.3046131229762418e-05, "loss": 0.1409, "step": 31580 }, { "epoch": 2.862515860068878, "grad_norm": 0.8235846757888794, "learning_rate": 2.3038246649816757e-05, "loss": 0.1442, "step": 31585 }, { "epoch": 2.862969004893964, "grad_norm": 0.6971850991249084, "learning_rate": 2.3030362266204032e-05, "loss": 0.1172, "step": 31590 }, { "epoch": 2.8634221497190504, "grad_norm": 0.7906076312065125, "learning_rate": 2.302247807971331e-05, "loss": 0.1564, "step": 31595 }, { "epoch": 2.8638752945441364, "grad_norm": 0.8795026540756226, "learning_rate": 2.301459409113367e-05, "loss": 0.1311, "step": 31600 }, { "epoch": 2.8643284393692223, "grad_norm": 0.7285217046737671, "learning_rate": 2.300671030125411e-05, "loss": 0.1322, "step": 31605 }, { "epoch": 2.8647815841943087, "grad_norm": 0.8811337947845459, "learning_rate": 2.2998826710863658e-05, "loss": 0.1275, "step": 31610 }, { "epoch": 2.8652347290193947, "grad_norm": 0.9184786677360535, "learning_rate": 2.2990943320751304e-05, "loss": 0.1251, "step": 31615 }, { "epoch": 2.8656878738444806, "grad_norm": 0.7728307843208313, "learning_rate": 2.298306013170602e-05, "loss": 0.1073, "step": 31620 }, { "epoch": 2.866141018669567, "grad_norm": 0.7321602702140808, "learning_rate": 2.2975177144516774e-05, "loss": 0.1275, "step": 31625 }, { "epoch": 2.866594163494653, "grad_norm": 0.8108304738998413, "learning_rate": 2.2967294359972468e-05, "loss": 0.1356, "step": 31630 }, { "epoch": 2.867047308319739, "grad_norm": 0.9233987927436829, "learning_rate": 2.2959411778862037e-05, "loss": 0.1223, "step": 31635 }, { "epoch": 2.867500453144825, "grad_norm": 0.7410162091255188, "learning_rate": 2.295152940197436e-05, "loss": 0.1335, "step": 31640 }, { "epoch": 2.8679535979699113, "grad_norm": 0.8330317735671997, "learning_rate": 2.2943647230098313e-05, "loss": 0.1225, "step": 31645 }, { "epoch": 2.8684067427949973, "grad_norm": 0.8335797786712646, "learning_rate": 2.293576526402276e-05, "loss": 0.132, "step": 31650 }, { "epoch": 2.8688598876200833, "grad_norm": 0.7703251838684082, "learning_rate": 2.2927883504536505e-05, "loss": 0.1308, "step": 31655 }, { "epoch": 2.869313032445169, "grad_norm": 0.8157458901405334, "learning_rate": 2.2920001952428366e-05, "loss": 0.119, "step": 31660 }, { "epoch": 2.8697661772702556, "grad_norm": 0.8855977058410645, "learning_rate": 2.2912120608487143e-05, "loss": 0.122, "step": 31665 }, { "epoch": 2.8702193220953416, "grad_norm": 0.8807500600814819, "learning_rate": 2.290423947350159e-05, "loss": 0.1427, "step": 31670 }, { "epoch": 2.8706724669204275, "grad_norm": 0.7991552352905273, "learning_rate": 2.2896358548260468e-05, "loss": 0.1522, "step": 31675 }, { "epoch": 2.871125611745514, "grad_norm": 0.7483828663825989, "learning_rate": 2.2888477833552486e-05, "loss": 0.128, "step": 31680 }, { "epoch": 2.8715787565706, "grad_norm": 0.8055364489555359, "learning_rate": 2.2880597330166352e-05, "loss": 0.1379, "step": 31685 }, { "epoch": 2.872031901395686, "grad_norm": 1.01914644241333, "learning_rate": 2.2872717038890768e-05, "loss": 0.1778, "step": 31690 }, { "epoch": 2.8724850462207723, "grad_norm": 0.797950267791748, "learning_rate": 2.2864836960514363e-05, "loss": 0.1233, "step": 31695 }, { "epoch": 2.8729381910458582, "grad_norm": 0.7352201342582703, "learning_rate": 2.2856957095825814e-05, "loss": 0.1168, "step": 31700 }, { "epoch": 2.873391335870944, "grad_norm": 0.8636356592178345, "learning_rate": 2.284907744561371e-05, "loss": 0.1257, "step": 31705 }, { "epoch": 2.8738444806960306, "grad_norm": 0.7932099103927612, "learning_rate": 2.2841198010666668e-05, "loss": 0.1205, "step": 31710 }, { "epoch": 2.8742976255211166, "grad_norm": 0.9320448040962219, "learning_rate": 2.2833318791773266e-05, "loss": 0.1393, "step": 31715 }, { "epoch": 2.8747507703462025, "grad_norm": 0.8332235813140869, "learning_rate": 2.282543978972204e-05, "loss": 0.1291, "step": 31720 }, { "epoch": 2.875203915171289, "grad_norm": 0.7048521637916565, "learning_rate": 2.2817561005301545e-05, "loss": 0.1096, "step": 31725 }, { "epoch": 2.875657059996375, "grad_norm": 0.8360177278518677, "learning_rate": 2.2809682439300272e-05, "loss": 0.1599, "step": 31730 }, { "epoch": 2.876110204821461, "grad_norm": 0.6850513815879822, "learning_rate": 2.2801804092506725e-05, "loss": 0.1093, "step": 31735 }, { "epoch": 2.8765633496465473, "grad_norm": 1.153946042060852, "learning_rate": 2.2793925965709375e-05, "loss": 0.1352, "step": 31740 }, { "epoch": 2.877016494471633, "grad_norm": 0.7213010787963867, "learning_rate": 2.278604805969665e-05, "loss": 0.1075, "step": 31745 }, { "epoch": 2.877469639296719, "grad_norm": 0.7498790621757507, "learning_rate": 2.277817037525699e-05, "loss": 0.1366, "step": 31750 }, { "epoch": 2.8779227841218056, "grad_norm": 0.8367212414741516, "learning_rate": 2.2770292913178783e-05, "loss": 0.1215, "step": 31755 }, { "epoch": 2.8783759289468915, "grad_norm": 0.7926765084266663, "learning_rate": 2.276241567425042e-05, "loss": 0.1283, "step": 31760 }, { "epoch": 2.8788290737719775, "grad_norm": 0.7321791052818298, "learning_rate": 2.275453865926026e-05, "loss": 0.1193, "step": 31765 }, { "epoch": 2.8792822185970635, "grad_norm": 0.7122007012367249, "learning_rate": 2.2746661868996617e-05, "loss": 0.117, "step": 31770 }, { "epoch": 2.87973536342215, "grad_norm": 0.7220330238342285, "learning_rate": 2.2738785304247823e-05, "loss": 0.1114, "step": 31775 }, { "epoch": 2.880188508247236, "grad_norm": 0.6821263432502747, "learning_rate": 2.2730908965802153e-05, "loss": 0.1253, "step": 31780 }, { "epoch": 2.880641653072322, "grad_norm": 0.9874787926673889, "learning_rate": 2.2723032854447884e-05, "loss": 0.1387, "step": 31785 }, { "epoch": 2.8810947978974077, "grad_norm": 0.866073727607727, "learning_rate": 2.2715156970973263e-05, "loss": 0.1059, "step": 31790 }, { "epoch": 2.881547942722494, "grad_norm": 0.9240857362747192, "learning_rate": 2.27072813161665e-05, "loss": 0.1259, "step": 31795 }, { "epoch": 2.88200108754758, "grad_norm": 0.7687937617301941, "learning_rate": 2.2699405890815794e-05, "loss": 0.1147, "step": 31800 }, { "epoch": 2.882454232372666, "grad_norm": 0.7088953256607056, "learning_rate": 2.2691530695709332e-05, "loss": 0.1334, "step": 31805 }, { "epoch": 2.8829073771977525, "grad_norm": 1.0047589540481567, "learning_rate": 2.268365573163525e-05, "loss": 0.1932, "step": 31810 }, { "epoch": 2.8833605220228384, "grad_norm": 0.7615644931793213, "learning_rate": 2.26757809993817e-05, "loss": 0.1625, "step": 31815 }, { "epoch": 2.8838136668479244, "grad_norm": 0.9913521409034729, "learning_rate": 2.2667906499736757e-05, "loss": 0.1411, "step": 31820 }, { "epoch": 2.884266811673011, "grad_norm": 0.7420276403427124, "learning_rate": 2.2660032233488522e-05, "loss": 0.1102, "step": 31825 }, { "epoch": 2.8847199564980968, "grad_norm": 0.7640970349311829, "learning_rate": 2.265215820142506e-05, "loss": 0.1128, "step": 31830 }, { "epoch": 2.8851731013231827, "grad_norm": 0.7127946615219116, "learning_rate": 2.264428440433439e-05, "loss": 0.1169, "step": 31835 }, { "epoch": 2.885626246148269, "grad_norm": 0.701849639415741, "learning_rate": 2.263641084300455e-05, "loss": 0.1157, "step": 31840 }, { "epoch": 2.886079390973355, "grad_norm": 0.7503623366355896, "learning_rate": 2.26285375182235e-05, "loss": 0.1348, "step": 31845 }, { "epoch": 2.886532535798441, "grad_norm": 0.8437284231185913, "learning_rate": 2.262066443077921e-05, "loss": 0.1229, "step": 31850 }, { "epoch": 2.8869856806235275, "grad_norm": 0.7284365296363831, "learning_rate": 2.2612791581459638e-05, "loss": 0.128, "step": 31855 }, { "epoch": 2.8874388254486134, "grad_norm": 0.6533395648002625, "learning_rate": 2.2604918971052686e-05, "loss": 0.1443, "step": 31860 }, { "epoch": 2.8878919702736994, "grad_norm": 0.8553148508071899, "learning_rate": 2.2597046600346263e-05, "loss": 0.1045, "step": 31865 }, { "epoch": 2.8883451150987858, "grad_norm": 0.7800582647323608, "learning_rate": 2.258917447012822e-05, "loss": 0.1367, "step": 31870 }, { "epoch": 2.8887982599238717, "grad_norm": 0.7349948287010193, "learning_rate": 2.2581302581186407e-05, "loss": 0.1097, "step": 31875 }, { "epoch": 2.8892514047489577, "grad_norm": 0.8065287470817566, "learning_rate": 2.2573430934308663e-05, "loss": 0.132, "step": 31880 }, { "epoch": 2.889704549574044, "grad_norm": 0.6077610850334167, "learning_rate": 2.2565559530282758e-05, "loss": 0.1259, "step": 31885 }, { "epoch": 2.89015769439913, "grad_norm": 0.7141543626785278, "learning_rate": 2.255768836989649e-05, "loss": 0.1425, "step": 31890 }, { "epoch": 2.890610839224216, "grad_norm": 0.9057512283325195, "learning_rate": 2.2549817453937583e-05, "loss": 0.1237, "step": 31895 }, { "epoch": 2.8910639840493024, "grad_norm": 0.7952043414115906, "learning_rate": 2.2541946783193776e-05, "loss": 0.1214, "step": 31900 }, { "epoch": 2.8915171288743884, "grad_norm": 0.8430529832839966, "learning_rate": 2.2534076358452775e-05, "loss": 0.1268, "step": 31905 }, { "epoch": 2.8919702736994743, "grad_norm": 0.7785978317260742, "learning_rate": 2.2526206180502238e-05, "loss": 0.1392, "step": 31910 }, { "epoch": 2.8924234185245603, "grad_norm": 0.8568116426467896, "learning_rate": 2.251833625012982e-05, "loss": 0.1321, "step": 31915 }, { "epoch": 2.8928765633496467, "grad_norm": 1.0216940641403198, "learning_rate": 2.2510466568123156e-05, "loss": 0.1404, "step": 31920 }, { "epoch": 2.8933297081747327, "grad_norm": 0.8090324997901917, "learning_rate": 2.2502597135269836e-05, "loss": 0.1386, "step": 31925 }, { "epoch": 2.8937828529998186, "grad_norm": 0.8874560594558716, "learning_rate": 2.2494727952357446e-05, "loss": 0.1475, "step": 31930 }, { "epoch": 2.8942359978249046, "grad_norm": 0.7430415153503418, "learning_rate": 2.248685902017352e-05, "loss": 0.1167, "step": 31935 }, { "epoch": 2.894689142649991, "grad_norm": 0.6938379406929016, "learning_rate": 2.2478990339505593e-05, "loss": 0.1547, "step": 31940 }, { "epoch": 2.895142287475077, "grad_norm": 0.7452081441879272, "learning_rate": 2.247112191114117e-05, "loss": 0.112, "step": 31945 }, { "epoch": 2.895595432300163, "grad_norm": 0.7150058150291443, "learning_rate": 2.2463253735867717e-05, "loss": 0.1268, "step": 31950 }, { "epoch": 2.8960485771252493, "grad_norm": 0.867144763469696, "learning_rate": 2.24553858144727e-05, "loss": 0.1435, "step": 31955 }, { "epoch": 2.8965017219503353, "grad_norm": 0.8222196102142334, "learning_rate": 2.2447518147743516e-05, "loss": 0.1209, "step": 31960 }, { "epoch": 2.8969548667754212, "grad_norm": 0.812006413936615, "learning_rate": 2.243965073646758e-05, "loss": 0.1169, "step": 31965 }, { "epoch": 2.8974080116005076, "grad_norm": 0.8408561944961548, "learning_rate": 2.243178358143227e-05, "loss": 0.1118, "step": 31970 }, { "epoch": 2.8978611564255936, "grad_norm": 0.823291540145874, "learning_rate": 2.2423916683424923e-05, "loss": 0.14, "step": 31975 }, { "epoch": 2.8983143012506796, "grad_norm": 0.8013290166854858, "learning_rate": 2.2416050043232874e-05, "loss": 0.1302, "step": 31980 }, { "epoch": 2.898767446075766, "grad_norm": 0.74269700050354, "learning_rate": 2.24081836616434e-05, "loss": 0.1304, "step": 31985 }, { "epoch": 2.899220590900852, "grad_norm": 0.7546403408050537, "learning_rate": 2.240031753944378e-05, "loss": 0.1134, "step": 31990 }, { "epoch": 2.899673735725938, "grad_norm": 0.9291658997535706, "learning_rate": 2.2392451677421266e-05, "loss": 0.1146, "step": 31995 }, { "epoch": 2.9001268805510243, "grad_norm": 0.8006657361984253, "learning_rate": 2.2384586076363064e-05, "loss": 0.1259, "step": 32000 }, { "epoch": 2.9005800253761103, "grad_norm": 0.8854795694351196, "learning_rate": 2.2376720737056385e-05, "loss": 0.1187, "step": 32005 }, { "epoch": 2.901033170201196, "grad_norm": 0.8376604318618774, "learning_rate": 2.2368855660288365e-05, "loss": 0.1263, "step": 32010 }, { "epoch": 2.9014863150262826, "grad_norm": 0.8118534088134766, "learning_rate": 2.2360990846846164e-05, "loss": 0.129, "step": 32015 }, { "epoch": 2.9019394598513686, "grad_norm": 0.7932248711585999, "learning_rate": 2.2353126297516896e-05, "loss": 0.1021, "step": 32020 }, { "epoch": 2.9023926046764545, "grad_norm": 0.8683807849884033, "learning_rate": 2.2345262013087635e-05, "loss": 0.1412, "step": 32025 }, { "epoch": 2.902845749501541, "grad_norm": 0.8125703930854797, "learning_rate": 2.2337397994345467e-05, "loss": 0.1202, "step": 32030 }, { "epoch": 2.903298894326627, "grad_norm": 0.8603363633155823, "learning_rate": 2.232953424207739e-05, "loss": 0.1249, "step": 32035 }, { "epoch": 2.903752039151713, "grad_norm": 0.9795196652412415, "learning_rate": 2.2321670757070433e-05, "loss": 0.1345, "step": 32040 }, { "epoch": 2.904205183976799, "grad_norm": 0.7970513105392456, "learning_rate": 2.2313807540111574e-05, "loss": 0.1242, "step": 32045 }, { "epoch": 2.9046583288018852, "grad_norm": 0.8577406406402588, "learning_rate": 2.230594459198776e-05, "loss": 0.1249, "step": 32050 }, { "epoch": 2.905111473626971, "grad_norm": 0.8671483993530273, "learning_rate": 2.2298081913485925e-05, "loss": 0.1078, "step": 32055 }, { "epoch": 2.905564618452057, "grad_norm": 0.8358308672904968, "learning_rate": 2.2290219505392973e-05, "loss": 0.1354, "step": 32060 }, { "epoch": 2.906017763277143, "grad_norm": 0.7210385203361511, "learning_rate": 2.228235736849576e-05, "loss": 0.0952, "step": 32065 }, { "epoch": 2.9064709081022295, "grad_norm": 0.8455272912979126, "learning_rate": 2.227449550358115e-05, "loss": 0.1192, "step": 32070 }, { "epoch": 2.9069240529273155, "grad_norm": 0.9187964797019958, "learning_rate": 2.2266633911435937e-05, "loss": 0.124, "step": 32075 }, { "epoch": 2.9073771977524014, "grad_norm": 0.8972232937812805, "learning_rate": 2.2258772592846933e-05, "loss": 0.1422, "step": 32080 }, { "epoch": 2.907830342577488, "grad_norm": 0.7717419862747192, "learning_rate": 2.2250911548600896e-05, "loss": 0.1097, "step": 32085 }, { "epoch": 2.908283487402574, "grad_norm": 0.7802886962890625, "learning_rate": 2.2243050779484557e-05, "loss": 0.1284, "step": 32090 }, { "epoch": 2.9087366322276598, "grad_norm": 0.7826052308082581, "learning_rate": 2.2235190286284637e-05, "loss": 0.1178, "step": 32095 }, { "epoch": 2.909189777052746, "grad_norm": 0.8749125003814697, "learning_rate": 2.2227330069787795e-05, "loss": 0.1171, "step": 32100 }, { "epoch": 2.909642921877832, "grad_norm": 0.9209553003311157, "learning_rate": 2.2219470130780702e-05, "loss": 0.1507, "step": 32105 }, { "epoch": 2.910096066702918, "grad_norm": 0.8039694428443909, "learning_rate": 2.221161047004998e-05, "loss": 0.1199, "step": 32110 }, { "epoch": 2.9105492115280045, "grad_norm": 0.8201670050621033, "learning_rate": 2.220375108838222e-05, "loss": 0.1044, "step": 32115 }, { "epoch": 2.9110023563530905, "grad_norm": 0.8314396142959595, "learning_rate": 2.219589198656401e-05, "loss": 0.1256, "step": 32120 }, { "epoch": 2.9114555011781764, "grad_norm": 0.8430472612380981, "learning_rate": 2.2188033165381867e-05, "loss": 0.1314, "step": 32125 }, { "epoch": 2.911908646003263, "grad_norm": 0.6313696503639221, "learning_rate": 2.218017462562231e-05, "loss": 0.0929, "step": 32130 }, { "epoch": 2.912361790828349, "grad_norm": 0.8449990749359131, "learning_rate": 2.217231636807184e-05, "loss": 0.1314, "step": 32135 }, { "epoch": 2.9128149356534347, "grad_norm": 0.7298390865325928, "learning_rate": 2.2164458393516897e-05, "loss": 0.1082, "step": 32140 }, { "epoch": 2.913268080478521, "grad_norm": 0.8512391448020935, "learning_rate": 2.2156600702743928e-05, "loss": 0.1481, "step": 32145 }, { "epoch": 2.913721225303607, "grad_norm": 0.9676353931427002, "learning_rate": 2.214874329653931e-05, "loss": 0.1323, "step": 32150 }, { "epoch": 2.914174370128693, "grad_norm": 0.8022176027297974, "learning_rate": 2.2140886175689428e-05, "loss": 0.1083, "step": 32155 }, { "epoch": 2.9146275149537795, "grad_norm": 0.786887526512146, "learning_rate": 2.2133029340980627e-05, "loss": 0.133, "step": 32160 }, { "epoch": 2.9150806597788654, "grad_norm": 0.8085120916366577, "learning_rate": 2.2125172793199216e-05, "loss": 0.1254, "step": 32165 }, { "epoch": 2.9155338046039514, "grad_norm": 0.7773489952087402, "learning_rate": 2.211731653313149e-05, "loss": 0.1129, "step": 32170 }, { "epoch": 2.9159869494290374, "grad_norm": 0.7812919616699219, "learning_rate": 2.2109460561563694e-05, "loss": 0.1255, "step": 32175 }, { "epoch": 2.9164400942541238, "grad_norm": 0.7065467238426208, "learning_rate": 2.2101604879282063e-05, "loss": 0.1257, "step": 32180 }, { "epoch": 2.9168932390792097, "grad_norm": 0.8249679803848267, "learning_rate": 2.2093749487072797e-05, "loss": 0.1312, "step": 32185 }, { "epoch": 2.9173463839042957, "grad_norm": 0.7920220494270325, "learning_rate": 2.2085894385722062e-05, "loss": 0.1078, "step": 32190 }, { "epoch": 2.9177995287293816, "grad_norm": 0.7914595007896423, "learning_rate": 2.2078039576016e-05, "loss": 0.1312, "step": 32195 }, { "epoch": 2.918252673554468, "grad_norm": 0.8157780170440674, "learning_rate": 2.2070185058740735e-05, "loss": 0.1364, "step": 32200 }, { "epoch": 2.918705818379554, "grad_norm": 0.8133488297462463, "learning_rate": 2.2062330834682332e-05, "loss": 0.1052, "step": 32205 }, { "epoch": 2.91915896320464, "grad_norm": 0.737430214881897, "learning_rate": 2.2054476904626856e-05, "loss": 0.1173, "step": 32210 }, { "epoch": 2.9196121080297264, "grad_norm": 0.8004975914955139, "learning_rate": 2.204662326936032e-05, "loss": 0.1156, "step": 32215 }, { "epoch": 2.9200652528548123, "grad_norm": 0.6831725835800171, "learning_rate": 2.2038769929668728e-05, "loss": 0.1253, "step": 32220 }, { "epoch": 2.9205183976798983, "grad_norm": 0.8011335134506226, "learning_rate": 2.203091688633805e-05, "loss": 0.1129, "step": 32225 }, { "epoch": 2.9209715425049847, "grad_norm": 0.7897927165031433, "learning_rate": 2.202306414015421e-05, "loss": 0.1333, "step": 32230 }, { "epoch": 2.9214246873300707, "grad_norm": 0.8511306047439575, "learning_rate": 2.2015211691903124e-05, "loss": 0.121, "step": 32235 }, { "epoch": 2.9218778321551566, "grad_norm": 0.6859016418457031, "learning_rate": 2.200735954237065e-05, "loss": 0.1275, "step": 32240 }, { "epoch": 2.922330976980243, "grad_norm": 0.6968622207641602, "learning_rate": 2.1999507692342635e-05, "loss": 0.1338, "step": 32245 }, { "epoch": 2.922784121805329, "grad_norm": 0.8633404970169067, "learning_rate": 2.1991656142604927e-05, "loss": 0.1483, "step": 32250 }, { "epoch": 2.923237266630415, "grad_norm": 0.8260207176208496, "learning_rate": 2.1983804893943274e-05, "loss": 0.1148, "step": 32255 }, { "epoch": 2.9236904114555013, "grad_norm": 0.815863847732544, "learning_rate": 2.197595394714346e-05, "loss": 0.1369, "step": 32260 }, { "epoch": 2.9241435562805873, "grad_norm": 0.8232970833778381, "learning_rate": 2.1968103302991185e-05, "loss": 0.134, "step": 32265 }, { "epoch": 2.9245967011056733, "grad_norm": 0.7245250344276428, "learning_rate": 2.196025296227216e-05, "loss": 0.105, "step": 32270 }, { "epoch": 2.9250498459307597, "grad_norm": 0.6918197274208069, "learning_rate": 2.1952402925772048e-05, "loss": 0.1205, "step": 32275 }, { "epoch": 2.9255029907558456, "grad_norm": 0.7398641109466553, "learning_rate": 2.1944553194276478e-05, "loss": 0.1235, "step": 32280 }, { "epoch": 2.9259561355809316, "grad_norm": 1.090011715888977, "learning_rate": 2.1936703768571067e-05, "loss": 0.147, "step": 32285 }, { "epoch": 2.926409280406018, "grad_norm": 0.6561744213104248, "learning_rate": 2.1928854649441367e-05, "loss": 0.1286, "step": 32290 }, { "epoch": 2.926862425231104, "grad_norm": 0.8542864322662354, "learning_rate": 2.1921005837672934e-05, "loss": 0.1181, "step": 32295 }, { "epoch": 2.92731557005619, "grad_norm": 0.8367809653282166, "learning_rate": 2.1913157334051283e-05, "loss": 0.1273, "step": 32300 }, { "epoch": 2.9277687148812763, "grad_norm": 0.7710029482841492, "learning_rate": 2.190530913936188e-05, "loss": 0.1305, "step": 32305 }, { "epoch": 2.9282218597063623, "grad_norm": 0.8118173480033875, "learning_rate": 2.1897461254390186e-05, "loss": 0.1143, "step": 32310 }, { "epoch": 2.9286750045314482, "grad_norm": 0.8722488284111023, "learning_rate": 2.188961367992163e-05, "loss": 0.1091, "step": 32315 }, { "epoch": 2.929128149356534, "grad_norm": 0.7838241457939148, "learning_rate": 2.188176641674158e-05, "loss": 0.1226, "step": 32320 }, { "epoch": 2.9295812941816206, "grad_norm": 0.9292865991592407, "learning_rate": 2.1873919465635403e-05, "loss": 0.1367, "step": 32325 }, { "epoch": 2.9300344390067066, "grad_norm": 0.692358136177063, "learning_rate": 2.186607282738842e-05, "loss": 0.1182, "step": 32330 }, { "epoch": 2.9304875838317925, "grad_norm": 0.9649110436439514, "learning_rate": 2.1858226502785928e-05, "loss": 0.1248, "step": 32335 }, { "epoch": 2.9309407286568785, "grad_norm": 0.7642430067062378, "learning_rate": 2.18503804926132e-05, "loss": 0.1217, "step": 32340 }, { "epoch": 2.931393873481965, "grad_norm": 0.8056466579437256, "learning_rate": 2.1842534797655442e-05, "loss": 0.1338, "step": 32345 }, { "epoch": 2.931847018307051, "grad_norm": 0.851441502571106, "learning_rate": 2.183468941869788e-05, "loss": 0.1248, "step": 32350 }, { "epoch": 2.932300163132137, "grad_norm": 0.754078209400177, "learning_rate": 2.182684435652566e-05, "loss": 0.1275, "step": 32355 }, { "epoch": 2.932753307957223, "grad_norm": 0.7392402291297913, "learning_rate": 2.1818999611923936e-05, "loss": 0.1143, "step": 32360 }, { "epoch": 2.933206452782309, "grad_norm": 0.8322813510894775, "learning_rate": 2.181115518567781e-05, "loss": 0.1251, "step": 32365 }, { "epoch": 2.933659597607395, "grad_norm": 0.7147146463394165, "learning_rate": 2.1803311078572347e-05, "loss": 0.1139, "step": 32370 }, { "epoch": 2.9341127424324815, "grad_norm": 0.7304614782333374, "learning_rate": 2.1795467291392593e-05, "loss": 0.1339, "step": 32375 }, { "epoch": 2.9345658872575675, "grad_norm": 0.7887855172157288, "learning_rate": 2.1787623824923552e-05, "loss": 0.1084, "step": 32380 }, { "epoch": 2.9350190320826535, "grad_norm": 0.8102666735649109, "learning_rate": 2.1779780679950203e-05, "loss": 0.1228, "step": 32385 }, { "epoch": 2.93547217690774, "grad_norm": 0.6981595754623413, "learning_rate": 2.1771937857257508e-05, "loss": 0.1275, "step": 32390 }, { "epoch": 2.935925321732826, "grad_norm": 0.7100594639778137, "learning_rate": 2.1764095357630345e-05, "loss": 0.1112, "step": 32395 }, { "epoch": 2.936378466557912, "grad_norm": 0.7711502909660339, "learning_rate": 2.1756253181853622e-05, "loss": 0.1253, "step": 32400 }, { "epoch": 2.936831611382998, "grad_norm": 0.8278403282165527, "learning_rate": 2.174841133071217e-05, "loss": 0.1279, "step": 32405 }, { "epoch": 2.937284756208084, "grad_norm": 0.6449916958808899, "learning_rate": 2.1740569804990814e-05, "loss": 0.1157, "step": 32410 }, { "epoch": 2.93773790103317, "grad_norm": 0.8979370594024658, "learning_rate": 2.1732728605474346e-05, "loss": 0.1157, "step": 32415 }, { "epoch": 2.9381910458582565, "grad_norm": 0.7319280505180359, "learning_rate": 2.172488773294749e-05, "loss": 0.1106, "step": 32420 }, { "epoch": 2.9386441906833425, "grad_norm": 0.8558738231658936, "learning_rate": 2.1717047188194987e-05, "loss": 0.1178, "step": 32425 }, { "epoch": 2.9390973355084284, "grad_norm": 0.7255055904388428, "learning_rate": 2.17092069720015e-05, "loss": 0.119, "step": 32430 }, { "epoch": 2.939550480333515, "grad_norm": 0.7189823389053345, "learning_rate": 2.1701367085151692e-05, "loss": 0.1087, "step": 32435 }, { "epoch": 2.940003625158601, "grad_norm": 0.7817735075950623, "learning_rate": 2.1693527528430187e-05, "loss": 0.1, "step": 32440 }, { "epoch": 2.9404567699836868, "grad_norm": 0.7216088175773621, "learning_rate": 2.168568830262156e-05, "loss": 0.1062, "step": 32445 }, { "epoch": 2.9409099148087727, "grad_norm": 0.8561839461326599, "learning_rate": 2.167784940851037e-05, "loss": 0.1187, "step": 32450 }, { "epoch": 2.941363059633859, "grad_norm": 0.8819250464439392, "learning_rate": 2.1670010846881144e-05, "loss": 0.1462, "step": 32455 }, { "epoch": 2.941816204458945, "grad_norm": 0.8434649705886841, "learning_rate": 2.1662172618518346e-05, "loss": 0.1405, "step": 32460 }, { "epoch": 2.942269349284031, "grad_norm": 0.7312418222427368, "learning_rate": 2.1654334724206444e-05, "loss": 0.0966, "step": 32465 }, { "epoch": 2.942722494109117, "grad_norm": 0.9160321950912476, "learning_rate": 2.164649716472985e-05, "loss": 0.1393, "step": 32470 }, { "epoch": 2.9431756389342034, "grad_norm": 0.753891110420227, "learning_rate": 2.1638659940872954e-05, "loss": 0.1349, "step": 32475 }, { "epoch": 2.9436287837592894, "grad_norm": 0.7942816615104675, "learning_rate": 2.1630823053420117e-05, "loss": 0.1288, "step": 32480 }, { "epoch": 2.9440819285843753, "grad_norm": 0.9765668511390686, "learning_rate": 2.162298650315564e-05, "loss": 0.1259, "step": 32485 }, { "epoch": 2.9445350734094617, "grad_norm": 0.7292512655258179, "learning_rate": 2.161515029086382e-05, "loss": 0.109, "step": 32490 }, { "epoch": 2.9449882182345477, "grad_norm": 0.6778416037559509, "learning_rate": 2.16073144173289e-05, "loss": 0.1251, "step": 32495 }, { "epoch": 2.9454413630596337, "grad_norm": 0.6999648213386536, "learning_rate": 2.159947888333509e-05, "loss": 0.1008, "step": 32500 }, { "epoch": 2.94589450788472, "grad_norm": 0.7282708883285522, "learning_rate": 2.1591643689666602e-05, "loss": 0.1332, "step": 32505 }, { "epoch": 2.946347652709806, "grad_norm": 0.8369594216346741, "learning_rate": 2.1583808837107554e-05, "loss": 0.1266, "step": 32510 }, { "epoch": 2.946800797534892, "grad_norm": 0.7365009784698486, "learning_rate": 2.1575974326442077e-05, "loss": 0.109, "step": 32515 }, { "epoch": 2.9472539423599784, "grad_norm": 0.7871037125587463, "learning_rate": 2.156814015845424e-05, "loss": 0.1132, "step": 32520 }, { "epoch": 2.9477070871850644, "grad_norm": 0.8404465913772583, "learning_rate": 2.15603063339281e-05, "loss": 0.1134, "step": 32525 }, { "epoch": 2.9481602320101503, "grad_norm": 0.6793585419654846, "learning_rate": 2.1552472853647676e-05, "loss": 0.1247, "step": 32530 }, { "epoch": 2.9486133768352367, "grad_norm": 0.7037976980209351, "learning_rate": 2.154463971839692e-05, "loss": 0.1017, "step": 32535 }, { "epoch": 2.9490665216603227, "grad_norm": 0.777864933013916, "learning_rate": 2.1536806928959797e-05, "loss": 0.1124, "step": 32540 }, { "epoch": 2.9495196664854086, "grad_norm": 0.8116819858551025, "learning_rate": 2.15289744861202e-05, "loss": 0.111, "step": 32545 }, { "epoch": 2.949972811310495, "grad_norm": 0.8779160380363464, "learning_rate": 2.1521142390662008e-05, "loss": 0.1171, "step": 32550 }, { "epoch": 2.950425956135581, "grad_norm": 0.8429458141326904, "learning_rate": 2.1513310643369076e-05, "loss": 0.1371, "step": 32555 }, { "epoch": 2.950879100960667, "grad_norm": 0.8517260551452637, "learning_rate": 2.150547924502518e-05, "loss": 0.1309, "step": 32560 }, { "epoch": 2.9513322457857534, "grad_norm": 0.8465380072593689, "learning_rate": 2.1497648196414107e-05, "loss": 0.1137, "step": 32565 }, { "epoch": 2.9517853906108393, "grad_norm": 1.0462239980697632, "learning_rate": 2.1489817498319585e-05, "loss": 0.1251, "step": 32570 }, { "epoch": 2.9522385354359253, "grad_norm": 0.782306432723999, "learning_rate": 2.1481987151525314e-05, "loss": 0.1055, "step": 32575 }, { "epoch": 2.9526916802610113, "grad_norm": 0.6203527450561523, "learning_rate": 2.1474157156814968e-05, "loss": 0.1231, "step": 32580 }, { "epoch": 2.9531448250860977, "grad_norm": 0.8724557757377625, "learning_rate": 2.1466327514972154e-05, "loss": 0.1165, "step": 32585 }, { "epoch": 2.9535979699111836, "grad_norm": 0.6862197518348694, "learning_rate": 2.1458498226780477e-05, "loss": 0.1111, "step": 32590 }, { "epoch": 2.9540511147362696, "grad_norm": 0.6864007115364075, "learning_rate": 2.1450669293023502e-05, "loss": 0.1298, "step": 32595 }, { "epoch": 2.9545042595613555, "grad_norm": 0.8887528777122498, "learning_rate": 2.1442840714484736e-05, "loss": 0.1315, "step": 32600 }, { "epoch": 2.954957404386442, "grad_norm": 0.7364968657493591, "learning_rate": 2.1435012491947683e-05, "loss": 0.1158, "step": 32605 }, { "epoch": 2.955410549211528, "grad_norm": 0.7625244855880737, "learning_rate": 2.1427184626195778e-05, "loss": 0.1119, "step": 32610 }, { "epoch": 2.955863694036614, "grad_norm": 0.6646460890769958, "learning_rate": 2.141935711801244e-05, "loss": 0.111, "step": 32615 }, { "epoch": 2.9563168388617003, "grad_norm": 0.8368454575538635, "learning_rate": 2.1411529968181067e-05, "loss": 0.1317, "step": 32620 }, { "epoch": 2.9567699836867862, "grad_norm": 0.8483220338821411, "learning_rate": 2.1403703177484974e-05, "loss": 0.1231, "step": 32625 }, { "epoch": 2.957223128511872, "grad_norm": 0.6078668236732483, "learning_rate": 2.139587674670749e-05, "loss": 0.0994, "step": 32630 }, { "epoch": 2.9576762733369586, "grad_norm": 0.6795395016670227, "learning_rate": 2.1388050676631873e-05, "loss": 0.1245, "step": 32635 }, { "epoch": 2.9581294181620446, "grad_norm": 0.9871223568916321, "learning_rate": 2.1380224968041366e-05, "loss": 0.1343, "step": 32640 }, { "epoch": 2.9585825629871305, "grad_norm": 0.8563558459281921, "learning_rate": 2.1372399621719177e-05, "loss": 0.1201, "step": 32645 }, { "epoch": 2.959035707812217, "grad_norm": 0.7137540578842163, "learning_rate": 2.1364574638448452e-05, "loss": 0.1139, "step": 32650 }, { "epoch": 2.959488852637303, "grad_norm": 0.7380006909370422, "learning_rate": 2.1356750019012327e-05, "loss": 0.1112, "step": 32655 }, { "epoch": 2.959941997462389, "grad_norm": 0.8267508745193481, "learning_rate": 2.134892576419389e-05, "loss": 0.1242, "step": 32660 }, { "epoch": 2.9603951422874752, "grad_norm": 0.8360555768013, "learning_rate": 2.1341101874776198e-05, "loss": 0.123, "step": 32665 }, { "epoch": 2.960848287112561, "grad_norm": 0.8647732734680176, "learning_rate": 2.1333278351542276e-05, "loss": 0.1253, "step": 32670 }, { "epoch": 2.961301431937647, "grad_norm": 0.7355263829231262, "learning_rate": 2.1325455195275085e-05, "loss": 0.1056, "step": 32675 }, { "epoch": 2.9617545767627336, "grad_norm": 0.7801843285560608, "learning_rate": 2.1317632406757586e-05, "loss": 0.1171, "step": 32680 }, { "epoch": 2.9622077215878195, "grad_norm": 0.8757714629173279, "learning_rate": 2.1309809986772677e-05, "loss": 0.142, "step": 32685 }, { "epoch": 2.9626608664129055, "grad_norm": 0.8223606944084167, "learning_rate": 2.130198793610323e-05, "loss": 0.1229, "step": 32690 }, { "epoch": 2.963114011237992, "grad_norm": 0.8758775591850281, "learning_rate": 2.129416625553209e-05, "loss": 0.1213, "step": 32695 }, { "epoch": 2.963567156063078, "grad_norm": 0.8427641987800598, "learning_rate": 2.1286344945842034e-05, "loss": 0.1477, "step": 32700 }, { "epoch": 2.964020300888164, "grad_norm": 0.8084893822669983, "learning_rate": 2.1278524007815833e-05, "loss": 0.1201, "step": 32705 }, { "epoch": 2.96447344571325, "grad_norm": 0.8004051446914673, "learning_rate": 2.1270703442236215e-05, "loss": 0.1234, "step": 32710 }, { "epoch": 2.964926590538336, "grad_norm": 0.7076078653335571, "learning_rate": 2.1262883249885846e-05, "loss": 0.1126, "step": 32715 }, { "epoch": 2.965379735363422, "grad_norm": 1.0652847290039062, "learning_rate": 2.12550634315474e-05, "loss": 0.1342, "step": 32720 }, { "epoch": 2.965832880188508, "grad_norm": 0.9286179542541504, "learning_rate": 2.1247243988003458e-05, "loss": 0.1505, "step": 32725 }, { "epoch": 2.9662860250135945, "grad_norm": 0.8041060566902161, "learning_rate": 2.1239424920036607e-05, "loss": 0.1061, "step": 32730 }, { "epoch": 2.9667391698386805, "grad_norm": 0.9228474497795105, "learning_rate": 2.123160622842939e-05, "loss": 0.1281, "step": 32735 }, { "epoch": 2.9671923146637664, "grad_norm": 0.7569360733032227, "learning_rate": 2.1223787913964284e-05, "loss": 0.105, "step": 32740 }, { "epoch": 2.9676454594888524, "grad_norm": 0.8180964589118958, "learning_rate": 2.1215969977423776e-05, "loss": 0.1538, "step": 32745 }, { "epoch": 2.968098604313939, "grad_norm": 0.7514869570732117, "learning_rate": 2.120815241959026e-05, "loss": 0.132, "step": 32750 }, { "epoch": 2.9685517491390248, "grad_norm": 0.8094807267189026, "learning_rate": 2.1200335241246127e-05, "loss": 0.0941, "step": 32755 }, { "epoch": 2.9690048939641107, "grad_norm": 0.8819969892501831, "learning_rate": 2.1192518443173737e-05, "loss": 0.1081, "step": 32760 }, { "epoch": 2.969458038789197, "grad_norm": 0.795799970626831, "learning_rate": 2.1184702026155384e-05, "loss": 0.1195, "step": 32765 }, { "epoch": 2.969911183614283, "grad_norm": 0.7679431438446045, "learning_rate": 2.1176885990973348e-05, "loss": 0.1355, "step": 32770 }, { "epoch": 2.970364328439369, "grad_norm": 0.8071277141571045, "learning_rate": 2.116907033840985e-05, "loss": 0.1364, "step": 32775 }, { "epoch": 2.9708174732644554, "grad_norm": 1.0136613845825195, "learning_rate": 2.1161255069247085e-05, "loss": 0.1552, "step": 32780 }, { "epoch": 2.9712706180895414, "grad_norm": 0.7110490202903748, "learning_rate": 2.115344018426722e-05, "loss": 0.1177, "step": 32785 }, { "epoch": 2.9717237629146274, "grad_norm": 0.9113421440124512, "learning_rate": 2.1145625684252352e-05, "loss": 0.1114, "step": 32790 }, { "epoch": 2.9721769077397138, "grad_norm": 0.8287716507911682, "learning_rate": 2.1137811569984574e-05, "loss": 0.1181, "step": 32795 }, { "epoch": 2.9726300525647997, "grad_norm": 0.8670343160629272, "learning_rate": 2.1129997842245915e-05, "loss": 0.1296, "step": 32800 }, { "epoch": 2.9730831973898857, "grad_norm": 0.7093629837036133, "learning_rate": 2.1122184501818377e-05, "loss": 0.1179, "step": 32805 }, { "epoch": 2.973536342214972, "grad_norm": 0.7125111222267151, "learning_rate": 2.1114371549483937e-05, "loss": 0.1053, "step": 32810 }, { "epoch": 2.973989487040058, "grad_norm": 0.6670512557029724, "learning_rate": 2.1106558986024495e-05, "loss": 0.1155, "step": 32815 }, { "epoch": 2.974442631865144, "grad_norm": 0.9299026131629944, "learning_rate": 2.1098746812221947e-05, "loss": 0.1235, "step": 32820 }, { "epoch": 2.9748957766902304, "grad_norm": 0.8103495240211487, "learning_rate": 2.1090935028858146e-05, "loss": 0.1119, "step": 32825 }, { "epoch": 2.9753489215153164, "grad_norm": 0.6723147630691528, "learning_rate": 2.1083123636714878e-05, "loss": 0.1194, "step": 32830 }, { "epoch": 2.9758020663404023, "grad_norm": 0.7397666573524475, "learning_rate": 2.107531263657393e-05, "loss": 0.0999, "step": 32835 }, { "epoch": 2.9762552111654887, "grad_norm": 0.798862874507904, "learning_rate": 2.1067502029217016e-05, "loss": 0.1137, "step": 32840 }, { "epoch": 2.9767083559905747, "grad_norm": 0.7444459795951843, "learning_rate": 2.105969181542582e-05, "loss": 0.1085, "step": 32845 }, { "epoch": 2.9771615008156607, "grad_norm": 0.8988606929779053, "learning_rate": 2.1051881995982013e-05, "loss": 0.1118, "step": 32850 }, { "epoch": 2.9776146456407466, "grad_norm": 0.7499696016311646, "learning_rate": 2.104407257166718e-05, "loss": 0.1107, "step": 32855 }, { "epoch": 2.978067790465833, "grad_norm": 0.7145745158195496, "learning_rate": 2.1036263543262913e-05, "loss": 0.1153, "step": 32860 }, { "epoch": 2.978520935290919, "grad_norm": 0.7601265907287598, "learning_rate": 2.102845491155072e-05, "loss": 0.1072, "step": 32865 }, { "epoch": 2.978974080116005, "grad_norm": 0.8644714951515198, "learning_rate": 2.10206466773121e-05, "loss": 0.1456, "step": 32870 }, { "epoch": 2.979427224941091, "grad_norm": 0.6108025312423706, "learning_rate": 2.1012838841328513e-05, "loss": 0.1025, "step": 32875 }, { "epoch": 2.9798803697661773, "grad_norm": 0.9944037795066833, "learning_rate": 2.100503140438136e-05, "loss": 0.1664, "step": 32880 }, { "epoch": 2.9803335145912633, "grad_norm": 0.8316084742546082, "learning_rate": 2.099722436725202e-05, "loss": 0.1011, "step": 32885 }, { "epoch": 2.9807866594163492, "grad_norm": 0.693438708782196, "learning_rate": 2.098941773072181e-05, "loss": 0.1, "step": 32890 }, { "epoch": 2.9812398042414356, "grad_norm": 0.8982511162757874, "learning_rate": 2.0981611495572027e-05, "loss": 0.1173, "step": 32895 }, { "epoch": 2.9816929490665216, "grad_norm": 0.8353116512298584, "learning_rate": 2.0973805662583933e-05, "loss": 0.1073, "step": 32900 }, { "epoch": 2.9821460938916076, "grad_norm": 0.5859304070472717, "learning_rate": 2.0966000232538725e-05, "loss": 0.0986, "step": 32905 }, { "epoch": 2.982599238716694, "grad_norm": 0.7879964113235474, "learning_rate": 2.0958195206217586e-05, "loss": 0.1015, "step": 32910 }, { "epoch": 2.98305238354178, "grad_norm": 0.7947445511817932, "learning_rate": 2.0950390584401625e-05, "loss": 0.1338, "step": 32915 }, { "epoch": 2.983505528366866, "grad_norm": 0.7512179613113403, "learning_rate": 2.0942586367871945e-05, "loss": 0.1924, "step": 32920 }, { "epoch": 2.9839586731919523, "grad_norm": 0.8032301664352417, "learning_rate": 2.09347825574096e-05, "loss": 0.1475, "step": 32925 }, { "epoch": 2.9844118180170383, "grad_norm": 0.8204244375228882, "learning_rate": 2.092697915379558e-05, "loss": 0.1084, "step": 32930 }, { "epoch": 2.984864962842124, "grad_norm": 0.7223837971687317, "learning_rate": 2.091917615781088e-05, "loss": 0.1192, "step": 32935 }, { "epoch": 2.9853181076672106, "grad_norm": 0.8438956141471863, "learning_rate": 2.0911373570236394e-05, "loss": 0.1368, "step": 32940 }, { "epoch": 2.9857712524922966, "grad_norm": 0.8112822771072388, "learning_rate": 2.090357139185303e-05, "loss": 0.1416, "step": 32945 }, { "epoch": 2.9862243973173825, "grad_norm": 0.9764158129692078, "learning_rate": 2.0895769623441626e-05, "loss": 0.1305, "step": 32950 }, { "epoch": 2.986677542142469, "grad_norm": 0.7413337826728821, "learning_rate": 2.0887968265782977e-05, "loss": 0.1211, "step": 32955 }, { "epoch": 2.987130686967555, "grad_norm": 0.888697624206543, "learning_rate": 2.0880167319657858e-05, "loss": 0.1189, "step": 32960 }, { "epoch": 2.987583831792641, "grad_norm": 0.8146106600761414, "learning_rate": 2.0872366785847e-05, "loss": 0.1186, "step": 32965 }, { "epoch": 2.9880369766177273, "grad_norm": 0.7367721796035767, "learning_rate": 2.0864566665131054e-05, "loss": 0.1058, "step": 32970 }, { "epoch": 2.9884901214428132, "grad_norm": 0.7788300514221191, "learning_rate": 2.0856766958290688e-05, "loss": 0.1114, "step": 32975 }, { "epoch": 2.988943266267899, "grad_norm": 0.8644613027572632, "learning_rate": 2.0848967666106472e-05, "loss": 0.1112, "step": 32980 }, { "epoch": 2.9893964110929856, "grad_norm": 0.7153546810150146, "learning_rate": 2.0841168789358977e-05, "loss": 0.1044, "step": 32985 }, { "epoch": 2.9898495559180716, "grad_norm": 0.772641122341156, "learning_rate": 2.083337032882872e-05, "loss": 0.1066, "step": 32990 }, { "epoch": 2.9903027007431575, "grad_norm": 0.7494721412658691, "learning_rate": 2.0825572285296163e-05, "loss": 0.1027, "step": 32995 }, { "epoch": 2.9907558455682435, "grad_norm": 0.7299070358276367, "learning_rate": 2.0817774659541755e-05, "loss": 0.1316, "step": 33000 }, { "epoch": 2.9912089903933294, "grad_norm": 0.6577503681182861, "learning_rate": 2.0809977452345863e-05, "loss": 0.1119, "step": 33005 }, { "epoch": 2.991662135218416, "grad_norm": 0.969937264919281, "learning_rate": 2.0802180664488844e-05, "loss": 0.1249, "step": 33010 }, { "epoch": 2.992115280043502, "grad_norm": 0.7217673063278198, "learning_rate": 2.079438429675101e-05, "loss": 0.1153, "step": 33015 }, { "epoch": 2.9925684248685878, "grad_norm": 0.7815482020378113, "learning_rate": 2.0786588349912606e-05, "loss": 0.0994, "step": 33020 }, { "epoch": 2.993021569693674, "grad_norm": 0.7825194001197815, "learning_rate": 2.077879282475388e-05, "loss": 0.14, "step": 33025 }, { "epoch": 2.99347471451876, "grad_norm": 0.8893621563911438, "learning_rate": 2.077099772205498e-05, "loss": 0.1146, "step": 33030 }, { "epoch": 2.993927859343846, "grad_norm": 0.8578217625617981, "learning_rate": 2.076320304259606e-05, "loss": 0.1177, "step": 33035 }, { "epoch": 2.9943810041689325, "grad_norm": 0.8950655460357666, "learning_rate": 2.0755408787157218e-05, "loss": 0.1093, "step": 33040 }, { "epoch": 2.9948341489940185, "grad_norm": 0.919211208820343, "learning_rate": 2.0747614956518494e-05, "loss": 0.1139, "step": 33045 }, { "epoch": 2.9952872938191044, "grad_norm": 0.7099471092224121, "learning_rate": 2.073982155145991e-05, "loss": 0.105, "step": 33050 }, { "epoch": 2.995740438644191, "grad_norm": 0.8092036247253418, "learning_rate": 2.073202857276142e-05, "loss": 0.1156, "step": 33055 }, { "epoch": 2.9961935834692768, "grad_norm": 0.6777909994125366, "learning_rate": 2.072423602120295e-05, "loss": 0.108, "step": 33060 }, { "epoch": 2.9966467282943627, "grad_norm": 0.7864583730697632, "learning_rate": 2.0716443897564393e-05, "loss": 0.1291, "step": 33065 }, { "epoch": 2.997099873119449, "grad_norm": 0.7236821055412292, "learning_rate": 2.0708652202625567e-05, "loss": 0.1155, "step": 33070 }, { "epoch": 2.997553017944535, "grad_norm": 0.8639704585075378, "learning_rate": 2.0700860937166293e-05, "loss": 0.1243, "step": 33075 }, { "epoch": 2.998006162769621, "grad_norm": 0.8195403814315796, "learning_rate": 2.0693070101966302e-05, "loss": 0.1417, "step": 33080 }, { "epoch": 2.9984593075947075, "grad_norm": 0.707187831401825, "learning_rate": 2.068527969780531e-05, "loss": 0.1163, "step": 33085 }, { "epoch": 2.9989124524197934, "grad_norm": 0.6237666010856628, "learning_rate": 2.067748972546299e-05, "loss": 0.1212, "step": 33090 }, { "epoch": 2.9993655972448794, "grad_norm": 0.6945413947105408, "learning_rate": 2.0669700185718953e-05, "loss": 0.104, "step": 33095 }, { "epoch": 2.999818742069966, "grad_norm": 0.686066746711731, "learning_rate": 2.0661911079352784e-05, "loss": 0.1022, "step": 33100 }, { "epoch": 3.0002718868950518, "grad_norm": 0.6974451541900635, "learning_rate": 2.0654122407144034e-05, "loss": 0.0942, "step": 33105 }, { "epoch": 3.0007250317201377, "grad_norm": 0.646852433681488, "learning_rate": 2.064633416987217e-05, "loss": 0.0656, "step": 33110 }, { "epoch": 3.0011781765452237, "grad_norm": 0.8565932512283325, "learning_rate": 2.063854636831666e-05, "loss": 0.0825, "step": 33115 }, { "epoch": 3.00163132137031, "grad_norm": 0.7267903089523315, "learning_rate": 2.0630759003256896e-05, "loss": 0.0726, "step": 33120 }, { "epoch": 3.002084466195396, "grad_norm": 0.7458726763725281, "learning_rate": 2.062297207547225e-05, "loss": 0.0819, "step": 33125 }, { "epoch": 3.002537611020482, "grad_norm": 0.6945560574531555, "learning_rate": 2.061518558574205e-05, "loss": 0.0653, "step": 33130 }, { "epoch": 3.0029907558455684, "grad_norm": 0.6550741791725159, "learning_rate": 2.0607399534845547e-05, "loss": 0.0653, "step": 33135 }, { "epoch": 3.0034439006706544, "grad_norm": 0.608694851398468, "learning_rate": 2.0599613923561995e-05, "loss": 0.0844, "step": 33140 }, { "epoch": 3.0038970454957403, "grad_norm": 0.6656638979911804, "learning_rate": 2.059182875267055e-05, "loss": 0.0801, "step": 33145 }, { "epoch": 3.0043501903208267, "grad_norm": 0.6413872241973877, "learning_rate": 2.0584044022950387e-05, "loss": 0.0683, "step": 33150 }, { "epoch": 3.0048033351459127, "grad_norm": 0.6317963600158691, "learning_rate": 2.0576259735180596e-05, "loss": 0.0754, "step": 33155 }, { "epoch": 3.0052564799709987, "grad_norm": 0.7352420687675476, "learning_rate": 2.0568475890140215e-05, "loss": 0.08, "step": 33160 }, { "epoch": 3.0057096247960846, "grad_norm": 0.6267818808555603, "learning_rate": 2.0560692488608278e-05, "loss": 0.0728, "step": 33165 }, { "epoch": 3.006162769621171, "grad_norm": 0.652280330657959, "learning_rate": 2.0552909531363723e-05, "loss": 0.0629, "step": 33170 }, { "epoch": 3.006615914446257, "grad_norm": 0.6975553035736084, "learning_rate": 2.054512701918549e-05, "loss": 0.0659, "step": 33175 }, { "epoch": 3.007069059271343, "grad_norm": 0.6955904960632324, "learning_rate": 2.0537344952852453e-05, "loss": 0.0699, "step": 33180 }, { "epoch": 3.0075222040964293, "grad_norm": 0.6061315536499023, "learning_rate": 2.0529563333143438e-05, "loss": 0.0644, "step": 33185 }, { "epoch": 3.0079753489215153, "grad_norm": 0.6783926486968994, "learning_rate": 2.0521782160837243e-05, "loss": 0.0618, "step": 33190 }, { "epoch": 3.0084284937466013, "grad_norm": 0.631397008895874, "learning_rate": 2.0514001436712596e-05, "loss": 0.0602, "step": 33195 }, { "epoch": 3.0088816385716877, "grad_norm": 0.58910071849823, "learning_rate": 2.0506221161548202e-05, "loss": 0.0668, "step": 33200 }, { "epoch": 3.0093347833967736, "grad_norm": 0.8165273666381836, "learning_rate": 2.0498441336122717e-05, "loss": 0.1085, "step": 33205 }, { "epoch": 3.0097879282218596, "grad_norm": 0.7937790155410767, "learning_rate": 2.049066196121474e-05, "loss": 0.0724, "step": 33210 }, { "epoch": 3.010241073046946, "grad_norm": 0.6267117857933044, "learning_rate": 2.048288303760284e-05, "loss": 0.0687, "step": 33215 }, { "epoch": 3.010694217872032, "grad_norm": 0.5867065191268921, "learning_rate": 2.0475104566065543e-05, "loss": 0.0762, "step": 33220 }, { "epoch": 3.011147362697118, "grad_norm": 0.5852190256118774, "learning_rate": 2.0467326547381297e-05, "loss": 0.0802, "step": 33225 }, { "epoch": 3.0116005075222043, "grad_norm": 0.7285323143005371, "learning_rate": 2.045954898232855e-05, "loss": 0.07, "step": 33230 }, { "epoch": 3.0120536523472903, "grad_norm": 0.6380605101585388, "learning_rate": 2.0451771871685672e-05, "loss": 0.0718, "step": 33235 }, { "epoch": 3.0125067971723762, "grad_norm": 0.5701432824134827, "learning_rate": 2.0443995216231e-05, "loss": 0.0718, "step": 33240 }, { "epoch": 3.012959941997462, "grad_norm": 0.755183219909668, "learning_rate": 2.0436219016742837e-05, "loss": 0.0862, "step": 33245 }, { "epoch": 3.0134130868225486, "grad_norm": 0.6955975890159607, "learning_rate": 2.042844327399941e-05, "loss": 0.0711, "step": 33250 }, { "epoch": 3.0138662316476346, "grad_norm": 0.9417277574539185, "learning_rate": 2.0420667988778926e-05, "loss": 0.0848, "step": 33255 }, { "epoch": 3.0143193764727205, "grad_norm": 0.7354271411895752, "learning_rate": 2.0412893161859536e-05, "loss": 0.0716, "step": 33260 }, { "epoch": 3.014772521297807, "grad_norm": 0.6202910542488098, "learning_rate": 2.040511879401935e-05, "loss": 0.0661, "step": 33265 }, { "epoch": 3.015225666122893, "grad_norm": 0.5867742300033569, "learning_rate": 2.039734488603644e-05, "loss": 0.0724, "step": 33270 }, { "epoch": 3.015678810947979, "grad_norm": 0.6285318732261658, "learning_rate": 2.038957143868879e-05, "loss": 0.0729, "step": 33275 }, { "epoch": 3.0161319557730653, "grad_norm": 0.7627113461494446, "learning_rate": 2.0381798452754404e-05, "loss": 0.0692, "step": 33280 }, { "epoch": 3.016585100598151, "grad_norm": 0.6893037557601929, "learning_rate": 2.037402592901118e-05, "loss": 0.0723, "step": 33285 }, { "epoch": 3.017038245423237, "grad_norm": 0.7498071193695068, "learning_rate": 2.0366253868237008e-05, "loss": 0.0675, "step": 33290 }, { "epoch": 3.0174913902483236, "grad_norm": 0.6378016471862793, "learning_rate": 2.0358482271209723e-05, "loss": 0.0735, "step": 33295 }, { "epoch": 3.0179445350734095, "grad_norm": 0.6950735449790955, "learning_rate": 2.035071113870709e-05, "loss": 0.0699, "step": 33300 }, { "epoch": 3.0183976798984955, "grad_norm": 0.5870081186294556, "learning_rate": 2.0342940471506868e-05, "loss": 0.0803, "step": 33305 }, { "epoch": 3.0188508247235815, "grad_norm": 0.5953757762908936, "learning_rate": 2.033517027038673e-05, "loss": 0.0694, "step": 33310 }, { "epoch": 3.019303969548668, "grad_norm": 0.6249048113822937, "learning_rate": 2.032740053612433e-05, "loss": 0.0692, "step": 33315 }, { "epoch": 3.019757114373754, "grad_norm": 0.571973443031311, "learning_rate": 2.031963126949727e-05, "loss": 0.0658, "step": 33320 }, { "epoch": 3.02021025919884, "grad_norm": 0.6736738681793213, "learning_rate": 2.0311862471283088e-05, "loss": 0.0648, "step": 33325 }, { "epoch": 3.020663404023926, "grad_norm": 0.8494826555252075, "learning_rate": 2.0304094142259307e-05, "loss": 0.0848, "step": 33330 }, { "epoch": 3.021116548849012, "grad_norm": 0.6350685358047485, "learning_rate": 2.0296326283203357e-05, "loss": 0.0674, "step": 33335 }, { "epoch": 3.021569693674098, "grad_norm": 0.5083855986595154, "learning_rate": 2.0288558894892663e-05, "loss": 0.0588, "step": 33340 }, { "epoch": 3.0220228384991845, "grad_norm": 0.6902773380279541, "learning_rate": 2.0280791978104594e-05, "loss": 0.0719, "step": 33345 }, { "epoch": 3.0224759833242705, "grad_norm": 0.6157683730125427, "learning_rate": 2.0273025533616454e-05, "loss": 0.064, "step": 33350 }, { "epoch": 3.0229291281493564, "grad_norm": 0.7839102745056152, "learning_rate": 2.0265259562205517e-05, "loss": 0.071, "step": 33355 }, { "epoch": 3.023382272974443, "grad_norm": 0.6094021201133728, "learning_rate": 2.0257494064649015e-05, "loss": 0.0627, "step": 33360 }, { "epoch": 3.023835417799529, "grad_norm": 0.6397721171379089, "learning_rate": 2.0249729041724097e-05, "loss": 0.0695, "step": 33365 }, { "epoch": 3.0242885626246148, "grad_norm": 0.6848085522651672, "learning_rate": 2.0241964494207907e-05, "loss": 0.0615, "step": 33370 }, { "epoch": 3.0247417074497007, "grad_norm": 0.5850020051002502, "learning_rate": 2.0234200422877516e-05, "loss": 0.0799, "step": 33375 }, { "epoch": 3.025194852274787, "grad_norm": 0.6743781566619873, "learning_rate": 2.0226436828509958e-05, "loss": 0.0677, "step": 33380 }, { "epoch": 3.025647997099873, "grad_norm": 0.628790557384491, "learning_rate": 2.0218673711882223e-05, "loss": 0.0634, "step": 33385 }, { "epoch": 3.026101141924959, "grad_norm": 0.6437466144561768, "learning_rate": 2.0210911073771233e-05, "loss": 0.0703, "step": 33390 }, { "epoch": 3.0265542867500455, "grad_norm": 0.7002527117729187, "learning_rate": 2.0203148914953884e-05, "loss": 0.083, "step": 33395 }, { "epoch": 3.0270074315751314, "grad_norm": 0.46072059869766235, "learning_rate": 2.0195387236207007e-05, "loss": 0.0645, "step": 33400 }, { "epoch": 3.0274605764002174, "grad_norm": 0.5682394504547119, "learning_rate": 2.0187626038307405e-05, "loss": 0.058, "step": 33405 }, { "epoch": 3.027913721225304, "grad_norm": 0.6283129453659058, "learning_rate": 2.017986532203182e-05, "loss": 0.0838, "step": 33410 }, { "epoch": 3.0283668660503897, "grad_norm": 0.6709119081497192, "learning_rate": 2.017210508815694e-05, "loss": 0.0763, "step": 33415 }, { "epoch": 3.0288200108754757, "grad_norm": 0.6521695852279663, "learning_rate": 2.0164345337459413e-05, "loss": 0.076, "step": 33420 }, { "epoch": 3.029273155700562, "grad_norm": 0.5329607725143433, "learning_rate": 2.0156586070715838e-05, "loss": 0.0602, "step": 33425 }, { "epoch": 3.029726300525648, "grad_norm": 0.6481011509895325, "learning_rate": 2.0148827288702764e-05, "loss": 0.0797, "step": 33430 }, { "epoch": 3.030179445350734, "grad_norm": 0.6671131253242493, "learning_rate": 2.0141068992196707e-05, "loss": 0.0733, "step": 33435 }, { "epoch": 3.03063259017582, "grad_norm": 0.5300525426864624, "learning_rate": 2.0133311181974094e-05, "loss": 0.0689, "step": 33440 }, { "epoch": 3.0310857350009064, "grad_norm": 0.5536491274833679, "learning_rate": 2.0125553858811354e-05, "loss": 0.0759, "step": 33445 }, { "epoch": 3.0315388798259923, "grad_norm": 0.6011869311332703, "learning_rate": 2.0117797023484826e-05, "loss": 0.0641, "step": 33450 }, { "epoch": 3.0319920246510783, "grad_norm": 0.8603693246841431, "learning_rate": 2.0110040676770816e-05, "loss": 0.0683, "step": 33455 }, { "epoch": 3.0324451694761647, "grad_norm": 0.7929648160934448, "learning_rate": 2.0102284819445604e-05, "loss": 0.0811, "step": 33460 }, { "epoch": 3.0328983143012507, "grad_norm": 0.8058768510818481, "learning_rate": 2.009452945228537e-05, "loss": 0.0865, "step": 33465 }, { "epoch": 3.0333514591263366, "grad_norm": 0.6386051177978516, "learning_rate": 2.008677457606629e-05, "loss": 0.07, "step": 33470 }, { "epoch": 3.033804603951423, "grad_norm": 0.6891403794288635, "learning_rate": 2.0079020191564473e-05, "loss": 0.0706, "step": 33475 }, { "epoch": 3.034257748776509, "grad_norm": 0.6217478513717651, "learning_rate": 2.0071266299555976e-05, "loss": 0.0625, "step": 33480 }, { "epoch": 3.034710893601595, "grad_norm": 0.655491292476654, "learning_rate": 2.0063512900816824e-05, "loss": 0.0701, "step": 33485 }, { "epoch": 3.0351640384266814, "grad_norm": 0.7523077726364136, "learning_rate": 2.0055759996122957e-05, "loss": 0.0885, "step": 33490 }, { "epoch": 3.0356171832517673, "grad_norm": 0.6472657322883606, "learning_rate": 2.00480075862503e-05, "loss": 0.0898, "step": 33495 }, { "epoch": 3.0360703280768533, "grad_norm": 0.7455252408981323, "learning_rate": 2.004025567197473e-05, "loss": 0.0708, "step": 33500 }, { "epoch": 3.0365234729019392, "grad_norm": 0.6479419469833374, "learning_rate": 2.0032504254072038e-05, "loss": 0.0683, "step": 33505 }, { "epoch": 3.0369766177270257, "grad_norm": 0.8287599086761475, "learning_rate": 2.002475333331801e-05, "loss": 0.0918, "step": 33510 }, { "epoch": 3.0374297625521116, "grad_norm": 0.7599159479141235, "learning_rate": 2.0017002910488343e-05, "loss": 0.0907, "step": 33515 }, { "epoch": 3.0378829073771976, "grad_norm": 0.5490638613700867, "learning_rate": 2.0009252986358704e-05, "loss": 0.0807, "step": 33520 }, { "epoch": 3.038336052202284, "grad_norm": 0.699701726436615, "learning_rate": 2.0001503561704725e-05, "loss": 0.0755, "step": 33525 }, { "epoch": 3.03878919702737, "grad_norm": 0.6334744691848755, "learning_rate": 1.9993754637301954e-05, "loss": 0.066, "step": 33530 }, { "epoch": 3.039242341852456, "grad_norm": 0.6441859006881714, "learning_rate": 1.998600621392591e-05, "loss": 0.0743, "step": 33535 }, { "epoch": 3.0396954866775423, "grad_norm": 0.6578091382980347, "learning_rate": 1.997825829235206e-05, "loss": 0.0942, "step": 33540 }, { "epoch": 3.0401486315026283, "grad_norm": 0.6182687878608704, "learning_rate": 1.9970510873355813e-05, "loss": 0.0698, "step": 33545 }, { "epoch": 3.0406017763277142, "grad_norm": 0.6375321745872498, "learning_rate": 1.996276395771255e-05, "loss": 0.0699, "step": 33550 }, { "epoch": 3.0410549211528006, "grad_norm": 0.6586810946464539, "learning_rate": 1.9955017546197562e-05, "loss": 0.0657, "step": 33555 }, { "epoch": 3.0415080659778866, "grad_norm": 0.7409407496452332, "learning_rate": 1.994727163958613e-05, "loss": 0.0671, "step": 33560 }, { "epoch": 3.0419612108029725, "grad_norm": 0.6166648268699646, "learning_rate": 1.9939526238653456e-05, "loss": 0.087, "step": 33565 }, { "epoch": 3.0424143556280585, "grad_norm": 0.6823718547821045, "learning_rate": 1.9931781344174703e-05, "loss": 0.067, "step": 33570 }, { "epoch": 3.042867500453145, "grad_norm": 0.6586255431175232, "learning_rate": 1.9924036956925002e-05, "loss": 0.0659, "step": 33575 }, { "epoch": 3.043320645278231, "grad_norm": 0.6617932319641113, "learning_rate": 1.9916293077679384e-05, "loss": 0.0593, "step": 33580 }, { "epoch": 3.043773790103317, "grad_norm": 0.5839371085166931, "learning_rate": 1.990854970721288e-05, "loss": 0.0631, "step": 33585 }, { "epoch": 3.0442269349284032, "grad_norm": 0.6878220438957214, "learning_rate": 1.9900806846300434e-05, "loss": 0.0948, "step": 33590 }, { "epoch": 3.044680079753489, "grad_norm": 0.7260808944702148, "learning_rate": 1.9893064495716968e-05, "loss": 0.0796, "step": 33595 }, { "epoch": 3.045133224578575, "grad_norm": 0.6568900942802429, "learning_rate": 1.9885322656237344e-05, "loss": 0.0751, "step": 33600 }, { "epoch": 3.0455863694036616, "grad_norm": 0.5570575594902039, "learning_rate": 1.987758132863634e-05, "loss": 0.0843, "step": 33605 }, { "epoch": 3.0460395142287475, "grad_norm": 0.62864089012146, "learning_rate": 1.9869840513688735e-05, "loss": 0.0716, "step": 33610 }, { "epoch": 3.0464926590538335, "grad_norm": 0.7888750433921814, "learning_rate": 1.9862100212169228e-05, "loss": 0.0746, "step": 33615 }, { "epoch": 3.04694580387892, "grad_norm": 0.6252142190933228, "learning_rate": 1.9854360424852463e-05, "loss": 0.0716, "step": 33620 }, { "epoch": 3.047398948704006, "grad_norm": 0.6147468090057373, "learning_rate": 1.9846621152513058e-05, "loss": 0.0687, "step": 33625 }, { "epoch": 3.047852093529092, "grad_norm": 0.718863308429718, "learning_rate": 1.983888239592554e-05, "loss": 0.0813, "step": 33630 }, { "epoch": 3.048305238354178, "grad_norm": 0.6317819356918335, "learning_rate": 1.9831144155864417e-05, "loss": 0.067, "step": 33635 }, { "epoch": 3.048758383179264, "grad_norm": 0.5985910296440125, "learning_rate": 1.9823406433104138e-05, "loss": 0.0667, "step": 33640 }, { "epoch": 3.04921152800435, "grad_norm": 0.8418295979499817, "learning_rate": 1.981566922841909e-05, "loss": 0.081, "step": 33645 }, { "epoch": 3.049664672829436, "grad_norm": 0.5764531493186951, "learning_rate": 1.9807932542583625e-05, "loss": 0.0554, "step": 33650 }, { "epoch": 3.0501178176545225, "grad_norm": 0.7055293321609497, "learning_rate": 1.980019637637202e-05, "loss": 0.0604, "step": 33655 }, { "epoch": 3.0505709624796085, "grad_norm": 0.6522062420845032, "learning_rate": 1.9792460730558515e-05, "loss": 0.0684, "step": 33660 }, { "epoch": 3.0510241073046944, "grad_norm": 0.7025566697120667, "learning_rate": 1.978472560591731e-05, "loss": 0.0693, "step": 33665 }, { "epoch": 3.051477252129781, "grad_norm": 0.5946282148361206, "learning_rate": 1.9776991003222523e-05, "loss": 0.0604, "step": 33670 }, { "epoch": 3.051930396954867, "grad_norm": 0.6394093036651611, "learning_rate": 1.976925692324825e-05, "loss": 0.0596, "step": 33675 }, { "epoch": 3.0523835417799527, "grad_norm": 0.8127219080924988, "learning_rate": 1.9761523366768502e-05, "loss": 0.0756, "step": 33680 }, { "epoch": 3.052836686605039, "grad_norm": 0.6181193590164185, "learning_rate": 1.9753790334557272e-05, "loss": 0.0737, "step": 33685 }, { "epoch": 3.053289831430125, "grad_norm": 0.7377755641937256, "learning_rate": 1.9746057827388484e-05, "loss": 0.0748, "step": 33690 }, { "epoch": 3.053742976255211, "grad_norm": 0.8814583420753479, "learning_rate": 1.9738325846035988e-05, "loss": 0.0756, "step": 33695 }, { "epoch": 3.0541961210802975, "grad_norm": 0.7162852883338928, "learning_rate": 1.9730594391273637e-05, "loss": 0.0679, "step": 33700 }, { "epoch": 3.0546492659053834, "grad_norm": 0.6605504751205444, "learning_rate": 1.9722863463875174e-05, "loss": 0.0732, "step": 33705 }, { "epoch": 3.0551024107304694, "grad_norm": 0.7121884226799011, "learning_rate": 1.9715133064614317e-05, "loss": 0.0662, "step": 33710 }, { "epoch": 3.0555555555555554, "grad_norm": 0.610419750213623, "learning_rate": 1.970740319426474e-05, "loss": 0.0694, "step": 33715 }, { "epoch": 3.0560087003806418, "grad_norm": 0.8097472190856934, "learning_rate": 1.9699673853600028e-05, "loss": 0.0744, "step": 33720 }, { "epoch": 3.0564618452057277, "grad_norm": 0.6453525424003601, "learning_rate": 1.9691945043393755e-05, "loss": 0.0903, "step": 33725 }, { "epoch": 3.0569149900308137, "grad_norm": 0.7427363991737366, "learning_rate": 1.9684216764419412e-05, "loss": 0.0719, "step": 33730 }, { "epoch": 3.0573681348559, "grad_norm": 0.6043506860733032, "learning_rate": 1.967648901745045e-05, "loss": 0.0683, "step": 33735 }, { "epoch": 3.057821279680986, "grad_norm": 0.6643949151039124, "learning_rate": 1.966876180326028e-05, "loss": 0.0712, "step": 33740 }, { "epoch": 3.058274424506072, "grad_norm": 0.6183105707168579, "learning_rate": 1.966103512262222e-05, "loss": 0.0685, "step": 33745 }, { "epoch": 3.0587275693311584, "grad_norm": 0.7406706213951111, "learning_rate": 1.965330897630957e-05, "loss": 0.0658, "step": 33750 }, { "epoch": 3.0591807141562444, "grad_norm": 0.6793999075889587, "learning_rate": 1.9645583365095568e-05, "loss": 0.0779, "step": 33755 }, { "epoch": 3.0596338589813303, "grad_norm": 0.6558445692062378, "learning_rate": 1.9637858289753392e-05, "loss": 0.0792, "step": 33760 }, { "epoch": 3.0600870038064167, "grad_norm": 0.6935184597969055, "learning_rate": 1.9630133751056177e-05, "loss": 0.0627, "step": 33765 }, { "epoch": 3.0605401486315027, "grad_norm": 0.8301883935928345, "learning_rate": 1.962240974977698e-05, "loss": 0.0846, "step": 33770 }, { "epoch": 3.0609932934565887, "grad_norm": 0.8191155195236206, "learning_rate": 1.9614686286688837e-05, "loss": 0.0681, "step": 33775 }, { "epoch": 3.0614464382816746, "grad_norm": 0.5789591073989868, "learning_rate": 1.9606963362564717e-05, "loss": 0.0592, "step": 33780 }, { "epoch": 3.061899583106761, "grad_norm": 0.6933618187904358, "learning_rate": 1.959924097817752e-05, "loss": 0.0724, "step": 33785 }, { "epoch": 3.062352727931847, "grad_norm": 0.5444266200065613, "learning_rate": 1.9591519134300122e-05, "loss": 0.076, "step": 33790 }, { "epoch": 3.062805872756933, "grad_norm": 0.7615405917167664, "learning_rate": 1.9583797831705306e-05, "loss": 0.0744, "step": 33795 }, { "epoch": 3.0632590175820194, "grad_norm": 0.6511474847793579, "learning_rate": 1.9576077071165832e-05, "loss": 0.0735, "step": 33800 }, { "epoch": 3.0637121624071053, "grad_norm": 0.7029293179512024, "learning_rate": 1.956835685345441e-05, "loss": 0.0702, "step": 33805 }, { "epoch": 3.0641653072321913, "grad_norm": 0.6808367371559143, "learning_rate": 1.956063717934366e-05, "loss": 0.0631, "step": 33810 }, { "epoch": 3.0646184520572777, "grad_norm": 0.7186261415481567, "learning_rate": 1.955291804960619e-05, "loss": 0.0766, "step": 33815 }, { "epoch": 3.0650715968823636, "grad_norm": 0.7431857585906982, "learning_rate": 1.9545199465014513e-05, "loss": 0.0702, "step": 33820 }, { "epoch": 3.0655247417074496, "grad_norm": 0.6756089329719543, "learning_rate": 1.953748142634112e-05, "loss": 0.0755, "step": 33825 }, { "epoch": 3.065977886532536, "grad_norm": 0.5625896453857422, "learning_rate": 1.9529763934358435e-05, "loss": 0.0586, "step": 33830 }, { "epoch": 3.066431031357622, "grad_norm": 0.6117385029792786, "learning_rate": 1.9522046989838822e-05, "loss": 0.0624, "step": 33835 }, { "epoch": 3.066884176182708, "grad_norm": 0.6957292556762695, "learning_rate": 1.9514330593554607e-05, "loss": 0.0727, "step": 33840 }, { "epoch": 3.067337321007794, "grad_norm": 0.6740790605545044, "learning_rate": 1.9506614746278028e-05, "loss": 0.0769, "step": 33845 }, { "epoch": 3.0677904658328803, "grad_norm": 0.6464065909385681, "learning_rate": 1.9498899448781306e-05, "loss": 0.077, "step": 33850 }, { "epoch": 3.0682436106579662, "grad_norm": 0.761904239654541, "learning_rate": 1.949118470183659e-05, "loss": 0.0758, "step": 33855 }, { "epoch": 3.068696755483052, "grad_norm": 0.7336789965629578, "learning_rate": 1.9483470506215968e-05, "loss": 0.0627, "step": 33860 }, { "epoch": 3.0691499003081386, "grad_norm": 0.5719134211540222, "learning_rate": 1.947575686269148e-05, "loss": 0.0775, "step": 33865 }, { "epoch": 3.0696030451332246, "grad_norm": 0.6274194121360779, "learning_rate": 1.9468043772035127e-05, "loss": 0.0661, "step": 33870 }, { "epoch": 3.0700561899583105, "grad_norm": 0.6769095063209534, "learning_rate": 1.9460331235018807e-05, "loss": 0.0678, "step": 33875 }, { "epoch": 3.070509334783397, "grad_norm": 0.794829249382019, "learning_rate": 1.9452619252414425e-05, "loss": 0.0647, "step": 33880 }, { "epoch": 3.070962479608483, "grad_norm": 0.535304069519043, "learning_rate": 1.944490782499377e-05, "loss": 0.0549, "step": 33885 }, { "epoch": 3.071415624433569, "grad_norm": 0.5944863557815552, "learning_rate": 1.943719695352862e-05, "loss": 0.0654, "step": 33890 }, { "epoch": 3.0718687692586553, "grad_norm": 0.6673592329025269, "learning_rate": 1.9429486638790686e-05, "loss": 0.0614, "step": 33895 }, { "epoch": 3.0723219140837412, "grad_norm": 0.5146896839141846, "learning_rate": 1.9421776881551605e-05, "loss": 0.0607, "step": 33900 }, { "epoch": 3.072775058908827, "grad_norm": 0.6253342628479004, "learning_rate": 1.941406768258299e-05, "loss": 0.0892, "step": 33905 }, { "epoch": 3.073228203733913, "grad_norm": 0.723111629486084, "learning_rate": 1.9406359042656358e-05, "loss": 0.0685, "step": 33910 }, { "epoch": 3.0736813485589995, "grad_norm": 0.6692327857017517, "learning_rate": 1.9398650962543204e-05, "loss": 0.068, "step": 33915 }, { "epoch": 3.0741344933840855, "grad_norm": 0.6232112646102905, "learning_rate": 1.9390943443014964e-05, "loss": 0.0652, "step": 33920 }, { "epoch": 3.0745876382091715, "grad_norm": 0.7145944833755493, "learning_rate": 1.9383236484842993e-05, "loss": 0.0721, "step": 33925 }, { "epoch": 3.075040783034258, "grad_norm": 0.600167989730835, "learning_rate": 1.937553008879862e-05, "loss": 0.0648, "step": 33930 }, { "epoch": 3.075493927859344, "grad_norm": 0.6199212074279785, "learning_rate": 1.936782425565309e-05, "loss": 0.0749, "step": 33935 }, { "epoch": 3.07594707268443, "grad_norm": 0.612899899482727, "learning_rate": 1.9360118986177615e-05, "loss": 0.0662, "step": 33940 }, { "epoch": 3.076400217509516, "grad_norm": 0.6478356122970581, "learning_rate": 1.935241428114334e-05, "loss": 0.0686, "step": 33945 }, { "epoch": 3.076853362334602, "grad_norm": 0.6258473992347717, "learning_rate": 1.9344710141321347e-05, "loss": 0.0749, "step": 33950 }, { "epoch": 3.077306507159688, "grad_norm": 0.6184313893318176, "learning_rate": 1.933700656748269e-05, "loss": 0.0677, "step": 33955 }, { "epoch": 3.0777596519847745, "grad_norm": 0.6950438022613525, "learning_rate": 1.9329303560398315e-05, "loss": 0.0712, "step": 33960 }, { "epoch": 3.0782127968098605, "grad_norm": 0.6048763990402222, "learning_rate": 1.932160112083916e-05, "loss": 0.0661, "step": 33965 }, { "epoch": 3.0786659416349464, "grad_norm": 0.7186567783355713, "learning_rate": 1.9313899249576088e-05, "loss": 0.0704, "step": 33970 }, { "epoch": 3.0791190864600324, "grad_norm": 0.7453657388687134, "learning_rate": 1.9306197947379894e-05, "loss": 0.073, "step": 33975 }, { "epoch": 3.079572231285119, "grad_norm": 0.6966942548751831, "learning_rate": 1.929849721502135e-05, "loss": 0.0743, "step": 33980 }, { "epoch": 3.0800253761102048, "grad_norm": 0.5485123991966248, "learning_rate": 1.929079705327112e-05, "loss": 0.0618, "step": 33985 }, { "epoch": 3.0804785209352907, "grad_norm": 1.0619083642959595, "learning_rate": 1.9283097462899847e-05, "loss": 0.0898, "step": 33990 }, { "epoch": 3.080931665760377, "grad_norm": 0.6604741215705872, "learning_rate": 1.927539844467812e-05, "loss": 0.0615, "step": 33995 }, { "epoch": 3.081384810585463, "grad_norm": 0.7795284986495972, "learning_rate": 1.9267699999376447e-05, "loss": 0.0697, "step": 34000 }, { "epoch": 3.081837955410549, "grad_norm": 0.7254737615585327, "learning_rate": 1.9260002127765295e-05, "loss": 0.0673, "step": 34005 }, { "epoch": 3.0822911002356355, "grad_norm": 0.6532217860221863, "learning_rate": 1.925230483061508e-05, "loss": 0.0738, "step": 34010 }, { "epoch": 3.0827442450607214, "grad_norm": 0.6269121766090393, "learning_rate": 1.9244608108696134e-05, "loss": 0.062, "step": 34015 }, { "epoch": 3.0831973898858074, "grad_norm": 0.7167661786079407, "learning_rate": 1.9236911962778755e-05, "loss": 0.0797, "step": 34020 }, { "epoch": 3.083650534710894, "grad_norm": 0.6839688420295715, "learning_rate": 1.9229216393633174e-05, "loss": 0.0704, "step": 34025 }, { "epoch": 3.0841036795359797, "grad_norm": 0.6820206046104431, "learning_rate": 1.922152140202956e-05, "loss": 0.0649, "step": 34030 }, { "epoch": 3.0845568243610657, "grad_norm": 0.6500756144523621, "learning_rate": 1.9213826988738056e-05, "loss": 0.0664, "step": 34035 }, { "epoch": 3.085009969186152, "grad_norm": 0.7656744122505188, "learning_rate": 1.920613315452869e-05, "loss": 0.0636, "step": 34040 }, { "epoch": 3.085463114011238, "grad_norm": 0.8104240894317627, "learning_rate": 1.919843990017148e-05, "loss": 0.0703, "step": 34045 }, { "epoch": 3.085916258836324, "grad_norm": 0.7799742817878723, "learning_rate": 1.9190747226436363e-05, "loss": 0.077, "step": 34050 }, { "epoch": 3.08636940366141, "grad_norm": 0.7417973875999451, "learning_rate": 1.9183055134093226e-05, "loss": 0.0765, "step": 34055 }, { "epoch": 3.0868225484864964, "grad_norm": 0.8745042085647583, "learning_rate": 1.9175363623911907e-05, "loss": 0.0826, "step": 34060 }, { "epoch": 3.0872756933115824, "grad_norm": 1.0462794303894043, "learning_rate": 1.916767269666216e-05, "loss": 0.0926, "step": 34065 }, { "epoch": 3.0877288381366683, "grad_norm": 0.8014547824859619, "learning_rate": 1.9159982353113708e-05, "loss": 0.0693, "step": 34070 }, { "epoch": 3.0881819829617547, "grad_norm": 0.6697682738304138, "learning_rate": 1.9152292594036184e-05, "loss": 0.0771, "step": 34075 }, { "epoch": 3.0886351277868407, "grad_norm": 0.851371705532074, "learning_rate": 1.9144603420199196e-05, "loss": 0.0838, "step": 34080 }, { "epoch": 3.0890882726119266, "grad_norm": 0.5721359252929688, "learning_rate": 1.9136914832372282e-05, "loss": 0.0637, "step": 34085 }, { "epoch": 3.089541417437013, "grad_norm": 0.6539310812950134, "learning_rate": 1.912922683132491e-05, "loss": 0.0828, "step": 34090 }, { "epoch": 3.089994562262099, "grad_norm": 0.6874323487281799, "learning_rate": 1.912153941782651e-05, "loss": 0.0818, "step": 34095 }, { "epoch": 3.090447707087185, "grad_norm": 0.5812366008758545, "learning_rate": 1.911385259264642e-05, "loss": 0.0803, "step": 34100 }, { "epoch": 3.0909008519122714, "grad_norm": 0.7254628539085388, "learning_rate": 1.9106166356553956e-05, "loss": 0.0668, "step": 34105 }, { "epoch": 3.0913539967373573, "grad_norm": 0.8351628184318542, "learning_rate": 1.909848071031836e-05, "loss": 0.0838, "step": 34110 }, { "epoch": 3.0918071415624433, "grad_norm": 0.6443451642990112, "learning_rate": 1.90907956547088e-05, "loss": 0.0691, "step": 34115 }, { "epoch": 3.0922602863875293, "grad_norm": 0.5707381367683411, "learning_rate": 1.9083111190494414e-05, "loss": 0.0666, "step": 34120 }, { "epoch": 3.0927134312126157, "grad_norm": 0.6837881803512573, "learning_rate": 1.907542731844427e-05, "loss": 0.0709, "step": 34125 }, { "epoch": 3.0931665760377016, "grad_norm": 0.6799283027648926, "learning_rate": 1.906774403932735e-05, "loss": 0.0721, "step": 34130 }, { "epoch": 3.0936197208627876, "grad_norm": 0.6710236072540283, "learning_rate": 1.9060061353912616e-05, "loss": 0.0592, "step": 34135 }, { "epoch": 3.094072865687874, "grad_norm": 0.7585818767547607, "learning_rate": 1.905237926296895e-05, "loss": 0.0646, "step": 34140 }, { "epoch": 3.09452601051296, "grad_norm": 0.5578125715255737, "learning_rate": 1.9044697767265174e-05, "loss": 0.0678, "step": 34145 }, { "epoch": 3.094979155338046, "grad_norm": 0.6986560821533203, "learning_rate": 1.903701686757007e-05, "loss": 0.0744, "step": 34150 }, { "epoch": 3.0954323001631323, "grad_norm": 0.6214962601661682, "learning_rate": 1.902933656465232e-05, "loss": 0.0735, "step": 34155 }, { "epoch": 3.0958854449882183, "grad_norm": 0.7481732964515686, "learning_rate": 1.9021656859280594e-05, "loss": 0.0666, "step": 34160 }, { "epoch": 3.0963385898133042, "grad_norm": 0.6893871426582336, "learning_rate": 1.9013977752223467e-05, "loss": 0.0664, "step": 34165 }, { "epoch": 3.0967917346383906, "grad_norm": 0.6759927868843079, "learning_rate": 1.9006299244249464e-05, "loss": 0.0693, "step": 34170 }, { "epoch": 3.0972448794634766, "grad_norm": 0.6717797517776489, "learning_rate": 1.8998621336127077e-05, "loss": 0.0747, "step": 34175 }, { "epoch": 3.0976980242885626, "grad_norm": 0.7043672800064087, "learning_rate": 1.899094402862468e-05, "loss": 0.0861, "step": 34180 }, { "epoch": 3.0981511691136485, "grad_norm": 0.5494319200515747, "learning_rate": 1.898326732251064e-05, "loss": 0.0772, "step": 34185 }, { "epoch": 3.098604313938735, "grad_norm": 0.7219845652580261, "learning_rate": 1.8975591218553237e-05, "loss": 0.0654, "step": 34190 }, { "epoch": 3.099057458763821, "grad_norm": 0.6659619212150574, "learning_rate": 1.89679157175207e-05, "loss": 0.0692, "step": 34195 }, { "epoch": 3.099510603588907, "grad_norm": 0.7238364219665527, "learning_rate": 1.8960240820181215e-05, "loss": 0.0628, "step": 34200 }, { "epoch": 3.0999637484139932, "grad_norm": 0.5388793349266052, "learning_rate": 1.895256652730285e-05, "loss": 0.0728, "step": 34205 }, { "epoch": 3.100416893239079, "grad_norm": 0.7398572564125061, "learning_rate": 1.894489283965368e-05, "loss": 0.0732, "step": 34210 }, { "epoch": 3.100870038064165, "grad_norm": 0.515578031539917, "learning_rate": 1.8937219758001678e-05, "loss": 0.0623, "step": 34215 }, { "epoch": 3.1013231828892516, "grad_norm": 0.6614870429039001, "learning_rate": 1.8929547283114773e-05, "loss": 0.0643, "step": 34220 }, { "epoch": 3.1017763277143375, "grad_norm": 0.9883306622505188, "learning_rate": 1.8921875415760838e-05, "loss": 0.0761, "step": 34225 }, { "epoch": 3.1022294725394235, "grad_norm": 0.5786348581314087, "learning_rate": 1.891420415670765e-05, "loss": 0.0646, "step": 34230 }, { "epoch": 3.10268261736451, "grad_norm": 0.67952561378479, "learning_rate": 1.8906533506722983e-05, "loss": 0.0708, "step": 34235 }, { "epoch": 3.103135762189596, "grad_norm": 0.6493013501167297, "learning_rate": 1.8898863466574494e-05, "loss": 0.0662, "step": 34240 }, { "epoch": 3.103588907014682, "grad_norm": 0.6826576590538025, "learning_rate": 1.8891194037029804e-05, "loss": 0.0686, "step": 34245 }, { "epoch": 3.104042051839768, "grad_norm": 0.7158969640731812, "learning_rate": 1.8883525218856493e-05, "loss": 0.0723, "step": 34250 }, { "epoch": 3.104495196664854, "grad_norm": 0.7158511281013489, "learning_rate": 1.8875857012822038e-05, "loss": 0.0798, "step": 34255 }, { "epoch": 3.10494834148994, "grad_norm": 0.601134181022644, "learning_rate": 1.886818941969388e-05, "loss": 0.0722, "step": 34260 }, { "epoch": 3.105401486315026, "grad_norm": 0.7074301838874817, "learning_rate": 1.8860522440239415e-05, "loss": 0.07, "step": 34265 }, { "epoch": 3.1058546311401125, "grad_norm": 0.7083747386932373, "learning_rate": 1.8852856075225924e-05, "loss": 0.074, "step": 34270 }, { "epoch": 3.1063077759651985, "grad_norm": 0.6036167740821838, "learning_rate": 1.884519032542068e-05, "loss": 0.0658, "step": 34275 }, { "epoch": 3.1067609207902844, "grad_norm": 0.704325795173645, "learning_rate": 1.8837525191590866e-05, "loss": 0.0668, "step": 34280 }, { "epoch": 3.107214065615371, "grad_norm": 0.6654403209686279, "learning_rate": 1.8829860674503613e-05, "loss": 0.0737, "step": 34285 }, { "epoch": 3.107667210440457, "grad_norm": 0.8268064260482788, "learning_rate": 1.8822196774926e-05, "loss": 0.075, "step": 34290 }, { "epoch": 3.1081203552655428, "grad_norm": 0.7146289348602295, "learning_rate": 1.8814533493625012e-05, "loss": 0.0674, "step": 34295 }, { "epoch": 3.108573500090629, "grad_norm": 0.7113519310951233, "learning_rate": 1.880687083136761e-05, "loss": 0.0734, "step": 34300 }, { "epoch": 3.109026644915715, "grad_norm": 0.6538947820663452, "learning_rate": 1.879920878892066e-05, "loss": 0.0695, "step": 34305 }, { "epoch": 3.109479789740801, "grad_norm": 0.7769166231155396, "learning_rate": 1.8791547367050994e-05, "loss": 0.0723, "step": 34310 }, { "epoch": 3.1099329345658875, "grad_norm": 0.6140180230140686, "learning_rate": 1.8783886566525376e-05, "loss": 0.0643, "step": 34315 }, { "epoch": 3.1103860793909734, "grad_norm": 0.6537125110626221, "learning_rate": 1.877622638811048e-05, "loss": 0.0652, "step": 34320 }, { "epoch": 3.1108392242160594, "grad_norm": 0.5035091042518616, "learning_rate": 1.8768566832572955e-05, "loss": 0.0679, "step": 34325 }, { "epoch": 3.1112923690411454, "grad_norm": 0.7462336421012878, "learning_rate": 1.8760907900679363e-05, "loss": 0.0887, "step": 34330 }, { "epoch": 3.1117455138662318, "grad_norm": 0.7281152009963989, "learning_rate": 1.875324959319622e-05, "loss": 0.063, "step": 34335 }, { "epoch": 3.1121986586913177, "grad_norm": 0.6510533690452576, "learning_rate": 1.874559191088998e-05, "loss": 0.0684, "step": 34340 }, { "epoch": 3.1126518035164037, "grad_norm": 0.7794012427330017, "learning_rate": 1.8737934854527002e-05, "loss": 0.0665, "step": 34345 }, { "epoch": 3.11310494834149, "grad_norm": 0.7113415002822876, "learning_rate": 1.873027842487363e-05, "loss": 0.0675, "step": 34350 }, { "epoch": 3.113558093166576, "grad_norm": 0.6694140434265137, "learning_rate": 1.8722622622696108e-05, "loss": 0.0704, "step": 34355 }, { "epoch": 3.114011237991662, "grad_norm": 0.6913222670555115, "learning_rate": 1.871496744876063e-05, "loss": 0.0734, "step": 34360 }, { "epoch": 3.1144643828167484, "grad_norm": 0.649608850479126, "learning_rate": 1.870731290383335e-05, "loss": 0.0617, "step": 34365 }, { "epoch": 3.1149175276418344, "grad_norm": 0.7208966612815857, "learning_rate": 1.8699658988680307e-05, "loss": 0.0835, "step": 34370 }, { "epoch": 3.1153706724669203, "grad_norm": 0.6754832863807678, "learning_rate": 1.8692005704067525e-05, "loss": 0.0699, "step": 34375 }, { "epoch": 3.1158238172920063, "grad_norm": 0.6888307332992554, "learning_rate": 1.868435305076095e-05, "loss": 0.0741, "step": 34380 }, { "epoch": 3.1162769621170927, "grad_norm": 0.7215002775192261, "learning_rate": 1.867670102952645e-05, "loss": 0.0819, "step": 34385 }, { "epoch": 3.1167301069421787, "grad_norm": 0.7807165384292603, "learning_rate": 1.866904964112986e-05, "loss": 0.0695, "step": 34390 }, { "epoch": 3.1171832517672646, "grad_norm": 0.7543066740036011, "learning_rate": 1.866139888633691e-05, "loss": 0.0649, "step": 34395 }, { "epoch": 3.117636396592351, "grad_norm": 0.8500527739524841, "learning_rate": 1.865374876591331e-05, "loss": 0.0744, "step": 34400 }, { "epoch": 3.118089541417437, "grad_norm": 0.6467263102531433, "learning_rate": 1.8646099280624673e-05, "loss": 0.062, "step": 34405 }, { "epoch": 3.118542686242523, "grad_norm": 0.6604231595993042, "learning_rate": 1.863845043123657e-05, "loss": 0.0656, "step": 34410 }, { "epoch": 3.1189958310676094, "grad_norm": 0.6155848503112793, "learning_rate": 1.8630802218514507e-05, "loss": 0.0702, "step": 34415 }, { "epoch": 3.1194489758926953, "grad_norm": 0.9178539514541626, "learning_rate": 1.86231546432239e-05, "loss": 0.0817, "step": 34420 }, { "epoch": 3.1199021207177813, "grad_norm": 0.6802240610122681, "learning_rate": 1.8615507706130132e-05, "loss": 0.0812, "step": 34425 }, { "epoch": 3.1203552655428677, "grad_norm": 0.7683461308479309, "learning_rate": 1.8607861407998524e-05, "loss": 0.0859, "step": 34430 }, { "epoch": 3.1208084103679536, "grad_norm": 0.587897002696991, "learning_rate": 1.860021574959429e-05, "loss": 0.0674, "step": 34435 }, { "epoch": 3.1212615551930396, "grad_norm": 0.7289374470710754, "learning_rate": 1.8592570731682642e-05, "loss": 0.0705, "step": 34440 }, { "epoch": 3.121714700018126, "grad_norm": 0.8053105473518372, "learning_rate": 1.858492635502867e-05, "loss": 0.0923, "step": 34445 }, { "epoch": 3.122167844843212, "grad_norm": 0.8548277020454407, "learning_rate": 1.8577282620397434e-05, "loss": 0.0735, "step": 34450 }, { "epoch": 3.122620989668298, "grad_norm": 0.65860515832901, "learning_rate": 1.8569639528553942e-05, "loss": 0.0703, "step": 34455 }, { "epoch": 3.123074134493384, "grad_norm": 0.5583929419517517, "learning_rate": 1.8561997080263085e-05, "loss": 0.0547, "step": 34460 }, { "epoch": 3.1235272793184703, "grad_norm": 0.6539294719696045, "learning_rate": 1.8554355276289743e-05, "loss": 0.0686, "step": 34465 }, { "epoch": 3.1239804241435563, "grad_norm": 0.600587010383606, "learning_rate": 1.85467141173987e-05, "loss": 0.0668, "step": 34470 }, { "epoch": 3.124433568968642, "grad_norm": 0.6199257969856262, "learning_rate": 1.8539073604354687e-05, "loss": 0.0682, "step": 34475 }, { "epoch": 3.1248867137937286, "grad_norm": 0.7172333002090454, "learning_rate": 1.853143373792238e-05, "loss": 0.0674, "step": 34480 }, { "epoch": 3.1253398586188146, "grad_norm": 0.677783191204071, "learning_rate": 1.852379451886637e-05, "loss": 0.0689, "step": 34485 }, { "epoch": 3.1257930034439005, "grad_norm": 0.6743616461753845, "learning_rate": 1.851615594795119e-05, "loss": 0.0629, "step": 34490 }, { "epoch": 3.126246148268987, "grad_norm": 0.7585973143577576, "learning_rate": 1.8508518025941316e-05, "loss": 0.0726, "step": 34495 }, { "epoch": 3.126699293094073, "grad_norm": 0.6645802855491638, "learning_rate": 1.8500880753601147e-05, "loss": 0.0685, "step": 34500 }, { "epoch": 3.127152437919159, "grad_norm": 0.8386120796203613, "learning_rate": 1.849324413169504e-05, "loss": 0.0794, "step": 34505 }, { "epoch": 3.127605582744245, "grad_norm": 0.7708282470703125, "learning_rate": 1.8485608160987255e-05, "loss": 0.0794, "step": 34510 }, { "epoch": 3.1280587275693312, "grad_norm": 0.6644384860992432, "learning_rate": 1.8477972842242002e-05, "loss": 0.0707, "step": 34515 }, { "epoch": 3.128511872394417, "grad_norm": 0.6535254120826721, "learning_rate": 1.847033817622344e-05, "loss": 0.0723, "step": 34520 }, { "epoch": 3.128965017219503, "grad_norm": 0.5952282547950745, "learning_rate": 1.8462704163695632e-05, "loss": 0.0667, "step": 34525 }, { "epoch": 3.1294181620445896, "grad_norm": 0.9853865504264832, "learning_rate": 1.8455070805422613e-05, "loss": 0.0746, "step": 34530 }, { "epoch": 3.1298713068696755, "grad_norm": 0.5669326782226562, "learning_rate": 1.8447438102168314e-05, "loss": 0.0672, "step": 34535 }, { "epoch": 3.1303244516947615, "grad_norm": 0.589081883430481, "learning_rate": 1.843980605469662e-05, "loss": 0.0747, "step": 34540 }, { "epoch": 3.130777596519848, "grad_norm": 0.6656039357185364, "learning_rate": 1.8432174663771364e-05, "loss": 0.0675, "step": 34545 }, { "epoch": 3.131230741344934, "grad_norm": 0.6222648620605469, "learning_rate": 1.8424543930156275e-05, "loss": 0.067, "step": 34550 }, { "epoch": 3.13168388617002, "grad_norm": 1.0086309909820557, "learning_rate": 1.841691385461507e-05, "loss": 0.0865, "step": 34555 }, { "epoch": 3.132137030995106, "grad_norm": 0.5368759036064148, "learning_rate": 1.8409284437911337e-05, "loss": 0.0625, "step": 34560 }, { "epoch": 3.132590175820192, "grad_norm": 0.7227411270141602, "learning_rate": 1.8401655680808648e-05, "loss": 0.0763, "step": 34565 }, { "epoch": 3.133043320645278, "grad_norm": 0.6776700615882874, "learning_rate": 1.8394027584070492e-05, "loss": 0.0643, "step": 34570 }, { "epoch": 3.1334964654703645, "grad_norm": 0.6229242086410522, "learning_rate": 1.8386400148460285e-05, "loss": 0.0694, "step": 34575 }, { "epoch": 3.1339496102954505, "grad_norm": 0.5277907252311707, "learning_rate": 1.83787733747414e-05, "loss": 0.0682, "step": 34580 }, { "epoch": 3.1344027551205365, "grad_norm": 0.6525728106498718, "learning_rate": 1.8371147263677097e-05, "loss": 0.0674, "step": 34585 }, { "epoch": 3.1348558999456224, "grad_norm": 0.8355732560157776, "learning_rate": 1.8363521816030622e-05, "loss": 0.069, "step": 34590 }, { "epoch": 3.135309044770709, "grad_norm": 0.7927785515785217, "learning_rate": 1.8355897032565127e-05, "loss": 0.0788, "step": 34595 }, { "epoch": 3.135762189595795, "grad_norm": 0.8652230501174927, "learning_rate": 1.83482729140437e-05, "loss": 0.0718, "step": 34600 }, { "epoch": 3.1362153344208807, "grad_norm": 0.6878419518470764, "learning_rate": 1.834064946122938e-05, "loss": 0.0638, "step": 34605 }, { "epoch": 3.136668479245967, "grad_norm": 0.6836963891983032, "learning_rate": 1.8333026674885105e-05, "loss": 0.0686, "step": 34610 }, { "epoch": 3.137121624071053, "grad_norm": 0.9029942154884338, "learning_rate": 1.8325404555773774e-05, "loss": 0.0715, "step": 34615 }, { "epoch": 3.137574768896139, "grad_norm": 0.7333600521087646, "learning_rate": 1.8317783104658215e-05, "loss": 0.0672, "step": 34620 }, { "epoch": 3.1380279137212255, "grad_norm": 0.598110020160675, "learning_rate": 1.831016232230118e-05, "loss": 0.0684, "step": 34625 }, { "epoch": 3.1384810585463114, "grad_norm": 0.7266579866409302, "learning_rate": 1.830254220946536e-05, "loss": 0.0819, "step": 34630 }, { "epoch": 3.1389342033713974, "grad_norm": 0.7647938132286072, "learning_rate": 1.8294922766913378e-05, "loss": 0.0691, "step": 34635 }, { "epoch": 3.139387348196484, "grad_norm": 0.6849826574325562, "learning_rate": 1.828730399540779e-05, "loss": 0.0687, "step": 34640 }, { "epoch": 3.1398404930215698, "grad_norm": 0.6031427979469299, "learning_rate": 1.82796858957111e-05, "loss": 0.0919, "step": 34645 }, { "epoch": 3.1402936378466557, "grad_norm": 0.6908556818962097, "learning_rate": 1.8272068468585708e-05, "loss": 0.0786, "step": 34650 }, { "epoch": 3.1407467826717417, "grad_norm": 0.6123191118240356, "learning_rate": 1.8264451714793974e-05, "loss": 0.0645, "step": 34655 }, { "epoch": 3.141199927496828, "grad_norm": 0.7386499047279358, "learning_rate": 1.8256835635098196e-05, "loss": 0.0775, "step": 34660 }, { "epoch": 3.141653072321914, "grad_norm": 0.6615965962409973, "learning_rate": 1.8249220230260583e-05, "loss": 0.063, "step": 34665 }, { "epoch": 3.142106217147, "grad_norm": 0.6445053815841675, "learning_rate": 1.82416055010433e-05, "loss": 0.0722, "step": 34670 }, { "epoch": 3.1425593619720864, "grad_norm": 0.6402685046195984, "learning_rate": 1.8233991448208416e-05, "loss": 0.0647, "step": 34675 }, { "epoch": 3.1430125067971724, "grad_norm": 0.7532083988189697, "learning_rate": 1.8226378072517952e-05, "loss": 0.0738, "step": 34680 }, { "epoch": 3.1434656516222583, "grad_norm": 0.6508232951164246, "learning_rate": 1.8218765374733864e-05, "loss": 0.065, "step": 34685 }, { "epoch": 3.1439187964473447, "grad_norm": 0.5884077548980713, "learning_rate": 1.821115335561803e-05, "loss": 0.0593, "step": 34690 }, { "epoch": 3.1443719412724307, "grad_norm": 0.6352061629295349, "learning_rate": 1.820354201593227e-05, "loss": 0.0604, "step": 34695 }, { "epoch": 3.1448250860975167, "grad_norm": 0.6650886535644531, "learning_rate": 1.8195931356438313e-05, "loss": 0.0601, "step": 34700 }, { "epoch": 3.145278230922603, "grad_norm": 0.6834538578987122, "learning_rate": 1.8188321377897845e-05, "loss": 0.0684, "step": 34705 }, { "epoch": 3.145731375747689, "grad_norm": 0.5916967988014221, "learning_rate": 1.818071208107248e-05, "loss": 0.0664, "step": 34710 }, { "epoch": 3.146184520572775, "grad_norm": 0.6028671860694885, "learning_rate": 1.8173103466723755e-05, "loss": 0.0662, "step": 34715 }, { "epoch": 3.1466376653978614, "grad_norm": 0.6669163107872009, "learning_rate": 1.8165495535613153e-05, "loss": 0.0639, "step": 34720 }, { "epoch": 3.1470908102229473, "grad_norm": 0.703160285949707, "learning_rate": 1.8157888288502053e-05, "loss": 0.0758, "step": 34725 }, { "epoch": 3.1475439550480333, "grad_norm": 0.5908927917480469, "learning_rate": 1.8150281726151807e-05, "loss": 0.0649, "step": 34730 }, { "epoch": 3.1479970998731193, "grad_norm": 0.6236259937286377, "learning_rate": 1.814267584932369e-05, "loss": 0.0705, "step": 34735 }, { "epoch": 3.1484502446982057, "grad_norm": 0.6101236343383789, "learning_rate": 1.8135070658778885e-05, "loss": 0.0695, "step": 34740 }, { "epoch": 3.1489033895232916, "grad_norm": 0.5350420475006104, "learning_rate": 1.812746615527854e-05, "loss": 0.0605, "step": 34745 }, { "epoch": 3.1493565343483776, "grad_norm": 0.6744124293327332, "learning_rate": 1.8119862339583693e-05, "loss": 0.07, "step": 34750 }, { "epoch": 3.149809679173464, "grad_norm": 0.6389852166175842, "learning_rate": 1.8112259212455352e-05, "loss": 0.0725, "step": 34755 }, { "epoch": 3.15026282399855, "grad_norm": 0.6243335008621216, "learning_rate": 1.8104656774654442e-05, "loss": 0.0686, "step": 34760 }, { "epoch": 3.150715968823636, "grad_norm": 0.696351170539856, "learning_rate": 1.8097055026941808e-05, "loss": 0.0773, "step": 34765 }, { "epoch": 3.1511691136487223, "grad_norm": 0.7787836194038391, "learning_rate": 1.8089453970078234e-05, "loss": 0.076, "step": 34770 }, { "epoch": 3.1516222584738083, "grad_norm": 0.5957227945327759, "learning_rate": 1.808185360482446e-05, "loss": 0.088, "step": 34775 }, { "epoch": 3.1520754032988942, "grad_norm": 0.6554669737815857, "learning_rate": 1.8074253931941104e-05, "loss": 0.0564, "step": 34780 }, { "epoch": 3.15252854812398, "grad_norm": 0.6587013602256775, "learning_rate": 1.8066654952188766e-05, "loss": 0.0618, "step": 34785 }, { "epoch": 3.1529816929490666, "grad_norm": 0.5769917964935303, "learning_rate": 1.8059056666327935e-05, "loss": 0.0584, "step": 34790 }, { "epoch": 3.1534348377741526, "grad_norm": 0.7443243265151978, "learning_rate": 1.8051459075119055e-05, "loss": 0.0671, "step": 34795 }, { "epoch": 3.1538879825992385, "grad_norm": 0.5619853734970093, "learning_rate": 1.8043862179322514e-05, "loss": 0.09, "step": 34800 }, { "epoch": 3.154341127424325, "grad_norm": 0.6954877972602844, "learning_rate": 1.8036265979698584e-05, "loss": 0.0682, "step": 34805 }, { "epoch": 3.154794272249411, "grad_norm": 0.7027772068977356, "learning_rate": 1.802867047700752e-05, "loss": 0.0791, "step": 34810 }, { "epoch": 3.155247417074497, "grad_norm": 0.7361786365509033, "learning_rate": 1.8021075672009463e-05, "loss": 0.0675, "step": 34815 }, { "epoch": 3.1557005618995833, "grad_norm": 0.6967583298683167, "learning_rate": 1.8013481565464512e-05, "loss": 0.0659, "step": 34820 }, { "epoch": 3.156153706724669, "grad_norm": 0.6579480767250061, "learning_rate": 1.8005888158132693e-05, "loss": 0.0764, "step": 34825 }, { "epoch": 3.156606851549755, "grad_norm": 0.699273943901062, "learning_rate": 1.7998295450773943e-05, "loss": 0.0829, "step": 34830 }, { "epoch": 3.1570599963748416, "grad_norm": 0.601546585559845, "learning_rate": 1.7990703444148165e-05, "loss": 0.0623, "step": 34835 }, { "epoch": 3.1575131411999275, "grad_norm": 0.7602547407150269, "learning_rate": 1.7983112139015146e-05, "loss": 0.0826, "step": 34840 }, { "epoch": 3.1579662860250135, "grad_norm": 0.5333455801010132, "learning_rate": 1.797552153613464e-05, "loss": 0.0598, "step": 34845 }, { "epoch": 3.1584194308501, "grad_norm": 0.9151288866996765, "learning_rate": 1.796793163626631e-05, "loss": 0.064, "step": 34850 }, { "epoch": 3.158872575675186, "grad_norm": 0.5925536155700684, "learning_rate": 1.7960342440169758e-05, "loss": 0.0769, "step": 34855 }, { "epoch": 3.159325720500272, "grad_norm": 0.721372663974762, "learning_rate": 1.7952753948604524e-05, "loss": 0.0746, "step": 34860 }, { "epoch": 3.159778865325358, "grad_norm": 0.6136746406555176, "learning_rate": 1.7945166162330045e-05, "loss": 0.0678, "step": 34865 }, { "epoch": 3.160232010150444, "grad_norm": 0.6847996711730957, "learning_rate": 1.7937579082105728e-05, "loss": 0.0762, "step": 34870 }, { "epoch": 3.16068515497553, "grad_norm": 0.7689264416694641, "learning_rate": 1.792999270869088e-05, "loss": 0.0771, "step": 34875 }, { "epoch": 3.161138299800616, "grad_norm": 0.48845621943473816, "learning_rate": 1.792240704284475e-05, "loss": 0.0627, "step": 34880 }, { "epoch": 3.1615914446257025, "grad_norm": 0.5897139310836792, "learning_rate": 1.7914822085326528e-05, "loss": 0.0633, "step": 34885 }, { "epoch": 3.1620445894507885, "grad_norm": 0.6768973469734192, "learning_rate": 1.7907237836895296e-05, "loss": 0.0717, "step": 34890 }, { "epoch": 3.1624977342758744, "grad_norm": 0.6094030737876892, "learning_rate": 1.7899654298310098e-05, "loss": 0.0665, "step": 34895 }, { "epoch": 3.162950879100961, "grad_norm": 0.5898670554161072, "learning_rate": 1.7892071470329906e-05, "loss": 0.0736, "step": 34900 }, { "epoch": 3.163404023926047, "grad_norm": 0.6395431756973267, "learning_rate": 1.7884489353713598e-05, "loss": 0.0768, "step": 34905 }, { "epoch": 3.1638571687511328, "grad_norm": 0.6437960863113403, "learning_rate": 1.7876907949219995e-05, "loss": 0.0677, "step": 34910 }, { "epoch": 3.1643103135762187, "grad_norm": 0.6018849015235901, "learning_rate": 1.7869327257607872e-05, "loss": 0.0711, "step": 34915 }, { "epoch": 3.164763458401305, "grad_norm": 0.6625878810882568, "learning_rate": 1.786174727963587e-05, "loss": 0.0605, "step": 34920 }, { "epoch": 3.165216603226391, "grad_norm": 0.5306797623634338, "learning_rate": 1.7854168016062623e-05, "loss": 0.0655, "step": 34925 }, { "epoch": 3.165669748051477, "grad_norm": 0.6059548854827881, "learning_rate": 1.7846589467646646e-05, "loss": 0.0598, "step": 34930 }, { "epoch": 3.1661228928765635, "grad_norm": 0.8689901232719421, "learning_rate": 1.783901163514642e-05, "loss": 0.0805, "step": 34935 }, { "epoch": 3.1665760377016494, "grad_norm": 0.7437876462936401, "learning_rate": 1.7831434519320335e-05, "loss": 0.0702, "step": 34940 }, { "epoch": 3.1670291825267354, "grad_norm": 0.517210841178894, "learning_rate": 1.7823858120926702e-05, "loss": 0.0672, "step": 34945 }, { "epoch": 3.167482327351822, "grad_norm": 0.7254430055618286, "learning_rate": 1.7816282440723773e-05, "loss": 0.0757, "step": 34950 }, { "epoch": 3.1679354721769077, "grad_norm": 0.6124657392501831, "learning_rate": 1.7808707479469725e-05, "loss": 0.0698, "step": 34955 }, { "epoch": 3.1683886170019937, "grad_norm": 0.715866208076477, "learning_rate": 1.7801133237922663e-05, "loss": 0.0729, "step": 34960 }, { "epoch": 3.16884176182708, "grad_norm": 0.6810238361358643, "learning_rate": 1.779355971684063e-05, "loss": 0.0732, "step": 34965 }, { "epoch": 3.169294906652166, "grad_norm": 0.8650215268135071, "learning_rate": 1.7785986916981566e-05, "loss": 0.0911, "step": 34970 }, { "epoch": 3.169748051477252, "grad_norm": 0.6687921285629272, "learning_rate": 1.777841483910338e-05, "loss": 0.0747, "step": 34975 }, { "epoch": 3.1702011963023384, "grad_norm": 0.6577603816986084, "learning_rate": 1.7770843483963866e-05, "loss": 0.0662, "step": 34980 }, { "epoch": 3.1706543411274244, "grad_norm": 0.7463670372962952, "learning_rate": 1.7763272852320784e-05, "loss": 0.07, "step": 34985 }, { "epoch": 3.1711074859525104, "grad_norm": 0.6959970593452454, "learning_rate": 1.7755702944931805e-05, "loss": 0.0747, "step": 34990 }, { "epoch": 3.1715606307775963, "grad_norm": 0.6021974086761475, "learning_rate": 1.774813376255452e-05, "loss": 0.0704, "step": 34995 }, { "epoch": 3.1720137756026827, "grad_norm": 0.5986400842666626, "learning_rate": 1.7740565305946466e-05, "loss": 0.0655, "step": 35000 }, { "epoch": 3.1724669204277687, "grad_norm": 0.5927206873893738, "learning_rate": 1.7732997575865086e-05, "loss": 0.0699, "step": 35005 }, { "epoch": 3.1729200652528546, "grad_norm": 0.7117237448692322, "learning_rate": 1.7725430573067757e-05, "loss": 0.0719, "step": 35010 }, { "epoch": 3.173373210077941, "grad_norm": 0.618911623954773, "learning_rate": 1.7717864298311807e-05, "loss": 0.0775, "step": 35015 }, { "epoch": 3.173826354903027, "grad_norm": 0.6978043913841248, "learning_rate": 1.7710298752354454e-05, "loss": 0.0705, "step": 35020 }, { "epoch": 3.174279499728113, "grad_norm": 0.6621131300926208, "learning_rate": 1.7702733935952863e-05, "loss": 0.076, "step": 35025 }, { "epoch": 3.1747326445531994, "grad_norm": 0.5834252834320068, "learning_rate": 1.769516984986414e-05, "loss": 0.0645, "step": 35030 }, { "epoch": 3.1751857893782853, "grad_norm": 0.6876317858695984, "learning_rate": 1.7687606494845277e-05, "loss": 0.0584, "step": 35035 }, { "epoch": 3.1756389342033713, "grad_norm": 0.6268448233604431, "learning_rate": 1.768004387165323e-05, "loss": 0.0695, "step": 35040 }, { "epoch": 3.1760920790284577, "grad_norm": 0.6962889432907104, "learning_rate": 1.7672481981044868e-05, "loss": 0.0626, "step": 35045 }, { "epoch": 3.1765452238535437, "grad_norm": 0.777230978012085, "learning_rate": 1.7664920823776982e-05, "loss": 0.0838, "step": 35050 }, { "epoch": 3.1769983686786296, "grad_norm": 0.798832356929779, "learning_rate": 1.765736040060631e-05, "loss": 0.0629, "step": 35055 }, { "epoch": 3.1774515135037156, "grad_norm": 0.6071575880050659, "learning_rate": 1.7649800712289482e-05, "loss": 0.0759, "step": 35060 }, { "epoch": 3.177904658328802, "grad_norm": 0.5658495426177979, "learning_rate": 1.7642241759583088e-05, "loss": 0.0666, "step": 35065 }, { "epoch": 3.178357803153888, "grad_norm": 0.9014638066291809, "learning_rate": 1.7634683543243618e-05, "loss": 0.0705, "step": 35070 }, { "epoch": 3.178810947978974, "grad_norm": 0.6889020800590515, "learning_rate": 1.7627126064027512e-05, "loss": 0.0645, "step": 35075 }, { "epoch": 3.1792640928040603, "grad_norm": 0.6410954594612122, "learning_rate": 1.761956932269113e-05, "loss": 0.0664, "step": 35080 }, { "epoch": 3.1797172376291463, "grad_norm": 0.5765653252601624, "learning_rate": 1.761201331999074e-05, "loss": 0.0797, "step": 35085 }, { "epoch": 3.1801703824542322, "grad_norm": 0.5270400643348694, "learning_rate": 1.7604458056682556e-05, "loss": 0.0734, "step": 35090 }, { "epoch": 3.1806235272793186, "grad_norm": 0.6704181432723999, "learning_rate": 1.7596903533522708e-05, "loss": 0.0697, "step": 35095 }, { "epoch": 3.1810766721044046, "grad_norm": 0.586052656173706, "learning_rate": 1.7589349751267255e-05, "loss": 0.0624, "step": 35100 }, { "epoch": 3.1815298169294906, "grad_norm": 0.6328674554824829, "learning_rate": 1.7581796710672187e-05, "loss": 0.0576, "step": 35105 }, { "epoch": 3.181982961754577, "grad_norm": 0.7719002366065979, "learning_rate": 1.7574244412493413e-05, "loss": 0.0742, "step": 35110 }, { "epoch": 3.182436106579663, "grad_norm": 0.6408720016479492, "learning_rate": 1.7566692857486768e-05, "loss": 0.0581, "step": 35115 }, { "epoch": 3.182889251404749, "grad_norm": 0.6713401675224304, "learning_rate": 1.7559142046408008e-05, "loss": 0.0614, "step": 35120 }, { "epoch": 3.1833423962298353, "grad_norm": 0.6738040447235107, "learning_rate": 1.755159198001283e-05, "loss": 0.0652, "step": 35125 }, { "epoch": 3.1837955410549212, "grad_norm": 0.7041658163070679, "learning_rate": 1.7544042659056853e-05, "loss": 0.0682, "step": 35130 }, { "epoch": 3.184248685880007, "grad_norm": 0.8402538895606995, "learning_rate": 1.75364940842956e-05, "loss": 0.0769, "step": 35135 }, { "epoch": 3.184701830705093, "grad_norm": 0.74085932970047, "learning_rate": 1.752894625648455e-05, "loss": 0.0714, "step": 35140 }, { "epoch": 3.1851549755301796, "grad_norm": 0.7245231866836548, "learning_rate": 1.7521399176379072e-05, "loss": 0.0676, "step": 35145 }, { "epoch": 3.1856081203552655, "grad_norm": 0.6419621706008911, "learning_rate": 1.7513852844734493e-05, "loss": 0.0757, "step": 35150 }, { "epoch": 3.1860612651803515, "grad_norm": 0.5402097105979919, "learning_rate": 1.7506307262306064e-05, "loss": 0.0573, "step": 35155 }, { "epoch": 3.186514410005438, "grad_norm": 0.6103927493095398, "learning_rate": 1.749876242984893e-05, "loss": 0.0652, "step": 35160 }, { "epoch": 3.186967554830524, "grad_norm": 0.5625858902931213, "learning_rate": 1.7491218348118184e-05, "loss": 0.0711, "step": 35165 }, { "epoch": 3.18742069965561, "grad_norm": 0.8329655528068542, "learning_rate": 1.7483675017868856e-05, "loss": 0.0741, "step": 35170 }, { "epoch": 3.187873844480696, "grad_norm": 0.5395764112472534, "learning_rate": 1.7476132439855862e-05, "loss": 0.0806, "step": 35175 }, { "epoch": 3.188326989305782, "grad_norm": 0.7604511976242065, "learning_rate": 1.7468590614834084e-05, "loss": 0.0716, "step": 35180 }, { "epoch": 3.188780134130868, "grad_norm": 0.6773141026496887, "learning_rate": 1.7461049543558293e-05, "loss": 0.0624, "step": 35185 }, { "epoch": 3.189233278955954, "grad_norm": 0.7050101161003113, "learning_rate": 1.745350922678322e-05, "loss": 0.0638, "step": 35190 }, { "epoch": 3.1896864237810405, "grad_norm": 0.7129282355308533, "learning_rate": 1.74459696652635e-05, "loss": 0.0695, "step": 35195 }, { "epoch": 3.1901395686061265, "grad_norm": 0.5599685907363892, "learning_rate": 1.743843085975368e-05, "loss": 0.0635, "step": 35200 }, { "epoch": 3.1905927134312124, "grad_norm": 0.7078012824058533, "learning_rate": 1.743089281100826e-05, "loss": 0.0855, "step": 35205 }, { "epoch": 3.191045858256299, "grad_norm": 0.6195815205574036, "learning_rate": 1.7423355519781643e-05, "loss": 0.0594, "step": 35210 }, { "epoch": 3.191499003081385, "grad_norm": 0.512128472328186, "learning_rate": 1.7415818986828166e-05, "loss": 0.0791, "step": 35215 }, { "epoch": 3.1919521479064707, "grad_norm": 0.6935889720916748, "learning_rate": 1.7408283212902098e-05, "loss": 0.0639, "step": 35220 }, { "epoch": 3.192405292731557, "grad_norm": 0.6140889525413513, "learning_rate": 1.7400748198757605e-05, "loss": 0.055, "step": 35225 }, { "epoch": 3.192858437556643, "grad_norm": 0.6187936067581177, "learning_rate": 1.7393213945148802e-05, "loss": 0.0803, "step": 35230 }, { "epoch": 3.193311582381729, "grad_norm": 0.6096532940864563, "learning_rate": 1.7385680452829716e-05, "loss": 0.0682, "step": 35235 }, { "epoch": 3.1937647272068155, "grad_norm": 0.7899211049079895, "learning_rate": 1.7378147722554304e-05, "loss": 0.0725, "step": 35240 }, { "epoch": 3.1942178720319014, "grad_norm": 0.7027391791343689, "learning_rate": 1.737061575507646e-05, "loss": 0.0703, "step": 35245 }, { "epoch": 3.1946710168569874, "grad_norm": 0.5515874624252319, "learning_rate": 1.7363084551149952e-05, "loss": 0.069, "step": 35250 }, { "epoch": 3.195124161682074, "grad_norm": 0.7323881387710571, "learning_rate": 1.735555411152853e-05, "loss": 0.0716, "step": 35255 }, { "epoch": 3.1955773065071598, "grad_norm": 0.746456503868103, "learning_rate": 1.734802443696584e-05, "loss": 0.0616, "step": 35260 }, { "epoch": 3.1960304513322457, "grad_norm": 0.6070939898490906, "learning_rate": 1.7340495528215442e-05, "loss": 0.0587, "step": 35265 }, { "epoch": 3.1964835961573317, "grad_norm": 0.6589295268058777, "learning_rate": 1.7332967386030856e-05, "loss": 0.0759, "step": 35270 }, { "epoch": 3.196936740982418, "grad_norm": 0.7323816418647766, "learning_rate": 1.732544001116547e-05, "loss": 0.0604, "step": 35275 }, { "epoch": 3.197389885807504, "grad_norm": 0.5573540329933167, "learning_rate": 1.7317913404372653e-05, "loss": 0.0756, "step": 35280 }, { "epoch": 3.19784303063259, "grad_norm": 0.715714156627655, "learning_rate": 1.731038756640565e-05, "loss": 0.0754, "step": 35285 }, { "epoch": 3.1982961754576764, "grad_norm": 0.6510732769966125, "learning_rate": 1.7302862498017662e-05, "loss": 0.07, "step": 35290 }, { "epoch": 3.1987493202827624, "grad_norm": 0.739929735660553, "learning_rate": 1.729533819996181e-05, "loss": 0.0694, "step": 35295 }, { "epoch": 3.1992024651078483, "grad_norm": 0.5640380382537842, "learning_rate": 1.7287814672991098e-05, "loss": 0.0601, "step": 35300 }, { "epoch": 3.1996556099329347, "grad_norm": 0.6706939935684204, "learning_rate": 1.7280291917858505e-05, "loss": 0.0701, "step": 35305 }, { "epoch": 3.2001087547580207, "grad_norm": 0.658062219619751, "learning_rate": 1.727276993531691e-05, "loss": 0.0724, "step": 35310 }, { "epoch": 3.2005618995831067, "grad_norm": 0.5560057759284973, "learning_rate": 1.726524872611911e-05, "loss": 0.0612, "step": 35315 }, { "epoch": 3.201015044408193, "grad_norm": 0.6243430376052856, "learning_rate": 1.725772829101784e-05, "loss": 0.0579, "step": 35320 }, { "epoch": 3.201468189233279, "grad_norm": 0.6587127447128296, "learning_rate": 1.725020863076573e-05, "loss": 0.0809, "step": 35325 }, { "epoch": 3.201921334058365, "grad_norm": 0.6921695470809937, "learning_rate": 1.7242689746115358e-05, "loss": 0.0727, "step": 35330 }, { "epoch": 3.202374478883451, "grad_norm": 0.6053243279457092, "learning_rate": 1.723517163781923e-05, "loss": 0.0699, "step": 35335 }, { "epoch": 3.2028276237085374, "grad_norm": 0.7169052958488464, "learning_rate": 1.7227654306629744e-05, "loss": 0.0627, "step": 35340 }, { "epoch": 3.2032807685336233, "grad_norm": 0.6836256384849548, "learning_rate": 1.7220137753299244e-05, "loss": 0.072, "step": 35345 }, { "epoch": 3.2037339133587093, "grad_norm": 0.5810679793357849, "learning_rate": 1.721262197857998e-05, "loss": 0.0685, "step": 35350 }, { "epoch": 3.2041870581837957, "grad_norm": 0.5842704176902771, "learning_rate": 1.720510698322414e-05, "loss": 0.066, "step": 35355 }, { "epoch": 3.2046402030088816, "grad_norm": 0.8547168374061584, "learning_rate": 1.7197592767983846e-05, "loss": 0.0798, "step": 35360 }, { "epoch": 3.2050933478339676, "grad_norm": 0.6107892394065857, "learning_rate": 1.7190079333611088e-05, "loss": 0.0634, "step": 35365 }, { "epoch": 3.205546492659054, "grad_norm": 0.7448932528495789, "learning_rate": 1.718256668085784e-05, "loss": 0.0692, "step": 35370 }, { "epoch": 3.20599963748414, "grad_norm": 0.6056388020515442, "learning_rate": 1.7175054810475956e-05, "loss": 0.0784, "step": 35375 }, { "epoch": 3.206452782309226, "grad_norm": 0.6975818276405334, "learning_rate": 1.7167543723217227e-05, "loss": 0.0613, "step": 35380 }, { "epoch": 3.2069059271343123, "grad_norm": 0.5869576930999756, "learning_rate": 1.7160033419833383e-05, "loss": 0.0633, "step": 35385 }, { "epoch": 3.2073590719593983, "grad_norm": 0.678925096988678, "learning_rate": 1.7152523901076033e-05, "loss": 0.075, "step": 35390 }, { "epoch": 3.2078122167844842, "grad_norm": 0.6102326512336731, "learning_rate": 1.7145015167696756e-05, "loss": 0.0647, "step": 35395 }, { "epoch": 3.2082653616095707, "grad_norm": 0.647577702999115, "learning_rate": 1.7137507220447e-05, "loss": 0.0578, "step": 35400 }, { "epoch": 3.2087185064346566, "grad_norm": 0.5632933378219604, "learning_rate": 1.7130000060078187e-05, "loss": 0.0584, "step": 35405 }, { "epoch": 3.2091716512597426, "grad_norm": 0.6714352369308472, "learning_rate": 1.712249368734164e-05, "loss": 0.0615, "step": 35410 }, { "epoch": 3.2096247960848285, "grad_norm": 0.675361692905426, "learning_rate": 1.7114988102988573e-05, "loss": 0.0611, "step": 35415 }, { "epoch": 3.210077940909915, "grad_norm": 0.781956672668457, "learning_rate": 1.7107483307770162e-05, "loss": 0.0722, "step": 35420 }, { "epoch": 3.210531085735001, "grad_norm": 0.5899052619934082, "learning_rate": 1.7099979302437496e-05, "loss": 0.074, "step": 35425 }, { "epoch": 3.210984230560087, "grad_norm": 0.6556848287582397, "learning_rate": 1.709247608774157e-05, "loss": 0.0701, "step": 35430 }, { "epoch": 3.2114373753851733, "grad_norm": 0.7437913417816162, "learning_rate": 1.7084973664433318e-05, "loss": 0.0594, "step": 35435 }, { "epoch": 3.2118905202102592, "grad_norm": 0.7092465758323669, "learning_rate": 1.7077472033263564e-05, "loss": 0.0705, "step": 35440 }, { "epoch": 3.212343665035345, "grad_norm": 0.6858547925949097, "learning_rate": 1.7069971194983093e-05, "loss": 0.0769, "step": 35445 }, { "epoch": 3.2127968098604316, "grad_norm": 0.6854438781738281, "learning_rate": 1.7062471150342584e-05, "loss": 0.0738, "step": 35450 }, { "epoch": 3.2132499546855176, "grad_norm": 0.6991135478019714, "learning_rate": 1.705497190009265e-05, "loss": 0.0858, "step": 35455 }, { "epoch": 3.2137030995106035, "grad_norm": 0.5769578814506531, "learning_rate": 1.7047473444983814e-05, "loss": 0.0619, "step": 35460 }, { "epoch": 3.2141562443356895, "grad_norm": 0.7837098240852356, "learning_rate": 1.703997578576652e-05, "loss": 0.0864, "step": 35465 }, { "epoch": 3.214609389160776, "grad_norm": 0.6094517707824707, "learning_rate": 1.7032478923191136e-05, "loss": 0.0574, "step": 35470 }, { "epoch": 3.215062533985862, "grad_norm": 0.6698917150497437, "learning_rate": 1.7024982858007965e-05, "loss": 0.0735, "step": 35475 }, { "epoch": 3.215515678810948, "grad_norm": 0.7092030048370361, "learning_rate": 1.7017487590967195e-05, "loss": 0.0708, "step": 35480 }, { "epoch": 3.215968823636034, "grad_norm": 0.6496942639350891, "learning_rate": 1.700999312281898e-05, "loss": 0.0706, "step": 35485 }, { "epoch": 3.21642196846112, "grad_norm": 0.5748459100723267, "learning_rate": 1.700249945431334e-05, "loss": 0.0618, "step": 35490 }, { "epoch": 3.216875113286206, "grad_norm": 0.7765089273452759, "learning_rate": 1.6995006586200263e-05, "loss": 0.0708, "step": 35495 }, { "epoch": 3.2173282581112925, "grad_norm": 0.5496929883956909, "learning_rate": 1.6987514519229636e-05, "loss": 0.0672, "step": 35500 }, { "epoch": 3.2177814029363785, "grad_norm": 0.7184414863586426, "learning_rate": 1.698002325415126e-05, "loss": 0.0808, "step": 35505 }, { "epoch": 3.2182345477614644, "grad_norm": 0.6366729736328125, "learning_rate": 1.6972532791714877e-05, "loss": 0.0791, "step": 35510 }, { "epoch": 3.218687692586551, "grad_norm": 0.8386797308921814, "learning_rate": 1.6965043132670115e-05, "loss": 0.0815, "step": 35515 }, { "epoch": 3.219140837411637, "grad_norm": 0.6551039218902588, "learning_rate": 1.695755427776655e-05, "loss": 0.0709, "step": 35520 }, { "epoch": 3.2195939822367228, "grad_norm": 0.7623088359832764, "learning_rate": 1.6950066227753688e-05, "loss": 0.0777, "step": 35525 }, { "epoch": 3.220047127061809, "grad_norm": 0.5497446656227112, "learning_rate": 1.6942578983380907e-05, "loss": 0.0618, "step": 35530 }, { "epoch": 3.220500271886895, "grad_norm": 0.6257222294807434, "learning_rate": 1.693509254539755e-05, "loss": 0.0599, "step": 35535 }, { "epoch": 3.220953416711981, "grad_norm": 0.7067297697067261, "learning_rate": 1.692760691455285e-05, "loss": 0.0635, "step": 35540 }, { "epoch": 3.221406561537067, "grad_norm": 0.6689762473106384, "learning_rate": 1.692012209159598e-05, "loss": 0.0584, "step": 35545 }, { "epoch": 3.2218597063621535, "grad_norm": 0.7231155037879944, "learning_rate": 1.6912638077276034e-05, "loss": 0.0726, "step": 35550 }, { "epoch": 3.2223128511872394, "grad_norm": 0.768118679523468, "learning_rate": 1.6905154872341993e-05, "loss": 0.0754, "step": 35555 }, { "epoch": 3.2227659960123254, "grad_norm": 0.7064412236213684, "learning_rate": 1.6897672477542786e-05, "loss": 0.0737, "step": 35560 }, { "epoch": 3.223219140837412, "grad_norm": 0.5875034928321838, "learning_rate": 1.6890190893627266e-05, "loss": 0.0665, "step": 35565 }, { "epoch": 3.2236722856624977, "grad_norm": 0.5552347898483276, "learning_rate": 1.6882710121344176e-05, "loss": 0.0691, "step": 35570 }, { "epoch": 3.2241254304875837, "grad_norm": 0.7131790518760681, "learning_rate": 1.687523016144221e-05, "loss": 0.0861, "step": 35575 }, { "epoch": 3.22457857531267, "grad_norm": 0.7172418236732483, "learning_rate": 1.6867751014669947e-05, "loss": 0.0839, "step": 35580 }, { "epoch": 3.225031720137756, "grad_norm": 0.6313625574111938, "learning_rate": 1.6860272681775913e-05, "loss": 0.0748, "step": 35585 }, { "epoch": 3.225484864962842, "grad_norm": 0.842316746711731, "learning_rate": 1.6852795163508542e-05, "loss": 0.0757, "step": 35590 }, { "epoch": 3.225938009787928, "grad_norm": 0.6108740568161011, "learning_rate": 1.6845318460616183e-05, "loss": 0.0557, "step": 35595 }, { "epoch": 3.2263911546130144, "grad_norm": 0.9307047128677368, "learning_rate": 1.6837842573847117e-05, "loss": 0.078, "step": 35600 }, { "epoch": 3.2268442994381004, "grad_norm": 0.6534098982810974, "learning_rate": 1.6830367503949517e-05, "loss": 0.0611, "step": 35605 }, { "epoch": 3.2272974442631863, "grad_norm": 0.5742986798286438, "learning_rate": 1.68228932516715e-05, "loss": 0.0637, "step": 35610 }, { "epoch": 3.2277505890882727, "grad_norm": 0.6243442296981812, "learning_rate": 1.681541981776109e-05, "loss": 0.0658, "step": 35615 }, { "epoch": 3.2282037339133587, "grad_norm": 0.6773375272750854, "learning_rate": 1.6807947202966235e-05, "loss": 0.0717, "step": 35620 }, { "epoch": 3.2286568787384446, "grad_norm": 0.7645496129989624, "learning_rate": 1.68004754080348e-05, "loss": 0.0648, "step": 35625 }, { "epoch": 3.229110023563531, "grad_norm": 0.6497361063957214, "learning_rate": 1.6793004433714545e-05, "loss": 0.0657, "step": 35630 }, { "epoch": 3.229563168388617, "grad_norm": 0.6716238856315613, "learning_rate": 1.678553428075318e-05, "loss": 0.0647, "step": 35635 }, { "epoch": 3.230016313213703, "grad_norm": 0.6206133365631104, "learning_rate": 1.6778064949898332e-05, "loss": 0.0823, "step": 35640 }, { "epoch": 3.2304694580387894, "grad_norm": 0.6825173497200012, "learning_rate": 1.6770596441897513e-05, "loss": 0.0691, "step": 35645 }, { "epoch": 3.2309226028638753, "grad_norm": 0.6568259000778198, "learning_rate": 1.67631287574982e-05, "loss": 0.0634, "step": 35650 }, { "epoch": 3.2313757476889613, "grad_norm": 0.7405086755752563, "learning_rate": 1.6755661897447728e-05, "loss": 0.0715, "step": 35655 }, { "epoch": 3.2318288925140477, "grad_norm": 0.6895834803581238, "learning_rate": 1.6748195862493406e-05, "loss": 0.0744, "step": 35660 }, { "epoch": 3.2322820373391337, "grad_norm": 0.5997737050056458, "learning_rate": 1.6740730653382434e-05, "loss": 0.0687, "step": 35665 }, { "epoch": 3.2327351821642196, "grad_norm": 0.696906328201294, "learning_rate": 1.6733266270861924e-05, "loss": 0.0806, "step": 35670 }, { "epoch": 3.2331883269893056, "grad_norm": 0.5767266750335693, "learning_rate": 1.6725802715678924e-05, "loss": 0.0652, "step": 35675 }, { "epoch": 3.233641471814392, "grad_norm": 0.7622692584991455, "learning_rate": 1.6718339988580388e-05, "loss": 0.0741, "step": 35680 }, { "epoch": 3.234094616639478, "grad_norm": 0.6506110429763794, "learning_rate": 1.6710878090313183e-05, "loss": 0.0655, "step": 35685 }, { "epoch": 3.234547761464564, "grad_norm": 0.7103327512741089, "learning_rate": 1.670341702162411e-05, "loss": 0.0726, "step": 35690 }, { "epoch": 3.2350009062896503, "grad_norm": 0.6814565658569336, "learning_rate": 1.6695956783259847e-05, "loss": 0.0903, "step": 35695 }, { "epoch": 3.2354540511147363, "grad_norm": 0.627438485622406, "learning_rate": 1.6688497375967042e-05, "loss": 0.0618, "step": 35700 }, { "epoch": 3.2359071959398222, "grad_norm": 0.7063400745391846, "learning_rate": 1.6681038800492243e-05, "loss": 0.0617, "step": 35705 }, { "epoch": 3.2363603407649086, "grad_norm": 0.659323513507843, "learning_rate": 1.6673581057581883e-05, "loss": 0.0645, "step": 35710 }, { "epoch": 3.2368134855899946, "grad_norm": 0.623687207698822, "learning_rate": 1.6666124147982353e-05, "loss": 0.0578, "step": 35715 }, { "epoch": 3.2372666304150806, "grad_norm": 0.5708637237548828, "learning_rate": 1.6658668072439927e-05, "loss": 0.0637, "step": 35720 }, { "epoch": 3.237719775240167, "grad_norm": 0.6540096402168274, "learning_rate": 1.6651212831700818e-05, "loss": 0.0707, "step": 35725 }, { "epoch": 3.238172920065253, "grad_norm": 0.6967449188232422, "learning_rate": 1.6643758426511162e-05, "loss": 0.0789, "step": 35730 }, { "epoch": 3.238626064890339, "grad_norm": 0.5060498714447021, "learning_rate": 1.6636304857616976e-05, "loss": 0.0652, "step": 35735 }, { "epoch": 3.239079209715425, "grad_norm": 0.5930262207984924, "learning_rate": 1.6628852125764238e-05, "loss": 0.0657, "step": 35740 }, { "epoch": 3.2395323545405112, "grad_norm": 0.6325960159301758, "learning_rate": 1.66214002316988e-05, "loss": 0.0618, "step": 35745 }, { "epoch": 3.239985499365597, "grad_norm": 0.5511959791183472, "learning_rate": 1.661394917616646e-05, "loss": 0.0708, "step": 35750 }, { "epoch": 3.240438644190683, "grad_norm": 0.6696194410324097, "learning_rate": 1.6606498959912927e-05, "loss": 0.0732, "step": 35755 }, { "epoch": 3.2408917890157696, "grad_norm": 0.757249116897583, "learning_rate": 1.6599049583683813e-05, "loss": 0.0689, "step": 35760 }, { "epoch": 3.2413449338408555, "grad_norm": 0.6541077494621277, "learning_rate": 1.659160104822466e-05, "loss": 0.0671, "step": 35765 }, { "epoch": 3.2417980786659415, "grad_norm": 0.8436203002929688, "learning_rate": 1.6584153354280913e-05, "loss": 0.0852, "step": 35770 }, { "epoch": 3.242251223491028, "grad_norm": 0.7242597341537476, "learning_rate": 1.657670650259794e-05, "loss": 0.0644, "step": 35775 }, { "epoch": 3.242704368316114, "grad_norm": 0.59670090675354, "learning_rate": 1.656926049392103e-05, "loss": 0.0571, "step": 35780 }, { "epoch": 3.2431575131412, "grad_norm": 0.6290921568870544, "learning_rate": 1.6561815328995378e-05, "loss": 0.0626, "step": 35785 }, { "epoch": 3.2436106579662862, "grad_norm": 0.6439557671546936, "learning_rate": 1.655437100856611e-05, "loss": 0.0756, "step": 35790 }, { "epoch": 3.244063802791372, "grad_norm": 0.7202614545822144, "learning_rate": 1.6546927533378236e-05, "loss": 0.0771, "step": 35795 }, { "epoch": 3.244516947616458, "grad_norm": 0.6597886085510254, "learning_rate": 1.6539484904176713e-05, "loss": 0.0709, "step": 35800 }, { "epoch": 3.2449700924415446, "grad_norm": 0.6473577618598938, "learning_rate": 1.6532043121706405e-05, "loss": 0.0593, "step": 35805 }, { "epoch": 3.2454232372666305, "grad_norm": 0.6231406927108765, "learning_rate": 1.652460218671208e-05, "loss": 0.0742, "step": 35810 }, { "epoch": 3.2458763820917165, "grad_norm": 0.6006757020950317, "learning_rate": 1.6517162099938434e-05, "loss": 0.0598, "step": 35815 }, { "epoch": 3.2463295269168024, "grad_norm": 0.7123556733131409, "learning_rate": 1.650972286213008e-05, "loss": 0.0677, "step": 35820 }, { "epoch": 3.246782671741889, "grad_norm": 0.6878281831741333, "learning_rate": 1.6502284474031522e-05, "loss": 0.075, "step": 35825 }, { "epoch": 3.247235816566975, "grad_norm": 0.8606639504432678, "learning_rate": 1.6494846936387214e-05, "loss": 0.0791, "step": 35830 }, { "epoch": 3.2476889613920608, "grad_norm": 0.5991421937942505, "learning_rate": 1.6487410249941494e-05, "loss": 0.0586, "step": 35835 }, { "epoch": 3.248142106217147, "grad_norm": 0.7720733880996704, "learning_rate": 1.6479974415438632e-05, "loss": 0.068, "step": 35840 }, { "epoch": 3.248595251042233, "grad_norm": 0.5943734049797058, "learning_rate": 1.6472539433622825e-05, "loss": 0.0839, "step": 35845 }, { "epoch": 3.249048395867319, "grad_norm": 0.6785855889320374, "learning_rate": 1.6465105305238138e-05, "loss": 0.057, "step": 35850 }, { "epoch": 3.2495015406924055, "grad_norm": 0.6660404205322266, "learning_rate": 1.645767203102861e-05, "loss": 0.0717, "step": 35855 }, { "epoch": 3.2499546855174914, "grad_norm": 0.6835613250732422, "learning_rate": 1.6450239611738142e-05, "loss": 0.0708, "step": 35860 }, { "epoch": 3.2504078303425774, "grad_norm": 0.5680165886878967, "learning_rate": 1.6442808048110583e-05, "loss": 0.0679, "step": 35865 }, { "epoch": 3.2508609751676634, "grad_norm": 0.6566238403320312, "learning_rate": 1.6435377340889703e-05, "loss": 0.058, "step": 35870 }, { "epoch": 3.2513141199927498, "grad_norm": 0.6633539795875549, "learning_rate": 1.642794749081914e-05, "loss": 0.0744, "step": 35875 }, { "epoch": 3.2517672648178357, "grad_norm": 0.6540647149085999, "learning_rate": 1.64205184986425e-05, "loss": 0.0657, "step": 35880 }, { "epoch": 3.2522204096429217, "grad_norm": 0.6206323504447937, "learning_rate": 1.641309036510326e-05, "loss": 0.0704, "step": 35885 }, { "epoch": 3.252673554468008, "grad_norm": 0.7090588808059692, "learning_rate": 1.640566309094484e-05, "loss": 0.0762, "step": 35890 }, { "epoch": 3.253126699293094, "grad_norm": 0.6100761890411377, "learning_rate": 1.6398236676910568e-05, "loss": 0.0656, "step": 35895 }, { "epoch": 3.25357984411818, "grad_norm": 0.667644202709198, "learning_rate": 1.639081112374367e-05, "loss": 0.0766, "step": 35900 }, { "epoch": 3.2540329889432664, "grad_norm": 0.6695157289505005, "learning_rate": 1.6383386432187314e-05, "loss": 0.0652, "step": 35905 }, { "epoch": 3.2544861337683524, "grad_norm": 0.7890726327896118, "learning_rate": 1.6375962602984547e-05, "loss": 0.076, "step": 35910 }, { "epoch": 3.2549392785934383, "grad_norm": 0.6845704913139343, "learning_rate": 1.636853963687836e-05, "loss": 0.0705, "step": 35915 }, { "epoch": 3.2553924234185247, "grad_norm": 0.8233734965324402, "learning_rate": 1.636111753461165e-05, "loss": 0.0626, "step": 35920 }, { "epoch": 3.2558455682436107, "grad_norm": 0.7775692343711853, "learning_rate": 1.635369629692721e-05, "loss": 0.0828, "step": 35925 }, { "epoch": 3.2562987130686967, "grad_norm": 0.6869652271270752, "learning_rate": 1.6346275924567773e-05, "loss": 0.0697, "step": 35930 }, { "epoch": 3.256751857893783, "grad_norm": 0.6912325024604797, "learning_rate": 1.6338856418275972e-05, "loss": 0.0627, "step": 35935 }, { "epoch": 3.257205002718869, "grad_norm": 0.5975180864334106, "learning_rate": 1.633143777879434e-05, "loss": 0.0659, "step": 35940 }, { "epoch": 3.257658147543955, "grad_norm": 0.624165952205658, "learning_rate": 1.6324020006865347e-05, "loss": 0.0626, "step": 35945 }, { "epoch": 3.258111292369041, "grad_norm": 0.7651771306991577, "learning_rate": 1.6316603103231365e-05, "loss": 0.0954, "step": 35950 }, { "epoch": 3.2585644371941274, "grad_norm": 0.6946186423301697, "learning_rate": 1.6309187068634677e-05, "loss": 0.0588, "step": 35955 }, { "epoch": 3.2590175820192133, "grad_norm": 0.6406940221786499, "learning_rate": 1.6301771903817495e-05, "loss": 0.0666, "step": 35960 }, { "epoch": 3.2594707268442993, "grad_norm": 0.6180949211120605, "learning_rate": 1.629435760952191e-05, "loss": 0.0658, "step": 35965 }, { "epoch": 3.2599238716693857, "grad_norm": 0.7141253352165222, "learning_rate": 1.6286944186489965e-05, "loss": 0.0684, "step": 35970 }, { "epoch": 3.2603770164944716, "grad_norm": 0.6412773728370667, "learning_rate": 1.627953163546359e-05, "loss": 0.0697, "step": 35975 }, { "epoch": 3.2608301613195576, "grad_norm": 0.5423885583877563, "learning_rate": 1.627211995718463e-05, "loss": 0.0644, "step": 35980 }, { "epoch": 3.261283306144644, "grad_norm": 0.4292938709259033, "learning_rate": 1.626470915239487e-05, "loss": 0.0654, "step": 35985 }, { "epoch": 3.26173645096973, "grad_norm": 0.7251296043395996, "learning_rate": 1.625729922183596e-05, "loss": 0.0741, "step": 35990 }, { "epoch": 3.262189595794816, "grad_norm": 0.7299497723579407, "learning_rate": 1.6249890166249506e-05, "loss": 0.0641, "step": 35995 }, { "epoch": 3.262642740619902, "grad_norm": 1.0476326942443848, "learning_rate": 1.6242481986376993e-05, "loss": 0.0772, "step": 36000 }, { "epoch": 3.2630958854449883, "grad_norm": 0.5202943682670593, "learning_rate": 1.6235074682959845e-05, "loss": 0.0601, "step": 36005 }, { "epoch": 3.2635490302700743, "grad_norm": 0.6659681797027588, "learning_rate": 1.6227668256739396e-05, "loss": 0.0627, "step": 36010 }, { "epoch": 3.26400217509516, "grad_norm": 0.5901613831520081, "learning_rate": 1.6220262708456864e-05, "loss": 0.0594, "step": 36015 }, { "epoch": 3.2644553199202466, "grad_norm": 0.5934803485870361, "learning_rate": 1.6212858038853412e-05, "loss": 0.0907, "step": 36020 }, { "epoch": 3.2649084647453326, "grad_norm": 0.6609403491020203, "learning_rate": 1.6205454248670093e-05, "loss": 0.0655, "step": 36025 }, { "epoch": 3.2653616095704185, "grad_norm": 0.5663449764251709, "learning_rate": 1.6198051338647885e-05, "loss": 0.0688, "step": 36030 }, { "epoch": 3.265814754395505, "grad_norm": 0.6694226264953613, "learning_rate": 1.6190649309527687e-05, "loss": 0.09, "step": 36035 }, { "epoch": 3.266267899220591, "grad_norm": 0.5795015692710876, "learning_rate": 1.6183248162050273e-05, "loss": 0.0709, "step": 36040 }, { "epoch": 3.266721044045677, "grad_norm": 0.6437596082687378, "learning_rate": 1.6175847896956368e-05, "loss": 0.0619, "step": 36045 }, { "epoch": 3.2671741888707633, "grad_norm": 0.7039116621017456, "learning_rate": 1.6168448514986583e-05, "loss": 0.0643, "step": 36050 }, { "epoch": 3.2676273336958492, "grad_norm": 0.6306853890419006, "learning_rate": 1.6161050016881458e-05, "loss": 0.0702, "step": 36055 }, { "epoch": 3.268080478520935, "grad_norm": 0.6348460912704468, "learning_rate": 1.6153652403381443e-05, "loss": 0.072, "step": 36060 }, { "epoch": 3.2685336233460216, "grad_norm": 0.6765544414520264, "learning_rate": 1.6146255675226878e-05, "loss": 0.0626, "step": 36065 }, { "epoch": 3.2689867681711076, "grad_norm": 0.716407299041748, "learning_rate": 1.6138859833158036e-05, "loss": 0.0647, "step": 36070 }, { "epoch": 3.2694399129961935, "grad_norm": 0.7455058693885803, "learning_rate": 1.613146487791511e-05, "loss": 0.0695, "step": 36075 }, { "epoch": 3.26989305782128, "grad_norm": 0.5747495293617249, "learning_rate": 1.6124070810238166e-05, "loss": 0.0646, "step": 36080 }, { "epoch": 3.270346202646366, "grad_norm": 0.8783767223358154, "learning_rate": 1.6116677630867223e-05, "loss": 0.0721, "step": 36085 }, { "epoch": 3.270799347471452, "grad_norm": 0.6790851950645447, "learning_rate": 1.6109285340542174e-05, "loss": 0.0619, "step": 36090 }, { "epoch": 3.271252492296538, "grad_norm": 0.6463431119918823, "learning_rate": 1.610189394000286e-05, "loss": 0.0607, "step": 36095 }, { "epoch": 3.271705637121624, "grad_norm": 0.7512148022651672, "learning_rate": 1.6094503429989016e-05, "loss": 0.064, "step": 36100 }, { "epoch": 3.27215878194671, "grad_norm": 0.6200889348983765, "learning_rate": 1.608711381124027e-05, "loss": 0.0758, "step": 36105 }, { "epoch": 3.272611926771796, "grad_norm": 0.5682360529899597, "learning_rate": 1.6079725084496188e-05, "loss": 0.0667, "step": 36110 }, { "epoch": 3.2730650715968825, "grad_norm": 0.7251569032669067, "learning_rate": 1.6072337250496235e-05, "loss": 0.0625, "step": 36115 }, { "epoch": 3.2735182164219685, "grad_norm": 0.6677758693695068, "learning_rate": 1.6064950309979787e-05, "loss": 0.0667, "step": 36120 }, { "epoch": 3.2739713612470545, "grad_norm": 0.6885267496109009, "learning_rate": 1.6057564263686142e-05, "loss": 0.077, "step": 36125 }, { "epoch": 3.2744245060721404, "grad_norm": 0.606876015663147, "learning_rate": 1.605017911235448e-05, "loss": 0.0633, "step": 36130 }, { "epoch": 3.274877650897227, "grad_norm": 0.6265695691108704, "learning_rate": 1.604279485672392e-05, "loss": 0.061, "step": 36135 }, { "epoch": 3.275330795722313, "grad_norm": 0.687890350818634, "learning_rate": 1.6035411497533474e-05, "loss": 0.0672, "step": 36140 }, { "epoch": 3.2757839405473987, "grad_norm": 0.6861122846603394, "learning_rate": 1.602802903552208e-05, "loss": 0.0921, "step": 36145 }, { "epoch": 3.276237085372485, "grad_norm": 0.48678481578826904, "learning_rate": 1.602064747142858e-05, "loss": 0.0649, "step": 36150 }, { "epoch": 3.276690230197571, "grad_norm": 0.6250580549240112, "learning_rate": 1.6013266805991706e-05, "loss": 0.0673, "step": 36155 }, { "epoch": 3.277143375022657, "grad_norm": 0.573884129524231, "learning_rate": 1.6005887039950133e-05, "loss": 0.0655, "step": 36160 }, { "epoch": 3.2775965198477435, "grad_norm": 0.6110538840293884, "learning_rate": 1.599850817404242e-05, "loss": 0.0627, "step": 36165 }, { "epoch": 3.2780496646728294, "grad_norm": 0.8998131155967712, "learning_rate": 1.5991130209007056e-05, "loss": 0.0839, "step": 36170 }, { "epoch": 3.2785028094979154, "grad_norm": 0.669678807258606, "learning_rate": 1.598375314558243e-05, "loss": 0.072, "step": 36175 }, { "epoch": 3.278955954323002, "grad_norm": 0.6236297488212585, "learning_rate": 1.597637698450683e-05, "loss": 0.0711, "step": 36180 }, { "epoch": 3.2794090991480878, "grad_norm": 0.5174222588539124, "learning_rate": 1.596900172651848e-05, "loss": 0.0595, "step": 36185 }, { "epoch": 3.2798622439731737, "grad_norm": 0.7004865407943726, "learning_rate": 1.596162737235548e-05, "loss": 0.0725, "step": 36190 }, { "epoch": 3.28031538879826, "grad_norm": 0.6218287348747253, "learning_rate": 1.595425392275587e-05, "loss": 0.0639, "step": 36195 }, { "epoch": 3.280768533623346, "grad_norm": 0.5825852751731873, "learning_rate": 1.5946881378457597e-05, "loss": 0.0684, "step": 36200 }, { "epoch": 3.281221678448432, "grad_norm": 0.5667723417282104, "learning_rate": 1.5939509740198484e-05, "loss": 0.0591, "step": 36205 }, { "epoch": 3.2816748232735184, "grad_norm": 0.7781783938407898, "learning_rate": 1.5932139008716297e-05, "loss": 0.0721, "step": 36210 }, { "epoch": 3.2821279680986044, "grad_norm": 0.7852537035942078, "learning_rate": 1.592476918474871e-05, "loss": 0.0811, "step": 36215 }, { "epoch": 3.2825811129236904, "grad_norm": 0.6415577530860901, "learning_rate": 1.5917400269033288e-05, "loss": 0.0787, "step": 36220 }, { "epoch": 3.2830342577487763, "grad_norm": 0.7676610350608826, "learning_rate": 1.591003226230753e-05, "loss": 0.0634, "step": 36225 }, { "epoch": 3.2834874025738627, "grad_norm": 0.6503139734268188, "learning_rate": 1.59026651653088e-05, "loss": 0.0607, "step": 36230 }, { "epoch": 3.2839405473989487, "grad_norm": 0.7699753046035767, "learning_rate": 1.5895298978774416e-05, "loss": 0.0622, "step": 36235 }, { "epoch": 3.2843936922240347, "grad_norm": 0.7089414596557617, "learning_rate": 1.5887933703441602e-05, "loss": 0.0654, "step": 36240 }, { "epoch": 3.284846837049121, "grad_norm": 0.5063508749008179, "learning_rate": 1.5880569340047445e-05, "loss": 0.058, "step": 36245 }, { "epoch": 3.285299981874207, "grad_norm": 0.7273854613304138, "learning_rate": 1.587320588932901e-05, "loss": 0.0832, "step": 36250 }, { "epoch": 3.285753126699293, "grad_norm": 0.6601486802101135, "learning_rate": 1.58658433520232e-05, "loss": 0.0697, "step": 36255 }, { "epoch": 3.2862062715243794, "grad_norm": 0.588923454284668, "learning_rate": 1.585848172886688e-05, "loss": 0.0758, "step": 36260 }, { "epoch": 3.2866594163494653, "grad_norm": 0.5556511282920837, "learning_rate": 1.5851121020596804e-05, "loss": 0.0708, "step": 36265 }, { "epoch": 3.2871125611745513, "grad_norm": 0.5720561146736145, "learning_rate": 1.5843761227949623e-05, "loss": 0.0729, "step": 36270 }, { "epoch": 3.2875657059996373, "grad_norm": 0.5676770210266113, "learning_rate": 1.5836402351661915e-05, "loss": 0.0647, "step": 36275 }, { "epoch": 3.2880188508247237, "grad_norm": 0.6925122141838074, "learning_rate": 1.582904439247016e-05, "loss": 0.0707, "step": 36280 }, { "epoch": 3.2884719956498096, "grad_norm": 0.6836761236190796, "learning_rate": 1.5821687351110734e-05, "loss": 0.0665, "step": 36285 }, { "epoch": 3.2889251404748956, "grad_norm": 0.6436324119567871, "learning_rate": 1.5814331228319957e-05, "loss": 0.0728, "step": 36290 }, { "epoch": 3.289378285299982, "grad_norm": 0.6480724215507507, "learning_rate": 1.5806976024834e-05, "loss": 0.0687, "step": 36295 }, { "epoch": 3.289831430125068, "grad_norm": 0.6599742770195007, "learning_rate": 1.5799621741388997e-05, "loss": 0.0558, "step": 36300 }, { "epoch": 3.290284574950154, "grad_norm": 0.7442611455917358, "learning_rate": 1.5792268378720957e-05, "loss": 0.0642, "step": 36305 }, { "epoch": 3.2907377197752403, "grad_norm": 0.6382429599761963, "learning_rate": 1.5784915937565807e-05, "loss": 0.085, "step": 36310 }, { "epoch": 3.2911908646003263, "grad_norm": 0.6641179323196411, "learning_rate": 1.5777564418659398e-05, "loss": 0.0665, "step": 36315 }, { "epoch": 3.2916440094254122, "grad_norm": 0.8340433835983276, "learning_rate": 1.577021382273745e-05, "loss": 0.0669, "step": 36320 }, { "epoch": 3.2920971542504986, "grad_norm": 0.646177351474762, "learning_rate": 1.5762864150535623e-05, "loss": 0.0653, "step": 36325 }, { "epoch": 3.2925502990755846, "grad_norm": 0.6601238250732422, "learning_rate": 1.575551540278948e-05, "loss": 0.0665, "step": 36330 }, { "epoch": 3.2930034439006706, "grad_norm": 0.6109870672225952, "learning_rate": 1.574816758023447e-05, "loss": 0.0653, "step": 36335 }, { "epoch": 3.293456588725757, "grad_norm": 0.5916908383369446, "learning_rate": 1.5740820683605992e-05, "loss": 0.0659, "step": 36340 }, { "epoch": 3.293909733550843, "grad_norm": 0.5971527099609375, "learning_rate": 1.57334747136393e-05, "loss": 0.0618, "step": 36345 }, { "epoch": 3.294362878375929, "grad_norm": 0.5051060914993286, "learning_rate": 1.572612967106959e-05, "loss": 0.0674, "step": 36350 }, { "epoch": 3.294816023201015, "grad_norm": 0.68807053565979, "learning_rate": 1.571878555663196e-05, "loss": 0.0671, "step": 36355 }, { "epoch": 3.2952691680261013, "grad_norm": 0.7096226215362549, "learning_rate": 1.5711442371061405e-05, "loss": 0.072, "step": 36360 }, { "epoch": 3.295722312851187, "grad_norm": 0.7415252327919006, "learning_rate": 1.570410011509285e-05, "loss": 0.0865, "step": 36365 }, { "epoch": 3.296175457676273, "grad_norm": 0.7364245653152466, "learning_rate": 1.5696758789461087e-05, "loss": 0.116, "step": 36370 }, { "epoch": 3.2966286025013596, "grad_norm": 0.5825679898262024, "learning_rate": 1.5689418394900846e-05, "loss": 0.0653, "step": 36375 }, { "epoch": 3.2970817473264455, "grad_norm": 0.5722463130950928, "learning_rate": 1.568207893214677e-05, "loss": 0.0581, "step": 36380 }, { "epoch": 3.2975348921515315, "grad_norm": 0.7719249129295349, "learning_rate": 1.5674740401933373e-05, "loss": 0.0623, "step": 36385 }, { "epoch": 3.297988036976618, "grad_norm": 0.5805779099464417, "learning_rate": 1.5667402804995123e-05, "loss": 0.1014, "step": 36390 }, { "epoch": 3.298441181801704, "grad_norm": 0.6117797493934631, "learning_rate": 1.5660066142066344e-05, "loss": 0.0713, "step": 36395 }, { "epoch": 3.29889432662679, "grad_norm": 0.9030008316040039, "learning_rate": 1.5652730413881307e-05, "loss": 0.0848, "step": 36400 }, { "epoch": 3.299347471451876, "grad_norm": 0.5331222414970398, "learning_rate": 1.564539562117417e-05, "loss": 0.0847, "step": 36405 }, { "epoch": 3.299800616276962, "grad_norm": 0.6447046399116516, "learning_rate": 1.5638061764679e-05, "loss": 0.0695, "step": 36410 }, { "epoch": 3.300253761102048, "grad_norm": 0.5691525340080261, "learning_rate": 1.563072884512978e-05, "loss": 0.061, "step": 36415 }, { "epoch": 3.300706905927134, "grad_norm": 0.6807186603546143, "learning_rate": 1.5623396863260377e-05, "loss": 0.0595, "step": 36420 }, { "epoch": 3.3011600507522205, "grad_norm": 0.7100244760513306, "learning_rate": 1.5616065819804584e-05, "loss": 0.0681, "step": 36425 }, { "epoch": 3.3016131955773065, "grad_norm": 0.6424606442451477, "learning_rate": 1.5608735715496107e-05, "loss": 0.064, "step": 36430 }, { "epoch": 3.3020663404023924, "grad_norm": 0.6488098502159119, "learning_rate": 1.560140655106852e-05, "loss": 0.0627, "step": 36435 }, { "epoch": 3.302519485227479, "grad_norm": 0.7123261094093323, "learning_rate": 1.5594078327255357e-05, "loss": 0.0589, "step": 36440 }, { "epoch": 3.302972630052565, "grad_norm": 0.7279481887817383, "learning_rate": 1.5586751044790005e-05, "loss": 0.0641, "step": 36445 }, { "epoch": 3.3034257748776508, "grad_norm": 0.836082398891449, "learning_rate": 1.5579424704405786e-05, "loss": 0.0847, "step": 36450 }, { "epoch": 3.303878919702737, "grad_norm": 0.7402432560920715, "learning_rate": 1.557209930683594e-05, "loss": 0.0673, "step": 36455 }, { "epoch": 3.304332064527823, "grad_norm": 0.7139565348625183, "learning_rate": 1.5564774852813577e-05, "loss": 0.0603, "step": 36460 }, { "epoch": 3.304785209352909, "grad_norm": 0.7498888373374939, "learning_rate": 1.5557451343071733e-05, "loss": 0.0672, "step": 36465 }, { "epoch": 3.3052383541779955, "grad_norm": 0.6464352607727051, "learning_rate": 1.5550128778343355e-05, "loss": 0.0676, "step": 36470 }, { "epoch": 3.3056914990030815, "grad_norm": 0.6969254612922668, "learning_rate": 1.554280715936128e-05, "loss": 0.0631, "step": 36475 }, { "epoch": 3.3061446438281674, "grad_norm": 0.6995512843132019, "learning_rate": 1.553548648685827e-05, "loss": 0.0621, "step": 36480 }, { "epoch": 3.306597788653254, "grad_norm": 0.6009315848350525, "learning_rate": 1.5528166761566962e-05, "loss": 0.0621, "step": 36485 }, { "epoch": 3.30705093347834, "grad_norm": 0.620963990688324, "learning_rate": 1.552084798421993e-05, "loss": 0.0718, "step": 36490 }, { "epoch": 3.3075040783034257, "grad_norm": 0.6811867356300354, "learning_rate": 1.5513530155549634e-05, "loss": 0.0803, "step": 36495 }, { "epoch": 3.3079572231285117, "grad_norm": 0.6571009755134583, "learning_rate": 1.5506213276288445e-05, "loss": 0.076, "step": 36500 }, { "epoch": 3.308410367953598, "grad_norm": 0.5280532240867615, "learning_rate": 1.5498897347168654e-05, "loss": 0.061, "step": 36505 }, { "epoch": 3.308863512778684, "grad_norm": 0.5602198243141174, "learning_rate": 1.5491582368922415e-05, "loss": 0.0612, "step": 36510 }, { "epoch": 3.30931665760377, "grad_norm": 0.6234837770462036, "learning_rate": 1.5484268342281826e-05, "loss": 0.064, "step": 36515 }, { "epoch": 3.3097698024288564, "grad_norm": 0.6897841691970825, "learning_rate": 1.5476955267978888e-05, "loss": 0.0668, "step": 36520 }, { "epoch": 3.3102229472539424, "grad_norm": 0.7636523246765137, "learning_rate": 1.5469643146745476e-05, "loss": 0.0655, "step": 36525 }, { "epoch": 3.3106760920790284, "grad_norm": 0.5821263790130615, "learning_rate": 1.546233197931341e-05, "loss": 0.0613, "step": 36530 }, { "epoch": 3.3111292369041143, "grad_norm": 0.5551704168319702, "learning_rate": 1.545502176641438e-05, "loss": 0.0592, "step": 36535 }, { "epoch": 3.3115823817292007, "grad_norm": 0.6355758905410767, "learning_rate": 1.544771250877999e-05, "loss": 0.0567, "step": 36540 }, { "epoch": 3.3120355265542867, "grad_norm": 0.6920828819274902, "learning_rate": 1.544040420714177e-05, "loss": 0.0712, "step": 36545 }, { "epoch": 3.3124886713793726, "grad_norm": 0.680935800075531, "learning_rate": 1.5433096862231123e-05, "loss": 0.0637, "step": 36550 }, { "epoch": 3.312941816204459, "grad_norm": 0.7138166427612305, "learning_rate": 1.542579047477939e-05, "loss": 0.0752, "step": 36555 }, { "epoch": 3.313394961029545, "grad_norm": 0.5781198143959045, "learning_rate": 1.5418485045517766e-05, "loss": 0.0609, "step": 36560 }, { "epoch": 3.313848105854631, "grad_norm": 0.6710150241851807, "learning_rate": 1.5411180575177402e-05, "loss": 0.0851, "step": 36565 }, { "epoch": 3.3143012506797174, "grad_norm": 0.6464378237724304, "learning_rate": 1.540387706448933e-05, "loss": 0.0643, "step": 36570 }, { "epoch": 3.3147543955048033, "grad_norm": 0.7030400037765503, "learning_rate": 1.5396574514184483e-05, "loss": 0.0648, "step": 36575 }, { "epoch": 3.3152075403298893, "grad_norm": 0.567412793636322, "learning_rate": 1.5389272924993706e-05, "loss": 0.0614, "step": 36580 }, { "epoch": 3.3156606851549757, "grad_norm": 0.512793242931366, "learning_rate": 1.538197229764775e-05, "loss": 0.0558, "step": 36585 }, { "epoch": 3.3161138299800617, "grad_norm": 0.623753011226654, "learning_rate": 1.5374672632877252e-05, "loss": 0.0689, "step": 36590 }, { "epoch": 3.3165669748051476, "grad_norm": 0.5715769529342651, "learning_rate": 1.536737393141277e-05, "loss": 0.0715, "step": 36595 }, { "epoch": 3.317020119630234, "grad_norm": 0.7367890477180481, "learning_rate": 1.536007619398476e-05, "loss": 0.082, "step": 36600 }, { "epoch": 3.31747326445532, "grad_norm": 0.660319447517395, "learning_rate": 1.5352779421323582e-05, "loss": 0.0643, "step": 36605 }, { "epoch": 3.317926409280406, "grad_norm": 0.7355560660362244, "learning_rate": 1.534548361415951e-05, "loss": 0.0673, "step": 36610 }, { "epoch": 3.3183795541054923, "grad_norm": 0.6065195202827454, "learning_rate": 1.5338188773222695e-05, "loss": 0.0564, "step": 36615 }, { "epoch": 3.3188326989305783, "grad_norm": 0.6754169464111328, "learning_rate": 1.5330894899243224e-05, "loss": 0.0743, "step": 36620 }, { "epoch": 3.3192858437556643, "grad_norm": 0.6596850752830505, "learning_rate": 1.532360199295105e-05, "loss": 0.0666, "step": 36625 }, { "epoch": 3.3197389885807502, "grad_norm": 0.5727421045303345, "learning_rate": 1.5316310055076056e-05, "loss": 0.0639, "step": 36630 }, { "epoch": 3.3201921334058366, "grad_norm": 0.5834304690361023, "learning_rate": 1.530901908634803e-05, "loss": 0.0644, "step": 36635 }, { "epoch": 3.3206452782309226, "grad_norm": 0.6957223415374756, "learning_rate": 1.5301729087496646e-05, "loss": 0.073, "step": 36640 }, { "epoch": 3.3210984230560086, "grad_norm": 0.5748009085655212, "learning_rate": 1.529444005925151e-05, "loss": 0.0611, "step": 36645 }, { "epoch": 3.321551567881095, "grad_norm": 0.6745550632476807, "learning_rate": 1.5287152002342077e-05, "loss": 0.0858, "step": 36650 }, { "epoch": 3.322004712706181, "grad_norm": 0.5849301815032959, "learning_rate": 1.5279864917497754e-05, "loss": 0.0675, "step": 36655 }, { "epoch": 3.322457857531267, "grad_norm": 0.519865870475769, "learning_rate": 1.5272578805447844e-05, "loss": 0.0623, "step": 36660 }, { "epoch": 3.3229110023563533, "grad_norm": 0.543945848941803, "learning_rate": 1.5265293666921527e-05, "loss": 0.0782, "step": 36665 }, { "epoch": 3.3233641471814392, "grad_norm": 0.5918794870376587, "learning_rate": 1.5258009502647925e-05, "loss": 0.0653, "step": 36670 }, { "epoch": 3.323817292006525, "grad_norm": 0.8200437426567078, "learning_rate": 1.5250726313356013e-05, "loss": 0.0649, "step": 36675 }, { "epoch": 3.324270436831611, "grad_norm": 0.6169978380203247, "learning_rate": 1.5243444099774707e-05, "loss": 0.0738, "step": 36680 }, { "epoch": 3.3247235816566976, "grad_norm": 0.6287147998809814, "learning_rate": 1.5236162862632818e-05, "loss": 0.078, "step": 36685 }, { "epoch": 3.3251767264817835, "grad_norm": 0.6837977766990662, "learning_rate": 1.5228882602659051e-05, "loss": 0.0709, "step": 36690 }, { "epoch": 3.3256298713068695, "grad_norm": 0.6333690881729126, "learning_rate": 1.522160332058202e-05, "loss": 0.0693, "step": 36695 }, { "epoch": 3.326083016131956, "grad_norm": 0.7589621543884277, "learning_rate": 1.5214325017130226e-05, "loss": 0.0774, "step": 36700 }, { "epoch": 3.326536160957042, "grad_norm": 0.7428878545761108, "learning_rate": 1.5207047693032095e-05, "loss": 0.0738, "step": 36705 }, { "epoch": 3.326989305782128, "grad_norm": 0.8526939749717712, "learning_rate": 1.5199771349015942e-05, "loss": 0.0695, "step": 36710 }, { "epoch": 3.327442450607214, "grad_norm": 0.6089016795158386, "learning_rate": 1.5192495985809985e-05, "loss": 0.0711, "step": 36715 }, { "epoch": 3.3278955954323, "grad_norm": 0.6158754229545593, "learning_rate": 1.518522160414234e-05, "loss": 0.0618, "step": 36720 }, { "epoch": 3.328348740257386, "grad_norm": 0.5330245494842529, "learning_rate": 1.5177948204741049e-05, "loss": 0.0597, "step": 36725 }, { "epoch": 3.3288018850824725, "grad_norm": 0.6058109998703003, "learning_rate": 1.517067578833401e-05, "loss": 0.0729, "step": 36730 }, { "epoch": 3.3292550299075585, "grad_norm": 0.5968886017799377, "learning_rate": 1.5163404355649065e-05, "loss": 0.0612, "step": 36735 }, { "epoch": 3.3297081747326445, "grad_norm": 0.6400167346000671, "learning_rate": 1.5156133907413932e-05, "loss": 0.0595, "step": 36740 }, { "epoch": 3.330161319557731, "grad_norm": 0.5993390083312988, "learning_rate": 1.5148864444356247e-05, "loss": 0.0791, "step": 36745 }, { "epoch": 3.330614464382817, "grad_norm": 0.685390830039978, "learning_rate": 1.5141595967203553e-05, "loss": 0.0766, "step": 36750 }, { "epoch": 3.331067609207903, "grad_norm": 0.6626861691474915, "learning_rate": 1.5134328476683251e-05, "loss": 0.0632, "step": 36755 }, { "epoch": 3.3315207540329888, "grad_norm": 0.694129467010498, "learning_rate": 1.5127061973522696e-05, "loss": 0.066, "step": 36760 }, { "epoch": 3.331973898858075, "grad_norm": 0.7242607474327087, "learning_rate": 1.5119796458449115e-05, "loss": 0.0741, "step": 36765 }, { "epoch": 3.332427043683161, "grad_norm": 0.7398892045021057, "learning_rate": 1.5112531932189638e-05, "loss": 0.0575, "step": 36770 }, { "epoch": 3.332880188508247, "grad_norm": 0.7095710039138794, "learning_rate": 1.5105268395471323e-05, "loss": 0.0717, "step": 36775 }, { "epoch": 3.3333333333333335, "grad_norm": 0.6228137612342834, "learning_rate": 1.509800584902108e-05, "loss": 0.0744, "step": 36780 }, { "epoch": 3.3337864781584194, "grad_norm": 0.7126034498214722, "learning_rate": 1.509074429356577e-05, "loss": 0.0749, "step": 36785 }, { "epoch": 3.3342396229835054, "grad_norm": 0.6322215795516968, "learning_rate": 1.508348372983211e-05, "loss": 0.0904, "step": 36790 }, { "epoch": 3.334692767808592, "grad_norm": 0.6022145748138428, "learning_rate": 1.5076224158546745e-05, "loss": 0.0654, "step": 36795 }, { "epoch": 3.3351459126336778, "grad_norm": 0.5826670527458191, "learning_rate": 1.5068965580436239e-05, "loss": 0.0594, "step": 36800 }, { "epoch": 3.3355990574587637, "grad_norm": 0.6472330689430237, "learning_rate": 1.5061707996227e-05, "loss": 0.0675, "step": 36805 }, { "epoch": 3.3360522022838497, "grad_norm": 0.7547546029090881, "learning_rate": 1.5054451406645397e-05, "loss": 0.0712, "step": 36810 }, { "epoch": 3.336505347108936, "grad_norm": 0.662155032157898, "learning_rate": 1.5047195812417653e-05, "loss": 0.0637, "step": 36815 }, { "epoch": 3.336958491934022, "grad_norm": 0.6381480693817139, "learning_rate": 1.503994121426991e-05, "loss": 0.0815, "step": 36820 }, { "epoch": 3.337411636759108, "grad_norm": 0.7481842637062073, "learning_rate": 1.5032687612928225e-05, "loss": 0.0626, "step": 36825 }, { "epoch": 3.3378647815841944, "grad_norm": 0.6015474200248718, "learning_rate": 1.5025435009118529e-05, "loss": 0.0672, "step": 36830 }, { "epoch": 3.3383179264092804, "grad_norm": 0.6588448286056519, "learning_rate": 1.5018183403566677e-05, "loss": 0.0782, "step": 36835 }, { "epoch": 3.3387710712343663, "grad_norm": 0.7095950245857239, "learning_rate": 1.5010932796998397e-05, "loss": 0.0619, "step": 36840 }, { "epoch": 3.3392242160594527, "grad_norm": 0.6684134006500244, "learning_rate": 1.5003683190139339e-05, "loss": 0.0678, "step": 36845 }, { "epoch": 3.3396773608845387, "grad_norm": 0.7652760744094849, "learning_rate": 1.4996434583715052e-05, "loss": 0.0771, "step": 36850 }, { "epoch": 3.3401305057096247, "grad_norm": 0.8696916103363037, "learning_rate": 1.4989186978450967e-05, "loss": 0.0631, "step": 36855 }, { "epoch": 3.340583650534711, "grad_norm": 0.6025087833404541, "learning_rate": 1.4981940375072434e-05, "loss": 0.0682, "step": 36860 }, { "epoch": 3.341036795359797, "grad_norm": 0.810546875, "learning_rate": 1.4974694774304704e-05, "loss": 0.0678, "step": 36865 }, { "epoch": 3.341489940184883, "grad_norm": 0.7348450422286987, "learning_rate": 1.4967450176872902e-05, "loss": 0.078, "step": 36870 }, { "epoch": 3.3419430850099694, "grad_norm": 0.6082539558410645, "learning_rate": 1.4960206583502082e-05, "loss": 0.0589, "step": 36875 }, { "epoch": 3.3423962298350554, "grad_norm": 0.7955969572067261, "learning_rate": 1.4952963994917174e-05, "loss": 0.0843, "step": 36880 }, { "epoch": 3.3428493746601413, "grad_norm": 0.6200499534606934, "learning_rate": 1.4945722411843026e-05, "loss": 0.0688, "step": 36885 }, { "epoch": 3.3433025194852277, "grad_norm": 0.7288259267807007, "learning_rate": 1.4938481835004392e-05, "loss": 0.0648, "step": 36890 }, { "epoch": 3.3437556643103137, "grad_norm": 0.6325421929359436, "learning_rate": 1.4931242265125883e-05, "loss": 0.0677, "step": 36895 }, { "epoch": 3.3442088091353996, "grad_norm": 0.6137844324111938, "learning_rate": 1.492400370293206e-05, "loss": 0.0591, "step": 36900 }, { "epoch": 3.3446619539604856, "grad_norm": 0.47779810428619385, "learning_rate": 1.4916766149147346e-05, "loss": 0.064, "step": 36905 }, { "epoch": 3.345115098785572, "grad_norm": 0.6765597462654114, "learning_rate": 1.4909529604496086e-05, "loss": 0.0692, "step": 36910 }, { "epoch": 3.345568243610658, "grad_norm": 0.6228484511375427, "learning_rate": 1.490229406970252e-05, "loss": 0.0694, "step": 36915 }, { "epoch": 3.346021388435744, "grad_norm": 0.6382238268852234, "learning_rate": 1.4895059545490772e-05, "loss": 0.0836, "step": 36920 }, { "epoch": 3.3464745332608303, "grad_norm": 0.6164516806602478, "learning_rate": 1.4887826032584884e-05, "loss": 0.0669, "step": 36925 }, { "epoch": 3.3469276780859163, "grad_norm": 0.675477921962738, "learning_rate": 1.4880593531708778e-05, "loss": 0.0739, "step": 36930 }, { "epoch": 3.3473808229110023, "grad_norm": 0.6964076161384583, "learning_rate": 1.4873362043586294e-05, "loss": 0.0636, "step": 36935 }, { "epoch": 3.347833967736088, "grad_norm": 0.6693610548973083, "learning_rate": 1.4866131568941171e-05, "loss": 0.0632, "step": 36940 }, { "epoch": 3.3482871125611746, "grad_norm": 0.7370733022689819, "learning_rate": 1.4858902108497014e-05, "loss": 0.0708, "step": 36945 }, { "epoch": 3.3487402573862606, "grad_norm": 0.803255558013916, "learning_rate": 1.485167366297737e-05, "loss": 0.0627, "step": 36950 }, { "epoch": 3.3491934022113465, "grad_norm": 0.5986968874931335, "learning_rate": 1.484444623310565e-05, "loss": 0.0617, "step": 36955 }, { "epoch": 3.349646547036433, "grad_norm": 0.6961264610290527, "learning_rate": 1.4837219819605183e-05, "loss": 0.0729, "step": 36960 }, { "epoch": 3.350099691861519, "grad_norm": 0.6104423999786377, "learning_rate": 1.4829994423199203e-05, "loss": 0.0678, "step": 36965 }, { "epoch": 3.350552836686605, "grad_norm": 0.6045801639556885, "learning_rate": 1.4822770044610809e-05, "loss": 0.0624, "step": 36970 }, { "epoch": 3.3510059815116913, "grad_norm": 0.4169977009296417, "learning_rate": 1.481554668456303e-05, "loss": 0.0699, "step": 36975 }, { "epoch": 3.3514591263367772, "grad_norm": 0.7247487306594849, "learning_rate": 1.480832434377879e-05, "loss": 0.0733, "step": 36980 }, { "epoch": 3.351912271161863, "grad_norm": 0.5040954947471619, "learning_rate": 1.4801103022980886e-05, "loss": 0.0714, "step": 36985 }, { "epoch": 3.3523654159869496, "grad_norm": 0.6691620349884033, "learning_rate": 1.4793882722892044e-05, "loss": 0.0878, "step": 36990 }, { "epoch": 3.3528185608120356, "grad_norm": 0.5698748230934143, "learning_rate": 1.4786663444234866e-05, "loss": 0.0662, "step": 36995 }, { "epoch": 3.3532717056371215, "grad_norm": 0.5581625699996948, "learning_rate": 1.477944518773186e-05, "loss": 0.0565, "step": 37000 }, { "epoch": 3.353724850462208, "grad_norm": 0.6083986163139343, "learning_rate": 1.4772227954105446e-05, "loss": 0.0596, "step": 37005 }, { "epoch": 3.354177995287294, "grad_norm": 0.5740327835083008, "learning_rate": 1.4765011744077905e-05, "loss": 0.0692, "step": 37010 }, { "epoch": 3.35463114011238, "grad_norm": 0.6480974555015564, "learning_rate": 1.4757796558371456e-05, "loss": 0.0719, "step": 37015 }, { "epoch": 3.3550842849374662, "grad_norm": 1.191305160522461, "learning_rate": 1.4750582397708184e-05, "loss": 0.0703, "step": 37020 }, { "epoch": 3.355537429762552, "grad_norm": 0.6387565732002258, "learning_rate": 1.4743369262810092e-05, "loss": 0.0633, "step": 37025 }, { "epoch": 3.355990574587638, "grad_norm": 0.6623743772506714, "learning_rate": 1.4736157154399077e-05, "loss": 0.0663, "step": 37030 }, { "epoch": 3.356443719412724, "grad_norm": 0.5428873300552368, "learning_rate": 1.4728946073196918e-05, "loss": 0.0609, "step": 37035 }, { "epoch": 3.3568968642378105, "grad_norm": 0.7009481191635132, "learning_rate": 1.4721736019925308e-05, "loss": 0.0723, "step": 37040 }, { "epoch": 3.3573500090628965, "grad_norm": 0.8718494772911072, "learning_rate": 1.4714526995305833e-05, "loss": 0.071, "step": 37045 }, { "epoch": 3.3578031538879825, "grad_norm": 0.7047044038772583, "learning_rate": 1.470731900005997e-05, "loss": 0.0605, "step": 37050 }, { "epoch": 3.358256298713069, "grad_norm": 0.6121190190315247, "learning_rate": 1.470011203490911e-05, "loss": 0.0693, "step": 37055 }, { "epoch": 3.358709443538155, "grad_norm": 0.8464983105659485, "learning_rate": 1.4692906100574511e-05, "loss": 0.0736, "step": 37060 }, { "epoch": 3.3591625883632408, "grad_norm": 0.6187869310379028, "learning_rate": 1.4685701197777357e-05, "loss": 0.0635, "step": 37065 }, { "epoch": 3.359615733188327, "grad_norm": 0.6225117444992065, "learning_rate": 1.4678497327238711e-05, "loss": 0.0688, "step": 37070 }, { "epoch": 3.360068878013413, "grad_norm": 0.5833883285522461, "learning_rate": 1.4671294489679543e-05, "loss": 0.061, "step": 37075 }, { "epoch": 3.360522022838499, "grad_norm": 0.6018774509429932, "learning_rate": 1.4664092685820721e-05, "loss": 0.0555, "step": 37080 }, { "epoch": 3.360975167663585, "grad_norm": 0.5496803522109985, "learning_rate": 1.4656891916382986e-05, "loss": 0.0656, "step": 37085 }, { "epoch": 3.3614283124886715, "grad_norm": 0.6695072650909424, "learning_rate": 1.4649692182087013e-05, "loss": 0.0682, "step": 37090 }, { "epoch": 3.3618814573137574, "grad_norm": 0.5250763893127441, "learning_rate": 1.4642493483653341e-05, "loss": 0.0676, "step": 37095 }, { "epoch": 3.3623346021388434, "grad_norm": 0.49977487325668335, "learning_rate": 1.463529582180242e-05, "loss": 0.059, "step": 37100 }, { "epoch": 3.36278774696393, "grad_norm": 0.6801801919937134, "learning_rate": 1.4628099197254607e-05, "loss": 0.0766, "step": 37105 }, { "epoch": 3.3632408917890158, "grad_norm": 0.7369896769523621, "learning_rate": 1.4620903610730122e-05, "loss": 0.0754, "step": 37110 }, { "epoch": 3.3636940366141017, "grad_norm": 0.5879465937614441, "learning_rate": 1.4613709062949114e-05, "loss": 0.0625, "step": 37115 }, { "epoch": 3.364147181439188, "grad_norm": 0.7128068208694458, "learning_rate": 1.4606515554631617e-05, "loss": 0.0661, "step": 37120 }, { "epoch": 3.364600326264274, "grad_norm": 0.6486746072769165, "learning_rate": 1.4599323086497555e-05, "loss": 0.0651, "step": 37125 }, { "epoch": 3.36505347108936, "grad_norm": 0.7039248943328857, "learning_rate": 1.4592131659266761e-05, "loss": 0.0637, "step": 37130 }, { "epoch": 3.3655066159144464, "grad_norm": 0.7591780424118042, "learning_rate": 1.4584941273658936e-05, "loss": 0.0809, "step": 37135 }, { "epoch": 3.3659597607395324, "grad_norm": 0.5932065844535828, "learning_rate": 1.4577751930393719e-05, "loss": 0.0609, "step": 37140 }, { "epoch": 3.3664129055646184, "grad_norm": 0.6454771757125854, "learning_rate": 1.4570563630190612e-05, "loss": 0.0693, "step": 37145 }, { "epoch": 3.3668660503897048, "grad_norm": 0.6089423894882202, "learning_rate": 1.4563376373769022e-05, "loss": 0.0664, "step": 37150 }, { "epoch": 3.3673191952147907, "grad_norm": 0.6514254212379456, "learning_rate": 1.4556190161848254e-05, "loss": 0.0652, "step": 37155 }, { "epoch": 3.3677723400398767, "grad_norm": 0.7956656217575073, "learning_rate": 1.4549004995147502e-05, "loss": 0.0593, "step": 37160 }, { "epoch": 3.3682254848649626, "grad_norm": 0.7248668074607849, "learning_rate": 1.4541820874385855e-05, "loss": 0.0635, "step": 37165 }, { "epoch": 3.368678629690049, "grad_norm": 0.9142137765884399, "learning_rate": 1.4534637800282322e-05, "loss": 0.0686, "step": 37170 }, { "epoch": 3.369131774515135, "grad_norm": 0.7180199027061462, "learning_rate": 1.4527455773555776e-05, "loss": 0.0684, "step": 37175 }, { "epoch": 3.369584919340221, "grad_norm": 0.7055489420890808, "learning_rate": 1.4520274794924993e-05, "loss": 0.0657, "step": 37180 }, { "epoch": 3.3700380641653074, "grad_norm": 0.7879129648208618, "learning_rate": 1.4513094865108651e-05, "loss": 0.0767, "step": 37185 }, { "epoch": 3.3704912089903933, "grad_norm": 0.8500697612762451, "learning_rate": 1.4505915984825313e-05, "loss": 0.0714, "step": 37190 }, { "epoch": 3.3709443538154793, "grad_norm": 0.5893452763557434, "learning_rate": 1.4498738154793461e-05, "loss": 0.0658, "step": 37195 }, { "epoch": 3.3713974986405657, "grad_norm": 0.5947142839431763, "learning_rate": 1.4491561375731444e-05, "loss": 0.0607, "step": 37200 }, { "epoch": 3.3718506434656517, "grad_norm": 0.8128035664558411, "learning_rate": 1.4484385648357512e-05, "loss": 0.0769, "step": 37205 }, { "epoch": 3.3723037882907376, "grad_norm": 0.666056215763092, "learning_rate": 1.4477210973389826e-05, "loss": 0.0586, "step": 37210 }, { "epoch": 3.3727569331158236, "grad_norm": 0.6287643313407898, "learning_rate": 1.4470037351546408e-05, "loss": 0.0643, "step": 37215 }, { "epoch": 3.37321007794091, "grad_norm": 0.5935428142547607, "learning_rate": 1.4462864783545224e-05, "loss": 0.0682, "step": 37220 }, { "epoch": 3.373663222765996, "grad_norm": 0.7066668272018433, "learning_rate": 1.4455693270104096e-05, "loss": 0.0676, "step": 37225 }, { "epoch": 3.374116367591082, "grad_norm": 0.6670644283294678, "learning_rate": 1.4448522811940737e-05, "loss": 0.069, "step": 37230 }, { "epoch": 3.3745695124161683, "grad_norm": 0.6022231578826904, "learning_rate": 1.4441353409772803e-05, "loss": 0.0688, "step": 37235 }, { "epoch": 3.3750226572412543, "grad_norm": 0.5641397833824158, "learning_rate": 1.4434185064317768e-05, "loss": 0.0653, "step": 37240 }, { "epoch": 3.3754758020663402, "grad_norm": 0.6611388325691223, "learning_rate": 1.4427017776293078e-05, "loss": 0.0662, "step": 37245 }, { "epoch": 3.3759289468914266, "grad_norm": 0.6456977128982544, "learning_rate": 1.441985154641602e-05, "loss": 0.0711, "step": 37250 }, { "epoch": 3.3763820917165126, "grad_norm": 0.665949821472168, "learning_rate": 1.4412686375403783e-05, "loss": 0.0708, "step": 37255 }, { "epoch": 3.3768352365415986, "grad_norm": 0.4468235969543457, "learning_rate": 1.4405522263973498e-05, "loss": 0.061, "step": 37260 }, { "epoch": 3.377288381366685, "grad_norm": 0.747553825378418, "learning_rate": 1.4398359212842099e-05, "loss": 0.0716, "step": 37265 }, { "epoch": 3.377741526191771, "grad_norm": 0.6527658104896545, "learning_rate": 1.4391197222726511e-05, "loss": 0.0637, "step": 37270 }, { "epoch": 3.378194671016857, "grad_norm": 0.7332178950309753, "learning_rate": 1.4384036294343488e-05, "loss": 0.0642, "step": 37275 }, { "epoch": 3.3786478158419433, "grad_norm": 0.6297224164009094, "learning_rate": 1.4376876428409692e-05, "loss": 0.0808, "step": 37280 }, { "epoch": 3.3791009606670293, "grad_norm": 0.5281938910484314, "learning_rate": 1.4369717625641715e-05, "loss": 0.0614, "step": 37285 }, { "epoch": 3.379554105492115, "grad_norm": 0.5812075138092041, "learning_rate": 1.4362559886755972e-05, "loss": 0.0727, "step": 37290 }, { "epoch": 3.3800072503172016, "grad_norm": 0.6971353888511658, "learning_rate": 1.435540321246884e-05, "loss": 0.0804, "step": 37295 }, { "epoch": 3.3804603951422876, "grad_norm": 0.6724805235862732, "learning_rate": 1.4348247603496553e-05, "loss": 0.065, "step": 37300 }, { "epoch": 3.3809135399673735, "grad_norm": 0.7062762379646301, "learning_rate": 1.4341093060555239e-05, "loss": 0.0594, "step": 37305 }, { "epoch": 3.3813666847924595, "grad_norm": 0.7127952575683594, "learning_rate": 1.4333939584360951e-05, "loss": 0.0542, "step": 37310 }, { "epoch": 3.381819829617546, "grad_norm": 0.5293408036231995, "learning_rate": 1.4326787175629577e-05, "loss": 0.059, "step": 37315 }, { "epoch": 3.382272974442632, "grad_norm": 0.5634048581123352, "learning_rate": 1.431963583507696e-05, "loss": 0.0671, "step": 37320 }, { "epoch": 3.382726119267718, "grad_norm": 0.5960442423820496, "learning_rate": 1.4312485563418798e-05, "loss": 0.0591, "step": 37325 }, { "epoch": 3.3831792640928042, "grad_norm": 0.7705152630805969, "learning_rate": 1.4305336361370686e-05, "loss": 0.07, "step": 37330 }, { "epoch": 3.38363240891789, "grad_norm": 0.7068137526512146, "learning_rate": 1.4298188229648146e-05, "loss": 0.0629, "step": 37335 }, { "epoch": 3.384085553742976, "grad_norm": 0.6435515880584717, "learning_rate": 1.4291041168966527e-05, "loss": 0.0579, "step": 37340 }, { "epoch": 3.384538698568062, "grad_norm": 0.6107269525527954, "learning_rate": 1.4283895180041146e-05, "loss": 0.0665, "step": 37345 }, { "epoch": 3.3849918433931485, "grad_norm": 0.6874361634254456, "learning_rate": 1.4276750263587138e-05, "loss": 0.068, "step": 37350 }, { "epoch": 3.3854449882182345, "grad_norm": 0.5919417142868042, "learning_rate": 1.4269606420319598e-05, "loss": 0.0603, "step": 37355 }, { "epoch": 3.3858981330433204, "grad_norm": 0.6804156303405762, "learning_rate": 1.4262463650953478e-05, "loss": 0.0604, "step": 37360 }, { "epoch": 3.386351277868407, "grad_norm": 0.5918968319892883, "learning_rate": 1.425532195620361e-05, "loss": 0.0639, "step": 37365 }, { "epoch": 3.386804422693493, "grad_norm": 0.6662484407424927, "learning_rate": 1.4248181336784765e-05, "loss": 0.0595, "step": 37370 }, { "epoch": 3.3872575675185788, "grad_norm": 0.6283101439476013, "learning_rate": 1.4241041793411569e-05, "loss": 0.0618, "step": 37375 }, { "epoch": 3.387710712343665, "grad_norm": 0.7368021607398987, "learning_rate": 1.4233903326798548e-05, "loss": 0.0723, "step": 37380 }, { "epoch": 3.388163857168751, "grad_norm": 0.5700169801712036, "learning_rate": 1.422676593766012e-05, "loss": 0.0616, "step": 37385 }, { "epoch": 3.388617001993837, "grad_norm": 0.5395476818084717, "learning_rate": 1.4219629626710587e-05, "loss": 0.0739, "step": 37390 }, { "epoch": 3.3890701468189235, "grad_norm": 0.6272894144058228, "learning_rate": 1.4212494394664177e-05, "loss": 0.0785, "step": 37395 }, { "epoch": 3.3895232916440095, "grad_norm": 0.5424582958221436, "learning_rate": 1.4205360242234975e-05, "loss": 0.0735, "step": 37400 }, { "epoch": 3.3899764364690954, "grad_norm": 0.5340643525123596, "learning_rate": 1.4198227170136969e-05, "loss": 0.0693, "step": 37405 }, { "epoch": 3.390429581294182, "grad_norm": 0.5961649417877197, "learning_rate": 1.4191095179084037e-05, "loss": 0.0556, "step": 37410 }, { "epoch": 3.3908827261192678, "grad_norm": 0.7641898989677429, "learning_rate": 1.4183964269789946e-05, "loss": 0.0655, "step": 37415 }, { "epoch": 3.3913358709443537, "grad_norm": 0.7006478905677795, "learning_rate": 1.4176834442968378e-05, "loss": 0.0681, "step": 37420 }, { "epoch": 3.39178901576944, "grad_norm": 0.6200090050697327, "learning_rate": 1.4169705699332874e-05, "loss": 0.0601, "step": 37425 }, { "epoch": 3.392242160594526, "grad_norm": 0.5426644086837769, "learning_rate": 1.4162578039596888e-05, "loss": 0.0524, "step": 37430 }, { "epoch": 3.392695305419612, "grad_norm": 0.7016287446022034, "learning_rate": 1.4155451464473757e-05, "loss": 0.0644, "step": 37435 }, { "epoch": 3.393148450244698, "grad_norm": 0.6562203168869019, "learning_rate": 1.4148325974676696e-05, "loss": 0.0582, "step": 37440 }, { "epoch": 3.3936015950697844, "grad_norm": 0.6686019897460938, "learning_rate": 1.414120157091885e-05, "loss": 0.0686, "step": 37445 }, { "epoch": 3.3940547398948704, "grad_norm": 0.49709194898605347, "learning_rate": 1.4134078253913224e-05, "loss": 0.0706, "step": 37450 }, { "epoch": 3.3945078847199563, "grad_norm": 0.654052734375, "learning_rate": 1.4126956024372718e-05, "loss": 0.0583, "step": 37455 }, { "epoch": 3.3949610295450428, "grad_norm": 0.6048903465270996, "learning_rate": 1.4119834883010127e-05, "loss": 0.0632, "step": 37460 }, { "epoch": 3.3954141743701287, "grad_norm": 0.8296813368797302, "learning_rate": 1.4112714830538132e-05, "loss": 0.0656, "step": 37465 }, { "epoch": 3.3958673191952147, "grad_norm": 0.6715512275695801, "learning_rate": 1.4105595867669325e-05, "loss": 0.0711, "step": 37470 }, { "epoch": 3.396320464020301, "grad_norm": 0.6860480904579163, "learning_rate": 1.4098477995116166e-05, "loss": 0.069, "step": 37475 }, { "epoch": 3.396773608845387, "grad_norm": 0.6754740476608276, "learning_rate": 1.4091361213591016e-05, "loss": 0.0698, "step": 37480 }, { "epoch": 3.397226753670473, "grad_norm": 0.5764801502227783, "learning_rate": 1.4084245523806114e-05, "loss": 0.0708, "step": 37485 }, { "epoch": 3.397679898495559, "grad_norm": 0.4858788549900055, "learning_rate": 1.407713092647362e-05, "loss": 0.0591, "step": 37490 }, { "epoch": 3.3981330433206454, "grad_norm": 0.8144728541374207, "learning_rate": 1.4070017422305554e-05, "loss": 0.0751, "step": 37495 }, { "epoch": 3.3985861881457313, "grad_norm": 0.8294988870620728, "learning_rate": 1.4062905012013838e-05, "loss": 0.0727, "step": 37500 }, { "epoch": 3.3990393329708173, "grad_norm": 0.8549115657806396, "learning_rate": 1.4055793696310291e-05, "loss": 0.0623, "step": 37505 }, { "epoch": 3.3994924777959037, "grad_norm": 0.9262466430664062, "learning_rate": 1.4048683475906596e-05, "loss": 0.073, "step": 37510 }, { "epoch": 3.3999456226209896, "grad_norm": 0.5890268087387085, "learning_rate": 1.404157435151438e-05, "loss": 0.0619, "step": 37515 }, { "epoch": 3.4003987674460756, "grad_norm": 0.5011121034622192, "learning_rate": 1.4034466323845092e-05, "loss": 0.0601, "step": 37520 }, { "epoch": 3.400851912271162, "grad_norm": 0.6576364636421204, "learning_rate": 1.4027359393610129e-05, "loss": 0.0598, "step": 37525 }, { "epoch": 3.401305057096248, "grad_norm": 0.6124126315116882, "learning_rate": 1.4020253561520746e-05, "loss": 0.0679, "step": 37530 }, { "epoch": 3.401758201921334, "grad_norm": 0.6982143521308899, "learning_rate": 1.4013148828288089e-05, "loss": 0.0576, "step": 37535 }, { "epoch": 3.4022113467464203, "grad_norm": 0.604889452457428, "learning_rate": 1.400604519462323e-05, "loss": 0.0642, "step": 37540 }, { "epoch": 3.4026644915715063, "grad_norm": 0.6067721247673035, "learning_rate": 1.3998942661237069e-05, "loss": 0.0741, "step": 37545 }, { "epoch": 3.4031176363965923, "grad_norm": 0.8420253396034241, "learning_rate": 1.3991841228840453e-05, "loss": 0.0692, "step": 37550 }, { "epoch": 3.4035707812216787, "grad_norm": 0.6547291874885559, "learning_rate": 1.3984740898144084e-05, "loss": 0.0602, "step": 37555 }, { "epoch": 3.4040239260467646, "grad_norm": 0.650482714176178, "learning_rate": 1.3977641669858566e-05, "loss": 0.0577, "step": 37560 }, { "epoch": 3.4044770708718506, "grad_norm": 0.5438544154167175, "learning_rate": 1.3970543544694412e-05, "loss": 0.0661, "step": 37565 }, { "epoch": 3.404930215696937, "grad_norm": 0.6616883277893066, "learning_rate": 1.3963446523361966e-05, "loss": 0.06, "step": 37570 }, { "epoch": 3.405383360522023, "grad_norm": 0.8371368646621704, "learning_rate": 1.395635060657154e-05, "loss": 0.0697, "step": 37575 }, { "epoch": 3.405836505347109, "grad_norm": 1.313338041305542, "learning_rate": 1.3949255795033272e-05, "loss": 0.1297, "step": 37580 }, { "epoch": 3.406289650172195, "grad_norm": 0.621117889881134, "learning_rate": 1.3942162089457214e-05, "loss": 0.0643, "step": 37585 }, { "epoch": 3.4067427949972813, "grad_norm": 0.5410668849945068, "learning_rate": 1.3935069490553333e-05, "loss": 0.0733, "step": 37590 }, { "epoch": 3.4071959398223672, "grad_norm": 0.6116331815719604, "learning_rate": 1.3927977999031414e-05, "loss": 0.0614, "step": 37595 }, { "epoch": 3.407649084647453, "grad_norm": 0.7310557961463928, "learning_rate": 1.392088761560121e-05, "loss": 0.069, "step": 37600 }, { "epoch": 3.4081022294725396, "grad_norm": 0.6292179822921753, "learning_rate": 1.3913798340972324e-05, "loss": 0.0723, "step": 37605 }, { "epoch": 3.4085553742976256, "grad_norm": 0.5389962792396545, "learning_rate": 1.3906710175854232e-05, "loss": 0.0536, "step": 37610 }, { "epoch": 3.4090085191227115, "grad_norm": 0.6099892258644104, "learning_rate": 1.3899623120956357e-05, "loss": 0.0589, "step": 37615 }, { "epoch": 3.4094616639477975, "grad_norm": 0.665086030960083, "learning_rate": 1.3892537176987933e-05, "loss": 0.0629, "step": 37620 }, { "epoch": 3.409914808772884, "grad_norm": 0.5779497027397156, "learning_rate": 1.3885452344658153e-05, "loss": 0.0588, "step": 37625 }, { "epoch": 3.41036795359797, "grad_norm": 0.5992752313613892, "learning_rate": 1.3878368624676062e-05, "loss": 0.073, "step": 37630 }, { "epoch": 3.410821098423056, "grad_norm": 0.543117344379425, "learning_rate": 1.3871286017750584e-05, "loss": 0.0657, "step": 37635 }, { "epoch": 3.411274243248142, "grad_norm": 0.6045592427253723, "learning_rate": 1.3864204524590585e-05, "loss": 0.0669, "step": 37640 }, { "epoch": 3.411727388073228, "grad_norm": 0.6639752388000488, "learning_rate": 1.3857124145904743e-05, "loss": 0.0677, "step": 37645 }, { "epoch": 3.412180532898314, "grad_norm": 0.5335975289344788, "learning_rate": 1.3850044882401694e-05, "loss": 0.0595, "step": 37650 }, { "epoch": 3.4126336777234005, "grad_norm": 0.6512395739555359, "learning_rate": 1.3842966734789923e-05, "loss": 0.0606, "step": 37655 }, { "epoch": 3.4130868225484865, "grad_norm": 0.7631365656852722, "learning_rate": 1.38358897037778e-05, "loss": 0.0859, "step": 37660 }, { "epoch": 3.4135399673735725, "grad_norm": 0.6038702726364136, "learning_rate": 1.382881379007363e-05, "loss": 0.0613, "step": 37665 }, { "epoch": 3.413993112198659, "grad_norm": 0.6205273270606995, "learning_rate": 1.3821738994385535e-05, "loss": 0.0695, "step": 37670 }, { "epoch": 3.414446257023745, "grad_norm": 0.7589276432991028, "learning_rate": 1.3814665317421588e-05, "loss": 0.065, "step": 37675 }, { "epoch": 3.414899401848831, "grad_norm": 0.7346209287643433, "learning_rate": 1.3807592759889718e-05, "loss": 0.0699, "step": 37680 }, { "epoch": 3.415352546673917, "grad_norm": 0.6447376012802124, "learning_rate": 1.3800521322497739e-05, "loss": 0.0566, "step": 37685 }, { "epoch": 3.415805691499003, "grad_norm": 0.7555273771286011, "learning_rate": 1.379345100595339e-05, "loss": 0.0701, "step": 37690 }, { "epoch": 3.416258836324089, "grad_norm": 0.5564449429512024, "learning_rate": 1.3786381810964233e-05, "loss": 0.0695, "step": 37695 }, { "epoch": 3.4167119811491755, "grad_norm": 0.6839959025382996, "learning_rate": 1.3779313738237787e-05, "loss": 0.0644, "step": 37700 }, { "epoch": 3.4171651259742615, "grad_norm": 0.5458970069885254, "learning_rate": 1.3772246788481413e-05, "loss": 0.0616, "step": 37705 }, { "epoch": 3.4176182707993474, "grad_norm": 0.6728091835975647, "learning_rate": 1.3765180962402374e-05, "loss": 0.0616, "step": 37710 }, { "epoch": 3.4180714156244334, "grad_norm": 0.6061590313911438, "learning_rate": 1.3758116260707827e-05, "loss": 0.0807, "step": 37715 }, { "epoch": 3.41852456044952, "grad_norm": 0.7894011735916138, "learning_rate": 1.3751052684104793e-05, "loss": 0.0713, "step": 37720 }, { "epoch": 3.4189777052746058, "grad_norm": 0.5419319868087769, "learning_rate": 1.374399023330022e-05, "loss": 0.059, "step": 37725 }, { "epoch": 3.4194308500996917, "grad_norm": 0.7370250225067139, "learning_rate": 1.3736928909000907e-05, "loss": 0.075, "step": 37730 }, { "epoch": 3.419883994924778, "grad_norm": 0.6786344647407532, "learning_rate": 1.3729868711913557e-05, "loss": 0.0709, "step": 37735 }, { "epoch": 3.420337139749864, "grad_norm": 0.5030903816223145, "learning_rate": 1.372280964274476e-05, "loss": 0.0561, "step": 37740 }, { "epoch": 3.42079028457495, "grad_norm": 0.7736775875091553, "learning_rate": 1.3715751702200973e-05, "loss": 0.0689, "step": 37745 }, { "epoch": 3.421243429400036, "grad_norm": 0.636604368686676, "learning_rate": 1.370869489098858e-05, "loss": 0.0638, "step": 37750 }, { "epoch": 3.4216965742251224, "grad_norm": 0.6598326563835144, "learning_rate": 1.3701639209813821e-05, "loss": 0.0666, "step": 37755 }, { "epoch": 3.4221497190502084, "grad_norm": 0.6172454953193665, "learning_rate": 1.3694584659382829e-05, "loss": 0.0617, "step": 37760 }, { "epoch": 3.4226028638752943, "grad_norm": 0.5881363153457642, "learning_rate": 1.3687531240401614e-05, "loss": 0.0566, "step": 37765 }, { "epoch": 3.4230560087003807, "grad_norm": 0.6241719126701355, "learning_rate": 1.368047895357611e-05, "loss": 0.072, "step": 37770 }, { "epoch": 3.4235091535254667, "grad_norm": 0.5229659080505371, "learning_rate": 1.3673427799612093e-05, "loss": 0.0621, "step": 37775 }, { "epoch": 3.4239622983505527, "grad_norm": 0.6647652983665466, "learning_rate": 1.3666377779215257e-05, "loss": 0.0569, "step": 37780 }, { "epoch": 3.424415443175639, "grad_norm": 0.6555690765380859, "learning_rate": 1.3659328893091161e-05, "loss": 0.0689, "step": 37785 }, { "epoch": 3.424868588000725, "grad_norm": 0.6293379068374634, "learning_rate": 1.3652281141945256e-05, "loss": 0.0763, "step": 37790 }, { "epoch": 3.425321732825811, "grad_norm": 0.51472407579422, "learning_rate": 1.3645234526482897e-05, "loss": 0.0647, "step": 37795 }, { "epoch": 3.4257748776508974, "grad_norm": 0.6934787631034851, "learning_rate": 1.3638189047409306e-05, "loss": 0.0636, "step": 37800 }, { "epoch": 3.4262280224759833, "grad_norm": 0.7075166702270508, "learning_rate": 1.3631144705429597e-05, "loss": 0.0775, "step": 37805 }, { "epoch": 3.4266811673010693, "grad_norm": 0.7346448302268982, "learning_rate": 1.3624101501248768e-05, "loss": 0.0755, "step": 37810 }, { "epoch": 3.4271343121261557, "grad_norm": 0.846248984336853, "learning_rate": 1.3617059435571697e-05, "loss": 0.0846, "step": 37815 }, { "epoch": 3.4275874569512417, "grad_norm": 0.598759114742279, "learning_rate": 1.3610018509103176e-05, "loss": 0.0644, "step": 37820 }, { "epoch": 3.4280406017763276, "grad_norm": 0.5849074125289917, "learning_rate": 1.3602978722547854e-05, "loss": 0.0593, "step": 37825 }, { "epoch": 3.428493746601414, "grad_norm": 0.6163986921310425, "learning_rate": 1.359594007661027e-05, "loss": 0.0644, "step": 37830 }, { "epoch": 3.4289468914265, "grad_norm": 0.6622375845909119, "learning_rate": 1.3588902571994863e-05, "loss": 0.0648, "step": 37835 }, { "epoch": 3.429400036251586, "grad_norm": 0.5868737697601318, "learning_rate": 1.358186620940593e-05, "loss": 0.0624, "step": 37840 }, { "epoch": 3.429853181076672, "grad_norm": 0.5645637512207031, "learning_rate": 1.3574830989547702e-05, "loss": 0.0603, "step": 37845 }, { "epoch": 3.4303063259017583, "grad_norm": 0.5621510744094849, "learning_rate": 1.3567796913124248e-05, "loss": 0.0665, "step": 37850 }, { "epoch": 3.4307594707268443, "grad_norm": 0.5416396260261536, "learning_rate": 1.3560763980839541e-05, "loss": 0.0564, "step": 37855 }, { "epoch": 3.4312126155519302, "grad_norm": 0.9815284609794617, "learning_rate": 1.3553732193397445e-05, "loss": 0.0648, "step": 37860 }, { "epoch": 3.4316657603770166, "grad_norm": 0.6922767162322998, "learning_rate": 1.354670155150169e-05, "loss": 0.0609, "step": 37865 }, { "epoch": 3.4321189052021026, "grad_norm": 0.6162027716636658, "learning_rate": 1.3539672055855926e-05, "loss": 0.0566, "step": 37870 }, { "epoch": 3.4325720500271886, "grad_norm": 0.5341674089431763, "learning_rate": 1.3532643707163656e-05, "loss": 0.0739, "step": 37875 }, { "epoch": 3.433025194852275, "grad_norm": 0.6590220928192139, "learning_rate": 1.352561650612827e-05, "loss": 0.062, "step": 37880 }, { "epoch": 3.433478339677361, "grad_norm": 0.9054074287414551, "learning_rate": 1.3518590453453083e-05, "loss": 0.0725, "step": 37885 }, { "epoch": 3.433931484502447, "grad_norm": 0.8704655766487122, "learning_rate": 1.3511565549841226e-05, "loss": 0.0808, "step": 37890 }, { "epoch": 3.434384629327533, "grad_norm": 0.6726416945457458, "learning_rate": 1.350454179599579e-05, "loss": 0.0785, "step": 37895 }, { "epoch": 3.4348377741526193, "grad_norm": 0.7054168581962585, "learning_rate": 1.349751919261968e-05, "loss": 0.0705, "step": 37900 }, { "epoch": 3.435290918977705, "grad_norm": 0.6740320920944214, "learning_rate": 1.3490497740415747e-05, "loss": 0.0658, "step": 37905 }, { "epoch": 3.435744063802791, "grad_norm": 0.727152943611145, "learning_rate": 1.3483477440086689e-05, "loss": 0.0679, "step": 37910 }, { "epoch": 3.4361972086278776, "grad_norm": 0.5577965378761292, "learning_rate": 1.3476458292335093e-05, "loss": 0.0619, "step": 37915 }, { "epoch": 3.4366503534529635, "grad_norm": 0.5094205737113953, "learning_rate": 1.3469440297863467e-05, "loss": 0.0604, "step": 37920 }, { "epoch": 3.4371034982780495, "grad_norm": 0.8122804164886475, "learning_rate": 1.3462423457374135e-05, "loss": 0.0634, "step": 37925 }, { "epoch": 3.437556643103136, "grad_norm": 0.7196964025497437, "learning_rate": 1.3455407771569373e-05, "loss": 0.0764, "step": 37930 }, { "epoch": 3.438009787928222, "grad_norm": 0.7823583483695984, "learning_rate": 1.3448393241151302e-05, "loss": 0.0722, "step": 37935 }, { "epoch": 3.438462932753308, "grad_norm": 0.4838842749595642, "learning_rate": 1.3441379866821931e-05, "loss": 0.0569, "step": 37940 }, { "epoch": 3.4389160775783942, "grad_norm": 0.6734374165534973, "learning_rate": 1.3434367649283191e-05, "loss": 0.0746, "step": 37945 }, { "epoch": 3.43936922240348, "grad_norm": 0.5713377594947815, "learning_rate": 1.3427356589236827e-05, "loss": 0.0596, "step": 37950 }, { "epoch": 3.439822367228566, "grad_norm": 0.5819869637489319, "learning_rate": 1.3420346687384538e-05, "loss": 0.0662, "step": 37955 }, { "epoch": 3.4402755120536526, "grad_norm": 0.6198607087135315, "learning_rate": 1.3413337944427867e-05, "loss": 0.0668, "step": 37960 }, { "epoch": 3.4407286568787385, "grad_norm": 0.6650185585021973, "learning_rate": 1.3406330361068239e-05, "loss": 0.0646, "step": 37965 }, { "epoch": 3.4411818017038245, "grad_norm": 0.7651728391647339, "learning_rate": 1.3399323938007008e-05, "loss": 0.0815, "step": 37970 }, { "epoch": 3.441634946528911, "grad_norm": 0.6665892601013184, "learning_rate": 1.3392318675945343e-05, "loss": 0.0636, "step": 37975 }, { "epoch": 3.442088091353997, "grad_norm": 0.6391820311546326, "learning_rate": 1.3385314575584357e-05, "loss": 0.0657, "step": 37980 }, { "epoch": 3.442541236179083, "grad_norm": 0.5899826288223267, "learning_rate": 1.3378311637625012e-05, "loss": 0.0745, "step": 37985 }, { "epoch": 3.4429943810041688, "grad_norm": 0.44367125630378723, "learning_rate": 1.3371309862768158e-05, "loss": 0.0758, "step": 37990 }, { "epoch": 3.443447525829255, "grad_norm": 0.6331565380096436, "learning_rate": 1.3364309251714563e-05, "loss": 0.0557, "step": 37995 }, { "epoch": 3.443900670654341, "grad_norm": 0.8926640748977661, "learning_rate": 1.3357309805164813e-05, "loss": 0.0722, "step": 38000 }, { "epoch": 3.444353815479427, "grad_norm": 0.5708804726600647, "learning_rate": 1.3350311523819447e-05, "loss": 0.0585, "step": 38005 }, { "epoch": 3.4448069603045135, "grad_norm": 0.6444875597953796, "learning_rate": 1.3343314408378838e-05, "loss": 0.088, "step": 38010 }, { "epoch": 3.4452601051295995, "grad_norm": 0.637718677520752, "learning_rate": 1.3336318459543254e-05, "loss": 0.0604, "step": 38015 }, { "epoch": 3.4457132499546854, "grad_norm": 0.5643535256385803, "learning_rate": 1.3329323678012873e-05, "loss": 0.0687, "step": 38020 }, { "epoch": 3.4461663947797714, "grad_norm": 0.5402596592903137, "learning_rate": 1.332233006448772e-05, "loss": 0.0604, "step": 38025 }, { "epoch": 3.446619539604858, "grad_norm": 0.669844388961792, "learning_rate": 1.3315337619667726e-05, "loss": 0.0649, "step": 38030 }, { "epoch": 3.4470726844299437, "grad_norm": 0.6529077887535095, "learning_rate": 1.3308346344252692e-05, "loss": 0.0568, "step": 38035 }, { "epoch": 3.4475258292550297, "grad_norm": 0.8873150944709778, "learning_rate": 1.3301356238942298e-05, "loss": 0.0676, "step": 38040 }, { "epoch": 3.447978974080116, "grad_norm": 0.6283507943153381, "learning_rate": 1.3294367304436134e-05, "loss": 0.0598, "step": 38045 }, { "epoch": 3.448432118905202, "grad_norm": 0.5725191235542297, "learning_rate": 1.3287379541433648e-05, "loss": 0.0564, "step": 38050 }, { "epoch": 3.448885263730288, "grad_norm": 0.6504910588264465, "learning_rate": 1.328039295063418e-05, "loss": 0.0576, "step": 38055 }, { "epoch": 3.4493384085553744, "grad_norm": 0.5535368323326111, "learning_rate": 1.3273407532736942e-05, "loss": 0.0574, "step": 38060 }, { "epoch": 3.4497915533804604, "grad_norm": 0.7583141326904297, "learning_rate": 1.326642328844105e-05, "loss": 0.0634, "step": 38065 }, { "epoch": 3.4502446982055464, "grad_norm": 0.5834643244743347, "learning_rate": 1.3259440218445465e-05, "loss": 0.0562, "step": 38070 }, { "epoch": 3.4506978430306328, "grad_norm": 0.628950297832489, "learning_rate": 1.3252458323449084e-05, "loss": 0.0616, "step": 38075 }, { "epoch": 3.4511509878557187, "grad_norm": 0.7790383100509644, "learning_rate": 1.3245477604150647e-05, "loss": 0.0655, "step": 38080 }, { "epoch": 3.4516041326808047, "grad_norm": 0.5565195083618164, "learning_rate": 1.3238498061248783e-05, "loss": 0.0708, "step": 38085 }, { "epoch": 3.452057277505891, "grad_norm": 0.6615811586380005, "learning_rate": 1.3231519695442007e-05, "loss": 0.0857, "step": 38090 }, { "epoch": 3.452510422330977, "grad_norm": 0.6176372170448303, "learning_rate": 1.322454250742871e-05, "loss": 0.0578, "step": 38095 }, { "epoch": 3.452963567156063, "grad_norm": 0.5864734649658203, "learning_rate": 1.3217566497907192e-05, "loss": 0.0624, "step": 38100 }, { "epoch": 3.4534167119811494, "grad_norm": 0.6133440732955933, "learning_rate": 1.3210591667575601e-05, "loss": 0.0668, "step": 38105 }, { "epoch": 3.4538698568062354, "grad_norm": 0.6051492094993591, "learning_rate": 1.320361801713198e-05, "loss": 0.0623, "step": 38110 }, { "epoch": 3.4543230016313213, "grad_norm": 0.6190629601478577, "learning_rate": 1.3196645547274255e-05, "loss": 0.0561, "step": 38115 }, { "epoch": 3.4547761464564073, "grad_norm": 0.7295631170272827, "learning_rate": 1.3189674258700227e-05, "loss": 0.0724, "step": 38120 }, { "epoch": 3.4552292912814937, "grad_norm": 0.5762172937393188, "learning_rate": 1.31827041521076e-05, "loss": 0.0688, "step": 38125 }, { "epoch": 3.4556824361065797, "grad_norm": 0.8314685225486755, "learning_rate": 1.3175735228193936e-05, "loss": 0.0712, "step": 38130 }, { "epoch": 3.4561355809316656, "grad_norm": 0.4868575930595398, "learning_rate": 1.3168767487656678e-05, "loss": 0.0569, "step": 38135 }, { "epoch": 3.456588725756752, "grad_norm": 0.6607013940811157, "learning_rate": 1.3161800931193185e-05, "loss": 0.0714, "step": 38140 }, { "epoch": 3.457041870581838, "grad_norm": 0.6957395076751709, "learning_rate": 1.3154835559500644e-05, "loss": 0.068, "step": 38145 }, { "epoch": 3.457495015406924, "grad_norm": 0.7214202284812927, "learning_rate": 1.3147871373276172e-05, "loss": 0.0535, "step": 38150 }, { "epoch": 3.45794816023201, "grad_norm": 0.5398727655410767, "learning_rate": 1.3140908373216737e-05, "loss": 0.0557, "step": 38155 }, { "epoch": 3.4584013050570963, "grad_norm": 0.7618252635002136, "learning_rate": 1.3133946560019195e-05, "loss": 0.0712, "step": 38160 }, { "epoch": 3.4588544498821823, "grad_norm": 0.7073648571968079, "learning_rate": 1.312698593438031e-05, "loss": 0.0612, "step": 38165 }, { "epoch": 3.4593075947072682, "grad_norm": 0.7667965292930603, "learning_rate": 1.3120026496996666e-05, "loss": 0.0577, "step": 38170 }, { "epoch": 3.4597607395323546, "grad_norm": 0.5497159957885742, "learning_rate": 1.3113068248564797e-05, "loss": 0.0667, "step": 38175 }, { "epoch": 3.4602138843574406, "grad_norm": 0.6994185447692871, "learning_rate": 1.3106111189781072e-05, "loss": 0.0742, "step": 38180 }, { "epoch": 3.4606670291825266, "grad_norm": 0.641665518283844, "learning_rate": 1.3099155321341756e-05, "loss": 0.0641, "step": 38185 }, { "epoch": 3.461120174007613, "grad_norm": 0.6057525873184204, "learning_rate": 1.3092200643943015e-05, "loss": 0.0646, "step": 38190 }, { "epoch": 3.461573318832699, "grad_norm": 0.6145967245101929, "learning_rate": 1.3085247158280842e-05, "loss": 0.0637, "step": 38195 }, { "epoch": 3.462026463657785, "grad_norm": 0.5031989812850952, "learning_rate": 1.307829486505117e-05, "loss": 0.0613, "step": 38200 }, { "epoch": 3.4624796084828713, "grad_norm": 0.6196920275688171, "learning_rate": 1.3071343764949779e-05, "loss": 0.0651, "step": 38205 }, { "epoch": 3.4629327533079572, "grad_norm": 0.5647338628768921, "learning_rate": 1.3064393858672322e-05, "loss": 0.0672, "step": 38210 }, { "epoch": 3.463385898133043, "grad_norm": 0.5278452038764954, "learning_rate": 1.3057445146914385e-05, "loss": 0.0666, "step": 38215 }, { "epoch": 3.4638390429581296, "grad_norm": 0.7100855708122253, "learning_rate": 1.3050497630371355e-05, "loss": 0.0597, "step": 38220 }, { "epoch": 3.4642921877832156, "grad_norm": 0.7707821726799011, "learning_rate": 1.3043551309738572e-05, "loss": 0.0709, "step": 38225 }, { "epoch": 3.4647453326083015, "grad_norm": 0.5291355848312378, "learning_rate": 1.3036606185711218e-05, "loss": 0.057, "step": 38230 }, { "epoch": 3.465198477433388, "grad_norm": 0.5814080834388733, "learning_rate": 1.3029662258984348e-05, "loss": 0.0618, "step": 38235 }, { "epoch": 3.465651622258474, "grad_norm": 0.6296553611755371, "learning_rate": 1.3022719530252947e-05, "loss": 0.0649, "step": 38240 }, { "epoch": 3.46610476708356, "grad_norm": 0.8416089415550232, "learning_rate": 1.3015778000211804e-05, "loss": 0.0685, "step": 38245 }, { "epoch": 3.466557911908646, "grad_norm": 0.6932570934295654, "learning_rate": 1.3008837669555668e-05, "loss": 0.0591, "step": 38250 }, { "epoch": 3.467011056733732, "grad_norm": 0.49864864349365234, "learning_rate": 1.3001898538979095e-05, "loss": 0.0628, "step": 38255 }, { "epoch": 3.467464201558818, "grad_norm": 0.5475532412528992, "learning_rate": 1.2994960609176582e-05, "loss": 0.0718, "step": 38260 }, { "epoch": 3.467917346383904, "grad_norm": 0.6085952520370483, "learning_rate": 1.2988023880842467e-05, "loss": 0.0677, "step": 38265 }, { "epoch": 3.4683704912089905, "grad_norm": 0.4739663004875183, "learning_rate": 1.2981088354670973e-05, "loss": 0.075, "step": 38270 }, { "epoch": 3.4688236360340765, "grad_norm": 0.7425973415374756, "learning_rate": 1.2974154031356234e-05, "loss": 0.0601, "step": 38275 }, { "epoch": 3.4692767808591625, "grad_norm": 0.5294045209884644, "learning_rate": 1.2967220911592221e-05, "loss": 0.0657, "step": 38280 }, { "epoch": 3.469729925684249, "grad_norm": 0.5599417686462402, "learning_rate": 1.2960288996072806e-05, "loss": 0.0672, "step": 38285 }, { "epoch": 3.470183070509335, "grad_norm": 0.7633801698684692, "learning_rate": 1.295335828549174e-05, "loss": 0.0616, "step": 38290 }, { "epoch": 3.470636215334421, "grad_norm": 0.6465011835098267, "learning_rate": 1.2946428780542638e-05, "loss": 0.0598, "step": 38295 }, { "epoch": 3.4710893601595068, "grad_norm": 0.8782579302787781, "learning_rate": 1.2939500481919031e-05, "loss": 0.0864, "step": 38300 }, { "epoch": 3.471542504984593, "grad_norm": 0.6163049936294556, "learning_rate": 1.293257339031429e-05, "loss": 0.0887, "step": 38305 }, { "epoch": 3.471995649809679, "grad_norm": 0.6324719786643982, "learning_rate": 1.2925647506421684e-05, "loss": 0.0567, "step": 38310 }, { "epoch": 3.472448794634765, "grad_norm": 0.6079891324043274, "learning_rate": 1.2918722830934357e-05, "loss": 0.0541, "step": 38315 }, { "epoch": 3.4729019394598515, "grad_norm": 0.7483313083648682, "learning_rate": 1.2911799364545324e-05, "loss": 0.0573, "step": 38320 }, { "epoch": 3.4733550842849374, "grad_norm": 0.6488090753555298, "learning_rate": 1.2904877107947508e-05, "loss": 0.0589, "step": 38325 }, { "epoch": 3.4738082291100234, "grad_norm": 0.8420494794845581, "learning_rate": 1.2897956061833677e-05, "loss": 0.0581, "step": 38330 }, { "epoch": 3.47426137393511, "grad_norm": 0.736996054649353, "learning_rate": 1.2891036226896497e-05, "loss": 0.0709, "step": 38335 }, { "epoch": 3.4747145187601958, "grad_norm": 0.6851480603218079, "learning_rate": 1.2884117603828505e-05, "loss": 0.084, "step": 38340 }, { "epoch": 3.4751676635852817, "grad_norm": 0.6207802891731262, "learning_rate": 1.2877200193322108e-05, "loss": 0.0575, "step": 38345 }, { "epoch": 3.475620808410368, "grad_norm": 0.6872967481613159, "learning_rate": 1.287028399606962e-05, "loss": 0.0826, "step": 38350 }, { "epoch": 3.476073953235454, "grad_norm": 0.6016016602516174, "learning_rate": 1.2863369012763216e-05, "loss": 0.0575, "step": 38355 }, { "epoch": 3.47652709806054, "grad_norm": 0.679020345211029, "learning_rate": 1.2856455244094943e-05, "loss": 0.0727, "step": 38360 }, { "epoch": 3.4769802428856265, "grad_norm": 0.4812844693660736, "learning_rate": 1.2849542690756733e-05, "loss": 0.0653, "step": 38365 }, { "epoch": 3.4774333877107124, "grad_norm": 0.5378340482711792, "learning_rate": 1.2842631353440387e-05, "loss": 0.0655, "step": 38370 }, { "epoch": 3.4778865325357984, "grad_norm": 0.6369275450706482, "learning_rate": 1.2835721232837616e-05, "loss": 0.0605, "step": 38375 }, { "epoch": 3.478339677360885, "grad_norm": 0.6882529258728027, "learning_rate": 1.2828812329639977e-05, "loss": 0.0683, "step": 38380 }, { "epoch": 3.4787928221859707, "grad_norm": 0.5439378619194031, "learning_rate": 1.2821904644538912e-05, "loss": 0.0651, "step": 38385 }, { "epoch": 3.4792459670110567, "grad_norm": 0.6146447658538818, "learning_rate": 1.281499817822575e-05, "loss": 0.0546, "step": 38390 }, { "epoch": 3.4796991118361427, "grad_norm": 0.6418092250823975, "learning_rate": 1.280809293139168e-05, "loss": 0.0785, "step": 38395 }, { "epoch": 3.480152256661229, "grad_norm": 0.6929206848144531, "learning_rate": 1.2801188904727798e-05, "loss": 0.0683, "step": 38400 }, { "epoch": 3.480605401486315, "grad_norm": 0.6029554605484009, "learning_rate": 1.2794286098925056e-05, "loss": 0.0703, "step": 38405 }, { "epoch": 3.481058546311401, "grad_norm": 0.5019002556800842, "learning_rate": 1.2787384514674287e-05, "loss": 0.052, "step": 38410 }, { "epoch": 3.4815116911364874, "grad_norm": 0.747319221496582, "learning_rate": 1.2780484152666195e-05, "loss": 0.0563, "step": 38415 }, { "epoch": 3.4819648359615734, "grad_norm": 0.5828468799591064, "learning_rate": 1.2773585013591389e-05, "loss": 0.0673, "step": 38420 }, { "epoch": 3.4824179807866593, "grad_norm": 0.5648056864738464, "learning_rate": 1.2766687098140326e-05, "loss": 0.0593, "step": 38425 }, { "epoch": 3.4828711256117453, "grad_norm": 0.7344432473182678, "learning_rate": 1.2759790407003355e-05, "loss": 0.0628, "step": 38430 }, { "epoch": 3.4833242704368317, "grad_norm": 0.6553139090538025, "learning_rate": 1.27528949408707e-05, "loss": 0.0659, "step": 38435 }, { "epoch": 3.4837774152619176, "grad_norm": 0.5495647192001343, "learning_rate": 1.2746000700432448e-05, "loss": 0.0705, "step": 38440 }, { "epoch": 3.4842305600870036, "grad_norm": 0.7467184662818909, "learning_rate": 1.2739107686378607e-05, "loss": 0.0711, "step": 38445 }, { "epoch": 3.48468370491209, "grad_norm": 0.638176441192627, "learning_rate": 1.2732215899398995e-05, "loss": 0.061, "step": 38450 }, { "epoch": 3.485136849737176, "grad_norm": 0.6163360476493835, "learning_rate": 1.2725325340183372e-05, "loss": 0.0521, "step": 38455 }, { "epoch": 3.485589994562262, "grad_norm": 0.5563523173332214, "learning_rate": 1.271843600942134e-05, "loss": 0.0917, "step": 38460 }, { "epoch": 3.4860431393873483, "grad_norm": 0.5702326893806458, "learning_rate": 1.2711547907802374e-05, "loss": 0.0629, "step": 38465 }, { "epoch": 3.4864962842124343, "grad_norm": 0.5979897975921631, "learning_rate": 1.2704661036015863e-05, "loss": 0.067, "step": 38470 }, { "epoch": 3.4869494290375203, "grad_norm": 0.6262529492378235, "learning_rate": 1.2697775394751015e-05, "loss": 0.0826, "step": 38475 }, { "epoch": 3.4874025738626067, "grad_norm": 0.7320153713226318, "learning_rate": 1.2690890984696979e-05, "loss": 0.0705, "step": 38480 }, { "epoch": 3.4878557186876926, "grad_norm": 0.4790875017642975, "learning_rate": 1.2684007806542731e-05, "loss": 0.0653, "step": 38485 }, { "epoch": 3.4883088635127786, "grad_norm": 0.5802872180938721, "learning_rate": 1.2677125860977132e-05, "loss": 0.0571, "step": 38490 }, { "epoch": 3.488762008337865, "grad_norm": 0.7218320369720459, "learning_rate": 1.2670245148688964e-05, "loss": 0.0744, "step": 38495 }, { "epoch": 3.489215153162951, "grad_norm": 0.6918104290962219, "learning_rate": 1.2663365670366812e-05, "loss": 0.0669, "step": 38500 }, { "epoch": 3.489668297988037, "grad_norm": 0.7692624926567078, "learning_rate": 1.2656487426699201e-05, "loss": 0.0698, "step": 38505 }, { "epoch": 3.4901214428131233, "grad_norm": 0.5763952136039734, "learning_rate": 1.2649610418374502e-05, "loss": 0.0699, "step": 38510 }, { "epoch": 3.4905745876382093, "grad_norm": 0.6349143385887146, "learning_rate": 1.2642734646080962e-05, "loss": 0.0613, "step": 38515 }, { "epoch": 3.4910277324632952, "grad_norm": 0.6238627433776855, "learning_rate": 1.2635860110506731e-05, "loss": 0.0566, "step": 38520 }, { "epoch": 3.491480877288381, "grad_norm": 0.6508736610412598, "learning_rate": 1.262898681233978e-05, "loss": 0.0712, "step": 38525 }, { "epoch": 3.4919340221134676, "grad_norm": 0.4525754153728485, "learning_rate": 1.2622114752268022e-05, "loss": 0.0531, "step": 38530 }, { "epoch": 3.4923871669385536, "grad_norm": 0.5928863883018494, "learning_rate": 1.2615243930979203e-05, "loss": 0.0674, "step": 38535 }, { "epoch": 3.4928403117636395, "grad_norm": 0.5726080536842346, "learning_rate": 1.260837434916095e-05, "loss": 0.0636, "step": 38540 }, { "epoch": 3.493293456588726, "grad_norm": 0.5670980215072632, "learning_rate": 1.2601506007500796e-05, "loss": 0.0491, "step": 38545 }, { "epoch": 3.493746601413812, "grad_norm": 0.6781818866729736, "learning_rate": 1.2594638906686098e-05, "loss": 0.0501, "step": 38550 }, { "epoch": 3.494199746238898, "grad_norm": 0.7251284122467041, "learning_rate": 1.2587773047404139e-05, "loss": 0.0696, "step": 38555 }, { "epoch": 3.494652891063984, "grad_norm": 0.5335484743118286, "learning_rate": 1.2580908430342053e-05, "loss": 0.057, "step": 38560 }, { "epoch": 3.49510603588907, "grad_norm": 0.6149359941482544, "learning_rate": 1.2574045056186834e-05, "loss": 0.0581, "step": 38565 }, { "epoch": 3.495559180714156, "grad_norm": 0.6899547576904297, "learning_rate": 1.2567182925625409e-05, "loss": 0.082, "step": 38570 }, { "epoch": 3.496012325539242, "grad_norm": 0.5285412073135376, "learning_rate": 1.2560322039344502e-05, "loss": 0.0662, "step": 38575 }, { "epoch": 3.4964654703643285, "grad_norm": 0.7809799313545227, "learning_rate": 1.2553462398030779e-05, "loss": 0.0616, "step": 38580 }, { "epoch": 3.4969186151894145, "grad_norm": 0.49756431579589844, "learning_rate": 1.2546604002370744e-05, "loss": 0.0584, "step": 38585 }, { "epoch": 3.4973717600145005, "grad_norm": 0.49590879678726196, "learning_rate": 1.2539746853050783e-05, "loss": 0.0648, "step": 38590 }, { "epoch": 3.497824904839587, "grad_norm": 0.5272486209869385, "learning_rate": 1.2532890950757186e-05, "loss": 0.0534, "step": 38595 }, { "epoch": 3.498278049664673, "grad_norm": 0.8405635952949524, "learning_rate": 1.2526036296176063e-05, "loss": 0.0623, "step": 38600 }, { "epoch": 3.4987311944897588, "grad_norm": 0.555529773235321, "learning_rate": 1.2519182889993453e-05, "loss": 0.0644, "step": 38605 }, { "epoch": 3.499184339314845, "grad_norm": 0.6846579313278198, "learning_rate": 1.2512330732895233e-05, "loss": 0.0596, "step": 38610 }, { "epoch": 3.499637484139931, "grad_norm": 0.5848059058189392, "learning_rate": 1.2505479825567179e-05, "loss": 0.0656, "step": 38615 }, { "epoch": 3.500090628965017, "grad_norm": 0.5440685749053955, "learning_rate": 1.2498630168694927e-05, "loss": 0.0597, "step": 38620 }, { "epoch": 3.5005437737901035, "grad_norm": 0.6074931621551514, "learning_rate": 1.2491781762963986e-05, "loss": 0.0671, "step": 38625 }, { "epoch": 3.5009969186151895, "grad_norm": 0.553288996219635, "learning_rate": 1.2484934609059762e-05, "loss": 0.0738, "step": 38630 }, { "epoch": 3.5014500634402754, "grad_norm": 0.66270512342453, "learning_rate": 1.2478088707667512e-05, "loss": 0.0624, "step": 38635 }, { "epoch": 3.501903208265362, "grad_norm": 0.6753090620040894, "learning_rate": 1.2471244059472381e-05, "loss": 0.0651, "step": 38640 }, { "epoch": 3.502356353090448, "grad_norm": 0.8446976542472839, "learning_rate": 1.2464400665159377e-05, "loss": 0.0785, "step": 38645 }, { "epoch": 3.5028094979155338, "grad_norm": 0.5639562606811523, "learning_rate": 1.2457558525413383e-05, "loss": 0.0587, "step": 38650 }, { "epoch": 3.50326264274062, "grad_norm": 0.7149177193641663, "learning_rate": 1.2450717640919183e-05, "loss": 0.063, "step": 38655 }, { "epoch": 3.503715787565706, "grad_norm": 0.6231134533882141, "learning_rate": 1.2443878012361402e-05, "loss": 0.0738, "step": 38660 }, { "epoch": 3.504168932390792, "grad_norm": 0.54749596118927, "learning_rate": 1.2437039640424552e-05, "loss": 0.0544, "step": 38665 }, { "epoch": 3.504622077215878, "grad_norm": 0.6088166236877441, "learning_rate": 1.2430202525793014e-05, "loss": 0.058, "step": 38670 }, { "epoch": 3.5050752220409644, "grad_norm": 0.7533892393112183, "learning_rate": 1.2423366669151069e-05, "loss": 0.0601, "step": 38675 }, { "epoch": 3.5055283668660504, "grad_norm": 0.6300519704818726, "learning_rate": 1.2416532071182834e-05, "loss": 0.0653, "step": 38680 }, { "epoch": 3.5059815116911364, "grad_norm": 0.685264527797699, "learning_rate": 1.2409698732572328e-05, "loss": 0.0693, "step": 38685 }, { "epoch": 3.5064346565162223, "grad_norm": 0.5623971819877625, "learning_rate": 1.2402866654003425e-05, "loss": 0.0618, "step": 38690 }, { "epoch": 3.5068878013413087, "grad_norm": 0.7046867609024048, "learning_rate": 1.239603583615988e-05, "loss": 0.0568, "step": 38695 }, { "epoch": 3.5073409461663947, "grad_norm": 0.6464666128158569, "learning_rate": 1.2389206279725338e-05, "loss": 0.0622, "step": 38700 }, { "epoch": 3.5077940909914807, "grad_norm": 0.5586261749267578, "learning_rate": 1.2382377985383293e-05, "loss": 0.0528, "step": 38705 }, { "epoch": 3.508247235816567, "grad_norm": 0.6085014939308167, "learning_rate": 1.2375550953817124e-05, "loss": 0.0759, "step": 38710 }, { "epoch": 3.508700380641653, "grad_norm": 0.6364027261734009, "learning_rate": 1.2368725185710084e-05, "loss": 0.058, "step": 38715 }, { "epoch": 3.509153525466739, "grad_norm": 0.5295395255088806, "learning_rate": 1.2361900681745289e-05, "loss": 0.0531, "step": 38720 }, { "epoch": 3.5096066702918254, "grad_norm": 0.7374891638755798, "learning_rate": 1.2355077442605755e-05, "loss": 0.0608, "step": 38725 }, { "epoch": 3.5100598151169113, "grad_norm": 0.6638000011444092, "learning_rate": 1.2348255468974345e-05, "loss": 0.0598, "step": 38730 }, { "epoch": 3.5105129599419973, "grad_norm": 0.6150575876235962, "learning_rate": 1.2341434761533804e-05, "loss": 0.0605, "step": 38735 }, { "epoch": 3.5109661047670837, "grad_norm": 0.5569559931755066, "learning_rate": 1.2334615320966752e-05, "loss": 0.0584, "step": 38740 }, { "epoch": 3.5114192495921697, "grad_norm": 0.6597542762756348, "learning_rate": 1.2327797147955672e-05, "loss": 0.0686, "step": 38745 }, { "epoch": 3.5118723944172556, "grad_norm": 0.7407880425453186, "learning_rate": 1.2320980243182945e-05, "loss": 0.0716, "step": 38750 }, { "epoch": 3.512325539242342, "grad_norm": 0.5715307593345642, "learning_rate": 1.2314164607330802e-05, "loss": 0.0664, "step": 38755 }, { "epoch": 3.512778684067428, "grad_norm": 0.5735325217247009, "learning_rate": 1.2307350241081354e-05, "loss": 0.0812, "step": 38760 }, { "epoch": 3.513231828892514, "grad_norm": 0.5587538480758667, "learning_rate": 1.2300537145116583e-05, "loss": 0.0575, "step": 38765 }, { "epoch": 3.5136849737176004, "grad_norm": 0.8304815888404846, "learning_rate": 1.229372532011834e-05, "loss": 0.0718, "step": 38770 }, { "epoch": 3.5141381185426863, "grad_norm": 0.473985880613327, "learning_rate": 1.228691476676837e-05, "loss": 0.0532, "step": 38775 }, { "epoch": 3.5145912633677723, "grad_norm": 0.8079963326454163, "learning_rate": 1.2280105485748269e-05, "loss": 0.0829, "step": 38780 }, { "epoch": 3.5150444081928587, "grad_norm": 0.5039668083190918, "learning_rate": 1.2273297477739501e-05, "loss": 0.069, "step": 38785 }, { "epoch": 3.5154975530179446, "grad_norm": 0.5932535529136658, "learning_rate": 1.2266490743423445e-05, "loss": 0.0715, "step": 38790 }, { "epoch": 3.5159506978430306, "grad_norm": 0.6340896487236023, "learning_rate": 1.2259685283481278e-05, "loss": 0.0615, "step": 38795 }, { "epoch": 3.5164038426681166, "grad_norm": 0.6114493608474731, "learning_rate": 1.2252881098594134e-05, "loss": 0.0716, "step": 38800 }, { "epoch": 3.516856987493203, "grad_norm": 0.7795690894126892, "learning_rate": 1.2246078189442941e-05, "loss": 0.0629, "step": 38805 }, { "epoch": 3.517310132318289, "grad_norm": 0.6714675426483154, "learning_rate": 1.2239276556708562e-05, "loss": 0.0591, "step": 38810 }, { "epoch": 3.517763277143375, "grad_norm": 0.6937402486801147, "learning_rate": 1.2232476201071703e-05, "loss": 0.0685, "step": 38815 }, { "epoch": 3.518216421968461, "grad_norm": 0.7294026613235474, "learning_rate": 1.2225677123212929e-05, "loss": 0.0616, "step": 38820 }, { "epoch": 3.5186695667935473, "grad_norm": 0.6302808523178101, "learning_rate": 1.2218879323812727e-05, "loss": 0.0753, "step": 38825 }, { "epoch": 3.519122711618633, "grad_norm": 0.5861710906028748, "learning_rate": 1.221208280355138e-05, "loss": 0.0651, "step": 38830 }, { "epoch": 3.519575856443719, "grad_norm": 0.5707703232765198, "learning_rate": 1.2205287563109124e-05, "loss": 0.0707, "step": 38835 }, { "epoch": 3.5200290012688056, "grad_norm": 0.6125868558883667, "learning_rate": 1.2198493603166012e-05, "loss": 0.0552, "step": 38840 }, { "epoch": 3.5204821460938915, "grad_norm": 0.5497829914093018, "learning_rate": 1.2191700924401977e-05, "loss": 0.0565, "step": 38845 }, { "epoch": 3.5209352909189775, "grad_norm": 0.8141387701034546, "learning_rate": 1.2184909527496863e-05, "loss": 0.0665, "step": 38850 }, { "epoch": 3.521388435744064, "grad_norm": 0.6571333408355713, "learning_rate": 1.2178119413130318e-05, "loss": 0.072, "step": 38855 }, { "epoch": 3.52184158056915, "grad_norm": 0.7351975440979004, "learning_rate": 1.2171330581981923e-05, "loss": 0.0584, "step": 38860 }, { "epoch": 3.522294725394236, "grad_norm": 0.5380064249038696, "learning_rate": 1.2164543034731105e-05, "loss": 0.0652, "step": 38865 }, { "epoch": 3.5227478702193222, "grad_norm": 0.5371142029762268, "learning_rate": 1.2157756772057147e-05, "loss": 0.0597, "step": 38870 }, { "epoch": 3.523201015044408, "grad_norm": 0.5224254131317139, "learning_rate": 1.2150971794639252e-05, "loss": 0.0585, "step": 38875 }, { "epoch": 3.523654159869494, "grad_norm": 0.6558645367622375, "learning_rate": 1.2144188103156423e-05, "loss": 0.0768, "step": 38880 }, { "epoch": 3.5241073046945806, "grad_norm": 0.682763397693634, "learning_rate": 1.2137405698287604e-05, "loss": 0.0614, "step": 38885 }, { "epoch": 3.5245604495196665, "grad_norm": 0.741858959197998, "learning_rate": 1.2130624580711571e-05, "loss": 0.0667, "step": 38890 }, { "epoch": 3.5250135943447525, "grad_norm": 0.6179368495941162, "learning_rate": 1.2123844751106972e-05, "loss": 0.0638, "step": 38895 }, { "epoch": 3.525466739169839, "grad_norm": 0.6677670478820801, "learning_rate": 1.211706621015236e-05, "loss": 0.0646, "step": 38900 }, { "epoch": 3.525919883994925, "grad_norm": 0.5427169799804688, "learning_rate": 1.2110288958526104e-05, "loss": 0.0678, "step": 38905 }, { "epoch": 3.526373028820011, "grad_norm": 0.6781365871429443, "learning_rate": 1.210351299690649e-05, "loss": 0.0626, "step": 38910 }, { "epoch": 3.526826173645097, "grad_norm": 0.6016629338264465, "learning_rate": 1.2096738325971657e-05, "loss": 0.0835, "step": 38915 }, { "epoch": 3.527279318470183, "grad_norm": 0.5579683780670166, "learning_rate": 1.2089964946399605e-05, "loss": 0.0587, "step": 38920 }, { "epoch": 3.527732463295269, "grad_norm": 0.5889155268669128, "learning_rate": 1.2083192858868233e-05, "loss": 0.0604, "step": 38925 }, { "epoch": 3.5281856081203555, "grad_norm": 0.5739731788635254, "learning_rate": 1.2076422064055286e-05, "loss": 0.0513, "step": 38930 }, { "epoch": 3.5286387529454415, "grad_norm": 0.7659353017807007, "learning_rate": 1.2069652562638389e-05, "loss": 0.0721, "step": 38935 }, { "epoch": 3.5290918977705275, "grad_norm": 0.5924193859100342, "learning_rate": 1.2062884355295035e-05, "loss": 0.061, "step": 38940 }, { "epoch": 3.5295450425956134, "grad_norm": 0.5822358131408691, "learning_rate": 1.2056117442702577e-05, "loss": 0.0593, "step": 38945 }, { "epoch": 3.5299981874207, "grad_norm": 0.6200891137123108, "learning_rate": 1.2049351825538274e-05, "loss": 0.0642, "step": 38950 }, { "epoch": 3.530451332245786, "grad_norm": 0.5050643682479858, "learning_rate": 1.2042587504479217e-05, "loss": 0.0606, "step": 38955 }, { "epoch": 3.5309044770708717, "grad_norm": 0.6569520235061646, "learning_rate": 1.2035824480202381e-05, "loss": 0.0617, "step": 38960 }, { "epoch": 3.5313576218959577, "grad_norm": 0.5871721506118774, "learning_rate": 1.2029062753384614e-05, "loss": 0.049, "step": 38965 }, { "epoch": 3.531810766721044, "grad_norm": 0.7402167916297913, "learning_rate": 1.2022302324702622e-05, "loss": 0.08, "step": 38970 }, { "epoch": 3.53226391154613, "grad_norm": 0.5531956553459167, "learning_rate": 1.2015543194833012e-05, "loss": 0.0555, "step": 38975 }, { "epoch": 3.532717056371216, "grad_norm": 0.6970641613006592, "learning_rate": 1.2008785364452227e-05, "loss": 0.0519, "step": 38980 }, { "epoch": 3.5331702011963024, "grad_norm": 0.7422012090682983, "learning_rate": 1.2002028834236591e-05, "loss": 0.0712, "step": 38985 }, { "epoch": 3.5336233460213884, "grad_norm": 0.8228721022605896, "learning_rate": 1.1995273604862309e-05, "loss": 0.0645, "step": 38990 }, { "epoch": 3.5340764908464743, "grad_norm": 0.5850398540496826, "learning_rate": 1.198851967700544e-05, "loss": 0.0623, "step": 38995 }, { "epoch": 3.5345296356715608, "grad_norm": 0.824297308921814, "learning_rate": 1.1981767051341908e-05, "loss": 0.0603, "step": 39000 }, { "epoch": 3.5349827804966467, "grad_norm": 0.48539847135543823, "learning_rate": 1.197501572854754e-05, "loss": 0.0565, "step": 39005 }, { "epoch": 3.5354359253217327, "grad_norm": 0.5478273630142212, "learning_rate": 1.1968265709298002e-05, "loss": 0.0665, "step": 39010 }, { "epoch": 3.535889070146819, "grad_norm": 0.4587608575820923, "learning_rate": 1.1961516994268837e-05, "loss": 0.058, "step": 39015 }, { "epoch": 3.536342214971905, "grad_norm": 0.6250929236412048, "learning_rate": 1.1954769584135458e-05, "loss": 0.0574, "step": 39020 }, { "epoch": 3.536795359796991, "grad_norm": 0.5999882817268372, "learning_rate": 1.1948023479573142e-05, "loss": 0.0566, "step": 39025 }, { "epoch": 3.5372485046220774, "grad_norm": 0.6680426001548767, "learning_rate": 1.1941278681257056e-05, "loss": 0.0551, "step": 39030 }, { "epoch": 3.5377016494471634, "grad_norm": 0.6023800373077393, "learning_rate": 1.1934535189862215e-05, "loss": 0.059, "step": 39035 }, { "epoch": 3.5381547942722493, "grad_norm": 0.5387729406356812, "learning_rate": 1.1927793006063498e-05, "loss": 0.0597, "step": 39040 }, { "epoch": 3.5386079390973357, "grad_norm": 0.506913423538208, "learning_rate": 1.1921052130535696e-05, "loss": 0.0544, "step": 39045 }, { "epoch": 3.5390610839224217, "grad_norm": 0.6576878428459167, "learning_rate": 1.1914312563953401e-05, "loss": 0.0761, "step": 39050 }, { "epoch": 3.5395142287475077, "grad_norm": 0.7194899320602417, "learning_rate": 1.1907574306991135e-05, "loss": 0.0665, "step": 39055 }, { "epoch": 3.539967373572594, "grad_norm": 0.6356311440467834, "learning_rate": 1.1900837360323258e-05, "loss": 0.0565, "step": 39060 }, { "epoch": 3.54042051839768, "grad_norm": 0.8751962780952454, "learning_rate": 1.1894101724623999e-05, "loss": 0.0802, "step": 39065 }, { "epoch": 3.540873663222766, "grad_norm": 0.5666666030883789, "learning_rate": 1.1887367400567487e-05, "loss": 0.0595, "step": 39070 }, { "epoch": 3.541326808047852, "grad_norm": 0.6262370347976685, "learning_rate": 1.1880634388827661e-05, "loss": 0.0565, "step": 39075 }, { "epoch": 3.5417799528729383, "grad_norm": 0.6134195923805237, "learning_rate": 1.1873902690078386e-05, "loss": 0.0664, "step": 39080 }, { "epoch": 3.5422330976980243, "grad_norm": 0.5859988331794739, "learning_rate": 1.1867172304993371e-05, "loss": 0.0674, "step": 39085 }, { "epoch": 3.5426862425231103, "grad_norm": 0.5229611992835999, "learning_rate": 1.1860443234246182e-05, "loss": 0.0664, "step": 39090 }, { "epoch": 3.5431393873481962, "grad_norm": 0.7445806264877319, "learning_rate": 1.1853715478510293e-05, "loss": 0.0607, "step": 39095 }, { "epoch": 3.5435925321732826, "grad_norm": 0.726244330406189, "learning_rate": 1.1846989038458984e-05, "loss": 0.0624, "step": 39100 }, { "epoch": 3.5440456769983686, "grad_norm": 0.5267159938812256, "learning_rate": 1.1840263914765467e-05, "loss": 0.0608, "step": 39105 }, { "epoch": 3.5444988218234545, "grad_norm": 0.5821358561515808, "learning_rate": 1.1833540108102789e-05, "loss": 0.0588, "step": 39110 }, { "epoch": 3.544951966648541, "grad_norm": 0.6585426926612854, "learning_rate": 1.1826817619143857e-05, "loss": 0.0718, "step": 39115 }, { "epoch": 3.545405111473627, "grad_norm": 0.6331679224967957, "learning_rate": 1.1820096448561491e-05, "loss": 0.0614, "step": 39120 }, { "epoch": 3.545858256298713, "grad_norm": 0.6032271385192871, "learning_rate": 1.1813376597028305e-05, "loss": 0.0629, "step": 39125 }, { "epoch": 3.5463114011237993, "grad_norm": 0.5587821006774902, "learning_rate": 1.1806658065216861e-05, "loss": 0.0607, "step": 39130 }, { "epoch": 3.5467645459488852, "grad_norm": 0.5109670162200928, "learning_rate": 1.1799940853799534e-05, "loss": 0.0524, "step": 39135 }, { "epoch": 3.547217690773971, "grad_norm": 1.0325148105621338, "learning_rate": 1.1793224963448583e-05, "loss": 0.0694, "step": 39140 }, { "epoch": 3.5476708355990576, "grad_norm": 0.6189673542976379, "learning_rate": 1.1786510394836159e-05, "loss": 0.0723, "step": 39145 }, { "epoch": 3.5481239804241436, "grad_norm": 0.5024570226669312, "learning_rate": 1.1779797148634222e-05, "loss": 0.0523, "step": 39150 }, { "epoch": 3.5485771252492295, "grad_norm": 0.6757173538208008, "learning_rate": 1.1773085225514677e-05, "loss": 0.0681, "step": 39155 }, { "epoch": 3.549030270074316, "grad_norm": 0.5952886343002319, "learning_rate": 1.1766374626149213e-05, "loss": 0.0575, "step": 39160 }, { "epoch": 3.549483414899402, "grad_norm": 0.5416528582572937, "learning_rate": 1.175966535120946e-05, "loss": 0.0721, "step": 39165 }, { "epoch": 3.549936559724488, "grad_norm": 0.7254804372787476, "learning_rate": 1.1752957401366874e-05, "loss": 0.0648, "step": 39170 }, { "epoch": 3.5503897045495743, "grad_norm": 0.7012931108474731, "learning_rate": 1.174625077729278e-05, "loss": 0.0625, "step": 39175 }, { "epoch": 3.55084284937466, "grad_norm": 0.5795900821685791, "learning_rate": 1.1739545479658398e-05, "loss": 0.0725, "step": 39180 }, { "epoch": 3.551295994199746, "grad_norm": 0.6575205326080322, "learning_rate": 1.173284150913479e-05, "loss": 0.0624, "step": 39185 }, { "epoch": 3.5517491390248326, "grad_norm": 0.6394016146659851, "learning_rate": 1.1726138866392885e-05, "loss": 0.0635, "step": 39190 }, { "epoch": 3.5522022838499185, "grad_norm": 0.6467261910438538, "learning_rate": 1.1719437552103491e-05, "loss": 0.0727, "step": 39195 }, { "epoch": 3.5526554286750045, "grad_norm": 0.702368974685669, "learning_rate": 1.1712737566937268e-05, "loss": 0.0707, "step": 39200 }, { "epoch": 3.553108573500091, "grad_norm": 0.5847074389457703, "learning_rate": 1.1706038911564767e-05, "loss": 0.0692, "step": 39205 }, { "epoch": 3.553561718325177, "grad_norm": 0.5164670944213867, "learning_rate": 1.169934158665639e-05, "loss": 0.0534, "step": 39210 }, { "epoch": 3.554014863150263, "grad_norm": 0.673330545425415, "learning_rate": 1.16926455928824e-05, "loss": 0.068, "step": 39215 }, { "epoch": 3.554468007975349, "grad_norm": 0.6929487586021423, "learning_rate": 1.1685950930912939e-05, "loss": 0.0727, "step": 39220 }, { "epoch": 3.5549211528004347, "grad_norm": 0.5148489475250244, "learning_rate": 1.1679257601418e-05, "loss": 0.0538, "step": 39225 }, { "epoch": 3.555374297625521, "grad_norm": 0.5991215109825134, "learning_rate": 1.1672565605067472e-05, "loss": 0.06, "step": 39230 }, { "epoch": 3.555827442450607, "grad_norm": 0.4800642728805542, "learning_rate": 1.1665874942531086e-05, "loss": 0.0508, "step": 39235 }, { "epoch": 3.556280587275693, "grad_norm": 0.6261712908744812, "learning_rate": 1.1659185614478441e-05, "loss": 0.0551, "step": 39240 }, { "epoch": 3.5567337321007795, "grad_norm": 0.5571467876434326, "learning_rate": 1.1652497621579012e-05, "loss": 0.0553, "step": 39245 }, { "epoch": 3.5571868769258654, "grad_norm": 0.559989869594574, "learning_rate": 1.1645810964502121e-05, "loss": 0.0662, "step": 39250 }, { "epoch": 3.5576400217509514, "grad_norm": 0.58782559633255, "learning_rate": 1.1639125643916993e-05, "loss": 0.0651, "step": 39255 }, { "epoch": 3.558093166576038, "grad_norm": 0.6316081285476685, "learning_rate": 1.1632441660492688e-05, "loss": 0.0677, "step": 39260 }, { "epoch": 3.5585463114011238, "grad_norm": 0.6181976199150085, "learning_rate": 1.1625759014898142e-05, "loss": 0.0562, "step": 39265 }, { "epoch": 3.5589994562262097, "grad_norm": 0.5798534750938416, "learning_rate": 1.1619077707802151e-05, "loss": 0.0577, "step": 39270 }, { "epoch": 3.559452601051296, "grad_norm": 0.6433562636375427, "learning_rate": 1.1612397739873379e-05, "loss": 0.0563, "step": 39275 }, { "epoch": 3.559905745876382, "grad_norm": 0.48564988374710083, "learning_rate": 1.1605719111780378e-05, "loss": 0.0644, "step": 39280 }, { "epoch": 3.560358890701468, "grad_norm": 0.5879780650138855, "learning_rate": 1.1599041824191534e-05, "loss": 0.0597, "step": 39285 }, { "epoch": 3.5608120355265545, "grad_norm": 0.7435640692710876, "learning_rate": 1.1592365877775116e-05, "loss": 0.0633, "step": 39290 }, { "epoch": 3.5612651803516404, "grad_norm": 0.521461009979248, "learning_rate": 1.1585691273199254e-05, "loss": 0.051, "step": 39295 }, { "epoch": 3.5617183251767264, "grad_norm": 0.5364567637443542, "learning_rate": 1.1579018011131932e-05, "loss": 0.0582, "step": 39300 }, { "epoch": 3.562171470001813, "grad_norm": 0.4763050377368927, "learning_rate": 1.1572346092241034e-05, "loss": 0.0582, "step": 39305 }, { "epoch": 3.5626246148268987, "grad_norm": 0.6735125184059143, "learning_rate": 1.156567551719428e-05, "loss": 0.0748, "step": 39310 }, { "epoch": 3.5630777596519847, "grad_norm": 0.6450954675674438, "learning_rate": 1.1559006286659263e-05, "loss": 0.0602, "step": 39315 }, { "epoch": 3.563530904477071, "grad_norm": 0.44301557540893555, "learning_rate": 1.1552338401303428e-05, "loss": 0.0483, "step": 39320 }, { "epoch": 3.563984049302157, "grad_norm": 0.7233856320381165, "learning_rate": 1.1545671861794122e-05, "loss": 0.0622, "step": 39325 }, { "epoch": 3.564437194127243, "grad_norm": 0.6479259729385376, "learning_rate": 1.1539006668798527e-05, "loss": 0.0613, "step": 39330 }, { "epoch": 3.5648903389523294, "grad_norm": 0.5646116137504578, "learning_rate": 1.1532342822983691e-05, "loss": 0.0538, "step": 39335 }, { "epoch": 3.5653434837774154, "grad_norm": 0.554964542388916, "learning_rate": 1.1525680325016541e-05, "loss": 0.0596, "step": 39340 }, { "epoch": 3.5657966286025014, "grad_norm": 1.3586021661758423, "learning_rate": 1.151901917556385e-05, "loss": 0.0552, "step": 39345 }, { "epoch": 3.5662497734275873, "grad_norm": 0.6078394651412964, "learning_rate": 1.1512359375292292e-05, "loss": 0.0589, "step": 39350 }, { "epoch": 3.5667029182526737, "grad_norm": 0.40518221259117126, "learning_rate": 1.150570092486835e-05, "loss": 0.055, "step": 39355 }, { "epoch": 3.5671560630777597, "grad_norm": 0.4872203767299652, "learning_rate": 1.1499043824958434e-05, "loss": 0.0862, "step": 39360 }, { "epoch": 3.5676092079028456, "grad_norm": 0.5937506556510925, "learning_rate": 1.1492388076228775e-05, "loss": 0.0635, "step": 39365 }, { "epoch": 3.5680623527279316, "grad_norm": 0.5294156670570374, "learning_rate": 1.1485733679345473e-05, "loss": 0.0624, "step": 39370 }, { "epoch": 3.568515497553018, "grad_norm": 0.5777794122695923, "learning_rate": 1.1479080634974529e-05, "loss": 0.0703, "step": 39375 }, { "epoch": 3.568968642378104, "grad_norm": 0.943160891532898, "learning_rate": 1.1472428943781752e-05, "loss": 0.0897, "step": 39380 }, { "epoch": 3.56942178720319, "grad_norm": 0.6313517689704895, "learning_rate": 1.1465778606432864e-05, "loss": 0.0534, "step": 39385 }, { "epoch": 3.5698749320282763, "grad_norm": 0.5880723595619202, "learning_rate": 1.1459129623593431e-05, "loss": 0.0596, "step": 39390 }, { "epoch": 3.5703280768533623, "grad_norm": 0.5656668543815613, "learning_rate": 1.1452481995928873e-05, "loss": 0.0549, "step": 39395 }, { "epoch": 3.5707812216784482, "grad_norm": 0.5261752605438232, "learning_rate": 1.1445835724104515e-05, "loss": 0.0568, "step": 39400 }, { "epoch": 3.5712343665035347, "grad_norm": 0.48981165885925293, "learning_rate": 1.1439190808785483e-05, "loss": 0.0546, "step": 39405 }, { "epoch": 3.5716875113286206, "grad_norm": 0.66724693775177, "learning_rate": 1.1432547250636827e-05, "loss": 0.0683, "step": 39410 }, { "epoch": 3.5721406561537066, "grad_norm": 0.7165637016296387, "learning_rate": 1.1425905050323427e-05, "loss": 0.0574, "step": 39415 }, { "epoch": 3.572593800978793, "grad_norm": 0.4153812527656555, "learning_rate": 1.141926420851003e-05, "loss": 0.0628, "step": 39420 }, { "epoch": 3.573046945803879, "grad_norm": 0.7146560549736023, "learning_rate": 1.1412624725861281e-05, "loss": 0.0814, "step": 39425 }, { "epoch": 3.573500090628965, "grad_norm": 0.5788551568984985, "learning_rate": 1.1405986603041624e-05, "loss": 0.0547, "step": 39430 }, { "epoch": 3.5739532354540513, "grad_norm": 0.6882078051567078, "learning_rate": 1.1399349840715433e-05, "loss": 0.0564, "step": 39435 }, { "epoch": 3.5744063802791373, "grad_norm": 0.6455124020576477, "learning_rate": 1.1392714439546906e-05, "loss": 0.0607, "step": 39440 }, { "epoch": 3.5748595251042232, "grad_norm": 0.5396366715431213, "learning_rate": 1.1386080400200111e-05, "loss": 0.0492, "step": 39445 }, { "epoch": 3.5753126699293096, "grad_norm": 0.667060911655426, "learning_rate": 1.137944772333901e-05, "loss": 0.0737, "step": 39450 }, { "epoch": 3.5757658147543956, "grad_norm": 0.6229990720748901, "learning_rate": 1.1372816409627365e-05, "loss": 0.0558, "step": 39455 }, { "epoch": 3.5762189595794815, "grad_norm": 0.7265906929969788, "learning_rate": 1.136618645972887e-05, "loss": 0.0561, "step": 39460 }, { "epoch": 3.576672104404568, "grad_norm": 0.5127130150794983, "learning_rate": 1.1359557874307045e-05, "loss": 0.0539, "step": 39465 }, { "epoch": 3.577125249229654, "grad_norm": 0.6826568245887756, "learning_rate": 1.1352930654025268e-05, "loss": 0.0586, "step": 39470 }, { "epoch": 3.57757839405474, "grad_norm": 0.6239691376686096, "learning_rate": 1.1346304799546823e-05, "loss": 0.0612, "step": 39475 }, { "epoch": 3.578031538879826, "grad_norm": 0.7687365412712097, "learning_rate": 1.1339680311534789e-05, "loss": 0.065, "step": 39480 }, { "epoch": 3.5784846837049122, "grad_norm": 0.562474250793457, "learning_rate": 1.133305719065218e-05, "loss": 0.0661, "step": 39485 }, { "epoch": 3.578937828529998, "grad_norm": 0.5077681541442871, "learning_rate": 1.1326435437561828e-05, "loss": 0.063, "step": 39490 }, { "epoch": 3.579390973355084, "grad_norm": 0.5276791453361511, "learning_rate": 1.1319815052926427e-05, "loss": 0.0653, "step": 39495 }, { "epoch": 3.57984411818017, "grad_norm": 0.5929635763168335, "learning_rate": 1.131319603740858e-05, "loss": 0.0757, "step": 39500 }, { "epoch": 3.5802972630052565, "grad_norm": 0.7213016748428345, "learning_rate": 1.1306578391670684e-05, "loss": 0.0626, "step": 39505 }, { "epoch": 3.5807504078303425, "grad_norm": 0.5402145981788635, "learning_rate": 1.1299962116375057e-05, "loss": 0.0581, "step": 39510 }, { "epoch": 3.5812035526554284, "grad_norm": 0.7591020464897156, "learning_rate": 1.1293347212183855e-05, "loss": 0.059, "step": 39515 }, { "epoch": 3.581656697480515, "grad_norm": 0.6057246923446655, "learning_rate": 1.1286733679759087e-05, "loss": 0.0561, "step": 39520 }, { "epoch": 3.582109842305601, "grad_norm": 0.5682316422462463, "learning_rate": 1.1280121519762667e-05, "loss": 0.0662, "step": 39525 }, { "epoch": 3.5825629871306868, "grad_norm": 0.557741105556488, "learning_rate": 1.1273510732856306e-05, "loss": 0.0649, "step": 39530 }, { "epoch": 3.583016131955773, "grad_norm": 0.60191810131073, "learning_rate": 1.1266901319701642e-05, "loss": 0.056, "step": 39535 }, { "epoch": 3.583469276780859, "grad_norm": 0.5958603620529175, "learning_rate": 1.1260293280960134e-05, "loss": 0.0505, "step": 39540 }, { "epoch": 3.583922421605945, "grad_norm": 0.6969460844993591, "learning_rate": 1.125368661729312e-05, "loss": 0.066, "step": 39545 }, { "epoch": 3.5843755664310315, "grad_norm": 0.4964829087257385, "learning_rate": 1.1247081329361795e-05, "loss": 0.0579, "step": 39550 }, { "epoch": 3.5848287112561175, "grad_norm": 0.6336683034896851, "learning_rate": 1.1240477417827211e-05, "loss": 0.0544, "step": 39555 }, { "epoch": 3.5852818560812034, "grad_norm": 0.595413863658905, "learning_rate": 1.123387488335031e-05, "loss": 0.0556, "step": 39560 }, { "epoch": 3.58573500090629, "grad_norm": 0.6725649237632751, "learning_rate": 1.1227273726591862e-05, "loss": 0.0711, "step": 39565 }, { "epoch": 3.586188145731376, "grad_norm": 0.5625118613243103, "learning_rate": 1.1220673948212515e-05, "loss": 0.0581, "step": 39570 }, { "epoch": 3.5866412905564617, "grad_norm": 0.602934718132019, "learning_rate": 1.1214075548872769e-05, "loss": 0.0648, "step": 39575 }, { "epoch": 3.587094435381548, "grad_norm": 0.7211571335792542, "learning_rate": 1.120747852923301e-05, "loss": 0.0797, "step": 39580 }, { "epoch": 3.587547580206634, "grad_norm": 0.49185776710510254, "learning_rate": 1.1200882889953462e-05, "loss": 0.053, "step": 39585 }, { "epoch": 3.58800072503172, "grad_norm": 0.6143662929534912, "learning_rate": 1.1194288631694222e-05, "loss": 0.0565, "step": 39590 }, { "epoch": 3.5884538698568065, "grad_norm": 0.7009915709495544, "learning_rate": 1.118769575511524e-05, "loss": 0.0713, "step": 39595 }, { "epoch": 3.5889070146818924, "grad_norm": 0.6479012370109558, "learning_rate": 1.1181104260876329e-05, "loss": 0.0549, "step": 39600 }, { "epoch": 3.5893601595069784, "grad_norm": 0.5337073802947998, "learning_rate": 1.1174514149637185e-05, "loss": 0.054, "step": 39605 }, { "epoch": 3.589813304332065, "grad_norm": 0.5384460687637329, "learning_rate": 1.1167925422057338e-05, "loss": 0.0573, "step": 39610 }, { "epoch": 3.5902664491571508, "grad_norm": 0.6750577688217163, "learning_rate": 1.116133807879619e-05, "loss": 0.0643, "step": 39615 }, { "epoch": 3.5907195939822367, "grad_norm": 0.599672794342041, "learning_rate": 1.1154752120513004e-05, "loss": 0.064, "step": 39620 }, { "epoch": 3.5911727388073227, "grad_norm": 0.5470041036605835, "learning_rate": 1.11481675478669e-05, "loss": 0.0549, "step": 39625 }, { "epoch": 3.5916258836324086, "grad_norm": 0.6715645790100098, "learning_rate": 1.1141584361516877e-05, "loss": 0.0566, "step": 39630 }, { "epoch": 3.592079028457495, "grad_norm": 0.6780377626419067, "learning_rate": 1.1135002562121777e-05, "loss": 0.0618, "step": 39635 }, { "epoch": 3.592532173282581, "grad_norm": 0.6679679751396179, "learning_rate": 1.1128422150340307e-05, "loss": 0.0615, "step": 39640 }, { "epoch": 3.592985318107667, "grad_norm": 0.6307404637336731, "learning_rate": 1.112184312683104e-05, "loss": 0.0514, "step": 39645 }, { "epoch": 3.5934384629327534, "grad_norm": 0.7533960342407227, "learning_rate": 1.1115265492252392e-05, "loss": 0.0917, "step": 39650 }, { "epoch": 3.5938916077578393, "grad_norm": 0.49498388171195984, "learning_rate": 1.1108689247262677e-05, "loss": 0.0576, "step": 39655 }, { "epoch": 3.5943447525829253, "grad_norm": 0.6457822322845459, "learning_rate": 1.1102114392520038e-05, "loss": 0.0604, "step": 39660 }, { "epoch": 3.5947978974080117, "grad_norm": 0.6475494503974915, "learning_rate": 1.109554092868249e-05, "loss": 0.061, "step": 39665 }, { "epoch": 3.5952510422330977, "grad_norm": 0.6695690155029297, "learning_rate": 1.1088968856407905e-05, "loss": 0.058, "step": 39670 }, { "epoch": 3.5957041870581836, "grad_norm": 0.6905806660652161, "learning_rate": 1.1082398176354012e-05, "loss": 0.0741, "step": 39675 }, { "epoch": 3.59615733188327, "grad_norm": 0.6592000722885132, "learning_rate": 1.1075828889178422e-05, "loss": 0.0648, "step": 39680 }, { "epoch": 3.596610476708356, "grad_norm": 0.6701268553733826, "learning_rate": 1.1069260995538585e-05, "loss": 0.0695, "step": 39685 }, { "epoch": 3.597063621533442, "grad_norm": 0.5227429866790771, "learning_rate": 1.1062694496091809e-05, "loss": 0.0553, "step": 39690 }, { "epoch": 3.5975167663585284, "grad_norm": 0.5096403956413269, "learning_rate": 1.1056129391495296e-05, "loss": 0.0628, "step": 39695 }, { "epoch": 3.5979699111836143, "grad_norm": 0.7473292350769043, "learning_rate": 1.1049565682406045e-05, "loss": 0.0793, "step": 39700 }, { "epoch": 3.5984230560087003, "grad_norm": 0.6430981755256653, "learning_rate": 1.1043003369481e-05, "loss": 0.062, "step": 39705 }, { "epoch": 3.5988762008337867, "grad_norm": 0.6084935069084167, "learning_rate": 1.1036442453376877e-05, "loss": 0.0663, "step": 39710 }, { "epoch": 3.5993293456588726, "grad_norm": 0.5776633620262146, "learning_rate": 1.102988293475032e-05, "loss": 0.0745, "step": 39715 }, { "epoch": 3.5997824904839586, "grad_norm": 0.657706618309021, "learning_rate": 1.1023324814257804e-05, "loss": 0.0676, "step": 39720 }, { "epoch": 3.600235635309045, "grad_norm": 0.5808331370353699, "learning_rate": 1.1016768092555658e-05, "loss": 0.0579, "step": 39725 }, { "epoch": 3.600688780134131, "grad_norm": 0.6171499490737915, "learning_rate": 1.1010212770300105e-05, "loss": 0.0603, "step": 39730 }, { "epoch": 3.601141924959217, "grad_norm": 0.6941208243370056, "learning_rate": 1.1003658848147166e-05, "loss": 0.058, "step": 39735 }, { "epoch": 3.6015950697843033, "grad_norm": 0.8832753896713257, "learning_rate": 1.0997106326752793e-05, "loss": 0.0763, "step": 39740 }, { "epoch": 3.6020482146093893, "grad_norm": 0.6707847118377686, "learning_rate": 1.0990555206772751e-05, "loss": 0.0557, "step": 39745 }, { "epoch": 3.6025013594344752, "grad_norm": 0.6015039086341858, "learning_rate": 1.0984005488862672e-05, "loss": 0.0599, "step": 39750 }, { "epoch": 3.602954504259561, "grad_norm": 0.49204495549201965, "learning_rate": 1.097745717367808e-05, "loss": 0.0588, "step": 39755 }, { "epoch": 3.6034076490846476, "grad_norm": 0.5238423347473145, "learning_rate": 1.0970910261874296e-05, "loss": 0.0495, "step": 39760 }, { "epoch": 3.6038607939097336, "grad_norm": 0.64637291431427, "learning_rate": 1.0964364754106565e-05, "loss": 0.0608, "step": 39765 }, { "epoch": 3.6043139387348195, "grad_norm": 0.7710843086242676, "learning_rate": 1.0957820651029954e-05, "loss": 0.0668, "step": 39770 }, { "epoch": 3.6047670835599055, "grad_norm": 0.5663730502128601, "learning_rate": 1.0951277953299388e-05, "loss": 0.0524, "step": 39775 }, { "epoch": 3.605220228384992, "grad_norm": 0.6338420510292053, "learning_rate": 1.0944736661569694e-05, "loss": 0.0565, "step": 39780 }, { "epoch": 3.605673373210078, "grad_norm": 0.698214054107666, "learning_rate": 1.0938196776495488e-05, "loss": 0.0594, "step": 39785 }, { "epoch": 3.606126518035164, "grad_norm": 0.8485409021377563, "learning_rate": 1.093165829873131e-05, "loss": 0.0712, "step": 39790 }, { "epoch": 3.6065796628602502, "grad_norm": 0.5253183245658875, "learning_rate": 1.0925121228931525e-05, "loss": 0.0516, "step": 39795 }, { "epoch": 3.607032807685336, "grad_norm": 0.611784040927887, "learning_rate": 1.0918585567750358e-05, "loss": 0.061, "step": 39800 }, { "epoch": 3.607485952510422, "grad_norm": 0.5630574822425842, "learning_rate": 1.0912051315841926e-05, "loss": 0.0637, "step": 39805 }, { "epoch": 3.6079390973355085, "grad_norm": 0.6490464806556702, "learning_rate": 1.0905518473860143e-05, "loss": 0.058, "step": 39810 }, { "epoch": 3.6083922421605945, "grad_norm": 0.5409383177757263, "learning_rate": 1.0898987042458844e-05, "loss": 0.0551, "step": 39815 }, { "epoch": 3.6088453869856805, "grad_norm": 0.6638473868370056, "learning_rate": 1.0892457022291688e-05, "loss": 0.0527, "step": 39820 }, { "epoch": 3.609298531810767, "grad_norm": 0.5643109083175659, "learning_rate": 1.0885928414012192e-05, "loss": 0.0726, "step": 39825 }, { "epoch": 3.609751676635853, "grad_norm": 0.6489797234535217, "learning_rate": 1.0879401218273764e-05, "loss": 0.066, "step": 39830 }, { "epoch": 3.610204821460939, "grad_norm": 0.6087438464164734, "learning_rate": 1.0872875435729632e-05, "loss": 0.064, "step": 39835 }, { "epoch": 3.610657966286025, "grad_norm": 0.7043792009353638, "learning_rate": 1.0866351067032904e-05, "loss": 0.0656, "step": 39840 }, { "epoch": 3.611111111111111, "grad_norm": 0.6178291440010071, "learning_rate": 1.085982811283654e-05, "loss": 0.0603, "step": 39845 }, { "epoch": 3.611564255936197, "grad_norm": 0.5895068049430847, "learning_rate": 1.0853306573793349e-05, "loss": 0.0696, "step": 39850 }, { "epoch": 3.6120174007612835, "grad_norm": 0.4816948473453522, "learning_rate": 1.0846786450556026e-05, "loss": 0.0516, "step": 39855 }, { "epoch": 3.6124705455863695, "grad_norm": 0.5981994271278381, "learning_rate": 1.08402677437771e-05, "loss": 0.0589, "step": 39860 }, { "epoch": 3.6129236904114554, "grad_norm": 0.5696563720703125, "learning_rate": 1.0833750454108965e-05, "loss": 0.0595, "step": 39865 }, { "epoch": 3.613376835236542, "grad_norm": 0.6119800806045532, "learning_rate": 1.0827234582203875e-05, "loss": 0.054, "step": 39870 }, { "epoch": 3.613829980061628, "grad_norm": 0.6222809553146362, "learning_rate": 1.082072012871393e-05, "loss": 0.0567, "step": 39875 }, { "epoch": 3.6142831248867138, "grad_norm": 0.704020082950592, "learning_rate": 1.0814207094291114e-05, "loss": 0.0563, "step": 39880 }, { "epoch": 3.6147362697117997, "grad_norm": 0.632868230342865, "learning_rate": 1.0807695479587249e-05, "loss": 0.0615, "step": 39885 }, { "epoch": 3.615189414536886, "grad_norm": 0.5782577395439148, "learning_rate": 1.080118528525402e-05, "loss": 0.0645, "step": 39890 }, { "epoch": 3.615642559361972, "grad_norm": 0.7931031584739685, "learning_rate": 1.0794676511942964e-05, "loss": 0.0799, "step": 39895 }, { "epoch": 3.616095704187058, "grad_norm": 0.6032798290252686, "learning_rate": 1.078816916030549e-05, "loss": 0.055, "step": 39900 }, { "epoch": 3.616548849012144, "grad_norm": 0.5158361196517944, "learning_rate": 1.0781663230992836e-05, "loss": 0.0537, "step": 39905 }, { "epoch": 3.6170019938372304, "grad_norm": 0.7301211953163147, "learning_rate": 1.0775158724656146e-05, "loss": 0.0768, "step": 39910 }, { "epoch": 3.6174551386623164, "grad_norm": 0.678224503993988, "learning_rate": 1.0768655641946377e-05, "loss": 0.0526, "step": 39915 }, { "epoch": 3.6179082834874023, "grad_norm": 0.8090957403182983, "learning_rate": 1.0762153983514361e-05, "loss": 0.0678, "step": 39920 }, { "epoch": 3.6183614283124887, "grad_norm": 0.5890260338783264, "learning_rate": 1.0755653750010789e-05, "loss": 0.0696, "step": 39925 }, { "epoch": 3.6188145731375747, "grad_norm": 0.5544470548629761, "learning_rate": 1.0749154942086195e-05, "loss": 0.0648, "step": 39930 }, { "epoch": 3.6192677179626607, "grad_norm": 0.5568411946296692, "learning_rate": 1.0742657560391003e-05, "loss": 0.0563, "step": 39935 }, { "epoch": 3.619720862787747, "grad_norm": 0.5901138782501221, "learning_rate": 1.0736161605575463e-05, "loss": 0.0633, "step": 39940 }, { "epoch": 3.620174007612833, "grad_norm": 0.9433371424674988, "learning_rate": 1.072966707828968e-05, "loss": 0.0629, "step": 39945 }, { "epoch": 3.620627152437919, "grad_norm": 0.6521857976913452, "learning_rate": 1.0723173979183657e-05, "loss": 0.0613, "step": 39950 }, { "epoch": 3.6210802972630054, "grad_norm": 0.6945914626121521, "learning_rate": 1.0716682308907191e-05, "loss": 0.0608, "step": 39955 }, { "epoch": 3.6215334420880914, "grad_norm": 0.6336696743965149, "learning_rate": 1.071019206811e-05, "loss": 0.0559, "step": 39960 }, { "epoch": 3.6219865869131773, "grad_norm": 0.62032550573349, "learning_rate": 1.0703703257441614e-05, "loss": 0.053, "step": 39965 }, { "epoch": 3.6224397317382637, "grad_norm": 0.523730456829071, "learning_rate": 1.069721587755143e-05, "loss": 0.0553, "step": 39970 }, { "epoch": 3.6228928765633497, "grad_norm": 0.595814049243927, "learning_rate": 1.0690729929088739e-05, "loss": 0.0515, "step": 39975 }, { "epoch": 3.6233460213884356, "grad_norm": 0.5818710923194885, "learning_rate": 1.0684245412702609e-05, "loss": 0.0587, "step": 39980 }, { "epoch": 3.623799166213522, "grad_norm": 0.607510507106781, "learning_rate": 1.0677762329042051e-05, "loss": 0.0644, "step": 39985 }, { "epoch": 3.624252311038608, "grad_norm": 0.722102701663971, "learning_rate": 1.0671280678755879e-05, "loss": 0.0624, "step": 39990 }, { "epoch": 3.624705455863694, "grad_norm": 0.478108674287796, "learning_rate": 1.0664800462492771e-05, "loss": 0.0633, "step": 39995 }, { "epoch": 3.6251586006887804, "grad_norm": 0.5112096071243286, "learning_rate": 1.06583216809013e-05, "loss": 0.0649, "step": 40000 }, { "epoch": 3.6256117455138663, "grad_norm": 0.5268200039863586, "learning_rate": 1.0651844334629824e-05, "loss": 0.0573, "step": 40005 }, { "epoch": 3.6260648903389523, "grad_norm": 0.5070276260375977, "learning_rate": 1.0645368424326624e-05, "loss": 0.0541, "step": 40010 }, { "epoch": 3.6265180351640387, "grad_norm": 0.5028098821640015, "learning_rate": 1.0638893950639805e-05, "loss": 0.0558, "step": 40015 }, { "epoch": 3.6269711799891247, "grad_norm": 0.572545051574707, "learning_rate": 1.0632420914217329e-05, "loss": 0.0641, "step": 40020 }, { "epoch": 3.6274243248142106, "grad_norm": 0.8210618495941162, "learning_rate": 1.062594931570704e-05, "loss": 0.0548, "step": 40025 }, { "epoch": 3.6278774696392966, "grad_norm": 0.501512885093689, "learning_rate": 1.0619479155756584e-05, "loss": 0.0562, "step": 40030 }, { "epoch": 3.6283306144643825, "grad_norm": 0.5552645325660706, "learning_rate": 1.0613010435013527e-05, "loss": 0.0551, "step": 40035 }, { "epoch": 3.628783759289469, "grad_norm": 0.5836438536643982, "learning_rate": 1.0606543154125244e-05, "loss": 0.0546, "step": 40040 }, { "epoch": 3.629236904114555, "grad_norm": 0.7573394775390625, "learning_rate": 1.0600077313738985e-05, "loss": 0.0602, "step": 40045 }, { "epoch": 3.629690048939641, "grad_norm": 0.4908658266067505, "learning_rate": 1.0593612914501872e-05, "loss": 0.0601, "step": 40050 }, { "epoch": 3.6301431937647273, "grad_norm": 0.6611331105232239, "learning_rate": 1.058714995706083e-05, "loss": 0.0537, "step": 40055 }, { "epoch": 3.6305963385898132, "grad_norm": 0.6354915499687195, "learning_rate": 1.0580688442062711e-05, "loss": 0.0621, "step": 40060 }, { "epoch": 3.631049483414899, "grad_norm": 0.5584373474121094, "learning_rate": 1.057422837015415e-05, "loss": 0.0598, "step": 40065 }, { "epoch": 3.6315026282399856, "grad_norm": 0.6990163922309875, "learning_rate": 1.0567769741981688e-05, "loss": 0.0645, "step": 40070 }, { "epoch": 3.6319557730650716, "grad_norm": 0.6026420593261719, "learning_rate": 1.0561312558191727e-05, "loss": 0.0625, "step": 40075 }, { "epoch": 3.6324089178901575, "grad_norm": 0.5929641127586365, "learning_rate": 1.055485681943047e-05, "loss": 0.0585, "step": 40080 }, { "epoch": 3.632862062715244, "grad_norm": 0.4959510564804077, "learning_rate": 1.0548402526344037e-05, "loss": 0.0556, "step": 40085 }, { "epoch": 3.63331520754033, "grad_norm": 0.7123229503631592, "learning_rate": 1.0541949679578361e-05, "loss": 0.0577, "step": 40090 }, { "epoch": 3.633768352365416, "grad_norm": 0.7260256409645081, "learning_rate": 1.0535498279779249e-05, "loss": 0.0681, "step": 40095 }, { "epoch": 3.6342214971905022, "grad_norm": 0.6710899472236633, "learning_rate": 1.0529048327592359e-05, "loss": 0.066, "step": 40100 }, { "epoch": 3.634674642015588, "grad_norm": 0.7041546106338501, "learning_rate": 1.0522599823663195e-05, "loss": 0.0638, "step": 40105 }, { "epoch": 3.635127786840674, "grad_norm": 0.7724911570549011, "learning_rate": 1.0516152768637147e-05, "loss": 0.0753, "step": 40110 }, { "epoch": 3.6355809316657606, "grad_norm": 0.6811385750770569, "learning_rate": 1.0509707163159421e-05, "loss": 0.0625, "step": 40115 }, { "epoch": 3.6360340764908465, "grad_norm": 0.6040549278259277, "learning_rate": 1.0503263007875106e-05, "loss": 0.0511, "step": 40120 }, { "epoch": 3.6364872213159325, "grad_norm": 0.5986378192901611, "learning_rate": 1.0496820303429125e-05, "loss": 0.0689, "step": 40125 }, { "epoch": 3.636940366141019, "grad_norm": 0.5571117401123047, "learning_rate": 1.0490379050466265e-05, "loss": 0.0566, "step": 40130 }, { "epoch": 3.637393510966105, "grad_norm": 0.596542477607727, "learning_rate": 1.048393924963118e-05, "loss": 0.0668, "step": 40135 }, { "epoch": 3.637846655791191, "grad_norm": 0.5167511701583862, "learning_rate": 1.0477500901568365e-05, "loss": 0.053, "step": 40140 }, { "epoch": 3.6382998006162772, "grad_norm": 0.6381316184997559, "learning_rate": 1.047106400692217e-05, "loss": 0.0638, "step": 40145 }, { "epoch": 3.638752945441363, "grad_norm": 0.603469967842102, "learning_rate": 1.0464628566336797e-05, "loss": 0.0557, "step": 40150 }, { "epoch": 3.639206090266449, "grad_norm": 0.43267622590065, "learning_rate": 1.0458194580456306e-05, "loss": 0.0507, "step": 40155 }, { "epoch": 3.639659235091535, "grad_norm": 0.7926977872848511, "learning_rate": 1.0451762049924623e-05, "loss": 0.057, "step": 40160 }, { "epoch": 3.6401123799166215, "grad_norm": 0.61785888671875, "learning_rate": 1.0445330975385515e-05, "loss": 0.0512, "step": 40165 }, { "epoch": 3.6405655247417075, "grad_norm": 0.556610643863678, "learning_rate": 1.0438901357482602e-05, "loss": 0.057, "step": 40170 }, { "epoch": 3.6410186695667934, "grad_norm": 0.7559689283370972, "learning_rate": 1.0432473196859363e-05, "loss": 0.0644, "step": 40175 }, { "epoch": 3.6414718143918794, "grad_norm": 0.6043082475662231, "learning_rate": 1.0426046494159122e-05, "loss": 0.0524, "step": 40180 }, { "epoch": 3.641924959216966, "grad_norm": 0.6032137274742126, "learning_rate": 1.0419621250025083e-05, "loss": 0.0599, "step": 40185 }, { "epoch": 3.6423781040420518, "grad_norm": 0.6169134378433228, "learning_rate": 1.041319746510028e-05, "loss": 0.0625, "step": 40190 }, { "epoch": 3.6428312488671377, "grad_norm": 0.5354403853416443, "learning_rate": 1.0406775140027606e-05, "loss": 0.0667, "step": 40195 }, { "epoch": 3.643284393692224, "grad_norm": 0.5920382738113403, "learning_rate": 1.0400354275449806e-05, "loss": 0.0589, "step": 40200 }, { "epoch": 3.64373753851731, "grad_norm": 0.49743664264678955, "learning_rate": 1.0393934872009477e-05, "loss": 0.0567, "step": 40205 }, { "epoch": 3.644190683342396, "grad_norm": 0.5867456793785095, "learning_rate": 1.0387516930349097e-05, "loss": 0.0655, "step": 40210 }, { "epoch": 3.6446438281674824, "grad_norm": 0.5928071141242981, "learning_rate": 1.0381100451110958e-05, "loss": 0.0634, "step": 40215 }, { "epoch": 3.6450969729925684, "grad_norm": 0.6124023199081421, "learning_rate": 1.0374685434937232e-05, "loss": 0.0604, "step": 40220 }, { "epoch": 3.6455501178176544, "grad_norm": 0.7099199891090393, "learning_rate": 1.0368271882469918e-05, "loss": 0.0822, "step": 40225 }, { "epoch": 3.6460032626427408, "grad_norm": 0.5702561140060425, "learning_rate": 1.036185979435091e-05, "loss": 0.0518, "step": 40230 }, { "epoch": 3.6464564074678267, "grad_norm": 0.7632157206535339, "learning_rate": 1.0355449171221927e-05, "loss": 0.0539, "step": 40235 }, { "epoch": 3.6469095522929127, "grad_norm": 0.8596763610839844, "learning_rate": 1.0349040013724537e-05, "loss": 0.0854, "step": 40240 }, { "epoch": 3.647362697117999, "grad_norm": 0.7341687679290771, "learning_rate": 1.0342632322500176e-05, "loss": 0.0641, "step": 40245 }, { "epoch": 3.647815841943085, "grad_norm": 0.5004958510398865, "learning_rate": 1.0336226098190118e-05, "loss": 0.0768, "step": 40250 }, { "epoch": 3.648268986768171, "grad_norm": 0.7271822094917297, "learning_rate": 1.0329821341435528e-05, "loss": 0.0756, "step": 40255 }, { "epoch": 3.6487221315932574, "grad_norm": 0.3940538167953491, "learning_rate": 1.032341805287736e-05, "loss": 0.0542, "step": 40260 }, { "epoch": 3.6491752764183434, "grad_norm": 0.6790839433670044, "learning_rate": 1.0317016233156481e-05, "loss": 0.0579, "step": 40265 }, { "epoch": 3.6496284212434293, "grad_norm": 0.5790198445320129, "learning_rate": 1.0310615882913582e-05, "loss": 0.0632, "step": 40270 }, { "epoch": 3.6500815660685157, "grad_norm": 0.48887497186660767, "learning_rate": 1.0304217002789203e-05, "loss": 0.0578, "step": 40275 }, { "epoch": 3.6505347108936017, "grad_norm": 0.6202101111412048, "learning_rate": 1.0297819593423772e-05, "loss": 0.0526, "step": 40280 }, { "epoch": 3.6509878557186877, "grad_norm": 0.6687305569648743, "learning_rate": 1.0291423655457507e-05, "loss": 0.0618, "step": 40285 }, { "epoch": 3.6514410005437736, "grad_norm": 0.5252429842948914, "learning_rate": 1.0285029189530546e-05, "loss": 0.0503, "step": 40290 }, { "epoch": 3.65189414536886, "grad_norm": 0.8336899280548096, "learning_rate": 1.0278636196282837e-05, "loss": 0.0678, "step": 40295 }, { "epoch": 3.652347290193946, "grad_norm": 0.7328901290893555, "learning_rate": 1.0272244676354187e-05, "loss": 0.0573, "step": 40300 }, { "epoch": 3.652800435019032, "grad_norm": 0.5721997618675232, "learning_rate": 1.0265854630384283e-05, "loss": 0.0597, "step": 40305 }, { "epoch": 3.653253579844118, "grad_norm": 0.5024186968803406, "learning_rate": 1.0259466059012612e-05, "loss": 0.047, "step": 40310 }, { "epoch": 3.6537067246692043, "grad_norm": 0.6110748648643494, "learning_rate": 1.025307896287857e-05, "loss": 0.0557, "step": 40315 }, { "epoch": 3.6541598694942903, "grad_norm": 0.5112395882606506, "learning_rate": 1.0246693342621372e-05, "loss": 0.0552, "step": 40320 }, { "epoch": 3.6546130143193762, "grad_norm": 0.5951237678527832, "learning_rate": 1.024030919888008e-05, "loss": 0.05, "step": 40325 }, { "epoch": 3.6550661591444626, "grad_norm": 0.571514368057251, "learning_rate": 1.0233926532293652e-05, "loss": 0.0851, "step": 40330 }, { "epoch": 3.6555193039695486, "grad_norm": 0.5413739085197449, "learning_rate": 1.0227545343500833e-05, "loss": 0.054, "step": 40335 }, { "epoch": 3.6559724487946346, "grad_norm": 0.4613712728023529, "learning_rate": 1.0221165633140276e-05, "loss": 0.052, "step": 40340 }, { "epoch": 3.656425593619721, "grad_norm": 0.45278507471084595, "learning_rate": 1.0214787401850459e-05, "loss": 0.0626, "step": 40345 }, { "epoch": 3.656878738444807, "grad_norm": 0.4944908320903778, "learning_rate": 1.0208410650269707e-05, "loss": 0.0538, "step": 40350 }, { "epoch": 3.657331883269893, "grad_norm": 0.5881061553955078, "learning_rate": 1.0202035379036237e-05, "loss": 0.0597, "step": 40355 }, { "epoch": 3.6577850280949793, "grad_norm": 0.459858775138855, "learning_rate": 1.019566158878805e-05, "loss": 0.0599, "step": 40360 }, { "epoch": 3.6582381729200653, "grad_norm": 0.614650547504425, "learning_rate": 1.0189289280163063e-05, "loss": 0.0528, "step": 40365 }, { "epoch": 3.658691317745151, "grad_norm": 0.6544560790061951, "learning_rate": 1.0182918453799014e-05, "loss": 0.0656, "step": 40370 }, { "epoch": 3.6591444625702376, "grad_norm": 0.49748846888542175, "learning_rate": 1.0176549110333486e-05, "loss": 0.0595, "step": 40375 }, { "epoch": 3.6595976073953236, "grad_norm": 0.529627799987793, "learning_rate": 1.0170181250403952e-05, "loss": 0.0821, "step": 40380 }, { "epoch": 3.6600507522204095, "grad_norm": 0.8017836213111877, "learning_rate": 1.0163814874647672e-05, "loss": 0.0634, "step": 40385 }, { "epoch": 3.660503897045496, "grad_norm": 0.6261945366859436, "learning_rate": 1.0157449983701824e-05, "loss": 0.0569, "step": 40390 }, { "epoch": 3.660957041870582, "grad_norm": 0.6882794499397278, "learning_rate": 1.0151086578203401e-05, "loss": 0.0571, "step": 40395 }, { "epoch": 3.661410186695668, "grad_norm": 0.585472583770752, "learning_rate": 1.0144724658789243e-05, "loss": 0.0608, "step": 40400 }, { "epoch": 3.6618633315207543, "grad_norm": 0.5183419585227966, "learning_rate": 1.0138364226096081e-05, "loss": 0.067, "step": 40405 }, { "epoch": 3.6623164763458402, "grad_norm": 0.5539580583572388, "learning_rate": 1.0132005280760435e-05, "loss": 0.0622, "step": 40410 }, { "epoch": 3.662769621170926, "grad_norm": 0.48147618770599365, "learning_rate": 1.0125647823418733e-05, "loss": 0.0522, "step": 40415 }, { "epoch": 3.6632227659960126, "grad_norm": 0.5538592338562012, "learning_rate": 1.0119291854707225e-05, "loss": 0.0527, "step": 40420 }, { "epoch": 3.6636759108210986, "grad_norm": 0.5784103870391846, "learning_rate": 1.011293737526201e-05, "loss": 0.0515, "step": 40425 }, { "epoch": 3.6641290556461845, "grad_norm": 0.5987176895141602, "learning_rate": 1.0106584385719075e-05, "loss": 0.0689, "step": 40430 }, { "epoch": 3.6645822004712705, "grad_norm": 0.5874171853065491, "learning_rate": 1.0100232886714189e-05, "loss": 0.0612, "step": 40435 }, { "epoch": 3.6650353452963564, "grad_norm": 0.5763682723045349, "learning_rate": 1.0093882878883044e-05, "loss": 0.0595, "step": 40440 }, { "epoch": 3.665488490121443, "grad_norm": 0.577867329120636, "learning_rate": 1.0087534362861136e-05, "loss": 0.0512, "step": 40445 }, { "epoch": 3.665941634946529, "grad_norm": 0.8375574946403503, "learning_rate": 1.0081187339283835e-05, "loss": 0.0734, "step": 40450 }, { "epoch": 3.6663947797716148, "grad_norm": 0.7024143934249878, "learning_rate": 1.0074841808786344e-05, "loss": 0.0646, "step": 40455 }, { "epoch": 3.666847924596701, "grad_norm": 0.4868936538696289, "learning_rate": 1.0068497772003725e-05, "loss": 0.0547, "step": 40460 }, { "epoch": 3.667301069421787, "grad_norm": 0.5255075097084045, "learning_rate": 1.0062155229570903e-05, "loss": 0.0561, "step": 40465 }, { "epoch": 3.667754214246873, "grad_norm": 0.6166797280311584, "learning_rate": 1.0055814182122635e-05, "loss": 0.0542, "step": 40470 }, { "epoch": 3.6682073590719595, "grad_norm": 0.7073317170143127, "learning_rate": 1.0049474630293536e-05, "loss": 0.0611, "step": 40475 }, { "epoch": 3.6686605038970455, "grad_norm": 0.5246196985244751, "learning_rate": 1.0043136574718062e-05, "loss": 0.0569, "step": 40480 }, { "epoch": 3.6691136487221314, "grad_norm": 0.5404182076454163, "learning_rate": 1.0036800016030546e-05, "loss": 0.0611, "step": 40485 }, { "epoch": 3.669566793547218, "grad_norm": 0.5852411985397339, "learning_rate": 1.003046495486514e-05, "loss": 0.0523, "step": 40490 }, { "epoch": 3.670019938372304, "grad_norm": 0.47851797938346863, "learning_rate": 1.0024131391855862e-05, "loss": 0.0573, "step": 40495 }, { "epoch": 3.6704730831973897, "grad_norm": 0.5135294198989868, "learning_rate": 1.0017799327636576e-05, "loss": 0.0716, "step": 40500 }, { "epoch": 3.670926228022476, "grad_norm": 0.4776511788368225, "learning_rate": 1.0011468762840993e-05, "loss": 0.0549, "step": 40505 }, { "epoch": 3.671379372847562, "grad_norm": 0.7342824935913086, "learning_rate": 1.0005139698102688e-05, "loss": 0.0629, "step": 40510 }, { "epoch": 3.671832517672648, "grad_norm": 0.6206876635551453, "learning_rate": 9.998812134055074e-06, "loss": 0.0578, "step": 40515 }, { "epoch": 3.6722856624977345, "grad_norm": 0.6236312985420227, "learning_rate": 9.99248607133141e-06, "loss": 0.0555, "step": 40520 }, { "epoch": 3.6727388073228204, "grad_norm": 0.6623988747596741, "learning_rate": 9.986161510564816e-06, "loss": 0.0702, "step": 40525 }, { "epoch": 3.6731919521479064, "grad_norm": 0.44945013523101807, "learning_rate": 9.979838452388243e-06, "loss": 0.0497, "step": 40530 }, { "epoch": 3.673645096972993, "grad_norm": 0.5860722064971924, "learning_rate": 9.973516897434525e-06, "loss": 0.0569, "step": 40535 }, { "epoch": 3.6740982417980788, "grad_norm": 0.5321076512336731, "learning_rate": 9.967196846336316e-06, "loss": 0.0527, "step": 40540 }, { "epoch": 3.6745513866231647, "grad_norm": 0.5390397310256958, "learning_rate": 9.960878299726127e-06, "loss": 0.0562, "step": 40545 }, { "epoch": 3.675004531448251, "grad_norm": 0.9280067682266235, "learning_rate": 9.954561258236322e-06, "loss": 0.0681, "step": 40550 }, { "epoch": 3.675457676273337, "grad_norm": 0.6100738644599915, "learning_rate": 9.948245722499106e-06, "loss": 0.077, "step": 40555 }, { "epoch": 3.675910821098423, "grad_norm": 0.41820815205574036, "learning_rate": 9.94193169314655e-06, "loss": 0.0574, "step": 40560 }, { "epoch": 3.676363965923509, "grad_norm": 0.5879338979721069, "learning_rate": 9.935619170810564e-06, "loss": 0.0677, "step": 40565 }, { "epoch": 3.6768171107485954, "grad_norm": 0.4683724343776703, "learning_rate": 9.929308156122904e-06, "loss": 0.0552, "step": 40570 }, { "epoch": 3.6772702555736814, "grad_norm": 0.46241098642349243, "learning_rate": 9.922998649715178e-06, "loss": 0.0524, "step": 40575 }, { "epoch": 3.6777234003987673, "grad_norm": 0.4664102792739868, "learning_rate": 9.916690652218836e-06, "loss": 0.0559, "step": 40580 }, { "epoch": 3.6781765452238533, "grad_norm": 0.5994166135787964, "learning_rate": 9.9103841642652e-06, "loss": 0.0698, "step": 40585 }, { "epoch": 3.6786296900489397, "grad_norm": 0.5636574625968933, "learning_rate": 9.904079186485418e-06, "loss": 0.0548, "step": 40590 }, { "epoch": 3.6790828348740257, "grad_norm": 0.7732637524604797, "learning_rate": 9.897775719510486e-06, "loss": 0.0661, "step": 40595 }, { "epoch": 3.6795359796991116, "grad_norm": 0.6193202137947083, "learning_rate": 9.891473763971282e-06, "loss": 0.0808, "step": 40600 }, { "epoch": 3.679989124524198, "grad_norm": 0.6402245163917542, "learning_rate": 9.885173320498476e-06, "loss": 0.0552, "step": 40605 }, { "epoch": 3.680442269349284, "grad_norm": 0.6495906710624695, "learning_rate": 9.878874389722648e-06, "loss": 0.0536, "step": 40610 }, { "epoch": 3.68089541417437, "grad_norm": 0.5772392153739929, "learning_rate": 9.872576972274167e-06, "loss": 0.0514, "step": 40615 }, { "epoch": 3.6813485589994563, "grad_norm": 0.466731995344162, "learning_rate": 9.866281068783296e-06, "loss": 0.063, "step": 40620 }, { "epoch": 3.6818017038245423, "grad_norm": 0.5972214937210083, "learning_rate": 9.859986679880149e-06, "loss": 0.0676, "step": 40625 }, { "epoch": 3.6822548486496283, "grad_norm": 0.6344808340072632, "learning_rate": 9.853693806194635e-06, "loss": 0.0574, "step": 40630 }, { "epoch": 3.6827079934747147, "grad_norm": 0.5774100422859192, "learning_rate": 9.847402448356583e-06, "loss": 0.0669, "step": 40635 }, { "epoch": 3.6831611382998006, "grad_norm": 0.515980064868927, "learning_rate": 9.8411126069956e-06, "loss": 0.0594, "step": 40640 }, { "epoch": 3.6836142831248866, "grad_norm": 0.6768441200256348, "learning_rate": 9.834824282741199e-06, "loss": 0.0595, "step": 40645 }, { "epoch": 3.684067427949973, "grad_norm": 0.5199012756347656, "learning_rate": 9.82853747622271e-06, "loss": 0.0492, "step": 40650 }, { "epoch": 3.684520572775059, "grad_norm": 0.7829825282096863, "learning_rate": 9.822252188069312e-06, "loss": 0.0864, "step": 40655 }, { "epoch": 3.684973717600145, "grad_norm": 0.6667171120643616, "learning_rate": 9.815968418910062e-06, "loss": 0.0645, "step": 40660 }, { "epoch": 3.6854268624252313, "grad_norm": 0.4748392105102539, "learning_rate": 9.809686169373811e-06, "loss": 0.052, "step": 40665 }, { "epoch": 3.6858800072503173, "grad_norm": 0.5854558348655701, "learning_rate": 9.803405440089309e-06, "loss": 0.0462, "step": 40670 }, { "epoch": 3.6863331520754032, "grad_norm": 0.5174146294593811, "learning_rate": 9.797126231685128e-06, "loss": 0.0552, "step": 40675 }, { "epoch": 3.6867862969004896, "grad_norm": 0.7202823162078857, "learning_rate": 9.790848544789685e-06, "loss": 0.0588, "step": 40680 }, { "epoch": 3.6872394417255756, "grad_norm": 0.5274192094802856, "learning_rate": 9.784572380031279e-06, "loss": 0.0636, "step": 40685 }, { "epoch": 3.6876925865506616, "grad_norm": 0.614141047000885, "learning_rate": 9.778297738037995e-06, "loss": 0.0571, "step": 40690 }, { "epoch": 3.6881457313757475, "grad_norm": 0.7509709596633911, "learning_rate": 9.772024619437828e-06, "loss": 0.065, "step": 40695 }, { "epoch": 3.688598876200834, "grad_norm": 0.5752926468849182, "learning_rate": 9.765753024858584e-06, "loss": 0.0543, "step": 40700 }, { "epoch": 3.68905202102592, "grad_norm": 0.722830057144165, "learning_rate": 9.75948295492792e-06, "loss": 0.0581, "step": 40705 }, { "epoch": 3.689505165851006, "grad_norm": 0.7752470374107361, "learning_rate": 9.753214410273371e-06, "loss": 0.0652, "step": 40710 }, { "epoch": 3.689958310676092, "grad_norm": 0.6447557210922241, "learning_rate": 9.74694739152226e-06, "loss": 0.0585, "step": 40715 }, { "epoch": 3.690411455501178, "grad_norm": 0.49906304478645325, "learning_rate": 9.740681899301818e-06, "loss": 0.0529, "step": 40720 }, { "epoch": 3.690864600326264, "grad_norm": 0.4925227165222168, "learning_rate": 9.73441793423909e-06, "loss": 0.0602, "step": 40725 }, { "epoch": 3.69131774515135, "grad_norm": 0.544880747795105, "learning_rate": 9.728155496960969e-06, "loss": 0.06, "step": 40730 }, { "epoch": 3.6917708899764365, "grad_norm": 0.5344750285148621, "learning_rate": 9.721894588094216e-06, "loss": 0.0663, "step": 40735 }, { "epoch": 3.6922240348015225, "grad_norm": 0.6476173400878906, "learning_rate": 9.715635208265417e-06, "loss": 0.0542, "step": 40740 }, { "epoch": 3.6926771796266085, "grad_norm": 0.5390517115592957, "learning_rate": 9.709377358101016e-06, "loss": 0.0596, "step": 40745 }, { "epoch": 3.693130324451695, "grad_norm": 0.5962778329849243, "learning_rate": 9.703121038227294e-06, "loss": 0.0536, "step": 40750 }, { "epoch": 3.693583469276781, "grad_norm": 0.7465822100639343, "learning_rate": 9.696866249270384e-06, "loss": 0.0643, "step": 40755 }, { "epoch": 3.694036614101867, "grad_norm": 0.5867232084274292, "learning_rate": 9.690612991856279e-06, "loss": 0.0683, "step": 40760 }, { "epoch": 3.694489758926953, "grad_norm": 0.7211371064186096, "learning_rate": 9.684361266610803e-06, "loss": 0.0582, "step": 40765 }, { "epoch": 3.694942903752039, "grad_norm": 1.0824236869812012, "learning_rate": 9.67811107415963e-06, "loss": 0.0744, "step": 40770 }, { "epoch": 3.695396048577125, "grad_norm": 0.6008620262145996, "learning_rate": 9.671862415128277e-06, "loss": 0.0512, "step": 40775 }, { "epoch": 3.6958491934022115, "grad_norm": 0.518368124961853, "learning_rate": 9.66561529014211e-06, "loss": 0.0641, "step": 40780 }, { "epoch": 3.6963023382272975, "grad_norm": 0.5970568656921387, "learning_rate": 9.659369699826354e-06, "loss": 0.0637, "step": 40785 }, { "epoch": 3.6967554830523834, "grad_norm": 0.6768395304679871, "learning_rate": 9.653125644806066e-06, "loss": 0.0596, "step": 40790 }, { "epoch": 3.69720862787747, "grad_norm": 0.7041844129562378, "learning_rate": 9.646883125706151e-06, "loss": 0.0614, "step": 40795 }, { "epoch": 3.697661772702556, "grad_norm": 0.4857621490955353, "learning_rate": 9.640642143151365e-06, "loss": 0.0591, "step": 40800 }, { "epoch": 3.6981149175276418, "grad_norm": 0.5803371667861938, "learning_rate": 9.634402697766303e-06, "loss": 0.0749, "step": 40805 }, { "epoch": 3.698568062352728, "grad_norm": 0.5573390126228333, "learning_rate": 9.628164790175407e-06, "loss": 0.1084, "step": 40810 }, { "epoch": 3.699021207177814, "grad_norm": 0.6068186163902283, "learning_rate": 9.621928421002981e-06, "loss": 0.0644, "step": 40815 }, { "epoch": 3.6994743520029, "grad_norm": 0.5670636892318726, "learning_rate": 9.61569359087316e-06, "loss": 0.0548, "step": 40820 }, { "epoch": 3.6999274968279865, "grad_norm": 0.47986066341400146, "learning_rate": 9.609460300409922e-06, "loss": 0.057, "step": 40825 }, { "epoch": 3.7003806416530725, "grad_norm": 0.699439287185669, "learning_rate": 9.603228550237103e-06, "loss": 0.0693, "step": 40830 }, { "epoch": 3.7008337864781584, "grad_norm": 0.5861011743545532, "learning_rate": 9.596998340978364e-06, "loss": 0.0638, "step": 40835 }, { "epoch": 3.7012869313032444, "grad_norm": 0.6180126667022705, "learning_rate": 9.59076967325725e-06, "loss": 0.0809, "step": 40840 }, { "epoch": 3.701740076128331, "grad_norm": 0.6708058714866638, "learning_rate": 9.584542547697114e-06, "loss": 0.0719, "step": 40845 }, { "epoch": 3.7021932209534167, "grad_norm": 0.6254531145095825, "learning_rate": 9.578316964921172e-06, "loss": 0.0559, "step": 40850 }, { "epoch": 3.7026463657785027, "grad_norm": 0.6536898612976074, "learning_rate": 9.57209292555248e-06, "loss": 0.0568, "step": 40855 }, { "epoch": 3.7030995106035887, "grad_norm": 0.5951834321022034, "learning_rate": 9.565870430213935e-06, "loss": 0.0514, "step": 40860 }, { "epoch": 3.703552655428675, "grad_norm": 0.5620163083076477, "learning_rate": 9.559649479528304e-06, "loss": 0.0478, "step": 40865 }, { "epoch": 3.704005800253761, "grad_norm": 0.7261366248130798, "learning_rate": 9.553430074118174e-06, "loss": 0.0608, "step": 40870 }, { "epoch": 3.704458945078847, "grad_norm": 0.7843256592750549, "learning_rate": 9.547212214605975e-06, "loss": 0.0716, "step": 40875 }, { "epoch": 3.7049120899039334, "grad_norm": 0.49928396940231323, "learning_rate": 9.540995901614016e-06, "loss": 0.0532, "step": 40880 }, { "epoch": 3.7053652347290194, "grad_norm": 0.5268108248710632, "learning_rate": 9.534781135764398e-06, "loss": 0.0555, "step": 40885 }, { "epoch": 3.7058183795541053, "grad_norm": 0.48087066411972046, "learning_rate": 9.52856791767912e-06, "loss": 0.0531, "step": 40890 }, { "epoch": 3.7062715243791917, "grad_norm": 0.4854755401611328, "learning_rate": 9.522356247979997e-06, "loss": 0.0635, "step": 40895 }, { "epoch": 3.7067246692042777, "grad_norm": 0.45432567596435547, "learning_rate": 9.51614612728868e-06, "loss": 0.0547, "step": 40900 }, { "epoch": 3.7071778140293636, "grad_norm": 0.7331855297088623, "learning_rate": 9.509937556226714e-06, "loss": 0.0649, "step": 40905 }, { "epoch": 3.70763095885445, "grad_norm": 0.5800535082817078, "learning_rate": 9.503730535415415e-06, "loss": 0.0656, "step": 40910 }, { "epoch": 3.708084103679536, "grad_norm": 0.5033579468727112, "learning_rate": 9.497525065476013e-06, "loss": 0.0671, "step": 40915 }, { "epoch": 3.708537248504622, "grad_norm": 0.6519862413406372, "learning_rate": 9.491321147029542e-06, "loss": 0.0506, "step": 40920 }, { "epoch": 3.7089903933297084, "grad_norm": 0.7810873985290527, "learning_rate": 9.485118780696884e-06, "loss": 0.0628, "step": 40925 }, { "epoch": 3.7094435381547943, "grad_norm": 0.5718472599983215, "learning_rate": 9.4789179670988e-06, "loss": 0.0628, "step": 40930 }, { "epoch": 3.7098966829798803, "grad_norm": 0.5301096439361572, "learning_rate": 9.472718706855834e-06, "loss": 0.0604, "step": 40935 }, { "epoch": 3.7103498278049667, "grad_norm": 0.6007637977600098, "learning_rate": 9.466521000588441e-06, "loss": 0.0557, "step": 40940 }, { "epoch": 3.7108029726300527, "grad_norm": 0.4921601712703705, "learning_rate": 9.460324848916876e-06, "loss": 0.0573, "step": 40945 }, { "epoch": 3.7112561174551386, "grad_norm": 0.731964647769928, "learning_rate": 9.454130252461243e-06, "loss": 0.0685, "step": 40950 }, { "epoch": 3.711709262280225, "grad_norm": 0.44827091693878174, "learning_rate": 9.447937211841528e-06, "loss": 0.0533, "step": 40955 }, { "epoch": 3.712162407105311, "grad_norm": 0.6398783922195435, "learning_rate": 9.4417457276775e-06, "loss": 0.0589, "step": 40960 }, { "epoch": 3.712615551930397, "grad_norm": 0.6758063435554504, "learning_rate": 9.435555800588833e-06, "loss": 0.0547, "step": 40965 }, { "epoch": 3.713068696755483, "grad_norm": 0.45668166875839233, "learning_rate": 9.429367431194986e-06, "loss": 0.0512, "step": 40970 }, { "epoch": 3.7135218415805693, "grad_norm": 0.5288527011871338, "learning_rate": 9.42318062011531e-06, "loss": 0.0507, "step": 40975 }, { "epoch": 3.7139749864056553, "grad_norm": 0.4949225187301636, "learning_rate": 9.416995367969003e-06, "loss": 0.0522, "step": 40980 }, { "epoch": 3.7144281312307412, "grad_norm": 0.5536748766899109, "learning_rate": 9.410811675375047e-06, "loss": 0.07, "step": 40985 }, { "epoch": 3.714881276055827, "grad_norm": 0.5062459111213684, "learning_rate": 9.40462954295234e-06, "loss": 0.0548, "step": 40990 }, { "epoch": 3.7153344208809136, "grad_norm": 0.5748721361160278, "learning_rate": 9.398448971319578e-06, "loss": 0.0521, "step": 40995 }, { "epoch": 3.7157875657059996, "grad_norm": 0.5092047452926636, "learning_rate": 9.392269961095319e-06, "loss": 0.0646, "step": 41000 }, { "epoch": 3.7162407105310855, "grad_norm": 0.8084284663200378, "learning_rate": 9.386092512897956e-06, "loss": 0.0581, "step": 41005 }, { "epoch": 3.716693855356172, "grad_norm": 0.5679295063018799, "learning_rate": 9.379916627345725e-06, "loss": 0.0543, "step": 41010 }, { "epoch": 3.717147000181258, "grad_norm": 0.5624294877052307, "learning_rate": 9.373742305056729e-06, "loss": 0.0523, "step": 41015 }, { "epoch": 3.717600145006344, "grad_norm": 0.6012994647026062, "learning_rate": 9.367569546648882e-06, "loss": 0.0654, "step": 41020 }, { "epoch": 3.7180532898314302, "grad_norm": 0.6544909477233887, "learning_rate": 9.361398352739964e-06, "loss": 0.0564, "step": 41025 }, { "epoch": 3.718506434656516, "grad_norm": 0.5303568840026855, "learning_rate": 9.355228723947582e-06, "loss": 0.0576, "step": 41030 }, { "epoch": 3.718959579481602, "grad_norm": 0.7187449336051941, "learning_rate": 9.349060660889189e-06, "loss": 0.0631, "step": 41035 }, { "epoch": 3.7194127243066886, "grad_norm": 0.5694624185562134, "learning_rate": 9.342894164182106e-06, "loss": 0.0569, "step": 41040 }, { "epoch": 3.7198658691317745, "grad_norm": 0.5511271953582764, "learning_rate": 9.336729234443467e-06, "loss": 0.0577, "step": 41045 }, { "epoch": 3.7203190139568605, "grad_norm": 0.7376673221588135, "learning_rate": 9.330565872290261e-06, "loss": 0.06, "step": 41050 }, { "epoch": 3.720772158781947, "grad_norm": 0.5927769541740417, "learning_rate": 9.324404078339322e-06, "loss": 0.0611, "step": 41055 }, { "epoch": 3.721225303607033, "grad_norm": 0.44607481360435486, "learning_rate": 9.318243853207312e-06, "loss": 0.0511, "step": 41060 }, { "epoch": 3.721678448432119, "grad_norm": 0.5531744360923767, "learning_rate": 9.312085197510767e-06, "loss": 0.0851, "step": 41065 }, { "epoch": 3.722131593257205, "grad_norm": 0.519862949848175, "learning_rate": 9.305928111866039e-06, "loss": 0.0525, "step": 41070 }, { "epoch": 3.722584738082291, "grad_norm": 0.5768027901649475, "learning_rate": 9.29977259688933e-06, "loss": 0.0677, "step": 41075 }, { "epoch": 3.723037882907377, "grad_norm": 0.576361894607544, "learning_rate": 9.293618653196687e-06, "loss": 0.0479, "step": 41080 }, { "epoch": 3.7234910277324635, "grad_norm": 0.6767917275428772, "learning_rate": 9.287466281403989e-06, "loss": 0.0686, "step": 41085 }, { "epoch": 3.7239441725575495, "grad_norm": 0.5382201075553894, "learning_rate": 9.28131548212699e-06, "loss": 0.0629, "step": 41090 }, { "epoch": 3.7243973173826355, "grad_norm": 0.6398400664329529, "learning_rate": 9.275166255981247e-06, "loss": 0.0636, "step": 41095 }, { "epoch": 3.7248504622077214, "grad_norm": 0.5666377544403076, "learning_rate": 9.269018603582183e-06, "loss": 0.0575, "step": 41100 }, { "epoch": 3.725303607032808, "grad_norm": 0.4716832935810089, "learning_rate": 9.262872525545056e-06, "loss": 0.0488, "step": 41105 }, { "epoch": 3.725756751857894, "grad_norm": 0.5662477016448975, "learning_rate": 9.256728022484953e-06, "loss": 0.0762, "step": 41110 }, { "epoch": 3.7262098966829797, "grad_norm": 0.5516357421875, "learning_rate": 9.250585095016846e-06, "loss": 0.0611, "step": 41115 }, { "epoch": 3.7266630415080657, "grad_norm": 0.517315149307251, "learning_rate": 9.244443743755504e-06, "loss": 0.0579, "step": 41120 }, { "epoch": 3.727116186333152, "grad_norm": 0.6644880175590515, "learning_rate": 9.23830396931556e-06, "loss": 0.063, "step": 41125 }, { "epoch": 3.727569331158238, "grad_norm": 0.5449059009552002, "learning_rate": 9.232165772311472e-06, "loss": 0.0607, "step": 41130 }, { "epoch": 3.728022475983324, "grad_norm": 0.7316362261772156, "learning_rate": 9.226029153357574e-06, "loss": 0.0559, "step": 41135 }, { "epoch": 3.7284756208084104, "grad_norm": 0.5490700602531433, "learning_rate": 9.219894113068009e-06, "loss": 0.0586, "step": 41140 }, { "epoch": 3.7289287656334964, "grad_norm": 0.566889226436615, "learning_rate": 9.213760652056776e-06, "loss": 0.0601, "step": 41145 }, { "epoch": 3.7293819104585824, "grad_norm": 0.47097551822662354, "learning_rate": 9.207628770937714e-06, "loss": 0.055, "step": 41150 }, { "epoch": 3.7298350552836688, "grad_norm": 0.5682114362716675, "learning_rate": 9.201498470324496e-06, "loss": 0.0623, "step": 41155 }, { "epoch": 3.7302882001087547, "grad_norm": 0.6201180219650269, "learning_rate": 9.195369750830667e-06, "loss": 0.0628, "step": 41160 }, { "epoch": 3.7307413449338407, "grad_norm": 0.6341581344604492, "learning_rate": 9.189242613069562e-06, "loss": 0.0485, "step": 41165 }, { "epoch": 3.731194489758927, "grad_norm": 0.5363916754722595, "learning_rate": 9.183117057654406e-06, "loss": 0.0545, "step": 41170 }, { "epoch": 3.731647634584013, "grad_norm": 0.5138009786605835, "learning_rate": 9.176993085198243e-06, "loss": 0.053, "step": 41175 }, { "epoch": 3.732100779409099, "grad_norm": 0.7001263499259949, "learning_rate": 9.170870696313955e-06, "loss": 0.0664, "step": 41180 }, { "epoch": 3.7325539242341854, "grad_norm": 0.6161647439002991, "learning_rate": 9.164749891614293e-06, "loss": 0.052, "step": 41185 }, { "epoch": 3.7330070690592714, "grad_norm": 0.6473723649978638, "learning_rate": 9.158630671711799e-06, "loss": 0.0624, "step": 41190 }, { "epoch": 3.7334602138843573, "grad_norm": 0.5660994648933411, "learning_rate": 9.15251303721891e-06, "loss": 0.0587, "step": 41195 }, { "epoch": 3.7339133587094437, "grad_norm": 0.5638372898101807, "learning_rate": 9.146396988747873e-06, "loss": 0.0597, "step": 41200 }, { "epoch": 3.7343665035345297, "grad_norm": 0.511390745639801, "learning_rate": 9.140282526910779e-06, "loss": 0.0527, "step": 41205 }, { "epoch": 3.7348196483596157, "grad_norm": 0.5340138077735901, "learning_rate": 9.134169652319585e-06, "loss": 0.0665, "step": 41210 }, { "epoch": 3.735272793184702, "grad_norm": 0.5366564989089966, "learning_rate": 9.12805836558604e-06, "loss": 0.048, "step": 41215 }, { "epoch": 3.735725938009788, "grad_norm": 0.5548655390739441, "learning_rate": 9.12194866732179e-06, "loss": 0.052, "step": 41220 }, { "epoch": 3.736179082834874, "grad_norm": 0.5904976725578308, "learning_rate": 9.11584055813828e-06, "loss": 0.0568, "step": 41225 }, { "epoch": 3.7366322276599604, "grad_norm": 0.6284873485565186, "learning_rate": 9.109734038646812e-06, "loss": 0.0503, "step": 41230 }, { "epoch": 3.7370853724850464, "grad_norm": 0.5428922772407532, "learning_rate": 9.10362910945855e-06, "loss": 0.0489, "step": 41235 }, { "epoch": 3.7375385173101323, "grad_norm": 0.5989227890968323, "learning_rate": 9.097525771184442e-06, "loss": 0.0495, "step": 41240 }, { "epoch": 3.7379916621352183, "grad_norm": 0.5110819935798645, "learning_rate": 9.091424024435338e-06, "loss": 0.0493, "step": 41245 }, { "epoch": 3.7384448069603047, "grad_norm": 0.4918350577354431, "learning_rate": 9.085323869821898e-06, "loss": 0.0573, "step": 41250 }, { "epoch": 3.7388979517853906, "grad_norm": 0.5257129073143005, "learning_rate": 9.079225307954616e-06, "loss": 0.0584, "step": 41255 }, { "epoch": 3.7393510966104766, "grad_norm": 0.6172276735305786, "learning_rate": 9.073128339443866e-06, "loss": 0.0497, "step": 41260 }, { "epoch": 3.7398042414355626, "grad_norm": 0.5910768508911133, "learning_rate": 9.067032964899797e-06, "loss": 0.0619, "step": 41265 }, { "epoch": 3.740257386260649, "grad_norm": 0.626603901386261, "learning_rate": 9.060939184932463e-06, "loss": 0.0703, "step": 41270 }, { "epoch": 3.740710531085735, "grad_norm": 0.6329191327095032, "learning_rate": 9.054847000151723e-06, "loss": 0.0494, "step": 41275 }, { "epoch": 3.741163675910821, "grad_norm": 0.5436473488807678, "learning_rate": 9.048756411167277e-06, "loss": 0.0476, "step": 41280 }, { "epoch": 3.7416168207359073, "grad_norm": 0.6879730820655823, "learning_rate": 9.042667418588702e-06, "loss": 0.0614, "step": 41285 }, { "epoch": 3.7420699655609932, "grad_norm": 0.4147736728191376, "learning_rate": 9.036580023025345e-06, "loss": 0.0471, "step": 41290 }, { "epoch": 3.742523110386079, "grad_norm": 0.5132617950439453, "learning_rate": 9.030494225086466e-06, "loss": 0.0505, "step": 41295 }, { "epoch": 3.7429762552111656, "grad_norm": 0.6500745415687561, "learning_rate": 9.024410025381124e-06, "loss": 0.0649, "step": 41300 }, { "epoch": 3.7434294000362516, "grad_norm": 0.4985469579696655, "learning_rate": 9.018327424518219e-06, "loss": 0.0509, "step": 41305 }, { "epoch": 3.7438825448613375, "grad_norm": 0.6340207457542419, "learning_rate": 9.012246423106524e-06, "loss": 0.0601, "step": 41310 }, { "epoch": 3.744335689686424, "grad_norm": 0.5296696424484253, "learning_rate": 9.006167021754595e-06, "loss": 0.0451, "step": 41315 }, { "epoch": 3.74478883451151, "grad_norm": 0.4950157701969147, "learning_rate": 9.000089221070885e-06, "loss": 0.0604, "step": 41320 }, { "epoch": 3.745241979336596, "grad_norm": 0.6865851879119873, "learning_rate": 8.994013021663655e-06, "loss": 0.052, "step": 41325 }, { "epoch": 3.7456951241616823, "grad_norm": 0.6431030631065369, "learning_rate": 8.987938424141004e-06, "loss": 0.0531, "step": 41330 }, { "epoch": 3.7461482689867682, "grad_norm": 0.47065624594688416, "learning_rate": 8.981865429110905e-06, "loss": 0.0548, "step": 41335 }, { "epoch": 3.746601413811854, "grad_norm": 0.5914103388786316, "learning_rate": 8.975794037181113e-06, "loss": 0.0494, "step": 41340 }, { "epoch": 3.7470545586369406, "grad_norm": 0.7564786076545715, "learning_rate": 8.969724248959277e-06, "loss": 0.0644, "step": 41345 }, { "epoch": 3.7475077034620266, "grad_norm": 0.5193860530853271, "learning_rate": 8.96365606505286e-06, "loss": 0.0503, "step": 41350 }, { "epoch": 3.7479608482871125, "grad_norm": 0.47134026885032654, "learning_rate": 8.957589486069164e-06, "loss": 0.0488, "step": 41355 }, { "epoch": 3.748413993112199, "grad_norm": 0.6502158641815186, "learning_rate": 8.951524512615337e-06, "loss": 0.0557, "step": 41360 }, { "epoch": 3.748867137937285, "grad_norm": 0.5269651412963867, "learning_rate": 8.94546114529835e-06, "loss": 0.0499, "step": 41365 }, { "epoch": 3.749320282762371, "grad_norm": 0.5262800455093384, "learning_rate": 8.93939938472505e-06, "loss": 0.0493, "step": 41370 }, { "epoch": 3.749773427587457, "grad_norm": 0.4470899701118469, "learning_rate": 8.933339231502087e-06, "loss": 0.0639, "step": 41375 }, { "epoch": 3.750226572412543, "grad_norm": 0.6694605946540833, "learning_rate": 8.927280686235967e-06, "loss": 0.0581, "step": 41380 }, { "epoch": 3.750679717237629, "grad_norm": 0.7028038501739502, "learning_rate": 8.921223749533019e-06, "loss": 0.0743, "step": 41385 }, { "epoch": 3.751132862062715, "grad_norm": 0.4219527840614319, "learning_rate": 8.915168421999442e-06, "loss": 0.0602, "step": 41390 }, { "epoch": 3.751586006887801, "grad_norm": 0.4795280396938324, "learning_rate": 8.909114704241247e-06, "loss": 0.0509, "step": 41395 }, { "epoch": 3.7520391517128875, "grad_norm": 0.5147716403007507, "learning_rate": 8.903062596864294e-06, "loss": 0.0541, "step": 41400 }, { "epoch": 3.7524922965379734, "grad_norm": 0.5284745693206787, "learning_rate": 8.897012100474278e-06, "loss": 0.0884, "step": 41405 }, { "epoch": 3.7529454413630594, "grad_norm": 0.5059119462966919, "learning_rate": 8.890963215676724e-06, "loss": 0.0664, "step": 41410 }, { "epoch": 3.753398586188146, "grad_norm": 0.5259023904800415, "learning_rate": 8.884915943077027e-06, "loss": 0.0576, "step": 41415 }, { "epoch": 3.7538517310132318, "grad_norm": 0.6185501217842102, "learning_rate": 8.878870283280388e-06, "loss": 0.0499, "step": 41420 }, { "epoch": 3.7543048758383177, "grad_norm": 0.5329769253730774, "learning_rate": 8.872826236891863e-06, "loss": 0.0574, "step": 41425 }, { "epoch": 3.754758020663404, "grad_norm": 0.588742733001709, "learning_rate": 8.866783804516338e-06, "loss": 0.0519, "step": 41430 }, { "epoch": 3.75521116548849, "grad_norm": 0.6531281471252441, "learning_rate": 8.860742986758536e-06, "loss": 0.0551, "step": 41435 }, { "epoch": 3.755664310313576, "grad_norm": 0.7319685816764832, "learning_rate": 8.854703784223042e-06, "loss": 0.0585, "step": 41440 }, { "epoch": 3.7561174551386625, "grad_norm": 0.5387296080589294, "learning_rate": 8.848666197514249e-06, "loss": 0.0724, "step": 41445 }, { "epoch": 3.7565705999637484, "grad_norm": 0.4522380530834198, "learning_rate": 8.842630227236403e-06, "loss": 0.0525, "step": 41450 }, { "epoch": 3.7570237447888344, "grad_norm": 0.6433842182159424, "learning_rate": 8.836595873993587e-06, "loss": 0.0574, "step": 41455 }, { "epoch": 3.757476889613921, "grad_norm": 0.8448172807693481, "learning_rate": 8.83056313838971e-06, "loss": 0.0789, "step": 41460 }, { "epoch": 3.7579300344390068, "grad_norm": 0.7130336761474609, "learning_rate": 8.824532021028551e-06, "loss": 0.0557, "step": 41465 }, { "epoch": 3.7583831792640927, "grad_norm": 0.5820212364196777, "learning_rate": 8.818502522513694e-06, "loss": 0.0536, "step": 41470 }, { "epoch": 3.758836324089179, "grad_norm": 0.5477057099342346, "learning_rate": 8.812474643448576e-06, "loss": 0.0612, "step": 41475 }, { "epoch": 3.759289468914265, "grad_norm": 0.49914196133613586, "learning_rate": 8.806448384436466e-06, "loss": 0.0592, "step": 41480 }, { "epoch": 3.759742613739351, "grad_norm": 0.8184438943862915, "learning_rate": 8.800423746080468e-06, "loss": 0.0627, "step": 41485 }, { "epoch": 3.7601957585644374, "grad_norm": 0.6412994861602783, "learning_rate": 8.794400728983546e-06, "loss": 0.0534, "step": 41490 }, { "epoch": 3.7606489033895234, "grad_norm": 0.6624844670295715, "learning_rate": 8.788379333748479e-06, "loss": 0.0492, "step": 41495 }, { "epoch": 3.7611020482146094, "grad_norm": 0.660589337348938, "learning_rate": 8.78235956097788e-06, "loss": 0.0633, "step": 41500 }, { "epoch": 3.7615551930396953, "grad_norm": 0.7129333019256592, "learning_rate": 8.776341411274233e-06, "loss": 0.0562, "step": 41505 }, { "epoch": 3.7620083378647817, "grad_norm": 0.5783237814903259, "learning_rate": 8.770324885239805e-06, "loss": 0.053, "step": 41510 }, { "epoch": 3.7624614826898677, "grad_norm": 0.5830681920051575, "learning_rate": 8.764309983476768e-06, "loss": 0.055, "step": 41515 }, { "epoch": 3.7629146275149536, "grad_norm": 0.5459400415420532, "learning_rate": 8.758296706587061e-06, "loss": 0.0538, "step": 41520 }, { "epoch": 3.7633677723400396, "grad_norm": 0.5084466338157654, "learning_rate": 8.752285055172507e-06, "loss": 0.055, "step": 41525 }, { "epoch": 3.763820917165126, "grad_norm": 0.5829471349716187, "learning_rate": 8.746275029834775e-06, "loss": 0.0513, "step": 41530 }, { "epoch": 3.764274061990212, "grad_norm": 0.5540586113929749, "learning_rate": 8.740266631175315e-06, "loss": 0.0691, "step": 41535 }, { "epoch": 3.764727206815298, "grad_norm": 0.6696521043777466, "learning_rate": 8.734259859795486e-06, "loss": 0.0584, "step": 41540 }, { "epoch": 3.7651803516403843, "grad_norm": 0.6056371331214905, "learning_rate": 8.72825471629641e-06, "loss": 0.0526, "step": 41545 }, { "epoch": 3.7656334964654703, "grad_norm": 0.4989579916000366, "learning_rate": 8.722251201279112e-06, "loss": 0.0548, "step": 41550 }, { "epoch": 3.7660866412905563, "grad_norm": 0.5596717596054077, "learning_rate": 8.716249315344415e-06, "loss": 0.0597, "step": 41555 }, { "epoch": 3.7665397861156427, "grad_norm": 0.60268634557724, "learning_rate": 8.710249059092987e-06, "loss": 0.0545, "step": 41560 }, { "epoch": 3.7669929309407286, "grad_norm": 0.6434841752052307, "learning_rate": 8.704250433125351e-06, "loss": 0.0575, "step": 41565 }, { "epoch": 3.7674460757658146, "grad_norm": 0.5780729651451111, "learning_rate": 8.698253438041829e-06, "loss": 0.0591, "step": 41570 }, { "epoch": 3.767899220590901, "grad_norm": 0.7195630073547363, "learning_rate": 8.692258074442619e-06, "loss": 0.059, "step": 41575 }, { "epoch": 3.768352365415987, "grad_norm": 0.5708093643188477, "learning_rate": 8.686264342927736e-06, "loss": 0.0521, "step": 41580 }, { "epoch": 3.768805510241073, "grad_norm": 0.7372010946273804, "learning_rate": 8.68027224409702e-06, "loss": 0.0602, "step": 41585 }, { "epoch": 3.7692586550661593, "grad_norm": 0.4936991333961487, "learning_rate": 8.674281778550195e-06, "loss": 0.0492, "step": 41590 }, { "epoch": 3.7697117998912453, "grad_norm": 0.6568192839622498, "learning_rate": 8.66829294688675e-06, "loss": 0.0577, "step": 41595 }, { "epoch": 3.7701649447163312, "grad_norm": 0.5177028179168701, "learning_rate": 8.662305749706079e-06, "loss": 0.05, "step": 41600 }, { "epoch": 3.7706180895414176, "grad_norm": 0.5712594389915466, "learning_rate": 8.656320187607368e-06, "loss": 0.0962, "step": 41605 }, { "epoch": 3.7710712343665036, "grad_norm": 0.5223526358604431, "learning_rate": 8.65033626118965e-06, "loss": 0.0587, "step": 41610 }, { "epoch": 3.7715243791915896, "grad_norm": 0.5062777996063232, "learning_rate": 8.644353971051825e-06, "loss": 0.0697, "step": 41615 }, { "epoch": 3.771977524016676, "grad_norm": 0.4965650737285614, "learning_rate": 8.638373317792563e-06, "loss": 0.0548, "step": 41620 }, { "epoch": 3.772430668841762, "grad_norm": 0.7099937796592712, "learning_rate": 8.632394302010444e-06, "loss": 0.0629, "step": 41625 }, { "epoch": 3.772883813666848, "grad_norm": 0.6484094858169556, "learning_rate": 8.626416924303833e-06, "loss": 0.0493, "step": 41630 }, { "epoch": 3.7733369584919343, "grad_norm": 0.5353913307189941, "learning_rate": 8.620441185270942e-06, "loss": 0.0495, "step": 41635 }, { "epoch": 3.7737901033170203, "grad_norm": 0.6078851819038391, "learning_rate": 8.614467085509845e-06, "loss": 0.0558, "step": 41640 }, { "epoch": 3.774243248142106, "grad_norm": 0.5480092167854309, "learning_rate": 8.608494625618419e-06, "loss": 0.0536, "step": 41645 }, { "epoch": 3.774696392967192, "grad_norm": 0.542544424533844, "learning_rate": 8.602523806194393e-06, "loss": 0.0581, "step": 41650 }, { "epoch": 3.7751495377922786, "grad_norm": 0.5595751404762268, "learning_rate": 8.59655462783533e-06, "loss": 0.0796, "step": 41655 }, { "epoch": 3.7756026826173645, "grad_norm": 0.49264585971832275, "learning_rate": 8.590587091138614e-06, "loss": 0.0502, "step": 41660 }, { "epoch": 3.7760558274424505, "grad_norm": 0.6380233764648438, "learning_rate": 8.584621196701498e-06, "loss": 0.0549, "step": 41665 }, { "epoch": 3.7765089722675365, "grad_norm": 0.5891509056091309, "learning_rate": 8.57865694512104e-06, "loss": 0.0448, "step": 41670 }, { "epoch": 3.776962117092623, "grad_norm": 0.5057823657989502, "learning_rate": 8.57269433699415e-06, "loss": 0.052, "step": 41675 }, { "epoch": 3.777415261917709, "grad_norm": 0.4546133875846863, "learning_rate": 8.566733372917562e-06, "loss": 0.0513, "step": 41680 }, { "epoch": 3.777868406742795, "grad_norm": 0.5032958388328552, "learning_rate": 8.560774053487844e-06, "loss": 0.047, "step": 41685 }, { "epoch": 3.778321551567881, "grad_norm": 0.641002893447876, "learning_rate": 8.554816379301425e-06, "loss": 0.0538, "step": 41690 }, { "epoch": 3.778774696392967, "grad_norm": 0.5910135507583618, "learning_rate": 8.548860350954543e-06, "loss": 0.0494, "step": 41695 }, { "epoch": 3.779227841218053, "grad_norm": 0.5521292090415955, "learning_rate": 8.54290596904328e-06, "loss": 0.0513, "step": 41700 }, { "epoch": 3.7796809860431395, "grad_norm": 0.6794717311859131, "learning_rate": 8.536953234163547e-06, "loss": 0.0538, "step": 41705 }, { "epoch": 3.7801341308682255, "grad_norm": 0.5363934636116028, "learning_rate": 8.531002146911103e-06, "loss": 0.0566, "step": 41710 }, { "epoch": 3.7805872756933114, "grad_norm": 0.713385820388794, "learning_rate": 8.52505270788152e-06, "loss": 0.0561, "step": 41715 }, { "epoch": 3.781040420518398, "grad_norm": 0.5171129107475281, "learning_rate": 8.519104917670242e-06, "loss": 0.047, "step": 41720 }, { "epoch": 3.781493565343484, "grad_norm": 0.64439857006073, "learning_rate": 8.513158776872516e-06, "loss": 0.0549, "step": 41725 }, { "epoch": 3.7819467101685698, "grad_norm": 0.6677426099777222, "learning_rate": 8.507214286083432e-06, "loss": 0.0576, "step": 41730 }, { "epoch": 3.782399854993656, "grad_norm": 0.5562488436698914, "learning_rate": 8.501271445897915e-06, "loss": 0.0558, "step": 41735 }, { "epoch": 3.782852999818742, "grad_norm": 0.6492145657539368, "learning_rate": 8.495330256910724e-06, "loss": 0.0587, "step": 41740 }, { "epoch": 3.783306144643828, "grad_norm": 0.4767974615097046, "learning_rate": 8.489390719716467e-06, "loss": 0.0531, "step": 41745 }, { "epoch": 3.7837592894689145, "grad_norm": 0.6261164546012878, "learning_rate": 8.48345283490957e-06, "loss": 0.0548, "step": 41750 }, { "epoch": 3.7842124342940004, "grad_norm": 0.43850457668304443, "learning_rate": 8.477516603084296e-06, "loss": 0.056, "step": 41755 }, { "epoch": 3.7846655791190864, "grad_norm": 0.5882686376571655, "learning_rate": 8.471582024834746e-06, "loss": 0.0501, "step": 41760 }, { "epoch": 3.785118723944173, "grad_norm": 0.5151940584182739, "learning_rate": 8.465649100754843e-06, "loss": 0.0485, "step": 41765 }, { "epoch": 3.7855718687692588, "grad_norm": 0.566463053226471, "learning_rate": 8.459717831438379e-06, "loss": 0.0565, "step": 41770 }, { "epoch": 3.7860250135943447, "grad_norm": 0.5869277119636536, "learning_rate": 8.453788217478944e-06, "loss": 0.0561, "step": 41775 }, { "epoch": 3.7864781584194307, "grad_norm": 0.64208984375, "learning_rate": 8.447860259469973e-06, "loss": 0.0519, "step": 41780 }, { "epoch": 3.786931303244517, "grad_norm": 0.5448466539382935, "learning_rate": 8.441933958004755e-06, "loss": 0.0522, "step": 41785 }, { "epoch": 3.787384448069603, "grad_norm": 0.6060240268707275, "learning_rate": 8.436009313676368e-06, "loss": 0.0512, "step": 41790 }, { "epoch": 3.787837592894689, "grad_norm": 0.5814392566680908, "learning_rate": 8.430086327077779e-06, "loss": 0.0539, "step": 41795 }, { "epoch": 3.788290737719775, "grad_norm": 0.5590090155601501, "learning_rate": 8.42416499880175e-06, "loss": 0.0655, "step": 41800 }, { "epoch": 3.7887438825448614, "grad_norm": 0.5661503672599792, "learning_rate": 8.418245329440885e-06, "loss": 0.0502, "step": 41805 }, { "epoch": 3.7891970273699473, "grad_norm": 0.4840131998062134, "learning_rate": 8.412327319587648e-06, "loss": 0.0456, "step": 41810 }, { "epoch": 3.7896501721950333, "grad_norm": 0.4929159879684448, "learning_rate": 8.406410969834283e-06, "loss": 0.0565, "step": 41815 }, { "epoch": 3.7901033170201197, "grad_norm": 0.4770697057247162, "learning_rate": 8.400496280772927e-06, "loss": 0.0505, "step": 41820 }, { "epoch": 3.7905564618452057, "grad_norm": 0.5729590654373169, "learning_rate": 8.394583252995513e-06, "loss": 0.061, "step": 41825 }, { "epoch": 3.7910096066702916, "grad_norm": 0.5424108505249023, "learning_rate": 8.388671887093817e-06, "loss": 0.0506, "step": 41830 }, { "epoch": 3.791462751495378, "grad_norm": 0.6812378764152527, "learning_rate": 8.382762183659465e-06, "loss": 0.0574, "step": 41835 }, { "epoch": 3.791915896320464, "grad_norm": 0.6656157970428467, "learning_rate": 8.376854143283877e-06, "loss": 0.0537, "step": 41840 }, { "epoch": 3.79236904114555, "grad_norm": 0.5693742036819458, "learning_rate": 8.370947766558354e-06, "loss": 0.0723, "step": 41845 }, { "epoch": 3.7928221859706364, "grad_norm": 0.5958921313285828, "learning_rate": 8.365043054074e-06, "loss": 0.0478, "step": 41850 }, { "epoch": 3.7932753307957223, "grad_norm": 0.5440536737442017, "learning_rate": 8.359140006421754e-06, "loss": 0.0586, "step": 41855 }, { "epoch": 3.7937284756208083, "grad_norm": 0.6116217970848083, "learning_rate": 8.353238624192413e-06, "loss": 0.0561, "step": 41860 }, { "epoch": 3.7941816204458947, "grad_norm": 0.4945302903652191, "learning_rate": 8.347338907976567e-06, "loss": 0.0471, "step": 41865 }, { "epoch": 3.7946347652709806, "grad_norm": 0.5343263149261475, "learning_rate": 8.341440858364677e-06, "loss": 0.063, "step": 41870 }, { "epoch": 3.7950879100960666, "grad_norm": 0.7852511405944824, "learning_rate": 8.33554447594702e-06, "loss": 0.0568, "step": 41875 }, { "epoch": 3.795541054921153, "grad_norm": 0.6637075543403625, "learning_rate": 8.329649761313696e-06, "loss": 0.0549, "step": 41880 }, { "epoch": 3.795994199746239, "grad_norm": 0.5792865753173828, "learning_rate": 8.323756715054676e-06, "loss": 0.0497, "step": 41885 }, { "epoch": 3.796447344571325, "grad_norm": 0.46413901448249817, "learning_rate": 8.317865337759703e-06, "loss": 0.0603, "step": 41890 }, { "epoch": 3.7969004893964113, "grad_norm": 0.5928824543952942, "learning_rate": 8.311975630018418e-06, "loss": 0.0532, "step": 41895 }, { "epoch": 3.7973536342214973, "grad_norm": 0.7051709890365601, "learning_rate": 8.30608759242025e-06, "loss": 0.0529, "step": 41900 }, { "epoch": 3.7978067790465833, "grad_norm": 0.5702993273735046, "learning_rate": 8.300201225554482e-06, "loss": 0.0579, "step": 41905 }, { "epoch": 3.798259923871669, "grad_norm": 0.5547850728034973, "learning_rate": 8.294316530010218e-06, "loss": 0.0487, "step": 41910 }, { "epoch": 3.7987130686967556, "grad_norm": 0.7222471833229065, "learning_rate": 8.288433506376398e-06, "loss": 0.0538, "step": 41915 }, { "epoch": 3.7991662135218416, "grad_norm": 0.6476911902427673, "learning_rate": 8.282552155241808e-06, "loss": 0.0582, "step": 41920 }, { "epoch": 3.7996193583469275, "grad_norm": 0.5761061906814575, "learning_rate": 8.276672477195049e-06, "loss": 0.0613, "step": 41925 }, { "epoch": 3.8000725031720135, "grad_norm": 0.48722922801971436, "learning_rate": 8.270794472824564e-06, "loss": 0.0479, "step": 41930 }, { "epoch": 3.8005256479971, "grad_norm": 0.5318543314933777, "learning_rate": 8.264918142718623e-06, "loss": 0.0494, "step": 41935 }, { "epoch": 3.800978792822186, "grad_norm": 0.4868148863315582, "learning_rate": 8.259043487465323e-06, "loss": 0.0534, "step": 41940 }, { "epoch": 3.801431937647272, "grad_norm": 0.6708976626396179, "learning_rate": 8.25317050765262e-06, "loss": 0.0503, "step": 41945 }, { "epoch": 3.8018850824723582, "grad_norm": 0.5057882070541382, "learning_rate": 8.24729920386827e-06, "loss": 0.0533, "step": 41950 }, { "epoch": 3.802338227297444, "grad_norm": 0.6612192392349243, "learning_rate": 8.24142957669988e-06, "loss": 0.0556, "step": 41955 }, { "epoch": 3.80279137212253, "grad_norm": 0.5591065287590027, "learning_rate": 8.235561626734887e-06, "loss": 0.0601, "step": 41960 }, { "epoch": 3.8032445169476166, "grad_norm": 0.5628452897071838, "learning_rate": 8.229695354560541e-06, "loss": 0.053, "step": 41965 }, { "epoch": 3.8036976617727025, "grad_norm": 0.4713382422924042, "learning_rate": 8.223830760763962e-06, "loss": 0.0544, "step": 41970 }, { "epoch": 3.8041508065977885, "grad_norm": 0.6395531892776489, "learning_rate": 8.217967845932073e-06, "loss": 0.0519, "step": 41975 }, { "epoch": 3.804603951422875, "grad_norm": 0.42475631833076477, "learning_rate": 8.212106610651634e-06, "loss": 0.0492, "step": 41980 }, { "epoch": 3.805057096247961, "grad_norm": 0.7281668782234192, "learning_rate": 8.206247055509242e-06, "loss": 0.068, "step": 41985 }, { "epoch": 3.805510241073047, "grad_norm": 0.43078815937042236, "learning_rate": 8.200389181091314e-06, "loss": 0.0494, "step": 41990 }, { "epoch": 3.805963385898133, "grad_norm": 0.6193016767501831, "learning_rate": 8.194532987984122e-06, "loss": 0.0563, "step": 41995 }, { "epoch": 3.806416530723219, "grad_norm": 0.6640599370002747, "learning_rate": 8.188678476773751e-06, "loss": 0.0554, "step": 42000 }, { "epoch": 3.806869675548305, "grad_norm": 0.537300169467926, "learning_rate": 8.18282564804612e-06, "loss": 0.0534, "step": 42005 }, { "epoch": 3.8073228203733915, "grad_norm": 0.532257080078125, "learning_rate": 8.176974502386983e-06, "loss": 0.0538, "step": 42010 }, { "epoch": 3.8077759651984775, "grad_norm": 0.58072429895401, "learning_rate": 8.171125040381918e-06, "loss": 0.0475, "step": 42015 }, { "epoch": 3.8082291100235635, "grad_norm": 0.4452934265136719, "learning_rate": 8.165277262616355e-06, "loss": 0.0514, "step": 42020 }, { "epoch": 3.80868225484865, "grad_norm": 0.5184932351112366, "learning_rate": 8.159431169675536e-06, "loss": 0.0485, "step": 42025 }, { "epoch": 3.809135399673736, "grad_norm": 0.6051374673843384, "learning_rate": 8.153586762144538e-06, "loss": 0.0552, "step": 42030 }, { "epoch": 3.809588544498822, "grad_norm": 0.6595606803894043, "learning_rate": 8.147744040608262e-06, "loss": 0.058, "step": 42035 }, { "epoch": 3.810041689323908, "grad_norm": 0.46177107095718384, "learning_rate": 8.141903005651472e-06, "loss": 0.0583, "step": 42040 }, { "epoch": 3.810494834148994, "grad_norm": 0.7462352514266968, "learning_rate": 8.136063657858722e-06, "loss": 0.0601, "step": 42045 }, { "epoch": 3.81094797897408, "grad_norm": 0.5887739658355713, "learning_rate": 8.130225997814428e-06, "loss": 0.0553, "step": 42050 }, { "epoch": 3.811401123799166, "grad_norm": 0.7064893245697021, "learning_rate": 8.124390026102815e-06, "loss": 0.062, "step": 42055 }, { "epoch": 3.8118542686242525, "grad_norm": 0.49018555879592896, "learning_rate": 8.118555743307946e-06, "loss": 0.0484, "step": 42060 }, { "epoch": 3.8123074134493384, "grad_norm": 0.5373245477676392, "learning_rate": 8.11272315001374e-06, "loss": 0.0504, "step": 42065 }, { "epoch": 3.8127605582744244, "grad_norm": 0.5504798889160156, "learning_rate": 8.106892246803896e-06, "loss": 0.059, "step": 42070 }, { "epoch": 3.8132137030995104, "grad_norm": 0.5453303456306458, "learning_rate": 8.101063034261993e-06, "loss": 0.052, "step": 42075 }, { "epoch": 3.8136668479245968, "grad_norm": 0.45534244179725647, "learning_rate": 8.095235512971414e-06, "loss": 0.0584, "step": 42080 }, { "epoch": 3.8141199927496827, "grad_norm": 0.5195428133010864, "learning_rate": 8.089409683515373e-06, "loss": 0.0477, "step": 42085 }, { "epoch": 3.8145731375747687, "grad_norm": 0.5335431694984436, "learning_rate": 8.083585546476944e-06, "loss": 0.0569, "step": 42090 }, { "epoch": 3.815026282399855, "grad_norm": 0.6132521033287048, "learning_rate": 8.077763102438973e-06, "loss": 0.0545, "step": 42095 }, { "epoch": 3.815479427224941, "grad_norm": 0.8293974995613098, "learning_rate": 8.071942351984202e-06, "loss": 0.0621, "step": 42100 }, { "epoch": 3.815932572050027, "grad_norm": 0.7207264304161072, "learning_rate": 8.06612329569516e-06, "loss": 0.0621, "step": 42105 }, { "epoch": 3.8163857168751134, "grad_norm": 0.7129797339439392, "learning_rate": 8.06030593415422e-06, "loss": 0.0531, "step": 42110 }, { "epoch": 3.8168388617001994, "grad_norm": 0.547874927520752, "learning_rate": 8.0544902679436e-06, "loss": 0.08, "step": 42115 }, { "epoch": 3.8172920065252853, "grad_norm": 0.5322951674461365, "learning_rate": 8.048676297645308e-06, "loss": 0.0479, "step": 42120 }, { "epoch": 3.8177451513503717, "grad_norm": 0.547518789768219, "learning_rate": 8.042864023841232e-06, "loss": 0.0543, "step": 42125 }, { "epoch": 3.8181982961754577, "grad_norm": 0.48507797718048096, "learning_rate": 8.037053447113054e-06, "loss": 0.047, "step": 42130 }, { "epoch": 3.8186514410005437, "grad_norm": 0.7088760137557983, "learning_rate": 8.031244568042296e-06, "loss": 0.0527, "step": 42135 }, { "epoch": 3.81910458582563, "grad_norm": 0.6656632423400879, "learning_rate": 8.025437387210333e-06, "loss": 0.0502, "step": 42140 }, { "epoch": 3.819557730650716, "grad_norm": 0.5118558406829834, "learning_rate": 8.019631905198315e-06, "loss": 0.0518, "step": 42145 }, { "epoch": 3.820010875475802, "grad_norm": 0.5386388897895813, "learning_rate": 8.013828122587288e-06, "loss": 0.0526, "step": 42150 }, { "epoch": 3.8204640203008884, "grad_norm": 0.5841344594955444, "learning_rate": 8.008026039958086e-06, "loss": 0.0485, "step": 42155 }, { "epoch": 3.8209171651259743, "grad_norm": 0.5274123549461365, "learning_rate": 8.00222565789137e-06, "loss": 0.0514, "step": 42160 }, { "epoch": 3.8213703099510603, "grad_norm": 0.5286490321159363, "learning_rate": 7.996426976967677e-06, "loss": 0.0497, "step": 42165 }, { "epoch": 3.8218234547761467, "grad_norm": 0.5613086223602295, "learning_rate": 7.990629997767302e-06, "loss": 0.0502, "step": 42170 }, { "epoch": 3.8222765996012327, "grad_norm": 0.5651296377182007, "learning_rate": 7.984834720870436e-06, "loss": 0.049, "step": 42175 }, { "epoch": 3.8227297444263186, "grad_norm": 0.5917447805404663, "learning_rate": 7.979041146857061e-06, "loss": 0.0676, "step": 42180 }, { "epoch": 3.8231828892514046, "grad_norm": 0.49244967103004456, "learning_rate": 7.973249276306993e-06, "loss": 0.0485, "step": 42185 }, { "epoch": 3.823636034076491, "grad_norm": 0.6482919454574585, "learning_rate": 7.967459109799911e-06, "loss": 0.0535, "step": 42190 }, { "epoch": 3.824089178901577, "grad_norm": 0.5219781994819641, "learning_rate": 7.961670647915263e-06, "loss": 0.0486, "step": 42195 }, { "epoch": 3.824542323726663, "grad_norm": 0.6079457998275757, "learning_rate": 7.955883891232385e-06, "loss": 0.0522, "step": 42200 }, { "epoch": 3.824995468551749, "grad_norm": 0.6085450053215027, "learning_rate": 7.950098840330408e-06, "loss": 0.0635, "step": 42205 }, { "epoch": 3.8254486133768353, "grad_norm": 0.4820556342601776, "learning_rate": 7.944315495788297e-06, "loss": 0.0504, "step": 42210 }, { "epoch": 3.8259017582019212, "grad_norm": 0.5877859592437744, "learning_rate": 7.938533858184868e-06, "loss": 0.0509, "step": 42215 }, { "epoch": 3.826354903027007, "grad_norm": 0.6463460922241211, "learning_rate": 7.932753928098726e-06, "loss": 0.0513, "step": 42220 }, { "epoch": 3.8268080478520936, "grad_norm": 0.5265727043151855, "learning_rate": 7.926975706108348e-06, "loss": 0.0637, "step": 42225 }, { "epoch": 3.8272611926771796, "grad_norm": 0.4917299151420593, "learning_rate": 7.921199192792014e-06, "loss": 0.0526, "step": 42230 }, { "epoch": 3.8277143375022655, "grad_norm": 0.4959161579608917, "learning_rate": 7.915424388727827e-06, "loss": 0.0611, "step": 42235 }, { "epoch": 3.828167482327352, "grad_norm": 0.4711969792842865, "learning_rate": 7.909651294493761e-06, "loss": 0.0559, "step": 42240 }, { "epoch": 3.828620627152438, "grad_norm": 0.5742162466049194, "learning_rate": 7.903879910667556e-06, "loss": 0.0518, "step": 42245 }, { "epoch": 3.829073771977524, "grad_norm": 0.6086093187332153, "learning_rate": 7.898110237826833e-06, "loss": 0.0569, "step": 42250 }, { "epoch": 3.8295269168026103, "grad_norm": 0.46767371892929077, "learning_rate": 7.892342276549022e-06, "loss": 0.0499, "step": 42255 }, { "epoch": 3.829980061627696, "grad_norm": 0.6156156063079834, "learning_rate": 7.886576027411376e-06, "loss": 0.0566, "step": 42260 }, { "epoch": 3.830433206452782, "grad_norm": 0.46399804949760437, "learning_rate": 7.880811490990992e-06, "loss": 0.0567, "step": 42265 }, { "epoch": 3.8308863512778686, "grad_norm": 0.5280250906944275, "learning_rate": 7.875048667864769e-06, "loss": 0.0515, "step": 42270 }, { "epoch": 3.8313394961029545, "grad_norm": 0.5170708298683167, "learning_rate": 7.869287558609473e-06, "loss": 0.046, "step": 42275 }, { "epoch": 3.8317926409280405, "grad_norm": 0.4694405198097229, "learning_rate": 7.863528163801668e-06, "loss": 0.0506, "step": 42280 }, { "epoch": 3.832245785753127, "grad_norm": 0.6639689207077026, "learning_rate": 7.85777048401776e-06, "loss": 0.0473, "step": 42285 }, { "epoch": 3.832698930578213, "grad_norm": 0.5782796740531921, "learning_rate": 7.852014519833966e-06, "loss": 0.0513, "step": 42290 }, { "epoch": 3.833152075403299, "grad_norm": 0.7043532729148865, "learning_rate": 7.846260271826366e-06, "loss": 0.055, "step": 42295 }, { "epoch": 3.8336052202283852, "grad_norm": 0.5602699518203735, "learning_rate": 7.840507740570834e-06, "loss": 0.0528, "step": 42300 }, { "epoch": 3.834058365053471, "grad_norm": 0.6752690076828003, "learning_rate": 7.834756926643092e-06, "loss": 0.0696, "step": 42305 }, { "epoch": 3.834511509878557, "grad_norm": 0.5749796628952026, "learning_rate": 7.829007830618679e-06, "loss": 0.0562, "step": 42310 }, { "epoch": 3.8349646547036436, "grad_norm": 0.5310213565826416, "learning_rate": 7.823260453072956e-06, "loss": 0.051, "step": 42315 }, { "epoch": 3.8354177995287295, "grad_norm": 0.5132745504379272, "learning_rate": 7.817514794581143e-06, "loss": 0.0611, "step": 42320 }, { "epoch": 3.8358709443538155, "grad_norm": 0.5166880488395691, "learning_rate": 7.811770855718259e-06, "loss": 0.0518, "step": 42325 }, { "epoch": 3.8363240891789014, "grad_norm": 0.6766800880432129, "learning_rate": 7.806028637059158e-06, "loss": 0.0569, "step": 42330 }, { "epoch": 3.8367772340039874, "grad_norm": 0.6057985424995422, "learning_rate": 7.800288139178522e-06, "loss": 0.0485, "step": 42335 }, { "epoch": 3.837230378829074, "grad_norm": 0.5981253981590271, "learning_rate": 7.794549362650855e-06, "loss": 0.0566, "step": 42340 }, { "epoch": 3.8376835236541598, "grad_norm": 0.534173846244812, "learning_rate": 7.788812308050515e-06, "loss": 0.0676, "step": 42345 }, { "epoch": 3.8381366684792457, "grad_norm": 0.5341783761978149, "learning_rate": 7.783076975951653e-06, "loss": 0.0583, "step": 42350 }, { "epoch": 3.838589813304332, "grad_norm": 0.6639144420623779, "learning_rate": 7.777343366928273e-06, "loss": 0.0681, "step": 42355 }, { "epoch": 3.839042958129418, "grad_norm": 0.6482329964637756, "learning_rate": 7.771611481554186e-06, "loss": 0.0517, "step": 42360 }, { "epoch": 3.839496102954504, "grad_norm": 0.5637864470481873, "learning_rate": 7.76588132040304e-06, "loss": 0.0505, "step": 42365 }, { "epoch": 3.8399492477795905, "grad_norm": 0.7223156690597534, "learning_rate": 7.760152884048325e-06, "loss": 0.0705, "step": 42370 }, { "epoch": 3.8404023926046764, "grad_norm": 0.4501446783542633, "learning_rate": 7.75442617306334e-06, "loss": 0.0639, "step": 42375 }, { "epoch": 3.8408555374297624, "grad_norm": 0.4684397280216217, "learning_rate": 7.74870118802121e-06, "loss": 0.0476, "step": 42380 }, { "epoch": 3.841308682254849, "grad_norm": 0.5682381391525269, "learning_rate": 7.742977929494899e-06, "loss": 0.0592, "step": 42385 }, { "epoch": 3.8417618270799347, "grad_norm": 0.6285183429718018, "learning_rate": 7.73725639805718e-06, "loss": 0.0487, "step": 42390 }, { "epoch": 3.8422149719050207, "grad_norm": 0.5653966069221497, "learning_rate": 7.731536594280686e-06, "loss": 0.0553, "step": 42395 }, { "epoch": 3.842668116730107, "grad_norm": 0.49126988649368286, "learning_rate": 7.725818518737848e-06, "loss": 0.0503, "step": 42400 }, { "epoch": 3.843121261555193, "grad_norm": 0.6614810824394226, "learning_rate": 7.72010217200093e-06, "loss": 0.0579, "step": 42405 }, { "epoch": 3.843574406380279, "grad_norm": 0.5628235936164856, "learning_rate": 7.714387554642034e-06, "loss": 0.0607, "step": 42410 }, { "epoch": 3.8440275512053654, "grad_norm": 0.5327675938606262, "learning_rate": 7.70867466723306e-06, "loss": 0.0562, "step": 42415 }, { "epoch": 3.8444806960304514, "grad_norm": 0.5157397389411926, "learning_rate": 7.702963510345781e-06, "loss": 0.0554, "step": 42420 }, { "epoch": 3.8449338408555374, "grad_norm": 0.5794927477836609, "learning_rate": 7.697254084551761e-06, "loss": 0.0538, "step": 42425 }, { "epoch": 3.8453869856806238, "grad_norm": 0.5169802308082581, "learning_rate": 7.691546390422393e-06, "loss": 0.0502, "step": 42430 }, { "epoch": 3.8458401305057097, "grad_norm": 0.44749918580055237, "learning_rate": 7.685840428528931e-06, "loss": 0.0538, "step": 42435 }, { "epoch": 3.8462932753307957, "grad_norm": 0.5531393885612488, "learning_rate": 7.680136199442394e-06, "loss": 0.0555, "step": 42440 }, { "epoch": 3.846746420155882, "grad_norm": 0.5289880037307739, "learning_rate": 7.6744337037337e-06, "loss": 0.0527, "step": 42445 }, { "epoch": 3.847199564980968, "grad_norm": 0.635259747505188, "learning_rate": 7.668732941973522e-06, "loss": 0.0496, "step": 42450 }, { "epoch": 3.847652709806054, "grad_norm": 0.5173382759094238, "learning_rate": 7.663033914732414e-06, "loss": 0.0518, "step": 42455 }, { "epoch": 3.84810585463114, "grad_norm": 0.5675581693649292, "learning_rate": 7.657336622580735e-06, "loss": 0.0523, "step": 42460 }, { "epoch": 3.8485589994562264, "grad_norm": 0.5690494179725647, "learning_rate": 7.651641066088663e-06, "loss": 0.0493, "step": 42465 }, { "epoch": 3.8490121442813123, "grad_norm": 0.5271755456924438, "learning_rate": 7.645947245826232e-06, "loss": 0.051, "step": 42470 }, { "epoch": 3.8494652891063983, "grad_norm": 0.509412944316864, "learning_rate": 7.64025516236325e-06, "loss": 0.0543, "step": 42475 }, { "epoch": 3.8499184339314843, "grad_norm": 0.5142716765403748, "learning_rate": 7.63456481626941e-06, "loss": 0.0576, "step": 42480 }, { "epoch": 3.8503715787565707, "grad_norm": 0.636634886264801, "learning_rate": 7.628876208114191e-06, "loss": 0.0466, "step": 42485 }, { "epoch": 3.8508247235816566, "grad_norm": 0.4920227825641632, "learning_rate": 7.623189338466908e-06, "loss": 0.0787, "step": 42490 }, { "epoch": 3.8512778684067426, "grad_norm": 0.5444526076316833, "learning_rate": 7.617504207896725e-06, "loss": 0.0534, "step": 42495 }, { "epoch": 3.851731013231829, "grad_norm": 0.6508980393409729, "learning_rate": 7.611820816972582e-06, "loss": 0.0633, "step": 42500 }, { "epoch": 3.852184158056915, "grad_norm": 0.6495841145515442, "learning_rate": 7.606139166263296e-06, "loss": 0.0671, "step": 42505 }, { "epoch": 3.852637302882001, "grad_norm": 0.42015644907951355, "learning_rate": 7.600459256337483e-06, "loss": 0.0533, "step": 42510 }, { "epoch": 3.8530904477070873, "grad_norm": 0.43118447065353394, "learning_rate": 7.5947810877635835e-06, "loss": 0.0473, "step": 42515 }, { "epoch": 3.8535435925321733, "grad_norm": 0.5895166397094727, "learning_rate": 7.589104661109889e-06, "loss": 0.0514, "step": 42520 }, { "epoch": 3.8539967373572592, "grad_norm": 0.6357372403144836, "learning_rate": 7.583429976944473e-06, "loss": 0.0546, "step": 42525 }, { "epoch": 3.8544498821823456, "grad_norm": 0.7332749962806702, "learning_rate": 7.5777570358352785e-06, "loss": 0.0692, "step": 42530 }, { "epoch": 3.8549030270074316, "grad_norm": 0.6217492818832397, "learning_rate": 7.572085838350051e-06, "loss": 0.0759, "step": 42535 }, { "epoch": 3.8553561718325176, "grad_norm": 0.645311713218689, "learning_rate": 7.5664163850563545e-06, "loss": 0.055, "step": 42540 }, { "epoch": 3.855809316657604, "grad_norm": 0.6093646287918091, "learning_rate": 7.560748676521609e-06, "loss": 0.0493, "step": 42545 }, { "epoch": 3.85626246148269, "grad_norm": 0.5568463802337646, "learning_rate": 7.5550827133130305e-06, "loss": 0.0541, "step": 42550 }, { "epoch": 3.856715606307776, "grad_norm": 0.5492305755615234, "learning_rate": 7.5494184959976735e-06, "loss": 0.051, "step": 42555 }, { "epoch": 3.8571687511328623, "grad_norm": 0.5196890234947205, "learning_rate": 7.54375602514241e-06, "loss": 0.0472, "step": 42560 }, { "epoch": 3.8576218959579482, "grad_norm": 0.7238665223121643, "learning_rate": 7.538095301313938e-06, "loss": 0.047, "step": 42565 }, { "epoch": 3.858075040783034, "grad_norm": 0.5323355197906494, "learning_rate": 7.532436325078798e-06, "loss": 0.0536, "step": 42570 }, { "epoch": 3.8585281856081206, "grad_norm": 0.6759328246116638, "learning_rate": 7.526779097003336e-06, "loss": 0.0558, "step": 42575 }, { "epoch": 3.8589813304332066, "grad_norm": 0.580814778804779, "learning_rate": 7.52112361765373e-06, "loss": 0.0537, "step": 42580 }, { "epoch": 3.8594344752582925, "grad_norm": 0.6336444020271301, "learning_rate": 7.51546988759598e-06, "loss": 0.0463, "step": 42585 }, { "epoch": 3.8598876200833785, "grad_norm": 0.5448915958404541, "learning_rate": 7.509817907395905e-06, "loss": 0.0545, "step": 42590 }, { "epoch": 3.860340764908465, "grad_norm": 0.543252170085907, "learning_rate": 7.504167677619176e-06, "loss": 0.0485, "step": 42595 }, { "epoch": 3.860793909733551, "grad_norm": 0.5984760522842407, "learning_rate": 7.498519198831261e-06, "loss": 0.0627, "step": 42600 }, { "epoch": 3.861247054558637, "grad_norm": 0.511452853679657, "learning_rate": 7.492872471597459e-06, "loss": 0.0597, "step": 42605 }, { "epoch": 3.8617001993837228, "grad_norm": 0.489035964012146, "learning_rate": 7.487227496482902e-06, "loss": 0.0549, "step": 42610 }, { "epoch": 3.862153344208809, "grad_norm": 0.6218560338020325, "learning_rate": 7.481584274052533e-06, "loss": 0.0562, "step": 42615 }, { "epoch": 3.862606489033895, "grad_norm": 0.6130510568618774, "learning_rate": 7.475942804871125e-06, "loss": 0.0543, "step": 42620 }, { "epoch": 3.863059633858981, "grad_norm": 0.6566656231880188, "learning_rate": 7.470303089503294e-06, "loss": 0.0527, "step": 42625 }, { "epoch": 3.8635127786840675, "grad_norm": 0.46177053451538086, "learning_rate": 7.4646651285134566e-06, "loss": 0.0482, "step": 42630 }, { "epoch": 3.8639659235091535, "grad_norm": 0.616936981678009, "learning_rate": 7.45902892246586e-06, "loss": 0.0494, "step": 42635 }, { "epoch": 3.8644190683342394, "grad_norm": 0.6674731969833374, "learning_rate": 7.453394471924577e-06, "loss": 0.0608, "step": 42640 }, { "epoch": 3.864872213159326, "grad_norm": 0.5167615413665771, "learning_rate": 7.447761777453502e-06, "loss": 0.0464, "step": 42645 }, { "epoch": 3.865325357984412, "grad_norm": 0.5599720478057861, "learning_rate": 7.442130839616368e-06, "loss": 0.0626, "step": 42650 }, { "epoch": 3.8657785028094978, "grad_norm": 0.6196523308753967, "learning_rate": 7.4365016589767134e-06, "loss": 0.0498, "step": 42655 }, { "epoch": 3.866231647634584, "grad_norm": 0.6934120655059814, "learning_rate": 7.430874236097912e-06, "loss": 0.0524, "step": 42660 }, { "epoch": 3.86668479245967, "grad_norm": 0.5544579029083252, "learning_rate": 7.425248571543159e-06, "loss": 0.0524, "step": 42665 }, { "epoch": 3.867137937284756, "grad_norm": 0.6532593369483948, "learning_rate": 7.419624665875458e-06, "loss": 0.055, "step": 42670 }, { "epoch": 3.8675910821098425, "grad_norm": 0.3808981478214264, "learning_rate": 7.414002519657673e-06, "loss": 0.0533, "step": 42675 }, { "epoch": 3.8680442269349284, "grad_norm": 0.40818077325820923, "learning_rate": 7.40838213345246e-06, "loss": 0.043, "step": 42680 }, { "epoch": 3.8684973717600144, "grad_norm": 0.5826719403266907, "learning_rate": 7.4027635078223025e-06, "loss": 0.0544, "step": 42685 }, { "epoch": 3.868950516585101, "grad_norm": 0.4733178913593292, "learning_rate": 7.397146643329539e-06, "loss": 0.054, "step": 42690 }, { "epoch": 3.8694036614101868, "grad_norm": 0.7228952646255493, "learning_rate": 7.391531540536273e-06, "loss": 0.0583, "step": 42695 }, { "epoch": 3.8698568062352727, "grad_norm": 0.7529389262199402, "learning_rate": 7.385918200004491e-06, "loss": 0.0611, "step": 42700 }, { "epoch": 3.870309951060359, "grad_norm": 0.6132723689079285, "learning_rate": 7.380306622295971e-06, "loss": 0.0608, "step": 42705 }, { "epoch": 3.870763095885445, "grad_norm": 0.5860670208930969, "learning_rate": 7.3746968079723115e-06, "loss": 0.0494, "step": 42710 }, { "epoch": 3.871216240710531, "grad_norm": 0.5274845361709595, "learning_rate": 7.369088757594969e-06, "loss": 0.0535, "step": 42715 }, { "epoch": 3.8716693855356175, "grad_norm": 0.4448307454586029, "learning_rate": 7.363482471725169e-06, "loss": 0.0512, "step": 42720 }, { "epoch": 3.8721225303607034, "grad_norm": 0.948063850402832, "learning_rate": 7.357877950924014e-06, "loss": 0.0806, "step": 42725 }, { "epoch": 3.8725756751857894, "grad_norm": 0.5763237476348877, "learning_rate": 7.352275195752395e-06, "loss": 0.0544, "step": 42730 }, { "epoch": 3.8730288200108753, "grad_norm": 0.6064983606338501, "learning_rate": 7.346674206771034e-06, "loss": 0.077, "step": 42735 }, { "epoch": 3.8734819648359613, "grad_norm": 0.8771200776100159, "learning_rate": 7.3410749845405025e-06, "loss": 0.0738, "step": 42740 }, { "epoch": 3.8739351096610477, "grad_norm": 0.44560229778289795, "learning_rate": 7.335477529621137e-06, "loss": 0.0564, "step": 42745 }, { "epoch": 3.8743882544861337, "grad_norm": 0.646857500076294, "learning_rate": 7.329881842573163e-06, "loss": 0.0643, "step": 42750 }, { "epoch": 3.8748413993112196, "grad_norm": 0.5275158286094666, "learning_rate": 7.324287923956588e-06, "loss": 0.0463, "step": 42755 }, { "epoch": 3.875294544136306, "grad_norm": 0.51801598072052, "learning_rate": 7.318695774331244e-06, "loss": 0.0522, "step": 42760 }, { "epoch": 3.875747688961392, "grad_norm": 0.44795575737953186, "learning_rate": 7.31310539425682e-06, "loss": 0.054, "step": 42765 }, { "epoch": 3.876200833786478, "grad_norm": 0.5523860454559326, "learning_rate": 7.30751678429277e-06, "loss": 0.0555, "step": 42770 }, { "epoch": 3.8766539786115644, "grad_norm": 0.6550801992416382, "learning_rate": 7.301929944998431e-06, "loss": 0.0531, "step": 42775 }, { "epoch": 3.8771071234366503, "grad_norm": 0.5549466013908386, "learning_rate": 7.296344876932926e-06, "loss": 0.0585, "step": 42780 }, { "epoch": 3.8775602682617363, "grad_norm": 0.5368512868881226, "learning_rate": 7.290761580655201e-06, "loss": 0.048, "step": 42785 }, { "epoch": 3.8780134130868227, "grad_norm": 0.4807197153568268, "learning_rate": 7.285180056724059e-06, "loss": 0.0572, "step": 42790 }, { "epoch": 3.8784665579119086, "grad_norm": 0.4547024369239807, "learning_rate": 7.279600305698072e-06, "loss": 0.0521, "step": 42795 }, { "epoch": 3.8789197027369946, "grad_norm": 0.4816891551017761, "learning_rate": 7.274022328135683e-06, "loss": 0.049, "step": 42800 }, { "epoch": 3.879372847562081, "grad_norm": 0.5689274072647095, "learning_rate": 7.268446124595133e-06, "loss": 0.0556, "step": 42805 }, { "epoch": 3.879825992387167, "grad_norm": 0.5722986459732056, "learning_rate": 7.262871695634488e-06, "loss": 0.0652, "step": 42810 }, { "epoch": 3.880279137212253, "grad_norm": 0.4526970088481903, "learning_rate": 7.257299041811638e-06, "loss": 0.0478, "step": 42815 }, { "epoch": 3.8807322820373393, "grad_norm": 0.6076830625534058, "learning_rate": 7.2517281636842935e-06, "loss": 0.0526, "step": 42820 }, { "epoch": 3.8811854268624253, "grad_norm": 0.6668768525123596, "learning_rate": 7.246159061810001e-06, "loss": 0.0666, "step": 42825 }, { "epoch": 3.8816385716875113, "grad_norm": 0.5263954997062683, "learning_rate": 7.240591736746111e-06, "loss": 0.0608, "step": 42830 }, { "epoch": 3.8820917165125977, "grad_norm": 0.599942684173584, "learning_rate": 7.2350261890498015e-06, "loss": 0.0509, "step": 42835 }, { "epoch": 3.8825448613376836, "grad_norm": 0.5307844877243042, "learning_rate": 7.229462419278079e-06, "loss": 0.0482, "step": 42840 }, { "epoch": 3.8829980061627696, "grad_norm": 0.6739217638969421, "learning_rate": 7.223900427987757e-06, "loss": 0.0542, "step": 42845 }, { "epoch": 3.883451150987856, "grad_norm": 0.5655048489570618, "learning_rate": 7.218340215735495e-06, "loss": 0.0615, "step": 42850 }, { "epoch": 3.883904295812942, "grad_norm": 0.5364390015602112, "learning_rate": 7.212781783077757e-06, "loss": 0.0475, "step": 42855 }, { "epoch": 3.884357440638028, "grad_norm": 0.648395836353302, "learning_rate": 7.20722513057083e-06, "loss": 0.0546, "step": 42860 }, { "epoch": 3.884810585463114, "grad_norm": 0.48367607593536377, "learning_rate": 7.201670258770829e-06, "loss": 0.0529, "step": 42865 }, { "epoch": 3.8852637302882003, "grad_norm": 0.692696750164032, "learning_rate": 7.196117168233676e-06, "loss": 0.0565, "step": 42870 }, { "epoch": 3.8857168751132862, "grad_norm": 0.49413102865219116, "learning_rate": 7.1905658595151444e-06, "loss": 0.0615, "step": 42875 }, { "epoch": 3.886170019938372, "grad_norm": 0.5350160002708435, "learning_rate": 7.1850163331708e-06, "loss": 0.0566, "step": 42880 }, { "epoch": 3.886623164763458, "grad_norm": 0.5446521639823914, "learning_rate": 7.179468589756045e-06, "loss": 0.0615, "step": 42885 }, { "epoch": 3.8870763095885446, "grad_norm": 0.47386589646339417, "learning_rate": 7.173922629826097e-06, "loss": 0.0543, "step": 42890 }, { "epoch": 3.8875294544136305, "grad_norm": 0.4717663526535034, "learning_rate": 7.1683784539359895e-06, "loss": 0.0584, "step": 42895 }, { "epoch": 3.8879825992387165, "grad_norm": 0.5049431920051575, "learning_rate": 7.162836062640604e-06, "loss": 0.0506, "step": 42900 }, { "epoch": 3.888435744063803, "grad_norm": 0.5313361287117004, "learning_rate": 7.157295456494614e-06, "loss": 0.0492, "step": 42905 }, { "epoch": 3.888888888888889, "grad_norm": 0.4963032007217407, "learning_rate": 7.1517566360525284e-06, "loss": 0.0515, "step": 42910 }, { "epoch": 3.889342033713975, "grad_norm": 0.5509923100471497, "learning_rate": 7.14621960186867e-06, "loss": 0.0477, "step": 42915 }, { "epoch": 3.889795178539061, "grad_norm": 0.47444984316825867, "learning_rate": 7.140684354497182e-06, "loss": 0.0417, "step": 42920 }, { "epoch": 3.890248323364147, "grad_norm": 0.48333457112312317, "learning_rate": 7.135150894492049e-06, "loss": 0.0497, "step": 42925 }, { "epoch": 3.890701468189233, "grad_norm": 0.5594387054443359, "learning_rate": 7.129619222407055e-06, "loss": 0.0468, "step": 42930 }, { "epoch": 3.8911546130143195, "grad_norm": 0.6206011176109314, "learning_rate": 7.124089338795809e-06, "loss": 0.0527, "step": 42935 }, { "epoch": 3.8916077578394055, "grad_norm": 0.5601134300231934, "learning_rate": 7.118561244211738e-06, "loss": 0.0553, "step": 42940 }, { "epoch": 3.8920609026644915, "grad_norm": 0.39252617955207825, "learning_rate": 7.11303493920811e-06, "loss": 0.0577, "step": 42945 }, { "epoch": 3.892514047489578, "grad_norm": 0.5502803325653076, "learning_rate": 7.1075104243379905e-06, "loss": 0.0451, "step": 42950 }, { "epoch": 3.892967192314664, "grad_norm": 0.4390685260295868, "learning_rate": 7.1019877001542775e-06, "loss": 0.0504, "step": 42955 }, { "epoch": 3.8934203371397498, "grad_norm": 0.4871829152107239, "learning_rate": 7.096466767209686e-06, "loss": 0.0851, "step": 42960 }, { "epoch": 3.893873481964836, "grad_norm": 0.4430684745311737, "learning_rate": 7.090947626056746e-06, "loss": 0.0478, "step": 42965 }, { "epoch": 3.894326626789922, "grad_norm": 0.45529666543006897, "learning_rate": 7.085430277247829e-06, "loss": 0.0537, "step": 42970 }, { "epoch": 3.894779771615008, "grad_norm": 0.5834225416183472, "learning_rate": 7.079914721335104e-06, "loss": 0.0496, "step": 42975 }, { "epoch": 3.8952329164400945, "grad_norm": 0.5670252442359924, "learning_rate": 7.074400958870573e-06, "loss": 0.0502, "step": 42980 }, { "epoch": 3.8956860612651805, "grad_norm": 0.5646926760673523, "learning_rate": 7.068888990406056e-06, "loss": 0.0488, "step": 42985 }, { "epoch": 3.8961392060902664, "grad_norm": 0.5994097590446472, "learning_rate": 7.06337881649318e-06, "loss": 0.0524, "step": 42990 }, { "epoch": 3.8965923509153524, "grad_norm": 0.6181179881095886, "learning_rate": 7.057870437683431e-06, "loss": 0.0517, "step": 42995 }, { "epoch": 3.897045495740439, "grad_norm": 0.6726389527320862, "learning_rate": 7.052363854528057e-06, "loss": 0.0563, "step": 43000 }, { "epoch": 3.8974986405655248, "grad_norm": 0.49640244245529175, "learning_rate": 7.046859067578187e-06, "loss": 0.059, "step": 43005 }, { "epoch": 3.8979517853906107, "grad_norm": 0.4970128536224365, "learning_rate": 7.041356077384728e-06, "loss": 0.06, "step": 43010 }, { "epoch": 3.8984049302156967, "grad_norm": 0.461650550365448, "learning_rate": 7.035854884498416e-06, "loss": 0.0509, "step": 43015 }, { "epoch": 3.898858075040783, "grad_norm": 0.6002557873725891, "learning_rate": 7.030355489469837e-06, "loss": 0.0461, "step": 43020 }, { "epoch": 3.899311219865869, "grad_norm": 0.4388865530490875, "learning_rate": 7.02485789284934e-06, "loss": 0.059, "step": 43025 }, { "epoch": 3.899764364690955, "grad_norm": 0.5378913283348083, "learning_rate": 7.019362095187149e-06, "loss": 0.0565, "step": 43030 }, { "epoch": 3.9002175095160414, "grad_norm": 0.7426803708076477, "learning_rate": 7.013868097033277e-06, "loss": 0.0594, "step": 43035 }, { "epoch": 3.9006706543411274, "grad_norm": 0.5556195974349976, "learning_rate": 7.008375898937558e-06, "loss": 0.0475, "step": 43040 }, { "epoch": 3.9011237991662133, "grad_norm": 0.6475315093994141, "learning_rate": 7.002885501449677e-06, "loss": 0.0521, "step": 43045 }, { "epoch": 3.9015769439912997, "grad_norm": 0.539790153503418, "learning_rate": 6.997396905119083e-06, "loss": 0.0495, "step": 43050 }, { "epoch": 3.9020300888163857, "grad_norm": 0.45764997601509094, "learning_rate": 6.9919101104951005e-06, "loss": 0.0445, "step": 43055 }, { "epoch": 3.9024832336414716, "grad_norm": 0.5736091136932373, "learning_rate": 6.986425118126844e-06, "loss": 0.0544, "step": 43060 }, { "epoch": 3.902936378466558, "grad_norm": 0.8334272503852844, "learning_rate": 6.980941928563242e-06, "loss": 0.0616, "step": 43065 }, { "epoch": 3.903389523291644, "grad_norm": 0.4712469279766083, "learning_rate": 6.975460542353077e-06, "loss": 0.0495, "step": 43070 }, { "epoch": 3.90384266811673, "grad_norm": 0.5478118658065796, "learning_rate": 6.9699809600449e-06, "loss": 0.0475, "step": 43075 }, { "epoch": 3.9042958129418164, "grad_norm": 0.5501245856285095, "learning_rate": 6.964503182187132e-06, "loss": 0.054, "step": 43080 }, { "epoch": 3.9047489577669023, "grad_norm": 0.7055322527885437, "learning_rate": 6.959027209327981e-06, "loss": 0.0579, "step": 43085 }, { "epoch": 3.9052021025919883, "grad_norm": 0.5027821660041809, "learning_rate": 6.953553042015479e-06, "loss": 0.0586, "step": 43090 }, { "epoch": 3.9056552474170747, "grad_norm": 0.5313326716423035, "learning_rate": 6.948080680797503e-06, "loss": 0.0485, "step": 43095 }, { "epoch": 3.9061083922421607, "grad_norm": 0.5019370913505554, "learning_rate": 6.942610126221702e-06, "loss": 0.0592, "step": 43100 }, { "epoch": 3.9065615370672466, "grad_norm": 0.48919036984443665, "learning_rate": 6.93714137883559e-06, "loss": 0.0464, "step": 43105 }, { "epoch": 3.907014681892333, "grad_norm": 0.4979970455169678, "learning_rate": 6.931674439186475e-06, "loss": 0.0463, "step": 43110 }, { "epoch": 3.907467826717419, "grad_norm": 0.45690691471099854, "learning_rate": 6.926209307821482e-06, "loss": 0.0539, "step": 43115 }, { "epoch": 3.907920971542505, "grad_norm": 0.4611555337905884, "learning_rate": 6.920745985287588e-06, "loss": 0.0488, "step": 43120 }, { "epoch": 3.9083741163675914, "grad_norm": 0.5722365379333496, "learning_rate": 6.915284472131531e-06, "loss": 0.0525, "step": 43125 }, { "epoch": 3.9088272611926773, "grad_norm": 0.6091144680976868, "learning_rate": 6.9098247688999255e-06, "loss": 0.0491, "step": 43130 }, { "epoch": 3.9092804060177633, "grad_norm": 0.5391323566436768, "learning_rate": 6.9043668761391735e-06, "loss": 0.0524, "step": 43135 }, { "epoch": 3.9097335508428492, "grad_norm": 0.5129873156547546, "learning_rate": 6.898910794395494e-06, "loss": 0.0624, "step": 43140 }, { "epoch": 3.910186695667935, "grad_norm": 0.5234388709068298, "learning_rate": 6.893456524214955e-06, "loss": 0.057, "step": 43145 }, { "epoch": 3.9106398404930216, "grad_norm": 0.5871055126190186, "learning_rate": 6.888004066143394e-06, "loss": 0.0537, "step": 43150 }, { "epoch": 3.9110929853181076, "grad_norm": 0.6238920092582703, "learning_rate": 6.882553420726517e-06, "loss": 0.0564, "step": 43155 }, { "epoch": 3.9115461301431935, "grad_norm": 0.6897544860839844, "learning_rate": 6.877104588509817e-06, "loss": 0.0666, "step": 43160 }, { "epoch": 3.91199927496828, "grad_norm": 0.545690655708313, "learning_rate": 6.871657570038617e-06, "loss": 0.0537, "step": 43165 }, { "epoch": 3.912452419793366, "grad_norm": 0.5859992504119873, "learning_rate": 6.866212365858055e-06, "loss": 0.0706, "step": 43170 }, { "epoch": 3.912905564618452, "grad_norm": 0.7603061199188232, "learning_rate": 6.860768976513082e-06, "loss": 0.0536, "step": 43175 }, { "epoch": 3.9133587094435383, "grad_norm": 0.5889226794242859, "learning_rate": 6.855327402548489e-06, "loss": 0.047, "step": 43180 }, { "epoch": 3.913811854268624, "grad_norm": 0.5471070408821106, "learning_rate": 6.849887644508865e-06, "loss": 0.0423, "step": 43185 }, { "epoch": 3.91426499909371, "grad_norm": 0.48655569553375244, "learning_rate": 6.844449702938621e-06, "loss": 0.0494, "step": 43190 }, { "epoch": 3.9147181439187966, "grad_norm": 0.7153316736221313, "learning_rate": 6.839013578381978e-06, "loss": 0.065, "step": 43195 }, { "epoch": 3.9151712887438825, "grad_norm": 0.8347910046577454, "learning_rate": 6.833579271383006e-06, "loss": 0.0562, "step": 43200 }, { "epoch": 3.9156244335689685, "grad_norm": 0.5187172293663025, "learning_rate": 6.828146782485559e-06, "loss": 0.0552, "step": 43205 }, { "epoch": 3.916077578394055, "grad_norm": 0.6203188300132751, "learning_rate": 6.822716112233327e-06, "loss": 0.0695, "step": 43210 }, { "epoch": 3.916530723219141, "grad_norm": 0.3876391053199768, "learning_rate": 6.817287261169811e-06, "loss": 0.0513, "step": 43215 }, { "epoch": 3.916983868044227, "grad_norm": 0.4888991713523865, "learning_rate": 6.811860229838327e-06, "loss": 0.0468, "step": 43220 }, { "epoch": 3.9174370128693132, "grad_norm": 0.6358319520950317, "learning_rate": 6.806435018782023e-06, "loss": 0.0568, "step": 43225 }, { "epoch": 3.917890157694399, "grad_norm": 0.5568011403083801, "learning_rate": 6.8010116285438545e-06, "loss": 0.0621, "step": 43230 }, { "epoch": 3.918343302519485, "grad_norm": 0.43315306305885315, "learning_rate": 6.795590059666596e-06, "loss": 0.042, "step": 43235 }, { "epoch": 3.9187964473445716, "grad_norm": 0.42773357033729553, "learning_rate": 6.790170312692837e-06, "loss": 0.0647, "step": 43240 }, { "epoch": 3.9192495921696575, "grad_norm": 0.3875947892665863, "learning_rate": 6.78475238816498e-06, "loss": 0.0438, "step": 43245 }, { "epoch": 3.9197027369947435, "grad_norm": 0.6248917579650879, "learning_rate": 6.779336286625273e-06, "loss": 0.0561, "step": 43250 }, { "epoch": 3.92015588181983, "grad_norm": 0.5294070839881897, "learning_rate": 6.773922008615746e-06, "loss": 0.0545, "step": 43255 }, { "epoch": 3.920609026644916, "grad_norm": 0.7019827365875244, "learning_rate": 6.768509554678268e-06, "loss": 0.0582, "step": 43260 }, { "epoch": 3.921062171470002, "grad_norm": 0.5543017387390137, "learning_rate": 6.763098925354516e-06, "loss": 0.0545, "step": 43265 }, { "epoch": 3.9215153162950878, "grad_norm": 0.5179862380027771, "learning_rate": 6.75769012118598e-06, "loss": 0.0562, "step": 43270 }, { "epoch": 3.921968461120174, "grad_norm": 0.46173420548439026, "learning_rate": 6.752283142713994e-06, "loss": 0.0499, "step": 43275 }, { "epoch": 3.92242160594526, "grad_norm": 0.525632917881012, "learning_rate": 6.746877990479678e-06, "loss": 0.0603, "step": 43280 }, { "epoch": 3.922874750770346, "grad_norm": 0.573930561542511, "learning_rate": 6.741474665023983e-06, "loss": 0.0504, "step": 43285 }, { "epoch": 3.923327895595432, "grad_norm": 0.6624478101730347, "learning_rate": 6.736073166887677e-06, "loss": 0.0562, "step": 43290 }, { "epoch": 3.9237810404205185, "grad_norm": 0.5119007229804993, "learning_rate": 6.7306734966113335e-06, "loss": 0.0466, "step": 43295 }, { "epoch": 3.9242341852456044, "grad_norm": 0.8084337115287781, "learning_rate": 6.725275654735372e-06, "loss": 0.0536, "step": 43300 }, { "epoch": 3.9246873300706904, "grad_norm": 0.6528505682945251, "learning_rate": 6.719879641799998e-06, "loss": 0.0601, "step": 43305 }, { "epoch": 3.9251404748957768, "grad_norm": 0.7162865400314331, "learning_rate": 6.714485458345252e-06, "loss": 0.0544, "step": 43310 }, { "epoch": 3.9255936197208627, "grad_norm": 0.538284957408905, "learning_rate": 6.70909310491098e-06, "loss": 0.0502, "step": 43315 }, { "epoch": 3.9260467645459487, "grad_norm": 0.5350258946418762, "learning_rate": 6.703702582036847e-06, "loss": 0.0516, "step": 43320 }, { "epoch": 3.926499909371035, "grad_norm": 0.7019336819648743, "learning_rate": 6.698313890262351e-06, "loss": 0.0833, "step": 43325 }, { "epoch": 3.926953054196121, "grad_norm": 0.4339917302131653, "learning_rate": 6.692927030126789e-06, "loss": 0.0519, "step": 43330 }, { "epoch": 3.927406199021207, "grad_norm": 0.5591995120048523, "learning_rate": 6.687542002169272e-06, "loss": 0.049, "step": 43335 }, { "epoch": 3.9278593438462934, "grad_norm": 0.5426175594329834, "learning_rate": 6.682158806928756e-06, "loss": 0.0625, "step": 43340 }, { "epoch": 3.9283124886713794, "grad_norm": 0.5194973945617676, "learning_rate": 6.676777444943963e-06, "loss": 0.0599, "step": 43345 }, { "epoch": 3.9287656334964653, "grad_norm": 0.592159628868103, "learning_rate": 6.671397916753494e-06, "loss": 0.0492, "step": 43350 }, { "epoch": 3.9292187783215518, "grad_norm": 0.6386652588844299, "learning_rate": 6.666020222895703e-06, "loss": 0.0542, "step": 43355 }, { "epoch": 3.9296719231466377, "grad_norm": 0.5338585376739502, "learning_rate": 6.660644363908811e-06, "loss": 0.0481, "step": 43360 }, { "epoch": 3.9301250679717237, "grad_norm": 0.43231460452079773, "learning_rate": 6.6552703403308325e-06, "loss": 0.0488, "step": 43365 }, { "epoch": 3.93057821279681, "grad_norm": 0.4940316677093506, "learning_rate": 6.6498981526995905e-06, "loss": 0.0441, "step": 43370 }, { "epoch": 3.931031357621896, "grad_norm": 0.49502259492874146, "learning_rate": 6.644527801552761e-06, "loss": 0.051, "step": 43375 }, { "epoch": 3.931484502446982, "grad_norm": 0.7865230441093445, "learning_rate": 6.639159287427779e-06, "loss": 0.0485, "step": 43380 }, { "epoch": 3.9319376472720684, "grad_norm": 0.6342473030090332, "learning_rate": 6.63379261086195e-06, "loss": 0.0499, "step": 43385 }, { "epoch": 3.9323907920971544, "grad_norm": 0.45987993478775024, "learning_rate": 6.628427772392365e-06, "loss": 0.0507, "step": 43390 }, { "epoch": 3.9328439369222403, "grad_norm": 0.4391911029815674, "learning_rate": 6.623064772555931e-06, "loss": 0.053, "step": 43395 }, { "epoch": 3.9332970817473263, "grad_norm": 0.48849472403526306, "learning_rate": 6.617703611889403e-06, "loss": 0.0472, "step": 43400 }, { "epoch": 3.9337502265724127, "grad_norm": 0.6041844487190247, "learning_rate": 6.612344290929296e-06, "loss": 0.0469, "step": 43405 }, { "epoch": 3.9342033713974986, "grad_norm": 0.49365001916885376, "learning_rate": 6.606986810211996e-06, "loss": 0.043, "step": 43410 }, { "epoch": 3.9346565162225846, "grad_norm": 0.5110320448875427, "learning_rate": 6.601631170273675e-06, "loss": 0.0509, "step": 43415 }, { "epoch": 3.9351096610476706, "grad_norm": 0.5576809644699097, "learning_rate": 6.596277371650317e-06, "loss": 0.0618, "step": 43420 }, { "epoch": 3.935562805872757, "grad_norm": 0.766617476940155, "learning_rate": 6.590925414877757e-06, "loss": 0.0538, "step": 43425 }, { "epoch": 3.936015950697843, "grad_norm": 0.4294264614582062, "learning_rate": 6.585575300491589e-06, "loss": 0.0671, "step": 43430 }, { "epoch": 3.936469095522929, "grad_norm": 0.5230284333229065, "learning_rate": 6.580227029027278e-06, "loss": 0.0489, "step": 43435 }, { "epoch": 3.9369222403480153, "grad_norm": 0.5926046967506409, "learning_rate": 6.574880601020073e-06, "loss": 0.0584, "step": 43440 }, { "epoch": 3.9373753851731013, "grad_norm": 0.48566940426826477, "learning_rate": 6.569536017005037e-06, "loss": 0.0508, "step": 43445 }, { "epoch": 3.937828529998187, "grad_norm": 0.5820852518081665, "learning_rate": 6.564193277517075e-06, "loss": 0.0498, "step": 43450 }, { "epoch": 3.9382816748232736, "grad_norm": 0.6218969225883484, "learning_rate": 6.558852383090883e-06, "loss": 0.051, "step": 43455 }, { "epoch": 3.9387348196483596, "grad_norm": 0.5293615460395813, "learning_rate": 6.553513334260977e-06, "loss": 0.0495, "step": 43460 }, { "epoch": 3.9391879644734455, "grad_norm": 0.5092620849609375, "learning_rate": 6.548176131561692e-06, "loss": 0.0504, "step": 43465 }, { "epoch": 3.939641109298532, "grad_norm": 0.5570350289344788, "learning_rate": 6.5428407755271735e-06, "loss": 0.0488, "step": 43470 }, { "epoch": 3.940094254123618, "grad_norm": 0.5529168248176575, "learning_rate": 6.537507266691392e-06, "loss": 0.0494, "step": 43475 }, { "epoch": 3.940547398948704, "grad_norm": 0.5255761742591858, "learning_rate": 6.532175605588128e-06, "loss": 0.043, "step": 43480 }, { "epoch": 3.9410005437737903, "grad_norm": 0.552958607673645, "learning_rate": 6.526845792750971e-06, "loss": 0.0423, "step": 43485 }, { "epoch": 3.9414536885988762, "grad_norm": 0.5610929727554321, "learning_rate": 6.521517828713333e-06, "loss": 0.0504, "step": 43490 }, { "epoch": 3.941906833423962, "grad_norm": 0.634003221988678, "learning_rate": 6.5161917140084294e-06, "loss": 0.0528, "step": 43495 }, { "epoch": 3.9423599782490486, "grad_norm": 0.5476390719413757, "learning_rate": 6.5108674491693166e-06, "loss": 0.0641, "step": 43500 }, { "epoch": 3.9428131230741346, "grad_norm": 0.48976942896842957, "learning_rate": 6.505545034728838e-06, "loss": 0.0508, "step": 43505 }, { "epoch": 3.9432662678992205, "grad_norm": 0.5203263163566589, "learning_rate": 6.500224471219662e-06, "loss": 0.0567, "step": 43510 }, { "epoch": 3.943719412724307, "grad_norm": 0.5269837379455566, "learning_rate": 6.49490575917428e-06, "loss": 0.0486, "step": 43515 }, { "epoch": 3.944172557549393, "grad_norm": 0.5853483080863953, "learning_rate": 6.489588899124974e-06, "loss": 0.0561, "step": 43520 }, { "epoch": 3.944625702374479, "grad_norm": 0.5058154463768005, "learning_rate": 6.484273891603879e-06, "loss": 0.0484, "step": 43525 }, { "epoch": 3.9450788471995653, "grad_norm": 0.5559809803962708, "learning_rate": 6.478960737142908e-06, "loss": 0.0483, "step": 43530 }, { "epoch": 3.945531992024651, "grad_norm": 0.5286941528320312, "learning_rate": 6.47364943627381e-06, "loss": 0.0465, "step": 43535 }, { "epoch": 3.945985136849737, "grad_norm": 0.4280717372894287, "learning_rate": 6.468339989528138e-06, "loss": 0.0646, "step": 43540 }, { "epoch": 3.946438281674823, "grad_norm": 0.5466699004173279, "learning_rate": 6.463032397437266e-06, "loss": 0.0482, "step": 43545 }, { "epoch": 3.946891426499909, "grad_norm": 0.38475316762924194, "learning_rate": 6.45772666053237e-06, "loss": 0.0488, "step": 43550 }, { "epoch": 3.9473445713249955, "grad_norm": 0.608540415763855, "learning_rate": 6.452422779344466e-06, "loss": 0.0591, "step": 43555 }, { "epoch": 3.9477977161500815, "grad_norm": 0.5507868528366089, "learning_rate": 6.447120754404359e-06, "loss": 0.0502, "step": 43560 }, { "epoch": 3.9482508609751674, "grad_norm": 0.48031240701675415, "learning_rate": 6.441820586242681e-06, "loss": 0.0471, "step": 43565 }, { "epoch": 3.948704005800254, "grad_norm": 0.49297448992729187, "learning_rate": 6.4365222753898695e-06, "loss": 0.0494, "step": 43570 }, { "epoch": 3.94915715062534, "grad_norm": 0.579959511756897, "learning_rate": 6.43122582237618e-06, "loss": 0.0491, "step": 43575 }, { "epoch": 3.9496102954504257, "grad_norm": 0.507138729095459, "learning_rate": 6.425931227731694e-06, "loss": 0.0463, "step": 43580 }, { "epoch": 3.950063440275512, "grad_norm": 0.6487850546836853, "learning_rate": 6.420638491986286e-06, "loss": 0.0527, "step": 43585 }, { "epoch": 3.950516585100598, "grad_norm": 0.5064910054206848, "learning_rate": 6.415347615669653e-06, "loss": 0.042, "step": 43590 }, { "epoch": 3.950969729925684, "grad_norm": 0.7720617651939392, "learning_rate": 6.4100585993113305e-06, "loss": 0.0619, "step": 43595 }, { "epoch": 3.9514228747507705, "grad_norm": 0.5818767547607422, "learning_rate": 6.40477144344061e-06, "loss": 0.0558, "step": 43600 }, { "epoch": 3.9518760195758564, "grad_norm": 0.42961448431015015, "learning_rate": 6.399486148586659e-06, "loss": 0.0417, "step": 43605 }, { "epoch": 3.9523291644009424, "grad_norm": 0.5500193238258362, "learning_rate": 6.394202715278422e-06, "loss": 0.0567, "step": 43610 }, { "epoch": 3.952782309226029, "grad_norm": 0.5409567356109619, "learning_rate": 6.388921144044657e-06, "loss": 0.0611, "step": 43615 }, { "epoch": 3.9532354540511148, "grad_norm": 0.6598479151725769, "learning_rate": 6.383641435413973e-06, "loss": 0.061, "step": 43620 }, { "epoch": 3.9536885988762007, "grad_norm": 0.5228052139282227, "learning_rate": 6.378363589914732e-06, "loss": 0.0473, "step": 43625 }, { "epoch": 3.954141743701287, "grad_norm": 0.5918974876403809, "learning_rate": 6.373087608075165e-06, "loss": 0.0487, "step": 43630 }, { "epoch": 3.954594888526373, "grad_norm": 0.6626885533332825, "learning_rate": 6.367813490423286e-06, "loss": 0.054, "step": 43635 }, { "epoch": 3.955048033351459, "grad_norm": 0.5394405722618103, "learning_rate": 6.3625412374869246e-06, "loss": 0.0484, "step": 43640 }, { "epoch": 3.9555011781765455, "grad_norm": 0.5197243094444275, "learning_rate": 6.3572708497937534e-06, "loss": 0.076, "step": 43645 }, { "epoch": 3.9559543230016314, "grad_norm": 0.498579740524292, "learning_rate": 6.352002327871201e-06, "loss": 0.0534, "step": 43650 }, { "epoch": 3.9564074678267174, "grad_norm": 0.4364263415336609, "learning_rate": 6.3467356722465674e-06, "loss": 0.0468, "step": 43655 }, { "epoch": 3.9568606126518038, "grad_norm": 0.5645531415939331, "learning_rate": 6.3414708834469316e-06, "loss": 0.0459, "step": 43660 }, { "epoch": 3.9573137574768897, "grad_norm": 0.6766155362129211, "learning_rate": 6.336207961999194e-06, "loss": 0.056, "step": 43665 }, { "epoch": 3.9577669023019757, "grad_norm": 0.4832950532436371, "learning_rate": 6.330946908430083e-06, "loss": 0.0447, "step": 43670 }, { "epoch": 3.9582200471270617, "grad_norm": 0.5226889848709106, "learning_rate": 6.325687723266105e-06, "loss": 0.0452, "step": 43675 }, { "epoch": 3.958673191952148, "grad_norm": 0.5116957426071167, "learning_rate": 6.32043040703362e-06, "loss": 0.0447, "step": 43680 }, { "epoch": 3.959126336777234, "grad_norm": 0.5385000705718994, "learning_rate": 6.315174960258771e-06, "loss": 0.0491, "step": 43685 }, { "epoch": 3.95957948160232, "grad_norm": 0.4667786657810211, "learning_rate": 6.309921383467524e-06, "loss": 0.0437, "step": 43690 }, { "epoch": 3.960032626427406, "grad_norm": 0.5195687413215637, "learning_rate": 6.304669677185679e-06, "loss": 0.0521, "step": 43695 }, { "epoch": 3.9604857712524923, "grad_norm": 0.5020073056221008, "learning_rate": 6.299419841938794e-06, "loss": 0.0441, "step": 43700 }, { "epoch": 3.9609389160775783, "grad_norm": 0.5964462161064148, "learning_rate": 6.294171878252303e-06, "loss": 0.0511, "step": 43705 }, { "epoch": 3.9613920609026643, "grad_norm": 0.5705807209014893, "learning_rate": 6.288925786651414e-06, "loss": 0.0503, "step": 43710 }, { "epoch": 3.9618452057277507, "grad_norm": 0.9563056826591492, "learning_rate": 6.283681567661157e-06, "loss": 0.0624, "step": 43715 }, { "epoch": 3.9622983505528366, "grad_norm": 0.7365022897720337, "learning_rate": 6.278439221806376e-06, "loss": 0.0747, "step": 43720 }, { "epoch": 3.9627514953779226, "grad_norm": 0.5810396075248718, "learning_rate": 6.2731987496117195e-06, "loss": 0.052, "step": 43725 }, { "epoch": 3.963204640203009, "grad_norm": 0.4246073067188263, "learning_rate": 6.26796015160167e-06, "loss": 0.0526, "step": 43730 }, { "epoch": 3.963657785028095, "grad_norm": 0.4102801978588104, "learning_rate": 6.262723428300502e-06, "loss": 0.0519, "step": 43735 }, { "epoch": 3.964110929853181, "grad_norm": 0.5985668897628784, "learning_rate": 6.257488580232304e-06, "loss": 0.0571, "step": 43740 }, { "epoch": 3.9645640746782673, "grad_norm": 0.5412587523460388, "learning_rate": 6.252255607920987e-06, "loss": 0.0531, "step": 43745 }, { "epoch": 3.9650172195033533, "grad_norm": 0.6017497181892395, "learning_rate": 6.247024511890259e-06, "loss": 0.0526, "step": 43750 }, { "epoch": 3.9654703643284392, "grad_norm": 0.4622003436088562, "learning_rate": 6.241795292663666e-06, "loss": 0.0743, "step": 43755 }, { "epoch": 3.9659235091535257, "grad_norm": 0.5117068886756897, "learning_rate": 6.2365679507645384e-06, "loss": 0.0449, "step": 43760 }, { "epoch": 3.9663766539786116, "grad_norm": 0.4992876946926117, "learning_rate": 6.231342486716035e-06, "loss": 0.0482, "step": 43765 }, { "epoch": 3.9668297988036976, "grad_norm": 0.5542626976966858, "learning_rate": 6.2261189010411195e-06, "loss": 0.0577, "step": 43770 }, { "epoch": 3.967282943628784, "grad_norm": 0.4890008270740509, "learning_rate": 6.220897194262565e-06, "loss": 0.0488, "step": 43775 }, { "epoch": 3.96773608845387, "grad_norm": 0.47952666878700256, "learning_rate": 6.215677366902973e-06, "loss": 0.05, "step": 43780 }, { "epoch": 3.968189233278956, "grad_norm": 0.6848961114883423, "learning_rate": 6.21045941948474e-06, "loss": 0.0443, "step": 43785 }, { "epoch": 3.9686423781040423, "grad_norm": 0.43777963519096375, "learning_rate": 6.2052433525300826e-06, "loss": 0.0523, "step": 43790 }, { "epoch": 3.9690955229291283, "grad_norm": 0.5899010896682739, "learning_rate": 6.200029166561022e-06, "loss": 0.0526, "step": 43795 }, { "epoch": 3.969548667754214, "grad_norm": 0.49917104840278625, "learning_rate": 6.1948168620993925e-06, "loss": 0.0495, "step": 43800 }, { "epoch": 3.9700018125793, "grad_norm": 0.6856719851493835, "learning_rate": 6.189606439666853e-06, "loss": 0.064, "step": 43805 }, { "epoch": 3.9704549574043866, "grad_norm": 0.5398417711257935, "learning_rate": 6.184397899784863e-06, "loss": 0.0412, "step": 43810 }, { "epoch": 3.9709081022294725, "grad_norm": 0.6674233675003052, "learning_rate": 6.179191242974691e-06, "loss": 0.0597, "step": 43815 }, { "epoch": 3.9713612470545585, "grad_norm": 0.6513929963111877, "learning_rate": 6.17398646975742e-06, "loss": 0.0521, "step": 43820 }, { "epoch": 3.9718143918796445, "grad_norm": 0.44436728954315186, "learning_rate": 6.168783580653942e-06, "loss": 0.0746, "step": 43825 }, { "epoch": 3.972267536704731, "grad_norm": 0.5938307046890259, "learning_rate": 6.163582576184973e-06, "loss": 0.0598, "step": 43830 }, { "epoch": 3.972720681529817, "grad_norm": 0.46433162689208984, "learning_rate": 6.1583834568710306e-06, "loss": 0.0484, "step": 43835 }, { "epoch": 3.973173826354903, "grad_norm": 0.5760900974273682, "learning_rate": 6.1531862232324406e-06, "loss": 0.0561, "step": 43840 }, { "epoch": 3.973626971179989, "grad_norm": 0.5060370564460754, "learning_rate": 6.147990875789339e-06, "loss": 0.0474, "step": 43845 }, { "epoch": 3.974080116005075, "grad_norm": 0.4769534468650818, "learning_rate": 6.142797415061688e-06, "loss": 0.043, "step": 43850 }, { "epoch": 3.974533260830161, "grad_norm": 0.5556926131248474, "learning_rate": 6.137605841569247e-06, "loss": 0.0515, "step": 43855 }, { "epoch": 3.9749864056552475, "grad_norm": 0.49854326248168945, "learning_rate": 6.132416155831591e-06, "loss": 0.046, "step": 43860 }, { "epoch": 3.9754395504803335, "grad_norm": 0.49343231320381165, "learning_rate": 6.127228358368104e-06, "loss": 0.0671, "step": 43865 }, { "epoch": 3.9758926953054194, "grad_norm": 0.7441072463989258, "learning_rate": 6.122042449697973e-06, "loss": 0.053, "step": 43870 }, { "epoch": 3.976345840130506, "grad_norm": 0.43850022554397583, "learning_rate": 6.116858430340225e-06, "loss": 0.0486, "step": 43875 }, { "epoch": 3.976798984955592, "grad_norm": 0.4115290641784668, "learning_rate": 6.111676300813668e-06, "loss": 0.0427, "step": 43880 }, { "epoch": 3.9772521297806778, "grad_norm": 0.39956384897232056, "learning_rate": 6.106496061636932e-06, "loss": 0.0476, "step": 43885 }, { "epoch": 3.977705274605764, "grad_norm": 0.5934980511665344, "learning_rate": 6.101317713328458e-06, "loss": 0.0521, "step": 43890 }, { "epoch": 3.97815841943085, "grad_norm": 0.5488932132720947, "learning_rate": 6.09614125640649e-06, "loss": 0.052, "step": 43895 }, { "epoch": 3.978611564255936, "grad_norm": 0.541161060333252, "learning_rate": 6.090966691389108e-06, "loss": 0.0504, "step": 43900 }, { "epoch": 3.9790647090810225, "grad_norm": 0.6311713457107544, "learning_rate": 6.085794018794158e-06, "loss": 0.0615, "step": 43905 }, { "epoch": 3.9795178539061085, "grad_norm": 0.42448243498802185, "learning_rate": 6.080623239139341e-06, "loss": 0.05, "step": 43910 }, { "epoch": 3.9799709987311944, "grad_norm": 0.5685125589370728, "learning_rate": 6.075454352942151e-06, "loss": 0.0506, "step": 43915 }, { "epoch": 3.980424143556281, "grad_norm": 0.617679238319397, "learning_rate": 6.0702873607198755e-06, "loss": 0.0598, "step": 43920 }, { "epoch": 3.980877288381367, "grad_norm": 0.5310163497924805, "learning_rate": 6.065122262989656e-06, "loss": 0.047, "step": 43925 }, { "epoch": 3.9813304332064527, "grad_norm": 0.48723524808883667, "learning_rate": 6.0599590602683855e-06, "loss": 0.0598, "step": 43930 }, { "epoch": 3.981783578031539, "grad_norm": 0.543034553527832, "learning_rate": 6.054797753072824e-06, "loss": 0.0509, "step": 43935 }, { "epoch": 3.982236722856625, "grad_norm": 0.5169171094894409, "learning_rate": 6.049638341919508e-06, "loss": 0.0467, "step": 43940 }, { "epoch": 3.982689867681711, "grad_norm": 0.4971575438976288, "learning_rate": 6.044480827324783e-06, "loss": 0.0469, "step": 43945 }, { "epoch": 3.983143012506797, "grad_norm": 0.47778281569480896, "learning_rate": 6.039325209804844e-06, "loss": 0.0523, "step": 43950 }, { "epoch": 3.9835961573318834, "grad_norm": 0.436344712972641, "learning_rate": 6.034171489875629e-06, "loss": 0.0526, "step": 43955 }, { "epoch": 3.9840493021569694, "grad_norm": 0.4753374457359314, "learning_rate": 6.029019668052951e-06, "loss": 0.0542, "step": 43960 }, { "epoch": 3.9845024469820554, "grad_norm": 0.47073280811309814, "learning_rate": 6.023869744852398e-06, "loss": 0.0481, "step": 43965 }, { "epoch": 3.9849555918071413, "grad_norm": 0.6548561453819275, "learning_rate": 6.018721720789369e-06, "loss": 0.0675, "step": 43970 }, { "epoch": 3.9854087366322277, "grad_norm": 0.6070924997329712, "learning_rate": 6.013575596379101e-06, "loss": 0.0511, "step": 43975 }, { "epoch": 3.9858618814573137, "grad_norm": 0.6195310950279236, "learning_rate": 6.0084313721365915e-06, "loss": 0.0492, "step": 43980 }, { "epoch": 3.9863150262823996, "grad_norm": 0.44496309757232666, "learning_rate": 6.003289048576696e-06, "loss": 0.0558, "step": 43985 }, { "epoch": 3.986768171107486, "grad_norm": 0.5652052164077759, "learning_rate": 5.9981486262140565e-06, "loss": 0.0501, "step": 43990 }, { "epoch": 3.987221315932572, "grad_norm": 0.5647843480110168, "learning_rate": 5.993010105563118e-06, "loss": 0.0553, "step": 43995 }, { "epoch": 3.987674460757658, "grad_norm": 0.7029692530632019, "learning_rate": 5.987873487138168e-06, "loss": 0.0542, "step": 44000 }, { "epoch": 3.9881276055827444, "grad_norm": 0.4847032427787781, "learning_rate": 5.982738771453253e-06, "loss": 0.0495, "step": 44005 }, { "epoch": 3.9885807504078303, "grad_norm": 0.4108147621154785, "learning_rate": 5.9776059590222756e-06, "loss": 0.0461, "step": 44010 }, { "epoch": 3.9890338952329163, "grad_norm": 0.5556061267852783, "learning_rate": 5.972475050358925e-06, "loss": 0.0451, "step": 44015 }, { "epoch": 3.9894870400580027, "grad_norm": 0.519020676612854, "learning_rate": 5.967346045976699e-06, "loss": 0.0456, "step": 44020 }, { "epoch": 3.9899401848830887, "grad_norm": 0.5465788841247559, "learning_rate": 5.9622189463889255e-06, "loss": 0.0515, "step": 44025 }, { "epoch": 3.9903933297081746, "grad_norm": 0.5689958333969116, "learning_rate": 5.957093752108703e-06, "loss": 0.0506, "step": 44030 }, { "epoch": 3.990846474533261, "grad_norm": 0.38932061195373535, "learning_rate": 5.9519704636489814e-06, "loss": 0.0406, "step": 44035 }, { "epoch": 3.991299619358347, "grad_norm": 0.6390541791915894, "learning_rate": 5.946849081522496e-06, "loss": 0.0575, "step": 44040 }, { "epoch": 3.991752764183433, "grad_norm": 0.5496806502342224, "learning_rate": 5.941729606241788e-06, "loss": 0.0552, "step": 44045 }, { "epoch": 3.9922059090085193, "grad_norm": 0.6891854405403137, "learning_rate": 5.936612038319239e-06, "loss": 0.0524, "step": 44050 }, { "epoch": 3.9926590538336053, "grad_norm": 0.5311045050621033, "learning_rate": 5.931496378266985e-06, "loss": 0.0626, "step": 44055 }, { "epoch": 3.9931121986586913, "grad_norm": 0.4875774383544922, "learning_rate": 5.9263826265970295e-06, "loss": 0.045, "step": 44060 }, { "epoch": 3.9935653434837777, "grad_norm": 0.5864740610122681, "learning_rate": 5.921270783821145e-06, "loss": 0.0629, "step": 44065 }, { "epoch": 3.9940184883088636, "grad_norm": 0.6067490577697754, "learning_rate": 5.916160850450928e-06, "loss": 0.0602, "step": 44070 }, { "epoch": 3.9944716331339496, "grad_norm": 0.4779771566390991, "learning_rate": 5.911052826997793e-06, "loss": 0.0793, "step": 44075 }, { "epoch": 3.9949247779590356, "grad_norm": 0.5562496185302734, "learning_rate": 5.905946713972934e-06, "loss": 0.0538, "step": 44080 }, { "epoch": 3.995377922784122, "grad_norm": 0.649774968624115, "learning_rate": 5.900842511887389e-06, "loss": 0.0573, "step": 44085 }, { "epoch": 3.995831067609208, "grad_norm": 0.5812689065933228, "learning_rate": 5.89574022125198e-06, "loss": 0.0517, "step": 44090 }, { "epoch": 3.996284212434294, "grad_norm": 0.5313678979873657, "learning_rate": 5.890639842577347e-06, "loss": 0.0532, "step": 44095 }, { "epoch": 3.99673735725938, "grad_norm": 0.5145637392997742, "learning_rate": 5.8855413763739345e-06, "loss": 0.0494, "step": 44100 }, { "epoch": 3.9971905020844662, "grad_norm": 0.5675956010818481, "learning_rate": 5.8804448231520095e-06, "loss": 0.0473, "step": 44105 }, { "epoch": 3.997643646909552, "grad_norm": 0.47308656573295593, "learning_rate": 5.875350183421629e-06, "loss": 0.0519, "step": 44110 }, { "epoch": 3.998096791734638, "grad_norm": 0.7900127172470093, "learning_rate": 5.8702574576926685e-06, "loss": 0.0601, "step": 44115 }, { "epoch": 3.9985499365597246, "grad_norm": 0.4490854740142822, "learning_rate": 5.8651666464748065e-06, "loss": 0.0448, "step": 44120 }, { "epoch": 3.9990030813848105, "grad_norm": 0.5937705636024475, "learning_rate": 5.860077750277529e-06, "loss": 0.0475, "step": 44125 }, { "epoch": 3.9994562262098965, "grad_norm": 0.630023717880249, "learning_rate": 5.854990769610147e-06, "loss": 0.0539, "step": 44130 }, { "epoch": 3.999909371034983, "grad_norm": 0.5764151215553284, "learning_rate": 5.849905704981762e-06, "loss": 0.0457, "step": 44135 }, { "epoch": 4.000362515860069, "grad_norm": 0.3670290410518646, "learning_rate": 5.844822556901286e-06, "loss": 0.0405, "step": 44140 }, { "epoch": 4.000815660685155, "grad_norm": 0.30478572845458984, "learning_rate": 5.839741325877446e-06, "loss": 0.0307, "step": 44145 }, { "epoch": 4.001268805510241, "grad_norm": 0.2769835591316223, "learning_rate": 5.834662012418762e-06, "loss": 0.0333, "step": 44150 }, { "epoch": 4.001721950335327, "grad_norm": 0.5134096145629883, "learning_rate": 5.8295846170335896e-06, "loss": 0.0464, "step": 44155 }, { "epoch": 4.002175095160413, "grad_norm": 0.35672420263290405, "learning_rate": 5.824509140230072e-06, "loss": 0.0326, "step": 44160 }, { "epoch": 4.0026282399854995, "grad_norm": 0.39132776856422424, "learning_rate": 5.819435582516158e-06, "loss": 0.0314, "step": 44165 }, { "epoch": 4.003081384810585, "grad_norm": 0.3741143047809601, "learning_rate": 5.81436394439962e-06, "loss": 0.0372, "step": 44170 }, { "epoch": 4.0035345296356715, "grad_norm": 0.3572913706302643, "learning_rate": 5.809294226388013e-06, "loss": 0.0305, "step": 44175 }, { "epoch": 4.003987674460758, "grad_norm": 0.3428414463996887, "learning_rate": 5.804226428988735e-06, "loss": 0.0398, "step": 44180 }, { "epoch": 4.004440819285843, "grad_norm": 0.5368165373802185, "learning_rate": 5.799160552708965e-06, "loss": 0.0375, "step": 44185 }, { "epoch": 4.00489396411093, "grad_norm": 0.42423781752586365, "learning_rate": 5.794096598055701e-06, "loss": 0.0308, "step": 44190 }, { "epoch": 4.005347108936016, "grad_norm": 0.36833426356315613, "learning_rate": 5.789034565535739e-06, "loss": 0.0325, "step": 44195 }, { "epoch": 4.005800253761102, "grad_norm": 0.5019369721412659, "learning_rate": 5.783974455655686e-06, "loss": 0.0563, "step": 44200 }, { "epoch": 4.006253398586188, "grad_norm": 0.29676324129104614, "learning_rate": 5.778916268921972e-06, "loss": 0.0381, "step": 44205 }, { "epoch": 4.0067065434112745, "grad_norm": 0.3852238655090332, "learning_rate": 5.7738600058408144e-06, "loss": 0.0326, "step": 44210 }, { "epoch": 4.00715968823636, "grad_norm": 0.32191821932792664, "learning_rate": 5.768805666918248e-06, "loss": 0.0327, "step": 44215 }, { "epoch": 4.0076128330614464, "grad_norm": 0.3786221146583557, "learning_rate": 5.7637532526601114e-06, "loss": 0.0315, "step": 44220 }, { "epoch": 4.008065977886533, "grad_norm": 0.35638055205345154, "learning_rate": 5.758702763572047e-06, "loss": 0.029, "step": 44225 }, { "epoch": 4.008519122711618, "grad_norm": 0.37201690673828125, "learning_rate": 5.753654200159517e-06, "loss": 0.0341, "step": 44230 }, { "epoch": 4.008972267536705, "grad_norm": 0.3533678948879242, "learning_rate": 5.748607562927785e-06, "loss": 0.0327, "step": 44235 }, { "epoch": 4.009425412361791, "grad_norm": 0.3535360097885132, "learning_rate": 5.743562852381907e-06, "loss": 0.0293, "step": 44240 }, { "epoch": 4.009878557186877, "grad_norm": 0.3331693112850189, "learning_rate": 5.738520069026784e-06, "loss": 0.0326, "step": 44245 }, { "epoch": 4.010331702011963, "grad_norm": 0.33687132596969604, "learning_rate": 5.733479213367069e-06, "loss": 0.0331, "step": 44250 }, { "epoch": 4.0107848468370495, "grad_norm": 0.30244573950767517, "learning_rate": 5.72844028590728e-06, "loss": 0.0302, "step": 44255 }, { "epoch": 4.011237991662135, "grad_norm": 0.7465749382972717, "learning_rate": 5.723403287151691e-06, "loss": 0.0338, "step": 44260 }, { "epoch": 4.011691136487221, "grad_norm": 0.38593316078186035, "learning_rate": 5.718368217604422e-06, "loss": 0.0331, "step": 44265 }, { "epoch": 4.012144281312308, "grad_norm": 0.3425769507884979, "learning_rate": 5.7133350777693804e-06, "loss": 0.0331, "step": 44270 }, { "epoch": 4.012597426137393, "grad_norm": 0.3196549117565155, "learning_rate": 5.708303868150277e-06, "loss": 0.0385, "step": 44275 }, { "epoch": 4.01305057096248, "grad_norm": 0.3388236165046692, "learning_rate": 5.703274589250659e-06, "loss": 0.0329, "step": 44280 }, { "epoch": 4.013503715787566, "grad_norm": 0.7472537755966187, "learning_rate": 5.698247241573829e-06, "loss": 0.0372, "step": 44285 }, { "epoch": 4.013956860612652, "grad_norm": 0.33757665753364563, "learning_rate": 5.693221825622946e-06, "loss": 0.0371, "step": 44290 }, { "epoch": 4.014410005437738, "grad_norm": 0.3043953776359558, "learning_rate": 5.688198341900949e-06, "loss": 0.0313, "step": 44295 }, { "epoch": 4.014863150262824, "grad_norm": 0.34955450892448425, "learning_rate": 5.683176790910586e-06, "loss": 0.0342, "step": 44300 }, { "epoch": 4.01531629508791, "grad_norm": 0.32216697931289673, "learning_rate": 5.678157173154431e-06, "loss": 0.0305, "step": 44305 }, { "epoch": 4.015769439912996, "grad_norm": 0.29578033089637756, "learning_rate": 5.673139489134827e-06, "loss": 0.0388, "step": 44310 }, { "epoch": 4.016222584738082, "grad_norm": 0.2840610146522522, "learning_rate": 5.668123739353961e-06, "loss": 0.0369, "step": 44315 }, { "epoch": 4.016675729563168, "grad_norm": 0.46351945400238037, "learning_rate": 5.663109924313805e-06, "loss": 0.0318, "step": 44320 }, { "epoch": 4.017128874388255, "grad_norm": 0.3362564146518707, "learning_rate": 5.658098044516141e-06, "loss": 0.03, "step": 44325 }, { "epoch": 4.01758201921334, "grad_norm": 0.3472951352596283, "learning_rate": 5.6530881004625754e-06, "loss": 0.0309, "step": 44330 }, { "epoch": 4.018035164038427, "grad_norm": 0.3179365396499634, "learning_rate": 5.648080092654478e-06, "loss": 0.0343, "step": 44335 }, { "epoch": 4.018488308863513, "grad_norm": 0.3797549307346344, "learning_rate": 5.6430740215930754e-06, "loss": 0.0316, "step": 44340 }, { "epoch": 4.018941453688599, "grad_norm": 0.35507458448410034, "learning_rate": 5.638069887779368e-06, "loss": 0.0301, "step": 44345 }, { "epoch": 4.019394598513685, "grad_norm": 0.3387324810028076, "learning_rate": 5.633067691714167e-06, "loss": 0.03, "step": 44350 }, { "epoch": 4.019847743338771, "grad_norm": 0.3888258635997772, "learning_rate": 5.628067433898102e-06, "loss": 0.0308, "step": 44355 }, { "epoch": 4.020300888163857, "grad_norm": 0.3601994812488556, "learning_rate": 5.6230691148315974e-06, "loss": 0.0327, "step": 44360 }, { "epoch": 4.020754032988943, "grad_norm": 0.285379022359848, "learning_rate": 5.618072735014887e-06, "loss": 0.032, "step": 44365 }, { "epoch": 4.02120717781403, "grad_norm": 0.36918991804122925, "learning_rate": 5.61307829494801e-06, "loss": 0.038, "step": 44370 }, { "epoch": 4.021660322639115, "grad_norm": 0.3692380487918854, "learning_rate": 5.608085795130802e-06, "loss": 0.0315, "step": 44375 }, { "epoch": 4.022113467464202, "grad_norm": 0.33402204513549805, "learning_rate": 5.60309523606293e-06, "loss": 0.0307, "step": 44380 }, { "epoch": 4.022566612289288, "grad_norm": 0.23395341634750366, "learning_rate": 5.5981066182438454e-06, "loss": 0.0316, "step": 44385 }, { "epoch": 4.0230197571143735, "grad_norm": 0.32573240995407104, "learning_rate": 5.593119942172809e-06, "loss": 0.0401, "step": 44390 }, { "epoch": 4.02347290193946, "grad_norm": 0.4976579546928406, "learning_rate": 5.588135208348888e-06, "loss": 0.033, "step": 44395 }, { "epoch": 4.023926046764546, "grad_norm": 0.2791353166103363, "learning_rate": 5.583152417270954e-06, "loss": 0.0308, "step": 44400 }, { "epoch": 4.024379191589632, "grad_norm": 0.33312731981277466, "learning_rate": 5.578171569437693e-06, "loss": 0.0308, "step": 44405 }, { "epoch": 4.024832336414718, "grad_norm": 0.3842225968837738, "learning_rate": 5.57319266534759e-06, "loss": 0.0352, "step": 44410 }, { "epoch": 4.025285481239805, "grad_norm": 0.39851924777030945, "learning_rate": 5.56821570549893e-06, "loss": 0.0312, "step": 44415 }, { "epoch": 4.02573862606489, "grad_norm": 0.35036957263946533, "learning_rate": 5.563240690389812e-06, "loss": 0.032, "step": 44420 }, { "epoch": 4.026191770889977, "grad_norm": 0.4065752625465393, "learning_rate": 5.558267620518129e-06, "loss": 0.038, "step": 44425 }, { "epoch": 4.026644915715062, "grad_norm": 0.5615440011024475, "learning_rate": 5.553296496381602e-06, "loss": 0.0392, "step": 44430 }, { "epoch": 4.0270980605401485, "grad_norm": 0.36072686314582825, "learning_rate": 5.548327318477736e-06, "loss": 0.0326, "step": 44435 }, { "epoch": 4.027551205365235, "grad_norm": 0.40734314918518066, "learning_rate": 5.5433600873038476e-06, "loss": 0.0334, "step": 44440 }, { "epoch": 4.02800435019032, "grad_norm": 0.3247428834438324, "learning_rate": 5.538394803357058e-06, "loss": 0.0376, "step": 44445 }, { "epoch": 4.028457495015407, "grad_norm": 0.34264063835144043, "learning_rate": 5.533431467134298e-06, "loss": 0.0306, "step": 44450 }, { "epoch": 4.028910639840493, "grad_norm": 0.4061838388442993, "learning_rate": 5.528470079132286e-06, "loss": 0.0324, "step": 44455 }, { "epoch": 4.029363784665579, "grad_norm": 0.4348844289779663, "learning_rate": 5.523510639847582e-06, "loss": 0.045, "step": 44460 }, { "epoch": 4.029816929490665, "grad_norm": 0.6912844181060791, "learning_rate": 5.518553149776515e-06, "loss": 0.0421, "step": 44465 }, { "epoch": 4.030270074315752, "grad_norm": 0.3943496346473694, "learning_rate": 5.513597609415236e-06, "loss": 0.032, "step": 44470 }, { "epoch": 4.030723219140837, "grad_norm": 0.28578492999076843, "learning_rate": 5.508644019259696e-06, "loss": 0.0306, "step": 44475 }, { "epoch": 4.0311763639659235, "grad_norm": 0.2941831648349762, "learning_rate": 5.503692379805642e-06, "loss": 0.0328, "step": 44480 }, { "epoch": 4.03162950879101, "grad_norm": 0.37419262528419495, "learning_rate": 5.498742691548653e-06, "loss": 0.0299, "step": 44485 }, { "epoch": 4.032082653616095, "grad_norm": 0.3114222586154938, "learning_rate": 5.493794954984089e-06, "loss": 0.0314, "step": 44490 }, { "epoch": 4.032535798441182, "grad_norm": 0.32633092999458313, "learning_rate": 5.4888491706071135e-06, "loss": 0.0321, "step": 44495 }, { "epoch": 4.032988943266268, "grad_norm": 0.3064146339893341, "learning_rate": 5.48390533891272e-06, "loss": 0.0333, "step": 44500 }, { "epoch": 4.033442088091354, "grad_norm": 0.38371533155441284, "learning_rate": 5.478963460395667e-06, "loss": 0.0339, "step": 44505 }, { "epoch": 4.03389523291644, "grad_norm": 0.4100285470485687, "learning_rate": 5.4740235355505544e-06, "loss": 0.0308, "step": 44510 }, { "epoch": 4.0343483777415265, "grad_norm": 0.2748866379261017, "learning_rate": 5.46908556487177e-06, "loss": 0.0363, "step": 44515 }, { "epoch": 4.034801522566612, "grad_norm": 0.3771367073059082, "learning_rate": 5.464149548853495e-06, "loss": 0.0323, "step": 44520 }, { "epoch": 4.0352546673916985, "grad_norm": 0.3356402516365051, "learning_rate": 5.459215487989752e-06, "loss": 0.0326, "step": 44525 }, { "epoch": 4.035707812216785, "grad_norm": 0.37189605832099915, "learning_rate": 5.454283382774314e-06, "loss": 0.0314, "step": 44530 }, { "epoch": 4.03616095704187, "grad_norm": 0.3255707323551178, "learning_rate": 5.449353233700813e-06, "loss": 0.0301, "step": 44535 }, { "epoch": 4.036614101866957, "grad_norm": 0.3169669806957245, "learning_rate": 5.444425041262647e-06, "loss": 0.0349, "step": 44540 }, { "epoch": 4.037067246692043, "grad_norm": 0.3707304000854492, "learning_rate": 5.439498805953025e-06, "loss": 0.0305, "step": 44545 }, { "epoch": 4.037520391517129, "grad_norm": 0.2933160364627838, "learning_rate": 5.43457452826499e-06, "loss": 0.0296, "step": 44550 }, { "epoch": 4.037973536342215, "grad_norm": 0.4166324734687805, "learning_rate": 5.429652208691338e-06, "loss": 0.0346, "step": 44555 }, { "epoch": 4.038426681167301, "grad_norm": 0.40482503175735474, "learning_rate": 5.424731847724715e-06, "loss": 0.0325, "step": 44560 }, { "epoch": 4.038879825992387, "grad_norm": 0.35407301783561707, "learning_rate": 5.4198134458575435e-06, "loss": 0.029, "step": 44565 }, { "epoch": 4.0393329708174734, "grad_norm": 0.2911752462387085, "learning_rate": 5.414897003582056e-06, "loss": 0.0314, "step": 44570 }, { "epoch": 4.039786115642559, "grad_norm": 0.5298934578895569, "learning_rate": 5.409982521390314e-06, "loss": 0.0338, "step": 44575 }, { "epoch": 4.040239260467645, "grad_norm": 0.27082177996635437, "learning_rate": 5.405069999774126e-06, "loss": 0.0309, "step": 44580 }, { "epoch": 4.040692405292732, "grad_norm": 0.33432087302207947, "learning_rate": 5.400159439225166e-06, "loss": 0.0319, "step": 44585 }, { "epoch": 4.041145550117817, "grad_norm": 0.36029288172721863, "learning_rate": 5.395250840234872e-06, "loss": 0.0413, "step": 44590 }, { "epoch": 4.041598694942904, "grad_norm": 0.35520726442337036, "learning_rate": 5.3903442032944975e-06, "loss": 0.0325, "step": 44595 }, { "epoch": 4.04205183976799, "grad_norm": 0.31737208366394043, "learning_rate": 5.385439528895117e-06, "loss": 0.0284, "step": 44600 }, { "epoch": 4.042504984593076, "grad_norm": 0.6402997970581055, "learning_rate": 5.380536817527565e-06, "loss": 0.0479, "step": 44605 }, { "epoch": 4.042958129418162, "grad_norm": 0.3407716155052185, "learning_rate": 5.3756360696825266e-06, "loss": 0.0283, "step": 44610 }, { "epoch": 4.043411274243248, "grad_norm": 0.28006747364997864, "learning_rate": 5.3707372858504665e-06, "loss": 0.0317, "step": 44615 }, { "epoch": 4.043864419068334, "grad_norm": 0.5695580840110779, "learning_rate": 5.3658404665216465e-06, "loss": 0.0483, "step": 44620 }, { "epoch": 4.04431756389342, "grad_norm": 0.4524097144603729, "learning_rate": 5.360945612186163e-06, "loss": 0.033, "step": 44625 }, { "epoch": 4.044770708718507, "grad_norm": 0.28028687834739685, "learning_rate": 5.356052723333871e-06, "loss": 0.032, "step": 44630 }, { "epoch": 4.045223853543592, "grad_norm": 0.35729682445526123, "learning_rate": 5.351161800454465e-06, "loss": 0.0448, "step": 44635 }, { "epoch": 4.045676998368679, "grad_norm": 0.37348124384880066, "learning_rate": 5.346272844037431e-06, "loss": 0.0314, "step": 44640 }, { "epoch": 4.046130143193765, "grad_norm": 0.3953837454319, "learning_rate": 5.341385854572056e-06, "loss": 0.0364, "step": 44645 }, { "epoch": 4.046583288018851, "grad_norm": 0.4019131660461426, "learning_rate": 5.336500832547428e-06, "loss": 0.03, "step": 44650 }, { "epoch": 4.047036432843937, "grad_norm": 0.3251018524169922, "learning_rate": 5.331617778452438e-06, "loss": 0.0304, "step": 44655 }, { "epoch": 4.047489577669023, "grad_norm": 0.31721386313438416, "learning_rate": 5.3267366927757965e-06, "loss": 0.0323, "step": 44660 }, { "epoch": 4.047942722494109, "grad_norm": 0.551552951335907, "learning_rate": 5.321857576005998e-06, "loss": 0.0323, "step": 44665 }, { "epoch": 4.048395867319195, "grad_norm": 0.5363367795944214, "learning_rate": 5.316980428631346e-06, "loss": 0.0411, "step": 44670 }, { "epoch": 4.048849012144282, "grad_norm": 0.3509891927242279, "learning_rate": 5.312105251139945e-06, "loss": 0.0335, "step": 44675 }, { "epoch": 4.049302156969367, "grad_norm": 0.31928959488868713, "learning_rate": 5.3072320440197e-06, "loss": 0.0338, "step": 44680 }, { "epoch": 4.049755301794454, "grad_norm": 0.397549033164978, "learning_rate": 5.302360807758336e-06, "loss": 0.0302, "step": 44685 }, { "epoch": 4.050208446619539, "grad_norm": 0.4043126404285431, "learning_rate": 5.297491542843361e-06, "loss": 0.034, "step": 44690 }, { "epoch": 4.050661591444626, "grad_norm": 0.31732887029647827, "learning_rate": 5.29262424976209e-06, "loss": 0.0302, "step": 44695 }, { "epoch": 4.051114736269712, "grad_norm": 0.37676766514778137, "learning_rate": 5.287758929001649e-06, "loss": 0.0321, "step": 44700 }, { "epoch": 4.0515678810947975, "grad_norm": 0.4736119508743286, "learning_rate": 5.282895581048952e-06, "loss": 0.0324, "step": 44705 }, { "epoch": 4.052021025919884, "grad_norm": 0.3505669832229614, "learning_rate": 5.278034206390736e-06, "loss": 0.0333, "step": 44710 }, { "epoch": 4.05247417074497, "grad_norm": 0.406840980052948, "learning_rate": 5.273174805513525e-06, "loss": 0.0309, "step": 44715 }, { "epoch": 4.052927315570056, "grad_norm": 0.4674794375896454, "learning_rate": 5.268317378903648e-06, "loss": 0.0317, "step": 44720 }, { "epoch": 4.053380460395142, "grad_norm": 0.4180234670639038, "learning_rate": 5.263461927047237e-06, "loss": 0.0294, "step": 44725 }, { "epoch": 4.053833605220229, "grad_norm": 0.5783466696739197, "learning_rate": 5.2586084504302215e-06, "loss": 0.0314, "step": 44730 }, { "epoch": 4.054286750045314, "grad_norm": 0.42847830057144165, "learning_rate": 5.253756949538352e-06, "loss": 0.0311, "step": 44735 }, { "epoch": 4.0547398948704005, "grad_norm": 0.28517967462539673, "learning_rate": 5.248907424857163e-06, "loss": 0.0307, "step": 44740 }, { "epoch": 4.055193039695487, "grad_norm": 0.3040149211883545, "learning_rate": 5.244059876871996e-06, "loss": 0.0306, "step": 44745 }, { "epoch": 4.0556461845205725, "grad_norm": 0.45143160223960876, "learning_rate": 5.2392143060679886e-06, "loss": 0.0336, "step": 44750 }, { "epoch": 4.056099329345659, "grad_norm": 0.38512280583381653, "learning_rate": 5.2343707129301005e-06, "loss": 0.0299, "step": 44755 }, { "epoch": 4.056552474170745, "grad_norm": 0.3163272440433502, "learning_rate": 5.2295290979430726e-06, "loss": 0.0543, "step": 44760 }, { "epoch": 4.057005618995831, "grad_norm": 0.3932570219039917, "learning_rate": 5.224689461591456e-06, "loss": 0.0288, "step": 44765 }, { "epoch": 4.057458763820917, "grad_norm": 0.44264018535614014, "learning_rate": 5.219851804359602e-06, "loss": 0.0318, "step": 44770 }, { "epoch": 4.057911908646004, "grad_norm": 0.3464570641517639, "learning_rate": 5.2150161267316614e-06, "loss": 0.0302, "step": 44775 }, { "epoch": 4.058365053471089, "grad_norm": 0.3860896825790405, "learning_rate": 5.210182429191602e-06, "loss": 0.0285, "step": 44780 }, { "epoch": 4.0588181982961755, "grad_norm": 0.41685089468955994, "learning_rate": 5.205350712223173e-06, "loss": 0.0341, "step": 44785 }, { "epoch": 4.059271343121262, "grad_norm": 0.36129239201545715, "learning_rate": 5.200520976309939e-06, "loss": 0.0296, "step": 44790 }, { "epoch": 4.059724487946347, "grad_norm": 0.38311970233917236, "learning_rate": 5.195693221935258e-06, "loss": 0.0392, "step": 44795 }, { "epoch": 4.060177632771434, "grad_norm": 0.23939785361289978, "learning_rate": 5.190867449582285e-06, "loss": 0.0311, "step": 44800 }, { "epoch": 4.06063077759652, "grad_norm": 0.3727918267250061, "learning_rate": 5.186043659734011e-06, "loss": 0.0308, "step": 44805 }, { "epoch": 4.061083922421606, "grad_norm": 0.3742983937263489, "learning_rate": 5.1812218528731695e-06, "loss": 0.0314, "step": 44810 }, { "epoch": 4.061537067246692, "grad_norm": 0.3140890300273895, "learning_rate": 5.176402029482352e-06, "loss": 0.031, "step": 44815 }, { "epoch": 4.061990212071779, "grad_norm": 0.35894912481307983, "learning_rate": 5.171584190043921e-06, "loss": 0.0321, "step": 44820 }, { "epoch": 4.062443356896864, "grad_norm": 0.40967753529548645, "learning_rate": 5.166768335040043e-06, "loss": 0.0325, "step": 44825 }, { "epoch": 4.0628965017219505, "grad_norm": 0.2622200846672058, "learning_rate": 5.1619544649527036e-06, "loss": 0.0302, "step": 44830 }, { "epoch": 4.063349646547036, "grad_norm": 0.3962791860103607, "learning_rate": 5.157142580263658e-06, "loss": 0.0315, "step": 44835 }, { "epoch": 4.063802791372122, "grad_norm": 0.4286402463912964, "learning_rate": 5.152332681454497e-06, "loss": 0.0384, "step": 44840 }, { "epoch": 4.064255936197209, "grad_norm": 0.26322704553604126, "learning_rate": 5.14752476900659e-06, "loss": 0.0338, "step": 44845 }, { "epoch": 4.064709081022294, "grad_norm": 0.3218093514442444, "learning_rate": 5.142718843401106e-06, "loss": 0.0306, "step": 44850 }, { "epoch": 4.065162225847381, "grad_norm": 0.43427199125289917, "learning_rate": 5.137914905119051e-06, "loss": 0.0325, "step": 44855 }, { "epoch": 4.065615370672467, "grad_norm": 0.376979798078537, "learning_rate": 5.13311295464117e-06, "loss": 0.0306, "step": 44860 }, { "epoch": 4.066068515497553, "grad_norm": 0.3579162061214447, "learning_rate": 5.128312992448067e-06, "loss": 0.0305, "step": 44865 }, { "epoch": 4.066521660322639, "grad_norm": 0.295926034450531, "learning_rate": 5.12351501902012e-06, "loss": 0.0313, "step": 44870 }, { "epoch": 4.0669748051477255, "grad_norm": 0.3494111895561218, "learning_rate": 5.118719034837502e-06, "loss": 0.0311, "step": 44875 }, { "epoch": 4.067427949972811, "grad_norm": 0.36233434081077576, "learning_rate": 5.113925040380216e-06, "loss": 0.0335, "step": 44880 }, { "epoch": 4.067881094797897, "grad_norm": 0.2992366552352905, "learning_rate": 5.10913303612802e-06, "loss": 0.0308, "step": 44885 }, { "epoch": 4.068334239622984, "grad_norm": 0.33275097608566284, "learning_rate": 5.104343022560523e-06, "loss": 0.0325, "step": 44890 }, { "epoch": 4.068787384448069, "grad_norm": 0.4513072073459625, "learning_rate": 5.0995550001571e-06, "loss": 0.0311, "step": 44895 }, { "epoch": 4.069240529273156, "grad_norm": 0.4456447660923004, "learning_rate": 5.094768969396934e-06, "loss": 0.0321, "step": 44900 }, { "epoch": 4.069693674098242, "grad_norm": 0.4657861590385437, "learning_rate": 5.089984930759034e-06, "loss": 0.029, "step": 44905 }, { "epoch": 4.070146818923328, "grad_norm": 0.48586997389793396, "learning_rate": 5.0852028847221565e-06, "loss": 0.0358, "step": 44910 }, { "epoch": 4.070599963748414, "grad_norm": 0.3855077922344208, "learning_rate": 5.080422831764914e-06, "loss": 0.031, "step": 44915 }, { "epoch": 4.0710531085735004, "grad_norm": 0.3013556897640228, "learning_rate": 5.07564477236569e-06, "loss": 0.0294, "step": 44920 }, { "epoch": 4.071506253398586, "grad_norm": 0.36709102988243103, "learning_rate": 5.070868707002663e-06, "loss": 0.0313, "step": 44925 }, { "epoch": 4.071959398223672, "grad_norm": 0.42924413084983826, "learning_rate": 5.066094636153848e-06, "loss": 0.0366, "step": 44930 }, { "epoch": 4.072412543048759, "grad_norm": 0.2793305516242981, "learning_rate": 5.061322560297008e-06, "loss": 0.0326, "step": 44935 }, { "epoch": 4.072865687873844, "grad_norm": 0.41356804966926575, "learning_rate": 5.056552479909752e-06, "loss": 0.0315, "step": 44940 }, { "epoch": 4.073318832698931, "grad_norm": 0.6101362705230713, "learning_rate": 5.051784395469464e-06, "loss": 0.0327, "step": 44945 }, { "epoch": 4.073771977524017, "grad_norm": 0.36660492420196533, "learning_rate": 5.047018307453333e-06, "loss": 0.0347, "step": 44950 }, { "epoch": 4.074225122349103, "grad_norm": 0.2724398672580719, "learning_rate": 5.042254216338366e-06, "loss": 0.0292, "step": 44955 }, { "epoch": 4.074678267174189, "grad_norm": 0.40414366126060486, "learning_rate": 5.037492122601331e-06, "loss": 0.0399, "step": 44960 }, { "epoch": 4.0751314119992745, "grad_norm": 0.4464336335659027, "learning_rate": 5.032732026718842e-06, "loss": 0.0326, "step": 44965 }, { "epoch": 4.075584556824361, "grad_norm": 0.3773195445537567, "learning_rate": 5.027973929167279e-06, "loss": 0.0342, "step": 44970 }, { "epoch": 4.076037701649447, "grad_norm": 0.39197689294815063, "learning_rate": 5.023217830422833e-06, "loss": 0.0304, "step": 44975 }, { "epoch": 4.076490846474533, "grad_norm": 0.30372709035873413, "learning_rate": 5.018463730961509e-06, "loss": 0.0315, "step": 44980 }, { "epoch": 4.076943991299619, "grad_norm": 0.34423714876174927, "learning_rate": 5.01371163125908e-06, "loss": 0.0295, "step": 44985 }, { "epoch": 4.077397136124706, "grad_norm": 0.5092384219169617, "learning_rate": 5.008961531791151e-06, "loss": 0.0326, "step": 44990 }, { "epoch": 4.077850280949791, "grad_norm": 0.37274405360221863, "learning_rate": 5.004213433033112e-06, "loss": 0.0314, "step": 44995 }, { "epoch": 4.078303425774878, "grad_norm": 0.3167955279350281, "learning_rate": 4.999467335460151e-06, "loss": 0.0323, "step": 45000 }, { "epoch": 4.078756570599964, "grad_norm": 0.3464316427707672, "learning_rate": 4.994723239547253e-06, "loss": 0.0318, "step": 45005 }, { "epoch": 4.0792097154250495, "grad_norm": 0.4623419940471649, "learning_rate": 4.989981145769224e-06, "loss": 0.0318, "step": 45010 }, { "epoch": 4.079662860250136, "grad_norm": 0.36388394236564636, "learning_rate": 4.985241054600645e-06, "loss": 0.0312, "step": 45015 }, { "epoch": 4.080116005075222, "grad_norm": 0.3249005675315857, "learning_rate": 4.98050296651591e-06, "loss": 0.0383, "step": 45020 }, { "epoch": 4.080569149900308, "grad_norm": 0.33607107400894165, "learning_rate": 4.975766881989203e-06, "loss": 0.0331, "step": 45025 }, { "epoch": 4.081022294725394, "grad_norm": 0.3387124836444855, "learning_rate": 4.97103280149451e-06, "loss": 0.0328, "step": 45030 }, { "epoch": 4.081475439550481, "grad_norm": 0.3409248888492584, "learning_rate": 4.966300725505632e-06, "loss": 0.031, "step": 45035 }, { "epoch": 4.081928584375566, "grad_norm": 0.3981485962867737, "learning_rate": 4.9615706544961476e-06, "loss": 0.0309, "step": 45040 }, { "epoch": 4.082381729200653, "grad_norm": 0.35735374689102173, "learning_rate": 4.956842588939447e-06, "loss": 0.0306, "step": 45045 }, { "epoch": 4.082834874025739, "grad_norm": 0.3922411799430847, "learning_rate": 4.952116529308718e-06, "loss": 0.0334, "step": 45050 }, { "epoch": 4.0832880188508245, "grad_norm": 0.27690085768699646, "learning_rate": 4.9473924760769365e-06, "loss": 0.0387, "step": 45055 }, { "epoch": 4.083741163675911, "grad_norm": 0.32586491107940674, "learning_rate": 4.9426704297169e-06, "loss": 0.0324, "step": 45060 }, { "epoch": 4.084194308500997, "grad_norm": 0.406619131565094, "learning_rate": 4.937950390701187e-06, "loss": 0.0337, "step": 45065 }, { "epoch": 4.084647453326083, "grad_norm": 0.3829565942287445, "learning_rate": 4.933232359502182e-06, "loss": 0.0326, "step": 45070 }, { "epoch": 4.085100598151169, "grad_norm": 0.2974691689014435, "learning_rate": 4.928516336592068e-06, "loss": 0.0332, "step": 45075 }, { "epoch": 4.085553742976256, "grad_norm": 0.5336580276489258, "learning_rate": 4.9238023224428135e-06, "loss": 0.0339, "step": 45080 }, { "epoch": 4.086006887801341, "grad_norm": 0.437174528837204, "learning_rate": 4.919090317526218e-06, "loss": 0.0305, "step": 45085 }, { "epoch": 4.0864600326264275, "grad_norm": 0.4301307201385498, "learning_rate": 4.914380322313852e-06, "loss": 0.0331, "step": 45090 }, { "epoch": 4.086913177451514, "grad_norm": 0.35056477785110474, "learning_rate": 4.909672337277094e-06, "loss": 0.0309, "step": 45095 }, { "epoch": 4.0873663222765995, "grad_norm": 0.3492966294288635, "learning_rate": 4.90496636288712e-06, "loss": 0.0311, "step": 45100 }, { "epoch": 4.087819467101686, "grad_norm": 0.29716891050338745, "learning_rate": 4.900262399614897e-06, "loss": 0.0318, "step": 45105 }, { "epoch": 4.088272611926771, "grad_norm": 0.3079107701778412, "learning_rate": 4.8955604479312166e-06, "loss": 0.0319, "step": 45110 }, { "epoch": 4.088725756751858, "grad_norm": 0.34173446893692017, "learning_rate": 4.890860508306641e-06, "loss": 0.0317, "step": 45115 }, { "epoch": 4.089178901576944, "grad_norm": 0.3869340121746063, "learning_rate": 4.886162581211545e-06, "loss": 0.0324, "step": 45120 }, { "epoch": 4.08963204640203, "grad_norm": 0.4639248549938202, "learning_rate": 4.881466667116097e-06, "loss": 0.04, "step": 45125 }, { "epoch": 4.090085191227116, "grad_norm": 0.5817530751228333, "learning_rate": 4.8767727664902594e-06, "loss": 0.0375, "step": 45130 }, { "epoch": 4.0905383360522025, "grad_norm": 0.3718843460083008, "learning_rate": 4.8720808798038115e-06, "loss": 0.0347, "step": 45135 }, { "epoch": 4.090991480877288, "grad_norm": 0.3476717174053192, "learning_rate": 4.867391007526314e-06, "loss": 0.0356, "step": 45140 }, { "epoch": 4.091444625702374, "grad_norm": 0.3363721966743469, "learning_rate": 4.862703150127123e-06, "loss": 0.0292, "step": 45145 }, { "epoch": 4.091897770527461, "grad_norm": 0.6611997485160828, "learning_rate": 4.8580173080754245e-06, "loss": 0.0395, "step": 45150 }, { "epoch": 4.092350915352546, "grad_norm": 0.3073831796646118, "learning_rate": 4.853333481840147e-06, "loss": 0.0382, "step": 45155 }, { "epoch": 4.092804060177633, "grad_norm": 0.370545893907547, "learning_rate": 4.848651671890078e-06, "loss": 0.0318, "step": 45160 }, { "epoch": 4.093257205002719, "grad_norm": 0.30988410115242004, "learning_rate": 4.843971878693751e-06, "loss": 0.0367, "step": 45165 }, { "epoch": 4.093710349827805, "grad_norm": 0.49257805943489075, "learning_rate": 4.8392941027195295e-06, "loss": 0.0313, "step": 45170 }, { "epoch": 4.094163494652891, "grad_norm": 0.3160229027271271, "learning_rate": 4.834618344435585e-06, "loss": 0.0357, "step": 45175 }, { "epoch": 4.0946166394779775, "grad_norm": 0.28555402159690857, "learning_rate": 4.8299446043098384e-06, "loss": 0.0335, "step": 45180 }, { "epoch": 4.095069784303063, "grad_norm": 0.30058521032333374, "learning_rate": 4.82527288281007e-06, "loss": 0.038, "step": 45185 }, { "epoch": 4.095522929128149, "grad_norm": 0.5155475735664368, "learning_rate": 4.820603180403796e-06, "loss": 0.0337, "step": 45190 }, { "epoch": 4.095976073953236, "grad_norm": 0.4330688416957855, "learning_rate": 4.815935497558382e-06, "loss": 0.0328, "step": 45195 }, { "epoch": 4.096429218778321, "grad_norm": 0.37273216247558594, "learning_rate": 4.811269834740972e-06, "loss": 0.0297, "step": 45200 }, { "epoch": 4.096882363603408, "grad_norm": 0.3966674506664276, "learning_rate": 4.806606192418492e-06, "loss": 0.0326, "step": 45205 }, { "epoch": 4.097335508428494, "grad_norm": 0.4173596203327179, "learning_rate": 4.801944571057704e-06, "loss": 0.031, "step": 45210 }, { "epoch": 4.09778865325358, "grad_norm": 0.3705933094024658, "learning_rate": 4.797284971125118e-06, "loss": 0.0311, "step": 45215 }, { "epoch": 4.098241798078666, "grad_norm": 0.4639393985271454, "learning_rate": 4.7926273930870905e-06, "loss": 0.0333, "step": 45220 }, { "epoch": 4.0986949429037525, "grad_norm": 0.3728889226913452, "learning_rate": 4.787971837409741e-06, "loss": 0.03, "step": 45225 }, { "epoch": 4.099148087728838, "grad_norm": 0.4598480761051178, "learning_rate": 4.783318304558998e-06, "loss": 0.0464, "step": 45230 }, { "epoch": 4.099601232553924, "grad_norm": 0.3792603015899658, "learning_rate": 4.778666795000605e-06, "loss": 0.031, "step": 45235 }, { "epoch": 4.10005437737901, "grad_norm": 0.30632638931274414, "learning_rate": 4.774017309200065e-06, "loss": 0.0341, "step": 45240 }, { "epoch": 4.100507522204096, "grad_norm": 0.3468043804168701, "learning_rate": 4.769369847622715e-06, "loss": 0.0342, "step": 45245 }, { "epoch": 4.100960667029183, "grad_norm": 0.26442354917526245, "learning_rate": 4.76472441073367e-06, "loss": 0.0304, "step": 45250 }, { "epoch": 4.101413811854268, "grad_norm": 0.3270632326602936, "learning_rate": 4.76008099899784e-06, "loss": 0.0305, "step": 45255 }, { "epoch": 4.101866956679355, "grad_norm": 0.40696981549263, "learning_rate": 4.75543961287995e-06, "loss": 0.0295, "step": 45260 }, { "epoch": 4.102320101504441, "grad_norm": 0.349313348531723, "learning_rate": 4.750800252844509e-06, "loss": 0.0317, "step": 45265 }, { "epoch": 4.102773246329527, "grad_norm": 0.35325098037719727, "learning_rate": 4.7461629193558235e-06, "loss": 0.0325, "step": 45270 }, { "epoch": 4.103226391154613, "grad_norm": 0.332505464553833, "learning_rate": 4.741527612878002e-06, "loss": 0.0309, "step": 45275 }, { "epoch": 4.103679535979699, "grad_norm": 0.3474579453468323, "learning_rate": 4.736894333874936e-06, "loss": 0.0304, "step": 45280 }, { "epoch": 4.104132680804785, "grad_norm": 0.3284897804260254, "learning_rate": 4.732263082810343e-06, "loss": 0.0303, "step": 45285 }, { "epoch": 4.104585825629871, "grad_norm": 0.3064083755016327, "learning_rate": 4.727633860147712e-06, "loss": 0.0295, "step": 45290 }, { "epoch": 4.105038970454958, "grad_norm": 0.43895357847213745, "learning_rate": 4.723006666350338e-06, "loss": 0.0353, "step": 45295 }, { "epoch": 4.105492115280043, "grad_norm": 0.44211727380752563, "learning_rate": 4.718381501881311e-06, "loss": 0.0351, "step": 45300 }, { "epoch": 4.10594526010513, "grad_norm": 0.5056605935096741, "learning_rate": 4.713758367203511e-06, "loss": 0.0312, "step": 45305 }, { "epoch": 4.106398404930216, "grad_norm": 0.3208152949810028, "learning_rate": 4.709137262779639e-06, "loss": 0.0308, "step": 45310 }, { "epoch": 4.1068515497553015, "grad_norm": 0.3491743206977844, "learning_rate": 4.704518189072171e-06, "loss": 0.0328, "step": 45315 }, { "epoch": 4.107304694580388, "grad_norm": 0.30128929018974304, "learning_rate": 4.699901146543381e-06, "loss": 0.0349, "step": 45320 }, { "epoch": 4.107757839405474, "grad_norm": 0.3585914373397827, "learning_rate": 4.695286135655349e-06, "loss": 0.0326, "step": 45325 }, { "epoch": 4.10821098423056, "grad_norm": 0.3599592447280884, "learning_rate": 4.690673156869938e-06, "loss": 0.0461, "step": 45330 }, { "epoch": 4.108664129055646, "grad_norm": 0.3909655511379242, "learning_rate": 4.686062210648828e-06, "loss": 0.0299, "step": 45335 }, { "epoch": 4.109117273880733, "grad_norm": 0.4225509762763977, "learning_rate": 4.6814532974534806e-06, "loss": 0.0312, "step": 45340 }, { "epoch": 4.109570418705818, "grad_norm": 0.398563951253891, "learning_rate": 4.6768464177451535e-06, "loss": 0.0327, "step": 45345 }, { "epoch": 4.110023563530905, "grad_norm": 0.321887731552124, "learning_rate": 4.672241571984909e-06, "loss": 0.0317, "step": 45350 }, { "epoch": 4.110476708355991, "grad_norm": 0.4308566153049469, "learning_rate": 4.6676387606336e-06, "loss": 0.0338, "step": 45355 }, { "epoch": 4.1109298531810765, "grad_norm": 0.3306896388530731, "learning_rate": 4.6630379841518686e-06, "loss": 0.0315, "step": 45360 }, { "epoch": 4.111382998006163, "grad_norm": 0.328136682510376, "learning_rate": 4.658439243000179e-06, "loss": 0.0318, "step": 45365 }, { "epoch": 4.111836142831248, "grad_norm": 0.32168513536453247, "learning_rate": 4.653842537638767e-06, "loss": 0.0312, "step": 45370 }, { "epoch": 4.112289287656335, "grad_norm": 0.4044472277164459, "learning_rate": 4.64924786852767e-06, "loss": 0.0305, "step": 45375 }, { "epoch": 4.112742432481421, "grad_norm": 0.32505616545677185, "learning_rate": 4.644655236126727e-06, "loss": 0.0325, "step": 45380 }, { "epoch": 4.113195577306507, "grad_norm": 0.3587111234664917, "learning_rate": 4.640064640895564e-06, "loss": 0.0307, "step": 45385 }, { "epoch": 4.113648722131593, "grad_norm": 0.4733003079891205, "learning_rate": 4.6354760832936186e-06, "loss": 0.0335, "step": 45390 }, { "epoch": 4.11410186695668, "grad_norm": 0.303584486246109, "learning_rate": 4.630889563780111e-06, "loss": 0.0323, "step": 45395 }, { "epoch": 4.114555011781765, "grad_norm": 0.3914554715156555, "learning_rate": 4.626305082814056e-06, "loss": 0.0317, "step": 45400 }, { "epoch": 4.1150081566068515, "grad_norm": 0.4577954411506653, "learning_rate": 4.62172264085429e-06, "loss": 0.043, "step": 45405 }, { "epoch": 4.115461301431938, "grad_norm": 0.44839608669281006, "learning_rate": 4.617142238359396e-06, "loss": 0.03, "step": 45410 }, { "epoch": 4.115914446257023, "grad_norm": 0.426687091588974, "learning_rate": 4.612563875787803e-06, "loss": 0.0362, "step": 45415 }, { "epoch": 4.11636759108211, "grad_norm": 0.31318199634552, "learning_rate": 4.607987553597712e-06, "loss": 0.0479, "step": 45420 }, { "epoch": 4.116820735907196, "grad_norm": 0.30328336358070374, "learning_rate": 4.603413272247112e-06, "loss": 0.0339, "step": 45425 }, { "epoch": 4.117273880732282, "grad_norm": 0.3183704912662506, "learning_rate": 4.598841032193821e-06, "loss": 0.0314, "step": 45430 }, { "epoch": 4.117727025557368, "grad_norm": 0.39277100563049316, "learning_rate": 4.594270833895401e-06, "loss": 0.0348, "step": 45435 }, { "epoch": 4.1181801703824545, "grad_norm": 0.3289138972759247, "learning_rate": 4.589702677809263e-06, "loss": 0.0317, "step": 45440 }, { "epoch": 4.11863331520754, "grad_norm": 0.31488412618637085, "learning_rate": 4.585136564392578e-06, "loss": 0.0314, "step": 45445 }, { "epoch": 4.1190864600326265, "grad_norm": 0.3647691309452057, "learning_rate": 4.580572494102323e-06, "loss": 0.0318, "step": 45450 }, { "epoch": 4.119539604857713, "grad_norm": 0.3087385594844818, "learning_rate": 4.576010467395286e-06, "loss": 0.0342, "step": 45455 }, { "epoch": 4.119992749682798, "grad_norm": 0.3353462219238281, "learning_rate": 4.571450484728015e-06, "loss": 0.0296, "step": 45460 }, { "epoch": 4.120445894507885, "grad_norm": 0.3931603729724884, "learning_rate": 4.566892546556892e-06, "loss": 0.0305, "step": 45465 }, { "epoch": 4.120899039332971, "grad_norm": 0.32356733083724976, "learning_rate": 4.562336653338067e-06, "loss": 0.0311, "step": 45470 }, { "epoch": 4.121352184158057, "grad_norm": 0.3820483982563019, "learning_rate": 4.5577828055274965e-06, "loss": 0.0304, "step": 45475 }, { "epoch": 4.121805328983143, "grad_norm": 0.37140586972236633, "learning_rate": 4.553231003580943e-06, "loss": 0.0345, "step": 45480 }, { "epoch": 4.1222584738082295, "grad_norm": 0.2953183948993683, "learning_rate": 4.5486812479539315e-06, "loss": 0.0305, "step": 45485 }, { "epoch": 4.122711618633315, "grad_norm": 0.30481937527656555, "learning_rate": 4.544133539101822e-06, "loss": 0.0407, "step": 45490 }, { "epoch": 4.123164763458401, "grad_norm": 0.4290999472141266, "learning_rate": 4.539587877479742e-06, "loss": 0.0336, "step": 45495 }, { "epoch": 4.123617908283487, "grad_norm": 0.3671700656414032, "learning_rate": 4.535044263542615e-06, "loss": 0.0319, "step": 45500 }, { "epoch": 4.124071053108573, "grad_norm": 0.3991177976131439, "learning_rate": 4.5305026977451925e-06, "loss": 0.0318, "step": 45505 }, { "epoch": 4.12452419793366, "grad_norm": 0.3397460877895355, "learning_rate": 4.525963180541964e-06, "loss": 0.0299, "step": 45510 }, { "epoch": 4.124977342758745, "grad_norm": 0.40345659852027893, "learning_rate": 4.521425712387278e-06, "loss": 0.0329, "step": 45515 }, { "epoch": 4.125430487583832, "grad_norm": 0.3179951012134552, "learning_rate": 4.516890293735213e-06, "loss": 0.0304, "step": 45520 }, { "epoch": 4.125883632408918, "grad_norm": 0.32678213715553284, "learning_rate": 4.512356925039693e-06, "loss": 0.0314, "step": 45525 }, { "epoch": 4.126336777234004, "grad_norm": 0.3367776572704315, "learning_rate": 4.507825606754431e-06, "loss": 0.03, "step": 45530 }, { "epoch": 4.12678992205909, "grad_norm": 0.5199195146560669, "learning_rate": 4.503296339332899e-06, "loss": 0.0359, "step": 45535 }, { "epoch": 4.127243066884176, "grad_norm": 0.42823246121406555, "learning_rate": 4.498769123228402e-06, "loss": 0.0335, "step": 45540 }, { "epoch": 4.127696211709262, "grad_norm": 0.42889201641082764, "learning_rate": 4.49424395889402e-06, "loss": 0.0348, "step": 45545 }, { "epoch": 4.128149356534348, "grad_norm": 0.47066542506217957, "learning_rate": 4.489720846782638e-06, "loss": 0.0357, "step": 45550 }, { "epoch": 4.128602501359435, "grad_norm": 0.3795391917228699, "learning_rate": 4.485199787346924e-06, "loss": 0.0306, "step": 45555 }, { "epoch": 4.12905564618452, "grad_norm": 0.454037606716156, "learning_rate": 4.480680781039348e-06, "loss": 0.041, "step": 45560 }, { "epoch": 4.129508791009607, "grad_norm": 0.29715970158576965, "learning_rate": 4.47616382831218e-06, "loss": 0.0319, "step": 45565 }, { "epoch": 4.129961935834693, "grad_norm": 0.351900190114975, "learning_rate": 4.471648929617475e-06, "loss": 0.0321, "step": 45570 }, { "epoch": 4.130415080659779, "grad_norm": 0.33583107590675354, "learning_rate": 4.4671360854070835e-06, "loss": 0.029, "step": 45575 }, { "epoch": 4.130868225484865, "grad_norm": 0.45071911811828613, "learning_rate": 4.462625296132655e-06, "loss": 0.0366, "step": 45580 }, { "epoch": 4.131321370309951, "grad_norm": 0.44953033328056335, "learning_rate": 4.45811656224562e-06, "loss": 0.0328, "step": 45585 }, { "epoch": 4.131774515135037, "grad_norm": 0.3377542495727539, "learning_rate": 4.453609884197235e-06, "loss": 0.031, "step": 45590 }, { "epoch": 4.132227659960123, "grad_norm": 0.3452862501144409, "learning_rate": 4.449105262438516e-06, "loss": 0.0319, "step": 45595 }, { "epoch": 4.13268080478521, "grad_norm": 0.32451331615448, "learning_rate": 4.444602697420289e-06, "loss": 0.0308, "step": 45600 }, { "epoch": 4.133133949610295, "grad_norm": 0.33293673396110535, "learning_rate": 4.440102189593173e-06, "loss": 0.056, "step": 45605 }, { "epoch": 4.133587094435382, "grad_norm": 0.33870023488998413, "learning_rate": 4.435603739407573e-06, "loss": 0.0323, "step": 45610 }, { "epoch": 4.134040239260468, "grad_norm": 0.511642575263977, "learning_rate": 4.4311073473137125e-06, "loss": 0.0388, "step": 45615 }, { "epoch": 4.134493384085554, "grad_norm": 0.3034577965736389, "learning_rate": 4.42661301376158e-06, "loss": 0.0326, "step": 45620 }, { "epoch": 4.13494652891064, "grad_norm": 0.33912691473960876, "learning_rate": 4.422120739200969e-06, "loss": 0.0297, "step": 45625 }, { "epoch": 4.135399673735726, "grad_norm": 0.24615907669067383, "learning_rate": 4.417630524081473e-06, "loss": 0.0338, "step": 45630 }, { "epoch": 4.135852818560812, "grad_norm": 0.3213069438934326, "learning_rate": 4.413142368852469e-06, "loss": 0.0319, "step": 45635 }, { "epoch": 4.136305963385898, "grad_norm": 0.2927902638912201, "learning_rate": 4.408656273963138e-06, "loss": 0.0333, "step": 45640 }, { "epoch": 4.136759108210984, "grad_norm": 0.35605257749557495, "learning_rate": 4.4041722398624505e-06, "loss": 0.0297, "step": 45645 }, { "epoch": 4.13721225303607, "grad_norm": 0.41232457756996155, "learning_rate": 4.399690266999166e-06, "loss": 0.0324, "step": 45650 }, { "epoch": 4.137665397861157, "grad_norm": 0.30813488364219666, "learning_rate": 4.395210355821838e-06, "loss": 0.0305, "step": 45655 }, { "epoch": 4.138118542686242, "grad_norm": 0.39635995030403137, "learning_rate": 4.3907325067788274e-06, "loss": 0.0339, "step": 45660 }, { "epoch": 4.1385716875113285, "grad_norm": 0.3369394838809967, "learning_rate": 4.386256720318277e-06, "loss": 0.0372, "step": 45665 }, { "epoch": 4.139024832336415, "grad_norm": 0.38042983412742615, "learning_rate": 4.381782996888123e-06, "loss": 0.0317, "step": 45670 }, { "epoch": 4.1394779771615005, "grad_norm": 0.4162040650844574, "learning_rate": 4.377311336936093e-06, "loss": 0.0311, "step": 45675 }, { "epoch": 4.139931121986587, "grad_norm": 0.29660680890083313, "learning_rate": 4.3728417409097135e-06, "loss": 0.0303, "step": 45680 }, { "epoch": 4.140384266811673, "grad_norm": 0.407292902469635, "learning_rate": 4.36837420925631e-06, "loss": 0.0318, "step": 45685 }, { "epoch": 4.140837411636759, "grad_norm": 0.44861966371536255, "learning_rate": 4.363908742422993e-06, "loss": 0.0364, "step": 45690 }, { "epoch": 4.141290556461845, "grad_norm": 0.3309824466705322, "learning_rate": 4.35944534085666e-06, "loss": 0.034, "step": 45695 }, { "epoch": 4.141743701286932, "grad_norm": 0.3436022102832794, "learning_rate": 4.354984005004018e-06, "loss": 0.0327, "step": 45700 }, { "epoch": 4.142196846112017, "grad_norm": 0.3615744709968567, "learning_rate": 4.350524735311551e-06, "loss": 0.0547, "step": 45705 }, { "epoch": 4.1426499909371035, "grad_norm": 0.4522641599178314, "learning_rate": 4.346067532225559e-06, "loss": 0.0327, "step": 45710 }, { "epoch": 4.14310313576219, "grad_norm": 0.37734052538871765, "learning_rate": 4.341612396192099e-06, "loss": 0.0386, "step": 45715 }, { "epoch": 4.143556280587275, "grad_norm": 0.3501504361629486, "learning_rate": 4.337159327657059e-06, "loss": 0.0302, "step": 45720 }, { "epoch": 4.144009425412362, "grad_norm": 0.40440553426742554, "learning_rate": 4.3327083270660985e-06, "loss": 0.034, "step": 45725 }, { "epoch": 4.144462570237448, "grad_norm": 0.35291337966918945, "learning_rate": 4.32825939486467e-06, "loss": 0.0327, "step": 45730 }, { "epoch": 4.144915715062534, "grad_norm": 0.366712361574173, "learning_rate": 4.3238125314980425e-06, "loss": 0.0317, "step": 45735 }, { "epoch": 4.14536885988762, "grad_norm": 0.37416815757751465, "learning_rate": 4.319367737411234e-06, "loss": 0.0369, "step": 45740 }, { "epoch": 4.145822004712707, "grad_norm": 0.330998033285141, "learning_rate": 4.314925013049098e-06, "loss": 0.0319, "step": 45745 }, { "epoch": 4.146275149537792, "grad_norm": 0.3831031918525696, "learning_rate": 4.310484358856262e-06, "loss": 0.031, "step": 45750 }, { "epoch": 4.1467282943628785, "grad_norm": 0.30530470609664917, "learning_rate": 4.306045775277137e-06, "loss": 0.0306, "step": 45755 }, { "epoch": 4.147181439187965, "grad_norm": 0.3131036162376404, "learning_rate": 4.3016092627559595e-06, "loss": 0.045, "step": 45760 }, { "epoch": 4.14763458401305, "grad_norm": 0.2749733328819275, "learning_rate": 4.2971748217367105e-06, "loss": 0.0286, "step": 45765 }, { "epoch": 4.148087728838137, "grad_norm": 0.3193877637386322, "learning_rate": 4.292742452663212e-06, "loss": 0.0426, "step": 45770 }, { "epoch": 4.148540873663222, "grad_norm": 0.36102816462516785, "learning_rate": 4.288312155979046e-06, "loss": 0.0353, "step": 45775 }, { "epoch": 4.148994018488309, "grad_norm": 0.4446983337402344, "learning_rate": 4.283883932127597e-06, "loss": 0.0324, "step": 45780 }, { "epoch": 4.149447163313395, "grad_norm": 0.43888065218925476, "learning_rate": 4.27945778155206e-06, "loss": 0.0319, "step": 45785 }, { "epoch": 4.149900308138481, "grad_norm": 0.323311984539032, "learning_rate": 4.27503370469538e-06, "loss": 0.0322, "step": 45790 }, { "epoch": 4.150353452963567, "grad_norm": 0.3628477454185486, "learning_rate": 4.27061170200034e-06, "loss": 0.0316, "step": 45795 }, { "epoch": 4.1508065977886535, "grad_norm": 0.3306914269924164, "learning_rate": 4.266191773909487e-06, "loss": 0.0446, "step": 45800 }, { "epoch": 4.151259742613739, "grad_norm": 0.5397790670394897, "learning_rate": 4.261773920865164e-06, "loss": 0.0343, "step": 45805 }, { "epoch": 4.151712887438825, "grad_norm": 0.342974454164505, "learning_rate": 4.257358143309531e-06, "loss": 0.0317, "step": 45810 }, { "epoch": 4.152166032263912, "grad_norm": 0.34234970808029175, "learning_rate": 4.252944441684495e-06, "loss": 0.0311, "step": 45815 }, { "epoch": 4.152619177088997, "grad_norm": 0.44266337156295776, "learning_rate": 4.248532816431797e-06, "loss": 0.0361, "step": 45820 }, { "epoch": 4.153072321914084, "grad_norm": 0.397113174200058, "learning_rate": 4.2441232679929525e-06, "loss": 0.033, "step": 45825 }, { "epoch": 4.15352546673917, "grad_norm": 0.3756028115749359, "learning_rate": 4.23971579680926e-06, "loss": 0.0287, "step": 45830 }, { "epoch": 4.153978611564256, "grad_norm": 0.2824544310569763, "learning_rate": 4.235310403321841e-06, "loss": 0.0312, "step": 45835 }, { "epoch": 4.154431756389342, "grad_norm": 0.5366391539573669, "learning_rate": 4.230907087971564e-06, "loss": 0.0304, "step": 45840 }, { "epoch": 4.154884901214428, "grad_norm": 0.35107001662254333, "learning_rate": 4.2265058511991335e-06, "loss": 0.0353, "step": 45845 }, { "epoch": 4.155338046039514, "grad_norm": 0.3486330211162567, "learning_rate": 4.222106693445019e-06, "loss": 0.0299, "step": 45850 }, { "epoch": 4.1557911908646, "grad_norm": 0.34999150037765503, "learning_rate": 4.2177096151494845e-06, "loss": 0.0299, "step": 45855 }, { "epoch": 4.156244335689687, "grad_norm": 0.23231524229049683, "learning_rate": 4.213314616752609e-06, "loss": 0.0304, "step": 45860 }, { "epoch": 4.156697480514772, "grad_norm": 0.4523567855358124, "learning_rate": 4.208921698694223e-06, "loss": 0.0334, "step": 45865 }, { "epoch": 4.157150625339859, "grad_norm": 0.5730501413345337, "learning_rate": 4.204530861413986e-06, "loss": 0.0336, "step": 45870 }, { "epoch": 4.157603770164945, "grad_norm": 0.3731224238872528, "learning_rate": 4.200142105351332e-06, "loss": 0.033, "step": 45875 }, { "epoch": 4.158056914990031, "grad_norm": 0.3289867639541626, "learning_rate": 4.19575543094548e-06, "loss": 0.029, "step": 45880 }, { "epoch": 4.158510059815117, "grad_norm": 0.308203786611557, "learning_rate": 4.191370838635473e-06, "loss": 0.0322, "step": 45885 }, { "epoch": 4.158963204640203, "grad_norm": 0.37321537733078003, "learning_rate": 4.186988328860089e-06, "loss": 0.0299, "step": 45890 }, { "epoch": 4.159416349465289, "grad_norm": 0.28769829869270325, "learning_rate": 4.182607902057959e-06, "loss": 0.0293, "step": 45895 }, { "epoch": 4.159869494290375, "grad_norm": 0.34583792090415955, "learning_rate": 4.178229558667465e-06, "loss": 0.0362, "step": 45900 }, { "epoch": 4.160322639115462, "grad_norm": 0.5345744490623474, "learning_rate": 4.173853299126798e-06, "loss": 0.0375, "step": 45905 }, { "epoch": 4.160775783940547, "grad_norm": 0.3788118362426758, "learning_rate": 4.1694791238739265e-06, "loss": 0.0304, "step": 45910 }, { "epoch": 4.161228928765634, "grad_norm": 0.5407845973968506, "learning_rate": 4.165107033346633e-06, "loss": 0.0364, "step": 45915 }, { "epoch": 4.161682073590719, "grad_norm": 0.3829767405986786, "learning_rate": 4.160737027982467e-06, "loss": 0.0312, "step": 45920 }, { "epoch": 4.162135218415806, "grad_norm": 0.2744486331939697, "learning_rate": 4.1563691082187885e-06, "loss": 0.0297, "step": 45925 }, { "epoch": 4.162588363240892, "grad_norm": 0.4546927809715271, "learning_rate": 4.152003274492733e-06, "loss": 0.032, "step": 45930 }, { "epoch": 4.1630415080659775, "grad_norm": 0.4092896580696106, "learning_rate": 4.147639527241231e-06, "loss": 0.0321, "step": 45935 }, { "epoch": 4.163494652891064, "grad_norm": 0.5250483751296997, "learning_rate": 4.143277866901024e-06, "loss": 0.037, "step": 45940 }, { "epoch": 4.16394779771615, "grad_norm": 0.3648969233036041, "learning_rate": 4.138918293908614e-06, "loss": 0.0348, "step": 45945 }, { "epoch": 4.164400942541236, "grad_norm": 0.48667800426483154, "learning_rate": 4.134560808700316e-06, "loss": 0.0317, "step": 45950 }, { "epoch": 4.164854087366322, "grad_norm": 0.34525614976882935, "learning_rate": 4.130205411712226e-06, "loss": 0.0498, "step": 45955 }, { "epoch": 4.165307232191409, "grad_norm": 0.47402751445770264, "learning_rate": 4.1258521033802265e-06, "loss": 0.0427, "step": 45960 }, { "epoch": 4.165760377016494, "grad_norm": 0.3606328070163727, "learning_rate": 4.121500884140012e-06, "loss": 0.0319, "step": 45965 }, { "epoch": 4.166213521841581, "grad_norm": 0.4036041498184204, "learning_rate": 4.1171517544270455e-06, "loss": 0.0305, "step": 45970 }, { "epoch": 4.166666666666667, "grad_norm": 0.35928356647491455, "learning_rate": 4.112804714676594e-06, "loss": 0.0367, "step": 45975 }, { "epoch": 4.1671198114917525, "grad_norm": 0.28510022163391113, "learning_rate": 4.108459765323702e-06, "loss": 0.0323, "step": 45980 }, { "epoch": 4.167572956316839, "grad_norm": 0.3149533271789551, "learning_rate": 4.104116906803218e-06, "loss": 0.0302, "step": 45985 }, { "epoch": 4.168026101141925, "grad_norm": 0.5032216310501099, "learning_rate": 4.099776139549782e-06, "loss": 0.0355, "step": 45990 }, { "epoch": 4.168479245967011, "grad_norm": 0.39808085560798645, "learning_rate": 4.095437463997814e-06, "loss": 0.0308, "step": 45995 }, { "epoch": 4.168932390792097, "grad_norm": 0.362179696559906, "learning_rate": 4.091100880581533e-06, "loss": 0.031, "step": 46000 }, { "epoch": 4.169385535617184, "grad_norm": 0.2833188772201538, "learning_rate": 4.086766389734945e-06, "loss": 0.03, "step": 46005 }, { "epoch": 4.169838680442269, "grad_norm": 0.39813992381095886, "learning_rate": 4.08243399189184e-06, "loss": 0.0361, "step": 46010 }, { "epoch": 4.1702918252673555, "grad_norm": 0.5110036134719849, "learning_rate": 4.078103687485818e-06, "loss": 0.0363, "step": 46015 }, { "epoch": 4.170744970092442, "grad_norm": 0.3371613025665283, "learning_rate": 4.073775476950251e-06, "loss": 0.0331, "step": 46020 }, { "epoch": 4.1711981149175275, "grad_norm": 0.3423420786857605, "learning_rate": 4.069449360718308e-06, "loss": 0.0309, "step": 46025 }, { "epoch": 4.171651259742614, "grad_norm": 0.4684586822986603, "learning_rate": 4.06512533922295e-06, "loss": 0.0319, "step": 46030 }, { "epoch": 4.1721044045677, "grad_norm": 0.27168047428131104, "learning_rate": 4.060803412896918e-06, "loss": 0.0296, "step": 46035 }, { "epoch": 4.172557549392786, "grad_norm": 0.48971131443977356, "learning_rate": 4.056483582172765e-06, "loss": 0.0319, "step": 46040 }, { "epoch": 4.173010694217872, "grad_norm": 0.2917250692844391, "learning_rate": 4.052165847482817e-06, "loss": 0.0323, "step": 46045 }, { "epoch": 4.173463839042958, "grad_norm": 0.3666843771934509, "learning_rate": 4.047850209259185e-06, "loss": 0.0301, "step": 46050 }, { "epoch": 4.173916983868044, "grad_norm": 0.30133259296417236, "learning_rate": 4.0435366679337975e-06, "loss": 0.0514, "step": 46055 }, { "epoch": 4.1743701286931305, "grad_norm": 0.32277387380599976, "learning_rate": 4.039225223938334e-06, "loss": 0.0325, "step": 46060 }, { "epoch": 4.174823273518216, "grad_norm": 0.3702951967716217, "learning_rate": 4.034915877704307e-06, "loss": 0.0317, "step": 46065 }, { "epoch": 4.175276418343302, "grad_norm": 0.3841034173965454, "learning_rate": 4.030608629662977e-06, "loss": 0.0409, "step": 46070 }, { "epoch": 4.175729563168389, "grad_norm": 0.37591221928596497, "learning_rate": 4.026303480245422e-06, "loss": 0.0347, "step": 46075 }, { "epoch": 4.176182707993474, "grad_norm": 0.43546250462532043, "learning_rate": 4.022000429882516e-06, "loss": 0.031, "step": 46080 }, { "epoch": 4.176635852818561, "grad_norm": 0.3253374993801117, "learning_rate": 4.017699479004888e-06, "loss": 0.0301, "step": 46085 }, { "epoch": 4.177088997643647, "grad_norm": 0.31090202927589417, "learning_rate": 4.013400628043001e-06, "loss": 0.0292, "step": 46090 }, { "epoch": 4.177542142468733, "grad_norm": 0.463543176651001, "learning_rate": 4.009103877427062e-06, "loss": 0.0331, "step": 46095 }, { "epoch": 4.177995287293819, "grad_norm": 0.6349220275878906, "learning_rate": 4.0048092275871055e-06, "loss": 0.039, "step": 46100 }, { "epoch": 4.1784484321189055, "grad_norm": 0.2977396547794342, "learning_rate": 4.000516678952942e-06, "loss": 0.0334, "step": 46105 }, { "epoch": 4.178901576943991, "grad_norm": 0.300139456987381, "learning_rate": 3.996226231954165e-06, "loss": 0.0352, "step": 46110 }, { "epoch": 4.179354721769077, "grad_norm": 0.3776756227016449, "learning_rate": 3.991937887020175e-06, "loss": 0.0319, "step": 46115 }, { "epoch": 4.179807866594164, "grad_norm": 0.4098707437515259, "learning_rate": 3.9876516445801315e-06, "loss": 0.0325, "step": 46120 }, { "epoch": 4.180261011419249, "grad_norm": 0.38302579522132874, "learning_rate": 3.983367505063021e-06, "loss": 0.0313, "step": 46125 }, { "epoch": 4.180714156244336, "grad_norm": 0.39376506209373474, "learning_rate": 3.979085468897595e-06, "loss": 0.0322, "step": 46130 }, { "epoch": 4.181167301069422, "grad_norm": 0.41847509145736694, "learning_rate": 3.974805536512397e-06, "loss": 0.03, "step": 46135 }, { "epoch": 4.181620445894508, "grad_norm": 0.34762004017829895, "learning_rate": 3.970527708335778e-06, "loss": 0.031, "step": 46140 }, { "epoch": 4.182073590719594, "grad_norm": 0.3471561670303345, "learning_rate": 3.966251984795843e-06, "loss": 0.0326, "step": 46145 }, { "epoch": 4.1825267355446805, "grad_norm": 0.3334375023841858, "learning_rate": 3.961978366320529e-06, "loss": 0.039, "step": 46150 }, { "epoch": 4.182979880369766, "grad_norm": 0.415701687335968, "learning_rate": 3.957706853337531e-06, "loss": 0.0304, "step": 46155 }, { "epoch": 4.183433025194852, "grad_norm": 0.35877111554145813, "learning_rate": 3.953437446274336e-06, "loss": 0.0356, "step": 46160 }, { "epoch": 4.183886170019939, "grad_norm": 0.7022866010665894, "learning_rate": 3.949170145558245e-06, "loss": 0.0393, "step": 46165 }, { "epoch": 4.184339314845024, "grad_norm": 0.43076175451278687, "learning_rate": 3.9449049516163215e-06, "loss": 0.0307, "step": 46170 }, { "epoch": 4.184792459670111, "grad_norm": 0.28641751408576965, "learning_rate": 3.9406418648754274e-06, "loss": 0.0309, "step": 46175 }, { "epoch": 4.185245604495197, "grad_norm": 0.3990881145000458, "learning_rate": 3.936380885762215e-06, "loss": 0.0324, "step": 46180 }, { "epoch": 4.185698749320283, "grad_norm": 0.30766770243644714, "learning_rate": 3.932122014703116e-06, "loss": 0.0407, "step": 46185 }, { "epoch": 4.186151894145369, "grad_norm": 0.8861696124076843, "learning_rate": 3.927865252124377e-06, "loss": 0.0488, "step": 46190 }, { "epoch": 4.1866050389704546, "grad_norm": 0.5284083485603333, "learning_rate": 3.9236105984520044e-06, "loss": 0.0666, "step": 46195 }, { "epoch": 4.187058183795541, "grad_norm": 0.43335726857185364, "learning_rate": 3.91935805411181e-06, "loss": 0.031, "step": 46200 }, { "epoch": 4.187511328620627, "grad_norm": 0.30341479182243347, "learning_rate": 3.915107619529387e-06, "loss": 0.0285, "step": 46205 }, { "epoch": 4.187964473445713, "grad_norm": 0.46157753467559814, "learning_rate": 3.910859295130115e-06, "loss": 0.035, "step": 46210 }, { "epoch": 4.188417618270799, "grad_norm": 0.33473268151283264, "learning_rate": 3.90661308133918e-06, "loss": 0.0317, "step": 46215 }, { "epoch": 4.188870763095886, "grad_norm": 0.3414852023124695, "learning_rate": 3.902368978581539e-06, "loss": 0.0324, "step": 46220 }, { "epoch": 4.189323907920971, "grad_norm": 0.5837480425834656, "learning_rate": 3.8981269872819414e-06, "loss": 0.0331, "step": 46225 }, { "epoch": 4.189777052746058, "grad_norm": 0.27269455790519714, "learning_rate": 3.89388710786493e-06, "loss": 0.0295, "step": 46230 }, { "epoch": 4.190230197571144, "grad_norm": 0.5313820242881775, "learning_rate": 3.889649340754825e-06, "loss": 0.0365, "step": 46235 }, { "epoch": 4.1906833423962295, "grad_norm": 0.3927899897098541, "learning_rate": 3.885413686375758e-06, "loss": 0.0353, "step": 46240 }, { "epoch": 4.191136487221316, "grad_norm": 0.3522627651691437, "learning_rate": 3.881180145151625e-06, "loss": 0.0322, "step": 46245 }, { "epoch": 4.191589632046402, "grad_norm": 0.3462730348110199, "learning_rate": 3.876948717506124e-06, "loss": 0.035, "step": 46250 }, { "epoch": 4.192042776871488, "grad_norm": 0.37485045194625854, "learning_rate": 3.872719403862731e-06, "loss": 0.0388, "step": 46255 }, { "epoch": 4.192495921696574, "grad_norm": 0.4213382303714752, "learning_rate": 3.868492204644727e-06, "loss": 0.0304, "step": 46260 }, { "epoch": 4.192949066521661, "grad_norm": 0.3090633451938629, "learning_rate": 3.864267120275161e-06, "loss": 0.0308, "step": 46265 }, { "epoch": 4.193402211346746, "grad_norm": 0.27650442719459534, "learning_rate": 3.860044151176892e-06, "loss": 0.0343, "step": 46270 }, { "epoch": 4.193855356171833, "grad_norm": 0.2697744369506836, "learning_rate": 3.855823297772548e-06, "loss": 0.0438, "step": 46275 }, { "epoch": 4.194308500996919, "grad_norm": 0.5159503817558289, "learning_rate": 3.851604560484559e-06, "loss": 0.0402, "step": 46280 }, { "epoch": 4.1947616458220045, "grad_norm": 0.30387037992477417, "learning_rate": 3.847387939735133e-06, "loss": 0.0333, "step": 46285 }, { "epoch": 4.195214790647091, "grad_norm": 0.3867470622062683, "learning_rate": 3.843173435946268e-06, "loss": 0.0281, "step": 46290 }, { "epoch": 4.195667935472177, "grad_norm": 0.3665938675403595, "learning_rate": 3.838961049539763e-06, "loss": 0.0327, "step": 46295 }, { "epoch": 4.196121080297263, "grad_norm": 0.37177103757858276, "learning_rate": 3.834750780937191e-06, "loss": 0.0299, "step": 46300 }, { "epoch": 4.196574225122349, "grad_norm": 0.32805192470550537, "learning_rate": 3.830542630559908e-06, "loss": 0.0349, "step": 46305 }, { "epoch": 4.197027369947435, "grad_norm": 0.3866741359233856, "learning_rate": 3.826336598829089e-06, "loss": 0.0332, "step": 46310 }, { "epoch": 4.197480514772521, "grad_norm": 0.35367029905319214, "learning_rate": 3.822132686165647e-06, "loss": 0.0346, "step": 46315 }, { "epoch": 4.197933659597608, "grad_norm": 0.30664923787117004, "learning_rate": 3.817930892990332e-06, "loss": 0.0352, "step": 46320 }, { "epoch": 4.198386804422693, "grad_norm": 0.3205491006374359, "learning_rate": 3.8137312197236514e-06, "loss": 0.0316, "step": 46325 }, { "epoch": 4.1988399492477795, "grad_norm": 0.2976067364215851, "learning_rate": 3.809533666785908e-06, "loss": 0.0311, "step": 46330 }, { "epoch": 4.199293094072866, "grad_norm": 0.3508842885494232, "learning_rate": 3.805338234597211e-06, "loss": 0.0366, "step": 46335 }, { "epoch": 4.199746238897951, "grad_norm": 0.2982533276081085, "learning_rate": 3.801144923577418e-06, "loss": 0.0293, "step": 46340 }, { "epoch": 4.200199383723038, "grad_norm": 0.3865510821342468, "learning_rate": 3.796953734146211e-06, "loss": 0.0336, "step": 46345 }, { "epoch": 4.200652528548124, "grad_norm": 0.3134104013442993, "learning_rate": 3.792764666723042e-06, "loss": 0.0344, "step": 46350 }, { "epoch": 4.20110567337321, "grad_norm": 0.2913089394569397, "learning_rate": 3.788577721727149e-06, "loss": 0.0364, "step": 46355 }, { "epoch": 4.201558818198296, "grad_norm": 0.35296738147735596, "learning_rate": 3.7843928995775794e-06, "loss": 0.0312, "step": 46360 }, { "epoch": 4.2020119630233825, "grad_norm": 0.35781630873680115, "learning_rate": 3.7802102006931296e-06, "loss": 0.0335, "step": 46365 }, { "epoch": 4.202465107848468, "grad_norm": 0.3057517409324646, "learning_rate": 3.7760296254924223e-06, "loss": 0.0299, "step": 46370 }, { "epoch": 4.2029182526735545, "grad_norm": 0.4509153664112091, "learning_rate": 3.7718511743938423e-06, "loss": 0.0317, "step": 46375 }, { "epoch": 4.203371397498641, "grad_norm": 0.41307637095451355, "learning_rate": 3.7676748478155693e-06, "loss": 0.0348, "step": 46380 }, { "epoch": 4.203824542323726, "grad_norm": 0.436661958694458, "learning_rate": 3.7635006461755883e-06, "loss": 0.0296, "step": 46385 }, { "epoch": 4.204277687148813, "grad_norm": 0.3182975947856903, "learning_rate": 3.7593285698916266e-06, "loss": 0.03, "step": 46390 }, { "epoch": 4.204730831973899, "grad_norm": 0.34484127163887024, "learning_rate": 3.7551586193812495e-06, "loss": 0.0297, "step": 46395 }, { "epoch": 4.205183976798985, "grad_norm": 0.3634030818939209, "learning_rate": 3.750990795061779e-06, "loss": 0.0302, "step": 46400 }, { "epoch": 4.205637121624071, "grad_norm": 0.32740968465805054, "learning_rate": 3.7468250973503287e-06, "loss": 0.0353, "step": 46405 }, { "epoch": 4.2060902664491575, "grad_norm": 0.37524911761283875, "learning_rate": 3.742661526663818e-06, "loss": 0.0322, "step": 46410 }, { "epoch": 4.206543411274243, "grad_norm": 0.38021421432495117, "learning_rate": 3.7385000834189165e-06, "loss": 0.0352, "step": 46415 }, { "epoch": 4.206996556099329, "grad_norm": 0.3320833742618561, "learning_rate": 3.7343407680321187e-06, "loss": 0.0303, "step": 46420 }, { "epoch": 4.207449700924416, "grad_norm": 0.29763543605804443, "learning_rate": 3.7301835809196855e-06, "loss": 0.0314, "step": 46425 }, { "epoch": 4.207902845749501, "grad_norm": 0.30135276913642883, "learning_rate": 3.7260285224976656e-06, "loss": 0.0351, "step": 46430 }, { "epoch": 4.208355990574588, "grad_norm": 0.4012422561645508, "learning_rate": 3.7218755931819145e-06, "loss": 0.0348, "step": 46435 }, { "epoch": 4.208809135399674, "grad_norm": 0.3818212151527405, "learning_rate": 3.7177247933880335e-06, "loss": 0.0363, "step": 46440 }, { "epoch": 4.20926228022476, "grad_norm": 0.3467015027999878, "learning_rate": 3.7135761235314566e-06, "loss": 0.0386, "step": 46445 }, { "epoch": 4.209715425049846, "grad_norm": 0.4811902344226837, "learning_rate": 3.7094295840273797e-06, "loss": 0.0319, "step": 46450 }, { "epoch": 4.210168569874932, "grad_norm": 0.3517909646034241, "learning_rate": 3.7052851752907847e-06, "loss": 0.0298, "step": 46455 }, { "epoch": 4.210621714700018, "grad_norm": 0.3830075263977051, "learning_rate": 3.7011428977364482e-06, "loss": 0.0304, "step": 46460 }, { "epoch": 4.211074859525104, "grad_norm": 0.35083499550819397, "learning_rate": 3.6970027517789246e-06, "loss": 0.0304, "step": 46465 }, { "epoch": 4.21152800435019, "grad_norm": 0.3727824091911316, "learning_rate": 3.6928647378325717e-06, "loss": 0.0325, "step": 46470 }, { "epoch": 4.211981149175276, "grad_norm": 0.2979271709918976, "learning_rate": 3.6887288563115214e-06, "loss": 0.03, "step": 46475 }, { "epoch": 4.212434294000363, "grad_norm": 0.36934754252433777, "learning_rate": 3.68459510762969e-06, "loss": 0.0327, "step": 46480 }, { "epoch": 4.212887438825448, "grad_norm": 0.5126949548721313, "learning_rate": 3.6804634922007865e-06, "loss": 0.0371, "step": 46485 }, { "epoch": 4.213340583650535, "grad_norm": 0.30404722690582275, "learning_rate": 3.6763340104382982e-06, "loss": 0.0382, "step": 46490 }, { "epoch": 4.213793728475621, "grad_norm": 0.41351059079170227, "learning_rate": 3.6722066627555144e-06, "loss": 0.0314, "step": 46495 }, { "epoch": 4.214246873300707, "grad_norm": 0.4239060580730438, "learning_rate": 3.6680814495654995e-06, "loss": 0.032, "step": 46500 }, { "epoch": 4.214700018125793, "grad_norm": 0.32503741979599, "learning_rate": 3.663958371281104e-06, "loss": 0.0295, "step": 46505 }, { "epoch": 4.215153162950879, "grad_norm": 0.39297282695770264, "learning_rate": 3.659837428314966e-06, "loss": 0.031, "step": 46510 }, { "epoch": 4.215606307775965, "grad_norm": 0.3177056312561035, "learning_rate": 3.65571862107951e-06, "loss": 0.0314, "step": 46515 }, { "epoch": 4.216059452601051, "grad_norm": 0.4095604717731476, "learning_rate": 3.651601949986952e-06, "loss": 0.0355, "step": 46520 }, { "epoch": 4.216512597426138, "grad_norm": 0.4327975809574127, "learning_rate": 3.6474874154492883e-06, "loss": 0.0324, "step": 46525 }, { "epoch": 4.216965742251223, "grad_norm": 0.22898124158382416, "learning_rate": 3.6433750178783033e-06, "loss": 0.0345, "step": 46530 }, { "epoch": 4.21741888707631, "grad_norm": 0.3326079249382019, "learning_rate": 3.6392647576855677e-06, "loss": 0.0347, "step": 46535 }, { "epoch": 4.217872031901396, "grad_norm": 0.28668105602264404, "learning_rate": 3.6351566352824273e-06, "loss": 0.0291, "step": 46540 }, { "epoch": 4.2183251767264816, "grad_norm": 0.3340740501880646, "learning_rate": 3.6310506510800396e-06, "loss": 0.0307, "step": 46545 }, { "epoch": 4.218778321551568, "grad_norm": 0.2954181730747223, "learning_rate": 3.6269468054893286e-06, "loss": 0.0306, "step": 46550 }, { "epoch": 4.219231466376654, "grad_norm": 0.3862089514732361, "learning_rate": 3.6228450989210045e-06, "loss": 0.0341, "step": 46555 }, { "epoch": 4.21968461120174, "grad_norm": 0.3618471920490265, "learning_rate": 3.618745531785564e-06, "loss": 0.0307, "step": 46560 }, { "epoch": 4.220137756026826, "grad_norm": 0.25530916452407837, "learning_rate": 3.6146481044933066e-06, "loss": 0.034, "step": 46565 }, { "epoch": 4.220590900851913, "grad_norm": 0.3020249009132385, "learning_rate": 3.6105528174542935e-06, "loss": 0.0313, "step": 46570 }, { "epoch": 4.221044045676998, "grad_norm": 0.2740788757801056, "learning_rate": 3.6064596710783853e-06, "loss": 0.0301, "step": 46575 }, { "epoch": 4.221497190502085, "grad_norm": 0.28339576721191406, "learning_rate": 3.602368665775227e-06, "loss": 0.0297, "step": 46580 }, { "epoch": 4.22195033532717, "grad_norm": 0.3180745244026184, "learning_rate": 3.598279801954238e-06, "loss": 0.031, "step": 46585 }, { "epoch": 4.2224034801522565, "grad_norm": 0.32218632102012634, "learning_rate": 3.5941930800246492e-06, "loss": 0.0324, "step": 46590 }, { "epoch": 4.222856624977343, "grad_norm": 0.3254651427268982, "learning_rate": 3.590108500395453e-06, "loss": 0.0301, "step": 46595 }, { "epoch": 4.2233097698024284, "grad_norm": 0.2992921471595764, "learning_rate": 3.586026063475434e-06, "loss": 0.0357, "step": 46600 }, { "epoch": 4.223762914627515, "grad_norm": 0.29182106256484985, "learning_rate": 3.5819457696731674e-06, "loss": 0.0318, "step": 46605 }, { "epoch": 4.224216059452601, "grad_norm": 0.3548925220966339, "learning_rate": 3.577867619397002e-06, "loss": 0.0308, "step": 46610 }, { "epoch": 4.224669204277687, "grad_norm": 0.22228994965553284, "learning_rate": 3.5737916130551e-06, "loss": 0.0292, "step": 46615 }, { "epoch": 4.225122349102773, "grad_norm": 0.7275446057319641, "learning_rate": 3.5697177510553625e-06, "loss": 0.0668, "step": 46620 }, { "epoch": 4.22557549392786, "grad_norm": 0.36142876744270325, "learning_rate": 3.5656460338055226e-06, "loss": 0.0297, "step": 46625 }, { "epoch": 4.226028638752945, "grad_norm": 0.3899620771408081, "learning_rate": 3.561576461713076e-06, "loss": 0.0307, "step": 46630 }, { "epoch": 4.2264817835780315, "grad_norm": 0.27230122685432434, "learning_rate": 3.557509035185297e-06, "loss": 0.0354, "step": 46635 }, { "epoch": 4.226934928403118, "grad_norm": 0.3155801594257355, "learning_rate": 3.5534437546292737e-06, "loss": 0.0318, "step": 46640 }, { "epoch": 4.227388073228203, "grad_norm": 0.28179308772087097, "learning_rate": 3.5493806204518394e-06, "loss": 0.0323, "step": 46645 }, { "epoch": 4.22784121805329, "grad_norm": 0.5652315616607666, "learning_rate": 3.545319633059649e-06, "loss": 0.042, "step": 46650 }, { "epoch": 4.228294362878376, "grad_norm": 0.3240806460380554, "learning_rate": 3.5412607928591218e-06, "loss": 0.0314, "step": 46655 }, { "epoch": 4.228747507703462, "grad_norm": 0.29651764035224915, "learning_rate": 3.5372041002564636e-06, "loss": 0.0311, "step": 46660 }, { "epoch": 4.229200652528548, "grad_norm": 0.31176140904426575, "learning_rate": 3.533149555657686e-06, "loss": 0.0357, "step": 46665 }, { "epoch": 4.229653797353635, "grad_norm": 0.3532927334308624, "learning_rate": 3.5290971594685475e-06, "loss": 0.0301, "step": 46670 }, { "epoch": 4.23010694217872, "grad_norm": 0.4733753204345703, "learning_rate": 3.525046912094626e-06, "loss": 0.0336, "step": 46675 }, { "epoch": 4.2305600870038065, "grad_norm": 0.29715999960899353, "learning_rate": 3.52099881394127e-06, "loss": 0.0311, "step": 46680 }, { "epoch": 4.231013231828893, "grad_norm": 0.4061320126056671, "learning_rate": 3.5169528654136112e-06, "loss": 0.0336, "step": 46685 }, { "epoch": 4.231466376653978, "grad_norm": 0.3977222144603729, "learning_rate": 3.512909066916581e-06, "loss": 0.0341, "step": 46690 }, { "epoch": 4.231919521479065, "grad_norm": 0.29329320788383484, "learning_rate": 3.5088674188548635e-06, "loss": 0.034, "step": 46695 }, { "epoch": 4.232372666304151, "grad_norm": 0.377836674451828, "learning_rate": 3.5048279216329664e-06, "loss": 0.0325, "step": 46700 }, { "epoch": 4.232825811129237, "grad_norm": 0.28224027156829834, "learning_rate": 3.5007905756551552e-06, "loss": 0.0277, "step": 46705 }, { "epoch": 4.233278955954323, "grad_norm": 0.3177873492240906, "learning_rate": 3.4967553813254893e-06, "loss": 0.0316, "step": 46710 }, { "epoch": 4.2337321007794095, "grad_norm": 0.47390058636665344, "learning_rate": 3.4927223390478212e-06, "loss": 0.032, "step": 46715 }, { "epoch": 4.234185245604495, "grad_norm": 0.339364230632782, "learning_rate": 3.4886914492257643e-06, "loss": 0.031, "step": 46720 }, { "epoch": 4.2346383904295815, "grad_norm": 0.30139899253845215, "learning_rate": 3.484662712262743e-06, "loss": 0.0298, "step": 46725 }, { "epoch": 4.235091535254667, "grad_norm": 0.3358241617679596, "learning_rate": 3.480636128561951e-06, "loss": 0.0308, "step": 46730 }, { "epoch": 4.235544680079753, "grad_norm": 0.38474565744400024, "learning_rate": 3.4766116985263634e-06, "loss": 0.0335, "step": 46735 }, { "epoch": 4.23599782490484, "grad_norm": 0.37768274545669556, "learning_rate": 3.472589422558764e-06, "loss": 0.0358, "step": 46740 }, { "epoch": 4.236450969729925, "grad_norm": 0.6690587997436523, "learning_rate": 3.468569301061683e-06, "loss": 0.0412, "step": 46745 }, { "epoch": 4.236904114555012, "grad_norm": 0.3119162321090698, "learning_rate": 3.4645513344374685e-06, "loss": 0.032, "step": 46750 }, { "epoch": 4.237357259380098, "grad_norm": 0.4237522780895233, "learning_rate": 3.4605355230882375e-06, "loss": 0.0316, "step": 46755 }, { "epoch": 4.237810404205184, "grad_norm": 0.3514566421508789, "learning_rate": 3.4565218674158878e-06, "loss": 0.032, "step": 46760 }, { "epoch": 4.23826354903027, "grad_norm": 0.26804623007774353, "learning_rate": 3.4525103678221234e-06, "loss": 0.031, "step": 46765 }, { "epoch": 4.238716693855356, "grad_norm": 0.25491806864738464, "learning_rate": 3.448501024708395e-06, "loss": 0.0299, "step": 46770 }, { "epoch": 4.239169838680442, "grad_norm": 0.4007047712802887, "learning_rate": 3.4444938384759733e-06, "loss": 0.0291, "step": 46775 }, { "epoch": 4.239622983505528, "grad_norm": 0.3882099986076355, "learning_rate": 3.4404888095258964e-06, "loss": 0.0297, "step": 46780 }, { "epoch": 4.240076128330615, "grad_norm": 0.4756724536418915, "learning_rate": 3.436485938258982e-06, "loss": 0.0346, "step": 46785 }, { "epoch": 4.2405292731557, "grad_norm": 0.3282778561115265, "learning_rate": 3.432485225075854e-06, "loss": 0.0309, "step": 46790 }, { "epoch": 4.240982417980787, "grad_norm": 0.4380147457122803, "learning_rate": 3.4284866703768877e-06, "loss": 0.032, "step": 46795 }, { "epoch": 4.241435562805873, "grad_norm": 0.24957546591758728, "learning_rate": 3.42449027456227e-06, "loss": 0.0324, "step": 46800 }, { "epoch": 4.241888707630959, "grad_norm": 0.38162103295326233, "learning_rate": 3.420496038031962e-06, "loss": 0.0321, "step": 46805 }, { "epoch": 4.242341852456045, "grad_norm": 0.3701268434524536, "learning_rate": 3.416503961185705e-06, "loss": 0.0316, "step": 46810 }, { "epoch": 4.242794997281131, "grad_norm": 0.32496145367622375, "learning_rate": 3.4125140444230206e-06, "loss": 0.0333, "step": 46815 }, { "epoch": 4.243248142106217, "grad_norm": 0.3732207417488098, "learning_rate": 3.408526288143235e-06, "loss": 0.03, "step": 46820 }, { "epoch": 4.243701286931303, "grad_norm": 0.3382984399795532, "learning_rate": 3.4045406927454364e-06, "loss": 0.0359, "step": 46825 }, { "epoch": 4.24415443175639, "grad_norm": 0.47344061732292175, "learning_rate": 3.4005572586285063e-06, "loss": 0.0306, "step": 46830 }, { "epoch": 4.244607576581475, "grad_norm": 0.24741019308567047, "learning_rate": 3.3965759861911086e-06, "loss": 0.0293, "step": 46835 }, { "epoch": 4.245060721406562, "grad_norm": 0.3496897518634796, "learning_rate": 3.3925968758316827e-06, "loss": 0.0334, "step": 46840 }, { "epoch": 4.245513866231648, "grad_norm": 0.3740985095500946, "learning_rate": 3.3886199279484694e-06, "loss": 0.0283, "step": 46845 }, { "epoch": 4.245967011056734, "grad_norm": 0.5648080110549927, "learning_rate": 3.3846451429394826e-06, "loss": 0.0452, "step": 46850 }, { "epoch": 4.24642015588182, "grad_norm": 0.3235083818435669, "learning_rate": 3.3806725212025157e-06, "loss": 0.0294, "step": 46855 }, { "epoch": 4.2468733007069055, "grad_norm": 0.30868279933929443, "learning_rate": 3.376702063135151e-06, "loss": 0.0379, "step": 46860 }, { "epoch": 4.247326445531992, "grad_norm": 0.34706562757492065, "learning_rate": 3.3727337691347476e-06, "loss": 0.0295, "step": 46865 }, { "epoch": 4.247779590357078, "grad_norm": 0.34736016392707825, "learning_rate": 3.3687676395984663e-06, "loss": 0.0317, "step": 46870 }, { "epoch": 4.248232735182164, "grad_norm": 0.39783796668052673, "learning_rate": 3.3648036749232337e-06, "loss": 0.0338, "step": 46875 }, { "epoch": 4.24868588000725, "grad_norm": 0.40672191977500916, "learning_rate": 3.3608418755057605e-06, "loss": 0.0331, "step": 46880 }, { "epoch": 4.249139024832337, "grad_norm": 0.39504775404930115, "learning_rate": 3.3568822417425483e-06, "loss": 0.0334, "step": 46885 }, { "epoch": 4.249592169657422, "grad_norm": 0.3567624092102051, "learning_rate": 3.3529247740298723e-06, "loss": 0.0298, "step": 46890 }, { "epoch": 4.2500453144825086, "grad_norm": 0.5094397664070129, "learning_rate": 3.3489694727638076e-06, "loss": 0.0315, "step": 46895 }, { "epoch": 4.250498459307595, "grad_norm": 0.4580332934856415, "learning_rate": 3.3450163383401984e-06, "loss": 0.0319, "step": 46900 }, { "epoch": 4.2509516041326805, "grad_norm": 0.2588721215724945, "learning_rate": 3.3410653711546753e-06, "loss": 0.0356, "step": 46905 }, { "epoch": 4.251404748957767, "grad_norm": 0.33047768473625183, "learning_rate": 3.33711657160265e-06, "loss": 0.0336, "step": 46910 }, { "epoch": 4.251857893782853, "grad_norm": 0.3175092339515686, "learning_rate": 3.3331699400793144e-06, "loss": 0.0305, "step": 46915 }, { "epoch": 4.252311038607939, "grad_norm": 0.2892926335334778, "learning_rate": 3.329225476979664e-06, "loss": 0.0366, "step": 46920 }, { "epoch": 4.252764183433025, "grad_norm": 0.2822805345058441, "learning_rate": 3.325283182698452e-06, "loss": 0.0317, "step": 46925 }, { "epoch": 4.253217328258112, "grad_norm": 0.44115757942199707, "learning_rate": 3.3213430576302273e-06, "loss": 0.0308, "step": 46930 }, { "epoch": 4.253670473083197, "grad_norm": 0.27868613600730896, "learning_rate": 3.3174051021693188e-06, "loss": 0.0362, "step": 46935 }, { "epoch": 4.2541236179082835, "grad_norm": 0.2807541489601135, "learning_rate": 3.3134693167098303e-06, "loss": 0.0299, "step": 46940 }, { "epoch": 4.25457676273337, "grad_norm": 0.3225482106208801, "learning_rate": 3.309535701645669e-06, "loss": 0.031, "step": 46945 }, { "epoch": 4.2550299075584554, "grad_norm": 0.4498594105243683, "learning_rate": 3.3056042573705064e-06, "loss": 0.0319, "step": 46950 }, { "epoch": 4.255483052383542, "grad_norm": 0.3683239221572876, "learning_rate": 3.3016749842778e-06, "loss": 0.0316, "step": 46955 }, { "epoch": 4.255936197208628, "grad_norm": 0.39965933561325073, "learning_rate": 3.297747882760807e-06, "loss": 0.0337, "step": 46960 }, { "epoch": 4.256389342033714, "grad_norm": 0.3724701404571533, "learning_rate": 3.29382295321253e-06, "loss": 0.0318, "step": 46965 }, { "epoch": 4.2568424868588, "grad_norm": 0.39592865109443665, "learning_rate": 3.2899001960257967e-06, "loss": 0.031, "step": 46970 }, { "epoch": 4.257295631683887, "grad_norm": 0.32146990299224854, "learning_rate": 3.28597961159319e-06, "loss": 0.0324, "step": 46975 }, { "epoch": 4.257748776508972, "grad_norm": 0.2893913686275482, "learning_rate": 3.2820612003070793e-06, "loss": 0.0275, "step": 46980 }, { "epoch": 4.2582019213340585, "grad_norm": 0.3853638768196106, "learning_rate": 3.278144962559637e-06, "loss": 0.0313, "step": 46985 }, { "epoch": 4.258655066159145, "grad_norm": 0.3203426003456116, "learning_rate": 3.2742308987427776e-06, "loss": 0.0316, "step": 46990 }, { "epoch": 4.25910821098423, "grad_norm": 0.3151632249355316, "learning_rate": 3.270319009248246e-06, "loss": 0.0293, "step": 46995 }, { "epoch": 4.259561355809317, "grad_norm": 0.6151881814002991, "learning_rate": 3.2664092944675235e-06, "loss": 0.0406, "step": 47000 }, { "epoch": 4.260014500634402, "grad_norm": 0.21302230656147003, "learning_rate": 3.262501754791908e-06, "loss": 0.0305, "step": 47005 }, { "epoch": 4.260467645459489, "grad_norm": 0.37168556451797485, "learning_rate": 3.25859639061247e-06, "loss": 0.0414, "step": 47010 }, { "epoch": 4.260920790284575, "grad_norm": 0.344030499458313, "learning_rate": 3.2546932023200472e-06, "loss": 0.0298, "step": 47015 }, { "epoch": 4.261373935109661, "grad_norm": 0.3299618065357208, "learning_rate": 3.2507921903052877e-06, "loss": 0.0333, "step": 47020 }, { "epoch": 4.261827079934747, "grad_norm": 0.31383970379829407, "learning_rate": 3.246893354958591e-06, "loss": 0.0307, "step": 47025 }, { "epoch": 4.2622802247598335, "grad_norm": 0.24558614194393158, "learning_rate": 3.242996696670167e-06, "loss": 0.0294, "step": 47030 }, { "epoch": 4.262733369584919, "grad_norm": 0.4381466209888458, "learning_rate": 3.2391022158299865e-06, "loss": 0.0351, "step": 47035 }, { "epoch": 4.263186514410005, "grad_norm": 0.4630342125892639, "learning_rate": 3.2352099128278078e-06, "loss": 0.0332, "step": 47040 }, { "epoch": 4.263639659235092, "grad_norm": 0.36901435256004333, "learning_rate": 3.2313197880531887e-06, "loss": 0.0356, "step": 47045 }, { "epoch": 4.264092804060177, "grad_norm": 0.3616742193698883, "learning_rate": 3.227431841895434e-06, "loss": 0.0322, "step": 47050 }, { "epoch": 4.264545948885264, "grad_norm": 0.2954500615596771, "learning_rate": 3.2235460747436663e-06, "loss": 0.0375, "step": 47055 }, { "epoch": 4.26499909371035, "grad_norm": 0.44509661197662354, "learning_rate": 3.2196624869867688e-06, "loss": 0.0327, "step": 47060 }, { "epoch": 4.265452238535436, "grad_norm": 0.29423099756240845, "learning_rate": 3.215781079013405e-06, "loss": 0.0312, "step": 47065 }, { "epoch": 4.265905383360522, "grad_norm": 0.3062439262866974, "learning_rate": 3.2119018512120485e-06, "loss": 0.0401, "step": 47070 }, { "epoch": 4.2663585281856085, "grad_norm": 0.25326666235923767, "learning_rate": 3.2080248039709076e-06, "loss": 0.0296, "step": 47075 }, { "epoch": 4.266811673010694, "grad_norm": 0.46776828169822693, "learning_rate": 3.204149937678019e-06, "loss": 0.0327, "step": 47080 }, { "epoch": 4.26726481783578, "grad_norm": 0.38370922207832336, "learning_rate": 3.2002772527211704e-06, "loss": 0.0297, "step": 47085 }, { "epoch": 4.267717962660867, "grad_norm": 0.2968084216117859, "learning_rate": 3.1964067494879403e-06, "loss": 0.0312, "step": 47090 }, { "epoch": 4.268171107485952, "grad_norm": 0.4056773781776428, "learning_rate": 3.1925384283656963e-06, "loss": 0.0324, "step": 47095 }, { "epoch": 4.268624252311039, "grad_norm": 0.3233136832714081, "learning_rate": 3.188672289741582e-06, "loss": 0.0322, "step": 47100 }, { "epoch": 4.269077397136125, "grad_norm": 0.45904383063316345, "learning_rate": 3.1848083340025148e-06, "loss": 0.032, "step": 47105 }, { "epoch": 4.269530541961211, "grad_norm": 0.4555202126502991, "learning_rate": 3.180946561535206e-06, "loss": 0.0303, "step": 47110 }, { "epoch": 4.269983686786297, "grad_norm": 0.63120436668396, "learning_rate": 3.1770869727261345e-06, "loss": 0.0446, "step": 47115 }, { "epoch": 4.2704368316113825, "grad_norm": 0.31402724981307983, "learning_rate": 3.1732295679615833e-06, "loss": 0.0312, "step": 47120 }, { "epoch": 4.270889976436469, "grad_norm": 0.3797401785850525, "learning_rate": 3.1693743476275933e-06, "loss": 0.0311, "step": 47125 }, { "epoch": 4.271343121261555, "grad_norm": 0.36773815751075745, "learning_rate": 3.1655213121100006e-06, "loss": 0.0323, "step": 47130 }, { "epoch": 4.271796266086641, "grad_norm": 0.45299389958381653, "learning_rate": 3.1616704617944166e-06, "loss": 0.0315, "step": 47135 }, { "epoch": 4.272249410911727, "grad_norm": 0.29369744658470154, "learning_rate": 3.157821797066227e-06, "loss": 0.0314, "step": 47140 }, { "epoch": 4.272702555736814, "grad_norm": 0.36842653155326843, "learning_rate": 3.153975318310623e-06, "loss": 0.0307, "step": 47145 }, { "epoch": 4.273155700561899, "grad_norm": 0.2624678611755371, "learning_rate": 3.1501310259125534e-06, "loss": 0.0307, "step": 47150 }, { "epoch": 4.273608845386986, "grad_norm": 0.46964770555496216, "learning_rate": 3.1462889202567567e-06, "loss": 0.0321, "step": 47155 }, { "epoch": 4.274061990212072, "grad_norm": 0.6119127869606018, "learning_rate": 3.142449001727754e-06, "loss": 0.0344, "step": 47160 }, { "epoch": 4.2745151350371575, "grad_norm": 0.3417167663574219, "learning_rate": 3.138611270709843e-06, "loss": 0.0346, "step": 47165 }, { "epoch": 4.274968279862244, "grad_norm": 0.32417646050453186, "learning_rate": 3.1347757275871024e-06, "loss": 0.032, "step": 47170 }, { "epoch": 4.27542142468733, "grad_norm": 0.3559877872467041, "learning_rate": 3.1309423727434035e-06, "loss": 0.034, "step": 47175 }, { "epoch": 4.275874569512416, "grad_norm": 0.41530781984329224, "learning_rate": 3.127111206562383e-06, "loss": 0.0304, "step": 47180 }, { "epoch": 4.276327714337502, "grad_norm": 0.3174123466014862, "learning_rate": 3.1232822294274683e-06, "loss": 0.0337, "step": 47185 }, { "epoch": 4.276780859162589, "grad_norm": 0.36622878909111023, "learning_rate": 3.1194554417218635e-06, "loss": 0.0278, "step": 47190 }, { "epoch": 4.277234003987674, "grad_norm": 0.21727436780929565, "learning_rate": 3.1156308438285486e-06, "loss": 0.0295, "step": 47195 }, { "epoch": 4.277687148812761, "grad_norm": 0.3575371205806732, "learning_rate": 3.1118084361303034e-06, "loss": 0.0307, "step": 47200 }, { "epoch": 4.278140293637847, "grad_norm": 0.42212510108947754, "learning_rate": 3.107988219009669e-06, "loss": 0.0345, "step": 47205 }, { "epoch": 4.2785934384629325, "grad_norm": 0.32952484488487244, "learning_rate": 3.104170192848968e-06, "loss": 0.0297, "step": 47210 }, { "epoch": 4.279046583288019, "grad_norm": 0.34487393498420715, "learning_rate": 3.1003543580303274e-06, "loss": 0.0297, "step": 47215 }, { "epoch": 4.279499728113105, "grad_norm": 0.3888365924358368, "learning_rate": 3.096540714935617e-06, "loss": 0.0358, "step": 47220 }, { "epoch": 4.279952872938191, "grad_norm": 0.2776094079017639, "learning_rate": 3.09272926394652e-06, "loss": 0.0296, "step": 47225 }, { "epoch": 4.280406017763277, "grad_norm": 0.36801937222480774, "learning_rate": 3.0889200054444816e-06, "loss": 0.0306, "step": 47230 }, { "epoch": 4.280859162588364, "grad_norm": 0.3428417444229126, "learning_rate": 3.0851129398107333e-06, "loss": 0.03, "step": 47235 }, { "epoch": 4.281312307413449, "grad_norm": 0.2919834852218628, "learning_rate": 3.0813080674263e-06, "loss": 0.0301, "step": 47240 }, { "epoch": 4.2817654522385356, "grad_norm": 0.4432760775089264, "learning_rate": 3.0775053886719557e-06, "loss": 0.0316, "step": 47245 }, { "epoch": 4.282218597063622, "grad_norm": 0.3234197795391083, "learning_rate": 3.073704903928287e-06, "loss": 0.0314, "step": 47250 }, { "epoch": 4.2826717418887075, "grad_norm": 0.5814387202262878, "learning_rate": 3.069906613575646e-06, "loss": 0.0377, "step": 47255 }, { "epoch": 4.283124886713794, "grad_norm": 0.33266162872314453, "learning_rate": 3.0661105179941584e-06, "loss": 0.0304, "step": 47260 }, { "epoch": 4.28357803153888, "grad_norm": 0.35459986329078674, "learning_rate": 3.062316617563754e-06, "loss": 0.0296, "step": 47265 }, { "epoch": 4.284031176363966, "grad_norm": 0.33183249831199646, "learning_rate": 3.0585249126641096e-06, "loss": 0.0296, "step": 47270 }, { "epoch": 4.284484321189052, "grad_norm": 0.48112544417381287, "learning_rate": 3.0547354036747134e-06, "loss": 0.0297, "step": 47275 }, { "epoch": 4.284937466014138, "grad_norm": 0.31025606393814087, "learning_rate": 3.0509480909748172e-06, "loss": 0.0321, "step": 47280 }, { "epoch": 4.285390610839224, "grad_norm": 0.2915705740451813, "learning_rate": 3.047162974943449e-06, "loss": 0.031, "step": 47285 }, { "epoch": 4.2858437556643105, "grad_norm": 0.31314200162887573, "learning_rate": 3.0433800559594437e-06, "loss": 0.0293, "step": 47290 }, { "epoch": 4.286296900489396, "grad_norm": 0.35491883754730225, "learning_rate": 3.0395993344013746e-06, "loss": 0.0335, "step": 47295 }, { "epoch": 4.2867500453144824, "grad_norm": 0.29254159331321716, "learning_rate": 3.0358208106476326e-06, "loss": 0.0305, "step": 47300 }, { "epoch": 4.287203190139569, "grad_norm": 0.3174590766429901, "learning_rate": 3.0320444850763683e-06, "loss": 0.0281, "step": 47305 }, { "epoch": 4.287656334964654, "grad_norm": 0.3503709137439728, "learning_rate": 3.028270358065513e-06, "loss": 0.0314, "step": 47310 }, { "epoch": 4.288109479789741, "grad_norm": 0.38687562942504883, "learning_rate": 3.0244984299927977e-06, "loss": 0.032, "step": 47315 }, { "epoch": 4.288562624614827, "grad_norm": 0.32115477323532104, "learning_rate": 3.0207287012357004e-06, "loss": 0.0312, "step": 47320 }, { "epoch": 4.289015769439913, "grad_norm": 0.3568165600299835, "learning_rate": 3.016961172171509e-06, "loss": 0.0319, "step": 47325 }, { "epoch": 4.289468914264999, "grad_norm": 0.3311033546924591, "learning_rate": 3.0131958431772743e-06, "loss": 0.0311, "step": 47330 }, { "epoch": 4.2899220590900855, "grad_norm": 0.26968199014663696, "learning_rate": 3.0094327146298256e-06, "loss": 0.0312, "step": 47335 }, { "epoch": 4.290375203915171, "grad_norm": 0.44332075119018555, "learning_rate": 3.0056717869057978e-06, "loss": 0.0322, "step": 47340 }, { "epoch": 4.290828348740257, "grad_norm": 0.30435848236083984, "learning_rate": 3.0019130603815616e-06, "loss": 0.0289, "step": 47345 }, { "epoch": 4.291281493565344, "grad_norm": 0.3870121240615845, "learning_rate": 2.998156535433305e-06, "loss": 0.0294, "step": 47350 }, { "epoch": 4.291734638390429, "grad_norm": 0.40885302424430847, "learning_rate": 2.9944022124369835e-06, "loss": 0.0328, "step": 47355 }, { "epoch": 4.292187783215516, "grad_norm": 0.3781692087650299, "learning_rate": 2.9906500917683266e-06, "loss": 0.0454, "step": 47360 }, { "epoch": 4.292640928040602, "grad_norm": 0.5802369117736816, "learning_rate": 2.9869001738028507e-06, "loss": 0.044, "step": 47365 }, { "epoch": 4.293094072865688, "grad_norm": 0.45252305269241333, "learning_rate": 2.983152458915839e-06, "loss": 0.0323, "step": 47370 }, { "epoch": 4.293547217690774, "grad_norm": 0.3657684028148651, "learning_rate": 2.9794069474823773e-06, "loss": 0.0317, "step": 47375 }, { "epoch": 4.2940003625158605, "grad_norm": 0.36218464374542236, "learning_rate": 2.975663639877313e-06, "loss": 0.0307, "step": 47380 }, { "epoch": 4.294453507340946, "grad_norm": 0.27168992161750793, "learning_rate": 2.9719225364752773e-06, "loss": 0.032, "step": 47385 }, { "epoch": 4.294906652166032, "grad_norm": 0.3370904326438904, "learning_rate": 2.9681836376506815e-06, "loss": 0.0298, "step": 47390 }, { "epoch": 4.295359796991118, "grad_norm": 0.6544480323791504, "learning_rate": 2.96444694377771e-06, "loss": 0.0357, "step": 47395 }, { "epoch": 4.295812941816204, "grad_norm": 0.3106696605682373, "learning_rate": 2.960712455230341e-06, "loss": 0.0332, "step": 47400 }, { "epoch": 4.296266086641291, "grad_norm": 0.2428044080734253, "learning_rate": 2.9569801723823205e-06, "loss": 0.038, "step": 47405 }, { "epoch": 4.296719231466376, "grad_norm": 0.4009562134742737, "learning_rate": 2.9532500956071766e-06, "loss": 0.0324, "step": 47410 }, { "epoch": 4.297172376291463, "grad_norm": 0.4589937627315521, "learning_rate": 2.9495222252782147e-06, "loss": 0.0359, "step": 47415 }, { "epoch": 4.297625521116549, "grad_norm": 0.31299781799316406, "learning_rate": 2.9457965617685184e-06, "loss": 0.0351, "step": 47420 }, { "epoch": 4.298078665941635, "grad_norm": 0.37145644426345825, "learning_rate": 2.942073105450963e-06, "loss": 0.0339, "step": 47425 }, { "epoch": 4.298531810766721, "grad_norm": 0.39258143305778503, "learning_rate": 2.9383518566981876e-06, "loss": 0.0317, "step": 47430 }, { "epoch": 4.298984955591807, "grad_norm": 0.4067826271057129, "learning_rate": 2.934632815882618e-06, "loss": 0.0362, "step": 47435 }, { "epoch": 4.299438100416893, "grad_norm": 0.3378666639328003, "learning_rate": 2.930915983376453e-06, "loss": 0.0344, "step": 47440 }, { "epoch": 4.299891245241979, "grad_norm": 0.338803768157959, "learning_rate": 2.9272013595516735e-06, "loss": 0.0298, "step": 47445 }, { "epoch": 4.300344390067066, "grad_norm": 0.304399311542511, "learning_rate": 2.923488944780048e-06, "loss": 0.0294, "step": 47450 }, { "epoch": 4.300797534892151, "grad_norm": 0.37735727429389954, "learning_rate": 2.91977873943311e-06, "loss": 0.0317, "step": 47455 }, { "epoch": 4.301250679717238, "grad_norm": 0.37006106972694397, "learning_rate": 2.916070743882182e-06, "loss": 0.0399, "step": 47460 }, { "epoch": 4.301703824542324, "grad_norm": 0.32810086011886597, "learning_rate": 2.9123649584983535e-06, "loss": 0.0314, "step": 47465 }, { "epoch": 4.3021569693674095, "grad_norm": 0.4016077518463135, "learning_rate": 2.908661383652514e-06, "loss": 0.0368, "step": 47470 }, { "epoch": 4.302610114192496, "grad_norm": 0.3427225947380066, "learning_rate": 2.9049600197153084e-06, "loss": 0.0298, "step": 47475 }, { "epoch": 4.303063259017582, "grad_norm": 0.3315044939517975, "learning_rate": 2.901260867057176e-06, "loss": 0.0401, "step": 47480 }, { "epoch": 4.303516403842668, "grad_norm": 0.4103744924068451, "learning_rate": 2.8975639260483243e-06, "loss": 0.0375, "step": 47485 }, { "epoch": 4.303969548667754, "grad_norm": 0.3496705889701843, "learning_rate": 2.8938691970587433e-06, "loss": 0.0366, "step": 47490 }, { "epoch": 4.304422693492841, "grad_norm": 0.293322890996933, "learning_rate": 2.8901766804582092e-06, "loss": 0.0337, "step": 47495 }, { "epoch": 4.304875838317926, "grad_norm": 0.2913435399532318, "learning_rate": 2.886486376616268e-06, "loss": 0.0302, "step": 47500 }, { "epoch": 4.305328983143013, "grad_norm": 0.3157009780406952, "learning_rate": 2.882798285902244e-06, "loss": 0.0304, "step": 47505 }, { "epoch": 4.305782127968099, "grad_norm": 0.38173341751098633, "learning_rate": 2.8791124086852473e-06, "loss": 0.0317, "step": 47510 }, { "epoch": 4.3062352727931845, "grad_norm": 0.3349648416042328, "learning_rate": 2.8754287453341524e-06, "loss": 0.0299, "step": 47515 }, { "epoch": 4.306688417618271, "grad_norm": 0.2642950117588043, "learning_rate": 2.8717472962176333e-06, "loss": 0.028, "step": 47520 }, { "epoch": 4.307141562443357, "grad_norm": 0.34681063890457153, "learning_rate": 2.868068061704124e-06, "loss": 0.0299, "step": 47525 }, { "epoch": 4.307594707268443, "grad_norm": 0.2555021643638611, "learning_rate": 2.864391042161846e-06, "loss": 0.0279, "step": 47530 }, { "epoch": 4.308047852093529, "grad_norm": 0.3445412218570709, "learning_rate": 2.8607162379587938e-06, "loss": 0.0302, "step": 47535 }, { "epoch": 4.308500996918616, "grad_norm": 0.44416844844818115, "learning_rate": 2.8570436494627427e-06, "loss": 0.0336, "step": 47540 }, { "epoch": 4.308954141743701, "grad_norm": 0.3432234823703766, "learning_rate": 2.8533732770412573e-06, "loss": 0.0465, "step": 47545 }, { "epoch": 4.309407286568788, "grad_norm": 0.3280832767486572, "learning_rate": 2.8497051210616494e-06, "loss": 0.0295, "step": 47550 }, { "epoch": 4.309860431393873, "grad_norm": 0.2904241681098938, "learning_rate": 2.846039181891047e-06, "loss": 0.0294, "step": 47555 }, { "epoch": 4.3103135762189595, "grad_norm": 0.35174137353897095, "learning_rate": 2.842375459896332e-06, "loss": 0.0314, "step": 47560 }, { "epoch": 4.310766721044046, "grad_norm": 0.3548619747161865, "learning_rate": 2.8387139554441666e-06, "loss": 0.0308, "step": 47565 }, { "epoch": 4.311219865869131, "grad_norm": 0.4695952832698822, "learning_rate": 2.8350546689010106e-06, "loss": 0.0339, "step": 47570 }, { "epoch": 4.311673010694218, "grad_norm": 0.36852163076400757, "learning_rate": 2.8313976006330676e-06, "loss": 0.0302, "step": 47575 }, { "epoch": 4.312126155519304, "grad_norm": 0.3974175453186035, "learning_rate": 2.8277427510063487e-06, "loss": 0.0343, "step": 47580 }, { "epoch": 4.31257930034439, "grad_norm": 0.4076997637748718, "learning_rate": 2.8240901203866328e-06, "loss": 0.0307, "step": 47585 }, { "epoch": 4.313032445169476, "grad_norm": 0.3596881926059723, "learning_rate": 2.820439709139469e-06, "loss": 0.0338, "step": 47590 }, { "epoch": 4.3134855899945626, "grad_norm": 0.40094801783561707, "learning_rate": 2.8167915176302072e-06, "loss": 0.0362, "step": 47595 }, { "epoch": 4.313938734819648, "grad_norm": 0.27728238701820374, "learning_rate": 2.8131455462239386e-06, "loss": 0.0349, "step": 47600 }, { "epoch": 4.3143918796447345, "grad_norm": 0.45192450284957886, "learning_rate": 2.8095017952855685e-06, "loss": 0.0315, "step": 47605 }, { "epoch": 4.314845024469821, "grad_norm": 0.28092503547668457, "learning_rate": 2.8058602651797606e-06, "loss": 0.0294, "step": 47610 }, { "epoch": 4.315298169294906, "grad_norm": 0.2832961678504944, "learning_rate": 2.8022209562709568e-06, "loss": 0.0304, "step": 47615 }, { "epoch": 4.315751314119993, "grad_norm": 0.3498591482639313, "learning_rate": 2.7985838689233905e-06, "loss": 0.0328, "step": 47620 }, { "epoch": 4.316204458945079, "grad_norm": 0.3421516418457031, "learning_rate": 2.794949003501049e-06, "loss": 0.0293, "step": 47625 }, { "epoch": 4.316657603770165, "grad_norm": 0.38162001967430115, "learning_rate": 2.791316360367721e-06, "loss": 0.031, "step": 47630 }, { "epoch": 4.317110748595251, "grad_norm": 0.4497714042663574, "learning_rate": 2.787685939886958e-06, "loss": 0.0321, "step": 47635 }, { "epoch": 4.3175638934203375, "grad_norm": 0.4259360134601593, "learning_rate": 2.7840577424220913e-06, "loss": 0.0445, "step": 47640 }, { "epoch": 4.318017038245423, "grad_norm": 0.41887950897216797, "learning_rate": 2.7804317683362472e-06, "loss": 0.0319, "step": 47645 }, { "epoch": 4.3184701830705094, "grad_norm": 0.27253279089927673, "learning_rate": 2.776808017992291e-06, "loss": 0.037, "step": 47650 }, { "epoch": 4.318923327895595, "grad_norm": 0.39002132415771484, "learning_rate": 2.773186491752908e-06, "loss": 0.0328, "step": 47655 }, { "epoch": 4.319376472720681, "grad_norm": 0.3817850947380066, "learning_rate": 2.7695671899805347e-06, "loss": 0.0293, "step": 47660 }, { "epoch": 4.319829617545768, "grad_norm": 0.37683090567588806, "learning_rate": 2.7659501130373855e-06, "loss": 0.0293, "step": 47665 }, { "epoch": 4.320282762370853, "grad_norm": 0.3783101439476013, "learning_rate": 2.7623352612854756e-06, "loss": 0.0357, "step": 47670 }, { "epoch": 4.32073590719594, "grad_norm": 0.4398363530635834, "learning_rate": 2.758722635086561e-06, "loss": 0.0326, "step": 47675 }, { "epoch": 4.321189052021026, "grad_norm": 0.26280340552330017, "learning_rate": 2.755112234802207e-06, "loss": 0.0287, "step": 47680 }, { "epoch": 4.321642196846112, "grad_norm": 0.3526003658771515, "learning_rate": 2.7515040607937416e-06, "loss": 0.0294, "step": 47685 }, { "epoch": 4.322095341671198, "grad_norm": 0.3556106686592102, "learning_rate": 2.747898113422265e-06, "loss": 0.0349, "step": 47690 }, { "epoch": 4.322548486496284, "grad_norm": 0.2731344699859619, "learning_rate": 2.74429439304868e-06, "loss": 0.0301, "step": 47695 }, { "epoch": 4.32300163132137, "grad_norm": 0.30179134011268616, "learning_rate": 2.740692900033623e-06, "loss": 0.0374, "step": 47700 }, { "epoch": 4.323454776146456, "grad_norm": 0.3266521692276001, "learning_rate": 2.73709363473755e-06, "loss": 0.0324, "step": 47705 }, { "epoch": 4.323907920971543, "grad_norm": 0.3301123082637787, "learning_rate": 2.7334965975206737e-06, "loss": 0.0316, "step": 47710 }, { "epoch": 4.324361065796628, "grad_norm": 0.4887038767337799, "learning_rate": 2.729901788742986e-06, "loss": 0.033, "step": 47715 }, { "epoch": 4.324814210621715, "grad_norm": 0.4089599549770355, "learning_rate": 2.7263092087642495e-06, "loss": 0.0356, "step": 47720 }, { "epoch": 4.325267355446801, "grad_norm": 0.27207043766975403, "learning_rate": 2.7227188579440207e-06, "loss": 0.0319, "step": 47725 }, { "epoch": 4.325720500271887, "grad_norm": 0.2912025451660156, "learning_rate": 2.7191307366416203e-06, "loss": 0.0344, "step": 47730 }, { "epoch": 4.326173645096973, "grad_norm": 0.35283973813056946, "learning_rate": 2.71554484521615e-06, "loss": 0.0326, "step": 47735 }, { "epoch": 4.326626789922059, "grad_norm": 0.2723279297351837, "learning_rate": 2.711961184026482e-06, "loss": 0.0395, "step": 47740 }, { "epoch": 4.327079934747145, "grad_norm": 0.34893062710762024, "learning_rate": 2.70837975343127e-06, "loss": 0.0303, "step": 47745 }, { "epoch": 4.327533079572231, "grad_norm": 0.30834394693374634, "learning_rate": 2.7048005537889524e-06, "loss": 0.0319, "step": 47750 }, { "epoch": 4.327986224397318, "grad_norm": 0.401511549949646, "learning_rate": 2.7012235854577323e-06, "loss": 0.0286, "step": 47755 }, { "epoch": 4.328439369222403, "grad_norm": 0.29636237025260925, "learning_rate": 2.697648848795592e-06, "loss": 0.0438, "step": 47760 }, { "epoch": 4.32889251404749, "grad_norm": 0.33751577138900757, "learning_rate": 2.6940763441602984e-06, "loss": 0.0312, "step": 47765 }, { "epoch": 4.329345658872576, "grad_norm": 0.34249579906463623, "learning_rate": 2.6905060719093767e-06, "loss": 0.0436, "step": 47770 }, { "epoch": 4.329798803697662, "grad_norm": 0.31048697233200073, "learning_rate": 2.686938032400155e-06, "loss": 0.0304, "step": 47775 }, { "epoch": 4.330251948522748, "grad_norm": 0.42581871151924133, "learning_rate": 2.6833722259897164e-06, "loss": 0.0316, "step": 47780 }, { "epoch": 4.330705093347834, "grad_norm": 0.29777446389198303, "learning_rate": 2.679808653034932e-06, "loss": 0.0343, "step": 47785 }, { "epoch": 4.33115823817292, "grad_norm": 0.3658560812473297, "learning_rate": 2.676247313892441e-06, "loss": 0.0316, "step": 47790 }, { "epoch": 4.331611382998006, "grad_norm": 0.39481332898139954, "learning_rate": 2.6726882089186613e-06, "loss": 0.0361, "step": 47795 }, { "epoch": 4.332064527823093, "grad_norm": 0.31467610597610474, "learning_rate": 2.669131338469799e-06, "loss": 0.0338, "step": 47800 }, { "epoch": 4.332517672648178, "grad_norm": 0.3409980833530426, "learning_rate": 2.6655767029018202e-06, "loss": 0.0304, "step": 47805 }, { "epoch": 4.332970817473265, "grad_norm": 0.3578323423862457, "learning_rate": 2.6620243025704762e-06, "loss": 0.0283, "step": 47810 }, { "epoch": 4.33342396229835, "grad_norm": 0.556515097618103, "learning_rate": 2.6584741378312906e-06, "loss": 0.0329, "step": 47815 }, { "epoch": 4.3338771071234365, "grad_norm": 0.2506255805492401, "learning_rate": 2.6549262090395597e-06, "loss": 0.033, "step": 47820 }, { "epoch": 4.334330251948523, "grad_norm": 0.2845155894756317, "learning_rate": 2.651380516550375e-06, "loss": 0.03, "step": 47825 }, { "epoch": 4.3347833967736085, "grad_norm": 0.4270329177379608, "learning_rate": 2.6478370607185805e-06, "loss": 0.0301, "step": 47830 }, { "epoch": 4.335236541598695, "grad_norm": 0.4020865559577942, "learning_rate": 2.644295841898811e-06, "loss": 0.0334, "step": 47835 }, { "epoch": 4.335689686423781, "grad_norm": 0.4579846262931824, "learning_rate": 2.6407568604454703e-06, "loss": 0.0311, "step": 47840 }, { "epoch": 4.336142831248867, "grad_norm": 0.36955878138542175, "learning_rate": 2.637220116712738e-06, "loss": 0.036, "step": 47845 }, { "epoch": 4.336595976073953, "grad_norm": 0.27670368552207947, "learning_rate": 2.6336856110545814e-06, "loss": 0.0299, "step": 47850 }, { "epoch": 4.33704912089904, "grad_norm": 0.2876233756542206, "learning_rate": 2.630153343824726e-06, "loss": 0.029, "step": 47855 }, { "epoch": 4.337502265724125, "grad_norm": 0.3196759819984436, "learning_rate": 2.626623315376686e-06, "loss": 0.036, "step": 47860 }, { "epoch": 4.3379554105492115, "grad_norm": 0.3109481632709503, "learning_rate": 2.623095526063754e-06, "loss": 0.0455, "step": 47865 }, { "epoch": 4.338408555374298, "grad_norm": 0.43528300523757935, "learning_rate": 2.619569976238978e-06, "loss": 0.0315, "step": 47870 }, { "epoch": 4.338861700199383, "grad_norm": 0.3429262638092041, "learning_rate": 2.616046666255209e-06, "loss": 0.0305, "step": 47875 }, { "epoch": 4.33931484502447, "grad_norm": 0.34011775255203247, "learning_rate": 2.6125255964650567e-06, "loss": 0.028, "step": 47880 }, { "epoch": 4.339767989849556, "grad_norm": 0.3440861999988556, "learning_rate": 2.609006767220906e-06, "loss": 0.0305, "step": 47885 }, { "epoch": 4.340221134674642, "grad_norm": 0.4113209545612335, "learning_rate": 2.6054901788749364e-06, "loss": 0.0354, "step": 47890 }, { "epoch": 4.340674279499728, "grad_norm": 0.36956363916397095, "learning_rate": 2.6019758317790716e-06, "loss": 0.0322, "step": 47895 }, { "epoch": 4.341127424324815, "grad_norm": 0.38530272245407104, "learning_rate": 2.598463726285047e-06, "loss": 0.0312, "step": 47900 }, { "epoch": 4.3415805691499, "grad_norm": 0.3994750380516052, "learning_rate": 2.5949538627443343e-06, "loss": 0.0312, "step": 47905 }, { "epoch": 4.3420337139749865, "grad_norm": 0.3831225633621216, "learning_rate": 2.591446241508219e-06, "loss": 0.0299, "step": 47910 }, { "epoch": 4.342486858800073, "grad_norm": 0.45344340801239014, "learning_rate": 2.5879408629277404e-06, "loss": 0.036, "step": 47915 }, { "epoch": 4.342940003625158, "grad_norm": 0.32800889015197754, "learning_rate": 2.5844377273537086e-06, "loss": 0.0297, "step": 47920 }, { "epoch": 4.343393148450245, "grad_norm": 0.23449097573757172, "learning_rate": 2.580936835136738e-06, "loss": 0.029, "step": 47925 }, { "epoch": 4.34384629327533, "grad_norm": 0.2905588150024414, "learning_rate": 2.577438186627179e-06, "loss": 0.0301, "step": 47930 }, { "epoch": 4.344299438100417, "grad_norm": 0.507323682308197, "learning_rate": 2.5739417821751903e-06, "loss": 0.035, "step": 47935 }, { "epoch": 4.344752582925503, "grad_norm": 0.5837030410766602, "learning_rate": 2.5704476221306893e-06, "loss": 0.0319, "step": 47940 }, { "epoch": 4.345205727750589, "grad_norm": 0.37738192081451416, "learning_rate": 2.5669557068433685e-06, "loss": 0.0293, "step": 47945 }, { "epoch": 4.345658872575675, "grad_norm": 0.36869925260543823, "learning_rate": 2.5634660366627124e-06, "loss": 0.0307, "step": 47950 }, { "epoch": 4.3461120174007615, "grad_norm": 0.31361123919487, "learning_rate": 2.5599786119379493e-06, "loss": 0.0322, "step": 47955 }, { "epoch": 4.346565162225847, "grad_norm": 0.4263281524181366, "learning_rate": 2.55649343301812e-06, "loss": 0.0317, "step": 47960 }, { "epoch": 4.347018307050933, "grad_norm": 0.2777095437049866, "learning_rate": 2.553010500252012e-06, "loss": 0.0302, "step": 47965 }, { "epoch": 4.34747145187602, "grad_norm": 0.38533514738082886, "learning_rate": 2.5495298139881994e-06, "loss": 0.032, "step": 47970 }, { "epoch": 4.347924596701105, "grad_norm": 0.4712035357952118, "learning_rate": 2.5460513745750427e-06, "loss": 0.0315, "step": 47975 }, { "epoch": 4.348377741526192, "grad_norm": 0.3353274464607239, "learning_rate": 2.542575182360643e-06, "loss": 0.0292, "step": 47980 }, { "epoch": 4.348830886351278, "grad_norm": 0.4523184895515442, "learning_rate": 2.5391012376929146e-06, "loss": 0.0363, "step": 47985 }, { "epoch": 4.349284031176364, "grad_norm": 0.3929762542247772, "learning_rate": 2.535629540919529e-06, "loss": 0.03, "step": 47990 }, { "epoch": 4.34973717600145, "grad_norm": 0.40619683265686035, "learning_rate": 2.532160092387928e-06, "loss": 0.0295, "step": 47995 }, { "epoch": 4.3501903208265364, "grad_norm": 0.4750794470310211, "learning_rate": 2.5286928924453476e-06, "loss": 0.0332, "step": 48000 }, { "epoch": 4.350643465651622, "grad_norm": 0.5171582102775574, "learning_rate": 2.525227941438776e-06, "loss": 0.0328, "step": 48005 }, { "epoch": 4.351096610476708, "grad_norm": 0.4576246738433838, "learning_rate": 2.5217652397149906e-06, "loss": 0.0309, "step": 48010 }, { "epoch": 4.351549755301795, "grad_norm": 0.33714208006858826, "learning_rate": 2.5183047876205374e-06, "loss": 0.0293, "step": 48015 }, { "epoch": 4.35200290012688, "grad_norm": 0.3139207065105438, "learning_rate": 2.5148465855017374e-06, "loss": 0.0283, "step": 48020 }, { "epoch": 4.352456044951967, "grad_norm": 0.28805601596832275, "learning_rate": 2.5113906337046967e-06, "loss": 0.0307, "step": 48025 }, { "epoch": 4.352909189777053, "grad_norm": 0.3568617105484009, "learning_rate": 2.5079369325752835e-06, "loss": 0.0296, "step": 48030 }, { "epoch": 4.353362334602139, "grad_norm": 0.2820644974708557, "learning_rate": 2.5044854824591456e-06, "loss": 0.0353, "step": 48035 }, { "epoch": 4.353815479427225, "grad_norm": 0.4123474955558777, "learning_rate": 2.501036283701705e-06, "loss": 0.032, "step": 48040 }, { "epoch": 4.354268624252311, "grad_norm": 0.3391786217689514, "learning_rate": 2.497589336648154e-06, "loss": 0.0285, "step": 48045 }, { "epoch": 4.354721769077397, "grad_norm": 0.2571937143802643, "learning_rate": 2.4941446416434757e-06, "loss": 0.032, "step": 48050 }, { "epoch": 4.355174913902483, "grad_norm": 0.5739266276359558, "learning_rate": 2.4907021990324077e-06, "loss": 0.0353, "step": 48055 }, { "epoch": 4.35562805872757, "grad_norm": 0.7575083374977112, "learning_rate": 2.4872620091594743e-06, "loss": 0.0471, "step": 48060 }, { "epoch": 4.356081203552655, "grad_norm": 0.4032142758369446, "learning_rate": 2.4838240723689675e-06, "loss": 0.0324, "step": 48065 }, { "epoch": 4.356534348377742, "grad_norm": 0.444638192653656, "learning_rate": 2.480388389004956e-06, "loss": 0.0315, "step": 48070 }, { "epoch": 4.356987493202828, "grad_norm": 0.47089657187461853, "learning_rate": 2.476954959411293e-06, "loss": 0.034, "step": 48075 }, { "epoch": 4.357440638027914, "grad_norm": 0.3551241457462311, "learning_rate": 2.4735237839315915e-06, "loss": 0.0348, "step": 48080 }, { "epoch": 4.357893782853, "grad_norm": 0.25945714116096497, "learning_rate": 2.4700948629092417e-06, "loss": 0.0361, "step": 48085 }, { "epoch": 4.3583469276780855, "grad_norm": 0.3292652368545532, "learning_rate": 2.4666681966874162e-06, "loss": 0.0311, "step": 48090 }, { "epoch": 4.358800072503172, "grad_norm": 0.4037098288536072, "learning_rate": 2.463243785609057e-06, "loss": 0.0309, "step": 48095 }, { "epoch": 4.359253217328258, "grad_norm": 0.3798333704471588, "learning_rate": 2.4598216300168704e-06, "loss": 0.0313, "step": 48100 }, { "epoch": 4.359706362153344, "grad_norm": 0.3666374981403351, "learning_rate": 2.4564017302533603e-06, "loss": 0.0301, "step": 48105 }, { "epoch": 4.36015950697843, "grad_norm": 0.312717080116272, "learning_rate": 2.4529840866607866e-06, "loss": 0.0321, "step": 48110 }, { "epoch": 4.360612651803517, "grad_norm": 0.3469960689544678, "learning_rate": 2.449568699581184e-06, "loss": 0.0303, "step": 48115 }, { "epoch": 4.361065796628602, "grad_norm": 0.2876608669757843, "learning_rate": 2.446155569356376e-06, "loss": 0.0349, "step": 48120 }, { "epoch": 4.361518941453689, "grad_norm": 0.2806805968284607, "learning_rate": 2.4427446963279367e-06, "loss": 0.0384, "step": 48125 }, { "epoch": 4.361972086278775, "grad_norm": 0.6158187985420227, "learning_rate": 2.4393360808372346e-06, "loss": 0.0496, "step": 48130 }, { "epoch": 4.3624252311038605, "grad_norm": 0.3313640058040619, "learning_rate": 2.4359297232254054e-06, "loss": 0.0316, "step": 48135 }, { "epoch": 4.362878375928947, "grad_norm": 0.4432685077190399, "learning_rate": 2.4325256238333537e-06, "loss": 0.0299, "step": 48140 }, { "epoch": 4.363331520754033, "grad_norm": 0.34445106983184814, "learning_rate": 2.4291237830017772e-06, "loss": 0.0462, "step": 48145 }, { "epoch": 4.363784665579119, "grad_norm": 0.33539581298828125, "learning_rate": 2.4257242010711113e-06, "loss": 0.0382, "step": 48150 }, { "epoch": 4.364237810404205, "grad_norm": 0.433167427778244, "learning_rate": 2.422326878381603e-06, "loss": 0.0334, "step": 48155 }, { "epoch": 4.364690955229292, "grad_norm": 0.2884577214717865, "learning_rate": 2.418931815273254e-06, "loss": 0.0319, "step": 48160 }, { "epoch": 4.365144100054377, "grad_norm": 0.2950303852558136, "learning_rate": 2.415539012085835e-06, "loss": 0.0311, "step": 48165 }, { "epoch": 4.3655972448794635, "grad_norm": 0.3474918305873871, "learning_rate": 2.4121484691589175e-06, "loss": 0.032, "step": 48170 }, { "epoch": 4.36605038970455, "grad_norm": 0.3642883002758026, "learning_rate": 2.40876018683181e-06, "loss": 0.0328, "step": 48175 }, { "epoch": 4.3665035345296355, "grad_norm": 0.3610813319683075, "learning_rate": 2.4053741654436217e-06, "loss": 0.0344, "step": 48180 }, { "epoch": 4.366956679354722, "grad_norm": 0.2692244052886963, "learning_rate": 2.4019904053332264e-06, "loss": 0.0383, "step": 48185 }, { "epoch": 4.367409824179808, "grad_norm": 0.4009759724140167, "learning_rate": 2.398608906839267e-06, "loss": 0.0311, "step": 48190 }, { "epoch": 4.367862969004894, "grad_norm": 0.3230663239955902, "learning_rate": 2.395229670300175e-06, "loss": 0.0387, "step": 48195 }, { "epoch": 4.36831611382998, "grad_norm": 0.26373091340065, "learning_rate": 2.391852696054131e-06, "loss": 0.0299, "step": 48200 }, { "epoch": 4.368769258655066, "grad_norm": 0.41764146089553833, "learning_rate": 2.388477984439119e-06, "loss": 0.0312, "step": 48205 }, { "epoch": 4.369222403480152, "grad_norm": 0.3045555055141449, "learning_rate": 2.3851055357928725e-06, "loss": 0.0326, "step": 48210 }, { "epoch": 4.3696755483052385, "grad_norm": 0.4815694987773895, "learning_rate": 2.3817353504529014e-06, "loss": 0.0319, "step": 48215 }, { "epoch": 4.370128693130324, "grad_norm": 0.30219221115112305, "learning_rate": 2.3783674287565143e-06, "loss": 0.0304, "step": 48220 }, { "epoch": 4.37058183795541, "grad_norm": 0.3887168765068054, "learning_rate": 2.3750017710407553e-06, "loss": 0.0428, "step": 48225 }, { "epoch": 4.371034982780497, "grad_norm": 0.38516923785209656, "learning_rate": 2.371638377642468e-06, "loss": 0.0379, "step": 48230 }, { "epoch": 4.371488127605582, "grad_norm": 0.3561106324195862, "learning_rate": 2.368277248898265e-06, "loss": 0.0333, "step": 48235 }, { "epoch": 4.371941272430669, "grad_norm": 0.33143898844718933, "learning_rate": 2.3649183851445183e-06, "loss": 0.0309, "step": 48240 }, { "epoch": 4.372394417255755, "grad_norm": 0.3574714958667755, "learning_rate": 2.3615617867174005e-06, "loss": 0.0312, "step": 48245 }, { "epoch": 4.372847562080841, "grad_norm": 0.427593857049942, "learning_rate": 2.3582074539528235e-06, "loss": 0.0389, "step": 48250 }, { "epoch": 4.373300706905927, "grad_norm": 0.3197125196456909, "learning_rate": 2.3548553871865054e-06, "loss": 0.0301, "step": 48255 }, { "epoch": 4.3737538517310135, "grad_norm": 0.31242460012435913, "learning_rate": 2.351505586753913e-06, "loss": 0.0306, "step": 48260 }, { "epoch": 4.374206996556099, "grad_norm": 0.3381958305835724, "learning_rate": 2.3481580529902985e-06, "loss": 0.0338, "step": 48265 }, { "epoch": 4.374660141381185, "grad_norm": 0.2935141921043396, "learning_rate": 2.344812786230682e-06, "loss": 0.0294, "step": 48270 }, { "epoch": 4.375113286206272, "grad_norm": 0.42885228991508484, "learning_rate": 2.3414697868098578e-06, "loss": 0.0328, "step": 48275 }, { "epoch": 4.375566431031357, "grad_norm": 0.4169885516166687, "learning_rate": 2.338129055062402e-06, "loss": 0.037, "step": 48280 }, { "epoch": 4.376019575856444, "grad_norm": 0.3392855226993561, "learning_rate": 2.3347905913226496e-06, "loss": 0.0328, "step": 48285 }, { "epoch": 4.37647272068153, "grad_norm": 0.3490350544452667, "learning_rate": 2.3314543959247176e-06, "loss": 0.0332, "step": 48290 }, { "epoch": 4.376925865506616, "grad_norm": 0.32694011926651, "learning_rate": 2.328120469202491e-06, "loss": 0.0282, "step": 48295 }, { "epoch": 4.377379010331702, "grad_norm": 0.41227179765701294, "learning_rate": 2.324788811489631e-06, "loss": 0.0286, "step": 48300 }, { "epoch": 4.3778321551567885, "grad_norm": 0.2844816744327545, "learning_rate": 2.3214594231195735e-06, "loss": 0.0289, "step": 48305 }, { "epoch": 4.378285299981874, "grad_norm": 0.3230150043964386, "learning_rate": 2.318132304425524e-06, "loss": 0.0308, "step": 48310 }, { "epoch": 4.37873844480696, "grad_norm": 0.2888811230659485, "learning_rate": 2.3148074557404642e-06, "loss": 0.0319, "step": 48315 }, { "epoch": 4.379191589632047, "grad_norm": 0.5420850515365601, "learning_rate": 2.311484877397141e-06, "loss": 0.0342, "step": 48320 }, { "epoch": 4.379644734457132, "grad_norm": 0.3469266891479492, "learning_rate": 2.3081645697280747e-06, "loss": 0.0318, "step": 48325 }, { "epoch": 4.380097879282219, "grad_norm": 0.3355007767677307, "learning_rate": 2.3048465330655744e-06, "loss": 0.0371, "step": 48330 }, { "epoch": 4.380551024107305, "grad_norm": 0.32331573963165283, "learning_rate": 2.301530767741708e-06, "loss": 0.031, "step": 48335 }, { "epoch": 4.381004168932391, "grad_norm": 0.3406866192817688, "learning_rate": 2.298217274088313e-06, "loss": 0.0286, "step": 48340 }, { "epoch": 4.381457313757477, "grad_norm": 0.3857710063457489, "learning_rate": 2.294906052437007e-06, "loss": 0.0305, "step": 48345 }, { "epoch": 4.3819104585825635, "grad_norm": 0.4924919009208679, "learning_rate": 2.2915971031191754e-06, "loss": 0.0302, "step": 48350 }, { "epoch": 4.382363603407649, "grad_norm": 0.3439856469631195, "learning_rate": 2.288290426465989e-06, "loss": 0.0304, "step": 48355 }, { "epoch": 4.382816748232735, "grad_norm": 0.27684730291366577, "learning_rate": 2.284986022808372e-06, "loss": 0.0361, "step": 48360 }, { "epoch": 4.383269893057821, "grad_norm": 0.3165806531906128, "learning_rate": 2.281683892477032e-06, "loss": 0.0329, "step": 48365 }, { "epoch": 4.383723037882907, "grad_norm": 0.3572239577770233, "learning_rate": 2.2783840358024446e-06, "loss": 0.0289, "step": 48370 }, { "epoch": 4.384176182707994, "grad_norm": 0.30641040205955505, "learning_rate": 2.275086453114869e-06, "loss": 0.0319, "step": 48375 }, { "epoch": 4.384629327533079, "grad_norm": 0.3502342998981476, "learning_rate": 2.27179114474432e-06, "loss": 0.0305, "step": 48380 }, { "epoch": 4.385082472358166, "grad_norm": 0.4498416483402252, "learning_rate": 2.2684981110206007e-06, "loss": 0.0325, "step": 48385 }, { "epoch": 4.385535617183252, "grad_norm": 0.3207806944847107, "learning_rate": 2.2652073522732743e-06, "loss": 0.0301, "step": 48390 }, { "epoch": 4.3859887620083375, "grad_norm": 0.3432600796222687, "learning_rate": 2.261918868831675e-06, "loss": 0.0336, "step": 48395 }, { "epoch": 4.386441906833424, "grad_norm": 0.31114694476127625, "learning_rate": 2.2586326610249287e-06, "loss": 0.0288, "step": 48400 }, { "epoch": 4.38689505165851, "grad_norm": 0.26820215582847595, "learning_rate": 2.2553487291819108e-06, "loss": 0.034, "step": 48405 }, { "epoch": 4.387348196483596, "grad_norm": 0.30091434717178345, "learning_rate": 2.252067073631284e-06, "loss": 0.0288, "step": 48410 }, { "epoch": 4.387801341308682, "grad_norm": 0.3310679495334625, "learning_rate": 2.2487876947014708e-06, "loss": 0.0283, "step": 48415 }, { "epoch": 4.388254486133769, "grad_norm": 0.39599189162254333, "learning_rate": 2.2455105927206732e-06, "loss": 0.0309, "step": 48420 }, { "epoch": 4.388707630958854, "grad_norm": 0.34297844767570496, "learning_rate": 2.2422357680168734e-06, "loss": 0.0318, "step": 48425 }, { "epoch": 4.389160775783941, "grad_norm": 0.4352855086326599, "learning_rate": 2.2389632209178115e-06, "loss": 0.0297, "step": 48430 }, { "epoch": 4.389613920609027, "grad_norm": 0.3708289861679077, "learning_rate": 2.2356929517510035e-06, "loss": 0.0294, "step": 48435 }, { "epoch": 4.3900670654341125, "grad_norm": 0.40884971618652344, "learning_rate": 2.2324249608437404e-06, "loss": 0.0308, "step": 48440 }, { "epoch": 4.390520210259199, "grad_norm": 0.4303436577320099, "learning_rate": 2.22915924852308e-06, "loss": 0.0312, "step": 48445 }, { "epoch": 4.390973355084285, "grad_norm": 0.25989896059036255, "learning_rate": 2.2258958151158664e-06, "loss": 0.0327, "step": 48450 }, { "epoch": 4.391426499909371, "grad_norm": 0.3151020407676697, "learning_rate": 2.2226346609486914e-06, "loss": 0.0289, "step": 48455 }, { "epoch": 4.391879644734457, "grad_norm": 0.2749687731266022, "learning_rate": 2.2193757863479437e-06, "loss": 0.0285, "step": 48460 }, { "epoch": 4.392332789559543, "grad_norm": 0.3490508198738098, "learning_rate": 2.216119191639768e-06, "loss": 0.0304, "step": 48465 }, { "epoch": 4.392785934384629, "grad_norm": 0.3330613672733307, "learning_rate": 2.212864877150081e-06, "loss": 0.0319, "step": 48470 }, { "epoch": 4.393239079209716, "grad_norm": 0.40434595942497253, "learning_rate": 2.209612843204592e-06, "loss": 0.0301, "step": 48475 }, { "epoch": 4.393692224034801, "grad_norm": 0.4568648636341095, "learning_rate": 2.206363090128741e-06, "loss": 0.0333, "step": 48480 }, { "epoch": 4.3941453688598875, "grad_norm": 0.35810384154319763, "learning_rate": 2.2031156182477836e-06, "loss": 0.0312, "step": 48485 }, { "epoch": 4.394598513684974, "grad_norm": 0.3856554627418518, "learning_rate": 2.1998704278867214e-06, "loss": 0.0311, "step": 48490 }, { "epoch": 4.395051658510059, "grad_norm": 0.2882387042045593, "learning_rate": 2.196627519370331e-06, "loss": 0.0303, "step": 48495 }, { "epoch": 4.395504803335146, "grad_norm": 0.3539751470088959, "learning_rate": 2.193386893023175e-06, "loss": 0.0353, "step": 48500 }, { "epoch": 4.395957948160232, "grad_norm": 0.27259063720703125, "learning_rate": 2.19014854916956e-06, "loss": 0.0273, "step": 48505 }, { "epoch": 4.396411092985318, "grad_norm": 0.5979635119438171, "learning_rate": 2.1869124881335938e-06, "loss": 0.0343, "step": 48510 }, { "epoch": 4.396864237810404, "grad_norm": 0.36546653509140015, "learning_rate": 2.183678710239137e-06, "loss": 0.0327, "step": 48515 }, { "epoch": 4.3973173826354905, "grad_norm": 0.3372959494590759, "learning_rate": 2.180447215809825e-06, "loss": 0.0323, "step": 48520 }, { "epoch": 4.397770527460576, "grad_norm": 0.32360586524009705, "learning_rate": 2.1772180051690767e-06, "loss": 0.0295, "step": 48525 }, { "epoch": 4.3982236722856625, "grad_norm": 0.29974159598350525, "learning_rate": 2.1739910786400586e-06, "loss": 0.0287, "step": 48530 }, { "epoch": 4.398676817110749, "grad_norm": 0.39833664894104004, "learning_rate": 2.170766436545732e-06, "loss": 0.0317, "step": 48535 }, { "epoch": 4.399129961935834, "grad_norm": 0.4217323064804077, "learning_rate": 2.1675440792088194e-06, "loss": 0.0306, "step": 48540 }, { "epoch": 4.399583106760921, "grad_norm": 0.31206777691841125, "learning_rate": 2.1643240069518095e-06, "loss": 0.0314, "step": 48545 }, { "epoch": 4.400036251586007, "grad_norm": 0.28213825821876526, "learning_rate": 2.161106220096981e-06, "loss": 0.028, "step": 48550 }, { "epoch": 4.400489396411093, "grad_norm": 0.30835986137390137, "learning_rate": 2.1578907189663546e-06, "loss": 0.0311, "step": 48555 }, { "epoch": 4.400942541236179, "grad_norm": 0.44930386543273926, "learning_rate": 2.1546775038817496e-06, "loss": 0.0339, "step": 48560 }, { "epoch": 4.4013956860612655, "grad_norm": 0.32405534386634827, "learning_rate": 2.151466575164743e-06, "loss": 0.0315, "step": 48565 }, { "epoch": 4.401848830886351, "grad_norm": 0.4844825267791748, "learning_rate": 2.1482579331366828e-06, "loss": 0.0319, "step": 48570 }, { "epoch": 4.402301975711437, "grad_norm": 0.3568063974380493, "learning_rate": 2.145051578118701e-06, "loss": 0.0278, "step": 48575 }, { "epoch": 4.402755120536524, "grad_norm": 0.44334128499031067, "learning_rate": 2.141847510431677e-06, "loss": 0.0322, "step": 48580 }, { "epoch": 4.403208265361609, "grad_norm": 0.3874726891517639, "learning_rate": 2.1386457303962825e-06, "loss": 0.0331, "step": 48585 }, { "epoch": 4.403661410186696, "grad_norm": 0.39704087376594543, "learning_rate": 2.1354462383329527e-06, "loss": 0.034, "step": 48590 }, { "epoch": 4.404114555011782, "grad_norm": 0.3380947709083557, "learning_rate": 2.1322490345618894e-06, "loss": 0.0296, "step": 48595 }, { "epoch": 4.404567699836868, "grad_norm": 0.34964752197265625, "learning_rate": 2.1290541194030816e-06, "loss": 0.0423, "step": 48600 }, { "epoch": 4.405020844661954, "grad_norm": 0.32220715284347534, "learning_rate": 2.1258614931762593e-06, "loss": 0.0319, "step": 48605 }, { "epoch": 4.4054739894870405, "grad_norm": 0.4304546117782593, "learning_rate": 2.122671156200956e-06, "loss": 0.0307, "step": 48610 }, { "epoch": 4.405927134312126, "grad_norm": 0.30045101046562195, "learning_rate": 2.119483108796458e-06, "loss": 0.0297, "step": 48615 }, { "epoch": 4.406380279137212, "grad_norm": 0.30740609765052795, "learning_rate": 2.1162973512818208e-06, "loss": 0.033, "step": 48620 }, { "epoch": 4.406833423962298, "grad_norm": 0.2666133940219879, "learning_rate": 2.1131138839758897e-06, "loss": 0.0273, "step": 48625 }, { "epoch": 4.407286568787384, "grad_norm": 0.483130544424057, "learning_rate": 2.1099327071972486e-06, "loss": 0.0359, "step": 48630 }, { "epoch": 4.407739713612471, "grad_norm": 0.35653117299079895, "learning_rate": 2.1067538212642844e-06, "loss": 0.03, "step": 48635 }, { "epoch": 4.408192858437556, "grad_norm": 0.3705826997756958, "learning_rate": 2.1035772264951376e-06, "loss": 0.0318, "step": 48640 }, { "epoch": 4.408646003262643, "grad_norm": 0.3212147057056427, "learning_rate": 2.10040292320772e-06, "loss": 0.0293, "step": 48645 }, { "epoch": 4.409099148087729, "grad_norm": 0.5035169124603271, "learning_rate": 2.0972309117197163e-06, "loss": 0.0317, "step": 48650 }, { "epoch": 4.409552292912815, "grad_norm": 0.3742232620716095, "learning_rate": 2.09406119234859e-06, "loss": 0.0291, "step": 48655 }, { "epoch": 4.410005437737901, "grad_norm": 0.298918217420578, "learning_rate": 2.0908937654115613e-06, "loss": 0.0302, "step": 48660 }, { "epoch": 4.410458582562987, "grad_norm": 0.2126983404159546, "learning_rate": 2.08772863122563e-06, "loss": 0.0331, "step": 48665 }, { "epoch": 4.410911727388073, "grad_norm": 0.6836822628974915, "learning_rate": 2.0845657901075623e-06, "loss": 0.0349, "step": 48670 }, { "epoch": 4.411364872213159, "grad_norm": 0.3833475112915039, "learning_rate": 2.0814052423738933e-06, "loss": 0.0326, "step": 48675 }, { "epoch": 4.411818017038246, "grad_norm": 0.3533196449279785, "learning_rate": 2.07824698834094e-06, "loss": 0.0319, "step": 48680 }, { "epoch": 4.412271161863331, "grad_norm": 0.30298250913619995, "learning_rate": 2.075091028324777e-06, "loss": 0.0302, "step": 48685 }, { "epoch": 4.412724306688418, "grad_norm": 0.4609816074371338, "learning_rate": 2.071937362641252e-06, "loss": 0.0351, "step": 48690 }, { "epoch": 4.413177451513504, "grad_norm": 0.31242990493774414, "learning_rate": 2.068785991605987e-06, "loss": 0.0307, "step": 48695 }, { "epoch": 4.41363059633859, "grad_norm": 0.3375948369503021, "learning_rate": 2.0656369155343693e-06, "loss": 0.0345, "step": 48700 }, { "epoch": 4.414083741163676, "grad_norm": 0.3310588598251343, "learning_rate": 2.0624901347415652e-06, "loss": 0.032, "step": 48705 }, { "epoch": 4.414536885988762, "grad_norm": 0.22309093177318573, "learning_rate": 2.059345649542502e-06, "loss": 0.0343, "step": 48710 }, { "epoch": 4.414990030813848, "grad_norm": 0.35957473516464233, "learning_rate": 2.056203460251882e-06, "loss": 0.0292, "step": 48715 }, { "epoch": 4.415443175638934, "grad_norm": 0.519071638584137, "learning_rate": 2.053063567184174e-06, "loss": 0.0356, "step": 48720 }, { "epoch": 4.415896320464021, "grad_norm": 0.31678837537765503, "learning_rate": 2.0499259706536185e-06, "loss": 0.0347, "step": 48725 }, { "epoch": 4.416349465289106, "grad_norm": 0.5058169364929199, "learning_rate": 2.0467906709742356e-06, "loss": 0.0352, "step": 48730 }, { "epoch": 4.416802610114193, "grad_norm": 0.3739681839942932, "learning_rate": 2.0436576684598e-06, "loss": 0.0314, "step": 48735 }, { "epoch": 4.417255754939278, "grad_norm": 0.3713376522064209, "learning_rate": 2.0405269634238666e-06, "loss": 0.0324, "step": 48740 }, { "epoch": 4.4177088997643645, "grad_norm": 0.3504594564437866, "learning_rate": 2.0373985561797565e-06, "loss": 0.0316, "step": 48745 }, { "epoch": 4.418162044589451, "grad_norm": 0.3580608367919922, "learning_rate": 2.0342724470405554e-06, "loss": 0.0319, "step": 48750 }, { "epoch": 4.4186151894145365, "grad_norm": 0.32080766558647156, "learning_rate": 2.0311486363191355e-06, "loss": 0.0298, "step": 48755 }, { "epoch": 4.419068334239623, "grad_norm": 0.32642775774002075, "learning_rate": 2.0280271243281275e-06, "loss": 0.0326, "step": 48760 }, { "epoch": 4.419521479064709, "grad_norm": 0.2953478693962097, "learning_rate": 2.0249079113799225e-06, "loss": 0.0361, "step": 48765 }, { "epoch": 4.419974623889795, "grad_norm": 0.4369277060031891, "learning_rate": 2.021790997786713e-06, "loss": 0.0307, "step": 48770 }, { "epoch": 4.420427768714881, "grad_norm": 0.37937647104263306, "learning_rate": 2.0186763838604155e-06, "loss": 0.0306, "step": 48775 }, { "epoch": 4.420880913539968, "grad_norm": 0.40137606859207153, "learning_rate": 2.015564069912762e-06, "loss": 0.0346, "step": 48780 }, { "epoch": 4.421334058365053, "grad_norm": 0.29897475242614746, "learning_rate": 2.012454056255225e-06, "loss": 0.0324, "step": 48785 }, { "epoch": 4.4217872031901395, "grad_norm": 0.40848928689956665, "learning_rate": 2.0093463431990532e-06, "loss": 0.0307, "step": 48790 }, { "epoch": 4.422240348015226, "grad_norm": 0.34945425391197205, "learning_rate": 2.006240931055278e-06, "loss": 0.0309, "step": 48795 }, { "epoch": 4.422693492840311, "grad_norm": 0.26251712441444397, "learning_rate": 2.003137820134676e-06, "loss": 0.0284, "step": 48800 }, { "epoch": 4.423146637665398, "grad_norm": 0.3930237293243408, "learning_rate": 2.000037010747827e-06, "loss": 0.0331, "step": 48805 }, { "epoch": 4.423599782490484, "grad_norm": 0.41286006569862366, "learning_rate": 1.996938503205037e-06, "loss": 0.0317, "step": 48810 }, { "epoch": 4.42405292731557, "grad_norm": 0.28440460562705994, "learning_rate": 1.9938422978164228e-06, "loss": 0.0287, "step": 48815 }, { "epoch": 4.424506072140656, "grad_norm": 0.4071197211742401, "learning_rate": 1.9907483948918476e-06, "loss": 0.0302, "step": 48820 }, { "epoch": 4.424959216965743, "grad_norm": 0.33135613799095154, "learning_rate": 1.987656794740947e-06, "loss": 0.03, "step": 48825 }, { "epoch": 4.425412361790828, "grad_norm": 0.42428138852119446, "learning_rate": 1.9845674976731453e-06, "loss": 0.0297, "step": 48830 }, { "epoch": 4.4258655066159145, "grad_norm": 0.29323694109916687, "learning_rate": 1.9814805039975987e-06, "loss": 0.0307, "step": 48835 }, { "epoch": 4.426318651441001, "grad_norm": 0.25764966011047363, "learning_rate": 1.978395814023268e-06, "loss": 0.0395, "step": 48840 }, { "epoch": 4.426771796266086, "grad_norm": 0.38126951456069946, "learning_rate": 1.975313428058867e-06, "loss": 0.0314, "step": 48845 }, { "epoch": 4.427224941091173, "grad_norm": 0.35315510630607605, "learning_rate": 1.9722333464128773e-06, "loss": 0.0315, "step": 48850 }, { "epoch": 4.427678085916259, "grad_norm": 0.34815970063209534, "learning_rate": 1.9691555693935686e-06, "loss": 0.0291, "step": 48855 }, { "epoch": 4.428131230741345, "grad_norm": 0.3511715829372406, "learning_rate": 1.966080097308945e-06, "loss": 0.0337, "step": 48860 }, { "epoch": 4.428584375566431, "grad_norm": 0.2846224009990692, "learning_rate": 1.963006930466818e-06, "loss": 0.0329, "step": 48865 }, { "epoch": 4.4290375203915175, "grad_norm": 0.3454594910144806, "learning_rate": 1.959936069174745e-06, "loss": 0.0297, "step": 48870 }, { "epoch": 4.429490665216603, "grad_norm": 0.36629223823547363, "learning_rate": 1.9568675137400544e-06, "loss": 0.0307, "step": 48875 }, { "epoch": 4.4299438100416895, "grad_norm": 0.33362746238708496, "learning_rate": 1.953801264469862e-06, "loss": 0.033, "step": 48880 }, { "epoch": 4.430396954866776, "grad_norm": 0.34714943170547485, "learning_rate": 1.95073732167102e-06, "loss": 0.0312, "step": 48885 }, { "epoch": 4.430850099691861, "grad_norm": 0.3279556632041931, "learning_rate": 1.9476756856501827e-06, "loss": 0.0317, "step": 48890 }, { "epoch": 4.431303244516948, "grad_norm": 0.26636895537376404, "learning_rate": 1.944616356713758e-06, "loss": 0.0283, "step": 48895 }, { "epoch": 4.431756389342033, "grad_norm": 0.36024755239486694, "learning_rate": 1.941559335167917e-06, "loss": 0.0317, "step": 48900 }, { "epoch": 4.43220953416712, "grad_norm": 0.618777871131897, "learning_rate": 1.938504621318615e-06, "loss": 0.033, "step": 48905 }, { "epoch": 4.432662678992206, "grad_norm": 0.3179643452167511, "learning_rate": 1.935452215471567e-06, "loss": 0.0331, "step": 48910 }, { "epoch": 4.433115823817292, "grad_norm": 0.2925373911857605, "learning_rate": 1.932402117932261e-06, "loss": 0.0302, "step": 48915 }, { "epoch": 4.433568968642378, "grad_norm": 0.369296133518219, "learning_rate": 1.9293543290059474e-06, "loss": 0.034, "step": 48920 }, { "epoch": 4.434022113467464, "grad_norm": 0.5149743556976318, "learning_rate": 1.926308848997646e-06, "loss": 0.0325, "step": 48925 }, { "epoch": 4.43447525829255, "grad_norm": 0.27495357394218445, "learning_rate": 1.9232656782121607e-06, "loss": 0.0322, "step": 48930 }, { "epoch": 4.434928403117636, "grad_norm": 0.4356018006801605, "learning_rate": 1.920224816954047e-06, "loss": 0.0297, "step": 48935 }, { "epoch": 4.435381547942723, "grad_norm": 0.3670159876346588, "learning_rate": 1.9171862655276372e-06, "loss": 0.0286, "step": 48940 }, { "epoch": 4.435834692767808, "grad_norm": 0.5642684102058411, "learning_rate": 1.914150024237027e-06, "loss": 0.0368, "step": 48945 }, { "epoch": 4.436287837592895, "grad_norm": 0.328342080116272, "learning_rate": 1.911116093386084e-06, "loss": 0.0298, "step": 48950 }, { "epoch": 4.436740982417981, "grad_norm": 0.3675147294998169, "learning_rate": 1.908084473278449e-06, "loss": 0.0362, "step": 48955 }, { "epoch": 4.437194127243067, "grad_norm": 0.36639106273651123, "learning_rate": 1.9050551642175284e-06, "loss": 0.0312, "step": 48960 }, { "epoch": 4.437647272068153, "grad_norm": 0.28567788004875183, "learning_rate": 1.9020281665064948e-06, "loss": 0.0302, "step": 48965 }, { "epoch": 4.438100416893239, "grad_norm": 0.42377644777297974, "learning_rate": 1.8990034804482887e-06, "loss": 0.0316, "step": 48970 }, { "epoch": 4.438553561718325, "grad_norm": 0.36124348640441895, "learning_rate": 1.8959811063456206e-06, "loss": 0.0292, "step": 48975 }, { "epoch": 4.439006706543411, "grad_norm": 0.2686590850353241, "learning_rate": 1.8929610445009794e-06, "loss": 0.0355, "step": 48980 }, { "epoch": 4.439459851368498, "grad_norm": 0.45634689927101135, "learning_rate": 1.8899432952166064e-06, "loss": 0.0327, "step": 48985 }, { "epoch": 4.439912996193583, "grad_norm": 0.41605016589164734, "learning_rate": 1.886927858794521e-06, "loss": 0.0317, "step": 48990 }, { "epoch": 4.44036614101867, "grad_norm": 0.4107591211795807, "learning_rate": 1.8839147355365071e-06, "loss": 0.0339, "step": 48995 }, { "epoch": 4.440819285843756, "grad_norm": 0.3115491271018982, "learning_rate": 1.8809039257441235e-06, "loss": 0.0294, "step": 49000 }, { "epoch": 4.441272430668842, "grad_norm": 0.3777179718017578, "learning_rate": 1.8778954297186847e-06, "loss": 0.038, "step": 49005 }, { "epoch": 4.441725575493928, "grad_norm": 0.4320811629295349, "learning_rate": 1.874889247761291e-06, "loss": 0.0412, "step": 49010 }, { "epoch": 4.4421787203190135, "grad_norm": 0.3676460385322571, "learning_rate": 1.8718853801728025e-06, "loss": 0.029, "step": 49015 }, { "epoch": 4.4426318651441, "grad_norm": 0.30936911702156067, "learning_rate": 1.8688838272538366e-06, "loss": 0.0302, "step": 49020 }, { "epoch": 4.443085009969186, "grad_norm": 0.2823067605495453, "learning_rate": 1.865884589304806e-06, "loss": 0.0306, "step": 49025 }, { "epoch": 4.443538154794272, "grad_norm": 0.3986038267612457, "learning_rate": 1.8628876666258565e-06, "loss": 0.0308, "step": 49030 }, { "epoch": 4.443991299619358, "grad_norm": 0.4526941478252411, "learning_rate": 1.8598930595169373e-06, "loss": 0.0314, "step": 49035 }, { "epoch": 4.444444444444445, "grad_norm": 0.3448081314563751, "learning_rate": 1.8569007682777417e-06, "loss": 0.0287, "step": 49040 }, { "epoch": 4.44489758926953, "grad_norm": 0.34383031725883484, "learning_rate": 1.853910793207736e-06, "loss": 0.0316, "step": 49045 }, { "epoch": 4.445350734094617, "grad_norm": 0.32391291856765747, "learning_rate": 1.8509231346061723e-06, "loss": 0.0306, "step": 49050 }, { "epoch": 4.445803878919703, "grad_norm": 0.3241802752017975, "learning_rate": 1.8479377927720392e-06, "loss": 0.0312, "step": 49055 }, { "epoch": 4.4462570237447885, "grad_norm": 0.31757301092147827, "learning_rate": 1.8449547680041201e-06, "loss": 0.0318, "step": 49060 }, { "epoch": 4.446710168569875, "grad_norm": 0.28834325075149536, "learning_rate": 1.8419740606009568e-06, "loss": 0.0328, "step": 49065 }, { "epoch": 4.447163313394961, "grad_norm": 0.30001717805862427, "learning_rate": 1.8389956708608547e-06, "loss": 0.0307, "step": 49070 }, { "epoch": 4.447616458220047, "grad_norm": 0.4333705008029938, "learning_rate": 1.8360195990819036e-06, "loss": 0.0306, "step": 49075 }, { "epoch": 4.448069603045133, "grad_norm": 0.462475061416626, "learning_rate": 1.8330458455619314e-06, "loss": 0.0332, "step": 49080 }, { "epoch": 4.44852274787022, "grad_norm": 0.3123109042644501, "learning_rate": 1.83007441059857e-06, "loss": 0.0311, "step": 49085 }, { "epoch": 4.448975892695305, "grad_norm": 0.43421670794487, "learning_rate": 1.8271052944891926e-06, "loss": 0.0285, "step": 49090 }, { "epoch": 4.4494290375203915, "grad_norm": 0.241958349943161, "learning_rate": 1.8241384975309444e-06, "loss": 0.031, "step": 49095 }, { "epoch": 4.449882182345478, "grad_norm": 0.42878326773643494, "learning_rate": 1.8211740200207606e-06, "loss": 0.0327, "step": 49100 }, { "epoch": 4.4503353271705635, "grad_norm": 0.3941817283630371, "learning_rate": 1.8182118622553091e-06, "loss": 0.0311, "step": 49105 }, { "epoch": 4.45078847199565, "grad_norm": 0.3229816257953644, "learning_rate": 1.8152520245310529e-06, "loss": 0.0294, "step": 49110 }, { "epoch": 4.451241616820736, "grad_norm": 0.36110204458236694, "learning_rate": 1.8122945071442132e-06, "loss": 0.0306, "step": 49115 }, { "epoch": 4.451694761645822, "grad_norm": 0.4842739403247833, "learning_rate": 1.809339310390773e-06, "loss": 0.0378, "step": 49120 }, { "epoch": 4.452147906470908, "grad_norm": 0.3667764365673065, "learning_rate": 1.8063864345665038e-06, "loss": 0.0294, "step": 49125 }, { "epoch": 4.452601051295995, "grad_norm": 0.3377862870693207, "learning_rate": 1.803435879966911e-06, "loss": 0.0292, "step": 49130 }, { "epoch": 4.45305419612108, "grad_norm": 0.29749795794487, "learning_rate": 1.8004876468873e-06, "loss": 0.0293, "step": 49135 }, { "epoch": 4.4535073409461665, "grad_norm": 0.3751126527786255, "learning_rate": 1.7975417356227319e-06, "loss": 0.0323, "step": 49140 }, { "epoch": 4.453960485771253, "grad_norm": 0.3246120810508728, "learning_rate": 1.794598146468024e-06, "loss": 0.0293, "step": 49145 }, { "epoch": 4.454413630596338, "grad_norm": 0.38838380575180054, "learning_rate": 1.7916568797177847e-06, "loss": 0.0327, "step": 49150 }, { "epoch": 4.454866775421425, "grad_norm": 0.36417239904403687, "learning_rate": 1.7887179356663647e-06, "loss": 0.0301, "step": 49155 }, { "epoch": 4.455319920246511, "grad_norm": 0.3052588105201721, "learning_rate": 1.7857813146079038e-06, "loss": 0.0284, "step": 49160 }, { "epoch": 4.455773065071597, "grad_norm": 0.3564564883708954, "learning_rate": 1.7828470168362976e-06, "loss": 0.0352, "step": 49165 }, { "epoch": 4.456226209896683, "grad_norm": 0.3172992169857025, "learning_rate": 1.7799150426452026e-06, "loss": 0.0308, "step": 49170 }, { "epoch": 4.456679354721769, "grad_norm": 0.3812861144542694, "learning_rate": 1.7769853923280705e-06, "loss": 0.0328, "step": 49175 }, { "epoch": 4.457132499546855, "grad_norm": 0.4730360507965088, "learning_rate": 1.7740580661780832e-06, "loss": 0.0332, "step": 49180 }, { "epoch": 4.4575856443719415, "grad_norm": 0.3034832775592804, "learning_rate": 1.7711330644882202e-06, "loss": 0.0316, "step": 49185 }, { "epoch": 4.458038789197027, "grad_norm": 0.3468772768974304, "learning_rate": 1.768210387551214e-06, "loss": 0.0303, "step": 49190 }, { "epoch": 4.458491934022113, "grad_norm": 0.3139873743057251, "learning_rate": 1.7652900356595615e-06, "loss": 0.0299, "step": 49195 }, { "epoch": 4.4589450788472, "grad_norm": 0.37680354714393616, "learning_rate": 1.7623720091055395e-06, "loss": 0.0298, "step": 49200 }, { "epoch": 4.459398223672285, "grad_norm": 0.3901195228099823, "learning_rate": 1.7594563081811788e-06, "loss": 0.0343, "step": 49205 }, { "epoch": 4.459851368497372, "grad_norm": 0.28030943870544434, "learning_rate": 1.75654293317829e-06, "loss": 0.0305, "step": 49210 }, { "epoch": 4.460304513322458, "grad_norm": 0.3751486539840698, "learning_rate": 1.753631884388443e-06, "loss": 0.0295, "step": 49215 }, { "epoch": 4.460757658147544, "grad_norm": 0.3010643720626831, "learning_rate": 1.750723162102974e-06, "loss": 0.0307, "step": 49220 }, { "epoch": 4.46121080297263, "grad_norm": 0.3609429597854614, "learning_rate": 1.7478167666129919e-06, "loss": 0.0303, "step": 49225 }, { "epoch": 4.4616639477977165, "grad_norm": 0.2642002999782562, "learning_rate": 1.7449126982093638e-06, "loss": 0.0336, "step": 49230 }, { "epoch": 4.462117092622802, "grad_norm": 0.24875137209892273, "learning_rate": 1.742010957182738e-06, "loss": 0.0278, "step": 49235 }, { "epoch": 4.462570237447888, "grad_norm": 0.2587583661079407, "learning_rate": 1.7391115438235183e-06, "loss": 0.0308, "step": 49240 }, { "epoch": 4.463023382272975, "grad_norm": 0.25259605050086975, "learning_rate": 1.736214458421878e-06, "loss": 0.0294, "step": 49245 }, { "epoch": 4.46347652709806, "grad_norm": 0.3153931200504303, "learning_rate": 1.7333197012677577e-06, "loss": 0.0281, "step": 49250 }, { "epoch": 4.463929671923147, "grad_norm": 0.4085935056209564, "learning_rate": 1.730427272650867e-06, "loss": 0.0291, "step": 49255 }, { "epoch": 4.464382816748233, "grad_norm": 0.30536895990371704, "learning_rate": 1.7275371728606826e-06, "loss": 0.0295, "step": 49260 }, { "epoch": 4.464835961573319, "grad_norm": 0.31584131717681885, "learning_rate": 1.7246494021864457e-06, "loss": 0.0306, "step": 49265 }, { "epoch": 4.465289106398405, "grad_norm": 0.49065157771110535, "learning_rate": 1.7217639609171638e-06, "loss": 0.0366, "step": 49270 }, { "epoch": 4.465742251223491, "grad_norm": 0.8380322456359863, "learning_rate": 1.7188808493416114e-06, "loss": 0.0333, "step": 49275 }, { "epoch": 4.466195396048577, "grad_norm": 0.2771506905555725, "learning_rate": 1.7160000677483357e-06, "loss": 0.0286, "step": 49280 }, { "epoch": 4.466648540873663, "grad_norm": 0.2944703996181488, "learning_rate": 1.7131216164256447e-06, "loss": 0.0306, "step": 49285 }, { "epoch": 4.467101685698749, "grad_norm": 0.27312934398651123, "learning_rate": 1.7102454956616165e-06, "loss": 0.0288, "step": 49290 }, { "epoch": 4.467554830523835, "grad_norm": 0.32198935747146606, "learning_rate": 1.7073717057440902e-06, "loss": 0.0302, "step": 49295 }, { "epoch": 4.468007975348922, "grad_norm": 0.4008784294128418, "learning_rate": 1.7045002469606747e-06, "loss": 0.0341, "step": 49300 }, { "epoch": 4.468461120174007, "grad_norm": 0.3045106530189514, "learning_rate": 1.7016311195987539e-06, "loss": 0.0354, "step": 49305 }, { "epoch": 4.468914264999094, "grad_norm": 0.4122338891029358, "learning_rate": 1.6987643239454675e-06, "loss": 0.0405, "step": 49310 }, { "epoch": 4.46936740982418, "grad_norm": 0.35521388053894043, "learning_rate": 1.6958998602877223e-06, "loss": 0.0292, "step": 49315 }, { "epoch": 4.4698205546492655, "grad_norm": 0.3647519052028656, "learning_rate": 1.6930377289122e-06, "loss": 0.0297, "step": 49320 }, { "epoch": 4.470273699474352, "grad_norm": 0.47372785210609436, "learning_rate": 1.6901779301053327e-06, "loss": 0.0311, "step": 49325 }, { "epoch": 4.470726844299438, "grad_norm": 0.27797767519950867, "learning_rate": 1.6873204641533441e-06, "loss": 0.0313, "step": 49330 }, { "epoch": 4.471179989124524, "grad_norm": 0.33656126260757446, "learning_rate": 1.6844653313422054e-06, "loss": 0.0292, "step": 49335 }, { "epoch": 4.47163313394961, "grad_norm": 0.5169609785079956, "learning_rate": 1.6816125319576575e-06, "loss": 0.0326, "step": 49340 }, { "epoch": 4.472086278774697, "grad_norm": 0.3700984716415405, "learning_rate": 1.678762066285211e-06, "loss": 0.0372, "step": 49345 }, { "epoch": 4.472539423599782, "grad_norm": 0.2669242024421692, "learning_rate": 1.6759139346101348e-06, "loss": 0.0298, "step": 49350 }, { "epoch": 4.472992568424869, "grad_norm": 0.31025734543800354, "learning_rate": 1.6730681372174844e-06, "loss": 0.0291, "step": 49355 }, { "epoch": 4.473445713249955, "grad_norm": 0.231910839676857, "learning_rate": 1.6702246743920541e-06, "loss": 0.0324, "step": 49360 }, { "epoch": 4.4738988580750405, "grad_norm": 0.3535751700401306, "learning_rate": 1.6673835464184272e-06, "loss": 0.0308, "step": 49365 }, { "epoch": 4.474352002900127, "grad_norm": 0.31783097982406616, "learning_rate": 1.6645447535809462e-06, "loss": 0.032, "step": 49370 }, { "epoch": 4.474805147725213, "grad_norm": 0.38207197189331055, "learning_rate": 1.661708296163708e-06, "loss": 0.03, "step": 49375 }, { "epoch": 4.475258292550299, "grad_norm": 0.3570038080215454, "learning_rate": 1.6588741744506004e-06, "loss": 0.0286, "step": 49380 }, { "epoch": 4.475711437375385, "grad_norm": 0.5266314744949341, "learning_rate": 1.6560423887252486e-06, "loss": 0.0365, "step": 49385 }, { "epoch": 4.476164582200472, "grad_norm": 0.41740769147872925, "learning_rate": 1.6532129392710704e-06, "loss": 0.0296, "step": 49390 }, { "epoch": 4.476617727025557, "grad_norm": 0.31209808588027954, "learning_rate": 1.6503858263712342e-06, "loss": 0.0304, "step": 49395 }, { "epoch": 4.477070871850644, "grad_norm": 0.2558649778366089, "learning_rate": 1.6475610503086741e-06, "loss": 0.0299, "step": 49400 }, { "epoch": 4.47752401667573, "grad_norm": 0.28313279151916504, "learning_rate": 1.6447386113661062e-06, "loss": 0.0316, "step": 49405 }, { "epoch": 4.4779771615008155, "grad_norm": 0.34654977917671204, "learning_rate": 1.6419185098259848e-06, "loss": 0.03, "step": 49410 }, { "epoch": 4.478430306325902, "grad_norm": 0.3345916271209717, "learning_rate": 1.6391007459705593e-06, "loss": 0.0302, "step": 49415 }, { "epoch": 4.478883451150988, "grad_norm": 0.42611727118492126, "learning_rate": 1.6362853200818261e-06, "loss": 0.0297, "step": 49420 }, { "epoch": 4.479336595976074, "grad_norm": 0.369744211435318, "learning_rate": 1.6334722324415546e-06, "loss": 0.0282, "step": 49425 }, { "epoch": 4.47978974080116, "grad_norm": 0.2631770074367523, "learning_rate": 1.6306614833312888e-06, "loss": 0.031, "step": 49430 }, { "epoch": 4.480242885626247, "grad_norm": 0.2930855453014374, "learning_rate": 1.627853073032315e-06, "loss": 0.0291, "step": 49435 }, { "epoch": 4.480696030451332, "grad_norm": 0.38088616728782654, "learning_rate": 1.625047001825708e-06, "loss": 0.0354, "step": 49440 }, { "epoch": 4.4811491752764185, "grad_norm": 0.3636727035045624, "learning_rate": 1.622243269992302e-06, "loss": 0.0353, "step": 49445 }, { "epoch": 4.481602320101504, "grad_norm": 0.24896816909313202, "learning_rate": 1.6194418778126858e-06, "loss": 0.0291, "step": 49450 }, { "epoch": 4.4820554649265905, "grad_norm": 0.37925639748573303, "learning_rate": 1.6166428255672384e-06, "loss": 0.0304, "step": 49455 }, { "epoch": 4.482508609751677, "grad_norm": 0.3033607602119446, "learning_rate": 1.6138461135360744e-06, "loss": 0.0287, "step": 49460 }, { "epoch": 4.482961754576762, "grad_norm": 0.4013117849826813, "learning_rate": 1.6110517419991006e-06, "loss": 0.042, "step": 49465 }, { "epoch": 4.483414899401849, "grad_norm": 0.25533929467201233, "learning_rate": 1.6082597112359766e-06, "loss": 0.0401, "step": 49470 }, { "epoch": 4.483868044226935, "grad_norm": 0.3492473363876343, "learning_rate": 1.6054700215261232e-06, "loss": 0.0321, "step": 49475 }, { "epoch": 4.484321189052021, "grad_norm": 0.3403693735599518, "learning_rate": 1.6026826731487477e-06, "loss": 0.0317, "step": 49480 }, { "epoch": 4.484774333877107, "grad_norm": 0.28472569584846497, "learning_rate": 1.5998976663827908e-06, "loss": 0.038, "step": 49485 }, { "epoch": 4.4852274787021935, "grad_norm": 0.25756412744522095, "learning_rate": 1.5971150015069907e-06, "loss": 0.0354, "step": 49490 }, { "epoch": 4.485680623527279, "grad_norm": 0.3893069624900818, "learning_rate": 1.5943346787998303e-06, "loss": 0.0357, "step": 49495 }, { "epoch": 4.486133768352365, "grad_norm": 0.4002930819988251, "learning_rate": 1.5915566985395646e-06, "loss": 0.0314, "step": 49500 }, { "epoch": 4.486586913177452, "grad_norm": 0.42391714453697205, "learning_rate": 1.588781061004227e-06, "loss": 0.0419, "step": 49505 }, { "epoch": 4.487040058002537, "grad_norm": 0.4536181688308716, "learning_rate": 1.5860077664715866e-06, "loss": 0.0317, "step": 49510 }, { "epoch": 4.487493202827624, "grad_norm": 0.3084106147289276, "learning_rate": 1.5832368152192079e-06, "loss": 0.0324, "step": 49515 }, { "epoch": 4.48794634765271, "grad_norm": 0.31097111105918884, "learning_rate": 1.5804682075244048e-06, "loss": 0.0337, "step": 49520 }, { "epoch": 4.488399492477796, "grad_norm": 0.3034355044364929, "learning_rate": 1.5777019436642558e-06, "loss": 0.0287, "step": 49525 }, { "epoch": 4.488852637302882, "grad_norm": 0.3215891718864441, "learning_rate": 1.574938023915623e-06, "loss": 0.0288, "step": 49530 }, { "epoch": 4.4893057821279685, "grad_norm": 0.4281143844127655, "learning_rate": 1.5721764485551016e-06, "loss": 0.0304, "step": 49535 }, { "epoch": 4.489758926953054, "grad_norm": 0.2919618785381317, "learning_rate": 1.5694172178590871e-06, "loss": 0.0288, "step": 49540 }, { "epoch": 4.49021207177814, "grad_norm": 0.27878880500793457, "learning_rate": 1.5666603321037171e-06, "loss": 0.0313, "step": 49545 }, { "epoch": 4.490665216603226, "grad_norm": 0.313606321811676, "learning_rate": 1.5639057915649042e-06, "loss": 0.0306, "step": 49550 }, { "epoch": 4.491118361428312, "grad_norm": 0.23263484239578247, "learning_rate": 1.5611535965183166e-06, "loss": 0.0302, "step": 49555 }, { "epoch": 4.491571506253399, "grad_norm": 0.37604066729545593, "learning_rate": 1.5584037472394064e-06, "loss": 0.0286, "step": 49560 }, { "epoch": 4.492024651078484, "grad_norm": 0.30995213985443115, "learning_rate": 1.5556562440033728e-06, "loss": 0.0291, "step": 49565 }, { "epoch": 4.492477795903571, "grad_norm": 0.5019620060920715, "learning_rate": 1.5529110870851877e-06, "loss": 0.0339, "step": 49570 }, { "epoch": 4.492930940728657, "grad_norm": 0.3437897264957428, "learning_rate": 1.5501682767595864e-06, "loss": 0.0299, "step": 49575 }, { "epoch": 4.493384085553743, "grad_norm": 0.31654560565948486, "learning_rate": 1.5474278133010666e-06, "loss": 0.032, "step": 49580 }, { "epoch": 4.493837230378829, "grad_norm": 0.3916182518005371, "learning_rate": 1.544689696983906e-06, "loss": 0.028, "step": 49585 }, { "epoch": 4.494290375203915, "grad_norm": 0.27895379066467285, "learning_rate": 1.5419539280821266e-06, "loss": 0.0308, "step": 49590 }, { "epoch": 4.494743520029001, "grad_norm": 0.2795836925506592, "learning_rate": 1.539220506869532e-06, "loss": 0.0353, "step": 49595 }, { "epoch": 4.495196664854087, "grad_norm": 0.5760939121246338, "learning_rate": 1.5364894336196757e-06, "loss": 0.0414, "step": 49600 }, { "epoch": 4.495649809679174, "grad_norm": 0.321523517370224, "learning_rate": 1.5337607086058886e-06, "loss": 0.0318, "step": 49605 }, { "epoch": 4.496102954504259, "grad_norm": 0.2747553884983063, "learning_rate": 1.5310343321012637e-06, "loss": 0.0342, "step": 49610 }, { "epoch": 4.496556099329346, "grad_norm": 0.316380113363266, "learning_rate": 1.5283103043786578e-06, "loss": 0.0338, "step": 49615 }, { "epoch": 4.497009244154432, "grad_norm": 0.2936653792858124, "learning_rate": 1.5255886257106915e-06, "loss": 0.0309, "step": 49620 }, { "epoch": 4.4974623889795176, "grad_norm": 0.2765498757362366, "learning_rate": 1.5228692963697499e-06, "loss": 0.0315, "step": 49625 }, { "epoch": 4.497915533804604, "grad_norm": 0.37574130296707153, "learning_rate": 1.5201523166279819e-06, "loss": 0.0316, "step": 49630 }, { "epoch": 4.49836867862969, "grad_norm": 0.30747365951538086, "learning_rate": 1.5174376867573114e-06, "loss": 0.0276, "step": 49635 }, { "epoch": 4.498821823454776, "grad_norm": 0.4585666060447693, "learning_rate": 1.514725407029416e-06, "loss": 0.0307, "step": 49640 }, { "epoch": 4.499274968279862, "grad_norm": 0.2733621299266815, "learning_rate": 1.512015477715742e-06, "loss": 0.0299, "step": 49645 }, { "epoch": 4.499728113104949, "grad_norm": 0.4701991081237793, "learning_rate": 1.5093078990874975e-06, "loss": 0.0349, "step": 49650 }, { "epoch": 4.500181257930034, "grad_norm": 0.3422728180885315, "learning_rate": 1.5066026714156551e-06, "loss": 0.0283, "step": 49655 }, { "epoch": 4.500634402755121, "grad_norm": 0.30530479550361633, "learning_rate": 1.5038997949709643e-06, "loss": 0.0293, "step": 49660 }, { "epoch": 4.501087547580207, "grad_norm": 0.34454119205474854, "learning_rate": 1.5011992700239257e-06, "loss": 0.0335, "step": 49665 }, { "epoch": 4.5015406924052925, "grad_norm": 0.3469929099082947, "learning_rate": 1.4985010968448039e-06, "loss": 0.0367, "step": 49670 }, { "epoch": 4.501993837230379, "grad_norm": 0.2441805601119995, "learning_rate": 1.4958052757036433e-06, "loss": 0.0365, "step": 49675 }, { "epoch": 4.502446982055465, "grad_norm": 0.2801656126976013, "learning_rate": 1.4931118068702287e-06, "loss": 0.0287, "step": 49680 }, { "epoch": 4.502900126880551, "grad_norm": 0.38567671179771423, "learning_rate": 1.490420690614136e-06, "loss": 0.0306, "step": 49685 }, { "epoch": 4.503353271705637, "grad_norm": 0.2685883939266205, "learning_rate": 1.4877319272046885e-06, "loss": 0.0287, "step": 49690 }, { "epoch": 4.503806416530724, "grad_norm": 0.28790730237960815, "learning_rate": 1.4850455169109711e-06, "loss": 0.0376, "step": 49695 }, { "epoch": 4.504259561355809, "grad_norm": 0.4385415315628052, "learning_rate": 1.4823614600018577e-06, "loss": 0.0296, "step": 49700 }, { "epoch": 4.504712706180896, "grad_norm": 0.3438429534435272, "learning_rate": 1.4796797567459525e-06, "loss": 0.0355, "step": 49705 }, { "epoch": 4.505165851005982, "grad_norm": 0.494659423828125, "learning_rate": 1.477000407411655e-06, "loss": 0.0314, "step": 49710 }, { "epoch": 4.5056189958310675, "grad_norm": 0.35773730278015137, "learning_rate": 1.4743234122671006e-06, "loss": 0.05, "step": 49715 }, { "epoch": 4.506072140656154, "grad_norm": 0.2915877401828766, "learning_rate": 1.471648771580214e-06, "loss": 0.0294, "step": 49720 }, { "epoch": 4.506525285481239, "grad_norm": 0.38753753900527954, "learning_rate": 1.4689764856186783e-06, "loss": 0.0295, "step": 49725 }, { "epoch": 4.506978430306326, "grad_norm": 0.25714951753616333, "learning_rate": 1.466306554649921e-06, "loss": 0.0297, "step": 49730 }, { "epoch": 4.507431575131412, "grad_norm": 0.4208517074584961, "learning_rate": 1.4636389789411703e-06, "loss": 0.0305, "step": 49735 }, { "epoch": 4.507884719956498, "grad_norm": 0.33384668827056885, "learning_rate": 1.4609737587593764e-06, "loss": 0.03, "step": 49740 }, { "epoch": 4.508337864781584, "grad_norm": 0.2985946536064148, "learning_rate": 1.4583108943712924e-06, "loss": 0.0283, "step": 49745 }, { "epoch": 4.508791009606671, "grad_norm": 0.293856143951416, "learning_rate": 1.4556503860434112e-06, "loss": 0.0331, "step": 49750 }, { "epoch": 4.509244154431756, "grad_norm": 0.49908530712127686, "learning_rate": 1.4529922340419943e-06, "loss": 0.0364, "step": 49755 }, { "epoch": 4.5096972992568425, "grad_norm": 0.38159623742103577, "learning_rate": 1.450336438633082e-06, "loss": 0.0327, "step": 49760 }, { "epoch": 4.510150444081929, "grad_norm": 0.2587296664714813, "learning_rate": 1.4476830000824532e-06, "loss": 0.029, "step": 49765 }, { "epoch": 4.510603588907014, "grad_norm": 0.3571746349334717, "learning_rate": 1.445031918655676e-06, "loss": 0.0308, "step": 49770 }, { "epoch": 4.511056733732101, "grad_norm": 0.3756680488586426, "learning_rate": 1.442383194618066e-06, "loss": 0.0304, "step": 49775 }, { "epoch": 4.511509878557187, "grad_norm": 0.37227967381477356, "learning_rate": 1.4397368282347085e-06, "loss": 0.0313, "step": 49780 }, { "epoch": 4.511963023382273, "grad_norm": 0.28265678882598877, "learning_rate": 1.437092819770458e-06, "loss": 0.0332, "step": 49785 }, { "epoch": 4.512416168207359, "grad_norm": 0.32678157091140747, "learning_rate": 1.434451169489917e-06, "loss": 0.0341, "step": 49790 }, { "epoch": 4.5128693130324455, "grad_norm": 0.3515908718109131, "learning_rate": 1.431811877657474e-06, "loss": 0.0295, "step": 49795 }, { "epoch": 4.513322457857531, "grad_norm": 0.3667244613170624, "learning_rate": 1.429174944537265e-06, "loss": 0.0304, "step": 49800 }, { "epoch": 4.5137756026826175, "grad_norm": 0.27094459533691406, "learning_rate": 1.4265403703931924e-06, "loss": 0.0313, "step": 49805 }, { "epoch": 4.514228747507703, "grad_norm": 0.3916061222553253, "learning_rate": 1.4239081554889316e-06, "loss": 0.0368, "step": 49810 }, { "epoch": 4.514681892332789, "grad_norm": 0.22218473255634308, "learning_rate": 1.421278300087911e-06, "loss": 0.0266, "step": 49815 }, { "epoch": 4.515135037157876, "grad_norm": 0.3123994469642639, "learning_rate": 1.4186508044533309e-06, "loss": 0.0286, "step": 49820 }, { "epoch": 4.515588181982961, "grad_norm": 0.39590147137641907, "learning_rate": 1.4160256688481504e-06, "loss": 0.0305, "step": 49825 }, { "epoch": 4.516041326808048, "grad_norm": 0.3656669855117798, "learning_rate": 1.4134028935350873e-06, "loss": 0.0302, "step": 49830 }, { "epoch": 4.516494471633134, "grad_norm": 0.4156181216239929, "learning_rate": 1.4107824787766427e-06, "loss": 0.0314, "step": 49835 }, { "epoch": 4.51694761645822, "grad_norm": 0.3367244601249695, "learning_rate": 1.4081644248350595e-06, "loss": 0.0295, "step": 49840 }, { "epoch": 4.517400761283306, "grad_norm": 0.24323491752147675, "learning_rate": 1.4055487319723587e-06, "loss": 0.0296, "step": 49845 }, { "epoch": 4.517853906108392, "grad_norm": 0.42469659447669983, "learning_rate": 1.4029354004503143e-06, "loss": 0.0358, "step": 49850 }, { "epoch": 4.518307050933478, "grad_norm": 0.27566221356391907, "learning_rate": 1.4003244305304697e-06, "loss": 0.0305, "step": 49855 }, { "epoch": 4.518760195758564, "grad_norm": 0.34640538692474365, "learning_rate": 1.3977158224741383e-06, "loss": 0.0289, "step": 49860 }, { "epoch": 4.519213340583651, "grad_norm": 0.277633935213089, "learning_rate": 1.3951095765423834e-06, "loss": 0.0296, "step": 49865 }, { "epoch": 4.519666485408736, "grad_norm": 0.4661649167537689, "learning_rate": 1.3925056929960433e-06, "loss": 0.035, "step": 49870 }, { "epoch": 4.520119630233823, "grad_norm": 0.27925440669059753, "learning_rate": 1.3899041720957157e-06, "loss": 0.0286, "step": 49875 }, { "epoch": 4.520572775058909, "grad_norm": 0.2610112428665161, "learning_rate": 1.3873050141017558e-06, "loss": 0.0302, "step": 49880 }, { "epoch": 4.521025919883995, "grad_norm": 0.36653465032577515, "learning_rate": 1.3847082192742944e-06, "loss": 0.0289, "step": 49885 }, { "epoch": 4.521479064709081, "grad_norm": 0.39589235186576843, "learning_rate": 1.3821137878732181e-06, "loss": 0.032, "step": 49890 }, { "epoch": 4.521932209534167, "grad_norm": 0.37525802850723267, "learning_rate": 1.3795217201581778e-06, "loss": 0.0296, "step": 49895 }, { "epoch": 4.522385354359253, "grad_norm": 0.3915235996246338, "learning_rate": 1.376932016388588e-06, "loss": 0.0322, "step": 49900 }, { "epoch": 4.522838499184339, "grad_norm": 0.24902531504631042, "learning_rate": 1.37434467682363e-06, "loss": 0.0282, "step": 49905 }, { "epoch": 4.523291644009426, "grad_norm": 0.42417794466018677, "learning_rate": 1.371759701722239e-06, "loss": 0.0308, "step": 49910 }, { "epoch": 4.523744788834511, "grad_norm": 0.29862460494041443, "learning_rate": 1.3691770913431269e-06, "loss": 0.0318, "step": 49915 }, { "epoch": 4.524197933659598, "grad_norm": 0.3356969356536865, "learning_rate": 1.3665968459447592e-06, "loss": 0.0288, "step": 49920 }, { "epoch": 4.524651078484684, "grad_norm": 0.26789990067481995, "learning_rate": 1.3640189657853686e-06, "loss": 0.0314, "step": 49925 }, { "epoch": 4.52510422330977, "grad_norm": 0.2531663179397583, "learning_rate": 1.361443451122954e-06, "loss": 0.0286, "step": 49930 }, { "epoch": 4.525557368134856, "grad_norm": 0.31438252329826355, "learning_rate": 1.358870302215262e-06, "loss": 0.0319, "step": 49935 }, { "epoch": 4.526010512959942, "grad_norm": 0.28412583470344543, "learning_rate": 1.3562995193198285e-06, "loss": 0.0315, "step": 49940 }, { "epoch": 4.526463657785028, "grad_norm": 0.29831913113594055, "learning_rate": 1.353731102693931e-06, "loss": 0.0301, "step": 49945 }, { "epoch": 4.526916802610114, "grad_norm": 0.483241468667984, "learning_rate": 1.3511650525946135e-06, "loss": 0.0378, "step": 49950 }, { "epoch": 4.527369947435201, "grad_norm": 0.3527648448944092, "learning_rate": 1.3486013692787015e-06, "loss": 0.0289, "step": 49955 }, { "epoch": 4.527823092260286, "grad_norm": 0.26070740818977356, "learning_rate": 1.3460400530027506e-06, "loss": 0.0318, "step": 49960 }, { "epoch": 4.528276237085373, "grad_norm": 0.4277680218219757, "learning_rate": 1.3434811040231115e-06, "loss": 0.0304, "step": 49965 }, { "epoch": 4.528729381910459, "grad_norm": 0.32053306698799133, "learning_rate": 1.3409245225958821e-06, "loss": 0.0287, "step": 49970 }, { "epoch": 4.529182526735545, "grad_norm": 0.29370930790901184, "learning_rate": 1.338370308976919e-06, "loss": 0.0314, "step": 49975 }, { "epoch": 4.529635671560631, "grad_norm": 0.44743210077285767, "learning_rate": 1.3358184634218617e-06, "loss": 0.0353, "step": 49980 }, { "epoch": 4.530088816385717, "grad_norm": 0.4771285653114319, "learning_rate": 1.3332689861860842e-06, "loss": 0.0304, "step": 49985 }, { "epoch": 4.530541961210803, "grad_norm": 0.3019351661205292, "learning_rate": 1.3307218775247516e-06, "loss": 0.0309, "step": 49990 }, { "epoch": 4.530995106035889, "grad_norm": 0.3403928279876709, "learning_rate": 1.3281771376927766e-06, "loss": 0.0295, "step": 49995 }, { "epoch": 4.531448250860975, "grad_norm": 0.38989952206611633, "learning_rate": 1.3256347669448305e-06, "loss": 0.0394, "step": 50000 }, { "epoch": 4.531901395686061, "grad_norm": 0.3852294385433197, "learning_rate": 1.323094765535368e-06, "loss": 0.0303, "step": 50005 }, { "epoch": 4.532354540511148, "grad_norm": 0.37784677743911743, "learning_rate": 1.320557133718578e-06, "loss": 0.0316, "step": 50010 }, { "epoch": 4.532807685336233, "grad_norm": 0.5971678495407104, "learning_rate": 1.3180218717484372e-06, "loss": 0.0417, "step": 50015 }, { "epoch": 4.5332608301613195, "grad_norm": 0.45162826776504517, "learning_rate": 1.3154889798786763e-06, "loss": 0.0379, "step": 50020 }, { "epoch": 4.533713974986406, "grad_norm": 0.4124961197376251, "learning_rate": 1.3129584583627785e-06, "loss": 0.0291, "step": 50025 }, { "epoch": 4.5341671198114915, "grad_norm": 0.3235558271408081, "learning_rate": 1.3104303074540136e-06, "loss": 0.0286, "step": 50030 }, { "epoch": 4.534620264636578, "grad_norm": 0.3227752149105072, "learning_rate": 1.3079045274053847e-06, "loss": 0.0309, "step": 50035 }, { "epoch": 4.535073409461664, "grad_norm": 0.34688568115234375, "learning_rate": 1.305381118469684e-06, "loss": 0.0296, "step": 50040 }, { "epoch": 4.53552655428675, "grad_norm": 0.35264408588409424, "learning_rate": 1.3028600808994485e-06, "loss": 0.0302, "step": 50045 }, { "epoch": 4.535979699111836, "grad_norm": 0.2823616862297058, "learning_rate": 1.3003414149469845e-06, "loss": 0.0304, "step": 50050 }, { "epoch": 4.536432843936923, "grad_norm": 0.38837218284606934, "learning_rate": 1.297825120864371e-06, "loss": 0.0313, "step": 50055 }, { "epoch": 4.536885988762008, "grad_norm": 0.4847777187824249, "learning_rate": 1.2953111989034262e-06, "loss": 0.0319, "step": 50060 }, { "epoch": 4.5373391335870945, "grad_norm": 0.26313719153404236, "learning_rate": 1.2927996493157513e-06, "loss": 0.0317, "step": 50065 }, { "epoch": 4.537792278412181, "grad_norm": 0.28666871786117554, "learning_rate": 1.2902904723527038e-06, "loss": 0.0298, "step": 50070 }, { "epoch": 4.538245423237266, "grad_norm": 0.3470998704433441, "learning_rate": 1.2877836682653966e-06, "loss": 0.0302, "step": 50075 }, { "epoch": 4.538698568062353, "grad_norm": 0.43428748846054077, "learning_rate": 1.2852792373047207e-06, "loss": 0.03, "step": 50080 }, { "epoch": 4.539151712887438, "grad_norm": 0.5515410900115967, "learning_rate": 1.2827771797213118e-06, "loss": 0.0383, "step": 50085 }, { "epoch": 4.539604857712525, "grad_norm": 0.27497923374176025, "learning_rate": 1.2802774957655834e-06, "loss": 0.0351, "step": 50090 }, { "epoch": 4.540058002537611, "grad_norm": 0.32050254940986633, "learning_rate": 1.2777801856876991e-06, "loss": 0.0337, "step": 50095 }, { "epoch": 4.540511147362697, "grad_norm": 0.35708221793174744, "learning_rate": 1.2752852497375955e-06, "loss": 0.0335, "step": 50100 }, { "epoch": 4.540964292187783, "grad_norm": 0.3801882565021515, "learning_rate": 1.2727926881649644e-06, "loss": 0.0292, "step": 50105 }, { "epoch": 4.5414174370128695, "grad_norm": 0.2579948604106903, "learning_rate": 1.270302501219256e-06, "loss": 0.0402, "step": 50110 }, { "epoch": 4.541870581837955, "grad_norm": 0.6559590697288513, "learning_rate": 1.2678146891497016e-06, "loss": 0.0376, "step": 50115 }, { "epoch": 4.542323726663041, "grad_norm": 0.27669230103492737, "learning_rate": 1.2653292522052746e-06, "loss": 0.0301, "step": 50120 }, { "epoch": 4.542776871488128, "grad_norm": 0.2921607494354248, "learning_rate": 1.2628461906347172e-06, "loss": 0.0295, "step": 50125 }, { "epoch": 4.543230016313213, "grad_norm": 0.32720085978507996, "learning_rate": 1.2603655046865393e-06, "loss": 0.0339, "step": 50130 }, { "epoch": 4.5436831611383, "grad_norm": 0.31473711133003235, "learning_rate": 1.2578871946090059e-06, "loss": 0.0294, "step": 50135 }, { "epoch": 4.544136305963386, "grad_norm": 0.3829305171966553, "learning_rate": 1.2554112606501495e-06, "loss": 0.0312, "step": 50140 }, { "epoch": 4.544589450788472, "grad_norm": 0.30948713421821594, "learning_rate": 1.2529377030577604e-06, "loss": 0.0298, "step": 50145 }, { "epoch": 4.545042595613558, "grad_norm": 0.2789067327976227, "learning_rate": 1.2504665220793937e-06, "loss": 0.0299, "step": 50150 }, { "epoch": 4.5454957404386445, "grad_norm": 0.3312527537345886, "learning_rate": 1.2479977179623654e-06, "loss": 0.0279, "step": 50155 }, { "epoch": 4.54594888526373, "grad_norm": 0.3573143482208252, "learning_rate": 1.2455312909537525e-06, "loss": 0.0327, "step": 50160 }, { "epoch": 4.546402030088816, "grad_norm": 0.34863996505737305, "learning_rate": 1.2430672413004024e-06, "loss": 0.0315, "step": 50165 }, { "epoch": 4.546855174913903, "grad_norm": 0.33159226179122925, "learning_rate": 1.240605569248912e-06, "loss": 0.0309, "step": 50170 }, { "epoch": 4.547308319738988, "grad_norm": 0.2815791964530945, "learning_rate": 1.2381462750456514e-06, "loss": 0.0317, "step": 50175 }, { "epoch": 4.547761464564075, "grad_norm": 0.2687772810459137, "learning_rate": 1.2356893589367397e-06, "loss": 0.029, "step": 50180 }, { "epoch": 4.548214609389161, "grad_norm": 0.4863278567790985, "learning_rate": 1.2332348211680694e-06, "loss": 0.0358, "step": 50185 }, { "epoch": 4.548667754214247, "grad_norm": 0.3177988529205322, "learning_rate": 1.230782661985297e-06, "loss": 0.0316, "step": 50190 }, { "epoch": 4.549120899039333, "grad_norm": 0.3270079493522644, "learning_rate": 1.2283328816338313e-06, "loss": 0.0326, "step": 50195 }, { "epoch": 4.549574043864419, "grad_norm": 0.27789294719696045, "learning_rate": 1.2258854803588432e-06, "loss": 0.0311, "step": 50200 }, { "epoch": 4.550027188689505, "grad_norm": 0.27028143405914307, "learning_rate": 1.2234404584052727e-06, "loss": 0.0275, "step": 50205 }, { "epoch": 4.550480333514591, "grad_norm": 0.42181119322776794, "learning_rate": 1.2209978160178215e-06, "loss": 0.0335, "step": 50210 }, { "epoch": 4.550933478339678, "grad_norm": 0.2526433765888214, "learning_rate": 1.2185575534409467e-06, "loss": 0.0293, "step": 50215 }, { "epoch": 4.551386623164763, "grad_norm": 0.3749733865261078, "learning_rate": 1.2161196709188722e-06, "loss": 0.0344, "step": 50220 }, { "epoch": 4.55183976798985, "grad_norm": 0.31550151109695435, "learning_rate": 1.2136841686955808e-06, "loss": 0.0283, "step": 50225 }, { "epoch": 4.552292912814936, "grad_norm": 0.4187583029270172, "learning_rate": 1.2112510470148164e-06, "loss": 0.0328, "step": 50230 }, { "epoch": 4.552746057640022, "grad_norm": 0.31338006258010864, "learning_rate": 1.2088203061200925e-06, "loss": 0.0305, "step": 50235 }, { "epoch": 4.553199202465108, "grad_norm": 0.2434743493795395, "learning_rate": 1.206391946254673e-06, "loss": 0.0281, "step": 50240 }, { "epoch": 4.553652347290194, "grad_norm": 0.6905632019042969, "learning_rate": 1.2039659676615911e-06, "loss": 0.0344, "step": 50245 }, { "epoch": 4.55410549211528, "grad_norm": 0.26849088072776794, "learning_rate": 1.2015423705836415e-06, "loss": 0.0278, "step": 50250 }, { "epoch": 4.554558636940366, "grad_norm": 0.39026060700416565, "learning_rate": 1.1991211552633718e-06, "loss": 0.0297, "step": 50255 }, { "epoch": 4.555011781765452, "grad_norm": 0.2690524458885193, "learning_rate": 1.1967023219431106e-06, "loss": 0.03, "step": 50260 }, { "epoch": 4.555464926590538, "grad_norm": 0.3790350556373596, "learning_rate": 1.1942858708649197e-06, "loss": 0.0301, "step": 50265 }, { "epoch": 4.555918071415625, "grad_norm": 0.3594743609428406, "learning_rate": 1.1918718022706499e-06, "loss": 0.03, "step": 50270 }, { "epoch": 4.55637121624071, "grad_norm": 0.31065279245376587, "learning_rate": 1.1894601164018997e-06, "loss": 0.0281, "step": 50275 }, { "epoch": 4.556824361065797, "grad_norm": 0.2554545998573303, "learning_rate": 1.1870508135000235e-06, "loss": 0.0279, "step": 50280 }, { "epoch": 4.557277505890883, "grad_norm": 0.29737988114356995, "learning_rate": 1.1846438938061616e-06, "loss": 0.0318, "step": 50285 }, { "epoch": 4.5577306507159685, "grad_norm": 0.31013622879981995, "learning_rate": 1.1822393575611822e-06, "loss": 0.0283, "step": 50290 }, { "epoch": 4.558183795541055, "grad_norm": 0.34487399458885193, "learning_rate": 1.1798372050057432e-06, "loss": 0.0278, "step": 50295 }, { "epoch": 4.558636940366141, "grad_norm": 0.38372474908828735, "learning_rate": 1.1774374363802492e-06, "loss": 0.034, "step": 50300 }, { "epoch": 4.559090085191227, "grad_norm": 0.3630434274673462, "learning_rate": 1.1750400519248667e-06, "loss": 0.0334, "step": 50305 }, { "epoch": 4.559543230016313, "grad_norm": 0.46430113911628723, "learning_rate": 1.172645051879534e-06, "loss": 0.0287, "step": 50310 }, { "epoch": 4.5599963748414, "grad_norm": 0.30820634961128235, "learning_rate": 1.1702524364839373e-06, "loss": 0.0294, "step": 50315 }, { "epoch": 4.560449519666485, "grad_norm": 0.3242165744304657, "learning_rate": 1.1678622059775324e-06, "loss": 0.0305, "step": 50320 }, { "epoch": 4.560902664491572, "grad_norm": 0.27523311972618103, "learning_rate": 1.1654743605995334e-06, "loss": 0.0308, "step": 50325 }, { "epoch": 4.561355809316658, "grad_norm": 0.44197866320610046, "learning_rate": 1.1630889005889156e-06, "loss": 0.0315, "step": 50330 }, { "epoch": 4.5618089541417435, "grad_norm": 0.3424735963344574, "learning_rate": 1.1607058261844272e-06, "loss": 0.031, "step": 50335 }, { "epoch": 4.56226209896683, "grad_norm": 0.29123833775520325, "learning_rate": 1.1583251376245496e-06, "loss": 0.0316, "step": 50340 }, { "epoch": 4.562715243791915, "grad_norm": 0.31109118461608887, "learning_rate": 1.155946835147556e-06, "loss": 0.034, "step": 50345 }, { "epoch": 4.563168388617002, "grad_norm": 0.46925097703933716, "learning_rate": 1.1535709189914646e-06, "loss": 0.0314, "step": 50350 }, { "epoch": 4.563621533442088, "grad_norm": 0.34041184186935425, "learning_rate": 1.1511973893940542e-06, "loss": 0.0316, "step": 50355 }, { "epoch": 4.564074678267174, "grad_norm": 0.5615184903144836, "learning_rate": 1.148826246592874e-06, "loss": 0.0372, "step": 50360 }, { "epoch": 4.56452782309226, "grad_norm": 0.36264893412590027, "learning_rate": 1.1464574908252228e-06, "loss": 0.0304, "step": 50365 }, { "epoch": 4.5649809679173465, "grad_norm": 0.3083003759384155, "learning_rate": 1.1440911223281725e-06, "loss": 0.0331, "step": 50370 }, { "epoch": 4.565434112742432, "grad_norm": 0.32305455207824707, "learning_rate": 1.1417271413385444e-06, "loss": 0.0296, "step": 50375 }, { "epoch": 4.5658872575675185, "grad_norm": 0.30438145995140076, "learning_rate": 1.13936554809293e-06, "loss": 0.0554, "step": 50380 }, { "epoch": 4.566340402392605, "grad_norm": 0.3425120413303375, "learning_rate": 1.1370063428276816e-06, "loss": 0.0291, "step": 50385 }, { "epoch": 4.56679354721769, "grad_norm": 0.32892248034477234, "learning_rate": 1.1346495257788996e-06, "loss": 0.0293, "step": 50390 }, { "epoch": 4.567246692042777, "grad_norm": 0.3881015181541443, "learning_rate": 1.1322950971824647e-06, "loss": 0.0303, "step": 50395 }, { "epoch": 4.567699836867863, "grad_norm": 0.33543336391448975, "learning_rate": 1.1299430572740022e-06, "loss": 0.032, "step": 50400 }, { "epoch": 4.568152981692949, "grad_norm": 0.33897197246551514, "learning_rate": 1.127593406288907e-06, "loss": 0.0282, "step": 50405 }, { "epoch": 4.568606126518035, "grad_norm": 0.2925536036491394, "learning_rate": 1.1252461444623414e-06, "loss": 0.0323, "step": 50410 }, { "epoch": 4.5690592713431215, "grad_norm": 0.3617934584617615, "learning_rate": 1.1229012720292032e-06, "loss": 0.0304, "step": 50415 }, { "epoch": 4.569512416168207, "grad_norm": 0.3462293744087219, "learning_rate": 1.1205587892241825e-06, "loss": 0.0302, "step": 50420 }, { "epoch": 4.569965560993293, "grad_norm": 0.27467307448387146, "learning_rate": 1.1182186962817088e-06, "loss": 0.0268, "step": 50425 }, { "epoch": 4.57041870581838, "grad_norm": 0.28629833459854126, "learning_rate": 1.1158809934359803e-06, "loss": 0.0291, "step": 50430 }, { "epoch": 4.570871850643465, "grad_norm": 0.4855368733406067, "learning_rate": 1.1135456809209604e-06, "loss": 0.0343, "step": 50435 }, { "epoch": 4.571324995468552, "grad_norm": 0.3582630753517151, "learning_rate": 1.1112127589703592e-06, "loss": 0.0287, "step": 50440 }, { "epoch": 4.571778140293638, "grad_norm": 0.25957944989204407, "learning_rate": 1.108882227817662e-06, "loss": 0.0317, "step": 50445 }, { "epoch": 4.572231285118724, "grad_norm": 0.3156619369983673, "learning_rate": 1.10655408769611e-06, "loss": 0.0292, "step": 50450 }, { "epoch": 4.57268442994381, "grad_norm": 0.3616334795951843, "learning_rate": 1.1042283388387003e-06, "loss": 0.0309, "step": 50455 }, { "epoch": 4.5731375747688965, "grad_norm": 0.2973182201385498, "learning_rate": 1.1019049814781939e-06, "loss": 0.0295, "step": 50460 }, { "epoch": 4.573590719593982, "grad_norm": 0.39234763383865356, "learning_rate": 1.0995840158471189e-06, "loss": 0.034, "step": 50465 }, { "epoch": 4.574043864419068, "grad_norm": 0.324690580368042, "learning_rate": 1.097265442177753e-06, "loss": 0.0306, "step": 50470 }, { "epoch": 4.574497009244155, "grad_norm": 0.3049437999725342, "learning_rate": 1.0949492607021438e-06, "loss": 0.0265, "step": 50475 }, { "epoch": 4.57495015406924, "grad_norm": 0.9981957674026489, "learning_rate": 1.0926354716520953e-06, "loss": 0.0412, "step": 50480 }, { "epoch": 4.575403298894327, "grad_norm": 0.3559283912181854, "learning_rate": 1.0903240752591664e-06, "loss": 0.0328, "step": 50485 }, { "epoch": 4.575856443719413, "grad_norm": 0.261267751455307, "learning_rate": 1.088015071754689e-06, "loss": 0.0272, "step": 50490 }, { "epoch": 4.576309588544499, "grad_norm": 0.30435270071029663, "learning_rate": 1.0857084613697472e-06, "loss": 0.0374, "step": 50495 }, { "epoch": 4.576762733369585, "grad_norm": 0.27167174220085144, "learning_rate": 1.0834042443351873e-06, "loss": 0.0283, "step": 50500 }, { "epoch": 4.5772158781946715, "grad_norm": 0.24596884846687317, "learning_rate": 1.0811024208816135e-06, "loss": 0.0283, "step": 50505 }, { "epoch": 4.577669023019757, "grad_norm": 0.3437539041042328, "learning_rate": 1.0788029912393943e-06, "loss": 0.0306, "step": 50510 }, { "epoch": 4.578122167844843, "grad_norm": 0.41870346665382385, "learning_rate": 1.0765059556386625e-06, "loss": 0.0321, "step": 50515 }, { "epoch": 4.57857531266993, "grad_norm": 0.3369331359863281, "learning_rate": 1.074211314309298e-06, "loss": 0.028, "step": 50520 }, { "epoch": 4.579028457495015, "grad_norm": 0.3469701111316681, "learning_rate": 1.071919067480956e-06, "loss": 0.0294, "step": 50525 }, { "epoch": 4.579481602320102, "grad_norm": 0.25319766998291016, "learning_rate": 1.0696292153830445e-06, "loss": 0.0331, "step": 50530 }, { "epoch": 4.579934747145187, "grad_norm": 0.3349122703075409, "learning_rate": 1.0673417582447248e-06, "loss": 0.0296, "step": 50535 }, { "epoch": 4.580387891970274, "grad_norm": 0.3173857033252716, "learning_rate": 1.065056696294936e-06, "loss": 0.0284, "step": 50540 }, { "epoch": 4.58084103679536, "grad_norm": 0.29199400544166565, "learning_rate": 1.0627740297623645e-06, "loss": 0.0277, "step": 50545 }, { "epoch": 4.5812941816204455, "grad_norm": 0.33704492449760437, "learning_rate": 1.060493758875461e-06, "loss": 0.0308, "step": 50550 }, { "epoch": 4.581747326445532, "grad_norm": 0.3297417163848877, "learning_rate": 1.0582158838624318e-06, "loss": 0.0295, "step": 50555 }, { "epoch": 4.582200471270618, "grad_norm": 0.2878890633583069, "learning_rate": 1.05594040495125e-06, "loss": 0.0312, "step": 50560 }, { "epoch": 4.582653616095704, "grad_norm": 0.2889998257160187, "learning_rate": 1.0536673223696503e-06, "loss": 0.0312, "step": 50565 }, { "epoch": 4.58310676092079, "grad_norm": 0.3509746193885803, "learning_rate": 1.0513966363451173e-06, "loss": 0.0348, "step": 50570 }, { "epoch": 4.583559905745877, "grad_norm": 0.40978506207466125, "learning_rate": 1.049128347104905e-06, "loss": 0.0299, "step": 50575 }, { "epoch": 4.584013050570962, "grad_norm": 0.322951078414917, "learning_rate": 1.0468624548760297e-06, "loss": 0.0311, "step": 50580 }, { "epoch": 4.584466195396049, "grad_norm": 0.3053247630596161, "learning_rate": 1.0445989598852484e-06, "loss": 0.028, "step": 50585 }, { "epoch": 4.584919340221135, "grad_norm": 0.5093318819999695, "learning_rate": 1.042337862359108e-06, "loss": 0.032, "step": 50590 }, { "epoch": 4.5853724850462205, "grad_norm": 0.38993382453918457, "learning_rate": 1.040079162523891e-06, "loss": 0.0319, "step": 50595 }, { "epoch": 4.585825629871307, "grad_norm": 0.29781368374824524, "learning_rate": 1.0378228606056501e-06, "loss": 0.0284, "step": 50600 }, { "epoch": 4.586278774696393, "grad_norm": 0.43527883291244507, "learning_rate": 1.035568956830202e-06, "loss": 0.0338, "step": 50605 }, { "epoch": 4.586731919521479, "grad_norm": 0.4158231019973755, "learning_rate": 1.0333174514231076e-06, "loss": 0.0311, "step": 50610 }, { "epoch": 4.587185064346565, "grad_norm": 0.39859527349472046, "learning_rate": 1.031068344609712e-06, "loss": 0.0314, "step": 50615 }, { "epoch": 4.587638209171651, "grad_norm": 0.3915175795555115, "learning_rate": 1.0288216366150905e-06, "loss": 0.0299, "step": 50620 }, { "epoch": 4.588091353996737, "grad_norm": 0.4553341269493103, "learning_rate": 1.026577327664105e-06, "loss": 0.0342, "step": 50625 }, { "epoch": 4.588544498821824, "grad_norm": 0.5293399691581726, "learning_rate": 1.024335417981373e-06, "loss": 0.032, "step": 50630 }, { "epoch": 4.588997643646909, "grad_norm": 0.5094519257545471, "learning_rate": 1.0220959077912484e-06, "loss": 0.0322, "step": 50635 }, { "epoch": 4.5894507884719955, "grad_norm": 0.3110584616661072, "learning_rate": 1.019858797317877e-06, "loss": 0.0281, "step": 50640 }, { "epoch": 4.589903933297082, "grad_norm": 0.27596884965896606, "learning_rate": 1.0176240867851378e-06, "loss": 0.0456, "step": 50645 }, { "epoch": 4.590357078122167, "grad_norm": 0.34781795740127563, "learning_rate": 1.015391776416691e-06, "loss": 0.031, "step": 50650 }, { "epoch": 4.590810222947254, "grad_norm": 0.3710280656814575, "learning_rate": 1.0131618664359438e-06, "loss": 0.0357, "step": 50655 }, { "epoch": 4.59126336777234, "grad_norm": 0.33380937576293945, "learning_rate": 1.0109343570660624e-06, "loss": 0.0295, "step": 50660 }, { "epoch": 4.591716512597426, "grad_norm": 0.3559473752975464, "learning_rate": 1.0087092485299848e-06, "loss": 0.0297, "step": 50665 }, { "epoch": 4.592169657422512, "grad_norm": 0.3399896025657654, "learning_rate": 1.0064865410503915e-06, "loss": 0.0301, "step": 50670 }, { "epoch": 4.592622802247599, "grad_norm": 0.6143879890441895, "learning_rate": 1.0042662348497407e-06, "loss": 0.0347, "step": 50675 }, { "epoch": 4.593075947072684, "grad_norm": 0.2820965647697449, "learning_rate": 1.002048330150232e-06, "loss": 0.0283, "step": 50680 }, { "epoch": 4.5935290918977705, "grad_norm": 0.3921872675418854, "learning_rate": 9.998328271738411e-07, "loss": 0.029, "step": 50685 }, { "epoch": 4.593982236722857, "grad_norm": 0.28736987709999084, "learning_rate": 9.97619726142296e-07, "loss": 0.0299, "step": 50690 }, { "epoch": 4.594435381547942, "grad_norm": 0.26975730061531067, "learning_rate": 9.954090272770804e-07, "loss": 0.0279, "step": 50695 }, { "epoch": 4.594888526373029, "grad_norm": 0.32471874356269836, "learning_rate": 9.932007307994428e-07, "loss": 0.0309, "step": 50700 }, { "epoch": 4.595341671198115, "grad_norm": 0.331610769033432, "learning_rate": 9.90994836930395e-07, "loss": 0.0307, "step": 50705 }, { "epoch": 4.595794816023201, "grad_norm": 0.29462775588035583, "learning_rate": 9.887913458906967e-07, "loss": 0.0303, "step": 50710 }, { "epoch": 4.596247960848287, "grad_norm": 0.3629380464553833, "learning_rate": 9.865902579008796e-07, "loss": 0.0296, "step": 50715 }, { "epoch": 4.5967011056733735, "grad_norm": 0.33579403162002563, "learning_rate": 9.843915731812287e-07, "loss": 0.0312, "step": 50720 }, { "epoch": 4.597154250498459, "grad_norm": 0.36851856112480164, "learning_rate": 9.821952919517845e-07, "loss": 0.0281, "step": 50725 }, { "epoch": 4.5976073953235455, "grad_norm": 0.3787820041179657, "learning_rate": 9.800014144323578e-07, "loss": 0.0365, "step": 50730 }, { "epoch": 4.598060540148632, "grad_norm": 0.29061487317085266, "learning_rate": 9.778099408425056e-07, "loss": 0.0367, "step": 50735 }, { "epoch": 4.598513684973717, "grad_norm": 0.3008498251438141, "learning_rate": 9.756208714015587e-07, "loss": 0.0313, "step": 50740 }, { "epoch": 4.598966829798804, "grad_norm": 0.7535842061042786, "learning_rate": 9.73434206328594e-07, "loss": 0.0389, "step": 50745 }, { "epoch": 4.59941997462389, "grad_norm": 0.33768123388290405, "learning_rate": 9.712499458424589e-07, "loss": 0.0308, "step": 50750 }, { "epoch": 4.599873119448976, "grad_norm": 0.26361608505249023, "learning_rate": 9.690680901617511e-07, "loss": 0.0284, "step": 50755 }, { "epoch": 4.600326264274062, "grad_norm": 0.3063221871852875, "learning_rate": 9.668886395048289e-07, "loss": 0.0285, "step": 50760 }, { "epoch": 4.6007794090991485, "grad_norm": 0.3010009527206421, "learning_rate": 9.647115940898204e-07, "loss": 0.0319, "step": 50765 }, { "epoch": 4.601232553924234, "grad_norm": 0.2837238013744354, "learning_rate": 9.625369541345992e-07, "loss": 0.0298, "step": 50770 }, { "epoch": 4.60168569874932, "grad_norm": 0.2732260227203369, "learning_rate": 9.603647198568073e-07, "loss": 0.0306, "step": 50775 }, { "epoch": 4.602138843574407, "grad_norm": 0.47298917174339294, "learning_rate": 9.581948914738404e-07, "loss": 0.0293, "step": 50780 }, { "epoch": 4.602591988399492, "grad_norm": 0.29609671235084534, "learning_rate": 9.560274692028526e-07, "loss": 0.0292, "step": 50785 }, { "epoch": 4.603045133224579, "grad_norm": 0.36546406149864197, "learning_rate": 9.538624532607677e-07, "loss": 0.0383, "step": 50790 }, { "epoch": 4.603498278049665, "grad_norm": 0.27609777450561523, "learning_rate": 9.516998438642539e-07, "loss": 0.0291, "step": 50795 }, { "epoch": 4.603951422874751, "grad_norm": 0.3032359480857849, "learning_rate": 9.49539641229752e-07, "loss": 0.0399, "step": 50800 }, { "epoch": 4.604404567699837, "grad_norm": 0.2540190815925598, "learning_rate": 9.473818455734529e-07, "loss": 0.0359, "step": 50805 }, { "epoch": 4.604857712524923, "grad_norm": 0.39596739411354065, "learning_rate": 9.452264571113118e-07, "loss": 0.032, "step": 50810 }, { "epoch": 4.605310857350009, "grad_norm": 0.24076025187969208, "learning_rate": 9.430734760590309e-07, "loss": 0.0328, "step": 50815 }, { "epoch": 4.605764002175095, "grad_norm": 0.35377126932144165, "learning_rate": 9.409229026320965e-07, "loss": 0.0285, "step": 50820 }, { "epoch": 4.606217147000181, "grad_norm": 0.3969152867794037, "learning_rate": 9.387747370457278e-07, "loss": 0.0328, "step": 50825 }, { "epoch": 4.606670291825267, "grad_norm": 0.4011712968349457, "learning_rate": 9.366289795149169e-07, "loss": 0.0338, "step": 50830 }, { "epoch": 4.607123436650354, "grad_norm": 0.48132553696632385, "learning_rate": 9.344856302544169e-07, "loss": 0.0333, "step": 50835 }, { "epoch": 4.607576581475439, "grad_norm": 0.29132020473480225, "learning_rate": 9.32344689478723e-07, "loss": 0.0306, "step": 50840 }, { "epoch": 4.608029726300526, "grad_norm": 0.3043225109577179, "learning_rate": 9.302061574021137e-07, "loss": 0.0344, "step": 50845 }, { "epoch": 4.608482871125612, "grad_norm": 0.4659122824668884, "learning_rate": 9.280700342386095e-07, "loss": 0.0331, "step": 50850 }, { "epoch": 4.608936015950698, "grad_norm": 0.30112630128860474, "learning_rate": 9.259363202019894e-07, "loss": 0.0328, "step": 50855 }, { "epoch": 4.609389160775784, "grad_norm": 0.3418450951576233, "learning_rate": 9.238050155058076e-07, "loss": 0.0284, "step": 50860 }, { "epoch": 4.60984230560087, "grad_norm": 0.35208776593208313, "learning_rate": 9.216761203633517e-07, "loss": 0.0303, "step": 50865 }, { "epoch": 4.610295450425956, "grad_norm": 0.34328025579452515, "learning_rate": 9.195496349876931e-07, "loss": 0.0365, "step": 50870 }, { "epoch": 4.610748595251042, "grad_norm": 0.34049299359321594, "learning_rate": 9.174255595916476e-07, "loss": 0.0312, "step": 50875 }, { "epoch": 4.611201740076129, "grad_norm": 0.35633957386016846, "learning_rate": 9.153038943877896e-07, "loss": 0.0309, "step": 50880 }, { "epoch": 4.611654884901214, "grad_norm": 0.42301326990127563, "learning_rate": 9.131846395884657e-07, "loss": 0.0393, "step": 50885 }, { "epoch": 4.612108029726301, "grad_norm": 0.34739017486572266, "learning_rate": 9.110677954057623e-07, "loss": 0.0308, "step": 50890 }, { "epoch": 4.612561174551386, "grad_norm": 0.38080018758773804, "learning_rate": 9.089533620515372e-07, "loss": 0.0287, "step": 50895 }, { "epoch": 4.6130143193764725, "grad_norm": 0.24570274353027344, "learning_rate": 9.068413397374048e-07, "loss": 0.0299, "step": 50900 }, { "epoch": 4.613467464201559, "grad_norm": 0.4001655578613281, "learning_rate": 9.04731728674732e-07, "loss": 0.0317, "step": 50905 }, { "epoch": 4.6139206090266445, "grad_norm": 0.2798764407634735, "learning_rate": 9.026245290746582e-07, "loss": 0.0304, "step": 50910 }, { "epoch": 4.614373753851731, "grad_norm": 0.33381709456443787, "learning_rate": 9.005197411480648e-07, "loss": 0.0288, "step": 50915 }, { "epoch": 4.614826898676817, "grad_norm": 0.3057801127433777, "learning_rate": 8.984173651056027e-07, "loss": 0.0279, "step": 50920 }, { "epoch": 4.615280043501903, "grad_norm": 0.3657742440700531, "learning_rate": 8.963174011576786e-07, "loss": 0.0301, "step": 50925 }, { "epoch": 4.615733188326989, "grad_norm": 0.32668137550354004, "learning_rate": 8.942198495144549e-07, "loss": 0.0303, "step": 50930 }, { "epoch": 4.616186333152076, "grad_norm": 0.41309288144111633, "learning_rate": 8.921247103858638e-07, "loss": 0.0328, "step": 50935 }, { "epoch": 4.616639477977161, "grad_norm": 0.36928626894950867, "learning_rate": 8.900319839815763e-07, "loss": 0.0355, "step": 50940 }, { "epoch": 4.6170926228022475, "grad_norm": 0.3136793375015259, "learning_rate": 8.879416705110389e-07, "loss": 0.0288, "step": 50945 }, { "epoch": 4.617545767627334, "grad_norm": 0.31083226203918457, "learning_rate": 8.858537701834535e-07, "loss": 0.0436, "step": 50950 }, { "epoch": 4.6179989124524194, "grad_norm": 0.29906222224235535, "learning_rate": 8.837682832077698e-07, "loss": 0.0312, "step": 50955 }, { "epoch": 4.618452057277506, "grad_norm": 0.27596229314804077, "learning_rate": 8.816852097927125e-07, "loss": 0.0318, "step": 50960 }, { "epoch": 4.618905202102592, "grad_norm": 0.2913641333580017, "learning_rate": 8.796045501467509e-07, "loss": 0.0313, "step": 50965 }, { "epoch": 4.619358346927678, "grad_norm": 0.5639922022819519, "learning_rate": 8.775263044781184e-07, "loss": 0.0563, "step": 50970 }, { "epoch": 4.619811491752764, "grad_norm": 0.5136366486549377, "learning_rate": 8.754504729948126e-07, "loss": 0.0348, "step": 50975 }, { "epoch": 4.620264636577851, "grad_norm": 0.27979499101638794, "learning_rate": 8.733770559045728e-07, "loss": 0.0287, "step": 50980 }, { "epoch": 4.620717781402936, "grad_norm": 0.3102927803993225, "learning_rate": 8.713060534149193e-07, "loss": 0.0289, "step": 50985 }, { "epoch": 4.6211709262280225, "grad_norm": 0.3439948558807373, "learning_rate": 8.692374657331081e-07, "loss": 0.0304, "step": 50990 }, { "epoch": 4.621624071053109, "grad_norm": 0.3287510573863983, "learning_rate": 8.671712930661713e-07, "loss": 0.0338, "step": 50995 }, { "epoch": 4.622077215878194, "grad_norm": 0.34321749210357666, "learning_rate": 8.651075356208876e-07, "loss": 0.0283, "step": 51000 }, { "epoch": 4.622530360703281, "grad_norm": 0.3386465311050415, "learning_rate": 8.630461936038031e-07, "loss": 0.0287, "step": 51005 }, { "epoch": 4.622983505528367, "grad_norm": 0.3395971953868866, "learning_rate": 8.609872672212138e-07, "loss": 0.0301, "step": 51010 }, { "epoch": 4.623436650353453, "grad_norm": 0.28764012455940247, "learning_rate": 8.589307566791771e-07, "loss": 0.0485, "step": 51015 }, { "epoch": 4.623889795178539, "grad_norm": 0.2767321765422821, "learning_rate": 8.568766621835117e-07, "loss": 0.0284, "step": 51020 }, { "epoch": 4.624342940003626, "grad_norm": 0.40962573885917664, "learning_rate": 8.548249839397949e-07, "loss": 0.0304, "step": 51025 }, { "epoch": 4.624796084828711, "grad_norm": 0.4999435245990753, "learning_rate": 8.527757221533539e-07, "loss": 0.0347, "step": 51030 }, { "epoch": 4.6252492296537975, "grad_norm": 0.3736153841018677, "learning_rate": 8.507288770292803e-07, "loss": 0.0298, "step": 51035 }, { "epoch": 4.625702374478884, "grad_norm": 0.4176051914691925, "learning_rate": 8.486844487724243e-07, "loss": 0.032, "step": 51040 }, { "epoch": 4.626155519303969, "grad_norm": 0.27295583486557007, "learning_rate": 8.466424375873943e-07, "loss": 0.0337, "step": 51045 }, { "epoch": 4.626608664129056, "grad_norm": 0.28084978461265564, "learning_rate": 8.446028436785547e-07, "loss": 0.0328, "step": 51050 }, { "epoch": 4.627061808954142, "grad_norm": 0.33346131443977356, "learning_rate": 8.425656672500282e-07, "loss": 0.0318, "step": 51055 }, { "epoch": 4.627514953779228, "grad_norm": 0.30322521924972534, "learning_rate": 8.405309085056962e-07, "loss": 0.0298, "step": 51060 }, { "epoch": 4.627968098604314, "grad_norm": 0.2856255769729614, "learning_rate": 8.384985676491958e-07, "loss": 0.0308, "step": 51065 }, { "epoch": 4.6284212434294005, "grad_norm": 0.25537899136543274, "learning_rate": 8.364686448839281e-07, "loss": 0.0293, "step": 51070 }, { "epoch": 4.628874388254486, "grad_norm": 0.2599482238292694, "learning_rate": 8.344411404130475e-07, "loss": 0.0303, "step": 51075 }, { "epoch": 4.6293275330795725, "grad_norm": 0.26087746024131775, "learning_rate": 8.324160544394666e-07, "loss": 0.0293, "step": 51080 }, { "epoch": 4.629780677904658, "grad_norm": 0.35372745990753174, "learning_rate": 8.303933871658593e-07, "loss": 0.0343, "step": 51085 }, { "epoch": 4.630233822729744, "grad_norm": 0.40366074442863464, "learning_rate": 8.28373138794647e-07, "loss": 0.0332, "step": 51090 }, { "epoch": 4.630686967554831, "grad_norm": 0.22080641984939575, "learning_rate": 8.263553095280235e-07, "loss": 0.0312, "step": 51095 }, { "epoch": 4.631140112379916, "grad_norm": 0.2974209785461426, "learning_rate": 8.243398995679358e-07, "loss": 0.0289, "step": 51100 }, { "epoch": 4.631593257205003, "grad_norm": 0.3568723201751709, "learning_rate": 8.223269091160807e-07, "loss": 0.0289, "step": 51105 }, { "epoch": 4.632046402030089, "grad_norm": 0.2783403992652893, "learning_rate": 8.20316338373922e-07, "loss": 0.0272, "step": 51110 }, { "epoch": 4.632499546855175, "grad_norm": 0.43445855379104614, "learning_rate": 8.183081875426796e-07, "loss": 0.0308, "step": 51115 }, { "epoch": 4.632952691680261, "grad_norm": 0.2924969494342804, "learning_rate": 8.163024568233286e-07, "loss": 0.0337, "step": 51120 }, { "epoch": 4.633405836505347, "grad_norm": 0.44014036655426025, "learning_rate": 8.142991464166e-07, "loss": 0.0518, "step": 51125 }, { "epoch": 4.633858981330433, "grad_norm": 0.2659880816936493, "learning_rate": 8.12298256522992e-07, "loss": 0.0282, "step": 51130 }, { "epoch": 4.634312126155519, "grad_norm": 0.23722894489765167, "learning_rate": 8.102997873427471e-07, "loss": 0.0288, "step": 51135 }, { "epoch": 4.634765270980606, "grad_norm": 0.31401124596595764, "learning_rate": 8.083037390758802e-07, "loss": 0.0322, "step": 51140 }, { "epoch": 4.635218415805691, "grad_norm": 0.3057168424129486, "learning_rate": 8.063101119221539e-07, "loss": 0.0335, "step": 51145 }, { "epoch": 4.635671560630778, "grad_norm": 0.31564900279045105, "learning_rate": 8.04318906081089e-07, "loss": 0.0321, "step": 51150 }, { "epoch": 4.636124705455864, "grad_norm": 0.290253221988678, "learning_rate": 8.023301217519679e-07, "loss": 0.0354, "step": 51155 }, { "epoch": 4.63657785028095, "grad_norm": 0.43528202176094055, "learning_rate": 8.003437591338258e-07, "loss": 0.0314, "step": 51160 }, { "epoch": 4.637030995106036, "grad_norm": 0.41597265005111694, "learning_rate": 7.983598184254676e-07, "loss": 0.0334, "step": 51165 }, { "epoch": 4.6374841399311215, "grad_norm": 0.32511046528816223, "learning_rate": 7.963782998254316e-07, "loss": 0.0297, "step": 51170 }, { "epoch": 4.637937284756208, "grad_norm": 0.312946617603302, "learning_rate": 7.943992035320424e-07, "loss": 0.0292, "step": 51175 }, { "epoch": 4.638390429581294, "grad_norm": 0.3186059594154358, "learning_rate": 7.924225297433641e-07, "loss": 0.0284, "step": 51180 }, { "epoch": 4.63884357440638, "grad_norm": 0.38341444730758667, "learning_rate": 7.904482786572187e-07, "loss": 0.0322, "step": 51185 }, { "epoch": 4.639296719231466, "grad_norm": 0.3085498809814453, "learning_rate": 7.884764504711984e-07, "loss": 0.0311, "step": 51190 }, { "epoch": 4.639749864056553, "grad_norm": 0.3407629430294037, "learning_rate": 7.865070453826339e-07, "loss": 0.0306, "step": 51195 }, { "epoch": 4.640203008881638, "grad_norm": 0.3668016195297241, "learning_rate": 7.845400635886347e-07, "loss": 0.0362, "step": 51200 }, { "epoch": 4.640656153706725, "grad_norm": 0.37068912386894226, "learning_rate": 7.825755052860512e-07, "loss": 0.0311, "step": 51205 }, { "epoch": 4.641109298531811, "grad_norm": 0.3323248624801636, "learning_rate": 7.80613370671493e-07, "loss": 0.0301, "step": 51210 }, { "epoch": 4.6415624433568965, "grad_norm": 0.26820436120033264, "learning_rate": 7.786536599413447e-07, "loss": 0.0318, "step": 51215 }, { "epoch": 4.642015588181983, "grad_norm": 0.3041902184486389, "learning_rate": 7.766963732917188e-07, "loss": 0.0317, "step": 51220 }, { "epoch": 4.642468733007069, "grad_norm": 0.3031187355518341, "learning_rate": 7.747415109185113e-07, "loss": 0.0307, "step": 51225 }, { "epoch": 4.642921877832155, "grad_norm": 0.4248000383377075, "learning_rate": 7.727890730173631e-07, "loss": 0.0318, "step": 51230 }, { "epoch": 4.643375022657241, "grad_norm": 0.6510140299797058, "learning_rate": 7.708390597836707e-07, "loss": 0.0346, "step": 51235 }, { "epoch": 4.643828167482328, "grad_norm": 0.28031665086746216, "learning_rate": 7.688914714126028e-07, "loss": 0.0283, "step": 51240 }, { "epoch": 4.644281312307413, "grad_norm": 0.4430466592311859, "learning_rate": 7.669463080990618e-07, "loss": 0.0316, "step": 51245 }, { "epoch": 4.6447344571324995, "grad_norm": 0.30225878953933716, "learning_rate": 7.650035700377283e-07, "loss": 0.0286, "step": 51250 }, { "epoch": 4.645187601957586, "grad_norm": 0.28992781043052673, "learning_rate": 7.630632574230301e-07, "loss": 0.0288, "step": 51255 }, { "epoch": 4.6456407467826715, "grad_norm": 0.2665085196495056, "learning_rate": 7.611253704491533e-07, "loss": 0.0304, "step": 51260 }, { "epoch": 4.646093891607758, "grad_norm": 0.3089313507080078, "learning_rate": 7.591899093100485e-07, "loss": 0.0301, "step": 51265 }, { "epoch": 4.646547036432844, "grad_norm": 0.32298538088798523, "learning_rate": 7.572568741994052e-07, "loss": 0.0299, "step": 51270 }, { "epoch": 4.64700018125793, "grad_norm": 0.29595211148262024, "learning_rate": 7.553262653106935e-07, "loss": 0.032, "step": 51275 }, { "epoch": 4.647453326083016, "grad_norm": 0.42121103405952454, "learning_rate": 7.533980828371229e-07, "loss": 0.0342, "step": 51280 }, { "epoch": 4.647906470908103, "grad_norm": 0.37485283613204956, "learning_rate": 7.514723269716695e-07, "loss": 0.034, "step": 51285 }, { "epoch": 4.648359615733188, "grad_norm": 0.6694417595863342, "learning_rate": 7.495489979070652e-07, "loss": 0.0385, "step": 51290 }, { "epoch": 4.6488127605582745, "grad_norm": 0.3492281436920166, "learning_rate": 7.476280958357923e-07, "loss": 0.0316, "step": 51295 }, { "epoch": 4.649265905383361, "grad_norm": 0.3389343023300171, "learning_rate": 7.457096209500997e-07, "loss": 0.0303, "step": 51300 }, { "epoch": 4.6497190502084464, "grad_norm": 0.25056490302085876, "learning_rate": 7.437935734419865e-07, "loss": 0.0291, "step": 51305 }, { "epoch": 4.650172195033533, "grad_norm": 0.2813882529735565, "learning_rate": 7.418799535032106e-07, "loss": 0.0299, "step": 51310 }, { "epoch": 4.650625339858619, "grad_norm": 0.3435172438621521, "learning_rate": 7.399687613252937e-07, "loss": 0.0319, "step": 51315 }, { "epoch": 4.651078484683705, "grad_norm": 0.3334507644176483, "learning_rate": 7.380599970994995e-07, "loss": 0.0284, "step": 51320 }, { "epoch": 4.651531629508791, "grad_norm": 0.2586388885974884, "learning_rate": 7.36153661016864e-07, "loss": 0.0281, "step": 51325 }, { "epoch": 4.651984774333878, "grad_norm": 0.3522493243217468, "learning_rate": 7.342497532681735e-07, "loss": 0.0299, "step": 51330 }, { "epoch": 4.652437919158963, "grad_norm": 0.2643219232559204, "learning_rate": 7.323482740439702e-07, "loss": 0.0286, "step": 51335 }, { "epoch": 4.6528910639840495, "grad_norm": 0.38746511936187744, "learning_rate": 7.304492235345573e-07, "loss": 0.0315, "step": 51340 }, { "epoch": 4.653344208809135, "grad_norm": 0.2634020745754242, "learning_rate": 7.285526019299882e-07, "loss": 0.0374, "step": 51345 }, { "epoch": 4.653797353634221, "grad_norm": 0.20911070704460144, "learning_rate": 7.266584094200807e-07, "loss": 0.0296, "step": 51350 }, { "epoch": 4.654250498459308, "grad_norm": 0.3733157813549042, "learning_rate": 7.247666461944052e-07, "loss": 0.0295, "step": 51355 }, { "epoch": 4.654703643284393, "grad_norm": 0.27770936489105225, "learning_rate": 7.22877312442291e-07, "loss": 0.0272, "step": 51360 }, { "epoch": 4.65515678810948, "grad_norm": 0.4796134829521179, "learning_rate": 7.209904083528174e-07, "loss": 0.0339, "step": 51365 }, { "epoch": 4.655609932934566, "grad_norm": 0.37206393480300903, "learning_rate": 7.191059341148359e-07, "loss": 0.0291, "step": 51370 }, { "epoch": 4.656063077759652, "grad_norm": 0.3326946496963501, "learning_rate": 7.172238899169403e-07, "loss": 0.0305, "step": 51375 }, { "epoch": 4.656516222584738, "grad_norm": 0.29340627789497375, "learning_rate": 7.153442759474854e-07, "loss": 0.0282, "step": 51380 }, { "epoch": 4.6569693674098245, "grad_norm": 0.22205477952957153, "learning_rate": 7.134670923945847e-07, "loss": 0.0312, "step": 51385 }, { "epoch": 4.65742251223491, "grad_norm": 0.2904679477214813, "learning_rate": 7.115923394461071e-07, "loss": 0.032, "step": 51390 }, { "epoch": 4.657875657059996, "grad_norm": 0.37981271743774414, "learning_rate": 7.097200172896806e-07, "loss": 0.0306, "step": 51395 }, { "epoch": 4.658328801885083, "grad_norm": 0.33980897068977356, "learning_rate": 7.078501261126857e-07, "loss": 0.0326, "step": 51400 }, { "epoch": 4.658781946710168, "grad_norm": 0.3614477515220642, "learning_rate": 7.059826661022617e-07, "loss": 0.0299, "step": 51405 }, { "epoch": 4.659235091535255, "grad_norm": 0.416633278131485, "learning_rate": 7.041176374453063e-07, "loss": 0.0309, "step": 51410 }, { "epoch": 4.659688236360341, "grad_norm": 0.29398447275161743, "learning_rate": 7.022550403284672e-07, "loss": 0.0291, "step": 51415 }, { "epoch": 4.660141381185427, "grad_norm": 0.30259814858436584, "learning_rate": 7.003948749381594e-07, "loss": 0.0289, "step": 51420 }, { "epoch": 4.660594526010513, "grad_norm": 0.31070390343666077, "learning_rate": 6.985371414605507e-07, "loss": 0.0276, "step": 51425 }, { "epoch": 4.661047670835599, "grad_norm": 0.25641143321990967, "learning_rate": 6.966818400815562e-07, "loss": 0.0303, "step": 51430 }, { "epoch": 4.661500815660685, "grad_norm": 0.37369298934936523, "learning_rate": 6.948289709868605e-07, "loss": 0.0528, "step": 51435 }, { "epoch": 4.661953960485771, "grad_norm": 0.31496715545654297, "learning_rate": 6.929785343618961e-07, "loss": 0.0323, "step": 51440 }, { "epoch": 4.662407105310857, "grad_norm": 0.26061370968818665, "learning_rate": 6.911305303918563e-07, "loss": 0.029, "step": 51445 }, { "epoch": 4.662860250135943, "grad_norm": 0.2839801013469696, "learning_rate": 6.892849592616935e-07, "loss": 0.0314, "step": 51450 }, { "epoch": 4.66331339496103, "grad_norm": 0.3034437894821167, "learning_rate": 6.874418211561095e-07, "loss": 0.0301, "step": 51455 }, { "epoch": 4.663766539786115, "grad_norm": 0.35846251249313354, "learning_rate": 6.856011162595682e-07, "loss": 0.0316, "step": 51460 }, { "epoch": 4.664219684611202, "grad_norm": 0.32078084349632263, "learning_rate": 6.837628447562805e-07, "loss": 0.0308, "step": 51465 }, { "epoch": 4.664672829436288, "grad_norm": 0.2598040699958801, "learning_rate": 6.819270068302325e-07, "loss": 0.0286, "step": 51470 }, { "epoch": 4.6651259742613735, "grad_norm": 0.2614583671092987, "learning_rate": 6.800936026651522e-07, "loss": 0.0287, "step": 51475 }, { "epoch": 4.66557911908646, "grad_norm": 0.3188168704509735, "learning_rate": 6.782626324445207e-07, "loss": 0.0392, "step": 51480 }, { "epoch": 4.666032263911546, "grad_norm": 0.35018113255500793, "learning_rate": 6.764340963515914e-07, "loss": 0.0313, "step": 51485 }, { "epoch": 4.666485408736632, "grad_norm": 0.3079105615615845, "learning_rate": 6.746079945693568e-07, "loss": 0.0317, "step": 51490 }, { "epoch": 4.666938553561718, "grad_norm": 0.28390106558799744, "learning_rate": 6.72784327280579e-07, "loss": 0.0322, "step": 51495 }, { "epoch": 4.667391698386805, "grad_norm": 0.26411184668540955, "learning_rate": 6.709630946677675e-07, "loss": 0.0286, "step": 51500 }, { "epoch": 4.66784484321189, "grad_norm": 0.2706574499607086, "learning_rate": 6.691442969131961e-07, "loss": 0.0284, "step": 51505 }, { "epoch": 4.668297988036977, "grad_norm": 0.36060383915901184, "learning_rate": 6.673279341988886e-07, "loss": 0.0302, "step": 51510 }, { "epoch": 4.668751132862063, "grad_norm": 0.2804127335548401, "learning_rate": 6.655140067066245e-07, "loss": 0.0312, "step": 51515 }, { "epoch": 4.6692042776871485, "grad_norm": 0.33610662817955017, "learning_rate": 6.637025146179476e-07, "loss": 0.0382, "step": 51520 }, { "epoch": 4.669657422512235, "grad_norm": 0.4526621997356415, "learning_rate": 6.618934581141517e-07, "loss": 0.0329, "step": 51525 }, { "epoch": 4.670110567337321, "grad_norm": 0.27585434913635254, "learning_rate": 6.600868373762809e-07, "loss": 0.0292, "step": 51530 }, { "epoch": 4.670563712162407, "grad_norm": 0.3055797517299652, "learning_rate": 6.582826525851543e-07, "loss": 0.0293, "step": 51535 }, { "epoch": 4.671016856987493, "grad_norm": 0.33010876178741455, "learning_rate": 6.564809039213221e-07, "loss": 0.0294, "step": 51540 }, { "epoch": 4.67147000181258, "grad_norm": 0.3181747794151306, "learning_rate": 6.54681591565115e-07, "loss": 0.0286, "step": 51545 }, { "epoch": 4.671923146637665, "grad_norm": 0.3013274073600769, "learning_rate": 6.528847156966001e-07, "loss": 0.0278, "step": 51550 }, { "epoch": 4.672376291462752, "grad_norm": 0.27734893560409546, "learning_rate": 6.510902764956167e-07, "loss": 0.0282, "step": 51555 }, { "epoch": 4.672829436287838, "grad_norm": 0.2971899211406708, "learning_rate": 6.492982741417464e-07, "loss": 0.0302, "step": 51560 }, { "epoch": 4.6732825811129235, "grad_norm": 0.3335123360157013, "learning_rate": 6.475087088143372e-07, "loss": 0.0292, "step": 51565 }, { "epoch": 4.67373572593801, "grad_norm": 0.3065066933631897, "learning_rate": 6.457215806924932e-07, "loss": 0.0284, "step": 51570 }, { "epoch": 4.674188870763096, "grad_norm": 0.4423082172870636, "learning_rate": 6.439368899550602e-07, "loss": 0.0302, "step": 51575 }, { "epoch": 4.674642015588182, "grad_norm": 0.29521143436431885, "learning_rate": 6.421546367806619e-07, "loss": 0.0305, "step": 51580 }, { "epoch": 4.675095160413268, "grad_norm": 0.31736505031585693, "learning_rate": 6.403748213476585e-07, "loss": 0.0297, "step": 51585 }, { "epoch": 4.675548305238355, "grad_norm": 0.3628505766391754, "learning_rate": 6.385974438341768e-07, "loss": 0.0292, "step": 51590 }, { "epoch": 4.67600145006344, "grad_norm": 0.4782697260379791, "learning_rate": 6.368225044181025e-07, "loss": 0.0366, "step": 51595 }, { "epoch": 4.6764545948885266, "grad_norm": 0.3342939615249634, "learning_rate": 6.350500032770656e-07, "loss": 0.0291, "step": 51600 }, { "epoch": 4.676907739713613, "grad_norm": 0.3024459183216095, "learning_rate": 6.332799405884604e-07, "loss": 0.0292, "step": 51605 }, { "epoch": 4.6773608845386985, "grad_norm": 0.316806823015213, "learning_rate": 6.315123165294368e-07, "loss": 0.0326, "step": 51610 }, { "epoch": 4.677814029363785, "grad_norm": 0.2844494879245758, "learning_rate": 6.297471312768977e-07, "loss": 0.0298, "step": 51615 }, { "epoch": 4.67826717418887, "grad_norm": 0.26149365305900574, "learning_rate": 6.279843850075046e-07, "loss": 0.0289, "step": 51620 }, { "epoch": 4.678720319013957, "grad_norm": 0.42042461037635803, "learning_rate": 6.262240778976746e-07, "loss": 0.0312, "step": 51625 }, { "epoch": 4.679173463839043, "grad_norm": 0.3554317355155945, "learning_rate": 6.244662101235754e-07, "loss": 0.029, "step": 51630 }, { "epoch": 4.679626608664129, "grad_norm": 0.2619273364543915, "learning_rate": 6.22710781861141e-07, "loss": 0.0313, "step": 51635 }, { "epoch": 4.680079753489215, "grad_norm": 0.4148635268211365, "learning_rate": 6.209577932860505e-07, "loss": 0.0296, "step": 51640 }, { "epoch": 4.6805328983143015, "grad_norm": 0.4174892008304596, "learning_rate": 6.192072445737468e-07, "loss": 0.0308, "step": 51645 }, { "epoch": 4.680986043139387, "grad_norm": 0.38370949029922485, "learning_rate": 6.174591358994231e-07, "loss": 0.0275, "step": 51650 }, { "epoch": 4.6814391879644734, "grad_norm": 0.3697872459888458, "learning_rate": 6.15713467438031e-07, "loss": 0.0307, "step": 51655 }, { "epoch": 4.68189233278956, "grad_norm": 0.3870922029018402, "learning_rate": 6.139702393642782e-07, "loss": 0.0308, "step": 51660 }, { "epoch": 4.682345477614645, "grad_norm": 0.3480609655380249, "learning_rate": 6.122294518526279e-07, "loss": 0.0305, "step": 51665 }, { "epoch": 4.682798622439732, "grad_norm": 0.2961190342903137, "learning_rate": 6.104911050772988e-07, "loss": 0.0294, "step": 51670 }, { "epoch": 4.683251767264818, "grad_norm": 0.4832896590232849, "learning_rate": 6.087551992122658e-07, "loss": 0.0353, "step": 51675 }, { "epoch": 4.683704912089904, "grad_norm": 0.3055986166000366, "learning_rate": 6.070217344312568e-07, "loss": 0.0321, "step": 51680 }, { "epoch": 4.68415805691499, "grad_norm": 0.304946631193161, "learning_rate": 6.052907109077577e-07, "loss": 0.0298, "step": 51685 }, { "epoch": 4.6846112017400765, "grad_norm": 0.37202516198158264, "learning_rate": 6.035621288150106e-07, "loss": 0.0324, "step": 51690 }, { "epoch": 4.685064346565162, "grad_norm": 0.44706764817237854, "learning_rate": 6.018359883260161e-07, "loss": 0.0315, "step": 51695 }, { "epoch": 4.685517491390248, "grad_norm": 0.27037954330444336, "learning_rate": 6.001122896135219e-07, "loss": 0.0389, "step": 51700 }, { "epoch": 4.685970636215334, "grad_norm": 0.4566481411457062, "learning_rate": 5.983910328500375e-07, "loss": 0.0343, "step": 51705 }, { "epoch": 4.68642378104042, "grad_norm": 0.2969619035720825, "learning_rate": 5.966722182078277e-07, "loss": 0.0301, "step": 51710 }, { "epoch": 4.686876925865507, "grad_norm": 0.3723188042640686, "learning_rate": 5.949558458589133e-07, "loss": 0.0315, "step": 51715 }, { "epoch": 4.687330070690592, "grad_norm": 0.3327583968639374, "learning_rate": 5.932419159750651e-07, "loss": 0.0323, "step": 51720 }, { "epoch": 4.687783215515679, "grad_norm": 0.24183964729309082, "learning_rate": 5.915304287278184e-07, "loss": 0.0324, "step": 51725 }, { "epoch": 4.688236360340765, "grad_norm": 0.25024425983428955, "learning_rate": 5.898213842884581e-07, "loss": 0.0282, "step": 51730 }, { "epoch": 4.688689505165851, "grad_norm": 0.4142306447029114, "learning_rate": 5.881147828280254e-07, "loss": 0.0387, "step": 51735 }, { "epoch": 4.689142649990937, "grad_norm": 0.3375193476676941, "learning_rate": 5.864106245173196e-07, "loss": 0.031, "step": 51740 }, { "epoch": 4.689595794816023, "grad_norm": 0.34856978058815, "learning_rate": 5.847089095268876e-07, "loss": 0.0286, "step": 51745 }, { "epoch": 4.690048939641109, "grad_norm": 0.5715129971504211, "learning_rate": 5.830096380270433e-07, "loss": 0.0345, "step": 51750 }, { "epoch": 4.690502084466195, "grad_norm": 0.31899532675743103, "learning_rate": 5.813128101878479e-07, "loss": 0.0308, "step": 51755 }, { "epoch": 4.690955229291282, "grad_norm": 0.3240903317928314, "learning_rate": 5.79618426179121e-07, "loss": 0.0323, "step": 51760 }, { "epoch": 4.691408374116367, "grad_norm": 0.32260021567344666, "learning_rate": 5.779264861704409e-07, "loss": 0.0309, "step": 51765 }, { "epoch": 4.691861518941454, "grad_norm": 0.337248295545578, "learning_rate": 5.762369903311276e-07, "loss": 0.0297, "step": 51770 }, { "epoch": 4.69231466376654, "grad_norm": 0.28979936242103577, "learning_rate": 5.745499388302794e-07, "loss": 0.0289, "step": 51775 }, { "epoch": 4.692767808591626, "grad_norm": 0.4053365886211395, "learning_rate": 5.728653318367277e-07, "loss": 0.0303, "step": 51780 }, { "epoch": 4.693220953416712, "grad_norm": 0.2818787097930908, "learning_rate": 5.711831695190683e-07, "loss": 0.0303, "step": 51785 }, { "epoch": 4.693674098241798, "grad_norm": 0.39318758249282837, "learning_rate": 5.69503452045661e-07, "loss": 0.0295, "step": 51790 }, { "epoch": 4.694127243066884, "grad_norm": 0.26240357756614685, "learning_rate": 5.678261795846018e-07, "loss": 0.0314, "step": 51795 }, { "epoch": 4.69458038789197, "grad_norm": 0.3285005986690521, "learning_rate": 5.661513523037593e-07, "loss": 0.0298, "step": 51800 }, { "epoch": 4.695033532717057, "grad_norm": 0.2701193392276764, "learning_rate": 5.64478970370752e-07, "loss": 0.0279, "step": 51805 }, { "epoch": 4.695486677542142, "grad_norm": 0.3108804523944855, "learning_rate": 5.628090339529462e-07, "loss": 0.0288, "step": 51810 }, { "epoch": 4.695939822367229, "grad_norm": 0.22179000079631805, "learning_rate": 5.611415432174777e-07, "loss": 0.0297, "step": 51815 }, { "epoch": 4.696392967192315, "grad_norm": 0.6799659132957458, "learning_rate": 5.594764983312212e-07, "loss": 0.0341, "step": 51820 }, { "epoch": 4.6968461120174005, "grad_norm": 0.31996098160743713, "learning_rate": 5.578138994608184e-07, "loss": 0.0301, "step": 51825 }, { "epoch": 4.697299256842487, "grad_norm": 0.4814654290676117, "learning_rate": 5.561537467726668e-07, "loss": 0.03, "step": 51830 }, { "epoch": 4.697752401667573, "grad_norm": 0.2862240672111511, "learning_rate": 5.544960404329086e-07, "loss": 0.0303, "step": 51835 }, { "epoch": 4.698205546492659, "grad_norm": 0.2570028305053711, "learning_rate": 5.528407806074526e-07, "loss": 0.0283, "step": 51840 }, { "epoch": 4.698658691317745, "grad_norm": 0.31342050433158875, "learning_rate": 5.511879674619525e-07, "loss": 0.0344, "step": 51845 }, { "epoch": 4.699111836142832, "grad_norm": 0.36475804448127747, "learning_rate": 5.495376011618286e-07, "loss": 0.029, "step": 51850 }, { "epoch": 4.699564980967917, "grad_norm": 0.3956480324268341, "learning_rate": 5.478896818722462e-07, "loss": 0.0314, "step": 51855 }, { "epoch": 4.700018125793004, "grad_norm": 0.3201424181461334, "learning_rate": 5.462442097581288e-07, "loss": 0.0304, "step": 51860 }, { "epoch": 4.70047127061809, "grad_norm": 0.32881924510002136, "learning_rate": 5.446011849841614e-07, "loss": 0.0299, "step": 51865 }, { "epoch": 4.7009244154431755, "grad_norm": 0.3359350264072418, "learning_rate": 5.429606077147708e-07, "loss": 0.0296, "step": 51870 }, { "epoch": 4.701377560268262, "grad_norm": 0.2844271957874298, "learning_rate": 5.413224781141535e-07, "loss": 0.0293, "step": 51875 }, { "epoch": 4.701830705093348, "grad_norm": 0.2725752294063568, "learning_rate": 5.396867963462476e-07, "loss": 0.0364, "step": 51880 }, { "epoch": 4.702283849918434, "grad_norm": 0.2360243797302246, "learning_rate": 5.380535625747557e-07, "loss": 0.0317, "step": 51885 }, { "epoch": 4.70273699474352, "grad_norm": 0.37692898511886597, "learning_rate": 5.364227769631386e-07, "loss": 0.0327, "step": 51890 }, { "epoch": 4.703190139568606, "grad_norm": 0.4539637863636017, "learning_rate": 5.347944396745936e-07, "loss": 0.041, "step": 51895 }, { "epoch": 4.703643284393692, "grad_norm": 0.314585417509079, "learning_rate": 5.331685508720929e-07, "loss": 0.03, "step": 51900 }, { "epoch": 4.704096429218779, "grad_norm": 0.3192468285560608, "learning_rate": 5.315451107183566e-07, "loss": 0.0344, "step": 51905 }, { "epoch": 4.704549574043864, "grad_norm": 0.3150053918361664, "learning_rate": 5.299241193758547e-07, "loss": 0.0311, "step": 51910 }, { "epoch": 4.7050027188689505, "grad_norm": 0.3147362172603607, "learning_rate": 5.283055770068213e-07, "loss": 0.0389, "step": 51915 }, { "epoch": 4.705455863694037, "grad_norm": 0.2599904239177704, "learning_rate": 5.26689483773235e-07, "loss": 0.0303, "step": 51920 }, { "epoch": 4.705909008519122, "grad_norm": 0.4907849133014679, "learning_rate": 5.250758398368416e-07, "loss": 0.0326, "step": 51925 }, { "epoch": 4.706362153344209, "grad_norm": 0.307991087436676, "learning_rate": 5.234646453591313e-07, "loss": 0.0346, "step": 51930 }, { "epoch": 4.706815298169295, "grad_norm": 0.4663177728652954, "learning_rate": 5.218559005013557e-07, "loss": 0.034, "step": 51935 }, { "epoch": 4.707268442994381, "grad_norm": 0.29821696877479553, "learning_rate": 5.202496054245137e-07, "loss": 0.0306, "step": 51940 }, { "epoch": 4.707721587819467, "grad_norm": 0.3118622601032257, "learning_rate": 5.186457602893657e-07, "loss": 0.0289, "step": 51945 }, { "epoch": 4.7081747326445536, "grad_norm": 0.41870778799057007, "learning_rate": 5.170443652564278e-07, "loss": 0.0337, "step": 51950 }, { "epoch": 4.708627877469639, "grad_norm": 0.30035167932510376, "learning_rate": 5.15445420485966e-07, "loss": 0.0306, "step": 51955 }, { "epoch": 4.7090810222947255, "grad_norm": 0.3176814317703247, "learning_rate": 5.138489261380025e-07, "loss": 0.0304, "step": 51960 }, { "epoch": 4.709534167119812, "grad_norm": 0.3080824315547943, "learning_rate": 5.122548823723178e-07, "loss": 0.0276, "step": 51965 }, { "epoch": 4.709987311944897, "grad_norm": 0.5297876596450806, "learning_rate": 5.106632893484398e-07, "loss": 0.033, "step": 51970 }, { "epoch": 4.710440456769984, "grad_norm": 0.4252324104309082, "learning_rate": 5.090741472256605e-07, "loss": 0.0362, "step": 51975 }, { "epoch": 4.710893601595069, "grad_norm": 0.2865072190761566, "learning_rate": 5.074874561630222e-07, "loss": 0.0286, "step": 51980 }, { "epoch": 4.711346746420156, "grad_norm": 0.44301825761795044, "learning_rate": 5.059032163193145e-07, "loss": 0.0384, "step": 51985 }, { "epoch": 4.711799891245242, "grad_norm": 0.22363238036632538, "learning_rate": 5.043214278530966e-07, "loss": 0.0291, "step": 51990 }, { "epoch": 4.712253036070328, "grad_norm": 0.3151535987854004, "learning_rate": 5.027420909226671e-07, "loss": 0.0287, "step": 51995 }, { "epoch": 4.712706180895414, "grad_norm": 0.3045580983161926, "learning_rate": 5.011652056860938e-07, "loss": 0.0277, "step": 52000 }, { "epoch": 4.7131593257205004, "grad_norm": 0.3013889193534851, "learning_rate": 4.995907723011866e-07, "loss": 0.031, "step": 52005 }, { "epoch": 4.713612470545586, "grad_norm": 0.2451419085264206, "learning_rate": 4.980187909255197e-07, "loss": 0.0359, "step": 52010 }, { "epoch": 4.714065615370672, "grad_norm": 0.31281617283821106, "learning_rate": 4.964492617164112e-07, "loss": 0.0296, "step": 52015 }, { "epoch": 4.714518760195759, "grad_norm": 0.2993902266025543, "learning_rate": 4.94882184830947e-07, "loss": 0.0294, "step": 52020 }, { "epoch": 4.714971905020844, "grad_norm": 0.2989322543144226, "learning_rate": 4.933175604259572e-07, "loss": 0.0302, "step": 52025 }, { "epoch": 4.715425049845931, "grad_norm": 0.2682243287563324, "learning_rate": 4.917553886580301e-07, "loss": 0.0286, "step": 52030 }, { "epoch": 4.715878194671017, "grad_norm": 0.25571444630622864, "learning_rate": 4.901956696835103e-07, "loss": 0.0273, "step": 52035 }, { "epoch": 4.716331339496103, "grad_norm": 0.2977421283721924, "learning_rate": 4.886384036584896e-07, "loss": 0.0312, "step": 52040 }, { "epoch": 4.716784484321189, "grad_norm": 0.3247155547142029, "learning_rate": 4.870835907388265e-07, "loss": 0.0339, "step": 52045 }, { "epoch": 4.717237629146275, "grad_norm": 0.3994290828704834, "learning_rate": 4.855312310801247e-07, "loss": 0.0314, "step": 52050 }, { "epoch": 4.717690773971361, "grad_norm": 0.3425166606903076, "learning_rate": 4.839813248377429e-07, "loss": 0.0287, "step": 52055 }, { "epoch": 4.718143918796447, "grad_norm": 0.28267747163772583, "learning_rate": 4.824338721667987e-07, "loss": 0.0311, "step": 52060 }, { "epoch": 4.718597063621534, "grad_norm": 0.2897492051124573, "learning_rate": 4.8088887322216e-07, "loss": 0.028, "step": 52065 }, { "epoch": 4.719050208446619, "grad_norm": 0.25033366680145264, "learning_rate": 4.793463281584504e-07, "loss": 0.0291, "step": 52070 }, { "epoch": 4.719503353271706, "grad_norm": 0.2942219376564026, "learning_rate": 4.778062371300518e-07, "loss": 0.0274, "step": 52075 }, { "epoch": 4.719956498096792, "grad_norm": 0.39186006784439087, "learning_rate": 4.7626860029109374e-07, "loss": 0.0304, "step": 52080 }, { "epoch": 4.720409642921878, "grad_norm": 0.3195688724517822, "learning_rate": 4.747334177954671e-07, "loss": 0.0292, "step": 52085 }, { "epoch": 4.720862787746964, "grad_norm": 0.3138681948184967, "learning_rate": 4.732006897968072e-07, "loss": 0.0347, "step": 52090 }, { "epoch": 4.72131593257205, "grad_norm": 0.3271506726741791, "learning_rate": 4.716704164485164e-07, "loss": 0.0314, "step": 52095 }, { "epoch": 4.721769077397136, "grad_norm": 0.6453778147697449, "learning_rate": 4.7014259790373884e-07, "loss": 0.0442, "step": 52100 }, { "epoch": 4.722222222222222, "grad_norm": 0.4185265004634857, "learning_rate": 4.6861723431538276e-07, "loss": 0.0313, "step": 52105 }, { "epoch": 4.722675367047309, "grad_norm": 0.342594712972641, "learning_rate": 4.6709432583610935e-07, "loss": 0.0315, "step": 52110 }, { "epoch": 4.723128511872394, "grad_norm": 0.3208771347999573, "learning_rate": 4.655738726183273e-07, "loss": 0.0299, "step": 52115 }, { "epoch": 4.723581656697481, "grad_norm": 0.24512000381946564, "learning_rate": 4.6405587481420656e-07, "loss": 0.0286, "step": 52120 }, { "epoch": 4.724034801522567, "grad_norm": 0.33416226506233215, "learning_rate": 4.625403325756672e-07, "loss": 0.0309, "step": 52125 }, { "epoch": 4.724487946347653, "grad_norm": 0.299578994512558, "learning_rate": 4.6102724605438794e-07, "loss": 0.0301, "step": 52130 }, { "epoch": 4.724941091172739, "grad_norm": 0.37477049231529236, "learning_rate": 4.595166154017977e-07, "loss": 0.0304, "step": 52135 }, { "epoch": 4.725394235997825, "grad_norm": 0.38969025015830994, "learning_rate": 4.5800844076907824e-07, "loss": 0.032, "step": 52140 }, { "epoch": 4.725847380822911, "grad_norm": 0.3335222899913788, "learning_rate": 4.565027223071727e-07, "loss": 0.0289, "step": 52145 }, { "epoch": 4.726300525647997, "grad_norm": 0.3541085720062256, "learning_rate": 4.5499946016676897e-07, "loss": 0.029, "step": 52150 }, { "epoch": 4.726753670473083, "grad_norm": 0.3061530292034149, "learning_rate": 4.5349865449831887e-07, "loss": 0.0308, "step": 52155 }, { "epoch": 4.727206815298169, "grad_norm": 0.3291768729686737, "learning_rate": 4.5200030545202164e-07, "loss": 0.0321, "step": 52160 }, { "epoch": 4.727659960123256, "grad_norm": 0.3097851574420929, "learning_rate": 4.5050441317782963e-07, "loss": 0.0288, "step": 52165 }, { "epoch": 4.728113104948341, "grad_norm": 0.4280093312263489, "learning_rate": 4.490109778254592e-07, "loss": 0.0318, "step": 52170 }, { "epoch": 4.7285662497734275, "grad_norm": 0.314503014087677, "learning_rate": 4.4751999954436576e-07, "loss": 0.0314, "step": 52175 }, { "epoch": 4.729019394598514, "grad_norm": 0.3014882802963257, "learning_rate": 4.460314784837716e-07, "loss": 0.0295, "step": 52180 }, { "epoch": 4.7294725394235995, "grad_norm": 0.33251115679740906, "learning_rate": 4.4454541479264923e-07, "loss": 0.0342, "step": 52185 }, { "epoch": 4.729925684248686, "grad_norm": 0.26798874139785767, "learning_rate": 4.4306180861971857e-07, "loss": 0.0282, "step": 52190 }, { "epoch": 4.730378829073772, "grad_norm": 0.3069441616535187, "learning_rate": 4.415806601134692e-07, "loss": 0.0296, "step": 52195 }, { "epoch": 4.730831973898858, "grad_norm": 0.2872147560119629, "learning_rate": 4.40101969422127e-07, "loss": 0.0289, "step": 52200 }, { "epoch": 4.731285118723944, "grad_norm": 0.37456589937210083, "learning_rate": 4.386257366936819e-07, "loss": 0.0296, "step": 52205 }, { "epoch": 4.731738263549031, "grad_norm": 0.2834879755973816, "learning_rate": 4.3715196207587683e-07, "loss": 0.0323, "step": 52210 }, { "epoch": 4.732191408374116, "grad_norm": 0.30648303031921387, "learning_rate": 4.3568064571620215e-07, "loss": 0.0308, "step": 52215 }, { "epoch": 4.7326445531992025, "grad_norm": 0.26654547452926636, "learning_rate": 4.3421178776192063e-07, "loss": 0.0276, "step": 52220 }, { "epoch": 4.733097698024289, "grad_norm": 0.3451708257198334, "learning_rate": 4.327453883600202e-07, "loss": 0.0317, "step": 52225 }, { "epoch": 4.733550842849374, "grad_norm": 0.22571858763694763, "learning_rate": 4.312814476572724e-07, "loss": 0.0327, "step": 52230 }, { "epoch": 4.734003987674461, "grad_norm": 0.37904423475265503, "learning_rate": 4.298199658001795e-07, "loss": 0.0304, "step": 52235 }, { "epoch": 4.734457132499546, "grad_norm": 0.43455418944358826, "learning_rate": 4.283609429350105e-07, "loss": 0.0322, "step": 52240 }, { "epoch": 4.734910277324633, "grad_norm": 0.31143972277641296, "learning_rate": 4.269043792077876e-07, "loss": 0.0306, "step": 52245 }, { "epoch": 4.735363422149719, "grad_norm": 0.3265089690685272, "learning_rate": 4.2545027476427743e-07, "loss": 0.029, "step": 52250 }, { "epoch": 4.735816566974805, "grad_norm": 0.3342696726322174, "learning_rate": 4.239986297500137e-07, "loss": 0.0265, "step": 52255 }, { "epoch": 4.736269711799891, "grad_norm": 0.21220703423023224, "learning_rate": 4.2254944431027455e-07, "loss": 0.0307, "step": 52260 }, { "epoch": 4.7367228566249775, "grad_norm": 0.3119123876094818, "learning_rate": 4.2110271859009396e-07, "loss": 0.0309, "step": 52265 }, { "epoch": 4.737176001450063, "grad_norm": 0.25534170866012573, "learning_rate": 4.196584527342617e-07, "loss": 0.0303, "step": 52270 }, { "epoch": 4.737629146275149, "grad_norm": 0.40706560015678406, "learning_rate": 4.182166468873233e-07, "loss": 0.0308, "step": 52275 }, { "epoch": 4.738082291100236, "grad_norm": 0.29813799262046814, "learning_rate": 4.167773011935716e-07, "loss": 0.03, "step": 52280 }, { "epoch": 4.738535435925321, "grad_norm": 0.41443997621536255, "learning_rate": 4.153404157970581e-07, "loss": 0.0315, "step": 52285 }, { "epoch": 4.738988580750408, "grad_norm": 0.2803541123867035, "learning_rate": 4.139059908415871e-07, "loss": 0.0279, "step": 52290 }, { "epoch": 4.739441725575494, "grad_norm": 0.36040741205215454, "learning_rate": 4.124740264707161e-07, "loss": 0.0339, "step": 52295 }, { "epoch": 4.73989487040058, "grad_norm": 0.3496566414833069, "learning_rate": 4.1104452282775543e-07, "loss": 0.0317, "step": 52300 }, { "epoch": 4.740348015225666, "grad_norm": 0.2873958647251129, "learning_rate": 4.0961748005577114e-07, "loss": 0.0293, "step": 52305 }, { "epoch": 4.7408011600507525, "grad_norm": 0.44048088788986206, "learning_rate": 4.081928982975852e-07, "loss": 0.0325, "step": 52310 }, { "epoch": 4.741254304875838, "grad_norm": 0.2874275743961334, "learning_rate": 4.067707776957641e-07, "loss": 0.0297, "step": 52315 }, { "epoch": 4.741707449700924, "grad_norm": 0.34464526176452637, "learning_rate": 4.053511183926356e-07, "loss": 0.0305, "step": 52320 }, { "epoch": 4.742160594526011, "grad_norm": 0.2931917905807495, "learning_rate": 4.039339205302861e-07, "loss": 0.0287, "step": 52325 }, { "epoch": 4.742613739351096, "grad_norm": 0.3943323493003845, "learning_rate": 4.02519184250541e-07, "loss": 0.0295, "step": 52330 }, { "epoch": 4.743066884176183, "grad_norm": 0.3316693902015686, "learning_rate": 4.0110690969499263e-07, "loss": 0.0285, "step": 52335 }, { "epoch": 4.743520029001269, "grad_norm": 0.27632948756217957, "learning_rate": 3.9969709700498073e-07, "loss": 0.0277, "step": 52340 }, { "epoch": 4.743973173826355, "grad_norm": 0.3579128086566925, "learning_rate": 3.982897463215979e-07, "loss": 0.0356, "step": 52345 }, { "epoch": 4.744426318651441, "grad_norm": 0.21759934723377228, "learning_rate": 3.968848577856954e-07, "loss": 0.0289, "step": 52350 }, { "epoch": 4.7448794634765274, "grad_norm": 0.31420862674713135, "learning_rate": 3.954824315378747e-07, "loss": 0.0282, "step": 52355 }, { "epoch": 4.745332608301613, "grad_norm": 0.37663161754608154, "learning_rate": 3.9408246771848733e-07, "loss": 0.0281, "step": 52360 }, { "epoch": 4.745785753126699, "grad_norm": 0.34350883960723877, "learning_rate": 3.926849664676463e-07, "loss": 0.0337, "step": 52365 }, { "epoch": 4.746238897951786, "grad_norm": 0.27988436818122864, "learning_rate": 3.912899279252091e-07, "loss": 0.0307, "step": 52370 }, { "epoch": 4.746692042776871, "grad_norm": 0.2996523976325989, "learning_rate": 3.8989735223079747e-07, "loss": 0.0295, "step": 52375 }, { "epoch": 4.747145187601958, "grad_norm": 0.39983052015304565, "learning_rate": 3.885072395237804e-07, "loss": 0.0627, "step": 52380 }, { "epoch": 4.747598332427044, "grad_norm": 0.36039504408836365, "learning_rate": 3.8711958994327445e-07, "loss": 0.03, "step": 52385 }, { "epoch": 4.74805147725213, "grad_norm": 0.2190648913383484, "learning_rate": 3.857344036281657e-07, "loss": 0.0298, "step": 52390 }, { "epoch": 4.748504622077216, "grad_norm": 0.21816220879554749, "learning_rate": 3.843516807170738e-07, "loss": 0.0296, "step": 52395 }, { "epoch": 4.748957766902302, "grad_norm": 0.2756965756416321, "learning_rate": 3.8297142134838816e-07, "loss": 0.0303, "step": 52400 }, { "epoch": 4.749410911727388, "grad_norm": 0.47179096937179565, "learning_rate": 3.815936256602454e-07, "loss": 0.0342, "step": 52405 }, { "epoch": 4.749864056552474, "grad_norm": 0.24566805362701416, "learning_rate": 3.802182937905324e-07, "loss": 0.0283, "step": 52410 }, { "epoch": 4.750317201377561, "grad_norm": 0.4926021099090576, "learning_rate": 3.788454258769003e-07, "loss": 0.0359, "step": 52415 }, { "epoch": 4.750770346202646, "grad_norm": 0.5589576363563538, "learning_rate": 3.7747502205673634e-07, "loss": 0.0336, "step": 52420 }, { "epoch": 4.751223491027733, "grad_norm": 0.6129581332206726, "learning_rate": 3.761070824671975e-07, "loss": 0.0395, "step": 52425 }, { "epoch": 4.751676635852818, "grad_norm": 0.28384485840797424, "learning_rate": 3.7474160724518537e-07, "loss": 0.0282, "step": 52430 }, { "epoch": 4.752129780677905, "grad_norm": 0.28147098422050476, "learning_rate": 3.733785965273573e-07, "loss": 0.0322, "step": 52435 }, { "epoch": 4.752582925502991, "grad_norm": 0.32632943987846375, "learning_rate": 3.720180504501292e-07, "loss": 0.03, "step": 52440 }, { "epoch": 4.7530360703280765, "grad_norm": 0.2821454107761383, "learning_rate": 3.7065996914965315e-07, "loss": 0.0296, "step": 52445 }, { "epoch": 4.753489215153163, "grad_norm": 0.38750678300857544, "learning_rate": 3.6930435276185946e-07, "loss": 0.0358, "step": 52450 }, { "epoch": 4.753942359978249, "grad_norm": 0.37869274616241455, "learning_rate": 3.67951201422409e-07, "loss": 0.0338, "step": 52455 }, { "epoch": 4.754395504803335, "grad_norm": 0.38981014490127563, "learning_rate": 3.6660051526672954e-07, "loss": 0.031, "step": 52460 }, { "epoch": 4.754848649628421, "grad_norm": 0.2674897313117981, "learning_rate": 3.652522944299991e-07, "loss": 0.0273, "step": 52465 }, { "epoch": 4.755301794453508, "grad_norm": 0.37715843319892883, "learning_rate": 3.63906539047143e-07, "loss": 0.0318, "step": 52470 }, { "epoch": 4.755754939278593, "grad_norm": 0.30579057335853577, "learning_rate": 3.6256324925285354e-07, "loss": 0.0311, "step": 52475 }, { "epoch": 4.75620808410368, "grad_norm": 0.40093863010406494, "learning_rate": 3.6122242518155935e-07, "loss": 0.0325, "step": 52480 }, { "epoch": 4.756661228928766, "grad_norm": 0.29778051376342773, "learning_rate": 3.598840669674558e-07, "loss": 0.0289, "step": 52485 }, { "epoch": 4.7571143737538515, "grad_norm": 0.3165791630744934, "learning_rate": 3.585481747444858e-07, "loss": 0.0283, "step": 52490 }, { "epoch": 4.757567518578938, "grad_norm": 0.37742355465888977, "learning_rate": 3.572147486463423e-07, "loss": 0.029, "step": 52495 }, { "epoch": 4.758020663404024, "grad_norm": 0.28241869807243347, "learning_rate": 3.5588378880647977e-07, "loss": 0.0282, "step": 52500 }, { "epoch": 4.75847380822911, "grad_norm": 0.3589056134223938, "learning_rate": 3.545552953580944e-07, "loss": 0.0291, "step": 52505 }, { "epoch": 4.758926953054196, "grad_norm": 0.23088981211185455, "learning_rate": 3.5322926843414926e-07, "loss": 0.047, "step": 52510 }, { "epoch": 4.759380097879282, "grad_norm": 0.3509417474269867, "learning_rate": 3.5190570816735214e-07, "loss": 0.0357, "step": 52515 }, { "epoch": 4.759833242704368, "grad_norm": 0.3510205149650574, "learning_rate": 3.505846146901581e-07, "loss": 0.0317, "step": 52520 }, { "epoch": 4.7602863875294545, "grad_norm": 0.4645374119281769, "learning_rate": 3.492659881347948e-07, "loss": 0.0311, "step": 52525 }, { "epoch": 4.76073953235454, "grad_norm": 0.2979884147644043, "learning_rate": 3.4794982863322336e-07, "loss": 0.0291, "step": 52530 }, { "epoch": 4.7611926771796265, "grad_norm": 0.414078950881958, "learning_rate": 3.466361363171661e-07, "loss": 0.0297, "step": 52535 }, { "epoch": 4.761645822004713, "grad_norm": 0.24654145538806915, "learning_rate": 3.453249113180984e-07, "loss": 0.0304, "step": 52540 }, { "epoch": 4.762098966829798, "grad_norm": 0.24258562922477722, "learning_rate": 3.4401615376724594e-07, "loss": 0.0286, "step": 52545 }, { "epoch": 4.762552111654885, "grad_norm": 0.3255762755870819, "learning_rate": 3.427098637955928e-07, "loss": 0.0308, "step": 52550 }, { "epoch": 4.763005256479971, "grad_norm": 0.32775479555130005, "learning_rate": 3.4140604153387324e-07, "loss": 0.0284, "step": 52555 }, { "epoch": 4.763458401305057, "grad_norm": 0.23408809304237366, "learning_rate": 3.401046871125746e-07, "loss": 0.0277, "step": 52560 }, { "epoch": 4.763911546130143, "grad_norm": 0.3294784724712372, "learning_rate": 3.388058006619316e-07, "loss": 0.0301, "step": 52565 }, { "epoch": 4.7643646909552295, "grad_norm": 0.4242016077041626, "learning_rate": 3.375093823119402e-07, "loss": 0.0346, "step": 52570 }, { "epoch": 4.764817835780315, "grad_norm": 0.35576000809669495, "learning_rate": 3.362154321923494e-07, "loss": 0.0294, "step": 52575 }, { "epoch": 4.765270980605401, "grad_norm": 0.3871137797832489, "learning_rate": 3.3492395043265566e-07, "loss": 0.0324, "step": 52580 }, { "epoch": 4.765724125430488, "grad_norm": 0.36510294675827026, "learning_rate": 3.336349371621111e-07, "loss": 0.0293, "step": 52585 }, { "epoch": 4.766177270255573, "grad_norm": 0.42018723487854004, "learning_rate": 3.3234839250972095e-07, "loss": 0.0296, "step": 52590 }, { "epoch": 4.76663041508066, "grad_norm": 0.3927171230316162, "learning_rate": 3.310643166042404e-07, "loss": 0.032, "step": 52595 }, { "epoch": 4.767083559905746, "grad_norm": 0.29936519265174866, "learning_rate": 3.297827095741834e-07, "loss": 0.0376, "step": 52600 }, { "epoch": 4.767536704730832, "grad_norm": 0.44147154688835144, "learning_rate": 3.285035715478141e-07, "loss": 0.0309, "step": 52605 }, { "epoch": 4.767989849555918, "grad_norm": 0.28771230578422546, "learning_rate": 3.272269026531466e-07, "loss": 0.03, "step": 52610 }, { "epoch": 4.7684429943810045, "grad_norm": 0.35032132267951965, "learning_rate": 3.25952703017951e-07, "loss": 0.0433, "step": 52615 }, { "epoch": 4.76889613920609, "grad_norm": 0.3318502902984619, "learning_rate": 3.246809727697503e-07, "loss": 0.0353, "step": 52620 }, { "epoch": 4.769349284031176, "grad_norm": 0.463171124458313, "learning_rate": 3.2341171203581766e-07, "loss": 0.039, "step": 52625 }, { "epoch": 4.769802428856263, "grad_norm": 0.35240909457206726, "learning_rate": 3.2214492094318485e-07, "loss": 0.03, "step": 52630 }, { "epoch": 4.770255573681348, "grad_norm": 0.31732630729675293, "learning_rate": 3.2088059961863094e-07, "loss": 0.0284, "step": 52635 }, { "epoch": 4.770708718506435, "grad_norm": 0.32824134826660156, "learning_rate": 3.196187481886881e-07, "loss": 0.0354, "step": 52640 }, { "epoch": 4.771161863331521, "grad_norm": 0.47465115785598755, "learning_rate": 3.1835936677964695e-07, "loss": 0.039, "step": 52645 }, { "epoch": 4.771615008156607, "grad_norm": 0.30130085349082947, "learning_rate": 3.1710245551754005e-07, "loss": 0.0314, "step": 52650 }, { "epoch": 4.772068152981693, "grad_norm": 0.34345269203186035, "learning_rate": 3.158480145281639e-07, "loss": 0.0301, "step": 52655 }, { "epoch": 4.7725212978067795, "grad_norm": 0.20623715221881866, "learning_rate": 3.145960439370654e-07, "loss": 0.0311, "step": 52660 }, { "epoch": 4.772974442631865, "grad_norm": 0.45973652601242065, "learning_rate": 3.13346543869536e-07, "loss": 0.0366, "step": 52665 }, { "epoch": 4.773427587456951, "grad_norm": 0.3211928904056549, "learning_rate": 3.120995144506367e-07, "loss": 0.0265, "step": 52670 }, { "epoch": 4.773880732282038, "grad_norm": 0.3219904601573944, "learning_rate": 3.1085495580515657e-07, "loss": 0.0304, "step": 52675 }, { "epoch": 4.774333877107123, "grad_norm": 0.28636518120765686, "learning_rate": 3.0961286805766267e-07, "loss": 0.0291, "step": 52680 }, { "epoch": 4.77478702193221, "grad_norm": 0.2597985863685608, "learning_rate": 3.083732513324583e-07, "loss": 0.0302, "step": 52685 }, { "epoch": 4.775240166757296, "grad_norm": 0.31433171033859253, "learning_rate": 3.071361057536054e-07, "loss": 0.0282, "step": 52690 }, { "epoch": 4.775693311582382, "grad_norm": 0.29621830582618713, "learning_rate": 3.0590143144492153e-07, "loss": 0.0287, "step": 52695 }, { "epoch": 4.776146456407468, "grad_norm": 0.5611416697502136, "learning_rate": 3.046692285299663e-07, "loss": 0.0375, "step": 52700 }, { "epoch": 4.776599601232554, "grad_norm": 0.31537020206451416, "learning_rate": 3.034394971320631e-07, "loss": 0.0295, "step": 52705 }, { "epoch": 4.77705274605764, "grad_norm": 0.3317258656024933, "learning_rate": 3.022122373742858e-07, "loss": 0.0281, "step": 52710 }, { "epoch": 4.777505890882726, "grad_norm": 0.355589359998703, "learning_rate": 3.009874493794557e-07, "loss": 0.0361, "step": 52715 }, { "epoch": 4.777959035707812, "grad_norm": 0.2750767171382904, "learning_rate": 2.9976513327015233e-07, "loss": 0.03, "step": 52720 }, { "epoch": 4.778412180532898, "grad_norm": 0.23197244107723236, "learning_rate": 2.9854528916870296e-07, "loss": 0.0291, "step": 52725 }, { "epoch": 4.778865325357985, "grad_norm": 0.28691810369491577, "learning_rate": 2.9732791719719323e-07, "loss": 0.0309, "step": 52730 }, { "epoch": 4.77931847018307, "grad_norm": 0.40070125460624695, "learning_rate": 2.961130174774535e-07, "loss": 0.0548, "step": 52735 }, { "epoch": 4.779771615008157, "grad_norm": 0.28486865758895874, "learning_rate": 2.949005901310753e-07, "loss": 0.0335, "step": 52740 }, { "epoch": 4.780224759833243, "grad_norm": 0.21930134296417236, "learning_rate": 2.9369063527940055e-07, "loss": 0.0266, "step": 52745 }, { "epoch": 4.7806779046583285, "grad_norm": 0.3172561824321747, "learning_rate": 2.924831530435157e-07, "loss": 0.0289, "step": 52750 }, { "epoch": 4.781131049483415, "grad_norm": 0.286663293838501, "learning_rate": 2.912781435442685e-07, "loss": 0.0349, "step": 52755 }, { "epoch": 4.781584194308501, "grad_norm": 0.4109570384025574, "learning_rate": 2.9007560690225977e-07, "loss": 0.0382, "step": 52760 }, { "epoch": 4.782037339133587, "grad_norm": 0.2842824459075928, "learning_rate": 2.888755432378376e-07, "loss": 0.0281, "step": 52765 }, { "epoch": 4.782490483958673, "grad_norm": 0.39609259366989136, "learning_rate": 2.87677952671106e-07, "loss": 0.0323, "step": 52770 }, { "epoch": 4.78294362878376, "grad_norm": 0.3918668031692505, "learning_rate": 2.8648283532191346e-07, "loss": 0.0285, "step": 52775 }, { "epoch": 4.783396773608845, "grad_norm": 0.3160632252693176, "learning_rate": 2.852901913098782e-07, "loss": 0.0304, "step": 52780 }, { "epoch": 4.783849918433932, "grad_norm": 0.43284785747528076, "learning_rate": 2.841000207543548e-07, "loss": 0.0306, "step": 52785 }, { "epoch": 4.784303063259017, "grad_norm": 0.2783825993537903, "learning_rate": 2.8291232377445355e-07, "loss": 0.0287, "step": 52790 }, { "epoch": 4.7847562080841035, "grad_norm": 0.3955860137939453, "learning_rate": 2.8172710048904594e-07, "loss": 0.03, "step": 52795 }, { "epoch": 4.78520935290919, "grad_norm": 0.3270691931247711, "learning_rate": 2.805443510167427e-07, "loss": 0.0298, "step": 52800 }, { "epoch": 4.785662497734275, "grad_norm": 0.3136618435382843, "learning_rate": 2.7936407547592134e-07, "loss": 0.0306, "step": 52805 }, { "epoch": 4.786115642559362, "grad_norm": 0.4651651978492737, "learning_rate": 2.781862739846985e-07, "loss": 0.0323, "step": 52810 }, { "epoch": 4.786568787384448, "grad_norm": 0.3324369192123413, "learning_rate": 2.770109466609494e-07, "loss": 0.0313, "step": 52815 }, { "epoch": 4.787021932209534, "grad_norm": 0.3449482023715973, "learning_rate": 2.7583809362230206e-07, "loss": 0.0319, "step": 52820 }, { "epoch": 4.78747507703462, "grad_norm": 0.4174617528915405, "learning_rate": 2.7466771498613765e-07, "loss": 0.0359, "step": 52825 }, { "epoch": 4.787928221859707, "grad_norm": 0.4032633304595947, "learning_rate": 2.7349981086958743e-07, "loss": 0.0302, "step": 52830 }, { "epoch": 4.788381366684792, "grad_norm": 0.32296833395957947, "learning_rate": 2.7233438138953294e-07, "loss": 0.0307, "step": 52835 }, { "epoch": 4.7888345115098785, "grad_norm": 0.2603532075881958, "learning_rate": 2.711714266626142e-07, "loss": 0.0274, "step": 52840 }, { "epoch": 4.789287656334965, "grad_norm": 0.32579582929611206, "learning_rate": 2.7001094680521865e-07, "loss": 0.052, "step": 52845 }, { "epoch": 4.78974080116005, "grad_norm": 0.27089083194732666, "learning_rate": 2.6885294193348674e-07, "loss": 0.0289, "step": 52850 }, { "epoch": 4.790193945985137, "grad_norm": 0.2621684670448303, "learning_rate": 2.676974121633119e-07, "loss": 0.0282, "step": 52855 }, { "epoch": 4.790647090810223, "grad_norm": 0.26655492186546326, "learning_rate": 2.6654435761034323e-07, "loss": 0.0308, "step": 52860 }, { "epoch": 4.791100235635309, "grad_norm": 0.2747894525527954, "learning_rate": 2.6539377838997457e-07, "loss": 0.0347, "step": 52865 }, { "epoch": 4.791553380460395, "grad_norm": 0.3342912197113037, "learning_rate": 2.6424567461736105e-07, "loss": 0.0295, "step": 52870 }, { "epoch": 4.7920065252854815, "grad_norm": 0.36868003010749817, "learning_rate": 2.6310004640739683e-07, "loss": 0.0329, "step": 52875 }, { "epoch": 4.792459670110567, "grad_norm": 0.2939322590827942, "learning_rate": 2.6195689387474856e-07, "loss": 0.0295, "step": 52880 }, { "epoch": 4.7929128149356535, "grad_norm": 0.4223608374595642, "learning_rate": 2.6081621713381354e-07, "loss": 0.0303, "step": 52885 }, { "epoch": 4.79336595976074, "grad_norm": 0.29164934158325195, "learning_rate": 2.596780162987561e-07, "loss": 0.0284, "step": 52890 }, { "epoch": 4.793819104585825, "grad_norm": 0.42116621136665344, "learning_rate": 2.5854229148348507e-07, "loss": 0.0351, "step": 52895 }, { "epoch": 4.794272249410912, "grad_norm": 0.320004403591156, "learning_rate": 2.574090428016651e-07, "loss": 0.0325, "step": 52900 }, { "epoch": 4.794725394235998, "grad_norm": 0.6363080143928528, "learning_rate": 2.5627827036671104e-07, "loss": 0.0386, "step": 52905 }, { "epoch": 4.795178539061084, "grad_norm": 0.3419525623321533, "learning_rate": 2.551499742917962e-07, "loss": 0.0302, "step": 52910 }, { "epoch": 4.79563168388617, "grad_norm": 0.2372281551361084, "learning_rate": 2.540241546898331e-07, "loss": 0.0301, "step": 52915 }, { "epoch": 4.7960848287112565, "grad_norm": 0.24378016591072083, "learning_rate": 2.52900811673501e-07, "loss": 0.0318, "step": 52920 }, { "epoch": 4.796537973536342, "grad_norm": 0.5767642259597778, "learning_rate": 2.517799453552183e-07, "loss": 0.0323, "step": 52925 }, { "epoch": 4.796991118361428, "grad_norm": 0.2901100516319275, "learning_rate": 2.5066155584716754e-07, "loss": 0.0274, "step": 52930 }, { "epoch": 4.797444263186515, "grad_norm": 0.36835575103759766, "learning_rate": 2.495456432612758e-07, "loss": 0.0298, "step": 52935 }, { "epoch": 4.7978974080116, "grad_norm": 0.38531213998794556, "learning_rate": 2.484322077092205e-07, "loss": 0.03, "step": 52940 }, { "epoch": 4.798350552836687, "grad_norm": 0.321704626083374, "learning_rate": 2.473212493024374e-07, "loss": 0.0318, "step": 52945 }, { "epoch": 4.798803697661773, "grad_norm": 0.3048117458820343, "learning_rate": 2.4621276815211257e-07, "loss": 0.0466, "step": 52950 }, { "epoch": 4.799256842486859, "grad_norm": 0.3910484313964844, "learning_rate": 2.4510676436917954e-07, "loss": 0.0284, "step": 52955 }, { "epoch": 4.799709987311945, "grad_norm": 0.2665465772151947, "learning_rate": 2.440032380643331e-07, "loss": 0.0291, "step": 52960 }, { "epoch": 4.800163132137031, "grad_norm": 0.4459267854690552, "learning_rate": 2.429021893480099e-07, "loss": 0.0303, "step": 52965 }, { "epoch": 4.800616276962117, "grad_norm": 0.27075621485710144, "learning_rate": 2.418036183304023e-07, "loss": 0.0292, "step": 52970 }, { "epoch": 4.801069421787203, "grad_norm": 0.3519892990589142, "learning_rate": 2.407075251214613e-07, "loss": 0.0296, "step": 52975 }, { "epoch": 4.801522566612289, "grad_norm": 0.39145922660827637, "learning_rate": 2.396139098308797e-07, "loss": 0.0303, "step": 52980 }, { "epoch": 4.801975711437375, "grad_norm": 0.4382132291793823, "learning_rate": 2.3852277256810885e-07, "loss": 0.0309, "step": 52985 }, { "epoch": 4.802428856262462, "grad_norm": 0.32836484909057617, "learning_rate": 2.3743411344235022e-07, "loss": 0.0317, "step": 52990 }, { "epoch": 4.802882001087547, "grad_norm": 0.27509933710098267, "learning_rate": 2.3634793256255562e-07, "loss": 0.0306, "step": 52995 }, { "epoch": 4.803335145912634, "grad_norm": 0.32808470726013184, "learning_rate": 2.3526423003743247e-07, "loss": 0.0281, "step": 53000 }, { "epoch": 4.80378829073772, "grad_norm": 0.37328505516052246, "learning_rate": 2.3418300597543575e-07, "loss": 0.0296, "step": 53005 }, { "epoch": 4.804241435562806, "grad_norm": 0.46846622228622437, "learning_rate": 2.3310426048477608e-07, "loss": 0.0377, "step": 53010 }, { "epoch": 4.804694580387892, "grad_norm": 0.30880483984947205, "learning_rate": 2.3202799367341432e-07, "loss": 0.0304, "step": 53015 }, { "epoch": 4.805147725212978, "grad_norm": 0.26813146471977234, "learning_rate": 2.3095420564906157e-07, "loss": 0.0317, "step": 53020 }, { "epoch": 4.805600870038064, "grad_norm": 0.30157095193862915, "learning_rate": 2.2988289651919016e-07, "loss": 0.0287, "step": 53025 }, { "epoch": 4.80605401486315, "grad_norm": 0.3945271670818329, "learning_rate": 2.2881406639100878e-07, "loss": 0.0304, "step": 53030 }, { "epoch": 4.806507159688237, "grad_norm": 0.27491438388824463, "learning_rate": 2.277477153714902e-07, "loss": 0.0329, "step": 53035 }, { "epoch": 4.806960304513322, "grad_norm": 0.3160557150840759, "learning_rate": 2.266838435673546e-07, "loss": 0.029, "step": 53040 }, { "epoch": 4.807413449338409, "grad_norm": 0.331682950258255, "learning_rate": 2.2562245108507242e-07, "loss": 0.03, "step": 53045 }, { "epoch": 4.807866594163494, "grad_norm": 0.27613958716392517, "learning_rate": 2.2456353803087527e-07, "loss": 0.0276, "step": 53050 }, { "epoch": 4.808319738988581, "grad_norm": 0.33196139335632324, "learning_rate": 2.2350710451072844e-07, "loss": 0.0307, "step": 53055 }, { "epoch": 4.808772883813667, "grad_norm": 0.35587388277053833, "learning_rate": 2.2245315063036954e-07, "loss": 0.0309, "step": 53060 }, { "epoch": 4.8092260286387525, "grad_norm": 0.27125027775764465, "learning_rate": 2.2140167649527809e-07, "loss": 0.028, "step": 53065 }, { "epoch": 4.809679173463839, "grad_norm": 0.3491378724575043, "learning_rate": 2.203526822106783e-07, "loss": 0.0317, "step": 53070 }, { "epoch": 4.810132318288925, "grad_norm": 0.4877491891384125, "learning_rate": 2.1930616788156388e-07, "loss": 0.0309, "step": 53075 }, { "epoch": 4.810585463114011, "grad_norm": 0.7574241161346436, "learning_rate": 2.1826213361265947e-07, "loss": 0.0446, "step": 53080 }, { "epoch": 4.811038607939097, "grad_norm": 0.27118057012557983, "learning_rate": 2.1722057950846198e-07, "loss": 0.0296, "step": 53085 }, { "epoch": 4.811491752764184, "grad_norm": 0.31461837887763977, "learning_rate": 2.161815056732075e-07, "loss": 0.0289, "step": 53090 }, { "epoch": 4.811944897589269, "grad_norm": 0.42136356234550476, "learning_rate": 2.1514491221088228e-07, "loss": 0.0326, "step": 53095 }, { "epoch": 4.8123980424143555, "grad_norm": 0.3269018828868866, "learning_rate": 2.141107992252367e-07, "loss": 0.0276, "step": 53100 }, { "epoch": 4.812851187239442, "grad_norm": 0.3520355820655823, "learning_rate": 2.1307916681975737e-07, "loss": 0.0288, "step": 53105 }, { "epoch": 4.8133043320645275, "grad_norm": 0.41076505184173584, "learning_rate": 2.1205001509769784e-07, "loss": 0.0502, "step": 53110 }, { "epoch": 4.813757476889614, "grad_norm": 0.26463890075683594, "learning_rate": 2.1102334416205072e-07, "loss": 0.028, "step": 53115 }, { "epoch": 4.8142106217147, "grad_norm": 0.3242962062358856, "learning_rate": 2.0999915411556714e-07, "loss": 0.0304, "step": 53120 }, { "epoch": 4.814663766539786, "grad_norm": 0.40324053168296814, "learning_rate": 2.0897744506075122e-07, "loss": 0.0313, "step": 53125 }, { "epoch": 4.815116911364872, "grad_norm": 0.2946799397468567, "learning_rate": 2.0795821709984897e-07, "loss": 0.0289, "step": 53130 }, { "epoch": 4.815570056189959, "grad_norm": 0.3257530629634857, "learning_rate": 2.0694147033487044e-07, "loss": 0.0282, "step": 53135 }, { "epoch": 4.816023201015044, "grad_norm": 0.2634826600551605, "learning_rate": 2.059272048675731e-07, "loss": 0.0361, "step": 53140 }, { "epoch": 4.8164763458401305, "grad_norm": 0.5027286410331726, "learning_rate": 2.0491542079945912e-07, "loss": 0.0362, "step": 53145 }, { "epoch": 4.816929490665217, "grad_norm": 0.301294207572937, "learning_rate": 2.039061182317975e-07, "loss": 0.03, "step": 53150 }, { "epoch": 4.817382635490302, "grad_norm": 0.2704896628856659, "learning_rate": 2.02899297265588e-07, "loss": 0.0326, "step": 53155 }, { "epoch": 4.817835780315389, "grad_norm": 0.28690361976623535, "learning_rate": 2.0189495800160275e-07, "loss": 0.0272, "step": 53160 }, { "epoch": 4.818288925140475, "grad_norm": 0.3024612069129944, "learning_rate": 2.0089310054035304e-07, "loss": 0.0283, "step": 53165 }, { "epoch": 4.818742069965561, "grad_norm": 0.3739992380142212, "learning_rate": 1.9989372498210313e-07, "loss": 0.0281, "step": 53170 }, { "epoch": 4.819195214790647, "grad_norm": 0.2744460701942444, "learning_rate": 1.9889683142687298e-07, "loss": 0.0316, "step": 53175 }, { "epoch": 4.819648359615734, "grad_norm": 0.31531763076782227, "learning_rate": 1.9790241997443282e-07, "loss": 0.0297, "step": 53180 }, { "epoch": 4.820101504440819, "grad_norm": 0.3008275628089905, "learning_rate": 1.9691049072430577e-07, "loss": 0.0291, "step": 53185 }, { "epoch": 4.8205546492659055, "grad_norm": 0.34694376587867737, "learning_rate": 1.9592104377575692e-07, "loss": 0.0299, "step": 53190 }, { "epoch": 4.821007794090992, "grad_norm": 0.35655003786087036, "learning_rate": 1.9493407922781815e-07, "loss": 0.0299, "step": 53195 }, { "epoch": 4.821460938916077, "grad_norm": 0.5082178115844727, "learning_rate": 1.9394959717926042e-07, "loss": 0.0379, "step": 53200 }, { "epoch": 4.821914083741164, "grad_norm": 0.2482839673757553, "learning_rate": 1.9296759772861327e-07, "loss": 0.0313, "step": 53205 }, { "epoch": 4.82236722856625, "grad_norm": 0.3473127484321594, "learning_rate": 1.9198808097415365e-07, "loss": 0.0331, "step": 53210 }, { "epoch": 4.822820373391336, "grad_norm": 0.3832574486732483, "learning_rate": 1.9101104701391704e-07, "loss": 0.0305, "step": 53215 }, { "epoch": 4.823273518216422, "grad_norm": 0.2456754595041275, "learning_rate": 1.9003649594567797e-07, "loss": 0.0293, "step": 53220 }, { "epoch": 4.8237266630415085, "grad_norm": 0.31294775009155273, "learning_rate": 1.8906442786697509e-07, "loss": 0.0308, "step": 53225 }, { "epoch": 4.824179807866594, "grad_norm": 0.3443145751953125, "learning_rate": 1.8809484287508893e-07, "loss": 0.0296, "step": 53230 }, { "epoch": 4.8246329526916805, "grad_norm": 0.2968918979167938, "learning_rate": 1.871277410670613e-07, "loss": 0.0284, "step": 53235 }, { "epoch": 4.825086097516766, "grad_norm": 0.4747791290283203, "learning_rate": 1.8616312253967872e-07, "loss": 0.0345, "step": 53240 }, { "epoch": 4.825539242341852, "grad_norm": 0.20984043180942535, "learning_rate": 1.85200987389475e-07, "loss": 0.0296, "step": 53245 }, { "epoch": 4.825992387166939, "grad_norm": 0.246623232960701, "learning_rate": 1.8424133571274814e-07, "loss": 0.0383, "step": 53250 }, { "epoch": 4.826445531992024, "grad_norm": 0.49114537239074707, "learning_rate": 1.8328416760553524e-07, "loss": 0.0299, "step": 53255 }, { "epoch": 4.826898676817111, "grad_norm": 0.39044076204299927, "learning_rate": 1.8232948316363464e-07, "loss": 0.0301, "step": 53260 }, { "epoch": 4.827351821642197, "grad_norm": 0.3862892687320709, "learning_rate": 1.8137728248258933e-07, "loss": 0.0312, "step": 53265 }, { "epoch": 4.827804966467283, "grad_norm": 0.33437642455101013, "learning_rate": 1.804275656576926e-07, "loss": 0.0294, "step": 53270 }, { "epoch": 4.828258111292369, "grad_norm": 0.3193496763706207, "learning_rate": 1.7948033278399612e-07, "loss": 0.0342, "step": 53275 }, { "epoch": 4.828711256117455, "grad_norm": 0.39583441615104675, "learning_rate": 1.785355839562991e-07, "loss": 0.0292, "step": 53280 }, { "epoch": 4.829164400942541, "grad_norm": 0.4888588786125183, "learning_rate": 1.775933192691509e-07, "loss": 0.0312, "step": 53285 }, { "epoch": 4.829617545767627, "grad_norm": 0.2580335736274719, "learning_rate": 1.7665353881685664e-07, "loss": 0.0287, "step": 53290 }, { "epoch": 4.830070690592714, "grad_norm": 0.3612678349018097, "learning_rate": 1.757162426934661e-07, "loss": 0.0388, "step": 53295 }, { "epoch": 4.830523835417799, "grad_norm": 0.2682831585407257, "learning_rate": 1.7478143099278755e-07, "loss": 0.027, "step": 53300 }, { "epoch": 4.830976980242886, "grad_norm": 0.37816667556762695, "learning_rate": 1.7384910380837393e-07, "loss": 0.029, "step": 53305 }, { "epoch": 4.831430125067972, "grad_norm": 0.2804603576660156, "learning_rate": 1.7291926123353398e-07, "loss": 0.0279, "step": 53310 }, { "epoch": 4.831883269893058, "grad_norm": 0.34449145197868347, "learning_rate": 1.7199190336132654e-07, "loss": 0.0291, "step": 53315 }, { "epoch": 4.832336414718144, "grad_norm": 0.3211376368999481, "learning_rate": 1.7106703028456629e-07, "loss": 0.0309, "step": 53320 }, { "epoch": 4.8327895595432295, "grad_norm": 0.25735095143318176, "learning_rate": 1.7014464209580693e-07, "loss": 0.0303, "step": 53325 }, { "epoch": 4.833242704368316, "grad_norm": 0.3962363004684448, "learning_rate": 1.6922473888736912e-07, "loss": 0.0337, "step": 53330 }, { "epoch": 4.833695849193402, "grad_norm": 0.3100205361843109, "learning_rate": 1.683073207513125e-07, "loss": 0.0274, "step": 53335 }, { "epoch": 4.834148994018488, "grad_norm": 0.2437240332365036, "learning_rate": 1.6739238777945255e-07, "loss": 0.0285, "step": 53340 }, { "epoch": 4.834602138843574, "grad_norm": 0.25928279757499695, "learning_rate": 1.6647994006335765e-07, "loss": 0.0301, "step": 53345 }, { "epoch": 4.835055283668661, "grad_norm": 0.2735882103443146, "learning_rate": 1.6556997769434368e-07, "loss": 0.0295, "step": 53350 }, { "epoch": 4.835508428493746, "grad_norm": 0.4169546365737915, "learning_rate": 1.64662500763485e-07, "loss": 0.0388, "step": 53355 }, { "epoch": 4.835961573318833, "grad_norm": 0.3079918920993805, "learning_rate": 1.6375750936159505e-07, "loss": 0.0289, "step": 53360 }, { "epoch": 4.836414718143919, "grad_norm": 0.28627830743789673, "learning_rate": 1.628550035792542e-07, "loss": 0.0283, "step": 53365 }, { "epoch": 4.8368678629690045, "grad_norm": 0.6688147187232971, "learning_rate": 1.6195498350677628e-07, "loss": 0.0331, "step": 53370 }, { "epoch": 4.837321007794091, "grad_norm": 0.2622286379337311, "learning_rate": 1.6105744923424204e-07, "loss": 0.0295, "step": 53375 }, { "epoch": 4.837774152619177, "grad_norm": 0.30224597454071045, "learning_rate": 1.6016240085147682e-07, "loss": 0.0288, "step": 53380 }, { "epoch": 4.838227297444263, "grad_norm": 0.3541781008243561, "learning_rate": 1.5926983844805067e-07, "loss": 0.0291, "step": 53385 }, { "epoch": 4.838680442269349, "grad_norm": 0.3182619512081146, "learning_rate": 1.583797621133004e-07, "loss": 0.0283, "step": 53390 }, { "epoch": 4.839133587094436, "grad_norm": 0.29726627469062805, "learning_rate": 1.5749217193630207e-07, "loss": 0.0318, "step": 53395 }, { "epoch": 4.839586731919521, "grad_norm": 0.26568982005119324, "learning_rate": 1.5660706800588177e-07, "loss": 0.0295, "step": 53400 }, { "epoch": 4.840039876744608, "grad_norm": 0.37123286724090576, "learning_rate": 1.55724450410627e-07, "loss": 0.0303, "step": 53405 }, { "epoch": 4.840493021569694, "grad_norm": 0.31161075830459595, "learning_rate": 1.548443192388671e-07, "loss": 0.0307, "step": 53410 }, { "epoch": 4.8409461663947795, "grad_norm": 0.49023744463920593, "learning_rate": 1.539666745786872e-07, "loss": 0.0331, "step": 53415 }, { "epoch": 4.841399311219866, "grad_norm": 0.28584641218185425, "learning_rate": 1.5309151651791975e-07, "loss": 0.027, "step": 53420 }, { "epoch": 4.841852456044952, "grad_norm": 0.3354950249195099, "learning_rate": 1.5221884514415308e-07, "loss": 0.0329, "step": 53425 }, { "epoch": 4.842305600870038, "grad_norm": 0.28786495327949524, "learning_rate": 1.5134866054472562e-07, "loss": 0.0285, "step": 53430 }, { "epoch": 4.842758745695124, "grad_norm": 0.2858549654483795, "learning_rate": 1.5048096280672607e-07, "loss": 0.0312, "step": 53435 }, { "epoch": 4.843211890520211, "grad_norm": 0.29671815037727356, "learning_rate": 1.496157520169905e-07, "loss": 0.0333, "step": 53440 }, { "epoch": 4.843665035345296, "grad_norm": 0.5035364627838135, "learning_rate": 1.487530282621108e-07, "loss": 0.0332, "step": 53445 }, { "epoch": 4.8441181801703825, "grad_norm": 0.3162417709827423, "learning_rate": 1.4789279162842618e-07, "loss": 0.0281, "step": 53450 }, { "epoch": 4.844571324995469, "grad_norm": 0.32134518027305603, "learning_rate": 1.4703504220203722e-07, "loss": 0.0361, "step": 53455 }, { "epoch": 4.8450244698205545, "grad_norm": 0.31055542826652527, "learning_rate": 1.4617978006878085e-07, "loss": 0.0284, "step": 53460 }, { "epoch": 4.845477614645641, "grad_norm": 0.27630794048309326, "learning_rate": 1.4532700531425249e-07, "loss": 0.0299, "step": 53465 }, { "epoch": 4.845930759470727, "grad_norm": 0.3648737967014313, "learning_rate": 1.4447671802380047e-07, "loss": 0.0383, "step": 53470 }, { "epoch": 4.846383904295813, "grad_norm": 0.3237707018852234, "learning_rate": 1.4362891828252068e-07, "loss": 0.0317, "step": 53475 }, { "epoch": 4.846837049120899, "grad_norm": 0.3432885706424713, "learning_rate": 1.4278360617526188e-07, "loss": 0.0297, "step": 53480 }, { "epoch": 4.847290193945986, "grad_norm": 0.2729429602622986, "learning_rate": 1.4194078178662307e-07, "loss": 0.0281, "step": 53485 }, { "epoch": 4.847743338771071, "grad_norm": 0.22779463231563568, "learning_rate": 1.4110044520095622e-07, "loss": 0.0288, "step": 53490 }, { "epoch": 4.8481964835961575, "grad_norm": 0.40915820002555847, "learning_rate": 1.4026259650235796e-07, "loss": 0.0297, "step": 53495 }, { "epoch": 4.848649628421244, "grad_norm": 0.33027195930480957, "learning_rate": 1.3942723577468064e-07, "loss": 0.0297, "step": 53500 }, { "epoch": 4.849102773246329, "grad_norm": 0.542305052280426, "learning_rate": 1.385943631015324e-07, "loss": 0.0314, "step": 53505 }, { "epoch": 4.849555918071416, "grad_norm": 0.33005058765411377, "learning_rate": 1.37763978566266e-07, "loss": 0.0286, "step": 53510 }, { "epoch": 4.850009062896501, "grad_norm": 0.746364414691925, "learning_rate": 1.369360822519844e-07, "loss": 0.0345, "step": 53515 }, { "epoch": 4.850462207721588, "grad_norm": 0.4115358889102936, "learning_rate": 1.3611067424154354e-07, "loss": 0.0338, "step": 53520 }, { "epoch": 4.850915352546674, "grad_norm": 0.40727078914642334, "learning_rate": 1.3528775461754961e-07, "loss": 0.0305, "step": 53525 }, { "epoch": 4.85136849737176, "grad_norm": 0.35050368309020996, "learning_rate": 1.3446732346236724e-07, "loss": 0.0313, "step": 53530 }, { "epoch": 4.851821642196846, "grad_norm": 0.31883272528648376, "learning_rate": 1.336493808580974e-07, "loss": 0.0288, "step": 53535 }, { "epoch": 4.8522747870219325, "grad_norm": 0.26497188210487366, "learning_rate": 1.3283392688660522e-07, "loss": 0.0289, "step": 53540 }, { "epoch": 4.852727931847018, "grad_norm": 0.29312893748283386, "learning_rate": 1.3202096162950039e-07, "loss": 0.028, "step": 53545 }, { "epoch": 4.853181076672104, "grad_norm": 0.25230348110198975, "learning_rate": 1.312104851681456e-07, "loss": 0.0283, "step": 53550 }, { "epoch": 4.853634221497191, "grad_norm": 0.2689155340194702, "learning_rate": 1.304024975836482e-07, "loss": 0.0277, "step": 53555 }, { "epoch": 4.854087366322276, "grad_norm": 0.25694912672042847, "learning_rate": 1.295969989568796e-07, "loss": 0.0286, "step": 53560 }, { "epoch": 4.854540511147363, "grad_norm": 0.23807471990585327, "learning_rate": 1.2879398936845033e-07, "loss": 0.0277, "step": 53565 }, { "epoch": 4.854993655972449, "grad_norm": 0.24800340831279755, "learning_rate": 1.2799346889872388e-07, "loss": 0.0341, "step": 53570 }, { "epoch": 4.855446800797535, "grad_norm": 0.2959878444671631, "learning_rate": 1.2719543762782227e-07, "loss": 0.0308, "step": 53575 }, { "epoch": 4.855899945622621, "grad_norm": 0.27843669056892395, "learning_rate": 1.2639989563560662e-07, "loss": 0.0392, "step": 53580 }, { "epoch": 4.8563530904477075, "grad_norm": 0.2692103683948517, "learning_rate": 1.2560684300169934e-07, "loss": 0.0356, "step": 53585 }, { "epoch": 4.856806235272793, "grad_norm": 0.34688088297843933, "learning_rate": 1.248162798054703e-07, "loss": 0.0289, "step": 53590 }, { "epoch": 4.857259380097879, "grad_norm": 0.510991096496582, "learning_rate": 1.2402820612603393e-07, "loss": 0.0374, "step": 53595 }, { "epoch": 4.857712524922965, "grad_norm": 0.34228795766830444, "learning_rate": 1.2324262204226612e-07, "loss": 0.0322, "step": 53600 }, { "epoch": 4.858165669748051, "grad_norm": 0.2170344889163971, "learning_rate": 1.2245952763278446e-07, "loss": 0.03, "step": 53605 }, { "epoch": 4.858618814573138, "grad_norm": 0.3253538906574249, "learning_rate": 1.216789229759624e-07, "loss": 0.0286, "step": 53610 }, { "epoch": 4.859071959398223, "grad_norm": 0.3047889769077301, "learning_rate": 1.2090080814992634e-07, "loss": 0.0296, "step": 53615 }, { "epoch": 4.85952510422331, "grad_norm": 0.592846155166626, "learning_rate": 1.201251832325445e-07, "loss": 0.0372, "step": 53620 }, { "epoch": 4.859978249048396, "grad_norm": 0.3257533013820648, "learning_rate": 1.193520483014493e-07, "loss": 0.0312, "step": 53625 }, { "epoch": 4.8604313938734816, "grad_norm": 0.2832227945327759, "learning_rate": 1.1858140343400936e-07, "loss": 0.0304, "step": 53630 }, { "epoch": 4.860884538698568, "grad_norm": 0.37285086512565613, "learning_rate": 1.1781324870735466e-07, "loss": 0.0278, "step": 53635 }, { "epoch": 4.861337683523654, "grad_norm": 0.3598077595233917, "learning_rate": 1.1704758419835982e-07, "loss": 0.04, "step": 53640 }, { "epoch": 4.86179082834874, "grad_norm": 0.31647589802742004, "learning_rate": 1.1628440998365519e-07, "loss": 0.0333, "step": 53645 }, { "epoch": 4.862243973173826, "grad_norm": 0.28991684317588806, "learning_rate": 1.1552372613962137e-07, "loss": 0.0277, "step": 53650 }, { "epoch": 4.862697117998913, "grad_norm": 0.3272274136543274, "learning_rate": 1.1476553274238356e-07, "loss": 0.0309, "step": 53655 }, { "epoch": 4.863150262823998, "grad_norm": 0.2793528139591217, "learning_rate": 1.1400982986782271e-07, "loss": 0.0293, "step": 53660 }, { "epoch": 4.863603407649085, "grad_norm": 0.25367510318756104, "learning_rate": 1.1325661759157003e-07, "loss": 0.0362, "step": 53665 }, { "epoch": 4.864056552474171, "grad_norm": 0.35506221652030945, "learning_rate": 1.1250589598900962e-07, "loss": 0.029, "step": 53670 }, { "epoch": 4.8645096972992565, "grad_norm": 0.3196638524532318, "learning_rate": 1.1175766513527308e-07, "loss": 0.0284, "step": 53675 }, { "epoch": 4.864962842124343, "grad_norm": 0.35386529564857483, "learning_rate": 1.1101192510524217e-07, "loss": 0.0295, "step": 53680 }, { "epoch": 4.865415986949429, "grad_norm": 0.3016529679298401, "learning_rate": 1.1026867597355162e-07, "loss": 0.0281, "step": 53685 }, { "epoch": 4.865869131774515, "grad_norm": 0.3223571181297302, "learning_rate": 1.0952791781458637e-07, "loss": 0.0324, "step": 53690 }, { "epoch": 4.866322276599601, "grad_norm": 0.3492845892906189, "learning_rate": 1.0878965070247882e-07, "loss": 0.0308, "step": 53695 }, { "epoch": 4.866775421424688, "grad_norm": 0.26267123222351074, "learning_rate": 1.0805387471111983e-07, "loss": 0.04, "step": 53700 }, { "epoch": 4.867228566249773, "grad_norm": 0.2745462954044342, "learning_rate": 1.0732058991414218e-07, "loss": 0.0304, "step": 53705 }, { "epoch": 4.86768171107486, "grad_norm": 0.4124862551689148, "learning_rate": 1.0658979638493716e-07, "loss": 0.0297, "step": 53710 }, { "epoch": 4.868134855899946, "grad_norm": 0.34056076407432556, "learning_rate": 1.0586149419664071e-07, "loss": 0.0337, "step": 53715 }, { "epoch": 4.8685880007250315, "grad_norm": 0.2985450029373169, "learning_rate": 1.0513568342213897e-07, "loss": 0.0284, "step": 53720 }, { "epoch": 4.869041145550118, "grad_norm": 0.3721948564052582, "learning_rate": 1.0441236413407385e-07, "loss": 0.0302, "step": 53725 }, { "epoch": 4.869494290375204, "grad_norm": 0.2948697805404663, "learning_rate": 1.0369153640483742e-07, "loss": 0.03, "step": 53730 }, { "epoch": 4.86994743520029, "grad_norm": 0.4261783957481384, "learning_rate": 1.0297320030656643e-07, "loss": 0.0343, "step": 53735 }, { "epoch": 4.870400580025376, "grad_norm": 0.3700468838214874, "learning_rate": 1.0225735591115615e-07, "loss": 0.0337, "step": 53740 }, { "epoch": 4.870853724850463, "grad_norm": 0.28579428791999817, "learning_rate": 1.0154400329024371e-07, "loss": 0.0281, "step": 53745 }, { "epoch": 4.871306869675548, "grad_norm": 0.21924571692943573, "learning_rate": 1.008331425152248e-07, "loss": 0.0277, "step": 53750 }, { "epoch": 4.871760014500635, "grad_norm": 0.24489715695381165, "learning_rate": 1.0012477365724249e-07, "loss": 0.0287, "step": 53755 }, { "epoch": 4.872213159325721, "grad_norm": 0.39191561937332153, "learning_rate": 9.941889678719285e-08, "loss": 0.0408, "step": 53760 }, { "epoch": 4.8726663041508065, "grad_norm": 0.22272147238254547, "learning_rate": 9.871551197571383e-08, "loss": 0.0295, "step": 53765 }, { "epoch": 4.873119448975893, "grad_norm": 0.29583582282066345, "learning_rate": 9.801461929320743e-08, "loss": 0.0298, "step": 53770 }, { "epoch": 4.873572593800979, "grad_norm": 0.3276205062866211, "learning_rate": 9.731621880981479e-08, "loss": 0.0294, "step": 53775 }, { "epoch": 4.874025738626065, "grad_norm": 0.26794302463531494, "learning_rate": 9.662031059543276e-08, "loss": 0.0292, "step": 53780 }, { "epoch": 4.874478883451151, "grad_norm": 0.3137385845184326, "learning_rate": 9.592689471970839e-08, "loss": 0.0288, "step": 53785 }, { "epoch": 4.874932028276237, "grad_norm": 0.2821750044822693, "learning_rate": 9.523597125203898e-08, "loss": 0.0304, "step": 53790 }, { "epoch": 4.875385173101323, "grad_norm": 0.38251855969429016, "learning_rate": 9.454754026157475e-08, "loss": 0.0316, "step": 53795 }, { "epoch": 4.8758383179264095, "grad_norm": 0.3085732161998749, "learning_rate": 9.38616018172106e-08, "loss": 0.0277, "step": 53800 }, { "epoch": 4.876291462751495, "grad_norm": 0.3398476839065552, "learning_rate": 9.317815598759716e-08, "loss": 0.0298, "step": 53805 }, { "epoch": 4.8767446075765815, "grad_norm": 0.37035590410232544, "learning_rate": 9.24972028411325e-08, "loss": 0.0281, "step": 53810 }, { "epoch": 4.877197752401668, "grad_norm": 0.2763870656490326, "learning_rate": 9.181874244597044e-08, "loss": 0.0347, "step": 53815 }, { "epoch": 4.877650897226753, "grad_norm": 0.40356001257896423, "learning_rate": 9.114277487000666e-08, "loss": 0.03, "step": 53820 }, { "epoch": 4.87810404205184, "grad_norm": 0.2791377604007721, "learning_rate": 9.046930018089539e-08, "loss": 0.0297, "step": 53825 }, { "epoch": 4.878557186876926, "grad_norm": 0.2751506268978119, "learning_rate": 8.979831844603548e-08, "loss": 0.0287, "step": 53830 }, { "epoch": 4.879010331702012, "grad_norm": 0.45597952604293823, "learning_rate": 8.912982973258154e-08, "loss": 0.0315, "step": 53835 }, { "epoch": 4.879463476527098, "grad_norm": 0.4019695222377777, "learning_rate": 8.846383410743841e-08, "loss": 0.0376, "step": 53840 }, { "epoch": 4.8799166213521845, "grad_norm": 0.36334428191185, "learning_rate": 8.780033163725276e-08, "loss": 0.0304, "step": 53845 }, { "epoch": 4.88036976617727, "grad_norm": 0.3325856328010559, "learning_rate": 8.713932238843258e-08, "loss": 0.03, "step": 53850 }, { "epoch": 4.880822911002356, "grad_norm": 0.3478330671787262, "learning_rate": 8.648080642713053e-08, "loss": 0.0325, "step": 53855 }, { "epoch": 4.881276055827443, "grad_norm": 0.3609403669834137, "learning_rate": 8.582478381925218e-08, "loss": 0.0287, "step": 53860 }, { "epoch": 4.881729200652528, "grad_norm": 0.3179287910461426, "learning_rate": 8.517125463045062e-08, "loss": 0.0311, "step": 53865 }, { "epoch": 4.882182345477615, "grad_norm": 0.2558819353580475, "learning_rate": 8.452021892613182e-08, "loss": 0.0295, "step": 53870 }, { "epoch": 4.8826354903027, "grad_norm": 0.3183493912220001, "learning_rate": 8.387167677145203e-08, "loss": 0.0288, "step": 53875 }, { "epoch": 4.883088635127787, "grad_norm": 0.284078449010849, "learning_rate": 8.32256282313204e-08, "loss": 0.029, "step": 53880 }, { "epoch": 4.883541779952873, "grad_norm": 0.29981932044029236, "learning_rate": 8.258207337038803e-08, "loss": 0.0319, "step": 53885 }, { "epoch": 4.883994924777959, "grad_norm": 0.4500233829021454, "learning_rate": 8.194101225306728e-08, "loss": 0.0322, "step": 53890 }, { "epoch": 4.884448069603045, "grad_norm": 0.27410629391670227, "learning_rate": 8.130244494351513e-08, "loss": 0.0296, "step": 53895 }, { "epoch": 4.884901214428131, "grad_norm": 0.2478155642747879, "learning_rate": 8.066637150563327e-08, "loss": 0.035, "step": 53900 }, { "epoch": 4.885354359253217, "grad_norm": 0.2925560474395752, "learning_rate": 8.003279200309021e-08, "loss": 0.032, "step": 53905 }, { "epoch": 4.885807504078303, "grad_norm": 0.26386645436286926, "learning_rate": 7.940170649928524e-08, "loss": 0.0277, "step": 53910 }, { "epoch": 4.88626064890339, "grad_norm": 0.31314152479171753, "learning_rate": 7.877311505738449e-08, "loss": 0.0282, "step": 53915 }, { "epoch": 4.886713793728475, "grad_norm": 0.30654606223106384, "learning_rate": 7.81470177402932e-08, "loss": 0.0299, "step": 53920 }, { "epoch": 4.887166938553562, "grad_norm": 0.28305256366729736, "learning_rate": 7.752341461067237e-08, "loss": 0.0309, "step": 53925 }, { "epoch": 4.887620083378648, "grad_norm": 0.2891874313354492, "learning_rate": 7.690230573093593e-08, "loss": 0.0293, "step": 53930 }, { "epoch": 4.888073228203734, "grad_norm": 0.23849327862262726, "learning_rate": 7.628369116324252e-08, "loss": 0.0361, "step": 53935 }, { "epoch": 4.88852637302882, "grad_norm": 0.32577621936798096, "learning_rate": 7.566757096950094e-08, "loss": 0.034, "step": 53940 }, { "epoch": 4.888979517853906, "grad_norm": 0.3341068923473358, "learning_rate": 7.505394521137299e-08, "loss": 0.031, "step": 53945 }, { "epoch": 4.889432662678992, "grad_norm": 0.30583855509757996, "learning_rate": 7.444281395027341e-08, "loss": 0.0438, "step": 53950 }, { "epoch": 4.889885807504078, "grad_norm": 0.3492557406425476, "learning_rate": 7.383417724736441e-08, "loss": 0.0341, "step": 53955 }, { "epoch": 4.890338952329165, "grad_norm": 0.5858792066574097, "learning_rate": 7.322803516355836e-08, "loss": 0.0358, "step": 53960 }, { "epoch": 4.89079209715425, "grad_norm": 0.2617092430591583, "learning_rate": 7.262438775951508e-08, "loss": 0.0323, "step": 53965 }, { "epoch": 4.891245241979337, "grad_norm": 0.2748427093029022, "learning_rate": 7.202323509565012e-08, "loss": 0.029, "step": 53970 }, { "epoch": 4.891698386804423, "grad_norm": 0.3496081829071045, "learning_rate": 7.142457723212371e-08, "loss": 0.0288, "step": 53975 }, { "epoch": 4.8921515316295086, "grad_norm": 0.39949414134025574, "learning_rate": 7.082841422885733e-08, "loss": 0.0318, "step": 53980 }, { "epoch": 4.892604676454595, "grad_norm": 0.2818411588668823, "learning_rate": 7.023474614550885e-08, "loss": 0.0341, "step": 53985 }, { "epoch": 4.893057821279681, "grad_norm": 0.2687528431415558, "learning_rate": 6.96435730414946e-08, "loss": 0.0271, "step": 53990 }, { "epoch": 4.893510966104767, "grad_norm": 0.34691694378852844, "learning_rate": 6.905489497597839e-08, "loss": 0.0294, "step": 53995 }, { "epoch": 4.893964110929853, "grad_norm": 0.6103754043579102, "learning_rate": 6.846871200787697e-08, "loss": 0.0378, "step": 54000 }, { "epoch": 4.89441725575494, "grad_norm": 0.4111710488796234, "learning_rate": 6.788502419585451e-08, "loss": 0.0313, "step": 54005 }, { "epoch": 4.894870400580025, "grad_norm": 0.2993771433830261, "learning_rate": 6.73038315983282e-08, "loss": 0.0305, "step": 54010 }, { "epoch": 4.895323545405112, "grad_norm": 0.3042665719985962, "learning_rate": 6.672513427346261e-08, "loss": 0.03, "step": 54015 }, { "epoch": 4.895776690230198, "grad_norm": 0.31074848771095276, "learning_rate": 6.614893227917251e-08, "loss": 0.0303, "step": 54020 }, { "epoch": 4.8962298350552835, "grad_norm": 0.42202281951904297, "learning_rate": 6.557522567312847e-08, "loss": 0.0299, "step": 54025 }, { "epoch": 4.89668297988037, "grad_norm": 0.452854722738266, "learning_rate": 6.500401451274563e-08, "loss": 0.0423, "step": 54030 }, { "epoch": 4.897136124705456, "grad_norm": 0.5210703015327454, "learning_rate": 6.443529885518662e-08, "loss": 0.037, "step": 54035 }, { "epoch": 4.897589269530542, "grad_norm": 0.2796189486980438, "learning_rate": 6.386907875737535e-08, "loss": 0.0333, "step": 54040 }, { "epoch": 4.898042414355628, "grad_norm": 0.24015061557292938, "learning_rate": 6.330535427597483e-08, "loss": 0.0288, "step": 54045 }, { "epoch": 4.898495559180714, "grad_norm": 0.5677340030670166, "learning_rate": 6.274412546740383e-08, "loss": 0.0452, "step": 54050 }, { "epoch": 4.8989487040058, "grad_norm": 0.27188631892204285, "learning_rate": 6.218539238783405e-08, "loss": 0.0295, "step": 54055 }, { "epoch": 4.899401848830887, "grad_norm": 0.35844096541404724, "learning_rate": 6.162915509317913e-08, "loss": 0.0283, "step": 54060 }, { "epoch": 4.899854993655972, "grad_norm": 0.30714064836502075, "learning_rate": 6.107541363910563e-08, "loss": 0.0314, "step": 54065 }, { "epoch": 4.9003081384810585, "grad_norm": 0.28070276975631714, "learning_rate": 6.052416808103867e-08, "loss": 0.0276, "step": 54070 }, { "epoch": 4.900761283306145, "grad_norm": 0.29225993156433105, "learning_rate": 5.997541847414246e-08, "loss": 0.0296, "step": 54075 }, { "epoch": 4.90121442813123, "grad_norm": 0.25996485352516174, "learning_rate": 5.942916487333694e-08, "loss": 0.0289, "step": 54080 }, { "epoch": 4.901667572956317, "grad_norm": 0.32522067427635193, "learning_rate": 5.888540733329506e-08, "loss": 0.0282, "step": 54085 }, { "epoch": 4.902120717781403, "grad_norm": 0.4756859838962555, "learning_rate": 5.8344145908428846e-08, "loss": 0.0359, "step": 54090 }, { "epoch": 4.902573862606489, "grad_norm": 0.3223693370819092, "learning_rate": 5.780538065291441e-08, "loss": 0.0345, "step": 54095 }, { "epoch": 4.903027007431575, "grad_norm": 0.329285204410553, "learning_rate": 5.7269111620669725e-08, "loss": 0.0275, "step": 54100 }, { "epoch": 4.903480152256662, "grad_norm": 0.279446005821228, "learning_rate": 5.673533886536297e-08, "loss": 0.0348, "step": 54105 }, { "epoch": 4.903933297081747, "grad_norm": 0.6186196804046631, "learning_rate": 5.6204062440415315e-08, "loss": 0.0317, "step": 54110 }, { "epoch": 4.9043864419068335, "grad_norm": 0.34289300441741943, "learning_rate": 5.567528239899811e-08, "loss": 0.0303, "step": 54115 }, { "epoch": 4.90483958673192, "grad_norm": 0.3043326437473297, "learning_rate": 5.5148998794030124e-08, "loss": 0.0368, "step": 54120 }, { "epoch": 4.905292731557005, "grad_norm": 0.2808796465396881, "learning_rate": 5.462521167818313e-08, "loss": 0.0268, "step": 54125 }, { "epoch": 4.905745876382092, "grad_norm": 0.30115780234336853, "learning_rate": 5.41039211038763e-08, "loss": 0.0292, "step": 54130 }, { "epoch": 4.906199021207177, "grad_norm": 1.039168119430542, "learning_rate": 5.3585127123284564e-08, "loss": 0.052, "step": 54135 }, { "epoch": 4.906652166032264, "grad_norm": 0.3166641294956207, "learning_rate": 5.30688297883275e-08, "loss": 0.0295, "step": 54140 }, { "epoch": 4.90710531085735, "grad_norm": 0.31922006607055664, "learning_rate": 5.255502915067212e-08, "loss": 0.0334, "step": 54145 }, { "epoch": 4.907558455682436, "grad_norm": 0.43251317739486694, "learning_rate": 5.204372526174395e-08, "loss": 0.0288, "step": 54150 }, { "epoch": 4.908011600507522, "grad_norm": 0.3040556013584137, "learning_rate": 5.1534918172713166e-08, "loss": 0.0287, "step": 54155 }, { "epoch": 4.9084647453326085, "grad_norm": 0.32790622115135193, "learning_rate": 5.1028607934502927e-08, "loss": 0.0456, "step": 54160 }, { "epoch": 4.908917890157694, "grad_norm": 0.5729996562004089, "learning_rate": 5.0524794597783806e-08, "loss": 0.0313, "step": 54165 }, { "epoch": 4.90937103498278, "grad_norm": 0.29422876238822937, "learning_rate": 5.002347821297659e-08, "loss": 0.0523, "step": 54170 }, { "epoch": 4.909824179807867, "grad_norm": 0.3185836970806122, "learning_rate": 4.952465883025503e-08, "loss": 0.036, "step": 54175 }, { "epoch": 4.910277324632952, "grad_norm": 0.3437546491622925, "learning_rate": 4.902833649953753e-08, "loss": 0.0293, "step": 54180 }, { "epoch": 4.910730469458039, "grad_norm": 0.3254506289958954, "learning_rate": 4.8534511270501014e-08, "loss": 0.0285, "step": 54185 }, { "epoch": 4.911183614283125, "grad_norm": 0.3416763246059418, "learning_rate": 4.804318319256429e-08, "loss": 0.0356, "step": 54190 }, { "epoch": 4.911636759108211, "grad_norm": 0.4610092341899872, "learning_rate": 4.7554352314901905e-08, "loss": 0.0324, "step": 54195 }, { "epoch": 4.912089903933297, "grad_norm": 0.2877773940563202, "learning_rate": 4.7068018686433066e-08, "loss": 0.0301, "step": 54200 }, { "epoch": 4.912543048758383, "grad_norm": 0.26643016934394836, "learning_rate": 4.6584182355835504e-08, "loss": 0.0303, "step": 54205 }, { "epoch": 4.912996193583469, "grad_norm": 0.2734176814556122, "learning_rate": 4.610284337152604e-08, "loss": 0.0315, "step": 54210 }, { "epoch": 4.913449338408555, "grad_norm": 0.28663790225982666, "learning_rate": 4.5624001781680023e-08, "loss": 0.032, "step": 54215 }, { "epoch": 4.913902483233642, "grad_norm": 0.5259978175163269, "learning_rate": 4.514765763422024e-08, "loss": 0.0316, "step": 54220 }, { "epoch": 4.914355628058727, "grad_norm": 0.3276764452457428, "learning_rate": 4.467381097681966e-08, "loss": 0.0325, "step": 54225 }, { "epoch": 4.914808772883814, "grad_norm": 0.2393101155757904, "learning_rate": 4.420246185689869e-08, "loss": 0.0269, "step": 54230 }, { "epoch": 4.9152619177089, "grad_norm": 0.34009402990341187, "learning_rate": 4.37336103216307e-08, "loss": 0.0298, "step": 54235 }, { "epoch": 4.915715062533986, "grad_norm": 0.29465627670288086, "learning_rate": 4.3267256417942046e-08, "loss": 0.0297, "step": 54240 }, { "epoch": 4.916168207359072, "grad_norm": 0.30116593837738037, "learning_rate": 4.2803400192500954e-08, "loss": 0.0286, "step": 54245 }, { "epoch": 4.916621352184158, "grad_norm": 0.29055899381637573, "learning_rate": 4.234204169173417e-08, "loss": 0.0556, "step": 54250 }, { "epoch": 4.917074497009244, "grad_norm": 0.26204025745391846, "learning_rate": 4.1883180961813095e-08, "loss": 0.0369, "step": 54255 }, { "epoch": 4.91752764183433, "grad_norm": 0.3508913218975067, "learning_rate": 4.1426818048659335e-08, "loss": 0.0304, "step": 54260 }, { "epoch": 4.917980786659417, "grad_norm": 0.3695443868637085, "learning_rate": 4.0972952997947454e-08, "loss": 0.0318, "step": 54265 }, { "epoch": 4.918433931484502, "grad_norm": 0.336277574300766, "learning_rate": 4.052158585509947e-08, "loss": 0.0357, "step": 54270 }, { "epoch": 4.918887076309589, "grad_norm": 0.5078957080841064, "learning_rate": 4.0072716665287556e-08, "loss": 0.0363, "step": 54275 }, { "epoch": 4.919340221134675, "grad_norm": 0.29535019397735596, "learning_rate": 3.962634547343691e-08, "loss": 0.0292, "step": 54280 }, { "epoch": 4.919793365959761, "grad_norm": 0.2658520042896271, "learning_rate": 3.918247232422012e-08, "loss": 0.028, "step": 54285 }, { "epoch": 4.920246510784847, "grad_norm": 0.31020382046699524, "learning_rate": 3.8741097262057214e-08, "loss": 0.0287, "step": 54290 }, { "epoch": 4.920699655609933, "grad_norm": 0.48225486278533936, "learning_rate": 3.830222033112674e-08, "loss": 0.0335, "step": 54295 }, { "epoch": 4.921152800435019, "grad_norm": 0.33355656266212463, "learning_rate": 3.786584157534634e-08, "loss": 0.0292, "step": 54300 }, { "epoch": 4.921605945260105, "grad_norm": 0.30490198731422424, "learning_rate": 3.743196103838942e-08, "loss": 0.0337, "step": 54305 }, { "epoch": 4.922059090085192, "grad_norm": 0.2780824601650238, "learning_rate": 3.700057876368512e-08, "loss": 0.0285, "step": 54310 }, { "epoch": 4.922512234910277, "grad_norm": 0.2743815779685974, "learning_rate": 3.657169479439892e-08, "loss": 0.0328, "step": 54315 }, { "epoch": 4.922965379735364, "grad_norm": 0.2483830451965332, "learning_rate": 3.6145309173454824e-08, "loss": 0.0292, "step": 54320 }, { "epoch": 4.923418524560449, "grad_norm": 0.3478350043296814, "learning_rate": 3.5721421943529784e-08, "loss": 0.0299, "step": 54325 }, { "epoch": 4.9238716693855356, "grad_norm": 0.23887228965759277, "learning_rate": 3.530003314704267e-08, "loss": 0.0282, "step": 54330 }, { "epoch": 4.924324814210622, "grad_norm": 0.28254106640815735, "learning_rate": 3.4881142826168056e-08, "loss": 0.0311, "step": 54335 }, { "epoch": 4.9247779590357075, "grad_norm": 0.2889055907726288, "learning_rate": 3.4464751022827976e-08, "loss": 0.0282, "step": 54340 }, { "epoch": 4.925231103860794, "grad_norm": 0.31521034240722656, "learning_rate": 3.4050857778694656e-08, "loss": 0.0268, "step": 54345 }, { "epoch": 4.92568424868588, "grad_norm": 0.4154950678348541, "learning_rate": 3.363946313519328e-08, "loss": 0.0289, "step": 54350 }, { "epoch": 4.926137393510966, "grad_norm": 0.4105927348136902, "learning_rate": 3.323056713349093e-08, "loss": 0.0301, "step": 54355 }, { "epoch": 4.926590538336052, "grad_norm": 0.3341861069202423, "learning_rate": 3.282416981451597e-08, "loss": 0.0299, "step": 54360 }, { "epoch": 4.927043683161139, "grad_norm": 0.24354945123195648, "learning_rate": 3.2420271218938646e-08, "loss": 0.0325, "step": 54365 }, { "epoch": 4.927496827986224, "grad_norm": 0.37959858775138855, "learning_rate": 3.201887138717941e-08, "loss": 0.0316, "step": 54370 }, { "epoch": 4.9279499728113105, "grad_norm": 0.35414186120033264, "learning_rate": 3.161997035941167e-08, "loss": 0.0292, "step": 54375 }, { "epoch": 4.928403117636397, "grad_norm": 0.2614344358444214, "learning_rate": 3.122356817555905e-08, "loss": 0.0316, "step": 54380 }, { "epoch": 4.9288562624614825, "grad_norm": 0.2598377764225006, "learning_rate": 3.08296648752926e-08, "loss": 0.0317, "step": 54385 }, { "epoch": 4.929309407286569, "grad_norm": 0.3414676785469055, "learning_rate": 3.0438260498033556e-08, "loss": 0.0293, "step": 54390 }, { "epoch": 4.929762552111655, "grad_norm": 0.5133808255195618, "learning_rate": 3.004935508295337e-08, "loss": 0.033, "step": 54395 }, { "epoch": 4.930215696936741, "grad_norm": 0.3160380423069, "learning_rate": 2.9662948668976453e-08, "loss": 0.0286, "step": 54400 }, { "epoch": 4.930668841761827, "grad_norm": 0.33027762174606323, "learning_rate": 2.927904129477188e-08, "loss": 0.0285, "step": 54405 }, { "epoch": 4.931121986586913, "grad_norm": 0.27182498574256897, "learning_rate": 2.8897632998764467e-08, "loss": 0.0311, "step": 54410 }, { "epoch": 4.931575131411999, "grad_norm": 0.27906423807144165, "learning_rate": 2.851872381912091e-08, "loss": 0.031, "step": 54415 }, { "epoch": 4.9320282762370855, "grad_norm": 0.29504677653312683, "learning_rate": 2.814231379376642e-08, "loss": 0.0292, "step": 54420 }, { "epoch": 4.932481421062171, "grad_norm": 0.38107922673225403, "learning_rate": 2.7768402960370866e-08, "loss": 0.0326, "step": 54425 }, { "epoch": 4.932934565887257, "grad_norm": 0.27861669659614563, "learning_rate": 2.739699135635432e-08, "loss": 0.0278, "step": 54430 }, { "epoch": 4.933387710712344, "grad_norm": 0.3686673939228058, "learning_rate": 2.7028079018889818e-08, "loss": 0.0293, "step": 54435 }, { "epoch": 4.933840855537429, "grad_norm": 0.3655710816383362, "learning_rate": 2.6661665984895057e-08, "loss": 0.0347, "step": 54440 }, { "epoch": 4.934294000362516, "grad_norm": 0.24736419320106506, "learning_rate": 2.629775229104625e-08, "loss": 0.0285, "step": 54445 }, { "epoch": 4.934747145187602, "grad_norm": 0.4031451940536499, "learning_rate": 2.5936337973758717e-08, "loss": 0.0323, "step": 54450 }, { "epoch": 4.935200290012688, "grad_norm": 0.2547902762889862, "learning_rate": 2.5577423069206298e-08, "loss": 0.0294, "step": 54455 }, { "epoch": 4.935653434837774, "grad_norm": 0.4753323793411255, "learning_rate": 2.522100761330748e-08, "loss": 0.0282, "step": 54460 }, { "epoch": 4.9361065796628605, "grad_norm": 0.38764333724975586, "learning_rate": 2.486709164173373e-08, "loss": 0.0301, "step": 54465 }, { "epoch": 4.936559724487946, "grad_norm": 0.3445596396923065, "learning_rate": 2.4515675189901165e-08, "loss": 0.0293, "step": 54470 }, { "epoch": 4.937012869313032, "grad_norm": 0.3619995713233948, "learning_rate": 2.4166758292984424e-08, "loss": 0.0311, "step": 54475 }, { "epoch": 4.937466014138119, "grad_norm": 0.2395082414150238, "learning_rate": 2.38203409859028e-08, "loss": 0.0333, "step": 54480 }, { "epoch": 4.937919158963204, "grad_norm": 0.35836437344551086, "learning_rate": 2.3476423303325778e-08, "loss": 0.0333, "step": 54485 }, { "epoch": 4.938372303788291, "grad_norm": 0.38394895195961, "learning_rate": 2.3135005279670275e-08, "loss": 0.0274, "step": 54490 }, { "epoch": 4.938825448613377, "grad_norm": 0.5057734847068787, "learning_rate": 2.2796086949106177e-08, "loss": 0.0322, "step": 54495 }, { "epoch": 4.939278593438463, "grad_norm": 0.3165879249572754, "learning_rate": 2.2459668345553577e-08, "loss": 0.0284, "step": 54500 }, { "epoch": 4.939731738263549, "grad_norm": 0.2897060811519623, "learning_rate": 2.2125749502679983e-08, "loss": 0.0289, "step": 54505 }, { "epoch": 4.9401848830886355, "grad_norm": 0.40032464265823364, "learning_rate": 2.1794330453905887e-08, "loss": 0.0305, "step": 54510 }, { "epoch": 4.940638027913721, "grad_norm": 0.3193376064300537, "learning_rate": 2.14654112323992e-08, "loss": 0.0354, "step": 54515 }, { "epoch": 4.941091172738807, "grad_norm": 0.2885609269142151, "learning_rate": 2.1138991871080817e-08, "loss": 0.0298, "step": 54520 }, { "epoch": 4.941544317563894, "grad_norm": 0.23488658666610718, "learning_rate": 2.081507240261349e-08, "loss": 0.0277, "step": 54525 }, { "epoch": 4.941997462388979, "grad_norm": 0.25595298409461975, "learning_rate": 2.049365285941851e-08, "loss": 0.0307, "step": 54530 }, { "epoch": 4.942450607214066, "grad_norm": 0.23705753684043884, "learning_rate": 2.0174733273661818e-08, "loss": 0.0286, "step": 54535 }, { "epoch": 4.942903752039152, "grad_norm": 0.33942508697509766, "learning_rate": 1.9858313677262318e-08, "loss": 0.0362, "step": 54540 }, { "epoch": 4.943356896864238, "grad_norm": 0.32528018951416016, "learning_rate": 1.954439410188913e-08, "loss": 0.0329, "step": 54545 }, { "epoch": 4.943810041689324, "grad_norm": 0.42598891258239746, "learning_rate": 1.923297457895601e-08, "loss": 0.0311, "step": 54550 }, { "epoch": 4.94426318651441, "grad_norm": 0.28933247923851013, "learning_rate": 1.8924055139635243e-08, "loss": 0.0299, "step": 54555 }, { "epoch": 4.944716331339496, "grad_norm": 0.3705953061580658, "learning_rate": 1.8617635814835443e-08, "loss": 0.0326, "step": 54560 }, { "epoch": 4.945169476164582, "grad_norm": 0.34851616621017456, "learning_rate": 1.8313716635229295e-08, "loss": 0.0284, "step": 54565 }, { "epoch": 4.945622620989669, "grad_norm": 0.3249080777168274, "learning_rate": 1.8012297631231357e-08, "loss": 0.0299, "step": 54570 }, { "epoch": 4.946075765814754, "grad_norm": 0.25143158435821533, "learning_rate": 1.771337883300639e-08, "loss": 0.0293, "step": 54575 }, { "epoch": 4.946528910639841, "grad_norm": 0.3685983121395111, "learning_rate": 1.7416960270474902e-08, "loss": 0.0307, "step": 54580 }, { "epoch": 4.946982055464927, "grad_norm": 0.2855602204799652, "learning_rate": 1.7123041973296506e-08, "loss": 0.034, "step": 54585 }, { "epoch": 4.947435200290013, "grad_norm": 0.273388534784317, "learning_rate": 1.6831623970892107e-08, "loss": 0.0311, "step": 54590 }, { "epoch": 4.947888345115099, "grad_norm": 0.23847085237503052, "learning_rate": 1.6542706292421717e-08, "loss": 0.0296, "step": 54595 }, { "epoch": 4.9483414899401845, "grad_norm": 0.3138188421726227, "learning_rate": 1.6256288966806645e-08, "loss": 0.0321, "step": 54600 }, { "epoch": 4.948794634765271, "grad_norm": 0.2787666320800781, "learning_rate": 1.5972372022704518e-08, "loss": 0.0561, "step": 54605 }, { "epoch": 4.949247779590357, "grad_norm": 0.2110186368227005, "learning_rate": 1.5690955488534276e-08, "loss": 0.0292, "step": 54610 }, { "epoch": 4.949700924415443, "grad_norm": 0.26325172185897827, "learning_rate": 1.54120393924595e-08, "loss": 0.0319, "step": 54615 }, { "epoch": 4.950154069240529, "grad_norm": 0.2878137230873108, "learning_rate": 1.513562376239397e-08, "loss": 0.0318, "step": 54620 }, { "epoch": 4.950607214065616, "grad_norm": 0.4457131028175354, "learning_rate": 1.4861708625998893e-08, "loss": 0.0324, "step": 54625 }, { "epoch": 4.951060358890701, "grad_norm": 0.2975198030471802, "learning_rate": 1.4590294010694006e-08, "loss": 0.0285, "step": 54630 }, { "epoch": 4.951513503715788, "grad_norm": 0.2807078957557678, "learning_rate": 1.4321379943638137e-08, "loss": 0.0295, "step": 54635 }, { "epoch": 4.951966648540874, "grad_norm": 0.3370315134525299, "learning_rate": 1.405496645174309e-08, "loss": 0.0283, "step": 54640 }, { "epoch": 4.9524197933659595, "grad_norm": 0.32623258233070374, "learning_rate": 1.379105356167365e-08, "loss": 0.0322, "step": 54645 }, { "epoch": 4.952872938191046, "grad_norm": 0.23244443535804749, "learning_rate": 1.3529641299844797e-08, "loss": 0.035, "step": 54650 }, { "epoch": 4.953326083016132, "grad_norm": 0.367971807718277, "learning_rate": 1.3270729692413386e-08, "loss": 0.03, "step": 54655 }, { "epoch": 4.953779227841218, "grad_norm": 0.3197718858718872, "learning_rate": 1.3014318765294798e-08, "loss": 0.0296, "step": 54660 }, { "epoch": 4.954232372666304, "grad_norm": 0.30895107984542847, "learning_rate": 1.276040854414906e-08, "loss": 0.0302, "step": 54665 }, { "epoch": 4.954685517491391, "grad_norm": 0.26561444997787476, "learning_rate": 1.2508999054391957e-08, "loss": 0.0303, "step": 54670 }, { "epoch": 4.955138662316476, "grad_norm": 0.3135116398334503, "learning_rate": 1.2260090321178364e-08, "loss": 0.0286, "step": 54675 }, { "epoch": 4.9555918071415626, "grad_norm": 0.35573485493659973, "learning_rate": 1.2013682369421686e-08, "loss": 0.0308, "step": 54680 }, { "epoch": 4.956044951966648, "grad_norm": 0.3063250184059143, "learning_rate": 1.1769775223785529e-08, "loss": 0.0281, "step": 54685 }, { "epoch": 4.9564980967917345, "grad_norm": 0.28702157735824585, "learning_rate": 1.1528368908675369e-08, "loss": 0.0367, "step": 54690 }, { "epoch": 4.956951241616821, "grad_norm": 0.34204089641571045, "learning_rate": 1.128946344825521e-08, "loss": 0.0284, "step": 54695 }, { "epoch": 4.957404386441906, "grad_norm": 0.38153451681137085, "learning_rate": 1.1053058866430932e-08, "loss": 0.0434, "step": 54700 }, { "epoch": 4.957857531266993, "grad_norm": 0.2613331377506256, "learning_rate": 1.081915518686416e-08, "loss": 0.0277, "step": 54705 }, { "epoch": 4.958310676092079, "grad_norm": 0.33039146661758423, "learning_rate": 1.0587752432963949e-08, "loss": 0.0288, "step": 54710 }, { "epoch": 4.958763820917165, "grad_norm": 0.25321289896965027, "learning_rate": 1.0358850627892324e-08, "loss": 0.0279, "step": 54715 }, { "epoch": 4.959216965742251, "grad_norm": 0.4089629650115967, "learning_rate": 1.0132449794550414e-08, "loss": 0.0317, "step": 54720 }, { "epoch": 4.9596701105673375, "grad_norm": 0.31135380268096924, "learning_rate": 9.908549955603418e-09, "loss": 0.0292, "step": 54725 }, { "epoch": 4.960123255392423, "grad_norm": 0.2698042392730713, "learning_rate": 9.68715113345564e-09, "loss": 0.0295, "step": 54730 }, { "epoch": 4.9605764002175095, "grad_norm": 0.5279852151870728, "learning_rate": 9.468253350267131e-09, "loss": 0.0376, "step": 54735 }, { "epoch": 4.961029545042596, "grad_norm": 0.37115272879600525, "learning_rate": 9.25185662794259e-09, "loss": 0.0298, "step": 54740 }, { "epoch": 4.961482689867681, "grad_norm": 0.3170807957649231, "learning_rate": 9.037960988139693e-09, "loss": 0.029, "step": 54745 }, { "epoch": 4.961935834692768, "grad_norm": 0.23480044305324554, "learning_rate": 8.826566452266316e-09, "loss": 0.0289, "step": 54750 }, { "epoch": 4.962388979517854, "grad_norm": 0.3041268587112427, "learning_rate": 8.617673041480533e-09, "loss": 0.0292, "step": 54755 }, { "epoch": 4.96284212434294, "grad_norm": 0.2464757263660431, "learning_rate": 8.411280776685071e-09, "loss": 0.0287, "step": 54760 }, { "epoch": 4.963295269168026, "grad_norm": 0.3108614385128021, "learning_rate": 8.207389678535626e-09, "loss": 0.0292, "step": 54765 }, { "epoch": 4.9637484139931125, "grad_norm": 0.2302302122116089, "learning_rate": 8.005999767440875e-09, "loss": 0.0278, "step": 54770 }, { "epoch": 4.964201558818198, "grad_norm": 0.3493000864982605, "learning_rate": 7.80711106355414e-09, "loss": 0.0278, "step": 54775 }, { "epoch": 4.964654703643284, "grad_norm": 0.35571447014808655, "learning_rate": 7.610723586778945e-09, "loss": 0.0327, "step": 54780 }, { "epoch": 4.965107848468371, "grad_norm": 0.275264173746109, "learning_rate": 7.416837356771788e-09, "loss": 0.0275, "step": 54785 }, { "epoch": 4.965560993293456, "grad_norm": 0.4269242286682129, "learning_rate": 7.2254523929365935e-09, "loss": 0.0316, "step": 54790 }, { "epoch": 4.966014138118543, "grad_norm": 0.251587450504303, "learning_rate": 7.036568714427483e-09, "loss": 0.0304, "step": 54795 }, { "epoch": 4.966467282943629, "grad_norm": 0.3339415192604065, "learning_rate": 6.850186340146003e-09, "loss": 0.0315, "step": 54800 }, { "epoch": 4.966920427768715, "grad_norm": 0.2580675482749939, "learning_rate": 6.666305288746677e-09, "loss": 0.029, "step": 54805 }, { "epoch": 4.967373572593801, "grad_norm": 0.26806002855300903, "learning_rate": 6.484925578634227e-09, "loss": 0.0279, "step": 54810 }, { "epoch": 4.9678267174188875, "grad_norm": 0.29381150007247925, "learning_rate": 6.306047227960799e-09, "loss": 0.0361, "step": 54815 }, { "epoch": 4.968279862243973, "grad_norm": 0.29161536693573, "learning_rate": 6.129670254623188e-09, "loss": 0.03, "step": 54820 }, { "epoch": 4.968733007069059, "grad_norm": 0.27822643518447876, "learning_rate": 5.955794676279491e-09, "loss": 0.0291, "step": 54825 }, { "epoch": 4.969186151894146, "grad_norm": 0.3019309639930725, "learning_rate": 5.78442051032968e-09, "loss": 0.031, "step": 54830 }, { "epoch": 4.969639296719231, "grad_norm": 0.24180422723293304, "learning_rate": 5.615547773923924e-09, "loss": 0.0314, "step": 54835 }, { "epoch": 4.970092441544318, "grad_norm": 0.35362666845321655, "learning_rate": 5.449176483962592e-09, "loss": 0.0344, "step": 54840 }, { "epoch": 4.970545586369404, "grad_norm": 0.34795406460762024, "learning_rate": 5.285306657099032e-09, "loss": 0.0342, "step": 54845 }, { "epoch": 4.97099873119449, "grad_norm": 0.2807685136795044, "learning_rate": 5.123938309731235e-09, "loss": 0.0272, "step": 54850 }, { "epoch": 4.971451876019576, "grad_norm": 0.36540934443473816, "learning_rate": 4.9650714580073976e-09, "loss": 0.0286, "step": 54855 }, { "epoch": 4.971905020844662, "grad_norm": 0.25353795289993286, "learning_rate": 4.808706117828687e-09, "loss": 0.0288, "step": 54860 }, { "epoch": 4.972358165669748, "grad_norm": 0.29604336619377136, "learning_rate": 4.654842304846474e-09, "loss": 0.0299, "step": 54865 }, { "epoch": 4.972811310494834, "grad_norm": 0.3387535512447357, "learning_rate": 4.503480034456775e-09, "loss": 0.0291, "step": 54870 }, { "epoch": 4.97326445531992, "grad_norm": 0.3495482802391052, "learning_rate": 4.354619321808584e-09, "loss": 0.0298, "step": 54875 }, { "epoch": 4.973717600145006, "grad_norm": 0.28190916776657104, "learning_rate": 4.208260181798318e-09, "loss": 0.0287, "step": 54880 }, { "epoch": 4.974170744970093, "grad_norm": 0.30442747473716736, "learning_rate": 4.06440262907537e-09, "loss": 0.0284, "step": 54885 }, { "epoch": 4.974623889795178, "grad_norm": 0.36739373207092285, "learning_rate": 3.923046678039333e-09, "loss": 0.0349, "step": 54890 }, { "epoch": 4.975077034620265, "grad_norm": 0.4362698495388031, "learning_rate": 3.784192342831671e-09, "loss": 0.0321, "step": 54895 }, { "epoch": 4.975530179445351, "grad_norm": 0.48269447684288025, "learning_rate": 3.647839637355155e-09, "loss": 0.0354, "step": 54900 }, { "epoch": 4.9759833242704365, "grad_norm": 0.3296763598918915, "learning_rate": 3.5139885752488723e-09, "loss": 0.0297, "step": 54905 }, { "epoch": 4.976436469095523, "grad_norm": 0.3243427574634552, "learning_rate": 3.3826391699159908e-09, "loss": 0.0419, "step": 54910 }, { "epoch": 4.976889613920609, "grad_norm": 0.33895790576934814, "learning_rate": 3.253791434498776e-09, "loss": 0.0303, "step": 54915 }, { "epoch": 4.977342758745695, "grad_norm": 0.31058409810066223, "learning_rate": 3.127445381889693e-09, "loss": 0.0276, "step": 54920 }, { "epoch": 4.977795903570781, "grad_norm": 0.4685627520084381, "learning_rate": 3.003601024736957e-09, "loss": 0.0327, "step": 54925 }, { "epoch": 4.978249048395868, "grad_norm": 0.2981189787387848, "learning_rate": 2.8822583754362086e-09, "loss": 0.0297, "step": 54930 }, { "epoch": 4.978702193220953, "grad_norm": 0.2710868716239929, "learning_rate": 2.7634174461277363e-09, "loss": 0.0274, "step": 54935 }, { "epoch": 4.97915533804604, "grad_norm": 0.34076789021492004, "learning_rate": 2.6470782487075797e-09, "loss": 0.0319, "step": 54940 }, { "epoch": 4.979608482871125, "grad_norm": 0.2997935116291046, "learning_rate": 2.5332407948164273e-09, "loss": 0.029, "step": 54945 }, { "epoch": 4.9800616276962115, "grad_norm": 0.24897605180740356, "learning_rate": 2.421905095850718e-09, "loss": 0.0296, "step": 54950 }, { "epoch": 4.980514772521298, "grad_norm": 0.24068520963191986, "learning_rate": 2.3130711629487657e-09, "loss": 0.0308, "step": 54955 }, { "epoch": 4.9809679173463834, "grad_norm": 0.29959744215011597, "learning_rate": 2.206739007007408e-09, "loss": 0.0293, "step": 54960 }, { "epoch": 4.98142106217147, "grad_norm": 0.3208563029766083, "learning_rate": 2.102908638665357e-09, "loss": 0.0304, "step": 54965 }, { "epoch": 4.981874206996556, "grad_norm": 0.2699945569038391, "learning_rate": 2.0015800683143016e-09, "loss": 0.0367, "step": 54970 }, { "epoch": 4.982327351821642, "grad_norm": 0.2550886869430542, "learning_rate": 1.9027533060989032e-09, "loss": 0.0309, "step": 54975 }, { "epoch": 4.982780496646728, "grad_norm": 0.34635812044143677, "learning_rate": 1.8064283619029233e-09, "loss": 0.0352, "step": 54980 }, { "epoch": 4.983233641471815, "grad_norm": 0.24550212919712067, "learning_rate": 1.7126052453714237e-09, "loss": 0.0282, "step": 54985 }, { "epoch": 4.9836867862969, "grad_norm": 0.23959247767925262, "learning_rate": 1.6212839658913402e-09, "loss": 0.0275, "step": 54990 }, { "epoch": 4.9841399311219865, "grad_norm": 0.3093593120574951, "learning_rate": 1.5324645326081354e-09, "loss": 0.0311, "step": 54995 }, { "epoch": 4.984593075947073, "grad_norm": 0.27165982127189636, "learning_rate": 1.4461469544035933e-09, "loss": 0.0295, "step": 55000 }, { "epoch": 4.985046220772158, "grad_norm": 0.36763668060302734, "learning_rate": 1.3623312399180243e-09, "loss": 0.0339, "step": 55005 }, { "epoch": 4.985499365597245, "grad_norm": 0.3161996304988861, "learning_rate": 1.281017397544715e-09, "loss": 0.028, "step": 55010 }, { "epoch": 4.985952510422331, "grad_norm": 0.31793326139450073, "learning_rate": 1.2022054354160484e-09, "loss": 0.03, "step": 55015 }, { "epoch": 4.986405655247417, "grad_norm": 0.2987171411514282, "learning_rate": 1.1258953614201595e-09, "loss": 0.029, "step": 55020 }, { "epoch": 4.986858800072503, "grad_norm": 0.35004356503486633, "learning_rate": 1.0520871831981583e-09, "loss": 0.0316, "step": 55025 }, { "epoch": 4.9873119448975896, "grad_norm": 0.3269137740135193, "learning_rate": 9.807809081330276e-10, "loss": 0.0283, "step": 55030 }, { "epoch": 4.987765089722675, "grad_norm": 0.2706756591796875, "learning_rate": 9.11976543363502e-10, "loss": 0.0285, "step": 55035 }, { "epoch": 4.9882182345477615, "grad_norm": 0.26163843274116516, "learning_rate": 8.456740957729637e-10, "loss": 0.0275, "step": 55040 }, { "epoch": 4.988671379372848, "grad_norm": 0.41741353273391724, "learning_rate": 7.81873571997771e-10, "loss": 0.0299, "step": 55045 }, { "epoch": 4.989124524197933, "grad_norm": 0.2804718017578125, "learning_rate": 7.20574978424482e-10, "loss": 0.0338, "step": 55050 }, { "epoch": 4.98957766902302, "grad_norm": 0.3132408559322357, "learning_rate": 6.617783211870788e-10, "loss": 0.0293, "step": 55055 }, { "epoch": 4.990030813848106, "grad_norm": 0.31203967332839966, "learning_rate": 6.054836061697433e-10, "loss": 0.0282, "step": 55060 }, { "epoch": 4.990483958673192, "grad_norm": 0.3290427625179291, "learning_rate": 5.516908390068576e-10, "loss": 0.0308, "step": 55065 }, { "epoch": 4.990937103498278, "grad_norm": 0.3083103597164154, "learning_rate": 5.004000250830032e-10, "loss": 0.0289, "step": 55070 }, { "epoch": 4.9913902483233645, "grad_norm": 0.2865157127380371, "learning_rate": 4.5161116952741057e-10, "loss": 0.0273, "step": 55075 }, { "epoch": 4.99184339314845, "grad_norm": 0.29471999406814575, "learning_rate": 4.053242772278365e-10, "loss": 0.0313, "step": 55080 }, { "epoch": 4.9922965379735365, "grad_norm": 0.2688508629798889, "learning_rate": 3.6153935281391105e-10, "loss": 0.0333, "step": 55085 }, { "epoch": 4.992749682798623, "grad_norm": 0.2554354667663574, "learning_rate": 3.202564006682396e-10, "loss": 0.0358, "step": 55090 }, { "epoch": 4.993202827623708, "grad_norm": 0.42392921447753906, "learning_rate": 2.8147542492362733e-10, "loss": 0.0348, "step": 55095 }, { "epoch": 4.993655972448795, "grad_norm": 0.31779608130455017, "learning_rate": 2.4519642946030375e-10, "loss": 0.0293, "step": 55100 }, { "epoch": 4.994109117273881, "grad_norm": 0.2884083092212677, "learning_rate": 2.1141941790592257e-10, "loss": 0.0261, "step": 55105 }, { "epoch": 4.994562262098967, "grad_norm": 0.2507490813732147, "learning_rate": 1.80144393646664e-10, "loss": 0.036, "step": 55110 }, { "epoch": 4.995015406924053, "grad_norm": 0.36977219581604004, "learning_rate": 1.5137135980780593e-10, "loss": 0.0568, "step": 55115 }, { "epoch": 4.9954685517491395, "grad_norm": 0.2854606509208679, "learning_rate": 1.2510031927315259e-10, "loss": 0.0336, "step": 55120 }, { "epoch": 4.995921696574225, "grad_norm": 0.4001579284667969, "learning_rate": 1.0133127466838144e-10, "loss": 0.0323, "step": 55125 }, { "epoch": 4.996374841399311, "grad_norm": 0.30729493498802185, "learning_rate": 8.006422837214533e-11, "loss": 0.0298, "step": 55130 }, { "epoch": 4.996827986224397, "grad_norm": 0.2959996163845062, "learning_rate": 6.129918251607247e-11, "loss": 0.0302, "step": 55135 }, { "epoch": 4.997281131049483, "grad_norm": 0.2695562243461609, "learning_rate": 4.503613897643977e-11, "loss": 0.0305, "step": 55140 }, { "epoch": 4.99773427587457, "grad_norm": 0.31757330894470215, "learning_rate": 3.127509937972395e-11, "loss": 0.0313, "step": 55145 }, { "epoch": 4.998187420699655, "grad_norm": 0.5916335582733154, "learning_rate": 2.0016065105377125e-11, "loss": 0.0313, "step": 55150 }, { "epoch": 4.998640565524742, "grad_norm": 0.2974715232849121, "learning_rate": 1.1259037280275664e-11, "loss": 0.0308, "step": 55155 }, { "epoch": 4.999093710349828, "grad_norm": 0.23441863059997559, "learning_rate": 5.004016775944642e-12, "loss": 0.0279, "step": 55160 }, { "epoch": 4.999546855174914, "grad_norm": 0.2952074408531189, "learning_rate": 1.2510042252111831e-12, "loss": 0.032, "step": 55165 }, { "epoch": 5.0, "grad_norm": 0.40935397148132324, "learning_rate": 0.0, "loss": 0.0329, "step": 55170 }, { "epoch": 5.0, "step": 55170, "total_flos": 4.236515906227798e+19, "train_loss": 0.23586479178022945, "train_runtime": 155185.7869, "train_samples_per_second": 11.376, "train_steps_per_second": 0.356 } ], "logging_steps": 5, "max_steps": 55170, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.236515906227798e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }