{ "best_metric": 0.15097101032733917, "best_model_checkpoint": "saved_model/lop_sep_2024/checkpoint-13492", "epoch": 2.999888827126181, "eval_steps": 500, "global_step": 20238, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": NaN, "learning_rate": 0.0, "loss": 70.4771, "step": 1 }, { "epoch": 0.0, "grad_norm": 18.768062591552734, "learning_rate": 3.5000000000000004e-06, "loss": 70.9315, "step": 10 }, { "epoch": 0.0, "grad_norm": 19.307415008544922, "learning_rate": 8.000000000000001e-06, "loss": 70.6744, "step": 20 }, { "epoch": 0.0, "grad_norm": 22.3438663482666, "learning_rate": 1.2e-05, "loss": 68.4448, "step": 30 }, { "epoch": 0.01, "grad_norm": 27.788856506347656, "learning_rate": 1.7000000000000003e-05, "loss": 64.7802, "step": 40 }, { "epoch": 0.01, "grad_norm": 34.86759948730469, "learning_rate": 2.15e-05, "loss": 56.6582, "step": 50 }, { "epoch": 0.01, "grad_norm": 43.42910385131836, "learning_rate": 2.6500000000000004e-05, "loss": 42.3126, "step": 60 }, { "epoch": 0.01, "grad_norm": 14.587575912475586, "learning_rate": 3.15e-05, "loss": 20.2735, "step": 70 }, { "epoch": 0.01, "grad_norm": 12.488475799560547, "learning_rate": 3.65e-05, "loss": 6.4981, "step": 80 }, { "epoch": 0.01, "grad_norm": 3.4391798973083496, "learning_rate": 4.15e-05, "loss": 0.9414, "step": 90 }, { "epoch": 0.01, "grad_norm": 1.6826833486557007, "learning_rate": 4.6500000000000005e-05, "loss": 0.4819, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.1236026287078857, "learning_rate": 5.1500000000000005e-05, "loss": 0.4228, "step": 110 }, { "epoch": 0.02, "grad_norm": 2.420114278793335, "learning_rate": 5.65e-05, "loss": 0.4401, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.8946385383605957, "learning_rate": 6.15e-05, "loss": 0.6266, "step": 130 }, { "epoch": 0.02, "grad_norm": 12.989470481872559, "learning_rate": 6.65e-05, "loss": 0.5741, "step": 140 }, { "epoch": 0.02, "grad_norm": 2.2886247634887695, "learning_rate": 7.15e-05, "loss": 0.4404, "step": 150 }, { "epoch": 0.02, "grad_norm": 3.232952833175659, "learning_rate": 7.65e-05, "loss": 0.4768, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.457054615020752, "learning_rate": 8.15e-05, "loss": 0.4633, "step": 170 }, { "epoch": 0.03, "grad_norm": 1.454648733139038, "learning_rate": 8.65e-05, "loss": 0.4295, "step": 180 }, { "epoch": 0.03, "grad_norm": 3.4959018230438232, "learning_rate": 9.15e-05, "loss": 0.4526, "step": 190 }, { "epoch": 0.03, "grad_norm": 1.8352223634719849, "learning_rate": 9.65e-05, "loss": 0.3898, "step": 200 }, { "epoch": 0.03, "grad_norm": 2.780531406402588, "learning_rate": 9.999553969669938e-05, "loss": 0.3962, "step": 210 }, { "epoch": 0.03, "grad_norm": 2.455747127532959, "learning_rate": 9.998067201903063e-05, "loss": 0.4071, "step": 220 }, { "epoch": 0.03, "grad_norm": 1.8026726245880127, "learning_rate": 9.996580434136188e-05, "loss": 0.383, "step": 230 }, { "epoch": 0.04, "grad_norm": 1.4635368585586548, "learning_rate": 9.995093666369314e-05, "loss": 0.4433, "step": 240 }, { "epoch": 0.04, "grad_norm": 4.1103901863098145, "learning_rate": 9.993606898602439e-05, "loss": 0.3985, "step": 250 }, { "epoch": 0.04, "grad_norm": 4.443718910217285, "learning_rate": 9.992120130835564e-05, "loss": 0.3978, "step": 260 }, { "epoch": 0.04, "grad_norm": 5.011767387390137, "learning_rate": 9.990633363068688e-05, "loss": 0.4696, "step": 270 }, { "epoch": 0.04, "grad_norm": 5.497318744659424, "learning_rate": 9.989146595301815e-05, "loss": 0.4204, "step": 280 }, { "epoch": 0.04, "grad_norm": 1.5428732633590698, "learning_rate": 9.98765982753494e-05, "loss": 0.4107, "step": 290 }, { "epoch": 0.04, "grad_norm": 2.399179220199585, "learning_rate": 9.986173059768065e-05, "loss": 0.3831, "step": 300 }, { "epoch": 0.05, "grad_norm": 1.3291690349578857, "learning_rate": 9.98468629200119e-05, "loss": 0.3904, "step": 310 }, { "epoch": 0.05, "grad_norm": 1.5392258167266846, "learning_rate": 9.983199524234314e-05, "loss": 0.3349, "step": 320 }, { "epoch": 0.05, "grad_norm": 4.950677871704102, "learning_rate": 9.981712756467441e-05, "loss": 0.3969, "step": 330 }, { "epoch": 0.05, "grad_norm": 5.227260112762451, "learning_rate": 9.980225988700565e-05, "loss": 0.4026, "step": 340 }, { "epoch": 0.05, "grad_norm": 1.5111877918243408, "learning_rate": 9.97873922093369e-05, "loss": 0.3014, "step": 350 }, { "epoch": 0.05, "grad_norm": 2.647972583770752, "learning_rate": 9.977252453166816e-05, "loss": 0.2976, "step": 360 }, { "epoch": 0.05, "grad_norm": 3.595974922180176, "learning_rate": 9.975765685399942e-05, "loss": 0.3304, "step": 370 }, { "epoch": 0.06, "grad_norm": 3.389302968978882, "learning_rate": 9.974278917633067e-05, "loss": 0.2689, "step": 380 }, { "epoch": 0.06, "grad_norm": 2.902669906616211, "learning_rate": 9.972792149866191e-05, "loss": 0.2924, "step": 390 }, { "epoch": 0.06, "grad_norm": 3.135059118270874, "learning_rate": 9.971305382099317e-05, "loss": 0.3061, "step": 400 }, { "epoch": 0.06, "grad_norm": 3.143860101699829, "learning_rate": 9.969818614332441e-05, "loss": 0.2687, "step": 410 }, { "epoch": 0.06, "grad_norm": 2.1663131713867188, "learning_rate": 9.968331846565568e-05, "loss": 0.2633, "step": 420 }, { "epoch": 0.06, "grad_norm": 4.806302547454834, "learning_rate": 9.966845078798692e-05, "loss": 0.2626, "step": 430 }, { "epoch": 0.07, "grad_norm": 1.9977257251739502, "learning_rate": 9.965358311031817e-05, "loss": 0.2876, "step": 440 }, { "epoch": 0.07, "grad_norm": 1.0454623699188232, "learning_rate": 9.963871543264943e-05, "loss": 0.2663, "step": 450 }, { "epoch": 0.07, "grad_norm": 2.6900148391723633, "learning_rate": 9.962384775498068e-05, "loss": 0.2593, "step": 460 }, { "epoch": 0.07, "grad_norm": 4.413218975067139, "learning_rate": 9.960898007731194e-05, "loss": 0.2523, "step": 470 }, { "epoch": 0.07, "grad_norm": 4.643527984619141, "learning_rate": 9.959411239964318e-05, "loss": 0.272, "step": 480 }, { "epoch": 0.07, "grad_norm": 0.9696474075317383, "learning_rate": 9.957924472197443e-05, "loss": 0.2435, "step": 490 }, { "epoch": 0.07, "grad_norm": 1.4355976581573486, "learning_rate": 9.956437704430567e-05, "loss": 0.2267, "step": 500 }, { "epoch": 0.08, "grad_norm": 2.6311492919921875, "learning_rate": 9.954950936663694e-05, "loss": 0.2622, "step": 510 }, { "epoch": 0.08, "grad_norm": 2.0157711505889893, "learning_rate": 9.953464168896818e-05, "loss": 0.2461, "step": 520 }, { "epoch": 0.08, "grad_norm": 1.7786158323287964, "learning_rate": 9.951977401129944e-05, "loss": 0.2369, "step": 530 }, { "epoch": 0.08, "grad_norm": 2.6747984886169434, "learning_rate": 9.950490633363069e-05, "loss": 0.2115, "step": 540 }, { "epoch": 0.08, "grad_norm": 0.9853987693786621, "learning_rate": 9.949003865596195e-05, "loss": 0.2139, "step": 550 }, { "epoch": 0.08, "grad_norm": 2.4679114818573, "learning_rate": 9.94751709782932e-05, "loss": 0.262, "step": 560 }, { "epoch": 0.08, "grad_norm": 0.7220086455345154, "learning_rate": 9.946030330062444e-05, "loss": 0.2262, "step": 570 }, { "epoch": 0.09, "grad_norm": 1.3707178831100464, "learning_rate": 9.94454356229557e-05, "loss": 0.2501, "step": 580 }, { "epoch": 0.09, "grad_norm": 0.6298980712890625, "learning_rate": 9.943056794528694e-05, "loss": 0.2237, "step": 590 }, { "epoch": 0.09, "grad_norm": 5.072420597076416, "learning_rate": 9.941570026761821e-05, "loss": 0.2358, "step": 600 }, { "epoch": 0.09, "grad_norm": 0.9326792359352112, "learning_rate": 9.940083258994945e-05, "loss": 0.2013, "step": 610 }, { "epoch": 0.09, "grad_norm": 0.6766129732131958, "learning_rate": 9.93859649122807e-05, "loss": 0.1931, "step": 620 }, { "epoch": 0.09, "grad_norm": 4.741708755493164, "learning_rate": 9.937109723461196e-05, "loss": 0.2234, "step": 630 }, { "epoch": 0.09, "grad_norm": 4.426298141479492, "learning_rate": 9.935622955694321e-05, "loss": 0.2425, "step": 640 }, { "epoch": 0.1, "grad_norm": 0.8037042021751404, "learning_rate": 9.934136187927447e-05, "loss": 0.2243, "step": 650 }, { "epoch": 0.1, "grad_norm": 2.159052610397339, "learning_rate": 9.932649420160571e-05, "loss": 0.2101, "step": 660 }, { "epoch": 0.1, "grad_norm": 0.6584820747375488, "learning_rate": 9.931162652393696e-05, "loss": 0.2234, "step": 670 }, { "epoch": 0.1, "grad_norm": 4.1978840827941895, "learning_rate": 9.929675884626822e-05, "loss": 0.2182, "step": 680 }, { "epoch": 0.1, "grad_norm": 6.110688209533691, "learning_rate": 9.928189116859947e-05, "loss": 0.2312, "step": 690 }, { "epoch": 0.1, "grad_norm": 1.9383026361465454, "learning_rate": 9.926702349093073e-05, "loss": 0.2148, "step": 700 }, { "epoch": 0.11, "grad_norm": 0.8565307855606079, "learning_rate": 9.925215581326197e-05, "loss": 0.2262, "step": 710 }, { "epoch": 0.11, "grad_norm": 0.8206549286842346, "learning_rate": 9.923728813559322e-05, "loss": 0.2025, "step": 720 }, { "epoch": 0.11, "grad_norm": 1.799793004989624, "learning_rate": 9.922242045792448e-05, "loss": 0.2057, "step": 730 }, { "epoch": 0.11, "grad_norm": 2.572327136993408, "learning_rate": 9.920755278025573e-05, "loss": 0.1932, "step": 740 }, { "epoch": 0.11, "grad_norm": 3.779280185699463, "learning_rate": 9.919268510258697e-05, "loss": 0.2238, "step": 750 }, { "epoch": 0.11, "grad_norm": 4.892170429229736, "learning_rate": 9.917781742491823e-05, "loss": 0.2243, "step": 760 }, { "epoch": 0.11, "grad_norm": 0.9165033102035522, "learning_rate": 9.916294974724948e-05, "loss": 0.2015, "step": 770 }, { "epoch": 0.12, "grad_norm": 1.2273848056793213, "learning_rate": 9.914808206958074e-05, "loss": 0.1935, "step": 780 }, { "epoch": 0.12, "grad_norm": 0.9774864315986633, "learning_rate": 9.9133214391912e-05, "loss": 0.1876, "step": 790 }, { "epoch": 0.12, "grad_norm": 1.4223579168319702, "learning_rate": 9.911834671424323e-05, "loss": 0.1963, "step": 800 }, { "epoch": 0.12, "grad_norm": 0.9810081124305725, "learning_rate": 9.910347903657449e-05, "loss": 0.1806, "step": 810 }, { "epoch": 0.12, "grad_norm": 0.8502489328384399, "learning_rate": 9.908861135890574e-05, "loss": 0.2476, "step": 820 }, { "epoch": 0.12, "grad_norm": 1.8983548879623413, "learning_rate": 9.9073743681237e-05, "loss": 0.1994, "step": 830 }, { "epoch": 0.12, "grad_norm": 0.5797375440597534, "learning_rate": 9.905887600356824e-05, "loss": 0.1977, "step": 840 }, { "epoch": 0.13, "grad_norm": 0.8001552224159241, "learning_rate": 9.90440083258995e-05, "loss": 0.2037, "step": 850 }, { "epoch": 0.13, "grad_norm": 1.5331403017044067, "learning_rate": 9.902914064823075e-05, "loss": 0.197, "step": 860 }, { "epoch": 0.13, "grad_norm": 0.6793760061264038, "learning_rate": 9.9014272970562e-05, "loss": 0.2199, "step": 870 }, { "epoch": 0.13, "grad_norm": 0.6174043416976929, "learning_rate": 9.899940529289326e-05, "loss": 0.2152, "step": 880 }, { "epoch": 0.13, "grad_norm": 1.187853455543518, "learning_rate": 9.89845376152245e-05, "loss": 0.2005, "step": 890 }, { "epoch": 0.13, "grad_norm": 2.541826009750366, "learning_rate": 9.896966993755576e-05, "loss": 0.2007, "step": 900 }, { "epoch": 0.13, "grad_norm": 0.699213981628418, "learning_rate": 9.895480225988701e-05, "loss": 0.2046, "step": 910 }, { "epoch": 0.14, "grad_norm": 0.9006096124649048, "learning_rate": 9.893993458221826e-05, "loss": 0.196, "step": 920 }, { "epoch": 0.14, "grad_norm": 0.516926646232605, "learning_rate": 9.89250669045495e-05, "loss": 0.1857, "step": 930 }, { "epoch": 0.14, "grad_norm": 1.140449047088623, "learning_rate": 9.891019922688076e-05, "loss": 0.1988, "step": 940 }, { "epoch": 0.14, "grad_norm": 1.6413800716400146, "learning_rate": 9.889533154921203e-05, "loss": 0.1958, "step": 950 }, { "epoch": 0.14, "grad_norm": 0.7474754452705383, "learning_rate": 9.888046387154327e-05, "loss": 0.1907, "step": 960 }, { "epoch": 0.14, "grad_norm": 0.7224648594856262, "learning_rate": 9.886559619387453e-05, "loss": 0.1969, "step": 970 }, { "epoch": 0.15, "grad_norm": 0.9668368697166443, "learning_rate": 9.885072851620577e-05, "loss": 0.2143, "step": 980 }, { "epoch": 0.15, "grad_norm": 1.3923943042755127, "learning_rate": 9.883586083853702e-05, "loss": 0.1999, "step": 990 }, { "epoch": 0.15, "grad_norm": 2.00828218460083, "learning_rate": 9.882099316086828e-05, "loss": 0.1885, "step": 1000 }, { "epoch": 0.15, "grad_norm": 0.7937036156654358, "learning_rate": 9.880612548319953e-05, "loss": 0.1842, "step": 1010 }, { "epoch": 0.15, "grad_norm": 0.9639225602149963, "learning_rate": 9.879125780553079e-05, "loss": 0.1884, "step": 1020 }, { "epoch": 0.15, "grad_norm": 2.1198477745056152, "learning_rate": 9.877639012786203e-05, "loss": 0.1958, "step": 1030 }, { "epoch": 0.15, "grad_norm": 0.7752804756164551, "learning_rate": 9.87615224501933e-05, "loss": 0.1889, "step": 1040 }, { "epoch": 0.16, "grad_norm": 0.39463403820991516, "learning_rate": 9.874665477252454e-05, "loss": 0.1738, "step": 1050 }, { "epoch": 0.16, "grad_norm": 1.6980741024017334, "learning_rate": 9.873178709485579e-05, "loss": 0.1875, "step": 1060 }, { "epoch": 0.16, "grad_norm": 0.7736172080039978, "learning_rate": 9.871691941718703e-05, "loss": 0.1893, "step": 1070 }, { "epoch": 0.16, "grad_norm": 0.968677282333374, "learning_rate": 9.870205173951829e-05, "loss": 0.1737, "step": 1080 }, { "epoch": 0.16, "grad_norm": 1.1933799982070923, "learning_rate": 9.868718406184954e-05, "loss": 0.1706, "step": 1090 }, { "epoch": 0.16, "grad_norm": 2.9177980422973633, "learning_rate": 9.86723163841808e-05, "loss": 0.1942, "step": 1100 }, { "epoch": 0.16, "grad_norm": 0.6118002533912659, "learning_rate": 9.865744870651205e-05, "loss": 0.172, "step": 1110 }, { "epoch": 0.17, "grad_norm": 1.0720818042755127, "learning_rate": 9.864258102884329e-05, "loss": 0.1899, "step": 1120 }, { "epoch": 0.17, "grad_norm": 1.246082067489624, "learning_rate": 9.862771335117456e-05, "loss": 0.1944, "step": 1130 }, { "epoch": 0.17, "grad_norm": 2.5863025188446045, "learning_rate": 9.86128456735058e-05, "loss": 0.1836, "step": 1140 }, { "epoch": 0.17, "grad_norm": 1.1113203763961792, "learning_rate": 9.859797799583706e-05, "loss": 0.1969, "step": 1150 }, { "epoch": 0.17, "grad_norm": 1.720690369606018, "learning_rate": 9.85831103181683e-05, "loss": 0.2078, "step": 1160 }, { "epoch": 0.17, "grad_norm": 2.5786492824554443, "learning_rate": 9.856824264049957e-05, "loss": 0.1941, "step": 1170 }, { "epoch": 0.17, "grad_norm": 2.1644389629364014, "learning_rate": 9.855337496283081e-05, "loss": 0.1853, "step": 1180 }, { "epoch": 0.18, "grad_norm": 0.7992768287658691, "learning_rate": 9.853850728516206e-05, "loss": 0.1937, "step": 1190 }, { "epoch": 0.18, "grad_norm": 2.3800251483917236, "learning_rate": 9.852363960749332e-05, "loss": 0.1991, "step": 1200 }, { "epoch": 0.18, "grad_norm": 3.22444224357605, "learning_rate": 9.850877192982456e-05, "loss": 0.1876, "step": 1210 }, { "epoch": 0.18, "grad_norm": 0.721161961555481, "learning_rate": 9.849390425215583e-05, "loss": 0.1776, "step": 1220 }, { "epoch": 0.18, "grad_norm": 2.3023221492767334, "learning_rate": 9.847903657448707e-05, "loss": 0.1912, "step": 1230 }, { "epoch": 0.18, "grad_norm": 0.44242483377456665, "learning_rate": 9.846416889681832e-05, "loss": 0.1716, "step": 1240 }, { "epoch": 0.19, "grad_norm": 1.6946061849594116, "learning_rate": 9.844930121914956e-05, "loss": 0.1822, "step": 1250 }, { "epoch": 0.19, "grad_norm": 2.3526597023010254, "learning_rate": 9.843443354148083e-05, "loss": 0.1931, "step": 1260 }, { "epoch": 0.19, "grad_norm": 0.595988392829895, "learning_rate": 9.841956586381207e-05, "loss": 0.1764, "step": 1270 }, { "epoch": 0.19, "grad_norm": 1.5539588928222656, "learning_rate": 9.840469818614333e-05, "loss": 0.181, "step": 1280 }, { "epoch": 0.19, "grad_norm": 0.45790284872055054, "learning_rate": 9.838983050847458e-05, "loss": 0.1831, "step": 1290 }, { "epoch": 0.19, "grad_norm": 1.5096701383590698, "learning_rate": 9.837496283080582e-05, "loss": 0.1648, "step": 1300 }, { "epoch": 0.19, "grad_norm": 0.4358065128326416, "learning_rate": 9.836009515313709e-05, "loss": 0.1892, "step": 1310 }, { "epoch": 0.2, "grad_norm": 0.45344677567481995, "learning_rate": 9.834522747546833e-05, "loss": 0.1882, "step": 1320 }, { "epoch": 0.2, "grad_norm": 1.9001879692077637, "learning_rate": 9.833035979779959e-05, "loss": 0.182, "step": 1330 }, { "epoch": 0.2, "grad_norm": 1.2151376008987427, "learning_rate": 9.831549212013084e-05, "loss": 0.2068, "step": 1340 }, { "epoch": 0.2, "grad_norm": 2.489504814147949, "learning_rate": 9.83006244424621e-05, "loss": 0.1874, "step": 1350 }, { "epoch": 0.2, "grad_norm": 2.392239809036255, "learning_rate": 9.828575676479335e-05, "loss": 0.2012, "step": 1360 }, { "epoch": 0.2, "grad_norm": 0.8293458819389343, "learning_rate": 9.82708890871246e-05, "loss": 0.1756, "step": 1370 }, { "epoch": 0.2, "grad_norm": 0.4222114086151123, "learning_rate": 9.825602140945585e-05, "loss": 0.1781, "step": 1380 }, { "epoch": 0.21, "grad_norm": 0.6707155704498291, "learning_rate": 9.824115373178709e-05, "loss": 0.1687, "step": 1390 }, { "epoch": 0.21, "grad_norm": 1.2496354579925537, "learning_rate": 9.822628605411836e-05, "loss": 0.1698, "step": 1400 }, { "epoch": 0.21, "grad_norm": 0.7204919457435608, "learning_rate": 9.82114183764496e-05, "loss": 0.1756, "step": 1410 }, { "epoch": 0.21, "grad_norm": 1.3436416387557983, "learning_rate": 9.819655069878085e-05, "loss": 0.1813, "step": 1420 }, { "epoch": 0.21, "grad_norm": 1.1300007104873657, "learning_rate": 9.818168302111211e-05, "loss": 0.1865, "step": 1430 }, { "epoch": 0.21, "grad_norm": 0.4229620695114136, "learning_rate": 9.816681534344336e-05, "loss": 0.1765, "step": 1440 }, { "epoch": 0.21, "grad_norm": 1.7664425373077393, "learning_rate": 9.815194766577462e-05, "loss": 0.19, "step": 1450 }, { "epoch": 0.22, "grad_norm": 0.6647424697875977, "learning_rate": 9.813707998810586e-05, "loss": 0.1759, "step": 1460 }, { "epoch": 0.22, "grad_norm": 1.3758875131607056, "learning_rate": 9.812221231043711e-05, "loss": 0.1771, "step": 1470 }, { "epoch": 0.22, "grad_norm": 1.0963157415390015, "learning_rate": 9.810734463276836e-05, "loss": 0.181, "step": 1480 }, { "epoch": 0.22, "grad_norm": 1.6835664510726929, "learning_rate": 9.809247695509962e-05, "loss": 0.1668, "step": 1490 }, { "epoch": 0.22, "grad_norm": 2.1593921184539795, "learning_rate": 9.807760927743087e-05, "loss": 0.1816, "step": 1500 }, { "epoch": 0.22, "grad_norm": 1.253033995628357, "learning_rate": 9.806274159976212e-05, "loss": 0.1721, "step": 1510 }, { "epoch": 0.23, "grad_norm": 2.998976945877075, "learning_rate": 9.804787392209337e-05, "loss": 0.1867, "step": 1520 }, { "epoch": 0.23, "grad_norm": 1.396804928779602, "learning_rate": 9.803300624442463e-05, "loss": 0.1721, "step": 1530 }, { "epoch": 0.23, "grad_norm": 0.6815312504768372, "learning_rate": 9.801813856675588e-05, "loss": 0.1826, "step": 1540 }, { "epoch": 0.23, "grad_norm": 0.8488101959228516, "learning_rate": 9.800327088908713e-05, "loss": 0.1872, "step": 1550 }, { "epoch": 0.23, "grad_norm": 1.0025837421417236, "learning_rate": 9.798840321141838e-05, "loss": 0.1683, "step": 1560 }, { "epoch": 0.23, "grad_norm": 1.0327823162078857, "learning_rate": 9.797353553374962e-05, "loss": 0.1722, "step": 1570 }, { "epoch": 0.23, "grad_norm": 1.4573004245758057, "learning_rate": 9.795866785608089e-05, "loss": 0.1717, "step": 1580 }, { "epoch": 0.24, "grad_norm": 0.6835706233978271, "learning_rate": 9.794380017841213e-05, "loss": 0.1797, "step": 1590 }, { "epoch": 0.24, "grad_norm": 1.981534481048584, "learning_rate": 9.792893250074339e-05, "loss": 0.1733, "step": 1600 }, { "epoch": 0.24, "grad_norm": 0.49460187554359436, "learning_rate": 9.791406482307464e-05, "loss": 0.175, "step": 1610 }, { "epoch": 0.24, "grad_norm": 1.1679047346115112, "learning_rate": 9.78991971454059e-05, "loss": 0.1754, "step": 1620 }, { "epoch": 0.24, "grad_norm": 0.691370964050293, "learning_rate": 9.788432946773715e-05, "loss": 0.167, "step": 1630 }, { "epoch": 0.24, "grad_norm": 0.409994900226593, "learning_rate": 9.786946179006839e-05, "loss": 0.1688, "step": 1640 }, { "epoch": 0.24, "grad_norm": 0.7671188116073608, "learning_rate": 9.785459411239965e-05, "loss": 0.1626, "step": 1650 }, { "epoch": 0.25, "grad_norm": 0.6950274109840393, "learning_rate": 9.78397264347309e-05, "loss": 0.1717, "step": 1660 }, { "epoch": 0.25, "grad_norm": 1.0336863994598389, "learning_rate": 9.782485875706216e-05, "loss": 0.1906, "step": 1670 }, { "epoch": 0.25, "grad_norm": 3.111928939819336, "learning_rate": 9.780999107939341e-05, "loss": 0.2204, "step": 1680 }, { "epoch": 0.25, "grad_norm": 0.817621648311615, "learning_rate": 9.779512340172465e-05, "loss": 0.192, "step": 1690 }, { "epoch": 0.25, "grad_norm": 2.2715766429901123, "learning_rate": 9.77802557240559e-05, "loss": 0.1728, "step": 1700 }, { "epoch": 0.25, "grad_norm": 0.4694221019744873, "learning_rate": 9.776538804638716e-05, "loss": 0.1751, "step": 1710 }, { "epoch": 0.25, "grad_norm": 2.2308688163757324, "learning_rate": 9.775052036871842e-05, "loss": 0.1749, "step": 1720 }, { "epoch": 0.26, "grad_norm": 1.3650729656219482, "learning_rate": 9.773565269104966e-05, "loss": 0.1786, "step": 1730 }, { "epoch": 0.26, "grad_norm": 0.3981892466545105, "learning_rate": 9.772078501338091e-05, "loss": 0.1724, "step": 1740 }, { "epoch": 0.26, "grad_norm": 1.6411010026931763, "learning_rate": 9.770591733571217e-05, "loss": 0.1774, "step": 1750 }, { "epoch": 0.26, "grad_norm": 0.6380265951156616, "learning_rate": 9.769104965804342e-05, "loss": 0.1793, "step": 1760 }, { "epoch": 0.26, "grad_norm": 0.4115835130214691, "learning_rate": 9.767618198037468e-05, "loss": 0.1636, "step": 1770 }, { "epoch": 0.26, "grad_norm": 0.61713707447052, "learning_rate": 9.766131430270592e-05, "loss": 0.1769, "step": 1780 }, { "epoch": 0.27, "grad_norm": 1.0060006380081177, "learning_rate": 9.764644662503717e-05, "loss": 0.1691, "step": 1790 }, { "epoch": 0.27, "grad_norm": 0.9878020286560059, "learning_rate": 9.763157894736843e-05, "loss": 0.1749, "step": 1800 }, { "epoch": 0.27, "grad_norm": 2.226569414138794, "learning_rate": 9.761671126969968e-05, "loss": 0.1671, "step": 1810 }, { "epoch": 0.27, "grad_norm": 0.7667270302772522, "learning_rate": 9.760184359203092e-05, "loss": 0.1822, "step": 1820 }, { "epoch": 0.27, "grad_norm": 1.7759143114089966, "learning_rate": 9.758697591436218e-05, "loss": 0.175, "step": 1830 }, { "epoch": 0.27, "grad_norm": 1.309430718421936, "learning_rate": 9.757210823669343e-05, "loss": 0.1683, "step": 1840 }, { "epoch": 0.27, "grad_norm": 0.3665989339351654, "learning_rate": 9.755724055902469e-05, "loss": 0.1748, "step": 1850 }, { "epoch": 0.28, "grad_norm": 0.5607463121414185, "learning_rate": 9.754237288135594e-05, "loss": 0.1646, "step": 1860 }, { "epoch": 0.28, "grad_norm": 2.0324952602386475, "learning_rate": 9.752750520368718e-05, "loss": 0.1818, "step": 1870 }, { "epoch": 0.28, "grad_norm": 1.5004349946975708, "learning_rate": 9.751263752601844e-05, "loss": 0.1795, "step": 1880 }, { "epoch": 0.28, "grad_norm": 0.5794762372970581, "learning_rate": 9.749776984834969e-05, "loss": 0.1877, "step": 1890 }, { "epoch": 0.28, "grad_norm": 0.6873047947883606, "learning_rate": 9.748290217068095e-05, "loss": 0.1872, "step": 1900 }, { "epoch": 0.28, "grad_norm": 0.3563670814037323, "learning_rate": 9.746803449301219e-05, "loss": 0.173, "step": 1910 }, { "epoch": 0.28, "grad_norm": 0.47983822226524353, "learning_rate": 9.745316681534344e-05, "loss": 0.1654, "step": 1920 }, { "epoch": 0.29, "grad_norm": 0.31588858366012573, "learning_rate": 9.74382991376747e-05, "loss": 0.1588, "step": 1930 }, { "epoch": 0.29, "grad_norm": 1.5894639492034912, "learning_rate": 9.742343146000595e-05, "loss": 0.1778, "step": 1940 }, { "epoch": 0.29, "grad_norm": 0.5094777941703796, "learning_rate": 9.740856378233721e-05, "loss": 0.1869, "step": 1950 }, { "epoch": 0.29, "grad_norm": 0.4982427656650543, "learning_rate": 9.739369610466845e-05, "loss": 0.182, "step": 1960 }, { "epoch": 0.29, "grad_norm": 0.5018313527107239, "learning_rate": 9.73788284269997e-05, "loss": 0.1621, "step": 1970 }, { "epoch": 0.29, "grad_norm": 0.4199548363685608, "learning_rate": 9.736396074933096e-05, "loss": 0.1654, "step": 1980 }, { "epoch": 0.29, "grad_norm": 1.6279269456863403, "learning_rate": 9.734909307166221e-05, "loss": 0.1802, "step": 1990 }, { "epoch": 0.3, "grad_norm": 1.3614368438720703, "learning_rate": 9.733422539399347e-05, "loss": 0.1636, "step": 2000 }, { "epoch": 0.3, "grad_norm": 0.6040695905685425, "learning_rate": 9.731935771632471e-05, "loss": 0.1694, "step": 2010 }, { "epoch": 0.3, "grad_norm": 0.44964730739593506, "learning_rate": 9.730449003865598e-05, "loss": 0.164, "step": 2020 }, { "epoch": 0.3, "grad_norm": 1.9905171394348145, "learning_rate": 9.728962236098722e-05, "loss": 0.1692, "step": 2030 }, { "epoch": 0.3, "grad_norm": 1.676896572113037, "learning_rate": 9.727475468331847e-05, "loss": 0.1734, "step": 2040 }, { "epoch": 0.3, "grad_norm": 1.833280086517334, "learning_rate": 9.725988700564971e-05, "loss": 0.1842, "step": 2050 }, { "epoch": 0.31, "grad_norm": 1.3920565843582153, "learning_rate": 9.724501932798098e-05, "loss": 0.1834, "step": 2060 }, { "epoch": 0.31, "grad_norm": 0.7751169800758362, "learning_rate": 9.723015165031222e-05, "loss": 0.1755, "step": 2070 }, { "epoch": 0.31, "grad_norm": 1.8627113103866577, "learning_rate": 9.721528397264348e-05, "loss": 0.1695, "step": 2080 }, { "epoch": 0.31, "grad_norm": 0.434849351644516, "learning_rate": 9.720041629497473e-05, "loss": 0.1675, "step": 2090 }, { "epoch": 0.31, "grad_norm": 1.202000617980957, "learning_rate": 9.718554861730597e-05, "loss": 0.1749, "step": 2100 }, { "epoch": 0.31, "grad_norm": 0.7374542951583862, "learning_rate": 9.717068093963724e-05, "loss": 0.169, "step": 2110 }, { "epoch": 0.31, "grad_norm": 0.6856274604797363, "learning_rate": 9.715581326196848e-05, "loss": 0.1724, "step": 2120 }, { "epoch": 0.32, "grad_norm": 0.8698721528053284, "learning_rate": 9.714094558429974e-05, "loss": 0.1705, "step": 2130 }, { "epoch": 0.32, "grad_norm": 1.2129219770431519, "learning_rate": 9.712607790663098e-05, "loss": 0.1704, "step": 2140 }, { "epoch": 0.32, "grad_norm": 0.3243730962276459, "learning_rate": 9.711121022896225e-05, "loss": 0.1753, "step": 2150 }, { "epoch": 0.32, "grad_norm": 1.1494802236557007, "learning_rate": 9.709634255129349e-05, "loss": 0.1702, "step": 2160 }, { "epoch": 0.32, "grad_norm": 0.7077322602272034, "learning_rate": 9.708147487362474e-05, "loss": 0.176, "step": 2170 }, { "epoch": 0.32, "grad_norm": 3.5321290493011475, "learning_rate": 9.7066607195956e-05, "loss": 0.1799, "step": 2180 }, { "epoch": 0.32, "grad_norm": 0.40482622385025024, "learning_rate": 9.705173951828724e-05, "loss": 0.1817, "step": 2190 }, { "epoch": 0.33, "grad_norm": 2.4115407466888428, "learning_rate": 9.703687184061851e-05, "loss": 0.1863, "step": 2200 }, { "epoch": 0.33, "grad_norm": 2.6012375354766846, "learning_rate": 9.702200416294975e-05, "loss": 0.1697, "step": 2210 }, { "epoch": 0.33, "grad_norm": 3.5311667919158936, "learning_rate": 9.7007136485281e-05, "loss": 0.1756, "step": 2220 }, { "epoch": 0.33, "grad_norm": 2.0460686683654785, "learning_rate": 9.699226880761225e-05, "loss": 0.1733, "step": 2230 }, { "epoch": 0.33, "grad_norm": 2.7502028942108154, "learning_rate": 9.697740112994351e-05, "loss": 0.177, "step": 2240 }, { "epoch": 0.33, "grad_norm": 1.4361170530319214, "learning_rate": 9.696253345227476e-05, "loss": 0.1798, "step": 2250 }, { "epoch": 0.34, "grad_norm": 2.131711006164551, "learning_rate": 9.694766577460601e-05, "loss": 0.1678, "step": 2260 }, { "epoch": 0.34, "grad_norm": 0.6454190611839294, "learning_rate": 9.693279809693727e-05, "loss": 0.1575, "step": 2270 }, { "epoch": 0.34, "grad_norm": 0.5098270773887634, "learning_rate": 9.69179304192685e-05, "loss": 0.1788, "step": 2280 }, { "epoch": 0.34, "grad_norm": 0.5451599359512329, "learning_rate": 9.690306274159977e-05, "loss": 0.1641, "step": 2290 }, { "epoch": 0.34, "grad_norm": 0.5445219874382019, "learning_rate": 9.688819506393102e-05, "loss": 0.1667, "step": 2300 }, { "epoch": 0.34, "grad_norm": 0.9343221187591553, "learning_rate": 9.687332738626227e-05, "loss": 0.1676, "step": 2310 }, { "epoch": 0.34, "grad_norm": 0.7586581707000732, "learning_rate": 9.685845970859351e-05, "loss": 0.1726, "step": 2320 }, { "epoch": 0.35, "grad_norm": 0.4477677643299103, "learning_rate": 9.684359203092478e-05, "loss": 0.1703, "step": 2330 }, { "epoch": 0.35, "grad_norm": 0.32315364480018616, "learning_rate": 9.682872435325604e-05, "loss": 0.1541, "step": 2340 }, { "epoch": 0.35, "grad_norm": 0.7400575876235962, "learning_rate": 9.681385667558728e-05, "loss": 0.1718, "step": 2350 }, { "epoch": 0.35, "grad_norm": 0.6638018488883972, "learning_rate": 9.679898899791853e-05, "loss": 0.1673, "step": 2360 }, { "epoch": 0.35, "grad_norm": 0.43184348940849304, "learning_rate": 9.678412132024977e-05, "loss": 0.1853, "step": 2370 }, { "epoch": 0.35, "grad_norm": 1.706583023071289, "learning_rate": 9.676925364258104e-05, "loss": 0.169, "step": 2380 }, { "epoch": 0.35, "grad_norm": 2.9006285667419434, "learning_rate": 9.675438596491228e-05, "loss": 0.1705, "step": 2390 }, { "epoch": 0.36, "grad_norm": 0.9855265617370605, "learning_rate": 9.673951828724354e-05, "loss": 0.1694, "step": 2400 }, { "epoch": 0.36, "grad_norm": 1.0507304668426514, "learning_rate": 9.672465060957479e-05, "loss": 0.1595, "step": 2410 }, { "epoch": 0.36, "grad_norm": 1.4149112701416016, "learning_rate": 9.670978293190605e-05, "loss": 0.1763, "step": 2420 }, { "epoch": 0.36, "grad_norm": 0.5968409180641174, "learning_rate": 9.66949152542373e-05, "loss": 0.1552, "step": 2430 }, { "epoch": 0.36, "grad_norm": 0.7032540440559387, "learning_rate": 9.668004757656854e-05, "loss": 0.159, "step": 2440 }, { "epoch": 0.36, "grad_norm": 1.025107979774475, "learning_rate": 9.66651798988998e-05, "loss": 0.1634, "step": 2450 }, { "epoch": 0.36, "grad_norm": 0.37000682950019836, "learning_rate": 9.665031222123104e-05, "loss": 0.1653, "step": 2460 }, { "epoch": 0.37, "grad_norm": 0.8552184700965881, "learning_rate": 9.66354445435623e-05, "loss": 0.1566, "step": 2470 }, { "epoch": 0.37, "grad_norm": 0.3340959846973419, "learning_rate": 9.662057686589355e-05, "loss": 0.1654, "step": 2480 }, { "epoch": 0.37, "grad_norm": 0.7219660878181458, "learning_rate": 9.66057091882248e-05, "loss": 0.1668, "step": 2490 }, { "epoch": 0.37, "grad_norm": 0.3376657962799072, "learning_rate": 9.659084151055606e-05, "loss": 0.1631, "step": 2500 }, { "epoch": 0.37, "grad_norm": 0.40849828720092773, "learning_rate": 9.657597383288731e-05, "loss": 0.1714, "step": 2510 }, { "epoch": 0.37, "grad_norm": 4.0708909034729, "learning_rate": 9.656110615521857e-05, "loss": 0.1748, "step": 2520 }, { "epoch": 0.38, "grad_norm": 0.8569905757904053, "learning_rate": 9.654623847754981e-05, "loss": 0.1759, "step": 2530 }, { "epoch": 0.38, "grad_norm": 0.6083622574806213, "learning_rate": 9.653137079988106e-05, "loss": 0.1771, "step": 2540 }, { "epoch": 0.38, "grad_norm": 0.9755212068557739, "learning_rate": 9.651650312221232e-05, "loss": 0.1653, "step": 2550 }, { "epoch": 0.38, "grad_norm": 0.2756294906139374, "learning_rate": 9.650163544454357e-05, "loss": 0.161, "step": 2560 }, { "epoch": 0.38, "grad_norm": 0.36292827129364014, "learning_rate": 9.648676776687481e-05, "loss": 0.1627, "step": 2570 }, { "epoch": 0.38, "grad_norm": 0.31777092814445496, "learning_rate": 9.647190008920607e-05, "loss": 0.1635, "step": 2580 }, { "epoch": 0.38, "grad_norm": 0.4227207601070404, "learning_rate": 9.645703241153732e-05, "loss": 0.1693, "step": 2590 }, { "epoch": 0.39, "grad_norm": 0.3526834547519684, "learning_rate": 9.644216473386858e-05, "loss": 0.1567, "step": 2600 }, { "epoch": 0.39, "grad_norm": 0.9331281185150146, "learning_rate": 9.642729705619983e-05, "loss": 0.1691, "step": 2610 }, { "epoch": 0.39, "grad_norm": 1.4683771133422852, "learning_rate": 9.641242937853107e-05, "loss": 0.1773, "step": 2620 }, { "epoch": 0.39, "grad_norm": 2.2021124362945557, "learning_rate": 9.639756170086233e-05, "loss": 0.1749, "step": 2630 }, { "epoch": 0.39, "grad_norm": 0.63923180103302, "learning_rate": 9.638269402319358e-05, "loss": 0.1755, "step": 2640 }, { "epoch": 0.39, "grad_norm": 0.34077224135398865, "learning_rate": 9.636782634552484e-05, "loss": 0.1595, "step": 2650 }, { "epoch": 0.39, "grad_norm": 0.8533111810684204, "learning_rate": 9.635295866785608e-05, "loss": 0.1765, "step": 2660 }, { "epoch": 0.4, "grad_norm": 2.3589627742767334, "learning_rate": 9.633809099018733e-05, "loss": 0.1564, "step": 2670 }, { "epoch": 0.4, "grad_norm": 0.4352911412715912, "learning_rate": 9.632322331251859e-05, "loss": 0.1645, "step": 2680 }, { "epoch": 0.4, "grad_norm": 0.36782145500183105, "learning_rate": 9.630835563484984e-05, "loss": 0.1651, "step": 2690 }, { "epoch": 0.4, "grad_norm": 0.4545479714870453, "learning_rate": 9.62934879571811e-05, "loss": 0.168, "step": 2700 }, { "epoch": 0.4, "grad_norm": 1.7421327829360962, "learning_rate": 9.627862027951234e-05, "loss": 0.1643, "step": 2710 }, { "epoch": 0.4, "grad_norm": 0.7703805565834045, "learning_rate": 9.62637526018436e-05, "loss": 0.1613, "step": 2720 }, { "epoch": 0.4, "grad_norm": 0.7927514314651489, "learning_rate": 9.624888492417485e-05, "loss": 0.1542, "step": 2730 }, { "epoch": 0.41, "grad_norm": 0.9497324824333191, "learning_rate": 9.62340172465061e-05, "loss": 0.169, "step": 2740 }, { "epoch": 0.41, "grad_norm": 0.6463251113891602, "learning_rate": 9.621914956883736e-05, "loss": 0.1545, "step": 2750 }, { "epoch": 0.41, "grad_norm": 0.5164654850959778, "learning_rate": 9.62042818911686e-05, "loss": 0.1746, "step": 2760 }, { "epoch": 0.41, "grad_norm": 1.879035472869873, "learning_rate": 9.618941421349985e-05, "loss": 0.1617, "step": 2770 }, { "epoch": 0.41, "grad_norm": 1.6450806856155396, "learning_rate": 9.617454653583111e-05, "loss": 0.1694, "step": 2780 }, { "epoch": 0.41, "grad_norm": 0.4430038630962372, "learning_rate": 9.615967885816236e-05, "loss": 0.1618, "step": 2790 }, { "epoch": 0.42, "grad_norm": 2.111541509628296, "learning_rate": 9.61448111804936e-05, "loss": 0.1537, "step": 2800 }, { "epoch": 0.42, "grad_norm": 1.177138090133667, "learning_rate": 9.612994350282486e-05, "loss": 0.171, "step": 2810 }, { "epoch": 0.42, "grad_norm": 2.1342763900756836, "learning_rate": 9.611507582515611e-05, "loss": 0.1701, "step": 2820 }, { "epoch": 0.42, "grad_norm": 0.7735350131988525, "learning_rate": 9.610020814748737e-05, "loss": 0.1588, "step": 2830 }, { "epoch": 0.42, "grad_norm": 0.3419579267501831, "learning_rate": 9.608534046981862e-05, "loss": 0.1573, "step": 2840 }, { "epoch": 0.42, "grad_norm": 0.48323726654052734, "learning_rate": 9.607047279214987e-05, "loss": 0.1593, "step": 2850 }, { "epoch": 0.42, "grad_norm": 0.437145471572876, "learning_rate": 9.605560511448112e-05, "loss": 0.1691, "step": 2860 }, { "epoch": 0.43, "grad_norm": 0.7413353323936462, "learning_rate": 9.604073743681237e-05, "loss": 0.1665, "step": 2870 }, { "epoch": 0.43, "grad_norm": 0.7211183905601501, "learning_rate": 9.602586975914363e-05, "loss": 0.155, "step": 2880 }, { "epoch": 0.43, "grad_norm": 0.7412411570549011, "learning_rate": 9.601100208147487e-05, "loss": 0.1627, "step": 2890 }, { "epoch": 0.43, "grad_norm": 0.9689399003982544, "learning_rate": 9.599613440380613e-05, "loss": 0.1734, "step": 2900 }, { "epoch": 0.43, "grad_norm": 0.9807812571525574, "learning_rate": 9.598126672613738e-05, "loss": 0.1668, "step": 2910 }, { "epoch": 0.43, "grad_norm": 2.6782937049865723, "learning_rate": 9.596639904846864e-05, "loss": 0.1802, "step": 2920 }, { "epoch": 0.43, "grad_norm": 0.54747074842453, "learning_rate": 9.595153137079989e-05, "loss": 0.1727, "step": 2930 }, { "epoch": 0.44, "grad_norm": 1.2045674324035645, "learning_rate": 9.593666369313113e-05, "loss": 0.1677, "step": 2940 }, { "epoch": 0.44, "grad_norm": 1.2668368816375732, "learning_rate": 9.592179601546239e-05, "loss": 0.1652, "step": 2950 }, { "epoch": 0.44, "grad_norm": 0.7421100735664368, "learning_rate": 9.590692833779364e-05, "loss": 0.1552, "step": 2960 }, { "epoch": 0.44, "grad_norm": 0.7410224676132202, "learning_rate": 9.58920606601249e-05, "loss": 0.1569, "step": 2970 }, { "epoch": 0.44, "grad_norm": 0.5380434989929199, "learning_rate": 9.587719298245614e-05, "loss": 0.1676, "step": 2980 }, { "epoch": 0.44, "grad_norm": 1.3664904832839966, "learning_rate": 9.586232530478739e-05, "loss": 0.1629, "step": 2990 }, { "epoch": 0.44, "grad_norm": 1.5946515798568726, "learning_rate": 9.584745762711866e-05, "loss": 0.1779, "step": 3000 }, { "epoch": 0.45, "grad_norm": 1.7002317905426025, "learning_rate": 9.58325899494499e-05, "loss": 0.1769, "step": 3010 }, { "epoch": 0.45, "grad_norm": 0.5792291164398193, "learning_rate": 9.581772227178116e-05, "loss": 0.173, "step": 3020 }, { "epoch": 0.45, "grad_norm": 2.637342691421509, "learning_rate": 9.58028545941124e-05, "loss": 0.184, "step": 3030 }, { "epoch": 0.45, "grad_norm": 0.315929651260376, "learning_rate": 9.578798691644367e-05, "loss": 0.1626, "step": 3040 }, { "epoch": 0.45, "grad_norm": 1.4628597497940063, "learning_rate": 9.57731192387749e-05, "loss": 0.1585, "step": 3050 }, { "epoch": 0.45, "grad_norm": 1.037359356880188, "learning_rate": 9.575825156110616e-05, "loss": 0.1659, "step": 3060 }, { "epoch": 0.46, "grad_norm": 0.7535769939422607, "learning_rate": 9.574338388343742e-05, "loss": 0.1552, "step": 3070 }, { "epoch": 0.46, "grad_norm": 1.6997020244598389, "learning_rate": 9.572851620576866e-05, "loss": 0.161, "step": 3080 }, { "epoch": 0.46, "grad_norm": 0.6227778196334839, "learning_rate": 9.571364852809993e-05, "loss": 0.1664, "step": 3090 }, { "epoch": 0.46, "grad_norm": 0.9602690935134888, "learning_rate": 9.569878085043117e-05, "loss": 0.1588, "step": 3100 }, { "epoch": 0.46, "grad_norm": 0.9173592925071716, "learning_rate": 9.568391317276242e-05, "loss": 0.1628, "step": 3110 }, { "epoch": 0.46, "grad_norm": 1.1894303560256958, "learning_rate": 9.566904549509366e-05, "loss": 0.1629, "step": 3120 }, { "epoch": 0.46, "grad_norm": 0.7936124205589294, "learning_rate": 9.565417781742493e-05, "loss": 0.169, "step": 3130 }, { "epoch": 0.47, "grad_norm": 0.7208412289619446, "learning_rate": 9.563931013975617e-05, "loss": 0.1766, "step": 3140 }, { "epoch": 0.47, "grad_norm": 1.2436951398849487, "learning_rate": 9.562444246208743e-05, "loss": 0.1621, "step": 3150 }, { "epoch": 0.47, "grad_norm": 1.0052576065063477, "learning_rate": 9.560957478441868e-05, "loss": 0.1607, "step": 3160 }, { "epoch": 0.47, "grad_norm": 2.342569589614868, "learning_rate": 9.559470710674992e-05, "loss": 0.1655, "step": 3170 }, { "epoch": 0.47, "grad_norm": 0.5531949400901794, "learning_rate": 9.557983942908119e-05, "loss": 0.1684, "step": 3180 }, { "epoch": 0.47, "grad_norm": 0.5959020256996155, "learning_rate": 9.556497175141243e-05, "loss": 0.1592, "step": 3190 }, { "epoch": 0.47, "grad_norm": 0.7381748557090759, "learning_rate": 9.555010407374369e-05, "loss": 0.1689, "step": 3200 }, { "epoch": 0.48, "grad_norm": 1.3665748834609985, "learning_rate": 9.553523639607493e-05, "loss": 0.157, "step": 3210 }, { "epoch": 0.48, "grad_norm": 1.744511365890503, "learning_rate": 9.55203687184062e-05, "loss": 0.1715, "step": 3220 }, { "epoch": 0.48, "grad_norm": 1.2675743103027344, "learning_rate": 9.550550104073744e-05, "loss": 0.1627, "step": 3230 }, { "epoch": 0.48, "grad_norm": 0.7240175008773804, "learning_rate": 9.549063336306869e-05, "loss": 0.1652, "step": 3240 }, { "epoch": 0.48, "grad_norm": 1.61540687084198, "learning_rate": 9.547576568539995e-05, "loss": 0.182, "step": 3250 }, { "epoch": 0.48, "grad_norm": 1.7293829917907715, "learning_rate": 9.546089800773119e-05, "loss": 0.173, "step": 3260 }, { "epoch": 0.48, "grad_norm": 1.0945603847503662, "learning_rate": 9.544603033006246e-05, "loss": 0.1598, "step": 3270 }, { "epoch": 0.49, "grad_norm": 0.4362248182296753, "learning_rate": 9.54311626523937e-05, "loss": 0.1634, "step": 3280 }, { "epoch": 0.49, "grad_norm": 0.6548261642456055, "learning_rate": 9.541629497472495e-05, "loss": 0.1594, "step": 3290 }, { "epoch": 0.49, "grad_norm": 0.3333025872707367, "learning_rate": 9.54014272970562e-05, "loss": 0.1636, "step": 3300 }, { "epoch": 0.49, "grad_norm": 1.0312073230743408, "learning_rate": 9.538655961938746e-05, "loss": 0.1623, "step": 3310 }, { "epoch": 0.49, "grad_norm": 1.9303184747695923, "learning_rate": 9.53716919417187e-05, "loss": 0.1667, "step": 3320 }, { "epoch": 0.49, "grad_norm": 0.3857884109020233, "learning_rate": 9.535682426404996e-05, "loss": 0.1604, "step": 3330 }, { "epoch": 0.5, "grad_norm": 0.8410763144493103, "learning_rate": 9.534195658638121e-05, "loss": 0.1536, "step": 3340 }, { "epoch": 0.5, "grad_norm": 0.9080554842948914, "learning_rate": 9.532708890871245e-05, "loss": 0.1625, "step": 3350 }, { "epoch": 0.5, "grad_norm": 1.0813413858413696, "learning_rate": 9.531222123104372e-05, "loss": 0.1587, "step": 3360 }, { "epoch": 0.5, "grad_norm": 0.5331724286079407, "learning_rate": 9.529735355337496e-05, "loss": 0.1606, "step": 3370 }, { "epoch": 0.5, "grad_norm": 2.220775842666626, "learning_rate": 9.528248587570622e-05, "loss": 0.1642, "step": 3380 }, { "epoch": 0.5, "grad_norm": 1.435058355331421, "learning_rate": 9.526761819803747e-05, "loss": 0.1704, "step": 3390 }, { "epoch": 0.5, "grad_norm": 1.1016638278961182, "learning_rate": 9.525275052036873e-05, "loss": 0.1602, "step": 3400 }, { "epoch": 0.51, "grad_norm": 1.0263643264770508, "learning_rate": 9.523788284269998e-05, "loss": 0.1734, "step": 3410 }, { "epoch": 0.51, "grad_norm": 1.1863274574279785, "learning_rate": 9.522301516503122e-05, "loss": 0.1804, "step": 3420 }, { "epoch": 0.51, "grad_norm": 0.5040327310562134, "learning_rate": 9.520814748736248e-05, "loss": 0.1688, "step": 3430 }, { "epoch": 0.51, "grad_norm": 0.4307524561882019, "learning_rate": 9.519327980969372e-05, "loss": 0.162, "step": 3440 }, { "epoch": 0.51, "grad_norm": 1.9712070226669312, "learning_rate": 9.517841213202499e-05, "loss": 0.171, "step": 3450 }, { "epoch": 0.51, "grad_norm": 0.4332945942878723, "learning_rate": 9.516354445435623e-05, "loss": 0.1603, "step": 3460 }, { "epoch": 0.51, "grad_norm": 0.5506505966186523, "learning_rate": 9.514867677668748e-05, "loss": 0.156, "step": 3470 }, { "epoch": 0.52, "grad_norm": 1.394182562828064, "learning_rate": 9.513380909901874e-05, "loss": 0.1645, "step": 3480 }, { "epoch": 0.52, "grad_norm": 0.7404809594154358, "learning_rate": 9.511894142135e-05, "loss": 0.1652, "step": 3490 }, { "epoch": 0.52, "grad_norm": 2.453972339630127, "learning_rate": 9.510407374368125e-05, "loss": 0.17, "step": 3500 }, { "epoch": 0.52, "grad_norm": 1.8291536569595337, "learning_rate": 9.508920606601249e-05, "loss": 0.1711, "step": 3510 }, { "epoch": 0.52, "grad_norm": 1.329593539237976, "learning_rate": 9.507433838834374e-05, "loss": 0.1689, "step": 3520 }, { "epoch": 0.52, "grad_norm": 1.166062593460083, "learning_rate": 9.5059470710675e-05, "loss": 0.1697, "step": 3530 }, { "epoch": 0.52, "grad_norm": 0.4099891185760498, "learning_rate": 9.504460303300625e-05, "loss": 0.1536, "step": 3540 }, { "epoch": 0.53, "grad_norm": 0.6420068740844727, "learning_rate": 9.50297353553375e-05, "loss": 0.1588, "step": 3550 }, { "epoch": 0.53, "grad_norm": 1.9775470495224, "learning_rate": 9.501486767766875e-05, "loss": 0.1709, "step": 3560 }, { "epoch": 0.53, "grad_norm": 1.8352495431900024, "learning_rate": 9.5e-05, "loss": 0.1669, "step": 3570 }, { "epoch": 0.53, "grad_norm": 1.752253532409668, "learning_rate": 9.498513232233126e-05, "loss": 0.1556, "step": 3580 }, { "epoch": 0.53, "grad_norm": 1.41259765625, "learning_rate": 9.497026464466251e-05, "loss": 0.1606, "step": 3590 }, { "epoch": 0.53, "grad_norm": 0.6287984251976013, "learning_rate": 9.495539696699376e-05, "loss": 0.1553, "step": 3600 }, { "epoch": 0.54, "grad_norm": 0.47083550691604614, "learning_rate": 9.494052928932501e-05, "loss": 0.1577, "step": 3610 }, { "epoch": 0.54, "grad_norm": 0.32023587822914124, "learning_rate": 9.492566161165627e-05, "loss": 0.158, "step": 3620 }, { "epoch": 0.54, "grad_norm": 2.142852544784546, "learning_rate": 9.491079393398752e-05, "loss": 0.167, "step": 3630 }, { "epoch": 0.54, "grad_norm": 0.24139684438705444, "learning_rate": 9.489592625631876e-05, "loss": 0.1673, "step": 3640 }, { "epoch": 0.54, "grad_norm": 0.6434214115142822, "learning_rate": 9.488105857865002e-05, "loss": 0.1657, "step": 3650 }, { "epoch": 0.54, "grad_norm": 0.7096123695373535, "learning_rate": 9.486619090098127e-05, "loss": 0.1685, "step": 3660 }, { "epoch": 0.54, "grad_norm": 0.4420715868473053, "learning_rate": 9.485132322331253e-05, "loss": 0.1735, "step": 3670 }, { "epoch": 0.55, "grad_norm": 1.7800606489181519, "learning_rate": 9.483645554564378e-05, "loss": 0.1563, "step": 3680 }, { "epoch": 0.55, "grad_norm": 0.5434880256652832, "learning_rate": 9.482158786797502e-05, "loss": 0.1669, "step": 3690 }, { "epoch": 0.55, "grad_norm": 0.879857063293457, "learning_rate": 9.480672019030628e-05, "loss": 0.1715, "step": 3700 }, { "epoch": 0.55, "grad_norm": 1.1706616878509521, "learning_rate": 9.479185251263753e-05, "loss": 0.168, "step": 3710 }, { "epoch": 0.55, "grad_norm": 1.0965744256973267, "learning_rate": 9.477698483496879e-05, "loss": 0.1684, "step": 3720 }, { "epoch": 0.55, "grad_norm": 0.4567830562591553, "learning_rate": 9.476211715730004e-05, "loss": 0.1566, "step": 3730 }, { "epoch": 0.55, "grad_norm": 0.6394124031066895, "learning_rate": 9.474724947963128e-05, "loss": 0.1629, "step": 3740 }, { "epoch": 0.56, "grad_norm": 0.6630517244338989, "learning_rate": 9.473238180196254e-05, "loss": 0.1519, "step": 3750 }, { "epoch": 0.56, "grad_norm": 1.7115375995635986, "learning_rate": 9.471751412429379e-05, "loss": 0.1602, "step": 3760 }, { "epoch": 0.56, "grad_norm": 0.2985321581363678, "learning_rate": 9.470264644662505e-05, "loss": 0.154, "step": 3770 }, { "epoch": 0.56, "grad_norm": 1.459088921546936, "learning_rate": 9.468777876895629e-05, "loss": 0.1651, "step": 3780 }, { "epoch": 0.56, "grad_norm": 0.6432485580444336, "learning_rate": 9.467291109128754e-05, "loss": 0.1618, "step": 3790 }, { "epoch": 0.56, "grad_norm": 0.6554367542266846, "learning_rate": 9.46580434136188e-05, "loss": 0.1554, "step": 3800 }, { "epoch": 0.56, "grad_norm": 0.821692705154419, "learning_rate": 9.464317573595005e-05, "loss": 0.1665, "step": 3810 }, { "epoch": 0.57, "grad_norm": 0.4133671522140503, "learning_rate": 9.46283080582813e-05, "loss": 0.1574, "step": 3820 }, { "epoch": 0.57, "grad_norm": 1.609708547592163, "learning_rate": 9.461344038061255e-05, "loss": 0.1658, "step": 3830 }, { "epoch": 0.57, "grad_norm": 0.29960155487060547, "learning_rate": 9.45985727029438e-05, "loss": 0.1638, "step": 3840 }, { "epoch": 0.57, "grad_norm": 0.37204432487487793, "learning_rate": 9.458370502527506e-05, "loss": 0.1502, "step": 3850 }, { "epoch": 0.57, "grad_norm": 1.4778790473937988, "learning_rate": 9.456883734760631e-05, "loss": 0.1585, "step": 3860 }, { "epoch": 0.57, "grad_norm": 0.30051159858703613, "learning_rate": 9.455396966993755e-05, "loss": 0.1595, "step": 3870 }, { "epoch": 0.58, "grad_norm": 0.3415975570678711, "learning_rate": 9.453910199226881e-05, "loss": 0.1556, "step": 3880 }, { "epoch": 0.58, "grad_norm": 2.3571910858154297, "learning_rate": 9.452423431460006e-05, "loss": 0.1657, "step": 3890 }, { "epoch": 0.58, "grad_norm": 1.502201795578003, "learning_rate": 9.450936663693132e-05, "loss": 0.1525, "step": 3900 }, { "epoch": 0.58, "grad_norm": 0.4980350434780121, "learning_rate": 9.449449895926257e-05, "loss": 0.1575, "step": 3910 }, { "epoch": 0.58, "grad_norm": 0.7586767077445984, "learning_rate": 9.447963128159381e-05, "loss": 0.1685, "step": 3920 }, { "epoch": 0.58, "grad_norm": 0.33721524477005005, "learning_rate": 9.446476360392507e-05, "loss": 0.1616, "step": 3930 }, { "epoch": 0.58, "grad_norm": 0.36988070607185364, "learning_rate": 9.444989592625632e-05, "loss": 0.1645, "step": 3940 }, { "epoch": 0.59, "grad_norm": 0.6627219915390015, "learning_rate": 9.443502824858758e-05, "loss": 0.1507, "step": 3950 }, { "epoch": 0.59, "grad_norm": 0.4597422778606415, "learning_rate": 9.442016057091882e-05, "loss": 0.1492, "step": 3960 }, { "epoch": 0.59, "grad_norm": 0.6863605380058289, "learning_rate": 9.440529289325007e-05, "loss": 0.1649, "step": 3970 }, { "epoch": 0.59, "grad_norm": 0.924536943435669, "learning_rate": 9.439042521558133e-05, "loss": 0.1605, "step": 3980 }, { "epoch": 0.59, "grad_norm": 1.3096588850021362, "learning_rate": 9.437555753791258e-05, "loss": 0.1642, "step": 3990 }, { "epoch": 0.59, "grad_norm": 0.33845776319503784, "learning_rate": 9.436068986024384e-05, "loss": 0.1631, "step": 4000 }, { "epoch": 0.59, "grad_norm": 0.4413401782512665, "learning_rate": 9.434582218257508e-05, "loss": 0.1574, "step": 4010 }, { "epoch": 0.6, "grad_norm": 3.155444622039795, "learning_rate": 9.433095450490635e-05, "loss": 0.1685, "step": 4020 }, { "epoch": 0.6, "grad_norm": 0.26439428329467773, "learning_rate": 9.431608682723759e-05, "loss": 0.1585, "step": 4030 }, { "epoch": 0.6, "grad_norm": 0.3483470380306244, "learning_rate": 9.430121914956884e-05, "loss": 0.1636, "step": 4040 }, { "epoch": 0.6, "grad_norm": 0.4697681665420532, "learning_rate": 9.42863514719001e-05, "loss": 0.1528, "step": 4050 }, { "epoch": 0.6, "grad_norm": 0.4681277871131897, "learning_rate": 9.427148379423134e-05, "loss": 0.1492, "step": 4060 }, { "epoch": 0.6, "grad_norm": 1.5307562351226807, "learning_rate": 9.425661611656261e-05, "loss": 0.1599, "step": 4070 }, { "epoch": 0.6, "grad_norm": 0.5165395736694336, "learning_rate": 9.424174843889385e-05, "loss": 0.1605, "step": 4080 }, { "epoch": 0.61, "grad_norm": 1.7521703243255615, "learning_rate": 9.42268807612251e-05, "loss": 0.1564, "step": 4090 }, { "epoch": 0.61, "grad_norm": 1.0230251550674438, "learning_rate": 9.421201308355635e-05, "loss": 0.1552, "step": 4100 }, { "epoch": 0.61, "grad_norm": 1.3919179439544678, "learning_rate": 9.419714540588761e-05, "loss": 0.1598, "step": 4110 }, { "epoch": 0.61, "grad_norm": 1.1220026016235352, "learning_rate": 9.418227772821885e-05, "loss": 0.1674, "step": 4120 }, { "epoch": 0.61, "grad_norm": 1.90169358253479, "learning_rate": 9.416741005055011e-05, "loss": 0.1635, "step": 4130 }, { "epoch": 0.61, "grad_norm": 2.0692789554595947, "learning_rate": 9.415254237288136e-05, "loss": 0.1698, "step": 4140 }, { "epoch": 0.62, "grad_norm": 0.22812828421592712, "learning_rate": 9.41376746952126e-05, "loss": 0.1542, "step": 4150 }, { "epoch": 0.62, "grad_norm": 1.1693663597106934, "learning_rate": 9.412280701754387e-05, "loss": 0.1615, "step": 4160 }, { "epoch": 0.62, "grad_norm": 0.3354921042919159, "learning_rate": 9.410793933987511e-05, "loss": 0.1671, "step": 4170 }, { "epoch": 0.62, "grad_norm": 0.6962907314300537, "learning_rate": 9.409307166220637e-05, "loss": 0.1613, "step": 4180 }, { "epoch": 0.62, "grad_norm": 0.6421184539794922, "learning_rate": 9.407820398453761e-05, "loss": 0.1618, "step": 4190 }, { "epoch": 0.62, "grad_norm": 0.7653113007545471, "learning_rate": 9.406333630686888e-05, "loss": 0.1654, "step": 4200 }, { "epoch": 0.62, "grad_norm": 2.4837770462036133, "learning_rate": 9.404846862920012e-05, "loss": 0.1732, "step": 4210 }, { "epoch": 0.63, "grad_norm": 0.7182124853134155, "learning_rate": 9.403360095153138e-05, "loss": 0.1543, "step": 4220 }, { "epoch": 0.63, "grad_norm": 0.36238598823547363, "learning_rate": 9.401873327386263e-05, "loss": 0.1632, "step": 4230 }, { "epoch": 0.63, "grad_norm": 0.8226011991500854, "learning_rate": 9.400386559619387e-05, "loss": 0.1523, "step": 4240 }, { "epoch": 0.63, "grad_norm": 1.411160945892334, "learning_rate": 9.398899791852514e-05, "loss": 0.1557, "step": 4250 }, { "epoch": 0.63, "grad_norm": 1.1795895099639893, "learning_rate": 9.397413024085638e-05, "loss": 0.166, "step": 4260 }, { "epoch": 0.63, "grad_norm": 0.4530087113380432, "learning_rate": 9.395926256318764e-05, "loss": 0.1567, "step": 4270 }, { "epoch": 0.63, "grad_norm": 2.1539831161499023, "learning_rate": 9.394439488551888e-05, "loss": 0.1627, "step": 4280 }, { "epoch": 0.64, "grad_norm": 0.4007501006126404, "learning_rate": 9.392952720785015e-05, "loss": 0.1609, "step": 4290 }, { "epoch": 0.64, "grad_norm": 0.5847845673561096, "learning_rate": 9.391465953018139e-05, "loss": 0.16, "step": 4300 }, { "epoch": 0.64, "grad_norm": 0.2719428241252899, "learning_rate": 9.389979185251264e-05, "loss": 0.1712, "step": 4310 }, { "epoch": 0.64, "grad_norm": 1.8159836530685425, "learning_rate": 9.38849241748439e-05, "loss": 0.1618, "step": 4320 }, { "epoch": 0.64, "grad_norm": 2.801260232925415, "learning_rate": 9.387005649717514e-05, "loss": 0.1607, "step": 4330 }, { "epoch": 0.64, "grad_norm": 1.856196641921997, "learning_rate": 9.38551888195064e-05, "loss": 0.1585, "step": 4340 }, { "epoch": 0.64, "grad_norm": 0.8878040909767151, "learning_rate": 9.384032114183765e-05, "loss": 0.1576, "step": 4350 }, { "epoch": 0.65, "grad_norm": 0.753246009349823, "learning_rate": 9.38254534641689e-05, "loss": 0.1705, "step": 4360 }, { "epoch": 0.65, "grad_norm": 0.5893975496292114, "learning_rate": 9.381058578650014e-05, "loss": 0.1597, "step": 4370 }, { "epoch": 0.65, "grad_norm": 1.1142498254776, "learning_rate": 9.379571810883141e-05, "loss": 0.1548, "step": 4380 }, { "epoch": 0.65, "grad_norm": 0.9535636901855469, "learning_rate": 9.378085043116267e-05, "loss": 0.1747, "step": 4390 }, { "epoch": 0.65, "grad_norm": 1.1514941453933716, "learning_rate": 9.376598275349391e-05, "loss": 0.1619, "step": 4400 }, { "epoch": 0.65, "grad_norm": 0.8565515875816345, "learning_rate": 9.375111507582516e-05, "loss": 0.1619, "step": 4410 }, { "epoch": 0.66, "grad_norm": 0.49910810589790344, "learning_rate": 9.37362473981564e-05, "loss": 0.1742, "step": 4420 }, { "epoch": 0.66, "grad_norm": 0.327064573764801, "learning_rate": 9.372137972048767e-05, "loss": 0.15, "step": 4430 }, { "epoch": 0.66, "grad_norm": 0.32551905512809753, "learning_rate": 9.370651204281891e-05, "loss": 0.1617, "step": 4440 }, { "epoch": 0.66, "grad_norm": 0.39346516132354736, "learning_rate": 9.369164436515017e-05, "loss": 0.1597, "step": 4450 }, { "epoch": 0.66, "grad_norm": 0.9522442817687988, "learning_rate": 9.367677668748142e-05, "loss": 0.1538, "step": 4460 }, { "epoch": 0.66, "grad_norm": 1.2629470825195312, "learning_rate": 9.366190900981268e-05, "loss": 0.1596, "step": 4470 }, { "epoch": 0.66, "grad_norm": 0.3080795109272003, "learning_rate": 9.364704133214393e-05, "loss": 0.1606, "step": 4480 }, { "epoch": 0.67, "grad_norm": 0.3222214877605438, "learning_rate": 9.363217365447517e-05, "loss": 0.1502, "step": 4490 }, { "epoch": 0.67, "grad_norm": 1.2670872211456299, "learning_rate": 9.361730597680643e-05, "loss": 0.1571, "step": 4500 }, { "epoch": 0.67, "grad_norm": 1.1200741529464722, "learning_rate": 9.360243829913768e-05, "loss": 0.1644, "step": 4510 }, { "epoch": 0.67, "grad_norm": 0.3835075795650482, "learning_rate": 9.358757062146894e-05, "loss": 0.1548, "step": 4520 }, { "epoch": 0.67, "grad_norm": 1.4065500497817993, "learning_rate": 9.357270294380018e-05, "loss": 0.1622, "step": 4530 }, { "epoch": 0.67, "grad_norm": 1.0452622175216675, "learning_rate": 9.355783526613143e-05, "loss": 0.1646, "step": 4540 }, { "epoch": 0.67, "grad_norm": 0.7965924739837646, "learning_rate": 9.354296758846269e-05, "loss": 0.1689, "step": 4550 }, { "epoch": 0.68, "grad_norm": 1.2334436178207397, "learning_rate": 9.352809991079394e-05, "loss": 0.1754, "step": 4560 }, { "epoch": 0.68, "grad_norm": 1.1238471269607544, "learning_rate": 9.35132322331252e-05, "loss": 0.1593, "step": 4570 }, { "epoch": 0.68, "grad_norm": 0.39519649744033813, "learning_rate": 9.349836455545644e-05, "loss": 0.1664, "step": 4580 }, { "epoch": 0.68, "grad_norm": 0.2505687177181244, "learning_rate": 9.348349687778769e-05, "loss": 0.1644, "step": 4590 }, { "epoch": 0.68, "grad_norm": 1.1792103052139282, "learning_rate": 9.346862920011895e-05, "loss": 0.1644, "step": 4600 }, { "epoch": 0.68, "grad_norm": 0.8339989185333252, "learning_rate": 9.34537615224502e-05, "loss": 0.158, "step": 4610 }, { "epoch": 0.68, "grad_norm": 0.2600146532058716, "learning_rate": 9.343889384478144e-05, "loss": 0.1657, "step": 4620 }, { "epoch": 0.69, "grad_norm": 1.7673107385635376, "learning_rate": 9.34240261671127e-05, "loss": 0.1624, "step": 4630 }, { "epoch": 0.69, "grad_norm": 0.3657977283000946, "learning_rate": 9.340915848944395e-05, "loss": 0.165, "step": 4640 }, { "epoch": 0.69, "grad_norm": 0.5369674563407898, "learning_rate": 9.339429081177521e-05, "loss": 0.1643, "step": 4650 }, { "epoch": 0.69, "grad_norm": 1.0297385454177856, "learning_rate": 9.337942313410646e-05, "loss": 0.1551, "step": 4660 }, { "epoch": 0.69, "grad_norm": 1.2391917705535889, "learning_rate": 9.33645554564377e-05, "loss": 0.1541, "step": 4670 }, { "epoch": 0.69, "grad_norm": 1.3810855150222778, "learning_rate": 9.334968777876896e-05, "loss": 0.1597, "step": 4680 }, { "epoch": 0.7, "grad_norm": 1.7452163696289062, "learning_rate": 9.333482010110021e-05, "loss": 0.1672, "step": 4690 }, { "epoch": 0.7, "grad_norm": 1.4795212745666504, "learning_rate": 9.331995242343147e-05, "loss": 0.1686, "step": 4700 }, { "epoch": 0.7, "grad_norm": 1.1324572563171387, "learning_rate": 9.330508474576271e-05, "loss": 0.1568, "step": 4710 }, { "epoch": 0.7, "grad_norm": 1.038382887840271, "learning_rate": 9.329021706809396e-05, "loss": 0.1605, "step": 4720 }, { "epoch": 0.7, "grad_norm": 0.45325565338134766, "learning_rate": 9.327534939042522e-05, "loss": 0.1497, "step": 4730 }, { "epoch": 0.7, "grad_norm": 0.37989452481269836, "learning_rate": 9.326048171275647e-05, "loss": 0.1581, "step": 4740 }, { "epoch": 0.7, "grad_norm": 0.8576381802558899, "learning_rate": 9.324561403508773e-05, "loss": 0.1636, "step": 4750 }, { "epoch": 0.71, "grad_norm": 0.400477796792984, "learning_rate": 9.323074635741897e-05, "loss": 0.1526, "step": 4760 }, { "epoch": 0.71, "grad_norm": 0.37354516983032227, "learning_rate": 9.321587867975022e-05, "loss": 0.1575, "step": 4770 }, { "epoch": 0.71, "grad_norm": 2.289604425430298, "learning_rate": 9.320101100208148e-05, "loss": 0.1655, "step": 4780 }, { "epoch": 0.71, "grad_norm": 1.4728387594223022, "learning_rate": 9.318614332441273e-05, "loss": 0.1626, "step": 4790 }, { "epoch": 0.71, "grad_norm": 1.617341160774231, "learning_rate": 9.317127564674399e-05, "loss": 0.1641, "step": 4800 }, { "epoch": 0.71, "grad_norm": 1.5229002237319946, "learning_rate": 9.315640796907523e-05, "loss": 0.1575, "step": 4810 }, { "epoch": 0.71, "grad_norm": 0.7396848797798157, "learning_rate": 9.314154029140648e-05, "loss": 0.1692, "step": 4820 }, { "epoch": 0.72, "grad_norm": 1.8826942443847656, "learning_rate": 9.312667261373774e-05, "loss": 0.154, "step": 4830 }, { "epoch": 0.72, "grad_norm": 0.8908949494361877, "learning_rate": 9.3111804936069e-05, "loss": 0.1576, "step": 4840 }, { "epoch": 0.72, "grad_norm": 0.829403281211853, "learning_rate": 9.309693725840024e-05, "loss": 0.1626, "step": 4850 }, { "epoch": 0.72, "grad_norm": 1.9040987491607666, "learning_rate": 9.308206958073149e-05, "loss": 0.1617, "step": 4860 }, { "epoch": 0.72, "grad_norm": 0.24965029954910278, "learning_rate": 9.306720190306275e-05, "loss": 0.1677, "step": 4870 }, { "epoch": 0.72, "grad_norm": 0.3442169427871704, "learning_rate": 9.3052334225394e-05, "loss": 0.1563, "step": 4880 }, { "epoch": 0.72, "grad_norm": 0.4237022399902344, "learning_rate": 9.303746654772525e-05, "loss": 0.1645, "step": 4890 }, { "epoch": 0.73, "grad_norm": 0.734087347984314, "learning_rate": 9.30225988700565e-05, "loss": 0.1523, "step": 4900 }, { "epoch": 0.73, "grad_norm": 0.896193265914917, "learning_rate": 9.300773119238775e-05, "loss": 0.167, "step": 4910 }, { "epoch": 0.73, "grad_norm": 0.47176918387413025, "learning_rate": 9.2992863514719e-05, "loss": 0.1584, "step": 4920 }, { "epoch": 0.73, "grad_norm": 1.4008644819259644, "learning_rate": 9.297799583705026e-05, "loss": 0.1584, "step": 4930 }, { "epoch": 0.73, "grad_norm": 1.161672830581665, "learning_rate": 9.29631281593815e-05, "loss": 0.1653, "step": 4940 }, { "epoch": 0.73, "grad_norm": 1.0139836072921753, "learning_rate": 9.294826048171276e-05, "loss": 0.1574, "step": 4950 }, { "epoch": 0.74, "grad_norm": 0.755262017250061, "learning_rate": 9.293339280404401e-05, "loss": 0.1549, "step": 4960 }, { "epoch": 0.74, "grad_norm": 0.763871431350708, "learning_rate": 9.291852512637527e-05, "loss": 0.1598, "step": 4970 }, { "epoch": 0.74, "grad_norm": 0.4599681496620178, "learning_rate": 9.290365744870652e-05, "loss": 0.1581, "step": 4980 }, { "epoch": 0.74, "grad_norm": 0.6009973287582397, "learning_rate": 9.288878977103776e-05, "loss": 0.1627, "step": 4990 }, { "epoch": 0.74, "grad_norm": 0.40760254859924316, "learning_rate": 9.287392209336903e-05, "loss": 0.1633, "step": 5000 }, { "epoch": 0.74, "grad_norm": 0.41892722249031067, "learning_rate": 9.285905441570027e-05, "loss": 0.1606, "step": 5010 }, { "epoch": 0.74, "grad_norm": 0.4405099153518677, "learning_rate": 9.284418673803153e-05, "loss": 0.1565, "step": 5020 }, { "epoch": 0.75, "grad_norm": 0.49205201864242554, "learning_rate": 9.282931906036277e-05, "loss": 0.1631, "step": 5030 }, { "epoch": 0.75, "grad_norm": 0.3461616635322571, "learning_rate": 9.281445138269402e-05, "loss": 0.1629, "step": 5040 }, { "epoch": 0.75, "grad_norm": 0.33874744176864624, "learning_rate": 9.279958370502529e-05, "loss": 0.1497, "step": 5050 }, { "epoch": 0.75, "grad_norm": 0.9239627122879028, "learning_rate": 9.278471602735653e-05, "loss": 0.1568, "step": 5060 }, { "epoch": 0.75, "grad_norm": 1.629723310470581, "learning_rate": 9.276984834968779e-05, "loss": 0.1583, "step": 5070 }, { "epoch": 0.75, "grad_norm": 0.38796573877334595, "learning_rate": 9.275498067201903e-05, "loss": 0.1595, "step": 5080 }, { "epoch": 0.75, "grad_norm": 1.2880231142044067, "learning_rate": 9.27401129943503e-05, "loss": 0.1649, "step": 5090 }, { "epoch": 0.76, "grad_norm": 0.4222949147224426, "learning_rate": 9.272524531668154e-05, "loss": 0.1614, "step": 5100 }, { "epoch": 0.76, "grad_norm": 1.258013367652893, "learning_rate": 9.271037763901279e-05, "loss": 0.1645, "step": 5110 }, { "epoch": 0.76, "grad_norm": 1.2159521579742432, "learning_rate": 9.269550996134405e-05, "loss": 0.1536, "step": 5120 }, { "epoch": 0.76, "grad_norm": 0.3227091133594513, "learning_rate": 9.268064228367529e-05, "loss": 0.1621, "step": 5130 }, { "epoch": 0.76, "grad_norm": 1.7758958339691162, "learning_rate": 9.266577460600656e-05, "loss": 0.1545, "step": 5140 }, { "epoch": 0.76, "grad_norm": 1.6320562362670898, "learning_rate": 9.26509069283378e-05, "loss": 0.1676, "step": 5150 }, { "epoch": 0.76, "grad_norm": 0.9210827350616455, "learning_rate": 9.263603925066905e-05, "loss": 0.157, "step": 5160 }, { "epoch": 0.77, "grad_norm": 0.6037706732749939, "learning_rate": 9.26211715730003e-05, "loss": 0.1535, "step": 5170 }, { "epoch": 0.77, "grad_norm": 2.3255116939544678, "learning_rate": 9.260630389533156e-05, "loss": 0.1681, "step": 5180 }, { "epoch": 0.77, "grad_norm": 0.8426361680030823, "learning_rate": 9.25914362176628e-05, "loss": 0.1633, "step": 5190 }, { "epoch": 0.77, "grad_norm": 0.29783931374549866, "learning_rate": 9.257656853999406e-05, "loss": 0.1557, "step": 5200 }, { "epoch": 0.77, "grad_norm": 0.6732050776481628, "learning_rate": 9.256170086232531e-05, "loss": 0.1604, "step": 5210 }, { "epoch": 0.77, "grad_norm": 0.9958044290542603, "learning_rate": 9.254683318465655e-05, "loss": 0.1654, "step": 5220 }, { "epoch": 0.78, "grad_norm": 1.254502534866333, "learning_rate": 9.253196550698782e-05, "loss": 0.1559, "step": 5230 }, { "epoch": 0.78, "grad_norm": 0.2834548056125641, "learning_rate": 9.251709782931906e-05, "loss": 0.1625, "step": 5240 }, { "epoch": 0.78, "grad_norm": 0.3441438376903534, "learning_rate": 9.250223015165032e-05, "loss": 0.1511, "step": 5250 }, { "epoch": 0.78, "grad_norm": 0.31368038058280945, "learning_rate": 9.248736247398156e-05, "loss": 0.1587, "step": 5260 }, { "epoch": 0.78, "grad_norm": 0.5394402146339417, "learning_rate": 9.247249479631283e-05, "loss": 0.1582, "step": 5270 }, { "epoch": 0.78, "grad_norm": 0.29884475469589233, "learning_rate": 9.245762711864407e-05, "loss": 0.1552, "step": 5280 }, { "epoch": 0.78, "grad_norm": 1.530449628829956, "learning_rate": 9.244275944097532e-05, "loss": 0.1703, "step": 5290 }, { "epoch": 0.79, "grad_norm": 2.9983556270599365, "learning_rate": 9.242789176330658e-05, "loss": 0.1549, "step": 5300 }, { "epoch": 0.79, "grad_norm": 0.7032681703567505, "learning_rate": 9.241302408563782e-05, "loss": 0.1609, "step": 5310 }, { "epoch": 0.79, "grad_norm": 1.7556791305541992, "learning_rate": 9.239815640796909e-05, "loss": 0.1576, "step": 5320 }, { "epoch": 0.79, "grad_norm": 0.46143102645874023, "learning_rate": 9.238328873030033e-05, "loss": 0.1725, "step": 5330 }, { "epoch": 0.79, "grad_norm": 0.6132158041000366, "learning_rate": 9.236842105263158e-05, "loss": 0.1578, "step": 5340 }, { "epoch": 0.79, "grad_norm": 0.317751407623291, "learning_rate": 9.235355337496282e-05, "loss": 0.1548, "step": 5350 }, { "epoch": 0.79, "grad_norm": 0.5321698188781738, "learning_rate": 9.23386856972941e-05, "loss": 0.1616, "step": 5360 }, { "epoch": 0.8, "grad_norm": 1.509246826171875, "learning_rate": 9.232381801962533e-05, "loss": 0.1582, "step": 5370 }, { "epoch": 0.8, "grad_norm": 0.4150841534137726, "learning_rate": 9.230895034195659e-05, "loss": 0.1588, "step": 5380 }, { "epoch": 0.8, "grad_norm": 0.5779004693031311, "learning_rate": 9.229408266428784e-05, "loss": 0.1571, "step": 5390 }, { "epoch": 0.8, "grad_norm": 0.8876129984855652, "learning_rate": 9.227921498661909e-05, "loss": 0.1528, "step": 5400 }, { "epoch": 0.8, "grad_norm": 0.6326704025268555, "learning_rate": 9.226434730895035e-05, "loss": 0.1632, "step": 5410 }, { "epoch": 0.8, "grad_norm": 1.0380185842514038, "learning_rate": 9.22494796312816e-05, "loss": 0.1577, "step": 5420 }, { "epoch": 0.8, "grad_norm": 1.268033504486084, "learning_rate": 9.223461195361285e-05, "loss": 0.1608, "step": 5430 }, { "epoch": 0.81, "grad_norm": 0.5740419626235962, "learning_rate": 9.22197442759441e-05, "loss": 0.1648, "step": 5440 }, { "epoch": 0.81, "grad_norm": 0.36971914768218994, "learning_rate": 9.220487659827536e-05, "loss": 0.1601, "step": 5450 }, { "epoch": 0.81, "grad_norm": 0.39549487829208374, "learning_rate": 9.219000892060661e-05, "loss": 0.154, "step": 5460 }, { "epoch": 0.81, "grad_norm": 0.9269174933433533, "learning_rate": 9.217514124293785e-05, "loss": 0.1581, "step": 5470 }, { "epoch": 0.81, "grad_norm": 1.122969388961792, "learning_rate": 9.216027356526911e-05, "loss": 0.154, "step": 5480 }, { "epoch": 0.81, "grad_norm": 0.5427556037902832, "learning_rate": 9.214540588760036e-05, "loss": 0.1555, "step": 5490 }, { "epoch": 0.82, "grad_norm": 0.5179612636566162, "learning_rate": 9.213053820993162e-05, "loss": 0.1569, "step": 5500 }, { "epoch": 0.82, "grad_norm": 0.7659315466880798, "learning_rate": 9.211567053226286e-05, "loss": 0.1667, "step": 5510 }, { "epoch": 0.82, "grad_norm": 1.04986572265625, "learning_rate": 9.210080285459412e-05, "loss": 0.1646, "step": 5520 }, { "epoch": 0.82, "grad_norm": 1.455209493637085, "learning_rate": 9.208593517692537e-05, "loss": 0.1579, "step": 5530 }, { "epoch": 0.82, "grad_norm": 1.3870110511779785, "learning_rate": 9.207106749925662e-05, "loss": 0.1526, "step": 5540 }, { "epoch": 0.82, "grad_norm": 0.7180333733558655, "learning_rate": 9.205619982158788e-05, "loss": 0.16, "step": 5550 }, { "epoch": 0.82, "grad_norm": 0.5366026759147644, "learning_rate": 9.204133214391912e-05, "loss": 0.1571, "step": 5560 }, { "epoch": 0.83, "grad_norm": 1.4958914518356323, "learning_rate": 9.202646446625038e-05, "loss": 0.155, "step": 5570 }, { "epoch": 0.83, "grad_norm": 0.368592768907547, "learning_rate": 9.201159678858163e-05, "loss": 0.1559, "step": 5580 }, { "epoch": 0.83, "grad_norm": 0.8154517412185669, "learning_rate": 9.199672911091289e-05, "loss": 0.1526, "step": 5590 }, { "epoch": 0.83, "grad_norm": 0.32349810004234314, "learning_rate": 9.198186143324413e-05, "loss": 0.1689, "step": 5600 }, { "epoch": 0.83, "grad_norm": 0.8520846366882324, "learning_rate": 9.196699375557538e-05, "loss": 0.1599, "step": 5610 }, { "epoch": 0.83, "grad_norm": 0.8153768181800842, "learning_rate": 9.195212607790664e-05, "loss": 0.1583, "step": 5620 }, { "epoch": 0.83, "grad_norm": 1.0507680177688599, "learning_rate": 9.193725840023789e-05, "loss": 0.1553, "step": 5630 }, { "epoch": 0.84, "grad_norm": 0.34485527873039246, "learning_rate": 9.192239072256915e-05, "loss": 0.1521, "step": 5640 }, { "epoch": 0.84, "grad_norm": 0.3874395787715912, "learning_rate": 9.190752304490039e-05, "loss": 0.159, "step": 5650 }, { "epoch": 0.84, "grad_norm": 1.9768149852752686, "learning_rate": 9.189265536723164e-05, "loss": 0.1572, "step": 5660 }, { "epoch": 0.84, "grad_norm": 0.3996683359146118, "learning_rate": 9.18777876895629e-05, "loss": 0.1579, "step": 5670 }, { "epoch": 0.84, "grad_norm": 0.6045333743095398, "learning_rate": 9.186292001189415e-05, "loss": 0.1532, "step": 5680 }, { "epoch": 0.84, "grad_norm": 1.318505883216858, "learning_rate": 9.184805233422539e-05, "loss": 0.1589, "step": 5690 }, { "epoch": 0.84, "grad_norm": 0.7289694547653198, "learning_rate": 9.183318465655665e-05, "loss": 0.168, "step": 5700 }, { "epoch": 0.85, "grad_norm": 1.2598559856414795, "learning_rate": 9.18183169788879e-05, "loss": 0.1595, "step": 5710 }, { "epoch": 0.85, "grad_norm": 1.0090540647506714, "learning_rate": 9.180344930121916e-05, "loss": 0.1579, "step": 5720 }, { "epoch": 0.85, "grad_norm": 1.2339668273925781, "learning_rate": 9.178858162355041e-05, "loss": 0.1568, "step": 5730 }, { "epoch": 0.85, "grad_norm": 1.9758799076080322, "learning_rate": 9.177371394588165e-05, "loss": 0.1568, "step": 5740 }, { "epoch": 0.85, "grad_norm": 1.6045335531234741, "learning_rate": 9.175884626821291e-05, "loss": 0.1564, "step": 5750 }, { "epoch": 0.85, "grad_norm": 0.40253713726997375, "learning_rate": 9.174397859054416e-05, "loss": 0.1584, "step": 5760 }, { "epoch": 0.86, "grad_norm": 0.332417756319046, "learning_rate": 9.172911091287542e-05, "loss": 0.1492, "step": 5770 }, { "epoch": 0.86, "grad_norm": 0.3337450623512268, "learning_rate": 9.171424323520667e-05, "loss": 0.1496, "step": 5780 }, { "epoch": 0.86, "grad_norm": 0.6006415486335754, "learning_rate": 9.169937555753791e-05, "loss": 0.1664, "step": 5790 }, { "epoch": 0.86, "grad_norm": 1.1703321933746338, "learning_rate": 9.168450787986917e-05, "loss": 0.1515, "step": 5800 }, { "epoch": 0.86, "grad_norm": 1.246114730834961, "learning_rate": 9.166964020220042e-05, "loss": 0.1569, "step": 5810 }, { "epoch": 0.86, "grad_norm": 1.7468287944793701, "learning_rate": 9.165477252453168e-05, "loss": 0.1523, "step": 5820 }, { "epoch": 0.86, "grad_norm": 0.8291600942611694, "learning_rate": 9.163990484686292e-05, "loss": 0.1642, "step": 5830 }, { "epoch": 0.87, "grad_norm": 0.5658857226371765, "learning_rate": 9.162503716919417e-05, "loss": 0.151, "step": 5840 }, { "epoch": 0.87, "grad_norm": 1.1321245431900024, "learning_rate": 9.161016949152543e-05, "loss": 0.1544, "step": 5850 }, { "epoch": 0.87, "grad_norm": 0.9204747080802917, "learning_rate": 9.159530181385668e-05, "loss": 0.1565, "step": 5860 }, { "epoch": 0.87, "grad_norm": 0.616126298904419, "learning_rate": 9.158043413618794e-05, "loss": 0.1665, "step": 5870 }, { "epoch": 0.87, "grad_norm": 0.37067389488220215, "learning_rate": 9.156556645851918e-05, "loss": 0.1609, "step": 5880 }, { "epoch": 0.87, "grad_norm": 0.9909270405769348, "learning_rate": 9.155069878085043e-05, "loss": 0.1481, "step": 5890 }, { "epoch": 0.87, "grad_norm": 1.0309427976608276, "learning_rate": 9.153583110318169e-05, "loss": 0.1523, "step": 5900 }, { "epoch": 0.88, "grad_norm": 1.7706396579742432, "learning_rate": 9.152096342551294e-05, "loss": 0.1608, "step": 5910 }, { "epoch": 0.88, "grad_norm": 2.229520082473755, "learning_rate": 9.150609574784418e-05, "loss": 0.152, "step": 5920 }, { "epoch": 0.88, "grad_norm": 0.4807567298412323, "learning_rate": 9.149122807017544e-05, "loss": 0.156, "step": 5930 }, { "epoch": 0.88, "grad_norm": 0.37514257431030273, "learning_rate": 9.14763603925067e-05, "loss": 0.1587, "step": 5940 }, { "epoch": 0.88, "grad_norm": 2.0821585655212402, "learning_rate": 9.146149271483795e-05, "loss": 0.151, "step": 5950 }, { "epoch": 0.88, "grad_norm": 0.8426055908203125, "learning_rate": 9.14466250371692e-05, "loss": 0.1617, "step": 5960 }, { "epoch": 0.88, "grad_norm": 0.3103870451450348, "learning_rate": 9.143175735950044e-05, "loss": 0.1565, "step": 5970 }, { "epoch": 0.89, "grad_norm": 0.8296404480934143, "learning_rate": 9.141688968183171e-05, "loss": 0.1533, "step": 5980 }, { "epoch": 0.89, "grad_norm": 1.0698027610778809, "learning_rate": 9.140202200416295e-05, "loss": 0.1537, "step": 5990 }, { "epoch": 0.89, "grad_norm": 1.0414214134216309, "learning_rate": 9.138715432649421e-05, "loss": 0.1532, "step": 6000 }, { "epoch": 0.89, "grad_norm": 0.6497471332550049, "learning_rate": 9.137228664882545e-05, "loss": 0.1606, "step": 6010 }, { "epoch": 0.89, "grad_norm": 0.3416892886161804, "learning_rate": 9.13574189711567e-05, "loss": 0.1499, "step": 6020 }, { "epoch": 0.89, "grad_norm": 0.4196206033229828, "learning_rate": 9.134255129348796e-05, "loss": 0.1449, "step": 6030 }, { "epoch": 0.9, "grad_norm": 0.3455292582511902, "learning_rate": 9.132768361581921e-05, "loss": 0.149, "step": 6040 }, { "epoch": 0.9, "grad_norm": 0.3500816822052002, "learning_rate": 9.131281593815047e-05, "loss": 0.1543, "step": 6050 }, { "epoch": 0.9, "grad_norm": 0.5100831389427185, "learning_rate": 9.129794826048171e-05, "loss": 0.1535, "step": 6060 }, { "epoch": 0.9, "grad_norm": 1.426044225692749, "learning_rate": 9.128308058281298e-05, "loss": 0.1532, "step": 6070 }, { "epoch": 0.9, "grad_norm": 0.7652605772018433, "learning_rate": 9.126821290514422e-05, "loss": 0.1612, "step": 6080 }, { "epoch": 0.9, "grad_norm": 0.6746863126754761, "learning_rate": 9.125334522747547e-05, "loss": 0.1595, "step": 6090 }, { "epoch": 0.9, "grad_norm": 0.6860318779945374, "learning_rate": 9.123847754980673e-05, "loss": 0.156, "step": 6100 }, { "epoch": 0.91, "grad_norm": 1.8772507905960083, "learning_rate": 9.122360987213797e-05, "loss": 0.1549, "step": 6110 }, { "epoch": 0.91, "grad_norm": 0.5276684761047363, "learning_rate": 9.120874219446924e-05, "loss": 0.1692, "step": 6120 }, { "epoch": 0.91, "grad_norm": 1.8537567853927612, "learning_rate": 9.119387451680048e-05, "loss": 0.1646, "step": 6130 }, { "epoch": 0.91, "grad_norm": 1.389715552330017, "learning_rate": 9.117900683913173e-05, "loss": 0.1606, "step": 6140 }, { "epoch": 0.91, "grad_norm": 0.4574753940105438, "learning_rate": 9.116413916146298e-05, "loss": 0.1483, "step": 6150 }, { "epoch": 0.91, "grad_norm": 0.41414326429367065, "learning_rate": 9.114927148379424e-05, "loss": 0.1575, "step": 6160 }, { "epoch": 0.91, "grad_norm": 0.36833006143569946, "learning_rate": 9.113440380612549e-05, "loss": 0.1583, "step": 6170 }, { "epoch": 0.92, "grad_norm": 0.7688261866569519, "learning_rate": 9.111953612845674e-05, "loss": 0.1553, "step": 6180 }, { "epoch": 0.92, "grad_norm": 0.27714991569519043, "learning_rate": 9.1104668450788e-05, "loss": 0.1607, "step": 6190 }, { "epoch": 0.92, "grad_norm": 0.7245810627937317, "learning_rate": 9.108980077311924e-05, "loss": 0.1495, "step": 6200 }, { "epoch": 0.92, "grad_norm": 2.5172832012176514, "learning_rate": 9.10749330954505e-05, "loss": 0.17, "step": 6210 }, { "epoch": 0.92, "grad_norm": 0.39452874660491943, "learning_rate": 9.106006541778175e-05, "loss": 0.152, "step": 6220 }, { "epoch": 0.92, "grad_norm": 1.6295764446258545, "learning_rate": 9.1045197740113e-05, "loss": 0.155, "step": 6230 }, { "epoch": 0.92, "grad_norm": 0.8026036620140076, "learning_rate": 9.103033006244424e-05, "loss": 0.149, "step": 6240 }, { "epoch": 0.93, "grad_norm": 1.2626301050186157, "learning_rate": 9.101546238477551e-05, "loss": 0.1527, "step": 6250 }, { "epoch": 0.93, "grad_norm": 0.3312925398349762, "learning_rate": 9.100059470710675e-05, "loss": 0.147, "step": 6260 }, { "epoch": 0.93, "grad_norm": 1.411736249923706, "learning_rate": 9.0985727029438e-05, "loss": 0.1511, "step": 6270 }, { "epoch": 0.93, "grad_norm": 0.4057709872722626, "learning_rate": 9.097085935176926e-05, "loss": 0.1557, "step": 6280 }, { "epoch": 0.93, "grad_norm": 1.2932243347167969, "learning_rate": 9.09559916741005e-05, "loss": 0.1552, "step": 6290 }, { "epoch": 0.93, "grad_norm": 1.286341667175293, "learning_rate": 9.094112399643177e-05, "loss": 0.1472, "step": 6300 }, { "epoch": 0.94, "grad_norm": 1.9994767904281616, "learning_rate": 9.092625631876301e-05, "loss": 0.1651, "step": 6310 }, { "epoch": 0.94, "grad_norm": 1.4747790098190308, "learning_rate": 9.091138864109427e-05, "loss": 0.1657, "step": 6320 }, { "epoch": 0.94, "grad_norm": 0.8761974573135376, "learning_rate": 9.089652096342551e-05, "loss": 0.1469, "step": 6330 }, { "epoch": 0.94, "grad_norm": 0.5455211997032166, "learning_rate": 9.088165328575678e-05, "loss": 0.1611, "step": 6340 }, { "epoch": 0.94, "grad_norm": 1.3673850297927856, "learning_rate": 9.086678560808802e-05, "loss": 0.1655, "step": 6350 }, { "epoch": 0.94, "grad_norm": 0.8073334693908691, "learning_rate": 9.085191793041927e-05, "loss": 0.1555, "step": 6360 }, { "epoch": 0.94, "grad_norm": 0.519607424736023, "learning_rate": 9.083705025275053e-05, "loss": 0.1597, "step": 6370 }, { "epoch": 0.95, "grad_norm": 0.8752426505088806, "learning_rate": 9.082218257508177e-05, "loss": 0.1519, "step": 6380 }, { "epoch": 0.95, "grad_norm": 0.3730453550815582, "learning_rate": 9.080731489741304e-05, "loss": 0.1535, "step": 6390 }, { "epoch": 0.95, "grad_norm": 1.3529815673828125, "learning_rate": 9.079244721974428e-05, "loss": 0.1618, "step": 6400 }, { "epoch": 0.95, "grad_norm": 1.1450254917144775, "learning_rate": 9.077757954207553e-05, "loss": 0.1618, "step": 6410 }, { "epoch": 0.95, "grad_norm": 0.6045798659324646, "learning_rate": 9.076271186440677e-05, "loss": 0.1594, "step": 6420 }, { "epoch": 0.95, "grad_norm": 0.7981701493263245, "learning_rate": 9.074784418673804e-05, "loss": 0.1568, "step": 6430 }, { "epoch": 0.95, "grad_norm": 0.3604912757873535, "learning_rate": 9.07329765090693e-05, "loss": 0.1493, "step": 6440 }, { "epoch": 0.96, "grad_norm": 0.8800011873245239, "learning_rate": 9.071810883140054e-05, "loss": 0.149, "step": 6450 }, { "epoch": 0.96, "grad_norm": 1.3312394618988037, "learning_rate": 9.070324115373179e-05, "loss": 0.1507, "step": 6460 }, { "epoch": 0.96, "grad_norm": 1.0276165008544922, "learning_rate": 9.068837347606305e-05, "loss": 0.1539, "step": 6470 }, { "epoch": 0.96, "grad_norm": 1.2565714120864868, "learning_rate": 9.06735057983943e-05, "loss": 0.1615, "step": 6480 }, { "epoch": 0.96, "grad_norm": 1.5447033643722534, "learning_rate": 9.065863812072554e-05, "loss": 0.1467, "step": 6490 }, { "epoch": 0.96, "grad_norm": 1.4500904083251953, "learning_rate": 9.06437704430568e-05, "loss": 0.1514, "step": 6500 }, { "epoch": 0.96, "grad_norm": 1.5841881036758423, "learning_rate": 9.062890276538805e-05, "loss": 0.1576, "step": 6510 }, { "epoch": 0.97, "grad_norm": 0.9153670072555542, "learning_rate": 9.061403508771931e-05, "loss": 0.15, "step": 6520 }, { "epoch": 0.97, "grad_norm": 0.758307695388794, "learning_rate": 9.059916741005056e-05, "loss": 0.1564, "step": 6530 }, { "epoch": 0.97, "grad_norm": 0.8557407855987549, "learning_rate": 9.05842997323818e-05, "loss": 0.1545, "step": 6540 }, { "epoch": 0.97, "grad_norm": 1.1173782348632812, "learning_rate": 9.056943205471306e-05, "loss": 0.1519, "step": 6550 }, { "epoch": 0.97, "grad_norm": 1.6467628479003906, "learning_rate": 9.055456437704431e-05, "loss": 0.1522, "step": 6560 }, { "epoch": 0.97, "grad_norm": 0.2692776620388031, "learning_rate": 9.053969669937557e-05, "loss": 0.1558, "step": 6570 }, { "epoch": 0.98, "grad_norm": 0.31524816155433655, "learning_rate": 9.052482902170681e-05, "loss": 0.1534, "step": 6580 }, { "epoch": 0.98, "grad_norm": 0.44416341185569763, "learning_rate": 9.050996134403806e-05, "loss": 0.1574, "step": 6590 }, { "epoch": 0.98, "grad_norm": 1.8063576221466064, "learning_rate": 9.049509366636932e-05, "loss": 0.1636, "step": 6600 }, { "epoch": 0.98, "grad_norm": 0.8080287575721741, "learning_rate": 9.048022598870057e-05, "loss": 0.1518, "step": 6610 }, { "epoch": 0.98, "grad_norm": 0.5311675071716309, "learning_rate": 9.046535831103183e-05, "loss": 0.1546, "step": 6620 }, { "epoch": 0.98, "grad_norm": 0.27562418580055237, "learning_rate": 9.045049063336307e-05, "loss": 0.1472, "step": 6630 }, { "epoch": 0.98, "grad_norm": 1.1780297756195068, "learning_rate": 9.043562295569432e-05, "loss": 0.1484, "step": 6640 }, { "epoch": 0.99, "grad_norm": 1.2727899551391602, "learning_rate": 9.042075527802558e-05, "loss": 0.153, "step": 6650 }, { "epoch": 0.99, "grad_norm": 2.0948357582092285, "learning_rate": 9.040588760035683e-05, "loss": 0.1465, "step": 6660 }, { "epoch": 0.99, "grad_norm": 0.42854198813438416, "learning_rate": 9.039101992268807e-05, "loss": 0.1606, "step": 6670 }, { "epoch": 0.99, "grad_norm": 1.9154927730560303, "learning_rate": 9.037615224501933e-05, "loss": 0.1568, "step": 6680 }, { "epoch": 0.99, "grad_norm": 0.49313873052597046, "learning_rate": 9.036128456735058e-05, "loss": 0.15, "step": 6690 }, { "epoch": 0.99, "grad_norm": 1.1896381378173828, "learning_rate": 9.034641688968184e-05, "loss": 0.1519, "step": 6700 }, { "epoch": 0.99, "grad_norm": 1.2026231288909912, "learning_rate": 9.03315492120131e-05, "loss": 0.1514, "step": 6710 }, { "epoch": 1.0, "grad_norm": 0.7389522194862366, "learning_rate": 9.031668153434433e-05, "loss": 0.1506, "step": 6720 }, { "epoch": 1.0, "grad_norm": 1.0212032794952393, "learning_rate": 9.030181385667559e-05, "loss": 0.1541, "step": 6730 }, { "epoch": 1.0, "grad_norm": 0.937395453453064, "learning_rate": 9.028694617900684e-05, "loss": 0.1568, "step": 6740 }, { "epoch": 1.0, "eval_loss": 0.15145954489707947, "eval_runtime": 2482.4757, "eval_samples_per_second": 235.119, "eval_steps_per_second": 3.674, "step": 6746 }, { "epoch": 1.0, "grad_norm": 1.5865614414215088, "learning_rate": 9.02720785013381e-05, "loss": 0.1487, "step": 6750 }, { "epoch": 1.0, "grad_norm": 2.532686471939087, "learning_rate": 9.025721082366934e-05, "loss": 0.1525, "step": 6760 }, { "epoch": 1.0, "grad_norm": 0.690150797367096, "learning_rate": 9.02423431460006e-05, "loss": 0.1543, "step": 6770 }, { "epoch": 1.01, "grad_norm": 0.32673001289367676, "learning_rate": 9.022747546833185e-05, "loss": 0.1401, "step": 6780 }, { "epoch": 1.01, "grad_norm": 0.432534396648407, "learning_rate": 9.02126077906631e-05, "loss": 0.1482, "step": 6790 }, { "epoch": 1.01, "grad_norm": 1.469577431678772, "learning_rate": 9.019774011299436e-05, "loss": 0.1517, "step": 6800 }, { "epoch": 1.01, "grad_norm": 1.8778477907180786, "learning_rate": 9.01828724353256e-05, "loss": 0.143, "step": 6810 }, { "epoch": 1.01, "grad_norm": 0.7702934741973877, "learning_rate": 9.016800475765686e-05, "loss": 0.1547, "step": 6820 }, { "epoch": 1.01, "grad_norm": 1.1953880786895752, "learning_rate": 9.015313707998811e-05, "loss": 0.1537, "step": 6830 }, { "epoch": 1.01, "grad_norm": 1.4178327322006226, "learning_rate": 9.013826940231936e-05, "loss": 0.1463, "step": 6840 }, { "epoch": 1.02, "grad_norm": 1.5760785341262817, "learning_rate": 9.012340172465062e-05, "loss": 0.1661, "step": 6850 }, { "epoch": 1.02, "grad_norm": 0.34104621410369873, "learning_rate": 9.010853404698186e-05, "loss": 0.156, "step": 6860 }, { "epoch": 1.02, "grad_norm": 0.4194226562976837, "learning_rate": 9.009366636931312e-05, "loss": 0.1533, "step": 6870 }, { "epoch": 1.02, "grad_norm": 1.504073143005371, "learning_rate": 9.007879869164437e-05, "loss": 0.1447, "step": 6880 }, { "epoch": 1.02, "grad_norm": 1.8167688846588135, "learning_rate": 9.006393101397563e-05, "loss": 0.1501, "step": 6890 }, { "epoch": 1.02, "grad_norm": 0.4795800447463989, "learning_rate": 9.004906333630687e-05, "loss": 0.1444, "step": 6900 }, { "epoch": 1.02, "grad_norm": 0.4590786397457123, "learning_rate": 9.003419565863812e-05, "loss": 0.1471, "step": 6910 }, { "epoch": 1.03, "grad_norm": 0.7949062585830688, "learning_rate": 9.001932798096938e-05, "loss": 0.149, "step": 6920 }, { "epoch": 1.03, "grad_norm": 0.5231783390045166, "learning_rate": 9.000446030330063e-05, "loss": 0.1459, "step": 6930 }, { "epoch": 1.03, "grad_norm": 1.8959499597549438, "learning_rate": 8.998959262563189e-05, "loss": 0.1585, "step": 6940 }, { "epoch": 1.03, "grad_norm": 0.5581046342849731, "learning_rate": 8.997472494796313e-05, "loss": 0.1516, "step": 6950 }, { "epoch": 1.03, "grad_norm": 2.33567476272583, "learning_rate": 8.99598572702944e-05, "loss": 0.1512, "step": 6960 }, { "epoch": 1.03, "grad_norm": 0.262761652469635, "learning_rate": 8.994498959262564e-05, "loss": 0.1537, "step": 6970 }, { "epoch": 1.03, "grad_norm": 0.34269747138023376, "learning_rate": 8.993012191495689e-05, "loss": 0.1539, "step": 6980 }, { "epoch": 1.04, "grad_norm": 0.5534690618515015, "learning_rate": 8.991525423728813e-05, "loss": 0.1462, "step": 6990 }, { "epoch": 1.04, "grad_norm": 0.38825082778930664, "learning_rate": 8.990038655961939e-05, "loss": 0.1413, "step": 7000 }, { "epoch": 1.04, "grad_norm": 0.9814270734786987, "learning_rate": 8.988551888195064e-05, "loss": 0.1545, "step": 7010 }, { "epoch": 1.04, "grad_norm": 0.6726862192153931, "learning_rate": 8.98706512042819e-05, "loss": 0.1528, "step": 7020 }, { "epoch": 1.04, "grad_norm": 1.0980238914489746, "learning_rate": 8.985578352661315e-05, "loss": 0.1505, "step": 7030 }, { "epoch": 1.04, "grad_norm": 1.7269726991653442, "learning_rate": 8.984091584894439e-05, "loss": 0.1519, "step": 7040 }, { "epoch": 1.05, "grad_norm": 0.8081973195075989, "learning_rate": 8.982604817127566e-05, "loss": 0.1487, "step": 7050 }, { "epoch": 1.05, "grad_norm": 0.884931206703186, "learning_rate": 8.98111804936069e-05, "loss": 0.1451, "step": 7060 }, { "epoch": 1.05, "grad_norm": 0.8839534521102905, "learning_rate": 8.979631281593816e-05, "loss": 0.1534, "step": 7070 }, { "epoch": 1.05, "grad_norm": 0.30532771348953247, "learning_rate": 8.97814451382694e-05, "loss": 0.1534, "step": 7080 }, { "epoch": 1.05, "grad_norm": 1.0127397775650024, "learning_rate": 8.976657746060065e-05, "loss": 0.1516, "step": 7090 }, { "epoch": 1.05, "grad_norm": 1.871941328048706, "learning_rate": 8.975170978293192e-05, "loss": 0.1472, "step": 7100 }, { "epoch": 1.05, "grad_norm": 1.0577542781829834, "learning_rate": 8.973684210526316e-05, "loss": 0.1484, "step": 7110 }, { "epoch": 1.06, "grad_norm": 0.9246624112129211, "learning_rate": 8.972197442759442e-05, "loss": 0.1467, "step": 7120 }, { "epoch": 1.06, "grad_norm": 1.204312801361084, "learning_rate": 8.970710674992566e-05, "loss": 0.1523, "step": 7130 }, { "epoch": 1.06, "grad_norm": 0.6742764115333557, "learning_rate": 8.969223907225693e-05, "loss": 0.1497, "step": 7140 }, { "epoch": 1.06, "grad_norm": 1.5196964740753174, "learning_rate": 8.967737139458817e-05, "loss": 0.1512, "step": 7150 }, { "epoch": 1.06, "grad_norm": 0.3582424521446228, "learning_rate": 8.966250371691942e-05, "loss": 0.1618, "step": 7160 }, { "epoch": 1.06, "grad_norm": 0.34651896357536316, "learning_rate": 8.964763603925068e-05, "loss": 0.1511, "step": 7170 }, { "epoch": 1.06, "grad_norm": 1.343338966369629, "learning_rate": 8.963276836158192e-05, "loss": 0.1431, "step": 7180 }, { "epoch": 1.07, "grad_norm": 1.5551012754440308, "learning_rate": 8.961790068391319e-05, "loss": 0.1451, "step": 7190 }, { "epoch": 1.07, "grad_norm": 0.4299207627773285, "learning_rate": 8.960303300624443e-05, "loss": 0.1464, "step": 7200 }, { "epoch": 1.07, "grad_norm": 0.44814541935920715, "learning_rate": 8.958816532857568e-05, "loss": 0.1508, "step": 7210 }, { "epoch": 1.07, "grad_norm": 0.5933278799057007, "learning_rate": 8.957329765090692e-05, "loss": 0.1558, "step": 7220 }, { "epoch": 1.07, "grad_norm": 0.4118923842906952, "learning_rate": 8.955842997323819e-05, "loss": 0.1492, "step": 7230 }, { "epoch": 1.07, "grad_norm": 1.303074836730957, "learning_rate": 8.954356229556943e-05, "loss": 0.1451, "step": 7240 }, { "epoch": 1.07, "grad_norm": 1.5382298231124878, "learning_rate": 8.952869461790069e-05, "loss": 0.154, "step": 7250 }, { "epoch": 1.08, "grad_norm": 1.7457761764526367, "learning_rate": 8.951382694023194e-05, "loss": 0.1494, "step": 7260 }, { "epoch": 1.08, "grad_norm": 0.7700926661491394, "learning_rate": 8.949895926256318e-05, "loss": 0.1629, "step": 7270 }, { "epoch": 1.08, "grad_norm": 0.2998206615447998, "learning_rate": 8.948409158489445e-05, "loss": 0.1459, "step": 7280 }, { "epoch": 1.08, "grad_norm": 0.41362789273262024, "learning_rate": 8.94692239072257e-05, "loss": 0.1468, "step": 7290 }, { "epoch": 1.08, "grad_norm": 1.507246494293213, "learning_rate": 8.945435622955695e-05, "loss": 0.1526, "step": 7300 }, { "epoch": 1.08, "grad_norm": 1.7437586784362793, "learning_rate": 8.943948855188819e-05, "loss": 0.1632, "step": 7310 }, { "epoch": 1.09, "grad_norm": 0.2855551242828369, "learning_rate": 8.942462087421946e-05, "loss": 0.1485, "step": 7320 }, { "epoch": 1.09, "grad_norm": 0.5423638820648193, "learning_rate": 8.94097531965507e-05, "loss": 0.1572, "step": 7330 }, { "epoch": 1.09, "grad_norm": 0.9757463932037354, "learning_rate": 8.939488551888195e-05, "loss": 0.1523, "step": 7340 }, { "epoch": 1.09, "grad_norm": 0.25639691948890686, "learning_rate": 8.938001784121321e-05, "loss": 0.1576, "step": 7350 }, { "epoch": 1.09, "grad_norm": 0.7795055508613586, "learning_rate": 8.936515016354445e-05, "loss": 0.1516, "step": 7360 }, { "epoch": 1.09, "grad_norm": 0.48449787497520447, "learning_rate": 8.935028248587572e-05, "loss": 0.158, "step": 7370 }, { "epoch": 1.09, "grad_norm": 1.0721737146377563, "learning_rate": 8.933541480820696e-05, "loss": 0.1523, "step": 7380 }, { "epoch": 1.1, "grad_norm": 1.2508677244186401, "learning_rate": 8.932054713053821e-05, "loss": 0.1454, "step": 7390 }, { "epoch": 1.1, "grad_norm": 0.39935556054115295, "learning_rate": 8.930567945286946e-05, "loss": 0.1543, "step": 7400 }, { "epoch": 1.1, "grad_norm": 0.5586414337158203, "learning_rate": 8.929081177520072e-05, "loss": 0.1596, "step": 7410 }, { "epoch": 1.1, "grad_norm": 1.2489172220230103, "learning_rate": 8.927594409753197e-05, "loss": 0.1548, "step": 7420 }, { "epoch": 1.1, "grad_norm": 2.5162367820739746, "learning_rate": 8.926107641986322e-05, "loss": 0.15, "step": 7430 }, { "epoch": 1.1, "grad_norm": 1.1505943536758423, "learning_rate": 8.924620874219447e-05, "loss": 0.1472, "step": 7440 }, { "epoch": 1.1, "grad_norm": 1.3584840297698975, "learning_rate": 8.923134106452573e-05, "loss": 0.1535, "step": 7450 }, { "epoch": 1.11, "grad_norm": 0.40715011954307556, "learning_rate": 8.921647338685698e-05, "loss": 0.1556, "step": 7460 }, { "epoch": 1.11, "grad_norm": 1.03001070022583, "learning_rate": 8.920160570918823e-05, "loss": 0.1571, "step": 7470 }, { "epoch": 1.11, "grad_norm": 0.2230408489704132, "learning_rate": 8.918673803151948e-05, "loss": 0.147, "step": 7480 }, { "epoch": 1.11, "grad_norm": 1.4231066703796387, "learning_rate": 8.917187035385073e-05, "loss": 0.157, "step": 7490 }, { "epoch": 1.11, "grad_norm": 0.9302521347999573, "learning_rate": 8.915700267618199e-05, "loss": 0.1538, "step": 7500 }, { "epoch": 1.11, "grad_norm": 1.3577553033828735, "learning_rate": 8.914213499851324e-05, "loss": 0.1447, "step": 7510 }, { "epoch": 1.11, "grad_norm": 2.897783041000366, "learning_rate": 8.912726732084449e-05, "loss": 0.1476, "step": 7520 }, { "epoch": 1.12, "grad_norm": 0.3453262448310852, "learning_rate": 8.911239964317574e-05, "loss": 0.1603, "step": 7530 }, { "epoch": 1.12, "grad_norm": 1.2519605159759521, "learning_rate": 8.9097531965507e-05, "loss": 0.1551, "step": 7540 }, { "epoch": 1.12, "grad_norm": 1.0790461301803589, "learning_rate": 8.908266428783825e-05, "loss": 0.1472, "step": 7550 }, { "epoch": 1.12, "grad_norm": 0.6116156578063965, "learning_rate": 8.906779661016949e-05, "loss": 0.1483, "step": 7560 }, { "epoch": 1.12, "grad_norm": 0.32002952694892883, "learning_rate": 8.905292893250075e-05, "loss": 0.1455, "step": 7570 }, { "epoch": 1.12, "grad_norm": 2.1142666339874268, "learning_rate": 8.9038061254832e-05, "loss": 0.1514, "step": 7580 }, { "epoch": 1.13, "grad_norm": 0.7548339366912842, "learning_rate": 8.902319357716326e-05, "loss": 0.1503, "step": 7590 }, { "epoch": 1.13, "grad_norm": 1.0314886569976807, "learning_rate": 8.900832589949451e-05, "loss": 0.1484, "step": 7600 }, { "epoch": 1.13, "grad_norm": 1.0514923334121704, "learning_rate": 8.899345822182575e-05, "loss": 0.1597, "step": 7610 }, { "epoch": 1.13, "grad_norm": 0.3399512767791748, "learning_rate": 8.8978590544157e-05, "loss": 0.1531, "step": 7620 }, { "epoch": 1.13, "grad_norm": 1.079697608947754, "learning_rate": 8.896372286648826e-05, "loss": 0.1472, "step": 7630 }, { "epoch": 1.13, "grad_norm": 0.370299369096756, "learning_rate": 8.894885518881952e-05, "loss": 0.1448, "step": 7640 }, { "epoch": 1.13, "grad_norm": 2.146461248397827, "learning_rate": 8.893398751115076e-05, "loss": 0.1494, "step": 7650 }, { "epoch": 1.14, "grad_norm": 1.532644271850586, "learning_rate": 8.891911983348201e-05, "loss": 0.1442, "step": 7660 }, { "epoch": 1.14, "grad_norm": 1.0400909185409546, "learning_rate": 8.890425215581327e-05, "loss": 0.155, "step": 7670 }, { "epoch": 1.14, "grad_norm": 0.5614340901374817, "learning_rate": 8.888938447814452e-05, "loss": 0.149, "step": 7680 }, { "epoch": 1.14, "grad_norm": 1.8623751401901245, "learning_rate": 8.887451680047578e-05, "loss": 0.1646, "step": 7690 }, { "epoch": 1.14, "grad_norm": 1.2477229833602905, "learning_rate": 8.885964912280702e-05, "loss": 0.1502, "step": 7700 }, { "epoch": 1.14, "grad_norm": 1.0643510818481445, "learning_rate": 8.884478144513827e-05, "loss": 0.1558, "step": 7710 }, { "epoch": 1.14, "grad_norm": 0.7074918746948242, "learning_rate": 8.882991376746953e-05, "loss": 0.1465, "step": 7720 }, { "epoch": 1.15, "grad_norm": 1.3349978923797607, "learning_rate": 8.881504608980078e-05, "loss": 0.1538, "step": 7730 }, { "epoch": 1.15, "grad_norm": 0.33089113235473633, "learning_rate": 8.880017841213202e-05, "loss": 0.1466, "step": 7740 }, { "epoch": 1.15, "grad_norm": 1.4978218078613281, "learning_rate": 8.878531073446328e-05, "loss": 0.155, "step": 7750 }, { "epoch": 1.15, "grad_norm": 0.7654755711555481, "learning_rate": 8.877044305679453e-05, "loss": 0.1467, "step": 7760 }, { "epoch": 1.15, "grad_norm": 0.5333597660064697, "learning_rate": 8.875557537912579e-05, "loss": 0.1523, "step": 7770 }, { "epoch": 1.15, "grad_norm": 0.6722001433372498, "learning_rate": 8.874070770145704e-05, "loss": 0.1503, "step": 7780 }, { "epoch": 1.15, "grad_norm": 1.1389368772506714, "learning_rate": 8.872584002378828e-05, "loss": 0.1491, "step": 7790 }, { "epoch": 1.16, "grad_norm": 0.3832188546657562, "learning_rate": 8.871097234611954e-05, "loss": 0.1483, "step": 7800 }, { "epoch": 1.16, "grad_norm": 1.1785801649093628, "learning_rate": 8.869610466845079e-05, "loss": 0.1556, "step": 7810 }, { "epoch": 1.16, "grad_norm": 0.5937321186065674, "learning_rate": 8.868123699078205e-05, "loss": 0.1515, "step": 7820 }, { "epoch": 1.16, "grad_norm": 1.9288078546524048, "learning_rate": 8.86663693131133e-05, "loss": 0.1524, "step": 7830 }, { "epoch": 1.16, "grad_norm": 0.9835410714149475, "learning_rate": 8.865150163544454e-05, "loss": 0.1453, "step": 7840 }, { "epoch": 1.16, "grad_norm": 2.201693534851074, "learning_rate": 8.86366339577758e-05, "loss": 0.1516, "step": 7850 }, { "epoch": 1.17, "grad_norm": 2.7419443130493164, "learning_rate": 8.862176628010705e-05, "loss": 0.153, "step": 7860 }, { "epoch": 1.17, "grad_norm": 1.239664077758789, "learning_rate": 8.860689860243831e-05, "loss": 0.1514, "step": 7870 }, { "epoch": 1.17, "grad_norm": 0.9454985857009888, "learning_rate": 8.859203092476955e-05, "loss": 0.1453, "step": 7880 }, { "epoch": 1.17, "grad_norm": 0.5655785202980042, "learning_rate": 8.85771632471008e-05, "loss": 0.1442, "step": 7890 }, { "epoch": 1.17, "grad_norm": 1.2254222631454468, "learning_rate": 8.856229556943206e-05, "loss": 0.1555, "step": 7900 }, { "epoch": 1.17, "grad_norm": 1.7310748100280762, "learning_rate": 8.854742789176331e-05, "loss": 0.1514, "step": 7910 }, { "epoch": 1.17, "grad_norm": 2.103505849838257, "learning_rate": 8.853256021409457e-05, "loss": 0.1504, "step": 7920 }, { "epoch": 1.18, "grad_norm": 0.45951688289642334, "learning_rate": 8.851769253642581e-05, "loss": 0.1507, "step": 7930 }, { "epoch": 1.18, "grad_norm": 0.5322825312614441, "learning_rate": 8.850282485875708e-05, "loss": 0.1455, "step": 7940 }, { "epoch": 1.18, "grad_norm": 0.80711430311203, "learning_rate": 8.848795718108832e-05, "loss": 0.1489, "step": 7950 }, { "epoch": 1.18, "grad_norm": 1.1508015394210815, "learning_rate": 8.847308950341957e-05, "loss": 0.1504, "step": 7960 }, { "epoch": 1.18, "grad_norm": 0.5405661463737488, "learning_rate": 8.845822182575081e-05, "loss": 0.155, "step": 7970 }, { "epoch": 1.18, "grad_norm": 0.4886877238750458, "learning_rate": 8.844335414808207e-05, "loss": 0.1471, "step": 7980 }, { "epoch": 1.18, "grad_norm": 0.6233669519424438, "learning_rate": 8.842848647041332e-05, "loss": 0.1481, "step": 7990 }, { "epoch": 1.19, "grad_norm": 0.8412140011787415, "learning_rate": 8.841361879274458e-05, "loss": 0.1476, "step": 8000 }, { "epoch": 1.19, "grad_norm": 0.6617283821105957, "learning_rate": 8.839875111507583e-05, "loss": 0.1457, "step": 8010 }, { "epoch": 1.19, "grad_norm": 0.756656289100647, "learning_rate": 8.838388343740707e-05, "loss": 0.1436, "step": 8020 }, { "epoch": 1.19, "grad_norm": 1.0248194932937622, "learning_rate": 8.836901575973834e-05, "loss": 0.1483, "step": 8030 }, { "epoch": 1.19, "grad_norm": 0.9655269980430603, "learning_rate": 8.835414808206958e-05, "loss": 0.1455, "step": 8040 }, { "epoch": 1.19, "grad_norm": 0.9099483489990234, "learning_rate": 8.833928040440084e-05, "loss": 0.1449, "step": 8050 }, { "epoch": 1.19, "grad_norm": 0.3274220824241638, "learning_rate": 8.832441272673208e-05, "loss": 0.1586, "step": 8060 }, { "epoch": 1.2, "grad_norm": 0.5055317878723145, "learning_rate": 8.830954504906334e-05, "loss": 0.1482, "step": 8070 }, { "epoch": 1.2, "grad_norm": 0.36009564995765686, "learning_rate": 8.829467737139459e-05, "loss": 0.1536, "step": 8080 }, { "epoch": 1.2, "grad_norm": 0.616855263710022, "learning_rate": 8.827980969372584e-05, "loss": 0.1432, "step": 8090 }, { "epoch": 1.2, "grad_norm": 2.257025957107544, "learning_rate": 8.82649420160571e-05, "loss": 0.1491, "step": 8100 }, { "epoch": 1.2, "grad_norm": 1.2415412664413452, "learning_rate": 8.825007433838834e-05, "loss": 0.159, "step": 8110 }, { "epoch": 1.2, "grad_norm": 0.49086448550224304, "learning_rate": 8.823520666071961e-05, "loss": 0.1477, "step": 8120 }, { "epoch": 1.21, "grad_norm": 1.015421748161316, "learning_rate": 8.822033898305085e-05, "loss": 0.1456, "step": 8130 }, { "epoch": 1.21, "grad_norm": 0.44600141048431396, "learning_rate": 8.82054713053821e-05, "loss": 0.1567, "step": 8140 }, { "epoch": 1.21, "grad_norm": 1.0561884641647339, "learning_rate": 8.819060362771336e-05, "loss": 0.1583, "step": 8150 }, { "epoch": 1.21, "grad_norm": 1.7750810384750366, "learning_rate": 8.81757359500446e-05, "loss": 0.1497, "step": 8160 }, { "epoch": 1.21, "grad_norm": 0.3530331552028656, "learning_rate": 8.816086827237587e-05, "loss": 0.1529, "step": 8170 }, { "epoch": 1.21, "grad_norm": 1.564988613128662, "learning_rate": 8.814600059470711e-05, "loss": 0.157, "step": 8180 }, { "epoch": 1.21, "grad_norm": 0.599676251411438, "learning_rate": 8.813113291703837e-05, "loss": 0.15, "step": 8190 }, { "epoch": 1.22, "grad_norm": 1.738433837890625, "learning_rate": 8.81162652393696e-05, "loss": 0.1481, "step": 8200 }, { "epoch": 1.22, "grad_norm": 1.467214822769165, "learning_rate": 8.810139756170087e-05, "loss": 0.1456, "step": 8210 }, { "epoch": 1.22, "grad_norm": 0.8870519399642944, "learning_rate": 8.808652988403212e-05, "loss": 0.1467, "step": 8220 }, { "epoch": 1.22, "grad_norm": 0.8673567175865173, "learning_rate": 8.807166220636337e-05, "loss": 0.1511, "step": 8230 }, { "epoch": 1.22, "grad_norm": 0.7647720575332642, "learning_rate": 8.805679452869463e-05, "loss": 0.1409, "step": 8240 }, { "epoch": 1.22, "grad_norm": 1.1034643650054932, "learning_rate": 8.804192685102587e-05, "loss": 0.1377, "step": 8250 }, { "epoch": 1.22, "grad_norm": 1.3493852615356445, "learning_rate": 8.802705917335713e-05, "loss": 0.1511, "step": 8260 }, { "epoch": 1.23, "grad_norm": 1.0202926397323608, "learning_rate": 8.801219149568838e-05, "loss": 0.1443, "step": 8270 }, { "epoch": 1.23, "grad_norm": 1.9459919929504395, "learning_rate": 8.799732381801963e-05, "loss": 0.1476, "step": 8280 }, { "epoch": 1.23, "grad_norm": 1.475825309753418, "learning_rate": 8.798245614035087e-05, "loss": 0.1431, "step": 8290 }, { "epoch": 1.23, "grad_norm": 0.3227538466453552, "learning_rate": 8.796758846268214e-05, "loss": 0.151, "step": 8300 }, { "epoch": 1.23, "grad_norm": 2.327113151550293, "learning_rate": 8.795272078501338e-05, "loss": 0.1465, "step": 8310 }, { "epoch": 1.23, "grad_norm": 0.28288453817367554, "learning_rate": 8.793785310734464e-05, "loss": 0.1451, "step": 8320 }, { "epoch": 1.23, "grad_norm": 1.3570489883422852, "learning_rate": 8.792298542967589e-05, "loss": 0.1544, "step": 8330 }, { "epoch": 1.24, "grad_norm": 1.3985998630523682, "learning_rate": 8.790811775200713e-05, "loss": 0.1453, "step": 8340 }, { "epoch": 1.24, "grad_norm": 1.7871092557907104, "learning_rate": 8.78932500743384e-05, "loss": 0.1485, "step": 8350 }, { "epoch": 1.24, "grad_norm": 0.5759921669960022, "learning_rate": 8.787838239666964e-05, "loss": 0.1578, "step": 8360 }, { "epoch": 1.24, "grad_norm": 0.36994293332099915, "learning_rate": 8.78635147190009e-05, "loss": 0.1553, "step": 8370 }, { "epoch": 1.24, "grad_norm": 0.2848123610019684, "learning_rate": 8.784864704133214e-05, "loss": 0.1488, "step": 8380 }, { "epoch": 1.24, "grad_norm": 0.373757928609848, "learning_rate": 8.78337793636634e-05, "loss": 0.1434, "step": 8390 }, { "epoch": 1.25, "grad_norm": 0.3947400450706482, "learning_rate": 8.781891168599465e-05, "loss": 0.1433, "step": 8400 }, { "epoch": 1.25, "grad_norm": 0.7297700643539429, "learning_rate": 8.78040440083259e-05, "loss": 0.1489, "step": 8410 }, { "epoch": 1.25, "grad_norm": 0.4975549280643463, "learning_rate": 8.778917633065716e-05, "loss": 0.1556, "step": 8420 }, { "epoch": 1.25, "grad_norm": 0.2790517508983612, "learning_rate": 8.777430865298841e-05, "loss": 0.1478, "step": 8430 }, { "epoch": 1.25, "grad_norm": 1.5845293998718262, "learning_rate": 8.775944097531967e-05, "loss": 0.1446, "step": 8440 }, { "epoch": 1.25, "grad_norm": 1.2284702062606812, "learning_rate": 8.774457329765091e-05, "loss": 0.1506, "step": 8450 }, { "epoch": 1.25, "grad_norm": 0.9352693557739258, "learning_rate": 8.772970561998216e-05, "loss": 0.1447, "step": 8460 }, { "epoch": 1.26, "grad_norm": 0.5690743327140808, "learning_rate": 8.771632471008029e-05, "loss": 0.1677, "step": 8470 }, { "epoch": 1.26, "grad_norm": 1.257373571395874, "learning_rate": 8.770145703241154e-05, "loss": 0.1507, "step": 8480 }, { "epoch": 1.26, "grad_norm": 1.0186293125152588, "learning_rate": 8.76865893547428e-05, "loss": 0.1531, "step": 8490 }, { "epoch": 1.26, "grad_norm": 1.0547682046890259, "learning_rate": 8.767172167707405e-05, "loss": 0.1578, "step": 8500 }, { "epoch": 1.26, "grad_norm": 0.7997061014175415, "learning_rate": 8.765685399940529e-05, "loss": 0.1546, "step": 8510 }, { "epoch": 1.26, "grad_norm": 0.921222448348999, "learning_rate": 8.764198632173655e-05, "loss": 0.149, "step": 8520 }, { "epoch": 1.26, "grad_norm": 0.5580063462257385, "learning_rate": 8.76271186440678e-05, "loss": 0.153, "step": 8530 }, { "epoch": 1.27, "grad_norm": 1.184913992881775, "learning_rate": 8.761225096639906e-05, "loss": 0.1519, "step": 8540 }, { "epoch": 1.27, "grad_norm": 1.3542340993881226, "learning_rate": 8.759738328873031e-05, "loss": 0.1558, "step": 8550 }, { "epoch": 1.27, "grad_norm": 0.39081811904907227, "learning_rate": 8.758251561106155e-05, "loss": 0.1499, "step": 8560 }, { "epoch": 1.27, "grad_norm": 1.175268530845642, "learning_rate": 8.756764793339281e-05, "loss": 0.1463, "step": 8570 }, { "epoch": 1.27, "grad_norm": 0.516953706741333, "learning_rate": 8.755278025572406e-05, "loss": 0.15, "step": 8580 }, { "epoch": 1.27, "grad_norm": 0.45542412996292114, "learning_rate": 8.753791257805532e-05, "loss": 0.1443, "step": 8590 }, { "epoch": 1.27, "grad_norm": 0.25929051637649536, "learning_rate": 8.752304490038656e-05, "loss": 0.145, "step": 8600 }, { "epoch": 1.28, "grad_norm": 1.7502154111862183, "learning_rate": 8.750817722271781e-05, "loss": 0.1529, "step": 8610 }, { "epoch": 1.28, "grad_norm": 0.36616241931915283, "learning_rate": 8.749330954504907e-05, "loss": 0.1484, "step": 8620 }, { "epoch": 1.28, "grad_norm": 0.555590808391571, "learning_rate": 8.747844186738032e-05, "loss": 0.1502, "step": 8630 }, { "epoch": 1.28, "grad_norm": 1.2765053510665894, "learning_rate": 8.746357418971158e-05, "loss": 0.1497, "step": 8640 }, { "epoch": 1.28, "grad_norm": 1.160529613494873, "learning_rate": 8.744870651204282e-05, "loss": 0.1488, "step": 8650 }, { "epoch": 1.28, "grad_norm": 0.5419323444366455, "learning_rate": 8.743383883437407e-05, "loss": 0.1531, "step": 8660 }, { "epoch": 1.29, "grad_norm": 0.5298939943313599, "learning_rate": 8.741897115670533e-05, "loss": 0.1486, "step": 8670 }, { "epoch": 1.29, "grad_norm": 1.2307096719741821, "learning_rate": 8.740410347903658e-05, "loss": 0.1525, "step": 8680 }, { "epoch": 1.29, "grad_norm": 0.4540654122829437, "learning_rate": 8.738923580136782e-05, "loss": 0.1535, "step": 8690 }, { "epoch": 1.29, "grad_norm": 1.8886024951934814, "learning_rate": 8.737436812369908e-05, "loss": 0.1541, "step": 8700 }, { "epoch": 1.29, "grad_norm": 1.232670545578003, "learning_rate": 8.735950044603033e-05, "loss": 0.1596, "step": 8710 }, { "epoch": 1.29, "grad_norm": 1.6752057075500488, "learning_rate": 8.734463276836159e-05, "loss": 0.1579, "step": 8720 }, { "epoch": 1.29, "grad_norm": 0.6931501030921936, "learning_rate": 8.732976509069284e-05, "loss": 0.1492, "step": 8730 }, { "epoch": 1.3, "grad_norm": 1.9172624349594116, "learning_rate": 8.731489741302409e-05, "loss": 0.1492, "step": 8740 }, { "epoch": 1.3, "grad_norm": 0.35857778787612915, "learning_rate": 8.730002973535534e-05, "loss": 0.1436, "step": 8750 }, { "epoch": 1.3, "grad_norm": 0.6047963500022888, "learning_rate": 8.72851620576866e-05, "loss": 0.1434, "step": 8760 }, { "epoch": 1.3, "grad_norm": 1.0199538469314575, "learning_rate": 8.727029438001785e-05, "loss": 0.1467, "step": 8770 }, { "epoch": 1.3, "grad_norm": 0.5496332049369812, "learning_rate": 8.725542670234909e-05, "loss": 0.146, "step": 8780 }, { "epoch": 1.3, "grad_norm": 0.5620203614234924, "learning_rate": 8.724055902468035e-05, "loss": 0.1483, "step": 8790 }, { "epoch": 1.3, "grad_norm": 0.36885154247283936, "learning_rate": 8.72256913470116e-05, "loss": 0.1366, "step": 8800 }, { "epoch": 1.31, "grad_norm": 1.235111117362976, "learning_rate": 8.721082366934285e-05, "loss": 0.145, "step": 8810 }, { "epoch": 1.31, "grad_norm": 0.3093607723712921, "learning_rate": 8.719595599167411e-05, "loss": 0.1625, "step": 8820 }, { "epoch": 1.31, "grad_norm": 0.4602314233779907, "learning_rate": 8.718108831400535e-05, "loss": 0.1433, "step": 8830 }, { "epoch": 1.31, "grad_norm": 0.46107497811317444, "learning_rate": 8.71662206363366e-05, "loss": 0.1451, "step": 8840 }, { "epoch": 1.31, "grad_norm": 0.23074732720851898, "learning_rate": 8.715135295866786e-05, "loss": 0.1498, "step": 8850 }, { "epoch": 1.31, "grad_norm": 1.0006585121154785, "learning_rate": 8.713648528099912e-05, "loss": 0.1585, "step": 8860 }, { "epoch": 1.31, "grad_norm": 0.6358426213264465, "learning_rate": 8.712161760333037e-05, "loss": 0.1522, "step": 8870 }, { "epoch": 1.32, "grad_norm": 0.39934954047203064, "learning_rate": 8.710674992566161e-05, "loss": 0.1449, "step": 8880 }, { "epoch": 1.32, "grad_norm": 0.5131770372390747, "learning_rate": 8.709188224799288e-05, "loss": 0.1424, "step": 8890 }, { "epoch": 1.32, "grad_norm": 3.1670219898223877, "learning_rate": 8.707701457032412e-05, "loss": 0.1516, "step": 8900 }, { "epoch": 1.32, "grad_norm": 0.6028851270675659, "learning_rate": 8.706214689265538e-05, "loss": 0.1485, "step": 8910 }, { "epoch": 1.32, "grad_norm": 0.5696167945861816, "learning_rate": 8.704727921498662e-05, "loss": 0.1451, "step": 8920 }, { "epoch": 1.32, "grad_norm": 0.7680513262748718, "learning_rate": 8.703241153731787e-05, "loss": 0.1431, "step": 8930 }, { "epoch": 1.33, "grad_norm": 1.506998896598816, "learning_rate": 8.701754385964913e-05, "loss": 0.1562, "step": 8940 }, { "epoch": 1.33, "grad_norm": 0.24090486764907837, "learning_rate": 8.700267618198038e-05, "loss": 0.1519, "step": 8950 }, { "epoch": 1.33, "grad_norm": 0.36027562618255615, "learning_rate": 8.698780850431164e-05, "loss": 0.1425, "step": 8960 }, { "epoch": 1.33, "grad_norm": 1.2053183317184448, "learning_rate": 8.697294082664288e-05, "loss": 0.1426, "step": 8970 }, { "epoch": 1.33, "grad_norm": 0.4818166494369507, "learning_rate": 8.695807314897415e-05, "loss": 0.1549, "step": 8980 }, { "epoch": 1.33, "grad_norm": 1.4848952293395996, "learning_rate": 8.694320547130539e-05, "loss": 0.1505, "step": 8990 }, { "epoch": 1.33, "grad_norm": 1.1278949975967407, "learning_rate": 8.692833779363664e-05, "loss": 0.1514, "step": 9000 }, { "epoch": 1.34, "grad_norm": 0.55078125, "learning_rate": 8.691347011596788e-05, "loss": 0.1522, "step": 9010 }, { "epoch": 1.34, "grad_norm": 0.6898670196533203, "learning_rate": 8.689860243829914e-05, "loss": 0.1439, "step": 9020 }, { "epoch": 1.34, "grad_norm": 0.9387264251708984, "learning_rate": 8.688373476063039e-05, "loss": 0.1492, "step": 9030 }, { "epoch": 1.34, "grad_norm": 0.6217661499977112, "learning_rate": 8.686886708296165e-05, "loss": 0.1448, "step": 9040 }, { "epoch": 1.34, "grad_norm": 0.3894174098968506, "learning_rate": 8.68539994052929e-05, "loss": 0.1473, "step": 9050 }, { "epoch": 1.34, "grad_norm": 0.6577085852622986, "learning_rate": 8.683913172762414e-05, "loss": 0.1522, "step": 9060 }, { "epoch": 1.34, "grad_norm": 0.36721089482307434, "learning_rate": 8.682426404995541e-05, "loss": 0.1518, "step": 9070 }, { "epoch": 1.35, "grad_norm": 1.7687819004058838, "learning_rate": 8.680939637228665e-05, "loss": 0.1527, "step": 9080 }, { "epoch": 1.35, "grad_norm": 2.106863260269165, "learning_rate": 8.679452869461791e-05, "loss": 0.1526, "step": 9090 }, { "epoch": 1.35, "grad_norm": 0.8379847407341003, "learning_rate": 8.677966101694915e-05, "loss": 0.1525, "step": 9100 }, { "epoch": 1.35, "grad_norm": 0.9839054346084595, "learning_rate": 8.67647933392804e-05, "loss": 0.1476, "step": 9110 }, { "epoch": 1.35, "grad_norm": 1.1237411499023438, "learning_rate": 8.674992566161166e-05, "loss": 0.1516, "step": 9120 }, { "epoch": 1.35, "grad_norm": 1.2351957559585571, "learning_rate": 8.673505798394291e-05, "loss": 0.1553, "step": 9130 }, { "epoch": 1.35, "grad_norm": 1.330193281173706, "learning_rate": 8.672019030627417e-05, "loss": 0.1507, "step": 9140 }, { "epoch": 1.36, "grad_norm": 0.5106310844421387, "learning_rate": 8.670532262860541e-05, "loss": 0.1509, "step": 9150 }, { "epoch": 1.36, "grad_norm": 0.2819235026836395, "learning_rate": 8.669045495093668e-05, "loss": 0.1496, "step": 9160 }, { "epoch": 1.36, "grad_norm": 0.3204076290130615, "learning_rate": 8.667558727326792e-05, "loss": 0.1541, "step": 9170 }, { "epoch": 1.36, "grad_norm": 0.3066985309123993, "learning_rate": 8.666071959559917e-05, "loss": 0.1432, "step": 9180 }, { "epoch": 1.36, "grad_norm": 0.5264164805412292, "learning_rate": 8.664585191793041e-05, "loss": 0.1501, "step": 9190 }, { "epoch": 1.36, "grad_norm": 1.0511165857315063, "learning_rate": 8.663098424026167e-05, "loss": 0.1512, "step": 9200 }, { "epoch": 1.37, "grad_norm": 0.3407282531261444, "learning_rate": 8.661611656259294e-05, "loss": 0.1483, "step": 9210 }, { "epoch": 1.37, "grad_norm": 0.6586543321609497, "learning_rate": 8.660124888492418e-05, "loss": 0.1528, "step": 9220 }, { "epoch": 1.37, "grad_norm": 0.29302939772605896, "learning_rate": 8.658638120725543e-05, "loss": 0.146, "step": 9230 }, { "epoch": 1.37, "grad_norm": 1.0969610214233398, "learning_rate": 8.657151352958667e-05, "loss": 0.1515, "step": 9240 }, { "epoch": 1.37, "grad_norm": 0.3798691928386688, "learning_rate": 8.655664585191794e-05, "loss": 0.1502, "step": 9250 }, { "epoch": 1.37, "grad_norm": 0.5284072756767273, "learning_rate": 8.654177817424918e-05, "loss": 0.1438, "step": 9260 }, { "epoch": 1.37, "grad_norm": 0.5143082141876221, "learning_rate": 8.652691049658044e-05, "loss": 0.1548, "step": 9270 }, { "epoch": 1.38, "grad_norm": 2.502887725830078, "learning_rate": 8.65120428189117e-05, "loss": 0.154, "step": 9280 }, { "epoch": 1.38, "grad_norm": 0.34845390915870667, "learning_rate": 8.649717514124293e-05, "loss": 0.1461, "step": 9290 }, { "epoch": 1.38, "grad_norm": 0.576566219329834, "learning_rate": 8.64823074635742e-05, "loss": 0.157, "step": 9300 }, { "epoch": 1.38, "grad_norm": 0.34153446555137634, "learning_rate": 8.646743978590544e-05, "loss": 0.1529, "step": 9310 }, { "epoch": 1.38, "grad_norm": 1.3493391275405884, "learning_rate": 8.64525721082367e-05, "loss": 0.1418, "step": 9320 }, { "epoch": 1.38, "grad_norm": 0.4563460648059845, "learning_rate": 8.643770443056794e-05, "loss": 0.1525, "step": 9330 }, { "epoch": 1.38, "grad_norm": 0.9894744157791138, "learning_rate": 8.642283675289921e-05, "loss": 0.1508, "step": 9340 }, { "epoch": 1.39, "grad_norm": 0.21944831311702728, "learning_rate": 8.640796907523045e-05, "loss": 0.1478, "step": 9350 }, { "epoch": 1.39, "grad_norm": 1.688185691833496, "learning_rate": 8.63931013975617e-05, "loss": 0.1545, "step": 9360 }, { "epoch": 1.39, "grad_norm": 0.3348112106323242, "learning_rate": 8.637823371989296e-05, "loss": 0.1498, "step": 9370 }, { "epoch": 1.39, "grad_norm": 0.32698962092399597, "learning_rate": 8.636336604222421e-05, "loss": 0.1485, "step": 9380 }, { "epoch": 1.39, "grad_norm": 0.4172670543193817, "learning_rate": 8.634849836455547e-05, "loss": 0.157, "step": 9390 }, { "epoch": 1.39, "grad_norm": 0.5992726683616638, "learning_rate": 8.633363068688671e-05, "loss": 0.1473, "step": 9400 }, { "epoch": 1.39, "grad_norm": 0.4641229510307312, "learning_rate": 8.631876300921796e-05, "loss": 0.1454, "step": 9410 }, { "epoch": 1.4, "grad_norm": 1.2482893466949463, "learning_rate": 8.63038953315492e-05, "loss": 0.1438, "step": 9420 }, { "epoch": 1.4, "grad_norm": 0.3719273805618286, "learning_rate": 8.628902765388047e-05, "loss": 0.1467, "step": 9430 }, { "epoch": 1.4, "grad_norm": 0.5880813002586365, "learning_rate": 8.627415997621172e-05, "loss": 0.1496, "step": 9440 }, { "epoch": 1.4, "grad_norm": 0.5161025524139404, "learning_rate": 8.625929229854297e-05, "loss": 0.1449, "step": 9450 }, { "epoch": 1.4, "grad_norm": 0.3996943533420563, "learning_rate": 8.624442462087422e-05, "loss": 0.1499, "step": 9460 }, { "epoch": 1.4, "grad_norm": 0.5722046494483948, "learning_rate": 8.622955694320548e-05, "loss": 0.1474, "step": 9470 }, { "epoch": 1.41, "grad_norm": 1.8009772300720215, "learning_rate": 8.621617603330359e-05, "loss": 0.1635, "step": 9480 }, { "epoch": 1.41, "grad_norm": 0.959286630153656, "learning_rate": 8.620130835563486e-05, "loss": 0.153, "step": 9490 }, { "epoch": 1.41, "grad_norm": 1.4346505403518677, "learning_rate": 8.61864406779661e-05, "loss": 0.1599, "step": 9500 }, { "epoch": 1.41, "grad_norm": 0.3234134018421173, "learning_rate": 8.617157300029736e-05, "loss": 0.1445, "step": 9510 }, { "epoch": 1.41, "grad_norm": 0.4339183270931244, "learning_rate": 8.615670532262861e-05, "loss": 0.151, "step": 9520 }, { "epoch": 1.41, "grad_norm": 0.9209549427032471, "learning_rate": 8.614183764495987e-05, "loss": 0.1498, "step": 9530 }, { "epoch": 1.41, "grad_norm": 0.8989786505699158, "learning_rate": 8.612696996729112e-05, "loss": 0.1423, "step": 9540 }, { "epoch": 1.42, "grad_norm": 0.9294031858444214, "learning_rate": 8.611210228962236e-05, "loss": 0.1435, "step": 9550 }, { "epoch": 1.42, "grad_norm": 0.4785991907119751, "learning_rate": 8.609723461195362e-05, "loss": 0.143, "step": 9560 }, { "epoch": 1.42, "grad_norm": 0.7767314314842224, "learning_rate": 8.608236693428486e-05, "loss": 0.1424, "step": 9570 }, { "epoch": 1.42, "grad_norm": 1.2666493654251099, "learning_rate": 8.606749925661613e-05, "loss": 0.1527, "step": 9580 }, { "epoch": 1.42, "grad_norm": 0.4553685188293457, "learning_rate": 8.605263157894738e-05, "loss": 0.1511, "step": 9590 }, { "epoch": 1.42, "grad_norm": 0.3553789556026459, "learning_rate": 8.603776390127862e-05, "loss": 0.1475, "step": 9600 }, { "epoch": 1.42, "grad_norm": 0.8654747009277344, "learning_rate": 8.602289622360988e-05, "loss": 0.1389, "step": 9610 }, { "epoch": 1.43, "grad_norm": 0.7433649301528931, "learning_rate": 8.600802854594113e-05, "loss": 0.1477, "step": 9620 }, { "epoch": 1.43, "grad_norm": 0.25381016731262207, "learning_rate": 8.599316086827239e-05, "loss": 0.1494, "step": 9630 }, { "epoch": 1.43, "grad_norm": 1.8853144645690918, "learning_rate": 8.597829319060363e-05, "loss": 0.1529, "step": 9640 }, { "epoch": 1.43, "grad_norm": 0.3927299380302429, "learning_rate": 8.596342551293488e-05, "loss": 0.1487, "step": 9650 }, { "epoch": 1.43, "grad_norm": 0.46614259481430054, "learning_rate": 8.594855783526614e-05, "loss": 0.1521, "step": 9660 }, { "epoch": 1.43, "grad_norm": 1.0246176719665527, "learning_rate": 8.593369015759739e-05, "loss": 0.1439, "step": 9670 }, { "epoch": 1.43, "grad_norm": 0.7353618144989014, "learning_rate": 8.591882247992865e-05, "loss": 0.1433, "step": 9680 }, { "epoch": 1.44, "grad_norm": 0.6362217664718628, "learning_rate": 8.590395480225989e-05, "loss": 0.1451, "step": 9690 }, { "epoch": 1.44, "grad_norm": 0.9315203428268433, "learning_rate": 8.588908712459114e-05, "loss": 0.1453, "step": 9700 }, { "epoch": 1.44, "grad_norm": 1.2445534467697144, "learning_rate": 8.58742194469224e-05, "loss": 0.1356, "step": 9710 }, { "epoch": 1.44, "grad_norm": 0.40973347425460815, "learning_rate": 8.585935176925365e-05, "loss": 0.1481, "step": 9720 }, { "epoch": 1.44, "grad_norm": 0.397741436958313, "learning_rate": 8.584448409158489e-05, "loss": 0.1449, "step": 9730 }, { "epoch": 1.44, "grad_norm": 3.0569045543670654, "learning_rate": 8.582961641391615e-05, "loss": 0.1561, "step": 9740 }, { "epoch": 1.45, "grad_norm": 0.37079283595085144, "learning_rate": 8.58147487362474e-05, "loss": 0.1542, "step": 9750 }, { "epoch": 1.45, "grad_norm": 0.637252688407898, "learning_rate": 8.579988105857866e-05, "loss": 0.1543, "step": 9760 }, { "epoch": 1.45, "grad_norm": 0.6145210862159729, "learning_rate": 8.578501338090991e-05, "loss": 0.1461, "step": 9770 }, { "epoch": 1.45, "grad_norm": 1.3545942306518555, "learning_rate": 8.577014570324115e-05, "loss": 0.1531, "step": 9780 }, { "epoch": 1.45, "grad_norm": 0.720514714717865, "learning_rate": 8.575527802557241e-05, "loss": 0.1441, "step": 9790 }, { "epoch": 1.45, "grad_norm": 0.3378904461860657, "learning_rate": 8.574041034790366e-05, "loss": 0.1464, "step": 9800 }, { "epoch": 1.45, "grad_norm": 0.7078978419303894, "learning_rate": 8.572554267023492e-05, "loss": 0.1544, "step": 9810 }, { "epoch": 1.46, "grad_norm": 1.368641972541809, "learning_rate": 8.571067499256616e-05, "loss": 0.1477, "step": 9820 }, { "epoch": 1.46, "grad_norm": 0.42889463901519775, "learning_rate": 8.569580731489741e-05, "loss": 0.1487, "step": 9830 }, { "epoch": 1.46, "grad_norm": 1.435215950012207, "learning_rate": 8.568093963722867e-05, "loss": 0.1474, "step": 9840 }, { "epoch": 1.46, "grad_norm": 0.9903163313865662, "learning_rate": 8.566607195955992e-05, "loss": 0.1433, "step": 9850 }, { "epoch": 1.46, "grad_norm": 1.3809950351715088, "learning_rate": 8.565120428189118e-05, "loss": 0.1447, "step": 9860 }, { "epoch": 1.46, "grad_norm": 0.8493334650993347, "learning_rate": 8.563633660422242e-05, "loss": 0.148, "step": 9870 }, { "epoch": 1.46, "grad_norm": 0.33634793758392334, "learning_rate": 8.562146892655367e-05, "loss": 0.1451, "step": 9880 }, { "epoch": 1.47, "grad_norm": 0.403535395860672, "learning_rate": 8.560660124888493e-05, "loss": 0.1467, "step": 9890 }, { "epoch": 1.47, "grad_norm": 0.7498649954795837, "learning_rate": 8.559173357121618e-05, "loss": 0.1442, "step": 9900 }, { "epoch": 1.47, "grad_norm": 0.8162224888801575, "learning_rate": 8.557686589354744e-05, "loss": 0.1475, "step": 9910 }, { "epoch": 1.47, "grad_norm": 0.3795585632324219, "learning_rate": 8.556199821587868e-05, "loss": 0.1471, "step": 9920 }, { "epoch": 1.47, "grad_norm": 0.37970471382141113, "learning_rate": 8.554713053820995e-05, "loss": 0.1542, "step": 9930 }, { "epoch": 1.47, "grad_norm": 1.756047248840332, "learning_rate": 8.553226286054119e-05, "loss": 0.1424, "step": 9940 }, { "epoch": 1.47, "grad_norm": 0.36348363757133484, "learning_rate": 8.551739518287244e-05, "loss": 0.141, "step": 9950 }, { "epoch": 1.48, "grad_norm": 0.6128315329551697, "learning_rate": 8.550252750520368e-05, "loss": 0.1555, "step": 9960 }, { "epoch": 1.48, "grad_norm": 0.5416569113731384, "learning_rate": 8.548765982753494e-05, "loss": 0.1541, "step": 9970 }, { "epoch": 1.48, "grad_norm": 0.9355424642562866, "learning_rate": 8.54727921498662e-05, "loss": 0.1475, "step": 9980 }, { "epoch": 1.48, "grad_norm": 1.1831605434417725, "learning_rate": 8.545792447219745e-05, "loss": 0.1533, "step": 9990 }, { "epoch": 1.48, "grad_norm": 1.0193519592285156, "learning_rate": 8.54430567945287e-05, "loss": 0.1401, "step": 10000 }, { "epoch": 1.48, "grad_norm": 0.3186480700969696, "learning_rate": 8.542818911685994e-05, "loss": 0.1584, "step": 10010 }, { "epoch": 1.49, "grad_norm": 0.3131497800350189, "learning_rate": 8.541332143919121e-05, "loss": 0.1461, "step": 10020 }, { "epoch": 1.49, "grad_norm": 0.8620591759681702, "learning_rate": 8.539845376152245e-05, "loss": 0.1408, "step": 10030 }, { "epoch": 1.49, "grad_norm": 1.469287633895874, "learning_rate": 8.538358608385371e-05, "loss": 0.1462, "step": 10040 }, { "epoch": 1.49, "grad_norm": 0.40487322211265564, "learning_rate": 8.536871840618495e-05, "loss": 0.1419, "step": 10050 }, { "epoch": 1.49, "grad_norm": 0.30698254704475403, "learning_rate": 8.53538507285162e-05, "loss": 0.1473, "step": 10060 }, { "epoch": 1.49, "grad_norm": 1.4092752933502197, "learning_rate": 8.533898305084746e-05, "loss": 0.1605, "step": 10070 }, { "epoch": 1.49, "grad_norm": 0.45507368445396423, "learning_rate": 8.532411537317871e-05, "loss": 0.1482, "step": 10080 }, { "epoch": 1.5, "grad_norm": 0.43545153737068176, "learning_rate": 8.530924769550997e-05, "loss": 0.1506, "step": 10090 }, { "epoch": 1.5, "grad_norm": 0.4408663809299469, "learning_rate": 8.529438001784121e-05, "loss": 0.1448, "step": 10100 }, { "epoch": 1.5, "grad_norm": 0.2926168441772461, "learning_rate": 8.527951234017248e-05, "loss": 0.1538, "step": 10110 }, { "epoch": 1.5, "grad_norm": 0.5207601189613342, "learning_rate": 8.526464466250372e-05, "loss": 0.144, "step": 10120 }, { "epoch": 1.5, "grad_norm": 1.4128458499908447, "learning_rate": 8.524977698483497e-05, "loss": 0.1407, "step": 10130 }, { "epoch": 1.5, "grad_norm": 0.6333006620407104, "learning_rate": 8.523490930716622e-05, "loss": 0.1476, "step": 10140 }, { "epoch": 1.5, "grad_norm": 1.300640344619751, "learning_rate": 8.522004162949747e-05, "loss": 0.145, "step": 10150 }, { "epoch": 1.51, "grad_norm": 1.1418519020080566, "learning_rate": 8.520517395182873e-05, "loss": 0.138, "step": 10160 }, { "epoch": 1.51, "grad_norm": 0.2554391324520111, "learning_rate": 8.519030627415998e-05, "loss": 0.1514, "step": 10170 }, { "epoch": 1.51, "grad_norm": 0.49666744470596313, "learning_rate": 8.517543859649124e-05, "loss": 0.1406, "step": 10180 }, { "epoch": 1.51, "grad_norm": 1.7986241579055786, "learning_rate": 8.516057091882248e-05, "loss": 0.1476, "step": 10190 }, { "epoch": 1.51, "grad_norm": 1.6229901313781738, "learning_rate": 8.514570324115374e-05, "loss": 0.1539, "step": 10200 }, { "epoch": 1.51, "grad_norm": 1.0691964626312256, "learning_rate": 8.513083556348499e-05, "loss": 0.1462, "step": 10210 }, { "epoch": 1.51, "grad_norm": 0.3225235342979431, "learning_rate": 8.511596788581624e-05, "loss": 0.1574, "step": 10220 }, { "epoch": 1.52, "grad_norm": 0.46615543961524963, "learning_rate": 8.510110020814748e-05, "loss": 0.1395, "step": 10230 }, { "epoch": 1.52, "grad_norm": 1.2598484754562378, "learning_rate": 8.508623253047874e-05, "loss": 0.1358, "step": 10240 }, { "epoch": 1.52, "grad_norm": 0.3854273855686188, "learning_rate": 8.507136485281e-05, "loss": 0.1458, "step": 10250 }, { "epoch": 1.52, "grad_norm": 0.6581094264984131, "learning_rate": 8.505649717514125e-05, "loss": 0.1506, "step": 10260 }, { "epoch": 1.52, "grad_norm": 0.3710658550262451, "learning_rate": 8.50416294974725e-05, "loss": 0.1439, "step": 10270 }, { "epoch": 1.52, "grad_norm": 0.9312505722045898, "learning_rate": 8.502676181980374e-05, "loss": 0.1486, "step": 10280 }, { "epoch": 1.53, "grad_norm": 0.8688971996307373, "learning_rate": 8.501189414213501e-05, "loss": 0.1434, "step": 10290 }, { "epoch": 1.53, "grad_norm": 0.39325767755508423, "learning_rate": 8.499702646446625e-05, "loss": 0.1432, "step": 10300 }, { "epoch": 1.53, "grad_norm": 0.48646825551986694, "learning_rate": 8.49821587867975e-05, "loss": 0.1522, "step": 10310 }, { "epoch": 1.53, "grad_norm": 0.8287198543548584, "learning_rate": 8.496729110912876e-05, "loss": 0.1463, "step": 10320 }, { "epoch": 1.53, "grad_norm": 0.5924571752548218, "learning_rate": 8.495242343146002e-05, "loss": 0.1455, "step": 10330 }, { "epoch": 1.53, "grad_norm": 0.2828935980796814, "learning_rate": 8.493755575379127e-05, "loss": 0.1436, "step": 10340 }, { "epoch": 1.53, "grad_norm": 1.714647889137268, "learning_rate": 8.492268807612251e-05, "loss": 0.1476, "step": 10350 }, { "epoch": 1.54, "grad_norm": 1.1875629425048828, "learning_rate": 8.490782039845377e-05, "loss": 0.1486, "step": 10360 }, { "epoch": 1.54, "grad_norm": 1.7616733312606812, "learning_rate": 8.489295272078501e-05, "loss": 0.1464, "step": 10370 }, { "epoch": 1.54, "grad_norm": 0.29138264060020447, "learning_rate": 8.487808504311628e-05, "loss": 0.1407, "step": 10380 }, { "epoch": 1.54, "grad_norm": 0.49664175510406494, "learning_rate": 8.486321736544752e-05, "loss": 0.1515, "step": 10390 }, { "epoch": 1.54, "grad_norm": 0.4667070806026459, "learning_rate": 8.484834968777877e-05, "loss": 0.15, "step": 10400 }, { "epoch": 1.54, "grad_norm": 0.3020360469818115, "learning_rate": 8.483348201011003e-05, "loss": 0.145, "step": 10410 }, { "epoch": 1.54, "grad_norm": 0.41862982511520386, "learning_rate": 8.481861433244128e-05, "loss": 0.1495, "step": 10420 }, { "epoch": 1.55, "grad_norm": 0.3439256250858307, "learning_rate": 8.480374665477254e-05, "loss": 0.1509, "step": 10430 }, { "epoch": 1.55, "grad_norm": 0.6848132610321045, "learning_rate": 8.478887897710378e-05, "loss": 0.1456, "step": 10440 }, { "epoch": 1.55, "grad_norm": 0.34469151496887207, "learning_rate": 8.477401129943503e-05, "loss": 0.147, "step": 10450 }, { "epoch": 1.55, "grad_norm": 2.268392324447632, "learning_rate": 8.475914362176627e-05, "loss": 0.1561, "step": 10460 }, { "epoch": 1.55, "grad_norm": 1.23007071018219, "learning_rate": 8.474427594409754e-05, "loss": 0.159, "step": 10470 }, { "epoch": 1.55, "grad_norm": 0.5129968523979187, "learning_rate": 8.472940826642878e-05, "loss": 0.1468, "step": 10480 }, { "epoch": 1.55, "grad_norm": 0.5921646952629089, "learning_rate": 8.471454058876004e-05, "loss": 0.1428, "step": 10490 }, { "epoch": 1.56, "grad_norm": 0.3901660740375519, "learning_rate": 8.469967291109129e-05, "loss": 0.1483, "step": 10500 }, { "epoch": 1.56, "grad_norm": 0.8023537993431091, "learning_rate": 8.468480523342255e-05, "loss": 0.1503, "step": 10510 }, { "epoch": 1.56, "grad_norm": 0.708949625492096, "learning_rate": 8.46699375557538e-05, "loss": 0.1437, "step": 10520 }, { "epoch": 1.56, "grad_norm": 0.8118116855621338, "learning_rate": 8.465506987808504e-05, "loss": 0.1391, "step": 10530 }, { "epoch": 1.56, "grad_norm": 0.32893306016921997, "learning_rate": 8.46402022004163e-05, "loss": 0.1416, "step": 10540 }, { "epoch": 1.56, "grad_norm": 0.5859858393669128, "learning_rate": 8.462533452274754e-05, "loss": 0.1415, "step": 10550 }, { "epoch": 1.57, "grad_norm": 0.7075644135475159, "learning_rate": 8.461046684507881e-05, "loss": 0.1485, "step": 10560 }, { "epoch": 1.57, "grad_norm": 0.966854453086853, "learning_rate": 8.459559916741005e-05, "loss": 0.1433, "step": 10570 }, { "epoch": 1.57, "grad_norm": 0.5331383347511292, "learning_rate": 8.45807314897413e-05, "loss": 0.1508, "step": 10580 }, { "epoch": 1.57, "grad_norm": 1.2724835872650146, "learning_rate": 8.456586381207256e-05, "loss": 0.1507, "step": 10590 }, { "epoch": 1.57, "grad_norm": 0.28108617663383484, "learning_rate": 8.455099613440381e-05, "loss": 0.1453, "step": 10600 }, { "epoch": 1.57, "grad_norm": 0.3450464904308319, "learning_rate": 8.453612845673507e-05, "loss": 0.1478, "step": 10610 }, { "epoch": 1.57, "grad_norm": 0.8239097595214844, "learning_rate": 8.452126077906631e-05, "loss": 0.1438, "step": 10620 }, { "epoch": 1.58, "grad_norm": 0.9062354564666748, "learning_rate": 8.450639310139756e-05, "loss": 0.1467, "step": 10630 }, { "epoch": 1.58, "grad_norm": 0.6238219141960144, "learning_rate": 8.449152542372882e-05, "loss": 0.1454, "step": 10640 }, { "epoch": 1.58, "grad_norm": 2.089106798171997, "learning_rate": 8.447665774606007e-05, "loss": 0.1393, "step": 10650 }, { "epoch": 1.58, "grad_norm": 0.6828879714012146, "learning_rate": 8.446179006839133e-05, "loss": 0.1471, "step": 10660 }, { "epoch": 1.58, "grad_norm": 0.3843400478363037, "learning_rate": 8.444692239072257e-05, "loss": 0.1414, "step": 10670 }, { "epoch": 1.58, "grad_norm": 1.483210802078247, "learning_rate": 8.443205471305382e-05, "loss": 0.146, "step": 10680 }, { "epoch": 1.58, "grad_norm": 1.3872867822647095, "learning_rate": 8.441718703538508e-05, "loss": 0.1386, "step": 10690 }, { "epoch": 1.59, "grad_norm": 0.7687630653381348, "learning_rate": 8.440231935771633e-05, "loss": 0.1524, "step": 10700 }, { "epoch": 1.59, "grad_norm": 0.3632531762123108, "learning_rate": 8.438745168004758e-05, "loss": 0.1457, "step": 10710 }, { "epoch": 1.59, "grad_norm": 0.7489669919013977, "learning_rate": 8.437258400237883e-05, "loss": 0.149, "step": 10720 }, { "epoch": 1.59, "grad_norm": 0.3110584318637848, "learning_rate": 8.435771632471008e-05, "loss": 0.1464, "step": 10730 }, { "epoch": 1.59, "grad_norm": 0.9831928014755249, "learning_rate": 8.434284864704134e-05, "loss": 0.1494, "step": 10740 }, { "epoch": 1.59, "grad_norm": 0.27060386538505554, "learning_rate": 8.43279809693726e-05, "loss": 0.1492, "step": 10750 }, { "epoch": 1.59, "grad_norm": 0.3173629939556122, "learning_rate": 8.431311329170384e-05, "loss": 0.1501, "step": 10760 }, { "epoch": 1.6, "grad_norm": 1.2598687410354614, "learning_rate": 8.429824561403509e-05, "loss": 0.1528, "step": 10770 }, { "epoch": 1.6, "grad_norm": 0.29677924513816833, "learning_rate": 8.428337793636634e-05, "loss": 0.1419, "step": 10780 }, { "epoch": 1.6, "grad_norm": 0.3866405487060547, "learning_rate": 8.42685102586976e-05, "loss": 0.1359, "step": 10790 }, { "epoch": 1.6, "grad_norm": 0.707709014415741, "learning_rate": 8.425364258102884e-05, "loss": 0.145, "step": 10800 }, { "epoch": 1.6, "grad_norm": 0.9584312438964844, "learning_rate": 8.42387749033601e-05, "loss": 0.1472, "step": 10810 }, { "epoch": 1.6, "grad_norm": 0.9559441208839417, "learning_rate": 8.422390722569135e-05, "loss": 0.1432, "step": 10820 }, { "epoch": 1.61, "grad_norm": 1.803369402885437, "learning_rate": 8.42090395480226e-05, "loss": 0.1508, "step": 10830 }, { "epoch": 1.61, "grad_norm": 0.5966110229492188, "learning_rate": 8.419417187035386e-05, "loss": 0.1502, "step": 10840 }, { "epoch": 1.61, "grad_norm": 1.5593287944793701, "learning_rate": 8.41793041926851e-05, "loss": 0.145, "step": 10850 }, { "epoch": 1.61, "grad_norm": 2.490039348602295, "learning_rate": 8.416443651501636e-05, "loss": 0.1561, "step": 10860 }, { "epoch": 1.61, "grad_norm": 0.7378647327423096, "learning_rate": 8.414956883734761e-05, "loss": 0.1475, "step": 10870 }, { "epoch": 1.61, "grad_norm": 1.7256468534469604, "learning_rate": 8.413470115967887e-05, "loss": 0.1471, "step": 10880 }, { "epoch": 1.61, "grad_norm": 0.516965389251709, "learning_rate": 8.41198334820101e-05, "loss": 0.1427, "step": 10890 }, { "epoch": 1.62, "grad_norm": 0.44696128368377686, "learning_rate": 8.410496580434136e-05, "loss": 0.1506, "step": 10900 }, { "epoch": 1.62, "grad_norm": 0.663616418838501, "learning_rate": 8.409009812667262e-05, "loss": 0.1451, "step": 10910 }, { "epoch": 1.62, "grad_norm": 0.6834636330604553, "learning_rate": 8.407523044900387e-05, "loss": 0.1464, "step": 10920 }, { "epoch": 1.62, "grad_norm": 0.3726295232772827, "learning_rate": 8.406036277133513e-05, "loss": 0.1496, "step": 10930 }, { "epoch": 1.62, "grad_norm": 0.9244528412818909, "learning_rate": 8.404549509366637e-05, "loss": 0.1445, "step": 10940 }, { "epoch": 1.62, "grad_norm": 1.3497867584228516, "learning_rate": 8.403062741599762e-05, "loss": 0.1486, "step": 10950 }, { "epoch": 1.62, "grad_norm": 0.5133809447288513, "learning_rate": 8.401575973832888e-05, "loss": 0.1521, "step": 10960 }, { "epoch": 1.63, "grad_norm": 0.28419455885887146, "learning_rate": 8.400089206066013e-05, "loss": 0.1466, "step": 10970 }, { "epoch": 1.63, "grad_norm": 0.7684698104858398, "learning_rate": 8.398602438299139e-05, "loss": 0.15, "step": 10980 }, { "epoch": 1.63, "grad_norm": 1.1589276790618896, "learning_rate": 8.397115670532263e-05, "loss": 0.1472, "step": 10990 }, { "epoch": 1.63, "grad_norm": 1.120748519897461, "learning_rate": 8.39562890276539e-05, "loss": 0.1433, "step": 11000 }, { "epoch": 1.63, "grad_norm": 1.4150292873382568, "learning_rate": 8.394142134998514e-05, "loss": 0.1431, "step": 11010 }, { "epoch": 1.63, "grad_norm": 1.4702545404434204, "learning_rate": 8.392655367231639e-05, "loss": 0.1456, "step": 11020 }, { "epoch": 1.63, "grad_norm": 0.38210058212280273, "learning_rate": 8.391168599464763e-05, "loss": 0.1436, "step": 11030 }, { "epoch": 1.64, "grad_norm": 0.4304143190383911, "learning_rate": 8.389681831697889e-05, "loss": 0.1487, "step": 11040 }, { "epoch": 1.64, "grad_norm": 0.3502871096134186, "learning_rate": 8.388195063931014e-05, "loss": 0.1469, "step": 11050 }, { "epoch": 1.64, "grad_norm": 0.36102864146232605, "learning_rate": 8.38670829616414e-05, "loss": 0.1552, "step": 11060 }, { "epoch": 1.64, "grad_norm": 0.8794988989830017, "learning_rate": 8.385221528397265e-05, "loss": 0.1436, "step": 11070 }, { "epoch": 1.64, "grad_norm": 0.30111417174339294, "learning_rate": 8.383734760630389e-05, "loss": 0.1411, "step": 11080 }, { "epoch": 1.64, "grad_norm": 0.3722517788410187, "learning_rate": 8.382247992863516e-05, "loss": 0.1416, "step": 11090 }, { "epoch": 1.65, "grad_norm": 1.2735258340835571, "learning_rate": 8.38076122509664e-05, "loss": 0.1492, "step": 11100 }, { "epoch": 1.65, "grad_norm": 0.6874326467514038, "learning_rate": 8.379274457329766e-05, "loss": 0.1472, "step": 11110 }, { "epoch": 1.65, "grad_norm": 0.6189324855804443, "learning_rate": 8.37778768956289e-05, "loss": 0.1492, "step": 11120 }, { "epoch": 1.65, "grad_norm": 0.7874268293380737, "learning_rate": 8.376300921796015e-05, "loss": 0.1489, "step": 11130 }, { "epoch": 1.65, "grad_norm": 1.9058316946029663, "learning_rate": 8.374814154029141e-05, "loss": 0.1466, "step": 11140 }, { "epoch": 1.65, "grad_norm": 0.27435657382011414, "learning_rate": 8.373327386262266e-05, "loss": 0.1548, "step": 11150 }, { "epoch": 1.65, "grad_norm": 0.48889249563217163, "learning_rate": 8.371840618495392e-05, "loss": 0.1531, "step": 11160 }, { "epoch": 1.66, "grad_norm": 0.809974730014801, "learning_rate": 8.370353850728516e-05, "loss": 0.1492, "step": 11170 }, { "epoch": 1.66, "grad_norm": 0.6859524846076965, "learning_rate": 8.368867082961643e-05, "loss": 0.1462, "step": 11180 }, { "epoch": 1.66, "grad_norm": 2.093574047088623, "learning_rate": 8.367380315194767e-05, "loss": 0.1498, "step": 11190 }, { "epoch": 1.66, "grad_norm": 1.384391188621521, "learning_rate": 8.365893547427892e-05, "loss": 0.1529, "step": 11200 }, { "epoch": 1.66, "grad_norm": 1.227850317955017, "learning_rate": 8.364406779661016e-05, "loss": 0.1448, "step": 11210 }, { "epoch": 1.66, "grad_norm": 1.0657614469528198, "learning_rate": 8.362920011894142e-05, "loss": 0.1444, "step": 11220 }, { "epoch": 1.66, "grad_norm": 0.9369791746139526, "learning_rate": 8.361433244127267e-05, "loss": 0.1484, "step": 11230 }, { "epoch": 1.67, "grad_norm": 0.27957215905189514, "learning_rate": 8.359946476360393e-05, "loss": 0.142, "step": 11240 }, { "epoch": 1.67, "grad_norm": 0.39011701941490173, "learning_rate": 8.358459708593518e-05, "loss": 0.1485, "step": 11250 }, { "epoch": 1.67, "grad_norm": 0.4581025242805481, "learning_rate": 8.356972940826642e-05, "loss": 0.1567, "step": 11260 }, { "epoch": 1.67, "grad_norm": 0.7829630970954895, "learning_rate": 8.355486173059769e-05, "loss": 0.1439, "step": 11270 }, { "epoch": 1.67, "grad_norm": 0.7222351431846619, "learning_rate": 8.353999405292893e-05, "loss": 0.1411, "step": 11280 }, { "epoch": 1.67, "grad_norm": 0.4510668218135834, "learning_rate": 8.352512637526019e-05, "loss": 0.14, "step": 11290 }, { "epoch": 1.68, "grad_norm": 0.3421786427497864, "learning_rate": 8.351025869759144e-05, "loss": 0.1495, "step": 11300 }, { "epoch": 1.68, "grad_norm": 0.24841587245464325, "learning_rate": 8.34953910199227e-05, "loss": 0.1395, "step": 11310 }, { "epoch": 1.68, "grad_norm": 0.25271904468536377, "learning_rate": 8.348052334225395e-05, "loss": 0.1427, "step": 11320 }, { "epoch": 1.68, "grad_norm": 0.5229553580284119, "learning_rate": 8.34656556645852e-05, "loss": 0.1425, "step": 11330 }, { "epoch": 1.68, "grad_norm": 0.3751966953277588, "learning_rate": 8.345078798691645e-05, "loss": 0.1435, "step": 11340 }, { "epoch": 1.68, "grad_norm": 1.0860705375671387, "learning_rate": 8.343592030924769e-05, "loss": 0.1482, "step": 11350 }, { "epoch": 1.68, "grad_norm": 1.6879124641418457, "learning_rate": 8.342105263157896e-05, "loss": 0.1495, "step": 11360 }, { "epoch": 1.69, "grad_norm": 0.5659767985343933, "learning_rate": 8.34061849539102e-05, "loss": 0.1421, "step": 11370 }, { "epoch": 1.69, "grad_norm": 0.4345433712005615, "learning_rate": 8.339131727624145e-05, "loss": 0.1499, "step": 11380 }, { "epoch": 1.69, "grad_norm": 1.5097520351409912, "learning_rate": 8.337644959857271e-05, "loss": 0.1554, "step": 11390 }, { "epoch": 1.69, "grad_norm": 0.5569794178009033, "learning_rate": 8.336158192090396e-05, "loss": 0.1438, "step": 11400 }, { "epoch": 1.69, "grad_norm": 1.062078833580017, "learning_rate": 8.334671424323522e-05, "loss": 0.1532, "step": 11410 }, { "epoch": 1.69, "grad_norm": 0.6521728038787842, "learning_rate": 8.333184656556646e-05, "loss": 0.146, "step": 11420 }, { "epoch": 1.69, "grad_norm": 1.404455542564392, "learning_rate": 8.331697888789771e-05, "loss": 0.1506, "step": 11430 }, { "epoch": 1.7, "grad_norm": 0.4847467243671417, "learning_rate": 8.330211121022896e-05, "loss": 0.1416, "step": 11440 }, { "epoch": 1.7, "grad_norm": 0.5724780559539795, "learning_rate": 8.328724353256022e-05, "loss": 0.1499, "step": 11450 }, { "epoch": 1.7, "grad_norm": 0.3448980152606964, "learning_rate": 8.327237585489147e-05, "loss": 0.1492, "step": 11460 }, { "epoch": 1.7, "grad_norm": 0.49433252215385437, "learning_rate": 8.325750817722272e-05, "loss": 0.1497, "step": 11470 }, { "epoch": 1.7, "grad_norm": 1.2433007955551147, "learning_rate": 8.324264049955398e-05, "loss": 0.1426, "step": 11480 }, { "epoch": 1.7, "grad_norm": 0.5715564489364624, "learning_rate": 8.322777282188523e-05, "loss": 0.1462, "step": 11490 }, { "epoch": 1.7, "grad_norm": 0.7039369940757751, "learning_rate": 8.321290514421648e-05, "loss": 0.1468, "step": 11500 }, { "epoch": 1.71, "grad_norm": 0.6003488898277283, "learning_rate": 8.319803746654773e-05, "loss": 0.1423, "step": 11510 }, { "epoch": 1.71, "grad_norm": 0.4361822009086609, "learning_rate": 8.318316978887898e-05, "loss": 0.1389, "step": 11520 }, { "epoch": 1.71, "grad_norm": 0.3091180622577667, "learning_rate": 8.316830211121022e-05, "loss": 0.1503, "step": 11530 }, { "epoch": 1.71, "grad_norm": 0.3191719651222229, "learning_rate": 8.315343443354149e-05, "loss": 0.1499, "step": 11540 }, { "epoch": 1.71, "grad_norm": 0.5728769898414612, "learning_rate": 8.313856675587273e-05, "loss": 0.1539, "step": 11550 }, { "epoch": 1.71, "grad_norm": 1.094186782836914, "learning_rate": 8.312369907820399e-05, "loss": 0.1457, "step": 11560 }, { "epoch": 1.72, "grad_norm": 0.9020346999168396, "learning_rate": 8.310883140053524e-05, "loss": 0.1417, "step": 11570 }, { "epoch": 1.72, "grad_norm": 0.3199603855609894, "learning_rate": 8.30939637228665e-05, "loss": 0.1461, "step": 11580 }, { "epoch": 1.72, "grad_norm": 1.0080779790878296, "learning_rate": 8.307909604519775e-05, "loss": 0.1436, "step": 11590 }, { "epoch": 1.72, "grad_norm": 0.37147411704063416, "learning_rate": 8.306422836752899e-05, "loss": 0.1354, "step": 11600 }, { "epoch": 1.72, "grad_norm": 0.7766250371932983, "learning_rate": 8.304936068986025e-05, "loss": 0.1465, "step": 11610 }, { "epoch": 1.72, "grad_norm": 0.32852333784103394, "learning_rate": 8.303449301219149e-05, "loss": 0.1422, "step": 11620 }, { "epoch": 1.72, "grad_norm": 0.5026690363883972, "learning_rate": 8.301962533452276e-05, "loss": 0.1493, "step": 11630 }, { "epoch": 1.73, "grad_norm": 0.4522212743759155, "learning_rate": 8.300475765685401e-05, "loss": 0.1381, "step": 11640 }, { "epoch": 1.73, "grad_norm": 0.3424636125564575, "learning_rate": 8.298988997918525e-05, "loss": 0.1468, "step": 11650 }, { "epoch": 1.73, "grad_norm": 0.5932589769363403, "learning_rate": 8.29750223015165e-05, "loss": 0.1533, "step": 11660 }, { "epoch": 1.73, "grad_norm": 0.817040741443634, "learning_rate": 8.296015462384776e-05, "loss": 0.1396, "step": 11670 }, { "epoch": 1.73, "grad_norm": 0.25373414158821106, "learning_rate": 8.294528694617902e-05, "loss": 0.147, "step": 11680 }, { "epoch": 1.73, "grad_norm": 0.36040133237838745, "learning_rate": 8.293041926851026e-05, "loss": 0.1508, "step": 11690 }, { "epoch": 1.73, "grad_norm": 1.032599925994873, "learning_rate": 8.291555159084151e-05, "loss": 0.1394, "step": 11700 }, { "epoch": 1.74, "grad_norm": 0.42168989777565, "learning_rate": 8.290068391317277e-05, "loss": 0.1495, "step": 11710 }, { "epoch": 1.74, "grad_norm": 0.49878957867622375, "learning_rate": 8.288581623550402e-05, "loss": 0.1383, "step": 11720 }, { "epoch": 1.74, "grad_norm": 0.8637439012527466, "learning_rate": 8.287094855783528e-05, "loss": 0.1388, "step": 11730 }, { "epoch": 1.74, "grad_norm": 0.34474968910217285, "learning_rate": 8.285608088016652e-05, "loss": 0.1506, "step": 11740 }, { "epoch": 1.74, "grad_norm": 0.9870109558105469, "learning_rate": 8.284121320249777e-05, "loss": 0.1516, "step": 11750 }, { "epoch": 1.74, "grad_norm": 0.502234160900116, "learning_rate": 8.282634552482903e-05, "loss": 0.149, "step": 11760 }, { "epoch": 1.74, "grad_norm": 1.7274667024612427, "learning_rate": 8.281147784716028e-05, "loss": 0.1487, "step": 11770 }, { "epoch": 1.75, "grad_norm": 0.27660071849823, "learning_rate": 8.279661016949152e-05, "loss": 0.1398, "step": 11780 }, { "epoch": 1.75, "grad_norm": 0.35568374395370483, "learning_rate": 8.278174249182278e-05, "loss": 0.1505, "step": 11790 }, { "epoch": 1.75, "grad_norm": 1.317064881324768, "learning_rate": 8.276687481415403e-05, "loss": 0.1522, "step": 11800 }, { "epoch": 1.75, "grad_norm": 0.6406526565551758, "learning_rate": 8.275200713648529e-05, "loss": 0.1518, "step": 11810 }, { "epoch": 1.75, "grad_norm": 0.24043872952461243, "learning_rate": 8.273713945881654e-05, "loss": 0.1514, "step": 11820 }, { "epoch": 1.75, "grad_norm": 2.5549800395965576, "learning_rate": 8.272227178114778e-05, "loss": 0.1501, "step": 11830 }, { "epoch": 1.76, "grad_norm": 0.8846415877342224, "learning_rate": 8.270740410347904e-05, "loss": 0.1491, "step": 11840 }, { "epoch": 1.76, "grad_norm": 0.6220967173576355, "learning_rate": 8.269253642581029e-05, "loss": 0.1497, "step": 11850 }, { "epoch": 1.76, "grad_norm": 0.24483416974544525, "learning_rate": 8.267766874814155e-05, "loss": 0.1385, "step": 11860 }, { "epoch": 1.76, "grad_norm": 0.46550866961479187, "learning_rate": 8.266280107047279e-05, "loss": 0.1484, "step": 11870 }, { "epoch": 1.76, "grad_norm": 0.3104701340198517, "learning_rate": 8.264793339280404e-05, "loss": 0.1435, "step": 11880 }, { "epoch": 1.76, "grad_norm": 2.2419028282165527, "learning_rate": 8.26330657151353e-05, "loss": 0.1523, "step": 11890 }, { "epoch": 1.76, "grad_norm": 0.3503584563732147, "learning_rate": 8.261819803746655e-05, "loss": 0.1461, "step": 11900 }, { "epoch": 1.77, "grad_norm": 0.2698685824871063, "learning_rate": 8.260333035979781e-05, "loss": 0.148, "step": 11910 }, { "epoch": 1.77, "grad_norm": 0.36493173241615295, "learning_rate": 8.258846268212905e-05, "loss": 0.1447, "step": 11920 }, { "epoch": 1.77, "grad_norm": 0.378431111574173, "learning_rate": 8.25735950044603e-05, "loss": 0.1528, "step": 11930 }, { "epoch": 1.77, "grad_norm": 0.6286464333534241, "learning_rate": 8.255872732679156e-05, "loss": 0.1419, "step": 11940 }, { "epoch": 1.77, "grad_norm": 1.7695555686950684, "learning_rate": 8.254385964912281e-05, "loss": 0.1496, "step": 11950 }, { "epoch": 1.77, "grad_norm": 0.46831145882606506, "learning_rate": 8.252899197145407e-05, "loss": 0.1418, "step": 11960 }, { "epoch": 1.77, "grad_norm": 0.4352415204048157, "learning_rate": 8.251412429378531e-05, "loss": 0.1506, "step": 11970 }, { "epoch": 1.78, "grad_norm": 0.2404990941286087, "learning_rate": 8.249925661611658e-05, "loss": 0.1527, "step": 11980 }, { "epoch": 1.78, "grad_norm": 1.8183870315551758, "learning_rate": 8.248438893844782e-05, "loss": 0.1429, "step": 11990 }, { "epoch": 1.78, "grad_norm": 0.2836654484272003, "learning_rate": 8.246952126077907e-05, "loss": 0.1403, "step": 12000 }, { "epoch": 1.78, "grad_norm": 0.32020139694213867, "learning_rate": 8.245465358311032e-05, "loss": 0.1513, "step": 12010 }, { "epoch": 1.78, "grad_norm": 1.3946151733398438, "learning_rate": 8.243978590544157e-05, "loss": 0.1385, "step": 12020 }, { "epoch": 1.78, "grad_norm": 1.3150290250778198, "learning_rate": 8.242491822777282e-05, "loss": 0.1456, "step": 12030 }, { "epoch": 1.78, "grad_norm": 0.21857360005378723, "learning_rate": 8.241005055010408e-05, "loss": 0.1452, "step": 12040 }, { "epoch": 1.79, "grad_norm": 1.4940718412399292, "learning_rate": 8.239518287243533e-05, "loss": 0.1406, "step": 12050 }, { "epoch": 1.79, "grad_norm": 1.5689001083374023, "learning_rate": 8.238031519476658e-05, "loss": 0.1566, "step": 12060 }, { "epoch": 1.79, "grad_norm": 0.8649786710739136, "learning_rate": 8.236544751709784e-05, "loss": 0.1498, "step": 12070 }, { "epoch": 1.79, "grad_norm": 1.0453568696975708, "learning_rate": 8.235057983942908e-05, "loss": 0.1461, "step": 12080 }, { "epoch": 1.79, "grad_norm": 1.147850513458252, "learning_rate": 8.233571216176034e-05, "loss": 0.1438, "step": 12090 }, { "epoch": 1.79, "grad_norm": 0.5856927037239075, "learning_rate": 8.232084448409158e-05, "loss": 0.1477, "step": 12100 }, { "epoch": 1.8, "grad_norm": 0.24364539980888367, "learning_rate": 8.230597680642284e-05, "loss": 0.1489, "step": 12110 }, { "epoch": 1.8, "grad_norm": 0.500666618347168, "learning_rate": 8.229110912875409e-05, "loss": 0.1473, "step": 12120 }, { "epoch": 1.8, "grad_norm": 0.2935793697834015, "learning_rate": 8.227624145108535e-05, "loss": 0.1497, "step": 12130 }, { "epoch": 1.8, "grad_norm": 0.3563259541988373, "learning_rate": 8.22613737734166e-05, "loss": 0.1418, "step": 12140 }, { "epoch": 1.8, "grad_norm": 0.3523156940937042, "learning_rate": 8.224650609574784e-05, "loss": 0.1453, "step": 12150 }, { "epoch": 1.8, "grad_norm": 1.0185507535934448, "learning_rate": 8.223163841807911e-05, "loss": 0.1399, "step": 12160 }, { "epoch": 1.8, "grad_norm": 0.5404852628707886, "learning_rate": 8.221677074041035e-05, "loss": 0.1463, "step": 12170 }, { "epoch": 1.81, "grad_norm": 1.4389984607696533, "learning_rate": 8.22019030627416e-05, "loss": 0.1524, "step": 12180 }, { "epoch": 1.81, "grad_norm": 0.39658161997795105, "learning_rate": 8.218703538507285e-05, "loss": 0.1424, "step": 12190 }, { "epoch": 1.81, "grad_norm": 0.3328675329685211, "learning_rate": 8.217216770740411e-05, "loss": 0.1478, "step": 12200 }, { "epoch": 1.81, "grad_norm": 0.8993117213249207, "learning_rate": 8.215730002973536e-05, "loss": 0.1431, "step": 12210 }, { "epoch": 1.81, "grad_norm": 0.31393373012542725, "learning_rate": 8.214243235206661e-05, "loss": 0.153, "step": 12220 }, { "epoch": 1.81, "grad_norm": 0.40346676111221313, "learning_rate": 8.212756467439787e-05, "loss": 0.1552, "step": 12230 }, { "epoch": 1.81, "grad_norm": 0.2991549074649811, "learning_rate": 8.211269699672911e-05, "loss": 0.1443, "step": 12240 }, { "epoch": 1.82, "grad_norm": 0.729020357131958, "learning_rate": 8.209782931906038e-05, "loss": 0.1419, "step": 12250 }, { "epoch": 1.82, "grad_norm": 1.1650629043579102, "learning_rate": 8.208296164139162e-05, "loss": 0.1547, "step": 12260 }, { "epoch": 1.82, "grad_norm": 1.2502027750015259, "learning_rate": 8.206809396372287e-05, "loss": 0.1468, "step": 12270 }, { "epoch": 1.82, "grad_norm": 0.664297342300415, "learning_rate": 8.205322628605411e-05, "loss": 0.1468, "step": 12280 }, { "epoch": 1.82, "grad_norm": 0.310942143201828, "learning_rate": 8.203835860838538e-05, "loss": 0.1441, "step": 12290 }, { "epoch": 1.82, "grad_norm": 0.28401151299476624, "learning_rate": 8.202349093071664e-05, "loss": 0.1436, "step": 12300 }, { "epoch": 1.82, "grad_norm": 0.32295674085617065, "learning_rate": 8.200862325304788e-05, "loss": 0.1445, "step": 12310 }, { "epoch": 1.83, "grad_norm": 0.49534422159194946, "learning_rate": 8.199375557537913e-05, "loss": 0.1505, "step": 12320 }, { "epoch": 1.83, "grad_norm": 0.6425632834434509, "learning_rate": 8.197888789771037e-05, "loss": 0.1489, "step": 12330 }, { "epoch": 1.83, "grad_norm": 0.6739435195922852, "learning_rate": 8.196402022004164e-05, "loss": 0.1453, "step": 12340 }, { "epoch": 1.83, "grad_norm": 0.706912636756897, "learning_rate": 8.194915254237288e-05, "loss": 0.141, "step": 12350 }, { "epoch": 1.83, "grad_norm": 0.9807479381561279, "learning_rate": 8.193428486470414e-05, "loss": 0.1542, "step": 12360 }, { "epoch": 1.83, "grad_norm": 1.7933177947998047, "learning_rate": 8.191941718703539e-05, "loss": 0.1478, "step": 12370 }, { "epoch": 1.84, "grad_norm": 0.555246889591217, "learning_rate": 8.190454950936665e-05, "loss": 0.1593, "step": 12380 }, { "epoch": 1.84, "grad_norm": 0.279439777135849, "learning_rate": 8.18896818316979e-05, "loss": 0.1494, "step": 12390 }, { "epoch": 1.84, "grad_norm": 0.6358618140220642, "learning_rate": 8.187481415402914e-05, "loss": 0.1567, "step": 12400 }, { "epoch": 1.84, "grad_norm": 0.4418742060661316, "learning_rate": 8.18599464763604e-05, "loss": 0.1452, "step": 12410 }, { "epoch": 1.84, "grad_norm": 0.34239551424980164, "learning_rate": 8.184507879869164e-05, "loss": 0.1405, "step": 12420 }, { "epoch": 1.84, "grad_norm": 1.068682074546814, "learning_rate": 8.183021112102291e-05, "loss": 0.1454, "step": 12430 }, { "epoch": 1.84, "grad_norm": 0.748917281627655, "learning_rate": 8.181534344335415e-05, "loss": 0.1468, "step": 12440 }, { "epoch": 1.85, "grad_norm": 1.5174105167388916, "learning_rate": 8.18004757656854e-05, "loss": 0.137, "step": 12450 }, { "epoch": 1.85, "grad_norm": 0.29713815450668335, "learning_rate": 8.178560808801666e-05, "loss": 0.1443, "step": 12460 }, { "epoch": 1.85, "grad_norm": 0.3194617033004761, "learning_rate": 8.177074041034791e-05, "loss": 0.1463, "step": 12470 }, { "epoch": 1.85, "grad_norm": 0.9451056718826294, "learning_rate": 8.175587273267917e-05, "loss": 0.1488, "step": 12480 }, { "epoch": 1.85, "grad_norm": 0.5475955009460449, "learning_rate": 8.174100505501041e-05, "loss": 0.1487, "step": 12490 }, { "epoch": 1.85, "grad_norm": 0.4979711174964905, "learning_rate": 8.172613737734166e-05, "loss": 0.1379, "step": 12500 }, { "epoch": 1.85, "grad_norm": 0.6958137154579163, "learning_rate": 8.17112696996729e-05, "loss": 0.1395, "step": 12510 }, { "epoch": 1.86, "grad_norm": 0.6111878156661987, "learning_rate": 8.169640202200417e-05, "loss": 0.1396, "step": 12520 }, { "epoch": 1.86, "grad_norm": 1.024361252784729, "learning_rate": 8.168153434433541e-05, "loss": 0.1453, "step": 12530 }, { "epoch": 1.86, "grad_norm": 0.42374831438064575, "learning_rate": 8.166666666666667e-05, "loss": 0.1441, "step": 12540 }, { "epoch": 1.86, "grad_norm": 0.4444429278373718, "learning_rate": 8.165179898899792e-05, "loss": 0.1515, "step": 12550 }, { "epoch": 1.86, "grad_norm": 0.30336788296699524, "learning_rate": 8.163693131132918e-05, "loss": 0.1548, "step": 12560 }, { "epoch": 1.86, "grad_norm": 1.3255070447921753, "learning_rate": 8.162206363366043e-05, "loss": 0.1498, "step": 12570 }, { "epoch": 1.86, "grad_norm": 0.9393750429153442, "learning_rate": 8.160719595599167e-05, "loss": 0.1439, "step": 12580 }, { "epoch": 1.87, "grad_norm": 0.9588826298713684, "learning_rate": 8.159232827832293e-05, "loss": 0.1427, "step": 12590 }, { "epoch": 1.87, "grad_norm": 1.0595064163208008, "learning_rate": 8.157746060065417e-05, "loss": 0.1481, "step": 12600 }, { "epoch": 1.87, "grad_norm": 0.584743082523346, "learning_rate": 8.156259292298544e-05, "loss": 0.1322, "step": 12610 }, { "epoch": 1.87, "grad_norm": 0.6917292475700378, "learning_rate": 8.154772524531668e-05, "loss": 0.1406, "step": 12620 }, { "epoch": 1.87, "grad_norm": 0.39056283235549927, "learning_rate": 8.153285756764793e-05, "loss": 0.1464, "step": 12630 }, { "epoch": 1.87, "grad_norm": 2.2323410511016846, "learning_rate": 8.151798988997919e-05, "loss": 0.1497, "step": 12640 }, { "epoch": 1.88, "grad_norm": 0.2180223912000656, "learning_rate": 8.150312221231044e-05, "loss": 0.147, "step": 12650 }, { "epoch": 1.88, "grad_norm": 1.1179107427597046, "learning_rate": 8.14882545346417e-05, "loss": 0.1433, "step": 12660 }, { "epoch": 1.88, "grad_norm": 0.2884316146373749, "learning_rate": 8.147338685697294e-05, "loss": 0.1545, "step": 12670 }, { "epoch": 1.88, "grad_norm": 0.5363054275512695, "learning_rate": 8.14585191793042e-05, "loss": 0.152, "step": 12680 }, { "epoch": 1.88, "grad_norm": 0.2892308831214905, "learning_rate": 8.144365150163545e-05, "loss": 0.1472, "step": 12690 }, { "epoch": 1.88, "grad_norm": 0.8672362565994263, "learning_rate": 8.14287838239667e-05, "loss": 0.1566, "step": 12700 }, { "epoch": 1.88, "grad_norm": 0.8178413510322571, "learning_rate": 8.141391614629796e-05, "loss": 0.1459, "step": 12710 }, { "epoch": 1.89, "grad_norm": 0.3414991497993469, "learning_rate": 8.13990484686292e-05, "loss": 0.1454, "step": 12720 }, { "epoch": 1.89, "grad_norm": 1.8329755067825317, "learning_rate": 8.138418079096045e-05, "loss": 0.1429, "step": 12730 }, { "epoch": 1.89, "grad_norm": 0.6989759802818298, "learning_rate": 8.136931311329171e-05, "loss": 0.1533, "step": 12740 }, { "epoch": 1.89, "grad_norm": 0.6167986392974854, "learning_rate": 8.135444543562296e-05, "loss": 0.1415, "step": 12750 }, { "epoch": 1.89, "grad_norm": 0.7545574903488159, "learning_rate": 8.13395777579542e-05, "loss": 0.1451, "step": 12760 }, { "epoch": 1.89, "grad_norm": 0.7932333946228027, "learning_rate": 8.132471008028546e-05, "loss": 0.1473, "step": 12770 }, { "epoch": 1.89, "grad_norm": 0.26402798295021057, "learning_rate": 8.130984240261672e-05, "loss": 0.1394, "step": 12780 }, { "epoch": 1.9, "grad_norm": 0.5565439462661743, "learning_rate": 8.129497472494797e-05, "loss": 0.14, "step": 12790 }, { "epoch": 1.9, "grad_norm": 0.40671250224113464, "learning_rate": 8.128010704727922e-05, "loss": 0.1454, "step": 12800 }, { "epoch": 1.9, "grad_norm": 0.8522148728370667, "learning_rate": 8.126523936961047e-05, "loss": 0.1558, "step": 12810 }, { "epoch": 1.9, "grad_norm": 0.5363799929618835, "learning_rate": 8.125037169194172e-05, "loss": 0.1404, "step": 12820 }, { "epoch": 1.9, "grad_norm": 0.44085416197776794, "learning_rate": 8.123550401427298e-05, "loss": 0.1512, "step": 12830 }, { "epoch": 1.9, "grad_norm": 0.3973836600780487, "learning_rate": 8.122063633660423e-05, "loss": 0.1456, "step": 12840 }, { "epoch": 1.9, "grad_norm": 0.5485923886299133, "learning_rate": 8.120576865893547e-05, "loss": 0.1424, "step": 12850 }, { "epoch": 1.91, "grad_norm": 0.4346306622028351, "learning_rate": 8.119090098126673e-05, "loss": 0.1508, "step": 12860 }, { "epoch": 1.91, "grad_norm": 0.2852851450443268, "learning_rate": 8.117603330359798e-05, "loss": 0.1459, "step": 12870 }, { "epoch": 1.91, "grad_norm": 1.7352854013442993, "learning_rate": 8.116116562592924e-05, "loss": 0.1475, "step": 12880 }, { "epoch": 1.91, "grad_norm": 0.3881273567676544, "learning_rate": 8.114629794826049e-05, "loss": 0.1429, "step": 12890 }, { "epoch": 1.91, "grad_norm": 1.1615941524505615, "learning_rate": 8.113143027059173e-05, "loss": 0.1473, "step": 12900 }, { "epoch": 1.91, "grad_norm": 0.435965359210968, "learning_rate": 8.111656259292299e-05, "loss": 0.1461, "step": 12910 }, { "epoch": 1.92, "grad_norm": 0.6154923439025879, "learning_rate": 8.110169491525424e-05, "loss": 0.1438, "step": 12920 }, { "epoch": 1.92, "grad_norm": 0.5796480774879456, "learning_rate": 8.10868272375855e-05, "loss": 0.145, "step": 12930 }, { "epoch": 1.92, "grad_norm": 1.1157699823379517, "learning_rate": 8.107195955991674e-05, "loss": 0.15, "step": 12940 }, { "epoch": 1.92, "grad_norm": 0.9312446117401123, "learning_rate": 8.105709188224799e-05, "loss": 0.1419, "step": 12950 }, { "epoch": 1.92, "grad_norm": 0.6173706650733948, "learning_rate": 8.104222420457925e-05, "loss": 0.1464, "step": 12960 }, { "epoch": 1.92, "grad_norm": 0.7330856919288635, "learning_rate": 8.10273565269105e-05, "loss": 0.1507, "step": 12970 }, { "epoch": 1.92, "grad_norm": 0.6786613464355469, "learning_rate": 8.101248884924176e-05, "loss": 0.1486, "step": 12980 }, { "epoch": 1.93, "grad_norm": 1.6271902322769165, "learning_rate": 8.0997621171573e-05, "loss": 0.1497, "step": 12990 }, { "epoch": 1.93, "grad_norm": 0.4697366952896118, "learning_rate": 8.098275349390425e-05, "loss": 0.1436, "step": 13000 }, { "epoch": 1.93, "grad_norm": 1.4403237104415894, "learning_rate": 8.096788581623551e-05, "loss": 0.142, "step": 13010 }, { "epoch": 1.93, "grad_norm": 0.9504378437995911, "learning_rate": 8.095301813856676e-05, "loss": 0.1441, "step": 13020 }, { "epoch": 1.93, "grad_norm": 0.271784245967865, "learning_rate": 8.093815046089802e-05, "loss": 0.1419, "step": 13030 }, { "epoch": 1.93, "grad_norm": 0.47144797444343567, "learning_rate": 8.092328278322926e-05, "loss": 0.1471, "step": 13040 }, { "epoch": 1.93, "grad_norm": 0.2984504997730255, "learning_rate": 8.090841510556053e-05, "loss": 0.1453, "step": 13050 }, { "epoch": 1.94, "grad_norm": 1.020990252494812, "learning_rate": 8.089354742789177e-05, "loss": 0.1454, "step": 13060 }, { "epoch": 1.94, "grad_norm": 0.3087640702724457, "learning_rate": 8.087867975022302e-05, "loss": 0.1497, "step": 13070 }, { "epoch": 1.94, "grad_norm": 0.4391845464706421, "learning_rate": 8.086381207255426e-05, "loss": 0.1506, "step": 13080 }, { "epoch": 1.94, "grad_norm": 2.1297712326049805, "learning_rate": 8.084894439488552e-05, "loss": 0.1502, "step": 13090 }, { "epoch": 1.94, "grad_norm": 0.29619669914245605, "learning_rate": 8.083407671721677e-05, "loss": 0.1547, "step": 13100 }, { "epoch": 1.94, "grad_norm": 1.5125505924224854, "learning_rate": 8.081920903954803e-05, "loss": 0.145, "step": 13110 }, { "epoch": 1.94, "grad_norm": 1.0014768838882446, "learning_rate": 8.080434136187928e-05, "loss": 0.1471, "step": 13120 }, { "epoch": 1.95, "grad_norm": 0.2931712865829468, "learning_rate": 8.078947368421052e-05, "loss": 0.1463, "step": 13130 }, { "epoch": 1.95, "grad_norm": 1.7871705293655396, "learning_rate": 8.077460600654179e-05, "loss": 0.1438, "step": 13140 }, { "epoch": 1.95, "grad_norm": 1.4664849042892456, "learning_rate": 8.075973832887303e-05, "loss": 0.1432, "step": 13150 }, { "epoch": 1.95, "grad_norm": 1.5097386837005615, "learning_rate": 8.074487065120429e-05, "loss": 0.1459, "step": 13160 }, { "epoch": 1.95, "grad_norm": 0.2903710603713989, "learning_rate": 8.073000297353553e-05, "loss": 0.1455, "step": 13170 }, { "epoch": 1.95, "grad_norm": 0.6520920991897583, "learning_rate": 8.07151352958668e-05, "loss": 0.1491, "step": 13180 }, { "epoch": 1.96, "grad_norm": 0.3731312155723572, "learning_rate": 8.070026761819804e-05, "loss": 0.1372, "step": 13190 }, { "epoch": 1.96, "grad_norm": 0.653975248336792, "learning_rate": 8.06853999405293e-05, "loss": 0.1439, "step": 13200 }, { "epoch": 1.96, "grad_norm": 0.7446883320808411, "learning_rate": 8.067053226286055e-05, "loss": 0.148, "step": 13210 }, { "epoch": 1.96, "grad_norm": 1.1628508567810059, "learning_rate": 8.065566458519179e-05, "loss": 0.1482, "step": 13220 }, { "epoch": 1.96, "grad_norm": 1.871608018875122, "learning_rate": 8.064079690752306e-05, "loss": 0.1473, "step": 13230 }, { "epoch": 1.96, "grad_norm": 0.35242322087287903, "learning_rate": 8.06259292298543e-05, "loss": 0.1481, "step": 13240 }, { "epoch": 1.96, "grad_norm": 0.6800448894500732, "learning_rate": 8.061106155218555e-05, "loss": 0.1469, "step": 13250 }, { "epoch": 1.97, "grad_norm": 0.6389768719673157, "learning_rate": 8.05961938745168e-05, "loss": 0.1433, "step": 13260 }, { "epoch": 1.97, "grad_norm": 0.30566221475601196, "learning_rate": 8.058132619684806e-05, "loss": 0.1475, "step": 13270 }, { "epoch": 1.97, "grad_norm": 0.6695141196250916, "learning_rate": 8.05664585191793e-05, "loss": 0.1494, "step": 13280 }, { "epoch": 1.97, "grad_norm": 0.8827765583992004, "learning_rate": 8.055159084151056e-05, "loss": 0.1525, "step": 13290 }, { "epoch": 1.97, "grad_norm": 0.7884755730628967, "learning_rate": 8.053672316384181e-05, "loss": 0.1347, "step": 13300 }, { "epoch": 1.97, "grad_norm": 0.29021501541137695, "learning_rate": 8.052185548617306e-05, "loss": 0.1514, "step": 13310 }, { "epoch": 1.97, "grad_norm": 3.1000945568084717, "learning_rate": 8.050698780850432e-05, "loss": 0.1576, "step": 13320 }, { "epoch": 1.98, "grad_norm": 1.0783339738845825, "learning_rate": 8.049212013083556e-05, "loss": 0.1487, "step": 13330 }, { "epoch": 1.98, "grad_norm": 0.3350972533226013, "learning_rate": 8.047725245316682e-05, "loss": 0.1393, "step": 13340 }, { "epoch": 1.98, "grad_norm": 0.26007136702537537, "learning_rate": 8.046238477549807e-05, "loss": 0.138, "step": 13350 }, { "epoch": 1.98, "grad_norm": 0.37395599484443665, "learning_rate": 8.044751709782933e-05, "loss": 0.1444, "step": 13360 }, { "epoch": 1.98, "grad_norm": 0.48090776801109314, "learning_rate": 8.043264942016058e-05, "loss": 0.146, "step": 13370 }, { "epoch": 1.98, "grad_norm": 0.6154760718345642, "learning_rate": 8.041778174249182e-05, "loss": 0.1423, "step": 13380 }, { "epoch": 1.98, "grad_norm": 0.34295591711997986, "learning_rate": 8.040291406482308e-05, "loss": 0.1515, "step": 13390 }, { "epoch": 1.99, "grad_norm": 0.2805001735687256, "learning_rate": 8.038804638715432e-05, "loss": 0.1539, "step": 13400 }, { "epoch": 1.99, "grad_norm": 0.4166673719882965, "learning_rate": 8.037317870948559e-05, "loss": 0.1498, "step": 13410 }, { "epoch": 1.99, "grad_norm": 0.5549523830413818, "learning_rate": 8.035831103181683e-05, "loss": 0.1401, "step": 13420 }, { "epoch": 1.99, "grad_norm": 0.3639054298400879, "learning_rate": 8.034344335414809e-05, "loss": 0.1479, "step": 13430 }, { "epoch": 1.99, "grad_norm": 0.5058225393295288, "learning_rate": 8.032857567647934e-05, "loss": 0.1437, "step": 13440 }, { "epoch": 1.99, "grad_norm": 0.7204506397247314, "learning_rate": 8.03137079988106e-05, "loss": 0.147, "step": 13450 }, { "epoch": 2.0, "grad_norm": 0.34979936480522156, "learning_rate": 8.029884032114185e-05, "loss": 0.1409, "step": 13460 }, { "epoch": 2.0, "grad_norm": 0.71246337890625, "learning_rate": 8.028397264347309e-05, "loss": 0.1442, "step": 13470 }, { "epoch": 2.0, "grad_norm": 0.47565925121307373, "learning_rate": 8.026910496580435e-05, "loss": 0.145, "step": 13480 }, { "epoch": 2.0, "grad_norm": 0.7164893746376038, "learning_rate": 8.025423728813559e-05, "loss": 0.1368, "step": 13490 }, { "epoch": 2.0, "eval_loss": 0.15097101032733917, "eval_runtime": 2479.6392, "eval_samples_per_second": 235.388, "eval_steps_per_second": 3.678, "step": 13492 }, { "epoch": 2.0, "grad_norm": 0.3602011501789093, "learning_rate": 8.023936961046685e-05, "loss": 0.1409, "step": 13500 }, { "epoch": 2.0, "grad_norm": 1.1467143297195435, "learning_rate": 8.02245019327981e-05, "loss": 0.1445, "step": 13510 }, { "epoch": 2.0, "grad_norm": 0.553377091884613, "learning_rate": 8.020963425512935e-05, "loss": 0.1326, "step": 13520 }, { "epoch": 2.01, "grad_norm": 0.7122890949249268, "learning_rate": 8.01947665774606e-05, "loss": 0.1414, "step": 13530 }, { "epoch": 2.01, "grad_norm": 0.6272186040878296, "learning_rate": 8.017989889979186e-05, "loss": 0.1428, "step": 13540 }, { "epoch": 2.01, "grad_norm": 0.5186712741851807, "learning_rate": 8.016503122212312e-05, "loss": 0.1473, "step": 13550 }, { "epoch": 2.01, "grad_norm": 0.7185566425323486, "learning_rate": 8.015016354445436e-05, "loss": 0.1427, "step": 13560 }, { "epoch": 2.01, "grad_norm": 0.32846587896347046, "learning_rate": 8.013529586678561e-05, "loss": 0.143, "step": 13570 }, { "epoch": 2.01, "grad_norm": 1.1040384769439697, "learning_rate": 8.012042818911685e-05, "loss": 0.1442, "step": 13580 }, { "epoch": 2.01, "grad_norm": 0.3372102379798889, "learning_rate": 8.010556051144812e-05, "loss": 0.1373, "step": 13590 }, { "epoch": 2.02, "grad_norm": 0.5751693248748779, "learning_rate": 8.009069283377936e-05, "loss": 0.1372, "step": 13600 }, { "epoch": 2.02, "grad_norm": 0.8382410407066345, "learning_rate": 8.007582515611062e-05, "loss": 0.1393, "step": 13610 }, { "epoch": 2.02, "grad_norm": 0.2772016227245331, "learning_rate": 8.006095747844187e-05, "loss": 0.136, "step": 13620 }, { "epoch": 2.02, "grad_norm": 0.9653019309043884, "learning_rate": 8.004608980077313e-05, "loss": 0.1389, "step": 13630 }, { "epoch": 2.02, "grad_norm": 0.6972894668579102, "learning_rate": 8.003122212310438e-05, "loss": 0.1497, "step": 13640 }, { "epoch": 2.02, "grad_norm": 0.28461864590644836, "learning_rate": 8.001635444543562e-05, "loss": 0.1389, "step": 13650 }, { "epoch": 2.02, "grad_norm": 1.647381067276001, "learning_rate": 8.000148676776688e-05, "loss": 0.1411, "step": 13660 }, { "epoch": 2.03, "grad_norm": 0.6920177340507507, "learning_rate": 7.998661909009813e-05, "loss": 0.1464, "step": 13670 }, { "epoch": 2.03, "grad_norm": 0.29943400621414185, "learning_rate": 7.997175141242939e-05, "loss": 0.141, "step": 13680 }, { "epoch": 2.03, "grad_norm": 0.3627011179924011, "learning_rate": 7.995688373476064e-05, "loss": 0.1414, "step": 13690 }, { "epoch": 2.03, "grad_norm": 0.7024916410446167, "learning_rate": 7.994201605709188e-05, "loss": 0.1403, "step": 13700 }, { "epoch": 2.03, "grad_norm": 1.041019082069397, "learning_rate": 7.992714837942314e-05, "loss": 0.1461, "step": 13710 }, { "epoch": 2.03, "grad_norm": 0.9740252494812012, "learning_rate": 7.991228070175439e-05, "loss": 0.1423, "step": 13720 }, { "epoch": 2.04, "grad_norm": 0.3003685176372528, "learning_rate": 7.989741302408565e-05, "loss": 0.1615, "step": 13730 }, { "epoch": 2.04, "grad_norm": 0.32233926653862, "learning_rate": 7.988254534641689e-05, "loss": 0.1328, "step": 13740 }, { "epoch": 2.04, "grad_norm": 0.23946711421012878, "learning_rate": 7.986767766874814e-05, "loss": 0.1512, "step": 13750 }, { "epoch": 2.04, "grad_norm": 0.9506574273109436, "learning_rate": 7.98528099910794e-05, "loss": 0.1485, "step": 13760 }, { "epoch": 2.04, "grad_norm": 2.029719114303589, "learning_rate": 7.983794231341065e-05, "loss": 0.1527, "step": 13770 }, { "epoch": 2.04, "grad_norm": 0.22863923013210297, "learning_rate": 7.982307463574191e-05, "loss": 0.1448, "step": 13780 }, { "epoch": 2.04, "grad_norm": 0.300920307636261, "learning_rate": 7.980820695807315e-05, "loss": 0.1477, "step": 13790 }, { "epoch": 2.05, "grad_norm": 0.3894307315349579, "learning_rate": 7.97933392804044e-05, "loss": 0.1428, "step": 13800 }, { "epoch": 2.05, "grad_norm": 0.7516509294509888, "learning_rate": 7.977847160273566e-05, "loss": 0.1423, "step": 13810 }, { "epoch": 2.05, "grad_norm": 0.6397115588188171, "learning_rate": 7.976360392506691e-05, "loss": 0.1446, "step": 13820 }, { "epoch": 2.05, "grad_norm": 0.33174657821655273, "learning_rate": 7.975022301516504e-05, "loss": 0.1415, "step": 13830 }, { "epoch": 2.05, "grad_norm": 0.393704891204834, "learning_rate": 7.973535533749629e-05, "loss": 0.1471, "step": 13840 }, { "epoch": 2.05, "grad_norm": 1.0416827201843262, "learning_rate": 7.972048765982753e-05, "loss": 0.1369, "step": 13850 }, { "epoch": 2.05, "grad_norm": 1.268273949623108, "learning_rate": 7.970561998215879e-05, "loss": 0.1461, "step": 13860 }, { "epoch": 2.06, "grad_norm": 0.27585187554359436, "learning_rate": 7.969075230449004e-05, "loss": 0.1483, "step": 13870 }, { "epoch": 2.06, "grad_norm": 0.27429184317588806, "learning_rate": 7.96758846268213e-05, "loss": 0.1434, "step": 13880 }, { "epoch": 2.06, "grad_norm": 1.2318065166473389, "learning_rate": 7.966101694915254e-05, "loss": 0.1402, "step": 13890 }, { "epoch": 2.06, "grad_norm": 0.34538692235946655, "learning_rate": 7.96461492714838e-05, "loss": 0.1368, "step": 13900 }, { "epoch": 2.06, "grad_norm": 1.7623499631881714, "learning_rate": 7.963128159381505e-05, "loss": 0.1447, "step": 13910 }, { "epoch": 2.06, "grad_norm": 1.2780420780181885, "learning_rate": 7.96164139161463e-05, "loss": 0.1466, "step": 13920 }, { "epoch": 2.06, "grad_norm": 0.6700235605239868, "learning_rate": 7.960154623847756e-05, "loss": 0.1429, "step": 13930 }, { "epoch": 2.07, "grad_norm": 0.46738964319229126, "learning_rate": 7.95866785608088e-05, "loss": 0.1411, "step": 13940 }, { "epoch": 2.07, "grad_norm": 0.8376699090003967, "learning_rate": 7.957181088314005e-05, "loss": 0.1376, "step": 13950 }, { "epoch": 2.07, "grad_norm": 0.43358296155929565, "learning_rate": 7.955694320547131e-05, "loss": 0.1482, "step": 13960 }, { "epoch": 2.07, "grad_norm": 0.7221712470054626, "learning_rate": 7.954207552780256e-05, "loss": 0.139, "step": 13970 }, { "epoch": 2.07, "grad_norm": 0.6004229187965393, "learning_rate": 7.95272078501338e-05, "loss": 0.1446, "step": 13980 }, { "epoch": 2.07, "grad_norm": 0.7398319840431213, "learning_rate": 7.951234017246506e-05, "loss": 0.14, "step": 13990 }, { "epoch": 2.08, "grad_norm": 0.39415350556373596, "learning_rate": 7.949747249479631e-05, "loss": 0.1425, "step": 14000 }, { "epoch": 2.08, "grad_norm": 0.32933810353279114, "learning_rate": 7.948260481712757e-05, "loss": 0.141, "step": 14010 }, { "epoch": 2.08, "grad_norm": 1.5092155933380127, "learning_rate": 7.946773713945882e-05, "loss": 0.1498, "step": 14020 }, { "epoch": 2.08, "grad_norm": 1.36288583278656, "learning_rate": 7.945286946179007e-05, "loss": 0.1441, "step": 14030 }, { "epoch": 2.08, "grad_norm": 0.4052281081676483, "learning_rate": 7.943800178412132e-05, "loss": 0.1387, "step": 14040 }, { "epoch": 2.08, "grad_norm": 1.4487920999526978, "learning_rate": 7.942313410645257e-05, "loss": 0.1451, "step": 14050 }, { "epoch": 2.08, "grad_norm": 0.6891205906867981, "learning_rate": 7.940826642878383e-05, "loss": 0.1397, "step": 14060 }, { "epoch": 2.09, "grad_norm": 0.8474586606025696, "learning_rate": 7.939339875111508e-05, "loss": 0.1501, "step": 14070 }, { "epoch": 2.09, "grad_norm": 0.4147012531757355, "learning_rate": 7.937853107344633e-05, "loss": 0.131, "step": 14080 }, { "epoch": 2.09, "grad_norm": 1.2289682626724243, "learning_rate": 7.93636633957776e-05, "loss": 0.1447, "step": 14090 }, { "epoch": 2.09, "grad_norm": 0.4058608114719391, "learning_rate": 7.934879571810884e-05, "loss": 0.144, "step": 14100 }, { "epoch": 2.09, "grad_norm": 0.19358326494693756, "learning_rate": 7.933392804044009e-05, "loss": 0.1414, "step": 14110 }, { "epoch": 2.09, "grad_norm": 0.3566335439682007, "learning_rate": 7.931906036277133e-05, "loss": 0.137, "step": 14120 }, { "epoch": 2.09, "grad_norm": 0.7426595091819763, "learning_rate": 7.93041926851026e-05, "loss": 0.1302, "step": 14130 }, { "epoch": 2.1, "grad_norm": 1.8978757858276367, "learning_rate": 7.928932500743384e-05, "loss": 0.146, "step": 14140 }, { "epoch": 2.1, "grad_norm": 0.46663588285446167, "learning_rate": 7.92744573297651e-05, "loss": 0.144, "step": 14150 }, { "epoch": 2.1, "grad_norm": 0.2703957259654999, "learning_rate": 7.925958965209635e-05, "loss": 0.1376, "step": 14160 }, { "epoch": 2.1, "grad_norm": 0.3722328841686249, "learning_rate": 7.924472197442759e-05, "loss": 0.1486, "step": 14170 }, { "epoch": 2.1, "grad_norm": 1.1712533235549927, "learning_rate": 7.922985429675886e-05, "loss": 0.1369, "step": 14180 }, { "epoch": 2.1, "grad_norm": 0.2290862798690796, "learning_rate": 7.92149866190901e-05, "loss": 0.1448, "step": 14190 }, { "epoch": 2.1, "grad_norm": 0.2642858028411865, "learning_rate": 7.920011894142136e-05, "loss": 0.1406, "step": 14200 }, { "epoch": 2.11, "grad_norm": 0.7167410850524902, "learning_rate": 7.91852512637526e-05, "loss": 0.1376, "step": 14210 }, { "epoch": 2.11, "grad_norm": 1.1799192428588867, "learning_rate": 7.917038358608387e-05, "loss": 0.1393, "step": 14220 }, { "epoch": 2.11, "grad_norm": 0.2893933951854706, "learning_rate": 7.91555159084151e-05, "loss": 0.1457, "step": 14230 }, { "epoch": 2.11, "grad_norm": 0.45935529470443726, "learning_rate": 7.914064823074636e-05, "loss": 0.1436, "step": 14240 }, { "epoch": 2.11, "grad_norm": 0.2944597005844116, "learning_rate": 7.912578055307762e-05, "loss": 0.1412, "step": 14250 }, { "epoch": 2.11, "grad_norm": 0.5963975191116333, "learning_rate": 7.911091287540886e-05, "loss": 0.1474, "step": 14260 }, { "epoch": 2.12, "grad_norm": 0.6776736378669739, "learning_rate": 7.909604519774013e-05, "loss": 0.1374, "step": 14270 }, { "epoch": 2.12, "grad_norm": 0.3224552869796753, "learning_rate": 7.908117752007137e-05, "loss": 0.1385, "step": 14280 }, { "epoch": 2.12, "grad_norm": 0.4411897361278534, "learning_rate": 7.906630984240262e-05, "loss": 0.1445, "step": 14290 }, { "epoch": 2.12, "grad_norm": 0.4700928330421448, "learning_rate": 7.905144216473386e-05, "loss": 0.1388, "step": 14300 }, { "epoch": 2.12, "grad_norm": 0.5549743175506592, "learning_rate": 7.903657448706513e-05, "loss": 0.1418, "step": 14310 }, { "epoch": 2.12, "grad_norm": 0.7805799841880798, "learning_rate": 7.902170680939637e-05, "loss": 0.1429, "step": 14320 }, { "epoch": 2.12, "grad_norm": 0.6569302678108215, "learning_rate": 7.900683913172763e-05, "loss": 0.1413, "step": 14330 }, { "epoch": 2.13, "grad_norm": 0.9454148411750793, "learning_rate": 7.899197145405888e-05, "loss": 0.1422, "step": 14340 }, { "epoch": 2.13, "grad_norm": 0.41172319650650024, "learning_rate": 7.897710377639012e-05, "loss": 0.1411, "step": 14350 }, { "epoch": 2.13, "grad_norm": 0.7529705166816711, "learning_rate": 7.896223609872139e-05, "loss": 0.1388, "step": 14360 }, { "epoch": 2.13, "grad_norm": 0.5935275554656982, "learning_rate": 7.894736842105263e-05, "loss": 0.1448, "step": 14370 }, { "epoch": 2.13, "grad_norm": 1.1465091705322266, "learning_rate": 7.893250074338389e-05, "loss": 0.132, "step": 14380 }, { "epoch": 2.13, "grad_norm": 0.5180565714836121, "learning_rate": 7.891763306571513e-05, "loss": 0.1394, "step": 14390 }, { "epoch": 2.13, "grad_norm": 0.3858262896537781, "learning_rate": 7.89027653880464e-05, "loss": 0.1426, "step": 14400 }, { "epoch": 2.14, "grad_norm": 0.3078696131706238, "learning_rate": 7.888789771037765e-05, "loss": 0.1347, "step": 14410 }, { "epoch": 2.14, "grad_norm": 0.2894585430622101, "learning_rate": 7.887303003270889e-05, "loss": 0.1412, "step": 14420 }, { "epoch": 2.14, "grad_norm": 0.9229311943054199, "learning_rate": 7.885816235504015e-05, "loss": 0.145, "step": 14430 }, { "epoch": 2.14, "grad_norm": 0.8589563369750977, "learning_rate": 7.884329467737139e-05, "loss": 0.1423, "step": 14440 }, { "epoch": 2.14, "grad_norm": 0.33812132477760315, "learning_rate": 7.882842699970266e-05, "loss": 0.1312, "step": 14450 }, { "epoch": 2.14, "grad_norm": 0.4707375466823578, "learning_rate": 7.88135593220339e-05, "loss": 0.1333, "step": 14460 }, { "epoch": 2.14, "grad_norm": 0.35323792695999146, "learning_rate": 7.879869164436515e-05, "loss": 0.1503, "step": 14470 }, { "epoch": 2.15, "grad_norm": 0.7402976751327515, "learning_rate": 7.878382396669641e-05, "loss": 0.1429, "step": 14480 }, { "epoch": 2.15, "grad_norm": 0.49308282136917114, "learning_rate": 7.876895628902766e-05, "loss": 0.138, "step": 14490 }, { "epoch": 2.15, "grad_norm": 0.824302613735199, "learning_rate": 7.875408861135892e-05, "loss": 0.1456, "step": 14500 }, { "epoch": 2.15, "grad_norm": 1.5500679016113281, "learning_rate": 7.873922093369016e-05, "loss": 0.1466, "step": 14510 }, { "epoch": 2.15, "grad_norm": 0.34378743171691895, "learning_rate": 7.872435325602141e-05, "loss": 0.1382, "step": 14520 }, { "epoch": 2.15, "grad_norm": 0.48903822898864746, "learning_rate": 7.870948557835265e-05, "loss": 0.1398, "step": 14530 }, { "epoch": 2.16, "grad_norm": 0.954559862613678, "learning_rate": 7.869461790068392e-05, "loss": 0.1406, "step": 14540 }, { "epoch": 2.16, "grad_norm": 0.3512939214706421, "learning_rate": 7.867975022301516e-05, "loss": 0.1404, "step": 14550 }, { "epoch": 2.16, "grad_norm": 1.6540653705596924, "learning_rate": 7.866488254534642e-05, "loss": 0.1399, "step": 14560 }, { "epoch": 2.16, "grad_norm": 1.4381154775619507, "learning_rate": 7.865001486767767e-05, "loss": 0.1456, "step": 14570 }, { "epoch": 2.16, "grad_norm": 0.2611878514289856, "learning_rate": 7.863514719000893e-05, "loss": 0.1436, "step": 14580 }, { "epoch": 2.16, "grad_norm": 1.1198517084121704, "learning_rate": 7.862027951234018e-05, "loss": 0.1421, "step": 14590 }, { "epoch": 2.16, "grad_norm": 1.4881504774093628, "learning_rate": 7.860541183467142e-05, "loss": 0.1431, "step": 14600 }, { "epoch": 2.17, "grad_norm": 0.5614832043647766, "learning_rate": 7.859054415700268e-05, "loss": 0.1444, "step": 14610 }, { "epoch": 2.17, "grad_norm": 1.2753747701644897, "learning_rate": 7.857567647933393e-05, "loss": 0.1355, "step": 14620 }, { "epoch": 2.17, "grad_norm": 0.7569044232368469, "learning_rate": 7.856080880166519e-05, "loss": 0.1403, "step": 14630 }, { "epoch": 2.17, "grad_norm": 0.8014097213745117, "learning_rate": 7.854594112399643e-05, "loss": 0.1444, "step": 14640 }, { "epoch": 2.17, "grad_norm": 0.4055616855621338, "learning_rate": 7.853107344632768e-05, "loss": 0.1384, "step": 14650 }, { "epoch": 2.17, "grad_norm": 0.8355204463005066, "learning_rate": 7.851620576865894e-05, "loss": 0.1374, "step": 14660 }, { "epoch": 2.17, "grad_norm": 0.5635973215103149, "learning_rate": 7.85013380909902e-05, "loss": 0.1453, "step": 14670 }, { "epoch": 2.18, "grad_norm": 0.3572874069213867, "learning_rate": 7.848647041332145e-05, "loss": 0.1474, "step": 14680 }, { "epoch": 2.18, "grad_norm": 0.31111082434654236, "learning_rate": 7.847160273565269e-05, "loss": 0.1436, "step": 14690 }, { "epoch": 2.18, "grad_norm": 0.38512876629829407, "learning_rate": 7.845673505798394e-05, "loss": 0.138, "step": 14700 }, { "epoch": 2.18, "grad_norm": 0.26960206031799316, "learning_rate": 7.84418673803152e-05, "loss": 0.147, "step": 14710 }, { "epoch": 2.18, "grad_norm": 0.29528844356536865, "learning_rate": 7.842699970264645e-05, "loss": 0.1416, "step": 14720 }, { "epoch": 2.18, "grad_norm": 0.3126089572906494, "learning_rate": 7.841213202497771e-05, "loss": 0.1417, "step": 14730 }, { "epoch": 2.18, "grad_norm": 1.1311891078948975, "learning_rate": 7.839726434730895e-05, "loss": 0.1365, "step": 14740 }, { "epoch": 2.19, "grad_norm": 0.27010664343833923, "learning_rate": 7.83823966696402e-05, "loss": 0.1367, "step": 14750 }, { "epoch": 2.19, "grad_norm": 1.2416595220565796, "learning_rate": 7.836752899197146e-05, "loss": 0.137, "step": 14760 }, { "epoch": 2.19, "grad_norm": 0.2980870306491852, "learning_rate": 7.835266131430271e-05, "loss": 0.1486, "step": 14770 }, { "epoch": 2.19, "grad_norm": 0.44527989625930786, "learning_rate": 7.833779363663396e-05, "loss": 0.1392, "step": 14780 }, { "epoch": 2.19, "grad_norm": 0.36573994159698486, "learning_rate": 7.832292595896521e-05, "loss": 0.1469, "step": 14790 }, { "epoch": 2.19, "grad_norm": 0.3939895033836365, "learning_rate": 7.830805828129647e-05, "loss": 0.1449, "step": 14800 }, { "epoch": 2.2, "grad_norm": 0.5952978134155273, "learning_rate": 7.829319060362772e-05, "loss": 0.1399, "step": 14810 }, { "epoch": 2.2, "grad_norm": 0.4172046184539795, "learning_rate": 7.827832292595897e-05, "loss": 0.1417, "step": 14820 }, { "epoch": 2.2, "grad_norm": 1.6859674453735352, "learning_rate": 7.826345524829022e-05, "loss": 0.1503, "step": 14830 }, { "epoch": 2.2, "grad_norm": 0.4280993640422821, "learning_rate": 7.824858757062147e-05, "loss": 0.1466, "step": 14840 }, { "epoch": 2.2, "grad_norm": 0.28708526492118835, "learning_rate": 7.823371989295273e-05, "loss": 0.1401, "step": 14850 }, { "epoch": 2.2, "grad_norm": 1.4044026136398315, "learning_rate": 7.821885221528398e-05, "loss": 0.142, "step": 14860 }, { "epoch": 2.2, "grad_norm": 0.4096578061580658, "learning_rate": 7.820398453761522e-05, "loss": 0.1433, "step": 14870 }, { "epoch": 2.21, "grad_norm": 0.49273645877838135, "learning_rate": 7.818911685994648e-05, "loss": 0.1411, "step": 14880 }, { "epoch": 2.21, "grad_norm": 1.1558241844177246, "learning_rate": 7.817424918227773e-05, "loss": 0.1325, "step": 14890 }, { "epoch": 2.21, "grad_norm": 0.2521809935569763, "learning_rate": 7.815938150460899e-05, "loss": 0.1436, "step": 14900 }, { "epoch": 2.21, "grad_norm": 1.3384026288986206, "learning_rate": 7.814451382694024e-05, "loss": 0.1353, "step": 14910 }, { "epoch": 2.21, "grad_norm": 0.9027255177497864, "learning_rate": 7.812964614927148e-05, "loss": 0.1396, "step": 14920 }, { "epoch": 2.21, "grad_norm": 1.5526256561279297, "learning_rate": 7.811477847160274e-05, "loss": 0.1385, "step": 14930 }, { "epoch": 2.21, "grad_norm": 1.069063425064087, "learning_rate": 7.809991079393399e-05, "loss": 0.1359, "step": 14940 }, { "epoch": 2.22, "grad_norm": 0.2974967956542969, "learning_rate": 7.808504311626525e-05, "loss": 0.1342, "step": 14950 }, { "epoch": 2.22, "grad_norm": 0.25086086988449097, "learning_rate": 7.807017543859649e-05, "loss": 0.1332, "step": 14960 }, { "epoch": 2.22, "grad_norm": 0.6883974671363831, "learning_rate": 7.805530776092774e-05, "loss": 0.1399, "step": 14970 }, { "epoch": 2.22, "grad_norm": 0.8510827422142029, "learning_rate": 7.8040440083259e-05, "loss": 0.1441, "step": 14980 }, { "epoch": 2.22, "grad_norm": 1.4618726968765259, "learning_rate": 7.802557240559025e-05, "loss": 0.147, "step": 14990 }, { "epoch": 2.22, "grad_norm": 0.8736234307289124, "learning_rate": 7.80107047279215e-05, "loss": 0.1422, "step": 15000 }, { "epoch": 2.22, "grad_norm": 0.5815086364746094, "learning_rate": 7.799583705025275e-05, "loss": 0.1413, "step": 15010 }, { "epoch": 2.23, "grad_norm": 0.29546529054641724, "learning_rate": 7.7980969372584e-05, "loss": 0.1404, "step": 15020 }, { "epoch": 2.23, "grad_norm": 0.3832775056362152, "learning_rate": 7.796610169491526e-05, "loss": 0.1327, "step": 15030 }, { "epoch": 2.23, "grad_norm": 0.2785964608192444, "learning_rate": 7.795123401724651e-05, "loss": 0.1329, "step": 15040 }, { "epoch": 2.23, "grad_norm": 1.5065478086471558, "learning_rate": 7.793636633957775e-05, "loss": 0.1527, "step": 15050 }, { "epoch": 2.23, "grad_norm": 1.5339558124542236, "learning_rate": 7.792149866190901e-05, "loss": 0.1431, "step": 15060 }, { "epoch": 2.23, "grad_norm": 1.5240484476089478, "learning_rate": 7.790663098424028e-05, "loss": 0.1465, "step": 15070 }, { "epoch": 2.24, "grad_norm": 0.3933528661727905, "learning_rate": 7.789176330657152e-05, "loss": 0.1408, "step": 15080 }, { "epoch": 2.24, "grad_norm": 0.600188672542572, "learning_rate": 7.787689562890277e-05, "loss": 0.1413, "step": 15090 }, { "epoch": 2.24, "grad_norm": 0.3889021575450897, "learning_rate": 7.786202795123401e-05, "loss": 0.1377, "step": 15100 }, { "epoch": 2.24, "grad_norm": 0.4499037265777588, "learning_rate": 7.784716027356528e-05, "loss": 0.1401, "step": 15110 }, { "epoch": 2.24, "grad_norm": 0.772075355052948, "learning_rate": 7.783229259589652e-05, "loss": 0.1359, "step": 15120 }, { "epoch": 2.24, "grad_norm": 0.5900905132293701, "learning_rate": 7.781742491822778e-05, "loss": 0.1374, "step": 15130 }, { "epoch": 2.24, "grad_norm": 1.9632419347763062, "learning_rate": 7.780255724055903e-05, "loss": 0.1485, "step": 15140 }, { "epoch": 2.25, "grad_norm": 1.079269289970398, "learning_rate": 7.778768956289027e-05, "loss": 0.1428, "step": 15150 }, { "epoch": 2.25, "grad_norm": 0.5753262042999268, "learning_rate": 7.777282188522154e-05, "loss": 0.1429, "step": 15160 }, { "epoch": 2.25, "grad_norm": 0.28684714436531067, "learning_rate": 7.775795420755278e-05, "loss": 0.1363, "step": 15170 }, { "epoch": 2.25, "grad_norm": 0.2784688174724579, "learning_rate": 7.774308652988404e-05, "loss": 0.1447, "step": 15180 }, { "epoch": 2.25, "grad_norm": 0.4321437478065491, "learning_rate": 7.772821885221528e-05, "loss": 0.1437, "step": 15190 }, { "epoch": 2.25, "grad_norm": 0.4593127369880676, "learning_rate": 7.771335117454655e-05, "loss": 0.1394, "step": 15200 }, { "epoch": 2.25, "grad_norm": 0.6981044411659241, "learning_rate": 7.769848349687779e-05, "loss": 0.1469, "step": 15210 }, { "epoch": 2.26, "grad_norm": 0.9199114441871643, "learning_rate": 7.768361581920904e-05, "loss": 0.1396, "step": 15220 }, { "epoch": 2.26, "grad_norm": 0.5728020668029785, "learning_rate": 7.76687481415403e-05, "loss": 0.1433, "step": 15230 }, { "epoch": 2.26, "grad_norm": 0.3551998734474182, "learning_rate": 7.765388046387154e-05, "loss": 0.1418, "step": 15240 }, { "epoch": 2.26, "grad_norm": 0.25206756591796875, "learning_rate": 7.763901278620281e-05, "loss": 0.1405, "step": 15250 }, { "epoch": 2.26, "grad_norm": 0.40802913904190063, "learning_rate": 7.762414510853405e-05, "loss": 0.1406, "step": 15260 }, { "epoch": 2.26, "grad_norm": 0.4538644552230835, "learning_rate": 7.76092774308653e-05, "loss": 0.1414, "step": 15270 }, { "epoch": 2.26, "grad_norm": 0.5877019166946411, "learning_rate": 7.759440975319655e-05, "loss": 0.1398, "step": 15280 }, { "epoch": 2.27, "grad_norm": 1.7143536806106567, "learning_rate": 7.757954207552781e-05, "loss": 0.1444, "step": 15290 }, { "epoch": 2.27, "grad_norm": 0.8385623693466187, "learning_rate": 7.756467439785905e-05, "loss": 0.1453, "step": 15300 }, { "epoch": 2.27, "grad_norm": 0.5438138246536255, "learning_rate": 7.754980672019031e-05, "loss": 0.1476, "step": 15310 }, { "epoch": 2.27, "grad_norm": 1.067368507385254, "learning_rate": 7.753493904252156e-05, "loss": 0.1493, "step": 15320 }, { "epoch": 2.27, "grad_norm": 0.43237921595573425, "learning_rate": 7.75200713648528e-05, "loss": 0.1418, "step": 15330 }, { "epoch": 2.27, "grad_norm": 0.7449706792831421, "learning_rate": 7.750520368718407e-05, "loss": 0.1372, "step": 15340 }, { "epoch": 2.28, "grad_norm": 1.315883994102478, "learning_rate": 7.749033600951531e-05, "loss": 0.1482, "step": 15350 }, { "epoch": 2.28, "grad_norm": 0.20625711977481842, "learning_rate": 7.747546833184657e-05, "loss": 0.1374, "step": 15360 }, { "epoch": 2.28, "grad_norm": 0.29170262813568115, "learning_rate": 7.746060065417781e-05, "loss": 0.1434, "step": 15370 }, { "epoch": 2.28, "grad_norm": 0.6575577855110168, "learning_rate": 7.744573297650908e-05, "loss": 0.1411, "step": 15380 }, { "epoch": 2.28, "grad_norm": 0.36040765047073364, "learning_rate": 7.743086529884032e-05, "loss": 0.1387, "step": 15390 }, { "epoch": 2.28, "grad_norm": 0.6309950947761536, "learning_rate": 7.741599762117158e-05, "loss": 0.1478, "step": 15400 }, { "epoch": 2.28, "grad_norm": 0.2758764624595642, "learning_rate": 7.740112994350283e-05, "loss": 0.1437, "step": 15410 }, { "epoch": 2.29, "grad_norm": 0.648673951625824, "learning_rate": 7.738626226583407e-05, "loss": 0.1365, "step": 15420 }, { "epoch": 2.29, "grad_norm": 1.0597361326217651, "learning_rate": 7.737139458816534e-05, "loss": 0.1408, "step": 15430 }, { "epoch": 2.29, "grad_norm": 0.3122524321079254, "learning_rate": 7.735652691049658e-05, "loss": 0.1481, "step": 15440 }, { "epoch": 2.29, "grad_norm": 0.9538226127624512, "learning_rate": 7.734165923282784e-05, "loss": 0.149, "step": 15450 }, { "epoch": 2.29, "grad_norm": 0.28215184807777405, "learning_rate": 7.732679155515909e-05, "loss": 0.1316, "step": 15460 }, { "epoch": 2.29, "grad_norm": 0.4941287338733673, "learning_rate": 7.731192387749034e-05, "loss": 0.1409, "step": 15470 }, { "epoch": 2.29, "grad_norm": 0.3369395136833191, "learning_rate": 7.72970561998216e-05, "loss": 0.145, "step": 15480 }, { "epoch": 2.3, "grad_norm": 0.8342441320419312, "learning_rate": 7.728218852215284e-05, "loss": 0.1377, "step": 15490 }, { "epoch": 2.3, "grad_norm": 2.002681255340576, "learning_rate": 7.72673208444841e-05, "loss": 0.1436, "step": 15500 }, { "epoch": 2.3, "grad_norm": 1.1296112537384033, "learning_rate": 7.725245316681534e-05, "loss": 0.1372, "step": 15510 }, { "epoch": 2.3, "grad_norm": 0.4747876822948456, "learning_rate": 7.72375854891466e-05, "loss": 0.1391, "step": 15520 }, { "epoch": 2.3, "grad_norm": 0.37686342000961304, "learning_rate": 7.722271781147785e-05, "loss": 0.1417, "step": 15530 }, { "epoch": 2.3, "grad_norm": 0.6419855952262878, "learning_rate": 7.72078501338091e-05, "loss": 0.1507, "step": 15540 }, { "epoch": 2.3, "grad_norm": 0.5197919011116028, "learning_rate": 7.719298245614036e-05, "loss": 0.1406, "step": 15550 }, { "epoch": 2.31, "grad_norm": 0.6364248991012573, "learning_rate": 7.717811477847161e-05, "loss": 0.139, "step": 15560 }, { "epoch": 2.31, "grad_norm": 0.2479657530784607, "learning_rate": 7.716324710080287e-05, "loss": 0.1347, "step": 15570 }, { "epoch": 2.31, "grad_norm": 0.46477001905441284, "learning_rate": 7.71483794231341e-05, "loss": 0.1431, "step": 15580 }, { "epoch": 2.31, "grad_norm": 0.32222843170166016, "learning_rate": 7.713351174546536e-05, "loss": 0.1487, "step": 15590 }, { "epoch": 2.31, "grad_norm": 0.847084641456604, "learning_rate": 7.711864406779662e-05, "loss": 0.1503, "step": 15600 }, { "epoch": 2.31, "grad_norm": 0.4260410964488983, "learning_rate": 7.710377639012787e-05, "loss": 0.1367, "step": 15610 }, { "epoch": 2.32, "grad_norm": 0.7543812394142151, "learning_rate": 7.708890871245911e-05, "loss": 0.1353, "step": 15620 }, { "epoch": 2.32, "grad_norm": 1.0072932243347168, "learning_rate": 7.707404103479037e-05, "loss": 0.1471, "step": 15630 }, { "epoch": 2.32, "grad_norm": 0.30913281440734863, "learning_rate": 7.705917335712162e-05, "loss": 0.145, "step": 15640 }, { "epoch": 2.32, "grad_norm": 0.6405145525932312, "learning_rate": 7.704430567945288e-05, "loss": 0.1342, "step": 15650 }, { "epoch": 2.32, "grad_norm": 1.67426598072052, "learning_rate": 7.702943800178413e-05, "loss": 0.1342, "step": 15660 }, { "epoch": 2.32, "grad_norm": 0.5174002647399902, "learning_rate": 7.701457032411537e-05, "loss": 0.1419, "step": 15670 }, { "epoch": 2.32, "grad_norm": 0.7070042490959167, "learning_rate": 7.699970264644663e-05, "loss": 0.1388, "step": 15680 }, { "epoch": 2.33, "grad_norm": 0.992650032043457, "learning_rate": 7.698483496877788e-05, "loss": 0.1361, "step": 15690 }, { "epoch": 2.33, "grad_norm": 0.519932210445404, "learning_rate": 7.696996729110914e-05, "loss": 0.1409, "step": 15700 }, { "epoch": 2.33, "grad_norm": 0.5189968943595886, "learning_rate": 7.695509961344038e-05, "loss": 0.14, "step": 15710 }, { "epoch": 2.33, "grad_norm": 1.0756185054779053, "learning_rate": 7.694023193577163e-05, "loss": 0.1407, "step": 15720 }, { "epoch": 2.33, "grad_norm": 0.6444623470306396, "learning_rate": 7.692536425810289e-05, "loss": 0.1466, "step": 15730 }, { "epoch": 2.33, "grad_norm": 0.3298320174217224, "learning_rate": 7.691049658043414e-05, "loss": 0.136, "step": 15740 }, { "epoch": 2.33, "grad_norm": 0.20186583697795868, "learning_rate": 7.68956289027654e-05, "loss": 0.1393, "step": 15750 }, { "epoch": 2.34, "grad_norm": 0.335825115442276, "learning_rate": 7.688076122509664e-05, "loss": 0.1377, "step": 15760 }, { "epoch": 2.34, "grad_norm": 1.36127769947052, "learning_rate": 7.686589354742789e-05, "loss": 0.1457, "step": 15770 }, { "epoch": 2.34, "grad_norm": 1.2447770833969116, "learning_rate": 7.685102586975915e-05, "loss": 0.144, "step": 15780 }, { "epoch": 2.34, "grad_norm": 0.7446007132530212, "learning_rate": 7.68361581920904e-05, "loss": 0.1407, "step": 15790 }, { "epoch": 2.34, "grad_norm": 1.47913658618927, "learning_rate": 7.682129051442166e-05, "loss": 0.1473, "step": 15800 }, { "epoch": 2.34, "grad_norm": 0.7737151980400085, "learning_rate": 7.68064228367529e-05, "loss": 0.138, "step": 15810 }, { "epoch": 2.35, "grad_norm": 0.3995475172996521, "learning_rate": 7.679155515908415e-05, "loss": 0.1473, "step": 15820 }, { "epoch": 2.35, "grad_norm": 0.6491892337799072, "learning_rate": 7.677668748141541e-05, "loss": 0.1408, "step": 15830 }, { "epoch": 2.35, "grad_norm": 0.3302996754646301, "learning_rate": 7.676181980374666e-05, "loss": 0.1456, "step": 15840 }, { "epoch": 2.35, "grad_norm": 1.0726025104522705, "learning_rate": 7.67469521260779e-05, "loss": 0.1448, "step": 15850 }, { "epoch": 2.35, "grad_norm": 0.29980069398880005, "learning_rate": 7.673208444840916e-05, "loss": 0.1394, "step": 15860 }, { "epoch": 2.35, "grad_norm": 0.3691909611225128, "learning_rate": 7.671721677074041e-05, "loss": 0.139, "step": 15870 }, { "epoch": 2.35, "grad_norm": 0.26005685329437256, "learning_rate": 7.670234909307167e-05, "loss": 0.1369, "step": 15880 }, { "epoch": 2.36, "grad_norm": 0.5521954894065857, "learning_rate": 7.668748141540292e-05, "loss": 0.1394, "step": 15890 }, { "epoch": 2.36, "grad_norm": 0.44873157143592834, "learning_rate": 7.667261373773416e-05, "loss": 0.1408, "step": 15900 }, { "epoch": 2.36, "grad_norm": 0.8657715916633606, "learning_rate": 7.665774606006542e-05, "loss": 0.1368, "step": 15910 }, { "epoch": 2.36, "grad_norm": 0.876879870891571, "learning_rate": 7.664287838239667e-05, "loss": 0.1395, "step": 15920 }, { "epoch": 2.36, "grad_norm": 0.5348753929138184, "learning_rate": 7.662801070472793e-05, "loss": 0.147, "step": 15930 }, { "epoch": 2.36, "grad_norm": 1.3844401836395264, "learning_rate": 7.661314302705917e-05, "loss": 0.1448, "step": 15940 }, { "epoch": 2.36, "grad_norm": 0.532243013381958, "learning_rate": 7.659827534939042e-05, "loss": 0.1453, "step": 15950 }, { "epoch": 2.37, "grad_norm": 0.49074360728263855, "learning_rate": 7.658340767172168e-05, "loss": 0.1393, "step": 15960 }, { "epoch": 2.37, "grad_norm": 0.7487906217575073, "learning_rate": 7.656853999405293e-05, "loss": 0.1369, "step": 15970 }, { "epoch": 2.37, "grad_norm": 0.24927030503749847, "learning_rate": 7.655367231638419e-05, "loss": 0.146, "step": 15980 }, { "epoch": 2.37, "grad_norm": 0.3706786036491394, "learning_rate": 7.653880463871543e-05, "loss": 0.1432, "step": 15990 }, { "epoch": 2.37, "grad_norm": 0.342334508895874, "learning_rate": 7.652393696104668e-05, "loss": 0.1466, "step": 16000 }, { "epoch": 2.37, "grad_norm": 0.24996308982372284, "learning_rate": 7.650906928337794e-05, "loss": 0.1383, "step": 16010 }, { "epoch": 2.37, "grad_norm": 0.39586570858955383, "learning_rate": 7.64942016057092e-05, "loss": 0.1392, "step": 16020 }, { "epoch": 2.38, "grad_norm": 0.6906977891921997, "learning_rate": 7.647933392804044e-05, "loss": 0.1363, "step": 16030 }, { "epoch": 2.38, "grad_norm": 1.0736937522888184, "learning_rate": 7.646446625037169e-05, "loss": 0.1384, "step": 16040 }, { "epoch": 2.38, "grad_norm": 0.8316856622695923, "learning_rate": 7.644959857270295e-05, "loss": 0.1358, "step": 16050 }, { "epoch": 2.38, "grad_norm": 0.44582003355026245, "learning_rate": 7.64347308950342e-05, "loss": 0.1439, "step": 16060 }, { "epoch": 2.38, "grad_norm": 0.3568483591079712, "learning_rate": 7.641986321736545e-05, "loss": 0.1418, "step": 16070 }, { "epoch": 2.38, "grad_norm": 0.5303836464881897, "learning_rate": 7.64049955396967e-05, "loss": 0.1381, "step": 16080 }, { "epoch": 2.39, "grad_norm": 0.48559141159057617, "learning_rate": 7.639012786202796e-05, "loss": 0.1312, "step": 16090 }, { "epoch": 2.39, "grad_norm": 0.39514291286468506, "learning_rate": 7.63752601843592e-05, "loss": 0.1452, "step": 16100 }, { "epoch": 2.39, "grad_norm": 0.7182896137237549, "learning_rate": 7.636039250669046e-05, "loss": 0.133, "step": 16110 }, { "epoch": 2.39, "grad_norm": 1.0193687677383423, "learning_rate": 7.634552482902171e-05, "loss": 0.1429, "step": 16120 }, { "epoch": 2.39, "grad_norm": 0.29207757115364075, "learning_rate": 7.633065715135296e-05, "loss": 0.1457, "step": 16130 }, { "epoch": 2.39, "grad_norm": 0.8084988594055176, "learning_rate": 7.631578947368422e-05, "loss": 0.1446, "step": 16140 }, { "epoch": 2.39, "grad_norm": 0.3876637816429138, "learning_rate": 7.630092179601547e-05, "loss": 0.1339, "step": 16150 }, { "epoch": 2.4, "grad_norm": 1.8244117498397827, "learning_rate": 7.628605411834672e-05, "loss": 0.1456, "step": 16160 }, { "epoch": 2.4, "grad_norm": 0.2684149444103241, "learning_rate": 7.627118644067796e-05, "loss": 0.1347, "step": 16170 }, { "epoch": 2.4, "grad_norm": 0.710655927658081, "learning_rate": 7.625631876300923e-05, "loss": 0.1438, "step": 16180 }, { "epoch": 2.4, "grad_norm": 1.0059056282043457, "learning_rate": 7.624145108534047e-05, "loss": 0.1427, "step": 16190 }, { "epoch": 2.4, "grad_norm": 0.8452532887458801, "learning_rate": 7.622658340767173e-05, "loss": 0.1484, "step": 16200 }, { "epoch": 2.4, "grad_norm": 0.28337419033050537, "learning_rate": 7.621171573000298e-05, "loss": 0.1421, "step": 16210 }, { "epoch": 2.4, "grad_norm": 0.3758450448513031, "learning_rate": 7.619684805233422e-05, "loss": 0.1385, "step": 16220 }, { "epoch": 2.41, "grad_norm": 0.6121477484703064, "learning_rate": 7.618198037466549e-05, "loss": 0.1454, "step": 16230 }, { "epoch": 2.41, "grad_norm": 0.5063532590866089, "learning_rate": 7.616711269699673e-05, "loss": 0.1422, "step": 16240 }, { "epoch": 2.41, "grad_norm": 1.5977672338485718, "learning_rate": 7.615224501932799e-05, "loss": 0.1472, "step": 16250 }, { "epoch": 2.41, "grad_norm": 0.38274484872817993, "learning_rate": 7.613737734165923e-05, "loss": 0.1479, "step": 16260 }, { "epoch": 2.41, "grad_norm": 0.6593810319900513, "learning_rate": 7.61225096639905e-05, "loss": 0.1436, "step": 16270 }, { "epoch": 2.41, "grad_norm": 0.3354584276676178, "learning_rate": 7.610764198632174e-05, "loss": 0.1341, "step": 16280 }, { "epoch": 2.41, "grad_norm": 0.642591655254364, "learning_rate": 7.609277430865299e-05, "loss": 0.1391, "step": 16290 }, { "epoch": 2.42, "grad_norm": 0.25391778349876404, "learning_rate": 7.607790663098425e-05, "loss": 0.1409, "step": 16300 }, { "epoch": 2.42, "grad_norm": 0.478242427110672, "learning_rate": 7.606303895331549e-05, "loss": 0.1324, "step": 16310 }, { "epoch": 2.42, "grad_norm": 1.1422983407974243, "learning_rate": 7.604817127564676e-05, "loss": 0.1378, "step": 16320 }, { "epoch": 2.42, "grad_norm": 0.5172997713088989, "learning_rate": 7.6033303597978e-05, "loss": 0.142, "step": 16330 }, { "epoch": 2.42, "grad_norm": 0.5715184807777405, "learning_rate": 7.601843592030925e-05, "loss": 0.1425, "step": 16340 }, { "epoch": 2.42, "grad_norm": 0.6750907897949219, "learning_rate": 7.60035682426405e-05, "loss": 0.1496, "step": 16350 }, { "epoch": 2.43, "grad_norm": 0.7555719017982483, "learning_rate": 7.598870056497176e-05, "loss": 0.1384, "step": 16360 }, { "epoch": 2.43, "grad_norm": 0.9847421050071716, "learning_rate": 7.5973832887303e-05, "loss": 0.1445, "step": 16370 }, { "epoch": 2.43, "grad_norm": 2.2487268447875977, "learning_rate": 7.595896520963426e-05, "loss": 0.1437, "step": 16380 }, { "epoch": 2.43, "grad_norm": 0.2849732041358948, "learning_rate": 7.594409753196551e-05, "loss": 0.1373, "step": 16390 }, { "epoch": 2.43, "grad_norm": 0.9714717268943787, "learning_rate": 7.592922985429675e-05, "loss": 0.1395, "step": 16400 }, { "epoch": 2.43, "grad_norm": 0.6926866173744202, "learning_rate": 7.591436217662802e-05, "loss": 0.1417, "step": 16410 }, { "epoch": 2.43, "grad_norm": 0.687318742275238, "learning_rate": 7.589949449895926e-05, "loss": 0.1339, "step": 16420 }, { "epoch": 2.44, "grad_norm": 0.5382028222084045, "learning_rate": 7.588462682129052e-05, "loss": 0.1466, "step": 16430 }, { "epoch": 2.44, "grad_norm": 0.7722700238227844, "learning_rate": 7.586975914362177e-05, "loss": 0.1394, "step": 16440 }, { "epoch": 2.44, "grad_norm": 0.8510586619377136, "learning_rate": 7.585489146595303e-05, "loss": 0.1389, "step": 16450 }, { "epoch": 2.44, "grad_norm": 0.35136356949806213, "learning_rate": 7.584002378828428e-05, "loss": 0.1414, "step": 16460 }, { "epoch": 2.44, "grad_norm": 0.2880131006240845, "learning_rate": 7.582515611061552e-05, "loss": 0.1411, "step": 16470 }, { "epoch": 2.44, "grad_norm": 0.2339642494916916, "learning_rate": 7.581028843294678e-05, "loss": 0.14, "step": 16480 }, { "epoch": 2.44, "grad_norm": 0.9338073134422302, "learning_rate": 7.579542075527802e-05, "loss": 0.1414, "step": 16490 }, { "epoch": 2.45, "grad_norm": 0.46103188395500183, "learning_rate": 7.578055307760929e-05, "loss": 0.1396, "step": 16500 }, { "epoch": 2.45, "grad_norm": 0.4752241373062134, "learning_rate": 7.576568539994053e-05, "loss": 0.1455, "step": 16510 }, { "epoch": 2.45, "grad_norm": 0.4363197684288025, "learning_rate": 7.575081772227178e-05, "loss": 0.1391, "step": 16520 }, { "epoch": 2.45, "grad_norm": 0.7979761362075806, "learning_rate": 7.573595004460304e-05, "loss": 0.1379, "step": 16530 }, { "epoch": 2.45, "grad_norm": 0.3751250207424164, "learning_rate": 7.572108236693429e-05, "loss": 0.1444, "step": 16540 }, { "epoch": 2.45, "grad_norm": 0.4037092328071594, "learning_rate": 7.570621468926555e-05, "loss": 0.1438, "step": 16550 }, { "epoch": 2.45, "grad_norm": 0.4799390137195587, "learning_rate": 7.569134701159679e-05, "loss": 0.1403, "step": 16560 }, { "epoch": 2.46, "grad_norm": 0.8113133311271667, "learning_rate": 7.567647933392804e-05, "loss": 0.1372, "step": 16570 }, { "epoch": 2.46, "grad_norm": 1.4397985935211182, "learning_rate": 7.56616116562593e-05, "loss": 0.1379, "step": 16580 }, { "epoch": 2.46, "grad_norm": 0.3244461417198181, "learning_rate": 7.564674397859055e-05, "loss": 0.14, "step": 16590 }, { "epoch": 2.46, "grad_norm": 0.3702487647533417, "learning_rate": 7.56318763009218e-05, "loss": 0.1388, "step": 16600 }, { "epoch": 2.46, "grad_norm": 0.5789926648139954, "learning_rate": 7.561700862325305e-05, "loss": 0.1455, "step": 16610 }, { "epoch": 2.46, "grad_norm": 0.3471226096153259, "learning_rate": 7.56021409455843e-05, "loss": 0.1339, "step": 16620 }, { "epoch": 2.47, "grad_norm": 0.26994770765304565, "learning_rate": 7.558727326791556e-05, "loss": 0.1378, "step": 16630 }, { "epoch": 2.47, "grad_norm": 0.5144693851470947, "learning_rate": 7.557240559024681e-05, "loss": 0.1468, "step": 16640 }, { "epoch": 2.47, "grad_norm": 0.29384931921958923, "learning_rate": 7.555753791257805e-05, "loss": 0.1399, "step": 16650 }, { "epoch": 2.47, "grad_norm": 0.8755843639373779, "learning_rate": 7.554267023490931e-05, "loss": 0.1365, "step": 16660 }, { "epoch": 2.47, "grad_norm": 0.43195444345474243, "learning_rate": 7.552780255724056e-05, "loss": 0.1496, "step": 16670 }, { "epoch": 2.47, "grad_norm": 0.4010591506958008, "learning_rate": 7.551293487957182e-05, "loss": 0.1387, "step": 16680 }, { "epoch": 2.47, "grad_norm": 0.33434391021728516, "learning_rate": 7.549806720190306e-05, "loss": 0.1422, "step": 16690 }, { "epoch": 2.48, "grad_norm": 1.1877391338348389, "learning_rate": 7.548319952423432e-05, "loss": 0.1342, "step": 16700 }, { "epoch": 2.48, "grad_norm": 0.7518399357795715, "learning_rate": 7.546833184656557e-05, "loss": 0.1479, "step": 16710 }, { "epoch": 2.48, "grad_norm": 0.5379367470741272, "learning_rate": 7.545346416889682e-05, "loss": 0.1384, "step": 16720 }, { "epoch": 2.48, "grad_norm": 0.8337181210517883, "learning_rate": 7.543859649122808e-05, "loss": 0.1445, "step": 16730 }, { "epoch": 2.48, "grad_norm": 0.9827070832252502, "learning_rate": 7.542372881355932e-05, "loss": 0.1377, "step": 16740 }, { "epoch": 2.48, "grad_norm": 0.5948978066444397, "learning_rate": 7.540886113589058e-05, "loss": 0.1458, "step": 16750 }, { "epoch": 2.48, "grad_norm": 1.3337939977645874, "learning_rate": 7.539399345822183e-05, "loss": 0.1422, "step": 16760 }, { "epoch": 2.49, "grad_norm": 0.48446279764175415, "learning_rate": 7.537912578055309e-05, "loss": 0.1393, "step": 16770 }, { "epoch": 2.49, "grad_norm": 0.37337374687194824, "learning_rate": 7.536425810288434e-05, "loss": 0.1383, "step": 16780 }, { "epoch": 2.49, "grad_norm": 0.37720489501953125, "learning_rate": 7.534939042521558e-05, "loss": 0.1323, "step": 16790 }, { "epoch": 2.49, "grad_norm": 0.3711811304092407, "learning_rate": 7.533452274754684e-05, "loss": 0.1362, "step": 16800 }, { "epoch": 2.49, "grad_norm": 0.36137768626213074, "learning_rate": 7.531965506987809e-05, "loss": 0.1311, "step": 16810 }, { "epoch": 2.49, "grad_norm": 0.5882296562194824, "learning_rate": 7.530478739220935e-05, "loss": 0.1407, "step": 16820 }, { "epoch": 2.49, "grad_norm": 0.5880404114723206, "learning_rate": 7.528991971454059e-05, "loss": 0.1402, "step": 16830 }, { "epoch": 2.5, "grad_norm": 0.5515339374542236, "learning_rate": 7.527505203687184e-05, "loss": 0.1407, "step": 16840 }, { "epoch": 2.5, "grad_norm": 0.28839829564094543, "learning_rate": 7.52601843592031e-05, "loss": 0.1333, "step": 16850 }, { "epoch": 2.5, "grad_norm": 0.4314468502998352, "learning_rate": 7.524531668153435e-05, "loss": 0.1424, "step": 16860 }, { "epoch": 2.5, "grad_norm": 0.4594912827014923, "learning_rate": 7.52304490038656e-05, "loss": 0.1435, "step": 16870 }, { "epoch": 2.5, "grad_norm": 0.3527372479438782, "learning_rate": 7.521558132619685e-05, "loss": 0.139, "step": 16880 }, { "epoch": 2.5, "grad_norm": 0.6145148277282715, "learning_rate": 7.52007136485281e-05, "loss": 0.1386, "step": 16890 }, { "epoch": 2.51, "grad_norm": 0.47387057542800903, "learning_rate": 7.518584597085936e-05, "loss": 0.1448, "step": 16900 }, { "epoch": 2.51, "grad_norm": 0.3137947916984558, "learning_rate": 7.517097829319061e-05, "loss": 0.1415, "step": 16910 }, { "epoch": 2.51, "grad_norm": 0.25534674525260925, "learning_rate": 7.515611061552185e-05, "loss": 0.1413, "step": 16920 }, { "epoch": 2.51, "grad_norm": 0.8711992502212524, "learning_rate": 7.514124293785311e-05, "loss": 0.141, "step": 16930 }, { "epoch": 2.51, "grad_norm": 0.5922618508338928, "learning_rate": 7.512637526018436e-05, "loss": 0.1372, "step": 16940 }, { "epoch": 2.51, "grad_norm": 0.2968009114265442, "learning_rate": 7.511150758251562e-05, "loss": 0.1443, "step": 16950 }, { "epoch": 2.51, "grad_norm": 0.7671728730201721, "learning_rate": 7.509663990484687e-05, "loss": 0.1426, "step": 16960 }, { "epoch": 2.52, "grad_norm": 0.7778077721595764, "learning_rate": 7.508177222717811e-05, "loss": 0.1391, "step": 16970 }, { "epoch": 2.52, "grad_norm": 0.5975720882415771, "learning_rate": 7.506690454950937e-05, "loss": 0.1399, "step": 16980 }, { "epoch": 2.52, "grad_norm": 0.3030705153942108, "learning_rate": 7.505203687184062e-05, "loss": 0.1411, "step": 16990 }, { "epoch": 2.52, "grad_norm": 0.6025968790054321, "learning_rate": 7.503716919417188e-05, "loss": 0.1429, "step": 17000 }, { "epoch": 2.52, "grad_norm": 0.262866735458374, "learning_rate": 7.502230151650312e-05, "loss": 0.1359, "step": 17010 }, { "epoch": 2.52, "grad_norm": 1.0509122610092163, "learning_rate": 7.500743383883437e-05, "loss": 0.1488, "step": 17020 }, { "epoch": 2.52, "grad_norm": 0.8183063268661499, "learning_rate": 7.499256616116563e-05, "loss": 0.1436, "step": 17030 }, { "epoch": 2.53, "grad_norm": 0.247868612408638, "learning_rate": 7.497769848349688e-05, "loss": 0.1381, "step": 17040 }, { "epoch": 2.53, "grad_norm": 0.292298287153244, "learning_rate": 7.496283080582814e-05, "loss": 0.145, "step": 17050 }, { "epoch": 2.53, "grad_norm": 0.6261994242668152, "learning_rate": 7.494796312815938e-05, "loss": 0.1494, "step": 17060 }, { "epoch": 2.53, "grad_norm": 0.6475508213043213, "learning_rate": 7.493309545049065e-05, "loss": 0.1359, "step": 17070 }, { "epoch": 2.53, "grad_norm": 0.24762730300426483, "learning_rate": 7.491822777282189e-05, "loss": 0.1477, "step": 17080 }, { "epoch": 2.53, "grad_norm": 1.116014003753662, "learning_rate": 7.490336009515314e-05, "loss": 0.1436, "step": 17090 }, { "epoch": 2.53, "grad_norm": 0.24743910133838654, "learning_rate": 7.488849241748438e-05, "loss": 0.1452, "step": 17100 }, { "epoch": 2.54, "grad_norm": 0.4202406406402588, "learning_rate": 7.487362473981564e-05, "loss": 0.14, "step": 17110 }, { "epoch": 2.54, "grad_norm": 0.25671133399009705, "learning_rate": 7.485875706214691e-05, "loss": 0.1421, "step": 17120 }, { "epoch": 2.54, "grad_norm": 1.6126033067703247, "learning_rate": 7.484388938447815e-05, "loss": 0.1455, "step": 17130 }, { "epoch": 2.54, "grad_norm": 0.7320016026496887, "learning_rate": 7.48290217068094e-05, "loss": 0.1455, "step": 17140 }, { "epoch": 2.54, "grad_norm": 0.4042165279388428, "learning_rate": 7.481415402914064e-05, "loss": 0.138, "step": 17150 }, { "epoch": 2.54, "grad_norm": 0.34952160716056824, "learning_rate": 7.479928635147191e-05, "loss": 0.1372, "step": 17160 }, { "epoch": 2.55, "grad_norm": 0.43814823031425476, "learning_rate": 7.478441867380315e-05, "loss": 0.1426, "step": 17170 }, { "epoch": 2.55, "grad_norm": 0.30086013674736023, "learning_rate": 7.476955099613441e-05, "loss": 0.1333, "step": 17180 }, { "epoch": 2.55, "grad_norm": 0.6000199317932129, "learning_rate": 7.475468331846566e-05, "loss": 0.1436, "step": 17190 }, { "epoch": 2.55, "grad_norm": 0.31554168462753296, "learning_rate": 7.47398156407969e-05, "loss": 0.1476, "step": 17200 }, { "epoch": 2.55, "grad_norm": 0.5113075971603394, "learning_rate": 7.472494796312817e-05, "loss": 0.1442, "step": 17210 }, { "epoch": 2.55, "grad_norm": 0.6701615452766418, "learning_rate": 7.471008028545941e-05, "loss": 0.1412, "step": 17220 }, { "epoch": 2.55, "grad_norm": 0.9265919327735901, "learning_rate": 7.469521260779067e-05, "loss": 0.1443, "step": 17230 }, { "epoch": 2.56, "grad_norm": 0.5859856605529785, "learning_rate": 7.468034493012191e-05, "loss": 0.1479, "step": 17240 }, { "epoch": 2.56, "grad_norm": 0.39752840995788574, "learning_rate": 7.466547725245318e-05, "loss": 0.1487, "step": 17250 }, { "epoch": 2.56, "grad_norm": 0.5720125436782837, "learning_rate": 7.465060957478442e-05, "loss": 0.1419, "step": 17260 }, { "epoch": 2.56, "grad_norm": 0.754892110824585, "learning_rate": 7.463574189711567e-05, "loss": 0.1425, "step": 17270 }, { "epoch": 2.56, "grad_norm": 0.7733323574066162, "learning_rate": 7.462087421944693e-05, "loss": 0.1453, "step": 17280 }, { "epoch": 2.56, "grad_norm": 1.4931153059005737, "learning_rate": 7.460600654177817e-05, "loss": 0.135, "step": 17290 }, { "epoch": 2.56, "grad_norm": 0.3707599639892578, "learning_rate": 7.459113886410944e-05, "loss": 0.1415, "step": 17300 }, { "epoch": 2.57, "grad_norm": 0.3874984681606293, "learning_rate": 7.457627118644068e-05, "loss": 0.1476, "step": 17310 }, { "epoch": 2.57, "grad_norm": 0.3033313751220703, "learning_rate": 7.456140350877193e-05, "loss": 0.1414, "step": 17320 }, { "epoch": 2.57, "grad_norm": 0.6481125354766846, "learning_rate": 7.454653583110318e-05, "loss": 0.1432, "step": 17330 }, { "epoch": 2.57, "grad_norm": 1.2512054443359375, "learning_rate": 7.453166815343444e-05, "loss": 0.1396, "step": 17340 }, { "epoch": 2.57, "grad_norm": 0.6148348450660706, "learning_rate": 7.451680047576569e-05, "loss": 0.139, "step": 17350 }, { "epoch": 2.57, "grad_norm": 0.3186105191707611, "learning_rate": 7.450193279809694e-05, "loss": 0.1282, "step": 17360 }, { "epoch": 2.57, "grad_norm": 0.31010115146636963, "learning_rate": 7.44870651204282e-05, "loss": 0.1444, "step": 17370 }, { "epoch": 2.58, "grad_norm": 0.8012278079986572, "learning_rate": 7.447219744275944e-05, "loss": 0.1356, "step": 17380 }, { "epoch": 2.58, "grad_norm": 0.26003143191337585, "learning_rate": 7.44573297650907e-05, "loss": 0.1387, "step": 17390 }, { "epoch": 2.58, "grad_norm": 0.284709095954895, "learning_rate": 7.444246208742195e-05, "loss": 0.1456, "step": 17400 }, { "epoch": 2.58, "grad_norm": 1.3213711977005005, "learning_rate": 7.44275944097532e-05, "loss": 0.1347, "step": 17410 }, { "epoch": 2.58, "grad_norm": 0.8473525047302246, "learning_rate": 7.441272673208444e-05, "loss": 0.1453, "step": 17420 }, { "epoch": 2.58, "grad_norm": 0.36998623609542847, "learning_rate": 7.439785905441571e-05, "loss": 0.1308, "step": 17430 }, { "epoch": 2.59, "grad_norm": 2.219050168991089, "learning_rate": 7.438299137674695e-05, "loss": 0.1497, "step": 17440 }, { "epoch": 2.59, "grad_norm": 1.2892897129058838, "learning_rate": 7.43681236990782e-05, "loss": 0.1392, "step": 17450 }, { "epoch": 2.59, "grad_norm": 0.3273633122444153, "learning_rate": 7.435325602140946e-05, "loss": 0.1459, "step": 17460 }, { "epoch": 2.59, "grad_norm": 1.172643780708313, "learning_rate": 7.43383883437407e-05, "loss": 0.1396, "step": 17470 }, { "epoch": 2.59, "grad_norm": 0.7880512475967407, "learning_rate": 7.432352066607197e-05, "loss": 0.1404, "step": 17480 }, { "epoch": 2.59, "grad_norm": 0.3355169892311096, "learning_rate": 7.430865298840321e-05, "loss": 0.1513, "step": 17490 }, { "epoch": 2.59, "grad_norm": 0.5652661919593811, "learning_rate": 7.429378531073447e-05, "loss": 0.1349, "step": 17500 }, { "epoch": 2.6, "grad_norm": 0.34006038308143616, "learning_rate": 7.427891763306572e-05, "loss": 0.1406, "step": 17510 }, { "epoch": 2.6, "grad_norm": 0.258184552192688, "learning_rate": 7.426404995539698e-05, "loss": 0.1422, "step": 17520 }, { "epoch": 2.6, "grad_norm": 0.49831709265708923, "learning_rate": 7.424918227772823e-05, "loss": 0.1373, "step": 17530 }, { "epoch": 2.6, "grad_norm": 1.0806423425674438, "learning_rate": 7.423431460005947e-05, "loss": 0.1468, "step": 17540 }, { "epoch": 2.6, "grad_norm": 0.307546466588974, "learning_rate": 7.421944692239073e-05, "loss": 0.1376, "step": 17550 }, { "epoch": 2.6, "grad_norm": 0.5520646572113037, "learning_rate": 7.420457924472198e-05, "loss": 0.1426, "step": 17560 }, { "epoch": 2.6, "grad_norm": 0.2815137505531311, "learning_rate": 7.418971156705324e-05, "loss": 0.1359, "step": 17570 }, { "epoch": 2.61, "grad_norm": 0.4063834846019745, "learning_rate": 7.417484388938448e-05, "loss": 0.1448, "step": 17580 }, { "epoch": 2.61, "grad_norm": 0.4033089876174927, "learning_rate": 7.415997621171573e-05, "loss": 0.1406, "step": 17590 }, { "epoch": 2.61, "grad_norm": 0.300946980714798, "learning_rate": 7.414510853404699e-05, "loss": 0.1339, "step": 17600 }, { "epoch": 2.61, "grad_norm": 0.9063736200332642, "learning_rate": 7.413024085637824e-05, "loss": 0.146, "step": 17610 }, { "epoch": 2.61, "grad_norm": 1.2720870971679688, "learning_rate": 7.41153731787095e-05, "loss": 0.1394, "step": 17620 }, { "epoch": 2.61, "grad_norm": 0.36671578884124756, "learning_rate": 7.410050550104074e-05, "loss": 0.1451, "step": 17630 }, { "epoch": 2.61, "grad_norm": 0.28279754519462585, "learning_rate": 7.408563782337199e-05, "loss": 0.1421, "step": 17640 }, { "epoch": 2.62, "grad_norm": 1.5014578104019165, "learning_rate": 7.407077014570325e-05, "loss": 0.1362, "step": 17650 }, { "epoch": 2.62, "grad_norm": 0.6834672689437866, "learning_rate": 7.40559024680345e-05, "loss": 0.1504, "step": 17660 }, { "epoch": 2.62, "grad_norm": 0.5922780632972717, "learning_rate": 7.404103479036574e-05, "loss": 0.1375, "step": 17670 }, { "epoch": 2.62, "grad_norm": 0.2400318682193756, "learning_rate": 7.4026167112697e-05, "loss": 0.1401, "step": 17680 }, { "epoch": 2.62, "grad_norm": 0.8875470161437988, "learning_rate": 7.401129943502825e-05, "loss": 0.1465, "step": 17690 }, { "epoch": 2.62, "grad_norm": 0.2918970584869385, "learning_rate": 7.399643175735951e-05, "loss": 0.141, "step": 17700 }, { "epoch": 2.63, "grad_norm": 0.6647611260414124, "learning_rate": 7.398156407969076e-05, "loss": 0.1399, "step": 17710 }, { "epoch": 2.63, "grad_norm": 0.23712264001369476, "learning_rate": 7.3966696402022e-05, "loss": 0.1331, "step": 17720 }, { "epoch": 2.63, "grad_norm": 0.7306144833564758, "learning_rate": 7.395182872435326e-05, "loss": 0.1417, "step": 17730 }, { "epoch": 2.63, "grad_norm": 0.40289196372032166, "learning_rate": 7.393696104668451e-05, "loss": 0.1437, "step": 17740 }, { "epoch": 2.63, "grad_norm": 1.719495415687561, "learning_rate": 7.392209336901577e-05, "loss": 0.1472, "step": 17750 }, { "epoch": 2.63, "grad_norm": 0.24560745060443878, "learning_rate": 7.390722569134701e-05, "loss": 0.1434, "step": 17760 }, { "epoch": 2.63, "grad_norm": 1.0618599653244019, "learning_rate": 7.389235801367826e-05, "loss": 0.1481, "step": 17770 }, { "epoch": 2.64, "grad_norm": 0.5430271029472351, "learning_rate": 7.387749033600952e-05, "loss": 0.139, "step": 17780 }, { "epoch": 2.64, "grad_norm": 1.0089772939682007, "learning_rate": 7.386262265834077e-05, "loss": 0.1429, "step": 17790 }, { "epoch": 2.64, "grad_norm": 0.2407023161649704, "learning_rate": 7.384775498067203e-05, "loss": 0.1383, "step": 17800 }, { "epoch": 2.64, "grad_norm": 0.26621460914611816, "learning_rate": 7.383288730300327e-05, "loss": 0.139, "step": 17810 }, { "epoch": 2.64, "grad_norm": 0.26956406235694885, "learning_rate": 7.381801962533452e-05, "loss": 0.1417, "step": 17820 }, { "epoch": 2.64, "grad_norm": 0.2954663038253784, "learning_rate": 7.380315194766578e-05, "loss": 0.1337, "step": 17830 }, { "epoch": 2.64, "grad_norm": 1.1632752418518066, "learning_rate": 7.378828426999703e-05, "loss": 0.1466, "step": 17840 }, { "epoch": 2.65, "grad_norm": 1.1635485887527466, "learning_rate": 7.377341659232829e-05, "loss": 0.1374, "step": 17850 }, { "epoch": 2.65, "grad_norm": 0.622186541557312, "learning_rate": 7.375854891465953e-05, "loss": 0.1385, "step": 17860 }, { "epoch": 2.65, "grad_norm": 0.2939455211162567, "learning_rate": 7.374368123699078e-05, "loss": 0.137, "step": 17870 }, { "epoch": 2.65, "grad_norm": 0.38761767745018005, "learning_rate": 7.372881355932204e-05, "loss": 0.1361, "step": 17880 }, { "epoch": 2.65, "grad_norm": 0.9036255478858948, "learning_rate": 7.37139458816533e-05, "loss": 0.1374, "step": 17890 }, { "epoch": 2.65, "grad_norm": 0.34090399742126465, "learning_rate": 7.369907820398453e-05, "loss": 0.1325, "step": 17900 }, { "epoch": 2.65, "grad_norm": 0.29713836312294006, "learning_rate": 7.368421052631579e-05, "loss": 0.1408, "step": 17910 }, { "epoch": 2.66, "grad_norm": 0.826836884021759, "learning_rate": 7.366934284864704e-05, "loss": 0.1498, "step": 17920 }, { "epoch": 2.66, "grad_norm": 0.8794457316398621, "learning_rate": 7.36544751709783e-05, "loss": 0.1441, "step": 17930 }, { "epoch": 2.66, "grad_norm": 0.3596440851688385, "learning_rate": 7.363960749330955e-05, "loss": 0.1452, "step": 17940 }, { "epoch": 2.66, "grad_norm": 0.4660389721393585, "learning_rate": 7.36247398156408e-05, "loss": 0.1397, "step": 17950 }, { "epoch": 2.66, "grad_norm": 1.148728847503662, "learning_rate": 7.360987213797205e-05, "loss": 0.1429, "step": 17960 }, { "epoch": 2.66, "grad_norm": 0.5068449378013611, "learning_rate": 7.35950044603033e-05, "loss": 0.1367, "step": 17970 }, { "epoch": 2.67, "grad_norm": 0.3436690866947174, "learning_rate": 7.358013678263456e-05, "loss": 0.1434, "step": 17980 }, { "epoch": 2.67, "grad_norm": 0.28847482800483704, "learning_rate": 7.35652691049658e-05, "loss": 0.1452, "step": 17990 }, { "epoch": 2.67, "grad_norm": 0.3127191960811615, "learning_rate": 7.355040142729706e-05, "loss": 0.1468, "step": 18000 }, { "epoch": 2.67, "grad_norm": 0.3851916491985321, "learning_rate": 7.353553374962831e-05, "loss": 0.1416, "step": 18010 }, { "epoch": 2.67, "grad_norm": 1.0425105094909668, "learning_rate": 7.352066607195956e-05, "loss": 0.1336, "step": 18020 }, { "epoch": 2.67, "grad_norm": 0.5313891172409058, "learning_rate": 7.350579839429082e-05, "loss": 0.1449, "step": 18030 }, { "epoch": 2.67, "grad_norm": 0.3654390871524811, "learning_rate": 7.349093071662206e-05, "loss": 0.1304, "step": 18040 }, { "epoch": 2.68, "grad_norm": 0.8208723664283752, "learning_rate": 7.347606303895333e-05, "loss": 0.1395, "step": 18050 }, { "epoch": 2.68, "grad_norm": 0.265028178691864, "learning_rate": 7.346119536128457e-05, "loss": 0.1423, "step": 18060 }, { "epoch": 2.68, "grad_norm": 0.5879223346710205, "learning_rate": 7.344632768361583e-05, "loss": 0.1346, "step": 18070 }, { "epoch": 2.68, "grad_norm": 1.9254904985427856, "learning_rate": 7.343146000594707e-05, "loss": 0.1455, "step": 18080 }, { "epoch": 2.68, "grad_norm": 0.7956934571266174, "learning_rate": 7.341659232827832e-05, "loss": 0.1395, "step": 18090 }, { "epoch": 2.68, "grad_norm": 0.3552607595920563, "learning_rate": 7.340172465060958e-05, "loss": 0.1381, "step": 18100 }, { "epoch": 2.68, "grad_norm": 0.8107473254203796, "learning_rate": 7.338685697294083e-05, "loss": 0.1417, "step": 18110 }, { "epoch": 2.69, "grad_norm": 0.28250497579574585, "learning_rate": 7.337198929527209e-05, "loss": 0.144, "step": 18120 }, { "epoch": 2.69, "grad_norm": 0.26782554388046265, "learning_rate": 7.335712161760333e-05, "loss": 0.1386, "step": 18130 }, { "epoch": 2.69, "grad_norm": 0.5947808623313904, "learning_rate": 7.33422539399346e-05, "loss": 0.1424, "step": 18140 }, { "epoch": 2.69, "grad_norm": 0.2512226998806, "learning_rate": 7.332738626226584e-05, "loss": 0.1352, "step": 18150 }, { "epoch": 2.69, "grad_norm": 0.35504305362701416, "learning_rate": 7.331251858459709e-05, "loss": 0.1416, "step": 18160 }, { "epoch": 2.69, "grad_norm": 1.3061926364898682, "learning_rate": 7.329765090692835e-05, "loss": 0.1444, "step": 18170 }, { "epoch": 2.69, "grad_norm": 0.6368234157562256, "learning_rate": 7.328278322925959e-05, "loss": 0.1403, "step": 18180 }, { "epoch": 2.7, "grad_norm": 0.42611798644065857, "learning_rate": 7.326791555159086e-05, "loss": 0.1386, "step": 18190 }, { "epoch": 2.7, "grad_norm": 0.6984022855758667, "learning_rate": 7.32530478739221e-05, "loss": 0.1451, "step": 18200 }, { "epoch": 2.7, "grad_norm": 0.4926019310951233, "learning_rate": 7.323818019625335e-05, "loss": 0.1399, "step": 18210 }, { "epoch": 2.7, "grad_norm": 0.8579056262969971, "learning_rate": 7.322331251858459e-05, "loss": 0.143, "step": 18220 }, { "epoch": 2.7, "grad_norm": 0.9211030602455139, "learning_rate": 7.320844484091586e-05, "loss": 0.1469, "step": 18230 }, { "epoch": 2.7, "grad_norm": 0.8623634576797485, "learning_rate": 7.31935771632471e-05, "loss": 0.1424, "step": 18240 }, { "epoch": 2.71, "grad_norm": 0.2992829382419586, "learning_rate": 7.317870948557836e-05, "loss": 0.1417, "step": 18250 }, { "epoch": 2.71, "grad_norm": 0.6315504908561707, "learning_rate": 7.316384180790961e-05, "loss": 0.1417, "step": 18260 }, { "epoch": 2.71, "grad_norm": 0.9249059557914734, "learning_rate": 7.314897413024085e-05, "loss": 0.1468, "step": 18270 }, { "epoch": 2.71, "grad_norm": 0.35125473141670227, "learning_rate": 7.313410645257212e-05, "loss": 0.145, "step": 18280 }, { "epoch": 2.71, "grad_norm": 0.4258287250995636, "learning_rate": 7.311923877490336e-05, "loss": 0.1401, "step": 18290 }, { "epoch": 2.71, "grad_norm": 0.26481908559799194, "learning_rate": 7.310437109723462e-05, "loss": 0.1431, "step": 18300 }, { "epoch": 2.71, "grad_norm": 1.5498566627502441, "learning_rate": 7.308950341956586e-05, "loss": 0.1455, "step": 18310 }, { "epoch": 2.72, "grad_norm": 1.0560815334320068, "learning_rate": 7.307463574189713e-05, "loss": 0.1475, "step": 18320 }, { "epoch": 2.72, "grad_norm": 0.7546606659889221, "learning_rate": 7.305976806422837e-05, "loss": 0.1465, "step": 18330 }, { "epoch": 2.72, "grad_norm": 1.151910424232483, "learning_rate": 7.304490038655962e-05, "loss": 0.1435, "step": 18340 }, { "epoch": 2.72, "grad_norm": 0.38248512148857117, "learning_rate": 7.303003270889088e-05, "loss": 0.1462, "step": 18350 }, { "epoch": 2.72, "grad_norm": 0.24999620020389557, "learning_rate": 7.301516503122212e-05, "loss": 0.1423, "step": 18360 }, { "epoch": 2.72, "grad_norm": 1.2096368074417114, "learning_rate": 7.300029735355339e-05, "loss": 0.1402, "step": 18370 }, { "epoch": 2.72, "grad_norm": 0.6669314503669739, "learning_rate": 7.298542967588463e-05, "loss": 0.1419, "step": 18380 }, { "epoch": 2.73, "grad_norm": 0.7453274726867676, "learning_rate": 7.297056199821588e-05, "loss": 0.1507, "step": 18390 }, { "epoch": 2.73, "grad_norm": 0.3134549558162689, "learning_rate": 7.295569432054712e-05, "loss": 0.1401, "step": 18400 }, { "epoch": 2.73, "grad_norm": 0.509635329246521, "learning_rate": 7.294082664287839e-05, "loss": 0.1427, "step": 18410 }, { "epoch": 2.73, "grad_norm": 1.5291324853897095, "learning_rate": 7.292595896520963e-05, "loss": 0.1382, "step": 18420 }, { "epoch": 2.73, "grad_norm": 0.8732107877731323, "learning_rate": 7.291109128754089e-05, "loss": 0.1425, "step": 18430 }, { "epoch": 2.73, "grad_norm": 0.951556384563446, "learning_rate": 7.289622360987214e-05, "loss": 0.1417, "step": 18440 }, { "epoch": 2.73, "grad_norm": 0.22064682841300964, "learning_rate": 7.288135593220338e-05, "loss": 0.1389, "step": 18450 }, { "epoch": 2.74, "grad_norm": 0.36942338943481445, "learning_rate": 7.286648825453465e-05, "loss": 0.1356, "step": 18460 }, { "epoch": 2.74, "grad_norm": 0.9014129638671875, "learning_rate": 7.28516205768659e-05, "loss": 0.1439, "step": 18470 }, { "epoch": 2.74, "grad_norm": 0.31892070174217224, "learning_rate": 7.283675289919715e-05, "loss": 0.1439, "step": 18480 }, { "epoch": 2.74, "grad_norm": 0.3054444491863251, "learning_rate": 7.28218852215284e-05, "loss": 0.1385, "step": 18490 }, { "epoch": 2.74, "grad_norm": 0.23387959599494934, "learning_rate": 7.280701754385966e-05, "loss": 0.1406, "step": 18500 }, { "epoch": 2.74, "grad_norm": 0.26444515585899353, "learning_rate": 7.279214986619091e-05, "loss": 0.1412, "step": 18510 }, { "epoch": 2.75, "grad_norm": 0.8463549613952637, "learning_rate": 7.277728218852215e-05, "loss": 0.1437, "step": 18520 }, { "epoch": 2.75, "grad_norm": 0.9371518492698669, "learning_rate": 7.276241451085341e-05, "loss": 0.1401, "step": 18530 }, { "epoch": 2.75, "grad_norm": 0.3569345772266388, "learning_rate": 7.274754683318466e-05, "loss": 0.132, "step": 18540 }, { "epoch": 2.75, "grad_norm": 0.25233766436576843, "learning_rate": 7.273267915551592e-05, "loss": 0.1409, "step": 18550 }, { "epoch": 2.75, "grad_norm": 0.46643221378326416, "learning_rate": 7.271781147784716e-05, "loss": 0.1378, "step": 18560 }, { "epoch": 2.75, "grad_norm": 1.249118447303772, "learning_rate": 7.270294380017841e-05, "loss": 0.1477, "step": 18570 }, { "epoch": 2.75, "grad_norm": 0.29594314098358154, "learning_rate": 7.268807612250967e-05, "loss": 0.15, "step": 18580 }, { "epoch": 2.76, "grad_norm": 0.2833545207977295, "learning_rate": 7.267320844484092e-05, "loss": 0.1405, "step": 18590 }, { "epoch": 2.76, "grad_norm": 0.5924676656723022, "learning_rate": 7.265834076717218e-05, "loss": 0.1372, "step": 18600 }, { "epoch": 2.76, "grad_norm": 0.3902270495891571, "learning_rate": 7.264347308950342e-05, "loss": 0.1413, "step": 18610 }, { "epoch": 2.76, "grad_norm": 0.447142630815506, "learning_rate": 7.262860541183467e-05, "loss": 0.1426, "step": 18620 }, { "epoch": 2.76, "grad_norm": 0.4772433340549469, "learning_rate": 7.261373773416593e-05, "loss": 0.1394, "step": 18630 }, { "epoch": 2.76, "grad_norm": 0.45227327942848206, "learning_rate": 7.259887005649718e-05, "loss": 0.1315, "step": 18640 }, { "epoch": 2.76, "grad_norm": 1.1519386768341064, "learning_rate": 7.258400237882843e-05, "loss": 0.1408, "step": 18650 }, { "epoch": 2.77, "grad_norm": 0.8935173749923706, "learning_rate": 7.256913470115968e-05, "loss": 0.1429, "step": 18660 }, { "epoch": 2.77, "grad_norm": 0.24923725426197052, "learning_rate": 7.255426702349093e-05, "loss": 0.132, "step": 18670 }, { "epoch": 2.77, "grad_norm": 0.4108467102050781, "learning_rate": 7.253939934582219e-05, "loss": 0.1406, "step": 18680 }, { "epoch": 2.77, "grad_norm": 0.7500537037849426, "learning_rate": 7.252453166815344e-05, "loss": 0.1435, "step": 18690 }, { "epoch": 2.77, "grad_norm": 0.3367678225040436, "learning_rate": 7.250966399048469e-05, "loss": 0.1423, "step": 18700 }, { "epoch": 2.77, "grad_norm": 0.30210843682289124, "learning_rate": 7.249479631281594e-05, "loss": 0.1513, "step": 18710 }, { "epoch": 2.77, "grad_norm": 0.7015200257301331, "learning_rate": 7.24799286351472e-05, "loss": 0.14, "step": 18720 }, { "epoch": 2.78, "grad_norm": 0.45465758442878723, "learning_rate": 7.246506095747845e-05, "loss": 0.1443, "step": 18730 }, { "epoch": 2.78, "grad_norm": 0.9578850269317627, "learning_rate": 7.245019327980969e-05, "loss": 0.1461, "step": 18740 }, { "epoch": 2.78, "grad_norm": 0.20722438395023346, "learning_rate": 7.243532560214095e-05, "loss": 0.1311, "step": 18750 }, { "epoch": 2.78, "grad_norm": 0.9666074514389038, "learning_rate": 7.24204579244722e-05, "loss": 0.1319, "step": 18760 }, { "epoch": 2.78, "grad_norm": 0.286914199590683, "learning_rate": 7.240559024680346e-05, "loss": 0.1404, "step": 18770 }, { "epoch": 2.78, "grad_norm": 0.635274350643158, "learning_rate": 7.239072256913471e-05, "loss": 0.1369, "step": 18780 }, { "epoch": 2.79, "grad_norm": 0.46256542205810547, "learning_rate": 7.237585489146595e-05, "loss": 0.1416, "step": 18790 }, { "epoch": 2.79, "grad_norm": 0.5526597499847412, "learning_rate": 7.23609872137972e-05, "loss": 0.14, "step": 18800 }, { "epoch": 2.79, "grad_norm": 1.3282074928283691, "learning_rate": 7.234611953612846e-05, "loss": 0.1523, "step": 18810 }, { "epoch": 2.79, "grad_norm": 0.2414388209581375, "learning_rate": 7.233125185845972e-05, "loss": 0.133, "step": 18820 }, { "epoch": 2.79, "grad_norm": 0.8080794215202332, "learning_rate": 7.231638418079097e-05, "loss": 0.143, "step": 18830 }, { "epoch": 2.79, "grad_norm": 0.4373549818992615, "learning_rate": 7.230151650312221e-05, "loss": 0.1352, "step": 18840 }, { "epoch": 2.79, "grad_norm": 0.6084916591644287, "learning_rate": 7.228664882545347e-05, "loss": 0.1433, "step": 18850 }, { "epoch": 2.8, "grad_norm": 0.41089242696762085, "learning_rate": 7.227178114778472e-05, "loss": 0.145, "step": 18860 }, { "epoch": 2.8, "grad_norm": 0.2655513882637024, "learning_rate": 7.225691347011598e-05, "loss": 0.1423, "step": 18870 }, { "epoch": 2.8, "grad_norm": 0.4778943359851837, "learning_rate": 7.224204579244722e-05, "loss": 0.1348, "step": 18880 }, { "epoch": 2.8, "grad_norm": 0.4908297061920166, "learning_rate": 7.222717811477847e-05, "loss": 0.1434, "step": 18890 }, { "epoch": 2.8, "grad_norm": 0.3996427059173584, "learning_rate": 7.221231043710973e-05, "loss": 0.1404, "step": 18900 }, { "epoch": 2.8, "grad_norm": 0.6099945306777954, "learning_rate": 7.219744275944098e-05, "loss": 0.1454, "step": 18910 }, { "epoch": 2.8, "grad_norm": 0.45081639289855957, "learning_rate": 7.218257508177224e-05, "loss": 0.1354, "step": 18920 }, { "epoch": 2.81, "grad_norm": 1.0426857471466064, "learning_rate": 7.216770740410348e-05, "loss": 0.1491, "step": 18930 }, { "epoch": 2.81, "grad_norm": 0.772966206073761, "learning_rate": 7.215283972643473e-05, "loss": 0.1365, "step": 18940 }, { "epoch": 2.81, "grad_norm": 0.3265818655490875, "learning_rate": 7.213797204876599e-05, "loss": 0.1498, "step": 18950 }, { "epoch": 2.81, "grad_norm": 0.2635791003704071, "learning_rate": 7.212310437109724e-05, "loss": 0.1363, "step": 18960 }, { "epoch": 2.81, "grad_norm": 0.9569715857505798, "learning_rate": 7.210823669342848e-05, "loss": 0.1327, "step": 18970 }, { "epoch": 2.81, "grad_norm": 0.20809006690979004, "learning_rate": 7.209336901575974e-05, "loss": 0.1377, "step": 18980 }, { "epoch": 2.81, "grad_norm": 1.0042877197265625, "learning_rate": 7.207850133809099e-05, "loss": 0.1371, "step": 18990 }, { "epoch": 2.82, "grad_norm": 0.5755969882011414, "learning_rate": 7.206363366042225e-05, "loss": 0.1355, "step": 19000 }, { "epoch": 2.82, "grad_norm": 0.8315523862838745, "learning_rate": 7.20487659827535e-05, "loss": 0.1435, "step": 19010 }, { "epoch": 2.82, "grad_norm": 0.5126630663871765, "learning_rate": 7.203389830508474e-05, "loss": 0.1358, "step": 19020 }, { "epoch": 2.82, "grad_norm": 0.5610932111740112, "learning_rate": 7.201903062741601e-05, "loss": 0.1303, "step": 19030 }, { "epoch": 2.82, "grad_norm": 0.49236372113227844, "learning_rate": 7.200416294974725e-05, "loss": 0.1428, "step": 19040 }, { "epoch": 2.82, "grad_norm": 0.26628008484840393, "learning_rate": 7.198929527207851e-05, "loss": 0.1407, "step": 19050 }, { "epoch": 2.83, "grad_norm": 1.6078991889953613, "learning_rate": 7.197442759440975e-05, "loss": 0.135, "step": 19060 }, { "epoch": 2.83, "grad_norm": 0.3959672749042511, "learning_rate": 7.1959559916741e-05, "loss": 0.1418, "step": 19070 }, { "epoch": 2.83, "grad_norm": 0.35960277915000916, "learning_rate": 7.194469223907226e-05, "loss": 0.1386, "step": 19080 }, { "epoch": 2.83, "grad_norm": 0.46168479323387146, "learning_rate": 7.192982456140351e-05, "loss": 0.1465, "step": 19090 }, { "epoch": 2.83, "grad_norm": 0.7458998560905457, "learning_rate": 7.191495688373477e-05, "loss": 0.1347, "step": 19100 }, { "epoch": 2.83, "grad_norm": 0.629227876663208, "learning_rate": 7.190008920606601e-05, "loss": 0.1382, "step": 19110 }, { "epoch": 2.83, "grad_norm": 0.48990973830223083, "learning_rate": 7.188522152839728e-05, "loss": 0.1346, "step": 19120 }, { "epoch": 2.84, "grad_norm": 1.1364151239395142, "learning_rate": 7.187035385072852e-05, "loss": 0.1346, "step": 19130 }, { "epoch": 2.84, "grad_norm": 0.39966854453086853, "learning_rate": 7.185548617305977e-05, "loss": 0.1365, "step": 19140 }, { "epoch": 2.84, "grad_norm": 0.7130772471427917, "learning_rate": 7.184061849539101e-05, "loss": 0.1474, "step": 19150 }, { "epoch": 2.84, "grad_norm": 0.47073498368263245, "learning_rate": 7.182575081772227e-05, "loss": 0.136, "step": 19160 }, { "epoch": 2.84, "grad_norm": 0.7420998811721802, "learning_rate": 7.181088314005354e-05, "loss": 0.1423, "step": 19170 }, { "epoch": 2.84, "grad_norm": 0.5848856568336487, "learning_rate": 7.179601546238478e-05, "loss": 0.1435, "step": 19180 }, { "epoch": 2.84, "grad_norm": 0.41992321610450745, "learning_rate": 7.178114778471603e-05, "loss": 0.1385, "step": 19190 }, { "epoch": 2.85, "grad_norm": 0.2730390131473541, "learning_rate": 7.176628010704727e-05, "loss": 0.135, "step": 19200 }, { "epoch": 2.85, "grad_norm": 0.503334105014801, "learning_rate": 7.175141242937854e-05, "loss": 0.141, "step": 19210 }, { "epoch": 2.85, "grad_norm": 0.7644150257110596, "learning_rate": 7.173654475170978e-05, "loss": 0.1387, "step": 19220 }, { "epoch": 2.85, "grad_norm": 0.33800143003463745, "learning_rate": 7.172167707404104e-05, "loss": 0.1381, "step": 19230 }, { "epoch": 2.85, "grad_norm": 0.5975232124328613, "learning_rate": 7.17068093963723e-05, "loss": 0.1403, "step": 19240 }, { "epoch": 2.85, "grad_norm": 0.3202337920665741, "learning_rate": 7.169194171870353e-05, "loss": 0.1415, "step": 19250 }, { "epoch": 2.85, "grad_norm": 0.2667614221572876, "learning_rate": 7.16770740410348e-05, "loss": 0.1404, "step": 19260 }, { "epoch": 2.86, "grad_norm": 0.28970199823379517, "learning_rate": 7.166220636336604e-05, "loss": 0.144, "step": 19270 }, { "epoch": 2.86, "grad_norm": 0.9261873364448547, "learning_rate": 7.16473386856973e-05, "loss": 0.1355, "step": 19280 }, { "epoch": 2.86, "grad_norm": 0.30007144808769226, "learning_rate": 7.163247100802854e-05, "loss": 0.1357, "step": 19290 }, { "epoch": 2.86, "grad_norm": 1.1617704629898071, "learning_rate": 7.161760333035981e-05, "loss": 0.1433, "step": 19300 }, { "epoch": 2.86, "grad_norm": 0.5835827589035034, "learning_rate": 7.160273565269105e-05, "loss": 0.1412, "step": 19310 }, { "epoch": 2.86, "grad_norm": 0.3388794958591461, "learning_rate": 7.15878679750223e-05, "loss": 0.1467, "step": 19320 }, { "epoch": 2.87, "grad_norm": 0.5224015712738037, "learning_rate": 7.157300029735356e-05, "loss": 0.1409, "step": 19330 }, { "epoch": 2.87, "grad_norm": 0.2810278534889221, "learning_rate": 7.15581326196848e-05, "loss": 0.14, "step": 19340 }, { "epoch": 2.87, "grad_norm": 0.6086370348930359, "learning_rate": 7.154326494201607e-05, "loss": 0.1415, "step": 19350 }, { "epoch": 2.87, "grad_norm": 0.28138285875320435, "learning_rate": 7.152839726434731e-05, "loss": 0.1351, "step": 19360 }, { "epoch": 2.87, "grad_norm": 0.2224389910697937, "learning_rate": 7.151352958667857e-05, "loss": 0.1389, "step": 19370 }, { "epoch": 2.87, "grad_norm": 0.6104931235313416, "learning_rate": 7.14986619090098e-05, "loss": 0.144, "step": 19380 }, { "epoch": 2.87, "grad_norm": 0.29406848549842834, "learning_rate": 7.148379423134107e-05, "loss": 0.1367, "step": 19390 }, { "epoch": 2.88, "grad_norm": 0.9205821752548218, "learning_rate": 7.146892655367232e-05, "loss": 0.1391, "step": 19400 }, { "epoch": 2.88, "grad_norm": 0.89859539270401, "learning_rate": 7.145405887600357e-05, "loss": 0.1401, "step": 19410 }, { "epoch": 2.88, "grad_norm": 0.37001898884773254, "learning_rate": 7.143919119833483e-05, "loss": 0.1421, "step": 19420 }, { "epoch": 2.88, "grad_norm": 0.483979731798172, "learning_rate": 7.142432352066607e-05, "loss": 0.1388, "step": 19430 }, { "epoch": 2.88, "grad_norm": 0.3105422556400299, "learning_rate": 7.140945584299733e-05, "loss": 0.1451, "step": 19440 }, { "epoch": 2.88, "grad_norm": 0.598884105682373, "learning_rate": 7.139458816532858e-05, "loss": 0.1378, "step": 19450 }, { "epoch": 2.88, "grad_norm": 0.6454994082450867, "learning_rate": 7.137972048765983e-05, "loss": 0.1382, "step": 19460 }, { "epoch": 2.89, "grad_norm": 0.18754199147224426, "learning_rate": 7.136485280999107e-05, "loss": 0.1276, "step": 19470 }, { "epoch": 2.89, "grad_norm": 0.31574535369873047, "learning_rate": 7.134998513232234e-05, "loss": 0.1422, "step": 19480 }, { "epoch": 2.89, "grad_norm": 0.33168497681617737, "learning_rate": 7.133511745465358e-05, "loss": 0.1333, "step": 19490 }, { "epoch": 2.89, "grad_norm": 0.3626178503036499, "learning_rate": 7.132024977698484e-05, "loss": 0.1411, "step": 19500 }, { "epoch": 2.89, "grad_norm": 0.19792704284191132, "learning_rate": 7.130538209931609e-05, "loss": 0.1357, "step": 19510 }, { "epoch": 2.89, "grad_norm": 0.45540741086006165, "learning_rate": 7.129051442164735e-05, "loss": 0.1373, "step": 19520 }, { "epoch": 2.89, "grad_norm": 1.1599476337432861, "learning_rate": 7.12756467439786e-05, "loss": 0.1394, "step": 19530 }, { "epoch": 2.9, "grad_norm": 0.3340741991996765, "learning_rate": 7.126077906630984e-05, "loss": 0.1394, "step": 19540 }, { "epoch": 2.9, "grad_norm": 0.3207394480705261, "learning_rate": 7.12459113886411e-05, "loss": 0.1456, "step": 19550 }, { "epoch": 2.9, "grad_norm": 0.8679114580154419, "learning_rate": 7.123104371097235e-05, "loss": 0.1392, "step": 19560 }, { "epoch": 2.9, "grad_norm": 1.6166648864746094, "learning_rate": 7.12161760333036e-05, "loss": 0.1494, "step": 19570 }, { "epoch": 2.9, "grad_norm": 0.4138358533382416, "learning_rate": 7.120130835563486e-05, "loss": 0.1461, "step": 19580 }, { "epoch": 2.9, "grad_norm": 0.23959451913833618, "learning_rate": 7.11864406779661e-05, "loss": 0.1339, "step": 19590 }, { "epoch": 2.91, "grad_norm": 0.32449159026145935, "learning_rate": 7.117157300029736e-05, "loss": 0.1282, "step": 19600 }, { "epoch": 2.91, "grad_norm": 2.3437106609344482, "learning_rate": 7.115670532262861e-05, "loss": 0.1494, "step": 19610 }, { "epoch": 2.91, "grad_norm": 0.4861600995063782, "learning_rate": 7.114183764495987e-05, "loss": 0.1428, "step": 19620 }, { "epoch": 2.91, "grad_norm": 0.6348639726638794, "learning_rate": 7.112696996729111e-05, "loss": 0.1397, "step": 19630 }, { "epoch": 2.91, "grad_norm": 0.27723273634910583, "learning_rate": 7.111210228962236e-05, "loss": 0.1427, "step": 19640 }, { "epoch": 2.91, "grad_norm": 0.36210235953330994, "learning_rate": 7.109723461195362e-05, "loss": 0.1429, "step": 19650 }, { "epoch": 2.91, "grad_norm": 0.48523059487342834, "learning_rate": 7.108236693428487e-05, "loss": 0.1331, "step": 19660 }, { "epoch": 2.92, "grad_norm": 0.6052648425102234, "learning_rate": 7.106749925661613e-05, "loss": 0.1447, "step": 19670 }, { "epoch": 2.92, "grad_norm": 0.2304733544588089, "learning_rate": 7.105263157894737e-05, "loss": 0.1311, "step": 19680 }, { "epoch": 2.92, "grad_norm": 0.2759769558906555, "learning_rate": 7.103776390127862e-05, "loss": 0.1388, "step": 19690 }, { "epoch": 2.92, "grad_norm": 0.9866233468055725, "learning_rate": 7.102289622360988e-05, "loss": 0.1448, "step": 19700 }, { "epoch": 2.92, "grad_norm": 0.8615758419036865, "learning_rate": 7.100802854594113e-05, "loss": 0.1501, "step": 19710 }, { "epoch": 2.92, "grad_norm": 0.49602025747299194, "learning_rate": 7.099316086827237e-05, "loss": 0.1455, "step": 19720 }, { "epoch": 2.92, "grad_norm": 0.47930097579956055, "learning_rate": 7.097829319060363e-05, "loss": 0.1398, "step": 19730 }, { "epoch": 2.93, "grad_norm": 0.4859764575958252, "learning_rate": 7.096342551293488e-05, "loss": 0.138, "step": 19740 }, { "epoch": 2.93, "grad_norm": 0.7272999882698059, "learning_rate": 7.094855783526614e-05, "loss": 0.1323, "step": 19750 }, { "epoch": 2.93, "grad_norm": 0.3457830250263214, "learning_rate": 7.093369015759739e-05, "loss": 0.1425, "step": 19760 }, { "epoch": 2.93, "grad_norm": 0.23593512177467346, "learning_rate": 7.091882247992863e-05, "loss": 0.1293, "step": 19770 }, { "epoch": 2.93, "grad_norm": 0.8734978437423706, "learning_rate": 7.090395480225989e-05, "loss": 0.136, "step": 19780 }, { "epoch": 2.93, "grad_norm": 0.3335154950618744, "learning_rate": 7.088908712459114e-05, "loss": 0.1376, "step": 19790 }, { "epoch": 2.93, "grad_norm": 0.34812235832214355, "learning_rate": 7.08742194469224e-05, "loss": 0.1357, "step": 19800 }, { "epoch": 2.94, "grad_norm": 0.2229343056678772, "learning_rate": 7.085935176925364e-05, "loss": 0.1397, "step": 19810 }, { "epoch": 2.94, "grad_norm": 0.29468947649002075, "learning_rate": 7.08444840915849e-05, "loss": 0.1386, "step": 19820 }, { "epoch": 2.94, "grad_norm": 0.8740628361701965, "learning_rate": 7.082961641391615e-05, "loss": 0.1403, "step": 19830 }, { "epoch": 2.94, "grad_norm": 0.5060070753097534, "learning_rate": 7.08147487362474e-05, "loss": 0.1439, "step": 19840 }, { "epoch": 2.94, "grad_norm": 0.2761290967464447, "learning_rate": 7.079988105857866e-05, "loss": 0.1454, "step": 19850 }, { "epoch": 2.94, "grad_norm": 0.2517363429069519, "learning_rate": 7.07850133809099e-05, "loss": 0.1381, "step": 19860 }, { "epoch": 2.95, "grad_norm": 0.6545493006706238, "learning_rate": 7.077014570324115e-05, "loss": 0.1349, "step": 19870 }, { "epoch": 2.95, "grad_norm": 0.6688421964645386, "learning_rate": 7.075527802557241e-05, "loss": 0.1384, "step": 19880 }, { "epoch": 2.95, "grad_norm": 0.26555657386779785, "learning_rate": 7.074041034790366e-05, "loss": 0.1385, "step": 19890 }, { "epoch": 2.95, "grad_norm": 0.32324767112731934, "learning_rate": 7.072554267023492e-05, "loss": 0.1362, "step": 19900 }, { "epoch": 2.95, "grad_norm": 0.7072975039482117, "learning_rate": 7.071067499256616e-05, "loss": 0.1416, "step": 19910 }, { "epoch": 2.95, "grad_norm": 0.9408333897590637, "learning_rate": 7.069580731489741e-05, "loss": 0.137, "step": 19920 }, { "epoch": 2.95, "grad_norm": 0.7711098194122314, "learning_rate": 7.068093963722867e-05, "loss": 0.1379, "step": 19930 }, { "epoch": 2.96, "grad_norm": 1.1616792678833008, "learning_rate": 7.066607195955992e-05, "loss": 0.1445, "step": 19940 }, { "epoch": 2.96, "grad_norm": 0.3702917993068695, "learning_rate": 7.065120428189117e-05, "loss": 0.1516, "step": 19950 }, { "epoch": 2.96, "grad_norm": 1.209560513496399, "learning_rate": 7.063633660422242e-05, "loss": 0.1455, "step": 19960 }, { "epoch": 2.96, "grad_norm": 0.42448684573173523, "learning_rate": 7.062146892655367e-05, "loss": 0.1464, "step": 19970 }, { "epoch": 2.96, "grad_norm": 0.7959136366844177, "learning_rate": 7.060660124888493e-05, "loss": 0.1467, "step": 19980 }, { "epoch": 2.96, "grad_norm": 0.3942021131515503, "learning_rate": 7.059173357121618e-05, "loss": 0.14, "step": 19990 }, { "epoch": 2.96, "grad_norm": 0.2557319104671478, "learning_rate": 7.057686589354743e-05, "loss": 0.1373, "step": 20000 }, { "epoch": 2.97, "grad_norm": 0.9788160920143127, "learning_rate": 7.05619982158787e-05, "loss": 0.1346, "step": 20010 }, { "epoch": 2.97, "grad_norm": 0.39195263385772705, "learning_rate": 7.054713053820994e-05, "loss": 0.1364, "step": 20020 }, { "epoch": 2.97, "grad_norm": 0.37623974680900574, "learning_rate": 7.053226286054119e-05, "loss": 0.142, "step": 20030 }, { "epoch": 2.97, "grad_norm": 0.6487099528312683, "learning_rate": 7.051739518287243e-05, "loss": 0.1424, "step": 20040 }, { "epoch": 2.97, "grad_norm": 0.6612546443939209, "learning_rate": 7.050252750520369e-05, "loss": 0.1381, "step": 20050 }, { "epoch": 2.97, "grad_norm": 0.30832764506340027, "learning_rate": 7.048765982753494e-05, "loss": 0.139, "step": 20060 }, { "epoch": 2.97, "grad_norm": 0.5305371880531311, "learning_rate": 7.04727921498662e-05, "loss": 0.1415, "step": 20070 }, { "epoch": 2.98, "grad_norm": 0.4207991361618042, "learning_rate": 7.045792447219745e-05, "loss": 0.1385, "step": 20080 }, { "epoch": 2.98, "grad_norm": 0.2595292627811432, "learning_rate": 7.044305679452869e-05, "loss": 0.1286, "step": 20090 }, { "epoch": 2.98, "grad_norm": 0.864605724811554, "learning_rate": 7.042818911685996e-05, "loss": 0.1325, "step": 20100 }, { "epoch": 2.98, "grad_norm": 1.4537006616592407, "learning_rate": 7.04133214391912e-05, "loss": 0.135, "step": 20110 }, { "epoch": 2.98, "grad_norm": 0.3340538740158081, "learning_rate": 7.039845376152246e-05, "loss": 0.1375, "step": 20120 }, { "epoch": 2.98, "grad_norm": 1.0329761505126953, "learning_rate": 7.03835860838537e-05, "loss": 0.137, "step": 20130 }, { "epoch": 2.99, "grad_norm": 0.6245138645172119, "learning_rate": 7.036871840618495e-05, "loss": 0.1372, "step": 20140 }, { "epoch": 2.99, "grad_norm": 0.35035985708236694, "learning_rate": 7.03538507285162e-05, "loss": 0.1395, "step": 20150 }, { "epoch": 2.99, "grad_norm": 0.351352721452713, "learning_rate": 7.033898305084746e-05, "loss": 0.1433, "step": 20160 }, { "epoch": 2.99, "grad_norm": 0.4944305717945099, "learning_rate": 7.032411537317872e-05, "loss": 0.1388, "step": 20170 }, { "epoch": 2.99, "grad_norm": 0.8752815127372742, "learning_rate": 7.030924769550996e-05, "loss": 0.1341, "step": 20180 }, { "epoch": 2.99, "grad_norm": 0.30437782406806946, "learning_rate": 7.029438001784123e-05, "loss": 0.1306, "step": 20190 }, { "epoch": 2.99, "grad_norm": 0.797670841217041, "learning_rate": 7.027951234017247e-05, "loss": 0.1469, "step": 20200 }, { "epoch": 3.0, "grad_norm": 1.6191693544387817, "learning_rate": 7.026464466250372e-05, "loss": 0.1346, "step": 20210 }, { "epoch": 3.0, "grad_norm": 0.9991375803947449, "learning_rate": 7.024977698483498e-05, "loss": 0.1402, "step": 20220 }, { "epoch": 3.0, "grad_norm": 1.0731664896011353, "learning_rate": 7.023490930716622e-05, "loss": 0.1451, "step": 20230 }, { "epoch": 3.0, "eval_loss": 0.15701617300510406, "eval_runtime": 2481.1974, "eval_samples_per_second": 235.24, "eval_steps_per_second": 3.676, "step": 20238 } ], "logging_steps": 10, "max_steps": 67460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 8.63158939557036e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }