{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 52890, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 103.3125, "learning_rate": 9.996224236844906e-07, "loss": 5.1244, "step": 20 }, { "epoch": 0.0, "grad_norm": 133.25, "learning_rate": 9.99244280153685e-07, "loss": 4.3099, "step": 40 }, { "epoch": 0.01, "grad_norm": 74.375, "learning_rate": 9.988661366228795e-07, "loss": 3.1455, "step": 60 }, { "epoch": 0.01, "grad_norm": 83.875, "learning_rate": 9.98487993092074e-07, "loss": 3.0035, "step": 80 }, { "epoch": 0.01, "grad_norm": 106.0625, "learning_rate": 9.981098495612682e-07, "loss": 2.9661, "step": 100 }, { "epoch": 0.01, "grad_norm": 55.28125, "learning_rate": 9.977317060304627e-07, "loss": 2.634, "step": 120 }, { "epoch": 0.01, "grad_norm": 69.0, "learning_rate": 9.973535624996572e-07, "loss": 2.601, "step": 140 }, { "epoch": 0.02, "grad_norm": 64.6875, "learning_rate": 9.969754189688517e-07, "loss": 2.5738, "step": 160 }, { "epoch": 0.02, "grad_norm": 99.0, "learning_rate": 9.965972754380461e-07, "loss": 2.6165, "step": 180 }, { "epoch": 0.02, "grad_norm": 62.125, "learning_rate": 9.962191319072406e-07, "loss": 2.5547, "step": 200 }, { "epoch": 0.02, "grad_norm": 60.625, "learning_rate": 9.95840988376435e-07, "loss": 2.4761, "step": 220 }, { "epoch": 0.02, "grad_norm": 59.625, "learning_rate": 9.954628448456296e-07, "loss": 2.4663, "step": 240 }, { "epoch": 0.02, "grad_norm": 75.75, "learning_rate": 9.950847013148238e-07, "loss": 2.5917, "step": 260 }, { "epoch": 0.03, "grad_norm": 96.0, "learning_rate": 9.947065577840183e-07, "loss": 2.4079, "step": 280 }, { "epoch": 0.03, "grad_norm": 85.25, "learning_rate": 9.943284142532128e-07, "loss": 2.4535, "step": 300 }, { "epoch": 0.03, "grad_norm": 48.5625, "learning_rate": 9.939502707224072e-07, "loss": 2.4386, "step": 320 }, { "epoch": 0.03, "grad_norm": 53.46875, "learning_rate": 9.935721271916017e-07, "loss": 2.4755, "step": 340 }, { "epoch": 0.03, "grad_norm": 69.5, "learning_rate": 9.931939836607962e-07, "loss": 2.3306, "step": 360 }, { "epoch": 0.04, "grad_norm": 45.09375, "learning_rate": 9.928158401299905e-07, "loss": 2.3735, "step": 380 }, { "epoch": 0.04, "grad_norm": 60.90625, "learning_rate": 9.92437696599185e-07, "loss": 2.3466, "step": 400 }, { "epoch": 0.04, "grad_norm": 55.34375, "learning_rate": 9.920595530683794e-07, "loss": 2.3971, "step": 420 }, { "epoch": 0.04, "grad_norm": 49.375, "learning_rate": 9.916814095375739e-07, "loss": 2.366, "step": 440 }, { "epoch": 0.04, "grad_norm": 49.15625, "learning_rate": 9.913032660067684e-07, "loss": 2.3549, "step": 460 }, { "epoch": 0.05, "grad_norm": 66.75, "learning_rate": 9.909251224759626e-07, "loss": 2.23, "step": 480 }, { "epoch": 0.05, "grad_norm": 66.8125, "learning_rate": 9.905469789451573e-07, "loss": 2.4057, "step": 500 }, { "epoch": 0.05, "grad_norm": 60.375, "learning_rate": 9.901688354143518e-07, "loss": 2.339, "step": 520 }, { "epoch": 0.05, "grad_norm": 56.65625, "learning_rate": 9.89790691883546e-07, "loss": 2.3804, "step": 540 }, { "epoch": 0.05, "grad_norm": 78.375, "learning_rate": 9.894125483527405e-07, "loss": 2.2699, "step": 560 }, { "epoch": 0.05, "grad_norm": 65.4375, "learning_rate": 9.89034404821935e-07, "loss": 2.4262, "step": 580 }, { "epoch": 0.06, "grad_norm": 55.96875, "learning_rate": 9.886562612911295e-07, "loss": 2.3386, "step": 600 }, { "epoch": 0.06, "grad_norm": 71.0625, "learning_rate": 9.88278117760324e-07, "loss": 2.3846, "step": 620 }, { "epoch": 0.06, "grad_norm": 61.75, "learning_rate": 9.878999742295184e-07, "loss": 2.2632, "step": 640 }, { "epoch": 0.06, "grad_norm": 47.59375, "learning_rate": 9.875218306987127e-07, "loss": 2.2826, "step": 660 }, { "epoch": 0.06, "grad_norm": 68.75, "learning_rate": 9.871436871679071e-07, "loss": 2.3458, "step": 680 }, { "epoch": 0.07, "grad_norm": 51.0625, "learning_rate": 9.867655436371016e-07, "loss": 2.2185, "step": 700 }, { "epoch": 0.07, "grad_norm": 63.71875, "learning_rate": 9.86387400106296e-07, "loss": 2.2608, "step": 720 }, { "epoch": 0.07, "grad_norm": 78.375, "learning_rate": 9.860092565754906e-07, "loss": 2.3038, "step": 740 }, { "epoch": 0.07, "grad_norm": 53.25, "learning_rate": 9.856311130446848e-07, "loss": 2.2971, "step": 760 }, { "epoch": 0.07, "grad_norm": 61.84375, "learning_rate": 9.852529695138793e-07, "loss": 2.1458, "step": 780 }, { "epoch": 0.08, "grad_norm": 58.78125, "learning_rate": 9.848748259830738e-07, "loss": 2.2228, "step": 800 }, { "epoch": 0.08, "grad_norm": 48.90625, "learning_rate": 9.844966824522683e-07, "loss": 2.3179, "step": 820 }, { "epoch": 0.08, "grad_norm": 70.625, "learning_rate": 9.841185389214627e-07, "loss": 2.1868, "step": 840 }, { "epoch": 0.08, "grad_norm": 64.125, "learning_rate": 9.837403953906572e-07, "loss": 2.2075, "step": 860 }, { "epoch": 0.08, "grad_norm": 68.0, "learning_rate": 9.833622518598517e-07, "loss": 2.2674, "step": 880 }, { "epoch": 0.09, "grad_norm": 70.3125, "learning_rate": 9.829841083290462e-07, "loss": 2.3387, "step": 900 }, { "epoch": 0.09, "grad_norm": 58.5, "learning_rate": 9.826059647982406e-07, "loss": 2.2466, "step": 920 }, { "epoch": 0.09, "grad_norm": 110.9375, "learning_rate": 9.822278212674349e-07, "loss": 2.1914, "step": 940 }, { "epoch": 0.09, "grad_norm": 60.96875, "learning_rate": 9.818496777366294e-07, "loss": 2.1054, "step": 960 }, { "epoch": 0.09, "grad_norm": 73.1875, "learning_rate": 9.814715342058238e-07, "loss": 2.2398, "step": 980 }, { "epoch": 0.09, "grad_norm": 51.5, "learning_rate": 9.810933906750183e-07, "loss": 2.2347, "step": 1000 }, { "epoch": 0.1, "grad_norm": 79.125, "learning_rate": 9.807152471442128e-07, "loss": 2.2701, "step": 1020 }, { "epoch": 0.1, "grad_norm": 63.40625, "learning_rate": 9.803371036134073e-07, "loss": 2.3747, "step": 1040 }, { "epoch": 0.1, "grad_norm": 71.75, "learning_rate": 9.799589600826015e-07, "loss": 2.3053, "step": 1060 }, { "epoch": 0.1, "grad_norm": 84.0625, "learning_rate": 9.79580816551796e-07, "loss": 2.2083, "step": 1080 }, { "epoch": 0.1, "grad_norm": 58.5625, "learning_rate": 9.792026730209905e-07, "loss": 2.301, "step": 1100 }, { "epoch": 0.11, "grad_norm": 61.59375, "learning_rate": 9.78824529490185e-07, "loss": 2.1949, "step": 1120 }, { "epoch": 0.11, "grad_norm": 53.0625, "learning_rate": 9.784463859593794e-07, "loss": 2.1185, "step": 1140 }, { "epoch": 0.11, "grad_norm": 63.4375, "learning_rate": 9.780682424285737e-07, "loss": 2.2003, "step": 1160 }, { "epoch": 0.11, "grad_norm": 72.4375, "learning_rate": 9.776900988977682e-07, "loss": 2.1994, "step": 1180 }, { "epoch": 0.11, "grad_norm": 74.375, "learning_rate": 9.773119553669628e-07, "loss": 2.1708, "step": 1200 }, { "epoch": 0.12, "grad_norm": 47.75, "learning_rate": 9.76933811836157e-07, "loss": 2.1373, "step": 1220 }, { "epoch": 0.12, "grad_norm": 88.375, "learning_rate": 9.765556683053516e-07, "loss": 2.1869, "step": 1240 }, { "epoch": 0.12, "grad_norm": 162.75, "learning_rate": 9.76177524774546e-07, "loss": 2.3998, "step": 1260 }, { "epoch": 0.12, "grad_norm": 77.875, "learning_rate": 9.757993812437405e-07, "loss": 2.1531, "step": 1280 }, { "epoch": 0.12, "grad_norm": 80.75, "learning_rate": 9.75421237712935e-07, "loss": 2.1803, "step": 1300 }, { "epoch": 0.12, "grad_norm": 78.25, "learning_rate": 9.750430941821295e-07, "loss": 2.2272, "step": 1320 }, { "epoch": 0.13, "grad_norm": 62.40625, "learning_rate": 9.746649506513237e-07, "loss": 2.3047, "step": 1340 }, { "epoch": 0.13, "grad_norm": 92.8125, "learning_rate": 9.742868071205182e-07, "loss": 2.2091, "step": 1360 }, { "epoch": 0.13, "grad_norm": 77.0625, "learning_rate": 9.739086635897127e-07, "loss": 2.3365, "step": 1380 }, { "epoch": 0.13, "grad_norm": 67.0, "learning_rate": 9.735305200589072e-07, "loss": 2.1193, "step": 1400 }, { "epoch": 0.13, "grad_norm": 67.4375, "learning_rate": 9.731523765281016e-07, "loss": 2.1237, "step": 1420 }, { "epoch": 0.14, "grad_norm": 51.84375, "learning_rate": 9.72774232997296e-07, "loss": 2.1907, "step": 1440 }, { "epoch": 0.14, "grad_norm": 73.375, "learning_rate": 9.723960894664904e-07, "loss": 2.2086, "step": 1460 }, { "epoch": 0.14, "grad_norm": 56.625, "learning_rate": 9.720179459356848e-07, "loss": 2.232, "step": 1480 }, { "epoch": 0.14, "grad_norm": 67.875, "learning_rate": 9.716398024048793e-07, "loss": 2.236, "step": 1500 }, { "epoch": 0.14, "grad_norm": 85.6875, "learning_rate": 9.712616588740738e-07, "loss": 2.2241, "step": 1520 }, { "epoch": 0.15, "grad_norm": 64.5, "learning_rate": 9.708835153432683e-07, "loss": 2.1892, "step": 1540 }, { "epoch": 0.15, "grad_norm": 56.46875, "learning_rate": 9.705053718124627e-07, "loss": 2.2541, "step": 1560 }, { "epoch": 0.15, "grad_norm": 78.125, "learning_rate": 9.701272282816572e-07, "loss": 2.1645, "step": 1580 }, { "epoch": 0.15, "grad_norm": 66.5625, "learning_rate": 9.697490847508517e-07, "loss": 2.1083, "step": 1600 }, { "epoch": 0.15, "grad_norm": 70.375, "learning_rate": 9.69370941220046e-07, "loss": 2.1757, "step": 1620 }, { "epoch": 0.16, "grad_norm": 55.9375, "learning_rate": 9.689927976892404e-07, "loss": 2.1102, "step": 1640 }, { "epoch": 0.16, "grad_norm": 106.4375, "learning_rate": 9.68614654158435e-07, "loss": 2.1047, "step": 1660 }, { "epoch": 0.16, "grad_norm": 60.59375, "learning_rate": 9.682365106276294e-07, "loss": 2.2789, "step": 1680 }, { "epoch": 0.16, "grad_norm": 63.8125, "learning_rate": 9.678583670968239e-07, "loss": 2.1161, "step": 1700 }, { "epoch": 0.16, "grad_norm": 80.4375, "learning_rate": 9.674802235660181e-07, "loss": 2.2273, "step": 1720 }, { "epoch": 0.16, "grad_norm": 58.28125, "learning_rate": 9.671020800352126e-07, "loss": 2.1584, "step": 1740 }, { "epoch": 0.17, "grad_norm": 60.59375, "learning_rate": 9.66723936504407e-07, "loss": 2.159, "step": 1760 }, { "epoch": 0.17, "grad_norm": 77.25, "learning_rate": 9.663457929736015e-07, "loss": 2.1554, "step": 1780 }, { "epoch": 0.17, "grad_norm": 62.125, "learning_rate": 9.65967649442796e-07, "loss": 2.3024, "step": 1800 }, { "epoch": 0.17, "grad_norm": 47.9375, "learning_rate": 9.655895059119905e-07, "loss": 2.1416, "step": 1820 }, { "epoch": 0.17, "grad_norm": 78.875, "learning_rate": 9.652113623811847e-07, "loss": 2.2613, "step": 1840 }, { "epoch": 0.18, "grad_norm": 71.5, "learning_rate": 9.648332188503792e-07, "loss": 2.1109, "step": 1860 }, { "epoch": 0.18, "grad_norm": 42.03125, "learning_rate": 9.644550753195737e-07, "loss": 2.0072, "step": 1880 }, { "epoch": 0.18, "grad_norm": 47.875, "learning_rate": 9.640769317887682e-07, "loss": 2.1403, "step": 1900 }, { "epoch": 0.18, "grad_norm": 70.25, "learning_rate": 9.636987882579626e-07, "loss": 2.1649, "step": 1920 }, { "epoch": 0.18, "grad_norm": 52.15625, "learning_rate": 9.633206447271571e-07, "loss": 2.0728, "step": 1940 }, { "epoch": 0.19, "grad_norm": 65.625, "learning_rate": 9.629425011963516e-07, "loss": 2.2163, "step": 1960 }, { "epoch": 0.19, "grad_norm": 81.125, "learning_rate": 9.62564357665546e-07, "loss": 2.3386, "step": 1980 }, { "epoch": 0.19, "grad_norm": 65.0, "learning_rate": 9.621862141347405e-07, "loss": 2.1122, "step": 2000 }, { "epoch": 0.19, "grad_norm": 62.0, "learning_rate": 9.618080706039348e-07, "loss": 2.201, "step": 2020 }, { "epoch": 0.19, "grad_norm": 76.9375, "learning_rate": 9.614299270731293e-07, "loss": 2.2634, "step": 2040 }, { "epoch": 0.19, "grad_norm": 80.9375, "learning_rate": 9.610517835423238e-07, "loss": 2.1005, "step": 2060 }, { "epoch": 0.2, "grad_norm": 74.5, "learning_rate": 9.606736400115182e-07, "loss": 2.1028, "step": 2080 }, { "epoch": 0.2, "grad_norm": 76.25, "learning_rate": 9.602954964807127e-07, "loss": 2.2869, "step": 2100 }, { "epoch": 0.2, "grad_norm": 59.3125, "learning_rate": 9.59917352949907e-07, "loss": 2.1469, "step": 2120 }, { "epoch": 0.2, "grad_norm": 64.8125, "learning_rate": 9.595392094191014e-07, "loss": 2.084, "step": 2140 }, { "epoch": 0.2, "grad_norm": 73.125, "learning_rate": 9.59161065888296e-07, "loss": 2.1673, "step": 2160 }, { "epoch": 0.21, "grad_norm": 161.0, "learning_rate": 9.587829223574904e-07, "loss": 2.0762, "step": 2180 }, { "epoch": 0.21, "grad_norm": 62.0, "learning_rate": 9.584047788266849e-07, "loss": 2.0542, "step": 2200 }, { "epoch": 0.21, "grad_norm": 101.0, "learning_rate": 9.580266352958793e-07, "loss": 2.0866, "step": 2220 }, { "epoch": 0.21, "grad_norm": 66.0, "learning_rate": 9.576484917650736e-07, "loss": 2.1058, "step": 2240 }, { "epoch": 0.21, "grad_norm": 51.21875, "learning_rate": 9.572703482342683e-07, "loss": 2.18, "step": 2260 }, { "epoch": 0.22, "grad_norm": 131.125, "learning_rate": 9.568922047034628e-07, "loss": 2.0921, "step": 2280 }, { "epoch": 0.22, "grad_norm": 68.8125, "learning_rate": 9.56514061172657e-07, "loss": 2.1603, "step": 2300 }, { "epoch": 0.22, "grad_norm": 106.375, "learning_rate": 9.561359176418515e-07, "loss": 2.1403, "step": 2320 }, { "epoch": 0.22, "grad_norm": 68.1875, "learning_rate": 9.55757774111046e-07, "loss": 2.2704, "step": 2340 }, { "epoch": 0.22, "grad_norm": 83.6875, "learning_rate": 9.553796305802404e-07, "loss": 2.2066, "step": 2360 }, { "epoch": 0.22, "grad_norm": 75.9375, "learning_rate": 9.55001487049435e-07, "loss": 2.1317, "step": 2380 }, { "epoch": 0.23, "grad_norm": 84.875, "learning_rate": 9.546233435186292e-07, "loss": 2.1652, "step": 2400 }, { "epoch": 0.23, "grad_norm": 58.65625, "learning_rate": 9.542451999878237e-07, "loss": 2.1021, "step": 2420 }, { "epoch": 0.23, "grad_norm": 125.0, "learning_rate": 9.538670564570181e-07, "loss": 2.1924, "step": 2440 }, { "epoch": 0.23, "grad_norm": 56.96875, "learning_rate": 9.534889129262126e-07, "loss": 2.1115, "step": 2460 }, { "epoch": 0.23, "grad_norm": 56.03125, "learning_rate": 9.53110769395407e-07, "loss": 2.0641, "step": 2480 }, { "epoch": 0.24, "grad_norm": 83.625, "learning_rate": 9.527326258646014e-07, "loss": 2.0102, "step": 2500 }, { "epoch": 0.24, "grad_norm": 66.125, "learning_rate": 9.523544823337959e-07, "loss": 2.0708, "step": 2520 }, { "epoch": 0.24, "grad_norm": 75.75, "learning_rate": 9.519763388029903e-07, "loss": 2.0818, "step": 2540 }, { "epoch": 0.24, "grad_norm": 128.0, "learning_rate": 9.515981952721848e-07, "loss": 2.1258, "step": 2560 }, { "epoch": 0.24, "grad_norm": 75.8125, "learning_rate": 9.512200517413792e-07, "loss": 2.1426, "step": 2580 }, { "epoch": 0.25, "grad_norm": 63.875, "learning_rate": 9.508419082105737e-07, "loss": 2.1111, "step": 2600 }, { "epoch": 0.25, "grad_norm": 58.75, "learning_rate": 9.504637646797682e-07, "loss": 2.0416, "step": 2620 }, { "epoch": 0.25, "grad_norm": 55.0625, "learning_rate": 9.500856211489627e-07, "loss": 2.0279, "step": 2640 }, { "epoch": 0.25, "grad_norm": 56.96875, "learning_rate": 9.49707477618157e-07, "loss": 2.0627, "step": 2660 }, { "epoch": 0.25, "grad_norm": 55.75, "learning_rate": 9.493293340873515e-07, "loss": 2.0918, "step": 2680 }, { "epoch": 0.26, "grad_norm": 62.15625, "learning_rate": 9.48951190556546e-07, "loss": 2.1226, "step": 2700 }, { "epoch": 0.26, "grad_norm": 75.75, "learning_rate": 9.485730470257403e-07, "loss": 2.0557, "step": 2720 }, { "epoch": 0.26, "grad_norm": 82.25, "learning_rate": 9.481949034949348e-07, "loss": 2.1832, "step": 2740 }, { "epoch": 0.26, "grad_norm": 54.5625, "learning_rate": 9.478167599641293e-07, "loss": 2.0379, "step": 2760 }, { "epoch": 0.26, "grad_norm": 54.21875, "learning_rate": 9.474386164333237e-07, "loss": 2.0732, "step": 2780 }, { "epoch": 0.26, "grad_norm": 61.03125, "learning_rate": 9.470604729025181e-07, "loss": 2.1374, "step": 2800 }, { "epoch": 0.27, "grad_norm": 94.0625, "learning_rate": 9.466823293717125e-07, "loss": 2.0421, "step": 2820 }, { "epoch": 0.27, "grad_norm": 55.40625, "learning_rate": 9.46304185840907e-07, "loss": 2.0783, "step": 2840 }, { "epoch": 0.27, "grad_norm": 117.6875, "learning_rate": 9.459260423101015e-07, "loss": 2.1041, "step": 2860 }, { "epoch": 0.27, "grad_norm": 70.875, "learning_rate": 9.455478987792958e-07, "loss": 2.1819, "step": 2880 }, { "epoch": 0.27, "grad_norm": 90.3125, "learning_rate": 9.451697552484903e-07, "loss": 2.1047, "step": 2900 }, { "epoch": 0.28, "grad_norm": 51.5625, "learning_rate": 9.447916117176848e-07, "loss": 2.1001, "step": 2920 }, { "epoch": 0.28, "grad_norm": 77.6875, "learning_rate": 9.444134681868792e-07, "loss": 2.0817, "step": 2940 }, { "epoch": 0.28, "grad_norm": 101.1875, "learning_rate": 9.440353246560737e-07, "loss": 2.1162, "step": 2960 }, { "epoch": 0.28, "grad_norm": 83.375, "learning_rate": 9.436571811252682e-07, "loss": 2.1422, "step": 2980 }, { "epoch": 0.28, "grad_norm": 89.8125, "learning_rate": 9.432790375944626e-07, "loss": 2.1404, "step": 3000 }, { "epoch": 0.29, "grad_norm": 50.78125, "learning_rate": 9.42900894063657e-07, "loss": 2.0371, "step": 3020 }, { "epoch": 0.29, "grad_norm": 57.25, "learning_rate": 9.425227505328515e-07, "loss": 2.0327, "step": 3040 }, { "epoch": 0.29, "grad_norm": 112.375, "learning_rate": 9.421446070020459e-07, "loss": 2.1609, "step": 3060 }, { "epoch": 0.29, "grad_norm": 76.3125, "learning_rate": 9.417664634712404e-07, "loss": 2.1167, "step": 3080 }, { "epoch": 0.29, "grad_norm": 63.6875, "learning_rate": 9.413883199404348e-07, "loss": 2.023, "step": 3100 }, { "epoch": 0.29, "grad_norm": 46.6875, "learning_rate": 9.410101764096292e-07, "loss": 2.0556, "step": 3120 }, { "epoch": 0.3, "grad_norm": 84.9375, "learning_rate": 9.406320328788237e-07, "loss": 2.0876, "step": 3140 }, { "epoch": 0.3, "grad_norm": 107.5, "learning_rate": 9.40253889348018e-07, "loss": 2.087, "step": 3160 }, { "epoch": 0.3, "grad_norm": 63.4375, "learning_rate": 9.398757458172125e-07, "loss": 2.1845, "step": 3180 }, { "epoch": 0.3, "grad_norm": 52.9375, "learning_rate": 9.39497602286407e-07, "loss": 2.1219, "step": 3200 }, { "epoch": 0.3, "grad_norm": 72.1875, "learning_rate": 9.391194587556014e-07, "loss": 2.0844, "step": 3220 }, { "epoch": 0.31, "grad_norm": 96.875, "learning_rate": 9.387413152247958e-07, "loss": 2.1343, "step": 3240 }, { "epoch": 0.31, "grad_norm": 64.375, "learning_rate": 9.383631716939903e-07, "loss": 2.0837, "step": 3260 }, { "epoch": 0.31, "grad_norm": 82.0625, "learning_rate": 9.379850281631847e-07, "loss": 2.1041, "step": 3280 }, { "epoch": 0.31, "grad_norm": 70.3125, "learning_rate": 9.376068846323792e-07, "loss": 2.1016, "step": 3300 }, { "epoch": 0.31, "grad_norm": 70.125, "learning_rate": 9.372287411015737e-07, "loss": 2.1038, "step": 3320 }, { "epoch": 0.32, "grad_norm": 73.6875, "learning_rate": 9.368505975707681e-07, "loss": 2.1319, "step": 3340 }, { "epoch": 0.32, "grad_norm": 79.4375, "learning_rate": 9.364724540399626e-07, "loss": 1.9609, "step": 3360 }, { "epoch": 0.32, "grad_norm": 42.53125, "learning_rate": 9.36094310509157e-07, "loss": 2.1016, "step": 3380 }, { "epoch": 0.32, "grad_norm": 112.5, "learning_rate": 9.357161669783514e-07, "loss": 2.1199, "step": 3400 }, { "epoch": 0.32, "grad_norm": 65.625, "learning_rate": 9.353380234475459e-07, "loss": 2.0902, "step": 3420 }, { "epoch": 0.33, "grad_norm": 138.875, "learning_rate": 9.349598799167403e-07, "loss": 2.0669, "step": 3440 }, { "epoch": 0.33, "grad_norm": 83.25, "learning_rate": 9.345817363859347e-07, "loss": 2.1091, "step": 3460 }, { "epoch": 0.33, "grad_norm": 115.3125, "learning_rate": 9.342035928551292e-07, "loss": 2.0353, "step": 3480 }, { "epoch": 0.33, "grad_norm": 62.71875, "learning_rate": 9.338254493243236e-07, "loss": 2.0954, "step": 3500 }, { "epoch": 0.33, "grad_norm": 79.1875, "learning_rate": 9.33447305793518e-07, "loss": 2.096, "step": 3520 }, { "epoch": 0.33, "grad_norm": 158.0, "learning_rate": 9.330691622627125e-07, "loss": 2.1737, "step": 3540 }, { "epoch": 0.34, "grad_norm": 57.625, "learning_rate": 9.326910187319069e-07, "loss": 2.1029, "step": 3560 }, { "epoch": 0.34, "grad_norm": 86.3125, "learning_rate": 9.323128752011014e-07, "loss": 2.1356, "step": 3580 }, { "epoch": 0.34, "grad_norm": 93.3125, "learning_rate": 9.319347316702958e-07, "loss": 2.1022, "step": 3600 }, { "epoch": 0.34, "grad_norm": 70.0625, "learning_rate": 9.315565881394902e-07, "loss": 2.0402, "step": 3620 }, { "epoch": 0.34, "grad_norm": 68.6875, "learning_rate": 9.311784446086848e-07, "loss": 2.1377, "step": 3640 }, { "epoch": 0.35, "grad_norm": 52.15625, "learning_rate": 9.308003010778793e-07, "loss": 2.0708, "step": 3660 }, { "epoch": 0.35, "grad_norm": 71.625, "learning_rate": 9.304221575470736e-07, "loss": 2.1001, "step": 3680 }, { "epoch": 0.35, "grad_norm": 62.78125, "learning_rate": 9.300440140162681e-07, "loss": 2.1044, "step": 3700 }, { "epoch": 0.35, "grad_norm": 98.375, "learning_rate": 9.296658704854626e-07, "loss": 2.0937, "step": 3720 }, { "epoch": 0.35, "grad_norm": 80.875, "learning_rate": 9.292877269546569e-07, "loss": 2.1337, "step": 3740 }, { "epoch": 0.36, "grad_norm": 60.9375, "learning_rate": 9.289095834238514e-07, "loss": 2.066, "step": 3760 }, { "epoch": 0.36, "grad_norm": 58.8125, "learning_rate": 9.285314398930458e-07, "loss": 2.0683, "step": 3780 }, { "epoch": 0.36, "grad_norm": 78.125, "learning_rate": 9.281532963622403e-07, "loss": 1.9853, "step": 3800 }, { "epoch": 0.36, "grad_norm": 79.6875, "learning_rate": 9.277751528314347e-07, "loss": 2.0792, "step": 3820 }, { "epoch": 0.36, "grad_norm": 71.0625, "learning_rate": 9.273970093006291e-07, "loss": 2.0544, "step": 3840 }, { "epoch": 0.36, "grad_norm": 45.03125, "learning_rate": 9.270188657698236e-07, "loss": 2.0586, "step": 3860 }, { "epoch": 0.37, "grad_norm": 50.3125, "learning_rate": 9.26640722239018e-07, "loss": 2.0716, "step": 3880 }, { "epoch": 0.37, "grad_norm": 67.125, "learning_rate": 9.262625787082124e-07, "loss": 2.0274, "step": 3900 }, { "epoch": 0.37, "grad_norm": 51.84375, "learning_rate": 9.258844351774069e-07, "loss": 2.02, "step": 3920 }, { "epoch": 0.37, "grad_norm": 56.71875, "learning_rate": 9.255062916466014e-07, "loss": 2.1464, "step": 3940 }, { "epoch": 0.37, "grad_norm": 63.34375, "learning_rate": 9.251281481157957e-07, "loss": 2.08, "step": 3960 }, { "epoch": 0.38, "grad_norm": 98.5625, "learning_rate": 9.247500045849902e-07, "loss": 2.1212, "step": 3980 }, { "epoch": 0.38, "grad_norm": 72.75, "learning_rate": 9.243718610541848e-07, "loss": 2.1505, "step": 4000 }, { "epoch": 0.38, "grad_norm": 65.625, "learning_rate": 9.239937175233792e-07, "loss": 2.0742, "step": 4020 }, { "epoch": 0.38, "grad_norm": 90.625, "learning_rate": 9.236155739925736e-07, "loss": 2.0463, "step": 4040 }, { "epoch": 0.38, "grad_norm": 78.1875, "learning_rate": 9.23237430461768e-07, "loss": 2.0287, "step": 4060 }, { "epoch": 0.39, "grad_norm": 51.875, "learning_rate": 9.228592869309625e-07, "loss": 2.0933, "step": 4080 }, { "epoch": 0.39, "grad_norm": 56.78125, "learning_rate": 9.22481143400157e-07, "loss": 2.001, "step": 4100 }, { "epoch": 0.39, "grad_norm": 51.90625, "learning_rate": 9.221029998693513e-07, "loss": 1.9934, "step": 4120 }, { "epoch": 0.39, "grad_norm": 55.78125, "learning_rate": 9.217248563385458e-07, "loss": 2.0693, "step": 4140 }, { "epoch": 0.39, "grad_norm": 103.0625, "learning_rate": 9.213467128077403e-07, "loss": 2.0205, "step": 4160 }, { "epoch": 0.4, "grad_norm": 54.375, "learning_rate": 9.209685692769346e-07, "loss": 2.0219, "step": 4180 }, { "epoch": 0.4, "grad_norm": 65.6875, "learning_rate": 9.205904257461291e-07, "loss": 2.1043, "step": 4200 }, { "epoch": 0.4, "grad_norm": 70.8125, "learning_rate": 9.202122822153236e-07, "loss": 2.1014, "step": 4220 }, { "epoch": 0.4, "grad_norm": 73.5625, "learning_rate": 9.19834138684518e-07, "loss": 1.9973, "step": 4240 }, { "epoch": 0.4, "grad_norm": 61.375, "learning_rate": 9.194559951537124e-07, "loss": 2.0508, "step": 4260 }, { "epoch": 0.4, "grad_norm": 53.6875, "learning_rate": 9.190778516229069e-07, "loss": 2.0495, "step": 4280 }, { "epoch": 0.41, "grad_norm": 97.5, "learning_rate": 9.186997080921013e-07, "loss": 2.1024, "step": 4300 }, { "epoch": 0.41, "grad_norm": 80.3125, "learning_rate": 9.183215645612957e-07, "loss": 2.1573, "step": 4320 }, { "epoch": 0.41, "grad_norm": 57.875, "learning_rate": 9.179434210304903e-07, "loss": 2.0675, "step": 4340 }, { "epoch": 0.41, "grad_norm": 68.1875, "learning_rate": 9.175652774996847e-07, "loss": 2.0606, "step": 4360 }, { "epoch": 0.41, "grad_norm": 79.8125, "learning_rate": 9.171871339688792e-07, "loss": 1.9876, "step": 4380 }, { "epoch": 0.42, "grad_norm": 79.0, "learning_rate": 9.168089904380735e-07, "loss": 2.0817, "step": 4400 }, { "epoch": 0.42, "grad_norm": 78.0, "learning_rate": 9.16430846907268e-07, "loss": 2.0678, "step": 4420 }, { "epoch": 0.42, "grad_norm": 90.5625, "learning_rate": 9.160527033764625e-07, "loss": 2.1116, "step": 4440 }, { "epoch": 0.42, "grad_norm": 103.4375, "learning_rate": 9.156745598456569e-07, "loss": 2.1465, "step": 4460 }, { "epoch": 0.42, "grad_norm": 91.9375, "learning_rate": 9.152964163148513e-07, "loss": 2.2426, "step": 4480 }, { "epoch": 0.43, "grad_norm": 80.125, "learning_rate": 9.149182727840458e-07, "loss": 2.071, "step": 4500 }, { "epoch": 0.43, "grad_norm": 36.9375, "learning_rate": 9.145401292532402e-07, "loss": 2.0632, "step": 4520 }, { "epoch": 0.43, "grad_norm": 59.96875, "learning_rate": 9.141619857224346e-07, "loss": 2.0746, "step": 4540 }, { "epoch": 0.43, "grad_norm": 56.65625, "learning_rate": 9.137838421916291e-07, "loss": 1.9805, "step": 4560 }, { "epoch": 0.43, "grad_norm": 48.625, "learning_rate": 9.134056986608235e-07, "loss": 2.0773, "step": 4580 }, { "epoch": 0.43, "grad_norm": 70.6875, "learning_rate": 9.13027555130018e-07, "loss": 2.1277, "step": 4600 }, { "epoch": 0.44, "grad_norm": 56.0625, "learning_rate": 9.126494115992124e-07, "loss": 2.0332, "step": 4620 }, { "epoch": 0.44, "grad_norm": 52.90625, "learning_rate": 9.122712680684068e-07, "loss": 2.0042, "step": 4640 }, { "epoch": 0.44, "grad_norm": 64.6875, "learning_rate": 9.118931245376013e-07, "loss": 2.0929, "step": 4660 }, { "epoch": 0.44, "grad_norm": 46.0, "learning_rate": 9.115149810067956e-07, "loss": 2.1159, "step": 4680 }, { "epoch": 0.44, "grad_norm": 61.53125, "learning_rate": 9.111368374759902e-07, "loss": 2.1719, "step": 4700 }, { "epoch": 0.45, "grad_norm": 69.625, "learning_rate": 9.107586939451847e-07, "loss": 2.0822, "step": 4720 }, { "epoch": 0.45, "grad_norm": 83.5, "learning_rate": 9.103805504143791e-07, "loss": 2.0584, "step": 4740 }, { "epoch": 0.45, "grad_norm": 81.8125, "learning_rate": 9.100024068835735e-07, "loss": 2.0803, "step": 4760 }, { "epoch": 0.45, "grad_norm": 81.9375, "learning_rate": 9.09624263352768e-07, "loss": 2.0664, "step": 4780 }, { "epoch": 0.45, "grad_norm": 97.0625, "learning_rate": 9.092461198219624e-07, "loss": 2.0345, "step": 4800 }, { "epoch": 0.46, "grad_norm": 63.03125, "learning_rate": 9.088679762911569e-07, "loss": 2.1597, "step": 4820 }, { "epoch": 0.46, "grad_norm": 73.4375, "learning_rate": 9.084898327603513e-07, "loss": 2.0724, "step": 4840 }, { "epoch": 0.46, "grad_norm": 67.625, "learning_rate": 9.081116892295457e-07, "loss": 1.9974, "step": 4860 }, { "epoch": 0.46, "grad_norm": 62.15625, "learning_rate": 9.077335456987402e-07, "loss": 2.0378, "step": 4880 }, { "epoch": 0.46, "grad_norm": 78.8125, "learning_rate": 9.073554021679346e-07, "loss": 2.1548, "step": 4900 }, { "epoch": 0.47, "grad_norm": 52.6875, "learning_rate": 9.06977258637129e-07, "loss": 2.0852, "step": 4920 }, { "epoch": 0.47, "grad_norm": 53.6875, "learning_rate": 9.065991151063235e-07, "loss": 2.147, "step": 4940 }, { "epoch": 0.47, "grad_norm": 72.0625, "learning_rate": 9.062209715755179e-07, "loss": 1.944, "step": 4960 }, { "epoch": 0.47, "grad_norm": 112.125, "learning_rate": 9.058428280447123e-07, "loss": 2.0855, "step": 4980 }, { "epoch": 0.47, "grad_norm": 91.3125, "learning_rate": 9.054646845139068e-07, "loss": 1.9843, "step": 5000 }, { "epoch": 0.47, "grad_norm": 101.75, "learning_rate": 9.050865409831012e-07, "loss": 2.1488, "step": 5020 }, { "epoch": 0.48, "grad_norm": 61.4375, "learning_rate": 9.047083974522958e-07, "loss": 2.0525, "step": 5040 }, { "epoch": 0.48, "grad_norm": 67.5625, "learning_rate": 9.043302539214902e-07, "loss": 2.105, "step": 5060 }, { "epoch": 0.48, "grad_norm": 86.25, "learning_rate": 9.039521103906846e-07, "loss": 2.0717, "step": 5080 }, { "epoch": 0.48, "grad_norm": 64.3125, "learning_rate": 9.035739668598791e-07, "loss": 2.1015, "step": 5100 }, { "epoch": 0.48, "grad_norm": 166.625, "learning_rate": 9.031958233290735e-07, "loss": 2.134, "step": 5120 }, { "epoch": 0.49, "grad_norm": 68.4375, "learning_rate": 9.028176797982679e-07, "loss": 2.0938, "step": 5140 }, { "epoch": 0.49, "grad_norm": 77.4375, "learning_rate": 9.024395362674624e-07, "loss": 2.0848, "step": 5160 }, { "epoch": 0.49, "grad_norm": 102.1875, "learning_rate": 9.020613927366569e-07, "loss": 1.9225, "step": 5180 }, { "epoch": 0.49, "grad_norm": 68.5625, "learning_rate": 9.016832492058512e-07, "loss": 2.0038, "step": 5200 }, { "epoch": 0.49, "grad_norm": 55.09375, "learning_rate": 9.013051056750457e-07, "loss": 2.1238, "step": 5220 }, { "epoch": 0.5, "grad_norm": 65.75, "learning_rate": 9.009269621442402e-07, "loss": 2.0999, "step": 5240 }, { "epoch": 0.5, "grad_norm": 62.28125, "learning_rate": 9.005488186134345e-07, "loss": 2.0663, "step": 5260 }, { "epoch": 0.5, "grad_norm": 96.3125, "learning_rate": 9.00170675082629e-07, "loss": 2.06, "step": 5280 }, { "epoch": 0.5, "grad_norm": 82.5625, "learning_rate": 8.997925315518234e-07, "loss": 2.1223, "step": 5300 }, { "epoch": 0.5, "grad_norm": 116.3125, "learning_rate": 8.994143880210179e-07, "loss": 2.0237, "step": 5320 }, { "epoch": 0.5, "grad_norm": 63.875, "learning_rate": 8.990362444902123e-07, "loss": 2.0965, "step": 5340 }, { "epoch": 0.51, "grad_norm": 51.78125, "learning_rate": 8.986581009594067e-07, "loss": 2.0365, "step": 5360 }, { "epoch": 0.51, "grad_norm": 67.25, "learning_rate": 8.982799574286012e-07, "loss": 2.1012, "step": 5380 }, { "epoch": 0.51, "grad_norm": 69.3125, "learning_rate": 8.979018138977958e-07, "loss": 1.9336, "step": 5400 }, { "epoch": 0.51, "grad_norm": 127.125, "learning_rate": 8.975236703669901e-07, "loss": 2.1657, "step": 5420 }, { "epoch": 0.51, "grad_norm": 77.4375, "learning_rate": 8.971455268361846e-07, "loss": 1.9824, "step": 5440 }, { "epoch": 0.52, "grad_norm": 51.28125, "learning_rate": 8.967673833053791e-07, "loss": 2.1134, "step": 5460 }, { "epoch": 0.52, "grad_norm": 82.4375, "learning_rate": 8.963892397745734e-07, "loss": 2.0806, "step": 5480 }, { "epoch": 0.52, "grad_norm": 155.875, "learning_rate": 8.960110962437679e-07, "loss": 2.13, "step": 5500 }, { "epoch": 0.52, "grad_norm": 49.0625, "learning_rate": 8.956329527129624e-07, "loss": 2.1733, "step": 5520 }, { "epoch": 0.52, "grad_norm": 72.75, "learning_rate": 8.952548091821568e-07, "loss": 2.1312, "step": 5540 }, { "epoch": 0.53, "grad_norm": 98.0, "learning_rate": 8.948766656513512e-07, "loss": 2.0926, "step": 5560 }, { "epoch": 0.53, "grad_norm": 47.75, "learning_rate": 8.944985221205457e-07, "loss": 2.0788, "step": 5580 }, { "epoch": 0.53, "grad_norm": 69.5625, "learning_rate": 8.941203785897401e-07, "loss": 2.1218, "step": 5600 }, { "epoch": 0.53, "grad_norm": 80.4375, "learning_rate": 8.937422350589346e-07, "loss": 2.1038, "step": 5620 }, { "epoch": 0.53, "grad_norm": 54.625, "learning_rate": 8.933640915281289e-07, "loss": 2.0546, "step": 5640 }, { "epoch": 0.54, "grad_norm": 64.375, "learning_rate": 8.929859479973234e-07, "loss": 2.0949, "step": 5660 }, { "epoch": 0.54, "grad_norm": 69.0, "learning_rate": 8.926078044665179e-07, "loss": 2.037, "step": 5680 }, { "epoch": 0.54, "grad_norm": 80.75, "learning_rate": 8.922296609357122e-07, "loss": 2.1122, "step": 5700 }, { "epoch": 0.54, "grad_norm": 81.75, "learning_rate": 8.918515174049067e-07, "loss": 2.0332, "step": 5720 }, { "epoch": 0.54, "grad_norm": 80.0625, "learning_rate": 8.914733738741013e-07, "loss": 1.9553, "step": 5740 }, { "epoch": 0.54, "grad_norm": 93.625, "learning_rate": 8.910952303432957e-07, "loss": 1.9639, "step": 5760 }, { "epoch": 0.55, "grad_norm": 67.875, "learning_rate": 8.907170868124901e-07, "loss": 2.0789, "step": 5780 }, { "epoch": 0.55, "grad_norm": 71.625, "learning_rate": 8.903389432816846e-07, "loss": 2.2702, "step": 5800 }, { "epoch": 0.55, "grad_norm": 81.3125, "learning_rate": 8.89960799750879e-07, "loss": 2.124, "step": 5820 }, { "epoch": 0.55, "grad_norm": 96.6875, "learning_rate": 8.895826562200735e-07, "loss": 2.0788, "step": 5840 }, { "epoch": 0.55, "grad_norm": 51.375, "learning_rate": 8.892045126892679e-07, "loss": 2.1448, "step": 5860 }, { "epoch": 0.56, "grad_norm": 69.625, "learning_rate": 8.888263691584623e-07, "loss": 2.0283, "step": 5880 }, { "epoch": 0.56, "grad_norm": 63.71875, "learning_rate": 8.884482256276568e-07, "loss": 2.0592, "step": 5900 }, { "epoch": 0.56, "grad_norm": 81.125, "learning_rate": 8.880700820968511e-07, "loss": 2.1446, "step": 5920 }, { "epoch": 0.56, "grad_norm": 55.84375, "learning_rate": 8.876919385660456e-07, "loss": 2.0625, "step": 5940 }, { "epoch": 0.56, "grad_norm": 75.375, "learning_rate": 8.873137950352401e-07, "loss": 2.0988, "step": 5960 }, { "epoch": 0.57, "grad_norm": 68.0, "learning_rate": 8.869356515044345e-07, "loss": 2.007, "step": 5980 }, { "epoch": 0.57, "grad_norm": 132.875, "learning_rate": 8.865575079736289e-07, "loss": 2.103, "step": 6000 }, { "epoch": 0.57, "grad_norm": 55.53125, "learning_rate": 8.861793644428234e-07, "loss": 1.9625, "step": 6020 }, { "epoch": 0.57, "grad_norm": 76.875, "learning_rate": 8.858012209120178e-07, "loss": 2.1369, "step": 6040 }, { "epoch": 0.57, "grad_norm": 108.0625, "learning_rate": 8.854230773812122e-07, "loss": 2.0546, "step": 6060 }, { "epoch": 0.57, "grad_norm": 199.625, "learning_rate": 8.850449338504067e-07, "loss": 2.1589, "step": 6080 }, { "epoch": 0.58, "grad_norm": 58.53125, "learning_rate": 8.846667903196012e-07, "loss": 2.0696, "step": 6100 }, { "epoch": 0.58, "grad_norm": 52.84375, "learning_rate": 8.842886467887957e-07, "loss": 2.0543, "step": 6120 }, { "epoch": 0.58, "grad_norm": 76.0625, "learning_rate": 8.839105032579901e-07, "loss": 2.0709, "step": 6140 }, { "epoch": 0.58, "grad_norm": 86.125, "learning_rate": 8.835323597271845e-07, "loss": 2.1248, "step": 6160 }, { "epoch": 0.58, "grad_norm": 66.1875, "learning_rate": 8.83154216196379e-07, "loss": 2.0078, "step": 6180 }, { "epoch": 0.59, "grad_norm": 91.875, "learning_rate": 8.827760726655735e-07, "loss": 2.1086, "step": 6200 }, { "epoch": 0.59, "grad_norm": 70.75, "learning_rate": 8.823979291347678e-07, "loss": 2.0076, "step": 6220 }, { "epoch": 0.59, "grad_norm": 58.0, "learning_rate": 8.820197856039623e-07, "loss": 1.9816, "step": 6240 }, { "epoch": 0.59, "grad_norm": 70.1875, "learning_rate": 8.816416420731567e-07, "loss": 2.0527, "step": 6260 }, { "epoch": 0.59, "grad_norm": 64.5625, "learning_rate": 8.812634985423511e-07, "loss": 2.0384, "step": 6280 }, { "epoch": 0.6, "grad_norm": 57.03125, "learning_rate": 8.808853550115456e-07, "loss": 2.091, "step": 6300 }, { "epoch": 0.6, "grad_norm": 73.0, "learning_rate": 8.8050721148074e-07, "loss": 2.0033, "step": 6320 }, { "epoch": 0.6, "grad_norm": 61.90625, "learning_rate": 8.801290679499345e-07, "loss": 2.0876, "step": 6340 }, { "epoch": 0.6, "grad_norm": 56.65625, "learning_rate": 8.797509244191289e-07, "loss": 2.1797, "step": 6360 }, { "epoch": 0.6, "grad_norm": 72.5625, "learning_rate": 8.793727808883233e-07, "loss": 2.0717, "step": 6380 }, { "epoch": 0.61, "grad_norm": 149.375, "learning_rate": 8.789946373575178e-07, "loss": 2.0591, "step": 6400 }, { "epoch": 0.61, "grad_norm": 109.8125, "learning_rate": 8.786164938267123e-07, "loss": 2.0082, "step": 6420 }, { "epoch": 0.61, "grad_norm": 62.03125, "learning_rate": 8.782383502959067e-07, "loss": 2.0883, "step": 6440 }, { "epoch": 0.61, "grad_norm": 63.96875, "learning_rate": 8.778602067651012e-07, "loss": 2.0119, "step": 6460 }, { "epoch": 0.61, "grad_norm": 114.875, "learning_rate": 8.774820632342957e-07, "loss": 2.1563, "step": 6480 }, { "epoch": 0.61, "grad_norm": 62.15625, "learning_rate": 8.7710391970349e-07, "loss": 2.0736, "step": 6500 }, { "epoch": 0.62, "grad_norm": 79.125, "learning_rate": 8.767257761726845e-07, "loss": 2.0764, "step": 6520 }, { "epoch": 0.62, "grad_norm": 63.5625, "learning_rate": 8.76347632641879e-07, "loss": 2.0432, "step": 6540 }, { "epoch": 0.62, "grad_norm": 55.15625, "learning_rate": 8.759694891110734e-07, "loss": 2.0265, "step": 6560 }, { "epoch": 0.62, "grad_norm": 66.875, "learning_rate": 8.755913455802678e-07, "loss": 1.9963, "step": 6580 }, { "epoch": 0.62, "grad_norm": 82.5625, "learning_rate": 8.752132020494622e-07, "loss": 1.984, "step": 6600 }, { "epoch": 0.63, "grad_norm": 64.5, "learning_rate": 8.748350585186567e-07, "loss": 2.0488, "step": 6620 }, { "epoch": 0.63, "grad_norm": 81.4375, "learning_rate": 8.744569149878512e-07, "loss": 2.1111, "step": 6640 }, { "epoch": 0.63, "grad_norm": 74.6875, "learning_rate": 8.740787714570455e-07, "loss": 2.0399, "step": 6660 }, { "epoch": 0.63, "grad_norm": 70.3125, "learning_rate": 8.7370062792624e-07, "loss": 1.9512, "step": 6680 }, { "epoch": 0.63, "grad_norm": 166.0, "learning_rate": 8.733224843954345e-07, "loss": 2.0779, "step": 6700 }, { "epoch": 0.64, "grad_norm": 71.125, "learning_rate": 8.729443408646288e-07, "loss": 2.0487, "step": 6720 }, { "epoch": 0.64, "grad_norm": 119.8125, "learning_rate": 8.725661973338233e-07, "loss": 2.1232, "step": 6740 }, { "epoch": 0.64, "grad_norm": 104.5625, "learning_rate": 8.721880538030178e-07, "loss": 1.9915, "step": 6760 }, { "epoch": 0.64, "grad_norm": 74.125, "learning_rate": 8.718099102722122e-07, "loss": 2.1389, "step": 6780 }, { "epoch": 0.64, "grad_norm": 53.90625, "learning_rate": 8.714317667414067e-07, "loss": 2.1114, "step": 6800 }, { "epoch": 0.64, "grad_norm": 67.125, "learning_rate": 8.710536232106012e-07, "loss": 1.9978, "step": 6820 }, { "epoch": 0.65, "grad_norm": 69.125, "learning_rate": 8.706754796797956e-07, "loss": 2.1736, "step": 6840 }, { "epoch": 0.65, "grad_norm": 61.0625, "learning_rate": 8.702973361489901e-07, "loss": 2.1067, "step": 6860 }, { "epoch": 0.65, "grad_norm": 74.1875, "learning_rate": 8.699191926181844e-07, "loss": 1.9871, "step": 6880 }, { "epoch": 0.65, "grad_norm": 61.78125, "learning_rate": 8.695410490873789e-07, "loss": 1.9737, "step": 6900 }, { "epoch": 0.65, "grad_norm": 66.625, "learning_rate": 8.691629055565734e-07, "loss": 2.0105, "step": 6920 }, { "epoch": 0.66, "grad_norm": 68.6875, "learning_rate": 8.687847620257677e-07, "loss": 2.026, "step": 6940 }, { "epoch": 0.66, "grad_norm": 49.65625, "learning_rate": 8.684066184949622e-07, "loss": 2.1013, "step": 6960 }, { "epoch": 0.66, "grad_norm": 83.1875, "learning_rate": 8.680284749641567e-07, "loss": 2.0245, "step": 6980 }, { "epoch": 0.66, "grad_norm": 86.9375, "learning_rate": 8.676503314333511e-07, "loss": 1.9986, "step": 7000 }, { "epoch": 0.66, "grad_norm": 56.59375, "learning_rate": 8.672721879025455e-07, "loss": 2.0128, "step": 7020 }, { "epoch": 0.67, "grad_norm": 63.90625, "learning_rate": 8.6689404437174e-07, "loss": 2.0212, "step": 7040 }, { "epoch": 0.67, "grad_norm": 76.75, "learning_rate": 8.665159008409344e-07, "loss": 2.077, "step": 7060 }, { "epoch": 0.67, "grad_norm": 84.3125, "learning_rate": 8.661377573101288e-07, "loss": 2.075, "step": 7080 }, { "epoch": 0.67, "grad_norm": 99.0, "learning_rate": 8.657596137793233e-07, "loss": 2.1219, "step": 7100 }, { "epoch": 0.67, "grad_norm": 112.75, "learning_rate": 8.653814702485177e-07, "loss": 1.8914, "step": 7120 }, { "epoch": 0.67, "grad_norm": 83.1875, "learning_rate": 8.650033267177123e-07, "loss": 2.091, "step": 7140 }, { "epoch": 0.68, "grad_norm": 57.65625, "learning_rate": 8.646251831869067e-07, "loss": 2.1047, "step": 7160 }, { "epoch": 0.68, "grad_norm": 70.0625, "learning_rate": 8.642470396561011e-07, "loss": 2.0086, "step": 7180 }, { "epoch": 0.68, "grad_norm": 60.40625, "learning_rate": 8.638688961252956e-07, "loss": 2.0193, "step": 7200 }, { "epoch": 0.68, "grad_norm": 124.125, "learning_rate": 8.6349075259449e-07, "loss": 2.0588, "step": 7220 }, { "epoch": 0.68, "grad_norm": 49.28125, "learning_rate": 8.631126090636844e-07, "loss": 1.9948, "step": 7240 }, { "epoch": 0.69, "grad_norm": 71.75, "learning_rate": 8.627344655328789e-07, "loss": 2.0766, "step": 7260 }, { "epoch": 0.69, "grad_norm": 59.9375, "learning_rate": 8.623563220020733e-07, "loss": 1.9796, "step": 7280 }, { "epoch": 0.69, "grad_norm": 61.46875, "learning_rate": 8.619781784712677e-07, "loss": 2.213, "step": 7300 }, { "epoch": 0.69, "grad_norm": 52.40625, "learning_rate": 8.616000349404622e-07, "loss": 2.1209, "step": 7320 }, { "epoch": 0.69, "grad_norm": 86.25, "learning_rate": 8.612218914096566e-07, "loss": 1.9509, "step": 7340 }, { "epoch": 0.7, "grad_norm": 78.0, "learning_rate": 8.608437478788511e-07, "loss": 1.9703, "step": 7360 }, { "epoch": 0.7, "grad_norm": 62.8125, "learning_rate": 8.604656043480455e-07, "loss": 1.9881, "step": 7380 }, { "epoch": 0.7, "grad_norm": 125.875, "learning_rate": 8.600874608172399e-07, "loss": 1.985, "step": 7400 }, { "epoch": 0.7, "grad_norm": 73.375, "learning_rate": 8.597093172864344e-07, "loss": 2.0481, "step": 7420 }, { "epoch": 0.7, "grad_norm": 109.1875, "learning_rate": 8.593311737556289e-07, "loss": 1.9969, "step": 7440 }, { "epoch": 0.71, "grad_norm": 59.46875, "learning_rate": 8.589530302248232e-07, "loss": 1.9769, "step": 7460 }, { "epoch": 0.71, "grad_norm": 49.71875, "learning_rate": 8.585748866940178e-07, "loss": 2.0271, "step": 7480 }, { "epoch": 0.71, "grad_norm": 85.125, "learning_rate": 8.581967431632122e-07, "loss": 2.0354, "step": 7500 }, { "epoch": 0.71, "grad_norm": 58.8125, "learning_rate": 8.578185996324066e-07, "loss": 2.0471, "step": 7520 }, { "epoch": 0.71, "grad_norm": 90.25, "learning_rate": 8.574404561016011e-07, "loss": 2.0257, "step": 7540 }, { "epoch": 0.71, "grad_norm": 68.5, "learning_rate": 8.570623125707955e-07, "loss": 2.0477, "step": 7560 }, { "epoch": 0.72, "grad_norm": 73.875, "learning_rate": 8.5668416903999e-07, "loss": 2.1175, "step": 7580 }, { "epoch": 0.72, "grad_norm": 62.1875, "learning_rate": 8.563060255091844e-07, "loss": 2.0251, "step": 7600 }, { "epoch": 0.72, "grad_norm": 80.625, "learning_rate": 8.559278819783788e-07, "loss": 2.0128, "step": 7620 }, { "epoch": 0.72, "grad_norm": 57.25, "learning_rate": 8.555497384475733e-07, "loss": 2.0853, "step": 7640 }, { "epoch": 0.72, "grad_norm": 95.375, "learning_rate": 8.551715949167678e-07, "loss": 2.0591, "step": 7660 }, { "epoch": 0.73, "grad_norm": 53.625, "learning_rate": 8.547934513859621e-07, "loss": 2.081, "step": 7680 }, { "epoch": 0.73, "grad_norm": 74.875, "learning_rate": 8.544153078551566e-07, "loss": 2.0432, "step": 7700 }, { "epoch": 0.73, "grad_norm": 78.3125, "learning_rate": 8.540371643243511e-07, "loss": 1.9401, "step": 7720 }, { "epoch": 0.73, "grad_norm": 112.75, "learning_rate": 8.536590207935454e-07, "loss": 2.1571, "step": 7740 }, { "epoch": 0.73, "grad_norm": 56.65625, "learning_rate": 8.532808772627399e-07, "loss": 2.0862, "step": 7760 }, { "epoch": 0.74, "grad_norm": 113.1875, "learning_rate": 8.529027337319343e-07, "loss": 2.0632, "step": 7780 }, { "epoch": 0.74, "grad_norm": 94.3125, "learning_rate": 8.525245902011288e-07, "loss": 2.0372, "step": 7800 }, { "epoch": 0.74, "grad_norm": 51.125, "learning_rate": 8.521464466703232e-07, "loss": 2.0205, "step": 7820 }, { "epoch": 0.74, "grad_norm": 54.28125, "learning_rate": 8.517683031395177e-07, "loss": 1.9503, "step": 7840 }, { "epoch": 0.74, "grad_norm": 60.625, "learning_rate": 8.513901596087122e-07, "loss": 2.1562, "step": 7860 }, { "epoch": 0.74, "grad_norm": 64.75, "learning_rate": 8.510120160779067e-07, "loss": 2.0641, "step": 7880 }, { "epoch": 0.75, "grad_norm": 62.59375, "learning_rate": 8.50633872547101e-07, "loss": 2.0897, "step": 7900 }, { "epoch": 0.75, "grad_norm": 99.5625, "learning_rate": 8.502557290162955e-07, "loss": 2.0611, "step": 7920 }, { "epoch": 0.75, "grad_norm": 48.28125, "learning_rate": 8.4987758548549e-07, "loss": 2.0871, "step": 7940 }, { "epoch": 0.75, "grad_norm": 55.5, "learning_rate": 8.494994419546843e-07, "loss": 1.979, "step": 7960 }, { "epoch": 0.75, "grad_norm": 70.125, "learning_rate": 8.491212984238788e-07, "loss": 2.1194, "step": 7980 }, { "epoch": 0.76, "grad_norm": 83.3125, "learning_rate": 8.487431548930733e-07, "loss": 2.0838, "step": 8000 }, { "epoch": 0.76, "grad_norm": 65.25, "learning_rate": 8.483650113622677e-07, "loss": 2.079, "step": 8020 }, { "epoch": 0.76, "grad_norm": 68.0625, "learning_rate": 8.479868678314621e-07, "loss": 2.0234, "step": 8040 }, { "epoch": 0.76, "grad_norm": 64.8125, "learning_rate": 8.476087243006566e-07, "loss": 2.0205, "step": 8060 }, { "epoch": 0.76, "grad_norm": 56.0, "learning_rate": 8.47230580769851e-07, "loss": 1.975, "step": 8080 }, { "epoch": 0.77, "grad_norm": 81.625, "learning_rate": 8.468524372390454e-07, "loss": 2.1159, "step": 8100 }, { "epoch": 0.77, "grad_norm": 53.21875, "learning_rate": 8.464742937082398e-07, "loss": 2.0243, "step": 8120 }, { "epoch": 0.77, "grad_norm": 88.625, "learning_rate": 8.460961501774343e-07, "loss": 1.9559, "step": 8140 }, { "epoch": 0.77, "grad_norm": 93.3125, "learning_rate": 8.457180066466288e-07, "loss": 2.1196, "step": 8160 }, { "epoch": 0.77, "grad_norm": 61.09375, "learning_rate": 8.453398631158232e-07, "loss": 1.896, "step": 8180 }, { "epoch": 0.78, "grad_norm": 46.9375, "learning_rate": 8.449617195850177e-07, "loss": 2.0552, "step": 8200 }, { "epoch": 0.78, "grad_norm": 84.125, "learning_rate": 8.445835760542122e-07, "loss": 2.0865, "step": 8220 }, { "epoch": 0.78, "grad_norm": 48.53125, "learning_rate": 8.442054325234066e-07, "loss": 2.049, "step": 8240 }, { "epoch": 0.78, "grad_norm": 70.9375, "learning_rate": 8.43827288992601e-07, "loss": 2.0403, "step": 8260 }, { "epoch": 0.78, "grad_norm": 65.6875, "learning_rate": 8.434491454617955e-07, "loss": 2.1368, "step": 8280 }, { "epoch": 0.78, "grad_norm": 77.6875, "learning_rate": 8.430710019309899e-07, "loss": 1.979, "step": 8300 }, { "epoch": 0.79, "grad_norm": 70.0, "learning_rate": 8.426928584001843e-07, "loss": 1.8971, "step": 8320 }, { "epoch": 0.79, "grad_norm": 168.875, "learning_rate": 8.423147148693788e-07, "loss": 1.999, "step": 8340 }, { "epoch": 0.79, "grad_norm": 67.8125, "learning_rate": 8.419365713385732e-07, "loss": 1.9871, "step": 8360 }, { "epoch": 0.79, "grad_norm": 66.6875, "learning_rate": 8.415584278077677e-07, "loss": 1.867, "step": 8380 }, { "epoch": 0.79, "grad_norm": 63.625, "learning_rate": 8.41180284276962e-07, "loss": 1.9582, "step": 8400 }, { "epoch": 0.8, "grad_norm": 63.28125, "learning_rate": 8.408021407461565e-07, "loss": 1.9924, "step": 8420 }, { "epoch": 0.8, "grad_norm": 62.40625, "learning_rate": 8.40423997215351e-07, "loss": 1.9955, "step": 8440 }, { "epoch": 0.8, "grad_norm": 48.65625, "learning_rate": 8.400458536845453e-07, "loss": 2.0825, "step": 8460 }, { "epoch": 0.8, "grad_norm": 75.9375, "learning_rate": 8.396677101537398e-07, "loss": 1.9676, "step": 8480 }, { "epoch": 0.8, "grad_norm": 107.1875, "learning_rate": 8.392895666229343e-07, "loss": 1.9956, "step": 8500 }, { "epoch": 0.81, "grad_norm": 52.5625, "learning_rate": 8.389114230921287e-07, "loss": 2.0818, "step": 8520 }, { "epoch": 0.81, "grad_norm": 74.625, "learning_rate": 8.385332795613232e-07, "loss": 1.9427, "step": 8540 }, { "epoch": 0.81, "grad_norm": 70.125, "learning_rate": 8.381551360305177e-07, "loss": 2.1569, "step": 8560 }, { "epoch": 0.81, "grad_norm": 64.8125, "learning_rate": 8.377769924997121e-07, "loss": 2.0716, "step": 8580 }, { "epoch": 0.81, "grad_norm": 59.875, "learning_rate": 8.373988489689066e-07, "loss": 2.06, "step": 8600 }, { "epoch": 0.81, "grad_norm": 86.1875, "learning_rate": 8.37020705438101e-07, "loss": 2.0076, "step": 8620 }, { "epoch": 0.82, "grad_norm": 49.8125, "learning_rate": 8.366425619072954e-07, "loss": 2.0739, "step": 8640 }, { "epoch": 0.82, "grad_norm": 96.1875, "learning_rate": 8.362644183764899e-07, "loss": 2.0958, "step": 8660 }, { "epoch": 0.82, "grad_norm": 81.9375, "learning_rate": 8.358862748456844e-07, "loss": 2.0505, "step": 8680 }, { "epoch": 0.82, "grad_norm": 72.4375, "learning_rate": 8.355081313148787e-07, "loss": 2.0513, "step": 8700 }, { "epoch": 0.82, "grad_norm": 65.125, "learning_rate": 8.351299877840732e-07, "loss": 1.9823, "step": 8720 }, { "epoch": 0.83, "grad_norm": 60.28125, "learning_rate": 8.347518442532676e-07, "loss": 1.8811, "step": 8740 }, { "epoch": 0.83, "grad_norm": 57.125, "learning_rate": 8.34373700722462e-07, "loss": 2.0142, "step": 8760 }, { "epoch": 0.83, "grad_norm": 90.625, "learning_rate": 8.339955571916565e-07, "loss": 1.9925, "step": 8780 }, { "epoch": 0.83, "grad_norm": 71.4375, "learning_rate": 8.336174136608509e-07, "loss": 2.1022, "step": 8800 }, { "epoch": 0.83, "grad_norm": 95.5625, "learning_rate": 8.332392701300454e-07, "loss": 1.9648, "step": 8820 }, { "epoch": 0.84, "grad_norm": 103.9375, "learning_rate": 8.328611265992398e-07, "loss": 2.0242, "step": 8840 }, { "epoch": 0.84, "grad_norm": 54.96875, "learning_rate": 8.324829830684342e-07, "loss": 1.9736, "step": 8860 }, { "epoch": 0.84, "grad_norm": 53.0, "learning_rate": 8.321048395376288e-07, "loss": 2.0328, "step": 8880 }, { "epoch": 0.84, "grad_norm": 61.21875, "learning_rate": 8.317266960068233e-07, "loss": 1.9957, "step": 8900 }, { "epoch": 0.84, "grad_norm": 51.09375, "learning_rate": 8.313485524760176e-07, "loss": 1.9611, "step": 8920 }, { "epoch": 0.85, "grad_norm": 48.6875, "learning_rate": 8.309704089452121e-07, "loss": 1.9684, "step": 8940 }, { "epoch": 0.85, "grad_norm": 61.90625, "learning_rate": 8.305922654144066e-07, "loss": 1.989, "step": 8960 }, { "epoch": 0.85, "grad_norm": 54.28125, "learning_rate": 8.302141218836009e-07, "loss": 1.9718, "step": 8980 }, { "epoch": 0.85, "grad_norm": 90.625, "learning_rate": 8.298359783527954e-07, "loss": 2.0448, "step": 9000 }, { "epoch": 0.85, "grad_norm": 98.4375, "learning_rate": 8.294578348219899e-07, "loss": 1.9535, "step": 9020 }, { "epoch": 0.85, "grad_norm": 106.625, "learning_rate": 8.290796912911843e-07, "loss": 1.9675, "step": 9040 }, { "epoch": 0.86, "grad_norm": 67.5625, "learning_rate": 8.287015477603787e-07, "loss": 2.0108, "step": 9060 }, { "epoch": 0.86, "grad_norm": 47.28125, "learning_rate": 8.283234042295731e-07, "loss": 2.0104, "step": 9080 }, { "epoch": 0.86, "grad_norm": 57.34375, "learning_rate": 8.279452606987676e-07, "loss": 2.0557, "step": 9100 }, { "epoch": 0.86, "grad_norm": 95.5, "learning_rate": 8.27567117167962e-07, "loss": 1.9992, "step": 9120 }, { "epoch": 0.86, "grad_norm": 56.03125, "learning_rate": 8.271889736371564e-07, "loss": 1.9351, "step": 9140 }, { "epoch": 0.87, "grad_norm": 121.6875, "learning_rate": 8.268108301063509e-07, "loss": 2.049, "step": 9160 }, { "epoch": 0.87, "grad_norm": 65.25, "learning_rate": 8.264326865755454e-07, "loss": 2.0567, "step": 9180 }, { "epoch": 0.87, "grad_norm": 63.53125, "learning_rate": 8.260545430447397e-07, "loss": 2.1084, "step": 9200 }, { "epoch": 0.87, "grad_norm": 62.3125, "learning_rate": 8.256763995139342e-07, "loss": 2.0061, "step": 9220 }, { "epoch": 0.87, "grad_norm": 78.4375, "learning_rate": 8.252982559831288e-07, "loss": 2.0693, "step": 9240 }, { "epoch": 0.88, "grad_norm": 61.6875, "learning_rate": 8.249201124523232e-07, "loss": 1.9503, "step": 9260 }, { "epoch": 0.88, "grad_norm": 67.1875, "learning_rate": 8.245419689215176e-07, "loss": 2.061, "step": 9280 }, { "epoch": 0.88, "grad_norm": 128.5, "learning_rate": 8.241638253907121e-07, "loss": 2.0328, "step": 9300 }, { "epoch": 0.88, "grad_norm": 61.8125, "learning_rate": 8.237856818599065e-07, "loss": 1.9776, "step": 9320 }, { "epoch": 0.88, "grad_norm": 79.375, "learning_rate": 8.234075383291009e-07, "loss": 1.9705, "step": 9340 }, { "epoch": 0.88, "grad_norm": 52.1875, "learning_rate": 8.230293947982953e-07, "loss": 2.1075, "step": 9360 }, { "epoch": 0.89, "grad_norm": 54.6875, "learning_rate": 8.226512512674898e-07, "loss": 1.9804, "step": 9380 }, { "epoch": 0.89, "grad_norm": 60.125, "learning_rate": 8.222731077366843e-07, "loss": 2.0447, "step": 9400 }, { "epoch": 0.89, "grad_norm": 57.46875, "learning_rate": 8.218949642058786e-07, "loss": 2.0118, "step": 9420 }, { "epoch": 0.89, "grad_norm": 59.78125, "learning_rate": 8.215168206750731e-07, "loss": 2.0065, "step": 9440 }, { "epoch": 0.89, "grad_norm": 61.6875, "learning_rate": 8.211386771442676e-07, "loss": 1.9687, "step": 9460 }, { "epoch": 0.9, "grad_norm": 60.84375, "learning_rate": 8.207605336134619e-07, "loss": 2.0927, "step": 9480 }, { "epoch": 0.9, "grad_norm": 88.25, "learning_rate": 8.203823900826564e-07, "loss": 2.0385, "step": 9500 }, { "epoch": 0.9, "grad_norm": 58.6875, "learning_rate": 8.200042465518509e-07, "loss": 2.0446, "step": 9520 }, { "epoch": 0.9, "grad_norm": 65.375, "learning_rate": 8.196261030210453e-07, "loss": 2.1188, "step": 9540 }, { "epoch": 0.9, "grad_norm": 58.46875, "learning_rate": 8.192479594902397e-07, "loss": 1.9974, "step": 9560 }, { "epoch": 0.91, "grad_norm": 49.59375, "learning_rate": 8.188698159594343e-07, "loss": 1.9911, "step": 9580 }, { "epoch": 0.91, "grad_norm": 75.1875, "learning_rate": 8.184916724286287e-07, "loss": 1.9578, "step": 9600 }, { "epoch": 0.91, "grad_norm": 71.4375, "learning_rate": 8.181135288978232e-07, "loss": 2.0356, "step": 9620 }, { "epoch": 0.91, "grad_norm": 63.9375, "learning_rate": 8.177353853670176e-07, "loss": 2.0255, "step": 9640 }, { "epoch": 0.91, "grad_norm": 50.125, "learning_rate": 8.17357241836212e-07, "loss": 1.9512, "step": 9660 }, { "epoch": 0.92, "grad_norm": 118.0, "learning_rate": 8.169790983054065e-07, "loss": 2.0679, "step": 9680 }, { "epoch": 0.92, "grad_norm": 132.625, "learning_rate": 8.166009547746008e-07, "loss": 2.0188, "step": 9700 }, { "epoch": 0.92, "grad_norm": 73.0, "learning_rate": 8.162228112437953e-07, "loss": 1.9632, "step": 9720 }, { "epoch": 0.92, "grad_norm": 66.125, "learning_rate": 8.158446677129898e-07, "loss": 1.9628, "step": 9740 }, { "epoch": 0.92, "grad_norm": 62.15625, "learning_rate": 8.154665241821842e-07, "loss": 1.9865, "step": 9760 }, { "epoch": 0.92, "grad_norm": 56.28125, "learning_rate": 8.150883806513786e-07, "loss": 2.1228, "step": 9780 }, { "epoch": 0.93, "grad_norm": 58.5625, "learning_rate": 8.147102371205731e-07, "loss": 1.9204, "step": 9800 }, { "epoch": 0.93, "grad_norm": 88.125, "learning_rate": 8.143320935897675e-07, "loss": 1.989, "step": 9820 }, { "epoch": 0.93, "grad_norm": 100.5625, "learning_rate": 8.13953950058962e-07, "loss": 1.9791, "step": 9840 }, { "epoch": 0.93, "grad_norm": 66.375, "learning_rate": 8.135758065281564e-07, "loss": 2.0461, "step": 9860 }, { "epoch": 0.93, "grad_norm": 80.75, "learning_rate": 8.131976629973508e-07, "loss": 2.0083, "step": 9880 }, { "epoch": 0.94, "grad_norm": 52.34375, "learning_rate": 8.128195194665453e-07, "loss": 1.9767, "step": 9900 }, { "epoch": 0.94, "grad_norm": 92.6875, "learning_rate": 8.124413759357397e-07, "loss": 2.0318, "step": 9920 }, { "epoch": 0.94, "grad_norm": 70.625, "learning_rate": 8.120632324049342e-07, "loss": 2.0566, "step": 9940 }, { "epoch": 0.94, "grad_norm": 86.9375, "learning_rate": 8.116850888741287e-07, "loss": 2.0286, "step": 9960 }, { "epoch": 0.94, "grad_norm": 65.875, "learning_rate": 8.113069453433232e-07, "loss": 1.9629, "step": 9980 }, { "epoch": 0.95, "grad_norm": 90.6875, "learning_rate": 8.109288018125175e-07, "loss": 1.9711, "step": 10000 }, { "epoch": 0.95, "grad_norm": 52.125, "learning_rate": 8.10550658281712e-07, "loss": 1.9985, "step": 10020 }, { "epoch": 0.95, "grad_norm": 66.25, "learning_rate": 8.101725147509064e-07, "loss": 2.0052, "step": 10040 }, { "epoch": 0.95, "grad_norm": 65.375, "learning_rate": 8.097943712201009e-07, "loss": 1.9346, "step": 10060 }, { "epoch": 0.95, "grad_norm": 53.03125, "learning_rate": 8.094162276892953e-07, "loss": 1.984, "step": 10080 }, { "epoch": 0.95, "grad_norm": 110.25, "learning_rate": 8.090380841584897e-07, "loss": 2.0204, "step": 10100 }, { "epoch": 0.96, "grad_norm": 59.03125, "learning_rate": 8.086599406276842e-07, "loss": 2.0792, "step": 10120 }, { "epoch": 0.96, "grad_norm": 49.8125, "learning_rate": 8.082817970968786e-07, "loss": 2.0051, "step": 10140 }, { "epoch": 0.96, "grad_norm": 61.1875, "learning_rate": 8.07903653566073e-07, "loss": 2.0679, "step": 10160 }, { "epoch": 0.96, "grad_norm": 93.3125, "learning_rate": 8.075255100352675e-07, "loss": 2.099, "step": 10180 }, { "epoch": 0.96, "grad_norm": 79.6875, "learning_rate": 8.07147366504462e-07, "loss": 2.1754, "step": 10200 }, { "epoch": 0.97, "grad_norm": 68.875, "learning_rate": 8.067692229736563e-07, "loss": 2.0048, "step": 10220 }, { "epoch": 0.97, "grad_norm": 72.625, "learning_rate": 8.063910794428508e-07, "loss": 1.9806, "step": 10240 }, { "epoch": 0.97, "grad_norm": 98.1875, "learning_rate": 8.060129359120452e-07, "loss": 1.9797, "step": 10260 }, { "epoch": 0.97, "grad_norm": 67.0625, "learning_rate": 8.056347923812397e-07, "loss": 2.0226, "step": 10280 }, { "epoch": 0.97, "grad_norm": 55.40625, "learning_rate": 8.052566488504342e-07, "loss": 2.0109, "step": 10300 }, { "epoch": 0.98, "grad_norm": 107.125, "learning_rate": 8.048785053196286e-07, "loss": 1.8523, "step": 10320 }, { "epoch": 0.98, "grad_norm": 68.5625, "learning_rate": 8.045003617888231e-07, "loss": 1.9492, "step": 10340 }, { "epoch": 0.98, "grad_norm": 77.0, "learning_rate": 8.041222182580175e-07, "loss": 1.9863, "step": 10360 }, { "epoch": 0.98, "grad_norm": 76.5625, "learning_rate": 8.037440747272119e-07, "loss": 1.9792, "step": 10380 }, { "epoch": 0.98, "grad_norm": 76.0625, "learning_rate": 8.033659311964064e-07, "loss": 2.0448, "step": 10400 }, { "epoch": 0.99, "grad_norm": 79.3125, "learning_rate": 8.029877876656009e-07, "loss": 2.0606, "step": 10420 }, { "epoch": 0.99, "grad_norm": 53.5625, "learning_rate": 8.026096441347952e-07, "loss": 2.0716, "step": 10440 }, { "epoch": 0.99, "grad_norm": 75.0, "learning_rate": 8.022315006039897e-07, "loss": 1.9026, "step": 10460 }, { "epoch": 0.99, "grad_norm": 85.0625, "learning_rate": 8.018533570731842e-07, "loss": 1.9879, "step": 10480 }, { "epoch": 0.99, "grad_norm": 52.46875, "learning_rate": 8.014752135423785e-07, "loss": 2.0018, "step": 10500 }, { "epoch": 0.99, "grad_norm": 48.4375, "learning_rate": 8.01097070011573e-07, "loss": 1.9966, "step": 10520 }, { "epoch": 1.0, "grad_norm": 59.65625, "learning_rate": 8.007189264807675e-07, "loss": 2.0212, "step": 10540 }, { "epoch": 1.0, "grad_norm": 49.59375, "learning_rate": 8.003407829499619e-07, "loss": 2.0244, "step": 10560 }, { "epoch": 1.0, "grad_norm": 107.625, "learning_rate": 7.999626394191563e-07, "loss": 1.9597, "step": 10580 }, { "epoch": 1.0, "grad_norm": 67.875, "learning_rate": 7.995844958883507e-07, "loss": 1.7883, "step": 10600 }, { "epoch": 1.0, "grad_norm": 67.8125, "learning_rate": 7.992063523575452e-07, "loss": 1.7906, "step": 10620 }, { "epoch": 1.01, "grad_norm": 60.09375, "learning_rate": 7.988282088267398e-07, "loss": 1.8029, "step": 10640 }, { "epoch": 1.01, "grad_norm": 42.25, "learning_rate": 7.984500652959341e-07, "loss": 1.7842, "step": 10660 }, { "epoch": 1.01, "grad_norm": 65.6875, "learning_rate": 7.980719217651286e-07, "loss": 1.8204, "step": 10680 }, { "epoch": 1.01, "grad_norm": 65.625, "learning_rate": 7.976937782343231e-07, "loss": 1.7802, "step": 10700 }, { "epoch": 1.01, "grad_norm": 47.96875, "learning_rate": 7.973156347035174e-07, "loss": 1.7412, "step": 10720 }, { "epoch": 1.02, "grad_norm": 78.875, "learning_rate": 7.969374911727119e-07, "loss": 1.8131, "step": 10740 }, { "epoch": 1.02, "grad_norm": 56.09375, "learning_rate": 7.965593476419064e-07, "loss": 1.7726, "step": 10760 }, { "epoch": 1.02, "grad_norm": 61.1875, "learning_rate": 7.961812041111008e-07, "loss": 1.8361, "step": 10780 }, { "epoch": 1.02, "grad_norm": 48.46875, "learning_rate": 7.958030605802952e-07, "loss": 1.6977, "step": 10800 }, { "epoch": 1.02, "grad_norm": 74.75, "learning_rate": 7.954249170494897e-07, "loss": 1.7696, "step": 10820 }, { "epoch": 1.02, "grad_norm": 72.875, "learning_rate": 7.950467735186841e-07, "loss": 1.7746, "step": 10840 }, { "epoch": 1.03, "grad_norm": 47.46875, "learning_rate": 7.946686299878785e-07, "loss": 1.7854, "step": 10860 }, { "epoch": 1.03, "grad_norm": 75.0625, "learning_rate": 7.94290486457073e-07, "loss": 1.782, "step": 10880 }, { "epoch": 1.03, "grad_norm": 133.25, "learning_rate": 7.939123429262674e-07, "loss": 1.7646, "step": 10900 }, { "epoch": 1.03, "grad_norm": 74.5, "learning_rate": 7.935341993954619e-07, "loss": 1.6605, "step": 10920 }, { "epoch": 1.03, "grad_norm": 97.1875, "learning_rate": 7.931560558646562e-07, "loss": 1.8058, "step": 10940 }, { "epoch": 1.04, "grad_norm": 63.09375, "learning_rate": 7.927779123338507e-07, "loss": 1.8219, "step": 10960 }, { "epoch": 1.04, "grad_norm": 72.75, "learning_rate": 7.923997688030453e-07, "loss": 1.7732, "step": 10980 }, { "epoch": 1.04, "grad_norm": 93.8125, "learning_rate": 7.920216252722397e-07, "loss": 1.7333, "step": 11000 }, { "epoch": 1.04, "grad_norm": 68.375, "learning_rate": 7.916434817414341e-07, "loss": 1.7237, "step": 11020 }, { "epoch": 1.04, "grad_norm": 72.1875, "learning_rate": 7.912653382106286e-07, "loss": 1.8252, "step": 11040 }, { "epoch": 1.05, "grad_norm": 61.46875, "learning_rate": 7.90887194679823e-07, "loss": 1.688, "step": 11060 }, { "epoch": 1.05, "grad_norm": 50.96875, "learning_rate": 7.905090511490174e-07, "loss": 1.7669, "step": 11080 }, { "epoch": 1.05, "grad_norm": 49.28125, "learning_rate": 7.901309076182119e-07, "loss": 1.7935, "step": 11100 }, { "epoch": 1.05, "grad_norm": 46.09375, "learning_rate": 7.897527640874063e-07, "loss": 1.8401, "step": 11120 }, { "epoch": 1.05, "grad_norm": 91.375, "learning_rate": 7.893746205566008e-07, "loss": 1.8535, "step": 11140 }, { "epoch": 1.06, "grad_norm": 79.125, "learning_rate": 7.889964770257952e-07, "loss": 1.7106, "step": 11160 }, { "epoch": 1.06, "grad_norm": 84.6875, "learning_rate": 7.886183334949896e-07, "loss": 1.7925, "step": 11180 }, { "epoch": 1.06, "grad_norm": 60.59375, "learning_rate": 7.882401899641841e-07, "loss": 1.7557, "step": 11200 }, { "epoch": 1.06, "grad_norm": 66.5, "learning_rate": 7.878620464333785e-07, "loss": 1.7569, "step": 11220 }, { "epoch": 1.06, "grad_norm": 54.125, "learning_rate": 7.874839029025729e-07, "loss": 1.7211, "step": 11240 }, { "epoch": 1.06, "grad_norm": 89.9375, "learning_rate": 7.871057593717674e-07, "loss": 1.7641, "step": 11260 }, { "epoch": 1.07, "grad_norm": 53.125, "learning_rate": 7.867276158409618e-07, "loss": 1.8629, "step": 11280 }, { "epoch": 1.07, "grad_norm": 79.1875, "learning_rate": 7.863494723101562e-07, "loss": 1.8001, "step": 11300 }, { "epoch": 1.07, "grad_norm": 59.9375, "learning_rate": 7.859713287793507e-07, "loss": 1.8619, "step": 11320 }, { "epoch": 1.07, "grad_norm": 71.75, "learning_rate": 7.855931852485452e-07, "loss": 1.813, "step": 11340 }, { "epoch": 1.07, "grad_norm": 99.9375, "learning_rate": 7.852150417177397e-07, "loss": 1.9216, "step": 11360 }, { "epoch": 1.08, "grad_norm": 237.25, "learning_rate": 7.848368981869341e-07, "loss": 1.8799, "step": 11380 }, { "epoch": 1.08, "grad_norm": 42.0, "learning_rate": 7.844587546561285e-07, "loss": 1.8169, "step": 11400 }, { "epoch": 1.08, "grad_norm": 58.875, "learning_rate": 7.84080611125323e-07, "loss": 1.7881, "step": 11420 }, { "epoch": 1.08, "grad_norm": 83.875, "learning_rate": 7.837024675945175e-07, "loss": 1.6689, "step": 11440 }, { "epoch": 1.08, "grad_norm": 87.5, "learning_rate": 7.833243240637118e-07, "loss": 1.7903, "step": 11460 }, { "epoch": 1.09, "grad_norm": 76.0, "learning_rate": 7.829461805329063e-07, "loss": 1.6204, "step": 11480 }, { "epoch": 1.09, "grad_norm": 60.65625, "learning_rate": 7.825680370021008e-07, "loss": 1.832, "step": 11500 }, { "epoch": 1.09, "grad_norm": 51.875, "learning_rate": 7.821898934712951e-07, "loss": 1.6985, "step": 11520 }, { "epoch": 1.09, "grad_norm": 78.125, "learning_rate": 7.818117499404896e-07, "loss": 1.7127, "step": 11540 }, { "epoch": 1.09, "grad_norm": 59.40625, "learning_rate": 7.81433606409684e-07, "loss": 1.7354, "step": 11560 }, { "epoch": 1.09, "grad_norm": 69.125, "learning_rate": 7.810554628788785e-07, "loss": 1.759, "step": 11580 }, { "epoch": 1.1, "grad_norm": 53.875, "learning_rate": 7.806773193480729e-07, "loss": 1.8113, "step": 11600 }, { "epoch": 1.1, "grad_norm": 69.0625, "learning_rate": 7.802991758172673e-07, "loss": 1.7437, "step": 11620 }, { "epoch": 1.1, "grad_norm": 61.21875, "learning_rate": 7.799210322864618e-07, "loss": 1.8023, "step": 11640 }, { "epoch": 1.1, "grad_norm": 53.5625, "learning_rate": 7.795428887556562e-07, "loss": 1.6904, "step": 11660 }, { "epoch": 1.1, "grad_norm": 106.25, "learning_rate": 7.791647452248507e-07, "loss": 1.746, "step": 11680 }, { "epoch": 1.11, "grad_norm": 55.75, "learning_rate": 7.787866016940452e-07, "loss": 1.7889, "step": 11700 }, { "epoch": 1.11, "grad_norm": 49.84375, "learning_rate": 7.784084581632397e-07, "loss": 1.7731, "step": 11720 }, { "epoch": 1.11, "grad_norm": 65.6875, "learning_rate": 7.78030314632434e-07, "loss": 1.7623, "step": 11740 }, { "epoch": 1.11, "grad_norm": 52.53125, "learning_rate": 7.776521711016285e-07, "loss": 1.8033, "step": 11760 }, { "epoch": 1.11, "grad_norm": 77.375, "learning_rate": 7.77274027570823e-07, "loss": 1.8242, "step": 11780 }, { "epoch": 1.12, "grad_norm": 75.0, "learning_rate": 7.768958840400174e-07, "loss": 1.6878, "step": 11800 }, { "epoch": 1.12, "grad_norm": 68.875, "learning_rate": 7.765177405092118e-07, "loss": 1.7688, "step": 11820 }, { "epoch": 1.12, "grad_norm": 70.375, "learning_rate": 7.761395969784063e-07, "loss": 1.7053, "step": 11840 }, { "epoch": 1.12, "grad_norm": 50.6875, "learning_rate": 7.757614534476007e-07, "loss": 1.7667, "step": 11860 }, { "epoch": 1.12, "grad_norm": 53.84375, "learning_rate": 7.753833099167951e-07, "loss": 1.8148, "step": 11880 }, { "epoch": 1.12, "grad_norm": 54.65625, "learning_rate": 7.750051663859895e-07, "loss": 1.758, "step": 11900 }, { "epoch": 1.13, "grad_norm": 63.5625, "learning_rate": 7.74627022855184e-07, "loss": 1.8429, "step": 11920 }, { "epoch": 1.13, "grad_norm": 58.875, "learning_rate": 7.742488793243785e-07, "loss": 1.7386, "step": 11940 }, { "epoch": 1.13, "grad_norm": 48.90625, "learning_rate": 7.738707357935728e-07, "loss": 1.8145, "step": 11960 }, { "epoch": 1.13, "grad_norm": 53.1875, "learning_rate": 7.734925922627673e-07, "loss": 1.7081, "step": 11980 }, { "epoch": 1.13, "grad_norm": 64.0625, "learning_rate": 7.731144487319618e-07, "loss": 1.7029, "step": 12000 }, { "epoch": 1.14, "grad_norm": 52.5625, "learning_rate": 7.727363052011562e-07, "loss": 1.8116, "step": 12020 }, { "epoch": 1.14, "grad_norm": 85.9375, "learning_rate": 7.723581616703507e-07, "loss": 1.7725, "step": 12040 }, { "epoch": 1.14, "grad_norm": 61.125, "learning_rate": 7.719800181395452e-07, "loss": 1.8037, "step": 12060 }, { "epoch": 1.14, "grad_norm": 55.75, "learning_rate": 7.716018746087396e-07, "loss": 1.7618, "step": 12080 }, { "epoch": 1.14, "grad_norm": 78.625, "learning_rate": 7.71223731077934e-07, "loss": 1.7472, "step": 12100 }, { "epoch": 1.15, "grad_norm": 87.625, "learning_rate": 7.708455875471285e-07, "loss": 1.7094, "step": 12120 }, { "epoch": 1.15, "grad_norm": 55.625, "learning_rate": 7.704674440163229e-07, "loss": 1.8509, "step": 12140 }, { "epoch": 1.15, "grad_norm": 62.1875, "learning_rate": 7.700893004855174e-07, "loss": 1.7445, "step": 12160 }, { "epoch": 1.15, "grad_norm": 68.25, "learning_rate": 7.697111569547117e-07, "loss": 1.8586, "step": 12180 }, { "epoch": 1.15, "grad_norm": 73.375, "learning_rate": 7.693330134239062e-07, "loss": 1.7869, "step": 12200 }, { "epoch": 1.16, "grad_norm": 63.71875, "learning_rate": 7.689548698931007e-07, "loss": 1.6798, "step": 12220 }, { "epoch": 1.16, "grad_norm": 50.625, "learning_rate": 7.68576726362295e-07, "loss": 1.7414, "step": 12240 }, { "epoch": 1.16, "grad_norm": 70.125, "learning_rate": 7.681985828314895e-07, "loss": 1.7452, "step": 12260 }, { "epoch": 1.16, "grad_norm": 60.8125, "learning_rate": 7.67820439300684e-07, "loss": 1.869, "step": 12280 }, { "epoch": 1.16, "grad_norm": 49.375, "learning_rate": 7.674422957698784e-07, "loss": 1.6938, "step": 12300 }, { "epoch": 1.16, "grad_norm": 83.0625, "learning_rate": 7.670641522390728e-07, "loss": 1.6603, "step": 12320 }, { "epoch": 1.17, "grad_norm": 69.5625, "learning_rate": 7.666860087082673e-07, "loss": 1.7059, "step": 12340 }, { "epoch": 1.17, "grad_norm": 51.0, "learning_rate": 7.663078651774617e-07, "loss": 1.7743, "step": 12360 }, { "epoch": 1.17, "grad_norm": 58.4375, "learning_rate": 7.659297216466563e-07, "loss": 1.8554, "step": 12380 }, { "epoch": 1.17, "grad_norm": 86.625, "learning_rate": 7.655515781158507e-07, "loss": 1.7817, "step": 12400 }, { "epoch": 1.17, "grad_norm": 53.125, "learning_rate": 7.651734345850451e-07, "loss": 1.7744, "step": 12420 }, { "epoch": 1.18, "grad_norm": 49.21875, "learning_rate": 7.647952910542396e-07, "loss": 1.8232, "step": 12440 }, { "epoch": 1.18, "grad_norm": 64.1875, "learning_rate": 7.644171475234341e-07, "loss": 1.7839, "step": 12460 }, { "epoch": 1.18, "grad_norm": 77.0625, "learning_rate": 7.640390039926284e-07, "loss": 1.8145, "step": 12480 }, { "epoch": 1.18, "grad_norm": 64.375, "learning_rate": 7.636608604618229e-07, "loss": 1.6772, "step": 12500 }, { "epoch": 1.18, "grad_norm": 74.5625, "learning_rate": 7.632827169310173e-07, "loss": 1.7555, "step": 12520 }, { "epoch": 1.19, "grad_norm": 96.4375, "learning_rate": 7.629045734002117e-07, "loss": 1.6831, "step": 12540 }, { "epoch": 1.19, "grad_norm": 55.40625, "learning_rate": 7.625264298694062e-07, "loss": 1.8384, "step": 12560 }, { "epoch": 1.19, "grad_norm": 50.25, "learning_rate": 7.621482863386006e-07, "loss": 1.7862, "step": 12580 }, { "epoch": 1.19, "grad_norm": 51.46875, "learning_rate": 7.617701428077951e-07, "loss": 1.7822, "step": 12600 }, { "epoch": 1.19, "grad_norm": 63.625, "learning_rate": 7.613919992769895e-07, "loss": 1.7557, "step": 12620 }, { "epoch": 1.19, "grad_norm": 74.3125, "learning_rate": 7.610138557461839e-07, "loss": 1.8421, "step": 12640 }, { "epoch": 1.2, "grad_norm": 58.34375, "learning_rate": 7.606357122153784e-07, "loss": 1.715, "step": 12660 }, { "epoch": 1.2, "grad_norm": 65.6875, "learning_rate": 7.602575686845728e-07, "loss": 1.8373, "step": 12680 }, { "epoch": 1.2, "grad_norm": 64.3125, "learning_rate": 7.598794251537672e-07, "loss": 1.7039, "step": 12700 }, { "epoch": 1.2, "grad_norm": 74.4375, "learning_rate": 7.595012816229618e-07, "loss": 1.7355, "step": 12720 }, { "epoch": 1.2, "grad_norm": 73.5, "learning_rate": 7.591231380921563e-07, "loss": 1.7805, "step": 12740 }, { "epoch": 1.21, "grad_norm": 71.1875, "learning_rate": 7.587449945613506e-07, "loss": 1.7684, "step": 12760 }, { "epoch": 1.21, "grad_norm": 77.8125, "learning_rate": 7.583668510305451e-07, "loss": 1.7547, "step": 12780 }, { "epoch": 1.21, "grad_norm": 60.9375, "learning_rate": 7.579887074997395e-07, "loss": 1.7124, "step": 12800 }, { "epoch": 1.21, "grad_norm": 68.9375, "learning_rate": 7.57610563968934e-07, "loss": 1.8401, "step": 12820 }, { "epoch": 1.21, "grad_norm": 47.625, "learning_rate": 7.572324204381284e-07, "loss": 1.786, "step": 12840 }, { "epoch": 1.22, "grad_norm": 56.40625, "learning_rate": 7.568542769073228e-07, "loss": 1.7334, "step": 12860 }, { "epoch": 1.22, "grad_norm": 54.21875, "learning_rate": 7.564761333765173e-07, "loss": 1.7812, "step": 12880 }, { "epoch": 1.22, "grad_norm": 57.3125, "learning_rate": 7.560979898457117e-07, "loss": 1.7766, "step": 12900 }, { "epoch": 1.22, "grad_norm": 48.34375, "learning_rate": 7.557198463149061e-07, "loss": 1.7633, "step": 12920 }, { "epoch": 1.22, "grad_norm": 66.5625, "learning_rate": 7.553417027841006e-07, "loss": 1.8936, "step": 12940 }, { "epoch": 1.23, "grad_norm": 72.4375, "learning_rate": 7.549635592532951e-07, "loss": 1.8073, "step": 12960 }, { "epoch": 1.23, "grad_norm": 57.3125, "learning_rate": 7.545854157224894e-07, "loss": 1.7529, "step": 12980 }, { "epoch": 1.23, "grad_norm": 86.0625, "learning_rate": 7.542072721916839e-07, "loss": 1.7475, "step": 13000 }, { "epoch": 1.23, "grad_norm": 77.625, "learning_rate": 7.538291286608784e-07, "loss": 1.7339, "step": 13020 }, { "epoch": 1.23, "grad_norm": 84.625, "learning_rate": 7.534509851300727e-07, "loss": 1.6812, "step": 13040 }, { "epoch": 1.23, "grad_norm": 58.34375, "learning_rate": 7.530728415992672e-07, "loss": 1.8598, "step": 13060 }, { "epoch": 1.24, "grad_norm": 47.34375, "learning_rate": 7.526946980684618e-07, "loss": 1.789, "step": 13080 }, { "epoch": 1.24, "grad_norm": 64.0, "learning_rate": 7.523165545376562e-07, "loss": 1.7233, "step": 13100 }, { "epoch": 1.24, "grad_norm": 71.5625, "learning_rate": 7.519384110068506e-07, "loss": 1.7302, "step": 13120 }, { "epoch": 1.24, "grad_norm": 63.90625, "learning_rate": 7.51560267476045e-07, "loss": 1.6977, "step": 13140 }, { "epoch": 1.24, "grad_norm": 47.09375, "learning_rate": 7.511821239452395e-07, "loss": 1.8249, "step": 13160 }, { "epoch": 1.25, "grad_norm": 52.75, "learning_rate": 7.50803980414434e-07, "loss": 1.7705, "step": 13180 }, { "epoch": 1.25, "grad_norm": 67.0, "learning_rate": 7.504258368836283e-07, "loss": 1.6708, "step": 13200 }, { "epoch": 1.25, "grad_norm": 82.5, "learning_rate": 7.500476933528228e-07, "loss": 1.8331, "step": 13220 }, { "epoch": 1.25, "grad_norm": 96.625, "learning_rate": 7.496695498220173e-07, "loss": 1.8325, "step": 13240 }, { "epoch": 1.25, "grad_norm": 56.3125, "learning_rate": 7.492914062912116e-07, "loss": 1.7972, "step": 13260 }, { "epoch": 1.26, "grad_norm": 56.53125, "learning_rate": 7.489132627604061e-07, "loss": 1.7719, "step": 13280 }, { "epoch": 1.26, "grad_norm": 87.875, "learning_rate": 7.485351192296006e-07, "loss": 1.7606, "step": 13300 }, { "epoch": 1.26, "grad_norm": 73.6875, "learning_rate": 7.48156975698795e-07, "loss": 1.7746, "step": 13320 }, { "epoch": 1.26, "grad_norm": 80.625, "learning_rate": 7.477788321679894e-07, "loss": 1.767, "step": 13340 }, { "epoch": 1.26, "grad_norm": 136.125, "learning_rate": 7.474006886371839e-07, "loss": 1.8342, "step": 13360 }, { "epoch": 1.26, "grad_norm": 58.78125, "learning_rate": 7.470225451063783e-07, "loss": 1.739, "step": 13380 }, { "epoch": 1.27, "grad_norm": 60.75, "learning_rate": 7.466444015755728e-07, "loss": 1.7303, "step": 13400 }, { "epoch": 1.27, "grad_norm": 78.375, "learning_rate": 7.462662580447673e-07, "loss": 1.6749, "step": 13420 }, { "epoch": 1.27, "grad_norm": 65.5, "learning_rate": 7.458881145139617e-07, "loss": 1.8015, "step": 13440 }, { "epoch": 1.27, "grad_norm": 65.5625, "learning_rate": 7.455099709831562e-07, "loss": 1.7586, "step": 13460 }, { "epoch": 1.27, "grad_norm": 48.46875, "learning_rate": 7.451318274523505e-07, "loss": 1.6745, "step": 13480 }, { "epoch": 1.28, "grad_norm": 76.75, "learning_rate": 7.44753683921545e-07, "loss": 1.8741, "step": 13500 }, { "epoch": 1.28, "grad_norm": 71.625, "learning_rate": 7.443755403907395e-07, "loss": 1.7672, "step": 13520 }, { "epoch": 1.28, "grad_norm": 76.9375, "learning_rate": 7.439973968599339e-07, "loss": 1.8212, "step": 13540 }, { "epoch": 1.28, "grad_norm": 59.1875, "learning_rate": 7.436192533291283e-07, "loss": 1.775, "step": 13560 }, { "epoch": 1.28, "grad_norm": 97.8125, "learning_rate": 7.432411097983228e-07, "loss": 1.8158, "step": 13580 }, { "epoch": 1.29, "grad_norm": 65.5, "learning_rate": 7.428629662675172e-07, "loss": 1.7197, "step": 13600 }, { "epoch": 1.29, "grad_norm": 71.8125, "learning_rate": 7.424848227367117e-07, "loss": 1.7534, "step": 13620 }, { "epoch": 1.29, "grad_norm": 83.3125, "learning_rate": 7.421066792059061e-07, "loss": 1.7694, "step": 13640 }, { "epoch": 1.29, "grad_norm": 65.75, "learning_rate": 7.417285356751005e-07, "loss": 1.831, "step": 13660 }, { "epoch": 1.29, "grad_norm": 102.5, "learning_rate": 7.41350392144295e-07, "loss": 1.7945, "step": 13680 }, { "epoch": 1.3, "grad_norm": 63.1875, "learning_rate": 7.409722486134893e-07, "loss": 1.8187, "step": 13700 }, { "epoch": 1.3, "grad_norm": 48.125, "learning_rate": 7.405941050826838e-07, "loss": 1.7061, "step": 13720 }, { "epoch": 1.3, "grad_norm": 63.78125, "learning_rate": 7.402159615518783e-07, "loss": 1.7076, "step": 13740 }, { "epoch": 1.3, "grad_norm": 44.59375, "learning_rate": 7.398378180210727e-07, "loss": 1.6653, "step": 13760 }, { "epoch": 1.3, "grad_norm": 62.8125, "learning_rate": 7.394596744902672e-07, "loss": 1.8329, "step": 13780 }, { "epoch": 1.3, "grad_norm": 77.25, "learning_rate": 7.390815309594617e-07, "loss": 1.8275, "step": 13800 }, { "epoch": 1.31, "grad_norm": 52.40625, "learning_rate": 7.387033874286561e-07, "loss": 1.7821, "step": 13820 }, { "epoch": 1.31, "grad_norm": 54.875, "learning_rate": 7.383252438978506e-07, "loss": 1.8171, "step": 13840 }, { "epoch": 1.31, "grad_norm": 68.375, "learning_rate": 7.37947100367045e-07, "loss": 1.7546, "step": 13860 }, { "epoch": 1.31, "grad_norm": 66.875, "learning_rate": 7.375689568362394e-07, "loss": 1.79, "step": 13880 }, { "epoch": 1.31, "grad_norm": 62.1875, "learning_rate": 7.371908133054339e-07, "loss": 1.88, "step": 13900 }, { "epoch": 1.32, "grad_norm": 77.5625, "learning_rate": 7.368126697746283e-07, "loss": 1.7613, "step": 13920 }, { "epoch": 1.32, "grad_norm": 58.21875, "learning_rate": 7.364345262438227e-07, "loss": 1.7634, "step": 13940 }, { "epoch": 1.32, "grad_norm": 55.0, "learning_rate": 7.360563827130172e-07, "loss": 1.8992, "step": 13960 }, { "epoch": 1.32, "grad_norm": 84.0625, "learning_rate": 7.356782391822117e-07, "loss": 1.7488, "step": 13980 }, { "epoch": 1.32, "grad_norm": 61.3125, "learning_rate": 7.35300095651406e-07, "loss": 1.7939, "step": 14000 }, { "epoch": 1.33, "grad_norm": 54.5625, "learning_rate": 7.349219521206005e-07, "loss": 1.7303, "step": 14020 }, { "epoch": 1.33, "grad_norm": 48.6875, "learning_rate": 7.345438085897949e-07, "loss": 1.7983, "step": 14040 }, { "epoch": 1.33, "grad_norm": 58.125, "learning_rate": 7.341656650589893e-07, "loss": 1.7608, "step": 14060 }, { "epoch": 1.33, "grad_norm": 74.75, "learning_rate": 7.337875215281838e-07, "loss": 1.8493, "step": 14080 }, { "epoch": 1.33, "grad_norm": 76.125, "learning_rate": 7.334093779973782e-07, "loss": 1.8708, "step": 14100 }, { "epoch": 1.33, "grad_norm": 54.34375, "learning_rate": 7.330312344665728e-07, "loss": 1.7848, "step": 14120 }, { "epoch": 1.34, "grad_norm": 82.1875, "learning_rate": 7.326530909357672e-07, "loss": 1.8407, "step": 14140 }, { "epoch": 1.34, "grad_norm": 125.0, "learning_rate": 7.322749474049616e-07, "loss": 1.8569, "step": 14160 }, { "epoch": 1.34, "grad_norm": 67.875, "learning_rate": 7.318968038741561e-07, "loss": 1.8067, "step": 14180 }, { "epoch": 1.34, "grad_norm": 76.375, "learning_rate": 7.315186603433506e-07, "loss": 1.7579, "step": 14200 }, { "epoch": 1.34, "grad_norm": 51.34375, "learning_rate": 7.311405168125449e-07, "loss": 1.7588, "step": 14220 }, { "epoch": 1.35, "grad_norm": 67.75, "learning_rate": 7.307623732817394e-07, "loss": 1.7899, "step": 14240 }, { "epoch": 1.35, "grad_norm": 62.34375, "learning_rate": 7.303842297509339e-07, "loss": 1.755, "step": 14260 }, { "epoch": 1.35, "grad_norm": 91.0, "learning_rate": 7.300060862201282e-07, "loss": 1.8114, "step": 14280 }, { "epoch": 1.35, "grad_norm": 64.9375, "learning_rate": 7.296279426893227e-07, "loss": 1.7733, "step": 14300 }, { "epoch": 1.35, "grad_norm": 99.375, "learning_rate": 7.292497991585172e-07, "loss": 1.8316, "step": 14320 }, { "epoch": 1.36, "grad_norm": 65.0, "learning_rate": 7.288716556277116e-07, "loss": 1.7512, "step": 14340 }, { "epoch": 1.36, "grad_norm": 76.0625, "learning_rate": 7.28493512096906e-07, "loss": 1.771, "step": 14360 }, { "epoch": 1.36, "grad_norm": 52.78125, "learning_rate": 7.281153685661004e-07, "loss": 1.7127, "step": 14380 }, { "epoch": 1.36, "grad_norm": 83.8125, "learning_rate": 7.277372250352949e-07, "loss": 1.836, "step": 14400 }, { "epoch": 1.36, "grad_norm": 79.6875, "learning_rate": 7.273590815044894e-07, "loss": 1.7365, "step": 14420 }, { "epoch": 1.37, "grad_norm": 56.96875, "learning_rate": 7.269809379736837e-07, "loss": 1.7652, "step": 14440 }, { "epoch": 1.37, "grad_norm": 56.59375, "learning_rate": 7.266027944428782e-07, "loss": 1.8393, "step": 14460 }, { "epoch": 1.37, "grad_norm": 51.40625, "learning_rate": 7.262246509120728e-07, "loss": 1.8095, "step": 14480 }, { "epoch": 1.37, "grad_norm": 62.125, "learning_rate": 7.258465073812671e-07, "loss": 1.7587, "step": 14500 }, { "epoch": 1.37, "grad_norm": 57.28125, "learning_rate": 7.254683638504616e-07, "loss": 1.6792, "step": 14520 }, { "epoch": 1.37, "grad_norm": 61.09375, "learning_rate": 7.250902203196561e-07, "loss": 1.7675, "step": 14540 }, { "epoch": 1.38, "grad_norm": 56.40625, "learning_rate": 7.247120767888505e-07, "loss": 1.8372, "step": 14560 }, { "epoch": 1.38, "grad_norm": 69.375, "learning_rate": 7.243339332580449e-07, "loss": 1.7053, "step": 14580 }, { "epoch": 1.38, "grad_norm": 103.375, "learning_rate": 7.239557897272394e-07, "loss": 1.7466, "step": 14600 }, { "epoch": 1.38, "grad_norm": 63.4375, "learning_rate": 7.235776461964338e-07, "loss": 1.7912, "step": 14620 }, { "epoch": 1.38, "grad_norm": 74.1875, "learning_rate": 7.231995026656283e-07, "loss": 1.7838, "step": 14640 }, { "epoch": 1.39, "grad_norm": 71.25, "learning_rate": 7.228213591348226e-07, "loss": 1.6906, "step": 14660 }, { "epoch": 1.39, "grad_norm": 72.75, "learning_rate": 7.224432156040171e-07, "loss": 1.8053, "step": 14680 }, { "epoch": 1.39, "grad_norm": 107.3125, "learning_rate": 7.220650720732116e-07, "loss": 1.8471, "step": 14700 }, { "epoch": 1.39, "grad_norm": 107.0, "learning_rate": 7.216869285424059e-07, "loss": 1.7873, "step": 14720 }, { "epoch": 1.39, "grad_norm": 98.0, "learning_rate": 7.213087850116004e-07, "loss": 1.728, "step": 14740 }, { "epoch": 1.4, "grad_norm": 62.3125, "learning_rate": 7.209306414807949e-07, "loss": 1.7931, "step": 14760 }, { "epoch": 1.4, "grad_norm": 77.625, "learning_rate": 7.205524979499893e-07, "loss": 1.8416, "step": 14780 }, { "epoch": 1.4, "grad_norm": 67.25, "learning_rate": 7.201743544191837e-07, "loss": 1.7702, "step": 14800 }, { "epoch": 1.4, "grad_norm": 73.5, "learning_rate": 7.197962108883783e-07, "loss": 1.751, "step": 14820 }, { "epoch": 1.4, "grad_norm": 90.6875, "learning_rate": 7.194180673575727e-07, "loss": 1.8144, "step": 14840 }, { "epoch": 1.4, "grad_norm": 65.8125, "learning_rate": 7.190399238267672e-07, "loss": 1.7848, "step": 14860 }, { "epoch": 1.41, "grad_norm": 75.1875, "learning_rate": 7.186617802959616e-07, "loss": 1.8424, "step": 14880 }, { "epoch": 1.41, "grad_norm": 69.5625, "learning_rate": 7.18283636765156e-07, "loss": 1.7273, "step": 14900 }, { "epoch": 1.41, "grad_norm": 62.9375, "learning_rate": 7.179054932343505e-07, "loss": 1.7568, "step": 14920 }, { "epoch": 1.41, "grad_norm": 49.75, "learning_rate": 7.175273497035449e-07, "loss": 1.8136, "step": 14940 }, { "epoch": 1.41, "grad_norm": 75.8125, "learning_rate": 7.171492061727393e-07, "loss": 1.8259, "step": 14960 }, { "epoch": 1.42, "grad_norm": 51.125, "learning_rate": 7.167710626419338e-07, "loss": 1.7377, "step": 14980 }, { "epoch": 1.42, "grad_norm": 61.8125, "learning_rate": 7.163929191111282e-07, "loss": 1.7435, "step": 15000 }, { "epoch": 1.42, "grad_norm": 45.65625, "learning_rate": 7.160147755803226e-07, "loss": 1.7305, "step": 15020 }, { "epoch": 1.42, "grad_norm": 48.03125, "learning_rate": 7.156366320495171e-07, "loss": 1.7079, "step": 15040 }, { "epoch": 1.42, "grad_norm": 85.0, "learning_rate": 7.152584885187115e-07, "loss": 1.8445, "step": 15060 }, { "epoch": 1.43, "grad_norm": 73.5625, "learning_rate": 7.148803449879059e-07, "loss": 1.7944, "step": 15080 }, { "epoch": 1.43, "grad_norm": 66.75, "learning_rate": 7.145022014571004e-07, "loss": 1.72, "step": 15100 }, { "epoch": 1.43, "grad_norm": 78.4375, "learning_rate": 7.141240579262948e-07, "loss": 1.8119, "step": 15120 }, { "epoch": 1.43, "grad_norm": 66.25, "learning_rate": 7.137459143954893e-07, "loss": 1.8503, "step": 15140 }, { "epoch": 1.43, "grad_norm": 78.625, "learning_rate": 7.133677708646837e-07, "loss": 1.7444, "step": 15160 }, { "epoch": 1.44, "grad_norm": 88.5625, "learning_rate": 7.129896273338782e-07, "loss": 1.7226, "step": 15180 }, { "epoch": 1.44, "grad_norm": 52.25, "learning_rate": 7.126114838030727e-07, "loss": 1.6872, "step": 15200 }, { "epoch": 1.44, "grad_norm": 129.375, "learning_rate": 7.122333402722672e-07, "loss": 1.7477, "step": 15220 }, { "epoch": 1.44, "grad_norm": 60.46875, "learning_rate": 7.118551967414615e-07, "loss": 1.6581, "step": 15240 }, { "epoch": 1.44, "grad_norm": 59.1875, "learning_rate": 7.11477053210656e-07, "loss": 1.7731, "step": 15260 }, { "epoch": 1.44, "grad_norm": 70.8125, "learning_rate": 7.110989096798505e-07, "loss": 1.7503, "step": 15280 }, { "epoch": 1.45, "grad_norm": 82.9375, "learning_rate": 7.107207661490448e-07, "loss": 1.6437, "step": 15300 }, { "epoch": 1.45, "grad_norm": 66.375, "learning_rate": 7.103426226182393e-07, "loss": 1.782, "step": 15320 }, { "epoch": 1.45, "grad_norm": 120.25, "learning_rate": 7.099644790874337e-07, "loss": 1.9366, "step": 15340 }, { "epoch": 1.45, "grad_norm": 71.875, "learning_rate": 7.095863355566282e-07, "loss": 1.8142, "step": 15360 }, { "epoch": 1.45, "grad_norm": 63.03125, "learning_rate": 7.092081920258226e-07, "loss": 1.8358, "step": 15380 }, { "epoch": 1.46, "grad_norm": 59.0, "learning_rate": 7.08830048495017e-07, "loss": 1.7758, "step": 15400 }, { "epoch": 1.46, "grad_norm": 65.0625, "learning_rate": 7.084519049642115e-07, "loss": 1.7326, "step": 15420 }, { "epoch": 1.46, "grad_norm": 59.875, "learning_rate": 7.08073761433406e-07, "loss": 1.7505, "step": 15440 }, { "epoch": 1.46, "grad_norm": 99.0, "learning_rate": 7.076956179026003e-07, "loss": 1.7379, "step": 15460 }, { "epoch": 1.46, "grad_norm": 79.5625, "learning_rate": 7.073174743717948e-07, "loss": 1.8941, "step": 15480 }, { "epoch": 1.47, "grad_norm": 70.375, "learning_rate": 7.069393308409893e-07, "loss": 1.8052, "step": 15500 }, { "epoch": 1.47, "grad_norm": 93.9375, "learning_rate": 7.065611873101837e-07, "loss": 1.8094, "step": 15520 }, { "epoch": 1.47, "grad_norm": 63.5625, "learning_rate": 7.061830437793782e-07, "loss": 1.7806, "step": 15540 }, { "epoch": 1.47, "grad_norm": 57.65625, "learning_rate": 7.058049002485727e-07, "loss": 1.7275, "step": 15560 }, { "epoch": 1.47, "grad_norm": 58.96875, "learning_rate": 7.054267567177671e-07, "loss": 1.6737, "step": 15580 }, { "epoch": 1.47, "grad_norm": 66.75, "learning_rate": 7.050486131869615e-07, "loss": 1.7801, "step": 15600 }, { "epoch": 1.48, "grad_norm": 48.59375, "learning_rate": 7.046704696561559e-07, "loss": 1.7297, "step": 15620 }, { "epoch": 1.48, "grad_norm": 69.6875, "learning_rate": 7.042923261253504e-07, "loss": 1.7964, "step": 15640 }, { "epoch": 1.48, "grad_norm": 49.28125, "learning_rate": 7.039141825945449e-07, "loss": 1.8027, "step": 15660 }, { "epoch": 1.48, "grad_norm": 55.46875, "learning_rate": 7.035360390637392e-07, "loss": 1.7836, "step": 15680 }, { "epoch": 1.48, "grad_norm": 80.5, "learning_rate": 7.031578955329337e-07, "loss": 1.7742, "step": 15700 }, { "epoch": 1.49, "grad_norm": 63.03125, "learning_rate": 7.027797520021282e-07, "loss": 1.7406, "step": 15720 }, { "epoch": 1.49, "grad_norm": 51.53125, "learning_rate": 7.024016084713225e-07, "loss": 1.7009, "step": 15740 }, { "epoch": 1.49, "grad_norm": 113.25, "learning_rate": 7.02023464940517e-07, "loss": 1.7417, "step": 15760 }, { "epoch": 1.49, "grad_norm": 58.96875, "learning_rate": 7.016453214097115e-07, "loss": 1.8331, "step": 15780 }, { "epoch": 1.49, "grad_norm": 76.3125, "learning_rate": 7.012671778789059e-07, "loss": 1.702, "step": 15800 }, { "epoch": 1.5, "grad_norm": 56.3125, "learning_rate": 7.008890343481003e-07, "loss": 1.8167, "step": 15820 }, { "epoch": 1.5, "grad_norm": 56.78125, "learning_rate": 7.005108908172948e-07, "loss": 1.7286, "step": 15840 }, { "epoch": 1.5, "grad_norm": 69.6875, "learning_rate": 7.001327472864892e-07, "loss": 1.7007, "step": 15860 }, { "epoch": 1.5, "grad_norm": 66.125, "learning_rate": 6.997546037556838e-07, "loss": 1.8052, "step": 15880 }, { "epoch": 1.5, "grad_norm": 57.15625, "learning_rate": 6.993764602248782e-07, "loss": 1.7057, "step": 15900 }, { "epoch": 1.51, "grad_norm": 69.0, "learning_rate": 6.989983166940726e-07, "loss": 1.8017, "step": 15920 }, { "epoch": 1.51, "grad_norm": 61.59375, "learning_rate": 6.986201731632671e-07, "loss": 1.8606, "step": 15940 }, { "epoch": 1.51, "grad_norm": 61.96875, "learning_rate": 6.982420296324614e-07, "loss": 1.793, "step": 15960 }, { "epoch": 1.51, "grad_norm": 72.0, "learning_rate": 6.978638861016559e-07, "loss": 1.7604, "step": 15980 }, { "epoch": 1.51, "grad_norm": 67.625, "learning_rate": 6.974857425708504e-07, "loss": 1.6722, "step": 16000 }, { "epoch": 1.51, "grad_norm": 62.40625, "learning_rate": 6.971075990400448e-07, "loss": 1.804, "step": 16020 }, { "epoch": 1.52, "grad_norm": 58.84375, "learning_rate": 6.967294555092392e-07, "loss": 1.7244, "step": 16040 }, { "epoch": 1.52, "grad_norm": 49.125, "learning_rate": 6.963513119784337e-07, "loss": 1.7763, "step": 16060 }, { "epoch": 1.52, "grad_norm": 67.1875, "learning_rate": 6.959731684476281e-07, "loss": 1.7623, "step": 16080 }, { "epoch": 1.52, "grad_norm": 62.1875, "learning_rate": 6.955950249168225e-07, "loss": 1.8598, "step": 16100 }, { "epoch": 1.52, "grad_norm": 82.6875, "learning_rate": 6.95216881386017e-07, "loss": 1.789, "step": 16120 }, { "epoch": 1.53, "grad_norm": 84.0, "learning_rate": 6.948387378552114e-07, "loss": 1.7023, "step": 16140 }, { "epoch": 1.53, "grad_norm": 92.4375, "learning_rate": 6.944605943244059e-07, "loss": 1.7511, "step": 16160 }, { "epoch": 1.53, "grad_norm": 69.125, "learning_rate": 6.940824507936003e-07, "loss": 1.8223, "step": 16180 }, { "epoch": 1.53, "grad_norm": 61.15625, "learning_rate": 6.937043072627947e-07, "loss": 1.8385, "step": 16200 }, { "epoch": 1.53, "grad_norm": 71.0, "learning_rate": 6.933261637319893e-07, "loss": 1.6804, "step": 16220 }, { "epoch": 1.54, "grad_norm": 57.4375, "learning_rate": 6.929480202011837e-07, "loss": 1.8056, "step": 16240 }, { "epoch": 1.54, "grad_norm": 78.875, "learning_rate": 6.925698766703781e-07, "loss": 1.8111, "step": 16260 }, { "epoch": 1.54, "grad_norm": 64.125, "learning_rate": 6.921917331395726e-07, "loss": 1.6361, "step": 16280 }, { "epoch": 1.54, "grad_norm": 75.125, "learning_rate": 6.91813589608767e-07, "loss": 1.7612, "step": 16300 }, { "epoch": 1.54, "grad_norm": 64.25, "learning_rate": 6.914354460779614e-07, "loss": 1.7896, "step": 16320 }, { "epoch": 1.54, "grad_norm": 58.96875, "learning_rate": 6.910573025471559e-07, "loss": 1.7171, "step": 16340 }, { "epoch": 1.55, "grad_norm": 56.71875, "learning_rate": 6.906791590163503e-07, "loss": 1.7889, "step": 16360 }, { "epoch": 1.55, "grad_norm": 53.3125, "learning_rate": 6.903010154855448e-07, "loss": 1.7833, "step": 16380 }, { "epoch": 1.55, "grad_norm": 60.9375, "learning_rate": 6.899228719547392e-07, "loss": 1.8044, "step": 16400 }, { "epoch": 1.55, "grad_norm": 64.8125, "learning_rate": 6.895447284239336e-07, "loss": 1.8078, "step": 16420 }, { "epoch": 1.55, "grad_norm": 69.375, "learning_rate": 6.891665848931281e-07, "loss": 1.69, "step": 16440 }, { "epoch": 1.56, "grad_norm": 52.5625, "learning_rate": 6.887884413623226e-07, "loss": 1.7741, "step": 16460 }, { "epoch": 1.56, "grad_norm": 50.21875, "learning_rate": 6.884102978315169e-07, "loss": 1.8409, "step": 16480 }, { "epoch": 1.56, "grad_norm": 55.34375, "learning_rate": 6.880321543007114e-07, "loss": 1.8224, "step": 16500 }, { "epoch": 1.56, "grad_norm": 58.6875, "learning_rate": 6.876540107699058e-07, "loss": 1.7456, "step": 16520 }, { "epoch": 1.56, "grad_norm": 52.8125, "learning_rate": 6.872758672391002e-07, "loss": 1.8105, "step": 16540 }, { "epoch": 1.57, "grad_norm": 81.6875, "learning_rate": 6.868977237082947e-07, "loss": 1.863, "step": 16560 }, { "epoch": 1.57, "grad_norm": 56.0625, "learning_rate": 6.865195801774892e-07, "loss": 1.717, "step": 16580 }, { "epoch": 1.57, "grad_norm": 41.25, "learning_rate": 6.861414366466837e-07, "loss": 1.9108, "step": 16600 }, { "epoch": 1.57, "grad_norm": 58.375, "learning_rate": 6.857632931158781e-07, "loss": 1.7419, "step": 16620 }, { "epoch": 1.57, "grad_norm": 56.625, "learning_rate": 6.853851495850725e-07, "loss": 1.861, "step": 16640 }, { "epoch": 1.57, "grad_norm": 62.6875, "learning_rate": 6.85007006054267e-07, "loss": 1.9066, "step": 16660 }, { "epoch": 1.58, "grad_norm": 52.15625, "learning_rate": 6.846288625234614e-07, "loss": 1.8078, "step": 16680 }, { "epoch": 1.58, "grad_norm": 70.9375, "learning_rate": 6.842507189926558e-07, "loss": 1.7877, "step": 16700 }, { "epoch": 1.58, "grad_norm": 71.875, "learning_rate": 6.838725754618503e-07, "loss": 1.7093, "step": 16720 }, { "epoch": 1.58, "grad_norm": 75.0, "learning_rate": 6.834944319310448e-07, "loss": 1.8284, "step": 16740 }, { "epoch": 1.58, "grad_norm": 73.6875, "learning_rate": 6.831162884002391e-07, "loss": 1.6915, "step": 16760 }, { "epoch": 1.59, "grad_norm": 66.625, "learning_rate": 6.827381448694336e-07, "loss": 1.8282, "step": 16780 }, { "epoch": 1.59, "grad_norm": 65.4375, "learning_rate": 6.823600013386281e-07, "loss": 1.7428, "step": 16800 }, { "epoch": 1.59, "grad_norm": 64.6875, "learning_rate": 6.819818578078225e-07, "loss": 1.7914, "step": 16820 }, { "epoch": 1.59, "grad_norm": 94.0, "learning_rate": 6.816037142770169e-07, "loss": 1.7644, "step": 16840 }, { "epoch": 1.59, "grad_norm": 61.6875, "learning_rate": 6.812255707462113e-07, "loss": 1.7501, "step": 16860 }, { "epoch": 1.6, "grad_norm": 70.625, "learning_rate": 6.808474272154058e-07, "loss": 1.7679, "step": 16880 }, { "epoch": 1.6, "grad_norm": 82.1875, "learning_rate": 6.804692836846002e-07, "loss": 1.7524, "step": 16900 }, { "epoch": 1.6, "grad_norm": 43.09375, "learning_rate": 6.800911401537947e-07, "loss": 1.7659, "step": 16920 }, { "epoch": 1.6, "grad_norm": 53.96875, "learning_rate": 6.797129966229892e-07, "loss": 1.6591, "step": 16940 }, { "epoch": 1.6, "grad_norm": 51.46875, "learning_rate": 6.793348530921837e-07, "loss": 1.6777, "step": 16960 }, { "epoch": 1.61, "grad_norm": 56.34375, "learning_rate": 6.78956709561378e-07, "loss": 1.7987, "step": 16980 }, { "epoch": 1.61, "grad_norm": 86.1875, "learning_rate": 6.785785660305725e-07, "loss": 1.8583, "step": 17000 }, { "epoch": 1.61, "grad_norm": 78.0625, "learning_rate": 6.78200422499767e-07, "loss": 1.8726, "step": 17020 }, { "epoch": 1.61, "grad_norm": 88.375, "learning_rate": 6.778222789689614e-07, "loss": 1.8256, "step": 17040 }, { "epoch": 1.61, "grad_norm": 54.96875, "learning_rate": 6.774441354381558e-07, "loss": 1.8137, "step": 17060 }, { "epoch": 1.61, "grad_norm": 55.0, "learning_rate": 6.770659919073503e-07, "loss": 1.6887, "step": 17080 }, { "epoch": 1.62, "grad_norm": 55.03125, "learning_rate": 6.766878483765447e-07, "loss": 1.807, "step": 17100 }, { "epoch": 1.62, "grad_norm": 67.875, "learning_rate": 6.763097048457391e-07, "loss": 1.7649, "step": 17120 }, { "epoch": 1.62, "grad_norm": 58.90625, "learning_rate": 6.759315613149335e-07, "loss": 1.7151, "step": 17140 }, { "epoch": 1.62, "grad_norm": 72.8125, "learning_rate": 6.75553417784128e-07, "loss": 1.8062, "step": 17160 }, { "epoch": 1.62, "grad_norm": 69.0, "learning_rate": 6.751752742533225e-07, "loss": 1.7687, "step": 17180 }, { "epoch": 1.63, "grad_norm": 66.5, "learning_rate": 6.747971307225168e-07, "loss": 1.7421, "step": 17200 }, { "epoch": 1.63, "grad_norm": 57.625, "learning_rate": 6.744189871917113e-07, "loss": 1.8331, "step": 17220 }, { "epoch": 1.63, "grad_norm": 60.1875, "learning_rate": 6.740408436609058e-07, "loss": 1.7498, "step": 17240 }, { "epoch": 1.63, "grad_norm": 51.65625, "learning_rate": 6.736627001301001e-07, "loss": 1.7102, "step": 17260 }, { "epoch": 1.63, "grad_norm": 79.625, "learning_rate": 6.732845565992947e-07, "loss": 1.714, "step": 17280 }, { "epoch": 1.64, "grad_norm": 52.46875, "learning_rate": 6.729064130684892e-07, "loss": 1.6849, "step": 17300 }, { "epoch": 1.64, "grad_norm": 61.3125, "learning_rate": 6.725282695376836e-07, "loss": 1.8046, "step": 17320 }, { "epoch": 1.64, "grad_norm": 52.0625, "learning_rate": 6.72150126006878e-07, "loss": 1.6312, "step": 17340 }, { "epoch": 1.64, "grad_norm": 54.71875, "learning_rate": 6.717719824760725e-07, "loss": 1.6886, "step": 17360 }, { "epoch": 1.64, "grad_norm": 51.375, "learning_rate": 6.713938389452669e-07, "loss": 1.754, "step": 17380 }, { "epoch": 1.64, "grad_norm": 55.90625, "learning_rate": 6.710156954144614e-07, "loss": 1.7469, "step": 17400 }, { "epoch": 1.65, "grad_norm": 61.78125, "learning_rate": 6.706375518836558e-07, "loss": 1.7955, "step": 17420 }, { "epoch": 1.65, "grad_norm": 89.75, "learning_rate": 6.702594083528502e-07, "loss": 1.7665, "step": 17440 }, { "epoch": 1.65, "grad_norm": 56.0625, "learning_rate": 6.698812648220447e-07, "loss": 1.8611, "step": 17460 }, { "epoch": 1.65, "grad_norm": 63.625, "learning_rate": 6.69503121291239e-07, "loss": 1.7571, "step": 17480 }, { "epoch": 1.65, "grad_norm": 70.375, "learning_rate": 6.691249777604335e-07, "loss": 1.8701, "step": 17500 }, { "epoch": 1.66, "grad_norm": 91.1875, "learning_rate": 6.68746834229628e-07, "loss": 1.7109, "step": 17520 }, { "epoch": 1.66, "grad_norm": 51.1875, "learning_rate": 6.683686906988224e-07, "loss": 1.7953, "step": 17540 }, { "epoch": 1.66, "grad_norm": 57.1875, "learning_rate": 6.679905471680168e-07, "loss": 1.6835, "step": 17560 }, { "epoch": 1.66, "grad_norm": 62.1875, "learning_rate": 6.676124036372113e-07, "loss": 1.8026, "step": 17580 }, { "epoch": 1.66, "grad_norm": 58.125, "learning_rate": 6.672342601064057e-07, "loss": 1.698, "step": 17600 }, { "epoch": 1.67, "grad_norm": 53.4375, "learning_rate": 6.668561165756003e-07, "loss": 1.7425, "step": 17620 }, { "epoch": 1.67, "grad_norm": 58.4375, "learning_rate": 6.664779730447947e-07, "loss": 1.8302, "step": 17640 }, { "epoch": 1.67, "grad_norm": 57.84375, "learning_rate": 6.660998295139891e-07, "loss": 1.8451, "step": 17660 }, { "epoch": 1.67, "grad_norm": 60.84375, "learning_rate": 6.657216859831836e-07, "loss": 1.7618, "step": 17680 }, { "epoch": 1.67, "grad_norm": 64.25, "learning_rate": 6.65343542452378e-07, "loss": 1.7403, "step": 17700 }, { "epoch": 1.68, "grad_norm": 81.0, "learning_rate": 6.649653989215724e-07, "loss": 1.7854, "step": 17720 }, { "epoch": 1.68, "grad_norm": 57.125, "learning_rate": 6.645872553907669e-07, "loss": 1.7188, "step": 17740 }, { "epoch": 1.68, "grad_norm": 75.1875, "learning_rate": 6.642091118599614e-07, "loss": 1.7599, "step": 17760 }, { "epoch": 1.68, "grad_norm": 62.75, "learning_rate": 6.638309683291557e-07, "loss": 1.7381, "step": 17780 }, { "epoch": 1.68, "grad_norm": 52.96875, "learning_rate": 6.634528247983502e-07, "loss": 1.7222, "step": 17800 }, { "epoch": 1.68, "grad_norm": 54.875, "learning_rate": 6.630746812675446e-07, "loss": 1.808, "step": 17820 }, { "epoch": 1.69, "grad_norm": 51.03125, "learning_rate": 6.62696537736739e-07, "loss": 1.599, "step": 17840 }, { "epoch": 1.69, "grad_norm": 59.09375, "learning_rate": 6.623183942059335e-07, "loss": 1.808, "step": 17860 }, { "epoch": 1.69, "grad_norm": 77.6875, "learning_rate": 6.619402506751279e-07, "loss": 1.7878, "step": 17880 }, { "epoch": 1.69, "grad_norm": 63.5625, "learning_rate": 6.615621071443224e-07, "loss": 1.7078, "step": 17900 }, { "epoch": 1.69, "grad_norm": 47.6875, "learning_rate": 6.611839636135168e-07, "loss": 1.7639, "step": 17920 }, { "epoch": 1.7, "grad_norm": 63.34375, "learning_rate": 6.608058200827112e-07, "loss": 1.8108, "step": 17940 }, { "epoch": 1.7, "grad_norm": 52.8125, "learning_rate": 6.604276765519058e-07, "loss": 1.8749, "step": 17960 }, { "epoch": 1.7, "grad_norm": 63.28125, "learning_rate": 6.600495330211003e-07, "loss": 1.8105, "step": 17980 }, { "epoch": 1.7, "grad_norm": 68.875, "learning_rate": 6.596713894902946e-07, "loss": 1.6754, "step": 18000 }, { "epoch": 1.7, "grad_norm": 88.0625, "learning_rate": 6.592932459594891e-07, "loss": 1.7055, "step": 18020 }, { "epoch": 1.71, "grad_norm": 69.5625, "learning_rate": 6.589151024286836e-07, "loss": 1.7658, "step": 18040 }, { "epoch": 1.71, "grad_norm": 48.65625, "learning_rate": 6.58536958897878e-07, "loss": 1.747, "step": 18060 }, { "epoch": 1.71, "grad_norm": 49.3125, "learning_rate": 6.581588153670724e-07, "loss": 1.8633, "step": 18080 }, { "epoch": 1.71, "grad_norm": 68.5625, "learning_rate": 6.577806718362668e-07, "loss": 1.7298, "step": 18100 }, { "epoch": 1.71, "grad_norm": 56.59375, "learning_rate": 6.574025283054613e-07, "loss": 1.6837, "step": 18120 }, { "epoch": 1.71, "grad_norm": 90.5625, "learning_rate": 6.570243847746557e-07, "loss": 1.7123, "step": 18140 }, { "epoch": 1.72, "grad_norm": 136.875, "learning_rate": 6.566462412438501e-07, "loss": 1.7742, "step": 18160 }, { "epoch": 1.72, "grad_norm": 65.0, "learning_rate": 6.562680977130446e-07, "loss": 1.7315, "step": 18180 }, { "epoch": 1.72, "grad_norm": 57.875, "learning_rate": 6.558899541822391e-07, "loss": 1.7881, "step": 18200 }, { "epoch": 1.72, "grad_norm": 61.90625, "learning_rate": 6.555118106514334e-07, "loss": 1.6969, "step": 18220 }, { "epoch": 1.72, "grad_norm": 65.8125, "learning_rate": 6.551336671206279e-07, "loss": 1.7261, "step": 18240 }, { "epoch": 1.73, "grad_norm": 59.03125, "learning_rate": 6.547555235898224e-07, "loss": 1.7779, "step": 18260 }, { "epoch": 1.73, "grad_norm": 52.09375, "learning_rate": 6.543773800590167e-07, "loss": 1.7624, "step": 18280 }, { "epoch": 1.73, "grad_norm": 79.9375, "learning_rate": 6.539992365282112e-07, "loss": 1.8038, "step": 18300 }, { "epoch": 1.73, "grad_norm": 85.0, "learning_rate": 6.536210929974058e-07, "loss": 1.8728, "step": 18320 }, { "epoch": 1.73, "grad_norm": 63.71875, "learning_rate": 6.532429494666002e-07, "loss": 1.8022, "step": 18340 }, { "epoch": 1.74, "grad_norm": 67.3125, "learning_rate": 6.528648059357946e-07, "loss": 1.7176, "step": 18360 }, { "epoch": 1.74, "grad_norm": 69.8125, "learning_rate": 6.524866624049891e-07, "loss": 1.8417, "step": 18380 }, { "epoch": 1.74, "grad_norm": 57.25, "learning_rate": 6.521085188741835e-07, "loss": 1.8464, "step": 18400 }, { "epoch": 1.74, "grad_norm": 56.71875, "learning_rate": 6.51730375343378e-07, "loss": 1.6192, "step": 18420 }, { "epoch": 1.74, "grad_norm": 79.1875, "learning_rate": 6.513522318125723e-07, "loss": 1.8753, "step": 18440 }, { "epoch": 1.75, "grad_norm": 53.375, "learning_rate": 6.509740882817668e-07, "loss": 1.6674, "step": 18460 }, { "epoch": 1.75, "grad_norm": 63.75, "learning_rate": 6.505959447509613e-07, "loss": 1.7284, "step": 18480 }, { "epoch": 1.75, "grad_norm": 72.5, "learning_rate": 6.502178012201556e-07, "loss": 1.8274, "step": 18500 }, { "epoch": 1.75, "grad_norm": 51.3125, "learning_rate": 6.498396576893501e-07, "loss": 1.7632, "step": 18520 }, { "epoch": 1.75, "grad_norm": 51.65625, "learning_rate": 6.494615141585446e-07, "loss": 1.7839, "step": 18540 }, { "epoch": 1.75, "grad_norm": 53.46875, "learning_rate": 6.49083370627739e-07, "loss": 1.6718, "step": 18560 }, { "epoch": 1.76, "grad_norm": 71.0625, "learning_rate": 6.487052270969334e-07, "loss": 1.6606, "step": 18580 }, { "epoch": 1.76, "grad_norm": 62.9375, "learning_rate": 6.483270835661279e-07, "loss": 1.8147, "step": 18600 }, { "epoch": 1.76, "grad_norm": 63.1875, "learning_rate": 6.479489400353223e-07, "loss": 1.8021, "step": 18620 }, { "epoch": 1.76, "grad_norm": 54.53125, "learning_rate": 6.475707965045167e-07, "loss": 1.6672, "step": 18640 }, { "epoch": 1.76, "grad_norm": 69.0, "learning_rate": 6.471926529737113e-07, "loss": 1.744, "step": 18660 }, { "epoch": 1.77, "grad_norm": 69.5625, "learning_rate": 6.468145094429057e-07, "loss": 1.7739, "step": 18680 }, { "epoch": 1.77, "grad_norm": 53.875, "learning_rate": 6.464363659121002e-07, "loss": 1.7764, "step": 18700 }, { "epoch": 1.77, "grad_norm": 54.0625, "learning_rate": 6.460582223812946e-07, "loss": 1.7612, "step": 18720 }, { "epoch": 1.77, "grad_norm": 64.4375, "learning_rate": 6.45680078850489e-07, "loss": 1.632, "step": 18740 }, { "epoch": 1.77, "grad_norm": 91.1875, "learning_rate": 6.453019353196835e-07, "loss": 1.8534, "step": 18760 }, { "epoch": 1.78, "grad_norm": 95.5625, "learning_rate": 6.449237917888779e-07, "loss": 1.8283, "step": 18780 }, { "epoch": 1.78, "grad_norm": 68.5625, "learning_rate": 6.445456482580723e-07, "loss": 1.7274, "step": 18800 }, { "epoch": 1.78, "grad_norm": 75.875, "learning_rate": 6.441675047272668e-07, "loss": 1.7398, "step": 18820 }, { "epoch": 1.78, "grad_norm": 73.5625, "learning_rate": 6.437893611964612e-07, "loss": 1.717, "step": 18840 }, { "epoch": 1.78, "grad_norm": 62.34375, "learning_rate": 6.434112176656556e-07, "loss": 1.7714, "step": 18860 }, { "epoch": 1.78, "grad_norm": 86.5625, "learning_rate": 6.430330741348501e-07, "loss": 1.8233, "step": 18880 }, { "epoch": 1.79, "grad_norm": 39.03125, "learning_rate": 6.426549306040445e-07, "loss": 1.7017, "step": 18900 }, { "epoch": 1.79, "grad_norm": 74.8125, "learning_rate": 6.42276787073239e-07, "loss": 1.7794, "step": 18920 }, { "epoch": 1.79, "grad_norm": 58.96875, "learning_rate": 6.418986435424334e-07, "loss": 1.752, "step": 18940 }, { "epoch": 1.79, "grad_norm": 56.6875, "learning_rate": 6.415205000116278e-07, "loss": 1.6723, "step": 18960 }, { "epoch": 1.79, "grad_norm": 76.3125, "learning_rate": 6.411423564808223e-07, "loss": 1.7992, "step": 18980 }, { "epoch": 1.8, "grad_norm": 46.125, "learning_rate": 6.407642129500166e-07, "loss": 1.8077, "step": 19000 }, { "epoch": 1.8, "grad_norm": 68.0, "learning_rate": 6.403860694192112e-07, "loss": 1.6902, "step": 19020 }, { "epoch": 1.8, "grad_norm": 66.4375, "learning_rate": 6.400079258884057e-07, "loss": 1.7471, "step": 19040 }, { "epoch": 1.8, "grad_norm": 72.375, "learning_rate": 6.396297823576001e-07, "loss": 1.7479, "step": 19060 }, { "epoch": 1.8, "grad_norm": 109.5625, "learning_rate": 6.392516388267945e-07, "loss": 1.7985, "step": 19080 }, { "epoch": 1.81, "grad_norm": 74.625, "learning_rate": 6.38873495295989e-07, "loss": 1.7164, "step": 19100 }, { "epoch": 1.81, "grad_norm": 65.875, "learning_rate": 6.384953517651834e-07, "loss": 1.6436, "step": 19120 }, { "epoch": 1.81, "grad_norm": 61.3125, "learning_rate": 6.381172082343779e-07, "loss": 1.6285, "step": 19140 }, { "epoch": 1.81, "grad_norm": 71.25, "learning_rate": 6.377390647035723e-07, "loss": 1.7548, "step": 19160 }, { "epoch": 1.81, "grad_norm": 57.875, "learning_rate": 6.373609211727667e-07, "loss": 1.6347, "step": 19180 }, { "epoch": 1.82, "grad_norm": 46.90625, "learning_rate": 6.369827776419612e-07, "loss": 1.7618, "step": 19200 }, { "epoch": 1.82, "grad_norm": 54.84375, "learning_rate": 6.366046341111557e-07, "loss": 1.7912, "step": 19220 }, { "epoch": 1.82, "grad_norm": 55.0625, "learning_rate": 6.3622649058035e-07, "loss": 1.7301, "step": 19240 }, { "epoch": 1.82, "grad_norm": 48.1875, "learning_rate": 6.358483470495445e-07, "loss": 1.7273, "step": 19260 }, { "epoch": 1.82, "grad_norm": 76.125, "learning_rate": 6.35470203518739e-07, "loss": 1.7014, "step": 19280 }, { "epoch": 1.82, "grad_norm": 49.40625, "learning_rate": 6.350920599879333e-07, "loss": 1.8441, "step": 19300 }, { "epoch": 1.83, "grad_norm": 76.75, "learning_rate": 6.347139164571278e-07, "loss": 1.7316, "step": 19320 }, { "epoch": 1.83, "grad_norm": 78.25, "learning_rate": 6.343357729263222e-07, "loss": 1.7168, "step": 19340 }, { "epoch": 1.83, "grad_norm": 68.0, "learning_rate": 6.339576293955168e-07, "loss": 1.8219, "step": 19360 }, { "epoch": 1.83, "grad_norm": 55.75, "learning_rate": 6.335794858647112e-07, "loss": 1.7437, "step": 19380 }, { "epoch": 1.83, "grad_norm": 56.90625, "learning_rate": 6.332013423339056e-07, "loss": 1.7404, "step": 19400 }, { "epoch": 1.84, "grad_norm": 51.125, "learning_rate": 6.328231988031001e-07, "loss": 1.731, "step": 19420 }, { "epoch": 1.84, "grad_norm": 59.0625, "learning_rate": 6.324450552722946e-07, "loss": 1.7489, "step": 19440 }, { "epoch": 1.84, "grad_norm": 93.5625, "learning_rate": 6.320669117414889e-07, "loss": 1.7179, "step": 19460 }, { "epoch": 1.84, "grad_norm": 77.3125, "learning_rate": 6.316887682106834e-07, "loss": 1.7455, "step": 19480 }, { "epoch": 1.84, "grad_norm": 60.21875, "learning_rate": 6.313106246798779e-07, "loss": 1.7477, "step": 19500 }, { "epoch": 1.85, "grad_norm": 67.875, "learning_rate": 6.309324811490722e-07, "loss": 1.8577, "step": 19520 }, { "epoch": 1.85, "grad_norm": 84.9375, "learning_rate": 6.305543376182667e-07, "loss": 1.8332, "step": 19540 }, { "epoch": 1.85, "grad_norm": 69.3125, "learning_rate": 6.301761940874612e-07, "loss": 1.7734, "step": 19560 }, { "epoch": 1.85, "grad_norm": 55.9375, "learning_rate": 6.297980505566556e-07, "loss": 1.7441, "step": 19580 }, { "epoch": 1.85, "grad_norm": 51.25, "learning_rate": 6.2941990702585e-07, "loss": 1.7063, "step": 19600 }, { "epoch": 1.85, "grad_norm": 62.21875, "learning_rate": 6.290417634950445e-07, "loss": 1.8064, "step": 19620 }, { "epoch": 1.86, "grad_norm": 82.3125, "learning_rate": 6.286636199642389e-07, "loss": 1.9288, "step": 19640 }, { "epoch": 1.86, "grad_norm": 97.4375, "learning_rate": 6.282854764334333e-07, "loss": 1.7964, "step": 19660 }, { "epoch": 1.86, "grad_norm": 71.5, "learning_rate": 6.279073329026277e-07, "loss": 1.7302, "step": 19680 }, { "epoch": 1.86, "grad_norm": 54.21875, "learning_rate": 6.275291893718222e-07, "loss": 1.7965, "step": 19700 }, { "epoch": 1.86, "grad_norm": 68.4375, "learning_rate": 6.271510458410168e-07, "loss": 1.735, "step": 19720 }, { "epoch": 1.87, "grad_norm": 65.0625, "learning_rate": 6.267729023102111e-07, "loss": 1.7821, "step": 19740 }, { "epoch": 1.87, "grad_norm": 77.1875, "learning_rate": 6.263947587794056e-07, "loss": 1.6467, "step": 19760 }, { "epoch": 1.87, "grad_norm": 92.875, "learning_rate": 6.260166152486001e-07, "loss": 1.8485, "step": 19780 }, { "epoch": 1.87, "grad_norm": 43.9375, "learning_rate": 6.256384717177945e-07, "loss": 1.7091, "step": 19800 }, { "epoch": 1.87, "grad_norm": 53.25, "learning_rate": 6.252603281869889e-07, "loss": 1.8332, "step": 19820 }, { "epoch": 1.88, "grad_norm": 48.28125, "learning_rate": 6.248821846561834e-07, "loss": 1.8508, "step": 19840 }, { "epoch": 1.88, "grad_norm": 53.46875, "learning_rate": 6.245040411253778e-07, "loss": 1.744, "step": 19860 }, { "epoch": 1.88, "grad_norm": 64.9375, "learning_rate": 6.241258975945722e-07, "loss": 1.7944, "step": 19880 }, { "epoch": 1.88, "grad_norm": 59.375, "learning_rate": 6.237477540637667e-07, "loss": 1.7726, "step": 19900 }, { "epoch": 1.88, "grad_norm": 45.1875, "learning_rate": 6.233696105329611e-07, "loss": 1.778, "step": 19920 }, { "epoch": 1.89, "grad_norm": 49.9375, "learning_rate": 6.229914670021556e-07, "loss": 1.7223, "step": 19940 }, { "epoch": 1.89, "grad_norm": 69.0625, "learning_rate": 6.226133234713499e-07, "loss": 1.807, "step": 19960 }, { "epoch": 1.89, "grad_norm": 45.78125, "learning_rate": 6.222351799405444e-07, "loss": 1.8191, "step": 19980 }, { "epoch": 1.89, "grad_norm": 62.15625, "learning_rate": 6.218570364097389e-07, "loss": 1.8354, "step": 20000 }, { "epoch": 1.89, "grad_norm": 75.625, "learning_rate": 6.214788928789332e-07, "loss": 1.7356, "step": 20020 }, { "epoch": 1.89, "grad_norm": 55.34375, "learning_rate": 6.211007493481277e-07, "loss": 1.7371, "step": 20040 }, { "epoch": 1.9, "grad_norm": 57.78125, "learning_rate": 6.207226058173223e-07, "loss": 1.7965, "step": 20060 }, { "epoch": 1.9, "grad_norm": 110.1875, "learning_rate": 6.203444622865167e-07, "loss": 1.6851, "step": 20080 }, { "epoch": 1.9, "grad_norm": 74.1875, "learning_rate": 6.199663187557111e-07, "loss": 1.7433, "step": 20100 }, { "epoch": 1.9, "grad_norm": 54.46875, "learning_rate": 6.195881752249056e-07, "loss": 1.7359, "step": 20120 }, { "epoch": 1.9, "grad_norm": 80.375, "learning_rate": 6.192100316941e-07, "loss": 1.7647, "step": 20140 }, { "epoch": 1.91, "grad_norm": 67.625, "learning_rate": 6.188318881632945e-07, "loss": 1.6704, "step": 20160 }, { "epoch": 1.91, "grad_norm": 78.1875, "learning_rate": 6.184537446324889e-07, "loss": 1.8029, "step": 20180 }, { "epoch": 1.91, "grad_norm": 55.59375, "learning_rate": 6.180756011016833e-07, "loss": 1.7328, "step": 20200 }, { "epoch": 1.91, "grad_norm": 75.9375, "learning_rate": 6.176974575708778e-07, "loss": 1.7889, "step": 20220 }, { "epoch": 1.91, "grad_norm": 51.8125, "learning_rate": 6.173193140400723e-07, "loss": 1.8524, "step": 20240 }, { "epoch": 1.92, "grad_norm": 52.75, "learning_rate": 6.169411705092666e-07, "loss": 1.8042, "step": 20260 }, { "epoch": 1.92, "grad_norm": 51.3125, "learning_rate": 6.165630269784611e-07, "loss": 1.8007, "step": 20280 }, { "epoch": 1.92, "grad_norm": 70.8125, "learning_rate": 6.161848834476555e-07, "loss": 1.7286, "step": 20300 }, { "epoch": 1.92, "grad_norm": 61.34375, "learning_rate": 6.158067399168499e-07, "loss": 1.7664, "step": 20320 }, { "epoch": 1.92, "grad_norm": 62.375, "learning_rate": 6.154285963860444e-07, "loss": 1.786, "step": 20340 }, { "epoch": 1.92, "grad_norm": 81.0625, "learning_rate": 6.150504528552388e-07, "loss": 1.7736, "step": 20360 }, { "epoch": 1.93, "grad_norm": 63.65625, "learning_rate": 6.146723093244333e-07, "loss": 1.8188, "step": 20380 }, { "epoch": 1.93, "grad_norm": 56.34375, "learning_rate": 6.142941657936277e-07, "loss": 1.8596, "step": 20400 }, { "epoch": 1.93, "grad_norm": 59.5625, "learning_rate": 6.139160222628222e-07, "loss": 1.7769, "step": 20420 }, { "epoch": 1.93, "grad_norm": 79.875, "learning_rate": 6.135378787320167e-07, "loss": 1.7049, "step": 20440 }, { "epoch": 1.93, "grad_norm": 49.78125, "learning_rate": 6.131597352012112e-07, "loss": 1.7867, "step": 20460 }, { "epoch": 1.94, "grad_norm": 51.03125, "learning_rate": 6.127815916704055e-07, "loss": 1.7805, "step": 20480 }, { "epoch": 1.94, "grad_norm": 65.25, "learning_rate": 6.124034481396e-07, "loss": 1.7941, "step": 20500 }, { "epoch": 1.94, "grad_norm": 52.46875, "learning_rate": 6.120253046087945e-07, "loss": 1.7872, "step": 20520 }, { "epoch": 1.94, "grad_norm": 94.625, "learning_rate": 6.116471610779888e-07, "loss": 1.8097, "step": 20540 }, { "epoch": 1.94, "grad_norm": 50.21875, "learning_rate": 6.112690175471833e-07, "loss": 1.7045, "step": 20560 }, { "epoch": 1.95, "grad_norm": 72.25, "learning_rate": 6.108908740163778e-07, "loss": 1.7775, "step": 20580 }, { "epoch": 1.95, "grad_norm": 68.625, "learning_rate": 6.105127304855722e-07, "loss": 1.7464, "step": 20600 }, { "epoch": 1.95, "grad_norm": 50.8125, "learning_rate": 6.101345869547666e-07, "loss": 1.7768, "step": 20620 }, { "epoch": 1.95, "grad_norm": 51.28125, "learning_rate": 6.09756443423961e-07, "loss": 1.7388, "step": 20640 }, { "epoch": 1.95, "grad_norm": 101.5, "learning_rate": 6.093782998931555e-07, "loss": 1.7464, "step": 20660 }, { "epoch": 1.96, "grad_norm": 77.5625, "learning_rate": 6.090001563623499e-07, "loss": 1.7987, "step": 20680 }, { "epoch": 1.96, "grad_norm": 111.0625, "learning_rate": 6.086220128315443e-07, "loss": 1.6151, "step": 20700 }, { "epoch": 1.96, "grad_norm": 74.5625, "learning_rate": 6.082438693007388e-07, "loss": 1.8307, "step": 20720 }, { "epoch": 1.96, "grad_norm": 98.375, "learning_rate": 6.078657257699333e-07, "loss": 1.8152, "step": 20740 }, { "epoch": 1.96, "grad_norm": 60.25, "learning_rate": 6.074875822391277e-07, "loss": 1.753, "step": 20760 }, { "epoch": 1.96, "grad_norm": 72.75, "learning_rate": 6.071094387083222e-07, "loss": 1.8042, "step": 20780 }, { "epoch": 1.97, "grad_norm": 52.25, "learning_rate": 6.067312951775167e-07, "loss": 1.8273, "step": 20800 }, { "epoch": 1.97, "grad_norm": 48.28125, "learning_rate": 6.063531516467111e-07, "loss": 1.7648, "step": 20820 }, { "epoch": 1.97, "grad_norm": 59.53125, "learning_rate": 6.059750081159055e-07, "loss": 1.6874, "step": 20840 }, { "epoch": 1.97, "grad_norm": 54.3125, "learning_rate": 6.055968645851e-07, "loss": 1.6971, "step": 20860 }, { "epoch": 1.97, "grad_norm": 76.0625, "learning_rate": 6.052187210542944e-07, "loss": 1.7653, "step": 20880 }, { "epoch": 1.98, "grad_norm": 51.34375, "learning_rate": 6.048405775234888e-07, "loss": 1.7038, "step": 20900 }, { "epoch": 1.98, "grad_norm": 58.09375, "learning_rate": 6.044624339926832e-07, "loss": 1.8158, "step": 20920 }, { "epoch": 1.98, "grad_norm": 54.375, "learning_rate": 6.040842904618777e-07, "loss": 1.8113, "step": 20940 }, { "epoch": 1.98, "grad_norm": 87.3125, "learning_rate": 6.037061469310722e-07, "loss": 1.7534, "step": 20960 }, { "epoch": 1.98, "grad_norm": 58.4375, "learning_rate": 6.033280034002665e-07, "loss": 1.7212, "step": 20980 }, { "epoch": 1.99, "grad_norm": 94.4375, "learning_rate": 6.02949859869461e-07, "loss": 1.745, "step": 21000 }, { "epoch": 1.99, "grad_norm": 49.4375, "learning_rate": 6.025717163386555e-07, "loss": 1.7588, "step": 21020 }, { "epoch": 1.99, "grad_norm": 70.8125, "learning_rate": 6.021935728078498e-07, "loss": 1.8333, "step": 21040 }, { "epoch": 1.99, "grad_norm": 77.375, "learning_rate": 6.018154292770443e-07, "loss": 1.6461, "step": 21060 }, { "epoch": 1.99, "grad_norm": 65.1875, "learning_rate": 6.014372857462388e-07, "loss": 1.7415, "step": 21080 }, { "epoch": 1.99, "grad_norm": 56.25, "learning_rate": 6.010591422154332e-07, "loss": 1.7088, "step": 21100 }, { "epoch": 2.0, "grad_norm": 60.84375, "learning_rate": 6.006809986846277e-07, "loss": 1.6742, "step": 21120 }, { "epoch": 2.0, "grad_norm": 62.03125, "learning_rate": 6.003028551538222e-07, "loss": 1.741, "step": 21140 }, { "epoch": 2.0, "grad_norm": 48.9375, "learning_rate": 5.999247116230166e-07, "loss": 1.832, "step": 21160 }, { "epoch": 2.0, "grad_norm": 54.46875, "learning_rate": 5.995465680922111e-07, "loss": 1.5626, "step": 21180 }, { "epoch": 2.0, "grad_norm": 75.375, "learning_rate": 5.991684245614055e-07, "loss": 1.6111, "step": 21200 }, { "epoch": 2.01, "grad_norm": 63.40625, "learning_rate": 5.987902810305999e-07, "loss": 1.6103, "step": 21220 }, { "epoch": 2.01, "grad_norm": 44.34375, "learning_rate": 5.984121374997944e-07, "loss": 1.5554, "step": 21240 }, { "epoch": 2.01, "grad_norm": 75.3125, "learning_rate": 5.980339939689887e-07, "loss": 1.5105, "step": 21260 }, { "epoch": 2.01, "grad_norm": 63.15625, "learning_rate": 5.976558504381832e-07, "loss": 1.5628, "step": 21280 }, { "epoch": 2.01, "grad_norm": 59.46875, "learning_rate": 5.972777069073777e-07, "loss": 1.5626, "step": 21300 }, { "epoch": 2.02, "grad_norm": 51.15625, "learning_rate": 5.968995633765721e-07, "loss": 1.5411, "step": 21320 }, { "epoch": 2.02, "grad_norm": 59.5625, "learning_rate": 5.965214198457665e-07, "loss": 1.613, "step": 21340 }, { "epoch": 2.02, "grad_norm": 46.28125, "learning_rate": 5.96143276314961e-07, "loss": 1.5999, "step": 21360 }, { "epoch": 2.02, "grad_norm": 53.96875, "learning_rate": 5.957651327841554e-07, "loss": 1.7679, "step": 21380 }, { "epoch": 2.02, "grad_norm": 46.75, "learning_rate": 5.953869892533499e-07, "loss": 1.5461, "step": 21400 }, { "epoch": 2.02, "grad_norm": 56.6875, "learning_rate": 5.950088457225443e-07, "loss": 1.5189, "step": 21420 }, { "epoch": 2.03, "grad_norm": 90.75, "learning_rate": 5.946307021917387e-07, "loss": 1.5582, "step": 21440 }, { "epoch": 2.03, "grad_norm": 76.0, "learning_rate": 5.942525586609333e-07, "loss": 1.4733, "step": 21460 }, { "epoch": 2.03, "grad_norm": 51.59375, "learning_rate": 5.938744151301278e-07, "loss": 1.6764, "step": 21480 }, { "epoch": 2.03, "grad_norm": 116.375, "learning_rate": 5.934962715993221e-07, "loss": 1.5825, "step": 21500 }, { "epoch": 2.03, "grad_norm": 57.78125, "learning_rate": 5.931181280685166e-07, "loss": 1.5275, "step": 21520 }, { "epoch": 2.04, "grad_norm": 56.53125, "learning_rate": 5.92739984537711e-07, "loss": 1.519, "step": 21540 }, { "epoch": 2.04, "grad_norm": 61.125, "learning_rate": 5.923618410069054e-07, "loss": 1.5093, "step": 21560 }, { "epoch": 2.04, "grad_norm": 65.625, "learning_rate": 5.919836974760999e-07, "loss": 1.5351, "step": 21580 }, { "epoch": 2.04, "grad_norm": 53.6875, "learning_rate": 5.916055539452943e-07, "loss": 1.585, "step": 21600 }, { "epoch": 2.04, "grad_norm": 60.75, "learning_rate": 5.912274104144888e-07, "loss": 1.5854, "step": 21620 }, { "epoch": 2.05, "grad_norm": 57.4375, "learning_rate": 5.908492668836832e-07, "loss": 1.4904, "step": 21640 }, { "epoch": 2.05, "grad_norm": 72.5625, "learning_rate": 5.904711233528776e-07, "loss": 1.5939, "step": 21660 }, { "epoch": 2.05, "grad_norm": 73.1875, "learning_rate": 5.900929798220721e-07, "loss": 1.4732, "step": 21680 }, { "epoch": 2.05, "grad_norm": 68.0625, "learning_rate": 5.897148362912665e-07, "loss": 1.4965, "step": 21700 }, { "epoch": 2.05, "grad_norm": 61.15625, "learning_rate": 5.893366927604609e-07, "loss": 1.6585, "step": 21720 }, { "epoch": 2.06, "grad_norm": 58.8125, "learning_rate": 5.889585492296554e-07, "loss": 1.65, "step": 21740 }, { "epoch": 2.06, "grad_norm": 50.3125, "learning_rate": 5.885804056988499e-07, "loss": 1.6261, "step": 21760 }, { "epoch": 2.06, "grad_norm": 59.09375, "learning_rate": 5.882022621680442e-07, "loss": 1.5723, "step": 21780 }, { "epoch": 2.06, "grad_norm": 61.09375, "learning_rate": 5.878241186372387e-07, "loss": 1.567, "step": 21800 }, { "epoch": 2.06, "grad_norm": 57.40625, "learning_rate": 5.874459751064333e-07, "loss": 1.4886, "step": 21820 }, { "epoch": 2.06, "grad_norm": 82.5625, "learning_rate": 5.870678315756277e-07, "loss": 1.5157, "step": 21840 }, { "epoch": 2.07, "grad_norm": 64.875, "learning_rate": 5.866896880448221e-07, "loss": 1.4767, "step": 21860 }, { "epoch": 2.07, "grad_norm": 66.6875, "learning_rate": 5.863115445140165e-07, "loss": 1.6021, "step": 21880 }, { "epoch": 2.07, "grad_norm": 80.5, "learning_rate": 5.85933400983211e-07, "loss": 1.5284, "step": 21900 }, { "epoch": 2.07, "grad_norm": 61.84375, "learning_rate": 5.855552574524054e-07, "loss": 1.5389, "step": 21920 }, { "epoch": 2.07, "grad_norm": 69.8125, "learning_rate": 5.851771139215998e-07, "loss": 1.4867, "step": 21940 }, { "epoch": 2.08, "grad_norm": 43.875, "learning_rate": 5.847989703907943e-07, "loss": 1.5887, "step": 21960 }, { "epoch": 2.08, "grad_norm": 64.5625, "learning_rate": 5.844208268599888e-07, "loss": 1.5928, "step": 21980 }, { "epoch": 2.08, "grad_norm": 68.5625, "learning_rate": 5.840426833291831e-07, "loss": 1.6589, "step": 22000 }, { "epoch": 2.08, "grad_norm": 71.1875, "learning_rate": 5.836645397983776e-07, "loss": 1.6092, "step": 22020 }, { "epoch": 2.08, "grad_norm": 54.75, "learning_rate": 5.832863962675721e-07, "loss": 1.503, "step": 22040 }, { "epoch": 2.09, "grad_norm": 67.3125, "learning_rate": 5.829082527367664e-07, "loss": 1.5614, "step": 22060 }, { "epoch": 2.09, "grad_norm": 68.0, "learning_rate": 5.825301092059609e-07, "loss": 1.6066, "step": 22080 }, { "epoch": 2.09, "grad_norm": 93.25, "learning_rate": 5.821519656751554e-07, "loss": 1.5409, "step": 22100 }, { "epoch": 2.09, "grad_norm": 77.6875, "learning_rate": 5.817738221443498e-07, "loss": 1.6052, "step": 22120 }, { "epoch": 2.09, "grad_norm": 79.75, "learning_rate": 5.813956786135442e-07, "loss": 1.6158, "step": 22140 }, { "epoch": 2.09, "grad_norm": 61.75, "learning_rate": 5.810175350827388e-07, "loss": 1.5924, "step": 22160 }, { "epoch": 2.1, "grad_norm": 87.375, "learning_rate": 5.806393915519332e-07, "loss": 1.608, "step": 22180 }, { "epoch": 2.1, "grad_norm": 56.46875, "learning_rate": 5.802612480211277e-07, "loss": 1.554, "step": 22200 }, { "epoch": 2.1, "grad_norm": 64.375, "learning_rate": 5.79883104490322e-07, "loss": 1.4941, "step": 22220 }, { "epoch": 2.1, "grad_norm": 68.625, "learning_rate": 5.795049609595165e-07, "loss": 1.6285, "step": 22240 }, { "epoch": 2.1, "grad_norm": 86.6875, "learning_rate": 5.79126817428711e-07, "loss": 1.586, "step": 22260 }, { "epoch": 2.11, "grad_norm": 45.625, "learning_rate": 5.787486738979053e-07, "loss": 1.5519, "step": 22280 }, { "epoch": 2.11, "grad_norm": 66.1875, "learning_rate": 5.783705303670998e-07, "loss": 1.553, "step": 22300 }, { "epoch": 2.11, "grad_norm": 63.15625, "learning_rate": 5.779923868362943e-07, "loss": 1.4016, "step": 22320 }, { "epoch": 2.11, "grad_norm": 64.0625, "learning_rate": 5.776142433054887e-07, "loss": 1.5565, "step": 22340 }, { "epoch": 2.11, "grad_norm": 118.625, "learning_rate": 5.772360997746831e-07, "loss": 1.5126, "step": 22360 }, { "epoch": 2.12, "grad_norm": 61.6875, "learning_rate": 5.768579562438776e-07, "loss": 1.6061, "step": 22380 }, { "epoch": 2.12, "grad_norm": 53.6875, "learning_rate": 5.76479812713072e-07, "loss": 1.5049, "step": 22400 }, { "epoch": 2.12, "grad_norm": 53.5625, "learning_rate": 5.761016691822665e-07, "loss": 1.5556, "step": 22420 }, { "epoch": 2.12, "grad_norm": 84.125, "learning_rate": 5.757235256514608e-07, "loss": 1.5172, "step": 22440 }, { "epoch": 2.12, "grad_norm": 57.125, "learning_rate": 5.753453821206553e-07, "loss": 1.5251, "step": 22460 }, { "epoch": 2.13, "grad_norm": 52.0, "learning_rate": 5.749672385898498e-07, "loss": 1.4947, "step": 22480 }, { "epoch": 2.13, "grad_norm": 71.9375, "learning_rate": 5.745890950590441e-07, "loss": 1.496, "step": 22500 }, { "epoch": 2.13, "grad_norm": 56.28125, "learning_rate": 5.742109515282387e-07, "loss": 1.6366, "step": 22520 }, { "epoch": 2.13, "grad_norm": 72.5625, "learning_rate": 5.738328079974332e-07, "loss": 1.5916, "step": 22540 }, { "epoch": 2.13, "grad_norm": 66.4375, "learning_rate": 5.734546644666276e-07, "loss": 1.572, "step": 22560 }, { "epoch": 2.13, "grad_norm": 47.25, "learning_rate": 5.73076520935822e-07, "loss": 1.5181, "step": 22580 }, { "epoch": 2.14, "grad_norm": 62.75, "learning_rate": 5.726983774050165e-07, "loss": 1.6111, "step": 22600 }, { "epoch": 2.14, "grad_norm": 51.375, "learning_rate": 5.723202338742109e-07, "loss": 1.5621, "step": 22620 }, { "epoch": 2.14, "grad_norm": 50.84375, "learning_rate": 5.719420903434054e-07, "loss": 1.5096, "step": 22640 }, { "epoch": 2.14, "grad_norm": 61.875, "learning_rate": 5.715639468125998e-07, "loss": 1.5789, "step": 22660 }, { "epoch": 2.14, "grad_norm": 81.5, "learning_rate": 5.711858032817942e-07, "loss": 1.5334, "step": 22680 }, { "epoch": 2.15, "grad_norm": 68.9375, "learning_rate": 5.708076597509887e-07, "loss": 1.6067, "step": 22700 }, { "epoch": 2.15, "grad_norm": 74.8125, "learning_rate": 5.704295162201831e-07, "loss": 1.6537, "step": 22720 }, { "epoch": 2.15, "grad_norm": 52.3125, "learning_rate": 5.700513726893775e-07, "loss": 1.4958, "step": 22740 }, { "epoch": 2.15, "grad_norm": 57.9375, "learning_rate": 5.69673229158572e-07, "loss": 1.6364, "step": 22760 }, { "epoch": 2.15, "grad_norm": 104.6875, "learning_rate": 5.692950856277664e-07, "loss": 1.5181, "step": 22780 }, { "epoch": 2.16, "grad_norm": 64.6875, "learning_rate": 5.689169420969608e-07, "loss": 1.6591, "step": 22800 }, { "epoch": 2.16, "grad_norm": 91.4375, "learning_rate": 5.685387985661553e-07, "loss": 1.5405, "step": 22820 }, { "epoch": 2.16, "grad_norm": 63.3125, "learning_rate": 5.681606550353497e-07, "loss": 1.4792, "step": 22840 }, { "epoch": 2.16, "grad_norm": 84.125, "learning_rate": 5.677825115045442e-07, "loss": 1.4895, "step": 22860 }, { "epoch": 2.16, "grad_norm": 62.0, "learning_rate": 5.674043679737387e-07, "loss": 1.5591, "step": 22880 }, { "epoch": 2.16, "grad_norm": 63.9375, "learning_rate": 5.670262244429331e-07, "loss": 1.599, "step": 22900 }, { "epoch": 2.17, "grad_norm": 66.8125, "learning_rate": 5.666480809121276e-07, "loss": 1.6139, "step": 22920 }, { "epoch": 2.17, "grad_norm": 109.25, "learning_rate": 5.66269937381322e-07, "loss": 1.5495, "step": 22940 }, { "epoch": 2.17, "grad_norm": 84.1875, "learning_rate": 5.658917938505164e-07, "loss": 1.6005, "step": 22960 }, { "epoch": 2.17, "grad_norm": 57.125, "learning_rate": 5.655136503197109e-07, "loss": 1.5384, "step": 22980 }, { "epoch": 2.17, "grad_norm": 73.25, "learning_rate": 5.651355067889054e-07, "loss": 1.5543, "step": 23000 }, { "epoch": 2.18, "grad_norm": 52.125, "learning_rate": 5.647573632580997e-07, "loss": 1.5076, "step": 23020 }, { "epoch": 2.18, "grad_norm": 52.6875, "learning_rate": 5.643792197272942e-07, "loss": 1.5428, "step": 23040 }, { "epoch": 2.18, "grad_norm": 76.25, "learning_rate": 5.640010761964887e-07, "loss": 1.5638, "step": 23060 }, { "epoch": 2.18, "grad_norm": 48.90625, "learning_rate": 5.63622932665683e-07, "loss": 1.5839, "step": 23080 }, { "epoch": 2.18, "grad_norm": 74.0, "learning_rate": 5.632447891348775e-07, "loss": 1.5441, "step": 23100 }, { "epoch": 2.19, "grad_norm": 61.90625, "learning_rate": 5.628666456040719e-07, "loss": 1.5854, "step": 23120 }, { "epoch": 2.19, "grad_norm": 71.25, "learning_rate": 5.624885020732664e-07, "loss": 1.5531, "step": 23140 }, { "epoch": 2.19, "grad_norm": 110.75, "learning_rate": 5.621103585424608e-07, "loss": 1.5323, "step": 23160 }, { "epoch": 2.19, "grad_norm": 73.375, "learning_rate": 5.617322150116552e-07, "loss": 1.572, "step": 23180 }, { "epoch": 2.19, "grad_norm": 88.25, "learning_rate": 5.613540714808498e-07, "loss": 1.4873, "step": 23200 }, { "epoch": 2.2, "grad_norm": 70.75, "learning_rate": 5.609759279500443e-07, "loss": 1.545, "step": 23220 }, { "epoch": 2.2, "grad_norm": 57.40625, "learning_rate": 5.605977844192386e-07, "loss": 1.5191, "step": 23240 }, { "epoch": 2.2, "grad_norm": 65.4375, "learning_rate": 5.602196408884331e-07, "loss": 1.5962, "step": 23260 }, { "epoch": 2.2, "grad_norm": 58.25, "learning_rate": 5.598414973576276e-07, "loss": 1.4449, "step": 23280 }, { "epoch": 2.2, "grad_norm": 99.125, "learning_rate": 5.594633538268219e-07, "loss": 1.4933, "step": 23300 }, { "epoch": 2.2, "grad_norm": 62.875, "learning_rate": 5.590852102960164e-07, "loss": 1.5198, "step": 23320 }, { "epoch": 2.21, "grad_norm": 72.0, "learning_rate": 5.587070667652109e-07, "loss": 1.6208, "step": 23340 }, { "epoch": 2.21, "grad_norm": 60.0, "learning_rate": 5.583289232344053e-07, "loss": 1.5792, "step": 23360 }, { "epoch": 2.21, "grad_norm": 54.84375, "learning_rate": 5.579507797035997e-07, "loss": 1.5194, "step": 23380 }, { "epoch": 2.21, "grad_norm": 57.84375, "learning_rate": 5.575726361727941e-07, "loss": 1.5103, "step": 23400 }, { "epoch": 2.21, "grad_norm": 52.21875, "learning_rate": 5.571944926419886e-07, "loss": 1.5909, "step": 23420 }, { "epoch": 2.22, "grad_norm": 90.5, "learning_rate": 5.56816349111183e-07, "loss": 1.5455, "step": 23440 }, { "epoch": 2.22, "grad_norm": 85.3125, "learning_rate": 5.564382055803774e-07, "loss": 1.5711, "step": 23460 }, { "epoch": 2.22, "grad_norm": 58.15625, "learning_rate": 5.560600620495719e-07, "loss": 1.5518, "step": 23480 }, { "epoch": 2.22, "grad_norm": 62.28125, "learning_rate": 5.556819185187664e-07, "loss": 1.6045, "step": 23500 }, { "epoch": 2.22, "grad_norm": 60.875, "learning_rate": 5.553037749879607e-07, "loss": 1.5856, "step": 23520 }, { "epoch": 2.23, "grad_norm": 50.75, "learning_rate": 5.549256314571552e-07, "loss": 1.6366, "step": 23540 }, { "epoch": 2.23, "grad_norm": 75.0625, "learning_rate": 5.545474879263498e-07, "loss": 1.5648, "step": 23560 }, { "epoch": 2.23, "grad_norm": 50.3125, "learning_rate": 5.541693443955442e-07, "loss": 1.565, "step": 23580 }, { "epoch": 2.23, "grad_norm": 54.46875, "learning_rate": 5.537912008647386e-07, "loss": 1.5149, "step": 23600 }, { "epoch": 2.23, "grad_norm": 92.75, "learning_rate": 5.534130573339331e-07, "loss": 1.4956, "step": 23620 }, { "epoch": 2.23, "grad_norm": 54.3125, "learning_rate": 5.530349138031275e-07, "loss": 1.4991, "step": 23640 }, { "epoch": 2.24, "grad_norm": 48.15625, "learning_rate": 5.52656770272322e-07, "loss": 1.5886, "step": 23660 }, { "epoch": 2.24, "grad_norm": 60.9375, "learning_rate": 5.522786267415164e-07, "loss": 1.5823, "step": 23680 }, { "epoch": 2.24, "grad_norm": 68.0625, "learning_rate": 5.519004832107108e-07, "loss": 1.5445, "step": 23700 }, { "epoch": 2.24, "grad_norm": 56.25, "learning_rate": 5.515223396799053e-07, "loss": 1.6111, "step": 23720 }, { "epoch": 2.24, "grad_norm": 76.6875, "learning_rate": 5.511441961490996e-07, "loss": 1.6356, "step": 23740 }, { "epoch": 2.25, "grad_norm": 70.25, "learning_rate": 5.507660526182941e-07, "loss": 1.5651, "step": 23760 }, { "epoch": 2.25, "grad_norm": 75.0625, "learning_rate": 5.503879090874886e-07, "loss": 1.4972, "step": 23780 }, { "epoch": 2.25, "grad_norm": 54.6875, "learning_rate": 5.50009765556683e-07, "loss": 1.5978, "step": 23800 }, { "epoch": 2.25, "grad_norm": 56.28125, "learning_rate": 5.496316220258774e-07, "loss": 1.5318, "step": 23820 }, { "epoch": 2.25, "grad_norm": 41.65625, "learning_rate": 5.492534784950719e-07, "loss": 1.468, "step": 23840 }, { "epoch": 2.26, "grad_norm": 52.625, "learning_rate": 5.488753349642663e-07, "loss": 1.4403, "step": 23860 }, { "epoch": 2.26, "grad_norm": 51.03125, "learning_rate": 5.484971914334607e-07, "loss": 1.5273, "step": 23880 }, { "epoch": 2.26, "grad_norm": 118.5, "learning_rate": 5.481190479026553e-07, "loss": 1.5754, "step": 23900 }, { "epoch": 2.26, "grad_norm": 53.65625, "learning_rate": 5.477409043718497e-07, "loss": 1.652, "step": 23920 }, { "epoch": 2.26, "grad_norm": 55.46875, "learning_rate": 5.473627608410442e-07, "loss": 1.5018, "step": 23940 }, { "epoch": 2.27, "grad_norm": 75.875, "learning_rate": 5.469846173102386e-07, "loss": 1.5074, "step": 23960 }, { "epoch": 2.27, "grad_norm": 65.375, "learning_rate": 5.46606473779433e-07, "loss": 1.5339, "step": 23980 }, { "epoch": 2.27, "grad_norm": 58.84375, "learning_rate": 5.462283302486275e-07, "loss": 1.6258, "step": 24000 }, { "epoch": 2.27, "grad_norm": 53.34375, "learning_rate": 5.45850186717822e-07, "loss": 1.5179, "step": 24020 }, { "epoch": 2.27, "grad_norm": 94.25, "learning_rate": 5.454720431870163e-07, "loss": 1.6923, "step": 24040 }, { "epoch": 2.27, "grad_norm": 59.78125, "learning_rate": 5.450938996562108e-07, "loss": 1.6017, "step": 24060 }, { "epoch": 2.28, "grad_norm": 82.5625, "learning_rate": 5.447157561254052e-07, "loss": 1.6311, "step": 24080 }, { "epoch": 2.28, "grad_norm": 64.1875, "learning_rate": 5.443376125945996e-07, "loss": 1.6113, "step": 24100 }, { "epoch": 2.28, "grad_norm": 62.40625, "learning_rate": 5.439594690637941e-07, "loss": 1.5551, "step": 24120 }, { "epoch": 2.28, "grad_norm": 62.3125, "learning_rate": 5.435813255329885e-07, "loss": 1.4569, "step": 24140 }, { "epoch": 2.28, "grad_norm": 58.46875, "learning_rate": 5.43203182002183e-07, "loss": 1.5644, "step": 24160 }, { "epoch": 2.29, "grad_norm": 92.8125, "learning_rate": 5.428250384713774e-07, "loss": 1.6306, "step": 24180 }, { "epoch": 2.29, "grad_norm": 55.125, "learning_rate": 5.424468949405718e-07, "loss": 1.5071, "step": 24200 }, { "epoch": 2.29, "grad_norm": 65.5, "learning_rate": 5.420687514097663e-07, "loss": 1.58, "step": 24220 }, { "epoch": 2.29, "grad_norm": 49.53125, "learning_rate": 5.416906078789607e-07, "loss": 1.58, "step": 24240 }, { "epoch": 2.29, "grad_norm": 59.03125, "learning_rate": 5.413124643481552e-07, "loss": 1.5532, "step": 24260 }, { "epoch": 2.3, "grad_norm": 61.5625, "learning_rate": 5.409343208173497e-07, "loss": 1.5801, "step": 24280 }, { "epoch": 2.3, "grad_norm": 57.875, "learning_rate": 5.405561772865442e-07, "loss": 1.518, "step": 24300 }, { "epoch": 2.3, "grad_norm": 54.40625, "learning_rate": 5.401780337557385e-07, "loss": 1.5373, "step": 24320 }, { "epoch": 2.3, "grad_norm": 69.625, "learning_rate": 5.39799890224933e-07, "loss": 1.5509, "step": 24340 }, { "epoch": 2.3, "grad_norm": 56.59375, "learning_rate": 5.394217466941274e-07, "loss": 1.603, "step": 24360 }, { "epoch": 2.3, "grad_norm": 71.375, "learning_rate": 5.390436031633219e-07, "loss": 1.3879, "step": 24380 }, { "epoch": 2.31, "grad_norm": 50.21875, "learning_rate": 5.386654596325163e-07, "loss": 1.5509, "step": 24400 }, { "epoch": 2.31, "grad_norm": 69.125, "learning_rate": 5.382873161017107e-07, "loss": 1.5635, "step": 24420 }, { "epoch": 2.31, "grad_norm": 67.125, "learning_rate": 5.379091725709052e-07, "loss": 1.4765, "step": 24440 }, { "epoch": 2.31, "grad_norm": 98.0, "learning_rate": 5.375310290400996e-07, "loss": 1.5189, "step": 24460 }, { "epoch": 2.31, "grad_norm": 58.8125, "learning_rate": 5.37152885509294e-07, "loss": 1.4824, "step": 24480 }, { "epoch": 2.32, "grad_norm": 74.6875, "learning_rate": 5.367747419784885e-07, "loss": 1.559, "step": 24500 }, { "epoch": 2.32, "grad_norm": 73.875, "learning_rate": 5.36396598447683e-07, "loss": 1.559, "step": 24520 }, { "epoch": 2.32, "grad_norm": 54.9375, "learning_rate": 5.360184549168773e-07, "loss": 1.5174, "step": 24540 }, { "epoch": 2.32, "grad_norm": 57.5625, "learning_rate": 5.356403113860718e-07, "loss": 1.5646, "step": 24560 }, { "epoch": 2.32, "grad_norm": 58.8125, "learning_rate": 5.352621678552663e-07, "loss": 1.4964, "step": 24580 }, { "epoch": 2.33, "grad_norm": 54.28125, "learning_rate": 5.348840243244608e-07, "loss": 1.5191, "step": 24600 }, { "epoch": 2.33, "grad_norm": 95.9375, "learning_rate": 5.345058807936552e-07, "loss": 1.567, "step": 24620 }, { "epoch": 2.33, "grad_norm": 91.5625, "learning_rate": 5.341277372628497e-07, "loss": 1.5026, "step": 24640 }, { "epoch": 2.33, "grad_norm": 61.03125, "learning_rate": 5.337495937320441e-07, "loss": 1.6615, "step": 24660 }, { "epoch": 2.33, "grad_norm": 53.625, "learning_rate": 5.333714502012385e-07, "loss": 1.5603, "step": 24680 }, { "epoch": 2.34, "grad_norm": 59.0625, "learning_rate": 5.329933066704329e-07, "loss": 1.5761, "step": 24700 }, { "epoch": 2.34, "grad_norm": 83.4375, "learning_rate": 5.326151631396274e-07, "loss": 1.6023, "step": 24720 }, { "epoch": 2.34, "grad_norm": 68.125, "learning_rate": 5.322370196088219e-07, "loss": 1.5586, "step": 24740 }, { "epoch": 2.34, "grad_norm": 55.96875, "learning_rate": 5.318588760780162e-07, "loss": 1.5758, "step": 24760 }, { "epoch": 2.34, "grad_norm": 53.3125, "learning_rate": 5.314807325472107e-07, "loss": 1.5305, "step": 24780 }, { "epoch": 2.34, "grad_norm": 62.0625, "learning_rate": 5.311025890164052e-07, "loss": 1.5119, "step": 24800 }, { "epoch": 2.35, "grad_norm": 45.6875, "learning_rate": 5.307244454855995e-07, "loss": 1.5253, "step": 24820 }, { "epoch": 2.35, "grad_norm": 72.8125, "learning_rate": 5.30346301954794e-07, "loss": 1.5749, "step": 24840 }, { "epoch": 2.35, "grad_norm": 78.875, "learning_rate": 5.299681584239885e-07, "loss": 1.5423, "step": 24860 }, { "epoch": 2.35, "grad_norm": 78.4375, "learning_rate": 5.295900148931829e-07, "loss": 1.5867, "step": 24880 }, { "epoch": 2.35, "grad_norm": 49.9375, "learning_rate": 5.292118713623773e-07, "loss": 1.6509, "step": 24900 }, { "epoch": 2.36, "grad_norm": 68.875, "learning_rate": 5.288337278315718e-07, "loss": 1.5651, "step": 24920 }, { "epoch": 2.36, "grad_norm": 55.15625, "learning_rate": 5.284555843007662e-07, "loss": 1.539, "step": 24940 }, { "epoch": 2.36, "grad_norm": 62.25, "learning_rate": 5.280774407699608e-07, "loss": 1.584, "step": 24960 }, { "epoch": 2.36, "grad_norm": 56.8125, "learning_rate": 5.276992972391551e-07, "loss": 1.5746, "step": 24980 }, { "epoch": 2.36, "grad_norm": 71.3125, "learning_rate": 5.273211537083496e-07, "loss": 1.5028, "step": 25000 }, { "epoch": 2.37, "grad_norm": 45.90625, "learning_rate": 5.269430101775441e-07, "loss": 1.6168, "step": 25020 }, { "epoch": 2.37, "grad_norm": 53.8125, "learning_rate": 5.265648666467384e-07, "loss": 1.5284, "step": 25040 }, { "epoch": 2.37, "grad_norm": 55.875, "learning_rate": 5.261867231159329e-07, "loss": 1.5533, "step": 25060 }, { "epoch": 2.37, "grad_norm": 62.375, "learning_rate": 5.258085795851274e-07, "loss": 1.5737, "step": 25080 }, { "epoch": 2.37, "grad_norm": 54.59375, "learning_rate": 5.254304360543218e-07, "loss": 1.5599, "step": 25100 }, { "epoch": 2.37, "grad_norm": 68.1875, "learning_rate": 5.250522925235162e-07, "loss": 1.5021, "step": 25120 }, { "epoch": 2.38, "grad_norm": 62.71875, "learning_rate": 5.246741489927107e-07, "loss": 1.44, "step": 25140 }, { "epoch": 2.38, "grad_norm": 52.96875, "learning_rate": 5.242960054619051e-07, "loss": 1.4916, "step": 25160 }, { "epoch": 2.38, "grad_norm": 54.625, "learning_rate": 5.239178619310996e-07, "loss": 1.5164, "step": 25180 }, { "epoch": 2.38, "grad_norm": 52.5625, "learning_rate": 5.23539718400294e-07, "loss": 1.4848, "step": 25200 }, { "epoch": 2.38, "grad_norm": 72.0625, "learning_rate": 5.231615748694884e-07, "loss": 1.5058, "step": 25220 }, { "epoch": 2.39, "grad_norm": 74.125, "learning_rate": 5.227834313386829e-07, "loss": 1.6166, "step": 25240 }, { "epoch": 2.39, "grad_norm": 53.40625, "learning_rate": 5.224052878078772e-07, "loss": 1.5508, "step": 25260 }, { "epoch": 2.39, "grad_norm": 57.21875, "learning_rate": 5.220271442770717e-07, "loss": 1.6444, "step": 25280 }, { "epoch": 2.39, "grad_norm": 62.5, "learning_rate": 5.216490007462663e-07, "loss": 1.5956, "step": 25300 }, { "epoch": 2.39, "grad_norm": 71.0, "learning_rate": 5.212708572154607e-07, "loss": 1.5861, "step": 25320 }, { "epoch": 2.4, "grad_norm": 51.8125, "learning_rate": 5.208927136846551e-07, "loss": 1.5077, "step": 25340 }, { "epoch": 2.4, "grad_norm": 67.5625, "learning_rate": 5.205145701538496e-07, "loss": 1.592, "step": 25360 }, { "epoch": 2.4, "grad_norm": 49.03125, "learning_rate": 5.20136426623044e-07, "loss": 1.6034, "step": 25380 }, { "epoch": 2.4, "grad_norm": 49.4375, "learning_rate": 5.197582830922385e-07, "loss": 1.4929, "step": 25400 }, { "epoch": 2.4, "grad_norm": 64.875, "learning_rate": 5.193801395614329e-07, "loss": 1.532, "step": 25420 }, { "epoch": 2.4, "grad_norm": 57.1875, "learning_rate": 5.190019960306273e-07, "loss": 1.4758, "step": 25440 }, { "epoch": 2.41, "grad_norm": 64.0, "learning_rate": 5.186238524998218e-07, "loss": 1.5308, "step": 25460 }, { "epoch": 2.41, "grad_norm": 48.34375, "learning_rate": 5.182457089690162e-07, "loss": 1.5379, "step": 25480 }, { "epoch": 2.41, "grad_norm": 55.1875, "learning_rate": 5.178675654382106e-07, "loss": 1.5318, "step": 25500 }, { "epoch": 2.41, "grad_norm": 67.0, "learning_rate": 5.174894219074051e-07, "loss": 1.5173, "step": 25520 }, { "epoch": 2.41, "grad_norm": 44.46875, "learning_rate": 5.171112783765996e-07, "loss": 1.5026, "step": 25540 }, { "epoch": 2.42, "grad_norm": 60.875, "learning_rate": 5.167331348457939e-07, "loss": 1.5759, "step": 25560 }, { "epoch": 2.42, "grad_norm": 60.75, "learning_rate": 5.163549913149884e-07, "loss": 1.5473, "step": 25580 }, { "epoch": 2.42, "grad_norm": 46.84375, "learning_rate": 5.159768477841828e-07, "loss": 1.4403, "step": 25600 }, { "epoch": 2.42, "grad_norm": 50.34375, "learning_rate": 5.155987042533772e-07, "loss": 1.5662, "step": 25620 }, { "epoch": 2.42, "grad_norm": 62.6875, "learning_rate": 5.152205607225717e-07, "loss": 1.4555, "step": 25640 }, { "epoch": 2.43, "grad_norm": 61.59375, "learning_rate": 5.148424171917662e-07, "loss": 1.6103, "step": 25660 }, { "epoch": 2.43, "grad_norm": 71.5625, "learning_rate": 5.144642736609607e-07, "loss": 1.5453, "step": 25680 }, { "epoch": 2.43, "grad_norm": 68.1875, "learning_rate": 5.140861301301551e-07, "loss": 1.6433, "step": 25700 }, { "epoch": 2.43, "grad_norm": 54.34375, "learning_rate": 5.137079865993495e-07, "loss": 1.5406, "step": 25720 }, { "epoch": 2.43, "grad_norm": 52.03125, "learning_rate": 5.13329843068544e-07, "loss": 1.518, "step": 25740 }, { "epoch": 2.44, "grad_norm": 85.6875, "learning_rate": 5.129516995377385e-07, "loss": 1.6169, "step": 25760 }, { "epoch": 2.44, "grad_norm": 68.0625, "learning_rate": 5.125735560069328e-07, "loss": 1.5893, "step": 25780 }, { "epoch": 2.44, "grad_norm": 55.71875, "learning_rate": 5.121954124761273e-07, "loss": 1.6479, "step": 25800 }, { "epoch": 2.44, "grad_norm": 58.96875, "learning_rate": 5.118172689453218e-07, "loss": 1.6356, "step": 25820 }, { "epoch": 2.44, "grad_norm": 57.5625, "learning_rate": 5.114391254145161e-07, "loss": 1.5878, "step": 25840 }, { "epoch": 2.44, "grad_norm": 91.875, "learning_rate": 5.110609818837106e-07, "loss": 1.4991, "step": 25860 }, { "epoch": 2.45, "grad_norm": 62.8125, "learning_rate": 5.10682838352905e-07, "loss": 1.4639, "step": 25880 }, { "epoch": 2.45, "grad_norm": 74.1875, "learning_rate": 5.103046948220995e-07, "loss": 1.6055, "step": 25900 }, { "epoch": 2.45, "grad_norm": 69.8125, "learning_rate": 5.099265512912939e-07, "loss": 1.555, "step": 25920 }, { "epoch": 2.45, "grad_norm": 65.125, "learning_rate": 5.095484077604883e-07, "loss": 1.625, "step": 25940 }, { "epoch": 2.45, "grad_norm": 60.71875, "learning_rate": 5.091702642296828e-07, "loss": 1.5422, "step": 25960 }, { "epoch": 2.46, "grad_norm": 56.875, "learning_rate": 5.087921206988773e-07, "loss": 1.6063, "step": 25980 }, { "epoch": 2.46, "grad_norm": 58.0625, "learning_rate": 5.084139771680717e-07, "loss": 1.498, "step": 26000 }, { "epoch": 2.46, "grad_norm": 56.65625, "learning_rate": 5.080358336372662e-07, "loss": 1.5283, "step": 26020 }, { "epoch": 2.46, "grad_norm": 50.3125, "learning_rate": 5.076576901064607e-07, "loss": 1.4989, "step": 26040 }, { "epoch": 2.46, "grad_norm": 66.0625, "learning_rate": 5.07279546575655e-07, "loss": 1.4776, "step": 26060 }, { "epoch": 2.47, "grad_norm": 78.9375, "learning_rate": 5.069014030448495e-07, "loss": 1.4897, "step": 26080 }, { "epoch": 2.47, "grad_norm": 65.0, "learning_rate": 5.06523259514044e-07, "loss": 1.5321, "step": 26100 }, { "epoch": 2.47, "grad_norm": 71.0, "learning_rate": 5.061451159832384e-07, "loss": 1.5652, "step": 26120 }, { "epoch": 2.47, "grad_norm": 52.8125, "learning_rate": 5.057669724524328e-07, "loss": 1.5619, "step": 26140 }, { "epoch": 2.47, "grad_norm": 58.71875, "learning_rate": 5.053888289216273e-07, "loss": 1.6027, "step": 26160 }, { "epoch": 2.47, "grad_norm": 62.4375, "learning_rate": 5.050106853908217e-07, "loss": 1.5552, "step": 26180 }, { "epoch": 2.48, "grad_norm": 66.1875, "learning_rate": 5.046325418600162e-07, "loss": 1.5539, "step": 26200 }, { "epoch": 2.48, "grad_norm": 44.78125, "learning_rate": 5.042543983292105e-07, "loss": 1.4551, "step": 26220 }, { "epoch": 2.48, "grad_norm": 87.8125, "learning_rate": 5.03876254798405e-07, "loss": 1.5585, "step": 26240 }, { "epoch": 2.48, "grad_norm": 97.8125, "learning_rate": 5.034981112675995e-07, "loss": 1.5294, "step": 26260 }, { "epoch": 2.48, "grad_norm": 70.25, "learning_rate": 5.031199677367938e-07, "loss": 1.5808, "step": 26280 }, { "epoch": 2.49, "grad_norm": 62.375, "learning_rate": 5.027418242059883e-07, "loss": 1.5392, "step": 26300 }, { "epoch": 2.49, "grad_norm": 67.9375, "learning_rate": 5.023636806751828e-07, "loss": 1.53, "step": 26320 }, { "epoch": 2.49, "grad_norm": 62.3125, "learning_rate": 5.019855371443772e-07, "loss": 1.5298, "step": 26340 }, { "epoch": 2.49, "grad_norm": 64.375, "learning_rate": 5.016073936135717e-07, "loss": 1.493, "step": 26360 }, { "epoch": 2.49, "grad_norm": 66.125, "learning_rate": 5.012292500827662e-07, "loss": 1.595, "step": 26380 }, { "epoch": 2.5, "grad_norm": 79.0, "learning_rate": 5.008511065519606e-07, "loss": 1.4776, "step": 26400 }, { "epoch": 2.5, "grad_norm": 70.125, "learning_rate": 5.004729630211551e-07, "loss": 1.5509, "step": 26420 }, { "epoch": 2.5, "grad_norm": 70.4375, "learning_rate": 5.000948194903495e-07, "loss": 1.6068, "step": 26440 }, { "epoch": 2.5, "grad_norm": 62.21875, "learning_rate": 4.997166759595439e-07, "loss": 1.553, "step": 26460 }, { "epoch": 2.5, "grad_norm": 57.125, "learning_rate": 4.993385324287384e-07, "loss": 1.6032, "step": 26480 }, { "epoch": 2.51, "grad_norm": 91.125, "learning_rate": 4.989603888979328e-07, "loss": 1.4595, "step": 26500 }, { "epoch": 2.51, "grad_norm": 78.6875, "learning_rate": 4.985822453671272e-07, "loss": 1.5136, "step": 26520 }, { "epoch": 2.51, "grad_norm": 57.625, "learning_rate": 4.982041018363217e-07, "loss": 1.592, "step": 26540 }, { "epoch": 2.51, "grad_norm": 71.6875, "learning_rate": 4.978259583055161e-07, "loss": 1.5033, "step": 26560 }, { "epoch": 2.51, "grad_norm": 67.375, "learning_rate": 4.974478147747105e-07, "loss": 1.5144, "step": 26580 }, { "epoch": 2.51, "grad_norm": 53.96875, "learning_rate": 4.97069671243905e-07, "loss": 1.5845, "step": 26600 }, { "epoch": 2.52, "grad_norm": 77.5625, "learning_rate": 4.966915277130995e-07, "loss": 1.6229, "step": 26620 }, { "epoch": 2.52, "grad_norm": 59.28125, "learning_rate": 4.96313384182294e-07, "loss": 1.6134, "step": 26640 }, { "epoch": 2.52, "grad_norm": 43.34375, "learning_rate": 4.959352406514883e-07, "loss": 1.59, "step": 26660 }, { "epoch": 2.52, "grad_norm": 76.875, "learning_rate": 4.955570971206828e-07, "loss": 1.5741, "step": 26680 }, { "epoch": 2.52, "grad_norm": 70.9375, "learning_rate": 4.951789535898772e-07, "loss": 1.5271, "step": 26700 }, { "epoch": 2.53, "grad_norm": 53.28125, "learning_rate": 4.948008100590716e-07, "loss": 1.436, "step": 26720 }, { "epoch": 2.53, "grad_norm": 59.03125, "learning_rate": 4.944226665282661e-07, "loss": 1.5151, "step": 26740 }, { "epoch": 2.53, "grad_norm": 74.6875, "learning_rate": 4.940445229974605e-07, "loss": 1.4958, "step": 26760 }, { "epoch": 2.53, "grad_norm": 55.03125, "learning_rate": 4.936663794666551e-07, "loss": 1.5223, "step": 26780 }, { "epoch": 2.53, "grad_norm": 59.1875, "learning_rate": 4.932882359358494e-07, "loss": 1.5894, "step": 26800 }, { "epoch": 2.54, "grad_norm": 74.5, "learning_rate": 4.929100924050439e-07, "loss": 1.4906, "step": 26820 }, { "epoch": 2.54, "grad_norm": 65.5, "learning_rate": 4.925319488742383e-07, "loss": 1.5399, "step": 26840 }, { "epoch": 2.54, "grad_norm": 60.0, "learning_rate": 4.921538053434327e-07, "loss": 1.5463, "step": 26860 }, { "epoch": 2.54, "grad_norm": 56.5, "learning_rate": 4.917756618126272e-07, "loss": 1.5375, "step": 26880 }, { "epoch": 2.54, "grad_norm": 93.3125, "learning_rate": 4.913975182818216e-07, "loss": 1.5447, "step": 26900 }, { "epoch": 2.54, "grad_norm": 88.3125, "learning_rate": 4.910193747510161e-07, "loss": 1.5076, "step": 26920 }, { "epoch": 2.55, "grad_norm": 88.9375, "learning_rate": 4.906412312202105e-07, "loss": 1.6689, "step": 26940 }, { "epoch": 2.55, "grad_norm": 77.25, "learning_rate": 4.90263087689405e-07, "loss": 1.5952, "step": 26960 }, { "epoch": 2.55, "grad_norm": 53.3125, "learning_rate": 4.898849441585995e-07, "loss": 1.4589, "step": 26980 }, { "epoch": 2.55, "grad_norm": 61.1875, "learning_rate": 4.895068006277939e-07, "loss": 1.4918, "step": 27000 }, { "epoch": 2.55, "grad_norm": 73.375, "learning_rate": 4.891286570969883e-07, "loss": 1.533, "step": 27020 }, { "epoch": 2.56, "grad_norm": 75.6875, "learning_rate": 4.887505135661827e-07, "loss": 1.5737, "step": 27040 }, { "epoch": 2.56, "grad_norm": 67.3125, "learning_rate": 4.883723700353772e-07, "loss": 1.5193, "step": 27060 }, { "epoch": 2.56, "grad_norm": 85.625, "learning_rate": 4.879942265045716e-07, "loss": 1.5908, "step": 27080 }, { "epoch": 2.56, "grad_norm": 92.3125, "learning_rate": 4.87616082973766e-07, "loss": 1.612, "step": 27100 }, { "epoch": 2.56, "grad_norm": 58.03125, "learning_rate": 4.872379394429605e-07, "loss": 1.5158, "step": 27120 }, { "epoch": 2.57, "grad_norm": 62.125, "learning_rate": 4.86859795912155e-07, "loss": 1.5388, "step": 27140 }, { "epoch": 2.57, "grad_norm": 53.1875, "learning_rate": 4.864816523813494e-07, "loss": 1.5011, "step": 27160 }, { "epoch": 2.57, "grad_norm": 51.40625, "learning_rate": 4.861035088505438e-07, "loss": 1.5886, "step": 27180 }, { "epoch": 2.57, "grad_norm": 45.90625, "learning_rate": 4.857253653197383e-07, "loss": 1.5871, "step": 27200 }, { "epoch": 2.57, "grad_norm": 67.6875, "learning_rate": 4.853472217889328e-07, "loss": 1.5536, "step": 27220 }, { "epoch": 2.58, "grad_norm": 50.34375, "learning_rate": 4.849690782581271e-07, "loss": 1.4826, "step": 27240 }, { "epoch": 2.58, "grad_norm": 52.6875, "learning_rate": 4.845909347273216e-07, "loss": 1.599, "step": 27260 }, { "epoch": 2.58, "grad_norm": 57.21875, "learning_rate": 4.842127911965161e-07, "loss": 1.5626, "step": 27280 }, { "epoch": 2.58, "grad_norm": 68.0625, "learning_rate": 4.838346476657104e-07, "loss": 1.4922, "step": 27300 }, { "epoch": 2.58, "grad_norm": 70.5, "learning_rate": 4.834565041349049e-07, "loss": 1.6091, "step": 27320 }, { "epoch": 2.58, "grad_norm": 61.34375, "learning_rate": 4.830783606040994e-07, "loss": 1.532, "step": 27340 }, { "epoch": 2.59, "grad_norm": 82.0625, "learning_rate": 4.827002170732939e-07, "loss": 1.5448, "step": 27360 }, { "epoch": 2.59, "grad_norm": 42.21875, "learning_rate": 4.823220735424882e-07, "loss": 1.4501, "step": 27380 }, { "epoch": 2.59, "grad_norm": 64.875, "learning_rate": 4.819439300116827e-07, "loss": 1.6265, "step": 27400 }, { "epoch": 2.59, "grad_norm": 60.3125, "learning_rate": 4.815657864808772e-07, "loss": 1.4552, "step": 27420 }, { "epoch": 2.59, "grad_norm": 60.0625, "learning_rate": 4.811876429500715e-07, "loss": 1.6434, "step": 27440 }, { "epoch": 2.6, "grad_norm": 72.1875, "learning_rate": 4.80809499419266e-07, "loss": 1.5811, "step": 27460 }, { "epoch": 2.6, "grad_norm": 61.59375, "learning_rate": 4.804313558884605e-07, "loss": 1.5929, "step": 27480 }, { "epoch": 2.6, "grad_norm": 55.90625, "learning_rate": 4.80053212357655e-07, "loss": 1.598, "step": 27500 }, { "epoch": 2.6, "grad_norm": 61.5625, "learning_rate": 4.796750688268493e-07, "loss": 1.5171, "step": 27520 }, { "epoch": 2.6, "grad_norm": 77.5, "learning_rate": 4.792969252960438e-07, "loss": 1.5044, "step": 27540 }, { "epoch": 2.61, "grad_norm": 69.0, "learning_rate": 4.789187817652383e-07, "loss": 1.5655, "step": 27560 }, { "epoch": 2.61, "grad_norm": 58.0625, "learning_rate": 4.785406382344327e-07, "loss": 1.4868, "step": 27580 }, { "epoch": 2.61, "grad_norm": 65.0, "learning_rate": 4.781624947036271e-07, "loss": 1.5374, "step": 27600 }, { "epoch": 2.61, "grad_norm": 72.875, "learning_rate": 4.777843511728216e-07, "loss": 1.5488, "step": 27620 }, { "epoch": 2.61, "grad_norm": 58.1875, "learning_rate": 4.77406207642016e-07, "loss": 1.5216, "step": 27640 }, { "epoch": 2.61, "grad_norm": 53.96875, "learning_rate": 4.770280641112104e-07, "loss": 1.5225, "step": 27660 }, { "epoch": 2.62, "grad_norm": 57.0625, "learning_rate": 4.766499205804049e-07, "loss": 1.5891, "step": 27680 }, { "epoch": 2.62, "grad_norm": 99.8125, "learning_rate": 4.7627177704959934e-07, "loss": 1.5102, "step": 27700 }, { "epoch": 2.62, "grad_norm": 68.9375, "learning_rate": 4.758936335187938e-07, "loss": 1.5994, "step": 27720 }, { "epoch": 2.62, "grad_norm": 58.21875, "learning_rate": 4.7551548998798824e-07, "loss": 1.5237, "step": 27740 }, { "epoch": 2.62, "grad_norm": 58.375, "learning_rate": 4.7513734645718266e-07, "loss": 1.5272, "step": 27760 }, { "epoch": 2.63, "grad_norm": 59.9375, "learning_rate": 4.7475920292637713e-07, "loss": 1.5752, "step": 27780 }, { "epoch": 2.63, "grad_norm": 63.25, "learning_rate": 4.7438105939557155e-07, "loss": 1.5307, "step": 27800 }, { "epoch": 2.63, "grad_norm": 67.5, "learning_rate": 4.74002915864766e-07, "loss": 1.5077, "step": 27820 }, { "epoch": 2.63, "grad_norm": 72.625, "learning_rate": 4.736247723339605e-07, "loss": 1.4981, "step": 27840 }, { "epoch": 2.63, "grad_norm": 54.125, "learning_rate": 4.732466288031549e-07, "loss": 1.62, "step": 27860 }, { "epoch": 2.64, "grad_norm": 61.59375, "learning_rate": 4.7286848527234934e-07, "loss": 1.5666, "step": 27880 }, { "epoch": 2.64, "grad_norm": 55.15625, "learning_rate": 4.7249034174154377e-07, "loss": 1.5126, "step": 27900 }, { "epoch": 2.64, "grad_norm": 58.5, "learning_rate": 4.7211219821073824e-07, "loss": 1.6101, "step": 27920 }, { "epoch": 2.64, "grad_norm": 64.125, "learning_rate": 4.7173405467993266e-07, "loss": 1.5902, "step": 27940 }, { "epoch": 2.64, "grad_norm": 51.25, "learning_rate": 4.713559111491271e-07, "loss": 1.5703, "step": 27960 }, { "epoch": 2.65, "grad_norm": 56.21875, "learning_rate": 4.7097776761832156e-07, "loss": 1.57, "step": 27980 }, { "epoch": 2.65, "grad_norm": 117.5625, "learning_rate": 4.70599624087516e-07, "loss": 1.4706, "step": 28000 }, { "epoch": 2.65, "grad_norm": 58.75, "learning_rate": 4.7022148055671045e-07, "loss": 1.49, "step": 28020 }, { "epoch": 2.65, "grad_norm": 85.1875, "learning_rate": 4.698433370259049e-07, "loss": 1.4867, "step": 28040 }, { "epoch": 2.65, "grad_norm": 62.3125, "learning_rate": 4.6946519349509935e-07, "loss": 1.6106, "step": 28060 }, { "epoch": 2.65, "grad_norm": 69.8125, "learning_rate": 4.6908704996429377e-07, "loss": 1.5242, "step": 28080 }, { "epoch": 2.66, "grad_norm": 73.9375, "learning_rate": 4.687089064334882e-07, "loss": 1.5871, "step": 28100 }, { "epoch": 2.66, "grad_norm": 75.6875, "learning_rate": 4.6833076290268266e-07, "loss": 1.5284, "step": 28120 }, { "epoch": 2.66, "grad_norm": 60.28125, "learning_rate": 4.679526193718771e-07, "loss": 1.5682, "step": 28140 }, { "epoch": 2.66, "grad_norm": 52.8125, "learning_rate": 4.675744758410715e-07, "loss": 1.5701, "step": 28160 }, { "epoch": 2.66, "grad_norm": 71.5, "learning_rate": 4.67196332310266e-07, "loss": 1.5179, "step": 28180 }, { "epoch": 2.67, "grad_norm": 57.90625, "learning_rate": 4.6681818877946046e-07, "loss": 1.544, "step": 28200 }, { "epoch": 2.67, "grad_norm": 81.5625, "learning_rate": 4.664400452486549e-07, "loss": 1.5441, "step": 28220 }, { "epoch": 2.67, "grad_norm": 76.0625, "learning_rate": 4.660619017178493e-07, "loss": 1.5856, "step": 28240 }, { "epoch": 2.67, "grad_norm": 77.125, "learning_rate": 4.6568375818704377e-07, "loss": 1.5011, "step": 28260 }, { "epoch": 2.67, "grad_norm": 67.6875, "learning_rate": 4.653056146562382e-07, "loss": 1.6049, "step": 28280 }, { "epoch": 2.68, "grad_norm": 59.4375, "learning_rate": 4.649274711254326e-07, "loss": 1.5911, "step": 28300 }, { "epoch": 2.68, "grad_norm": 76.0625, "learning_rate": 4.6454932759462704e-07, "loss": 1.5146, "step": 28320 }, { "epoch": 2.68, "grad_norm": 56.34375, "learning_rate": 4.641711840638215e-07, "loss": 1.4633, "step": 28340 }, { "epoch": 2.68, "grad_norm": 70.5, "learning_rate": 4.63793040533016e-07, "loss": 1.5427, "step": 28360 }, { "epoch": 2.68, "grad_norm": 72.0625, "learning_rate": 4.634148970022104e-07, "loss": 1.5296, "step": 28380 }, { "epoch": 2.68, "grad_norm": 63.96875, "learning_rate": 4.630367534714049e-07, "loss": 1.4503, "step": 28400 }, { "epoch": 2.69, "grad_norm": 61.09375, "learning_rate": 4.626586099405993e-07, "loss": 1.5682, "step": 28420 }, { "epoch": 2.69, "grad_norm": 109.375, "learning_rate": 4.622804664097937e-07, "loss": 1.6351, "step": 28440 }, { "epoch": 2.69, "grad_norm": 64.4375, "learning_rate": 4.6190232287898814e-07, "loss": 1.5257, "step": 28460 }, { "epoch": 2.69, "grad_norm": 66.3125, "learning_rate": 4.615241793481826e-07, "loss": 1.4647, "step": 28480 }, { "epoch": 2.69, "grad_norm": 62.1875, "learning_rate": 4.6114603581737704e-07, "loss": 1.5068, "step": 28500 }, { "epoch": 2.7, "grad_norm": 60.46875, "learning_rate": 4.6076789228657146e-07, "loss": 1.5132, "step": 28520 }, { "epoch": 2.7, "grad_norm": 63.21875, "learning_rate": 4.60389748755766e-07, "loss": 1.5879, "step": 28540 }, { "epoch": 2.7, "grad_norm": 77.9375, "learning_rate": 4.600116052249604e-07, "loss": 1.5218, "step": 28560 }, { "epoch": 2.7, "grad_norm": 58.0625, "learning_rate": 4.5963346169415483e-07, "loss": 1.596, "step": 28580 }, { "epoch": 2.7, "grad_norm": 48.78125, "learning_rate": 4.592553181633493e-07, "loss": 1.6146, "step": 28600 }, { "epoch": 2.71, "grad_norm": 77.8125, "learning_rate": 4.5887717463254373e-07, "loss": 1.4568, "step": 28620 }, { "epoch": 2.71, "grad_norm": 155.25, "learning_rate": 4.5849903110173815e-07, "loss": 1.5704, "step": 28640 }, { "epoch": 2.71, "grad_norm": 56.4375, "learning_rate": 4.5812088757093257e-07, "loss": 1.5383, "step": 28660 }, { "epoch": 2.71, "grad_norm": 98.5, "learning_rate": 4.5774274404012704e-07, "loss": 1.5281, "step": 28680 }, { "epoch": 2.71, "grad_norm": 62.25, "learning_rate": 4.5736460050932146e-07, "loss": 1.5042, "step": 28700 }, { "epoch": 2.72, "grad_norm": 65.625, "learning_rate": 4.5698645697851594e-07, "loss": 1.5688, "step": 28720 }, { "epoch": 2.72, "grad_norm": 54.90625, "learning_rate": 4.566083134477104e-07, "loss": 1.5805, "step": 28740 }, { "epoch": 2.72, "grad_norm": 57.90625, "learning_rate": 4.5623016991690483e-07, "loss": 1.5071, "step": 28760 }, { "epoch": 2.72, "grad_norm": 63.375, "learning_rate": 4.5585202638609926e-07, "loss": 1.3748, "step": 28780 }, { "epoch": 2.72, "grad_norm": 67.6875, "learning_rate": 4.554738828552937e-07, "loss": 1.6716, "step": 28800 }, { "epoch": 2.72, "grad_norm": 73.5625, "learning_rate": 4.5509573932448815e-07, "loss": 1.4361, "step": 28820 }, { "epoch": 2.73, "grad_norm": 62.0625, "learning_rate": 4.5471759579368257e-07, "loss": 1.6571, "step": 28840 }, { "epoch": 2.73, "grad_norm": 58.5, "learning_rate": 4.54339452262877e-07, "loss": 1.5489, "step": 28860 }, { "epoch": 2.73, "grad_norm": 53.53125, "learning_rate": 4.539613087320715e-07, "loss": 1.5146, "step": 28880 }, { "epoch": 2.73, "grad_norm": 47.5, "learning_rate": 4.5358316520126594e-07, "loss": 1.5828, "step": 28900 }, { "epoch": 2.73, "grad_norm": 50.25, "learning_rate": 4.5320502167046036e-07, "loss": 1.4854, "step": 28920 }, { "epoch": 2.74, "grad_norm": 73.75, "learning_rate": 4.528268781396548e-07, "loss": 1.5217, "step": 28940 }, { "epoch": 2.74, "grad_norm": 56.71875, "learning_rate": 4.5244873460884926e-07, "loss": 1.5262, "step": 28960 }, { "epoch": 2.74, "grad_norm": 54.8125, "learning_rate": 4.520705910780437e-07, "loss": 1.5702, "step": 28980 }, { "epoch": 2.74, "grad_norm": 80.3125, "learning_rate": 4.516924475472381e-07, "loss": 1.5637, "step": 29000 }, { "epoch": 2.74, "grad_norm": 107.75, "learning_rate": 4.513143040164326e-07, "loss": 1.5309, "step": 29020 }, { "epoch": 2.75, "grad_norm": 48.09375, "learning_rate": 4.50936160485627e-07, "loss": 1.5284, "step": 29040 }, { "epoch": 2.75, "grad_norm": 56.90625, "learning_rate": 4.5055801695482147e-07, "loss": 1.5613, "step": 29060 }, { "epoch": 2.75, "grad_norm": 62.375, "learning_rate": 4.5017987342401595e-07, "loss": 1.5028, "step": 29080 }, { "epoch": 2.75, "grad_norm": 59.625, "learning_rate": 4.4980172989321037e-07, "loss": 1.6023, "step": 29100 }, { "epoch": 2.75, "grad_norm": 70.625, "learning_rate": 4.494235863624048e-07, "loss": 1.5756, "step": 29120 }, { "epoch": 2.75, "grad_norm": 66.8125, "learning_rate": 4.490454428315992e-07, "loss": 1.6159, "step": 29140 }, { "epoch": 2.76, "grad_norm": 67.8125, "learning_rate": 4.486672993007937e-07, "loss": 1.5633, "step": 29160 }, { "epoch": 2.76, "grad_norm": 44.71875, "learning_rate": 4.482891557699881e-07, "loss": 1.5654, "step": 29180 }, { "epoch": 2.76, "grad_norm": 58.59375, "learning_rate": 4.4791101223918253e-07, "loss": 1.5233, "step": 29200 }, { "epoch": 2.76, "grad_norm": 61.90625, "learning_rate": 4.47532868708377e-07, "loss": 1.5729, "step": 29220 }, { "epoch": 2.76, "grad_norm": 76.5625, "learning_rate": 4.471547251775715e-07, "loss": 1.5359, "step": 29240 }, { "epoch": 2.77, "grad_norm": 80.875, "learning_rate": 4.467765816467659e-07, "loss": 1.5896, "step": 29260 }, { "epoch": 2.77, "grad_norm": 80.5, "learning_rate": 4.463984381159603e-07, "loss": 1.6558, "step": 29280 }, { "epoch": 2.77, "grad_norm": 61.0625, "learning_rate": 4.460202945851548e-07, "loss": 1.5273, "step": 29300 }, { "epoch": 2.77, "grad_norm": 64.1875, "learning_rate": 4.456421510543492e-07, "loss": 1.5447, "step": 29320 }, { "epoch": 2.77, "grad_norm": 69.375, "learning_rate": 4.4526400752354363e-07, "loss": 1.5843, "step": 29340 }, { "epoch": 2.78, "grad_norm": 68.125, "learning_rate": 4.448858639927381e-07, "loss": 1.5857, "step": 29360 }, { "epoch": 2.78, "grad_norm": 68.0, "learning_rate": 4.4450772046193253e-07, "loss": 1.579, "step": 29380 }, { "epoch": 2.78, "grad_norm": 70.6875, "learning_rate": 4.44129576931127e-07, "loss": 1.5822, "step": 29400 }, { "epoch": 2.78, "grad_norm": 64.9375, "learning_rate": 4.437514334003214e-07, "loss": 1.6115, "step": 29420 }, { "epoch": 2.78, "grad_norm": 101.9375, "learning_rate": 4.433732898695159e-07, "loss": 1.6074, "step": 29440 }, { "epoch": 2.79, "grad_norm": 70.6875, "learning_rate": 4.429951463387103e-07, "loss": 1.5953, "step": 29460 }, { "epoch": 2.79, "grad_norm": 57.25, "learning_rate": 4.4261700280790474e-07, "loss": 1.492, "step": 29480 }, { "epoch": 2.79, "grad_norm": 70.5625, "learning_rate": 4.422388592770992e-07, "loss": 1.6068, "step": 29500 }, { "epoch": 2.79, "grad_norm": 57.6875, "learning_rate": 4.4186071574629364e-07, "loss": 1.5683, "step": 29520 }, { "epoch": 2.79, "grad_norm": 57.78125, "learning_rate": 4.4148257221548806e-07, "loss": 1.5758, "step": 29540 }, { "epoch": 2.79, "grad_norm": 72.125, "learning_rate": 4.411044286846825e-07, "loss": 1.5107, "step": 29560 }, { "epoch": 2.8, "grad_norm": 63.46875, "learning_rate": 4.40726285153877e-07, "loss": 1.5422, "step": 29580 }, { "epoch": 2.8, "grad_norm": 43.5, "learning_rate": 4.4034814162307143e-07, "loss": 1.5609, "step": 29600 }, { "epoch": 2.8, "grad_norm": 57.8125, "learning_rate": 4.3996999809226585e-07, "loss": 1.5686, "step": 29620 }, { "epoch": 2.8, "grad_norm": 64.625, "learning_rate": 4.395918545614603e-07, "loss": 1.4721, "step": 29640 }, { "epoch": 2.8, "grad_norm": 77.875, "learning_rate": 4.3921371103065475e-07, "loss": 1.5926, "step": 29660 }, { "epoch": 2.81, "grad_norm": 66.0, "learning_rate": 4.3883556749984917e-07, "loss": 1.5014, "step": 29680 }, { "epoch": 2.81, "grad_norm": 51.5, "learning_rate": 4.3845742396904364e-07, "loss": 1.5162, "step": 29700 }, { "epoch": 2.81, "grad_norm": 80.625, "learning_rate": 4.3807928043823806e-07, "loss": 1.5249, "step": 29720 }, { "epoch": 2.81, "grad_norm": 75.0, "learning_rate": 4.377011369074325e-07, "loss": 1.5725, "step": 29740 }, { "epoch": 2.81, "grad_norm": 79.4375, "learning_rate": 4.3732299337662696e-07, "loss": 1.444, "step": 29760 }, { "epoch": 2.82, "grad_norm": 57.4375, "learning_rate": 4.3694484984582143e-07, "loss": 1.5409, "step": 29780 }, { "epoch": 2.82, "grad_norm": 79.1875, "learning_rate": 4.3656670631501585e-07, "loss": 1.5865, "step": 29800 }, { "epoch": 2.82, "grad_norm": 75.4375, "learning_rate": 4.361885627842103e-07, "loss": 1.4412, "step": 29820 }, { "epoch": 2.82, "grad_norm": 58.125, "learning_rate": 4.3581041925340475e-07, "loss": 1.5931, "step": 29840 }, { "epoch": 2.82, "grad_norm": 69.4375, "learning_rate": 4.3543227572259917e-07, "loss": 1.6071, "step": 29860 }, { "epoch": 2.82, "grad_norm": 60.65625, "learning_rate": 4.350541321917936e-07, "loss": 1.5137, "step": 29880 }, { "epoch": 2.83, "grad_norm": 77.8125, "learning_rate": 4.34675988660988e-07, "loss": 1.5608, "step": 29900 }, { "epoch": 2.83, "grad_norm": 45.5, "learning_rate": 4.342978451301825e-07, "loss": 1.5166, "step": 29920 }, { "epoch": 2.83, "grad_norm": 59.5, "learning_rate": 4.3391970159937696e-07, "loss": 1.6169, "step": 29940 }, { "epoch": 2.83, "grad_norm": 61.375, "learning_rate": 4.335415580685714e-07, "loss": 1.4541, "step": 29960 }, { "epoch": 2.83, "grad_norm": 60.28125, "learning_rate": 4.3316341453776586e-07, "loss": 1.5651, "step": 29980 }, { "epoch": 2.84, "grad_norm": 65.4375, "learning_rate": 4.327852710069603e-07, "loss": 1.6816, "step": 30000 }, { "epoch": 2.84, "grad_norm": 60.125, "learning_rate": 4.324071274761547e-07, "loss": 1.502, "step": 30020 }, { "epoch": 2.84, "grad_norm": 61.6875, "learning_rate": 4.320289839453491e-07, "loss": 1.4747, "step": 30040 }, { "epoch": 2.84, "grad_norm": 47.0, "learning_rate": 4.316508404145436e-07, "loss": 1.5672, "step": 30060 }, { "epoch": 2.84, "grad_norm": 63.0625, "learning_rate": 4.31272696883738e-07, "loss": 1.5038, "step": 30080 }, { "epoch": 2.85, "grad_norm": 55.0625, "learning_rate": 4.308945533529325e-07, "loss": 1.5884, "step": 30100 }, { "epoch": 2.85, "grad_norm": 56.4375, "learning_rate": 4.3051640982212697e-07, "loss": 1.5646, "step": 30120 }, { "epoch": 2.85, "grad_norm": 85.8125, "learning_rate": 4.301382662913214e-07, "loss": 1.4855, "step": 30140 }, { "epoch": 2.85, "grad_norm": 91.0, "learning_rate": 4.297601227605158e-07, "loss": 1.5274, "step": 30160 }, { "epoch": 2.85, "grad_norm": 83.1875, "learning_rate": 4.293819792297103e-07, "loss": 1.5331, "step": 30180 }, { "epoch": 2.85, "grad_norm": 57.71875, "learning_rate": 4.290038356989047e-07, "loss": 1.4909, "step": 30200 }, { "epoch": 2.86, "grad_norm": 84.5, "learning_rate": 4.286256921680991e-07, "loss": 1.5937, "step": 30220 }, { "epoch": 2.86, "grad_norm": 78.5, "learning_rate": 4.2824754863729355e-07, "loss": 1.5401, "step": 30240 }, { "epoch": 2.86, "grad_norm": 65.1875, "learning_rate": 4.27869405106488e-07, "loss": 1.5679, "step": 30260 }, { "epoch": 2.86, "grad_norm": 66.25, "learning_rate": 4.274912615756825e-07, "loss": 1.4403, "step": 30280 }, { "epoch": 2.86, "grad_norm": 56.34375, "learning_rate": 4.271131180448769e-07, "loss": 1.515, "step": 30300 }, { "epoch": 2.87, "grad_norm": 50.9375, "learning_rate": 4.267349745140714e-07, "loss": 1.4955, "step": 30320 }, { "epoch": 2.87, "grad_norm": 58.6875, "learning_rate": 4.263568309832658e-07, "loss": 1.4947, "step": 30340 }, { "epoch": 2.87, "grad_norm": 51.75, "learning_rate": 4.2597868745246023e-07, "loss": 1.578, "step": 30360 }, { "epoch": 2.87, "grad_norm": 85.9375, "learning_rate": 4.2560054392165465e-07, "loss": 1.5881, "step": 30380 }, { "epoch": 2.87, "grad_norm": 79.25, "learning_rate": 4.2522240039084913e-07, "loss": 1.5744, "step": 30400 }, { "epoch": 2.88, "grad_norm": 76.8125, "learning_rate": 4.2484425686004355e-07, "loss": 1.5821, "step": 30420 }, { "epoch": 2.88, "grad_norm": 51.375, "learning_rate": 4.2446611332923797e-07, "loss": 1.585, "step": 30440 }, { "epoch": 2.88, "grad_norm": 62.59375, "learning_rate": 4.240879697984325e-07, "loss": 1.5283, "step": 30460 }, { "epoch": 2.88, "grad_norm": 52.0625, "learning_rate": 4.237098262676269e-07, "loss": 1.4999, "step": 30480 }, { "epoch": 2.88, "grad_norm": 51.65625, "learning_rate": 4.2333168273682134e-07, "loss": 1.6095, "step": 30500 }, { "epoch": 2.89, "grad_norm": 76.4375, "learning_rate": 4.2295353920601576e-07, "loss": 1.5921, "step": 30520 }, { "epoch": 2.89, "grad_norm": 72.1875, "learning_rate": 4.2257539567521024e-07, "loss": 1.6093, "step": 30540 }, { "epoch": 2.89, "grad_norm": 67.25, "learning_rate": 4.2219725214440466e-07, "loss": 1.4845, "step": 30560 }, { "epoch": 2.89, "grad_norm": 55.6875, "learning_rate": 4.218191086135991e-07, "loss": 1.584, "step": 30580 }, { "epoch": 2.89, "grad_norm": 50.96875, "learning_rate": 4.2144096508279355e-07, "loss": 1.4808, "step": 30600 }, { "epoch": 2.89, "grad_norm": 100.75, "learning_rate": 4.21062821551988e-07, "loss": 1.5642, "step": 30620 }, { "epoch": 2.9, "grad_norm": 53.71875, "learning_rate": 4.2068467802118245e-07, "loss": 1.5242, "step": 30640 }, { "epoch": 2.9, "grad_norm": 54.25, "learning_rate": 4.2030653449037687e-07, "loss": 1.5593, "step": 30660 }, { "epoch": 2.9, "grad_norm": 61.28125, "learning_rate": 4.1992839095957134e-07, "loss": 1.5166, "step": 30680 }, { "epoch": 2.9, "grad_norm": 56.5625, "learning_rate": 4.1955024742876577e-07, "loss": 1.6194, "step": 30700 }, { "epoch": 2.9, "grad_norm": 59.03125, "learning_rate": 4.191721038979602e-07, "loss": 1.5371, "step": 30720 }, { "epoch": 2.91, "grad_norm": 55.03125, "learning_rate": 4.1879396036715466e-07, "loss": 1.4101, "step": 30740 }, { "epoch": 2.91, "grad_norm": 68.125, "learning_rate": 4.184158168363491e-07, "loss": 1.5908, "step": 30760 }, { "epoch": 2.91, "grad_norm": 51.59375, "learning_rate": 4.180376733055435e-07, "loss": 1.5034, "step": 30780 }, { "epoch": 2.91, "grad_norm": 58.4375, "learning_rate": 4.1765952977473803e-07, "loss": 1.5745, "step": 30800 }, { "epoch": 2.91, "grad_norm": 75.4375, "learning_rate": 4.1728138624393245e-07, "loss": 1.5812, "step": 30820 }, { "epoch": 2.92, "grad_norm": 66.875, "learning_rate": 4.169032427131269e-07, "loss": 1.5836, "step": 30840 }, { "epoch": 2.92, "grad_norm": 53.125, "learning_rate": 4.165250991823213e-07, "loss": 1.534, "step": 30860 }, { "epoch": 2.92, "grad_norm": 76.0625, "learning_rate": 4.1614695565151577e-07, "loss": 1.5019, "step": 30880 }, { "epoch": 2.92, "grad_norm": 61.9375, "learning_rate": 4.157688121207102e-07, "loss": 1.507, "step": 30900 }, { "epoch": 2.92, "grad_norm": 85.25, "learning_rate": 4.153906685899046e-07, "loss": 1.5362, "step": 30920 }, { "epoch": 2.92, "grad_norm": 71.625, "learning_rate": 4.150125250590991e-07, "loss": 1.5695, "step": 30940 }, { "epoch": 2.93, "grad_norm": 69.25, "learning_rate": 4.146343815282935e-07, "loss": 1.5302, "step": 30960 }, { "epoch": 2.93, "grad_norm": 79.0, "learning_rate": 4.14256237997488e-07, "loss": 1.545, "step": 30980 }, { "epoch": 2.93, "grad_norm": 68.125, "learning_rate": 4.138780944666824e-07, "loss": 1.5746, "step": 31000 }, { "epoch": 2.93, "grad_norm": 56.15625, "learning_rate": 4.134999509358769e-07, "loss": 1.5426, "step": 31020 }, { "epoch": 2.93, "grad_norm": 53.6875, "learning_rate": 4.131218074050713e-07, "loss": 1.5182, "step": 31040 }, { "epoch": 2.94, "grad_norm": 89.4375, "learning_rate": 4.127436638742657e-07, "loss": 1.4772, "step": 31060 }, { "epoch": 2.94, "grad_norm": 66.375, "learning_rate": 4.123655203434602e-07, "loss": 1.5478, "step": 31080 }, { "epoch": 2.94, "grad_norm": 65.8125, "learning_rate": 4.119873768126546e-07, "loss": 1.4511, "step": 31100 }, { "epoch": 2.94, "grad_norm": 56.5, "learning_rate": 4.1160923328184904e-07, "loss": 1.5412, "step": 31120 }, { "epoch": 2.94, "grad_norm": 50.8125, "learning_rate": 4.1123108975104346e-07, "loss": 1.6, "step": 31140 }, { "epoch": 2.95, "grad_norm": 71.5625, "learning_rate": 4.10852946220238e-07, "loss": 1.4424, "step": 31160 }, { "epoch": 2.95, "grad_norm": 59.28125, "learning_rate": 4.104748026894324e-07, "loss": 1.5589, "step": 31180 }, { "epoch": 2.95, "grad_norm": 56.6875, "learning_rate": 4.1009665915862683e-07, "loss": 1.5787, "step": 31200 }, { "epoch": 2.95, "grad_norm": 61.6875, "learning_rate": 4.097185156278213e-07, "loss": 1.4297, "step": 31220 }, { "epoch": 2.95, "grad_norm": 64.4375, "learning_rate": 4.093403720970157e-07, "loss": 1.557, "step": 31240 }, { "epoch": 2.96, "grad_norm": 49.65625, "learning_rate": 4.0896222856621014e-07, "loss": 1.518, "step": 31260 }, { "epoch": 2.96, "grad_norm": 59.5, "learning_rate": 4.0858408503540457e-07, "loss": 1.5745, "step": 31280 }, { "epoch": 2.96, "grad_norm": 58.25, "learning_rate": 4.0820594150459904e-07, "loss": 1.4711, "step": 31300 }, { "epoch": 2.96, "grad_norm": 66.625, "learning_rate": 4.0782779797379346e-07, "loss": 1.5911, "step": 31320 }, { "epoch": 2.96, "grad_norm": 78.6875, "learning_rate": 4.0744965444298794e-07, "loss": 1.5814, "step": 31340 }, { "epoch": 2.96, "grad_norm": 61.25, "learning_rate": 4.070715109121824e-07, "loss": 1.4427, "step": 31360 }, { "epoch": 2.97, "grad_norm": 69.875, "learning_rate": 4.0669336738137683e-07, "loss": 1.4784, "step": 31380 }, { "epoch": 2.97, "grad_norm": 50.6875, "learning_rate": 4.0631522385057125e-07, "loss": 1.5007, "step": 31400 }, { "epoch": 2.97, "grad_norm": 66.8125, "learning_rate": 4.0593708031976573e-07, "loss": 1.6264, "step": 31420 }, { "epoch": 2.97, "grad_norm": 62.65625, "learning_rate": 4.0555893678896015e-07, "loss": 1.5666, "step": 31440 }, { "epoch": 2.97, "grad_norm": 71.5, "learning_rate": 4.0518079325815457e-07, "loss": 1.736, "step": 31460 }, { "epoch": 2.98, "grad_norm": 69.1875, "learning_rate": 4.04802649727349e-07, "loss": 1.543, "step": 31480 }, { "epoch": 2.98, "grad_norm": 70.1875, "learning_rate": 4.044245061965435e-07, "loss": 1.5761, "step": 31500 }, { "epoch": 2.98, "grad_norm": 65.75, "learning_rate": 4.0404636266573794e-07, "loss": 1.5896, "step": 31520 }, { "epoch": 2.98, "grad_norm": 54.65625, "learning_rate": 4.0366821913493236e-07, "loss": 1.4913, "step": 31540 }, { "epoch": 2.98, "grad_norm": 73.0625, "learning_rate": 4.0329007560412683e-07, "loss": 1.5724, "step": 31560 }, { "epoch": 2.99, "grad_norm": 66.3125, "learning_rate": 4.0291193207332126e-07, "loss": 1.5511, "step": 31580 }, { "epoch": 2.99, "grad_norm": 68.5625, "learning_rate": 4.025337885425157e-07, "loss": 1.5837, "step": 31600 }, { "epoch": 2.99, "grad_norm": 62.375, "learning_rate": 4.021556450117101e-07, "loss": 1.5772, "step": 31620 }, { "epoch": 2.99, "grad_norm": 69.25, "learning_rate": 4.0177750148090457e-07, "loss": 1.5872, "step": 31640 }, { "epoch": 2.99, "grad_norm": 69.0625, "learning_rate": 4.01399357950099e-07, "loss": 1.5467, "step": 31660 }, { "epoch": 2.99, "grad_norm": 54.21875, "learning_rate": 4.0102121441929347e-07, "loss": 1.5222, "step": 31680 }, { "epoch": 3.0, "grad_norm": 67.6875, "learning_rate": 4.0064307088848794e-07, "loss": 1.5318, "step": 31700 }, { "epoch": 3.0, "grad_norm": 77.0625, "learning_rate": 4.0026492735768236e-07, "loss": 1.5116, "step": 31720 }, { "epoch": 3.0, "grad_norm": 63.15625, "learning_rate": 3.998867838268768e-07, "loss": 1.4655, "step": 31740 }, { "epoch": 3.0, "grad_norm": 72.25, "learning_rate": 3.995086402960712e-07, "loss": 1.3648, "step": 31760 }, { "epoch": 3.0, "grad_norm": 50.46875, "learning_rate": 3.991304967652657e-07, "loss": 1.3922, "step": 31780 }, { "epoch": 3.01, "grad_norm": 52.625, "learning_rate": 3.987523532344601e-07, "loss": 1.4272, "step": 31800 }, { "epoch": 3.01, "grad_norm": 68.375, "learning_rate": 3.983742097036545e-07, "loss": 1.3381, "step": 31820 }, { "epoch": 3.01, "grad_norm": 61.75, "learning_rate": 3.97996066172849e-07, "loss": 1.3762, "step": 31840 }, { "epoch": 3.01, "grad_norm": 64.0, "learning_rate": 3.9761792264204347e-07, "loss": 1.36, "step": 31860 }, { "epoch": 3.01, "grad_norm": 54.15625, "learning_rate": 3.972397791112379e-07, "loss": 1.3718, "step": 31880 }, { "epoch": 3.02, "grad_norm": 57.75, "learning_rate": 3.9686163558043237e-07, "loss": 1.4966, "step": 31900 }, { "epoch": 3.02, "grad_norm": 75.0625, "learning_rate": 3.964834920496268e-07, "loss": 1.4002, "step": 31920 }, { "epoch": 3.02, "grad_norm": 53.6875, "learning_rate": 3.961053485188212e-07, "loss": 1.3682, "step": 31940 }, { "epoch": 3.02, "grad_norm": 60.3125, "learning_rate": 3.9572720498801563e-07, "loss": 1.4108, "step": 31960 }, { "epoch": 3.02, "grad_norm": 100.0625, "learning_rate": 3.953490614572101e-07, "loss": 1.4387, "step": 31980 }, { "epoch": 3.03, "grad_norm": 61.40625, "learning_rate": 3.9497091792640453e-07, "loss": 1.3657, "step": 32000 }, { "epoch": 3.03, "grad_norm": 57.0625, "learning_rate": 3.94592774395599e-07, "loss": 1.3044, "step": 32020 }, { "epoch": 3.03, "grad_norm": 75.625, "learning_rate": 3.942146308647935e-07, "loss": 1.5226, "step": 32040 }, { "epoch": 3.03, "grad_norm": 79.125, "learning_rate": 3.938364873339879e-07, "loss": 1.5141, "step": 32060 }, { "epoch": 3.03, "grad_norm": 77.125, "learning_rate": 3.934583438031823e-07, "loss": 1.3971, "step": 32080 }, { "epoch": 3.03, "grad_norm": 74.6875, "learning_rate": 3.9308020027237674e-07, "loss": 1.3343, "step": 32100 }, { "epoch": 3.04, "grad_norm": 58.6875, "learning_rate": 3.927020567415712e-07, "loss": 1.3938, "step": 32120 }, { "epoch": 3.04, "grad_norm": 57.375, "learning_rate": 3.9232391321076564e-07, "loss": 1.4423, "step": 32140 }, { "epoch": 3.04, "grad_norm": 64.875, "learning_rate": 3.9194576967996006e-07, "loss": 1.4377, "step": 32160 }, { "epoch": 3.04, "grad_norm": 57.78125, "learning_rate": 3.9156762614915453e-07, "loss": 1.416, "step": 32180 }, { "epoch": 3.04, "grad_norm": 52.8125, "learning_rate": 3.91189482618349e-07, "loss": 1.4056, "step": 32200 }, { "epoch": 3.05, "grad_norm": 52.375, "learning_rate": 3.908113390875434e-07, "loss": 1.4797, "step": 32220 }, { "epoch": 3.05, "grad_norm": 66.0625, "learning_rate": 3.9043319555673785e-07, "loss": 1.3557, "step": 32240 }, { "epoch": 3.05, "grad_norm": 67.5, "learning_rate": 3.900550520259323e-07, "loss": 1.3408, "step": 32260 }, { "epoch": 3.05, "grad_norm": 61.5625, "learning_rate": 3.8967690849512674e-07, "loss": 1.4133, "step": 32280 }, { "epoch": 3.05, "grad_norm": 70.9375, "learning_rate": 3.8929876496432116e-07, "loss": 1.4408, "step": 32300 }, { "epoch": 3.06, "grad_norm": 60.65625, "learning_rate": 3.8892062143351564e-07, "loss": 1.3561, "step": 32320 }, { "epoch": 3.06, "grad_norm": 57.78125, "learning_rate": 3.8854247790271006e-07, "loss": 1.4019, "step": 32340 }, { "epoch": 3.06, "grad_norm": 54.75, "learning_rate": 3.881643343719045e-07, "loss": 1.2897, "step": 32360 }, { "epoch": 3.06, "grad_norm": 55.5, "learning_rate": 3.8778619084109896e-07, "loss": 1.4233, "step": 32380 }, { "epoch": 3.06, "grad_norm": 57.90625, "learning_rate": 3.8740804731029343e-07, "loss": 1.3995, "step": 32400 }, { "epoch": 3.06, "grad_norm": 60.0625, "learning_rate": 3.8702990377948785e-07, "loss": 1.3652, "step": 32420 }, { "epoch": 3.07, "grad_norm": 62.25, "learning_rate": 3.8665176024868227e-07, "loss": 1.4912, "step": 32440 }, { "epoch": 3.07, "grad_norm": 69.0625, "learning_rate": 3.8627361671787675e-07, "loss": 1.369, "step": 32460 }, { "epoch": 3.07, "grad_norm": 50.65625, "learning_rate": 3.8589547318707117e-07, "loss": 1.426, "step": 32480 }, { "epoch": 3.07, "grad_norm": 57.90625, "learning_rate": 3.855173296562656e-07, "loss": 1.2934, "step": 32500 }, { "epoch": 3.07, "grad_norm": 70.875, "learning_rate": 3.8513918612546e-07, "loss": 1.4486, "step": 32520 }, { "epoch": 3.08, "grad_norm": 70.4375, "learning_rate": 3.847610425946545e-07, "loss": 1.5197, "step": 32540 }, { "epoch": 3.08, "grad_norm": 60.625, "learning_rate": 3.8438289906384896e-07, "loss": 1.4181, "step": 32560 }, { "epoch": 3.08, "grad_norm": 70.75, "learning_rate": 3.840047555330434e-07, "loss": 1.3959, "step": 32580 }, { "epoch": 3.08, "grad_norm": 89.6875, "learning_rate": 3.8362661200223785e-07, "loss": 1.3343, "step": 32600 }, { "epoch": 3.08, "grad_norm": 55.75, "learning_rate": 3.832484684714323e-07, "loss": 1.4992, "step": 32620 }, { "epoch": 3.09, "grad_norm": 72.875, "learning_rate": 3.828703249406267e-07, "loss": 1.4326, "step": 32640 }, { "epoch": 3.09, "grad_norm": 59.40625, "learning_rate": 3.8249218140982117e-07, "loss": 1.4861, "step": 32660 }, { "epoch": 3.09, "grad_norm": 50.625, "learning_rate": 3.821140378790156e-07, "loss": 1.4226, "step": 32680 }, { "epoch": 3.09, "grad_norm": 56.3125, "learning_rate": 3.8173589434821e-07, "loss": 1.383, "step": 32700 }, { "epoch": 3.09, "grad_norm": 56.09375, "learning_rate": 3.813577508174045e-07, "loss": 1.5258, "step": 32720 }, { "epoch": 3.1, "grad_norm": 68.4375, "learning_rate": 3.8097960728659896e-07, "loss": 1.3827, "step": 32740 }, { "epoch": 3.1, "grad_norm": 52.09375, "learning_rate": 3.806014637557934e-07, "loss": 1.4179, "step": 32760 }, { "epoch": 3.1, "grad_norm": 55.5625, "learning_rate": 3.802233202249878e-07, "loss": 1.3972, "step": 32780 }, { "epoch": 3.1, "grad_norm": 51.625, "learning_rate": 3.798451766941823e-07, "loss": 1.4764, "step": 32800 }, { "epoch": 3.1, "grad_norm": 75.3125, "learning_rate": 3.794670331633767e-07, "loss": 1.4087, "step": 32820 }, { "epoch": 3.1, "grad_norm": 67.8125, "learning_rate": 3.790888896325711e-07, "loss": 1.4917, "step": 32840 }, { "epoch": 3.11, "grad_norm": 59.4375, "learning_rate": 3.7871074610176554e-07, "loss": 1.3819, "step": 32860 }, { "epoch": 3.11, "grad_norm": 56.0625, "learning_rate": 3.7833260257096e-07, "loss": 1.3787, "step": 32880 }, { "epoch": 3.11, "grad_norm": 64.0625, "learning_rate": 3.779544590401545e-07, "loss": 1.4065, "step": 32900 }, { "epoch": 3.11, "grad_norm": 59.375, "learning_rate": 3.775763155093489e-07, "loss": 1.427, "step": 32920 }, { "epoch": 3.11, "grad_norm": 61.09375, "learning_rate": 3.771981719785434e-07, "loss": 1.3911, "step": 32940 }, { "epoch": 3.12, "grad_norm": 65.9375, "learning_rate": 3.768200284477378e-07, "loss": 1.3648, "step": 32960 }, { "epoch": 3.12, "grad_norm": 66.6875, "learning_rate": 3.7644188491693223e-07, "loss": 1.4081, "step": 32980 }, { "epoch": 3.12, "grad_norm": 55.40625, "learning_rate": 3.7606374138612665e-07, "loss": 1.4276, "step": 33000 }, { "epoch": 3.12, "grad_norm": 54.34375, "learning_rate": 3.756855978553211e-07, "loss": 1.4641, "step": 33020 }, { "epoch": 3.12, "grad_norm": 64.9375, "learning_rate": 3.7530745432451555e-07, "loss": 1.35, "step": 33040 }, { "epoch": 3.13, "grad_norm": 46.28125, "learning_rate": 3.7492931079370997e-07, "loss": 1.4257, "step": 33060 }, { "epoch": 3.13, "grad_norm": 57.6875, "learning_rate": 3.745511672629045e-07, "loss": 1.4374, "step": 33080 }, { "epoch": 3.13, "grad_norm": 63.09375, "learning_rate": 3.741730237320989e-07, "loss": 1.4295, "step": 33100 }, { "epoch": 3.13, "grad_norm": 69.4375, "learning_rate": 3.7379488020129334e-07, "loss": 1.3691, "step": 33120 }, { "epoch": 3.13, "grad_norm": 63.34375, "learning_rate": 3.734167366704878e-07, "loss": 1.4291, "step": 33140 }, { "epoch": 3.13, "grad_norm": 52.3125, "learning_rate": 3.7303859313968223e-07, "loss": 1.4837, "step": 33160 }, { "epoch": 3.14, "grad_norm": 81.8125, "learning_rate": 3.7266044960887665e-07, "loss": 1.4837, "step": 33180 }, { "epoch": 3.14, "grad_norm": 57.09375, "learning_rate": 3.722823060780711e-07, "loss": 1.3974, "step": 33200 }, { "epoch": 3.14, "grad_norm": 65.1875, "learning_rate": 3.7190416254726555e-07, "loss": 1.3418, "step": 33220 }, { "epoch": 3.14, "grad_norm": 70.25, "learning_rate": 3.7152601901645997e-07, "loss": 1.4303, "step": 33240 }, { "epoch": 3.14, "grad_norm": 54.75, "learning_rate": 3.7114787548565445e-07, "loss": 1.4754, "step": 33260 }, { "epoch": 3.15, "grad_norm": 65.8125, "learning_rate": 3.707697319548489e-07, "loss": 1.4474, "step": 33280 }, { "epoch": 3.15, "grad_norm": 61.4375, "learning_rate": 3.7039158842404334e-07, "loss": 1.3412, "step": 33300 }, { "epoch": 3.15, "grad_norm": 80.0, "learning_rate": 3.7001344489323776e-07, "loss": 1.3211, "step": 33320 }, { "epoch": 3.15, "grad_norm": 62.0, "learning_rate": 3.696353013624322e-07, "loss": 1.3295, "step": 33340 }, { "epoch": 3.15, "grad_norm": 70.375, "learning_rate": 3.6925715783162666e-07, "loss": 1.4678, "step": 33360 }, { "epoch": 3.16, "grad_norm": 59.125, "learning_rate": 3.688790143008211e-07, "loss": 1.3333, "step": 33380 }, { "epoch": 3.16, "grad_norm": 63.5625, "learning_rate": 3.685008707700155e-07, "loss": 1.3726, "step": 33400 }, { "epoch": 3.16, "grad_norm": 64.625, "learning_rate": 3.6812272723921003e-07, "loss": 1.4712, "step": 33420 }, { "epoch": 3.16, "grad_norm": 64.9375, "learning_rate": 3.6774458370840445e-07, "loss": 1.3021, "step": 33440 }, { "epoch": 3.16, "grad_norm": 56.5625, "learning_rate": 3.6736644017759887e-07, "loss": 1.3805, "step": 33460 }, { "epoch": 3.17, "grad_norm": 59.1875, "learning_rate": 3.669882966467933e-07, "loss": 1.4557, "step": 33480 }, { "epoch": 3.17, "grad_norm": 64.1875, "learning_rate": 3.6661015311598777e-07, "loss": 1.4616, "step": 33500 }, { "epoch": 3.17, "grad_norm": 68.6875, "learning_rate": 3.662320095851822e-07, "loss": 1.3874, "step": 33520 }, { "epoch": 3.17, "grad_norm": 56.59375, "learning_rate": 3.658538660543766e-07, "loss": 1.4573, "step": 33540 }, { "epoch": 3.17, "grad_norm": 70.4375, "learning_rate": 3.654757225235711e-07, "loss": 1.3524, "step": 33560 }, { "epoch": 3.17, "grad_norm": 69.75, "learning_rate": 3.650975789927655e-07, "loss": 1.3711, "step": 33580 }, { "epoch": 3.18, "grad_norm": 68.75, "learning_rate": 3.6471943546196e-07, "loss": 1.3215, "step": 33600 }, { "epoch": 3.18, "grad_norm": 51.875, "learning_rate": 3.6434129193115445e-07, "loss": 1.4378, "step": 33620 }, { "epoch": 3.18, "grad_norm": 53.5, "learning_rate": 3.639631484003489e-07, "loss": 1.4243, "step": 33640 }, { "epoch": 3.18, "grad_norm": 118.8125, "learning_rate": 3.635850048695433e-07, "loss": 1.3906, "step": 33660 }, { "epoch": 3.18, "grad_norm": 49.21875, "learning_rate": 3.632068613387377e-07, "loss": 1.3534, "step": 33680 }, { "epoch": 3.19, "grad_norm": 61.1875, "learning_rate": 3.628287178079322e-07, "loss": 1.423, "step": 33700 }, { "epoch": 3.19, "grad_norm": 72.3125, "learning_rate": 3.624505742771266e-07, "loss": 1.4193, "step": 33720 }, { "epoch": 3.19, "grad_norm": 66.625, "learning_rate": 3.6207243074632103e-07, "loss": 1.4438, "step": 33740 }, { "epoch": 3.19, "grad_norm": 70.9375, "learning_rate": 3.6169428721551545e-07, "loss": 1.4021, "step": 33760 }, { "epoch": 3.19, "grad_norm": 64.4375, "learning_rate": 3.6131614368471e-07, "loss": 1.3662, "step": 33780 }, { "epoch": 3.2, "grad_norm": 74.6875, "learning_rate": 3.609380001539044e-07, "loss": 1.4504, "step": 33800 }, { "epoch": 3.2, "grad_norm": 83.25, "learning_rate": 3.605598566230988e-07, "loss": 1.3239, "step": 33820 }, { "epoch": 3.2, "grad_norm": 78.375, "learning_rate": 3.601817130922933e-07, "loss": 1.4084, "step": 33840 }, { "epoch": 3.2, "grad_norm": 78.5625, "learning_rate": 3.598035695614877e-07, "loss": 1.4374, "step": 33860 }, { "epoch": 3.2, "grad_norm": 67.5, "learning_rate": 3.5942542603068214e-07, "loss": 1.3577, "step": 33880 }, { "epoch": 3.2, "grad_norm": 54.09375, "learning_rate": 3.590472824998766e-07, "loss": 1.3353, "step": 33900 }, { "epoch": 3.21, "grad_norm": 53.34375, "learning_rate": 3.5866913896907104e-07, "loss": 1.3715, "step": 33920 }, { "epoch": 3.21, "grad_norm": 56.6875, "learning_rate": 3.582909954382655e-07, "loss": 1.3706, "step": 33940 }, { "epoch": 3.21, "grad_norm": 84.9375, "learning_rate": 3.5791285190745993e-07, "loss": 1.384, "step": 33960 }, { "epoch": 3.21, "grad_norm": 57.84375, "learning_rate": 3.575347083766544e-07, "loss": 1.3953, "step": 33980 }, { "epoch": 3.21, "grad_norm": 78.625, "learning_rate": 3.5715656484584883e-07, "loss": 1.3413, "step": 34000 }, { "epoch": 3.22, "grad_norm": 82.25, "learning_rate": 3.5677842131504325e-07, "loss": 1.3603, "step": 34020 }, { "epoch": 3.22, "grad_norm": 62.34375, "learning_rate": 3.564002777842377e-07, "loss": 1.3919, "step": 34040 }, { "epoch": 3.22, "grad_norm": 57.0, "learning_rate": 3.5602213425343214e-07, "loss": 1.4083, "step": 34060 }, { "epoch": 3.22, "grad_norm": 77.875, "learning_rate": 3.5564399072262657e-07, "loss": 1.4505, "step": 34080 }, { "epoch": 3.22, "grad_norm": 77.3125, "learning_rate": 3.55265847191821e-07, "loss": 1.3939, "step": 34100 }, { "epoch": 3.23, "grad_norm": 70.3125, "learning_rate": 3.548877036610155e-07, "loss": 1.4308, "step": 34120 }, { "epoch": 3.23, "grad_norm": 59.0, "learning_rate": 3.5450956013020994e-07, "loss": 1.3746, "step": 34140 }, { "epoch": 3.23, "grad_norm": 68.375, "learning_rate": 3.5413141659940436e-07, "loss": 1.3084, "step": 34160 }, { "epoch": 3.23, "grad_norm": 59.75, "learning_rate": 3.5375327306859883e-07, "loss": 1.4063, "step": 34180 }, { "epoch": 3.23, "grad_norm": 62.4375, "learning_rate": 3.5337512953779325e-07, "loss": 1.4903, "step": 34200 }, { "epoch": 3.24, "grad_norm": 63.875, "learning_rate": 3.529969860069877e-07, "loss": 1.4406, "step": 34220 }, { "epoch": 3.24, "grad_norm": 61.71875, "learning_rate": 3.526188424761821e-07, "loss": 1.51, "step": 34240 }, { "epoch": 3.24, "grad_norm": 73.1875, "learning_rate": 3.5224069894537657e-07, "loss": 1.375, "step": 34260 }, { "epoch": 3.24, "grad_norm": 60.40625, "learning_rate": 3.51862555414571e-07, "loss": 1.3797, "step": 34280 }, { "epoch": 3.24, "grad_norm": 55.25, "learning_rate": 3.5148441188376547e-07, "loss": 1.3388, "step": 34300 }, { "epoch": 3.24, "grad_norm": 96.0, "learning_rate": 3.5110626835295994e-07, "loss": 1.4355, "step": 34320 }, { "epoch": 3.25, "grad_norm": 65.125, "learning_rate": 3.5072812482215436e-07, "loss": 1.3663, "step": 34340 }, { "epoch": 3.25, "grad_norm": 56.59375, "learning_rate": 3.503499812913488e-07, "loss": 1.3666, "step": 34360 }, { "epoch": 3.25, "grad_norm": 62.875, "learning_rate": 3.4997183776054326e-07, "loss": 1.3237, "step": 34380 }, { "epoch": 3.25, "grad_norm": 47.59375, "learning_rate": 3.495936942297377e-07, "loss": 1.5043, "step": 34400 }, { "epoch": 3.25, "grad_norm": 64.75, "learning_rate": 3.492155506989321e-07, "loss": 1.3668, "step": 34420 }, { "epoch": 3.26, "grad_norm": 76.0, "learning_rate": 3.488374071681265e-07, "loss": 1.4768, "step": 34440 }, { "epoch": 3.26, "grad_norm": 55.625, "learning_rate": 3.48459263637321e-07, "loss": 1.3829, "step": 34460 }, { "epoch": 3.26, "grad_norm": 55.375, "learning_rate": 3.4808112010651547e-07, "loss": 1.3135, "step": 34480 }, { "epoch": 3.26, "grad_norm": 57.8125, "learning_rate": 3.477029765757099e-07, "loss": 1.3666, "step": 34500 }, { "epoch": 3.26, "grad_norm": 62.34375, "learning_rate": 3.4732483304490436e-07, "loss": 1.3976, "step": 34520 }, { "epoch": 3.27, "grad_norm": 56.21875, "learning_rate": 3.469466895140988e-07, "loss": 1.4472, "step": 34540 }, { "epoch": 3.27, "grad_norm": 79.125, "learning_rate": 3.465685459832932e-07, "loss": 1.3524, "step": 34560 }, { "epoch": 3.27, "grad_norm": 82.375, "learning_rate": 3.4619040245248763e-07, "loss": 1.3981, "step": 34580 }, { "epoch": 3.27, "grad_norm": 57.21875, "learning_rate": 3.458122589216821e-07, "loss": 1.3506, "step": 34600 }, { "epoch": 3.27, "grad_norm": 73.8125, "learning_rate": 3.454341153908765e-07, "loss": 1.4456, "step": 34620 }, { "epoch": 3.27, "grad_norm": 97.1875, "learning_rate": 3.45055971860071e-07, "loss": 1.3992, "step": 34640 }, { "epoch": 3.28, "grad_norm": 81.5625, "learning_rate": 3.4467782832926547e-07, "loss": 1.4132, "step": 34660 }, { "epoch": 3.28, "grad_norm": 103.8125, "learning_rate": 3.442996847984599e-07, "loss": 1.3113, "step": 34680 }, { "epoch": 3.28, "grad_norm": 63.34375, "learning_rate": 3.439215412676543e-07, "loss": 1.4059, "step": 34700 }, { "epoch": 3.28, "grad_norm": 114.6875, "learning_rate": 3.4354339773684874e-07, "loss": 1.4594, "step": 34720 }, { "epoch": 3.28, "grad_norm": 73.5, "learning_rate": 3.431652542060432e-07, "loss": 1.269, "step": 34740 }, { "epoch": 3.29, "grad_norm": 67.5625, "learning_rate": 3.4278711067523763e-07, "loss": 1.3647, "step": 34760 }, { "epoch": 3.29, "grad_norm": 79.75, "learning_rate": 3.4240896714443205e-07, "loss": 1.4436, "step": 34780 }, { "epoch": 3.29, "grad_norm": 53.625, "learning_rate": 3.4203082361362653e-07, "loss": 1.3997, "step": 34800 }, { "epoch": 3.29, "grad_norm": 66.125, "learning_rate": 3.41652680082821e-07, "loss": 1.3917, "step": 34820 }, { "epoch": 3.29, "grad_norm": 60.1875, "learning_rate": 3.412745365520154e-07, "loss": 1.3912, "step": 34840 }, { "epoch": 3.3, "grad_norm": 65.5, "learning_rate": 3.408963930212099e-07, "loss": 1.468, "step": 34860 }, { "epoch": 3.3, "grad_norm": 81.25, "learning_rate": 3.405182494904043e-07, "loss": 1.4517, "step": 34880 }, { "epoch": 3.3, "grad_norm": 74.0625, "learning_rate": 3.4014010595959874e-07, "loss": 1.5081, "step": 34900 }, { "epoch": 3.3, "grad_norm": 70.125, "learning_rate": 3.3976196242879316e-07, "loss": 1.2948, "step": 34920 }, { "epoch": 3.3, "grad_norm": 49.84375, "learning_rate": 3.3938381889798764e-07, "loss": 1.2734, "step": 34940 }, { "epoch": 3.3, "grad_norm": 60.5, "learning_rate": 3.3900567536718206e-07, "loss": 1.5046, "step": 34960 }, { "epoch": 3.31, "grad_norm": 104.375, "learning_rate": 3.386275318363765e-07, "loss": 1.4961, "step": 34980 }, { "epoch": 3.31, "grad_norm": 61.78125, "learning_rate": 3.38249388305571e-07, "loss": 1.4419, "step": 35000 }, { "epoch": 3.31, "grad_norm": 60.03125, "learning_rate": 3.378712447747654e-07, "loss": 1.4329, "step": 35020 }, { "epoch": 3.31, "grad_norm": 56.90625, "learning_rate": 3.3749310124395985e-07, "loss": 1.4915, "step": 35040 }, { "epoch": 3.31, "grad_norm": 83.3125, "learning_rate": 3.3711495771315427e-07, "loss": 1.4082, "step": 35060 }, { "epoch": 3.32, "grad_norm": 57.0625, "learning_rate": 3.3673681418234874e-07, "loss": 1.4659, "step": 35080 }, { "epoch": 3.32, "grad_norm": 68.75, "learning_rate": 3.3635867065154316e-07, "loss": 1.4333, "step": 35100 }, { "epoch": 3.32, "grad_norm": 84.375, "learning_rate": 3.359805271207376e-07, "loss": 1.4449, "step": 35120 }, { "epoch": 3.32, "grad_norm": 62.96875, "learning_rate": 3.3560238358993206e-07, "loss": 1.4052, "step": 35140 }, { "epoch": 3.32, "grad_norm": 58.46875, "learning_rate": 3.352242400591265e-07, "loss": 1.4209, "step": 35160 }, { "epoch": 3.33, "grad_norm": 94.875, "learning_rate": 3.3484609652832096e-07, "loss": 1.4097, "step": 35180 }, { "epoch": 3.33, "grad_norm": 62.84375, "learning_rate": 3.344679529975154e-07, "loss": 1.4691, "step": 35200 }, { "epoch": 3.33, "grad_norm": 50.25, "learning_rate": 3.3408980946670985e-07, "loss": 1.3639, "step": 35220 }, { "epoch": 3.33, "grad_norm": 71.0, "learning_rate": 3.3371166593590427e-07, "loss": 1.3215, "step": 35240 }, { "epoch": 3.33, "grad_norm": 82.5, "learning_rate": 3.333335224050987e-07, "loss": 1.428, "step": 35260 }, { "epoch": 3.34, "grad_norm": 80.4375, "learning_rate": 3.3295537887429317e-07, "loss": 1.4102, "step": 35280 }, { "epoch": 3.34, "grad_norm": 62.8125, "learning_rate": 3.325772353434876e-07, "loss": 1.3712, "step": 35300 }, { "epoch": 3.34, "grad_norm": 65.9375, "learning_rate": 3.32199091812682e-07, "loss": 1.4205, "step": 35320 }, { "epoch": 3.34, "grad_norm": 72.0, "learning_rate": 3.3182094828187654e-07, "loss": 1.3905, "step": 35340 }, { "epoch": 3.34, "grad_norm": 75.0, "learning_rate": 3.3144280475107096e-07, "loss": 1.4007, "step": 35360 }, { "epoch": 3.34, "grad_norm": 69.125, "learning_rate": 3.310646612202654e-07, "loss": 1.4627, "step": 35380 }, { "epoch": 3.35, "grad_norm": 92.25, "learning_rate": 3.306865176894598e-07, "loss": 1.3625, "step": 35400 }, { "epoch": 3.35, "grad_norm": 56.5, "learning_rate": 3.303083741586543e-07, "loss": 1.4248, "step": 35420 }, { "epoch": 3.35, "grad_norm": 56.34375, "learning_rate": 3.299302306278487e-07, "loss": 1.384, "step": 35440 }, { "epoch": 3.35, "grad_norm": 60.875, "learning_rate": 3.295520870970431e-07, "loss": 1.4455, "step": 35460 }, { "epoch": 3.35, "grad_norm": 58.90625, "learning_rate": 3.2917394356623754e-07, "loss": 1.4998, "step": 35480 }, { "epoch": 3.36, "grad_norm": 64.375, "learning_rate": 3.28795800035432e-07, "loss": 1.4044, "step": 35500 }, { "epoch": 3.36, "grad_norm": 65.0, "learning_rate": 3.284176565046265e-07, "loss": 1.3424, "step": 35520 }, { "epoch": 3.36, "grad_norm": 78.1875, "learning_rate": 3.280395129738209e-07, "loss": 1.3932, "step": 35540 }, { "epoch": 3.36, "grad_norm": 115.625, "learning_rate": 3.276613694430154e-07, "loss": 1.4185, "step": 35560 }, { "epoch": 3.36, "grad_norm": 51.8125, "learning_rate": 3.272832259122098e-07, "loss": 1.357, "step": 35580 }, { "epoch": 3.37, "grad_norm": 81.0, "learning_rate": 3.2690508238140423e-07, "loss": 1.3221, "step": 35600 }, { "epoch": 3.37, "grad_norm": 58.84375, "learning_rate": 3.265269388505987e-07, "loss": 1.3425, "step": 35620 }, { "epoch": 3.37, "grad_norm": 79.8125, "learning_rate": 3.261487953197931e-07, "loss": 1.5156, "step": 35640 }, { "epoch": 3.37, "grad_norm": 99.625, "learning_rate": 3.2577065178898754e-07, "loss": 1.4331, "step": 35660 }, { "epoch": 3.37, "grad_norm": 64.4375, "learning_rate": 3.2539250825818196e-07, "loss": 1.4773, "step": 35680 }, { "epoch": 3.37, "grad_norm": 57.9375, "learning_rate": 3.250143647273765e-07, "loss": 1.386, "step": 35700 }, { "epoch": 3.38, "grad_norm": 67.6875, "learning_rate": 3.246362211965709e-07, "loss": 1.4126, "step": 35720 }, { "epoch": 3.38, "grad_norm": 79.6875, "learning_rate": 3.2425807766576533e-07, "loss": 1.4665, "step": 35740 }, { "epoch": 3.38, "grad_norm": 53.125, "learning_rate": 3.238799341349598e-07, "loss": 1.3703, "step": 35760 }, { "epoch": 3.38, "grad_norm": 53.0, "learning_rate": 3.2350179060415423e-07, "loss": 1.4106, "step": 35780 }, { "epoch": 3.38, "grad_norm": 54.09375, "learning_rate": 3.2312364707334865e-07, "loss": 1.3638, "step": 35800 }, { "epoch": 3.39, "grad_norm": 86.25, "learning_rate": 3.2274550354254307e-07, "loss": 1.4043, "step": 35820 }, { "epoch": 3.39, "grad_norm": 70.25, "learning_rate": 3.2236736001173755e-07, "loss": 1.4668, "step": 35840 }, { "epoch": 3.39, "grad_norm": 46.53125, "learning_rate": 3.2198921648093197e-07, "loss": 1.4028, "step": 35860 }, { "epoch": 3.39, "grad_norm": 62.8125, "learning_rate": 3.2161107295012644e-07, "loss": 1.4119, "step": 35880 }, { "epoch": 3.39, "grad_norm": 62.8125, "learning_rate": 3.212329294193209e-07, "loss": 1.3546, "step": 35900 }, { "epoch": 3.4, "grad_norm": 59.78125, "learning_rate": 3.2085478588851534e-07, "loss": 1.4402, "step": 35920 }, { "epoch": 3.4, "grad_norm": 63.1875, "learning_rate": 3.2047664235770976e-07, "loss": 1.3611, "step": 35940 }, { "epoch": 3.4, "grad_norm": 55.875, "learning_rate": 3.200984988269042e-07, "loss": 1.4106, "step": 35960 }, { "epoch": 3.4, "grad_norm": 61.53125, "learning_rate": 3.1972035529609865e-07, "loss": 1.4203, "step": 35980 }, { "epoch": 3.4, "grad_norm": 56.03125, "learning_rate": 3.193422117652931e-07, "loss": 1.4757, "step": 36000 }, { "epoch": 3.41, "grad_norm": 62.4375, "learning_rate": 3.189640682344875e-07, "loss": 1.4443, "step": 36020 }, { "epoch": 3.41, "grad_norm": 74.125, "learning_rate": 3.18585924703682e-07, "loss": 1.3875, "step": 36040 }, { "epoch": 3.41, "grad_norm": 72.4375, "learning_rate": 3.1820778117287645e-07, "loss": 1.3827, "step": 36060 }, { "epoch": 3.41, "grad_norm": 63.84375, "learning_rate": 3.1782963764207087e-07, "loss": 1.4127, "step": 36080 }, { "epoch": 3.41, "grad_norm": 58.0, "learning_rate": 3.1745149411126534e-07, "loss": 1.4644, "step": 36100 }, { "epoch": 3.41, "grad_norm": 67.3125, "learning_rate": 3.1707335058045976e-07, "loss": 1.3745, "step": 36120 }, { "epoch": 3.42, "grad_norm": 73.25, "learning_rate": 3.166952070496542e-07, "loss": 1.4179, "step": 36140 }, { "epoch": 3.42, "grad_norm": 58.1875, "learning_rate": 3.163170635188486e-07, "loss": 1.4096, "step": 36160 }, { "epoch": 3.42, "grad_norm": 70.125, "learning_rate": 3.159389199880431e-07, "loss": 1.4569, "step": 36180 }, { "epoch": 3.42, "grad_norm": 55.6875, "learning_rate": 3.155607764572375e-07, "loss": 1.338, "step": 36200 }, { "epoch": 3.42, "grad_norm": 70.5, "learning_rate": 3.15182632926432e-07, "loss": 1.4023, "step": 36220 }, { "epoch": 3.43, "grad_norm": 351.5, "learning_rate": 3.1480448939562645e-07, "loss": 1.4474, "step": 36240 }, { "epoch": 3.43, "grad_norm": 69.4375, "learning_rate": 3.1442634586482087e-07, "loss": 1.3545, "step": 36260 }, { "epoch": 3.43, "grad_norm": 69.875, "learning_rate": 3.140482023340153e-07, "loss": 1.3057, "step": 36280 }, { "epoch": 3.43, "grad_norm": 48.0625, "learning_rate": 3.136700588032097e-07, "loss": 1.4568, "step": 36300 }, { "epoch": 3.43, "grad_norm": 81.625, "learning_rate": 3.132919152724042e-07, "loss": 1.3281, "step": 36320 }, { "epoch": 3.44, "grad_norm": 52.8125, "learning_rate": 3.129137717415986e-07, "loss": 1.4704, "step": 36340 }, { "epoch": 3.44, "grad_norm": 65.1875, "learning_rate": 3.1253562821079303e-07, "loss": 1.4349, "step": 36360 }, { "epoch": 3.44, "grad_norm": 76.125, "learning_rate": 3.121574846799875e-07, "loss": 1.4236, "step": 36380 }, { "epoch": 3.44, "grad_norm": 50.34375, "learning_rate": 3.11779341149182e-07, "loss": 1.2999, "step": 36400 }, { "epoch": 3.44, "grad_norm": 61.28125, "learning_rate": 3.114011976183764e-07, "loss": 1.3776, "step": 36420 }, { "epoch": 3.44, "grad_norm": 71.1875, "learning_rate": 3.110230540875708e-07, "loss": 1.3691, "step": 36440 }, { "epoch": 3.45, "grad_norm": 51.34375, "learning_rate": 3.106449105567653e-07, "loss": 1.335, "step": 36460 }, { "epoch": 3.45, "grad_norm": 60.625, "learning_rate": 3.102667670259597e-07, "loss": 1.2949, "step": 36480 }, { "epoch": 3.45, "grad_norm": 55.96875, "learning_rate": 3.0988862349515414e-07, "loss": 1.402, "step": 36500 }, { "epoch": 3.45, "grad_norm": 76.25, "learning_rate": 3.095104799643486e-07, "loss": 1.4453, "step": 36520 }, { "epoch": 3.45, "grad_norm": 64.8125, "learning_rate": 3.0913233643354303e-07, "loss": 1.3819, "step": 36540 }, { "epoch": 3.46, "grad_norm": 54.96875, "learning_rate": 3.087541929027375e-07, "loss": 1.3811, "step": 36560 }, { "epoch": 3.46, "grad_norm": 57.84375, "learning_rate": 3.08376049371932e-07, "loss": 1.4251, "step": 36580 }, { "epoch": 3.46, "grad_norm": 57.8125, "learning_rate": 3.079979058411264e-07, "loss": 1.4553, "step": 36600 }, { "epoch": 3.46, "grad_norm": 53.875, "learning_rate": 3.076197623103208e-07, "loss": 1.3474, "step": 36620 }, { "epoch": 3.46, "grad_norm": 63.6875, "learning_rate": 3.0724161877951525e-07, "loss": 1.4288, "step": 36640 }, { "epoch": 3.47, "grad_norm": 55.78125, "learning_rate": 3.068634752487097e-07, "loss": 1.431, "step": 36660 }, { "epoch": 3.47, "grad_norm": 63.46875, "learning_rate": 3.0648533171790414e-07, "loss": 1.3994, "step": 36680 }, { "epoch": 3.47, "grad_norm": 90.375, "learning_rate": 3.0610718818709856e-07, "loss": 1.4356, "step": 36700 }, { "epoch": 3.47, "grad_norm": 71.0625, "learning_rate": 3.0572904465629304e-07, "loss": 1.3515, "step": 36720 }, { "epoch": 3.47, "grad_norm": 67.1875, "learning_rate": 3.053509011254875e-07, "loss": 1.3501, "step": 36740 }, { "epoch": 3.48, "grad_norm": 66.25, "learning_rate": 3.0497275759468193e-07, "loss": 1.4016, "step": 36760 }, { "epoch": 3.48, "grad_norm": 69.4375, "learning_rate": 3.0459461406387635e-07, "loss": 1.3546, "step": 36780 }, { "epoch": 3.48, "grad_norm": 51.0625, "learning_rate": 3.0421647053307083e-07, "loss": 1.4189, "step": 36800 }, { "epoch": 3.48, "grad_norm": 72.0625, "learning_rate": 3.0383832700226525e-07, "loss": 1.4171, "step": 36820 }, { "epoch": 3.48, "grad_norm": 54.59375, "learning_rate": 3.0346018347145967e-07, "loss": 1.4448, "step": 36840 }, { "epoch": 3.48, "grad_norm": 49.0625, "learning_rate": 3.0308203994065415e-07, "loss": 1.4652, "step": 36860 }, { "epoch": 3.49, "grad_norm": 72.4375, "learning_rate": 3.0270389640984857e-07, "loss": 1.41, "step": 36880 }, { "epoch": 3.49, "grad_norm": 67.1875, "learning_rate": 3.02325752879043e-07, "loss": 1.3746, "step": 36900 }, { "epoch": 3.49, "grad_norm": 86.5, "learning_rate": 3.0194760934823746e-07, "loss": 1.3318, "step": 36920 }, { "epoch": 3.49, "grad_norm": 61.90625, "learning_rate": 3.0156946581743194e-07, "loss": 1.3764, "step": 36940 }, { "epoch": 3.49, "grad_norm": 63.375, "learning_rate": 3.0119132228662636e-07, "loss": 1.4998, "step": 36960 }, { "epoch": 3.5, "grad_norm": 52.8125, "learning_rate": 3.008131787558208e-07, "loss": 1.4612, "step": 36980 }, { "epoch": 3.5, "grad_norm": 62.125, "learning_rate": 3.0043503522501525e-07, "loss": 1.3757, "step": 37000 }, { "epoch": 3.5, "grad_norm": 56.5625, "learning_rate": 3.000568916942097e-07, "loss": 1.3478, "step": 37020 }, { "epoch": 3.5, "grad_norm": 87.3125, "learning_rate": 2.996787481634041e-07, "loss": 1.3444, "step": 37040 }, { "epoch": 3.5, "grad_norm": 59.375, "learning_rate": 2.993006046325985e-07, "loss": 1.4231, "step": 37060 }, { "epoch": 3.51, "grad_norm": 64.625, "learning_rate": 2.98922461101793e-07, "loss": 1.4394, "step": 37080 }, { "epoch": 3.51, "grad_norm": 58.21875, "learning_rate": 2.9854431757098747e-07, "loss": 1.3933, "step": 37100 }, { "epoch": 3.51, "grad_norm": 87.5625, "learning_rate": 2.981661740401819e-07, "loss": 1.4413, "step": 37120 }, { "epoch": 3.51, "grad_norm": 63.03125, "learning_rate": 2.9778803050937636e-07, "loss": 1.513, "step": 37140 }, { "epoch": 3.51, "grad_norm": 87.375, "learning_rate": 2.974098869785708e-07, "loss": 1.3841, "step": 37160 }, { "epoch": 3.51, "grad_norm": 69.6875, "learning_rate": 2.970317434477652e-07, "loss": 1.4671, "step": 37180 }, { "epoch": 3.52, "grad_norm": 63.5625, "learning_rate": 2.966535999169596e-07, "loss": 1.3005, "step": 37200 }, { "epoch": 3.52, "grad_norm": 58.96875, "learning_rate": 2.962754563861541e-07, "loss": 1.3706, "step": 37220 }, { "epoch": 3.52, "grad_norm": 59.0625, "learning_rate": 2.958973128553485e-07, "loss": 1.3605, "step": 37240 }, { "epoch": 3.52, "grad_norm": 53.15625, "learning_rate": 2.95519169324543e-07, "loss": 1.4114, "step": 37260 }, { "epoch": 3.52, "grad_norm": 76.8125, "learning_rate": 2.9514102579373747e-07, "loss": 1.3111, "step": 37280 }, { "epoch": 3.53, "grad_norm": 66.9375, "learning_rate": 2.947628822629319e-07, "loss": 1.3771, "step": 37300 }, { "epoch": 3.53, "grad_norm": 72.5625, "learning_rate": 2.943847387321263e-07, "loss": 1.3847, "step": 37320 }, { "epoch": 3.53, "grad_norm": 55.4375, "learning_rate": 2.940065952013208e-07, "loss": 1.3796, "step": 37340 }, { "epoch": 3.53, "grad_norm": 60.9375, "learning_rate": 2.936284516705152e-07, "loss": 1.4556, "step": 37360 }, { "epoch": 3.53, "grad_norm": 77.6875, "learning_rate": 2.9325030813970963e-07, "loss": 1.334, "step": 37380 }, { "epoch": 3.54, "grad_norm": 77.5625, "learning_rate": 2.9287216460890405e-07, "loss": 1.4496, "step": 37400 }, { "epoch": 3.54, "grad_norm": 62.6875, "learning_rate": 2.924940210780985e-07, "loss": 1.3992, "step": 37420 }, { "epoch": 3.54, "grad_norm": 75.125, "learning_rate": 2.92115877547293e-07, "loss": 1.3938, "step": 37440 }, { "epoch": 3.54, "grad_norm": 117.625, "learning_rate": 2.917377340164874e-07, "loss": 1.4084, "step": 37460 }, { "epoch": 3.54, "grad_norm": 81.9375, "learning_rate": 2.913595904856819e-07, "loss": 1.4994, "step": 37480 }, { "epoch": 3.55, "grad_norm": 87.6875, "learning_rate": 2.909814469548763e-07, "loss": 1.383, "step": 37500 }, { "epoch": 3.55, "grad_norm": 59.1875, "learning_rate": 2.9060330342407074e-07, "loss": 1.3683, "step": 37520 }, { "epoch": 3.55, "grad_norm": 71.5, "learning_rate": 2.9022515989326516e-07, "loss": 1.4564, "step": 37540 }, { "epoch": 3.55, "grad_norm": 59.53125, "learning_rate": 2.8984701636245963e-07, "loss": 1.3854, "step": 37560 }, { "epoch": 3.55, "grad_norm": 66.8125, "learning_rate": 2.8946887283165405e-07, "loss": 1.4842, "step": 37580 }, { "epoch": 3.55, "grad_norm": 60.84375, "learning_rate": 2.890907293008485e-07, "loss": 1.4141, "step": 37600 }, { "epoch": 3.56, "grad_norm": 53.40625, "learning_rate": 2.88712585770043e-07, "loss": 1.4447, "step": 37620 }, { "epoch": 3.56, "grad_norm": 64.0625, "learning_rate": 2.883344422392374e-07, "loss": 1.3315, "step": 37640 }, { "epoch": 3.56, "grad_norm": 77.8125, "learning_rate": 2.8795629870843184e-07, "loss": 1.4563, "step": 37660 }, { "epoch": 3.56, "grad_norm": 74.1875, "learning_rate": 2.8757815517762627e-07, "loss": 1.3781, "step": 37680 }, { "epoch": 3.56, "grad_norm": 68.625, "learning_rate": 2.8720001164682074e-07, "loss": 1.3988, "step": 37700 }, { "epoch": 3.57, "grad_norm": 76.0625, "learning_rate": 2.8682186811601516e-07, "loss": 1.4011, "step": 37720 }, { "epoch": 3.57, "grad_norm": 76.0625, "learning_rate": 2.864437245852096e-07, "loss": 1.371, "step": 37740 }, { "epoch": 3.57, "grad_norm": 67.6875, "learning_rate": 2.8606558105440406e-07, "loss": 1.3667, "step": 37760 }, { "epoch": 3.57, "grad_norm": 51.25, "learning_rate": 2.856874375235985e-07, "loss": 1.4467, "step": 37780 }, { "epoch": 3.57, "grad_norm": 50.96875, "learning_rate": 2.8530929399279295e-07, "loss": 1.3751, "step": 37800 }, { "epoch": 3.58, "grad_norm": 61.90625, "learning_rate": 2.8493115046198743e-07, "loss": 1.3476, "step": 37820 }, { "epoch": 3.58, "grad_norm": 76.5625, "learning_rate": 2.8455300693118185e-07, "loss": 1.2849, "step": 37840 }, { "epoch": 3.58, "grad_norm": 49.28125, "learning_rate": 2.8417486340037627e-07, "loss": 1.4695, "step": 37860 }, { "epoch": 3.58, "grad_norm": 66.0625, "learning_rate": 2.837967198695707e-07, "loss": 1.373, "step": 37880 }, { "epoch": 3.58, "grad_norm": 61.21875, "learning_rate": 2.8341857633876516e-07, "loss": 1.365, "step": 37900 }, { "epoch": 3.58, "grad_norm": 77.9375, "learning_rate": 2.830404328079596e-07, "loss": 1.3943, "step": 37920 }, { "epoch": 3.59, "grad_norm": 71.875, "learning_rate": 2.82662289277154e-07, "loss": 1.4403, "step": 37940 }, { "epoch": 3.59, "grad_norm": 64.875, "learning_rate": 2.8228414574634853e-07, "loss": 1.4911, "step": 37960 }, { "epoch": 3.59, "grad_norm": 62.4375, "learning_rate": 2.8190600221554296e-07, "loss": 1.4561, "step": 37980 }, { "epoch": 3.59, "grad_norm": 63.9375, "learning_rate": 2.815278586847374e-07, "loss": 1.2846, "step": 38000 }, { "epoch": 3.59, "grad_norm": 61.875, "learning_rate": 2.811497151539318e-07, "loss": 1.4566, "step": 38020 }, { "epoch": 3.6, "grad_norm": 67.625, "learning_rate": 2.8077157162312627e-07, "loss": 1.4428, "step": 38040 }, { "epoch": 3.6, "grad_norm": 68.6875, "learning_rate": 2.803934280923207e-07, "loss": 1.4209, "step": 38060 }, { "epoch": 3.6, "grad_norm": 60.71875, "learning_rate": 2.800152845615151e-07, "loss": 1.2564, "step": 38080 }, { "epoch": 3.6, "grad_norm": 64.5, "learning_rate": 2.796371410307096e-07, "loss": 1.5052, "step": 38100 }, { "epoch": 3.6, "grad_norm": 65.75, "learning_rate": 2.79258997499904e-07, "loss": 1.3543, "step": 38120 }, { "epoch": 3.61, "grad_norm": 64.25, "learning_rate": 2.788808539690985e-07, "loss": 1.3216, "step": 38140 }, { "epoch": 3.61, "grad_norm": 64.4375, "learning_rate": 2.785027104382929e-07, "loss": 1.3808, "step": 38160 }, { "epoch": 3.61, "grad_norm": 61.875, "learning_rate": 2.781245669074874e-07, "loss": 1.4452, "step": 38180 }, { "epoch": 3.61, "grad_norm": 73.0, "learning_rate": 2.777464233766818e-07, "loss": 1.3897, "step": 38200 }, { "epoch": 3.61, "grad_norm": 51.90625, "learning_rate": 2.773682798458762e-07, "loss": 1.4239, "step": 38220 }, { "epoch": 3.62, "grad_norm": 63.875, "learning_rate": 2.769901363150707e-07, "loss": 1.2993, "step": 38240 }, { "epoch": 3.62, "grad_norm": 60.1875, "learning_rate": 2.766119927842651e-07, "loss": 1.4217, "step": 38260 }, { "epoch": 3.62, "grad_norm": 77.9375, "learning_rate": 2.7623384925345954e-07, "loss": 1.2893, "step": 38280 }, { "epoch": 3.62, "grad_norm": 59.9375, "learning_rate": 2.7585570572265396e-07, "loss": 1.4078, "step": 38300 }, { "epoch": 3.62, "grad_norm": 74.75, "learning_rate": 2.754775621918485e-07, "loss": 1.4683, "step": 38320 }, { "epoch": 3.62, "grad_norm": 53.9375, "learning_rate": 2.750994186610429e-07, "loss": 1.4265, "step": 38340 }, { "epoch": 3.63, "grad_norm": 51.625, "learning_rate": 2.7472127513023733e-07, "loss": 1.3032, "step": 38360 }, { "epoch": 3.63, "grad_norm": 60.71875, "learning_rate": 2.743431315994318e-07, "loss": 1.4044, "step": 38380 }, { "epoch": 3.63, "grad_norm": 57.46875, "learning_rate": 2.7396498806862623e-07, "loss": 1.2896, "step": 38400 }, { "epoch": 3.63, "grad_norm": 60.75, "learning_rate": 2.7358684453782065e-07, "loss": 1.4532, "step": 38420 }, { "epoch": 3.63, "grad_norm": 115.375, "learning_rate": 2.732087010070151e-07, "loss": 1.4785, "step": 38440 }, { "epoch": 3.64, "grad_norm": 71.5, "learning_rate": 2.7283055747620954e-07, "loss": 1.3397, "step": 38460 }, { "epoch": 3.64, "grad_norm": 48.15625, "learning_rate": 2.7245241394540396e-07, "loss": 1.3967, "step": 38480 }, { "epoch": 3.64, "grad_norm": 59.96875, "learning_rate": 2.7207427041459844e-07, "loss": 1.4357, "step": 38500 }, { "epoch": 3.64, "grad_norm": 118.1875, "learning_rate": 2.716961268837929e-07, "loss": 1.3167, "step": 38520 }, { "epoch": 3.64, "grad_norm": 77.375, "learning_rate": 2.7131798335298733e-07, "loss": 1.4363, "step": 38540 }, { "epoch": 3.65, "grad_norm": 57.375, "learning_rate": 2.7093983982218176e-07, "loss": 1.3934, "step": 38560 }, { "epoch": 3.65, "grad_norm": 66.9375, "learning_rate": 2.7056169629137623e-07, "loss": 1.3952, "step": 38580 }, { "epoch": 3.65, "grad_norm": 53.09375, "learning_rate": 2.7018355276057065e-07, "loss": 1.3954, "step": 38600 }, { "epoch": 3.65, "grad_norm": 80.3125, "learning_rate": 2.6980540922976507e-07, "loss": 1.4784, "step": 38620 }, { "epoch": 3.65, "grad_norm": 65.5, "learning_rate": 2.694272656989595e-07, "loss": 1.3971, "step": 38640 }, { "epoch": 3.65, "grad_norm": 53.59375, "learning_rate": 2.69049122168154e-07, "loss": 1.412, "step": 38660 }, { "epoch": 3.66, "grad_norm": 81.3125, "learning_rate": 2.6867097863734844e-07, "loss": 1.4045, "step": 38680 }, { "epoch": 3.66, "grad_norm": 68.875, "learning_rate": 2.6829283510654286e-07, "loss": 1.3473, "step": 38700 }, { "epoch": 3.66, "grad_norm": 70.5625, "learning_rate": 2.6791469157573734e-07, "loss": 1.4147, "step": 38720 }, { "epoch": 3.66, "grad_norm": 76.8125, "learning_rate": 2.6753654804493176e-07, "loss": 1.438, "step": 38740 }, { "epoch": 3.66, "grad_norm": 65.375, "learning_rate": 2.671584045141262e-07, "loss": 1.4028, "step": 38760 }, { "epoch": 3.67, "grad_norm": 51.65625, "learning_rate": 2.667802609833206e-07, "loss": 1.343, "step": 38780 }, { "epoch": 3.67, "grad_norm": 68.0625, "learning_rate": 2.664021174525151e-07, "loss": 1.3459, "step": 38800 }, { "epoch": 3.67, "grad_norm": 60.15625, "learning_rate": 2.660239739217095e-07, "loss": 1.3992, "step": 38820 }, { "epoch": 3.67, "grad_norm": 84.75, "learning_rate": 2.6564583039090397e-07, "loss": 1.3889, "step": 38840 }, { "epoch": 3.67, "grad_norm": 80.25, "learning_rate": 2.6526768686009845e-07, "loss": 1.3914, "step": 38860 }, { "epoch": 3.68, "grad_norm": 63.0, "learning_rate": 2.6488954332929287e-07, "loss": 1.2944, "step": 38880 }, { "epoch": 3.68, "grad_norm": 90.6875, "learning_rate": 2.645113997984873e-07, "loss": 1.4389, "step": 38900 }, { "epoch": 3.68, "grad_norm": 82.75, "learning_rate": 2.6413325626768176e-07, "loss": 1.3992, "step": 38920 }, { "epoch": 3.68, "grad_norm": 76.0, "learning_rate": 2.637551127368762e-07, "loss": 1.4458, "step": 38940 }, { "epoch": 3.68, "grad_norm": 72.875, "learning_rate": 2.633769692060706e-07, "loss": 1.4511, "step": 38960 }, { "epoch": 3.69, "grad_norm": 78.4375, "learning_rate": 2.6299882567526503e-07, "loss": 1.3879, "step": 38980 }, { "epoch": 3.69, "grad_norm": 65.0625, "learning_rate": 2.626206821444595e-07, "loss": 1.3725, "step": 39000 }, { "epoch": 3.69, "grad_norm": 65.5625, "learning_rate": 2.62242538613654e-07, "loss": 1.3031, "step": 39020 }, { "epoch": 3.69, "grad_norm": 68.9375, "learning_rate": 2.618643950828484e-07, "loss": 1.4757, "step": 39040 }, { "epoch": 3.69, "grad_norm": 64.375, "learning_rate": 2.6148625155204287e-07, "loss": 1.3408, "step": 39060 }, { "epoch": 3.69, "grad_norm": 57.4375, "learning_rate": 2.611081080212373e-07, "loss": 1.359, "step": 39080 }, { "epoch": 3.7, "grad_norm": 59.78125, "learning_rate": 2.607299644904317e-07, "loss": 1.4038, "step": 39100 }, { "epoch": 3.7, "grad_norm": 65.375, "learning_rate": 2.6035182095962613e-07, "loss": 1.4142, "step": 39120 }, { "epoch": 3.7, "grad_norm": 72.25, "learning_rate": 2.599736774288206e-07, "loss": 1.4517, "step": 39140 }, { "epoch": 3.7, "grad_norm": 57.125, "learning_rate": 2.5959553389801503e-07, "loss": 1.4371, "step": 39160 }, { "epoch": 3.7, "grad_norm": 82.1875, "learning_rate": 2.592173903672095e-07, "loss": 1.3103, "step": 39180 }, { "epoch": 3.71, "grad_norm": 73.125, "learning_rate": 2.58839246836404e-07, "loss": 1.3995, "step": 39200 }, { "epoch": 3.71, "grad_norm": 72.375, "learning_rate": 2.584611033055984e-07, "loss": 1.4843, "step": 39220 }, { "epoch": 3.71, "grad_norm": 56.1875, "learning_rate": 2.580829597747928e-07, "loss": 1.4293, "step": 39240 }, { "epoch": 3.71, "grad_norm": 73.4375, "learning_rate": 2.5770481624398724e-07, "loss": 1.4464, "step": 39260 }, { "epoch": 3.71, "grad_norm": 64.875, "learning_rate": 2.573266727131817e-07, "loss": 1.3824, "step": 39280 }, { "epoch": 3.72, "grad_norm": 47.34375, "learning_rate": 2.5694852918237614e-07, "loss": 1.4179, "step": 39300 }, { "epoch": 3.72, "grad_norm": 53.34375, "learning_rate": 2.5657038565157056e-07, "loss": 1.4673, "step": 39320 }, { "epoch": 3.72, "grad_norm": 57.21875, "learning_rate": 2.5619224212076503e-07, "loss": 1.3636, "step": 39340 }, { "epoch": 3.72, "grad_norm": 75.0, "learning_rate": 2.558140985899595e-07, "loss": 1.4275, "step": 39360 }, { "epoch": 3.72, "grad_norm": 62.1875, "learning_rate": 2.5543595505915393e-07, "loss": 1.4423, "step": 39380 }, { "epoch": 3.72, "grad_norm": 57.0625, "learning_rate": 2.5505781152834835e-07, "loss": 1.3202, "step": 39400 }, { "epoch": 3.73, "grad_norm": 74.9375, "learning_rate": 2.546796679975428e-07, "loss": 1.4716, "step": 39420 }, { "epoch": 3.73, "grad_norm": 62.03125, "learning_rate": 2.5430152446673725e-07, "loss": 1.3571, "step": 39440 }, { "epoch": 3.73, "grad_norm": 65.25, "learning_rate": 2.5392338093593167e-07, "loss": 1.44, "step": 39460 }, { "epoch": 3.73, "grad_norm": 48.59375, "learning_rate": 2.5354523740512614e-07, "loss": 1.4017, "step": 39480 }, { "epoch": 3.73, "grad_norm": 58.78125, "learning_rate": 2.5316709387432056e-07, "loss": 1.4145, "step": 39500 }, { "epoch": 3.74, "grad_norm": 108.5, "learning_rate": 2.52788950343515e-07, "loss": 1.3431, "step": 39520 }, { "epoch": 3.74, "grad_norm": 66.0, "learning_rate": 2.524108068127095e-07, "loss": 1.4277, "step": 39540 }, { "epoch": 3.74, "grad_norm": 72.625, "learning_rate": 2.5203266328190393e-07, "loss": 1.4133, "step": 39560 }, { "epoch": 3.74, "grad_norm": 68.75, "learning_rate": 2.5165451975109835e-07, "loss": 1.3857, "step": 39580 }, { "epoch": 3.74, "grad_norm": 57.0, "learning_rate": 2.512763762202928e-07, "loss": 1.4572, "step": 39600 }, { "epoch": 3.75, "grad_norm": 71.5625, "learning_rate": 2.5089823268948725e-07, "loss": 1.3965, "step": 39620 }, { "epoch": 3.75, "grad_norm": 66.75, "learning_rate": 2.5052008915868167e-07, "loss": 1.3597, "step": 39640 }, { "epoch": 3.75, "grad_norm": 69.0625, "learning_rate": 2.501419456278761e-07, "loss": 1.3411, "step": 39660 }, { "epoch": 3.75, "grad_norm": 54.0625, "learning_rate": 2.4976380209707057e-07, "loss": 1.4024, "step": 39680 }, { "epoch": 3.75, "grad_norm": 75.4375, "learning_rate": 2.49385658566265e-07, "loss": 1.4341, "step": 39700 }, { "epoch": 3.75, "grad_norm": 60.875, "learning_rate": 2.4900751503545946e-07, "loss": 1.4068, "step": 39720 }, { "epoch": 3.76, "grad_norm": 61.0, "learning_rate": 2.486293715046539e-07, "loss": 1.3496, "step": 39740 }, { "epoch": 3.76, "grad_norm": 59.875, "learning_rate": 2.4825122797384836e-07, "loss": 1.39, "step": 39760 }, { "epoch": 3.76, "grad_norm": 71.4375, "learning_rate": 2.478730844430428e-07, "loss": 1.3572, "step": 39780 }, { "epoch": 3.76, "grad_norm": 77.4375, "learning_rate": 2.474949409122372e-07, "loss": 1.3969, "step": 39800 }, { "epoch": 3.76, "grad_norm": 52.34375, "learning_rate": 2.471167973814317e-07, "loss": 1.4315, "step": 39820 }, { "epoch": 3.77, "grad_norm": 102.9375, "learning_rate": 2.467386538506261e-07, "loss": 1.3345, "step": 39840 }, { "epoch": 3.77, "grad_norm": 68.6875, "learning_rate": 2.4636051031982057e-07, "loss": 1.3399, "step": 39860 }, { "epoch": 3.77, "grad_norm": 54.78125, "learning_rate": 2.45982366789015e-07, "loss": 1.4267, "step": 39880 }, { "epoch": 3.77, "grad_norm": 70.875, "learning_rate": 2.456042232582094e-07, "loss": 1.4507, "step": 39900 }, { "epoch": 3.77, "grad_norm": 56.40625, "learning_rate": 2.452260797274039e-07, "loss": 1.4029, "step": 39920 }, { "epoch": 3.78, "grad_norm": 71.25, "learning_rate": 2.448479361965983e-07, "loss": 1.4685, "step": 39940 }, { "epoch": 3.78, "grad_norm": 62.96875, "learning_rate": 2.444697926657928e-07, "loss": 1.4023, "step": 39960 }, { "epoch": 3.78, "grad_norm": 69.875, "learning_rate": 2.440916491349872e-07, "loss": 1.4639, "step": 39980 }, { "epoch": 3.78, "grad_norm": 65.6875, "learning_rate": 2.437135056041816e-07, "loss": 1.3749, "step": 40000 }, { "epoch": 3.78, "grad_norm": 69.6875, "learning_rate": 2.433353620733761e-07, "loss": 1.41, "step": 40020 }, { "epoch": 3.79, "grad_norm": 67.1875, "learning_rate": 2.429572185425705e-07, "loss": 1.4743, "step": 40040 }, { "epoch": 3.79, "grad_norm": 71.375, "learning_rate": 2.42579075011765e-07, "loss": 1.3925, "step": 40060 }, { "epoch": 3.79, "grad_norm": 63.84375, "learning_rate": 2.422009314809594e-07, "loss": 1.4514, "step": 40080 }, { "epoch": 3.79, "grad_norm": 101.9375, "learning_rate": 2.418227879501539e-07, "loss": 1.4082, "step": 40100 }, { "epoch": 3.79, "grad_norm": 72.375, "learning_rate": 2.414446444193483e-07, "loss": 1.3906, "step": 40120 }, { "epoch": 3.79, "grad_norm": 54.625, "learning_rate": 2.4106650088854273e-07, "loss": 1.3851, "step": 40140 }, { "epoch": 3.8, "grad_norm": 75.1875, "learning_rate": 2.406883573577372e-07, "loss": 1.3869, "step": 40160 }, { "epoch": 3.8, "grad_norm": 65.5625, "learning_rate": 2.4031021382693163e-07, "loss": 1.3966, "step": 40180 }, { "epoch": 3.8, "grad_norm": 61.78125, "learning_rate": 2.399320702961261e-07, "loss": 1.3518, "step": 40200 }, { "epoch": 3.8, "grad_norm": 64.125, "learning_rate": 2.395539267653205e-07, "loss": 1.4587, "step": 40220 }, { "epoch": 3.8, "grad_norm": 64.25, "learning_rate": 2.3917578323451495e-07, "loss": 1.4854, "step": 40240 }, { "epoch": 3.81, "grad_norm": 68.75, "learning_rate": 2.3879763970370937e-07, "loss": 1.3629, "step": 40260 }, { "epoch": 3.81, "grad_norm": 93.6875, "learning_rate": 2.3841949617290384e-07, "loss": 1.4779, "step": 40280 }, { "epoch": 3.81, "grad_norm": 52.40625, "learning_rate": 2.380413526420983e-07, "loss": 1.3754, "step": 40300 }, { "epoch": 3.81, "grad_norm": 61.46875, "learning_rate": 2.3766320911129274e-07, "loss": 1.395, "step": 40320 }, { "epoch": 3.81, "grad_norm": 72.3125, "learning_rate": 2.3728506558048716e-07, "loss": 1.3755, "step": 40340 }, { "epoch": 3.82, "grad_norm": 57.53125, "learning_rate": 2.3690692204968163e-07, "loss": 1.3909, "step": 40360 }, { "epoch": 3.82, "grad_norm": 66.6875, "learning_rate": 2.3652877851887605e-07, "loss": 1.385, "step": 40380 }, { "epoch": 3.82, "grad_norm": 59.40625, "learning_rate": 2.361506349880705e-07, "loss": 1.4461, "step": 40400 }, { "epoch": 3.82, "grad_norm": 70.75, "learning_rate": 2.3577249145726495e-07, "loss": 1.379, "step": 40420 }, { "epoch": 3.82, "grad_norm": 73.0, "learning_rate": 2.3539434792645937e-07, "loss": 1.3794, "step": 40440 }, { "epoch": 3.82, "grad_norm": 58.6875, "learning_rate": 2.3501620439565384e-07, "loss": 1.4292, "step": 40460 }, { "epoch": 3.83, "grad_norm": 67.25, "learning_rate": 2.3463806086484827e-07, "loss": 1.4327, "step": 40480 }, { "epoch": 3.83, "grad_norm": 88.4375, "learning_rate": 2.3425991733404271e-07, "loss": 1.5276, "step": 40500 }, { "epoch": 3.83, "grad_norm": 65.9375, "learning_rate": 2.3388177380323714e-07, "loss": 1.3107, "step": 40520 }, { "epoch": 3.83, "grad_norm": 65.25, "learning_rate": 2.335036302724316e-07, "loss": 1.3638, "step": 40540 }, { "epoch": 3.83, "grad_norm": 83.625, "learning_rate": 2.3312548674162606e-07, "loss": 1.3733, "step": 40560 }, { "epoch": 3.84, "grad_norm": 66.6875, "learning_rate": 2.3274734321082048e-07, "loss": 1.4898, "step": 40580 }, { "epoch": 3.84, "grad_norm": 64.5, "learning_rate": 2.3236919968001493e-07, "loss": 1.4436, "step": 40600 }, { "epoch": 3.84, "grad_norm": 67.5625, "learning_rate": 2.3199105614920935e-07, "loss": 1.4491, "step": 40620 }, { "epoch": 3.84, "grad_norm": 64.1875, "learning_rate": 2.3161291261840382e-07, "loss": 1.3816, "step": 40640 }, { "epoch": 3.84, "grad_norm": 59.03125, "learning_rate": 2.3123476908759827e-07, "loss": 1.4964, "step": 40660 }, { "epoch": 3.85, "grad_norm": 59.28125, "learning_rate": 2.308566255567927e-07, "loss": 1.364, "step": 40680 }, { "epoch": 3.85, "grad_norm": 62.15625, "learning_rate": 2.3047848202598714e-07, "loss": 1.3505, "step": 40700 }, { "epoch": 3.85, "grad_norm": 64.9375, "learning_rate": 2.3010033849518159e-07, "loss": 1.3727, "step": 40720 }, { "epoch": 3.85, "grad_norm": 74.375, "learning_rate": 2.2972219496437603e-07, "loss": 1.4421, "step": 40740 }, { "epoch": 3.85, "grad_norm": 88.5625, "learning_rate": 2.2934405143357046e-07, "loss": 1.4352, "step": 40760 }, { "epoch": 3.86, "grad_norm": 59.125, "learning_rate": 2.289659079027649e-07, "loss": 1.3484, "step": 40780 }, { "epoch": 3.86, "grad_norm": 61.25, "learning_rate": 2.2858776437195938e-07, "loss": 1.404, "step": 40800 }, { "epoch": 3.86, "grad_norm": 79.5, "learning_rate": 2.282096208411538e-07, "loss": 1.4222, "step": 40820 }, { "epoch": 3.86, "grad_norm": 55.6875, "learning_rate": 2.2783147731034825e-07, "loss": 1.3627, "step": 40840 }, { "epoch": 3.86, "grad_norm": 71.25, "learning_rate": 2.2745333377954267e-07, "loss": 1.4019, "step": 40860 }, { "epoch": 3.86, "grad_norm": 75.0, "learning_rate": 2.2707519024873712e-07, "loss": 1.3863, "step": 40880 }, { "epoch": 3.87, "grad_norm": 58.78125, "learning_rate": 2.266970467179316e-07, "loss": 1.3336, "step": 40900 }, { "epoch": 3.87, "grad_norm": 72.5625, "learning_rate": 2.26318903187126e-07, "loss": 1.437, "step": 40920 }, { "epoch": 3.87, "grad_norm": 53.90625, "learning_rate": 2.2594075965632046e-07, "loss": 1.3679, "step": 40940 }, { "epoch": 3.87, "grad_norm": 62.96875, "learning_rate": 2.2556261612551488e-07, "loss": 1.4215, "step": 40960 }, { "epoch": 3.87, "grad_norm": 62.03125, "learning_rate": 2.2518447259470935e-07, "loss": 1.4289, "step": 40980 }, { "epoch": 3.88, "grad_norm": 60.78125, "learning_rate": 2.2480632906390378e-07, "loss": 1.52, "step": 41000 }, { "epoch": 3.88, "grad_norm": 54.03125, "learning_rate": 2.2442818553309822e-07, "loss": 1.3679, "step": 41020 }, { "epoch": 3.88, "grad_norm": 61.6875, "learning_rate": 2.2405004200229267e-07, "loss": 1.5041, "step": 41040 }, { "epoch": 3.88, "grad_norm": 58.40625, "learning_rate": 2.2367189847148712e-07, "loss": 1.4264, "step": 41060 }, { "epoch": 3.88, "grad_norm": 58.03125, "learning_rate": 2.2329375494068157e-07, "loss": 1.3305, "step": 41080 }, { "epoch": 3.89, "grad_norm": 62.5625, "learning_rate": 2.22915611409876e-07, "loss": 1.3939, "step": 41100 }, { "epoch": 3.89, "grad_norm": 62.5, "learning_rate": 2.2253746787907044e-07, "loss": 1.3357, "step": 41120 }, { "epoch": 3.89, "grad_norm": 61.03125, "learning_rate": 2.2215932434826486e-07, "loss": 1.3863, "step": 41140 }, { "epoch": 3.89, "grad_norm": 71.5625, "learning_rate": 2.2178118081745933e-07, "loss": 1.5696, "step": 41160 }, { "epoch": 3.89, "grad_norm": 56.25, "learning_rate": 2.2140303728665378e-07, "loss": 1.3852, "step": 41180 }, { "epoch": 3.89, "grad_norm": 56.0625, "learning_rate": 2.210248937558482e-07, "loss": 1.3793, "step": 41200 }, { "epoch": 3.9, "grad_norm": 68.8125, "learning_rate": 2.2064675022504265e-07, "loss": 1.3969, "step": 41220 }, { "epoch": 3.9, "grad_norm": 86.3125, "learning_rate": 2.202686066942371e-07, "loss": 1.4984, "step": 41240 }, { "epoch": 3.9, "grad_norm": 73.5, "learning_rate": 2.1989046316343154e-07, "loss": 1.4178, "step": 41260 }, { "epoch": 3.9, "grad_norm": 82.875, "learning_rate": 2.19512319632626e-07, "loss": 1.4235, "step": 41280 }, { "epoch": 3.9, "grad_norm": 57.90625, "learning_rate": 2.191341761018204e-07, "loss": 1.3364, "step": 41300 }, { "epoch": 3.91, "grad_norm": 60.28125, "learning_rate": 2.187560325710149e-07, "loss": 1.3978, "step": 41320 }, { "epoch": 3.91, "grad_norm": 147.0, "learning_rate": 2.183778890402093e-07, "loss": 1.3867, "step": 41340 }, { "epoch": 3.91, "grad_norm": 106.375, "learning_rate": 2.1799974550940376e-07, "loss": 1.4255, "step": 41360 }, { "epoch": 3.91, "grad_norm": 75.1875, "learning_rate": 2.1762160197859818e-07, "loss": 1.4822, "step": 41380 }, { "epoch": 3.91, "grad_norm": 60.625, "learning_rate": 2.1724345844779263e-07, "loss": 1.4114, "step": 41400 }, { "epoch": 3.92, "grad_norm": 66.0, "learning_rate": 2.168653149169871e-07, "loss": 1.4062, "step": 41420 }, { "epoch": 3.92, "grad_norm": 61.8125, "learning_rate": 2.1648717138618152e-07, "loss": 1.4739, "step": 41440 }, { "epoch": 3.92, "grad_norm": 79.8125, "learning_rate": 2.1610902785537597e-07, "loss": 1.4277, "step": 41460 }, { "epoch": 3.92, "grad_norm": 55.84375, "learning_rate": 2.157308843245704e-07, "loss": 1.4047, "step": 41480 }, { "epoch": 3.92, "grad_norm": 84.0, "learning_rate": 2.1535274079376486e-07, "loss": 1.3995, "step": 41500 }, { "epoch": 3.93, "grad_norm": 66.125, "learning_rate": 2.149745972629593e-07, "loss": 1.497, "step": 41520 }, { "epoch": 3.93, "grad_norm": 75.0625, "learning_rate": 2.1459645373215373e-07, "loss": 1.4269, "step": 41540 }, { "epoch": 3.93, "grad_norm": 78.0625, "learning_rate": 2.1421831020134818e-07, "loss": 1.4038, "step": 41560 }, { "epoch": 3.93, "grad_norm": 64.3125, "learning_rate": 2.138401666705426e-07, "loss": 1.3915, "step": 41580 }, { "epoch": 3.93, "grad_norm": 53.21875, "learning_rate": 2.1346202313973708e-07, "loss": 1.385, "step": 41600 }, { "epoch": 3.93, "grad_norm": 64.75, "learning_rate": 2.130838796089315e-07, "loss": 1.4048, "step": 41620 }, { "epoch": 3.94, "grad_norm": 69.9375, "learning_rate": 2.1270573607812595e-07, "loss": 1.383, "step": 41640 }, { "epoch": 3.94, "grad_norm": 55.25, "learning_rate": 2.123275925473204e-07, "loss": 1.4297, "step": 41660 }, { "epoch": 3.94, "grad_norm": 62.84375, "learning_rate": 2.1194944901651484e-07, "loss": 1.4653, "step": 41680 }, { "epoch": 3.94, "grad_norm": 63.0625, "learning_rate": 2.115713054857093e-07, "loss": 1.3194, "step": 41700 }, { "epoch": 3.94, "grad_norm": 93.0, "learning_rate": 2.111931619549037e-07, "loss": 1.3567, "step": 41720 }, { "epoch": 3.95, "grad_norm": 73.375, "learning_rate": 2.1081501842409816e-07, "loss": 1.3192, "step": 41740 }, { "epoch": 3.95, "grad_norm": 60.53125, "learning_rate": 2.1043687489329263e-07, "loss": 1.3712, "step": 41760 }, { "epoch": 3.95, "grad_norm": 55.0, "learning_rate": 2.1005873136248705e-07, "loss": 1.4137, "step": 41780 }, { "epoch": 3.95, "grad_norm": 62.53125, "learning_rate": 2.096805878316815e-07, "loss": 1.4085, "step": 41800 }, { "epoch": 3.95, "grad_norm": 61.84375, "learning_rate": 2.0930244430087592e-07, "loss": 1.3769, "step": 41820 }, { "epoch": 3.96, "grad_norm": 71.75, "learning_rate": 2.0892430077007037e-07, "loss": 1.3457, "step": 41840 }, { "epoch": 3.96, "grad_norm": 70.9375, "learning_rate": 2.0854615723926482e-07, "loss": 1.3542, "step": 41860 }, { "epoch": 3.96, "grad_norm": 78.5, "learning_rate": 2.0816801370845927e-07, "loss": 1.438, "step": 41880 }, { "epoch": 3.96, "grad_norm": 78.1875, "learning_rate": 2.0778987017765371e-07, "loss": 1.4064, "step": 41900 }, { "epoch": 3.96, "grad_norm": 73.8125, "learning_rate": 2.0741172664684814e-07, "loss": 1.4502, "step": 41920 }, { "epoch": 3.96, "grad_norm": 48.5, "learning_rate": 2.070335831160426e-07, "loss": 1.3534, "step": 41940 }, { "epoch": 3.97, "grad_norm": 99.125, "learning_rate": 2.0665543958523703e-07, "loss": 1.3896, "step": 41960 }, { "epoch": 3.97, "grad_norm": 66.1875, "learning_rate": 2.0627729605443148e-07, "loss": 1.3242, "step": 41980 }, { "epoch": 3.97, "grad_norm": 83.0, "learning_rate": 2.058991525236259e-07, "loss": 1.49, "step": 42000 }, { "epoch": 3.97, "grad_norm": 74.3125, "learning_rate": 2.0552100899282037e-07, "loss": 1.451, "step": 42020 }, { "epoch": 3.97, "grad_norm": 65.125, "learning_rate": 2.0514286546201482e-07, "loss": 1.307, "step": 42040 }, { "epoch": 3.98, "grad_norm": 67.75, "learning_rate": 2.0476472193120924e-07, "loss": 1.5365, "step": 42060 }, { "epoch": 3.98, "grad_norm": 56.3125, "learning_rate": 2.043865784004037e-07, "loss": 1.3086, "step": 42080 }, { "epoch": 3.98, "grad_norm": 72.8125, "learning_rate": 2.040084348695981e-07, "loss": 1.3316, "step": 42100 }, { "epoch": 3.98, "grad_norm": 68.75, "learning_rate": 2.0363029133879259e-07, "loss": 1.4897, "step": 42120 }, { "epoch": 3.98, "grad_norm": 63.0625, "learning_rate": 2.0325214780798703e-07, "loss": 1.3649, "step": 42140 }, { "epoch": 3.99, "grad_norm": 63.75, "learning_rate": 2.0287400427718146e-07, "loss": 1.4543, "step": 42160 }, { "epoch": 3.99, "grad_norm": 59.21875, "learning_rate": 2.024958607463759e-07, "loss": 1.4654, "step": 42180 }, { "epoch": 3.99, "grad_norm": 63.5, "learning_rate": 2.0211771721557035e-07, "loss": 1.4143, "step": 42200 }, { "epoch": 3.99, "grad_norm": 74.875, "learning_rate": 2.017395736847648e-07, "loss": 1.3129, "step": 42220 }, { "epoch": 3.99, "grad_norm": 74.375, "learning_rate": 2.0136143015395922e-07, "loss": 1.4688, "step": 42240 }, { "epoch": 4.0, "grad_norm": 78.1875, "learning_rate": 2.0098328662315367e-07, "loss": 1.4214, "step": 42260 }, { "epoch": 4.0, "grad_norm": 76.6875, "learning_rate": 2.0060514309234814e-07, "loss": 1.3917, "step": 42280 }, { "epoch": 4.0, "grad_norm": 59.3125, "learning_rate": 2.0022699956154256e-07, "loss": 1.3565, "step": 42300 }, { "epoch": 4.0, "grad_norm": 71.4375, "learning_rate": 1.99848856030737e-07, "loss": 1.3224, "step": 42320 }, { "epoch": 4.0, "grad_norm": 79.0, "learning_rate": 1.9947071249993143e-07, "loss": 1.3764, "step": 42340 }, { "epoch": 4.0, "grad_norm": 67.6875, "learning_rate": 1.9909256896912588e-07, "loss": 1.2843, "step": 42360 }, { "epoch": 4.01, "grad_norm": 63.21875, "learning_rate": 1.9871442543832035e-07, "loss": 1.3439, "step": 42380 }, { "epoch": 4.01, "grad_norm": 52.71875, "learning_rate": 1.9833628190751478e-07, "loss": 1.2762, "step": 42400 }, { "epoch": 4.01, "grad_norm": 50.59375, "learning_rate": 1.9795813837670922e-07, "loss": 1.3074, "step": 42420 }, { "epoch": 4.01, "grad_norm": 65.4375, "learning_rate": 1.9757999484590365e-07, "loss": 1.451, "step": 42440 }, { "epoch": 4.01, "grad_norm": 79.9375, "learning_rate": 1.9720185131509812e-07, "loss": 1.337, "step": 42460 }, { "epoch": 4.02, "grad_norm": 80.875, "learning_rate": 1.9682370778429254e-07, "loss": 1.316, "step": 42480 }, { "epoch": 4.02, "grad_norm": 73.875, "learning_rate": 1.96445564253487e-07, "loss": 1.252, "step": 42500 }, { "epoch": 4.02, "grad_norm": 59.90625, "learning_rate": 1.9606742072268144e-07, "loss": 1.3394, "step": 42520 }, { "epoch": 4.02, "grad_norm": 61.0625, "learning_rate": 1.9568927719187586e-07, "loss": 1.4067, "step": 42540 }, { "epoch": 4.02, "grad_norm": 73.3125, "learning_rate": 1.9531113366107033e-07, "loss": 1.3229, "step": 42560 }, { "epoch": 4.03, "grad_norm": 99.625, "learning_rate": 1.9493299013026475e-07, "loss": 1.3768, "step": 42580 }, { "epoch": 4.03, "grad_norm": 57.78125, "learning_rate": 1.945548465994592e-07, "loss": 1.3839, "step": 42600 }, { "epoch": 4.03, "grad_norm": 61.0, "learning_rate": 1.9417670306865365e-07, "loss": 1.3187, "step": 42620 }, { "epoch": 4.03, "grad_norm": 56.5625, "learning_rate": 1.937985595378481e-07, "loss": 1.3785, "step": 42640 }, { "epoch": 4.03, "grad_norm": 52.09375, "learning_rate": 1.9342041600704254e-07, "loss": 1.2809, "step": 42660 }, { "epoch": 4.03, "grad_norm": 60.65625, "learning_rate": 1.9304227247623697e-07, "loss": 1.3787, "step": 42680 }, { "epoch": 4.04, "grad_norm": 58.15625, "learning_rate": 1.926641289454314e-07, "loss": 1.2968, "step": 42700 }, { "epoch": 4.04, "grad_norm": 76.875, "learning_rate": 1.9228598541462586e-07, "loss": 1.3426, "step": 42720 }, { "epoch": 4.04, "grad_norm": 64.125, "learning_rate": 1.919078418838203e-07, "loss": 1.389, "step": 42740 }, { "epoch": 4.04, "grad_norm": 74.875, "learning_rate": 1.9152969835301476e-07, "loss": 1.3012, "step": 42760 }, { "epoch": 4.04, "grad_norm": 68.3125, "learning_rate": 1.9115155482220918e-07, "loss": 1.3064, "step": 42780 }, { "epoch": 4.05, "grad_norm": 67.375, "learning_rate": 1.9077341129140363e-07, "loss": 1.2949, "step": 42800 }, { "epoch": 4.05, "grad_norm": 63.84375, "learning_rate": 1.9039526776059807e-07, "loss": 1.3852, "step": 42820 }, { "epoch": 4.05, "grad_norm": 60.4375, "learning_rate": 1.9001712422979252e-07, "loss": 1.3849, "step": 42840 }, { "epoch": 4.05, "grad_norm": 56.71875, "learning_rate": 1.8963898069898694e-07, "loss": 1.2859, "step": 42860 }, { "epoch": 4.05, "grad_norm": 60.90625, "learning_rate": 1.892608371681814e-07, "loss": 1.3057, "step": 42880 }, { "epoch": 4.06, "grad_norm": 78.1875, "learning_rate": 1.8888269363737586e-07, "loss": 1.3186, "step": 42900 }, { "epoch": 4.06, "grad_norm": 56.84375, "learning_rate": 1.8850455010657029e-07, "loss": 1.36, "step": 42920 }, { "epoch": 4.06, "grad_norm": 63.03125, "learning_rate": 1.8812640657576473e-07, "loss": 1.2948, "step": 42940 }, { "epoch": 4.06, "grad_norm": 57.90625, "learning_rate": 1.8774826304495915e-07, "loss": 1.434, "step": 42960 }, { "epoch": 4.06, "grad_norm": 58.1875, "learning_rate": 1.8737011951415363e-07, "loss": 1.3389, "step": 42980 }, { "epoch": 4.07, "grad_norm": 82.6875, "learning_rate": 1.8699197598334808e-07, "loss": 1.3851, "step": 43000 }, { "epoch": 4.07, "grad_norm": 59.65625, "learning_rate": 1.866138324525425e-07, "loss": 1.3092, "step": 43020 }, { "epoch": 4.07, "grad_norm": 83.875, "learning_rate": 1.8623568892173695e-07, "loss": 1.2534, "step": 43040 }, { "epoch": 4.07, "grad_norm": 56.9375, "learning_rate": 1.8585754539093137e-07, "loss": 1.3689, "step": 43060 }, { "epoch": 4.07, "grad_norm": 65.9375, "learning_rate": 1.8547940186012584e-07, "loss": 1.3127, "step": 43080 }, { "epoch": 4.07, "grad_norm": 72.9375, "learning_rate": 1.8510125832932026e-07, "loss": 1.4009, "step": 43100 }, { "epoch": 4.08, "grad_norm": 57.40625, "learning_rate": 1.847231147985147e-07, "loss": 1.3522, "step": 43120 }, { "epoch": 4.08, "grad_norm": 60.40625, "learning_rate": 1.8434497126770916e-07, "loss": 1.3938, "step": 43140 }, { "epoch": 4.08, "grad_norm": 62.71875, "learning_rate": 1.839668277369036e-07, "loss": 1.3835, "step": 43160 }, { "epoch": 4.08, "grad_norm": 53.5625, "learning_rate": 1.8358868420609805e-07, "loss": 1.3114, "step": 43180 }, { "epoch": 4.08, "grad_norm": 71.3125, "learning_rate": 1.8321054067529248e-07, "loss": 1.3608, "step": 43200 }, { "epoch": 4.09, "grad_norm": 109.1875, "learning_rate": 1.8283239714448692e-07, "loss": 1.3388, "step": 43220 }, { "epoch": 4.09, "grad_norm": 64.8125, "learning_rate": 1.8245425361368137e-07, "loss": 1.2882, "step": 43240 }, { "epoch": 4.09, "grad_norm": 61.8125, "learning_rate": 1.8207611008287582e-07, "loss": 1.3305, "step": 43260 }, { "epoch": 4.09, "grad_norm": 63.46875, "learning_rate": 1.8169796655207027e-07, "loss": 1.3216, "step": 43280 }, { "epoch": 4.09, "grad_norm": 54.28125, "learning_rate": 1.813198230212647e-07, "loss": 1.3642, "step": 43300 }, { "epoch": 4.1, "grad_norm": 56.78125, "learning_rate": 1.8094167949045914e-07, "loss": 1.2574, "step": 43320 }, { "epoch": 4.1, "grad_norm": 56.28125, "learning_rate": 1.8056353595965358e-07, "loss": 1.2901, "step": 43340 }, { "epoch": 4.1, "grad_norm": 59.375, "learning_rate": 1.8018539242884803e-07, "loss": 1.3838, "step": 43360 }, { "epoch": 4.1, "grad_norm": 57.25, "learning_rate": 1.7980724889804248e-07, "loss": 1.307, "step": 43380 }, { "epoch": 4.1, "grad_norm": 67.4375, "learning_rate": 1.794291053672369e-07, "loss": 1.2821, "step": 43400 }, { "epoch": 4.1, "grad_norm": 89.375, "learning_rate": 1.7905096183643137e-07, "loss": 1.337, "step": 43420 }, { "epoch": 4.11, "grad_norm": 104.4375, "learning_rate": 1.786728183056258e-07, "loss": 1.3838, "step": 43440 }, { "epoch": 4.11, "grad_norm": 63.78125, "learning_rate": 1.7829467477482024e-07, "loss": 1.3542, "step": 43460 }, { "epoch": 4.11, "grad_norm": 64.1875, "learning_rate": 1.779165312440147e-07, "loss": 1.3268, "step": 43480 }, { "epoch": 4.11, "grad_norm": 60.1875, "learning_rate": 1.775383877132091e-07, "loss": 1.2482, "step": 43500 }, { "epoch": 4.11, "grad_norm": 72.0, "learning_rate": 1.7716024418240359e-07, "loss": 1.3675, "step": 43520 }, { "epoch": 4.12, "grad_norm": 51.375, "learning_rate": 1.76782100651598e-07, "loss": 1.3264, "step": 43540 }, { "epoch": 4.12, "grad_norm": 79.0625, "learning_rate": 1.7640395712079246e-07, "loss": 1.2761, "step": 43560 }, { "epoch": 4.12, "grad_norm": 59.5625, "learning_rate": 1.7602581358998688e-07, "loss": 1.3908, "step": 43580 }, { "epoch": 4.12, "grad_norm": 67.3125, "learning_rate": 1.7564767005918135e-07, "loss": 1.3082, "step": 43600 }, { "epoch": 4.12, "grad_norm": 79.625, "learning_rate": 1.752695265283758e-07, "loss": 1.3252, "step": 43620 }, { "epoch": 4.13, "grad_norm": 82.0, "learning_rate": 1.7489138299757022e-07, "loss": 1.3506, "step": 43640 }, { "epoch": 4.13, "grad_norm": 75.6875, "learning_rate": 1.7451323946676467e-07, "loss": 1.2852, "step": 43660 }, { "epoch": 4.13, "grad_norm": 62.15625, "learning_rate": 1.7413509593595912e-07, "loss": 1.2953, "step": 43680 }, { "epoch": 4.13, "grad_norm": 70.0625, "learning_rate": 1.7375695240515356e-07, "loss": 1.3503, "step": 43700 }, { "epoch": 4.13, "grad_norm": 69.625, "learning_rate": 1.73378808874348e-07, "loss": 1.3355, "step": 43720 }, { "epoch": 4.13, "grad_norm": 82.3125, "learning_rate": 1.7300066534354243e-07, "loss": 1.3057, "step": 43740 }, { "epoch": 4.14, "grad_norm": 71.3125, "learning_rate": 1.7262252181273688e-07, "loss": 1.2385, "step": 43760 }, { "epoch": 4.14, "grad_norm": 64.8125, "learning_rate": 1.7224437828193133e-07, "loss": 1.3327, "step": 43780 }, { "epoch": 4.14, "grad_norm": 64.9375, "learning_rate": 1.7186623475112578e-07, "loss": 1.2779, "step": 43800 }, { "epoch": 4.14, "grad_norm": 75.875, "learning_rate": 1.714880912203202e-07, "loss": 1.3603, "step": 43820 }, { "epoch": 4.14, "grad_norm": 47.15625, "learning_rate": 1.7110994768951465e-07, "loss": 1.2305, "step": 43840 }, { "epoch": 4.15, "grad_norm": 69.1875, "learning_rate": 1.7073180415870912e-07, "loss": 1.2893, "step": 43860 }, { "epoch": 4.15, "grad_norm": 61.5, "learning_rate": 1.7035366062790354e-07, "loss": 1.3423, "step": 43880 }, { "epoch": 4.15, "grad_norm": 58.71875, "learning_rate": 1.69975517097098e-07, "loss": 1.2352, "step": 43900 }, { "epoch": 4.15, "grad_norm": 51.1875, "learning_rate": 1.695973735662924e-07, "loss": 1.314, "step": 43920 }, { "epoch": 4.15, "grad_norm": 60.59375, "learning_rate": 1.6921923003548688e-07, "loss": 1.2465, "step": 43940 }, { "epoch": 4.16, "grad_norm": 87.125, "learning_rate": 1.688410865046813e-07, "loss": 1.3573, "step": 43960 }, { "epoch": 4.16, "grad_norm": 78.6875, "learning_rate": 1.6846294297387575e-07, "loss": 1.3748, "step": 43980 }, { "epoch": 4.16, "grad_norm": 64.75, "learning_rate": 1.680847994430702e-07, "loss": 1.4336, "step": 44000 }, { "epoch": 4.16, "grad_norm": 69.0625, "learning_rate": 1.6770665591226462e-07, "loss": 1.3579, "step": 44020 }, { "epoch": 4.16, "grad_norm": 65.125, "learning_rate": 1.673285123814591e-07, "loss": 1.3783, "step": 44040 }, { "epoch": 4.17, "grad_norm": 57.1875, "learning_rate": 1.6695036885065352e-07, "loss": 1.277, "step": 44060 }, { "epoch": 4.17, "grad_norm": 63.78125, "learning_rate": 1.6657222531984797e-07, "loss": 1.3084, "step": 44080 }, { "epoch": 4.17, "grad_norm": 62.75, "learning_rate": 1.6619408178904241e-07, "loss": 1.4371, "step": 44100 }, { "epoch": 4.17, "grad_norm": 60.6875, "learning_rate": 1.6581593825823686e-07, "loss": 1.3106, "step": 44120 }, { "epoch": 4.17, "grad_norm": 55.90625, "learning_rate": 1.654377947274313e-07, "loss": 1.3069, "step": 44140 }, { "epoch": 4.17, "grad_norm": 73.625, "learning_rate": 1.6505965119662573e-07, "loss": 1.3339, "step": 44160 }, { "epoch": 4.18, "grad_norm": 56.03125, "learning_rate": 1.6468150766582018e-07, "loss": 1.1934, "step": 44180 }, { "epoch": 4.18, "grad_norm": 59.59375, "learning_rate": 1.643033641350146e-07, "loss": 1.2749, "step": 44200 }, { "epoch": 4.18, "grad_norm": 52.15625, "learning_rate": 1.6392522060420907e-07, "loss": 1.2481, "step": 44220 }, { "epoch": 4.18, "grad_norm": 74.25, "learning_rate": 1.6354707707340352e-07, "loss": 1.3556, "step": 44240 }, { "epoch": 4.18, "grad_norm": 54.09375, "learning_rate": 1.6316893354259794e-07, "loss": 1.3386, "step": 44260 }, { "epoch": 4.19, "grad_norm": 60.78125, "learning_rate": 1.627907900117924e-07, "loss": 1.4361, "step": 44280 }, { "epoch": 4.19, "grad_norm": 67.5, "learning_rate": 1.6241264648098684e-07, "loss": 1.3713, "step": 44300 }, { "epoch": 4.19, "grad_norm": 65.875, "learning_rate": 1.6203450295018129e-07, "loss": 1.3499, "step": 44320 }, { "epoch": 4.19, "grad_norm": 54.84375, "learning_rate": 1.6165635941937573e-07, "loss": 1.3358, "step": 44340 }, { "epoch": 4.19, "grad_norm": 54.78125, "learning_rate": 1.6127821588857015e-07, "loss": 1.32, "step": 44360 }, { "epoch": 4.2, "grad_norm": 72.0, "learning_rate": 1.6090007235776463e-07, "loss": 1.3005, "step": 44380 }, { "epoch": 4.2, "grad_norm": 58.6875, "learning_rate": 1.6052192882695905e-07, "loss": 1.2939, "step": 44400 }, { "epoch": 4.2, "grad_norm": 64.875, "learning_rate": 1.601437852961535e-07, "loss": 1.4002, "step": 44420 }, { "epoch": 4.2, "grad_norm": 61.25, "learning_rate": 1.5976564176534792e-07, "loss": 1.2259, "step": 44440 }, { "epoch": 4.2, "grad_norm": 71.1875, "learning_rate": 1.5938749823454237e-07, "loss": 1.432, "step": 44460 }, { "epoch": 4.2, "grad_norm": 73.25, "learning_rate": 1.5900935470373684e-07, "loss": 1.316, "step": 44480 }, { "epoch": 4.21, "grad_norm": 93.3125, "learning_rate": 1.5863121117293126e-07, "loss": 1.3683, "step": 44500 }, { "epoch": 4.21, "grad_norm": 65.9375, "learning_rate": 1.582530676421257e-07, "loss": 1.2994, "step": 44520 }, { "epoch": 4.21, "grad_norm": 76.5, "learning_rate": 1.5787492411132013e-07, "loss": 1.3194, "step": 44540 }, { "epoch": 4.21, "grad_norm": 63.0625, "learning_rate": 1.574967805805146e-07, "loss": 1.2961, "step": 44560 }, { "epoch": 4.21, "grad_norm": 60.8125, "learning_rate": 1.5711863704970905e-07, "loss": 1.4499, "step": 44580 }, { "epoch": 4.22, "grad_norm": 76.3125, "learning_rate": 1.5674049351890348e-07, "loss": 1.3242, "step": 44600 }, { "epoch": 4.22, "grad_norm": 70.25, "learning_rate": 1.5636234998809792e-07, "loss": 1.3083, "step": 44620 }, { "epoch": 4.22, "grad_norm": 68.1875, "learning_rate": 1.5598420645729237e-07, "loss": 1.3901, "step": 44640 }, { "epoch": 4.22, "grad_norm": 53.3125, "learning_rate": 1.5560606292648682e-07, "loss": 1.3179, "step": 44660 }, { "epoch": 4.22, "grad_norm": 92.125, "learning_rate": 1.5522791939568124e-07, "loss": 1.2245, "step": 44680 }, { "epoch": 4.23, "grad_norm": 77.0625, "learning_rate": 1.548497758648757e-07, "loss": 1.3041, "step": 44700 }, { "epoch": 4.23, "grad_norm": 56.09375, "learning_rate": 1.5447163233407014e-07, "loss": 1.3729, "step": 44720 }, { "epoch": 4.23, "grad_norm": 68.625, "learning_rate": 1.5409348880326458e-07, "loss": 1.314, "step": 44740 }, { "epoch": 4.23, "grad_norm": 65.625, "learning_rate": 1.5371534527245903e-07, "loss": 1.3265, "step": 44760 }, { "epoch": 4.23, "grad_norm": 60.09375, "learning_rate": 1.5333720174165345e-07, "loss": 1.257, "step": 44780 }, { "epoch": 4.24, "grad_norm": 57.375, "learning_rate": 1.529590582108479e-07, "loss": 1.3232, "step": 44800 }, { "epoch": 4.24, "grad_norm": 79.1875, "learning_rate": 1.5258091468004235e-07, "loss": 1.3386, "step": 44820 }, { "epoch": 4.24, "grad_norm": 80.5, "learning_rate": 1.522027711492368e-07, "loss": 1.3517, "step": 44840 }, { "epoch": 4.24, "grad_norm": 59.4375, "learning_rate": 1.5182462761843124e-07, "loss": 1.2794, "step": 44860 }, { "epoch": 4.24, "grad_norm": 69.375, "learning_rate": 1.5144648408762566e-07, "loss": 1.2523, "step": 44880 }, { "epoch": 4.24, "grad_norm": 98.875, "learning_rate": 1.5106834055682014e-07, "loss": 1.3374, "step": 44900 }, { "epoch": 4.25, "grad_norm": 77.8125, "learning_rate": 1.5069019702601456e-07, "loss": 1.3555, "step": 44920 }, { "epoch": 4.25, "grad_norm": 59.78125, "learning_rate": 1.50312053495209e-07, "loss": 1.3583, "step": 44940 }, { "epoch": 4.25, "grad_norm": 56.75, "learning_rate": 1.4993390996440346e-07, "loss": 1.3552, "step": 44960 }, { "epoch": 4.25, "grad_norm": 51.96875, "learning_rate": 1.4955576643359788e-07, "loss": 1.3089, "step": 44980 }, { "epoch": 4.25, "grad_norm": 56.9375, "learning_rate": 1.4917762290279235e-07, "loss": 1.3219, "step": 45000 }, { "epoch": 4.26, "grad_norm": 71.5, "learning_rate": 1.4879947937198677e-07, "loss": 1.3258, "step": 45020 }, { "epoch": 4.26, "grad_norm": 84.1875, "learning_rate": 1.4842133584118122e-07, "loss": 1.3392, "step": 45040 }, { "epoch": 4.26, "grad_norm": 66.625, "learning_rate": 1.4804319231037564e-07, "loss": 1.2842, "step": 45060 }, { "epoch": 4.26, "grad_norm": 72.8125, "learning_rate": 1.4766504877957012e-07, "loss": 1.32, "step": 45080 }, { "epoch": 4.26, "grad_norm": 59.25, "learning_rate": 1.4728690524876456e-07, "loss": 1.334, "step": 45100 }, { "epoch": 4.27, "grad_norm": 32.84375, "learning_rate": 1.4690876171795899e-07, "loss": 1.2431, "step": 45120 }, { "epoch": 4.27, "grad_norm": 47.5, "learning_rate": 1.4653061818715343e-07, "loss": 1.3683, "step": 45140 }, { "epoch": 4.27, "grad_norm": 60.78125, "learning_rate": 1.4615247465634785e-07, "loss": 1.4307, "step": 45160 }, { "epoch": 4.27, "grad_norm": 74.0, "learning_rate": 1.4577433112554233e-07, "loss": 1.3282, "step": 45180 }, { "epoch": 4.27, "grad_norm": 59.28125, "learning_rate": 1.4539618759473678e-07, "loss": 1.3151, "step": 45200 }, { "epoch": 4.27, "grad_norm": 60.0, "learning_rate": 1.450180440639312e-07, "loss": 1.2706, "step": 45220 }, { "epoch": 4.28, "grad_norm": 62.0625, "learning_rate": 1.4463990053312565e-07, "loss": 1.2947, "step": 45240 }, { "epoch": 4.28, "grad_norm": 62.03125, "learning_rate": 1.442617570023201e-07, "loss": 1.3164, "step": 45260 }, { "epoch": 4.28, "grad_norm": 66.5, "learning_rate": 1.4388361347151454e-07, "loss": 1.2706, "step": 45280 }, { "epoch": 4.28, "grad_norm": 61.75, "learning_rate": 1.4350546994070896e-07, "loss": 1.3473, "step": 45300 }, { "epoch": 4.28, "grad_norm": 51.75, "learning_rate": 1.431273264099034e-07, "loss": 1.2948, "step": 45320 }, { "epoch": 4.29, "grad_norm": 59.3125, "learning_rate": 1.4274918287909788e-07, "loss": 1.262, "step": 45340 }, { "epoch": 4.29, "grad_norm": 101.625, "learning_rate": 1.423710393482923e-07, "loss": 1.3206, "step": 45360 }, { "epoch": 4.29, "grad_norm": 61.96875, "learning_rate": 1.4199289581748675e-07, "loss": 1.3655, "step": 45380 }, { "epoch": 4.29, "grad_norm": 56.375, "learning_rate": 1.4161475228668117e-07, "loss": 1.3355, "step": 45400 }, { "epoch": 4.29, "grad_norm": 61.6875, "learning_rate": 1.4123660875587562e-07, "loss": 1.3372, "step": 45420 }, { "epoch": 4.3, "grad_norm": 55.9375, "learning_rate": 1.408584652250701e-07, "loss": 1.2686, "step": 45440 }, { "epoch": 4.3, "grad_norm": 64.75, "learning_rate": 1.4048032169426452e-07, "loss": 1.2195, "step": 45460 }, { "epoch": 4.3, "grad_norm": 57.5625, "learning_rate": 1.4010217816345897e-07, "loss": 1.3275, "step": 45480 }, { "epoch": 4.3, "grad_norm": 62.40625, "learning_rate": 1.397240346326534e-07, "loss": 1.2764, "step": 45500 }, { "epoch": 4.3, "grad_norm": 54.5, "learning_rate": 1.3934589110184786e-07, "loss": 1.2676, "step": 45520 }, { "epoch": 4.31, "grad_norm": 69.625, "learning_rate": 1.3896774757104228e-07, "loss": 1.4004, "step": 45540 }, { "epoch": 4.31, "grad_norm": 61.78125, "learning_rate": 1.3858960404023673e-07, "loss": 1.3371, "step": 45560 }, { "epoch": 4.31, "grad_norm": 67.125, "learning_rate": 1.3821146050943118e-07, "loss": 1.4369, "step": 45580 }, { "epoch": 4.31, "grad_norm": 49.09375, "learning_rate": 1.3783331697862563e-07, "loss": 1.229, "step": 45600 }, { "epoch": 4.31, "grad_norm": 96.4375, "learning_rate": 1.3745517344782007e-07, "loss": 1.3351, "step": 45620 }, { "epoch": 4.31, "grad_norm": 69.5625, "learning_rate": 1.370770299170145e-07, "loss": 1.3794, "step": 45640 }, { "epoch": 4.32, "grad_norm": 50.8125, "learning_rate": 1.3669888638620894e-07, "loss": 1.3705, "step": 45660 }, { "epoch": 4.32, "grad_norm": 62.1875, "learning_rate": 1.3632074285540336e-07, "loss": 1.3257, "step": 45680 }, { "epoch": 4.32, "grad_norm": 54.1875, "learning_rate": 1.3594259932459784e-07, "loss": 1.2859, "step": 45700 }, { "epoch": 4.32, "grad_norm": 56.84375, "learning_rate": 1.3556445579379229e-07, "loss": 1.3751, "step": 45720 }, { "epoch": 4.32, "grad_norm": 51.90625, "learning_rate": 1.351863122629867e-07, "loss": 1.3439, "step": 45740 }, { "epoch": 4.33, "grad_norm": 58.6875, "learning_rate": 1.3480816873218116e-07, "loss": 1.3596, "step": 45760 }, { "epoch": 4.33, "grad_norm": 62.71875, "learning_rate": 1.344300252013756e-07, "loss": 1.3372, "step": 45780 }, { "epoch": 4.33, "grad_norm": 62.5625, "learning_rate": 1.3405188167057005e-07, "loss": 1.2702, "step": 45800 }, { "epoch": 4.33, "grad_norm": 85.125, "learning_rate": 1.336737381397645e-07, "loss": 1.3388, "step": 45820 }, { "epoch": 4.33, "grad_norm": 86.875, "learning_rate": 1.3329559460895892e-07, "loss": 1.2513, "step": 45840 }, { "epoch": 4.34, "grad_norm": 70.8125, "learning_rate": 1.3291745107815337e-07, "loss": 1.3841, "step": 45860 }, { "epoch": 4.34, "grad_norm": 64.8125, "learning_rate": 1.3253930754734782e-07, "loss": 1.2639, "step": 45880 }, { "epoch": 4.34, "grad_norm": 64.5, "learning_rate": 1.3216116401654226e-07, "loss": 1.4525, "step": 45900 }, { "epoch": 4.34, "grad_norm": 76.0, "learning_rate": 1.3178302048573668e-07, "loss": 1.3358, "step": 45920 }, { "epoch": 4.34, "grad_norm": 88.0625, "learning_rate": 1.3140487695493113e-07, "loss": 1.3377, "step": 45940 }, { "epoch": 4.34, "grad_norm": 56.9375, "learning_rate": 1.310267334241256e-07, "loss": 1.4271, "step": 45960 }, { "epoch": 4.35, "grad_norm": 67.4375, "learning_rate": 1.3064858989332003e-07, "loss": 1.3373, "step": 45980 }, { "epoch": 4.35, "grad_norm": 69.8125, "learning_rate": 1.3027044636251448e-07, "loss": 1.3895, "step": 46000 }, { "epoch": 4.35, "grad_norm": 66.125, "learning_rate": 1.298923028317089e-07, "loss": 1.354, "step": 46020 }, { "epoch": 4.35, "grad_norm": 49.0625, "learning_rate": 1.2951415930090337e-07, "loss": 1.3607, "step": 46040 }, { "epoch": 4.35, "grad_norm": 66.8125, "learning_rate": 1.2913601577009782e-07, "loss": 1.2385, "step": 46060 }, { "epoch": 4.36, "grad_norm": 79.0, "learning_rate": 1.2875787223929224e-07, "loss": 1.2652, "step": 46080 }, { "epoch": 4.36, "grad_norm": 58.09375, "learning_rate": 1.283797287084867e-07, "loss": 1.2984, "step": 46100 }, { "epoch": 4.36, "grad_norm": 64.75, "learning_rate": 1.280015851776811e-07, "loss": 1.2636, "step": 46120 }, { "epoch": 4.36, "grad_norm": 52.625, "learning_rate": 1.2762344164687558e-07, "loss": 1.4068, "step": 46140 }, { "epoch": 4.36, "grad_norm": 53.9375, "learning_rate": 1.2724529811607e-07, "loss": 1.3923, "step": 46160 }, { "epoch": 4.37, "grad_norm": 55.40625, "learning_rate": 1.2686715458526445e-07, "loss": 1.3176, "step": 46180 }, { "epoch": 4.37, "grad_norm": 70.4375, "learning_rate": 1.264890110544589e-07, "loss": 1.34, "step": 46200 }, { "epoch": 4.37, "grad_norm": 75.875, "learning_rate": 1.2611086752365335e-07, "loss": 1.3322, "step": 46220 }, { "epoch": 4.37, "grad_norm": 66.75, "learning_rate": 1.257327239928478e-07, "loss": 1.3074, "step": 46240 }, { "epoch": 4.37, "grad_norm": 77.4375, "learning_rate": 1.2535458046204222e-07, "loss": 1.2949, "step": 46260 }, { "epoch": 4.38, "grad_norm": 59.625, "learning_rate": 1.2497643693123666e-07, "loss": 1.2506, "step": 46280 }, { "epoch": 4.38, "grad_norm": 57.75, "learning_rate": 1.245982934004311e-07, "loss": 1.3649, "step": 46300 }, { "epoch": 4.38, "grad_norm": 74.375, "learning_rate": 1.2422014986962556e-07, "loss": 1.3469, "step": 46320 }, { "epoch": 4.38, "grad_norm": 65.5625, "learning_rate": 1.2384200633882e-07, "loss": 1.4123, "step": 46340 }, { "epoch": 4.38, "grad_norm": 76.875, "learning_rate": 1.2346386280801443e-07, "loss": 1.3209, "step": 46360 }, { "epoch": 4.38, "grad_norm": 64.75, "learning_rate": 1.2308571927720888e-07, "loss": 1.3176, "step": 46380 }, { "epoch": 4.39, "grad_norm": 74.8125, "learning_rate": 1.2270757574640333e-07, "loss": 1.3496, "step": 46400 }, { "epoch": 4.39, "grad_norm": 219.875, "learning_rate": 1.2232943221559777e-07, "loss": 1.3346, "step": 46420 }, { "epoch": 4.39, "grad_norm": 53.90625, "learning_rate": 1.2195128868479222e-07, "loss": 1.3611, "step": 46440 }, { "epoch": 4.39, "grad_norm": 66.375, "learning_rate": 1.2157314515398667e-07, "loss": 1.3481, "step": 46460 }, { "epoch": 4.39, "grad_norm": 82.375, "learning_rate": 1.211950016231811e-07, "loss": 1.4615, "step": 46480 }, { "epoch": 4.4, "grad_norm": 66.9375, "learning_rate": 1.2081685809237554e-07, "loss": 1.2017, "step": 46500 }, { "epoch": 4.4, "grad_norm": 53.75, "learning_rate": 1.2043871456156999e-07, "loss": 1.3885, "step": 46520 }, { "epoch": 4.4, "grad_norm": 74.75, "learning_rate": 1.2006057103076443e-07, "loss": 1.3945, "step": 46540 }, { "epoch": 4.4, "grad_norm": 53.8125, "learning_rate": 1.1968242749995888e-07, "loss": 1.4098, "step": 46560 }, { "epoch": 4.4, "grad_norm": 54.5625, "learning_rate": 1.193042839691533e-07, "loss": 1.2309, "step": 46580 }, { "epoch": 4.41, "grad_norm": 76.375, "learning_rate": 1.1892614043834775e-07, "loss": 1.3407, "step": 46600 }, { "epoch": 4.41, "grad_norm": 69.875, "learning_rate": 1.185479969075422e-07, "loss": 1.3216, "step": 46620 }, { "epoch": 4.41, "grad_norm": 89.9375, "learning_rate": 1.1816985337673665e-07, "loss": 1.4638, "step": 46640 }, { "epoch": 4.41, "grad_norm": 68.625, "learning_rate": 1.1779170984593108e-07, "loss": 1.3594, "step": 46660 }, { "epoch": 4.41, "grad_norm": 65.1875, "learning_rate": 1.1741356631512553e-07, "loss": 1.279, "step": 46680 }, { "epoch": 4.41, "grad_norm": 83.6875, "learning_rate": 1.1703542278431996e-07, "loss": 1.2234, "step": 46700 }, { "epoch": 4.42, "grad_norm": 65.4375, "learning_rate": 1.1665727925351441e-07, "loss": 1.2726, "step": 46720 }, { "epoch": 4.42, "grad_norm": 71.375, "learning_rate": 1.1627913572270886e-07, "loss": 1.3769, "step": 46740 }, { "epoch": 4.42, "grad_norm": 77.5625, "learning_rate": 1.159009921919033e-07, "loss": 1.3536, "step": 46760 }, { "epoch": 4.42, "grad_norm": 67.0, "learning_rate": 1.1552284866109774e-07, "loss": 1.2303, "step": 46780 }, { "epoch": 4.42, "grad_norm": 63.15625, "learning_rate": 1.1514470513029219e-07, "loss": 1.3474, "step": 46800 }, { "epoch": 4.43, "grad_norm": 57.59375, "learning_rate": 1.1476656159948662e-07, "loss": 1.3236, "step": 46820 }, { "epoch": 4.43, "grad_norm": 73.0625, "learning_rate": 1.1438841806868106e-07, "loss": 1.2705, "step": 46840 }, { "epoch": 4.43, "grad_norm": 69.5625, "learning_rate": 1.1401027453787552e-07, "loss": 1.3801, "step": 46860 }, { "epoch": 4.43, "grad_norm": 58.5625, "learning_rate": 1.1363213100706995e-07, "loss": 1.3132, "step": 46880 }, { "epoch": 4.43, "grad_norm": 69.75, "learning_rate": 1.132539874762644e-07, "loss": 1.3228, "step": 46900 }, { "epoch": 4.44, "grad_norm": 56.625, "learning_rate": 1.1287584394545883e-07, "loss": 1.3565, "step": 46920 }, { "epoch": 4.44, "grad_norm": 92.1875, "learning_rate": 1.1249770041465328e-07, "loss": 1.4107, "step": 46940 }, { "epoch": 4.44, "grad_norm": 75.375, "learning_rate": 1.1211955688384772e-07, "loss": 1.359, "step": 46960 }, { "epoch": 4.44, "grad_norm": 79.875, "learning_rate": 1.1174141335304218e-07, "loss": 1.3833, "step": 46980 }, { "epoch": 4.44, "grad_norm": 68.0625, "learning_rate": 1.1136326982223661e-07, "loss": 1.3504, "step": 47000 }, { "epoch": 4.45, "grad_norm": 83.9375, "learning_rate": 1.1098512629143106e-07, "loss": 1.3007, "step": 47020 }, { "epoch": 4.45, "grad_norm": 65.6875, "learning_rate": 1.106069827606255e-07, "loss": 1.3499, "step": 47040 }, { "epoch": 4.45, "grad_norm": 63.0, "learning_rate": 1.1022883922981993e-07, "loss": 1.3585, "step": 47060 }, { "epoch": 4.45, "grad_norm": 85.0, "learning_rate": 1.0985069569901438e-07, "loss": 1.3445, "step": 47080 }, { "epoch": 4.45, "grad_norm": 54.78125, "learning_rate": 1.0947255216820881e-07, "loss": 1.2835, "step": 47100 }, { "epoch": 4.45, "grad_norm": 61.59375, "learning_rate": 1.0909440863740327e-07, "loss": 1.2405, "step": 47120 }, { "epoch": 4.46, "grad_norm": 90.25, "learning_rate": 1.0871626510659771e-07, "loss": 1.2945, "step": 47140 }, { "epoch": 4.46, "grad_norm": 56.28125, "learning_rate": 1.0833812157579216e-07, "loss": 1.3271, "step": 47160 }, { "epoch": 4.46, "grad_norm": 56.03125, "learning_rate": 1.0795997804498659e-07, "loss": 1.3141, "step": 47180 }, { "epoch": 4.46, "grad_norm": 50.59375, "learning_rate": 1.0758183451418104e-07, "loss": 1.351, "step": 47200 }, { "epoch": 4.46, "grad_norm": 65.5, "learning_rate": 1.0720369098337547e-07, "loss": 1.3218, "step": 47220 }, { "epoch": 4.47, "grad_norm": 64.6875, "learning_rate": 1.0682554745256993e-07, "loss": 1.3122, "step": 47240 }, { "epoch": 4.47, "grad_norm": 51.875, "learning_rate": 1.0644740392176437e-07, "loss": 1.3517, "step": 47260 }, { "epoch": 4.47, "grad_norm": 78.9375, "learning_rate": 1.0606926039095882e-07, "loss": 1.4208, "step": 47280 }, { "epoch": 4.47, "grad_norm": 51.4375, "learning_rate": 1.0569111686015325e-07, "loss": 1.336, "step": 47300 }, { "epoch": 4.47, "grad_norm": 86.375, "learning_rate": 1.0531297332934768e-07, "loss": 1.3902, "step": 47320 }, { "epoch": 4.48, "grad_norm": 88.625, "learning_rate": 1.0493482979854213e-07, "loss": 1.3191, "step": 47340 }, { "epoch": 4.48, "grad_norm": 86.0625, "learning_rate": 1.0455668626773658e-07, "loss": 1.307, "step": 47360 }, { "epoch": 4.48, "grad_norm": 67.75, "learning_rate": 1.0417854273693103e-07, "loss": 1.3867, "step": 47380 }, { "epoch": 4.48, "grad_norm": 61.125, "learning_rate": 1.0380039920612546e-07, "loss": 1.3503, "step": 47400 }, { "epoch": 4.48, "grad_norm": 62.9375, "learning_rate": 1.0342225567531991e-07, "loss": 1.271, "step": 47420 }, { "epoch": 4.48, "grad_norm": 60.4375, "learning_rate": 1.0304411214451434e-07, "loss": 1.3358, "step": 47440 }, { "epoch": 4.49, "grad_norm": 54.875, "learning_rate": 1.0266596861370879e-07, "loss": 1.3072, "step": 47460 }, { "epoch": 4.49, "grad_norm": 60.0625, "learning_rate": 1.0228782508290324e-07, "loss": 1.4395, "step": 47480 }, { "epoch": 4.49, "grad_norm": 55.0625, "learning_rate": 1.0190968155209769e-07, "loss": 1.2708, "step": 47500 }, { "epoch": 4.49, "grad_norm": 55.59375, "learning_rate": 1.0153153802129212e-07, "loss": 1.3655, "step": 47520 }, { "epoch": 4.49, "grad_norm": 70.8125, "learning_rate": 1.0115339449048656e-07, "loss": 1.3221, "step": 47540 }, { "epoch": 4.5, "grad_norm": 58.125, "learning_rate": 1.00775250959681e-07, "loss": 1.3146, "step": 47560 }, { "epoch": 4.5, "grad_norm": 82.375, "learning_rate": 1.0039710742887544e-07, "loss": 1.4091, "step": 47580 }, { "epoch": 4.5, "grad_norm": 81.6875, "learning_rate": 1.000189638980699e-07, "loss": 1.4975, "step": 47600 }, { "epoch": 4.5, "grad_norm": 64.625, "learning_rate": 9.964082036726433e-08, "loss": 1.2658, "step": 47620 }, { "epoch": 4.5, "grad_norm": 66.5, "learning_rate": 9.926267683645878e-08, "loss": 1.343, "step": 47640 }, { "epoch": 4.51, "grad_norm": 70.0, "learning_rate": 9.888453330565322e-08, "loss": 1.2768, "step": 47660 }, { "epoch": 4.51, "grad_norm": 71.9375, "learning_rate": 9.850638977484766e-08, "loss": 1.2412, "step": 47680 }, { "epoch": 4.51, "grad_norm": 61.0, "learning_rate": 9.81282462440421e-08, "loss": 1.3141, "step": 47700 }, { "epoch": 4.51, "grad_norm": 81.125, "learning_rate": 9.775010271323656e-08, "loss": 1.3319, "step": 47720 }, { "epoch": 4.51, "grad_norm": 70.6875, "learning_rate": 9.7371959182431e-08, "loss": 1.3988, "step": 47740 }, { "epoch": 4.52, "grad_norm": 61.25, "learning_rate": 9.699381565162544e-08, "loss": 1.2966, "step": 47760 }, { "epoch": 4.52, "grad_norm": 53.59375, "learning_rate": 9.661567212081988e-08, "loss": 1.3256, "step": 47780 }, { "epoch": 4.52, "grad_norm": 63.375, "learning_rate": 9.623752859001431e-08, "loss": 1.3215, "step": 47800 }, { "epoch": 4.52, "grad_norm": 111.6875, "learning_rate": 9.585938505920876e-08, "loss": 1.3497, "step": 47820 }, { "epoch": 4.52, "grad_norm": 64.625, "learning_rate": 9.54812415284032e-08, "loss": 1.3466, "step": 47840 }, { "epoch": 4.52, "grad_norm": 71.125, "learning_rate": 9.510309799759766e-08, "loss": 1.3301, "step": 47860 }, { "epoch": 4.53, "grad_norm": 64.0625, "learning_rate": 9.472495446679209e-08, "loss": 1.4499, "step": 47880 }, { "epoch": 4.53, "grad_norm": 75.0625, "learning_rate": 9.434681093598654e-08, "loss": 1.4131, "step": 47900 }, { "epoch": 4.53, "grad_norm": 57.90625, "learning_rate": 9.396866740518097e-08, "loss": 1.3384, "step": 47920 }, { "epoch": 4.53, "grad_norm": 60.09375, "learning_rate": 9.359052387437542e-08, "loss": 1.3501, "step": 47940 }, { "epoch": 4.53, "grad_norm": 54.53125, "learning_rate": 9.321238034356985e-08, "loss": 1.3278, "step": 47960 }, { "epoch": 4.54, "grad_norm": 58.0, "learning_rate": 9.283423681276432e-08, "loss": 1.3404, "step": 47980 }, { "epoch": 4.54, "grad_norm": 63.53125, "learning_rate": 9.245609328195875e-08, "loss": 1.3284, "step": 48000 }, { "epoch": 4.54, "grad_norm": 79.8125, "learning_rate": 9.207794975115318e-08, "loss": 1.3528, "step": 48020 }, { "epoch": 4.54, "grad_norm": 68.3125, "learning_rate": 9.169980622034763e-08, "loss": 1.4362, "step": 48040 }, { "epoch": 4.54, "grad_norm": 71.4375, "learning_rate": 9.132166268954207e-08, "loss": 1.4823, "step": 48060 }, { "epoch": 4.55, "grad_norm": 95.5, "learning_rate": 9.094351915873651e-08, "loss": 1.3332, "step": 48080 }, { "epoch": 4.55, "grad_norm": 70.9375, "learning_rate": 9.056537562793096e-08, "loss": 1.3638, "step": 48100 }, { "epoch": 4.55, "grad_norm": 70.5625, "learning_rate": 9.018723209712541e-08, "loss": 1.3538, "step": 48120 }, { "epoch": 4.55, "grad_norm": 54.09375, "learning_rate": 8.980908856631984e-08, "loss": 1.3523, "step": 48140 }, { "epoch": 4.55, "grad_norm": 84.375, "learning_rate": 8.943094503551429e-08, "loss": 1.3176, "step": 48160 }, { "epoch": 4.55, "grad_norm": 64.25, "learning_rate": 8.905280150470873e-08, "loss": 1.4906, "step": 48180 }, { "epoch": 4.56, "grad_norm": 52.375, "learning_rate": 8.867465797390317e-08, "loss": 1.3775, "step": 48200 }, { "epoch": 4.56, "grad_norm": 75.9375, "learning_rate": 8.829651444309762e-08, "loss": 1.2116, "step": 48220 }, { "epoch": 4.56, "grad_norm": 62.75, "learning_rate": 8.791837091229207e-08, "loss": 1.2772, "step": 48240 }, { "epoch": 4.56, "grad_norm": 61.21875, "learning_rate": 8.75402273814865e-08, "loss": 1.4184, "step": 48260 }, { "epoch": 4.56, "grad_norm": 67.25, "learning_rate": 8.716208385068094e-08, "loss": 1.3029, "step": 48280 }, { "epoch": 4.57, "grad_norm": 62.09375, "learning_rate": 8.678394031987539e-08, "loss": 1.3634, "step": 48300 }, { "epoch": 4.57, "grad_norm": 79.5, "learning_rate": 8.640579678906982e-08, "loss": 1.3963, "step": 48320 }, { "epoch": 4.57, "grad_norm": 92.5625, "learning_rate": 8.602765325826428e-08, "loss": 1.3531, "step": 48340 }, { "epoch": 4.57, "grad_norm": 78.75, "learning_rate": 8.564950972745872e-08, "loss": 1.3542, "step": 48360 }, { "epoch": 4.57, "grad_norm": 79.8125, "learning_rate": 8.527136619665317e-08, "loss": 1.3637, "step": 48380 }, { "epoch": 4.58, "grad_norm": 83.0625, "learning_rate": 8.48932226658476e-08, "loss": 1.3512, "step": 48400 }, { "epoch": 4.58, "grad_norm": 45.3125, "learning_rate": 8.451507913504205e-08, "loss": 1.242, "step": 48420 }, { "epoch": 4.58, "grad_norm": 78.125, "learning_rate": 8.413693560423648e-08, "loss": 1.3456, "step": 48440 }, { "epoch": 4.58, "grad_norm": 67.5, "learning_rate": 8.375879207343094e-08, "loss": 1.2762, "step": 48460 }, { "epoch": 4.58, "grad_norm": 59.4375, "learning_rate": 8.338064854262538e-08, "loss": 1.3527, "step": 48480 }, { "epoch": 4.58, "grad_norm": 75.75, "learning_rate": 8.300250501181981e-08, "loss": 1.3539, "step": 48500 }, { "epoch": 4.59, "grad_norm": 71.625, "learning_rate": 8.262436148101426e-08, "loss": 1.2919, "step": 48520 }, { "epoch": 4.59, "grad_norm": 77.4375, "learning_rate": 8.22462179502087e-08, "loss": 1.2577, "step": 48540 }, { "epoch": 4.59, "grad_norm": 56.90625, "learning_rate": 8.186807441940314e-08, "loss": 1.286, "step": 48560 }, { "epoch": 4.59, "grad_norm": 60.75, "learning_rate": 8.148993088859758e-08, "loss": 1.4127, "step": 48580 }, { "epoch": 4.59, "grad_norm": 53.875, "learning_rate": 8.111178735779204e-08, "loss": 1.2502, "step": 48600 }, { "epoch": 4.6, "grad_norm": 62.5, "learning_rate": 8.073364382698647e-08, "loss": 1.2631, "step": 48620 }, { "epoch": 4.6, "grad_norm": 70.3125, "learning_rate": 8.035550029618092e-08, "loss": 1.2837, "step": 48640 }, { "epoch": 4.6, "grad_norm": 49.3125, "learning_rate": 7.997735676537535e-08, "loss": 1.3133, "step": 48660 }, { "epoch": 4.6, "grad_norm": 60.4375, "learning_rate": 7.95992132345698e-08, "loss": 1.4197, "step": 48680 }, { "epoch": 4.6, "grad_norm": 60.8125, "learning_rate": 7.922106970376424e-08, "loss": 1.3261, "step": 48700 }, { "epoch": 4.61, "grad_norm": 77.0, "learning_rate": 7.884292617295868e-08, "loss": 1.2969, "step": 48720 }, { "epoch": 4.61, "grad_norm": 58.09375, "learning_rate": 7.846478264215313e-08, "loss": 1.2446, "step": 48740 }, { "epoch": 4.61, "grad_norm": 52.125, "learning_rate": 7.808663911134757e-08, "loss": 1.2991, "step": 48760 }, { "epoch": 4.61, "grad_norm": 62.4375, "learning_rate": 7.770849558054201e-08, "loss": 1.2746, "step": 48780 }, { "epoch": 4.61, "grad_norm": 53.84375, "learning_rate": 7.733035204973645e-08, "loss": 1.3841, "step": 48800 }, { "epoch": 4.62, "grad_norm": 66.625, "learning_rate": 7.69522085189309e-08, "loss": 1.3453, "step": 48820 }, { "epoch": 4.62, "grad_norm": 84.25, "learning_rate": 7.657406498812534e-08, "loss": 1.2658, "step": 48840 }, { "epoch": 4.62, "grad_norm": 76.75, "learning_rate": 7.619592145731979e-08, "loss": 1.2577, "step": 48860 }, { "epoch": 4.62, "grad_norm": 78.125, "learning_rate": 7.581777792651423e-08, "loss": 1.3535, "step": 48880 }, { "epoch": 4.62, "grad_norm": 65.375, "learning_rate": 7.543963439570867e-08, "loss": 1.2434, "step": 48900 }, { "epoch": 4.62, "grad_norm": 67.3125, "learning_rate": 7.506149086490311e-08, "loss": 1.2902, "step": 48920 }, { "epoch": 4.63, "grad_norm": 67.9375, "learning_rate": 7.468334733409756e-08, "loss": 1.4072, "step": 48940 }, { "epoch": 4.63, "grad_norm": 68.8125, "learning_rate": 7.4305203803292e-08, "loss": 1.281, "step": 48960 }, { "epoch": 4.63, "grad_norm": 75.0625, "learning_rate": 7.392706027248644e-08, "loss": 1.2644, "step": 48980 }, { "epoch": 4.63, "grad_norm": 82.875, "learning_rate": 7.354891674168089e-08, "loss": 1.4248, "step": 49000 }, { "epoch": 4.63, "grad_norm": 63.3125, "learning_rate": 7.317077321087532e-08, "loss": 1.4421, "step": 49020 }, { "epoch": 4.64, "grad_norm": 79.1875, "learning_rate": 7.279262968006977e-08, "loss": 1.3399, "step": 49040 }, { "epoch": 4.64, "grad_norm": 65.3125, "learning_rate": 7.24144861492642e-08, "loss": 1.3482, "step": 49060 }, { "epoch": 4.64, "grad_norm": 74.5, "learning_rate": 7.203634261845867e-08, "loss": 1.3647, "step": 49080 }, { "epoch": 4.64, "grad_norm": 72.25, "learning_rate": 7.16581990876531e-08, "loss": 1.2468, "step": 49100 }, { "epoch": 4.64, "grad_norm": 61.5, "learning_rate": 7.128005555684755e-08, "loss": 1.3302, "step": 49120 }, { "epoch": 4.65, "grad_norm": 70.875, "learning_rate": 7.090191202604198e-08, "loss": 1.3486, "step": 49140 }, { "epoch": 4.65, "grad_norm": 69.625, "learning_rate": 7.052376849523643e-08, "loss": 1.2402, "step": 49160 }, { "epoch": 4.65, "grad_norm": 71.25, "learning_rate": 7.014562496443086e-08, "loss": 1.3408, "step": 49180 }, { "epoch": 4.65, "grad_norm": 61.40625, "learning_rate": 6.97674814336253e-08, "loss": 1.3726, "step": 49200 }, { "epoch": 4.65, "grad_norm": 45.6875, "learning_rate": 6.938933790281976e-08, "loss": 1.2908, "step": 49220 }, { "epoch": 4.65, "grad_norm": 66.125, "learning_rate": 6.90111943720142e-08, "loss": 1.2688, "step": 49240 }, { "epoch": 4.66, "grad_norm": 44.46875, "learning_rate": 6.863305084120864e-08, "loss": 1.2785, "step": 49260 }, { "epoch": 4.66, "grad_norm": 76.25, "learning_rate": 6.825490731040308e-08, "loss": 1.2984, "step": 49280 }, { "epoch": 4.66, "grad_norm": 76.9375, "learning_rate": 6.787676377959752e-08, "loss": 1.394, "step": 49300 }, { "epoch": 4.66, "grad_norm": 62.21875, "learning_rate": 6.749862024879196e-08, "loss": 1.336, "step": 49320 }, { "epoch": 4.66, "grad_norm": 67.875, "learning_rate": 6.712047671798642e-08, "loss": 1.3415, "step": 49340 }, { "epoch": 4.67, "grad_norm": 57.78125, "learning_rate": 6.674233318718085e-08, "loss": 1.428, "step": 49360 }, { "epoch": 4.67, "grad_norm": 74.8125, "learning_rate": 6.63641896563753e-08, "loss": 1.2926, "step": 49380 }, { "epoch": 4.67, "grad_norm": 58.21875, "learning_rate": 6.598604612556974e-08, "loss": 1.3439, "step": 49400 }, { "epoch": 4.67, "grad_norm": 52.34375, "learning_rate": 6.560790259476418e-08, "loss": 1.2582, "step": 49420 }, { "epoch": 4.67, "grad_norm": 63.09375, "learning_rate": 6.522975906395862e-08, "loss": 1.385, "step": 49440 }, { "epoch": 4.68, "grad_norm": 64.4375, "learning_rate": 6.485161553315307e-08, "loss": 1.3645, "step": 49460 }, { "epoch": 4.68, "grad_norm": 74.875, "learning_rate": 6.447347200234751e-08, "loss": 1.3553, "step": 49480 }, { "epoch": 4.68, "grad_norm": 76.5, "learning_rate": 6.409532847154195e-08, "loss": 1.3469, "step": 49500 }, { "epoch": 4.68, "grad_norm": 62.90625, "learning_rate": 6.37171849407364e-08, "loss": 1.2842, "step": 49520 }, { "epoch": 4.68, "grad_norm": 57.90625, "learning_rate": 6.333904140993083e-08, "loss": 1.2475, "step": 49540 }, { "epoch": 4.69, "grad_norm": 51.21875, "learning_rate": 6.296089787912528e-08, "loss": 1.4054, "step": 49560 }, { "epoch": 4.69, "grad_norm": 63.75, "learning_rate": 6.258275434831973e-08, "loss": 1.2913, "step": 49580 }, { "epoch": 4.69, "grad_norm": 62.875, "learning_rate": 6.220461081751416e-08, "loss": 1.2777, "step": 49600 }, { "epoch": 4.69, "grad_norm": 67.8125, "learning_rate": 6.182646728670861e-08, "loss": 1.4151, "step": 49620 }, { "epoch": 4.69, "grad_norm": 68.8125, "learning_rate": 6.144832375590306e-08, "loss": 1.3384, "step": 49640 }, { "epoch": 4.69, "grad_norm": 93.875, "learning_rate": 6.107018022509749e-08, "loss": 1.3058, "step": 49660 }, { "epoch": 4.7, "grad_norm": 106.4375, "learning_rate": 6.069203669429194e-08, "loss": 1.358, "step": 49680 }, { "epoch": 4.7, "grad_norm": 59.34375, "learning_rate": 6.031389316348639e-08, "loss": 1.3103, "step": 49700 }, { "epoch": 4.7, "grad_norm": 67.625, "learning_rate": 5.993574963268082e-08, "loss": 1.2339, "step": 49720 }, { "epoch": 4.7, "grad_norm": 74.5, "learning_rate": 5.955760610187527e-08, "loss": 1.3031, "step": 49740 }, { "epoch": 4.7, "grad_norm": 79.8125, "learning_rate": 5.917946257106971e-08, "loss": 1.2387, "step": 49760 }, { "epoch": 4.71, "grad_norm": 60.5, "learning_rate": 5.880131904026415e-08, "loss": 1.2539, "step": 49780 }, { "epoch": 4.71, "grad_norm": 79.75, "learning_rate": 5.842317550945859e-08, "loss": 1.3769, "step": 49800 }, { "epoch": 4.71, "grad_norm": 86.0625, "learning_rate": 5.8045031978653034e-08, "loss": 1.3498, "step": 49820 }, { "epoch": 4.71, "grad_norm": 66.875, "learning_rate": 5.7666888447847475e-08, "loss": 1.345, "step": 49840 }, { "epoch": 4.71, "grad_norm": 61.03125, "learning_rate": 5.728874491704192e-08, "loss": 1.3413, "step": 49860 }, { "epoch": 4.72, "grad_norm": 69.1875, "learning_rate": 5.6910601386236364e-08, "loss": 1.3266, "step": 49880 }, { "epoch": 4.72, "grad_norm": 115.125, "learning_rate": 5.6532457855430805e-08, "loss": 1.3336, "step": 49900 }, { "epoch": 4.72, "grad_norm": 59.78125, "learning_rate": 5.615431432462525e-08, "loss": 1.4154, "step": 49920 }, { "epoch": 4.72, "grad_norm": 62.8125, "learning_rate": 5.5776170793819694e-08, "loss": 1.4303, "step": 49940 }, { "epoch": 4.72, "grad_norm": 88.4375, "learning_rate": 5.5398027263014136e-08, "loss": 1.4158, "step": 49960 }, { "epoch": 4.72, "grad_norm": 75.25, "learning_rate": 5.5019883732208583e-08, "loss": 1.2602, "step": 49980 }, { "epoch": 4.73, "grad_norm": 69.8125, "learning_rate": 5.4641740201403024e-08, "loss": 1.3976, "step": 50000 }, { "epoch": 4.73, "grad_norm": 69.875, "learning_rate": 5.4263596670597466e-08, "loss": 1.304, "step": 50020 }, { "epoch": 4.73, "grad_norm": 69.125, "learning_rate": 5.388545313979191e-08, "loss": 1.2925, "step": 50040 }, { "epoch": 4.73, "grad_norm": 64.0, "learning_rate": 5.350730960898635e-08, "loss": 1.3776, "step": 50060 }, { "epoch": 4.73, "grad_norm": 53.0, "learning_rate": 5.312916607818079e-08, "loss": 1.4188, "step": 50080 }, { "epoch": 4.74, "grad_norm": 66.375, "learning_rate": 5.275102254737524e-08, "loss": 1.2999, "step": 50100 }, { "epoch": 4.74, "grad_norm": 64.8125, "learning_rate": 5.237287901656968e-08, "loss": 1.4259, "step": 50120 }, { "epoch": 4.74, "grad_norm": 58.84375, "learning_rate": 5.199473548576412e-08, "loss": 1.4155, "step": 50140 }, { "epoch": 4.74, "grad_norm": 85.25, "learning_rate": 5.161659195495857e-08, "loss": 1.2742, "step": 50160 }, { "epoch": 4.74, "grad_norm": 71.5625, "learning_rate": 5.123844842415301e-08, "loss": 1.3077, "step": 50180 }, { "epoch": 4.75, "grad_norm": 70.5625, "learning_rate": 5.086030489334745e-08, "loss": 1.4397, "step": 50200 }, { "epoch": 4.75, "grad_norm": 70.125, "learning_rate": 5.04821613625419e-08, "loss": 1.2983, "step": 50220 }, { "epoch": 4.75, "grad_norm": 74.375, "learning_rate": 5.010401783173634e-08, "loss": 1.3589, "step": 50240 }, { "epoch": 4.75, "grad_norm": 67.875, "learning_rate": 4.972587430093078e-08, "loss": 1.3045, "step": 50260 }, { "epoch": 4.75, "grad_norm": 64.5, "learning_rate": 4.934773077012522e-08, "loss": 1.3912, "step": 50280 }, { "epoch": 4.76, "grad_norm": 63.46875, "learning_rate": 4.896958723931966e-08, "loss": 1.3652, "step": 50300 }, { "epoch": 4.76, "grad_norm": 72.75, "learning_rate": 4.85914437085141e-08, "loss": 1.3422, "step": 50320 }, { "epoch": 4.76, "grad_norm": 53.46875, "learning_rate": 4.821330017770855e-08, "loss": 1.2532, "step": 50340 }, { "epoch": 4.76, "grad_norm": 56.9375, "learning_rate": 4.783515664690299e-08, "loss": 1.4502, "step": 50360 }, { "epoch": 4.76, "grad_norm": 51.40625, "learning_rate": 4.745701311609743e-08, "loss": 1.3256, "step": 50380 }, { "epoch": 4.76, "grad_norm": 88.25, "learning_rate": 4.707886958529188e-08, "loss": 1.2456, "step": 50400 }, { "epoch": 4.77, "grad_norm": 64.9375, "learning_rate": 4.670072605448632e-08, "loss": 1.3789, "step": 50420 }, { "epoch": 4.77, "grad_norm": 69.6875, "learning_rate": 4.632258252368076e-08, "loss": 1.3511, "step": 50440 }, { "epoch": 4.77, "grad_norm": 60.84375, "learning_rate": 4.594443899287521e-08, "loss": 1.3829, "step": 50460 }, { "epoch": 4.77, "grad_norm": 65.75, "learning_rate": 4.556629546206965e-08, "loss": 1.316, "step": 50480 }, { "epoch": 4.77, "grad_norm": 60.9375, "learning_rate": 4.518815193126409e-08, "loss": 1.3926, "step": 50500 }, { "epoch": 4.78, "grad_norm": 68.9375, "learning_rate": 4.481000840045853e-08, "loss": 1.3642, "step": 50520 }, { "epoch": 4.78, "grad_norm": 52.03125, "learning_rate": 4.4431864869652975e-08, "loss": 1.3064, "step": 50540 }, { "epoch": 4.78, "grad_norm": 64.8125, "learning_rate": 4.4053721338847417e-08, "loss": 1.3323, "step": 50560 }, { "epoch": 4.78, "grad_norm": 67.25, "learning_rate": 4.367557780804186e-08, "loss": 1.2735, "step": 50580 }, { "epoch": 4.78, "grad_norm": 55.1875, "learning_rate": 4.3297434277236306e-08, "loss": 1.3655, "step": 50600 }, { "epoch": 4.79, "grad_norm": 71.1875, "learning_rate": 4.2919290746430747e-08, "loss": 1.2776, "step": 50620 }, { "epoch": 4.79, "grad_norm": 62.9375, "learning_rate": 4.254114721562519e-08, "loss": 1.2622, "step": 50640 }, { "epoch": 4.79, "grad_norm": 79.0, "learning_rate": 4.2163003684819636e-08, "loss": 1.4572, "step": 50660 }, { "epoch": 4.79, "grad_norm": 70.625, "learning_rate": 4.178486015401408e-08, "loss": 1.304, "step": 50680 }, { "epoch": 4.79, "grad_norm": 68.5, "learning_rate": 4.140671662320852e-08, "loss": 1.3719, "step": 50700 }, { "epoch": 4.79, "grad_norm": 65.0625, "learning_rate": 4.1028573092402966e-08, "loss": 1.3088, "step": 50720 }, { "epoch": 4.8, "grad_norm": 63.25, "learning_rate": 4.065042956159741e-08, "loss": 1.3505, "step": 50740 }, { "epoch": 4.8, "grad_norm": 61.8125, "learning_rate": 4.027228603079184e-08, "loss": 1.2913, "step": 50760 }, { "epoch": 4.8, "grad_norm": 54.9375, "learning_rate": 3.989414249998629e-08, "loss": 1.3873, "step": 50780 }, { "epoch": 4.8, "grad_norm": 62.9375, "learning_rate": 3.951599896918073e-08, "loss": 1.4092, "step": 50800 }, { "epoch": 4.8, "grad_norm": 67.5, "learning_rate": 3.913785543837517e-08, "loss": 1.2893, "step": 50820 }, { "epoch": 4.81, "grad_norm": 59.625, "learning_rate": 3.875971190756962e-08, "loss": 1.354, "step": 50840 }, { "epoch": 4.81, "grad_norm": 54.25, "learning_rate": 3.838156837676406e-08, "loss": 1.3941, "step": 50860 }, { "epoch": 4.81, "grad_norm": 72.8125, "learning_rate": 3.80034248459585e-08, "loss": 1.3033, "step": 50880 }, { "epoch": 4.81, "grad_norm": 69.625, "learning_rate": 3.762528131515295e-08, "loss": 1.2482, "step": 50900 }, { "epoch": 4.81, "grad_norm": 53.9375, "learning_rate": 3.724713778434739e-08, "loss": 1.3942, "step": 50920 }, { "epoch": 4.82, "grad_norm": 89.1875, "learning_rate": 3.686899425354183e-08, "loss": 1.3397, "step": 50940 }, { "epoch": 4.82, "grad_norm": 58.75, "learning_rate": 3.649085072273628e-08, "loss": 1.2749, "step": 50960 }, { "epoch": 4.82, "grad_norm": 71.3125, "learning_rate": 3.611270719193072e-08, "loss": 1.3422, "step": 50980 }, { "epoch": 4.82, "grad_norm": 54.34375, "learning_rate": 3.5734563661125155e-08, "loss": 1.2571, "step": 51000 }, { "epoch": 4.82, "grad_norm": 58.84375, "learning_rate": 3.53564201303196e-08, "loss": 1.2357, "step": 51020 }, { "epoch": 4.83, "grad_norm": 61.78125, "learning_rate": 3.4978276599514044e-08, "loss": 1.3384, "step": 51040 }, { "epoch": 4.83, "grad_norm": 61.40625, "learning_rate": 3.4600133068708485e-08, "loss": 1.1818, "step": 51060 }, { "epoch": 4.83, "grad_norm": 67.25, "learning_rate": 3.422198953790293e-08, "loss": 1.3936, "step": 51080 }, { "epoch": 4.83, "grad_norm": 70.8125, "learning_rate": 3.3843846007097374e-08, "loss": 1.2922, "step": 51100 }, { "epoch": 4.83, "grad_norm": 62.75, "learning_rate": 3.3465702476291815e-08, "loss": 1.4717, "step": 51120 }, { "epoch": 4.83, "grad_norm": 55.375, "learning_rate": 3.308755894548626e-08, "loss": 1.342, "step": 51140 }, { "epoch": 4.84, "grad_norm": 84.625, "learning_rate": 3.2709415414680704e-08, "loss": 1.3032, "step": 51160 }, { "epoch": 4.84, "grad_norm": 67.625, "learning_rate": 3.2331271883875145e-08, "loss": 1.366, "step": 51180 }, { "epoch": 4.84, "grad_norm": 67.625, "learning_rate": 3.195312835306959e-08, "loss": 1.3584, "step": 51200 }, { "epoch": 4.84, "grad_norm": 65.25, "learning_rate": 3.1574984822264034e-08, "loss": 1.3087, "step": 51220 }, { "epoch": 4.84, "grad_norm": 61.5, "learning_rate": 3.1196841291458475e-08, "loss": 1.3388, "step": 51240 }, { "epoch": 4.85, "grad_norm": 63.78125, "learning_rate": 3.0818697760652917e-08, "loss": 1.2657, "step": 51260 }, { "epoch": 4.85, "grad_norm": 68.75, "learning_rate": 3.044055422984736e-08, "loss": 1.3103, "step": 51280 }, { "epoch": 4.85, "grad_norm": 56.4375, "learning_rate": 3.00624106990418e-08, "loss": 1.4243, "step": 51300 }, { "epoch": 4.85, "grad_norm": 83.875, "learning_rate": 2.9684267168236243e-08, "loss": 1.2968, "step": 51320 }, { "epoch": 4.85, "grad_norm": 64.3125, "learning_rate": 2.9306123637430688e-08, "loss": 1.313, "step": 51340 }, { "epoch": 4.86, "grad_norm": 78.875, "learning_rate": 2.892798010662513e-08, "loss": 1.3133, "step": 51360 }, { "epoch": 4.86, "grad_norm": 68.0625, "learning_rate": 2.8549836575819574e-08, "loss": 1.3116, "step": 51380 }, { "epoch": 4.86, "grad_norm": 72.0625, "learning_rate": 2.8171693045014018e-08, "loss": 1.3143, "step": 51400 }, { "epoch": 4.86, "grad_norm": 56.96875, "learning_rate": 2.7793549514208456e-08, "loss": 1.3107, "step": 51420 }, { "epoch": 4.86, "grad_norm": 57.75, "learning_rate": 2.74154059834029e-08, "loss": 1.3432, "step": 51440 }, { "epoch": 4.86, "grad_norm": 75.25, "learning_rate": 2.7037262452597345e-08, "loss": 1.3119, "step": 51460 }, { "epoch": 4.87, "grad_norm": 81.875, "learning_rate": 2.6659118921791786e-08, "loss": 1.2609, "step": 51480 }, { "epoch": 4.87, "grad_norm": 70.25, "learning_rate": 2.628097539098623e-08, "loss": 1.2975, "step": 51500 }, { "epoch": 4.87, "grad_norm": 63.9375, "learning_rate": 2.5902831860180668e-08, "loss": 1.3698, "step": 51520 }, { "epoch": 4.87, "grad_norm": 72.125, "learning_rate": 2.5524688329375113e-08, "loss": 1.3457, "step": 51540 }, { "epoch": 4.87, "grad_norm": 62.4375, "learning_rate": 2.5146544798569557e-08, "loss": 1.3434, "step": 51560 }, { "epoch": 4.88, "grad_norm": 92.9375, "learning_rate": 2.4768401267763998e-08, "loss": 1.3211, "step": 51580 }, { "epoch": 4.88, "grad_norm": 59.6875, "learning_rate": 2.4390257736958443e-08, "loss": 1.3081, "step": 51600 }, { "epoch": 4.88, "grad_norm": 62.09375, "learning_rate": 2.4012114206152887e-08, "loss": 1.333, "step": 51620 }, { "epoch": 4.88, "grad_norm": 54.59375, "learning_rate": 2.3633970675347325e-08, "loss": 1.316, "step": 51640 }, { "epoch": 4.88, "grad_norm": 70.9375, "learning_rate": 2.325582714454177e-08, "loss": 1.3066, "step": 51660 }, { "epoch": 4.89, "grad_norm": 74.5625, "learning_rate": 2.2877683613736214e-08, "loss": 1.3551, "step": 51680 }, { "epoch": 4.89, "grad_norm": 72.0, "learning_rate": 2.2499540082930655e-08, "loss": 1.3349, "step": 51700 }, { "epoch": 4.89, "grad_norm": 69.0, "learning_rate": 2.21213965521251e-08, "loss": 1.2775, "step": 51720 }, { "epoch": 4.89, "grad_norm": 74.4375, "learning_rate": 2.1743253021319544e-08, "loss": 1.3279, "step": 51740 }, { "epoch": 4.89, "grad_norm": 79.25, "learning_rate": 2.1365109490513982e-08, "loss": 1.4207, "step": 51760 }, { "epoch": 4.9, "grad_norm": 66.4375, "learning_rate": 2.0986965959708426e-08, "loss": 1.3499, "step": 51780 }, { "epoch": 4.9, "grad_norm": 64.1875, "learning_rate": 2.060882242890287e-08, "loss": 1.3024, "step": 51800 }, { "epoch": 4.9, "grad_norm": 53.90625, "learning_rate": 2.0230678898097312e-08, "loss": 1.2509, "step": 51820 }, { "epoch": 4.9, "grad_norm": 58.78125, "learning_rate": 1.9852535367291757e-08, "loss": 1.3702, "step": 51840 }, { "epoch": 4.9, "grad_norm": 57.5625, "learning_rate": 1.94743918364862e-08, "loss": 1.3503, "step": 51860 }, { "epoch": 4.9, "grad_norm": 58.75, "learning_rate": 1.909624830568064e-08, "loss": 1.2938, "step": 51880 }, { "epoch": 4.91, "grad_norm": 53.6875, "learning_rate": 1.8718104774875083e-08, "loss": 1.384, "step": 51900 }, { "epoch": 4.91, "grad_norm": 69.6875, "learning_rate": 1.8339961244069524e-08, "loss": 1.3979, "step": 51920 }, { "epoch": 4.91, "grad_norm": 62.40625, "learning_rate": 1.796181771326397e-08, "loss": 1.3146, "step": 51940 }, { "epoch": 4.91, "grad_norm": 65.6875, "learning_rate": 1.7583674182458413e-08, "loss": 1.2778, "step": 51960 }, { "epoch": 4.91, "grad_norm": 57.375, "learning_rate": 1.7205530651652855e-08, "loss": 1.2466, "step": 51980 }, { "epoch": 4.92, "grad_norm": 77.25, "learning_rate": 1.6827387120847296e-08, "loss": 1.3475, "step": 52000 }, { "epoch": 4.92, "grad_norm": 56.28125, "learning_rate": 1.644924359004174e-08, "loss": 1.2942, "step": 52020 }, { "epoch": 4.92, "grad_norm": 68.5, "learning_rate": 1.607110005923618e-08, "loss": 1.3799, "step": 52040 }, { "epoch": 4.92, "grad_norm": 57.90625, "learning_rate": 1.5692956528430626e-08, "loss": 1.366, "step": 52060 }, { "epoch": 4.92, "grad_norm": 70.5625, "learning_rate": 1.531481299762507e-08, "loss": 1.3255, "step": 52080 }, { "epoch": 4.93, "grad_norm": 59.0625, "learning_rate": 1.493666946681951e-08, "loss": 1.3114, "step": 52100 }, { "epoch": 4.93, "grad_norm": 61.46875, "learning_rate": 1.4558525936013954e-08, "loss": 1.2898, "step": 52120 }, { "epoch": 4.93, "grad_norm": 57.6875, "learning_rate": 1.4180382405208397e-08, "loss": 1.3028, "step": 52140 }, { "epoch": 4.93, "grad_norm": 61.96875, "learning_rate": 1.380223887440284e-08, "loss": 1.3921, "step": 52160 }, { "epoch": 4.93, "grad_norm": 61.53125, "learning_rate": 1.3424095343597283e-08, "loss": 1.3036, "step": 52180 }, { "epoch": 4.93, "grad_norm": 52.0625, "learning_rate": 1.3045951812791725e-08, "loss": 1.2947, "step": 52200 }, { "epoch": 4.94, "grad_norm": 63.03125, "learning_rate": 1.2667808281986167e-08, "loss": 1.3312, "step": 52220 }, { "epoch": 4.94, "grad_norm": 68.375, "learning_rate": 1.2289664751180611e-08, "loss": 1.2572, "step": 52240 }, { "epoch": 4.94, "grad_norm": 65.9375, "learning_rate": 1.1911521220375054e-08, "loss": 1.346, "step": 52260 }, { "epoch": 4.94, "grad_norm": 60.625, "learning_rate": 1.1533377689569495e-08, "loss": 1.2104, "step": 52280 }, { "epoch": 4.94, "grad_norm": 89.4375, "learning_rate": 1.115523415876394e-08, "loss": 1.3119, "step": 52300 }, { "epoch": 4.95, "grad_norm": 72.875, "learning_rate": 1.0777090627958382e-08, "loss": 1.4323, "step": 52320 }, { "epoch": 4.95, "grad_norm": 59.21875, "learning_rate": 1.0398947097152824e-08, "loss": 1.2751, "step": 52340 }, { "epoch": 4.95, "grad_norm": 63.96875, "learning_rate": 1.0020803566347268e-08, "loss": 1.3833, "step": 52360 }, { "epoch": 4.95, "grad_norm": 91.8125, "learning_rate": 9.64266003554171e-09, "loss": 1.4989, "step": 52380 }, { "epoch": 4.95, "grad_norm": 57.59375, "learning_rate": 9.264516504736152e-09, "loss": 1.3041, "step": 52400 }, { "epoch": 4.96, "grad_norm": 62.28125, "learning_rate": 8.886372973930595e-09, "loss": 1.2397, "step": 52420 }, { "epoch": 4.96, "grad_norm": 51.78125, "learning_rate": 8.50822944312504e-09, "loss": 1.3625, "step": 52440 }, { "epoch": 4.96, "grad_norm": 75.5, "learning_rate": 8.13008591231948e-09, "loss": 1.3415, "step": 52460 }, { "epoch": 4.96, "grad_norm": 53.59375, "learning_rate": 7.751942381513923e-09, "loss": 1.232, "step": 52480 }, { "epoch": 4.96, "grad_norm": 67.1875, "learning_rate": 7.373798850708366e-09, "loss": 1.2606, "step": 52500 }, { "epoch": 4.97, "grad_norm": 62.0, "learning_rate": 6.995655319902809e-09, "loss": 1.3394, "step": 52520 }, { "epoch": 4.97, "grad_norm": 62.6875, "learning_rate": 6.6175117890972525e-09, "loss": 1.2363, "step": 52540 }, { "epoch": 4.97, "grad_norm": 62.625, "learning_rate": 6.2393682582916944e-09, "loss": 1.2659, "step": 52560 }, { "epoch": 4.97, "grad_norm": 82.5625, "learning_rate": 5.861224727486137e-09, "loss": 1.3677, "step": 52580 }, { "epoch": 4.97, "grad_norm": 54.65625, "learning_rate": 5.48308119668058e-09, "loss": 1.2904, "step": 52600 }, { "epoch": 4.97, "grad_norm": 76.4375, "learning_rate": 5.104937665875023e-09, "loss": 1.313, "step": 52620 }, { "epoch": 4.98, "grad_norm": 67.8125, "learning_rate": 4.726794135069465e-09, "loss": 1.3009, "step": 52640 }, { "epoch": 4.98, "grad_norm": 75.75, "learning_rate": 4.3486506042639085e-09, "loss": 1.3895, "step": 52660 }, { "epoch": 4.98, "grad_norm": 87.25, "learning_rate": 3.970507073458351e-09, "loss": 1.3273, "step": 52680 }, { "epoch": 4.98, "grad_norm": 59.78125, "learning_rate": 3.592363542652794e-09, "loss": 1.3326, "step": 52700 }, { "epoch": 4.98, "grad_norm": 73.0, "learning_rate": 3.2142200118472365e-09, "loss": 1.2185, "step": 52720 }, { "epoch": 4.99, "grad_norm": 61.0625, "learning_rate": 2.8360764810416793e-09, "loss": 1.4331, "step": 52740 }, { "epoch": 4.99, "grad_norm": 66.8125, "learning_rate": 2.457932950236122e-09, "loss": 1.2308, "step": 52760 }, { "epoch": 4.99, "grad_norm": 60.71875, "learning_rate": 2.0797894194305645e-09, "loss": 1.4168, "step": 52780 }, { "epoch": 4.99, "grad_norm": 56.5625, "learning_rate": 1.7016458886250076e-09, "loss": 1.3373, "step": 52800 }, { "epoch": 4.99, "grad_norm": 53.40625, "learning_rate": 1.3235023578194504e-09, "loss": 1.3291, "step": 52820 }, { "epoch": 5.0, "grad_norm": 54.625, "learning_rate": 9.45358827013893e-10, "loss": 1.38, "step": 52840 }, { "epoch": 5.0, "grad_norm": 84.875, "learning_rate": 5.672152962083359e-10, "loss": 1.2983, "step": 52860 }, { "epoch": 5.0, "grad_norm": 135.625, "learning_rate": 1.8907176540277862e-10, "loss": 1.4019, "step": 52880 } ], "logging_steps": 20, "max_steps": 52890, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.7722335641506468e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }