diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18529 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 52890, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 103.3125, + "learning_rate": 9.996224236844906e-07, + "loss": 5.1244, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 133.25, + "learning_rate": 9.99244280153685e-07, + "loss": 4.3099, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 74.375, + "learning_rate": 9.988661366228795e-07, + "loss": 3.1455, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 83.875, + "learning_rate": 9.98487993092074e-07, + "loss": 3.0035, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 106.0625, + "learning_rate": 9.981098495612682e-07, + "loss": 2.9661, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 55.28125, + "learning_rate": 9.977317060304627e-07, + "loss": 2.634, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 69.0, + "learning_rate": 9.973535624996572e-07, + "loss": 2.601, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 64.6875, + "learning_rate": 9.969754189688517e-07, + "loss": 2.5738, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 99.0, + "learning_rate": 9.965972754380461e-07, + "loss": 2.6165, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 62.125, + "learning_rate": 9.962191319072406e-07, + "loss": 2.5547, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 60.625, + "learning_rate": 9.95840988376435e-07, + "loss": 2.4761, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 59.625, + "learning_rate": 9.954628448456296e-07, + "loss": 2.4663, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 75.75, + "learning_rate": 9.950847013148238e-07, + "loss": 2.5917, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 96.0, + "learning_rate": 9.947065577840183e-07, + "loss": 2.4079, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 85.25, + "learning_rate": 9.943284142532128e-07, + "loss": 2.4535, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 48.5625, + "learning_rate": 9.939502707224072e-07, + "loss": 2.4386, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 53.46875, + "learning_rate": 9.935721271916017e-07, + "loss": 2.4755, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 69.5, + "learning_rate": 9.931939836607962e-07, + "loss": 2.3306, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 45.09375, + "learning_rate": 9.928158401299905e-07, + "loss": 2.3735, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 60.90625, + "learning_rate": 9.92437696599185e-07, + "loss": 2.3466, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 55.34375, + "learning_rate": 9.920595530683794e-07, + "loss": 2.3971, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 49.375, + "learning_rate": 9.916814095375739e-07, + "loss": 2.366, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 49.15625, + "learning_rate": 9.913032660067684e-07, + "loss": 2.3549, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 66.75, + "learning_rate": 9.909251224759626e-07, + "loss": 2.23, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 66.8125, + "learning_rate": 9.905469789451573e-07, + "loss": 2.4057, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 60.375, + "learning_rate": 9.901688354143518e-07, + "loss": 2.339, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 56.65625, + "learning_rate": 9.89790691883546e-07, + "loss": 2.3804, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 78.375, + "learning_rate": 9.894125483527405e-07, + "loss": 2.2699, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 65.4375, + "learning_rate": 9.89034404821935e-07, + "loss": 2.4262, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 55.96875, + "learning_rate": 9.886562612911295e-07, + "loss": 2.3386, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 71.0625, + "learning_rate": 9.88278117760324e-07, + "loss": 2.3846, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 61.75, + "learning_rate": 9.878999742295184e-07, + "loss": 2.2632, + "step": 640 + }, + { + "epoch": 0.06, + "grad_norm": 47.59375, + "learning_rate": 9.875218306987127e-07, + "loss": 2.2826, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 68.75, + "learning_rate": 9.871436871679071e-07, + "loss": 2.3458, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 51.0625, + "learning_rate": 9.867655436371016e-07, + "loss": 2.2185, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 63.71875, + "learning_rate": 9.86387400106296e-07, + "loss": 2.2608, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 78.375, + "learning_rate": 9.860092565754906e-07, + "loss": 2.3038, + "step": 740 + }, + { + "epoch": 0.07, + "grad_norm": 53.25, + "learning_rate": 9.856311130446848e-07, + "loss": 2.2971, + "step": 760 + }, + { + "epoch": 0.07, + "grad_norm": 61.84375, + "learning_rate": 9.852529695138793e-07, + "loss": 2.1458, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 58.78125, + "learning_rate": 9.848748259830738e-07, + "loss": 2.2228, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 48.90625, + "learning_rate": 9.844966824522683e-07, + "loss": 2.3179, + "step": 820 + }, + { + "epoch": 0.08, + "grad_norm": 70.625, + "learning_rate": 9.841185389214627e-07, + "loss": 2.1868, + "step": 840 + }, + { + "epoch": 0.08, + "grad_norm": 64.125, + "learning_rate": 9.837403953906572e-07, + "loss": 2.2075, + "step": 860 + }, + { + "epoch": 0.08, + "grad_norm": 68.0, + "learning_rate": 9.833622518598517e-07, + "loss": 2.2674, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 70.3125, + "learning_rate": 9.829841083290462e-07, + "loss": 2.3387, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 58.5, + "learning_rate": 9.826059647982406e-07, + "loss": 2.2466, + "step": 920 + }, + { + "epoch": 0.09, + "grad_norm": 110.9375, + "learning_rate": 9.822278212674349e-07, + "loss": 2.1914, + "step": 940 + }, + { + "epoch": 0.09, + "grad_norm": 60.96875, + "learning_rate": 9.818496777366294e-07, + "loss": 2.1054, + "step": 960 + }, + { + "epoch": 0.09, + "grad_norm": 73.1875, + "learning_rate": 9.814715342058238e-07, + "loss": 2.2398, + "step": 980 + }, + { + "epoch": 0.09, + "grad_norm": 51.5, + "learning_rate": 9.810933906750183e-07, + "loss": 2.2347, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 79.125, + "learning_rate": 9.807152471442128e-07, + "loss": 2.2701, + "step": 1020 + }, + { + "epoch": 0.1, + "grad_norm": 63.40625, + "learning_rate": 9.803371036134073e-07, + "loss": 2.3747, + "step": 1040 + }, + { + "epoch": 0.1, + "grad_norm": 71.75, + "learning_rate": 9.799589600826015e-07, + "loss": 2.3053, + "step": 1060 + }, + { + "epoch": 0.1, + "grad_norm": 84.0625, + "learning_rate": 9.79580816551796e-07, + "loss": 2.2083, + "step": 1080 + }, + { + "epoch": 0.1, + "grad_norm": 58.5625, + "learning_rate": 9.792026730209905e-07, + "loss": 2.301, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 61.59375, + "learning_rate": 9.78824529490185e-07, + "loss": 2.1949, + "step": 1120 + }, + { + "epoch": 0.11, + "grad_norm": 53.0625, + "learning_rate": 9.784463859593794e-07, + "loss": 2.1185, + "step": 1140 + }, + { + "epoch": 0.11, + "grad_norm": 63.4375, + "learning_rate": 9.780682424285737e-07, + "loss": 2.2003, + "step": 1160 + }, + { + "epoch": 0.11, + "grad_norm": 72.4375, + "learning_rate": 9.776900988977682e-07, + "loss": 2.1994, + "step": 1180 + }, + { + "epoch": 0.11, + "grad_norm": 74.375, + "learning_rate": 9.773119553669628e-07, + "loss": 2.1708, + "step": 1200 + }, + { + "epoch": 0.12, + "grad_norm": 47.75, + "learning_rate": 9.76933811836157e-07, + "loss": 2.1373, + "step": 1220 + }, + { + "epoch": 0.12, + "grad_norm": 88.375, + "learning_rate": 9.765556683053516e-07, + "loss": 2.1869, + "step": 1240 + }, + { + "epoch": 0.12, + "grad_norm": 162.75, + "learning_rate": 9.76177524774546e-07, + "loss": 2.3998, + "step": 1260 + }, + { + "epoch": 0.12, + "grad_norm": 77.875, + "learning_rate": 9.757993812437405e-07, + "loss": 2.1531, + "step": 1280 + }, + { + "epoch": 0.12, + "grad_norm": 80.75, + "learning_rate": 9.75421237712935e-07, + "loss": 2.1803, + "step": 1300 + }, + { + "epoch": 0.12, + "grad_norm": 78.25, + "learning_rate": 9.750430941821295e-07, + "loss": 2.2272, + "step": 1320 + }, + { + "epoch": 0.13, + "grad_norm": 62.40625, + "learning_rate": 9.746649506513237e-07, + "loss": 2.3047, + "step": 1340 + }, + { + "epoch": 0.13, + "grad_norm": 92.8125, + "learning_rate": 9.742868071205182e-07, + "loss": 2.2091, + "step": 1360 + }, + { + "epoch": 0.13, + "grad_norm": 77.0625, + "learning_rate": 9.739086635897127e-07, + "loss": 2.3365, + "step": 1380 + }, + { + "epoch": 0.13, + "grad_norm": 67.0, + "learning_rate": 9.735305200589072e-07, + "loss": 2.1193, + "step": 1400 + }, + { + "epoch": 0.13, + "grad_norm": 67.4375, + "learning_rate": 9.731523765281016e-07, + "loss": 2.1237, + "step": 1420 + }, + { + "epoch": 0.14, + "grad_norm": 51.84375, + "learning_rate": 9.72774232997296e-07, + "loss": 2.1907, + "step": 1440 + }, + { + "epoch": 0.14, + "grad_norm": 73.375, + "learning_rate": 9.723960894664904e-07, + "loss": 2.2086, + "step": 1460 + }, + { + "epoch": 0.14, + "grad_norm": 56.625, + "learning_rate": 9.720179459356848e-07, + "loss": 2.232, + "step": 1480 + }, + { + "epoch": 0.14, + "grad_norm": 67.875, + "learning_rate": 9.716398024048793e-07, + "loss": 2.236, + "step": 1500 + }, + { + "epoch": 0.14, + "grad_norm": 85.6875, + "learning_rate": 9.712616588740738e-07, + "loss": 2.2241, + "step": 1520 + }, + { + "epoch": 0.15, + "grad_norm": 64.5, + "learning_rate": 9.708835153432683e-07, + "loss": 2.1892, + "step": 1540 + }, + { + "epoch": 0.15, + "grad_norm": 56.46875, + "learning_rate": 9.705053718124627e-07, + "loss": 2.2541, + "step": 1560 + }, + { + "epoch": 0.15, + "grad_norm": 78.125, + "learning_rate": 9.701272282816572e-07, + "loss": 2.1645, + "step": 1580 + }, + { + "epoch": 0.15, + "grad_norm": 66.5625, + "learning_rate": 9.697490847508517e-07, + "loss": 2.1083, + "step": 1600 + }, + { + "epoch": 0.15, + "grad_norm": 70.375, + "learning_rate": 9.69370941220046e-07, + "loss": 2.1757, + "step": 1620 + }, + { + "epoch": 0.16, + "grad_norm": 55.9375, + "learning_rate": 9.689927976892404e-07, + "loss": 2.1102, + "step": 1640 + }, + { + "epoch": 0.16, + "grad_norm": 106.4375, + "learning_rate": 9.68614654158435e-07, + "loss": 2.1047, + "step": 1660 + }, + { + "epoch": 0.16, + "grad_norm": 60.59375, + "learning_rate": 9.682365106276294e-07, + "loss": 2.2789, + "step": 1680 + }, + { + "epoch": 0.16, + "grad_norm": 63.8125, + "learning_rate": 9.678583670968239e-07, + "loss": 2.1161, + "step": 1700 + }, + { + "epoch": 0.16, + "grad_norm": 80.4375, + "learning_rate": 9.674802235660181e-07, + "loss": 2.2273, + "step": 1720 + }, + { + "epoch": 0.16, + "grad_norm": 58.28125, + "learning_rate": 9.671020800352126e-07, + "loss": 2.1584, + "step": 1740 + }, + { + "epoch": 0.17, + "grad_norm": 60.59375, + "learning_rate": 9.66723936504407e-07, + "loss": 2.159, + "step": 1760 + }, + { + "epoch": 0.17, + "grad_norm": 77.25, + "learning_rate": 9.663457929736015e-07, + "loss": 2.1554, + "step": 1780 + }, + { + "epoch": 0.17, + "grad_norm": 62.125, + "learning_rate": 9.65967649442796e-07, + "loss": 2.3024, + "step": 1800 + }, + { + "epoch": 0.17, + "grad_norm": 47.9375, + "learning_rate": 9.655895059119905e-07, + "loss": 2.1416, + "step": 1820 + }, + { + "epoch": 0.17, + "grad_norm": 78.875, + "learning_rate": 9.652113623811847e-07, + "loss": 2.2613, + "step": 1840 + }, + { + "epoch": 0.18, + "grad_norm": 71.5, + "learning_rate": 9.648332188503792e-07, + "loss": 2.1109, + "step": 1860 + }, + { + "epoch": 0.18, + "grad_norm": 42.03125, + "learning_rate": 9.644550753195737e-07, + "loss": 2.0072, + "step": 1880 + }, + { + "epoch": 0.18, + "grad_norm": 47.875, + "learning_rate": 9.640769317887682e-07, + "loss": 2.1403, + "step": 1900 + }, + { + "epoch": 0.18, + "grad_norm": 70.25, + "learning_rate": 9.636987882579626e-07, + "loss": 2.1649, + "step": 1920 + }, + { + "epoch": 0.18, + "grad_norm": 52.15625, + "learning_rate": 9.633206447271571e-07, + "loss": 2.0728, + "step": 1940 + }, + { + "epoch": 0.19, + "grad_norm": 65.625, + "learning_rate": 9.629425011963516e-07, + "loss": 2.2163, + "step": 1960 + }, + { + "epoch": 0.19, + "grad_norm": 81.125, + "learning_rate": 9.62564357665546e-07, + "loss": 2.3386, + "step": 1980 + }, + { + "epoch": 0.19, + "grad_norm": 65.0, + "learning_rate": 9.621862141347405e-07, + "loss": 2.1122, + "step": 2000 + }, + { + "epoch": 0.19, + "grad_norm": 62.0, + "learning_rate": 9.618080706039348e-07, + "loss": 2.201, + "step": 2020 + }, + { + "epoch": 0.19, + "grad_norm": 76.9375, + "learning_rate": 9.614299270731293e-07, + "loss": 2.2634, + "step": 2040 + }, + { + "epoch": 0.19, + "grad_norm": 80.9375, + "learning_rate": 9.610517835423238e-07, + "loss": 2.1005, + "step": 2060 + }, + { + "epoch": 0.2, + "grad_norm": 74.5, + "learning_rate": 9.606736400115182e-07, + "loss": 2.1028, + "step": 2080 + }, + { + "epoch": 0.2, + "grad_norm": 76.25, + "learning_rate": 9.602954964807127e-07, + "loss": 2.2869, + "step": 2100 + }, + { + "epoch": 0.2, + "grad_norm": 59.3125, + "learning_rate": 9.59917352949907e-07, + "loss": 2.1469, + "step": 2120 + }, + { + "epoch": 0.2, + "grad_norm": 64.8125, + "learning_rate": 9.595392094191014e-07, + "loss": 2.084, + "step": 2140 + }, + { + "epoch": 0.2, + "grad_norm": 73.125, + "learning_rate": 9.59161065888296e-07, + "loss": 2.1673, + "step": 2160 + }, + { + "epoch": 0.21, + "grad_norm": 161.0, + "learning_rate": 9.587829223574904e-07, + "loss": 2.0762, + "step": 2180 + }, + { + "epoch": 0.21, + "grad_norm": 62.0, + "learning_rate": 9.584047788266849e-07, + "loss": 2.0542, + "step": 2200 + }, + { + "epoch": 0.21, + "grad_norm": 101.0, + "learning_rate": 9.580266352958793e-07, + "loss": 2.0866, + "step": 2220 + }, + { + "epoch": 0.21, + "grad_norm": 66.0, + "learning_rate": 9.576484917650736e-07, + "loss": 2.1058, + "step": 2240 + }, + { + "epoch": 0.21, + "grad_norm": 51.21875, + "learning_rate": 9.572703482342683e-07, + "loss": 2.18, + "step": 2260 + }, + { + "epoch": 0.22, + "grad_norm": 131.125, + "learning_rate": 9.568922047034628e-07, + "loss": 2.0921, + "step": 2280 + }, + { + "epoch": 0.22, + "grad_norm": 68.8125, + "learning_rate": 9.56514061172657e-07, + "loss": 2.1603, + "step": 2300 + }, + { + "epoch": 0.22, + "grad_norm": 106.375, + "learning_rate": 9.561359176418515e-07, + "loss": 2.1403, + "step": 2320 + }, + { + "epoch": 0.22, + "grad_norm": 68.1875, + "learning_rate": 9.55757774111046e-07, + "loss": 2.2704, + "step": 2340 + }, + { + "epoch": 0.22, + "grad_norm": 83.6875, + "learning_rate": 9.553796305802404e-07, + "loss": 2.2066, + "step": 2360 + }, + { + "epoch": 0.22, + "grad_norm": 75.9375, + "learning_rate": 9.55001487049435e-07, + "loss": 2.1317, + "step": 2380 + }, + { + "epoch": 0.23, + "grad_norm": 84.875, + "learning_rate": 9.546233435186292e-07, + "loss": 2.1652, + "step": 2400 + }, + { + "epoch": 0.23, + "grad_norm": 58.65625, + "learning_rate": 9.542451999878237e-07, + "loss": 2.1021, + "step": 2420 + }, + { + "epoch": 0.23, + "grad_norm": 125.0, + "learning_rate": 9.538670564570181e-07, + "loss": 2.1924, + "step": 2440 + }, + { + "epoch": 0.23, + "grad_norm": 56.96875, + "learning_rate": 9.534889129262126e-07, + "loss": 2.1115, + "step": 2460 + }, + { + "epoch": 0.23, + "grad_norm": 56.03125, + "learning_rate": 9.53110769395407e-07, + "loss": 2.0641, + "step": 2480 + }, + { + "epoch": 0.24, + "grad_norm": 83.625, + "learning_rate": 9.527326258646014e-07, + "loss": 2.0102, + "step": 2500 + }, + { + "epoch": 0.24, + "grad_norm": 66.125, + "learning_rate": 9.523544823337959e-07, + "loss": 2.0708, + "step": 2520 + }, + { + "epoch": 0.24, + "grad_norm": 75.75, + "learning_rate": 9.519763388029903e-07, + "loss": 2.0818, + "step": 2540 + }, + { + "epoch": 0.24, + "grad_norm": 128.0, + "learning_rate": 9.515981952721848e-07, + "loss": 2.1258, + "step": 2560 + }, + { + "epoch": 0.24, + "grad_norm": 75.8125, + "learning_rate": 9.512200517413792e-07, + "loss": 2.1426, + "step": 2580 + }, + { + "epoch": 0.25, + "grad_norm": 63.875, + "learning_rate": 9.508419082105737e-07, + "loss": 2.1111, + "step": 2600 + }, + { + "epoch": 0.25, + "grad_norm": 58.75, + "learning_rate": 9.504637646797682e-07, + "loss": 2.0416, + "step": 2620 + }, + { + "epoch": 0.25, + "grad_norm": 55.0625, + "learning_rate": 9.500856211489627e-07, + "loss": 2.0279, + "step": 2640 + }, + { + "epoch": 0.25, + "grad_norm": 56.96875, + "learning_rate": 9.49707477618157e-07, + "loss": 2.0627, + "step": 2660 + }, + { + "epoch": 0.25, + "grad_norm": 55.75, + "learning_rate": 9.493293340873515e-07, + "loss": 2.0918, + "step": 2680 + }, + { + "epoch": 0.26, + "grad_norm": 62.15625, + "learning_rate": 9.48951190556546e-07, + "loss": 2.1226, + "step": 2700 + }, + { + "epoch": 0.26, + "grad_norm": 75.75, + "learning_rate": 9.485730470257403e-07, + "loss": 2.0557, + "step": 2720 + }, + { + "epoch": 0.26, + "grad_norm": 82.25, + "learning_rate": 9.481949034949348e-07, + "loss": 2.1832, + "step": 2740 + }, + { + "epoch": 0.26, + "grad_norm": 54.5625, + "learning_rate": 9.478167599641293e-07, + "loss": 2.0379, + "step": 2760 + }, + { + "epoch": 0.26, + "grad_norm": 54.21875, + "learning_rate": 9.474386164333237e-07, + "loss": 2.0732, + "step": 2780 + }, + { + "epoch": 0.26, + "grad_norm": 61.03125, + "learning_rate": 9.470604729025181e-07, + "loss": 2.1374, + "step": 2800 + }, + { + "epoch": 0.27, + "grad_norm": 94.0625, + "learning_rate": 9.466823293717125e-07, + "loss": 2.0421, + "step": 2820 + }, + { + "epoch": 0.27, + "grad_norm": 55.40625, + "learning_rate": 9.46304185840907e-07, + "loss": 2.0783, + "step": 2840 + }, + { + "epoch": 0.27, + "grad_norm": 117.6875, + "learning_rate": 9.459260423101015e-07, + "loss": 2.1041, + "step": 2860 + }, + { + "epoch": 0.27, + "grad_norm": 70.875, + "learning_rate": 9.455478987792958e-07, + "loss": 2.1819, + "step": 2880 + }, + { + "epoch": 0.27, + "grad_norm": 90.3125, + "learning_rate": 9.451697552484903e-07, + "loss": 2.1047, + "step": 2900 + }, + { + "epoch": 0.28, + "grad_norm": 51.5625, + "learning_rate": 9.447916117176848e-07, + "loss": 2.1001, + "step": 2920 + }, + { + "epoch": 0.28, + "grad_norm": 77.6875, + "learning_rate": 9.444134681868792e-07, + "loss": 2.0817, + "step": 2940 + }, + { + "epoch": 0.28, + "grad_norm": 101.1875, + "learning_rate": 9.440353246560737e-07, + "loss": 2.1162, + "step": 2960 + }, + { + "epoch": 0.28, + "grad_norm": 83.375, + "learning_rate": 9.436571811252682e-07, + "loss": 2.1422, + "step": 2980 + }, + { + "epoch": 0.28, + "grad_norm": 89.8125, + "learning_rate": 9.432790375944626e-07, + "loss": 2.1404, + "step": 3000 + }, + { + "epoch": 0.29, + "grad_norm": 50.78125, + "learning_rate": 9.42900894063657e-07, + "loss": 2.0371, + "step": 3020 + }, + { + "epoch": 0.29, + "grad_norm": 57.25, + "learning_rate": 9.425227505328515e-07, + "loss": 2.0327, + "step": 3040 + }, + { + "epoch": 0.29, + "grad_norm": 112.375, + "learning_rate": 9.421446070020459e-07, + "loss": 2.1609, + "step": 3060 + }, + { + "epoch": 0.29, + "grad_norm": 76.3125, + "learning_rate": 9.417664634712404e-07, + "loss": 2.1167, + "step": 3080 + }, + { + "epoch": 0.29, + "grad_norm": 63.6875, + "learning_rate": 9.413883199404348e-07, + "loss": 2.023, + "step": 3100 + }, + { + "epoch": 0.29, + "grad_norm": 46.6875, + "learning_rate": 9.410101764096292e-07, + "loss": 2.0556, + "step": 3120 + }, + { + "epoch": 0.3, + "grad_norm": 84.9375, + "learning_rate": 9.406320328788237e-07, + "loss": 2.0876, + "step": 3140 + }, + { + "epoch": 0.3, + "grad_norm": 107.5, + "learning_rate": 9.40253889348018e-07, + "loss": 2.087, + "step": 3160 + }, + { + "epoch": 0.3, + "grad_norm": 63.4375, + "learning_rate": 9.398757458172125e-07, + "loss": 2.1845, + "step": 3180 + }, + { + "epoch": 0.3, + "grad_norm": 52.9375, + "learning_rate": 9.39497602286407e-07, + "loss": 2.1219, + "step": 3200 + }, + { + "epoch": 0.3, + "grad_norm": 72.1875, + "learning_rate": 9.391194587556014e-07, + "loss": 2.0844, + "step": 3220 + }, + { + "epoch": 0.31, + "grad_norm": 96.875, + "learning_rate": 9.387413152247958e-07, + "loss": 2.1343, + "step": 3240 + }, + { + "epoch": 0.31, + "grad_norm": 64.375, + "learning_rate": 9.383631716939903e-07, + "loss": 2.0837, + "step": 3260 + }, + { + "epoch": 0.31, + "grad_norm": 82.0625, + "learning_rate": 9.379850281631847e-07, + "loss": 2.1041, + "step": 3280 + }, + { + "epoch": 0.31, + "grad_norm": 70.3125, + "learning_rate": 9.376068846323792e-07, + "loss": 2.1016, + "step": 3300 + }, + { + "epoch": 0.31, + "grad_norm": 70.125, + "learning_rate": 9.372287411015737e-07, + "loss": 2.1038, + "step": 3320 + }, + { + "epoch": 0.32, + "grad_norm": 73.6875, + "learning_rate": 9.368505975707681e-07, + "loss": 2.1319, + "step": 3340 + }, + { + "epoch": 0.32, + "grad_norm": 79.4375, + "learning_rate": 9.364724540399626e-07, + "loss": 1.9609, + "step": 3360 + }, + { + "epoch": 0.32, + "grad_norm": 42.53125, + "learning_rate": 9.36094310509157e-07, + "loss": 2.1016, + "step": 3380 + }, + { + "epoch": 0.32, + "grad_norm": 112.5, + "learning_rate": 9.357161669783514e-07, + "loss": 2.1199, + "step": 3400 + }, + { + "epoch": 0.32, + "grad_norm": 65.625, + "learning_rate": 9.353380234475459e-07, + "loss": 2.0902, + "step": 3420 + }, + { + "epoch": 0.33, + "grad_norm": 138.875, + "learning_rate": 9.349598799167403e-07, + "loss": 2.0669, + "step": 3440 + }, + { + "epoch": 0.33, + "grad_norm": 83.25, + "learning_rate": 9.345817363859347e-07, + "loss": 2.1091, + "step": 3460 + }, + { + "epoch": 0.33, + "grad_norm": 115.3125, + "learning_rate": 9.342035928551292e-07, + "loss": 2.0353, + "step": 3480 + }, + { + "epoch": 0.33, + "grad_norm": 62.71875, + "learning_rate": 9.338254493243236e-07, + "loss": 2.0954, + "step": 3500 + }, + { + "epoch": 0.33, + "grad_norm": 79.1875, + "learning_rate": 9.33447305793518e-07, + "loss": 2.096, + "step": 3520 + }, + { + "epoch": 0.33, + "grad_norm": 158.0, + "learning_rate": 9.330691622627125e-07, + "loss": 2.1737, + "step": 3540 + }, + { + "epoch": 0.34, + "grad_norm": 57.625, + "learning_rate": 9.326910187319069e-07, + "loss": 2.1029, + "step": 3560 + }, + { + "epoch": 0.34, + "grad_norm": 86.3125, + "learning_rate": 9.323128752011014e-07, + "loss": 2.1356, + "step": 3580 + }, + { + "epoch": 0.34, + "grad_norm": 93.3125, + "learning_rate": 9.319347316702958e-07, + "loss": 2.1022, + "step": 3600 + }, + { + "epoch": 0.34, + "grad_norm": 70.0625, + "learning_rate": 9.315565881394902e-07, + "loss": 2.0402, + "step": 3620 + }, + { + "epoch": 0.34, + "grad_norm": 68.6875, + "learning_rate": 9.311784446086848e-07, + "loss": 2.1377, + "step": 3640 + }, + { + "epoch": 0.35, + "grad_norm": 52.15625, + "learning_rate": 9.308003010778793e-07, + "loss": 2.0708, + "step": 3660 + }, + { + "epoch": 0.35, + "grad_norm": 71.625, + "learning_rate": 9.304221575470736e-07, + "loss": 2.1001, + "step": 3680 + }, + { + "epoch": 0.35, + "grad_norm": 62.78125, + "learning_rate": 9.300440140162681e-07, + "loss": 2.1044, + "step": 3700 + }, + { + "epoch": 0.35, + "grad_norm": 98.375, + "learning_rate": 9.296658704854626e-07, + "loss": 2.0937, + "step": 3720 + }, + { + "epoch": 0.35, + "grad_norm": 80.875, + "learning_rate": 9.292877269546569e-07, + "loss": 2.1337, + "step": 3740 + }, + { + "epoch": 0.36, + "grad_norm": 60.9375, + "learning_rate": 9.289095834238514e-07, + "loss": 2.066, + "step": 3760 + }, + { + "epoch": 0.36, + "grad_norm": 58.8125, + "learning_rate": 9.285314398930458e-07, + "loss": 2.0683, + "step": 3780 + }, + { + "epoch": 0.36, + "grad_norm": 78.125, + "learning_rate": 9.281532963622403e-07, + "loss": 1.9853, + "step": 3800 + }, + { + "epoch": 0.36, + "grad_norm": 79.6875, + "learning_rate": 9.277751528314347e-07, + "loss": 2.0792, + "step": 3820 + }, + { + "epoch": 0.36, + "grad_norm": 71.0625, + "learning_rate": 9.273970093006291e-07, + "loss": 2.0544, + "step": 3840 + }, + { + "epoch": 0.36, + "grad_norm": 45.03125, + "learning_rate": 9.270188657698236e-07, + "loss": 2.0586, + "step": 3860 + }, + { + "epoch": 0.37, + "grad_norm": 50.3125, + "learning_rate": 9.26640722239018e-07, + "loss": 2.0716, + "step": 3880 + }, + { + "epoch": 0.37, + "grad_norm": 67.125, + "learning_rate": 9.262625787082124e-07, + "loss": 2.0274, + "step": 3900 + }, + { + "epoch": 0.37, + "grad_norm": 51.84375, + "learning_rate": 9.258844351774069e-07, + "loss": 2.02, + "step": 3920 + }, + { + "epoch": 0.37, + "grad_norm": 56.71875, + "learning_rate": 9.255062916466014e-07, + "loss": 2.1464, + "step": 3940 + }, + { + "epoch": 0.37, + "grad_norm": 63.34375, + "learning_rate": 9.251281481157957e-07, + "loss": 2.08, + "step": 3960 + }, + { + "epoch": 0.38, + "grad_norm": 98.5625, + "learning_rate": 9.247500045849902e-07, + "loss": 2.1212, + "step": 3980 + }, + { + "epoch": 0.38, + "grad_norm": 72.75, + "learning_rate": 9.243718610541848e-07, + "loss": 2.1505, + "step": 4000 + }, + { + "epoch": 0.38, + "grad_norm": 65.625, + "learning_rate": 9.239937175233792e-07, + "loss": 2.0742, + "step": 4020 + }, + { + "epoch": 0.38, + "grad_norm": 90.625, + "learning_rate": 9.236155739925736e-07, + "loss": 2.0463, + "step": 4040 + }, + { + "epoch": 0.38, + "grad_norm": 78.1875, + "learning_rate": 9.23237430461768e-07, + "loss": 2.0287, + "step": 4060 + }, + { + "epoch": 0.39, + "grad_norm": 51.875, + "learning_rate": 9.228592869309625e-07, + "loss": 2.0933, + "step": 4080 + }, + { + "epoch": 0.39, + "grad_norm": 56.78125, + "learning_rate": 9.22481143400157e-07, + "loss": 2.001, + "step": 4100 + }, + { + "epoch": 0.39, + "grad_norm": 51.90625, + "learning_rate": 9.221029998693513e-07, + "loss": 1.9934, + "step": 4120 + }, + { + "epoch": 0.39, + "grad_norm": 55.78125, + "learning_rate": 9.217248563385458e-07, + "loss": 2.0693, + "step": 4140 + }, + { + "epoch": 0.39, + "grad_norm": 103.0625, + "learning_rate": 9.213467128077403e-07, + "loss": 2.0205, + "step": 4160 + }, + { + "epoch": 0.4, + "grad_norm": 54.375, + "learning_rate": 9.209685692769346e-07, + "loss": 2.0219, + "step": 4180 + }, + { + "epoch": 0.4, + "grad_norm": 65.6875, + "learning_rate": 9.205904257461291e-07, + "loss": 2.1043, + "step": 4200 + }, + { + "epoch": 0.4, + "grad_norm": 70.8125, + "learning_rate": 9.202122822153236e-07, + "loss": 2.1014, + "step": 4220 + }, + { + "epoch": 0.4, + "grad_norm": 73.5625, + "learning_rate": 9.19834138684518e-07, + "loss": 1.9973, + "step": 4240 + }, + { + "epoch": 0.4, + "grad_norm": 61.375, + "learning_rate": 9.194559951537124e-07, + "loss": 2.0508, + "step": 4260 + }, + { + "epoch": 0.4, + "grad_norm": 53.6875, + "learning_rate": 9.190778516229069e-07, + "loss": 2.0495, + "step": 4280 + }, + { + "epoch": 0.41, + "grad_norm": 97.5, + "learning_rate": 9.186997080921013e-07, + "loss": 2.1024, + "step": 4300 + }, + { + "epoch": 0.41, + "grad_norm": 80.3125, + "learning_rate": 9.183215645612957e-07, + "loss": 2.1573, + "step": 4320 + }, + { + "epoch": 0.41, + "grad_norm": 57.875, + "learning_rate": 9.179434210304903e-07, + "loss": 2.0675, + "step": 4340 + }, + { + "epoch": 0.41, + "grad_norm": 68.1875, + "learning_rate": 9.175652774996847e-07, + "loss": 2.0606, + "step": 4360 + }, + { + "epoch": 0.41, + "grad_norm": 79.8125, + "learning_rate": 9.171871339688792e-07, + "loss": 1.9876, + "step": 4380 + }, + { + "epoch": 0.42, + "grad_norm": 79.0, + "learning_rate": 9.168089904380735e-07, + "loss": 2.0817, + "step": 4400 + }, + { + "epoch": 0.42, + "grad_norm": 78.0, + "learning_rate": 9.16430846907268e-07, + "loss": 2.0678, + "step": 4420 + }, + { + "epoch": 0.42, + "grad_norm": 90.5625, + "learning_rate": 9.160527033764625e-07, + "loss": 2.1116, + "step": 4440 + }, + { + "epoch": 0.42, + "grad_norm": 103.4375, + "learning_rate": 9.156745598456569e-07, + "loss": 2.1465, + "step": 4460 + }, + { + "epoch": 0.42, + "grad_norm": 91.9375, + "learning_rate": 9.152964163148513e-07, + "loss": 2.2426, + "step": 4480 + }, + { + "epoch": 0.43, + "grad_norm": 80.125, + "learning_rate": 9.149182727840458e-07, + "loss": 2.071, + "step": 4500 + }, + { + "epoch": 0.43, + "grad_norm": 36.9375, + "learning_rate": 9.145401292532402e-07, + "loss": 2.0632, + "step": 4520 + }, + { + "epoch": 0.43, + "grad_norm": 59.96875, + "learning_rate": 9.141619857224346e-07, + "loss": 2.0746, + "step": 4540 + }, + { + "epoch": 0.43, + "grad_norm": 56.65625, + "learning_rate": 9.137838421916291e-07, + "loss": 1.9805, + "step": 4560 + }, + { + "epoch": 0.43, + "grad_norm": 48.625, + "learning_rate": 9.134056986608235e-07, + "loss": 2.0773, + "step": 4580 + }, + { + "epoch": 0.43, + "grad_norm": 70.6875, + "learning_rate": 9.13027555130018e-07, + "loss": 2.1277, + "step": 4600 + }, + { + "epoch": 0.44, + "grad_norm": 56.0625, + "learning_rate": 9.126494115992124e-07, + "loss": 2.0332, + "step": 4620 + }, + { + "epoch": 0.44, + "grad_norm": 52.90625, + "learning_rate": 9.122712680684068e-07, + "loss": 2.0042, + "step": 4640 + }, + { + "epoch": 0.44, + "grad_norm": 64.6875, + "learning_rate": 9.118931245376013e-07, + "loss": 2.0929, + "step": 4660 + }, + { + "epoch": 0.44, + "grad_norm": 46.0, + "learning_rate": 9.115149810067956e-07, + "loss": 2.1159, + "step": 4680 + }, + { + "epoch": 0.44, + "grad_norm": 61.53125, + "learning_rate": 9.111368374759902e-07, + "loss": 2.1719, + "step": 4700 + }, + { + "epoch": 0.45, + "grad_norm": 69.625, + "learning_rate": 9.107586939451847e-07, + "loss": 2.0822, + "step": 4720 + }, + { + "epoch": 0.45, + "grad_norm": 83.5, + "learning_rate": 9.103805504143791e-07, + "loss": 2.0584, + "step": 4740 + }, + { + "epoch": 0.45, + "grad_norm": 81.8125, + "learning_rate": 9.100024068835735e-07, + "loss": 2.0803, + "step": 4760 + }, + { + "epoch": 0.45, + "grad_norm": 81.9375, + "learning_rate": 9.09624263352768e-07, + "loss": 2.0664, + "step": 4780 + }, + { + "epoch": 0.45, + "grad_norm": 97.0625, + "learning_rate": 9.092461198219624e-07, + "loss": 2.0345, + "step": 4800 + }, + { + "epoch": 0.46, + "grad_norm": 63.03125, + "learning_rate": 9.088679762911569e-07, + "loss": 2.1597, + "step": 4820 + }, + { + "epoch": 0.46, + "grad_norm": 73.4375, + "learning_rate": 9.084898327603513e-07, + "loss": 2.0724, + "step": 4840 + }, + { + "epoch": 0.46, + "grad_norm": 67.625, + "learning_rate": 9.081116892295457e-07, + "loss": 1.9974, + "step": 4860 + }, + { + "epoch": 0.46, + "grad_norm": 62.15625, + "learning_rate": 9.077335456987402e-07, + "loss": 2.0378, + "step": 4880 + }, + { + "epoch": 0.46, + "grad_norm": 78.8125, + "learning_rate": 9.073554021679346e-07, + "loss": 2.1548, + "step": 4900 + }, + { + "epoch": 0.47, + "grad_norm": 52.6875, + "learning_rate": 9.06977258637129e-07, + "loss": 2.0852, + "step": 4920 + }, + { + "epoch": 0.47, + "grad_norm": 53.6875, + "learning_rate": 9.065991151063235e-07, + "loss": 2.147, + "step": 4940 + }, + { + "epoch": 0.47, + "grad_norm": 72.0625, + "learning_rate": 9.062209715755179e-07, + "loss": 1.944, + "step": 4960 + }, + { + "epoch": 0.47, + "grad_norm": 112.125, + "learning_rate": 9.058428280447123e-07, + "loss": 2.0855, + "step": 4980 + }, + { + "epoch": 0.47, + "grad_norm": 91.3125, + "learning_rate": 9.054646845139068e-07, + "loss": 1.9843, + "step": 5000 + }, + { + "epoch": 0.47, + "grad_norm": 101.75, + "learning_rate": 9.050865409831012e-07, + "loss": 2.1488, + "step": 5020 + }, + { + "epoch": 0.48, + "grad_norm": 61.4375, + "learning_rate": 9.047083974522958e-07, + "loss": 2.0525, + "step": 5040 + }, + { + "epoch": 0.48, + "grad_norm": 67.5625, + "learning_rate": 9.043302539214902e-07, + "loss": 2.105, + "step": 5060 + }, + { + "epoch": 0.48, + "grad_norm": 86.25, + "learning_rate": 9.039521103906846e-07, + "loss": 2.0717, + "step": 5080 + }, + { + "epoch": 0.48, + "grad_norm": 64.3125, + "learning_rate": 9.035739668598791e-07, + "loss": 2.1015, + "step": 5100 + }, + { + "epoch": 0.48, + "grad_norm": 166.625, + "learning_rate": 9.031958233290735e-07, + "loss": 2.134, + "step": 5120 + }, + { + "epoch": 0.49, + "grad_norm": 68.4375, + "learning_rate": 9.028176797982679e-07, + "loss": 2.0938, + "step": 5140 + }, + { + "epoch": 0.49, + "grad_norm": 77.4375, + "learning_rate": 9.024395362674624e-07, + "loss": 2.0848, + "step": 5160 + }, + { + "epoch": 0.49, + "grad_norm": 102.1875, + "learning_rate": 9.020613927366569e-07, + "loss": 1.9225, + "step": 5180 + }, + { + "epoch": 0.49, + "grad_norm": 68.5625, + "learning_rate": 9.016832492058512e-07, + "loss": 2.0038, + "step": 5200 + }, + { + "epoch": 0.49, + "grad_norm": 55.09375, + "learning_rate": 9.013051056750457e-07, + "loss": 2.1238, + "step": 5220 + }, + { + "epoch": 0.5, + "grad_norm": 65.75, + "learning_rate": 9.009269621442402e-07, + "loss": 2.0999, + "step": 5240 + }, + { + "epoch": 0.5, + "grad_norm": 62.28125, + "learning_rate": 9.005488186134345e-07, + "loss": 2.0663, + "step": 5260 + }, + { + "epoch": 0.5, + "grad_norm": 96.3125, + "learning_rate": 9.00170675082629e-07, + "loss": 2.06, + "step": 5280 + }, + { + "epoch": 0.5, + "grad_norm": 82.5625, + "learning_rate": 8.997925315518234e-07, + "loss": 2.1223, + "step": 5300 + }, + { + "epoch": 0.5, + "grad_norm": 116.3125, + "learning_rate": 8.994143880210179e-07, + "loss": 2.0237, + "step": 5320 + }, + { + "epoch": 0.5, + "grad_norm": 63.875, + "learning_rate": 8.990362444902123e-07, + "loss": 2.0965, + "step": 5340 + }, + { + "epoch": 0.51, + "grad_norm": 51.78125, + "learning_rate": 8.986581009594067e-07, + "loss": 2.0365, + "step": 5360 + }, + { + "epoch": 0.51, + "grad_norm": 67.25, + "learning_rate": 8.982799574286012e-07, + "loss": 2.1012, + "step": 5380 + }, + { + "epoch": 0.51, + "grad_norm": 69.3125, + "learning_rate": 8.979018138977958e-07, + "loss": 1.9336, + "step": 5400 + }, + { + "epoch": 0.51, + "grad_norm": 127.125, + "learning_rate": 8.975236703669901e-07, + "loss": 2.1657, + "step": 5420 + }, + { + "epoch": 0.51, + "grad_norm": 77.4375, + "learning_rate": 8.971455268361846e-07, + "loss": 1.9824, + "step": 5440 + }, + { + "epoch": 0.52, + "grad_norm": 51.28125, + "learning_rate": 8.967673833053791e-07, + "loss": 2.1134, + "step": 5460 + }, + { + "epoch": 0.52, + "grad_norm": 82.4375, + "learning_rate": 8.963892397745734e-07, + "loss": 2.0806, + "step": 5480 + }, + { + "epoch": 0.52, + "grad_norm": 155.875, + "learning_rate": 8.960110962437679e-07, + "loss": 2.13, + "step": 5500 + }, + { + "epoch": 0.52, + "grad_norm": 49.0625, + "learning_rate": 8.956329527129624e-07, + "loss": 2.1733, + "step": 5520 + }, + { + "epoch": 0.52, + "grad_norm": 72.75, + "learning_rate": 8.952548091821568e-07, + "loss": 2.1312, + "step": 5540 + }, + { + "epoch": 0.53, + "grad_norm": 98.0, + "learning_rate": 8.948766656513512e-07, + "loss": 2.0926, + "step": 5560 + }, + { + "epoch": 0.53, + "grad_norm": 47.75, + "learning_rate": 8.944985221205457e-07, + "loss": 2.0788, + "step": 5580 + }, + { + "epoch": 0.53, + "grad_norm": 69.5625, + "learning_rate": 8.941203785897401e-07, + "loss": 2.1218, + "step": 5600 + }, + { + "epoch": 0.53, + "grad_norm": 80.4375, + "learning_rate": 8.937422350589346e-07, + "loss": 2.1038, + "step": 5620 + }, + { + "epoch": 0.53, + "grad_norm": 54.625, + "learning_rate": 8.933640915281289e-07, + "loss": 2.0546, + "step": 5640 + }, + { + "epoch": 0.54, + "grad_norm": 64.375, + "learning_rate": 8.929859479973234e-07, + "loss": 2.0949, + "step": 5660 + }, + { + "epoch": 0.54, + "grad_norm": 69.0, + "learning_rate": 8.926078044665179e-07, + "loss": 2.037, + "step": 5680 + }, + { + "epoch": 0.54, + "grad_norm": 80.75, + "learning_rate": 8.922296609357122e-07, + "loss": 2.1122, + "step": 5700 + }, + { + "epoch": 0.54, + "grad_norm": 81.75, + "learning_rate": 8.918515174049067e-07, + "loss": 2.0332, + "step": 5720 + }, + { + "epoch": 0.54, + "grad_norm": 80.0625, + "learning_rate": 8.914733738741013e-07, + "loss": 1.9553, + "step": 5740 + }, + { + "epoch": 0.54, + "grad_norm": 93.625, + "learning_rate": 8.910952303432957e-07, + "loss": 1.9639, + "step": 5760 + }, + { + "epoch": 0.55, + "grad_norm": 67.875, + "learning_rate": 8.907170868124901e-07, + "loss": 2.0789, + "step": 5780 + }, + { + "epoch": 0.55, + "grad_norm": 71.625, + "learning_rate": 8.903389432816846e-07, + "loss": 2.2702, + "step": 5800 + }, + { + "epoch": 0.55, + "grad_norm": 81.3125, + "learning_rate": 8.89960799750879e-07, + "loss": 2.124, + "step": 5820 + }, + { + "epoch": 0.55, + "grad_norm": 96.6875, + "learning_rate": 8.895826562200735e-07, + "loss": 2.0788, + "step": 5840 + }, + { + "epoch": 0.55, + "grad_norm": 51.375, + "learning_rate": 8.892045126892679e-07, + "loss": 2.1448, + "step": 5860 + }, + { + "epoch": 0.56, + "grad_norm": 69.625, + "learning_rate": 8.888263691584623e-07, + "loss": 2.0283, + "step": 5880 + }, + { + "epoch": 0.56, + "grad_norm": 63.71875, + "learning_rate": 8.884482256276568e-07, + "loss": 2.0592, + "step": 5900 + }, + { + "epoch": 0.56, + "grad_norm": 81.125, + "learning_rate": 8.880700820968511e-07, + "loss": 2.1446, + "step": 5920 + }, + { + "epoch": 0.56, + "grad_norm": 55.84375, + "learning_rate": 8.876919385660456e-07, + "loss": 2.0625, + "step": 5940 + }, + { + "epoch": 0.56, + "grad_norm": 75.375, + "learning_rate": 8.873137950352401e-07, + "loss": 2.0988, + "step": 5960 + }, + { + "epoch": 0.57, + "grad_norm": 68.0, + "learning_rate": 8.869356515044345e-07, + "loss": 2.007, + "step": 5980 + }, + { + "epoch": 0.57, + "grad_norm": 132.875, + "learning_rate": 8.865575079736289e-07, + "loss": 2.103, + "step": 6000 + }, + { + "epoch": 0.57, + "grad_norm": 55.53125, + "learning_rate": 8.861793644428234e-07, + "loss": 1.9625, + "step": 6020 + }, + { + "epoch": 0.57, + "grad_norm": 76.875, + "learning_rate": 8.858012209120178e-07, + "loss": 2.1369, + "step": 6040 + }, + { + "epoch": 0.57, + "grad_norm": 108.0625, + "learning_rate": 8.854230773812122e-07, + "loss": 2.0546, + "step": 6060 + }, + { + "epoch": 0.57, + "grad_norm": 199.625, + "learning_rate": 8.850449338504067e-07, + "loss": 2.1589, + "step": 6080 + }, + { + "epoch": 0.58, + "grad_norm": 58.53125, + "learning_rate": 8.846667903196012e-07, + "loss": 2.0696, + "step": 6100 + }, + { + "epoch": 0.58, + "grad_norm": 52.84375, + "learning_rate": 8.842886467887957e-07, + "loss": 2.0543, + "step": 6120 + }, + { + "epoch": 0.58, + "grad_norm": 76.0625, + "learning_rate": 8.839105032579901e-07, + "loss": 2.0709, + "step": 6140 + }, + { + "epoch": 0.58, + "grad_norm": 86.125, + "learning_rate": 8.835323597271845e-07, + "loss": 2.1248, + "step": 6160 + }, + { + "epoch": 0.58, + "grad_norm": 66.1875, + "learning_rate": 8.83154216196379e-07, + "loss": 2.0078, + "step": 6180 + }, + { + "epoch": 0.59, + "grad_norm": 91.875, + "learning_rate": 8.827760726655735e-07, + "loss": 2.1086, + "step": 6200 + }, + { + "epoch": 0.59, + "grad_norm": 70.75, + "learning_rate": 8.823979291347678e-07, + "loss": 2.0076, + "step": 6220 + }, + { + "epoch": 0.59, + "grad_norm": 58.0, + "learning_rate": 8.820197856039623e-07, + "loss": 1.9816, + "step": 6240 + }, + { + "epoch": 0.59, + "grad_norm": 70.1875, + "learning_rate": 8.816416420731567e-07, + "loss": 2.0527, + "step": 6260 + }, + { + "epoch": 0.59, + "grad_norm": 64.5625, + "learning_rate": 8.812634985423511e-07, + "loss": 2.0384, + "step": 6280 + }, + { + "epoch": 0.6, + "grad_norm": 57.03125, + "learning_rate": 8.808853550115456e-07, + "loss": 2.091, + "step": 6300 + }, + { + "epoch": 0.6, + "grad_norm": 73.0, + "learning_rate": 8.8050721148074e-07, + "loss": 2.0033, + "step": 6320 + }, + { + "epoch": 0.6, + "grad_norm": 61.90625, + "learning_rate": 8.801290679499345e-07, + "loss": 2.0876, + "step": 6340 + }, + { + "epoch": 0.6, + "grad_norm": 56.65625, + "learning_rate": 8.797509244191289e-07, + "loss": 2.1797, + "step": 6360 + }, + { + "epoch": 0.6, + "grad_norm": 72.5625, + "learning_rate": 8.793727808883233e-07, + "loss": 2.0717, + "step": 6380 + }, + { + "epoch": 0.61, + "grad_norm": 149.375, + "learning_rate": 8.789946373575178e-07, + "loss": 2.0591, + "step": 6400 + }, + { + "epoch": 0.61, + "grad_norm": 109.8125, + "learning_rate": 8.786164938267123e-07, + "loss": 2.0082, + "step": 6420 + }, + { + "epoch": 0.61, + "grad_norm": 62.03125, + "learning_rate": 8.782383502959067e-07, + "loss": 2.0883, + "step": 6440 + }, + { + "epoch": 0.61, + "grad_norm": 63.96875, + "learning_rate": 8.778602067651012e-07, + "loss": 2.0119, + "step": 6460 + }, + { + "epoch": 0.61, + "grad_norm": 114.875, + "learning_rate": 8.774820632342957e-07, + "loss": 2.1563, + "step": 6480 + }, + { + "epoch": 0.61, + "grad_norm": 62.15625, + "learning_rate": 8.7710391970349e-07, + "loss": 2.0736, + "step": 6500 + }, + { + "epoch": 0.62, + "grad_norm": 79.125, + "learning_rate": 8.767257761726845e-07, + "loss": 2.0764, + "step": 6520 + }, + { + "epoch": 0.62, + "grad_norm": 63.5625, + "learning_rate": 8.76347632641879e-07, + "loss": 2.0432, + "step": 6540 + }, + { + "epoch": 0.62, + "grad_norm": 55.15625, + "learning_rate": 8.759694891110734e-07, + "loss": 2.0265, + "step": 6560 + }, + { + "epoch": 0.62, + "grad_norm": 66.875, + "learning_rate": 8.755913455802678e-07, + "loss": 1.9963, + "step": 6580 + }, + { + "epoch": 0.62, + "grad_norm": 82.5625, + "learning_rate": 8.752132020494622e-07, + "loss": 1.984, + "step": 6600 + }, + { + "epoch": 0.63, + "grad_norm": 64.5, + "learning_rate": 8.748350585186567e-07, + "loss": 2.0488, + "step": 6620 + }, + { + "epoch": 0.63, + "grad_norm": 81.4375, + "learning_rate": 8.744569149878512e-07, + "loss": 2.1111, + "step": 6640 + }, + { + "epoch": 0.63, + "grad_norm": 74.6875, + "learning_rate": 8.740787714570455e-07, + "loss": 2.0399, + "step": 6660 + }, + { + "epoch": 0.63, + "grad_norm": 70.3125, + "learning_rate": 8.7370062792624e-07, + "loss": 1.9512, + "step": 6680 + }, + { + "epoch": 0.63, + "grad_norm": 166.0, + "learning_rate": 8.733224843954345e-07, + "loss": 2.0779, + "step": 6700 + }, + { + "epoch": 0.64, + "grad_norm": 71.125, + "learning_rate": 8.729443408646288e-07, + "loss": 2.0487, + "step": 6720 + }, + { + "epoch": 0.64, + "grad_norm": 119.8125, + "learning_rate": 8.725661973338233e-07, + "loss": 2.1232, + "step": 6740 + }, + { + "epoch": 0.64, + "grad_norm": 104.5625, + "learning_rate": 8.721880538030178e-07, + "loss": 1.9915, + "step": 6760 + }, + { + "epoch": 0.64, + "grad_norm": 74.125, + "learning_rate": 8.718099102722122e-07, + "loss": 2.1389, + "step": 6780 + }, + { + "epoch": 0.64, + "grad_norm": 53.90625, + "learning_rate": 8.714317667414067e-07, + "loss": 2.1114, + "step": 6800 + }, + { + "epoch": 0.64, + "grad_norm": 67.125, + "learning_rate": 8.710536232106012e-07, + "loss": 1.9978, + "step": 6820 + }, + { + "epoch": 0.65, + "grad_norm": 69.125, + "learning_rate": 8.706754796797956e-07, + "loss": 2.1736, + "step": 6840 + }, + { + "epoch": 0.65, + "grad_norm": 61.0625, + "learning_rate": 8.702973361489901e-07, + "loss": 2.1067, + "step": 6860 + }, + { + "epoch": 0.65, + "grad_norm": 74.1875, + "learning_rate": 8.699191926181844e-07, + "loss": 1.9871, + "step": 6880 + }, + { + "epoch": 0.65, + "grad_norm": 61.78125, + "learning_rate": 8.695410490873789e-07, + "loss": 1.9737, + "step": 6900 + }, + { + "epoch": 0.65, + "grad_norm": 66.625, + "learning_rate": 8.691629055565734e-07, + "loss": 2.0105, + "step": 6920 + }, + { + "epoch": 0.66, + "grad_norm": 68.6875, + "learning_rate": 8.687847620257677e-07, + "loss": 2.026, + "step": 6940 + }, + { + "epoch": 0.66, + "grad_norm": 49.65625, + "learning_rate": 8.684066184949622e-07, + "loss": 2.1013, + "step": 6960 + }, + { + "epoch": 0.66, + "grad_norm": 83.1875, + "learning_rate": 8.680284749641567e-07, + "loss": 2.0245, + "step": 6980 + }, + { + "epoch": 0.66, + "grad_norm": 86.9375, + "learning_rate": 8.676503314333511e-07, + "loss": 1.9986, + "step": 7000 + }, + { + "epoch": 0.66, + "grad_norm": 56.59375, + "learning_rate": 8.672721879025455e-07, + "loss": 2.0128, + "step": 7020 + }, + { + "epoch": 0.67, + "grad_norm": 63.90625, + "learning_rate": 8.6689404437174e-07, + "loss": 2.0212, + "step": 7040 + }, + { + "epoch": 0.67, + "grad_norm": 76.75, + "learning_rate": 8.665159008409344e-07, + "loss": 2.077, + "step": 7060 + }, + { + "epoch": 0.67, + "grad_norm": 84.3125, + "learning_rate": 8.661377573101288e-07, + "loss": 2.075, + "step": 7080 + }, + { + "epoch": 0.67, + "grad_norm": 99.0, + "learning_rate": 8.657596137793233e-07, + "loss": 2.1219, + "step": 7100 + }, + { + "epoch": 0.67, + "grad_norm": 112.75, + "learning_rate": 8.653814702485177e-07, + "loss": 1.8914, + "step": 7120 + }, + { + "epoch": 0.67, + "grad_norm": 83.1875, + "learning_rate": 8.650033267177123e-07, + "loss": 2.091, + "step": 7140 + }, + { + "epoch": 0.68, + "grad_norm": 57.65625, + "learning_rate": 8.646251831869067e-07, + "loss": 2.1047, + "step": 7160 + }, + { + "epoch": 0.68, + "grad_norm": 70.0625, + "learning_rate": 8.642470396561011e-07, + "loss": 2.0086, + "step": 7180 + }, + { + "epoch": 0.68, + "grad_norm": 60.40625, + "learning_rate": 8.638688961252956e-07, + "loss": 2.0193, + "step": 7200 + }, + { + "epoch": 0.68, + "grad_norm": 124.125, + "learning_rate": 8.6349075259449e-07, + "loss": 2.0588, + "step": 7220 + }, + { + "epoch": 0.68, + "grad_norm": 49.28125, + "learning_rate": 8.631126090636844e-07, + "loss": 1.9948, + "step": 7240 + }, + { + "epoch": 0.69, + "grad_norm": 71.75, + "learning_rate": 8.627344655328789e-07, + "loss": 2.0766, + "step": 7260 + }, + { + "epoch": 0.69, + "grad_norm": 59.9375, + "learning_rate": 8.623563220020733e-07, + "loss": 1.9796, + "step": 7280 + }, + { + "epoch": 0.69, + "grad_norm": 61.46875, + "learning_rate": 8.619781784712677e-07, + "loss": 2.213, + "step": 7300 + }, + { + "epoch": 0.69, + "grad_norm": 52.40625, + "learning_rate": 8.616000349404622e-07, + "loss": 2.1209, + "step": 7320 + }, + { + "epoch": 0.69, + "grad_norm": 86.25, + "learning_rate": 8.612218914096566e-07, + "loss": 1.9509, + "step": 7340 + }, + { + "epoch": 0.7, + "grad_norm": 78.0, + "learning_rate": 8.608437478788511e-07, + "loss": 1.9703, + "step": 7360 + }, + { + "epoch": 0.7, + "grad_norm": 62.8125, + "learning_rate": 8.604656043480455e-07, + "loss": 1.9881, + "step": 7380 + }, + { + "epoch": 0.7, + "grad_norm": 125.875, + "learning_rate": 8.600874608172399e-07, + "loss": 1.985, + "step": 7400 + }, + { + "epoch": 0.7, + "grad_norm": 73.375, + "learning_rate": 8.597093172864344e-07, + "loss": 2.0481, + "step": 7420 + }, + { + "epoch": 0.7, + "grad_norm": 109.1875, + "learning_rate": 8.593311737556289e-07, + "loss": 1.9969, + "step": 7440 + }, + { + "epoch": 0.71, + "grad_norm": 59.46875, + "learning_rate": 8.589530302248232e-07, + "loss": 1.9769, + "step": 7460 + }, + { + "epoch": 0.71, + "grad_norm": 49.71875, + "learning_rate": 8.585748866940178e-07, + "loss": 2.0271, + "step": 7480 + }, + { + "epoch": 0.71, + "grad_norm": 85.125, + "learning_rate": 8.581967431632122e-07, + "loss": 2.0354, + "step": 7500 + }, + { + "epoch": 0.71, + "grad_norm": 58.8125, + "learning_rate": 8.578185996324066e-07, + "loss": 2.0471, + "step": 7520 + }, + { + "epoch": 0.71, + "grad_norm": 90.25, + "learning_rate": 8.574404561016011e-07, + "loss": 2.0257, + "step": 7540 + }, + { + "epoch": 0.71, + "grad_norm": 68.5, + "learning_rate": 8.570623125707955e-07, + "loss": 2.0477, + "step": 7560 + }, + { + "epoch": 0.72, + "grad_norm": 73.875, + "learning_rate": 8.5668416903999e-07, + "loss": 2.1175, + "step": 7580 + }, + { + "epoch": 0.72, + "grad_norm": 62.1875, + "learning_rate": 8.563060255091844e-07, + "loss": 2.0251, + "step": 7600 + }, + { + "epoch": 0.72, + "grad_norm": 80.625, + "learning_rate": 8.559278819783788e-07, + "loss": 2.0128, + "step": 7620 + }, + { + "epoch": 0.72, + "grad_norm": 57.25, + "learning_rate": 8.555497384475733e-07, + "loss": 2.0853, + "step": 7640 + }, + { + "epoch": 0.72, + "grad_norm": 95.375, + "learning_rate": 8.551715949167678e-07, + "loss": 2.0591, + "step": 7660 + }, + { + "epoch": 0.73, + "grad_norm": 53.625, + "learning_rate": 8.547934513859621e-07, + "loss": 2.081, + "step": 7680 + }, + { + "epoch": 0.73, + "grad_norm": 74.875, + "learning_rate": 8.544153078551566e-07, + "loss": 2.0432, + "step": 7700 + }, + { + "epoch": 0.73, + "grad_norm": 78.3125, + "learning_rate": 8.540371643243511e-07, + "loss": 1.9401, + "step": 7720 + }, + { + "epoch": 0.73, + "grad_norm": 112.75, + "learning_rate": 8.536590207935454e-07, + "loss": 2.1571, + "step": 7740 + }, + { + "epoch": 0.73, + "grad_norm": 56.65625, + "learning_rate": 8.532808772627399e-07, + "loss": 2.0862, + "step": 7760 + }, + { + "epoch": 0.74, + "grad_norm": 113.1875, + "learning_rate": 8.529027337319343e-07, + "loss": 2.0632, + "step": 7780 + }, + { + "epoch": 0.74, + "grad_norm": 94.3125, + "learning_rate": 8.525245902011288e-07, + "loss": 2.0372, + "step": 7800 + }, + { + "epoch": 0.74, + "grad_norm": 51.125, + "learning_rate": 8.521464466703232e-07, + "loss": 2.0205, + "step": 7820 + }, + { + "epoch": 0.74, + "grad_norm": 54.28125, + "learning_rate": 8.517683031395177e-07, + "loss": 1.9503, + "step": 7840 + }, + { + "epoch": 0.74, + "grad_norm": 60.625, + "learning_rate": 8.513901596087122e-07, + "loss": 2.1562, + "step": 7860 + }, + { + "epoch": 0.74, + "grad_norm": 64.75, + "learning_rate": 8.510120160779067e-07, + "loss": 2.0641, + "step": 7880 + }, + { + "epoch": 0.75, + "grad_norm": 62.59375, + "learning_rate": 8.50633872547101e-07, + "loss": 2.0897, + "step": 7900 + }, + { + "epoch": 0.75, + "grad_norm": 99.5625, + "learning_rate": 8.502557290162955e-07, + "loss": 2.0611, + "step": 7920 + }, + { + "epoch": 0.75, + "grad_norm": 48.28125, + "learning_rate": 8.4987758548549e-07, + "loss": 2.0871, + "step": 7940 + }, + { + "epoch": 0.75, + "grad_norm": 55.5, + "learning_rate": 8.494994419546843e-07, + "loss": 1.979, + "step": 7960 + }, + { + "epoch": 0.75, + "grad_norm": 70.125, + "learning_rate": 8.491212984238788e-07, + "loss": 2.1194, + "step": 7980 + }, + { + "epoch": 0.76, + "grad_norm": 83.3125, + "learning_rate": 8.487431548930733e-07, + "loss": 2.0838, + "step": 8000 + }, + { + "epoch": 0.76, + "grad_norm": 65.25, + "learning_rate": 8.483650113622677e-07, + "loss": 2.079, + "step": 8020 + }, + { + "epoch": 0.76, + "grad_norm": 68.0625, + "learning_rate": 8.479868678314621e-07, + "loss": 2.0234, + "step": 8040 + }, + { + "epoch": 0.76, + "grad_norm": 64.8125, + "learning_rate": 8.476087243006566e-07, + "loss": 2.0205, + "step": 8060 + }, + { + "epoch": 0.76, + "grad_norm": 56.0, + "learning_rate": 8.47230580769851e-07, + "loss": 1.975, + "step": 8080 + }, + { + "epoch": 0.77, + "grad_norm": 81.625, + "learning_rate": 8.468524372390454e-07, + "loss": 2.1159, + "step": 8100 + }, + { + "epoch": 0.77, + "grad_norm": 53.21875, + "learning_rate": 8.464742937082398e-07, + "loss": 2.0243, + "step": 8120 + }, + { + "epoch": 0.77, + "grad_norm": 88.625, + "learning_rate": 8.460961501774343e-07, + "loss": 1.9559, + "step": 8140 + }, + { + "epoch": 0.77, + "grad_norm": 93.3125, + "learning_rate": 8.457180066466288e-07, + "loss": 2.1196, + "step": 8160 + }, + { + "epoch": 0.77, + "grad_norm": 61.09375, + "learning_rate": 8.453398631158232e-07, + "loss": 1.896, + "step": 8180 + }, + { + "epoch": 0.78, + "grad_norm": 46.9375, + "learning_rate": 8.449617195850177e-07, + "loss": 2.0552, + "step": 8200 + }, + { + "epoch": 0.78, + "grad_norm": 84.125, + "learning_rate": 8.445835760542122e-07, + "loss": 2.0865, + "step": 8220 + }, + { + "epoch": 0.78, + "grad_norm": 48.53125, + "learning_rate": 8.442054325234066e-07, + "loss": 2.049, + "step": 8240 + }, + { + "epoch": 0.78, + "grad_norm": 70.9375, + "learning_rate": 8.43827288992601e-07, + "loss": 2.0403, + "step": 8260 + }, + { + "epoch": 0.78, + "grad_norm": 65.6875, + "learning_rate": 8.434491454617955e-07, + "loss": 2.1368, + "step": 8280 + }, + { + "epoch": 0.78, + "grad_norm": 77.6875, + "learning_rate": 8.430710019309899e-07, + "loss": 1.979, + "step": 8300 + }, + { + "epoch": 0.79, + "grad_norm": 70.0, + "learning_rate": 8.426928584001843e-07, + "loss": 1.8971, + "step": 8320 + }, + { + "epoch": 0.79, + "grad_norm": 168.875, + "learning_rate": 8.423147148693788e-07, + "loss": 1.999, + "step": 8340 + }, + { + "epoch": 0.79, + "grad_norm": 67.8125, + "learning_rate": 8.419365713385732e-07, + "loss": 1.9871, + "step": 8360 + }, + { + "epoch": 0.79, + "grad_norm": 66.6875, + "learning_rate": 8.415584278077677e-07, + "loss": 1.867, + "step": 8380 + }, + { + "epoch": 0.79, + "grad_norm": 63.625, + "learning_rate": 8.41180284276962e-07, + "loss": 1.9582, + "step": 8400 + }, + { + "epoch": 0.8, + "grad_norm": 63.28125, + "learning_rate": 8.408021407461565e-07, + "loss": 1.9924, + "step": 8420 + }, + { + "epoch": 0.8, + "grad_norm": 62.40625, + "learning_rate": 8.40423997215351e-07, + "loss": 1.9955, + "step": 8440 + }, + { + "epoch": 0.8, + "grad_norm": 48.65625, + "learning_rate": 8.400458536845453e-07, + "loss": 2.0825, + "step": 8460 + }, + { + "epoch": 0.8, + "grad_norm": 75.9375, + "learning_rate": 8.396677101537398e-07, + "loss": 1.9676, + "step": 8480 + }, + { + "epoch": 0.8, + "grad_norm": 107.1875, + "learning_rate": 8.392895666229343e-07, + "loss": 1.9956, + "step": 8500 + }, + { + "epoch": 0.81, + "grad_norm": 52.5625, + "learning_rate": 8.389114230921287e-07, + "loss": 2.0818, + "step": 8520 + }, + { + "epoch": 0.81, + "grad_norm": 74.625, + "learning_rate": 8.385332795613232e-07, + "loss": 1.9427, + "step": 8540 + }, + { + "epoch": 0.81, + "grad_norm": 70.125, + "learning_rate": 8.381551360305177e-07, + "loss": 2.1569, + "step": 8560 + }, + { + "epoch": 0.81, + "grad_norm": 64.8125, + "learning_rate": 8.377769924997121e-07, + "loss": 2.0716, + "step": 8580 + }, + { + "epoch": 0.81, + "grad_norm": 59.875, + "learning_rate": 8.373988489689066e-07, + "loss": 2.06, + "step": 8600 + }, + { + "epoch": 0.81, + "grad_norm": 86.1875, + "learning_rate": 8.37020705438101e-07, + "loss": 2.0076, + "step": 8620 + }, + { + "epoch": 0.82, + "grad_norm": 49.8125, + "learning_rate": 8.366425619072954e-07, + "loss": 2.0739, + "step": 8640 + }, + { + "epoch": 0.82, + "grad_norm": 96.1875, + "learning_rate": 8.362644183764899e-07, + "loss": 2.0958, + "step": 8660 + }, + { + "epoch": 0.82, + "grad_norm": 81.9375, + "learning_rate": 8.358862748456844e-07, + "loss": 2.0505, + "step": 8680 + }, + { + "epoch": 0.82, + "grad_norm": 72.4375, + "learning_rate": 8.355081313148787e-07, + "loss": 2.0513, + "step": 8700 + }, + { + "epoch": 0.82, + "grad_norm": 65.125, + "learning_rate": 8.351299877840732e-07, + "loss": 1.9823, + "step": 8720 + }, + { + "epoch": 0.83, + "grad_norm": 60.28125, + "learning_rate": 8.347518442532676e-07, + "loss": 1.8811, + "step": 8740 + }, + { + "epoch": 0.83, + "grad_norm": 57.125, + "learning_rate": 8.34373700722462e-07, + "loss": 2.0142, + "step": 8760 + }, + { + "epoch": 0.83, + "grad_norm": 90.625, + "learning_rate": 8.339955571916565e-07, + "loss": 1.9925, + "step": 8780 + }, + { + "epoch": 0.83, + "grad_norm": 71.4375, + "learning_rate": 8.336174136608509e-07, + "loss": 2.1022, + "step": 8800 + }, + { + "epoch": 0.83, + "grad_norm": 95.5625, + "learning_rate": 8.332392701300454e-07, + "loss": 1.9648, + "step": 8820 + }, + { + "epoch": 0.84, + "grad_norm": 103.9375, + "learning_rate": 8.328611265992398e-07, + "loss": 2.0242, + "step": 8840 + }, + { + "epoch": 0.84, + "grad_norm": 54.96875, + "learning_rate": 8.324829830684342e-07, + "loss": 1.9736, + "step": 8860 + }, + { + "epoch": 0.84, + "grad_norm": 53.0, + "learning_rate": 8.321048395376288e-07, + "loss": 2.0328, + "step": 8880 + }, + { + "epoch": 0.84, + "grad_norm": 61.21875, + "learning_rate": 8.317266960068233e-07, + "loss": 1.9957, + "step": 8900 + }, + { + "epoch": 0.84, + "grad_norm": 51.09375, + "learning_rate": 8.313485524760176e-07, + "loss": 1.9611, + "step": 8920 + }, + { + "epoch": 0.85, + "grad_norm": 48.6875, + "learning_rate": 8.309704089452121e-07, + "loss": 1.9684, + "step": 8940 + }, + { + "epoch": 0.85, + "grad_norm": 61.90625, + "learning_rate": 8.305922654144066e-07, + "loss": 1.989, + "step": 8960 + }, + { + "epoch": 0.85, + "grad_norm": 54.28125, + "learning_rate": 8.302141218836009e-07, + "loss": 1.9718, + "step": 8980 + }, + { + "epoch": 0.85, + "grad_norm": 90.625, + "learning_rate": 8.298359783527954e-07, + "loss": 2.0448, + "step": 9000 + }, + { + "epoch": 0.85, + "grad_norm": 98.4375, + "learning_rate": 8.294578348219899e-07, + "loss": 1.9535, + "step": 9020 + }, + { + "epoch": 0.85, + "grad_norm": 106.625, + "learning_rate": 8.290796912911843e-07, + "loss": 1.9675, + "step": 9040 + }, + { + "epoch": 0.86, + "grad_norm": 67.5625, + "learning_rate": 8.287015477603787e-07, + "loss": 2.0108, + "step": 9060 + }, + { + "epoch": 0.86, + "grad_norm": 47.28125, + "learning_rate": 8.283234042295731e-07, + "loss": 2.0104, + "step": 9080 + }, + { + "epoch": 0.86, + "grad_norm": 57.34375, + "learning_rate": 8.279452606987676e-07, + "loss": 2.0557, + "step": 9100 + }, + { + "epoch": 0.86, + "grad_norm": 95.5, + "learning_rate": 8.27567117167962e-07, + "loss": 1.9992, + "step": 9120 + }, + { + "epoch": 0.86, + "grad_norm": 56.03125, + "learning_rate": 8.271889736371564e-07, + "loss": 1.9351, + "step": 9140 + }, + { + "epoch": 0.87, + "grad_norm": 121.6875, + "learning_rate": 8.268108301063509e-07, + "loss": 2.049, + "step": 9160 + }, + { + "epoch": 0.87, + "grad_norm": 65.25, + "learning_rate": 8.264326865755454e-07, + "loss": 2.0567, + "step": 9180 + }, + { + "epoch": 0.87, + "grad_norm": 63.53125, + "learning_rate": 8.260545430447397e-07, + "loss": 2.1084, + "step": 9200 + }, + { + "epoch": 0.87, + "grad_norm": 62.3125, + "learning_rate": 8.256763995139342e-07, + "loss": 2.0061, + "step": 9220 + }, + { + "epoch": 0.87, + "grad_norm": 78.4375, + "learning_rate": 8.252982559831288e-07, + "loss": 2.0693, + "step": 9240 + }, + { + "epoch": 0.88, + "grad_norm": 61.6875, + "learning_rate": 8.249201124523232e-07, + "loss": 1.9503, + "step": 9260 + }, + { + "epoch": 0.88, + "grad_norm": 67.1875, + "learning_rate": 8.245419689215176e-07, + "loss": 2.061, + "step": 9280 + }, + { + "epoch": 0.88, + "grad_norm": 128.5, + "learning_rate": 8.241638253907121e-07, + "loss": 2.0328, + "step": 9300 + }, + { + "epoch": 0.88, + "grad_norm": 61.8125, + "learning_rate": 8.237856818599065e-07, + "loss": 1.9776, + "step": 9320 + }, + { + "epoch": 0.88, + "grad_norm": 79.375, + "learning_rate": 8.234075383291009e-07, + "loss": 1.9705, + "step": 9340 + }, + { + "epoch": 0.88, + "grad_norm": 52.1875, + "learning_rate": 8.230293947982953e-07, + "loss": 2.1075, + "step": 9360 + }, + { + "epoch": 0.89, + "grad_norm": 54.6875, + "learning_rate": 8.226512512674898e-07, + "loss": 1.9804, + "step": 9380 + }, + { + "epoch": 0.89, + "grad_norm": 60.125, + "learning_rate": 8.222731077366843e-07, + "loss": 2.0447, + "step": 9400 + }, + { + "epoch": 0.89, + "grad_norm": 57.46875, + "learning_rate": 8.218949642058786e-07, + "loss": 2.0118, + "step": 9420 + }, + { + "epoch": 0.89, + "grad_norm": 59.78125, + "learning_rate": 8.215168206750731e-07, + "loss": 2.0065, + "step": 9440 + }, + { + "epoch": 0.89, + "grad_norm": 61.6875, + "learning_rate": 8.211386771442676e-07, + "loss": 1.9687, + "step": 9460 + }, + { + "epoch": 0.9, + "grad_norm": 60.84375, + "learning_rate": 8.207605336134619e-07, + "loss": 2.0927, + "step": 9480 + }, + { + "epoch": 0.9, + "grad_norm": 88.25, + "learning_rate": 8.203823900826564e-07, + "loss": 2.0385, + "step": 9500 + }, + { + "epoch": 0.9, + "grad_norm": 58.6875, + "learning_rate": 8.200042465518509e-07, + "loss": 2.0446, + "step": 9520 + }, + { + "epoch": 0.9, + "grad_norm": 65.375, + "learning_rate": 8.196261030210453e-07, + "loss": 2.1188, + "step": 9540 + }, + { + "epoch": 0.9, + "grad_norm": 58.46875, + "learning_rate": 8.192479594902397e-07, + "loss": 1.9974, + "step": 9560 + }, + { + "epoch": 0.91, + "grad_norm": 49.59375, + "learning_rate": 8.188698159594343e-07, + "loss": 1.9911, + "step": 9580 + }, + { + "epoch": 0.91, + "grad_norm": 75.1875, + "learning_rate": 8.184916724286287e-07, + "loss": 1.9578, + "step": 9600 + }, + { + "epoch": 0.91, + "grad_norm": 71.4375, + "learning_rate": 8.181135288978232e-07, + "loss": 2.0356, + "step": 9620 + }, + { + "epoch": 0.91, + "grad_norm": 63.9375, + "learning_rate": 8.177353853670176e-07, + "loss": 2.0255, + "step": 9640 + }, + { + "epoch": 0.91, + "grad_norm": 50.125, + "learning_rate": 8.17357241836212e-07, + "loss": 1.9512, + "step": 9660 + }, + { + "epoch": 0.92, + "grad_norm": 118.0, + "learning_rate": 8.169790983054065e-07, + "loss": 2.0679, + "step": 9680 + }, + { + "epoch": 0.92, + "grad_norm": 132.625, + "learning_rate": 8.166009547746008e-07, + "loss": 2.0188, + "step": 9700 + }, + { + "epoch": 0.92, + "grad_norm": 73.0, + "learning_rate": 8.162228112437953e-07, + "loss": 1.9632, + "step": 9720 + }, + { + "epoch": 0.92, + "grad_norm": 66.125, + "learning_rate": 8.158446677129898e-07, + "loss": 1.9628, + "step": 9740 + }, + { + "epoch": 0.92, + "grad_norm": 62.15625, + "learning_rate": 8.154665241821842e-07, + "loss": 1.9865, + "step": 9760 + }, + { + "epoch": 0.92, + "grad_norm": 56.28125, + "learning_rate": 8.150883806513786e-07, + "loss": 2.1228, + "step": 9780 + }, + { + "epoch": 0.93, + "grad_norm": 58.5625, + "learning_rate": 8.147102371205731e-07, + "loss": 1.9204, + "step": 9800 + }, + { + "epoch": 0.93, + "grad_norm": 88.125, + "learning_rate": 8.143320935897675e-07, + "loss": 1.989, + "step": 9820 + }, + { + "epoch": 0.93, + "grad_norm": 100.5625, + "learning_rate": 8.13953950058962e-07, + "loss": 1.9791, + "step": 9840 + }, + { + "epoch": 0.93, + "grad_norm": 66.375, + "learning_rate": 8.135758065281564e-07, + "loss": 2.0461, + "step": 9860 + }, + { + "epoch": 0.93, + "grad_norm": 80.75, + "learning_rate": 8.131976629973508e-07, + "loss": 2.0083, + "step": 9880 + }, + { + "epoch": 0.94, + "grad_norm": 52.34375, + "learning_rate": 8.128195194665453e-07, + "loss": 1.9767, + "step": 9900 + }, + { + "epoch": 0.94, + "grad_norm": 92.6875, + "learning_rate": 8.124413759357397e-07, + "loss": 2.0318, + "step": 9920 + }, + { + "epoch": 0.94, + "grad_norm": 70.625, + "learning_rate": 8.120632324049342e-07, + "loss": 2.0566, + "step": 9940 + }, + { + "epoch": 0.94, + "grad_norm": 86.9375, + "learning_rate": 8.116850888741287e-07, + "loss": 2.0286, + "step": 9960 + }, + { + "epoch": 0.94, + "grad_norm": 65.875, + "learning_rate": 8.113069453433232e-07, + "loss": 1.9629, + "step": 9980 + }, + { + "epoch": 0.95, + "grad_norm": 90.6875, + "learning_rate": 8.109288018125175e-07, + "loss": 1.9711, + "step": 10000 + }, + { + "epoch": 0.95, + "grad_norm": 52.125, + "learning_rate": 8.10550658281712e-07, + "loss": 1.9985, + "step": 10020 + }, + { + "epoch": 0.95, + "grad_norm": 66.25, + "learning_rate": 8.101725147509064e-07, + "loss": 2.0052, + "step": 10040 + }, + { + "epoch": 0.95, + "grad_norm": 65.375, + "learning_rate": 8.097943712201009e-07, + "loss": 1.9346, + "step": 10060 + }, + { + "epoch": 0.95, + "grad_norm": 53.03125, + "learning_rate": 8.094162276892953e-07, + "loss": 1.984, + "step": 10080 + }, + { + "epoch": 0.95, + "grad_norm": 110.25, + "learning_rate": 8.090380841584897e-07, + "loss": 2.0204, + "step": 10100 + }, + { + "epoch": 0.96, + "grad_norm": 59.03125, + "learning_rate": 8.086599406276842e-07, + "loss": 2.0792, + "step": 10120 + }, + { + "epoch": 0.96, + "grad_norm": 49.8125, + "learning_rate": 8.082817970968786e-07, + "loss": 2.0051, + "step": 10140 + }, + { + "epoch": 0.96, + "grad_norm": 61.1875, + "learning_rate": 8.07903653566073e-07, + "loss": 2.0679, + "step": 10160 + }, + { + "epoch": 0.96, + "grad_norm": 93.3125, + "learning_rate": 8.075255100352675e-07, + "loss": 2.099, + "step": 10180 + }, + { + "epoch": 0.96, + "grad_norm": 79.6875, + "learning_rate": 8.07147366504462e-07, + "loss": 2.1754, + "step": 10200 + }, + { + "epoch": 0.97, + "grad_norm": 68.875, + "learning_rate": 8.067692229736563e-07, + "loss": 2.0048, + "step": 10220 + }, + { + "epoch": 0.97, + "grad_norm": 72.625, + "learning_rate": 8.063910794428508e-07, + "loss": 1.9806, + "step": 10240 + }, + { + "epoch": 0.97, + "grad_norm": 98.1875, + "learning_rate": 8.060129359120452e-07, + "loss": 1.9797, + "step": 10260 + }, + { + "epoch": 0.97, + "grad_norm": 67.0625, + "learning_rate": 8.056347923812397e-07, + "loss": 2.0226, + "step": 10280 + }, + { + "epoch": 0.97, + "grad_norm": 55.40625, + "learning_rate": 8.052566488504342e-07, + "loss": 2.0109, + "step": 10300 + }, + { + "epoch": 0.98, + "grad_norm": 107.125, + "learning_rate": 8.048785053196286e-07, + "loss": 1.8523, + "step": 10320 + }, + { + "epoch": 0.98, + "grad_norm": 68.5625, + "learning_rate": 8.045003617888231e-07, + "loss": 1.9492, + "step": 10340 + }, + { + "epoch": 0.98, + "grad_norm": 77.0, + "learning_rate": 8.041222182580175e-07, + "loss": 1.9863, + "step": 10360 + }, + { + "epoch": 0.98, + "grad_norm": 76.5625, + "learning_rate": 8.037440747272119e-07, + "loss": 1.9792, + "step": 10380 + }, + { + "epoch": 0.98, + "grad_norm": 76.0625, + "learning_rate": 8.033659311964064e-07, + "loss": 2.0448, + "step": 10400 + }, + { + "epoch": 0.99, + "grad_norm": 79.3125, + "learning_rate": 8.029877876656009e-07, + "loss": 2.0606, + "step": 10420 + }, + { + "epoch": 0.99, + "grad_norm": 53.5625, + "learning_rate": 8.026096441347952e-07, + "loss": 2.0716, + "step": 10440 + }, + { + "epoch": 0.99, + "grad_norm": 75.0, + "learning_rate": 8.022315006039897e-07, + "loss": 1.9026, + "step": 10460 + }, + { + "epoch": 0.99, + "grad_norm": 85.0625, + "learning_rate": 8.018533570731842e-07, + "loss": 1.9879, + "step": 10480 + }, + { + "epoch": 0.99, + "grad_norm": 52.46875, + "learning_rate": 8.014752135423785e-07, + "loss": 2.0018, + "step": 10500 + }, + { + "epoch": 0.99, + "grad_norm": 48.4375, + "learning_rate": 8.01097070011573e-07, + "loss": 1.9966, + "step": 10520 + }, + { + "epoch": 1.0, + "grad_norm": 59.65625, + "learning_rate": 8.007189264807675e-07, + "loss": 2.0212, + "step": 10540 + }, + { + "epoch": 1.0, + "grad_norm": 49.59375, + "learning_rate": 8.003407829499619e-07, + "loss": 2.0244, + "step": 10560 + }, + { + "epoch": 1.0, + "grad_norm": 107.625, + "learning_rate": 7.999626394191563e-07, + "loss": 1.9597, + "step": 10580 + }, + { + "epoch": 1.0, + "grad_norm": 67.875, + "learning_rate": 7.995844958883507e-07, + "loss": 1.7883, + "step": 10600 + }, + { + "epoch": 1.0, + "grad_norm": 67.8125, + "learning_rate": 7.992063523575452e-07, + "loss": 1.7906, + "step": 10620 + }, + { + "epoch": 1.01, + "grad_norm": 60.09375, + "learning_rate": 7.988282088267398e-07, + "loss": 1.8029, + "step": 10640 + }, + { + "epoch": 1.01, + "grad_norm": 42.25, + "learning_rate": 7.984500652959341e-07, + "loss": 1.7842, + "step": 10660 + }, + { + "epoch": 1.01, + "grad_norm": 65.6875, + "learning_rate": 7.980719217651286e-07, + "loss": 1.8204, + "step": 10680 + }, + { + "epoch": 1.01, + "grad_norm": 65.625, + "learning_rate": 7.976937782343231e-07, + "loss": 1.7802, + "step": 10700 + }, + { + "epoch": 1.01, + "grad_norm": 47.96875, + "learning_rate": 7.973156347035174e-07, + "loss": 1.7412, + "step": 10720 + }, + { + "epoch": 1.02, + "grad_norm": 78.875, + "learning_rate": 7.969374911727119e-07, + "loss": 1.8131, + "step": 10740 + }, + { + "epoch": 1.02, + "grad_norm": 56.09375, + "learning_rate": 7.965593476419064e-07, + "loss": 1.7726, + "step": 10760 + }, + { + "epoch": 1.02, + "grad_norm": 61.1875, + "learning_rate": 7.961812041111008e-07, + "loss": 1.8361, + "step": 10780 + }, + { + "epoch": 1.02, + "grad_norm": 48.46875, + "learning_rate": 7.958030605802952e-07, + "loss": 1.6977, + "step": 10800 + }, + { + "epoch": 1.02, + "grad_norm": 74.75, + "learning_rate": 7.954249170494897e-07, + "loss": 1.7696, + "step": 10820 + }, + { + "epoch": 1.02, + "grad_norm": 72.875, + "learning_rate": 7.950467735186841e-07, + "loss": 1.7746, + "step": 10840 + }, + { + "epoch": 1.03, + "grad_norm": 47.46875, + "learning_rate": 7.946686299878785e-07, + "loss": 1.7854, + "step": 10860 + }, + { + "epoch": 1.03, + "grad_norm": 75.0625, + "learning_rate": 7.94290486457073e-07, + "loss": 1.782, + "step": 10880 + }, + { + "epoch": 1.03, + "grad_norm": 133.25, + "learning_rate": 7.939123429262674e-07, + "loss": 1.7646, + "step": 10900 + }, + { + "epoch": 1.03, + "grad_norm": 74.5, + "learning_rate": 7.935341993954619e-07, + "loss": 1.6605, + "step": 10920 + }, + { + "epoch": 1.03, + "grad_norm": 97.1875, + "learning_rate": 7.931560558646562e-07, + "loss": 1.8058, + "step": 10940 + }, + { + "epoch": 1.04, + "grad_norm": 63.09375, + "learning_rate": 7.927779123338507e-07, + "loss": 1.8219, + "step": 10960 + }, + { + "epoch": 1.04, + "grad_norm": 72.75, + "learning_rate": 7.923997688030453e-07, + "loss": 1.7732, + "step": 10980 + }, + { + "epoch": 1.04, + "grad_norm": 93.8125, + "learning_rate": 7.920216252722397e-07, + "loss": 1.7333, + "step": 11000 + }, + { + "epoch": 1.04, + "grad_norm": 68.375, + "learning_rate": 7.916434817414341e-07, + "loss": 1.7237, + "step": 11020 + }, + { + "epoch": 1.04, + "grad_norm": 72.1875, + "learning_rate": 7.912653382106286e-07, + "loss": 1.8252, + "step": 11040 + }, + { + "epoch": 1.05, + "grad_norm": 61.46875, + "learning_rate": 7.90887194679823e-07, + "loss": 1.688, + "step": 11060 + }, + { + "epoch": 1.05, + "grad_norm": 50.96875, + "learning_rate": 7.905090511490174e-07, + "loss": 1.7669, + "step": 11080 + }, + { + "epoch": 1.05, + "grad_norm": 49.28125, + "learning_rate": 7.901309076182119e-07, + "loss": 1.7935, + "step": 11100 + }, + { + "epoch": 1.05, + "grad_norm": 46.09375, + "learning_rate": 7.897527640874063e-07, + "loss": 1.8401, + "step": 11120 + }, + { + "epoch": 1.05, + "grad_norm": 91.375, + "learning_rate": 7.893746205566008e-07, + "loss": 1.8535, + "step": 11140 + }, + { + "epoch": 1.06, + "grad_norm": 79.125, + "learning_rate": 7.889964770257952e-07, + "loss": 1.7106, + "step": 11160 + }, + { + "epoch": 1.06, + "grad_norm": 84.6875, + "learning_rate": 7.886183334949896e-07, + "loss": 1.7925, + "step": 11180 + }, + { + "epoch": 1.06, + "grad_norm": 60.59375, + "learning_rate": 7.882401899641841e-07, + "loss": 1.7557, + "step": 11200 + }, + { + "epoch": 1.06, + "grad_norm": 66.5, + "learning_rate": 7.878620464333785e-07, + "loss": 1.7569, + "step": 11220 + }, + { + "epoch": 1.06, + "grad_norm": 54.125, + "learning_rate": 7.874839029025729e-07, + "loss": 1.7211, + "step": 11240 + }, + { + "epoch": 1.06, + "grad_norm": 89.9375, + "learning_rate": 7.871057593717674e-07, + "loss": 1.7641, + "step": 11260 + }, + { + "epoch": 1.07, + "grad_norm": 53.125, + "learning_rate": 7.867276158409618e-07, + "loss": 1.8629, + "step": 11280 + }, + { + "epoch": 1.07, + "grad_norm": 79.1875, + "learning_rate": 7.863494723101562e-07, + "loss": 1.8001, + "step": 11300 + }, + { + "epoch": 1.07, + "grad_norm": 59.9375, + "learning_rate": 7.859713287793507e-07, + "loss": 1.8619, + "step": 11320 + }, + { + "epoch": 1.07, + "grad_norm": 71.75, + "learning_rate": 7.855931852485452e-07, + "loss": 1.813, + "step": 11340 + }, + { + "epoch": 1.07, + "grad_norm": 99.9375, + "learning_rate": 7.852150417177397e-07, + "loss": 1.9216, + "step": 11360 + }, + { + "epoch": 1.08, + "grad_norm": 237.25, + "learning_rate": 7.848368981869341e-07, + "loss": 1.8799, + "step": 11380 + }, + { + "epoch": 1.08, + "grad_norm": 42.0, + "learning_rate": 7.844587546561285e-07, + "loss": 1.8169, + "step": 11400 + }, + { + "epoch": 1.08, + "grad_norm": 58.875, + "learning_rate": 7.84080611125323e-07, + "loss": 1.7881, + "step": 11420 + }, + { + "epoch": 1.08, + "grad_norm": 83.875, + "learning_rate": 7.837024675945175e-07, + "loss": 1.6689, + "step": 11440 + }, + { + "epoch": 1.08, + "grad_norm": 87.5, + "learning_rate": 7.833243240637118e-07, + "loss": 1.7903, + "step": 11460 + }, + { + "epoch": 1.09, + "grad_norm": 76.0, + "learning_rate": 7.829461805329063e-07, + "loss": 1.6204, + "step": 11480 + }, + { + "epoch": 1.09, + "grad_norm": 60.65625, + "learning_rate": 7.825680370021008e-07, + "loss": 1.832, + "step": 11500 + }, + { + "epoch": 1.09, + "grad_norm": 51.875, + "learning_rate": 7.821898934712951e-07, + "loss": 1.6985, + "step": 11520 + }, + { + "epoch": 1.09, + "grad_norm": 78.125, + "learning_rate": 7.818117499404896e-07, + "loss": 1.7127, + "step": 11540 + }, + { + "epoch": 1.09, + "grad_norm": 59.40625, + "learning_rate": 7.81433606409684e-07, + "loss": 1.7354, + "step": 11560 + }, + { + "epoch": 1.09, + "grad_norm": 69.125, + "learning_rate": 7.810554628788785e-07, + "loss": 1.759, + "step": 11580 + }, + { + "epoch": 1.1, + "grad_norm": 53.875, + "learning_rate": 7.806773193480729e-07, + "loss": 1.8113, + "step": 11600 + }, + { + "epoch": 1.1, + "grad_norm": 69.0625, + "learning_rate": 7.802991758172673e-07, + "loss": 1.7437, + "step": 11620 + }, + { + "epoch": 1.1, + "grad_norm": 61.21875, + "learning_rate": 7.799210322864618e-07, + "loss": 1.8023, + "step": 11640 + }, + { + "epoch": 1.1, + "grad_norm": 53.5625, + "learning_rate": 7.795428887556562e-07, + "loss": 1.6904, + "step": 11660 + }, + { + "epoch": 1.1, + "grad_norm": 106.25, + "learning_rate": 7.791647452248507e-07, + "loss": 1.746, + "step": 11680 + }, + { + "epoch": 1.11, + "grad_norm": 55.75, + "learning_rate": 7.787866016940452e-07, + "loss": 1.7889, + "step": 11700 + }, + { + "epoch": 1.11, + "grad_norm": 49.84375, + "learning_rate": 7.784084581632397e-07, + "loss": 1.7731, + "step": 11720 + }, + { + "epoch": 1.11, + "grad_norm": 65.6875, + "learning_rate": 7.78030314632434e-07, + "loss": 1.7623, + "step": 11740 + }, + { + "epoch": 1.11, + "grad_norm": 52.53125, + "learning_rate": 7.776521711016285e-07, + "loss": 1.8033, + "step": 11760 + }, + { + "epoch": 1.11, + "grad_norm": 77.375, + "learning_rate": 7.77274027570823e-07, + "loss": 1.8242, + "step": 11780 + }, + { + "epoch": 1.12, + "grad_norm": 75.0, + "learning_rate": 7.768958840400174e-07, + "loss": 1.6878, + "step": 11800 + }, + { + "epoch": 1.12, + "grad_norm": 68.875, + "learning_rate": 7.765177405092118e-07, + "loss": 1.7688, + "step": 11820 + }, + { + "epoch": 1.12, + "grad_norm": 70.375, + "learning_rate": 7.761395969784063e-07, + "loss": 1.7053, + "step": 11840 + }, + { + "epoch": 1.12, + "grad_norm": 50.6875, + "learning_rate": 7.757614534476007e-07, + "loss": 1.7667, + "step": 11860 + }, + { + "epoch": 1.12, + "grad_norm": 53.84375, + "learning_rate": 7.753833099167951e-07, + "loss": 1.8148, + "step": 11880 + }, + { + "epoch": 1.12, + "grad_norm": 54.65625, + "learning_rate": 7.750051663859895e-07, + "loss": 1.758, + "step": 11900 + }, + { + "epoch": 1.13, + "grad_norm": 63.5625, + "learning_rate": 7.74627022855184e-07, + "loss": 1.8429, + "step": 11920 + }, + { + "epoch": 1.13, + "grad_norm": 58.875, + "learning_rate": 7.742488793243785e-07, + "loss": 1.7386, + "step": 11940 + }, + { + "epoch": 1.13, + "grad_norm": 48.90625, + "learning_rate": 7.738707357935728e-07, + "loss": 1.8145, + "step": 11960 + }, + { + "epoch": 1.13, + "grad_norm": 53.1875, + "learning_rate": 7.734925922627673e-07, + "loss": 1.7081, + "step": 11980 + }, + { + "epoch": 1.13, + "grad_norm": 64.0625, + "learning_rate": 7.731144487319618e-07, + "loss": 1.7029, + "step": 12000 + }, + { + "epoch": 1.14, + "grad_norm": 52.5625, + "learning_rate": 7.727363052011562e-07, + "loss": 1.8116, + "step": 12020 + }, + { + "epoch": 1.14, + "grad_norm": 85.9375, + "learning_rate": 7.723581616703507e-07, + "loss": 1.7725, + "step": 12040 + }, + { + "epoch": 1.14, + "grad_norm": 61.125, + "learning_rate": 7.719800181395452e-07, + "loss": 1.8037, + "step": 12060 + }, + { + "epoch": 1.14, + "grad_norm": 55.75, + "learning_rate": 7.716018746087396e-07, + "loss": 1.7618, + "step": 12080 + }, + { + "epoch": 1.14, + "grad_norm": 78.625, + "learning_rate": 7.71223731077934e-07, + "loss": 1.7472, + "step": 12100 + }, + { + "epoch": 1.15, + "grad_norm": 87.625, + "learning_rate": 7.708455875471285e-07, + "loss": 1.7094, + "step": 12120 + }, + { + "epoch": 1.15, + "grad_norm": 55.625, + "learning_rate": 7.704674440163229e-07, + "loss": 1.8509, + "step": 12140 + }, + { + "epoch": 1.15, + "grad_norm": 62.1875, + "learning_rate": 7.700893004855174e-07, + "loss": 1.7445, + "step": 12160 + }, + { + "epoch": 1.15, + "grad_norm": 68.25, + "learning_rate": 7.697111569547117e-07, + "loss": 1.8586, + "step": 12180 + }, + { + "epoch": 1.15, + "grad_norm": 73.375, + "learning_rate": 7.693330134239062e-07, + "loss": 1.7869, + "step": 12200 + }, + { + "epoch": 1.16, + "grad_norm": 63.71875, + "learning_rate": 7.689548698931007e-07, + "loss": 1.6798, + "step": 12220 + }, + { + "epoch": 1.16, + "grad_norm": 50.625, + "learning_rate": 7.68576726362295e-07, + "loss": 1.7414, + "step": 12240 + }, + { + "epoch": 1.16, + "grad_norm": 70.125, + "learning_rate": 7.681985828314895e-07, + "loss": 1.7452, + "step": 12260 + }, + { + "epoch": 1.16, + "grad_norm": 60.8125, + "learning_rate": 7.67820439300684e-07, + "loss": 1.869, + "step": 12280 + }, + { + "epoch": 1.16, + "grad_norm": 49.375, + "learning_rate": 7.674422957698784e-07, + "loss": 1.6938, + "step": 12300 + }, + { + "epoch": 1.16, + "grad_norm": 83.0625, + "learning_rate": 7.670641522390728e-07, + "loss": 1.6603, + "step": 12320 + }, + { + "epoch": 1.17, + "grad_norm": 69.5625, + "learning_rate": 7.666860087082673e-07, + "loss": 1.7059, + "step": 12340 + }, + { + "epoch": 1.17, + "grad_norm": 51.0, + "learning_rate": 7.663078651774617e-07, + "loss": 1.7743, + "step": 12360 + }, + { + "epoch": 1.17, + "grad_norm": 58.4375, + "learning_rate": 7.659297216466563e-07, + "loss": 1.8554, + "step": 12380 + }, + { + "epoch": 1.17, + "grad_norm": 86.625, + "learning_rate": 7.655515781158507e-07, + "loss": 1.7817, + "step": 12400 + }, + { + "epoch": 1.17, + "grad_norm": 53.125, + "learning_rate": 7.651734345850451e-07, + "loss": 1.7744, + "step": 12420 + }, + { + "epoch": 1.18, + "grad_norm": 49.21875, + "learning_rate": 7.647952910542396e-07, + "loss": 1.8232, + "step": 12440 + }, + { + "epoch": 1.18, + "grad_norm": 64.1875, + "learning_rate": 7.644171475234341e-07, + "loss": 1.7839, + "step": 12460 + }, + { + "epoch": 1.18, + "grad_norm": 77.0625, + "learning_rate": 7.640390039926284e-07, + "loss": 1.8145, + "step": 12480 + }, + { + "epoch": 1.18, + "grad_norm": 64.375, + "learning_rate": 7.636608604618229e-07, + "loss": 1.6772, + "step": 12500 + }, + { + "epoch": 1.18, + "grad_norm": 74.5625, + "learning_rate": 7.632827169310173e-07, + "loss": 1.7555, + "step": 12520 + }, + { + "epoch": 1.19, + "grad_norm": 96.4375, + "learning_rate": 7.629045734002117e-07, + "loss": 1.6831, + "step": 12540 + }, + { + "epoch": 1.19, + "grad_norm": 55.40625, + "learning_rate": 7.625264298694062e-07, + "loss": 1.8384, + "step": 12560 + }, + { + "epoch": 1.19, + "grad_norm": 50.25, + "learning_rate": 7.621482863386006e-07, + "loss": 1.7862, + "step": 12580 + }, + { + "epoch": 1.19, + "grad_norm": 51.46875, + "learning_rate": 7.617701428077951e-07, + "loss": 1.7822, + "step": 12600 + }, + { + "epoch": 1.19, + "grad_norm": 63.625, + "learning_rate": 7.613919992769895e-07, + "loss": 1.7557, + "step": 12620 + }, + { + "epoch": 1.19, + "grad_norm": 74.3125, + "learning_rate": 7.610138557461839e-07, + "loss": 1.8421, + "step": 12640 + }, + { + "epoch": 1.2, + "grad_norm": 58.34375, + "learning_rate": 7.606357122153784e-07, + "loss": 1.715, + "step": 12660 + }, + { + "epoch": 1.2, + "grad_norm": 65.6875, + "learning_rate": 7.602575686845728e-07, + "loss": 1.8373, + "step": 12680 + }, + { + "epoch": 1.2, + "grad_norm": 64.3125, + "learning_rate": 7.598794251537672e-07, + "loss": 1.7039, + "step": 12700 + }, + { + "epoch": 1.2, + "grad_norm": 74.4375, + "learning_rate": 7.595012816229618e-07, + "loss": 1.7355, + "step": 12720 + }, + { + "epoch": 1.2, + "grad_norm": 73.5, + "learning_rate": 7.591231380921563e-07, + "loss": 1.7805, + "step": 12740 + }, + { + "epoch": 1.21, + "grad_norm": 71.1875, + "learning_rate": 7.587449945613506e-07, + "loss": 1.7684, + "step": 12760 + }, + { + "epoch": 1.21, + "grad_norm": 77.8125, + "learning_rate": 7.583668510305451e-07, + "loss": 1.7547, + "step": 12780 + }, + { + "epoch": 1.21, + "grad_norm": 60.9375, + "learning_rate": 7.579887074997395e-07, + "loss": 1.7124, + "step": 12800 + }, + { + "epoch": 1.21, + "grad_norm": 68.9375, + "learning_rate": 7.57610563968934e-07, + "loss": 1.8401, + "step": 12820 + }, + { + "epoch": 1.21, + "grad_norm": 47.625, + "learning_rate": 7.572324204381284e-07, + "loss": 1.786, + "step": 12840 + }, + { + "epoch": 1.22, + "grad_norm": 56.40625, + "learning_rate": 7.568542769073228e-07, + "loss": 1.7334, + "step": 12860 + }, + { + "epoch": 1.22, + "grad_norm": 54.21875, + "learning_rate": 7.564761333765173e-07, + "loss": 1.7812, + "step": 12880 + }, + { + "epoch": 1.22, + "grad_norm": 57.3125, + "learning_rate": 7.560979898457117e-07, + "loss": 1.7766, + "step": 12900 + }, + { + "epoch": 1.22, + "grad_norm": 48.34375, + "learning_rate": 7.557198463149061e-07, + "loss": 1.7633, + "step": 12920 + }, + { + "epoch": 1.22, + "grad_norm": 66.5625, + "learning_rate": 7.553417027841006e-07, + "loss": 1.8936, + "step": 12940 + }, + { + "epoch": 1.23, + "grad_norm": 72.4375, + "learning_rate": 7.549635592532951e-07, + "loss": 1.8073, + "step": 12960 + }, + { + "epoch": 1.23, + "grad_norm": 57.3125, + "learning_rate": 7.545854157224894e-07, + "loss": 1.7529, + "step": 12980 + }, + { + "epoch": 1.23, + "grad_norm": 86.0625, + "learning_rate": 7.542072721916839e-07, + "loss": 1.7475, + "step": 13000 + }, + { + "epoch": 1.23, + "grad_norm": 77.625, + "learning_rate": 7.538291286608784e-07, + "loss": 1.7339, + "step": 13020 + }, + { + "epoch": 1.23, + "grad_norm": 84.625, + "learning_rate": 7.534509851300727e-07, + "loss": 1.6812, + "step": 13040 + }, + { + "epoch": 1.23, + "grad_norm": 58.34375, + "learning_rate": 7.530728415992672e-07, + "loss": 1.8598, + "step": 13060 + }, + { + "epoch": 1.24, + "grad_norm": 47.34375, + "learning_rate": 7.526946980684618e-07, + "loss": 1.789, + "step": 13080 + }, + { + "epoch": 1.24, + "grad_norm": 64.0, + "learning_rate": 7.523165545376562e-07, + "loss": 1.7233, + "step": 13100 + }, + { + "epoch": 1.24, + "grad_norm": 71.5625, + "learning_rate": 7.519384110068506e-07, + "loss": 1.7302, + "step": 13120 + }, + { + "epoch": 1.24, + "grad_norm": 63.90625, + "learning_rate": 7.51560267476045e-07, + "loss": 1.6977, + "step": 13140 + }, + { + "epoch": 1.24, + "grad_norm": 47.09375, + "learning_rate": 7.511821239452395e-07, + "loss": 1.8249, + "step": 13160 + }, + { + "epoch": 1.25, + "grad_norm": 52.75, + "learning_rate": 7.50803980414434e-07, + "loss": 1.7705, + "step": 13180 + }, + { + "epoch": 1.25, + "grad_norm": 67.0, + "learning_rate": 7.504258368836283e-07, + "loss": 1.6708, + "step": 13200 + }, + { + "epoch": 1.25, + "grad_norm": 82.5, + "learning_rate": 7.500476933528228e-07, + "loss": 1.8331, + "step": 13220 + }, + { + "epoch": 1.25, + "grad_norm": 96.625, + "learning_rate": 7.496695498220173e-07, + "loss": 1.8325, + "step": 13240 + }, + { + "epoch": 1.25, + "grad_norm": 56.3125, + "learning_rate": 7.492914062912116e-07, + "loss": 1.7972, + "step": 13260 + }, + { + "epoch": 1.26, + "grad_norm": 56.53125, + "learning_rate": 7.489132627604061e-07, + "loss": 1.7719, + "step": 13280 + }, + { + "epoch": 1.26, + "grad_norm": 87.875, + "learning_rate": 7.485351192296006e-07, + "loss": 1.7606, + "step": 13300 + }, + { + "epoch": 1.26, + "grad_norm": 73.6875, + "learning_rate": 7.48156975698795e-07, + "loss": 1.7746, + "step": 13320 + }, + { + "epoch": 1.26, + "grad_norm": 80.625, + "learning_rate": 7.477788321679894e-07, + "loss": 1.767, + "step": 13340 + }, + { + "epoch": 1.26, + "grad_norm": 136.125, + "learning_rate": 7.474006886371839e-07, + "loss": 1.8342, + "step": 13360 + }, + { + "epoch": 1.26, + "grad_norm": 58.78125, + "learning_rate": 7.470225451063783e-07, + "loss": 1.739, + "step": 13380 + }, + { + "epoch": 1.27, + "grad_norm": 60.75, + "learning_rate": 7.466444015755728e-07, + "loss": 1.7303, + "step": 13400 + }, + { + "epoch": 1.27, + "grad_norm": 78.375, + "learning_rate": 7.462662580447673e-07, + "loss": 1.6749, + "step": 13420 + }, + { + "epoch": 1.27, + "grad_norm": 65.5, + "learning_rate": 7.458881145139617e-07, + "loss": 1.8015, + "step": 13440 + }, + { + "epoch": 1.27, + "grad_norm": 65.5625, + "learning_rate": 7.455099709831562e-07, + "loss": 1.7586, + "step": 13460 + }, + { + "epoch": 1.27, + "grad_norm": 48.46875, + "learning_rate": 7.451318274523505e-07, + "loss": 1.6745, + "step": 13480 + }, + { + "epoch": 1.28, + "grad_norm": 76.75, + "learning_rate": 7.44753683921545e-07, + "loss": 1.8741, + "step": 13500 + }, + { + "epoch": 1.28, + "grad_norm": 71.625, + "learning_rate": 7.443755403907395e-07, + "loss": 1.7672, + "step": 13520 + }, + { + "epoch": 1.28, + "grad_norm": 76.9375, + "learning_rate": 7.439973968599339e-07, + "loss": 1.8212, + "step": 13540 + }, + { + "epoch": 1.28, + "grad_norm": 59.1875, + "learning_rate": 7.436192533291283e-07, + "loss": 1.775, + "step": 13560 + }, + { + "epoch": 1.28, + "grad_norm": 97.8125, + "learning_rate": 7.432411097983228e-07, + "loss": 1.8158, + "step": 13580 + }, + { + "epoch": 1.29, + "grad_norm": 65.5, + "learning_rate": 7.428629662675172e-07, + "loss": 1.7197, + "step": 13600 + }, + { + "epoch": 1.29, + "grad_norm": 71.8125, + "learning_rate": 7.424848227367117e-07, + "loss": 1.7534, + "step": 13620 + }, + { + "epoch": 1.29, + "grad_norm": 83.3125, + "learning_rate": 7.421066792059061e-07, + "loss": 1.7694, + "step": 13640 + }, + { + "epoch": 1.29, + "grad_norm": 65.75, + "learning_rate": 7.417285356751005e-07, + "loss": 1.831, + "step": 13660 + }, + { + "epoch": 1.29, + "grad_norm": 102.5, + "learning_rate": 7.41350392144295e-07, + "loss": 1.7945, + "step": 13680 + }, + { + "epoch": 1.3, + "grad_norm": 63.1875, + "learning_rate": 7.409722486134893e-07, + "loss": 1.8187, + "step": 13700 + }, + { + "epoch": 1.3, + "grad_norm": 48.125, + "learning_rate": 7.405941050826838e-07, + "loss": 1.7061, + "step": 13720 + }, + { + "epoch": 1.3, + "grad_norm": 63.78125, + "learning_rate": 7.402159615518783e-07, + "loss": 1.7076, + "step": 13740 + }, + { + "epoch": 1.3, + "grad_norm": 44.59375, + "learning_rate": 7.398378180210727e-07, + "loss": 1.6653, + "step": 13760 + }, + { + "epoch": 1.3, + "grad_norm": 62.8125, + "learning_rate": 7.394596744902672e-07, + "loss": 1.8329, + "step": 13780 + }, + { + "epoch": 1.3, + "grad_norm": 77.25, + "learning_rate": 7.390815309594617e-07, + "loss": 1.8275, + "step": 13800 + }, + { + "epoch": 1.31, + "grad_norm": 52.40625, + "learning_rate": 7.387033874286561e-07, + "loss": 1.7821, + "step": 13820 + }, + { + "epoch": 1.31, + "grad_norm": 54.875, + "learning_rate": 7.383252438978506e-07, + "loss": 1.8171, + "step": 13840 + }, + { + "epoch": 1.31, + "grad_norm": 68.375, + "learning_rate": 7.37947100367045e-07, + "loss": 1.7546, + "step": 13860 + }, + { + "epoch": 1.31, + "grad_norm": 66.875, + "learning_rate": 7.375689568362394e-07, + "loss": 1.79, + "step": 13880 + }, + { + "epoch": 1.31, + "grad_norm": 62.1875, + "learning_rate": 7.371908133054339e-07, + "loss": 1.88, + "step": 13900 + }, + { + "epoch": 1.32, + "grad_norm": 77.5625, + "learning_rate": 7.368126697746283e-07, + "loss": 1.7613, + "step": 13920 + }, + { + "epoch": 1.32, + "grad_norm": 58.21875, + "learning_rate": 7.364345262438227e-07, + "loss": 1.7634, + "step": 13940 + }, + { + "epoch": 1.32, + "grad_norm": 55.0, + "learning_rate": 7.360563827130172e-07, + "loss": 1.8992, + "step": 13960 + }, + { + "epoch": 1.32, + "grad_norm": 84.0625, + "learning_rate": 7.356782391822117e-07, + "loss": 1.7488, + "step": 13980 + }, + { + "epoch": 1.32, + "grad_norm": 61.3125, + "learning_rate": 7.35300095651406e-07, + "loss": 1.7939, + "step": 14000 + }, + { + "epoch": 1.33, + "grad_norm": 54.5625, + "learning_rate": 7.349219521206005e-07, + "loss": 1.7303, + "step": 14020 + }, + { + "epoch": 1.33, + "grad_norm": 48.6875, + "learning_rate": 7.345438085897949e-07, + "loss": 1.7983, + "step": 14040 + }, + { + "epoch": 1.33, + "grad_norm": 58.125, + "learning_rate": 7.341656650589893e-07, + "loss": 1.7608, + "step": 14060 + }, + { + "epoch": 1.33, + "grad_norm": 74.75, + "learning_rate": 7.337875215281838e-07, + "loss": 1.8493, + "step": 14080 + }, + { + "epoch": 1.33, + "grad_norm": 76.125, + "learning_rate": 7.334093779973782e-07, + "loss": 1.8708, + "step": 14100 + }, + { + "epoch": 1.33, + "grad_norm": 54.34375, + "learning_rate": 7.330312344665728e-07, + "loss": 1.7848, + "step": 14120 + }, + { + "epoch": 1.34, + "grad_norm": 82.1875, + "learning_rate": 7.326530909357672e-07, + "loss": 1.8407, + "step": 14140 + }, + { + "epoch": 1.34, + "grad_norm": 125.0, + "learning_rate": 7.322749474049616e-07, + "loss": 1.8569, + "step": 14160 + }, + { + "epoch": 1.34, + "grad_norm": 67.875, + "learning_rate": 7.318968038741561e-07, + "loss": 1.8067, + "step": 14180 + }, + { + "epoch": 1.34, + "grad_norm": 76.375, + "learning_rate": 7.315186603433506e-07, + "loss": 1.7579, + "step": 14200 + }, + { + "epoch": 1.34, + "grad_norm": 51.34375, + "learning_rate": 7.311405168125449e-07, + "loss": 1.7588, + "step": 14220 + }, + { + "epoch": 1.35, + "grad_norm": 67.75, + "learning_rate": 7.307623732817394e-07, + "loss": 1.7899, + "step": 14240 + }, + { + "epoch": 1.35, + "grad_norm": 62.34375, + "learning_rate": 7.303842297509339e-07, + "loss": 1.755, + "step": 14260 + }, + { + "epoch": 1.35, + "grad_norm": 91.0, + "learning_rate": 7.300060862201282e-07, + "loss": 1.8114, + "step": 14280 + }, + { + "epoch": 1.35, + "grad_norm": 64.9375, + "learning_rate": 7.296279426893227e-07, + "loss": 1.7733, + "step": 14300 + }, + { + "epoch": 1.35, + "grad_norm": 99.375, + "learning_rate": 7.292497991585172e-07, + "loss": 1.8316, + "step": 14320 + }, + { + "epoch": 1.36, + "grad_norm": 65.0, + "learning_rate": 7.288716556277116e-07, + "loss": 1.7512, + "step": 14340 + }, + { + "epoch": 1.36, + "grad_norm": 76.0625, + "learning_rate": 7.28493512096906e-07, + "loss": 1.771, + "step": 14360 + }, + { + "epoch": 1.36, + "grad_norm": 52.78125, + "learning_rate": 7.281153685661004e-07, + "loss": 1.7127, + "step": 14380 + }, + { + "epoch": 1.36, + "grad_norm": 83.8125, + "learning_rate": 7.277372250352949e-07, + "loss": 1.836, + "step": 14400 + }, + { + "epoch": 1.36, + "grad_norm": 79.6875, + "learning_rate": 7.273590815044894e-07, + "loss": 1.7365, + "step": 14420 + }, + { + "epoch": 1.37, + "grad_norm": 56.96875, + "learning_rate": 7.269809379736837e-07, + "loss": 1.7652, + "step": 14440 + }, + { + "epoch": 1.37, + "grad_norm": 56.59375, + "learning_rate": 7.266027944428782e-07, + "loss": 1.8393, + "step": 14460 + }, + { + "epoch": 1.37, + "grad_norm": 51.40625, + "learning_rate": 7.262246509120728e-07, + "loss": 1.8095, + "step": 14480 + }, + { + "epoch": 1.37, + "grad_norm": 62.125, + "learning_rate": 7.258465073812671e-07, + "loss": 1.7587, + "step": 14500 + }, + { + "epoch": 1.37, + "grad_norm": 57.28125, + "learning_rate": 7.254683638504616e-07, + "loss": 1.6792, + "step": 14520 + }, + { + "epoch": 1.37, + "grad_norm": 61.09375, + "learning_rate": 7.250902203196561e-07, + "loss": 1.7675, + "step": 14540 + }, + { + "epoch": 1.38, + "grad_norm": 56.40625, + "learning_rate": 7.247120767888505e-07, + "loss": 1.8372, + "step": 14560 + }, + { + "epoch": 1.38, + "grad_norm": 69.375, + "learning_rate": 7.243339332580449e-07, + "loss": 1.7053, + "step": 14580 + }, + { + "epoch": 1.38, + "grad_norm": 103.375, + "learning_rate": 7.239557897272394e-07, + "loss": 1.7466, + "step": 14600 + }, + { + "epoch": 1.38, + "grad_norm": 63.4375, + "learning_rate": 7.235776461964338e-07, + "loss": 1.7912, + "step": 14620 + }, + { + "epoch": 1.38, + "grad_norm": 74.1875, + "learning_rate": 7.231995026656283e-07, + "loss": 1.7838, + "step": 14640 + }, + { + "epoch": 1.39, + "grad_norm": 71.25, + "learning_rate": 7.228213591348226e-07, + "loss": 1.6906, + "step": 14660 + }, + { + "epoch": 1.39, + "grad_norm": 72.75, + "learning_rate": 7.224432156040171e-07, + "loss": 1.8053, + "step": 14680 + }, + { + "epoch": 1.39, + "grad_norm": 107.3125, + "learning_rate": 7.220650720732116e-07, + "loss": 1.8471, + "step": 14700 + }, + { + "epoch": 1.39, + "grad_norm": 107.0, + "learning_rate": 7.216869285424059e-07, + "loss": 1.7873, + "step": 14720 + }, + { + "epoch": 1.39, + "grad_norm": 98.0, + "learning_rate": 7.213087850116004e-07, + "loss": 1.728, + "step": 14740 + }, + { + "epoch": 1.4, + "grad_norm": 62.3125, + "learning_rate": 7.209306414807949e-07, + "loss": 1.7931, + "step": 14760 + }, + { + "epoch": 1.4, + "grad_norm": 77.625, + "learning_rate": 7.205524979499893e-07, + "loss": 1.8416, + "step": 14780 + }, + { + "epoch": 1.4, + "grad_norm": 67.25, + "learning_rate": 7.201743544191837e-07, + "loss": 1.7702, + "step": 14800 + }, + { + "epoch": 1.4, + "grad_norm": 73.5, + "learning_rate": 7.197962108883783e-07, + "loss": 1.751, + "step": 14820 + }, + { + "epoch": 1.4, + "grad_norm": 90.6875, + "learning_rate": 7.194180673575727e-07, + "loss": 1.8144, + "step": 14840 + }, + { + "epoch": 1.4, + "grad_norm": 65.8125, + "learning_rate": 7.190399238267672e-07, + "loss": 1.7848, + "step": 14860 + }, + { + "epoch": 1.41, + "grad_norm": 75.1875, + "learning_rate": 7.186617802959616e-07, + "loss": 1.8424, + "step": 14880 + }, + { + "epoch": 1.41, + "grad_norm": 69.5625, + "learning_rate": 7.18283636765156e-07, + "loss": 1.7273, + "step": 14900 + }, + { + "epoch": 1.41, + "grad_norm": 62.9375, + "learning_rate": 7.179054932343505e-07, + "loss": 1.7568, + "step": 14920 + }, + { + "epoch": 1.41, + "grad_norm": 49.75, + "learning_rate": 7.175273497035449e-07, + "loss": 1.8136, + "step": 14940 + }, + { + "epoch": 1.41, + "grad_norm": 75.8125, + "learning_rate": 7.171492061727393e-07, + "loss": 1.8259, + "step": 14960 + }, + { + "epoch": 1.42, + "grad_norm": 51.125, + "learning_rate": 7.167710626419338e-07, + "loss": 1.7377, + "step": 14980 + }, + { + "epoch": 1.42, + "grad_norm": 61.8125, + "learning_rate": 7.163929191111282e-07, + "loss": 1.7435, + "step": 15000 + }, + { + "epoch": 1.42, + "grad_norm": 45.65625, + "learning_rate": 7.160147755803226e-07, + "loss": 1.7305, + "step": 15020 + }, + { + "epoch": 1.42, + "grad_norm": 48.03125, + "learning_rate": 7.156366320495171e-07, + "loss": 1.7079, + "step": 15040 + }, + { + "epoch": 1.42, + "grad_norm": 85.0, + "learning_rate": 7.152584885187115e-07, + "loss": 1.8445, + "step": 15060 + }, + { + "epoch": 1.43, + "grad_norm": 73.5625, + "learning_rate": 7.148803449879059e-07, + "loss": 1.7944, + "step": 15080 + }, + { + "epoch": 1.43, + "grad_norm": 66.75, + "learning_rate": 7.145022014571004e-07, + "loss": 1.72, + "step": 15100 + }, + { + "epoch": 1.43, + "grad_norm": 78.4375, + "learning_rate": 7.141240579262948e-07, + "loss": 1.8119, + "step": 15120 + }, + { + "epoch": 1.43, + "grad_norm": 66.25, + "learning_rate": 7.137459143954893e-07, + "loss": 1.8503, + "step": 15140 + }, + { + "epoch": 1.43, + "grad_norm": 78.625, + "learning_rate": 7.133677708646837e-07, + "loss": 1.7444, + "step": 15160 + }, + { + "epoch": 1.44, + "grad_norm": 88.5625, + "learning_rate": 7.129896273338782e-07, + "loss": 1.7226, + "step": 15180 + }, + { + "epoch": 1.44, + "grad_norm": 52.25, + "learning_rate": 7.126114838030727e-07, + "loss": 1.6872, + "step": 15200 + }, + { + "epoch": 1.44, + "grad_norm": 129.375, + "learning_rate": 7.122333402722672e-07, + "loss": 1.7477, + "step": 15220 + }, + { + "epoch": 1.44, + "grad_norm": 60.46875, + "learning_rate": 7.118551967414615e-07, + "loss": 1.6581, + "step": 15240 + }, + { + "epoch": 1.44, + "grad_norm": 59.1875, + "learning_rate": 7.11477053210656e-07, + "loss": 1.7731, + "step": 15260 + }, + { + "epoch": 1.44, + "grad_norm": 70.8125, + "learning_rate": 7.110989096798505e-07, + "loss": 1.7503, + "step": 15280 + }, + { + "epoch": 1.45, + "grad_norm": 82.9375, + "learning_rate": 7.107207661490448e-07, + "loss": 1.6437, + "step": 15300 + }, + { + "epoch": 1.45, + "grad_norm": 66.375, + "learning_rate": 7.103426226182393e-07, + "loss": 1.782, + "step": 15320 + }, + { + "epoch": 1.45, + "grad_norm": 120.25, + "learning_rate": 7.099644790874337e-07, + "loss": 1.9366, + "step": 15340 + }, + { + "epoch": 1.45, + "grad_norm": 71.875, + "learning_rate": 7.095863355566282e-07, + "loss": 1.8142, + "step": 15360 + }, + { + "epoch": 1.45, + "grad_norm": 63.03125, + "learning_rate": 7.092081920258226e-07, + "loss": 1.8358, + "step": 15380 + }, + { + "epoch": 1.46, + "grad_norm": 59.0, + "learning_rate": 7.08830048495017e-07, + "loss": 1.7758, + "step": 15400 + }, + { + "epoch": 1.46, + "grad_norm": 65.0625, + "learning_rate": 7.084519049642115e-07, + "loss": 1.7326, + "step": 15420 + }, + { + "epoch": 1.46, + "grad_norm": 59.875, + "learning_rate": 7.08073761433406e-07, + "loss": 1.7505, + "step": 15440 + }, + { + "epoch": 1.46, + "grad_norm": 99.0, + "learning_rate": 7.076956179026003e-07, + "loss": 1.7379, + "step": 15460 + }, + { + "epoch": 1.46, + "grad_norm": 79.5625, + "learning_rate": 7.073174743717948e-07, + "loss": 1.8941, + "step": 15480 + }, + { + "epoch": 1.47, + "grad_norm": 70.375, + "learning_rate": 7.069393308409893e-07, + "loss": 1.8052, + "step": 15500 + }, + { + "epoch": 1.47, + "grad_norm": 93.9375, + "learning_rate": 7.065611873101837e-07, + "loss": 1.8094, + "step": 15520 + }, + { + "epoch": 1.47, + "grad_norm": 63.5625, + "learning_rate": 7.061830437793782e-07, + "loss": 1.7806, + "step": 15540 + }, + { + "epoch": 1.47, + "grad_norm": 57.65625, + "learning_rate": 7.058049002485727e-07, + "loss": 1.7275, + "step": 15560 + }, + { + "epoch": 1.47, + "grad_norm": 58.96875, + "learning_rate": 7.054267567177671e-07, + "loss": 1.6737, + "step": 15580 + }, + { + "epoch": 1.47, + "grad_norm": 66.75, + "learning_rate": 7.050486131869615e-07, + "loss": 1.7801, + "step": 15600 + }, + { + "epoch": 1.48, + "grad_norm": 48.59375, + "learning_rate": 7.046704696561559e-07, + "loss": 1.7297, + "step": 15620 + }, + { + "epoch": 1.48, + "grad_norm": 69.6875, + "learning_rate": 7.042923261253504e-07, + "loss": 1.7964, + "step": 15640 + }, + { + "epoch": 1.48, + "grad_norm": 49.28125, + "learning_rate": 7.039141825945449e-07, + "loss": 1.8027, + "step": 15660 + }, + { + "epoch": 1.48, + "grad_norm": 55.46875, + "learning_rate": 7.035360390637392e-07, + "loss": 1.7836, + "step": 15680 + }, + { + "epoch": 1.48, + "grad_norm": 80.5, + "learning_rate": 7.031578955329337e-07, + "loss": 1.7742, + "step": 15700 + }, + { + "epoch": 1.49, + "grad_norm": 63.03125, + "learning_rate": 7.027797520021282e-07, + "loss": 1.7406, + "step": 15720 + }, + { + "epoch": 1.49, + "grad_norm": 51.53125, + "learning_rate": 7.024016084713225e-07, + "loss": 1.7009, + "step": 15740 + }, + { + "epoch": 1.49, + "grad_norm": 113.25, + "learning_rate": 7.02023464940517e-07, + "loss": 1.7417, + "step": 15760 + }, + { + "epoch": 1.49, + "grad_norm": 58.96875, + "learning_rate": 7.016453214097115e-07, + "loss": 1.8331, + "step": 15780 + }, + { + "epoch": 1.49, + "grad_norm": 76.3125, + "learning_rate": 7.012671778789059e-07, + "loss": 1.702, + "step": 15800 + }, + { + "epoch": 1.5, + "grad_norm": 56.3125, + "learning_rate": 7.008890343481003e-07, + "loss": 1.8167, + "step": 15820 + }, + { + "epoch": 1.5, + "grad_norm": 56.78125, + "learning_rate": 7.005108908172948e-07, + "loss": 1.7286, + "step": 15840 + }, + { + "epoch": 1.5, + "grad_norm": 69.6875, + "learning_rate": 7.001327472864892e-07, + "loss": 1.7007, + "step": 15860 + }, + { + "epoch": 1.5, + "grad_norm": 66.125, + "learning_rate": 6.997546037556838e-07, + "loss": 1.8052, + "step": 15880 + }, + { + "epoch": 1.5, + "grad_norm": 57.15625, + "learning_rate": 6.993764602248782e-07, + "loss": 1.7057, + "step": 15900 + }, + { + "epoch": 1.51, + "grad_norm": 69.0, + "learning_rate": 6.989983166940726e-07, + "loss": 1.8017, + "step": 15920 + }, + { + "epoch": 1.51, + "grad_norm": 61.59375, + "learning_rate": 6.986201731632671e-07, + "loss": 1.8606, + "step": 15940 + }, + { + "epoch": 1.51, + "grad_norm": 61.96875, + "learning_rate": 6.982420296324614e-07, + "loss": 1.793, + "step": 15960 + }, + { + "epoch": 1.51, + "grad_norm": 72.0, + "learning_rate": 6.978638861016559e-07, + "loss": 1.7604, + "step": 15980 + }, + { + "epoch": 1.51, + "grad_norm": 67.625, + "learning_rate": 6.974857425708504e-07, + "loss": 1.6722, + "step": 16000 + }, + { + "epoch": 1.51, + "grad_norm": 62.40625, + "learning_rate": 6.971075990400448e-07, + "loss": 1.804, + "step": 16020 + }, + { + "epoch": 1.52, + "grad_norm": 58.84375, + "learning_rate": 6.967294555092392e-07, + "loss": 1.7244, + "step": 16040 + }, + { + "epoch": 1.52, + "grad_norm": 49.125, + "learning_rate": 6.963513119784337e-07, + "loss": 1.7763, + "step": 16060 + }, + { + "epoch": 1.52, + "grad_norm": 67.1875, + "learning_rate": 6.959731684476281e-07, + "loss": 1.7623, + "step": 16080 + }, + { + "epoch": 1.52, + "grad_norm": 62.1875, + "learning_rate": 6.955950249168225e-07, + "loss": 1.8598, + "step": 16100 + }, + { + "epoch": 1.52, + "grad_norm": 82.6875, + "learning_rate": 6.95216881386017e-07, + "loss": 1.789, + "step": 16120 + }, + { + "epoch": 1.53, + "grad_norm": 84.0, + "learning_rate": 6.948387378552114e-07, + "loss": 1.7023, + "step": 16140 + }, + { + "epoch": 1.53, + "grad_norm": 92.4375, + "learning_rate": 6.944605943244059e-07, + "loss": 1.7511, + "step": 16160 + }, + { + "epoch": 1.53, + "grad_norm": 69.125, + "learning_rate": 6.940824507936003e-07, + "loss": 1.8223, + "step": 16180 + }, + { + "epoch": 1.53, + "grad_norm": 61.15625, + "learning_rate": 6.937043072627947e-07, + "loss": 1.8385, + "step": 16200 + }, + { + "epoch": 1.53, + "grad_norm": 71.0, + "learning_rate": 6.933261637319893e-07, + "loss": 1.6804, + "step": 16220 + }, + { + "epoch": 1.54, + "grad_norm": 57.4375, + "learning_rate": 6.929480202011837e-07, + "loss": 1.8056, + "step": 16240 + }, + { + "epoch": 1.54, + "grad_norm": 78.875, + "learning_rate": 6.925698766703781e-07, + "loss": 1.8111, + "step": 16260 + }, + { + "epoch": 1.54, + "grad_norm": 64.125, + "learning_rate": 6.921917331395726e-07, + "loss": 1.6361, + "step": 16280 + }, + { + "epoch": 1.54, + "grad_norm": 75.125, + "learning_rate": 6.91813589608767e-07, + "loss": 1.7612, + "step": 16300 + }, + { + "epoch": 1.54, + "grad_norm": 64.25, + "learning_rate": 6.914354460779614e-07, + "loss": 1.7896, + "step": 16320 + }, + { + "epoch": 1.54, + "grad_norm": 58.96875, + "learning_rate": 6.910573025471559e-07, + "loss": 1.7171, + "step": 16340 + }, + { + "epoch": 1.55, + "grad_norm": 56.71875, + "learning_rate": 6.906791590163503e-07, + "loss": 1.7889, + "step": 16360 + }, + { + "epoch": 1.55, + "grad_norm": 53.3125, + "learning_rate": 6.903010154855448e-07, + "loss": 1.7833, + "step": 16380 + }, + { + "epoch": 1.55, + "grad_norm": 60.9375, + "learning_rate": 6.899228719547392e-07, + "loss": 1.8044, + "step": 16400 + }, + { + "epoch": 1.55, + "grad_norm": 64.8125, + "learning_rate": 6.895447284239336e-07, + "loss": 1.8078, + "step": 16420 + }, + { + "epoch": 1.55, + "grad_norm": 69.375, + "learning_rate": 6.891665848931281e-07, + "loss": 1.69, + "step": 16440 + }, + { + "epoch": 1.56, + "grad_norm": 52.5625, + "learning_rate": 6.887884413623226e-07, + "loss": 1.7741, + "step": 16460 + }, + { + "epoch": 1.56, + "grad_norm": 50.21875, + "learning_rate": 6.884102978315169e-07, + "loss": 1.8409, + "step": 16480 + }, + { + "epoch": 1.56, + "grad_norm": 55.34375, + "learning_rate": 6.880321543007114e-07, + "loss": 1.8224, + "step": 16500 + }, + { + "epoch": 1.56, + "grad_norm": 58.6875, + "learning_rate": 6.876540107699058e-07, + "loss": 1.7456, + "step": 16520 + }, + { + "epoch": 1.56, + "grad_norm": 52.8125, + "learning_rate": 6.872758672391002e-07, + "loss": 1.8105, + "step": 16540 + }, + { + "epoch": 1.57, + "grad_norm": 81.6875, + "learning_rate": 6.868977237082947e-07, + "loss": 1.863, + "step": 16560 + }, + { + "epoch": 1.57, + "grad_norm": 56.0625, + "learning_rate": 6.865195801774892e-07, + "loss": 1.717, + "step": 16580 + }, + { + "epoch": 1.57, + "grad_norm": 41.25, + "learning_rate": 6.861414366466837e-07, + "loss": 1.9108, + "step": 16600 + }, + { + "epoch": 1.57, + "grad_norm": 58.375, + "learning_rate": 6.857632931158781e-07, + "loss": 1.7419, + "step": 16620 + }, + { + "epoch": 1.57, + "grad_norm": 56.625, + "learning_rate": 6.853851495850725e-07, + "loss": 1.861, + "step": 16640 + }, + { + "epoch": 1.57, + "grad_norm": 62.6875, + "learning_rate": 6.85007006054267e-07, + "loss": 1.9066, + "step": 16660 + }, + { + "epoch": 1.58, + "grad_norm": 52.15625, + "learning_rate": 6.846288625234614e-07, + "loss": 1.8078, + "step": 16680 + }, + { + "epoch": 1.58, + "grad_norm": 70.9375, + "learning_rate": 6.842507189926558e-07, + "loss": 1.7877, + "step": 16700 + }, + { + "epoch": 1.58, + "grad_norm": 71.875, + "learning_rate": 6.838725754618503e-07, + "loss": 1.7093, + "step": 16720 + }, + { + "epoch": 1.58, + "grad_norm": 75.0, + "learning_rate": 6.834944319310448e-07, + "loss": 1.8284, + "step": 16740 + }, + { + "epoch": 1.58, + "grad_norm": 73.6875, + "learning_rate": 6.831162884002391e-07, + "loss": 1.6915, + "step": 16760 + }, + { + "epoch": 1.59, + "grad_norm": 66.625, + "learning_rate": 6.827381448694336e-07, + "loss": 1.8282, + "step": 16780 + }, + { + "epoch": 1.59, + "grad_norm": 65.4375, + "learning_rate": 6.823600013386281e-07, + "loss": 1.7428, + "step": 16800 + }, + { + "epoch": 1.59, + "grad_norm": 64.6875, + "learning_rate": 6.819818578078225e-07, + "loss": 1.7914, + "step": 16820 + }, + { + "epoch": 1.59, + "grad_norm": 94.0, + "learning_rate": 6.816037142770169e-07, + "loss": 1.7644, + "step": 16840 + }, + { + "epoch": 1.59, + "grad_norm": 61.6875, + "learning_rate": 6.812255707462113e-07, + "loss": 1.7501, + "step": 16860 + }, + { + "epoch": 1.6, + "grad_norm": 70.625, + "learning_rate": 6.808474272154058e-07, + "loss": 1.7679, + "step": 16880 + }, + { + "epoch": 1.6, + "grad_norm": 82.1875, + "learning_rate": 6.804692836846002e-07, + "loss": 1.7524, + "step": 16900 + }, + { + "epoch": 1.6, + "grad_norm": 43.09375, + "learning_rate": 6.800911401537947e-07, + "loss": 1.7659, + "step": 16920 + }, + { + "epoch": 1.6, + "grad_norm": 53.96875, + "learning_rate": 6.797129966229892e-07, + "loss": 1.6591, + "step": 16940 + }, + { + "epoch": 1.6, + "grad_norm": 51.46875, + "learning_rate": 6.793348530921837e-07, + "loss": 1.6777, + "step": 16960 + }, + { + "epoch": 1.61, + "grad_norm": 56.34375, + "learning_rate": 6.78956709561378e-07, + "loss": 1.7987, + "step": 16980 + }, + { + "epoch": 1.61, + "grad_norm": 86.1875, + "learning_rate": 6.785785660305725e-07, + "loss": 1.8583, + "step": 17000 + }, + { + "epoch": 1.61, + "grad_norm": 78.0625, + "learning_rate": 6.78200422499767e-07, + "loss": 1.8726, + "step": 17020 + }, + { + "epoch": 1.61, + "grad_norm": 88.375, + "learning_rate": 6.778222789689614e-07, + "loss": 1.8256, + "step": 17040 + }, + { + "epoch": 1.61, + "grad_norm": 54.96875, + "learning_rate": 6.774441354381558e-07, + "loss": 1.8137, + "step": 17060 + }, + { + "epoch": 1.61, + "grad_norm": 55.0, + "learning_rate": 6.770659919073503e-07, + "loss": 1.6887, + "step": 17080 + }, + { + "epoch": 1.62, + "grad_norm": 55.03125, + "learning_rate": 6.766878483765447e-07, + "loss": 1.807, + "step": 17100 + }, + { + "epoch": 1.62, + "grad_norm": 67.875, + "learning_rate": 6.763097048457391e-07, + "loss": 1.7649, + "step": 17120 + }, + { + "epoch": 1.62, + "grad_norm": 58.90625, + "learning_rate": 6.759315613149335e-07, + "loss": 1.7151, + "step": 17140 + }, + { + "epoch": 1.62, + "grad_norm": 72.8125, + "learning_rate": 6.75553417784128e-07, + "loss": 1.8062, + "step": 17160 + }, + { + "epoch": 1.62, + "grad_norm": 69.0, + "learning_rate": 6.751752742533225e-07, + "loss": 1.7687, + "step": 17180 + }, + { + "epoch": 1.63, + "grad_norm": 66.5, + "learning_rate": 6.747971307225168e-07, + "loss": 1.7421, + "step": 17200 + }, + { + "epoch": 1.63, + "grad_norm": 57.625, + "learning_rate": 6.744189871917113e-07, + "loss": 1.8331, + "step": 17220 + }, + { + "epoch": 1.63, + "grad_norm": 60.1875, + "learning_rate": 6.740408436609058e-07, + "loss": 1.7498, + "step": 17240 + }, + { + "epoch": 1.63, + "grad_norm": 51.65625, + "learning_rate": 6.736627001301001e-07, + "loss": 1.7102, + "step": 17260 + }, + { + "epoch": 1.63, + "grad_norm": 79.625, + "learning_rate": 6.732845565992947e-07, + "loss": 1.714, + "step": 17280 + }, + { + "epoch": 1.64, + "grad_norm": 52.46875, + "learning_rate": 6.729064130684892e-07, + "loss": 1.6849, + "step": 17300 + }, + { + "epoch": 1.64, + "grad_norm": 61.3125, + "learning_rate": 6.725282695376836e-07, + "loss": 1.8046, + "step": 17320 + }, + { + "epoch": 1.64, + "grad_norm": 52.0625, + "learning_rate": 6.72150126006878e-07, + "loss": 1.6312, + "step": 17340 + }, + { + "epoch": 1.64, + "grad_norm": 54.71875, + "learning_rate": 6.717719824760725e-07, + "loss": 1.6886, + "step": 17360 + }, + { + "epoch": 1.64, + "grad_norm": 51.375, + "learning_rate": 6.713938389452669e-07, + "loss": 1.754, + "step": 17380 + }, + { + "epoch": 1.64, + "grad_norm": 55.90625, + "learning_rate": 6.710156954144614e-07, + "loss": 1.7469, + "step": 17400 + }, + { + "epoch": 1.65, + "grad_norm": 61.78125, + "learning_rate": 6.706375518836558e-07, + "loss": 1.7955, + "step": 17420 + }, + { + "epoch": 1.65, + "grad_norm": 89.75, + "learning_rate": 6.702594083528502e-07, + "loss": 1.7665, + "step": 17440 + }, + { + "epoch": 1.65, + "grad_norm": 56.0625, + "learning_rate": 6.698812648220447e-07, + "loss": 1.8611, + "step": 17460 + }, + { + "epoch": 1.65, + "grad_norm": 63.625, + "learning_rate": 6.69503121291239e-07, + "loss": 1.7571, + "step": 17480 + }, + { + "epoch": 1.65, + "grad_norm": 70.375, + "learning_rate": 6.691249777604335e-07, + "loss": 1.8701, + "step": 17500 + }, + { + "epoch": 1.66, + "grad_norm": 91.1875, + "learning_rate": 6.68746834229628e-07, + "loss": 1.7109, + "step": 17520 + }, + { + "epoch": 1.66, + "grad_norm": 51.1875, + "learning_rate": 6.683686906988224e-07, + "loss": 1.7953, + "step": 17540 + }, + { + "epoch": 1.66, + "grad_norm": 57.1875, + "learning_rate": 6.679905471680168e-07, + "loss": 1.6835, + "step": 17560 + }, + { + "epoch": 1.66, + "grad_norm": 62.1875, + "learning_rate": 6.676124036372113e-07, + "loss": 1.8026, + "step": 17580 + }, + { + "epoch": 1.66, + "grad_norm": 58.125, + "learning_rate": 6.672342601064057e-07, + "loss": 1.698, + "step": 17600 + }, + { + "epoch": 1.67, + "grad_norm": 53.4375, + "learning_rate": 6.668561165756003e-07, + "loss": 1.7425, + "step": 17620 + }, + { + "epoch": 1.67, + "grad_norm": 58.4375, + "learning_rate": 6.664779730447947e-07, + "loss": 1.8302, + "step": 17640 + }, + { + "epoch": 1.67, + "grad_norm": 57.84375, + "learning_rate": 6.660998295139891e-07, + "loss": 1.8451, + "step": 17660 + }, + { + "epoch": 1.67, + "grad_norm": 60.84375, + "learning_rate": 6.657216859831836e-07, + "loss": 1.7618, + "step": 17680 + }, + { + "epoch": 1.67, + "grad_norm": 64.25, + "learning_rate": 6.65343542452378e-07, + "loss": 1.7403, + "step": 17700 + }, + { + "epoch": 1.68, + "grad_norm": 81.0, + "learning_rate": 6.649653989215724e-07, + "loss": 1.7854, + "step": 17720 + }, + { + "epoch": 1.68, + "grad_norm": 57.125, + "learning_rate": 6.645872553907669e-07, + "loss": 1.7188, + "step": 17740 + }, + { + "epoch": 1.68, + "grad_norm": 75.1875, + "learning_rate": 6.642091118599614e-07, + "loss": 1.7599, + "step": 17760 + }, + { + "epoch": 1.68, + "grad_norm": 62.75, + "learning_rate": 6.638309683291557e-07, + "loss": 1.7381, + "step": 17780 + }, + { + "epoch": 1.68, + "grad_norm": 52.96875, + "learning_rate": 6.634528247983502e-07, + "loss": 1.7222, + "step": 17800 + }, + { + "epoch": 1.68, + "grad_norm": 54.875, + "learning_rate": 6.630746812675446e-07, + "loss": 1.808, + "step": 17820 + }, + { + "epoch": 1.69, + "grad_norm": 51.03125, + "learning_rate": 6.62696537736739e-07, + "loss": 1.599, + "step": 17840 + }, + { + "epoch": 1.69, + "grad_norm": 59.09375, + "learning_rate": 6.623183942059335e-07, + "loss": 1.808, + "step": 17860 + }, + { + "epoch": 1.69, + "grad_norm": 77.6875, + "learning_rate": 6.619402506751279e-07, + "loss": 1.7878, + "step": 17880 + }, + { + "epoch": 1.69, + "grad_norm": 63.5625, + "learning_rate": 6.615621071443224e-07, + "loss": 1.7078, + "step": 17900 + }, + { + "epoch": 1.69, + "grad_norm": 47.6875, + "learning_rate": 6.611839636135168e-07, + "loss": 1.7639, + "step": 17920 + }, + { + "epoch": 1.7, + "grad_norm": 63.34375, + "learning_rate": 6.608058200827112e-07, + "loss": 1.8108, + "step": 17940 + }, + { + "epoch": 1.7, + "grad_norm": 52.8125, + "learning_rate": 6.604276765519058e-07, + "loss": 1.8749, + "step": 17960 + }, + { + "epoch": 1.7, + "grad_norm": 63.28125, + "learning_rate": 6.600495330211003e-07, + "loss": 1.8105, + "step": 17980 + }, + { + "epoch": 1.7, + "grad_norm": 68.875, + "learning_rate": 6.596713894902946e-07, + "loss": 1.6754, + "step": 18000 + }, + { + "epoch": 1.7, + "grad_norm": 88.0625, + "learning_rate": 6.592932459594891e-07, + "loss": 1.7055, + "step": 18020 + }, + { + "epoch": 1.71, + "grad_norm": 69.5625, + "learning_rate": 6.589151024286836e-07, + "loss": 1.7658, + "step": 18040 + }, + { + "epoch": 1.71, + "grad_norm": 48.65625, + "learning_rate": 6.58536958897878e-07, + "loss": 1.747, + "step": 18060 + }, + { + "epoch": 1.71, + "grad_norm": 49.3125, + "learning_rate": 6.581588153670724e-07, + "loss": 1.8633, + "step": 18080 + }, + { + "epoch": 1.71, + "grad_norm": 68.5625, + "learning_rate": 6.577806718362668e-07, + "loss": 1.7298, + "step": 18100 + }, + { + "epoch": 1.71, + "grad_norm": 56.59375, + "learning_rate": 6.574025283054613e-07, + "loss": 1.6837, + "step": 18120 + }, + { + "epoch": 1.71, + "grad_norm": 90.5625, + "learning_rate": 6.570243847746557e-07, + "loss": 1.7123, + "step": 18140 + }, + { + "epoch": 1.72, + "grad_norm": 136.875, + "learning_rate": 6.566462412438501e-07, + "loss": 1.7742, + "step": 18160 + }, + { + "epoch": 1.72, + "grad_norm": 65.0, + "learning_rate": 6.562680977130446e-07, + "loss": 1.7315, + "step": 18180 + }, + { + "epoch": 1.72, + "grad_norm": 57.875, + "learning_rate": 6.558899541822391e-07, + "loss": 1.7881, + "step": 18200 + }, + { + "epoch": 1.72, + "grad_norm": 61.90625, + "learning_rate": 6.555118106514334e-07, + "loss": 1.6969, + "step": 18220 + }, + { + "epoch": 1.72, + "grad_norm": 65.8125, + "learning_rate": 6.551336671206279e-07, + "loss": 1.7261, + "step": 18240 + }, + { + "epoch": 1.73, + "grad_norm": 59.03125, + "learning_rate": 6.547555235898224e-07, + "loss": 1.7779, + "step": 18260 + }, + { + "epoch": 1.73, + "grad_norm": 52.09375, + "learning_rate": 6.543773800590167e-07, + "loss": 1.7624, + "step": 18280 + }, + { + "epoch": 1.73, + "grad_norm": 79.9375, + "learning_rate": 6.539992365282112e-07, + "loss": 1.8038, + "step": 18300 + }, + { + "epoch": 1.73, + "grad_norm": 85.0, + "learning_rate": 6.536210929974058e-07, + "loss": 1.8728, + "step": 18320 + }, + { + "epoch": 1.73, + "grad_norm": 63.71875, + "learning_rate": 6.532429494666002e-07, + "loss": 1.8022, + "step": 18340 + }, + { + "epoch": 1.74, + "grad_norm": 67.3125, + "learning_rate": 6.528648059357946e-07, + "loss": 1.7176, + "step": 18360 + }, + { + "epoch": 1.74, + "grad_norm": 69.8125, + "learning_rate": 6.524866624049891e-07, + "loss": 1.8417, + "step": 18380 + }, + { + "epoch": 1.74, + "grad_norm": 57.25, + "learning_rate": 6.521085188741835e-07, + "loss": 1.8464, + "step": 18400 + }, + { + "epoch": 1.74, + "grad_norm": 56.71875, + "learning_rate": 6.51730375343378e-07, + "loss": 1.6192, + "step": 18420 + }, + { + "epoch": 1.74, + "grad_norm": 79.1875, + "learning_rate": 6.513522318125723e-07, + "loss": 1.8753, + "step": 18440 + }, + { + "epoch": 1.75, + "grad_norm": 53.375, + "learning_rate": 6.509740882817668e-07, + "loss": 1.6674, + "step": 18460 + }, + { + "epoch": 1.75, + "grad_norm": 63.75, + "learning_rate": 6.505959447509613e-07, + "loss": 1.7284, + "step": 18480 + }, + { + "epoch": 1.75, + "grad_norm": 72.5, + "learning_rate": 6.502178012201556e-07, + "loss": 1.8274, + "step": 18500 + }, + { + "epoch": 1.75, + "grad_norm": 51.3125, + "learning_rate": 6.498396576893501e-07, + "loss": 1.7632, + "step": 18520 + }, + { + "epoch": 1.75, + "grad_norm": 51.65625, + "learning_rate": 6.494615141585446e-07, + "loss": 1.7839, + "step": 18540 + }, + { + "epoch": 1.75, + "grad_norm": 53.46875, + "learning_rate": 6.49083370627739e-07, + "loss": 1.6718, + "step": 18560 + }, + { + "epoch": 1.76, + "grad_norm": 71.0625, + "learning_rate": 6.487052270969334e-07, + "loss": 1.6606, + "step": 18580 + }, + { + "epoch": 1.76, + "grad_norm": 62.9375, + "learning_rate": 6.483270835661279e-07, + "loss": 1.8147, + "step": 18600 + }, + { + "epoch": 1.76, + "grad_norm": 63.1875, + "learning_rate": 6.479489400353223e-07, + "loss": 1.8021, + "step": 18620 + }, + { + "epoch": 1.76, + "grad_norm": 54.53125, + "learning_rate": 6.475707965045167e-07, + "loss": 1.6672, + "step": 18640 + }, + { + "epoch": 1.76, + "grad_norm": 69.0, + "learning_rate": 6.471926529737113e-07, + "loss": 1.744, + "step": 18660 + }, + { + "epoch": 1.77, + "grad_norm": 69.5625, + "learning_rate": 6.468145094429057e-07, + "loss": 1.7739, + "step": 18680 + }, + { + "epoch": 1.77, + "grad_norm": 53.875, + "learning_rate": 6.464363659121002e-07, + "loss": 1.7764, + "step": 18700 + }, + { + "epoch": 1.77, + "grad_norm": 54.0625, + "learning_rate": 6.460582223812946e-07, + "loss": 1.7612, + "step": 18720 + }, + { + "epoch": 1.77, + "grad_norm": 64.4375, + "learning_rate": 6.45680078850489e-07, + "loss": 1.632, + "step": 18740 + }, + { + "epoch": 1.77, + "grad_norm": 91.1875, + "learning_rate": 6.453019353196835e-07, + "loss": 1.8534, + "step": 18760 + }, + { + "epoch": 1.78, + "grad_norm": 95.5625, + "learning_rate": 6.449237917888779e-07, + "loss": 1.8283, + "step": 18780 + }, + { + "epoch": 1.78, + "grad_norm": 68.5625, + "learning_rate": 6.445456482580723e-07, + "loss": 1.7274, + "step": 18800 + }, + { + "epoch": 1.78, + "grad_norm": 75.875, + "learning_rate": 6.441675047272668e-07, + "loss": 1.7398, + "step": 18820 + }, + { + "epoch": 1.78, + "grad_norm": 73.5625, + "learning_rate": 6.437893611964612e-07, + "loss": 1.717, + "step": 18840 + }, + { + "epoch": 1.78, + "grad_norm": 62.34375, + "learning_rate": 6.434112176656556e-07, + "loss": 1.7714, + "step": 18860 + }, + { + "epoch": 1.78, + "grad_norm": 86.5625, + "learning_rate": 6.430330741348501e-07, + "loss": 1.8233, + "step": 18880 + }, + { + "epoch": 1.79, + "grad_norm": 39.03125, + "learning_rate": 6.426549306040445e-07, + "loss": 1.7017, + "step": 18900 + }, + { + "epoch": 1.79, + "grad_norm": 74.8125, + "learning_rate": 6.42276787073239e-07, + "loss": 1.7794, + "step": 18920 + }, + { + "epoch": 1.79, + "grad_norm": 58.96875, + "learning_rate": 6.418986435424334e-07, + "loss": 1.752, + "step": 18940 + }, + { + "epoch": 1.79, + "grad_norm": 56.6875, + "learning_rate": 6.415205000116278e-07, + "loss": 1.6723, + "step": 18960 + }, + { + "epoch": 1.79, + "grad_norm": 76.3125, + "learning_rate": 6.411423564808223e-07, + "loss": 1.7992, + "step": 18980 + }, + { + "epoch": 1.8, + "grad_norm": 46.125, + "learning_rate": 6.407642129500166e-07, + "loss": 1.8077, + "step": 19000 + }, + { + "epoch": 1.8, + "grad_norm": 68.0, + "learning_rate": 6.403860694192112e-07, + "loss": 1.6902, + "step": 19020 + }, + { + "epoch": 1.8, + "grad_norm": 66.4375, + "learning_rate": 6.400079258884057e-07, + "loss": 1.7471, + "step": 19040 + }, + { + "epoch": 1.8, + "grad_norm": 72.375, + "learning_rate": 6.396297823576001e-07, + "loss": 1.7479, + "step": 19060 + }, + { + "epoch": 1.8, + "grad_norm": 109.5625, + "learning_rate": 6.392516388267945e-07, + "loss": 1.7985, + "step": 19080 + }, + { + "epoch": 1.81, + "grad_norm": 74.625, + "learning_rate": 6.38873495295989e-07, + "loss": 1.7164, + "step": 19100 + }, + { + "epoch": 1.81, + "grad_norm": 65.875, + "learning_rate": 6.384953517651834e-07, + "loss": 1.6436, + "step": 19120 + }, + { + "epoch": 1.81, + "grad_norm": 61.3125, + "learning_rate": 6.381172082343779e-07, + "loss": 1.6285, + "step": 19140 + }, + { + "epoch": 1.81, + "grad_norm": 71.25, + "learning_rate": 6.377390647035723e-07, + "loss": 1.7548, + "step": 19160 + }, + { + "epoch": 1.81, + "grad_norm": 57.875, + "learning_rate": 6.373609211727667e-07, + "loss": 1.6347, + "step": 19180 + }, + { + "epoch": 1.82, + "grad_norm": 46.90625, + "learning_rate": 6.369827776419612e-07, + "loss": 1.7618, + "step": 19200 + }, + { + "epoch": 1.82, + "grad_norm": 54.84375, + "learning_rate": 6.366046341111557e-07, + "loss": 1.7912, + "step": 19220 + }, + { + "epoch": 1.82, + "grad_norm": 55.0625, + "learning_rate": 6.3622649058035e-07, + "loss": 1.7301, + "step": 19240 + }, + { + "epoch": 1.82, + "grad_norm": 48.1875, + "learning_rate": 6.358483470495445e-07, + "loss": 1.7273, + "step": 19260 + }, + { + "epoch": 1.82, + "grad_norm": 76.125, + "learning_rate": 6.35470203518739e-07, + "loss": 1.7014, + "step": 19280 + }, + { + "epoch": 1.82, + "grad_norm": 49.40625, + "learning_rate": 6.350920599879333e-07, + "loss": 1.8441, + "step": 19300 + }, + { + "epoch": 1.83, + "grad_norm": 76.75, + "learning_rate": 6.347139164571278e-07, + "loss": 1.7316, + "step": 19320 + }, + { + "epoch": 1.83, + "grad_norm": 78.25, + "learning_rate": 6.343357729263222e-07, + "loss": 1.7168, + "step": 19340 + }, + { + "epoch": 1.83, + "grad_norm": 68.0, + "learning_rate": 6.339576293955168e-07, + "loss": 1.8219, + "step": 19360 + }, + { + "epoch": 1.83, + "grad_norm": 55.75, + "learning_rate": 6.335794858647112e-07, + "loss": 1.7437, + "step": 19380 + }, + { + "epoch": 1.83, + "grad_norm": 56.90625, + "learning_rate": 6.332013423339056e-07, + "loss": 1.7404, + "step": 19400 + }, + { + "epoch": 1.84, + "grad_norm": 51.125, + "learning_rate": 6.328231988031001e-07, + "loss": 1.731, + "step": 19420 + }, + { + "epoch": 1.84, + "grad_norm": 59.0625, + "learning_rate": 6.324450552722946e-07, + "loss": 1.7489, + "step": 19440 + }, + { + "epoch": 1.84, + "grad_norm": 93.5625, + "learning_rate": 6.320669117414889e-07, + "loss": 1.7179, + "step": 19460 + }, + { + "epoch": 1.84, + "grad_norm": 77.3125, + "learning_rate": 6.316887682106834e-07, + "loss": 1.7455, + "step": 19480 + }, + { + "epoch": 1.84, + "grad_norm": 60.21875, + "learning_rate": 6.313106246798779e-07, + "loss": 1.7477, + "step": 19500 + }, + { + "epoch": 1.85, + "grad_norm": 67.875, + "learning_rate": 6.309324811490722e-07, + "loss": 1.8577, + "step": 19520 + }, + { + "epoch": 1.85, + "grad_norm": 84.9375, + "learning_rate": 6.305543376182667e-07, + "loss": 1.8332, + "step": 19540 + }, + { + "epoch": 1.85, + "grad_norm": 69.3125, + "learning_rate": 6.301761940874612e-07, + "loss": 1.7734, + "step": 19560 + }, + { + "epoch": 1.85, + "grad_norm": 55.9375, + "learning_rate": 6.297980505566556e-07, + "loss": 1.7441, + "step": 19580 + }, + { + "epoch": 1.85, + "grad_norm": 51.25, + "learning_rate": 6.2941990702585e-07, + "loss": 1.7063, + "step": 19600 + }, + { + "epoch": 1.85, + "grad_norm": 62.21875, + "learning_rate": 6.290417634950445e-07, + "loss": 1.8064, + "step": 19620 + }, + { + "epoch": 1.86, + "grad_norm": 82.3125, + "learning_rate": 6.286636199642389e-07, + "loss": 1.9288, + "step": 19640 + }, + { + "epoch": 1.86, + "grad_norm": 97.4375, + "learning_rate": 6.282854764334333e-07, + "loss": 1.7964, + "step": 19660 + }, + { + "epoch": 1.86, + "grad_norm": 71.5, + "learning_rate": 6.279073329026277e-07, + "loss": 1.7302, + "step": 19680 + }, + { + "epoch": 1.86, + "grad_norm": 54.21875, + "learning_rate": 6.275291893718222e-07, + "loss": 1.7965, + "step": 19700 + }, + { + "epoch": 1.86, + "grad_norm": 68.4375, + "learning_rate": 6.271510458410168e-07, + "loss": 1.735, + "step": 19720 + }, + { + "epoch": 1.87, + "grad_norm": 65.0625, + "learning_rate": 6.267729023102111e-07, + "loss": 1.7821, + "step": 19740 + }, + { + "epoch": 1.87, + "grad_norm": 77.1875, + "learning_rate": 6.263947587794056e-07, + "loss": 1.6467, + "step": 19760 + }, + { + "epoch": 1.87, + "grad_norm": 92.875, + "learning_rate": 6.260166152486001e-07, + "loss": 1.8485, + "step": 19780 + }, + { + "epoch": 1.87, + "grad_norm": 43.9375, + "learning_rate": 6.256384717177945e-07, + "loss": 1.7091, + "step": 19800 + }, + { + "epoch": 1.87, + "grad_norm": 53.25, + "learning_rate": 6.252603281869889e-07, + "loss": 1.8332, + "step": 19820 + }, + { + "epoch": 1.88, + "grad_norm": 48.28125, + "learning_rate": 6.248821846561834e-07, + "loss": 1.8508, + "step": 19840 + }, + { + "epoch": 1.88, + "grad_norm": 53.46875, + "learning_rate": 6.245040411253778e-07, + "loss": 1.744, + "step": 19860 + }, + { + "epoch": 1.88, + "grad_norm": 64.9375, + "learning_rate": 6.241258975945722e-07, + "loss": 1.7944, + "step": 19880 + }, + { + "epoch": 1.88, + "grad_norm": 59.375, + "learning_rate": 6.237477540637667e-07, + "loss": 1.7726, + "step": 19900 + }, + { + "epoch": 1.88, + "grad_norm": 45.1875, + "learning_rate": 6.233696105329611e-07, + "loss": 1.778, + "step": 19920 + }, + { + "epoch": 1.89, + "grad_norm": 49.9375, + "learning_rate": 6.229914670021556e-07, + "loss": 1.7223, + "step": 19940 + }, + { + "epoch": 1.89, + "grad_norm": 69.0625, + "learning_rate": 6.226133234713499e-07, + "loss": 1.807, + "step": 19960 + }, + { + "epoch": 1.89, + "grad_norm": 45.78125, + "learning_rate": 6.222351799405444e-07, + "loss": 1.8191, + "step": 19980 + }, + { + "epoch": 1.89, + "grad_norm": 62.15625, + "learning_rate": 6.218570364097389e-07, + "loss": 1.8354, + "step": 20000 + }, + { + "epoch": 1.89, + "grad_norm": 75.625, + "learning_rate": 6.214788928789332e-07, + "loss": 1.7356, + "step": 20020 + }, + { + "epoch": 1.89, + "grad_norm": 55.34375, + "learning_rate": 6.211007493481277e-07, + "loss": 1.7371, + "step": 20040 + }, + { + "epoch": 1.9, + "grad_norm": 57.78125, + "learning_rate": 6.207226058173223e-07, + "loss": 1.7965, + "step": 20060 + }, + { + "epoch": 1.9, + "grad_norm": 110.1875, + "learning_rate": 6.203444622865167e-07, + "loss": 1.6851, + "step": 20080 + }, + { + "epoch": 1.9, + "grad_norm": 74.1875, + "learning_rate": 6.199663187557111e-07, + "loss": 1.7433, + "step": 20100 + }, + { + "epoch": 1.9, + "grad_norm": 54.46875, + "learning_rate": 6.195881752249056e-07, + "loss": 1.7359, + "step": 20120 + }, + { + "epoch": 1.9, + "grad_norm": 80.375, + "learning_rate": 6.192100316941e-07, + "loss": 1.7647, + "step": 20140 + }, + { + "epoch": 1.91, + "grad_norm": 67.625, + "learning_rate": 6.188318881632945e-07, + "loss": 1.6704, + "step": 20160 + }, + { + "epoch": 1.91, + "grad_norm": 78.1875, + "learning_rate": 6.184537446324889e-07, + "loss": 1.8029, + "step": 20180 + }, + { + "epoch": 1.91, + "grad_norm": 55.59375, + "learning_rate": 6.180756011016833e-07, + "loss": 1.7328, + "step": 20200 + }, + { + "epoch": 1.91, + "grad_norm": 75.9375, + "learning_rate": 6.176974575708778e-07, + "loss": 1.7889, + "step": 20220 + }, + { + "epoch": 1.91, + "grad_norm": 51.8125, + "learning_rate": 6.173193140400723e-07, + "loss": 1.8524, + "step": 20240 + }, + { + "epoch": 1.92, + "grad_norm": 52.75, + "learning_rate": 6.169411705092666e-07, + "loss": 1.8042, + "step": 20260 + }, + { + "epoch": 1.92, + "grad_norm": 51.3125, + "learning_rate": 6.165630269784611e-07, + "loss": 1.8007, + "step": 20280 + }, + { + "epoch": 1.92, + "grad_norm": 70.8125, + "learning_rate": 6.161848834476555e-07, + "loss": 1.7286, + "step": 20300 + }, + { + "epoch": 1.92, + "grad_norm": 61.34375, + "learning_rate": 6.158067399168499e-07, + "loss": 1.7664, + "step": 20320 + }, + { + "epoch": 1.92, + "grad_norm": 62.375, + "learning_rate": 6.154285963860444e-07, + "loss": 1.786, + "step": 20340 + }, + { + "epoch": 1.92, + "grad_norm": 81.0625, + "learning_rate": 6.150504528552388e-07, + "loss": 1.7736, + "step": 20360 + }, + { + "epoch": 1.93, + "grad_norm": 63.65625, + "learning_rate": 6.146723093244333e-07, + "loss": 1.8188, + "step": 20380 + }, + { + "epoch": 1.93, + "grad_norm": 56.34375, + "learning_rate": 6.142941657936277e-07, + "loss": 1.8596, + "step": 20400 + }, + { + "epoch": 1.93, + "grad_norm": 59.5625, + "learning_rate": 6.139160222628222e-07, + "loss": 1.7769, + "step": 20420 + }, + { + "epoch": 1.93, + "grad_norm": 79.875, + "learning_rate": 6.135378787320167e-07, + "loss": 1.7049, + "step": 20440 + }, + { + "epoch": 1.93, + "grad_norm": 49.78125, + "learning_rate": 6.131597352012112e-07, + "loss": 1.7867, + "step": 20460 + }, + { + "epoch": 1.94, + "grad_norm": 51.03125, + "learning_rate": 6.127815916704055e-07, + "loss": 1.7805, + "step": 20480 + }, + { + "epoch": 1.94, + "grad_norm": 65.25, + "learning_rate": 6.124034481396e-07, + "loss": 1.7941, + "step": 20500 + }, + { + "epoch": 1.94, + "grad_norm": 52.46875, + "learning_rate": 6.120253046087945e-07, + "loss": 1.7872, + "step": 20520 + }, + { + "epoch": 1.94, + "grad_norm": 94.625, + "learning_rate": 6.116471610779888e-07, + "loss": 1.8097, + "step": 20540 + }, + { + "epoch": 1.94, + "grad_norm": 50.21875, + "learning_rate": 6.112690175471833e-07, + "loss": 1.7045, + "step": 20560 + }, + { + "epoch": 1.95, + "grad_norm": 72.25, + "learning_rate": 6.108908740163778e-07, + "loss": 1.7775, + "step": 20580 + }, + { + "epoch": 1.95, + "grad_norm": 68.625, + "learning_rate": 6.105127304855722e-07, + "loss": 1.7464, + "step": 20600 + }, + { + "epoch": 1.95, + "grad_norm": 50.8125, + "learning_rate": 6.101345869547666e-07, + "loss": 1.7768, + "step": 20620 + }, + { + "epoch": 1.95, + "grad_norm": 51.28125, + "learning_rate": 6.09756443423961e-07, + "loss": 1.7388, + "step": 20640 + }, + { + "epoch": 1.95, + "grad_norm": 101.5, + "learning_rate": 6.093782998931555e-07, + "loss": 1.7464, + "step": 20660 + }, + { + "epoch": 1.96, + "grad_norm": 77.5625, + "learning_rate": 6.090001563623499e-07, + "loss": 1.7987, + "step": 20680 + }, + { + "epoch": 1.96, + "grad_norm": 111.0625, + "learning_rate": 6.086220128315443e-07, + "loss": 1.6151, + "step": 20700 + }, + { + "epoch": 1.96, + "grad_norm": 74.5625, + "learning_rate": 6.082438693007388e-07, + "loss": 1.8307, + "step": 20720 + }, + { + "epoch": 1.96, + "grad_norm": 98.375, + "learning_rate": 6.078657257699333e-07, + "loss": 1.8152, + "step": 20740 + }, + { + "epoch": 1.96, + "grad_norm": 60.25, + "learning_rate": 6.074875822391277e-07, + "loss": 1.753, + "step": 20760 + }, + { + "epoch": 1.96, + "grad_norm": 72.75, + "learning_rate": 6.071094387083222e-07, + "loss": 1.8042, + "step": 20780 + }, + { + "epoch": 1.97, + "grad_norm": 52.25, + "learning_rate": 6.067312951775167e-07, + "loss": 1.8273, + "step": 20800 + }, + { + "epoch": 1.97, + "grad_norm": 48.28125, + "learning_rate": 6.063531516467111e-07, + "loss": 1.7648, + "step": 20820 + }, + { + "epoch": 1.97, + "grad_norm": 59.53125, + "learning_rate": 6.059750081159055e-07, + "loss": 1.6874, + "step": 20840 + }, + { + "epoch": 1.97, + "grad_norm": 54.3125, + "learning_rate": 6.055968645851e-07, + "loss": 1.6971, + "step": 20860 + }, + { + "epoch": 1.97, + "grad_norm": 76.0625, + "learning_rate": 6.052187210542944e-07, + "loss": 1.7653, + "step": 20880 + }, + { + "epoch": 1.98, + "grad_norm": 51.34375, + "learning_rate": 6.048405775234888e-07, + "loss": 1.7038, + "step": 20900 + }, + { + "epoch": 1.98, + "grad_norm": 58.09375, + "learning_rate": 6.044624339926832e-07, + "loss": 1.8158, + "step": 20920 + }, + { + "epoch": 1.98, + "grad_norm": 54.375, + "learning_rate": 6.040842904618777e-07, + "loss": 1.8113, + "step": 20940 + }, + { + "epoch": 1.98, + "grad_norm": 87.3125, + "learning_rate": 6.037061469310722e-07, + "loss": 1.7534, + "step": 20960 + }, + { + "epoch": 1.98, + "grad_norm": 58.4375, + "learning_rate": 6.033280034002665e-07, + "loss": 1.7212, + "step": 20980 + }, + { + "epoch": 1.99, + "grad_norm": 94.4375, + "learning_rate": 6.02949859869461e-07, + "loss": 1.745, + "step": 21000 + }, + { + "epoch": 1.99, + "grad_norm": 49.4375, + "learning_rate": 6.025717163386555e-07, + "loss": 1.7588, + "step": 21020 + }, + { + "epoch": 1.99, + "grad_norm": 70.8125, + "learning_rate": 6.021935728078498e-07, + "loss": 1.8333, + "step": 21040 + }, + { + "epoch": 1.99, + "grad_norm": 77.375, + "learning_rate": 6.018154292770443e-07, + "loss": 1.6461, + "step": 21060 + }, + { + "epoch": 1.99, + "grad_norm": 65.1875, + "learning_rate": 6.014372857462388e-07, + "loss": 1.7415, + "step": 21080 + }, + { + "epoch": 1.99, + "grad_norm": 56.25, + "learning_rate": 6.010591422154332e-07, + "loss": 1.7088, + "step": 21100 + }, + { + "epoch": 2.0, + "grad_norm": 60.84375, + "learning_rate": 6.006809986846277e-07, + "loss": 1.6742, + "step": 21120 + }, + { + "epoch": 2.0, + "grad_norm": 62.03125, + "learning_rate": 6.003028551538222e-07, + "loss": 1.741, + "step": 21140 + }, + { + "epoch": 2.0, + "grad_norm": 48.9375, + "learning_rate": 5.999247116230166e-07, + "loss": 1.832, + "step": 21160 + }, + { + "epoch": 2.0, + "grad_norm": 54.46875, + "learning_rate": 5.995465680922111e-07, + "loss": 1.5626, + "step": 21180 + }, + { + "epoch": 2.0, + "grad_norm": 75.375, + "learning_rate": 5.991684245614055e-07, + "loss": 1.6111, + "step": 21200 + }, + { + "epoch": 2.01, + "grad_norm": 63.40625, + "learning_rate": 5.987902810305999e-07, + "loss": 1.6103, + "step": 21220 + }, + { + "epoch": 2.01, + "grad_norm": 44.34375, + "learning_rate": 5.984121374997944e-07, + "loss": 1.5554, + "step": 21240 + }, + { + "epoch": 2.01, + "grad_norm": 75.3125, + "learning_rate": 5.980339939689887e-07, + "loss": 1.5105, + "step": 21260 + }, + { + "epoch": 2.01, + "grad_norm": 63.15625, + "learning_rate": 5.976558504381832e-07, + "loss": 1.5628, + "step": 21280 + }, + { + "epoch": 2.01, + "grad_norm": 59.46875, + "learning_rate": 5.972777069073777e-07, + "loss": 1.5626, + "step": 21300 + }, + { + "epoch": 2.02, + "grad_norm": 51.15625, + "learning_rate": 5.968995633765721e-07, + "loss": 1.5411, + "step": 21320 + }, + { + "epoch": 2.02, + "grad_norm": 59.5625, + "learning_rate": 5.965214198457665e-07, + "loss": 1.613, + "step": 21340 + }, + { + "epoch": 2.02, + "grad_norm": 46.28125, + "learning_rate": 5.96143276314961e-07, + "loss": 1.5999, + "step": 21360 + }, + { + "epoch": 2.02, + "grad_norm": 53.96875, + "learning_rate": 5.957651327841554e-07, + "loss": 1.7679, + "step": 21380 + }, + { + "epoch": 2.02, + "grad_norm": 46.75, + "learning_rate": 5.953869892533499e-07, + "loss": 1.5461, + "step": 21400 + }, + { + "epoch": 2.02, + "grad_norm": 56.6875, + "learning_rate": 5.950088457225443e-07, + "loss": 1.5189, + "step": 21420 + }, + { + "epoch": 2.03, + "grad_norm": 90.75, + "learning_rate": 5.946307021917387e-07, + "loss": 1.5582, + "step": 21440 + }, + { + "epoch": 2.03, + "grad_norm": 76.0, + "learning_rate": 5.942525586609333e-07, + "loss": 1.4733, + "step": 21460 + }, + { + "epoch": 2.03, + "grad_norm": 51.59375, + "learning_rate": 5.938744151301278e-07, + "loss": 1.6764, + "step": 21480 + }, + { + "epoch": 2.03, + "grad_norm": 116.375, + "learning_rate": 5.934962715993221e-07, + "loss": 1.5825, + "step": 21500 + }, + { + "epoch": 2.03, + "grad_norm": 57.78125, + "learning_rate": 5.931181280685166e-07, + "loss": 1.5275, + "step": 21520 + }, + { + "epoch": 2.04, + "grad_norm": 56.53125, + "learning_rate": 5.92739984537711e-07, + "loss": 1.519, + "step": 21540 + }, + { + "epoch": 2.04, + "grad_norm": 61.125, + "learning_rate": 5.923618410069054e-07, + "loss": 1.5093, + "step": 21560 + }, + { + "epoch": 2.04, + "grad_norm": 65.625, + "learning_rate": 5.919836974760999e-07, + "loss": 1.5351, + "step": 21580 + }, + { + "epoch": 2.04, + "grad_norm": 53.6875, + "learning_rate": 5.916055539452943e-07, + "loss": 1.585, + "step": 21600 + }, + { + "epoch": 2.04, + "grad_norm": 60.75, + "learning_rate": 5.912274104144888e-07, + "loss": 1.5854, + "step": 21620 + }, + { + "epoch": 2.05, + "grad_norm": 57.4375, + "learning_rate": 5.908492668836832e-07, + "loss": 1.4904, + "step": 21640 + }, + { + "epoch": 2.05, + "grad_norm": 72.5625, + "learning_rate": 5.904711233528776e-07, + "loss": 1.5939, + "step": 21660 + }, + { + "epoch": 2.05, + "grad_norm": 73.1875, + "learning_rate": 5.900929798220721e-07, + "loss": 1.4732, + "step": 21680 + }, + { + "epoch": 2.05, + "grad_norm": 68.0625, + "learning_rate": 5.897148362912665e-07, + "loss": 1.4965, + "step": 21700 + }, + { + "epoch": 2.05, + "grad_norm": 61.15625, + "learning_rate": 5.893366927604609e-07, + "loss": 1.6585, + "step": 21720 + }, + { + "epoch": 2.06, + "grad_norm": 58.8125, + "learning_rate": 5.889585492296554e-07, + "loss": 1.65, + "step": 21740 + }, + { + "epoch": 2.06, + "grad_norm": 50.3125, + "learning_rate": 5.885804056988499e-07, + "loss": 1.6261, + "step": 21760 + }, + { + "epoch": 2.06, + "grad_norm": 59.09375, + "learning_rate": 5.882022621680442e-07, + "loss": 1.5723, + "step": 21780 + }, + { + "epoch": 2.06, + "grad_norm": 61.09375, + "learning_rate": 5.878241186372387e-07, + "loss": 1.567, + "step": 21800 + }, + { + "epoch": 2.06, + "grad_norm": 57.40625, + "learning_rate": 5.874459751064333e-07, + "loss": 1.4886, + "step": 21820 + }, + { + "epoch": 2.06, + "grad_norm": 82.5625, + "learning_rate": 5.870678315756277e-07, + "loss": 1.5157, + "step": 21840 + }, + { + "epoch": 2.07, + "grad_norm": 64.875, + "learning_rate": 5.866896880448221e-07, + "loss": 1.4767, + "step": 21860 + }, + { + "epoch": 2.07, + "grad_norm": 66.6875, + "learning_rate": 5.863115445140165e-07, + "loss": 1.6021, + "step": 21880 + }, + { + "epoch": 2.07, + "grad_norm": 80.5, + "learning_rate": 5.85933400983211e-07, + "loss": 1.5284, + "step": 21900 + }, + { + "epoch": 2.07, + "grad_norm": 61.84375, + "learning_rate": 5.855552574524054e-07, + "loss": 1.5389, + "step": 21920 + }, + { + "epoch": 2.07, + "grad_norm": 69.8125, + "learning_rate": 5.851771139215998e-07, + "loss": 1.4867, + "step": 21940 + }, + { + "epoch": 2.08, + "grad_norm": 43.875, + "learning_rate": 5.847989703907943e-07, + "loss": 1.5887, + "step": 21960 + }, + { + "epoch": 2.08, + "grad_norm": 64.5625, + "learning_rate": 5.844208268599888e-07, + "loss": 1.5928, + "step": 21980 + }, + { + "epoch": 2.08, + "grad_norm": 68.5625, + "learning_rate": 5.840426833291831e-07, + "loss": 1.6589, + "step": 22000 + }, + { + "epoch": 2.08, + "grad_norm": 71.1875, + "learning_rate": 5.836645397983776e-07, + "loss": 1.6092, + "step": 22020 + }, + { + "epoch": 2.08, + "grad_norm": 54.75, + "learning_rate": 5.832863962675721e-07, + "loss": 1.503, + "step": 22040 + }, + { + "epoch": 2.09, + "grad_norm": 67.3125, + "learning_rate": 5.829082527367664e-07, + "loss": 1.5614, + "step": 22060 + }, + { + "epoch": 2.09, + "grad_norm": 68.0, + "learning_rate": 5.825301092059609e-07, + "loss": 1.6066, + "step": 22080 + }, + { + "epoch": 2.09, + "grad_norm": 93.25, + "learning_rate": 5.821519656751554e-07, + "loss": 1.5409, + "step": 22100 + }, + { + "epoch": 2.09, + "grad_norm": 77.6875, + "learning_rate": 5.817738221443498e-07, + "loss": 1.6052, + "step": 22120 + }, + { + "epoch": 2.09, + "grad_norm": 79.75, + "learning_rate": 5.813956786135442e-07, + "loss": 1.6158, + "step": 22140 + }, + { + "epoch": 2.09, + "grad_norm": 61.75, + "learning_rate": 5.810175350827388e-07, + "loss": 1.5924, + "step": 22160 + }, + { + "epoch": 2.1, + "grad_norm": 87.375, + "learning_rate": 5.806393915519332e-07, + "loss": 1.608, + "step": 22180 + }, + { + "epoch": 2.1, + "grad_norm": 56.46875, + "learning_rate": 5.802612480211277e-07, + "loss": 1.554, + "step": 22200 + }, + { + "epoch": 2.1, + "grad_norm": 64.375, + "learning_rate": 5.79883104490322e-07, + "loss": 1.4941, + "step": 22220 + }, + { + "epoch": 2.1, + "grad_norm": 68.625, + "learning_rate": 5.795049609595165e-07, + "loss": 1.6285, + "step": 22240 + }, + { + "epoch": 2.1, + "grad_norm": 86.6875, + "learning_rate": 5.79126817428711e-07, + "loss": 1.586, + "step": 22260 + }, + { + "epoch": 2.11, + "grad_norm": 45.625, + "learning_rate": 5.787486738979053e-07, + "loss": 1.5519, + "step": 22280 + }, + { + "epoch": 2.11, + "grad_norm": 66.1875, + "learning_rate": 5.783705303670998e-07, + "loss": 1.553, + "step": 22300 + }, + { + "epoch": 2.11, + "grad_norm": 63.15625, + "learning_rate": 5.779923868362943e-07, + "loss": 1.4016, + "step": 22320 + }, + { + "epoch": 2.11, + "grad_norm": 64.0625, + "learning_rate": 5.776142433054887e-07, + "loss": 1.5565, + "step": 22340 + }, + { + "epoch": 2.11, + "grad_norm": 118.625, + "learning_rate": 5.772360997746831e-07, + "loss": 1.5126, + "step": 22360 + }, + { + "epoch": 2.12, + "grad_norm": 61.6875, + "learning_rate": 5.768579562438776e-07, + "loss": 1.6061, + "step": 22380 + }, + { + "epoch": 2.12, + "grad_norm": 53.6875, + "learning_rate": 5.76479812713072e-07, + "loss": 1.5049, + "step": 22400 + }, + { + "epoch": 2.12, + "grad_norm": 53.5625, + "learning_rate": 5.761016691822665e-07, + "loss": 1.5556, + "step": 22420 + }, + { + "epoch": 2.12, + "grad_norm": 84.125, + "learning_rate": 5.757235256514608e-07, + "loss": 1.5172, + "step": 22440 + }, + { + "epoch": 2.12, + "grad_norm": 57.125, + "learning_rate": 5.753453821206553e-07, + "loss": 1.5251, + "step": 22460 + }, + { + "epoch": 2.13, + "grad_norm": 52.0, + "learning_rate": 5.749672385898498e-07, + "loss": 1.4947, + "step": 22480 + }, + { + "epoch": 2.13, + "grad_norm": 71.9375, + "learning_rate": 5.745890950590441e-07, + "loss": 1.496, + "step": 22500 + }, + { + "epoch": 2.13, + "grad_norm": 56.28125, + "learning_rate": 5.742109515282387e-07, + "loss": 1.6366, + "step": 22520 + }, + { + "epoch": 2.13, + "grad_norm": 72.5625, + "learning_rate": 5.738328079974332e-07, + "loss": 1.5916, + "step": 22540 + }, + { + "epoch": 2.13, + "grad_norm": 66.4375, + "learning_rate": 5.734546644666276e-07, + "loss": 1.572, + "step": 22560 + }, + { + "epoch": 2.13, + "grad_norm": 47.25, + "learning_rate": 5.73076520935822e-07, + "loss": 1.5181, + "step": 22580 + }, + { + "epoch": 2.14, + "grad_norm": 62.75, + "learning_rate": 5.726983774050165e-07, + "loss": 1.6111, + "step": 22600 + }, + { + "epoch": 2.14, + "grad_norm": 51.375, + "learning_rate": 5.723202338742109e-07, + "loss": 1.5621, + "step": 22620 + }, + { + "epoch": 2.14, + "grad_norm": 50.84375, + "learning_rate": 5.719420903434054e-07, + "loss": 1.5096, + "step": 22640 + }, + { + "epoch": 2.14, + "grad_norm": 61.875, + "learning_rate": 5.715639468125998e-07, + "loss": 1.5789, + "step": 22660 + }, + { + "epoch": 2.14, + "grad_norm": 81.5, + "learning_rate": 5.711858032817942e-07, + "loss": 1.5334, + "step": 22680 + }, + { + "epoch": 2.15, + "grad_norm": 68.9375, + "learning_rate": 5.708076597509887e-07, + "loss": 1.6067, + "step": 22700 + }, + { + "epoch": 2.15, + "grad_norm": 74.8125, + "learning_rate": 5.704295162201831e-07, + "loss": 1.6537, + "step": 22720 + }, + { + "epoch": 2.15, + "grad_norm": 52.3125, + "learning_rate": 5.700513726893775e-07, + "loss": 1.4958, + "step": 22740 + }, + { + "epoch": 2.15, + "grad_norm": 57.9375, + "learning_rate": 5.69673229158572e-07, + "loss": 1.6364, + "step": 22760 + }, + { + "epoch": 2.15, + "grad_norm": 104.6875, + "learning_rate": 5.692950856277664e-07, + "loss": 1.5181, + "step": 22780 + }, + { + "epoch": 2.16, + "grad_norm": 64.6875, + "learning_rate": 5.689169420969608e-07, + "loss": 1.6591, + "step": 22800 + }, + { + "epoch": 2.16, + "grad_norm": 91.4375, + "learning_rate": 5.685387985661553e-07, + "loss": 1.5405, + "step": 22820 + }, + { + "epoch": 2.16, + "grad_norm": 63.3125, + "learning_rate": 5.681606550353497e-07, + "loss": 1.4792, + "step": 22840 + }, + { + "epoch": 2.16, + "grad_norm": 84.125, + "learning_rate": 5.677825115045442e-07, + "loss": 1.4895, + "step": 22860 + }, + { + "epoch": 2.16, + "grad_norm": 62.0, + "learning_rate": 5.674043679737387e-07, + "loss": 1.5591, + "step": 22880 + }, + { + "epoch": 2.16, + "grad_norm": 63.9375, + "learning_rate": 5.670262244429331e-07, + "loss": 1.599, + "step": 22900 + }, + { + "epoch": 2.17, + "grad_norm": 66.8125, + "learning_rate": 5.666480809121276e-07, + "loss": 1.6139, + "step": 22920 + }, + { + "epoch": 2.17, + "grad_norm": 109.25, + "learning_rate": 5.66269937381322e-07, + "loss": 1.5495, + "step": 22940 + }, + { + "epoch": 2.17, + "grad_norm": 84.1875, + "learning_rate": 5.658917938505164e-07, + "loss": 1.6005, + "step": 22960 + }, + { + "epoch": 2.17, + "grad_norm": 57.125, + "learning_rate": 5.655136503197109e-07, + "loss": 1.5384, + "step": 22980 + }, + { + "epoch": 2.17, + "grad_norm": 73.25, + "learning_rate": 5.651355067889054e-07, + "loss": 1.5543, + "step": 23000 + }, + { + "epoch": 2.18, + "grad_norm": 52.125, + "learning_rate": 5.647573632580997e-07, + "loss": 1.5076, + "step": 23020 + }, + { + "epoch": 2.18, + "grad_norm": 52.6875, + "learning_rate": 5.643792197272942e-07, + "loss": 1.5428, + "step": 23040 + }, + { + "epoch": 2.18, + "grad_norm": 76.25, + "learning_rate": 5.640010761964887e-07, + "loss": 1.5638, + "step": 23060 + }, + { + "epoch": 2.18, + "grad_norm": 48.90625, + "learning_rate": 5.63622932665683e-07, + "loss": 1.5839, + "step": 23080 + }, + { + "epoch": 2.18, + "grad_norm": 74.0, + "learning_rate": 5.632447891348775e-07, + "loss": 1.5441, + "step": 23100 + }, + { + "epoch": 2.19, + "grad_norm": 61.90625, + "learning_rate": 5.628666456040719e-07, + "loss": 1.5854, + "step": 23120 + }, + { + "epoch": 2.19, + "grad_norm": 71.25, + "learning_rate": 5.624885020732664e-07, + "loss": 1.5531, + "step": 23140 + }, + { + "epoch": 2.19, + "grad_norm": 110.75, + "learning_rate": 5.621103585424608e-07, + "loss": 1.5323, + "step": 23160 + }, + { + "epoch": 2.19, + "grad_norm": 73.375, + "learning_rate": 5.617322150116552e-07, + "loss": 1.572, + "step": 23180 + }, + { + "epoch": 2.19, + "grad_norm": 88.25, + "learning_rate": 5.613540714808498e-07, + "loss": 1.4873, + "step": 23200 + }, + { + "epoch": 2.2, + "grad_norm": 70.75, + "learning_rate": 5.609759279500443e-07, + "loss": 1.545, + "step": 23220 + }, + { + "epoch": 2.2, + "grad_norm": 57.40625, + "learning_rate": 5.605977844192386e-07, + "loss": 1.5191, + "step": 23240 + }, + { + "epoch": 2.2, + "grad_norm": 65.4375, + "learning_rate": 5.602196408884331e-07, + "loss": 1.5962, + "step": 23260 + }, + { + "epoch": 2.2, + "grad_norm": 58.25, + "learning_rate": 5.598414973576276e-07, + "loss": 1.4449, + "step": 23280 + }, + { + "epoch": 2.2, + "grad_norm": 99.125, + "learning_rate": 5.594633538268219e-07, + "loss": 1.4933, + "step": 23300 + }, + { + "epoch": 2.2, + "grad_norm": 62.875, + "learning_rate": 5.590852102960164e-07, + "loss": 1.5198, + "step": 23320 + }, + { + "epoch": 2.21, + "grad_norm": 72.0, + "learning_rate": 5.587070667652109e-07, + "loss": 1.6208, + "step": 23340 + }, + { + "epoch": 2.21, + "grad_norm": 60.0, + "learning_rate": 5.583289232344053e-07, + "loss": 1.5792, + "step": 23360 + }, + { + "epoch": 2.21, + "grad_norm": 54.84375, + "learning_rate": 5.579507797035997e-07, + "loss": 1.5194, + "step": 23380 + }, + { + "epoch": 2.21, + "grad_norm": 57.84375, + "learning_rate": 5.575726361727941e-07, + "loss": 1.5103, + "step": 23400 + }, + { + "epoch": 2.21, + "grad_norm": 52.21875, + "learning_rate": 5.571944926419886e-07, + "loss": 1.5909, + "step": 23420 + }, + { + "epoch": 2.22, + "grad_norm": 90.5, + "learning_rate": 5.56816349111183e-07, + "loss": 1.5455, + "step": 23440 + }, + { + "epoch": 2.22, + "grad_norm": 85.3125, + "learning_rate": 5.564382055803774e-07, + "loss": 1.5711, + "step": 23460 + }, + { + "epoch": 2.22, + "grad_norm": 58.15625, + "learning_rate": 5.560600620495719e-07, + "loss": 1.5518, + "step": 23480 + }, + { + "epoch": 2.22, + "grad_norm": 62.28125, + "learning_rate": 5.556819185187664e-07, + "loss": 1.6045, + "step": 23500 + }, + { + "epoch": 2.22, + "grad_norm": 60.875, + "learning_rate": 5.553037749879607e-07, + "loss": 1.5856, + "step": 23520 + }, + { + "epoch": 2.23, + "grad_norm": 50.75, + "learning_rate": 5.549256314571552e-07, + "loss": 1.6366, + "step": 23540 + }, + { + "epoch": 2.23, + "grad_norm": 75.0625, + "learning_rate": 5.545474879263498e-07, + "loss": 1.5648, + "step": 23560 + }, + { + "epoch": 2.23, + "grad_norm": 50.3125, + "learning_rate": 5.541693443955442e-07, + "loss": 1.565, + "step": 23580 + }, + { + "epoch": 2.23, + "grad_norm": 54.46875, + "learning_rate": 5.537912008647386e-07, + "loss": 1.5149, + "step": 23600 + }, + { + "epoch": 2.23, + "grad_norm": 92.75, + "learning_rate": 5.534130573339331e-07, + "loss": 1.4956, + "step": 23620 + }, + { + "epoch": 2.23, + "grad_norm": 54.3125, + "learning_rate": 5.530349138031275e-07, + "loss": 1.4991, + "step": 23640 + }, + { + "epoch": 2.24, + "grad_norm": 48.15625, + "learning_rate": 5.52656770272322e-07, + "loss": 1.5886, + "step": 23660 + }, + { + "epoch": 2.24, + "grad_norm": 60.9375, + "learning_rate": 5.522786267415164e-07, + "loss": 1.5823, + "step": 23680 + }, + { + "epoch": 2.24, + "grad_norm": 68.0625, + "learning_rate": 5.519004832107108e-07, + "loss": 1.5445, + "step": 23700 + }, + { + "epoch": 2.24, + "grad_norm": 56.25, + "learning_rate": 5.515223396799053e-07, + "loss": 1.6111, + "step": 23720 + }, + { + "epoch": 2.24, + "grad_norm": 76.6875, + "learning_rate": 5.511441961490996e-07, + "loss": 1.6356, + "step": 23740 + }, + { + "epoch": 2.25, + "grad_norm": 70.25, + "learning_rate": 5.507660526182941e-07, + "loss": 1.5651, + "step": 23760 + }, + { + "epoch": 2.25, + "grad_norm": 75.0625, + "learning_rate": 5.503879090874886e-07, + "loss": 1.4972, + "step": 23780 + }, + { + "epoch": 2.25, + "grad_norm": 54.6875, + "learning_rate": 5.50009765556683e-07, + "loss": 1.5978, + "step": 23800 + }, + { + "epoch": 2.25, + "grad_norm": 56.28125, + "learning_rate": 5.496316220258774e-07, + "loss": 1.5318, + "step": 23820 + }, + { + "epoch": 2.25, + "grad_norm": 41.65625, + "learning_rate": 5.492534784950719e-07, + "loss": 1.468, + "step": 23840 + }, + { + "epoch": 2.26, + "grad_norm": 52.625, + "learning_rate": 5.488753349642663e-07, + "loss": 1.4403, + "step": 23860 + }, + { + "epoch": 2.26, + "grad_norm": 51.03125, + "learning_rate": 5.484971914334607e-07, + "loss": 1.5273, + "step": 23880 + }, + { + "epoch": 2.26, + "grad_norm": 118.5, + "learning_rate": 5.481190479026553e-07, + "loss": 1.5754, + "step": 23900 + }, + { + "epoch": 2.26, + "grad_norm": 53.65625, + "learning_rate": 5.477409043718497e-07, + "loss": 1.652, + "step": 23920 + }, + { + "epoch": 2.26, + "grad_norm": 55.46875, + "learning_rate": 5.473627608410442e-07, + "loss": 1.5018, + "step": 23940 + }, + { + "epoch": 2.27, + "grad_norm": 75.875, + "learning_rate": 5.469846173102386e-07, + "loss": 1.5074, + "step": 23960 + }, + { + "epoch": 2.27, + "grad_norm": 65.375, + "learning_rate": 5.46606473779433e-07, + "loss": 1.5339, + "step": 23980 + }, + { + "epoch": 2.27, + "grad_norm": 58.84375, + "learning_rate": 5.462283302486275e-07, + "loss": 1.6258, + "step": 24000 + }, + { + "epoch": 2.27, + "grad_norm": 53.34375, + "learning_rate": 5.45850186717822e-07, + "loss": 1.5179, + "step": 24020 + }, + { + "epoch": 2.27, + "grad_norm": 94.25, + "learning_rate": 5.454720431870163e-07, + "loss": 1.6923, + "step": 24040 + }, + { + "epoch": 2.27, + "grad_norm": 59.78125, + "learning_rate": 5.450938996562108e-07, + "loss": 1.6017, + "step": 24060 + }, + { + "epoch": 2.28, + "grad_norm": 82.5625, + "learning_rate": 5.447157561254052e-07, + "loss": 1.6311, + "step": 24080 + }, + { + "epoch": 2.28, + "grad_norm": 64.1875, + "learning_rate": 5.443376125945996e-07, + "loss": 1.6113, + "step": 24100 + }, + { + "epoch": 2.28, + "grad_norm": 62.40625, + "learning_rate": 5.439594690637941e-07, + "loss": 1.5551, + "step": 24120 + }, + { + "epoch": 2.28, + "grad_norm": 62.3125, + "learning_rate": 5.435813255329885e-07, + "loss": 1.4569, + "step": 24140 + }, + { + "epoch": 2.28, + "grad_norm": 58.46875, + "learning_rate": 5.43203182002183e-07, + "loss": 1.5644, + "step": 24160 + }, + { + "epoch": 2.29, + "grad_norm": 92.8125, + "learning_rate": 5.428250384713774e-07, + "loss": 1.6306, + "step": 24180 + }, + { + "epoch": 2.29, + "grad_norm": 55.125, + "learning_rate": 5.424468949405718e-07, + "loss": 1.5071, + "step": 24200 + }, + { + "epoch": 2.29, + "grad_norm": 65.5, + "learning_rate": 5.420687514097663e-07, + "loss": 1.58, + "step": 24220 + }, + { + "epoch": 2.29, + "grad_norm": 49.53125, + "learning_rate": 5.416906078789607e-07, + "loss": 1.58, + "step": 24240 + }, + { + "epoch": 2.29, + "grad_norm": 59.03125, + "learning_rate": 5.413124643481552e-07, + "loss": 1.5532, + "step": 24260 + }, + { + "epoch": 2.3, + "grad_norm": 61.5625, + "learning_rate": 5.409343208173497e-07, + "loss": 1.5801, + "step": 24280 + }, + { + "epoch": 2.3, + "grad_norm": 57.875, + "learning_rate": 5.405561772865442e-07, + "loss": 1.518, + "step": 24300 + }, + { + "epoch": 2.3, + "grad_norm": 54.40625, + "learning_rate": 5.401780337557385e-07, + "loss": 1.5373, + "step": 24320 + }, + { + "epoch": 2.3, + "grad_norm": 69.625, + "learning_rate": 5.39799890224933e-07, + "loss": 1.5509, + "step": 24340 + }, + { + "epoch": 2.3, + "grad_norm": 56.59375, + "learning_rate": 5.394217466941274e-07, + "loss": 1.603, + "step": 24360 + }, + { + "epoch": 2.3, + "grad_norm": 71.375, + "learning_rate": 5.390436031633219e-07, + "loss": 1.3879, + "step": 24380 + }, + { + "epoch": 2.31, + "grad_norm": 50.21875, + "learning_rate": 5.386654596325163e-07, + "loss": 1.5509, + "step": 24400 + }, + { + "epoch": 2.31, + "grad_norm": 69.125, + "learning_rate": 5.382873161017107e-07, + "loss": 1.5635, + "step": 24420 + }, + { + "epoch": 2.31, + "grad_norm": 67.125, + "learning_rate": 5.379091725709052e-07, + "loss": 1.4765, + "step": 24440 + }, + { + "epoch": 2.31, + "grad_norm": 98.0, + "learning_rate": 5.375310290400996e-07, + "loss": 1.5189, + "step": 24460 + }, + { + "epoch": 2.31, + "grad_norm": 58.8125, + "learning_rate": 5.37152885509294e-07, + "loss": 1.4824, + "step": 24480 + }, + { + "epoch": 2.32, + "grad_norm": 74.6875, + "learning_rate": 5.367747419784885e-07, + "loss": 1.559, + "step": 24500 + }, + { + "epoch": 2.32, + "grad_norm": 73.875, + "learning_rate": 5.36396598447683e-07, + "loss": 1.559, + "step": 24520 + }, + { + "epoch": 2.32, + "grad_norm": 54.9375, + "learning_rate": 5.360184549168773e-07, + "loss": 1.5174, + "step": 24540 + }, + { + "epoch": 2.32, + "grad_norm": 57.5625, + "learning_rate": 5.356403113860718e-07, + "loss": 1.5646, + "step": 24560 + }, + { + "epoch": 2.32, + "grad_norm": 58.8125, + "learning_rate": 5.352621678552663e-07, + "loss": 1.4964, + "step": 24580 + }, + { + "epoch": 2.33, + "grad_norm": 54.28125, + "learning_rate": 5.348840243244608e-07, + "loss": 1.5191, + "step": 24600 + }, + { + "epoch": 2.33, + "grad_norm": 95.9375, + "learning_rate": 5.345058807936552e-07, + "loss": 1.567, + "step": 24620 + }, + { + "epoch": 2.33, + "grad_norm": 91.5625, + "learning_rate": 5.341277372628497e-07, + "loss": 1.5026, + "step": 24640 + }, + { + "epoch": 2.33, + "grad_norm": 61.03125, + "learning_rate": 5.337495937320441e-07, + "loss": 1.6615, + "step": 24660 + }, + { + "epoch": 2.33, + "grad_norm": 53.625, + "learning_rate": 5.333714502012385e-07, + "loss": 1.5603, + "step": 24680 + }, + { + "epoch": 2.34, + "grad_norm": 59.0625, + "learning_rate": 5.329933066704329e-07, + "loss": 1.5761, + "step": 24700 + }, + { + "epoch": 2.34, + "grad_norm": 83.4375, + "learning_rate": 5.326151631396274e-07, + "loss": 1.6023, + "step": 24720 + }, + { + "epoch": 2.34, + "grad_norm": 68.125, + "learning_rate": 5.322370196088219e-07, + "loss": 1.5586, + "step": 24740 + }, + { + "epoch": 2.34, + "grad_norm": 55.96875, + "learning_rate": 5.318588760780162e-07, + "loss": 1.5758, + "step": 24760 + }, + { + "epoch": 2.34, + "grad_norm": 53.3125, + "learning_rate": 5.314807325472107e-07, + "loss": 1.5305, + "step": 24780 + }, + { + "epoch": 2.34, + "grad_norm": 62.0625, + "learning_rate": 5.311025890164052e-07, + "loss": 1.5119, + "step": 24800 + }, + { + "epoch": 2.35, + "grad_norm": 45.6875, + "learning_rate": 5.307244454855995e-07, + "loss": 1.5253, + "step": 24820 + }, + { + "epoch": 2.35, + "grad_norm": 72.8125, + "learning_rate": 5.30346301954794e-07, + "loss": 1.5749, + "step": 24840 + }, + { + "epoch": 2.35, + "grad_norm": 78.875, + "learning_rate": 5.299681584239885e-07, + "loss": 1.5423, + "step": 24860 + }, + { + "epoch": 2.35, + "grad_norm": 78.4375, + "learning_rate": 5.295900148931829e-07, + "loss": 1.5867, + "step": 24880 + }, + { + "epoch": 2.35, + "grad_norm": 49.9375, + "learning_rate": 5.292118713623773e-07, + "loss": 1.6509, + "step": 24900 + }, + { + "epoch": 2.36, + "grad_norm": 68.875, + "learning_rate": 5.288337278315718e-07, + "loss": 1.5651, + "step": 24920 + }, + { + "epoch": 2.36, + "grad_norm": 55.15625, + "learning_rate": 5.284555843007662e-07, + "loss": 1.539, + "step": 24940 + }, + { + "epoch": 2.36, + "grad_norm": 62.25, + "learning_rate": 5.280774407699608e-07, + "loss": 1.584, + "step": 24960 + }, + { + "epoch": 2.36, + "grad_norm": 56.8125, + "learning_rate": 5.276992972391551e-07, + "loss": 1.5746, + "step": 24980 + }, + { + "epoch": 2.36, + "grad_norm": 71.3125, + "learning_rate": 5.273211537083496e-07, + "loss": 1.5028, + "step": 25000 + }, + { + "epoch": 2.37, + "grad_norm": 45.90625, + "learning_rate": 5.269430101775441e-07, + "loss": 1.6168, + "step": 25020 + }, + { + "epoch": 2.37, + "grad_norm": 53.8125, + "learning_rate": 5.265648666467384e-07, + "loss": 1.5284, + "step": 25040 + }, + { + "epoch": 2.37, + "grad_norm": 55.875, + "learning_rate": 5.261867231159329e-07, + "loss": 1.5533, + "step": 25060 + }, + { + "epoch": 2.37, + "grad_norm": 62.375, + "learning_rate": 5.258085795851274e-07, + "loss": 1.5737, + "step": 25080 + }, + { + "epoch": 2.37, + "grad_norm": 54.59375, + "learning_rate": 5.254304360543218e-07, + "loss": 1.5599, + "step": 25100 + }, + { + "epoch": 2.37, + "grad_norm": 68.1875, + "learning_rate": 5.250522925235162e-07, + "loss": 1.5021, + "step": 25120 + }, + { + "epoch": 2.38, + "grad_norm": 62.71875, + "learning_rate": 5.246741489927107e-07, + "loss": 1.44, + "step": 25140 + }, + { + "epoch": 2.38, + "grad_norm": 52.96875, + "learning_rate": 5.242960054619051e-07, + "loss": 1.4916, + "step": 25160 + }, + { + "epoch": 2.38, + "grad_norm": 54.625, + "learning_rate": 5.239178619310996e-07, + "loss": 1.5164, + "step": 25180 + }, + { + "epoch": 2.38, + "grad_norm": 52.5625, + "learning_rate": 5.23539718400294e-07, + "loss": 1.4848, + "step": 25200 + }, + { + "epoch": 2.38, + "grad_norm": 72.0625, + "learning_rate": 5.231615748694884e-07, + "loss": 1.5058, + "step": 25220 + }, + { + "epoch": 2.39, + "grad_norm": 74.125, + "learning_rate": 5.227834313386829e-07, + "loss": 1.6166, + "step": 25240 + }, + { + "epoch": 2.39, + "grad_norm": 53.40625, + "learning_rate": 5.224052878078772e-07, + "loss": 1.5508, + "step": 25260 + }, + { + "epoch": 2.39, + "grad_norm": 57.21875, + "learning_rate": 5.220271442770717e-07, + "loss": 1.6444, + "step": 25280 + }, + { + "epoch": 2.39, + "grad_norm": 62.5, + "learning_rate": 5.216490007462663e-07, + "loss": 1.5956, + "step": 25300 + }, + { + "epoch": 2.39, + "grad_norm": 71.0, + "learning_rate": 5.212708572154607e-07, + "loss": 1.5861, + "step": 25320 + }, + { + "epoch": 2.4, + "grad_norm": 51.8125, + "learning_rate": 5.208927136846551e-07, + "loss": 1.5077, + "step": 25340 + }, + { + "epoch": 2.4, + "grad_norm": 67.5625, + "learning_rate": 5.205145701538496e-07, + "loss": 1.592, + "step": 25360 + }, + { + "epoch": 2.4, + "grad_norm": 49.03125, + "learning_rate": 5.20136426623044e-07, + "loss": 1.6034, + "step": 25380 + }, + { + "epoch": 2.4, + "grad_norm": 49.4375, + "learning_rate": 5.197582830922385e-07, + "loss": 1.4929, + "step": 25400 + }, + { + "epoch": 2.4, + "grad_norm": 64.875, + "learning_rate": 5.193801395614329e-07, + "loss": 1.532, + "step": 25420 + }, + { + "epoch": 2.4, + "grad_norm": 57.1875, + "learning_rate": 5.190019960306273e-07, + "loss": 1.4758, + "step": 25440 + }, + { + "epoch": 2.41, + "grad_norm": 64.0, + "learning_rate": 5.186238524998218e-07, + "loss": 1.5308, + "step": 25460 + }, + { + "epoch": 2.41, + "grad_norm": 48.34375, + "learning_rate": 5.182457089690162e-07, + "loss": 1.5379, + "step": 25480 + }, + { + "epoch": 2.41, + "grad_norm": 55.1875, + "learning_rate": 5.178675654382106e-07, + "loss": 1.5318, + "step": 25500 + }, + { + "epoch": 2.41, + "grad_norm": 67.0, + "learning_rate": 5.174894219074051e-07, + "loss": 1.5173, + "step": 25520 + }, + { + "epoch": 2.41, + "grad_norm": 44.46875, + "learning_rate": 5.171112783765996e-07, + "loss": 1.5026, + "step": 25540 + }, + { + "epoch": 2.42, + "grad_norm": 60.875, + "learning_rate": 5.167331348457939e-07, + "loss": 1.5759, + "step": 25560 + }, + { + "epoch": 2.42, + "grad_norm": 60.75, + "learning_rate": 5.163549913149884e-07, + "loss": 1.5473, + "step": 25580 + }, + { + "epoch": 2.42, + "grad_norm": 46.84375, + "learning_rate": 5.159768477841828e-07, + "loss": 1.4403, + "step": 25600 + }, + { + "epoch": 2.42, + "grad_norm": 50.34375, + "learning_rate": 5.155987042533772e-07, + "loss": 1.5662, + "step": 25620 + }, + { + "epoch": 2.42, + "grad_norm": 62.6875, + "learning_rate": 5.152205607225717e-07, + "loss": 1.4555, + "step": 25640 + }, + { + "epoch": 2.43, + "grad_norm": 61.59375, + "learning_rate": 5.148424171917662e-07, + "loss": 1.6103, + "step": 25660 + }, + { + "epoch": 2.43, + "grad_norm": 71.5625, + "learning_rate": 5.144642736609607e-07, + "loss": 1.5453, + "step": 25680 + }, + { + "epoch": 2.43, + "grad_norm": 68.1875, + "learning_rate": 5.140861301301551e-07, + "loss": 1.6433, + "step": 25700 + }, + { + "epoch": 2.43, + "grad_norm": 54.34375, + "learning_rate": 5.137079865993495e-07, + "loss": 1.5406, + "step": 25720 + }, + { + "epoch": 2.43, + "grad_norm": 52.03125, + "learning_rate": 5.13329843068544e-07, + "loss": 1.518, + "step": 25740 + }, + { + "epoch": 2.44, + "grad_norm": 85.6875, + "learning_rate": 5.129516995377385e-07, + "loss": 1.6169, + "step": 25760 + }, + { + "epoch": 2.44, + "grad_norm": 68.0625, + "learning_rate": 5.125735560069328e-07, + "loss": 1.5893, + "step": 25780 + }, + { + "epoch": 2.44, + "grad_norm": 55.71875, + "learning_rate": 5.121954124761273e-07, + "loss": 1.6479, + "step": 25800 + }, + { + "epoch": 2.44, + "grad_norm": 58.96875, + "learning_rate": 5.118172689453218e-07, + "loss": 1.6356, + "step": 25820 + }, + { + "epoch": 2.44, + "grad_norm": 57.5625, + "learning_rate": 5.114391254145161e-07, + "loss": 1.5878, + "step": 25840 + }, + { + "epoch": 2.44, + "grad_norm": 91.875, + "learning_rate": 5.110609818837106e-07, + "loss": 1.4991, + "step": 25860 + }, + { + "epoch": 2.45, + "grad_norm": 62.8125, + "learning_rate": 5.10682838352905e-07, + "loss": 1.4639, + "step": 25880 + }, + { + "epoch": 2.45, + "grad_norm": 74.1875, + "learning_rate": 5.103046948220995e-07, + "loss": 1.6055, + "step": 25900 + }, + { + "epoch": 2.45, + "grad_norm": 69.8125, + "learning_rate": 5.099265512912939e-07, + "loss": 1.555, + "step": 25920 + }, + { + "epoch": 2.45, + "grad_norm": 65.125, + "learning_rate": 5.095484077604883e-07, + "loss": 1.625, + "step": 25940 + }, + { + "epoch": 2.45, + "grad_norm": 60.71875, + "learning_rate": 5.091702642296828e-07, + "loss": 1.5422, + "step": 25960 + }, + { + "epoch": 2.46, + "grad_norm": 56.875, + "learning_rate": 5.087921206988773e-07, + "loss": 1.6063, + "step": 25980 + }, + { + "epoch": 2.46, + "grad_norm": 58.0625, + "learning_rate": 5.084139771680717e-07, + "loss": 1.498, + "step": 26000 + }, + { + "epoch": 2.46, + "grad_norm": 56.65625, + "learning_rate": 5.080358336372662e-07, + "loss": 1.5283, + "step": 26020 + }, + { + "epoch": 2.46, + "grad_norm": 50.3125, + "learning_rate": 5.076576901064607e-07, + "loss": 1.4989, + "step": 26040 + }, + { + "epoch": 2.46, + "grad_norm": 66.0625, + "learning_rate": 5.07279546575655e-07, + "loss": 1.4776, + "step": 26060 + }, + { + "epoch": 2.47, + "grad_norm": 78.9375, + "learning_rate": 5.069014030448495e-07, + "loss": 1.4897, + "step": 26080 + }, + { + "epoch": 2.47, + "grad_norm": 65.0, + "learning_rate": 5.06523259514044e-07, + "loss": 1.5321, + "step": 26100 + }, + { + "epoch": 2.47, + "grad_norm": 71.0, + "learning_rate": 5.061451159832384e-07, + "loss": 1.5652, + "step": 26120 + }, + { + "epoch": 2.47, + "grad_norm": 52.8125, + "learning_rate": 5.057669724524328e-07, + "loss": 1.5619, + "step": 26140 + }, + { + "epoch": 2.47, + "grad_norm": 58.71875, + "learning_rate": 5.053888289216273e-07, + "loss": 1.6027, + "step": 26160 + }, + { + "epoch": 2.47, + "grad_norm": 62.4375, + "learning_rate": 5.050106853908217e-07, + "loss": 1.5552, + "step": 26180 + }, + { + "epoch": 2.48, + "grad_norm": 66.1875, + "learning_rate": 5.046325418600162e-07, + "loss": 1.5539, + "step": 26200 + }, + { + "epoch": 2.48, + "grad_norm": 44.78125, + "learning_rate": 5.042543983292105e-07, + "loss": 1.4551, + "step": 26220 + }, + { + "epoch": 2.48, + "grad_norm": 87.8125, + "learning_rate": 5.03876254798405e-07, + "loss": 1.5585, + "step": 26240 + }, + { + "epoch": 2.48, + "grad_norm": 97.8125, + "learning_rate": 5.034981112675995e-07, + "loss": 1.5294, + "step": 26260 + }, + { + "epoch": 2.48, + "grad_norm": 70.25, + "learning_rate": 5.031199677367938e-07, + "loss": 1.5808, + "step": 26280 + }, + { + "epoch": 2.49, + "grad_norm": 62.375, + "learning_rate": 5.027418242059883e-07, + "loss": 1.5392, + "step": 26300 + }, + { + "epoch": 2.49, + "grad_norm": 67.9375, + "learning_rate": 5.023636806751828e-07, + "loss": 1.53, + "step": 26320 + }, + { + "epoch": 2.49, + "grad_norm": 62.3125, + "learning_rate": 5.019855371443772e-07, + "loss": 1.5298, + "step": 26340 + }, + { + "epoch": 2.49, + "grad_norm": 64.375, + "learning_rate": 5.016073936135717e-07, + "loss": 1.493, + "step": 26360 + }, + { + "epoch": 2.49, + "grad_norm": 66.125, + "learning_rate": 5.012292500827662e-07, + "loss": 1.595, + "step": 26380 + }, + { + "epoch": 2.5, + "grad_norm": 79.0, + "learning_rate": 5.008511065519606e-07, + "loss": 1.4776, + "step": 26400 + }, + { + "epoch": 2.5, + "grad_norm": 70.125, + "learning_rate": 5.004729630211551e-07, + "loss": 1.5509, + "step": 26420 + }, + { + "epoch": 2.5, + "grad_norm": 70.4375, + "learning_rate": 5.000948194903495e-07, + "loss": 1.6068, + "step": 26440 + }, + { + "epoch": 2.5, + "grad_norm": 62.21875, + "learning_rate": 4.997166759595439e-07, + "loss": 1.553, + "step": 26460 + }, + { + "epoch": 2.5, + "grad_norm": 57.125, + "learning_rate": 4.993385324287384e-07, + "loss": 1.6032, + "step": 26480 + }, + { + "epoch": 2.51, + "grad_norm": 91.125, + "learning_rate": 4.989603888979328e-07, + "loss": 1.4595, + "step": 26500 + }, + { + "epoch": 2.51, + "grad_norm": 78.6875, + "learning_rate": 4.985822453671272e-07, + "loss": 1.5136, + "step": 26520 + }, + { + "epoch": 2.51, + "grad_norm": 57.625, + "learning_rate": 4.982041018363217e-07, + "loss": 1.592, + "step": 26540 + }, + { + "epoch": 2.51, + "grad_norm": 71.6875, + "learning_rate": 4.978259583055161e-07, + "loss": 1.5033, + "step": 26560 + }, + { + "epoch": 2.51, + "grad_norm": 67.375, + "learning_rate": 4.974478147747105e-07, + "loss": 1.5144, + "step": 26580 + }, + { + "epoch": 2.51, + "grad_norm": 53.96875, + "learning_rate": 4.97069671243905e-07, + "loss": 1.5845, + "step": 26600 + }, + { + "epoch": 2.52, + "grad_norm": 77.5625, + "learning_rate": 4.966915277130995e-07, + "loss": 1.6229, + "step": 26620 + }, + { + "epoch": 2.52, + "grad_norm": 59.28125, + "learning_rate": 4.96313384182294e-07, + "loss": 1.6134, + "step": 26640 + }, + { + "epoch": 2.52, + "grad_norm": 43.34375, + "learning_rate": 4.959352406514883e-07, + "loss": 1.59, + "step": 26660 + }, + { + "epoch": 2.52, + "grad_norm": 76.875, + "learning_rate": 4.955570971206828e-07, + "loss": 1.5741, + "step": 26680 + }, + { + "epoch": 2.52, + "grad_norm": 70.9375, + "learning_rate": 4.951789535898772e-07, + "loss": 1.5271, + "step": 26700 + }, + { + "epoch": 2.53, + "grad_norm": 53.28125, + "learning_rate": 4.948008100590716e-07, + "loss": 1.436, + "step": 26720 + }, + { + "epoch": 2.53, + "grad_norm": 59.03125, + "learning_rate": 4.944226665282661e-07, + "loss": 1.5151, + "step": 26740 + }, + { + "epoch": 2.53, + "grad_norm": 74.6875, + "learning_rate": 4.940445229974605e-07, + "loss": 1.4958, + "step": 26760 + }, + { + "epoch": 2.53, + "grad_norm": 55.03125, + "learning_rate": 4.936663794666551e-07, + "loss": 1.5223, + "step": 26780 + }, + { + "epoch": 2.53, + "grad_norm": 59.1875, + "learning_rate": 4.932882359358494e-07, + "loss": 1.5894, + "step": 26800 + }, + { + "epoch": 2.54, + "grad_norm": 74.5, + "learning_rate": 4.929100924050439e-07, + "loss": 1.4906, + "step": 26820 + }, + { + "epoch": 2.54, + "grad_norm": 65.5, + "learning_rate": 4.925319488742383e-07, + "loss": 1.5399, + "step": 26840 + }, + { + "epoch": 2.54, + "grad_norm": 60.0, + "learning_rate": 4.921538053434327e-07, + "loss": 1.5463, + "step": 26860 + }, + { + "epoch": 2.54, + "grad_norm": 56.5, + "learning_rate": 4.917756618126272e-07, + "loss": 1.5375, + "step": 26880 + }, + { + "epoch": 2.54, + "grad_norm": 93.3125, + "learning_rate": 4.913975182818216e-07, + "loss": 1.5447, + "step": 26900 + }, + { + "epoch": 2.54, + "grad_norm": 88.3125, + "learning_rate": 4.910193747510161e-07, + "loss": 1.5076, + "step": 26920 + }, + { + "epoch": 2.55, + "grad_norm": 88.9375, + "learning_rate": 4.906412312202105e-07, + "loss": 1.6689, + "step": 26940 + }, + { + "epoch": 2.55, + "grad_norm": 77.25, + "learning_rate": 4.90263087689405e-07, + "loss": 1.5952, + "step": 26960 + }, + { + "epoch": 2.55, + "grad_norm": 53.3125, + "learning_rate": 4.898849441585995e-07, + "loss": 1.4589, + "step": 26980 + }, + { + "epoch": 2.55, + "grad_norm": 61.1875, + "learning_rate": 4.895068006277939e-07, + "loss": 1.4918, + "step": 27000 + }, + { + "epoch": 2.55, + "grad_norm": 73.375, + "learning_rate": 4.891286570969883e-07, + "loss": 1.533, + "step": 27020 + }, + { + "epoch": 2.56, + "grad_norm": 75.6875, + "learning_rate": 4.887505135661827e-07, + "loss": 1.5737, + "step": 27040 + }, + { + "epoch": 2.56, + "grad_norm": 67.3125, + "learning_rate": 4.883723700353772e-07, + "loss": 1.5193, + "step": 27060 + }, + { + "epoch": 2.56, + "grad_norm": 85.625, + "learning_rate": 4.879942265045716e-07, + "loss": 1.5908, + "step": 27080 + }, + { + "epoch": 2.56, + "grad_norm": 92.3125, + "learning_rate": 4.87616082973766e-07, + "loss": 1.612, + "step": 27100 + }, + { + "epoch": 2.56, + "grad_norm": 58.03125, + "learning_rate": 4.872379394429605e-07, + "loss": 1.5158, + "step": 27120 + }, + { + "epoch": 2.57, + "grad_norm": 62.125, + "learning_rate": 4.86859795912155e-07, + "loss": 1.5388, + "step": 27140 + }, + { + "epoch": 2.57, + "grad_norm": 53.1875, + "learning_rate": 4.864816523813494e-07, + "loss": 1.5011, + "step": 27160 + }, + { + "epoch": 2.57, + "grad_norm": 51.40625, + "learning_rate": 4.861035088505438e-07, + "loss": 1.5886, + "step": 27180 + }, + { + "epoch": 2.57, + "grad_norm": 45.90625, + "learning_rate": 4.857253653197383e-07, + "loss": 1.5871, + "step": 27200 + }, + { + "epoch": 2.57, + "grad_norm": 67.6875, + "learning_rate": 4.853472217889328e-07, + "loss": 1.5536, + "step": 27220 + }, + { + "epoch": 2.58, + "grad_norm": 50.34375, + "learning_rate": 4.849690782581271e-07, + "loss": 1.4826, + "step": 27240 + }, + { + "epoch": 2.58, + "grad_norm": 52.6875, + "learning_rate": 4.845909347273216e-07, + "loss": 1.599, + "step": 27260 + }, + { + "epoch": 2.58, + "grad_norm": 57.21875, + "learning_rate": 4.842127911965161e-07, + "loss": 1.5626, + "step": 27280 + }, + { + "epoch": 2.58, + "grad_norm": 68.0625, + "learning_rate": 4.838346476657104e-07, + "loss": 1.4922, + "step": 27300 + }, + { + "epoch": 2.58, + "grad_norm": 70.5, + "learning_rate": 4.834565041349049e-07, + "loss": 1.6091, + "step": 27320 + }, + { + "epoch": 2.58, + "grad_norm": 61.34375, + "learning_rate": 4.830783606040994e-07, + "loss": 1.532, + "step": 27340 + }, + { + "epoch": 2.59, + "grad_norm": 82.0625, + "learning_rate": 4.827002170732939e-07, + "loss": 1.5448, + "step": 27360 + }, + { + "epoch": 2.59, + "grad_norm": 42.21875, + "learning_rate": 4.823220735424882e-07, + "loss": 1.4501, + "step": 27380 + }, + { + "epoch": 2.59, + "grad_norm": 64.875, + "learning_rate": 4.819439300116827e-07, + "loss": 1.6265, + "step": 27400 + }, + { + "epoch": 2.59, + "grad_norm": 60.3125, + "learning_rate": 4.815657864808772e-07, + "loss": 1.4552, + "step": 27420 + }, + { + "epoch": 2.59, + "grad_norm": 60.0625, + "learning_rate": 4.811876429500715e-07, + "loss": 1.6434, + "step": 27440 + }, + { + "epoch": 2.6, + "grad_norm": 72.1875, + "learning_rate": 4.80809499419266e-07, + "loss": 1.5811, + "step": 27460 + }, + { + "epoch": 2.6, + "grad_norm": 61.59375, + "learning_rate": 4.804313558884605e-07, + "loss": 1.5929, + "step": 27480 + }, + { + "epoch": 2.6, + "grad_norm": 55.90625, + "learning_rate": 4.80053212357655e-07, + "loss": 1.598, + "step": 27500 + }, + { + "epoch": 2.6, + "grad_norm": 61.5625, + "learning_rate": 4.796750688268493e-07, + "loss": 1.5171, + "step": 27520 + }, + { + "epoch": 2.6, + "grad_norm": 77.5, + "learning_rate": 4.792969252960438e-07, + "loss": 1.5044, + "step": 27540 + }, + { + "epoch": 2.61, + "grad_norm": 69.0, + "learning_rate": 4.789187817652383e-07, + "loss": 1.5655, + "step": 27560 + }, + { + "epoch": 2.61, + "grad_norm": 58.0625, + "learning_rate": 4.785406382344327e-07, + "loss": 1.4868, + "step": 27580 + }, + { + "epoch": 2.61, + "grad_norm": 65.0, + "learning_rate": 4.781624947036271e-07, + "loss": 1.5374, + "step": 27600 + }, + { + "epoch": 2.61, + "grad_norm": 72.875, + "learning_rate": 4.777843511728216e-07, + "loss": 1.5488, + "step": 27620 + }, + { + "epoch": 2.61, + "grad_norm": 58.1875, + "learning_rate": 4.77406207642016e-07, + "loss": 1.5216, + "step": 27640 + }, + { + "epoch": 2.61, + "grad_norm": 53.96875, + "learning_rate": 4.770280641112104e-07, + "loss": 1.5225, + "step": 27660 + }, + { + "epoch": 2.62, + "grad_norm": 57.0625, + "learning_rate": 4.766499205804049e-07, + "loss": 1.5891, + "step": 27680 + }, + { + "epoch": 2.62, + "grad_norm": 99.8125, + "learning_rate": 4.7627177704959934e-07, + "loss": 1.5102, + "step": 27700 + }, + { + "epoch": 2.62, + "grad_norm": 68.9375, + "learning_rate": 4.758936335187938e-07, + "loss": 1.5994, + "step": 27720 + }, + { + "epoch": 2.62, + "grad_norm": 58.21875, + "learning_rate": 4.7551548998798824e-07, + "loss": 1.5237, + "step": 27740 + }, + { + "epoch": 2.62, + "grad_norm": 58.375, + "learning_rate": 4.7513734645718266e-07, + "loss": 1.5272, + "step": 27760 + }, + { + "epoch": 2.63, + "grad_norm": 59.9375, + "learning_rate": 4.7475920292637713e-07, + "loss": 1.5752, + "step": 27780 + }, + { + "epoch": 2.63, + "grad_norm": 63.25, + "learning_rate": 4.7438105939557155e-07, + "loss": 1.5307, + "step": 27800 + }, + { + "epoch": 2.63, + "grad_norm": 67.5, + "learning_rate": 4.74002915864766e-07, + "loss": 1.5077, + "step": 27820 + }, + { + "epoch": 2.63, + "grad_norm": 72.625, + "learning_rate": 4.736247723339605e-07, + "loss": 1.4981, + "step": 27840 + }, + { + "epoch": 2.63, + "grad_norm": 54.125, + "learning_rate": 4.732466288031549e-07, + "loss": 1.62, + "step": 27860 + }, + { + "epoch": 2.64, + "grad_norm": 61.59375, + "learning_rate": 4.7286848527234934e-07, + "loss": 1.5666, + "step": 27880 + }, + { + "epoch": 2.64, + "grad_norm": 55.15625, + "learning_rate": 4.7249034174154377e-07, + "loss": 1.5126, + "step": 27900 + }, + { + "epoch": 2.64, + "grad_norm": 58.5, + "learning_rate": 4.7211219821073824e-07, + "loss": 1.6101, + "step": 27920 + }, + { + "epoch": 2.64, + "grad_norm": 64.125, + "learning_rate": 4.7173405467993266e-07, + "loss": 1.5902, + "step": 27940 + }, + { + "epoch": 2.64, + "grad_norm": 51.25, + "learning_rate": 4.713559111491271e-07, + "loss": 1.5703, + "step": 27960 + }, + { + "epoch": 2.65, + "grad_norm": 56.21875, + "learning_rate": 4.7097776761832156e-07, + "loss": 1.57, + "step": 27980 + }, + { + "epoch": 2.65, + "grad_norm": 117.5625, + "learning_rate": 4.70599624087516e-07, + "loss": 1.4706, + "step": 28000 + }, + { + "epoch": 2.65, + "grad_norm": 58.75, + "learning_rate": 4.7022148055671045e-07, + "loss": 1.49, + "step": 28020 + }, + { + "epoch": 2.65, + "grad_norm": 85.1875, + "learning_rate": 4.698433370259049e-07, + "loss": 1.4867, + "step": 28040 + }, + { + "epoch": 2.65, + "grad_norm": 62.3125, + "learning_rate": 4.6946519349509935e-07, + "loss": 1.6106, + "step": 28060 + }, + { + "epoch": 2.65, + "grad_norm": 69.8125, + "learning_rate": 4.6908704996429377e-07, + "loss": 1.5242, + "step": 28080 + }, + { + "epoch": 2.66, + "grad_norm": 73.9375, + "learning_rate": 4.687089064334882e-07, + "loss": 1.5871, + "step": 28100 + }, + { + "epoch": 2.66, + "grad_norm": 75.6875, + "learning_rate": 4.6833076290268266e-07, + "loss": 1.5284, + "step": 28120 + }, + { + "epoch": 2.66, + "grad_norm": 60.28125, + "learning_rate": 4.679526193718771e-07, + "loss": 1.5682, + "step": 28140 + }, + { + "epoch": 2.66, + "grad_norm": 52.8125, + "learning_rate": 4.675744758410715e-07, + "loss": 1.5701, + "step": 28160 + }, + { + "epoch": 2.66, + "grad_norm": 71.5, + "learning_rate": 4.67196332310266e-07, + "loss": 1.5179, + "step": 28180 + }, + { + "epoch": 2.67, + "grad_norm": 57.90625, + "learning_rate": 4.6681818877946046e-07, + "loss": 1.544, + "step": 28200 + }, + { + "epoch": 2.67, + "grad_norm": 81.5625, + "learning_rate": 4.664400452486549e-07, + "loss": 1.5441, + "step": 28220 + }, + { + "epoch": 2.67, + "grad_norm": 76.0625, + "learning_rate": 4.660619017178493e-07, + "loss": 1.5856, + "step": 28240 + }, + { + "epoch": 2.67, + "grad_norm": 77.125, + "learning_rate": 4.6568375818704377e-07, + "loss": 1.5011, + "step": 28260 + }, + { + "epoch": 2.67, + "grad_norm": 67.6875, + "learning_rate": 4.653056146562382e-07, + "loss": 1.6049, + "step": 28280 + }, + { + "epoch": 2.68, + "grad_norm": 59.4375, + "learning_rate": 4.649274711254326e-07, + "loss": 1.5911, + "step": 28300 + }, + { + "epoch": 2.68, + "grad_norm": 76.0625, + "learning_rate": 4.6454932759462704e-07, + "loss": 1.5146, + "step": 28320 + }, + { + "epoch": 2.68, + "grad_norm": 56.34375, + "learning_rate": 4.641711840638215e-07, + "loss": 1.4633, + "step": 28340 + }, + { + "epoch": 2.68, + "grad_norm": 70.5, + "learning_rate": 4.63793040533016e-07, + "loss": 1.5427, + "step": 28360 + }, + { + "epoch": 2.68, + "grad_norm": 72.0625, + "learning_rate": 4.634148970022104e-07, + "loss": 1.5296, + "step": 28380 + }, + { + "epoch": 2.68, + "grad_norm": 63.96875, + "learning_rate": 4.630367534714049e-07, + "loss": 1.4503, + "step": 28400 + }, + { + "epoch": 2.69, + "grad_norm": 61.09375, + "learning_rate": 4.626586099405993e-07, + "loss": 1.5682, + "step": 28420 + }, + { + "epoch": 2.69, + "grad_norm": 109.375, + "learning_rate": 4.622804664097937e-07, + "loss": 1.6351, + "step": 28440 + }, + { + "epoch": 2.69, + "grad_norm": 64.4375, + "learning_rate": 4.6190232287898814e-07, + "loss": 1.5257, + "step": 28460 + }, + { + "epoch": 2.69, + "grad_norm": 66.3125, + "learning_rate": 4.615241793481826e-07, + "loss": 1.4647, + "step": 28480 + }, + { + "epoch": 2.69, + "grad_norm": 62.1875, + "learning_rate": 4.6114603581737704e-07, + "loss": 1.5068, + "step": 28500 + }, + { + "epoch": 2.7, + "grad_norm": 60.46875, + "learning_rate": 4.6076789228657146e-07, + "loss": 1.5132, + "step": 28520 + }, + { + "epoch": 2.7, + "grad_norm": 63.21875, + "learning_rate": 4.60389748755766e-07, + "loss": 1.5879, + "step": 28540 + }, + { + "epoch": 2.7, + "grad_norm": 77.9375, + "learning_rate": 4.600116052249604e-07, + "loss": 1.5218, + "step": 28560 + }, + { + "epoch": 2.7, + "grad_norm": 58.0625, + "learning_rate": 4.5963346169415483e-07, + "loss": 1.596, + "step": 28580 + }, + { + "epoch": 2.7, + "grad_norm": 48.78125, + "learning_rate": 4.592553181633493e-07, + "loss": 1.6146, + "step": 28600 + }, + { + "epoch": 2.71, + "grad_norm": 77.8125, + "learning_rate": 4.5887717463254373e-07, + "loss": 1.4568, + "step": 28620 + }, + { + "epoch": 2.71, + "grad_norm": 155.25, + "learning_rate": 4.5849903110173815e-07, + "loss": 1.5704, + "step": 28640 + }, + { + "epoch": 2.71, + "grad_norm": 56.4375, + "learning_rate": 4.5812088757093257e-07, + "loss": 1.5383, + "step": 28660 + }, + { + "epoch": 2.71, + "grad_norm": 98.5, + "learning_rate": 4.5774274404012704e-07, + "loss": 1.5281, + "step": 28680 + }, + { + "epoch": 2.71, + "grad_norm": 62.25, + "learning_rate": 4.5736460050932146e-07, + "loss": 1.5042, + "step": 28700 + }, + { + "epoch": 2.72, + "grad_norm": 65.625, + "learning_rate": 4.5698645697851594e-07, + "loss": 1.5688, + "step": 28720 + }, + { + "epoch": 2.72, + "grad_norm": 54.90625, + "learning_rate": 4.566083134477104e-07, + "loss": 1.5805, + "step": 28740 + }, + { + "epoch": 2.72, + "grad_norm": 57.90625, + "learning_rate": 4.5623016991690483e-07, + "loss": 1.5071, + "step": 28760 + }, + { + "epoch": 2.72, + "grad_norm": 63.375, + "learning_rate": 4.5585202638609926e-07, + "loss": 1.3748, + "step": 28780 + }, + { + "epoch": 2.72, + "grad_norm": 67.6875, + "learning_rate": 4.554738828552937e-07, + "loss": 1.6716, + "step": 28800 + }, + { + "epoch": 2.72, + "grad_norm": 73.5625, + "learning_rate": 4.5509573932448815e-07, + "loss": 1.4361, + "step": 28820 + }, + { + "epoch": 2.73, + "grad_norm": 62.0625, + "learning_rate": 4.5471759579368257e-07, + "loss": 1.6571, + "step": 28840 + }, + { + "epoch": 2.73, + "grad_norm": 58.5, + "learning_rate": 4.54339452262877e-07, + "loss": 1.5489, + "step": 28860 + }, + { + "epoch": 2.73, + "grad_norm": 53.53125, + "learning_rate": 4.539613087320715e-07, + "loss": 1.5146, + "step": 28880 + }, + { + "epoch": 2.73, + "grad_norm": 47.5, + "learning_rate": 4.5358316520126594e-07, + "loss": 1.5828, + "step": 28900 + }, + { + "epoch": 2.73, + "grad_norm": 50.25, + "learning_rate": 4.5320502167046036e-07, + "loss": 1.4854, + "step": 28920 + }, + { + "epoch": 2.74, + "grad_norm": 73.75, + "learning_rate": 4.528268781396548e-07, + "loss": 1.5217, + "step": 28940 + }, + { + "epoch": 2.74, + "grad_norm": 56.71875, + "learning_rate": 4.5244873460884926e-07, + "loss": 1.5262, + "step": 28960 + }, + { + "epoch": 2.74, + "grad_norm": 54.8125, + "learning_rate": 4.520705910780437e-07, + "loss": 1.5702, + "step": 28980 + }, + { + "epoch": 2.74, + "grad_norm": 80.3125, + "learning_rate": 4.516924475472381e-07, + "loss": 1.5637, + "step": 29000 + }, + { + "epoch": 2.74, + "grad_norm": 107.75, + "learning_rate": 4.513143040164326e-07, + "loss": 1.5309, + "step": 29020 + }, + { + "epoch": 2.75, + "grad_norm": 48.09375, + "learning_rate": 4.50936160485627e-07, + "loss": 1.5284, + "step": 29040 + }, + { + "epoch": 2.75, + "grad_norm": 56.90625, + "learning_rate": 4.5055801695482147e-07, + "loss": 1.5613, + "step": 29060 + }, + { + "epoch": 2.75, + "grad_norm": 62.375, + "learning_rate": 4.5017987342401595e-07, + "loss": 1.5028, + "step": 29080 + }, + { + "epoch": 2.75, + "grad_norm": 59.625, + "learning_rate": 4.4980172989321037e-07, + "loss": 1.6023, + "step": 29100 + }, + { + "epoch": 2.75, + "grad_norm": 70.625, + "learning_rate": 4.494235863624048e-07, + "loss": 1.5756, + "step": 29120 + }, + { + "epoch": 2.75, + "grad_norm": 66.8125, + "learning_rate": 4.490454428315992e-07, + "loss": 1.6159, + "step": 29140 + }, + { + "epoch": 2.76, + "grad_norm": 67.8125, + "learning_rate": 4.486672993007937e-07, + "loss": 1.5633, + "step": 29160 + }, + { + "epoch": 2.76, + "grad_norm": 44.71875, + "learning_rate": 4.482891557699881e-07, + "loss": 1.5654, + "step": 29180 + }, + { + "epoch": 2.76, + "grad_norm": 58.59375, + "learning_rate": 4.4791101223918253e-07, + "loss": 1.5233, + "step": 29200 + }, + { + "epoch": 2.76, + "grad_norm": 61.90625, + "learning_rate": 4.47532868708377e-07, + "loss": 1.5729, + "step": 29220 + }, + { + "epoch": 2.76, + "grad_norm": 76.5625, + "learning_rate": 4.471547251775715e-07, + "loss": 1.5359, + "step": 29240 + }, + { + "epoch": 2.77, + "grad_norm": 80.875, + "learning_rate": 4.467765816467659e-07, + "loss": 1.5896, + "step": 29260 + }, + { + "epoch": 2.77, + "grad_norm": 80.5, + "learning_rate": 4.463984381159603e-07, + "loss": 1.6558, + "step": 29280 + }, + { + "epoch": 2.77, + "grad_norm": 61.0625, + "learning_rate": 4.460202945851548e-07, + "loss": 1.5273, + "step": 29300 + }, + { + "epoch": 2.77, + "grad_norm": 64.1875, + "learning_rate": 4.456421510543492e-07, + "loss": 1.5447, + "step": 29320 + }, + { + "epoch": 2.77, + "grad_norm": 69.375, + "learning_rate": 4.4526400752354363e-07, + "loss": 1.5843, + "step": 29340 + }, + { + "epoch": 2.78, + "grad_norm": 68.125, + "learning_rate": 4.448858639927381e-07, + "loss": 1.5857, + "step": 29360 + }, + { + "epoch": 2.78, + "grad_norm": 68.0, + "learning_rate": 4.4450772046193253e-07, + "loss": 1.579, + "step": 29380 + }, + { + "epoch": 2.78, + "grad_norm": 70.6875, + "learning_rate": 4.44129576931127e-07, + "loss": 1.5822, + "step": 29400 + }, + { + "epoch": 2.78, + "grad_norm": 64.9375, + "learning_rate": 4.437514334003214e-07, + "loss": 1.6115, + "step": 29420 + }, + { + "epoch": 2.78, + "grad_norm": 101.9375, + "learning_rate": 4.433732898695159e-07, + "loss": 1.6074, + "step": 29440 + }, + { + "epoch": 2.79, + "grad_norm": 70.6875, + "learning_rate": 4.429951463387103e-07, + "loss": 1.5953, + "step": 29460 + }, + { + "epoch": 2.79, + "grad_norm": 57.25, + "learning_rate": 4.4261700280790474e-07, + "loss": 1.492, + "step": 29480 + }, + { + "epoch": 2.79, + "grad_norm": 70.5625, + "learning_rate": 4.422388592770992e-07, + "loss": 1.6068, + "step": 29500 + }, + { + "epoch": 2.79, + "grad_norm": 57.6875, + "learning_rate": 4.4186071574629364e-07, + "loss": 1.5683, + "step": 29520 + }, + { + "epoch": 2.79, + "grad_norm": 57.78125, + "learning_rate": 4.4148257221548806e-07, + "loss": 1.5758, + "step": 29540 + }, + { + "epoch": 2.79, + "grad_norm": 72.125, + "learning_rate": 4.411044286846825e-07, + "loss": 1.5107, + "step": 29560 + }, + { + "epoch": 2.8, + "grad_norm": 63.46875, + "learning_rate": 4.40726285153877e-07, + "loss": 1.5422, + "step": 29580 + }, + { + "epoch": 2.8, + "grad_norm": 43.5, + "learning_rate": 4.4034814162307143e-07, + "loss": 1.5609, + "step": 29600 + }, + { + "epoch": 2.8, + "grad_norm": 57.8125, + "learning_rate": 4.3996999809226585e-07, + "loss": 1.5686, + "step": 29620 + }, + { + "epoch": 2.8, + "grad_norm": 64.625, + "learning_rate": 4.395918545614603e-07, + "loss": 1.4721, + "step": 29640 + }, + { + "epoch": 2.8, + "grad_norm": 77.875, + "learning_rate": 4.3921371103065475e-07, + "loss": 1.5926, + "step": 29660 + }, + { + "epoch": 2.81, + "grad_norm": 66.0, + "learning_rate": 4.3883556749984917e-07, + "loss": 1.5014, + "step": 29680 + }, + { + "epoch": 2.81, + "grad_norm": 51.5, + "learning_rate": 4.3845742396904364e-07, + "loss": 1.5162, + "step": 29700 + }, + { + "epoch": 2.81, + "grad_norm": 80.625, + "learning_rate": 4.3807928043823806e-07, + "loss": 1.5249, + "step": 29720 + }, + { + "epoch": 2.81, + "grad_norm": 75.0, + "learning_rate": 4.377011369074325e-07, + "loss": 1.5725, + "step": 29740 + }, + { + "epoch": 2.81, + "grad_norm": 79.4375, + "learning_rate": 4.3732299337662696e-07, + "loss": 1.444, + "step": 29760 + }, + { + "epoch": 2.82, + "grad_norm": 57.4375, + "learning_rate": 4.3694484984582143e-07, + "loss": 1.5409, + "step": 29780 + }, + { + "epoch": 2.82, + "grad_norm": 79.1875, + "learning_rate": 4.3656670631501585e-07, + "loss": 1.5865, + "step": 29800 + }, + { + "epoch": 2.82, + "grad_norm": 75.4375, + "learning_rate": 4.361885627842103e-07, + "loss": 1.4412, + "step": 29820 + }, + { + "epoch": 2.82, + "grad_norm": 58.125, + "learning_rate": 4.3581041925340475e-07, + "loss": 1.5931, + "step": 29840 + }, + { + "epoch": 2.82, + "grad_norm": 69.4375, + "learning_rate": 4.3543227572259917e-07, + "loss": 1.6071, + "step": 29860 + }, + { + "epoch": 2.82, + "grad_norm": 60.65625, + "learning_rate": 4.350541321917936e-07, + "loss": 1.5137, + "step": 29880 + }, + { + "epoch": 2.83, + "grad_norm": 77.8125, + "learning_rate": 4.34675988660988e-07, + "loss": 1.5608, + "step": 29900 + }, + { + "epoch": 2.83, + "grad_norm": 45.5, + "learning_rate": 4.342978451301825e-07, + "loss": 1.5166, + "step": 29920 + }, + { + "epoch": 2.83, + "grad_norm": 59.5, + "learning_rate": 4.3391970159937696e-07, + "loss": 1.6169, + "step": 29940 + }, + { + "epoch": 2.83, + "grad_norm": 61.375, + "learning_rate": 4.335415580685714e-07, + "loss": 1.4541, + "step": 29960 + }, + { + "epoch": 2.83, + "grad_norm": 60.28125, + "learning_rate": 4.3316341453776586e-07, + "loss": 1.5651, + "step": 29980 + }, + { + "epoch": 2.84, + "grad_norm": 65.4375, + "learning_rate": 4.327852710069603e-07, + "loss": 1.6816, + "step": 30000 + }, + { + "epoch": 2.84, + "grad_norm": 60.125, + "learning_rate": 4.324071274761547e-07, + "loss": 1.502, + "step": 30020 + }, + { + "epoch": 2.84, + "grad_norm": 61.6875, + "learning_rate": 4.320289839453491e-07, + "loss": 1.4747, + "step": 30040 + }, + { + "epoch": 2.84, + "grad_norm": 47.0, + "learning_rate": 4.316508404145436e-07, + "loss": 1.5672, + "step": 30060 + }, + { + "epoch": 2.84, + "grad_norm": 63.0625, + "learning_rate": 4.31272696883738e-07, + "loss": 1.5038, + "step": 30080 + }, + { + "epoch": 2.85, + "grad_norm": 55.0625, + "learning_rate": 4.308945533529325e-07, + "loss": 1.5884, + "step": 30100 + }, + { + "epoch": 2.85, + "grad_norm": 56.4375, + "learning_rate": 4.3051640982212697e-07, + "loss": 1.5646, + "step": 30120 + }, + { + "epoch": 2.85, + "grad_norm": 85.8125, + "learning_rate": 4.301382662913214e-07, + "loss": 1.4855, + "step": 30140 + }, + { + "epoch": 2.85, + "grad_norm": 91.0, + "learning_rate": 4.297601227605158e-07, + "loss": 1.5274, + "step": 30160 + }, + { + "epoch": 2.85, + "grad_norm": 83.1875, + "learning_rate": 4.293819792297103e-07, + "loss": 1.5331, + "step": 30180 + }, + { + "epoch": 2.85, + "grad_norm": 57.71875, + "learning_rate": 4.290038356989047e-07, + "loss": 1.4909, + "step": 30200 + }, + { + "epoch": 2.86, + "grad_norm": 84.5, + "learning_rate": 4.286256921680991e-07, + "loss": 1.5937, + "step": 30220 + }, + { + "epoch": 2.86, + "grad_norm": 78.5, + "learning_rate": 4.2824754863729355e-07, + "loss": 1.5401, + "step": 30240 + }, + { + "epoch": 2.86, + "grad_norm": 65.1875, + "learning_rate": 4.27869405106488e-07, + "loss": 1.5679, + "step": 30260 + }, + { + "epoch": 2.86, + "grad_norm": 66.25, + "learning_rate": 4.274912615756825e-07, + "loss": 1.4403, + "step": 30280 + }, + { + "epoch": 2.86, + "grad_norm": 56.34375, + "learning_rate": 4.271131180448769e-07, + "loss": 1.515, + "step": 30300 + }, + { + "epoch": 2.87, + "grad_norm": 50.9375, + "learning_rate": 4.267349745140714e-07, + "loss": 1.4955, + "step": 30320 + }, + { + "epoch": 2.87, + "grad_norm": 58.6875, + "learning_rate": 4.263568309832658e-07, + "loss": 1.4947, + "step": 30340 + }, + { + "epoch": 2.87, + "grad_norm": 51.75, + "learning_rate": 4.2597868745246023e-07, + "loss": 1.578, + "step": 30360 + }, + { + "epoch": 2.87, + "grad_norm": 85.9375, + "learning_rate": 4.2560054392165465e-07, + "loss": 1.5881, + "step": 30380 + }, + { + "epoch": 2.87, + "grad_norm": 79.25, + "learning_rate": 4.2522240039084913e-07, + "loss": 1.5744, + "step": 30400 + }, + { + "epoch": 2.88, + "grad_norm": 76.8125, + "learning_rate": 4.2484425686004355e-07, + "loss": 1.5821, + "step": 30420 + }, + { + "epoch": 2.88, + "grad_norm": 51.375, + "learning_rate": 4.2446611332923797e-07, + "loss": 1.585, + "step": 30440 + }, + { + "epoch": 2.88, + "grad_norm": 62.59375, + "learning_rate": 4.240879697984325e-07, + "loss": 1.5283, + "step": 30460 + }, + { + "epoch": 2.88, + "grad_norm": 52.0625, + "learning_rate": 4.237098262676269e-07, + "loss": 1.4999, + "step": 30480 + }, + { + "epoch": 2.88, + "grad_norm": 51.65625, + "learning_rate": 4.2333168273682134e-07, + "loss": 1.6095, + "step": 30500 + }, + { + "epoch": 2.89, + "grad_norm": 76.4375, + "learning_rate": 4.2295353920601576e-07, + "loss": 1.5921, + "step": 30520 + }, + { + "epoch": 2.89, + "grad_norm": 72.1875, + "learning_rate": 4.2257539567521024e-07, + "loss": 1.6093, + "step": 30540 + }, + { + "epoch": 2.89, + "grad_norm": 67.25, + "learning_rate": 4.2219725214440466e-07, + "loss": 1.4845, + "step": 30560 + }, + { + "epoch": 2.89, + "grad_norm": 55.6875, + "learning_rate": 4.218191086135991e-07, + "loss": 1.584, + "step": 30580 + }, + { + "epoch": 2.89, + "grad_norm": 50.96875, + "learning_rate": 4.2144096508279355e-07, + "loss": 1.4808, + "step": 30600 + }, + { + "epoch": 2.89, + "grad_norm": 100.75, + "learning_rate": 4.21062821551988e-07, + "loss": 1.5642, + "step": 30620 + }, + { + "epoch": 2.9, + "grad_norm": 53.71875, + "learning_rate": 4.2068467802118245e-07, + "loss": 1.5242, + "step": 30640 + }, + { + "epoch": 2.9, + "grad_norm": 54.25, + "learning_rate": 4.2030653449037687e-07, + "loss": 1.5593, + "step": 30660 + }, + { + "epoch": 2.9, + "grad_norm": 61.28125, + "learning_rate": 4.1992839095957134e-07, + "loss": 1.5166, + "step": 30680 + }, + { + "epoch": 2.9, + "grad_norm": 56.5625, + "learning_rate": 4.1955024742876577e-07, + "loss": 1.6194, + "step": 30700 + }, + { + "epoch": 2.9, + "grad_norm": 59.03125, + "learning_rate": 4.191721038979602e-07, + "loss": 1.5371, + "step": 30720 + }, + { + "epoch": 2.91, + "grad_norm": 55.03125, + "learning_rate": 4.1879396036715466e-07, + "loss": 1.4101, + "step": 30740 + }, + { + "epoch": 2.91, + "grad_norm": 68.125, + "learning_rate": 4.184158168363491e-07, + "loss": 1.5908, + "step": 30760 + }, + { + "epoch": 2.91, + "grad_norm": 51.59375, + "learning_rate": 4.180376733055435e-07, + "loss": 1.5034, + "step": 30780 + }, + { + "epoch": 2.91, + "grad_norm": 58.4375, + "learning_rate": 4.1765952977473803e-07, + "loss": 1.5745, + "step": 30800 + }, + { + "epoch": 2.91, + "grad_norm": 75.4375, + "learning_rate": 4.1728138624393245e-07, + "loss": 1.5812, + "step": 30820 + }, + { + "epoch": 2.92, + "grad_norm": 66.875, + "learning_rate": 4.169032427131269e-07, + "loss": 1.5836, + "step": 30840 + }, + { + "epoch": 2.92, + "grad_norm": 53.125, + "learning_rate": 4.165250991823213e-07, + "loss": 1.534, + "step": 30860 + }, + { + "epoch": 2.92, + "grad_norm": 76.0625, + "learning_rate": 4.1614695565151577e-07, + "loss": 1.5019, + "step": 30880 + }, + { + "epoch": 2.92, + "grad_norm": 61.9375, + "learning_rate": 4.157688121207102e-07, + "loss": 1.507, + "step": 30900 + }, + { + "epoch": 2.92, + "grad_norm": 85.25, + "learning_rate": 4.153906685899046e-07, + "loss": 1.5362, + "step": 30920 + }, + { + "epoch": 2.92, + "grad_norm": 71.625, + "learning_rate": 4.150125250590991e-07, + "loss": 1.5695, + "step": 30940 + }, + { + "epoch": 2.93, + "grad_norm": 69.25, + "learning_rate": 4.146343815282935e-07, + "loss": 1.5302, + "step": 30960 + }, + { + "epoch": 2.93, + "grad_norm": 79.0, + "learning_rate": 4.14256237997488e-07, + "loss": 1.545, + "step": 30980 + }, + { + "epoch": 2.93, + "grad_norm": 68.125, + "learning_rate": 4.138780944666824e-07, + "loss": 1.5746, + "step": 31000 + }, + { + "epoch": 2.93, + "grad_norm": 56.15625, + "learning_rate": 4.134999509358769e-07, + "loss": 1.5426, + "step": 31020 + }, + { + "epoch": 2.93, + "grad_norm": 53.6875, + "learning_rate": 4.131218074050713e-07, + "loss": 1.5182, + "step": 31040 + }, + { + "epoch": 2.94, + "grad_norm": 89.4375, + "learning_rate": 4.127436638742657e-07, + "loss": 1.4772, + "step": 31060 + }, + { + "epoch": 2.94, + "grad_norm": 66.375, + "learning_rate": 4.123655203434602e-07, + "loss": 1.5478, + "step": 31080 + }, + { + "epoch": 2.94, + "grad_norm": 65.8125, + "learning_rate": 4.119873768126546e-07, + "loss": 1.4511, + "step": 31100 + }, + { + "epoch": 2.94, + "grad_norm": 56.5, + "learning_rate": 4.1160923328184904e-07, + "loss": 1.5412, + "step": 31120 + }, + { + "epoch": 2.94, + "grad_norm": 50.8125, + "learning_rate": 4.1123108975104346e-07, + "loss": 1.6, + "step": 31140 + }, + { + "epoch": 2.95, + "grad_norm": 71.5625, + "learning_rate": 4.10852946220238e-07, + "loss": 1.4424, + "step": 31160 + }, + { + "epoch": 2.95, + "grad_norm": 59.28125, + "learning_rate": 4.104748026894324e-07, + "loss": 1.5589, + "step": 31180 + }, + { + "epoch": 2.95, + "grad_norm": 56.6875, + "learning_rate": 4.1009665915862683e-07, + "loss": 1.5787, + "step": 31200 + }, + { + "epoch": 2.95, + "grad_norm": 61.6875, + "learning_rate": 4.097185156278213e-07, + "loss": 1.4297, + "step": 31220 + }, + { + "epoch": 2.95, + "grad_norm": 64.4375, + "learning_rate": 4.093403720970157e-07, + "loss": 1.557, + "step": 31240 + }, + { + "epoch": 2.96, + "grad_norm": 49.65625, + "learning_rate": 4.0896222856621014e-07, + "loss": 1.518, + "step": 31260 + }, + { + "epoch": 2.96, + "grad_norm": 59.5, + "learning_rate": 4.0858408503540457e-07, + "loss": 1.5745, + "step": 31280 + }, + { + "epoch": 2.96, + "grad_norm": 58.25, + "learning_rate": 4.0820594150459904e-07, + "loss": 1.4711, + "step": 31300 + }, + { + "epoch": 2.96, + "grad_norm": 66.625, + "learning_rate": 4.0782779797379346e-07, + "loss": 1.5911, + "step": 31320 + }, + { + "epoch": 2.96, + "grad_norm": 78.6875, + "learning_rate": 4.0744965444298794e-07, + "loss": 1.5814, + "step": 31340 + }, + { + "epoch": 2.96, + "grad_norm": 61.25, + "learning_rate": 4.070715109121824e-07, + "loss": 1.4427, + "step": 31360 + }, + { + "epoch": 2.97, + "grad_norm": 69.875, + "learning_rate": 4.0669336738137683e-07, + "loss": 1.4784, + "step": 31380 + }, + { + "epoch": 2.97, + "grad_norm": 50.6875, + "learning_rate": 4.0631522385057125e-07, + "loss": 1.5007, + "step": 31400 + }, + { + "epoch": 2.97, + "grad_norm": 66.8125, + "learning_rate": 4.0593708031976573e-07, + "loss": 1.6264, + "step": 31420 + }, + { + "epoch": 2.97, + "grad_norm": 62.65625, + "learning_rate": 4.0555893678896015e-07, + "loss": 1.5666, + "step": 31440 + }, + { + "epoch": 2.97, + "grad_norm": 71.5, + "learning_rate": 4.0518079325815457e-07, + "loss": 1.736, + "step": 31460 + }, + { + "epoch": 2.98, + "grad_norm": 69.1875, + "learning_rate": 4.04802649727349e-07, + "loss": 1.543, + "step": 31480 + }, + { + "epoch": 2.98, + "grad_norm": 70.1875, + "learning_rate": 4.044245061965435e-07, + "loss": 1.5761, + "step": 31500 + }, + { + "epoch": 2.98, + "grad_norm": 65.75, + "learning_rate": 4.0404636266573794e-07, + "loss": 1.5896, + "step": 31520 + }, + { + "epoch": 2.98, + "grad_norm": 54.65625, + "learning_rate": 4.0366821913493236e-07, + "loss": 1.4913, + "step": 31540 + }, + { + "epoch": 2.98, + "grad_norm": 73.0625, + "learning_rate": 4.0329007560412683e-07, + "loss": 1.5724, + "step": 31560 + }, + { + "epoch": 2.99, + "grad_norm": 66.3125, + "learning_rate": 4.0291193207332126e-07, + "loss": 1.5511, + "step": 31580 + }, + { + "epoch": 2.99, + "grad_norm": 68.5625, + "learning_rate": 4.025337885425157e-07, + "loss": 1.5837, + "step": 31600 + }, + { + "epoch": 2.99, + "grad_norm": 62.375, + "learning_rate": 4.021556450117101e-07, + "loss": 1.5772, + "step": 31620 + }, + { + "epoch": 2.99, + "grad_norm": 69.25, + "learning_rate": 4.0177750148090457e-07, + "loss": 1.5872, + "step": 31640 + }, + { + "epoch": 2.99, + "grad_norm": 69.0625, + "learning_rate": 4.01399357950099e-07, + "loss": 1.5467, + "step": 31660 + }, + { + "epoch": 2.99, + "grad_norm": 54.21875, + "learning_rate": 4.0102121441929347e-07, + "loss": 1.5222, + "step": 31680 + }, + { + "epoch": 3.0, + "grad_norm": 67.6875, + "learning_rate": 4.0064307088848794e-07, + "loss": 1.5318, + "step": 31700 + }, + { + "epoch": 3.0, + "grad_norm": 77.0625, + "learning_rate": 4.0026492735768236e-07, + "loss": 1.5116, + "step": 31720 + }, + { + "epoch": 3.0, + "grad_norm": 63.15625, + "learning_rate": 3.998867838268768e-07, + "loss": 1.4655, + "step": 31740 + }, + { + "epoch": 3.0, + "grad_norm": 72.25, + "learning_rate": 3.995086402960712e-07, + "loss": 1.3648, + "step": 31760 + }, + { + "epoch": 3.0, + "grad_norm": 50.46875, + "learning_rate": 3.991304967652657e-07, + "loss": 1.3922, + "step": 31780 + }, + { + "epoch": 3.01, + "grad_norm": 52.625, + "learning_rate": 3.987523532344601e-07, + "loss": 1.4272, + "step": 31800 + }, + { + "epoch": 3.01, + "grad_norm": 68.375, + "learning_rate": 3.983742097036545e-07, + "loss": 1.3381, + "step": 31820 + }, + { + "epoch": 3.01, + "grad_norm": 61.75, + "learning_rate": 3.97996066172849e-07, + "loss": 1.3762, + "step": 31840 + }, + { + "epoch": 3.01, + "grad_norm": 64.0, + "learning_rate": 3.9761792264204347e-07, + "loss": 1.36, + "step": 31860 + }, + { + "epoch": 3.01, + "grad_norm": 54.15625, + "learning_rate": 3.972397791112379e-07, + "loss": 1.3718, + "step": 31880 + }, + { + "epoch": 3.02, + "grad_norm": 57.75, + "learning_rate": 3.9686163558043237e-07, + "loss": 1.4966, + "step": 31900 + }, + { + "epoch": 3.02, + "grad_norm": 75.0625, + "learning_rate": 3.964834920496268e-07, + "loss": 1.4002, + "step": 31920 + }, + { + "epoch": 3.02, + "grad_norm": 53.6875, + "learning_rate": 3.961053485188212e-07, + "loss": 1.3682, + "step": 31940 + }, + { + "epoch": 3.02, + "grad_norm": 60.3125, + "learning_rate": 3.9572720498801563e-07, + "loss": 1.4108, + "step": 31960 + }, + { + "epoch": 3.02, + "grad_norm": 100.0625, + "learning_rate": 3.953490614572101e-07, + "loss": 1.4387, + "step": 31980 + }, + { + "epoch": 3.03, + "grad_norm": 61.40625, + "learning_rate": 3.9497091792640453e-07, + "loss": 1.3657, + "step": 32000 + }, + { + "epoch": 3.03, + "grad_norm": 57.0625, + "learning_rate": 3.94592774395599e-07, + "loss": 1.3044, + "step": 32020 + }, + { + "epoch": 3.03, + "grad_norm": 75.625, + "learning_rate": 3.942146308647935e-07, + "loss": 1.5226, + "step": 32040 + }, + { + "epoch": 3.03, + "grad_norm": 79.125, + "learning_rate": 3.938364873339879e-07, + "loss": 1.5141, + "step": 32060 + }, + { + "epoch": 3.03, + "grad_norm": 77.125, + "learning_rate": 3.934583438031823e-07, + "loss": 1.3971, + "step": 32080 + }, + { + "epoch": 3.03, + "grad_norm": 74.6875, + "learning_rate": 3.9308020027237674e-07, + "loss": 1.3343, + "step": 32100 + }, + { + "epoch": 3.04, + "grad_norm": 58.6875, + "learning_rate": 3.927020567415712e-07, + "loss": 1.3938, + "step": 32120 + }, + { + "epoch": 3.04, + "grad_norm": 57.375, + "learning_rate": 3.9232391321076564e-07, + "loss": 1.4423, + "step": 32140 + }, + { + "epoch": 3.04, + "grad_norm": 64.875, + "learning_rate": 3.9194576967996006e-07, + "loss": 1.4377, + "step": 32160 + }, + { + "epoch": 3.04, + "grad_norm": 57.78125, + "learning_rate": 3.9156762614915453e-07, + "loss": 1.416, + "step": 32180 + }, + { + "epoch": 3.04, + "grad_norm": 52.8125, + "learning_rate": 3.91189482618349e-07, + "loss": 1.4056, + "step": 32200 + }, + { + "epoch": 3.05, + "grad_norm": 52.375, + "learning_rate": 3.908113390875434e-07, + "loss": 1.4797, + "step": 32220 + }, + { + "epoch": 3.05, + "grad_norm": 66.0625, + "learning_rate": 3.9043319555673785e-07, + "loss": 1.3557, + "step": 32240 + }, + { + "epoch": 3.05, + "grad_norm": 67.5, + "learning_rate": 3.900550520259323e-07, + "loss": 1.3408, + "step": 32260 + }, + { + "epoch": 3.05, + "grad_norm": 61.5625, + "learning_rate": 3.8967690849512674e-07, + "loss": 1.4133, + "step": 32280 + }, + { + "epoch": 3.05, + "grad_norm": 70.9375, + "learning_rate": 3.8929876496432116e-07, + "loss": 1.4408, + "step": 32300 + }, + { + "epoch": 3.06, + "grad_norm": 60.65625, + "learning_rate": 3.8892062143351564e-07, + "loss": 1.3561, + "step": 32320 + }, + { + "epoch": 3.06, + "grad_norm": 57.78125, + "learning_rate": 3.8854247790271006e-07, + "loss": 1.4019, + "step": 32340 + }, + { + "epoch": 3.06, + "grad_norm": 54.75, + "learning_rate": 3.881643343719045e-07, + "loss": 1.2897, + "step": 32360 + }, + { + "epoch": 3.06, + "grad_norm": 55.5, + "learning_rate": 3.8778619084109896e-07, + "loss": 1.4233, + "step": 32380 + }, + { + "epoch": 3.06, + "grad_norm": 57.90625, + "learning_rate": 3.8740804731029343e-07, + "loss": 1.3995, + "step": 32400 + }, + { + "epoch": 3.06, + "grad_norm": 60.0625, + "learning_rate": 3.8702990377948785e-07, + "loss": 1.3652, + "step": 32420 + }, + { + "epoch": 3.07, + "grad_norm": 62.25, + "learning_rate": 3.8665176024868227e-07, + "loss": 1.4912, + "step": 32440 + }, + { + "epoch": 3.07, + "grad_norm": 69.0625, + "learning_rate": 3.8627361671787675e-07, + "loss": 1.369, + "step": 32460 + }, + { + "epoch": 3.07, + "grad_norm": 50.65625, + "learning_rate": 3.8589547318707117e-07, + "loss": 1.426, + "step": 32480 + }, + { + "epoch": 3.07, + "grad_norm": 57.90625, + "learning_rate": 3.855173296562656e-07, + "loss": 1.2934, + "step": 32500 + }, + { + "epoch": 3.07, + "grad_norm": 70.875, + "learning_rate": 3.8513918612546e-07, + "loss": 1.4486, + "step": 32520 + }, + { + "epoch": 3.08, + "grad_norm": 70.4375, + "learning_rate": 3.847610425946545e-07, + "loss": 1.5197, + "step": 32540 + }, + { + "epoch": 3.08, + "grad_norm": 60.625, + "learning_rate": 3.8438289906384896e-07, + "loss": 1.4181, + "step": 32560 + }, + { + "epoch": 3.08, + "grad_norm": 70.75, + "learning_rate": 3.840047555330434e-07, + "loss": 1.3959, + "step": 32580 + }, + { + "epoch": 3.08, + "grad_norm": 89.6875, + "learning_rate": 3.8362661200223785e-07, + "loss": 1.3343, + "step": 32600 + }, + { + "epoch": 3.08, + "grad_norm": 55.75, + "learning_rate": 3.832484684714323e-07, + "loss": 1.4992, + "step": 32620 + }, + { + "epoch": 3.09, + "grad_norm": 72.875, + "learning_rate": 3.828703249406267e-07, + "loss": 1.4326, + "step": 32640 + }, + { + "epoch": 3.09, + "grad_norm": 59.40625, + "learning_rate": 3.8249218140982117e-07, + "loss": 1.4861, + "step": 32660 + }, + { + "epoch": 3.09, + "grad_norm": 50.625, + "learning_rate": 3.821140378790156e-07, + "loss": 1.4226, + "step": 32680 + }, + { + "epoch": 3.09, + "grad_norm": 56.3125, + "learning_rate": 3.8173589434821e-07, + "loss": 1.383, + "step": 32700 + }, + { + "epoch": 3.09, + "grad_norm": 56.09375, + "learning_rate": 3.813577508174045e-07, + "loss": 1.5258, + "step": 32720 + }, + { + "epoch": 3.1, + "grad_norm": 68.4375, + "learning_rate": 3.8097960728659896e-07, + "loss": 1.3827, + "step": 32740 + }, + { + "epoch": 3.1, + "grad_norm": 52.09375, + "learning_rate": 3.806014637557934e-07, + "loss": 1.4179, + "step": 32760 + }, + { + "epoch": 3.1, + "grad_norm": 55.5625, + "learning_rate": 3.802233202249878e-07, + "loss": 1.3972, + "step": 32780 + }, + { + "epoch": 3.1, + "grad_norm": 51.625, + "learning_rate": 3.798451766941823e-07, + "loss": 1.4764, + "step": 32800 + }, + { + "epoch": 3.1, + "grad_norm": 75.3125, + "learning_rate": 3.794670331633767e-07, + "loss": 1.4087, + "step": 32820 + }, + { + "epoch": 3.1, + "grad_norm": 67.8125, + "learning_rate": 3.790888896325711e-07, + "loss": 1.4917, + "step": 32840 + }, + { + "epoch": 3.11, + "grad_norm": 59.4375, + "learning_rate": 3.7871074610176554e-07, + "loss": 1.3819, + "step": 32860 + }, + { + "epoch": 3.11, + "grad_norm": 56.0625, + "learning_rate": 3.7833260257096e-07, + "loss": 1.3787, + "step": 32880 + }, + { + "epoch": 3.11, + "grad_norm": 64.0625, + "learning_rate": 3.779544590401545e-07, + "loss": 1.4065, + "step": 32900 + }, + { + "epoch": 3.11, + "grad_norm": 59.375, + "learning_rate": 3.775763155093489e-07, + "loss": 1.427, + "step": 32920 + }, + { + "epoch": 3.11, + "grad_norm": 61.09375, + "learning_rate": 3.771981719785434e-07, + "loss": 1.3911, + "step": 32940 + }, + { + "epoch": 3.12, + "grad_norm": 65.9375, + "learning_rate": 3.768200284477378e-07, + "loss": 1.3648, + "step": 32960 + }, + { + "epoch": 3.12, + "grad_norm": 66.6875, + "learning_rate": 3.7644188491693223e-07, + "loss": 1.4081, + "step": 32980 + }, + { + "epoch": 3.12, + "grad_norm": 55.40625, + "learning_rate": 3.7606374138612665e-07, + "loss": 1.4276, + "step": 33000 + }, + { + "epoch": 3.12, + "grad_norm": 54.34375, + "learning_rate": 3.756855978553211e-07, + "loss": 1.4641, + "step": 33020 + }, + { + "epoch": 3.12, + "grad_norm": 64.9375, + "learning_rate": 3.7530745432451555e-07, + "loss": 1.35, + "step": 33040 + }, + { + "epoch": 3.13, + "grad_norm": 46.28125, + "learning_rate": 3.7492931079370997e-07, + "loss": 1.4257, + "step": 33060 + }, + { + "epoch": 3.13, + "grad_norm": 57.6875, + "learning_rate": 3.745511672629045e-07, + "loss": 1.4374, + "step": 33080 + }, + { + "epoch": 3.13, + "grad_norm": 63.09375, + "learning_rate": 3.741730237320989e-07, + "loss": 1.4295, + "step": 33100 + }, + { + "epoch": 3.13, + "grad_norm": 69.4375, + "learning_rate": 3.7379488020129334e-07, + "loss": 1.3691, + "step": 33120 + }, + { + "epoch": 3.13, + "grad_norm": 63.34375, + "learning_rate": 3.734167366704878e-07, + "loss": 1.4291, + "step": 33140 + }, + { + "epoch": 3.13, + "grad_norm": 52.3125, + "learning_rate": 3.7303859313968223e-07, + "loss": 1.4837, + "step": 33160 + }, + { + "epoch": 3.14, + "grad_norm": 81.8125, + "learning_rate": 3.7266044960887665e-07, + "loss": 1.4837, + "step": 33180 + }, + { + "epoch": 3.14, + "grad_norm": 57.09375, + "learning_rate": 3.722823060780711e-07, + "loss": 1.3974, + "step": 33200 + }, + { + "epoch": 3.14, + "grad_norm": 65.1875, + "learning_rate": 3.7190416254726555e-07, + "loss": 1.3418, + "step": 33220 + }, + { + "epoch": 3.14, + "grad_norm": 70.25, + "learning_rate": 3.7152601901645997e-07, + "loss": 1.4303, + "step": 33240 + }, + { + "epoch": 3.14, + "grad_norm": 54.75, + "learning_rate": 3.7114787548565445e-07, + "loss": 1.4754, + "step": 33260 + }, + { + "epoch": 3.15, + "grad_norm": 65.8125, + "learning_rate": 3.707697319548489e-07, + "loss": 1.4474, + "step": 33280 + }, + { + "epoch": 3.15, + "grad_norm": 61.4375, + "learning_rate": 3.7039158842404334e-07, + "loss": 1.3412, + "step": 33300 + }, + { + "epoch": 3.15, + "grad_norm": 80.0, + "learning_rate": 3.7001344489323776e-07, + "loss": 1.3211, + "step": 33320 + }, + { + "epoch": 3.15, + "grad_norm": 62.0, + "learning_rate": 3.696353013624322e-07, + "loss": 1.3295, + "step": 33340 + }, + { + "epoch": 3.15, + "grad_norm": 70.375, + "learning_rate": 3.6925715783162666e-07, + "loss": 1.4678, + "step": 33360 + }, + { + "epoch": 3.16, + "grad_norm": 59.125, + "learning_rate": 3.688790143008211e-07, + "loss": 1.3333, + "step": 33380 + }, + { + "epoch": 3.16, + "grad_norm": 63.5625, + "learning_rate": 3.685008707700155e-07, + "loss": 1.3726, + "step": 33400 + }, + { + "epoch": 3.16, + "grad_norm": 64.625, + "learning_rate": 3.6812272723921003e-07, + "loss": 1.4712, + "step": 33420 + }, + { + "epoch": 3.16, + "grad_norm": 64.9375, + "learning_rate": 3.6774458370840445e-07, + "loss": 1.3021, + "step": 33440 + }, + { + "epoch": 3.16, + "grad_norm": 56.5625, + "learning_rate": 3.6736644017759887e-07, + "loss": 1.3805, + "step": 33460 + }, + { + "epoch": 3.17, + "grad_norm": 59.1875, + "learning_rate": 3.669882966467933e-07, + "loss": 1.4557, + "step": 33480 + }, + { + "epoch": 3.17, + "grad_norm": 64.1875, + "learning_rate": 3.6661015311598777e-07, + "loss": 1.4616, + "step": 33500 + }, + { + "epoch": 3.17, + "grad_norm": 68.6875, + "learning_rate": 3.662320095851822e-07, + "loss": 1.3874, + "step": 33520 + }, + { + "epoch": 3.17, + "grad_norm": 56.59375, + "learning_rate": 3.658538660543766e-07, + "loss": 1.4573, + "step": 33540 + }, + { + "epoch": 3.17, + "grad_norm": 70.4375, + "learning_rate": 3.654757225235711e-07, + "loss": 1.3524, + "step": 33560 + }, + { + "epoch": 3.17, + "grad_norm": 69.75, + "learning_rate": 3.650975789927655e-07, + "loss": 1.3711, + "step": 33580 + }, + { + "epoch": 3.18, + "grad_norm": 68.75, + "learning_rate": 3.6471943546196e-07, + "loss": 1.3215, + "step": 33600 + }, + { + "epoch": 3.18, + "grad_norm": 51.875, + "learning_rate": 3.6434129193115445e-07, + "loss": 1.4378, + "step": 33620 + }, + { + "epoch": 3.18, + "grad_norm": 53.5, + "learning_rate": 3.639631484003489e-07, + "loss": 1.4243, + "step": 33640 + }, + { + "epoch": 3.18, + "grad_norm": 118.8125, + "learning_rate": 3.635850048695433e-07, + "loss": 1.3906, + "step": 33660 + }, + { + "epoch": 3.18, + "grad_norm": 49.21875, + "learning_rate": 3.632068613387377e-07, + "loss": 1.3534, + "step": 33680 + }, + { + "epoch": 3.19, + "grad_norm": 61.1875, + "learning_rate": 3.628287178079322e-07, + "loss": 1.423, + "step": 33700 + }, + { + "epoch": 3.19, + "grad_norm": 72.3125, + "learning_rate": 3.624505742771266e-07, + "loss": 1.4193, + "step": 33720 + }, + { + "epoch": 3.19, + "grad_norm": 66.625, + "learning_rate": 3.6207243074632103e-07, + "loss": 1.4438, + "step": 33740 + }, + { + "epoch": 3.19, + "grad_norm": 70.9375, + "learning_rate": 3.6169428721551545e-07, + "loss": 1.4021, + "step": 33760 + }, + { + "epoch": 3.19, + "grad_norm": 64.4375, + "learning_rate": 3.6131614368471e-07, + "loss": 1.3662, + "step": 33780 + }, + { + "epoch": 3.2, + "grad_norm": 74.6875, + "learning_rate": 3.609380001539044e-07, + "loss": 1.4504, + "step": 33800 + }, + { + "epoch": 3.2, + "grad_norm": 83.25, + "learning_rate": 3.605598566230988e-07, + "loss": 1.3239, + "step": 33820 + }, + { + "epoch": 3.2, + "grad_norm": 78.375, + "learning_rate": 3.601817130922933e-07, + "loss": 1.4084, + "step": 33840 + }, + { + "epoch": 3.2, + "grad_norm": 78.5625, + "learning_rate": 3.598035695614877e-07, + "loss": 1.4374, + "step": 33860 + }, + { + "epoch": 3.2, + "grad_norm": 67.5, + "learning_rate": 3.5942542603068214e-07, + "loss": 1.3577, + "step": 33880 + }, + { + "epoch": 3.2, + "grad_norm": 54.09375, + "learning_rate": 3.590472824998766e-07, + "loss": 1.3353, + "step": 33900 + }, + { + "epoch": 3.21, + "grad_norm": 53.34375, + "learning_rate": 3.5866913896907104e-07, + "loss": 1.3715, + "step": 33920 + }, + { + "epoch": 3.21, + "grad_norm": 56.6875, + "learning_rate": 3.582909954382655e-07, + "loss": 1.3706, + "step": 33940 + }, + { + "epoch": 3.21, + "grad_norm": 84.9375, + "learning_rate": 3.5791285190745993e-07, + "loss": 1.384, + "step": 33960 + }, + { + "epoch": 3.21, + "grad_norm": 57.84375, + "learning_rate": 3.575347083766544e-07, + "loss": 1.3953, + "step": 33980 + }, + { + "epoch": 3.21, + "grad_norm": 78.625, + "learning_rate": 3.5715656484584883e-07, + "loss": 1.3413, + "step": 34000 + }, + { + "epoch": 3.22, + "grad_norm": 82.25, + "learning_rate": 3.5677842131504325e-07, + "loss": 1.3603, + "step": 34020 + }, + { + "epoch": 3.22, + "grad_norm": 62.34375, + "learning_rate": 3.564002777842377e-07, + "loss": 1.3919, + "step": 34040 + }, + { + "epoch": 3.22, + "grad_norm": 57.0, + "learning_rate": 3.5602213425343214e-07, + "loss": 1.4083, + "step": 34060 + }, + { + "epoch": 3.22, + "grad_norm": 77.875, + "learning_rate": 3.5564399072262657e-07, + "loss": 1.4505, + "step": 34080 + }, + { + "epoch": 3.22, + "grad_norm": 77.3125, + "learning_rate": 3.55265847191821e-07, + "loss": 1.3939, + "step": 34100 + }, + { + "epoch": 3.23, + "grad_norm": 70.3125, + "learning_rate": 3.548877036610155e-07, + "loss": 1.4308, + "step": 34120 + }, + { + "epoch": 3.23, + "grad_norm": 59.0, + "learning_rate": 3.5450956013020994e-07, + "loss": 1.3746, + "step": 34140 + }, + { + "epoch": 3.23, + "grad_norm": 68.375, + "learning_rate": 3.5413141659940436e-07, + "loss": 1.3084, + "step": 34160 + }, + { + "epoch": 3.23, + "grad_norm": 59.75, + "learning_rate": 3.5375327306859883e-07, + "loss": 1.4063, + "step": 34180 + }, + { + "epoch": 3.23, + "grad_norm": 62.4375, + "learning_rate": 3.5337512953779325e-07, + "loss": 1.4903, + "step": 34200 + }, + { + "epoch": 3.24, + "grad_norm": 63.875, + "learning_rate": 3.529969860069877e-07, + "loss": 1.4406, + "step": 34220 + }, + { + "epoch": 3.24, + "grad_norm": 61.71875, + "learning_rate": 3.526188424761821e-07, + "loss": 1.51, + "step": 34240 + }, + { + "epoch": 3.24, + "grad_norm": 73.1875, + "learning_rate": 3.5224069894537657e-07, + "loss": 1.375, + "step": 34260 + }, + { + "epoch": 3.24, + "grad_norm": 60.40625, + "learning_rate": 3.51862555414571e-07, + "loss": 1.3797, + "step": 34280 + }, + { + "epoch": 3.24, + "grad_norm": 55.25, + "learning_rate": 3.5148441188376547e-07, + "loss": 1.3388, + "step": 34300 + }, + { + "epoch": 3.24, + "grad_norm": 96.0, + "learning_rate": 3.5110626835295994e-07, + "loss": 1.4355, + "step": 34320 + }, + { + "epoch": 3.25, + "grad_norm": 65.125, + "learning_rate": 3.5072812482215436e-07, + "loss": 1.3663, + "step": 34340 + }, + { + "epoch": 3.25, + "grad_norm": 56.59375, + "learning_rate": 3.503499812913488e-07, + "loss": 1.3666, + "step": 34360 + }, + { + "epoch": 3.25, + "grad_norm": 62.875, + "learning_rate": 3.4997183776054326e-07, + "loss": 1.3237, + "step": 34380 + }, + { + "epoch": 3.25, + "grad_norm": 47.59375, + "learning_rate": 3.495936942297377e-07, + "loss": 1.5043, + "step": 34400 + }, + { + "epoch": 3.25, + "grad_norm": 64.75, + "learning_rate": 3.492155506989321e-07, + "loss": 1.3668, + "step": 34420 + }, + { + "epoch": 3.26, + "grad_norm": 76.0, + "learning_rate": 3.488374071681265e-07, + "loss": 1.4768, + "step": 34440 + }, + { + "epoch": 3.26, + "grad_norm": 55.625, + "learning_rate": 3.48459263637321e-07, + "loss": 1.3829, + "step": 34460 + }, + { + "epoch": 3.26, + "grad_norm": 55.375, + "learning_rate": 3.4808112010651547e-07, + "loss": 1.3135, + "step": 34480 + }, + { + "epoch": 3.26, + "grad_norm": 57.8125, + "learning_rate": 3.477029765757099e-07, + "loss": 1.3666, + "step": 34500 + }, + { + "epoch": 3.26, + "grad_norm": 62.34375, + "learning_rate": 3.4732483304490436e-07, + "loss": 1.3976, + "step": 34520 + }, + { + "epoch": 3.27, + "grad_norm": 56.21875, + "learning_rate": 3.469466895140988e-07, + "loss": 1.4472, + "step": 34540 + }, + { + "epoch": 3.27, + "grad_norm": 79.125, + "learning_rate": 3.465685459832932e-07, + "loss": 1.3524, + "step": 34560 + }, + { + "epoch": 3.27, + "grad_norm": 82.375, + "learning_rate": 3.4619040245248763e-07, + "loss": 1.3981, + "step": 34580 + }, + { + "epoch": 3.27, + "grad_norm": 57.21875, + "learning_rate": 3.458122589216821e-07, + "loss": 1.3506, + "step": 34600 + }, + { + "epoch": 3.27, + "grad_norm": 73.8125, + "learning_rate": 3.454341153908765e-07, + "loss": 1.4456, + "step": 34620 + }, + { + "epoch": 3.27, + "grad_norm": 97.1875, + "learning_rate": 3.45055971860071e-07, + "loss": 1.3992, + "step": 34640 + }, + { + "epoch": 3.28, + "grad_norm": 81.5625, + "learning_rate": 3.4467782832926547e-07, + "loss": 1.4132, + "step": 34660 + }, + { + "epoch": 3.28, + "grad_norm": 103.8125, + "learning_rate": 3.442996847984599e-07, + "loss": 1.3113, + "step": 34680 + }, + { + "epoch": 3.28, + "grad_norm": 63.34375, + "learning_rate": 3.439215412676543e-07, + "loss": 1.4059, + "step": 34700 + }, + { + "epoch": 3.28, + "grad_norm": 114.6875, + "learning_rate": 3.4354339773684874e-07, + "loss": 1.4594, + "step": 34720 + }, + { + "epoch": 3.28, + "grad_norm": 73.5, + "learning_rate": 3.431652542060432e-07, + "loss": 1.269, + "step": 34740 + }, + { + "epoch": 3.29, + "grad_norm": 67.5625, + "learning_rate": 3.4278711067523763e-07, + "loss": 1.3647, + "step": 34760 + }, + { + "epoch": 3.29, + "grad_norm": 79.75, + "learning_rate": 3.4240896714443205e-07, + "loss": 1.4436, + "step": 34780 + }, + { + "epoch": 3.29, + "grad_norm": 53.625, + "learning_rate": 3.4203082361362653e-07, + "loss": 1.3997, + "step": 34800 + }, + { + "epoch": 3.29, + "grad_norm": 66.125, + "learning_rate": 3.41652680082821e-07, + "loss": 1.3917, + "step": 34820 + }, + { + "epoch": 3.29, + "grad_norm": 60.1875, + "learning_rate": 3.412745365520154e-07, + "loss": 1.3912, + "step": 34840 + }, + { + "epoch": 3.3, + "grad_norm": 65.5, + "learning_rate": 3.408963930212099e-07, + "loss": 1.468, + "step": 34860 + }, + { + "epoch": 3.3, + "grad_norm": 81.25, + "learning_rate": 3.405182494904043e-07, + "loss": 1.4517, + "step": 34880 + }, + { + "epoch": 3.3, + "grad_norm": 74.0625, + "learning_rate": 3.4014010595959874e-07, + "loss": 1.5081, + "step": 34900 + }, + { + "epoch": 3.3, + "grad_norm": 70.125, + "learning_rate": 3.3976196242879316e-07, + "loss": 1.2948, + "step": 34920 + }, + { + "epoch": 3.3, + "grad_norm": 49.84375, + "learning_rate": 3.3938381889798764e-07, + "loss": 1.2734, + "step": 34940 + }, + { + "epoch": 3.3, + "grad_norm": 60.5, + "learning_rate": 3.3900567536718206e-07, + "loss": 1.5046, + "step": 34960 + }, + { + "epoch": 3.31, + "grad_norm": 104.375, + "learning_rate": 3.386275318363765e-07, + "loss": 1.4961, + "step": 34980 + }, + { + "epoch": 3.31, + "grad_norm": 61.78125, + "learning_rate": 3.38249388305571e-07, + "loss": 1.4419, + "step": 35000 + }, + { + "epoch": 3.31, + "grad_norm": 60.03125, + "learning_rate": 3.378712447747654e-07, + "loss": 1.4329, + "step": 35020 + }, + { + "epoch": 3.31, + "grad_norm": 56.90625, + "learning_rate": 3.3749310124395985e-07, + "loss": 1.4915, + "step": 35040 + }, + { + "epoch": 3.31, + "grad_norm": 83.3125, + "learning_rate": 3.3711495771315427e-07, + "loss": 1.4082, + "step": 35060 + }, + { + "epoch": 3.32, + "grad_norm": 57.0625, + "learning_rate": 3.3673681418234874e-07, + "loss": 1.4659, + "step": 35080 + }, + { + "epoch": 3.32, + "grad_norm": 68.75, + "learning_rate": 3.3635867065154316e-07, + "loss": 1.4333, + "step": 35100 + }, + { + "epoch": 3.32, + "grad_norm": 84.375, + "learning_rate": 3.359805271207376e-07, + "loss": 1.4449, + "step": 35120 + }, + { + "epoch": 3.32, + "grad_norm": 62.96875, + "learning_rate": 3.3560238358993206e-07, + "loss": 1.4052, + "step": 35140 + }, + { + "epoch": 3.32, + "grad_norm": 58.46875, + "learning_rate": 3.352242400591265e-07, + "loss": 1.4209, + "step": 35160 + }, + { + "epoch": 3.33, + "grad_norm": 94.875, + "learning_rate": 3.3484609652832096e-07, + "loss": 1.4097, + "step": 35180 + }, + { + "epoch": 3.33, + "grad_norm": 62.84375, + "learning_rate": 3.344679529975154e-07, + "loss": 1.4691, + "step": 35200 + }, + { + "epoch": 3.33, + "grad_norm": 50.25, + "learning_rate": 3.3408980946670985e-07, + "loss": 1.3639, + "step": 35220 + }, + { + "epoch": 3.33, + "grad_norm": 71.0, + "learning_rate": 3.3371166593590427e-07, + "loss": 1.3215, + "step": 35240 + }, + { + "epoch": 3.33, + "grad_norm": 82.5, + "learning_rate": 3.333335224050987e-07, + "loss": 1.428, + "step": 35260 + }, + { + "epoch": 3.34, + "grad_norm": 80.4375, + "learning_rate": 3.3295537887429317e-07, + "loss": 1.4102, + "step": 35280 + }, + { + "epoch": 3.34, + "grad_norm": 62.8125, + "learning_rate": 3.325772353434876e-07, + "loss": 1.3712, + "step": 35300 + }, + { + "epoch": 3.34, + "grad_norm": 65.9375, + "learning_rate": 3.32199091812682e-07, + "loss": 1.4205, + "step": 35320 + }, + { + "epoch": 3.34, + "grad_norm": 72.0, + "learning_rate": 3.3182094828187654e-07, + "loss": 1.3905, + "step": 35340 + }, + { + "epoch": 3.34, + "grad_norm": 75.0, + "learning_rate": 3.3144280475107096e-07, + "loss": 1.4007, + "step": 35360 + }, + { + "epoch": 3.34, + "grad_norm": 69.125, + "learning_rate": 3.310646612202654e-07, + "loss": 1.4627, + "step": 35380 + }, + { + "epoch": 3.35, + "grad_norm": 92.25, + "learning_rate": 3.306865176894598e-07, + "loss": 1.3625, + "step": 35400 + }, + { + "epoch": 3.35, + "grad_norm": 56.5, + "learning_rate": 3.303083741586543e-07, + "loss": 1.4248, + "step": 35420 + }, + { + "epoch": 3.35, + "grad_norm": 56.34375, + "learning_rate": 3.299302306278487e-07, + "loss": 1.384, + "step": 35440 + }, + { + "epoch": 3.35, + "grad_norm": 60.875, + "learning_rate": 3.295520870970431e-07, + "loss": 1.4455, + "step": 35460 + }, + { + "epoch": 3.35, + "grad_norm": 58.90625, + "learning_rate": 3.2917394356623754e-07, + "loss": 1.4998, + "step": 35480 + }, + { + "epoch": 3.36, + "grad_norm": 64.375, + "learning_rate": 3.28795800035432e-07, + "loss": 1.4044, + "step": 35500 + }, + { + "epoch": 3.36, + "grad_norm": 65.0, + "learning_rate": 3.284176565046265e-07, + "loss": 1.3424, + "step": 35520 + }, + { + "epoch": 3.36, + "grad_norm": 78.1875, + "learning_rate": 3.280395129738209e-07, + "loss": 1.3932, + "step": 35540 + }, + { + "epoch": 3.36, + "grad_norm": 115.625, + "learning_rate": 3.276613694430154e-07, + "loss": 1.4185, + "step": 35560 + }, + { + "epoch": 3.36, + "grad_norm": 51.8125, + "learning_rate": 3.272832259122098e-07, + "loss": 1.357, + "step": 35580 + }, + { + "epoch": 3.37, + "grad_norm": 81.0, + "learning_rate": 3.2690508238140423e-07, + "loss": 1.3221, + "step": 35600 + }, + { + "epoch": 3.37, + "grad_norm": 58.84375, + "learning_rate": 3.265269388505987e-07, + "loss": 1.3425, + "step": 35620 + }, + { + "epoch": 3.37, + "grad_norm": 79.8125, + "learning_rate": 3.261487953197931e-07, + "loss": 1.5156, + "step": 35640 + }, + { + "epoch": 3.37, + "grad_norm": 99.625, + "learning_rate": 3.2577065178898754e-07, + "loss": 1.4331, + "step": 35660 + }, + { + "epoch": 3.37, + "grad_norm": 64.4375, + "learning_rate": 3.2539250825818196e-07, + "loss": 1.4773, + "step": 35680 + }, + { + "epoch": 3.37, + "grad_norm": 57.9375, + "learning_rate": 3.250143647273765e-07, + "loss": 1.386, + "step": 35700 + }, + { + "epoch": 3.38, + "grad_norm": 67.6875, + "learning_rate": 3.246362211965709e-07, + "loss": 1.4126, + "step": 35720 + }, + { + "epoch": 3.38, + "grad_norm": 79.6875, + "learning_rate": 3.2425807766576533e-07, + "loss": 1.4665, + "step": 35740 + }, + { + "epoch": 3.38, + "grad_norm": 53.125, + "learning_rate": 3.238799341349598e-07, + "loss": 1.3703, + "step": 35760 + }, + { + "epoch": 3.38, + "grad_norm": 53.0, + "learning_rate": 3.2350179060415423e-07, + "loss": 1.4106, + "step": 35780 + }, + { + "epoch": 3.38, + "grad_norm": 54.09375, + "learning_rate": 3.2312364707334865e-07, + "loss": 1.3638, + "step": 35800 + }, + { + "epoch": 3.39, + "grad_norm": 86.25, + "learning_rate": 3.2274550354254307e-07, + "loss": 1.4043, + "step": 35820 + }, + { + "epoch": 3.39, + "grad_norm": 70.25, + "learning_rate": 3.2236736001173755e-07, + "loss": 1.4668, + "step": 35840 + }, + { + "epoch": 3.39, + "grad_norm": 46.53125, + "learning_rate": 3.2198921648093197e-07, + "loss": 1.4028, + "step": 35860 + }, + { + "epoch": 3.39, + "grad_norm": 62.8125, + "learning_rate": 3.2161107295012644e-07, + "loss": 1.4119, + "step": 35880 + }, + { + "epoch": 3.39, + "grad_norm": 62.8125, + "learning_rate": 3.212329294193209e-07, + "loss": 1.3546, + "step": 35900 + }, + { + "epoch": 3.4, + "grad_norm": 59.78125, + "learning_rate": 3.2085478588851534e-07, + "loss": 1.4402, + "step": 35920 + }, + { + "epoch": 3.4, + "grad_norm": 63.1875, + "learning_rate": 3.2047664235770976e-07, + "loss": 1.3611, + "step": 35940 + }, + { + "epoch": 3.4, + "grad_norm": 55.875, + "learning_rate": 3.200984988269042e-07, + "loss": 1.4106, + "step": 35960 + }, + { + "epoch": 3.4, + "grad_norm": 61.53125, + "learning_rate": 3.1972035529609865e-07, + "loss": 1.4203, + "step": 35980 + }, + { + "epoch": 3.4, + "grad_norm": 56.03125, + "learning_rate": 3.193422117652931e-07, + "loss": 1.4757, + "step": 36000 + }, + { + "epoch": 3.41, + "grad_norm": 62.4375, + "learning_rate": 3.189640682344875e-07, + "loss": 1.4443, + "step": 36020 + }, + { + "epoch": 3.41, + "grad_norm": 74.125, + "learning_rate": 3.18585924703682e-07, + "loss": 1.3875, + "step": 36040 + }, + { + "epoch": 3.41, + "grad_norm": 72.4375, + "learning_rate": 3.1820778117287645e-07, + "loss": 1.3827, + "step": 36060 + }, + { + "epoch": 3.41, + "grad_norm": 63.84375, + "learning_rate": 3.1782963764207087e-07, + "loss": 1.4127, + "step": 36080 + }, + { + "epoch": 3.41, + "grad_norm": 58.0, + "learning_rate": 3.1745149411126534e-07, + "loss": 1.4644, + "step": 36100 + }, + { + "epoch": 3.41, + "grad_norm": 67.3125, + "learning_rate": 3.1707335058045976e-07, + "loss": 1.3745, + "step": 36120 + }, + { + "epoch": 3.42, + "grad_norm": 73.25, + "learning_rate": 3.166952070496542e-07, + "loss": 1.4179, + "step": 36140 + }, + { + "epoch": 3.42, + "grad_norm": 58.1875, + "learning_rate": 3.163170635188486e-07, + "loss": 1.4096, + "step": 36160 + }, + { + "epoch": 3.42, + "grad_norm": 70.125, + "learning_rate": 3.159389199880431e-07, + "loss": 1.4569, + "step": 36180 + }, + { + "epoch": 3.42, + "grad_norm": 55.6875, + "learning_rate": 3.155607764572375e-07, + "loss": 1.338, + "step": 36200 + }, + { + "epoch": 3.42, + "grad_norm": 70.5, + "learning_rate": 3.15182632926432e-07, + "loss": 1.4023, + "step": 36220 + }, + { + "epoch": 3.43, + "grad_norm": 351.5, + "learning_rate": 3.1480448939562645e-07, + "loss": 1.4474, + "step": 36240 + }, + { + "epoch": 3.43, + "grad_norm": 69.4375, + "learning_rate": 3.1442634586482087e-07, + "loss": 1.3545, + "step": 36260 + }, + { + "epoch": 3.43, + "grad_norm": 69.875, + "learning_rate": 3.140482023340153e-07, + "loss": 1.3057, + "step": 36280 + }, + { + "epoch": 3.43, + "grad_norm": 48.0625, + "learning_rate": 3.136700588032097e-07, + "loss": 1.4568, + "step": 36300 + }, + { + "epoch": 3.43, + "grad_norm": 81.625, + "learning_rate": 3.132919152724042e-07, + "loss": 1.3281, + "step": 36320 + }, + { + "epoch": 3.44, + "grad_norm": 52.8125, + "learning_rate": 3.129137717415986e-07, + "loss": 1.4704, + "step": 36340 + }, + { + "epoch": 3.44, + "grad_norm": 65.1875, + "learning_rate": 3.1253562821079303e-07, + "loss": 1.4349, + "step": 36360 + }, + { + "epoch": 3.44, + "grad_norm": 76.125, + "learning_rate": 3.121574846799875e-07, + "loss": 1.4236, + "step": 36380 + }, + { + "epoch": 3.44, + "grad_norm": 50.34375, + "learning_rate": 3.11779341149182e-07, + "loss": 1.2999, + "step": 36400 + }, + { + "epoch": 3.44, + "grad_norm": 61.28125, + "learning_rate": 3.114011976183764e-07, + "loss": 1.3776, + "step": 36420 + }, + { + "epoch": 3.44, + "grad_norm": 71.1875, + "learning_rate": 3.110230540875708e-07, + "loss": 1.3691, + "step": 36440 + }, + { + "epoch": 3.45, + "grad_norm": 51.34375, + "learning_rate": 3.106449105567653e-07, + "loss": 1.335, + "step": 36460 + }, + { + "epoch": 3.45, + "grad_norm": 60.625, + "learning_rate": 3.102667670259597e-07, + "loss": 1.2949, + "step": 36480 + }, + { + "epoch": 3.45, + "grad_norm": 55.96875, + "learning_rate": 3.0988862349515414e-07, + "loss": 1.402, + "step": 36500 + }, + { + "epoch": 3.45, + "grad_norm": 76.25, + "learning_rate": 3.095104799643486e-07, + "loss": 1.4453, + "step": 36520 + }, + { + "epoch": 3.45, + "grad_norm": 64.8125, + "learning_rate": 3.0913233643354303e-07, + "loss": 1.3819, + "step": 36540 + }, + { + "epoch": 3.46, + "grad_norm": 54.96875, + "learning_rate": 3.087541929027375e-07, + "loss": 1.3811, + "step": 36560 + }, + { + "epoch": 3.46, + "grad_norm": 57.84375, + "learning_rate": 3.08376049371932e-07, + "loss": 1.4251, + "step": 36580 + }, + { + "epoch": 3.46, + "grad_norm": 57.8125, + "learning_rate": 3.079979058411264e-07, + "loss": 1.4553, + "step": 36600 + }, + { + "epoch": 3.46, + "grad_norm": 53.875, + "learning_rate": 3.076197623103208e-07, + "loss": 1.3474, + "step": 36620 + }, + { + "epoch": 3.46, + "grad_norm": 63.6875, + "learning_rate": 3.0724161877951525e-07, + "loss": 1.4288, + "step": 36640 + }, + { + "epoch": 3.47, + "grad_norm": 55.78125, + "learning_rate": 3.068634752487097e-07, + "loss": 1.431, + "step": 36660 + }, + { + "epoch": 3.47, + "grad_norm": 63.46875, + "learning_rate": 3.0648533171790414e-07, + "loss": 1.3994, + "step": 36680 + }, + { + "epoch": 3.47, + "grad_norm": 90.375, + "learning_rate": 3.0610718818709856e-07, + "loss": 1.4356, + "step": 36700 + }, + { + "epoch": 3.47, + "grad_norm": 71.0625, + "learning_rate": 3.0572904465629304e-07, + "loss": 1.3515, + "step": 36720 + }, + { + "epoch": 3.47, + "grad_norm": 67.1875, + "learning_rate": 3.053509011254875e-07, + "loss": 1.3501, + "step": 36740 + }, + { + "epoch": 3.48, + "grad_norm": 66.25, + "learning_rate": 3.0497275759468193e-07, + "loss": 1.4016, + "step": 36760 + }, + { + "epoch": 3.48, + "grad_norm": 69.4375, + "learning_rate": 3.0459461406387635e-07, + "loss": 1.3546, + "step": 36780 + }, + { + "epoch": 3.48, + "grad_norm": 51.0625, + "learning_rate": 3.0421647053307083e-07, + "loss": 1.4189, + "step": 36800 + }, + { + "epoch": 3.48, + "grad_norm": 72.0625, + "learning_rate": 3.0383832700226525e-07, + "loss": 1.4171, + "step": 36820 + }, + { + "epoch": 3.48, + "grad_norm": 54.59375, + "learning_rate": 3.0346018347145967e-07, + "loss": 1.4448, + "step": 36840 + }, + { + "epoch": 3.48, + "grad_norm": 49.0625, + "learning_rate": 3.0308203994065415e-07, + "loss": 1.4652, + "step": 36860 + }, + { + "epoch": 3.49, + "grad_norm": 72.4375, + "learning_rate": 3.0270389640984857e-07, + "loss": 1.41, + "step": 36880 + }, + { + "epoch": 3.49, + "grad_norm": 67.1875, + "learning_rate": 3.02325752879043e-07, + "loss": 1.3746, + "step": 36900 + }, + { + "epoch": 3.49, + "grad_norm": 86.5, + "learning_rate": 3.0194760934823746e-07, + "loss": 1.3318, + "step": 36920 + }, + { + "epoch": 3.49, + "grad_norm": 61.90625, + "learning_rate": 3.0156946581743194e-07, + "loss": 1.3764, + "step": 36940 + }, + { + "epoch": 3.49, + "grad_norm": 63.375, + "learning_rate": 3.0119132228662636e-07, + "loss": 1.4998, + "step": 36960 + }, + { + "epoch": 3.5, + "grad_norm": 52.8125, + "learning_rate": 3.008131787558208e-07, + "loss": 1.4612, + "step": 36980 + }, + { + "epoch": 3.5, + "grad_norm": 62.125, + "learning_rate": 3.0043503522501525e-07, + "loss": 1.3757, + "step": 37000 + }, + { + "epoch": 3.5, + "grad_norm": 56.5625, + "learning_rate": 3.000568916942097e-07, + "loss": 1.3478, + "step": 37020 + }, + { + "epoch": 3.5, + "grad_norm": 87.3125, + "learning_rate": 2.996787481634041e-07, + "loss": 1.3444, + "step": 37040 + }, + { + "epoch": 3.5, + "grad_norm": 59.375, + "learning_rate": 2.993006046325985e-07, + "loss": 1.4231, + "step": 37060 + }, + { + "epoch": 3.51, + "grad_norm": 64.625, + "learning_rate": 2.98922461101793e-07, + "loss": 1.4394, + "step": 37080 + }, + { + "epoch": 3.51, + "grad_norm": 58.21875, + "learning_rate": 2.9854431757098747e-07, + "loss": 1.3933, + "step": 37100 + }, + { + "epoch": 3.51, + "grad_norm": 87.5625, + "learning_rate": 2.981661740401819e-07, + "loss": 1.4413, + "step": 37120 + }, + { + "epoch": 3.51, + "grad_norm": 63.03125, + "learning_rate": 2.9778803050937636e-07, + "loss": 1.513, + "step": 37140 + }, + { + "epoch": 3.51, + "grad_norm": 87.375, + "learning_rate": 2.974098869785708e-07, + "loss": 1.3841, + "step": 37160 + }, + { + "epoch": 3.51, + "grad_norm": 69.6875, + "learning_rate": 2.970317434477652e-07, + "loss": 1.4671, + "step": 37180 + }, + { + "epoch": 3.52, + "grad_norm": 63.5625, + "learning_rate": 2.966535999169596e-07, + "loss": 1.3005, + "step": 37200 + }, + { + "epoch": 3.52, + "grad_norm": 58.96875, + "learning_rate": 2.962754563861541e-07, + "loss": 1.3706, + "step": 37220 + }, + { + "epoch": 3.52, + "grad_norm": 59.0625, + "learning_rate": 2.958973128553485e-07, + "loss": 1.3605, + "step": 37240 + }, + { + "epoch": 3.52, + "grad_norm": 53.15625, + "learning_rate": 2.95519169324543e-07, + "loss": 1.4114, + "step": 37260 + }, + { + "epoch": 3.52, + "grad_norm": 76.8125, + "learning_rate": 2.9514102579373747e-07, + "loss": 1.3111, + "step": 37280 + }, + { + "epoch": 3.53, + "grad_norm": 66.9375, + "learning_rate": 2.947628822629319e-07, + "loss": 1.3771, + "step": 37300 + }, + { + "epoch": 3.53, + "grad_norm": 72.5625, + "learning_rate": 2.943847387321263e-07, + "loss": 1.3847, + "step": 37320 + }, + { + "epoch": 3.53, + "grad_norm": 55.4375, + "learning_rate": 2.940065952013208e-07, + "loss": 1.3796, + "step": 37340 + }, + { + "epoch": 3.53, + "grad_norm": 60.9375, + "learning_rate": 2.936284516705152e-07, + "loss": 1.4556, + "step": 37360 + }, + { + "epoch": 3.53, + "grad_norm": 77.6875, + "learning_rate": 2.9325030813970963e-07, + "loss": 1.334, + "step": 37380 + }, + { + "epoch": 3.54, + "grad_norm": 77.5625, + "learning_rate": 2.9287216460890405e-07, + "loss": 1.4496, + "step": 37400 + }, + { + "epoch": 3.54, + "grad_norm": 62.6875, + "learning_rate": 2.924940210780985e-07, + "loss": 1.3992, + "step": 37420 + }, + { + "epoch": 3.54, + "grad_norm": 75.125, + "learning_rate": 2.92115877547293e-07, + "loss": 1.3938, + "step": 37440 + }, + { + "epoch": 3.54, + "grad_norm": 117.625, + "learning_rate": 2.917377340164874e-07, + "loss": 1.4084, + "step": 37460 + }, + { + "epoch": 3.54, + "grad_norm": 81.9375, + "learning_rate": 2.913595904856819e-07, + "loss": 1.4994, + "step": 37480 + }, + { + "epoch": 3.55, + "grad_norm": 87.6875, + "learning_rate": 2.909814469548763e-07, + "loss": 1.383, + "step": 37500 + }, + { + "epoch": 3.55, + "grad_norm": 59.1875, + "learning_rate": 2.9060330342407074e-07, + "loss": 1.3683, + "step": 37520 + }, + { + "epoch": 3.55, + "grad_norm": 71.5, + "learning_rate": 2.9022515989326516e-07, + "loss": 1.4564, + "step": 37540 + }, + { + "epoch": 3.55, + "grad_norm": 59.53125, + "learning_rate": 2.8984701636245963e-07, + "loss": 1.3854, + "step": 37560 + }, + { + "epoch": 3.55, + "grad_norm": 66.8125, + "learning_rate": 2.8946887283165405e-07, + "loss": 1.4842, + "step": 37580 + }, + { + "epoch": 3.55, + "grad_norm": 60.84375, + "learning_rate": 2.890907293008485e-07, + "loss": 1.4141, + "step": 37600 + }, + { + "epoch": 3.56, + "grad_norm": 53.40625, + "learning_rate": 2.88712585770043e-07, + "loss": 1.4447, + "step": 37620 + }, + { + "epoch": 3.56, + "grad_norm": 64.0625, + "learning_rate": 2.883344422392374e-07, + "loss": 1.3315, + "step": 37640 + }, + { + "epoch": 3.56, + "grad_norm": 77.8125, + "learning_rate": 2.8795629870843184e-07, + "loss": 1.4563, + "step": 37660 + }, + { + "epoch": 3.56, + "grad_norm": 74.1875, + "learning_rate": 2.8757815517762627e-07, + "loss": 1.3781, + "step": 37680 + }, + { + "epoch": 3.56, + "grad_norm": 68.625, + "learning_rate": 2.8720001164682074e-07, + "loss": 1.3988, + "step": 37700 + }, + { + "epoch": 3.57, + "grad_norm": 76.0625, + "learning_rate": 2.8682186811601516e-07, + "loss": 1.4011, + "step": 37720 + }, + { + "epoch": 3.57, + "grad_norm": 76.0625, + "learning_rate": 2.864437245852096e-07, + "loss": 1.371, + "step": 37740 + }, + { + "epoch": 3.57, + "grad_norm": 67.6875, + "learning_rate": 2.8606558105440406e-07, + "loss": 1.3667, + "step": 37760 + }, + { + "epoch": 3.57, + "grad_norm": 51.25, + "learning_rate": 2.856874375235985e-07, + "loss": 1.4467, + "step": 37780 + }, + { + "epoch": 3.57, + "grad_norm": 50.96875, + "learning_rate": 2.8530929399279295e-07, + "loss": 1.3751, + "step": 37800 + }, + { + "epoch": 3.58, + "grad_norm": 61.90625, + "learning_rate": 2.8493115046198743e-07, + "loss": 1.3476, + "step": 37820 + }, + { + "epoch": 3.58, + "grad_norm": 76.5625, + "learning_rate": 2.8455300693118185e-07, + "loss": 1.2849, + "step": 37840 + }, + { + "epoch": 3.58, + "grad_norm": 49.28125, + "learning_rate": 2.8417486340037627e-07, + "loss": 1.4695, + "step": 37860 + }, + { + "epoch": 3.58, + "grad_norm": 66.0625, + "learning_rate": 2.837967198695707e-07, + "loss": 1.373, + "step": 37880 + }, + { + "epoch": 3.58, + "grad_norm": 61.21875, + "learning_rate": 2.8341857633876516e-07, + "loss": 1.365, + "step": 37900 + }, + { + "epoch": 3.58, + "grad_norm": 77.9375, + "learning_rate": 2.830404328079596e-07, + "loss": 1.3943, + "step": 37920 + }, + { + "epoch": 3.59, + "grad_norm": 71.875, + "learning_rate": 2.82662289277154e-07, + "loss": 1.4403, + "step": 37940 + }, + { + "epoch": 3.59, + "grad_norm": 64.875, + "learning_rate": 2.8228414574634853e-07, + "loss": 1.4911, + "step": 37960 + }, + { + "epoch": 3.59, + "grad_norm": 62.4375, + "learning_rate": 2.8190600221554296e-07, + "loss": 1.4561, + "step": 37980 + }, + { + "epoch": 3.59, + "grad_norm": 63.9375, + "learning_rate": 2.815278586847374e-07, + "loss": 1.2846, + "step": 38000 + }, + { + "epoch": 3.59, + "grad_norm": 61.875, + "learning_rate": 2.811497151539318e-07, + "loss": 1.4566, + "step": 38020 + }, + { + "epoch": 3.6, + "grad_norm": 67.625, + "learning_rate": 2.8077157162312627e-07, + "loss": 1.4428, + "step": 38040 + }, + { + "epoch": 3.6, + "grad_norm": 68.6875, + "learning_rate": 2.803934280923207e-07, + "loss": 1.4209, + "step": 38060 + }, + { + "epoch": 3.6, + "grad_norm": 60.71875, + "learning_rate": 2.800152845615151e-07, + "loss": 1.2564, + "step": 38080 + }, + { + "epoch": 3.6, + "grad_norm": 64.5, + "learning_rate": 2.796371410307096e-07, + "loss": 1.5052, + "step": 38100 + }, + { + "epoch": 3.6, + "grad_norm": 65.75, + "learning_rate": 2.79258997499904e-07, + "loss": 1.3543, + "step": 38120 + }, + { + "epoch": 3.61, + "grad_norm": 64.25, + "learning_rate": 2.788808539690985e-07, + "loss": 1.3216, + "step": 38140 + }, + { + "epoch": 3.61, + "grad_norm": 64.4375, + "learning_rate": 2.785027104382929e-07, + "loss": 1.3808, + "step": 38160 + }, + { + "epoch": 3.61, + "grad_norm": 61.875, + "learning_rate": 2.781245669074874e-07, + "loss": 1.4452, + "step": 38180 + }, + { + "epoch": 3.61, + "grad_norm": 73.0, + "learning_rate": 2.777464233766818e-07, + "loss": 1.3897, + "step": 38200 + }, + { + "epoch": 3.61, + "grad_norm": 51.90625, + "learning_rate": 2.773682798458762e-07, + "loss": 1.4239, + "step": 38220 + }, + { + "epoch": 3.62, + "grad_norm": 63.875, + "learning_rate": 2.769901363150707e-07, + "loss": 1.2993, + "step": 38240 + }, + { + "epoch": 3.62, + "grad_norm": 60.1875, + "learning_rate": 2.766119927842651e-07, + "loss": 1.4217, + "step": 38260 + }, + { + "epoch": 3.62, + "grad_norm": 77.9375, + "learning_rate": 2.7623384925345954e-07, + "loss": 1.2893, + "step": 38280 + }, + { + "epoch": 3.62, + "grad_norm": 59.9375, + "learning_rate": 2.7585570572265396e-07, + "loss": 1.4078, + "step": 38300 + }, + { + "epoch": 3.62, + "grad_norm": 74.75, + "learning_rate": 2.754775621918485e-07, + "loss": 1.4683, + "step": 38320 + }, + { + "epoch": 3.62, + "grad_norm": 53.9375, + "learning_rate": 2.750994186610429e-07, + "loss": 1.4265, + "step": 38340 + }, + { + "epoch": 3.63, + "grad_norm": 51.625, + "learning_rate": 2.7472127513023733e-07, + "loss": 1.3032, + "step": 38360 + }, + { + "epoch": 3.63, + "grad_norm": 60.71875, + "learning_rate": 2.743431315994318e-07, + "loss": 1.4044, + "step": 38380 + }, + { + "epoch": 3.63, + "grad_norm": 57.46875, + "learning_rate": 2.7396498806862623e-07, + "loss": 1.2896, + "step": 38400 + }, + { + "epoch": 3.63, + "grad_norm": 60.75, + "learning_rate": 2.7358684453782065e-07, + "loss": 1.4532, + "step": 38420 + }, + { + "epoch": 3.63, + "grad_norm": 115.375, + "learning_rate": 2.732087010070151e-07, + "loss": 1.4785, + "step": 38440 + }, + { + "epoch": 3.64, + "grad_norm": 71.5, + "learning_rate": 2.7283055747620954e-07, + "loss": 1.3397, + "step": 38460 + }, + { + "epoch": 3.64, + "grad_norm": 48.15625, + "learning_rate": 2.7245241394540396e-07, + "loss": 1.3967, + "step": 38480 + }, + { + "epoch": 3.64, + "grad_norm": 59.96875, + "learning_rate": 2.7207427041459844e-07, + "loss": 1.4357, + "step": 38500 + }, + { + "epoch": 3.64, + "grad_norm": 118.1875, + "learning_rate": 2.716961268837929e-07, + "loss": 1.3167, + "step": 38520 + }, + { + "epoch": 3.64, + "grad_norm": 77.375, + "learning_rate": 2.7131798335298733e-07, + "loss": 1.4363, + "step": 38540 + }, + { + "epoch": 3.65, + "grad_norm": 57.375, + "learning_rate": 2.7093983982218176e-07, + "loss": 1.3934, + "step": 38560 + }, + { + "epoch": 3.65, + "grad_norm": 66.9375, + "learning_rate": 2.7056169629137623e-07, + "loss": 1.3952, + "step": 38580 + }, + { + "epoch": 3.65, + "grad_norm": 53.09375, + "learning_rate": 2.7018355276057065e-07, + "loss": 1.3954, + "step": 38600 + }, + { + "epoch": 3.65, + "grad_norm": 80.3125, + "learning_rate": 2.6980540922976507e-07, + "loss": 1.4784, + "step": 38620 + }, + { + "epoch": 3.65, + "grad_norm": 65.5, + "learning_rate": 2.694272656989595e-07, + "loss": 1.3971, + "step": 38640 + }, + { + "epoch": 3.65, + "grad_norm": 53.59375, + "learning_rate": 2.69049122168154e-07, + "loss": 1.412, + "step": 38660 + }, + { + "epoch": 3.66, + "grad_norm": 81.3125, + "learning_rate": 2.6867097863734844e-07, + "loss": 1.4045, + "step": 38680 + }, + { + "epoch": 3.66, + "grad_norm": 68.875, + "learning_rate": 2.6829283510654286e-07, + "loss": 1.3473, + "step": 38700 + }, + { + "epoch": 3.66, + "grad_norm": 70.5625, + "learning_rate": 2.6791469157573734e-07, + "loss": 1.4147, + "step": 38720 + }, + { + "epoch": 3.66, + "grad_norm": 76.8125, + "learning_rate": 2.6753654804493176e-07, + "loss": 1.438, + "step": 38740 + }, + { + "epoch": 3.66, + "grad_norm": 65.375, + "learning_rate": 2.671584045141262e-07, + "loss": 1.4028, + "step": 38760 + }, + { + "epoch": 3.67, + "grad_norm": 51.65625, + "learning_rate": 2.667802609833206e-07, + "loss": 1.343, + "step": 38780 + }, + { + "epoch": 3.67, + "grad_norm": 68.0625, + "learning_rate": 2.664021174525151e-07, + "loss": 1.3459, + "step": 38800 + }, + { + "epoch": 3.67, + "grad_norm": 60.15625, + "learning_rate": 2.660239739217095e-07, + "loss": 1.3992, + "step": 38820 + }, + { + "epoch": 3.67, + "grad_norm": 84.75, + "learning_rate": 2.6564583039090397e-07, + "loss": 1.3889, + "step": 38840 + }, + { + "epoch": 3.67, + "grad_norm": 80.25, + "learning_rate": 2.6526768686009845e-07, + "loss": 1.3914, + "step": 38860 + }, + { + "epoch": 3.68, + "grad_norm": 63.0, + "learning_rate": 2.6488954332929287e-07, + "loss": 1.2944, + "step": 38880 + }, + { + "epoch": 3.68, + "grad_norm": 90.6875, + "learning_rate": 2.645113997984873e-07, + "loss": 1.4389, + "step": 38900 + }, + { + "epoch": 3.68, + "grad_norm": 82.75, + "learning_rate": 2.6413325626768176e-07, + "loss": 1.3992, + "step": 38920 + }, + { + "epoch": 3.68, + "grad_norm": 76.0, + "learning_rate": 2.637551127368762e-07, + "loss": 1.4458, + "step": 38940 + }, + { + "epoch": 3.68, + "grad_norm": 72.875, + "learning_rate": 2.633769692060706e-07, + "loss": 1.4511, + "step": 38960 + }, + { + "epoch": 3.69, + "grad_norm": 78.4375, + "learning_rate": 2.6299882567526503e-07, + "loss": 1.3879, + "step": 38980 + }, + { + "epoch": 3.69, + "grad_norm": 65.0625, + "learning_rate": 2.626206821444595e-07, + "loss": 1.3725, + "step": 39000 + }, + { + "epoch": 3.69, + "grad_norm": 65.5625, + "learning_rate": 2.62242538613654e-07, + "loss": 1.3031, + "step": 39020 + }, + { + "epoch": 3.69, + "grad_norm": 68.9375, + "learning_rate": 2.618643950828484e-07, + "loss": 1.4757, + "step": 39040 + }, + { + "epoch": 3.69, + "grad_norm": 64.375, + "learning_rate": 2.6148625155204287e-07, + "loss": 1.3408, + "step": 39060 + }, + { + "epoch": 3.69, + "grad_norm": 57.4375, + "learning_rate": 2.611081080212373e-07, + "loss": 1.359, + "step": 39080 + }, + { + "epoch": 3.7, + "grad_norm": 59.78125, + "learning_rate": 2.607299644904317e-07, + "loss": 1.4038, + "step": 39100 + }, + { + "epoch": 3.7, + "grad_norm": 65.375, + "learning_rate": 2.6035182095962613e-07, + "loss": 1.4142, + "step": 39120 + }, + { + "epoch": 3.7, + "grad_norm": 72.25, + "learning_rate": 2.599736774288206e-07, + "loss": 1.4517, + "step": 39140 + }, + { + "epoch": 3.7, + "grad_norm": 57.125, + "learning_rate": 2.5959553389801503e-07, + "loss": 1.4371, + "step": 39160 + }, + { + "epoch": 3.7, + "grad_norm": 82.1875, + "learning_rate": 2.592173903672095e-07, + "loss": 1.3103, + "step": 39180 + }, + { + "epoch": 3.71, + "grad_norm": 73.125, + "learning_rate": 2.58839246836404e-07, + "loss": 1.3995, + "step": 39200 + }, + { + "epoch": 3.71, + "grad_norm": 72.375, + "learning_rate": 2.584611033055984e-07, + "loss": 1.4843, + "step": 39220 + }, + { + "epoch": 3.71, + "grad_norm": 56.1875, + "learning_rate": 2.580829597747928e-07, + "loss": 1.4293, + "step": 39240 + }, + { + "epoch": 3.71, + "grad_norm": 73.4375, + "learning_rate": 2.5770481624398724e-07, + "loss": 1.4464, + "step": 39260 + }, + { + "epoch": 3.71, + "grad_norm": 64.875, + "learning_rate": 2.573266727131817e-07, + "loss": 1.3824, + "step": 39280 + }, + { + "epoch": 3.72, + "grad_norm": 47.34375, + "learning_rate": 2.5694852918237614e-07, + "loss": 1.4179, + "step": 39300 + }, + { + "epoch": 3.72, + "grad_norm": 53.34375, + "learning_rate": 2.5657038565157056e-07, + "loss": 1.4673, + "step": 39320 + }, + { + "epoch": 3.72, + "grad_norm": 57.21875, + "learning_rate": 2.5619224212076503e-07, + "loss": 1.3636, + "step": 39340 + }, + { + "epoch": 3.72, + "grad_norm": 75.0, + "learning_rate": 2.558140985899595e-07, + "loss": 1.4275, + "step": 39360 + }, + { + "epoch": 3.72, + "grad_norm": 62.1875, + "learning_rate": 2.5543595505915393e-07, + "loss": 1.4423, + "step": 39380 + }, + { + "epoch": 3.72, + "grad_norm": 57.0625, + "learning_rate": 2.5505781152834835e-07, + "loss": 1.3202, + "step": 39400 + }, + { + "epoch": 3.73, + "grad_norm": 74.9375, + "learning_rate": 2.546796679975428e-07, + "loss": 1.4716, + "step": 39420 + }, + { + "epoch": 3.73, + "grad_norm": 62.03125, + "learning_rate": 2.5430152446673725e-07, + "loss": 1.3571, + "step": 39440 + }, + { + "epoch": 3.73, + "grad_norm": 65.25, + "learning_rate": 2.5392338093593167e-07, + "loss": 1.44, + "step": 39460 + }, + { + "epoch": 3.73, + "grad_norm": 48.59375, + "learning_rate": 2.5354523740512614e-07, + "loss": 1.4017, + "step": 39480 + }, + { + "epoch": 3.73, + "grad_norm": 58.78125, + "learning_rate": 2.5316709387432056e-07, + "loss": 1.4145, + "step": 39500 + }, + { + "epoch": 3.74, + "grad_norm": 108.5, + "learning_rate": 2.52788950343515e-07, + "loss": 1.3431, + "step": 39520 + }, + { + "epoch": 3.74, + "grad_norm": 66.0, + "learning_rate": 2.524108068127095e-07, + "loss": 1.4277, + "step": 39540 + }, + { + "epoch": 3.74, + "grad_norm": 72.625, + "learning_rate": 2.5203266328190393e-07, + "loss": 1.4133, + "step": 39560 + }, + { + "epoch": 3.74, + "grad_norm": 68.75, + "learning_rate": 2.5165451975109835e-07, + "loss": 1.3857, + "step": 39580 + }, + { + "epoch": 3.74, + "grad_norm": 57.0, + "learning_rate": 2.512763762202928e-07, + "loss": 1.4572, + "step": 39600 + }, + { + "epoch": 3.75, + "grad_norm": 71.5625, + "learning_rate": 2.5089823268948725e-07, + "loss": 1.3965, + "step": 39620 + }, + { + "epoch": 3.75, + "grad_norm": 66.75, + "learning_rate": 2.5052008915868167e-07, + "loss": 1.3597, + "step": 39640 + }, + { + "epoch": 3.75, + "grad_norm": 69.0625, + "learning_rate": 2.501419456278761e-07, + "loss": 1.3411, + "step": 39660 + }, + { + "epoch": 3.75, + "grad_norm": 54.0625, + "learning_rate": 2.4976380209707057e-07, + "loss": 1.4024, + "step": 39680 + }, + { + "epoch": 3.75, + "grad_norm": 75.4375, + "learning_rate": 2.49385658566265e-07, + "loss": 1.4341, + "step": 39700 + }, + { + "epoch": 3.75, + "grad_norm": 60.875, + "learning_rate": 2.4900751503545946e-07, + "loss": 1.4068, + "step": 39720 + }, + { + "epoch": 3.76, + "grad_norm": 61.0, + "learning_rate": 2.486293715046539e-07, + "loss": 1.3496, + "step": 39740 + }, + { + "epoch": 3.76, + "grad_norm": 59.875, + "learning_rate": 2.4825122797384836e-07, + "loss": 1.39, + "step": 39760 + }, + { + "epoch": 3.76, + "grad_norm": 71.4375, + "learning_rate": 2.478730844430428e-07, + "loss": 1.3572, + "step": 39780 + }, + { + "epoch": 3.76, + "grad_norm": 77.4375, + "learning_rate": 2.474949409122372e-07, + "loss": 1.3969, + "step": 39800 + }, + { + "epoch": 3.76, + "grad_norm": 52.34375, + "learning_rate": 2.471167973814317e-07, + "loss": 1.4315, + "step": 39820 + }, + { + "epoch": 3.77, + "grad_norm": 102.9375, + "learning_rate": 2.467386538506261e-07, + "loss": 1.3345, + "step": 39840 + }, + { + "epoch": 3.77, + "grad_norm": 68.6875, + "learning_rate": 2.4636051031982057e-07, + "loss": 1.3399, + "step": 39860 + }, + { + "epoch": 3.77, + "grad_norm": 54.78125, + "learning_rate": 2.45982366789015e-07, + "loss": 1.4267, + "step": 39880 + }, + { + "epoch": 3.77, + "grad_norm": 70.875, + "learning_rate": 2.456042232582094e-07, + "loss": 1.4507, + "step": 39900 + }, + { + "epoch": 3.77, + "grad_norm": 56.40625, + "learning_rate": 2.452260797274039e-07, + "loss": 1.4029, + "step": 39920 + }, + { + "epoch": 3.78, + "grad_norm": 71.25, + "learning_rate": 2.448479361965983e-07, + "loss": 1.4685, + "step": 39940 + }, + { + "epoch": 3.78, + "grad_norm": 62.96875, + "learning_rate": 2.444697926657928e-07, + "loss": 1.4023, + "step": 39960 + }, + { + "epoch": 3.78, + "grad_norm": 69.875, + "learning_rate": 2.440916491349872e-07, + "loss": 1.4639, + "step": 39980 + }, + { + "epoch": 3.78, + "grad_norm": 65.6875, + "learning_rate": 2.437135056041816e-07, + "loss": 1.3749, + "step": 40000 + }, + { + "epoch": 3.78, + "grad_norm": 69.6875, + "learning_rate": 2.433353620733761e-07, + "loss": 1.41, + "step": 40020 + }, + { + "epoch": 3.79, + "grad_norm": 67.1875, + "learning_rate": 2.429572185425705e-07, + "loss": 1.4743, + "step": 40040 + }, + { + "epoch": 3.79, + "grad_norm": 71.375, + "learning_rate": 2.42579075011765e-07, + "loss": 1.3925, + "step": 40060 + }, + { + "epoch": 3.79, + "grad_norm": 63.84375, + "learning_rate": 2.422009314809594e-07, + "loss": 1.4514, + "step": 40080 + }, + { + "epoch": 3.79, + "grad_norm": 101.9375, + "learning_rate": 2.418227879501539e-07, + "loss": 1.4082, + "step": 40100 + }, + { + "epoch": 3.79, + "grad_norm": 72.375, + "learning_rate": 2.414446444193483e-07, + "loss": 1.3906, + "step": 40120 + }, + { + "epoch": 3.79, + "grad_norm": 54.625, + "learning_rate": 2.4106650088854273e-07, + "loss": 1.3851, + "step": 40140 + }, + { + "epoch": 3.8, + "grad_norm": 75.1875, + "learning_rate": 2.406883573577372e-07, + "loss": 1.3869, + "step": 40160 + }, + { + "epoch": 3.8, + "grad_norm": 65.5625, + "learning_rate": 2.4031021382693163e-07, + "loss": 1.3966, + "step": 40180 + }, + { + "epoch": 3.8, + "grad_norm": 61.78125, + "learning_rate": 2.399320702961261e-07, + "loss": 1.3518, + "step": 40200 + }, + { + "epoch": 3.8, + "grad_norm": 64.125, + "learning_rate": 2.395539267653205e-07, + "loss": 1.4587, + "step": 40220 + }, + { + "epoch": 3.8, + "grad_norm": 64.25, + "learning_rate": 2.3917578323451495e-07, + "loss": 1.4854, + "step": 40240 + }, + { + "epoch": 3.81, + "grad_norm": 68.75, + "learning_rate": 2.3879763970370937e-07, + "loss": 1.3629, + "step": 40260 + }, + { + "epoch": 3.81, + "grad_norm": 93.6875, + "learning_rate": 2.3841949617290384e-07, + "loss": 1.4779, + "step": 40280 + }, + { + "epoch": 3.81, + "grad_norm": 52.40625, + "learning_rate": 2.380413526420983e-07, + "loss": 1.3754, + "step": 40300 + }, + { + "epoch": 3.81, + "grad_norm": 61.46875, + "learning_rate": 2.3766320911129274e-07, + "loss": 1.395, + "step": 40320 + }, + { + "epoch": 3.81, + "grad_norm": 72.3125, + "learning_rate": 2.3728506558048716e-07, + "loss": 1.3755, + "step": 40340 + }, + { + "epoch": 3.82, + "grad_norm": 57.53125, + "learning_rate": 2.3690692204968163e-07, + "loss": 1.3909, + "step": 40360 + }, + { + "epoch": 3.82, + "grad_norm": 66.6875, + "learning_rate": 2.3652877851887605e-07, + "loss": 1.385, + "step": 40380 + }, + { + "epoch": 3.82, + "grad_norm": 59.40625, + "learning_rate": 2.361506349880705e-07, + "loss": 1.4461, + "step": 40400 + }, + { + "epoch": 3.82, + "grad_norm": 70.75, + "learning_rate": 2.3577249145726495e-07, + "loss": 1.379, + "step": 40420 + }, + { + "epoch": 3.82, + "grad_norm": 73.0, + "learning_rate": 2.3539434792645937e-07, + "loss": 1.3794, + "step": 40440 + }, + { + "epoch": 3.82, + "grad_norm": 58.6875, + "learning_rate": 2.3501620439565384e-07, + "loss": 1.4292, + "step": 40460 + }, + { + "epoch": 3.83, + "grad_norm": 67.25, + "learning_rate": 2.3463806086484827e-07, + "loss": 1.4327, + "step": 40480 + }, + { + "epoch": 3.83, + "grad_norm": 88.4375, + "learning_rate": 2.3425991733404271e-07, + "loss": 1.5276, + "step": 40500 + }, + { + "epoch": 3.83, + "grad_norm": 65.9375, + "learning_rate": 2.3388177380323714e-07, + "loss": 1.3107, + "step": 40520 + }, + { + "epoch": 3.83, + "grad_norm": 65.25, + "learning_rate": 2.335036302724316e-07, + "loss": 1.3638, + "step": 40540 + }, + { + "epoch": 3.83, + "grad_norm": 83.625, + "learning_rate": 2.3312548674162606e-07, + "loss": 1.3733, + "step": 40560 + }, + { + "epoch": 3.84, + "grad_norm": 66.6875, + "learning_rate": 2.3274734321082048e-07, + "loss": 1.4898, + "step": 40580 + }, + { + "epoch": 3.84, + "grad_norm": 64.5, + "learning_rate": 2.3236919968001493e-07, + "loss": 1.4436, + "step": 40600 + }, + { + "epoch": 3.84, + "grad_norm": 67.5625, + "learning_rate": 2.3199105614920935e-07, + "loss": 1.4491, + "step": 40620 + }, + { + "epoch": 3.84, + "grad_norm": 64.1875, + "learning_rate": 2.3161291261840382e-07, + "loss": 1.3816, + "step": 40640 + }, + { + "epoch": 3.84, + "grad_norm": 59.03125, + "learning_rate": 2.3123476908759827e-07, + "loss": 1.4964, + "step": 40660 + }, + { + "epoch": 3.85, + "grad_norm": 59.28125, + "learning_rate": 2.308566255567927e-07, + "loss": 1.364, + "step": 40680 + }, + { + "epoch": 3.85, + "grad_norm": 62.15625, + "learning_rate": 2.3047848202598714e-07, + "loss": 1.3505, + "step": 40700 + }, + { + "epoch": 3.85, + "grad_norm": 64.9375, + "learning_rate": 2.3010033849518159e-07, + "loss": 1.3727, + "step": 40720 + }, + { + "epoch": 3.85, + "grad_norm": 74.375, + "learning_rate": 2.2972219496437603e-07, + "loss": 1.4421, + "step": 40740 + }, + { + "epoch": 3.85, + "grad_norm": 88.5625, + "learning_rate": 2.2934405143357046e-07, + "loss": 1.4352, + "step": 40760 + }, + { + "epoch": 3.86, + "grad_norm": 59.125, + "learning_rate": 2.289659079027649e-07, + "loss": 1.3484, + "step": 40780 + }, + { + "epoch": 3.86, + "grad_norm": 61.25, + "learning_rate": 2.2858776437195938e-07, + "loss": 1.404, + "step": 40800 + }, + { + "epoch": 3.86, + "grad_norm": 79.5, + "learning_rate": 2.282096208411538e-07, + "loss": 1.4222, + "step": 40820 + }, + { + "epoch": 3.86, + "grad_norm": 55.6875, + "learning_rate": 2.2783147731034825e-07, + "loss": 1.3627, + "step": 40840 + }, + { + "epoch": 3.86, + "grad_norm": 71.25, + "learning_rate": 2.2745333377954267e-07, + "loss": 1.4019, + "step": 40860 + }, + { + "epoch": 3.86, + "grad_norm": 75.0, + "learning_rate": 2.2707519024873712e-07, + "loss": 1.3863, + "step": 40880 + }, + { + "epoch": 3.87, + "grad_norm": 58.78125, + "learning_rate": 2.266970467179316e-07, + "loss": 1.3336, + "step": 40900 + }, + { + "epoch": 3.87, + "grad_norm": 72.5625, + "learning_rate": 2.26318903187126e-07, + "loss": 1.437, + "step": 40920 + }, + { + "epoch": 3.87, + "grad_norm": 53.90625, + "learning_rate": 2.2594075965632046e-07, + "loss": 1.3679, + "step": 40940 + }, + { + "epoch": 3.87, + "grad_norm": 62.96875, + "learning_rate": 2.2556261612551488e-07, + "loss": 1.4215, + "step": 40960 + }, + { + "epoch": 3.87, + "grad_norm": 62.03125, + "learning_rate": 2.2518447259470935e-07, + "loss": 1.4289, + "step": 40980 + }, + { + "epoch": 3.88, + "grad_norm": 60.78125, + "learning_rate": 2.2480632906390378e-07, + "loss": 1.52, + "step": 41000 + }, + { + "epoch": 3.88, + "grad_norm": 54.03125, + "learning_rate": 2.2442818553309822e-07, + "loss": 1.3679, + "step": 41020 + }, + { + "epoch": 3.88, + "grad_norm": 61.6875, + "learning_rate": 2.2405004200229267e-07, + "loss": 1.5041, + "step": 41040 + }, + { + "epoch": 3.88, + "grad_norm": 58.40625, + "learning_rate": 2.2367189847148712e-07, + "loss": 1.4264, + "step": 41060 + }, + { + "epoch": 3.88, + "grad_norm": 58.03125, + "learning_rate": 2.2329375494068157e-07, + "loss": 1.3305, + "step": 41080 + }, + { + "epoch": 3.89, + "grad_norm": 62.5625, + "learning_rate": 2.22915611409876e-07, + "loss": 1.3939, + "step": 41100 + }, + { + "epoch": 3.89, + "grad_norm": 62.5, + "learning_rate": 2.2253746787907044e-07, + "loss": 1.3357, + "step": 41120 + }, + { + "epoch": 3.89, + "grad_norm": 61.03125, + "learning_rate": 2.2215932434826486e-07, + "loss": 1.3863, + "step": 41140 + }, + { + "epoch": 3.89, + "grad_norm": 71.5625, + "learning_rate": 2.2178118081745933e-07, + "loss": 1.5696, + "step": 41160 + }, + { + "epoch": 3.89, + "grad_norm": 56.25, + "learning_rate": 2.2140303728665378e-07, + "loss": 1.3852, + "step": 41180 + }, + { + "epoch": 3.89, + "grad_norm": 56.0625, + "learning_rate": 2.210248937558482e-07, + "loss": 1.3793, + "step": 41200 + }, + { + "epoch": 3.9, + "grad_norm": 68.8125, + "learning_rate": 2.2064675022504265e-07, + "loss": 1.3969, + "step": 41220 + }, + { + "epoch": 3.9, + "grad_norm": 86.3125, + "learning_rate": 2.202686066942371e-07, + "loss": 1.4984, + "step": 41240 + }, + { + "epoch": 3.9, + "grad_norm": 73.5, + "learning_rate": 2.1989046316343154e-07, + "loss": 1.4178, + "step": 41260 + }, + { + "epoch": 3.9, + "grad_norm": 82.875, + "learning_rate": 2.19512319632626e-07, + "loss": 1.4235, + "step": 41280 + }, + { + "epoch": 3.9, + "grad_norm": 57.90625, + "learning_rate": 2.191341761018204e-07, + "loss": 1.3364, + "step": 41300 + }, + { + "epoch": 3.91, + "grad_norm": 60.28125, + "learning_rate": 2.187560325710149e-07, + "loss": 1.3978, + "step": 41320 + }, + { + "epoch": 3.91, + "grad_norm": 147.0, + "learning_rate": 2.183778890402093e-07, + "loss": 1.3867, + "step": 41340 + }, + { + "epoch": 3.91, + "grad_norm": 106.375, + "learning_rate": 2.1799974550940376e-07, + "loss": 1.4255, + "step": 41360 + }, + { + "epoch": 3.91, + "grad_norm": 75.1875, + "learning_rate": 2.1762160197859818e-07, + "loss": 1.4822, + "step": 41380 + }, + { + "epoch": 3.91, + "grad_norm": 60.625, + "learning_rate": 2.1724345844779263e-07, + "loss": 1.4114, + "step": 41400 + }, + { + "epoch": 3.92, + "grad_norm": 66.0, + "learning_rate": 2.168653149169871e-07, + "loss": 1.4062, + "step": 41420 + }, + { + "epoch": 3.92, + "grad_norm": 61.8125, + "learning_rate": 2.1648717138618152e-07, + "loss": 1.4739, + "step": 41440 + }, + { + "epoch": 3.92, + "grad_norm": 79.8125, + "learning_rate": 2.1610902785537597e-07, + "loss": 1.4277, + "step": 41460 + }, + { + "epoch": 3.92, + "grad_norm": 55.84375, + "learning_rate": 2.157308843245704e-07, + "loss": 1.4047, + "step": 41480 + }, + { + "epoch": 3.92, + "grad_norm": 84.0, + "learning_rate": 2.1535274079376486e-07, + "loss": 1.3995, + "step": 41500 + }, + { + "epoch": 3.93, + "grad_norm": 66.125, + "learning_rate": 2.149745972629593e-07, + "loss": 1.497, + "step": 41520 + }, + { + "epoch": 3.93, + "grad_norm": 75.0625, + "learning_rate": 2.1459645373215373e-07, + "loss": 1.4269, + "step": 41540 + }, + { + "epoch": 3.93, + "grad_norm": 78.0625, + "learning_rate": 2.1421831020134818e-07, + "loss": 1.4038, + "step": 41560 + }, + { + "epoch": 3.93, + "grad_norm": 64.3125, + "learning_rate": 2.138401666705426e-07, + "loss": 1.3915, + "step": 41580 + }, + { + "epoch": 3.93, + "grad_norm": 53.21875, + "learning_rate": 2.1346202313973708e-07, + "loss": 1.385, + "step": 41600 + }, + { + "epoch": 3.93, + "grad_norm": 64.75, + "learning_rate": 2.130838796089315e-07, + "loss": 1.4048, + "step": 41620 + }, + { + "epoch": 3.94, + "grad_norm": 69.9375, + "learning_rate": 2.1270573607812595e-07, + "loss": 1.383, + "step": 41640 + }, + { + "epoch": 3.94, + "grad_norm": 55.25, + "learning_rate": 2.123275925473204e-07, + "loss": 1.4297, + "step": 41660 + }, + { + "epoch": 3.94, + "grad_norm": 62.84375, + "learning_rate": 2.1194944901651484e-07, + "loss": 1.4653, + "step": 41680 + }, + { + "epoch": 3.94, + "grad_norm": 63.0625, + "learning_rate": 2.115713054857093e-07, + "loss": 1.3194, + "step": 41700 + }, + { + "epoch": 3.94, + "grad_norm": 93.0, + "learning_rate": 2.111931619549037e-07, + "loss": 1.3567, + "step": 41720 + }, + { + "epoch": 3.95, + "grad_norm": 73.375, + "learning_rate": 2.1081501842409816e-07, + "loss": 1.3192, + "step": 41740 + }, + { + "epoch": 3.95, + "grad_norm": 60.53125, + "learning_rate": 2.1043687489329263e-07, + "loss": 1.3712, + "step": 41760 + }, + { + "epoch": 3.95, + "grad_norm": 55.0, + "learning_rate": 2.1005873136248705e-07, + "loss": 1.4137, + "step": 41780 + }, + { + "epoch": 3.95, + "grad_norm": 62.53125, + "learning_rate": 2.096805878316815e-07, + "loss": 1.4085, + "step": 41800 + }, + { + "epoch": 3.95, + "grad_norm": 61.84375, + "learning_rate": 2.0930244430087592e-07, + "loss": 1.3769, + "step": 41820 + }, + { + "epoch": 3.96, + "grad_norm": 71.75, + "learning_rate": 2.0892430077007037e-07, + "loss": 1.3457, + "step": 41840 + }, + { + "epoch": 3.96, + "grad_norm": 70.9375, + "learning_rate": 2.0854615723926482e-07, + "loss": 1.3542, + "step": 41860 + }, + { + "epoch": 3.96, + "grad_norm": 78.5, + "learning_rate": 2.0816801370845927e-07, + "loss": 1.438, + "step": 41880 + }, + { + "epoch": 3.96, + "grad_norm": 78.1875, + "learning_rate": 2.0778987017765371e-07, + "loss": 1.4064, + "step": 41900 + }, + { + "epoch": 3.96, + "grad_norm": 73.8125, + "learning_rate": 2.0741172664684814e-07, + "loss": 1.4502, + "step": 41920 + }, + { + "epoch": 3.96, + "grad_norm": 48.5, + "learning_rate": 2.070335831160426e-07, + "loss": 1.3534, + "step": 41940 + }, + { + "epoch": 3.97, + "grad_norm": 99.125, + "learning_rate": 2.0665543958523703e-07, + "loss": 1.3896, + "step": 41960 + }, + { + "epoch": 3.97, + "grad_norm": 66.1875, + "learning_rate": 2.0627729605443148e-07, + "loss": 1.3242, + "step": 41980 + }, + { + "epoch": 3.97, + "grad_norm": 83.0, + "learning_rate": 2.058991525236259e-07, + "loss": 1.49, + "step": 42000 + }, + { + "epoch": 3.97, + "grad_norm": 74.3125, + "learning_rate": 2.0552100899282037e-07, + "loss": 1.451, + "step": 42020 + }, + { + "epoch": 3.97, + "grad_norm": 65.125, + "learning_rate": 2.0514286546201482e-07, + "loss": 1.307, + "step": 42040 + }, + { + "epoch": 3.98, + "grad_norm": 67.75, + "learning_rate": 2.0476472193120924e-07, + "loss": 1.5365, + "step": 42060 + }, + { + "epoch": 3.98, + "grad_norm": 56.3125, + "learning_rate": 2.043865784004037e-07, + "loss": 1.3086, + "step": 42080 + }, + { + "epoch": 3.98, + "grad_norm": 72.8125, + "learning_rate": 2.040084348695981e-07, + "loss": 1.3316, + "step": 42100 + }, + { + "epoch": 3.98, + "grad_norm": 68.75, + "learning_rate": 2.0363029133879259e-07, + "loss": 1.4897, + "step": 42120 + }, + { + "epoch": 3.98, + "grad_norm": 63.0625, + "learning_rate": 2.0325214780798703e-07, + "loss": 1.3649, + "step": 42140 + }, + { + "epoch": 3.99, + "grad_norm": 63.75, + "learning_rate": 2.0287400427718146e-07, + "loss": 1.4543, + "step": 42160 + }, + { + "epoch": 3.99, + "grad_norm": 59.21875, + "learning_rate": 2.024958607463759e-07, + "loss": 1.4654, + "step": 42180 + }, + { + "epoch": 3.99, + "grad_norm": 63.5, + "learning_rate": 2.0211771721557035e-07, + "loss": 1.4143, + "step": 42200 + }, + { + "epoch": 3.99, + "grad_norm": 74.875, + "learning_rate": 2.017395736847648e-07, + "loss": 1.3129, + "step": 42220 + }, + { + "epoch": 3.99, + "grad_norm": 74.375, + "learning_rate": 2.0136143015395922e-07, + "loss": 1.4688, + "step": 42240 + }, + { + "epoch": 4.0, + "grad_norm": 78.1875, + "learning_rate": 2.0098328662315367e-07, + "loss": 1.4214, + "step": 42260 + }, + { + "epoch": 4.0, + "grad_norm": 76.6875, + "learning_rate": 2.0060514309234814e-07, + "loss": 1.3917, + "step": 42280 + }, + { + "epoch": 4.0, + "grad_norm": 59.3125, + "learning_rate": 2.0022699956154256e-07, + "loss": 1.3565, + "step": 42300 + }, + { + "epoch": 4.0, + "grad_norm": 71.4375, + "learning_rate": 1.99848856030737e-07, + "loss": 1.3224, + "step": 42320 + }, + { + "epoch": 4.0, + "grad_norm": 79.0, + "learning_rate": 1.9947071249993143e-07, + "loss": 1.3764, + "step": 42340 + }, + { + "epoch": 4.0, + "grad_norm": 67.6875, + "learning_rate": 1.9909256896912588e-07, + "loss": 1.2843, + "step": 42360 + }, + { + "epoch": 4.01, + "grad_norm": 63.21875, + "learning_rate": 1.9871442543832035e-07, + "loss": 1.3439, + "step": 42380 + }, + { + "epoch": 4.01, + "grad_norm": 52.71875, + "learning_rate": 1.9833628190751478e-07, + "loss": 1.2762, + "step": 42400 + }, + { + "epoch": 4.01, + "grad_norm": 50.59375, + "learning_rate": 1.9795813837670922e-07, + "loss": 1.3074, + "step": 42420 + }, + { + "epoch": 4.01, + "grad_norm": 65.4375, + "learning_rate": 1.9757999484590365e-07, + "loss": 1.451, + "step": 42440 + }, + { + "epoch": 4.01, + "grad_norm": 79.9375, + "learning_rate": 1.9720185131509812e-07, + "loss": 1.337, + "step": 42460 + }, + { + "epoch": 4.02, + "grad_norm": 80.875, + "learning_rate": 1.9682370778429254e-07, + "loss": 1.316, + "step": 42480 + }, + { + "epoch": 4.02, + "grad_norm": 73.875, + "learning_rate": 1.96445564253487e-07, + "loss": 1.252, + "step": 42500 + }, + { + "epoch": 4.02, + "grad_norm": 59.90625, + "learning_rate": 1.9606742072268144e-07, + "loss": 1.3394, + "step": 42520 + }, + { + "epoch": 4.02, + "grad_norm": 61.0625, + "learning_rate": 1.9568927719187586e-07, + "loss": 1.4067, + "step": 42540 + }, + { + "epoch": 4.02, + "grad_norm": 73.3125, + "learning_rate": 1.9531113366107033e-07, + "loss": 1.3229, + "step": 42560 + }, + { + "epoch": 4.03, + "grad_norm": 99.625, + "learning_rate": 1.9493299013026475e-07, + "loss": 1.3768, + "step": 42580 + }, + { + "epoch": 4.03, + "grad_norm": 57.78125, + "learning_rate": 1.945548465994592e-07, + "loss": 1.3839, + "step": 42600 + }, + { + "epoch": 4.03, + "grad_norm": 61.0, + "learning_rate": 1.9417670306865365e-07, + "loss": 1.3187, + "step": 42620 + }, + { + "epoch": 4.03, + "grad_norm": 56.5625, + "learning_rate": 1.937985595378481e-07, + "loss": 1.3785, + "step": 42640 + }, + { + "epoch": 4.03, + "grad_norm": 52.09375, + "learning_rate": 1.9342041600704254e-07, + "loss": 1.2809, + "step": 42660 + }, + { + "epoch": 4.03, + "grad_norm": 60.65625, + "learning_rate": 1.9304227247623697e-07, + "loss": 1.3787, + "step": 42680 + }, + { + "epoch": 4.04, + "grad_norm": 58.15625, + "learning_rate": 1.926641289454314e-07, + "loss": 1.2968, + "step": 42700 + }, + { + "epoch": 4.04, + "grad_norm": 76.875, + "learning_rate": 1.9228598541462586e-07, + "loss": 1.3426, + "step": 42720 + }, + { + "epoch": 4.04, + "grad_norm": 64.125, + "learning_rate": 1.919078418838203e-07, + "loss": 1.389, + "step": 42740 + }, + { + "epoch": 4.04, + "grad_norm": 74.875, + "learning_rate": 1.9152969835301476e-07, + "loss": 1.3012, + "step": 42760 + }, + { + "epoch": 4.04, + "grad_norm": 68.3125, + "learning_rate": 1.9115155482220918e-07, + "loss": 1.3064, + "step": 42780 + }, + { + "epoch": 4.05, + "grad_norm": 67.375, + "learning_rate": 1.9077341129140363e-07, + "loss": 1.2949, + "step": 42800 + }, + { + "epoch": 4.05, + "grad_norm": 63.84375, + "learning_rate": 1.9039526776059807e-07, + "loss": 1.3852, + "step": 42820 + }, + { + "epoch": 4.05, + "grad_norm": 60.4375, + "learning_rate": 1.9001712422979252e-07, + "loss": 1.3849, + "step": 42840 + }, + { + "epoch": 4.05, + "grad_norm": 56.71875, + "learning_rate": 1.8963898069898694e-07, + "loss": 1.2859, + "step": 42860 + }, + { + "epoch": 4.05, + "grad_norm": 60.90625, + "learning_rate": 1.892608371681814e-07, + "loss": 1.3057, + "step": 42880 + }, + { + "epoch": 4.06, + "grad_norm": 78.1875, + "learning_rate": 1.8888269363737586e-07, + "loss": 1.3186, + "step": 42900 + }, + { + "epoch": 4.06, + "grad_norm": 56.84375, + "learning_rate": 1.8850455010657029e-07, + "loss": 1.36, + "step": 42920 + }, + { + "epoch": 4.06, + "grad_norm": 63.03125, + "learning_rate": 1.8812640657576473e-07, + "loss": 1.2948, + "step": 42940 + }, + { + "epoch": 4.06, + "grad_norm": 57.90625, + "learning_rate": 1.8774826304495915e-07, + "loss": 1.434, + "step": 42960 + }, + { + "epoch": 4.06, + "grad_norm": 58.1875, + "learning_rate": 1.8737011951415363e-07, + "loss": 1.3389, + "step": 42980 + }, + { + "epoch": 4.07, + "grad_norm": 82.6875, + "learning_rate": 1.8699197598334808e-07, + "loss": 1.3851, + "step": 43000 + }, + { + "epoch": 4.07, + "grad_norm": 59.65625, + "learning_rate": 1.866138324525425e-07, + "loss": 1.3092, + "step": 43020 + }, + { + "epoch": 4.07, + "grad_norm": 83.875, + "learning_rate": 1.8623568892173695e-07, + "loss": 1.2534, + "step": 43040 + }, + { + "epoch": 4.07, + "grad_norm": 56.9375, + "learning_rate": 1.8585754539093137e-07, + "loss": 1.3689, + "step": 43060 + }, + { + "epoch": 4.07, + "grad_norm": 65.9375, + "learning_rate": 1.8547940186012584e-07, + "loss": 1.3127, + "step": 43080 + }, + { + "epoch": 4.07, + "grad_norm": 72.9375, + "learning_rate": 1.8510125832932026e-07, + "loss": 1.4009, + "step": 43100 + }, + { + "epoch": 4.08, + "grad_norm": 57.40625, + "learning_rate": 1.847231147985147e-07, + "loss": 1.3522, + "step": 43120 + }, + { + "epoch": 4.08, + "grad_norm": 60.40625, + "learning_rate": 1.8434497126770916e-07, + "loss": 1.3938, + "step": 43140 + }, + { + "epoch": 4.08, + "grad_norm": 62.71875, + "learning_rate": 1.839668277369036e-07, + "loss": 1.3835, + "step": 43160 + }, + { + "epoch": 4.08, + "grad_norm": 53.5625, + "learning_rate": 1.8358868420609805e-07, + "loss": 1.3114, + "step": 43180 + }, + { + "epoch": 4.08, + "grad_norm": 71.3125, + "learning_rate": 1.8321054067529248e-07, + "loss": 1.3608, + "step": 43200 + }, + { + "epoch": 4.09, + "grad_norm": 109.1875, + "learning_rate": 1.8283239714448692e-07, + "loss": 1.3388, + "step": 43220 + }, + { + "epoch": 4.09, + "grad_norm": 64.8125, + "learning_rate": 1.8245425361368137e-07, + "loss": 1.2882, + "step": 43240 + }, + { + "epoch": 4.09, + "grad_norm": 61.8125, + "learning_rate": 1.8207611008287582e-07, + "loss": 1.3305, + "step": 43260 + }, + { + "epoch": 4.09, + "grad_norm": 63.46875, + "learning_rate": 1.8169796655207027e-07, + "loss": 1.3216, + "step": 43280 + }, + { + "epoch": 4.09, + "grad_norm": 54.28125, + "learning_rate": 1.813198230212647e-07, + "loss": 1.3642, + "step": 43300 + }, + { + "epoch": 4.1, + "grad_norm": 56.78125, + "learning_rate": 1.8094167949045914e-07, + "loss": 1.2574, + "step": 43320 + }, + { + "epoch": 4.1, + "grad_norm": 56.28125, + "learning_rate": 1.8056353595965358e-07, + "loss": 1.2901, + "step": 43340 + }, + { + "epoch": 4.1, + "grad_norm": 59.375, + "learning_rate": 1.8018539242884803e-07, + "loss": 1.3838, + "step": 43360 + }, + { + "epoch": 4.1, + "grad_norm": 57.25, + "learning_rate": 1.7980724889804248e-07, + "loss": 1.307, + "step": 43380 + }, + { + "epoch": 4.1, + "grad_norm": 67.4375, + "learning_rate": 1.794291053672369e-07, + "loss": 1.2821, + "step": 43400 + }, + { + "epoch": 4.1, + "grad_norm": 89.375, + "learning_rate": 1.7905096183643137e-07, + "loss": 1.337, + "step": 43420 + }, + { + "epoch": 4.11, + "grad_norm": 104.4375, + "learning_rate": 1.786728183056258e-07, + "loss": 1.3838, + "step": 43440 + }, + { + "epoch": 4.11, + "grad_norm": 63.78125, + "learning_rate": 1.7829467477482024e-07, + "loss": 1.3542, + "step": 43460 + }, + { + "epoch": 4.11, + "grad_norm": 64.1875, + "learning_rate": 1.779165312440147e-07, + "loss": 1.3268, + "step": 43480 + }, + { + "epoch": 4.11, + "grad_norm": 60.1875, + "learning_rate": 1.775383877132091e-07, + "loss": 1.2482, + "step": 43500 + }, + { + "epoch": 4.11, + "grad_norm": 72.0, + "learning_rate": 1.7716024418240359e-07, + "loss": 1.3675, + "step": 43520 + }, + { + "epoch": 4.12, + "grad_norm": 51.375, + "learning_rate": 1.76782100651598e-07, + "loss": 1.3264, + "step": 43540 + }, + { + "epoch": 4.12, + "grad_norm": 79.0625, + "learning_rate": 1.7640395712079246e-07, + "loss": 1.2761, + "step": 43560 + }, + { + "epoch": 4.12, + "grad_norm": 59.5625, + "learning_rate": 1.7602581358998688e-07, + "loss": 1.3908, + "step": 43580 + }, + { + "epoch": 4.12, + "grad_norm": 67.3125, + "learning_rate": 1.7564767005918135e-07, + "loss": 1.3082, + "step": 43600 + }, + { + "epoch": 4.12, + "grad_norm": 79.625, + "learning_rate": 1.752695265283758e-07, + "loss": 1.3252, + "step": 43620 + }, + { + "epoch": 4.13, + "grad_norm": 82.0, + "learning_rate": 1.7489138299757022e-07, + "loss": 1.3506, + "step": 43640 + }, + { + "epoch": 4.13, + "grad_norm": 75.6875, + "learning_rate": 1.7451323946676467e-07, + "loss": 1.2852, + "step": 43660 + }, + { + "epoch": 4.13, + "grad_norm": 62.15625, + "learning_rate": 1.7413509593595912e-07, + "loss": 1.2953, + "step": 43680 + }, + { + "epoch": 4.13, + "grad_norm": 70.0625, + "learning_rate": 1.7375695240515356e-07, + "loss": 1.3503, + "step": 43700 + }, + { + "epoch": 4.13, + "grad_norm": 69.625, + "learning_rate": 1.73378808874348e-07, + "loss": 1.3355, + "step": 43720 + }, + { + "epoch": 4.13, + "grad_norm": 82.3125, + "learning_rate": 1.7300066534354243e-07, + "loss": 1.3057, + "step": 43740 + }, + { + "epoch": 4.14, + "grad_norm": 71.3125, + "learning_rate": 1.7262252181273688e-07, + "loss": 1.2385, + "step": 43760 + }, + { + "epoch": 4.14, + "grad_norm": 64.8125, + "learning_rate": 1.7224437828193133e-07, + "loss": 1.3327, + "step": 43780 + }, + { + "epoch": 4.14, + "grad_norm": 64.9375, + "learning_rate": 1.7186623475112578e-07, + "loss": 1.2779, + "step": 43800 + }, + { + "epoch": 4.14, + "grad_norm": 75.875, + "learning_rate": 1.714880912203202e-07, + "loss": 1.3603, + "step": 43820 + }, + { + "epoch": 4.14, + "grad_norm": 47.15625, + "learning_rate": 1.7110994768951465e-07, + "loss": 1.2305, + "step": 43840 + }, + { + "epoch": 4.15, + "grad_norm": 69.1875, + "learning_rate": 1.7073180415870912e-07, + "loss": 1.2893, + "step": 43860 + }, + { + "epoch": 4.15, + "grad_norm": 61.5, + "learning_rate": 1.7035366062790354e-07, + "loss": 1.3423, + "step": 43880 + }, + { + "epoch": 4.15, + "grad_norm": 58.71875, + "learning_rate": 1.69975517097098e-07, + "loss": 1.2352, + "step": 43900 + }, + { + "epoch": 4.15, + "grad_norm": 51.1875, + "learning_rate": 1.695973735662924e-07, + "loss": 1.314, + "step": 43920 + }, + { + "epoch": 4.15, + "grad_norm": 60.59375, + "learning_rate": 1.6921923003548688e-07, + "loss": 1.2465, + "step": 43940 + }, + { + "epoch": 4.16, + "grad_norm": 87.125, + "learning_rate": 1.688410865046813e-07, + "loss": 1.3573, + "step": 43960 + }, + { + "epoch": 4.16, + "grad_norm": 78.6875, + "learning_rate": 1.6846294297387575e-07, + "loss": 1.3748, + "step": 43980 + }, + { + "epoch": 4.16, + "grad_norm": 64.75, + "learning_rate": 1.680847994430702e-07, + "loss": 1.4336, + "step": 44000 + }, + { + "epoch": 4.16, + "grad_norm": 69.0625, + "learning_rate": 1.6770665591226462e-07, + "loss": 1.3579, + "step": 44020 + }, + { + "epoch": 4.16, + "grad_norm": 65.125, + "learning_rate": 1.673285123814591e-07, + "loss": 1.3783, + "step": 44040 + }, + { + "epoch": 4.17, + "grad_norm": 57.1875, + "learning_rate": 1.6695036885065352e-07, + "loss": 1.277, + "step": 44060 + }, + { + "epoch": 4.17, + "grad_norm": 63.78125, + "learning_rate": 1.6657222531984797e-07, + "loss": 1.3084, + "step": 44080 + }, + { + "epoch": 4.17, + "grad_norm": 62.75, + "learning_rate": 1.6619408178904241e-07, + "loss": 1.4371, + "step": 44100 + }, + { + "epoch": 4.17, + "grad_norm": 60.6875, + "learning_rate": 1.6581593825823686e-07, + "loss": 1.3106, + "step": 44120 + }, + { + "epoch": 4.17, + "grad_norm": 55.90625, + "learning_rate": 1.654377947274313e-07, + "loss": 1.3069, + "step": 44140 + }, + { + "epoch": 4.17, + "grad_norm": 73.625, + "learning_rate": 1.6505965119662573e-07, + "loss": 1.3339, + "step": 44160 + }, + { + "epoch": 4.18, + "grad_norm": 56.03125, + "learning_rate": 1.6468150766582018e-07, + "loss": 1.1934, + "step": 44180 + }, + { + "epoch": 4.18, + "grad_norm": 59.59375, + "learning_rate": 1.643033641350146e-07, + "loss": 1.2749, + "step": 44200 + }, + { + "epoch": 4.18, + "grad_norm": 52.15625, + "learning_rate": 1.6392522060420907e-07, + "loss": 1.2481, + "step": 44220 + }, + { + "epoch": 4.18, + "grad_norm": 74.25, + "learning_rate": 1.6354707707340352e-07, + "loss": 1.3556, + "step": 44240 + }, + { + "epoch": 4.18, + "grad_norm": 54.09375, + "learning_rate": 1.6316893354259794e-07, + "loss": 1.3386, + "step": 44260 + }, + { + "epoch": 4.19, + "grad_norm": 60.78125, + "learning_rate": 1.627907900117924e-07, + "loss": 1.4361, + "step": 44280 + }, + { + "epoch": 4.19, + "grad_norm": 67.5, + "learning_rate": 1.6241264648098684e-07, + "loss": 1.3713, + "step": 44300 + }, + { + "epoch": 4.19, + "grad_norm": 65.875, + "learning_rate": 1.6203450295018129e-07, + "loss": 1.3499, + "step": 44320 + }, + { + "epoch": 4.19, + "grad_norm": 54.84375, + "learning_rate": 1.6165635941937573e-07, + "loss": 1.3358, + "step": 44340 + }, + { + "epoch": 4.19, + "grad_norm": 54.78125, + "learning_rate": 1.6127821588857015e-07, + "loss": 1.32, + "step": 44360 + }, + { + "epoch": 4.2, + "grad_norm": 72.0, + "learning_rate": 1.6090007235776463e-07, + "loss": 1.3005, + "step": 44380 + }, + { + "epoch": 4.2, + "grad_norm": 58.6875, + "learning_rate": 1.6052192882695905e-07, + "loss": 1.2939, + "step": 44400 + }, + { + "epoch": 4.2, + "grad_norm": 64.875, + "learning_rate": 1.601437852961535e-07, + "loss": 1.4002, + "step": 44420 + }, + { + "epoch": 4.2, + "grad_norm": 61.25, + "learning_rate": 1.5976564176534792e-07, + "loss": 1.2259, + "step": 44440 + }, + { + "epoch": 4.2, + "grad_norm": 71.1875, + "learning_rate": 1.5938749823454237e-07, + "loss": 1.432, + "step": 44460 + }, + { + "epoch": 4.2, + "grad_norm": 73.25, + "learning_rate": 1.5900935470373684e-07, + "loss": 1.316, + "step": 44480 + }, + { + "epoch": 4.21, + "grad_norm": 93.3125, + "learning_rate": 1.5863121117293126e-07, + "loss": 1.3683, + "step": 44500 + }, + { + "epoch": 4.21, + "grad_norm": 65.9375, + "learning_rate": 1.582530676421257e-07, + "loss": 1.2994, + "step": 44520 + }, + { + "epoch": 4.21, + "grad_norm": 76.5, + "learning_rate": 1.5787492411132013e-07, + "loss": 1.3194, + "step": 44540 + }, + { + "epoch": 4.21, + "grad_norm": 63.0625, + "learning_rate": 1.574967805805146e-07, + "loss": 1.2961, + "step": 44560 + }, + { + "epoch": 4.21, + "grad_norm": 60.8125, + "learning_rate": 1.5711863704970905e-07, + "loss": 1.4499, + "step": 44580 + }, + { + "epoch": 4.22, + "grad_norm": 76.3125, + "learning_rate": 1.5674049351890348e-07, + "loss": 1.3242, + "step": 44600 + }, + { + "epoch": 4.22, + "grad_norm": 70.25, + "learning_rate": 1.5636234998809792e-07, + "loss": 1.3083, + "step": 44620 + }, + { + "epoch": 4.22, + "grad_norm": 68.1875, + "learning_rate": 1.5598420645729237e-07, + "loss": 1.3901, + "step": 44640 + }, + { + "epoch": 4.22, + "grad_norm": 53.3125, + "learning_rate": 1.5560606292648682e-07, + "loss": 1.3179, + "step": 44660 + }, + { + "epoch": 4.22, + "grad_norm": 92.125, + "learning_rate": 1.5522791939568124e-07, + "loss": 1.2245, + "step": 44680 + }, + { + "epoch": 4.23, + "grad_norm": 77.0625, + "learning_rate": 1.548497758648757e-07, + "loss": 1.3041, + "step": 44700 + }, + { + "epoch": 4.23, + "grad_norm": 56.09375, + "learning_rate": 1.5447163233407014e-07, + "loss": 1.3729, + "step": 44720 + }, + { + "epoch": 4.23, + "grad_norm": 68.625, + "learning_rate": 1.5409348880326458e-07, + "loss": 1.314, + "step": 44740 + }, + { + "epoch": 4.23, + "grad_norm": 65.625, + "learning_rate": 1.5371534527245903e-07, + "loss": 1.3265, + "step": 44760 + }, + { + "epoch": 4.23, + "grad_norm": 60.09375, + "learning_rate": 1.5333720174165345e-07, + "loss": 1.257, + "step": 44780 + }, + { + "epoch": 4.24, + "grad_norm": 57.375, + "learning_rate": 1.529590582108479e-07, + "loss": 1.3232, + "step": 44800 + }, + { + "epoch": 4.24, + "grad_norm": 79.1875, + "learning_rate": 1.5258091468004235e-07, + "loss": 1.3386, + "step": 44820 + }, + { + "epoch": 4.24, + "grad_norm": 80.5, + "learning_rate": 1.522027711492368e-07, + "loss": 1.3517, + "step": 44840 + }, + { + "epoch": 4.24, + "grad_norm": 59.4375, + "learning_rate": 1.5182462761843124e-07, + "loss": 1.2794, + "step": 44860 + }, + { + "epoch": 4.24, + "grad_norm": 69.375, + "learning_rate": 1.5144648408762566e-07, + "loss": 1.2523, + "step": 44880 + }, + { + "epoch": 4.24, + "grad_norm": 98.875, + "learning_rate": 1.5106834055682014e-07, + "loss": 1.3374, + "step": 44900 + }, + { + "epoch": 4.25, + "grad_norm": 77.8125, + "learning_rate": 1.5069019702601456e-07, + "loss": 1.3555, + "step": 44920 + }, + { + "epoch": 4.25, + "grad_norm": 59.78125, + "learning_rate": 1.50312053495209e-07, + "loss": 1.3583, + "step": 44940 + }, + { + "epoch": 4.25, + "grad_norm": 56.75, + "learning_rate": 1.4993390996440346e-07, + "loss": 1.3552, + "step": 44960 + }, + { + "epoch": 4.25, + "grad_norm": 51.96875, + "learning_rate": 1.4955576643359788e-07, + "loss": 1.3089, + "step": 44980 + }, + { + "epoch": 4.25, + "grad_norm": 56.9375, + "learning_rate": 1.4917762290279235e-07, + "loss": 1.3219, + "step": 45000 + }, + { + "epoch": 4.26, + "grad_norm": 71.5, + "learning_rate": 1.4879947937198677e-07, + "loss": 1.3258, + "step": 45020 + }, + { + "epoch": 4.26, + "grad_norm": 84.1875, + "learning_rate": 1.4842133584118122e-07, + "loss": 1.3392, + "step": 45040 + }, + { + "epoch": 4.26, + "grad_norm": 66.625, + "learning_rate": 1.4804319231037564e-07, + "loss": 1.2842, + "step": 45060 + }, + { + "epoch": 4.26, + "grad_norm": 72.8125, + "learning_rate": 1.4766504877957012e-07, + "loss": 1.32, + "step": 45080 + }, + { + "epoch": 4.26, + "grad_norm": 59.25, + "learning_rate": 1.4728690524876456e-07, + "loss": 1.334, + "step": 45100 + }, + { + "epoch": 4.27, + "grad_norm": 32.84375, + "learning_rate": 1.4690876171795899e-07, + "loss": 1.2431, + "step": 45120 + }, + { + "epoch": 4.27, + "grad_norm": 47.5, + "learning_rate": 1.4653061818715343e-07, + "loss": 1.3683, + "step": 45140 + }, + { + "epoch": 4.27, + "grad_norm": 60.78125, + "learning_rate": 1.4615247465634785e-07, + "loss": 1.4307, + "step": 45160 + }, + { + "epoch": 4.27, + "grad_norm": 74.0, + "learning_rate": 1.4577433112554233e-07, + "loss": 1.3282, + "step": 45180 + }, + { + "epoch": 4.27, + "grad_norm": 59.28125, + "learning_rate": 1.4539618759473678e-07, + "loss": 1.3151, + "step": 45200 + }, + { + "epoch": 4.27, + "grad_norm": 60.0, + "learning_rate": 1.450180440639312e-07, + "loss": 1.2706, + "step": 45220 + }, + { + "epoch": 4.28, + "grad_norm": 62.0625, + "learning_rate": 1.4463990053312565e-07, + "loss": 1.2947, + "step": 45240 + }, + { + "epoch": 4.28, + "grad_norm": 62.03125, + "learning_rate": 1.442617570023201e-07, + "loss": 1.3164, + "step": 45260 + }, + { + "epoch": 4.28, + "grad_norm": 66.5, + "learning_rate": 1.4388361347151454e-07, + "loss": 1.2706, + "step": 45280 + }, + { + "epoch": 4.28, + "grad_norm": 61.75, + "learning_rate": 1.4350546994070896e-07, + "loss": 1.3473, + "step": 45300 + }, + { + "epoch": 4.28, + "grad_norm": 51.75, + "learning_rate": 1.431273264099034e-07, + "loss": 1.2948, + "step": 45320 + }, + { + "epoch": 4.29, + "grad_norm": 59.3125, + "learning_rate": 1.4274918287909788e-07, + "loss": 1.262, + "step": 45340 + }, + { + "epoch": 4.29, + "grad_norm": 101.625, + "learning_rate": 1.423710393482923e-07, + "loss": 1.3206, + "step": 45360 + }, + { + "epoch": 4.29, + "grad_norm": 61.96875, + "learning_rate": 1.4199289581748675e-07, + "loss": 1.3655, + "step": 45380 + }, + { + "epoch": 4.29, + "grad_norm": 56.375, + "learning_rate": 1.4161475228668117e-07, + "loss": 1.3355, + "step": 45400 + }, + { + "epoch": 4.29, + "grad_norm": 61.6875, + "learning_rate": 1.4123660875587562e-07, + "loss": 1.3372, + "step": 45420 + }, + { + "epoch": 4.3, + "grad_norm": 55.9375, + "learning_rate": 1.408584652250701e-07, + "loss": 1.2686, + "step": 45440 + }, + { + "epoch": 4.3, + "grad_norm": 64.75, + "learning_rate": 1.4048032169426452e-07, + "loss": 1.2195, + "step": 45460 + }, + { + "epoch": 4.3, + "grad_norm": 57.5625, + "learning_rate": 1.4010217816345897e-07, + "loss": 1.3275, + "step": 45480 + }, + { + "epoch": 4.3, + "grad_norm": 62.40625, + "learning_rate": 1.397240346326534e-07, + "loss": 1.2764, + "step": 45500 + }, + { + "epoch": 4.3, + "grad_norm": 54.5, + "learning_rate": 1.3934589110184786e-07, + "loss": 1.2676, + "step": 45520 + }, + { + "epoch": 4.31, + "grad_norm": 69.625, + "learning_rate": 1.3896774757104228e-07, + "loss": 1.4004, + "step": 45540 + }, + { + "epoch": 4.31, + "grad_norm": 61.78125, + "learning_rate": 1.3858960404023673e-07, + "loss": 1.3371, + "step": 45560 + }, + { + "epoch": 4.31, + "grad_norm": 67.125, + "learning_rate": 1.3821146050943118e-07, + "loss": 1.4369, + "step": 45580 + }, + { + "epoch": 4.31, + "grad_norm": 49.09375, + "learning_rate": 1.3783331697862563e-07, + "loss": 1.229, + "step": 45600 + }, + { + "epoch": 4.31, + "grad_norm": 96.4375, + "learning_rate": 1.3745517344782007e-07, + "loss": 1.3351, + "step": 45620 + }, + { + "epoch": 4.31, + "grad_norm": 69.5625, + "learning_rate": 1.370770299170145e-07, + "loss": 1.3794, + "step": 45640 + }, + { + "epoch": 4.32, + "grad_norm": 50.8125, + "learning_rate": 1.3669888638620894e-07, + "loss": 1.3705, + "step": 45660 + }, + { + "epoch": 4.32, + "grad_norm": 62.1875, + "learning_rate": 1.3632074285540336e-07, + "loss": 1.3257, + "step": 45680 + }, + { + "epoch": 4.32, + "grad_norm": 54.1875, + "learning_rate": 1.3594259932459784e-07, + "loss": 1.2859, + "step": 45700 + }, + { + "epoch": 4.32, + "grad_norm": 56.84375, + "learning_rate": 1.3556445579379229e-07, + "loss": 1.3751, + "step": 45720 + }, + { + "epoch": 4.32, + "grad_norm": 51.90625, + "learning_rate": 1.351863122629867e-07, + "loss": 1.3439, + "step": 45740 + }, + { + "epoch": 4.33, + "grad_norm": 58.6875, + "learning_rate": 1.3480816873218116e-07, + "loss": 1.3596, + "step": 45760 + }, + { + "epoch": 4.33, + "grad_norm": 62.71875, + "learning_rate": 1.344300252013756e-07, + "loss": 1.3372, + "step": 45780 + }, + { + "epoch": 4.33, + "grad_norm": 62.5625, + "learning_rate": 1.3405188167057005e-07, + "loss": 1.2702, + "step": 45800 + }, + { + "epoch": 4.33, + "grad_norm": 85.125, + "learning_rate": 1.336737381397645e-07, + "loss": 1.3388, + "step": 45820 + }, + { + "epoch": 4.33, + "grad_norm": 86.875, + "learning_rate": 1.3329559460895892e-07, + "loss": 1.2513, + "step": 45840 + }, + { + "epoch": 4.34, + "grad_norm": 70.8125, + "learning_rate": 1.3291745107815337e-07, + "loss": 1.3841, + "step": 45860 + }, + { + "epoch": 4.34, + "grad_norm": 64.8125, + "learning_rate": 1.3253930754734782e-07, + "loss": 1.2639, + "step": 45880 + }, + { + "epoch": 4.34, + "grad_norm": 64.5, + "learning_rate": 1.3216116401654226e-07, + "loss": 1.4525, + "step": 45900 + }, + { + "epoch": 4.34, + "grad_norm": 76.0, + "learning_rate": 1.3178302048573668e-07, + "loss": 1.3358, + "step": 45920 + }, + { + "epoch": 4.34, + "grad_norm": 88.0625, + "learning_rate": 1.3140487695493113e-07, + "loss": 1.3377, + "step": 45940 + }, + { + "epoch": 4.34, + "grad_norm": 56.9375, + "learning_rate": 1.310267334241256e-07, + "loss": 1.4271, + "step": 45960 + }, + { + "epoch": 4.35, + "grad_norm": 67.4375, + "learning_rate": 1.3064858989332003e-07, + "loss": 1.3373, + "step": 45980 + }, + { + "epoch": 4.35, + "grad_norm": 69.8125, + "learning_rate": 1.3027044636251448e-07, + "loss": 1.3895, + "step": 46000 + }, + { + "epoch": 4.35, + "grad_norm": 66.125, + "learning_rate": 1.298923028317089e-07, + "loss": 1.354, + "step": 46020 + }, + { + "epoch": 4.35, + "grad_norm": 49.0625, + "learning_rate": 1.2951415930090337e-07, + "loss": 1.3607, + "step": 46040 + }, + { + "epoch": 4.35, + "grad_norm": 66.8125, + "learning_rate": 1.2913601577009782e-07, + "loss": 1.2385, + "step": 46060 + }, + { + "epoch": 4.36, + "grad_norm": 79.0, + "learning_rate": 1.2875787223929224e-07, + "loss": 1.2652, + "step": 46080 + }, + { + "epoch": 4.36, + "grad_norm": 58.09375, + "learning_rate": 1.283797287084867e-07, + "loss": 1.2984, + "step": 46100 + }, + { + "epoch": 4.36, + "grad_norm": 64.75, + "learning_rate": 1.280015851776811e-07, + "loss": 1.2636, + "step": 46120 + }, + { + "epoch": 4.36, + "grad_norm": 52.625, + "learning_rate": 1.2762344164687558e-07, + "loss": 1.4068, + "step": 46140 + }, + { + "epoch": 4.36, + "grad_norm": 53.9375, + "learning_rate": 1.2724529811607e-07, + "loss": 1.3923, + "step": 46160 + }, + { + "epoch": 4.37, + "grad_norm": 55.40625, + "learning_rate": 1.2686715458526445e-07, + "loss": 1.3176, + "step": 46180 + }, + { + "epoch": 4.37, + "grad_norm": 70.4375, + "learning_rate": 1.264890110544589e-07, + "loss": 1.34, + "step": 46200 + }, + { + "epoch": 4.37, + "grad_norm": 75.875, + "learning_rate": 1.2611086752365335e-07, + "loss": 1.3322, + "step": 46220 + }, + { + "epoch": 4.37, + "grad_norm": 66.75, + "learning_rate": 1.257327239928478e-07, + "loss": 1.3074, + "step": 46240 + }, + { + "epoch": 4.37, + "grad_norm": 77.4375, + "learning_rate": 1.2535458046204222e-07, + "loss": 1.2949, + "step": 46260 + }, + { + "epoch": 4.38, + "grad_norm": 59.625, + "learning_rate": 1.2497643693123666e-07, + "loss": 1.2506, + "step": 46280 + }, + { + "epoch": 4.38, + "grad_norm": 57.75, + "learning_rate": 1.245982934004311e-07, + "loss": 1.3649, + "step": 46300 + }, + { + "epoch": 4.38, + "grad_norm": 74.375, + "learning_rate": 1.2422014986962556e-07, + "loss": 1.3469, + "step": 46320 + }, + { + "epoch": 4.38, + "grad_norm": 65.5625, + "learning_rate": 1.2384200633882e-07, + "loss": 1.4123, + "step": 46340 + }, + { + "epoch": 4.38, + "grad_norm": 76.875, + "learning_rate": 1.2346386280801443e-07, + "loss": 1.3209, + "step": 46360 + }, + { + "epoch": 4.38, + "grad_norm": 64.75, + "learning_rate": 1.2308571927720888e-07, + "loss": 1.3176, + "step": 46380 + }, + { + "epoch": 4.39, + "grad_norm": 74.8125, + "learning_rate": 1.2270757574640333e-07, + "loss": 1.3496, + "step": 46400 + }, + { + "epoch": 4.39, + "grad_norm": 219.875, + "learning_rate": 1.2232943221559777e-07, + "loss": 1.3346, + "step": 46420 + }, + { + "epoch": 4.39, + "grad_norm": 53.90625, + "learning_rate": 1.2195128868479222e-07, + "loss": 1.3611, + "step": 46440 + }, + { + "epoch": 4.39, + "grad_norm": 66.375, + "learning_rate": 1.2157314515398667e-07, + "loss": 1.3481, + "step": 46460 + }, + { + "epoch": 4.39, + "grad_norm": 82.375, + "learning_rate": 1.211950016231811e-07, + "loss": 1.4615, + "step": 46480 + }, + { + "epoch": 4.4, + "grad_norm": 66.9375, + "learning_rate": 1.2081685809237554e-07, + "loss": 1.2017, + "step": 46500 + }, + { + "epoch": 4.4, + "grad_norm": 53.75, + "learning_rate": 1.2043871456156999e-07, + "loss": 1.3885, + "step": 46520 + }, + { + "epoch": 4.4, + "grad_norm": 74.75, + "learning_rate": 1.2006057103076443e-07, + "loss": 1.3945, + "step": 46540 + }, + { + "epoch": 4.4, + "grad_norm": 53.8125, + "learning_rate": 1.1968242749995888e-07, + "loss": 1.4098, + "step": 46560 + }, + { + "epoch": 4.4, + "grad_norm": 54.5625, + "learning_rate": 1.193042839691533e-07, + "loss": 1.2309, + "step": 46580 + }, + { + "epoch": 4.41, + "grad_norm": 76.375, + "learning_rate": 1.1892614043834775e-07, + "loss": 1.3407, + "step": 46600 + }, + { + "epoch": 4.41, + "grad_norm": 69.875, + "learning_rate": 1.185479969075422e-07, + "loss": 1.3216, + "step": 46620 + }, + { + "epoch": 4.41, + "grad_norm": 89.9375, + "learning_rate": 1.1816985337673665e-07, + "loss": 1.4638, + "step": 46640 + }, + { + "epoch": 4.41, + "grad_norm": 68.625, + "learning_rate": 1.1779170984593108e-07, + "loss": 1.3594, + "step": 46660 + }, + { + "epoch": 4.41, + "grad_norm": 65.1875, + "learning_rate": 1.1741356631512553e-07, + "loss": 1.279, + "step": 46680 + }, + { + "epoch": 4.41, + "grad_norm": 83.6875, + "learning_rate": 1.1703542278431996e-07, + "loss": 1.2234, + "step": 46700 + }, + { + "epoch": 4.42, + "grad_norm": 65.4375, + "learning_rate": 1.1665727925351441e-07, + "loss": 1.2726, + "step": 46720 + }, + { + "epoch": 4.42, + "grad_norm": 71.375, + "learning_rate": 1.1627913572270886e-07, + "loss": 1.3769, + "step": 46740 + }, + { + "epoch": 4.42, + "grad_norm": 77.5625, + "learning_rate": 1.159009921919033e-07, + "loss": 1.3536, + "step": 46760 + }, + { + "epoch": 4.42, + "grad_norm": 67.0, + "learning_rate": 1.1552284866109774e-07, + "loss": 1.2303, + "step": 46780 + }, + { + "epoch": 4.42, + "grad_norm": 63.15625, + "learning_rate": 1.1514470513029219e-07, + "loss": 1.3474, + "step": 46800 + }, + { + "epoch": 4.43, + "grad_norm": 57.59375, + "learning_rate": 1.1476656159948662e-07, + "loss": 1.3236, + "step": 46820 + }, + { + "epoch": 4.43, + "grad_norm": 73.0625, + "learning_rate": 1.1438841806868106e-07, + "loss": 1.2705, + "step": 46840 + }, + { + "epoch": 4.43, + "grad_norm": 69.5625, + "learning_rate": 1.1401027453787552e-07, + "loss": 1.3801, + "step": 46860 + }, + { + "epoch": 4.43, + "grad_norm": 58.5625, + "learning_rate": 1.1363213100706995e-07, + "loss": 1.3132, + "step": 46880 + }, + { + "epoch": 4.43, + "grad_norm": 69.75, + "learning_rate": 1.132539874762644e-07, + "loss": 1.3228, + "step": 46900 + }, + { + "epoch": 4.44, + "grad_norm": 56.625, + "learning_rate": 1.1287584394545883e-07, + "loss": 1.3565, + "step": 46920 + }, + { + "epoch": 4.44, + "grad_norm": 92.1875, + "learning_rate": 1.1249770041465328e-07, + "loss": 1.4107, + "step": 46940 + }, + { + "epoch": 4.44, + "grad_norm": 75.375, + "learning_rate": 1.1211955688384772e-07, + "loss": 1.359, + "step": 46960 + }, + { + "epoch": 4.44, + "grad_norm": 79.875, + "learning_rate": 1.1174141335304218e-07, + "loss": 1.3833, + "step": 46980 + }, + { + "epoch": 4.44, + "grad_norm": 68.0625, + "learning_rate": 1.1136326982223661e-07, + "loss": 1.3504, + "step": 47000 + }, + { + "epoch": 4.45, + "grad_norm": 83.9375, + "learning_rate": 1.1098512629143106e-07, + "loss": 1.3007, + "step": 47020 + }, + { + "epoch": 4.45, + "grad_norm": 65.6875, + "learning_rate": 1.106069827606255e-07, + "loss": 1.3499, + "step": 47040 + }, + { + "epoch": 4.45, + "grad_norm": 63.0, + "learning_rate": 1.1022883922981993e-07, + "loss": 1.3585, + "step": 47060 + }, + { + "epoch": 4.45, + "grad_norm": 85.0, + "learning_rate": 1.0985069569901438e-07, + "loss": 1.3445, + "step": 47080 + }, + { + "epoch": 4.45, + "grad_norm": 54.78125, + "learning_rate": 1.0947255216820881e-07, + "loss": 1.2835, + "step": 47100 + }, + { + "epoch": 4.45, + "grad_norm": 61.59375, + "learning_rate": 1.0909440863740327e-07, + "loss": 1.2405, + "step": 47120 + }, + { + "epoch": 4.46, + "grad_norm": 90.25, + "learning_rate": 1.0871626510659771e-07, + "loss": 1.2945, + "step": 47140 + }, + { + "epoch": 4.46, + "grad_norm": 56.28125, + "learning_rate": 1.0833812157579216e-07, + "loss": 1.3271, + "step": 47160 + }, + { + "epoch": 4.46, + "grad_norm": 56.03125, + "learning_rate": 1.0795997804498659e-07, + "loss": 1.3141, + "step": 47180 + }, + { + "epoch": 4.46, + "grad_norm": 50.59375, + "learning_rate": 1.0758183451418104e-07, + "loss": 1.351, + "step": 47200 + }, + { + "epoch": 4.46, + "grad_norm": 65.5, + "learning_rate": 1.0720369098337547e-07, + "loss": 1.3218, + "step": 47220 + }, + { + "epoch": 4.47, + "grad_norm": 64.6875, + "learning_rate": 1.0682554745256993e-07, + "loss": 1.3122, + "step": 47240 + }, + { + "epoch": 4.47, + "grad_norm": 51.875, + "learning_rate": 1.0644740392176437e-07, + "loss": 1.3517, + "step": 47260 + }, + { + "epoch": 4.47, + "grad_norm": 78.9375, + "learning_rate": 1.0606926039095882e-07, + "loss": 1.4208, + "step": 47280 + }, + { + "epoch": 4.47, + "grad_norm": 51.4375, + "learning_rate": 1.0569111686015325e-07, + "loss": 1.336, + "step": 47300 + }, + { + "epoch": 4.47, + "grad_norm": 86.375, + "learning_rate": 1.0531297332934768e-07, + "loss": 1.3902, + "step": 47320 + }, + { + "epoch": 4.48, + "grad_norm": 88.625, + "learning_rate": 1.0493482979854213e-07, + "loss": 1.3191, + "step": 47340 + }, + { + "epoch": 4.48, + "grad_norm": 86.0625, + "learning_rate": 1.0455668626773658e-07, + "loss": 1.307, + "step": 47360 + }, + { + "epoch": 4.48, + "grad_norm": 67.75, + "learning_rate": 1.0417854273693103e-07, + "loss": 1.3867, + "step": 47380 + }, + { + "epoch": 4.48, + "grad_norm": 61.125, + "learning_rate": 1.0380039920612546e-07, + "loss": 1.3503, + "step": 47400 + }, + { + "epoch": 4.48, + "grad_norm": 62.9375, + "learning_rate": 1.0342225567531991e-07, + "loss": 1.271, + "step": 47420 + }, + { + "epoch": 4.48, + "grad_norm": 60.4375, + "learning_rate": 1.0304411214451434e-07, + "loss": 1.3358, + "step": 47440 + }, + { + "epoch": 4.49, + "grad_norm": 54.875, + "learning_rate": 1.0266596861370879e-07, + "loss": 1.3072, + "step": 47460 + }, + { + "epoch": 4.49, + "grad_norm": 60.0625, + "learning_rate": 1.0228782508290324e-07, + "loss": 1.4395, + "step": 47480 + }, + { + "epoch": 4.49, + "grad_norm": 55.0625, + "learning_rate": 1.0190968155209769e-07, + "loss": 1.2708, + "step": 47500 + }, + { + "epoch": 4.49, + "grad_norm": 55.59375, + "learning_rate": 1.0153153802129212e-07, + "loss": 1.3655, + "step": 47520 + }, + { + "epoch": 4.49, + "grad_norm": 70.8125, + "learning_rate": 1.0115339449048656e-07, + "loss": 1.3221, + "step": 47540 + }, + { + "epoch": 4.5, + "grad_norm": 58.125, + "learning_rate": 1.00775250959681e-07, + "loss": 1.3146, + "step": 47560 + }, + { + "epoch": 4.5, + "grad_norm": 82.375, + "learning_rate": 1.0039710742887544e-07, + "loss": 1.4091, + "step": 47580 + }, + { + "epoch": 4.5, + "grad_norm": 81.6875, + "learning_rate": 1.000189638980699e-07, + "loss": 1.4975, + "step": 47600 + }, + { + "epoch": 4.5, + "grad_norm": 64.625, + "learning_rate": 9.964082036726433e-08, + "loss": 1.2658, + "step": 47620 + }, + { + "epoch": 4.5, + "grad_norm": 66.5, + "learning_rate": 9.926267683645878e-08, + "loss": 1.343, + "step": 47640 + }, + { + "epoch": 4.51, + "grad_norm": 70.0, + "learning_rate": 9.888453330565322e-08, + "loss": 1.2768, + "step": 47660 + }, + { + "epoch": 4.51, + "grad_norm": 71.9375, + "learning_rate": 9.850638977484766e-08, + "loss": 1.2412, + "step": 47680 + }, + { + "epoch": 4.51, + "grad_norm": 61.0, + "learning_rate": 9.81282462440421e-08, + "loss": 1.3141, + "step": 47700 + }, + { + "epoch": 4.51, + "grad_norm": 81.125, + "learning_rate": 9.775010271323656e-08, + "loss": 1.3319, + "step": 47720 + }, + { + "epoch": 4.51, + "grad_norm": 70.6875, + "learning_rate": 9.7371959182431e-08, + "loss": 1.3988, + "step": 47740 + }, + { + "epoch": 4.52, + "grad_norm": 61.25, + "learning_rate": 9.699381565162544e-08, + "loss": 1.2966, + "step": 47760 + }, + { + "epoch": 4.52, + "grad_norm": 53.59375, + "learning_rate": 9.661567212081988e-08, + "loss": 1.3256, + "step": 47780 + }, + { + "epoch": 4.52, + "grad_norm": 63.375, + "learning_rate": 9.623752859001431e-08, + "loss": 1.3215, + "step": 47800 + }, + { + "epoch": 4.52, + "grad_norm": 111.6875, + "learning_rate": 9.585938505920876e-08, + "loss": 1.3497, + "step": 47820 + }, + { + "epoch": 4.52, + "grad_norm": 64.625, + "learning_rate": 9.54812415284032e-08, + "loss": 1.3466, + "step": 47840 + }, + { + "epoch": 4.52, + "grad_norm": 71.125, + "learning_rate": 9.510309799759766e-08, + "loss": 1.3301, + "step": 47860 + }, + { + "epoch": 4.53, + "grad_norm": 64.0625, + "learning_rate": 9.472495446679209e-08, + "loss": 1.4499, + "step": 47880 + }, + { + "epoch": 4.53, + "grad_norm": 75.0625, + "learning_rate": 9.434681093598654e-08, + "loss": 1.4131, + "step": 47900 + }, + { + "epoch": 4.53, + "grad_norm": 57.90625, + "learning_rate": 9.396866740518097e-08, + "loss": 1.3384, + "step": 47920 + }, + { + "epoch": 4.53, + "grad_norm": 60.09375, + "learning_rate": 9.359052387437542e-08, + "loss": 1.3501, + "step": 47940 + }, + { + "epoch": 4.53, + "grad_norm": 54.53125, + "learning_rate": 9.321238034356985e-08, + "loss": 1.3278, + "step": 47960 + }, + { + "epoch": 4.54, + "grad_norm": 58.0, + "learning_rate": 9.283423681276432e-08, + "loss": 1.3404, + "step": 47980 + }, + { + "epoch": 4.54, + "grad_norm": 63.53125, + "learning_rate": 9.245609328195875e-08, + "loss": 1.3284, + "step": 48000 + }, + { + "epoch": 4.54, + "grad_norm": 79.8125, + "learning_rate": 9.207794975115318e-08, + "loss": 1.3528, + "step": 48020 + }, + { + "epoch": 4.54, + "grad_norm": 68.3125, + "learning_rate": 9.169980622034763e-08, + "loss": 1.4362, + "step": 48040 + }, + { + "epoch": 4.54, + "grad_norm": 71.4375, + "learning_rate": 9.132166268954207e-08, + "loss": 1.4823, + "step": 48060 + }, + { + "epoch": 4.55, + "grad_norm": 95.5, + "learning_rate": 9.094351915873651e-08, + "loss": 1.3332, + "step": 48080 + }, + { + "epoch": 4.55, + "grad_norm": 70.9375, + "learning_rate": 9.056537562793096e-08, + "loss": 1.3638, + "step": 48100 + }, + { + "epoch": 4.55, + "grad_norm": 70.5625, + "learning_rate": 9.018723209712541e-08, + "loss": 1.3538, + "step": 48120 + }, + { + "epoch": 4.55, + "grad_norm": 54.09375, + "learning_rate": 8.980908856631984e-08, + "loss": 1.3523, + "step": 48140 + }, + { + "epoch": 4.55, + "grad_norm": 84.375, + "learning_rate": 8.943094503551429e-08, + "loss": 1.3176, + "step": 48160 + }, + { + "epoch": 4.55, + "grad_norm": 64.25, + "learning_rate": 8.905280150470873e-08, + "loss": 1.4906, + "step": 48180 + }, + { + "epoch": 4.56, + "grad_norm": 52.375, + "learning_rate": 8.867465797390317e-08, + "loss": 1.3775, + "step": 48200 + }, + { + "epoch": 4.56, + "grad_norm": 75.9375, + "learning_rate": 8.829651444309762e-08, + "loss": 1.2116, + "step": 48220 + }, + { + "epoch": 4.56, + "grad_norm": 62.75, + "learning_rate": 8.791837091229207e-08, + "loss": 1.2772, + "step": 48240 + }, + { + "epoch": 4.56, + "grad_norm": 61.21875, + "learning_rate": 8.75402273814865e-08, + "loss": 1.4184, + "step": 48260 + }, + { + "epoch": 4.56, + "grad_norm": 67.25, + "learning_rate": 8.716208385068094e-08, + "loss": 1.3029, + "step": 48280 + }, + { + "epoch": 4.57, + "grad_norm": 62.09375, + "learning_rate": 8.678394031987539e-08, + "loss": 1.3634, + "step": 48300 + }, + { + "epoch": 4.57, + "grad_norm": 79.5, + "learning_rate": 8.640579678906982e-08, + "loss": 1.3963, + "step": 48320 + }, + { + "epoch": 4.57, + "grad_norm": 92.5625, + "learning_rate": 8.602765325826428e-08, + "loss": 1.3531, + "step": 48340 + }, + { + "epoch": 4.57, + "grad_norm": 78.75, + "learning_rate": 8.564950972745872e-08, + "loss": 1.3542, + "step": 48360 + }, + { + "epoch": 4.57, + "grad_norm": 79.8125, + "learning_rate": 8.527136619665317e-08, + "loss": 1.3637, + "step": 48380 + }, + { + "epoch": 4.58, + "grad_norm": 83.0625, + "learning_rate": 8.48932226658476e-08, + "loss": 1.3512, + "step": 48400 + }, + { + "epoch": 4.58, + "grad_norm": 45.3125, + "learning_rate": 8.451507913504205e-08, + "loss": 1.242, + "step": 48420 + }, + { + "epoch": 4.58, + "grad_norm": 78.125, + "learning_rate": 8.413693560423648e-08, + "loss": 1.3456, + "step": 48440 + }, + { + "epoch": 4.58, + "grad_norm": 67.5, + "learning_rate": 8.375879207343094e-08, + "loss": 1.2762, + "step": 48460 + }, + { + "epoch": 4.58, + "grad_norm": 59.4375, + "learning_rate": 8.338064854262538e-08, + "loss": 1.3527, + "step": 48480 + }, + { + "epoch": 4.58, + "grad_norm": 75.75, + "learning_rate": 8.300250501181981e-08, + "loss": 1.3539, + "step": 48500 + }, + { + "epoch": 4.59, + "grad_norm": 71.625, + "learning_rate": 8.262436148101426e-08, + "loss": 1.2919, + "step": 48520 + }, + { + "epoch": 4.59, + "grad_norm": 77.4375, + "learning_rate": 8.22462179502087e-08, + "loss": 1.2577, + "step": 48540 + }, + { + "epoch": 4.59, + "grad_norm": 56.90625, + "learning_rate": 8.186807441940314e-08, + "loss": 1.286, + "step": 48560 + }, + { + "epoch": 4.59, + "grad_norm": 60.75, + "learning_rate": 8.148993088859758e-08, + "loss": 1.4127, + "step": 48580 + }, + { + "epoch": 4.59, + "grad_norm": 53.875, + "learning_rate": 8.111178735779204e-08, + "loss": 1.2502, + "step": 48600 + }, + { + "epoch": 4.6, + "grad_norm": 62.5, + "learning_rate": 8.073364382698647e-08, + "loss": 1.2631, + "step": 48620 + }, + { + "epoch": 4.6, + "grad_norm": 70.3125, + "learning_rate": 8.035550029618092e-08, + "loss": 1.2837, + "step": 48640 + }, + { + "epoch": 4.6, + "grad_norm": 49.3125, + "learning_rate": 7.997735676537535e-08, + "loss": 1.3133, + "step": 48660 + }, + { + "epoch": 4.6, + "grad_norm": 60.4375, + "learning_rate": 7.95992132345698e-08, + "loss": 1.4197, + "step": 48680 + }, + { + "epoch": 4.6, + "grad_norm": 60.8125, + "learning_rate": 7.922106970376424e-08, + "loss": 1.3261, + "step": 48700 + }, + { + "epoch": 4.61, + "grad_norm": 77.0, + "learning_rate": 7.884292617295868e-08, + "loss": 1.2969, + "step": 48720 + }, + { + "epoch": 4.61, + "grad_norm": 58.09375, + "learning_rate": 7.846478264215313e-08, + "loss": 1.2446, + "step": 48740 + }, + { + "epoch": 4.61, + "grad_norm": 52.125, + "learning_rate": 7.808663911134757e-08, + "loss": 1.2991, + "step": 48760 + }, + { + "epoch": 4.61, + "grad_norm": 62.4375, + "learning_rate": 7.770849558054201e-08, + "loss": 1.2746, + "step": 48780 + }, + { + "epoch": 4.61, + "grad_norm": 53.84375, + "learning_rate": 7.733035204973645e-08, + "loss": 1.3841, + "step": 48800 + }, + { + "epoch": 4.62, + "grad_norm": 66.625, + "learning_rate": 7.69522085189309e-08, + "loss": 1.3453, + "step": 48820 + }, + { + "epoch": 4.62, + "grad_norm": 84.25, + "learning_rate": 7.657406498812534e-08, + "loss": 1.2658, + "step": 48840 + }, + { + "epoch": 4.62, + "grad_norm": 76.75, + "learning_rate": 7.619592145731979e-08, + "loss": 1.2577, + "step": 48860 + }, + { + "epoch": 4.62, + "grad_norm": 78.125, + "learning_rate": 7.581777792651423e-08, + "loss": 1.3535, + "step": 48880 + }, + { + "epoch": 4.62, + "grad_norm": 65.375, + "learning_rate": 7.543963439570867e-08, + "loss": 1.2434, + "step": 48900 + }, + { + "epoch": 4.62, + "grad_norm": 67.3125, + "learning_rate": 7.506149086490311e-08, + "loss": 1.2902, + "step": 48920 + }, + { + "epoch": 4.63, + "grad_norm": 67.9375, + "learning_rate": 7.468334733409756e-08, + "loss": 1.4072, + "step": 48940 + }, + { + "epoch": 4.63, + "grad_norm": 68.8125, + "learning_rate": 7.4305203803292e-08, + "loss": 1.281, + "step": 48960 + }, + { + "epoch": 4.63, + "grad_norm": 75.0625, + "learning_rate": 7.392706027248644e-08, + "loss": 1.2644, + "step": 48980 + }, + { + "epoch": 4.63, + "grad_norm": 82.875, + "learning_rate": 7.354891674168089e-08, + "loss": 1.4248, + "step": 49000 + }, + { + "epoch": 4.63, + "grad_norm": 63.3125, + "learning_rate": 7.317077321087532e-08, + "loss": 1.4421, + "step": 49020 + }, + { + "epoch": 4.64, + "grad_norm": 79.1875, + "learning_rate": 7.279262968006977e-08, + "loss": 1.3399, + "step": 49040 + }, + { + "epoch": 4.64, + "grad_norm": 65.3125, + "learning_rate": 7.24144861492642e-08, + "loss": 1.3482, + "step": 49060 + }, + { + "epoch": 4.64, + "grad_norm": 74.5, + "learning_rate": 7.203634261845867e-08, + "loss": 1.3647, + "step": 49080 + }, + { + "epoch": 4.64, + "grad_norm": 72.25, + "learning_rate": 7.16581990876531e-08, + "loss": 1.2468, + "step": 49100 + }, + { + "epoch": 4.64, + "grad_norm": 61.5, + "learning_rate": 7.128005555684755e-08, + "loss": 1.3302, + "step": 49120 + }, + { + "epoch": 4.65, + "grad_norm": 70.875, + "learning_rate": 7.090191202604198e-08, + "loss": 1.3486, + "step": 49140 + }, + { + "epoch": 4.65, + "grad_norm": 69.625, + "learning_rate": 7.052376849523643e-08, + "loss": 1.2402, + "step": 49160 + }, + { + "epoch": 4.65, + "grad_norm": 71.25, + "learning_rate": 7.014562496443086e-08, + "loss": 1.3408, + "step": 49180 + }, + { + "epoch": 4.65, + "grad_norm": 61.40625, + "learning_rate": 6.97674814336253e-08, + "loss": 1.3726, + "step": 49200 + }, + { + "epoch": 4.65, + "grad_norm": 45.6875, + "learning_rate": 6.938933790281976e-08, + "loss": 1.2908, + "step": 49220 + }, + { + "epoch": 4.65, + "grad_norm": 66.125, + "learning_rate": 6.90111943720142e-08, + "loss": 1.2688, + "step": 49240 + }, + { + "epoch": 4.66, + "grad_norm": 44.46875, + "learning_rate": 6.863305084120864e-08, + "loss": 1.2785, + "step": 49260 + }, + { + "epoch": 4.66, + "grad_norm": 76.25, + "learning_rate": 6.825490731040308e-08, + "loss": 1.2984, + "step": 49280 + }, + { + "epoch": 4.66, + "grad_norm": 76.9375, + "learning_rate": 6.787676377959752e-08, + "loss": 1.394, + "step": 49300 + }, + { + "epoch": 4.66, + "grad_norm": 62.21875, + "learning_rate": 6.749862024879196e-08, + "loss": 1.336, + "step": 49320 + }, + { + "epoch": 4.66, + "grad_norm": 67.875, + "learning_rate": 6.712047671798642e-08, + "loss": 1.3415, + "step": 49340 + }, + { + "epoch": 4.67, + "grad_norm": 57.78125, + "learning_rate": 6.674233318718085e-08, + "loss": 1.428, + "step": 49360 + }, + { + "epoch": 4.67, + "grad_norm": 74.8125, + "learning_rate": 6.63641896563753e-08, + "loss": 1.2926, + "step": 49380 + }, + { + "epoch": 4.67, + "grad_norm": 58.21875, + "learning_rate": 6.598604612556974e-08, + "loss": 1.3439, + "step": 49400 + }, + { + "epoch": 4.67, + "grad_norm": 52.34375, + "learning_rate": 6.560790259476418e-08, + "loss": 1.2582, + "step": 49420 + }, + { + "epoch": 4.67, + "grad_norm": 63.09375, + "learning_rate": 6.522975906395862e-08, + "loss": 1.385, + "step": 49440 + }, + { + "epoch": 4.68, + "grad_norm": 64.4375, + "learning_rate": 6.485161553315307e-08, + "loss": 1.3645, + "step": 49460 + }, + { + "epoch": 4.68, + "grad_norm": 74.875, + "learning_rate": 6.447347200234751e-08, + "loss": 1.3553, + "step": 49480 + }, + { + "epoch": 4.68, + "grad_norm": 76.5, + "learning_rate": 6.409532847154195e-08, + "loss": 1.3469, + "step": 49500 + }, + { + "epoch": 4.68, + "grad_norm": 62.90625, + "learning_rate": 6.37171849407364e-08, + "loss": 1.2842, + "step": 49520 + }, + { + "epoch": 4.68, + "grad_norm": 57.90625, + "learning_rate": 6.333904140993083e-08, + "loss": 1.2475, + "step": 49540 + }, + { + "epoch": 4.69, + "grad_norm": 51.21875, + "learning_rate": 6.296089787912528e-08, + "loss": 1.4054, + "step": 49560 + }, + { + "epoch": 4.69, + "grad_norm": 63.75, + "learning_rate": 6.258275434831973e-08, + "loss": 1.2913, + "step": 49580 + }, + { + "epoch": 4.69, + "grad_norm": 62.875, + "learning_rate": 6.220461081751416e-08, + "loss": 1.2777, + "step": 49600 + }, + { + "epoch": 4.69, + "grad_norm": 67.8125, + "learning_rate": 6.182646728670861e-08, + "loss": 1.4151, + "step": 49620 + }, + { + "epoch": 4.69, + "grad_norm": 68.8125, + "learning_rate": 6.144832375590306e-08, + "loss": 1.3384, + "step": 49640 + }, + { + "epoch": 4.69, + "grad_norm": 93.875, + "learning_rate": 6.107018022509749e-08, + "loss": 1.3058, + "step": 49660 + }, + { + "epoch": 4.7, + "grad_norm": 106.4375, + "learning_rate": 6.069203669429194e-08, + "loss": 1.358, + "step": 49680 + }, + { + "epoch": 4.7, + "grad_norm": 59.34375, + "learning_rate": 6.031389316348639e-08, + "loss": 1.3103, + "step": 49700 + }, + { + "epoch": 4.7, + "grad_norm": 67.625, + "learning_rate": 5.993574963268082e-08, + "loss": 1.2339, + "step": 49720 + }, + { + "epoch": 4.7, + "grad_norm": 74.5, + "learning_rate": 5.955760610187527e-08, + "loss": 1.3031, + "step": 49740 + }, + { + "epoch": 4.7, + "grad_norm": 79.8125, + "learning_rate": 5.917946257106971e-08, + "loss": 1.2387, + "step": 49760 + }, + { + "epoch": 4.71, + "grad_norm": 60.5, + "learning_rate": 5.880131904026415e-08, + "loss": 1.2539, + "step": 49780 + }, + { + "epoch": 4.71, + "grad_norm": 79.75, + "learning_rate": 5.842317550945859e-08, + "loss": 1.3769, + "step": 49800 + }, + { + "epoch": 4.71, + "grad_norm": 86.0625, + "learning_rate": 5.8045031978653034e-08, + "loss": 1.3498, + "step": 49820 + }, + { + "epoch": 4.71, + "grad_norm": 66.875, + "learning_rate": 5.7666888447847475e-08, + "loss": 1.345, + "step": 49840 + }, + { + "epoch": 4.71, + "grad_norm": 61.03125, + "learning_rate": 5.728874491704192e-08, + "loss": 1.3413, + "step": 49860 + }, + { + "epoch": 4.72, + "grad_norm": 69.1875, + "learning_rate": 5.6910601386236364e-08, + "loss": 1.3266, + "step": 49880 + }, + { + "epoch": 4.72, + "grad_norm": 115.125, + "learning_rate": 5.6532457855430805e-08, + "loss": 1.3336, + "step": 49900 + }, + { + "epoch": 4.72, + "grad_norm": 59.78125, + "learning_rate": 5.615431432462525e-08, + "loss": 1.4154, + "step": 49920 + }, + { + "epoch": 4.72, + "grad_norm": 62.8125, + "learning_rate": 5.5776170793819694e-08, + "loss": 1.4303, + "step": 49940 + }, + { + "epoch": 4.72, + "grad_norm": 88.4375, + "learning_rate": 5.5398027263014136e-08, + "loss": 1.4158, + "step": 49960 + }, + { + "epoch": 4.72, + "grad_norm": 75.25, + "learning_rate": 5.5019883732208583e-08, + "loss": 1.2602, + "step": 49980 + }, + { + "epoch": 4.73, + "grad_norm": 69.8125, + "learning_rate": 5.4641740201403024e-08, + "loss": 1.3976, + "step": 50000 + }, + { + "epoch": 4.73, + "grad_norm": 69.875, + "learning_rate": 5.4263596670597466e-08, + "loss": 1.304, + "step": 50020 + }, + { + "epoch": 4.73, + "grad_norm": 69.125, + "learning_rate": 5.388545313979191e-08, + "loss": 1.2925, + "step": 50040 + }, + { + "epoch": 4.73, + "grad_norm": 64.0, + "learning_rate": 5.350730960898635e-08, + "loss": 1.3776, + "step": 50060 + }, + { + "epoch": 4.73, + "grad_norm": 53.0, + "learning_rate": 5.312916607818079e-08, + "loss": 1.4188, + "step": 50080 + }, + { + "epoch": 4.74, + "grad_norm": 66.375, + "learning_rate": 5.275102254737524e-08, + "loss": 1.2999, + "step": 50100 + }, + { + "epoch": 4.74, + "grad_norm": 64.8125, + "learning_rate": 5.237287901656968e-08, + "loss": 1.4259, + "step": 50120 + }, + { + "epoch": 4.74, + "grad_norm": 58.84375, + "learning_rate": 5.199473548576412e-08, + "loss": 1.4155, + "step": 50140 + }, + { + "epoch": 4.74, + "grad_norm": 85.25, + "learning_rate": 5.161659195495857e-08, + "loss": 1.2742, + "step": 50160 + }, + { + "epoch": 4.74, + "grad_norm": 71.5625, + "learning_rate": 5.123844842415301e-08, + "loss": 1.3077, + "step": 50180 + }, + { + "epoch": 4.75, + "grad_norm": 70.5625, + "learning_rate": 5.086030489334745e-08, + "loss": 1.4397, + "step": 50200 + }, + { + "epoch": 4.75, + "grad_norm": 70.125, + "learning_rate": 5.04821613625419e-08, + "loss": 1.2983, + "step": 50220 + }, + { + "epoch": 4.75, + "grad_norm": 74.375, + "learning_rate": 5.010401783173634e-08, + "loss": 1.3589, + "step": 50240 + }, + { + "epoch": 4.75, + "grad_norm": 67.875, + "learning_rate": 4.972587430093078e-08, + "loss": 1.3045, + "step": 50260 + }, + { + "epoch": 4.75, + "grad_norm": 64.5, + "learning_rate": 4.934773077012522e-08, + "loss": 1.3912, + "step": 50280 + }, + { + "epoch": 4.76, + "grad_norm": 63.46875, + "learning_rate": 4.896958723931966e-08, + "loss": 1.3652, + "step": 50300 + }, + { + "epoch": 4.76, + "grad_norm": 72.75, + "learning_rate": 4.85914437085141e-08, + "loss": 1.3422, + "step": 50320 + }, + { + "epoch": 4.76, + "grad_norm": 53.46875, + "learning_rate": 4.821330017770855e-08, + "loss": 1.2532, + "step": 50340 + }, + { + "epoch": 4.76, + "grad_norm": 56.9375, + "learning_rate": 4.783515664690299e-08, + "loss": 1.4502, + "step": 50360 + }, + { + "epoch": 4.76, + "grad_norm": 51.40625, + "learning_rate": 4.745701311609743e-08, + "loss": 1.3256, + "step": 50380 + }, + { + "epoch": 4.76, + "grad_norm": 88.25, + "learning_rate": 4.707886958529188e-08, + "loss": 1.2456, + "step": 50400 + }, + { + "epoch": 4.77, + "grad_norm": 64.9375, + "learning_rate": 4.670072605448632e-08, + "loss": 1.3789, + "step": 50420 + }, + { + "epoch": 4.77, + "grad_norm": 69.6875, + "learning_rate": 4.632258252368076e-08, + "loss": 1.3511, + "step": 50440 + }, + { + "epoch": 4.77, + "grad_norm": 60.84375, + "learning_rate": 4.594443899287521e-08, + "loss": 1.3829, + "step": 50460 + }, + { + "epoch": 4.77, + "grad_norm": 65.75, + "learning_rate": 4.556629546206965e-08, + "loss": 1.316, + "step": 50480 + }, + { + "epoch": 4.77, + "grad_norm": 60.9375, + "learning_rate": 4.518815193126409e-08, + "loss": 1.3926, + "step": 50500 + }, + { + "epoch": 4.78, + "grad_norm": 68.9375, + "learning_rate": 4.481000840045853e-08, + "loss": 1.3642, + "step": 50520 + }, + { + "epoch": 4.78, + "grad_norm": 52.03125, + "learning_rate": 4.4431864869652975e-08, + "loss": 1.3064, + "step": 50540 + }, + { + "epoch": 4.78, + "grad_norm": 64.8125, + "learning_rate": 4.4053721338847417e-08, + "loss": 1.3323, + "step": 50560 + }, + { + "epoch": 4.78, + "grad_norm": 67.25, + "learning_rate": 4.367557780804186e-08, + "loss": 1.2735, + "step": 50580 + }, + { + "epoch": 4.78, + "grad_norm": 55.1875, + "learning_rate": 4.3297434277236306e-08, + "loss": 1.3655, + "step": 50600 + }, + { + "epoch": 4.79, + "grad_norm": 71.1875, + "learning_rate": 4.2919290746430747e-08, + "loss": 1.2776, + "step": 50620 + }, + { + "epoch": 4.79, + "grad_norm": 62.9375, + "learning_rate": 4.254114721562519e-08, + "loss": 1.2622, + "step": 50640 + }, + { + "epoch": 4.79, + "grad_norm": 79.0, + "learning_rate": 4.2163003684819636e-08, + "loss": 1.4572, + "step": 50660 + }, + { + "epoch": 4.79, + "grad_norm": 70.625, + "learning_rate": 4.178486015401408e-08, + "loss": 1.304, + "step": 50680 + }, + { + "epoch": 4.79, + "grad_norm": 68.5, + "learning_rate": 4.140671662320852e-08, + "loss": 1.3719, + "step": 50700 + }, + { + "epoch": 4.79, + "grad_norm": 65.0625, + "learning_rate": 4.1028573092402966e-08, + "loss": 1.3088, + "step": 50720 + }, + { + "epoch": 4.8, + "grad_norm": 63.25, + "learning_rate": 4.065042956159741e-08, + "loss": 1.3505, + "step": 50740 + }, + { + "epoch": 4.8, + "grad_norm": 61.8125, + "learning_rate": 4.027228603079184e-08, + "loss": 1.2913, + "step": 50760 + }, + { + "epoch": 4.8, + "grad_norm": 54.9375, + "learning_rate": 3.989414249998629e-08, + "loss": 1.3873, + "step": 50780 + }, + { + "epoch": 4.8, + "grad_norm": 62.9375, + "learning_rate": 3.951599896918073e-08, + "loss": 1.4092, + "step": 50800 + }, + { + "epoch": 4.8, + "grad_norm": 67.5, + "learning_rate": 3.913785543837517e-08, + "loss": 1.2893, + "step": 50820 + }, + { + "epoch": 4.81, + "grad_norm": 59.625, + "learning_rate": 3.875971190756962e-08, + "loss": 1.354, + "step": 50840 + }, + { + "epoch": 4.81, + "grad_norm": 54.25, + "learning_rate": 3.838156837676406e-08, + "loss": 1.3941, + "step": 50860 + }, + { + "epoch": 4.81, + "grad_norm": 72.8125, + "learning_rate": 3.80034248459585e-08, + "loss": 1.3033, + "step": 50880 + }, + { + "epoch": 4.81, + "grad_norm": 69.625, + "learning_rate": 3.762528131515295e-08, + "loss": 1.2482, + "step": 50900 + }, + { + "epoch": 4.81, + "grad_norm": 53.9375, + "learning_rate": 3.724713778434739e-08, + "loss": 1.3942, + "step": 50920 + }, + { + "epoch": 4.82, + "grad_norm": 89.1875, + "learning_rate": 3.686899425354183e-08, + "loss": 1.3397, + "step": 50940 + }, + { + "epoch": 4.82, + "grad_norm": 58.75, + "learning_rate": 3.649085072273628e-08, + "loss": 1.2749, + "step": 50960 + }, + { + "epoch": 4.82, + "grad_norm": 71.3125, + "learning_rate": 3.611270719193072e-08, + "loss": 1.3422, + "step": 50980 + }, + { + "epoch": 4.82, + "grad_norm": 54.34375, + "learning_rate": 3.5734563661125155e-08, + "loss": 1.2571, + "step": 51000 + }, + { + "epoch": 4.82, + "grad_norm": 58.84375, + "learning_rate": 3.53564201303196e-08, + "loss": 1.2357, + "step": 51020 + }, + { + "epoch": 4.83, + "grad_norm": 61.78125, + "learning_rate": 3.4978276599514044e-08, + "loss": 1.3384, + "step": 51040 + }, + { + "epoch": 4.83, + "grad_norm": 61.40625, + "learning_rate": 3.4600133068708485e-08, + "loss": 1.1818, + "step": 51060 + }, + { + "epoch": 4.83, + "grad_norm": 67.25, + "learning_rate": 3.422198953790293e-08, + "loss": 1.3936, + "step": 51080 + }, + { + "epoch": 4.83, + "grad_norm": 70.8125, + "learning_rate": 3.3843846007097374e-08, + "loss": 1.2922, + "step": 51100 + }, + { + "epoch": 4.83, + "grad_norm": 62.75, + "learning_rate": 3.3465702476291815e-08, + "loss": 1.4717, + "step": 51120 + }, + { + "epoch": 4.83, + "grad_norm": 55.375, + "learning_rate": 3.308755894548626e-08, + "loss": 1.342, + "step": 51140 + }, + { + "epoch": 4.84, + "grad_norm": 84.625, + "learning_rate": 3.2709415414680704e-08, + "loss": 1.3032, + "step": 51160 + }, + { + "epoch": 4.84, + "grad_norm": 67.625, + "learning_rate": 3.2331271883875145e-08, + "loss": 1.366, + "step": 51180 + }, + { + "epoch": 4.84, + "grad_norm": 67.625, + "learning_rate": 3.195312835306959e-08, + "loss": 1.3584, + "step": 51200 + }, + { + "epoch": 4.84, + "grad_norm": 65.25, + "learning_rate": 3.1574984822264034e-08, + "loss": 1.3087, + "step": 51220 + }, + { + "epoch": 4.84, + "grad_norm": 61.5, + "learning_rate": 3.1196841291458475e-08, + "loss": 1.3388, + "step": 51240 + }, + { + "epoch": 4.85, + "grad_norm": 63.78125, + "learning_rate": 3.0818697760652917e-08, + "loss": 1.2657, + "step": 51260 + }, + { + "epoch": 4.85, + "grad_norm": 68.75, + "learning_rate": 3.044055422984736e-08, + "loss": 1.3103, + "step": 51280 + }, + { + "epoch": 4.85, + "grad_norm": 56.4375, + "learning_rate": 3.00624106990418e-08, + "loss": 1.4243, + "step": 51300 + }, + { + "epoch": 4.85, + "grad_norm": 83.875, + "learning_rate": 2.9684267168236243e-08, + "loss": 1.2968, + "step": 51320 + }, + { + "epoch": 4.85, + "grad_norm": 64.3125, + "learning_rate": 2.9306123637430688e-08, + "loss": 1.313, + "step": 51340 + }, + { + "epoch": 4.86, + "grad_norm": 78.875, + "learning_rate": 2.892798010662513e-08, + "loss": 1.3133, + "step": 51360 + }, + { + "epoch": 4.86, + "grad_norm": 68.0625, + "learning_rate": 2.8549836575819574e-08, + "loss": 1.3116, + "step": 51380 + }, + { + "epoch": 4.86, + "grad_norm": 72.0625, + "learning_rate": 2.8171693045014018e-08, + "loss": 1.3143, + "step": 51400 + }, + { + "epoch": 4.86, + "grad_norm": 56.96875, + "learning_rate": 2.7793549514208456e-08, + "loss": 1.3107, + "step": 51420 + }, + { + "epoch": 4.86, + "grad_norm": 57.75, + "learning_rate": 2.74154059834029e-08, + "loss": 1.3432, + "step": 51440 + }, + { + "epoch": 4.86, + "grad_norm": 75.25, + "learning_rate": 2.7037262452597345e-08, + "loss": 1.3119, + "step": 51460 + }, + { + "epoch": 4.87, + "grad_norm": 81.875, + "learning_rate": 2.6659118921791786e-08, + "loss": 1.2609, + "step": 51480 + }, + { + "epoch": 4.87, + "grad_norm": 70.25, + "learning_rate": 2.628097539098623e-08, + "loss": 1.2975, + "step": 51500 + }, + { + "epoch": 4.87, + "grad_norm": 63.9375, + "learning_rate": 2.5902831860180668e-08, + "loss": 1.3698, + "step": 51520 + }, + { + "epoch": 4.87, + "grad_norm": 72.125, + "learning_rate": 2.5524688329375113e-08, + "loss": 1.3457, + "step": 51540 + }, + { + "epoch": 4.87, + "grad_norm": 62.4375, + "learning_rate": 2.5146544798569557e-08, + "loss": 1.3434, + "step": 51560 + }, + { + "epoch": 4.88, + "grad_norm": 92.9375, + "learning_rate": 2.4768401267763998e-08, + "loss": 1.3211, + "step": 51580 + }, + { + "epoch": 4.88, + "grad_norm": 59.6875, + "learning_rate": 2.4390257736958443e-08, + "loss": 1.3081, + "step": 51600 + }, + { + "epoch": 4.88, + "grad_norm": 62.09375, + "learning_rate": 2.4012114206152887e-08, + "loss": 1.333, + "step": 51620 + }, + { + "epoch": 4.88, + "grad_norm": 54.59375, + "learning_rate": 2.3633970675347325e-08, + "loss": 1.316, + "step": 51640 + }, + { + "epoch": 4.88, + "grad_norm": 70.9375, + "learning_rate": 2.325582714454177e-08, + "loss": 1.3066, + "step": 51660 + }, + { + "epoch": 4.89, + "grad_norm": 74.5625, + "learning_rate": 2.2877683613736214e-08, + "loss": 1.3551, + "step": 51680 + }, + { + "epoch": 4.89, + "grad_norm": 72.0, + "learning_rate": 2.2499540082930655e-08, + "loss": 1.3349, + "step": 51700 + }, + { + "epoch": 4.89, + "grad_norm": 69.0, + "learning_rate": 2.21213965521251e-08, + "loss": 1.2775, + "step": 51720 + }, + { + "epoch": 4.89, + "grad_norm": 74.4375, + "learning_rate": 2.1743253021319544e-08, + "loss": 1.3279, + "step": 51740 + }, + { + "epoch": 4.89, + "grad_norm": 79.25, + "learning_rate": 2.1365109490513982e-08, + "loss": 1.4207, + "step": 51760 + }, + { + "epoch": 4.9, + "grad_norm": 66.4375, + "learning_rate": 2.0986965959708426e-08, + "loss": 1.3499, + "step": 51780 + }, + { + "epoch": 4.9, + "grad_norm": 64.1875, + "learning_rate": 2.060882242890287e-08, + "loss": 1.3024, + "step": 51800 + }, + { + "epoch": 4.9, + "grad_norm": 53.90625, + "learning_rate": 2.0230678898097312e-08, + "loss": 1.2509, + "step": 51820 + }, + { + "epoch": 4.9, + "grad_norm": 58.78125, + "learning_rate": 1.9852535367291757e-08, + "loss": 1.3702, + "step": 51840 + }, + { + "epoch": 4.9, + "grad_norm": 57.5625, + "learning_rate": 1.94743918364862e-08, + "loss": 1.3503, + "step": 51860 + }, + { + "epoch": 4.9, + "grad_norm": 58.75, + "learning_rate": 1.909624830568064e-08, + "loss": 1.2938, + "step": 51880 + }, + { + "epoch": 4.91, + "grad_norm": 53.6875, + "learning_rate": 1.8718104774875083e-08, + "loss": 1.384, + "step": 51900 + }, + { + "epoch": 4.91, + "grad_norm": 69.6875, + "learning_rate": 1.8339961244069524e-08, + "loss": 1.3979, + "step": 51920 + }, + { + "epoch": 4.91, + "grad_norm": 62.40625, + "learning_rate": 1.796181771326397e-08, + "loss": 1.3146, + "step": 51940 + }, + { + "epoch": 4.91, + "grad_norm": 65.6875, + "learning_rate": 1.7583674182458413e-08, + "loss": 1.2778, + "step": 51960 + }, + { + "epoch": 4.91, + "grad_norm": 57.375, + "learning_rate": 1.7205530651652855e-08, + "loss": 1.2466, + "step": 51980 + }, + { + "epoch": 4.92, + "grad_norm": 77.25, + "learning_rate": 1.6827387120847296e-08, + "loss": 1.3475, + "step": 52000 + }, + { + "epoch": 4.92, + "grad_norm": 56.28125, + "learning_rate": 1.644924359004174e-08, + "loss": 1.2942, + "step": 52020 + }, + { + "epoch": 4.92, + "grad_norm": 68.5, + "learning_rate": 1.607110005923618e-08, + "loss": 1.3799, + "step": 52040 + }, + { + "epoch": 4.92, + "grad_norm": 57.90625, + "learning_rate": 1.5692956528430626e-08, + "loss": 1.366, + "step": 52060 + }, + { + "epoch": 4.92, + "grad_norm": 70.5625, + "learning_rate": 1.531481299762507e-08, + "loss": 1.3255, + "step": 52080 + }, + { + "epoch": 4.93, + "grad_norm": 59.0625, + "learning_rate": 1.493666946681951e-08, + "loss": 1.3114, + "step": 52100 + }, + { + "epoch": 4.93, + "grad_norm": 61.46875, + "learning_rate": 1.4558525936013954e-08, + "loss": 1.2898, + "step": 52120 + }, + { + "epoch": 4.93, + "grad_norm": 57.6875, + "learning_rate": 1.4180382405208397e-08, + "loss": 1.3028, + "step": 52140 + }, + { + "epoch": 4.93, + "grad_norm": 61.96875, + "learning_rate": 1.380223887440284e-08, + "loss": 1.3921, + "step": 52160 + }, + { + "epoch": 4.93, + "grad_norm": 61.53125, + "learning_rate": 1.3424095343597283e-08, + "loss": 1.3036, + "step": 52180 + }, + { + "epoch": 4.93, + "grad_norm": 52.0625, + "learning_rate": 1.3045951812791725e-08, + "loss": 1.2947, + "step": 52200 + }, + { + "epoch": 4.94, + "grad_norm": 63.03125, + "learning_rate": 1.2667808281986167e-08, + "loss": 1.3312, + "step": 52220 + }, + { + "epoch": 4.94, + "grad_norm": 68.375, + "learning_rate": 1.2289664751180611e-08, + "loss": 1.2572, + "step": 52240 + }, + { + "epoch": 4.94, + "grad_norm": 65.9375, + "learning_rate": 1.1911521220375054e-08, + "loss": 1.346, + "step": 52260 + }, + { + "epoch": 4.94, + "grad_norm": 60.625, + "learning_rate": 1.1533377689569495e-08, + "loss": 1.2104, + "step": 52280 + }, + { + "epoch": 4.94, + "grad_norm": 89.4375, + "learning_rate": 1.115523415876394e-08, + "loss": 1.3119, + "step": 52300 + }, + { + "epoch": 4.95, + "grad_norm": 72.875, + "learning_rate": 1.0777090627958382e-08, + "loss": 1.4323, + "step": 52320 + }, + { + "epoch": 4.95, + "grad_norm": 59.21875, + "learning_rate": 1.0398947097152824e-08, + "loss": 1.2751, + "step": 52340 + }, + { + "epoch": 4.95, + "grad_norm": 63.96875, + "learning_rate": 1.0020803566347268e-08, + "loss": 1.3833, + "step": 52360 + }, + { + "epoch": 4.95, + "grad_norm": 91.8125, + "learning_rate": 9.64266003554171e-09, + "loss": 1.4989, + "step": 52380 + }, + { + "epoch": 4.95, + "grad_norm": 57.59375, + "learning_rate": 9.264516504736152e-09, + "loss": 1.3041, + "step": 52400 + }, + { + "epoch": 4.96, + "grad_norm": 62.28125, + "learning_rate": 8.886372973930595e-09, + "loss": 1.2397, + "step": 52420 + }, + { + "epoch": 4.96, + "grad_norm": 51.78125, + "learning_rate": 8.50822944312504e-09, + "loss": 1.3625, + "step": 52440 + }, + { + "epoch": 4.96, + "grad_norm": 75.5, + "learning_rate": 8.13008591231948e-09, + "loss": 1.3415, + "step": 52460 + }, + { + "epoch": 4.96, + "grad_norm": 53.59375, + "learning_rate": 7.751942381513923e-09, + "loss": 1.232, + "step": 52480 + }, + { + "epoch": 4.96, + "grad_norm": 67.1875, + "learning_rate": 7.373798850708366e-09, + "loss": 1.2606, + "step": 52500 + }, + { + "epoch": 4.97, + "grad_norm": 62.0, + "learning_rate": 6.995655319902809e-09, + "loss": 1.3394, + "step": 52520 + }, + { + "epoch": 4.97, + "grad_norm": 62.6875, + "learning_rate": 6.6175117890972525e-09, + "loss": 1.2363, + "step": 52540 + }, + { + "epoch": 4.97, + "grad_norm": 62.625, + "learning_rate": 6.2393682582916944e-09, + "loss": 1.2659, + "step": 52560 + }, + { + "epoch": 4.97, + "grad_norm": 82.5625, + "learning_rate": 5.861224727486137e-09, + "loss": 1.3677, + "step": 52580 + }, + { + "epoch": 4.97, + "grad_norm": 54.65625, + "learning_rate": 5.48308119668058e-09, + "loss": 1.2904, + "step": 52600 + }, + { + "epoch": 4.97, + "grad_norm": 76.4375, + "learning_rate": 5.104937665875023e-09, + "loss": 1.313, + "step": 52620 + }, + { + "epoch": 4.98, + "grad_norm": 67.8125, + "learning_rate": 4.726794135069465e-09, + "loss": 1.3009, + "step": 52640 + }, + { + "epoch": 4.98, + "grad_norm": 75.75, + "learning_rate": 4.3486506042639085e-09, + "loss": 1.3895, + "step": 52660 + }, + { + "epoch": 4.98, + "grad_norm": 87.25, + "learning_rate": 3.970507073458351e-09, + "loss": 1.3273, + "step": 52680 + }, + { + "epoch": 4.98, + "grad_norm": 59.78125, + "learning_rate": 3.592363542652794e-09, + "loss": 1.3326, + "step": 52700 + }, + { + "epoch": 4.98, + "grad_norm": 73.0, + "learning_rate": 3.2142200118472365e-09, + "loss": 1.2185, + "step": 52720 + }, + { + "epoch": 4.99, + "grad_norm": 61.0625, + "learning_rate": 2.8360764810416793e-09, + "loss": 1.4331, + "step": 52740 + }, + { + "epoch": 4.99, + "grad_norm": 66.8125, + "learning_rate": 2.457932950236122e-09, + "loss": 1.2308, + "step": 52760 + }, + { + "epoch": 4.99, + "grad_norm": 60.71875, + "learning_rate": 2.0797894194305645e-09, + "loss": 1.4168, + "step": 52780 + }, + { + "epoch": 4.99, + "grad_norm": 56.5625, + "learning_rate": 1.7016458886250076e-09, + "loss": 1.3373, + "step": 52800 + }, + { + "epoch": 4.99, + "grad_norm": 53.40625, + "learning_rate": 1.3235023578194504e-09, + "loss": 1.3291, + "step": 52820 + }, + { + "epoch": 5.0, + "grad_norm": 54.625, + "learning_rate": 9.45358827013893e-10, + "loss": 1.38, + "step": 52840 + }, + { + "epoch": 5.0, + "grad_norm": 84.875, + "learning_rate": 5.672152962083359e-10, + "loss": 1.2983, + "step": 52860 + }, + { + "epoch": 5.0, + "grad_norm": 135.625, + "learning_rate": 1.8907176540277862e-10, + "loss": 1.4019, + "step": 52880 + } + ], + "logging_steps": 20, + "max_steps": 52890, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "total_flos": 1.7722335641506468e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}