{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99999750000625, "eval_steps": 500, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.3668140172958374, "learning_rate": 0.001, "loss": 1.2955, "step": 100 }, { "epoch": 0.0, "grad_norm": 0.4789515733718872, "learning_rate": 0.001, "loss": 0.2147, "step": 200 }, { "epoch": 0.0, "grad_norm": 0.8046264052391052, "learning_rate": 0.001, "loss": 0.1773, "step": 300 }, { "epoch": 0.0, "grad_norm": 0.6500861644744873, "learning_rate": 0.001, "loss": 0.169, "step": 400 }, { "epoch": 0.0, "grad_norm": 1.9476549625396729, "learning_rate": 0.001, "loss": 0.155, "step": 500 }, { "epoch": 0.01, "grad_norm": 0.7570195198059082, "learning_rate": 0.001, "loss": 0.221, "step": 600 }, { "epoch": 0.01, "grad_norm": 0.13534319400787354, "learning_rate": 0.001, "loss": 0.242, "step": 700 }, { "epoch": 0.01, "grad_norm": 0.12334191799163818, "learning_rate": 0.001, "loss": 0.2387, "step": 800 }, { "epoch": 0.01, "grad_norm": 2.0074245929718018, "learning_rate": 0.001, "loss": 0.1844, "step": 900 }, { "epoch": 0.01, "grad_norm": 0.2459566444158554, "learning_rate": 0.001, "loss": 0.2273, "step": 1000 }, { "epoch": 0.01, "grad_norm": 0.35431796312332153, "learning_rate": 0.001, "loss": 0.2406, "step": 1100 }, { "epoch": 0.01, "grad_norm": 0.07735779881477356, "learning_rate": 0.001, "loss": 0.2362, "step": 1200 }, { "epoch": 0.01, "grad_norm": 0.197942316532135, "learning_rate": 0.001, "loss": 0.2361, "step": 1300 }, { "epoch": 0.01, "grad_norm": 0.06753970682621002, "learning_rate": 0.001, "loss": 0.2346, "step": 1400 }, { "epoch": 0.01, "grad_norm": 0.17562294006347656, "learning_rate": 0.001, "loss": 0.2356, "step": 1500 }, { "epoch": 0.02, "grad_norm": 0.12020650506019592, "learning_rate": 0.001, "loss": 0.2343, "step": 1600 }, { "epoch": 0.02, "grad_norm": 0.07772481441497803, "learning_rate": 0.001, "loss": 0.2343, "step": 1700 }, { "epoch": 0.02, "grad_norm": 0.041362863034009933, "learning_rate": 0.001, "loss": 0.2345, "step": 1800 }, { "epoch": 0.02, "grad_norm": 0.050947155803442, "learning_rate": 0.001, "loss": 0.2333, "step": 1900 }, { "epoch": 0.02, "grad_norm": 0.24440822005271912, "learning_rate": 0.001, "loss": 0.2346, "step": 2000 }, { "epoch": 0.02, "grad_norm": 0.4386675953865051, "learning_rate": 0.001, "loss": 0.2346, "step": 2100 }, { "epoch": 0.02, "grad_norm": 0.054741185158491135, "learning_rate": 0.001, "loss": 0.2347, "step": 2200 }, { "epoch": 0.02, "grad_norm": 0.5285407304763794, "learning_rate": 0.001, "loss": 0.2341, "step": 2300 }, { "epoch": 0.02, "grad_norm": 0.5406210422515869, "learning_rate": 0.001, "loss": 0.2322, "step": 2400 }, { "epoch": 0.02, "grad_norm": 1.1667808294296265, "learning_rate": 0.001, "loss": 0.1683, "step": 2500 }, { "epoch": 0.03, "grad_norm": 0.11829289048910141, "learning_rate": 0.001, "loss": 0.2138, "step": 2600 }, { "epoch": 0.03, "grad_norm": 1.528359055519104, "learning_rate": 0.001, "loss": 0.1902, "step": 2700 }, { "epoch": 0.03, "grad_norm": 0.45457515120506287, "learning_rate": 0.001, "loss": 0.1592, "step": 2800 }, { "epoch": 0.03, "grad_norm": 0.2595893144607544, "learning_rate": 0.001, "loss": 0.1511, "step": 2900 }, { "epoch": 0.03, "grad_norm": 0.5346922278404236, "learning_rate": 0.001, "loss": 0.1439, "step": 3000 }, { "epoch": 0.03, "grad_norm": 3.5066208839416504, "learning_rate": 0.001, "loss": 0.1617, "step": 3100 }, { "epoch": 0.03, "grad_norm": 24.826475143432617, "learning_rate": 0.001, "loss": 0.2024, "step": 3200 }, { "epoch": 0.03, "grad_norm": 10.144634246826172, "learning_rate": 0.001, "loss": 0.1882, "step": 3300 }, { "epoch": 0.03, "grad_norm": 0.43425676226615906, "learning_rate": 0.001, "loss": 0.169, "step": 3400 }, { "epoch": 0.03, "grad_norm": 0.3496113717556, "learning_rate": 0.001, "loss": 0.1542, "step": 3500 }, { "epoch": 0.04, "grad_norm": 6.317073345184326, "learning_rate": 0.001, "loss": 0.1676, "step": 3600 }, { "epoch": 0.04, "grad_norm": 1.1362758874893188, "learning_rate": 0.001, "loss": 0.1599, "step": 3700 }, { "epoch": 0.04, "grad_norm": 4.871659755706787, "learning_rate": 0.001, "loss": 0.1473, "step": 3800 }, { "epoch": 0.04, "grad_norm": 0.10563373565673828, "learning_rate": 0.001, "loss": 0.1652, "step": 3900 }, { "epoch": 0.04, "grad_norm": 0.08865318447351456, "learning_rate": 0.001, "loss": 0.2326, "step": 4000 }, { "epoch": 0.04, "grad_norm": 0.0642586424946785, "learning_rate": 0.001, "loss": 0.2329, "step": 4100 }, { "epoch": 0.04, "grad_norm": 0.36199188232421875, "learning_rate": 0.001, "loss": 0.2331, "step": 4200 }, { "epoch": 0.04, "grad_norm": 0.17750632762908936, "learning_rate": 0.001, "loss": 0.2326, "step": 4300 }, { "epoch": 0.04, "grad_norm": 0.103765107691288, "learning_rate": 0.001, "loss": 0.2329, "step": 4400 }, { "epoch": 0.04, "grad_norm": 0.11186927556991577, "learning_rate": 0.001, "loss": 0.2326, "step": 4500 }, { "epoch": 0.05, "grad_norm": 0.04914987459778786, "learning_rate": 0.001, "loss": 0.2326, "step": 4600 }, { "epoch": 0.05, "grad_norm": 0.09826149046421051, "learning_rate": 0.001, "loss": 0.2324, "step": 4700 }, { "epoch": 0.05, "grad_norm": 0.08518774062395096, "learning_rate": 0.001, "loss": 0.2327, "step": 4800 }, { "epoch": 0.05, "grad_norm": 0.12364567071199417, "learning_rate": 0.001, "loss": 0.2321, "step": 4900 }, { "epoch": 0.05, "grad_norm": 0.10944374650716782, "learning_rate": 0.001, "loss": 0.2322, "step": 5000 }, { "epoch": 0.05, "grad_norm": 0.08173243701457977, "learning_rate": 0.001, "loss": 0.2326, "step": 5100 }, { "epoch": 0.05, "grad_norm": 0.17504490911960602, "learning_rate": 0.001, "loss": 0.232, "step": 5200 }, { "epoch": 0.05, "grad_norm": 0.03396083042025566, "learning_rate": 0.001, "loss": 0.2326, "step": 5300 }, { "epoch": 0.05, "grad_norm": 0.12226787954568863, "learning_rate": 0.001, "loss": 0.2324, "step": 5400 }, { "epoch": 0.05, "grad_norm": 0.029385367408394814, "learning_rate": 0.001, "loss": 0.2324, "step": 5500 }, { "epoch": 0.06, "grad_norm": 0.08070210367441177, "learning_rate": 0.001, "loss": 0.2322, "step": 5600 }, { "epoch": 0.06, "grad_norm": 0.026348430663347244, "learning_rate": 0.001, "loss": 0.2316, "step": 5700 }, { "epoch": 0.06, "grad_norm": 0.06884663552045822, "learning_rate": 0.001, "loss": 0.2322, "step": 5800 }, { "epoch": 0.06, "grad_norm": 0.09100496768951416, "learning_rate": 0.001, "loss": 0.3271, "step": 5900 }, { "epoch": 0.06, "grad_norm": 0.0949195995926857, "learning_rate": 0.001, "loss": 0.2322, "step": 6000 }, { "epoch": 0.06, "grad_norm": 0.17315314710140228, "learning_rate": 0.001, "loss": 0.232, "step": 6100 }, { "epoch": 0.06, "grad_norm": 0.04644012451171875, "learning_rate": 0.001, "loss": 0.2317, "step": 6200 }, { "epoch": 0.06, "grad_norm": 0.03242076560854912, "learning_rate": 0.001, "loss": 0.2317, "step": 6300 }, { "epoch": 0.06, "grad_norm": 0.03038044273853302, "learning_rate": 0.001, "loss": 0.2322, "step": 6400 }, { "epoch": 0.06, "grad_norm": 0.04407713562250137, "learning_rate": 0.001, "loss": 0.2321, "step": 6500 }, { "epoch": 0.07, "grad_norm": 0.04973585903644562, "learning_rate": 0.001, "loss": 0.2321, "step": 6600 }, { "epoch": 0.07, "grad_norm": 0.043713077902793884, "learning_rate": 0.001, "loss": 0.2319, "step": 6700 }, { "epoch": 0.07, "grad_norm": 0.0361105352640152, "learning_rate": 0.001, "loss": 0.2319, "step": 6800 }, { "epoch": 0.07, "grad_norm": 0.038385313004255295, "learning_rate": 0.001, "loss": 0.2319, "step": 6900 }, { "epoch": 0.07, "grad_norm": 0.059859637171030045, "learning_rate": 0.001, "loss": 0.2318, "step": 7000 }, { "epoch": 0.07, "grad_norm": 0.10737486183643341, "learning_rate": 0.001, "loss": 0.232, "step": 7100 }, { "epoch": 0.07, "grad_norm": 0.07841573655605316, "learning_rate": 0.001, "loss": 0.2319, "step": 7200 }, { "epoch": 0.07, "grad_norm": 0.12177613377571106, "learning_rate": 0.001, "loss": 0.2318, "step": 7300 }, { "epoch": 0.07, "grad_norm": 0.04158034175634384, "learning_rate": 0.001, "loss": 0.2318, "step": 7400 }, { "epoch": 0.07, "grad_norm": 0.04334099590778351, "learning_rate": 0.001, "loss": 0.2318, "step": 7500 }, { "epoch": 0.08, "grad_norm": 0.04868987202644348, "learning_rate": 0.001, "loss": 0.2317, "step": 7600 }, { "epoch": 0.08, "grad_norm": 0.11688575893640518, "learning_rate": 0.001, "loss": 0.2318, "step": 7700 }, { "epoch": 0.08, "grad_norm": 0.05144130066037178, "learning_rate": 0.001, "loss": 0.2319, "step": 7800 }, { "epoch": 0.08, "grad_norm": 0.04202236235141754, "learning_rate": 0.001, "loss": 0.2318, "step": 7900 }, { "epoch": 0.08, "grad_norm": 0.07848116755485535, "learning_rate": 0.001, "loss": 0.2314, "step": 8000 }, { "epoch": 0.08, "grad_norm": 0.05292198061943054, "learning_rate": 0.001, "loss": 0.2317, "step": 8100 }, { "epoch": 0.08, "grad_norm": 0.05817991867661476, "learning_rate": 0.001, "loss": 0.2318, "step": 8200 }, { "epoch": 0.08, "grad_norm": 0.03250608965754509, "learning_rate": 0.001, "loss": 0.2316, "step": 8300 }, { "epoch": 0.08, "grad_norm": 0.29823893308639526, "learning_rate": 0.001, "loss": 0.2311, "step": 8400 }, { "epoch": 0.08, "grad_norm": 1.852128505706787, "learning_rate": 0.001, "loss": 0.1864, "step": 8500 }, { "epoch": 0.09, "grad_norm": 61.31148147583008, "learning_rate": 0.001, "loss": 0.1911, "step": 8600 }, { "epoch": 0.09, "grad_norm": 3.4901123046875, "learning_rate": 0.001, "loss": 0.1934, "step": 8700 }, { "epoch": 0.09, "grad_norm": 0.9580036401748657, "learning_rate": 0.001, "loss": 0.1706, "step": 8800 }, { "epoch": 0.09, "grad_norm": 0.5461576581001282, "learning_rate": 0.001, "loss": 0.1597, "step": 8900 }, { "epoch": 0.09, "grad_norm": 3.481351375579834, "learning_rate": 0.001, "loss": 0.1511, "step": 9000 }, { "epoch": 0.09, "grad_norm": 0.3008120656013489, "learning_rate": 0.001, "loss": 0.154, "step": 9100 }, { "epoch": 0.09, "grad_norm": 0.23753711581230164, "learning_rate": 0.001, "loss": 0.1406, "step": 9200 }, { "epoch": 0.09, "grad_norm": 0.9201159477233887, "learning_rate": 0.001, "loss": 0.1444, "step": 9300 }, { "epoch": 0.09, "grad_norm": 1.6734191179275513, "learning_rate": 0.001, "loss": 0.1385, "step": 9400 }, { "epoch": 0.09, "grad_norm": 1.7249393463134766, "learning_rate": 0.001, "loss": 0.1393, "step": 9500 }, { "epoch": 0.1, "grad_norm": 0.5765690207481384, "learning_rate": 0.001, "loss": 0.1397, "step": 9600 }, { "epoch": 0.1, "grad_norm": 0.4266449213027954, "learning_rate": 0.001, "loss": 0.1386, "step": 9700 }, { "epoch": 0.1, "grad_norm": 0.23247841000556946, "learning_rate": 0.001, "loss": 0.1343, "step": 9800 }, { "epoch": 0.1, "grad_norm": 0.19435954093933105, "learning_rate": 0.001, "loss": 0.1306, "step": 9900 }, { "epoch": 0.1, "grad_norm": 0.27626514434814453, "learning_rate": 0.001, "loss": 0.133, "step": 10000 }, { "epoch": 0.1, "grad_norm": 0.1834883689880371, "learning_rate": 0.001, "loss": 0.1299, "step": 10100 }, { "epoch": 0.1, "grad_norm": 0.4306440055370331, "learning_rate": 0.001, "loss": 0.1309, "step": 10200 }, { "epoch": 0.1, "grad_norm": 0.15750516951084137, "learning_rate": 0.001, "loss": 0.1266, "step": 10300 }, { "epoch": 0.1, "grad_norm": 0.2934073805809021, "learning_rate": 0.001, "loss": 0.1278, "step": 10400 }, { "epoch": 0.1, "grad_norm": 0.27599695324897766, "learning_rate": 0.001, "loss": 0.1286, "step": 10500 }, { "epoch": 0.11, "grad_norm": 0.39952772855758667, "learning_rate": 0.001, "loss": 0.1252, "step": 10600 }, { "epoch": 0.11, "grad_norm": 0.4082016348838806, "learning_rate": 0.001, "loss": 0.1272, "step": 10700 }, { "epoch": 0.11, "grad_norm": 0.303307443857193, "learning_rate": 0.001, "loss": 0.1249, "step": 10800 }, { "epoch": 0.11, "grad_norm": 0.1597479283809662, "learning_rate": 0.001, "loss": 0.1247, "step": 10900 }, { "epoch": 0.11, "grad_norm": 1.03666090965271, "learning_rate": 0.001, "loss": 0.1286, "step": 11000 }, { "epoch": 0.11, "grad_norm": 0.2832247018814087, "learning_rate": 0.001, "loss": 0.1248, "step": 11100 }, { "epoch": 0.11, "grad_norm": 0.49678078293800354, "learning_rate": 0.001, "loss": 0.1258, "step": 11200 }, { "epoch": 0.11, "grad_norm": 0.3678058385848999, "learning_rate": 0.001, "loss": 0.1256, "step": 11300 }, { "epoch": 0.11, "grad_norm": 0.26233455538749695, "learning_rate": 0.001, "loss": 0.1233, "step": 11400 }, { "epoch": 0.11, "grad_norm": 0.22039958834648132, "learning_rate": 0.001, "loss": 0.1197, "step": 11500 }, { "epoch": 0.12, "grad_norm": 0.14722639322280884, "learning_rate": 0.001, "loss": 0.1225, "step": 11600 }, { "epoch": 0.12, "grad_norm": 0.19015900790691376, "learning_rate": 0.001, "loss": 0.1217, "step": 11700 }, { "epoch": 0.12, "grad_norm": 0.15655829012393951, "learning_rate": 0.001, "loss": 0.1185, "step": 11800 }, { "epoch": 0.12, "grad_norm": 3.5397889614105225, "learning_rate": 0.001, "loss": 0.119, "step": 11900 }, { "epoch": 0.12, "grad_norm": 0.845320999622345, "learning_rate": 0.001, "loss": 0.1276, "step": 12000 }, { "epoch": 0.12, "grad_norm": 0.34136563539505005, "learning_rate": 0.001, "loss": 0.122, "step": 12100 }, { "epoch": 0.12, "grad_norm": 0.2509533762931824, "learning_rate": 0.001, "loss": 0.1199, "step": 12200 }, { "epoch": 0.12, "grad_norm": 0.31120267510414124, "learning_rate": 0.001, "loss": 0.1191, "step": 12300 }, { "epoch": 0.12, "grad_norm": 0.3903524875640869, "learning_rate": 0.001, "loss": 0.1183, "step": 12400 }, { "epoch": 0.12, "grad_norm": 0.19971555471420288, "learning_rate": 0.001, "loss": 0.1177, "step": 12500 }, { "epoch": 0.13, "grad_norm": 0.36589089035987854, "learning_rate": 0.001, "loss": 0.1158, "step": 12600 }, { "epoch": 0.13, "grad_norm": 0.19200453162193298, "learning_rate": 0.001, "loss": 0.1166, "step": 12700 }, { "epoch": 0.13, "grad_norm": 0.6393672823905945, "learning_rate": 0.001, "loss": 0.1171, "step": 12800 }, { "epoch": 0.13, "grad_norm": 0.32421180605888367, "learning_rate": 0.001, "loss": 0.118, "step": 12900 }, { "epoch": 0.13, "grad_norm": 0.6238926649093628, "learning_rate": 0.001, "loss": 0.1166, "step": 13000 }, { "epoch": 0.13, "grad_norm": 0.1363907754421234, "learning_rate": 0.001, "loss": 0.1156, "step": 13100 }, { "epoch": 0.13, "grad_norm": 0.16790109872817993, "learning_rate": 0.001, "loss": 0.1142, "step": 13200 }, { "epoch": 0.13, "grad_norm": 0.1915178894996643, "learning_rate": 0.001, "loss": 0.1126, "step": 13300 }, { "epoch": 0.13, "grad_norm": 0.12727123498916626, "learning_rate": 0.001, "loss": 0.1156, "step": 13400 }, { "epoch": 0.13, "grad_norm": 0.29520758986473083, "learning_rate": 0.001, "loss": 0.1129, "step": 13500 }, { "epoch": 0.14, "grad_norm": 0.1663757860660553, "learning_rate": 0.001, "loss": 0.1132, "step": 13600 }, { "epoch": 0.14, "grad_norm": 0.1840706318616867, "learning_rate": 0.001, "loss": 0.119, "step": 13700 }, { "epoch": 0.14, "grad_norm": 0.16156257688999176, "learning_rate": 0.001, "loss": 0.1146, "step": 13800 }, { "epoch": 0.14, "grad_norm": 0.17348338663578033, "learning_rate": 0.001, "loss": 0.1141, "step": 13900 }, { "epoch": 0.14, "grad_norm": 0.18696527183055878, "learning_rate": 0.001, "loss": 0.1108, "step": 14000 }, { "epoch": 0.14, "grad_norm": 0.15352846682071686, "learning_rate": 0.001, "loss": 0.1134, "step": 14100 }, { "epoch": 0.14, "grad_norm": 0.23210759460926056, "learning_rate": 0.001, "loss": 0.1142, "step": 14200 }, { "epoch": 0.14, "grad_norm": 0.18328526616096497, "learning_rate": 0.001, "loss": 0.1109, "step": 14300 }, { "epoch": 0.14, "grad_norm": 0.17674757540225983, "learning_rate": 0.001, "loss": 0.1083, "step": 14400 }, { "epoch": 0.14, "grad_norm": 0.34446394443511963, "learning_rate": 0.001, "loss": 0.1203, "step": 14500 }, { "epoch": 0.15, "grad_norm": 0.22947299480438232, "learning_rate": 0.001, "loss": 0.1095, "step": 14600 }, { "epoch": 0.15, "grad_norm": 0.15071985125541687, "learning_rate": 0.001, "loss": 0.1088, "step": 14700 }, { "epoch": 0.15, "grad_norm": 0.14273251593112946, "learning_rate": 0.001, "loss": 0.1091, "step": 14800 }, { "epoch": 0.15, "grad_norm": 0.20266981422901154, "learning_rate": 0.001, "loss": 0.1089, "step": 14900 }, { "epoch": 0.15, "grad_norm": 0.1495724767446518, "learning_rate": 0.001, "loss": 0.1089, "step": 15000 }, { "epoch": 0.15, "grad_norm": 0.1711970865726471, "learning_rate": 0.001, "loss": 0.1063, "step": 15100 }, { "epoch": 0.15, "grad_norm": 0.20727260410785675, "learning_rate": 0.001, "loss": 0.104, "step": 15200 }, { "epoch": 0.15, "grad_norm": 0.22724412381649017, "learning_rate": 0.001, "loss": 0.1087, "step": 15300 }, { "epoch": 0.15, "grad_norm": 0.15561726689338684, "learning_rate": 0.001, "loss": 0.1086, "step": 15400 }, { "epoch": 0.15, "grad_norm": 0.2139796018600464, "learning_rate": 0.001, "loss": 0.1054, "step": 15500 }, { "epoch": 0.16, "grad_norm": 0.24371370673179626, "learning_rate": 0.001, "loss": 0.1077, "step": 15600 }, { "epoch": 0.16, "grad_norm": 0.22944559156894684, "learning_rate": 0.001, "loss": 0.1092, "step": 15700 }, { "epoch": 0.16, "grad_norm": 0.19578562676906586, "learning_rate": 0.001, "loss": 0.1077, "step": 15800 }, { "epoch": 0.16, "grad_norm": 0.17588412761688232, "learning_rate": 0.001, "loss": 0.1048, "step": 15900 }, { "epoch": 0.16, "grad_norm": 0.16697707772254944, "learning_rate": 0.001, "loss": 0.1072, "step": 16000 }, { "epoch": 0.16, "grad_norm": 0.1927742063999176, "learning_rate": 0.001, "loss": 0.1036, "step": 16100 }, { "epoch": 0.16, "grad_norm": 0.25396087765693665, "learning_rate": 0.001, "loss": 0.1068, "step": 16200 }, { "epoch": 0.16, "grad_norm": 0.21014653146266937, "learning_rate": 0.001, "loss": 0.1012, "step": 16300 }, { "epoch": 0.16, "grad_norm": 0.32085150480270386, "learning_rate": 0.001, "loss": 0.1062, "step": 16400 }, { "epoch": 0.16, "grad_norm": 0.10534122586250305, "learning_rate": 0.001, "loss": 0.103, "step": 16500 }, { "epoch": 0.17, "grad_norm": 0.24365462362766266, "learning_rate": 0.001, "loss": 0.106, "step": 16600 }, { "epoch": 0.17, "grad_norm": 0.15197184681892395, "learning_rate": 0.001, "loss": 0.1051, "step": 16700 }, { "epoch": 0.17, "grad_norm": 0.23027855157852173, "learning_rate": 0.001, "loss": 0.1065, "step": 16800 }, { "epoch": 0.17, "grad_norm": 0.14924216270446777, "learning_rate": 0.001, "loss": 0.1068, "step": 16900 }, { "epoch": 0.17, "grad_norm": 0.13331858813762665, "learning_rate": 0.001, "loss": 0.1035, "step": 17000 }, { "epoch": 0.17, "grad_norm": 0.20150358974933624, "learning_rate": 0.001, "loss": 0.1065, "step": 17100 }, { "epoch": 0.17, "grad_norm": 0.1429535299539566, "learning_rate": 0.001, "loss": 0.1056, "step": 17200 }, { "epoch": 0.17, "grad_norm": 0.16326557099819183, "learning_rate": 0.001, "loss": 0.1022, "step": 17300 }, { "epoch": 0.17, "grad_norm": 0.15712429583072662, "learning_rate": 0.001, "loss": 0.1051, "step": 17400 }, { "epoch": 0.17, "grad_norm": 0.33204013109207153, "learning_rate": 0.001, "loss": 0.1046, "step": 17500 }, { "epoch": 0.18, "grad_norm": 0.17703518271446228, "learning_rate": 0.001, "loss": 0.1057, "step": 17600 }, { "epoch": 0.18, "grad_norm": 0.14861218631267548, "learning_rate": 0.001, "loss": 0.1052, "step": 17700 }, { "epoch": 0.18, "grad_norm": 0.18271447718143463, "learning_rate": 0.001, "loss": 0.1049, "step": 17800 }, { "epoch": 0.18, "grad_norm": 0.2245068997144699, "learning_rate": 0.001, "loss": 0.1033, "step": 17900 }, { "epoch": 0.18, "grad_norm": 0.2233046442270279, "learning_rate": 0.001, "loss": 0.1049, "step": 18000 }, { "epoch": 0.18, "grad_norm": 0.1915113776922226, "learning_rate": 0.001, "loss": 0.1039, "step": 18100 }, { "epoch": 0.18, "grad_norm": 0.1070462241768837, "learning_rate": 0.001, "loss": 0.1028, "step": 18200 }, { "epoch": 0.18, "grad_norm": 0.14523275196552277, "learning_rate": 0.001, "loss": 0.0983, "step": 18300 }, { "epoch": 0.18, "grad_norm": 0.24468256533145905, "learning_rate": 0.001, "loss": 0.1018, "step": 18400 }, { "epoch": 0.18, "grad_norm": 0.17596426606178284, "learning_rate": 0.001, "loss": 0.1017, "step": 18500 }, { "epoch": 0.19, "grad_norm": 0.15113884210586548, "learning_rate": 0.001, "loss": 0.1022, "step": 18600 }, { "epoch": 0.19, "grad_norm": 0.1756398230791092, "learning_rate": 0.001, "loss": 0.1032, "step": 18700 }, { "epoch": 0.19, "grad_norm": 0.1491193026304245, "learning_rate": 0.001, "loss": 0.1016, "step": 18800 }, { "epoch": 0.19, "grad_norm": 0.15422752499580383, "learning_rate": 0.001, "loss": 0.0989, "step": 18900 }, { "epoch": 0.19, "grad_norm": 0.13713973760604858, "learning_rate": 0.001, "loss": 0.1002, "step": 19000 }, { "epoch": 0.19, "grad_norm": 0.16012702882289886, "learning_rate": 0.001, "loss": 0.101, "step": 19100 }, { "epoch": 0.19, "grad_norm": 0.23414984345436096, "learning_rate": 0.001, "loss": 0.0975, "step": 19200 }, { "epoch": 0.19, "grad_norm": 0.13922521471977234, "learning_rate": 0.001, "loss": 0.1002, "step": 19300 }, { "epoch": 0.19, "grad_norm": 0.14608104526996613, "learning_rate": 0.001, "loss": 0.098, "step": 19400 }, { "epoch": 0.19, "grad_norm": 0.19267164170742035, "learning_rate": 0.001, "loss": 0.098, "step": 19500 }, { "epoch": 0.2, "grad_norm": 0.1570904552936554, "learning_rate": 0.001, "loss": 0.1034, "step": 19600 }, { "epoch": 0.2, "grad_norm": 0.3922866880893707, "learning_rate": 0.001, "loss": 0.1008, "step": 19700 }, { "epoch": 0.2, "grad_norm": 0.20500238239765167, "learning_rate": 0.001, "loss": 0.1025, "step": 19800 }, { "epoch": 0.2, "grad_norm": 0.2044358104467392, "learning_rate": 0.001, "loss": 0.0982, "step": 19900 }, { "epoch": 0.2, "grad_norm": 0.1722269356250763, "learning_rate": 0.001, "loss": 0.1007, "step": 20000 }, { "epoch": 0.2, "grad_norm": 0.21868231892585754, "learning_rate": 0.001, "loss": 0.1, "step": 20100 }, { "epoch": 0.2, "grad_norm": 0.12817895412445068, "learning_rate": 0.001, "loss": 0.1, "step": 20200 }, { "epoch": 0.2, "grad_norm": 0.12333246320486069, "learning_rate": 0.001, "loss": 0.0987, "step": 20300 }, { "epoch": 0.2, "grad_norm": 0.1742565631866455, "learning_rate": 0.001, "loss": 0.0981, "step": 20400 }, { "epoch": 0.2, "grad_norm": 0.15747936069965363, "learning_rate": 0.001, "loss": 0.1012, "step": 20500 }, { "epoch": 0.21, "grad_norm": 0.27314338088035583, "learning_rate": 0.001, "loss": 0.1014, "step": 20600 }, { "epoch": 0.21, "grad_norm": 0.9368189573287964, "learning_rate": 0.001, "loss": 0.1035, "step": 20700 }, { "epoch": 0.21, "grad_norm": 0.3574996590614319, "learning_rate": 0.001, "loss": 0.0992, "step": 20800 }, { "epoch": 0.21, "grad_norm": 0.28280141949653625, "learning_rate": 0.001, "loss": 0.0975, "step": 20900 }, { "epoch": 0.21, "grad_norm": 0.21435654163360596, "learning_rate": 0.001, "loss": 0.0998, "step": 21000 }, { "epoch": 0.21, "grad_norm": 0.20617541670799255, "learning_rate": 0.001, "loss": 0.0994, "step": 21100 }, { "epoch": 0.21, "grad_norm": 0.21885354816913605, "learning_rate": 0.001, "loss": 0.099, "step": 21200 }, { "epoch": 0.21, "grad_norm": 0.24429431557655334, "learning_rate": 0.001, "loss": 0.1018, "step": 21300 }, { "epoch": 0.21, "grad_norm": 0.24264854192733765, "learning_rate": 0.001, "loss": 0.1009, "step": 21400 }, { "epoch": 0.21, "grad_norm": 0.19410717487335205, "learning_rate": 0.001, "loss": 0.1007, "step": 21500 }, { "epoch": 0.22, "grad_norm": 0.15938735008239746, "learning_rate": 0.001, "loss": 0.0965, "step": 21600 }, { "epoch": 0.22, "grad_norm": 0.678229808807373, "learning_rate": 0.001, "loss": 0.1001, "step": 21700 }, { "epoch": 0.22, "grad_norm": 0.2967202663421631, "learning_rate": 0.001, "loss": 0.1003, "step": 21800 }, { "epoch": 0.22, "grad_norm": 0.7940108180046082, "learning_rate": 0.001, "loss": 0.1001, "step": 21900 }, { "epoch": 0.22, "grad_norm": 0.24995733797550201, "learning_rate": 0.001, "loss": 0.0992, "step": 22000 }, { "epoch": 0.22, "grad_norm": 0.1626627892255783, "learning_rate": 0.001, "loss": 0.0992, "step": 22100 }, { "epoch": 0.22, "grad_norm": 0.21141190826892853, "learning_rate": 0.001, "loss": 0.0961, "step": 22200 }, { "epoch": 0.22, "grad_norm": 0.21122020483016968, "learning_rate": 0.001, "loss": 0.0968, "step": 22300 }, { "epoch": 0.22, "grad_norm": 0.2558838725090027, "learning_rate": 0.001, "loss": 0.098, "step": 22400 }, { "epoch": 0.22, "grad_norm": 0.1975196897983551, "learning_rate": 0.001, "loss": 0.0987, "step": 22500 }, { "epoch": 0.23, "grad_norm": 0.14767397940158844, "learning_rate": 0.001, "loss": 0.096, "step": 22600 }, { "epoch": 0.23, "grad_norm": 0.17532730102539062, "learning_rate": 0.001, "loss": 0.0985, "step": 22700 }, { "epoch": 0.23, "grad_norm": 0.1320209801197052, "learning_rate": 0.001, "loss": 0.0968, "step": 22800 }, { "epoch": 0.23, "grad_norm": 0.273934930562973, "learning_rate": 0.001, "loss": 0.0978, "step": 22900 }, { "epoch": 0.23, "grad_norm": 0.15103434026241302, "learning_rate": 0.001, "loss": 0.0995, "step": 23000 }, { "epoch": 0.23, "grad_norm": 0.2021692842245102, "learning_rate": 0.001, "loss": 0.0952, "step": 23100 }, { "epoch": 0.23, "grad_norm": 0.1648433655500412, "learning_rate": 0.001, "loss": 0.0938, "step": 23200 }, { "epoch": 0.23, "grad_norm": 0.17460817098617554, "learning_rate": 0.001, "loss": 0.0959, "step": 23300 }, { "epoch": 0.23, "grad_norm": 0.15195918083190918, "learning_rate": 0.001, "loss": 0.094, "step": 23400 }, { "epoch": 0.23, "grad_norm": 0.1664193570613861, "learning_rate": 0.001, "loss": 0.094, "step": 23500 }, { "epoch": 0.24, "grad_norm": 0.14700663089752197, "learning_rate": 0.001, "loss": 0.0951, "step": 23600 }, { "epoch": 0.24, "grad_norm": 0.22301018238067627, "learning_rate": 0.001, "loss": 0.0919, "step": 23700 }, { "epoch": 0.24, "grad_norm": 0.1666121482849121, "learning_rate": 0.001, "loss": 0.0928, "step": 23800 }, { "epoch": 0.24, "grad_norm": 0.1971474438905716, "learning_rate": 0.001, "loss": 0.0949, "step": 23900 }, { "epoch": 0.24, "grad_norm": 0.15959730744361877, "learning_rate": 0.001, "loss": 0.095, "step": 24000 }, { "epoch": 0.24, "grad_norm": 0.29146862030029297, "learning_rate": 0.001, "loss": 0.0942, "step": 24100 }, { "epoch": 0.24, "grad_norm": 0.15853939950466156, "learning_rate": 0.001, "loss": 0.0978, "step": 24200 }, { "epoch": 0.24, "grad_norm": 0.16822876036167145, "learning_rate": 0.001, "loss": 0.0934, "step": 24300 }, { "epoch": 0.24, "grad_norm": 0.15456752479076385, "learning_rate": 0.001, "loss": 0.0948, "step": 24400 }, { "epoch": 0.24, "grad_norm": 0.15123625099658966, "learning_rate": 0.001, "loss": 0.0926, "step": 24500 }, { "epoch": 0.25, "grad_norm": 0.16344180703163147, "learning_rate": 0.001, "loss": 0.0935, "step": 24600 }, { "epoch": 0.25, "grad_norm": 0.22936996817588806, "learning_rate": 0.001, "loss": 0.0936, "step": 24700 }, { "epoch": 0.25, "grad_norm": 0.16810204088687897, "learning_rate": 0.001, "loss": 0.0978, "step": 24800 }, { "epoch": 0.25, "grad_norm": 0.14977198839187622, "learning_rate": 0.001, "loss": 0.0936, "step": 24900 }, { "epoch": 0.25, "grad_norm": 0.18207716941833496, "learning_rate": 0.001, "loss": 0.093, "step": 25000 }, { "epoch": 0.25, "grad_norm": 0.2584002912044525, "learning_rate": 0.001, "loss": 0.0958, "step": 25100 }, { "epoch": 0.25, "grad_norm": 0.23717880249023438, "learning_rate": 0.001, "loss": 0.0927, "step": 25200 }, { "epoch": 0.25, "grad_norm": 0.1896461844444275, "learning_rate": 0.001, "loss": 0.094, "step": 25300 }, { "epoch": 0.25, "grad_norm": 0.21543921530246735, "learning_rate": 0.001, "loss": 0.0953, "step": 25400 }, { "epoch": 0.25, "grad_norm": 0.14013002812862396, "learning_rate": 0.001, "loss": 0.0958, "step": 25500 }, { "epoch": 0.26, "grad_norm": 0.1744927018880844, "learning_rate": 0.001, "loss": 0.0946, "step": 25600 }, { "epoch": 0.26, "grad_norm": 0.16546490788459778, "learning_rate": 0.001, "loss": 0.0962, "step": 25700 }, { "epoch": 0.26, "grad_norm": 0.16227766871452332, "learning_rate": 0.001, "loss": 0.0952, "step": 25800 }, { "epoch": 0.26, "grad_norm": 0.181349515914917, "learning_rate": 0.001, "loss": 0.0951, "step": 25900 }, { "epoch": 0.26, "grad_norm": 0.20408563315868378, "learning_rate": 0.001, "loss": 0.0915, "step": 26000 }, { "epoch": 0.26, "grad_norm": 0.1793171763420105, "learning_rate": 0.001, "loss": 0.0942, "step": 26100 }, { "epoch": 0.26, "grad_norm": 0.14634822309017181, "learning_rate": 0.001, "loss": 0.0961, "step": 26200 }, { "epoch": 0.26, "grad_norm": 0.18879148364067078, "learning_rate": 0.001, "loss": 0.0942, "step": 26300 }, { "epoch": 0.26, "grad_norm": 0.20523515343666077, "learning_rate": 0.001, "loss": 0.0912, "step": 26400 }, { "epoch": 0.26, "grad_norm": 0.18672947585582733, "learning_rate": 0.001, "loss": 0.092, "step": 26500 }, { "epoch": 0.27, "grad_norm": 0.18561910092830658, "learning_rate": 0.001, "loss": 0.0913, "step": 26600 }, { "epoch": 0.27, "grad_norm": 0.23991861939430237, "learning_rate": 0.001, "loss": 0.0925, "step": 26700 }, { "epoch": 0.27, "grad_norm": 0.1660347878932953, "learning_rate": 0.001, "loss": 0.0939, "step": 26800 }, { "epoch": 0.27, "grad_norm": 0.2105019986629486, "learning_rate": 0.001, "loss": 0.093, "step": 26900 }, { "epoch": 0.27, "grad_norm": 0.2271376997232437, "learning_rate": 0.001, "loss": 0.0899, "step": 27000 }, { "epoch": 0.27, "grad_norm": 0.14487460255622864, "learning_rate": 0.001, "loss": 0.0906, "step": 27100 }, { "epoch": 0.27, "grad_norm": 0.1597098708152771, "learning_rate": 0.001, "loss": 0.0919, "step": 27200 }, { "epoch": 0.27, "grad_norm": 0.18633900582790375, "learning_rate": 0.001, "loss": 0.0892, "step": 27300 }, { "epoch": 0.27, "grad_norm": 0.12663201987743378, "learning_rate": 0.001, "loss": 0.0913, "step": 27400 }, { "epoch": 0.27, "grad_norm": 0.17320451140403748, "learning_rate": 0.001, "loss": 0.0911, "step": 27500 }, { "epoch": 0.28, "grad_norm": 0.16872632503509521, "learning_rate": 0.001, "loss": 0.091, "step": 27600 }, { "epoch": 0.28, "grad_norm": 0.18602560460567474, "learning_rate": 0.001, "loss": 0.0908, "step": 27700 }, { "epoch": 0.28, "grad_norm": 0.17392034828662872, "learning_rate": 0.001, "loss": 0.0882, "step": 27800 }, { "epoch": 0.28, "grad_norm": 0.10278663039207458, "learning_rate": 0.001, "loss": 0.088, "step": 27900 }, { "epoch": 0.28, "grad_norm": 0.15355843305587769, "learning_rate": 0.001, "loss": 0.0876, "step": 28000 }, { "epoch": 0.28, "grad_norm": 0.17331954836845398, "learning_rate": 0.001, "loss": 0.0906, "step": 28100 }, { "epoch": 0.28, "grad_norm": 0.16750375926494598, "learning_rate": 0.001, "loss": 0.0935, "step": 28200 }, { "epoch": 0.28, "grad_norm": 0.27208462357521057, "learning_rate": 0.001, "loss": 0.0884, "step": 28300 }, { "epoch": 0.28, "grad_norm": 0.2215784639120102, "learning_rate": 0.001, "loss": 0.0904, "step": 28400 }, { "epoch": 0.28, "grad_norm": 0.1542549580335617, "learning_rate": 0.001, "loss": 0.0903, "step": 28500 }, { "epoch": 0.29, "grad_norm": 0.22874318063259125, "learning_rate": 0.001, "loss": 0.0889, "step": 28600 }, { "epoch": 0.29, "grad_norm": 0.22677820920944214, "learning_rate": 0.001, "loss": 0.0915, "step": 28700 }, { "epoch": 0.29, "grad_norm": 0.22208420932292938, "learning_rate": 0.001, "loss": 0.0902, "step": 28800 }, { "epoch": 0.29, "grad_norm": 0.18172180652618408, "learning_rate": 0.001, "loss": 0.091, "step": 28900 }, { "epoch": 0.29, "grad_norm": 0.264664888381958, "learning_rate": 0.001, "loss": 0.091, "step": 29000 }, { "epoch": 0.29, "grad_norm": 0.15961118042469025, "learning_rate": 0.001, "loss": 0.0864, "step": 29100 }, { "epoch": 0.29, "grad_norm": 0.16828449070453644, "learning_rate": 0.001, "loss": 0.0902, "step": 29200 }, { "epoch": 0.29, "grad_norm": 0.25299304723739624, "learning_rate": 0.001, "loss": 0.0895, "step": 29300 }, { "epoch": 0.29, "grad_norm": 0.2019224911928177, "learning_rate": 0.001, "loss": 0.0887, "step": 29400 }, { "epoch": 0.29, "grad_norm": 0.19100870192050934, "learning_rate": 0.001, "loss": 0.0897, "step": 29500 }, { "epoch": 0.3, "grad_norm": 0.25321510434150696, "learning_rate": 0.001, "loss": 0.092, "step": 29600 }, { "epoch": 0.3, "grad_norm": 0.18171149492263794, "learning_rate": 0.001, "loss": 0.089, "step": 29700 }, { "epoch": 0.3, "grad_norm": 0.19380785524845123, "learning_rate": 0.001, "loss": 0.0895, "step": 29800 }, { "epoch": 0.3, "grad_norm": 0.18437138199806213, "learning_rate": 0.001, "loss": 0.0903, "step": 29900 }, { "epoch": 0.3, "grad_norm": 0.1717921495437622, "learning_rate": 0.001, "loss": 0.0885, "step": 30000 }, { "epoch": 0.3, "grad_norm": 0.23623107373714447, "learning_rate": 0.001, "loss": 0.0882, "step": 30100 }, { "epoch": 0.3, "grad_norm": 0.17992794513702393, "learning_rate": 0.001, "loss": 0.0885, "step": 30200 }, { "epoch": 0.3, "grad_norm": 0.19958259165287018, "learning_rate": 0.001, "loss": 0.088, "step": 30300 }, { "epoch": 0.3, "grad_norm": 0.14418841898441315, "learning_rate": 0.001, "loss": 0.0908, "step": 30400 }, { "epoch": 0.3, "grad_norm": 0.13934949040412903, "learning_rate": 0.001, "loss": 0.0919, "step": 30500 }, { "epoch": 0.31, "grad_norm": 0.1410313993692398, "learning_rate": 0.001, "loss": 0.0891, "step": 30600 }, { "epoch": 0.31, "grad_norm": 0.27084311842918396, "learning_rate": 0.001, "loss": 0.0917, "step": 30700 }, { "epoch": 0.31, "grad_norm": 0.18704760074615479, "learning_rate": 0.001, "loss": 0.0866, "step": 30800 }, { "epoch": 0.31, "grad_norm": 0.16178588569164276, "learning_rate": 0.001, "loss": 0.088, "step": 30900 }, { "epoch": 0.31, "grad_norm": 0.1699521839618683, "learning_rate": 0.001, "loss": 0.0891, "step": 31000 }, { "epoch": 0.31, "grad_norm": 0.21340341866016388, "learning_rate": 0.001, "loss": 0.0871, "step": 31100 }, { "epoch": 0.31, "grad_norm": 0.21089456975460052, "learning_rate": 0.001, "loss": 0.0898, "step": 31200 }, { "epoch": 0.31, "grad_norm": 0.17899860441684723, "learning_rate": 0.001, "loss": 0.0874, "step": 31300 }, { "epoch": 0.31, "grad_norm": 0.2222578376531601, "learning_rate": 0.001, "loss": 0.0875, "step": 31400 }, { "epoch": 0.31, "grad_norm": 0.22845357656478882, "learning_rate": 0.001, "loss": 0.0895, "step": 31500 }, { "epoch": 0.32, "grad_norm": 0.22213339805603027, "learning_rate": 0.001, "loss": 0.0877, "step": 31600 }, { "epoch": 0.32, "grad_norm": 0.1989658772945404, "learning_rate": 0.001, "loss": 0.09, "step": 31700 }, { "epoch": 0.32, "grad_norm": 0.28217941522598267, "learning_rate": 0.001, "loss": 0.0869, "step": 31800 }, { "epoch": 0.32, "grad_norm": 0.1880946159362793, "learning_rate": 0.001, "loss": 0.0895, "step": 31900 }, { "epoch": 0.32, "grad_norm": 0.2522743046283722, "learning_rate": 0.001, "loss": 0.0876, "step": 32000 }, { "epoch": 0.32, "grad_norm": 0.15146856009960175, "learning_rate": 0.001, "loss": 0.0892, "step": 32100 }, { "epoch": 0.32, "grad_norm": 0.20138536393642426, "learning_rate": 0.001, "loss": 0.0897, "step": 32200 }, { "epoch": 0.32, "grad_norm": 0.19894324243068695, "learning_rate": 0.001, "loss": 0.089, "step": 32300 }, { "epoch": 0.32, "grad_norm": 0.20011819899082184, "learning_rate": 0.001, "loss": 0.0877, "step": 32400 }, { "epoch": 0.32, "grad_norm": 0.22739243507385254, "learning_rate": 0.001, "loss": 0.0875, "step": 32500 }, { "epoch": 0.33, "grad_norm": 0.16710792481899261, "learning_rate": 0.001, "loss": 0.0852, "step": 32600 }, { "epoch": 0.33, "grad_norm": 0.20454761385917664, "learning_rate": 0.001, "loss": 0.0862, "step": 32700 }, { "epoch": 0.33, "grad_norm": 0.12356776744127274, "learning_rate": 0.001, "loss": 0.0854, "step": 32800 }, { "epoch": 0.33, "grad_norm": 0.18977922201156616, "learning_rate": 0.001, "loss": 0.0915, "step": 32900 }, { "epoch": 0.33, "grad_norm": 0.18791726231575012, "learning_rate": 0.001, "loss": 0.0842, "step": 33000 }, { "epoch": 0.33, "grad_norm": 0.23529213666915894, "learning_rate": 0.001, "loss": 0.086, "step": 33100 }, { "epoch": 0.33, "grad_norm": 0.25430527329444885, "learning_rate": 0.001, "loss": 0.0833, "step": 33200 }, { "epoch": 0.33, "grad_norm": 0.22178427875041962, "learning_rate": 0.001, "loss": 0.0874, "step": 33300 }, { "epoch": 0.33, "grad_norm": 0.27455243468284607, "learning_rate": 0.001, "loss": 0.0845, "step": 33400 }, { "epoch": 0.33, "grad_norm": 0.1998920440673828, "learning_rate": 0.001, "loss": 0.0869, "step": 33500 }, { "epoch": 0.34, "grad_norm": 0.1991311013698578, "learning_rate": 0.001, "loss": 0.0873, "step": 33600 }, { "epoch": 0.34, "grad_norm": 0.2600191831588745, "learning_rate": 0.001, "loss": 0.0869, "step": 33700 }, { "epoch": 0.34, "grad_norm": 0.16889439523220062, "learning_rate": 0.001, "loss": 0.0841, "step": 33800 }, { "epoch": 0.34, "grad_norm": 0.17337612807750702, "learning_rate": 0.001, "loss": 0.0847, "step": 33900 }, { "epoch": 0.34, "grad_norm": 0.12141957134008408, "learning_rate": 0.001, "loss": 0.0873, "step": 34000 }, { "epoch": 0.34, "grad_norm": 0.30542996525764465, "learning_rate": 0.001, "loss": 0.086, "step": 34100 }, { "epoch": 0.34, "grad_norm": 0.256072461605072, "learning_rate": 0.001, "loss": 0.0845, "step": 34200 }, { "epoch": 0.34, "grad_norm": 0.19596265256404877, "learning_rate": 0.001, "loss": 0.0847, "step": 34300 }, { "epoch": 0.34, "grad_norm": 0.17981210350990295, "learning_rate": 0.001, "loss": 0.0853, "step": 34400 }, { "epoch": 0.34, "grad_norm": 0.18695278465747833, "learning_rate": 0.001, "loss": 0.0867, "step": 34500 }, { "epoch": 0.35, "grad_norm": 0.20189203321933746, "learning_rate": 0.001, "loss": 0.0867, "step": 34600 }, { "epoch": 0.35, "grad_norm": 0.20751608908176422, "learning_rate": 0.001, "loss": 0.0855, "step": 34700 }, { "epoch": 0.35, "grad_norm": 0.15412236750125885, "learning_rate": 0.001, "loss": 0.0876, "step": 34800 }, { "epoch": 0.35, "grad_norm": 0.21551938354969025, "learning_rate": 0.001, "loss": 0.0854, "step": 34900 }, { "epoch": 0.35, "grad_norm": 0.15149344503879547, "learning_rate": 0.001, "loss": 0.0863, "step": 35000 }, { "epoch": 0.35, "grad_norm": 0.21960322558879852, "learning_rate": 0.001, "loss": 0.0913, "step": 35100 }, { "epoch": 0.35, "grad_norm": 0.317090779542923, "learning_rate": 0.001, "loss": 0.0832, "step": 35200 }, { "epoch": 0.35, "grad_norm": 0.20051142573356628, "learning_rate": 0.001, "loss": 0.0856, "step": 35300 }, { "epoch": 0.35, "grad_norm": 0.1955852061510086, "learning_rate": 0.001, "loss": 0.0867, "step": 35400 }, { "epoch": 0.35, "grad_norm": 0.13714253902435303, "learning_rate": 0.001, "loss": 0.0864, "step": 35500 }, { "epoch": 0.36, "grad_norm": 0.18536311388015747, "learning_rate": 0.001, "loss": 0.0868, "step": 35600 }, { "epoch": 0.36, "grad_norm": 0.1795514076948166, "learning_rate": 0.001, "loss": 0.0829, "step": 35700 }, { "epoch": 0.36, "grad_norm": 0.1465149074792862, "learning_rate": 0.001, "loss": 0.0851, "step": 35800 }, { "epoch": 0.36, "grad_norm": 0.17687107622623444, "learning_rate": 0.001, "loss": 0.0861, "step": 35900 }, { "epoch": 0.36, "grad_norm": 0.1795363575220108, "learning_rate": 0.001, "loss": 0.0822, "step": 36000 }, { "epoch": 0.36, "grad_norm": 0.1741327941417694, "learning_rate": 0.001, "loss": 0.0847, "step": 36100 }, { "epoch": 0.36, "grad_norm": 0.2547447681427002, "learning_rate": 0.001, "loss": 0.0862, "step": 36200 }, { "epoch": 0.36, "grad_norm": 0.16002462804317474, "learning_rate": 0.001, "loss": 0.0856, "step": 36300 }, { "epoch": 0.36, "grad_norm": 0.14787407219409943, "learning_rate": 0.001, "loss": 0.0844, "step": 36400 }, { "epoch": 0.36, "grad_norm": 0.23449848592281342, "learning_rate": 0.001, "loss": 0.0823, "step": 36500 }, { "epoch": 0.37, "grad_norm": 0.18626731634140015, "learning_rate": 0.001, "loss": 0.0804, "step": 36600 }, { "epoch": 0.37, "grad_norm": 0.1434779316186905, "learning_rate": 0.001, "loss": 0.0844, "step": 36700 }, { "epoch": 0.37, "grad_norm": 0.1594706028699875, "learning_rate": 0.001, "loss": 0.0869, "step": 36800 }, { "epoch": 0.37, "grad_norm": 0.18195496499538422, "learning_rate": 0.001, "loss": 0.0846, "step": 36900 }, { "epoch": 0.37, "grad_norm": 0.18613013625144958, "learning_rate": 0.001, "loss": 0.0872, "step": 37000 }, { "epoch": 0.37, "grad_norm": 0.16158261895179749, "learning_rate": 0.001, "loss": 0.0846, "step": 37100 }, { "epoch": 0.37, "grad_norm": 0.17811179161071777, "learning_rate": 0.001, "loss": 0.0832, "step": 37200 }, { "epoch": 0.37, "grad_norm": 0.24112731218338013, "learning_rate": 0.001, "loss": 0.0804, "step": 37300 }, { "epoch": 0.37, "grad_norm": 0.1778961569070816, "learning_rate": 0.001, "loss": 0.0837, "step": 37400 }, { "epoch": 0.37, "grad_norm": 0.18162128329277039, "learning_rate": 0.001, "loss": 0.0867, "step": 37500 }, { "epoch": 0.38, "grad_norm": 0.15079495310783386, "learning_rate": 0.001, "loss": 0.0829, "step": 37600 }, { "epoch": 0.38, "grad_norm": 0.26986435055732727, "learning_rate": 0.001, "loss": 0.0843, "step": 37700 }, { "epoch": 0.38, "grad_norm": 0.2643984854221344, "learning_rate": 0.001, "loss": 0.0829, "step": 37800 }, { "epoch": 0.38, "grad_norm": 0.281751424074173, "learning_rate": 0.001, "loss": 0.0821, "step": 37900 }, { "epoch": 0.38, "grad_norm": 0.23095449805259705, "learning_rate": 0.001, "loss": 0.0836, "step": 38000 }, { "epoch": 0.38, "grad_norm": 0.18625666201114655, "learning_rate": 0.001, "loss": 0.0831, "step": 38100 }, { "epoch": 0.38, "grad_norm": 0.13689708709716797, "learning_rate": 0.001, "loss": 0.0839, "step": 38200 }, { "epoch": 0.38, "grad_norm": 0.14063656330108643, "learning_rate": 0.001, "loss": 0.0817, "step": 38300 }, { "epoch": 0.38, "grad_norm": 0.1880202442407608, "learning_rate": 0.001, "loss": 0.082, "step": 38400 }, { "epoch": 0.38, "grad_norm": 0.15921075642108917, "learning_rate": 0.001, "loss": 0.0789, "step": 38500 }, { "epoch": 0.39, "grad_norm": 0.1744866818189621, "learning_rate": 0.001, "loss": 0.0818, "step": 38600 }, { "epoch": 0.39, "grad_norm": 0.26724693179130554, "learning_rate": 0.001, "loss": 0.0847, "step": 38700 }, { "epoch": 0.39, "grad_norm": 0.14382457733154297, "learning_rate": 0.001, "loss": 0.0829, "step": 38800 }, { "epoch": 0.39, "grad_norm": 0.14012865722179413, "learning_rate": 0.001, "loss": 0.082, "step": 38900 }, { "epoch": 0.39, "grad_norm": 0.24175578355789185, "learning_rate": 0.001, "loss": 0.0835, "step": 39000 }, { "epoch": 0.39, "grad_norm": 0.3397182822227478, "learning_rate": 0.001, "loss": 0.081, "step": 39100 }, { "epoch": 0.39, "grad_norm": 0.1553467959165573, "learning_rate": 0.001, "loss": 0.0829, "step": 39200 }, { "epoch": 0.39, "grad_norm": 0.20726840198040009, "learning_rate": 0.001, "loss": 0.083, "step": 39300 }, { "epoch": 0.39, "grad_norm": 0.21219220757484436, "learning_rate": 0.001, "loss": 0.084, "step": 39400 }, { "epoch": 0.39, "grad_norm": 0.19203193485736847, "learning_rate": 0.001, "loss": 0.0819, "step": 39500 }, { "epoch": 0.4, "grad_norm": 0.22557440400123596, "learning_rate": 0.001, "loss": 0.0803, "step": 39600 }, { "epoch": 0.4, "grad_norm": 0.23452799022197723, "learning_rate": 0.001, "loss": 0.0806, "step": 39700 }, { "epoch": 0.4, "grad_norm": 0.28543928265571594, "learning_rate": 0.001, "loss": 0.0827, "step": 39800 }, { "epoch": 0.4, "grad_norm": 0.19713571667671204, "learning_rate": 0.001, "loss": 0.08, "step": 39900 }, { "epoch": 0.4, "grad_norm": 0.18496285378932953, "learning_rate": 0.001, "loss": 0.0841, "step": 40000 }, { "epoch": 0.4, "grad_norm": 0.1363649070262909, "learning_rate": 0.001, "loss": 0.0813, "step": 40100 }, { "epoch": 0.4, "grad_norm": 0.1736011952161789, "learning_rate": 0.001, "loss": 0.0796, "step": 40200 }, { "epoch": 0.4, "grad_norm": 0.21385334432125092, "learning_rate": 0.001, "loss": 0.0814, "step": 40300 }, { "epoch": 0.4, "grad_norm": 0.2105669230222702, "learning_rate": 0.001, "loss": 0.0816, "step": 40400 }, { "epoch": 0.4, "grad_norm": 0.2278176248073578, "learning_rate": 0.001, "loss": 0.0825, "step": 40500 }, { "epoch": 0.41, "grad_norm": 0.17637114226818085, "learning_rate": 0.001, "loss": 0.0812, "step": 40600 }, { "epoch": 0.41, "grad_norm": 0.20035295188426971, "learning_rate": 0.001, "loss": 0.0853, "step": 40700 }, { "epoch": 0.41, "grad_norm": 0.25408777594566345, "learning_rate": 0.001, "loss": 0.0811, "step": 40800 }, { "epoch": 0.41, "grad_norm": 0.2177010476589203, "learning_rate": 0.001, "loss": 0.0796, "step": 40900 }, { "epoch": 0.41, "grad_norm": 0.1639321744441986, "learning_rate": 0.001, "loss": 0.0824, "step": 41000 }, { "epoch": 0.41, "grad_norm": 0.15798155963420868, "learning_rate": 0.001, "loss": 0.0834, "step": 41100 }, { "epoch": 0.41, "grad_norm": 0.14857494831085205, "learning_rate": 0.001, "loss": 0.0825, "step": 41200 }, { "epoch": 0.41, "grad_norm": 0.15640319883823395, "learning_rate": 0.001, "loss": 0.0814, "step": 41300 }, { "epoch": 0.41, "grad_norm": 0.1530522108078003, "learning_rate": 0.001, "loss": 0.0825, "step": 41400 }, { "epoch": 0.41, "grad_norm": 0.2990354001522064, "learning_rate": 0.001, "loss": 0.0785, "step": 41500 }, { "epoch": 0.42, "grad_norm": 0.19239626824855804, "learning_rate": 0.001, "loss": 0.0809, "step": 41600 }, { "epoch": 0.42, "grad_norm": 0.13975249230861664, "learning_rate": 0.001, "loss": 0.0825, "step": 41700 }, { "epoch": 0.42, "grad_norm": 0.22527189552783966, "learning_rate": 0.001, "loss": 0.0819, "step": 41800 }, { "epoch": 0.42, "grad_norm": 0.3547128438949585, "learning_rate": 0.001, "loss": 0.1013, "step": 41900 }, { "epoch": 0.42, "grad_norm": 0.22032135725021362, "learning_rate": 0.001, "loss": 0.0806, "step": 42000 }, { "epoch": 0.42, "grad_norm": 0.12712807953357697, "learning_rate": 0.001, "loss": 0.0791, "step": 42100 }, { "epoch": 0.42, "grad_norm": 0.29608944058418274, "learning_rate": 0.001, "loss": 0.0783, "step": 42200 }, { "epoch": 0.42, "grad_norm": 0.23063918948173523, "learning_rate": 0.001, "loss": 0.0828, "step": 42300 }, { "epoch": 0.42, "grad_norm": 0.19996796548366547, "learning_rate": 0.001, "loss": 0.0813, "step": 42400 }, { "epoch": 0.42, "grad_norm": 0.19479811191558838, "learning_rate": 0.001, "loss": 0.0811, "step": 42500 }, { "epoch": 0.43, "grad_norm": 0.1822797805070877, "learning_rate": 0.001, "loss": 0.0796, "step": 42600 }, { "epoch": 0.43, "grad_norm": 0.36260533332824707, "learning_rate": 0.001, "loss": 0.0797, "step": 42700 }, { "epoch": 0.43, "grad_norm": 0.14315147697925568, "learning_rate": 0.001, "loss": 0.0869, "step": 42800 }, { "epoch": 0.43, "grad_norm": 0.20261742174625397, "learning_rate": 0.001, "loss": 0.1856, "step": 42900 }, { "epoch": 0.43, "grad_norm": 0.18873733282089233, "learning_rate": 0.001, "loss": 0.0775, "step": 43000 }, { "epoch": 0.43, "grad_norm": 0.2189916968345642, "learning_rate": 0.001, "loss": 0.0796, "step": 43100 }, { "epoch": 0.43, "grad_norm": 0.1823868304491043, "learning_rate": 0.001, "loss": 0.0822, "step": 43200 }, { "epoch": 0.43, "grad_norm": 0.2595207691192627, "learning_rate": 0.001, "loss": 0.0776, "step": 43300 }, { "epoch": 0.43, "grad_norm": 0.1713092178106308, "learning_rate": 0.001, "loss": 0.0811, "step": 43400 }, { "epoch": 0.43, "grad_norm": 0.24840323626995087, "learning_rate": 0.001, "loss": 0.104, "step": 43500 }, { "epoch": 0.44, "grad_norm": 0.23451556265354156, "learning_rate": 0.001, "loss": 0.077, "step": 43600 }, { "epoch": 0.44, "grad_norm": 0.2142404466867447, "learning_rate": 0.001, "loss": 0.0789, "step": 43700 }, { "epoch": 0.44, "grad_norm": 0.22932325303554535, "learning_rate": 0.001, "loss": 0.0778, "step": 43800 }, { "epoch": 0.44, "grad_norm": 0.2027159184217453, "learning_rate": 0.001, "loss": 0.0794, "step": 43900 }, { "epoch": 0.44, "grad_norm": 0.22258317470550537, "learning_rate": 0.001, "loss": 0.0787, "step": 44000 }, { "epoch": 0.44, "grad_norm": 0.2979215681552887, "learning_rate": 0.001, "loss": 0.0767, "step": 44100 }, { "epoch": 0.44, "grad_norm": 0.2110917568206787, "learning_rate": 0.001, "loss": 0.0782, "step": 44200 }, { "epoch": 0.44, "grad_norm": 0.24181802570819855, "learning_rate": 0.001, "loss": 0.0804, "step": 44300 }, { "epoch": 0.44, "grad_norm": 0.1810845136642456, "learning_rate": 0.001, "loss": 0.0786, "step": 44400 }, { "epoch": 0.44, "grad_norm": 0.23404444754123688, "learning_rate": 0.001, "loss": 0.0785, "step": 44500 }, { "epoch": 0.45, "grad_norm": 0.2591089904308319, "learning_rate": 0.001, "loss": 0.0765, "step": 44600 }, { "epoch": 0.45, "grad_norm": 0.22720029950141907, "learning_rate": 0.001, "loss": 0.0798, "step": 44700 }, { "epoch": 0.45, "grad_norm": 0.22449086606502533, "learning_rate": 0.001, "loss": 0.0766, "step": 44800 }, { "epoch": 0.45, "grad_norm": 0.2302643209695816, "learning_rate": 0.001, "loss": 0.0798, "step": 44900 }, { "epoch": 0.45, "grad_norm": 0.2040921300649643, "learning_rate": 0.001, "loss": 0.0841, "step": 45000 }, { "epoch": 0.45, "grad_norm": 0.21232621371746063, "learning_rate": 0.001, "loss": 0.0789, "step": 45100 }, { "epoch": 0.45, "grad_norm": 0.20054876804351807, "learning_rate": 0.001, "loss": 0.0779, "step": 45200 }, { "epoch": 0.45, "grad_norm": 0.24335692822933197, "learning_rate": 0.001, "loss": 0.0784, "step": 45300 }, { "epoch": 0.45, "grad_norm": 0.22172445058822632, "learning_rate": 0.001, "loss": 0.0797, "step": 45400 }, { "epoch": 0.45, "grad_norm": 0.20524169504642487, "learning_rate": 0.001, "loss": 0.0803, "step": 45500 }, { "epoch": 0.46, "grad_norm": 0.17150288820266724, "learning_rate": 0.001, "loss": 0.0791, "step": 45600 }, { "epoch": 0.46, "grad_norm": 0.38285690546035767, "learning_rate": 0.001, "loss": 0.079, "step": 45700 }, { "epoch": 0.46, "grad_norm": 0.16937342286109924, "learning_rate": 0.001, "loss": 0.0791, "step": 45800 }, { "epoch": 0.46, "grad_norm": 0.19271647930145264, "learning_rate": 0.001, "loss": 0.079, "step": 45900 }, { "epoch": 0.46, "grad_norm": 0.20048774778842926, "learning_rate": 0.001, "loss": 0.0797, "step": 46000 }, { "epoch": 0.46, "grad_norm": 0.2141706347465515, "learning_rate": 0.001, "loss": 0.0798, "step": 46100 }, { "epoch": 0.46, "grad_norm": 0.20665834844112396, "learning_rate": 0.001, "loss": 0.0778, "step": 46200 }, { "epoch": 0.46, "grad_norm": 0.18385255336761475, "learning_rate": 0.001, "loss": 0.0779, "step": 46300 }, { "epoch": 0.46, "grad_norm": 0.22467826306819916, "learning_rate": 0.001, "loss": 0.0732, "step": 46400 }, { "epoch": 0.46, "grad_norm": 0.18363313376903534, "learning_rate": 0.001, "loss": 0.0796, "step": 46500 }, { "epoch": 0.47, "grad_norm": 0.2288578897714615, "learning_rate": 0.001, "loss": 0.0763, "step": 46600 }, { "epoch": 0.47, "grad_norm": 0.2535518407821655, "learning_rate": 0.001, "loss": 0.0791, "step": 46700 }, { "epoch": 0.47, "grad_norm": 0.20715934038162231, "learning_rate": 0.001, "loss": 0.0777, "step": 46800 }, { "epoch": 0.47, "grad_norm": 0.12203960865736008, "learning_rate": 0.001, "loss": 0.0805, "step": 46900 }, { "epoch": 0.47, "grad_norm": 0.138369619846344, "learning_rate": 0.001, "loss": 0.0768, "step": 47000 }, { "epoch": 0.47, "grad_norm": 0.2319127321243286, "learning_rate": 0.001, "loss": 0.0784, "step": 47100 }, { "epoch": 0.47, "grad_norm": 0.2058788686990738, "learning_rate": 0.001, "loss": 0.0783, "step": 47200 }, { "epoch": 0.47, "grad_norm": 0.21334126591682434, "learning_rate": 0.001, "loss": 0.0763, "step": 47300 }, { "epoch": 0.47, "grad_norm": 0.23397529125213623, "learning_rate": 0.001, "loss": 0.081, "step": 47400 }, { "epoch": 0.47, "grad_norm": 0.24460141360759735, "learning_rate": 0.001, "loss": 0.0752, "step": 47500 }, { "epoch": 0.48, "grad_norm": 0.22441798448562622, "learning_rate": 0.001, "loss": 0.0779, "step": 47600 }, { "epoch": 0.48, "grad_norm": 0.20988881587982178, "learning_rate": 0.001, "loss": 0.08, "step": 47700 }, { "epoch": 0.48, "grad_norm": 0.17863024771213531, "learning_rate": 0.001, "loss": 0.0787, "step": 47800 }, { "epoch": 0.48, "grad_norm": 0.17980898916721344, "learning_rate": 0.001, "loss": 0.0802, "step": 47900 }, { "epoch": 0.48, "grad_norm": 0.2614147961139679, "learning_rate": 0.001, "loss": 0.0787, "step": 48000 }, { "epoch": 0.48, "grad_norm": 0.16281504929065704, "learning_rate": 0.001, "loss": 0.0779, "step": 48100 }, { "epoch": 0.48, "grad_norm": 0.3099921941757202, "learning_rate": 0.001, "loss": 0.0747, "step": 48200 }, { "epoch": 0.48, "grad_norm": 0.2542015016078949, "learning_rate": 0.001, "loss": 0.0831, "step": 48300 }, { "epoch": 0.48, "grad_norm": 0.17419801652431488, "learning_rate": 0.001, "loss": 0.0787, "step": 48400 }, { "epoch": 0.48, "grad_norm": 0.2089216262102127, "learning_rate": 0.001, "loss": 0.0781, "step": 48500 }, { "epoch": 0.49, "grad_norm": 0.26476818323135376, "learning_rate": 0.001, "loss": 0.0792, "step": 48600 }, { "epoch": 0.49, "grad_norm": 0.18907053768634796, "learning_rate": 0.001, "loss": 0.078, "step": 48700 }, { "epoch": 0.49, "grad_norm": 0.2528514564037323, "learning_rate": 0.001, "loss": 0.0791, "step": 48800 }, { "epoch": 0.49, "grad_norm": 0.2794158458709717, "learning_rate": 0.001, "loss": 0.0799, "step": 48900 }, { "epoch": 0.49, "grad_norm": 0.24547474086284637, "learning_rate": 0.001, "loss": 0.0765, "step": 49000 }, { "epoch": 0.49, "grad_norm": 0.17239224910736084, "learning_rate": 0.001, "loss": 0.0807, "step": 49100 }, { "epoch": 0.49, "grad_norm": 0.22998745739459991, "learning_rate": 0.001, "loss": 0.079, "step": 49200 }, { "epoch": 0.49, "grad_norm": 0.2727990746498108, "learning_rate": 0.001, "loss": 0.078, "step": 49300 }, { "epoch": 0.49, "grad_norm": 0.2488749623298645, "learning_rate": 0.001, "loss": 0.0757, "step": 49400 }, { "epoch": 0.49, "grad_norm": 0.20260153710842133, "learning_rate": 0.001, "loss": 0.0787, "step": 49500 }, { "epoch": 0.5, "grad_norm": 0.30832308530807495, "learning_rate": 0.001, "loss": 0.0789, "step": 49600 }, { "epoch": 0.5, "grad_norm": 0.17934545874595642, "learning_rate": 0.001, "loss": 0.0768, "step": 49700 }, { "epoch": 0.5, "grad_norm": 0.1972292810678482, "learning_rate": 0.001, "loss": 0.0786, "step": 49800 }, { "epoch": 0.5, "grad_norm": 0.1899816393852234, "learning_rate": 0.001, "loss": 0.0782, "step": 49900 }, { "epoch": 0.5, "grad_norm": 0.17765800654888153, "learning_rate": 0.001, "loss": 0.0784, "step": 50000 }, { "epoch": 0.5, "grad_norm": 0.3285583555698395, "learning_rate": 0.001, "loss": 0.0793, "step": 50100 }, { "epoch": 0.5, "grad_norm": 0.2769279181957245, "learning_rate": 0.001, "loss": 0.0818, "step": 50200 }, { "epoch": 0.5, "grad_norm": 0.1661899834871292, "learning_rate": 0.001, "loss": 0.1088, "step": 50300 }, { "epoch": 0.5, "grad_norm": 0.32694903016090393, "learning_rate": 0.001, "loss": 0.0799, "step": 50400 }, { "epoch": 0.5, "grad_norm": 0.1976955235004425, "learning_rate": 0.001, "loss": 0.0768, "step": 50500 }, { "epoch": 0.51, "grad_norm": 0.2623777687549591, "learning_rate": 0.001, "loss": 0.0764, "step": 50600 }, { "epoch": 0.51, "grad_norm": 0.19917914271354675, "learning_rate": 0.001, "loss": 0.079, "step": 50700 }, { "epoch": 0.51, "grad_norm": 0.22838640213012695, "learning_rate": 0.001, "loss": 0.076, "step": 50800 }, { "epoch": 0.51, "grad_norm": 0.1831175684928894, "learning_rate": 0.001, "loss": 0.0744, "step": 50900 }, { "epoch": 0.51, "grad_norm": 0.1774362176656723, "learning_rate": 0.001, "loss": 0.076, "step": 51000 }, { "epoch": 0.51, "grad_norm": 0.24986374378204346, "learning_rate": 0.001, "loss": 0.0754, "step": 51100 }, { "epoch": 0.51, "grad_norm": 0.15164266526699066, "learning_rate": 0.001, "loss": 0.0757, "step": 51200 }, { "epoch": 0.51, "grad_norm": 0.19118934869766235, "learning_rate": 0.001, "loss": 0.0787, "step": 51300 }, { "epoch": 0.51, "grad_norm": 0.1625840663909912, "learning_rate": 0.001, "loss": 0.0778, "step": 51400 }, { "epoch": 0.51, "grad_norm": 0.14519533514976501, "learning_rate": 0.001, "loss": 0.077, "step": 51500 }, { "epoch": 0.52, "grad_norm": 0.16799670457839966, "learning_rate": 0.001, "loss": 0.0764, "step": 51600 }, { "epoch": 0.52, "grad_norm": 0.15635591745376587, "learning_rate": 0.001, "loss": 0.0738, "step": 51700 }, { "epoch": 0.52, "grad_norm": 0.25875189900398254, "learning_rate": 0.001, "loss": 0.0757, "step": 51800 }, { "epoch": 0.52, "grad_norm": 0.2601448595523834, "learning_rate": 0.001, "loss": 0.0721, "step": 51900 }, { "epoch": 0.52, "grad_norm": 0.20097233355045319, "learning_rate": 0.001, "loss": 0.0764, "step": 52000 }, { "epoch": 0.52, "grad_norm": 0.17383421957492828, "learning_rate": 0.001, "loss": 0.0768, "step": 52100 }, { "epoch": 0.52, "grad_norm": 0.152663916349411, "learning_rate": 0.001, "loss": 0.0747, "step": 52200 }, { "epoch": 0.52, "grad_norm": 0.1773347705602646, "learning_rate": 0.001, "loss": 0.0743, "step": 52300 }, { "epoch": 0.52, "grad_norm": 0.15975210070610046, "learning_rate": 0.001, "loss": 0.0769, "step": 52400 }, { "epoch": 0.52, "grad_norm": 0.27663958072662354, "learning_rate": 0.001, "loss": 0.0747, "step": 52500 }, { "epoch": 0.53, "grad_norm": 0.20124509930610657, "learning_rate": 0.001, "loss": 0.0755, "step": 52600 }, { "epoch": 0.53, "grad_norm": 0.19016942381858826, "learning_rate": 0.001, "loss": 0.0709, "step": 52700 }, { "epoch": 0.53, "grad_norm": 0.34517988562583923, "learning_rate": 0.001, "loss": 0.0751, "step": 52800 }, { "epoch": 0.53, "grad_norm": 0.27312055230140686, "learning_rate": 0.001, "loss": 0.0761, "step": 52900 }, { "epoch": 0.53, "grad_norm": 0.2835043668746948, "learning_rate": 0.001, "loss": 0.0731, "step": 53000 }, { "epoch": 0.53, "grad_norm": 0.1630600243806839, "learning_rate": 0.001, "loss": 0.0741, "step": 53100 }, { "epoch": 0.53, "grad_norm": 0.2430613487958908, "learning_rate": 0.001, "loss": 0.0767, "step": 53200 }, { "epoch": 0.53, "grad_norm": 0.19533057510852814, "learning_rate": 0.001, "loss": 0.077, "step": 53300 }, { "epoch": 0.53, "grad_norm": 0.21139401197433472, "learning_rate": 0.001, "loss": 0.0711, "step": 53400 }, { "epoch": 0.53, "grad_norm": 0.18416912853717804, "learning_rate": 0.001, "loss": 0.0729, "step": 53500 }, { "epoch": 0.54, "grad_norm": 0.24703727662563324, "learning_rate": 0.001, "loss": 0.071, "step": 53600 }, { "epoch": 0.54, "grad_norm": 0.14476247131824493, "learning_rate": 0.001, "loss": 0.0754, "step": 53700 }, { "epoch": 0.54, "grad_norm": 0.210220068693161, "learning_rate": 0.001, "loss": 0.0738, "step": 53800 }, { "epoch": 0.54, "grad_norm": 0.16544660925865173, "learning_rate": 0.001, "loss": 0.072, "step": 53900 }, { "epoch": 0.54, "grad_norm": 0.17049700021743774, "learning_rate": 0.001, "loss": 0.0728, "step": 54000 }, { "epoch": 0.54, "grad_norm": 0.18656505644321442, "learning_rate": 0.001, "loss": 0.0739, "step": 54100 }, { "epoch": 0.54, "grad_norm": 0.19484791159629822, "learning_rate": 0.001, "loss": 0.0748, "step": 54200 }, { "epoch": 0.54, "grad_norm": 0.1982715129852295, "learning_rate": 0.001, "loss": 0.0729, "step": 54300 }, { "epoch": 0.54, "grad_norm": 0.2108699083328247, "learning_rate": 0.001, "loss": 0.0735, "step": 54400 }, { "epoch": 0.54, "grad_norm": 0.23962444067001343, "learning_rate": 0.001, "loss": 0.0703, "step": 54500 }, { "epoch": 0.55, "grad_norm": 0.29319801926612854, "learning_rate": 0.001, "loss": 0.0735, "step": 54600 }, { "epoch": 0.55, "grad_norm": 0.1804085075855255, "learning_rate": 0.001, "loss": 0.0719, "step": 54700 }, { "epoch": 0.55, "grad_norm": 0.2394474297761917, "learning_rate": 0.001, "loss": 0.0721, "step": 54800 }, { "epoch": 0.55, "grad_norm": 0.20954197645187378, "learning_rate": 0.001, "loss": 0.0745, "step": 54900 }, { "epoch": 0.55, "grad_norm": 0.17135080695152283, "learning_rate": 0.001, "loss": 0.0728, "step": 55000 }, { "epoch": 0.55, "grad_norm": 0.3152260482311249, "learning_rate": 0.001, "loss": 0.0735, "step": 55100 }, { "epoch": 0.55, "grad_norm": 0.22659769654273987, "learning_rate": 0.001, "loss": 0.0752, "step": 55200 }, { "epoch": 0.55, "grad_norm": 0.2605753540992737, "learning_rate": 0.001, "loss": 0.073, "step": 55300 }, { "epoch": 0.55, "grad_norm": 0.2309567779302597, "learning_rate": 0.001, "loss": 0.0744, "step": 55400 }, { "epoch": 0.55, "grad_norm": 0.19917166233062744, "learning_rate": 0.001, "loss": 0.073, "step": 55500 }, { "epoch": 0.56, "grad_norm": 0.2609159052371979, "learning_rate": 0.001, "loss": 0.0705, "step": 55600 }, { "epoch": 0.56, "grad_norm": 0.26976123452186584, "learning_rate": 0.001, "loss": 0.0731, "step": 55700 }, { "epoch": 0.56, "grad_norm": 0.25275784730911255, "learning_rate": 0.001, "loss": 0.0808, "step": 55800 }, { "epoch": 0.56, "grad_norm": 0.2392340749502182, "learning_rate": 0.001, "loss": 0.0763, "step": 55900 }, { "epoch": 0.56, "grad_norm": 0.27718254923820496, "learning_rate": 0.001, "loss": 0.0743, "step": 56000 }, { "epoch": 0.56, "grad_norm": 0.19996067881584167, "learning_rate": 0.001, "loss": 0.0807, "step": 56100 }, { "epoch": 0.56, "grad_norm": 0.16322393715381622, "learning_rate": 0.001, "loss": 0.0753, "step": 56200 }, { "epoch": 0.56, "grad_norm": 0.25598809123039246, "learning_rate": 0.001, "loss": 0.0773, "step": 56300 }, { "epoch": 0.56, "grad_norm": 0.15482768416404724, "learning_rate": 0.001, "loss": 0.0729, "step": 56400 }, { "epoch": 0.56, "grad_norm": 0.4033351242542267, "learning_rate": 0.001, "loss": 0.0773, "step": 56500 }, { "epoch": 0.57, "grad_norm": 0.2869590222835541, "learning_rate": 0.001, "loss": 0.0732, "step": 56600 }, { "epoch": 0.57, "grad_norm": 0.19079795479774475, "learning_rate": 0.001, "loss": 0.0712, "step": 56700 }, { "epoch": 0.57, "grad_norm": 0.21604031324386597, "learning_rate": 0.001, "loss": 0.0714, "step": 56800 }, { "epoch": 0.57, "grad_norm": 0.23917321860790253, "learning_rate": 0.001, "loss": 0.0743, "step": 56900 }, { "epoch": 0.57, "grad_norm": 0.16785088181495667, "learning_rate": 0.001, "loss": 0.0722, "step": 57000 }, { "epoch": 0.57, "grad_norm": 0.22009502351284027, "learning_rate": 0.001, "loss": 0.0738, "step": 57100 }, { "epoch": 0.57, "grad_norm": 0.23401811718940735, "learning_rate": 0.001, "loss": 0.0759, "step": 57200 }, { "epoch": 0.57, "grad_norm": 0.19278208911418915, "learning_rate": 0.001, "loss": 0.0738, "step": 57300 }, { "epoch": 0.57, "grad_norm": 0.22170820832252502, "learning_rate": 0.001, "loss": 0.07, "step": 57400 }, { "epoch": 0.57, "grad_norm": 0.2148713767528534, "learning_rate": 0.001, "loss": 0.0716, "step": 57500 }, { "epoch": 0.58, "grad_norm": 0.2093653529882431, "learning_rate": 0.001, "loss": 0.0722, "step": 57600 }, { "epoch": 0.58, "grad_norm": 0.2912674844264984, "learning_rate": 0.001, "loss": 0.0738, "step": 57700 }, { "epoch": 0.58, "grad_norm": 0.3146283030509949, "learning_rate": 0.001, "loss": 0.0735, "step": 57800 }, { "epoch": 0.58, "grad_norm": 0.2355007380247116, "learning_rate": 0.001, "loss": 0.0719, "step": 57900 }, { "epoch": 0.58, "grad_norm": 0.19035007059574127, "learning_rate": 0.001, "loss": 0.0699, "step": 58000 }, { "epoch": 0.58, "grad_norm": 0.13338258862495422, "learning_rate": 0.001, "loss": 0.0727, "step": 58100 }, { "epoch": 0.58, "grad_norm": 0.22755542397499084, "learning_rate": 0.001, "loss": 0.072, "step": 58200 }, { "epoch": 0.58, "grad_norm": 0.23752057552337646, "learning_rate": 0.001, "loss": 0.0703, "step": 58300 }, { "epoch": 0.58, "grad_norm": 0.20008322596549988, "learning_rate": 0.001, "loss": 0.0721, "step": 58400 }, { "epoch": 0.58, "grad_norm": 0.1769803911447525, "learning_rate": 0.001, "loss": 0.0724, "step": 58500 }, { "epoch": 0.59, "grad_norm": 0.19137178361415863, "learning_rate": 0.001, "loss": 0.0735, "step": 58600 }, { "epoch": 0.59, "grad_norm": 0.22157849371433258, "learning_rate": 0.001, "loss": 0.0735, "step": 58700 }, { "epoch": 0.59, "grad_norm": 0.2098543494939804, "learning_rate": 0.001, "loss": 0.0701, "step": 58800 }, { "epoch": 0.59, "grad_norm": 0.22936704754829407, "learning_rate": 0.001, "loss": 0.0691, "step": 58900 }, { "epoch": 0.59, "grad_norm": 0.15228866040706635, "learning_rate": 0.001, "loss": 0.0729, "step": 59000 }, { "epoch": 0.59, "grad_norm": 0.27094388008117676, "learning_rate": 0.001, "loss": 0.0706, "step": 59100 }, { "epoch": 0.59, "grad_norm": 0.17357999086380005, "learning_rate": 0.001, "loss": 0.071, "step": 59200 }, { "epoch": 0.59, "grad_norm": 0.2912188768386841, "learning_rate": 0.001, "loss": 0.0719, "step": 59300 }, { "epoch": 0.59, "grad_norm": 0.24029956758022308, "learning_rate": 0.001, "loss": 0.07, "step": 59400 }, { "epoch": 0.59, "grad_norm": 0.1956549882888794, "learning_rate": 0.001, "loss": 0.0712, "step": 59500 }, { "epoch": 0.6, "grad_norm": 0.26984256505966187, "learning_rate": 0.001, "loss": 0.0713, "step": 59600 }, { "epoch": 0.6, "grad_norm": 0.18548165261745453, "learning_rate": 0.001, "loss": 0.0686, "step": 59700 }, { "epoch": 0.6, "grad_norm": 0.1833103895187378, "learning_rate": 0.001, "loss": 0.0672, "step": 59800 }, { "epoch": 0.6, "grad_norm": 0.20417752861976624, "learning_rate": 0.001, "loss": 0.069, "step": 59900 }, { "epoch": 0.6, "grad_norm": 0.3695315420627594, "learning_rate": 0.001, "loss": 0.0703, "step": 60000 }, { "epoch": 0.6, "grad_norm": 0.23288464546203613, "learning_rate": 0.001, "loss": 0.0704, "step": 60100 }, { "epoch": 0.6, "grad_norm": 0.21595774590969086, "learning_rate": 0.001, "loss": 0.0697, "step": 60200 }, { "epoch": 0.6, "grad_norm": 0.16371206939220428, "learning_rate": 0.001, "loss": 0.0704, "step": 60300 }, { "epoch": 0.6, "grad_norm": 0.2600916028022766, "learning_rate": 0.001, "loss": 0.0693, "step": 60400 }, { "epoch": 0.6, "grad_norm": 0.21177971363067627, "learning_rate": 0.001, "loss": 0.0707, "step": 60500 }, { "epoch": 0.61, "grad_norm": 0.16886168718338013, "learning_rate": 0.001, "loss": 0.0701, "step": 60600 }, { "epoch": 0.61, "grad_norm": 0.29835718870162964, "learning_rate": 0.001, "loss": 0.0683, "step": 60700 }, { "epoch": 0.61, "grad_norm": 0.2594737410545349, "learning_rate": 0.001, "loss": 0.0723, "step": 60800 }, { "epoch": 0.61, "grad_norm": 0.2057715505361557, "learning_rate": 0.001, "loss": 0.0693, "step": 60900 }, { "epoch": 0.61, "grad_norm": 0.2127043902873993, "learning_rate": 0.001, "loss": 0.0699, "step": 61000 }, { "epoch": 0.61, "grad_norm": 0.18162322044372559, "learning_rate": 0.001, "loss": 0.0714, "step": 61100 }, { "epoch": 0.61, "grad_norm": 0.21535515785217285, "learning_rate": 0.001, "loss": 0.0711, "step": 61200 }, { "epoch": 0.61, "grad_norm": 0.19364242255687714, "learning_rate": 0.001, "loss": 0.0715, "step": 61300 }, { "epoch": 0.61, "grad_norm": 0.14159826934337616, "learning_rate": 0.001, "loss": 0.07, "step": 61400 }, { "epoch": 0.61, "grad_norm": 0.21536406874656677, "learning_rate": 0.001, "loss": 0.0689, "step": 61500 }, { "epoch": 0.62, "grad_norm": 0.19926196336746216, "learning_rate": 0.001, "loss": 0.0689, "step": 61600 }, { "epoch": 0.62, "grad_norm": 0.20217150449752808, "learning_rate": 0.001, "loss": 0.071, "step": 61700 }, { "epoch": 0.62, "grad_norm": 0.17570650577545166, "learning_rate": 0.001, "loss": 0.0719, "step": 61800 }, { "epoch": 0.62, "grad_norm": 0.19788751006126404, "learning_rate": 0.001, "loss": 0.0687, "step": 61900 }, { "epoch": 0.62, "grad_norm": 0.22191910445690155, "learning_rate": 0.001, "loss": 0.0687, "step": 62000 }, { "epoch": 0.62, "grad_norm": 0.19544494152069092, "learning_rate": 0.001, "loss": 0.0704, "step": 62100 }, { "epoch": 0.62, "grad_norm": 0.32939237356185913, "learning_rate": 0.001, "loss": 0.0713, "step": 62200 }, { "epoch": 0.62, "grad_norm": 0.1809149980545044, "learning_rate": 0.001, "loss": 0.0701, "step": 62300 }, { "epoch": 0.62, "grad_norm": 0.2769867479801178, "learning_rate": 0.001, "loss": 0.0718, "step": 62400 }, { "epoch": 0.62, "grad_norm": 0.15998759865760803, "learning_rate": 0.001, "loss": 0.0691, "step": 62500 }, { "epoch": 0.63, "grad_norm": 0.29498517513275146, "learning_rate": 0.001, "loss": 0.0722, "step": 62600 }, { "epoch": 0.63, "grad_norm": 0.19759228825569153, "learning_rate": 0.001, "loss": 0.0686, "step": 62700 }, { "epoch": 0.63, "grad_norm": 0.12064652889966965, "learning_rate": 0.001, "loss": 0.0707, "step": 62800 }, { "epoch": 0.63, "grad_norm": 0.19079501926898956, "learning_rate": 0.001, "loss": 0.0662, "step": 62900 }, { "epoch": 0.63, "grad_norm": 0.22422794997692108, "learning_rate": 0.001, "loss": 0.0662, "step": 63000 }, { "epoch": 0.63, "grad_norm": 0.16929177939891815, "learning_rate": 0.001, "loss": 0.0677, "step": 63100 }, { "epoch": 0.63, "grad_norm": 0.20057950913906097, "learning_rate": 0.001, "loss": 0.0699, "step": 63200 }, { "epoch": 0.63, "grad_norm": 0.4213920533657074, "learning_rate": 0.001, "loss": 0.0701, "step": 63300 }, { "epoch": 0.63, "grad_norm": 0.28028371930122375, "learning_rate": 0.001, "loss": 0.0697, "step": 63400 }, { "epoch": 0.63, "grad_norm": 0.18094098567962646, "learning_rate": 0.001, "loss": 0.0727, "step": 63500 }, { "epoch": 0.64, "grad_norm": 0.30136585235595703, "learning_rate": 0.001, "loss": 0.0711, "step": 63600 }, { "epoch": 0.64, "grad_norm": 0.192775696516037, "learning_rate": 0.001, "loss": 0.0721, "step": 63700 }, { "epoch": 0.64, "grad_norm": 0.2211129367351532, "learning_rate": 0.001, "loss": 0.0695, "step": 63800 }, { "epoch": 0.64, "grad_norm": 0.19226811826229095, "learning_rate": 0.001, "loss": 0.0699, "step": 63900 }, { "epoch": 0.64, "grad_norm": 0.2471201866865158, "learning_rate": 0.001, "loss": 0.0692, "step": 64000 }, { "epoch": 0.64, "grad_norm": 0.2547115385532379, "learning_rate": 0.001, "loss": 0.0673, "step": 64100 }, { "epoch": 0.64, "grad_norm": 0.1899893879890442, "learning_rate": 0.001, "loss": 0.0693, "step": 64200 }, { "epoch": 0.64, "grad_norm": 0.21257919073104858, "learning_rate": 0.001, "loss": 0.0684, "step": 64300 }, { "epoch": 0.64, "grad_norm": 0.26688677072525024, "learning_rate": 0.001, "loss": 0.0683, "step": 64400 }, { "epoch": 0.64, "grad_norm": 0.18874968588352203, "learning_rate": 0.001, "loss": 0.0688, "step": 64500 }, { "epoch": 0.65, "grad_norm": 0.2013721913099289, "learning_rate": 0.001, "loss": 0.0684, "step": 64600 }, { "epoch": 0.65, "grad_norm": 0.19745351374149323, "learning_rate": 0.001, "loss": 0.0685, "step": 64700 }, { "epoch": 0.65, "grad_norm": 0.2137337028980255, "learning_rate": 0.001, "loss": 0.0671, "step": 64800 }, { "epoch": 0.65, "grad_norm": 0.20300865173339844, "learning_rate": 0.001, "loss": 0.0684, "step": 64900 }, { "epoch": 0.65, "grad_norm": 0.1723690927028656, "learning_rate": 0.001, "loss": 0.0681, "step": 65000 }, { "epoch": 0.65, "grad_norm": 0.20693708956241608, "learning_rate": 0.001, "loss": 0.0685, "step": 65100 }, { "epoch": 0.65, "grad_norm": 0.33531713485717773, "learning_rate": 0.001, "loss": 0.0687, "step": 65200 }, { "epoch": 0.65, "grad_norm": 0.2180265337228775, "learning_rate": 0.001, "loss": 0.0719, "step": 65300 }, { "epoch": 0.65, "grad_norm": 0.27855604887008667, "learning_rate": 0.001, "loss": 0.0686, "step": 65400 }, { "epoch": 0.65, "grad_norm": 0.2309376448392868, "learning_rate": 0.001, "loss": 0.0682, "step": 65500 }, { "epoch": 0.66, "grad_norm": 0.25525444746017456, "learning_rate": 0.001, "loss": 0.0698, "step": 65600 }, { "epoch": 0.66, "grad_norm": 0.1746407151222229, "learning_rate": 0.001, "loss": 0.0692, "step": 65700 }, { "epoch": 0.66, "grad_norm": 0.29511937499046326, "learning_rate": 0.001, "loss": 0.0675, "step": 65800 }, { "epoch": 0.66, "grad_norm": 0.23610210418701172, "learning_rate": 0.001, "loss": 0.0682, "step": 65900 }, { "epoch": 0.66, "grad_norm": 0.24088448286056519, "learning_rate": 0.001, "loss": 0.065, "step": 66000 }, { "epoch": 0.66, "grad_norm": 0.3865065574645996, "learning_rate": 0.001, "loss": 0.068, "step": 66100 }, { "epoch": 0.66, "grad_norm": 0.16312183439731598, "learning_rate": 0.001, "loss": 0.0674, "step": 66200 }, { "epoch": 0.66, "grad_norm": 0.33910611271858215, "learning_rate": 0.001, "loss": 0.0657, "step": 66300 }, { "epoch": 0.66, "grad_norm": 0.1491781622171402, "learning_rate": 0.001, "loss": 0.0663, "step": 66400 }, { "epoch": 0.66, "grad_norm": 0.27082210779190063, "learning_rate": 0.001, "loss": 0.0692, "step": 66500 }, { "epoch": 0.67, "grad_norm": 0.302495539188385, "learning_rate": 0.001, "loss": 0.0668, "step": 66600 }, { "epoch": 0.67, "grad_norm": 0.1906341165304184, "learning_rate": 0.001, "loss": 0.0689, "step": 66700 }, { "epoch": 0.67, "grad_norm": 0.21256040036678314, "learning_rate": 0.001, "loss": 0.0665, "step": 66800 }, { "epoch": 0.67, "grad_norm": 0.16603924334049225, "learning_rate": 0.001, "loss": 0.07, "step": 66900 }, { "epoch": 0.67, "grad_norm": 0.17136050760746002, "learning_rate": 0.001, "loss": 0.0715, "step": 67000 }, { "epoch": 0.67, "grad_norm": 0.1679474115371704, "learning_rate": 0.001, "loss": 0.0667, "step": 67100 }, { "epoch": 0.67, "grad_norm": 0.18445661664009094, "learning_rate": 0.001, "loss": 0.0688, "step": 67200 }, { "epoch": 0.67, "grad_norm": 0.16743460297584534, "learning_rate": 0.001, "loss": 0.0672, "step": 67300 }, { "epoch": 0.67, "grad_norm": 0.24309833347797394, "learning_rate": 0.001, "loss": 0.066, "step": 67400 }, { "epoch": 0.67, "grad_norm": 0.15661662817001343, "learning_rate": 0.001, "loss": 0.0686, "step": 67500 }, { "epoch": 0.68, "grad_norm": 0.32759585976600647, "learning_rate": 0.001, "loss": 0.0666, "step": 67600 }, { "epoch": 0.68, "grad_norm": 0.1508253961801529, "learning_rate": 0.001, "loss": 0.068, "step": 67700 }, { "epoch": 0.68, "grad_norm": 0.17459799349308014, "learning_rate": 0.001, "loss": 0.069, "step": 67800 }, { "epoch": 0.68, "grad_norm": 0.2405272275209427, "learning_rate": 0.001, "loss": 0.0693, "step": 67900 }, { "epoch": 0.68, "grad_norm": 0.2469649761915207, "learning_rate": 0.001, "loss": 0.0678, "step": 68000 }, { "epoch": 0.68, "grad_norm": 0.25917258858680725, "learning_rate": 0.001, "loss": 0.0694, "step": 68100 }, { "epoch": 0.68, "grad_norm": 0.1784822642803192, "learning_rate": 0.001, "loss": 0.0668, "step": 68200 }, { "epoch": 0.68, "grad_norm": 0.22977730631828308, "learning_rate": 0.001, "loss": 0.0656, "step": 68300 }, { "epoch": 0.68, "grad_norm": 0.1646946221590042, "learning_rate": 0.001, "loss": 0.068, "step": 68400 }, { "epoch": 0.68, "grad_norm": 0.3220691978931427, "learning_rate": 0.001, "loss": 0.0665, "step": 68500 }, { "epoch": 0.69, "grad_norm": 0.22109118103981018, "learning_rate": 0.001, "loss": 0.0684, "step": 68600 }, { "epoch": 0.69, "grad_norm": 0.12051670998334885, "learning_rate": 0.001, "loss": 0.0675, "step": 68700 }, { "epoch": 0.69, "grad_norm": 0.19576141238212585, "learning_rate": 0.001, "loss": 0.0655, "step": 68800 }, { "epoch": 0.69, "grad_norm": 0.12783344089984894, "learning_rate": 0.001, "loss": 0.0677, "step": 68900 }, { "epoch": 0.69, "grad_norm": 0.24854913353919983, "learning_rate": 0.001, "loss": 0.0684, "step": 69000 }, { "epoch": 0.69, "grad_norm": 0.19816453754901886, "learning_rate": 0.001, "loss": 0.067, "step": 69100 }, { "epoch": 0.69, "grad_norm": 0.20371900498867035, "learning_rate": 0.001, "loss": 0.0669, "step": 69200 }, { "epoch": 0.69, "grad_norm": 0.24654364585876465, "learning_rate": 0.001, "loss": 0.0665, "step": 69300 }, { "epoch": 0.69, "grad_norm": 0.22933346033096313, "learning_rate": 0.001, "loss": 0.0697, "step": 69400 }, { "epoch": 0.69, "grad_norm": 0.3056330382823944, "learning_rate": 0.001, "loss": 0.0688, "step": 69500 }, { "epoch": 0.7, "grad_norm": 0.14624419808387756, "learning_rate": 0.001, "loss": 0.0686, "step": 69600 }, { "epoch": 0.7, "grad_norm": 0.23571297526359558, "learning_rate": 0.001, "loss": 0.0727, "step": 69700 }, { "epoch": 0.7, "grad_norm": 0.20212960243225098, "learning_rate": 0.001, "loss": 0.0708, "step": 69800 }, { "epoch": 0.7, "grad_norm": 0.22400203347206116, "learning_rate": 0.001, "loss": 0.0645, "step": 69900 }, { "epoch": 0.7, "grad_norm": 0.15693353116512299, "learning_rate": 0.001, "loss": 0.066, "step": 70000 }, { "epoch": 0.7, "grad_norm": 0.21171632409095764, "learning_rate": 0.001, "loss": 0.0651, "step": 70100 }, { "epoch": 0.7, "grad_norm": 0.16716106235980988, "learning_rate": 0.001, "loss": 0.0651, "step": 70200 }, { "epoch": 0.7, "grad_norm": 0.19692525267601013, "learning_rate": 0.001, "loss": 0.0677, "step": 70300 }, { "epoch": 0.7, "grad_norm": 0.23514828085899353, "learning_rate": 0.001, "loss": 0.0651, "step": 70400 }, { "epoch": 0.7, "grad_norm": 0.22567568719387054, "learning_rate": 0.001, "loss": 0.0658, "step": 70500 }, { "epoch": 0.71, "grad_norm": 0.20934154093265533, "learning_rate": 0.001, "loss": 0.0661, "step": 70600 }, { "epoch": 0.71, "grad_norm": 0.25384077429771423, "learning_rate": 0.001, "loss": 0.0658, "step": 70700 }, { "epoch": 0.71, "grad_norm": 0.27204346656799316, "learning_rate": 0.001, "loss": 0.0685, "step": 70800 }, { "epoch": 0.71, "grad_norm": 0.1900806725025177, "learning_rate": 0.001, "loss": 0.0637, "step": 70900 }, { "epoch": 0.71, "grad_norm": 0.4064619243144989, "learning_rate": 0.001, "loss": 0.07, "step": 71000 }, { "epoch": 0.71, "grad_norm": 0.22942863404750824, "learning_rate": 0.001, "loss": 0.067, "step": 71100 }, { "epoch": 0.71, "grad_norm": 0.3398168683052063, "learning_rate": 0.001, "loss": 0.0673, "step": 71200 }, { "epoch": 0.71, "grad_norm": 0.2937333881855011, "learning_rate": 0.001, "loss": 0.0689, "step": 71300 }, { "epoch": 0.71, "grad_norm": 0.15955261886119843, "learning_rate": 0.001, "loss": 0.0644, "step": 71400 }, { "epoch": 0.71, "grad_norm": 0.32867005467414856, "learning_rate": 0.001, "loss": 0.0668, "step": 71500 }, { "epoch": 0.72, "grad_norm": 0.22879061102867126, "learning_rate": 0.001, "loss": 0.0641, "step": 71600 }, { "epoch": 0.72, "grad_norm": 0.3147716224193573, "learning_rate": 0.001, "loss": 0.0643, "step": 71700 }, { "epoch": 0.72, "grad_norm": 0.19312891364097595, "learning_rate": 0.001, "loss": 0.0654, "step": 71800 }, { "epoch": 0.72, "grad_norm": 0.3658990263938904, "learning_rate": 0.001, "loss": 0.066, "step": 71900 }, { "epoch": 0.72, "grad_norm": 0.2730260193347931, "learning_rate": 0.001, "loss": 0.0673, "step": 72000 }, { "epoch": 0.72, "grad_norm": 0.3601909279823303, "learning_rate": 0.001, "loss": 0.0643, "step": 72100 }, { "epoch": 0.72, "grad_norm": 0.13944287598133087, "learning_rate": 0.001, "loss": 0.0671, "step": 72200 }, { "epoch": 0.72, "grad_norm": 0.1590428501367569, "learning_rate": 0.001, "loss": 0.0651, "step": 72300 }, { "epoch": 0.72, "grad_norm": 0.17583294212818146, "learning_rate": 0.001, "loss": 0.0665, "step": 72400 }, { "epoch": 0.72, "grad_norm": 0.1566411554813385, "learning_rate": 0.001, "loss": 0.0666, "step": 72500 }, { "epoch": 0.73, "grad_norm": 0.26495423913002014, "learning_rate": 0.001, "loss": 0.0651, "step": 72600 }, { "epoch": 0.73, "grad_norm": 0.17272372543811798, "learning_rate": 0.001, "loss": 0.0689, "step": 72700 }, { "epoch": 0.73, "grad_norm": 0.2443661093711853, "learning_rate": 0.001, "loss": 0.065, "step": 72800 }, { "epoch": 0.73, "grad_norm": 0.26695558428764343, "learning_rate": 0.001, "loss": 0.0637, "step": 72900 }, { "epoch": 0.73, "grad_norm": 0.14408937096595764, "learning_rate": 0.001, "loss": 0.0676, "step": 73000 }, { "epoch": 0.73, "grad_norm": 0.18142744898796082, "learning_rate": 0.001, "loss": 0.0653, "step": 73100 }, { "epoch": 0.73, "grad_norm": 0.17100819945335388, "learning_rate": 0.001, "loss": 0.0631, "step": 73200 }, { "epoch": 0.73, "grad_norm": 0.3703427314758301, "learning_rate": 0.001, "loss": 0.0665, "step": 73300 }, { "epoch": 0.73, "grad_norm": 0.19516532123088837, "learning_rate": 0.001, "loss": 0.0656, "step": 73400 }, { "epoch": 0.73, "grad_norm": 0.17610041797161102, "learning_rate": 0.001, "loss": 0.0658, "step": 73500 }, { "epoch": 0.74, "grad_norm": 0.13331599533557892, "learning_rate": 0.001, "loss": 0.0653, "step": 73600 }, { "epoch": 0.74, "grad_norm": 0.23824097216129303, "learning_rate": 0.001, "loss": 0.065, "step": 73700 }, { "epoch": 0.74, "grad_norm": 0.1464979499578476, "learning_rate": 0.001, "loss": 0.0638, "step": 73800 }, { "epoch": 0.74, "grad_norm": 0.18163511157035828, "learning_rate": 0.001, "loss": 0.0661, "step": 73900 }, { "epoch": 0.74, "grad_norm": 0.1809806078672409, "learning_rate": 0.001, "loss": 0.0643, "step": 74000 }, { "epoch": 0.74, "grad_norm": 0.23994535207748413, "learning_rate": 0.001, "loss": 0.0636, "step": 74100 }, { "epoch": 0.74, "grad_norm": 0.17924870550632477, "learning_rate": 0.001, "loss": 0.064, "step": 74200 }, { "epoch": 0.74, "grad_norm": 0.15770521759986877, "learning_rate": 0.001, "loss": 0.0661, "step": 74300 }, { "epoch": 0.74, "grad_norm": 0.24632355570793152, "learning_rate": 0.001, "loss": 0.0644, "step": 74400 }, { "epoch": 0.74, "grad_norm": 0.18300195038318634, "learning_rate": 0.001, "loss": 0.0592, "step": 74500 }, { "epoch": 0.75, "grad_norm": 0.2745151221752167, "learning_rate": 0.001, "loss": 0.063, "step": 74600 }, { "epoch": 0.75, "grad_norm": 0.18871140480041504, "learning_rate": 0.001, "loss": 0.063, "step": 74700 }, { "epoch": 0.75, "grad_norm": 0.30228421092033386, "learning_rate": 0.001, "loss": 0.0661, "step": 74800 }, { "epoch": 0.75, "grad_norm": 0.26834210753440857, "learning_rate": 0.001, "loss": 0.0626, "step": 74900 }, { "epoch": 0.75, "grad_norm": 0.1998053640127182, "learning_rate": 0.001, "loss": 0.0655, "step": 75000 }, { "epoch": 0.75, "grad_norm": 0.16265703737735748, "learning_rate": 0.001, "loss": 0.0648, "step": 75100 }, { "epoch": 0.75, "grad_norm": 0.3203764259815216, "learning_rate": 0.001, "loss": 0.0636, "step": 75200 }, { "epoch": 0.75, "grad_norm": 0.29416751861572266, "learning_rate": 0.001, "loss": 0.0613, "step": 75300 }, { "epoch": 0.75, "grad_norm": 0.1761980801820755, "learning_rate": 0.001, "loss": 0.0718, "step": 75400 }, { "epoch": 0.75, "grad_norm": 0.24760745465755463, "learning_rate": 0.001, "loss": 0.0641, "step": 75500 }, { "epoch": 0.76, "grad_norm": 0.3362966477870941, "learning_rate": 0.001, "loss": 0.0678, "step": 75600 }, { "epoch": 0.76, "grad_norm": 0.20644457638263702, "learning_rate": 0.001, "loss": 0.0653, "step": 75700 }, { "epoch": 0.76, "grad_norm": 0.22632303833961487, "learning_rate": 0.001, "loss": 0.0679, "step": 75800 }, { "epoch": 0.76, "grad_norm": 0.22177743911743164, "learning_rate": 0.001, "loss": 0.0628, "step": 75900 }, { "epoch": 0.76, "grad_norm": 0.9697771072387695, "learning_rate": 0.001, "loss": 0.0659, "step": 76000 }, { "epoch": 0.76, "grad_norm": 0.21862226724624634, "learning_rate": 0.001, "loss": 0.0654, "step": 76100 }, { "epoch": 0.76, "grad_norm": 0.27506422996520996, "learning_rate": 0.001, "loss": 0.0636, "step": 76200 }, { "epoch": 0.76, "grad_norm": 0.4953247606754303, "learning_rate": 0.001, "loss": 0.0648, "step": 76300 }, { "epoch": 0.76, "grad_norm": 0.44132623076438904, "learning_rate": 0.001, "loss": 0.0641, "step": 76400 }, { "epoch": 0.76, "grad_norm": 0.28104710578918457, "learning_rate": 0.001, "loss": 0.0623, "step": 76500 }, { "epoch": 0.77, "grad_norm": 0.270434707403183, "learning_rate": 0.001, "loss": 0.0642, "step": 76600 }, { "epoch": 0.77, "grad_norm": 0.17920733988285065, "learning_rate": 0.001, "loss": 0.0641, "step": 76700 }, { "epoch": 0.77, "grad_norm": 0.27689895033836365, "learning_rate": 0.001, "loss": 0.0645, "step": 76800 }, { "epoch": 0.77, "grad_norm": 0.22936861217021942, "learning_rate": 0.001, "loss": 0.0625, "step": 76900 }, { "epoch": 0.77, "grad_norm": 0.2662585973739624, "learning_rate": 0.001, "loss": 0.0671, "step": 77000 }, { "epoch": 0.77, "grad_norm": 0.23035678267478943, "learning_rate": 0.001, "loss": 0.0622, "step": 77100 }, { "epoch": 0.77, "grad_norm": 0.19333815574645996, "learning_rate": 0.001, "loss": 0.0655, "step": 77200 }, { "epoch": 0.77, "grad_norm": 0.2870350182056427, "learning_rate": 0.001, "loss": 0.0634, "step": 77300 }, { "epoch": 0.77, "grad_norm": 0.22997340559959412, "learning_rate": 0.001, "loss": 0.0676, "step": 77400 }, { "epoch": 0.77, "grad_norm": 0.19435285031795502, "learning_rate": 0.001, "loss": 0.0655, "step": 77500 }, { "epoch": 0.78, "grad_norm": 0.2826205790042877, "learning_rate": 0.001, "loss": 0.0635, "step": 77600 }, { "epoch": 0.78, "grad_norm": 0.20007766783237457, "learning_rate": 0.001, "loss": 0.0617, "step": 77700 }, { "epoch": 0.78, "grad_norm": 0.15860234200954437, "learning_rate": 0.001, "loss": 0.0657, "step": 77800 }, { "epoch": 0.78, "grad_norm": 0.40526214241981506, "learning_rate": 0.001, "loss": 0.0649, "step": 77900 }, { "epoch": 0.78, "grad_norm": 0.24454933404922485, "learning_rate": 0.001, "loss": 0.0634, "step": 78000 }, { "epoch": 0.78, "grad_norm": 0.12802359461784363, "learning_rate": 0.001, "loss": 0.0635, "step": 78100 }, { "epoch": 0.78, "grad_norm": 0.32250648736953735, "learning_rate": 0.001, "loss": 0.0648, "step": 78200 }, { "epoch": 0.78, "grad_norm": 0.253478467464447, "learning_rate": 0.001, "loss": 0.0648, "step": 78300 }, { "epoch": 0.78, "grad_norm": 0.25307029485702515, "learning_rate": 0.001, "loss": 0.0648, "step": 78400 }, { "epoch": 0.78, "grad_norm": 0.19091230630874634, "learning_rate": 0.001, "loss": 0.065, "step": 78500 }, { "epoch": 0.79, "grad_norm": 0.17312967777252197, "learning_rate": 0.001, "loss": 0.0624, "step": 78600 }, { "epoch": 0.79, "grad_norm": 0.19466041028499603, "learning_rate": 0.001, "loss": 0.0622, "step": 78700 }, { "epoch": 0.79, "grad_norm": 0.25837138295173645, "learning_rate": 0.001, "loss": 0.0641, "step": 78800 }, { "epoch": 0.79, "grad_norm": 0.1573166698217392, "learning_rate": 0.001, "loss": 0.0645, "step": 78900 }, { "epoch": 0.79, "grad_norm": 0.1644609123468399, "learning_rate": 0.001, "loss": 0.0644, "step": 79000 }, { "epoch": 0.79, "grad_norm": 0.20255005359649658, "learning_rate": 0.001, "loss": 0.0647, "step": 79100 }, { "epoch": 0.79, "grad_norm": 0.48706310987472534, "learning_rate": 0.001, "loss": 0.0642, "step": 79200 }, { "epoch": 0.79, "grad_norm": 0.3525262176990509, "learning_rate": 0.001, "loss": 0.0639, "step": 79300 }, { "epoch": 0.79, "grad_norm": 0.20806559920310974, "learning_rate": 0.001, "loss": 0.0639, "step": 79400 }, { "epoch": 0.79, "grad_norm": 0.441980242729187, "learning_rate": 0.001, "loss": 0.0645, "step": 79500 }, { "epoch": 0.8, "grad_norm": 0.16818083822727203, "learning_rate": 0.001, "loss": 0.0625, "step": 79600 }, { "epoch": 0.8, "grad_norm": 0.1843559443950653, "learning_rate": 0.001, "loss": 0.064, "step": 79700 }, { "epoch": 0.8, "grad_norm": 0.19608129560947418, "learning_rate": 0.001, "loss": 0.0634, "step": 79800 }, { "epoch": 0.8, "grad_norm": 0.34710460901260376, "learning_rate": 0.001, "loss": 0.0626, "step": 79900 }, { "epoch": 0.8, "grad_norm": 0.4062146842479706, "learning_rate": 0.001, "loss": 0.0637, "step": 80000 }, { "epoch": 0.8, "grad_norm": 0.23054763674736023, "learning_rate": 0.001, "loss": 0.0629, "step": 80100 }, { "epoch": 0.8, "grad_norm": 0.20241042971611023, "learning_rate": 0.001, "loss": 0.0632, "step": 80200 }, { "epoch": 0.8, "grad_norm": 0.17540830373764038, "learning_rate": 0.001, "loss": 0.0645, "step": 80300 }, { "epoch": 0.8, "grad_norm": 0.2995645999908447, "learning_rate": 0.001, "loss": 0.0619, "step": 80400 }, { "epoch": 0.8, "grad_norm": 0.2701890766620636, "learning_rate": 0.001, "loss": 0.0624, "step": 80500 }, { "epoch": 0.81, "grad_norm": 0.5655909180641174, "learning_rate": 0.001, "loss": 0.0637, "step": 80600 }, { "epoch": 0.81, "grad_norm": 0.24868199229240417, "learning_rate": 0.001, "loss": 0.0626, "step": 80700 }, { "epoch": 0.81, "grad_norm": 0.205698162317276, "learning_rate": 0.001, "loss": 0.0616, "step": 80800 }, { "epoch": 0.81, "grad_norm": 0.4373738169670105, "learning_rate": 0.001, "loss": 0.0635, "step": 80900 }, { "epoch": 0.81, "grad_norm": 0.20648936927318573, "learning_rate": 0.001, "loss": 0.063, "step": 81000 }, { "epoch": 0.81, "grad_norm": 0.49470582604408264, "learning_rate": 0.001, "loss": 0.064, "step": 81100 }, { "epoch": 0.81, "grad_norm": 0.2360522598028183, "learning_rate": 0.001, "loss": 0.0606, "step": 81200 }, { "epoch": 0.81, "grad_norm": 0.38575538992881775, "learning_rate": 0.001, "loss": 0.0626, "step": 81300 }, { "epoch": 0.81, "grad_norm": 0.23714828491210938, "learning_rate": 0.001, "loss": 0.0628, "step": 81400 }, { "epoch": 0.81, "grad_norm": 0.5665257573127747, "learning_rate": 0.001, "loss": 0.064, "step": 81500 }, { "epoch": 0.82, "grad_norm": 0.2335139662027359, "learning_rate": 0.001, "loss": 0.0628, "step": 81600 }, { "epoch": 0.82, "grad_norm": 0.23121795058250427, "learning_rate": 0.001, "loss": 0.0617, "step": 81700 }, { "epoch": 0.82, "grad_norm": 0.2850015163421631, "learning_rate": 0.001, "loss": 0.0634, "step": 81800 }, { "epoch": 0.82, "grad_norm": 0.25949451327323914, "learning_rate": 0.001, "loss": 0.0611, "step": 81900 }, { "epoch": 0.82, "grad_norm": 0.15866072475910187, "learning_rate": 0.001, "loss": 0.0633, "step": 82000 }, { "epoch": 0.82, "grad_norm": 0.1362059861421585, "learning_rate": 0.001, "loss": 0.0637, "step": 82100 }, { "epoch": 0.82, "grad_norm": 0.23973006010055542, "learning_rate": 0.001, "loss": 0.0619, "step": 82200 }, { "epoch": 0.82, "grad_norm": 0.2586152255535126, "learning_rate": 0.001, "loss": 0.0595, "step": 82300 }, { "epoch": 0.82, "grad_norm": 0.33245041966438293, "learning_rate": 0.001, "loss": 0.0632, "step": 82400 }, { "epoch": 0.82, "grad_norm": 0.1873330920934677, "learning_rate": 0.001, "loss": 0.0636, "step": 82500 }, { "epoch": 0.83, "grad_norm": 0.23043370246887207, "learning_rate": 0.001, "loss": 0.0644, "step": 82600 }, { "epoch": 0.83, "grad_norm": 0.21046708524227142, "learning_rate": 0.001, "loss": 0.0631, "step": 82700 }, { "epoch": 0.83, "grad_norm": 0.15473945438861847, "learning_rate": 0.001, "loss": 0.06, "step": 82800 }, { "epoch": 0.83, "grad_norm": 1.422141194343567, "learning_rate": 0.001, "loss": 0.0636, "step": 82900 }, { "epoch": 0.83, "grad_norm": 0.16424107551574707, "learning_rate": 0.001, "loss": 0.0643, "step": 83000 }, { "epoch": 0.83, "grad_norm": 0.3594319820404053, "learning_rate": 0.001, "loss": 0.0624, "step": 83100 }, { "epoch": 0.83, "grad_norm": 0.26430365443229675, "learning_rate": 0.001, "loss": 0.0593, "step": 83200 }, { "epoch": 0.83, "grad_norm": 0.20655816793441772, "learning_rate": 0.001, "loss": 0.0619, "step": 83300 }, { "epoch": 0.83, "grad_norm": 0.39340272545814514, "learning_rate": 0.001, "loss": 0.0624, "step": 83400 }, { "epoch": 0.83, "grad_norm": 0.3113759160041809, "learning_rate": 0.001, "loss": 0.0598, "step": 83500 }, { "epoch": 0.84, "grad_norm": 0.33689817786216736, "learning_rate": 0.001, "loss": 0.0604, "step": 83600 }, { "epoch": 0.84, "grad_norm": 0.2195175141096115, "learning_rate": 0.001, "loss": 0.0618, "step": 83700 }, { "epoch": 0.84, "grad_norm": 0.2397637814283371, "learning_rate": 0.001, "loss": 0.0618, "step": 83800 }, { "epoch": 0.84, "grad_norm": 0.28967469930648804, "learning_rate": 0.001, "loss": 0.0612, "step": 83900 }, { "epoch": 0.84, "grad_norm": 0.23908008635044098, "learning_rate": 0.001, "loss": 0.0599, "step": 84000 }, { "epoch": 0.84, "grad_norm": 0.36196354031562805, "learning_rate": 0.001, "loss": 0.061, "step": 84100 }, { "epoch": 0.84, "grad_norm": 0.3068004250526428, "learning_rate": 0.001, "loss": 0.0614, "step": 84200 }, { "epoch": 0.84, "grad_norm": 0.2148333489894867, "learning_rate": 0.001, "loss": 0.0624, "step": 84300 }, { "epoch": 0.84, "grad_norm": 0.19169430434703827, "learning_rate": 0.001, "loss": 0.0615, "step": 84400 }, { "epoch": 0.84, "grad_norm": 0.23916268348693848, "learning_rate": 0.001, "loss": 0.0654, "step": 84500 }, { "epoch": 0.85, "grad_norm": 0.20304815471172333, "learning_rate": 0.001, "loss": 0.0613, "step": 84600 }, { "epoch": 0.85, "grad_norm": 0.2983682155609131, "learning_rate": 0.001, "loss": 0.0617, "step": 84700 }, { "epoch": 0.85, "grad_norm": 0.22442661225795746, "learning_rate": 0.001, "loss": 0.0593, "step": 84800 }, { "epoch": 0.85, "grad_norm": 0.28299954533576965, "learning_rate": 0.001, "loss": 0.0636, "step": 84900 }, { "epoch": 0.85, "grad_norm": 0.30491936206817627, "learning_rate": 0.001, "loss": 0.0608, "step": 85000 }, { "epoch": 0.85, "grad_norm": 0.30804798007011414, "learning_rate": 0.001, "loss": 0.0609, "step": 85100 }, { "epoch": 0.85, "grad_norm": 0.18533004820346832, "learning_rate": 0.001, "loss": 0.0602, "step": 85200 }, { "epoch": 0.85, "grad_norm": 0.23856715857982635, "learning_rate": 0.001, "loss": 0.0638, "step": 85300 }, { "epoch": 0.85, "grad_norm": 0.2646658420562744, "learning_rate": 0.001, "loss": 0.0622, "step": 85400 }, { "epoch": 0.85, "grad_norm": 0.2357235699892044, "learning_rate": 0.001, "loss": 0.0617, "step": 85500 }, { "epoch": 0.86, "grad_norm": 0.1675509363412857, "learning_rate": 0.001, "loss": 0.0593, "step": 85600 }, { "epoch": 0.86, "grad_norm": 0.20707982778549194, "learning_rate": 0.001, "loss": 0.0617, "step": 85700 }, { "epoch": 0.86, "grad_norm": 0.34539708495140076, "learning_rate": 0.001, "loss": 0.06, "step": 85800 }, { "epoch": 0.86, "grad_norm": 0.28429824113845825, "learning_rate": 0.001, "loss": 0.0587, "step": 85900 }, { "epoch": 0.86, "grad_norm": 0.3121056854724884, "learning_rate": 0.001, "loss": 0.0615, "step": 86000 }, { "epoch": 0.86, "grad_norm": 0.25750598311424255, "learning_rate": 0.001, "loss": 0.0613, "step": 86100 }, { "epoch": 0.86, "grad_norm": 0.18927526473999023, "learning_rate": 0.001, "loss": 0.0592, "step": 86200 }, { "epoch": 0.86, "grad_norm": 0.3551163971424103, "learning_rate": 0.001, "loss": 0.0619, "step": 86300 }, { "epoch": 0.86, "grad_norm": 0.19404169917106628, "learning_rate": 0.001, "loss": 0.0617, "step": 86400 }, { "epoch": 0.86, "grad_norm": 0.16969504952430725, "learning_rate": 0.001, "loss": 0.0599, "step": 86500 }, { "epoch": 0.87, "grad_norm": 0.20026318728923798, "learning_rate": 0.001, "loss": 0.0606, "step": 86600 }, { "epoch": 0.87, "grad_norm": 0.30545106530189514, "learning_rate": 0.001, "loss": 0.0594, "step": 86700 }, { "epoch": 0.87, "grad_norm": 0.2734260559082031, "learning_rate": 0.001, "loss": 0.0644, "step": 86800 }, { "epoch": 0.87, "grad_norm": 0.3157080411911011, "learning_rate": 0.001, "loss": 0.0618, "step": 86900 }, { "epoch": 0.87, "grad_norm": 0.19793906807899475, "learning_rate": 0.001, "loss": 0.0616, "step": 87000 }, { "epoch": 0.87, "grad_norm": 0.1849125623703003, "learning_rate": 0.001, "loss": 0.0596, "step": 87100 }, { "epoch": 0.87, "grad_norm": 0.18340341746807098, "learning_rate": 0.001, "loss": 0.0625, "step": 87200 }, { "epoch": 0.87, "grad_norm": 0.26056426763534546, "learning_rate": 0.001, "loss": 0.0595, "step": 87300 }, { "epoch": 0.87, "grad_norm": 0.22235774993896484, "learning_rate": 0.001, "loss": 0.0606, "step": 87400 }, { "epoch": 0.87, "grad_norm": 0.31580013036727905, "learning_rate": 0.001, "loss": 0.0615, "step": 87500 }, { "epoch": 0.88, "grad_norm": 0.2364477515220642, "learning_rate": 0.001, "loss": 0.0616, "step": 87600 }, { "epoch": 0.88, "grad_norm": 0.23212990164756775, "learning_rate": 0.001, "loss": 0.0594, "step": 87700 }, { "epoch": 0.88, "grad_norm": 0.21986854076385498, "learning_rate": 0.001, "loss": 0.0592, "step": 87800 }, { "epoch": 0.88, "grad_norm": 0.2496929168701172, "learning_rate": 0.001, "loss": 0.0593, "step": 87900 }, { "epoch": 0.88, "grad_norm": 0.19572298228740692, "learning_rate": 0.001, "loss": 0.0588, "step": 88000 }, { "epoch": 0.88, "grad_norm": 0.16231012344360352, "learning_rate": 0.001, "loss": 0.0599, "step": 88100 }, { "epoch": 0.88, "grad_norm": 0.21093867719173431, "learning_rate": 0.001, "loss": 0.0625, "step": 88200 }, { "epoch": 0.88, "grad_norm": 0.16491778194904327, "learning_rate": 0.001, "loss": 0.0602, "step": 88300 }, { "epoch": 0.88, "grad_norm": 0.24729378521442413, "learning_rate": 0.001, "loss": 0.0573, "step": 88400 }, { "epoch": 0.88, "grad_norm": 0.3726213276386261, "learning_rate": 0.001, "loss": 0.0589, "step": 88500 }, { "epoch": 0.89, "grad_norm": 0.1926572024822235, "learning_rate": 0.001, "loss": 0.0602, "step": 88600 }, { "epoch": 0.89, "grad_norm": 0.2153882533311844, "learning_rate": 0.001, "loss": 0.0597, "step": 88700 }, { "epoch": 0.89, "grad_norm": 0.25205257534980774, "learning_rate": 0.001, "loss": 0.0581, "step": 88800 }, { "epoch": 0.89, "grad_norm": 0.16898304224014282, "learning_rate": 0.001, "loss": 0.0614, "step": 88900 }, { "epoch": 0.89, "grad_norm": 0.2840329110622406, "learning_rate": 0.001, "loss": 0.0615, "step": 89000 }, { "epoch": 0.89, "grad_norm": 0.22306442260742188, "learning_rate": 0.001, "loss": 0.0606, "step": 89100 }, { "epoch": 0.89, "grad_norm": 0.2778179943561554, "learning_rate": 0.001, "loss": 0.0606, "step": 89200 }, { "epoch": 0.89, "grad_norm": 0.1956636756658554, "learning_rate": 0.001, "loss": 0.0585, "step": 89300 }, { "epoch": 0.89, "grad_norm": 0.15973015129566193, "learning_rate": 0.001, "loss": 0.0598, "step": 89400 }, { "epoch": 0.89, "grad_norm": 0.2306407243013382, "learning_rate": 0.001, "loss": 0.0597, "step": 89500 }, { "epoch": 0.9, "grad_norm": 0.19012047350406647, "learning_rate": 0.001, "loss": 0.0608, "step": 89600 }, { "epoch": 0.9, "grad_norm": 0.214030921459198, "learning_rate": 0.001, "loss": 0.0586, "step": 89700 }, { "epoch": 0.9, "grad_norm": 0.26291027665138245, "learning_rate": 0.001, "loss": 0.0599, "step": 89800 }, { "epoch": 0.9, "grad_norm": 0.140648752450943, "learning_rate": 0.001, "loss": 0.0605, "step": 89900 }, { "epoch": 0.9, "grad_norm": 0.3011924624443054, "learning_rate": 0.001, "loss": 0.0609, "step": 90000 }, { "epoch": 0.9, "grad_norm": 0.24463798105716705, "learning_rate": 0.001, "loss": 0.0587, "step": 90100 }, { "epoch": 0.9, "grad_norm": 0.2608613073825836, "learning_rate": 0.001, "loss": 0.0595, "step": 90200 }, { "epoch": 0.9, "grad_norm": 0.23249809443950653, "learning_rate": 0.001, "loss": 0.0592, "step": 90300 }, { "epoch": 0.9, "grad_norm": 0.36541712284088135, "learning_rate": 0.001, "loss": 0.0599, "step": 90400 }, { "epoch": 0.9, "grad_norm": 0.45584437251091003, "learning_rate": 0.001, "loss": 0.0587, "step": 90500 }, { "epoch": 0.91, "grad_norm": 0.20905092358589172, "learning_rate": 0.001, "loss": 0.0595, "step": 90600 }, { "epoch": 0.91, "grad_norm": 0.18202795088291168, "learning_rate": 0.001, "loss": 0.0568, "step": 90700 }, { "epoch": 0.91, "grad_norm": 0.2321150153875351, "learning_rate": 0.001, "loss": 0.0605, "step": 90800 }, { "epoch": 0.91, "grad_norm": 0.17175626754760742, "learning_rate": 0.001, "loss": 0.0596, "step": 90900 }, { "epoch": 0.91, "grad_norm": 0.21932841837406158, "learning_rate": 0.001, "loss": 0.0585, "step": 91000 }, { "epoch": 0.91, "grad_norm": 0.30282464623451233, "learning_rate": 0.001, "loss": 0.0593, "step": 91100 }, { "epoch": 0.91, "grad_norm": 0.2639208436012268, "learning_rate": 0.001, "loss": 0.0599, "step": 91200 }, { "epoch": 0.91, "grad_norm": 0.23805926740169525, "learning_rate": 0.001, "loss": 0.0576, "step": 91300 }, { "epoch": 0.91, "grad_norm": 0.2307603508234024, "learning_rate": 0.001, "loss": 0.0602, "step": 91400 }, { "epoch": 0.91, "grad_norm": 0.1786148101091385, "learning_rate": 0.001, "loss": 0.0598, "step": 91500 }, { "epoch": 0.92, "grad_norm": 0.1955350786447525, "learning_rate": 0.001, "loss": 0.0576, "step": 91600 }, { "epoch": 0.92, "grad_norm": 0.24684827029705048, "learning_rate": 0.001, "loss": 0.0571, "step": 91700 }, { "epoch": 0.92, "grad_norm": 0.2771402895450592, "learning_rate": 0.001, "loss": 0.058, "step": 91800 }, { "epoch": 0.92, "grad_norm": 0.28878656029701233, "learning_rate": 0.001, "loss": 0.0585, "step": 91900 }, { "epoch": 0.92, "grad_norm": 0.7780060172080994, "learning_rate": 0.001, "loss": 0.0574, "step": 92000 }, { "epoch": 0.92, "grad_norm": 0.25102126598358154, "learning_rate": 0.001, "loss": 0.0576, "step": 92100 }, { "epoch": 0.92, "grad_norm": 0.26416492462158203, "learning_rate": 0.001, "loss": 0.0614, "step": 92200 }, { "epoch": 0.92, "grad_norm": 0.26566821336746216, "learning_rate": 0.001, "loss": 0.0586, "step": 92300 }, { "epoch": 0.92, "grad_norm": 0.25432705879211426, "learning_rate": 0.001, "loss": 0.0586, "step": 92400 }, { "epoch": 0.92, "grad_norm": 0.2592636048793793, "learning_rate": 0.001, "loss": 0.0576, "step": 92500 }, { "epoch": 0.93, "grad_norm": 0.3514898419380188, "learning_rate": 0.001, "loss": 0.0579, "step": 92600 }, { "epoch": 0.93, "grad_norm": 0.2749045491218567, "learning_rate": 0.001, "loss": 0.061, "step": 92700 }, { "epoch": 0.93, "grad_norm": 0.2799491882324219, "learning_rate": 0.001, "loss": 0.0579, "step": 92800 }, { "epoch": 0.93, "grad_norm": 0.2252642959356308, "learning_rate": 0.001, "loss": 0.0584, "step": 92900 }, { "epoch": 0.93, "grad_norm": 0.18218593299388885, "learning_rate": 0.001, "loss": 0.0577, "step": 93000 }, { "epoch": 0.93, "grad_norm": 0.27551427483558655, "learning_rate": 0.001, "loss": 0.0605, "step": 93100 }, { "epoch": 0.93, "grad_norm": 0.26159995794296265, "learning_rate": 0.001, "loss": 0.0562, "step": 93200 }, { "epoch": 0.93, "grad_norm": 0.15979285538196564, "learning_rate": 0.001, "loss": 0.0615, "step": 93300 }, { "epoch": 0.93, "grad_norm": 0.23418280482292175, "learning_rate": 0.001, "loss": 0.0594, "step": 93400 }, { "epoch": 0.93, "grad_norm": 0.16936419904232025, "learning_rate": 0.001, "loss": 0.0611, "step": 93500 }, { "epoch": 0.94, "grad_norm": 0.2862916886806488, "learning_rate": 0.001, "loss": 0.0584, "step": 93600 }, { "epoch": 0.94, "grad_norm": 0.5302750468254089, "learning_rate": 0.001, "loss": 0.0561, "step": 93700 }, { "epoch": 0.94, "grad_norm": 0.43644002079963684, "learning_rate": 0.001, "loss": 0.0581, "step": 93800 }, { "epoch": 0.94, "grad_norm": 0.19219018518924713, "learning_rate": 0.001, "loss": 0.0591, "step": 93900 }, { "epoch": 0.94, "grad_norm": 0.29645296931266785, "learning_rate": 0.001, "loss": 0.0587, "step": 94000 }, { "epoch": 0.94, "grad_norm": 0.24861380457878113, "learning_rate": 0.001, "loss": 0.0594, "step": 94100 }, { "epoch": 0.94, "grad_norm": 0.2443215548992157, "learning_rate": 0.001, "loss": 0.057, "step": 94200 }, { "epoch": 0.94, "grad_norm": 0.13077589869499207, "learning_rate": 0.001, "loss": 0.0563, "step": 94300 }, { "epoch": 0.94, "grad_norm": 0.24280287325382233, "learning_rate": 0.001, "loss": 0.0591, "step": 94400 }, { "epoch": 0.94, "grad_norm": 0.25838151574134827, "learning_rate": 0.001, "loss": 0.0583, "step": 94500 }, { "epoch": 0.95, "grad_norm": 0.33244743943214417, "learning_rate": 0.001, "loss": 0.0587, "step": 94600 }, { "epoch": 0.95, "grad_norm": 0.45074304938316345, "learning_rate": 0.001, "loss": 0.0572, "step": 94700 }, { "epoch": 0.95, "grad_norm": 0.2540782392024994, "learning_rate": 0.001, "loss": 0.0584, "step": 94800 }, { "epoch": 0.95, "grad_norm": 0.29180458188056946, "learning_rate": 0.001, "loss": 0.0609, "step": 94900 }, { "epoch": 0.95, "grad_norm": 0.18510323762893677, "learning_rate": 0.001, "loss": 0.058, "step": 95000 }, { "epoch": 0.95, "grad_norm": 0.28962787985801697, "learning_rate": 0.001, "loss": 0.0562, "step": 95100 }, { "epoch": 0.95, "grad_norm": 0.26887577772140503, "learning_rate": 0.001, "loss": 0.0573, "step": 95200 }, { "epoch": 0.95, "grad_norm": 0.20729154348373413, "learning_rate": 0.001, "loss": 0.057, "step": 95300 }, { "epoch": 0.95, "grad_norm": 0.19953325390815735, "learning_rate": 0.001, "loss": 0.0594, "step": 95400 }, { "epoch": 0.95, "grad_norm": 0.15926332771778107, "learning_rate": 0.001, "loss": 0.0582, "step": 95500 }, { "epoch": 0.96, "grad_norm": 0.23609544336795807, "learning_rate": 0.001, "loss": 0.0579, "step": 95600 }, { "epoch": 0.96, "grad_norm": 0.13997937738895416, "learning_rate": 0.001, "loss": 0.0574, "step": 95700 }, { "epoch": 0.96, "grad_norm": 0.23629073798656464, "learning_rate": 0.001, "loss": 0.0585, "step": 95800 }, { "epoch": 0.96, "grad_norm": 0.3770292401313782, "learning_rate": 0.001, "loss": 0.0572, "step": 95900 }, { "epoch": 0.96, "grad_norm": 0.3013598322868347, "learning_rate": 0.001, "loss": 0.0606, "step": 96000 }, { "epoch": 0.96, "grad_norm": 0.2350749522447586, "learning_rate": 0.001, "loss": 0.057, "step": 96100 }, { "epoch": 0.96, "grad_norm": 0.301268994808197, "learning_rate": 0.001, "loss": 0.0586, "step": 96200 }, { "epoch": 0.96, "grad_norm": 0.22475981712341309, "learning_rate": 0.001, "loss": 0.0593, "step": 96300 }, { "epoch": 0.96, "grad_norm": 0.3032160997390747, "learning_rate": 0.001, "loss": 0.0591, "step": 96400 }, { "epoch": 0.96, "grad_norm": 0.5848428010940552, "learning_rate": 0.001, "loss": 0.0559, "step": 96500 }, { "epoch": 0.97, "grad_norm": 0.20164470374584198, "learning_rate": 0.001, "loss": 0.0579, "step": 96600 }, { "epoch": 0.97, "grad_norm": 0.18068142235279083, "learning_rate": 0.001, "loss": 0.0579, "step": 96700 }, { "epoch": 0.97, "grad_norm": 0.31181275844573975, "learning_rate": 0.001, "loss": 0.0588, "step": 96800 }, { "epoch": 0.97, "grad_norm": 0.23156049847602844, "learning_rate": 0.001, "loss": 0.058, "step": 96900 }, { "epoch": 0.97, "grad_norm": 0.18572886288166046, "learning_rate": 0.001, "loss": 0.0599, "step": 97000 }, { "epoch": 0.97, "grad_norm": 0.17736677825450897, "learning_rate": 0.001, "loss": 0.0561, "step": 97100 }, { "epoch": 0.97, "grad_norm": 0.4838601052761078, "learning_rate": 0.001, "loss": 0.0595, "step": 97200 }, { "epoch": 0.97, "grad_norm": 0.21476797759532928, "learning_rate": 0.001, "loss": 0.0609, "step": 97300 }, { "epoch": 0.97, "grad_norm": 0.2181667536497116, "learning_rate": 0.001, "loss": 0.0583, "step": 97400 }, { "epoch": 0.97, "grad_norm": 0.26551786065101624, "learning_rate": 0.001, "loss": 0.0566, "step": 97500 }, { "epoch": 0.98, "grad_norm": 0.2258795201778412, "learning_rate": 0.001, "loss": 0.0574, "step": 97600 }, { "epoch": 0.98, "grad_norm": 0.17733299732208252, "learning_rate": 0.001, "loss": 0.0588, "step": 97700 }, { "epoch": 0.98, "grad_norm": 0.4031812846660614, "learning_rate": 0.001, "loss": 0.0584, "step": 97800 }, { "epoch": 0.98, "grad_norm": 0.22529329359531403, "learning_rate": 0.001, "loss": 0.0572, "step": 97900 }, { "epoch": 0.98, "grad_norm": 0.2503925561904907, "learning_rate": 0.001, "loss": 0.0588, "step": 98000 }, { "epoch": 0.98, "grad_norm": 0.17040744423866272, "learning_rate": 0.001, "loss": 0.0603, "step": 98100 }, { "epoch": 0.98, "grad_norm": 0.17749032378196716, "learning_rate": 0.001, "loss": 0.057, "step": 98200 }, { "epoch": 0.98, "grad_norm": 0.3931177854537964, "learning_rate": 0.001, "loss": 0.0566, "step": 98300 }, { "epoch": 0.98, "grad_norm": 0.22418583929538727, "learning_rate": 0.001, "loss": 0.0574, "step": 98400 }, { "epoch": 0.98, "grad_norm": 0.30830493569374084, "learning_rate": 0.001, "loss": 0.0593, "step": 98500 }, { "epoch": 0.99, "grad_norm": 0.2269369661808014, "learning_rate": 0.001, "loss": 0.0585, "step": 98600 }, { "epoch": 0.99, "grad_norm": 0.31830596923828125, "learning_rate": 0.001, "loss": 0.0548, "step": 98700 }, { "epoch": 0.99, "grad_norm": 0.25759172439575195, "learning_rate": 0.001, "loss": 0.0564, "step": 98800 }, { "epoch": 0.99, "grad_norm": 0.23925898969173431, "learning_rate": 0.001, "loss": 0.0592, "step": 98900 }, { "epoch": 0.99, "grad_norm": 0.17434507608413696, "learning_rate": 0.001, "loss": 0.0583, "step": 99000 }, { "epoch": 0.99, "grad_norm": 0.3493863642215729, "learning_rate": 0.001, "loss": 0.0571, "step": 99100 }, { "epoch": 0.99, "grad_norm": 0.20887431502342224, "learning_rate": 0.001, "loss": 0.0564, "step": 99200 }, { "epoch": 0.99, "grad_norm": 0.18060541152954102, "learning_rate": 0.001, "loss": 0.0583, "step": 99300 }, { "epoch": 0.99, "grad_norm": 0.3689703047275543, "learning_rate": 0.001, "loss": 0.0565, "step": 99400 }, { "epoch": 0.99, "grad_norm": 0.25323519110679626, "learning_rate": 0.001, "loss": 0.0576, "step": 99500 }, { "epoch": 1.0, "grad_norm": 0.27348294854164124, "learning_rate": 0.001, "loss": 0.0568, "step": 99600 }, { "epoch": 1.0, "grad_norm": 0.25492238998413086, "learning_rate": 0.001, "loss": 0.0561, "step": 99700 }, { "epoch": 1.0, "grad_norm": 0.2604049742221832, "learning_rate": 0.001, "loss": 0.0564, "step": 99800 }, { "epoch": 1.0, "grad_norm": 0.37222278118133545, "learning_rate": 0.001, "loss": 0.059, "step": 99900 }, { "epoch": 1.0, "grad_norm": 0.3180735111236572, "learning_rate": 0.001, "loss": 0.0588, "step": 100000 }, { "epoch": 1.0, "step": 100000, "total_flos": 8.920695708927918e+18, "train_loss": 0.09189929046154022, "train_runtime": 235079.2305, "train_samples_per_second": 54.45, "train_steps_per_second": 0.425 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 8.920695708927918e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }