diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7031 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.99999750000625, + "eval_steps": 500, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.3668140172958374, + "learning_rate": 0.001, + "loss": 1.2955, + "step": 100 + }, + { + "epoch": 0.0, + "grad_norm": 0.4789515733718872, + "learning_rate": 0.001, + "loss": 0.2147, + "step": 200 + }, + { + "epoch": 0.0, + "grad_norm": 0.8046264052391052, + "learning_rate": 0.001, + "loss": 0.1773, + "step": 300 + }, + { + "epoch": 0.0, + "grad_norm": 0.6500861644744873, + "learning_rate": 0.001, + "loss": 0.169, + "step": 400 + }, + { + "epoch": 0.0, + "grad_norm": 1.9476549625396729, + "learning_rate": 0.001, + "loss": 0.155, + "step": 500 + }, + { + "epoch": 0.01, + "grad_norm": 0.7570195198059082, + "learning_rate": 0.001, + "loss": 0.221, + "step": 600 + }, + { + "epoch": 0.01, + "grad_norm": 0.13534319400787354, + "learning_rate": 0.001, + "loss": 0.242, + "step": 700 + }, + { + "epoch": 0.01, + "grad_norm": 0.12334191799163818, + "learning_rate": 0.001, + "loss": 0.2387, + "step": 800 + }, + { + "epoch": 0.01, + "grad_norm": 2.0074245929718018, + "learning_rate": 0.001, + "loss": 0.1844, + "step": 900 + }, + { + "epoch": 0.01, + "grad_norm": 0.2459566444158554, + "learning_rate": 0.001, + "loss": 0.2273, + "step": 1000 + }, + { + "epoch": 0.01, + "grad_norm": 0.35431796312332153, + "learning_rate": 0.001, + "loss": 0.2406, + "step": 1100 + }, + { + "epoch": 0.01, + "grad_norm": 0.07735779881477356, + "learning_rate": 0.001, + "loss": 0.2362, + "step": 1200 + }, + { + "epoch": 0.01, + "grad_norm": 0.197942316532135, + "learning_rate": 0.001, + "loss": 0.2361, + "step": 1300 + }, + { + "epoch": 0.01, + "grad_norm": 0.06753970682621002, + "learning_rate": 0.001, + "loss": 0.2346, + "step": 1400 + }, + { + "epoch": 0.01, + "grad_norm": 0.17562294006347656, + "learning_rate": 0.001, + "loss": 0.2356, + "step": 1500 + }, + { + "epoch": 0.02, + "grad_norm": 0.12020650506019592, + "learning_rate": 0.001, + "loss": 0.2343, + "step": 1600 + }, + { + "epoch": 0.02, + "grad_norm": 0.07772481441497803, + "learning_rate": 0.001, + "loss": 0.2343, + "step": 1700 + }, + { + "epoch": 0.02, + "grad_norm": 0.041362863034009933, + "learning_rate": 0.001, + "loss": 0.2345, + "step": 1800 + }, + { + "epoch": 0.02, + "grad_norm": 0.050947155803442, + "learning_rate": 0.001, + "loss": 0.2333, + "step": 1900 + }, + { + "epoch": 0.02, + "grad_norm": 0.24440822005271912, + "learning_rate": 0.001, + "loss": 0.2346, + "step": 2000 + }, + { + "epoch": 0.02, + "grad_norm": 0.4386675953865051, + "learning_rate": 0.001, + "loss": 0.2346, + "step": 2100 + }, + { + "epoch": 0.02, + "grad_norm": 0.054741185158491135, + "learning_rate": 0.001, + "loss": 0.2347, + "step": 2200 + }, + { + "epoch": 0.02, + "grad_norm": 0.5285407304763794, + "learning_rate": 0.001, + "loss": 0.2341, + "step": 2300 + }, + { + "epoch": 0.02, + "grad_norm": 0.5406210422515869, + "learning_rate": 0.001, + "loss": 0.2322, + "step": 2400 + }, + { + "epoch": 0.02, + "grad_norm": 1.1667808294296265, + "learning_rate": 0.001, + "loss": 0.1683, + "step": 2500 + }, + { + "epoch": 0.03, + "grad_norm": 0.11829289048910141, + "learning_rate": 0.001, + "loss": 0.2138, + "step": 2600 + }, + { + "epoch": 0.03, + "grad_norm": 1.528359055519104, + "learning_rate": 0.001, + "loss": 0.1902, + "step": 2700 + }, + { + "epoch": 0.03, + "grad_norm": 0.45457515120506287, + "learning_rate": 0.001, + "loss": 0.1592, + "step": 2800 + }, + { + "epoch": 0.03, + "grad_norm": 0.2595893144607544, + "learning_rate": 0.001, + "loss": 0.1511, + "step": 2900 + }, + { + "epoch": 0.03, + "grad_norm": 0.5346922278404236, + "learning_rate": 0.001, + "loss": 0.1439, + "step": 3000 + }, + { + "epoch": 0.03, + "grad_norm": 3.5066208839416504, + "learning_rate": 0.001, + "loss": 0.1617, + "step": 3100 + }, + { + "epoch": 0.03, + "grad_norm": 24.826475143432617, + "learning_rate": 0.001, + "loss": 0.2024, + "step": 3200 + }, + { + "epoch": 0.03, + "grad_norm": 10.144634246826172, + "learning_rate": 0.001, + "loss": 0.1882, + "step": 3300 + }, + { + "epoch": 0.03, + "grad_norm": 0.43425676226615906, + "learning_rate": 0.001, + "loss": 0.169, + "step": 3400 + }, + { + "epoch": 0.03, + "grad_norm": 0.3496113717556, + "learning_rate": 0.001, + "loss": 0.1542, + "step": 3500 + }, + { + "epoch": 0.04, + "grad_norm": 6.317073345184326, + "learning_rate": 0.001, + "loss": 0.1676, + "step": 3600 + }, + { + "epoch": 0.04, + "grad_norm": 1.1362758874893188, + "learning_rate": 0.001, + "loss": 0.1599, + "step": 3700 + }, + { + "epoch": 0.04, + "grad_norm": 4.871659755706787, + "learning_rate": 0.001, + "loss": 0.1473, + "step": 3800 + }, + { + "epoch": 0.04, + "grad_norm": 0.10563373565673828, + "learning_rate": 0.001, + "loss": 0.1652, + "step": 3900 + }, + { + "epoch": 0.04, + "grad_norm": 0.08865318447351456, + "learning_rate": 0.001, + "loss": 0.2326, + "step": 4000 + }, + { + "epoch": 0.04, + "grad_norm": 0.0642586424946785, + "learning_rate": 0.001, + "loss": 0.2329, + "step": 4100 + }, + { + "epoch": 0.04, + "grad_norm": 0.36199188232421875, + "learning_rate": 0.001, + "loss": 0.2331, + "step": 4200 + }, + { + "epoch": 0.04, + "grad_norm": 0.17750632762908936, + "learning_rate": 0.001, + "loss": 0.2326, + "step": 4300 + }, + { + "epoch": 0.04, + "grad_norm": 0.103765107691288, + "learning_rate": 0.001, + "loss": 0.2329, + "step": 4400 + }, + { + "epoch": 0.04, + "grad_norm": 0.11186927556991577, + "learning_rate": 0.001, + "loss": 0.2326, + "step": 4500 + }, + { + "epoch": 0.05, + "grad_norm": 0.04914987459778786, + "learning_rate": 0.001, + "loss": 0.2326, + "step": 4600 + }, + { + "epoch": 0.05, + "grad_norm": 0.09826149046421051, + "learning_rate": 0.001, + "loss": 0.2324, + "step": 4700 + }, + { + "epoch": 0.05, + "grad_norm": 0.08518774062395096, + "learning_rate": 0.001, + "loss": 0.2327, + "step": 4800 + }, + { + "epoch": 0.05, + "grad_norm": 0.12364567071199417, + "learning_rate": 0.001, + "loss": 0.2321, + "step": 4900 + }, + { + "epoch": 0.05, + "grad_norm": 0.10944374650716782, + "learning_rate": 0.001, + "loss": 0.2322, + "step": 5000 + }, + { + "epoch": 0.05, + "grad_norm": 0.08173243701457977, + "learning_rate": 0.001, + "loss": 0.2326, + "step": 5100 + }, + { + "epoch": 0.05, + "grad_norm": 0.17504490911960602, + "learning_rate": 0.001, + "loss": 0.232, + "step": 5200 + }, + { + "epoch": 0.05, + "grad_norm": 0.03396083042025566, + "learning_rate": 0.001, + "loss": 0.2326, + "step": 5300 + }, + { + "epoch": 0.05, + "grad_norm": 0.12226787954568863, + "learning_rate": 0.001, + "loss": 0.2324, + "step": 5400 + }, + { + "epoch": 0.05, + "grad_norm": 0.029385367408394814, + "learning_rate": 0.001, + "loss": 0.2324, + "step": 5500 + }, + { + "epoch": 0.06, + "grad_norm": 0.08070210367441177, + "learning_rate": 0.001, + "loss": 0.2322, + "step": 5600 + }, + { + "epoch": 0.06, + "grad_norm": 0.026348430663347244, + "learning_rate": 0.001, + "loss": 0.2316, + "step": 5700 + }, + { + "epoch": 0.06, + "grad_norm": 0.06884663552045822, + "learning_rate": 0.001, + "loss": 0.2322, + "step": 5800 + }, + { + "epoch": 0.06, + "grad_norm": 0.09100496768951416, + "learning_rate": 0.001, + "loss": 0.3271, + "step": 5900 + }, + { + "epoch": 0.06, + "grad_norm": 0.0949195995926857, + "learning_rate": 0.001, + "loss": 0.2322, + "step": 6000 + }, + { + "epoch": 0.06, + "grad_norm": 0.17315314710140228, + "learning_rate": 0.001, + "loss": 0.232, + "step": 6100 + }, + { + "epoch": 0.06, + "grad_norm": 0.04644012451171875, + "learning_rate": 0.001, + "loss": 0.2317, + "step": 6200 + }, + { + "epoch": 0.06, + "grad_norm": 0.03242076560854912, + "learning_rate": 0.001, + "loss": 0.2317, + "step": 6300 + }, + { + "epoch": 0.06, + "grad_norm": 0.03038044273853302, + "learning_rate": 0.001, + "loss": 0.2322, + "step": 6400 + }, + { + "epoch": 0.06, + "grad_norm": 0.04407713562250137, + "learning_rate": 0.001, + "loss": 0.2321, + "step": 6500 + }, + { + "epoch": 0.07, + "grad_norm": 0.04973585903644562, + "learning_rate": 0.001, + "loss": 0.2321, + "step": 6600 + }, + { + "epoch": 0.07, + "grad_norm": 0.043713077902793884, + "learning_rate": 0.001, + "loss": 0.2319, + "step": 6700 + }, + { + "epoch": 0.07, + "grad_norm": 0.0361105352640152, + "learning_rate": 0.001, + "loss": 0.2319, + "step": 6800 + }, + { + "epoch": 0.07, + "grad_norm": 0.038385313004255295, + "learning_rate": 0.001, + "loss": 0.2319, + "step": 6900 + }, + { + "epoch": 0.07, + "grad_norm": 0.059859637171030045, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 7000 + }, + { + "epoch": 0.07, + "grad_norm": 0.10737486183643341, + "learning_rate": 0.001, + "loss": 0.232, + "step": 7100 + }, + { + "epoch": 0.07, + "grad_norm": 0.07841573655605316, + "learning_rate": 0.001, + "loss": 0.2319, + "step": 7200 + }, + { + "epoch": 0.07, + "grad_norm": 0.12177613377571106, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 7300 + }, + { + "epoch": 0.07, + "grad_norm": 0.04158034175634384, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 7400 + }, + { + "epoch": 0.07, + "grad_norm": 0.04334099590778351, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 7500 + }, + { + "epoch": 0.08, + "grad_norm": 0.04868987202644348, + "learning_rate": 0.001, + "loss": 0.2317, + "step": 7600 + }, + { + "epoch": 0.08, + "grad_norm": 0.11688575893640518, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 7700 + }, + { + "epoch": 0.08, + "grad_norm": 0.05144130066037178, + "learning_rate": 0.001, + "loss": 0.2319, + "step": 7800 + }, + { + "epoch": 0.08, + "grad_norm": 0.04202236235141754, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 7900 + }, + { + "epoch": 0.08, + "grad_norm": 0.07848116755485535, + "learning_rate": 0.001, + "loss": 0.2314, + "step": 8000 + }, + { + "epoch": 0.08, + "grad_norm": 0.05292198061943054, + "learning_rate": 0.001, + "loss": 0.2317, + "step": 8100 + }, + { + "epoch": 0.08, + "grad_norm": 0.05817991867661476, + "learning_rate": 0.001, + "loss": 0.2318, + "step": 8200 + }, + { + "epoch": 0.08, + "grad_norm": 0.03250608965754509, + "learning_rate": 0.001, + "loss": 0.2316, + "step": 8300 + }, + { + "epoch": 0.08, + "grad_norm": 0.29823893308639526, + "learning_rate": 0.001, + "loss": 0.2311, + "step": 8400 + }, + { + "epoch": 0.08, + "grad_norm": 1.852128505706787, + "learning_rate": 0.001, + "loss": 0.1864, + "step": 8500 + }, + { + "epoch": 0.09, + "grad_norm": 61.31148147583008, + "learning_rate": 0.001, + "loss": 0.1911, + "step": 8600 + }, + { + "epoch": 0.09, + "grad_norm": 3.4901123046875, + "learning_rate": 0.001, + "loss": 0.1934, + "step": 8700 + }, + { + "epoch": 0.09, + "grad_norm": 0.9580036401748657, + "learning_rate": 0.001, + "loss": 0.1706, + "step": 8800 + }, + { + "epoch": 0.09, + "grad_norm": 0.5461576581001282, + "learning_rate": 0.001, + "loss": 0.1597, + "step": 8900 + }, + { + "epoch": 0.09, + "grad_norm": 3.481351375579834, + "learning_rate": 0.001, + "loss": 0.1511, + "step": 9000 + }, + { + "epoch": 0.09, + "grad_norm": 0.3008120656013489, + "learning_rate": 0.001, + "loss": 0.154, + "step": 9100 + }, + { + "epoch": 0.09, + "grad_norm": 0.23753711581230164, + "learning_rate": 0.001, + "loss": 0.1406, + "step": 9200 + }, + { + "epoch": 0.09, + "grad_norm": 0.9201159477233887, + "learning_rate": 0.001, + "loss": 0.1444, + "step": 9300 + }, + { + "epoch": 0.09, + "grad_norm": 1.6734191179275513, + "learning_rate": 0.001, + "loss": 0.1385, + "step": 9400 + }, + { + "epoch": 0.09, + "grad_norm": 1.7249393463134766, + "learning_rate": 0.001, + "loss": 0.1393, + "step": 9500 + }, + { + "epoch": 0.1, + "grad_norm": 0.5765690207481384, + "learning_rate": 0.001, + "loss": 0.1397, + "step": 9600 + }, + { + "epoch": 0.1, + "grad_norm": 0.4266449213027954, + "learning_rate": 0.001, + "loss": 0.1386, + "step": 9700 + }, + { + "epoch": 0.1, + "grad_norm": 0.23247841000556946, + "learning_rate": 0.001, + "loss": 0.1343, + "step": 9800 + }, + { + "epoch": 0.1, + "grad_norm": 0.19435954093933105, + "learning_rate": 0.001, + "loss": 0.1306, + "step": 9900 + }, + { + "epoch": 0.1, + "grad_norm": 0.27626514434814453, + "learning_rate": 0.001, + "loss": 0.133, + "step": 10000 + }, + { + "epoch": 0.1, + "grad_norm": 0.1834883689880371, + "learning_rate": 0.001, + "loss": 0.1299, + "step": 10100 + }, + { + "epoch": 0.1, + "grad_norm": 0.4306440055370331, + "learning_rate": 0.001, + "loss": 0.1309, + "step": 10200 + }, + { + "epoch": 0.1, + "grad_norm": 0.15750516951084137, + "learning_rate": 0.001, + "loss": 0.1266, + "step": 10300 + }, + { + "epoch": 0.1, + "grad_norm": 0.2934073805809021, + "learning_rate": 0.001, + "loss": 0.1278, + "step": 10400 + }, + { + "epoch": 0.1, + "grad_norm": 0.27599695324897766, + "learning_rate": 0.001, + "loss": 0.1286, + "step": 10500 + }, + { + "epoch": 0.11, + "grad_norm": 0.39952772855758667, + "learning_rate": 0.001, + "loss": 0.1252, + "step": 10600 + }, + { + "epoch": 0.11, + "grad_norm": 0.4082016348838806, + "learning_rate": 0.001, + "loss": 0.1272, + "step": 10700 + }, + { + "epoch": 0.11, + "grad_norm": 0.303307443857193, + "learning_rate": 0.001, + "loss": 0.1249, + "step": 10800 + }, + { + "epoch": 0.11, + "grad_norm": 0.1597479283809662, + "learning_rate": 0.001, + "loss": 0.1247, + "step": 10900 + }, + { + "epoch": 0.11, + "grad_norm": 1.03666090965271, + "learning_rate": 0.001, + "loss": 0.1286, + "step": 11000 + }, + { + "epoch": 0.11, + "grad_norm": 0.2832247018814087, + "learning_rate": 0.001, + "loss": 0.1248, + "step": 11100 + }, + { + "epoch": 0.11, + "grad_norm": 0.49678078293800354, + "learning_rate": 0.001, + "loss": 0.1258, + "step": 11200 + }, + { + "epoch": 0.11, + "grad_norm": 0.3678058385848999, + "learning_rate": 0.001, + "loss": 0.1256, + "step": 11300 + }, + { + "epoch": 0.11, + "grad_norm": 0.26233455538749695, + "learning_rate": 0.001, + "loss": 0.1233, + "step": 11400 + }, + { + "epoch": 0.11, + "grad_norm": 0.22039958834648132, + "learning_rate": 0.001, + "loss": 0.1197, + "step": 11500 + }, + { + "epoch": 0.12, + "grad_norm": 0.14722639322280884, + "learning_rate": 0.001, + "loss": 0.1225, + "step": 11600 + }, + { + "epoch": 0.12, + "grad_norm": 0.19015900790691376, + "learning_rate": 0.001, + "loss": 0.1217, + "step": 11700 + }, + { + "epoch": 0.12, + "grad_norm": 0.15655829012393951, + "learning_rate": 0.001, + "loss": 0.1185, + "step": 11800 + }, + { + "epoch": 0.12, + "grad_norm": 3.5397889614105225, + "learning_rate": 0.001, + "loss": 0.119, + "step": 11900 + }, + { + "epoch": 0.12, + "grad_norm": 0.845320999622345, + "learning_rate": 0.001, + "loss": 0.1276, + "step": 12000 + }, + { + "epoch": 0.12, + "grad_norm": 0.34136563539505005, + "learning_rate": 0.001, + "loss": 0.122, + "step": 12100 + }, + { + "epoch": 0.12, + "grad_norm": 0.2509533762931824, + "learning_rate": 0.001, + "loss": 0.1199, + "step": 12200 + }, + { + "epoch": 0.12, + "grad_norm": 0.31120267510414124, + "learning_rate": 0.001, + "loss": 0.1191, + "step": 12300 + }, + { + "epoch": 0.12, + "grad_norm": 0.3903524875640869, + "learning_rate": 0.001, + "loss": 0.1183, + "step": 12400 + }, + { + "epoch": 0.12, + "grad_norm": 0.19971555471420288, + "learning_rate": 0.001, + "loss": 0.1177, + "step": 12500 + }, + { + "epoch": 0.13, + "grad_norm": 0.36589089035987854, + "learning_rate": 0.001, + "loss": 0.1158, + "step": 12600 + }, + { + "epoch": 0.13, + "grad_norm": 0.19200453162193298, + "learning_rate": 0.001, + "loss": 0.1166, + "step": 12700 + }, + { + "epoch": 0.13, + "grad_norm": 0.6393672823905945, + "learning_rate": 0.001, + "loss": 0.1171, + "step": 12800 + }, + { + "epoch": 0.13, + "grad_norm": 0.32421180605888367, + "learning_rate": 0.001, + "loss": 0.118, + "step": 12900 + }, + { + "epoch": 0.13, + "grad_norm": 0.6238926649093628, + "learning_rate": 0.001, + "loss": 0.1166, + "step": 13000 + }, + { + "epoch": 0.13, + "grad_norm": 0.1363907754421234, + "learning_rate": 0.001, + "loss": 0.1156, + "step": 13100 + }, + { + "epoch": 0.13, + "grad_norm": 0.16790109872817993, + "learning_rate": 0.001, + "loss": 0.1142, + "step": 13200 + }, + { + "epoch": 0.13, + "grad_norm": 0.1915178894996643, + "learning_rate": 0.001, + "loss": 0.1126, + "step": 13300 + }, + { + "epoch": 0.13, + "grad_norm": 0.12727123498916626, + "learning_rate": 0.001, + "loss": 0.1156, + "step": 13400 + }, + { + "epoch": 0.13, + "grad_norm": 0.29520758986473083, + "learning_rate": 0.001, + "loss": 0.1129, + "step": 13500 + }, + { + "epoch": 0.14, + "grad_norm": 0.1663757860660553, + "learning_rate": 0.001, + "loss": 0.1132, + "step": 13600 + }, + { + "epoch": 0.14, + "grad_norm": 0.1840706318616867, + "learning_rate": 0.001, + "loss": 0.119, + "step": 13700 + }, + { + "epoch": 0.14, + "grad_norm": 0.16156257688999176, + "learning_rate": 0.001, + "loss": 0.1146, + "step": 13800 + }, + { + "epoch": 0.14, + "grad_norm": 0.17348338663578033, + "learning_rate": 0.001, + "loss": 0.1141, + "step": 13900 + }, + { + "epoch": 0.14, + "grad_norm": 0.18696527183055878, + "learning_rate": 0.001, + "loss": 0.1108, + "step": 14000 + }, + { + "epoch": 0.14, + "grad_norm": 0.15352846682071686, + "learning_rate": 0.001, + "loss": 0.1134, + "step": 14100 + }, + { + "epoch": 0.14, + "grad_norm": 0.23210759460926056, + "learning_rate": 0.001, + "loss": 0.1142, + "step": 14200 + }, + { + "epoch": 0.14, + "grad_norm": 0.18328526616096497, + "learning_rate": 0.001, + "loss": 0.1109, + "step": 14300 + }, + { + "epoch": 0.14, + "grad_norm": 0.17674757540225983, + "learning_rate": 0.001, + "loss": 0.1083, + "step": 14400 + }, + { + "epoch": 0.14, + "grad_norm": 0.34446394443511963, + "learning_rate": 0.001, + "loss": 0.1203, + "step": 14500 + }, + { + "epoch": 0.15, + "grad_norm": 0.22947299480438232, + "learning_rate": 0.001, + "loss": 0.1095, + "step": 14600 + }, + { + "epoch": 0.15, + "grad_norm": 0.15071985125541687, + "learning_rate": 0.001, + "loss": 0.1088, + "step": 14700 + }, + { + "epoch": 0.15, + "grad_norm": 0.14273251593112946, + "learning_rate": 0.001, + "loss": 0.1091, + "step": 14800 + }, + { + "epoch": 0.15, + "grad_norm": 0.20266981422901154, + "learning_rate": 0.001, + "loss": 0.1089, + "step": 14900 + }, + { + "epoch": 0.15, + "grad_norm": 0.1495724767446518, + "learning_rate": 0.001, + "loss": 0.1089, + "step": 15000 + }, + { + "epoch": 0.15, + "grad_norm": 0.1711970865726471, + "learning_rate": 0.001, + "loss": 0.1063, + "step": 15100 + }, + { + "epoch": 0.15, + "grad_norm": 0.20727260410785675, + "learning_rate": 0.001, + "loss": 0.104, + "step": 15200 + }, + { + "epoch": 0.15, + "grad_norm": 0.22724412381649017, + "learning_rate": 0.001, + "loss": 0.1087, + "step": 15300 + }, + { + "epoch": 0.15, + "grad_norm": 0.15561726689338684, + "learning_rate": 0.001, + "loss": 0.1086, + "step": 15400 + }, + { + "epoch": 0.15, + "grad_norm": 0.2139796018600464, + "learning_rate": 0.001, + "loss": 0.1054, + "step": 15500 + }, + { + "epoch": 0.16, + "grad_norm": 0.24371370673179626, + "learning_rate": 0.001, + "loss": 0.1077, + "step": 15600 + }, + { + "epoch": 0.16, + "grad_norm": 0.22944559156894684, + "learning_rate": 0.001, + "loss": 0.1092, + "step": 15700 + }, + { + "epoch": 0.16, + "grad_norm": 0.19578562676906586, + "learning_rate": 0.001, + "loss": 0.1077, + "step": 15800 + }, + { + "epoch": 0.16, + "grad_norm": 0.17588412761688232, + "learning_rate": 0.001, + "loss": 0.1048, + "step": 15900 + }, + { + "epoch": 0.16, + "grad_norm": 0.16697707772254944, + "learning_rate": 0.001, + "loss": 0.1072, + "step": 16000 + }, + { + "epoch": 0.16, + "grad_norm": 0.1927742063999176, + "learning_rate": 0.001, + "loss": 0.1036, + "step": 16100 + }, + { + "epoch": 0.16, + "grad_norm": 0.25396087765693665, + "learning_rate": 0.001, + "loss": 0.1068, + "step": 16200 + }, + { + "epoch": 0.16, + "grad_norm": 0.21014653146266937, + "learning_rate": 0.001, + "loss": 0.1012, + "step": 16300 + }, + { + "epoch": 0.16, + "grad_norm": 0.32085150480270386, + "learning_rate": 0.001, + "loss": 0.1062, + "step": 16400 + }, + { + "epoch": 0.16, + "grad_norm": 0.10534122586250305, + "learning_rate": 0.001, + "loss": 0.103, + "step": 16500 + }, + { + "epoch": 0.17, + "grad_norm": 0.24365462362766266, + "learning_rate": 0.001, + "loss": 0.106, + "step": 16600 + }, + { + "epoch": 0.17, + "grad_norm": 0.15197184681892395, + "learning_rate": 0.001, + "loss": 0.1051, + "step": 16700 + }, + { + "epoch": 0.17, + "grad_norm": 0.23027855157852173, + "learning_rate": 0.001, + "loss": 0.1065, + "step": 16800 + }, + { + "epoch": 0.17, + "grad_norm": 0.14924216270446777, + "learning_rate": 0.001, + "loss": 0.1068, + "step": 16900 + }, + { + "epoch": 0.17, + "grad_norm": 0.13331858813762665, + "learning_rate": 0.001, + "loss": 0.1035, + "step": 17000 + }, + { + "epoch": 0.17, + "grad_norm": 0.20150358974933624, + "learning_rate": 0.001, + "loss": 0.1065, + "step": 17100 + }, + { + "epoch": 0.17, + "grad_norm": 0.1429535299539566, + "learning_rate": 0.001, + "loss": 0.1056, + "step": 17200 + }, + { + "epoch": 0.17, + "grad_norm": 0.16326557099819183, + "learning_rate": 0.001, + "loss": 0.1022, + "step": 17300 + }, + { + "epoch": 0.17, + "grad_norm": 0.15712429583072662, + "learning_rate": 0.001, + "loss": 0.1051, + "step": 17400 + }, + { + "epoch": 0.17, + "grad_norm": 0.33204013109207153, + "learning_rate": 0.001, + "loss": 0.1046, + "step": 17500 + }, + { + "epoch": 0.18, + "grad_norm": 0.17703518271446228, + "learning_rate": 0.001, + "loss": 0.1057, + "step": 17600 + }, + { + "epoch": 0.18, + "grad_norm": 0.14861218631267548, + "learning_rate": 0.001, + "loss": 0.1052, + "step": 17700 + }, + { + "epoch": 0.18, + "grad_norm": 0.18271447718143463, + "learning_rate": 0.001, + "loss": 0.1049, + "step": 17800 + }, + { + "epoch": 0.18, + "grad_norm": 0.2245068997144699, + "learning_rate": 0.001, + "loss": 0.1033, + "step": 17900 + }, + { + "epoch": 0.18, + "grad_norm": 0.2233046442270279, + "learning_rate": 0.001, + "loss": 0.1049, + "step": 18000 + }, + { + "epoch": 0.18, + "grad_norm": 0.1915113776922226, + "learning_rate": 0.001, + "loss": 0.1039, + "step": 18100 + }, + { + "epoch": 0.18, + "grad_norm": 0.1070462241768837, + "learning_rate": 0.001, + "loss": 0.1028, + "step": 18200 + }, + { + "epoch": 0.18, + "grad_norm": 0.14523275196552277, + "learning_rate": 0.001, + "loss": 0.0983, + "step": 18300 + }, + { + "epoch": 0.18, + "grad_norm": 0.24468256533145905, + "learning_rate": 0.001, + "loss": 0.1018, + "step": 18400 + }, + { + "epoch": 0.18, + "grad_norm": 0.17596426606178284, + "learning_rate": 0.001, + "loss": 0.1017, + "step": 18500 + }, + { + "epoch": 0.19, + "grad_norm": 0.15113884210586548, + "learning_rate": 0.001, + "loss": 0.1022, + "step": 18600 + }, + { + "epoch": 0.19, + "grad_norm": 0.1756398230791092, + "learning_rate": 0.001, + "loss": 0.1032, + "step": 18700 + }, + { + "epoch": 0.19, + "grad_norm": 0.1491193026304245, + "learning_rate": 0.001, + "loss": 0.1016, + "step": 18800 + }, + { + "epoch": 0.19, + "grad_norm": 0.15422752499580383, + "learning_rate": 0.001, + "loss": 0.0989, + "step": 18900 + }, + { + "epoch": 0.19, + "grad_norm": 0.13713973760604858, + "learning_rate": 0.001, + "loss": 0.1002, + "step": 19000 + }, + { + "epoch": 0.19, + "grad_norm": 0.16012702882289886, + "learning_rate": 0.001, + "loss": 0.101, + "step": 19100 + }, + { + "epoch": 0.19, + "grad_norm": 0.23414984345436096, + "learning_rate": 0.001, + "loss": 0.0975, + "step": 19200 + }, + { + "epoch": 0.19, + "grad_norm": 0.13922521471977234, + "learning_rate": 0.001, + "loss": 0.1002, + "step": 19300 + }, + { + "epoch": 0.19, + "grad_norm": 0.14608104526996613, + "learning_rate": 0.001, + "loss": 0.098, + "step": 19400 + }, + { + "epoch": 0.19, + "grad_norm": 0.19267164170742035, + "learning_rate": 0.001, + "loss": 0.098, + "step": 19500 + }, + { + "epoch": 0.2, + "grad_norm": 0.1570904552936554, + "learning_rate": 0.001, + "loss": 0.1034, + "step": 19600 + }, + { + "epoch": 0.2, + "grad_norm": 0.3922866880893707, + "learning_rate": 0.001, + "loss": 0.1008, + "step": 19700 + }, + { + "epoch": 0.2, + "grad_norm": 0.20500238239765167, + "learning_rate": 0.001, + "loss": 0.1025, + "step": 19800 + }, + { + "epoch": 0.2, + "grad_norm": 0.2044358104467392, + "learning_rate": 0.001, + "loss": 0.0982, + "step": 19900 + }, + { + "epoch": 0.2, + "grad_norm": 0.1722269356250763, + "learning_rate": 0.001, + "loss": 0.1007, + "step": 20000 + }, + { + "epoch": 0.2, + "grad_norm": 0.21868231892585754, + "learning_rate": 0.001, + "loss": 0.1, + "step": 20100 + }, + { + "epoch": 0.2, + "grad_norm": 0.12817895412445068, + "learning_rate": 0.001, + "loss": 0.1, + "step": 20200 + }, + { + "epoch": 0.2, + "grad_norm": 0.12333246320486069, + "learning_rate": 0.001, + "loss": 0.0987, + "step": 20300 + }, + { + "epoch": 0.2, + "grad_norm": 0.1742565631866455, + "learning_rate": 0.001, + "loss": 0.0981, + "step": 20400 + }, + { + "epoch": 0.2, + "grad_norm": 0.15747936069965363, + "learning_rate": 0.001, + "loss": 0.1012, + "step": 20500 + }, + { + "epoch": 0.21, + "grad_norm": 0.27314338088035583, + "learning_rate": 0.001, + "loss": 0.1014, + "step": 20600 + }, + { + "epoch": 0.21, + "grad_norm": 0.9368189573287964, + "learning_rate": 0.001, + "loss": 0.1035, + "step": 20700 + }, + { + "epoch": 0.21, + "grad_norm": 0.3574996590614319, + "learning_rate": 0.001, + "loss": 0.0992, + "step": 20800 + }, + { + "epoch": 0.21, + "grad_norm": 0.28280141949653625, + "learning_rate": 0.001, + "loss": 0.0975, + "step": 20900 + }, + { + "epoch": 0.21, + "grad_norm": 0.21435654163360596, + "learning_rate": 0.001, + "loss": 0.0998, + "step": 21000 + }, + { + "epoch": 0.21, + "grad_norm": 0.20617541670799255, + "learning_rate": 0.001, + "loss": 0.0994, + "step": 21100 + }, + { + "epoch": 0.21, + "grad_norm": 0.21885354816913605, + "learning_rate": 0.001, + "loss": 0.099, + "step": 21200 + }, + { + "epoch": 0.21, + "grad_norm": 0.24429431557655334, + "learning_rate": 0.001, + "loss": 0.1018, + "step": 21300 + }, + { + "epoch": 0.21, + "grad_norm": 0.24264854192733765, + "learning_rate": 0.001, + "loss": 0.1009, + "step": 21400 + }, + { + "epoch": 0.21, + "grad_norm": 0.19410717487335205, + "learning_rate": 0.001, + "loss": 0.1007, + "step": 21500 + }, + { + "epoch": 0.22, + "grad_norm": 0.15938735008239746, + "learning_rate": 0.001, + "loss": 0.0965, + "step": 21600 + }, + { + "epoch": 0.22, + "grad_norm": 0.678229808807373, + "learning_rate": 0.001, + "loss": 0.1001, + "step": 21700 + }, + { + "epoch": 0.22, + "grad_norm": 0.2967202663421631, + "learning_rate": 0.001, + "loss": 0.1003, + "step": 21800 + }, + { + "epoch": 0.22, + "grad_norm": 0.7940108180046082, + "learning_rate": 0.001, + "loss": 0.1001, + "step": 21900 + }, + { + "epoch": 0.22, + "grad_norm": 0.24995733797550201, + "learning_rate": 0.001, + "loss": 0.0992, + "step": 22000 + }, + { + "epoch": 0.22, + "grad_norm": 0.1626627892255783, + "learning_rate": 0.001, + "loss": 0.0992, + "step": 22100 + }, + { + "epoch": 0.22, + "grad_norm": 0.21141190826892853, + "learning_rate": 0.001, + "loss": 0.0961, + "step": 22200 + }, + { + "epoch": 0.22, + "grad_norm": 0.21122020483016968, + "learning_rate": 0.001, + "loss": 0.0968, + "step": 22300 + }, + { + "epoch": 0.22, + "grad_norm": 0.2558838725090027, + "learning_rate": 0.001, + "loss": 0.098, + "step": 22400 + }, + { + "epoch": 0.22, + "grad_norm": 0.1975196897983551, + "learning_rate": 0.001, + "loss": 0.0987, + "step": 22500 + }, + { + "epoch": 0.23, + "grad_norm": 0.14767397940158844, + "learning_rate": 0.001, + "loss": 0.096, + "step": 22600 + }, + { + "epoch": 0.23, + "grad_norm": 0.17532730102539062, + "learning_rate": 0.001, + "loss": 0.0985, + "step": 22700 + }, + { + "epoch": 0.23, + "grad_norm": 0.1320209801197052, + "learning_rate": 0.001, + "loss": 0.0968, + "step": 22800 + }, + { + "epoch": 0.23, + "grad_norm": 0.273934930562973, + "learning_rate": 0.001, + "loss": 0.0978, + "step": 22900 + }, + { + "epoch": 0.23, + "grad_norm": 0.15103434026241302, + "learning_rate": 0.001, + "loss": 0.0995, + "step": 23000 + }, + { + "epoch": 0.23, + "grad_norm": 0.2021692842245102, + "learning_rate": 0.001, + "loss": 0.0952, + "step": 23100 + }, + { + "epoch": 0.23, + "grad_norm": 0.1648433655500412, + "learning_rate": 0.001, + "loss": 0.0938, + "step": 23200 + }, + { + "epoch": 0.23, + "grad_norm": 0.17460817098617554, + "learning_rate": 0.001, + "loss": 0.0959, + "step": 23300 + }, + { + "epoch": 0.23, + "grad_norm": 0.15195918083190918, + "learning_rate": 0.001, + "loss": 0.094, + "step": 23400 + }, + { + "epoch": 0.23, + "grad_norm": 0.1664193570613861, + "learning_rate": 0.001, + "loss": 0.094, + "step": 23500 + }, + { + "epoch": 0.24, + "grad_norm": 0.14700663089752197, + "learning_rate": 0.001, + "loss": 0.0951, + "step": 23600 + }, + { + "epoch": 0.24, + "grad_norm": 0.22301018238067627, + "learning_rate": 0.001, + "loss": 0.0919, + "step": 23700 + }, + { + "epoch": 0.24, + "grad_norm": 0.1666121482849121, + "learning_rate": 0.001, + "loss": 0.0928, + "step": 23800 + }, + { + "epoch": 0.24, + "grad_norm": 0.1971474438905716, + "learning_rate": 0.001, + "loss": 0.0949, + "step": 23900 + }, + { + "epoch": 0.24, + "grad_norm": 0.15959730744361877, + "learning_rate": 0.001, + "loss": 0.095, + "step": 24000 + }, + { + "epoch": 0.24, + "grad_norm": 0.29146862030029297, + "learning_rate": 0.001, + "loss": 0.0942, + "step": 24100 + }, + { + "epoch": 0.24, + "grad_norm": 0.15853939950466156, + "learning_rate": 0.001, + "loss": 0.0978, + "step": 24200 + }, + { + "epoch": 0.24, + "grad_norm": 0.16822876036167145, + "learning_rate": 0.001, + "loss": 0.0934, + "step": 24300 + }, + { + "epoch": 0.24, + "grad_norm": 0.15456752479076385, + "learning_rate": 0.001, + "loss": 0.0948, + "step": 24400 + }, + { + "epoch": 0.24, + "grad_norm": 0.15123625099658966, + "learning_rate": 0.001, + "loss": 0.0926, + "step": 24500 + }, + { + "epoch": 0.25, + "grad_norm": 0.16344180703163147, + "learning_rate": 0.001, + "loss": 0.0935, + "step": 24600 + }, + { + "epoch": 0.25, + "grad_norm": 0.22936996817588806, + "learning_rate": 0.001, + "loss": 0.0936, + "step": 24700 + }, + { + "epoch": 0.25, + "grad_norm": 0.16810204088687897, + "learning_rate": 0.001, + "loss": 0.0978, + "step": 24800 + }, + { + "epoch": 0.25, + "grad_norm": 0.14977198839187622, + "learning_rate": 0.001, + "loss": 0.0936, + "step": 24900 + }, + { + "epoch": 0.25, + "grad_norm": 0.18207716941833496, + "learning_rate": 0.001, + "loss": 0.093, + "step": 25000 + }, + { + "epoch": 0.25, + "grad_norm": 0.2584002912044525, + "learning_rate": 0.001, + "loss": 0.0958, + "step": 25100 + }, + { + "epoch": 0.25, + "grad_norm": 0.23717880249023438, + "learning_rate": 0.001, + "loss": 0.0927, + "step": 25200 + }, + { + "epoch": 0.25, + "grad_norm": 0.1896461844444275, + "learning_rate": 0.001, + "loss": 0.094, + "step": 25300 + }, + { + "epoch": 0.25, + "grad_norm": 0.21543921530246735, + "learning_rate": 0.001, + "loss": 0.0953, + "step": 25400 + }, + { + "epoch": 0.25, + "grad_norm": 0.14013002812862396, + "learning_rate": 0.001, + "loss": 0.0958, + "step": 25500 + }, + { + "epoch": 0.26, + "grad_norm": 0.1744927018880844, + "learning_rate": 0.001, + "loss": 0.0946, + "step": 25600 + }, + { + "epoch": 0.26, + "grad_norm": 0.16546490788459778, + "learning_rate": 0.001, + "loss": 0.0962, + "step": 25700 + }, + { + "epoch": 0.26, + "grad_norm": 0.16227766871452332, + "learning_rate": 0.001, + "loss": 0.0952, + "step": 25800 + }, + { + "epoch": 0.26, + "grad_norm": 0.181349515914917, + "learning_rate": 0.001, + "loss": 0.0951, + "step": 25900 + }, + { + "epoch": 0.26, + "grad_norm": 0.20408563315868378, + "learning_rate": 0.001, + "loss": 0.0915, + "step": 26000 + }, + { + "epoch": 0.26, + "grad_norm": 0.1793171763420105, + "learning_rate": 0.001, + "loss": 0.0942, + "step": 26100 + }, + { + "epoch": 0.26, + "grad_norm": 0.14634822309017181, + "learning_rate": 0.001, + "loss": 0.0961, + "step": 26200 + }, + { + "epoch": 0.26, + "grad_norm": 0.18879148364067078, + "learning_rate": 0.001, + "loss": 0.0942, + "step": 26300 + }, + { + "epoch": 0.26, + "grad_norm": 0.20523515343666077, + "learning_rate": 0.001, + "loss": 0.0912, + "step": 26400 + }, + { + "epoch": 0.26, + "grad_norm": 0.18672947585582733, + "learning_rate": 0.001, + "loss": 0.092, + "step": 26500 + }, + { + "epoch": 0.27, + "grad_norm": 0.18561910092830658, + "learning_rate": 0.001, + "loss": 0.0913, + "step": 26600 + }, + { + "epoch": 0.27, + "grad_norm": 0.23991861939430237, + "learning_rate": 0.001, + "loss": 0.0925, + "step": 26700 + }, + { + "epoch": 0.27, + "grad_norm": 0.1660347878932953, + "learning_rate": 0.001, + "loss": 0.0939, + "step": 26800 + }, + { + "epoch": 0.27, + "grad_norm": 0.2105019986629486, + "learning_rate": 0.001, + "loss": 0.093, + "step": 26900 + }, + { + "epoch": 0.27, + "grad_norm": 0.2271376997232437, + "learning_rate": 0.001, + "loss": 0.0899, + "step": 27000 + }, + { + "epoch": 0.27, + "grad_norm": 0.14487460255622864, + "learning_rate": 0.001, + "loss": 0.0906, + "step": 27100 + }, + { + "epoch": 0.27, + "grad_norm": 0.1597098708152771, + "learning_rate": 0.001, + "loss": 0.0919, + "step": 27200 + }, + { + "epoch": 0.27, + "grad_norm": 0.18633900582790375, + "learning_rate": 0.001, + "loss": 0.0892, + "step": 27300 + }, + { + "epoch": 0.27, + "grad_norm": 0.12663201987743378, + "learning_rate": 0.001, + "loss": 0.0913, + "step": 27400 + }, + { + "epoch": 0.27, + "grad_norm": 0.17320451140403748, + "learning_rate": 0.001, + "loss": 0.0911, + "step": 27500 + }, + { + "epoch": 0.28, + "grad_norm": 0.16872632503509521, + "learning_rate": 0.001, + "loss": 0.091, + "step": 27600 + }, + { + "epoch": 0.28, + "grad_norm": 0.18602560460567474, + "learning_rate": 0.001, + "loss": 0.0908, + "step": 27700 + }, + { + "epoch": 0.28, + "grad_norm": 0.17392034828662872, + "learning_rate": 0.001, + "loss": 0.0882, + "step": 27800 + }, + { + "epoch": 0.28, + "grad_norm": 0.10278663039207458, + "learning_rate": 0.001, + "loss": 0.088, + "step": 27900 + }, + { + "epoch": 0.28, + "grad_norm": 0.15355843305587769, + "learning_rate": 0.001, + "loss": 0.0876, + "step": 28000 + }, + { + "epoch": 0.28, + "grad_norm": 0.17331954836845398, + "learning_rate": 0.001, + "loss": 0.0906, + "step": 28100 + }, + { + "epoch": 0.28, + "grad_norm": 0.16750375926494598, + "learning_rate": 0.001, + "loss": 0.0935, + "step": 28200 + }, + { + "epoch": 0.28, + "grad_norm": 0.27208462357521057, + "learning_rate": 0.001, + "loss": 0.0884, + "step": 28300 + }, + { + "epoch": 0.28, + "grad_norm": 0.2215784639120102, + "learning_rate": 0.001, + "loss": 0.0904, + "step": 28400 + }, + { + "epoch": 0.28, + "grad_norm": 0.1542549580335617, + "learning_rate": 0.001, + "loss": 0.0903, + "step": 28500 + }, + { + "epoch": 0.29, + "grad_norm": 0.22874318063259125, + "learning_rate": 0.001, + "loss": 0.0889, + "step": 28600 + }, + { + "epoch": 0.29, + "grad_norm": 0.22677820920944214, + "learning_rate": 0.001, + "loss": 0.0915, + "step": 28700 + }, + { + "epoch": 0.29, + "grad_norm": 0.22208420932292938, + "learning_rate": 0.001, + "loss": 0.0902, + "step": 28800 + }, + { + "epoch": 0.29, + "grad_norm": 0.18172180652618408, + "learning_rate": 0.001, + "loss": 0.091, + "step": 28900 + }, + { + "epoch": 0.29, + "grad_norm": 0.264664888381958, + "learning_rate": 0.001, + "loss": 0.091, + "step": 29000 + }, + { + "epoch": 0.29, + "grad_norm": 0.15961118042469025, + "learning_rate": 0.001, + "loss": 0.0864, + "step": 29100 + }, + { + "epoch": 0.29, + "grad_norm": 0.16828449070453644, + "learning_rate": 0.001, + "loss": 0.0902, + "step": 29200 + }, + { + "epoch": 0.29, + "grad_norm": 0.25299304723739624, + "learning_rate": 0.001, + "loss": 0.0895, + "step": 29300 + }, + { + "epoch": 0.29, + "grad_norm": 0.2019224911928177, + "learning_rate": 0.001, + "loss": 0.0887, + "step": 29400 + }, + { + "epoch": 0.29, + "grad_norm": 0.19100870192050934, + "learning_rate": 0.001, + "loss": 0.0897, + "step": 29500 + }, + { + "epoch": 0.3, + "grad_norm": 0.25321510434150696, + "learning_rate": 0.001, + "loss": 0.092, + "step": 29600 + }, + { + "epoch": 0.3, + "grad_norm": 0.18171149492263794, + "learning_rate": 0.001, + "loss": 0.089, + "step": 29700 + }, + { + "epoch": 0.3, + "grad_norm": 0.19380785524845123, + "learning_rate": 0.001, + "loss": 0.0895, + "step": 29800 + }, + { + "epoch": 0.3, + "grad_norm": 0.18437138199806213, + "learning_rate": 0.001, + "loss": 0.0903, + "step": 29900 + }, + { + "epoch": 0.3, + "grad_norm": 0.1717921495437622, + "learning_rate": 0.001, + "loss": 0.0885, + "step": 30000 + }, + { + "epoch": 0.3, + "grad_norm": 0.23623107373714447, + "learning_rate": 0.001, + "loss": 0.0882, + "step": 30100 + }, + { + "epoch": 0.3, + "grad_norm": 0.17992794513702393, + "learning_rate": 0.001, + "loss": 0.0885, + "step": 30200 + }, + { + "epoch": 0.3, + "grad_norm": 0.19958259165287018, + "learning_rate": 0.001, + "loss": 0.088, + "step": 30300 + }, + { + "epoch": 0.3, + "grad_norm": 0.14418841898441315, + "learning_rate": 0.001, + "loss": 0.0908, + "step": 30400 + }, + { + "epoch": 0.3, + "grad_norm": 0.13934949040412903, + "learning_rate": 0.001, + "loss": 0.0919, + "step": 30500 + }, + { + "epoch": 0.31, + "grad_norm": 0.1410313993692398, + "learning_rate": 0.001, + "loss": 0.0891, + "step": 30600 + }, + { + "epoch": 0.31, + "grad_norm": 0.27084311842918396, + "learning_rate": 0.001, + "loss": 0.0917, + "step": 30700 + }, + { + "epoch": 0.31, + "grad_norm": 0.18704760074615479, + "learning_rate": 0.001, + "loss": 0.0866, + "step": 30800 + }, + { + "epoch": 0.31, + "grad_norm": 0.16178588569164276, + "learning_rate": 0.001, + "loss": 0.088, + "step": 30900 + }, + { + "epoch": 0.31, + "grad_norm": 0.1699521839618683, + "learning_rate": 0.001, + "loss": 0.0891, + "step": 31000 + }, + { + "epoch": 0.31, + "grad_norm": 0.21340341866016388, + "learning_rate": 0.001, + "loss": 0.0871, + "step": 31100 + }, + { + "epoch": 0.31, + "grad_norm": 0.21089456975460052, + "learning_rate": 0.001, + "loss": 0.0898, + "step": 31200 + }, + { + "epoch": 0.31, + "grad_norm": 0.17899860441684723, + "learning_rate": 0.001, + "loss": 0.0874, + "step": 31300 + }, + { + "epoch": 0.31, + "grad_norm": 0.2222578376531601, + "learning_rate": 0.001, + "loss": 0.0875, + "step": 31400 + }, + { + "epoch": 0.31, + "grad_norm": 0.22845357656478882, + "learning_rate": 0.001, + "loss": 0.0895, + "step": 31500 + }, + { + "epoch": 0.32, + "grad_norm": 0.22213339805603027, + "learning_rate": 0.001, + "loss": 0.0877, + "step": 31600 + }, + { + "epoch": 0.32, + "grad_norm": 0.1989658772945404, + "learning_rate": 0.001, + "loss": 0.09, + "step": 31700 + }, + { + "epoch": 0.32, + "grad_norm": 0.28217941522598267, + "learning_rate": 0.001, + "loss": 0.0869, + "step": 31800 + }, + { + "epoch": 0.32, + "grad_norm": 0.1880946159362793, + "learning_rate": 0.001, + "loss": 0.0895, + "step": 31900 + }, + { + "epoch": 0.32, + "grad_norm": 0.2522743046283722, + "learning_rate": 0.001, + "loss": 0.0876, + "step": 32000 + }, + { + "epoch": 0.32, + "grad_norm": 0.15146856009960175, + "learning_rate": 0.001, + "loss": 0.0892, + "step": 32100 + }, + { + "epoch": 0.32, + "grad_norm": 0.20138536393642426, + "learning_rate": 0.001, + "loss": 0.0897, + "step": 32200 + }, + { + "epoch": 0.32, + "grad_norm": 0.19894324243068695, + "learning_rate": 0.001, + "loss": 0.089, + "step": 32300 + }, + { + "epoch": 0.32, + "grad_norm": 0.20011819899082184, + "learning_rate": 0.001, + "loss": 0.0877, + "step": 32400 + }, + { + "epoch": 0.32, + "grad_norm": 0.22739243507385254, + "learning_rate": 0.001, + "loss": 0.0875, + "step": 32500 + }, + { + "epoch": 0.33, + "grad_norm": 0.16710792481899261, + "learning_rate": 0.001, + "loss": 0.0852, + "step": 32600 + }, + { + "epoch": 0.33, + "grad_norm": 0.20454761385917664, + "learning_rate": 0.001, + "loss": 0.0862, + "step": 32700 + }, + { + "epoch": 0.33, + "grad_norm": 0.12356776744127274, + "learning_rate": 0.001, + "loss": 0.0854, + "step": 32800 + }, + { + "epoch": 0.33, + "grad_norm": 0.18977922201156616, + "learning_rate": 0.001, + "loss": 0.0915, + "step": 32900 + }, + { + "epoch": 0.33, + "grad_norm": 0.18791726231575012, + "learning_rate": 0.001, + "loss": 0.0842, + "step": 33000 + }, + { + "epoch": 0.33, + "grad_norm": 0.23529213666915894, + "learning_rate": 0.001, + "loss": 0.086, + "step": 33100 + }, + { + "epoch": 0.33, + "grad_norm": 0.25430527329444885, + "learning_rate": 0.001, + "loss": 0.0833, + "step": 33200 + }, + { + "epoch": 0.33, + "grad_norm": 0.22178427875041962, + "learning_rate": 0.001, + "loss": 0.0874, + "step": 33300 + }, + { + "epoch": 0.33, + "grad_norm": 0.27455243468284607, + "learning_rate": 0.001, + "loss": 0.0845, + "step": 33400 + }, + { + "epoch": 0.33, + "grad_norm": 0.1998920440673828, + "learning_rate": 0.001, + "loss": 0.0869, + "step": 33500 + }, + { + "epoch": 0.34, + "grad_norm": 0.1991311013698578, + "learning_rate": 0.001, + "loss": 0.0873, + "step": 33600 + }, + { + "epoch": 0.34, + "grad_norm": 0.2600191831588745, + "learning_rate": 0.001, + "loss": 0.0869, + "step": 33700 + }, + { + "epoch": 0.34, + "grad_norm": 0.16889439523220062, + "learning_rate": 0.001, + "loss": 0.0841, + "step": 33800 + }, + { + "epoch": 0.34, + "grad_norm": 0.17337612807750702, + "learning_rate": 0.001, + "loss": 0.0847, + "step": 33900 + }, + { + "epoch": 0.34, + "grad_norm": 0.12141957134008408, + "learning_rate": 0.001, + "loss": 0.0873, + "step": 34000 + }, + { + "epoch": 0.34, + "grad_norm": 0.30542996525764465, + "learning_rate": 0.001, + "loss": 0.086, + "step": 34100 + }, + { + "epoch": 0.34, + "grad_norm": 0.256072461605072, + "learning_rate": 0.001, + "loss": 0.0845, + "step": 34200 + }, + { + "epoch": 0.34, + "grad_norm": 0.19596265256404877, + "learning_rate": 0.001, + "loss": 0.0847, + "step": 34300 + }, + { + "epoch": 0.34, + "grad_norm": 0.17981210350990295, + "learning_rate": 0.001, + "loss": 0.0853, + "step": 34400 + }, + { + "epoch": 0.34, + "grad_norm": 0.18695278465747833, + "learning_rate": 0.001, + "loss": 0.0867, + "step": 34500 + }, + { + "epoch": 0.35, + "grad_norm": 0.20189203321933746, + "learning_rate": 0.001, + "loss": 0.0867, + "step": 34600 + }, + { + "epoch": 0.35, + "grad_norm": 0.20751608908176422, + "learning_rate": 0.001, + "loss": 0.0855, + "step": 34700 + }, + { + "epoch": 0.35, + "grad_norm": 0.15412236750125885, + "learning_rate": 0.001, + "loss": 0.0876, + "step": 34800 + }, + { + "epoch": 0.35, + "grad_norm": 0.21551938354969025, + "learning_rate": 0.001, + "loss": 0.0854, + "step": 34900 + }, + { + "epoch": 0.35, + "grad_norm": 0.15149344503879547, + "learning_rate": 0.001, + "loss": 0.0863, + "step": 35000 + }, + { + "epoch": 0.35, + "grad_norm": 0.21960322558879852, + "learning_rate": 0.001, + "loss": 0.0913, + "step": 35100 + }, + { + "epoch": 0.35, + "grad_norm": 0.317090779542923, + "learning_rate": 0.001, + "loss": 0.0832, + "step": 35200 + }, + { + "epoch": 0.35, + "grad_norm": 0.20051142573356628, + "learning_rate": 0.001, + "loss": 0.0856, + "step": 35300 + }, + { + "epoch": 0.35, + "grad_norm": 0.1955852061510086, + "learning_rate": 0.001, + "loss": 0.0867, + "step": 35400 + }, + { + "epoch": 0.35, + "grad_norm": 0.13714253902435303, + "learning_rate": 0.001, + "loss": 0.0864, + "step": 35500 + }, + { + "epoch": 0.36, + "grad_norm": 0.18536311388015747, + "learning_rate": 0.001, + "loss": 0.0868, + "step": 35600 + }, + { + "epoch": 0.36, + "grad_norm": 0.1795514076948166, + "learning_rate": 0.001, + "loss": 0.0829, + "step": 35700 + }, + { + "epoch": 0.36, + "grad_norm": 0.1465149074792862, + "learning_rate": 0.001, + "loss": 0.0851, + "step": 35800 + }, + { + "epoch": 0.36, + "grad_norm": 0.17687107622623444, + "learning_rate": 0.001, + "loss": 0.0861, + "step": 35900 + }, + { + "epoch": 0.36, + "grad_norm": 0.1795363575220108, + "learning_rate": 0.001, + "loss": 0.0822, + "step": 36000 + }, + { + "epoch": 0.36, + "grad_norm": 0.1741327941417694, + "learning_rate": 0.001, + "loss": 0.0847, + "step": 36100 + }, + { + "epoch": 0.36, + "grad_norm": 0.2547447681427002, + "learning_rate": 0.001, + "loss": 0.0862, + "step": 36200 + }, + { + "epoch": 0.36, + "grad_norm": 0.16002462804317474, + "learning_rate": 0.001, + "loss": 0.0856, + "step": 36300 + }, + { + "epoch": 0.36, + "grad_norm": 0.14787407219409943, + "learning_rate": 0.001, + "loss": 0.0844, + "step": 36400 + }, + { + "epoch": 0.36, + "grad_norm": 0.23449848592281342, + "learning_rate": 0.001, + "loss": 0.0823, + "step": 36500 + }, + { + "epoch": 0.37, + "grad_norm": 0.18626731634140015, + "learning_rate": 0.001, + "loss": 0.0804, + "step": 36600 + }, + { + "epoch": 0.37, + "grad_norm": 0.1434779316186905, + "learning_rate": 0.001, + "loss": 0.0844, + "step": 36700 + }, + { + "epoch": 0.37, + "grad_norm": 0.1594706028699875, + "learning_rate": 0.001, + "loss": 0.0869, + "step": 36800 + }, + { + "epoch": 0.37, + "grad_norm": 0.18195496499538422, + "learning_rate": 0.001, + "loss": 0.0846, + "step": 36900 + }, + { + "epoch": 0.37, + "grad_norm": 0.18613013625144958, + "learning_rate": 0.001, + "loss": 0.0872, + "step": 37000 + }, + { + "epoch": 0.37, + "grad_norm": 0.16158261895179749, + "learning_rate": 0.001, + "loss": 0.0846, + "step": 37100 + }, + { + "epoch": 0.37, + "grad_norm": 0.17811179161071777, + "learning_rate": 0.001, + "loss": 0.0832, + "step": 37200 + }, + { + "epoch": 0.37, + "grad_norm": 0.24112731218338013, + "learning_rate": 0.001, + "loss": 0.0804, + "step": 37300 + }, + { + "epoch": 0.37, + "grad_norm": 0.1778961569070816, + "learning_rate": 0.001, + "loss": 0.0837, + "step": 37400 + }, + { + "epoch": 0.37, + "grad_norm": 0.18162128329277039, + "learning_rate": 0.001, + "loss": 0.0867, + "step": 37500 + }, + { + "epoch": 0.38, + "grad_norm": 0.15079495310783386, + "learning_rate": 0.001, + "loss": 0.0829, + "step": 37600 + }, + { + "epoch": 0.38, + "grad_norm": 0.26986435055732727, + "learning_rate": 0.001, + "loss": 0.0843, + "step": 37700 + }, + { + "epoch": 0.38, + "grad_norm": 0.2643984854221344, + "learning_rate": 0.001, + "loss": 0.0829, + "step": 37800 + }, + { + "epoch": 0.38, + "grad_norm": 0.281751424074173, + "learning_rate": 0.001, + "loss": 0.0821, + "step": 37900 + }, + { + "epoch": 0.38, + "grad_norm": 0.23095449805259705, + "learning_rate": 0.001, + "loss": 0.0836, + "step": 38000 + }, + { + "epoch": 0.38, + "grad_norm": 0.18625666201114655, + "learning_rate": 0.001, + "loss": 0.0831, + "step": 38100 + }, + { + "epoch": 0.38, + "grad_norm": 0.13689708709716797, + "learning_rate": 0.001, + "loss": 0.0839, + "step": 38200 + }, + { + "epoch": 0.38, + "grad_norm": 0.14063656330108643, + "learning_rate": 0.001, + "loss": 0.0817, + "step": 38300 + }, + { + "epoch": 0.38, + "grad_norm": 0.1880202442407608, + "learning_rate": 0.001, + "loss": 0.082, + "step": 38400 + }, + { + "epoch": 0.38, + "grad_norm": 0.15921075642108917, + "learning_rate": 0.001, + "loss": 0.0789, + "step": 38500 + }, + { + "epoch": 0.39, + "grad_norm": 0.1744866818189621, + "learning_rate": 0.001, + "loss": 0.0818, + "step": 38600 + }, + { + "epoch": 0.39, + "grad_norm": 0.26724693179130554, + "learning_rate": 0.001, + "loss": 0.0847, + "step": 38700 + }, + { + "epoch": 0.39, + "grad_norm": 0.14382457733154297, + "learning_rate": 0.001, + "loss": 0.0829, + "step": 38800 + }, + { + "epoch": 0.39, + "grad_norm": 0.14012865722179413, + "learning_rate": 0.001, + "loss": 0.082, + "step": 38900 + }, + { + "epoch": 0.39, + "grad_norm": 0.24175578355789185, + "learning_rate": 0.001, + "loss": 0.0835, + "step": 39000 + }, + { + "epoch": 0.39, + "grad_norm": 0.3397182822227478, + "learning_rate": 0.001, + "loss": 0.081, + "step": 39100 + }, + { + "epoch": 0.39, + "grad_norm": 0.1553467959165573, + "learning_rate": 0.001, + "loss": 0.0829, + "step": 39200 + }, + { + "epoch": 0.39, + "grad_norm": 0.20726840198040009, + "learning_rate": 0.001, + "loss": 0.083, + "step": 39300 + }, + { + "epoch": 0.39, + "grad_norm": 0.21219220757484436, + "learning_rate": 0.001, + "loss": 0.084, + "step": 39400 + }, + { + "epoch": 0.39, + "grad_norm": 0.19203193485736847, + "learning_rate": 0.001, + "loss": 0.0819, + "step": 39500 + }, + { + "epoch": 0.4, + "grad_norm": 0.22557440400123596, + "learning_rate": 0.001, + "loss": 0.0803, + "step": 39600 + }, + { + "epoch": 0.4, + "grad_norm": 0.23452799022197723, + "learning_rate": 0.001, + "loss": 0.0806, + "step": 39700 + }, + { + "epoch": 0.4, + "grad_norm": 0.28543928265571594, + "learning_rate": 0.001, + "loss": 0.0827, + "step": 39800 + }, + { + "epoch": 0.4, + "grad_norm": 0.19713571667671204, + "learning_rate": 0.001, + "loss": 0.08, + "step": 39900 + }, + { + "epoch": 0.4, + "grad_norm": 0.18496285378932953, + "learning_rate": 0.001, + "loss": 0.0841, + "step": 40000 + }, + { + "epoch": 0.4, + "grad_norm": 0.1363649070262909, + "learning_rate": 0.001, + "loss": 0.0813, + "step": 40100 + }, + { + "epoch": 0.4, + "grad_norm": 0.1736011952161789, + "learning_rate": 0.001, + "loss": 0.0796, + "step": 40200 + }, + { + "epoch": 0.4, + "grad_norm": 0.21385334432125092, + "learning_rate": 0.001, + "loss": 0.0814, + "step": 40300 + }, + { + "epoch": 0.4, + "grad_norm": 0.2105669230222702, + "learning_rate": 0.001, + "loss": 0.0816, + "step": 40400 + }, + { + "epoch": 0.4, + "grad_norm": 0.2278176248073578, + "learning_rate": 0.001, + "loss": 0.0825, + "step": 40500 + }, + { + "epoch": 0.41, + "grad_norm": 0.17637114226818085, + "learning_rate": 0.001, + "loss": 0.0812, + "step": 40600 + }, + { + "epoch": 0.41, + "grad_norm": 0.20035295188426971, + "learning_rate": 0.001, + "loss": 0.0853, + "step": 40700 + }, + { + "epoch": 0.41, + "grad_norm": 0.25408777594566345, + "learning_rate": 0.001, + "loss": 0.0811, + "step": 40800 + }, + { + "epoch": 0.41, + "grad_norm": 0.2177010476589203, + "learning_rate": 0.001, + "loss": 0.0796, + "step": 40900 + }, + { + "epoch": 0.41, + "grad_norm": 0.1639321744441986, + "learning_rate": 0.001, + "loss": 0.0824, + "step": 41000 + }, + { + "epoch": 0.41, + "grad_norm": 0.15798155963420868, + "learning_rate": 0.001, + "loss": 0.0834, + "step": 41100 + }, + { + "epoch": 0.41, + "grad_norm": 0.14857494831085205, + "learning_rate": 0.001, + "loss": 0.0825, + "step": 41200 + }, + { + "epoch": 0.41, + "grad_norm": 0.15640319883823395, + "learning_rate": 0.001, + "loss": 0.0814, + "step": 41300 + }, + { + "epoch": 0.41, + "grad_norm": 0.1530522108078003, + "learning_rate": 0.001, + "loss": 0.0825, + "step": 41400 + }, + { + "epoch": 0.41, + "grad_norm": 0.2990354001522064, + "learning_rate": 0.001, + "loss": 0.0785, + "step": 41500 + }, + { + "epoch": 0.42, + "grad_norm": 0.19239626824855804, + "learning_rate": 0.001, + "loss": 0.0809, + "step": 41600 + }, + { + "epoch": 0.42, + "grad_norm": 0.13975249230861664, + "learning_rate": 0.001, + "loss": 0.0825, + "step": 41700 + }, + { + "epoch": 0.42, + "grad_norm": 0.22527189552783966, + "learning_rate": 0.001, + "loss": 0.0819, + "step": 41800 + }, + { + "epoch": 0.42, + "grad_norm": 0.3547128438949585, + "learning_rate": 0.001, + "loss": 0.1013, + "step": 41900 + }, + { + "epoch": 0.42, + "grad_norm": 0.22032135725021362, + "learning_rate": 0.001, + "loss": 0.0806, + "step": 42000 + }, + { + "epoch": 0.42, + "grad_norm": 0.12712807953357697, + "learning_rate": 0.001, + "loss": 0.0791, + "step": 42100 + }, + { + "epoch": 0.42, + "grad_norm": 0.29608944058418274, + "learning_rate": 0.001, + "loss": 0.0783, + "step": 42200 + }, + { + "epoch": 0.42, + "grad_norm": 0.23063918948173523, + "learning_rate": 0.001, + "loss": 0.0828, + "step": 42300 + }, + { + "epoch": 0.42, + "grad_norm": 0.19996796548366547, + "learning_rate": 0.001, + "loss": 0.0813, + "step": 42400 + }, + { + "epoch": 0.42, + "grad_norm": 0.19479811191558838, + "learning_rate": 0.001, + "loss": 0.0811, + "step": 42500 + }, + { + "epoch": 0.43, + "grad_norm": 0.1822797805070877, + "learning_rate": 0.001, + "loss": 0.0796, + "step": 42600 + }, + { + "epoch": 0.43, + "grad_norm": 0.36260533332824707, + "learning_rate": 0.001, + "loss": 0.0797, + "step": 42700 + }, + { + "epoch": 0.43, + "grad_norm": 0.14315147697925568, + "learning_rate": 0.001, + "loss": 0.0869, + "step": 42800 + }, + { + "epoch": 0.43, + "grad_norm": 0.20261742174625397, + "learning_rate": 0.001, + "loss": 0.1856, + "step": 42900 + }, + { + "epoch": 0.43, + "grad_norm": 0.18873733282089233, + "learning_rate": 0.001, + "loss": 0.0775, + "step": 43000 + }, + { + "epoch": 0.43, + "grad_norm": 0.2189916968345642, + "learning_rate": 0.001, + "loss": 0.0796, + "step": 43100 + }, + { + "epoch": 0.43, + "grad_norm": 0.1823868304491043, + "learning_rate": 0.001, + "loss": 0.0822, + "step": 43200 + }, + { + "epoch": 0.43, + "grad_norm": 0.2595207691192627, + "learning_rate": 0.001, + "loss": 0.0776, + "step": 43300 + }, + { + "epoch": 0.43, + "grad_norm": 0.1713092178106308, + "learning_rate": 0.001, + "loss": 0.0811, + "step": 43400 + }, + { + "epoch": 0.43, + "grad_norm": 0.24840323626995087, + "learning_rate": 0.001, + "loss": 0.104, + "step": 43500 + }, + { + "epoch": 0.44, + "grad_norm": 0.23451556265354156, + "learning_rate": 0.001, + "loss": 0.077, + "step": 43600 + }, + { + "epoch": 0.44, + "grad_norm": 0.2142404466867447, + "learning_rate": 0.001, + "loss": 0.0789, + "step": 43700 + }, + { + "epoch": 0.44, + "grad_norm": 0.22932325303554535, + "learning_rate": 0.001, + "loss": 0.0778, + "step": 43800 + }, + { + "epoch": 0.44, + "grad_norm": 0.2027159184217453, + "learning_rate": 0.001, + "loss": 0.0794, + "step": 43900 + }, + { + "epoch": 0.44, + "grad_norm": 0.22258317470550537, + "learning_rate": 0.001, + "loss": 0.0787, + "step": 44000 + }, + { + "epoch": 0.44, + "grad_norm": 0.2979215681552887, + "learning_rate": 0.001, + "loss": 0.0767, + "step": 44100 + }, + { + "epoch": 0.44, + "grad_norm": 0.2110917568206787, + "learning_rate": 0.001, + "loss": 0.0782, + "step": 44200 + }, + { + "epoch": 0.44, + "grad_norm": 0.24181802570819855, + "learning_rate": 0.001, + "loss": 0.0804, + "step": 44300 + }, + { + "epoch": 0.44, + "grad_norm": 0.1810845136642456, + "learning_rate": 0.001, + "loss": 0.0786, + "step": 44400 + }, + { + "epoch": 0.44, + "grad_norm": 0.23404444754123688, + "learning_rate": 0.001, + "loss": 0.0785, + "step": 44500 + }, + { + "epoch": 0.45, + "grad_norm": 0.2591089904308319, + "learning_rate": 0.001, + "loss": 0.0765, + "step": 44600 + }, + { + "epoch": 0.45, + "grad_norm": 0.22720029950141907, + "learning_rate": 0.001, + "loss": 0.0798, + "step": 44700 + }, + { + "epoch": 0.45, + "grad_norm": 0.22449086606502533, + "learning_rate": 0.001, + "loss": 0.0766, + "step": 44800 + }, + { + "epoch": 0.45, + "grad_norm": 0.2302643209695816, + "learning_rate": 0.001, + "loss": 0.0798, + "step": 44900 + }, + { + "epoch": 0.45, + "grad_norm": 0.2040921300649643, + "learning_rate": 0.001, + "loss": 0.0841, + "step": 45000 + }, + { + "epoch": 0.45, + "grad_norm": 0.21232621371746063, + "learning_rate": 0.001, + "loss": 0.0789, + "step": 45100 + }, + { + "epoch": 0.45, + "grad_norm": 0.20054876804351807, + "learning_rate": 0.001, + "loss": 0.0779, + "step": 45200 + }, + { + "epoch": 0.45, + "grad_norm": 0.24335692822933197, + "learning_rate": 0.001, + "loss": 0.0784, + "step": 45300 + }, + { + "epoch": 0.45, + "grad_norm": 0.22172445058822632, + "learning_rate": 0.001, + "loss": 0.0797, + "step": 45400 + }, + { + "epoch": 0.45, + "grad_norm": 0.20524169504642487, + "learning_rate": 0.001, + "loss": 0.0803, + "step": 45500 + }, + { + "epoch": 0.46, + "grad_norm": 0.17150288820266724, + "learning_rate": 0.001, + "loss": 0.0791, + "step": 45600 + }, + { + "epoch": 0.46, + "grad_norm": 0.38285690546035767, + "learning_rate": 0.001, + "loss": 0.079, + "step": 45700 + }, + { + "epoch": 0.46, + "grad_norm": 0.16937342286109924, + "learning_rate": 0.001, + "loss": 0.0791, + "step": 45800 + }, + { + "epoch": 0.46, + "grad_norm": 0.19271647930145264, + "learning_rate": 0.001, + "loss": 0.079, + "step": 45900 + }, + { + "epoch": 0.46, + "grad_norm": 0.20048774778842926, + "learning_rate": 0.001, + "loss": 0.0797, + "step": 46000 + }, + { + "epoch": 0.46, + "grad_norm": 0.2141706347465515, + "learning_rate": 0.001, + "loss": 0.0798, + "step": 46100 + }, + { + "epoch": 0.46, + "grad_norm": 0.20665834844112396, + "learning_rate": 0.001, + "loss": 0.0778, + "step": 46200 + }, + { + "epoch": 0.46, + "grad_norm": 0.18385255336761475, + "learning_rate": 0.001, + "loss": 0.0779, + "step": 46300 + }, + { + "epoch": 0.46, + "grad_norm": 0.22467826306819916, + "learning_rate": 0.001, + "loss": 0.0732, + "step": 46400 + }, + { + "epoch": 0.46, + "grad_norm": 0.18363313376903534, + "learning_rate": 0.001, + "loss": 0.0796, + "step": 46500 + }, + { + "epoch": 0.47, + "grad_norm": 0.2288578897714615, + "learning_rate": 0.001, + "loss": 0.0763, + "step": 46600 + }, + { + "epoch": 0.47, + "grad_norm": 0.2535518407821655, + "learning_rate": 0.001, + "loss": 0.0791, + "step": 46700 + }, + { + "epoch": 0.47, + "grad_norm": 0.20715934038162231, + "learning_rate": 0.001, + "loss": 0.0777, + "step": 46800 + }, + { + "epoch": 0.47, + "grad_norm": 0.12203960865736008, + "learning_rate": 0.001, + "loss": 0.0805, + "step": 46900 + }, + { + "epoch": 0.47, + "grad_norm": 0.138369619846344, + "learning_rate": 0.001, + "loss": 0.0768, + "step": 47000 + }, + { + "epoch": 0.47, + "grad_norm": 0.2319127321243286, + "learning_rate": 0.001, + "loss": 0.0784, + "step": 47100 + }, + { + "epoch": 0.47, + "grad_norm": 0.2058788686990738, + "learning_rate": 0.001, + "loss": 0.0783, + "step": 47200 + }, + { + "epoch": 0.47, + "grad_norm": 0.21334126591682434, + "learning_rate": 0.001, + "loss": 0.0763, + "step": 47300 + }, + { + "epoch": 0.47, + "grad_norm": 0.23397529125213623, + "learning_rate": 0.001, + "loss": 0.081, + "step": 47400 + }, + { + "epoch": 0.47, + "grad_norm": 0.24460141360759735, + "learning_rate": 0.001, + "loss": 0.0752, + "step": 47500 + }, + { + "epoch": 0.48, + "grad_norm": 0.22441798448562622, + "learning_rate": 0.001, + "loss": 0.0779, + "step": 47600 + }, + { + "epoch": 0.48, + "grad_norm": 0.20988881587982178, + "learning_rate": 0.001, + "loss": 0.08, + "step": 47700 + }, + { + "epoch": 0.48, + "grad_norm": 0.17863024771213531, + "learning_rate": 0.001, + "loss": 0.0787, + "step": 47800 + }, + { + "epoch": 0.48, + "grad_norm": 0.17980898916721344, + "learning_rate": 0.001, + "loss": 0.0802, + "step": 47900 + }, + { + "epoch": 0.48, + "grad_norm": 0.2614147961139679, + "learning_rate": 0.001, + "loss": 0.0787, + "step": 48000 + }, + { + "epoch": 0.48, + "grad_norm": 0.16281504929065704, + "learning_rate": 0.001, + "loss": 0.0779, + "step": 48100 + }, + { + "epoch": 0.48, + "grad_norm": 0.3099921941757202, + "learning_rate": 0.001, + "loss": 0.0747, + "step": 48200 + }, + { + "epoch": 0.48, + "grad_norm": 0.2542015016078949, + "learning_rate": 0.001, + "loss": 0.0831, + "step": 48300 + }, + { + "epoch": 0.48, + "grad_norm": 0.17419801652431488, + "learning_rate": 0.001, + "loss": 0.0787, + "step": 48400 + }, + { + "epoch": 0.48, + "grad_norm": 0.2089216262102127, + "learning_rate": 0.001, + "loss": 0.0781, + "step": 48500 + }, + { + "epoch": 0.49, + "grad_norm": 0.26476818323135376, + "learning_rate": 0.001, + "loss": 0.0792, + "step": 48600 + }, + { + "epoch": 0.49, + "grad_norm": 0.18907053768634796, + "learning_rate": 0.001, + "loss": 0.078, + "step": 48700 + }, + { + "epoch": 0.49, + "grad_norm": 0.2528514564037323, + "learning_rate": 0.001, + "loss": 0.0791, + "step": 48800 + }, + { + "epoch": 0.49, + "grad_norm": 0.2794158458709717, + "learning_rate": 0.001, + "loss": 0.0799, + "step": 48900 + }, + { + "epoch": 0.49, + "grad_norm": 0.24547474086284637, + "learning_rate": 0.001, + "loss": 0.0765, + "step": 49000 + }, + { + "epoch": 0.49, + "grad_norm": 0.17239224910736084, + "learning_rate": 0.001, + "loss": 0.0807, + "step": 49100 + }, + { + "epoch": 0.49, + "grad_norm": 0.22998745739459991, + "learning_rate": 0.001, + "loss": 0.079, + "step": 49200 + }, + { + "epoch": 0.49, + "grad_norm": 0.2727990746498108, + "learning_rate": 0.001, + "loss": 0.078, + "step": 49300 + }, + { + "epoch": 0.49, + "grad_norm": 0.2488749623298645, + "learning_rate": 0.001, + "loss": 0.0757, + "step": 49400 + }, + { + "epoch": 0.49, + "grad_norm": 0.20260153710842133, + "learning_rate": 0.001, + "loss": 0.0787, + "step": 49500 + }, + { + "epoch": 0.5, + "grad_norm": 0.30832308530807495, + "learning_rate": 0.001, + "loss": 0.0789, + "step": 49600 + }, + { + "epoch": 0.5, + "grad_norm": 0.17934545874595642, + "learning_rate": 0.001, + "loss": 0.0768, + "step": 49700 + }, + { + "epoch": 0.5, + "grad_norm": 0.1972292810678482, + "learning_rate": 0.001, + "loss": 0.0786, + "step": 49800 + }, + { + "epoch": 0.5, + "grad_norm": 0.1899816393852234, + "learning_rate": 0.001, + "loss": 0.0782, + "step": 49900 + }, + { + "epoch": 0.5, + "grad_norm": 0.17765800654888153, + "learning_rate": 0.001, + "loss": 0.0784, + "step": 50000 + }, + { + "epoch": 0.5, + "grad_norm": 0.3285583555698395, + "learning_rate": 0.001, + "loss": 0.0793, + "step": 50100 + }, + { + "epoch": 0.5, + "grad_norm": 0.2769279181957245, + "learning_rate": 0.001, + "loss": 0.0818, + "step": 50200 + }, + { + "epoch": 0.5, + "grad_norm": 0.1661899834871292, + "learning_rate": 0.001, + "loss": 0.1088, + "step": 50300 + }, + { + "epoch": 0.5, + "grad_norm": 0.32694903016090393, + "learning_rate": 0.001, + "loss": 0.0799, + "step": 50400 + }, + { + "epoch": 0.5, + "grad_norm": 0.1976955235004425, + "learning_rate": 0.001, + "loss": 0.0768, + "step": 50500 + }, + { + "epoch": 0.51, + "grad_norm": 0.2623777687549591, + "learning_rate": 0.001, + "loss": 0.0764, + "step": 50600 + }, + { + "epoch": 0.51, + "grad_norm": 0.19917914271354675, + "learning_rate": 0.001, + "loss": 0.079, + "step": 50700 + }, + { + "epoch": 0.51, + "grad_norm": 0.22838640213012695, + "learning_rate": 0.001, + "loss": 0.076, + "step": 50800 + }, + { + "epoch": 0.51, + "grad_norm": 0.1831175684928894, + "learning_rate": 0.001, + "loss": 0.0744, + "step": 50900 + }, + { + "epoch": 0.51, + "grad_norm": 0.1774362176656723, + "learning_rate": 0.001, + "loss": 0.076, + "step": 51000 + }, + { + "epoch": 0.51, + "grad_norm": 0.24986374378204346, + "learning_rate": 0.001, + "loss": 0.0754, + "step": 51100 + }, + { + "epoch": 0.51, + "grad_norm": 0.15164266526699066, + "learning_rate": 0.001, + "loss": 0.0757, + "step": 51200 + }, + { + "epoch": 0.51, + "grad_norm": 0.19118934869766235, + "learning_rate": 0.001, + "loss": 0.0787, + "step": 51300 + }, + { + "epoch": 0.51, + "grad_norm": 0.1625840663909912, + "learning_rate": 0.001, + "loss": 0.0778, + "step": 51400 + }, + { + "epoch": 0.51, + "grad_norm": 0.14519533514976501, + "learning_rate": 0.001, + "loss": 0.077, + "step": 51500 + }, + { + "epoch": 0.52, + "grad_norm": 0.16799670457839966, + "learning_rate": 0.001, + "loss": 0.0764, + "step": 51600 + }, + { + "epoch": 0.52, + "grad_norm": 0.15635591745376587, + "learning_rate": 0.001, + "loss": 0.0738, + "step": 51700 + }, + { + "epoch": 0.52, + "grad_norm": 0.25875189900398254, + "learning_rate": 0.001, + "loss": 0.0757, + "step": 51800 + }, + { + "epoch": 0.52, + "grad_norm": 0.2601448595523834, + "learning_rate": 0.001, + "loss": 0.0721, + "step": 51900 + }, + { + "epoch": 0.52, + "grad_norm": 0.20097233355045319, + "learning_rate": 0.001, + "loss": 0.0764, + "step": 52000 + }, + { + "epoch": 0.52, + "grad_norm": 0.17383421957492828, + "learning_rate": 0.001, + "loss": 0.0768, + "step": 52100 + }, + { + "epoch": 0.52, + "grad_norm": 0.152663916349411, + "learning_rate": 0.001, + "loss": 0.0747, + "step": 52200 + }, + { + "epoch": 0.52, + "grad_norm": 0.1773347705602646, + "learning_rate": 0.001, + "loss": 0.0743, + "step": 52300 + }, + { + "epoch": 0.52, + "grad_norm": 0.15975210070610046, + "learning_rate": 0.001, + "loss": 0.0769, + "step": 52400 + }, + { + "epoch": 0.52, + "grad_norm": 0.27663958072662354, + "learning_rate": 0.001, + "loss": 0.0747, + "step": 52500 + }, + { + "epoch": 0.53, + "grad_norm": 0.20124509930610657, + "learning_rate": 0.001, + "loss": 0.0755, + "step": 52600 + }, + { + "epoch": 0.53, + "grad_norm": 0.19016942381858826, + "learning_rate": 0.001, + "loss": 0.0709, + "step": 52700 + }, + { + "epoch": 0.53, + "grad_norm": 0.34517988562583923, + "learning_rate": 0.001, + "loss": 0.0751, + "step": 52800 + }, + { + "epoch": 0.53, + "grad_norm": 0.27312055230140686, + "learning_rate": 0.001, + "loss": 0.0761, + "step": 52900 + }, + { + "epoch": 0.53, + "grad_norm": 0.2835043668746948, + "learning_rate": 0.001, + "loss": 0.0731, + "step": 53000 + }, + { + "epoch": 0.53, + "grad_norm": 0.1630600243806839, + "learning_rate": 0.001, + "loss": 0.0741, + "step": 53100 + }, + { + "epoch": 0.53, + "grad_norm": 0.2430613487958908, + "learning_rate": 0.001, + "loss": 0.0767, + "step": 53200 + }, + { + "epoch": 0.53, + "grad_norm": 0.19533057510852814, + "learning_rate": 0.001, + "loss": 0.077, + "step": 53300 + }, + { + "epoch": 0.53, + "grad_norm": 0.21139401197433472, + "learning_rate": 0.001, + "loss": 0.0711, + "step": 53400 + }, + { + "epoch": 0.53, + "grad_norm": 0.18416912853717804, + "learning_rate": 0.001, + "loss": 0.0729, + "step": 53500 + }, + { + "epoch": 0.54, + "grad_norm": 0.24703727662563324, + "learning_rate": 0.001, + "loss": 0.071, + "step": 53600 + }, + { + "epoch": 0.54, + "grad_norm": 0.14476247131824493, + "learning_rate": 0.001, + "loss": 0.0754, + "step": 53700 + }, + { + "epoch": 0.54, + "grad_norm": 0.210220068693161, + "learning_rate": 0.001, + "loss": 0.0738, + "step": 53800 + }, + { + "epoch": 0.54, + "grad_norm": 0.16544660925865173, + "learning_rate": 0.001, + "loss": 0.072, + "step": 53900 + }, + { + "epoch": 0.54, + "grad_norm": 0.17049700021743774, + "learning_rate": 0.001, + "loss": 0.0728, + "step": 54000 + }, + { + "epoch": 0.54, + "grad_norm": 0.18656505644321442, + "learning_rate": 0.001, + "loss": 0.0739, + "step": 54100 + }, + { + "epoch": 0.54, + "grad_norm": 0.19484791159629822, + "learning_rate": 0.001, + "loss": 0.0748, + "step": 54200 + }, + { + "epoch": 0.54, + "grad_norm": 0.1982715129852295, + "learning_rate": 0.001, + "loss": 0.0729, + "step": 54300 + }, + { + "epoch": 0.54, + "grad_norm": 0.2108699083328247, + "learning_rate": 0.001, + "loss": 0.0735, + "step": 54400 + }, + { + "epoch": 0.54, + "grad_norm": 0.23962444067001343, + "learning_rate": 0.001, + "loss": 0.0703, + "step": 54500 + }, + { + "epoch": 0.55, + "grad_norm": 0.29319801926612854, + "learning_rate": 0.001, + "loss": 0.0735, + "step": 54600 + }, + { + "epoch": 0.55, + "grad_norm": 0.1804085075855255, + "learning_rate": 0.001, + "loss": 0.0719, + "step": 54700 + }, + { + "epoch": 0.55, + "grad_norm": 0.2394474297761917, + "learning_rate": 0.001, + "loss": 0.0721, + "step": 54800 + }, + { + "epoch": 0.55, + "grad_norm": 0.20954197645187378, + "learning_rate": 0.001, + "loss": 0.0745, + "step": 54900 + }, + { + "epoch": 0.55, + "grad_norm": 0.17135080695152283, + "learning_rate": 0.001, + "loss": 0.0728, + "step": 55000 + }, + { + "epoch": 0.55, + "grad_norm": 0.3152260482311249, + "learning_rate": 0.001, + "loss": 0.0735, + "step": 55100 + }, + { + "epoch": 0.55, + "grad_norm": 0.22659769654273987, + "learning_rate": 0.001, + "loss": 0.0752, + "step": 55200 + }, + { + "epoch": 0.55, + "grad_norm": 0.2605753540992737, + "learning_rate": 0.001, + "loss": 0.073, + "step": 55300 + }, + { + "epoch": 0.55, + "grad_norm": 0.2309567779302597, + "learning_rate": 0.001, + "loss": 0.0744, + "step": 55400 + }, + { + "epoch": 0.55, + "grad_norm": 0.19917166233062744, + "learning_rate": 0.001, + "loss": 0.073, + "step": 55500 + }, + { + "epoch": 0.56, + "grad_norm": 0.2609159052371979, + "learning_rate": 0.001, + "loss": 0.0705, + "step": 55600 + }, + { + "epoch": 0.56, + "grad_norm": 0.26976123452186584, + "learning_rate": 0.001, + "loss": 0.0731, + "step": 55700 + }, + { + "epoch": 0.56, + "grad_norm": 0.25275784730911255, + "learning_rate": 0.001, + "loss": 0.0808, + "step": 55800 + }, + { + "epoch": 0.56, + "grad_norm": 0.2392340749502182, + "learning_rate": 0.001, + "loss": 0.0763, + "step": 55900 + }, + { + "epoch": 0.56, + "grad_norm": 0.27718254923820496, + "learning_rate": 0.001, + "loss": 0.0743, + "step": 56000 + }, + { + "epoch": 0.56, + "grad_norm": 0.19996067881584167, + "learning_rate": 0.001, + "loss": 0.0807, + "step": 56100 + }, + { + "epoch": 0.56, + "grad_norm": 0.16322393715381622, + "learning_rate": 0.001, + "loss": 0.0753, + "step": 56200 + }, + { + "epoch": 0.56, + "grad_norm": 0.25598809123039246, + "learning_rate": 0.001, + "loss": 0.0773, + "step": 56300 + }, + { + "epoch": 0.56, + "grad_norm": 0.15482768416404724, + "learning_rate": 0.001, + "loss": 0.0729, + "step": 56400 + }, + { + "epoch": 0.56, + "grad_norm": 0.4033351242542267, + "learning_rate": 0.001, + "loss": 0.0773, + "step": 56500 + }, + { + "epoch": 0.57, + "grad_norm": 0.2869590222835541, + "learning_rate": 0.001, + "loss": 0.0732, + "step": 56600 + }, + { + "epoch": 0.57, + "grad_norm": 0.19079795479774475, + "learning_rate": 0.001, + "loss": 0.0712, + "step": 56700 + }, + { + "epoch": 0.57, + "grad_norm": 0.21604031324386597, + "learning_rate": 0.001, + "loss": 0.0714, + "step": 56800 + }, + { + "epoch": 0.57, + "grad_norm": 0.23917321860790253, + "learning_rate": 0.001, + "loss": 0.0743, + "step": 56900 + }, + { + "epoch": 0.57, + "grad_norm": 0.16785088181495667, + "learning_rate": 0.001, + "loss": 0.0722, + "step": 57000 + }, + { + "epoch": 0.57, + "grad_norm": 0.22009502351284027, + "learning_rate": 0.001, + "loss": 0.0738, + "step": 57100 + }, + { + "epoch": 0.57, + "grad_norm": 0.23401811718940735, + "learning_rate": 0.001, + "loss": 0.0759, + "step": 57200 + }, + { + "epoch": 0.57, + "grad_norm": 0.19278208911418915, + "learning_rate": 0.001, + "loss": 0.0738, + "step": 57300 + }, + { + "epoch": 0.57, + "grad_norm": 0.22170820832252502, + "learning_rate": 0.001, + "loss": 0.07, + "step": 57400 + }, + { + "epoch": 0.57, + "grad_norm": 0.2148713767528534, + "learning_rate": 0.001, + "loss": 0.0716, + "step": 57500 + }, + { + "epoch": 0.58, + "grad_norm": 0.2093653529882431, + "learning_rate": 0.001, + "loss": 0.0722, + "step": 57600 + }, + { + "epoch": 0.58, + "grad_norm": 0.2912674844264984, + "learning_rate": 0.001, + "loss": 0.0738, + "step": 57700 + }, + { + "epoch": 0.58, + "grad_norm": 0.3146283030509949, + "learning_rate": 0.001, + "loss": 0.0735, + "step": 57800 + }, + { + "epoch": 0.58, + "grad_norm": 0.2355007380247116, + "learning_rate": 0.001, + "loss": 0.0719, + "step": 57900 + }, + { + "epoch": 0.58, + "grad_norm": 0.19035007059574127, + "learning_rate": 0.001, + "loss": 0.0699, + "step": 58000 + }, + { + "epoch": 0.58, + "grad_norm": 0.13338258862495422, + "learning_rate": 0.001, + "loss": 0.0727, + "step": 58100 + }, + { + "epoch": 0.58, + "grad_norm": 0.22755542397499084, + "learning_rate": 0.001, + "loss": 0.072, + "step": 58200 + }, + { + "epoch": 0.58, + "grad_norm": 0.23752057552337646, + "learning_rate": 0.001, + "loss": 0.0703, + "step": 58300 + }, + { + "epoch": 0.58, + "grad_norm": 0.20008322596549988, + "learning_rate": 0.001, + "loss": 0.0721, + "step": 58400 + }, + { + "epoch": 0.58, + "grad_norm": 0.1769803911447525, + "learning_rate": 0.001, + "loss": 0.0724, + "step": 58500 + }, + { + "epoch": 0.59, + "grad_norm": 0.19137178361415863, + "learning_rate": 0.001, + "loss": 0.0735, + "step": 58600 + }, + { + "epoch": 0.59, + "grad_norm": 0.22157849371433258, + "learning_rate": 0.001, + "loss": 0.0735, + "step": 58700 + }, + { + "epoch": 0.59, + "grad_norm": 0.2098543494939804, + "learning_rate": 0.001, + "loss": 0.0701, + "step": 58800 + }, + { + "epoch": 0.59, + "grad_norm": 0.22936704754829407, + "learning_rate": 0.001, + "loss": 0.0691, + "step": 58900 + }, + { + "epoch": 0.59, + "grad_norm": 0.15228866040706635, + "learning_rate": 0.001, + "loss": 0.0729, + "step": 59000 + }, + { + "epoch": 0.59, + "grad_norm": 0.27094388008117676, + "learning_rate": 0.001, + "loss": 0.0706, + "step": 59100 + }, + { + "epoch": 0.59, + "grad_norm": 0.17357999086380005, + "learning_rate": 0.001, + "loss": 0.071, + "step": 59200 + }, + { + "epoch": 0.59, + "grad_norm": 0.2912188768386841, + "learning_rate": 0.001, + "loss": 0.0719, + "step": 59300 + }, + { + "epoch": 0.59, + "grad_norm": 0.24029956758022308, + "learning_rate": 0.001, + "loss": 0.07, + "step": 59400 + }, + { + "epoch": 0.59, + "grad_norm": 0.1956549882888794, + "learning_rate": 0.001, + "loss": 0.0712, + "step": 59500 + }, + { + "epoch": 0.6, + "grad_norm": 0.26984256505966187, + "learning_rate": 0.001, + "loss": 0.0713, + "step": 59600 + }, + { + "epoch": 0.6, + "grad_norm": 0.18548165261745453, + "learning_rate": 0.001, + "loss": 0.0686, + "step": 59700 + }, + { + "epoch": 0.6, + "grad_norm": 0.1833103895187378, + "learning_rate": 0.001, + "loss": 0.0672, + "step": 59800 + }, + { + "epoch": 0.6, + "grad_norm": 0.20417752861976624, + "learning_rate": 0.001, + "loss": 0.069, + "step": 59900 + }, + { + "epoch": 0.6, + "grad_norm": 0.3695315420627594, + "learning_rate": 0.001, + "loss": 0.0703, + "step": 60000 + }, + { + "epoch": 0.6, + "grad_norm": 0.23288464546203613, + "learning_rate": 0.001, + "loss": 0.0704, + "step": 60100 + }, + { + "epoch": 0.6, + "grad_norm": 0.21595774590969086, + "learning_rate": 0.001, + "loss": 0.0697, + "step": 60200 + }, + { + "epoch": 0.6, + "grad_norm": 0.16371206939220428, + "learning_rate": 0.001, + "loss": 0.0704, + "step": 60300 + }, + { + "epoch": 0.6, + "grad_norm": 0.2600916028022766, + "learning_rate": 0.001, + "loss": 0.0693, + "step": 60400 + }, + { + "epoch": 0.6, + "grad_norm": 0.21177971363067627, + "learning_rate": 0.001, + "loss": 0.0707, + "step": 60500 + }, + { + "epoch": 0.61, + "grad_norm": 0.16886168718338013, + "learning_rate": 0.001, + "loss": 0.0701, + "step": 60600 + }, + { + "epoch": 0.61, + "grad_norm": 0.29835718870162964, + "learning_rate": 0.001, + "loss": 0.0683, + "step": 60700 + }, + { + "epoch": 0.61, + "grad_norm": 0.2594737410545349, + "learning_rate": 0.001, + "loss": 0.0723, + "step": 60800 + }, + { + "epoch": 0.61, + "grad_norm": 0.2057715505361557, + "learning_rate": 0.001, + "loss": 0.0693, + "step": 60900 + }, + { + "epoch": 0.61, + "grad_norm": 0.2127043902873993, + "learning_rate": 0.001, + "loss": 0.0699, + "step": 61000 + }, + { + "epoch": 0.61, + "grad_norm": 0.18162322044372559, + "learning_rate": 0.001, + "loss": 0.0714, + "step": 61100 + }, + { + "epoch": 0.61, + "grad_norm": 0.21535515785217285, + "learning_rate": 0.001, + "loss": 0.0711, + "step": 61200 + }, + { + "epoch": 0.61, + "grad_norm": 0.19364242255687714, + "learning_rate": 0.001, + "loss": 0.0715, + "step": 61300 + }, + { + "epoch": 0.61, + "grad_norm": 0.14159826934337616, + "learning_rate": 0.001, + "loss": 0.07, + "step": 61400 + }, + { + "epoch": 0.61, + "grad_norm": 0.21536406874656677, + "learning_rate": 0.001, + "loss": 0.0689, + "step": 61500 + }, + { + "epoch": 0.62, + "grad_norm": 0.19926196336746216, + "learning_rate": 0.001, + "loss": 0.0689, + "step": 61600 + }, + { + "epoch": 0.62, + "grad_norm": 0.20217150449752808, + "learning_rate": 0.001, + "loss": 0.071, + "step": 61700 + }, + { + "epoch": 0.62, + "grad_norm": 0.17570650577545166, + "learning_rate": 0.001, + "loss": 0.0719, + "step": 61800 + }, + { + "epoch": 0.62, + "grad_norm": 0.19788751006126404, + "learning_rate": 0.001, + "loss": 0.0687, + "step": 61900 + }, + { + "epoch": 0.62, + "grad_norm": 0.22191910445690155, + "learning_rate": 0.001, + "loss": 0.0687, + "step": 62000 + }, + { + "epoch": 0.62, + "grad_norm": 0.19544494152069092, + "learning_rate": 0.001, + "loss": 0.0704, + "step": 62100 + }, + { + "epoch": 0.62, + "grad_norm": 0.32939237356185913, + "learning_rate": 0.001, + "loss": 0.0713, + "step": 62200 + }, + { + "epoch": 0.62, + "grad_norm": 0.1809149980545044, + "learning_rate": 0.001, + "loss": 0.0701, + "step": 62300 + }, + { + "epoch": 0.62, + "grad_norm": 0.2769867479801178, + "learning_rate": 0.001, + "loss": 0.0718, + "step": 62400 + }, + { + "epoch": 0.62, + "grad_norm": 0.15998759865760803, + "learning_rate": 0.001, + "loss": 0.0691, + "step": 62500 + }, + { + "epoch": 0.63, + "grad_norm": 0.29498517513275146, + "learning_rate": 0.001, + "loss": 0.0722, + "step": 62600 + }, + { + "epoch": 0.63, + "grad_norm": 0.19759228825569153, + "learning_rate": 0.001, + "loss": 0.0686, + "step": 62700 + }, + { + "epoch": 0.63, + "grad_norm": 0.12064652889966965, + "learning_rate": 0.001, + "loss": 0.0707, + "step": 62800 + }, + { + "epoch": 0.63, + "grad_norm": 0.19079501926898956, + "learning_rate": 0.001, + "loss": 0.0662, + "step": 62900 + }, + { + "epoch": 0.63, + "grad_norm": 0.22422794997692108, + "learning_rate": 0.001, + "loss": 0.0662, + "step": 63000 + }, + { + "epoch": 0.63, + "grad_norm": 0.16929177939891815, + "learning_rate": 0.001, + "loss": 0.0677, + "step": 63100 + }, + { + "epoch": 0.63, + "grad_norm": 0.20057950913906097, + "learning_rate": 0.001, + "loss": 0.0699, + "step": 63200 + }, + { + "epoch": 0.63, + "grad_norm": 0.4213920533657074, + "learning_rate": 0.001, + "loss": 0.0701, + "step": 63300 + }, + { + "epoch": 0.63, + "grad_norm": 0.28028371930122375, + "learning_rate": 0.001, + "loss": 0.0697, + "step": 63400 + }, + { + "epoch": 0.63, + "grad_norm": 0.18094098567962646, + "learning_rate": 0.001, + "loss": 0.0727, + "step": 63500 + }, + { + "epoch": 0.64, + "grad_norm": 0.30136585235595703, + "learning_rate": 0.001, + "loss": 0.0711, + "step": 63600 + }, + { + "epoch": 0.64, + "grad_norm": 0.192775696516037, + "learning_rate": 0.001, + "loss": 0.0721, + "step": 63700 + }, + { + "epoch": 0.64, + "grad_norm": 0.2211129367351532, + "learning_rate": 0.001, + "loss": 0.0695, + "step": 63800 + }, + { + "epoch": 0.64, + "grad_norm": 0.19226811826229095, + "learning_rate": 0.001, + "loss": 0.0699, + "step": 63900 + }, + { + "epoch": 0.64, + "grad_norm": 0.2471201866865158, + "learning_rate": 0.001, + "loss": 0.0692, + "step": 64000 + }, + { + "epoch": 0.64, + "grad_norm": 0.2547115385532379, + "learning_rate": 0.001, + "loss": 0.0673, + "step": 64100 + }, + { + "epoch": 0.64, + "grad_norm": 0.1899893879890442, + "learning_rate": 0.001, + "loss": 0.0693, + "step": 64200 + }, + { + "epoch": 0.64, + "grad_norm": 0.21257919073104858, + "learning_rate": 0.001, + "loss": 0.0684, + "step": 64300 + }, + { + "epoch": 0.64, + "grad_norm": 0.26688677072525024, + "learning_rate": 0.001, + "loss": 0.0683, + "step": 64400 + }, + { + "epoch": 0.64, + "grad_norm": 0.18874968588352203, + "learning_rate": 0.001, + "loss": 0.0688, + "step": 64500 + }, + { + "epoch": 0.65, + "grad_norm": 0.2013721913099289, + "learning_rate": 0.001, + "loss": 0.0684, + "step": 64600 + }, + { + "epoch": 0.65, + "grad_norm": 0.19745351374149323, + "learning_rate": 0.001, + "loss": 0.0685, + "step": 64700 + }, + { + "epoch": 0.65, + "grad_norm": 0.2137337028980255, + "learning_rate": 0.001, + "loss": 0.0671, + "step": 64800 + }, + { + "epoch": 0.65, + "grad_norm": 0.20300865173339844, + "learning_rate": 0.001, + "loss": 0.0684, + "step": 64900 + }, + { + "epoch": 0.65, + "grad_norm": 0.1723690927028656, + "learning_rate": 0.001, + "loss": 0.0681, + "step": 65000 + }, + { + "epoch": 0.65, + "grad_norm": 0.20693708956241608, + "learning_rate": 0.001, + "loss": 0.0685, + "step": 65100 + }, + { + "epoch": 0.65, + "grad_norm": 0.33531713485717773, + "learning_rate": 0.001, + "loss": 0.0687, + "step": 65200 + }, + { + "epoch": 0.65, + "grad_norm": 0.2180265337228775, + "learning_rate": 0.001, + "loss": 0.0719, + "step": 65300 + }, + { + "epoch": 0.65, + "grad_norm": 0.27855604887008667, + "learning_rate": 0.001, + "loss": 0.0686, + "step": 65400 + }, + { + "epoch": 0.65, + "grad_norm": 0.2309376448392868, + "learning_rate": 0.001, + "loss": 0.0682, + "step": 65500 + }, + { + "epoch": 0.66, + "grad_norm": 0.25525444746017456, + "learning_rate": 0.001, + "loss": 0.0698, + "step": 65600 + }, + { + "epoch": 0.66, + "grad_norm": 0.1746407151222229, + "learning_rate": 0.001, + "loss": 0.0692, + "step": 65700 + }, + { + "epoch": 0.66, + "grad_norm": 0.29511937499046326, + "learning_rate": 0.001, + "loss": 0.0675, + "step": 65800 + }, + { + "epoch": 0.66, + "grad_norm": 0.23610210418701172, + "learning_rate": 0.001, + "loss": 0.0682, + "step": 65900 + }, + { + "epoch": 0.66, + "grad_norm": 0.24088448286056519, + "learning_rate": 0.001, + "loss": 0.065, + "step": 66000 + }, + { + "epoch": 0.66, + "grad_norm": 0.3865065574645996, + "learning_rate": 0.001, + "loss": 0.068, + "step": 66100 + }, + { + "epoch": 0.66, + "grad_norm": 0.16312183439731598, + "learning_rate": 0.001, + "loss": 0.0674, + "step": 66200 + }, + { + "epoch": 0.66, + "grad_norm": 0.33910611271858215, + "learning_rate": 0.001, + "loss": 0.0657, + "step": 66300 + }, + { + "epoch": 0.66, + "grad_norm": 0.1491781622171402, + "learning_rate": 0.001, + "loss": 0.0663, + "step": 66400 + }, + { + "epoch": 0.66, + "grad_norm": 0.27082210779190063, + "learning_rate": 0.001, + "loss": 0.0692, + "step": 66500 + }, + { + "epoch": 0.67, + "grad_norm": 0.302495539188385, + "learning_rate": 0.001, + "loss": 0.0668, + "step": 66600 + }, + { + "epoch": 0.67, + "grad_norm": 0.1906341165304184, + "learning_rate": 0.001, + "loss": 0.0689, + "step": 66700 + }, + { + "epoch": 0.67, + "grad_norm": 0.21256040036678314, + "learning_rate": 0.001, + "loss": 0.0665, + "step": 66800 + }, + { + "epoch": 0.67, + "grad_norm": 0.16603924334049225, + "learning_rate": 0.001, + "loss": 0.07, + "step": 66900 + }, + { + "epoch": 0.67, + "grad_norm": 0.17136050760746002, + "learning_rate": 0.001, + "loss": 0.0715, + "step": 67000 + }, + { + "epoch": 0.67, + "grad_norm": 0.1679474115371704, + "learning_rate": 0.001, + "loss": 0.0667, + "step": 67100 + }, + { + "epoch": 0.67, + "grad_norm": 0.18445661664009094, + "learning_rate": 0.001, + "loss": 0.0688, + "step": 67200 + }, + { + "epoch": 0.67, + "grad_norm": 0.16743460297584534, + "learning_rate": 0.001, + "loss": 0.0672, + "step": 67300 + }, + { + "epoch": 0.67, + "grad_norm": 0.24309833347797394, + "learning_rate": 0.001, + "loss": 0.066, + "step": 67400 + }, + { + "epoch": 0.67, + "grad_norm": 0.15661662817001343, + "learning_rate": 0.001, + "loss": 0.0686, + "step": 67500 + }, + { + "epoch": 0.68, + "grad_norm": 0.32759585976600647, + "learning_rate": 0.001, + "loss": 0.0666, + "step": 67600 + }, + { + "epoch": 0.68, + "grad_norm": 0.1508253961801529, + "learning_rate": 0.001, + "loss": 0.068, + "step": 67700 + }, + { + "epoch": 0.68, + "grad_norm": 0.17459799349308014, + "learning_rate": 0.001, + "loss": 0.069, + "step": 67800 + }, + { + "epoch": 0.68, + "grad_norm": 0.2405272275209427, + "learning_rate": 0.001, + "loss": 0.0693, + "step": 67900 + }, + { + "epoch": 0.68, + "grad_norm": 0.2469649761915207, + "learning_rate": 0.001, + "loss": 0.0678, + "step": 68000 + }, + { + "epoch": 0.68, + "grad_norm": 0.25917258858680725, + "learning_rate": 0.001, + "loss": 0.0694, + "step": 68100 + }, + { + "epoch": 0.68, + "grad_norm": 0.1784822642803192, + "learning_rate": 0.001, + "loss": 0.0668, + "step": 68200 + }, + { + "epoch": 0.68, + "grad_norm": 0.22977730631828308, + "learning_rate": 0.001, + "loss": 0.0656, + "step": 68300 + }, + { + "epoch": 0.68, + "grad_norm": 0.1646946221590042, + "learning_rate": 0.001, + "loss": 0.068, + "step": 68400 + }, + { + "epoch": 0.68, + "grad_norm": 0.3220691978931427, + "learning_rate": 0.001, + "loss": 0.0665, + "step": 68500 + }, + { + "epoch": 0.69, + "grad_norm": 0.22109118103981018, + "learning_rate": 0.001, + "loss": 0.0684, + "step": 68600 + }, + { + "epoch": 0.69, + "grad_norm": 0.12051670998334885, + "learning_rate": 0.001, + "loss": 0.0675, + "step": 68700 + }, + { + "epoch": 0.69, + "grad_norm": 0.19576141238212585, + "learning_rate": 0.001, + "loss": 0.0655, + "step": 68800 + }, + { + "epoch": 0.69, + "grad_norm": 0.12783344089984894, + "learning_rate": 0.001, + "loss": 0.0677, + "step": 68900 + }, + { + "epoch": 0.69, + "grad_norm": 0.24854913353919983, + "learning_rate": 0.001, + "loss": 0.0684, + "step": 69000 + }, + { + "epoch": 0.69, + "grad_norm": 0.19816453754901886, + "learning_rate": 0.001, + "loss": 0.067, + "step": 69100 + }, + { + "epoch": 0.69, + "grad_norm": 0.20371900498867035, + "learning_rate": 0.001, + "loss": 0.0669, + "step": 69200 + }, + { + "epoch": 0.69, + "grad_norm": 0.24654364585876465, + "learning_rate": 0.001, + "loss": 0.0665, + "step": 69300 + }, + { + "epoch": 0.69, + "grad_norm": 0.22933346033096313, + "learning_rate": 0.001, + "loss": 0.0697, + "step": 69400 + }, + { + "epoch": 0.69, + "grad_norm": 0.3056330382823944, + "learning_rate": 0.001, + "loss": 0.0688, + "step": 69500 + }, + { + "epoch": 0.7, + "grad_norm": 0.14624419808387756, + "learning_rate": 0.001, + "loss": 0.0686, + "step": 69600 + }, + { + "epoch": 0.7, + "grad_norm": 0.23571297526359558, + "learning_rate": 0.001, + "loss": 0.0727, + "step": 69700 + }, + { + "epoch": 0.7, + "grad_norm": 0.20212960243225098, + "learning_rate": 0.001, + "loss": 0.0708, + "step": 69800 + }, + { + "epoch": 0.7, + "grad_norm": 0.22400203347206116, + "learning_rate": 0.001, + "loss": 0.0645, + "step": 69900 + }, + { + "epoch": 0.7, + "grad_norm": 0.15693353116512299, + "learning_rate": 0.001, + "loss": 0.066, + "step": 70000 + }, + { + "epoch": 0.7, + "grad_norm": 0.21171632409095764, + "learning_rate": 0.001, + "loss": 0.0651, + "step": 70100 + }, + { + "epoch": 0.7, + "grad_norm": 0.16716106235980988, + "learning_rate": 0.001, + "loss": 0.0651, + "step": 70200 + }, + { + "epoch": 0.7, + "grad_norm": 0.19692525267601013, + "learning_rate": 0.001, + "loss": 0.0677, + "step": 70300 + }, + { + "epoch": 0.7, + "grad_norm": 0.23514828085899353, + "learning_rate": 0.001, + "loss": 0.0651, + "step": 70400 + }, + { + "epoch": 0.7, + "grad_norm": 0.22567568719387054, + "learning_rate": 0.001, + "loss": 0.0658, + "step": 70500 + }, + { + "epoch": 0.71, + "grad_norm": 0.20934154093265533, + "learning_rate": 0.001, + "loss": 0.0661, + "step": 70600 + }, + { + "epoch": 0.71, + "grad_norm": 0.25384077429771423, + "learning_rate": 0.001, + "loss": 0.0658, + "step": 70700 + }, + { + "epoch": 0.71, + "grad_norm": 0.27204346656799316, + "learning_rate": 0.001, + "loss": 0.0685, + "step": 70800 + }, + { + "epoch": 0.71, + "grad_norm": 0.1900806725025177, + "learning_rate": 0.001, + "loss": 0.0637, + "step": 70900 + }, + { + "epoch": 0.71, + "grad_norm": 0.4064619243144989, + "learning_rate": 0.001, + "loss": 0.07, + "step": 71000 + }, + { + "epoch": 0.71, + "grad_norm": 0.22942863404750824, + "learning_rate": 0.001, + "loss": 0.067, + "step": 71100 + }, + { + "epoch": 0.71, + "grad_norm": 0.3398168683052063, + "learning_rate": 0.001, + "loss": 0.0673, + "step": 71200 + }, + { + "epoch": 0.71, + "grad_norm": 0.2937333881855011, + "learning_rate": 0.001, + "loss": 0.0689, + "step": 71300 + }, + { + "epoch": 0.71, + "grad_norm": 0.15955261886119843, + "learning_rate": 0.001, + "loss": 0.0644, + "step": 71400 + }, + { + "epoch": 0.71, + "grad_norm": 0.32867005467414856, + "learning_rate": 0.001, + "loss": 0.0668, + "step": 71500 + }, + { + "epoch": 0.72, + "grad_norm": 0.22879061102867126, + "learning_rate": 0.001, + "loss": 0.0641, + "step": 71600 + }, + { + "epoch": 0.72, + "grad_norm": 0.3147716224193573, + "learning_rate": 0.001, + "loss": 0.0643, + "step": 71700 + }, + { + "epoch": 0.72, + "grad_norm": 0.19312891364097595, + "learning_rate": 0.001, + "loss": 0.0654, + "step": 71800 + }, + { + "epoch": 0.72, + "grad_norm": 0.3658990263938904, + "learning_rate": 0.001, + "loss": 0.066, + "step": 71900 + }, + { + "epoch": 0.72, + "grad_norm": 0.2730260193347931, + "learning_rate": 0.001, + "loss": 0.0673, + "step": 72000 + }, + { + "epoch": 0.72, + "grad_norm": 0.3601909279823303, + "learning_rate": 0.001, + "loss": 0.0643, + "step": 72100 + }, + { + "epoch": 0.72, + "grad_norm": 0.13944287598133087, + "learning_rate": 0.001, + "loss": 0.0671, + "step": 72200 + }, + { + "epoch": 0.72, + "grad_norm": 0.1590428501367569, + "learning_rate": 0.001, + "loss": 0.0651, + "step": 72300 + }, + { + "epoch": 0.72, + "grad_norm": 0.17583294212818146, + "learning_rate": 0.001, + "loss": 0.0665, + "step": 72400 + }, + { + "epoch": 0.72, + "grad_norm": 0.1566411554813385, + "learning_rate": 0.001, + "loss": 0.0666, + "step": 72500 + }, + { + "epoch": 0.73, + "grad_norm": 0.26495423913002014, + "learning_rate": 0.001, + "loss": 0.0651, + "step": 72600 + }, + { + "epoch": 0.73, + "grad_norm": 0.17272372543811798, + "learning_rate": 0.001, + "loss": 0.0689, + "step": 72700 + }, + { + "epoch": 0.73, + "grad_norm": 0.2443661093711853, + "learning_rate": 0.001, + "loss": 0.065, + "step": 72800 + }, + { + "epoch": 0.73, + "grad_norm": 0.26695558428764343, + "learning_rate": 0.001, + "loss": 0.0637, + "step": 72900 + }, + { + "epoch": 0.73, + "grad_norm": 0.14408937096595764, + "learning_rate": 0.001, + "loss": 0.0676, + "step": 73000 + }, + { + "epoch": 0.73, + "grad_norm": 0.18142744898796082, + "learning_rate": 0.001, + "loss": 0.0653, + "step": 73100 + }, + { + "epoch": 0.73, + "grad_norm": 0.17100819945335388, + "learning_rate": 0.001, + "loss": 0.0631, + "step": 73200 + }, + { + "epoch": 0.73, + "grad_norm": 0.3703427314758301, + "learning_rate": 0.001, + "loss": 0.0665, + "step": 73300 + }, + { + "epoch": 0.73, + "grad_norm": 0.19516532123088837, + "learning_rate": 0.001, + "loss": 0.0656, + "step": 73400 + }, + { + "epoch": 0.73, + "grad_norm": 0.17610041797161102, + "learning_rate": 0.001, + "loss": 0.0658, + "step": 73500 + }, + { + "epoch": 0.74, + "grad_norm": 0.13331599533557892, + "learning_rate": 0.001, + "loss": 0.0653, + "step": 73600 + }, + { + "epoch": 0.74, + "grad_norm": 0.23824097216129303, + "learning_rate": 0.001, + "loss": 0.065, + "step": 73700 + }, + { + "epoch": 0.74, + "grad_norm": 0.1464979499578476, + "learning_rate": 0.001, + "loss": 0.0638, + "step": 73800 + }, + { + "epoch": 0.74, + "grad_norm": 0.18163511157035828, + "learning_rate": 0.001, + "loss": 0.0661, + "step": 73900 + }, + { + "epoch": 0.74, + "grad_norm": 0.1809806078672409, + "learning_rate": 0.001, + "loss": 0.0643, + "step": 74000 + }, + { + "epoch": 0.74, + "grad_norm": 0.23994535207748413, + "learning_rate": 0.001, + "loss": 0.0636, + "step": 74100 + }, + { + "epoch": 0.74, + "grad_norm": 0.17924870550632477, + "learning_rate": 0.001, + "loss": 0.064, + "step": 74200 + }, + { + "epoch": 0.74, + "grad_norm": 0.15770521759986877, + "learning_rate": 0.001, + "loss": 0.0661, + "step": 74300 + }, + { + "epoch": 0.74, + "grad_norm": 0.24632355570793152, + "learning_rate": 0.001, + "loss": 0.0644, + "step": 74400 + }, + { + "epoch": 0.74, + "grad_norm": 0.18300195038318634, + "learning_rate": 0.001, + "loss": 0.0592, + "step": 74500 + }, + { + "epoch": 0.75, + "grad_norm": 0.2745151221752167, + "learning_rate": 0.001, + "loss": 0.063, + "step": 74600 + }, + { + "epoch": 0.75, + "grad_norm": 0.18871140480041504, + "learning_rate": 0.001, + "loss": 0.063, + "step": 74700 + }, + { + "epoch": 0.75, + "grad_norm": 0.30228421092033386, + "learning_rate": 0.001, + "loss": 0.0661, + "step": 74800 + }, + { + "epoch": 0.75, + "grad_norm": 0.26834210753440857, + "learning_rate": 0.001, + "loss": 0.0626, + "step": 74900 + }, + { + "epoch": 0.75, + "grad_norm": 0.1998053640127182, + "learning_rate": 0.001, + "loss": 0.0655, + "step": 75000 + }, + { + "epoch": 0.75, + "grad_norm": 0.16265703737735748, + "learning_rate": 0.001, + "loss": 0.0648, + "step": 75100 + }, + { + "epoch": 0.75, + "grad_norm": 0.3203764259815216, + "learning_rate": 0.001, + "loss": 0.0636, + "step": 75200 + }, + { + "epoch": 0.75, + "grad_norm": 0.29416751861572266, + "learning_rate": 0.001, + "loss": 0.0613, + "step": 75300 + }, + { + "epoch": 0.75, + "grad_norm": 0.1761980801820755, + "learning_rate": 0.001, + "loss": 0.0718, + "step": 75400 + }, + { + "epoch": 0.75, + "grad_norm": 0.24760745465755463, + "learning_rate": 0.001, + "loss": 0.0641, + "step": 75500 + }, + { + "epoch": 0.76, + "grad_norm": 0.3362966477870941, + "learning_rate": 0.001, + "loss": 0.0678, + "step": 75600 + }, + { + "epoch": 0.76, + "grad_norm": 0.20644457638263702, + "learning_rate": 0.001, + "loss": 0.0653, + "step": 75700 + }, + { + "epoch": 0.76, + "grad_norm": 0.22632303833961487, + "learning_rate": 0.001, + "loss": 0.0679, + "step": 75800 + }, + { + "epoch": 0.76, + "grad_norm": 0.22177743911743164, + "learning_rate": 0.001, + "loss": 0.0628, + "step": 75900 + }, + { + "epoch": 0.76, + "grad_norm": 0.9697771072387695, + "learning_rate": 0.001, + "loss": 0.0659, + "step": 76000 + }, + { + "epoch": 0.76, + "grad_norm": 0.21862226724624634, + "learning_rate": 0.001, + "loss": 0.0654, + "step": 76100 + }, + { + "epoch": 0.76, + "grad_norm": 0.27506422996520996, + "learning_rate": 0.001, + "loss": 0.0636, + "step": 76200 + }, + { + "epoch": 0.76, + "grad_norm": 0.4953247606754303, + "learning_rate": 0.001, + "loss": 0.0648, + "step": 76300 + }, + { + "epoch": 0.76, + "grad_norm": 0.44132623076438904, + "learning_rate": 0.001, + "loss": 0.0641, + "step": 76400 + }, + { + "epoch": 0.76, + "grad_norm": 0.28104710578918457, + "learning_rate": 0.001, + "loss": 0.0623, + "step": 76500 + }, + { + "epoch": 0.77, + "grad_norm": 0.270434707403183, + "learning_rate": 0.001, + "loss": 0.0642, + "step": 76600 + }, + { + "epoch": 0.77, + "grad_norm": 0.17920733988285065, + "learning_rate": 0.001, + "loss": 0.0641, + "step": 76700 + }, + { + "epoch": 0.77, + "grad_norm": 0.27689895033836365, + "learning_rate": 0.001, + "loss": 0.0645, + "step": 76800 + }, + { + "epoch": 0.77, + "grad_norm": 0.22936861217021942, + "learning_rate": 0.001, + "loss": 0.0625, + "step": 76900 + }, + { + "epoch": 0.77, + "grad_norm": 0.2662585973739624, + "learning_rate": 0.001, + "loss": 0.0671, + "step": 77000 + }, + { + "epoch": 0.77, + "grad_norm": 0.23035678267478943, + "learning_rate": 0.001, + "loss": 0.0622, + "step": 77100 + }, + { + "epoch": 0.77, + "grad_norm": 0.19333815574645996, + "learning_rate": 0.001, + "loss": 0.0655, + "step": 77200 + }, + { + "epoch": 0.77, + "grad_norm": 0.2870350182056427, + "learning_rate": 0.001, + "loss": 0.0634, + "step": 77300 + }, + { + "epoch": 0.77, + "grad_norm": 0.22997340559959412, + "learning_rate": 0.001, + "loss": 0.0676, + "step": 77400 + }, + { + "epoch": 0.77, + "grad_norm": 0.19435285031795502, + "learning_rate": 0.001, + "loss": 0.0655, + "step": 77500 + }, + { + "epoch": 0.78, + "grad_norm": 0.2826205790042877, + "learning_rate": 0.001, + "loss": 0.0635, + "step": 77600 + }, + { + "epoch": 0.78, + "grad_norm": 0.20007766783237457, + "learning_rate": 0.001, + "loss": 0.0617, + "step": 77700 + }, + { + "epoch": 0.78, + "grad_norm": 0.15860234200954437, + "learning_rate": 0.001, + "loss": 0.0657, + "step": 77800 + }, + { + "epoch": 0.78, + "grad_norm": 0.40526214241981506, + "learning_rate": 0.001, + "loss": 0.0649, + "step": 77900 + }, + { + "epoch": 0.78, + "grad_norm": 0.24454933404922485, + "learning_rate": 0.001, + "loss": 0.0634, + "step": 78000 + }, + { + "epoch": 0.78, + "grad_norm": 0.12802359461784363, + "learning_rate": 0.001, + "loss": 0.0635, + "step": 78100 + }, + { + "epoch": 0.78, + "grad_norm": 0.32250648736953735, + "learning_rate": 0.001, + "loss": 0.0648, + "step": 78200 + }, + { + "epoch": 0.78, + "grad_norm": 0.253478467464447, + "learning_rate": 0.001, + "loss": 0.0648, + "step": 78300 + }, + { + "epoch": 0.78, + "grad_norm": 0.25307029485702515, + "learning_rate": 0.001, + "loss": 0.0648, + "step": 78400 + }, + { + "epoch": 0.78, + "grad_norm": 0.19091230630874634, + "learning_rate": 0.001, + "loss": 0.065, + "step": 78500 + }, + { + "epoch": 0.79, + "grad_norm": 0.17312967777252197, + "learning_rate": 0.001, + "loss": 0.0624, + "step": 78600 + }, + { + "epoch": 0.79, + "grad_norm": 0.19466041028499603, + "learning_rate": 0.001, + "loss": 0.0622, + "step": 78700 + }, + { + "epoch": 0.79, + "grad_norm": 0.25837138295173645, + "learning_rate": 0.001, + "loss": 0.0641, + "step": 78800 + }, + { + "epoch": 0.79, + "grad_norm": 0.1573166698217392, + "learning_rate": 0.001, + "loss": 0.0645, + "step": 78900 + }, + { + "epoch": 0.79, + "grad_norm": 0.1644609123468399, + "learning_rate": 0.001, + "loss": 0.0644, + "step": 79000 + }, + { + "epoch": 0.79, + "grad_norm": 0.20255005359649658, + "learning_rate": 0.001, + "loss": 0.0647, + "step": 79100 + }, + { + "epoch": 0.79, + "grad_norm": 0.48706310987472534, + "learning_rate": 0.001, + "loss": 0.0642, + "step": 79200 + }, + { + "epoch": 0.79, + "grad_norm": 0.3525262176990509, + "learning_rate": 0.001, + "loss": 0.0639, + "step": 79300 + }, + { + "epoch": 0.79, + "grad_norm": 0.20806559920310974, + "learning_rate": 0.001, + "loss": 0.0639, + "step": 79400 + }, + { + "epoch": 0.79, + "grad_norm": 0.441980242729187, + "learning_rate": 0.001, + "loss": 0.0645, + "step": 79500 + }, + { + "epoch": 0.8, + "grad_norm": 0.16818083822727203, + "learning_rate": 0.001, + "loss": 0.0625, + "step": 79600 + }, + { + "epoch": 0.8, + "grad_norm": 0.1843559443950653, + "learning_rate": 0.001, + "loss": 0.064, + "step": 79700 + }, + { + "epoch": 0.8, + "grad_norm": 0.19608129560947418, + "learning_rate": 0.001, + "loss": 0.0634, + "step": 79800 + }, + { + "epoch": 0.8, + "grad_norm": 0.34710460901260376, + "learning_rate": 0.001, + "loss": 0.0626, + "step": 79900 + }, + { + "epoch": 0.8, + "grad_norm": 0.4062146842479706, + "learning_rate": 0.001, + "loss": 0.0637, + "step": 80000 + }, + { + "epoch": 0.8, + "grad_norm": 0.23054763674736023, + "learning_rate": 0.001, + "loss": 0.0629, + "step": 80100 + }, + { + "epoch": 0.8, + "grad_norm": 0.20241042971611023, + "learning_rate": 0.001, + "loss": 0.0632, + "step": 80200 + }, + { + "epoch": 0.8, + "grad_norm": 0.17540830373764038, + "learning_rate": 0.001, + "loss": 0.0645, + "step": 80300 + }, + { + "epoch": 0.8, + "grad_norm": 0.2995645999908447, + "learning_rate": 0.001, + "loss": 0.0619, + "step": 80400 + }, + { + "epoch": 0.8, + "grad_norm": 0.2701890766620636, + "learning_rate": 0.001, + "loss": 0.0624, + "step": 80500 + }, + { + "epoch": 0.81, + "grad_norm": 0.5655909180641174, + "learning_rate": 0.001, + "loss": 0.0637, + "step": 80600 + }, + { + "epoch": 0.81, + "grad_norm": 0.24868199229240417, + "learning_rate": 0.001, + "loss": 0.0626, + "step": 80700 + }, + { + "epoch": 0.81, + "grad_norm": 0.205698162317276, + "learning_rate": 0.001, + "loss": 0.0616, + "step": 80800 + }, + { + "epoch": 0.81, + "grad_norm": 0.4373738169670105, + "learning_rate": 0.001, + "loss": 0.0635, + "step": 80900 + }, + { + "epoch": 0.81, + "grad_norm": 0.20648936927318573, + "learning_rate": 0.001, + "loss": 0.063, + "step": 81000 + }, + { + "epoch": 0.81, + "grad_norm": 0.49470582604408264, + "learning_rate": 0.001, + "loss": 0.064, + "step": 81100 + }, + { + "epoch": 0.81, + "grad_norm": 0.2360522598028183, + "learning_rate": 0.001, + "loss": 0.0606, + "step": 81200 + }, + { + "epoch": 0.81, + "grad_norm": 0.38575538992881775, + "learning_rate": 0.001, + "loss": 0.0626, + "step": 81300 + }, + { + "epoch": 0.81, + "grad_norm": 0.23714828491210938, + "learning_rate": 0.001, + "loss": 0.0628, + "step": 81400 + }, + { + "epoch": 0.81, + "grad_norm": 0.5665257573127747, + "learning_rate": 0.001, + "loss": 0.064, + "step": 81500 + }, + { + "epoch": 0.82, + "grad_norm": 0.2335139662027359, + "learning_rate": 0.001, + "loss": 0.0628, + "step": 81600 + }, + { + "epoch": 0.82, + "grad_norm": 0.23121795058250427, + "learning_rate": 0.001, + "loss": 0.0617, + "step": 81700 + }, + { + "epoch": 0.82, + "grad_norm": 0.2850015163421631, + "learning_rate": 0.001, + "loss": 0.0634, + "step": 81800 + }, + { + "epoch": 0.82, + "grad_norm": 0.25949451327323914, + "learning_rate": 0.001, + "loss": 0.0611, + "step": 81900 + }, + { + "epoch": 0.82, + "grad_norm": 0.15866072475910187, + "learning_rate": 0.001, + "loss": 0.0633, + "step": 82000 + }, + { + "epoch": 0.82, + "grad_norm": 0.1362059861421585, + "learning_rate": 0.001, + "loss": 0.0637, + "step": 82100 + }, + { + "epoch": 0.82, + "grad_norm": 0.23973006010055542, + "learning_rate": 0.001, + "loss": 0.0619, + "step": 82200 + }, + { + "epoch": 0.82, + "grad_norm": 0.2586152255535126, + "learning_rate": 0.001, + "loss": 0.0595, + "step": 82300 + }, + { + "epoch": 0.82, + "grad_norm": 0.33245041966438293, + "learning_rate": 0.001, + "loss": 0.0632, + "step": 82400 + }, + { + "epoch": 0.82, + "grad_norm": 0.1873330920934677, + "learning_rate": 0.001, + "loss": 0.0636, + "step": 82500 + }, + { + "epoch": 0.83, + "grad_norm": 0.23043370246887207, + "learning_rate": 0.001, + "loss": 0.0644, + "step": 82600 + }, + { + "epoch": 0.83, + "grad_norm": 0.21046708524227142, + "learning_rate": 0.001, + "loss": 0.0631, + "step": 82700 + }, + { + "epoch": 0.83, + "grad_norm": 0.15473945438861847, + "learning_rate": 0.001, + "loss": 0.06, + "step": 82800 + }, + { + "epoch": 0.83, + "grad_norm": 1.422141194343567, + "learning_rate": 0.001, + "loss": 0.0636, + "step": 82900 + }, + { + "epoch": 0.83, + "grad_norm": 0.16424107551574707, + "learning_rate": 0.001, + "loss": 0.0643, + "step": 83000 + }, + { + "epoch": 0.83, + "grad_norm": 0.3594319820404053, + "learning_rate": 0.001, + "loss": 0.0624, + "step": 83100 + }, + { + "epoch": 0.83, + "grad_norm": 0.26430365443229675, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 83200 + }, + { + "epoch": 0.83, + "grad_norm": 0.20655816793441772, + "learning_rate": 0.001, + "loss": 0.0619, + "step": 83300 + }, + { + "epoch": 0.83, + "grad_norm": 0.39340272545814514, + "learning_rate": 0.001, + "loss": 0.0624, + "step": 83400 + }, + { + "epoch": 0.83, + "grad_norm": 0.3113759160041809, + "learning_rate": 0.001, + "loss": 0.0598, + "step": 83500 + }, + { + "epoch": 0.84, + "grad_norm": 0.33689817786216736, + "learning_rate": 0.001, + "loss": 0.0604, + "step": 83600 + }, + { + "epoch": 0.84, + "grad_norm": 0.2195175141096115, + "learning_rate": 0.001, + "loss": 0.0618, + "step": 83700 + }, + { + "epoch": 0.84, + "grad_norm": 0.2397637814283371, + "learning_rate": 0.001, + "loss": 0.0618, + "step": 83800 + }, + { + "epoch": 0.84, + "grad_norm": 0.28967469930648804, + "learning_rate": 0.001, + "loss": 0.0612, + "step": 83900 + }, + { + "epoch": 0.84, + "grad_norm": 0.23908008635044098, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 84000 + }, + { + "epoch": 0.84, + "grad_norm": 0.36196354031562805, + "learning_rate": 0.001, + "loss": 0.061, + "step": 84100 + }, + { + "epoch": 0.84, + "grad_norm": 0.3068004250526428, + "learning_rate": 0.001, + "loss": 0.0614, + "step": 84200 + }, + { + "epoch": 0.84, + "grad_norm": 0.2148333489894867, + "learning_rate": 0.001, + "loss": 0.0624, + "step": 84300 + }, + { + "epoch": 0.84, + "grad_norm": 0.19169430434703827, + "learning_rate": 0.001, + "loss": 0.0615, + "step": 84400 + }, + { + "epoch": 0.84, + "grad_norm": 0.23916268348693848, + "learning_rate": 0.001, + "loss": 0.0654, + "step": 84500 + }, + { + "epoch": 0.85, + "grad_norm": 0.20304815471172333, + "learning_rate": 0.001, + "loss": 0.0613, + "step": 84600 + }, + { + "epoch": 0.85, + "grad_norm": 0.2983682155609131, + "learning_rate": 0.001, + "loss": 0.0617, + "step": 84700 + }, + { + "epoch": 0.85, + "grad_norm": 0.22442661225795746, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 84800 + }, + { + "epoch": 0.85, + "grad_norm": 0.28299954533576965, + "learning_rate": 0.001, + "loss": 0.0636, + "step": 84900 + }, + { + "epoch": 0.85, + "grad_norm": 0.30491936206817627, + "learning_rate": 0.001, + "loss": 0.0608, + "step": 85000 + }, + { + "epoch": 0.85, + "grad_norm": 0.30804798007011414, + "learning_rate": 0.001, + "loss": 0.0609, + "step": 85100 + }, + { + "epoch": 0.85, + "grad_norm": 0.18533004820346832, + "learning_rate": 0.001, + "loss": 0.0602, + "step": 85200 + }, + { + "epoch": 0.85, + "grad_norm": 0.23856715857982635, + "learning_rate": 0.001, + "loss": 0.0638, + "step": 85300 + }, + { + "epoch": 0.85, + "grad_norm": 0.2646658420562744, + "learning_rate": 0.001, + "loss": 0.0622, + "step": 85400 + }, + { + "epoch": 0.85, + "grad_norm": 0.2357235699892044, + "learning_rate": 0.001, + "loss": 0.0617, + "step": 85500 + }, + { + "epoch": 0.86, + "grad_norm": 0.1675509363412857, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 85600 + }, + { + "epoch": 0.86, + "grad_norm": 0.20707982778549194, + "learning_rate": 0.001, + "loss": 0.0617, + "step": 85700 + }, + { + "epoch": 0.86, + "grad_norm": 0.34539708495140076, + "learning_rate": 0.001, + "loss": 0.06, + "step": 85800 + }, + { + "epoch": 0.86, + "grad_norm": 0.28429824113845825, + "learning_rate": 0.001, + "loss": 0.0587, + "step": 85900 + }, + { + "epoch": 0.86, + "grad_norm": 0.3121056854724884, + "learning_rate": 0.001, + "loss": 0.0615, + "step": 86000 + }, + { + "epoch": 0.86, + "grad_norm": 0.25750598311424255, + "learning_rate": 0.001, + "loss": 0.0613, + "step": 86100 + }, + { + "epoch": 0.86, + "grad_norm": 0.18927526473999023, + "learning_rate": 0.001, + "loss": 0.0592, + "step": 86200 + }, + { + "epoch": 0.86, + "grad_norm": 0.3551163971424103, + "learning_rate": 0.001, + "loss": 0.0619, + "step": 86300 + }, + { + "epoch": 0.86, + "grad_norm": 0.19404169917106628, + "learning_rate": 0.001, + "loss": 0.0617, + "step": 86400 + }, + { + "epoch": 0.86, + "grad_norm": 0.16969504952430725, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 86500 + }, + { + "epoch": 0.87, + "grad_norm": 0.20026318728923798, + "learning_rate": 0.001, + "loss": 0.0606, + "step": 86600 + }, + { + "epoch": 0.87, + "grad_norm": 0.30545106530189514, + "learning_rate": 0.001, + "loss": 0.0594, + "step": 86700 + }, + { + "epoch": 0.87, + "grad_norm": 0.2734260559082031, + "learning_rate": 0.001, + "loss": 0.0644, + "step": 86800 + }, + { + "epoch": 0.87, + "grad_norm": 0.3157080411911011, + "learning_rate": 0.001, + "loss": 0.0618, + "step": 86900 + }, + { + "epoch": 0.87, + "grad_norm": 0.19793906807899475, + "learning_rate": 0.001, + "loss": 0.0616, + "step": 87000 + }, + { + "epoch": 0.87, + "grad_norm": 0.1849125623703003, + "learning_rate": 0.001, + "loss": 0.0596, + "step": 87100 + }, + { + "epoch": 0.87, + "grad_norm": 0.18340341746807098, + "learning_rate": 0.001, + "loss": 0.0625, + "step": 87200 + }, + { + "epoch": 0.87, + "grad_norm": 0.26056426763534546, + "learning_rate": 0.001, + "loss": 0.0595, + "step": 87300 + }, + { + "epoch": 0.87, + "grad_norm": 0.22235774993896484, + "learning_rate": 0.001, + "loss": 0.0606, + "step": 87400 + }, + { + "epoch": 0.87, + "grad_norm": 0.31580013036727905, + "learning_rate": 0.001, + "loss": 0.0615, + "step": 87500 + }, + { + "epoch": 0.88, + "grad_norm": 0.2364477515220642, + "learning_rate": 0.001, + "loss": 0.0616, + "step": 87600 + }, + { + "epoch": 0.88, + "grad_norm": 0.23212990164756775, + "learning_rate": 0.001, + "loss": 0.0594, + "step": 87700 + }, + { + "epoch": 0.88, + "grad_norm": 0.21986854076385498, + "learning_rate": 0.001, + "loss": 0.0592, + "step": 87800 + }, + { + "epoch": 0.88, + "grad_norm": 0.2496929168701172, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 87900 + }, + { + "epoch": 0.88, + "grad_norm": 0.19572298228740692, + "learning_rate": 0.001, + "loss": 0.0588, + "step": 88000 + }, + { + "epoch": 0.88, + "grad_norm": 0.16231012344360352, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 88100 + }, + { + "epoch": 0.88, + "grad_norm": 0.21093867719173431, + "learning_rate": 0.001, + "loss": 0.0625, + "step": 88200 + }, + { + "epoch": 0.88, + "grad_norm": 0.16491778194904327, + "learning_rate": 0.001, + "loss": 0.0602, + "step": 88300 + }, + { + "epoch": 0.88, + "grad_norm": 0.24729378521442413, + "learning_rate": 0.001, + "loss": 0.0573, + "step": 88400 + }, + { + "epoch": 0.88, + "grad_norm": 0.3726213276386261, + "learning_rate": 0.001, + "loss": 0.0589, + "step": 88500 + }, + { + "epoch": 0.89, + "grad_norm": 0.1926572024822235, + "learning_rate": 0.001, + "loss": 0.0602, + "step": 88600 + }, + { + "epoch": 0.89, + "grad_norm": 0.2153882533311844, + "learning_rate": 0.001, + "loss": 0.0597, + "step": 88700 + }, + { + "epoch": 0.89, + "grad_norm": 0.25205257534980774, + "learning_rate": 0.001, + "loss": 0.0581, + "step": 88800 + }, + { + "epoch": 0.89, + "grad_norm": 0.16898304224014282, + "learning_rate": 0.001, + "loss": 0.0614, + "step": 88900 + }, + { + "epoch": 0.89, + "grad_norm": 0.2840329110622406, + "learning_rate": 0.001, + "loss": 0.0615, + "step": 89000 + }, + { + "epoch": 0.89, + "grad_norm": 0.22306442260742188, + "learning_rate": 0.001, + "loss": 0.0606, + "step": 89100 + }, + { + "epoch": 0.89, + "grad_norm": 0.2778179943561554, + "learning_rate": 0.001, + "loss": 0.0606, + "step": 89200 + }, + { + "epoch": 0.89, + "grad_norm": 0.1956636756658554, + "learning_rate": 0.001, + "loss": 0.0585, + "step": 89300 + }, + { + "epoch": 0.89, + "grad_norm": 0.15973015129566193, + "learning_rate": 0.001, + "loss": 0.0598, + "step": 89400 + }, + { + "epoch": 0.89, + "grad_norm": 0.2306407243013382, + "learning_rate": 0.001, + "loss": 0.0597, + "step": 89500 + }, + { + "epoch": 0.9, + "grad_norm": 0.19012047350406647, + "learning_rate": 0.001, + "loss": 0.0608, + "step": 89600 + }, + { + "epoch": 0.9, + "grad_norm": 0.214030921459198, + "learning_rate": 0.001, + "loss": 0.0586, + "step": 89700 + }, + { + "epoch": 0.9, + "grad_norm": 0.26291027665138245, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 89800 + }, + { + "epoch": 0.9, + "grad_norm": 0.140648752450943, + "learning_rate": 0.001, + "loss": 0.0605, + "step": 89900 + }, + { + "epoch": 0.9, + "grad_norm": 0.3011924624443054, + "learning_rate": 0.001, + "loss": 0.0609, + "step": 90000 + }, + { + "epoch": 0.9, + "grad_norm": 0.24463798105716705, + "learning_rate": 0.001, + "loss": 0.0587, + "step": 90100 + }, + { + "epoch": 0.9, + "grad_norm": 0.2608613073825836, + "learning_rate": 0.001, + "loss": 0.0595, + "step": 90200 + }, + { + "epoch": 0.9, + "grad_norm": 0.23249809443950653, + "learning_rate": 0.001, + "loss": 0.0592, + "step": 90300 + }, + { + "epoch": 0.9, + "grad_norm": 0.36541712284088135, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 90400 + }, + { + "epoch": 0.9, + "grad_norm": 0.45584437251091003, + "learning_rate": 0.001, + "loss": 0.0587, + "step": 90500 + }, + { + "epoch": 0.91, + "grad_norm": 0.20905092358589172, + "learning_rate": 0.001, + "loss": 0.0595, + "step": 90600 + }, + { + "epoch": 0.91, + "grad_norm": 0.18202795088291168, + "learning_rate": 0.001, + "loss": 0.0568, + "step": 90700 + }, + { + "epoch": 0.91, + "grad_norm": 0.2321150153875351, + "learning_rate": 0.001, + "loss": 0.0605, + "step": 90800 + }, + { + "epoch": 0.91, + "grad_norm": 0.17175626754760742, + "learning_rate": 0.001, + "loss": 0.0596, + "step": 90900 + }, + { + "epoch": 0.91, + "grad_norm": 0.21932841837406158, + "learning_rate": 0.001, + "loss": 0.0585, + "step": 91000 + }, + { + "epoch": 0.91, + "grad_norm": 0.30282464623451233, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 91100 + }, + { + "epoch": 0.91, + "grad_norm": 0.2639208436012268, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 91200 + }, + { + "epoch": 0.91, + "grad_norm": 0.23805926740169525, + "learning_rate": 0.001, + "loss": 0.0576, + "step": 91300 + }, + { + "epoch": 0.91, + "grad_norm": 0.2307603508234024, + "learning_rate": 0.001, + "loss": 0.0602, + "step": 91400 + }, + { + "epoch": 0.91, + "grad_norm": 0.1786148101091385, + "learning_rate": 0.001, + "loss": 0.0598, + "step": 91500 + }, + { + "epoch": 0.92, + "grad_norm": 0.1955350786447525, + "learning_rate": 0.001, + "loss": 0.0576, + "step": 91600 + }, + { + "epoch": 0.92, + "grad_norm": 0.24684827029705048, + "learning_rate": 0.001, + "loss": 0.0571, + "step": 91700 + }, + { + "epoch": 0.92, + "grad_norm": 0.2771402895450592, + "learning_rate": 0.001, + "loss": 0.058, + "step": 91800 + }, + { + "epoch": 0.92, + "grad_norm": 0.28878656029701233, + "learning_rate": 0.001, + "loss": 0.0585, + "step": 91900 + }, + { + "epoch": 0.92, + "grad_norm": 0.7780060172080994, + "learning_rate": 0.001, + "loss": 0.0574, + "step": 92000 + }, + { + "epoch": 0.92, + "grad_norm": 0.25102126598358154, + "learning_rate": 0.001, + "loss": 0.0576, + "step": 92100 + }, + { + "epoch": 0.92, + "grad_norm": 0.26416492462158203, + "learning_rate": 0.001, + "loss": 0.0614, + "step": 92200 + }, + { + "epoch": 0.92, + "grad_norm": 0.26566821336746216, + "learning_rate": 0.001, + "loss": 0.0586, + "step": 92300 + }, + { + "epoch": 0.92, + "grad_norm": 0.25432705879211426, + "learning_rate": 0.001, + "loss": 0.0586, + "step": 92400 + }, + { + "epoch": 0.92, + "grad_norm": 0.2592636048793793, + "learning_rate": 0.001, + "loss": 0.0576, + "step": 92500 + }, + { + "epoch": 0.93, + "grad_norm": 0.3514898419380188, + "learning_rate": 0.001, + "loss": 0.0579, + "step": 92600 + }, + { + "epoch": 0.93, + "grad_norm": 0.2749045491218567, + "learning_rate": 0.001, + "loss": 0.061, + "step": 92700 + }, + { + "epoch": 0.93, + "grad_norm": 0.2799491882324219, + "learning_rate": 0.001, + "loss": 0.0579, + "step": 92800 + }, + { + "epoch": 0.93, + "grad_norm": 0.2252642959356308, + "learning_rate": 0.001, + "loss": 0.0584, + "step": 92900 + }, + { + "epoch": 0.93, + "grad_norm": 0.18218593299388885, + "learning_rate": 0.001, + "loss": 0.0577, + "step": 93000 + }, + { + "epoch": 0.93, + "grad_norm": 0.27551427483558655, + "learning_rate": 0.001, + "loss": 0.0605, + "step": 93100 + }, + { + "epoch": 0.93, + "grad_norm": 0.26159995794296265, + "learning_rate": 0.001, + "loss": 0.0562, + "step": 93200 + }, + { + "epoch": 0.93, + "grad_norm": 0.15979285538196564, + "learning_rate": 0.001, + "loss": 0.0615, + "step": 93300 + }, + { + "epoch": 0.93, + "grad_norm": 0.23418280482292175, + "learning_rate": 0.001, + "loss": 0.0594, + "step": 93400 + }, + { + "epoch": 0.93, + "grad_norm": 0.16936419904232025, + "learning_rate": 0.001, + "loss": 0.0611, + "step": 93500 + }, + { + "epoch": 0.94, + "grad_norm": 0.2862916886806488, + "learning_rate": 0.001, + "loss": 0.0584, + "step": 93600 + }, + { + "epoch": 0.94, + "grad_norm": 0.5302750468254089, + "learning_rate": 0.001, + "loss": 0.0561, + "step": 93700 + }, + { + "epoch": 0.94, + "grad_norm": 0.43644002079963684, + "learning_rate": 0.001, + "loss": 0.0581, + "step": 93800 + }, + { + "epoch": 0.94, + "grad_norm": 0.19219018518924713, + "learning_rate": 0.001, + "loss": 0.0591, + "step": 93900 + }, + { + "epoch": 0.94, + "grad_norm": 0.29645296931266785, + "learning_rate": 0.001, + "loss": 0.0587, + "step": 94000 + }, + { + "epoch": 0.94, + "grad_norm": 0.24861380457878113, + "learning_rate": 0.001, + "loss": 0.0594, + "step": 94100 + }, + { + "epoch": 0.94, + "grad_norm": 0.2443215548992157, + "learning_rate": 0.001, + "loss": 0.057, + "step": 94200 + }, + { + "epoch": 0.94, + "grad_norm": 0.13077589869499207, + "learning_rate": 0.001, + "loss": 0.0563, + "step": 94300 + }, + { + "epoch": 0.94, + "grad_norm": 0.24280287325382233, + "learning_rate": 0.001, + "loss": 0.0591, + "step": 94400 + }, + { + "epoch": 0.94, + "grad_norm": 0.25838151574134827, + "learning_rate": 0.001, + "loss": 0.0583, + "step": 94500 + }, + { + "epoch": 0.95, + "grad_norm": 0.33244743943214417, + "learning_rate": 0.001, + "loss": 0.0587, + "step": 94600 + }, + { + "epoch": 0.95, + "grad_norm": 0.45074304938316345, + "learning_rate": 0.001, + "loss": 0.0572, + "step": 94700 + }, + { + "epoch": 0.95, + "grad_norm": 0.2540782392024994, + "learning_rate": 0.001, + "loss": 0.0584, + "step": 94800 + }, + { + "epoch": 0.95, + "grad_norm": 0.29180458188056946, + "learning_rate": 0.001, + "loss": 0.0609, + "step": 94900 + }, + { + "epoch": 0.95, + "grad_norm": 0.18510323762893677, + "learning_rate": 0.001, + "loss": 0.058, + "step": 95000 + }, + { + "epoch": 0.95, + "grad_norm": 0.28962787985801697, + "learning_rate": 0.001, + "loss": 0.0562, + "step": 95100 + }, + { + "epoch": 0.95, + "grad_norm": 0.26887577772140503, + "learning_rate": 0.001, + "loss": 0.0573, + "step": 95200 + }, + { + "epoch": 0.95, + "grad_norm": 0.20729154348373413, + "learning_rate": 0.001, + "loss": 0.057, + "step": 95300 + }, + { + "epoch": 0.95, + "grad_norm": 0.19953325390815735, + "learning_rate": 0.001, + "loss": 0.0594, + "step": 95400 + }, + { + "epoch": 0.95, + "grad_norm": 0.15926332771778107, + "learning_rate": 0.001, + "loss": 0.0582, + "step": 95500 + }, + { + "epoch": 0.96, + "grad_norm": 0.23609544336795807, + "learning_rate": 0.001, + "loss": 0.0579, + "step": 95600 + }, + { + "epoch": 0.96, + "grad_norm": 0.13997937738895416, + "learning_rate": 0.001, + "loss": 0.0574, + "step": 95700 + }, + { + "epoch": 0.96, + "grad_norm": 0.23629073798656464, + "learning_rate": 0.001, + "loss": 0.0585, + "step": 95800 + }, + { + "epoch": 0.96, + "grad_norm": 0.3770292401313782, + "learning_rate": 0.001, + "loss": 0.0572, + "step": 95900 + }, + { + "epoch": 0.96, + "grad_norm": 0.3013598322868347, + "learning_rate": 0.001, + "loss": 0.0606, + "step": 96000 + }, + { + "epoch": 0.96, + "grad_norm": 0.2350749522447586, + "learning_rate": 0.001, + "loss": 0.057, + "step": 96100 + }, + { + "epoch": 0.96, + "grad_norm": 0.301268994808197, + "learning_rate": 0.001, + "loss": 0.0586, + "step": 96200 + }, + { + "epoch": 0.96, + "grad_norm": 0.22475981712341309, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 96300 + }, + { + "epoch": 0.96, + "grad_norm": 0.3032160997390747, + "learning_rate": 0.001, + "loss": 0.0591, + "step": 96400 + }, + { + "epoch": 0.96, + "grad_norm": 0.5848428010940552, + "learning_rate": 0.001, + "loss": 0.0559, + "step": 96500 + }, + { + "epoch": 0.97, + "grad_norm": 0.20164470374584198, + "learning_rate": 0.001, + "loss": 0.0579, + "step": 96600 + }, + { + "epoch": 0.97, + "grad_norm": 0.18068142235279083, + "learning_rate": 0.001, + "loss": 0.0579, + "step": 96700 + }, + { + "epoch": 0.97, + "grad_norm": 0.31181275844573975, + "learning_rate": 0.001, + "loss": 0.0588, + "step": 96800 + }, + { + "epoch": 0.97, + "grad_norm": 0.23156049847602844, + "learning_rate": 0.001, + "loss": 0.058, + "step": 96900 + }, + { + "epoch": 0.97, + "grad_norm": 0.18572886288166046, + "learning_rate": 0.001, + "loss": 0.0599, + "step": 97000 + }, + { + "epoch": 0.97, + "grad_norm": 0.17736677825450897, + "learning_rate": 0.001, + "loss": 0.0561, + "step": 97100 + }, + { + "epoch": 0.97, + "grad_norm": 0.4838601052761078, + "learning_rate": 0.001, + "loss": 0.0595, + "step": 97200 + }, + { + "epoch": 0.97, + "grad_norm": 0.21476797759532928, + "learning_rate": 0.001, + "loss": 0.0609, + "step": 97300 + }, + { + "epoch": 0.97, + "grad_norm": 0.2181667536497116, + "learning_rate": 0.001, + "loss": 0.0583, + "step": 97400 + }, + { + "epoch": 0.97, + "grad_norm": 0.26551786065101624, + "learning_rate": 0.001, + "loss": 0.0566, + "step": 97500 + }, + { + "epoch": 0.98, + "grad_norm": 0.2258795201778412, + "learning_rate": 0.001, + "loss": 0.0574, + "step": 97600 + }, + { + "epoch": 0.98, + "grad_norm": 0.17733299732208252, + "learning_rate": 0.001, + "loss": 0.0588, + "step": 97700 + }, + { + "epoch": 0.98, + "grad_norm": 0.4031812846660614, + "learning_rate": 0.001, + "loss": 0.0584, + "step": 97800 + }, + { + "epoch": 0.98, + "grad_norm": 0.22529329359531403, + "learning_rate": 0.001, + "loss": 0.0572, + "step": 97900 + }, + { + "epoch": 0.98, + "grad_norm": 0.2503925561904907, + "learning_rate": 0.001, + "loss": 0.0588, + "step": 98000 + }, + { + "epoch": 0.98, + "grad_norm": 0.17040744423866272, + "learning_rate": 0.001, + "loss": 0.0603, + "step": 98100 + }, + { + "epoch": 0.98, + "grad_norm": 0.17749032378196716, + "learning_rate": 0.001, + "loss": 0.057, + "step": 98200 + }, + { + "epoch": 0.98, + "grad_norm": 0.3931177854537964, + "learning_rate": 0.001, + "loss": 0.0566, + "step": 98300 + }, + { + "epoch": 0.98, + "grad_norm": 0.22418583929538727, + "learning_rate": 0.001, + "loss": 0.0574, + "step": 98400 + }, + { + "epoch": 0.98, + "grad_norm": 0.30830493569374084, + "learning_rate": 0.001, + "loss": 0.0593, + "step": 98500 + }, + { + "epoch": 0.99, + "grad_norm": 0.2269369661808014, + "learning_rate": 0.001, + "loss": 0.0585, + "step": 98600 + }, + { + "epoch": 0.99, + "grad_norm": 0.31830596923828125, + "learning_rate": 0.001, + "loss": 0.0548, + "step": 98700 + }, + { + "epoch": 0.99, + "grad_norm": 0.25759172439575195, + "learning_rate": 0.001, + "loss": 0.0564, + "step": 98800 + }, + { + "epoch": 0.99, + "grad_norm": 0.23925898969173431, + "learning_rate": 0.001, + "loss": 0.0592, + "step": 98900 + }, + { + "epoch": 0.99, + "grad_norm": 0.17434507608413696, + "learning_rate": 0.001, + "loss": 0.0583, + "step": 99000 + }, + { + "epoch": 0.99, + "grad_norm": 0.3493863642215729, + "learning_rate": 0.001, + "loss": 0.0571, + "step": 99100 + }, + { + "epoch": 0.99, + "grad_norm": 0.20887431502342224, + "learning_rate": 0.001, + "loss": 0.0564, + "step": 99200 + }, + { + "epoch": 0.99, + "grad_norm": 0.18060541152954102, + "learning_rate": 0.001, + "loss": 0.0583, + "step": 99300 + }, + { + "epoch": 0.99, + "grad_norm": 0.3689703047275543, + "learning_rate": 0.001, + "loss": 0.0565, + "step": 99400 + }, + { + "epoch": 0.99, + "grad_norm": 0.25323519110679626, + "learning_rate": 0.001, + "loss": 0.0576, + "step": 99500 + }, + { + "epoch": 1.0, + "grad_norm": 0.27348294854164124, + "learning_rate": 0.001, + "loss": 0.0568, + "step": 99600 + }, + { + "epoch": 1.0, + "grad_norm": 0.25492238998413086, + "learning_rate": 0.001, + "loss": 0.0561, + "step": 99700 + }, + { + "epoch": 1.0, + "grad_norm": 0.2604049742221832, + "learning_rate": 0.001, + "loss": 0.0564, + "step": 99800 + }, + { + "epoch": 1.0, + "grad_norm": 0.37222278118133545, + "learning_rate": 0.001, + "loss": 0.059, + "step": 99900 + }, + { + "epoch": 1.0, + "grad_norm": 0.3180735111236572, + "learning_rate": 0.001, + "loss": 0.0588, + "step": 100000 + }, + { + "epoch": 1.0, + "step": 100000, + "total_flos": 8.920695708927918e+18, + "train_loss": 0.09189929046154022, + "train_runtime": 235079.2305, + "train_samples_per_second": 54.45, + "train_steps_per_second": 0.425 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "total_flos": 8.920695708927918e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +} +