|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.99999750000625, |
|
"eval_steps": 500, |
|
"global_step": 100000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.3668140172958374, |
|
"learning_rate": 0.001, |
|
"loss": 1.2955, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.4789515733718872, |
|
"learning_rate": 0.001, |
|
"loss": 0.2147, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.8046264052391052, |
|
"learning_rate": 0.001, |
|
"loss": 0.1773, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.6500861644744873, |
|
"learning_rate": 0.001, |
|
"loss": 0.169, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.9476549625396729, |
|
"learning_rate": 0.001, |
|
"loss": 0.155, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7570195198059082, |
|
"learning_rate": 0.001, |
|
"loss": 0.221, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.13534319400787354, |
|
"learning_rate": 0.001, |
|
"loss": 0.242, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.12334191799163818, |
|
"learning_rate": 0.001, |
|
"loss": 0.2387, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.0074245929718018, |
|
"learning_rate": 0.001, |
|
"loss": 0.1844, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.2459566444158554, |
|
"learning_rate": 0.001, |
|
"loss": 0.2273, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.35431796312332153, |
|
"learning_rate": 0.001, |
|
"loss": 0.2406, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.07735779881477356, |
|
"learning_rate": 0.001, |
|
"loss": 0.2362, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.197942316532135, |
|
"learning_rate": 0.001, |
|
"loss": 0.2361, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.06753970682621002, |
|
"learning_rate": 0.001, |
|
"loss": 0.2346, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.17562294006347656, |
|
"learning_rate": 0.001, |
|
"loss": 0.2356, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.12020650506019592, |
|
"learning_rate": 0.001, |
|
"loss": 0.2343, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.07772481441497803, |
|
"learning_rate": 0.001, |
|
"loss": 0.2343, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.041362863034009933, |
|
"learning_rate": 0.001, |
|
"loss": 0.2345, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.050947155803442, |
|
"learning_rate": 0.001, |
|
"loss": 0.2333, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.24440822005271912, |
|
"learning_rate": 0.001, |
|
"loss": 0.2346, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.4386675953865051, |
|
"learning_rate": 0.001, |
|
"loss": 0.2346, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.054741185158491135, |
|
"learning_rate": 0.001, |
|
"loss": 0.2347, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5285407304763794, |
|
"learning_rate": 0.001, |
|
"loss": 0.2341, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5406210422515869, |
|
"learning_rate": 0.001, |
|
"loss": 0.2322, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.1667808294296265, |
|
"learning_rate": 0.001, |
|
"loss": 0.1683, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.11829289048910141, |
|
"learning_rate": 0.001, |
|
"loss": 0.2138, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.528359055519104, |
|
"learning_rate": 0.001, |
|
"loss": 0.1902, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.45457515120506287, |
|
"learning_rate": 0.001, |
|
"loss": 0.1592, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.2595893144607544, |
|
"learning_rate": 0.001, |
|
"loss": 0.1511, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.5346922278404236, |
|
"learning_rate": 0.001, |
|
"loss": 0.1439, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.5066208839416504, |
|
"learning_rate": 0.001, |
|
"loss": 0.1617, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 24.826475143432617, |
|
"learning_rate": 0.001, |
|
"loss": 0.2024, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 10.144634246826172, |
|
"learning_rate": 0.001, |
|
"loss": 0.1882, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.43425676226615906, |
|
"learning_rate": 0.001, |
|
"loss": 0.169, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3496113717556, |
|
"learning_rate": 0.001, |
|
"loss": 0.1542, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.317073345184326, |
|
"learning_rate": 0.001, |
|
"loss": 0.1676, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.1362758874893188, |
|
"learning_rate": 0.001, |
|
"loss": 0.1599, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.871659755706787, |
|
"learning_rate": 0.001, |
|
"loss": 0.1473, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.10563373565673828, |
|
"learning_rate": 0.001, |
|
"loss": 0.1652, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.08865318447351456, |
|
"learning_rate": 0.001, |
|
"loss": 0.2326, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.0642586424946785, |
|
"learning_rate": 0.001, |
|
"loss": 0.2329, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.36199188232421875, |
|
"learning_rate": 0.001, |
|
"loss": 0.2331, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.17750632762908936, |
|
"learning_rate": 0.001, |
|
"loss": 0.2326, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.103765107691288, |
|
"learning_rate": 0.001, |
|
"loss": 0.2329, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.11186927556991577, |
|
"learning_rate": 0.001, |
|
"loss": 0.2326, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.04914987459778786, |
|
"learning_rate": 0.001, |
|
"loss": 0.2326, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.09826149046421051, |
|
"learning_rate": 0.001, |
|
"loss": 0.2324, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.08518774062395096, |
|
"learning_rate": 0.001, |
|
"loss": 0.2327, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.12364567071199417, |
|
"learning_rate": 0.001, |
|
"loss": 0.2321, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.10944374650716782, |
|
"learning_rate": 0.001, |
|
"loss": 0.2322, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.08173243701457977, |
|
"learning_rate": 0.001, |
|
"loss": 0.2326, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.17504490911960602, |
|
"learning_rate": 0.001, |
|
"loss": 0.232, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.03396083042025566, |
|
"learning_rate": 0.001, |
|
"loss": 0.2326, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.12226787954568863, |
|
"learning_rate": 0.001, |
|
"loss": 0.2324, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.029385367408394814, |
|
"learning_rate": 0.001, |
|
"loss": 0.2324, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.08070210367441177, |
|
"learning_rate": 0.001, |
|
"loss": 0.2322, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.026348430663347244, |
|
"learning_rate": 0.001, |
|
"loss": 0.2316, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.06884663552045822, |
|
"learning_rate": 0.001, |
|
"loss": 0.2322, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.09100496768951416, |
|
"learning_rate": 0.001, |
|
"loss": 0.3271, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.0949195995926857, |
|
"learning_rate": 0.001, |
|
"loss": 0.2322, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.17315314710140228, |
|
"learning_rate": 0.001, |
|
"loss": 0.232, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.04644012451171875, |
|
"learning_rate": 0.001, |
|
"loss": 0.2317, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.03242076560854912, |
|
"learning_rate": 0.001, |
|
"loss": 0.2317, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.03038044273853302, |
|
"learning_rate": 0.001, |
|
"loss": 0.2322, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.04407713562250137, |
|
"learning_rate": 0.001, |
|
"loss": 0.2321, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.04973585903644562, |
|
"learning_rate": 0.001, |
|
"loss": 0.2321, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.043713077902793884, |
|
"learning_rate": 0.001, |
|
"loss": 0.2319, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.0361105352640152, |
|
"learning_rate": 0.001, |
|
"loss": 0.2319, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.038385313004255295, |
|
"learning_rate": 0.001, |
|
"loss": 0.2319, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.059859637171030045, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.10737486183643341, |
|
"learning_rate": 0.001, |
|
"loss": 0.232, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.07841573655605316, |
|
"learning_rate": 0.001, |
|
"loss": 0.2319, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.12177613377571106, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.04158034175634384, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.04334099590778351, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.04868987202644348, |
|
"learning_rate": 0.001, |
|
"loss": 0.2317, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.11688575893640518, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05144130066037178, |
|
"learning_rate": 0.001, |
|
"loss": 0.2319, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.04202236235141754, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.07848116755485535, |
|
"learning_rate": 0.001, |
|
"loss": 0.2314, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05292198061943054, |
|
"learning_rate": 0.001, |
|
"loss": 0.2317, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.05817991867661476, |
|
"learning_rate": 0.001, |
|
"loss": 0.2318, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.03250608965754509, |
|
"learning_rate": 0.001, |
|
"loss": 0.2316, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.29823893308639526, |
|
"learning_rate": 0.001, |
|
"loss": 0.2311, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.852128505706787, |
|
"learning_rate": 0.001, |
|
"loss": 0.1864, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 61.31148147583008, |
|
"learning_rate": 0.001, |
|
"loss": 0.1911, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.4901123046875, |
|
"learning_rate": 0.001, |
|
"loss": 0.1934, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9580036401748657, |
|
"learning_rate": 0.001, |
|
"loss": 0.1706, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5461576581001282, |
|
"learning_rate": 0.001, |
|
"loss": 0.1597, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.481351375579834, |
|
"learning_rate": 0.001, |
|
"loss": 0.1511, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.3008120656013489, |
|
"learning_rate": 0.001, |
|
"loss": 0.154, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.23753711581230164, |
|
"learning_rate": 0.001, |
|
"loss": 0.1406, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.9201159477233887, |
|
"learning_rate": 0.001, |
|
"loss": 0.1444, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.6734191179275513, |
|
"learning_rate": 0.001, |
|
"loss": 0.1385, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.7249393463134766, |
|
"learning_rate": 0.001, |
|
"loss": 0.1393, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5765690207481384, |
|
"learning_rate": 0.001, |
|
"loss": 0.1397, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.4266449213027954, |
|
"learning_rate": 0.001, |
|
"loss": 0.1386, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.23247841000556946, |
|
"learning_rate": 0.001, |
|
"loss": 0.1343, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.19435954093933105, |
|
"learning_rate": 0.001, |
|
"loss": 0.1306, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.27626514434814453, |
|
"learning_rate": 0.001, |
|
"loss": 0.133, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.1834883689880371, |
|
"learning_rate": 0.001, |
|
"loss": 0.1299, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.4306440055370331, |
|
"learning_rate": 0.001, |
|
"loss": 0.1309, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.15750516951084137, |
|
"learning_rate": 0.001, |
|
"loss": 0.1266, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.2934073805809021, |
|
"learning_rate": 0.001, |
|
"loss": 0.1278, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.27599695324897766, |
|
"learning_rate": 0.001, |
|
"loss": 0.1286, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.39952772855758667, |
|
"learning_rate": 0.001, |
|
"loss": 0.1252, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.4082016348838806, |
|
"learning_rate": 0.001, |
|
"loss": 0.1272, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.303307443857193, |
|
"learning_rate": 0.001, |
|
"loss": 0.1249, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.1597479283809662, |
|
"learning_rate": 0.001, |
|
"loss": 0.1247, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.03666090965271, |
|
"learning_rate": 0.001, |
|
"loss": 0.1286, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.2832247018814087, |
|
"learning_rate": 0.001, |
|
"loss": 0.1248, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.49678078293800354, |
|
"learning_rate": 0.001, |
|
"loss": 0.1258, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.3678058385848999, |
|
"learning_rate": 0.001, |
|
"loss": 0.1256, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.26233455538749695, |
|
"learning_rate": 0.001, |
|
"loss": 0.1233, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.22039958834648132, |
|
"learning_rate": 0.001, |
|
"loss": 0.1197, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.14722639322280884, |
|
"learning_rate": 0.001, |
|
"loss": 0.1225, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.19015900790691376, |
|
"learning_rate": 0.001, |
|
"loss": 0.1217, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.15655829012393951, |
|
"learning_rate": 0.001, |
|
"loss": 0.1185, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.5397889614105225, |
|
"learning_rate": 0.001, |
|
"loss": 0.119, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.845320999622345, |
|
"learning_rate": 0.001, |
|
"loss": 0.1276, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.34136563539505005, |
|
"learning_rate": 0.001, |
|
"loss": 0.122, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.2509533762931824, |
|
"learning_rate": 0.001, |
|
"loss": 0.1199, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.31120267510414124, |
|
"learning_rate": 0.001, |
|
"loss": 0.1191, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.3903524875640869, |
|
"learning_rate": 0.001, |
|
"loss": 0.1183, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.19971555471420288, |
|
"learning_rate": 0.001, |
|
"loss": 0.1177, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.36589089035987854, |
|
"learning_rate": 0.001, |
|
"loss": 0.1158, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.19200453162193298, |
|
"learning_rate": 0.001, |
|
"loss": 0.1166, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6393672823905945, |
|
"learning_rate": 0.001, |
|
"loss": 0.1171, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.32421180605888367, |
|
"learning_rate": 0.001, |
|
"loss": 0.118, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6238926649093628, |
|
"learning_rate": 0.001, |
|
"loss": 0.1166, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.1363907754421234, |
|
"learning_rate": 0.001, |
|
"loss": 0.1156, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.16790109872817993, |
|
"learning_rate": 0.001, |
|
"loss": 0.1142, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.1915178894996643, |
|
"learning_rate": 0.001, |
|
"loss": 0.1126, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.12727123498916626, |
|
"learning_rate": 0.001, |
|
"loss": 0.1156, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.29520758986473083, |
|
"learning_rate": 0.001, |
|
"loss": 0.1129, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.1663757860660553, |
|
"learning_rate": 0.001, |
|
"loss": 0.1132, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.1840706318616867, |
|
"learning_rate": 0.001, |
|
"loss": 0.119, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.16156257688999176, |
|
"learning_rate": 0.001, |
|
"loss": 0.1146, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.17348338663578033, |
|
"learning_rate": 0.001, |
|
"loss": 0.1141, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.18696527183055878, |
|
"learning_rate": 0.001, |
|
"loss": 0.1108, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.15352846682071686, |
|
"learning_rate": 0.001, |
|
"loss": 0.1134, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.23210759460926056, |
|
"learning_rate": 0.001, |
|
"loss": 0.1142, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.18328526616096497, |
|
"learning_rate": 0.001, |
|
"loss": 0.1109, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.17674757540225983, |
|
"learning_rate": 0.001, |
|
"loss": 0.1083, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.34446394443511963, |
|
"learning_rate": 0.001, |
|
"loss": 0.1203, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.22947299480438232, |
|
"learning_rate": 0.001, |
|
"loss": 0.1095, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.15071985125541687, |
|
"learning_rate": 0.001, |
|
"loss": 0.1088, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.14273251593112946, |
|
"learning_rate": 0.001, |
|
"loss": 0.1091, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.20266981422901154, |
|
"learning_rate": 0.001, |
|
"loss": 0.1089, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.1495724767446518, |
|
"learning_rate": 0.001, |
|
"loss": 0.1089, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.1711970865726471, |
|
"learning_rate": 0.001, |
|
"loss": 0.1063, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.20727260410785675, |
|
"learning_rate": 0.001, |
|
"loss": 0.104, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.22724412381649017, |
|
"learning_rate": 0.001, |
|
"loss": 0.1087, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.15561726689338684, |
|
"learning_rate": 0.001, |
|
"loss": 0.1086, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.2139796018600464, |
|
"learning_rate": 0.001, |
|
"loss": 0.1054, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.24371370673179626, |
|
"learning_rate": 0.001, |
|
"loss": 0.1077, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.22944559156894684, |
|
"learning_rate": 0.001, |
|
"loss": 0.1092, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.19578562676906586, |
|
"learning_rate": 0.001, |
|
"loss": 0.1077, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.17588412761688232, |
|
"learning_rate": 0.001, |
|
"loss": 0.1048, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.16697707772254944, |
|
"learning_rate": 0.001, |
|
"loss": 0.1072, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.1927742063999176, |
|
"learning_rate": 0.001, |
|
"loss": 0.1036, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.25396087765693665, |
|
"learning_rate": 0.001, |
|
"loss": 0.1068, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.21014653146266937, |
|
"learning_rate": 0.001, |
|
"loss": 0.1012, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.32085150480270386, |
|
"learning_rate": 0.001, |
|
"loss": 0.1062, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.10534122586250305, |
|
"learning_rate": 0.001, |
|
"loss": 0.103, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.24365462362766266, |
|
"learning_rate": 0.001, |
|
"loss": 0.106, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.15197184681892395, |
|
"learning_rate": 0.001, |
|
"loss": 0.1051, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.23027855157852173, |
|
"learning_rate": 0.001, |
|
"loss": 0.1065, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.14924216270446777, |
|
"learning_rate": 0.001, |
|
"loss": 0.1068, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.13331858813762665, |
|
"learning_rate": 0.001, |
|
"loss": 0.1035, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.20150358974933624, |
|
"learning_rate": 0.001, |
|
"loss": 0.1065, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.1429535299539566, |
|
"learning_rate": 0.001, |
|
"loss": 0.1056, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.16326557099819183, |
|
"learning_rate": 0.001, |
|
"loss": 0.1022, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.15712429583072662, |
|
"learning_rate": 0.001, |
|
"loss": 0.1051, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.33204013109207153, |
|
"learning_rate": 0.001, |
|
"loss": 0.1046, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.17703518271446228, |
|
"learning_rate": 0.001, |
|
"loss": 0.1057, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.14861218631267548, |
|
"learning_rate": 0.001, |
|
"loss": 0.1052, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.18271447718143463, |
|
"learning_rate": 0.001, |
|
"loss": 0.1049, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2245068997144699, |
|
"learning_rate": 0.001, |
|
"loss": 0.1033, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2233046442270279, |
|
"learning_rate": 0.001, |
|
"loss": 0.1049, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1915113776922226, |
|
"learning_rate": 0.001, |
|
"loss": 0.1039, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.1070462241768837, |
|
"learning_rate": 0.001, |
|
"loss": 0.1028, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.14523275196552277, |
|
"learning_rate": 0.001, |
|
"loss": 0.0983, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.24468256533145905, |
|
"learning_rate": 0.001, |
|
"loss": 0.1018, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.17596426606178284, |
|
"learning_rate": 0.001, |
|
"loss": 0.1017, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.15113884210586548, |
|
"learning_rate": 0.001, |
|
"loss": 0.1022, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1756398230791092, |
|
"learning_rate": 0.001, |
|
"loss": 0.1032, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.1491193026304245, |
|
"learning_rate": 0.001, |
|
"loss": 0.1016, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.15422752499580383, |
|
"learning_rate": 0.001, |
|
"loss": 0.0989, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.13713973760604858, |
|
"learning_rate": 0.001, |
|
"loss": 0.1002, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.16012702882289886, |
|
"learning_rate": 0.001, |
|
"loss": 0.101, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.23414984345436096, |
|
"learning_rate": 0.001, |
|
"loss": 0.0975, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.13922521471977234, |
|
"learning_rate": 0.001, |
|
"loss": 0.1002, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.14608104526996613, |
|
"learning_rate": 0.001, |
|
"loss": 0.098, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.19267164170742035, |
|
"learning_rate": 0.001, |
|
"loss": 0.098, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1570904552936554, |
|
"learning_rate": 0.001, |
|
"loss": 0.1034, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3922866880893707, |
|
"learning_rate": 0.001, |
|
"loss": 0.1008, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.20500238239765167, |
|
"learning_rate": 0.001, |
|
"loss": 0.1025, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2044358104467392, |
|
"learning_rate": 0.001, |
|
"loss": 0.0982, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1722269356250763, |
|
"learning_rate": 0.001, |
|
"loss": 0.1007, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.21868231892585754, |
|
"learning_rate": 0.001, |
|
"loss": 0.1, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.12817895412445068, |
|
"learning_rate": 0.001, |
|
"loss": 0.1, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.12333246320486069, |
|
"learning_rate": 0.001, |
|
"loss": 0.0987, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1742565631866455, |
|
"learning_rate": 0.001, |
|
"loss": 0.0981, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.15747936069965363, |
|
"learning_rate": 0.001, |
|
"loss": 0.1012, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.27314338088035583, |
|
"learning_rate": 0.001, |
|
"loss": 0.1014, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.9368189573287964, |
|
"learning_rate": 0.001, |
|
"loss": 0.1035, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.3574996590614319, |
|
"learning_rate": 0.001, |
|
"loss": 0.0992, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.28280141949653625, |
|
"learning_rate": 0.001, |
|
"loss": 0.0975, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.21435654163360596, |
|
"learning_rate": 0.001, |
|
"loss": 0.0998, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.20617541670799255, |
|
"learning_rate": 0.001, |
|
"loss": 0.0994, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.21885354816913605, |
|
"learning_rate": 0.001, |
|
"loss": 0.099, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.24429431557655334, |
|
"learning_rate": 0.001, |
|
"loss": 0.1018, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.24264854192733765, |
|
"learning_rate": 0.001, |
|
"loss": 0.1009, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.19410717487335205, |
|
"learning_rate": 0.001, |
|
"loss": 0.1007, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.15938735008239746, |
|
"learning_rate": 0.001, |
|
"loss": 0.0965, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.678229808807373, |
|
"learning_rate": 0.001, |
|
"loss": 0.1001, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.2967202663421631, |
|
"learning_rate": 0.001, |
|
"loss": 0.1003, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7940108180046082, |
|
"learning_rate": 0.001, |
|
"loss": 0.1001, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.24995733797550201, |
|
"learning_rate": 0.001, |
|
"loss": 0.0992, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.1626627892255783, |
|
"learning_rate": 0.001, |
|
"loss": 0.0992, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.21141190826892853, |
|
"learning_rate": 0.001, |
|
"loss": 0.0961, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.21122020483016968, |
|
"learning_rate": 0.001, |
|
"loss": 0.0968, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.2558838725090027, |
|
"learning_rate": 0.001, |
|
"loss": 0.098, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.1975196897983551, |
|
"learning_rate": 0.001, |
|
"loss": 0.0987, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.14767397940158844, |
|
"learning_rate": 0.001, |
|
"loss": 0.096, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.17532730102539062, |
|
"learning_rate": 0.001, |
|
"loss": 0.0985, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.1320209801197052, |
|
"learning_rate": 0.001, |
|
"loss": 0.0968, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.273934930562973, |
|
"learning_rate": 0.001, |
|
"loss": 0.0978, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.15103434026241302, |
|
"learning_rate": 0.001, |
|
"loss": 0.0995, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.2021692842245102, |
|
"learning_rate": 0.001, |
|
"loss": 0.0952, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.1648433655500412, |
|
"learning_rate": 0.001, |
|
"loss": 0.0938, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.17460817098617554, |
|
"learning_rate": 0.001, |
|
"loss": 0.0959, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.15195918083190918, |
|
"learning_rate": 0.001, |
|
"loss": 0.094, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.1664193570613861, |
|
"learning_rate": 0.001, |
|
"loss": 0.094, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.14700663089752197, |
|
"learning_rate": 0.001, |
|
"loss": 0.0951, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.22301018238067627, |
|
"learning_rate": 0.001, |
|
"loss": 0.0919, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1666121482849121, |
|
"learning_rate": 0.001, |
|
"loss": 0.0928, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.1971474438905716, |
|
"learning_rate": 0.001, |
|
"loss": 0.0949, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.15959730744361877, |
|
"learning_rate": 0.001, |
|
"loss": 0.095, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.29146862030029297, |
|
"learning_rate": 0.001, |
|
"loss": 0.0942, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.15853939950466156, |
|
"learning_rate": 0.001, |
|
"loss": 0.0978, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.16822876036167145, |
|
"learning_rate": 0.001, |
|
"loss": 0.0934, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.15456752479076385, |
|
"learning_rate": 0.001, |
|
"loss": 0.0948, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.15123625099658966, |
|
"learning_rate": 0.001, |
|
"loss": 0.0926, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.16344180703163147, |
|
"learning_rate": 0.001, |
|
"loss": 0.0935, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.22936996817588806, |
|
"learning_rate": 0.001, |
|
"loss": 0.0936, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.16810204088687897, |
|
"learning_rate": 0.001, |
|
"loss": 0.0978, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.14977198839187622, |
|
"learning_rate": 0.001, |
|
"loss": 0.0936, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.18207716941833496, |
|
"learning_rate": 0.001, |
|
"loss": 0.093, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.2584002912044525, |
|
"learning_rate": 0.001, |
|
"loss": 0.0958, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.23717880249023438, |
|
"learning_rate": 0.001, |
|
"loss": 0.0927, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.1896461844444275, |
|
"learning_rate": 0.001, |
|
"loss": 0.094, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.21543921530246735, |
|
"learning_rate": 0.001, |
|
"loss": 0.0953, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.14013002812862396, |
|
"learning_rate": 0.001, |
|
"loss": 0.0958, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.1744927018880844, |
|
"learning_rate": 0.001, |
|
"loss": 0.0946, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.16546490788459778, |
|
"learning_rate": 0.001, |
|
"loss": 0.0962, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.16227766871452332, |
|
"learning_rate": 0.001, |
|
"loss": 0.0952, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.181349515914917, |
|
"learning_rate": 0.001, |
|
"loss": 0.0951, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.20408563315868378, |
|
"learning_rate": 0.001, |
|
"loss": 0.0915, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.1793171763420105, |
|
"learning_rate": 0.001, |
|
"loss": 0.0942, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.14634822309017181, |
|
"learning_rate": 0.001, |
|
"loss": 0.0961, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.18879148364067078, |
|
"learning_rate": 0.001, |
|
"loss": 0.0942, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.20523515343666077, |
|
"learning_rate": 0.001, |
|
"loss": 0.0912, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.18672947585582733, |
|
"learning_rate": 0.001, |
|
"loss": 0.092, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.18561910092830658, |
|
"learning_rate": 0.001, |
|
"loss": 0.0913, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.23991861939430237, |
|
"learning_rate": 0.001, |
|
"loss": 0.0925, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.1660347878932953, |
|
"learning_rate": 0.001, |
|
"loss": 0.0939, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.2105019986629486, |
|
"learning_rate": 0.001, |
|
"loss": 0.093, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.2271376997232437, |
|
"learning_rate": 0.001, |
|
"loss": 0.0899, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.14487460255622864, |
|
"learning_rate": 0.001, |
|
"loss": 0.0906, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.1597098708152771, |
|
"learning_rate": 0.001, |
|
"loss": 0.0919, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.18633900582790375, |
|
"learning_rate": 0.001, |
|
"loss": 0.0892, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.12663201987743378, |
|
"learning_rate": 0.001, |
|
"loss": 0.0913, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.17320451140403748, |
|
"learning_rate": 0.001, |
|
"loss": 0.0911, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.16872632503509521, |
|
"learning_rate": 0.001, |
|
"loss": 0.091, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.18602560460567474, |
|
"learning_rate": 0.001, |
|
"loss": 0.0908, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.17392034828662872, |
|
"learning_rate": 0.001, |
|
"loss": 0.0882, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.10278663039207458, |
|
"learning_rate": 0.001, |
|
"loss": 0.088, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.15355843305587769, |
|
"learning_rate": 0.001, |
|
"loss": 0.0876, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.17331954836845398, |
|
"learning_rate": 0.001, |
|
"loss": 0.0906, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.16750375926494598, |
|
"learning_rate": 0.001, |
|
"loss": 0.0935, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.27208462357521057, |
|
"learning_rate": 0.001, |
|
"loss": 0.0884, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.2215784639120102, |
|
"learning_rate": 0.001, |
|
"loss": 0.0904, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.1542549580335617, |
|
"learning_rate": 0.001, |
|
"loss": 0.0903, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.22874318063259125, |
|
"learning_rate": 0.001, |
|
"loss": 0.0889, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.22677820920944214, |
|
"learning_rate": 0.001, |
|
"loss": 0.0915, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.22208420932292938, |
|
"learning_rate": 0.001, |
|
"loss": 0.0902, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.18172180652618408, |
|
"learning_rate": 0.001, |
|
"loss": 0.091, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.264664888381958, |
|
"learning_rate": 0.001, |
|
"loss": 0.091, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.15961118042469025, |
|
"learning_rate": 0.001, |
|
"loss": 0.0864, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.16828449070453644, |
|
"learning_rate": 0.001, |
|
"loss": 0.0902, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.25299304723739624, |
|
"learning_rate": 0.001, |
|
"loss": 0.0895, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.2019224911928177, |
|
"learning_rate": 0.001, |
|
"loss": 0.0887, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.19100870192050934, |
|
"learning_rate": 0.001, |
|
"loss": 0.0897, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.25321510434150696, |
|
"learning_rate": 0.001, |
|
"loss": 0.092, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.18171149492263794, |
|
"learning_rate": 0.001, |
|
"loss": 0.089, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.19380785524845123, |
|
"learning_rate": 0.001, |
|
"loss": 0.0895, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.18437138199806213, |
|
"learning_rate": 0.001, |
|
"loss": 0.0903, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.1717921495437622, |
|
"learning_rate": 0.001, |
|
"loss": 0.0885, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.23623107373714447, |
|
"learning_rate": 0.001, |
|
"loss": 0.0882, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.17992794513702393, |
|
"learning_rate": 0.001, |
|
"loss": 0.0885, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.19958259165287018, |
|
"learning_rate": 0.001, |
|
"loss": 0.088, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.14418841898441315, |
|
"learning_rate": 0.001, |
|
"loss": 0.0908, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.13934949040412903, |
|
"learning_rate": 0.001, |
|
"loss": 0.0919, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.1410313993692398, |
|
"learning_rate": 0.001, |
|
"loss": 0.0891, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.27084311842918396, |
|
"learning_rate": 0.001, |
|
"loss": 0.0917, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.18704760074615479, |
|
"learning_rate": 0.001, |
|
"loss": 0.0866, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.16178588569164276, |
|
"learning_rate": 0.001, |
|
"loss": 0.088, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.1699521839618683, |
|
"learning_rate": 0.001, |
|
"loss": 0.0891, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.21340341866016388, |
|
"learning_rate": 0.001, |
|
"loss": 0.0871, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.21089456975460052, |
|
"learning_rate": 0.001, |
|
"loss": 0.0898, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.17899860441684723, |
|
"learning_rate": 0.001, |
|
"loss": 0.0874, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.2222578376531601, |
|
"learning_rate": 0.001, |
|
"loss": 0.0875, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.22845357656478882, |
|
"learning_rate": 0.001, |
|
"loss": 0.0895, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.22213339805603027, |
|
"learning_rate": 0.001, |
|
"loss": 0.0877, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.1989658772945404, |
|
"learning_rate": 0.001, |
|
"loss": 0.09, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.28217941522598267, |
|
"learning_rate": 0.001, |
|
"loss": 0.0869, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.1880946159362793, |
|
"learning_rate": 0.001, |
|
"loss": 0.0895, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.2522743046283722, |
|
"learning_rate": 0.001, |
|
"loss": 0.0876, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.15146856009960175, |
|
"learning_rate": 0.001, |
|
"loss": 0.0892, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.20138536393642426, |
|
"learning_rate": 0.001, |
|
"loss": 0.0897, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.19894324243068695, |
|
"learning_rate": 0.001, |
|
"loss": 0.089, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.20011819899082184, |
|
"learning_rate": 0.001, |
|
"loss": 0.0877, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.22739243507385254, |
|
"learning_rate": 0.001, |
|
"loss": 0.0875, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.16710792481899261, |
|
"learning_rate": 0.001, |
|
"loss": 0.0852, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.20454761385917664, |
|
"learning_rate": 0.001, |
|
"loss": 0.0862, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.12356776744127274, |
|
"learning_rate": 0.001, |
|
"loss": 0.0854, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.18977922201156616, |
|
"learning_rate": 0.001, |
|
"loss": 0.0915, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.18791726231575012, |
|
"learning_rate": 0.001, |
|
"loss": 0.0842, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.23529213666915894, |
|
"learning_rate": 0.001, |
|
"loss": 0.086, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.25430527329444885, |
|
"learning_rate": 0.001, |
|
"loss": 0.0833, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.22178427875041962, |
|
"learning_rate": 0.001, |
|
"loss": 0.0874, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.27455243468284607, |
|
"learning_rate": 0.001, |
|
"loss": 0.0845, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.1998920440673828, |
|
"learning_rate": 0.001, |
|
"loss": 0.0869, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1991311013698578, |
|
"learning_rate": 0.001, |
|
"loss": 0.0873, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.2600191831588745, |
|
"learning_rate": 0.001, |
|
"loss": 0.0869, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.16889439523220062, |
|
"learning_rate": 0.001, |
|
"loss": 0.0841, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.17337612807750702, |
|
"learning_rate": 0.001, |
|
"loss": 0.0847, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.12141957134008408, |
|
"learning_rate": 0.001, |
|
"loss": 0.0873, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.30542996525764465, |
|
"learning_rate": 0.001, |
|
"loss": 0.086, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.256072461605072, |
|
"learning_rate": 0.001, |
|
"loss": 0.0845, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.19596265256404877, |
|
"learning_rate": 0.001, |
|
"loss": 0.0847, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.17981210350990295, |
|
"learning_rate": 0.001, |
|
"loss": 0.0853, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.18695278465747833, |
|
"learning_rate": 0.001, |
|
"loss": 0.0867, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.20189203321933746, |
|
"learning_rate": 0.001, |
|
"loss": 0.0867, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.20751608908176422, |
|
"learning_rate": 0.001, |
|
"loss": 0.0855, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.15412236750125885, |
|
"learning_rate": 0.001, |
|
"loss": 0.0876, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.21551938354969025, |
|
"learning_rate": 0.001, |
|
"loss": 0.0854, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.15149344503879547, |
|
"learning_rate": 0.001, |
|
"loss": 0.0863, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.21960322558879852, |
|
"learning_rate": 0.001, |
|
"loss": 0.0913, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.317090779542923, |
|
"learning_rate": 0.001, |
|
"loss": 0.0832, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.20051142573356628, |
|
"learning_rate": 0.001, |
|
"loss": 0.0856, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.1955852061510086, |
|
"learning_rate": 0.001, |
|
"loss": 0.0867, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.13714253902435303, |
|
"learning_rate": 0.001, |
|
"loss": 0.0864, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.18536311388015747, |
|
"learning_rate": 0.001, |
|
"loss": 0.0868, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1795514076948166, |
|
"learning_rate": 0.001, |
|
"loss": 0.0829, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1465149074792862, |
|
"learning_rate": 0.001, |
|
"loss": 0.0851, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.17687107622623444, |
|
"learning_rate": 0.001, |
|
"loss": 0.0861, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1795363575220108, |
|
"learning_rate": 0.001, |
|
"loss": 0.0822, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1741327941417694, |
|
"learning_rate": 0.001, |
|
"loss": 0.0847, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.2547447681427002, |
|
"learning_rate": 0.001, |
|
"loss": 0.0862, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.16002462804317474, |
|
"learning_rate": 0.001, |
|
"loss": 0.0856, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.14787407219409943, |
|
"learning_rate": 0.001, |
|
"loss": 0.0844, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.23449848592281342, |
|
"learning_rate": 0.001, |
|
"loss": 0.0823, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18626731634140015, |
|
"learning_rate": 0.001, |
|
"loss": 0.0804, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.1434779316186905, |
|
"learning_rate": 0.001, |
|
"loss": 0.0844, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.1594706028699875, |
|
"learning_rate": 0.001, |
|
"loss": 0.0869, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18195496499538422, |
|
"learning_rate": 0.001, |
|
"loss": 0.0846, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18613013625144958, |
|
"learning_rate": 0.001, |
|
"loss": 0.0872, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.16158261895179749, |
|
"learning_rate": 0.001, |
|
"loss": 0.0846, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.17811179161071777, |
|
"learning_rate": 0.001, |
|
"loss": 0.0832, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.24112731218338013, |
|
"learning_rate": 0.001, |
|
"loss": 0.0804, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.1778961569070816, |
|
"learning_rate": 0.001, |
|
"loss": 0.0837, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.18162128329277039, |
|
"learning_rate": 0.001, |
|
"loss": 0.0867, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.15079495310783386, |
|
"learning_rate": 0.001, |
|
"loss": 0.0829, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.26986435055732727, |
|
"learning_rate": 0.001, |
|
"loss": 0.0843, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.2643984854221344, |
|
"learning_rate": 0.001, |
|
"loss": 0.0829, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.281751424074173, |
|
"learning_rate": 0.001, |
|
"loss": 0.0821, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.23095449805259705, |
|
"learning_rate": 0.001, |
|
"loss": 0.0836, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.18625666201114655, |
|
"learning_rate": 0.001, |
|
"loss": 0.0831, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.13689708709716797, |
|
"learning_rate": 0.001, |
|
"loss": 0.0839, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.14063656330108643, |
|
"learning_rate": 0.001, |
|
"loss": 0.0817, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.1880202442407608, |
|
"learning_rate": 0.001, |
|
"loss": 0.082, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.15921075642108917, |
|
"learning_rate": 0.001, |
|
"loss": 0.0789, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.1744866818189621, |
|
"learning_rate": 0.001, |
|
"loss": 0.0818, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.26724693179130554, |
|
"learning_rate": 0.001, |
|
"loss": 0.0847, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.14382457733154297, |
|
"learning_rate": 0.001, |
|
"loss": 0.0829, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.14012865722179413, |
|
"learning_rate": 0.001, |
|
"loss": 0.082, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.24175578355789185, |
|
"learning_rate": 0.001, |
|
"loss": 0.0835, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.3397182822227478, |
|
"learning_rate": 0.001, |
|
"loss": 0.081, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.1553467959165573, |
|
"learning_rate": 0.001, |
|
"loss": 0.0829, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.20726840198040009, |
|
"learning_rate": 0.001, |
|
"loss": 0.083, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.21219220757484436, |
|
"learning_rate": 0.001, |
|
"loss": 0.084, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.19203193485736847, |
|
"learning_rate": 0.001, |
|
"loss": 0.0819, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.22557440400123596, |
|
"learning_rate": 0.001, |
|
"loss": 0.0803, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.23452799022197723, |
|
"learning_rate": 0.001, |
|
"loss": 0.0806, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.28543928265571594, |
|
"learning_rate": 0.001, |
|
"loss": 0.0827, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.19713571667671204, |
|
"learning_rate": 0.001, |
|
"loss": 0.08, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.18496285378932953, |
|
"learning_rate": 0.001, |
|
"loss": 0.0841, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1363649070262909, |
|
"learning_rate": 0.001, |
|
"loss": 0.0813, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1736011952161789, |
|
"learning_rate": 0.001, |
|
"loss": 0.0796, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.21385334432125092, |
|
"learning_rate": 0.001, |
|
"loss": 0.0814, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2105669230222702, |
|
"learning_rate": 0.001, |
|
"loss": 0.0816, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2278176248073578, |
|
"learning_rate": 0.001, |
|
"loss": 0.0825, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.17637114226818085, |
|
"learning_rate": 0.001, |
|
"loss": 0.0812, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.20035295188426971, |
|
"learning_rate": 0.001, |
|
"loss": 0.0853, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.25408777594566345, |
|
"learning_rate": 0.001, |
|
"loss": 0.0811, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.2177010476589203, |
|
"learning_rate": 0.001, |
|
"loss": 0.0796, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.1639321744441986, |
|
"learning_rate": 0.001, |
|
"loss": 0.0824, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.15798155963420868, |
|
"learning_rate": 0.001, |
|
"loss": 0.0834, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.14857494831085205, |
|
"learning_rate": 0.001, |
|
"loss": 0.0825, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.15640319883823395, |
|
"learning_rate": 0.001, |
|
"loss": 0.0814, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.1530522108078003, |
|
"learning_rate": 0.001, |
|
"loss": 0.0825, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.2990354001522064, |
|
"learning_rate": 0.001, |
|
"loss": 0.0785, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.19239626824855804, |
|
"learning_rate": 0.001, |
|
"loss": 0.0809, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.13975249230861664, |
|
"learning_rate": 0.001, |
|
"loss": 0.0825, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.22527189552783966, |
|
"learning_rate": 0.001, |
|
"loss": 0.0819, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.3547128438949585, |
|
"learning_rate": 0.001, |
|
"loss": 0.1013, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.22032135725021362, |
|
"learning_rate": 0.001, |
|
"loss": 0.0806, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.12712807953357697, |
|
"learning_rate": 0.001, |
|
"loss": 0.0791, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.29608944058418274, |
|
"learning_rate": 0.001, |
|
"loss": 0.0783, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.23063918948173523, |
|
"learning_rate": 0.001, |
|
"loss": 0.0828, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.19996796548366547, |
|
"learning_rate": 0.001, |
|
"loss": 0.0813, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.19479811191558838, |
|
"learning_rate": 0.001, |
|
"loss": 0.0811, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.1822797805070877, |
|
"learning_rate": 0.001, |
|
"loss": 0.0796, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.36260533332824707, |
|
"learning_rate": 0.001, |
|
"loss": 0.0797, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.14315147697925568, |
|
"learning_rate": 0.001, |
|
"loss": 0.0869, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.20261742174625397, |
|
"learning_rate": 0.001, |
|
"loss": 0.1856, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.18873733282089233, |
|
"learning_rate": 0.001, |
|
"loss": 0.0775, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.2189916968345642, |
|
"learning_rate": 0.001, |
|
"loss": 0.0796, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.1823868304491043, |
|
"learning_rate": 0.001, |
|
"loss": 0.0822, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.2595207691192627, |
|
"learning_rate": 0.001, |
|
"loss": 0.0776, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.1713092178106308, |
|
"learning_rate": 0.001, |
|
"loss": 0.0811, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.24840323626995087, |
|
"learning_rate": 0.001, |
|
"loss": 0.104, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.23451556265354156, |
|
"learning_rate": 0.001, |
|
"loss": 0.077, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2142404466867447, |
|
"learning_rate": 0.001, |
|
"loss": 0.0789, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.22932325303554535, |
|
"learning_rate": 0.001, |
|
"loss": 0.0778, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2027159184217453, |
|
"learning_rate": 0.001, |
|
"loss": 0.0794, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.22258317470550537, |
|
"learning_rate": 0.001, |
|
"loss": 0.0787, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2979215681552887, |
|
"learning_rate": 0.001, |
|
"loss": 0.0767, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2110917568206787, |
|
"learning_rate": 0.001, |
|
"loss": 0.0782, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.24181802570819855, |
|
"learning_rate": 0.001, |
|
"loss": 0.0804, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.1810845136642456, |
|
"learning_rate": 0.001, |
|
"loss": 0.0786, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.23404444754123688, |
|
"learning_rate": 0.001, |
|
"loss": 0.0785, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2591089904308319, |
|
"learning_rate": 0.001, |
|
"loss": 0.0765, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.22720029950141907, |
|
"learning_rate": 0.001, |
|
"loss": 0.0798, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.22449086606502533, |
|
"learning_rate": 0.001, |
|
"loss": 0.0766, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2302643209695816, |
|
"learning_rate": 0.001, |
|
"loss": 0.0798, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2040921300649643, |
|
"learning_rate": 0.001, |
|
"loss": 0.0841, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.21232621371746063, |
|
"learning_rate": 0.001, |
|
"loss": 0.0789, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.20054876804351807, |
|
"learning_rate": 0.001, |
|
"loss": 0.0779, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.24335692822933197, |
|
"learning_rate": 0.001, |
|
"loss": 0.0784, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.22172445058822632, |
|
"learning_rate": 0.001, |
|
"loss": 0.0797, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.20524169504642487, |
|
"learning_rate": 0.001, |
|
"loss": 0.0803, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.17150288820266724, |
|
"learning_rate": 0.001, |
|
"loss": 0.0791, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.38285690546035767, |
|
"learning_rate": 0.001, |
|
"loss": 0.079, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.16937342286109924, |
|
"learning_rate": 0.001, |
|
"loss": 0.0791, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.19271647930145264, |
|
"learning_rate": 0.001, |
|
"loss": 0.079, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.20048774778842926, |
|
"learning_rate": 0.001, |
|
"loss": 0.0797, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.2141706347465515, |
|
"learning_rate": 0.001, |
|
"loss": 0.0798, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.20665834844112396, |
|
"learning_rate": 0.001, |
|
"loss": 0.0778, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.18385255336761475, |
|
"learning_rate": 0.001, |
|
"loss": 0.0779, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.22467826306819916, |
|
"learning_rate": 0.001, |
|
"loss": 0.0732, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.18363313376903534, |
|
"learning_rate": 0.001, |
|
"loss": 0.0796, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2288578897714615, |
|
"learning_rate": 0.001, |
|
"loss": 0.0763, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2535518407821655, |
|
"learning_rate": 0.001, |
|
"loss": 0.0791, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.20715934038162231, |
|
"learning_rate": 0.001, |
|
"loss": 0.0777, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.12203960865736008, |
|
"learning_rate": 0.001, |
|
"loss": 0.0805, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.138369619846344, |
|
"learning_rate": 0.001, |
|
"loss": 0.0768, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2319127321243286, |
|
"learning_rate": 0.001, |
|
"loss": 0.0784, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2058788686990738, |
|
"learning_rate": 0.001, |
|
"loss": 0.0783, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.21334126591682434, |
|
"learning_rate": 0.001, |
|
"loss": 0.0763, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.23397529125213623, |
|
"learning_rate": 0.001, |
|
"loss": 0.081, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.24460141360759735, |
|
"learning_rate": 0.001, |
|
"loss": 0.0752, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.22441798448562622, |
|
"learning_rate": 0.001, |
|
"loss": 0.0779, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20988881587982178, |
|
"learning_rate": 0.001, |
|
"loss": 0.08, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.17863024771213531, |
|
"learning_rate": 0.001, |
|
"loss": 0.0787, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.17980898916721344, |
|
"learning_rate": 0.001, |
|
"loss": 0.0802, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2614147961139679, |
|
"learning_rate": 0.001, |
|
"loss": 0.0787, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.16281504929065704, |
|
"learning_rate": 0.001, |
|
"loss": 0.0779, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.3099921941757202, |
|
"learning_rate": 0.001, |
|
"loss": 0.0747, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2542015016078949, |
|
"learning_rate": 0.001, |
|
"loss": 0.0831, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.17419801652431488, |
|
"learning_rate": 0.001, |
|
"loss": 0.0787, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2089216262102127, |
|
"learning_rate": 0.001, |
|
"loss": 0.0781, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.26476818323135376, |
|
"learning_rate": 0.001, |
|
"loss": 0.0792, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.18907053768634796, |
|
"learning_rate": 0.001, |
|
"loss": 0.078, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2528514564037323, |
|
"learning_rate": 0.001, |
|
"loss": 0.0791, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2794158458709717, |
|
"learning_rate": 0.001, |
|
"loss": 0.0799, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.24547474086284637, |
|
"learning_rate": 0.001, |
|
"loss": 0.0765, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.17239224910736084, |
|
"learning_rate": 0.001, |
|
"loss": 0.0807, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.22998745739459991, |
|
"learning_rate": 0.001, |
|
"loss": 0.079, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2727990746498108, |
|
"learning_rate": 0.001, |
|
"loss": 0.078, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2488749623298645, |
|
"learning_rate": 0.001, |
|
"loss": 0.0757, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.20260153710842133, |
|
"learning_rate": 0.001, |
|
"loss": 0.0787, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.30832308530807495, |
|
"learning_rate": 0.001, |
|
"loss": 0.0789, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.17934545874595642, |
|
"learning_rate": 0.001, |
|
"loss": 0.0768, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1972292810678482, |
|
"learning_rate": 0.001, |
|
"loss": 0.0786, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1899816393852234, |
|
"learning_rate": 0.001, |
|
"loss": 0.0782, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.17765800654888153, |
|
"learning_rate": 0.001, |
|
"loss": 0.0784, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3285583555698395, |
|
"learning_rate": 0.001, |
|
"loss": 0.0793, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2769279181957245, |
|
"learning_rate": 0.001, |
|
"loss": 0.0818, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1661899834871292, |
|
"learning_rate": 0.001, |
|
"loss": 0.1088, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.32694903016090393, |
|
"learning_rate": 0.001, |
|
"loss": 0.0799, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1976955235004425, |
|
"learning_rate": 0.001, |
|
"loss": 0.0768, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.2623777687549591, |
|
"learning_rate": 0.001, |
|
"loss": 0.0764, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.19917914271354675, |
|
"learning_rate": 0.001, |
|
"loss": 0.079, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.22838640213012695, |
|
"learning_rate": 0.001, |
|
"loss": 0.076, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.1831175684928894, |
|
"learning_rate": 0.001, |
|
"loss": 0.0744, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.1774362176656723, |
|
"learning_rate": 0.001, |
|
"loss": 0.076, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.24986374378204346, |
|
"learning_rate": 0.001, |
|
"loss": 0.0754, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.15164266526699066, |
|
"learning_rate": 0.001, |
|
"loss": 0.0757, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.19118934869766235, |
|
"learning_rate": 0.001, |
|
"loss": 0.0787, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.1625840663909912, |
|
"learning_rate": 0.001, |
|
"loss": 0.0778, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.14519533514976501, |
|
"learning_rate": 0.001, |
|
"loss": 0.077, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.16799670457839966, |
|
"learning_rate": 0.001, |
|
"loss": 0.0764, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.15635591745376587, |
|
"learning_rate": 0.001, |
|
"loss": 0.0738, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.25875189900398254, |
|
"learning_rate": 0.001, |
|
"loss": 0.0757, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.2601448595523834, |
|
"learning_rate": 0.001, |
|
"loss": 0.0721, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.20097233355045319, |
|
"learning_rate": 0.001, |
|
"loss": 0.0764, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.17383421957492828, |
|
"learning_rate": 0.001, |
|
"loss": 0.0768, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.152663916349411, |
|
"learning_rate": 0.001, |
|
"loss": 0.0747, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.1773347705602646, |
|
"learning_rate": 0.001, |
|
"loss": 0.0743, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.15975210070610046, |
|
"learning_rate": 0.001, |
|
"loss": 0.0769, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.27663958072662354, |
|
"learning_rate": 0.001, |
|
"loss": 0.0747, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.20124509930610657, |
|
"learning_rate": 0.001, |
|
"loss": 0.0755, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19016942381858826, |
|
"learning_rate": 0.001, |
|
"loss": 0.0709, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.34517988562583923, |
|
"learning_rate": 0.001, |
|
"loss": 0.0751, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.27312055230140686, |
|
"learning_rate": 0.001, |
|
"loss": 0.0761, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.2835043668746948, |
|
"learning_rate": 0.001, |
|
"loss": 0.0731, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.1630600243806839, |
|
"learning_rate": 0.001, |
|
"loss": 0.0741, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.2430613487958908, |
|
"learning_rate": 0.001, |
|
"loss": 0.0767, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19533057510852814, |
|
"learning_rate": 0.001, |
|
"loss": 0.077, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.21139401197433472, |
|
"learning_rate": 0.001, |
|
"loss": 0.0711, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.18416912853717804, |
|
"learning_rate": 0.001, |
|
"loss": 0.0729, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.24703727662563324, |
|
"learning_rate": 0.001, |
|
"loss": 0.071, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.14476247131824493, |
|
"learning_rate": 0.001, |
|
"loss": 0.0754, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.210220068693161, |
|
"learning_rate": 0.001, |
|
"loss": 0.0738, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.16544660925865173, |
|
"learning_rate": 0.001, |
|
"loss": 0.072, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.17049700021743774, |
|
"learning_rate": 0.001, |
|
"loss": 0.0728, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.18656505644321442, |
|
"learning_rate": 0.001, |
|
"loss": 0.0739, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.19484791159629822, |
|
"learning_rate": 0.001, |
|
"loss": 0.0748, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.1982715129852295, |
|
"learning_rate": 0.001, |
|
"loss": 0.0729, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.2108699083328247, |
|
"learning_rate": 0.001, |
|
"loss": 0.0735, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.23962444067001343, |
|
"learning_rate": 0.001, |
|
"loss": 0.0703, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.29319801926612854, |
|
"learning_rate": 0.001, |
|
"loss": 0.0735, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.1804085075855255, |
|
"learning_rate": 0.001, |
|
"loss": 0.0719, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2394474297761917, |
|
"learning_rate": 0.001, |
|
"loss": 0.0721, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.20954197645187378, |
|
"learning_rate": 0.001, |
|
"loss": 0.0745, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.17135080695152283, |
|
"learning_rate": 0.001, |
|
"loss": 0.0728, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.3152260482311249, |
|
"learning_rate": 0.001, |
|
"loss": 0.0735, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.22659769654273987, |
|
"learning_rate": 0.001, |
|
"loss": 0.0752, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2605753540992737, |
|
"learning_rate": 0.001, |
|
"loss": 0.073, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2309567779302597, |
|
"learning_rate": 0.001, |
|
"loss": 0.0744, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.19917166233062744, |
|
"learning_rate": 0.001, |
|
"loss": 0.073, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.2609159052371979, |
|
"learning_rate": 0.001, |
|
"loss": 0.0705, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.26976123452186584, |
|
"learning_rate": 0.001, |
|
"loss": 0.0731, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.25275784730911255, |
|
"learning_rate": 0.001, |
|
"loss": 0.0808, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.2392340749502182, |
|
"learning_rate": 0.001, |
|
"loss": 0.0763, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.27718254923820496, |
|
"learning_rate": 0.001, |
|
"loss": 0.0743, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.19996067881584167, |
|
"learning_rate": 0.001, |
|
"loss": 0.0807, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.16322393715381622, |
|
"learning_rate": 0.001, |
|
"loss": 0.0753, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.25598809123039246, |
|
"learning_rate": 0.001, |
|
"loss": 0.0773, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.15482768416404724, |
|
"learning_rate": 0.001, |
|
"loss": 0.0729, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.4033351242542267, |
|
"learning_rate": 0.001, |
|
"loss": 0.0773, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.2869590222835541, |
|
"learning_rate": 0.001, |
|
"loss": 0.0732, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.19079795479774475, |
|
"learning_rate": 0.001, |
|
"loss": 0.0712, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.21604031324386597, |
|
"learning_rate": 0.001, |
|
"loss": 0.0714, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.23917321860790253, |
|
"learning_rate": 0.001, |
|
"loss": 0.0743, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.16785088181495667, |
|
"learning_rate": 0.001, |
|
"loss": 0.0722, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.22009502351284027, |
|
"learning_rate": 0.001, |
|
"loss": 0.0738, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.23401811718940735, |
|
"learning_rate": 0.001, |
|
"loss": 0.0759, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.19278208911418915, |
|
"learning_rate": 0.001, |
|
"loss": 0.0738, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.22170820832252502, |
|
"learning_rate": 0.001, |
|
"loss": 0.07, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.2148713767528534, |
|
"learning_rate": 0.001, |
|
"loss": 0.0716, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.2093653529882431, |
|
"learning_rate": 0.001, |
|
"loss": 0.0722, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.2912674844264984, |
|
"learning_rate": 0.001, |
|
"loss": 0.0738, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.3146283030509949, |
|
"learning_rate": 0.001, |
|
"loss": 0.0735, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.2355007380247116, |
|
"learning_rate": 0.001, |
|
"loss": 0.0719, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.19035007059574127, |
|
"learning_rate": 0.001, |
|
"loss": 0.0699, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.13338258862495422, |
|
"learning_rate": 0.001, |
|
"loss": 0.0727, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.22755542397499084, |
|
"learning_rate": 0.001, |
|
"loss": 0.072, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.23752057552337646, |
|
"learning_rate": 0.001, |
|
"loss": 0.0703, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.20008322596549988, |
|
"learning_rate": 0.001, |
|
"loss": 0.0721, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.1769803911447525, |
|
"learning_rate": 0.001, |
|
"loss": 0.0724, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.19137178361415863, |
|
"learning_rate": 0.001, |
|
"loss": 0.0735, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.22157849371433258, |
|
"learning_rate": 0.001, |
|
"loss": 0.0735, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.2098543494939804, |
|
"learning_rate": 0.001, |
|
"loss": 0.0701, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.22936704754829407, |
|
"learning_rate": 0.001, |
|
"loss": 0.0691, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.15228866040706635, |
|
"learning_rate": 0.001, |
|
"loss": 0.0729, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.27094388008117676, |
|
"learning_rate": 0.001, |
|
"loss": 0.0706, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.17357999086380005, |
|
"learning_rate": 0.001, |
|
"loss": 0.071, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.2912188768386841, |
|
"learning_rate": 0.001, |
|
"loss": 0.0719, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.24029956758022308, |
|
"learning_rate": 0.001, |
|
"loss": 0.07, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.1956549882888794, |
|
"learning_rate": 0.001, |
|
"loss": 0.0712, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.26984256505966187, |
|
"learning_rate": 0.001, |
|
"loss": 0.0713, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.18548165261745453, |
|
"learning_rate": 0.001, |
|
"loss": 0.0686, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.1833103895187378, |
|
"learning_rate": 0.001, |
|
"loss": 0.0672, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.20417752861976624, |
|
"learning_rate": 0.001, |
|
"loss": 0.069, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.3695315420627594, |
|
"learning_rate": 0.001, |
|
"loss": 0.0703, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.23288464546203613, |
|
"learning_rate": 0.001, |
|
"loss": 0.0704, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.21595774590969086, |
|
"learning_rate": 0.001, |
|
"loss": 0.0697, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.16371206939220428, |
|
"learning_rate": 0.001, |
|
"loss": 0.0704, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2600916028022766, |
|
"learning_rate": 0.001, |
|
"loss": 0.0693, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.21177971363067627, |
|
"learning_rate": 0.001, |
|
"loss": 0.0707, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.16886168718338013, |
|
"learning_rate": 0.001, |
|
"loss": 0.0701, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.29835718870162964, |
|
"learning_rate": 0.001, |
|
"loss": 0.0683, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2594737410545349, |
|
"learning_rate": 0.001, |
|
"loss": 0.0723, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2057715505361557, |
|
"learning_rate": 0.001, |
|
"loss": 0.0693, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2127043902873993, |
|
"learning_rate": 0.001, |
|
"loss": 0.0699, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.18162322044372559, |
|
"learning_rate": 0.001, |
|
"loss": 0.0714, |
|
"step": 61100 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.21535515785217285, |
|
"learning_rate": 0.001, |
|
"loss": 0.0711, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.19364242255687714, |
|
"learning_rate": 0.001, |
|
"loss": 0.0715, |
|
"step": 61300 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.14159826934337616, |
|
"learning_rate": 0.001, |
|
"loss": 0.07, |
|
"step": 61400 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.21536406874656677, |
|
"learning_rate": 0.001, |
|
"loss": 0.0689, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.19926196336746216, |
|
"learning_rate": 0.001, |
|
"loss": 0.0689, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.20217150449752808, |
|
"learning_rate": 0.001, |
|
"loss": 0.071, |
|
"step": 61700 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.17570650577545166, |
|
"learning_rate": 0.001, |
|
"loss": 0.0719, |
|
"step": 61800 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.19788751006126404, |
|
"learning_rate": 0.001, |
|
"loss": 0.0687, |
|
"step": 61900 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.22191910445690155, |
|
"learning_rate": 0.001, |
|
"loss": 0.0687, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.19544494152069092, |
|
"learning_rate": 0.001, |
|
"loss": 0.0704, |
|
"step": 62100 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.32939237356185913, |
|
"learning_rate": 0.001, |
|
"loss": 0.0713, |
|
"step": 62200 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.1809149980545044, |
|
"learning_rate": 0.001, |
|
"loss": 0.0701, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.2769867479801178, |
|
"learning_rate": 0.001, |
|
"loss": 0.0718, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.15998759865760803, |
|
"learning_rate": 0.001, |
|
"loss": 0.0691, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.29498517513275146, |
|
"learning_rate": 0.001, |
|
"loss": 0.0722, |
|
"step": 62600 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.19759228825569153, |
|
"learning_rate": 0.001, |
|
"loss": 0.0686, |
|
"step": 62700 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.12064652889966965, |
|
"learning_rate": 0.001, |
|
"loss": 0.0707, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.19079501926898956, |
|
"learning_rate": 0.001, |
|
"loss": 0.0662, |
|
"step": 62900 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.22422794997692108, |
|
"learning_rate": 0.001, |
|
"loss": 0.0662, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.16929177939891815, |
|
"learning_rate": 0.001, |
|
"loss": 0.0677, |
|
"step": 63100 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.20057950913906097, |
|
"learning_rate": 0.001, |
|
"loss": 0.0699, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.4213920533657074, |
|
"learning_rate": 0.001, |
|
"loss": 0.0701, |
|
"step": 63300 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.28028371930122375, |
|
"learning_rate": 0.001, |
|
"loss": 0.0697, |
|
"step": 63400 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.18094098567962646, |
|
"learning_rate": 0.001, |
|
"loss": 0.0727, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.30136585235595703, |
|
"learning_rate": 0.001, |
|
"loss": 0.0711, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.192775696516037, |
|
"learning_rate": 0.001, |
|
"loss": 0.0721, |
|
"step": 63700 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2211129367351532, |
|
"learning_rate": 0.001, |
|
"loss": 0.0695, |
|
"step": 63800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.19226811826229095, |
|
"learning_rate": 0.001, |
|
"loss": 0.0699, |
|
"step": 63900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2471201866865158, |
|
"learning_rate": 0.001, |
|
"loss": 0.0692, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2547115385532379, |
|
"learning_rate": 0.001, |
|
"loss": 0.0673, |
|
"step": 64100 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.1899893879890442, |
|
"learning_rate": 0.001, |
|
"loss": 0.0693, |
|
"step": 64200 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.21257919073104858, |
|
"learning_rate": 0.001, |
|
"loss": 0.0684, |
|
"step": 64300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.26688677072525024, |
|
"learning_rate": 0.001, |
|
"loss": 0.0683, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.18874968588352203, |
|
"learning_rate": 0.001, |
|
"loss": 0.0688, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2013721913099289, |
|
"learning_rate": 0.001, |
|
"loss": 0.0684, |
|
"step": 64600 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.19745351374149323, |
|
"learning_rate": 0.001, |
|
"loss": 0.0685, |
|
"step": 64700 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2137337028980255, |
|
"learning_rate": 0.001, |
|
"loss": 0.0671, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.20300865173339844, |
|
"learning_rate": 0.001, |
|
"loss": 0.0684, |
|
"step": 64900 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.1723690927028656, |
|
"learning_rate": 0.001, |
|
"loss": 0.0681, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.20693708956241608, |
|
"learning_rate": 0.001, |
|
"loss": 0.0685, |
|
"step": 65100 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.33531713485717773, |
|
"learning_rate": 0.001, |
|
"loss": 0.0687, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2180265337228775, |
|
"learning_rate": 0.001, |
|
"loss": 0.0719, |
|
"step": 65300 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.27855604887008667, |
|
"learning_rate": 0.001, |
|
"loss": 0.0686, |
|
"step": 65400 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2309376448392868, |
|
"learning_rate": 0.001, |
|
"loss": 0.0682, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.25525444746017456, |
|
"learning_rate": 0.001, |
|
"loss": 0.0698, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.1746407151222229, |
|
"learning_rate": 0.001, |
|
"loss": 0.0692, |
|
"step": 65700 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.29511937499046326, |
|
"learning_rate": 0.001, |
|
"loss": 0.0675, |
|
"step": 65800 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.23610210418701172, |
|
"learning_rate": 0.001, |
|
"loss": 0.0682, |
|
"step": 65900 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.24088448286056519, |
|
"learning_rate": 0.001, |
|
"loss": 0.065, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.3865065574645996, |
|
"learning_rate": 0.001, |
|
"loss": 0.068, |
|
"step": 66100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.16312183439731598, |
|
"learning_rate": 0.001, |
|
"loss": 0.0674, |
|
"step": 66200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.33910611271858215, |
|
"learning_rate": 0.001, |
|
"loss": 0.0657, |
|
"step": 66300 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.1491781622171402, |
|
"learning_rate": 0.001, |
|
"loss": 0.0663, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.27082210779190063, |
|
"learning_rate": 0.001, |
|
"loss": 0.0692, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.302495539188385, |
|
"learning_rate": 0.001, |
|
"loss": 0.0668, |
|
"step": 66600 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1906341165304184, |
|
"learning_rate": 0.001, |
|
"loss": 0.0689, |
|
"step": 66700 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.21256040036678314, |
|
"learning_rate": 0.001, |
|
"loss": 0.0665, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.16603924334049225, |
|
"learning_rate": 0.001, |
|
"loss": 0.07, |
|
"step": 66900 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.17136050760746002, |
|
"learning_rate": 0.001, |
|
"loss": 0.0715, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.1679474115371704, |
|
"learning_rate": 0.001, |
|
"loss": 0.0667, |
|
"step": 67100 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.18445661664009094, |
|
"learning_rate": 0.001, |
|
"loss": 0.0688, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.16743460297584534, |
|
"learning_rate": 0.001, |
|
"loss": 0.0672, |
|
"step": 67300 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.24309833347797394, |
|
"learning_rate": 0.001, |
|
"loss": 0.066, |
|
"step": 67400 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.15661662817001343, |
|
"learning_rate": 0.001, |
|
"loss": 0.0686, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.32759585976600647, |
|
"learning_rate": 0.001, |
|
"loss": 0.0666, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.1508253961801529, |
|
"learning_rate": 0.001, |
|
"loss": 0.068, |
|
"step": 67700 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.17459799349308014, |
|
"learning_rate": 0.001, |
|
"loss": 0.069, |
|
"step": 67800 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2405272275209427, |
|
"learning_rate": 0.001, |
|
"loss": 0.0693, |
|
"step": 67900 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2469649761915207, |
|
"learning_rate": 0.001, |
|
"loss": 0.0678, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.25917258858680725, |
|
"learning_rate": 0.001, |
|
"loss": 0.0694, |
|
"step": 68100 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.1784822642803192, |
|
"learning_rate": 0.001, |
|
"loss": 0.0668, |
|
"step": 68200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.22977730631828308, |
|
"learning_rate": 0.001, |
|
"loss": 0.0656, |
|
"step": 68300 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.1646946221590042, |
|
"learning_rate": 0.001, |
|
"loss": 0.068, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.3220691978931427, |
|
"learning_rate": 0.001, |
|
"loss": 0.0665, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.22109118103981018, |
|
"learning_rate": 0.001, |
|
"loss": 0.0684, |
|
"step": 68600 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.12051670998334885, |
|
"learning_rate": 0.001, |
|
"loss": 0.0675, |
|
"step": 68700 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.19576141238212585, |
|
"learning_rate": 0.001, |
|
"loss": 0.0655, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.12783344089984894, |
|
"learning_rate": 0.001, |
|
"loss": 0.0677, |
|
"step": 68900 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.24854913353919983, |
|
"learning_rate": 0.001, |
|
"loss": 0.0684, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.19816453754901886, |
|
"learning_rate": 0.001, |
|
"loss": 0.067, |
|
"step": 69100 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.20371900498867035, |
|
"learning_rate": 0.001, |
|
"loss": 0.0669, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.24654364585876465, |
|
"learning_rate": 0.001, |
|
"loss": 0.0665, |
|
"step": 69300 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.22933346033096313, |
|
"learning_rate": 0.001, |
|
"loss": 0.0697, |
|
"step": 69400 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.3056330382823944, |
|
"learning_rate": 0.001, |
|
"loss": 0.0688, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.14624419808387756, |
|
"learning_rate": 0.001, |
|
"loss": 0.0686, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.23571297526359558, |
|
"learning_rate": 0.001, |
|
"loss": 0.0727, |
|
"step": 69700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.20212960243225098, |
|
"learning_rate": 0.001, |
|
"loss": 0.0708, |
|
"step": 69800 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.22400203347206116, |
|
"learning_rate": 0.001, |
|
"loss": 0.0645, |
|
"step": 69900 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.15693353116512299, |
|
"learning_rate": 0.001, |
|
"loss": 0.066, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.21171632409095764, |
|
"learning_rate": 0.001, |
|
"loss": 0.0651, |
|
"step": 70100 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.16716106235980988, |
|
"learning_rate": 0.001, |
|
"loss": 0.0651, |
|
"step": 70200 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.19692525267601013, |
|
"learning_rate": 0.001, |
|
"loss": 0.0677, |
|
"step": 70300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.23514828085899353, |
|
"learning_rate": 0.001, |
|
"loss": 0.0651, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.22567568719387054, |
|
"learning_rate": 0.001, |
|
"loss": 0.0658, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.20934154093265533, |
|
"learning_rate": 0.001, |
|
"loss": 0.0661, |
|
"step": 70600 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.25384077429771423, |
|
"learning_rate": 0.001, |
|
"loss": 0.0658, |
|
"step": 70700 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.27204346656799316, |
|
"learning_rate": 0.001, |
|
"loss": 0.0685, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.1900806725025177, |
|
"learning_rate": 0.001, |
|
"loss": 0.0637, |
|
"step": 70900 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4064619243144989, |
|
"learning_rate": 0.001, |
|
"loss": 0.07, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.22942863404750824, |
|
"learning_rate": 0.001, |
|
"loss": 0.067, |
|
"step": 71100 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.3398168683052063, |
|
"learning_rate": 0.001, |
|
"loss": 0.0673, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2937333881855011, |
|
"learning_rate": 0.001, |
|
"loss": 0.0689, |
|
"step": 71300 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.15955261886119843, |
|
"learning_rate": 0.001, |
|
"loss": 0.0644, |
|
"step": 71400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.32867005467414856, |
|
"learning_rate": 0.001, |
|
"loss": 0.0668, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.22879061102867126, |
|
"learning_rate": 0.001, |
|
"loss": 0.0641, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3147716224193573, |
|
"learning_rate": 0.001, |
|
"loss": 0.0643, |
|
"step": 71700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.19312891364097595, |
|
"learning_rate": 0.001, |
|
"loss": 0.0654, |
|
"step": 71800 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3658990263938904, |
|
"learning_rate": 0.001, |
|
"loss": 0.066, |
|
"step": 71900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.2730260193347931, |
|
"learning_rate": 0.001, |
|
"loss": 0.0673, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3601909279823303, |
|
"learning_rate": 0.001, |
|
"loss": 0.0643, |
|
"step": 72100 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.13944287598133087, |
|
"learning_rate": 0.001, |
|
"loss": 0.0671, |
|
"step": 72200 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1590428501367569, |
|
"learning_rate": 0.001, |
|
"loss": 0.0651, |
|
"step": 72300 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.17583294212818146, |
|
"learning_rate": 0.001, |
|
"loss": 0.0665, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1566411554813385, |
|
"learning_rate": 0.001, |
|
"loss": 0.0666, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.26495423913002014, |
|
"learning_rate": 0.001, |
|
"loss": 0.0651, |
|
"step": 72600 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.17272372543811798, |
|
"learning_rate": 0.001, |
|
"loss": 0.0689, |
|
"step": 72700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.2443661093711853, |
|
"learning_rate": 0.001, |
|
"loss": 0.065, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.26695558428764343, |
|
"learning_rate": 0.001, |
|
"loss": 0.0637, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.14408937096595764, |
|
"learning_rate": 0.001, |
|
"loss": 0.0676, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.18142744898796082, |
|
"learning_rate": 0.001, |
|
"loss": 0.0653, |
|
"step": 73100 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.17100819945335388, |
|
"learning_rate": 0.001, |
|
"loss": 0.0631, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.3703427314758301, |
|
"learning_rate": 0.001, |
|
"loss": 0.0665, |
|
"step": 73300 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.19516532123088837, |
|
"learning_rate": 0.001, |
|
"loss": 0.0656, |
|
"step": 73400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.17610041797161102, |
|
"learning_rate": 0.001, |
|
"loss": 0.0658, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.13331599533557892, |
|
"learning_rate": 0.001, |
|
"loss": 0.0653, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.23824097216129303, |
|
"learning_rate": 0.001, |
|
"loss": 0.065, |
|
"step": 73700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.1464979499578476, |
|
"learning_rate": 0.001, |
|
"loss": 0.0638, |
|
"step": 73800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.18163511157035828, |
|
"learning_rate": 0.001, |
|
"loss": 0.0661, |
|
"step": 73900 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.1809806078672409, |
|
"learning_rate": 0.001, |
|
"loss": 0.0643, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.23994535207748413, |
|
"learning_rate": 0.001, |
|
"loss": 0.0636, |
|
"step": 74100 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.17924870550632477, |
|
"learning_rate": 0.001, |
|
"loss": 0.064, |
|
"step": 74200 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.15770521759986877, |
|
"learning_rate": 0.001, |
|
"loss": 0.0661, |
|
"step": 74300 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.24632355570793152, |
|
"learning_rate": 0.001, |
|
"loss": 0.0644, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.18300195038318634, |
|
"learning_rate": 0.001, |
|
"loss": 0.0592, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2745151221752167, |
|
"learning_rate": 0.001, |
|
"loss": 0.063, |
|
"step": 74600 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.18871140480041504, |
|
"learning_rate": 0.001, |
|
"loss": 0.063, |
|
"step": 74700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.30228421092033386, |
|
"learning_rate": 0.001, |
|
"loss": 0.0661, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.26834210753440857, |
|
"learning_rate": 0.001, |
|
"loss": 0.0626, |
|
"step": 74900 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.1998053640127182, |
|
"learning_rate": 0.001, |
|
"loss": 0.0655, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.16265703737735748, |
|
"learning_rate": 0.001, |
|
"loss": 0.0648, |
|
"step": 75100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.3203764259815216, |
|
"learning_rate": 0.001, |
|
"loss": 0.0636, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.29416751861572266, |
|
"learning_rate": 0.001, |
|
"loss": 0.0613, |
|
"step": 75300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.1761980801820755, |
|
"learning_rate": 0.001, |
|
"loss": 0.0718, |
|
"step": 75400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.24760745465755463, |
|
"learning_rate": 0.001, |
|
"loss": 0.0641, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.3362966477870941, |
|
"learning_rate": 0.001, |
|
"loss": 0.0678, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.20644457638263702, |
|
"learning_rate": 0.001, |
|
"loss": 0.0653, |
|
"step": 75700 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.22632303833961487, |
|
"learning_rate": 0.001, |
|
"loss": 0.0679, |
|
"step": 75800 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.22177743911743164, |
|
"learning_rate": 0.001, |
|
"loss": 0.0628, |
|
"step": 75900 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.9697771072387695, |
|
"learning_rate": 0.001, |
|
"loss": 0.0659, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.21862226724624634, |
|
"learning_rate": 0.001, |
|
"loss": 0.0654, |
|
"step": 76100 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.27506422996520996, |
|
"learning_rate": 0.001, |
|
"loss": 0.0636, |
|
"step": 76200 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.4953247606754303, |
|
"learning_rate": 0.001, |
|
"loss": 0.0648, |
|
"step": 76300 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.44132623076438904, |
|
"learning_rate": 0.001, |
|
"loss": 0.0641, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.28104710578918457, |
|
"learning_rate": 0.001, |
|
"loss": 0.0623, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.270434707403183, |
|
"learning_rate": 0.001, |
|
"loss": 0.0642, |
|
"step": 76600 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.17920733988285065, |
|
"learning_rate": 0.001, |
|
"loss": 0.0641, |
|
"step": 76700 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.27689895033836365, |
|
"learning_rate": 0.001, |
|
"loss": 0.0645, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.22936861217021942, |
|
"learning_rate": 0.001, |
|
"loss": 0.0625, |
|
"step": 76900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.2662585973739624, |
|
"learning_rate": 0.001, |
|
"loss": 0.0671, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.23035678267478943, |
|
"learning_rate": 0.001, |
|
"loss": 0.0622, |
|
"step": 77100 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.19333815574645996, |
|
"learning_rate": 0.001, |
|
"loss": 0.0655, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.2870350182056427, |
|
"learning_rate": 0.001, |
|
"loss": 0.0634, |
|
"step": 77300 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.22997340559959412, |
|
"learning_rate": 0.001, |
|
"loss": 0.0676, |
|
"step": 77400 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.19435285031795502, |
|
"learning_rate": 0.001, |
|
"loss": 0.0655, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.2826205790042877, |
|
"learning_rate": 0.001, |
|
"loss": 0.0635, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.20007766783237457, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 77700 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.15860234200954437, |
|
"learning_rate": 0.001, |
|
"loss": 0.0657, |
|
"step": 77800 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.40526214241981506, |
|
"learning_rate": 0.001, |
|
"loss": 0.0649, |
|
"step": 77900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.24454933404922485, |
|
"learning_rate": 0.001, |
|
"loss": 0.0634, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.12802359461784363, |
|
"learning_rate": 0.001, |
|
"loss": 0.0635, |
|
"step": 78100 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.32250648736953735, |
|
"learning_rate": 0.001, |
|
"loss": 0.0648, |
|
"step": 78200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.253478467464447, |
|
"learning_rate": 0.001, |
|
"loss": 0.0648, |
|
"step": 78300 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.25307029485702515, |
|
"learning_rate": 0.001, |
|
"loss": 0.0648, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.19091230630874634, |
|
"learning_rate": 0.001, |
|
"loss": 0.065, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.17312967777252197, |
|
"learning_rate": 0.001, |
|
"loss": 0.0624, |
|
"step": 78600 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.19466041028499603, |
|
"learning_rate": 0.001, |
|
"loss": 0.0622, |
|
"step": 78700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.25837138295173645, |
|
"learning_rate": 0.001, |
|
"loss": 0.0641, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.1573166698217392, |
|
"learning_rate": 0.001, |
|
"loss": 0.0645, |
|
"step": 78900 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.1644609123468399, |
|
"learning_rate": 0.001, |
|
"loss": 0.0644, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.20255005359649658, |
|
"learning_rate": 0.001, |
|
"loss": 0.0647, |
|
"step": 79100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.48706310987472534, |
|
"learning_rate": 0.001, |
|
"loss": 0.0642, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.3525262176990509, |
|
"learning_rate": 0.001, |
|
"loss": 0.0639, |
|
"step": 79300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.20806559920310974, |
|
"learning_rate": 0.001, |
|
"loss": 0.0639, |
|
"step": 79400 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.441980242729187, |
|
"learning_rate": 0.001, |
|
"loss": 0.0645, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.16818083822727203, |
|
"learning_rate": 0.001, |
|
"loss": 0.0625, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.1843559443950653, |
|
"learning_rate": 0.001, |
|
"loss": 0.064, |
|
"step": 79700 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.19608129560947418, |
|
"learning_rate": 0.001, |
|
"loss": 0.0634, |
|
"step": 79800 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.34710460901260376, |
|
"learning_rate": 0.001, |
|
"loss": 0.0626, |
|
"step": 79900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4062146842479706, |
|
"learning_rate": 0.001, |
|
"loss": 0.0637, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.23054763674736023, |
|
"learning_rate": 0.001, |
|
"loss": 0.0629, |
|
"step": 80100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.20241042971611023, |
|
"learning_rate": 0.001, |
|
"loss": 0.0632, |
|
"step": 80200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.17540830373764038, |
|
"learning_rate": 0.001, |
|
"loss": 0.0645, |
|
"step": 80300 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.2995645999908447, |
|
"learning_rate": 0.001, |
|
"loss": 0.0619, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.2701890766620636, |
|
"learning_rate": 0.001, |
|
"loss": 0.0624, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5655909180641174, |
|
"learning_rate": 0.001, |
|
"loss": 0.0637, |
|
"step": 80600 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.24868199229240417, |
|
"learning_rate": 0.001, |
|
"loss": 0.0626, |
|
"step": 80700 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.205698162317276, |
|
"learning_rate": 0.001, |
|
"loss": 0.0616, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.4373738169670105, |
|
"learning_rate": 0.001, |
|
"loss": 0.0635, |
|
"step": 80900 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.20648936927318573, |
|
"learning_rate": 0.001, |
|
"loss": 0.063, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.49470582604408264, |
|
"learning_rate": 0.001, |
|
"loss": 0.064, |
|
"step": 81100 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.2360522598028183, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.38575538992881775, |
|
"learning_rate": 0.001, |
|
"loss": 0.0626, |
|
"step": 81300 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.23714828491210938, |
|
"learning_rate": 0.001, |
|
"loss": 0.0628, |
|
"step": 81400 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5665257573127747, |
|
"learning_rate": 0.001, |
|
"loss": 0.064, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2335139662027359, |
|
"learning_rate": 0.001, |
|
"loss": 0.0628, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.23121795058250427, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 81700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2850015163421631, |
|
"learning_rate": 0.001, |
|
"loss": 0.0634, |
|
"step": 81800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.25949451327323914, |
|
"learning_rate": 0.001, |
|
"loss": 0.0611, |
|
"step": 81900 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.15866072475910187, |
|
"learning_rate": 0.001, |
|
"loss": 0.0633, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.1362059861421585, |
|
"learning_rate": 0.001, |
|
"loss": 0.0637, |
|
"step": 82100 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.23973006010055542, |
|
"learning_rate": 0.001, |
|
"loss": 0.0619, |
|
"step": 82200 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2586152255535126, |
|
"learning_rate": 0.001, |
|
"loss": 0.0595, |
|
"step": 82300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.33245041966438293, |
|
"learning_rate": 0.001, |
|
"loss": 0.0632, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.1873330920934677, |
|
"learning_rate": 0.001, |
|
"loss": 0.0636, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.23043370246887207, |
|
"learning_rate": 0.001, |
|
"loss": 0.0644, |
|
"step": 82600 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.21046708524227142, |
|
"learning_rate": 0.001, |
|
"loss": 0.0631, |
|
"step": 82700 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.15473945438861847, |
|
"learning_rate": 0.001, |
|
"loss": 0.06, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.422141194343567, |
|
"learning_rate": 0.001, |
|
"loss": 0.0636, |
|
"step": 82900 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.16424107551574707, |
|
"learning_rate": 0.001, |
|
"loss": 0.0643, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.3594319820404053, |
|
"learning_rate": 0.001, |
|
"loss": 0.0624, |
|
"step": 83100 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.26430365443229675, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.20655816793441772, |
|
"learning_rate": 0.001, |
|
"loss": 0.0619, |
|
"step": 83300 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.39340272545814514, |
|
"learning_rate": 0.001, |
|
"loss": 0.0624, |
|
"step": 83400 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.3113759160041809, |
|
"learning_rate": 0.001, |
|
"loss": 0.0598, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.33689817786216736, |
|
"learning_rate": 0.001, |
|
"loss": 0.0604, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2195175141096115, |
|
"learning_rate": 0.001, |
|
"loss": 0.0618, |
|
"step": 83700 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2397637814283371, |
|
"learning_rate": 0.001, |
|
"loss": 0.0618, |
|
"step": 83800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.28967469930648804, |
|
"learning_rate": 0.001, |
|
"loss": 0.0612, |
|
"step": 83900 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.23908008635044098, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.36196354031562805, |
|
"learning_rate": 0.001, |
|
"loss": 0.061, |
|
"step": 84100 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.3068004250526428, |
|
"learning_rate": 0.001, |
|
"loss": 0.0614, |
|
"step": 84200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2148333489894867, |
|
"learning_rate": 0.001, |
|
"loss": 0.0624, |
|
"step": 84300 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.19169430434703827, |
|
"learning_rate": 0.001, |
|
"loss": 0.0615, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.23916268348693848, |
|
"learning_rate": 0.001, |
|
"loss": 0.0654, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.20304815471172333, |
|
"learning_rate": 0.001, |
|
"loss": 0.0613, |
|
"step": 84600 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2983682155609131, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 84700 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.22442661225795746, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.28299954533576965, |
|
"learning_rate": 0.001, |
|
"loss": 0.0636, |
|
"step": 84900 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.30491936206817627, |
|
"learning_rate": 0.001, |
|
"loss": 0.0608, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.30804798007011414, |
|
"learning_rate": 0.001, |
|
"loss": 0.0609, |
|
"step": 85100 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.18533004820346832, |
|
"learning_rate": 0.001, |
|
"loss": 0.0602, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.23856715857982635, |
|
"learning_rate": 0.001, |
|
"loss": 0.0638, |
|
"step": 85300 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2646658420562744, |
|
"learning_rate": 0.001, |
|
"loss": 0.0622, |
|
"step": 85400 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2357235699892044, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.1675509363412857, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.20707982778549194, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 85700 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.34539708495140076, |
|
"learning_rate": 0.001, |
|
"loss": 0.06, |
|
"step": 85800 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.28429824113845825, |
|
"learning_rate": 0.001, |
|
"loss": 0.0587, |
|
"step": 85900 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.3121056854724884, |
|
"learning_rate": 0.001, |
|
"loss": 0.0615, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.25750598311424255, |
|
"learning_rate": 0.001, |
|
"loss": 0.0613, |
|
"step": 86100 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.18927526473999023, |
|
"learning_rate": 0.001, |
|
"loss": 0.0592, |
|
"step": 86200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.3551163971424103, |
|
"learning_rate": 0.001, |
|
"loss": 0.0619, |
|
"step": 86300 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.19404169917106628, |
|
"learning_rate": 0.001, |
|
"loss": 0.0617, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.16969504952430725, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.20026318728923798, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 86600 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.30545106530189514, |
|
"learning_rate": 0.001, |
|
"loss": 0.0594, |
|
"step": 86700 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.2734260559082031, |
|
"learning_rate": 0.001, |
|
"loss": 0.0644, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.3157080411911011, |
|
"learning_rate": 0.001, |
|
"loss": 0.0618, |
|
"step": 86900 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.19793906807899475, |
|
"learning_rate": 0.001, |
|
"loss": 0.0616, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.1849125623703003, |
|
"learning_rate": 0.001, |
|
"loss": 0.0596, |
|
"step": 87100 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.18340341746807098, |
|
"learning_rate": 0.001, |
|
"loss": 0.0625, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.26056426763534546, |
|
"learning_rate": 0.001, |
|
"loss": 0.0595, |
|
"step": 87300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.22235774993896484, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 87400 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.31580013036727905, |
|
"learning_rate": 0.001, |
|
"loss": 0.0615, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.2364477515220642, |
|
"learning_rate": 0.001, |
|
"loss": 0.0616, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.23212990164756775, |
|
"learning_rate": 0.001, |
|
"loss": 0.0594, |
|
"step": 87700 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.21986854076385498, |
|
"learning_rate": 0.001, |
|
"loss": 0.0592, |
|
"step": 87800 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.2496929168701172, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 87900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.19572298228740692, |
|
"learning_rate": 0.001, |
|
"loss": 0.0588, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.16231012344360352, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 88100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.21093867719173431, |
|
"learning_rate": 0.001, |
|
"loss": 0.0625, |
|
"step": 88200 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.16491778194904327, |
|
"learning_rate": 0.001, |
|
"loss": 0.0602, |
|
"step": 88300 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.24729378521442413, |
|
"learning_rate": 0.001, |
|
"loss": 0.0573, |
|
"step": 88400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.3726213276386261, |
|
"learning_rate": 0.001, |
|
"loss": 0.0589, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.1926572024822235, |
|
"learning_rate": 0.001, |
|
"loss": 0.0602, |
|
"step": 88600 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2153882533311844, |
|
"learning_rate": 0.001, |
|
"loss": 0.0597, |
|
"step": 88700 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.25205257534980774, |
|
"learning_rate": 0.001, |
|
"loss": 0.0581, |
|
"step": 88800 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.16898304224014282, |
|
"learning_rate": 0.001, |
|
"loss": 0.0614, |
|
"step": 88900 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2840329110622406, |
|
"learning_rate": 0.001, |
|
"loss": 0.0615, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.22306442260742188, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 89100 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2778179943561554, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 89200 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.1956636756658554, |
|
"learning_rate": 0.001, |
|
"loss": 0.0585, |
|
"step": 89300 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.15973015129566193, |
|
"learning_rate": 0.001, |
|
"loss": 0.0598, |
|
"step": 89400 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2306407243013382, |
|
"learning_rate": 0.001, |
|
"loss": 0.0597, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.19012047350406647, |
|
"learning_rate": 0.001, |
|
"loss": 0.0608, |
|
"step": 89600 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.214030921459198, |
|
"learning_rate": 0.001, |
|
"loss": 0.0586, |
|
"step": 89700 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.26291027665138245, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 89800 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.140648752450943, |
|
"learning_rate": 0.001, |
|
"loss": 0.0605, |
|
"step": 89900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.3011924624443054, |
|
"learning_rate": 0.001, |
|
"loss": 0.0609, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.24463798105716705, |
|
"learning_rate": 0.001, |
|
"loss": 0.0587, |
|
"step": 90100 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.2608613073825836, |
|
"learning_rate": 0.001, |
|
"loss": 0.0595, |
|
"step": 90200 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.23249809443950653, |
|
"learning_rate": 0.001, |
|
"loss": 0.0592, |
|
"step": 90300 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.36541712284088135, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 90400 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.45584437251091003, |
|
"learning_rate": 0.001, |
|
"loss": 0.0587, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.20905092358589172, |
|
"learning_rate": 0.001, |
|
"loss": 0.0595, |
|
"step": 90600 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.18202795088291168, |
|
"learning_rate": 0.001, |
|
"loss": 0.0568, |
|
"step": 90700 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.2321150153875351, |
|
"learning_rate": 0.001, |
|
"loss": 0.0605, |
|
"step": 90800 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.17175626754760742, |
|
"learning_rate": 0.001, |
|
"loss": 0.0596, |
|
"step": 90900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.21932841837406158, |
|
"learning_rate": 0.001, |
|
"loss": 0.0585, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.30282464623451233, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 91100 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.2639208436012268, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 91200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.23805926740169525, |
|
"learning_rate": 0.001, |
|
"loss": 0.0576, |
|
"step": 91300 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.2307603508234024, |
|
"learning_rate": 0.001, |
|
"loss": 0.0602, |
|
"step": 91400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.1786148101091385, |
|
"learning_rate": 0.001, |
|
"loss": 0.0598, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.1955350786447525, |
|
"learning_rate": 0.001, |
|
"loss": 0.0576, |
|
"step": 91600 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.24684827029705048, |
|
"learning_rate": 0.001, |
|
"loss": 0.0571, |
|
"step": 91700 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.2771402895450592, |
|
"learning_rate": 0.001, |
|
"loss": 0.058, |
|
"step": 91800 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.28878656029701233, |
|
"learning_rate": 0.001, |
|
"loss": 0.0585, |
|
"step": 91900 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.7780060172080994, |
|
"learning_rate": 0.001, |
|
"loss": 0.0574, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.25102126598358154, |
|
"learning_rate": 0.001, |
|
"loss": 0.0576, |
|
"step": 92100 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.26416492462158203, |
|
"learning_rate": 0.001, |
|
"loss": 0.0614, |
|
"step": 92200 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.26566821336746216, |
|
"learning_rate": 0.001, |
|
"loss": 0.0586, |
|
"step": 92300 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.25432705879211426, |
|
"learning_rate": 0.001, |
|
"loss": 0.0586, |
|
"step": 92400 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.2592636048793793, |
|
"learning_rate": 0.001, |
|
"loss": 0.0576, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.3514898419380188, |
|
"learning_rate": 0.001, |
|
"loss": 0.0579, |
|
"step": 92600 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.2749045491218567, |
|
"learning_rate": 0.001, |
|
"loss": 0.061, |
|
"step": 92700 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.2799491882324219, |
|
"learning_rate": 0.001, |
|
"loss": 0.0579, |
|
"step": 92800 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.2252642959356308, |
|
"learning_rate": 0.001, |
|
"loss": 0.0584, |
|
"step": 92900 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.18218593299388885, |
|
"learning_rate": 0.001, |
|
"loss": 0.0577, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.27551427483558655, |
|
"learning_rate": 0.001, |
|
"loss": 0.0605, |
|
"step": 93100 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.26159995794296265, |
|
"learning_rate": 0.001, |
|
"loss": 0.0562, |
|
"step": 93200 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.15979285538196564, |
|
"learning_rate": 0.001, |
|
"loss": 0.0615, |
|
"step": 93300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.23418280482292175, |
|
"learning_rate": 0.001, |
|
"loss": 0.0594, |
|
"step": 93400 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.16936419904232025, |
|
"learning_rate": 0.001, |
|
"loss": 0.0611, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2862916886806488, |
|
"learning_rate": 0.001, |
|
"loss": 0.0584, |
|
"step": 93600 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5302750468254089, |
|
"learning_rate": 0.001, |
|
"loss": 0.0561, |
|
"step": 93700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.43644002079963684, |
|
"learning_rate": 0.001, |
|
"loss": 0.0581, |
|
"step": 93800 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.19219018518924713, |
|
"learning_rate": 0.001, |
|
"loss": 0.0591, |
|
"step": 93900 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.29645296931266785, |
|
"learning_rate": 0.001, |
|
"loss": 0.0587, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.24861380457878113, |
|
"learning_rate": 0.001, |
|
"loss": 0.0594, |
|
"step": 94100 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2443215548992157, |
|
"learning_rate": 0.001, |
|
"loss": 0.057, |
|
"step": 94200 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.13077589869499207, |
|
"learning_rate": 0.001, |
|
"loss": 0.0563, |
|
"step": 94300 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.24280287325382233, |
|
"learning_rate": 0.001, |
|
"loss": 0.0591, |
|
"step": 94400 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.25838151574134827, |
|
"learning_rate": 0.001, |
|
"loss": 0.0583, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.33244743943214417, |
|
"learning_rate": 0.001, |
|
"loss": 0.0587, |
|
"step": 94600 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.45074304938316345, |
|
"learning_rate": 0.001, |
|
"loss": 0.0572, |
|
"step": 94700 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2540782392024994, |
|
"learning_rate": 0.001, |
|
"loss": 0.0584, |
|
"step": 94800 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.29180458188056946, |
|
"learning_rate": 0.001, |
|
"loss": 0.0609, |
|
"step": 94900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.18510323762893677, |
|
"learning_rate": 0.001, |
|
"loss": 0.058, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.28962787985801697, |
|
"learning_rate": 0.001, |
|
"loss": 0.0562, |
|
"step": 95100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.26887577772140503, |
|
"learning_rate": 0.001, |
|
"loss": 0.0573, |
|
"step": 95200 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.20729154348373413, |
|
"learning_rate": 0.001, |
|
"loss": 0.057, |
|
"step": 95300 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.19953325390815735, |
|
"learning_rate": 0.001, |
|
"loss": 0.0594, |
|
"step": 95400 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.15926332771778107, |
|
"learning_rate": 0.001, |
|
"loss": 0.0582, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.23609544336795807, |
|
"learning_rate": 0.001, |
|
"loss": 0.0579, |
|
"step": 95600 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.13997937738895416, |
|
"learning_rate": 0.001, |
|
"loss": 0.0574, |
|
"step": 95700 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.23629073798656464, |
|
"learning_rate": 0.001, |
|
"loss": 0.0585, |
|
"step": 95800 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3770292401313782, |
|
"learning_rate": 0.001, |
|
"loss": 0.0572, |
|
"step": 95900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3013598322868347, |
|
"learning_rate": 0.001, |
|
"loss": 0.0606, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2350749522447586, |
|
"learning_rate": 0.001, |
|
"loss": 0.057, |
|
"step": 96100 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.301268994808197, |
|
"learning_rate": 0.001, |
|
"loss": 0.0586, |
|
"step": 96200 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.22475981712341309, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 96300 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3032160997390747, |
|
"learning_rate": 0.001, |
|
"loss": 0.0591, |
|
"step": 96400 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5848428010940552, |
|
"learning_rate": 0.001, |
|
"loss": 0.0559, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.20164470374584198, |
|
"learning_rate": 0.001, |
|
"loss": 0.0579, |
|
"step": 96600 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.18068142235279083, |
|
"learning_rate": 0.001, |
|
"loss": 0.0579, |
|
"step": 96700 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.31181275844573975, |
|
"learning_rate": 0.001, |
|
"loss": 0.0588, |
|
"step": 96800 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.23156049847602844, |
|
"learning_rate": 0.001, |
|
"loss": 0.058, |
|
"step": 96900 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.18572886288166046, |
|
"learning_rate": 0.001, |
|
"loss": 0.0599, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.17736677825450897, |
|
"learning_rate": 0.001, |
|
"loss": 0.0561, |
|
"step": 97100 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4838601052761078, |
|
"learning_rate": 0.001, |
|
"loss": 0.0595, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.21476797759532928, |
|
"learning_rate": 0.001, |
|
"loss": 0.0609, |
|
"step": 97300 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.2181667536497116, |
|
"learning_rate": 0.001, |
|
"loss": 0.0583, |
|
"step": 97400 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.26551786065101624, |
|
"learning_rate": 0.001, |
|
"loss": 0.0566, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.2258795201778412, |
|
"learning_rate": 0.001, |
|
"loss": 0.0574, |
|
"step": 97600 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.17733299732208252, |
|
"learning_rate": 0.001, |
|
"loss": 0.0588, |
|
"step": 97700 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.4031812846660614, |
|
"learning_rate": 0.001, |
|
"loss": 0.0584, |
|
"step": 97800 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.22529329359531403, |
|
"learning_rate": 0.001, |
|
"loss": 0.0572, |
|
"step": 97900 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.2503925561904907, |
|
"learning_rate": 0.001, |
|
"loss": 0.0588, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.17040744423866272, |
|
"learning_rate": 0.001, |
|
"loss": 0.0603, |
|
"step": 98100 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.17749032378196716, |
|
"learning_rate": 0.001, |
|
"loss": 0.057, |
|
"step": 98200 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.3931177854537964, |
|
"learning_rate": 0.001, |
|
"loss": 0.0566, |
|
"step": 98300 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.22418583929538727, |
|
"learning_rate": 0.001, |
|
"loss": 0.0574, |
|
"step": 98400 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.30830493569374084, |
|
"learning_rate": 0.001, |
|
"loss": 0.0593, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.2269369661808014, |
|
"learning_rate": 0.001, |
|
"loss": 0.0585, |
|
"step": 98600 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.31830596923828125, |
|
"learning_rate": 0.001, |
|
"loss": 0.0548, |
|
"step": 98700 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.25759172439575195, |
|
"learning_rate": 0.001, |
|
"loss": 0.0564, |
|
"step": 98800 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.23925898969173431, |
|
"learning_rate": 0.001, |
|
"loss": 0.0592, |
|
"step": 98900 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.17434507608413696, |
|
"learning_rate": 0.001, |
|
"loss": 0.0583, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.3493863642215729, |
|
"learning_rate": 0.001, |
|
"loss": 0.0571, |
|
"step": 99100 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.20887431502342224, |
|
"learning_rate": 0.001, |
|
"loss": 0.0564, |
|
"step": 99200 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.18060541152954102, |
|
"learning_rate": 0.001, |
|
"loss": 0.0583, |
|
"step": 99300 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.3689703047275543, |
|
"learning_rate": 0.001, |
|
"loss": 0.0565, |
|
"step": 99400 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.25323519110679626, |
|
"learning_rate": 0.001, |
|
"loss": 0.0576, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.27348294854164124, |
|
"learning_rate": 0.001, |
|
"loss": 0.0568, |
|
"step": 99600 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.25492238998413086, |
|
"learning_rate": 0.001, |
|
"loss": 0.0561, |
|
"step": 99700 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2604049742221832, |
|
"learning_rate": 0.001, |
|
"loss": 0.0564, |
|
"step": 99800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.37222278118133545, |
|
"learning_rate": 0.001, |
|
"loss": 0.059, |
|
"step": 99900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3180735111236572, |
|
"learning_rate": 0.001, |
|
"loss": 0.0588, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 100000, |
|
"total_flos": 8.920695708927918e+18, |
|
"train_loss": 0.09189929046154022, |
|
"train_runtime": 235079.2305, |
|
"train_samples_per_second": 54.45, |
|
"train_steps_per_second": 0.425 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10000, |
|
"total_flos": 8.920695708927918e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|
|
|