urdu-mt5-mmarco / trainer_state.json
Muhammad Umer Tariq Butt
Upload fine-tuned Urdu mT5 IR msmarco model
57838b2
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.99999750000625,
"eval_steps": 500,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.3668140172958374,
"learning_rate": 0.001,
"loss": 1.2955,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 0.4789515733718872,
"learning_rate": 0.001,
"loss": 0.2147,
"step": 200
},
{
"epoch": 0.0,
"grad_norm": 0.8046264052391052,
"learning_rate": 0.001,
"loss": 0.1773,
"step": 300
},
{
"epoch": 0.0,
"grad_norm": 0.6500861644744873,
"learning_rate": 0.001,
"loss": 0.169,
"step": 400
},
{
"epoch": 0.0,
"grad_norm": 1.9476549625396729,
"learning_rate": 0.001,
"loss": 0.155,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 0.7570195198059082,
"learning_rate": 0.001,
"loss": 0.221,
"step": 600
},
{
"epoch": 0.01,
"grad_norm": 0.13534319400787354,
"learning_rate": 0.001,
"loss": 0.242,
"step": 700
},
{
"epoch": 0.01,
"grad_norm": 0.12334191799163818,
"learning_rate": 0.001,
"loss": 0.2387,
"step": 800
},
{
"epoch": 0.01,
"grad_norm": 2.0074245929718018,
"learning_rate": 0.001,
"loss": 0.1844,
"step": 900
},
{
"epoch": 0.01,
"grad_norm": 0.2459566444158554,
"learning_rate": 0.001,
"loss": 0.2273,
"step": 1000
},
{
"epoch": 0.01,
"grad_norm": 0.35431796312332153,
"learning_rate": 0.001,
"loss": 0.2406,
"step": 1100
},
{
"epoch": 0.01,
"grad_norm": 0.07735779881477356,
"learning_rate": 0.001,
"loss": 0.2362,
"step": 1200
},
{
"epoch": 0.01,
"grad_norm": 0.197942316532135,
"learning_rate": 0.001,
"loss": 0.2361,
"step": 1300
},
{
"epoch": 0.01,
"grad_norm": 0.06753970682621002,
"learning_rate": 0.001,
"loss": 0.2346,
"step": 1400
},
{
"epoch": 0.01,
"grad_norm": 0.17562294006347656,
"learning_rate": 0.001,
"loss": 0.2356,
"step": 1500
},
{
"epoch": 0.02,
"grad_norm": 0.12020650506019592,
"learning_rate": 0.001,
"loss": 0.2343,
"step": 1600
},
{
"epoch": 0.02,
"grad_norm": 0.07772481441497803,
"learning_rate": 0.001,
"loss": 0.2343,
"step": 1700
},
{
"epoch": 0.02,
"grad_norm": 0.041362863034009933,
"learning_rate": 0.001,
"loss": 0.2345,
"step": 1800
},
{
"epoch": 0.02,
"grad_norm": 0.050947155803442,
"learning_rate": 0.001,
"loss": 0.2333,
"step": 1900
},
{
"epoch": 0.02,
"grad_norm": 0.24440822005271912,
"learning_rate": 0.001,
"loss": 0.2346,
"step": 2000
},
{
"epoch": 0.02,
"grad_norm": 0.4386675953865051,
"learning_rate": 0.001,
"loss": 0.2346,
"step": 2100
},
{
"epoch": 0.02,
"grad_norm": 0.054741185158491135,
"learning_rate": 0.001,
"loss": 0.2347,
"step": 2200
},
{
"epoch": 0.02,
"grad_norm": 0.5285407304763794,
"learning_rate": 0.001,
"loss": 0.2341,
"step": 2300
},
{
"epoch": 0.02,
"grad_norm": 0.5406210422515869,
"learning_rate": 0.001,
"loss": 0.2322,
"step": 2400
},
{
"epoch": 0.02,
"grad_norm": 1.1667808294296265,
"learning_rate": 0.001,
"loss": 0.1683,
"step": 2500
},
{
"epoch": 0.03,
"grad_norm": 0.11829289048910141,
"learning_rate": 0.001,
"loss": 0.2138,
"step": 2600
},
{
"epoch": 0.03,
"grad_norm": 1.528359055519104,
"learning_rate": 0.001,
"loss": 0.1902,
"step": 2700
},
{
"epoch": 0.03,
"grad_norm": 0.45457515120506287,
"learning_rate": 0.001,
"loss": 0.1592,
"step": 2800
},
{
"epoch": 0.03,
"grad_norm": 0.2595893144607544,
"learning_rate": 0.001,
"loss": 0.1511,
"step": 2900
},
{
"epoch": 0.03,
"grad_norm": 0.5346922278404236,
"learning_rate": 0.001,
"loss": 0.1439,
"step": 3000
},
{
"epoch": 0.03,
"grad_norm": 3.5066208839416504,
"learning_rate": 0.001,
"loss": 0.1617,
"step": 3100
},
{
"epoch": 0.03,
"grad_norm": 24.826475143432617,
"learning_rate": 0.001,
"loss": 0.2024,
"step": 3200
},
{
"epoch": 0.03,
"grad_norm": 10.144634246826172,
"learning_rate": 0.001,
"loss": 0.1882,
"step": 3300
},
{
"epoch": 0.03,
"grad_norm": 0.43425676226615906,
"learning_rate": 0.001,
"loss": 0.169,
"step": 3400
},
{
"epoch": 0.03,
"grad_norm": 0.3496113717556,
"learning_rate": 0.001,
"loss": 0.1542,
"step": 3500
},
{
"epoch": 0.04,
"grad_norm": 6.317073345184326,
"learning_rate": 0.001,
"loss": 0.1676,
"step": 3600
},
{
"epoch": 0.04,
"grad_norm": 1.1362758874893188,
"learning_rate": 0.001,
"loss": 0.1599,
"step": 3700
},
{
"epoch": 0.04,
"grad_norm": 4.871659755706787,
"learning_rate": 0.001,
"loss": 0.1473,
"step": 3800
},
{
"epoch": 0.04,
"grad_norm": 0.10563373565673828,
"learning_rate": 0.001,
"loss": 0.1652,
"step": 3900
},
{
"epoch": 0.04,
"grad_norm": 0.08865318447351456,
"learning_rate": 0.001,
"loss": 0.2326,
"step": 4000
},
{
"epoch": 0.04,
"grad_norm": 0.0642586424946785,
"learning_rate": 0.001,
"loss": 0.2329,
"step": 4100
},
{
"epoch": 0.04,
"grad_norm": 0.36199188232421875,
"learning_rate": 0.001,
"loss": 0.2331,
"step": 4200
},
{
"epoch": 0.04,
"grad_norm": 0.17750632762908936,
"learning_rate": 0.001,
"loss": 0.2326,
"step": 4300
},
{
"epoch": 0.04,
"grad_norm": 0.103765107691288,
"learning_rate": 0.001,
"loss": 0.2329,
"step": 4400
},
{
"epoch": 0.04,
"grad_norm": 0.11186927556991577,
"learning_rate": 0.001,
"loss": 0.2326,
"step": 4500
},
{
"epoch": 0.05,
"grad_norm": 0.04914987459778786,
"learning_rate": 0.001,
"loss": 0.2326,
"step": 4600
},
{
"epoch": 0.05,
"grad_norm": 0.09826149046421051,
"learning_rate": 0.001,
"loss": 0.2324,
"step": 4700
},
{
"epoch": 0.05,
"grad_norm": 0.08518774062395096,
"learning_rate": 0.001,
"loss": 0.2327,
"step": 4800
},
{
"epoch": 0.05,
"grad_norm": 0.12364567071199417,
"learning_rate": 0.001,
"loss": 0.2321,
"step": 4900
},
{
"epoch": 0.05,
"grad_norm": 0.10944374650716782,
"learning_rate": 0.001,
"loss": 0.2322,
"step": 5000
},
{
"epoch": 0.05,
"grad_norm": 0.08173243701457977,
"learning_rate": 0.001,
"loss": 0.2326,
"step": 5100
},
{
"epoch": 0.05,
"grad_norm": 0.17504490911960602,
"learning_rate": 0.001,
"loss": 0.232,
"step": 5200
},
{
"epoch": 0.05,
"grad_norm": 0.03396083042025566,
"learning_rate": 0.001,
"loss": 0.2326,
"step": 5300
},
{
"epoch": 0.05,
"grad_norm": 0.12226787954568863,
"learning_rate": 0.001,
"loss": 0.2324,
"step": 5400
},
{
"epoch": 0.05,
"grad_norm": 0.029385367408394814,
"learning_rate": 0.001,
"loss": 0.2324,
"step": 5500
},
{
"epoch": 0.06,
"grad_norm": 0.08070210367441177,
"learning_rate": 0.001,
"loss": 0.2322,
"step": 5600
},
{
"epoch": 0.06,
"grad_norm": 0.026348430663347244,
"learning_rate": 0.001,
"loss": 0.2316,
"step": 5700
},
{
"epoch": 0.06,
"grad_norm": 0.06884663552045822,
"learning_rate": 0.001,
"loss": 0.2322,
"step": 5800
},
{
"epoch": 0.06,
"grad_norm": 0.09100496768951416,
"learning_rate": 0.001,
"loss": 0.3271,
"step": 5900
},
{
"epoch": 0.06,
"grad_norm": 0.0949195995926857,
"learning_rate": 0.001,
"loss": 0.2322,
"step": 6000
},
{
"epoch": 0.06,
"grad_norm": 0.17315314710140228,
"learning_rate": 0.001,
"loss": 0.232,
"step": 6100
},
{
"epoch": 0.06,
"grad_norm": 0.04644012451171875,
"learning_rate": 0.001,
"loss": 0.2317,
"step": 6200
},
{
"epoch": 0.06,
"grad_norm": 0.03242076560854912,
"learning_rate": 0.001,
"loss": 0.2317,
"step": 6300
},
{
"epoch": 0.06,
"grad_norm": 0.03038044273853302,
"learning_rate": 0.001,
"loss": 0.2322,
"step": 6400
},
{
"epoch": 0.06,
"grad_norm": 0.04407713562250137,
"learning_rate": 0.001,
"loss": 0.2321,
"step": 6500
},
{
"epoch": 0.07,
"grad_norm": 0.04973585903644562,
"learning_rate": 0.001,
"loss": 0.2321,
"step": 6600
},
{
"epoch": 0.07,
"grad_norm": 0.043713077902793884,
"learning_rate": 0.001,
"loss": 0.2319,
"step": 6700
},
{
"epoch": 0.07,
"grad_norm": 0.0361105352640152,
"learning_rate": 0.001,
"loss": 0.2319,
"step": 6800
},
{
"epoch": 0.07,
"grad_norm": 0.038385313004255295,
"learning_rate": 0.001,
"loss": 0.2319,
"step": 6900
},
{
"epoch": 0.07,
"grad_norm": 0.059859637171030045,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 7000
},
{
"epoch": 0.07,
"grad_norm": 0.10737486183643341,
"learning_rate": 0.001,
"loss": 0.232,
"step": 7100
},
{
"epoch": 0.07,
"grad_norm": 0.07841573655605316,
"learning_rate": 0.001,
"loss": 0.2319,
"step": 7200
},
{
"epoch": 0.07,
"grad_norm": 0.12177613377571106,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 7300
},
{
"epoch": 0.07,
"grad_norm": 0.04158034175634384,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 7400
},
{
"epoch": 0.07,
"grad_norm": 0.04334099590778351,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 7500
},
{
"epoch": 0.08,
"grad_norm": 0.04868987202644348,
"learning_rate": 0.001,
"loss": 0.2317,
"step": 7600
},
{
"epoch": 0.08,
"grad_norm": 0.11688575893640518,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 7700
},
{
"epoch": 0.08,
"grad_norm": 0.05144130066037178,
"learning_rate": 0.001,
"loss": 0.2319,
"step": 7800
},
{
"epoch": 0.08,
"grad_norm": 0.04202236235141754,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 7900
},
{
"epoch": 0.08,
"grad_norm": 0.07848116755485535,
"learning_rate": 0.001,
"loss": 0.2314,
"step": 8000
},
{
"epoch": 0.08,
"grad_norm": 0.05292198061943054,
"learning_rate": 0.001,
"loss": 0.2317,
"step": 8100
},
{
"epoch": 0.08,
"grad_norm": 0.05817991867661476,
"learning_rate": 0.001,
"loss": 0.2318,
"step": 8200
},
{
"epoch": 0.08,
"grad_norm": 0.03250608965754509,
"learning_rate": 0.001,
"loss": 0.2316,
"step": 8300
},
{
"epoch": 0.08,
"grad_norm": 0.29823893308639526,
"learning_rate": 0.001,
"loss": 0.2311,
"step": 8400
},
{
"epoch": 0.08,
"grad_norm": 1.852128505706787,
"learning_rate": 0.001,
"loss": 0.1864,
"step": 8500
},
{
"epoch": 0.09,
"grad_norm": 61.31148147583008,
"learning_rate": 0.001,
"loss": 0.1911,
"step": 8600
},
{
"epoch": 0.09,
"grad_norm": 3.4901123046875,
"learning_rate": 0.001,
"loss": 0.1934,
"step": 8700
},
{
"epoch": 0.09,
"grad_norm": 0.9580036401748657,
"learning_rate": 0.001,
"loss": 0.1706,
"step": 8800
},
{
"epoch": 0.09,
"grad_norm": 0.5461576581001282,
"learning_rate": 0.001,
"loss": 0.1597,
"step": 8900
},
{
"epoch": 0.09,
"grad_norm": 3.481351375579834,
"learning_rate": 0.001,
"loss": 0.1511,
"step": 9000
},
{
"epoch": 0.09,
"grad_norm": 0.3008120656013489,
"learning_rate": 0.001,
"loss": 0.154,
"step": 9100
},
{
"epoch": 0.09,
"grad_norm": 0.23753711581230164,
"learning_rate": 0.001,
"loss": 0.1406,
"step": 9200
},
{
"epoch": 0.09,
"grad_norm": 0.9201159477233887,
"learning_rate": 0.001,
"loss": 0.1444,
"step": 9300
},
{
"epoch": 0.09,
"grad_norm": 1.6734191179275513,
"learning_rate": 0.001,
"loss": 0.1385,
"step": 9400
},
{
"epoch": 0.09,
"grad_norm": 1.7249393463134766,
"learning_rate": 0.001,
"loss": 0.1393,
"step": 9500
},
{
"epoch": 0.1,
"grad_norm": 0.5765690207481384,
"learning_rate": 0.001,
"loss": 0.1397,
"step": 9600
},
{
"epoch": 0.1,
"grad_norm": 0.4266449213027954,
"learning_rate": 0.001,
"loss": 0.1386,
"step": 9700
},
{
"epoch": 0.1,
"grad_norm": 0.23247841000556946,
"learning_rate": 0.001,
"loss": 0.1343,
"step": 9800
},
{
"epoch": 0.1,
"grad_norm": 0.19435954093933105,
"learning_rate": 0.001,
"loss": 0.1306,
"step": 9900
},
{
"epoch": 0.1,
"grad_norm": 0.27626514434814453,
"learning_rate": 0.001,
"loss": 0.133,
"step": 10000
},
{
"epoch": 0.1,
"grad_norm": 0.1834883689880371,
"learning_rate": 0.001,
"loss": 0.1299,
"step": 10100
},
{
"epoch": 0.1,
"grad_norm": 0.4306440055370331,
"learning_rate": 0.001,
"loss": 0.1309,
"step": 10200
},
{
"epoch": 0.1,
"grad_norm": 0.15750516951084137,
"learning_rate": 0.001,
"loss": 0.1266,
"step": 10300
},
{
"epoch": 0.1,
"grad_norm": 0.2934073805809021,
"learning_rate": 0.001,
"loss": 0.1278,
"step": 10400
},
{
"epoch": 0.1,
"grad_norm": 0.27599695324897766,
"learning_rate": 0.001,
"loss": 0.1286,
"step": 10500
},
{
"epoch": 0.11,
"grad_norm": 0.39952772855758667,
"learning_rate": 0.001,
"loss": 0.1252,
"step": 10600
},
{
"epoch": 0.11,
"grad_norm": 0.4082016348838806,
"learning_rate": 0.001,
"loss": 0.1272,
"step": 10700
},
{
"epoch": 0.11,
"grad_norm": 0.303307443857193,
"learning_rate": 0.001,
"loss": 0.1249,
"step": 10800
},
{
"epoch": 0.11,
"grad_norm": 0.1597479283809662,
"learning_rate": 0.001,
"loss": 0.1247,
"step": 10900
},
{
"epoch": 0.11,
"grad_norm": 1.03666090965271,
"learning_rate": 0.001,
"loss": 0.1286,
"step": 11000
},
{
"epoch": 0.11,
"grad_norm": 0.2832247018814087,
"learning_rate": 0.001,
"loss": 0.1248,
"step": 11100
},
{
"epoch": 0.11,
"grad_norm": 0.49678078293800354,
"learning_rate": 0.001,
"loss": 0.1258,
"step": 11200
},
{
"epoch": 0.11,
"grad_norm": 0.3678058385848999,
"learning_rate": 0.001,
"loss": 0.1256,
"step": 11300
},
{
"epoch": 0.11,
"grad_norm": 0.26233455538749695,
"learning_rate": 0.001,
"loss": 0.1233,
"step": 11400
},
{
"epoch": 0.11,
"grad_norm": 0.22039958834648132,
"learning_rate": 0.001,
"loss": 0.1197,
"step": 11500
},
{
"epoch": 0.12,
"grad_norm": 0.14722639322280884,
"learning_rate": 0.001,
"loss": 0.1225,
"step": 11600
},
{
"epoch": 0.12,
"grad_norm": 0.19015900790691376,
"learning_rate": 0.001,
"loss": 0.1217,
"step": 11700
},
{
"epoch": 0.12,
"grad_norm": 0.15655829012393951,
"learning_rate": 0.001,
"loss": 0.1185,
"step": 11800
},
{
"epoch": 0.12,
"grad_norm": 3.5397889614105225,
"learning_rate": 0.001,
"loss": 0.119,
"step": 11900
},
{
"epoch": 0.12,
"grad_norm": 0.845320999622345,
"learning_rate": 0.001,
"loss": 0.1276,
"step": 12000
},
{
"epoch": 0.12,
"grad_norm": 0.34136563539505005,
"learning_rate": 0.001,
"loss": 0.122,
"step": 12100
},
{
"epoch": 0.12,
"grad_norm": 0.2509533762931824,
"learning_rate": 0.001,
"loss": 0.1199,
"step": 12200
},
{
"epoch": 0.12,
"grad_norm": 0.31120267510414124,
"learning_rate": 0.001,
"loss": 0.1191,
"step": 12300
},
{
"epoch": 0.12,
"grad_norm": 0.3903524875640869,
"learning_rate": 0.001,
"loss": 0.1183,
"step": 12400
},
{
"epoch": 0.12,
"grad_norm": 0.19971555471420288,
"learning_rate": 0.001,
"loss": 0.1177,
"step": 12500
},
{
"epoch": 0.13,
"grad_norm": 0.36589089035987854,
"learning_rate": 0.001,
"loss": 0.1158,
"step": 12600
},
{
"epoch": 0.13,
"grad_norm": 0.19200453162193298,
"learning_rate": 0.001,
"loss": 0.1166,
"step": 12700
},
{
"epoch": 0.13,
"grad_norm": 0.6393672823905945,
"learning_rate": 0.001,
"loss": 0.1171,
"step": 12800
},
{
"epoch": 0.13,
"grad_norm": 0.32421180605888367,
"learning_rate": 0.001,
"loss": 0.118,
"step": 12900
},
{
"epoch": 0.13,
"grad_norm": 0.6238926649093628,
"learning_rate": 0.001,
"loss": 0.1166,
"step": 13000
},
{
"epoch": 0.13,
"grad_norm": 0.1363907754421234,
"learning_rate": 0.001,
"loss": 0.1156,
"step": 13100
},
{
"epoch": 0.13,
"grad_norm": 0.16790109872817993,
"learning_rate": 0.001,
"loss": 0.1142,
"step": 13200
},
{
"epoch": 0.13,
"grad_norm": 0.1915178894996643,
"learning_rate": 0.001,
"loss": 0.1126,
"step": 13300
},
{
"epoch": 0.13,
"grad_norm": 0.12727123498916626,
"learning_rate": 0.001,
"loss": 0.1156,
"step": 13400
},
{
"epoch": 0.13,
"grad_norm": 0.29520758986473083,
"learning_rate": 0.001,
"loss": 0.1129,
"step": 13500
},
{
"epoch": 0.14,
"grad_norm": 0.1663757860660553,
"learning_rate": 0.001,
"loss": 0.1132,
"step": 13600
},
{
"epoch": 0.14,
"grad_norm": 0.1840706318616867,
"learning_rate": 0.001,
"loss": 0.119,
"step": 13700
},
{
"epoch": 0.14,
"grad_norm": 0.16156257688999176,
"learning_rate": 0.001,
"loss": 0.1146,
"step": 13800
},
{
"epoch": 0.14,
"grad_norm": 0.17348338663578033,
"learning_rate": 0.001,
"loss": 0.1141,
"step": 13900
},
{
"epoch": 0.14,
"grad_norm": 0.18696527183055878,
"learning_rate": 0.001,
"loss": 0.1108,
"step": 14000
},
{
"epoch": 0.14,
"grad_norm": 0.15352846682071686,
"learning_rate": 0.001,
"loss": 0.1134,
"step": 14100
},
{
"epoch": 0.14,
"grad_norm": 0.23210759460926056,
"learning_rate": 0.001,
"loss": 0.1142,
"step": 14200
},
{
"epoch": 0.14,
"grad_norm": 0.18328526616096497,
"learning_rate": 0.001,
"loss": 0.1109,
"step": 14300
},
{
"epoch": 0.14,
"grad_norm": 0.17674757540225983,
"learning_rate": 0.001,
"loss": 0.1083,
"step": 14400
},
{
"epoch": 0.14,
"grad_norm": 0.34446394443511963,
"learning_rate": 0.001,
"loss": 0.1203,
"step": 14500
},
{
"epoch": 0.15,
"grad_norm": 0.22947299480438232,
"learning_rate": 0.001,
"loss": 0.1095,
"step": 14600
},
{
"epoch": 0.15,
"grad_norm": 0.15071985125541687,
"learning_rate": 0.001,
"loss": 0.1088,
"step": 14700
},
{
"epoch": 0.15,
"grad_norm": 0.14273251593112946,
"learning_rate": 0.001,
"loss": 0.1091,
"step": 14800
},
{
"epoch": 0.15,
"grad_norm": 0.20266981422901154,
"learning_rate": 0.001,
"loss": 0.1089,
"step": 14900
},
{
"epoch": 0.15,
"grad_norm": 0.1495724767446518,
"learning_rate": 0.001,
"loss": 0.1089,
"step": 15000
},
{
"epoch": 0.15,
"grad_norm": 0.1711970865726471,
"learning_rate": 0.001,
"loss": 0.1063,
"step": 15100
},
{
"epoch": 0.15,
"grad_norm": 0.20727260410785675,
"learning_rate": 0.001,
"loss": 0.104,
"step": 15200
},
{
"epoch": 0.15,
"grad_norm": 0.22724412381649017,
"learning_rate": 0.001,
"loss": 0.1087,
"step": 15300
},
{
"epoch": 0.15,
"grad_norm": 0.15561726689338684,
"learning_rate": 0.001,
"loss": 0.1086,
"step": 15400
},
{
"epoch": 0.15,
"grad_norm": 0.2139796018600464,
"learning_rate": 0.001,
"loss": 0.1054,
"step": 15500
},
{
"epoch": 0.16,
"grad_norm": 0.24371370673179626,
"learning_rate": 0.001,
"loss": 0.1077,
"step": 15600
},
{
"epoch": 0.16,
"grad_norm": 0.22944559156894684,
"learning_rate": 0.001,
"loss": 0.1092,
"step": 15700
},
{
"epoch": 0.16,
"grad_norm": 0.19578562676906586,
"learning_rate": 0.001,
"loss": 0.1077,
"step": 15800
},
{
"epoch": 0.16,
"grad_norm": 0.17588412761688232,
"learning_rate": 0.001,
"loss": 0.1048,
"step": 15900
},
{
"epoch": 0.16,
"grad_norm": 0.16697707772254944,
"learning_rate": 0.001,
"loss": 0.1072,
"step": 16000
},
{
"epoch": 0.16,
"grad_norm": 0.1927742063999176,
"learning_rate": 0.001,
"loss": 0.1036,
"step": 16100
},
{
"epoch": 0.16,
"grad_norm": 0.25396087765693665,
"learning_rate": 0.001,
"loss": 0.1068,
"step": 16200
},
{
"epoch": 0.16,
"grad_norm": 0.21014653146266937,
"learning_rate": 0.001,
"loss": 0.1012,
"step": 16300
},
{
"epoch": 0.16,
"grad_norm": 0.32085150480270386,
"learning_rate": 0.001,
"loss": 0.1062,
"step": 16400
},
{
"epoch": 0.16,
"grad_norm": 0.10534122586250305,
"learning_rate": 0.001,
"loss": 0.103,
"step": 16500
},
{
"epoch": 0.17,
"grad_norm": 0.24365462362766266,
"learning_rate": 0.001,
"loss": 0.106,
"step": 16600
},
{
"epoch": 0.17,
"grad_norm": 0.15197184681892395,
"learning_rate": 0.001,
"loss": 0.1051,
"step": 16700
},
{
"epoch": 0.17,
"grad_norm": 0.23027855157852173,
"learning_rate": 0.001,
"loss": 0.1065,
"step": 16800
},
{
"epoch": 0.17,
"grad_norm": 0.14924216270446777,
"learning_rate": 0.001,
"loss": 0.1068,
"step": 16900
},
{
"epoch": 0.17,
"grad_norm": 0.13331858813762665,
"learning_rate": 0.001,
"loss": 0.1035,
"step": 17000
},
{
"epoch": 0.17,
"grad_norm": 0.20150358974933624,
"learning_rate": 0.001,
"loss": 0.1065,
"step": 17100
},
{
"epoch": 0.17,
"grad_norm": 0.1429535299539566,
"learning_rate": 0.001,
"loss": 0.1056,
"step": 17200
},
{
"epoch": 0.17,
"grad_norm": 0.16326557099819183,
"learning_rate": 0.001,
"loss": 0.1022,
"step": 17300
},
{
"epoch": 0.17,
"grad_norm": 0.15712429583072662,
"learning_rate": 0.001,
"loss": 0.1051,
"step": 17400
},
{
"epoch": 0.17,
"grad_norm": 0.33204013109207153,
"learning_rate": 0.001,
"loss": 0.1046,
"step": 17500
},
{
"epoch": 0.18,
"grad_norm": 0.17703518271446228,
"learning_rate": 0.001,
"loss": 0.1057,
"step": 17600
},
{
"epoch": 0.18,
"grad_norm": 0.14861218631267548,
"learning_rate": 0.001,
"loss": 0.1052,
"step": 17700
},
{
"epoch": 0.18,
"grad_norm": 0.18271447718143463,
"learning_rate": 0.001,
"loss": 0.1049,
"step": 17800
},
{
"epoch": 0.18,
"grad_norm": 0.2245068997144699,
"learning_rate": 0.001,
"loss": 0.1033,
"step": 17900
},
{
"epoch": 0.18,
"grad_norm": 0.2233046442270279,
"learning_rate": 0.001,
"loss": 0.1049,
"step": 18000
},
{
"epoch": 0.18,
"grad_norm": 0.1915113776922226,
"learning_rate": 0.001,
"loss": 0.1039,
"step": 18100
},
{
"epoch": 0.18,
"grad_norm": 0.1070462241768837,
"learning_rate": 0.001,
"loss": 0.1028,
"step": 18200
},
{
"epoch": 0.18,
"grad_norm": 0.14523275196552277,
"learning_rate": 0.001,
"loss": 0.0983,
"step": 18300
},
{
"epoch": 0.18,
"grad_norm": 0.24468256533145905,
"learning_rate": 0.001,
"loss": 0.1018,
"step": 18400
},
{
"epoch": 0.18,
"grad_norm": 0.17596426606178284,
"learning_rate": 0.001,
"loss": 0.1017,
"step": 18500
},
{
"epoch": 0.19,
"grad_norm": 0.15113884210586548,
"learning_rate": 0.001,
"loss": 0.1022,
"step": 18600
},
{
"epoch": 0.19,
"grad_norm": 0.1756398230791092,
"learning_rate": 0.001,
"loss": 0.1032,
"step": 18700
},
{
"epoch": 0.19,
"grad_norm": 0.1491193026304245,
"learning_rate": 0.001,
"loss": 0.1016,
"step": 18800
},
{
"epoch": 0.19,
"grad_norm": 0.15422752499580383,
"learning_rate": 0.001,
"loss": 0.0989,
"step": 18900
},
{
"epoch": 0.19,
"grad_norm": 0.13713973760604858,
"learning_rate": 0.001,
"loss": 0.1002,
"step": 19000
},
{
"epoch": 0.19,
"grad_norm": 0.16012702882289886,
"learning_rate": 0.001,
"loss": 0.101,
"step": 19100
},
{
"epoch": 0.19,
"grad_norm": 0.23414984345436096,
"learning_rate": 0.001,
"loss": 0.0975,
"step": 19200
},
{
"epoch": 0.19,
"grad_norm": 0.13922521471977234,
"learning_rate": 0.001,
"loss": 0.1002,
"step": 19300
},
{
"epoch": 0.19,
"grad_norm": 0.14608104526996613,
"learning_rate": 0.001,
"loss": 0.098,
"step": 19400
},
{
"epoch": 0.19,
"grad_norm": 0.19267164170742035,
"learning_rate": 0.001,
"loss": 0.098,
"step": 19500
},
{
"epoch": 0.2,
"grad_norm": 0.1570904552936554,
"learning_rate": 0.001,
"loss": 0.1034,
"step": 19600
},
{
"epoch": 0.2,
"grad_norm": 0.3922866880893707,
"learning_rate": 0.001,
"loss": 0.1008,
"step": 19700
},
{
"epoch": 0.2,
"grad_norm": 0.20500238239765167,
"learning_rate": 0.001,
"loss": 0.1025,
"step": 19800
},
{
"epoch": 0.2,
"grad_norm": 0.2044358104467392,
"learning_rate": 0.001,
"loss": 0.0982,
"step": 19900
},
{
"epoch": 0.2,
"grad_norm": 0.1722269356250763,
"learning_rate": 0.001,
"loss": 0.1007,
"step": 20000
},
{
"epoch": 0.2,
"grad_norm": 0.21868231892585754,
"learning_rate": 0.001,
"loss": 0.1,
"step": 20100
},
{
"epoch": 0.2,
"grad_norm": 0.12817895412445068,
"learning_rate": 0.001,
"loss": 0.1,
"step": 20200
},
{
"epoch": 0.2,
"grad_norm": 0.12333246320486069,
"learning_rate": 0.001,
"loss": 0.0987,
"step": 20300
},
{
"epoch": 0.2,
"grad_norm": 0.1742565631866455,
"learning_rate": 0.001,
"loss": 0.0981,
"step": 20400
},
{
"epoch": 0.2,
"grad_norm": 0.15747936069965363,
"learning_rate": 0.001,
"loss": 0.1012,
"step": 20500
},
{
"epoch": 0.21,
"grad_norm": 0.27314338088035583,
"learning_rate": 0.001,
"loss": 0.1014,
"step": 20600
},
{
"epoch": 0.21,
"grad_norm": 0.9368189573287964,
"learning_rate": 0.001,
"loss": 0.1035,
"step": 20700
},
{
"epoch": 0.21,
"grad_norm": 0.3574996590614319,
"learning_rate": 0.001,
"loss": 0.0992,
"step": 20800
},
{
"epoch": 0.21,
"grad_norm": 0.28280141949653625,
"learning_rate": 0.001,
"loss": 0.0975,
"step": 20900
},
{
"epoch": 0.21,
"grad_norm": 0.21435654163360596,
"learning_rate": 0.001,
"loss": 0.0998,
"step": 21000
},
{
"epoch": 0.21,
"grad_norm": 0.20617541670799255,
"learning_rate": 0.001,
"loss": 0.0994,
"step": 21100
},
{
"epoch": 0.21,
"grad_norm": 0.21885354816913605,
"learning_rate": 0.001,
"loss": 0.099,
"step": 21200
},
{
"epoch": 0.21,
"grad_norm": 0.24429431557655334,
"learning_rate": 0.001,
"loss": 0.1018,
"step": 21300
},
{
"epoch": 0.21,
"grad_norm": 0.24264854192733765,
"learning_rate": 0.001,
"loss": 0.1009,
"step": 21400
},
{
"epoch": 0.21,
"grad_norm": 0.19410717487335205,
"learning_rate": 0.001,
"loss": 0.1007,
"step": 21500
},
{
"epoch": 0.22,
"grad_norm": 0.15938735008239746,
"learning_rate": 0.001,
"loss": 0.0965,
"step": 21600
},
{
"epoch": 0.22,
"grad_norm": 0.678229808807373,
"learning_rate": 0.001,
"loss": 0.1001,
"step": 21700
},
{
"epoch": 0.22,
"grad_norm": 0.2967202663421631,
"learning_rate": 0.001,
"loss": 0.1003,
"step": 21800
},
{
"epoch": 0.22,
"grad_norm": 0.7940108180046082,
"learning_rate": 0.001,
"loss": 0.1001,
"step": 21900
},
{
"epoch": 0.22,
"grad_norm": 0.24995733797550201,
"learning_rate": 0.001,
"loss": 0.0992,
"step": 22000
},
{
"epoch": 0.22,
"grad_norm": 0.1626627892255783,
"learning_rate": 0.001,
"loss": 0.0992,
"step": 22100
},
{
"epoch": 0.22,
"grad_norm": 0.21141190826892853,
"learning_rate": 0.001,
"loss": 0.0961,
"step": 22200
},
{
"epoch": 0.22,
"grad_norm": 0.21122020483016968,
"learning_rate": 0.001,
"loss": 0.0968,
"step": 22300
},
{
"epoch": 0.22,
"grad_norm": 0.2558838725090027,
"learning_rate": 0.001,
"loss": 0.098,
"step": 22400
},
{
"epoch": 0.22,
"grad_norm": 0.1975196897983551,
"learning_rate": 0.001,
"loss": 0.0987,
"step": 22500
},
{
"epoch": 0.23,
"grad_norm": 0.14767397940158844,
"learning_rate": 0.001,
"loss": 0.096,
"step": 22600
},
{
"epoch": 0.23,
"grad_norm": 0.17532730102539062,
"learning_rate": 0.001,
"loss": 0.0985,
"step": 22700
},
{
"epoch": 0.23,
"grad_norm": 0.1320209801197052,
"learning_rate": 0.001,
"loss": 0.0968,
"step": 22800
},
{
"epoch": 0.23,
"grad_norm": 0.273934930562973,
"learning_rate": 0.001,
"loss": 0.0978,
"step": 22900
},
{
"epoch": 0.23,
"grad_norm": 0.15103434026241302,
"learning_rate": 0.001,
"loss": 0.0995,
"step": 23000
},
{
"epoch": 0.23,
"grad_norm": 0.2021692842245102,
"learning_rate": 0.001,
"loss": 0.0952,
"step": 23100
},
{
"epoch": 0.23,
"grad_norm": 0.1648433655500412,
"learning_rate": 0.001,
"loss": 0.0938,
"step": 23200
},
{
"epoch": 0.23,
"grad_norm": 0.17460817098617554,
"learning_rate": 0.001,
"loss": 0.0959,
"step": 23300
},
{
"epoch": 0.23,
"grad_norm": 0.15195918083190918,
"learning_rate": 0.001,
"loss": 0.094,
"step": 23400
},
{
"epoch": 0.23,
"grad_norm": 0.1664193570613861,
"learning_rate": 0.001,
"loss": 0.094,
"step": 23500
},
{
"epoch": 0.24,
"grad_norm": 0.14700663089752197,
"learning_rate": 0.001,
"loss": 0.0951,
"step": 23600
},
{
"epoch": 0.24,
"grad_norm": 0.22301018238067627,
"learning_rate": 0.001,
"loss": 0.0919,
"step": 23700
},
{
"epoch": 0.24,
"grad_norm": 0.1666121482849121,
"learning_rate": 0.001,
"loss": 0.0928,
"step": 23800
},
{
"epoch": 0.24,
"grad_norm": 0.1971474438905716,
"learning_rate": 0.001,
"loss": 0.0949,
"step": 23900
},
{
"epoch": 0.24,
"grad_norm": 0.15959730744361877,
"learning_rate": 0.001,
"loss": 0.095,
"step": 24000
},
{
"epoch": 0.24,
"grad_norm": 0.29146862030029297,
"learning_rate": 0.001,
"loss": 0.0942,
"step": 24100
},
{
"epoch": 0.24,
"grad_norm": 0.15853939950466156,
"learning_rate": 0.001,
"loss": 0.0978,
"step": 24200
},
{
"epoch": 0.24,
"grad_norm": 0.16822876036167145,
"learning_rate": 0.001,
"loss": 0.0934,
"step": 24300
},
{
"epoch": 0.24,
"grad_norm": 0.15456752479076385,
"learning_rate": 0.001,
"loss": 0.0948,
"step": 24400
},
{
"epoch": 0.24,
"grad_norm": 0.15123625099658966,
"learning_rate": 0.001,
"loss": 0.0926,
"step": 24500
},
{
"epoch": 0.25,
"grad_norm": 0.16344180703163147,
"learning_rate": 0.001,
"loss": 0.0935,
"step": 24600
},
{
"epoch": 0.25,
"grad_norm": 0.22936996817588806,
"learning_rate": 0.001,
"loss": 0.0936,
"step": 24700
},
{
"epoch": 0.25,
"grad_norm": 0.16810204088687897,
"learning_rate": 0.001,
"loss": 0.0978,
"step": 24800
},
{
"epoch": 0.25,
"grad_norm": 0.14977198839187622,
"learning_rate": 0.001,
"loss": 0.0936,
"step": 24900
},
{
"epoch": 0.25,
"grad_norm": 0.18207716941833496,
"learning_rate": 0.001,
"loss": 0.093,
"step": 25000
},
{
"epoch": 0.25,
"grad_norm": 0.2584002912044525,
"learning_rate": 0.001,
"loss": 0.0958,
"step": 25100
},
{
"epoch": 0.25,
"grad_norm": 0.23717880249023438,
"learning_rate": 0.001,
"loss": 0.0927,
"step": 25200
},
{
"epoch": 0.25,
"grad_norm": 0.1896461844444275,
"learning_rate": 0.001,
"loss": 0.094,
"step": 25300
},
{
"epoch": 0.25,
"grad_norm": 0.21543921530246735,
"learning_rate": 0.001,
"loss": 0.0953,
"step": 25400
},
{
"epoch": 0.25,
"grad_norm": 0.14013002812862396,
"learning_rate": 0.001,
"loss": 0.0958,
"step": 25500
},
{
"epoch": 0.26,
"grad_norm": 0.1744927018880844,
"learning_rate": 0.001,
"loss": 0.0946,
"step": 25600
},
{
"epoch": 0.26,
"grad_norm": 0.16546490788459778,
"learning_rate": 0.001,
"loss": 0.0962,
"step": 25700
},
{
"epoch": 0.26,
"grad_norm": 0.16227766871452332,
"learning_rate": 0.001,
"loss": 0.0952,
"step": 25800
},
{
"epoch": 0.26,
"grad_norm": 0.181349515914917,
"learning_rate": 0.001,
"loss": 0.0951,
"step": 25900
},
{
"epoch": 0.26,
"grad_norm": 0.20408563315868378,
"learning_rate": 0.001,
"loss": 0.0915,
"step": 26000
},
{
"epoch": 0.26,
"grad_norm": 0.1793171763420105,
"learning_rate": 0.001,
"loss": 0.0942,
"step": 26100
},
{
"epoch": 0.26,
"grad_norm": 0.14634822309017181,
"learning_rate": 0.001,
"loss": 0.0961,
"step": 26200
},
{
"epoch": 0.26,
"grad_norm": 0.18879148364067078,
"learning_rate": 0.001,
"loss": 0.0942,
"step": 26300
},
{
"epoch": 0.26,
"grad_norm": 0.20523515343666077,
"learning_rate": 0.001,
"loss": 0.0912,
"step": 26400
},
{
"epoch": 0.26,
"grad_norm": 0.18672947585582733,
"learning_rate": 0.001,
"loss": 0.092,
"step": 26500
},
{
"epoch": 0.27,
"grad_norm": 0.18561910092830658,
"learning_rate": 0.001,
"loss": 0.0913,
"step": 26600
},
{
"epoch": 0.27,
"grad_norm": 0.23991861939430237,
"learning_rate": 0.001,
"loss": 0.0925,
"step": 26700
},
{
"epoch": 0.27,
"grad_norm": 0.1660347878932953,
"learning_rate": 0.001,
"loss": 0.0939,
"step": 26800
},
{
"epoch": 0.27,
"grad_norm": 0.2105019986629486,
"learning_rate": 0.001,
"loss": 0.093,
"step": 26900
},
{
"epoch": 0.27,
"grad_norm": 0.2271376997232437,
"learning_rate": 0.001,
"loss": 0.0899,
"step": 27000
},
{
"epoch": 0.27,
"grad_norm": 0.14487460255622864,
"learning_rate": 0.001,
"loss": 0.0906,
"step": 27100
},
{
"epoch": 0.27,
"grad_norm": 0.1597098708152771,
"learning_rate": 0.001,
"loss": 0.0919,
"step": 27200
},
{
"epoch": 0.27,
"grad_norm": 0.18633900582790375,
"learning_rate": 0.001,
"loss": 0.0892,
"step": 27300
},
{
"epoch": 0.27,
"grad_norm": 0.12663201987743378,
"learning_rate": 0.001,
"loss": 0.0913,
"step": 27400
},
{
"epoch": 0.27,
"grad_norm": 0.17320451140403748,
"learning_rate": 0.001,
"loss": 0.0911,
"step": 27500
},
{
"epoch": 0.28,
"grad_norm": 0.16872632503509521,
"learning_rate": 0.001,
"loss": 0.091,
"step": 27600
},
{
"epoch": 0.28,
"grad_norm": 0.18602560460567474,
"learning_rate": 0.001,
"loss": 0.0908,
"step": 27700
},
{
"epoch": 0.28,
"grad_norm": 0.17392034828662872,
"learning_rate": 0.001,
"loss": 0.0882,
"step": 27800
},
{
"epoch": 0.28,
"grad_norm": 0.10278663039207458,
"learning_rate": 0.001,
"loss": 0.088,
"step": 27900
},
{
"epoch": 0.28,
"grad_norm": 0.15355843305587769,
"learning_rate": 0.001,
"loss": 0.0876,
"step": 28000
},
{
"epoch": 0.28,
"grad_norm": 0.17331954836845398,
"learning_rate": 0.001,
"loss": 0.0906,
"step": 28100
},
{
"epoch": 0.28,
"grad_norm": 0.16750375926494598,
"learning_rate": 0.001,
"loss": 0.0935,
"step": 28200
},
{
"epoch": 0.28,
"grad_norm": 0.27208462357521057,
"learning_rate": 0.001,
"loss": 0.0884,
"step": 28300
},
{
"epoch": 0.28,
"grad_norm": 0.2215784639120102,
"learning_rate": 0.001,
"loss": 0.0904,
"step": 28400
},
{
"epoch": 0.28,
"grad_norm": 0.1542549580335617,
"learning_rate": 0.001,
"loss": 0.0903,
"step": 28500
},
{
"epoch": 0.29,
"grad_norm": 0.22874318063259125,
"learning_rate": 0.001,
"loss": 0.0889,
"step": 28600
},
{
"epoch": 0.29,
"grad_norm": 0.22677820920944214,
"learning_rate": 0.001,
"loss": 0.0915,
"step": 28700
},
{
"epoch": 0.29,
"grad_norm": 0.22208420932292938,
"learning_rate": 0.001,
"loss": 0.0902,
"step": 28800
},
{
"epoch": 0.29,
"grad_norm": 0.18172180652618408,
"learning_rate": 0.001,
"loss": 0.091,
"step": 28900
},
{
"epoch": 0.29,
"grad_norm": 0.264664888381958,
"learning_rate": 0.001,
"loss": 0.091,
"step": 29000
},
{
"epoch": 0.29,
"grad_norm": 0.15961118042469025,
"learning_rate": 0.001,
"loss": 0.0864,
"step": 29100
},
{
"epoch": 0.29,
"grad_norm": 0.16828449070453644,
"learning_rate": 0.001,
"loss": 0.0902,
"step": 29200
},
{
"epoch": 0.29,
"grad_norm": 0.25299304723739624,
"learning_rate": 0.001,
"loss": 0.0895,
"step": 29300
},
{
"epoch": 0.29,
"grad_norm": 0.2019224911928177,
"learning_rate": 0.001,
"loss": 0.0887,
"step": 29400
},
{
"epoch": 0.29,
"grad_norm": 0.19100870192050934,
"learning_rate": 0.001,
"loss": 0.0897,
"step": 29500
},
{
"epoch": 0.3,
"grad_norm": 0.25321510434150696,
"learning_rate": 0.001,
"loss": 0.092,
"step": 29600
},
{
"epoch": 0.3,
"grad_norm": 0.18171149492263794,
"learning_rate": 0.001,
"loss": 0.089,
"step": 29700
},
{
"epoch": 0.3,
"grad_norm": 0.19380785524845123,
"learning_rate": 0.001,
"loss": 0.0895,
"step": 29800
},
{
"epoch": 0.3,
"grad_norm": 0.18437138199806213,
"learning_rate": 0.001,
"loss": 0.0903,
"step": 29900
},
{
"epoch": 0.3,
"grad_norm": 0.1717921495437622,
"learning_rate": 0.001,
"loss": 0.0885,
"step": 30000
},
{
"epoch": 0.3,
"grad_norm": 0.23623107373714447,
"learning_rate": 0.001,
"loss": 0.0882,
"step": 30100
},
{
"epoch": 0.3,
"grad_norm": 0.17992794513702393,
"learning_rate": 0.001,
"loss": 0.0885,
"step": 30200
},
{
"epoch": 0.3,
"grad_norm": 0.19958259165287018,
"learning_rate": 0.001,
"loss": 0.088,
"step": 30300
},
{
"epoch": 0.3,
"grad_norm": 0.14418841898441315,
"learning_rate": 0.001,
"loss": 0.0908,
"step": 30400
},
{
"epoch": 0.3,
"grad_norm": 0.13934949040412903,
"learning_rate": 0.001,
"loss": 0.0919,
"step": 30500
},
{
"epoch": 0.31,
"grad_norm": 0.1410313993692398,
"learning_rate": 0.001,
"loss": 0.0891,
"step": 30600
},
{
"epoch": 0.31,
"grad_norm": 0.27084311842918396,
"learning_rate": 0.001,
"loss": 0.0917,
"step": 30700
},
{
"epoch": 0.31,
"grad_norm": 0.18704760074615479,
"learning_rate": 0.001,
"loss": 0.0866,
"step": 30800
},
{
"epoch": 0.31,
"grad_norm": 0.16178588569164276,
"learning_rate": 0.001,
"loss": 0.088,
"step": 30900
},
{
"epoch": 0.31,
"grad_norm": 0.1699521839618683,
"learning_rate": 0.001,
"loss": 0.0891,
"step": 31000
},
{
"epoch": 0.31,
"grad_norm": 0.21340341866016388,
"learning_rate": 0.001,
"loss": 0.0871,
"step": 31100
},
{
"epoch": 0.31,
"grad_norm": 0.21089456975460052,
"learning_rate": 0.001,
"loss": 0.0898,
"step": 31200
},
{
"epoch": 0.31,
"grad_norm": 0.17899860441684723,
"learning_rate": 0.001,
"loss": 0.0874,
"step": 31300
},
{
"epoch": 0.31,
"grad_norm": 0.2222578376531601,
"learning_rate": 0.001,
"loss": 0.0875,
"step": 31400
},
{
"epoch": 0.31,
"grad_norm": 0.22845357656478882,
"learning_rate": 0.001,
"loss": 0.0895,
"step": 31500
},
{
"epoch": 0.32,
"grad_norm": 0.22213339805603027,
"learning_rate": 0.001,
"loss": 0.0877,
"step": 31600
},
{
"epoch": 0.32,
"grad_norm": 0.1989658772945404,
"learning_rate": 0.001,
"loss": 0.09,
"step": 31700
},
{
"epoch": 0.32,
"grad_norm": 0.28217941522598267,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 31800
},
{
"epoch": 0.32,
"grad_norm": 0.1880946159362793,
"learning_rate": 0.001,
"loss": 0.0895,
"step": 31900
},
{
"epoch": 0.32,
"grad_norm": 0.2522743046283722,
"learning_rate": 0.001,
"loss": 0.0876,
"step": 32000
},
{
"epoch": 0.32,
"grad_norm": 0.15146856009960175,
"learning_rate": 0.001,
"loss": 0.0892,
"step": 32100
},
{
"epoch": 0.32,
"grad_norm": 0.20138536393642426,
"learning_rate": 0.001,
"loss": 0.0897,
"step": 32200
},
{
"epoch": 0.32,
"grad_norm": 0.19894324243068695,
"learning_rate": 0.001,
"loss": 0.089,
"step": 32300
},
{
"epoch": 0.32,
"grad_norm": 0.20011819899082184,
"learning_rate": 0.001,
"loss": 0.0877,
"step": 32400
},
{
"epoch": 0.32,
"grad_norm": 0.22739243507385254,
"learning_rate": 0.001,
"loss": 0.0875,
"step": 32500
},
{
"epoch": 0.33,
"grad_norm": 0.16710792481899261,
"learning_rate": 0.001,
"loss": 0.0852,
"step": 32600
},
{
"epoch": 0.33,
"grad_norm": 0.20454761385917664,
"learning_rate": 0.001,
"loss": 0.0862,
"step": 32700
},
{
"epoch": 0.33,
"grad_norm": 0.12356776744127274,
"learning_rate": 0.001,
"loss": 0.0854,
"step": 32800
},
{
"epoch": 0.33,
"grad_norm": 0.18977922201156616,
"learning_rate": 0.001,
"loss": 0.0915,
"step": 32900
},
{
"epoch": 0.33,
"grad_norm": 0.18791726231575012,
"learning_rate": 0.001,
"loss": 0.0842,
"step": 33000
},
{
"epoch": 0.33,
"grad_norm": 0.23529213666915894,
"learning_rate": 0.001,
"loss": 0.086,
"step": 33100
},
{
"epoch": 0.33,
"grad_norm": 0.25430527329444885,
"learning_rate": 0.001,
"loss": 0.0833,
"step": 33200
},
{
"epoch": 0.33,
"grad_norm": 0.22178427875041962,
"learning_rate": 0.001,
"loss": 0.0874,
"step": 33300
},
{
"epoch": 0.33,
"grad_norm": 0.27455243468284607,
"learning_rate": 0.001,
"loss": 0.0845,
"step": 33400
},
{
"epoch": 0.33,
"grad_norm": 0.1998920440673828,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 33500
},
{
"epoch": 0.34,
"grad_norm": 0.1991311013698578,
"learning_rate": 0.001,
"loss": 0.0873,
"step": 33600
},
{
"epoch": 0.34,
"grad_norm": 0.2600191831588745,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 33700
},
{
"epoch": 0.34,
"grad_norm": 0.16889439523220062,
"learning_rate": 0.001,
"loss": 0.0841,
"step": 33800
},
{
"epoch": 0.34,
"grad_norm": 0.17337612807750702,
"learning_rate": 0.001,
"loss": 0.0847,
"step": 33900
},
{
"epoch": 0.34,
"grad_norm": 0.12141957134008408,
"learning_rate": 0.001,
"loss": 0.0873,
"step": 34000
},
{
"epoch": 0.34,
"grad_norm": 0.30542996525764465,
"learning_rate": 0.001,
"loss": 0.086,
"step": 34100
},
{
"epoch": 0.34,
"grad_norm": 0.256072461605072,
"learning_rate": 0.001,
"loss": 0.0845,
"step": 34200
},
{
"epoch": 0.34,
"grad_norm": 0.19596265256404877,
"learning_rate": 0.001,
"loss": 0.0847,
"step": 34300
},
{
"epoch": 0.34,
"grad_norm": 0.17981210350990295,
"learning_rate": 0.001,
"loss": 0.0853,
"step": 34400
},
{
"epoch": 0.34,
"grad_norm": 0.18695278465747833,
"learning_rate": 0.001,
"loss": 0.0867,
"step": 34500
},
{
"epoch": 0.35,
"grad_norm": 0.20189203321933746,
"learning_rate": 0.001,
"loss": 0.0867,
"step": 34600
},
{
"epoch": 0.35,
"grad_norm": 0.20751608908176422,
"learning_rate": 0.001,
"loss": 0.0855,
"step": 34700
},
{
"epoch": 0.35,
"grad_norm": 0.15412236750125885,
"learning_rate": 0.001,
"loss": 0.0876,
"step": 34800
},
{
"epoch": 0.35,
"grad_norm": 0.21551938354969025,
"learning_rate": 0.001,
"loss": 0.0854,
"step": 34900
},
{
"epoch": 0.35,
"grad_norm": 0.15149344503879547,
"learning_rate": 0.001,
"loss": 0.0863,
"step": 35000
},
{
"epoch": 0.35,
"grad_norm": 0.21960322558879852,
"learning_rate": 0.001,
"loss": 0.0913,
"step": 35100
},
{
"epoch": 0.35,
"grad_norm": 0.317090779542923,
"learning_rate": 0.001,
"loss": 0.0832,
"step": 35200
},
{
"epoch": 0.35,
"grad_norm": 0.20051142573356628,
"learning_rate": 0.001,
"loss": 0.0856,
"step": 35300
},
{
"epoch": 0.35,
"grad_norm": 0.1955852061510086,
"learning_rate": 0.001,
"loss": 0.0867,
"step": 35400
},
{
"epoch": 0.35,
"grad_norm": 0.13714253902435303,
"learning_rate": 0.001,
"loss": 0.0864,
"step": 35500
},
{
"epoch": 0.36,
"grad_norm": 0.18536311388015747,
"learning_rate": 0.001,
"loss": 0.0868,
"step": 35600
},
{
"epoch": 0.36,
"grad_norm": 0.1795514076948166,
"learning_rate": 0.001,
"loss": 0.0829,
"step": 35700
},
{
"epoch": 0.36,
"grad_norm": 0.1465149074792862,
"learning_rate": 0.001,
"loss": 0.0851,
"step": 35800
},
{
"epoch": 0.36,
"grad_norm": 0.17687107622623444,
"learning_rate": 0.001,
"loss": 0.0861,
"step": 35900
},
{
"epoch": 0.36,
"grad_norm": 0.1795363575220108,
"learning_rate": 0.001,
"loss": 0.0822,
"step": 36000
},
{
"epoch": 0.36,
"grad_norm": 0.1741327941417694,
"learning_rate": 0.001,
"loss": 0.0847,
"step": 36100
},
{
"epoch": 0.36,
"grad_norm": 0.2547447681427002,
"learning_rate": 0.001,
"loss": 0.0862,
"step": 36200
},
{
"epoch": 0.36,
"grad_norm": 0.16002462804317474,
"learning_rate": 0.001,
"loss": 0.0856,
"step": 36300
},
{
"epoch": 0.36,
"grad_norm": 0.14787407219409943,
"learning_rate": 0.001,
"loss": 0.0844,
"step": 36400
},
{
"epoch": 0.36,
"grad_norm": 0.23449848592281342,
"learning_rate": 0.001,
"loss": 0.0823,
"step": 36500
},
{
"epoch": 0.37,
"grad_norm": 0.18626731634140015,
"learning_rate": 0.001,
"loss": 0.0804,
"step": 36600
},
{
"epoch": 0.37,
"grad_norm": 0.1434779316186905,
"learning_rate": 0.001,
"loss": 0.0844,
"step": 36700
},
{
"epoch": 0.37,
"grad_norm": 0.1594706028699875,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 36800
},
{
"epoch": 0.37,
"grad_norm": 0.18195496499538422,
"learning_rate": 0.001,
"loss": 0.0846,
"step": 36900
},
{
"epoch": 0.37,
"grad_norm": 0.18613013625144958,
"learning_rate": 0.001,
"loss": 0.0872,
"step": 37000
},
{
"epoch": 0.37,
"grad_norm": 0.16158261895179749,
"learning_rate": 0.001,
"loss": 0.0846,
"step": 37100
},
{
"epoch": 0.37,
"grad_norm": 0.17811179161071777,
"learning_rate": 0.001,
"loss": 0.0832,
"step": 37200
},
{
"epoch": 0.37,
"grad_norm": 0.24112731218338013,
"learning_rate": 0.001,
"loss": 0.0804,
"step": 37300
},
{
"epoch": 0.37,
"grad_norm": 0.1778961569070816,
"learning_rate": 0.001,
"loss": 0.0837,
"step": 37400
},
{
"epoch": 0.37,
"grad_norm": 0.18162128329277039,
"learning_rate": 0.001,
"loss": 0.0867,
"step": 37500
},
{
"epoch": 0.38,
"grad_norm": 0.15079495310783386,
"learning_rate": 0.001,
"loss": 0.0829,
"step": 37600
},
{
"epoch": 0.38,
"grad_norm": 0.26986435055732727,
"learning_rate": 0.001,
"loss": 0.0843,
"step": 37700
},
{
"epoch": 0.38,
"grad_norm": 0.2643984854221344,
"learning_rate": 0.001,
"loss": 0.0829,
"step": 37800
},
{
"epoch": 0.38,
"grad_norm": 0.281751424074173,
"learning_rate": 0.001,
"loss": 0.0821,
"step": 37900
},
{
"epoch": 0.38,
"grad_norm": 0.23095449805259705,
"learning_rate": 0.001,
"loss": 0.0836,
"step": 38000
},
{
"epoch": 0.38,
"grad_norm": 0.18625666201114655,
"learning_rate": 0.001,
"loss": 0.0831,
"step": 38100
},
{
"epoch": 0.38,
"grad_norm": 0.13689708709716797,
"learning_rate": 0.001,
"loss": 0.0839,
"step": 38200
},
{
"epoch": 0.38,
"grad_norm": 0.14063656330108643,
"learning_rate": 0.001,
"loss": 0.0817,
"step": 38300
},
{
"epoch": 0.38,
"grad_norm": 0.1880202442407608,
"learning_rate": 0.001,
"loss": 0.082,
"step": 38400
},
{
"epoch": 0.38,
"grad_norm": 0.15921075642108917,
"learning_rate": 0.001,
"loss": 0.0789,
"step": 38500
},
{
"epoch": 0.39,
"grad_norm": 0.1744866818189621,
"learning_rate": 0.001,
"loss": 0.0818,
"step": 38600
},
{
"epoch": 0.39,
"grad_norm": 0.26724693179130554,
"learning_rate": 0.001,
"loss": 0.0847,
"step": 38700
},
{
"epoch": 0.39,
"grad_norm": 0.14382457733154297,
"learning_rate": 0.001,
"loss": 0.0829,
"step": 38800
},
{
"epoch": 0.39,
"grad_norm": 0.14012865722179413,
"learning_rate": 0.001,
"loss": 0.082,
"step": 38900
},
{
"epoch": 0.39,
"grad_norm": 0.24175578355789185,
"learning_rate": 0.001,
"loss": 0.0835,
"step": 39000
},
{
"epoch": 0.39,
"grad_norm": 0.3397182822227478,
"learning_rate": 0.001,
"loss": 0.081,
"step": 39100
},
{
"epoch": 0.39,
"grad_norm": 0.1553467959165573,
"learning_rate": 0.001,
"loss": 0.0829,
"step": 39200
},
{
"epoch": 0.39,
"grad_norm": 0.20726840198040009,
"learning_rate": 0.001,
"loss": 0.083,
"step": 39300
},
{
"epoch": 0.39,
"grad_norm": 0.21219220757484436,
"learning_rate": 0.001,
"loss": 0.084,
"step": 39400
},
{
"epoch": 0.39,
"grad_norm": 0.19203193485736847,
"learning_rate": 0.001,
"loss": 0.0819,
"step": 39500
},
{
"epoch": 0.4,
"grad_norm": 0.22557440400123596,
"learning_rate": 0.001,
"loss": 0.0803,
"step": 39600
},
{
"epoch": 0.4,
"grad_norm": 0.23452799022197723,
"learning_rate": 0.001,
"loss": 0.0806,
"step": 39700
},
{
"epoch": 0.4,
"grad_norm": 0.28543928265571594,
"learning_rate": 0.001,
"loss": 0.0827,
"step": 39800
},
{
"epoch": 0.4,
"grad_norm": 0.19713571667671204,
"learning_rate": 0.001,
"loss": 0.08,
"step": 39900
},
{
"epoch": 0.4,
"grad_norm": 0.18496285378932953,
"learning_rate": 0.001,
"loss": 0.0841,
"step": 40000
},
{
"epoch": 0.4,
"grad_norm": 0.1363649070262909,
"learning_rate": 0.001,
"loss": 0.0813,
"step": 40100
},
{
"epoch": 0.4,
"grad_norm": 0.1736011952161789,
"learning_rate": 0.001,
"loss": 0.0796,
"step": 40200
},
{
"epoch": 0.4,
"grad_norm": 0.21385334432125092,
"learning_rate": 0.001,
"loss": 0.0814,
"step": 40300
},
{
"epoch": 0.4,
"grad_norm": 0.2105669230222702,
"learning_rate": 0.001,
"loss": 0.0816,
"step": 40400
},
{
"epoch": 0.4,
"grad_norm": 0.2278176248073578,
"learning_rate": 0.001,
"loss": 0.0825,
"step": 40500
},
{
"epoch": 0.41,
"grad_norm": 0.17637114226818085,
"learning_rate": 0.001,
"loss": 0.0812,
"step": 40600
},
{
"epoch": 0.41,
"grad_norm": 0.20035295188426971,
"learning_rate": 0.001,
"loss": 0.0853,
"step": 40700
},
{
"epoch": 0.41,
"grad_norm": 0.25408777594566345,
"learning_rate": 0.001,
"loss": 0.0811,
"step": 40800
},
{
"epoch": 0.41,
"grad_norm": 0.2177010476589203,
"learning_rate": 0.001,
"loss": 0.0796,
"step": 40900
},
{
"epoch": 0.41,
"grad_norm": 0.1639321744441986,
"learning_rate": 0.001,
"loss": 0.0824,
"step": 41000
},
{
"epoch": 0.41,
"grad_norm": 0.15798155963420868,
"learning_rate": 0.001,
"loss": 0.0834,
"step": 41100
},
{
"epoch": 0.41,
"grad_norm": 0.14857494831085205,
"learning_rate": 0.001,
"loss": 0.0825,
"step": 41200
},
{
"epoch": 0.41,
"grad_norm": 0.15640319883823395,
"learning_rate": 0.001,
"loss": 0.0814,
"step": 41300
},
{
"epoch": 0.41,
"grad_norm": 0.1530522108078003,
"learning_rate": 0.001,
"loss": 0.0825,
"step": 41400
},
{
"epoch": 0.41,
"grad_norm": 0.2990354001522064,
"learning_rate": 0.001,
"loss": 0.0785,
"step": 41500
},
{
"epoch": 0.42,
"grad_norm": 0.19239626824855804,
"learning_rate": 0.001,
"loss": 0.0809,
"step": 41600
},
{
"epoch": 0.42,
"grad_norm": 0.13975249230861664,
"learning_rate": 0.001,
"loss": 0.0825,
"step": 41700
},
{
"epoch": 0.42,
"grad_norm": 0.22527189552783966,
"learning_rate": 0.001,
"loss": 0.0819,
"step": 41800
},
{
"epoch": 0.42,
"grad_norm": 0.3547128438949585,
"learning_rate": 0.001,
"loss": 0.1013,
"step": 41900
},
{
"epoch": 0.42,
"grad_norm": 0.22032135725021362,
"learning_rate": 0.001,
"loss": 0.0806,
"step": 42000
},
{
"epoch": 0.42,
"grad_norm": 0.12712807953357697,
"learning_rate": 0.001,
"loss": 0.0791,
"step": 42100
},
{
"epoch": 0.42,
"grad_norm": 0.29608944058418274,
"learning_rate": 0.001,
"loss": 0.0783,
"step": 42200
},
{
"epoch": 0.42,
"grad_norm": 0.23063918948173523,
"learning_rate": 0.001,
"loss": 0.0828,
"step": 42300
},
{
"epoch": 0.42,
"grad_norm": 0.19996796548366547,
"learning_rate": 0.001,
"loss": 0.0813,
"step": 42400
},
{
"epoch": 0.42,
"grad_norm": 0.19479811191558838,
"learning_rate": 0.001,
"loss": 0.0811,
"step": 42500
},
{
"epoch": 0.43,
"grad_norm": 0.1822797805070877,
"learning_rate": 0.001,
"loss": 0.0796,
"step": 42600
},
{
"epoch": 0.43,
"grad_norm": 0.36260533332824707,
"learning_rate": 0.001,
"loss": 0.0797,
"step": 42700
},
{
"epoch": 0.43,
"grad_norm": 0.14315147697925568,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 42800
},
{
"epoch": 0.43,
"grad_norm": 0.20261742174625397,
"learning_rate": 0.001,
"loss": 0.1856,
"step": 42900
},
{
"epoch": 0.43,
"grad_norm": 0.18873733282089233,
"learning_rate": 0.001,
"loss": 0.0775,
"step": 43000
},
{
"epoch": 0.43,
"grad_norm": 0.2189916968345642,
"learning_rate": 0.001,
"loss": 0.0796,
"step": 43100
},
{
"epoch": 0.43,
"grad_norm": 0.1823868304491043,
"learning_rate": 0.001,
"loss": 0.0822,
"step": 43200
},
{
"epoch": 0.43,
"grad_norm": 0.2595207691192627,
"learning_rate": 0.001,
"loss": 0.0776,
"step": 43300
},
{
"epoch": 0.43,
"grad_norm": 0.1713092178106308,
"learning_rate": 0.001,
"loss": 0.0811,
"step": 43400
},
{
"epoch": 0.43,
"grad_norm": 0.24840323626995087,
"learning_rate": 0.001,
"loss": 0.104,
"step": 43500
},
{
"epoch": 0.44,
"grad_norm": 0.23451556265354156,
"learning_rate": 0.001,
"loss": 0.077,
"step": 43600
},
{
"epoch": 0.44,
"grad_norm": 0.2142404466867447,
"learning_rate": 0.001,
"loss": 0.0789,
"step": 43700
},
{
"epoch": 0.44,
"grad_norm": 0.22932325303554535,
"learning_rate": 0.001,
"loss": 0.0778,
"step": 43800
},
{
"epoch": 0.44,
"grad_norm": 0.2027159184217453,
"learning_rate": 0.001,
"loss": 0.0794,
"step": 43900
},
{
"epoch": 0.44,
"grad_norm": 0.22258317470550537,
"learning_rate": 0.001,
"loss": 0.0787,
"step": 44000
},
{
"epoch": 0.44,
"grad_norm": 0.2979215681552887,
"learning_rate": 0.001,
"loss": 0.0767,
"step": 44100
},
{
"epoch": 0.44,
"grad_norm": 0.2110917568206787,
"learning_rate": 0.001,
"loss": 0.0782,
"step": 44200
},
{
"epoch": 0.44,
"grad_norm": 0.24181802570819855,
"learning_rate": 0.001,
"loss": 0.0804,
"step": 44300
},
{
"epoch": 0.44,
"grad_norm": 0.1810845136642456,
"learning_rate": 0.001,
"loss": 0.0786,
"step": 44400
},
{
"epoch": 0.44,
"grad_norm": 0.23404444754123688,
"learning_rate": 0.001,
"loss": 0.0785,
"step": 44500
},
{
"epoch": 0.45,
"grad_norm": 0.2591089904308319,
"learning_rate": 0.001,
"loss": 0.0765,
"step": 44600
},
{
"epoch": 0.45,
"grad_norm": 0.22720029950141907,
"learning_rate": 0.001,
"loss": 0.0798,
"step": 44700
},
{
"epoch": 0.45,
"grad_norm": 0.22449086606502533,
"learning_rate": 0.001,
"loss": 0.0766,
"step": 44800
},
{
"epoch": 0.45,
"grad_norm": 0.2302643209695816,
"learning_rate": 0.001,
"loss": 0.0798,
"step": 44900
},
{
"epoch": 0.45,
"grad_norm": 0.2040921300649643,
"learning_rate": 0.001,
"loss": 0.0841,
"step": 45000
},
{
"epoch": 0.45,
"grad_norm": 0.21232621371746063,
"learning_rate": 0.001,
"loss": 0.0789,
"step": 45100
},
{
"epoch": 0.45,
"grad_norm": 0.20054876804351807,
"learning_rate": 0.001,
"loss": 0.0779,
"step": 45200
},
{
"epoch": 0.45,
"grad_norm": 0.24335692822933197,
"learning_rate": 0.001,
"loss": 0.0784,
"step": 45300
},
{
"epoch": 0.45,
"grad_norm": 0.22172445058822632,
"learning_rate": 0.001,
"loss": 0.0797,
"step": 45400
},
{
"epoch": 0.45,
"grad_norm": 0.20524169504642487,
"learning_rate": 0.001,
"loss": 0.0803,
"step": 45500
},
{
"epoch": 0.46,
"grad_norm": 0.17150288820266724,
"learning_rate": 0.001,
"loss": 0.0791,
"step": 45600
},
{
"epoch": 0.46,
"grad_norm": 0.38285690546035767,
"learning_rate": 0.001,
"loss": 0.079,
"step": 45700
},
{
"epoch": 0.46,
"grad_norm": 0.16937342286109924,
"learning_rate": 0.001,
"loss": 0.0791,
"step": 45800
},
{
"epoch": 0.46,
"grad_norm": 0.19271647930145264,
"learning_rate": 0.001,
"loss": 0.079,
"step": 45900
},
{
"epoch": 0.46,
"grad_norm": 0.20048774778842926,
"learning_rate": 0.001,
"loss": 0.0797,
"step": 46000
},
{
"epoch": 0.46,
"grad_norm": 0.2141706347465515,
"learning_rate": 0.001,
"loss": 0.0798,
"step": 46100
},
{
"epoch": 0.46,
"grad_norm": 0.20665834844112396,
"learning_rate": 0.001,
"loss": 0.0778,
"step": 46200
},
{
"epoch": 0.46,
"grad_norm": 0.18385255336761475,
"learning_rate": 0.001,
"loss": 0.0779,
"step": 46300
},
{
"epoch": 0.46,
"grad_norm": 0.22467826306819916,
"learning_rate": 0.001,
"loss": 0.0732,
"step": 46400
},
{
"epoch": 0.46,
"grad_norm": 0.18363313376903534,
"learning_rate": 0.001,
"loss": 0.0796,
"step": 46500
},
{
"epoch": 0.47,
"grad_norm": 0.2288578897714615,
"learning_rate": 0.001,
"loss": 0.0763,
"step": 46600
},
{
"epoch": 0.47,
"grad_norm": 0.2535518407821655,
"learning_rate": 0.001,
"loss": 0.0791,
"step": 46700
},
{
"epoch": 0.47,
"grad_norm": 0.20715934038162231,
"learning_rate": 0.001,
"loss": 0.0777,
"step": 46800
},
{
"epoch": 0.47,
"grad_norm": 0.12203960865736008,
"learning_rate": 0.001,
"loss": 0.0805,
"step": 46900
},
{
"epoch": 0.47,
"grad_norm": 0.138369619846344,
"learning_rate": 0.001,
"loss": 0.0768,
"step": 47000
},
{
"epoch": 0.47,
"grad_norm": 0.2319127321243286,
"learning_rate": 0.001,
"loss": 0.0784,
"step": 47100
},
{
"epoch": 0.47,
"grad_norm": 0.2058788686990738,
"learning_rate": 0.001,
"loss": 0.0783,
"step": 47200
},
{
"epoch": 0.47,
"grad_norm": 0.21334126591682434,
"learning_rate": 0.001,
"loss": 0.0763,
"step": 47300
},
{
"epoch": 0.47,
"grad_norm": 0.23397529125213623,
"learning_rate": 0.001,
"loss": 0.081,
"step": 47400
},
{
"epoch": 0.47,
"grad_norm": 0.24460141360759735,
"learning_rate": 0.001,
"loss": 0.0752,
"step": 47500
},
{
"epoch": 0.48,
"grad_norm": 0.22441798448562622,
"learning_rate": 0.001,
"loss": 0.0779,
"step": 47600
},
{
"epoch": 0.48,
"grad_norm": 0.20988881587982178,
"learning_rate": 0.001,
"loss": 0.08,
"step": 47700
},
{
"epoch": 0.48,
"grad_norm": 0.17863024771213531,
"learning_rate": 0.001,
"loss": 0.0787,
"step": 47800
},
{
"epoch": 0.48,
"grad_norm": 0.17980898916721344,
"learning_rate": 0.001,
"loss": 0.0802,
"step": 47900
},
{
"epoch": 0.48,
"grad_norm": 0.2614147961139679,
"learning_rate": 0.001,
"loss": 0.0787,
"step": 48000
},
{
"epoch": 0.48,
"grad_norm": 0.16281504929065704,
"learning_rate": 0.001,
"loss": 0.0779,
"step": 48100
},
{
"epoch": 0.48,
"grad_norm": 0.3099921941757202,
"learning_rate": 0.001,
"loss": 0.0747,
"step": 48200
},
{
"epoch": 0.48,
"grad_norm": 0.2542015016078949,
"learning_rate": 0.001,
"loss": 0.0831,
"step": 48300
},
{
"epoch": 0.48,
"grad_norm": 0.17419801652431488,
"learning_rate": 0.001,
"loss": 0.0787,
"step": 48400
},
{
"epoch": 0.48,
"grad_norm": 0.2089216262102127,
"learning_rate": 0.001,
"loss": 0.0781,
"step": 48500
},
{
"epoch": 0.49,
"grad_norm": 0.26476818323135376,
"learning_rate": 0.001,
"loss": 0.0792,
"step": 48600
},
{
"epoch": 0.49,
"grad_norm": 0.18907053768634796,
"learning_rate": 0.001,
"loss": 0.078,
"step": 48700
},
{
"epoch": 0.49,
"grad_norm": 0.2528514564037323,
"learning_rate": 0.001,
"loss": 0.0791,
"step": 48800
},
{
"epoch": 0.49,
"grad_norm": 0.2794158458709717,
"learning_rate": 0.001,
"loss": 0.0799,
"step": 48900
},
{
"epoch": 0.49,
"grad_norm": 0.24547474086284637,
"learning_rate": 0.001,
"loss": 0.0765,
"step": 49000
},
{
"epoch": 0.49,
"grad_norm": 0.17239224910736084,
"learning_rate": 0.001,
"loss": 0.0807,
"step": 49100
},
{
"epoch": 0.49,
"grad_norm": 0.22998745739459991,
"learning_rate": 0.001,
"loss": 0.079,
"step": 49200
},
{
"epoch": 0.49,
"grad_norm": 0.2727990746498108,
"learning_rate": 0.001,
"loss": 0.078,
"step": 49300
},
{
"epoch": 0.49,
"grad_norm": 0.2488749623298645,
"learning_rate": 0.001,
"loss": 0.0757,
"step": 49400
},
{
"epoch": 0.49,
"grad_norm": 0.20260153710842133,
"learning_rate": 0.001,
"loss": 0.0787,
"step": 49500
},
{
"epoch": 0.5,
"grad_norm": 0.30832308530807495,
"learning_rate": 0.001,
"loss": 0.0789,
"step": 49600
},
{
"epoch": 0.5,
"grad_norm": 0.17934545874595642,
"learning_rate": 0.001,
"loss": 0.0768,
"step": 49700
},
{
"epoch": 0.5,
"grad_norm": 0.1972292810678482,
"learning_rate": 0.001,
"loss": 0.0786,
"step": 49800
},
{
"epoch": 0.5,
"grad_norm": 0.1899816393852234,
"learning_rate": 0.001,
"loss": 0.0782,
"step": 49900
},
{
"epoch": 0.5,
"grad_norm": 0.17765800654888153,
"learning_rate": 0.001,
"loss": 0.0784,
"step": 50000
},
{
"epoch": 0.5,
"grad_norm": 0.3285583555698395,
"learning_rate": 0.001,
"loss": 0.0793,
"step": 50100
},
{
"epoch": 0.5,
"grad_norm": 0.2769279181957245,
"learning_rate": 0.001,
"loss": 0.0818,
"step": 50200
},
{
"epoch": 0.5,
"grad_norm": 0.1661899834871292,
"learning_rate": 0.001,
"loss": 0.1088,
"step": 50300
},
{
"epoch": 0.5,
"grad_norm": 0.32694903016090393,
"learning_rate": 0.001,
"loss": 0.0799,
"step": 50400
},
{
"epoch": 0.5,
"grad_norm": 0.1976955235004425,
"learning_rate": 0.001,
"loss": 0.0768,
"step": 50500
},
{
"epoch": 0.51,
"grad_norm": 0.2623777687549591,
"learning_rate": 0.001,
"loss": 0.0764,
"step": 50600
},
{
"epoch": 0.51,
"grad_norm": 0.19917914271354675,
"learning_rate": 0.001,
"loss": 0.079,
"step": 50700
},
{
"epoch": 0.51,
"grad_norm": 0.22838640213012695,
"learning_rate": 0.001,
"loss": 0.076,
"step": 50800
},
{
"epoch": 0.51,
"grad_norm": 0.1831175684928894,
"learning_rate": 0.001,
"loss": 0.0744,
"step": 50900
},
{
"epoch": 0.51,
"grad_norm": 0.1774362176656723,
"learning_rate": 0.001,
"loss": 0.076,
"step": 51000
},
{
"epoch": 0.51,
"grad_norm": 0.24986374378204346,
"learning_rate": 0.001,
"loss": 0.0754,
"step": 51100
},
{
"epoch": 0.51,
"grad_norm": 0.15164266526699066,
"learning_rate": 0.001,
"loss": 0.0757,
"step": 51200
},
{
"epoch": 0.51,
"grad_norm": 0.19118934869766235,
"learning_rate": 0.001,
"loss": 0.0787,
"step": 51300
},
{
"epoch": 0.51,
"grad_norm": 0.1625840663909912,
"learning_rate": 0.001,
"loss": 0.0778,
"step": 51400
},
{
"epoch": 0.51,
"grad_norm": 0.14519533514976501,
"learning_rate": 0.001,
"loss": 0.077,
"step": 51500
},
{
"epoch": 0.52,
"grad_norm": 0.16799670457839966,
"learning_rate": 0.001,
"loss": 0.0764,
"step": 51600
},
{
"epoch": 0.52,
"grad_norm": 0.15635591745376587,
"learning_rate": 0.001,
"loss": 0.0738,
"step": 51700
},
{
"epoch": 0.52,
"grad_norm": 0.25875189900398254,
"learning_rate": 0.001,
"loss": 0.0757,
"step": 51800
},
{
"epoch": 0.52,
"grad_norm": 0.2601448595523834,
"learning_rate": 0.001,
"loss": 0.0721,
"step": 51900
},
{
"epoch": 0.52,
"grad_norm": 0.20097233355045319,
"learning_rate": 0.001,
"loss": 0.0764,
"step": 52000
},
{
"epoch": 0.52,
"grad_norm": 0.17383421957492828,
"learning_rate": 0.001,
"loss": 0.0768,
"step": 52100
},
{
"epoch": 0.52,
"grad_norm": 0.152663916349411,
"learning_rate": 0.001,
"loss": 0.0747,
"step": 52200
},
{
"epoch": 0.52,
"grad_norm": 0.1773347705602646,
"learning_rate": 0.001,
"loss": 0.0743,
"step": 52300
},
{
"epoch": 0.52,
"grad_norm": 0.15975210070610046,
"learning_rate": 0.001,
"loss": 0.0769,
"step": 52400
},
{
"epoch": 0.52,
"grad_norm": 0.27663958072662354,
"learning_rate": 0.001,
"loss": 0.0747,
"step": 52500
},
{
"epoch": 0.53,
"grad_norm": 0.20124509930610657,
"learning_rate": 0.001,
"loss": 0.0755,
"step": 52600
},
{
"epoch": 0.53,
"grad_norm": 0.19016942381858826,
"learning_rate": 0.001,
"loss": 0.0709,
"step": 52700
},
{
"epoch": 0.53,
"grad_norm": 0.34517988562583923,
"learning_rate": 0.001,
"loss": 0.0751,
"step": 52800
},
{
"epoch": 0.53,
"grad_norm": 0.27312055230140686,
"learning_rate": 0.001,
"loss": 0.0761,
"step": 52900
},
{
"epoch": 0.53,
"grad_norm": 0.2835043668746948,
"learning_rate": 0.001,
"loss": 0.0731,
"step": 53000
},
{
"epoch": 0.53,
"grad_norm": 0.1630600243806839,
"learning_rate": 0.001,
"loss": 0.0741,
"step": 53100
},
{
"epoch": 0.53,
"grad_norm": 0.2430613487958908,
"learning_rate": 0.001,
"loss": 0.0767,
"step": 53200
},
{
"epoch": 0.53,
"grad_norm": 0.19533057510852814,
"learning_rate": 0.001,
"loss": 0.077,
"step": 53300
},
{
"epoch": 0.53,
"grad_norm": 0.21139401197433472,
"learning_rate": 0.001,
"loss": 0.0711,
"step": 53400
},
{
"epoch": 0.53,
"grad_norm": 0.18416912853717804,
"learning_rate": 0.001,
"loss": 0.0729,
"step": 53500
},
{
"epoch": 0.54,
"grad_norm": 0.24703727662563324,
"learning_rate": 0.001,
"loss": 0.071,
"step": 53600
},
{
"epoch": 0.54,
"grad_norm": 0.14476247131824493,
"learning_rate": 0.001,
"loss": 0.0754,
"step": 53700
},
{
"epoch": 0.54,
"grad_norm": 0.210220068693161,
"learning_rate": 0.001,
"loss": 0.0738,
"step": 53800
},
{
"epoch": 0.54,
"grad_norm": 0.16544660925865173,
"learning_rate": 0.001,
"loss": 0.072,
"step": 53900
},
{
"epoch": 0.54,
"grad_norm": 0.17049700021743774,
"learning_rate": 0.001,
"loss": 0.0728,
"step": 54000
},
{
"epoch": 0.54,
"grad_norm": 0.18656505644321442,
"learning_rate": 0.001,
"loss": 0.0739,
"step": 54100
},
{
"epoch": 0.54,
"grad_norm": 0.19484791159629822,
"learning_rate": 0.001,
"loss": 0.0748,
"step": 54200
},
{
"epoch": 0.54,
"grad_norm": 0.1982715129852295,
"learning_rate": 0.001,
"loss": 0.0729,
"step": 54300
},
{
"epoch": 0.54,
"grad_norm": 0.2108699083328247,
"learning_rate": 0.001,
"loss": 0.0735,
"step": 54400
},
{
"epoch": 0.54,
"grad_norm": 0.23962444067001343,
"learning_rate": 0.001,
"loss": 0.0703,
"step": 54500
},
{
"epoch": 0.55,
"grad_norm": 0.29319801926612854,
"learning_rate": 0.001,
"loss": 0.0735,
"step": 54600
},
{
"epoch": 0.55,
"grad_norm": 0.1804085075855255,
"learning_rate": 0.001,
"loss": 0.0719,
"step": 54700
},
{
"epoch": 0.55,
"grad_norm": 0.2394474297761917,
"learning_rate": 0.001,
"loss": 0.0721,
"step": 54800
},
{
"epoch": 0.55,
"grad_norm": 0.20954197645187378,
"learning_rate": 0.001,
"loss": 0.0745,
"step": 54900
},
{
"epoch": 0.55,
"grad_norm": 0.17135080695152283,
"learning_rate": 0.001,
"loss": 0.0728,
"step": 55000
},
{
"epoch": 0.55,
"grad_norm": 0.3152260482311249,
"learning_rate": 0.001,
"loss": 0.0735,
"step": 55100
},
{
"epoch": 0.55,
"grad_norm": 0.22659769654273987,
"learning_rate": 0.001,
"loss": 0.0752,
"step": 55200
},
{
"epoch": 0.55,
"grad_norm": 0.2605753540992737,
"learning_rate": 0.001,
"loss": 0.073,
"step": 55300
},
{
"epoch": 0.55,
"grad_norm": 0.2309567779302597,
"learning_rate": 0.001,
"loss": 0.0744,
"step": 55400
},
{
"epoch": 0.55,
"grad_norm": 0.19917166233062744,
"learning_rate": 0.001,
"loss": 0.073,
"step": 55500
},
{
"epoch": 0.56,
"grad_norm": 0.2609159052371979,
"learning_rate": 0.001,
"loss": 0.0705,
"step": 55600
},
{
"epoch": 0.56,
"grad_norm": 0.26976123452186584,
"learning_rate": 0.001,
"loss": 0.0731,
"step": 55700
},
{
"epoch": 0.56,
"grad_norm": 0.25275784730911255,
"learning_rate": 0.001,
"loss": 0.0808,
"step": 55800
},
{
"epoch": 0.56,
"grad_norm": 0.2392340749502182,
"learning_rate": 0.001,
"loss": 0.0763,
"step": 55900
},
{
"epoch": 0.56,
"grad_norm": 0.27718254923820496,
"learning_rate": 0.001,
"loss": 0.0743,
"step": 56000
},
{
"epoch": 0.56,
"grad_norm": 0.19996067881584167,
"learning_rate": 0.001,
"loss": 0.0807,
"step": 56100
},
{
"epoch": 0.56,
"grad_norm": 0.16322393715381622,
"learning_rate": 0.001,
"loss": 0.0753,
"step": 56200
},
{
"epoch": 0.56,
"grad_norm": 0.25598809123039246,
"learning_rate": 0.001,
"loss": 0.0773,
"step": 56300
},
{
"epoch": 0.56,
"grad_norm": 0.15482768416404724,
"learning_rate": 0.001,
"loss": 0.0729,
"step": 56400
},
{
"epoch": 0.56,
"grad_norm": 0.4033351242542267,
"learning_rate": 0.001,
"loss": 0.0773,
"step": 56500
},
{
"epoch": 0.57,
"grad_norm": 0.2869590222835541,
"learning_rate": 0.001,
"loss": 0.0732,
"step": 56600
},
{
"epoch": 0.57,
"grad_norm": 0.19079795479774475,
"learning_rate": 0.001,
"loss": 0.0712,
"step": 56700
},
{
"epoch": 0.57,
"grad_norm": 0.21604031324386597,
"learning_rate": 0.001,
"loss": 0.0714,
"step": 56800
},
{
"epoch": 0.57,
"grad_norm": 0.23917321860790253,
"learning_rate": 0.001,
"loss": 0.0743,
"step": 56900
},
{
"epoch": 0.57,
"grad_norm": 0.16785088181495667,
"learning_rate": 0.001,
"loss": 0.0722,
"step": 57000
},
{
"epoch": 0.57,
"grad_norm": 0.22009502351284027,
"learning_rate": 0.001,
"loss": 0.0738,
"step": 57100
},
{
"epoch": 0.57,
"grad_norm": 0.23401811718940735,
"learning_rate": 0.001,
"loss": 0.0759,
"step": 57200
},
{
"epoch": 0.57,
"grad_norm": 0.19278208911418915,
"learning_rate": 0.001,
"loss": 0.0738,
"step": 57300
},
{
"epoch": 0.57,
"grad_norm": 0.22170820832252502,
"learning_rate": 0.001,
"loss": 0.07,
"step": 57400
},
{
"epoch": 0.57,
"grad_norm": 0.2148713767528534,
"learning_rate": 0.001,
"loss": 0.0716,
"step": 57500
},
{
"epoch": 0.58,
"grad_norm": 0.2093653529882431,
"learning_rate": 0.001,
"loss": 0.0722,
"step": 57600
},
{
"epoch": 0.58,
"grad_norm": 0.2912674844264984,
"learning_rate": 0.001,
"loss": 0.0738,
"step": 57700
},
{
"epoch": 0.58,
"grad_norm": 0.3146283030509949,
"learning_rate": 0.001,
"loss": 0.0735,
"step": 57800
},
{
"epoch": 0.58,
"grad_norm": 0.2355007380247116,
"learning_rate": 0.001,
"loss": 0.0719,
"step": 57900
},
{
"epoch": 0.58,
"grad_norm": 0.19035007059574127,
"learning_rate": 0.001,
"loss": 0.0699,
"step": 58000
},
{
"epoch": 0.58,
"grad_norm": 0.13338258862495422,
"learning_rate": 0.001,
"loss": 0.0727,
"step": 58100
},
{
"epoch": 0.58,
"grad_norm": 0.22755542397499084,
"learning_rate": 0.001,
"loss": 0.072,
"step": 58200
},
{
"epoch": 0.58,
"grad_norm": 0.23752057552337646,
"learning_rate": 0.001,
"loss": 0.0703,
"step": 58300
},
{
"epoch": 0.58,
"grad_norm": 0.20008322596549988,
"learning_rate": 0.001,
"loss": 0.0721,
"step": 58400
},
{
"epoch": 0.58,
"grad_norm": 0.1769803911447525,
"learning_rate": 0.001,
"loss": 0.0724,
"step": 58500
},
{
"epoch": 0.59,
"grad_norm": 0.19137178361415863,
"learning_rate": 0.001,
"loss": 0.0735,
"step": 58600
},
{
"epoch": 0.59,
"grad_norm": 0.22157849371433258,
"learning_rate": 0.001,
"loss": 0.0735,
"step": 58700
},
{
"epoch": 0.59,
"grad_norm": 0.2098543494939804,
"learning_rate": 0.001,
"loss": 0.0701,
"step": 58800
},
{
"epoch": 0.59,
"grad_norm": 0.22936704754829407,
"learning_rate": 0.001,
"loss": 0.0691,
"step": 58900
},
{
"epoch": 0.59,
"grad_norm": 0.15228866040706635,
"learning_rate": 0.001,
"loss": 0.0729,
"step": 59000
},
{
"epoch": 0.59,
"grad_norm": 0.27094388008117676,
"learning_rate": 0.001,
"loss": 0.0706,
"step": 59100
},
{
"epoch": 0.59,
"grad_norm": 0.17357999086380005,
"learning_rate": 0.001,
"loss": 0.071,
"step": 59200
},
{
"epoch": 0.59,
"grad_norm": 0.2912188768386841,
"learning_rate": 0.001,
"loss": 0.0719,
"step": 59300
},
{
"epoch": 0.59,
"grad_norm": 0.24029956758022308,
"learning_rate": 0.001,
"loss": 0.07,
"step": 59400
},
{
"epoch": 0.59,
"grad_norm": 0.1956549882888794,
"learning_rate": 0.001,
"loss": 0.0712,
"step": 59500
},
{
"epoch": 0.6,
"grad_norm": 0.26984256505966187,
"learning_rate": 0.001,
"loss": 0.0713,
"step": 59600
},
{
"epoch": 0.6,
"grad_norm": 0.18548165261745453,
"learning_rate": 0.001,
"loss": 0.0686,
"step": 59700
},
{
"epoch": 0.6,
"grad_norm": 0.1833103895187378,
"learning_rate": 0.001,
"loss": 0.0672,
"step": 59800
},
{
"epoch": 0.6,
"grad_norm": 0.20417752861976624,
"learning_rate": 0.001,
"loss": 0.069,
"step": 59900
},
{
"epoch": 0.6,
"grad_norm": 0.3695315420627594,
"learning_rate": 0.001,
"loss": 0.0703,
"step": 60000
},
{
"epoch": 0.6,
"grad_norm": 0.23288464546203613,
"learning_rate": 0.001,
"loss": 0.0704,
"step": 60100
},
{
"epoch": 0.6,
"grad_norm": 0.21595774590969086,
"learning_rate": 0.001,
"loss": 0.0697,
"step": 60200
},
{
"epoch": 0.6,
"grad_norm": 0.16371206939220428,
"learning_rate": 0.001,
"loss": 0.0704,
"step": 60300
},
{
"epoch": 0.6,
"grad_norm": 0.2600916028022766,
"learning_rate": 0.001,
"loss": 0.0693,
"step": 60400
},
{
"epoch": 0.6,
"grad_norm": 0.21177971363067627,
"learning_rate": 0.001,
"loss": 0.0707,
"step": 60500
},
{
"epoch": 0.61,
"grad_norm": 0.16886168718338013,
"learning_rate": 0.001,
"loss": 0.0701,
"step": 60600
},
{
"epoch": 0.61,
"grad_norm": 0.29835718870162964,
"learning_rate": 0.001,
"loss": 0.0683,
"step": 60700
},
{
"epoch": 0.61,
"grad_norm": 0.2594737410545349,
"learning_rate": 0.001,
"loss": 0.0723,
"step": 60800
},
{
"epoch": 0.61,
"grad_norm": 0.2057715505361557,
"learning_rate": 0.001,
"loss": 0.0693,
"step": 60900
},
{
"epoch": 0.61,
"grad_norm": 0.2127043902873993,
"learning_rate": 0.001,
"loss": 0.0699,
"step": 61000
},
{
"epoch": 0.61,
"grad_norm": 0.18162322044372559,
"learning_rate": 0.001,
"loss": 0.0714,
"step": 61100
},
{
"epoch": 0.61,
"grad_norm": 0.21535515785217285,
"learning_rate": 0.001,
"loss": 0.0711,
"step": 61200
},
{
"epoch": 0.61,
"grad_norm": 0.19364242255687714,
"learning_rate": 0.001,
"loss": 0.0715,
"step": 61300
},
{
"epoch": 0.61,
"grad_norm": 0.14159826934337616,
"learning_rate": 0.001,
"loss": 0.07,
"step": 61400
},
{
"epoch": 0.61,
"grad_norm": 0.21536406874656677,
"learning_rate": 0.001,
"loss": 0.0689,
"step": 61500
},
{
"epoch": 0.62,
"grad_norm": 0.19926196336746216,
"learning_rate": 0.001,
"loss": 0.0689,
"step": 61600
},
{
"epoch": 0.62,
"grad_norm": 0.20217150449752808,
"learning_rate": 0.001,
"loss": 0.071,
"step": 61700
},
{
"epoch": 0.62,
"grad_norm": 0.17570650577545166,
"learning_rate": 0.001,
"loss": 0.0719,
"step": 61800
},
{
"epoch": 0.62,
"grad_norm": 0.19788751006126404,
"learning_rate": 0.001,
"loss": 0.0687,
"step": 61900
},
{
"epoch": 0.62,
"grad_norm": 0.22191910445690155,
"learning_rate": 0.001,
"loss": 0.0687,
"step": 62000
},
{
"epoch": 0.62,
"grad_norm": 0.19544494152069092,
"learning_rate": 0.001,
"loss": 0.0704,
"step": 62100
},
{
"epoch": 0.62,
"grad_norm": 0.32939237356185913,
"learning_rate": 0.001,
"loss": 0.0713,
"step": 62200
},
{
"epoch": 0.62,
"grad_norm": 0.1809149980545044,
"learning_rate": 0.001,
"loss": 0.0701,
"step": 62300
},
{
"epoch": 0.62,
"grad_norm": 0.2769867479801178,
"learning_rate": 0.001,
"loss": 0.0718,
"step": 62400
},
{
"epoch": 0.62,
"grad_norm": 0.15998759865760803,
"learning_rate": 0.001,
"loss": 0.0691,
"step": 62500
},
{
"epoch": 0.63,
"grad_norm": 0.29498517513275146,
"learning_rate": 0.001,
"loss": 0.0722,
"step": 62600
},
{
"epoch": 0.63,
"grad_norm": 0.19759228825569153,
"learning_rate": 0.001,
"loss": 0.0686,
"step": 62700
},
{
"epoch": 0.63,
"grad_norm": 0.12064652889966965,
"learning_rate": 0.001,
"loss": 0.0707,
"step": 62800
},
{
"epoch": 0.63,
"grad_norm": 0.19079501926898956,
"learning_rate": 0.001,
"loss": 0.0662,
"step": 62900
},
{
"epoch": 0.63,
"grad_norm": 0.22422794997692108,
"learning_rate": 0.001,
"loss": 0.0662,
"step": 63000
},
{
"epoch": 0.63,
"grad_norm": 0.16929177939891815,
"learning_rate": 0.001,
"loss": 0.0677,
"step": 63100
},
{
"epoch": 0.63,
"grad_norm": 0.20057950913906097,
"learning_rate": 0.001,
"loss": 0.0699,
"step": 63200
},
{
"epoch": 0.63,
"grad_norm": 0.4213920533657074,
"learning_rate": 0.001,
"loss": 0.0701,
"step": 63300
},
{
"epoch": 0.63,
"grad_norm": 0.28028371930122375,
"learning_rate": 0.001,
"loss": 0.0697,
"step": 63400
},
{
"epoch": 0.63,
"grad_norm": 0.18094098567962646,
"learning_rate": 0.001,
"loss": 0.0727,
"step": 63500
},
{
"epoch": 0.64,
"grad_norm": 0.30136585235595703,
"learning_rate": 0.001,
"loss": 0.0711,
"step": 63600
},
{
"epoch": 0.64,
"grad_norm": 0.192775696516037,
"learning_rate": 0.001,
"loss": 0.0721,
"step": 63700
},
{
"epoch": 0.64,
"grad_norm": 0.2211129367351532,
"learning_rate": 0.001,
"loss": 0.0695,
"step": 63800
},
{
"epoch": 0.64,
"grad_norm": 0.19226811826229095,
"learning_rate": 0.001,
"loss": 0.0699,
"step": 63900
},
{
"epoch": 0.64,
"grad_norm": 0.2471201866865158,
"learning_rate": 0.001,
"loss": 0.0692,
"step": 64000
},
{
"epoch": 0.64,
"grad_norm": 0.2547115385532379,
"learning_rate": 0.001,
"loss": 0.0673,
"step": 64100
},
{
"epoch": 0.64,
"grad_norm": 0.1899893879890442,
"learning_rate": 0.001,
"loss": 0.0693,
"step": 64200
},
{
"epoch": 0.64,
"grad_norm": 0.21257919073104858,
"learning_rate": 0.001,
"loss": 0.0684,
"step": 64300
},
{
"epoch": 0.64,
"grad_norm": 0.26688677072525024,
"learning_rate": 0.001,
"loss": 0.0683,
"step": 64400
},
{
"epoch": 0.64,
"grad_norm": 0.18874968588352203,
"learning_rate": 0.001,
"loss": 0.0688,
"step": 64500
},
{
"epoch": 0.65,
"grad_norm": 0.2013721913099289,
"learning_rate": 0.001,
"loss": 0.0684,
"step": 64600
},
{
"epoch": 0.65,
"grad_norm": 0.19745351374149323,
"learning_rate": 0.001,
"loss": 0.0685,
"step": 64700
},
{
"epoch": 0.65,
"grad_norm": 0.2137337028980255,
"learning_rate": 0.001,
"loss": 0.0671,
"step": 64800
},
{
"epoch": 0.65,
"grad_norm": 0.20300865173339844,
"learning_rate": 0.001,
"loss": 0.0684,
"step": 64900
},
{
"epoch": 0.65,
"grad_norm": 0.1723690927028656,
"learning_rate": 0.001,
"loss": 0.0681,
"step": 65000
},
{
"epoch": 0.65,
"grad_norm": 0.20693708956241608,
"learning_rate": 0.001,
"loss": 0.0685,
"step": 65100
},
{
"epoch": 0.65,
"grad_norm": 0.33531713485717773,
"learning_rate": 0.001,
"loss": 0.0687,
"step": 65200
},
{
"epoch": 0.65,
"grad_norm": 0.2180265337228775,
"learning_rate": 0.001,
"loss": 0.0719,
"step": 65300
},
{
"epoch": 0.65,
"grad_norm": 0.27855604887008667,
"learning_rate": 0.001,
"loss": 0.0686,
"step": 65400
},
{
"epoch": 0.65,
"grad_norm": 0.2309376448392868,
"learning_rate": 0.001,
"loss": 0.0682,
"step": 65500
},
{
"epoch": 0.66,
"grad_norm": 0.25525444746017456,
"learning_rate": 0.001,
"loss": 0.0698,
"step": 65600
},
{
"epoch": 0.66,
"grad_norm": 0.1746407151222229,
"learning_rate": 0.001,
"loss": 0.0692,
"step": 65700
},
{
"epoch": 0.66,
"grad_norm": 0.29511937499046326,
"learning_rate": 0.001,
"loss": 0.0675,
"step": 65800
},
{
"epoch": 0.66,
"grad_norm": 0.23610210418701172,
"learning_rate": 0.001,
"loss": 0.0682,
"step": 65900
},
{
"epoch": 0.66,
"grad_norm": 0.24088448286056519,
"learning_rate": 0.001,
"loss": 0.065,
"step": 66000
},
{
"epoch": 0.66,
"grad_norm": 0.3865065574645996,
"learning_rate": 0.001,
"loss": 0.068,
"step": 66100
},
{
"epoch": 0.66,
"grad_norm": 0.16312183439731598,
"learning_rate": 0.001,
"loss": 0.0674,
"step": 66200
},
{
"epoch": 0.66,
"grad_norm": 0.33910611271858215,
"learning_rate": 0.001,
"loss": 0.0657,
"step": 66300
},
{
"epoch": 0.66,
"grad_norm": 0.1491781622171402,
"learning_rate": 0.001,
"loss": 0.0663,
"step": 66400
},
{
"epoch": 0.66,
"grad_norm": 0.27082210779190063,
"learning_rate": 0.001,
"loss": 0.0692,
"step": 66500
},
{
"epoch": 0.67,
"grad_norm": 0.302495539188385,
"learning_rate": 0.001,
"loss": 0.0668,
"step": 66600
},
{
"epoch": 0.67,
"grad_norm": 0.1906341165304184,
"learning_rate": 0.001,
"loss": 0.0689,
"step": 66700
},
{
"epoch": 0.67,
"grad_norm": 0.21256040036678314,
"learning_rate": 0.001,
"loss": 0.0665,
"step": 66800
},
{
"epoch": 0.67,
"grad_norm": 0.16603924334049225,
"learning_rate": 0.001,
"loss": 0.07,
"step": 66900
},
{
"epoch": 0.67,
"grad_norm": 0.17136050760746002,
"learning_rate": 0.001,
"loss": 0.0715,
"step": 67000
},
{
"epoch": 0.67,
"grad_norm": 0.1679474115371704,
"learning_rate": 0.001,
"loss": 0.0667,
"step": 67100
},
{
"epoch": 0.67,
"grad_norm": 0.18445661664009094,
"learning_rate": 0.001,
"loss": 0.0688,
"step": 67200
},
{
"epoch": 0.67,
"grad_norm": 0.16743460297584534,
"learning_rate": 0.001,
"loss": 0.0672,
"step": 67300
},
{
"epoch": 0.67,
"grad_norm": 0.24309833347797394,
"learning_rate": 0.001,
"loss": 0.066,
"step": 67400
},
{
"epoch": 0.67,
"grad_norm": 0.15661662817001343,
"learning_rate": 0.001,
"loss": 0.0686,
"step": 67500
},
{
"epoch": 0.68,
"grad_norm": 0.32759585976600647,
"learning_rate": 0.001,
"loss": 0.0666,
"step": 67600
},
{
"epoch": 0.68,
"grad_norm": 0.1508253961801529,
"learning_rate": 0.001,
"loss": 0.068,
"step": 67700
},
{
"epoch": 0.68,
"grad_norm": 0.17459799349308014,
"learning_rate": 0.001,
"loss": 0.069,
"step": 67800
},
{
"epoch": 0.68,
"grad_norm": 0.2405272275209427,
"learning_rate": 0.001,
"loss": 0.0693,
"step": 67900
},
{
"epoch": 0.68,
"grad_norm": 0.2469649761915207,
"learning_rate": 0.001,
"loss": 0.0678,
"step": 68000
},
{
"epoch": 0.68,
"grad_norm": 0.25917258858680725,
"learning_rate": 0.001,
"loss": 0.0694,
"step": 68100
},
{
"epoch": 0.68,
"grad_norm": 0.1784822642803192,
"learning_rate": 0.001,
"loss": 0.0668,
"step": 68200
},
{
"epoch": 0.68,
"grad_norm": 0.22977730631828308,
"learning_rate": 0.001,
"loss": 0.0656,
"step": 68300
},
{
"epoch": 0.68,
"grad_norm": 0.1646946221590042,
"learning_rate": 0.001,
"loss": 0.068,
"step": 68400
},
{
"epoch": 0.68,
"grad_norm": 0.3220691978931427,
"learning_rate": 0.001,
"loss": 0.0665,
"step": 68500
},
{
"epoch": 0.69,
"grad_norm": 0.22109118103981018,
"learning_rate": 0.001,
"loss": 0.0684,
"step": 68600
},
{
"epoch": 0.69,
"grad_norm": 0.12051670998334885,
"learning_rate": 0.001,
"loss": 0.0675,
"step": 68700
},
{
"epoch": 0.69,
"grad_norm": 0.19576141238212585,
"learning_rate": 0.001,
"loss": 0.0655,
"step": 68800
},
{
"epoch": 0.69,
"grad_norm": 0.12783344089984894,
"learning_rate": 0.001,
"loss": 0.0677,
"step": 68900
},
{
"epoch": 0.69,
"grad_norm": 0.24854913353919983,
"learning_rate": 0.001,
"loss": 0.0684,
"step": 69000
},
{
"epoch": 0.69,
"grad_norm": 0.19816453754901886,
"learning_rate": 0.001,
"loss": 0.067,
"step": 69100
},
{
"epoch": 0.69,
"grad_norm": 0.20371900498867035,
"learning_rate": 0.001,
"loss": 0.0669,
"step": 69200
},
{
"epoch": 0.69,
"grad_norm": 0.24654364585876465,
"learning_rate": 0.001,
"loss": 0.0665,
"step": 69300
},
{
"epoch": 0.69,
"grad_norm": 0.22933346033096313,
"learning_rate": 0.001,
"loss": 0.0697,
"step": 69400
},
{
"epoch": 0.69,
"grad_norm": 0.3056330382823944,
"learning_rate": 0.001,
"loss": 0.0688,
"step": 69500
},
{
"epoch": 0.7,
"grad_norm": 0.14624419808387756,
"learning_rate": 0.001,
"loss": 0.0686,
"step": 69600
},
{
"epoch": 0.7,
"grad_norm": 0.23571297526359558,
"learning_rate": 0.001,
"loss": 0.0727,
"step": 69700
},
{
"epoch": 0.7,
"grad_norm": 0.20212960243225098,
"learning_rate": 0.001,
"loss": 0.0708,
"step": 69800
},
{
"epoch": 0.7,
"grad_norm": 0.22400203347206116,
"learning_rate": 0.001,
"loss": 0.0645,
"step": 69900
},
{
"epoch": 0.7,
"grad_norm": 0.15693353116512299,
"learning_rate": 0.001,
"loss": 0.066,
"step": 70000
},
{
"epoch": 0.7,
"grad_norm": 0.21171632409095764,
"learning_rate": 0.001,
"loss": 0.0651,
"step": 70100
},
{
"epoch": 0.7,
"grad_norm": 0.16716106235980988,
"learning_rate": 0.001,
"loss": 0.0651,
"step": 70200
},
{
"epoch": 0.7,
"grad_norm": 0.19692525267601013,
"learning_rate": 0.001,
"loss": 0.0677,
"step": 70300
},
{
"epoch": 0.7,
"grad_norm": 0.23514828085899353,
"learning_rate": 0.001,
"loss": 0.0651,
"step": 70400
},
{
"epoch": 0.7,
"grad_norm": 0.22567568719387054,
"learning_rate": 0.001,
"loss": 0.0658,
"step": 70500
},
{
"epoch": 0.71,
"grad_norm": 0.20934154093265533,
"learning_rate": 0.001,
"loss": 0.0661,
"step": 70600
},
{
"epoch": 0.71,
"grad_norm": 0.25384077429771423,
"learning_rate": 0.001,
"loss": 0.0658,
"step": 70700
},
{
"epoch": 0.71,
"grad_norm": 0.27204346656799316,
"learning_rate": 0.001,
"loss": 0.0685,
"step": 70800
},
{
"epoch": 0.71,
"grad_norm": 0.1900806725025177,
"learning_rate": 0.001,
"loss": 0.0637,
"step": 70900
},
{
"epoch": 0.71,
"grad_norm": 0.4064619243144989,
"learning_rate": 0.001,
"loss": 0.07,
"step": 71000
},
{
"epoch": 0.71,
"grad_norm": 0.22942863404750824,
"learning_rate": 0.001,
"loss": 0.067,
"step": 71100
},
{
"epoch": 0.71,
"grad_norm": 0.3398168683052063,
"learning_rate": 0.001,
"loss": 0.0673,
"step": 71200
},
{
"epoch": 0.71,
"grad_norm": 0.2937333881855011,
"learning_rate": 0.001,
"loss": 0.0689,
"step": 71300
},
{
"epoch": 0.71,
"grad_norm": 0.15955261886119843,
"learning_rate": 0.001,
"loss": 0.0644,
"step": 71400
},
{
"epoch": 0.71,
"grad_norm": 0.32867005467414856,
"learning_rate": 0.001,
"loss": 0.0668,
"step": 71500
},
{
"epoch": 0.72,
"grad_norm": 0.22879061102867126,
"learning_rate": 0.001,
"loss": 0.0641,
"step": 71600
},
{
"epoch": 0.72,
"grad_norm": 0.3147716224193573,
"learning_rate": 0.001,
"loss": 0.0643,
"step": 71700
},
{
"epoch": 0.72,
"grad_norm": 0.19312891364097595,
"learning_rate": 0.001,
"loss": 0.0654,
"step": 71800
},
{
"epoch": 0.72,
"grad_norm": 0.3658990263938904,
"learning_rate": 0.001,
"loss": 0.066,
"step": 71900
},
{
"epoch": 0.72,
"grad_norm": 0.2730260193347931,
"learning_rate": 0.001,
"loss": 0.0673,
"step": 72000
},
{
"epoch": 0.72,
"grad_norm": 0.3601909279823303,
"learning_rate": 0.001,
"loss": 0.0643,
"step": 72100
},
{
"epoch": 0.72,
"grad_norm": 0.13944287598133087,
"learning_rate": 0.001,
"loss": 0.0671,
"step": 72200
},
{
"epoch": 0.72,
"grad_norm": 0.1590428501367569,
"learning_rate": 0.001,
"loss": 0.0651,
"step": 72300
},
{
"epoch": 0.72,
"grad_norm": 0.17583294212818146,
"learning_rate": 0.001,
"loss": 0.0665,
"step": 72400
},
{
"epoch": 0.72,
"grad_norm": 0.1566411554813385,
"learning_rate": 0.001,
"loss": 0.0666,
"step": 72500
},
{
"epoch": 0.73,
"grad_norm": 0.26495423913002014,
"learning_rate": 0.001,
"loss": 0.0651,
"step": 72600
},
{
"epoch": 0.73,
"grad_norm": 0.17272372543811798,
"learning_rate": 0.001,
"loss": 0.0689,
"step": 72700
},
{
"epoch": 0.73,
"grad_norm": 0.2443661093711853,
"learning_rate": 0.001,
"loss": 0.065,
"step": 72800
},
{
"epoch": 0.73,
"grad_norm": 0.26695558428764343,
"learning_rate": 0.001,
"loss": 0.0637,
"step": 72900
},
{
"epoch": 0.73,
"grad_norm": 0.14408937096595764,
"learning_rate": 0.001,
"loss": 0.0676,
"step": 73000
},
{
"epoch": 0.73,
"grad_norm": 0.18142744898796082,
"learning_rate": 0.001,
"loss": 0.0653,
"step": 73100
},
{
"epoch": 0.73,
"grad_norm": 0.17100819945335388,
"learning_rate": 0.001,
"loss": 0.0631,
"step": 73200
},
{
"epoch": 0.73,
"grad_norm": 0.3703427314758301,
"learning_rate": 0.001,
"loss": 0.0665,
"step": 73300
},
{
"epoch": 0.73,
"grad_norm": 0.19516532123088837,
"learning_rate": 0.001,
"loss": 0.0656,
"step": 73400
},
{
"epoch": 0.73,
"grad_norm": 0.17610041797161102,
"learning_rate": 0.001,
"loss": 0.0658,
"step": 73500
},
{
"epoch": 0.74,
"grad_norm": 0.13331599533557892,
"learning_rate": 0.001,
"loss": 0.0653,
"step": 73600
},
{
"epoch": 0.74,
"grad_norm": 0.23824097216129303,
"learning_rate": 0.001,
"loss": 0.065,
"step": 73700
},
{
"epoch": 0.74,
"grad_norm": 0.1464979499578476,
"learning_rate": 0.001,
"loss": 0.0638,
"step": 73800
},
{
"epoch": 0.74,
"grad_norm": 0.18163511157035828,
"learning_rate": 0.001,
"loss": 0.0661,
"step": 73900
},
{
"epoch": 0.74,
"grad_norm": 0.1809806078672409,
"learning_rate": 0.001,
"loss": 0.0643,
"step": 74000
},
{
"epoch": 0.74,
"grad_norm": 0.23994535207748413,
"learning_rate": 0.001,
"loss": 0.0636,
"step": 74100
},
{
"epoch": 0.74,
"grad_norm": 0.17924870550632477,
"learning_rate": 0.001,
"loss": 0.064,
"step": 74200
},
{
"epoch": 0.74,
"grad_norm": 0.15770521759986877,
"learning_rate": 0.001,
"loss": 0.0661,
"step": 74300
},
{
"epoch": 0.74,
"grad_norm": 0.24632355570793152,
"learning_rate": 0.001,
"loss": 0.0644,
"step": 74400
},
{
"epoch": 0.74,
"grad_norm": 0.18300195038318634,
"learning_rate": 0.001,
"loss": 0.0592,
"step": 74500
},
{
"epoch": 0.75,
"grad_norm": 0.2745151221752167,
"learning_rate": 0.001,
"loss": 0.063,
"step": 74600
},
{
"epoch": 0.75,
"grad_norm": 0.18871140480041504,
"learning_rate": 0.001,
"loss": 0.063,
"step": 74700
},
{
"epoch": 0.75,
"grad_norm": 0.30228421092033386,
"learning_rate": 0.001,
"loss": 0.0661,
"step": 74800
},
{
"epoch": 0.75,
"grad_norm": 0.26834210753440857,
"learning_rate": 0.001,
"loss": 0.0626,
"step": 74900
},
{
"epoch": 0.75,
"grad_norm": 0.1998053640127182,
"learning_rate": 0.001,
"loss": 0.0655,
"step": 75000
},
{
"epoch": 0.75,
"grad_norm": 0.16265703737735748,
"learning_rate": 0.001,
"loss": 0.0648,
"step": 75100
},
{
"epoch": 0.75,
"grad_norm": 0.3203764259815216,
"learning_rate": 0.001,
"loss": 0.0636,
"step": 75200
},
{
"epoch": 0.75,
"grad_norm": 0.29416751861572266,
"learning_rate": 0.001,
"loss": 0.0613,
"step": 75300
},
{
"epoch": 0.75,
"grad_norm": 0.1761980801820755,
"learning_rate": 0.001,
"loss": 0.0718,
"step": 75400
},
{
"epoch": 0.75,
"grad_norm": 0.24760745465755463,
"learning_rate": 0.001,
"loss": 0.0641,
"step": 75500
},
{
"epoch": 0.76,
"grad_norm": 0.3362966477870941,
"learning_rate": 0.001,
"loss": 0.0678,
"step": 75600
},
{
"epoch": 0.76,
"grad_norm": 0.20644457638263702,
"learning_rate": 0.001,
"loss": 0.0653,
"step": 75700
},
{
"epoch": 0.76,
"grad_norm": 0.22632303833961487,
"learning_rate": 0.001,
"loss": 0.0679,
"step": 75800
},
{
"epoch": 0.76,
"grad_norm": 0.22177743911743164,
"learning_rate": 0.001,
"loss": 0.0628,
"step": 75900
},
{
"epoch": 0.76,
"grad_norm": 0.9697771072387695,
"learning_rate": 0.001,
"loss": 0.0659,
"step": 76000
},
{
"epoch": 0.76,
"grad_norm": 0.21862226724624634,
"learning_rate": 0.001,
"loss": 0.0654,
"step": 76100
},
{
"epoch": 0.76,
"grad_norm": 0.27506422996520996,
"learning_rate": 0.001,
"loss": 0.0636,
"step": 76200
},
{
"epoch": 0.76,
"grad_norm": 0.4953247606754303,
"learning_rate": 0.001,
"loss": 0.0648,
"step": 76300
},
{
"epoch": 0.76,
"grad_norm": 0.44132623076438904,
"learning_rate": 0.001,
"loss": 0.0641,
"step": 76400
},
{
"epoch": 0.76,
"grad_norm": 0.28104710578918457,
"learning_rate": 0.001,
"loss": 0.0623,
"step": 76500
},
{
"epoch": 0.77,
"grad_norm": 0.270434707403183,
"learning_rate": 0.001,
"loss": 0.0642,
"step": 76600
},
{
"epoch": 0.77,
"grad_norm": 0.17920733988285065,
"learning_rate": 0.001,
"loss": 0.0641,
"step": 76700
},
{
"epoch": 0.77,
"grad_norm": 0.27689895033836365,
"learning_rate": 0.001,
"loss": 0.0645,
"step": 76800
},
{
"epoch": 0.77,
"grad_norm": 0.22936861217021942,
"learning_rate": 0.001,
"loss": 0.0625,
"step": 76900
},
{
"epoch": 0.77,
"grad_norm": 0.2662585973739624,
"learning_rate": 0.001,
"loss": 0.0671,
"step": 77000
},
{
"epoch": 0.77,
"grad_norm": 0.23035678267478943,
"learning_rate": 0.001,
"loss": 0.0622,
"step": 77100
},
{
"epoch": 0.77,
"grad_norm": 0.19333815574645996,
"learning_rate": 0.001,
"loss": 0.0655,
"step": 77200
},
{
"epoch": 0.77,
"grad_norm": 0.2870350182056427,
"learning_rate": 0.001,
"loss": 0.0634,
"step": 77300
},
{
"epoch": 0.77,
"grad_norm": 0.22997340559959412,
"learning_rate": 0.001,
"loss": 0.0676,
"step": 77400
},
{
"epoch": 0.77,
"grad_norm": 0.19435285031795502,
"learning_rate": 0.001,
"loss": 0.0655,
"step": 77500
},
{
"epoch": 0.78,
"grad_norm": 0.2826205790042877,
"learning_rate": 0.001,
"loss": 0.0635,
"step": 77600
},
{
"epoch": 0.78,
"grad_norm": 0.20007766783237457,
"learning_rate": 0.001,
"loss": 0.0617,
"step": 77700
},
{
"epoch": 0.78,
"grad_norm": 0.15860234200954437,
"learning_rate": 0.001,
"loss": 0.0657,
"step": 77800
},
{
"epoch": 0.78,
"grad_norm": 0.40526214241981506,
"learning_rate": 0.001,
"loss": 0.0649,
"step": 77900
},
{
"epoch": 0.78,
"grad_norm": 0.24454933404922485,
"learning_rate": 0.001,
"loss": 0.0634,
"step": 78000
},
{
"epoch": 0.78,
"grad_norm": 0.12802359461784363,
"learning_rate": 0.001,
"loss": 0.0635,
"step": 78100
},
{
"epoch": 0.78,
"grad_norm": 0.32250648736953735,
"learning_rate": 0.001,
"loss": 0.0648,
"step": 78200
},
{
"epoch": 0.78,
"grad_norm": 0.253478467464447,
"learning_rate": 0.001,
"loss": 0.0648,
"step": 78300
},
{
"epoch": 0.78,
"grad_norm": 0.25307029485702515,
"learning_rate": 0.001,
"loss": 0.0648,
"step": 78400
},
{
"epoch": 0.78,
"grad_norm": 0.19091230630874634,
"learning_rate": 0.001,
"loss": 0.065,
"step": 78500
},
{
"epoch": 0.79,
"grad_norm": 0.17312967777252197,
"learning_rate": 0.001,
"loss": 0.0624,
"step": 78600
},
{
"epoch": 0.79,
"grad_norm": 0.19466041028499603,
"learning_rate": 0.001,
"loss": 0.0622,
"step": 78700
},
{
"epoch": 0.79,
"grad_norm": 0.25837138295173645,
"learning_rate": 0.001,
"loss": 0.0641,
"step": 78800
},
{
"epoch": 0.79,
"grad_norm": 0.1573166698217392,
"learning_rate": 0.001,
"loss": 0.0645,
"step": 78900
},
{
"epoch": 0.79,
"grad_norm": 0.1644609123468399,
"learning_rate": 0.001,
"loss": 0.0644,
"step": 79000
},
{
"epoch": 0.79,
"grad_norm": 0.20255005359649658,
"learning_rate": 0.001,
"loss": 0.0647,
"step": 79100
},
{
"epoch": 0.79,
"grad_norm": 0.48706310987472534,
"learning_rate": 0.001,
"loss": 0.0642,
"step": 79200
},
{
"epoch": 0.79,
"grad_norm": 0.3525262176990509,
"learning_rate": 0.001,
"loss": 0.0639,
"step": 79300
},
{
"epoch": 0.79,
"grad_norm": 0.20806559920310974,
"learning_rate": 0.001,
"loss": 0.0639,
"step": 79400
},
{
"epoch": 0.79,
"grad_norm": 0.441980242729187,
"learning_rate": 0.001,
"loss": 0.0645,
"step": 79500
},
{
"epoch": 0.8,
"grad_norm": 0.16818083822727203,
"learning_rate": 0.001,
"loss": 0.0625,
"step": 79600
},
{
"epoch": 0.8,
"grad_norm": 0.1843559443950653,
"learning_rate": 0.001,
"loss": 0.064,
"step": 79700
},
{
"epoch": 0.8,
"grad_norm": 0.19608129560947418,
"learning_rate": 0.001,
"loss": 0.0634,
"step": 79800
},
{
"epoch": 0.8,
"grad_norm": 0.34710460901260376,
"learning_rate": 0.001,
"loss": 0.0626,
"step": 79900
},
{
"epoch": 0.8,
"grad_norm": 0.4062146842479706,
"learning_rate": 0.001,
"loss": 0.0637,
"step": 80000
},
{
"epoch": 0.8,
"grad_norm": 0.23054763674736023,
"learning_rate": 0.001,
"loss": 0.0629,
"step": 80100
},
{
"epoch": 0.8,
"grad_norm": 0.20241042971611023,
"learning_rate": 0.001,
"loss": 0.0632,
"step": 80200
},
{
"epoch": 0.8,
"grad_norm": 0.17540830373764038,
"learning_rate": 0.001,
"loss": 0.0645,
"step": 80300
},
{
"epoch": 0.8,
"grad_norm": 0.2995645999908447,
"learning_rate": 0.001,
"loss": 0.0619,
"step": 80400
},
{
"epoch": 0.8,
"grad_norm": 0.2701890766620636,
"learning_rate": 0.001,
"loss": 0.0624,
"step": 80500
},
{
"epoch": 0.81,
"grad_norm": 0.5655909180641174,
"learning_rate": 0.001,
"loss": 0.0637,
"step": 80600
},
{
"epoch": 0.81,
"grad_norm": 0.24868199229240417,
"learning_rate": 0.001,
"loss": 0.0626,
"step": 80700
},
{
"epoch": 0.81,
"grad_norm": 0.205698162317276,
"learning_rate": 0.001,
"loss": 0.0616,
"step": 80800
},
{
"epoch": 0.81,
"grad_norm": 0.4373738169670105,
"learning_rate": 0.001,
"loss": 0.0635,
"step": 80900
},
{
"epoch": 0.81,
"grad_norm": 0.20648936927318573,
"learning_rate": 0.001,
"loss": 0.063,
"step": 81000
},
{
"epoch": 0.81,
"grad_norm": 0.49470582604408264,
"learning_rate": 0.001,
"loss": 0.064,
"step": 81100
},
{
"epoch": 0.81,
"grad_norm": 0.2360522598028183,
"learning_rate": 0.001,
"loss": 0.0606,
"step": 81200
},
{
"epoch": 0.81,
"grad_norm": 0.38575538992881775,
"learning_rate": 0.001,
"loss": 0.0626,
"step": 81300
},
{
"epoch": 0.81,
"grad_norm": 0.23714828491210938,
"learning_rate": 0.001,
"loss": 0.0628,
"step": 81400
},
{
"epoch": 0.81,
"grad_norm": 0.5665257573127747,
"learning_rate": 0.001,
"loss": 0.064,
"step": 81500
},
{
"epoch": 0.82,
"grad_norm": 0.2335139662027359,
"learning_rate": 0.001,
"loss": 0.0628,
"step": 81600
},
{
"epoch": 0.82,
"grad_norm": 0.23121795058250427,
"learning_rate": 0.001,
"loss": 0.0617,
"step": 81700
},
{
"epoch": 0.82,
"grad_norm": 0.2850015163421631,
"learning_rate": 0.001,
"loss": 0.0634,
"step": 81800
},
{
"epoch": 0.82,
"grad_norm": 0.25949451327323914,
"learning_rate": 0.001,
"loss": 0.0611,
"step": 81900
},
{
"epoch": 0.82,
"grad_norm": 0.15866072475910187,
"learning_rate": 0.001,
"loss": 0.0633,
"step": 82000
},
{
"epoch": 0.82,
"grad_norm": 0.1362059861421585,
"learning_rate": 0.001,
"loss": 0.0637,
"step": 82100
},
{
"epoch": 0.82,
"grad_norm": 0.23973006010055542,
"learning_rate": 0.001,
"loss": 0.0619,
"step": 82200
},
{
"epoch": 0.82,
"grad_norm": 0.2586152255535126,
"learning_rate": 0.001,
"loss": 0.0595,
"step": 82300
},
{
"epoch": 0.82,
"grad_norm": 0.33245041966438293,
"learning_rate": 0.001,
"loss": 0.0632,
"step": 82400
},
{
"epoch": 0.82,
"grad_norm": 0.1873330920934677,
"learning_rate": 0.001,
"loss": 0.0636,
"step": 82500
},
{
"epoch": 0.83,
"grad_norm": 0.23043370246887207,
"learning_rate": 0.001,
"loss": 0.0644,
"step": 82600
},
{
"epoch": 0.83,
"grad_norm": 0.21046708524227142,
"learning_rate": 0.001,
"loss": 0.0631,
"step": 82700
},
{
"epoch": 0.83,
"grad_norm": 0.15473945438861847,
"learning_rate": 0.001,
"loss": 0.06,
"step": 82800
},
{
"epoch": 0.83,
"grad_norm": 1.422141194343567,
"learning_rate": 0.001,
"loss": 0.0636,
"step": 82900
},
{
"epoch": 0.83,
"grad_norm": 0.16424107551574707,
"learning_rate": 0.001,
"loss": 0.0643,
"step": 83000
},
{
"epoch": 0.83,
"grad_norm": 0.3594319820404053,
"learning_rate": 0.001,
"loss": 0.0624,
"step": 83100
},
{
"epoch": 0.83,
"grad_norm": 0.26430365443229675,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 83200
},
{
"epoch": 0.83,
"grad_norm": 0.20655816793441772,
"learning_rate": 0.001,
"loss": 0.0619,
"step": 83300
},
{
"epoch": 0.83,
"grad_norm": 0.39340272545814514,
"learning_rate": 0.001,
"loss": 0.0624,
"step": 83400
},
{
"epoch": 0.83,
"grad_norm": 0.3113759160041809,
"learning_rate": 0.001,
"loss": 0.0598,
"step": 83500
},
{
"epoch": 0.84,
"grad_norm": 0.33689817786216736,
"learning_rate": 0.001,
"loss": 0.0604,
"step": 83600
},
{
"epoch": 0.84,
"grad_norm": 0.2195175141096115,
"learning_rate": 0.001,
"loss": 0.0618,
"step": 83700
},
{
"epoch": 0.84,
"grad_norm": 0.2397637814283371,
"learning_rate": 0.001,
"loss": 0.0618,
"step": 83800
},
{
"epoch": 0.84,
"grad_norm": 0.28967469930648804,
"learning_rate": 0.001,
"loss": 0.0612,
"step": 83900
},
{
"epoch": 0.84,
"grad_norm": 0.23908008635044098,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 84000
},
{
"epoch": 0.84,
"grad_norm": 0.36196354031562805,
"learning_rate": 0.001,
"loss": 0.061,
"step": 84100
},
{
"epoch": 0.84,
"grad_norm": 0.3068004250526428,
"learning_rate": 0.001,
"loss": 0.0614,
"step": 84200
},
{
"epoch": 0.84,
"grad_norm": 0.2148333489894867,
"learning_rate": 0.001,
"loss": 0.0624,
"step": 84300
},
{
"epoch": 0.84,
"grad_norm": 0.19169430434703827,
"learning_rate": 0.001,
"loss": 0.0615,
"step": 84400
},
{
"epoch": 0.84,
"grad_norm": 0.23916268348693848,
"learning_rate": 0.001,
"loss": 0.0654,
"step": 84500
},
{
"epoch": 0.85,
"grad_norm": 0.20304815471172333,
"learning_rate": 0.001,
"loss": 0.0613,
"step": 84600
},
{
"epoch": 0.85,
"grad_norm": 0.2983682155609131,
"learning_rate": 0.001,
"loss": 0.0617,
"step": 84700
},
{
"epoch": 0.85,
"grad_norm": 0.22442661225795746,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 84800
},
{
"epoch": 0.85,
"grad_norm": 0.28299954533576965,
"learning_rate": 0.001,
"loss": 0.0636,
"step": 84900
},
{
"epoch": 0.85,
"grad_norm": 0.30491936206817627,
"learning_rate": 0.001,
"loss": 0.0608,
"step": 85000
},
{
"epoch": 0.85,
"grad_norm": 0.30804798007011414,
"learning_rate": 0.001,
"loss": 0.0609,
"step": 85100
},
{
"epoch": 0.85,
"grad_norm": 0.18533004820346832,
"learning_rate": 0.001,
"loss": 0.0602,
"step": 85200
},
{
"epoch": 0.85,
"grad_norm": 0.23856715857982635,
"learning_rate": 0.001,
"loss": 0.0638,
"step": 85300
},
{
"epoch": 0.85,
"grad_norm": 0.2646658420562744,
"learning_rate": 0.001,
"loss": 0.0622,
"step": 85400
},
{
"epoch": 0.85,
"grad_norm": 0.2357235699892044,
"learning_rate": 0.001,
"loss": 0.0617,
"step": 85500
},
{
"epoch": 0.86,
"grad_norm": 0.1675509363412857,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 85600
},
{
"epoch": 0.86,
"grad_norm": 0.20707982778549194,
"learning_rate": 0.001,
"loss": 0.0617,
"step": 85700
},
{
"epoch": 0.86,
"grad_norm": 0.34539708495140076,
"learning_rate": 0.001,
"loss": 0.06,
"step": 85800
},
{
"epoch": 0.86,
"grad_norm": 0.28429824113845825,
"learning_rate": 0.001,
"loss": 0.0587,
"step": 85900
},
{
"epoch": 0.86,
"grad_norm": 0.3121056854724884,
"learning_rate": 0.001,
"loss": 0.0615,
"step": 86000
},
{
"epoch": 0.86,
"grad_norm": 0.25750598311424255,
"learning_rate": 0.001,
"loss": 0.0613,
"step": 86100
},
{
"epoch": 0.86,
"grad_norm": 0.18927526473999023,
"learning_rate": 0.001,
"loss": 0.0592,
"step": 86200
},
{
"epoch": 0.86,
"grad_norm": 0.3551163971424103,
"learning_rate": 0.001,
"loss": 0.0619,
"step": 86300
},
{
"epoch": 0.86,
"grad_norm": 0.19404169917106628,
"learning_rate": 0.001,
"loss": 0.0617,
"step": 86400
},
{
"epoch": 0.86,
"grad_norm": 0.16969504952430725,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 86500
},
{
"epoch": 0.87,
"grad_norm": 0.20026318728923798,
"learning_rate": 0.001,
"loss": 0.0606,
"step": 86600
},
{
"epoch": 0.87,
"grad_norm": 0.30545106530189514,
"learning_rate": 0.001,
"loss": 0.0594,
"step": 86700
},
{
"epoch": 0.87,
"grad_norm": 0.2734260559082031,
"learning_rate": 0.001,
"loss": 0.0644,
"step": 86800
},
{
"epoch": 0.87,
"grad_norm": 0.3157080411911011,
"learning_rate": 0.001,
"loss": 0.0618,
"step": 86900
},
{
"epoch": 0.87,
"grad_norm": 0.19793906807899475,
"learning_rate": 0.001,
"loss": 0.0616,
"step": 87000
},
{
"epoch": 0.87,
"grad_norm": 0.1849125623703003,
"learning_rate": 0.001,
"loss": 0.0596,
"step": 87100
},
{
"epoch": 0.87,
"grad_norm": 0.18340341746807098,
"learning_rate": 0.001,
"loss": 0.0625,
"step": 87200
},
{
"epoch": 0.87,
"grad_norm": 0.26056426763534546,
"learning_rate": 0.001,
"loss": 0.0595,
"step": 87300
},
{
"epoch": 0.87,
"grad_norm": 0.22235774993896484,
"learning_rate": 0.001,
"loss": 0.0606,
"step": 87400
},
{
"epoch": 0.87,
"grad_norm": 0.31580013036727905,
"learning_rate": 0.001,
"loss": 0.0615,
"step": 87500
},
{
"epoch": 0.88,
"grad_norm": 0.2364477515220642,
"learning_rate": 0.001,
"loss": 0.0616,
"step": 87600
},
{
"epoch": 0.88,
"grad_norm": 0.23212990164756775,
"learning_rate": 0.001,
"loss": 0.0594,
"step": 87700
},
{
"epoch": 0.88,
"grad_norm": 0.21986854076385498,
"learning_rate": 0.001,
"loss": 0.0592,
"step": 87800
},
{
"epoch": 0.88,
"grad_norm": 0.2496929168701172,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 87900
},
{
"epoch": 0.88,
"grad_norm": 0.19572298228740692,
"learning_rate": 0.001,
"loss": 0.0588,
"step": 88000
},
{
"epoch": 0.88,
"grad_norm": 0.16231012344360352,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 88100
},
{
"epoch": 0.88,
"grad_norm": 0.21093867719173431,
"learning_rate": 0.001,
"loss": 0.0625,
"step": 88200
},
{
"epoch": 0.88,
"grad_norm": 0.16491778194904327,
"learning_rate": 0.001,
"loss": 0.0602,
"step": 88300
},
{
"epoch": 0.88,
"grad_norm": 0.24729378521442413,
"learning_rate": 0.001,
"loss": 0.0573,
"step": 88400
},
{
"epoch": 0.88,
"grad_norm": 0.3726213276386261,
"learning_rate": 0.001,
"loss": 0.0589,
"step": 88500
},
{
"epoch": 0.89,
"grad_norm": 0.1926572024822235,
"learning_rate": 0.001,
"loss": 0.0602,
"step": 88600
},
{
"epoch": 0.89,
"grad_norm": 0.2153882533311844,
"learning_rate": 0.001,
"loss": 0.0597,
"step": 88700
},
{
"epoch": 0.89,
"grad_norm": 0.25205257534980774,
"learning_rate": 0.001,
"loss": 0.0581,
"step": 88800
},
{
"epoch": 0.89,
"grad_norm": 0.16898304224014282,
"learning_rate": 0.001,
"loss": 0.0614,
"step": 88900
},
{
"epoch": 0.89,
"grad_norm": 0.2840329110622406,
"learning_rate": 0.001,
"loss": 0.0615,
"step": 89000
},
{
"epoch": 0.89,
"grad_norm": 0.22306442260742188,
"learning_rate": 0.001,
"loss": 0.0606,
"step": 89100
},
{
"epoch": 0.89,
"grad_norm": 0.2778179943561554,
"learning_rate": 0.001,
"loss": 0.0606,
"step": 89200
},
{
"epoch": 0.89,
"grad_norm": 0.1956636756658554,
"learning_rate": 0.001,
"loss": 0.0585,
"step": 89300
},
{
"epoch": 0.89,
"grad_norm": 0.15973015129566193,
"learning_rate": 0.001,
"loss": 0.0598,
"step": 89400
},
{
"epoch": 0.89,
"grad_norm": 0.2306407243013382,
"learning_rate": 0.001,
"loss": 0.0597,
"step": 89500
},
{
"epoch": 0.9,
"grad_norm": 0.19012047350406647,
"learning_rate": 0.001,
"loss": 0.0608,
"step": 89600
},
{
"epoch": 0.9,
"grad_norm": 0.214030921459198,
"learning_rate": 0.001,
"loss": 0.0586,
"step": 89700
},
{
"epoch": 0.9,
"grad_norm": 0.26291027665138245,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 89800
},
{
"epoch": 0.9,
"grad_norm": 0.140648752450943,
"learning_rate": 0.001,
"loss": 0.0605,
"step": 89900
},
{
"epoch": 0.9,
"grad_norm": 0.3011924624443054,
"learning_rate": 0.001,
"loss": 0.0609,
"step": 90000
},
{
"epoch": 0.9,
"grad_norm": 0.24463798105716705,
"learning_rate": 0.001,
"loss": 0.0587,
"step": 90100
},
{
"epoch": 0.9,
"grad_norm": 0.2608613073825836,
"learning_rate": 0.001,
"loss": 0.0595,
"step": 90200
},
{
"epoch": 0.9,
"grad_norm": 0.23249809443950653,
"learning_rate": 0.001,
"loss": 0.0592,
"step": 90300
},
{
"epoch": 0.9,
"grad_norm": 0.36541712284088135,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 90400
},
{
"epoch": 0.9,
"grad_norm": 0.45584437251091003,
"learning_rate": 0.001,
"loss": 0.0587,
"step": 90500
},
{
"epoch": 0.91,
"grad_norm": 0.20905092358589172,
"learning_rate": 0.001,
"loss": 0.0595,
"step": 90600
},
{
"epoch": 0.91,
"grad_norm": 0.18202795088291168,
"learning_rate": 0.001,
"loss": 0.0568,
"step": 90700
},
{
"epoch": 0.91,
"grad_norm": 0.2321150153875351,
"learning_rate": 0.001,
"loss": 0.0605,
"step": 90800
},
{
"epoch": 0.91,
"grad_norm": 0.17175626754760742,
"learning_rate": 0.001,
"loss": 0.0596,
"step": 90900
},
{
"epoch": 0.91,
"grad_norm": 0.21932841837406158,
"learning_rate": 0.001,
"loss": 0.0585,
"step": 91000
},
{
"epoch": 0.91,
"grad_norm": 0.30282464623451233,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 91100
},
{
"epoch": 0.91,
"grad_norm": 0.2639208436012268,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 91200
},
{
"epoch": 0.91,
"grad_norm": 0.23805926740169525,
"learning_rate": 0.001,
"loss": 0.0576,
"step": 91300
},
{
"epoch": 0.91,
"grad_norm": 0.2307603508234024,
"learning_rate": 0.001,
"loss": 0.0602,
"step": 91400
},
{
"epoch": 0.91,
"grad_norm": 0.1786148101091385,
"learning_rate": 0.001,
"loss": 0.0598,
"step": 91500
},
{
"epoch": 0.92,
"grad_norm": 0.1955350786447525,
"learning_rate": 0.001,
"loss": 0.0576,
"step": 91600
},
{
"epoch": 0.92,
"grad_norm": 0.24684827029705048,
"learning_rate": 0.001,
"loss": 0.0571,
"step": 91700
},
{
"epoch": 0.92,
"grad_norm": 0.2771402895450592,
"learning_rate": 0.001,
"loss": 0.058,
"step": 91800
},
{
"epoch": 0.92,
"grad_norm": 0.28878656029701233,
"learning_rate": 0.001,
"loss": 0.0585,
"step": 91900
},
{
"epoch": 0.92,
"grad_norm": 0.7780060172080994,
"learning_rate": 0.001,
"loss": 0.0574,
"step": 92000
},
{
"epoch": 0.92,
"grad_norm": 0.25102126598358154,
"learning_rate": 0.001,
"loss": 0.0576,
"step": 92100
},
{
"epoch": 0.92,
"grad_norm": 0.26416492462158203,
"learning_rate": 0.001,
"loss": 0.0614,
"step": 92200
},
{
"epoch": 0.92,
"grad_norm": 0.26566821336746216,
"learning_rate": 0.001,
"loss": 0.0586,
"step": 92300
},
{
"epoch": 0.92,
"grad_norm": 0.25432705879211426,
"learning_rate": 0.001,
"loss": 0.0586,
"step": 92400
},
{
"epoch": 0.92,
"grad_norm": 0.2592636048793793,
"learning_rate": 0.001,
"loss": 0.0576,
"step": 92500
},
{
"epoch": 0.93,
"grad_norm": 0.3514898419380188,
"learning_rate": 0.001,
"loss": 0.0579,
"step": 92600
},
{
"epoch": 0.93,
"grad_norm": 0.2749045491218567,
"learning_rate": 0.001,
"loss": 0.061,
"step": 92700
},
{
"epoch": 0.93,
"grad_norm": 0.2799491882324219,
"learning_rate": 0.001,
"loss": 0.0579,
"step": 92800
},
{
"epoch": 0.93,
"grad_norm": 0.2252642959356308,
"learning_rate": 0.001,
"loss": 0.0584,
"step": 92900
},
{
"epoch": 0.93,
"grad_norm": 0.18218593299388885,
"learning_rate": 0.001,
"loss": 0.0577,
"step": 93000
},
{
"epoch": 0.93,
"grad_norm": 0.27551427483558655,
"learning_rate": 0.001,
"loss": 0.0605,
"step": 93100
},
{
"epoch": 0.93,
"grad_norm": 0.26159995794296265,
"learning_rate": 0.001,
"loss": 0.0562,
"step": 93200
},
{
"epoch": 0.93,
"grad_norm": 0.15979285538196564,
"learning_rate": 0.001,
"loss": 0.0615,
"step": 93300
},
{
"epoch": 0.93,
"grad_norm": 0.23418280482292175,
"learning_rate": 0.001,
"loss": 0.0594,
"step": 93400
},
{
"epoch": 0.93,
"grad_norm": 0.16936419904232025,
"learning_rate": 0.001,
"loss": 0.0611,
"step": 93500
},
{
"epoch": 0.94,
"grad_norm": 0.2862916886806488,
"learning_rate": 0.001,
"loss": 0.0584,
"step": 93600
},
{
"epoch": 0.94,
"grad_norm": 0.5302750468254089,
"learning_rate": 0.001,
"loss": 0.0561,
"step": 93700
},
{
"epoch": 0.94,
"grad_norm": 0.43644002079963684,
"learning_rate": 0.001,
"loss": 0.0581,
"step": 93800
},
{
"epoch": 0.94,
"grad_norm": 0.19219018518924713,
"learning_rate": 0.001,
"loss": 0.0591,
"step": 93900
},
{
"epoch": 0.94,
"grad_norm": 0.29645296931266785,
"learning_rate": 0.001,
"loss": 0.0587,
"step": 94000
},
{
"epoch": 0.94,
"grad_norm": 0.24861380457878113,
"learning_rate": 0.001,
"loss": 0.0594,
"step": 94100
},
{
"epoch": 0.94,
"grad_norm": 0.2443215548992157,
"learning_rate": 0.001,
"loss": 0.057,
"step": 94200
},
{
"epoch": 0.94,
"grad_norm": 0.13077589869499207,
"learning_rate": 0.001,
"loss": 0.0563,
"step": 94300
},
{
"epoch": 0.94,
"grad_norm": 0.24280287325382233,
"learning_rate": 0.001,
"loss": 0.0591,
"step": 94400
},
{
"epoch": 0.94,
"grad_norm": 0.25838151574134827,
"learning_rate": 0.001,
"loss": 0.0583,
"step": 94500
},
{
"epoch": 0.95,
"grad_norm": 0.33244743943214417,
"learning_rate": 0.001,
"loss": 0.0587,
"step": 94600
},
{
"epoch": 0.95,
"grad_norm": 0.45074304938316345,
"learning_rate": 0.001,
"loss": 0.0572,
"step": 94700
},
{
"epoch": 0.95,
"grad_norm": 0.2540782392024994,
"learning_rate": 0.001,
"loss": 0.0584,
"step": 94800
},
{
"epoch": 0.95,
"grad_norm": 0.29180458188056946,
"learning_rate": 0.001,
"loss": 0.0609,
"step": 94900
},
{
"epoch": 0.95,
"grad_norm": 0.18510323762893677,
"learning_rate": 0.001,
"loss": 0.058,
"step": 95000
},
{
"epoch": 0.95,
"grad_norm": 0.28962787985801697,
"learning_rate": 0.001,
"loss": 0.0562,
"step": 95100
},
{
"epoch": 0.95,
"grad_norm": 0.26887577772140503,
"learning_rate": 0.001,
"loss": 0.0573,
"step": 95200
},
{
"epoch": 0.95,
"grad_norm": 0.20729154348373413,
"learning_rate": 0.001,
"loss": 0.057,
"step": 95300
},
{
"epoch": 0.95,
"grad_norm": 0.19953325390815735,
"learning_rate": 0.001,
"loss": 0.0594,
"step": 95400
},
{
"epoch": 0.95,
"grad_norm": 0.15926332771778107,
"learning_rate": 0.001,
"loss": 0.0582,
"step": 95500
},
{
"epoch": 0.96,
"grad_norm": 0.23609544336795807,
"learning_rate": 0.001,
"loss": 0.0579,
"step": 95600
},
{
"epoch": 0.96,
"grad_norm": 0.13997937738895416,
"learning_rate": 0.001,
"loss": 0.0574,
"step": 95700
},
{
"epoch": 0.96,
"grad_norm": 0.23629073798656464,
"learning_rate": 0.001,
"loss": 0.0585,
"step": 95800
},
{
"epoch": 0.96,
"grad_norm": 0.3770292401313782,
"learning_rate": 0.001,
"loss": 0.0572,
"step": 95900
},
{
"epoch": 0.96,
"grad_norm": 0.3013598322868347,
"learning_rate": 0.001,
"loss": 0.0606,
"step": 96000
},
{
"epoch": 0.96,
"grad_norm": 0.2350749522447586,
"learning_rate": 0.001,
"loss": 0.057,
"step": 96100
},
{
"epoch": 0.96,
"grad_norm": 0.301268994808197,
"learning_rate": 0.001,
"loss": 0.0586,
"step": 96200
},
{
"epoch": 0.96,
"grad_norm": 0.22475981712341309,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 96300
},
{
"epoch": 0.96,
"grad_norm": 0.3032160997390747,
"learning_rate": 0.001,
"loss": 0.0591,
"step": 96400
},
{
"epoch": 0.96,
"grad_norm": 0.5848428010940552,
"learning_rate": 0.001,
"loss": 0.0559,
"step": 96500
},
{
"epoch": 0.97,
"grad_norm": 0.20164470374584198,
"learning_rate": 0.001,
"loss": 0.0579,
"step": 96600
},
{
"epoch": 0.97,
"grad_norm": 0.18068142235279083,
"learning_rate": 0.001,
"loss": 0.0579,
"step": 96700
},
{
"epoch": 0.97,
"grad_norm": 0.31181275844573975,
"learning_rate": 0.001,
"loss": 0.0588,
"step": 96800
},
{
"epoch": 0.97,
"grad_norm": 0.23156049847602844,
"learning_rate": 0.001,
"loss": 0.058,
"step": 96900
},
{
"epoch": 0.97,
"grad_norm": 0.18572886288166046,
"learning_rate": 0.001,
"loss": 0.0599,
"step": 97000
},
{
"epoch": 0.97,
"grad_norm": 0.17736677825450897,
"learning_rate": 0.001,
"loss": 0.0561,
"step": 97100
},
{
"epoch": 0.97,
"grad_norm": 0.4838601052761078,
"learning_rate": 0.001,
"loss": 0.0595,
"step": 97200
},
{
"epoch": 0.97,
"grad_norm": 0.21476797759532928,
"learning_rate": 0.001,
"loss": 0.0609,
"step": 97300
},
{
"epoch": 0.97,
"grad_norm": 0.2181667536497116,
"learning_rate": 0.001,
"loss": 0.0583,
"step": 97400
},
{
"epoch": 0.97,
"grad_norm": 0.26551786065101624,
"learning_rate": 0.001,
"loss": 0.0566,
"step": 97500
},
{
"epoch": 0.98,
"grad_norm": 0.2258795201778412,
"learning_rate": 0.001,
"loss": 0.0574,
"step": 97600
},
{
"epoch": 0.98,
"grad_norm": 0.17733299732208252,
"learning_rate": 0.001,
"loss": 0.0588,
"step": 97700
},
{
"epoch": 0.98,
"grad_norm": 0.4031812846660614,
"learning_rate": 0.001,
"loss": 0.0584,
"step": 97800
},
{
"epoch": 0.98,
"grad_norm": 0.22529329359531403,
"learning_rate": 0.001,
"loss": 0.0572,
"step": 97900
},
{
"epoch": 0.98,
"grad_norm": 0.2503925561904907,
"learning_rate": 0.001,
"loss": 0.0588,
"step": 98000
},
{
"epoch": 0.98,
"grad_norm": 0.17040744423866272,
"learning_rate": 0.001,
"loss": 0.0603,
"step": 98100
},
{
"epoch": 0.98,
"grad_norm": 0.17749032378196716,
"learning_rate": 0.001,
"loss": 0.057,
"step": 98200
},
{
"epoch": 0.98,
"grad_norm": 0.3931177854537964,
"learning_rate": 0.001,
"loss": 0.0566,
"step": 98300
},
{
"epoch": 0.98,
"grad_norm": 0.22418583929538727,
"learning_rate": 0.001,
"loss": 0.0574,
"step": 98400
},
{
"epoch": 0.98,
"grad_norm": 0.30830493569374084,
"learning_rate": 0.001,
"loss": 0.0593,
"step": 98500
},
{
"epoch": 0.99,
"grad_norm": 0.2269369661808014,
"learning_rate": 0.001,
"loss": 0.0585,
"step": 98600
},
{
"epoch": 0.99,
"grad_norm": 0.31830596923828125,
"learning_rate": 0.001,
"loss": 0.0548,
"step": 98700
},
{
"epoch": 0.99,
"grad_norm": 0.25759172439575195,
"learning_rate": 0.001,
"loss": 0.0564,
"step": 98800
},
{
"epoch": 0.99,
"grad_norm": 0.23925898969173431,
"learning_rate": 0.001,
"loss": 0.0592,
"step": 98900
},
{
"epoch": 0.99,
"grad_norm": 0.17434507608413696,
"learning_rate": 0.001,
"loss": 0.0583,
"step": 99000
},
{
"epoch": 0.99,
"grad_norm": 0.3493863642215729,
"learning_rate": 0.001,
"loss": 0.0571,
"step": 99100
},
{
"epoch": 0.99,
"grad_norm": 0.20887431502342224,
"learning_rate": 0.001,
"loss": 0.0564,
"step": 99200
},
{
"epoch": 0.99,
"grad_norm": 0.18060541152954102,
"learning_rate": 0.001,
"loss": 0.0583,
"step": 99300
},
{
"epoch": 0.99,
"grad_norm": 0.3689703047275543,
"learning_rate": 0.001,
"loss": 0.0565,
"step": 99400
},
{
"epoch": 0.99,
"grad_norm": 0.25323519110679626,
"learning_rate": 0.001,
"loss": 0.0576,
"step": 99500
},
{
"epoch": 1.0,
"grad_norm": 0.27348294854164124,
"learning_rate": 0.001,
"loss": 0.0568,
"step": 99600
},
{
"epoch": 1.0,
"grad_norm": 0.25492238998413086,
"learning_rate": 0.001,
"loss": 0.0561,
"step": 99700
},
{
"epoch": 1.0,
"grad_norm": 0.2604049742221832,
"learning_rate": 0.001,
"loss": 0.0564,
"step": 99800
},
{
"epoch": 1.0,
"grad_norm": 0.37222278118133545,
"learning_rate": 0.001,
"loss": 0.059,
"step": 99900
},
{
"epoch": 1.0,
"grad_norm": 0.3180735111236572,
"learning_rate": 0.001,
"loss": 0.0588,
"step": 100000
},
{
"epoch": 1.0,
"step": 100000,
"total_flos": 8.920695708927918e+18,
"train_loss": 0.09189929046154022,
"train_runtime": 235079.2305,
"train_samples_per_second": 54.45,
"train_steps_per_second": 0.425
}
],
"logging_steps": 100,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"total_flos": 8.920695708927918e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}