|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 2930, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0034129692832764505, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 6.825938566552902e-07, |
|
"loss": 3.0499, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.017064846416382253, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 3.4129692832764506e-06, |
|
"loss": 3.0421, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.034129692832764506, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 6.825938566552901e-06, |
|
"loss": 3.0559, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.051194539249146756, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.0238907849829352e-05, |
|
"loss": 2.9957, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06825938566552901, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.3651877133105803e-05, |
|
"loss": 2.8653, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08532423208191127, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 1.7064846416382256e-05, |
|
"loss": 2.7049, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10238907849829351, |
|
"grad_norm": 18.5, |
|
"learning_rate": 2.0477815699658705e-05, |
|
"loss": 2.5238, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11945392491467577, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 2.3890784982935157e-05, |
|
"loss": 2.3984, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.13651877133105803, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 2.7303754266211605e-05, |
|
"loss": 2.3001, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15358361774744028, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.071672354948806e-05, |
|
"loss": 2.1645, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.17064846416382254, |
|
"grad_norm": 1.25, |
|
"learning_rate": 3.412969283276451e-05, |
|
"loss": 2.0453, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18771331058020477, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.754266211604096e-05, |
|
"loss": 1.8952, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.20477815699658702, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.095563139931741e-05, |
|
"loss": 1.7862, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22184300341296928, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.436860068259386e-05, |
|
"loss": 1.6922, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.23890784982935154, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 4.778156996587031e-05, |
|
"loss": 1.6006, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.25597269624573377, |
|
"grad_norm": 0.375, |
|
"learning_rate": 5.119453924914676e-05, |
|
"loss": 1.5335, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.27303754266211605, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 5.460750853242321e-05, |
|
"loss": 1.4832, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2901023890784983, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 5.802047781569966e-05, |
|
"loss": 1.4393, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.30716723549488056, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 6.143344709897612e-05, |
|
"loss": 1.3951, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3242320819112628, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 6.484641638225257e-05, |
|
"loss": 1.3594, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 6.825938566552902e-05, |
|
"loss": 1.3456, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3583617747440273, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 7.167235494880547e-05, |
|
"loss": 1.3174, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.37542662116040953, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 7.508532423208191e-05, |
|
"loss": 1.3087, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3924914675767918, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 7.849829351535837e-05, |
|
"loss": 1.3001, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.40955631399317405, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 8.191126279863482e-05, |
|
"loss": 1.2871, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.42662116040955633, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 8.532423208191128e-05, |
|
"loss": 1.2567, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.44368600682593856, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 8.873720136518772e-05, |
|
"loss": 1.2582, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.46075085324232085, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 9.215017064846417e-05, |
|
"loss": 1.2471, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4778156996587031, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.556313993174063e-05, |
|
"loss": 1.2357, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4948805460750853, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 9.897610921501707e-05, |
|
"loss": 1.2303, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5119453924914675, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00010238907849829352, |
|
"loss": 1.226, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5290102389078498, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00010580204778156998, |
|
"loss": 1.2251, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5460750853242321, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00010921501706484642, |
|
"loss": 1.2135, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5631399317406144, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00011262798634812288, |
|
"loss": 1.2069, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5802047781569966, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00011604095563139932, |
|
"loss": 1.2005, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5972696245733788, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00011945392491467577, |
|
"loss": 1.1944, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6143344709897611, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00012286689419795224, |
|
"loss": 1.1775, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6313993174061433, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00012627986348122866, |
|
"loss": 1.1844, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6484641638225256, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00012969283276450513, |
|
"loss": 1.1711, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6655290102389079, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00013310580204778158, |
|
"loss": 1.1824, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00013651877133105805, |
|
"loss": 1.169, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6996587030716723, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00013993174061433447, |
|
"loss": 1.1691, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7167235494880546, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00014334470989761094, |
|
"loss": 1.1573, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7337883959044369, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00014675767918088738, |
|
"loss": 1.1637, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7508532423208191, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00015017064846416383, |
|
"loss": 1.1605, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7679180887372014, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00015358361774744027, |
|
"loss": 1.1539, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7849829351535836, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00015699658703071675, |
|
"loss": 1.1458, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8020477815699659, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0001604095563139932, |
|
"loss": 1.1524, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8191126279863481, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00016382252559726964, |
|
"loss": 1.1505, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8361774744027304, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00016723549488054608, |
|
"loss": 1.1493, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8532423208191127, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00017064846416382255, |
|
"loss": 1.1391, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8703071672354948, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00017406143344709897, |
|
"loss": 1.1213, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8873720136518771, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00017747440273037544, |
|
"loss": 1.1311, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9044368600682594, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001808873720136519, |
|
"loss": 1.1224, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.9215017064846417, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00018430034129692833, |
|
"loss": 1.1369, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9385665529010239, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00018771331058020478, |
|
"loss": 1.1203, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9556313993174061, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00019112627986348125, |
|
"loss": 1.1281, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9726962457337884, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.0001945392491467577, |
|
"loss": 1.1231, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9897610921501706, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00019795221843003414, |
|
"loss": 1.1249, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.4640614986419678, |
|
"eval_runtime": 0.5515, |
|
"eval_samples_per_second": 18.133, |
|
"eval_steps_per_second": 1.813, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.006825938566553, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00019999971613668125, |
|
"loss": 1.1028, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.023890784982935, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00019999652269285281, |
|
"loss": 1.0985, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0409556313993173, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019998978108973762, |
|
"loss": 1.0885, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0580204778156996, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019997949156654686, |
|
"loss": 1.1064, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.075085324232082, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00019996565448838176, |
|
"loss": 1.0991, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0921501706484642, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.0001999482703462211, |
|
"loss": 1.0947, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.1092150170648465, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00019992733975690333, |
|
"loss": 1.097, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.1262798634812285, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00019990286346310493, |
|
"loss": 1.0835, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1433447098976108, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.00019987484233331394, |
|
"loss": 1.1033, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.1604095563139931, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.00019984327736179936, |
|
"loss": 1.1011, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1774744027303754, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.0001998081696685755, |
|
"loss": 1.0986, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1945392491467577, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001997695204993626, |
|
"loss": 1.0859, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.21160409556314, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00019972733122554246, |
|
"loss": 1.0867, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.2286689419795223, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00019968160334410975, |
|
"loss": 1.0949, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2457337883959045, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00019963233847761894, |
|
"loss": 1.0683, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.2627986348122868, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00019957953837412677, |
|
"loss": 1.0829, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2798634812286689, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001995232049071302, |
|
"loss": 1.0878, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2969283276450512, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00019946334007549978, |
|
"loss": 1.0697, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.3139931740614335, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019939994600340905, |
|
"loss": 1.0765, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.3310580204778157, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019933302494025884, |
|
"loss": 1.0772, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.348122866894198, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019926257926059768, |
|
"loss": 1.0739, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.36518771331058, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00019918861146403733, |
|
"loss": 1.0816, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3822525597269624, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001991111241751644, |
|
"loss": 1.0711, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.3993174061433447, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00019903012014344686, |
|
"loss": 1.0616, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.416382252559727, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019894560224313678, |
|
"loss": 1.0624, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.4334470989761092, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00019885757347316813, |
|
"loss": 1.0572, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4505119453924915, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001987660369570505, |
|
"loss": 1.0701, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.4675767918088738, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019867099594275827, |
|
"loss": 1.0669, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.484641638225256, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00019857245380261525, |
|
"loss": 1.0724, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.5017064846416384, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001984704140331751, |
|
"loss": 1.0728, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.5187713310580204, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019836488025509736, |
|
"loss": 1.0712, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.5358361774744027, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019825585621301872, |
|
"loss": 1.0569, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.552901023890785, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00019814334577542038, |
|
"loss": 1.0638, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5699658703071673, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001980273529344907, |
|
"loss": 1.0638, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5870307167235493, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019790788180598358, |
|
"loss": 1.0556, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.6040955631399316, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019778493662907237, |
|
"loss": 1.056, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.621160409556314, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00019765852176619944, |
|
"loss": 1.0512, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.6382252559726962, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00019752864170292152, |
|
"loss": 1.0585, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.6552901023890785, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00019739530104775032, |
|
"loss": 1.0628, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6723549488054608, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019725850453198925, |
|
"loss": 1.0612, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.689419795221843, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00019711825700956536, |
|
"loss": 1.0549, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0001969745634568572, |
|
"loss": 1.0506, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.7235494880546076, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00019682742897251818, |
|
"loss": 1.0418, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.74061433447099, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001966768587772957, |
|
"loss": 1.0508, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.757679180887372, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00019652285821384596, |
|
"loss": 1.0519, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.7747440273037542, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0001963654327465442, |
|
"loss": 1.0554, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7918088737201365, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00019620458796129104, |
|
"loss": 1.0421, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.8088737201365188, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0001960403295653141, |
|
"loss": 1.0421, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.8259385665529009, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019587266338696565, |
|
"loss": 1.046, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.8430034129692832, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00019570159537551552, |
|
"loss": 1.0528, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.8600682593856654, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019552713160094038, |
|
"loss": 1.0481, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.8771331058020477, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00019534927825370815, |
|
"loss": 1.0477, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.89419795221843, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00019516804164455826, |
|
"loss": 1.0513, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.9112627986348123, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00019498342820427794, |
|
"loss": 1.0505, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.9283276450511946, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00019479544448347392, |
|
"loss": 1.0538, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.9453924914675769, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00019460409715233996, |
|
"loss": 1.0332, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.9624573378839592, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00019440939300042028, |
|
"loss": 1.047, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.9795221843003414, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00019421133893636854, |
|
"loss": 1.0321, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9965870307167235, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019400994198770274, |
|
"loss": 1.0415, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.451392650604248, |
|
"eval_runtime": 0.5484, |
|
"eval_samples_per_second": 18.236, |
|
"eval_steps_per_second": 1.824, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 2.013651877133106, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019380520930055602, |
|
"loss": 1.0194, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.030716723549488, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0001935971481394227, |
|
"loss": 0.9985, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.04778156996587, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019338576588690104, |
|
"loss": 1.0026, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.0648464163822524, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019317107004343078, |
|
"loss": 1.0018, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.0819112627986347, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001929530682270274, |
|
"loss": 1.0096, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.098976109215017, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001927317681730115, |
|
"loss": 1.0047, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.1160409556313993, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00019250717773373462, |
|
"loss": 0.9998, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.1331058020477816, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019227930487830035, |
|
"loss": 1.0121, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.150170648464164, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019204815769228176, |
|
"loss": 1.0064, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.167235494880546, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019181374437743438, |
|
"loss": 0.9968, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.1843003412969284, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019157607325140524, |
|
"loss": 1.0046, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.2013651877133107, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00019133515274743771, |
|
"loss": 1.0161, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.218430034129693, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00019109099141407233, |
|
"loss": 1.004, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.2354948805460753, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0001908435979148434, |
|
"loss": 1.0071, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.252559726962457, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019059298102797146, |
|
"loss": 1.0117, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.26962457337884, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0001903391496460522, |
|
"loss": 0.9996, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.2866894197952217, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001900821127757405, |
|
"loss": 1.0038, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.303754266211604, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0001898218795374311, |
|
"loss": 1.0105, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.3208191126279862, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.0001895584591649349, |
|
"loss": 0.9929, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.3378839590443685, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00018929186100515136, |
|
"loss": 1.0018, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.354948805460751, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00018902209451773674, |
|
"loss": 0.9955, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.372013651877133, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0001887491692747686, |
|
"loss": 0.9953, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.3890784982935154, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.000188473094960406, |
|
"loss": 0.9833, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.4061433447098977, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018819388137054604, |
|
"loss": 1.0089, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.42320819112628, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00018791153841247614, |
|
"loss": 1.0031, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.4402730375426622, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00018762607610452254, |
|
"loss": 1.002, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.4573378839590445, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00018733750457569485, |
|
"loss": 1.0003, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.474402730375427, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00018704583406532662, |
|
"loss": 1.004, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.491467576791809, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00018675107492271208, |
|
"loss": 1.0075, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.508532423208191, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001864532376067387, |
|
"loss": 1.0035, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.5255972696245736, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00018615233268551643, |
|
"loss": 0.9968, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.5426621160409555, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018584837083600244, |
|
"loss": 1.0124, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.5597269624573378, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00018554136284362237, |
|
"loss": 1.0012, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.57679180887372, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018523131960188755, |
|
"loss": 0.9915, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.5938566552901023, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001849182521120087, |
|
"loss": 0.9996, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.6109215017064846, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00018460217148250524, |
|
"loss": 0.9975, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.627986348122867, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001842830889288114, |
|
"loss": 1.0008, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.645051194539249, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00018396101577287813, |
|
"loss": 1.0041, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.6621160409556315, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00018363596344277144, |
|
"loss": 0.9995, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.6791808873720138, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001833079434722668, |
|
"loss": 1.002, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.696245733788396, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00018297696750044, |
|
"loss": 1.0057, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.7133105802047783, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018264304727125407, |
|
"loss": 0.9966, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.73037542662116, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00018230619463314266, |
|
"loss": 0.9887, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.747440273037543, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00018196642153858958, |
|
"loss": 0.9993, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.7645051194539247, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00018162374004370463, |
|
"loss": 0.9953, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.781569965870307, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001812781623077959, |
|
"loss": 0.9856, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.7986348122866893, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018092970059293835, |
|
"loss": 1.0029, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.8156996587030716, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0001805783672635386, |
|
"loss": 0.991, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.832764505119454, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018022417478589627, |
|
"loss": 1.0053, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.849829351535836, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00017986713572776174, |
|
"loss": 0.9865, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.8668941979522184, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00017950726275789, |
|
"loss": 0.9948, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.8839590443686007, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00017914456864559126, |
|
"loss": 0.9916, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.901023890784983, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001787790662602779, |
|
"loss": 0.9985, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.9180887372013653, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00017841076857100767, |
|
"loss": 0.994, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.9351535836177476, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001780396886460237, |
|
"loss": 0.9811, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.9522184300341294, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00017766583965229065, |
|
"loss": 0.9872, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.969283276450512, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00017728923485502759, |
|
"loss": 0.9951, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.986348122866894, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00017690988761723725, |
|
"loss": 0.9915, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.4749691486358643, |
|
"eval_runtime": 0.5425, |
|
"eval_samples_per_second": 18.434, |
|
"eval_steps_per_second": 1.843, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 3.0034129692832763, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00017652781139923196, |
|
"loss": 0.9883, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.0204778156996586, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.000176143019758156, |
|
"loss": 0.9611, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.037542662116041, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0001757555263475044, |
|
"loss": 0.9542, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.054607508532423, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00017536534491663873, |
|
"loss": 0.9614, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.0716723549488054, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00017497248931029914, |
|
"loss": 0.9538, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.0887372013651877, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.000174576973468113, |
|
"loss": 0.9581, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.10580204778157, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00017417881142410037, |
|
"loss": 0.9466, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.1228668941979523, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00017377801730617613, |
|
"loss": 0.9632, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 3.1399317406143346, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00017337460533564845, |
|
"loss": 0.948, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.156996587030717, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00017296858982671442, |
|
"loss": 0.9515, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.174061433447099, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017255998518595194, |
|
"loss": 0.9625, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.1911262798634814, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00017214880591180873, |
|
"loss": 0.9532, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 3.2081911262798632, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001717350665940877, |
|
"loss": 0.9499, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.2252559726962455, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00017131878191342932, |
|
"loss": 0.9505, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 3.242320819112628, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00017089996664079084, |
|
"loss": 0.9489, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.25938566552901, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00017047863563692198, |
|
"loss": 0.9623, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 3.2764505119453924, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00017005480385183774, |
|
"loss": 0.9474, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.2935153583617747, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00016962848632428795, |
|
"loss": 0.9558, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 3.310580204778157, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00016919969818122345, |
|
"loss": 0.9538, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.3276450511945392, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00016876845463725975, |
|
"loss": 0.955, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 3.3447098976109215, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 0.0001683347709941367, |
|
"loss": 0.9615, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.361774744027304, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.0001678986626401759, |
|
"loss": 0.9591, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 3.378839590443686, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00016746014504973448, |
|
"loss": 0.9479, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.3959044368600684, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00016701923378265615, |
|
"loss": 0.9511, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00016657594448371896, |
|
"loss": 0.962, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.430034129692833, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0001661302928820803, |
|
"loss": 0.9612, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 3.4470989761092152, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00016568229479071872, |
|
"loss": 0.9524, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.464163822525597, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001652319661058729, |
|
"loss": 0.9557, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 3.4812286689419794, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00016477932280647747, |
|
"loss": 0.9635, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.4982935153583616, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00016432438095359623, |
|
"loss": 0.9549, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 3.515358361774744, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00016386715668985211, |
|
"loss": 0.9456, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.532423208191126, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00016340766623885438, |
|
"loss": 0.945, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 3.5494880546075085, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00016294592590462316, |
|
"loss": 0.95, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.5665529010238908, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0001624819520710107, |
|
"loss": 0.9583, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 3.583617747440273, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00016201576120112007, |
|
"loss": 0.9443, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.6006825938566553, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001615473698367212, |
|
"loss": 0.9635, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 3.6177474402730376, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00016107679459766367, |
|
"loss": 0.9524, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.63481228668942, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001606040521812872, |
|
"loss": 0.9552, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 3.651877133105802, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00016012915936182892, |
|
"loss": 0.9502, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.6689419795221845, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00015965213298982855, |
|
"loss": 0.9629, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 3.6860068259385663, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00015917298999153015, |
|
"loss": 0.9591, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.703071672354949, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00015869174736828168, |
|
"loss": 0.9699, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 3.720136518771331, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00015820842219593182, |
|
"loss": 0.9478, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.737201365187713, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00015772303162422385, |
|
"loss": 0.9646, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 3.7542662116040955, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00015723559287618728, |
|
"loss": 0.9601, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.7713310580204777, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00015674612324752683, |
|
"loss": 0.9548, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 3.78839590443686, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00015625464010600844, |
|
"loss": 0.9625, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.8054607508532423, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00015576116089084327, |
|
"loss": 0.9448, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 3.8225255972696246, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00015526570311206884, |
|
"loss": 0.9547, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.839590443686007, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00015476828434992762, |
|
"loss": 0.9527, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 3.856655290102389, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00015426892225424337, |
|
"loss": 0.9499, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.8737201365187715, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00015376763454379478, |
|
"loss": 0.9593, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 3.8907849829351537, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001532644390056868, |
|
"loss": 0.9457, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.9078498293515356, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00015275935349471959, |
|
"loss": 0.9622, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 3.9249146757679183, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00015225239593275473, |
|
"loss": 0.9584, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.9419795221843, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00015174358430807957, |
|
"loss": 0.9547, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 3.9590443686006824, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00015123293667476887, |
|
"loss": 0.9546, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.9761092150170647, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00015072047115204397, |
|
"loss": 0.945, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 3.993174061433447, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00015020620592363034, |
|
"loss": 0.9551, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.529212474822998, |
|
"eval_runtime": 0.5437, |
|
"eval_samples_per_second": 18.394, |
|
"eval_steps_per_second": 1.839, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 4.010238907849829, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00014969015923711195, |
|
"loss": 0.925, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 4.027303754266212, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00014917234940328396, |
|
"loss": 0.9111, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.044368600682594, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00014865279479550292, |
|
"loss": 0.9124, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 4.061433447098976, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00014813151384903493, |
|
"loss": 0.912, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.078498293515358, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00014760852506040162, |
|
"loss": 0.9113, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 4.09556313993174, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001470838469867234, |
|
"loss": 0.9168, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.112627986348123, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00014655749824506151, |
|
"loss": 0.9152, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 4.129692832764505, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00014602949751175713, |
|
"loss": 0.9098, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.146757679180888, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00014549986352176882, |
|
"loss": 0.9213, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 4.163822525597269, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00014496861506800758, |
|
"loss": 0.9128, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.180887372013652, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001444357710006703, |
|
"loss": 0.9102, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 4.197952218430034, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001439013502265707, |
|
"loss": 0.9058, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.215017064846417, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00014336537170846848, |
|
"loss": 0.9233, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 4.2320819112627985, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00014282785446439653, |
|
"loss": 0.9092, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.249146757679181, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00014228881756698603, |
|
"loss": 0.9093, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 4.266211604095563, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00014174828014278985, |
|
"loss": 0.9271, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.283276450511945, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00014120626137160375, |
|
"loss": 0.9189, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 4.300341296928328, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00014066278048578584, |
|
"loss": 0.9078, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.3174061433447095, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00014011785676957422, |
|
"loss": 0.9115, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 4.334470989761092, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00013957150955840267, |
|
"loss": 0.9099, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.351535836177474, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001390237582382147, |
|
"loss": 0.9208, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 4.368600682593857, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00013847462224477538, |
|
"loss": 0.9133, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.385665529010239, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00013792412106298198, |
|
"loss": 0.9088, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 4.402730375426621, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00013737227422617267, |
|
"loss": 0.9176, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.419795221843003, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00013681910131543309, |
|
"loss": 0.9143, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 4.436860068259386, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00013626462195890168, |
|
"loss": 0.9148, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.453924914675768, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00013570885583107347, |
|
"loss": 0.9165, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 4.4709897610921505, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00013515182265210165, |
|
"loss": 0.9198, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.488054607508532, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00013459354218709794, |
|
"loss": 0.9294, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 4.505119453924914, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00013403403424543139, |
|
"loss": 0.9137, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.522184300341297, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00013347331868002527, |
|
"loss": 0.9172, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 4.53924914675768, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001329114153866529, |
|
"loss": 0.9237, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.5563139931740615, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00013234834430323145, |
|
"loss": 0.9144, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 4.573378839590443, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00013178412540911457, |
|
"loss": 0.9193, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.590443686006826, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00013121877872438354, |
|
"loss": 0.9217, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 4.607508532423208, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00013065232430913676, |
|
"loss": 0.9252, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.624573378839591, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00013008478226277816, |
|
"loss": 0.9265, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 4.6416382252559725, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00012951617272330377, |
|
"loss": 0.9221, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.658703071672355, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00012894651586658736, |
|
"loss": 0.9131, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 4.675767918088737, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00012837583190566446, |
|
"loss": 0.9109, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.69283276450512, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00012780414109001518, |
|
"loss": 0.9204, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 4.709897610921502, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00012723146370484568, |
|
"loss": 0.9154, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.726962457337884, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00012665782007036835, |
|
"loss": 0.9251, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 4.744027303754266, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0001260832305410809, |
|
"loss": 0.926, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.761092150170649, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00012550771550504396, |
|
"loss": 0.9137, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 4.778156996587031, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00012493129538315788, |
|
"loss": 0.9181, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.795221843003413, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00012435399062843796, |
|
"loss": 0.9207, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 4.812286689419795, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00012377582172528877, |
|
"loss": 0.9156, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.829351535836177, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00012319680918877732, |
|
"loss": 0.9222, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 4.84641638225256, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00012261697356390506, |
|
"loss": 0.9297, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.863481228668942, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00012203633542487907, |
|
"loss": 0.9146, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 4.8805460750853245, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00012145491537438174, |
|
"loss": 0.917, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.897610921501706, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00012087273404284002, |
|
"loss": 0.912, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 4.914675767918089, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0001202898120876932, |
|
"loss": 0.9224, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.931740614334471, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00011970617019266, |
|
"loss": 0.9167, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 4.948805460750854, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00011912182906700466, |
|
"loss": 0.9166, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.965870307167235, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00011853680944480206, |
|
"loss": 0.9243, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 4.982935153583618, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00011795113208420208, |
|
"loss": 0.9128, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.9287, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.5924570560455322, |
|
"eval_runtime": 0.5421, |
|
"eval_samples_per_second": 18.446, |
|
"eval_steps_per_second": 1.845, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 5.017064846416382, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00011677788729636427, |
|
"loss": 0.8743, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.034129692832765, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001161903614991679, |
|
"loss": 0.8731, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 5.051194539249146, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00011560226122218, |
|
"loss": 0.8735, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.068259385665529, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00011501360733286085, |
|
"loss": 0.8808, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 5.085324232081911, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00011442442071831434, |
|
"loss": 0.8776, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.102389078498294, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00011383472228454699, |
|
"loss": 0.872, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00011324453295572618, |
|
"loss": 0.8801, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.136518771331058, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00011265387367343763, |
|
"loss": 0.8767, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 5.15358361774744, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00011206276539594221, |
|
"loss": 0.8764, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.170648464163823, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00011147122909743257, |
|
"loss": 0.8768, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 5.187713310580205, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00011087928576728865, |
|
"loss": 0.8848, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.204778156996587, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00011028695640933309, |
|
"loss": 0.8905, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 5.221843003412969, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00010969426204108583, |
|
"loss": 0.8872, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.238907849829351, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00010910122369301842, |
|
"loss": 0.8749, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 5.255972696245734, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00010850786240780786, |
|
"loss": 0.884, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.273037542662116, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00010791419923958976, |
|
"loss": 0.8739, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 5.290102389078498, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00010732025525321145, |
|
"loss": 0.8902, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.30716723549488, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00010672605152348449, |
|
"loss": 0.8863, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 5.324232081911263, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00010613160913443682, |
|
"loss": 0.8752, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.341296928327645, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00010553694917856478, |
|
"loss": 0.8782, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 5.3583617747440275, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00010494209275608455, |
|
"loss": 0.8804, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 5.375426621160409, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00010434706097418338, |
|
"loss": 0.8889, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 5.392491467576792, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00010375187494627098, |
|
"loss": 0.8861, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.409556313993174, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00010315655579123, |
|
"loss": 0.878, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 5.426621160409557, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00010256112463266687, |
|
"loss": 0.893, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.4436860068259385, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00010196560259816221, |
|
"loss": 0.8913, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 5.460750853242321, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00010137001081852113, |
|
"loss": 0.8848, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.477815699658703, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00010077437042702362, |
|
"loss": 0.8867, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 5.494880546075085, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00010017870255867445, |
|
"loss": 0.8843, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 5.511945392491468, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 9.958302834945332e-05, |
|
"loss": 0.8905, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 5.5290102389078495, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 9.898736893556502e-05, |
|
"loss": 0.8903, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.546075085324232, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.839174545268931e-05, |
|
"loss": 0.897, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 5.563139931740614, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 9.7796179035231e-05, |
|
"loss": 0.8925, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.580204778156997, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 9.720069081557009e-05, |
|
"loss": 0.8748, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 5.597269624573379, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.660530192331191e-05, |
|
"loss": 0.8829, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.614334470989761, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 9.601003348453734e-05, |
|
"loss": 0.8922, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 5.631399317406143, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 9.541490662105326e-05, |
|
"loss": 0.8936, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.648464163822526, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 9.481994244964297e-05, |
|
"loss": 0.8897, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 5.665529010238908, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 9.422516208131709e-05, |
|
"loss": 0.8762, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.6825938566552905, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 9.363058662056443e-05, |
|
"loss": 0.8842, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 5.699658703071672, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.303623716460297e-05, |
|
"loss": 0.8906, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.716723549488055, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.244213480263148e-05, |
|
"loss": 0.8911, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 5.733788395904437, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.184830061508113e-05, |
|
"loss": 0.8893, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.750853242320819, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 9.125475567286744e-05, |
|
"loss": 0.8826, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 5.7679180887372015, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 9.066152103664283e-05, |
|
"loss": 0.8845, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.784982935153583, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.006861775604904e-05, |
|
"loss": 0.8808, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 5.802047781569966, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 8.947606686897045e-05, |
|
"loss": 0.8829, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.819112627986348, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 8.88838894007875e-05, |
|
"loss": 0.8835, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 5.836177474402731, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 8.829210636363067e-05, |
|
"loss": 0.8894, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.853242320819112, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 8.770073875563493e-05, |
|
"loss": 0.8822, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 5.870307167235495, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 8.710980756019467e-05, |
|
"loss": 0.8811, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.887372013651877, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 8.651933374521907e-05, |
|
"loss": 0.8906, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 5.90443686006826, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 8.592933826238818e-05, |
|
"loss": 0.8773, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.921501706484642, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 8.533984204640941e-05, |
|
"loss": 0.8843, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 5.938566552901024, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 8.4750866014275e-05, |
|
"loss": 0.8907, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.955631399317406, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 8.416243106451934e-05, |
|
"loss": 0.8795, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 5.972696245733788, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 8.357455807647778e-05, |
|
"loss": 0.8767, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.989761092150171, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 8.29872679095457e-05, |
|
"loss": 0.8733, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.6554951667785645, |
|
"eval_runtime": 0.5458, |
|
"eval_samples_per_second": 18.321, |
|
"eval_steps_per_second": 1.832, |
|
"step": 1758 |
|
}, |
|
{ |
|
"epoch": 6.006825938566553, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 8.240058140243834e-05, |
|
"loss": 0.8646, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 6.023890784982935, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 8.181451937245131e-05, |
|
"loss": 0.8498, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 6.040955631399317, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 8.122910261472214e-05, |
|
"loss": 0.8455, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 6.0580204778157, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 8.064435190149218e-05, |
|
"loss": 0.8363, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 6.075085324232082, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 8.006028798136962e-05, |
|
"loss": 0.855, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 6.092150170648464, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 7.947693157859337e-05, |
|
"loss": 0.8556, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 6.109215017064846, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 7.889430339229754e-05, |
|
"loss": 0.8606, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 6.126279863481229, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 7.831242409577716e-05, |
|
"loss": 0.8535, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 6.143344709897611, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 7.773131433575444e-05, |
|
"loss": 0.851, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.160409556313994, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 7.715099473164632e-05, |
|
"loss": 0.8468, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 6.177474402730375, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 7.657148587483271e-05, |
|
"loss": 0.8518, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 6.194539249146757, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 7.599280832792596e-05, |
|
"loss": 0.8467, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 6.21160409556314, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 7.541498262404125e-05, |
|
"loss": 0.8549, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 6.228668941979522, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 7.483802926606787e-05, |
|
"loss": 0.8534, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 6.2457337883959045, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 7.426196872594182e-05, |
|
"loss": 0.8491, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 6.262798634812286, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 7.368682144391944e-05, |
|
"loss": 0.8503, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 6.279863481228669, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 7.311260782785207e-05, |
|
"loss": 0.8528, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 6.296928327645051, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 7.253934825246193e-05, |
|
"loss": 0.8592, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 6.313993174061434, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 7.196706305861925e-05, |
|
"loss": 0.8528, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 6.3310580204778155, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 7.139577255262034e-05, |
|
"loss": 0.8528, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 6.348122866894198, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.082549700546726e-05, |
|
"loss": 0.8561, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 6.36518771331058, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 7.025625665214844e-05, |
|
"loss": 0.8562, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 6.382252559726963, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 6.968807169092059e-05, |
|
"loss": 0.8561, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 6.399317406143345, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 6.912096228259236e-05, |
|
"loss": 0.8598, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 6.4163822525597265, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 6.855494854980857e-05, |
|
"loss": 0.8573, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 6.433447098976109, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 6.799005057633644e-05, |
|
"loss": 0.8576, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 6.450511945392491, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 6.742628840635284e-05, |
|
"loss": 0.855, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 6.467576791808874, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 6.68636820437331e-05, |
|
"loss": 0.8628, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 6.484641638225256, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 6.630225145134144e-05, |
|
"loss": 0.8489, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.501706484641638, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 6.574201655032216e-05, |
|
"loss": 0.8534, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 6.51877133105802, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 6.518299721939323e-05, |
|
"loss": 0.8582, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 6.535836177474403, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 6.462521329414066e-05, |
|
"loss": 0.8561, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 6.552901023890785, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 6.406868456631483e-05, |
|
"loss": 0.8618, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 6.5699658703071675, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 6.351343078312819e-05, |
|
"loss": 0.8575, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 6.587030716723549, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 6.295947164655447e-05, |
|
"loss": 0.8504, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 6.604095563139932, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 6.240682681262971e-05, |
|
"loss": 0.8619, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 6.621160409556314, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 6.185551589075482e-05, |
|
"loss": 0.8536, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 6.638225255972696, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 6.130555844299973e-05, |
|
"loss": 0.8511, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 6.6552901023890785, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 6.075697398340913e-05, |
|
"loss": 0.859, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 6.672354948805461, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 6.0209781977310486e-05, |
|
"loss": 0.8617, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 6.689419795221843, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 5.9664001840622886e-05, |
|
"loss": 0.8478, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 6.706484641638225, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 5.91196529391683e-05, |
|
"loss": 0.8548, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 6.723549488054608, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 5.857675458798453e-05, |
|
"loss": 0.8623, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 6.7406143344709895, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 5.8035326050639615e-05, |
|
"loss": 0.853, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 6.757679180887372, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 5.749538653854861e-05, |
|
"loss": 0.8594, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.774744027303754, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 5.695695521029163e-05, |
|
"loss": 0.8528, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 6.791808873720137, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 5.642005117093419e-05, |
|
"loss": 0.8485, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 6.808873720136519, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 5.5884693471349256e-05, |
|
"loss": 0.8578, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 6.825938566552901, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 5.535090110754131e-05, |
|
"loss": 0.8549, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.843003412969283, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 5.481869301997236e-05, |
|
"loss": 0.8625, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 6.860068259385666, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.428808809288975e-05, |
|
"loss": 0.8529, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.877133105802048, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 5.37591051536561e-05, |
|
"loss": 0.8505, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 6.8941979522184305, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.32317629720814e-05, |
|
"loss": 0.8585, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.911262798634812, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.270608025975686e-05, |
|
"loss": 0.8563, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 6.928327645051194, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 5.218207566939116e-05, |
|
"loss": 0.8534, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 6.945392491467577, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 5.1659767794148316e-05, |
|
"loss": 0.853, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 6.962457337883959, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 5.1139175166988187e-05, |
|
"loss": 0.8622, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.979522184300341, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 5.062031626000873e-05, |
|
"loss": 0.8602, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 6.996587030716723, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 5.0103209483790636e-05, |
|
"loss": 0.8577, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.731566905975342, |
|
"eval_runtime": 0.5528, |
|
"eval_samples_per_second": 18.088, |
|
"eval_steps_per_second": 1.809, |
|
"step": 2051 |
|
}, |
|
{ |
|
"epoch": 7.013651877133106, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 4.9587873186744025e-05, |
|
"loss": 0.8366, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 7.030716723549488, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.9074325654457446e-05, |
|
"loss": 0.8237, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 7.047781569965871, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 4.856258510904899e-05, |
|
"loss": 0.8231, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 7.064846416382252, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.805266970851975e-05, |
|
"loss": 0.8253, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 7.081911262798635, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 4.7544597546109514e-05, |
|
"loss": 0.8313, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 7.098976109215017, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 4.7038386649654764e-05, |
|
"loss": 0.8322, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 7.1160409556314, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.6534054980949113e-05, |
|
"loss": 0.8317, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 7.1331058020477816, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 4.603162043510566e-05, |
|
"loss": 0.8356, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 7.150170648464163, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.553110083992237e-05, |
|
"loss": 0.8289, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 7.167235494880546, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 4.50325139552493e-05, |
|
"loss": 0.8382, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 7.184300341296928, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 4.4535877472358466e-05, |
|
"loss": 0.8363, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 7.201365187713311, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 4.404120901331618e-05, |
|
"loss": 0.8388, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 7.2184300341296925, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 4.354852613035763e-05, |
|
"loss": 0.8291, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 7.235494880546075, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 4.305784630526416e-05, |
|
"loss": 0.8361, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 7.252559726962457, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.2569186948743e-05, |
|
"loss": 0.8416, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 7.26962457337884, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.2082565399809404e-05, |
|
"loss": 0.8281, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 7.286689419795222, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 4.159799892517148e-05, |
|
"loss": 0.8281, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 7.303754266211604, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 4.111550471861747e-05, |
|
"loss": 0.8352, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 7.320819112627986, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 4.06350999004057e-05, |
|
"loss": 0.833, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 7.337883959044369, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 4.0156801516657095e-05, |
|
"loss": 0.825, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 7.354948805460751, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.968062653875031e-05, |
|
"loss": 0.8386, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 7.372013651877133, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.920659186271953e-05, |
|
"loss": 0.8454, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 7.389078498293515, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 3.873471430865515e-05, |
|
"loss": 0.8431, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 7.406143344709897, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.8265010620106533e-05, |
|
"loss": 0.8392, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 7.42320819112628, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.779749746348831e-05, |
|
"loss": 0.8362, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 7.440273037542662, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 3.7332191427488784e-05, |
|
"loss": 0.8348, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 7.4573378839590445, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 3.6869109022481386e-05, |
|
"loss": 0.831, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 7.474402730375426, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 3.640826667993891e-05, |
|
"loss": 0.8314, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 7.491467576791809, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 3.59496807518503e-05, |
|
"loss": 0.8258, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 7.508532423208191, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.549336751014057e-05, |
|
"loss": 0.8482, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 7.525597269624574, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 3.503934314609343e-05, |
|
"loss": 0.8387, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 7.5426621160409555, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 3.458762376977669e-05, |
|
"loss": 0.8344, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 7.559726962457338, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 3.41382254094707e-05, |
|
"loss": 0.8315, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 7.57679180887372, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 3.369116401109963e-05, |
|
"loss": 0.8331, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 7.593856655290102, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.3246455437665594e-05, |
|
"loss": 0.8322, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 7.610921501706485, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 3.280411546868583e-05, |
|
"loss": 0.8281, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 7.627986348122867, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 3.2364159799632786e-05, |
|
"loss": 0.8281, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 7.645051194539249, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 3.192660404137729e-05, |
|
"loss": 0.832, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 7.662116040955631, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 3.14914637196345e-05, |
|
"loss": 0.8361, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 7.679180887372014, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.105875427441297e-05, |
|
"loss": 0.837, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 7.696245733788396, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 3.0628491059467014e-05, |
|
"loss": 0.8351, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 7.713310580204778, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 3.020068934175171e-05, |
|
"loss": 0.838, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 7.73037542662116, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 2.977536430088125e-05, |
|
"loss": 0.8355, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 7.747440273037543, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 2.9352531028590424e-05, |
|
"loss": 0.8261, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 7.764505119453925, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.8932204528198926e-05, |
|
"loss": 0.8367, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 7.7815699658703075, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 2.8514399714079132e-05, |
|
"loss": 0.8405, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.798634812286689, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.8099131411126867e-05, |
|
"loss": 0.8408, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 7.815699658703072, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 2.7686414354235356e-05, |
|
"loss": 0.8397, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 7.832764505119454, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.7276263187772423e-05, |
|
"loss": 0.8385, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 7.849829351535837, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 2.6868692465060828e-05, |
|
"loss": 0.8309, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.8668941979522184, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.6463716647861904e-05, |
|
"loss": 0.8229, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 7.8839590443686, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.6061350105862382e-05, |
|
"loss": 0.8226, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.901023890784983, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.5661607116164532e-05, |
|
"loss": 0.8334, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 7.918088737201365, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 2.5264501862779667e-05, |
|
"loss": 0.8444, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.935153583617748, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.4870048436124595e-05, |
|
"loss": 0.8403, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 7.952218430034129, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 2.4478260832521938e-05, |
|
"loss": 0.8302, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 7.969283276450512, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.4089152953703332e-05, |
|
"loss": 0.8265, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 7.986348122866894, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.37027386063162e-05, |
|
"loss": 0.8364, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.7742018699645996, |
|
"eval_runtime": 0.5517, |
|
"eval_samples_per_second": 18.125, |
|
"eval_steps_per_second": 1.813, |
|
"step": 2344 |
|
}, |
|
{ |
|
"epoch": 8.003412969283277, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.331903150143391e-05, |
|
"loss": 0.83, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 8.020477815699659, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 2.293804525406915e-05, |
|
"loss": 0.8208, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 8.03754266211604, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 2.255979338269093e-05, |
|
"loss": 0.8288, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 8.054607508532424, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 2.2184289308744844e-05, |
|
"loss": 0.8251, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 8.071672354948806, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.1811546356176872e-05, |
|
"loss": 0.8202, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 8.088737201365188, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 2.144157775096063e-05, |
|
"loss": 0.8191, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 8.10580204778157, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.1074396620628e-05, |
|
"loss": 0.8161, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 8.122866894197951, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 2.0710015993803422e-05, |
|
"loss": 0.8259, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 8.139931740614335, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.0348448799741537e-05, |
|
"loss": 0.8271, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 8.156996587030717, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.9989707867868425e-05, |
|
"loss": 0.8222, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 8.174061433447099, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.9633805927326387e-05, |
|
"loss": 0.8176, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 8.19112627986348, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.9280755606522384e-05, |
|
"loss": 0.8303, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 8.208191126279864, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.893056943267969e-05, |
|
"loss": 0.8179, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 8.225255972696246, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.8583259831393663e-05, |
|
"loss": 0.8219, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 8.242320819112628, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.8238839126190686e-05, |
|
"loss": 0.829, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 8.25938566552901, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.7897319538090962e-05, |
|
"loss": 0.8233, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 8.276450511945393, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.755871318517488e-05, |
|
"loss": 0.8224, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 8.293515358361775, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 1.722303208215297e-05, |
|
"loss": 0.8239, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 8.310580204778157, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.6890288139939625e-05, |
|
"loss": 0.8324, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 8.327645051194539, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.6560493165230516e-05, |
|
"loss": 0.8216, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 8.344709897610922, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.623365886008357e-05, |
|
"loss": 0.8249, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 8.361774744027304, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.5909796821503785e-05, |
|
"loss": 0.8327, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 8.378839590443686, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.5588918541031783e-05, |
|
"loss": 0.8202, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 8.395904436860068, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.5271035404335954e-05, |
|
"loss": 0.8213, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 8.41296928327645, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.4956158690808585e-05, |
|
"loss": 0.8217, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 8.430034129692833, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 1.464429957316552e-05, |
|
"loss": 0.8235, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 8.447098976109215, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.433546911704977e-05, |
|
"loss": 0.8257, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 8.464163822525597, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.402967828063897e-05, |
|
"loss": 0.8228, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 8.481228668941979, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.37269379142563e-05, |
|
"loss": 0.8155, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 8.498293515358363, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.3427258759985739e-05, |
|
"loss": 0.8329, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 8.515358361774744, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.3130651451290798e-05, |
|
"loss": 0.8224, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 8.532423208191126, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.2837126512637198e-05, |
|
"loss": 0.8219, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 8.549488054607508, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.2546694359119493e-05, |
|
"loss": 0.8151, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 8.56655290102389, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.2259365296091464e-05, |
|
"loss": 0.8237, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 8.583617747440274, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 1.1975149518800454e-05, |
|
"loss": 0.8207, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 8.600682593856655, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.1694057112025636e-05, |
|
"loss": 0.8221, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 8.617747440273037, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.141609804972017e-05, |
|
"loss": 0.828, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 8.634812286689419, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.1141282194657287e-05, |
|
"loss": 0.8232, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 8.651877133105803, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 1.086961929808038e-05, |
|
"loss": 0.8281, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 8.668941979522184, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 1.0601118999356907e-05, |
|
"loss": 0.8252, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 8.686006825938566, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.0335790825636449e-05, |
|
"loss": 0.8225, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 8.703071672354948, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.00736441915126e-05, |
|
"loss": 0.8199, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 8.720136518771332, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 9.814688398688998e-06, |
|
"loss": 0.8146, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 8.737201365187714, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 9.558932635649131e-06, |
|
"loss": 0.8303, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 8.754266211604095, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 9.306385977330411e-06, |
|
"loss": 0.8224, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 8.771331058020477, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 9.057057384802181e-06, |
|
"loss": 0.8228, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 8.788395904436861, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.810955704947666e-06, |
|
"loss": 0.8231, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 8.805460750853243, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 8.568089670150115e-06, |
|
"loss": 0.8278, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.822525597269625, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 8.328467897982995e-06, |
|
"loss": 0.8248, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 8.839590443686006, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 8.092098890904098e-06, |
|
"loss": 0.8195, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 8.856655290102388, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.858991035953944e-06, |
|
"loss": 0.8203, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 8.873720136518772, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 7.629152604458156e-06, |
|
"loss": 0.8257, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.890784982935154, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.402591751733989e-06, |
|
"loss": 0.8128, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 8.907849829351536, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 7.179316516800894e-06, |
|
"loss": 0.8251, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 8.924914675767917, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 6.959334822095354e-06, |
|
"loss": 0.824, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 8.941979522184301, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 6.7426544731897245e-06, |
|
"loss": 0.8287, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 8.959044368600683, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 6.529283158515276e-06, |
|
"loss": 0.8264, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 8.976109215017065, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 6.319228449089376e-06, |
|
"loss": 0.8179, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 8.993174061433447, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 6.11249779824693e-06, |
|
"loss": 0.8311, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.7970776557922363, |
|
"eval_runtime": 0.547, |
|
"eval_samples_per_second": 18.282, |
|
"eval_steps_per_second": 1.828, |
|
"step": 2637 |
|
}, |
|
{ |
|
"epoch": 9.01023890784983, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 5.909098541375746e-06, |
|
"loss": 0.827, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 9.027303754266212, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 5.7090378956564216e-06, |
|
"loss": 0.8173, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 9.044368600682594, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.512322959806193e-06, |
|
"loss": 0.8315, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 9.061433447098976, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 5.3189607138270255e-06, |
|
"loss": 0.8278, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 9.078498293515358, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 5.128958018758012e-06, |
|
"loss": 0.821, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 9.095563139931741, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 4.942321616431833e-06, |
|
"loss": 0.8261, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 9.112627986348123, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 4.7590581292356276e-06, |
|
"loss": 0.8267, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 9.129692832764505, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.579174059875946e-06, |
|
"loss": 0.8265, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 9.146757679180887, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 4.402675791148059e-06, |
|
"loss": 0.8217, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 9.16382252559727, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 4.229569585709425e-06, |
|
"loss": 0.8245, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 9.180887372013652, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 4.0598615858575605e-06, |
|
"loss": 0.8211, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 9.197952218430034, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 3.89355781331201e-06, |
|
"loss": 0.8162, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 9.215017064846416, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 3.730664169000708e-06, |
|
"loss": 0.8154, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 9.2320819112628, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 3.571186432850626e-06, |
|
"loss": 0.8245, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 9.249146757679181, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 3.415130263582611e-06, |
|
"loss": 0.8198, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 9.266211604095563, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 3.2625011985107257e-06, |
|
"loss": 0.8178, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 9.283276450511945, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 3.1133046533455947e-06, |
|
"loss": 0.825, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 9.300341296928327, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.967545922002379e-06, |
|
"loss": 0.8249, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 9.31740614334471, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.8252301764128962e-06, |
|
"loss": 0.8228, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 9.334470989761092, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.686362466342085e-06, |
|
"loss": 0.822, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 9.351535836177474, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.550947719208829e-06, |
|
"loss": 0.8224, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 9.368600682593856, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.4189907399111534e-06, |
|
"loss": 0.8224, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 9.38566552901024, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.2904962106556793e-06, |
|
"loss": 0.82, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 9.402730375426621, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.1654686907915167e-06, |
|
"loss": 0.8183, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 9.419795221843003, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.0439126166485025e-06, |
|
"loss": 0.8189, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 9.436860068259385, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.925832301379726e-06, |
|
"loss": 0.8215, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 9.453924914675769, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.8112319348085771e-06, |
|
"loss": 0.8235, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 9.47098976109215, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.700115583279993e-06, |
|
"loss": 0.8157, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 9.488054607508532, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.592487189516212e-06, |
|
"loss": 0.8192, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 9.505119453924914, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.4883505724768932e-06, |
|
"loss": 0.8168, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 9.522184300341298, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.3877094272235712e-06, |
|
"loss": 0.8296, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 9.53924914675768, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.2905673247885718e-06, |
|
"loss": 0.8166, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 9.556313993174061, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.196927712048257e-06, |
|
"loss": 0.817, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 9.573378839590443, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.1067939116008009e-06, |
|
"loss": 0.813, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 9.590443686006825, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.020169121648218e-06, |
|
"loss": 0.8114, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 9.607508532423209, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 9.370564158829087e-07, |
|
"loss": 0.8146, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 9.62457337883959, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 8.574587433786363e-07, |
|
"loss": 0.8216, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 9.641638225255972, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 7.813789284857986e-07, |
|
"loss": 0.8157, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 9.658703071672354, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.088196707312977e-07, |
|
"loss": 0.8283, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 9.675767918088738, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 6.39783544722694e-07, |
|
"loss": 0.8092, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 9.69283276450512, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 5.742730000568908e-07, |
|
"loss": 0.8242, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 9.709897610921502, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 5.12290361233192e-07, |
|
"loss": 0.8239, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 9.726962457337883, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 4.538378275708133e-07, |
|
"loss": 0.8145, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 9.744027303754265, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.989174731308998e-07, |
|
"loss": 0.8249, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 9.761092150170649, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 3.4753124664286265e-07, |
|
"loss": 0.817, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 9.77815699658703, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 2.9968097143526775e-07, |
|
"loss": 0.8115, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 9.795221843003413, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 2.5536834537114307e-07, |
|
"loss": 0.8192, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 9.812286689419794, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 2.145949407877157e-07, |
|
"loss": 0.8181, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 9.829351535836178, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 1.7736220444064533e-07, |
|
"loss": 0.8203, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.84641638225256, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 1.436714574526543e-07, |
|
"loss": 0.826, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 9.863481228668942, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.1352389526668727e-07, |
|
"loss": 0.8241, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 9.880546075085324, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 8.692058760345622e-08, |
|
"loss": 0.8268, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 9.897610921501707, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 6.386247842353754e-08, |
|
"loss": 0.8106, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.914675767918089, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 4.435038589380991e-08, |
|
"loss": 0.8232, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 9.93174061433447, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 2.8385002358466418e-08, |
|
"loss": 0.8187, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 9.948805460750853, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 1.5966894314456415e-08, |
|
"loss": 0.8284, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 9.965870307167236, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 7.096502391346071e-09, |
|
"loss": 0.8275, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.982935153583618, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 1.7741413357197368e-09, |
|
"loss": 0.8271, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0, |
|
"loss": 0.8243, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.7977683544158936, |
|
"eval_runtime": 0.5422, |
|
"eval_samples_per_second": 18.444, |
|
"eval_steps_per_second": 1.844, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 2930, |
|
"total_flos": 1.7464232891960525e+18, |
|
"train_loss": 0.9647074054125633, |
|
"train_runtime": 17674.2713, |
|
"train_samples_per_second": 7.945, |
|
"train_steps_per_second": 0.166 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2930, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7464232891960525e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|