{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 20,
  "global_step": 1320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0030303030303030303,
      "grad_norm": 0.42330464720726013,
      "learning_rate": 4e-07,
      "loss": 2.4685,
      "num_input_tokens_seen": 10296,
      "step": 2
    },
    {
      "epoch": 0.006060606060606061,
      "grad_norm": 0.4667194187641144,
      "learning_rate": 8e-07,
      "loss": 2.4399,
      "num_input_tokens_seen": 20376,
      "step": 4
    },
    {
      "epoch": 0.00909090909090909,
      "grad_norm": 0.38802874088287354,
      "learning_rate": 1.2e-06,
      "loss": 2.3101,
      "num_input_tokens_seen": 32664,
      "step": 6
    },
    {
      "epoch": 0.012121212121212121,
      "grad_norm": 0.4379090368747711,
      "learning_rate": 1.6e-06,
      "loss": 2.2743,
      "num_input_tokens_seen": 41904,
      "step": 8
    },
    {
      "epoch": 0.015151515151515152,
      "grad_norm": 0.4267907738685608,
      "learning_rate": 2e-06,
      "loss": 2.355,
      "num_input_tokens_seen": 52776,
      "step": 10
    },
    {
      "epoch": 0.01818181818181818,
      "grad_norm": 0.5171758532524109,
      "learning_rate": 1.999990798125535e-06,
      "loss": 2.633,
      "num_input_tokens_seen": 61464,
      "step": 12
    },
    {
      "epoch": 0.021212121212121213,
      "grad_norm": 0.47265326976776123,
      "learning_rate": 1.9999631927138275e-06,
      "loss": 2.3386,
      "num_input_tokens_seen": 72624,
      "step": 14
    },
    {
      "epoch": 0.024242424242424242,
      "grad_norm": 0.5586420893669128,
      "learning_rate": 1.9999171843999306e-06,
      "loss": 2.3536,
      "num_input_tokens_seen": 81840,
      "step": 16
    },
    {
      "epoch": 0.02727272727272727,
      "grad_norm": 0.39176592230796814,
      "learning_rate": 1.9998527742422515e-06,
      "loss": 2.2979,
      "num_input_tokens_seen": 91968,
      "step": 18
    },
    {
      "epoch": 0.030303030303030304,
      "grad_norm": 0.4795871078968048,
      "learning_rate": 1.9997699637225253e-06,
      "loss": 2.3755,
      "num_input_tokens_seen": 102984,
      "step": 20
    },
    {
      "epoch": 0.030303030303030304,
      "eval_loss": 2.3641138076782227,
      "eval_runtime": 5.815,
      "eval_samples_per_second": 3.439,
      "eval_steps_per_second": 3.439,
      "num_input_tokens_seen": 102984,
      "step": 20
    },
    {
      "epoch": 0.03333333333333333,
      "grad_norm": 0.4541929364204407,
      "learning_rate": 1.9996687547457825e-06,
      "loss": 2.286,
      "num_input_tokens_seen": 113352,
      "step": 22
    },
    {
      "epoch": 0.03636363636363636,
      "grad_norm": 0.4055442810058594,
      "learning_rate": 1.999549149640303e-06,
      "loss": 2.3933,
      "num_input_tokens_seen": 125184,
      "step": 24
    },
    {
      "epoch": 0.03939393939393939,
      "grad_norm": 0.5810754299163818,
      "learning_rate": 1.9994111511575657e-06,
      "loss": 2.1378,
      "num_input_tokens_seen": 135480,
      "step": 26
    },
    {
      "epoch": 0.04242424242424243,
      "grad_norm": 0.41868993639945984,
      "learning_rate": 1.999254762472182e-06,
      "loss": 2.2551,
      "num_input_tokens_seen": 147384,
      "step": 28
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 0.5975711941719055,
      "learning_rate": 1.999079987181824e-06,
      "loss": 2.506,
      "num_input_tokens_seen": 156912,
      "step": 30
    },
    {
      "epoch": 0.048484848484848485,
      "grad_norm": 0.422783762216568,
      "learning_rate": 1.9988868293071435e-06,
      "loss": 2.4742,
      "num_input_tokens_seen": 167568,
      "step": 32
    },
    {
      "epoch": 0.051515151515151514,
      "grad_norm": 0.32683178782463074,
      "learning_rate": 1.998675293291676e-06,
      "loss": 2.5007,
      "num_input_tokens_seen": 176616,
      "step": 34
    },
    {
      "epoch": 0.05454545454545454,
      "grad_norm": 0.4234691858291626,
      "learning_rate": 1.998445384001741e-06,
      "loss": 2.4632,
      "num_input_tokens_seen": 187272,
      "step": 36
    },
    {
      "epoch": 0.05757575757575758,
      "grad_norm": 0.4502381980419159,
      "learning_rate": 1.99819710672633e-06,
      "loss": 2.4556,
      "num_input_tokens_seen": 196992,
      "step": 38
    },
    {
      "epoch": 0.06060606060606061,
      "grad_norm": 0.5127580165863037,
      "learning_rate": 1.9979304671769838e-06,
      "loss": 2.5355,
      "num_input_tokens_seen": 208824,
      "step": 40
    },
    {
      "epoch": 0.06060606060606061,
      "eval_loss": 2.361894130706787,
      "eval_runtime": 5.8061,
      "eval_samples_per_second": 3.445,
      "eval_steps_per_second": 3.445,
      "num_input_tokens_seen": 208824,
      "step": 40
    },
    {
      "epoch": 0.06363636363636363,
      "grad_norm": 0.5844971537590027,
      "learning_rate": 1.997645471487661e-06,
      "loss": 2.497,
      "num_input_tokens_seen": 217272,
      "step": 42
    },
    {
      "epoch": 0.06666666666666667,
      "grad_norm": 0.41816312074661255,
      "learning_rate": 1.9973421262145992e-06,
      "loss": 2.4371,
      "num_input_tokens_seen": 229560,
      "step": 44
    },
    {
      "epoch": 0.0696969696969697,
      "grad_norm": 0.505349338054657,
      "learning_rate": 1.99702043833616e-06,
      "loss": 2.4757,
      "num_input_tokens_seen": 239568,
      "step": 46
    },
    {
      "epoch": 0.07272727272727272,
      "grad_norm": 0.4537525177001953,
      "learning_rate": 1.9966804152526726e-06,
      "loss": 2.4514,
      "num_input_tokens_seen": 251664,
      "step": 48
    },
    {
      "epoch": 0.07575757575757576,
      "grad_norm": 0.40902894735336304,
      "learning_rate": 1.996322064786261e-06,
      "loss": 2.3474,
      "num_input_tokens_seen": 263040,
      "step": 50
    },
    {
      "epoch": 0.07878787878787878,
      "grad_norm": 0.48902806639671326,
      "learning_rate": 1.9959453951806656e-06,
      "loss": 2.4297,
      "num_input_tokens_seen": 271080,
      "step": 52
    },
    {
      "epoch": 0.08181818181818182,
      "grad_norm": 0.4684095084667206,
      "learning_rate": 1.995550415101052e-06,
      "loss": 2.6676,
      "num_input_tokens_seen": 282000,
      "step": 54
    },
    {
      "epoch": 0.08484848484848485,
      "grad_norm": 0.33189377188682556,
      "learning_rate": 1.9951371336338145e-06,
      "loss": 2.1799,
      "num_input_tokens_seen": 290568,
      "step": 56
    },
    {
      "epoch": 0.08787878787878788,
      "grad_norm": 0.4579316973686218,
      "learning_rate": 1.994705560286361e-06,
      "loss": 2.5315,
      "num_input_tokens_seen": 298920,
      "step": 58
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 0.42468497157096863,
      "learning_rate": 1.994255704986903e-06,
      "loss": 2.4679,
      "num_input_tokens_seen": 309744,
      "step": 60
    },
    {
      "epoch": 0.09090909090909091,
      "eval_loss": 2.360027551651001,
      "eval_runtime": 5.8148,
      "eval_samples_per_second": 3.439,
      "eval_steps_per_second": 3.439,
      "num_input_tokens_seen": 309744,
      "step": 60
    },
    {
      "epoch": 0.09393939393939393,
      "grad_norm": 0.5245186686515808,
      "learning_rate": 1.993787578084219e-06,
      "loss": 2.4576,
      "num_input_tokens_seen": 321360,
      "step": 62
    },
    {
      "epoch": 0.09696969696969697,
      "grad_norm": 0.38165679574012756,
      "learning_rate": 1.9933011903474228e-06,
      "loss": 2.275,
      "num_input_tokens_seen": 332736,
      "step": 64
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5568698644638062,
      "learning_rate": 1.992796552965711e-06,
      "loss": 2.2761,
      "num_input_tokens_seen": 344568,
      "step": 66
    },
    {
      "epoch": 0.10303030303030303,
      "grad_norm": 0.39623475074768066,
      "learning_rate": 1.9922736775481083e-06,
      "loss": 2.3385,
      "num_input_tokens_seen": 356616,
      "step": 68
    },
    {
      "epoch": 0.10606060606060606,
      "grad_norm": 0.532319188117981,
      "learning_rate": 1.991732576123199e-06,
      "loss": 2.3342,
      "num_input_tokens_seen": 367680,
      "step": 70
    },
    {
      "epoch": 0.10909090909090909,
      "grad_norm": 0.505707859992981,
      "learning_rate": 1.9911732611388524e-06,
      "loss": 2.3604,
      "num_input_tokens_seen": 377376,
      "step": 72
    },
    {
      "epoch": 0.11212121212121212,
      "grad_norm": 0.4921689033508301,
      "learning_rate": 1.9905957454619343e-06,
      "loss": 2.2869,
      "num_input_tokens_seen": 387432,
      "step": 74
    },
    {
      "epoch": 0.11515151515151516,
      "grad_norm": 0.47557827830314636,
      "learning_rate": 1.9900000423780104e-06,
      "loss": 2.601,
      "num_input_tokens_seen": 395808,
      "step": 76
    },
    {
      "epoch": 0.11818181818181818,
      "grad_norm": 1.9346156120300293,
      "learning_rate": 1.9893861655910444e-06,
      "loss": 2.3741,
      "num_input_tokens_seen": 407568,
      "step": 78
    },
    {
      "epoch": 0.12121212121212122,
      "grad_norm": 0.45454809069633484,
      "learning_rate": 1.988754129223079e-06,
      "loss": 2.3929,
      "num_input_tokens_seen": 417648,
      "step": 80
    },
    {
      "epoch": 0.12121212121212122,
      "eval_loss": 2.3575997352600098,
      "eval_runtime": 5.8145,
      "eval_samples_per_second": 3.44,
      "eval_steps_per_second": 3.44,
      "num_input_tokens_seen": 417648,
      "step": 80
    },
    {
      "epoch": 0.12424242424242424,
      "grad_norm": 0.533509373664856,
      "learning_rate": 1.9881039478139115e-06,
      "loss": 2.3717,
      "num_input_tokens_seen": 428568,
      "step": 82
    },
    {
      "epoch": 0.12727272727272726,
      "grad_norm": 0.3749203681945801,
      "learning_rate": 1.9874356363207624e-06,
      "loss": 2.2728,
      "num_input_tokens_seen": 437688,
      "step": 84
    },
    {
      "epoch": 0.1303030303030303,
      "grad_norm": 0.41353124380111694,
      "learning_rate": 1.986749210117927e-06,
      "loss": 2.5347,
      "num_input_tokens_seen": 447408,
      "step": 86
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 0.4702826142311096,
      "learning_rate": 1.986044684996425e-06,
      "loss": 2.4081,
      "num_input_tokens_seen": 456120,
      "step": 88
    },
    {
      "epoch": 0.13636363636363635,
      "grad_norm": 0.5201271772384644,
      "learning_rate": 1.985322077163636e-06,
      "loss": 2.5697,
      "num_input_tokens_seen": 467208,
      "step": 90
    },
    {
      "epoch": 0.1393939393939394,
      "grad_norm": 0.5325783491134644,
      "learning_rate": 1.9845814032429257e-06,
      "loss": 2.3267,
      "num_input_tokens_seen": 477168,
      "step": 92
    },
    {
      "epoch": 0.14242424242424243,
      "grad_norm": 0.49566376209259033,
      "learning_rate": 1.9838226802732656e-06,
      "loss": 2.5342,
      "num_input_tokens_seen": 486888,
      "step": 94
    },
    {
      "epoch": 0.14545454545454545,
      "grad_norm": 0.5317257046699524,
      "learning_rate": 1.9830459257088395e-06,
      "loss": 2.5662,
      "num_input_tokens_seen": 496584,
      "step": 96
    },
    {
      "epoch": 0.1484848484848485,
      "grad_norm": 0.6195109486579895,
      "learning_rate": 1.982251157418642e-06,
      "loss": 2.3294,
      "num_input_tokens_seen": 503736,
      "step": 98
    },
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 0.4253556728363037,
      "learning_rate": 1.981438393686069e-06,
      "loss": 2.6105,
      "num_input_tokens_seen": 513600,
      "step": 100
    },
    {
      "epoch": 0.15151515151515152,
      "eval_loss": 2.3544414043426514,
      "eval_runtime": 5.8171,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 513600,
      "step": 100
    },
    {
      "epoch": 0.15454545454545454,
      "grad_norm": 0.5861473083496094,
      "learning_rate": 1.980607653208495e-06,
      "loss": 2.6435,
      "num_input_tokens_seen": 519960,
      "step": 102
    },
    {
      "epoch": 0.15757575757575756,
      "grad_norm": 0.44223421812057495,
      "learning_rate": 1.9797589550968434e-06,
      "loss": 2.4326,
      "num_input_tokens_seen": 529392,
      "step": 104
    },
    {
      "epoch": 0.1606060606060606,
      "grad_norm": 0.7290481328964233,
      "learning_rate": 1.9788923188751478e-06,
      "loss": 2.5169,
      "num_input_tokens_seen": 537000,
      "step": 106
    },
    {
      "epoch": 0.16363636363636364,
      "grad_norm": 0.43159109354019165,
      "learning_rate": 1.978007764480103e-06,
      "loss": 2.3097,
      "num_input_tokens_seen": 546864,
      "step": 108
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 0.46773430705070496,
      "learning_rate": 1.977105312260605e-06,
      "loss": 2.2565,
      "num_input_tokens_seen": 558432,
      "step": 110
    },
    {
      "epoch": 0.1696969696969697,
      "grad_norm": 0.46607473492622375,
      "learning_rate": 1.976184982977284e-06,
      "loss": 2.3503,
      "num_input_tokens_seen": 569016,
      "step": 112
    },
    {
      "epoch": 0.17272727272727273,
      "grad_norm": 0.5427464842796326,
      "learning_rate": 1.975246797802026e-06,
      "loss": 2.2801,
      "num_input_tokens_seen": 580392,
      "step": 114
    },
    {
      "epoch": 0.17575757575757575,
      "grad_norm": 0.4266676902770996,
      "learning_rate": 1.974290778317487e-06,
      "loss": 2.4019,
      "num_input_tokens_seen": 590568,
      "step": 116
    },
    {
      "epoch": 0.1787878787878788,
      "grad_norm": 0.4442364275455475,
      "learning_rate": 1.973316946516595e-06,
      "loss": 2.3779,
      "num_input_tokens_seen": 601704,
      "step": 118
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 0.4435305595397949,
      "learning_rate": 1.9723253248020455e-06,
      "loss": 2.2488,
      "num_input_tokens_seen": 613584,
      "step": 120
    },
    {
      "epoch": 0.18181818181818182,
      "eval_loss": 2.3512158393859863,
      "eval_runtime": 5.819,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 613584,
      "step": 120
    },
    {
      "epoch": 0.18484848484848485,
      "grad_norm": 0.5893362164497375,
      "learning_rate": 1.9713159359857833e-06,
      "loss": 2.4906,
      "num_input_tokens_seen": 624792,
      "step": 122
    },
    {
      "epoch": 0.18787878787878787,
      "grad_norm": 0.4149838089942932,
      "learning_rate": 1.9702888032884826e-06,
      "loss": 2.5957,
      "num_input_tokens_seen": 635832,
      "step": 124
    },
    {
      "epoch": 0.19090909090909092,
      "grad_norm": 0.42286068201065063,
      "learning_rate": 1.969243950339009e-06,
      "loss": 2.1759,
      "num_input_tokens_seen": 647664,
      "step": 126
    },
    {
      "epoch": 0.19393939393939394,
      "grad_norm": 0.5177129507064819,
      "learning_rate": 1.9681814011738758e-06,
      "loss": 2.5093,
      "num_input_tokens_seen": 656952,
      "step": 128
    },
    {
      "epoch": 0.19696969696969696,
      "grad_norm": 0.5667068958282471,
      "learning_rate": 1.9671011802366934e-06,
      "loss": 2.5727,
      "num_input_tokens_seen": 664104,
      "step": 130
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.566889762878418,
      "learning_rate": 1.9660033123776056e-06,
      "loss": 2.3728,
      "num_input_tokens_seen": 674016,
      "step": 132
    },
    {
      "epoch": 0.20303030303030303,
      "grad_norm": 0.4465801417827606,
      "learning_rate": 1.964887822852718e-06,
      "loss": 2.4271,
      "num_input_tokens_seen": 684480,
      "step": 134
    },
    {
      "epoch": 0.20606060606060606,
      "grad_norm": 0.5765467286109924,
      "learning_rate": 1.963754737323516e-06,
      "loss": 2.5413,
      "num_input_tokens_seen": 694056,
      "step": 136
    },
    {
      "epoch": 0.20909090909090908,
      "grad_norm": 0.5330570936203003,
      "learning_rate": 1.9626040818562783e-06,
      "loss": 2.4513,
      "num_input_tokens_seen": 704640,
      "step": 138
    },
    {
      "epoch": 0.21212121212121213,
      "grad_norm": 0.6006715297698975,
      "learning_rate": 1.9614358829214722e-06,
      "loss": 2.3866,
      "num_input_tokens_seen": 713640,
      "step": 140
    },
    {
      "epoch": 0.21212121212121213,
      "eval_loss": 2.349419355392456,
      "eval_runtime": 5.8237,
      "eval_samples_per_second": 3.434,
      "eval_steps_per_second": 3.434,
      "num_input_tokens_seen": 713640,
      "step": 140
    },
    {
      "epoch": 0.21515151515151515,
      "grad_norm": 0.4789717495441437,
      "learning_rate": 1.960250167393147e-06,
      "loss": 2.4217,
      "num_input_tokens_seen": 722880,
      "step": 142
    },
    {
      "epoch": 0.21818181818181817,
      "grad_norm": 0.558068037033081,
      "learning_rate": 1.959046962548316e-06,
      "loss": 2.5271,
      "num_input_tokens_seen": 733104,
      "step": 144
    },
    {
      "epoch": 0.22121212121212122,
      "grad_norm": 0.5164092183113098,
      "learning_rate": 1.9578262960663305e-06,
      "loss": 2.4228,
      "num_input_tokens_seen": 745392,
      "step": 146
    },
    {
      "epoch": 0.22424242424242424,
      "grad_norm": 0.49615126848220825,
      "learning_rate": 1.9565881960282384e-06,
      "loss": 2.1895,
      "num_input_tokens_seen": 755736,
      "step": 148
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.6630756258964539,
      "learning_rate": 1.9553326909161436e-06,
      "loss": 2.4702,
      "num_input_tokens_seen": 767040,
      "step": 150
    },
    {
      "epoch": 0.23030303030303031,
      "grad_norm": 0.5331915020942688,
      "learning_rate": 1.954059809612546e-06,
      "loss": 2.4535,
      "num_input_tokens_seen": 776496,
      "step": 152
    },
    {
      "epoch": 0.23333333333333334,
      "grad_norm": 0.44153809547424316,
      "learning_rate": 1.9527695813996817e-06,
      "loss": 2.3757,
      "num_input_tokens_seen": 785568,
      "step": 154
    },
    {
      "epoch": 0.23636363636363636,
      "grad_norm": 0.4671899378299713,
      "learning_rate": 1.9514620359588454e-06,
      "loss": 2.3609,
      "num_input_tokens_seen": 797496,
      "step": 156
    },
    {
      "epoch": 0.23939393939393938,
      "grad_norm": 0.49474212527275085,
      "learning_rate": 1.9501372033697097e-06,
      "loss": 2.4576,
      "num_input_tokens_seen": 808536,
      "step": 158
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 0.5353239178657532,
      "learning_rate": 1.948795114109632e-06,
      "loss": 2.2509,
      "num_input_tokens_seen": 818592,
      "step": 160
    },
    {
      "epoch": 0.24242424242424243,
      "eval_loss": 2.3466238975524902,
      "eval_runtime": 5.8178,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 818592,
      "step": 160
    },
    {
      "epoch": 0.24545454545454545,
      "grad_norm": 0.4847556948661804,
      "learning_rate": 1.947435799052955e-06,
      "loss": 2.4558,
      "num_input_tokens_seen": 828336,
      "step": 162
    },
    {
      "epoch": 0.24848484848484848,
      "grad_norm": 0.5099437236785889,
      "learning_rate": 1.9460592894702946e-06,
      "loss": 2.3038,
      "num_input_tokens_seen": 838080,
      "step": 164
    },
    {
      "epoch": 0.2515151515151515,
      "grad_norm": 0.47751423716545105,
      "learning_rate": 1.944665617027823e-06,
      "loss": 2.2954,
      "num_input_tokens_seen": 850128,
      "step": 166
    },
    {
      "epoch": 0.2545454545454545,
      "grad_norm": 0.4297049045562744,
      "learning_rate": 1.943254813786535e-06,
      "loss": 2.2327,
      "num_input_tokens_seen": 862416,
      "step": 168
    },
    {
      "epoch": 0.25757575757575757,
      "grad_norm": 0.5330982804298401,
      "learning_rate": 1.941826912201518e-06,
      "loss": 2.487,
      "num_input_tokens_seen": 873936,
      "step": 170
    },
    {
      "epoch": 0.2606060606060606,
      "grad_norm": 0.4737272560596466,
      "learning_rate": 1.9403819451212004e-06,
      "loss": 2.6736,
      "num_input_tokens_seen": 883584,
      "step": 172
    },
    {
      "epoch": 0.2636363636363636,
      "grad_norm": 0.6267192363739014,
      "learning_rate": 1.938919945786595e-06,
      "loss": 2.2313,
      "num_input_tokens_seen": 892632,
      "step": 174
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.42695581912994385,
      "learning_rate": 1.9374409478305385e-06,
      "loss": 2.4444,
      "num_input_tokens_seen": 904920,
      "step": 176
    },
    {
      "epoch": 0.2696969696969697,
      "grad_norm": 0.5554710030555725,
      "learning_rate": 1.935944985276914e-06,
      "loss": 2.5038,
      "num_input_tokens_seen": 913752,
      "step": 178
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 0.6374077796936035,
      "learning_rate": 1.9344320925398713e-06,
      "loss": 2.3807,
      "num_input_tokens_seen": 920952,
      "step": 180
    },
    {
      "epoch": 0.2727272727272727,
      "eval_loss": 2.3428144454956055,
      "eval_runtime": 5.8159,
      "eval_samples_per_second": 3.439,
      "eval_steps_per_second": 3.439,
      "num_input_tokens_seen": 920952,
      "step": 180
    },
    {
      "epoch": 0.27575757575757576,
      "grad_norm": 0.41562148928642273,
      "learning_rate": 1.932902304423033e-06,
      "loss": 2.5033,
      "num_input_tokens_seen": 932280,
      "step": 182
    },
    {
      "epoch": 0.2787878787878788,
      "grad_norm": 0.47822168469429016,
      "learning_rate": 1.931355656118694e-06,
      "loss": 2.275,
      "num_input_tokens_seen": 944568,
      "step": 184
    },
    {
      "epoch": 0.2818181818181818,
      "grad_norm": 0.553165853023529,
      "learning_rate": 1.9297921832070134e-06,
      "loss": 2.567,
      "num_input_tokens_seen": 952032,
      "step": 186
    },
    {
      "epoch": 0.28484848484848485,
      "grad_norm": 0.5379563570022583,
      "learning_rate": 1.928211921655195e-06,
      "loss": 2.5257,
      "num_input_tokens_seen": 963840,
      "step": 188
    },
    {
      "epoch": 0.2878787878787879,
      "grad_norm": 0.5385987758636475,
      "learning_rate": 1.9266149078166603e-06,
      "loss": 2.3678,
      "num_input_tokens_seen": 975288,
      "step": 190
    },
    {
      "epoch": 0.2909090909090909,
      "grad_norm": 0.42638707160949707,
      "learning_rate": 1.9250011784302106e-06,
      "loss": 2.232,
      "num_input_tokens_seen": 987144,
      "step": 192
    },
    {
      "epoch": 0.29393939393939394,
      "grad_norm": 0.450655996799469,
      "learning_rate": 1.923370770619184e-06,
      "loss": 2.1844,
      "num_input_tokens_seen": 998664,
      "step": 194
    },
    {
      "epoch": 0.296969696969697,
      "grad_norm": 0.477781742811203,
      "learning_rate": 1.921723721890602e-06,
      "loss": 2.3571,
      "num_input_tokens_seen": 1008504,
      "step": 196
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.7921934723854065,
      "learning_rate": 1.920060070134301e-06,
      "loss": 2.472,
      "num_input_tokens_seen": 1016664,
      "step": 198
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 0.5304360389709473,
      "learning_rate": 1.91837985362207e-06,
      "loss": 2.4112,
      "num_input_tokens_seen": 1026192,
      "step": 200
    },
    {
      "epoch": 0.30303030303030304,
      "eval_loss": 2.340877056121826,
      "eval_runtime": 5.8187,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 1026192,
      "step": 200
    },
    {
      "epoch": 0.30606060606060603,
      "grad_norm": 0.4748481810092926,
      "learning_rate": 1.9166831110067615e-06,
      "loss": 2.5731,
      "num_input_tokens_seen": 1037064,
      "step": 202
    },
    {
      "epoch": 0.3090909090909091,
      "grad_norm": 0.44187602400779724,
      "learning_rate": 1.914969881321407e-06,
      "loss": 2.5743,
      "num_input_tokens_seen": 1049352,
      "step": 204
    },
    {
      "epoch": 0.31212121212121213,
      "grad_norm": 0.6284915208816528,
      "learning_rate": 1.913240203978318e-06,
      "loss": 2.4531,
      "num_input_tokens_seen": 1057272,
      "step": 206
    },
    {
      "epoch": 0.3151515151515151,
      "grad_norm": 0.6538528800010681,
      "learning_rate": 1.9114941187681783e-06,
      "loss": 2.5391,
      "num_input_tokens_seen": 1065120,
      "step": 208
    },
    {
      "epoch": 0.3181818181818182,
      "grad_norm": 1.0042399168014526,
      "learning_rate": 1.9097316658591304e-06,
      "loss": 2.4156,
      "num_input_tokens_seen": 1074192,
      "step": 210
    },
    {
      "epoch": 0.3212121212121212,
      "grad_norm": 0.48325198888778687,
      "learning_rate": 1.9079528857958504e-06,
      "loss": 2.5733,
      "num_input_tokens_seen": 1084416,
      "step": 212
    },
    {
      "epoch": 0.3242424242424242,
      "grad_norm": 0.6697909832000732,
      "learning_rate": 1.906157819498616e-06,
      "loss": 2.5264,
      "num_input_tokens_seen": 1092888,
      "step": 214
    },
    {
      "epoch": 0.32727272727272727,
      "grad_norm": 0.6655834913253784,
      "learning_rate": 1.904346508262363e-06,
      "loss": 2.3912,
      "num_input_tokens_seen": 1100160,
      "step": 216
    },
    {
      "epoch": 0.3303030303030303,
      "grad_norm": 1.1694029569625854,
      "learning_rate": 1.9025189937557386e-06,
      "loss": 2.462,
      "num_input_tokens_seen": 1107360,
      "step": 218
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.4985570013523102,
      "learning_rate": 1.90067531802014e-06,
      "loss": 2.2447,
      "num_input_tokens_seen": 1119096,
      "step": 220
    },
    {
      "epoch": 0.3333333333333333,
      "eval_loss": 2.339911937713623,
      "eval_runtime": 5.8137,
      "eval_samples_per_second": 3.44,
      "eval_steps_per_second": 3.44,
      "num_input_tokens_seen": 1119096,
      "step": 220
    },
    {
      "epoch": 0.33636363636363636,
      "grad_norm": 0.4883664548397064,
      "learning_rate": 1.8988155234687495e-06,
      "loss": 2.4013,
      "num_input_tokens_seen": 1131384,
      "step": 222
    },
    {
      "epoch": 0.3393939393939394,
      "grad_norm": 0.7224740982055664,
      "learning_rate": 1.8969396528855567e-06,
      "loss": 2.4763,
      "num_input_tokens_seen": 1142616,
      "step": 224
    },
    {
      "epoch": 0.3424242424242424,
      "grad_norm": 0.569634199142456,
      "learning_rate": 1.8950477494243762e-06,
      "loss": 2.3552,
      "num_input_tokens_seen": 1154904,
      "step": 226
    },
    {
      "epoch": 0.34545454545454546,
      "grad_norm": 0.45122525095939636,
      "learning_rate": 1.8931398566078523e-06,
      "loss": 2.4198,
      "num_input_tokens_seen": 1164264,
      "step": 228
    },
    {
      "epoch": 0.3484848484848485,
      "grad_norm": 0.5598176121711731,
      "learning_rate": 1.8912160183264612e-06,
      "loss": 2.5283,
      "num_input_tokens_seen": 1175472,
      "step": 230
    },
    {
      "epoch": 0.3515151515151515,
      "grad_norm": 0.5492939352989197,
      "learning_rate": 1.8892762788374985e-06,
      "loss": 2.5246,
      "num_input_tokens_seen": 1185264,
      "step": 232
    },
    {
      "epoch": 0.35454545454545455,
      "grad_norm": 0.557397723197937,
      "learning_rate": 1.8873206827640624e-06,
      "loss": 2.3821,
      "num_input_tokens_seen": 1197408,
      "step": 234
    },
    {
      "epoch": 0.3575757575757576,
      "grad_norm": 0.42229530215263367,
      "learning_rate": 1.8853492750940275e-06,
      "loss": 2.3593,
      "num_input_tokens_seen": 1207656,
      "step": 236
    },
    {
      "epoch": 0.3606060606060606,
      "grad_norm": 0.4781576693058014,
      "learning_rate": 1.8833621011790078e-06,
      "loss": 2.2261,
      "num_input_tokens_seen": 1219080,
      "step": 238
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.46443861722946167,
      "learning_rate": 1.8813592067333155e-06,
      "loss": 2.4046,
      "num_input_tokens_seen": 1230048,
      "step": 240
    },
    {
      "epoch": 0.36363636363636365,
      "eval_loss": 2.339547872543335,
      "eval_runtime": 5.8158,
      "eval_samples_per_second": 3.439,
      "eval_steps_per_second": 3.439,
      "num_input_tokens_seen": 1230048,
      "step": 240
    },
    {
      "epoch": 0.36666666666666664,
      "grad_norm": 0.4926714599132538,
      "learning_rate": 1.8793406378329092e-06,
      "loss": 2.1956,
      "num_input_tokens_seen": 1239288,
      "step": 242
    },
    {
      "epoch": 0.3696969696969697,
      "grad_norm": 0.9403526186943054,
      "learning_rate": 1.877306440914333e-06,
      "loss": 2.3843,
      "num_input_tokens_seen": 1246512,
      "step": 244
    },
    {
      "epoch": 0.37272727272727274,
      "grad_norm": 0.8498961329460144,
      "learning_rate": 1.8752566627736477e-06,
      "loss": 2.2977,
      "num_input_tokens_seen": 1256256,
      "step": 246
    },
    {
      "epoch": 0.37575757575757573,
      "grad_norm": 0.5305018424987793,
      "learning_rate": 1.8731913505653569e-06,
      "loss": 2.4575,
      "num_input_tokens_seen": 1265712,
      "step": 248
    },
    {
      "epoch": 0.3787878787878788,
      "grad_norm": 0.4798325002193451,
      "learning_rate": 1.8711105518013199e-06,
      "loss": 2.3638,
      "num_input_tokens_seen": 1273848,
      "step": 250
    },
    {
      "epoch": 0.38181818181818183,
      "grad_norm": 0.5862890481948853,
      "learning_rate": 1.869014314349659e-06,
      "loss": 2.388,
      "num_input_tokens_seen": 1283664,
      "step": 252
    },
    {
      "epoch": 0.38484848484848483,
      "grad_norm": 0.5504214763641357,
      "learning_rate": 1.8669026864336591e-06,
      "loss": 2.3997,
      "num_input_tokens_seen": 1293768,
      "step": 254
    },
    {
      "epoch": 0.3878787878787879,
      "grad_norm": 0.662431538105011,
      "learning_rate": 1.8647757166306572e-06,
      "loss": 2.4629,
      "num_input_tokens_seen": 1303392,
      "step": 256
    },
    {
      "epoch": 0.39090909090909093,
      "grad_norm": 0.5133792757987976,
      "learning_rate": 1.8626334538709263e-06,
      "loss": 2.3915,
      "num_input_tokens_seen": 1313784,
      "step": 258
    },
    {
      "epoch": 0.3939393939393939,
      "grad_norm": 0.47367045283317566,
      "learning_rate": 1.8604759474365492e-06,
      "loss": 2.4396,
      "num_input_tokens_seen": 1326072,
      "step": 260
    },
    {
      "epoch": 0.3939393939393939,
      "eval_loss": 2.338432788848877,
      "eval_runtime": 5.8115,
      "eval_samples_per_second": 3.441,
      "eval_steps_per_second": 3.441,
      "num_input_tokens_seen": 1326072,
      "step": 260
    },
    {
      "epoch": 0.396969696969697,
      "grad_norm": 0.5194035768508911,
      "learning_rate": 1.858303246960284e-06,
      "loss": 2.4028,
      "num_input_tokens_seen": 1335864,
      "step": 262
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.4642770290374756,
      "learning_rate": 1.856115402424423e-06,
      "loss": 2.434,
      "num_input_tokens_seen": 1347552,
      "step": 264
    },
    {
      "epoch": 0.403030303030303,
      "grad_norm": 0.5999087691307068,
      "learning_rate": 1.8539124641596437e-06,
      "loss": 2.3149,
      "num_input_tokens_seen": 1356912,
      "step": 266
    },
    {
      "epoch": 0.40606060606060607,
      "grad_norm": 0.588898241519928,
      "learning_rate": 1.851694482843849e-06,
      "loss": 2.5401,
      "num_input_tokens_seen": 1368408,
      "step": 268
    },
    {
      "epoch": 0.4090909090909091,
      "grad_norm": 0.49462223052978516,
      "learning_rate": 1.8494615095010037e-06,
      "loss": 2.3905,
      "num_input_tokens_seen": 1380696,
      "step": 270
    },
    {
      "epoch": 0.4121212121212121,
      "grad_norm": 1.0041953325271606,
      "learning_rate": 1.8472135954999582e-06,
      "loss": 2.7022,
      "num_input_tokens_seen": 1389096,
      "step": 272
    },
    {
      "epoch": 0.41515151515151516,
      "grad_norm": 0.5517657399177551,
      "learning_rate": 1.8449507925532685e-06,
      "loss": 2.5369,
      "num_input_tokens_seen": 1400784,
      "step": 274
    },
    {
      "epoch": 0.41818181818181815,
      "grad_norm": 0.6180247068405151,
      "learning_rate": 1.8426731527160064e-06,
      "loss": 2.2525,
      "num_input_tokens_seen": 1413072,
      "step": 276
    },
    {
      "epoch": 0.4212121212121212,
      "grad_norm": 0.6159691214561462,
      "learning_rate": 1.8403807283845616e-06,
      "loss": 2.3052,
      "num_input_tokens_seen": 1422888,
      "step": 278
    },
    {
      "epoch": 0.42424242424242425,
      "grad_norm": 0.6237558722496033,
      "learning_rate": 1.8380735722954367e-06,
      "loss": 2.344,
      "num_input_tokens_seen": 1432128,
      "step": 280
    },
    {
      "epoch": 0.42424242424242425,
      "eval_loss": 2.3386666774749756,
      "eval_runtime": 5.8175,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 1432128,
      "step": 280
    },
    {
      "epoch": 0.42727272727272725,
      "grad_norm": 0.6814020276069641,
      "learning_rate": 1.835751737524033e-06,
      "loss": 2.4498,
      "num_input_tokens_seen": 1439928,
      "step": 282
    },
    {
      "epoch": 0.4303030303030303,
      "grad_norm": 0.5670037865638733,
      "learning_rate": 1.8334152774834309e-06,
      "loss": 2.3934,
      "num_input_tokens_seen": 1449624,
      "step": 284
    },
    {
      "epoch": 0.43333333333333335,
      "grad_norm": 0.6628959774971008,
      "learning_rate": 1.83106424592316e-06,
      "loss": 2.52,
      "num_input_tokens_seen": 1460520,
      "step": 286
    },
    {
      "epoch": 0.43636363636363634,
      "grad_norm": 0.6537968516349792,
      "learning_rate": 1.8286986969279643e-06,
      "loss": 2.5132,
      "num_input_tokens_seen": 1469712,
      "step": 288
    },
    {
      "epoch": 0.4393939393939394,
      "grad_norm": 0.5633306503295898,
      "learning_rate": 1.8263186849165555e-06,
      "loss": 2.403,
      "num_input_tokens_seen": 1480824,
      "step": 290
    },
    {
      "epoch": 0.44242424242424244,
      "grad_norm": 0.5708298683166504,
      "learning_rate": 1.8239242646403628e-06,
      "loss": 2.5149,
      "num_input_tokens_seen": 1488816,
      "step": 292
    },
    {
      "epoch": 0.44545454545454544,
      "grad_norm": 0.7049750685691833,
      "learning_rate": 1.8215154911822737e-06,
      "loss": 2.2043,
      "num_input_tokens_seen": 1497816,
      "step": 294
    },
    {
      "epoch": 0.4484848484848485,
      "grad_norm": 0.5039754509925842,
      "learning_rate": 1.8190924199553655e-06,
      "loss": 2.439,
      "num_input_tokens_seen": 1508928,
      "step": 296
    },
    {
      "epoch": 0.45151515151515154,
      "grad_norm": 0.5821936726570129,
      "learning_rate": 1.816655106701631e-06,
      "loss": 2.4665,
      "num_input_tokens_seen": 1519512,
      "step": 298
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.5108533501625061,
      "learning_rate": 1.8142036074906968e-06,
      "loss": 2.4901,
      "num_input_tokens_seen": 1529520,
      "step": 300
    },
    {
      "epoch": 0.45454545454545453,
      "eval_loss": 2.337289333343506,
      "eval_runtime": 5.817,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 1529520,
      "step": 300
    },
    {
      "epoch": 0.4575757575757576,
      "grad_norm": 0.4282449781894684,
      "learning_rate": 1.8117379787185333e-06,
      "loss": 2.1503,
      "num_input_tokens_seen": 1541808,
      "step": 302
    },
    {
      "epoch": 0.46060606060606063,
      "grad_norm": 0.6109529137611389,
      "learning_rate": 1.809258277106156e-06,
      "loss": 2.4026,
      "num_input_tokens_seen": 1550952,
      "step": 304
    },
    {
      "epoch": 0.4636363636363636,
      "grad_norm": 0.5644070506095886,
      "learning_rate": 1.8067645596983226e-06,
      "loss": 2.4195,
      "num_input_tokens_seen": 1562064,
      "step": 306
    },
    {
      "epoch": 0.4666666666666667,
      "grad_norm": 0.665733814239502,
      "learning_rate": 1.804256883862219e-06,
      "loss": 2.6243,
      "num_input_tokens_seen": 1569240,
      "step": 308
    },
    {
      "epoch": 0.4696969696969697,
      "grad_norm": 0.6493149995803833,
      "learning_rate": 1.8017353072861416e-06,
      "loss": 2.3603,
      "num_input_tokens_seen": 1579560,
      "step": 310
    },
    {
      "epoch": 0.4727272727272727,
      "grad_norm": 0.5297104120254517,
      "learning_rate": 1.7991998879781676e-06,
      "loss": 2.2741,
      "num_input_tokens_seen": 1591248,
      "step": 312
    },
    {
      "epoch": 0.47575757575757577,
      "grad_norm": 0.4405084252357483,
      "learning_rate": 1.796650684264823e-06,
      "loss": 2.5167,
      "num_input_tokens_seen": 1602840,
      "step": 314
    },
    {
      "epoch": 0.47878787878787876,
      "grad_norm": 0.6081413626670837,
      "learning_rate": 1.7940877547897383e-06,
      "loss": 2.404,
      "num_input_tokens_seen": 1610520,
      "step": 316
    },
    {
      "epoch": 0.4818181818181818,
      "grad_norm": 0.7665295600891113,
      "learning_rate": 1.7915111585123026e-06,
      "loss": 2.3861,
      "num_input_tokens_seen": 1617936,
      "step": 318
    },
    {
      "epoch": 0.48484848484848486,
      "grad_norm": 0.5678819417953491,
      "learning_rate": 1.7889209547063038e-06,
      "loss": 2.3335,
      "num_input_tokens_seen": 1628424,
      "step": 320
    },
    {
      "epoch": 0.48484848484848486,
      "eval_loss": 2.336883068084717,
      "eval_runtime": 5.8244,
      "eval_samples_per_second": 3.434,
      "eval_steps_per_second": 3.434,
      "num_input_tokens_seen": 1628424,
      "step": 320
    },
    {
      "epoch": 0.48787878787878786,
      "grad_norm": 0.6124878525733948,
      "learning_rate": 1.7863172029585684e-06,
      "loss": 2.6274,
      "num_input_tokens_seen": 1636704,
      "step": 322
    },
    {
      "epoch": 0.4909090909090909,
      "grad_norm": 0.5369870066642761,
      "learning_rate": 1.7836999631675877e-06,
      "loss": 2.2444,
      "num_input_tokens_seen": 1646760,
      "step": 324
    },
    {
      "epoch": 0.49393939393939396,
      "grad_norm": 0.47992056608200073,
      "learning_rate": 1.7810692955421418e-06,
      "loss": 2.3407,
      "num_input_tokens_seen": 1657824,
      "step": 326
    },
    {
      "epoch": 0.49696969696969695,
      "grad_norm": 0.5946272611618042,
      "learning_rate": 1.778425260599914e-06,
      "loss": 2.5075,
      "num_input_tokens_seen": 1669800,
      "step": 328
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5064172744750977,
      "learning_rate": 1.7757679191660974e-06,
      "loss": 2.4304,
      "num_input_tokens_seen": 1678896,
      "step": 330
    },
    {
      "epoch": 0.503030303030303,
      "grad_norm": 0.676836371421814,
      "learning_rate": 1.7730973323719996e-06,
      "loss": 2.3898,
      "num_input_tokens_seen": 1686696,
      "step": 332
    },
    {
      "epoch": 0.5060606060606061,
      "grad_norm": 0.45694637298583984,
      "learning_rate": 1.7704135616536297e-06,
      "loss": 2.1912,
      "num_input_tokens_seen": 1695648,
      "step": 334
    },
    {
      "epoch": 0.509090909090909,
      "grad_norm": 0.5608468651771545,
      "learning_rate": 1.767716668750292e-06,
      "loss": 2.4971,
      "num_input_tokens_seen": 1703112,
      "step": 336
    },
    {
      "epoch": 0.5121212121212121,
      "grad_norm": 0.5195941925048828,
      "learning_rate": 1.7650067157031607e-06,
      "loss": 2.3934,
      "num_input_tokens_seen": 1715400,
      "step": 338
    },
    {
      "epoch": 0.5151515151515151,
      "grad_norm": 0.3820761442184448,
      "learning_rate": 1.7622837648538558e-06,
      "loss": 2.1842,
      "num_input_tokens_seen": 1725816,
      "step": 340
    },
    {
      "epoch": 0.5151515151515151,
      "eval_loss": 2.3365180492401123,
      "eval_runtime": 5.8166,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 1725816,
      "step": 340
    },
    {
      "epoch": 0.5181818181818182,
      "grad_norm": 0.5152050852775574,
      "learning_rate": 1.7595478788430067e-06,
      "loss": 2.2292,
      "num_input_tokens_seen": 1737240,
      "step": 342
    },
    {
      "epoch": 0.5212121212121212,
      "grad_norm": 0.6499360203742981,
      "learning_rate": 1.7567991206088122e-06,
      "loss": 2.3013,
      "num_input_tokens_seen": 1743792,
      "step": 344
    },
    {
      "epoch": 0.5242424242424243,
      "grad_norm": 0.6490241885185242,
      "learning_rate": 1.7540375533855931e-06,
      "loss": 2.5828,
      "num_input_tokens_seen": 1755192,
      "step": 346
    },
    {
      "epoch": 0.5272727272727272,
      "grad_norm": 0.5575884580612183,
      "learning_rate": 1.751263240702337e-06,
      "loss": 2.2834,
      "num_input_tokens_seen": 1765656,
      "step": 348
    },
    {
      "epoch": 0.5303030303030303,
      "grad_norm": 0.6133118867874146,
      "learning_rate": 1.7484762463812359e-06,
      "loss": 2.5502,
      "num_input_tokens_seen": 1773504,
      "step": 350
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.477857768535614,
      "learning_rate": 1.7456766345362195e-06,
      "loss": 2.2939,
      "num_input_tokens_seen": 1785792,
      "step": 352
    },
    {
      "epoch": 0.5363636363636364,
      "grad_norm": 1.5005486011505127,
      "learning_rate": 1.7428644695714798e-06,
      "loss": 2.3919,
      "num_input_tokens_seen": 1792848,
      "step": 354
    },
    {
      "epoch": 0.5393939393939394,
      "grad_norm": 0.6583260893821716,
      "learning_rate": 1.7400398161799901e-06,
      "loss": 2.4862,
      "num_input_tokens_seen": 1802256,
      "step": 356
    },
    {
      "epoch": 0.5424242424242425,
      "grad_norm": 0.5908564925193787,
      "learning_rate": 1.7372027393420136e-06,
      "loss": 2.4536,
      "num_input_tokens_seen": 1812840,
      "step": 358
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 0.6152108311653137,
      "learning_rate": 1.7343533043236135e-06,
      "loss": 2.2118,
      "num_input_tokens_seen": 1822440,
      "step": 360
    },
    {
      "epoch": 0.5454545454545454,
      "eval_loss": 2.335080623626709,
      "eval_runtime": 5.8256,
      "eval_samples_per_second": 3.433,
      "eval_steps_per_second": 3.433,
      "num_input_tokens_seen": 1822440,
      "step": 360
    },
    {
      "epoch": 0.5484848484848485,
      "grad_norm": 0.5109455585479736,
      "learning_rate": 1.7314915766751482e-06,
      "loss": 2.3442,
      "num_input_tokens_seen": 1833168,
      "step": 362
    },
    {
      "epoch": 0.5515151515151515,
      "grad_norm": 0.4524301290512085,
      "learning_rate": 1.7286176222297643e-06,
      "loss": 2.3881,
      "num_input_tokens_seen": 1845072,
      "step": 364
    },
    {
      "epoch": 0.5545454545454546,
      "grad_norm": 0.4554661810398102,
      "learning_rate": 1.7257315071018814e-06,
      "loss": 2.2764,
      "num_input_tokens_seen": 1857168,
      "step": 366
    },
    {
      "epoch": 0.5575757575757576,
      "grad_norm": 0.42852118611335754,
      "learning_rate": 1.7228332976856717e-06,
      "loss": 2.364,
      "num_input_tokens_seen": 1869456,
      "step": 368
    },
    {
      "epoch": 0.5606060606060606,
      "grad_norm": 0.7273756861686707,
      "learning_rate": 1.7199230606535347e-06,
      "loss": 2.4654,
      "num_input_tokens_seen": 1878168,
      "step": 370
    },
    {
      "epoch": 0.5636363636363636,
      "grad_norm": 0.7303619384765625,
      "learning_rate": 1.717000862954559e-06,
      "loss": 2.4599,
      "num_input_tokens_seen": 1888608,
      "step": 372
    },
    {
      "epoch": 0.5666666666666667,
      "grad_norm": 0.6044741868972778,
      "learning_rate": 1.7140667718129853e-06,
      "loss": 2.2146,
      "num_input_tokens_seen": 1897008,
      "step": 374
    },
    {
      "epoch": 0.5696969696969697,
      "grad_norm": 0.5754801630973816,
      "learning_rate": 1.7111208547266607e-06,
      "loss": 2.4951,
      "num_input_tokens_seen": 1906776,
      "step": 376
    },
    {
      "epoch": 0.5727272727272728,
      "grad_norm": 0.47109347581863403,
      "learning_rate": 1.7081631794654818e-06,
      "loss": 2.1497,
      "num_input_tokens_seen": 1919064,
      "step": 378
    },
    {
      "epoch": 0.5757575757575758,
      "grad_norm": 0.6136711835861206,
      "learning_rate": 1.7051938140698408e-06,
      "loss": 2.3233,
      "num_input_tokens_seen": 1928688,
      "step": 380
    },
    {
      "epoch": 0.5757575757575758,
      "eval_loss": 2.334742546081543,
      "eval_runtime": 5.8193,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 1928688,
      "step": 380
    },
    {
      "epoch": 0.5787878787878787,
      "grad_norm": 0.6149052977561951,
      "learning_rate": 1.702212826849056e-06,
      "loss": 2.319,
      "num_input_tokens_seen": 1940784,
      "step": 382
    },
    {
      "epoch": 0.5818181818181818,
      "grad_norm": 0.5667149424552917,
      "learning_rate": 1.6992202863798037e-06,
      "loss": 2.5949,
      "num_input_tokens_seen": 1950840,
      "step": 384
    },
    {
      "epoch": 0.5848484848484848,
      "grad_norm": 0.5343450307846069,
      "learning_rate": 1.6962162615045377e-06,
      "loss": 2.3292,
      "num_input_tokens_seen": 1963128,
      "step": 386
    },
    {
      "epoch": 0.5878787878787879,
      "grad_norm": 0.5003802180290222,
      "learning_rate": 1.6932008213299071e-06,
      "loss": 2.5239,
      "num_input_tokens_seen": 1975008,
      "step": 388
    },
    {
      "epoch": 0.5909090909090909,
      "grad_norm": 0.5460373759269714,
      "learning_rate": 1.6901740352251675e-06,
      "loss": 2.4818,
      "num_input_tokens_seen": 1983648,
      "step": 390
    },
    {
      "epoch": 0.593939393939394,
      "grad_norm": 0.5535560250282288,
      "learning_rate": 1.6871359728205828e-06,
      "loss": 2.1795,
      "num_input_tokens_seen": 1993536,
      "step": 392
    },
    {
      "epoch": 0.5969696969696969,
      "grad_norm": 0.4466463029384613,
      "learning_rate": 1.6840867040058254e-06,
      "loss": 2.3585,
      "num_input_tokens_seen": 2002872,
      "step": 394
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5831019878387451,
      "learning_rate": 1.6810262989283674e-06,
      "loss": 2.3718,
      "num_input_tokens_seen": 2012400,
      "step": 396
    },
    {
      "epoch": 0.603030303030303,
      "grad_norm": 0.5981975197792053,
      "learning_rate": 1.6779548279918671e-06,
      "loss": 2.314,
      "num_input_tokens_seen": 2022936,
      "step": 398
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.5155858397483826,
      "learning_rate": 1.6748723618545496e-06,
      "loss": 2.6427,
      "num_input_tokens_seen": 2031480,
      "step": 400
    },
    {
      "epoch": 0.6060606060606061,
      "eval_loss": 2.334027051925659,
      "eval_runtime": 5.8193,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 2031480,
      "step": 400
    },
    {
      "epoch": 0.6090909090909091,
      "grad_norm": 0.46695375442504883,
      "learning_rate": 1.6717789714275808e-06,
      "loss": 2.2379,
      "num_input_tokens_seen": 2043768,
      "step": 402
    },
    {
      "epoch": 0.6121212121212121,
      "grad_norm": 0.8030733466148376,
      "learning_rate": 1.6686747278734364e-06,
      "loss": 2.3286,
      "num_input_tokens_seen": 2052456,
      "step": 404
    },
    {
      "epoch": 0.6151515151515151,
      "grad_norm": 0.5807926654815674,
      "learning_rate": 1.6655597026042654e-06,
      "loss": 2.3891,
      "num_input_tokens_seen": 2062608,
      "step": 406
    },
    {
      "epoch": 0.6181818181818182,
      "grad_norm": 0.5125523209571838,
      "learning_rate": 1.6624339672802466e-06,
      "loss": 2.2766,
      "num_input_tokens_seen": 2070624,
      "step": 408
    },
    {
      "epoch": 0.6212121212121212,
      "grad_norm": 0.4872816205024719,
      "learning_rate": 1.65929759380794e-06,
      "loss": 2.3172,
      "num_input_tokens_seen": 2082024,
      "step": 410
    },
    {
      "epoch": 0.6242424242424243,
      "grad_norm": 0.5617727637290955,
      "learning_rate": 1.6561506543386332e-06,
      "loss": 2.2975,
      "num_input_tokens_seen": 2093928,
      "step": 412
    },
    {
      "epoch": 0.6272727272727273,
      "grad_norm": 0.7218233942985535,
      "learning_rate": 1.6529932212666813e-06,
      "loss": 2.5706,
      "num_input_tokens_seen": 2102712,
      "step": 414
    },
    {
      "epoch": 0.6303030303030303,
      "grad_norm": 0.5542349219322205,
      "learning_rate": 1.6498253672278403e-06,
      "loss": 2.4111,
      "num_input_tokens_seen": 2111352,
      "step": 416
    },
    {
      "epoch": 0.6333333333333333,
      "grad_norm": 0.5303030610084534,
      "learning_rate": 1.6466471650975989e-06,
      "loss": 2.3655,
      "num_input_tokens_seen": 2123184,
      "step": 418
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 0.4791901111602783,
      "learning_rate": 1.6434586879894994e-06,
      "loss": 2.1955,
      "num_input_tokens_seen": 2132520,
      "step": 420
    },
    {
      "epoch": 0.6363636363636364,
      "eval_loss": 2.3337419033050537,
      "eval_runtime": 5.8194,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 2132520,
      "step": 420
    },
    {
      "epoch": 0.6393939393939394,
      "grad_norm": 1.0299837589263916,
      "learning_rate": 1.6402600092534571e-06,
      "loss": 2.4297,
      "num_input_tokens_seen": 2140344,
      "step": 422
    },
    {
      "epoch": 0.6424242424242425,
      "grad_norm": 0.5022935271263123,
      "learning_rate": 1.637051202474072e-06,
      "loss": 2.3299,
      "num_input_tokens_seen": 2150592,
      "step": 424
    },
    {
      "epoch": 0.6454545454545455,
      "grad_norm": 0.7252947688102722,
      "learning_rate": 1.6338323414689384e-06,
      "loss": 2.4036,
      "num_input_tokens_seen": 2158848,
      "step": 426
    },
    {
      "epoch": 0.6484848484848484,
      "grad_norm": 0.49614864587783813,
      "learning_rate": 1.6306035002869418e-06,
      "loss": 2.3709,
      "num_input_tokens_seen": 2166120,
      "step": 428
    },
    {
      "epoch": 0.6515151515151515,
      "grad_norm": 0.5736730098724365,
      "learning_rate": 1.6273647532065615e-06,
      "loss": 2.6169,
      "num_input_tokens_seen": 2177760,
      "step": 430
    },
    {
      "epoch": 0.6545454545454545,
      "grad_norm": 0.8251070380210876,
      "learning_rate": 1.6241161747341568e-06,
      "loss": 2.4805,
      "num_input_tokens_seen": 2185488,
      "step": 432
    },
    {
      "epoch": 0.6575757575757576,
      "grad_norm": 1.2293510437011719,
      "learning_rate": 1.6208578396022566e-06,
      "loss": 2.1922,
      "num_input_tokens_seen": 2196336,
      "step": 434
    },
    {
      "epoch": 0.6606060606060606,
      "grad_norm": 0.6561338305473328,
      "learning_rate": 1.6175898227678376e-06,
      "loss": 2.4529,
      "num_input_tokens_seen": 2204520,
      "step": 436
    },
    {
      "epoch": 0.6636363636363637,
      "grad_norm": 0.4846937954425812,
      "learning_rate": 1.6143121994106012e-06,
      "loss": 2.3597,
      "num_input_tokens_seen": 2216808,
      "step": 438
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.6437355279922485,
      "learning_rate": 1.611025044931245e-06,
      "loss": 2.4364,
      "num_input_tokens_seen": 2227752,
      "step": 440
    },
    {
      "epoch": 0.6666666666666666,
      "eval_loss": 2.3327877521514893,
      "eval_runtime": 5.8187,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 2227752,
      "step": 440
    },
    {
      "epoch": 0.6696969696969697,
      "grad_norm": 0.5672312378883362,
      "learning_rate": 1.6077284349497254e-06,
      "loss": 2.5148,
      "num_input_tokens_seen": 2237808,
      "step": 442
    },
    {
      "epoch": 0.6727272727272727,
      "grad_norm": 0.5006369948387146,
      "learning_rate": 1.6044224453035203e-06,
      "loss": 2.1969,
      "num_input_tokens_seen": 2249304,
      "step": 444
    },
    {
      "epoch": 0.6757575757575758,
      "grad_norm": 0.6202157735824585,
      "learning_rate": 1.6011071520458845e-06,
      "loss": 2.5604,
      "num_input_tokens_seen": 2260176,
      "step": 446
    },
    {
      "epoch": 0.6787878787878788,
      "grad_norm": 0.555921733379364,
      "learning_rate": 1.5977826314440987e-06,
      "loss": 2.2211,
      "num_input_tokens_seen": 2270184,
      "step": 448
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.5153559446334839,
      "learning_rate": 1.5944489599777161e-06,
      "loss": 2.3477,
      "num_input_tokens_seen": 2281464,
      "step": 450
    },
    {
      "epoch": 0.6848484848484848,
      "grad_norm": 0.5477102994918823,
      "learning_rate": 1.5911062143368027e-06,
      "loss": 2.4645,
      "num_input_tokens_seen": 2292720,
      "step": 452
    },
    {
      "epoch": 0.6878787878787879,
      "grad_norm": 0.5461196303367615,
      "learning_rate": 1.5877544714201726e-06,
      "loss": 2.5217,
      "num_input_tokens_seen": 2303376,
      "step": 454
    },
    {
      "epoch": 0.6909090909090909,
      "grad_norm": 0.5640104413032532,
      "learning_rate": 1.5843938083336194e-06,
      "loss": 2.5123,
      "num_input_tokens_seen": 2312544,
      "step": 456
    },
    {
      "epoch": 0.693939393939394,
      "grad_norm": 0.4936680197715759,
      "learning_rate": 1.5810243023881432e-06,
      "loss": 2.2975,
      "num_input_tokens_seen": 2323344,
      "step": 458
    },
    {
      "epoch": 0.696969696969697,
      "grad_norm": 0.4782181680202484,
      "learning_rate": 1.5776460310981702e-06,
      "loss": 2.3568,
      "num_input_tokens_seen": 2332056,
      "step": 460
    },
    {
      "epoch": 0.696969696969697,
      "eval_loss": 2.332925319671631,
      "eval_runtime": 5.8201,
      "eval_samples_per_second": 3.436,
      "eval_steps_per_second": 3.436,
      "num_input_tokens_seen": 2332056,
      "step": 460
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.5433066487312317,
      "learning_rate": 1.5742590721797725e-06,
      "loss": 2.5328,
      "num_input_tokens_seen": 2342400,
      "step": 462
    },
    {
      "epoch": 0.703030303030303,
      "grad_norm": 1.0040984153747559,
      "learning_rate": 1.5708635035488756e-06,
      "loss": 2.5166,
      "num_input_tokens_seen": 2350536,
      "step": 464
    },
    {
      "epoch": 0.706060606060606,
      "grad_norm": 0.5495861172676086,
      "learning_rate": 1.5674594033194706e-06,
      "loss": 2.3471,
      "num_input_tokens_seen": 2361528,
      "step": 466
    },
    {
      "epoch": 0.7090909090909091,
      "grad_norm": 0.6494752764701843,
      "learning_rate": 1.5640468498018153e-06,
      "loss": 2.4315,
      "num_input_tokens_seen": 2370552,
      "step": 468
    },
    {
      "epoch": 0.7121212121212122,
      "grad_norm": 0.5859867930412292,
      "learning_rate": 1.5606259215006325e-06,
      "loss": 2.5083,
      "num_input_tokens_seen": 2380368,
      "step": 470
    },
    {
      "epoch": 0.7151515151515152,
      "grad_norm": 0.606728196144104,
      "learning_rate": 1.5571966971133037e-06,
      "loss": 2.3308,
      "num_input_tokens_seen": 2389176,
      "step": 472
    },
    {
      "epoch": 0.7181818181818181,
      "grad_norm": 0.453156441450119,
      "learning_rate": 1.5537592555280594e-06,
      "loss": 2.3236,
      "num_input_tokens_seen": 2398944,
      "step": 474
    },
    {
      "epoch": 0.7212121212121212,
      "grad_norm": 0.8148333430290222,
      "learning_rate": 1.5503136758221653e-06,
      "loss": 2.8391,
      "num_input_tokens_seen": 2404656,
      "step": 476
    },
    {
      "epoch": 0.7242424242424242,
      "grad_norm": 0.4754016399383545,
      "learning_rate": 1.5468600372601009e-06,
      "loss": 2.6875,
      "num_input_tokens_seen": 2416392,
      "step": 478
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.7027032375335693,
      "learning_rate": 1.543398419291737e-06,
      "loss": 2.4508,
      "num_input_tokens_seen": 2425032,
      "step": 480
    },
    {
      "epoch": 0.7272727272727273,
      "eval_loss": 2.332369089126587,
      "eval_runtime": 5.8166,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 2425032,
      "step": 480
    },
    {
      "epoch": 0.7303030303030303,
      "grad_norm": 0.8816015124320984,
      "learning_rate": 1.5399289015505096e-06,
      "loss": 2.4884,
      "num_input_tokens_seen": 2432280,
      "step": 482
    },
    {
      "epoch": 0.7333333333333333,
      "grad_norm": 0.5385542511940002,
      "learning_rate": 1.536451563851584e-06,
      "loss": 2.3159,
      "num_input_tokens_seen": 2442576,
      "step": 484
    },
    {
      "epoch": 0.7363636363636363,
      "grad_norm": 0.5668327808380127,
      "learning_rate": 1.5329664861900237e-06,
      "loss": 2.5522,
      "num_input_tokens_seen": 2450664,
      "step": 486
    },
    {
      "epoch": 0.7393939393939394,
      "grad_norm": 0.5444993376731873,
      "learning_rate": 1.5294737487389462e-06,
      "loss": 2.4853,
      "num_input_tokens_seen": 2462568,
      "step": 488
    },
    {
      "epoch": 0.7424242424242424,
      "grad_norm": 0.5722953081130981,
      "learning_rate": 1.5259734318476807e-06,
      "loss": 2.5841,
      "num_input_tokens_seen": 2472312,
      "step": 490
    },
    {
      "epoch": 0.7454545454545455,
      "grad_norm": 0.5933071970939636,
      "learning_rate": 1.5224656160399186e-06,
      "loss": 2.4222,
      "num_input_tokens_seen": 2483016,
      "step": 492
    },
    {
      "epoch": 0.7484848484848485,
      "grad_norm": 0.6787658929824829,
      "learning_rate": 1.518950382011861e-06,
      "loss": 2.261,
      "num_input_tokens_seen": 2492688,
      "step": 494
    },
    {
      "epoch": 0.7515151515151515,
      "grad_norm": 0.5823308825492859,
      "learning_rate": 1.5154278106303649e-06,
      "loss": 2.3332,
      "num_input_tokens_seen": 2504472,
      "step": 496
    },
    {
      "epoch": 0.7545454545454545,
      "grad_norm": 0.5042080879211426,
      "learning_rate": 1.511897982931078e-06,
      "loss": 2.3521,
      "num_input_tokens_seen": 2516160,
      "step": 498
    },
    {
      "epoch": 0.7575757575757576,
      "grad_norm": 0.6808260679244995,
      "learning_rate": 1.50836098011658e-06,
      "loss": 2.3093,
      "num_input_tokens_seen": 2527320,
      "step": 500
    },
    {
      "epoch": 0.7575757575757576,
      "eval_loss": 2.3320088386535645,
      "eval_runtime": 5.8161,
      "eval_samples_per_second": 3.439,
      "eval_steps_per_second": 3.439,
      "num_input_tokens_seen": 2527320,
      "step": 500
    },
    {
      "epoch": 0.7606060606060606,
      "grad_norm": 0.5960633158683777,
      "learning_rate": 1.5048168835545094e-06,
      "loss": 2.4031,
      "num_input_tokens_seen": 2535744,
      "step": 502
    },
    {
      "epoch": 0.7636363636363637,
      "grad_norm": 0.4656361937522888,
      "learning_rate": 1.5012657747756961e-06,
      "loss": 2.2842,
      "num_input_tokens_seen": 2546376,
      "step": 504
    },
    {
      "epoch": 0.7666666666666667,
      "grad_norm": 0.7001519203186035,
      "learning_rate": 1.4977077354722828e-06,
      "loss": 2.4888,
      "num_input_tokens_seen": 2553456,
      "step": 506
    },
    {
      "epoch": 0.7696969696969697,
      "grad_norm": 0.5070295333862305,
      "learning_rate": 1.4941428474958469e-06,
      "loss": 2.3082,
      "num_input_tokens_seen": 2563632,
      "step": 508
    },
    {
      "epoch": 0.7727272727272727,
      "grad_norm": 0.559223473072052,
      "learning_rate": 1.4905711928555178e-06,
      "loss": 2.4127,
      "num_input_tokens_seen": 2573184,
      "step": 510
    },
    {
      "epoch": 0.7757575757575758,
      "grad_norm": 0.45378220081329346,
      "learning_rate": 1.4869928537160892e-06,
      "loss": 2.1886,
      "num_input_tokens_seen": 2585472,
      "step": 512
    },
    {
      "epoch": 0.7787878787878788,
      "grad_norm": 0.5591022968292236,
      "learning_rate": 1.4834079123961308e-06,
      "loss": 2.2753,
      "num_input_tokens_seen": 2594304,
      "step": 514
    },
    {
      "epoch": 0.7818181818181819,
      "grad_norm": 0.6257476806640625,
      "learning_rate": 1.479816451366092e-06,
      "loss": 2.4605,
      "num_input_tokens_seen": 2601600,
      "step": 516
    },
    {
      "epoch": 0.7848484848484848,
      "grad_norm": 0.5094606280326843,
      "learning_rate": 1.4762185532464057e-06,
      "loss": 2.4019,
      "num_input_tokens_seen": 2612280,
      "step": 518
    },
    {
      "epoch": 0.7878787878787878,
      "grad_norm": 0.4572422206401825,
      "learning_rate": 1.472614300805591e-06,
      "loss": 2.5201,
      "num_input_tokens_seen": 2624280,
      "step": 520
    },
    {
      "epoch": 0.7878787878787878,
      "eval_loss": 2.3315682411193848,
      "eval_runtime": 5.8196,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 2624280,
      "step": 520
    },
    {
      "epoch": 0.7909090909090909,
      "grad_norm": 0.5242352485656738,
      "learning_rate": 1.4690037769583428e-06,
      "loss": 2.429,
      "num_input_tokens_seen": 2634072,
      "step": 522
    },
    {
      "epoch": 0.793939393939394,
      "grad_norm": 0.48639097809791565,
      "learning_rate": 1.4653870647636297e-06,
      "loss": 2.4341,
      "num_input_tokens_seen": 2643864,
      "step": 524
    },
    {
      "epoch": 0.796969696969697,
      "grad_norm": 0.48426756262779236,
      "learning_rate": 1.4617642474227797e-06,
      "loss": 2.2926,
      "num_input_tokens_seen": 2656152,
      "step": 526
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5517458319664001,
      "learning_rate": 1.45813540827757e-06,
      "loss": 2.6445,
      "num_input_tokens_seen": 2665968,
      "step": 528
    },
    {
      "epoch": 0.803030303030303,
      "grad_norm": 0.540124237537384,
      "learning_rate": 1.4545006308083055e-06,
      "loss": 2.2952,
      "num_input_tokens_seen": 2677680,
      "step": 530
    },
    {
      "epoch": 0.806060606060606,
      "grad_norm": 0.5651832222938538,
      "learning_rate": 1.4508599986319015e-06,
      "loss": 2.4097,
      "num_input_tokens_seen": 2687376,
      "step": 532
    },
    {
      "epoch": 0.8090909090909091,
      "grad_norm": 0.4706498980522156,
      "learning_rate": 1.4472135954999578e-06,
      "loss": 2.2751,
      "num_input_tokens_seen": 2699112,
      "step": 534
    },
    {
      "epoch": 0.8121212121212121,
      "grad_norm": 0.5661342144012451,
      "learning_rate": 1.4435615052968358e-06,
      "loss": 2.4527,
      "num_input_tokens_seen": 2710008,
      "step": 536
    },
    {
      "epoch": 0.8151515151515152,
      "grad_norm": 0.49977409839630127,
      "learning_rate": 1.4399038120377224e-06,
      "loss": 2.3689,
      "num_input_tokens_seen": 2720136,
      "step": 538
    },
    {
      "epoch": 0.8181818181818182,
|
"grad_norm": 0.5473623871803284, |
|
"learning_rate": 1.4362405998667043e-06, |
|
"loss": 2.4758, |
|
"num_input_tokens_seen": 2729160, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"eval_loss": 2.3316752910614014, |
|
"eval_runtime": 5.8161, |
|
"eval_samples_per_second": 3.439, |
|
"eval_steps_per_second": 3.439, |
|
"num_input_tokens_seen": 2729160, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8212121212121212, |
|
"grad_norm": 0.5338855385780334, |
|
"learning_rate": 1.432571953054828e-06, |
|
"loss": 2.3434, |
|
"num_input_tokens_seen": 2739168, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.8242424242424242, |
|
"grad_norm": 0.5923134684562683, |
|
"learning_rate": 1.4288979559981615e-06, |
|
"loss": 2.364, |
|
"num_input_tokens_seen": 2747688, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.8272727272727273, |
|
"grad_norm": 0.48334839940071106, |
|
"learning_rate": 1.4252186932158546e-06, |
|
"loss": 2.4677, |
|
"num_input_tokens_seen": 2758488, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.8303030303030303, |
|
"grad_norm": 0.5619869828224182, |
|
"learning_rate": 1.421534249348192e-06, |
|
"loss": 2.5121, |
|
"num_input_tokens_seen": 2768832, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.6507293581962585, |
|
"learning_rate": 1.4178447091546497e-06, |
|
"loss": 2.491, |
|
"num_input_tokens_seen": 2779584, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8363636363636363, |
|
"grad_norm": 0.8891876935958862, |
|
"learning_rate": 1.414150157511941e-06, |
|
"loss": 2.3513, |
|
"num_input_tokens_seen": 2786232, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.8393939393939394, |
|
"grad_norm": 0.5667576193809509, |
|
"learning_rate": 1.410450679412067e-06, |
|
"loss": 2.4317, |
|
"num_input_tokens_seen": 2796216, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.8424242424242424, |
|
"grad_norm": 0.4579615592956543, |
|
"learning_rate": 1.406746359960361e-06, |
|
"loss": 2.3216, |
|
"num_input_tokens_seen": 2807352, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.8454545454545455, |
|
"grad_norm": 0.4524303376674652, |
|
"learning_rate": 1.403037284373529e-06, |
|
"loss": 2.2947, |
|
"num_input_tokens_seen": 2817936, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.8484848484848485, |
|
"grad_norm": 0.8141398429870605, |
|
"learning_rate": 1.3993235379776908e-06, |
|
"loss": 2.5013, |
|
"num_input_tokens_seen": 2827104, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8484848484848485, |
|
"eval_loss": 2.33099102973938, |
|
"eval_runtime": 5.8178, |
|
"eval_samples_per_second": 3.438, |
|
"eval_steps_per_second": 3.438, |
|
"num_input_tokens_seen": 2827104, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8515151515151516, |
|
"grad_norm": 0.6005460023880005, |
|
"learning_rate": 1.395605206206417e-06, |
|
"loss": 2.3728, |
|
"num_input_tokens_seen": 2834520, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.8545454545454545, |
|
"grad_norm": 0.6270483136177063, |
|
"learning_rate": 1.3918823745987625e-06, |
|
"loss": 2.5102, |
|
"num_input_tokens_seen": 2845560, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.8575757575757575, |
|
"grad_norm": 0.5506067872047424, |
|
"learning_rate": 1.3881551287973006e-06, |
|
"loss": 2.4606, |
|
"num_input_tokens_seen": 2856168, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.8606060606060606, |
|
"grad_norm": 0.5318931937217712, |
|
"learning_rate": 1.384423554546151e-06, |
|
"loss": 2.6367, |
|
"num_input_tokens_seen": 2866872, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.8636363636363636, |
|
"grad_norm": 0.5173328518867493, |
|
"learning_rate": 1.3806877376890084e-06, |
|
"loss": 2.4952, |
|
"num_input_tokens_seen": 2878296, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 0.6837607622146606, |
|
"learning_rate": 1.3769477641671668e-06, |
|
"loss": 2.4297, |
|
"num_input_tokens_seen": 2887056, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.8696969696969697, |
|
"grad_norm": 0.5360056757926941, |
|
"learning_rate": 1.373203720017544e-06, |
|
"loss": 2.3496, |
|
"num_input_tokens_seen": 2896152, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.8727272727272727, |
|
"grad_norm": 0.5022287368774414, |
|
"learning_rate": 1.3694556913706996e-06, |
|
"loss": 2.4491, |
|
"num_input_tokens_seen": 2905776, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.8757575757575757, |
|
"grad_norm": 0.691007137298584, |
|
"learning_rate": 1.3657037644488574e-06, |
|
"loss": 2.1934, |
|
"num_input_tokens_seen": 2915568, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.8787878787878788, |
|
"grad_norm": 0.5107728838920593, |
|
"learning_rate": 1.361948025563918e-06, |
|
"loss": 2.3654, |
|
"num_input_tokens_seen": 2926128, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8787878787878788, |
|
"eval_loss": 2.33089542388916, |
|
"eval_runtime": 5.8222, |
|
"eval_samples_per_second": 3.435, |
|
"eval_steps_per_second": 3.435, |
|
"num_input_tokens_seen": 2926128, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8818181818181818, |
|
"grad_norm": 0.5568860769271851, |
|
"learning_rate": 1.3581885611154759e-06, |
|
"loss": 2.4307, |
|
"num_input_tokens_seen": 2933568, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.8848484848484849, |
|
"grad_norm": 0.6976082921028137, |
|
"learning_rate": 1.3544254575888313e-06, |
|
"loss": 2.6203, |
|
"num_input_tokens_seen": 2942616, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.8878787878787879, |
|
"grad_norm": 0.5394561290740967, |
|
"learning_rate": 1.3506588015529994e-06, |
|
"loss": 2.4422, |
|
"num_input_tokens_seen": 2952480, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.8909090909090909, |
|
"grad_norm": 0.5144073963165283, |
|
"learning_rate": 1.3468886796587202e-06, |
|
"loss": 2.2622, |
|
"num_input_tokens_seen": 2962344, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.8939393939393939, |
|
"grad_norm": 0.5705990195274353, |
|
"learning_rate": 1.3431151786364647e-06, |
|
"loss": 2.3397, |
|
"num_input_tokens_seen": 2969832, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.896969696969697, |
|
"grad_norm": 0.7521764636039734, |
|
"learning_rate": 1.33933838529444e-06, |
|
"loss": 2.4768, |
|
"num_input_tokens_seen": 2979312, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.4214877784252167, |
|
"learning_rate": 1.3355583865165912e-06, |
|
"loss": 2.3752, |
|
"num_input_tokens_seen": 2990568, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.9030303030303031, |
|
"grad_norm": 0.6079035401344299, |
|
"learning_rate": 1.331775269260604e-06, |
|
"loss": 2.3682, |
|
"num_input_tokens_seen": 2998584, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.906060606060606, |
|
"grad_norm": 0.5687966346740723, |
|
"learning_rate": 1.3279891205559034e-06, |
|
"loss": 2.4906, |
|
"num_input_tokens_seen": 3005784, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.6438218355178833, |
|
"learning_rate": 1.3242000275016527e-06, |
|
"loss": 2.4142, |
|
"num_input_tokens_seen": 3013968, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"eval_loss": 2.3308167457580566, |
|
"eval_runtime": 5.8211, |
|
"eval_samples_per_second": 3.436, |
|
"eval_steps_per_second": 3.436, |
|
"num_input_tokens_seen": 3013968, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9121212121212121, |
|
"grad_norm": 0.8877610564231873, |
|
"learning_rate": 1.3204080772647478e-06, |
|
"loss": 2.8198, |
|
"num_input_tokens_seen": 3021504, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.9151515151515152, |
|
"grad_norm": 0.6974935531616211, |
|
"learning_rate": 1.3166133570778143e-06, |
|
"loss": 2.4954, |
|
"num_input_tokens_seen": 3033264, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.9181818181818182, |
|
"grad_norm": 0.4437900483608246, |
|
"learning_rate": 1.3128159542371987e-06, |
|
"loss": 2.4191, |
|
"num_input_tokens_seen": 3044688, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.9212121212121213, |
|
"grad_norm": 0.41366204619407654, |
|
"learning_rate": 1.309015956100962e-06, |
|
"loss": 2.2432, |
|
"num_input_tokens_seen": 3056592, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.9242424242424242, |
|
"grad_norm": 0.4901912808418274, |
|
"learning_rate": 1.3052134500868686e-06, |
|
"loss": 2.4408, |
|
"num_input_tokens_seen": 3066048, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9272727272727272, |
|
"grad_norm": 0.7082731127738953, |
|
"learning_rate": 1.301408523670376e-06, |
|
"loss": 2.5248, |
|
"num_input_tokens_seen": 3076128, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.9303030303030303, |
|
"grad_norm": 0.6702643036842346, |
|
"learning_rate": 1.297601264382622e-06, |
|
"loss": 2.4202, |
|
"num_input_tokens_seen": 3085464, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.5271164178848267, |
|
"learning_rate": 1.2937917598084123e-06, |
|
"loss": 2.3525, |
|
"num_input_tokens_seen": 3094440, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.9363636363636364, |
|
"grad_norm": 0.5742107629776001, |
|
"learning_rate": 1.2899800975842038e-06, |
|
"loss": 2.3598, |
|
"num_input_tokens_seen": 3105720, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.9393939393939394, |
|
"grad_norm": 0.653012216091156, |
|
"learning_rate": 1.286166365396089e-06, |
|
"loss": 2.588, |
|
"num_input_tokens_seen": 3113856, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9393939393939394, |
|
"eval_loss": 2.3307266235351562, |
|
"eval_runtime": 5.8207, |
|
"eval_samples_per_second": 3.436, |
|
"eval_steps_per_second": 3.436, |
|
"num_input_tokens_seen": 3113856, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9424242424242424, |
|
"grad_norm": 0.7475118041038513, |
|
"learning_rate": 1.2823506509777807e-06, |
|
"loss": 2.4249, |
|
"num_input_tokens_seen": 3123288, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.9454545454545454, |
|
"grad_norm": 0.7373444437980652, |
|
"learning_rate": 1.2785330421085917e-06, |
|
"loss": 2.3551, |
|
"num_input_tokens_seen": 3131256, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.9484848484848485, |
|
"grad_norm": 0.5523613691329956, |
|
"learning_rate": 1.2747136266114156e-06, |
|
"loss": 2.1922, |
|
"num_input_tokens_seen": 3139656, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.9515151515151515, |
|
"grad_norm": 0.7101964950561523, |
|
"learning_rate": 1.270892492350707e-06, |
|
"loss": 2.4905, |
|
"num_input_tokens_seen": 3147744, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.9545454545454546, |
|
"grad_norm": 0.5868334770202637, |
|
"learning_rate": 1.267069727230461e-06, |
|
"loss": 2.4588, |
|
"num_input_tokens_seen": 3158376, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9575757575757575, |
|
"grad_norm": 0.6006575226783752, |
|
"learning_rate": 1.2632454191921894e-06, |
|
"loss": 2.3059, |
|
"num_input_tokens_seen": 3168120, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.9606060606060606, |
|
"grad_norm": 0.5622104406356812, |
|
"learning_rate": 1.2594196562128978e-06, |
|
"loss": 2.5159, |
|
"num_input_tokens_seen": 3178176, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.9636363636363636, |
|
"grad_norm": 0.5180094242095947, |
|
"learning_rate": 1.2555925263030634e-06, |
|
"loss": 2.3614, |
|
"num_input_tokens_seen": 3189816, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.9666666666666667, |
|
"grad_norm": 0.7544111013412476, |
|
"learning_rate": 1.2517641175046078e-06, |
|
"loss": 2.6341, |
|
"num_input_tokens_seen": 3198528, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.9696969696969697, |
|
"grad_norm": 0.5005560517311096, |
|
"learning_rate": 1.2479345178888752e-06, |
|
"loss": 2.1493, |
|
"num_input_tokens_seen": 3209904, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9696969696969697, |
|
"eval_loss": 2.3306069374084473, |
|
"eval_runtime": 5.8165, |
|
"eval_samples_per_second": 3.439, |
|
"eval_steps_per_second": 3.439, |
|
"num_input_tokens_seen": 3209904, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9727272727272728, |
|
"grad_norm": 0.6143120527267456, |
|
"learning_rate": 1.244103815554602e-06, |
|
"loss": 2.5543, |
|
"num_input_tokens_seen": 3220584, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.9757575757575757, |
|
"grad_norm": 0.6468402147293091, |
|
"learning_rate": 1.2402720986258936e-06, |
|
"loss": 2.3468, |
|
"num_input_tokens_seen": 3231576, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.9787878787878788, |
|
"grad_norm": 0.6000608205795288, |
|
"learning_rate": 1.2364394552501951e-06, |
|
"loss": 2.3648, |
|
"num_input_tokens_seen": 3239208, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.9818181818181818, |
|
"grad_norm": 0.6772189140319824, |
|
"learning_rate": 1.2326059735962648e-06, |
|
"loss": 2.5894, |
|
"num_input_tokens_seen": 3246072, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.9848484848484849, |
|
"grad_norm": 0.5030667185783386, |
|
"learning_rate": 1.228771741852145e-06, |
|
"loss": 2.4484, |
|
"num_input_tokens_seen": 3258000, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9878787878787879, |
|
"grad_norm": 0.8376536965370178, |
|
"learning_rate": 1.2249368482231334e-06, |
|
"loss": 2.5076, |
|
"num_input_tokens_seen": 3264912, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.990909090909091, |
|
"grad_norm": 0.6285922527313232, |
|
"learning_rate": 1.2211013809297546e-06, |
|
"loss": 2.3112, |
|
"num_input_tokens_seen": 3272832, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.9939393939393939, |
|
"grad_norm": 0.49095821380615234, |
|
"learning_rate": 1.21726542820573e-06, |
|
"loss": 2.3038, |
|
"num_input_tokens_seen": 3283848, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.996969696969697, |
|
"grad_norm": 0.5539312958717346, |
|
"learning_rate": 1.213429078295948e-06, |
|
"loss": 2.3811, |
|
"num_input_tokens_seen": 3295272, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.46812400221824646, |
|
"learning_rate": 1.2095924194544344e-06, |
|
"loss": 2.4287, |
|
"num_input_tokens_seen": 3305760, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.3300185203552246, |
|
"eval_runtime": 5.8178, |
|
"eval_samples_per_second": 3.438, |
|
"eval_steps_per_second": 3.438, |
|
"num_input_tokens_seen": 3305760, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.003030303030303, |
|
"grad_norm": 0.45484259724617004, |
|
"learning_rate": 1.2057555399423218e-06, |
|
"loss": 2.4229, |
|
"num_input_tokens_seen": 3316512, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.006060606060606, |
|
"grad_norm": 0.506411612033844, |
|
"learning_rate": 1.201918528025819e-06, |
|
"loss": 2.3718, |
|
"num_input_tokens_seen": 3328800, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.009090909090909, |
|
"grad_norm": 0.7456917762756348, |
|
"learning_rate": 1.1980814719741809e-06, |
|
"loss": 2.5418, |
|
"num_input_tokens_seen": 3335424, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.0121212121212122, |
|
"grad_norm": 0.6323581337928772, |
|
"learning_rate": 1.1942444600576783e-06, |
|
"loss": 2.4076, |
|
"num_input_tokens_seen": 3344904, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.0151515151515151, |
|
"grad_norm": 0.6008067727088928, |
|
"learning_rate": 1.1904075805455657e-06, |
|
"loss": 2.3543, |
|
"num_input_tokens_seen": 3355176, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.018181818181818, |
|
"grad_norm": 0.6115381121635437, |
|
"learning_rate": 1.186570921704052e-06, |
|
"loss": 2.3537, |
|
"num_input_tokens_seen": 3366096, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.0212121212121212, |
|
"grad_norm": 0.5540327429771423, |
|
"learning_rate": 1.18273457179427e-06, |
|
"loss": 2.1717, |
|
"num_input_tokens_seen": 3375696, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.0242424242424242, |
|
"grad_norm": 0.6130234599113464, |
|
"learning_rate": 1.1788986190702453e-06, |
|
"loss": 2.408, |
|
"num_input_tokens_seen": 3384288, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.0272727272727273, |
|
"grad_norm": 0.6069101095199585, |
|
"learning_rate": 1.1750631517768667e-06, |
|
"loss": 2.3485, |
|
"num_input_tokens_seen": 3391128, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.0303030303030303, |
|
"grad_norm": 0.5664869546890259, |
|
"learning_rate": 1.1712282581478552e-06, |
|
"loss": 2.4617, |
|
"num_input_tokens_seen": 3401640, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0303030303030303, |
|
"eval_loss": 2.3293986320495605, |
|
"eval_runtime": 5.8211, |
|
"eval_samples_per_second": 3.436, |
|
"eval_steps_per_second": 3.436, |
|
"num_input_tokens_seen": 3401640, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0333333333333334, |
|
"grad_norm": 0.5258334875106812, |
|
"learning_rate": 1.167394026403735e-06, |
|
"loss": 2.3971, |
|
"num_input_tokens_seen": 3411120, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.0363636363636364, |
|
"grad_norm": 0.5583547353744507, |
|
"learning_rate": 1.1635605447498048e-06, |
|
"loss": 2.3265, |
|
"num_input_tokens_seen": 3420912, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.0393939393939393, |
|
"grad_norm": 0.5852888822555542, |
|
"learning_rate": 1.1597279013741067e-06, |
|
"loss": 2.5114, |
|
"num_input_tokens_seen": 3429744, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.0424242424242425, |
|
"grad_norm": 0.5078532695770264, |
|
"learning_rate": 1.1558961844453978e-06, |
|
"loss": 2.5497, |
|
"num_input_tokens_seen": 3438936, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.0454545454545454, |
|
"grad_norm": 0.9847856760025024, |
|
"learning_rate": 1.152065482111125e-06, |
|
"loss": 2.5458, |
|
"num_input_tokens_seen": 3444912, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0484848484848486, |
|
"grad_norm": 0.49534177780151367, |
|
"learning_rate": 1.1482358824953919e-06, |
|
"loss": 2.3622, |
|
"num_input_tokens_seen": 3456936, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.0515151515151515, |
|
"grad_norm": 0.6851257681846619, |
|
"learning_rate": 1.144407473696937e-06, |
|
"loss": 2.221, |
|
"num_input_tokens_seen": 3466344, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.0545454545454545, |
|
"grad_norm": 0.4764980375766754, |
|
"learning_rate": 1.1405803437871027e-06, |
|
"loss": 2.3708, |
|
"num_input_tokens_seen": 3478632, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.0575757575757576, |
|
"grad_norm": 0.6040279865264893, |
|
"learning_rate": 1.136754580807811e-06, |
|
"loss": 2.5175, |
|
"num_input_tokens_seen": 3485496, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"grad_norm": 0.6335225701332092, |
|
"learning_rate": 1.1329302727695389e-06, |
|
"loss": 2.2166, |
|
"num_input_tokens_seen": 3496272, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"eval_loss": 2.329413890838623, |
|
"eval_runtime": 5.8255, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 3496272, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0636363636363637, |
|
"grad_norm": 0.6664142608642578, |
|
"learning_rate": 1.1291075076492928e-06, |
|
"loss": 2.5228, |
|
"num_input_tokens_seen": 3506712, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.5364813208580017, |
|
"learning_rate": 1.1252863733885845e-06, |
|
"loss": 2.4304, |
|
"num_input_tokens_seen": 3518856, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.0696969696969698, |
|
"grad_norm": 0.7389492988586426, |
|
"learning_rate": 1.1214669578914087e-06, |
|
"loss": 2.0998, |
|
"num_input_tokens_seen": 3528456, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.0727272727272728, |
|
"grad_norm": 0.709426999092102, |
|
"learning_rate": 1.1176493490222192e-06, |
|
"loss": 2.146, |
|
"num_input_tokens_seen": 3537048, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.0757575757575757, |
|
"grad_norm": 0.7311533093452454, |
|
"learning_rate": 1.1138336346039113e-06, |
|
"loss": 2.3275, |
|
"num_input_tokens_seen": 3544536, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.0787878787878789, |
|
"grad_norm": 0.5675577521324158, |
|
"learning_rate": 1.1100199024157966e-06, |
|
"loss": 2.3477, |
|
"num_input_tokens_seen": 3551472, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.0818181818181818, |
|
"grad_norm": 0.6367121934890747, |
|
"learning_rate": 1.1062082401915878e-06, |
|
"loss": 2.4356, |
|
"num_input_tokens_seen": 3561312, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.084848484848485, |
|
"grad_norm": 0.5750899910926819, |
|
"learning_rate": 1.1023987356173782e-06, |
|
"loss": 2.5201, |
|
"num_input_tokens_seen": 3570456, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.087878787878788, |
|
"grad_norm": 0.46258801221847534, |
|
"learning_rate": 1.0985914763296245e-06, |
|
"loss": 2.0526, |
|
"num_input_tokens_seen": 3582744, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"grad_norm": 0.6125935912132263, |
|
"learning_rate": 1.0947865499131315e-06, |
|
"loss": 2.2984, |
|
"num_input_tokens_seen": 3595032, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"eval_loss": 2.328953504562378, |
|
"eval_runtime": 5.8254, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 3595032, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.093939393939394, |
|
"grad_norm": 0.48193785548210144, |
|
"learning_rate": 1.0909840438990383e-06, |
|
"loss": 1.7515, |
|
"num_input_tokens_seen": 3606048, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.096969696969697, |
|
"grad_norm": 0.48528820276260376, |
|
"learning_rate": 1.0871840457628012e-06, |
|
"loss": 2.3416, |
|
"num_input_tokens_seen": 3616368, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.46913468837738037, |
|
"learning_rate": 1.0833866429221858e-06, |
|
"loss": 2.3327, |
|
"num_input_tokens_seen": 3628368, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.103030303030303, |
|
"grad_norm": 0.5710415840148926, |
|
"learning_rate": 1.0795919227352523e-06, |
|
"loss": 2.401, |
|
"num_input_tokens_seen": 3637848, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.106060606060606, |
|
"grad_norm": 0.5964322090148926, |
|
"learning_rate": 1.0757999724983474e-06, |
|
"loss": 2.2503, |
|
"num_input_tokens_seen": 3647640, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1090909090909091, |
|
"grad_norm": 0.5693560242652893, |
|
"learning_rate": 1.0720108794440967e-06, |
|
"loss": 2.4449, |
|
"num_input_tokens_seen": 3658272, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.112121212121212, |
|
"grad_norm": 0.7325261235237122, |
|
"learning_rate": 1.068224730739396e-06, |
|
"loss": 2.2787, |
|
"num_input_tokens_seen": 3668760, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.1151515151515152, |
|
"grad_norm": 0.5507751107215881, |
|
"learning_rate": 1.064441613483409e-06, |
|
"loss": 2.2226, |
|
"num_input_tokens_seen": 3679608, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.1181818181818182, |
|
"grad_norm": 0.4701879620552063, |
|
"learning_rate": 1.0606616147055602e-06, |
|
"loss": 2.6116, |
|
"num_input_tokens_seen": 3689832, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.121212121212121, |
|
"grad_norm": 0.5531448125839233, |
|
"learning_rate": 1.056884821363535e-06, |
|
"loss": 2.1242, |
|
"num_input_tokens_seen": 3700392, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.121212121212121, |
|
"eval_loss": 2.3289198875427246, |
|
"eval_runtime": 5.8244, |
|
"eval_samples_per_second": 3.434, |
|
"eval_steps_per_second": 3.434, |
|
"num_input_tokens_seen": 3700392, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1242424242424243, |
|
"grad_norm": 0.7482770085334778, |
|
"learning_rate": 1.05311132034128e-06, |
|
"loss": 2.3979, |
|
"num_input_tokens_seen": 3709632, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.1272727272727272, |
|
"grad_norm": 0.6427175998687744, |
|
"learning_rate": 1.0493411984470007e-06, |
|
"loss": 2.4608, |
|
"num_input_tokens_seen": 3717720, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.1303030303030304, |
|
"grad_norm": 0.5718503594398499, |
|
"learning_rate": 1.0455745424111686e-06, |
|
"loss": 2.5028, |
|
"num_input_tokens_seen": 3728280, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.1333333333333333, |
|
"grad_norm": 0.8905156850814819, |
|
"learning_rate": 1.0418114388845242e-06, |
|
"loss": 2.5461, |
|
"num_input_tokens_seen": 3735888, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 0.535351037979126, |
|
"learning_rate": 1.038051974436082e-06, |
|
"loss": 2.2596, |
|
"num_input_tokens_seen": 3747720, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1393939393939394, |
|
"grad_norm": 0.5600206255912781, |
|
"learning_rate": 1.034296235551143e-06, |
|
"loss": 2.2801, |
|
"num_input_tokens_seen": 3758640, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.1424242424242423, |
|
"grad_norm": 0.5470922589302063, |
|
"learning_rate": 1.0305443086293003e-06, |
|
"loss": 2.3337, |
|
"num_input_tokens_seen": 3769128, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.1454545454545455, |
|
"grad_norm": 0.5066417455673218, |
|
"learning_rate": 1.0267962799824562e-06, |
|
"loss": 2.6706, |
|
"num_input_tokens_seen": 3779304, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.1484848484848484, |
|
"grad_norm": 0.46135252714157104, |
|
"learning_rate": 1.0230522358328331e-06, |
|
"loss": 2.2422, |
|
"num_input_tokens_seen": 3789312, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.1515151515151516, |
|
"grad_norm": 0.7310757637023926, |
|
"learning_rate": 1.0193122623109917e-06, |
|
"loss": 2.4892, |
|
"num_input_tokens_seen": 3796848, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1515151515151516, |
|
"eval_loss": 2.3289122581481934, |
|
"eval_runtime": 5.8291, |
|
"eval_samples_per_second": 3.431, |
|
"eval_steps_per_second": 3.431, |
|
"num_input_tokens_seen": 3796848, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1545454545454545, |
|
"grad_norm": 0.5655786991119385, |
|
"learning_rate": 1.015576445453849e-06, |
|
"loss": 2.2826, |
|
"num_input_tokens_seen": 3806640, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.1575757575757575, |
|
"grad_norm": 0.6524637341499329, |
|
"learning_rate": 1.0118448712026992e-06, |
|
"loss": 2.4358, |
|
"num_input_tokens_seen": 3817608, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.1606060606060606, |
|
"grad_norm": 0.6280786991119385, |
|
"learning_rate": 1.0081176254012374e-06, |
|
"loss": 2.421, |
|
"num_input_tokens_seen": 3827592, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.1636363636363636, |
|
"grad_norm": 0.6797434687614441, |
|
"learning_rate": 1.0043947937935832e-06, |
|
"loss": 2.3245, |
|
"num_input_tokens_seen": 3837264, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.5665332078933716, |
|
"learning_rate": 1.0006764620223093e-06, |
|
"loss": 2.3388, |
|
"num_input_tokens_seen": 3847656, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1696969696969697, |
|
"grad_norm": 0.4868026077747345, |
|
"learning_rate": 9.96962715626471e-07, |
|
"loss": 2.3956, |
|
"num_input_tokens_seen": 3858600, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.1727272727272728, |
|
"grad_norm": 0.77336585521698, |
|
"learning_rate": 9.932536400396393e-07, |
|
"loss": 2.3562, |
|
"num_input_tokens_seen": 3870120, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.1757575757575758, |
|
"grad_norm": 0.6464818120002747, |
|
"learning_rate": 9.895493205879332e-07, |
|
"loss": 2.5851, |
|
"num_input_tokens_seen": 3879600, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.1787878787878787, |
|
"grad_norm": 0.6274628639221191, |
|
"learning_rate": 9.858498424880592e-07, |
|
"loss": 2.7061, |
|
"num_input_tokens_seen": 3889296, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"grad_norm": 0.5714861154556274, |
|
"learning_rate": 9.821552908453506e-07, |
|
"loss": 2.4251, |
|
"num_input_tokens_seen": 3901464, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"eval_loss": 2.3287835121154785, |
|
"eval_runtime": 5.8272, |
|
"eval_samples_per_second": 3.432, |
|
"eval_steps_per_second": 3.432, |
|
"num_input_tokens_seen": 3901464, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.1848484848484848, |
|
"grad_norm": 0.6943346261978149, |
|
"learning_rate": 9.784657506518078e-07, |
|
"loss": 2.6212, |
|
"num_input_tokens_seen": 3910656, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.187878787878788, |
|
"grad_norm": 0.4821998178958893, |
|
"learning_rate": 9.747813067841455e-07, |
|
"loss": 2.3086, |
|
"num_input_tokens_seen": 3922944, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.190909090909091, |
|
"grad_norm": 0.7086381912231445, |
|
"learning_rate": 9.711020440018384e-07, |
|
"loss": 2.5027, |
|
"num_input_tokens_seen": 3931752, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.1939393939393939, |
|
"grad_norm": 0.5712624788284302, |
|
"learning_rate": 9.674280469451718e-07, |
|
"loss": 2.4088, |
|
"num_input_tokens_seen": 3942120, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.196969696969697, |
|
"grad_norm": 0.6443710327148438, |
|
"learning_rate": 9.637594001332956e-07, |
|
"loss": 2.3161, |
|
"num_input_tokens_seen": 3952248, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.6674967408180237, |
|
"learning_rate": 9.600961879622777e-07, |
|
"loss": 2.4837, |
|
"num_input_tokens_seen": 3960600, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.2030303030303031, |
|
"grad_norm": 0.5792006254196167, |
|
"learning_rate": 9.564384947031646e-07, |
|
"loss": 2.3195, |
|
"num_input_tokens_seen": 3971568, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.206060606060606, |
|
"grad_norm": 0.7185015082359314, |
|
"learning_rate": 9.527864045000421e-07, |
|
"loss": 2.5749, |
|
"num_input_tokens_seen": 3983592, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.209090909090909, |
|
"grad_norm": 0.6423861980438232, |
|
"learning_rate": 9.491400013680988e-07, |
|
"loss": 2.39, |
|
"num_input_tokens_seen": 3994008, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 0.6292434334754944, |
|
"learning_rate": 9.454993691916948e-07, |
|
"loss": 2.3579, |
|
"num_input_tokens_seen": 4004496, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"eval_loss": 2.3282077312469482, |
|
"eval_runtime": 5.8254, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4004496, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.215151515151515, |
|
"grad_norm": 0.6097608208656311, |
|
"learning_rate": 9.418645917224303e-07, |
|
"loss": 2.3152, |
|
"num_input_tokens_seen": 4016592, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.2181818181818183, |
|
"grad_norm": 0.5774179100990295, |
|
"learning_rate": 9.382357525772202e-07, |
|
"loss": 2.4599, |
|
"num_input_tokens_seen": 4024800, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.2212121212121212, |
|
"grad_norm": 0.645380973815918, |
|
"learning_rate": 9.346129352363705e-07, |
|
"loss": 2.2412, |
|
"num_input_tokens_seen": 4035144, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.2242424242424241, |
|
"grad_norm": 0.6434935331344604, |
|
"learning_rate": 9.309962230416574e-07, |
|
"loss": 2.4022, |
|
"num_input_tokens_seen": 4042920, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.2272727272727273, |
|
"grad_norm": 0.5125094056129456, |
|
"learning_rate": 9.273856991944089e-07, |
|
"loss": 2.4082, |
|
"num_input_tokens_seen": 4053072, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2303030303030302, |
|
"grad_norm": 0.5167670845985413, |
|
"learning_rate": 9.237814467535941e-07, |
|
"loss": 2.3188, |
|
"num_input_tokens_seen": 4063368, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.2333333333333334, |
|
"grad_norm": 0.5533791184425354, |
|
"learning_rate": 9.201835486339084e-07, |
|
"loss": 2.4367, |
|
"num_input_tokens_seen": 4072392, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.2363636363636363, |
|
"grad_norm": 0.5429077744483948, |
|
"learning_rate": 9.165920876038694e-07, |
|
"loss": 2.3054, |
|
"num_input_tokens_seen": 4083072, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.2393939393939393, |
|
"grad_norm": 0.530968427658081, |
|
"learning_rate": 9.130071462839108e-07, |
|
"loss": 2.4475, |
|
"num_input_tokens_seen": 4093776, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.2424242424242424, |
|
"grad_norm": 0.5137664675712585, |
|
"learning_rate": 9.094288071444822e-07, |
|
"loss": 2.4868, |
|
"num_input_tokens_seen": 4106040, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2424242424242424, |
|
"eval_loss": 2.3283748626708984, |
|
"eval_runtime": 5.8265, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4106040, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2454545454545454, |
|
"grad_norm": 0.7732150554656982, |
|
"learning_rate": 9.058571525041534e-07, |
|
"loss": 2.4682, |
|
"num_input_tokens_seen": 4117392, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 1.2484848484848485, |
|
"grad_norm": 0.6861566305160522, |
|
"learning_rate": 9.022922645277176e-07, |
|
"loss": 2.372, |
|
"num_input_tokens_seen": 4125696, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.2515151515151515, |
|
"grad_norm": 0.4728741943836212, |
|
"learning_rate": 8.987342252243042e-07, |
|
"loss": 2.4424, |
|
"num_input_tokens_seen": 4137816, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 1.2545454545454544, |
|
"grad_norm": 0.5557587742805481, |
|
"learning_rate": 8.951831164454908e-07, |
|
"loss": 2.4164, |
|
"num_input_tokens_seen": 4150104, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.2575757575757576, |
|
"grad_norm": 0.6730014085769653, |
|
"learning_rate": 8.916390198834203e-07, |
|
"loss": 2.4451, |
|
"num_input_tokens_seen": 4160832, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2606060606060607, |
|
"grad_norm": 0.7126666307449341, |
|
"learning_rate": 8.88102017068922e-07, |
|
"loss": 2.3256, |
|
"num_input_tokens_seen": 4170216, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.2636363636363637, |
|
"grad_norm": 0.6457303762435913, |
|
"learning_rate": 8.845721893696354e-07, |
|
"loss": 2.2176, |
|
"num_input_tokens_seen": 4181256, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 1.2666666666666666, |
|
"grad_norm": 1.0662436485290527, |
|
"learning_rate": 8.810496179881387e-07, |
|
"loss": 2.3812, |
|
"num_input_tokens_seen": 4192128, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.2696969696969698, |
|
"grad_norm": 0.4683075547218323, |
|
"learning_rate": 8.775343839600816e-07, |
|
"loss": 2.4275, |
|
"num_input_tokens_seen": 4202208, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 0.5171107649803162, |
|
"learning_rate": 8.740265681523195e-07, |
|
"loss": 2.4706, |
|
"num_input_tokens_seen": 4210464, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"eval_loss": 2.3279545307159424, |
|
"eval_runtime": 5.823, |
|
"eval_samples_per_second": 3.435, |
|
"eval_steps_per_second": 3.435, |
|
"num_input_tokens_seen": 4210464, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.2757575757575759, |
|
"grad_norm": 0.7313932180404663, |
|
"learning_rate": 8.705262512610539e-07, |
|
"loss": 2.4054, |
|
"num_input_tokens_seen": 4217928, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 1.2787878787878788, |
|
"grad_norm": 0.7015888690948486, |
|
"learning_rate": 8.670335138099765e-07, |
|
"loss": 2.4653, |
|
"num_input_tokens_seen": 4226904, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.2818181818181817, |
|
"grad_norm": 0.6179009079933167, |
|
"learning_rate": 8.635484361484158e-07, |
|
"loss": 2.3184, |
|
"num_input_tokens_seen": 4237656, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.284848484848485, |
|
"grad_norm": 0.5112322568893433, |
|
"learning_rate": 8.600710984494909e-07, |
|
"loss": 2.3415, |
|
"num_input_tokens_seen": 4248720, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.2878787878787878, |
|
"grad_norm": 0.7824225425720215, |
|
"learning_rate": 8.56601580708263e-07, |
|
"loss": 2.6382, |
|
"num_input_tokens_seen": 4253448, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.290909090909091, |
|
"grad_norm": 0.7822674512863159, |
|
"learning_rate": 8.531399627398991e-07, |
|
"loss": 2.5681, |
|
"num_input_tokens_seen": 4261488, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.293939393939394, |
|
"grad_norm": 0.5791777968406677, |
|
"learning_rate": 8.496863241778346e-07, |
|
"loss": 2.2039, |
|
"num_input_tokens_seen": 4273104, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 1.2969696969696969, |
|
"grad_norm": 0.5415911674499512, |
|
"learning_rate": 8.462407444719405e-07, |
|
"loss": 2.3936, |
|
"num_input_tokens_seen": 4283136, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.5852922797203064, |
|
"learning_rate": 8.428033028866967e-07, |
|
"loss": 2.3669, |
|
"num_input_tokens_seen": 4292208, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 1.303030303030303, |
|
"grad_norm": 0.5799878239631653, |
|
"learning_rate": 8.393740784993677e-07, |
|
"loss": 2.4704, |
|
"num_input_tokens_seen": 4302240, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.303030303030303, |
|
"eval_loss": 2.3276970386505127, |
|
"eval_runtime": 5.8227, |
|
"eval_samples_per_second": 3.435, |
|
"eval_steps_per_second": 3.435, |
|
"num_input_tokens_seen": 4302240, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3060606060606061, |
|
"grad_norm": 0.7296667695045471, |
|
"learning_rate": 8.359531501981846e-07, |
|
"loss": 2.7633, |
|
"num_input_tokens_seen": 4311888, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 1.309090909090909, |
|
"grad_norm": 0.9460285305976868, |
|
"learning_rate": 8.325405966805295e-07, |
|
"loss": 2.1671, |
|
"num_input_tokens_seen": 4321992, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.312121212121212, |
|
"grad_norm": 0.5294950008392334, |
|
"learning_rate": 8.291364964511247e-07, |
|
"loss": 2.4139, |
|
"num_input_tokens_seen": 4332408, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 1.3151515151515152, |
|
"grad_norm": 0.6206031441688538, |
|
"learning_rate": 8.25740927820228e-07, |
|
"loss": 2.5621, |
|
"num_input_tokens_seen": 4344696, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.3181818181818181, |
|
"grad_norm": 0.5652275085449219, |
|
"learning_rate": 8.223539689018299e-07, |
|
"loss": 2.4142, |
|
"num_input_tokens_seen": 4356168, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.3212121212121213, |
|
"grad_norm": 0.6217209696769714, |
|
"learning_rate": 8.189756976118568e-07, |
|
"loss": 2.3459, |
|
"num_input_tokens_seen": 4364568, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.3242424242424242, |
|
"grad_norm": 0.5359376072883606, |
|
"learning_rate": 8.156061916663807e-07, |
|
"loss": 2.2973, |
|
"num_input_tokens_seen": 4374984, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 1.3272727272727272, |
|
"grad_norm": 0.531065821647644, |
|
"learning_rate": 8.12245528579828e-07, |
|
"loss": 2.5294, |
|
"num_input_tokens_seen": 4385424, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.3303030303030303, |
|
"grad_norm": 0.837188184261322, |
|
"learning_rate": 8.088937856631974e-07, |
|
"loss": 2.4239, |
|
"num_input_tokens_seen": 4395192, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.6112043857574463, |
|
"learning_rate": 8.055510400222836e-07, |
|
"loss": 2.4403, |
|
"num_input_tokens_seen": 4405608, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"eval_loss": 2.328122138977051, |
|
"eval_runtime": 5.8246, |
|
"eval_samples_per_second": 3.434, |
|
"eval_steps_per_second": 3.434, |
|
"num_input_tokens_seen": 4405608, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3363636363636364, |
|
"grad_norm": 0.6124045252799988, |
|
"learning_rate": 8.022173685559011e-07, |
|
"loss": 2.389, |
|
"num_input_tokens_seen": 4417896, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.3393939393939394, |
|
"grad_norm": 0.6339285969734192, |
|
"learning_rate": 7.988928479541154e-07, |
|
"loss": 2.3811, |
|
"num_input_tokens_seen": 4428000, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.3424242424242423, |
|
"grad_norm": 0.5700270533561707, |
|
"learning_rate": 7.955775546964797e-07, |
|
"loss": 2.4351, |
|
"num_input_tokens_seen": 4436736, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 1.3454545454545455, |
|
"grad_norm": 0.5536416172981262, |
|
"learning_rate": 7.922715650502746e-07, |
|
"loss": 2.4343, |
|
"num_input_tokens_seen": 4447488, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.3484848484848486, |
|
"grad_norm": 0.6863646507263184, |
|
"learning_rate": 7.889749550687552e-07, |
|
"loss": 2.5435, |
|
"num_input_tokens_seen": 4455840, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3515151515151516, |
|
"grad_norm": 0.6737553477287292, |
|
"learning_rate": 7.856878005893988e-07, |
|
"loss": 2.3398, |
|
"num_input_tokens_seen": 4463568, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.3545454545454545, |
|
"grad_norm": 0.7057380676269531, |
|
"learning_rate": 7.824101772321625e-07, |
|
"loss": 2.3618, |
|
"num_input_tokens_seen": 4472904, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 1.3575757575757577, |
|
"grad_norm": 0.47144582867622375, |
|
"learning_rate": 7.791421603977435e-07, |
|
"loss": 2.1904, |
|
"num_input_tokens_seen": 4484400, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.3606060606060606, |
|
"grad_norm": 0.5720792412757874, |
|
"learning_rate": 7.758838252658433e-07, |
|
"loss": 2.3122, |
|
"num_input_tokens_seen": 4493592, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.6241214275360107, |
|
"learning_rate": 7.726352467934386e-07, |
|
"loss": 2.4964, |
|
"num_input_tokens_seen": 4502664, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"eval_loss": 2.327789783477783, |
|
"eval_runtime": 5.8265, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4502664, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3666666666666667, |
|
"grad_norm": 0.6216875910758972, |
|
"learning_rate": 7.693964997130581e-07, |
|
"loss": 2.4142, |
|
"num_input_tokens_seen": 4510920, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 1.3696969696969696, |
|
"grad_norm": 0.5733647346496582, |
|
"learning_rate": 7.661676585310618e-07, |
|
"loss": 2.3751, |
|
"num_input_tokens_seen": 4523208, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.3727272727272728, |
|
"grad_norm": 0.5904967784881592, |
|
"learning_rate": 7.629487975259276e-07, |
|
"loss": 2.5808, |
|
"num_input_tokens_seen": 4532520, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 1.3757575757575757, |
|
"grad_norm": 0.44976285099983215, |
|
"learning_rate": 7.597399907465431e-07, |
|
"loss": 2.3199, |
|
"num_input_tokens_seen": 4544688, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.378787878787879, |
|
"grad_norm": 0.6326127052307129, |
|
"learning_rate": 7.565413120105009e-07, |
|
"loss": 2.3752, |
|
"num_input_tokens_seen": 4554000, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.3818181818181818, |
|
"grad_norm": 0.5754263997077942, |
|
"learning_rate": 7.533528349024014e-07, |
|
"loss": 2.3512, |
|
"num_input_tokens_seen": 4564368, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.3848484848484848, |
|
"grad_norm": 0.7068946957588196, |
|
"learning_rate": 7.5017463277216e-07, |
|
"loss": 2.3772, |
|
"num_input_tokens_seen": 4574448, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 1.387878787878788, |
|
"grad_norm": 0.6131560206413269, |
|
"learning_rate": 7.470067787333188e-07, |
|
"loss": 2.4036, |
|
"num_input_tokens_seen": 4582464, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.3909090909090909, |
|
"grad_norm": 0.6577942967414856, |
|
"learning_rate": 7.43849345661367e-07, |
|
"loss": 2.3063, |
|
"num_input_tokens_seen": 4592976, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.393939393939394, |
|
"grad_norm": 0.7147580981254578, |
|
"learning_rate": 7.407024061920599e-07, |
|
"loss": 2.4129, |
|
"num_input_tokens_seen": 4603920, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.393939393939394, |
|
"eval_loss": 2.32749080657959, |
|
"eval_runtime": 5.8263, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4603920, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.396969696969697, |
|
"grad_norm": 0.5320861339569092, |
|
"learning_rate": 7.375660327197534e-07, |
|
"loss": 2.3207, |
|
"num_input_tokens_seen": 4614072, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.6758208870887756, |
|
"learning_rate": 7.344402973957346e-07, |
|
"loss": 2.4536, |
|
"num_input_tokens_seen": 4622640, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.403030303030303, |
|
"grad_norm": 0.5670093894004822, |
|
"learning_rate": 7.313252721265638e-07, |
|
"loss": 2.5495, |
|
"num_input_tokens_seen": 4634040, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 1.406060606060606, |
|
"grad_norm": 0.5245952606201172, |
|
"learning_rate": 7.282210285724195e-07, |
|
"loss": 2.4487, |
|
"num_input_tokens_seen": 4644192, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.4090909090909092, |
|
"grad_norm": 0.4705655872821808, |
|
"learning_rate": 7.251276381454506e-07, |
|
"loss": 2.5896, |
|
"num_input_tokens_seen": 4653720, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.412121212121212, |
|
"grad_norm": 0.5075128674507141, |
|
"learning_rate": 7.22045172008133e-07, |
|
"loss": 2.261, |
|
"num_input_tokens_seen": 4666008, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.415151515151515, |
|
"grad_norm": 0.5407282710075378, |
|
"learning_rate": 7.189737010716326e-07, |
|
"loss": 2.384, |
|
"num_input_tokens_seen": 4674936, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 1.4181818181818182, |
|
"grad_norm": 0.6681150794029236, |
|
"learning_rate": 7.159132959941745e-07, |
|
"loss": 2.4542, |
|
"num_input_tokens_seen": 4684272, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.4212121212121211, |
|
"grad_norm": 0.6024764776229858, |
|
"learning_rate": 7.128640271794171e-07, |
|
"loss": 2.3937, |
|
"num_input_tokens_seen": 4695576, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.4242424242424243, |
|
"grad_norm": 0.5031726956367493, |
|
"learning_rate": 7.098259647748328e-07, |
|
"loss": 2.2943, |
|
"num_input_tokens_seen": 4705800, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4242424242424243, |
|
"eval_loss": 2.3277194499969482, |
|
"eval_runtime": 5.8264, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4705800, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4272727272727272, |
|
"grad_norm": 0.5406504273414612, |
|
"learning_rate": 7.067991786700929e-07, |
|
"loss": 2.3552, |
|
"num_input_tokens_seen": 4718088, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 1.4303030303030302, |
|
"grad_norm": 0.5154955387115479, |
|
"learning_rate": 7.037837384954625e-07, |
|
"loss": 2.4507, |
|
"num_input_tokens_seen": 4729536, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.4333333333333333, |
|
"grad_norm": 0.710150420665741, |
|
"learning_rate": 7.007797136201966e-07, |
|
"loss": 2.4813, |
|
"num_input_tokens_seen": 4738272, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 1.4363636363636363, |
|
"grad_norm": 0.5603686571121216, |
|
"learning_rate": 6.977871731509438e-07, |
|
"loss": 2.4679, |
|
"num_input_tokens_seen": 4747488, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.4393939393939394, |
|
"grad_norm": 0.6040205359458923, |
|
"learning_rate": 6.948061859301593e-07, |
|
"loss": 2.5084, |
|
"num_input_tokens_seen": 4756032, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4424242424242424, |
|
"grad_norm": 0.6151003837585449, |
|
"learning_rate": 6.918368205345182e-07, |
|
"loss": 2.3797, |
|
"num_input_tokens_seen": 4766904, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.4454545454545453, |
|
"grad_norm": 0.5921849012374878, |
|
"learning_rate": 6.888791452733397e-07, |
|
"loss": 2.4923, |
|
"num_input_tokens_seen": 4777680, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.4484848484848485, |
|
"grad_norm": 0.5749545693397522, |
|
"learning_rate": 6.859332281870147e-07, |
|
"loss": 2.5362, |
|
"num_input_tokens_seen": 4788432, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.4515151515151516, |
|
"grad_norm": 0.5609776973724365, |
|
"learning_rate": 6.829991370454411e-07, |
|
"loss": 2.433, |
|
"num_input_tokens_seen": 4799712, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 0.6038674116134644, |
|
"learning_rate": 6.800769393464656e-07, |
|
"loss": 2.362, |
|
"num_input_tokens_seen": 4808688, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"eval_loss": 2.3274452686309814, |
|
"eval_runtime": 5.8255, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4808688, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4575757575757575, |
|
"grad_norm": 0.6705885529518127, |
|
"learning_rate": 6.771667023143284e-07, |
|
"loss": 2.5027, |
|
"num_input_tokens_seen": 4817136, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 1.4606060606060607, |
|
"grad_norm": 0.6026042699813843, |
|
"learning_rate": 6.742684928981188e-07, |
|
"loss": 2.6941, |
|
"num_input_tokens_seen": 4829112, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.4636363636363636, |
|
"grad_norm": 0.5220550894737244, |
|
"learning_rate": 6.713823777702359e-07, |
|
"loss": 2.2785, |
|
"num_input_tokens_seen": 4838664, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.4666666666666668, |
|
"grad_norm": 0.6457405090332031, |
|
"learning_rate": 6.685084233248517e-07, |
|
"loss": 2.502, |
|
"num_input_tokens_seen": 4846656, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.4696969696969697, |
|
"grad_norm": 0.831514298915863, |
|
"learning_rate": 6.656466956763864e-07, |
|
"loss": 2.4094, |
|
"num_input_tokens_seen": 4855296, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4727272727272727, |
|
"grad_norm": 0.623429000377655, |
|
"learning_rate": 6.627972606579866e-07, |
|
"loss": 2.3646, |
|
"num_input_tokens_seen": 4867584, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.4757575757575758, |
|
"grad_norm": 0.6878921389579773, |
|
"learning_rate": 6.599601838200104e-07, |
|
"loss": 2.3642, |
|
"num_input_tokens_seen": 4879584, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 1.4787878787878788, |
|
"grad_norm": 0.8445355296134949, |
|
"learning_rate": 6.571355304285202e-07, |
|
"loss": 2.571, |
|
"num_input_tokens_seen": 4889976, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.481818181818182, |
|
"grad_norm": 0.5575315356254578, |
|
"learning_rate": 6.543233654637804e-07, |
|
"loss": 2.5749, |
|
"num_input_tokens_seen": 4899048, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.4848484848484849, |
|
"grad_norm": 0.5096350312232971, |
|
"learning_rate": 6.515237536187644e-07, |
|
"loss": 2.2386, |
|
"num_input_tokens_seen": 4910088, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.4848484848484849, |
|
"eval_loss": 2.3277652263641357, |
|
"eval_runtime": 5.8263, |
|
"eval_samples_per_second": 3.433, |
|
"eval_steps_per_second": 3.433, |
|
"num_input_tokens_seen": 4910088, |
|
"step": 980 |
|
}, |
|
    {
      "epoch": 1.4878787878787878,
      "grad_norm": 0.7003534436225891,
      "learning_rate": 6.487367592976633e-07,
      "loss": 2.5641,
      "num_input_tokens_seen": 4922376,
      "step": 982
    },
    {
      "epoch": 1.490909090909091,
      "grad_norm": 0.5951968431472778,
      "learning_rate": 6.459624466144067e-07,
      "loss": 2.298,
      "num_input_tokens_seen": 4934664,
      "step": 984
    },
    {
      "epoch": 1.493939393939394,
      "grad_norm": 0.7097399234771729,
      "learning_rate": 6.432008793911877e-07,
      "loss": 2.3938,
      "num_input_tokens_seen": 4943352,
      "step": 986
    },
    {
      "epoch": 1.496969696969697,
      "grad_norm": 0.5688740015029907,
      "learning_rate": 6.404521211569937e-07,
      "loss": 2.421,
      "num_input_tokens_seen": 4953888,
      "step": 988
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.6089447736740112,
      "learning_rate": 6.377162351461442e-07,
      "loss": 2.1273,
      "num_input_tokens_seen": 4965024,
      "step": 990
    },
    {
      "epoch": 1.503030303030303,
      "grad_norm": 0.5698357224464417,
      "learning_rate": 6.349932842968391e-07,
      "loss": 2.3928,
      "num_input_tokens_seen": 4977216,
      "step": 992
    },
    {
      "epoch": 1.506060606060606,
      "grad_norm": 0.6300851702690125,
      "learning_rate": 6.322833312497082e-07,
      "loss": 2.3595,
      "num_input_tokens_seen": 4986720,
      "step": 994
    },
    {
      "epoch": 1.509090909090909,
      "grad_norm": 0.5977615714073181,
      "learning_rate": 6.295864383463705e-07,
      "loss": 2.5852,
      "num_input_tokens_seen": 4995072,
      "step": 996
    },
    {
      "epoch": 1.5121212121212122,
      "grad_norm": 0.6872332096099854,
      "learning_rate": 6.269026676280008e-07,
      "loss": 2.4611,
      "num_input_tokens_seen": 5003256,
      "step": 998
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 0.7128229141235352,
      "learning_rate": 6.242320808339023e-07,
      "loss": 2.0949,
      "num_input_tokens_seen": 5010864,
      "step": 1000
    },
    {
      "epoch": 1.5151515151515151,
      "eval_loss": 2.3277881145477295,
      "eval_runtime": 5.8265,
      "eval_samples_per_second": 3.433,
      "eval_steps_per_second": 3.433,
      "num_input_tokens_seen": 5010864,
      "step": 1000
    },
    {
      "epoch": 1.518181818181818,
      "grad_norm": 0.48600301146507263,
      "learning_rate": 6.215747394000864e-07,
      "loss": 2.2478,
      "num_input_tokens_seen": 5021400,
      "step": 1002
    },
    {
      "epoch": 1.5212121212121212,
      "grad_norm": 0.6063188314437866,
      "learning_rate": 6.189307044578585e-07,
      "loss": 2.1912,
      "num_input_tokens_seen": 5031576,
      "step": 1004
    },
    {
      "epoch": 1.5242424242424244,
      "grad_norm": 0.6136674284934998,
      "learning_rate": 6.163000368324124e-07,
      "loss": 2.3441,
      "num_input_tokens_seen": 5042136,
      "step": 1006
    },
    {
      "epoch": 1.5272727272727273,
      "grad_norm": 0.6810842156410217,
      "learning_rate": 6.136827970414317e-07,
      "loss": 2.3444,
      "num_input_tokens_seen": 5052480,
      "step": 1008
    },
    {
      "epoch": 1.5303030303030303,
      "grad_norm": 0.70346599817276,
      "learning_rate": 6.11079045293696e-07,
      "loss": 2.5014,
      "num_input_tokens_seen": 5062872,
      "step": 1010
    },
    {
      "epoch": 1.5333333333333332,
      "grad_norm": 0.6263840198516846,
      "learning_rate": 6.084888414876976e-07,
      "loss": 2.2427,
      "num_input_tokens_seen": 5073744,
      "step": 1012
    },
    {
      "epoch": 1.5363636363636364,
      "grad_norm": 0.6593678593635559,
      "learning_rate": 6.059122452102618e-07,
      "loss": 2.3813,
      "num_input_tokens_seen": 5082432,
      "step": 1014
    },
    {
      "epoch": 1.5393939393939395,
      "grad_norm": 0.521698534488678,
      "learning_rate": 6.033493157351772e-07,
      "loss": 2.6378,
      "num_input_tokens_seen": 5092848,
      "step": 1016
    },
    {
      "epoch": 1.5424242424242425,
      "grad_norm": 0.46363523602485657,
      "learning_rate": 6.008001120218322e-07,
      "loss": 2.4006,
      "num_input_tokens_seen": 5105136,
      "step": 1018
    },
    {
      "epoch": 1.5454545454545454,
      "grad_norm": 0.7737420797348022,
      "learning_rate": 5.982646927138584e-07,
      "loss": 2.5504,
      "num_input_tokens_seen": 5114064,
      "step": 1020
    },
    {
      "epoch": 1.5454545454545454,
      "eval_loss": 2.3275692462921143,
      "eval_runtime": 5.8238,
      "eval_samples_per_second": 3.434,
      "eval_steps_per_second": 3.434,
      "num_input_tokens_seen": 5114064,
      "step": 1020
    },
    {
      "epoch": 1.5484848484848484,
      "grad_norm": 0.6213299036026001,
      "learning_rate": 5.957431161377809e-07,
      "loss": 2.4085,
      "num_input_tokens_seen": 5125872,
      "step": 1022
    },
    {
      "epoch": 1.5515151515151515,
      "grad_norm": 0.7610370516777039,
      "learning_rate": 5.932354403016777e-07,
      "loss": 2.263,
      "num_input_tokens_seen": 5135208,
      "step": 1024
    },
    {
      "epoch": 1.5545454545454547,
      "grad_norm": 0.5635423064231873,
      "learning_rate": 5.907417228938442e-07,
      "loss": 2.352,
      "num_input_tokens_seen": 5146896,
      "step": 1026
    },
    {
      "epoch": 1.5575757575757576,
      "grad_norm": 0.5265647768974304,
      "learning_rate": 5.88262021281467e-07,
      "loss": 2.3172,
      "num_input_tokens_seen": 5159184,
      "step": 1028
    },
    {
      "epoch": 1.5606060606060606,
      "grad_norm": 0.8375009298324585,
      "learning_rate": 5.857963925093034e-07,
      "loss": 2.4402,
      "num_input_tokens_seen": 5167656,
      "step": 1030
    },
    {
      "epoch": 1.5636363636363635,
      "grad_norm": 0.5335946679115295,
      "learning_rate": 5.833448932983693e-07,
      "loss": 2.5926,
      "num_input_tokens_seen": 5179680,
      "step": 1032
    },
    {
      "epoch": 1.5666666666666667,
      "grad_norm": 0.8245714902877808,
      "learning_rate": 5.809075800446348e-07,
      "loss": 2.5999,
      "num_input_tokens_seen": 5190216,
      "step": 1034
    },
    {
      "epoch": 1.5696969696969698,
      "grad_norm": 0.5047762393951416,
      "learning_rate": 5.784845088177263e-07,
      "loss": 2.379,
      "num_input_tokens_seen": 5201592,
      "step": 1036
    },
    {
      "epoch": 1.5727272727272728,
      "grad_norm": 0.5322418212890625,
      "learning_rate": 5.760757353596371e-07,
      "loss": 2.3246,
      "num_input_tokens_seen": 5213040,
      "step": 1038
    },
    {
      "epoch": 1.5757575757575757,
      "grad_norm": 0.47743648290634155,
      "learning_rate": 5.736813150834447e-07,
      "loss": 2.4542,
      "num_input_tokens_seen": 5223360,
      "step": 1040
    },
    {
      "epoch": 1.5757575757575757,
      "eval_loss": 2.3277275562286377,
      "eval_runtime": 5.824,
      "eval_samples_per_second": 3.434,
      "eval_steps_per_second": 3.434,
      "num_input_tokens_seen": 5223360,
      "step": 1040
    },
    {
      "epoch": 1.5787878787878786,
      "grad_norm": 0.4745235741138458,
      "learning_rate": 5.713013030720356e-07,
      "loss": 2.3253,
      "num_input_tokens_seen": 5235480,
      "step": 1042
    },
    {
      "epoch": 1.5818181818181818,
      "grad_norm": 0.521117091178894,
      "learning_rate": 5.6893575407684e-07,
      "loss": 2.3232,
      "num_input_tokens_seen": 5246280,
      "step": 1044
    },
    {
      "epoch": 1.584848484848485,
      "grad_norm": 0.6688542366027832,
      "learning_rate": 5.665847225165695e-07,
      "loss": 2.323,
      "num_input_tokens_seen": 5257248,
      "step": 1046
    },
    {
      "epoch": 1.587878787878788,
      "grad_norm": 0.6905980706214905,
      "learning_rate": 5.642482624759672e-07,
      "loss": 2.6128,
      "num_input_tokens_seen": 5268264,
      "step": 1048
    },
    {
      "epoch": 1.5909090909090908,
      "grad_norm": 0.555060088634491,
      "learning_rate": 5.619264277045634e-07,
      "loss": 2.5484,
      "num_input_tokens_seen": 5280432,
      "step": 1050
    },
    {
      "epoch": 1.593939393939394,
      "grad_norm": 0.6293858289718628,
      "learning_rate": 5.596192716154385e-07,
      "loss": 2.5,
      "num_input_tokens_seen": 5290488,
      "step": 1052
    },
    {
      "epoch": 1.596969696969697,
      "grad_norm": 0.9078196883201599,
      "learning_rate": 5.573268472839937e-07,
      "loss": 2.4814,
      "num_input_tokens_seen": 5299536,
      "step": 1054
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.6941189765930176,
      "learning_rate": 5.550492074467317e-07,
      "loss": 2.4972,
      "num_input_tokens_seen": 5309544,
      "step": 1056
    },
    {
      "epoch": 1.603030303030303,
      "grad_norm": 0.6833639740943909,
      "learning_rate": 5.527864045000421e-07,
      "loss": 2.5041,
      "num_input_tokens_seen": 5319024,
      "step": 1058
    },
    {
      "epoch": 1.606060606060606,
      "grad_norm": 0.6468996405601501,
      "learning_rate": 5.505384904989965e-07,
      "loss": 2.3262,
      "num_input_tokens_seen": 5329752,
      "step": 1060
    },
    {
      "epoch": 1.606060606060606,
      "eval_loss": 2.327099323272705,
      "eval_runtime": 5.8238,
      "eval_samples_per_second": 3.434,
      "eval_steps_per_second": 3.434,
      "num_input_tokens_seen": 5329752,
      "step": 1060
    },
    {
      "epoch": 1.6090909090909091,
      "grad_norm": 0.7046562433242798,
      "learning_rate": 5.483055171561511e-07,
      "loss": 2.2181,
      "num_input_tokens_seen": 5340552,
      "step": 1062
    },
    {
      "epoch": 1.612121212121212,
      "grad_norm": 0.48583197593688965,
      "learning_rate": 5.460875358403565e-07,
      "loss": 2.3349,
      "num_input_tokens_seen": 5350320,
      "step": 1064
    },
    {
      "epoch": 1.6151515151515152,
      "grad_norm": 0.6768611073493958,
      "learning_rate": 5.438845975755772e-07,
      "loss": 2.4784,
      "num_input_tokens_seen": 5356608,
      "step": 1066
    },
    {
      "epoch": 1.6181818181818182,
      "grad_norm": 0.6648526191711426,
      "learning_rate": 5.416967530397164e-07,
      "loss": 2.2265,
      "num_input_tokens_seen": 5366568,
      "step": 1068
    },
    {
      "epoch": 1.621212121212121,
      "grad_norm": 0.5271417498588562,
      "learning_rate": 5.395240525634511e-07,
      "loss": 2.4877,
      "num_input_tokens_seen": 5378856,
      "step": 1070
    },
    {
      "epoch": 1.6242424242424243,
      "grad_norm": 0.5848326086997986,
      "learning_rate": 5.37366546129074e-07,
      "loss": 2.3169,
      "num_input_tokens_seen": 5391120,
      "step": 1072
    },
    {
      "epoch": 1.6272727272727274,
      "grad_norm": 0.5480791330337524,
      "learning_rate": 5.35224283369343e-07,
      "loss": 2.4456,
      "num_input_tokens_seen": 5398752,
      "step": 1074
    },
    {
      "epoch": 1.6303030303030304,
      "grad_norm": 0.47689610719680786,
      "learning_rate": 5.330973135663411e-07,
      "loss": 2.5053,
      "num_input_tokens_seen": 5411040,
      "step": 1076
    },
    {
      "epoch": 1.6333333333333333,
      "grad_norm": 0.5623081922531128,
      "learning_rate": 5.309856856503409e-07,
      "loss": 2.4062,
      "num_input_tokens_seen": 5422848,
      "step": 1078
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 0.539359450340271,
      "learning_rate": 5.2888944819868e-07,
      "loss": 2.2278,
      "num_input_tokens_seen": 5435136,
      "step": 1080
    },
    {
      "epoch": 1.6363636363636362,
      "eval_loss": 2.32759428024292,
      "eval_runtime": 5.8229,
      "eval_samples_per_second": 3.435,
      "eval_steps_per_second": 3.435,
      "num_input_tokens_seen": 5435136,
      "step": 1080
    },
    {
      "epoch": 1.6393939393939394,
      "grad_norm": 0.5953027009963989,
      "learning_rate": 5.26808649434643e-07,
      "loss": 2.3976,
      "num_input_tokens_seen": 5445672,
      "step": 1082
    },
    {
      "epoch": 1.6424242424242426,
      "grad_norm": 0.5432310700416565,
      "learning_rate": 5.247433372263522e-07,
      "loss": 2.4648,
      "num_input_tokens_seen": 5456640,
      "step": 1084
    },
    {
      "epoch": 1.6454545454545455,
      "grad_norm": 0.5668439865112305,
      "learning_rate": 5.226935590856675e-07,
      "loss": 2.2962,
      "num_input_tokens_seen": 5465976,
      "step": 1086
    },
    {
      "epoch": 1.6484848484848484,
      "grad_norm": 0.5815810561180115,
      "learning_rate": 5.20659362167091e-07,
      "loss": 2.3107,
      "num_input_tokens_seen": 5477016,
      "step": 1088
    },
    {
      "epoch": 1.6515151515151514,
      "grad_norm": 0.5914052724838257,
      "learning_rate": 5.186407932666846e-07,
      "loss": 2.2394,
      "num_input_tokens_seen": 5487504,
      "step": 1090
    },
    {
      "epoch": 1.6545454545454545,
      "grad_norm": 0.8601570129394531,
      "learning_rate": 5.166378988209924e-07,
      "loss": 2.6481,
      "num_input_tokens_seen": 5496600,
      "step": 1092
    },
    {
      "epoch": 1.6575757575757577,
      "grad_norm": 0.6369432210922241,
      "learning_rate": 5.146507249059727e-07,
      "loss": 2.5754,
      "num_input_tokens_seen": 5506416,
      "step": 1094
    },
    {
      "epoch": 1.6606060606060606,
      "grad_norm": 0.712243914604187,
      "learning_rate": 5.126793172359373e-07,
      "loss": 2.3295,
      "num_input_tokens_seen": 5514600,
      "step": 1096
    },
    {
      "epoch": 1.6636363636363636,
      "grad_norm": 0.6746931672096252,
      "learning_rate": 5.107237211625016e-07,
      "loss": 2.3752,
      "num_input_tokens_seen": 5522616,
      "step": 1098
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.5909104943275452,
      "learning_rate": 5.087839816735391e-07,
      "loss": 2.2484,
      "num_input_tokens_seen": 5533488,
      "step": 1100
    },
    {
      "epoch": 1.6666666666666665,
      "eval_loss": 2.326948642730713,
      "eval_runtime": 5.8207,
      "eval_samples_per_second": 3.436,
      "eval_steps_per_second": 3.436,
      "num_input_tokens_seen": 5533488,
      "step": 1100
    },
    {
      "epoch": 1.6696969696969697,
      "grad_norm": 0.578524112701416,
      "learning_rate": 5.068601433921479e-07,
      "loss": 2.392,
      "num_input_tokens_seen": 5544864,
      "step": 1102
    },
    {
      "epoch": 1.6727272727272728,
      "grad_norm": 0.6614283323287964,
      "learning_rate": 5.04952250575624e-07,
      "loss": 2.4998,
      "num_input_tokens_seen": 5555928,
      "step": 1104
    },
    {
      "epoch": 1.6757575757575758,
      "grad_norm": 0.5955278277397156,
      "learning_rate": 5.030603471144432e-07,
      "loss": 2.3944,
      "num_input_tokens_seen": 5567088,
      "step": 1106
    },
    {
      "epoch": 1.6787878787878787,
      "grad_norm": 0.5927826166152954,
      "learning_rate": 5.011844765312504e-07,
      "loss": 2.487,
      "num_input_tokens_seen": 5578128,
      "step": 1108
    },
    {
      "epoch": 1.6818181818181817,
      "grad_norm": 0.6427227258682251,
      "learning_rate": 4.9932468197986e-07,
      "loss": 2.5279,
      "num_input_tokens_seen": 5588952,
      "step": 1110
    },
    {
      "epoch": 1.6848484848484848,
      "grad_norm": 0.49643516540527344,
      "learning_rate": 4.974810062442615e-07,
      "loss": 2.4558,
      "num_input_tokens_seen": 5599992,
      "step": 1112
    },
    {
      "epoch": 1.687878787878788,
      "grad_norm": 0.5617672204971313,
      "learning_rate": 4.956534917376373e-07,
      "loss": 2.3407,
      "num_input_tokens_seen": 5611752,
      "step": 1114
    },
    {
      "epoch": 1.690909090909091,
      "grad_norm": 0.7746953368186951,
      "learning_rate": 4.938421805013844e-07,
      "loss": 2.4067,
      "num_input_tokens_seen": 5619072,
      "step": 1116
    },
    {
      "epoch": 1.6939393939393939,
      "grad_norm": 0.6146767139434814,
      "learning_rate": 4.920471142041496e-07,
      "loss": 2.2224,
      "num_input_tokens_seen": 5629824,
      "step": 1118
    },
    {
      "epoch": 1.696969696969697,
      "grad_norm": 0.7500237822532654,
      "learning_rate": 4.902683341408698e-07,
      "loss": 2.4764,
      "num_input_tokens_seen": 5639376,
      "step": 1120
    },
    {
      "epoch": 1.696969696969697,
      "eval_loss": 2.327069044113159,
      "eval_runtime": 5.8204,
      "eval_samples_per_second": 3.436,
      "eval_steps_per_second": 3.436,
      "num_input_tokens_seen": 5639376,
      "step": 1120
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.512405276298523,
      "learning_rate": 4.88505881231822e-07,
      "loss": 2.4383,
      "num_input_tokens_seen": 5649624,
      "step": 1122
    },
    {
      "epoch": 1.7030303030303031,
      "grad_norm": 0.6521934866905212,
      "learning_rate": 4.867597960216823e-07,
      "loss": 2.3752,
      "num_input_tokens_seen": 5659800,
      "step": 1124
    },
    {
      "epoch": 1.706060606060606,
      "grad_norm": 0.5437342524528503,
      "learning_rate": 4.85030118678593e-07,
      "loss": 2.2427,
      "num_input_tokens_seen": 5668296,
      "step": 1126
    },
    {
      "epoch": 1.709090909090909,
      "grad_norm": 0.5007622838020325,
      "learning_rate": 4.833168889932384e-07,
      "loss": 2.3027,
      "num_input_tokens_seen": 5678832,
      "step": 1128
    },
    {
      "epoch": 1.7121212121212122,
      "grad_norm": 0.9229590892791748,
      "learning_rate": 4.816201463779299e-07,
      "loss": 2.4966,
      "num_input_tokens_seen": 5686872,
      "step": 1130
    },
    {
      "epoch": 1.7151515151515153,
      "grad_norm": 0.7598445415496826,
      "learning_rate": 4.799399298656985e-07,
      "loss": 2.5635,
      "num_input_tokens_seen": 5697216,
      "step": 1132
    },
    {
      "epoch": 1.7181818181818183,
      "grad_norm": 0.5250843167304993,
      "learning_rate": 4.782762781093983e-07,
      "loss": 2.3295,
      "num_input_tokens_seen": 5706840,
      "step": 1134
    },
    {
      "epoch": 1.7212121212121212,
      "grad_norm": 0.7306003570556641,
      "learning_rate": 4.7662922938081575e-07,
      "loss": 2.3937,
      "num_input_tokens_seen": 5715816,
      "step": 1136
    },
    {
      "epoch": 1.7242424242424241,
      "grad_norm": 0.7364092469215393,
      "learning_rate": 4.7499882156978934e-07,
      "loss": 2.3815,
      "num_input_tokens_seen": 5724456,
      "step": 1138
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 0.6539096236228943,
      "learning_rate": 4.7338509218333966e-07,
      "loss": 2.3489,
      "num_input_tokens_seen": 5732496,
      "step": 1140
    },
    {
      "epoch": 1.7272727272727273,
      "eval_loss": 2.326911687850952,
      "eval_runtime": 5.8239,
      "eval_samples_per_second": 3.434,
      "eval_steps_per_second": 3.434,
      "num_input_tokens_seen": 5732496,
      "step": 1140
    },
    {
      "epoch": 1.7303030303030305,
      "grad_norm": 0.6865965127944946,
      "learning_rate": 4.717880783448046e-07,
      "loss": 2.2154,
      "num_input_tokens_seen": 5744784,
      "step": 1142
    },
    {
      "epoch": 1.7333333333333334,
      "grad_norm": 0.6450785994529724,
      "learning_rate": 4.7020781679298636e-07,
      "loss": 2.5799,
      "num_input_tokens_seen": 5752872,
      "step": 1144
    },
    {
      "epoch": 1.7363636363636363,
      "grad_norm": 0.6152123808860779,
      "learning_rate": 4.6864434388130604e-07,
      "loss": 2.4051,
      "num_input_tokens_seen": 5762880,
      "step": 1146
    },
    {
      "epoch": 1.7393939393939393,
      "grad_norm": 0.5718716382980347,
      "learning_rate": 4.6709769557696724e-07,
      "loss": 2.2532,
      "num_input_tokens_seen": 5773632,
      "step": 1148
    },
    {
      "epoch": 1.7424242424242424,
      "grad_norm": 0.6017091274261475,
      "learning_rate": 4.6556790746012866e-07,
      "loss": 2.2363,
      "num_input_tokens_seen": 5784960,
      "step": 1150
    },
    {
      "epoch": 1.7454545454545456,
      "grad_norm": 0.5728634595870972,
      "learning_rate": 4.6405501472308593e-07,
      "loss": 2.264,
      "num_input_tokens_seen": 5794392,
      "step": 1152
    },
    {
      "epoch": 1.7484848484848485,
      "grad_norm": 0.7092226147651672,
      "learning_rate": 4.6255905216946174e-07,
      "loss": 2.6636,
      "num_input_tokens_seen": 5801088,
      "step": 1154
    },
    {
      "epoch": 1.7515151515151515,
      "grad_norm": 0.6607272028923035,
      "learning_rate": 4.6108005421340517e-07,
      "loss": 2.3849,
      "num_input_tokens_seen": 5810232,
      "step": 1156
    },
    {
      "epoch": 1.7545454545454544,
      "grad_norm": 0.6151024699211121,
      "learning_rate": 4.5961805487879993e-07,
      "loss": 2.1526,
      "num_input_tokens_seen": 5819976,
      "step": 1158
    },
    {
      "epoch": 1.7575757575757576,
      "grad_norm": 0.5664975047111511,
      "learning_rate": 4.581730877984817e-07,
      "loss": 2.3448,
      "num_input_tokens_seen": 5831304,
      "step": 1160
    },
    {
      "epoch": 1.7575757575757576,
      "eval_loss": 2.326674699783325,
      "eval_runtime": 5.817,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 5831304,
      "step": 1160
    },
    {
      "epoch": 1.7606060606060607,
      "grad_norm": 0.6864150166511536,
      "learning_rate": 4.567451862134651e-07,
      "loss": 2.2982,
      "num_input_tokens_seen": 5841792,
      "step": 1162
    },
    {
      "epoch": 1.7636363636363637,
      "grad_norm": 0.5514176487922668,
      "learning_rate": 4.553343829721776e-07,
      "loss": 2.296,
      "num_input_tokens_seen": 5852640,
      "step": 1164
    },
    {
      "epoch": 1.7666666666666666,
      "grad_norm": 0.5415042638778687,
      "learning_rate": 4.539407105297053e-07,
      "loss": 2.3767,
      "num_input_tokens_seen": 5864328,
      "step": 1166
    },
    {
      "epoch": 1.7696969696969695,
      "grad_norm": 0.7088015675544739,
      "learning_rate": 4.5256420094704516e-07,
      "loss": 2.1989,
      "num_input_tokens_seen": 5873424,
      "step": 1168
    },
    {
      "epoch": 1.7727272727272727,
      "grad_norm": 0.5956241488456726,
      "learning_rate": 4.5120488589036816e-07,
      "loss": 2.2727,
      "num_input_tokens_seen": 5881608,
      "step": 1170
    },
    {
      "epoch": 1.7757575757575759,
      "grad_norm": 0.6199578046798706,
      "learning_rate": 4.498627966302905e-07,
      "loss": 2.3122,
      "num_input_tokens_seen": 5892984,
      "step": 1172
    },
    {
      "epoch": 1.7787878787878788,
      "grad_norm": 0.6161043643951416,
      "learning_rate": 4.485379640411545e-07,
      "loss": 2.607,
      "num_input_tokens_seen": 5903832,
      "step": 1174
    },
    {
      "epoch": 1.7818181818181817,
      "grad_norm": 0.7086969017982483,
      "learning_rate": 4.4723041860031803e-07,
      "loss": 2.444,
      "num_input_tokens_seen": 5914224,
      "step": 1176
    },
    {
      "epoch": 1.7848484848484847,
      "grad_norm": 0.5110089182853699,
      "learning_rate": 4.459401903874538e-07,
      "loss": 2.462,
      "num_input_tokens_seen": 5925768,
      "step": 1178
    },
    {
      "epoch": 1.7878787878787878,
      "grad_norm": 0.6780450344085693,
      "learning_rate": 4.4466730908385664e-07,
      "loss": 2.4997,
      "num_input_tokens_seen": 5934528,
      "step": 1180
    },
    {
      "epoch": 1.7878787878787878,
      "eval_loss": 2.3263440132141113,
      "eval_runtime": 5.8197,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 5934528,
      "step": 1180
    },
    {
      "epoch": 1.790909090909091,
      "grad_norm": 0.6635234355926514,
      "learning_rate": 4.434118039717616e-07,
      "loss": 2.2541,
      "num_input_tokens_seen": 5944224,
      "step": 1182
    },
    {
      "epoch": 1.793939393939394,
      "grad_norm": 0.6881716251373291,
      "learning_rate": 4.4217370393366995e-07,
      "loss": 2.2483,
      "num_input_tokens_seen": 5954688,
      "step": 1184
    },
    {
      "epoch": 1.7969696969696969,
      "grad_norm": 1.0131621360778809,
      "learning_rate": 4.40953037451684e-07,
      "loss": 2.403,
      "num_input_tokens_seen": 5964072,
      "step": 1186
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.5744723677635193,
      "learning_rate": 4.3974983260685345e-07,
      "loss": 2.5772,
      "num_input_tokens_seen": 5975184,
      "step": 1188
    },
    {
      "epoch": 1.803030303030303,
      "grad_norm": 0.6319069266319275,
      "learning_rate": 4.3856411707852814e-07,
      "loss": 2.3809,
      "num_input_tokens_seen": 5981496,
      "step": 1190
    },
    {
      "epoch": 1.8060606060606061,
      "grad_norm": 0.49835190176963806,
      "learning_rate": 4.373959181437216e-07,
      "loss": 2.3452,
      "num_input_tokens_seen": 5993088,
      "step": 1192
    },
    {
      "epoch": 1.809090909090909,
      "grad_norm": 0.825423538684845,
      "learning_rate": 4.3624526267648363e-07,
      "loss": 2.2971,
      "num_input_tokens_seen": 6003864,
      "step": 1194
    },
    {
      "epoch": 1.812121212121212,
      "grad_norm": 0.5639837384223938,
      "learning_rate": 4.351121771472823e-07,
      "loss": 2.1717,
      "num_input_tokens_seen": 6013824,
      "step": 1196
    },
    {
      "epoch": 1.8151515151515152,
      "grad_norm": 0.6175968050956726,
      "learning_rate": 4.3399668762239446e-07,
      "loss": 2.3326,
      "num_input_tokens_seen": 6024120,
      "step": 1198
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.5506427884101868,
      "learning_rate": 4.328988197633066e-07,
      "loss": 2.311,
      "num_input_tokens_seen": 6035544,
      "step": 1200
    },
    {
      "epoch": 1.8181818181818183,
      "eval_loss": 2.326775550842285,
      "eval_runtime": 5.8185,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 6035544,
      "step": 1200
    },
    {
      "epoch": 1.8212121212121213,
      "grad_norm": 0.6879149675369263,
      "learning_rate": 4.3181859882612426e-07,
      "loss": 2.4867,
      "num_input_tokens_seen": 6047520,
      "step": 1202
    },
    {
      "epoch": 1.8242424242424242,
      "grad_norm": 0.9205613136291504,
      "learning_rate": 4.307560496609911e-07,
      "loss": 2.5415,
      "num_input_tokens_seen": 6055488,
      "step": 1204
    },
    {
      "epoch": 1.8272727272727272,
      "grad_norm": 0.7125353813171387,
      "learning_rate": 4.297111967115171e-07,
      "loss": 2.3684,
      "num_input_tokens_seen": 6063720,
      "step": 1206
    },
    {
      "epoch": 1.8303030303030303,
      "grad_norm": 0.7578244805335999,
      "learning_rate": 4.286840640142166e-07,
      "loss": 2.1882,
      "num_input_tokens_seen": 6071664,
      "step": 1208
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 0.5936377644538879,
      "learning_rate": 4.2767467519795497e-07,
      "loss": 2.4383,
      "num_input_tokens_seen": 6081360,
      "step": 1210
    },
    {
      "epoch": 1.8363636363636364,
      "grad_norm": 0.5857051610946655,
      "learning_rate": 4.2668305348340495e-07,
      "loss": 2.2313,
      "num_input_tokens_seen": 6090624,
      "step": 1212
    },
    {
      "epoch": 1.8393939393939394,
      "grad_norm": 0.5357300639152527,
      "learning_rate": 4.2570922168251294e-07,
      "loss": 2.3837,
      "num_input_tokens_seen": 6100944,
      "step": 1214
    },
    {
      "epoch": 1.8424242424242423,
      "grad_norm": 0.8577349185943604,
      "learning_rate": 4.2475320219797406e-07,
      "loss": 2.3874,
      "num_input_tokens_seen": 6108792,
      "step": 1216
    },
    {
      "epoch": 1.8454545454545455,
      "grad_norm": 0.5311655402183533,
      "learning_rate": 4.2381501702271623e-07,
      "loss": 2.3853,
      "num_input_tokens_seen": 6121080,
      "step": 1218
    },
    {
      "epoch": 1.8484848484848486,
      "grad_norm": 0.5314241051673889,
      "learning_rate": 4.228946877393953e-07,
      "loss": 2.3858,
      "num_input_tokens_seen": 6131112,
      "step": 1220
    },
    {
      "epoch": 1.8484848484848486,
      "eval_loss": 2.3265769481658936,
      "eval_runtime": 5.8173,
      "eval_samples_per_second": 3.438,
      "eval_steps_per_second": 3.438,
      "num_input_tokens_seen": 6131112,
      "step": 1220
    },
    {
      "epoch": 1.8515151515151516,
      "grad_norm": 0.6820886731147766,
      "learning_rate": 4.219922355198972e-07,
      "loss": 2.3291,
      "num_input_tokens_seen": 6141072,
      "step": 1222
    },
    {
      "epoch": 1.8545454545454545,
      "grad_norm": 0.6875143051147461,
      "learning_rate": 4.211076811248524e-07,
      "loss": 2.344,
      "num_input_tokens_seen": 6152040,
      "step": 1224
    },
    {
      "epoch": 1.8575757575757574,
      "grad_norm": 0.6124435067176819,
      "learning_rate": 4.2024104490315696e-07,
      "loss": 2.275,
      "num_input_tokens_seen": 6163368,
      "step": 1226
    },
    {
      "epoch": 1.8606060606060606,
      "grad_norm": 0.6159326434135437,
      "learning_rate": 4.1939234679150516e-07,
      "loss": 2.4138,
      "num_input_tokens_seen": 6171072,
      "step": 1228
    },
    {
      "epoch": 1.8636363636363638,
      "grad_norm": 0.4833909273147583,
      "learning_rate": 4.185616063139308e-07,
      "loss": 2.2974,
      "num_input_tokens_seen": 6183312,
      "step": 1230
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 0.8235618472099304,
      "learning_rate": 4.177488425813578e-07,
      "loss": 2.4087,
      "num_input_tokens_seen": 6193104,
      "step": 1232
    },
    {
      "epoch": 1.8696969696969696,
      "grad_norm": 0.5075482726097107,
      "learning_rate": 4.1695407429116063e-07,
      "loss": 2.4328,
      "num_input_tokens_seen": 6205392,
      "step": 1234
    },
    {
      "epoch": 1.8727272727272726,
      "grad_norm": 0.5093833208084106,
      "learning_rate": 4.1617731972673466e-07,
      "loss": 2.4412,
      "num_input_tokens_seen": 6215808,
      "step": 1236
    },
    {
      "epoch": 1.8757575757575757,
      "grad_norm": 0.5927122235298157,
      "learning_rate": 4.1541859675707454e-07,
      "loss": 2.2544,
      "num_input_tokens_seen": 6226224,
      "step": 1238
    },
    {
      "epoch": 1.878787878787879,
      "grad_norm": 0.6176667809486389,
      "learning_rate": 4.146779228363644e-07,
      "loss": 2.3602,
      "num_input_tokens_seen": 6235464,
      "step": 1240
    },
    {
      "epoch": 1.878787878787879,
      "eval_loss": 2.3264036178588867,
      "eval_runtime": 5.821,
      "eval_samples_per_second": 3.436,
      "eval_steps_per_second": 3.436,
      "num_input_tokens_seen": 6235464,
      "step": 1240
    },
    {
      "epoch": 1.8818181818181818,
      "grad_norm": 0.5281220078468323,
      "learning_rate": 4.139553150035751e-07,
      "loss": 2.439,
      "num_input_tokens_seen": 6245400,
      "step": 1242
    },
    {
      "epoch": 1.8848484848484848,
      "grad_norm": 0.6205955147743225,
      "learning_rate": 4.1325078988207303e-07,
      "loss": 2.466,
      "num_input_tokens_seen": 6252768,
      "step": 1244
    },
    {
      "epoch": 1.887878787878788,
      "grad_norm": 0.5631701350212097,
      "learning_rate": 4.1256436367923777e-07,
      "loss": 2.5193,
      "num_input_tokens_seen": 6264432,
      "step": 1246
    },
    {
      "epoch": 1.8909090909090909,
      "grad_norm": 0.6673572659492493,
      "learning_rate": 4.118960521860884e-07,
      "loss": 2.4064,
      "num_input_tokens_seen": 6273264,
      "step": 1248
    },
    {
      "epoch": 1.893939393939394,
      "grad_norm": 0.6367799639701843,
      "learning_rate": 4.1124587077692115e-07,
      "loss": 2.2931,
      "num_input_tokens_seen": 6284112,
      "step": 1250
    },
    {
      "epoch": 1.896969696969697,
      "grad_norm": 1.2654261589050293,
      "learning_rate": 4.106138344089554e-07,
      "loss": 2.4058,
      "num_input_tokens_seen": 6292248,
      "step": 1252
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.4898473024368286,
      "learning_rate": 4.0999995762198936e-07,
      "loss": 2.4485,
      "num_input_tokens_seen": 6302352,
      "step": 1254
    },
    {
      "epoch": 1.903030303030303,
      "grad_norm": 0.5527143478393555,
      "learning_rate": 4.094042545380659e-07,
      "loss": 2.1889,
      "num_input_tokens_seen": 6311712,
      "step": 1256
    },
    {
      "epoch": 1.906060606060606,
      "grad_norm": 0.6194308996200562,
      "learning_rate": 4.088267388611474e-07,
      "loss": 2.3617,
      "num_input_tokens_seen": 6323304,
      "step": 1258
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 0.5801293849945068,
      "learning_rate": 4.082674238768009e-07,
      "loss": 2.2347,
      "num_input_tokens_seen": 6335304,
      "step": 1260
    },
    {
      "epoch": 1.9090909090909092,
      "eval_loss": 2.326760768890381,
      "eval_runtime": 5.8194,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 6335304,
      "step": 1260
    },
    {
      "epoch": 1.912121212121212,
      "grad_norm": 0.638659656047821,
      "learning_rate": 4.0772632245189193e-07,
      "loss": 2.2904,
      "num_input_tokens_seen": 6345624,
      "step": 1262
    },
    {
      "epoch": 1.915151515151515,
      "grad_norm": 0.5953812003135681,
      "learning_rate": 4.0720344703428906e-07,
      "loss": 2.3719,
      "num_input_tokens_seen": 6355632,
      "step": 1264
    },
    {
      "epoch": 1.9181818181818182,
      "grad_norm": 0.5857142806053162,
      "learning_rate": 4.066988096525772e-07,
      "loss": 2.3489,
      "num_input_tokens_seen": 6363840,
      "step": 1266
    },
    {
      "epoch": 1.9212121212121214,
      "grad_norm": 0.5746711492538452,
      "learning_rate": 4.062124219157808e-07,
      "loss": 2.3433,
      "num_input_tokens_seen": 6375000,
      "step": 1268
    },
    {
      "epoch": 1.9242424242424243,
      "grad_norm": 0.6761659383773804,
      "learning_rate": 4.057442950130972e-07,
      "loss": 2.4374,
      "num_input_tokens_seen": 6385632,
      "step": 1270
    },
    {
      "epoch": 1.9272727272727272,
      "grad_norm": 0.4828651249408722,
      "learning_rate": 4.05294439713639e-07,
      "loss": 2.3613,
      "num_input_tokens_seen": 6397728,
      "step": 1272
    },
    {
      "epoch": 1.9303030303030302,
      "grad_norm": 0.6450832486152649,
      "learning_rate": 4.048628663661859e-07,
      "loss": 2.1642,
      "num_input_tokens_seen": 6409512,
      "step": 1274
    },
    {
      "epoch": 1.9333333333333333,
      "grad_norm": 0.6221877336502075,
      "learning_rate": 4.044495848989475e-07,
      "loss": 2.4558,
      "num_input_tokens_seen": 6419664,
      "step": 1276
    },
    {
      "epoch": 1.9363636363636365,
      "grad_norm": 0.825742781162262,
      "learning_rate": 4.040546048193343e-07,
      "loss": 2.5869,
      "num_input_tokens_seen": 6428712,
      "step": 1278
    },
    {
      "epoch": 1.9393939393939394,
      "grad_norm": 0.69305020570755,
      "learning_rate": 4.0367793521373886e-07,
      "loss": 2.577,
      "num_input_tokens_seen": 6435960,
      "step": 1280
    },
    {
      "epoch": 1.9393939393939394,
      "eval_loss": 2.3265655040740967,
      "eval_runtime": 5.8193,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 6435960,
      "step": 1280
    },
    {
      "epoch": 1.9424242424242424,
      "grad_norm": 0.51558518409729,
      "learning_rate": 4.0331958474732744e-07,
      "loss": 2.4398,
      "num_input_tokens_seen": 6446952,
      "step": 1282
    },
    {
      "epoch": 1.9454545454545453,
      "grad_norm": 0.7710928916931152,
      "learning_rate": 4.0297956166384e-07,
      "loss": 2.3546,
      "num_input_tokens_seen": 6454440,
      "step": 1284
    },
    {
      "epoch": 1.9484848484848485,
      "grad_norm": 0.6520776748657227,
      "learning_rate": 4.0265787378540076e-07,
      "loss": 2.2851,
      "num_input_tokens_seen": 6465888,
      "step": 1286
    },
    {
      "epoch": 1.9515151515151516,
      "grad_norm": 0.7156710624694824,
      "learning_rate": 4.023545285123386e-07,
      "loss": 2.501,
      "num_input_tokens_seen": 6474384,
      "step": 1288
    },
    {
      "epoch": 1.9545454545454546,
      "grad_norm": 0.5886339545249939,
      "learning_rate": 4.020695328230162e-07,
      "loss": 2.3128,
      "num_input_tokens_seen": 6485712,
      "step": 1290
    },
    {
      "epoch": 1.9575757575757575,
      "grad_norm": 0.5593713521957397,
      "learning_rate": 4.018028932736699e-07,
      "loss": 2.2989,
      "num_input_tokens_seen": 6497160,
      "step": 1292
    },
    {
      "epoch": 1.9606060606060605,
      "grad_norm": 0.5878450870513916,
      "learning_rate": 4.01554615998259e-07,
      "loss": 2.4504,
      "num_input_tokens_seen": 6508920,
      "step": 1294
    },
    {
      "epoch": 1.9636363636363636,
      "grad_norm": 0.5121827721595764,
      "learning_rate": 4.013247067083242e-07,
      "loss": 2.4132,
      "num_input_tokens_seen": 6520176,
      "step": 1296
    },
    {
      "epoch": 1.9666666666666668,
      "grad_norm": 0.49630334973335266,
      "learning_rate": 4.011131706928566e-07,
      "loss": 2.3645,
      "num_input_tokens_seen": 6531528,
      "step": 1298
    },
    {
      "epoch": 1.9696969696969697,
      "grad_norm": 0.543795645236969,
      "learning_rate": 4.0092001281817576e-07,
      "loss": 2.3001,
      "num_input_tokens_seen": 6543816,
      "step": 1300
    },
    {
      "epoch": 1.9696969696969697,
      "eval_loss": 2.3271186351776123,
      "eval_runtime": 5.816,
      "eval_samples_per_second": 3.439,
      "eval_steps_per_second": 3.439,
      "num_input_tokens_seen": 6543816,
      "step": 1300
    },
    {
      "epoch": 1.9727272727272727,
      "grad_norm": 0.8191571235656738,
      "learning_rate": 4.0074523752781806e-07,
      "loss": 2.8758,
      "num_input_tokens_seen": 6552936,
      "step": 1302
    },
    {
      "epoch": 1.9757575757575756,
      "grad_norm": 0.6543108820915222,
      "learning_rate": 4.0058884884243416e-07,
      "loss": 2.3766,
      "num_input_tokens_seen": 6562896,
      "step": 1304
    },
    {
      "epoch": 1.9787878787878788,
      "grad_norm": 0.5305016040802002,
      "learning_rate": 4.004508503596967e-07,
      "loss": 2.3732,
      "num_input_tokens_seen": 6575184,
      "step": 1306
    },
    {
      "epoch": 1.981818181818182,
      "grad_norm": 0.5914813280105591,
      "learning_rate": 4.0033124525421757e-07,
      "loss": 2.3789,
      "num_input_tokens_seen": 6586032,
      "step": 1308
    },
    {
      "epoch": 1.9848484848484849,
      "grad_norm": 0.712382435798645,
      "learning_rate": 4.0023003627747455e-07,
      "loss": 2.2654,
      "num_input_tokens_seen": 6594768,
      "step": 1310
    },
    {
      "epoch": 1.9878787878787878,
      "grad_norm": 0.5054189562797546,
      "learning_rate": 4.0014722575774835e-07,
      "loss": 2.4605,
      "num_input_tokens_seen": 6604728,
      "step": 1312
    },
    {
      "epoch": 1.990909090909091,
      "grad_norm": 0.5901520252227783,
      "learning_rate": 4.000828156000692e-07,
      "loss": 2.4816,
      "num_input_tokens_seen": 6616536,
      "step": 1314
    },
    {
      "epoch": 1.993939393939394,
      "grad_norm": 0.7864160537719727,
      "learning_rate": 4.000368072861723e-07,
      "loss": 2.482,
      "num_input_tokens_seen": 6624480,
      "step": 1316
    },
    {
      "epoch": 1.996969696969697,
      "grad_norm": 0.49510428309440613,
      "learning_rate": 4.0000920187446465e-07,
      "loss": 2.45,
      "num_input_tokens_seen": 6636768,
      "step": 1318
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6357753872871399,
      "learning_rate": 4e-07,
      "loss": 2.2129,
      "num_input_tokens_seen": 6646824,
      "step": 1320
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.326845645904541,
      "eval_runtime": 5.8186,
      "eval_samples_per_second": 3.437,
      "eval_steps_per_second": 3.437,
      "num_input_tokens_seen": 6646824,
      "step": 1320
    },
    {
      "epoch": 2.0,
      "num_input_tokens_seen": 6646824,
      "step": 1320,
      "total_flos": 3.059943926859694e+17,
      "train_loss": 2.3998946460810573,
      "train_runtime": 5038.8172,
      "train_samples_per_second": 0.786,
      "train_steps_per_second": 0.262
    }
  ],
  "logging_steps": 2,
  "max_steps": 1320,
  "num_input_tokens_seen": 6646824,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.059943926859694e+17,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}