{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 20, "global_step": 1320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030303030303030303, "grad_norm": 0.42330464720726013, "learning_rate": 4e-07, "loss": 2.4685, "num_input_tokens_seen": 10296, "step": 2 }, { "epoch": 0.006060606060606061, "grad_norm": 0.4667194187641144, "learning_rate": 8e-07, "loss": 2.4399, "num_input_tokens_seen": 20376, "step": 4 }, { "epoch": 0.00909090909090909, "grad_norm": 0.38802874088287354, "learning_rate": 1.2e-06, "loss": 2.3101, "num_input_tokens_seen": 32664, "step": 6 }, { "epoch": 0.012121212121212121, "grad_norm": 0.4379090368747711, "learning_rate": 1.6e-06, "loss": 2.2743, "num_input_tokens_seen": 41904, "step": 8 }, { "epoch": 0.015151515151515152, "grad_norm": 0.4267907738685608, "learning_rate": 2e-06, "loss": 2.355, "num_input_tokens_seen": 52776, "step": 10 }, { "epoch": 0.01818181818181818, "grad_norm": 0.5171758532524109, "learning_rate": 1.999990798125535e-06, "loss": 2.633, "num_input_tokens_seen": 61464, "step": 12 }, { "epoch": 0.021212121212121213, "grad_norm": 0.47265326976776123, "learning_rate": 1.9999631927138275e-06, "loss": 2.3386, "num_input_tokens_seen": 72624, "step": 14 }, { "epoch": 0.024242424242424242, "grad_norm": 0.5586420893669128, "learning_rate": 1.9999171843999306e-06, "loss": 2.3536, "num_input_tokens_seen": 81840, "step": 16 }, { "epoch": 0.02727272727272727, "grad_norm": 0.39176592230796814, "learning_rate": 1.9998527742422515e-06, "loss": 2.2979, "num_input_tokens_seen": 91968, "step": 18 }, { "epoch": 0.030303030303030304, "grad_norm": 0.4795871078968048, "learning_rate": 1.9997699637225253e-06, "loss": 2.3755, "num_input_tokens_seen": 102984, "step": 20 }, { "epoch": 0.030303030303030304, "eval_loss": 2.3641138076782227, "eval_runtime": 5.815, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 102984, "step": 20 }, { "epoch": 0.03333333333333333, "grad_norm": 0.4541929364204407, "learning_rate": 1.9996687547457825e-06, "loss": 2.286, "num_input_tokens_seen": 113352, "step": 22 }, { "epoch": 0.03636363636363636, "grad_norm": 0.4055442810058594, "learning_rate": 1.999549149640303e-06, "loss": 2.3933, "num_input_tokens_seen": 125184, "step": 24 }, { "epoch": 0.03939393939393939, "grad_norm": 0.5810754299163818, "learning_rate": 1.9994111511575657e-06, "loss": 2.1378, "num_input_tokens_seen": 135480, "step": 26 }, { "epoch": 0.04242424242424243, "grad_norm": 0.41868993639945984, "learning_rate": 1.999254762472182e-06, "loss": 2.2551, "num_input_tokens_seen": 147384, "step": 28 }, { "epoch": 0.045454545454545456, "grad_norm": 0.5975711941719055, "learning_rate": 1.999079987181824e-06, "loss": 2.506, "num_input_tokens_seen": 156912, "step": 30 }, { "epoch": 0.048484848484848485, "grad_norm": 0.422783762216568, "learning_rate": 1.9988868293071435e-06, "loss": 2.4742, "num_input_tokens_seen": 167568, "step": 32 }, { "epoch": 0.051515151515151514, "grad_norm": 0.32683178782463074, "learning_rate": 1.998675293291676e-06, "loss": 2.5007, "num_input_tokens_seen": 176616, "step": 34 }, { "epoch": 0.05454545454545454, "grad_norm": 0.4234691858291626, "learning_rate": 1.998445384001741e-06, "loss": 2.4632, "num_input_tokens_seen": 187272, "step": 36 }, { "epoch": 0.05757575757575758, "grad_norm": 0.4502381980419159, "learning_rate": 1.99819710672633e-06, "loss": 2.4556, "num_input_tokens_seen": 196992, "step": 38 }, { "epoch": 0.06060606060606061, "grad_norm": 0.5127580165863037, "learning_rate": 1.9979304671769838e-06, "loss": 2.5355, "num_input_tokens_seen": 208824, "step": 40 }, { "epoch": 0.06060606060606061, "eval_loss": 2.361894130706787, "eval_runtime": 5.8061, "eval_samples_per_second": 3.445, "eval_steps_per_second": 3.445, "num_input_tokens_seen": 208824, "step": 40 }, { "epoch": 0.06363636363636363, "grad_norm": 0.5844971537590027, "learning_rate": 1.997645471487661e-06, "loss": 2.497, "num_input_tokens_seen": 217272, "step": 42 }, { "epoch": 0.06666666666666667, "grad_norm": 0.41816312074661255, "learning_rate": 1.9973421262145992e-06, "loss": 2.4371, "num_input_tokens_seen": 229560, "step": 44 }, { "epoch": 0.0696969696969697, "grad_norm": 0.505349338054657, "learning_rate": 1.99702043833616e-06, "loss": 2.4757, "num_input_tokens_seen": 239568, "step": 46 }, { "epoch": 0.07272727272727272, "grad_norm": 0.4537525177001953, "learning_rate": 1.9966804152526726e-06, "loss": 2.4514, "num_input_tokens_seen": 251664, "step": 48 }, { "epoch": 0.07575757575757576, "grad_norm": 0.40902894735336304, "learning_rate": 1.996322064786261e-06, "loss": 2.3474, "num_input_tokens_seen": 263040, "step": 50 }, { "epoch": 0.07878787878787878, "grad_norm": 0.48902806639671326, "learning_rate": 1.9959453951806656e-06, "loss": 2.4297, "num_input_tokens_seen": 271080, "step": 52 }, { "epoch": 0.08181818181818182, "grad_norm": 0.4684095084667206, "learning_rate": 1.995550415101052e-06, "loss": 2.6676, "num_input_tokens_seen": 282000, "step": 54 }, { "epoch": 0.08484848484848485, "grad_norm": 0.33189377188682556, "learning_rate": 1.9951371336338145e-06, "loss": 2.1799, "num_input_tokens_seen": 290568, "step": 56 }, { "epoch": 0.08787878787878788, "grad_norm": 0.4579316973686218, "learning_rate": 1.994705560286361e-06, "loss": 2.5315, "num_input_tokens_seen": 298920, "step": 58 }, { "epoch": 0.09090909090909091, "grad_norm": 0.42468497157096863, "learning_rate": 1.994255704986903e-06, "loss": 2.4679, "num_input_tokens_seen": 309744, "step": 60 }, { "epoch": 0.09090909090909091, "eval_loss": 2.360027551651001, "eval_runtime": 5.8148, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 309744, "step": 60 }, { "epoch": 0.09393939393939393, "grad_norm": 0.5245186686515808, "learning_rate": 1.993787578084219e-06, "loss": 2.4576, "num_input_tokens_seen": 321360, "step": 62 }, { "epoch": 0.09696969696969697, "grad_norm": 0.38165679574012756, "learning_rate": 1.9933011903474228e-06, "loss": 2.275, "num_input_tokens_seen": 332736, "step": 64 }, { "epoch": 0.1, "grad_norm": 0.5568698644638062, "learning_rate": 1.992796552965711e-06, "loss": 2.2761, "num_input_tokens_seen": 344568, "step": 66 }, { "epoch": 0.10303030303030303, "grad_norm": 0.39623475074768066, "learning_rate": 1.9922736775481083e-06, "loss": 2.3385, "num_input_tokens_seen": 356616, "step": 68 }, { "epoch": 0.10606060606060606, "grad_norm": 0.532319188117981, "learning_rate": 1.991732576123199e-06, "loss": 2.3342, "num_input_tokens_seen": 367680, "step": 70 }, { "epoch": 0.10909090909090909, "grad_norm": 0.505707859992981, "learning_rate": 1.9911732611388524e-06, "loss": 2.3604, "num_input_tokens_seen": 377376, "step": 72 }, { "epoch": 0.11212121212121212, "grad_norm": 0.4921689033508301, "learning_rate": 1.9905957454619343e-06, "loss": 2.2869, "num_input_tokens_seen": 387432, "step": 74 }, { "epoch": 0.11515151515151516, "grad_norm": 0.47557827830314636, "learning_rate": 1.9900000423780104e-06, "loss": 2.601, "num_input_tokens_seen": 395808, "step": 76 }, { "epoch": 0.11818181818181818, "grad_norm": 1.9346156120300293, "learning_rate": 1.9893861655910444e-06, "loss": 2.3741, "num_input_tokens_seen": 407568, "step": 78 }, { "epoch": 0.12121212121212122, "grad_norm": 0.45454809069633484, "learning_rate": 1.988754129223079e-06, "loss": 2.3929, "num_input_tokens_seen": 417648, "step": 80 }, { "epoch": 0.12121212121212122, "eval_loss": 2.3575997352600098, "eval_runtime": 5.8145, "eval_samples_per_second": 3.44, "eval_steps_per_second": 3.44, "num_input_tokens_seen": 417648, "step": 80 }, { "epoch": 0.12424242424242424, "grad_norm": 0.533509373664856, "learning_rate": 1.9881039478139115e-06, "loss": 2.3717, "num_input_tokens_seen": 428568, "step": 82 }, { "epoch": 0.12727272727272726, "grad_norm": 0.3749203681945801, "learning_rate": 1.9874356363207624e-06, "loss": 2.2728, "num_input_tokens_seen": 437688, "step": 84 }, { "epoch": 0.1303030303030303, "grad_norm": 0.41353124380111694, "learning_rate": 1.986749210117927e-06, "loss": 2.5347, "num_input_tokens_seen": 447408, "step": 86 }, { "epoch": 0.13333333333333333, "grad_norm": 0.4702826142311096, "learning_rate": 1.986044684996425e-06, "loss": 2.4081, "num_input_tokens_seen": 456120, "step": 88 }, { "epoch": 0.13636363636363635, "grad_norm": 0.5201271772384644, "learning_rate": 1.985322077163636e-06, "loss": 2.5697, "num_input_tokens_seen": 467208, "step": 90 }, { "epoch": 0.1393939393939394, "grad_norm": 0.5325783491134644, "learning_rate": 1.9845814032429257e-06, "loss": 2.3267, "num_input_tokens_seen": 477168, "step": 92 }, { "epoch": 0.14242424242424243, "grad_norm": 0.49566376209259033, "learning_rate": 1.9838226802732656e-06, "loss": 2.5342, "num_input_tokens_seen": 486888, "step": 94 }, { "epoch": 0.14545454545454545, "grad_norm": 0.5317257046699524, "learning_rate": 1.9830459257088395e-06, "loss": 2.5662, "num_input_tokens_seen": 496584, "step": 96 }, { "epoch": 0.1484848484848485, "grad_norm": 0.6195109486579895, "learning_rate": 1.982251157418642e-06, "loss": 2.3294, "num_input_tokens_seen": 503736, "step": 98 }, { "epoch": 0.15151515151515152, "grad_norm": 0.4253556728363037, "learning_rate": 1.981438393686069e-06, "loss": 2.6105, "num_input_tokens_seen": 513600, "step": 100 }, { "epoch": 0.15151515151515152, "eval_loss": 2.3544414043426514, "eval_runtime": 5.8171, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 513600, "step": 100 }, { "epoch": 0.15454545454545454, "grad_norm": 0.5861473083496094, "learning_rate": 1.980607653208495e-06, "loss": 2.6435, "num_input_tokens_seen": 519960, "step": 102 }, { "epoch": 0.15757575757575756, "grad_norm": 0.44223421812057495, "learning_rate": 1.9797589550968434e-06, "loss": 2.4326, "num_input_tokens_seen": 529392, "step": 104 }, { "epoch": 0.1606060606060606, "grad_norm": 0.7290481328964233, "learning_rate": 1.9788923188751478e-06, "loss": 2.5169, "num_input_tokens_seen": 537000, "step": 106 }, { "epoch": 0.16363636363636364, "grad_norm": 0.43159109354019165, "learning_rate": 1.978007764480103e-06, "loss": 2.3097, "num_input_tokens_seen": 546864, "step": 108 }, { "epoch": 0.16666666666666666, "grad_norm": 0.46773430705070496, "learning_rate": 1.977105312260605e-06, "loss": 2.2565, "num_input_tokens_seen": 558432, "step": 110 }, { "epoch": 0.1696969696969697, "grad_norm": 0.46607473492622375, "learning_rate": 1.976184982977284e-06, "loss": 2.3503, "num_input_tokens_seen": 569016, "step": 112 }, { "epoch": 0.17272727272727273, "grad_norm": 0.5427464842796326, "learning_rate": 1.975246797802026e-06, "loss": 2.2801, "num_input_tokens_seen": 580392, "step": 114 }, { "epoch": 0.17575757575757575, "grad_norm": 0.4266676902770996, "learning_rate": 1.974290778317487e-06, "loss": 2.4019, "num_input_tokens_seen": 590568, "step": 116 }, { "epoch": 0.1787878787878788, "grad_norm": 0.4442364275455475, "learning_rate": 1.973316946516595e-06, "loss": 2.3779, "num_input_tokens_seen": 601704, "step": 118 }, { "epoch": 0.18181818181818182, "grad_norm": 0.4435305595397949, "learning_rate": 1.9723253248020455e-06, "loss": 2.2488, "num_input_tokens_seen": 613584, "step": 120 }, { "epoch": 0.18181818181818182, "eval_loss": 2.3512158393859863, "eval_runtime": 5.819, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 613584, "step": 120 }, { "epoch": 0.18484848484848485, "grad_norm": 0.5893362164497375, "learning_rate": 1.9713159359857833e-06, "loss": 2.4906, "num_input_tokens_seen": 624792, "step": 122 }, { "epoch": 0.18787878787878787, "grad_norm": 0.4149838089942932, "learning_rate": 1.9702888032884826e-06, "loss": 2.5957, "num_input_tokens_seen": 635832, "step": 124 }, { "epoch": 0.19090909090909092, "grad_norm": 0.42286068201065063, "learning_rate": 1.969243950339009e-06, "loss": 2.1759, "num_input_tokens_seen": 647664, "step": 126 }, { "epoch": 0.19393939393939394, "grad_norm": 0.5177129507064819, "learning_rate": 1.9681814011738758e-06, "loss": 2.5093, "num_input_tokens_seen": 656952, "step": 128 }, { "epoch": 0.19696969696969696, "grad_norm": 0.5667068958282471, "learning_rate": 1.9671011802366934e-06, "loss": 2.5727, "num_input_tokens_seen": 664104, "step": 130 }, { "epoch": 0.2, "grad_norm": 0.566889762878418, "learning_rate": 1.9660033123776056e-06, "loss": 2.3728, "num_input_tokens_seen": 674016, "step": 132 }, { "epoch": 0.20303030303030303, "grad_norm": 0.4465801417827606, "learning_rate": 1.964887822852718e-06, "loss": 2.4271, "num_input_tokens_seen": 684480, "step": 134 }, { "epoch": 0.20606060606060606, "grad_norm": 0.5765467286109924, "learning_rate": 1.963754737323516e-06, "loss": 2.5413, "num_input_tokens_seen": 694056, "step": 136 }, { "epoch": 0.20909090909090908, "grad_norm": 0.5330570936203003, "learning_rate": 1.9626040818562783e-06, "loss": 2.4513, "num_input_tokens_seen": 704640, "step": 138 }, { "epoch": 0.21212121212121213, "grad_norm": 0.6006715297698975, "learning_rate": 1.9614358829214722e-06, "loss": 2.3866, "num_input_tokens_seen": 713640, "step": 140 }, { "epoch": 0.21212121212121213, "eval_loss": 2.349419355392456, "eval_runtime": 5.8237, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 713640, "step": 140 }, { "epoch": 0.21515151515151515, "grad_norm": 0.4789717495441437, "learning_rate": 1.960250167393147e-06, "loss": 2.4217, "num_input_tokens_seen": 722880, "step": 142 }, { "epoch": 0.21818181818181817, "grad_norm": 0.558068037033081, "learning_rate": 1.959046962548316e-06, "loss": 2.5271, "num_input_tokens_seen": 733104, "step": 144 }, { "epoch": 0.22121212121212122, "grad_norm": 0.5164092183113098, "learning_rate": 1.9578262960663305e-06, "loss": 2.4228, "num_input_tokens_seen": 745392, "step": 146 }, { "epoch": 0.22424242424242424, "grad_norm": 0.49615126848220825, "learning_rate": 1.9565881960282384e-06, "loss": 2.1895, "num_input_tokens_seen": 755736, "step": 148 }, { "epoch": 0.22727272727272727, "grad_norm": 0.6630756258964539, "learning_rate": 1.9553326909161436e-06, "loss": 2.4702, "num_input_tokens_seen": 767040, "step": 150 }, { "epoch": 0.23030303030303031, "grad_norm": 0.5331915020942688, "learning_rate": 1.954059809612546e-06, "loss": 2.4535, "num_input_tokens_seen": 776496, "step": 152 }, { "epoch": 0.23333333333333334, "grad_norm": 0.44153809547424316, "learning_rate": 1.9527695813996817e-06, "loss": 2.3757, "num_input_tokens_seen": 785568, "step": 154 }, { "epoch": 0.23636363636363636, "grad_norm": 0.4671899378299713, "learning_rate": 1.9514620359588454e-06, "loss": 2.3609, "num_input_tokens_seen": 797496, "step": 156 }, { "epoch": 0.23939393939393938, "grad_norm": 0.49474212527275085, "learning_rate": 1.9501372033697097e-06, "loss": 2.4576, "num_input_tokens_seen": 808536, "step": 158 }, { "epoch": 0.24242424242424243, "grad_norm": 0.5353239178657532, "learning_rate": 1.948795114109632e-06, "loss": 2.2509, "num_input_tokens_seen": 818592, "step": 160 }, { "epoch": 0.24242424242424243, "eval_loss": 2.3466238975524902, "eval_runtime": 5.8178, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 818592, "step": 160 }, { "epoch": 0.24545454545454545, "grad_norm": 0.4847556948661804, "learning_rate": 1.947435799052955e-06, "loss": 2.4558, "num_input_tokens_seen": 828336, "step": 162 }, { "epoch": 0.24848484848484848, "grad_norm": 0.5099437236785889, "learning_rate": 1.9460592894702946e-06, "loss": 2.3038, "num_input_tokens_seen": 838080, "step": 164 }, { "epoch": 0.2515151515151515, "grad_norm": 0.47751423716545105, "learning_rate": 1.944665617027823e-06, "loss": 2.2954, "num_input_tokens_seen": 850128, "step": 166 }, { "epoch": 0.2545454545454545, "grad_norm": 0.4297049045562744, "learning_rate": 1.943254813786535e-06, "loss": 2.2327, "num_input_tokens_seen": 862416, "step": 168 }, { "epoch": 0.25757575757575757, "grad_norm": 0.5330982804298401, "learning_rate": 1.941826912201518e-06, "loss": 2.487, "num_input_tokens_seen": 873936, "step": 170 }, { "epoch": 0.2606060606060606, "grad_norm": 0.4737272560596466, "learning_rate": 1.9403819451212004e-06, "loss": 2.6736, "num_input_tokens_seen": 883584, "step": 172 }, { "epoch": 0.2636363636363636, "grad_norm": 0.6267192363739014, "learning_rate": 1.938919945786595e-06, "loss": 2.2313, "num_input_tokens_seen": 892632, "step": 174 }, { "epoch": 0.26666666666666666, "grad_norm": 0.42695581912994385, "learning_rate": 1.9374409478305385e-06, "loss": 2.4444, "num_input_tokens_seen": 904920, "step": 176 }, { "epoch": 0.2696969696969697, "grad_norm": 0.5554710030555725, "learning_rate": 1.935944985276914e-06, "loss": 2.5038, "num_input_tokens_seen": 913752, "step": 178 }, { "epoch": 0.2727272727272727, "grad_norm": 0.6374077796936035, "learning_rate": 1.9344320925398713e-06, "loss": 2.3807, "num_input_tokens_seen": 920952, "step": 180 }, { "epoch": 0.2727272727272727, "eval_loss": 2.3428144454956055, "eval_runtime": 5.8159, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 920952, "step": 180 }, { "epoch": 0.27575757575757576, "grad_norm": 0.41562148928642273, "learning_rate": 1.932902304423033e-06, "loss": 2.5033, "num_input_tokens_seen": 932280, "step": 182 }, { "epoch": 0.2787878787878788, "grad_norm": 0.47822168469429016, "learning_rate": 1.931355656118694e-06, "loss": 2.275, "num_input_tokens_seen": 944568, "step": 184 }, { "epoch": 0.2818181818181818, "grad_norm": 0.553165853023529, "learning_rate": 1.9297921832070134e-06, "loss": 2.567, "num_input_tokens_seen": 952032, "step": 186 }, { "epoch": 0.28484848484848485, "grad_norm": 0.5379563570022583, "learning_rate": 1.928211921655195e-06, "loss": 2.5257, "num_input_tokens_seen": 963840, "step": 188 }, { "epoch": 0.2878787878787879, "grad_norm": 0.5385987758636475, "learning_rate": 1.9266149078166603e-06, "loss": 2.3678, "num_input_tokens_seen": 975288, "step": 190 }, { "epoch": 0.2909090909090909, "grad_norm": 0.42638707160949707, "learning_rate": 1.9250011784302106e-06, "loss": 2.232, "num_input_tokens_seen": 987144, "step": 192 }, { "epoch": 0.29393939393939394, "grad_norm": 0.450655996799469, "learning_rate": 1.923370770619184e-06, "loss": 2.1844, "num_input_tokens_seen": 998664, "step": 194 }, { "epoch": 0.296969696969697, "grad_norm": 0.477781742811203, "learning_rate": 1.921723721890602e-06, "loss": 2.3571, "num_input_tokens_seen": 1008504, "step": 196 }, { "epoch": 0.3, "grad_norm": 0.7921934723854065, "learning_rate": 1.920060070134301e-06, "loss": 2.472, "num_input_tokens_seen": 1016664, "step": 198 }, { "epoch": 0.30303030303030304, "grad_norm": 0.5304360389709473, "learning_rate": 1.91837985362207e-06, "loss": 2.4112, "num_input_tokens_seen": 1026192, "step": 200 }, { "epoch": 0.30303030303030304, "eval_loss": 2.340877056121826, "eval_runtime": 5.8187, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 1026192, "step": 200 }, { "epoch": 0.30606060606060603, "grad_norm": 0.4748481810092926, "learning_rate": 1.9166831110067615e-06, "loss": 2.5731, "num_input_tokens_seen": 1037064, "step": 202 }, { "epoch": 0.3090909090909091, "grad_norm": 0.44187602400779724, "learning_rate": 1.914969881321407e-06, "loss": 2.5743, "num_input_tokens_seen": 1049352, "step": 204 }, { "epoch": 0.31212121212121213, "grad_norm": 0.6284915208816528, "learning_rate": 1.913240203978318e-06, "loss": 2.4531, "num_input_tokens_seen": 1057272, "step": 206 }, { "epoch": 0.3151515151515151, "grad_norm": 0.6538528800010681, "learning_rate": 1.9114941187681783e-06, "loss": 2.5391, "num_input_tokens_seen": 1065120, "step": 208 }, { "epoch": 0.3181818181818182, "grad_norm": 1.0042399168014526, "learning_rate": 1.9097316658591304e-06, "loss": 2.4156, "num_input_tokens_seen": 1074192, "step": 210 }, { "epoch": 0.3212121212121212, "grad_norm": 0.48325198888778687, "learning_rate": 1.9079528857958504e-06, "loss": 2.5733, "num_input_tokens_seen": 1084416, "step": 212 }, { "epoch": 0.3242424242424242, "grad_norm": 0.6697909832000732, "learning_rate": 1.906157819498616e-06, "loss": 2.5264, "num_input_tokens_seen": 1092888, "step": 214 }, { "epoch": 0.32727272727272727, "grad_norm": 0.6655834913253784, "learning_rate": 1.904346508262363e-06, "loss": 2.3912, "num_input_tokens_seen": 1100160, "step": 216 }, { "epoch": 0.3303030303030303, "grad_norm": 1.1694029569625854, "learning_rate": 1.9025189937557386e-06, "loss": 2.462, "num_input_tokens_seen": 1107360, "step": 218 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4985570013523102, "learning_rate": 1.90067531802014e-06, "loss": 2.2447, "num_input_tokens_seen": 1119096, "step": 220 }, { "epoch": 0.3333333333333333, "eval_loss": 2.339911937713623, "eval_runtime": 5.8137, "eval_samples_per_second": 3.44, "eval_steps_per_second": 3.44, "num_input_tokens_seen": 1119096, "step": 220 }, { "epoch": 0.33636363636363636, "grad_norm": 0.4883664548397064, "learning_rate": 1.8988155234687495e-06, "loss": 2.4013, "num_input_tokens_seen": 1131384, "step": 222 }, { "epoch": 0.3393939393939394, "grad_norm": 0.7224740982055664, "learning_rate": 1.8969396528855567e-06, "loss": 2.4763, "num_input_tokens_seen": 1142616, "step": 224 }, { "epoch": 0.3424242424242424, "grad_norm": 0.569634199142456, "learning_rate": 1.8950477494243762e-06, "loss": 2.3552, "num_input_tokens_seen": 1154904, "step": 226 }, { "epoch": 0.34545454545454546, "grad_norm": 0.45122525095939636, "learning_rate": 1.8931398566078523e-06, "loss": 2.4198, "num_input_tokens_seen": 1164264, "step": 228 }, { "epoch": 0.3484848484848485, "grad_norm": 0.5598176121711731, "learning_rate": 1.8912160183264612e-06, "loss": 2.5283, "num_input_tokens_seen": 1175472, "step": 230 }, { "epoch": 0.3515151515151515, "grad_norm": 0.5492939352989197, "learning_rate": 1.8892762788374985e-06, "loss": 2.5246, "num_input_tokens_seen": 1185264, "step": 232 }, { "epoch": 0.35454545454545455, "grad_norm": 0.557397723197937, "learning_rate": 1.8873206827640624e-06, "loss": 2.3821, "num_input_tokens_seen": 1197408, "step": 234 }, { "epoch": 0.3575757575757576, "grad_norm": 0.42229530215263367, "learning_rate": 1.8853492750940275e-06, "loss": 2.3593, "num_input_tokens_seen": 1207656, "step": 236 }, { "epoch": 0.3606060606060606, "grad_norm": 0.4781576693058014, "learning_rate": 1.8833621011790078e-06, "loss": 2.2261, "num_input_tokens_seen": 1219080, "step": 238 }, { "epoch": 0.36363636363636365, "grad_norm": 0.46443861722946167, "learning_rate": 1.8813592067333155e-06, "loss": 2.4046, "num_input_tokens_seen": 1230048, "step": 240 }, { "epoch": 0.36363636363636365, "eval_loss": 2.339547872543335, "eval_runtime": 5.8158, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 1230048, "step": 240 }, { "epoch": 0.36666666666666664, "grad_norm": 0.4926714599132538, "learning_rate": 1.8793406378329092e-06, "loss": 2.1956, "num_input_tokens_seen": 1239288, "step": 242 }, { "epoch": 0.3696969696969697, "grad_norm": 0.9403526186943054, "learning_rate": 1.877306440914333e-06, "loss": 2.3843, "num_input_tokens_seen": 1246512, "step": 244 }, { "epoch": 0.37272727272727274, "grad_norm": 0.8498961329460144, "learning_rate": 1.8752566627736477e-06, "loss": 2.2977, "num_input_tokens_seen": 1256256, "step": 246 }, { "epoch": 0.37575757575757573, "grad_norm": 0.5305018424987793, "learning_rate": 1.8731913505653569e-06, "loss": 2.4575, "num_input_tokens_seen": 1265712, "step": 248 }, { "epoch": 0.3787878787878788, "grad_norm": 0.4798325002193451, "learning_rate": 1.8711105518013199e-06, "loss": 2.3638, "num_input_tokens_seen": 1273848, "step": 250 }, { "epoch": 0.38181818181818183, "grad_norm": 0.5862890481948853, "learning_rate": 1.869014314349659e-06, "loss": 2.388, "num_input_tokens_seen": 1283664, "step": 252 }, { "epoch": 0.38484848484848483, "grad_norm": 0.5504214763641357, "learning_rate": 1.8669026864336591e-06, "loss": 2.3997, "num_input_tokens_seen": 1293768, "step": 254 }, { "epoch": 0.3878787878787879, "grad_norm": 0.662431538105011, "learning_rate": 1.8647757166306572e-06, "loss": 2.4629, "num_input_tokens_seen": 1303392, "step": 256 }, { "epoch": 0.39090909090909093, "grad_norm": 0.5133792757987976, "learning_rate": 1.8626334538709263e-06, "loss": 2.3915, "num_input_tokens_seen": 1313784, "step": 258 }, { "epoch": 0.3939393939393939, "grad_norm": 0.47367045283317566, "learning_rate": 1.8604759474365492e-06, "loss": 2.4396, "num_input_tokens_seen": 1326072, "step": 260 }, { "epoch": 0.3939393939393939, "eval_loss": 2.338432788848877, "eval_runtime": 5.8115, "eval_samples_per_second": 3.441, "eval_steps_per_second": 3.441, "num_input_tokens_seen": 1326072, "step": 260 }, { "epoch": 0.396969696969697, "grad_norm": 0.5194035768508911, "learning_rate": 1.858303246960284e-06, "loss": 2.4028, "num_input_tokens_seen": 1335864, "step": 262 }, { "epoch": 0.4, "grad_norm": 0.4642770290374756, "learning_rate": 1.856115402424423e-06, "loss": 2.434, "num_input_tokens_seen": 1347552, "step": 264 }, { "epoch": 0.403030303030303, "grad_norm": 0.5999087691307068, "learning_rate": 1.8539124641596437e-06, "loss": 2.3149, "num_input_tokens_seen": 1356912, "step": 266 }, { "epoch": 0.40606060606060607, "grad_norm": 0.588898241519928, "learning_rate": 1.851694482843849e-06, "loss": 2.5401, "num_input_tokens_seen": 1368408, "step": 268 }, { "epoch": 0.4090909090909091, "grad_norm": 0.49462223052978516, "learning_rate": 1.8494615095010037e-06, "loss": 2.3905, "num_input_tokens_seen": 1380696, "step": 270 }, { "epoch": 0.4121212121212121, "grad_norm": 1.0041953325271606, "learning_rate": 1.8472135954999582e-06, "loss": 2.7022, "num_input_tokens_seen": 1389096, "step": 272 }, { "epoch": 0.41515151515151516, "grad_norm": 0.5517657399177551, "learning_rate": 1.8449507925532685e-06, "loss": 2.5369, "num_input_tokens_seen": 1400784, "step": 274 }, { "epoch": 0.41818181818181815, "grad_norm": 0.6180247068405151, "learning_rate": 1.8426731527160064e-06, "loss": 2.2525, "num_input_tokens_seen": 1413072, "step": 276 }, { "epoch": 0.4212121212121212, "grad_norm": 0.6159691214561462, "learning_rate": 1.8403807283845616e-06, "loss": 2.3052, "num_input_tokens_seen": 1422888, "step": 278 }, { "epoch": 0.42424242424242425, "grad_norm": 0.6237558722496033, "learning_rate": 1.8380735722954367e-06, "loss": 2.344, "num_input_tokens_seen": 1432128, "step": 280 }, { "epoch": 0.42424242424242425, "eval_loss": 2.3386666774749756, "eval_runtime": 5.8175, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 1432128, "step": 280 }, { "epoch": 0.42727272727272725, "grad_norm": 0.6814020276069641, "learning_rate": 1.835751737524033e-06, "loss": 2.4498, "num_input_tokens_seen": 1439928, "step": 282 }, { "epoch": 0.4303030303030303, "grad_norm": 0.5670037865638733, "learning_rate": 1.8334152774834309e-06, "loss": 2.3934, "num_input_tokens_seen": 1449624, "step": 284 }, { "epoch": 0.43333333333333335, "grad_norm": 0.6628959774971008, "learning_rate": 1.83106424592316e-06, "loss": 2.52, "num_input_tokens_seen": 1460520, "step": 286 }, { "epoch": 0.43636363636363634, "grad_norm": 0.6537968516349792, "learning_rate": 1.8286986969279643e-06, "loss": 2.5132, "num_input_tokens_seen": 1469712, "step": 288 }, { "epoch": 0.4393939393939394, "grad_norm": 0.5633306503295898, "learning_rate": 1.8263186849165555e-06, "loss": 2.403, "num_input_tokens_seen": 1480824, "step": 290 }, { "epoch": 0.44242424242424244, "grad_norm": 0.5708298683166504, "learning_rate": 1.8239242646403628e-06, "loss": 2.5149, "num_input_tokens_seen": 1488816, "step": 292 }, { "epoch": 0.44545454545454544, "grad_norm": 0.7049750685691833, "learning_rate": 1.8215154911822737e-06, "loss": 2.2043, "num_input_tokens_seen": 1497816, "step": 294 }, { "epoch": 0.4484848484848485, "grad_norm": 0.5039754509925842, "learning_rate": 1.8190924199553655e-06, "loss": 2.439, "num_input_tokens_seen": 1508928, "step": 296 }, { "epoch": 0.45151515151515154, "grad_norm": 0.5821936726570129, "learning_rate": 1.816655106701631e-06, "loss": 2.4665, "num_input_tokens_seen": 1519512, "step": 298 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5108533501625061, "learning_rate": 1.8142036074906968e-06, "loss": 2.4901, "num_input_tokens_seen": 1529520, "step": 300 }, { "epoch": 0.45454545454545453, "eval_loss": 2.337289333343506, "eval_runtime": 5.817, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 1529520, "step": 300 }, { "epoch": 0.4575757575757576, "grad_norm": 0.4282449781894684, "learning_rate": 1.8117379787185333e-06, "loss": 2.1503, "num_input_tokens_seen": 1541808, "step": 302 }, { "epoch": 0.46060606060606063, "grad_norm": 0.6109529137611389, "learning_rate": 1.809258277106156e-06, "loss": 2.4026, "num_input_tokens_seen": 1550952, "step": 304 }, { "epoch": 0.4636363636363636, "grad_norm": 0.5644070506095886, "learning_rate": 1.8067645596983226e-06, "loss": 2.4195, "num_input_tokens_seen": 1562064, "step": 306 }, { "epoch": 0.4666666666666667, "grad_norm": 0.665733814239502, "learning_rate": 1.804256883862219e-06, "loss": 2.6243, "num_input_tokens_seen": 1569240, "step": 308 }, { "epoch": 0.4696969696969697, "grad_norm": 0.6493149995803833, "learning_rate": 1.8017353072861416e-06, "loss": 2.3603, "num_input_tokens_seen": 1579560, "step": 310 }, { "epoch": 0.4727272727272727, "grad_norm": 0.5297104120254517, "learning_rate": 1.7991998879781676e-06, "loss": 2.2741, "num_input_tokens_seen": 1591248, "step": 312 }, { "epoch": 0.47575757575757577, "grad_norm": 0.4405084252357483, "learning_rate": 1.796650684264823e-06, "loss": 2.5167, "num_input_tokens_seen": 1602840, "step": 314 }, { "epoch": 0.47878787878787876, "grad_norm": 0.6081413626670837, "learning_rate": 1.7940877547897383e-06, "loss": 2.404, "num_input_tokens_seen": 1610520, "step": 316 }, { "epoch": 0.4818181818181818, "grad_norm": 0.7665295600891113, "learning_rate": 1.7915111585123026e-06, "loss": 2.3861, "num_input_tokens_seen": 1617936, "step": 318 }, { "epoch": 0.48484848484848486, "grad_norm": 0.5678819417953491, "learning_rate": 1.7889209547063038e-06, "loss": 2.3335, "num_input_tokens_seen": 1628424, "step": 320 }, { "epoch": 0.48484848484848486, "eval_loss": 2.336883068084717, "eval_runtime": 5.8244, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 1628424, "step": 320 }, { "epoch": 0.48787878787878786, "grad_norm": 0.6124878525733948, "learning_rate": 1.7863172029585684e-06, "loss": 2.6274, "num_input_tokens_seen": 1636704, "step": 322 }, { "epoch": 0.4909090909090909, "grad_norm": 0.5369870066642761, "learning_rate": 1.7836999631675877e-06, "loss": 2.2444, "num_input_tokens_seen": 1646760, "step": 324 }, { "epoch": 0.49393939393939396, "grad_norm": 0.47992056608200073, "learning_rate": 1.7810692955421418e-06, "loss": 2.3407, "num_input_tokens_seen": 1657824, "step": 326 }, { "epoch": 0.49696969696969695, "grad_norm": 0.5946272611618042, "learning_rate": 1.778425260599914e-06, "loss": 2.5075, "num_input_tokens_seen": 1669800, "step": 328 }, { "epoch": 0.5, "grad_norm": 0.5064172744750977, "learning_rate": 1.7757679191660974e-06, "loss": 2.4304, "num_input_tokens_seen": 1678896, "step": 330 }, { "epoch": 0.503030303030303, "grad_norm": 0.676836371421814, "learning_rate": 1.7730973323719996e-06, "loss": 2.3898, "num_input_tokens_seen": 1686696, "step": 332 }, { "epoch": 0.5060606060606061, "grad_norm": 0.45694637298583984, "learning_rate": 1.7704135616536297e-06, "loss": 2.1912, "num_input_tokens_seen": 1695648, "step": 334 }, { "epoch": 0.509090909090909, "grad_norm": 0.5608468651771545, "learning_rate": 1.767716668750292e-06, "loss": 2.4971, "num_input_tokens_seen": 1703112, "step": 336 }, { "epoch": 0.5121212121212121, "grad_norm": 0.5195941925048828, "learning_rate": 1.7650067157031607e-06, "loss": 2.3934, "num_input_tokens_seen": 1715400, "step": 338 }, { "epoch": 0.5151515151515151, "grad_norm": 0.3820761442184448, "learning_rate": 1.7622837648538558e-06, "loss": 2.1842, "num_input_tokens_seen": 1725816, "step": 340 }, { "epoch": 0.5151515151515151, "eval_loss": 2.3365180492401123, "eval_runtime": 5.8166, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 1725816, "step": 340 }, { "epoch": 0.5181818181818182, "grad_norm": 0.5152050852775574, "learning_rate": 1.7595478788430067e-06, "loss": 2.2292, "num_input_tokens_seen": 1737240, "step": 342 }, { "epoch": 0.5212121212121212, "grad_norm": 0.6499360203742981, "learning_rate": 1.7567991206088122e-06, "loss": 2.3013, "num_input_tokens_seen": 1743792, "step": 344 }, { "epoch": 0.5242424242424243, "grad_norm": 0.6490241885185242, "learning_rate": 1.7540375533855931e-06, "loss": 2.5828, "num_input_tokens_seen": 1755192, "step": 346 }, { "epoch": 0.5272727272727272, "grad_norm": 0.5575884580612183, "learning_rate": 1.751263240702337e-06, "loss": 2.2834, "num_input_tokens_seen": 1765656, "step": 348 }, { "epoch": 0.5303030303030303, "grad_norm": 0.6133118867874146, "learning_rate": 1.7484762463812359e-06, "loss": 2.5502, "num_input_tokens_seen": 1773504, "step": 350 }, { "epoch": 0.5333333333333333, "grad_norm": 0.477857768535614, "learning_rate": 1.7456766345362195e-06, "loss": 2.2939, "num_input_tokens_seen": 1785792, "step": 352 }, { "epoch": 0.5363636363636364, "grad_norm": 1.5005486011505127, "learning_rate": 1.7428644695714798e-06, "loss": 2.3919, "num_input_tokens_seen": 1792848, "step": 354 }, { "epoch": 0.5393939393939394, "grad_norm": 0.6583260893821716, "learning_rate": 1.7400398161799901e-06, "loss": 2.4862, "num_input_tokens_seen": 1802256, "step": 356 }, { "epoch": 0.5424242424242425, "grad_norm": 0.5908564925193787, "learning_rate": 1.7372027393420136e-06, "loss": 2.4536, "num_input_tokens_seen": 1812840, "step": 358 }, { "epoch": 0.5454545454545454, "grad_norm": 0.6152108311653137, "learning_rate": 1.7343533043236135e-06, "loss": 2.2118, "num_input_tokens_seen": 1822440, "step": 360 }, { "epoch": 0.5454545454545454, "eval_loss": 2.335080623626709, "eval_runtime": 5.8256, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 1822440, "step": 360 }, { "epoch": 0.5484848484848485, "grad_norm": 0.5109455585479736, "learning_rate": 1.7314915766751482e-06, "loss": 2.3442, "num_input_tokens_seen": 1833168, "step": 362 }, { "epoch": 0.5515151515151515, "grad_norm": 0.4524301290512085, "learning_rate": 1.7286176222297643e-06, "loss": 2.3881, "num_input_tokens_seen": 1845072, "step": 364 }, { "epoch": 0.5545454545454546, "grad_norm": 0.4554661810398102, "learning_rate": 1.7257315071018814e-06, "loss": 2.2764, "num_input_tokens_seen": 1857168, "step": 366 }, { "epoch": 0.5575757575757576, "grad_norm": 0.42852118611335754, "learning_rate": 1.7228332976856717e-06, "loss": 2.364, "num_input_tokens_seen": 1869456, "step": 368 }, { "epoch": 0.5606060606060606, "grad_norm": 0.7273756861686707, "learning_rate": 1.7199230606535347e-06, "loss": 2.4654, "num_input_tokens_seen": 1878168, "step": 370 }, { "epoch": 0.5636363636363636, "grad_norm": 0.7303619384765625, "learning_rate": 1.717000862954559e-06, "loss": 2.4599, "num_input_tokens_seen": 1888608, "step": 372 }, { "epoch": 0.5666666666666667, "grad_norm": 0.6044741868972778, "learning_rate": 1.7140667718129853e-06, "loss": 2.2146, "num_input_tokens_seen": 1897008, "step": 374 }, { "epoch": 0.5696969696969697, "grad_norm": 0.5754801630973816, "learning_rate": 1.7111208547266607e-06, "loss": 2.4951, "num_input_tokens_seen": 1906776, "step": 376 }, { "epoch": 0.5727272727272728, "grad_norm": 0.47109347581863403, "learning_rate": 1.7081631794654818e-06, "loss": 2.1497, "num_input_tokens_seen": 1919064, "step": 378 }, { "epoch": 0.5757575757575758, "grad_norm": 0.6136711835861206, "learning_rate": 1.7051938140698408e-06, "loss": 2.3233, "num_input_tokens_seen": 1928688, "step": 380 }, { "epoch": 0.5757575757575758, "eval_loss": 2.334742546081543, "eval_runtime": 5.8193, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 1928688, "step": 380 }, { "epoch": 0.5787878787878787, "grad_norm": 0.6149052977561951, "learning_rate": 1.702212826849056e-06, "loss": 2.319, "num_input_tokens_seen": 1940784, "step": 382 }, { "epoch": 0.5818181818181818, "grad_norm": 0.5667149424552917, "learning_rate": 1.6992202863798037e-06, "loss": 2.5949, "num_input_tokens_seen": 1950840, "step": 384 }, { "epoch": 0.5848484848484848, "grad_norm": 0.5343450307846069, "learning_rate": 1.6962162615045377e-06, "loss": 2.3292, "num_input_tokens_seen": 1963128, "step": 386 }, { "epoch": 0.5878787878787879, "grad_norm": 0.5003802180290222, "learning_rate": 1.6932008213299071e-06, "loss": 2.5239, "num_input_tokens_seen": 1975008, "step": 388 }, { "epoch": 0.5909090909090909, "grad_norm": 0.5460373759269714, "learning_rate": 1.6901740352251675e-06, "loss": 2.4818, "num_input_tokens_seen": 1983648, "step": 390 }, { "epoch": 0.593939393939394, "grad_norm": 0.5535560250282288, "learning_rate": 1.6871359728205828e-06, "loss": 2.1795, "num_input_tokens_seen": 1993536, "step": 392 }, { "epoch": 0.5969696969696969, "grad_norm": 0.4466463029384613, "learning_rate": 1.6840867040058254e-06, "loss": 2.3585, "num_input_tokens_seen": 2002872, "step": 394 }, { "epoch": 0.6, "grad_norm": 0.5831019878387451, "learning_rate": 1.6810262989283674e-06, "loss": 2.3718, "num_input_tokens_seen": 2012400, "step": 396 }, { "epoch": 0.603030303030303, "grad_norm": 0.5981975197792053, "learning_rate": 1.6779548279918671e-06, "loss": 2.314, "num_input_tokens_seen": 2022936, "step": 398 }, { "epoch": 0.6060606060606061, "grad_norm": 0.5155858397483826, "learning_rate": 1.6748723618545496e-06, "loss": 2.6427, "num_input_tokens_seen": 2031480, "step": 400 }, { "epoch": 0.6060606060606061, "eval_loss": 2.334027051925659, "eval_runtime": 5.8193, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 2031480, "step": 400 }, { "epoch": 0.6090909090909091, "grad_norm": 0.46695375442504883, "learning_rate": 1.6717789714275808e-06, "loss": 2.2379, "num_input_tokens_seen": 2043768, "step": 402 }, { "epoch": 0.6121212121212121, "grad_norm": 0.8030733466148376, "learning_rate": 1.6686747278734364e-06, "loss": 2.3286, "num_input_tokens_seen": 2052456, "step": 404 }, { "epoch": 0.6151515151515151, "grad_norm": 0.5807926654815674, "learning_rate": 1.6655597026042654e-06, "loss": 2.3891, "num_input_tokens_seen": 2062608, "step": 406 }, { "epoch": 0.6181818181818182, "grad_norm": 0.5125523209571838, "learning_rate": 1.6624339672802466e-06, "loss": 2.2766, "num_input_tokens_seen": 2070624, "step": 408 }, { "epoch": 0.6212121212121212, "grad_norm": 0.4872816205024719, "learning_rate": 1.65929759380794e-06, "loss": 2.3172, "num_input_tokens_seen": 2082024, "step": 410 }, { "epoch": 0.6242424242424243, "grad_norm": 0.5617727637290955, "learning_rate": 1.6561506543386332e-06, "loss": 2.2975, "num_input_tokens_seen": 2093928, "step": 412 }, { "epoch": 0.6272727272727273, "grad_norm": 0.7218233942985535, "learning_rate": 1.6529932212666813e-06, "loss": 2.5706, "num_input_tokens_seen": 2102712, "step": 414 }, { "epoch": 0.6303030303030303, "grad_norm": 0.5542349219322205, "learning_rate": 1.6498253672278403e-06, "loss": 2.4111, "num_input_tokens_seen": 2111352, "step": 416 }, { "epoch": 0.6333333333333333, "grad_norm": 0.5303030610084534, "learning_rate": 1.6466471650975989e-06, "loss": 2.3655, "num_input_tokens_seen": 2123184, "step": 418 }, { "epoch": 0.6363636363636364, "grad_norm": 0.4791901111602783, "learning_rate": 1.6434586879894994e-06, "loss": 2.1955, "num_input_tokens_seen": 2132520, "step": 420 }, { "epoch": 0.6363636363636364, "eval_loss": 2.3337419033050537, "eval_runtime": 5.8194, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 2132520, "step": 420 }, { "epoch": 0.6393939393939394, "grad_norm": 1.0299837589263916, "learning_rate": 1.6402600092534571e-06, "loss": 2.4297, "num_input_tokens_seen": 2140344, "step": 422 }, { "epoch": 0.6424242424242425, "grad_norm": 0.5022935271263123, "learning_rate": 1.637051202474072e-06, "loss": 2.3299, "num_input_tokens_seen": 2150592, "step": 424 }, { "epoch": 0.6454545454545455, "grad_norm": 0.7252947688102722, "learning_rate": 1.6338323414689384e-06, "loss": 2.4036, "num_input_tokens_seen": 2158848, "step": 426 }, { "epoch": 0.6484848484848484, "grad_norm": 0.49614864587783813, "learning_rate": 1.6306035002869418e-06, "loss": 2.3709, "num_input_tokens_seen": 2166120, "step": 428 }, { "epoch": 0.6515151515151515, "grad_norm": 0.5736730098724365, "learning_rate": 1.6273647532065615e-06, "loss": 2.6169, "num_input_tokens_seen": 2177760, "step": 430 }, { "epoch": 0.6545454545454545, "grad_norm": 0.8251070380210876, "learning_rate": 1.6241161747341568e-06, "loss": 2.4805, "num_input_tokens_seen": 2185488, "step": 432 }, { "epoch": 0.6575757575757576, "grad_norm": 1.2293510437011719, "learning_rate": 1.6208578396022566e-06, "loss": 2.1922, "num_input_tokens_seen": 2196336, "step": 434 }, { "epoch": 0.6606060606060606, "grad_norm": 0.6561338305473328, "learning_rate": 1.6175898227678376e-06, "loss": 2.4529, "num_input_tokens_seen": 2204520, "step": 436 }, { "epoch": 0.6636363636363637, "grad_norm": 0.4846937954425812, "learning_rate": 1.6143121994106012e-06, "loss": 2.3597, "num_input_tokens_seen": 2216808, "step": 438 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6437355279922485, "learning_rate": 1.611025044931245e-06, "loss": 2.4364, "num_input_tokens_seen": 2227752, "step": 440 }, { "epoch": 0.6666666666666666, "eval_loss": 2.3327877521514893, "eval_runtime": 5.8187, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 2227752, "step": 440 }, { "epoch": 0.6696969696969697, "grad_norm": 0.5672312378883362, "learning_rate": 1.6077284349497254e-06, "loss": 2.5148, "num_input_tokens_seen": 2237808, "step": 442 }, { "epoch": 0.6727272727272727, "grad_norm": 0.5006369948387146, "learning_rate": 1.6044224453035203e-06, "loss": 2.1969, "num_input_tokens_seen": 2249304, "step": 444 }, { "epoch": 0.6757575757575758, "grad_norm": 0.6202157735824585, "learning_rate": 1.6011071520458845e-06, "loss": 2.5604, "num_input_tokens_seen": 2260176, "step": 446 }, { "epoch": 0.6787878787878788, "grad_norm": 0.555921733379364, "learning_rate": 1.5977826314440987e-06, "loss": 2.2211, "num_input_tokens_seen": 2270184, "step": 448 }, { "epoch": 0.6818181818181818, "grad_norm": 0.5153559446334839, "learning_rate": 1.5944489599777161e-06, "loss": 2.3477, "num_input_tokens_seen": 2281464, "step": 450 }, { "epoch": 0.6848484848484848, "grad_norm": 0.5477102994918823, "learning_rate": 1.5911062143368027e-06, "loss": 2.4645, "num_input_tokens_seen": 2292720, "step": 452 }, { "epoch": 0.6878787878787879, "grad_norm": 0.5461196303367615, "learning_rate": 1.5877544714201726e-06, "loss": 2.5217, "num_input_tokens_seen": 2303376, "step": 454 }, { "epoch": 0.6909090909090909, "grad_norm": 0.5640104413032532, "learning_rate": 1.5843938083336194e-06, "loss": 2.5123, "num_input_tokens_seen": 2312544, "step": 456 }, { "epoch": 0.693939393939394, "grad_norm": 0.4936680197715759, "learning_rate": 1.5810243023881432e-06, "loss": 2.2975, "num_input_tokens_seen": 2323344, "step": 458 }, { "epoch": 0.696969696969697, "grad_norm": 0.4782181680202484, "learning_rate": 1.5776460310981702e-06, "loss": 2.3568, "num_input_tokens_seen": 2332056, "step": 460 }, { "epoch": 0.696969696969697, "eval_loss": 2.332925319671631, "eval_runtime": 5.8201, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 2332056, "step": 460 }, { "epoch": 0.7, "grad_norm": 0.5433066487312317, "learning_rate": 1.5742590721797725e-06, "loss": 2.5328, "num_input_tokens_seen": 2342400, "step": 462 }, { "epoch": 0.703030303030303, "grad_norm": 1.0040984153747559, "learning_rate": 1.5708635035488756e-06, "loss": 2.5166, "num_input_tokens_seen": 2350536, "step": 464 }, { "epoch": 0.706060606060606, "grad_norm": 0.5495861172676086, "learning_rate": 1.5674594033194706e-06, "loss": 2.3471, "num_input_tokens_seen": 2361528, "step": 466 }, { "epoch": 0.7090909090909091, "grad_norm": 0.6494752764701843, "learning_rate": 1.5640468498018153e-06, "loss": 2.4315, "num_input_tokens_seen": 2370552, "step": 468 }, { "epoch": 0.7121212121212122, "grad_norm": 0.5859867930412292, "learning_rate": 1.5606259215006325e-06, "loss": 2.5083, "num_input_tokens_seen": 2380368, "step": 470 }, { "epoch": 0.7151515151515152, "grad_norm": 0.606728196144104, "learning_rate": 1.5571966971133037e-06, "loss": 2.3308, "num_input_tokens_seen": 2389176, "step": 472 }, { "epoch": 0.7181818181818181, "grad_norm": 0.453156441450119, "learning_rate": 1.5537592555280594e-06, "loss": 2.3236, "num_input_tokens_seen": 2398944, "step": 474 }, { "epoch": 0.7212121212121212, "grad_norm": 0.8148333430290222, "learning_rate": 1.5503136758221653e-06, "loss": 2.8391, "num_input_tokens_seen": 2404656, "step": 476 }, { "epoch": 0.7242424242424242, "grad_norm": 0.4754016399383545, "learning_rate": 1.5468600372601009e-06, "loss": 2.6875, "num_input_tokens_seen": 2416392, "step": 478 }, { "epoch": 0.7272727272727273, "grad_norm": 0.7027032375335693, "learning_rate": 1.543398419291737e-06, "loss": 2.4508, "num_input_tokens_seen": 2425032, "step": 480 }, { "epoch": 0.7272727272727273, "eval_loss": 2.332369089126587, "eval_runtime": 5.8166, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 2425032, "step": 480 }, { "epoch": 0.7303030303030303, "grad_norm": 0.8816015124320984, "learning_rate": 1.5399289015505096e-06, "loss": 2.4884, "num_input_tokens_seen": 2432280, "step": 482 }, { "epoch": 0.7333333333333333, "grad_norm": 0.5385542511940002, "learning_rate": 1.536451563851584e-06, "loss": 2.3159, "num_input_tokens_seen": 2442576, "step": 484 }, { "epoch": 0.7363636363636363, "grad_norm": 0.5668327808380127, "learning_rate": 1.5329664861900237e-06, "loss": 2.5522, "num_input_tokens_seen": 2450664, "step": 486 }, { "epoch": 0.7393939393939394, "grad_norm": 0.5444993376731873, "learning_rate": 1.5294737487389462e-06, "loss": 2.4853, "num_input_tokens_seen": 2462568, "step": 488 }, { "epoch": 0.7424242424242424, "grad_norm": 0.5722953081130981, "learning_rate": 1.5259734318476807e-06, "loss": 2.5841, "num_input_tokens_seen": 2472312, "step": 490 }, { "epoch": 0.7454545454545455, "grad_norm": 0.5933071970939636, "learning_rate": 1.5224656160399186e-06, "loss": 2.4222, "num_input_tokens_seen": 2483016, "step": 492 }, { "epoch": 0.7484848484848485, "grad_norm": 0.6787658929824829, "learning_rate": 1.518950382011861e-06, "loss": 2.261, "num_input_tokens_seen": 2492688, "step": 494 }, { "epoch": 0.7515151515151515, "grad_norm": 0.5823308825492859, "learning_rate": 1.5154278106303649e-06, "loss": 2.3332, "num_input_tokens_seen": 2504472, "step": 496 }, { "epoch": 0.7545454545454545, "grad_norm": 0.5042080879211426, "learning_rate": 1.511897982931078e-06, "loss": 2.3521, "num_input_tokens_seen": 2516160, "step": 498 }, { "epoch": 0.7575757575757576, "grad_norm": 0.6808260679244995, "learning_rate": 1.50836098011658e-06, "loss": 2.3093, "num_input_tokens_seen": 2527320, "step": 500 }, { "epoch": 0.7575757575757576, "eval_loss": 2.3320088386535645, "eval_runtime": 5.8161, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 2527320, "step": 500 }, { "epoch": 0.7606060606060606, "grad_norm": 0.5960633158683777, "learning_rate": 1.5048168835545094e-06, "loss": 2.4031, "num_input_tokens_seen": 2535744, "step": 502 }, { "epoch": 0.7636363636363637, "grad_norm": 0.4656361937522888, "learning_rate": 1.5012657747756961e-06, "loss": 2.2842, "num_input_tokens_seen": 2546376, "step": 504 }, { "epoch": 0.7666666666666667, "grad_norm": 0.7001519203186035, "learning_rate": 1.4977077354722828e-06, "loss": 2.4888, "num_input_tokens_seen": 2553456, "step": 506 }, { "epoch": 0.7696969696969697, "grad_norm": 0.5070295333862305, "learning_rate": 1.4941428474958469e-06, "loss": 2.3082, "num_input_tokens_seen": 2563632, "step": 508 }, { "epoch": 0.7727272727272727, "grad_norm": 0.559223473072052, "learning_rate": 1.4905711928555178e-06, "loss": 2.4127, "num_input_tokens_seen": 2573184, "step": 510 }, { "epoch": 0.7757575757575758, "grad_norm": 0.45378220081329346, "learning_rate": 1.4869928537160892e-06, "loss": 2.1886, "num_input_tokens_seen": 2585472, "step": 512 }, { "epoch": 0.7787878787878788, "grad_norm": 0.5591022968292236, "learning_rate": 1.4834079123961308e-06, "loss": 2.2753, "num_input_tokens_seen": 2594304, "step": 514 }, { "epoch": 0.7818181818181819, "grad_norm": 0.6257476806640625, "learning_rate": 1.479816451366092e-06, "loss": 2.4605, "num_input_tokens_seen": 2601600, "step": 516 }, { "epoch": 0.7848484848484848, "grad_norm": 0.5094606280326843, "learning_rate": 1.4762185532464057e-06, "loss": 2.4019, "num_input_tokens_seen": 2612280, "step": 518 }, { "epoch": 0.7878787878787878, "grad_norm": 0.4572422206401825, "learning_rate": 1.472614300805591e-06, "loss": 2.5201, "num_input_tokens_seen": 2624280, "step": 520 }, { "epoch": 0.7878787878787878, "eval_loss": 2.3315682411193848, "eval_runtime": 5.8196, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 2624280, "step": 520 }, { "epoch": 0.7909090909090909, "grad_norm": 0.5242352485656738, "learning_rate": 1.4690037769583428e-06, "loss": 2.429, "num_input_tokens_seen": 2634072, "step": 522 }, { "epoch": 0.793939393939394, "grad_norm": 0.48639097809791565, "learning_rate": 1.4653870647636297e-06, "loss": 2.4341, "num_input_tokens_seen": 2643864, "step": 524 }, { "epoch": 0.796969696969697, "grad_norm": 0.48426756262779236, "learning_rate": 1.4617642474227797e-06, "loss": 2.2926, "num_input_tokens_seen": 2656152, "step": 526 }, { "epoch": 0.8, "grad_norm": 0.5517458319664001, "learning_rate": 1.45813540827757e-06, "loss": 2.6445, "num_input_tokens_seen": 2665968, "step": 528 }, { "epoch": 0.803030303030303, "grad_norm": 0.540124237537384, "learning_rate": 1.4545006308083055e-06, "loss": 2.2952, "num_input_tokens_seen": 2677680, "step": 530 }, { "epoch": 0.806060606060606, "grad_norm": 0.5651832222938538, "learning_rate": 1.4508599986319015e-06, "loss": 2.4097, "num_input_tokens_seen": 2687376, "step": 532 }, { "epoch": 0.8090909090909091, "grad_norm": 0.4706498980522156, "learning_rate": 1.4472135954999578e-06, "loss": 2.2751, "num_input_tokens_seen": 2699112, "step": 534 }, { "epoch": 0.8121212121212121, "grad_norm": 0.5661342144012451, "learning_rate": 1.4435615052968358e-06, "loss": 2.4527, "num_input_tokens_seen": 2710008, "step": 536 }, { "epoch": 0.8151515151515152, "grad_norm": 0.49977409839630127, "learning_rate": 1.4399038120377224e-06, "loss": 2.3689, "num_input_tokens_seen": 2720136, "step": 538 }, { "epoch": 0.8181818181818182, "grad_norm": 0.5473623871803284, "learning_rate": 1.4362405998667043e-06, "loss": 2.4758, "num_input_tokens_seen": 2729160, "step": 540 }, { "epoch": 0.8181818181818182, "eval_loss": 2.3316752910614014, "eval_runtime": 5.8161, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 2729160, "step": 540 }, { "epoch": 0.8212121212121212, "grad_norm": 0.5338855385780334, "learning_rate": 1.432571953054828e-06, "loss": 2.3434, "num_input_tokens_seen": 2739168, "step": 542 }, { "epoch": 0.8242424242424242, "grad_norm": 0.5923134684562683, "learning_rate": 1.4288979559981615e-06, "loss": 2.364, "num_input_tokens_seen": 2747688, "step": 544 }, { "epoch": 0.8272727272727273, "grad_norm": 0.48334839940071106, "learning_rate": 1.4252186932158546e-06, "loss": 2.4677, "num_input_tokens_seen": 2758488, "step": 546 }, { "epoch": 0.8303030303030303, "grad_norm": 0.5619869828224182, "learning_rate": 1.421534249348192e-06, "loss": 2.5121, "num_input_tokens_seen": 2768832, "step": 548 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6507293581962585, "learning_rate": 1.4178447091546497e-06, "loss": 2.491, "num_input_tokens_seen": 2779584, "step": 550 }, { "epoch": 0.8363636363636363, "grad_norm": 0.8891876935958862, "learning_rate": 1.414150157511941e-06, "loss": 2.3513, "num_input_tokens_seen": 2786232, "step": 552 }, { "epoch": 0.8393939393939394, "grad_norm": 0.5667576193809509, "learning_rate": 1.410450679412067e-06, "loss": 2.4317, "num_input_tokens_seen": 2796216, "step": 554 }, { "epoch": 0.8424242424242424, "grad_norm": 0.4579615592956543, "learning_rate": 1.406746359960361e-06, "loss": 2.3216, "num_input_tokens_seen": 2807352, "step": 556 }, { "epoch": 0.8454545454545455, "grad_norm": 0.4524303376674652, "learning_rate": 1.403037284373529e-06, "loss": 2.2947, "num_input_tokens_seen": 2817936, "step": 558 }, { "epoch": 0.8484848484848485, "grad_norm": 0.8141398429870605, "learning_rate": 1.3993235379776908e-06, "loss": 2.5013, "num_input_tokens_seen": 2827104, "step": 560 }, { "epoch": 0.8484848484848485, "eval_loss": 2.33099102973938, "eval_runtime": 5.8178, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 2827104, "step": 560 }, { "epoch": 0.8515151515151516, "grad_norm": 0.6005460023880005, "learning_rate": 1.395605206206417e-06, "loss": 2.3728, "num_input_tokens_seen": 2834520, "step": 562 }, { "epoch": 0.8545454545454545, "grad_norm": 0.6270483136177063, "learning_rate": 1.3918823745987625e-06, "loss": 2.5102, "num_input_tokens_seen": 2845560, "step": 564 }, { "epoch": 0.8575757575757575, "grad_norm": 0.5506067872047424, "learning_rate": 1.3881551287973006e-06, "loss": 2.4606, "num_input_tokens_seen": 2856168, "step": 566 }, { "epoch": 0.8606060606060606, "grad_norm": 0.5318931937217712, "learning_rate": 1.384423554546151e-06, "loss": 2.6367, "num_input_tokens_seen": 2866872, "step": 568 }, { "epoch": 0.8636363636363636, "grad_norm": 0.5173328518867493, "learning_rate": 1.3806877376890084e-06, "loss": 2.4952, "num_input_tokens_seen": 2878296, "step": 570 }, { "epoch": 0.8666666666666667, "grad_norm": 0.6837607622146606, "learning_rate": 1.3769477641671668e-06, "loss": 2.4297, "num_input_tokens_seen": 2887056, "step": 572 }, { "epoch": 0.8696969696969697, "grad_norm": 0.5360056757926941, "learning_rate": 1.373203720017544e-06, "loss": 2.3496, "num_input_tokens_seen": 2896152, "step": 574 }, { "epoch": 0.8727272727272727, "grad_norm": 0.5022287368774414, "learning_rate": 1.3694556913706996e-06, "loss": 2.4491, "num_input_tokens_seen": 2905776, "step": 576 }, { "epoch": 0.8757575757575757, "grad_norm": 0.691007137298584, "learning_rate": 1.3657037644488574e-06, "loss": 2.1934, "num_input_tokens_seen": 2915568, "step": 578 }, { "epoch": 0.8787878787878788, "grad_norm": 0.5107728838920593, "learning_rate": 1.361948025563918e-06, "loss": 2.3654, "num_input_tokens_seen": 2926128, "step": 580 }, { "epoch": 0.8787878787878788, "eval_loss": 2.33089542388916, "eval_runtime": 5.8222, "eval_samples_per_second": 3.435, "eval_steps_per_second": 3.435, "num_input_tokens_seen": 2926128, "step": 580 }, { "epoch": 0.8818181818181818, "grad_norm": 0.5568860769271851, "learning_rate": 1.3581885611154759e-06, "loss": 2.4307, "num_input_tokens_seen": 2933568, "step": 582 }, { "epoch": 0.8848484848484849, "grad_norm": 0.6976082921028137, "learning_rate": 1.3544254575888313e-06, "loss": 2.6203, "num_input_tokens_seen": 2942616, "step": 584 }, { "epoch": 0.8878787878787879, "grad_norm": 0.5394561290740967, "learning_rate": 1.3506588015529994e-06, "loss": 2.4422, "num_input_tokens_seen": 2952480, "step": 586 }, { "epoch": 0.8909090909090909, "grad_norm": 0.5144073963165283, "learning_rate": 1.3468886796587202e-06, "loss": 2.2622, "num_input_tokens_seen": 2962344, "step": 588 }, { "epoch": 0.8939393939393939, "grad_norm": 0.5705990195274353, "learning_rate": 1.3431151786364647e-06, "loss": 2.3397, "num_input_tokens_seen": 2969832, "step": 590 }, { "epoch": 0.896969696969697, "grad_norm": 0.7521764636039734, "learning_rate": 1.33933838529444e-06, "loss": 2.4768, "num_input_tokens_seen": 2979312, "step": 592 }, { "epoch": 0.9, "grad_norm": 0.4214877784252167, "learning_rate": 1.3355583865165912e-06, "loss": 2.3752, "num_input_tokens_seen": 2990568, "step": 594 }, { "epoch": 0.9030303030303031, "grad_norm": 0.6079035401344299, "learning_rate": 1.331775269260604e-06, "loss": 2.3682, "num_input_tokens_seen": 2998584, "step": 596 }, { "epoch": 0.906060606060606, "grad_norm": 0.5687966346740723, "learning_rate": 1.3279891205559034e-06, "loss": 2.4906, "num_input_tokens_seen": 3005784, "step": 598 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6438218355178833, "learning_rate": 1.3242000275016527e-06, "loss": 2.4142, "num_input_tokens_seen": 3013968, "step": 600 }, { "epoch": 0.9090909090909091, "eval_loss": 2.3308167457580566, "eval_runtime": 5.8211, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 3013968, "step": 600 }, { "epoch": 0.9121212121212121, "grad_norm": 0.8877610564231873, "learning_rate": 1.3204080772647478e-06, "loss": 2.8198, "num_input_tokens_seen": 3021504, "step": 602 }, { "epoch": 0.9151515151515152, "grad_norm": 0.6974935531616211, "learning_rate": 1.3166133570778143e-06, "loss": 2.4954, "num_input_tokens_seen": 3033264, "step": 604 }, { "epoch": 0.9181818181818182, "grad_norm": 0.4437900483608246, "learning_rate": 1.3128159542371987e-06, "loss": 2.4191, "num_input_tokens_seen": 3044688, "step": 606 }, { "epoch": 0.9212121212121213, "grad_norm": 0.41366204619407654, "learning_rate": 1.309015956100962e-06, "loss": 2.2432, "num_input_tokens_seen": 3056592, "step": 608 }, { "epoch": 0.9242424242424242, "grad_norm": 0.4901912808418274, "learning_rate": 1.3052134500868686e-06, "loss": 2.4408, "num_input_tokens_seen": 3066048, "step": 610 }, { "epoch": 0.9272727272727272, "grad_norm": 0.7082731127738953, "learning_rate": 1.301408523670376e-06, "loss": 2.5248, "num_input_tokens_seen": 3076128, "step": 612 }, { "epoch": 0.9303030303030303, "grad_norm": 0.6702643036842346, "learning_rate": 1.297601264382622e-06, "loss": 2.4202, "num_input_tokens_seen": 3085464, "step": 614 }, { "epoch": 0.9333333333333333, "grad_norm": 0.5271164178848267, "learning_rate": 1.2937917598084123e-06, "loss": 2.3525, "num_input_tokens_seen": 3094440, "step": 616 }, { "epoch": 0.9363636363636364, "grad_norm": 0.5742107629776001, "learning_rate": 1.2899800975842038e-06, "loss": 2.3598, "num_input_tokens_seen": 3105720, "step": 618 }, { "epoch": 0.9393939393939394, "grad_norm": 0.653012216091156, "learning_rate": 1.286166365396089e-06, "loss": 2.588, "num_input_tokens_seen": 3113856, "step": 620 }, { "epoch": 0.9393939393939394, "eval_loss": 2.3307266235351562, "eval_runtime": 5.8207, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 3113856, "step": 620 }, { "epoch": 0.9424242424242424, "grad_norm": 0.7475118041038513, "learning_rate": 1.2823506509777807e-06, "loss": 2.4249, "num_input_tokens_seen": 3123288, "step": 622 }, { "epoch": 0.9454545454545454, "grad_norm": 0.7373444437980652, "learning_rate": 1.2785330421085917e-06, "loss": 2.3551, "num_input_tokens_seen": 3131256, "step": 624 }, { "epoch": 0.9484848484848485, "grad_norm": 0.5523613691329956, "learning_rate": 1.2747136266114156e-06, "loss": 2.1922, "num_input_tokens_seen": 3139656, "step": 626 }, { "epoch": 0.9515151515151515, "grad_norm": 0.7101964950561523, "learning_rate": 1.270892492350707e-06, "loss": 2.4905, "num_input_tokens_seen": 3147744, "step": 628 }, { "epoch": 0.9545454545454546, "grad_norm": 0.5868334770202637, "learning_rate": 1.267069727230461e-06, "loss": 2.4588, "num_input_tokens_seen": 3158376, "step": 630 }, { "epoch": 0.9575757575757575, "grad_norm": 0.6006575226783752, "learning_rate": 1.2632454191921894e-06, "loss": 2.3059, "num_input_tokens_seen": 3168120, "step": 632 }, { "epoch": 0.9606060606060606, "grad_norm": 0.5622104406356812, "learning_rate": 1.2594196562128978e-06, "loss": 2.5159, "num_input_tokens_seen": 3178176, "step": 634 }, { "epoch": 0.9636363636363636, "grad_norm": 0.5180094242095947, "learning_rate": 1.2555925263030634e-06, "loss": 2.3614, "num_input_tokens_seen": 3189816, "step": 636 }, { "epoch": 0.9666666666666667, "grad_norm": 0.7544111013412476, "learning_rate": 1.2517641175046078e-06, "loss": 2.6341, "num_input_tokens_seen": 3198528, "step": 638 }, { "epoch": 0.9696969696969697, "grad_norm": 0.5005560517311096, "learning_rate": 1.2479345178888752e-06, "loss": 2.1493, "num_input_tokens_seen": 3209904, "step": 640 }, { "epoch": 0.9696969696969697, "eval_loss": 2.3306069374084473, "eval_runtime": 5.8165, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 3209904, "step": 640 }, { "epoch": 0.9727272727272728, "grad_norm": 0.6143120527267456, "learning_rate": 1.244103815554602e-06, "loss": 2.5543, "num_input_tokens_seen": 3220584, "step": 642 }, { "epoch": 0.9757575757575757, "grad_norm": 0.6468402147293091, "learning_rate": 1.2402720986258936e-06, "loss": 2.3468, "num_input_tokens_seen": 3231576, "step": 644 }, { "epoch": 0.9787878787878788, "grad_norm": 0.6000608205795288, "learning_rate": 1.2364394552501951e-06, "loss": 2.3648, "num_input_tokens_seen": 3239208, "step": 646 }, { "epoch": 0.9818181818181818, "grad_norm": 0.6772189140319824, "learning_rate": 1.2326059735962648e-06, "loss": 2.5894, "num_input_tokens_seen": 3246072, "step": 648 }, { "epoch": 0.9848484848484849, "grad_norm": 0.5030667185783386, "learning_rate": 1.228771741852145e-06, "loss": 2.4484, "num_input_tokens_seen": 3258000, "step": 650 }, { "epoch": 0.9878787878787879, "grad_norm": 0.8376536965370178, "learning_rate": 1.2249368482231334e-06, "loss": 2.5076, "num_input_tokens_seen": 3264912, "step": 652 }, { "epoch": 0.990909090909091, "grad_norm": 0.6285922527313232, "learning_rate": 1.2211013809297546e-06, "loss": 2.3112, "num_input_tokens_seen": 3272832, "step": 654 }, { "epoch": 0.9939393939393939, "grad_norm": 0.49095821380615234, "learning_rate": 1.21726542820573e-06, "loss": 2.3038, "num_input_tokens_seen": 3283848, "step": 656 }, { "epoch": 0.996969696969697, "grad_norm": 0.5539312958717346, "learning_rate": 1.213429078295948e-06, "loss": 2.3811, "num_input_tokens_seen": 3295272, "step": 658 }, { "epoch": 1.0, "grad_norm": 0.46812400221824646, "learning_rate": 1.2095924194544344e-06, "loss": 2.4287, "num_input_tokens_seen": 3305760, "step": 660 }, { "epoch": 1.0, "eval_loss": 2.3300185203552246, "eval_runtime": 5.8178, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 3305760, "step": 660 }, { "epoch": 1.003030303030303, "grad_norm": 0.45484259724617004, "learning_rate": 1.2057555399423218e-06, "loss": 2.4229, "num_input_tokens_seen": 3316512, "step": 662 }, { "epoch": 1.006060606060606, "grad_norm": 0.506411612033844, "learning_rate": 1.201918528025819e-06, "loss": 2.3718, "num_input_tokens_seen": 3328800, "step": 664 }, { "epoch": 1.009090909090909, "grad_norm": 0.7456917762756348, "learning_rate": 1.1980814719741809e-06, "loss": 2.5418, "num_input_tokens_seen": 3335424, "step": 666 }, { "epoch": 1.0121212121212122, "grad_norm": 0.6323581337928772, "learning_rate": 1.1942444600576783e-06, "loss": 2.4076, "num_input_tokens_seen": 3344904, "step": 668 }, { "epoch": 1.0151515151515151, "grad_norm": 0.6008067727088928, "learning_rate": 1.1904075805455657e-06, "loss": 2.3543, "num_input_tokens_seen": 3355176, "step": 670 }, { "epoch": 1.018181818181818, "grad_norm": 0.6115381121635437, "learning_rate": 1.186570921704052e-06, "loss": 2.3537, "num_input_tokens_seen": 3366096, "step": 672 }, { "epoch": 1.0212121212121212, "grad_norm": 0.5540327429771423, "learning_rate": 1.18273457179427e-06, "loss": 2.1717, "num_input_tokens_seen": 3375696, "step": 674 }, { "epoch": 1.0242424242424242, "grad_norm": 0.6130234599113464, "learning_rate": 1.1788986190702453e-06, "loss": 2.408, "num_input_tokens_seen": 3384288, "step": 676 }, { "epoch": 1.0272727272727273, "grad_norm": 0.6069101095199585, "learning_rate": 1.1750631517768667e-06, "loss": 2.3485, "num_input_tokens_seen": 3391128, "step": 678 }, { "epoch": 1.0303030303030303, "grad_norm": 0.5664869546890259, "learning_rate": 1.1712282581478552e-06, "loss": 2.4617, "num_input_tokens_seen": 3401640, "step": 680 }, { "epoch": 1.0303030303030303, "eval_loss": 2.3293986320495605, "eval_runtime": 5.8211, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 3401640, "step": 680 }, { "epoch": 1.0333333333333334, "grad_norm": 0.5258334875106812, "learning_rate": 1.167394026403735e-06, "loss": 2.3971, "num_input_tokens_seen": 3411120, "step": 682 }, { "epoch": 1.0363636363636364, "grad_norm": 0.5583547353744507, "learning_rate": 1.1635605447498048e-06, "loss": 2.3265, "num_input_tokens_seen": 3420912, "step": 684 }, { "epoch": 1.0393939393939393, "grad_norm": 0.5852888822555542, "learning_rate": 1.1597279013741067e-06, "loss": 2.5114, "num_input_tokens_seen": 3429744, "step": 686 }, { "epoch": 1.0424242424242425, "grad_norm": 0.5078532695770264, "learning_rate": 1.1558961844453978e-06, "loss": 2.5497, "num_input_tokens_seen": 3438936, "step": 688 }, { "epoch": 1.0454545454545454, "grad_norm": 0.9847856760025024, "learning_rate": 1.152065482111125e-06, "loss": 2.5458, "num_input_tokens_seen": 3444912, "step": 690 }, { "epoch": 1.0484848484848486, "grad_norm": 0.49534177780151367, "learning_rate": 1.1482358824953919e-06, "loss": 2.3622, "num_input_tokens_seen": 3456936, "step": 692 }, { "epoch": 1.0515151515151515, "grad_norm": 0.6851257681846619, "learning_rate": 1.144407473696937e-06, "loss": 2.221, "num_input_tokens_seen": 3466344, "step": 694 }, { "epoch": 1.0545454545454545, "grad_norm": 0.4764980375766754, "learning_rate": 1.1405803437871027e-06, "loss": 2.3708, "num_input_tokens_seen": 3478632, "step": 696 }, { "epoch": 1.0575757575757576, "grad_norm": 0.6040279865264893, "learning_rate": 1.136754580807811e-06, "loss": 2.5175, "num_input_tokens_seen": 3485496, "step": 698 }, { "epoch": 1.0606060606060606, "grad_norm": 0.6335225701332092, "learning_rate": 1.1329302727695389e-06, "loss": 2.2166, "num_input_tokens_seen": 3496272, "step": 700 }, { "epoch": 1.0606060606060606, "eval_loss": 2.329413890838623, "eval_runtime": 5.8255, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 3496272, "step": 700 }, { "epoch": 1.0636363636363637, "grad_norm": 0.6664142608642578, "learning_rate": 1.1291075076492928e-06, "loss": 2.5228, "num_input_tokens_seen": 3506712, "step": 702 }, { "epoch": 1.0666666666666667, "grad_norm": 0.5364813208580017, "learning_rate": 1.1252863733885845e-06, "loss": 2.4304, "num_input_tokens_seen": 3518856, "step": 704 }, { "epoch": 1.0696969696969698, "grad_norm": 0.7389492988586426, "learning_rate": 1.1214669578914087e-06, "loss": 2.0998, "num_input_tokens_seen": 3528456, "step": 706 }, { "epoch": 1.0727272727272728, "grad_norm": 0.709426999092102, "learning_rate": 1.1176493490222192e-06, "loss": 2.146, "num_input_tokens_seen": 3537048, "step": 708 }, { "epoch": 1.0757575757575757, "grad_norm": 0.7311533093452454, "learning_rate": 1.1138336346039113e-06, "loss": 2.3275, "num_input_tokens_seen": 3544536, "step": 710 }, { "epoch": 1.0787878787878789, "grad_norm": 0.5675577521324158, "learning_rate": 1.1100199024157966e-06, "loss": 2.3477, "num_input_tokens_seen": 3551472, "step": 712 }, { "epoch": 1.0818181818181818, "grad_norm": 0.6367121934890747, "learning_rate": 1.1062082401915878e-06, "loss": 2.4356, "num_input_tokens_seen": 3561312, "step": 714 }, { "epoch": 1.084848484848485, "grad_norm": 0.5750899910926819, "learning_rate": 1.1023987356173782e-06, "loss": 2.5201, "num_input_tokens_seen": 3570456, "step": 716 }, { "epoch": 1.087878787878788, "grad_norm": 0.46258801221847534, "learning_rate": 1.0985914763296245e-06, "loss": 2.0526, "num_input_tokens_seen": 3582744, "step": 718 }, { "epoch": 1.0909090909090908, "grad_norm": 0.6125935912132263, "learning_rate": 1.0947865499131315e-06, "loss": 2.2984, "num_input_tokens_seen": 3595032, "step": 720 }, { "epoch": 1.0909090909090908, "eval_loss": 2.328953504562378, "eval_runtime": 5.8254, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 3595032, "step": 720 }, { "epoch": 1.093939393939394, "grad_norm": 0.48193785548210144, "learning_rate": 1.0909840438990383e-06, "loss": 1.7515, "num_input_tokens_seen": 3606048, "step": 722 }, { "epoch": 1.096969696969697, "grad_norm": 0.48528820276260376, "learning_rate": 1.0871840457628012e-06, "loss": 2.3416, "num_input_tokens_seen": 3616368, "step": 724 }, { "epoch": 1.1, "grad_norm": 0.46913468837738037, "learning_rate": 1.0833866429221858e-06, "loss": 2.3327, "num_input_tokens_seen": 3628368, "step": 726 }, { "epoch": 1.103030303030303, "grad_norm": 0.5710415840148926, "learning_rate": 1.0795919227352523e-06, "loss": 2.401, "num_input_tokens_seen": 3637848, "step": 728 }, { "epoch": 1.106060606060606, "grad_norm": 0.5964322090148926, "learning_rate": 1.0757999724983474e-06, "loss": 2.2503, "num_input_tokens_seen": 3647640, "step": 730 }, { "epoch": 1.1090909090909091, "grad_norm": 0.5693560242652893, "learning_rate": 1.0720108794440967e-06, "loss": 2.4449, "num_input_tokens_seen": 3658272, "step": 732 }, { "epoch": 1.112121212121212, "grad_norm": 0.7325261235237122, "learning_rate": 1.068224730739396e-06, "loss": 2.2787, "num_input_tokens_seen": 3668760, "step": 734 }, { "epoch": 1.1151515151515152, "grad_norm": 0.5507751107215881, "learning_rate": 1.064441613483409e-06, "loss": 2.2226, "num_input_tokens_seen": 3679608, "step": 736 }, { "epoch": 1.1181818181818182, "grad_norm": 0.4701879620552063, "learning_rate": 1.0606616147055602e-06, "loss": 2.6116, "num_input_tokens_seen": 3689832, "step": 738 }, { "epoch": 1.121212121212121, "grad_norm": 0.5531448125839233, "learning_rate": 1.056884821363535e-06, "loss": 2.1242, "num_input_tokens_seen": 3700392, "step": 740 }, { "epoch": 1.121212121212121, "eval_loss": 2.3289198875427246, "eval_runtime": 5.8244, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 3700392, "step": 740 }, { "epoch": 1.1242424242424243, "grad_norm": 0.7482770085334778, "learning_rate": 1.05311132034128e-06, "loss": 2.3979, "num_input_tokens_seen": 3709632, "step": 742 }, { "epoch": 1.1272727272727272, "grad_norm": 0.6427175998687744, "learning_rate": 1.0493411984470007e-06, "loss": 2.4608, "num_input_tokens_seen": 3717720, "step": 744 }, { "epoch": 1.1303030303030304, "grad_norm": 0.5718503594398499, "learning_rate": 1.0455745424111686e-06, "loss": 2.5028, "num_input_tokens_seen": 3728280, "step": 746 }, { "epoch": 1.1333333333333333, "grad_norm": 0.8905156850814819, "learning_rate": 1.0418114388845242e-06, "loss": 2.5461, "num_input_tokens_seen": 3735888, "step": 748 }, { "epoch": 1.1363636363636362, "grad_norm": 0.535351037979126, "learning_rate": 1.038051974436082e-06, "loss": 2.2596, "num_input_tokens_seen": 3747720, "step": 750 }, { "epoch": 1.1393939393939394, "grad_norm": 0.5600206255912781, "learning_rate": 1.034296235551143e-06, "loss": 2.2801, "num_input_tokens_seen": 3758640, "step": 752 }, { "epoch": 1.1424242424242423, "grad_norm": 0.5470922589302063, "learning_rate": 1.0305443086293003e-06, "loss": 2.3337, "num_input_tokens_seen": 3769128, "step": 754 }, { "epoch": 1.1454545454545455, "grad_norm": 0.5066417455673218, "learning_rate": 1.0267962799824562e-06, "loss": 2.6706, "num_input_tokens_seen": 3779304, "step": 756 }, { "epoch": 1.1484848484848484, "grad_norm": 0.46135252714157104, "learning_rate": 1.0230522358328331e-06, "loss": 2.2422, "num_input_tokens_seen": 3789312, "step": 758 }, { "epoch": 1.1515151515151516, "grad_norm": 0.7310757637023926, "learning_rate": 1.0193122623109917e-06, "loss": 2.4892, "num_input_tokens_seen": 3796848, "step": 760 }, { "epoch": 1.1515151515151516, "eval_loss": 2.3289122581481934, "eval_runtime": 5.8291, "eval_samples_per_second": 3.431, "eval_steps_per_second": 3.431, "num_input_tokens_seen": 3796848, "step": 760 }, { "epoch": 1.1545454545454545, "grad_norm": 0.5655786991119385, "learning_rate": 1.015576445453849e-06, "loss": 2.2826, "num_input_tokens_seen": 3806640, "step": 762 }, { "epoch": 1.1575757575757575, "grad_norm": 0.6524637341499329, "learning_rate": 1.0118448712026992e-06, "loss": 2.4358, "num_input_tokens_seen": 3817608, "step": 764 }, { "epoch": 1.1606060606060606, "grad_norm": 0.6280786991119385, "learning_rate": 1.0081176254012374e-06, "loss": 2.421, "num_input_tokens_seen": 3827592, "step": 766 }, { "epoch": 1.1636363636363636, "grad_norm": 0.6797434687614441, "learning_rate": 1.0043947937935832e-06, "loss": 2.3245, "num_input_tokens_seen": 3837264, "step": 768 }, { "epoch": 1.1666666666666667, "grad_norm": 0.5665332078933716, "learning_rate": 1.0006764620223093e-06, "loss": 2.3388, "num_input_tokens_seen": 3847656, "step": 770 }, { "epoch": 1.1696969696969697, "grad_norm": 0.4868026077747345, "learning_rate": 9.96962715626471e-07, "loss": 2.3956, "num_input_tokens_seen": 3858600, "step": 772 }, { "epoch": 1.1727272727272728, "grad_norm": 0.77336585521698, "learning_rate": 9.932536400396393e-07, "loss": 2.3562, "num_input_tokens_seen": 3870120, "step": 774 }, { "epoch": 1.1757575757575758, "grad_norm": 0.6464818120002747, "learning_rate": 9.895493205879332e-07, "loss": 2.5851, "num_input_tokens_seen": 3879600, "step": 776 }, { "epoch": 1.1787878787878787, "grad_norm": 0.6274628639221191, "learning_rate": 9.858498424880592e-07, "loss": 2.7061, "num_input_tokens_seen": 3889296, "step": 778 }, { "epoch": 1.1818181818181819, "grad_norm": 0.5714861154556274, "learning_rate": 9.821552908453506e-07, "loss": 2.4251, "num_input_tokens_seen": 3901464, "step": 780 }, { "epoch": 1.1818181818181819, "eval_loss": 2.3287835121154785, "eval_runtime": 5.8272, "eval_samples_per_second": 3.432, "eval_steps_per_second": 3.432, "num_input_tokens_seen": 3901464, "step": 780 }, { "epoch": 1.1848484848484848, "grad_norm": 0.6943346261978149, "learning_rate": 9.784657506518078e-07, "loss": 2.6212, "num_input_tokens_seen": 3910656, "step": 782 }, { "epoch": 1.187878787878788, "grad_norm": 0.4821998178958893, "learning_rate": 9.747813067841455e-07, "loss": 2.3086, "num_input_tokens_seen": 3922944, "step": 784 }, { "epoch": 1.190909090909091, "grad_norm": 0.7086381912231445, "learning_rate": 9.711020440018384e-07, "loss": 2.5027, "num_input_tokens_seen": 3931752, "step": 786 }, { "epoch": 1.1939393939393939, "grad_norm": 0.5712624788284302, "learning_rate": 9.674280469451718e-07, "loss": 2.4088, "num_input_tokens_seen": 3942120, "step": 788 }, { "epoch": 1.196969696969697, "grad_norm": 0.6443710327148438, "learning_rate": 9.637594001332956e-07, "loss": 2.3161, "num_input_tokens_seen": 3952248, "step": 790 }, { "epoch": 1.2, "grad_norm": 0.6674967408180237, "learning_rate": 9.600961879622777e-07, "loss": 2.4837, "num_input_tokens_seen": 3960600, "step": 792 }, { "epoch": 1.2030303030303031, "grad_norm": 0.5792006254196167, "learning_rate": 9.564384947031646e-07, "loss": 2.3195, "num_input_tokens_seen": 3971568, "step": 794 }, { "epoch": 1.206060606060606, "grad_norm": 0.7185015082359314, "learning_rate": 9.527864045000421e-07, "loss": 2.5749, "num_input_tokens_seen": 3983592, "step": 796 }, { "epoch": 1.209090909090909, "grad_norm": 0.6423861980438232, "learning_rate": 9.491400013680988e-07, "loss": 2.39, "num_input_tokens_seen": 3994008, "step": 798 }, { "epoch": 1.2121212121212122, "grad_norm": 0.6292434334754944, "learning_rate": 9.454993691916948e-07, "loss": 2.3579, "num_input_tokens_seen": 4004496, "step": 800 }, { "epoch": 1.2121212121212122, "eval_loss": 2.3282077312469482, "eval_runtime": 5.8254, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4004496, "step": 800 }, { "epoch": 1.215151515151515, "grad_norm": 0.6097608208656311, "learning_rate": 9.418645917224303e-07, "loss": 2.3152, "num_input_tokens_seen": 4016592, "step": 802 }, { "epoch": 1.2181818181818183, "grad_norm": 0.5774179100990295, "learning_rate": 9.382357525772202e-07, "loss": 2.4599, "num_input_tokens_seen": 4024800, "step": 804 }, { "epoch": 1.2212121212121212, "grad_norm": 0.645380973815918, "learning_rate": 9.346129352363705e-07, "loss": 2.2412, "num_input_tokens_seen": 4035144, "step": 806 }, { "epoch": 1.2242424242424241, "grad_norm": 0.6434935331344604, "learning_rate": 9.309962230416574e-07, "loss": 2.4022, "num_input_tokens_seen": 4042920, "step": 808 }, { "epoch": 1.2272727272727273, "grad_norm": 0.5125094056129456, "learning_rate": 9.273856991944089e-07, "loss": 2.4082, "num_input_tokens_seen": 4053072, "step": 810 }, { "epoch": 1.2303030303030302, "grad_norm": 0.5167670845985413, "learning_rate": 9.237814467535941e-07, "loss": 2.3188, "num_input_tokens_seen": 4063368, "step": 812 }, { "epoch": 1.2333333333333334, "grad_norm": 0.5533791184425354, "learning_rate": 9.201835486339084e-07, "loss": 2.4367, "num_input_tokens_seen": 4072392, "step": 814 }, { "epoch": 1.2363636363636363, "grad_norm": 0.5429077744483948, "learning_rate": 9.165920876038694e-07, "loss": 2.3054, "num_input_tokens_seen": 4083072, "step": 816 }, { "epoch": 1.2393939393939393, "grad_norm": 0.530968427658081, "learning_rate": 9.130071462839108e-07, "loss": 2.4475, "num_input_tokens_seen": 4093776, "step": 818 }, { "epoch": 1.2424242424242424, "grad_norm": 0.5137664675712585, "learning_rate": 9.094288071444822e-07, "loss": 2.4868, "num_input_tokens_seen": 4106040, "step": 820 }, { "epoch": 1.2424242424242424, "eval_loss": 2.3283748626708984, "eval_runtime": 5.8265, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4106040, "step": 820 }, { "epoch": 1.2454545454545454, "grad_norm": 0.7732150554656982, "learning_rate": 9.058571525041534e-07, "loss": 2.4682, "num_input_tokens_seen": 4117392, "step": 822 }, { "epoch": 1.2484848484848485, "grad_norm": 0.6861566305160522, "learning_rate": 9.022922645277176e-07, "loss": 2.372, "num_input_tokens_seen": 4125696, "step": 824 }, { "epoch": 1.2515151515151515, "grad_norm": 0.4728741943836212, "learning_rate": 8.987342252243042e-07, "loss": 2.4424, "num_input_tokens_seen": 4137816, "step": 826 }, { "epoch": 1.2545454545454544, "grad_norm": 0.5557587742805481, "learning_rate": 8.951831164454908e-07, "loss": 2.4164, "num_input_tokens_seen": 4150104, "step": 828 }, { "epoch": 1.2575757575757576, "grad_norm": 0.6730014085769653, "learning_rate": 8.916390198834203e-07, "loss": 2.4451, "num_input_tokens_seen": 4160832, "step": 830 }, { "epoch": 1.2606060606060607, "grad_norm": 0.7126666307449341, "learning_rate": 8.88102017068922e-07, "loss": 2.3256, "num_input_tokens_seen": 4170216, "step": 832 }, { "epoch": 1.2636363636363637, "grad_norm": 0.6457303762435913, "learning_rate": 8.845721893696354e-07, "loss": 2.2176, "num_input_tokens_seen": 4181256, "step": 834 }, { "epoch": 1.2666666666666666, "grad_norm": 1.0662436485290527, "learning_rate": 8.810496179881387e-07, "loss": 2.3812, "num_input_tokens_seen": 4192128, "step": 836 }, { "epoch": 1.2696969696969698, "grad_norm": 0.4683075547218323, "learning_rate": 8.775343839600816e-07, "loss": 2.4275, "num_input_tokens_seen": 4202208, "step": 838 }, { "epoch": 1.2727272727272727, "grad_norm": 0.5171107649803162, "learning_rate": 8.740265681523195e-07, "loss": 2.4706, "num_input_tokens_seen": 4210464, "step": 840 }, { "epoch": 1.2727272727272727, "eval_loss": 2.3279545307159424, "eval_runtime": 5.823, "eval_samples_per_second": 3.435, "eval_steps_per_second": 3.435, "num_input_tokens_seen": 4210464, "step": 840 }, { "epoch": 1.2757575757575759, "grad_norm": 0.7313932180404663, "learning_rate": 8.705262512610539e-07, "loss": 2.4054, "num_input_tokens_seen": 4217928, "step": 842 }, { "epoch": 1.2787878787878788, "grad_norm": 0.7015888690948486, "learning_rate": 8.670335138099765e-07, "loss": 2.4653, "num_input_tokens_seen": 4226904, "step": 844 }, { "epoch": 1.2818181818181817, "grad_norm": 0.6179009079933167, "learning_rate": 8.635484361484158e-07, "loss": 2.3184, "num_input_tokens_seen": 4237656, "step": 846 }, { "epoch": 1.284848484848485, "grad_norm": 0.5112322568893433, "learning_rate": 8.600710984494909e-07, "loss": 2.3415, "num_input_tokens_seen": 4248720, "step": 848 }, { "epoch": 1.2878787878787878, "grad_norm": 0.7824225425720215, "learning_rate": 8.56601580708263e-07, "loss": 2.6382, "num_input_tokens_seen": 4253448, "step": 850 }, { "epoch": 1.290909090909091, "grad_norm": 0.7822674512863159, "learning_rate": 8.531399627398991e-07, "loss": 2.5681, "num_input_tokens_seen": 4261488, "step": 852 }, { "epoch": 1.293939393939394, "grad_norm": 0.5791777968406677, "learning_rate": 8.496863241778346e-07, "loss": 2.2039, "num_input_tokens_seen": 4273104, "step": 854 }, { "epoch": 1.2969696969696969, "grad_norm": 0.5415911674499512, "learning_rate": 8.462407444719405e-07, "loss": 2.3936, "num_input_tokens_seen": 4283136, "step": 856 }, { "epoch": 1.3, "grad_norm": 0.5852922797203064, "learning_rate": 8.428033028866967e-07, "loss": 2.3669, "num_input_tokens_seen": 4292208, "step": 858 }, { "epoch": 1.303030303030303, "grad_norm": 0.5799878239631653, "learning_rate": 8.393740784993677e-07, "loss": 2.4704, "num_input_tokens_seen": 4302240, "step": 860 }, { "epoch": 1.303030303030303, "eval_loss": 2.3276970386505127, "eval_runtime": 5.8227, "eval_samples_per_second": 3.435, "eval_steps_per_second": 3.435, "num_input_tokens_seen": 4302240, "step": 860 }, { "epoch": 1.3060606060606061, "grad_norm": 0.7296667695045471, "learning_rate": 8.359531501981846e-07, "loss": 2.7633, "num_input_tokens_seen": 4311888, "step": 862 }, { "epoch": 1.309090909090909, "grad_norm": 0.9460285305976868, "learning_rate": 8.325405966805295e-07, "loss": 2.1671, "num_input_tokens_seen": 4321992, "step": 864 }, { "epoch": 1.312121212121212, "grad_norm": 0.5294950008392334, "learning_rate": 8.291364964511247e-07, "loss": 2.4139, "num_input_tokens_seen": 4332408, "step": 866 }, { "epoch": 1.3151515151515152, "grad_norm": 0.6206031441688538, "learning_rate": 8.25740927820228e-07, "loss": 2.5621, "num_input_tokens_seen": 4344696, "step": 868 }, { "epoch": 1.3181818181818181, "grad_norm": 0.5652275085449219, "learning_rate": 8.223539689018299e-07, "loss": 2.4142, "num_input_tokens_seen": 4356168, "step": 870 }, { "epoch": 1.3212121212121213, "grad_norm": 0.6217209696769714, "learning_rate": 8.189756976118568e-07, "loss": 2.3459, "num_input_tokens_seen": 4364568, "step": 872 }, { "epoch": 1.3242424242424242, "grad_norm": 0.5359376072883606, "learning_rate": 8.156061916663807e-07, "loss": 2.2973, "num_input_tokens_seen": 4374984, "step": 874 }, { "epoch": 1.3272727272727272, "grad_norm": 0.531065821647644, "learning_rate": 8.12245528579828e-07, "loss": 2.5294, "num_input_tokens_seen": 4385424, "step": 876 }, { "epoch": 1.3303030303030303, "grad_norm": 0.837188184261322, "learning_rate": 8.088937856631974e-07, "loss": 2.4239, "num_input_tokens_seen": 4395192, "step": 878 }, { "epoch": 1.3333333333333333, "grad_norm": 0.6112043857574463, "learning_rate": 8.055510400222836e-07, "loss": 2.4403, "num_input_tokens_seen": 4405608, "step": 880 }, { "epoch": 1.3333333333333333, "eval_loss": 2.328122138977051, "eval_runtime": 5.8246, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 4405608, "step": 880 }, { "epoch": 1.3363636363636364, "grad_norm": 0.6124045252799988, "learning_rate": 8.022173685559011e-07, "loss": 2.389, "num_input_tokens_seen": 4417896, "step": 882 }, { "epoch": 1.3393939393939394, "grad_norm": 0.6339285969734192, "learning_rate": 7.988928479541154e-07, "loss": 2.3811, "num_input_tokens_seen": 4428000, "step": 884 }, { "epoch": 1.3424242424242423, "grad_norm": 0.5700270533561707, "learning_rate": 7.955775546964797e-07, "loss": 2.4351, "num_input_tokens_seen": 4436736, "step": 886 }, { "epoch": 1.3454545454545455, "grad_norm": 0.5536416172981262, "learning_rate": 7.922715650502746e-07, "loss": 2.4343, "num_input_tokens_seen": 4447488, "step": 888 }, { "epoch": 1.3484848484848486, "grad_norm": 0.6863646507263184, "learning_rate": 7.889749550687552e-07, "loss": 2.5435, "num_input_tokens_seen": 4455840, "step": 890 }, { "epoch": 1.3515151515151516, "grad_norm": 0.6737553477287292, "learning_rate": 7.856878005893988e-07, "loss": 2.3398, "num_input_tokens_seen": 4463568, "step": 892 }, { "epoch": 1.3545454545454545, "grad_norm": 0.7057380676269531, "learning_rate": 7.824101772321625e-07, "loss": 2.3618, "num_input_tokens_seen": 4472904, "step": 894 }, { "epoch": 1.3575757575757577, "grad_norm": 0.47144582867622375, "learning_rate": 7.791421603977435e-07, "loss": 2.1904, "num_input_tokens_seen": 4484400, "step": 896 }, { "epoch": 1.3606060606060606, "grad_norm": 0.5720792412757874, "learning_rate": 7.758838252658433e-07, "loss": 2.3122, "num_input_tokens_seen": 4493592, "step": 898 }, { "epoch": 1.3636363636363638, "grad_norm": 0.6241214275360107, "learning_rate": 7.726352467934386e-07, "loss": 2.4964, "num_input_tokens_seen": 4502664, "step": 900 }, { "epoch": 1.3636363636363638, "eval_loss": 2.327789783477783, "eval_runtime": 5.8265, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4502664, "step": 900 }, { "epoch": 1.3666666666666667, "grad_norm": 0.6216875910758972, "learning_rate": 7.693964997130581e-07, "loss": 2.4142, "num_input_tokens_seen": 4510920, "step": 902 }, { "epoch": 1.3696969696969696, "grad_norm": 0.5733647346496582, "learning_rate": 7.661676585310618e-07, "loss": 2.3751, "num_input_tokens_seen": 4523208, "step": 904 }, { "epoch": 1.3727272727272728, "grad_norm": 0.5904967784881592, "learning_rate": 7.629487975259276e-07, "loss": 2.5808, "num_input_tokens_seen": 4532520, "step": 906 }, { "epoch": 1.3757575757575757, "grad_norm": 0.44976285099983215, "learning_rate": 7.597399907465431e-07, "loss": 2.3199, "num_input_tokens_seen": 4544688, "step": 908 }, { "epoch": 1.378787878787879, "grad_norm": 0.6326127052307129, "learning_rate": 7.565413120105009e-07, "loss": 2.3752, "num_input_tokens_seen": 4554000, "step": 910 }, { "epoch": 1.3818181818181818, "grad_norm": 0.5754263997077942, "learning_rate": 7.533528349024014e-07, "loss": 2.3512, "num_input_tokens_seen": 4564368, "step": 912 }, { "epoch": 1.3848484848484848, "grad_norm": 0.7068946957588196, "learning_rate": 7.5017463277216e-07, "loss": 2.3772, "num_input_tokens_seen": 4574448, "step": 914 }, { "epoch": 1.387878787878788, "grad_norm": 0.6131560206413269, "learning_rate": 7.470067787333188e-07, "loss": 2.4036, "num_input_tokens_seen": 4582464, "step": 916 }, { "epoch": 1.3909090909090909, "grad_norm": 0.6577942967414856, "learning_rate": 7.43849345661367e-07, "loss": 2.3063, "num_input_tokens_seen": 4592976, "step": 918 }, { "epoch": 1.393939393939394, "grad_norm": 0.7147580981254578, "learning_rate": 7.407024061920599e-07, "loss": 2.4129, "num_input_tokens_seen": 4603920, "step": 920 }, { "epoch": 1.393939393939394, "eval_loss": 2.32749080657959, "eval_runtime": 5.8263, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4603920, "step": 920 }, { "epoch": 1.396969696969697, "grad_norm": 0.5320861339569092, "learning_rate": 7.375660327197534e-07, "loss": 2.3207, "num_input_tokens_seen": 4614072, "step": 922 }, { "epoch": 1.4, "grad_norm": 0.6758208870887756, "learning_rate": 7.344402973957346e-07, "loss": 2.4536, "num_input_tokens_seen": 4622640, "step": 924 }, { "epoch": 1.403030303030303, "grad_norm": 0.5670093894004822, "learning_rate": 7.313252721265638e-07, "loss": 2.5495, "num_input_tokens_seen": 4634040, "step": 926 }, { "epoch": 1.406060606060606, "grad_norm": 0.5245952606201172, "learning_rate": 7.282210285724195e-07, "loss": 2.4487, "num_input_tokens_seen": 4644192, "step": 928 }, { "epoch": 1.4090909090909092, "grad_norm": 0.4705655872821808, "learning_rate": 7.251276381454506e-07, "loss": 2.5896, "num_input_tokens_seen": 4653720, "step": 930 }, { "epoch": 1.412121212121212, "grad_norm": 0.5075128674507141, "learning_rate": 7.22045172008133e-07, "loss": 2.261, "num_input_tokens_seen": 4666008, "step": 932 }, { "epoch": 1.415151515151515, "grad_norm": 0.5407282710075378, "learning_rate": 7.189737010716326e-07, "loss": 2.384, "num_input_tokens_seen": 4674936, "step": 934 }, { "epoch": 1.4181818181818182, "grad_norm": 0.6681150794029236, "learning_rate": 7.159132959941745e-07, "loss": 2.4542, "num_input_tokens_seen": 4684272, "step": 936 }, { "epoch": 1.4212121212121211, "grad_norm": 0.6024764776229858, "learning_rate": 7.128640271794171e-07, "loss": 2.3937, "num_input_tokens_seen": 4695576, "step": 938 }, { "epoch": 1.4242424242424243, "grad_norm": 0.5031726956367493, "learning_rate": 7.098259647748328e-07, "loss": 2.2943, "num_input_tokens_seen": 4705800, "step": 940 }, { "epoch": 1.4242424242424243, "eval_loss": 2.3277194499969482, "eval_runtime": 5.8264, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4705800, "step": 940 }, { "epoch": 1.4272727272727272, "grad_norm": 0.5406504273414612, "learning_rate": 7.067991786700929e-07, "loss": 2.3552, "num_input_tokens_seen": 4718088, "step": 942 }, { "epoch": 1.4303030303030302, "grad_norm": 0.5154955387115479, "learning_rate": 7.037837384954625e-07, "loss": 2.4507, "num_input_tokens_seen": 4729536, "step": 944 }, { "epoch": 1.4333333333333333, "grad_norm": 0.710150420665741, "learning_rate": 7.007797136201966e-07, "loss": 2.4813, "num_input_tokens_seen": 4738272, "step": 946 }, { "epoch": 1.4363636363636363, "grad_norm": 0.5603686571121216, "learning_rate": 6.977871731509438e-07, "loss": 2.4679, "num_input_tokens_seen": 4747488, "step": 948 }, { "epoch": 1.4393939393939394, "grad_norm": 0.6040205359458923, "learning_rate": 6.948061859301593e-07, "loss": 2.5084, "num_input_tokens_seen": 4756032, "step": 950 }, { "epoch": 1.4424242424242424, "grad_norm": 0.6151003837585449, "learning_rate": 6.918368205345182e-07, "loss": 2.3797, "num_input_tokens_seen": 4766904, "step": 952 }, { "epoch": 1.4454545454545453, "grad_norm": 0.5921849012374878, "learning_rate": 6.888791452733397e-07, "loss": 2.4923, "num_input_tokens_seen": 4777680, "step": 954 }, { "epoch": 1.4484848484848485, "grad_norm": 0.5749545693397522, "learning_rate": 6.859332281870147e-07, "loss": 2.5362, "num_input_tokens_seen": 4788432, "step": 956 }, { "epoch": 1.4515151515151516, "grad_norm": 0.5609776973724365, "learning_rate": 6.829991370454411e-07, "loss": 2.433, "num_input_tokens_seen": 4799712, "step": 958 }, { "epoch": 1.4545454545454546, "grad_norm": 0.6038674116134644, "learning_rate": 6.800769393464656e-07, "loss": 2.362, "num_input_tokens_seen": 4808688, "step": 960 }, { "epoch": 1.4545454545454546, "eval_loss": 2.3274452686309814, "eval_runtime": 5.8255, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4808688, "step": 960 }, { "epoch": 1.4575757575757575, "grad_norm": 0.6705885529518127, "learning_rate": 6.771667023143284e-07, "loss": 2.5027, "num_input_tokens_seen": 4817136, "step": 962 }, { "epoch": 1.4606060606060607, "grad_norm": 0.6026042699813843, "learning_rate": 6.742684928981188e-07, "loss": 2.6941, "num_input_tokens_seen": 4829112, "step": 964 }, { "epoch": 1.4636363636363636, "grad_norm": 0.5220550894737244, "learning_rate": 6.713823777702359e-07, "loss": 2.2785, "num_input_tokens_seen": 4838664, "step": 966 }, { "epoch": 1.4666666666666668, "grad_norm": 0.6457405090332031, "learning_rate": 6.685084233248517e-07, "loss": 2.502, "num_input_tokens_seen": 4846656, "step": 968 }, { "epoch": 1.4696969696969697, "grad_norm": 0.831514298915863, "learning_rate": 6.656466956763864e-07, "loss": 2.4094, "num_input_tokens_seen": 4855296, "step": 970 }, { "epoch": 1.4727272727272727, "grad_norm": 0.623429000377655, "learning_rate": 6.627972606579866e-07, "loss": 2.3646, "num_input_tokens_seen": 4867584, "step": 972 }, { "epoch": 1.4757575757575758, "grad_norm": 0.6878921389579773, "learning_rate": 6.599601838200104e-07, "loss": 2.3642, "num_input_tokens_seen": 4879584, "step": 974 }, { "epoch": 1.4787878787878788, "grad_norm": 0.8445355296134949, "learning_rate": 6.571355304285202e-07, "loss": 2.571, "num_input_tokens_seen": 4889976, "step": 976 }, { "epoch": 1.481818181818182, "grad_norm": 0.5575315356254578, "learning_rate": 6.543233654637804e-07, "loss": 2.5749, "num_input_tokens_seen": 4899048, "step": 978 }, { "epoch": 1.4848484848484849, "grad_norm": 0.5096350312232971, "learning_rate": 6.515237536187644e-07, "loss": 2.2386, "num_input_tokens_seen": 4910088, "step": 980 }, { "epoch": 1.4848484848484849, "eval_loss": 2.3277652263641357, "eval_runtime": 5.8263, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 4910088, "step": 980 }, { "epoch": 1.4878787878787878, "grad_norm": 0.7003534436225891, "learning_rate": 6.487367592976633e-07, "loss": 2.5641, "num_input_tokens_seen": 4922376, "step": 982 }, { "epoch": 1.490909090909091, "grad_norm": 0.5951968431472778, "learning_rate": 6.459624466144067e-07, "loss": 2.298, "num_input_tokens_seen": 4934664, "step": 984 }, { "epoch": 1.493939393939394, "grad_norm": 0.7097399234771729, "learning_rate": 6.432008793911877e-07, "loss": 2.3938, "num_input_tokens_seen": 4943352, "step": 986 }, { "epoch": 1.496969696969697, "grad_norm": 0.5688740015029907, "learning_rate": 6.404521211569937e-07, "loss": 2.421, "num_input_tokens_seen": 4953888, "step": 988 }, { "epoch": 1.5, "grad_norm": 0.6089447736740112, "learning_rate": 6.377162351461442e-07, "loss": 2.1273, "num_input_tokens_seen": 4965024, "step": 990 }, { "epoch": 1.503030303030303, "grad_norm": 0.5698357224464417, "learning_rate": 6.349932842968391e-07, "loss": 2.3928, "num_input_tokens_seen": 4977216, "step": 992 }, { "epoch": 1.506060606060606, "grad_norm": 0.6300851702690125, "learning_rate": 6.322833312497082e-07, "loss": 2.3595, "num_input_tokens_seen": 4986720, "step": 994 }, { "epoch": 1.509090909090909, "grad_norm": 0.5977615714073181, "learning_rate": 6.295864383463705e-07, "loss": 2.5852, "num_input_tokens_seen": 4995072, "step": 996 }, { "epoch": 1.5121212121212122, "grad_norm": 0.6872332096099854, "learning_rate": 6.269026676280008e-07, "loss": 2.4611, "num_input_tokens_seen": 5003256, "step": 998 }, { "epoch": 1.5151515151515151, "grad_norm": 0.7128229141235352, "learning_rate": 6.242320808339023e-07, "loss": 2.0949, "num_input_tokens_seen": 5010864, "step": 1000 }, { "epoch": 1.5151515151515151, "eval_loss": 2.3277881145477295, "eval_runtime": 5.8265, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "num_input_tokens_seen": 5010864, "step": 1000 }, { "epoch": 1.518181818181818, "grad_norm": 0.48600301146507263, "learning_rate": 6.215747394000864e-07, "loss": 2.2478, "num_input_tokens_seen": 5021400, "step": 1002 }, { "epoch": 1.5212121212121212, "grad_norm": 0.6063188314437866, "learning_rate": 6.189307044578585e-07, "loss": 2.1912, "num_input_tokens_seen": 5031576, "step": 1004 }, { "epoch": 1.5242424242424244, "grad_norm": 0.6136674284934998, "learning_rate": 6.163000368324124e-07, "loss": 2.3441, "num_input_tokens_seen": 5042136, "step": 1006 }, { "epoch": 1.5272727272727273, "grad_norm": 0.6810842156410217, "learning_rate": 6.136827970414317e-07, "loss": 2.3444, "num_input_tokens_seen": 5052480, "step": 1008 }, { "epoch": 1.5303030303030303, "grad_norm": 0.70346599817276, "learning_rate": 6.11079045293696e-07, "loss": 2.5014, "num_input_tokens_seen": 5062872, "step": 1010 }, { "epoch": 1.5333333333333332, "grad_norm": 0.6263840198516846, "learning_rate": 6.084888414876976e-07, "loss": 2.2427, "num_input_tokens_seen": 5073744, "step": 1012 }, { "epoch": 1.5363636363636364, "grad_norm": 0.6593678593635559, "learning_rate": 6.059122452102618e-07, "loss": 2.3813, "num_input_tokens_seen": 5082432, "step": 1014 }, { "epoch": 1.5393939393939395, "grad_norm": 0.521698534488678, "learning_rate": 6.033493157351772e-07, "loss": 2.6378, "num_input_tokens_seen": 5092848, "step": 1016 }, { "epoch": 1.5424242424242425, "grad_norm": 0.46363523602485657, "learning_rate": 6.008001120218322e-07, "loss": 2.4006, "num_input_tokens_seen": 5105136, "step": 1018 }, { "epoch": 1.5454545454545454, "grad_norm": 0.7737420797348022, "learning_rate": 5.982646927138584e-07, "loss": 2.5504, "num_input_tokens_seen": 5114064, "step": 1020 }, { "epoch": 1.5454545454545454, "eval_loss": 2.3275692462921143, "eval_runtime": 5.8238, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 5114064, "step": 1020 }, { "epoch": 1.5484848484848484, "grad_norm": 0.6213299036026001, "learning_rate": 5.957431161377809e-07, "loss": 2.4085, "num_input_tokens_seen": 5125872, "step": 1022 }, { "epoch": 1.5515151515151515, "grad_norm": 0.7610370516777039, "learning_rate": 5.932354403016777e-07, "loss": 2.263, "num_input_tokens_seen": 5135208, "step": 1024 }, { "epoch": 1.5545454545454547, "grad_norm": 0.5635423064231873, "learning_rate": 5.907417228938442e-07, "loss": 2.352, "num_input_tokens_seen": 5146896, "step": 1026 }, { "epoch": 1.5575757575757576, "grad_norm": 0.5265647768974304, "learning_rate": 5.88262021281467e-07, "loss": 2.3172, "num_input_tokens_seen": 5159184, "step": 1028 }, { "epoch": 1.5606060606060606, "grad_norm": 0.8375009298324585, "learning_rate": 5.857963925093034e-07, "loss": 2.4402, "num_input_tokens_seen": 5167656, "step": 1030 }, { "epoch": 1.5636363636363635, "grad_norm": 0.5335946679115295, "learning_rate": 5.833448932983693e-07, "loss": 2.5926, "num_input_tokens_seen": 5179680, "step": 1032 }, { "epoch": 1.5666666666666667, "grad_norm": 0.8245714902877808, "learning_rate": 5.809075800446348e-07, "loss": 2.5999, "num_input_tokens_seen": 5190216, "step": 1034 }, { "epoch": 1.5696969696969698, "grad_norm": 0.5047762393951416, "learning_rate": 5.784845088177263e-07, "loss": 2.379, "num_input_tokens_seen": 5201592, "step": 1036 }, { "epoch": 1.5727272727272728, "grad_norm": 0.5322418212890625, "learning_rate": 5.760757353596371e-07, "loss": 2.3246, "num_input_tokens_seen": 5213040, "step": 1038 }, { "epoch": 1.5757575757575757, "grad_norm": 0.47743648290634155, "learning_rate": 5.736813150834447e-07, "loss": 2.4542, "num_input_tokens_seen": 5223360, "step": 1040 }, { "epoch": 1.5757575757575757, "eval_loss": 2.3277275562286377, "eval_runtime": 5.824, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 5223360, "step": 1040 }, { "epoch": 1.5787878787878786, "grad_norm": 0.4745235741138458, "learning_rate": 5.713013030720356e-07, "loss": 2.3253, "num_input_tokens_seen": 5235480, "step": 1042 }, { "epoch": 1.5818181818181818, "grad_norm": 0.521117091178894, "learning_rate": 5.6893575407684e-07, "loss": 2.3232, "num_input_tokens_seen": 5246280, "step": 1044 }, { "epoch": 1.584848484848485, "grad_norm": 0.6688542366027832, "learning_rate": 5.665847225165695e-07, "loss": 2.323, "num_input_tokens_seen": 5257248, "step": 1046 }, { "epoch": 1.587878787878788, "grad_norm": 0.6905980706214905, "learning_rate": 5.642482624759672e-07, "loss": 2.6128, "num_input_tokens_seen": 5268264, "step": 1048 }, { "epoch": 1.5909090909090908, "grad_norm": 0.555060088634491, "learning_rate": 5.619264277045634e-07, "loss": 2.5484, "num_input_tokens_seen": 5280432, "step": 1050 }, { "epoch": 1.593939393939394, "grad_norm": 0.6293858289718628, "learning_rate": 5.596192716154385e-07, "loss": 2.5, "num_input_tokens_seen": 5290488, "step": 1052 }, { "epoch": 1.596969696969697, "grad_norm": 0.9078196883201599, "learning_rate": 5.573268472839937e-07, "loss": 2.4814, "num_input_tokens_seen": 5299536, "step": 1054 }, { "epoch": 1.6, "grad_norm": 0.6941189765930176, "learning_rate": 5.550492074467317e-07, "loss": 2.4972, "num_input_tokens_seen": 5309544, "step": 1056 }, { "epoch": 1.603030303030303, "grad_norm": 0.6833639740943909, "learning_rate": 5.527864045000421e-07, "loss": 2.5041, "num_input_tokens_seen": 5319024, "step": 1058 }, { "epoch": 1.606060606060606, "grad_norm": 0.6468996405601501, "learning_rate": 5.505384904989965e-07, "loss": 2.3262, "num_input_tokens_seen": 5329752, "step": 1060 }, { "epoch": 1.606060606060606, "eval_loss": 2.327099323272705, "eval_runtime": 5.8238, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 5329752, "step": 1060 }, { "epoch": 1.6090909090909091, "grad_norm": 0.7046562433242798, "learning_rate": 5.483055171561511e-07, "loss": 2.2181, "num_input_tokens_seen": 5340552, "step": 1062 }, { "epoch": 1.612121212121212, "grad_norm": 0.48583197593688965, "learning_rate": 5.460875358403565e-07, "loss": 2.3349, "num_input_tokens_seen": 5350320, "step": 1064 }, { "epoch": 1.6151515151515152, "grad_norm": 0.6768611073493958, "learning_rate": 5.438845975755772e-07, "loss": 2.4784, "num_input_tokens_seen": 5356608, "step": 1066 }, { "epoch": 1.6181818181818182, "grad_norm": 0.6648526191711426, "learning_rate": 5.416967530397164e-07, "loss": 2.2265, "num_input_tokens_seen": 5366568, "step": 1068 }, { "epoch": 1.621212121212121, "grad_norm": 0.5271417498588562, "learning_rate": 5.395240525634511e-07, "loss": 2.4877, "num_input_tokens_seen": 5378856, "step": 1070 }, { "epoch": 1.6242424242424243, "grad_norm": 0.5848326086997986, "learning_rate": 5.37366546129074e-07, "loss": 2.3169, "num_input_tokens_seen": 5391120, "step": 1072 }, { "epoch": 1.6272727272727274, "grad_norm": 0.5480791330337524, "learning_rate": 5.35224283369343e-07, "loss": 2.4456, "num_input_tokens_seen": 5398752, "step": 1074 }, { "epoch": 1.6303030303030304, "grad_norm": 0.47689610719680786, "learning_rate": 5.330973135663411e-07, "loss": 2.5053, "num_input_tokens_seen": 5411040, "step": 1076 }, { "epoch": 1.6333333333333333, "grad_norm": 0.5623081922531128, "learning_rate": 5.309856856503409e-07, "loss": 2.4062, "num_input_tokens_seen": 5422848, "step": 1078 }, { "epoch": 1.6363636363636362, "grad_norm": 0.539359450340271, "learning_rate": 5.2888944819868e-07, "loss": 2.2278, "num_input_tokens_seen": 5435136, "step": 1080 }, { "epoch": 1.6363636363636362, "eval_loss": 2.32759428024292, "eval_runtime": 5.8229, "eval_samples_per_second": 3.435, "eval_steps_per_second": 3.435, "num_input_tokens_seen": 5435136, "step": 1080 }, { "epoch": 1.6393939393939394, "grad_norm": 0.5953027009963989, "learning_rate": 5.26808649434643e-07, "loss": 2.3976, "num_input_tokens_seen": 5445672, "step": 1082 }, { "epoch": 1.6424242424242426, "grad_norm": 0.5432310700416565, "learning_rate": 5.247433372263522e-07, "loss": 2.4648, "num_input_tokens_seen": 5456640, "step": 1084 }, { "epoch": 1.6454545454545455, "grad_norm": 0.5668439865112305, "learning_rate": 5.226935590856675e-07, "loss": 2.2962, "num_input_tokens_seen": 5465976, "step": 1086 }, { "epoch": 1.6484848484848484, "grad_norm": 0.5815810561180115, "learning_rate": 5.20659362167091e-07, "loss": 2.3107, "num_input_tokens_seen": 5477016, "step": 1088 }, { "epoch": 1.6515151515151514, "grad_norm": 0.5914052724838257, "learning_rate": 5.186407932666846e-07, "loss": 2.2394, "num_input_tokens_seen": 5487504, "step": 1090 }, { "epoch": 1.6545454545454545, "grad_norm": 0.8601570129394531, "learning_rate": 5.166378988209924e-07, "loss": 2.6481, "num_input_tokens_seen": 5496600, "step": 1092 }, { "epoch": 1.6575757575757577, "grad_norm": 0.6369432210922241, "learning_rate": 5.146507249059727e-07, "loss": 2.5754, "num_input_tokens_seen": 5506416, "step": 1094 }, { "epoch": 1.6606060606060606, "grad_norm": 0.712243914604187, "learning_rate": 5.126793172359373e-07, "loss": 2.3295, "num_input_tokens_seen": 5514600, "step": 1096 }, { "epoch": 1.6636363636363636, "grad_norm": 0.6746931672096252, "learning_rate": 5.107237211625016e-07, "loss": 2.3752, "num_input_tokens_seen": 5522616, "step": 1098 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5909104943275452, "learning_rate": 5.087839816735391e-07, "loss": 2.2484, "num_input_tokens_seen": 5533488, "step": 1100 }, { "epoch": 1.6666666666666665, "eval_loss": 2.326948642730713, "eval_runtime": 5.8207, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 5533488, "step": 1100 }, { "epoch": 1.6696969696969697, "grad_norm": 0.578524112701416, "learning_rate": 5.068601433921479e-07, "loss": 2.392, "num_input_tokens_seen": 5544864, "step": 1102 }, { "epoch": 1.6727272727272728, "grad_norm": 0.6614283323287964, "learning_rate": 5.04952250575624e-07, "loss": 2.4998, "num_input_tokens_seen": 5555928, "step": 1104 }, { "epoch": 1.6757575757575758, "grad_norm": 0.5955278277397156, "learning_rate": 5.030603471144432e-07, "loss": 2.3944, "num_input_tokens_seen": 5567088, "step": 1106 }, { "epoch": 1.6787878787878787, "grad_norm": 0.5927826166152954, "learning_rate": 5.011844765312504e-07, "loss": 2.487, "num_input_tokens_seen": 5578128, "step": 1108 }, { "epoch": 1.6818181818181817, "grad_norm": 0.6427227258682251, "learning_rate": 4.9932468197986e-07, "loss": 2.5279, "num_input_tokens_seen": 5588952, "step": 1110 }, { "epoch": 1.6848484848484848, "grad_norm": 0.49643516540527344, "learning_rate": 4.974810062442615e-07, "loss": 2.4558, "num_input_tokens_seen": 5599992, "step": 1112 }, { "epoch": 1.687878787878788, "grad_norm": 0.5617672204971313, "learning_rate": 4.956534917376373e-07, "loss": 2.3407, "num_input_tokens_seen": 5611752, "step": 1114 }, { "epoch": 1.690909090909091, "grad_norm": 0.7746953368186951, "learning_rate": 4.938421805013844e-07, "loss": 2.4067, "num_input_tokens_seen": 5619072, "step": 1116 }, { "epoch": 1.6939393939393939, "grad_norm": 0.6146767139434814, "learning_rate": 4.920471142041496e-07, "loss": 2.2224, "num_input_tokens_seen": 5629824, "step": 1118 }, { "epoch": 1.696969696969697, "grad_norm": 0.7500237822532654, "learning_rate": 4.902683341408698e-07, "loss": 2.4764, "num_input_tokens_seen": 5639376, "step": 1120 }, { "epoch": 1.696969696969697, "eval_loss": 2.327069044113159, "eval_runtime": 5.8204, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 5639376, "step": 1120 }, { "epoch": 1.7, "grad_norm": 0.512405276298523, "learning_rate": 4.88505881231822e-07, "loss": 2.4383, "num_input_tokens_seen": 5649624, "step": 1122 }, { "epoch": 1.7030303030303031, "grad_norm": 0.6521934866905212, "learning_rate": 4.867597960216823e-07, "loss": 2.3752, "num_input_tokens_seen": 5659800, "step": 1124 }, { "epoch": 1.706060606060606, "grad_norm": 0.5437342524528503, "learning_rate": 4.85030118678593e-07, "loss": 2.2427, "num_input_tokens_seen": 5668296, "step": 1126 }, { "epoch": 1.709090909090909, "grad_norm": 0.5007622838020325, "learning_rate": 4.833168889932384e-07, "loss": 2.3027, "num_input_tokens_seen": 5678832, "step": 1128 }, { "epoch": 1.7121212121212122, "grad_norm": 0.9229590892791748, "learning_rate": 4.816201463779299e-07, "loss": 2.4966, "num_input_tokens_seen": 5686872, "step": 1130 }, { "epoch": 1.7151515151515153, "grad_norm": 0.7598445415496826, "learning_rate": 4.799399298656985e-07, "loss": 2.5635, "num_input_tokens_seen": 5697216, "step": 1132 }, { "epoch": 1.7181818181818183, "grad_norm": 0.5250843167304993, "learning_rate": 4.782762781093983e-07, "loss": 2.3295, "num_input_tokens_seen": 5706840, "step": 1134 }, { "epoch": 1.7212121212121212, "grad_norm": 0.7306003570556641, "learning_rate": 4.7662922938081575e-07, "loss": 2.3937, "num_input_tokens_seen": 5715816, "step": 1136 }, { "epoch": 1.7242424242424241, "grad_norm": 0.7364092469215393, "learning_rate": 4.7499882156978934e-07, "loss": 2.3815, "num_input_tokens_seen": 5724456, "step": 1138 }, { "epoch": 1.7272727272727273, "grad_norm": 0.6539096236228943, "learning_rate": 4.7338509218333966e-07, "loss": 2.3489, "num_input_tokens_seen": 5732496, "step": 1140 }, { "epoch": 1.7272727272727273, "eval_loss": 2.326911687850952, "eval_runtime": 5.8239, "eval_samples_per_second": 3.434, "eval_steps_per_second": 3.434, "num_input_tokens_seen": 5732496, "step": 1140 }, { "epoch": 1.7303030303030305, "grad_norm": 0.6865965127944946, "learning_rate": 4.717880783448046e-07, "loss": 2.2154, "num_input_tokens_seen": 5744784, "step": 1142 }, { "epoch": 1.7333333333333334, "grad_norm": 0.6450785994529724, "learning_rate": 4.7020781679298636e-07, "loss": 2.5799, "num_input_tokens_seen": 5752872, "step": 1144 }, { "epoch": 1.7363636363636363, "grad_norm": 0.6152123808860779, "learning_rate": 4.6864434388130604e-07, "loss": 2.4051, "num_input_tokens_seen": 5762880, "step": 1146 }, { "epoch": 1.7393939393939393, "grad_norm": 0.5718716382980347, "learning_rate": 4.6709769557696724e-07, "loss": 2.2532, "num_input_tokens_seen": 5773632, "step": 1148 }, { "epoch": 1.7424242424242424, "grad_norm": 0.6017091274261475, "learning_rate": 4.6556790746012866e-07, "loss": 2.2363, "num_input_tokens_seen": 5784960, "step": 1150 }, { "epoch": 1.7454545454545456, "grad_norm": 0.5728634595870972, "learning_rate": 4.6405501472308593e-07, "loss": 2.264, "num_input_tokens_seen": 5794392, "step": 1152 }, { "epoch": 1.7484848484848485, "grad_norm": 0.7092226147651672, "learning_rate": 4.6255905216946174e-07, "loss": 2.6636, "num_input_tokens_seen": 5801088, "step": 1154 }, { "epoch": 1.7515151515151515, "grad_norm": 0.6607272028923035, "learning_rate": 4.6108005421340517e-07, "loss": 2.3849, "num_input_tokens_seen": 5810232, "step": 1156 }, { "epoch": 1.7545454545454544, "grad_norm": 0.6151024699211121, "learning_rate": 4.5961805487879993e-07, "loss": 2.1526, "num_input_tokens_seen": 5819976, "step": 1158 }, { "epoch": 1.7575757575757576, "grad_norm": 0.5664975047111511, "learning_rate": 4.581730877984817e-07, "loss": 2.3448, "num_input_tokens_seen": 5831304, "step": 1160 }, { "epoch": 1.7575757575757576, "eval_loss": 2.326674699783325, "eval_runtime": 5.817, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 5831304, "step": 1160 }, { "epoch": 1.7606060606060607, "grad_norm": 0.6864150166511536, "learning_rate": 4.567451862134651e-07, "loss": 2.2982, "num_input_tokens_seen": 5841792, "step": 1162 }, { "epoch": 1.7636363636363637, "grad_norm": 0.5514176487922668, "learning_rate": 4.553343829721776e-07, "loss": 2.296, "num_input_tokens_seen": 5852640, "step": 1164 }, { "epoch": 1.7666666666666666, "grad_norm": 0.5415042638778687, "learning_rate": 4.539407105297053e-07, "loss": 2.3767, "num_input_tokens_seen": 5864328, "step": 1166 }, { "epoch": 1.7696969696969695, "grad_norm": 0.7088015675544739, "learning_rate": 4.5256420094704516e-07, "loss": 2.1989, "num_input_tokens_seen": 5873424, "step": 1168 }, { "epoch": 1.7727272727272727, "grad_norm": 0.5956241488456726, "learning_rate": 4.5120488589036816e-07, "loss": 2.2727, "num_input_tokens_seen": 5881608, "step": 1170 }, { "epoch": 1.7757575757575759, "grad_norm": 0.6199578046798706, "learning_rate": 4.498627966302905e-07, "loss": 2.3122, "num_input_tokens_seen": 5892984, "step": 1172 }, { "epoch": 1.7787878787878788, "grad_norm": 0.6161043643951416, "learning_rate": 4.485379640411545e-07, "loss": 2.607, "num_input_tokens_seen": 5903832, "step": 1174 }, { "epoch": 1.7818181818181817, "grad_norm": 0.7086969017982483, "learning_rate": 4.4723041860031803e-07, "loss": 2.444, "num_input_tokens_seen": 5914224, "step": 1176 }, { "epoch": 1.7848484848484847, "grad_norm": 0.5110089182853699, "learning_rate": 4.459401903874538e-07, "loss": 2.462, "num_input_tokens_seen": 5925768, "step": 1178 }, { "epoch": 1.7878787878787878, "grad_norm": 0.6780450344085693, "learning_rate": 4.4466730908385664e-07, "loss": 2.4997, "num_input_tokens_seen": 5934528, "step": 1180 }, { "epoch": 1.7878787878787878, "eval_loss": 2.3263440132141113, "eval_runtime": 5.8197, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 5934528, "step": 1180 }, { "epoch": 1.790909090909091, "grad_norm": 0.6635234355926514, "learning_rate": 4.434118039717616e-07, "loss": 2.2541, "num_input_tokens_seen": 5944224, "step": 1182 }, { "epoch": 1.793939393939394, "grad_norm": 0.6881716251373291, "learning_rate": 4.4217370393366995e-07, "loss": 2.2483, "num_input_tokens_seen": 5954688, "step": 1184 }, { "epoch": 1.7969696969696969, "grad_norm": 1.0131621360778809, "learning_rate": 4.40953037451684e-07, "loss": 2.403, "num_input_tokens_seen": 5964072, "step": 1186 }, { "epoch": 1.8, "grad_norm": 0.5744723677635193, "learning_rate": 4.3974983260685345e-07, "loss": 2.5772, "num_input_tokens_seen": 5975184, "step": 1188 }, { "epoch": 1.803030303030303, "grad_norm": 0.6319069266319275, "learning_rate": 4.3856411707852814e-07, "loss": 2.3809, "num_input_tokens_seen": 5981496, "step": 1190 }, { "epoch": 1.8060606060606061, "grad_norm": 0.49835190176963806, "learning_rate": 4.373959181437216e-07, "loss": 2.3452, "num_input_tokens_seen": 5993088, "step": 1192 }, { "epoch": 1.809090909090909, "grad_norm": 0.825423538684845, "learning_rate": 4.3624526267648363e-07, "loss": 2.2971, "num_input_tokens_seen": 6003864, "step": 1194 }, { "epoch": 1.812121212121212, "grad_norm": 0.5639837384223938, "learning_rate": 4.351121771472823e-07, "loss": 2.1717, "num_input_tokens_seen": 6013824, "step": 1196 }, { "epoch": 1.8151515151515152, "grad_norm": 0.6175968050956726, "learning_rate": 4.3399668762239446e-07, "loss": 2.3326, "num_input_tokens_seen": 6024120, "step": 1198 }, { "epoch": 1.8181818181818183, "grad_norm": 0.5506427884101868, "learning_rate": 4.328988197633066e-07, "loss": 2.311, "num_input_tokens_seen": 6035544, "step": 1200 }, { "epoch": 1.8181818181818183, "eval_loss": 2.326775550842285, "eval_runtime": 5.8185, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 6035544, "step": 1200 }, { "epoch": 1.8212121212121213, "grad_norm": 0.6879149675369263, "learning_rate": 4.3181859882612426e-07, "loss": 2.4867, "num_input_tokens_seen": 6047520, "step": 1202 }, { "epoch": 1.8242424242424242, "grad_norm": 0.9205613136291504, "learning_rate": 4.307560496609911e-07, "loss": 2.5415, "num_input_tokens_seen": 6055488, "step": 1204 }, { "epoch": 1.8272727272727272, "grad_norm": 0.7125353813171387, "learning_rate": 4.297111967115171e-07, "loss": 2.3684, "num_input_tokens_seen": 6063720, "step": 1206 }, { "epoch": 1.8303030303030303, "grad_norm": 0.7578244805335999, "learning_rate": 4.286840640142166e-07, "loss": 2.1882, "num_input_tokens_seen": 6071664, "step": 1208 }, { "epoch": 1.8333333333333335, "grad_norm": 0.5936377644538879, "learning_rate": 4.2767467519795497e-07, "loss": 2.4383, "num_input_tokens_seen": 6081360, "step": 1210 }, { "epoch": 1.8363636363636364, "grad_norm": 0.5857051610946655, "learning_rate": 4.2668305348340495e-07, "loss": 2.2313, "num_input_tokens_seen": 6090624, "step": 1212 }, { "epoch": 1.8393939393939394, "grad_norm": 0.5357300639152527, "learning_rate": 4.2570922168251294e-07, "loss": 2.3837, "num_input_tokens_seen": 6100944, "step": 1214 }, { "epoch": 1.8424242424242423, "grad_norm": 0.8577349185943604, "learning_rate": 4.2475320219797406e-07, "loss": 2.3874, "num_input_tokens_seen": 6108792, "step": 1216 }, { "epoch": 1.8454545454545455, "grad_norm": 0.5311655402183533, "learning_rate": 4.2381501702271623e-07, "loss": 2.3853, "num_input_tokens_seen": 6121080, "step": 1218 }, { "epoch": 1.8484848484848486, "grad_norm": 0.5314241051673889, "learning_rate": 4.228946877393953e-07, "loss": 2.3858, "num_input_tokens_seen": 6131112, "step": 1220 }, { "epoch": 1.8484848484848486, "eval_loss": 2.3265769481658936, "eval_runtime": 5.8173, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "num_input_tokens_seen": 6131112, "step": 1220 }, { "epoch": 1.8515151515151516, "grad_norm": 0.6820886731147766, "learning_rate": 4.219922355198972e-07, "loss": 2.3291, "num_input_tokens_seen": 6141072, "step": 1222 }, { "epoch": 1.8545454545454545, "grad_norm": 0.6875143051147461, "learning_rate": 4.211076811248524e-07, "loss": 2.344, "num_input_tokens_seen": 6152040, "step": 1224 }, { "epoch": 1.8575757575757574, "grad_norm": 0.6124435067176819, "learning_rate": 4.2024104490315696e-07, "loss": 2.275, "num_input_tokens_seen": 6163368, "step": 1226 }, { "epoch": 1.8606060606060606, "grad_norm": 0.6159326434135437, "learning_rate": 4.1939234679150516e-07, "loss": 2.4138, "num_input_tokens_seen": 6171072, "step": 1228 }, { "epoch": 1.8636363636363638, "grad_norm": 0.4833909273147583, "learning_rate": 4.185616063139308e-07, "loss": 2.2974, "num_input_tokens_seen": 6183312, "step": 1230 }, { "epoch": 1.8666666666666667, "grad_norm": 0.8235618472099304, "learning_rate": 4.177488425813578e-07, "loss": 2.4087, "num_input_tokens_seen": 6193104, "step": 1232 }, { "epoch": 1.8696969696969696, "grad_norm": 0.5075482726097107, "learning_rate": 4.1695407429116063e-07, "loss": 2.4328, "num_input_tokens_seen": 6205392, "step": 1234 }, { "epoch": 1.8727272727272726, "grad_norm": 0.5093833208084106, "learning_rate": 4.1617731972673466e-07, "loss": 2.4412, "num_input_tokens_seen": 6215808, "step": 1236 }, { "epoch": 1.8757575757575757, "grad_norm": 0.5927122235298157, "learning_rate": 4.1541859675707454e-07, "loss": 2.2544, "num_input_tokens_seen": 6226224, "step": 1238 }, { "epoch": 1.878787878787879, "grad_norm": 0.6176667809486389, "learning_rate": 4.146779228363644e-07, "loss": 2.3602, "num_input_tokens_seen": 6235464, "step": 1240 }, { "epoch": 1.878787878787879, "eval_loss": 2.3264036178588867, "eval_runtime": 5.821, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "num_input_tokens_seen": 6235464, "step": 1240 }, { "epoch": 1.8818181818181818, "grad_norm": 0.5281220078468323, "learning_rate": 4.139553150035751e-07, "loss": 2.439, "num_input_tokens_seen": 6245400, "step": 1242 }, { "epoch": 1.8848484848484848, "grad_norm": 0.6205955147743225, "learning_rate": 4.1325078988207303e-07, "loss": 2.466, "num_input_tokens_seen": 6252768, "step": 1244 }, { "epoch": 1.887878787878788, "grad_norm": 0.5631701350212097, "learning_rate": 4.1256436367923777e-07, "loss": 2.5193, "num_input_tokens_seen": 6264432, "step": 1246 }, { "epoch": 1.8909090909090909, "grad_norm": 0.6673572659492493, "learning_rate": 4.118960521860884e-07, "loss": 2.4064, "num_input_tokens_seen": 6273264, "step": 1248 }, { "epoch": 1.893939393939394, "grad_norm": 0.6367799639701843, "learning_rate": 4.1124587077692115e-07, "loss": 2.2931, "num_input_tokens_seen": 6284112, "step": 1250 }, { "epoch": 1.896969696969697, "grad_norm": 1.2654261589050293, "learning_rate": 4.106138344089554e-07, "loss": 2.4058, "num_input_tokens_seen": 6292248, "step": 1252 }, { "epoch": 1.9, "grad_norm": 0.4898473024368286, "learning_rate": 4.0999995762198936e-07, "loss": 2.4485, "num_input_tokens_seen": 6302352, "step": 1254 }, { "epoch": 1.903030303030303, "grad_norm": 0.5527143478393555, "learning_rate": 4.094042545380659e-07, "loss": 2.1889, "num_input_tokens_seen": 6311712, "step": 1256 }, { "epoch": 1.906060606060606, "grad_norm": 0.6194308996200562, "learning_rate": 4.088267388611474e-07, "loss": 2.3617, "num_input_tokens_seen": 6323304, "step": 1258 }, { "epoch": 1.9090909090909092, "grad_norm": 0.5801293849945068, "learning_rate": 4.082674238768009e-07, "loss": 2.2347, "num_input_tokens_seen": 6335304, "step": 1260 }, { "epoch": 1.9090909090909092, "eval_loss": 2.326760768890381, "eval_runtime": 5.8194, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 6335304, "step": 1260 }, { "epoch": 1.912121212121212, "grad_norm": 0.638659656047821, "learning_rate": 4.0772632245189193e-07, "loss": 2.2904, "num_input_tokens_seen": 6345624, "step": 1262 }, { "epoch": 1.915151515151515, "grad_norm": 0.5953812003135681, "learning_rate": 4.0720344703428906e-07, "loss": 2.3719, "num_input_tokens_seen": 6355632, "step": 1264 }, { "epoch": 1.9181818181818182, "grad_norm": 0.5857142806053162, "learning_rate": 4.066988096525772e-07, "loss": 2.3489, "num_input_tokens_seen": 6363840, "step": 1266 }, { "epoch": 1.9212121212121214, "grad_norm": 0.5746711492538452, "learning_rate": 4.062124219157808e-07, "loss": 2.3433, "num_input_tokens_seen": 6375000, "step": 1268 }, { "epoch": 1.9242424242424243, "grad_norm": 0.6761659383773804, "learning_rate": 4.057442950130972e-07, "loss": 2.4374, "num_input_tokens_seen": 6385632, "step": 1270 }, { "epoch": 1.9272727272727272, "grad_norm": 0.4828651249408722, "learning_rate": 4.05294439713639e-07, "loss": 2.3613, "num_input_tokens_seen": 6397728, "step": 1272 }, { "epoch": 1.9303030303030302, "grad_norm": 0.6450832486152649, "learning_rate": 4.048628663661859e-07, "loss": 2.1642, "num_input_tokens_seen": 6409512, "step": 1274 }, { "epoch": 1.9333333333333333, "grad_norm": 0.6221877336502075, "learning_rate": 4.044495848989475e-07, "loss": 2.4558, "num_input_tokens_seen": 6419664, "step": 1276 }, { "epoch": 1.9363636363636365, "grad_norm": 0.825742781162262, "learning_rate": 4.040546048193343e-07, "loss": 2.5869, "num_input_tokens_seen": 6428712, "step": 1278 }, { "epoch": 1.9393939393939394, "grad_norm": 0.69305020570755, "learning_rate": 4.0367793521373886e-07, "loss": 2.577, "num_input_tokens_seen": 6435960, "step": 1280 }, { "epoch": 1.9393939393939394, "eval_loss": 2.3265655040740967, "eval_runtime": 5.8193, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 6435960, "step": 1280 }, { "epoch": 1.9424242424242424, "grad_norm": 0.51558518409729, "learning_rate": 4.0331958474732744e-07, "loss": 2.4398, "num_input_tokens_seen": 6446952, "step": 1282 }, { "epoch": 1.9454545454545453, "grad_norm": 0.7710928916931152, "learning_rate": 4.0297956166384e-07, "loss": 2.3546, "num_input_tokens_seen": 6454440, "step": 1284 }, { "epoch": 1.9484848484848485, "grad_norm": 0.6520776748657227, "learning_rate": 4.0265787378540076e-07, "loss": 2.2851, "num_input_tokens_seen": 6465888, "step": 1286 }, { "epoch": 1.9515151515151516, "grad_norm": 0.7156710624694824, "learning_rate": 4.023545285123386e-07, "loss": 2.501, "num_input_tokens_seen": 6474384, "step": 1288 }, { "epoch": 1.9545454545454546, "grad_norm": 0.5886339545249939, "learning_rate": 4.020695328230162e-07, "loss": 2.3128, "num_input_tokens_seen": 6485712, "step": 1290 }, { "epoch": 1.9575757575757575, "grad_norm": 0.5593713521957397, "learning_rate": 4.018028932736699e-07, "loss": 2.2989, "num_input_tokens_seen": 6497160, "step": 1292 }, { "epoch": 1.9606060606060605, "grad_norm": 0.5878450870513916, "learning_rate": 4.01554615998259e-07, "loss": 2.4504, "num_input_tokens_seen": 6508920, "step": 1294 }, { "epoch": 1.9636363636363636, "grad_norm": 0.5121827721595764, "learning_rate": 4.013247067083242e-07, "loss": 2.4132, "num_input_tokens_seen": 6520176, "step": 1296 }, { "epoch": 1.9666666666666668, "grad_norm": 0.49630334973335266, "learning_rate": 4.011131706928566e-07, "loss": 2.3645, "num_input_tokens_seen": 6531528, "step": 1298 }, { "epoch": 1.9696969696969697, "grad_norm": 0.543795645236969, "learning_rate": 4.0092001281817576e-07, "loss": 2.3001, "num_input_tokens_seen": 6543816, "step": 1300 }, { "epoch": 1.9696969696969697, "eval_loss": 2.3271186351776123, "eval_runtime": 5.816, "eval_samples_per_second": 3.439, "eval_steps_per_second": 3.439, "num_input_tokens_seen": 6543816, "step": 1300 }, { "epoch": 1.9727272727272727, "grad_norm": 0.8191571235656738, "learning_rate": 4.0074523752781806e-07, "loss": 2.8758, "num_input_tokens_seen": 6552936, "step": 1302 }, { "epoch": 1.9757575757575756, "grad_norm": 0.6543108820915222, "learning_rate": 4.0058884884243416e-07, "loss": 2.3766, "num_input_tokens_seen": 6562896, "step": 1304 }, { "epoch": 1.9787878787878788, "grad_norm": 0.5305016040802002, "learning_rate": 4.004508503596967e-07, "loss": 2.3732, "num_input_tokens_seen": 6575184, "step": 1306 }, { "epoch": 1.981818181818182, "grad_norm": 0.5914813280105591, "learning_rate": 4.0033124525421757e-07, "loss": 2.3789, "num_input_tokens_seen": 6586032, "step": 1308 }, { "epoch": 1.9848484848484849, "grad_norm": 0.712382435798645, "learning_rate": 4.0023003627747455e-07, "loss": 2.2654, "num_input_tokens_seen": 6594768, "step": 1310 }, { "epoch": 1.9878787878787878, "grad_norm": 0.5054189562797546, "learning_rate": 4.0014722575774835e-07, "loss": 2.4605, "num_input_tokens_seen": 6604728, "step": 1312 }, { "epoch": 1.990909090909091, "grad_norm": 0.5901520252227783, "learning_rate": 4.000828156000692e-07, "loss": 2.4816, "num_input_tokens_seen": 6616536, "step": 1314 }, { "epoch": 1.993939393939394, "grad_norm": 0.7864160537719727, "learning_rate": 4.000368072861723e-07, "loss": 2.482, "num_input_tokens_seen": 6624480, "step": 1316 }, { "epoch": 1.996969696969697, "grad_norm": 0.49510428309440613, "learning_rate": 4.0000920187446465e-07, "loss": 2.45, "num_input_tokens_seen": 6636768, "step": 1318 }, { "epoch": 2.0, "grad_norm": 0.6357753872871399, "learning_rate": 4e-07, "loss": 2.2129, "num_input_tokens_seen": 6646824, "step": 1320 }, { "epoch": 2.0, "eval_loss": 2.326845645904541, "eval_runtime": 5.8186, "eval_samples_per_second": 3.437, "eval_steps_per_second": 3.437, "num_input_tokens_seen": 6646824, "step": 1320 }, { "epoch": 2.0, "num_input_tokens_seen": 6646824, "step": 1320, "total_flos": 3.059943926859694e+17, "train_loss": 2.3998946460810573, "train_runtime": 5038.8172, "train_samples_per_second": 0.786, "train_steps_per_second": 0.262 } ], "logging_steps": 2, "max_steps": 1320, "num_input_tokens_seen": 6646824, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.059943926859694e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }