{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 20,
"global_step": 1320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030303030303030303,
"grad_norm": 0.42330464720726013,
"learning_rate": 4e-07,
"loss": 2.4685,
"num_input_tokens_seen": 10296,
"step": 2
},
{
"epoch": 0.006060606060606061,
"grad_norm": 0.4667194187641144,
"learning_rate": 8e-07,
"loss": 2.4399,
"num_input_tokens_seen": 20376,
"step": 4
},
{
"epoch": 0.00909090909090909,
"grad_norm": 0.38802874088287354,
"learning_rate": 1.2e-06,
"loss": 2.3101,
"num_input_tokens_seen": 32664,
"step": 6
},
{
"epoch": 0.012121212121212121,
"grad_norm": 0.4379090368747711,
"learning_rate": 1.6e-06,
"loss": 2.2743,
"num_input_tokens_seen": 41904,
"step": 8
},
{
"epoch": 0.015151515151515152,
"grad_norm": 0.4267907738685608,
"learning_rate": 2e-06,
"loss": 2.355,
"num_input_tokens_seen": 52776,
"step": 10
},
{
"epoch": 0.01818181818181818,
"grad_norm": 0.5171758532524109,
"learning_rate": 1.999990798125535e-06,
"loss": 2.633,
"num_input_tokens_seen": 61464,
"step": 12
},
{
"epoch": 0.021212121212121213,
"grad_norm": 0.47265326976776123,
"learning_rate": 1.9999631927138275e-06,
"loss": 2.3386,
"num_input_tokens_seen": 72624,
"step": 14
},
{
"epoch": 0.024242424242424242,
"grad_norm": 0.5586420893669128,
"learning_rate": 1.9999171843999306e-06,
"loss": 2.3536,
"num_input_tokens_seen": 81840,
"step": 16
},
{
"epoch": 0.02727272727272727,
"grad_norm": 0.39176592230796814,
"learning_rate": 1.9998527742422515e-06,
"loss": 2.2979,
"num_input_tokens_seen": 91968,
"step": 18
},
{
"epoch": 0.030303030303030304,
"grad_norm": 0.4795871078968048,
"learning_rate": 1.9997699637225253e-06,
"loss": 2.3755,
"num_input_tokens_seen": 102984,
"step": 20
},
{
"epoch": 0.030303030303030304,
"eval_loss": 2.3641138076782227,
"eval_runtime": 5.815,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 102984,
"step": 20
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.4541929364204407,
"learning_rate": 1.9996687547457825e-06,
"loss": 2.286,
"num_input_tokens_seen": 113352,
"step": 22
},
{
"epoch": 0.03636363636363636,
"grad_norm": 0.4055442810058594,
"learning_rate": 1.999549149640303e-06,
"loss": 2.3933,
"num_input_tokens_seen": 125184,
"step": 24
},
{
"epoch": 0.03939393939393939,
"grad_norm": 0.5810754299163818,
"learning_rate": 1.9994111511575657e-06,
"loss": 2.1378,
"num_input_tokens_seen": 135480,
"step": 26
},
{
"epoch": 0.04242424242424243,
"grad_norm": 0.41868993639945984,
"learning_rate": 1.999254762472182e-06,
"loss": 2.2551,
"num_input_tokens_seen": 147384,
"step": 28
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.5975711941719055,
"learning_rate": 1.999079987181824e-06,
"loss": 2.506,
"num_input_tokens_seen": 156912,
"step": 30
},
{
"epoch": 0.048484848484848485,
"grad_norm": 0.422783762216568,
"learning_rate": 1.9988868293071435e-06,
"loss": 2.4742,
"num_input_tokens_seen": 167568,
"step": 32
},
{
"epoch": 0.051515151515151514,
"grad_norm": 0.32683178782463074,
"learning_rate": 1.998675293291676e-06,
"loss": 2.5007,
"num_input_tokens_seen": 176616,
"step": 34
},
{
"epoch": 0.05454545454545454,
"grad_norm": 0.4234691858291626,
"learning_rate": 1.998445384001741e-06,
"loss": 2.4632,
"num_input_tokens_seen": 187272,
"step": 36
},
{
"epoch": 0.05757575757575758,
"grad_norm": 0.4502381980419159,
"learning_rate": 1.99819710672633e-06,
"loss": 2.4556,
"num_input_tokens_seen": 196992,
"step": 38
},
{
"epoch": 0.06060606060606061,
"grad_norm": 0.5127580165863037,
"learning_rate": 1.9979304671769838e-06,
"loss": 2.5355,
"num_input_tokens_seen": 208824,
"step": 40
},
{
"epoch": 0.06060606060606061,
"eval_loss": 2.361894130706787,
"eval_runtime": 5.8061,
"eval_samples_per_second": 3.445,
"eval_steps_per_second": 3.445,
"num_input_tokens_seen": 208824,
"step": 40
},
{
"epoch": 0.06363636363636363,
"grad_norm": 0.5844971537590027,
"learning_rate": 1.997645471487661e-06,
"loss": 2.497,
"num_input_tokens_seen": 217272,
"step": 42
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.41816312074661255,
"learning_rate": 1.9973421262145992e-06,
"loss": 2.4371,
"num_input_tokens_seen": 229560,
"step": 44
},
{
"epoch": 0.0696969696969697,
"grad_norm": 0.505349338054657,
"learning_rate": 1.99702043833616e-06,
"loss": 2.4757,
"num_input_tokens_seen": 239568,
"step": 46
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.4537525177001953,
"learning_rate": 1.9966804152526726e-06,
"loss": 2.4514,
"num_input_tokens_seen": 251664,
"step": 48
},
{
"epoch": 0.07575757575757576,
"grad_norm": 0.40902894735336304,
"learning_rate": 1.996322064786261e-06,
"loss": 2.3474,
"num_input_tokens_seen": 263040,
"step": 50
},
{
"epoch": 0.07878787878787878,
"grad_norm": 0.48902806639671326,
"learning_rate": 1.9959453951806656e-06,
"loss": 2.4297,
"num_input_tokens_seen": 271080,
"step": 52
},
{
"epoch": 0.08181818181818182,
"grad_norm": 0.4684095084667206,
"learning_rate": 1.995550415101052e-06,
"loss": 2.6676,
"num_input_tokens_seen": 282000,
"step": 54
},
{
"epoch": 0.08484848484848485,
"grad_norm": 0.33189377188682556,
"learning_rate": 1.9951371336338145e-06,
"loss": 2.1799,
"num_input_tokens_seen": 290568,
"step": 56
},
{
"epoch": 0.08787878787878788,
"grad_norm": 0.4579316973686218,
"learning_rate": 1.994705560286361e-06,
"loss": 2.5315,
"num_input_tokens_seen": 298920,
"step": 58
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.42468497157096863,
"learning_rate": 1.994255704986903e-06,
"loss": 2.4679,
"num_input_tokens_seen": 309744,
"step": 60
},
{
"epoch": 0.09090909090909091,
"eval_loss": 2.360027551651001,
"eval_runtime": 5.8148,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 309744,
"step": 60
},
{
"epoch": 0.09393939393939393,
"grad_norm": 0.5245186686515808,
"learning_rate": 1.993787578084219e-06,
"loss": 2.4576,
"num_input_tokens_seen": 321360,
"step": 62
},
{
"epoch": 0.09696969696969697,
"grad_norm": 0.38165679574012756,
"learning_rate": 1.9933011903474228e-06,
"loss": 2.275,
"num_input_tokens_seen": 332736,
"step": 64
},
{
"epoch": 0.1,
"grad_norm": 0.5568698644638062,
"learning_rate": 1.992796552965711e-06,
"loss": 2.2761,
"num_input_tokens_seen": 344568,
"step": 66
},
{
"epoch": 0.10303030303030303,
"grad_norm": 0.39623475074768066,
"learning_rate": 1.9922736775481083e-06,
"loss": 2.3385,
"num_input_tokens_seen": 356616,
"step": 68
},
{
"epoch": 0.10606060606060606,
"grad_norm": 0.532319188117981,
"learning_rate": 1.991732576123199e-06,
"loss": 2.3342,
"num_input_tokens_seen": 367680,
"step": 70
},
{
"epoch": 0.10909090909090909,
"grad_norm": 0.505707859992981,
"learning_rate": 1.9911732611388524e-06,
"loss": 2.3604,
"num_input_tokens_seen": 377376,
"step": 72
},
{
"epoch": 0.11212121212121212,
"grad_norm": 0.4921689033508301,
"learning_rate": 1.9905957454619343e-06,
"loss": 2.2869,
"num_input_tokens_seen": 387432,
"step": 74
},
{
"epoch": 0.11515151515151516,
"grad_norm": 0.47557827830314636,
"learning_rate": 1.9900000423780104e-06,
"loss": 2.601,
"num_input_tokens_seen": 395808,
"step": 76
},
{
"epoch": 0.11818181818181818,
"grad_norm": 1.9346156120300293,
"learning_rate": 1.9893861655910444e-06,
"loss": 2.3741,
"num_input_tokens_seen": 407568,
"step": 78
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.45454809069633484,
"learning_rate": 1.988754129223079e-06,
"loss": 2.3929,
"num_input_tokens_seen": 417648,
"step": 80
},
{
"epoch": 0.12121212121212122,
"eval_loss": 2.3575997352600098,
"eval_runtime": 5.8145,
"eval_samples_per_second": 3.44,
"eval_steps_per_second": 3.44,
"num_input_tokens_seen": 417648,
"step": 80
},
{
"epoch": 0.12424242424242424,
"grad_norm": 0.533509373664856,
"learning_rate": 1.9881039478139115e-06,
"loss": 2.3717,
"num_input_tokens_seen": 428568,
"step": 82
},
{
"epoch": 0.12727272727272726,
"grad_norm": 0.3749203681945801,
"learning_rate": 1.9874356363207624e-06,
"loss": 2.2728,
"num_input_tokens_seen": 437688,
"step": 84
},
{
"epoch": 0.1303030303030303,
"grad_norm": 0.41353124380111694,
"learning_rate": 1.986749210117927e-06,
"loss": 2.5347,
"num_input_tokens_seen": 447408,
"step": 86
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.4702826142311096,
"learning_rate": 1.986044684996425e-06,
"loss": 2.4081,
"num_input_tokens_seen": 456120,
"step": 88
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.5201271772384644,
"learning_rate": 1.985322077163636e-06,
"loss": 2.5697,
"num_input_tokens_seen": 467208,
"step": 90
},
{
"epoch": 0.1393939393939394,
"grad_norm": 0.5325783491134644,
"learning_rate": 1.9845814032429257e-06,
"loss": 2.3267,
"num_input_tokens_seen": 477168,
"step": 92
},
{
"epoch": 0.14242424242424243,
"grad_norm": 0.49566376209259033,
"learning_rate": 1.9838226802732656e-06,
"loss": 2.5342,
"num_input_tokens_seen": 486888,
"step": 94
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.5317257046699524,
"learning_rate": 1.9830459257088395e-06,
"loss": 2.5662,
"num_input_tokens_seen": 496584,
"step": 96
},
{
"epoch": 0.1484848484848485,
"grad_norm": 0.6195109486579895,
"learning_rate": 1.982251157418642e-06,
"loss": 2.3294,
"num_input_tokens_seen": 503736,
"step": 98
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.4253556728363037,
"learning_rate": 1.981438393686069e-06,
"loss": 2.6105,
"num_input_tokens_seen": 513600,
"step": 100
},
{
"epoch": 0.15151515151515152,
"eval_loss": 2.3544414043426514,
"eval_runtime": 5.8171,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 513600,
"step": 100
},
{
"epoch": 0.15454545454545454,
"grad_norm": 0.5861473083496094,
"learning_rate": 1.980607653208495e-06,
"loss": 2.6435,
"num_input_tokens_seen": 519960,
"step": 102
},
{
"epoch": 0.15757575757575756,
"grad_norm": 0.44223421812057495,
"learning_rate": 1.9797589550968434e-06,
"loss": 2.4326,
"num_input_tokens_seen": 529392,
"step": 104
},
{
"epoch": 0.1606060606060606,
"grad_norm": 0.7290481328964233,
"learning_rate": 1.9788923188751478e-06,
"loss": 2.5169,
"num_input_tokens_seen": 537000,
"step": 106
},
{
"epoch": 0.16363636363636364,
"grad_norm": 0.43159109354019165,
"learning_rate": 1.978007764480103e-06,
"loss": 2.3097,
"num_input_tokens_seen": 546864,
"step": 108
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.46773430705070496,
"learning_rate": 1.977105312260605e-06,
"loss": 2.2565,
"num_input_tokens_seen": 558432,
"step": 110
},
{
"epoch": 0.1696969696969697,
"grad_norm": 0.46607473492622375,
"learning_rate": 1.976184982977284e-06,
"loss": 2.3503,
"num_input_tokens_seen": 569016,
"step": 112
},
{
"epoch": 0.17272727272727273,
"grad_norm": 0.5427464842796326,
"learning_rate": 1.975246797802026e-06,
"loss": 2.2801,
"num_input_tokens_seen": 580392,
"step": 114
},
{
"epoch": 0.17575757575757575,
"grad_norm": 0.4266676902770996,
"learning_rate": 1.974290778317487e-06,
"loss": 2.4019,
"num_input_tokens_seen": 590568,
"step": 116
},
{
"epoch": 0.1787878787878788,
"grad_norm": 0.4442364275455475,
"learning_rate": 1.973316946516595e-06,
"loss": 2.3779,
"num_input_tokens_seen": 601704,
"step": 118
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.4435305595397949,
"learning_rate": 1.9723253248020455e-06,
"loss": 2.2488,
"num_input_tokens_seen": 613584,
"step": 120
},
{
"epoch": 0.18181818181818182,
"eval_loss": 2.3512158393859863,
"eval_runtime": 5.819,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 613584,
"step": 120
},
{
"epoch": 0.18484848484848485,
"grad_norm": 0.5893362164497375,
"learning_rate": 1.9713159359857833e-06,
"loss": 2.4906,
"num_input_tokens_seen": 624792,
"step": 122
},
{
"epoch": 0.18787878787878787,
"grad_norm": 0.4149838089942932,
"learning_rate": 1.9702888032884826e-06,
"loss": 2.5957,
"num_input_tokens_seen": 635832,
"step": 124
},
{
"epoch": 0.19090909090909092,
"grad_norm": 0.42286068201065063,
"learning_rate": 1.969243950339009e-06,
"loss": 2.1759,
"num_input_tokens_seen": 647664,
"step": 126
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.5177129507064819,
"learning_rate": 1.9681814011738758e-06,
"loss": 2.5093,
"num_input_tokens_seen": 656952,
"step": 128
},
{
"epoch": 0.19696969696969696,
"grad_norm": 0.5667068958282471,
"learning_rate": 1.9671011802366934e-06,
"loss": 2.5727,
"num_input_tokens_seen": 664104,
"step": 130
},
{
"epoch": 0.2,
"grad_norm": 0.566889762878418,
"learning_rate": 1.9660033123776056e-06,
"loss": 2.3728,
"num_input_tokens_seen": 674016,
"step": 132
},
{
"epoch": 0.20303030303030303,
"grad_norm": 0.4465801417827606,
"learning_rate": 1.964887822852718e-06,
"loss": 2.4271,
"num_input_tokens_seen": 684480,
"step": 134
},
{
"epoch": 0.20606060606060606,
"grad_norm": 0.5765467286109924,
"learning_rate": 1.963754737323516e-06,
"loss": 2.5413,
"num_input_tokens_seen": 694056,
"step": 136
},
{
"epoch": 0.20909090909090908,
"grad_norm": 0.5330570936203003,
"learning_rate": 1.9626040818562783e-06,
"loss": 2.4513,
"num_input_tokens_seen": 704640,
"step": 138
},
{
"epoch": 0.21212121212121213,
"grad_norm": 0.6006715297698975,
"learning_rate": 1.9614358829214722e-06,
"loss": 2.3866,
"num_input_tokens_seen": 713640,
"step": 140
},
{
"epoch": 0.21212121212121213,
"eval_loss": 2.349419355392456,
"eval_runtime": 5.8237,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 713640,
"step": 140
},
{
"epoch": 0.21515151515151515,
"grad_norm": 0.4789717495441437,
"learning_rate": 1.960250167393147e-06,
"loss": 2.4217,
"num_input_tokens_seen": 722880,
"step": 142
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.558068037033081,
"learning_rate": 1.959046962548316e-06,
"loss": 2.5271,
"num_input_tokens_seen": 733104,
"step": 144
},
{
"epoch": 0.22121212121212122,
"grad_norm": 0.5164092183113098,
"learning_rate": 1.9578262960663305e-06,
"loss": 2.4228,
"num_input_tokens_seen": 745392,
"step": 146
},
{
"epoch": 0.22424242424242424,
"grad_norm": 0.49615126848220825,
"learning_rate": 1.9565881960282384e-06,
"loss": 2.1895,
"num_input_tokens_seen": 755736,
"step": 148
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.6630756258964539,
"learning_rate": 1.9553326909161436e-06,
"loss": 2.4702,
"num_input_tokens_seen": 767040,
"step": 150
},
{
"epoch": 0.23030303030303031,
"grad_norm": 0.5331915020942688,
"learning_rate": 1.954059809612546e-06,
"loss": 2.4535,
"num_input_tokens_seen": 776496,
"step": 152
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.44153809547424316,
"learning_rate": 1.9527695813996817e-06,
"loss": 2.3757,
"num_input_tokens_seen": 785568,
"step": 154
},
{
"epoch": 0.23636363636363636,
"grad_norm": 0.4671899378299713,
"learning_rate": 1.9514620359588454e-06,
"loss": 2.3609,
"num_input_tokens_seen": 797496,
"step": 156
},
{
"epoch": 0.23939393939393938,
"grad_norm": 0.49474212527275085,
"learning_rate": 1.9501372033697097e-06,
"loss": 2.4576,
"num_input_tokens_seen": 808536,
"step": 158
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.5353239178657532,
"learning_rate": 1.948795114109632e-06,
"loss": 2.2509,
"num_input_tokens_seen": 818592,
"step": 160
},
{
"epoch": 0.24242424242424243,
"eval_loss": 2.3466238975524902,
"eval_runtime": 5.8178,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 818592,
"step": 160
},
{
"epoch": 0.24545454545454545,
"grad_norm": 0.4847556948661804,
"learning_rate": 1.947435799052955e-06,
"loss": 2.4558,
"num_input_tokens_seen": 828336,
"step": 162
},
{
"epoch": 0.24848484848484848,
"grad_norm": 0.5099437236785889,
"learning_rate": 1.9460592894702946e-06,
"loss": 2.3038,
"num_input_tokens_seen": 838080,
"step": 164
},
{
"epoch": 0.2515151515151515,
"grad_norm": 0.47751423716545105,
"learning_rate": 1.944665617027823e-06,
"loss": 2.2954,
"num_input_tokens_seen": 850128,
"step": 166
},
{
"epoch": 0.2545454545454545,
"grad_norm": 0.4297049045562744,
"learning_rate": 1.943254813786535e-06,
"loss": 2.2327,
"num_input_tokens_seen": 862416,
"step": 168
},
{
"epoch": 0.25757575757575757,
"grad_norm": 0.5330982804298401,
"learning_rate": 1.941826912201518e-06,
"loss": 2.487,
"num_input_tokens_seen": 873936,
"step": 170
},
{
"epoch": 0.2606060606060606,
"grad_norm": 0.4737272560596466,
"learning_rate": 1.9403819451212004e-06,
"loss": 2.6736,
"num_input_tokens_seen": 883584,
"step": 172
},
{
"epoch": 0.2636363636363636,
"grad_norm": 0.6267192363739014,
"learning_rate": 1.938919945786595e-06,
"loss": 2.2313,
"num_input_tokens_seen": 892632,
"step": 174
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.42695581912994385,
"learning_rate": 1.9374409478305385e-06,
"loss": 2.4444,
"num_input_tokens_seen": 904920,
"step": 176
},
{
"epoch": 0.2696969696969697,
"grad_norm": 0.5554710030555725,
"learning_rate": 1.935944985276914e-06,
"loss": 2.5038,
"num_input_tokens_seen": 913752,
"step": 178
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.6374077796936035,
"learning_rate": 1.9344320925398713e-06,
"loss": 2.3807,
"num_input_tokens_seen": 920952,
"step": 180
},
{
"epoch": 0.2727272727272727,
"eval_loss": 2.3428144454956055,
"eval_runtime": 5.8159,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 920952,
"step": 180
},
{
"epoch": 0.27575757575757576,
"grad_norm": 0.41562148928642273,
"learning_rate": 1.932902304423033e-06,
"loss": 2.5033,
"num_input_tokens_seen": 932280,
"step": 182
},
{
"epoch": 0.2787878787878788,
"grad_norm": 0.47822168469429016,
"learning_rate": 1.931355656118694e-06,
"loss": 2.275,
"num_input_tokens_seen": 944568,
"step": 184
},
{
"epoch": 0.2818181818181818,
"grad_norm": 0.553165853023529,
"learning_rate": 1.9297921832070134e-06,
"loss": 2.567,
"num_input_tokens_seen": 952032,
"step": 186
},
{
"epoch": 0.28484848484848485,
"grad_norm": 0.5379563570022583,
"learning_rate": 1.928211921655195e-06,
"loss": 2.5257,
"num_input_tokens_seen": 963840,
"step": 188
},
{
"epoch": 0.2878787878787879,
"grad_norm": 0.5385987758636475,
"learning_rate": 1.9266149078166603e-06,
"loss": 2.3678,
"num_input_tokens_seen": 975288,
"step": 190
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.42638707160949707,
"learning_rate": 1.9250011784302106e-06,
"loss": 2.232,
"num_input_tokens_seen": 987144,
"step": 192
},
{
"epoch": 0.29393939393939394,
"grad_norm": 0.450655996799469,
"learning_rate": 1.923370770619184e-06,
"loss": 2.1844,
"num_input_tokens_seen": 998664,
"step": 194
},
{
"epoch": 0.296969696969697,
"grad_norm": 0.477781742811203,
"learning_rate": 1.921723721890602e-06,
"loss": 2.3571,
"num_input_tokens_seen": 1008504,
"step": 196
},
{
"epoch": 0.3,
"grad_norm": 0.7921934723854065,
"learning_rate": 1.920060070134301e-06,
"loss": 2.472,
"num_input_tokens_seen": 1016664,
"step": 198
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.5304360389709473,
"learning_rate": 1.91837985362207e-06,
"loss": 2.4112,
"num_input_tokens_seen": 1026192,
"step": 200
},
{
"epoch": 0.30303030303030304,
"eval_loss": 2.340877056121826,
"eval_runtime": 5.8187,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 1026192,
"step": 200
},
{
"epoch": 0.30606060606060603,
"grad_norm": 0.4748481810092926,
"learning_rate": 1.9166831110067615e-06,
"loss": 2.5731,
"num_input_tokens_seen": 1037064,
"step": 202
},
{
"epoch": 0.3090909090909091,
"grad_norm": 0.44187602400779724,
"learning_rate": 1.914969881321407e-06,
"loss": 2.5743,
"num_input_tokens_seen": 1049352,
"step": 204
},
{
"epoch": 0.31212121212121213,
"grad_norm": 0.6284915208816528,
"learning_rate": 1.913240203978318e-06,
"loss": 2.4531,
"num_input_tokens_seen": 1057272,
"step": 206
},
{
"epoch": 0.3151515151515151,
"grad_norm": 0.6538528800010681,
"learning_rate": 1.9114941187681783e-06,
"loss": 2.5391,
"num_input_tokens_seen": 1065120,
"step": 208
},
{
"epoch": 0.3181818181818182,
"grad_norm": 1.0042399168014526,
"learning_rate": 1.9097316658591304e-06,
"loss": 2.4156,
"num_input_tokens_seen": 1074192,
"step": 210
},
{
"epoch": 0.3212121212121212,
"grad_norm": 0.48325198888778687,
"learning_rate": 1.9079528857958504e-06,
"loss": 2.5733,
"num_input_tokens_seen": 1084416,
"step": 212
},
{
"epoch": 0.3242424242424242,
"grad_norm": 0.6697909832000732,
"learning_rate": 1.906157819498616e-06,
"loss": 2.5264,
"num_input_tokens_seen": 1092888,
"step": 214
},
{
"epoch": 0.32727272727272727,
"grad_norm": 0.6655834913253784,
"learning_rate": 1.904346508262363e-06,
"loss": 2.3912,
"num_input_tokens_seen": 1100160,
"step": 216
},
{
"epoch": 0.3303030303030303,
"grad_norm": 1.1694029569625854,
"learning_rate": 1.9025189937557386e-06,
"loss": 2.462,
"num_input_tokens_seen": 1107360,
"step": 218
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.4985570013523102,
"learning_rate": 1.90067531802014e-06,
"loss": 2.2447,
"num_input_tokens_seen": 1119096,
"step": 220
},
{
"epoch": 0.3333333333333333,
"eval_loss": 2.339911937713623,
"eval_runtime": 5.8137,
"eval_samples_per_second": 3.44,
"eval_steps_per_second": 3.44,
"num_input_tokens_seen": 1119096,
"step": 220
},
{
"epoch": 0.33636363636363636,
"grad_norm": 0.4883664548397064,
"learning_rate": 1.8988155234687495e-06,
"loss": 2.4013,
"num_input_tokens_seen": 1131384,
"step": 222
},
{
"epoch": 0.3393939393939394,
"grad_norm": 0.7224740982055664,
"learning_rate": 1.8969396528855567e-06,
"loss": 2.4763,
"num_input_tokens_seen": 1142616,
"step": 224
},
{
"epoch": 0.3424242424242424,
"grad_norm": 0.569634199142456,
"learning_rate": 1.8950477494243762e-06,
"loss": 2.3552,
"num_input_tokens_seen": 1154904,
"step": 226
},
{
"epoch": 0.34545454545454546,
"grad_norm": 0.45122525095939636,
"learning_rate": 1.8931398566078523e-06,
"loss": 2.4198,
"num_input_tokens_seen": 1164264,
"step": 228
},
{
"epoch": 0.3484848484848485,
"grad_norm": 0.5598176121711731,
"learning_rate": 1.8912160183264612e-06,
"loss": 2.5283,
"num_input_tokens_seen": 1175472,
"step": 230
},
{
"epoch": 0.3515151515151515,
"grad_norm": 0.5492939352989197,
"learning_rate": 1.8892762788374985e-06,
"loss": 2.5246,
"num_input_tokens_seen": 1185264,
"step": 232
},
{
"epoch": 0.35454545454545455,
"grad_norm": 0.557397723197937,
"learning_rate": 1.8873206827640624e-06,
"loss": 2.3821,
"num_input_tokens_seen": 1197408,
"step": 234
},
{
"epoch": 0.3575757575757576,
"grad_norm": 0.42229530215263367,
"learning_rate": 1.8853492750940275e-06,
"loss": 2.3593,
"num_input_tokens_seen": 1207656,
"step": 236
},
{
"epoch": 0.3606060606060606,
"grad_norm": 0.4781576693058014,
"learning_rate": 1.8833621011790078e-06,
"loss": 2.2261,
"num_input_tokens_seen": 1219080,
"step": 238
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.46443861722946167,
"learning_rate": 1.8813592067333155e-06,
"loss": 2.4046,
"num_input_tokens_seen": 1230048,
"step": 240
},
{
"epoch": 0.36363636363636365,
"eval_loss": 2.339547872543335,
"eval_runtime": 5.8158,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 1230048,
"step": 240
},
{
"epoch": 0.36666666666666664,
"grad_norm": 0.4926714599132538,
"learning_rate": 1.8793406378329092e-06,
"loss": 2.1956,
"num_input_tokens_seen": 1239288,
"step": 242
},
{
"epoch": 0.3696969696969697,
"grad_norm": 0.9403526186943054,
"learning_rate": 1.877306440914333e-06,
"loss": 2.3843,
"num_input_tokens_seen": 1246512,
"step": 244
},
{
"epoch": 0.37272727272727274,
"grad_norm": 0.8498961329460144,
"learning_rate": 1.8752566627736477e-06,
"loss": 2.2977,
"num_input_tokens_seen": 1256256,
"step": 246
},
{
"epoch": 0.37575757575757573,
"grad_norm": 0.5305018424987793,
"learning_rate": 1.8731913505653569e-06,
"loss": 2.4575,
"num_input_tokens_seen": 1265712,
"step": 248
},
{
"epoch": 0.3787878787878788,
"grad_norm": 0.4798325002193451,
"learning_rate": 1.8711105518013199e-06,
"loss": 2.3638,
"num_input_tokens_seen": 1273848,
"step": 250
},
{
"epoch": 0.38181818181818183,
"grad_norm": 0.5862890481948853,
"learning_rate": 1.869014314349659e-06,
"loss": 2.388,
"num_input_tokens_seen": 1283664,
"step": 252
},
{
"epoch": 0.38484848484848483,
"grad_norm": 0.5504214763641357,
"learning_rate": 1.8669026864336591e-06,
"loss": 2.3997,
"num_input_tokens_seen": 1293768,
"step": 254
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.662431538105011,
"learning_rate": 1.8647757166306572e-06,
"loss": 2.4629,
"num_input_tokens_seen": 1303392,
"step": 256
},
{
"epoch": 0.39090909090909093,
"grad_norm": 0.5133792757987976,
"learning_rate": 1.8626334538709263e-06,
"loss": 2.3915,
"num_input_tokens_seen": 1313784,
"step": 258
},
{
"epoch": 0.3939393939393939,
"grad_norm": 0.47367045283317566,
"learning_rate": 1.8604759474365492e-06,
"loss": 2.4396,
"num_input_tokens_seen": 1326072,
"step": 260
},
{
"epoch": 0.3939393939393939,
"eval_loss": 2.338432788848877,
"eval_runtime": 5.8115,
"eval_samples_per_second": 3.441,
"eval_steps_per_second": 3.441,
"num_input_tokens_seen": 1326072,
"step": 260
},
{
"epoch": 0.396969696969697,
"grad_norm": 0.5194035768508911,
"learning_rate": 1.858303246960284e-06,
"loss": 2.4028,
"num_input_tokens_seen": 1335864,
"step": 262
},
{
"epoch": 0.4,
"grad_norm": 0.4642770290374756,
"learning_rate": 1.856115402424423e-06,
"loss": 2.434,
"num_input_tokens_seen": 1347552,
"step": 264
},
{
"epoch": 0.403030303030303,
"grad_norm": 0.5999087691307068,
"learning_rate": 1.8539124641596437e-06,
"loss": 2.3149,
"num_input_tokens_seen": 1356912,
"step": 266
},
{
"epoch": 0.40606060606060607,
"grad_norm": 0.588898241519928,
"learning_rate": 1.851694482843849e-06,
"loss": 2.5401,
"num_input_tokens_seen": 1368408,
"step": 268
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.49462223052978516,
"learning_rate": 1.8494615095010037e-06,
"loss": 2.3905,
"num_input_tokens_seen": 1380696,
"step": 270
},
{
"epoch": 0.4121212121212121,
"grad_norm": 1.0041953325271606,
"learning_rate": 1.8472135954999582e-06,
"loss": 2.7022,
"num_input_tokens_seen": 1389096,
"step": 272
},
{
"epoch": 0.41515151515151516,
"grad_norm": 0.5517657399177551,
"learning_rate": 1.8449507925532685e-06,
"loss": 2.5369,
"num_input_tokens_seen": 1400784,
"step": 274
},
{
"epoch": 0.41818181818181815,
"grad_norm": 0.6180247068405151,
"learning_rate": 1.8426731527160064e-06,
"loss": 2.2525,
"num_input_tokens_seen": 1413072,
"step": 276
},
{
"epoch": 0.4212121212121212,
"grad_norm": 0.6159691214561462,
"learning_rate": 1.8403807283845616e-06,
"loss": 2.3052,
"num_input_tokens_seen": 1422888,
"step": 278
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.6237558722496033,
"learning_rate": 1.8380735722954367e-06,
"loss": 2.344,
"num_input_tokens_seen": 1432128,
"step": 280
},
{
"epoch": 0.42424242424242425,
"eval_loss": 2.3386666774749756,
"eval_runtime": 5.8175,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 1432128,
"step": 280
},
{
"epoch": 0.42727272727272725,
"grad_norm": 0.6814020276069641,
"learning_rate": 1.835751737524033e-06,
"loss": 2.4498,
"num_input_tokens_seen": 1439928,
"step": 282
},
{
"epoch": 0.4303030303030303,
"grad_norm": 0.5670037865638733,
"learning_rate": 1.8334152774834309e-06,
"loss": 2.3934,
"num_input_tokens_seen": 1449624,
"step": 284
},
{
"epoch": 0.43333333333333335,
"grad_norm": 0.6628959774971008,
"learning_rate": 1.83106424592316e-06,
"loss": 2.52,
"num_input_tokens_seen": 1460520,
"step": 286
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.6537968516349792,
"learning_rate": 1.8286986969279643e-06,
"loss": 2.5132,
"num_input_tokens_seen": 1469712,
"step": 288
},
{
"epoch": 0.4393939393939394,
"grad_norm": 0.5633306503295898,
"learning_rate": 1.8263186849165555e-06,
"loss": 2.403,
"num_input_tokens_seen": 1480824,
"step": 290
},
{
"epoch": 0.44242424242424244,
"grad_norm": 0.5708298683166504,
"learning_rate": 1.8239242646403628e-06,
"loss": 2.5149,
"num_input_tokens_seen": 1488816,
"step": 292
},
{
"epoch": 0.44545454545454544,
"grad_norm": 0.7049750685691833,
"learning_rate": 1.8215154911822737e-06,
"loss": 2.2043,
"num_input_tokens_seen": 1497816,
"step": 294
},
{
"epoch": 0.4484848484848485,
"grad_norm": 0.5039754509925842,
"learning_rate": 1.8190924199553655e-06,
"loss": 2.439,
"num_input_tokens_seen": 1508928,
"step": 296
},
{
"epoch": 0.45151515151515154,
"grad_norm": 0.5821936726570129,
"learning_rate": 1.816655106701631e-06,
"loss": 2.4665,
"num_input_tokens_seen": 1519512,
"step": 298
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.5108533501625061,
"learning_rate": 1.8142036074906968e-06,
"loss": 2.4901,
"num_input_tokens_seen": 1529520,
"step": 300
},
{
"epoch": 0.45454545454545453,
"eval_loss": 2.337289333343506,
"eval_runtime": 5.817,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 1529520,
"step": 300
},
{
"epoch": 0.4575757575757576,
"grad_norm": 0.4282449781894684,
"learning_rate": 1.8117379787185333e-06,
"loss": 2.1503,
"num_input_tokens_seen": 1541808,
"step": 302
},
{
"epoch": 0.46060606060606063,
"grad_norm": 0.6109529137611389,
"learning_rate": 1.809258277106156e-06,
"loss": 2.4026,
"num_input_tokens_seen": 1550952,
"step": 304
},
{
"epoch": 0.4636363636363636,
"grad_norm": 0.5644070506095886,
"learning_rate": 1.8067645596983226e-06,
"loss": 2.4195,
"num_input_tokens_seen": 1562064,
"step": 306
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.665733814239502,
"learning_rate": 1.804256883862219e-06,
"loss": 2.6243,
"num_input_tokens_seen": 1569240,
"step": 308
},
{
"epoch": 0.4696969696969697,
"grad_norm": 0.6493149995803833,
"learning_rate": 1.8017353072861416e-06,
"loss": 2.3603,
"num_input_tokens_seen": 1579560,
"step": 310
},
{
"epoch": 0.4727272727272727,
"grad_norm": 0.5297104120254517,
"learning_rate": 1.7991998879781676e-06,
"loss": 2.2741,
"num_input_tokens_seen": 1591248,
"step": 312
},
{
"epoch": 0.47575757575757577,
"grad_norm": 0.4405084252357483,
"learning_rate": 1.796650684264823e-06,
"loss": 2.5167,
"num_input_tokens_seen": 1602840,
"step": 314
},
{
"epoch": 0.47878787878787876,
"grad_norm": 0.6081413626670837,
"learning_rate": 1.7940877547897383e-06,
"loss": 2.404,
"num_input_tokens_seen": 1610520,
"step": 316
},
{
"epoch": 0.4818181818181818,
"grad_norm": 0.7665295600891113,
"learning_rate": 1.7915111585123026e-06,
"loss": 2.3861,
"num_input_tokens_seen": 1617936,
"step": 318
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.5678819417953491,
"learning_rate": 1.7889209547063038e-06,
"loss": 2.3335,
"num_input_tokens_seen": 1628424,
"step": 320
},
{
"epoch": 0.48484848484848486,
"eval_loss": 2.336883068084717,
"eval_runtime": 5.8244,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 1628424,
"step": 320
},
{
"epoch": 0.48787878787878786,
"grad_norm": 0.6124878525733948,
"learning_rate": 1.7863172029585684e-06,
"loss": 2.6274,
"num_input_tokens_seen": 1636704,
"step": 322
},
{
"epoch": 0.4909090909090909,
"grad_norm": 0.5369870066642761,
"learning_rate": 1.7836999631675877e-06,
"loss": 2.2444,
"num_input_tokens_seen": 1646760,
"step": 324
},
{
"epoch": 0.49393939393939396,
"grad_norm": 0.47992056608200073,
"learning_rate": 1.7810692955421418e-06,
"loss": 2.3407,
"num_input_tokens_seen": 1657824,
"step": 326
},
{
"epoch": 0.49696969696969695,
"grad_norm": 0.5946272611618042,
"learning_rate": 1.778425260599914e-06,
"loss": 2.5075,
"num_input_tokens_seen": 1669800,
"step": 328
},
{
"epoch": 0.5,
"grad_norm": 0.5064172744750977,
"learning_rate": 1.7757679191660974e-06,
"loss": 2.4304,
"num_input_tokens_seen": 1678896,
"step": 330
},
{
"epoch": 0.503030303030303,
"grad_norm": 0.676836371421814,
"learning_rate": 1.7730973323719996e-06,
"loss": 2.3898,
"num_input_tokens_seen": 1686696,
"step": 332
},
{
"epoch": 0.5060606060606061,
"grad_norm": 0.45694637298583984,
"learning_rate": 1.7704135616536297e-06,
"loss": 2.1912,
"num_input_tokens_seen": 1695648,
"step": 334
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.5608468651771545,
"learning_rate": 1.767716668750292e-06,
"loss": 2.4971,
"num_input_tokens_seen": 1703112,
"step": 336
},
{
"epoch": 0.5121212121212121,
"grad_norm": 0.5195941925048828,
"learning_rate": 1.7650067157031607e-06,
"loss": 2.3934,
"num_input_tokens_seen": 1715400,
"step": 338
},
{
"epoch": 0.5151515151515151,
"grad_norm": 0.3820761442184448,
"learning_rate": 1.7622837648538558e-06,
"loss": 2.1842,
"num_input_tokens_seen": 1725816,
"step": 340
},
{
"epoch": 0.5151515151515151,
"eval_loss": 2.3365180492401123,
"eval_runtime": 5.8166,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 1725816,
"step": 340
},
{
"epoch": 0.5181818181818182,
"grad_norm": 0.5152050852775574,
"learning_rate": 1.7595478788430067e-06,
"loss": 2.2292,
"num_input_tokens_seen": 1737240,
"step": 342
},
{
"epoch": 0.5212121212121212,
"grad_norm": 0.6499360203742981,
"learning_rate": 1.7567991206088122e-06,
"loss": 2.3013,
"num_input_tokens_seen": 1743792,
"step": 344
},
{
"epoch": 0.5242424242424243,
"grad_norm": 0.6490241885185242,
"learning_rate": 1.7540375533855931e-06,
"loss": 2.5828,
"num_input_tokens_seen": 1755192,
"step": 346
},
{
"epoch": 0.5272727272727272,
"grad_norm": 0.5575884580612183,
"learning_rate": 1.751263240702337e-06,
"loss": 2.2834,
"num_input_tokens_seen": 1765656,
"step": 348
},
{
"epoch": 0.5303030303030303,
"grad_norm": 0.6133118867874146,
"learning_rate": 1.7484762463812359e-06,
"loss": 2.5502,
"num_input_tokens_seen": 1773504,
"step": 350
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.477857768535614,
"learning_rate": 1.7456766345362195e-06,
"loss": 2.2939,
"num_input_tokens_seen": 1785792,
"step": 352
},
{
"epoch": 0.5363636363636364,
"grad_norm": 1.5005486011505127,
"learning_rate": 1.7428644695714798e-06,
"loss": 2.3919,
"num_input_tokens_seen": 1792848,
"step": 354
},
{
"epoch": 0.5393939393939394,
"grad_norm": 0.6583260893821716,
"learning_rate": 1.7400398161799901e-06,
"loss": 2.4862,
"num_input_tokens_seen": 1802256,
"step": 356
},
{
"epoch": 0.5424242424242425,
"grad_norm": 0.5908564925193787,
"learning_rate": 1.7372027393420136e-06,
"loss": 2.4536,
"num_input_tokens_seen": 1812840,
"step": 358
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.6152108311653137,
"learning_rate": 1.7343533043236135e-06,
"loss": 2.2118,
"num_input_tokens_seen": 1822440,
"step": 360
},
{
"epoch": 0.5454545454545454,
"eval_loss": 2.335080623626709,
"eval_runtime": 5.8256,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 1822440,
"step": 360
},
{
"epoch": 0.5484848484848485,
"grad_norm": 0.5109455585479736,
"learning_rate": 1.7314915766751482e-06,
"loss": 2.3442,
"num_input_tokens_seen": 1833168,
"step": 362
},
{
"epoch": 0.5515151515151515,
"grad_norm": 0.4524301290512085,
"learning_rate": 1.7286176222297643e-06,
"loss": 2.3881,
"num_input_tokens_seen": 1845072,
"step": 364
},
{
"epoch": 0.5545454545454546,
"grad_norm": 0.4554661810398102,
"learning_rate": 1.7257315071018814e-06,
"loss": 2.2764,
"num_input_tokens_seen": 1857168,
"step": 366
},
{
"epoch": 0.5575757575757576,
"grad_norm": 0.42852118611335754,
"learning_rate": 1.7228332976856717e-06,
"loss": 2.364,
"num_input_tokens_seen": 1869456,
"step": 368
},
{
"epoch": 0.5606060606060606,
"grad_norm": 0.7273756861686707,
"learning_rate": 1.7199230606535347e-06,
"loss": 2.4654,
"num_input_tokens_seen": 1878168,
"step": 370
},
{
"epoch": 0.5636363636363636,
"grad_norm": 0.7303619384765625,
"learning_rate": 1.717000862954559e-06,
"loss": 2.4599,
"num_input_tokens_seen": 1888608,
"step": 372
},
{
"epoch": 0.5666666666666667,
"grad_norm": 0.6044741868972778,
"learning_rate": 1.7140667718129853e-06,
"loss": 2.2146,
"num_input_tokens_seen": 1897008,
"step": 374
},
{
"epoch": 0.5696969696969697,
"grad_norm": 0.5754801630973816,
"learning_rate": 1.7111208547266607e-06,
"loss": 2.4951,
"num_input_tokens_seen": 1906776,
"step": 376
},
{
"epoch": 0.5727272727272728,
"grad_norm": 0.47109347581863403,
"learning_rate": 1.7081631794654818e-06,
"loss": 2.1497,
"num_input_tokens_seen": 1919064,
"step": 378
},
{
"epoch": 0.5757575757575758,
"grad_norm": 0.6136711835861206,
"learning_rate": 1.7051938140698408e-06,
"loss": 2.3233,
"num_input_tokens_seen": 1928688,
"step": 380
},
{
"epoch": 0.5757575757575758,
"eval_loss": 2.334742546081543,
"eval_runtime": 5.8193,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 1928688,
"step": 380
},
{
"epoch": 0.5787878787878787,
"grad_norm": 0.6149052977561951,
"learning_rate": 1.702212826849056e-06,
"loss": 2.319,
"num_input_tokens_seen": 1940784,
"step": 382
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.5667149424552917,
"learning_rate": 1.6992202863798037e-06,
"loss": 2.5949,
"num_input_tokens_seen": 1950840,
"step": 384
},
{
"epoch": 0.5848484848484848,
"grad_norm": 0.5343450307846069,
"learning_rate": 1.6962162615045377e-06,
"loss": 2.3292,
"num_input_tokens_seen": 1963128,
"step": 386
},
{
"epoch": 0.5878787878787879,
"grad_norm": 0.5003802180290222,
"learning_rate": 1.6932008213299071e-06,
"loss": 2.5239,
"num_input_tokens_seen": 1975008,
"step": 388
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.5460373759269714,
"learning_rate": 1.6901740352251675e-06,
"loss": 2.4818,
"num_input_tokens_seen": 1983648,
"step": 390
},
{
"epoch": 0.593939393939394,
"grad_norm": 0.5535560250282288,
"learning_rate": 1.6871359728205828e-06,
"loss": 2.1795,
"num_input_tokens_seen": 1993536,
"step": 392
},
{
"epoch": 0.5969696969696969,
"grad_norm": 0.4466463029384613,
"learning_rate": 1.6840867040058254e-06,
"loss": 2.3585,
"num_input_tokens_seen": 2002872,
"step": 394
},
{
"epoch": 0.6,
"grad_norm": 0.5831019878387451,
"learning_rate": 1.6810262989283674e-06,
"loss": 2.3718,
"num_input_tokens_seen": 2012400,
"step": 396
},
{
"epoch": 0.603030303030303,
"grad_norm": 0.5981975197792053,
"learning_rate": 1.6779548279918671e-06,
"loss": 2.314,
"num_input_tokens_seen": 2022936,
"step": 398
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.5155858397483826,
"learning_rate": 1.6748723618545496e-06,
"loss": 2.6427,
"num_input_tokens_seen": 2031480,
"step": 400
},
{
"epoch": 0.6060606060606061,
"eval_loss": 2.334027051925659,
"eval_runtime": 5.8193,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 2031480,
"step": 400
},
{
"epoch": 0.6090909090909091,
"grad_norm": 0.46695375442504883,
"learning_rate": 1.6717789714275808e-06,
"loss": 2.2379,
"num_input_tokens_seen": 2043768,
"step": 402
},
{
"epoch": 0.6121212121212121,
"grad_norm": 0.8030733466148376,
"learning_rate": 1.6686747278734364e-06,
"loss": 2.3286,
"num_input_tokens_seen": 2052456,
"step": 404
},
{
"epoch": 0.6151515151515151,
"grad_norm": 0.5807926654815674,
"learning_rate": 1.6655597026042654e-06,
"loss": 2.3891,
"num_input_tokens_seen": 2062608,
"step": 406
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.5125523209571838,
"learning_rate": 1.6624339672802466e-06,
"loss": 2.2766,
"num_input_tokens_seen": 2070624,
"step": 408
},
{
"epoch": 0.6212121212121212,
"grad_norm": 0.4872816205024719,
"learning_rate": 1.65929759380794e-06,
"loss": 2.3172,
"num_input_tokens_seen": 2082024,
"step": 410
},
{
"epoch": 0.6242424242424243,
"grad_norm": 0.5617727637290955,
"learning_rate": 1.6561506543386332e-06,
"loss": 2.2975,
"num_input_tokens_seen": 2093928,
"step": 412
},
{
"epoch": 0.6272727272727273,
"grad_norm": 0.7218233942985535,
"learning_rate": 1.6529932212666813e-06,
"loss": 2.5706,
"num_input_tokens_seen": 2102712,
"step": 414
},
{
"epoch": 0.6303030303030303,
"grad_norm": 0.5542349219322205,
"learning_rate": 1.6498253672278403e-06,
"loss": 2.4111,
"num_input_tokens_seen": 2111352,
"step": 416
},
{
"epoch": 0.6333333333333333,
"grad_norm": 0.5303030610084534,
"learning_rate": 1.6466471650975989e-06,
"loss": 2.3655,
"num_input_tokens_seen": 2123184,
"step": 418
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.4791901111602783,
"learning_rate": 1.6434586879894994e-06,
"loss": 2.1955,
"num_input_tokens_seen": 2132520,
"step": 420
},
{
"epoch": 0.6363636363636364,
"eval_loss": 2.3337419033050537,
"eval_runtime": 5.8194,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 2132520,
"step": 420
},
{
"epoch": 0.6393939393939394,
"grad_norm": 1.0299837589263916,
"learning_rate": 1.6402600092534571e-06,
"loss": 2.4297,
"num_input_tokens_seen": 2140344,
"step": 422
},
{
"epoch": 0.6424242424242425,
"grad_norm": 0.5022935271263123,
"learning_rate": 1.637051202474072e-06,
"loss": 2.3299,
"num_input_tokens_seen": 2150592,
"step": 424
},
{
"epoch": 0.6454545454545455,
"grad_norm": 0.7252947688102722,
"learning_rate": 1.6338323414689384e-06,
"loss": 2.4036,
"num_input_tokens_seen": 2158848,
"step": 426
},
{
"epoch": 0.6484848484848484,
"grad_norm": 0.49614864587783813,
"learning_rate": 1.6306035002869418e-06,
"loss": 2.3709,
"num_input_tokens_seen": 2166120,
"step": 428
},
{
"epoch": 0.6515151515151515,
"grad_norm": 0.5736730098724365,
"learning_rate": 1.6273647532065615e-06,
"loss": 2.6169,
"num_input_tokens_seen": 2177760,
"step": 430
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.8251070380210876,
"learning_rate": 1.6241161747341568e-06,
"loss": 2.4805,
"num_input_tokens_seen": 2185488,
"step": 432
},
{
"epoch": 0.6575757575757576,
"grad_norm": 1.2293510437011719,
"learning_rate": 1.6208578396022566e-06,
"loss": 2.1922,
"num_input_tokens_seen": 2196336,
"step": 434
},
{
"epoch": 0.6606060606060606,
"grad_norm": 0.6561338305473328,
"learning_rate": 1.6175898227678376e-06,
"loss": 2.4529,
"num_input_tokens_seen": 2204520,
"step": 436
},
{
"epoch": 0.6636363636363637,
"grad_norm": 0.4846937954425812,
"learning_rate": 1.6143121994106012e-06,
"loss": 2.3597,
"num_input_tokens_seen": 2216808,
"step": 438
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.6437355279922485,
"learning_rate": 1.611025044931245e-06,
"loss": 2.4364,
"num_input_tokens_seen": 2227752,
"step": 440
},
{
"epoch": 0.6666666666666666,
"eval_loss": 2.3327877521514893,
"eval_runtime": 5.8187,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 2227752,
"step": 440
},
{
"epoch": 0.6696969696969697,
"grad_norm": 0.5672312378883362,
"learning_rate": 1.6077284349497254e-06,
"loss": 2.5148,
"num_input_tokens_seen": 2237808,
"step": 442
},
{
"epoch": 0.6727272727272727,
"grad_norm": 0.5006369948387146,
"learning_rate": 1.6044224453035203e-06,
"loss": 2.1969,
"num_input_tokens_seen": 2249304,
"step": 444
},
{
"epoch": 0.6757575757575758,
"grad_norm": 0.6202157735824585,
"learning_rate": 1.6011071520458845e-06,
"loss": 2.5604,
"num_input_tokens_seen": 2260176,
"step": 446
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.555921733379364,
"learning_rate": 1.5977826314440987e-06,
"loss": 2.2211,
"num_input_tokens_seen": 2270184,
"step": 448
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.5153559446334839,
"learning_rate": 1.5944489599777161e-06,
"loss": 2.3477,
"num_input_tokens_seen": 2281464,
"step": 450
},
{
"epoch": 0.6848484848484848,
"grad_norm": 0.5477102994918823,
"learning_rate": 1.5911062143368027e-06,
"loss": 2.4645,
"num_input_tokens_seen": 2292720,
"step": 452
},
{
"epoch": 0.6878787878787879,
"grad_norm": 0.5461196303367615,
"learning_rate": 1.5877544714201726e-06,
"loss": 2.5217,
"num_input_tokens_seen": 2303376,
"step": 454
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.5640104413032532,
"learning_rate": 1.5843938083336194e-06,
"loss": 2.5123,
"num_input_tokens_seen": 2312544,
"step": 456
},
{
"epoch": 0.693939393939394,
"grad_norm": 0.4936680197715759,
"learning_rate": 1.5810243023881432e-06,
"loss": 2.2975,
"num_input_tokens_seen": 2323344,
"step": 458
},
{
"epoch": 0.696969696969697,
"grad_norm": 0.4782181680202484,
"learning_rate": 1.5776460310981702e-06,
"loss": 2.3568,
"num_input_tokens_seen": 2332056,
"step": 460
},
{
"epoch": 0.696969696969697,
"eval_loss": 2.332925319671631,
"eval_runtime": 5.8201,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 2332056,
"step": 460
},
{
"epoch": 0.7,
"grad_norm": 0.5433066487312317,
"learning_rate": 1.5742590721797725e-06,
"loss": 2.5328,
"num_input_tokens_seen": 2342400,
"step": 462
},
{
"epoch": 0.703030303030303,
"grad_norm": 1.0040984153747559,
"learning_rate": 1.5708635035488756e-06,
"loss": 2.5166,
"num_input_tokens_seen": 2350536,
"step": 464
},
{
"epoch": 0.706060606060606,
"grad_norm": 0.5495861172676086,
"learning_rate": 1.5674594033194706e-06,
"loss": 2.3471,
"num_input_tokens_seen": 2361528,
"step": 466
},
{
"epoch": 0.7090909090909091,
"grad_norm": 0.6494752764701843,
"learning_rate": 1.5640468498018153e-06,
"loss": 2.4315,
"num_input_tokens_seen": 2370552,
"step": 468
},
{
"epoch": 0.7121212121212122,
"grad_norm": 0.5859867930412292,
"learning_rate": 1.5606259215006325e-06,
"loss": 2.5083,
"num_input_tokens_seen": 2380368,
"step": 470
},
{
"epoch": 0.7151515151515152,
"grad_norm": 0.606728196144104,
"learning_rate": 1.5571966971133037e-06,
"loss": 2.3308,
"num_input_tokens_seen": 2389176,
"step": 472
},
{
"epoch": 0.7181818181818181,
"grad_norm": 0.453156441450119,
"learning_rate": 1.5537592555280594e-06,
"loss": 2.3236,
"num_input_tokens_seen": 2398944,
"step": 474
},
{
"epoch": 0.7212121212121212,
"grad_norm": 0.8148333430290222,
"learning_rate": 1.5503136758221653e-06,
"loss": 2.8391,
"num_input_tokens_seen": 2404656,
"step": 476
},
{
"epoch": 0.7242424242424242,
"grad_norm": 0.4754016399383545,
"learning_rate": 1.5468600372601009e-06,
"loss": 2.6875,
"num_input_tokens_seen": 2416392,
"step": 478
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.7027032375335693,
"learning_rate": 1.543398419291737e-06,
"loss": 2.4508,
"num_input_tokens_seen": 2425032,
"step": 480
},
{
"epoch": 0.7272727272727273,
"eval_loss": 2.332369089126587,
"eval_runtime": 5.8166,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 2425032,
"step": 480
},
{
"epoch": 0.7303030303030303,
"grad_norm": 0.8816015124320984,
"learning_rate": 1.5399289015505096e-06,
"loss": 2.4884,
"num_input_tokens_seen": 2432280,
"step": 482
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.5385542511940002,
"learning_rate": 1.536451563851584e-06,
"loss": 2.3159,
"num_input_tokens_seen": 2442576,
"step": 484
},
{
"epoch": 0.7363636363636363,
"grad_norm": 0.5668327808380127,
"learning_rate": 1.5329664861900237e-06,
"loss": 2.5522,
"num_input_tokens_seen": 2450664,
"step": 486
},
{
"epoch": 0.7393939393939394,
"grad_norm": 0.5444993376731873,
"learning_rate": 1.5294737487389462e-06,
"loss": 2.4853,
"num_input_tokens_seen": 2462568,
"step": 488
},
{
"epoch": 0.7424242424242424,
"grad_norm": 0.5722953081130981,
"learning_rate": 1.5259734318476807e-06,
"loss": 2.5841,
"num_input_tokens_seen": 2472312,
"step": 490
},
{
"epoch": 0.7454545454545455,
"grad_norm": 0.5933071970939636,
"learning_rate": 1.5224656160399186e-06,
"loss": 2.4222,
"num_input_tokens_seen": 2483016,
"step": 492
},
{
"epoch": 0.7484848484848485,
"grad_norm": 0.6787658929824829,
"learning_rate": 1.518950382011861e-06,
"loss": 2.261,
"num_input_tokens_seen": 2492688,
"step": 494
},
{
"epoch": 0.7515151515151515,
"grad_norm": 0.5823308825492859,
"learning_rate": 1.5154278106303649e-06,
"loss": 2.3332,
"num_input_tokens_seen": 2504472,
"step": 496
},
{
"epoch": 0.7545454545454545,
"grad_norm": 0.5042080879211426,
"learning_rate": 1.511897982931078e-06,
"loss": 2.3521,
"num_input_tokens_seen": 2516160,
"step": 498
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.6808260679244995,
"learning_rate": 1.50836098011658e-06,
"loss": 2.3093,
"num_input_tokens_seen": 2527320,
"step": 500
},
{
"epoch": 0.7575757575757576,
"eval_loss": 2.3320088386535645,
"eval_runtime": 5.8161,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 2527320,
"step": 500
},
{
"epoch": 0.7606060606060606,
"grad_norm": 0.5960633158683777,
"learning_rate": 1.5048168835545094e-06,
"loss": 2.4031,
"num_input_tokens_seen": 2535744,
"step": 502
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.4656361937522888,
"learning_rate": 1.5012657747756961e-06,
"loss": 2.2842,
"num_input_tokens_seen": 2546376,
"step": 504
},
{
"epoch": 0.7666666666666667,
"grad_norm": 0.7001519203186035,
"learning_rate": 1.4977077354722828e-06,
"loss": 2.4888,
"num_input_tokens_seen": 2553456,
"step": 506
},
{
"epoch": 0.7696969696969697,
"grad_norm": 0.5070295333862305,
"learning_rate": 1.4941428474958469e-06,
"loss": 2.3082,
"num_input_tokens_seen": 2563632,
"step": 508
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.559223473072052,
"learning_rate": 1.4905711928555178e-06,
"loss": 2.4127,
"num_input_tokens_seen": 2573184,
"step": 510
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.45378220081329346,
"learning_rate": 1.4869928537160892e-06,
"loss": 2.1886,
"num_input_tokens_seen": 2585472,
"step": 512
},
{
"epoch": 0.7787878787878788,
"grad_norm": 0.5591022968292236,
"learning_rate": 1.4834079123961308e-06,
"loss": 2.2753,
"num_input_tokens_seen": 2594304,
"step": 514
},
{
"epoch": 0.7818181818181819,
"grad_norm": 0.6257476806640625,
"learning_rate": 1.479816451366092e-06,
"loss": 2.4605,
"num_input_tokens_seen": 2601600,
"step": 516
},
{
"epoch": 0.7848484848484848,
"grad_norm": 0.5094606280326843,
"learning_rate": 1.4762185532464057e-06,
"loss": 2.4019,
"num_input_tokens_seen": 2612280,
"step": 518
},
{
"epoch": 0.7878787878787878,
"grad_norm": 0.4572422206401825,
"learning_rate": 1.472614300805591e-06,
"loss": 2.5201,
"num_input_tokens_seen": 2624280,
"step": 520
},
{
"epoch": 0.7878787878787878,
"eval_loss": 2.3315682411193848,
"eval_runtime": 5.8196,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 2624280,
"step": 520
},
{
"epoch": 0.7909090909090909,
"grad_norm": 0.5242352485656738,
"learning_rate": 1.4690037769583428e-06,
"loss": 2.429,
"num_input_tokens_seen": 2634072,
"step": 522
},
{
"epoch": 0.793939393939394,
"grad_norm": 0.48639097809791565,
"learning_rate": 1.4653870647636297e-06,
"loss": 2.4341,
"num_input_tokens_seen": 2643864,
"step": 524
},
{
"epoch": 0.796969696969697,
"grad_norm": 0.48426756262779236,
"learning_rate": 1.4617642474227797e-06,
"loss": 2.2926,
"num_input_tokens_seen": 2656152,
"step": 526
},
{
"epoch": 0.8,
"grad_norm": 0.5517458319664001,
"learning_rate": 1.45813540827757e-06,
"loss": 2.6445,
"num_input_tokens_seen": 2665968,
"step": 528
},
{
"epoch": 0.803030303030303,
"grad_norm": 0.540124237537384,
"learning_rate": 1.4545006308083055e-06,
"loss": 2.2952,
"num_input_tokens_seen": 2677680,
"step": 530
},
{
"epoch": 0.806060606060606,
"grad_norm": 0.5651832222938538,
"learning_rate": 1.4508599986319015e-06,
"loss": 2.4097,
"num_input_tokens_seen": 2687376,
"step": 532
},
{
"epoch": 0.8090909090909091,
"grad_norm": 0.4706498980522156,
"learning_rate": 1.4472135954999578e-06,
"loss": 2.2751,
"num_input_tokens_seen": 2699112,
"step": 534
},
{
"epoch": 0.8121212121212121,
"grad_norm": 0.5661342144012451,
"learning_rate": 1.4435615052968358e-06,
"loss": 2.4527,
"num_input_tokens_seen": 2710008,
"step": 536
},
{
"epoch": 0.8151515151515152,
"grad_norm": 0.49977409839630127,
"learning_rate": 1.4399038120377224e-06,
"loss": 2.3689,
"num_input_tokens_seen": 2720136,
"step": 538
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.5473623871803284,
"learning_rate": 1.4362405998667043e-06,
"loss": 2.4758,
"num_input_tokens_seen": 2729160,
"step": 540
},
{
"epoch": 0.8181818181818182,
"eval_loss": 2.3316752910614014,
"eval_runtime": 5.8161,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 2729160,
"step": 540
},
{
"epoch": 0.8212121212121212,
"grad_norm": 0.5338855385780334,
"learning_rate": 1.432571953054828e-06,
"loss": 2.3434,
"num_input_tokens_seen": 2739168,
"step": 542
},
{
"epoch": 0.8242424242424242,
"grad_norm": 0.5923134684562683,
"learning_rate": 1.4288979559981615e-06,
"loss": 2.364,
"num_input_tokens_seen": 2747688,
"step": 544
},
{
"epoch": 0.8272727272727273,
"grad_norm": 0.48334839940071106,
"learning_rate": 1.4252186932158546e-06,
"loss": 2.4677,
"num_input_tokens_seen": 2758488,
"step": 546
},
{
"epoch": 0.8303030303030303,
"grad_norm": 0.5619869828224182,
"learning_rate": 1.421534249348192e-06,
"loss": 2.5121,
"num_input_tokens_seen": 2768832,
"step": 548
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.6507293581962585,
"learning_rate": 1.4178447091546497e-06,
"loss": 2.491,
"num_input_tokens_seen": 2779584,
"step": 550
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.8891876935958862,
"learning_rate": 1.414150157511941e-06,
"loss": 2.3513,
"num_input_tokens_seen": 2786232,
"step": 552
},
{
"epoch": 0.8393939393939394,
"grad_norm": 0.5667576193809509,
"learning_rate": 1.410450679412067e-06,
"loss": 2.4317,
"num_input_tokens_seen": 2796216,
"step": 554
},
{
"epoch": 0.8424242424242424,
"grad_norm": 0.4579615592956543,
"learning_rate": 1.406746359960361e-06,
"loss": 2.3216,
"num_input_tokens_seen": 2807352,
"step": 556
},
{
"epoch": 0.8454545454545455,
"grad_norm": 0.4524303376674652,
"learning_rate": 1.403037284373529e-06,
"loss": 2.2947,
"num_input_tokens_seen": 2817936,
"step": 558
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.8141398429870605,
"learning_rate": 1.3993235379776908e-06,
"loss": 2.5013,
"num_input_tokens_seen": 2827104,
"step": 560
},
{
"epoch": 0.8484848484848485,
"eval_loss": 2.33099102973938,
"eval_runtime": 5.8178,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 2827104,
"step": 560
},
{
"epoch": 0.8515151515151516,
"grad_norm": 0.6005460023880005,
"learning_rate": 1.395605206206417e-06,
"loss": 2.3728,
"num_input_tokens_seen": 2834520,
"step": 562
},
{
"epoch": 0.8545454545454545,
"grad_norm": 0.6270483136177063,
"learning_rate": 1.3918823745987625e-06,
"loss": 2.5102,
"num_input_tokens_seen": 2845560,
"step": 564
},
{
"epoch": 0.8575757575757575,
"grad_norm": 0.5506067872047424,
"learning_rate": 1.3881551287973006e-06,
"loss": 2.4606,
"num_input_tokens_seen": 2856168,
"step": 566
},
{
"epoch": 0.8606060606060606,
"grad_norm": 0.5318931937217712,
"learning_rate": 1.384423554546151e-06,
"loss": 2.6367,
"num_input_tokens_seen": 2866872,
"step": 568
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.5173328518867493,
"learning_rate": 1.3806877376890084e-06,
"loss": 2.4952,
"num_input_tokens_seen": 2878296,
"step": 570
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.6837607622146606,
"learning_rate": 1.3769477641671668e-06,
"loss": 2.4297,
"num_input_tokens_seen": 2887056,
"step": 572
},
{
"epoch": 0.8696969696969697,
"grad_norm": 0.5360056757926941,
"learning_rate": 1.373203720017544e-06,
"loss": 2.3496,
"num_input_tokens_seen": 2896152,
"step": 574
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.5022287368774414,
"learning_rate": 1.3694556913706996e-06,
"loss": 2.4491,
"num_input_tokens_seen": 2905776,
"step": 576
},
{
"epoch": 0.8757575757575757,
"grad_norm": 0.691007137298584,
"learning_rate": 1.3657037644488574e-06,
"loss": 2.1934,
"num_input_tokens_seen": 2915568,
"step": 578
},
{
"epoch": 0.8787878787878788,
"grad_norm": 0.5107728838920593,
"learning_rate": 1.361948025563918e-06,
"loss": 2.3654,
"num_input_tokens_seen": 2926128,
"step": 580
},
{
"epoch": 0.8787878787878788,
"eval_loss": 2.33089542388916,
"eval_runtime": 5.8222,
"eval_samples_per_second": 3.435,
"eval_steps_per_second": 3.435,
"num_input_tokens_seen": 2926128,
"step": 580
},
{
"epoch": 0.8818181818181818,
"grad_norm": 0.5568860769271851,
"learning_rate": 1.3581885611154759e-06,
"loss": 2.4307,
"num_input_tokens_seen": 2933568,
"step": 582
},
{
"epoch": 0.8848484848484849,
"grad_norm": 0.6976082921028137,
"learning_rate": 1.3544254575888313e-06,
"loss": 2.6203,
"num_input_tokens_seen": 2942616,
"step": 584
},
{
"epoch": 0.8878787878787879,
"grad_norm": 0.5394561290740967,
"learning_rate": 1.3506588015529994e-06,
"loss": 2.4422,
"num_input_tokens_seen": 2952480,
"step": 586
},
{
"epoch": 0.8909090909090909,
"grad_norm": 0.5144073963165283,
"learning_rate": 1.3468886796587202e-06,
"loss": 2.2622,
"num_input_tokens_seen": 2962344,
"step": 588
},
{
"epoch": 0.8939393939393939,
"grad_norm": 0.5705990195274353,
"learning_rate": 1.3431151786364647e-06,
"loss": 2.3397,
"num_input_tokens_seen": 2969832,
"step": 590
},
{
"epoch": 0.896969696969697,
"grad_norm": 0.7521764636039734,
"learning_rate": 1.33933838529444e-06,
"loss": 2.4768,
"num_input_tokens_seen": 2979312,
"step": 592
},
{
"epoch": 0.9,
"grad_norm": 0.4214877784252167,
"learning_rate": 1.3355583865165912e-06,
"loss": 2.3752,
"num_input_tokens_seen": 2990568,
"step": 594
},
{
"epoch": 0.9030303030303031,
"grad_norm": 0.6079035401344299,
"learning_rate": 1.331775269260604e-06,
"loss": 2.3682,
"num_input_tokens_seen": 2998584,
"step": 596
},
{
"epoch": 0.906060606060606,
"grad_norm": 0.5687966346740723,
"learning_rate": 1.3279891205559034e-06,
"loss": 2.4906,
"num_input_tokens_seen": 3005784,
"step": 598
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.6438218355178833,
"learning_rate": 1.3242000275016527e-06,
"loss": 2.4142,
"num_input_tokens_seen": 3013968,
"step": 600
},
{
"epoch": 0.9090909090909091,
"eval_loss": 2.3308167457580566,
"eval_runtime": 5.8211,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 3013968,
"step": 600
},
{
"epoch": 0.9121212121212121,
"grad_norm": 0.8877610564231873,
"learning_rate": 1.3204080772647478e-06,
"loss": 2.8198,
"num_input_tokens_seen": 3021504,
"step": 602
},
{
"epoch": 0.9151515151515152,
"grad_norm": 0.6974935531616211,
"learning_rate": 1.3166133570778143e-06,
"loss": 2.4954,
"num_input_tokens_seen": 3033264,
"step": 604
},
{
"epoch": 0.9181818181818182,
"grad_norm": 0.4437900483608246,
"learning_rate": 1.3128159542371987e-06,
"loss": 2.4191,
"num_input_tokens_seen": 3044688,
"step": 606
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.41366204619407654,
"learning_rate": 1.309015956100962e-06,
"loss": 2.2432,
"num_input_tokens_seen": 3056592,
"step": 608
},
{
"epoch": 0.9242424242424242,
"grad_norm": 0.4901912808418274,
"learning_rate": 1.3052134500868686e-06,
"loss": 2.4408,
"num_input_tokens_seen": 3066048,
"step": 610
},
{
"epoch": 0.9272727272727272,
"grad_norm": 0.7082731127738953,
"learning_rate": 1.301408523670376e-06,
"loss": 2.5248,
"num_input_tokens_seen": 3076128,
"step": 612
},
{
"epoch": 0.9303030303030303,
"grad_norm": 0.6702643036842346,
"learning_rate": 1.297601264382622e-06,
"loss": 2.4202,
"num_input_tokens_seen": 3085464,
"step": 614
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.5271164178848267,
"learning_rate": 1.2937917598084123e-06,
"loss": 2.3525,
"num_input_tokens_seen": 3094440,
"step": 616
},
{
"epoch": 0.9363636363636364,
"grad_norm": 0.5742107629776001,
"learning_rate": 1.2899800975842038e-06,
"loss": 2.3598,
"num_input_tokens_seen": 3105720,
"step": 618
},
{
"epoch": 0.9393939393939394,
"grad_norm": 0.653012216091156,
"learning_rate": 1.286166365396089e-06,
"loss": 2.588,
"num_input_tokens_seen": 3113856,
"step": 620
},
{
"epoch": 0.9393939393939394,
"eval_loss": 2.3307266235351562,
"eval_runtime": 5.8207,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 3113856,
"step": 620
},
{
"epoch": 0.9424242424242424,
"grad_norm": 0.7475118041038513,
"learning_rate": 1.2823506509777807e-06,
"loss": 2.4249,
"num_input_tokens_seen": 3123288,
"step": 622
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.7373444437980652,
"learning_rate": 1.2785330421085917e-06,
"loss": 2.3551,
"num_input_tokens_seen": 3131256,
"step": 624
},
{
"epoch": 0.9484848484848485,
"grad_norm": 0.5523613691329956,
"learning_rate": 1.2747136266114156e-06,
"loss": 2.1922,
"num_input_tokens_seen": 3139656,
"step": 626
},
{
"epoch": 0.9515151515151515,
"grad_norm": 0.7101964950561523,
"learning_rate": 1.270892492350707e-06,
"loss": 2.4905,
"num_input_tokens_seen": 3147744,
"step": 628
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.5868334770202637,
"learning_rate": 1.267069727230461e-06,
"loss": 2.4588,
"num_input_tokens_seen": 3158376,
"step": 630
},
{
"epoch": 0.9575757575757575,
"grad_norm": 0.6006575226783752,
"learning_rate": 1.2632454191921894e-06,
"loss": 2.3059,
"num_input_tokens_seen": 3168120,
"step": 632
},
{
"epoch": 0.9606060606060606,
"grad_norm": 0.5622104406356812,
"learning_rate": 1.2594196562128978e-06,
"loss": 2.5159,
"num_input_tokens_seen": 3178176,
"step": 634
},
{
"epoch": 0.9636363636363636,
"grad_norm": 0.5180094242095947,
"learning_rate": 1.2555925263030634e-06,
"loss": 2.3614,
"num_input_tokens_seen": 3189816,
"step": 636
},
{
"epoch": 0.9666666666666667,
"grad_norm": 0.7544111013412476,
"learning_rate": 1.2517641175046078e-06,
"loss": 2.6341,
"num_input_tokens_seen": 3198528,
"step": 638
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.5005560517311096,
"learning_rate": 1.2479345178888752e-06,
"loss": 2.1493,
"num_input_tokens_seen": 3209904,
"step": 640
},
{
"epoch": 0.9696969696969697,
"eval_loss": 2.3306069374084473,
"eval_runtime": 5.8165,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 3209904,
"step": 640
},
{
"epoch": 0.9727272727272728,
"grad_norm": 0.6143120527267456,
"learning_rate": 1.244103815554602e-06,
"loss": 2.5543,
"num_input_tokens_seen": 3220584,
"step": 642
},
{
"epoch": 0.9757575757575757,
"grad_norm": 0.6468402147293091,
"learning_rate": 1.2402720986258936e-06,
"loss": 2.3468,
"num_input_tokens_seen": 3231576,
"step": 644
},
{
"epoch": 0.9787878787878788,
"grad_norm": 0.6000608205795288,
"learning_rate": 1.2364394552501951e-06,
"loss": 2.3648,
"num_input_tokens_seen": 3239208,
"step": 646
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.6772189140319824,
"learning_rate": 1.2326059735962648e-06,
"loss": 2.5894,
"num_input_tokens_seen": 3246072,
"step": 648
},
{
"epoch": 0.9848484848484849,
"grad_norm": 0.5030667185783386,
"learning_rate": 1.228771741852145e-06,
"loss": 2.4484,
"num_input_tokens_seen": 3258000,
"step": 650
},
{
"epoch": 0.9878787878787879,
"grad_norm": 0.8376536965370178,
"learning_rate": 1.2249368482231334e-06,
"loss": 2.5076,
"num_input_tokens_seen": 3264912,
"step": 652
},
{
"epoch": 0.990909090909091,
"grad_norm": 0.6285922527313232,
"learning_rate": 1.2211013809297546e-06,
"loss": 2.3112,
"num_input_tokens_seen": 3272832,
"step": 654
},
{
"epoch": 0.9939393939393939,
"grad_norm": 0.49095821380615234,
"learning_rate": 1.21726542820573e-06,
"loss": 2.3038,
"num_input_tokens_seen": 3283848,
"step": 656
},
{
"epoch": 0.996969696969697,
"grad_norm": 0.5539312958717346,
"learning_rate": 1.213429078295948e-06,
"loss": 2.3811,
"num_input_tokens_seen": 3295272,
"step": 658
},
{
"epoch": 1.0,
"grad_norm": 0.46812400221824646,
"learning_rate": 1.2095924194544344e-06,
"loss": 2.4287,
"num_input_tokens_seen": 3305760,
"step": 660
},
{
"epoch": 1.0,
"eval_loss": 2.3300185203552246,
"eval_runtime": 5.8178,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 3305760,
"step": 660
},
{
"epoch": 1.003030303030303,
"grad_norm": 0.45484259724617004,
"learning_rate": 1.2057555399423218e-06,
"loss": 2.4229,
"num_input_tokens_seen": 3316512,
"step": 662
},
{
"epoch": 1.006060606060606,
"grad_norm": 0.506411612033844,
"learning_rate": 1.201918528025819e-06,
"loss": 2.3718,
"num_input_tokens_seen": 3328800,
"step": 664
},
{
"epoch": 1.009090909090909,
"grad_norm": 0.7456917762756348,
"learning_rate": 1.1980814719741809e-06,
"loss": 2.5418,
"num_input_tokens_seen": 3335424,
"step": 666
},
{
"epoch": 1.0121212121212122,
"grad_norm": 0.6323581337928772,
"learning_rate": 1.1942444600576783e-06,
"loss": 2.4076,
"num_input_tokens_seen": 3344904,
"step": 668
},
{
"epoch": 1.0151515151515151,
"grad_norm": 0.6008067727088928,
"learning_rate": 1.1904075805455657e-06,
"loss": 2.3543,
"num_input_tokens_seen": 3355176,
"step": 670
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.6115381121635437,
"learning_rate": 1.186570921704052e-06,
"loss": 2.3537,
"num_input_tokens_seen": 3366096,
"step": 672
},
{
"epoch": 1.0212121212121212,
"grad_norm": 0.5540327429771423,
"learning_rate": 1.18273457179427e-06,
"loss": 2.1717,
"num_input_tokens_seen": 3375696,
"step": 674
},
{
"epoch": 1.0242424242424242,
"grad_norm": 0.6130234599113464,
"learning_rate": 1.1788986190702453e-06,
"loss": 2.408,
"num_input_tokens_seen": 3384288,
"step": 676
},
{
"epoch": 1.0272727272727273,
"grad_norm": 0.6069101095199585,
"learning_rate": 1.1750631517768667e-06,
"loss": 2.3485,
"num_input_tokens_seen": 3391128,
"step": 678
},
{
"epoch": 1.0303030303030303,
"grad_norm": 0.5664869546890259,
"learning_rate": 1.1712282581478552e-06,
"loss": 2.4617,
"num_input_tokens_seen": 3401640,
"step": 680
},
{
"epoch": 1.0303030303030303,
"eval_loss": 2.3293986320495605,
"eval_runtime": 5.8211,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 3401640,
"step": 680
},
{
"epoch": 1.0333333333333334,
"grad_norm": 0.5258334875106812,
"learning_rate": 1.167394026403735e-06,
"loss": 2.3971,
"num_input_tokens_seen": 3411120,
"step": 682
},
{
"epoch": 1.0363636363636364,
"grad_norm": 0.5583547353744507,
"learning_rate": 1.1635605447498048e-06,
"loss": 2.3265,
"num_input_tokens_seen": 3420912,
"step": 684
},
{
"epoch": 1.0393939393939393,
"grad_norm": 0.5852888822555542,
"learning_rate": 1.1597279013741067e-06,
"loss": 2.5114,
"num_input_tokens_seen": 3429744,
"step": 686
},
{
"epoch": 1.0424242424242425,
"grad_norm": 0.5078532695770264,
"learning_rate": 1.1558961844453978e-06,
"loss": 2.5497,
"num_input_tokens_seen": 3438936,
"step": 688
},
{
"epoch": 1.0454545454545454,
"grad_norm": 0.9847856760025024,
"learning_rate": 1.152065482111125e-06,
"loss": 2.5458,
"num_input_tokens_seen": 3444912,
"step": 690
},
{
"epoch": 1.0484848484848486,
"grad_norm": 0.49534177780151367,
"learning_rate": 1.1482358824953919e-06,
"loss": 2.3622,
"num_input_tokens_seen": 3456936,
"step": 692
},
{
"epoch": 1.0515151515151515,
"grad_norm": 0.6851257681846619,
"learning_rate": 1.144407473696937e-06,
"loss": 2.221,
"num_input_tokens_seen": 3466344,
"step": 694
},
{
"epoch": 1.0545454545454545,
"grad_norm": 0.4764980375766754,
"learning_rate": 1.1405803437871027e-06,
"loss": 2.3708,
"num_input_tokens_seen": 3478632,
"step": 696
},
{
"epoch": 1.0575757575757576,
"grad_norm": 0.6040279865264893,
"learning_rate": 1.136754580807811e-06,
"loss": 2.5175,
"num_input_tokens_seen": 3485496,
"step": 698
},
{
"epoch": 1.0606060606060606,
"grad_norm": 0.6335225701332092,
"learning_rate": 1.1329302727695389e-06,
"loss": 2.2166,
"num_input_tokens_seen": 3496272,
"step": 700
},
{
"epoch": 1.0606060606060606,
"eval_loss": 2.329413890838623,
"eval_runtime": 5.8255,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 3496272,
"step": 700
},
{
"epoch": 1.0636363636363637,
"grad_norm": 0.6664142608642578,
"learning_rate": 1.1291075076492928e-06,
"loss": 2.5228,
"num_input_tokens_seen": 3506712,
"step": 702
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.5364813208580017,
"learning_rate": 1.1252863733885845e-06,
"loss": 2.4304,
"num_input_tokens_seen": 3518856,
"step": 704
},
{
"epoch": 1.0696969696969698,
"grad_norm": 0.7389492988586426,
"learning_rate": 1.1214669578914087e-06,
"loss": 2.0998,
"num_input_tokens_seen": 3528456,
"step": 706
},
{
"epoch": 1.0727272727272728,
"grad_norm": 0.709426999092102,
"learning_rate": 1.1176493490222192e-06,
"loss": 2.146,
"num_input_tokens_seen": 3537048,
"step": 708
},
{
"epoch": 1.0757575757575757,
"grad_norm": 0.7311533093452454,
"learning_rate": 1.1138336346039113e-06,
"loss": 2.3275,
"num_input_tokens_seen": 3544536,
"step": 710
},
{
"epoch": 1.0787878787878789,
"grad_norm": 0.5675577521324158,
"learning_rate": 1.1100199024157966e-06,
"loss": 2.3477,
"num_input_tokens_seen": 3551472,
"step": 712
},
{
"epoch": 1.0818181818181818,
"grad_norm": 0.6367121934890747,
"learning_rate": 1.1062082401915878e-06,
"loss": 2.4356,
"num_input_tokens_seen": 3561312,
"step": 714
},
{
"epoch": 1.084848484848485,
"grad_norm": 0.5750899910926819,
"learning_rate": 1.1023987356173782e-06,
"loss": 2.5201,
"num_input_tokens_seen": 3570456,
"step": 716
},
{
"epoch": 1.087878787878788,
"grad_norm": 0.46258801221847534,
"learning_rate": 1.0985914763296245e-06,
"loss": 2.0526,
"num_input_tokens_seen": 3582744,
"step": 718
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.6125935912132263,
"learning_rate": 1.0947865499131315e-06,
"loss": 2.2984,
"num_input_tokens_seen": 3595032,
"step": 720
},
{
"epoch": 1.0909090909090908,
"eval_loss": 2.328953504562378,
"eval_runtime": 5.8254,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 3595032,
"step": 720
},
{
"epoch": 1.093939393939394,
"grad_norm": 0.48193785548210144,
"learning_rate": 1.0909840438990383e-06,
"loss": 1.7515,
"num_input_tokens_seen": 3606048,
"step": 722
},
{
"epoch": 1.096969696969697,
"grad_norm": 0.48528820276260376,
"learning_rate": 1.0871840457628012e-06,
"loss": 2.3416,
"num_input_tokens_seen": 3616368,
"step": 724
},
{
"epoch": 1.1,
"grad_norm": 0.46913468837738037,
"learning_rate": 1.0833866429221858e-06,
"loss": 2.3327,
"num_input_tokens_seen": 3628368,
"step": 726
},
{
"epoch": 1.103030303030303,
"grad_norm": 0.5710415840148926,
"learning_rate": 1.0795919227352523e-06,
"loss": 2.401,
"num_input_tokens_seen": 3637848,
"step": 728
},
{
"epoch": 1.106060606060606,
"grad_norm": 0.5964322090148926,
"learning_rate": 1.0757999724983474e-06,
"loss": 2.2503,
"num_input_tokens_seen": 3647640,
"step": 730
},
{
"epoch": 1.1090909090909091,
"grad_norm": 0.5693560242652893,
"learning_rate": 1.0720108794440967e-06,
"loss": 2.4449,
"num_input_tokens_seen": 3658272,
"step": 732
},
{
"epoch": 1.112121212121212,
"grad_norm": 0.7325261235237122,
"learning_rate": 1.068224730739396e-06,
"loss": 2.2787,
"num_input_tokens_seen": 3668760,
"step": 734
},
{
"epoch": 1.1151515151515152,
"grad_norm": 0.5507751107215881,
"learning_rate": 1.064441613483409e-06,
"loss": 2.2226,
"num_input_tokens_seen": 3679608,
"step": 736
},
{
"epoch": 1.1181818181818182,
"grad_norm": 0.4701879620552063,
"learning_rate": 1.0606616147055602e-06,
"loss": 2.6116,
"num_input_tokens_seen": 3689832,
"step": 738
},
{
"epoch": 1.121212121212121,
"grad_norm": 0.5531448125839233,
"learning_rate": 1.056884821363535e-06,
"loss": 2.1242,
"num_input_tokens_seen": 3700392,
"step": 740
},
{
"epoch": 1.121212121212121,
"eval_loss": 2.3289198875427246,
"eval_runtime": 5.8244,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 3700392,
"step": 740
},
{
"epoch": 1.1242424242424243,
"grad_norm": 0.7482770085334778,
"learning_rate": 1.05311132034128e-06,
"loss": 2.3979,
"num_input_tokens_seen": 3709632,
"step": 742
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.6427175998687744,
"learning_rate": 1.0493411984470007e-06,
"loss": 2.4608,
"num_input_tokens_seen": 3717720,
"step": 744
},
{
"epoch": 1.1303030303030304,
"grad_norm": 0.5718503594398499,
"learning_rate": 1.0455745424111686e-06,
"loss": 2.5028,
"num_input_tokens_seen": 3728280,
"step": 746
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.8905156850814819,
"learning_rate": 1.0418114388845242e-06,
"loss": 2.5461,
"num_input_tokens_seen": 3735888,
"step": 748
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.535351037979126,
"learning_rate": 1.038051974436082e-06,
"loss": 2.2596,
"num_input_tokens_seen": 3747720,
"step": 750
},
{
"epoch": 1.1393939393939394,
"grad_norm": 0.5600206255912781,
"learning_rate": 1.034296235551143e-06,
"loss": 2.2801,
"num_input_tokens_seen": 3758640,
"step": 752
},
{
"epoch": 1.1424242424242423,
"grad_norm": 0.5470922589302063,
"learning_rate": 1.0305443086293003e-06,
"loss": 2.3337,
"num_input_tokens_seen": 3769128,
"step": 754
},
{
"epoch": 1.1454545454545455,
"grad_norm": 0.5066417455673218,
"learning_rate": 1.0267962799824562e-06,
"loss": 2.6706,
"num_input_tokens_seen": 3779304,
"step": 756
},
{
"epoch": 1.1484848484848484,
"grad_norm": 0.46135252714157104,
"learning_rate": 1.0230522358328331e-06,
"loss": 2.2422,
"num_input_tokens_seen": 3789312,
"step": 758
},
{
"epoch": 1.1515151515151516,
"grad_norm": 0.7310757637023926,
"learning_rate": 1.0193122623109917e-06,
"loss": 2.4892,
"num_input_tokens_seen": 3796848,
"step": 760
},
{
"epoch": 1.1515151515151516,
"eval_loss": 2.3289122581481934,
"eval_runtime": 5.8291,
"eval_samples_per_second": 3.431,
"eval_steps_per_second": 3.431,
"num_input_tokens_seen": 3796848,
"step": 760
},
{
"epoch": 1.1545454545454545,
"grad_norm": 0.5655786991119385,
"learning_rate": 1.015576445453849e-06,
"loss": 2.2826,
"num_input_tokens_seen": 3806640,
"step": 762
},
{
"epoch": 1.1575757575757575,
"grad_norm": 0.6524637341499329,
"learning_rate": 1.0118448712026992e-06,
"loss": 2.4358,
"num_input_tokens_seen": 3817608,
"step": 764
},
{
"epoch": 1.1606060606060606,
"grad_norm": 0.6280786991119385,
"learning_rate": 1.0081176254012374e-06,
"loss": 2.421,
"num_input_tokens_seen": 3827592,
"step": 766
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.6797434687614441,
"learning_rate": 1.0043947937935832e-06,
"loss": 2.3245,
"num_input_tokens_seen": 3837264,
"step": 768
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.5665332078933716,
"learning_rate": 1.0006764620223093e-06,
"loss": 2.3388,
"num_input_tokens_seen": 3847656,
"step": 770
},
{
"epoch": 1.1696969696969697,
"grad_norm": 0.4868026077747345,
"learning_rate": 9.96962715626471e-07,
"loss": 2.3956,
"num_input_tokens_seen": 3858600,
"step": 772
},
{
"epoch": 1.1727272727272728,
"grad_norm": 0.77336585521698,
"learning_rate": 9.932536400396393e-07,
"loss": 2.3562,
"num_input_tokens_seen": 3870120,
"step": 774
},
{
"epoch": 1.1757575757575758,
"grad_norm": 0.6464818120002747,
"learning_rate": 9.895493205879332e-07,
"loss": 2.5851,
"num_input_tokens_seen": 3879600,
"step": 776
},
{
"epoch": 1.1787878787878787,
"grad_norm": 0.6274628639221191,
"learning_rate": 9.858498424880592e-07,
"loss": 2.7061,
"num_input_tokens_seen": 3889296,
"step": 778
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.5714861154556274,
"learning_rate": 9.821552908453506e-07,
"loss": 2.4251,
"num_input_tokens_seen": 3901464,
"step": 780
},
{
"epoch": 1.1818181818181819,
"eval_loss": 2.3287835121154785,
"eval_runtime": 5.8272,
"eval_samples_per_second": 3.432,
"eval_steps_per_second": 3.432,
"num_input_tokens_seen": 3901464,
"step": 780
},
{
"epoch": 1.1848484848484848,
"grad_norm": 0.6943346261978149,
"learning_rate": 9.784657506518078e-07,
"loss": 2.6212,
"num_input_tokens_seen": 3910656,
"step": 782
},
{
"epoch": 1.187878787878788,
"grad_norm": 0.4821998178958893,
"learning_rate": 9.747813067841455e-07,
"loss": 2.3086,
"num_input_tokens_seen": 3922944,
"step": 784
},
{
"epoch": 1.190909090909091,
"grad_norm": 0.7086381912231445,
"learning_rate": 9.711020440018384e-07,
"loss": 2.5027,
"num_input_tokens_seen": 3931752,
"step": 786
},
{
"epoch": 1.1939393939393939,
"grad_norm": 0.5712624788284302,
"learning_rate": 9.674280469451718e-07,
"loss": 2.4088,
"num_input_tokens_seen": 3942120,
"step": 788
},
{
"epoch": 1.196969696969697,
"grad_norm": 0.6443710327148438,
"learning_rate": 9.637594001332956e-07,
"loss": 2.3161,
"num_input_tokens_seen": 3952248,
"step": 790
},
{
"epoch": 1.2,
"grad_norm": 0.6674967408180237,
"learning_rate": 9.600961879622777e-07,
"loss": 2.4837,
"num_input_tokens_seen": 3960600,
"step": 792
},
{
"epoch": 1.2030303030303031,
"grad_norm": 0.5792006254196167,
"learning_rate": 9.564384947031646e-07,
"loss": 2.3195,
"num_input_tokens_seen": 3971568,
"step": 794
},
{
"epoch": 1.206060606060606,
"grad_norm": 0.7185015082359314,
"learning_rate": 9.527864045000421e-07,
"loss": 2.5749,
"num_input_tokens_seen": 3983592,
"step": 796
},
{
"epoch": 1.209090909090909,
"grad_norm": 0.6423861980438232,
"learning_rate": 9.491400013680988e-07,
"loss": 2.39,
"num_input_tokens_seen": 3994008,
"step": 798
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.6292434334754944,
"learning_rate": 9.454993691916948e-07,
"loss": 2.3579,
"num_input_tokens_seen": 4004496,
"step": 800
},
{
"epoch": 1.2121212121212122,
"eval_loss": 2.3282077312469482,
"eval_runtime": 5.8254,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4004496,
"step": 800
},
{
"epoch": 1.215151515151515,
"grad_norm": 0.6097608208656311,
"learning_rate": 9.418645917224303e-07,
"loss": 2.3152,
"num_input_tokens_seen": 4016592,
"step": 802
},
{
"epoch": 1.2181818181818183,
"grad_norm": 0.5774179100990295,
"learning_rate": 9.382357525772202e-07,
"loss": 2.4599,
"num_input_tokens_seen": 4024800,
"step": 804
},
{
"epoch": 1.2212121212121212,
"grad_norm": 0.645380973815918,
"learning_rate": 9.346129352363705e-07,
"loss": 2.2412,
"num_input_tokens_seen": 4035144,
"step": 806
},
{
"epoch": 1.2242424242424241,
"grad_norm": 0.6434935331344604,
"learning_rate": 9.309962230416574e-07,
"loss": 2.4022,
"num_input_tokens_seen": 4042920,
"step": 808
},
{
"epoch": 1.2272727272727273,
"grad_norm": 0.5125094056129456,
"learning_rate": 9.273856991944089e-07,
"loss": 2.4082,
"num_input_tokens_seen": 4053072,
"step": 810
},
{
"epoch": 1.2303030303030302,
"grad_norm": 0.5167670845985413,
"learning_rate": 9.237814467535941e-07,
"loss": 2.3188,
"num_input_tokens_seen": 4063368,
"step": 812
},
{
"epoch": 1.2333333333333334,
"grad_norm": 0.5533791184425354,
"learning_rate": 9.201835486339084e-07,
"loss": 2.4367,
"num_input_tokens_seen": 4072392,
"step": 814
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.5429077744483948,
"learning_rate": 9.165920876038694e-07,
"loss": 2.3054,
"num_input_tokens_seen": 4083072,
"step": 816
},
{
"epoch": 1.2393939393939393,
"grad_norm": 0.530968427658081,
"learning_rate": 9.130071462839108e-07,
"loss": 2.4475,
"num_input_tokens_seen": 4093776,
"step": 818
},
{
"epoch": 1.2424242424242424,
"grad_norm": 0.5137664675712585,
"learning_rate": 9.094288071444822e-07,
"loss": 2.4868,
"num_input_tokens_seen": 4106040,
"step": 820
},
{
"epoch": 1.2424242424242424,
"eval_loss": 2.3283748626708984,
"eval_runtime": 5.8265,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4106040,
"step": 820
},
{
"epoch": 1.2454545454545454,
"grad_norm": 0.7732150554656982,
"learning_rate": 9.058571525041534e-07,
"loss": 2.4682,
"num_input_tokens_seen": 4117392,
"step": 822
},
{
"epoch": 1.2484848484848485,
"grad_norm": 0.6861566305160522,
"learning_rate": 9.022922645277176e-07,
"loss": 2.372,
"num_input_tokens_seen": 4125696,
"step": 824
},
{
"epoch": 1.2515151515151515,
"grad_norm": 0.4728741943836212,
"learning_rate": 8.987342252243042e-07,
"loss": 2.4424,
"num_input_tokens_seen": 4137816,
"step": 826
},
{
"epoch": 1.2545454545454544,
"grad_norm": 0.5557587742805481,
"learning_rate": 8.951831164454908e-07,
"loss": 2.4164,
"num_input_tokens_seen": 4150104,
"step": 828
},
{
"epoch": 1.2575757575757576,
"grad_norm": 0.6730014085769653,
"learning_rate": 8.916390198834203e-07,
"loss": 2.4451,
"num_input_tokens_seen": 4160832,
"step": 830
},
{
"epoch": 1.2606060606060607,
"grad_norm": 0.7126666307449341,
"learning_rate": 8.88102017068922e-07,
"loss": 2.3256,
"num_input_tokens_seen": 4170216,
"step": 832
},
{
"epoch": 1.2636363636363637,
"grad_norm": 0.6457303762435913,
"learning_rate": 8.845721893696354e-07,
"loss": 2.2176,
"num_input_tokens_seen": 4181256,
"step": 834
},
{
"epoch": 1.2666666666666666,
"grad_norm": 1.0662436485290527,
"learning_rate": 8.810496179881387e-07,
"loss": 2.3812,
"num_input_tokens_seen": 4192128,
"step": 836
},
{
"epoch": 1.2696969696969698,
"grad_norm": 0.4683075547218323,
"learning_rate": 8.775343839600816e-07,
"loss": 2.4275,
"num_input_tokens_seen": 4202208,
"step": 838
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.5171107649803162,
"learning_rate": 8.740265681523195e-07,
"loss": 2.4706,
"num_input_tokens_seen": 4210464,
"step": 840
},
{
"epoch": 1.2727272727272727,
"eval_loss": 2.3279545307159424,
"eval_runtime": 5.823,
"eval_samples_per_second": 3.435,
"eval_steps_per_second": 3.435,
"num_input_tokens_seen": 4210464,
"step": 840
},
{
"epoch": 1.2757575757575759,
"grad_norm": 0.7313932180404663,
"learning_rate": 8.705262512610539e-07,
"loss": 2.4054,
"num_input_tokens_seen": 4217928,
"step": 842
},
{
"epoch": 1.2787878787878788,
"grad_norm": 0.7015888690948486,
"learning_rate": 8.670335138099765e-07,
"loss": 2.4653,
"num_input_tokens_seen": 4226904,
"step": 844
},
{
"epoch": 1.2818181818181817,
"grad_norm": 0.6179009079933167,
"learning_rate": 8.635484361484158e-07,
"loss": 2.3184,
"num_input_tokens_seen": 4237656,
"step": 846
},
{
"epoch": 1.284848484848485,
"grad_norm": 0.5112322568893433,
"learning_rate": 8.600710984494909e-07,
"loss": 2.3415,
"num_input_tokens_seen": 4248720,
"step": 848
},
{
"epoch": 1.2878787878787878,
"grad_norm": 0.7824225425720215,
"learning_rate": 8.56601580708263e-07,
"loss": 2.6382,
"num_input_tokens_seen": 4253448,
"step": 850
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.7822674512863159,
"learning_rate": 8.531399627398991e-07,
"loss": 2.5681,
"num_input_tokens_seen": 4261488,
"step": 852
},
{
"epoch": 1.293939393939394,
"grad_norm": 0.5791777968406677,
"learning_rate": 8.496863241778346e-07,
"loss": 2.2039,
"num_input_tokens_seen": 4273104,
"step": 854
},
{
"epoch": 1.2969696969696969,
"grad_norm": 0.5415911674499512,
"learning_rate": 8.462407444719405e-07,
"loss": 2.3936,
"num_input_tokens_seen": 4283136,
"step": 856
},
{
"epoch": 1.3,
"grad_norm": 0.5852922797203064,
"learning_rate": 8.428033028866967e-07,
"loss": 2.3669,
"num_input_tokens_seen": 4292208,
"step": 858
},
{
"epoch": 1.303030303030303,
"grad_norm": 0.5799878239631653,
"learning_rate": 8.393740784993677e-07,
"loss": 2.4704,
"num_input_tokens_seen": 4302240,
"step": 860
},
{
"epoch": 1.303030303030303,
"eval_loss": 2.3276970386505127,
"eval_runtime": 5.8227,
"eval_samples_per_second": 3.435,
"eval_steps_per_second": 3.435,
"num_input_tokens_seen": 4302240,
"step": 860
},
{
"epoch": 1.3060606060606061,
"grad_norm": 0.7296667695045471,
"learning_rate": 8.359531501981846e-07,
"loss": 2.7633,
"num_input_tokens_seen": 4311888,
"step": 862
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.9460285305976868,
"learning_rate": 8.325405966805295e-07,
"loss": 2.1671,
"num_input_tokens_seen": 4321992,
"step": 864
},
{
"epoch": 1.312121212121212,
"grad_norm": 0.5294950008392334,
"learning_rate": 8.291364964511247e-07,
"loss": 2.4139,
"num_input_tokens_seen": 4332408,
"step": 866
},
{
"epoch": 1.3151515151515152,
"grad_norm": 0.6206031441688538,
"learning_rate": 8.25740927820228e-07,
"loss": 2.5621,
"num_input_tokens_seen": 4344696,
"step": 868
},
{
"epoch": 1.3181818181818181,
"grad_norm": 0.5652275085449219,
"learning_rate": 8.223539689018299e-07,
"loss": 2.4142,
"num_input_tokens_seen": 4356168,
"step": 870
},
{
"epoch": 1.3212121212121213,
"grad_norm": 0.6217209696769714,
"learning_rate": 8.189756976118568e-07,
"loss": 2.3459,
"num_input_tokens_seen": 4364568,
"step": 872
},
{
"epoch": 1.3242424242424242,
"grad_norm": 0.5359376072883606,
"learning_rate": 8.156061916663807e-07,
"loss": 2.2973,
"num_input_tokens_seen": 4374984,
"step": 874
},
{
"epoch": 1.3272727272727272,
"grad_norm": 0.531065821647644,
"learning_rate": 8.12245528579828e-07,
"loss": 2.5294,
"num_input_tokens_seen": 4385424,
"step": 876
},
{
"epoch": 1.3303030303030303,
"grad_norm": 0.837188184261322,
"learning_rate": 8.088937856631974e-07,
"loss": 2.4239,
"num_input_tokens_seen": 4395192,
"step": 878
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.6112043857574463,
"learning_rate": 8.055510400222836e-07,
"loss": 2.4403,
"num_input_tokens_seen": 4405608,
"step": 880
},
{
"epoch": 1.3333333333333333,
"eval_loss": 2.328122138977051,
"eval_runtime": 5.8246,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 4405608,
"step": 880
},
{
"epoch": 1.3363636363636364,
"grad_norm": 0.6124045252799988,
"learning_rate": 8.022173685559011e-07,
"loss": 2.389,
"num_input_tokens_seen": 4417896,
"step": 882
},
{
"epoch": 1.3393939393939394,
"grad_norm": 0.6339285969734192,
"learning_rate": 7.988928479541154e-07,
"loss": 2.3811,
"num_input_tokens_seen": 4428000,
"step": 884
},
{
"epoch": 1.3424242424242423,
"grad_norm": 0.5700270533561707,
"learning_rate": 7.955775546964797e-07,
"loss": 2.4351,
"num_input_tokens_seen": 4436736,
"step": 886
},
{
"epoch": 1.3454545454545455,
"grad_norm": 0.5536416172981262,
"learning_rate": 7.922715650502746e-07,
"loss": 2.4343,
"num_input_tokens_seen": 4447488,
"step": 888
},
{
"epoch": 1.3484848484848486,
"grad_norm": 0.6863646507263184,
"learning_rate": 7.889749550687552e-07,
"loss": 2.5435,
"num_input_tokens_seen": 4455840,
"step": 890
},
{
"epoch": 1.3515151515151516,
"grad_norm": 0.6737553477287292,
"learning_rate": 7.856878005893988e-07,
"loss": 2.3398,
"num_input_tokens_seen": 4463568,
"step": 892
},
{
"epoch": 1.3545454545454545,
"grad_norm": 0.7057380676269531,
"learning_rate": 7.824101772321625e-07,
"loss": 2.3618,
"num_input_tokens_seen": 4472904,
"step": 894
},
{
"epoch": 1.3575757575757577,
"grad_norm": 0.47144582867622375,
"learning_rate": 7.791421603977435e-07,
"loss": 2.1904,
"num_input_tokens_seen": 4484400,
"step": 896
},
{
"epoch": 1.3606060606060606,
"grad_norm": 0.5720792412757874,
"learning_rate": 7.758838252658433e-07,
"loss": 2.3122,
"num_input_tokens_seen": 4493592,
"step": 898
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.6241214275360107,
"learning_rate": 7.726352467934386e-07,
"loss": 2.4964,
"num_input_tokens_seen": 4502664,
"step": 900
},
{
"epoch": 1.3636363636363638,
"eval_loss": 2.327789783477783,
"eval_runtime": 5.8265,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4502664,
"step": 900
},
{
"epoch": 1.3666666666666667,
"grad_norm": 0.6216875910758972,
"learning_rate": 7.693964997130581e-07,
"loss": 2.4142,
"num_input_tokens_seen": 4510920,
"step": 902
},
{
"epoch": 1.3696969696969696,
"grad_norm": 0.5733647346496582,
"learning_rate": 7.661676585310618e-07,
"loss": 2.3751,
"num_input_tokens_seen": 4523208,
"step": 904
},
{
"epoch": 1.3727272727272728,
"grad_norm": 0.5904967784881592,
"learning_rate": 7.629487975259276e-07,
"loss": 2.5808,
"num_input_tokens_seen": 4532520,
"step": 906
},
{
"epoch": 1.3757575757575757,
"grad_norm": 0.44976285099983215,
"learning_rate": 7.597399907465431e-07,
"loss": 2.3199,
"num_input_tokens_seen": 4544688,
"step": 908
},
{
"epoch": 1.378787878787879,
"grad_norm": 0.6326127052307129,
"learning_rate": 7.565413120105009e-07,
"loss": 2.3752,
"num_input_tokens_seen": 4554000,
"step": 910
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.5754263997077942,
"learning_rate": 7.533528349024014e-07,
"loss": 2.3512,
"num_input_tokens_seen": 4564368,
"step": 912
},
{
"epoch": 1.3848484848484848,
"grad_norm": 0.7068946957588196,
"learning_rate": 7.5017463277216e-07,
"loss": 2.3772,
"num_input_tokens_seen": 4574448,
"step": 914
},
{
"epoch": 1.387878787878788,
"grad_norm": 0.6131560206413269,
"learning_rate": 7.470067787333188e-07,
"loss": 2.4036,
"num_input_tokens_seen": 4582464,
"step": 916
},
{
"epoch": 1.3909090909090909,
"grad_norm": 0.6577942967414856,
"learning_rate": 7.43849345661367e-07,
"loss": 2.3063,
"num_input_tokens_seen": 4592976,
"step": 918
},
{
"epoch": 1.393939393939394,
"grad_norm": 0.7147580981254578,
"learning_rate": 7.407024061920599e-07,
"loss": 2.4129,
"num_input_tokens_seen": 4603920,
"step": 920
},
{
"epoch": 1.393939393939394,
"eval_loss": 2.32749080657959,
"eval_runtime": 5.8263,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4603920,
"step": 920
},
{
"epoch": 1.396969696969697,
"grad_norm": 0.5320861339569092,
"learning_rate": 7.375660327197534e-07,
"loss": 2.3207,
"num_input_tokens_seen": 4614072,
"step": 922
},
{
"epoch": 1.4,
"grad_norm": 0.6758208870887756,
"learning_rate": 7.344402973957346e-07,
"loss": 2.4536,
"num_input_tokens_seen": 4622640,
"step": 924
},
{
"epoch": 1.403030303030303,
"grad_norm": 0.5670093894004822,
"learning_rate": 7.313252721265638e-07,
"loss": 2.5495,
"num_input_tokens_seen": 4634040,
"step": 926
},
{
"epoch": 1.406060606060606,
"grad_norm": 0.5245952606201172,
"learning_rate": 7.282210285724195e-07,
"loss": 2.4487,
"num_input_tokens_seen": 4644192,
"step": 928
},
{
"epoch": 1.4090909090909092,
"grad_norm": 0.4705655872821808,
"learning_rate": 7.251276381454506e-07,
"loss": 2.5896,
"num_input_tokens_seen": 4653720,
"step": 930
},
{
"epoch": 1.412121212121212,
"grad_norm": 0.5075128674507141,
"learning_rate": 7.22045172008133e-07,
"loss": 2.261,
"num_input_tokens_seen": 4666008,
"step": 932
},
{
"epoch": 1.415151515151515,
"grad_norm": 0.5407282710075378,
"learning_rate": 7.189737010716326e-07,
"loss": 2.384,
"num_input_tokens_seen": 4674936,
"step": 934
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.6681150794029236,
"learning_rate": 7.159132959941745e-07,
"loss": 2.4542,
"num_input_tokens_seen": 4684272,
"step": 936
},
{
"epoch": 1.4212121212121211,
"grad_norm": 0.6024764776229858,
"learning_rate": 7.128640271794171e-07,
"loss": 2.3937,
"num_input_tokens_seen": 4695576,
"step": 938
},
{
"epoch": 1.4242424242424243,
"grad_norm": 0.5031726956367493,
"learning_rate": 7.098259647748328e-07,
"loss": 2.2943,
"num_input_tokens_seen": 4705800,
"step": 940
},
{
"epoch": 1.4242424242424243,
"eval_loss": 2.3277194499969482,
"eval_runtime": 5.8264,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4705800,
"step": 940
},
{
"epoch": 1.4272727272727272,
"grad_norm": 0.5406504273414612,
"learning_rate": 7.067991786700929e-07,
"loss": 2.3552,
"num_input_tokens_seen": 4718088,
"step": 942
},
{
"epoch": 1.4303030303030302,
"grad_norm": 0.5154955387115479,
"learning_rate": 7.037837384954625e-07,
"loss": 2.4507,
"num_input_tokens_seen": 4729536,
"step": 944
},
{
"epoch": 1.4333333333333333,
"grad_norm": 0.710150420665741,
"learning_rate": 7.007797136201966e-07,
"loss": 2.4813,
"num_input_tokens_seen": 4738272,
"step": 946
},
{
"epoch": 1.4363636363636363,
"grad_norm": 0.5603686571121216,
"learning_rate": 6.977871731509438e-07,
"loss": 2.4679,
"num_input_tokens_seen": 4747488,
"step": 948
},
{
"epoch": 1.4393939393939394,
"grad_norm": 0.6040205359458923,
"learning_rate": 6.948061859301593e-07,
"loss": 2.5084,
"num_input_tokens_seen": 4756032,
"step": 950
},
{
"epoch": 1.4424242424242424,
"grad_norm": 0.6151003837585449,
"learning_rate": 6.918368205345182e-07,
"loss": 2.3797,
"num_input_tokens_seen": 4766904,
"step": 952
},
{
"epoch": 1.4454545454545453,
"grad_norm": 0.5921849012374878,
"learning_rate": 6.888791452733397e-07,
"loss": 2.4923,
"num_input_tokens_seen": 4777680,
"step": 954
},
{
"epoch": 1.4484848484848485,
"grad_norm": 0.5749545693397522,
"learning_rate": 6.859332281870147e-07,
"loss": 2.5362,
"num_input_tokens_seen": 4788432,
"step": 956
},
{
"epoch": 1.4515151515151516,
"grad_norm": 0.5609776973724365,
"learning_rate": 6.829991370454411e-07,
"loss": 2.433,
"num_input_tokens_seen": 4799712,
"step": 958
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.6038674116134644,
"learning_rate": 6.800769393464656e-07,
"loss": 2.362,
"num_input_tokens_seen": 4808688,
"step": 960
},
{
"epoch": 1.4545454545454546,
"eval_loss": 2.3274452686309814,
"eval_runtime": 5.8255,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4808688,
"step": 960
},
{
"epoch": 1.4575757575757575,
"grad_norm": 0.6705885529518127,
"learning_rate": 6.771667023143284e-07,
"loss": 2.5027,
"num_input_tokens_seen": 4817136,
"step": 962
},
{
"epoch": 1.4606060606060607,
"grad_norm": 0.6026042699813843,
"learning_rate": 6.742684928981188e-07,
"loss": 2.6941,
"num_input_tokens_seen": 4829112,
"step": 964
},
{
"epoch": 1.4636363636363636,
"grad_norm": 0.5220550894737244,
"learning_rate": 6.713823777702359e-07,
"loss": 2.2785,
"num_input_tokens_seen": 4838664,
"step": 966
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.6457405090332031,
"learning_rate": 6.685084233248517e-07,
"loss": 2.502,
"num_input_tokens_seen": 4846656,
"step": 968
},
{
"epoch": 1.4696969696969697,
"grad_norm": 0.831514298915863,
"learning_rate": 6.656466956763864e-07,
"loss": 2.4094,
"num_input_tokens_seen": 4855296,
"step": 970
},
{
"epoch": 1.4727272727272727,
"grad_norm": 0.623429000377655,
"learning_rate": 6.627972606579866e-07,
"loss": 2.3646,
"num_input_tokens_seen": 4867584,
"step": 972
},
{
"epoch": 1.4757575757575758,
"grad_norm": 0.6878921389579773,
"learning_rate": 6.599601838200104e-07,
"loss": 2.3642,
"num_input_tokens_seen": 4879584,
"step": 974
},
{
"epoch": 1.4787878787878788,
"grad_norm": 0.8445355296134949,
"learning_rate": 6.571355304285202e-07,
"loss": 2.571,
"num_input_tokens_seen": 4889976,
"step": 976
},
{
"epoch": 1.481818181818182,
"grad_norm": 0.5575315356254578,
"learning_rate": 6.543233654637804e-07,
"loss": 2.5749,
"num_input_tokens_seen": 4899048,
"step": 978
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.5096350312232971,
"learning_rate": 6.515237536187644e-07,
"loss": 2.2386,
"num_input_tokens_seen": 4910088,
"step": 980
},
{
"epoch": 1.4848484848484849,
"eval_loss": 2.3277652263641357,
"eval_runtime": 5.8263,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 4910088,
"step": 980
},
{
"epoch": 1.4878787878787878,
"grad_norm": 0.7003534436225891,
"learning_rate": 6.487367592976633e-07,
"loss": 2.5641,
"num_input_tokens_seen": 4922376,
"step": 982
},
{
"epoch": 1.490909090909091,
"grad_norm": 0.5951968431472778,
"learning_rate": 6.459624466144067e-07,
"loss": 2.298,
"num_input_tokens_seen": 4934664,
"step": 984
},
{
"epoch": 1.493939393939394,
"grad_norm": 0.7097399234771729,
"learning_rate": 6.432008793911877e-07,
"loss": 2.3938,
"num_input_tokens_seen": 4943352,
"step": 986
},
{
"epoch": 1.496969696969697,
"grad_norm": 0.5688740015029907,
"learning_rate": 6.404521211569937e-07,
"loss": 2.421,
"num_input_tokens_seen": 4953888,
"step": 988
},
{
"epoch": 1.5,
"grad_norm": 0.6089447736740112,
"learning_rate": 6.377162351461442e-07,
"loss": 2.1273,
"num_input_tokens_seen": 4965024,
"step": 990
},
{
"epoch": 1.503030303030303,
"grad_norm": 0.5698357224464417,
"learning_rate": 6.349932842968391e-07,
"loss": 2.3928,
"num_input_tokens_seen": 4977216,
"step": 992
},
{
"epoch": 1.506060606060606,
"grad_norm": 0.6300851702690125,
"learning_rate": 6.322833312497082e-07,
"loss": 2.3595,
"num_input_tokens_seen": 4986720,
"step": 994
},
{
"epoch": 1.509090909090909,
"grad_norm": 0.5977615714073181,
"learning_rate": 6.295864383463705e-07,
"loss": 2.5852,
"num_input_tokens_seen": 4995072,
"step": 996
},
{
"epoch": 1.5121212121212122,
"grad_norm": 0.6872332096099854,
"learning_rate": 6.269026676280008e-07,
"loss": 2.4611,
"num_input_tokens_seen": 5003256,
"step": 998
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.7128229141235352,
"learning_rate": 6.242320808339023e-07,
"loss": 2.0949,
"num_input_tokens_seen": 5010864,
"step": 1000
},
{
"epoch": 1.5151515151515151,
"eval_loss": 2.3277881145477295,
"eval_runtime": 5.8265,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 3.433,
"num_input_tokens_seen": 5010864,
"step": 1000
},
{
"epoch": 1.518181818181818,
"grad_norm": 0.48600301146507263,
"learning_rate": 6.215747394000864e-07,
"loss": 2.2478,
"num_input_tokens_seen": 5021400,
"step": 1002
},
{
"epoch": 1.5212121212121212,
"grad_norm": 0.6063188314437866,
"learning_rate": 6.189307044578585e-07,
"loss": 2.1912,
"num_input_tokens_seen": 5031576,
"step": 1004
},
{
"epoch": 1.5242424242424244,
"grad_norm": 0.6136674284934998,
"learning_rate": 6.163000368324124e-07,
"loss": 2.3441,
"num_input_tokens_seen": 5042136,
"step": 1006
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.6810842156410217,
"learning_rate": 6.136827970414317e-07,
"loss": 2.3444,
"num_input_tokens_seen": 5052480,
"step": 1008
},
{
"epoch": 1.5303030303030303,
"grad_norm": 0.70346599817276,
"learning_rate": 6.11079045293696e-07,
"loss": 2.5014,
"num_input_tokens_seen": 5062872,
"step": 1010
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.6263840198516846,
"learning_rate": 6.084888414876976e-07,
"loss": 2.2427,
"num_input_tokens_seen": 5073744,
"step": 1012
},
{
"epoch": 1.5363636363636364,
"grad_norm": 0.6593678593635559,
"learning_rate": 6.059122452102618e-07,
"loss": 2.3813,
"num_input_tokens_seen": 5082432,
"step": 1014
},
{
"epoch": 1.5393939393939395,
"grad_norm": 0.521698534488678,
"learning_rate": 6.033493157351772e-07,
"loss": 2.6378,
"num_input_tokens_seen": 5092848,
"step": 1016
},
{
"epoch": 1.5424242424242425,
"grad_norm": 0.46363523602485657,
"learning_rate": 6.008001120218322e-07,
"loss": 2.4006,
"num_input_tokens_seen": 5105136,
"step": 1018
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.7737420797348022,
"learning_rate": 5.982646927138584e-07,
"loss": 2.5504,
"num_input_tokens_seen": 5114064,
"step": 1020
},
{
"epoch": 1.5454545454545454,
"eval_loss": 2.3275692462921143,
"eval_runtime": 5.8238,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 5114064,
"step": 1020
},
{
"epoch": 1.5484848484848484,
"grad_norm": 0.6213299036026001,
"learning_rate": 5.957431161377809e-07,
"loss": 2.4085,
"num_input_tokens_seen": 5125872,
"step": 1022
},
{
"epoch": 1.5515151515151515,
"grad_norm": 0.7610370516777039,
"learning_rate": 5.932354403016777e-07,
"loss": 2.263,
"num_input_tokens_seen": 5135208,
"step": 1024
},
{
"epoch": 1.5545454545454547,
"grad_norm": 0.5635423064231873,
"learning_rate": 5.907417228938442e-07,
"loss": 2.352,
"num_input_tokens_seen": 5146896,
"step": 1026
},
{
"epoch": 1.5575757575757576,
"grad_norm": 0.5265647768974304,
"learning_rate": 5.88262021281467e-07,
"loss": 2.3172,
"num_input_tokens_seen": 5159184,
"step": 1028
},
{
"epoch": 1.5606060606060606,
"grad_norm": 0.8375009298324585,
"learning_rate": 5.857963925093034e-07,
"loss": 2.4402,
"num_input_tokens_seen": 5167656,
"step": 1030
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.5335946679115295,
"learning_rate": 5.833448932983693e-07,
"loss": 2.5926,
"num_input_tokens_seen": 5179680,
"step": 1032
},
{
"epoch": 1.5666666666666667,
"grad_norm": 0.8245714902877808,
"learning_rate": 5.809075800446348e-07,
"loss": 2.5999,
"num_input_tokens_seen": 5190216,
"step": 1034
},
{
"epoch": 1.5696969696969698,
"grad_norm": 0.5047762393951416,
"learning_rate": 5.784845088177263e-07,
"loss": 2.379,
"num_input_tokens_seen": 5201592,
"step": 1036
},
{
"epoch": 1.5727272727272728,
"grad_norm": 0.5322418212890625,
"learning_rate": 5.760757353596371e-07,
"loss": 2.3246,
"num_input_tokens_seen": 5213040,
"step": 1038
},
{
"epoch": 1.5757575757575757,
"grad_norm": 0.47743648290634155,
"learning_rate": 5.736813150834447e-07,
"loss": 2.4542,
"num_input_tokens_seen": 5223360,
"step": 1040
},
{
"epoch": 1.5757575757575757,
"eval_loss": 2.3277275562286377,
"eval_runtime": 5.824,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 5223360,
"step": 1040
},
{
"epoch": 1.5787878787878786,
"grad_norm": 0.4745235741138458,
"learning_rate": 5.713013030720356e-07,
"loss": 2.3253,
"num_input_tokens_seen": 5235480,
"step": 1042
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.521117091178894,
"learning_rate": 5.6893575407684e-07,
"loss": 2.3232,
"num_input_tokens_seen": 5246280,
"step": 1044
},
{
"epoch": 1.584848484848485,
"grad_norm": 0.6688542366027832,
"learning_rate": 5.665847225165695e-07,
"loss": 2.323,
"num_input_tokens_seen": 5257248,
"step": 1046
},
{
"epoch": 1.587878787878788,
"grad_norm": 0.6905980706214905,
"learning_rate": 5.642482624759672e-07,
"loss": 2.6128,
"num_input_tokens_seen": 5268264,
"step": 1048
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.555060088634491,
"learning_rate": 5.619264277045634e-07,
"loss": 2.5484,
"num_input_tokens_seen": 5280432,
"step": 1050
},
{
"epoch": 1.593939393939394,
"grad_norm": 0.6293858289718628,
"learning_rate": 5.596192716154385e-07,
"loss": 2.5,
"num_input_tokens_seen": 5290488,
"step": 1052
},
{
"epoch": 1.596969696969697,
"grad_norm": 0.9078196883201599,
"learning_rate": 5.573268472839937e-07,
"loss": 2.4814,
"num_input_tokens_seen": 5299536,
"step": 1054
},
{
"epoch": 1.6,
"grad_norm": 0.6941189765930176,
"learning_rate": 5.550492074467317e-07,
"loss": 2.4972,
"num_input_tokens_seen": 5309544,
"step": 1056
},
{
"epoch": 1.603030303030303,
"grad_norm": 0.6833639740943909,
"learning_rate": 5.527864045000421e-07,
"loss": 2.5041,
"num_input_tokens_seen": 5319024,
"step": 1058
},
{
"epoch": 1.606060606060606,
"grad_norm": 0.6468996405601501,
"learning_rate": 5.505384904989965e-07,
"loss": 2.3262,
"num_input_tokens_seen": 5329752,
"step": 1060
},
{
"epoch": 1.606060606060606,
"eval_loss": 2.327099323272705,
"eval_runtime": 5.8238,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 5329752,
"step": 1060
},
{
"epoch": 1.6090909090909091,
"grad_norm": 0.7046562433242798,
"learning_rate": 5.483055171561511e-07,
"loss": 2.2181,
"num_input_tokens_seen": 5340552,
"step": 1062
},
{
"epoch": 1.612121212121212,
"grad_norm": 0.48583197593688965,
"learning_rate": 5.460875358403565e-07,
"loss": 2.3349,
"num_input_tokens_seen": 5350320,
"step": 1064
},
{
"epoch": 1.6151515151515152,
"grad_norm": 0.6768611073493958,
"learning_rate": 5.438845975755772e-07,
"loss": 2.4784,
"num_input_tokens_seen": 5356608,
"step": 1066
},
{
"epoch": 1.6181818181818182,
"grad_norm": 0.6648526191711426,
"learning_rate": 5.416967530397164e-07,
"loss": 2.2265,
"num_input_tokens_seen": 5366568,
"step": 1068
},
{
"epoch": 1.621212121212121,
"grad_norm": 0.5271417498588562,
"learning_rate": 5.395240525634511e-07,
"loss": 2.4877,
"num_input_tokens_seen": 5378856,
"step": 1070
},
{
"epoch": 1.6242424242424243,
"grad_norm": 0.5848326086997986,
"learning_rate": 5.37366546129074e-07,
"loss": 2.3169,
"num_input_tokens_seen": 5391120,
"step": 1072
},
{
"epoch": 1.6272727272727274,
"grad_norm": 0.5480791330337524,
"learning_rate": 5.35224283369343e-07,
"loss": 2.4456,
"num_input_tokens_seen": 5398752,
"step": 1074
},
{
"epoch": 1.6303030303030304,
"grad_norm": 0.47689610719680786,
"learning_rate": 5.330973135663411e-07,
"loss": 2.5053,
"num_input_tokens_seen": 5411040,
"step": 1076
},
{
"epoch": 1.6333333333333333,
"grad_norm": 0.5623081922531128,
"learning_rate": 5.309856856503409e-07,
"loss": 2.4062,
"num_input_tokens_seen": 5422848,
"step": 1078
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.539359450340271,
"learning_rate": 5.2888944819868e-07,
"loss": 2.2278,
"num_input_tokens_seen": 5435136,
"step": 1080
},
{
"epoch": 1.6363636363636362,
"eval_loss": 2.32759428024292,
"eval_runtime": 5.8229,
"eval_samples_per_second": 3.435,
"eval_steps_per_second": 3.435,
"num_input_tokens_seen": 5435136,
"step": 1080
},
{
"epoch": 1.6393939393939394,
"grad_norm": 0.5953027009963989,
"learning_rate": 5.26808649434643e-07,
"loss": 2.3976,
"num_input_tokens_seen": 5445672,
"step": 1082
},
{
"epoch": 1.6424242424242426,
"grad_norm": 0.5432310700416565,
"learning_rate": 5.247433372263522e-07,
"loss": 2.4648,
"num_input_tokens_seen": 5456640,
"step": 1084
},
{
"epoch": 1.6454545454545455,
"grad_norm": 0.5668439865112305,
"learning_rate": 5.226935590856675e-07,
"loss": 2.2962,
"num_input_tokens_seen": 5465976,
"step": 1086
},
{
"epoch": 1.6484848484848484,
"grad_norm": 0.5815810561180115,
"learning_rate": 5.20659362167091e-07,
"loss": 2.3107,
"num_input_tokens_seen": 5477016,
"step": 1088
},
{
"epoch": 1.6515151515151514,
"grad_norm": 0.5914052724838257,
"learning_rate": 5.186407932666846e-07,
"loss": 2.2394,
"num_input_tokens_seen": 5487504,
"step": 1090
},
{
"epoch": 1.6545454545454545,
"grad_norm": 0.8601570129394531,
"learning_rate": 5.166378988209924e-07,
"loss": 2.6481,
"num_input_tokens_seen": 5496600,
"step": 1092
},
{
"epoch": 1.6575757575757577,
"grad_norm": 0.6369432210922241,
"learning_rate": 5.146507249059727e-07,
"loss": 2.5754,
"num_input_tokens_seen": 5506416,
"step": 1094
},
{
"epoch": 1.6606060606060606,
"grad_norm": 0.712243914604187,
"learning_rate": 5.126793172359373e-07,
"loss": 2.3295,
"num_input_tokens_seen": 5514600,
"step": 1096
},
{
"epoch": 1.6636363636363636,
"grad_norm": 0.6746931672096252,
"learning_rate": 5.107237211625016e-07,
"loss": 2.3752,
"num_input_tokens_seen": 5522616,
"step": 1098
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.5909104943275452,
"learning_rate": 5.087839816735391e-07,
"loss": 2.2484,
"num_input_tokens_seen": 5533488,
"step": 1100
},
{
"epoch": 1.6666666666666665,
"eval_loss": 2.326948642730713,
"eval_runtime": 5.8207,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 5533488,
"step": 1100
},
{
"epoch": 1.6696969696969697,
"grad_norm": 0.578524112701416,
"learning_rate": 5.068601433921479e-07,
"loss": 2.392,
"num_input_tokens_seen": 5544864,
"step": 1102
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.6614283323287964,
"learning_rate": 5.04952250575624e-07,
"loss": 2.4998,
"num_input_tokens_seen": 5555928,
"step": 1104
},
{
"epoch": 1.6757575757575758,
"grad_norm": 0.5955278277397156,
"learning_rate": 5.030603471144432e-07,
"loss": 2.3944,
"num_input_tokens_seen": 5567088,
"step": 1106
},
{
"epoch": 1.6787878787878787,
"grad_norm": 0.5927826166152954,
"learning_rate": 5.011844765312504e-07,
"loss": 2.487,
"num_input_tokens_seen": 5578128,
"step": 1108
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.6427227258682251,
"learning_rate": 4.9932468197986e-07,
"loss": 2.5279,
"num_input_tokens_seen": 5588952,
"step": 1110
},
{
"epoch": 1.6848484848484848,
"grad_norm": 0.49643516540527344,
"learning_rate": 4.974810062442615e-07,
"loss": 2.4558,
"num_input_tokens_seen": 5599992,
"step": 1112
},
{
"epoch": 1.687878787878788,
"grad_norm": 0.5617672204971313,
"learning_rate": 4.956534917376373e-07,
"loss": 2.3407,
"num_input_tokens_seen": 5611752,
"step": 1114
},
{
"epoch": 1.690909090909091,
"grad_norm": 0.7746953368186951,
"learning_rate": 4.938421805013844e-07,
"loss": 2.4067,
"num_input_tokens_seen": 5619072,
"step": 1116
},
{
"epoch": 1.6939393939393939,
"grad_norm": 0.6146767139434814,
"learning_rate": 4.920471142041496e-07,
"loss": 2.2224,
"num_input_tokens_seen": 5629824,
"step": 1118
},
{
"epoch": 1.696969696969697,
"grad_norm": 0.7500237822532654,
"learning_rate": 4.902683341408698e-07,
"loss": 2.4764,
"num_input_tokens_seen": 5639376,
"step": 1120
},
{
"epoch": 1.696969696969697,
"eval_loss": 2.327069044113159,
"eval_runtime": 5.8204,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 5639376,
"step": 1120
},
{
"epoch": 1.7,
"grad_norm": 0.512405276298523,
"learning_rate": 4.88505881231822e-07,
"loss": 2.4383,
"num_input_tokens_seen": 5649624,
"step": 1122
},
{
"epoch": 1.7030303030303031,
"grad_norm": 0.6521934866905212,
"learning_rate": 4.867597960216823e-07,
"loss": 2.3752,
"num_input_tokens_seen": 5659800,
"step": 1124
},
{
"epoch": 1.706060606060606,
"grad_norm": 0.5437342524528503,
"learning_rate": 4.85030118678593e-07,
"loss": 2.2427,
"num_input_tokens_seen": 5668296,
"step": 1126
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.5007622838020325,
"learning_rate": 4.833168889932384e-07,
"loss": 2.3027,
"num_input_tokens_seen": 5678832,
"step": 1128
},
{
"epoch": 1.7121212121212122,
"grad_norm": 0.9229590892791748,
"learning_rate": 4.816201463779299e-07,
"loss": 2.4966,
"num_input_tokens_seen": 5686872,
"step": 1130
},
{
"epoch": 1.7151515151515153,
"grad_norm": 0.7598445415496826,
"learning_rate": 4.799399298656985e-07,
"loss": 2.5635,
"num_input_tokens_seen": 5697216,
"step": 1132
},
{
"epoch": 1.7181818181818183,
"grad_norm": 0.5250843167304993,
"learning_rate": 4.782762781093983e-07,
"loss": 2.3295,
"num_input_tokens_seen": 5706840,
"step": 1134
},
{
"epoch": 1.7212121212121212,
"grad_norm": 0.7306003570556641,
"learning_rate": 4.7662922938081575e-07,
"loss": 2.3937,
"num_input_tokens_seen": 5715816,
"step": 1136
},
{
"epoch": 1.7242424242424241,
"grad_norm": 0.7364092469215393,
"learning_rate": 4.7499882156978934e-07,
"loss": 2.3815,
"num_input_tokens_seen": 5724456,
"step": 1138
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.6539096236228943,
"learning_rate": 4.7338509218333966e-07,
"loss": 2.3489,
"num_input_tokens_seen": 5732496,
"step": 1140
},
{
"epoch": 1.7272727272727273,
"eval_loss": 2.326911687850952,
"eval_runtime": 5.8239,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 3.434,
"num_input_tokens_seen": 5732496,
"step": 1140
},
{
"epoch": 1.7303030303030305,
"grad_norm": 0.6865965127944946,
"learning_rate": 4.717880783448046e-07,
"loss": 2.2154,
"num_input_tokens_seen": 5744784,
"step": 1142
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.6450785994529724,
"learning_rate": 4.7020781679298636e-07,
"loss": 2.5799,
"num_input_tokens_seen": 5752872,
"step": 1144
},
{
"epoch": 1.7363636363636363,
"grad_norm": 0.6152123808860779,
"learning_rate": 4.6864434388130604e-07,
"loss": 2.4051,
"num_input_tokens_seen": 5762880,
"step": 1146
},
{
"epoch": 1.7393939393939393,
"grad_norm": 0.5718716382980347,
"learning_rate": 4.6709769557696724e-07,
"loss": 2.2532,
"num_input_tokens_seen": 5773632,
"step": 1148
},
{
"epoch": 1.7424242424242424,
"grad_norm": 0.6017091274261475,
"learning_rate": 4.6556790746012866e-07,
"loss": 2.2363,
"num_input_tokens_seen": 5784960,
"step": 1150
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.5728634595870972,
"learning_rate": 4.6405501472308593e-07,
"loss": 2.264,
"num_input_tokens_seen": 5794392,
"step": 1152
},
{
"epoch": 1.7484848484848485,
"grad_norm": 0.7092226147651672,
"learning_rate": 4.6255905216946174e-07,
"loss": 2.6636,
"num_input_tokens_seen": 5801088,
"step": 1154
},
{
"epoch": 1.7515151515151515,
"grad_norm": 0.6607272028923035,
"learning_rate": 4.6108005421340517e-07,
"loss": 2.3849,
"num_input_tokens_seen": 5810232,
"step": 1156
},
{
"epoch": 1.7545454545454544,
"grad_norm": 0.6151024699211121,
"learning_rate": 4.5961805487879993e-07,
"loss": 2.1526,
"num_input_tokens_seen": 5819976,
"step": 1158
},
{
"epoch": 1.7575757575757576,
"grad_norm": 0.5664975047111511,
"learning_rate": 4.581730877984817e-07,
"loss": 2.3448,
"num_input_tokens_seen": 5831304,
"step": 1160
},
{
"epoch": 1.7575757575757576,
"eval_loss": 2.326674699783325,
"eval_runtime": 5.817,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 5831304,
"step": 1160
},
{
"epoch": 1.7606060606060607,
"grad_norm": 0.6864150166511536,
"learning_rate": 4.567451862134651e-07,
"loss": 2.2982,
"num_input_tokens_seen": 5841792,
"step": 1162
},
{
"epoch": 1.7636363636363637,
"grad_norm": 0.5514176487922668,
"learning_rate": 4.553343829721776e-07,
"loss": 2.296,
"num_input_tokens_seen": 5852640,
"step": 1164
},
{
"epoch": 1.7666666666666666,
"grad_norm": 0.5415042638778687,
"learning_rate": 4.539407105297053e-07,
"loss": 2.3767,
"num_input_tokens_seen": 5864328,
"step": 1166
},
{
"epoch": 1.7696969696969695,
"grad_norm": 0.7088015675544739,
"learning_rate": 4.5256420094704516e-07,
"loss": 2.1989,
"num_input_tokens_seen": 5873424,
"step": 1168
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.5956241488456726,
"learning_rate": 4.5120488589036816e-07,
"loss": 2.2727,
"num_input_tokens_seen": 5881608,
"step": 1170
},
{
"epoch": 1.7757575757575759,
"grad_norm": 0.6199578046798706,
"learning_rate": 4.498627966302905e-07,
"loss": 2.3122,
"num_input_tokens_seen": 5892984,
"step": 1172
},
{
"epoch": 1.7787878787878788,
"grad_norm": 0.6161043643951416,
"learning_rate": 4.485379640411545e-07,
"loss": 2.607,
"num_input_tokens_seen": 5903832,
"step": 1174
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.7086969017982483,
"learning_rate": 4.4723041860031803e-07,
"loss": 2.444,
"num_input_tokens_seen": 5914224,
"step": 1176
},
{
"epoch": 1.7848484848484847,
"grad_norm": 0.5110089182853699,
"learning_rate": 4.459401903874538e-07,
"loss": 2.462,
"num_input_tokens_seen": 5925768,
"step": 1178
},
{
"epoch": 1.7878787878787878,
"grad_norm": 0.6780450344085693,
"learning_rate": 4.4466730908385664e-07,
"loss": 2.4997,
"num_input_tokens_seen": 5934528,
"step": 1180
},
{
"epoch": 1.7878787878787878,
"eval_loss": 2.3263440132141113,
"eval_runtime": 5.8197,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 5934528,
"step": 1180
},
{
"epoch": 1.790909090909091,
"grad_norm": 0.6635234355926514,
"learning_rate": 4.434118039717616e-07,
"loss": 2.2541,
"num_input_tokens_seen": 5944224,
"step": 1182
},
{
"epoch": 1.793939393939394,
"grad_norm": 0.6881716251373291,
"learning_rate": 4.4217370393366995e-07,
"loss": 2.2483,
"num_input_tokens_seen": 5954688,
"step": 1184
},
{
"epoch": 1.7969696969696969,
"grad_norm": 1.0131621360778809,
"learning_rate": 4.40953037451684e-07,
"loss": 2.403,
"num_input_tokens_seen": 5964072,
"step": 1186
},
{
"epoch": 1.8,
"grad_norm": 0.5744723677635193,
"learning_rate": 4.3974983260685345e-07,
"loss": 2.5772,
"num_input_tokens_seen": 5975184,
"step": 1188
},
{
"epoch": 1.803030303030303,
"grad_norm": 0.6319069266319275,
"learning_rate": 4.3856411707852814e-07,
"loss": 2.3809,
"num_input_tokens_seen": 5981496,
"step": 1190
},
{
"epoch": 1.8060606060606061,
"grad_norm": 0.49835190176963806,
"learning_rate": 4.373959181437216e-07,
"loss": 2.3452,
"num_input_tokens_seen": 5993088,
"step": 1192
},
{
"epoch": 1.809090909090909,
"grad_norm": 0.825423538684845,
"learning_rate": 4.3624526267648363e-07,
"loss": 2.2971,
"num_input_tokens_seen": 6003864,
"step": 1194
},
{
"epoch": 1.812121212121212,
"grad_norm": 0.5639837384223938,
"learning_rate": 4.351121771472823e-07,
"loss": 2.1717,
"num_input_tokens_seen": 6013824,
"step": 1196
},
{
"epoch": 1.8151515151515152,
"grad_norm": 0.6175968050956726,
"learning_rate": 4.3399668762239446e-07,
"loss": 2.3326,
"num_input_tokens_seen": 6024120,
"step": 1198
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.5506427884101868,
"learning_rate": 4.328988197633066e-07,
"loss": 2.311,
"num_input_tokens_seen": 6035544,
"step": 1200
},
{
"epoch": 1.8181818181818183,
"eval_loss": 2.326775550842285,
"eval_runtime": 5.8185,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 6035544,
"step": 1200
},
{
"epoch": 1.8212121212121213,
"grad_norm": 0.6879149675369263,
"learning_rate": 4.3181859882612426e-07,
"loss": 2.4867,
"num_input_tokens_seen": 6047520,
"step": 1202
},
{
"epoch": 1.8242424242424242,
"grad_norm": 0.9205613136291504,
"learning_rate": 4.307560496609911e-07,
"loss": 2.5415,
"num_input_tokens_seen": 6055488,
"step": 1204
},
{
"epoch": 1.8272727272727272,
"grad_norm": 0.7125353813171387,
"learning_rate": 4.297111967115171e-07,
"loss": 2.3684,
"num_input_tokens_seen": 6063720,
"step": 1206
},
{
"epoch": 1.8303030303030303,
"grad_norm": 0.7578244805335999,
"learning_rate": 4.286840640142166e-07,
"loss": 2.1882,
"num_input_tokens_seen": 6071664,
"step": 1208
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.5936377644538879,
"learning_rate": 4.2767467519795497e-07,
"loss": 2.4383,
"num_input_tokens_seen": 6081360,
"step": 1210
},
{
"epoch": 1.8363636363636364,
"grad_norm": 0.5857051610946655,
"learning_rate": 4.2668305348340495e-07,
"loss": 2.2313,
"num_input_tokens_seen": 6090624,
"step": 1212
},
{
"epoch": 1.8393939393939394,
"grad_norm": 0.5357300639152527,
"learning_rate": 4.2570922168251294e-07,
"loss": 2.3837,
"num_input_tokens_seen": 6100944,
"step": 1214
},
{
"epoch": 1.8424242424242423,
"grad_norm": 0.8577349185943604,
"learning_rate": 4.2475320219797406e-07,
"loss": 2.3874,
"num_input_tokens_seen": 6108792,
"step": 1216
},
{
"epoch": 1.8454545454545455,
"grad_norm": 0.5311655402183533,
"learning_rate": 4.2381501702271623e-07,
"loss": 2.3853,
"num_input_tokens_seen": 6121080,
"step": 1218
},
{
"epoch": 1.8484848484848486,
"grad_norm": 0.5314241051673889,
"learning_rate": 4.228946877393953e-07,
"loss": 2.3858,
"num_input_tokens_seen": 6131112,
"step": 1220
},
{
"epoch": 1.8484848484848486,
"eval_loss": 2.3265769481658936,
"eval_runtime": 5.8173,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 3.438,
"num_input_tokens_seen": 6131112,
"step": 1220
},
{
"epoch": 1.8515151515151516,
"grad_norm": 0.6820886731147766,
"learning_rate": 4.219922355198972e-07,
"loss": 2.3291,
"num_input_tokens_seen": 6141072,
"step": 1222
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.6875143051147461,
"learning_rate": 4.211076811248524e-07,
"loss": 2.344,
"num_input_tokens_seen": 6152040,
"step": 1224
},
{
"epoch": 1.8575757575757574,
"grad_norm": 0.6124435067176819,
"learning_rate": 4.2024104490315696e-07,
"loss": 2.275,
"num_input_tokens_seen": 6163368,
"step": 1226
},
{
"epoch": 1.8606060606060606,
"grad_norm": 0.6159326434135437,
"learning_rate": 4.1939234679150516e-07,
"loss": 2.4138,
"num_input_tokens_seen": 6171072,
"step": 1228
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.4833909273147583,
"learning_rate": 4.185616063139308e-07,
"loss": 2.2974,
"num_input_tokens_seen": 6183312,
"step": 1230
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.8235618472099304,
"learning_rate": 4.177488425813578e-07,
"loss": 2.4087,
"num_input_tokens_seen": 6193104,
"step": 1232
},
{
"epoch": 1.8696969696969696,
"grad_norm": 0.5075482726097107,
"learning_rate": 4.1695407429116063e-07,
"loss": 2.4328,
"num_input_tokens_seen": 6205392,
"step": 1234
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.5093833208084106,
"learning_rate": 4.1617731972673466e-07,
"loss": 2.4412,
"num_input_tokens_seen": 6215808,
"step": 1236
},
{
"epoch": 1.8757575757575757,
"grad_norm": 0.5927122235298157,
"learning_rate": 4.1541859675707454e-07,
"loss": 2.2544,
"num_input_tokens_seen": 6226224,
"step": 1238
},
{
"epoch": 1.878787878787879,
"grad_norm": 0.6176667809486389,
"learning_rate": 4.146779228363644e-07,
"loss": 2.3602,
"num_input_tokens_seen": 6235464,
"step": 1240
},
{
"epoch": 1.878787878787879,
"eval_loss": 2.3264036178588867,
"eval_runtime": 5.821,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 3.436,
"num_input_tokens_seen": 6235464,
"step": 1240
},
{
"epoch": 1.8818181818181818,
"grad_norm": 0.5281220078468323,
"learning_rate": 4.139553150035751e-07,
"loss": 2.439,
"num_input_tokens_seen": 6245400,
"step": 1242
},
{
"epoch": 1.8848484848484848,
"grad_norm": 0.6205955147743225,
"learning_rate": 4.1325078988207303e-07,
"loss": 2.466,
"num_input_tokens_seen": 6252768,
"step": 1244
},
{
"epoch": 1.887878787878788,
"grad_norm": 0.5631701350212097,
"learning_rate": 4.1256436367923777e-07,
"loss": 2.5193,
"num_input_tokens_seen": 6264432,
"step": 1246
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.6673572659492493,
"learning_rate": 4.118960521860884e-07,
"loss": 2.4064,
"num_input_tokens_seen": 6273264,
"step": 1248
},
{
"epoch": 1.893939393939394,
"grad_norm": 0.6367799639701843,
"learning_rate": 4.1124587077692115e-07,
"loss": 2.2931,
"num_input_tokens_seen": 6284112,
"step": 1250
},
{
"epoch": 1.896969696969697,
"grad_norm": 1.2654261589050293,
"learning_rate": 4.106138344089554e-07,
"loss": 2.4058,
"num_input_tokens_seen": 6292248,
"step": 1252
},
{
"epoch": 1.9,
"grad_norm": 0.4898473024368286,
"learning_rate": 4.0999995762198936e-07,
"loss": 2.4485,
"num_input_tokens_seen": 6302352,
"step": 1254
},
{
"epoch": 1.903030303030303,
"grad_norm": 0.5527143478393555,
"learning_rate": 4.094042545380659e-07,
"loss": 2.1889,
"num_input_tokens_seen": 6311712,
"step": 1256
},
{
"epoch": 1.906060606060606,
"grad_norm": 0.6194308996200562,
"learning_rate": 4.088267388611474e-07,
"loss": 2.3617,
"num_input_tokens_seen": 6323304,
"step": 1258
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.5801293849945068,
"learning_rate": 4.082674238768009e-07,
"loss": 2.2347,
"num_input_tokens_seen": 6335304,
"step": 1260
},
{
"epoch": 1.9090909090909092,
"eval_loss": 2.326760768890381,
"eval_runtime": 5.8194,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 6335304,
"step": 1260
},
{
"epoch": 1.912121212121212,
"grad_norm": 0.638659656047821,
"learning_rate": 4.0772632245189193e-07,
"loss": 2.2904,
"num_input_tokens_seen": 6345624,
"step": 1262
},
{
"epoch": 1.915151515151515,
"grad_norm": 0.5953812003135681,
"learning_rate": 4.0720344703428906e-07,
"loss": 2.3719,
"num_input_tokens_seen": 6355632,
"step": 1264
},
{
"epoch": 1.9181818181818182,
"grad_norm": 0.5857142806053162,
"learning_rate": 4.066988096525772e-07,
"loss": 2.3489,
"num_input_tokens_seen": 6363840,
"step": 1266
},
{
"epoch": 1.9212121212121214,
"grad_norm": 0.5746711492538452,
"learning_rate": 4.062124219157808e-07,
"loss": 2.3433,
"num_input_tokens_seen": 6375000,
"step": 1268
},
{
"epoch": 1.9242424242424243,
"grad_norm": 0.6761659383773804,
"learning_rate": 4.057442950130972e-07,
"loss": 2.4374,
"num_input_tokens_seen": 6385632,
"step": 1270
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.4828651249408722,
"learning_rate": 4.05294439713639e-07,
"loss": 2.3613,
"num_input_tokens_seen": 6397728,
"step": 1272
},
{
"epoch": 1.9303030303030302,
"grad_norm": 0.6450832486152649,
"learning_rate": 4.048628663661859e-07,
"loss": 2.1642,
"num_input_tokens_seen": 6409512,
"step": 1274
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.6221877336502075,
"learning_rate": 4.044495848989475e-07,
"loss": 2.4558,
"num_input_tokens_seen": 6419664,
"step": 1276
},
{
"epoch": 1.9363636363636365,
"grad_norm": 0.825742781162262,
"learning_rate": 4.040546048193343e-07,
"loss": 2.5869,
"num_input_tokens_seen": 6428712,
"step": 1278
},
{
"epoch": 1.9393939393939394,
"grad_norm": 0.69305020570755,
"learning_rate": 4.0367793521373886e-07,
"loss": 2.577,
"num_input_tokens_seen": 6435960,
"step": 1280
},
{
"epoch": 1.9393939393939394,
"eval_loss": 2.3265655040740967,
"eval_runtime": 5.8193,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 6435960,
"step": 1280
},
{
"epoch": 1.9424242424242424,
"grad_norm": 0.51558518409729,
"learning_rate": 4.0331958474732744e-07,
"loss": 2.4398,
"num_input_tokens_seen": 6446952,
"step": 1282
},
{
"epoch": 1.9454545454545453,
"grad_norm": 0.7710928916931152,
"learning_rate": 4.0297956166384e-07,
"loss": 2.3546,
"num_input_tokens_seen": 6454440,
"step": 1284
},
{
"epoch": 1.9484848484848485,
"grad_norm": 0.6520776748657227,
"learning_rate": 4.0265787378540076e-07,
"loss": 2.2851,
"num_input_tokens_seen": 6465888,
"step": 1286
},
{
"epoch": 1.9515151515151516,
"grad_norm": 0.7156710624694824,
"learning_rate": 4.023545285123386e-07,
"loss": 2.501,
"num_input_tokens_seen": 6474384,
"step": 1288
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.5886339545249939,
"learning_rate": 4.020695328230162e-07,
"loss": 2.3128,
"num_input_tokens_seen": 6485712,
"step": 1290
},
{
"epoch": 1.9575757575757575,
"grad_norm": 0.5593713521957397,
"learning_rate": 4.018028932736699e-07,
"loss": 2.2989,
"num_input_tokens_seen": 6497160,
"step": 1292
},
{
"epoch": 1.9606060606060605,
"grad_norm": 0.5878450870513916,
"learning_rate": 4.01554615998259e-07,
"loss": 2.4504,
"num_input_tokens_seen": 6508920,
"step": 1294
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.5121827721595764,
"learning_rate": 4.013247067083242e-07,
"loss": 2.4132,
"num_input_tokens_seen": 6520176,
"step": 1296
},
{
"epoch": 1.9666666666666668,
"grad_norm": 0.49630334973335266,
"learning_rate": 4.011131706928566e-07,
"loss": 2.3645,
"num_input_tokens_seen": 6531528,
"step": 1298
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.543795645236969,
"learning_rate": 4.0092001281817576e-07,
"loss": 2.3001,
"num_input_tokens_seen": 6543816,
"step": 1300
},
{
"epoch": 1.9696969696969697,
"eval_loss": 2.3271186351776123,
"eval_runtime": 5.816,
"eval_samples_per_second": 3.439,
"eval_steps_per_second": 3.439,
"num_input_tokens_seen": 6543816,
"step": 1300
},
{
"epoch": 1.9727272727272727,
"grad_norm": 0.8191571235656738,
"learning_rate": 4.0074523752781806e-07,
"loss": 2.8758,
"num_input_tokens_seen": 6552936,
"step": 1302
},
{
"epoch": 1.9757575757575756,
"grad_norm": 0.6543108820915222,
"learning_rate": 4.0058884884243416e-07,
"loss": 2.3766,
"num_input_tokens_seen": 6562896,
"step": 1304
},
{
"epoch": 1.9787878787878788,
"grad_norm": 0.5305016040802002,
"learning_rate": 4.004508503596967e-07,
"loss": 2.3732,
"num_input_tokens_seen": 6575184,
"step": 1306
},
{
"epoch": 1.981818181818182,
"grad_norm": 0.5914813280105591,
"learning_rate": 4.0033124525421757e-07,
"loss": 2.3789,
"num_input_tokens_seen": 6586032,
"step": 1308
},
{
"epoch": 1.9848484848484849,
"grad_norm": 0.712382435798645,
"learning_rate": 4.0023003627747455e-07,
"loss": 2.2654,
"num_input_tokens_seen": 6594768,
"step": 1310
},
{
"epoch": 1.9878787878787878,
"grad_norm": 0.5054189562797546,
"learning_rate": 4.0014722575774835e-07,
"loss": 2.4605,
"num_input_tokens_seen": 6604728,
"step": 1312
},
{
"epoch": 1.990909090909091,
"grad_norm": 0.5901520252227783,
"learning_rate": 4.000828156000692e-07,
"loss": 2.4816,
"num_input_tokens_seen": 6616536,
"step": 1314
},
{
"epoch": 1.993939393939394,
"grad_norm": 0.7864160537719727,
"learning_rate": 4.000368072861723e-07,
"loss": 2.482,
"num_input_tokens_seen": 6624480,
"step": 1316
},
{
"epoch": 1.996969696969697,
"grad_norm": 0.49510428309440613,
"learning_rate": 4.0000920187446465e-07,
"loss": 2.45,
"num_input_tokens_seen": 6636768,
"step": 1318
},
{
"epoch": 2.0,
"grad_norm": 0.6357753872871399,
"learning_rate": 4e-07,
"loss": 2.2129,
"num_input_tokens_seen": 6646824,
"step": 1320
},
{
"epoch": 2.0,
"eval_loss": 2.326845645904541,
"eval_runtime": 5.8186,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 3.437,
"num_input_tokens_seen": 6646824,
"step": 1320
},
{
"epoch": 2.0,
"num_input_tokens_seen": 6646824,
"step": 1320,
"total_flos": 3.059943926859694e+17,
"train_loss": 2.3998946460810573,
"train_runtime": 5038.8172,
"train_samples_per_second": 0.786,
"train_steps_per_second": 0.262
}
],
"logging_steps": 2,
"max_steps": 1320,
"num_input_tokens_seen": 6646824,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.059943926859694e+17,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}