2020-Q2-75p-filtered / trainer_state.json
DouglasPontes's picture
End of training
41620af
raw
history blame
80.8 kB
{
"best_metric": 2.2240703105926514,
"best_model_checkpoint": "./model_tweets_2020_Q2_75/checkpoint-2368000",
"epoch": 6.7372770733268394,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"eval_loss": 2.486480474472046,
"eval_runtime": 326.396,
"eval_samples_per_second": 919.068,
"eval_steps_per_second": 57.442,
"step": 8000
},
{
"epoch": 0.04,
"learning_rate": 4.0726666666666665e-07,
"loss": 2.6592,
"step": 16000
},
{
"epoch": 0.04,
"eval_loss": 2.459786891937256,
"eval_runtime": 328.9344,
"eval_samples_per_second": 911.975,
"eval_steps_per_second": 56.999,
"step": 16000
},
{
"epoch": 0.07,
"eval_loss": 2.447173595428467,
"eval_runtime": 326.9299,
"eval_samples_per_second": 917.567,
"eval_steps_per_second": 57.349,
"step": 24000
},
{
"epoch": 0.09,
"learning_rate": 4.0453333333333336e-07,
"loss": 2.6211,
"step": 32000
},
{
"epoch": 0.09,
"eval_loss": 2.4340574741363525,
"eval_runtime": 331.0965,
"eval_samples_per_second": 906.02,
"eval_steps_per_second": 56.627,
"step": 32000
},
{
"epoch": 0.11,
"eval_loss": 2.4222826957702637,
"eval_runtime": 330.5401,
"eval_samples_per_second": 907.545,
"eval_steps_per_second": 56.722,
"step": 40000
},
{
"epoch": 0.13,
"learning_rate": 4.018e-07,
"loss": 2.6048,
"step": 48000
},
{
"epoch": 0.13,
"eval_loss": 2.4217443466186523,
"eval_runtime": 328.8317,
"eval_samples_per_second": 912.26,
"eval_steps_per_second": 57.017,
"step": 48000
},
{
"epoch": 0.16,
"eval_loss": 2.418403387069702,
"eval_runtime": 328.052,
"eval_samples_per_second": 914.428,
"eval_steps_per_second": 57.153,
"step": 56000
},
{
"epoch": 0.18,
"learning_rate": 3.9906666666666667e-07,
"loss": 2.5861,
"step": 64000
},
{
"epoch": 0.18,
"eval_loss": 2.4061949253082275,
"eval_runtime": 330.9569,
"eval_samples_per_second": 906.402,
"eval_steps_per_second": 56.651,
"step": 64000
},
{
"epoch": 0.2,
"eval_loss": 2.3918895721435547,
"eval_runtime": 327.7147,
"eval_samples_per_second": 915.369,
"eval_steps_per_second": 57.211,
"step": 72000
},
{
"epoch": 0.22,
"learning_rate": 3.963333333333333e-07,
"loss": 2.5736,
"step": 80000
},
{
"epoch": 0.22,
"eval_loss": 2.3896288871765137,
"eval_runtime": 329.9795,
"eval_samples_per_second": 909.087,
"eval_steps_per_second": 56.819,
"step": 80000
},
{
"epoch": 0.25,
"eval_loss": 2.3951096534729004,
"eval_runtime": 328.5879,
"eval_samples_per_second": 912.937,
"eval_steps_per_second": 57.059,
"step": 88000
},
{
"epoch": 0.27,
"learning_rate": 3.936e-07,
"loss": 2.5559,
"step": 96000
},
{
"epoch": 0.27,
"eval_loss": 2.3903470039367676,
"eval_runtime": 328.446,
"eval_samples_per_second": 913.331,
"eval_steps_per_second": 57.084,
"step": 96000
},
{
"epoch": 0.29,
"eval_loss": 2.3835983276367188,
"eval_runtime": 330.3118,
"eval_samples_per_second": 908.172,
"eval_steps_per_second": 56.762,
"step": 104000
},
{
"epoch": 0.31,
"learning_rate": 3.908666666666667e-07,
"loss": 2.5551,
"step": 112000
},
{
"epoch": 0.31,
"eval_loss": 2.374908685684204,
"eval_runtime": 328.8458,
"eval_samples_per_second": 912.221,
"eval_steps_per_second": 57.015,
"step": 112000
},
{
"epoch": 0.34,
"eval_loss": 2.3793506622314453,
"eval_runtime": 329.1311,
"eval_samples_per_second": 911.43,
"eval_steps_per_second": 56.965,
"step": 120000
},
{
"epoch": 0.36,
"learning_rate": 3.8813333333333334e-07,
"loss": 2.5371,
"step": 128000
},
{
"epoch": 0.36,
"eval_loss": 2.3733017444610596,
"eval_runtime": 328.0343,
"eval_samples_per_second": 914.477,
"eval_steps_per_second": 57.156,
"step": 128000
},
{
"epoch": 0.38,
"eval_loss": 2.3703365325927734,
"eval_runtime": 328.4858,
"eval_samples_per_second": 913.221,
"eval_steps_per_second": 57.077,
"step": 136000
},
{
"epoch": 0.4,
"learning_rate": 3.854e-07,
"loss": 2.5417,
"step": 144000
},
{
"epoch": 0.4,
"eval_loss": 2.366170883178711,
"eval_runtime": 328.5536,
"eval_samples_per_second": 913.032,
"eval_steps_per_second": 57.065,
"step": 144000
},
{
"epoch": 0.43,
"eval_loss": 2.372772216796875,
"eval_runtime": 330.4279,
"eval_samples_per_second": 907.853,
"eval_steps_per_second": 56.742,
"step": 152000
},
{
"epoch": 0.45,
"learning_rate": 3.8266666666666665e-07,
"loss": 2.5316,
"step": 160000
},
{
"epoch": 0.45,
"eval_loss": 2.364302158355713,
"eval_runtime": 328.3485,
"eval_samples_per_second": 913.603,
"eval_steps_per_second": 57.101,
"step": 160000
},
{
"epoch": 0.47,
"eval_loss": 2.3567655086517334,
"eval_runtime": 329.7531,
"eval_samples_per_second": 909.711,
"eval_steps_per_second": 56.858,
"step": 168000
},
{
"epoch": 0.49,
"learning_rate": 3.799333333333333e-07,
"loss": 2.5296,
"step": 176000
},
{
"epoch": 0.49,
"eval_loss": 2.3554866313934326,
"eval_runtime": 329.7554,
"eval_samples_per_second": 909.705,
"eval_steps_per_second": 56.857,
"step": 176000
},
{
"epoch": 0.52,
"eval_loss": 2.3506195545196533,
"eval_runtime": 331.0345,
"eval_samples_per_second": 906.19,
"eval_steps_per_second": 56.638,
"step": 184000
},
{
"epoch": 0.54,
"learning_rate": 3.772e-07,
"loss": 2.5215,
"step": 192000
},
{
"epoch": 0.54,
"eval_loss": 2.348207473754883,
"eval_runtime": 329.4713,
"eval_samples_per_second": 910.489,
"eval_steps_per_second": 56.906,
"step": 192000
},
{
"epoch": 0.56,
"eval_loss": 2.351372480392456,
"eval_runtime": 329.1084,
"eval_samples_per_second": 911.493,
"eval_steps_per_second": 56.969,
"step": 200000
},
{
"epoch": 0.58,
"learning_rate": 3.7446666666666667e-07,
"loss": 2.5274,
"step": 208000
},
{
"epoch": 0.58,
"eval_loss": 2.3531200885772705,
"eval_runtime": 330.4502,
"eval_samples_per_second": 907.792,
"eval_steps_per_second": 56.738,
"step": 208000
},
{
"epoch": 0.61,
"eval_loss": 2.3463432788848877,
"eval_runtime": 331.5879,
"eval_samples_per_second": 904.677,
"eval_steps_per_second": 56.543,
"step": 216000
},
{
"epoch": 0.63,
"learning_rate": 3.7173333333333333e-07,
"loss": 2.5215,
"step": 224000
},
{
"epoch": 0.63,
"eval_loss": 2.346996784210205,
"eval_runtime": 330.4016,
"eval_samples_per_second": 907.925,
"eval_steps_per_second": 56.746,
"step": 224000
},
{
"epoch": 0.65,
"eval_loss": 2.3407442569732666,
"eval_runtime": 331.6282,
"eval_samples_per_second": 904.567,
"eval_steps_per_second": 56.536,
"step": 232000
},
{
"epoch": 0.67,
"learning_rate": 3.69e-07,
"loss": 2.5096,
"step": 240000
},
{
"epoch": 0.67,
"eval_loss": 2.340013265609741,
"eval_runtime": 330.2111,
"eval_samples_per_second": 908.449,
"eval_steps_per_second": 56.779,
"step": 240000
},
{
"epoch": 0.7,
"eval_loss": 2.340172290802002,
"eval_runtime": 330.4785,
"eval_samples_per_second": 907.714,
"eval_steps_per_second": 56.733,
"step": 248000
},
{
"epoch": 0.72,
"learning_rate": 3.6626666666666664e-07,
"loss": 2.5176,
"step": 256000
},
{
"epoch": 0.72,
"eval_loss": 2.330843210220337,
"eval_runtime": 329.4037,
"eval_samples_per_second": 910.676,
"eval_steps_per_second": 56.918,
"step": 256000
},
{
"epoch": 0.74,
"eval_loss": 2.3342106342315674,
"eval_runtime": 329.9573,
"eval_samples_per_second": 909.148,
"eval_steps_per_second": 56.823,
"step": 264000
},
{
"epoch": 0.76,
"learning_rate": 3.6353333333333335e-07,
"loss": 2.5048,
"step": 272000
},
{
"epoch": 0.76,
"eval_loss": 2.333300828933716,
"eval_runtime": 330.8736,
"eval_samples_per_second": 906.63,
"eval_steps_per_second": 56.665,
"step": 272000
},
{
"epoch": 0.79,
"eval_loss": 2.3288071155548096,
"eval_runtime": 329.6491,
"eval_samples_per_second": 909.998,
"eval_steps_per_second": 56.876,
"step": 280000
},
{
"epoch": 0.81,
"learning_rate": 3.608e-07,
"loss": 2.4979,
"step": 288000
},
{
"epoch": 0.81,
"eval_loss": 2.329832077026367,
"eval_runtime": 329.6289,
"eval_samples_per_second": 910.054,
"eval_steps_per_second": 56.879,
"step": 288000
},
{
"epoch": 0.83,
"eval_loss": 2.323723554611206,
"eval_runtime": 330.3451,
"eval_samples_per_second": 908.081,
"eval_steps_per_second": 56.756,
"step": 296000
},
{
"epoch": 0.85,
"learning_rate": 3.5806666666666666e-07,
"loss": 2.4963,
"step": 304000
},
{
"epoch": 0.85,
"eval_loss": 2.326643943786621,
"eval_runtime": 331.0075,
"eval_samples_per_second": 906.263,
"eval_steps_per_second": 56.642,
"step": 304000
},
{
"epoch": 0.88,
"eval_loss": 2.3196959495544434,
"eval_runtime": 329.7349,
"eval_samples_per_second": 909.761,
"eval_steps_per_second": 56.861,
"step": 312000
},
{
"epoch": 0.9,
"learning_rate": 3.553333333333333e-07,
"loss": 2.4972,
"step": 320000
},
{
"epoch": 0.9,
"eval_loss": 2.327077627182007,
"eval_runtime": 329.8959,
"eval_samples_per_second": 909.317,
"eval_steps_per_second": 56.833,
"step": 320000
},
{
"epoch": 0.92,
"eval_loss": 2.327465534210205,
"eval_runtime": 329.8835,
"eval_samples_per_second": 909.351,
"eval_steps_per_second": 56.835,
"step": 328000
},
{
"epoch": 0.94,
"learning_rate": 3.5259999999999997e-07,
"loss": 2.4969,
"step": 336000
},
{
"epoch": 0.94,
"eval_loss": 2.3209738731384277,
"eval_runtime": 330.0001,
"eval_samples_per_second": 909.03,
"eval_steps_per_second": 56.815,
"step": 336000
},
{
"epoch": 0.97,
"eval_loss": 2.3222391605377197,
"eval_runtime": 331.2832,
"eval_samples_per_second": 905.509,
"eval_steps_per_second": 56.595,
"step": 344000
},
{
"epoch": 0.99,
"learning_rate": 3.498666666666667e-07,
"loss": 2.4961,
"step": 352000
},
{
"epoch": 0.99,
"eval_loss": 2.324232339859009,
"eval_runtime": 329.8848,
"eval_samples_per_second": 909.348,
"eval_steps_per_second": 56.835,
"step": 352000
},
{
"epoch": 1.01,
"eval_loss": 2.3154807090759277,
"eval_runtime": 330.051,
"eval_samples_per_second": 908.89,
"eval_steps_per_second": 56.806,
"step": 360000
},
{
"epoch": 1.03,
"learning_rate": 3.4713333333333333e-07,
"loss": 2.49,
"step": 368000
},
{
"epoch": 1.03,
"eval_loss": 2.3175013065338135,
"eval_runtime": 331.4361,
"eval_samples_per_second": 905.092,
"eval_steps_per_second": 56.569,
"step": 368000
},
{
"epoch": 1.06,
"eval_loss": 2.307647228240967,
"eval_runtime": 332.1323,
"eval_samples_per_second": 903.194,
"eval_steps_per_second": 56.45,
"step": 376000
},
{
"epoch": 1.08,
"learning_rate": 3.444e-07,
"loss": 2.4847,
"step": 384000
},
{
"epoch": 1.08,
"eval_loss": 2.313831090927124,
"eval_runtime": 330.4544,
"eval_samples_per_second": 907.78,
"eval_steps_per_second": 56.737,
"step": 384000
},
{
"epoch": 1.1,
"eval_loss": 2.3183014392852783,
"eval_runtime": 331.0864,
"eval_samples_per_second": 906.047,
"eval_steps_per_second": 56.629,
"step": 392000
},
{
"epoch": 1.12,
"learning_rate": 3.416666666666667e-07,
"loss": 2.4767,
"step": 400000
},
{
"epoch": 1.12,
"eval_loss": 2.3118338584899902,
"eval_runtime": 330.5298,
"eval_samples_per_second": 907.573,
"eval_steps_per_second": 56.724,
"step": 400000
},
{
"epoch": 1.15,
"eval_loss": 2.3151934146881104,
"eval_runtime": 334.1069,
"eval_samples_per_second": 897.856,
"eval_steps_per_second": 56.117,
"step": 408000
},
{
"epoch": 1.17,
"learning_rate": 3.3893333333333335e-07,
"loss": 2.4788,
"step": 416000
},
{
"epoch": 1.17,
"eval_loss": 2.3089170455932617,
"eval_runtime": 330.2914,
"eval_samples_per_second": 908.228,
"eval_steps_per_second": 56.765,
"step": 416000
},
{
"epoch": 1.19,
"eval_loss": 2.3051483631134033,
"eval_runtime": 330.8266,
"eval_samples_per_second": 906.759,
"eval_steps_per_second": 56.673,
"step": 424000
},
{
"epoch": 1.21,
"learning_rate": 3.3619999999999995e-07,
"loss": 2.4738,
"step": 432000
},
{
"epoch": 1.21,
"eval_loss": 2.310180425643921,
"eval_runtime": 329.7325,
"eval_samples_per_second": 909.768,
"eval_steps_per_second": 56.861,
"step": 432000
},
{
"epoch": 1.24,
"eval_loss": 2.3069398403167725,
"eval_runtime": 330.3228,
"eval_samples_per_second": 908.142,
"eval_steps_per_second": 56.76,
"step": 440000
},
{
"epoch": 1.26,
"learning_rate": 3.3346666666666666e-07,
"loss": 2.4635,
"step": 448000
},
{
"epoch": 1.26,
"eval_loss": 2.3003976345062256,
"eval_runtime": 331.7126,
"eval_samples_per_second": 904.337,
"eval_steps_per_second": 56.522,
"step": 448000
},
{
"epoch": 1.28,
"eval_loss": 2.3066189289093018,
"eval_runtime": 331.273,
"eval_samples_per_second": 905.537,
"eval_steps_per_second": 56.597,
"step": 456000
},
{
"epoch": 1.3,
"learning_rate": 3.307333333333333e-07,
"loss": 2.4828,
"step": 464000
},
{
"epoch": 1.3,
"eval_loss": 2.307849168777466,
"eval_runtime": 333.5774,
"eval_samples_per_second": 899.282,
"eval_steps_per_second": 56.206,
"step": 464000
},
{
"epoch": 1.32,
"eval_loss": 2.3072116374969482,
"eval_runtime": 330.1882,
"eval_samples_per_second": 908.512,
"eval_steps_per_second": 56.783,
"step": 472000
},
{
"epoch": 1.35,
"learning_rate": 3.28e-07,
"loss": 2.4675,
"step": 480000
},
{
"epoch": 1.35,
"eval_loss": 2.3072662353515625,
"eval_runtime": 330.6229,
"eval_samples_per_second": 907.318,
"eval_steps_per_second": 56.708,
"step": 480000
},
{
"epoch": 1.37,
"eval_loss": 2.3013877868652344,
"eval_runtime": 332.2733,
"eval_samples_per_second": 902.811,
"eval_steps_per_second": 56.426,
"step": 488000
},
{
"epoch": 1.39,
"learning_rate": 3.252666666666667e-07,
"loss": 2.4676,
"step": 496000
},
{
"epoch": 1.39,
"eval_loss": 2.298736095428467,
"eval_runtime": 330.3572,
"eval_samples_per_second": 908.047,
"eval_steps_per_second": 56.754,
"step": 496000
},
{
"epoch": 1.41,
"eval_loss": 2.2987987995147705,
"eval_runtime": 330.8652,
"eval_samples_per_second": 906.653,
"eval_steps_per_second": 56.667,
"step": 504000
},
{
"epoch": 1.44,
"learning_rate": 3.2253333333333334e-07,
"loss": 2.4678,
"step": 512000
},
{
"epoch": 1.44,
"eval_loss": 2.2971158027648926,
"eval_runtime": 333.0983,
"eval_samples_per_second": 900.575,
"eval_steps_per_second": 56.287,
"step": 512000
},
{
"epoch": 1.46,
"eval_loss": 2.2968783378601074,
"eval_runtime": 331.5018,
"eval_samples_per_second": 904.912,
"eval_steps_per_second": 56.558,
"step": 520000
},
{
"epoch": 1.48,
"learning_rate": 3.198e-07,
"loss": 2.4634,
"step": 528000
},
{
"epoch": 1.48,
"eval_loss": 2.2989814281463623,
"eval_runtime": 333.5333,
"eval_samples_per_second": 899.4,
"eval_steps_per_second": 56.213,
"step": 528000
},
{
"epoch": 1.5,
"eval_loss": 2.2869136333465576,
"eval_runtime": 332.2841,
"eval_samples_per_second": 902.782,
"eval_steps_per_second": 56.425,
"step": 536000
},
{
"epoch": 1.53,
"learning_rate": 3.1706666666666665e-07,
"loss": 2.4657,
"step": 544000
},
{
"epoch": 1.53,
"eval_loss": 2.293611526489258,
"eval_runtime": 331.5364,
"eval_samples_per_second": 904.818,
"eval_steps_per_second": 56.552,
"step": 544000
},
{
"epoch": 1.55,
"eval_loss": 2.291510581970215,
"eval_runtime": 331.7602,
"eval_samples_per_second": 904.207,
"eval_steps_per_second": 56.514,
"step": 552000
},
{
"epoch": 1.57,
"learning_rate": 3.1433333333333336e-07,
"loss": 2.4607,
"step": 560000
},
{
"epoch": 1.57,
"eval_loss": 2.290339469909668,
"eval_runtime": 331.9545,
"eval_samples_per_second": 903.678,
"eval_steps_per_second": 56.481,
"step": 560000
},
{
"epoch": 1.59,
"eval_loss": 2.2934372425079346,
"eval_runtime": 334.4873,
"eval_samples_per_second": 896.835,
"eval_steps_per_second": 56.053,
"step": 568000
},
{
"epoch": 1.62,
"learning_rate": 3.116e-07,
"loss": 2.4558,
"step": 576000
},
{
"epoch": 1.62,
"eval_loss": 2.284529447555542,
"eval_runtime": 334.3226,
"eval_samples_per_second": 897.277,
"eval_steps_per_second": 56.081,
"step": 576000
},
{
"epoch": 1.64,
"eval_loss": 2.289668083190918,
"eval_runtime": 335.2625,
"eval_samples_per_second": 894.761,
"eval_steps_per_second": 55.923,
"step": 584000
},
{
"epoch": 1.66,
"learning_rate": 3.0886666666666667e-07,
"loss": 2.4662,
"step": 592000
},
{
"epoch": 1.66,
"eval_loss": 2.2928452491760254,
"eval_runtime": 330.9741,
"eval_samples_per_second": 906.355,
"eval_steps_per_second": 56.648,
"step": 592000
},
{
"epoch": 1.68,
"eval_loss": 2.286137580871582,
"eval_runtime": 332.2239,
"eval_samples_per_second": 902.945,
"eval_steps_per_second": 56.435,
"step": 600000
},
{
"epoch": 1.71,
"learning_rate": 3.061333333333333e-07,
"loss": 2.4658,
"step": 608000
},
{
"epoch": 1.71,
"eval_loss": 2.2883219718933105,
"eval_runtime": 334.2582,
"eval_samples_per_second": 897.45,
"eval_steps_per_second": 56.091,
"step": 608000
},
{
"epoch": 1.73,
"eval_loss": 2.287848472595215,
"eval_runtime": 332.6298,
"eval_samples_per_second": 901.843,
"eval_steps_per_second": 56.366,
"step": 616000
},
{
"epoch": 1.75,
"learning_rate": 3.034e-07,
"loss": 2.4533,
"step": 624000
},
{
"epoch": 1.75,
"eval_loss": 2.2891786098480225,
"eval_runtime": 333.5921,
"eval_samples_per_second": 899.242,
"eval_steps_per_second": 56.203,
"step": 624000
},
{
"epoch": 1.77,
"eval_loss": 2.2885706424713135,
"eval_runtime": 332.9432,
"eval_samples_per_second": 900.995,
"eval_steps_per_second": 56.313,
"step": 632000
},
{
"epoch": 1.8,
"learning_rate": 3.0066666666666663e-07,
"loss": 2.4575,
"step": 640000
},
{
"epoch": 1.8,
"eval_loss": 2.2894177436828613,
"eval_runtime": 334.1499,
"eval_samples_per_second": 897.741,
"eval_steps_per_second": 56.11,
"step": 640000
},
{
"epoch": 1.82,
"eval_loss": 2.2870869636535645,
"eval_runtime": 332.2509,
"eval_samples_per_second": 902.872,
"eval_steps_per_second": 56.43,
"step": 648000
},
{
"epoch": 1.84,
"learning_rate": 2.9793333333333334e-07,
"loss": 2.4565,
"step": 656000
},
{
"epoch": 1.84,
"eval_loss": 2.2797837257385254,
"eval_runtime": 332.3564,
"eval_samples_per_second": 902.585,
"eval_steps_per_second": 56.412,
"step": 656000
},
{
"epoch": 1.86,
"eval_loss": 2.2877373695373535,
"eval_runtime": 332.7462,
"eval_samples_per_second": 901.528,
"eval_steps_per_second": 56.346,
"step": 664000
},
{
"epoch": 1.89,
"learning_rate": 2.952e-07,
"loss": 2.4548,
"step": 672000
},
{
"epoch": 1.89,
"eval_loss": 2.2859256267547607,
"eval_runtime": 333.4649,
"eval_samples_per_second": 899.585,
"eval_steps_per_second": 56.225,
"step": 672000
},
{
"epoch": 1.91,
"eval_loss": 2.2786755561828613,
"eval_runtime": 331.6465,
"eval_samples_per_second": 904.517,
"eval_steps_per_second": 56.533,
"step": 680000
},
{
"epoch": 1.93,
"learning_rate": 2.9246666666666665e-07,
"loss": 2.4507,
"step": 688000
},
{
"epoch": 1.93,
"eval_loss": 2.277973175048828,
"eval_runtime": 332.624,
"eval_samples_per_second": 901.859,
"eval_steps_per_second": 56.367,
"step": 688000
},
{
"epoch": 1.95,
"eval_loss": 2.2825992107391357,
"eval_runtime": 332.2329,
"eval_samples_per_second": 902.921,
"eval_steps_per_second": 56.433,
"step": 696000
},
{
"epoch": 1.98,
"learning_rate": 2.897333333333333e-07,
"loss": 2.4455,
"step": 704000
},
{
"epoch": 1.98,
"eval_loss": 2.283816337585449,
"eval_runtime": 332.7513,
"eval_samples_per_second": 901.514,
"eval_steps_per_second": 56.345,
"step": 704000
},
{
"epoch": 2.0,
"eval_loss": 2.2763917446136475,
"eval_runtime": 331.7671,
"eval_samples_per_second": 904.188,
"eval_steps_per_second": 56.513,
"step": 712000
},
{
"epoch": 2.02,
"learning_rate": 2.8699999999999996e-07,
"loss": 2.4516,
"step": 720000
},
{
"epoch": 2.02,
"eval_loss": 2.281355381011963,
"eval_runtime": 331.7857,
"eval_samples_per_second": 904.138,
"eval_steps_per_second": 56.509,
"step": 720000
},
{
"epoch": 2.04,
"eval_loss": 2.2807059288024902,
"eval_runtime": 332.6438,
"eval_samples_per_second": 901.805,
"eval_steps_per_second": 56.364,
"step": 728000
},
{
"epoch": 2.07,
"learning_rate": 2.8426666666666667e-07,
"loss": 2.445,
"step": 736000
},
{
"epoch": 2.07,
"eval_loss": 2.2740111351013184,
"eval_runtime": 332.4045,
"eval_samples_per_second": 902.455,
"eval_steps_per_second": 56.404,
"step": 736000
},
{
"epoch": 2.09,
"eval_loss": 2.277953624725342,
"eval_runtime": 331.8291,
"eval_samples_per_second": 904.02,
"eval_steps_per_second": 56.502,
"step": 744000
},
{
"epoch": 2.11,
"learning_rate": 2.815333333333333e-07,
"loss": 2.4466,
"step": 752000
},
{
"epoch": 2.11,
"eval_loss": 2.2774717807769775,
"eval_runtime": 331.8071,
"eval_samples_per_second": 904.079,
"eval_steps_per_second": 56.506,
"step": 752000
},
{
"epoch": 2.13,
"eval_loss": 2.2783188819885254,
"eval_runtime": 333.2568,
"eval_samples_per_second": 900.147,
"eval_steps_per_second": 56.26,
"step": 760000
},
{
"epoch": 2.16,
"learning_rate": 2.7880000000000003e-07,
"loss": 2.4476,
"step": 768000
},
{
"epoch": 2.16,
"eval_loss": 2.2762770652770996,
"eval_runtime": 331.8887,
"eval_samples_per_second": 903.857,
"eval_steps_per_second": 56.492,
"step": 768000
},
{
"epoch": 2.18,
"eval_loss": 2.2737369537353516,
"eval_runtime": 331.8743,
"eval_samples_per_second": 903.896,
"eval_steps_per_second": 56.494,
"step": 776000
},
{
"epoch": 2.2,
"learning_rate": 2.7606666666666664e-07,
"loss": 2.4449,
"step": 784000
},
{
"epoch": 2.2,
"eval_loss": 2.2752888202667236,
"eval_runtime": 334.1528,
"eval_samples_per_second": 897.733,
"eval_steps_per_second": 56.109,
"step": 784000
},
{
"epoch": 2.22,
"eval_loss": 2.276200532913208,
"eval_runtime": 332.3689,
"eval_samples_per_second": 902.551,
"eval_steps_per_second": 56.41,
"step": 792000
},
{
"epoch": 2.25,
"learning_rate": 2.733333333333333e-07,
"loss": 2.4424,
"step": 800000
},
{
"epoch": 2.25,
"eval_loss": 2.276653528213501,
"eval_runtime": 332.4217,
"eval_samples_per_second": 902.408,
"eval_steps_per_second": 56.401,
"step": 800000
},
{
"epoch": 2.27,
"eval_loss": 2.2701988220214844,
"eval_runtime": 332.7419,
"eval_samples_per_second": 901.54,
"eval_steps_per_second": 56.347,
"step": 808000
},
{
"epoch": 2.29,
"learning_rate": 2.706e-07,
"loss": 2.4528,
"step": 816000
},
{
"epoch": 2.29,
"eval_loss": 2.26547908782959,
"eval_runtime": 332.284,
"eval_samples_per_second": 902.782,
"eval_steps_per_second": 56.425,
"step": 816000
},
{
"epoch": 2.31,
"eval_loss": 2.272664785385132,
"eval_runtime": 332.94,
"eval_samples_per_second": 901.003,
"eval_steps_per_second": 56.313,
"step": 824000
},
{
"epoch": 2.34,
"learning_rate": 2.6786666666666666e-07,
"loss": 2.4523,
"step": 832000
},
{
"epoch": 2.34,
"eval_loss": 2.2732608318328857,
"eval_runtime": 332.6487,
"eval_samples_per_second": 901.792,
"eval_steps_per_second": 56.363,
"step": 832000
},
{
"epoch": 2.36,
"eval_loss": 2.2654263973236084,
"eval_runtime": 332.7531,
"eval_samples_per_second": 901.509,
"eval_steps_per_second": 56.345,
"step": 840000
},
{
"epoch": 2.38,
"learning_rate": 2.651333333333333e-07,
"loss": 2.4395,
"step": 848000
},
{
"epoch": 2.38,
"eval_loss": 2.2673776149749756,
"eval_runtime": 332.3327,
"eval_samples_per_second": 902.65,
"eval_steps_per_second": 56.416,
"step": 848000
},
{
"epoch": 2.4,
"eval_loss": 2.275400161743164,
"eval_runtime": 333.0968,
"eval_samples_per_second": 900.579,
"eval_steps_per_second": 56.287,
"step": 856000
},
{
"epoch": 2.43,
"learning_rate": 2.624e-07,
"loss": 2.434,
"step": 864000
},
{
"epoch": 2.43,
"eval_loss": 2.2722461223602295,
"eval_runtime": 333.3836,
"eval_samples_per_second": 899.804,
"eval_steps_per_second": 56.239,
"step": 864000
},
{
"epoch": 2.45,
"eval_loss": 2.266554117202759,
"eval_runtime": 332.9633,
"eval_samples_per_second": 900.94,
"eval_steps_per_second": 56.31,
"step": 872000
},
{
"epoch": 2.47,
"learning_rate": 2.596666666666667e-07,
"loss": 2.4407,
"step": 880000
},
{
"epoch": 2.47,
"eval_loss": 2.265575647354126,
"eval_runtime": 334.6536,
"eval_samples_per_second": 896.39,
"eval_steps_per_second": 56.025,
"step": 880000
},
{
"epoch": 2.49,
"eval_loss": 2.265437602996826,
"eval_runtime": 333.1051,
"eval_samples_per_second": 900.556,
"eval_steps_per_second": 56.286,
"step": 888000
},
{
"epoch": 2.52,
"learning_rate": 2.5693333333333333e-07,
"loss": 2.4352,
"step": 896000
},
{
"epoch": 2.52,
"eval_loss": 2.263028383255005,
"eval_runtime": 333.7641,
"eval_samples_per_second": 898.778,
"eval_steps_per_second": 56.174,
"step": 896000
},
{
"epoch": 2.54,
"eval_loss": 2.2662160396575928,
"eval_runtime": 333.089,
"eval_samples_per_second": 900.6,
"eval_steps_per_second": 56.288,
"step": 904000
},
{
"epoch": 2.56,
"learning_rate": 2.542e-07,
"loss": 2.4393,
"step": 912000
},
{
"epoch": 2.56,
"eval_loss": 2.2692363262176514,
"eval_runtime": 333.5532,
"eval_samples_per_second": 899.347,
"eval_steps_per_second": 56.21,
"step": 912000
},
{
"epoch": 2.58,
"eval_loss": 2.2558484077453613,
"eval_runtime": 335.5892,
"eval_samples_per_second": 893.891,
"eval_steps_per_second": 55.869,
"step": 920000
},
{
"epoch": 2.61,
"learning_rate": 2.5146666666666664e-07,
"loss": 2.4378,
"step": 928000
},
{
"epoch": 2.61,
"eval_loss": 2.2619380950927734,
"eval_runtime": 333.9818,
"eval_samples_per_second": 898.193,
"eval_steps_per_second": 56.138,
"step": 928000
},
{
"epoch": 2.63,
"eval_loss": 2.261375665664673,
"eval_runtime": 333.299,
"eval_samples_per_second": 900.033,
"eval_steps_per_second": 56.253,
"step": 936000
},
{
"epoch": 2.65,
"learning_rate": 2.4873333333333335e-07,
"loss": 2.4392,
"step": 944000
},
{
"epoch": 2.65,
"eval_loss": 2.2577741146087646,
"eval_runtime": 332.5892,
"eval_samples_per_second": 901.954,
"eval_steps_per_second": 56.373,
"step": 944000
},
{
"epoch": 2.67,
"eval_loss": 2.267181873321533,
"eval_runtime": 333.717,
"eval_samples_per_second": 898.905,
"eval_steps_per_second": 56.182,
"step": 952000
},
{
"epoch": 2.69,
"learning_rate": 2.46e-07,
"loss": 2.437,
"step": 960000
},
{
"epoch": 2.69,
"eval_loss": 2.2597758769989014,
"eval_runtime": 334.1825,
"eval_samples_per_second": 897.653,
"eval_steps_per_second": 56.104,
"step": 960000
},
{
"epoch": 2.72,
"eval_loss": 2.263289451599121,
"eval_runtime": 333.6576,
"eval_samples_per_second": 899.065,
"eval_steps_per_second": 56.192,
"step": 968000
},
{
"epoch": 2.74,
"learning_rate": 2.4326666666666666e-07,
"loss": 2.4388,
"step": 976000
},
{
"epoch": 2.74,
"eval_loss": 2.256582260131836,
"eval_runtime": 335.1086,
"eval_samples_per_second": 895.172,
"eval_steps_per_second": 55.949,
"step": 976000
},
{
"epoch": 2.76,
"eval_loss": 2.255068778991699,
"eval_runtime": 334.1259,
"eval_samples_per_second": 897.805,
"eval_steps_per_second": 56.114,
"step": 984000
},
{
"epoch": 2.78,
"learning_rate": 2.405333333333333e-07,
"loss": 2.4386,
"step": 992000
},
{
"epoch": 2.78,
"eval_loss": 2.2605791091918945,
"eval_runtime": 334.7883,
"eval_samples_per_second": 896.029,
"eval_steps_per_second": 56.003,
"step": 992000
},
{
"epoch": 2.81,
"eval_loss": 2.263402223587036,
"eval_runtime": 334.1108,
"eval_samples_per_second": 897.846,
"eval_steps_per_second": 56.116,
"step": 1000000
},
{
"epoch": 2.83,
"learning_rate": 2.3779999999999997e-07,
"loss": 2.4402,
"step": 1008000
},
{
"epoch": 2.83,
"eval_loss": 2.264103889465332,
"eval_runtime": 334.5974,
"eval_samples_per_second": 896.54,
"eval_steps_per_second": 56.035,
"step": 1008000
},
{
"epoch": 2.85,
"eval_loss": 2.2618744373321533,
"eval_runtime": 335.396,
"eval_samples_per_second": 894.405,
"eval_steps_per_second": 55.901,
"step": 1016000
},
{
"epoch": 2.87,
"learning_rate": 2.3506666666666668e-07,
"loss": 2.4442,
"step": 1024000
},
{
"epoch": 2.87,
"eval_loss": 2.258431911468506,
"eval_runtime": 334.1391,
"eval_samples_per_second": 897.77,
"eval_steps_per_second": 56.111,
"step": 1024000
},
{
"epoch": 2.9,
"eval_loss": 2.257888078689575,
"eval_runtime": 337.4777,
"eval_samples_per_second": 888.888,
"eval_steps_per_second": 55.556,
"step": 1032000
},
{
"epoch": 2.92,
"learning_rate": 2.3233333333333334e-07,
"loss": 2.4327,
"step": 1040000
},
{
"epoch": 2.92,
"eval_loss": 2.252260684967041,
"eval_runtime": 335.916,
"eval_samples_per_second": 893.021,
"eval_steps_per_second": 55.815,
"step": 1040000
},
{
"epoch": 2.94,
"eval_loss": 2.2561793327331543,
"eval_runtime": 335.381,
"eval_samples_per_second": 894.446,
"eval_steps_per_second": 55.904,
"step": 1048000
},
{
"epoch": 2.96,
"learning_rate": 2.2960000000000002e-07,
"loss": 2.4289,
"step": 1056000
},
{
"epoch": 2.96,
"eval_loss": 2.259270191192627,
"eval_runtime": 338.94,
"eval_samples_per_second": 885.053,
"eval_steps_per_second": 55.317,
"step": 1056000
},
{
"epoch": 2.99,
"eval_loss": 2.256190776824951,
"eval_runtime": 337.0761,
"eval_samples_per_second": 889.947,
"eval_steps_per_second": 55.622,
"step": 1064000
},
{
"epoch": 3.01,
"learning_rate": 2.2686666666666667e-07,
"loss": 2.4319,
"step": 1072000
},
{
"epoch": 3.01,
"eval_loss": 2.253577709197998,
"eval_runtime": 337.3724,
"eval_samples_per_second": 889.166,
"eval_steps_per_second": 55.574,
"step": 1072000
},
{
"epoch": 3.03,
"eval_loss": 2.260322332382202,
"eval_runtime": 335.9181,
"eval_samples_per_second": 893.015,
"eval_steps_per_second": 55.814,
"step": 1080000
},
{
"epoch": 3.05,
"learning_rate": 2.2413333333333333e-07,
"loss": 2.4174,
"step": 1088000
},
{
"epoch": 3.05,
"eval_loss": 2.2548863887786865,
"eval_runtime": 336.1593,
"eval_samples_per_second": 892.374,
"eval_steps_per_second": 55.774,
"step": 1088000
},
{
"epoch": 3.08,
"eval_loss": 2.2595221996307373,
"eval_runtime": 338.2665,
"eval_samples_per_second": 886.816,
"eval_steps_per_second": 55.427,
"step": 1096000
},
{
"epoch": 3.1,
"learning_rate": 2.214e-07,
"loss": 2.4155,
"step": 1104000
},
{
"epoch": 3.1,
"eval_loss": 2.255467176437378,
"eval_runtime": 335.0383,
"eval_samples_per_second": 895.36,
"eval_steps_per_second": 55.961,
"step": 1104000
},
{
"epoch": 3.12,
"eval_loss": 2.250143527984619,
"eval_runtime": 337.3147,
"eval_samples_per_second": 889.318,
"eval_steps_per_second": 55.583,
"step": 1112000
},
{
"epoch": 3.14,
"learning_rate": 2.1866666666666667e-07,
"loss": 2.427,
"step": 1120000
},
{
"epoch": 3.14,
"eval_loss": 2.2528042793273926,
"eval_runtime": 335.8317,
"eval_samples_per_second": 893.245,
"eval_steps_per_second": 55.829,
"step": 1120000
},
{
"epoch": 3.17,
"eval_loss": 2.252933979034424,
"eval_runtime": 335.79,
"eval_samples_per_second": 893.356,
"eval_steps_per_second": 55.835,
"step": 1128000
},
{
"epoch": 3.19,
"learning_rate": 2.1593333333333332e-07,
"loss": 2.4222,
"step": 1136000
},
{
"epoch": 3.19,
"eval_loss": 2.253556251525879,
"eval_runtime": 336.7473,
"eval_samples_per_second": 890.816,
"eval_steps_per_second": 55.677,
"step": 1136000
},
{
"epoch": 3.21,
"eval_loss": 2.258152723312378,
"eval_runtime": 337.5276,
"eval_samples_per_second": 888.757,
"eval_steps_per_second": 55.548,
"step": 1144000
},
{
"epoch": 3.23,
"learning_rate": 2.132e-07,
"loss": 2.4232,
"step": 1152000
},
{
"epoch": 3.23,
"eval_loss": 2.2522146701812744,
"eval_runtime": 335.4197,
"eval_samples_per_second": 894.342,
"eval_steps_per_second": 55.897,
"step": 1152000
},
{
"epoch": 3.26,
"eval_loss": 2.2524819374084473,
"eval_runtime": 337.4419,
"eval_samples_per_second": 888.983,
"eval_steps_per_second": 55.562,
"step": 1160000
},
{
"epoch": 3.28,
"learning_rate": 2.1046666666666666e-07,
"loss": 2.4252,
"step": 1168000
},
{
"epoch": 3.28,
"eval_loss": 2.2537834644317627,
"eval_runtime": 336.2053,
"eval_samples_per_second": 892.252,
"eval_steps_per_second": 55.767,
"step": 1168000
},
{
"epoch": 3.3,
"eval_loss": 2.2512009143829346,
"eval_runtime": 335.734,
"eval_samples_per_second": 893.505,
"eval_steps_per_second": 55.845,
"step": 1176000
},
{
"epoch": 3.32,
"learning_rate": 2.0773333333333334e-07,
"loss": 2.4209,
"step": 1184000
},
{
"epoch": 3.32,
"eval_loss": 2.255702018737793,
"eval_runtime": 337.398,
"eval_samples_per_second": 889.098,
"eval_steps_per_second": 55.569,
"step": 1184000
},
{
"epoch": 3.35,
"eval_loss": 2.2445454597473145,
"eval_runtime": 338.3834,
"eval_samples_per_second": 886.509,
"eval_steps_per_second": 55.408,
"step": 1192000
},
{
"epoch": 3.37,
"learning_rate": 2.05e-07,
"loss": 2.4243,
"step": 1200000
},
{
"epoch": 3.37,
"eval_loss": 2.257007122039795,
"eval_runtime": 336.8153,
"eval_samples_per_second": 890.636,
"eval_steps_per_second": 55.666,
"step": 1200000
},
{
"epoch": 3.39,
"eval_loss": 2.25388240814209,
"eval_runtime": 339.0365,
"eval_samples_per_second": 884.801,
"eval_steps_per_second": 55.301,
"step": 1208000
},
{
"epoch": 3.41,
"learning_rate": 2.0226666666666668e-07,
"loss": 2.4278,
"step": 1216000
},
{
"epoch": 3.41,
"eval_loss": 2.2514150142669678,
"eval_runtime": 340.5571,
"eval_samples_per_second": 880.851,
"eval_steps_per_second": 55.054,
"step": 1216000
},
{
"epoch": 3.44,
"eval_loss": 2.2454025745391846,
"eval_runtime": 337.7515,
"eval_samples_per_second": 888.168,
"eval_steps_per_second": 55.511,
"step": 1224000
},
{
"epoch": 3.46,
"learning_rate": 1.9953333333333333e-07,
"loss": 2.4286,
"step": 1232000
},
{
"epoch": 3.46,
"eval_loss": 2.246293306350708,
"eval_runtime": 339.7018,
"eval_samples_per_second": 883.069,
"eval_steps_per_second": 55.193,
"step": 1232000
},
{
"epoch": 3.48,
"eval_loss": 2.25063157081604,
"eval_runtime": 336.5454,
"eval_samples_per_second": 891.351,
"eval_steps_per_second": 55.71,
"step": 1240000
},
{
"epoch": 3.5,
"learning_rate": 1.968e-07,
"loss": 2.4274,
"step": 1248000
},
{
"epoch": 3.5,
"eval_loss": 2.2426698207855225,
"eval_runtime": 339.3399,
"eval_samples_per_second": 884.01,
"eval_steps_per_second": 55.251,
"step": 1248000
},
{
"epoch": 3.53,
"eval_loss": 2.2535440921783447,
"eval_runtime": 339.1007,
"eval_samples_per_second": 884.634,
"eval_steps_per_second": 55.29,
"step": 1256000
},
{
"epoch": 3.55,
"learning_rate": 1.9406666666666667e-07,
"loss": 2.4201,
"step": 1264000
},
{
"epoch": 3.55,
"eval_loss": 2.2516891956329346,
"eval_runtime": 337.2905,
"eval_samples_per_second": 889.382,
"eval_steps_per_second": 55.587,
"step": 1264000
},
{
"epoch": 3.57,
"eval_loss": 2.2436001300811768,
"eval_runtime": 340.5027,
"eval_samples_per_second": 880.992,
"eval_steps_per_second": 55.063,
"step": 1272000
},
{
"epoch": 3.59,
"learning_rate": 1.9133333333333333e-07,
"loss": 2.4233,
"step": 1280000
},
{
"epoch": 3.59,
"eval_loss": 2.242955446243286,
"eval_runtime": 338.961,
"eval_samples_per_second": 884.999,
"eval_steps_per_second": 55.313,
"step": 1280000
},
{
"epoch": 3.62,
"eval_loss": 2.247040271759033,
"eval_runtime": 336.8862,
"eval_samples_per_second": 890.449,
"eval_steps_per_second": 55.654,
"step": 1288000
},
{
"epoch": 3.64,
"learning_rate": 1.886e-07,
"loss": 2.4183,
"step": 1296000
},
{
"epoch": 3.64,
"eval_loss": 2.244565963745117,
"eval_runtime": 337.986,
"eval_samples_per_second": 887.552,
"eval_steps_per_second": 55.473,
"step": 1296000
},
{
"epoch": 3.66,
"eval_loss": 2.2539021968841553,
"eval_runtime": 340.0782,
"eval_samples_per_second": 882.091,
"eval_steps_per_second": 55.131,
"step": 1304000
},
{
"epoch": 3.68,
"learning_rate": 1.8586666666666666e-07,
"loss": 2.428,
"step": 1312000
},
{
"epoch": 3.68,
"eval_loss": 2.249154806137085,
"eval_runtime": 337.5652,
"eval_samples_per_second": 888.658,
"eval_steps_per_second": 55.542,
"step": 1312000
},
{
"epoch": 3.71,
"eval_loss": 2.2543509006500244,
"eval_runtime": 337.3598,
"eval_samples_per_second": 889.199,
"eval_steps_per_second": 55.576,
"step": 1320000
},
{
"epoch": 3.73,
"learning_rate": 1.8313333333333332e-07,
"loss": 2.4206,
"step": 1328000
},
{
"epoch": 3.73,
"eval_loss": 2.2478220462799072,
"eval_runtime": 339.2392,
"eval_samples_per_second": 884.273,
"eval_steps_per_second": 55.268,
"step": 1328000
},
{
"epoch": 3.75,
"eval_loss": 2.2420246601104736,
"eval_runtime": 337.9033,
"eval_samples_per_second": 887.769,
"eval_steps_per_second": 55.486,
"step": 1336000
},
{
"epoch": 3.77,
"learning_rate": 1.804e-07,
"loss": 2.4287,
"step": 1344000
},
{
"epoch": 3.77,
"eval_loss": 2.244210958480835,
"eval_runtime": 337.3268,
"eval_samples_per_second": 889.286,
"eval_steps_per_second": 55.581,
"step": 1344000
},
{
"epoch": 3.8,
"eval_loss": 2.2426180839538574,
"eval_runtime": 339.5586,
"eval_samples_per_second": 883.441,
"eval_steps_per_second": 55.216,
"step": 1352000
},
{
"epoch": 3.82,
"learning_rate": 1.7766666666666666e-07,
"loss": 2.4297,
"step": 1360000
},
{
"epoch": 3.82,
"eval_loss": 2.242596387863159,
"eval_runtime": 337.7343,
"eval_samples_per_second": 888.213,
"eval_steps_per_second": 55.514,
"step": 1360000
},
{
"epoch": 3.84,
"eval_loss": 2.2480640411376953,
"eval_runtime": 337.5382,
"eval_samples_per_second": 888.729,
"eval_steps_per_second": 55.546,
"step": 1368000
},
{
"epoch": 3.86,
"learning_rate": 1.7493333333333334e-07,
"loss": 2.4185,
"step": 1376000
},
{
"epoch": 3.86,
"eval_loss": 2.2448768615722656,
"eval_runtime": 339.0271,
"eval_samples_per_second": 884.826,
"eval_steps_per_second": 55.302,
"step": 1376000
},
{
"epoch": 3.89,
"eval_loss": 2.246758222579956,
"eval_runtime": 338.8022,
"eval_samples_per_second": 885.413,
"eval_steps_per_second": 55.339,
"step": 1384000
},
{
"epoch": 3.91,
"learning_rate": 1.722e-07,
"loss": 2.4217,
"step": 1392000
},
{
"epoch": 3.91,
"eval_loss": 2.2466745376586914,
"eval_runtime": 341.1017,
"eval_samples_per_second": 879.444,
"eval_steps_per_second": 54.966,
"step": 1392000
},
{
"epoch": 3.93,
"eval_loss": 2.2463412284851074,
"eval_runtime": 340.0034,
"eval_samples_per_second": 882.285,
"eval_steps_per_second": 55.144,
"step": 1400000
},
{
"epoch": 3.95,
"learning_rate": 1.6946666666666668e-07,
"loss": 2.4144,
"step": 1408000
},
{
"epoch": 3.95,
"eval_loss": 2.2481906414031982,
"eval_runtime": 338.7844,
"eval_samples_per_second": 885.46,
"eval_steps_per_second": 55.342,
"step": 1408000
},
{
"epoch": 3.97,
"eval_loss": 2.242440938949585,
"eval_runtime": 339.569,
"eval_samples_per_second": 883.414,
"eval_steps_per_second": 55.214,
"step": 1416000
},
{
"epoch": 4.0,
"learning_rate": 1.6673333333333333e-07,
"loss": 2.4175,
"step": 1424000
},
{
"epoch": 4.0,
"eval_loss": 2.2414705753326416,
"eval_runtime": 339.2204,
"eval_samples_per_second": 884.322,
"eval_steps_per_second": 55.271,
"step": 1424000
},
{
"epoch": 4.02,
"eval_loss": 2.2450637817382812,
"eval_runtime": 338.8494,
"eval_samples_per_second": 885.29,
"eval_steps_per_second": 55.331,
"step": 1432000
},
{
"epoch": 4.04,
"learning_rate": 1.64e-07,
"loss": 2.4169,
"step": 1440000
},
{
"epoch": 4.04,
"eval_loss": 2.244276285171509,
"eval_runtime": 338.3144,
"eval_samples_per_second": 886.69,
"eval_steps_per_second": 55.419,
"step": 1440000
},
{
"epoch": 4.06,
"eval_loss": 2.2389209270477295,
"eval_runtime": 343.9025,
"eval_samples_per_second": 872.282,
"eval_steps_per_second": 54.518,
"step": 1448000
},
{
"epoch": 4.09,
"learning_rate": 1.6126666666666667e-07,
"loss": 2.4142,
"step": 1456000
},
{
"epoch": 4.09,
"eval_loss": 2.2376506328582764,
"eval_runtime": 338.9552,
"eval_samples_per_second": 885.014,
"eval_steps_per_second": 55.314,
"step": 1456000
},
{
"epoch": 4.11,
"eval_loss": 2.239941358566284,
"eval_runtime": 342.029,
"eval_samples_per_second": 877.06,
"eval_steps_per_second": 54.817,
"step": 1464000
},
{
"epoch": 4.13,
"learning_rate": 1.5853333333333332e-07,
"loss": 2.4122,
"step": 1472000
},
{
"epoch": 4.13,
"eval_loss": 2.24470853805542,
"eval_runtime": 338.988,
"eval_samples_per_second": 884.928,
"eval_steps_per_second": 55.309,
"step": 1472000
},
{
"epoch": 4.15,
"eval_loss": 2.24562931060791,
"eval_runtime": 341.3425,
"eval_samples_per_second": 878.824,
"eval_steps_per_second": 54.927,
"step": 1480000
},
{
"epoch": 4.18,
"learning_rate": 1.558e-07,
"loss": 2.4166,
"step": 1488000
},
{
"epoch": 4.18,
"eval_loss": 2.245072364807129,
"eval_runtime": 339.7578,
"eval_samples_per_second": 882.923,
"eval_steps_per_second": 55.183,
"step": 1488000
},
{
"epoch": 4.2,
"eval_loss": 2.2368929386138916,
"eval_runtime": 340.9662,
"eval_samples_per_second": 879.794,
"eval_steps_per_second": 54.988,
"step": 1496000
},
{
"epoch": 4.22,
"learning_rate": 1.5306666666666666e-07,
"loss": 2.4165,
"step": 1504000
},
{
"epoch": 4.22,
"eval_loss": 2.2426319122314453,
"eval_runtime": 339.1777,
"eval_samples_per_second": 884.433,
"eval_steps_per_second": 55.278,
"step": 1504000
},
{
"epoch": 4.24,
"eval_loss": 2.238410472869873,
"eval_runtime": 340.071,
"eval_samples_per_second": 882.11,
"eval_steps_per_second": 55.133,
"step": 1512000
},
{
"epoch": 4.27,
"learning_rate": 1.5033333333333332e-07,
"loss": 2.4204,
"step": 1520000
},
{
"epoch": 4.27,
"eval_loss": 2.245389461517334,
"eval_runtime": 339.4968,
"eval_samples_per_second": 883.602,
"eval_steps_per_second": 55.226,
"step": 1520000
},
{
"epoch": 4.29,
"eval_loss": 2.242230176925659,
"eval_runtime": 341.1938,
"eval_samples_per_second": 879.207,
"eval_steps_per_second": 54.951,
"step": 1528000
},
{
"epoch": 4.31,
"learning_rate": 1.476e-07,
"loss": 2.4192,
"step": 1536000
},
{
"epoch": 4.31,
"eval_loss": 2.2423222064971924,
"eval_runtime": 341.7051,
"eval_samples_per_second": 877.892,
"eval_steps_per_second": 54.869,
"step": 1536000
},
{
"epoch": 4.33,
"eval_loss": 2.2434957027435303,
"eval_runtime": 344.5773,
"eval_samples_per_second": 870.574,
"eval_steps_per_second": 54.412,
"step": 1544000
},
{
"epoch": 4.36,
"learning_rate": 1.4486666666666665e-07,
"loss": 2.4167,
"step": 1552000
},
{
"epoch": 4.36,
"eval_loss": 2.2450661659240723,
"eval_runtime": 342.2307,
"eval_samples_per_second": 876.543,
"eval_steps_per_second": 54.785,
"step": 1552000
},
{
"epoch": 4.38,
"eval_loss": 2.2442915439605713,
"eval_runtime": 339.3897,
"eval_samples_per_second": 883.881,
"eval_steps_per_second": 55.243,
"step": 1560000
},
{
"epoch": 4.4,
"learning_rate": 1.4213333333333334e-07,
"loss": 2.4124,
"step": 1568000
},
{
"epoch": 4.4,
"eval_loss": 2.243011713027954,
"eval_runtime": 339.9044,
"eval_samples_per_second": 882.542,
"eval_steps_per_second": 55.16,
"step": 1568000
},
{
"epoch": 4.42,
"eval_loss": 2.2422168254852295,
"eval_runtime": 340.0517,
"eval_samples_per_second": 882.16,
"eval_steps_per_second": 55.136,
"step": 1576000
},
{
"epoch": 4.45,
"learning_rate": 1.3940000000000002e-07,
"loss": 2.406,
"step": 1584000
},
{
"epoch": 4.45,
"eval_loss": 2.2356574535369873,
"eval_runtime": 339.8299,
"eval_samples_per_second": 882.736,
"eval_steps_per_second": 55.172,
"step": 1584000
},
{
"epoch": 4.47,
"eval_loss": 2.2395410537719727,
"eval_runtime": 340.769,
"eval_samples_per_second": 880.303,
"eval_steps_per_second": 55.02,
"step": 1592000
},
{
"epoch": 4.49,
"learning_rate": 1.3666666666666665e-07,
"loss": 2.4166,
"step": 1600000
},
{
"epoch": 4.49,
"eval_loss": 2.2377548217773438,
"eval_runtime": 341.8287,
"eval_samples_per_second": 877.574,
"eval_steps_per_second": 54.849,
"step": 1600000
},
{
"epoch": 4.51,
"eval_loss": 2.2419931888580322,
"eval_runtime": 341.3154,
"eval_samples_per_second": 878.894,
"eval_steps_per_second": 54.932,
"step": 1608000
},
{
"epoch": 4.54,
"learning_rate": 1.3393333333333333e-07,
"loss": 2.4144,
"step": 1616000
},
{
"epoch": 4.54,
"eval_loss": 2.2401504516601562,
"eval_runtime": 341.1783,
"eval_samples_per_second": 879.247,
"eval_steps_per_second": 54.954,
"step": 1616000
},
{
"epoch": 4.56,
"eval_loss": 2.238373041152954,
"eval_runtime": 340.7212,
"eval_samples_per_second": 880.427,
"eval_steps_per_second": 55.027,
"step": 1624000
},
{
"epoch": 4.58,
"learning_rate": 1.312e-07,
"loss": 2.4219,
"step": 1632000
},
{
"epoch": 4.58,
"eval_loss": 2.2437572479248047,
"eval_runtime": 342.9314,
"eval_samples_per_second": 874.752,
"eval_steps_per_second": 54.673,
"step": 1632000
},
{
"epoch": 4.6,
"eval_loss": 2.2455334663391113,
"eval_runtime": 340.3903,
"eval_samples_per_second": 881.282,
"eval_steps_per_second": 55.081,
"step": 1640000
},
{
"epoch": 4.63,
"learning_rate": 1.2846666666666667e-07,
"loss": 2.4061,
"step": 1648000
},
{
"epoch": 4.63,
"eval_loss": 2.2396554946899414,
"eval_runtime": 342.4586,
"eval_samples_per_second": 875.96,
"eval_steps_per_second": 54.748,
"step": 1648000
},
{
"epoch": 4.65,
"eval_loss": 2.23541522026062,
"eval_runtime": 341.3616,
"eval_samples_per_second": 878.775,
"eval_steps_per_second": 54.924,
"step": 1656000
},
{
"epoch": 4.67,
"learning_rate": 1.2573333333333332e-07,
"loss": 2.411,
"step": 1664000
},
{
"epoch": 4.67,
"eval_loss": 2.2392566204071045,
"eval_runtime": 340.724,
"eval_samples_per_second": 880.419,
"eval_steps_per_second": 55.027,
"step": 1664000
},
{
"epoch": 4.69,
"eval_loss": 2.238832473754883,
"eval_runtime": 342.3701,
"eval_samples_per_second": 876.186,
"eval_steps_per_second": 54.762,
"step": 1672000
},
{
"epoch": 4.72,
"learning_rate": 1.23e-07,
"loss": 2.4125,
"step": 1680000
},
{
"epoch": 4.72,
"eval_loss": 2.2406108379364014,
"eval_runtime": 343.1331,
"eval_samples_per_second": 874.238,
"eval_steps_per_second": 54.641,
"step": 1680000
},
{
"epoch": 4.74,
"eval_loss": 2.2330496311187744,
"eval_runtime": 341.3886,
"eval_samples_per_second": 878.705,
"eval_steps_per_second": 54.92,
"step": 1688000
},
{
"epoch": 4.76,
"learning_rate": 1.2026666666666666e-07,
"loss": 2.4092,
"step": 1696000
},
{
"epoch": 4.76,
"eval_loss": 2.2335941791534424,
"eval_runtime": 341.2812,
"eval_samples_per_second": 878.982,
"eval_steps_per_second": 54.937,
"step": 1696000
},
{
"epoch": 4.78,
"eval_loss": 2.239811658859253,
"eval_runtime": 341.3993,
"eval_samples_per_second": 878.678,
"eval_steps_per_second": 54.918,
"step": 1704000
},
{
"epoch": 4.81,
"learning_rate": 1.1753333333333334e-07,
"loss": 2.4078,
"step": 1712000
},
{
"epoch": 4.81,
"eval_loss": 2.2368171215057373,
"eval_runtime": 342.0299,
"eval_samples_per_second": 877.058,
"eval_steps_per_second": 54.817,
"step": 1712000
},
{
"epoch": 4.83,
"eval_loss": 2.236109495162964,
"eval_runtime": 341.4467,
"eval_samples_per_second": 878.556,
"eval_steps_per_second": 54.91,
"step": 1720000
},
{
"epoch": 4.85,
"learning_rate": 1.1480000000000001e-07,
"loss": 2.4185,
"step": 1728000
},
{
"epoch": 4.85,
"eval_loss": 2.2378110885620117,
"eval_runtime": 342.0137,
"eval_samples_per_second": 877.099,
"eval_steps_per_second": 54.819,
"step": 1728000
},
{
"epoch": 4.87,
"eval_loss": 2.2338638305664062,
"eval_runtime": 341.8702,
"eval_samples_per_second": 877.467,
"eval_steps_per_second": 54.842,
"step": 1736000
},
{
"epoch": 4.9,
"learning_rate": 1.1206666666666666e-07,
"loss": 2.4088,
"step": 1744000
},
{
"epoch": 4.9,
"eval_loss": 2.2365610599517822,
"eval_runtime": 343.0203,
"eval_samples_per_second": 874.526,
"eval_steps_per_second": 54.659,
"step": 1744000
},
{
"epoch": 4.92,
"eval_loss": 2.238463878631592,
"eval_runtime": 342.0664,
"eval_samples_per_second": 876.964,
"eval_steps_per_second": 54.811,
"step": 1752000
},
{
"epoch": 4.94,
"learning_rate": 1.0933333333333333e-07,
"loss": 2.4095,
"step": 1760000
},
{
"epoch": 4.94,
"eval_loss": 2.2336812019348145,
"eval_runtime": 343.3628,
"eval_samples_per_second": 873.653,
"eval_steps_per_second": 54.604,
"step": 1760000
},
{
"epoch": 4.96,
"eval_loss": 2.2413289546966553,
"eval_runtime": 342.3332,
"eval_samples_per_second": 876.281,
"eval_steps_per_second": 54.768,
"step": 1768000
},
{
"epoch": 4.99,
"learning_rate": 1.066e-07,
"loss": 2.4078,
"step": 1776000
},
{
"epoch": 4.99,
"eval_loss": 2.237656593322754,
"eval_runtime": 342.7683,
"eval_samples_per_second": 875.168,
"eval_steps_per_second": 54.699,
"step": 1776000
},
{
"epoch": 5.01,
"eval_loss": 2.2302229404449463,
"eval_runtime": 342.0809,
"eval_samples_per_second": 876.927,
"eval_steps_per_second": 54.809,
"step": 1784000
},
{
"epoch": 5.03,
"learning_rate": 1.0386666666666667e-07,
"loss": 2.4073,
"step": 1792000
},
{
"epoch": 5.03,
"eval_loss": 2.2356677055358887,
"eval_runtime": 342.5577,
"eval_samples_per_second": 875.707,
"eval_steps_per_second": 54.732,
"step": 1792000
},
{
"epoch": 5.05,
"eval_loss": 2.2384088039398193,
"eval_runtime": 342.428,
"eval_samples_per_second": 876.038,
"eval_steps_per_second": 54.753,
"step": 1800000
},
{
"epoch": 5.08,
"learning_rate": 1.0113333333333334e-07,
"loss": 2.4073,
"step": 1808000
},
{
"epoch": 5.08,
"eval_loss": 2.2321836948394775,
"eval_runtime": 342.2614,
"eval_samples_per_second": 876.465,
"eval_steps_per_second": 54.78,
"step": 1808000
},
{
"epoch": 5.1,
"eval_loss": 2.234363317489624,
"eval_runtime": 344.4478,
"eval_samples_per_second": 870.901,
"eval_steps_per_second": 54.432,
"step": 1816000
},
{
"epoch": 5.12,
"learning_rate": 9.84e-08,
"loss": 2.4043,
"step": 1824000
},
{
"epoch": 5.12,
"eval_loss": 2.232731819152832,
"eval_runtime": 342.8187,
"eval_samples_per_second": 875.04,
"eval_steps_per_second": 54.691,
"step": 1824000
},
{
"epoch": 5.14,
"eval_loss": 2.234955072402954,
"eval_runtime": 343.4884,
"eval_samples_per_second": 873.334,
"eval_steps_per_second": 54.584,
"step": 1832000
},
{
"epoch": 5.17,
"learning_rate": 9.566666666666666e-08,
"loss": 2.4082,
"step": 1840000
},
{
"epoch": 5.17,
"eval_loss": 2.2375595569610596,
"eval_runtime": 343.7508,
"eval_samples_per_second": 872.667,
"eval_steps_per_second": 54.542,
"step": 1840000
},
{
"epoch": 5.19,
"eval_loss": 2.2363414764404297,
"eval_runtime": 343.8693,
"eval_samples_per_second": 872.366,
"eval_steps_per_second": 54.524,
"step": 1848000
},
{
"epoch": 5.21,
"learning_rate": 9.293333333333333e-08,
"loss": 2.4073,
"step": 1856000
},
{
"epoch": 5.21,
"eval_loss": 2.23234224319458,
"eval_runtime": 342.8958,
"eval_samples_per_second": 874.843,
"eval_steps_per_second": 54.678,
"step": 1856000
},
{
"epoch": 5.23,
"eval_loss": 2.2419273853302,
"eval_runtime": 343.3653,
"eval_samples_per_second": 873.647,
"eval_steps_per_second": 54.604,
"step": 1864000
},
{
"epoch": 5.26,
"learning_rate": 9.02e-08,
"loss": 2.4148,
"step": 1872000
},
{
"epoch": 5.26,
"eval_loss": 2.2292640209198,
"eval_runtime": 344.1756,
"eval_samples_per_second": 871.59,
"eval_steps_per_second": 54.475,
"step": 1872000
},
{
"epoch": 5.28,
"eval_loss": 2.2345802783966064,
"eval_runtime": 346.0562,
"eval_samples_per_second": 866.854,
"eval_steps_per_second": 54.179,
"step": 1880000
},
{
"epoch": 5.3,
"learning_rate": 8.746666666666667e-08,
"loss": 2.4098,
"step": 1888000
},
{
"epoch": 5.3,
"eval_loss": 2.237226963043213,
"eval_runtime": 345.4208,
"eval_samples_per_second": 868.448,
"eval_steps_per_second": 54.279,
"step": 1888000
},
{
"epoch": 5.32,
"eval_loss": 2.237149953842163,
"eval_runtime": 343.8922,
"eval_samples_per_second": 872.308,
"eval_steps_per_second": 54.52,
"step": 1896000
},
{
"epoch": 5.34,
"learning_rate": 8.473333333333334e-08,
"loss": 2.407,
"step": 1904000
},
{
"epoch": 5.34,
"eval_loss": 2.2396621704101562,
"eval_runtime": 346.177,
"eval_samples_per_second": 866.551,
"eval_steps_per_second": 54.16,
"step": 1904000
},
{
"epoch": 5.37,
"eval_loss": 2.2300214767456055,
"eval_runtime": 345.6113,
"eval_samples_per_second": 867.969,
"eval_steps_per_second": 54.249,
"step": 1912000
},
{
"epoch": 5.39,
"learning_rate": 8.2e-08,
"loss": 2.4108,
"step": 1920000
},
{
"epoch": 5.39,
"eval_loss": 2.2317283153533936,
"eval_runtime": 344.6229,
"eval_samples_per_second": 870.459,
"eval_steps_per_second": 54.404,
"step": 1920000
},
{
"epoch": 5.41,
"eval_loss": 2.2349703311920166,
"eval_runtime": 344.7164,
"eval_samples_per_second": 870.223,
"eval_steps_per_second": 54.39,
"step": 1928000
},
{
"epoch": 5.43,
"learning_rate": 7.926666666666666e-08,
"loss": 2.4168,
"step": 1936000
},
{
"epoch": 5.43,
"eval_loss": 2.2343006134033203,
"eval_runtime": 344.4965,
"eval_samples_per_second": 870.778,
"eval_steps_per_second": 54.424,
"step": 1936000
},
{
"epoch": 5.46,
"eval_loss": 2.232745885848999,
"eval_runtime": 343.8717,
"eval_samples_per_second": 872.36,
"eval_steps_per_second": 54.523,
"step": 1944000
},
{
"epoch": 5.48,
"learning_rate": 7.653333333333333e-08,
"loss": 2.4113,
"step": 1952000
},
{
"epoch": 5.48,
"eval_loss": 2.2363381385803223,
"eval_runtime": 343.8179,
"eval_samples_per_second": 872.497,
"eval_steps_per_second": 54.532,
"step": 1952000
},
{
"epoch": 5.5,
"eval_loss": 2.231372833251953,
"eval_runtime": 345.6256,
"eval_samples_per_second": 867.933,
"eval_steps_per_second": 54.247,
"step": 1960000
},
{
"epoch": 5.52,
"learning_rate": 7.38e-08,
"loss": 2.4131,
"step": 1968000
},
{
"epoch": 5.52,
"eval_loss": 2.23030686378479,
"eval_runtime": 344.554,
"eval_samples_per_second": 870.633,
"eval_steps_per_second": 54.415,
"step": 1968000
},
{
"epoch": 5.55,
"eval_loss": 2.2353336811065674,
"eval_runtime": 345.2222,
"eval_samples_per_second": 868.948,
"eval_steps_per_second": 54.31,
"step": 1976000
},
{
"epoch": 5.57,
"learning_rate": 7.106666666666667e-08,
"loss": 2.4129,
"step": 1984000
},
{
"epoch": 5.57,
"eval_loss": 2.235344886779785,
"eval_runtime": 344.2446,
"eval_samples_per_second": 871.415,
"eval_steps_per_second": 54.464,
"step": 1984000
},
{
"epoch": 5.59,
"eval_loss": 2.2295796871185303,
"eval_runtime": 344.0878,
"eval_samples_per_second": 871.812,
"eval_steps_per_second": 54.489,
"step": 1992000
},
{
"epoch": 5.61,
"learning_rate": 6.833333333333332e-08,
"loss": 2.4129,
"step": 2000000
},
{
"epoch": 5.61,
"eval_loss": 2.2313883304595947,
"eval_runtime": 344.0986,
"eval_samples_per_second": 871.785,
"eval_steps_per_second": 54.487,
"step": 2000000
},
{
"epoch": 5.64,
"eval_loss": 2.2287940979003906,
"eval_runtime": 343.9635,
"eval_samples_per_second": 872.127,
"eval_steps_per_second": 54.509,
"step": 2008000
},
{
"epoch": 5.66,
"learning_rate": 6.56e-08,
"loss": 2.4045,
"step": 2016000
},
{
"epoch": 5.66,
"eval_loss": 2.2346994876861572,
"eval_runtime": 344.4276,
"eval_samples_per_second": 870.952,
"eval_steps_per_second": 54.435,
"step": 2016000
},
{
"epoch": 5.68,
"eval_loss": 2.2348926067352295,
"eval_runtime": 344.0715,
"eval_samples_per_second": 871.854,
"eval_steps_per_second": 54.492,
"step": 2024000
},
{
"epoch": 5.7,
"learning_rate": 6.286666666666666e-08,
"loss": 2.4089,
"step": 2032000
},
{
"epoch": 5.7,
"eval_loss": 2.231017589569092,
"eval_runtime": 344.7006,
"eval_samples_per_second": 870.262,
"eval_steps_per_second": 54.392,
"step": 2032000
},
{
"epoch": 5.73,
"eval_loss": 2.2342352867126465,
"eval_runtime": 344.3635,
"eval_samples_per_second": 871.114,
"eval_steps_per_second": 54.445,
"step": 2040000
},
{
"epoch": 5.75,
"learning_rate": 6.013333333333333e-08,
"loss": 2.4091,
"step": 2048000
},
{
"epoch": 5.75,
"eval_loss": 2.2319512367248535,
"eval_runtime": 345.0718,
"eval_samples_per_second": 869.326,
"eval_steps_per_second": 54.334,
"step": 2048000
},
{
"epoch": 5.77,
"eval_loss": 2.231105327606201,
"eval_runtime": 345.8677,
"eval_samples_per_second": 867.326,
"eval_steps_per_second": 54.209,
"step": 2056000
},
{
"epoch": 5.79,
"learning_rate": 5.7400000000000004e-08,
"loss": 2.4137,
"step": 2064000
},
{
"epoch": 5.79,
"eval_loss": 2.2278153896331787,
"eval_runtime": 345.3364,
"eval_samples_per_second": 868.66,
"eval_steps_per_second": 54.292,
"step": 2064000
},
{
"epoch": 5.82,
"eval_loss": 2.2343814373016357,
"eval_runtime": 344.9277,
"eval_samples_per_second": 869.69,
"eval_steps_per_second": 54.356,
"step": 2072000
},
{
"epoch": 5.84,
"learning_rate": 5.4666666666666666e-08,
"loss": 2.4063,
"step": 2080000
},
{
"epoch": 5.84,
"eval_loss": 2.233853340148926,
"eval_runtime": 346.3645,
"eval_samples_per_second": 866.082,
"eval_steps_per_second": 54.131,
"step": 2080000
},
{
"epoch": 5.86,
"eval_loss": 2.22705078125,
"eval_runtime": 344.9123,
"eval_samples_per_second": 869.728,
"eval_steps_per_second": 54.359,
"step": 2088000
},
{
"epoch": 5.88,
"learning_rate": 5.1933333333333335e-08,
"loss": 2.4046,
"step": 2096000
},
{
"epoch": 5.88,
"eval_loss": 2.22632098197937,
"eval_runtime": 346.6997,
"eval_samples_per_second": 865.244,
"eval_steps_per_second": 54.078,
"step": 2096000
},
{
"epoch": 5.91,
"eval_loss": 2.236851453781128,
"eval_runtime": 346.071,
"eval_samples_per_second": 866.816,
"eval_steps_per_second": 54.177,
"step": 2104000
},
{
"epoch": 5.93,
"learning_rate": 4.92e-08,
"loss": 2.4105,
"step": 2112000
},
{
"epoch": 5.93,
"eval_loss": 2.2329680919647217,
"eval_runtime": 347.6747,
"eval_samples_per_second": 862.818,
"eval_steps_per_second": 53.927,
"step": 2112000
},
{
"epoch": 5.95,
"eval_loss": 2.236093521118164,
"eval_runtime": 346.0239,
"eval_samples_per_second": 866.934,
"eval_steps_per_second": 54.184,
"step": 2120000
},
{
"epoch": 5.97,
"learning_rate": 4.6466666666666666e-08,
"loss": 2.4045,
"step": 2128000
},
{
"epoch": 5.97,
"eval_loss": 2.231955051422119,
"eval_runtime": 345.4242,
"eval_samples_per_second": 868.439,
"eval_steps_per_second": 54.278,
"step": 2128000
},
{
"epoch": 6.0,
"eval_loss": 2.2282557487487793,
"eval_runtime": 345.9755,
"eval_samples_per_second": 867.056,
"eval_steps_per_second": 54.192,
"step": 2136000
},
{
"epoch": 6.02,
"learning_rate": 4.3733333333333335e-08,
"loss": 2.4093,
"step": 2144000
},
{
"epoch": 6.02,
"eval_loss": 2.22622013092041,
"eval_runtime": 347.3733,
"eval_samples_per_second": 863.567,
"eval_steps_per_second": 53.974,
"step": 2144000
},
{
"epoch": 6.04,
"eval_loss": 2.229443311691284,
"eval_runtime": 346.1833,
"eval_samples_per_second": 866.535,
"eval_steps_per_second": 54.159,
"step": 2152000
},
{
"epoch": 6.06,
"learning_rate": 4.1e-08,
"loss": 2.4109,
"step": 2160000
},
{
"epoch": 6.06,
"eval_loss": 2.233405351638794,
"eval_runtime": 346.0615,
"eval_samples_per_second": 866.84,
"eval_steps_per_second": 54.178,
"step": 2160000
},
{
"epoch": 6.09,
"eval_loss": 2.236346483230591,
"eval_runtime": 345.3461,
"eval_samples_per_second": 868.636,
"eval_steps_per_second": 54.29,
"step": 2168000
},
{
"epoch": 6.11,
"learning_rate": 3.8266666666666665e-08,
"loss": 2.4061,
"step": 2176000
},
{
"epoch": 6.11,
"eval_loss": 2.2308871746063232,
"eval_runtime": 347.2972,
"eval_samples_per_second": 863.756,
"eval_steps_per_second": 53.985,
"step": 2176000
},
{
"epoch": 6.13,
"eval_loss": 2.2269339561462402,
"eval_runtime": 347.1506,
"eval_samples_per_second": 864.121,
"eval_steps_per_second": 54.008,
"step": 2184000
},
{
"epoch": 6.15,
"learning_rate": 3.5533333333333334e-08,
"loss": 2.4007,
"step": 2192000
},
{
"epoch": 6.15,
"eval_loss": 2.236927032470703,
"eval_runtime": 347.4425,
"eval_samples_per_second": 863.395,
"eval_steps_per_second": 53.963,
"step": 2192000
},
{
"epoch": 6.18,
"eval_loss": 2.229724168777466,
"eval_runtime": 345.6431,
"eval_samples_per_second": 867.889,
"eval_steps_per_second": 54.244,
"step": 2200000
},
{
"epoch": 6.2,
"learning_rate": 3.28e-08,
"loss": 2.4034,
"step": 2208000
},
{
"epoch": 6.2,
"eval_loss": 2.2266740798950195,
"eval_runtime": 346.5718,
"eval_samples_per_second": 865.564,
"eval_steps_per_second": 54.098,
"step": 2208000
},
{
"epoch": 6.22,
"eval_loss": 2.2310221195220947,
"eval_runtime": 346.6214,
"eval_samples_per_second": 865.44,
"eval_steps_per_second": 54.091,
"step": 2216000
},
{
"epoch": 6.24,
"learning_rate": 3.0066666666666665e-08,
"loss": 2.4049,
"step": 2224000
},
{
"epoch": 6.24,
"eval_loss": 2.236175060272217,
"eval_runtime": 348.3819,
"eval_samples_per_second": 861.067,
"eval_steps_per_second": 53.817,
"step": 2224000
},
{
"epoch": 6.27,
"eval_loss": 2.231903076171875,
"eval_runtime": 347.5521,
"eval_samples_per_second": 863.122,
"eval_steps_per_second": 53.946,
"step": 2232000
},
{
"epoch": 6.29,
"learning_rate": 2.7333333333333333e-08,
"loss": 2.4052,
"step": 2240000
},
{
"epoch": 6.29,
"eval_loss": 2.2307627201080322,
"eval_runtime": 347.1934,
"eval_samples_per_second": 864.014,
"eval_steps_per_second": 54.002,
"step": 2240000
},
{
"epoch": 6.31,
"eval_loss": 2.2225306034088135,
"eval_runtime": 347.1345,
"eval_samples_per_second": 864.161,
"eval_steps_per_second": 54.011,
"step": 2248000
},
{
"epoch": 6.33,
"learning_rate": 2.46e-08,
"loss": 2.4102,
"step": 2256000
},
{
"epoch": 6.33,
"eval_loss": 2.2365851402282715,
"eval_runtime": 347.8217,
"eval_samples_per_second": 862.453,
"eval_steps_per_second": 53.904,
"step": 2256000
},
{
"epoch": 6.36,
"eval_loss": 2.232743740081787,
"eval_runtime": 347.6812,
"eval_samples_per_second": 862.802,
"eval_steps_per_second": 53.926,
"step": 2264000
},
{
"epoch": 6.38,
"learning_rate": 2.1866666666666667e-08,
"loss": 2.4046,
"step": 2272000
},
{
"epoch": 6.38,
"eval_loss": 2.2304911613464355,
"eval_runtime": 348.0072,
"eval_samples_per_second": 861.994,
"eval_steps_per_second": 53.875,
"step": 2272000
},
{
"epoch": 6.4,
"eval_loss": 2.230863094329834,
"eval_runtime": 347.1957,
"eval_samples_per_second": 864.008,
"eval_steps_per_second": 54.001,
"step": 2280000
},
{
"epoch": 6.42,
"learning_rate": 1.9133333333333333e-08,
"loss": 2.4066,
"step": 2288000
},
{
"epoch": 6.42,
"eval_loss": 2.2291176319122314,
"eval_runtime": 347.3697,
"eval_samples_per_second": 863.576,
"eval_steps_per_second": 53.974,
"step": 2288000
},
{
"epoch": 6.45,
"eval_loss": 2.2300875186920166,
"eval_runtime": 348.7468,
"eval_samples_per_second": 860.165,
"eval_steps_per_second": 53.761,
"step": 2296000
},
{
"epoch": 6.47,
"learning_rate": 1.64e-08,
"loss": 2.4041,
"step": 2304000
},
{
"epoch": 6.47,
"eval_loss": 2.237844467163086,
"eval_runtime": 347.5443,
"eval_samples_per_second": 863.142,
"eval_steps_per_second": 53.947,
"step": 2304000
},
{
"epoch": 6.49,
"eval_loss": 2.2317147254943848,
"eval_runtime": 348.3274,
"eval_samples_per_second": 861.201,
"eval_steps_per_second": 53.826,
"step": 2312000
},
{
"epoch": 6.51,
"learning_rate": 1.3666666666666667e-08,
"loss": 2.4081,
"step": 2320000
},
{
"epoch": 6.51,
"eval_loss": 2.232565402984619,
"eval_runtime": 349.1958,
"eval_samples_per_second": 859.059,
"eval_steps_per_second": 53.692,
"step": 2320000
},
{
"epoch": 6.54,
"eval_loss": 2.2412142753601074,
"eval_runtime": 347.7133,
"eval_samples_per_second": 862.722,
"eval_steps_per_second": 53.921,
"step": 2328000
},
{
"epoch": 6.56,
"learning_rate": 1.0933333333333334e-08,
"loss": 2.4147,
"step": 2336000
},
{
"epoch": 6.56,
"eval_loss": 2.2348580360412598,
"eval_runtime": 348.9265,
"eval_samples_per_second": 859.722,
"eval_steps_per_second": 53.733,
"step": 2336000
},
{
"epoch": 6.58,
"eval_loss": 2.229579210281372,
"eval_runtime": 348.1888,
"eval_samples_per_second": 861.544,
"eval_steps_per_second": 53.847,
"step": 2344000
},
{
"epoch": 6.6,
"learning_rate": 8.2e-09,
"loss": 2.4105,
"step": 2352000
},
{
"epoch": 6.6,
"eval_loss": 2.231281280517578,
"eval_runtime": 348.2564,
"eval_samples_per_second": 861.377,
"eval_steps_per_second": 53.837,
"step": 2352000
},
{
"epoch": 6.62,
"eval_loss": 2.229707717895508,
"eval_runtime": 349.2314,
"eval_samples_per_second": 858.972,
"eval_steps_per_second": 53.686,
"step": 2360000
},
{
"epoch": 6.65,
"learning_rate": 5.466666666666667e-09,
"loss": 2.4096,
"step": 2368000
},
{
"epoch": 6.65,
"eval_loss": 2.2240703105926514,
"eval_runtime": 348.1981,
"eval_samples_per_second": 861.521,
"eval_steps_per_second": 53.846,
"step": 2368000
},
{
"epoch": 6.67,
"eval_loss": 2.232208490371704,
"eval_runtime": 349.4311,
"eval_samples_per_second": 858.481,
"eval_steps_per_second": 53.656,
"step": 2376000
},
{
"epoch": 6.69,
"learning_rate": 2.7333333333333334e-09,
"loss": 2.4089,
"step": 2384000
},
{
"epoch": 6.69,
"eval_loss": 2.234354019165039,
"eval_runtime": 349.741,
"eval_samples_per_second": 857.72,
"eval_steps_per_second": 53.608,
"step": 2384000
},
{
"epoch": 6.71,
"eval_loss": 2.229137659072876,
"eval_runtime": 349.1189,
"eval_samples_per_second": 859.249,
"eval_steps_per_second": 53.704,
"step": 2392000
},
{
"epoch": 6.74,
"learning_rate": 0.0,
"loss": 2.4048,
"step": 2400000
},
{
"epoch": 6.74,
"eval_loss": 2.227388858795166,
"eval_runtime": 348.8655,
"eval_samples_per_second": 859.873,
"eval_steps_per_second": 53.743,
"step": 2400000
},
{
"epoch": 6.74,
"step": 2400000,
"total_flos": 7.587638746774346e+17,
"train_loss": 2.443978935546875,
"train_runtime": 257950.5102,
"train_samples_per_second": 148.866,
"train_steps_per_second": 9.304
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 7,
"save_steps": 32000,
"total_flos": 7.587638746774346e+17,
"trial_name": null,
"trial_params": null
}