2020-Q1-75p-filtered-random / trainer_state.json
DouglasPontes's picture
End of training
33a781c
{
"best_metric": 1.9023408889770508,
"best_model_checkpoint": "./model_tweets_2020_Q1_75/checkpoint-2336000",
"epoch": 19.569471624266146,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"eval_loss": 2.2396600246429443,
"eval_runtime": 113.355,
"eval_samples_per_second": 911.075,
"eval_steps_per_second": 56.945,
"step": 8000
},
{
"epoch": 0.13,
"learning_rate": 4.0726666666666665e-07,
"loss": 2.4342,
"step": 16000
},
{
"epoch": 0.13,
"eval_loss": 2.1510801315307617,
"eval_runtime": 111.953,
"eval_samples_per_second": 922.486,
"eval_steps_per_second": 57.658,
"step": 16000
},
{
"epoch": 0.2,
"eval_loss": 2.1108760833740234,
"eval_runtime": 111.7901,
"eval_samples_per_second": 923.83,
"eval_steps_per_second": 57.742,
"step": 24000
},
{
"epoch": 0.26,
"learning_rate": 4.0453333333333336e-07,
"loss": 2.2417,
"step": 32000
},
{
"epoch": 0.26,
"eval_loss": 2.0788779258728027,
"eval_runtime": 111.8843,
"eval_samples_per_second": 923.051,
"eval_steps_per_second": 57.694,
"step": 32000
},
{
"epoch": 0.33,
"eval_loss": 2.065674066543579,
"eval_runtime": 111.7346,
"eval_samples_per_second": 924.288,
"eval_steps_per_second": 57.771,
"step": 40000
},
{
"epoch": 0.39,
"learning_rate": 4.018e-07,
"loss": 2.1852,
"step": 48000
},
{
"epoch": 0.39,
"eval_loss": 2.0397331714630127,
"eval_runtime": 114.2687,
"eval_samples_per_second": 903.791,
"eval_steps_per_second": 56.49,
"step": 48000
},
{
"epoch": 0.46,
"eval_loss": 2.0303494930267334,
"eval_runtime": 112.0167,
"eval_samples_per_second": 921.961,
"eval_steps_per_second": 57.625,
"step": 56000
},
{
"epoch": 0.52,
"learning_rate": 3.9906666666666667e-07,
"loss": 2.1511,
"step": 64000
},
{
"epoch": 0.52,
"eval_loss": 2.024770975112915,
"eval_runtime": 112.2634,
"eval_samples_per_second": 919.935,
"eval_steps_per_second": 57.499,
"step": 64000
},
{
"epoch": 0.59,
"eval_loss": 2.022064685821533,
"eval_runtime": 112.5031,
"eval_samples_per_second": 917.975,
"eval_steps_per_second": 57.376,
"step": 72000
},
{
"epoch": 0.65,
"learning_rate": 3.963333333333333e-07,
"loss": 2.1261,
"step": 80000
},
{
"epoch": 0.65,
"eval_loss": 2.0128211975097656,
"eval_runtime": 113.0139,
"eval_samples_per_second": 913.826,
"eval_steps_per_second": 57.117,
"step": 80000
},
{
"epoch": 0.72,
"eval_loss": 2.0067052841186523,
"eval_runtime": 112.3731,
"eval_samples_per_second": 919.037,
"eval_steps_per_second": 57.443,
"step": 88000
},
{
"epoch": 0.78,
"learning_rate": 3.936e-07,
"loss": 2.1179,
"step": 96000
},
{
"epoch": 0.78,
"eval_loss": 2.003864288330078,
"eval_runtime": 113.5425,
"eval_samples_per_second": 909.571,
"eval_steps_per_second": 56.851,
"step": 96000
},
{
"epoch": 0.85,
"eval_loss": 1.9972714185714722,
"eval_runtime": 112.6781,
"eval_samples_per_second": 916.549,
"eval_steps_per_second": 57.287,
"step": 104000
},
{
"epoch": 0.91,
"learning_rate": 3.908666666666667e-07,
"loss": 2.1097,
"step": 112000
},
{
"epoch": 0.91,
"eval_loss": 1.98354971408844,
"eval_runtime": 112.0904,
"eval_samples_per_second": 921.355,
"eval_steps_per_second": 57.587,
"step": 112000
},
{
"epoch": 0.98,
"eval_loss": 1.9983153343200684,
"eval_runtime": 112.7204,
"eval_samples_per_second": 916.205,
"eval_steps_per_second": 57.266,
"step": 120000
},
{
"epoch": 1.04,
"learning_rate": 3.8813333333333334e-07,
"loss": 2.1031,
"step": 128000
},
{
"epoch": 1.04,
"eval_loss": 1.9898955821990967,
"eval_runtime": 114.024,
"eval_samples_per_second": 905.731,
"eval_steps_per_second": 56.611,
"step": 128000
},
{
"epoch": 1.11,
"eval_loss": 1.9755431413650513,
"eval_runtime": 114.5549,
"eval_samples_per_second": 901.533,
"eval_steps_per_second": 56.349,
"step": 136000
},
{
"epoch": 1.17,
"learning_rate": 3.854e-07,
"loss": 2.0977,
"step": 144000
},
{
"epoch": 1.17,
"eval_loss": 1.9855457544326782,
"eval_runtime": 113.3525,
"eval_samples_per_second": 911.096,
"eval_steps_per_second": 56.946,
"step": 144000
},
{
"epoch": 1.24,
"eval_loss": 1.9721323251724243,
"eval_runtime": 114.0229,
"eval_samples_per_second": 905.739,
"eval_steps_per_second": 56.611,
"step": 152000
},
{
"epoch": 1.3,
"learning_rate": 3.8266666666666665e-07,
"loss": 2.0892,
"step": 160000
},
{
"epoch": 1.3,
"eval_loss": 1.9813446998596191,
"eval_runtime": 113.5565,
"eval_samples_per_second": 909.459,
"eval_steps_per_second": 56.844,
"step": 160000
},
{
"epoch": 1.37,
"eval_loss": 1.9827616214752197,
"eval_runtime": 113.4289,
"eval_samples_per_second": 910.482,
"eval_steps_per_second": 56.908,
"step": 168000
},
{
"epoch": 1.44,
"learning_rate": 3.799333333333333e-07,
"loss": 2.0882,
"step": 176000
},
{
"epoch": 1.44,
"eval_loss": 1.9703537225723267,
"eval_runtime": 112.6429,
"eval_samples_per_second": 916.835,
"eval_steps_per_second": 57.305,
"step": 176000
},
{
"epoch": 1.5,
"eval_loss": 1.9728624820709229,
"eval_runtime": 113.7442,
"eval_samples_per_second": 907.958,
"eval_steps_per_second": 56.75,
"step": 184000
},
{
"epoch": 1.57,
"learning_rate": 3.772e-07,
"loss": 2.0884,
"step": 192000
},
{
"epoch": 1.57,
"eval_loss": 1.9721413850784302,
"eval_runtime": 113.2577,
"eval_samples_per_second": 911.859,
"eval_steps_per_second": 56.994,
"step": 192000
},
{
"epoch": 1.63,
"eval_loss": 1.9663499593734741,
"eval_runtime": 115.0819,
"eval_samples_per_second": 897.404,
"eval_steps_per_second": 56.09,
"step": 200000
},
{
"epoch": 1.7,
"learning_rate": 3.7446666666666667e-07,
"loss": 2.0814,
"step": 208000
},
{
"epoch": 1.7,
"eval_loss": 1.9612431526184082,
"eval_runtime": 113.9384,
"eval_samples_per_second": 906.411,
"eval_steps_per_second": 56.653,
"step": 208000
},
{
"epoch": 1.76,
"eval_loss": 1.971712350845337,
"eval_runtime": 113.4328,
"eval_samples_per_second": 910.451,
"eval_steps_per_second": 56.906,
"step": 216000
},
{
"epoch": 1.83,
"learning_rate": 3.7173333333333333e-07,
"loss": 2.0806,
"step": 224000
},
{
"epoch": 1.83,
"eval_loss": 1.959405541419983,
"eval_runtime": 113.1738,
"eval_samples_per_second": 912.535,
"eval_steps_per_second": 57.036,
"step": 224000
},
{
"epoch": 1.89,
"eval_loss": 1.960507869720459,
"eval_runtime": 112.7387,
"eval_samples_per_second": 916.056,
"eval_steps_per_second": 57.256,
"step": 232000
},
{
"epoch": 1.96,
"learning_rate": 3.69e-07,
"loss": 2.0838,
"step": 240000
},
{
"epoch": 1.96,
"eval_loss": 1.9555588960647583,
"eval_runtime": 113.0742,
"eval_samples_per_second": 913.338,
"eval_steps_per_second": 57.086,
"step": 240000
},
{
"epoch": 2.02,
"eval_loss": 1.955155849456787,
"eval_runtime": 114.9852,
"eval_samples_per_second": 898.159,
"eval_steps_per_second": 56.138,
"step": 248000
},
{
"epoch": 2.09,
"learning_rate": 3.6626666666666664e-07,
"loss": 2.0711,
"step": 256000
},
{
"epoch": 2.09,
"eval_loss": 1.965279459953308,
"eval_runtime": 115.2918,
"eval_samples_per_second": 895.771,
"eval_steps_per_second": 55.988,
"step": 256000
},
{
"epoch": 2.15,
"eval_loss": 1.9581040143966675,
"eval_runtime": 112.9905,
"eval_samples_per_second": 914.015,
"eval_steps_per_second": 57.129,
"step": 264000
},
{
"epoch": 2.22,
"learning_rate": 3.6353333333333335e-07,
"loss": 2.065,
"step": 272000
},
{
"epoch": 2.22,
"eval_loss": 1.9558922052383423,
"eval_runtime": 113.2393,
"eval_samples_per_second": 912.007,
"eval_steps_per_second": 57.003,
"step": 272000
},
{
"epoch": 2.28,
"eval_loss": 1.9615230560302734,
"eval_runtime": 113.7321,
"eval_samples_per_second": 908.055,
"eval_steps_per_second": 56.756,
"step": 280000
},
{
"epoch": 2.35,
"learning_rate": 3.608e-07,
"loss": 2.0769,
"step": 288000
},
{
"epoch": 2.35,
"eval_loss": 1.9493736028671265,
"eval_runtime": 113.1393,
"eval_samples_per_second": 912.813,
"eval_steps_per_second": 57.054,
"step": 288000
},
{
"epoch": 2.41,
"eval_loss": 1.9487217664718628,
"eval_runtime": 112.3053,
"eval_samples_per_second": 919.591,
"eval_steps_per_second": 57.477,
"step": 296000
},
{
"epoch": 2.48,
"learning_rate": 3.5806666666666666e-07,
"loss": 2.0733,
"step": 304000
},
{
"epoch": 2.48,
"eval_loss": 1.9546173810958862,
"eval_runtime": 113.1729,
"eval_samples_per_second": 912.542,
"eval_steps_per_second": 57.037,
"step": 304000
},
{
"epoch": 2.54,
"eval_loss": 1.9445385932922363,
"eval_runtime": 112.3458,
"eval_samples_per_second": 919.26,
"eval_steps_per_second": 57.457,
"step": 312000
},
{
"epoch": 2.61,
"learning_rate": 3.553333333333333e-07,
"loss": 2.0675,
"step": 320000
},
{
"epoch": 2.61,
"eval_loss": 1.9534730911254883,
"eval_runtime": 113.0946,
"eval_samples_per_second": 913.174,
"eval_steps_per_second": 57.076,
"step": 320000
},
{
"epoch": 2.67,
"eval_loss": 1.9580506086349487,
"eval_runtime": 112.7099,
"eval_samples_per_second": 916.291,
"eval_steps_per_second": 57.271,
"step": 328000
},
{
"epoch": 2.74,
"learning_rate": 3.5259999999999997e-07,
"loss": 2.0599,
"step": 336000
},
{
"epoch": 2.74,
"eval_loss": 1.9472216367721558,
"eval_runtime": 113.0268,
"eval_samples_per_second": 913.722,
"eval_steps_per_second": 57.11,
"step": 336000
},
{
"epoch": 2.8,
"eval_loss": 1.9545352458953857,
"eval_runtime": 112.8522,
"eval_samples_per_second": 915.135,
"eval_steps_per_second": 57.199,
"step": 344000
},
{
"epoch": 2.87,
"learning_rate": 3.498666666666667e-07,
"loss": 2.0675,
"step": 352000
},
{
"epoch": 2.87,
"eval_loss": 1.9551931619644165,
"eval_runtime": 112.8344,
"eval_samples_per_second": 915.28,
"eval_steps_per_second": 57.208,
"step": 352000
},
{
"epoch": 2.94,
"eval_loss": 1.9397163391113281,
"eval_runtime": 112.7266,
"eval_samples_per_second": 916.155,
"eval_steps_per_second": 57.262,
"step": 360000
},
{
"epoch": 3.0,
"learning_rate": 3.4713333333333333e-07,
"loss": 2.0711,
"step": 368000
},
{
"epoch": 3.0,
"eval_loss": 1.9475340843200684,
"eval_runtime": 113.527,
"eval_samples_per_second": 909.695,
"eval_steps_per_second": 56.859,
"step": 368000
},
{
"epoch": 3.07,
"eval_loss": 1.9387180805206299,
"eval_runtime": 112.9318,
"eval_samples_per_second": 914.49,
"eval_steps_per_second": 57.158,
"step": 376000
},
{
"epoch": 3.13,
"learning_rate": 3.444e-07,
"loss": 2.0663,
"step": 384000
},
{
"epoch": 3.13,
"eval_loss": 1.948356032371521,
"eval_runtime": 113.6939,
"eval_samples_per_second": 908.36,
"eval_steps_per_second": 56.775,
"step": 384000
},
{
"epoch": 3.2,
"eval_loss": 1.942366361618042,
"eval_runtime": 112.9142,
"eval_samples_per_second": 914.633,
"eval_steps_per_second": 57.167,
"step": 392000
},
{
"epoch": 3.26,
"learning_rate": 3.416666666666667e-07,
"loss": 2.0628,
"step": 400000
},
{
"epoch": 3.26,
"eval_loss": 1.941139578819275,
"eval_runtime": 112.7091,
"eval_samples_per_second": 916.297,
"eval_steps_per_second": 57.271,
"step": 400000
},
{
"epoch": 3.33,
"eval_loss": 1.940949559211731,
"eval_runtime": 112.6381,
"eval_samples_per_second": 916.875,
"eval_steps_per_second": 57.307,
"step": 408000
},
{
"epoch": 3.39,
"learning_rate": 3.3893333333333335e-07,
"loss": 2.0651,
"step": 416000
},
{
"epoch": 3.39,
"eval_loss": 1.9446566104888916,
"eval_runtime": 112.949,
"eval_samples_per_second": 914.351,
"eval_steps_per_second": 57.15,
"step": 416000
},
{
"epoch": 3.46,
"eval_loss": 1.940216064453125,
"eval_runtime": 114.0012,
"eval_samples_per_second": 905.912,
"eval_steps_per_second": 56.622,
"step": 424000
},
{
"epoch": 3.52,
"learning_rate": 3.3619999999999995e-07,
"loss": 2.0598,
"step": 432000
},
{
"epoch": 3.52,
"eval_loss": 1.9503767490386963,
"eval_runtime": 113.8999,
"eval_samples_per_second": 906.717,
"eval_steps_per_second": 56.673,
"step": 432000
},
{
"epoch": 3.59,
"eval_loss": 1.9414310455322266,
"eval_runtime": 113.4917,
"eval_samples_per_second": 909.978,
"eval_steps_per_second": 56.876,
"step": 440000
},
{
"epoch": 3.65,
"learning_rate": 3.3346666666666666e-07,
"loss": 2.0612,
"step": 448000
},
{
"epoch": 3.65,
"eval_loss": 1.9329679012298584,
"eval_runtime": 113.8065,
"eval_samples_per_second": 907.462,
"eval_steps_per_second": 56.719,
"step": 448000
},
{
"epoch": 3.72,
"eval_loss": 1.942373514175415,
"eval_runtime": 112.7208,
"eval_samples_per_second": 916.202,
"eval_steps_per_second": 57.265,
"step": 456000
},
{
"epoch": 3.78,
"learning_rate": 3.307333333333333e-07,
"loss": 2.0653,
"step": 464000
},
{
"epoch": 3.78,
"eval_loss": 1.930959939956665,
"eval_runtime": 113.0386,
"eval_samples_per_second": 913.626,
"eval_steps_per_second": 57.104,
"step": 464000
},
{
"epoch": 3.85,
"eval_loss": 1.9363598823547363,
"eval_runtime": 112.8486,
"eval_samples_per_second": 915.164,
"eval_steps_per_second": 57.201,
"step": 472000
},
{
"epoch": 3.91,
"learning_rate": 3.28e-07,
"loss": 2.0585,
"step": 480000
},
{
"epoch": 3.91,
"eval_loss": 1.9507150650024414,
"eval_runtime": 114.0147,
"eval_samples_per_second": 905.804,
"eval_steps_per_second": 56.615,
"step": 480000
},
{
"epoch": 3.98,
"eval_loss": 1.9320358037948608,
"eval_runtime": 113.3729,
"eval_samples_per_second": 910.932,
"eval_steps_per_second": 56.936,
"step": 488000
},
{
"epoch": 4.04,
"learning_rate": 3.252666666666667e-07,
"loss": 2.0593,
"step": 496000
},
{
"epoch": 4.04,
"eval_loss": 1.9416472911834717,
"eval_runtime": 113.4866,
"eval_samples_per_second": 910.02,
"eval_steps_per_second": 56.879,
"step": 496000
},
{
"epoch": 4.11,
"eval_loss": 1.934741735458374,
"eval_runtime": 112.805,
"eval_samples_per_second": 915.518,
"eval_steps_per_second": 57.223,
"step": 504000
},
{
"epoch": 4.17,
"learning_rate": 3.2253333333333334e-07,
"loss": 2.0671,
"step": 512000
},
{
"epoch": 4.17,
"eval_loss": 1.9390867948532104,
"eval_runtime": 112.908,
"eval_samples_per_second": 914.683,
"eval_steps_per_second": 57.17,
"step": 512000
},
{
"epoch": 4.24,
"eval_loss": 1.9453818798065186,
"eval_runtime": 112.778,
"eval_samples_per_second": 915.737,
"eval_steps_per_second": 57.236,
"step": 520000
},
{
"epoch": 4.31,
"learning_rate": 3.198e-07,
"loss": 2.0552,
"step": 528000
},
{
"epoch": 4.31,
"eval_loss": 1.9501063823699951,
"eval_runtime": 112.534,
"eval_samples_per_second": 917.723,
"eval_steps_per_second": 57.36,
"step": 528000
},
{
"epoch": 4.37,
"eval_loss": 1.935518741607666,
"eval_runtime": 113.9789,
"eval_samples_per_second": 906.089,
"eval_steps_per_second": 56.633,
"step": 536000
},
{
"epoch": 4.44,
"learning_rate": 3.1706666666666665e-07,
"loss": 2.0626,
"step": 544000
},
{
"epoch": 4.44,
"eval_loss": 1.9239717721939087,
"eval_runtime": 112.9585,
"eval_samples_per_second": 914.273,
"eval_steps_per_second": 57.145,
"step": 544000
},
{
"epoch": 4.5,
"eval_loss": 1.9398826360702515,
"eval_runtime": 113.7219,
"eval_samples_per_second": 908.137,
"eval_steps_per_second": 56.761,
"step": 552000
},
{
"epoch": 4.57,
"learning_rate": 3.1433333333333336e-07,
"loss": 2.0592,
"step": 560000
},
{
"epoch": 4.57,
"eval_loss": 1.9360318183898926,
"eval_runtime": 113.1836,
"eval_samples_per_second": 912.456,
"eval_steps_per_second": 57.031,
"step": 560000
},
{
"epoch": 4.63,
"eval_loss": 1.9377766847610474,
"eval_runtime": 113.185,
"eval_samples_per_second": 912.444,
"eval_steps_per_second": 57.031,
"step": 568000
},
{
"epoch": 4.7,
"learning_rate": 3.116e-07,
"loss": 2.0584,
"step": 576000
},
{
"epoch": 4.7,
"eval_loss": 1.9293311834335327,
"eval_runtime": 113.6435,
"eval_samples_per_second": 908.763,
"eval_steps_per_second": 56.8,
"step": 576000
},
{
"epoch": 4.76,
"eval_loss": 1.943053126335144,
"eval_runtime": 113.0332,
"eval_samples_per_second": 913.67,
"eval_steps_per_second": 57.107,
"step": 584000
},
{
"epoch": 4.83,
"learning_rate": 3.0886666666666667e-07,
"loss": 2.0515,
"step": 592000
},
{
"epoch": 4.83,
"eval_loss": 1.9324830770492554,
"eval_runtime": 113.1852,
"eval_samples_per_second": 912.443,
"eval_steps_per_second": 57.03,
"step": 592000
},
{
"epoch": 4.89,
"eval_loss": 1.9265968799591064,
"eval_runtime": 113.5248,
"eval_samples_per_second": 909.713,
"eval_steps_per_second": 56.86,
"step": 600000
},
{
"epoch": 4.96,
"learning_rate": 3.061333333333333e-07,
"loss": 2.0545,
"step": 608000
},
{
"epoch": 4.96,
"eval_loss": 1.921515941619873,
"eval_runtime": 113.9086,
"eval_samples_per_second": 906.648,
"eval_steps_per_second": 56.668,
"step": 608000
},
{
"epoch": 5.02,
"eval_loss": 1.924493670463562,
"eval_runtime": 113.1157,
"eval_samples_per_second": 913.003,
"eval_steps_per_second": 57.065,
"step": 616000
},
{
"epoch": 5.09,
"learning_rate": 3.034e-07,
"loss": 2.0525,
"step": 624000
},
{
"epoch": 5.09,
"eval_loss": 1.9372978210449219,
"eval_runtime": 113.8901,
"eval_samples_per_second": 906.795,
"eval_steps_per_second": 56.677,
"step": 624000
},
{
"epoch": 5.15,
"eval_loss": 1.934131145477295,
"eval_runtime": 112.8536,
"eval_samples_per_second": 915.124,
"eval_steps_per_second": 57.198,
"step": 632000
},
{
"epoch": 5.22,
"learning_rate": 3.0066666666666663e-07,
"loss": 2.0556,
"step": 640000
},
{
"epoch": 5.22,
"eval_loss": 1.9312899112701416,
"eval_runtime": 113.0744,
"eval_samples_per_second": 913.336,
"eval_steps_per_second": 57.086,
"step": 640000
},
{
"epoch": 5.28,
"eval_loss": 1.922965407371521,
"eval_runtime": 113.8801,
"eval_samples_per_second": 906.875,
"eval_steps_per_second": 56.682,
"step": 648000
},
{
"epoch": 5.35,
"learning_rate": 2.9793333333333334e-07,
"loss": 2.0567,
"step": 656000
},
{
"epoch": 5.35,
"eval_loss": 1.930014729499817,
"eval_runtime": 112.8937,
"eval_samples_per_second": 914.799,
"eval_steps_per_second": 57.178,
"step": 656000
},
{
"epoch": 5.41,
"eval_loss": 1.9337064027786255,
"eval_runtime": 113.4421,
"eval_samples_per_second": 910.376,
"eval_steps_per_second": 56.901,
"step": 664000
},
{
"epoch": 5.48,
"learning_rate": 2.952e-07,
"loss": 2.0506,
"step": 672000
},
{
"epoch": 5.48,
"eval_loss": 1.9316705465316772,
"eval_runtime": 113.3757,
"eval_samples_per_second": 910.91,
"eval_steps_per_second": 56.935,
"step": 672000
},
{
"epoch": 5.54,
"eval_loss": 1.9275363683700562,
"eval_runtime": 113.4405,
"eval_samples_per_second": 910.389,
"eval_steps_per_second": 56.902,
"step": 680000
},
{
"epoch": 5.61,
"learning_rate": 2.9246666666666665e-07,
"loss": 2.0561,
"step": 688000
},
{
"epoch": 5.61,
"eval_loss": 1.9376088380813599,
"eval_runtime": 113.0322,
"eval_samples_per_second": 913.678,
"eval_steps_per_second": 57.108,
"step": 688000
},
{
"epoch": 5.68,
"eval_loss": 1.9461050033569336,
"eval_runtime": 113.738,
"eval_samples_per_second": 908.008,
"eval_steps_per_second": 56.753,
"step": 696000
},
{
"epoch": 5.74,
"learning_rate": 2.897333333333333e-07,
"loss": 2.0496,
"step": 704000
},
{
"epoch": 5.74,
"eval_loss": 1.9239321947097778,
"eval_runtime": 113.8527,
"eval_samples_per_second": 907.093,
"eval_steps_per_second": 56.696,
"step": 704000
},
{
"epoch": 5.81,
"eval_loss": 1.9250530004501343,
"eval_runtime": 113.6483,
"eval_samples_per_second": 908.725,
"eval_steps_per_second": 56.798,
"step": 712000
},
{
"epoch": 5.87,
"learning_rate": 2.8699999999999996e-07,
"loss": 2.045,
"step": 720000
},
{
"epoch": 5.87,
"eval_loss": 1.9309498071670532,
"eval_runtime": 114.1187,
"eval_samples_per_second": 904.979,
"eval_steps_per_second": 56.564,
"step": 720000
},
{
"epoch": 5.94,
"eval_loss": 1.925881266593933,
"eval_runtime": 113.5651,
"eval_samples_per_second": 909.391,
"eval_steps_per_second": 56.84,
"step": 728000
},
{
"epoch": 6.0,
"learning_rate": 2.8426666666666667e-07,
"loss": 2.0512,
"step": 736000
},
{
"epoch": 6.0,
"eval_loss": 1.9236810207366943,
"eval_runtime": 113.1127,
"eval_samples_per_second": 913.027,
"eval_steps_per_second": 57.067,
"step": 736000
},
{
"epoch": 6.07,
"eval_loss": 1.9148136377334595,
"eval_runtime": 113.2705,
"eval_samples_per_second": 911.756,
"eval_steps_per_second": 56.987,
"step": 744000
},
{
"epoch": 6.13,
"learning_rate": 2.815333333333333e-07,
"loss": 2.0512,
"step": 752000
},
{
"epoch": 6.13,
"eval_loss": 1.9219812154769897,
"eval_runtime": 114.1512,
"eval_samples_per_second": 904.721,
"eval_steps_per_second": 56.548,
"step": 752000
},
{
"epoch": 6.2,
"eval_loss": 1.9397040605545044,
"eval_runtime": 113.2005,
"eval_samples_per_second": 912.319,
"eval_steps_per_second": 57.023,
"step": 760000
},
{
"epoch": 6.26,
"learning_rate": 2.7880000000000003e-07,
"loss": 2.0445,
"step": 768000
},
{
"epoch": 6.26,
"eval_loss": 1.9240758419036865,
"eval_runtime": 113.1775,
"eval_samples_per_second": 912.505,
"eval_steps_per_second": 57.034,
"step": 768000
},
{
"epoch": 6.33,
"eval_loss": 1.9330027103424072,
"eval_runtime": 113.0566,
"eval_samples_per_second": 913.481,
"eval_steps_per_second": 57.095,
"step": 776000
},
{
"epoch": 6.39,
"learning_rate": 2.7606666666666664e-07,
"loss": 2.0481,
"step": 784000
},
{
"epoch": 6.39,
"eval_loss": 1.9123960733413696,
"eval_runtime": 113.268,
"eval_samples_per_second": 911.775,
"eval_steps_per_second": 56.989,
"step": 784000
},
{
"epoch": 6.46,
"eval_loss": 1.9267631769180298,
"eval_runtime": 113.7712,
"eval_samples_per_second": 907.743,
"eval_steps_per_second": 56.737,
"step": 792000
},
{
"epoch": 6.52,
"learning_rate": 2.733333333333333e-07,
"loss": 2.048,
"step": 800000
},
{
"epoch": 6.52,
"eval_loss": 1.921078085899353,
"eval_runtime": 113.8106,
"eval_samples_per_second": 907.429,
"eval_steps_per_second": 56.717,
"step": 800000
},
{
"epoch": 6.59,
"eval_loss": 1.9279391765594482,
"eval_runtime": 113.2864,
"eval_samples_per_second": 911.627,
"eval_steps_per_second": 56.979,
"step": 808000
},
{
"epoch": 6.65,
"learning_rate": 2.706e-07,
"loss": 2.0555,
"step": 816000
},
{
"epoch": 6.65,
"eval_loss": 1.9168628454208374,
"eval_runtime": 113.2491,
"eval_samples_per_second": 911.928,
"eval_steps_per_second": 56.998,
"step": 816000
},
{
"epoch": 6.72,
"eval_loss": 1.922944188117981,
"eval_runtime": 113.2414,
"eval_samples_per_second": 911.99,
"eval_steps_per_second": 57.002,
"step": 824000
},
{
"epoch": 6.78,
"learning_rate": 2.6786666666666666e-07,
"loss": 2.052,
"step": 832000
},
{
"epoch": 6.78,
"eval_loss": 1.9253454208374023,
"eval_runtime": 114.0468,
"eval_samples_per_second": 905.549,
"eval_steps_per_second": 56.6,
"step": 832000
},
{
"epoch": 6.85,
"eval_loss": 1.9244375228881836,
"eval_runtime": 113.2582,
"eval_samples_per_second": 911.855,
"eval_steps_per_second": 56.994,
"step": 840000
},
{
"epoch": 6.91,
"learning_rate": 2.651333333333333e-07,
"loss": 2.0475,
"step": 848000
},
{
"epoch": 6.91,
"eval_loss": 1.9191728830337524,
"eval_runtime": 113.2946,
"eval_samples_per_second": 911.561,
"eval_steps_per_second": 56.975,
"step": 848000
},
{
"epoch": 6.98,
"eval_loss": 1.9167262315750122,
"eval_runtime": 113.3788,
"eval_samples_per_second": 910.884,
"eval_steps_per_second": 56.933,
"step": 856000
},
{
"epoch": 7.05,
"learning_rate": 2.624e-07,
"loss": 2.0521,
"step": 864000
},
{
"epoch": 7.05,
"eval_loss": 1.9202110767364502,
"eval_runtime": 113.3134,
"eval_samples_per_second": 911.41,
"eval_steps_per_second": 56.966,
"step": 864000
},
{
"epoch": 7.11,
"eval_loss": 1.9240491390228271,
"eval_runtime": 113.6592,
"eval_samples_per_second": 908.638,
"eval_steps_per_second": 56.793,
"step": 872000
},
{
"epoch": 7.18,
"learning_rate": 2.596666666666667e-07,
"loss": 2.0516,
"step": 880000
},
{
"epoch": 7.18,
"eval_loss": 1.923065423965454,
"eval_runtime": 113.5487,
"eval_samples_per_second": 909.522,
"eval_steps_per_second": 56.848,
"step": 880000
},
{
"epoch": 7.24,
"eval_loss": 1.9245978593826294,
"eval_runtime": 114.3166,
"eval_samples_per_second": 903.412,
"eval_steps_per_second": 56.466,
"step": 888000
},
{
"epoch": 7.31,
"learning_rate": 2.5693333333333333e-07,
"loss": 2.0526,
"step": 896000
},
{
"epoch": 7.31,
"eval_loss": 1.9173697233200073,
"eval_runtime": 113.4859,
"eval_samples_per_second": 910.025,
"eval_steps_per_second": 56.879,
"step": 896000
},
{
"epoch": 7.37,
"eval_loss": 1.9256370067596436,
"eval_runtime": 114.6588,
"eval_samples_per_second": 900.716,
"eval_steps_per_second": 56.297,
"step": 904000
},
{
"epoch": 7.44,
"learning_rate": 2.542e-07,
"loss": 2.044,
"step": 912000
},
{
"epoch": 7.44,
"eval_loss": 1.9233709573745728,
"eval_runtime": 114.8311,
"eval_samples_per_second": 899.364,
"eval_steps_per_second": 56.213,
"step": 912000
},
{
"epoch": 7.5,
"eval_loss": 1.9208239316940308,
"eval_runtime": 114.4555,
"eval_samples_per_second": 902.316,
"eval_steps_per_second": 56.397,
"step": 920000
},
{
"epoch": 7.57,
"learning_rate": 2.5146666666666664e-07,
"loss": 2.0493,
"step": 928000
},
{
"epoch": 7.57,
"eval_loss": 1.9232600927352905,
"eval_runtime": 113.2901,
"eval_samples_per_second": 911.598,
"eval_steps_per_second": 56.978,
"step": 928000
},
{
"epoch": 7.63,
"eval_loss": 1.918021321296692,
"eval_runtime": 114.0382,
"eval_samples_per_second": 905.617,
"eval_steps_per_second": 56.604,
"step": 936000
},
{
"epoch": 7.7,
"learning_rate": 2.4873333333333335e-07,
"loss": 2.0535,
"step": 944000
},
{
"epoch": 7.7,
"eval_loss": 1.919961929321289,
"eval_runtime": 114.7577,
"eval_samples_per_second": 899.94,
"eval_steps_per_second": 56.249,
"step": 944000
},
{
"epoch": 7.76,
"eval_loss": 1.9151924848556519,
"eval_runtime": 113.434,
"eval_samples_per_second": 910.441,
"eval_steps_per_second": 56.905,
"step": 952000
},
{
"epoch": 7.83,
"learning_rate": 2.46e-07,
"loss": 2.0454,
"step": 960000
},
{
"epoch": 7.83,
"eval_loss": 1.926845669746399,
"eval_runtime": 114.0309,
"eval_samples_per_second": 905.676,
"eval_steps_per_second": 56.607,
"step": 960000
},
{
"epoch": 7.89,
"eval_loss": 1.9206236600875854,
"eval_runtime": 113.4283,
"eval_samples_per_second": 910.487,
"eval_steps_per_second": 56.908,
"step": 968000
},
{
"epoch": 7.96,
"learning_rate": 2.4326666666666666e-07,
"loss": 2.0428,
"step": 976000
},
{
"epoch": 7.96,
"eval_loss": 1.9169600009918213,
"eval_runtime": 113.0231,
"eval_samples_per_second": 913.751,
"eval_steps_per_second": 57.112,
"step": 976000
},
{
"epoch": 8.02,
"eval_loss": 1.923983097076416,
"eval_runtime": 114.0029,
"eval_samples_per_second": 905.898,
"eval_steps_per_second": 56.621,
"step": 984000
},
{
"epoch": 8.09,
"learning_rate": 2.405333333333333e-07,
"loss": 2.052,
"step": 992000
},
{
"epoch": 8.09,
"eval_loss": 1.9306118488311768,
"eval_runtime": 114.2567,
"eval_samples_per_second": 903.886,
"eval_steps_per_second": 56.496,
"step": 992000
},
{
"epoch": 8.15,
"eval_loss": 1.9191603660583496,
"eval_runtime": 114.2092,
"eval_samples_per_second": 904.262,
"eval_steps_per_second": 56.519,
"step": 1000000
},
{
"epoch": 8.22,
"learning_rate": 2.3779999999999997e-07,
"loss": 2.0472,
"step": 1008000
},
{
"epoch": 8.22,
"eval_loss": 1.9313241243362427,
"eval_runtime": 114.3737,
"eval_samples_per_second": 902.961,
"eval_steps_per_second": 56.438,
"step": 1008000
},
{
"epoch": 8.28,
"eval_loss": 1.9238113164901733,
"eval_runtime": 114.2747,
"eval_samples_per_second": 903.743,
"eval_steps_per_second": 56.487,
"step": 1016000
},
{
"epoch": 8.35,
"learning_rate": 2.3506666666666668e-07,
"loss": 2.0454,
"step": 1024000
},
{
"epoch": 8.35,
"eval_loss": 1.9162325859069824,
"eval_runtime": 114.6251,
"eval_samples_per_second": 900.98,
"eval_steps_per_second": 56.314,
"step": 1024000
},
{
"epoch": 8.41,
"eval_loss": 1.913014531135559,
"eval_runtime": 113.9073,
"eval_samples_per_second": 906.658,
"eval_steps_per_second": 56.669,
"step": 1032000
},
{
"epoch": 8.48,
"learning_rate": 2.3233333333333334e-07,
"loss": 2.0503,
"step": 1040000
},
{
"epoch": 8.48,
"eval_loss": 1.9260133504867554,
"eval_runtime": 114.9945,
"eval_samples_per_second": 898.086,
"eval_steps_per_second": 56.133,
"step": 1040000
},
{
"epoch": 8.55,
"eval_loss": 1.9212397336959839,
"eval_runtime": 113.1012,
"eval_samples_per_second": 913.12,
"eval_steps_per_second": 57.073,
"step": 1048000
},
{
"epoch": 8.61,
"learning_rate": 2.2960000000000002e-07,
"loss": 2.0511,
"step": 1056000
},
{
"epoch": 8.61,
"eval_loss": 1.9114716053009033,
"eval_runtime": 113.5853,
"eval_samples_per_second": 909.229,
"eval_steps_per_second": 56.83,
"step": 1056000
},
{
"epoch": 8.68,
"eval_loss": 1.9123215675354004,
"eval_runtime": 113.8372,
"eval_samples_per_second": 907.217,
"eval_steps_per_second": 56.704,
"step": 1064000
},
{
"epoch": 8.74,
"learning_rate": 2.2686666666666667e-07,
"loss": 2.049,
"step": 1072000
},
{
"epoch": 8.74,
"eval_loss": 1.9258580207824707,
"eval_runtime": 115.4682,
"eval_samples_per_second": 894.402,
"eval_steps_per_second": 55.903,
"step": 1072000
},
{
"epoch": 8.81,
"eval_loss": 1.932053804397583,
"eval_runtime": 113.8053,
"eval_samples_per_second": 907.471,
"eval_steps_per_second": 56.72,
"step": 1080000
},
{
"epoch": 8.87,
"learning_rate": 2.2413333333333333e-07,
"loss": 2.0463,
"step": 1088000
},
{
"epoch": 8.87,
"eval_loss": 1.9148298501968384,
"eval_runtime": 114.0323,
"eval_samples_per_second": 905.664,
"eval_steps_per_second": 56.607,
"step": 1088000
},
{
"epoch": 8.94,
"eval_loss": 1.9144847393035889,
"eval_runtime": 113.7624,
"eval_samples_per_second": 907.813,
"eval_steps_per_second": 56.741,
"step": 1096000
},
{
"epoch": 9.0,
"learning_rate": 2.214e-07,
"loss": 2.0494,
"step": 1104000
},
{
"epoch": 9.0,
"eval_loss": 1.9097198247909546,
"eval_runtime": 114.8448,
"eval_samples_per_second": 899.257,
"eval_steps_per_second": 56.206,
"step": 1104000
},
{
"epoch": 9.07,
"eval_loss": 1.9135308265686035,
"eval_runtime": 114.2552,
"eval_samples_per_second": 903.898,
"eval_steps_per_second": 56.496,
"step": 1112000
},
{
"epoch": 9.13,
"learning_rate": 2.1866666666666667e-07,
"loss": 2.0467,
"step": 1120000
},
{
"epoch": 9.13,
"eval_loss": 1.9163955450057983,
"eval_runtime": 114.3157,
"eval_samples_per_second": 903.419,
"eval_steps_per_second": 56.466,
"step": 1120000
},
{
"epoch": 9.2,
"eval_loss": 1.9224337339401245,
"eval_runtime": 113.5325,
"eval_samples_per_second": 909.652,
"eval_steps_per_second": 56.856,
"step": 1128000
},
{
"epoch": 9.26,
"learning_rate": 2.1593333333333332e-07,
"loss": 2.0483,
"step": 1136000
},
{
"epoch": 9.26,
"eval_loss": 1.9134596586227417,
"eval_runtime": 113.9566,
"eval_samples_per_second": 906.266,
"eval_steps_per_second": 56.644,
"step": 1136000
},
{
"epoch": 9.33,
"eval_loss": 1.919922947883606,
"eval_runtime": 114.7403,
"eval_samples_per_second": 900.076,
"eval_steps_per_second": 56.257,
"step": 1144000
},
{
"epoch": 9.39,
"learning_rate": 2.132e-07,
"loss": 2.0437,
"step": 1152000
},
{
"epoch": 9.39,
"eval_loss": 1.9213168621063232,
"eval_runtime": 113.9265,
"eval_samples_per_second": 906.505,
"eval_steps_per_second": 56.659,
"step": 1152000
},
{
"epoch": 9.46,
"eval_loss": 1.9161458015441895,
"eval_runtime": 114.3737,
"eval_samples_per_second": 902.961,
"eval_steps_per_second": 56.438,
"step": 1160000
},
{
"epoch": 9.52,
"learning_rate": 2.1046666666666666e-07,
"loss": 2.0526,
"step": 1168000
},
{
"epoch": 9.52,
"eval_loss": 1.9148198366165161,
"eval_runtime": 113.75,
"eval_samples_per_second": 907.912,
"eval_steps_per_second": 56.747,
"step": 1168000
},
{
"epoch": 9.59,
"eval_loss": 1.9182627201080322,
"eval_runtime": 114.4421,
"eval_samples_per_second": 902.421,
"eval_steps_per_second": 56.404,
"step": 1176000
},
{
"epoch": 9.65,
"learning_rate": 2.0773333333333334e-07,
"loss": 2.0408,
"step": 1184000
},
{
"epoch": 9.65,
"eval_loss": 1.9078502655029297,
"eval_runtime": 116.0253,
"eval_samples_per_second": 890.108,
"eval_steps_per_second": 55.634,
"step": 1184000
},
{
"epoch": 9.72,
"eval_loss": 1.918637752532959,
"eval_runtime": 116.0532,
"eval_samples_per_second": 889.893,
"eval_steps_per_second": 55.621,
"step": 1192000
},
{
"epoch": 9.78,
"learning_rate": 2.05e-07,
"loss": 2.0488,
"step": 1200000
},
{
"epoch": 9.78,
"eval_loss": 1.9140615463256836,
"eval_runtime": 114.9098,
"eval_samples_per_second": 898.748,
"eval_steps_per_second": 56.174,
"step": 1200000
},
{
"epoch": 9.85,
"eval_loss": 1.907893419265747,
"eval_runtime": 114.1171,
"eval_samples_per_second": 904.991,
"eval_steps_per_second": 56.565,
"step": 1208000
},
{
"epoch": 9.92,
"learning_rate": 2.0226666666666668e-07,
"loss": 2.0441,
"step": 1216000
},
{
"epoch": 9.92,
"eval_loss": 1.9250520467758179,
"eval_runtime": 113.8841,
"eval_samples_per_second": 906.843,
"eval_steps_per_second": 56.68,
"step": 1216000
},
{
"epoch": 9.98,
"eval_loss": 1.9254791736602783,
"eval_runtime": 114.3655,
"eval_samples_per_second": 903.026,
"eval_steps_per_second": 56.442,
"step": 1224000
},
{
"epoch": 10.05,
"learning_rate": 1.9953333333333333e-07,
"loss": 2.0483,
"step": 1232000
},
{
"epoch": 10.05,
"eval_loss": 1.9108103513717651,
"eval_runtime": 114.7681,
"eval_samples_per_second": 899.858,
"eval_steps_per_second": 56.244,
"step": 1232000
},
{
"epoch": 10.11,
"eval_loss": 1.904534935951233,
"eval_runtime": 115.2497,
"eval_samples_per_second": 896.097,
"eval_steps_per_second": 56.009,
"step": 1240000
},
{
"epoch": 10.18,
"learning_rate": 1.968e-07,
"loss": 2.0503,
"step": 1248000
},
{
"epoch": 10.18,
"eval_loss": 1.9169738292694092,
"eval_runtime": 115.5795,
"eval_samples_per_second": 893.541,
"eval_steps_per_second": 55.849,
"step": 1248000
},
{
"epoch": 10.24,
"eval_loss": 1.9024699926376343,
"eval_runtime": 115.6553,
"eval_samples_per_second": 892.955,
"eval_steps_per_second": 55.812,
"step": 1256000
},
{
"epoch": 10.31,
"learning_rate": 1.9406666666666667e-07,
"loss": 2.0334,
"step": 1264000
},
{
"epoch": 10.31,
"eval_loss": 1.9198503494262695,
"eval_runtime": 115.9185,
"eval_samples_per_second": 890.927,
"eval_steps_per_second": 55.686,
"step": 1264000
},
{
"epoch": 10.37,
"eval_loss": 1.9187484979629517,
"eval_runtime": 114.9362,
"eval_samples_per_second": 898.542,
"eval_steps_per_second": 56.162,
"step": 1272000
},
{
"epoch": 10.44,
"learning_rate": 1.9133333333333333e-07,
"loss": 2.0388,
"step": 1280000
},
{
"epoch": 10.44,
"eval_loss": 1.902976393699646,
"eval_runtime": 115.6842,
"eval_samples_per_second": 892.732,
"eval_steps_per_second": 55.798,
"step": 1280000
},
{
"epoch": 10.5,
"eval_loss": 1.9231475591659546,
"eval_runtime": 114.4315,
"eval_samples_per_second": 902.505,
"eval_steps_per_second": 56.409,
"step": 1288000
},
{
"epoch": 10.57,
"learning_rate": 1.886e-07,
"loss": 2.0489,
"step": 1296000
},
{
"epoch": 10.57,
"eval_loss": 1.9084066152572632,
"eval_runtime": 114.2511,
"eval_samples_per_second": 903.93,
"eval_steps_per_second": 56.498,
"step": 1296000
},
{
"epoch": 10.63,
"eval_loss": 1.9184343814849854,
"eval_runtime": 115.1565,
"eval_samples_per_second": 896.823,
"eval_steps_per_second": 56.054,
"step": 1304000
},
{
"epoch": 10.7,
"learning_rate": 1.8586666666666666e-07,
"loss": 2.0476,
"step": 1312000
},
{
"epoch": 10.7,
"eval_loss": 1.9159677028656006,
"eval_runtime": 114.4329,
"eval_samples_per_second": 902.494,
"eval_steps_per_second": 56.409,
"step": 1312000
},
{
"epoch": 10.76,
"eval_loss": 1.9276108741760254,
"eval_runtime": 114.2813,
"eval_samples_per_second": 903.691,
"eval_steps_per_second": 56.483,
"step": 1320000
},
{
"epoch": 10.83,
"learning_rate": 1.8313333333333332e-07,
"loss": 2.037,
"step": 1328000
},
{
"epoch": 10.83,
"eval_loss": 1.9041118621826172,
"eval_runtime": 114.8143,
"eval_samples_per_second": 899.496,
"eval_steps_per_second": 56.221,
"step": 1328000
},
{
"epoch": 10.89,
"eval_loss": 1.9227638244628906,
"eval_runtime": 115.0142,
"eval_samples_per_second": 897.933,
"eval_steps_per_second": 56.124,
"step": 1336000
},
{
"epoch": 10.96,
"learning_rate": 1.804e-07,
"loss": 2.0447,
"step": 1344000
},
{
"epoch": 10.96,
"eval_loss": 1.9151026010513306,
"eval_runtime": 115.3034,
"eval_samples_per_second": 895.68,
"eval_steps_per_second": 55.983,
"step": 1344000
},
{
"epoch": 11.02,
"eval_loss": 1.9068875312805176,
"eval_runtime": 114.8441,
"eval_samples_per_second": 899.263,
"eval_steps_per_second": 56.207,
"step": 1352000
},
{
"epoch": 11.09,
"learning_rate": 1.7766666666666666e-07,
"loss": 2.039,
"step": 1360000
},
{
"epoch": 11.09,
"eval_loss": 1.9274860620498657,
"eval_runtime": 116.0333,
"eval_samples_per_second": 890.046,
"eval_steps_per_second": 55.631,
"step": 1360000
},
{
"epoch": 11.15,
"eval_loss": 1.9066658020019531,
"eval_runtime": 115.792,
"eval_samples_per_second": 891.901,
"eval_steps_per_second": 55.747,
"step": 1368000
},
{
"epoch": 11.22,
"learning_rate": 1.7493333333333334e-07,
"loss": 2.0434,
"step": 1376000
},
{
"epoch": 11.22,
"eval_loss": 1.9086920022964478,
"eval_runtime": 115.0319,
"eval_samples_per_second": 897.795,
"eval_steps_per_second": 56.115,
"step": 1376000
},
{
"epoch": 11.29,
"eval_loss": 1.9041084051132202,
"eval_runtime": 115.5247,
"eval_samples_per_second": 893.965,
"eval_steps_per_second": 55.876,
"step": 1384000
},
{
"epoch": 11.35,
"learning_rate": 1.722e-07,
"loss": 2.0501,
"step": 1392000
},
{
"epoch": 11.35,
"eval_loss": 1.9032894372940063,
"eval_runtime": 115.2083,
"eval_samples_per_second": 896.42,
"eval_steps_per_second": 56.029,
"step": 1392000
},
{
"epoch": 11.42,
"eval_loss": 1.9152663946151733,
"eval_runtime": 115.0179,
"eval_samples_per_second": 897.903,
"eval_steps_per_second": 56.122,
"step": 1400000
},
{
"epoch": 11.48,
"learning_rate": 1.6946666666666668e-07,
"loss": 2.0455,
"step": 1408000
},
{
"epoch": 11.48,
"eval_loss": 1.9173645973205566,
"eval_runtime": 115.462,
"eval_samples_per_second": 894.45,
"eval_steps_per_second": 55.906,
"step": 1408000
},
{
"epoch": 11.55,
"eval_loss": 1.9174134731292725,
"eval_runtime": 114.825,
"eval_samples_per_second": 899.412,
"eval_steps_per_second": 56.216,
"step": 1416000
},
{
"epoch": 11.61,
"learning_rate": 1.6673333333333333e-07,
"loss": 2.0466,
"step": 1424000
},
{
"epoch": 11.61,
"eval_loss": 1.9260660409927368,
"eval_runtime": 114.5404,
"eval_samples_per_second": 901.647,
"eval_steps_per_second": 56.356,
"step": 1424000
},
{
"epoch": 11.68,
"eval_loss": 1.9181084632873535,
"eval_runtime": 115.0034,
"eval_samples_per_second": 898.017,
"eval_steps_per_second": 56.129,
"step": 1432000
},
{
"epoch": 11.74,
"learning_rate": 1.64e-07,
"loss": 2.0424,
"step": 1440000
},
{
"epoch": 11.74,
"eval_loss": 1.9141377210617065,
"eval_runtime": 114.2837,
"eval_samples_per_second": 903.672,
"eval_steps_per_second": 56.482,
"step": 1440000
},
{
"epoch": 11.81,
"eval_loss": 1.9004480838775635,
"eval_runtime": 114.3666,
"eval_samples_per_second": 903.017,
"eval_steps_per_second": 56.441,
"step": 1448000
},
{
"epoch": 11.87,
"learning_rate": 1.6126666666666667e-07,
"loss": 2.0441,
"step": 1456000
},
{
"epoch": 11.87,
"eval_loss": 1.919699788093567,
"eval_runtime": 115.2012,
"eval_samples_per_second": 896.475,
"eval_steps_per_second": 56.032,
"step": 1456000
},
{
"epoch": 11.94,
"eval_loss": 1.9074804782867432,
"eval_runtime": 114.4122,
"eval_samples_per_second": 902.658,
"eval_steps_per_second": 56.419,
"step": 1464000
},
{
"epoch": 12.0,
"learning_rate": 1.5853333333333332e-07,
"loss": 2.04,
"step": 1472000
},
{
"epoch": 12.0,
"eval_loss": 1.9121414422988892,
"eval_runtime": 114.3242,
"eval_samples_per_second": 903.352,
"eval_steps_per_second": 56.462,
"step": 1472000
},
{
"epoch": 12.07,
"eval_loss": 1.9210638999938965,
"eval_runtime": 114.213,
"eval_samples_per_second": 904.231,
"eval_steps_per_second": 56.517,
"step": 1480000
},
{
"epoch": 12.13,
"learning_rate": 1.558e-07,
"loss": 2.0375,
"step": 1488000
},
{
"epoch": 12.13,
"eval_loss": 1.9110891819000244,
"eval_runtime": 114.5716,
"eval_samples_per_second": 901.401,
"eval_steps_per_second": 56.34,
"step": 1488000
},
{
"epoch": 12.2,
"eval_loss": 1.9187558889389038,
"eval_runtime": 114.4912,
"eval_samples_per_second": 902.034,
"eval_steps_per_second": 56.38,
"step": 1496000
},
{
"epoch": 12.26,
"learning_rate": 1.5306666666666666e-07,
"loss": 2.0482,
"step": 1504000
},
{
"epoch": 12.26,
"eval_loss": 1.9099169969558716,
"eval_runtime": 114.3549,
"eval_samples_per_second": 903.109,
"eval_steps_per_second": 56.447,
"step": 1504000
},
{
"epoch": 12.33,
"eval_loss": 1.9160943031311035,
"eval_runtime": 116.1161,
"eval_samples_per_second": 889.412,
"eval_steps_per_second": 55.591,
"step": 1512000
},
{
"epoch": 12.39,
"learning_rate": 1.5033333333333332e-07,
"loss": 2.0432,
"step": 1520000
},
{
"epoch": 12.39,
"eval_loss": 1.9197900295257568,
"eval_runtime": 114.3248,
"eval_samples_per_second": 903.347,
"eval_steps_per_second": 56.462,
"step": 1520000
},
{
"epoch": 12.46,
"eval_loss": 1.9154330492019653,
"eval_runtime": 114.7975,
"eval_samples_per_second": 899.627,
"eval_steps_per_second": 56.229,
"step": 1528000
},
{
"epoch": 12.52,
"learning_rate": 1.476e-07,
"loss": 2.0514,
"step": 1536000
},
{
"epoch": 12.52,
"eval_loss": 1.9058637619018555,
"eval_runtime": 114.484,
"eval_samples_per_second": 902.091,
"eval_steps_per_second": 56.383,
"step": 1536000
},
{
"epoch": 12.59,
"eval_loss": 1.920427680015564,
"eval_runtime": 114.3098,
"eval_samples_per_second": 903.466,
"eval_steps_per_second": 56.469,
"step": 1544000
},
{
"epoch": 12.65,
"learning_rate": 1.4486666666666665e-07,
"loss": 2.0397,
"step": 1552000
},
{
"epoch": 12.65,
"eval_loss": 1.9054511785507202,
"eval_runtime": 114.3602,
"eval_samples_per_second": 903.068,
"eval_steps_per_second": 56.444,
"step": 1552000
},
{
"epoch": 12.72,
"eval_loss": 1.896202802658081,
"eval_runtime": 115.3412,
"eval_samples_per_second": 895.387,
"eval_steps_per_second": 55.964,
"step": 1560000
},
{
"epoch": 12.79,
"learning_rate": 1.4213333333333334e-07,
"loss": 2.0454,
"step": 1568000
},
{
"epoch": 12.79,
"eval_loss": 1.9040275812149048,
"eval_runtime": 114.7741,
"eval_samples_per_second": 899.811,
"eval_steps_per_second": 56.241,
"step": 1568000
},
{
"epoch": 12.85,
"eval_loss": 1.916807770729065,
"eval_runtime": 114.6956,
"eval_samples_per_second": 900.427,
"eval_steps_per_second": 56.279,
"step": 1576000
},
{
"epoch": 12.92,
"learning_rate": 1.3940000000000002e-07,
"loss": 2.0391,
"step": 1584000
},
{
"epoch": 12.92,
"eval_loss": 1.9037362337112427,
"eval_runtime": 114.764,
"eval_samples_per_second": 899.89,
"eval_steps_per_second": 56.246,
"step": 1584000
},
{
"epoch": 12.98,
"eval_loss": 1.9186286926269531,
"eval_runtime": 114.5005,
"eval_samples_per_second": 901.961,
"eval_steps_per_second": 56.375,
"step": 1592000
},
{
"epoch": 13.05,
"learning_rate": 1.3666666666666665e-07,
"loss": 2.0414,
"step": 1600000
},
{
"epoch": 13.05,
"eval_loss": 1.9122203588485718,
"eval_runtime": 114.4898,
"eval_samples_per_second": 902.045,
"eval_steps_per_second": 56.381,
"step": 1600000
},
{
"epoch": 13.11,
"eval_loss": 1.9115867614746094,
"eval_runtime": 115.3456,
"eval_samples_per_second": 895.352,
"eval_steps_per_second": 55.962,
"step": 1608000
},
{
"epoch": 13.18,
"learning_rate": 1.3393333333333333e-07,
"loss": 2.0431,
"step": 1616000
},
{
"epoch": 13.18,
"eval_loss": 1.9056520462036133,
"eval_runtime": 114.6382,
"eval_samples_per_second": 900.878,
"eval_steps_per_second": 56.308,
"step": 1616000
},
{
"epoch": 13.24,
"eval_loss": 1.9115238189697266,
"eval_runtime": 114.36,
"eval_samples_per_second": 903.07,
"eval_steps_per_second": 56.445,
"step": 1624000
},
{
"epoch": 13.31,
"learning_rate": 1.312e-07,
"loss": 2.0368,
"step": 1632000
},
{
"epoch": 13.31,
"eval_loss": 1.911974549293518,
"eval_runtime": 114.4999,
"eval_samples_per_second": 901.966,
"eval_steps_per_second": 56.376,
"step": 1632000
},
{
"epoch": 13.37,
"eval_loss": 1.9113932847976685,
"eval_runtime": 115.0038,
"eval_samples_per_second": 898.014,
"eval_steps_per_second": 56.129,
"step": 1640000
},
{
"epoch": 13.44,
"learning_rate": 1.2846666666666667e-07,
"loss": 2.0427,
"step": 1648000
},
{
"epoch": 13.44,
"eval_loss": 1.9128488302230835,
"eval_runtime": 115.5184,
"eval_samples_per_second": 894.013,
"eval_steps_per_second": 55.879,
"step": 1648000
},
{
"epoch": 13.5,
"eval_loss": 1.9200862646102905,
"eval_runtime": 115.9784,
"eval_samples_per_second": 890.467,
"eval_steps_per_second": 55.657,
"step": 1656000
},
{
"epoch": 13.57,
"learning_rate": 1.2573333333333332e-07,
"loss": 2.0366,
"step": 1664000
},
{
"epoch": 13.57,
"eval_loss": 1.9053164720535278,
"eval_runtime": 115.3446,
"eval_samples_per_second": 895.361,
"eval_steps_per_second": 55.963,
"step": 1664000
},
{
"epoch": 13.63,
"eval_loss": 1.9077204465866089,
"eval_runtime": 114.7784,
"eval_samples_per_second": 899.777,
"eval_steps_per_second": 56.239,
"step": 1672000
},
{
"epoch": 13.7,
"learning_rate": 1.23e-07,
"loss": 2.0423,
"step": 1680000
},
{
"epoch": 13.7,
"eval_loss": 1.9155118465423584,
"eval_runtime": 114.5734,
"eval_samples_per_second": 901.387,
"eval_steps_per_second": 56.339,
"step": 1680000
},
{
"epoch": 13.76,
"eval_loss": 1.9025253057479858,
"eval_runtime": 115.4889,
"eval_samples_per_second": 894.242,
"eval_steps_per_second": 55.893,
"step": 1688000
},
{
"epoch": 13.83,
"learning_rate": 1.2026666666666666e-07,
"loss": 2.0345,
"step": 1696000
},
{
"epoch": 13.83,
"eval_loss": 1.911736011505127,
"eval_runtime": 115.8028,
"eval_samples_per_second": 891.818,
"eval_steps_per_second": 55.741,
"step": 1696000
},
{
"epoch": 13.89,
"eval_loss": 1.9146357774734497,
"eval_runtime": 115.518,
"eval_samples_per_second": 894.017,
"eval_steps_per_second": 55.879,
"step": 1704000
},
{
"epoch": 13.96,
"learning_rate": 1.1753333333333334e-07,
"loss": 2.0523,
"step": 1712000
},
{
"epoch": 13.96,
"eval_loss": 1.9094045162200928,
"eval_runtime": 115.0577,
"eval_samples_per_second": 897.593,
"eval_steps_per_second": 56.102,
"step": 1712000
},
{
"epoch": 14.02,
"eval_loss": 1.9028066396713257,
"eval_runtime": 115.3284,
"eval_samples_per_second": 895.486,
"eval_steps_per_second": 55.971,
"step": 1720000
},
{
"epoch": 14.09,
"learning_rate": 1.1480000000000001e-07,
"loss": 2.0405,
"step": 1728000
},
{
"epoch": 14.09,
"eval_loss": 1.9033746719360352,
"eval_runtime": 116.3993,
"eval_samples_per_second": 887.247,
"eval_steps_per_second": 55.456,
"step": 1728000
},
{
"epoch": 14.16,
"eval_loss": 1.903308629989624,
"eval_runtime": 115.3224,
"eval_samples_per_second": 895.533,
"eval_steps_per_second": 55.973,
"step": 1736000
},
{
"epoch": 14.22,
"learning_rate": 1.1206666666666666e-07,
"loss": 2.0416,
"step": 1744000
},
{
"epoch": 14.22,
"eval_loss": 1.8958499431610107,
"eval_runtime": 115.3629,
"eval_samples_per_second": 895.218,
"eval_steps_per_second": 55.954,
"step": 1744000
},
{
"epoch": 14.29,
"eval_loss": 1.9071624279022217,
"eval_runtime": 114.3411,
"eval_samples_per_second": 903.218,
"eval_steps_per_second": 56.454,
"step": 1752000
},
{
"epoch": 14.35,
"learning_rate": 1.0933333333333333e-07,
"loss": 2.0453,
"step": 1760000
},
{
"epoch": 14.35,
"eval_loss": 1.90669846534729,
"eval_runtime": 114.9673,
"eval_samples_per_second": 898.299,
"eval_steps_per_second": 56.146,
"step": 1760000
},
{
"epoch": 14.42,
"eval_loss": 1.9112778902053833,
"eval_runtime": 115.0041,
"eval_samples_per_second": 898.012,
"eval_steps_per_second": 56.128,
"step": 1768000
},
{
"epoch": 14.48,
"learning_rate": 1.066e-07,
"loss": 2.0425,
"step": 1776000
},
{
"epoch": 14.48,
"eval_loss": 1.9103703498840332,
"eval_runtime": 115.7959,
"eval_samples_per_second": 891.871,
"eval_steps_per_second": 55.745,
"step": 1776000
},
{
"epoch": 14.55,
"eval_loss": 1.9110212326049805,
"eval_runtime": 115.8835,
"eval_samples_per_second": 891.197,
"eval_steps_per_second": 55.702,
"step": 1784000
},
{
"epoch": 14.61,
"learning_rate": 1.0386666666666667e-07,
"loss": 2.0404,
"step": 1792000
},
{
"epoch": 14.61,
"eval_loss": 1.9037020206451416,
"eval_runtime": 115.7942,
"eval_samples_per_second": 891.884,
"eval_steps_per_second": 55.745,
"step": 1792000
},
{
"epoch": 14.68,
"eval_loss": 1.9003052711486816,
"eval_runtime": 115.8783,
"eval_samples_per_second": 891.236,
"eval_steps_per_second": 55.705,
"step": 1800000
},
{
"epoch": 14.74,
"learning_rate": 1.0113333333333334e-07,
"loss": 2.0427,
"step": 1808000
},
{
"epoch": 14.74,
"eval_loss": 1.911608099937439,
"eval_runtime": 116.1597,
"eval_samples_per_second": 889.078,
"eval_steps_per_second": 55.57,
"step": 1808000
},
{
"epoch": 14.81,
"eval_loss": 1.9105613231658936,
"eval_runtime": 116.0079,
"eval_samples_per_second": 890.241,
"eval_steps_per_second": 55.643,
"step": 1816000
},
{
"epoch": 14.87,
"learning_rate": 9.84e-08,
"loss": 2.0368,
"step": 1824000
},
{
"epoch": 14.87,
"eval_loss": 1.9095083475112915,
"eval_runtime": 116.8018,
"eval_samples_per_second": 884.19,
"eval_steps_per_second": 55.265,
"step": 1824000
},
{
"epoch": 14.94,
"eval_loss": 1.8979859352111816,
"eval_runtime": 114.5417,
"eval_samples_per_second": 901.637,
"eval_steps_per_second": 56.355,
"step": 1832000
},
{
"epoch": 15.0,
"learning_rate": 9.566666666666666e-08,
"loss": 2.0441,
"step": 1840000
},
{
"epoch": 15.0,
"eval_loss": 1.918567419052124,
"eval_runtime": 114.9208,
"eval_samples_per_second": 898.662,
"eval_steps_per_second": 56.169,
"step": 1840000
},
{
"epoch": 15.07,
"eval_loss": 1.903983473777771,
"eval_runtime": 114.4082,
"eval_samples_per_second": 902.689,
"eval_steps_per_second": 56.421,
"step": 1848000
},
{
"epoch": 15.13,
"learning_rate": 9.293333333333333e-08,
"loss": 2.0313,
"step": 1856000
},
{
"epoch": 15.13,
"eval_loss": 1.9185600280761719,
"eval_runtime": 115.728,
"eval_samples_per_second": 892.394,
"eval_steps_per_second": 55.777,
"step": 1856000
},
{
"epoch": 15.2,
"eval_loss": 1.9016015529632568,
"eval_runtime": 114.5834,
"eval_samples_per_second": 901.308,
"eval_steps_per_second": 56.335,
"step": 1864000
},
{
"epoch": 15.26,
"learning_rate": 9.02e-08,
"loss": 2.0488,
"step": 1872000
},
{
"epoch": 15.26,
"eval_loss": 1.9047600030899048,
"eval_runtime": 115.0099,
"eval_samples_per_second": 897.966,
"eval_steps_per_second": 56.126,
"step": 1872000
},
{
"epoch": 15.33,
"eval_loss": 1.899457335472107,
"eval_runtime": 115.8151,
"eval_samples_per_second": 891.723,
"eval_steps_per_second": 55.735,
"step": 1880000
},
{
"epoch": 15.39,
"learning_rate": 8.746666666666667e-08,
"loss": 2.0361,
"step": 1888000
},
{
"epoch": 15.39,
"eval_loss": 1.9119617938995361,
"eval_runtime": 116.351,
"eval_samples_per_second": 887.616,
"eval_steps_per_second": 55.479,
"step": 1888000
},
{
"epoch": 15.46,
"eval_loss": 1.907942295074463,
"eval_runtime": 116.3149,
"eval_samples_per_second": 887.892,
"eval_steps_per_second": 55.496,
"step": 1896000
},
{
"epoch": 15.53,
"learning_rate": 8.473333333333334e-08,
"loss": 2.0449,
"step": 1904000
},
{
"epoch": 15.53,
"eval_loss": 1.9109671115875244,
"eval_runtime": 114.9775,
"eval_samples_per_second": 898.219,
"eval_steps_per_second": 56.141,
"step": 1904000
},
{
"epoch": 15.59,
"eval_loss": 1.909091591835022,
"eval_runtime": 115.2151,
"eval_samples_per_second": 896.367,
"eval_steps_per_second": 56.026,
"step": 1912000
},
{
"epoch": 15.66,
"learning_rate": 8.2e-08,
"loss": 2.043,
"step": 1920000
},
{
"epoch": 15.66,
"eval_loss": 1.9061814546585083,
"eval_runtime": 115.9012,
"eval_samples_per_second": 891.06,
"eval_steps_per_second": 55.694,
"step": 1920000
},
{
"epoch": 15.72,
"eval_loss": 1.9070407152175903,
"eval_runtime": 115.1701,
"eval_samples_per_second": 896.717,
"eval_steps_per_second": 56.048,
"step": 1928000
},
{
"epoch": 15.79,
"learning_rate": 7.926666666666666e-08,
"loss": 2.0414,
"step": 1936000
},
{
"epoch": 15.79,
"eval_loss": 1.913381576538086,
"eval_runtime": 115.5442,
"eval_samples_per_second": 893.814,
"eval_steps_per_second": 55.866,
"step": 1936000
},
{
"epoch": 15.85,
"eval_loss": 1.9079296588897705,
"eval_runtime": 115.3858,
"eval_samples_per_second": 895.041,
"eval_steps_per_second": 55.943,
"step": 1944000
},
{
"epoch": 15.92,
"learning_rate": 7.653333333333333e-08,
"loss": 2.0419,
"step": 1952000
},
{
"epoch": 15.92,
"eval_loss": 1.9060734510421753,
"eval_runtime": 115.6219,
"eval_samples_per_second": 893.213,
"eval_steps_per_second": 55.829,
"step": 1952000
},
{
"epoch": 15.98,
"eval_loss": 1.9058138132095337,
"eval_runtime": 115.3288,
"eval_samples_per_second": 895.483,
"eval_steps_per_second": 55.97,
"step": 1960000
},
{
"epoch": 16.05,
"learning_rate": 7.38e-08,
"loss": 2.0384,
"step": 1968000
},
{
"epoch": 16.05,
"eval_loss": 1.9113844633102417,
"eval_runtime": 115.36,
"eval_samples_per_second": 895.241,
"eval_steps_per_second": 55.955,
"step": 1968000
},
{
"epoch": 16.11,
"eval_loss": 1.904008150100708,
"eval_runtime": 115.3314,
"eval_samples_per_second": 895.463,
"eval_steps_per_second": 55.969,
"step": 1976000
},
{
"epoch": 16.18,
"learning_rate": 7.106666666666667e-08,
"loss": 2.0391,
"step": 1984000
},
{
"epoch": 16.18,
"eval_loss": 1.9094995260238647,
"eval_runtime": 116.0038,
"eval_samples_per_second": 890.273,
"eval_steps_per_second": 55.645,
"step": 1984000
},
{
"epoch": 16.24,
"eval_loss": 1.918182134628296,
"eval_runtime": 117.9985,
"eval_samples_per_second": 875.223,
"eval_steps_per_second": 54.704,
"step": 1992000
},
{
"epoch": 16.31,
"learning_rate": 6.833333333333332e-08,
"loss": 2.0405,
"step": 2000000
},
{
"epoch": 16.31,
"eval_loss": 1.9111247062683105,
"eval_runtime": 115.7318,
"eval_samples_per_second": 892.365,
"eval_steps_per_second": 55.776,
"step": 2000000
},
{
"epoch": 16.37,
"eval_loss": 1.9056226015090942,
"eval_runtime": 115.3321,
"eval_samples_per_second": 895.458,
"eval_steps_per_second": 55.969,
"step": 2008000
},
{
"epoch": 16.44,
"learning_rate": 6.56e-08,
"loss": 2.0404,
"step": 2016000
},
{
"epoch": 16.44,
"eval_loss": 1.9133949279785156,
"eval_runtime": 115.6177,
"eval_samples_per_second": 893.245,
"eval_steps_per_second": 55.831,
"step": 2016000
},
{
"epoch": 16.5,
"eval_loss": 1.9069831371307373,
"eval_runtime": 116.2332,
"eval_samples_per_second": 888.516,
"eval_steps_per_second": 55.535,
"step": 2024000
},
{
"epoch": 16.57,
"learning_rate": 6.286666666666666e-08,
"loss": 2.0414,
"step": 2032000
},
{
"epoch": 16.57,
"eval_loss": 1.9084620475769043,
"eval_runtime": 115.922,
"eval_samples_per_second": 890.901,
"eval_steps_per_second": 55.684,
"step": 2032000
},
{
"epoch": 16.63,
"eval_loss": 1.9063148498535156,
"eval_runtime": 116.5212,
"eval_samples_per_second": 886.319,
"eval_steps_per_second": 55.398,
"step": 2040000
},
{
"epoch": 16.7,
"learning_rate": 6.013333333333333e-08,
"loss": 2.0483,
"step": 2048000
},
{
"epoch": 16.7,
"eval_loss": 1.9186962842941284,
"eval_runtime": 116.2964,
"eval_samples_per_second": 888.032,
"eval_steps_per_second": 55.505,
"step": 2048000
},
{
"epoch": 16.76,
"eval_loss": 1.9105137586593628,
"eval_runtime": 115.5049,
"eval_samples_per_second": 894.118,
"eval_steps_per_second": 55.885,
"step": 2056000
},
{
"epoch": 16.83,
"learning_rate": 5.7400000000000004e-08,
"loss": 2.0452,
"step": 2064000
},
{
"epoch": 16.83,
"eval_loss": 1.9117952585220337,
"eval_runtime": 116.0281,
"eval_samples_per_second": 890.086,
"eval_steps_per_second": 55.633,
"step": 2064000
},
{
"epoch": 16.89,
"eval_loss": 1.9091888666152954,
"eval_runtime": 117.9897,
"eval_samples_per_second": 875.288,
"eval_steps_per_second": 54.708,
"step": 2072000
},
{
"epoch": 16.96,
"learning_rate": 5.4666666666666666e-08,
"loss": 2.0401,
"step": 2080000
},
{
"epoch": 16.96,
"eval_loss": 1.9113515615463257,
"eval_runtime": 116.2625,
"eval_samples_per_second": 888.291,
"eval_steps_per_second": 55.521,
"step": 2080000
},
{
"epoch": 17.03,
"eval_loss": 1.9098221063613892,
"eval_runtime": 115.7155,
"eval_samples_per_second": 892.491,
"eval_steps_per_second": 55.783,
"step": 2088000
},
{
"epoch": 17.09,
"learning_rate": 5.1933333333333335e-08,
"loss": 2.0353,
"step": 2096000
},
{
"epoch": 17.09,
"eval_loss": 1.9069087505340576,
"eval_runtime": 116.5348,
"eval_samples_per_second": 886.216,
"eval_steps_per_second": 55.391,
"step": 2096000
},
{
"epoch": 17.16,
"eval_loss": 1.9027125835418701,
"eval_runtime": 115.7058,
"eval_samples_per_second": 892.566,
"eval_steps_per_second": 55.788,
"step": 2104000
},
{
"epoch": 17.22,
"learning_rate": 4.92e-08,
"loss": 2.0468,
"step": 2112000
},
{
"epoch": 17.22,
"eval_loss": 1.910232424736023,
"eval_runtime": 115.6722,
"eval_samples_per_second": 892.825,
"eval_steps_per_second": 55.804,
"step": 2112000
},
{
"epoch": 17.29,
"eval_loss": 1.9046436548233032,
"eval_runtime": 116.4733,
"eval_samples_per_second": 886.684,
"eval_steps_per_second": 55.42,
"step": 2120000
},
{
"epoch": 17.35,
"learning_rate": 4.6466666666666666e-08,
"loss": 2.0448,
"step": 2128000
},
{
"epoch": 17.35,
"eval_loss": 1.9024384021759033,
"eval_runtime": 115.5381,
"eval_samples_per_second": 893.861,
"eval_steps_per_second": 55.869,
"step": 2128000
},
{
"epoch": 17.42,
"eval_loss": 1.910799264907837,
"eval_runtime": 116.2371,
"eval_samples_per_second": 888.486,
"eval_steps_per_second": 55.533,
"step": 2136000
},
{
"epoch": 17.48,
"learning_rate": 4.3733333333333335e-08,
"loss": 2.0435,
"step": 2144000
},
{
"epoch": 17.48,
"eval_loss": 1.9122228622436523,
"eval_runtime": 116.6097,
"eval_samples_per_second": 885.647,
"eval_steps_per_second": 55.356,
"step": 2144000
},
{
"epoch": 17.55,
"eval_loss": 1.9043642282485962,
"eval_runtime": 115.6604,
"eval_samples_per_second": 892.916,
"eval_steps_per_second": 55.81,
"step": 2152000
},
{
"epoch": 17.61,
"learning_rate": 4.1e-08,
"loss": 2.0421,
"step": 2160000
},
{
"epoch": 17.61,
"eval_loss": 1.9069358110427856,
"eval_runtime": 116.7738,
"eval_samples_per_second": 884.402,
"eval_steps_per_second": 55.278,
"step": 2160000
},
{
"epoch": 17.68,
"eval_loss": 1.9019508361816406,
"eval_runtime": 116.2558,
"eval_samples_per_second": 888.342,
"eval_steps_per_second": 55.524,
"step": 2168000
},
{
"epoch": 17.74,
"learning_rate": 3.8266666666666665e-08,
"loss": 2.0366,
"step": 2176000
},
{
"epoch": 17.74,
"eval_loss": 1.9152798652648926,
"eval_runtime": 116.0194,
"eval_samples_per_second": 890.153,
"eval_steps_per_second": 55.637,
"step": 2176000
},
{
"epoch": 17.81,
"eval_loss": 1.9072139263153076,
"eval_runtime": 115.6971,
"eval_samples_per_second": 892.633,
"eval_steps_per_second": 55.792,
"step": 2184000
},
{
"epoch": 17.87,
"learning_rate": 3.5533333333333334e-08,
"loss": 2.034,
"step": 2192000
},
{
"epoch": 17.87,
"eval_loss": 1.9181559085845947,
"eval_runtime": 116.0059,
"eval_samples_per_second": 890.256,
"eval_steps_per_second": 55.644,
"step": 2192000
},
{
"epoch": 17.94,
"eval_loss": 1.9085872173309326,
"eval_runtime": 116.4771,
"eval_samples_per_second": 886.655,
"eval_steps_per_second": 55.419,
"step": 2200000
},
{
"epoch": 18.0,
"learning_rate": 3.28e-08,
"loss": 2.0397,
"step": 2208000
},
{
"epoch": 18.0,
"eval_loss": 1.9070638418197632,
"eval_runtime": 116.2437,
"eval_samples_per_second": 888.435,
"eval_steps_per_second": 55.53,
"step": 2208000
},
{
"epoch": 18.07,
"eval_loss": 1.9146629571914673,
"eval_runtime": 117.2397,
"eval_samples_per_second": 880.887,
"eval_steps_per_second": 55.058,
"step": 2216000
},
{
"epoch": 18.13,
"learning_rate": 3.0066666666666665e-08,
"loss": 2.0374,
"step": 2224000
},
{
"epoch": 18.13,
"eval_loss": 1.9199682474136353,
"eval_runtime": 116.9772,
"eval_samples_per_second": 882.865,
"eval_steps_per_second": 55.182,
"step": 2224000
},
{
"epoch": 18.2,
"eval_loss": 1.917845368385315,
"eval_runtime": 116.926,
"eval_samples_per_second": 883.251,
"eval_steps_per_second": 55.206,
"step": 2232000
},
{
"epoch": 18.26,
"learning_rate": 2.7333333333333333e-08,
"loss": 2.0413,
"step": 2240000
},
{
"epoch": 18.26,
"eval_loss": 1.9089611768722534,
"eval_runtime": 116.5053,
"eval_samples_per_second": 886.44,
"eval_steps_per_second": 55.405,
"step": 2240000
},
{
"epoch": 18.33,
"eval_loss": 1.9036976099014282,
"eval_runtime": 115.8055,
"eval_samples_per_second": 891.797,
"eval_steps_per_second": 55.74,
"step": 2248000
},
{
"epoch": 18.4,
"learning_rate": 2.46e-08,
"loss": 2.047,
"step": 2256000
},
{
"epoch": 18.4,
"eval_loss": 1.9126322269439697,
"eval_runtime": 116.2519,
"eval_samples_per_second": 888.372,
"eval_steps_per_second": 55.526,
"step": 2256000
},
{
"epoch": 18.46,
"eval_loss": 1.9117310047149658,
"eval_runtime": 116.138,
"eval_samples_per_second": 889.244,
"eval_steps_per_second": 55.58,
"step": 2264000
},
{
"epoch": 18.53,
"learning_rate": 2.1866666666666667e-08,
"loss": 2.0395,
"step": 2272000
},
{
"epoch": 18.53,
"eval_loss": 1.911027431488037,
"eval_runtime": 116.0632,
"eval_samples_per_second": 889.817,
"eval_steps_per_second": 55.616,
"step": 2272000
},
{
"epoch": 18.59,
"eval_loss": 1.9157801866531372,
"eval_runtime": 116.0777,
"eval_samples_per_second": 889.706,
"eval_steps_per_second": 55.609,
"step": 2280000
},
{
"epoch": 18.66,
"learning_rate": 1.9133333333333333e-08,
"loss": 2.0447,
"step": 2288000
},
{
"epoch": 18.66,
"eval_loss": 1.9016647338867188,
"eval_runtime": 116.0944,
"eval_samples_per_second": 889.578,
"eval_steps_per_second": 55.601,
"step": 2288000
},
{
"epoch": 18.72,
"eval_loss": 1.9071747064590454,
"eval_runtime": 117.2998,
"eval_samples_per_second": 880.436,
"eval_steps_per_second": 55.03,
"step": 2296000
},
{
"epoch": 18.79,
"learning_rate": 1.64e-08,
"loss": 2.0377,
"step": 2304000
},
{
"epoch": 18.79,
"eval_loss": 1.91358482837677,
"eval_runtime": 118.15,
"eval_samples_per_second": 874.101,
"eval_steps_per_second": 54.634,
"step": 2304000
},
{
"epoch": 18.85,
"eval_loss": 1.9084006547927856,
"eval_runtime": 118.6409,
"eval_samples_per_second": 870.484,
"eval_steps_per_second": 54.408,
"step": 2312000
},
{
"epoch": 18.92,
"learning_rate": 1.3666666666666667e-08,
"loss": 2.0312,
"step": 2320000
},
{
"epoch": 18.92,
"eval_loss": 1.909172773361206,
"eval_runtime": 117.7813,
"eval_samples_per_second": 876.837,
"eval_steps_per_second": 54.805,
"step": 2320000
},
{
"epoch": 18.98,
"eval_loss": 1.9103314876556396,
"eval_runtime": 117.0394,
"eval_samples_per_second": 882.395,
"eval_steps_per_second": 55.152,
"step": 2328000
},
{
"epoch": 19.05,
"learning_rate": 1.0933333333333334e-08,
"loss": 2.0387,
"step": 2336000
},
{
"epoch": 19.05,
"eval_loss": 1.9023408889770508,
"eval_runtime": 117.1786,
"eval_samples_per_second": 881.347,
"eval_steps_per_second": 55.087,
"step": 2336000
},
{
"epoch": 19.11,
"eval_loss": 1.9034806489944458,
"eval_runtime": 118.479,
"eval_samples_per_second": 871.674,
"eval_steps_per_second": 54.482,
"step": 2344000
},
{
"epoch": 19.18,
"learning_rate": 8.2e-09,
"loss": 2.0358,
"step": 2352000
},
{
"epoch": 19.18,
"eval_loss": 1.9131251573562622,
"eval_runtime": 116.6651,
"eval_samples_per_second": 885.226,
"eval_steps_per_second": 55.329,
"step": 2352000
},
{
"epoch": 19.24,
"eval_loss": 1.9065865278244019,
"eval_runtime": 118.6652,
"eval_samples_per_second": 870.306,
"eval_steps_per_second": 54.397,
"step": 2360000
},
{
"epoch": 19.31,
"learning_rate": 5.466666666666667e-09,
"loss": 2.0402,
"step": 2368000
},
{
"epoch": 19.31,
"eval_loss": 1.9083107709884644,
"eval_runtime": 117.2736,
"eval_samples_per_second": 880.633,
"eval_steps_per_second": 55.042,
"step": 2368000
},
{
"epoch": 19.37,
"eval_loss": 1.9068409204483032,
"eval_runtime": 117.4261,
"eval_samples_per_second": 879.49,
"eval_steps_per_second": 54.971,
"step": 2376000
},
{
"epoch": 19.44,
"learning_rate": 2.7333333333333334e-09,
"loss": 2.0319,
"step": 2384000
},
{
"epoch": 19.44,
"eval_loss": 1.9011958837509155,
"eval_runtime": 117.8174,
"eval_samples_per_second": 876.568,
"eval_steps_per_second": 54.788,
"step": 2384000
},
{
"epoch": 19.5,
"eval_loss": 1.927274465560913,
"eval_runtime": 118.5235,
"eval_samples_per_second": 871.346,
"eval_steps_per_second": 54.462,
"step": 2392000
},
{
"epoch": 19.57,
"learning_rate": 0.0,
"loss": 2.0436,
"step": 2400000
},
{
"epoch": 19.57,
"eval_loss": 1.905896782875061,
"eval_runtime": 116.8523,
"eval_samples_per_second": 883.808,
"eval_steps_per_second": 55.241,
"step": 2400000
},
{
"epoch": 19.57,
"step": 2400000,
"total_flos": 7.485113755399533e+17,
"train_loss": 2.0565961393229166,
"train_runtime": 185876.2415,
"train_samples_per_second": 206.589,
"train_steps_per_second": 12.912
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 20,
"save_steps": 32000,
"total_flos": 7.485113755399533e+17,
"trial_name": null,
"trial_params": null
}