bertweet-2020-Q1-filtered / trainer_state.json
DouglasPontes's picture
Training in progress, step 32000
7866ae8
raw
history blame
81.5 kB
{
"best_metric": 2.7100777626037598,
"best_model_checkpoint": "./model_tweets_2020_Q2/checkpoint-192000",
"epoch": 19.569471624266146,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"eval_loss": 2.977957248687744,
"eval_runtime": 125.5213,
"eval_samples_per_second": 822.769,
"eval_steps_per_second": 51.426,
"step": 8000
},
{
"epoch": 0.13,
"learning_rate": 9.939131159843243e-06,
"loss": 3.1296,
"step": 16000
},
{
"epoch": 0.13,
"eval_loss": 2.894831418991089,
"eval_runtime": 126.0129,
"eval_samples_per_second": 819.559,
"eval_steps_per_second": 51.225,
"step": 16000
},
{
"epoch": 0.2,
"eval_loss": 2.8589611053466797,
"eval_runtime": 125.8909,
"eval_samples_per_second": 820.353,
"eval_steps_per_second": 51.275,
"step": 24000
},
{
"epoch": 0.26,
"learning_rate": 9.872425581589261e-06,
"loss": 2.9018,
"step": 32000
},
{
"epoch": 0.26,
"eval_loss": 2.8033480644226074,
"eval_runtime": 125.7264,
"eval_samples_per_second": 821.427,
"eval_steps_per_second": 51.342,
"step": 32000
},
{
"epoch": 0.33,
"eval_loss": 2.7938032150268555,
"eval_runtime": 125.7192,
"eval_samples_per_second": 821.474,
"eval_steps_per_second": 51.345,
"step": 40000
},
{
"epoch": 0.39,
"learning_rate": 9.80572000333528e-06,
"loss": 2.8331,
"step": 48000
},
{
"epoch": 0.39,
"eval_loss": 2.7694976329803467,
"eval_runtime": 127.0405,
"eval_samples_per_second": 812.93,
"eval_steps_per_second": 50.811,
"step": 48000
},
{
"epoch": 0.46,
"eval_loss": 2.7614457607269287,
"eval_runtime": 125.7185,
"eval_samples_per_second": 821.478,
"eval_steps_per_second": 51.345,
"step": 56000
},
{
"epoch": 0.52,
"learning_rate": 9.739014425081299e-06,
"loss": 2.7723,
"step": 64000
},
{
"epoch": 0.52,
"eval_loss": 2.7416625022888184,
"eval_runtime": 126.1624,
"eval_samples_per_second": 818.588,
"eval_steps_per_second": 51.164,
"step": 64000
},
{
"epoch": 0.59,
"eval_loss": 2.7248806953430176,
"eval_runtime": 126.0454,
"eval_samples_per_second": 819.348,
"eval_steps_per_second": 51.212,
"step": 72000
},
{
"epoch": 0.65,
"learning_rate": 9.672308846827316e-06,
"loss": 2.75,
"step": 80000
},
{
"epoch": 0.65,
"eval_loss": 2.7202229499816895,
"eval_runtime": 126.948,
"eval_samples_per_second": 813.522,
"eval_steps_per_second": 50.848,
"step": 80000
},
{
"epoch": 0.72,
"eval_loss": 2.7112038135528564,
"eval_runtime": 126.8524,
"eval_samples_per_second": 814.135,
"eval_steps_per_second": 50.886,
"step": 88000
},
{
"epoch": 0.78,
"learning_rate": 9.605603268573334e-06,
"loss": 2.735,
"step": 96000
},
{
"epoch": 0.78,
"eval_loss": 2.7228710651397705,
"eval_runtime": 126.981,
"eval_samples_per_second": 813.311,
"eval_steps_per_second": 50.834,
"step": 96000
},
{
"epoch": 0.85,
"eval_loss": 2.7370951175689697,
"eval_runtime": 126.6893,
"eval_samples_per_second": 815.183,
"eval_steps_per_second": 50.951,
"step": 104000
},
{
"epoch": 0.91,
"learning_rate": 9.538897690319354e-06,
"loss": 2.7137,
"step": 112000
},
{
"epoch": 0.91,
"eval_loss": 2.7059037685394287,
"eval_runtime": 126.3306,
"eval_samples_per_second": 817.498,
"eval_steps_per_second": 51.096,
"step": 112000
},
{
"epoch": 0.98,
"eval_loss": 2.7120730876922607,
"eval_runtime": 126.2744,
"eval_samples_per_second": 817.862,
"eval_steps_per_second": 51.119,
"step": 120000
},
{
"epoch": 1.04,
"learning_rate": 9.472192112065373e-06,
"loss": 2.7155,
"step": 128000
},
{
"epoch": 1.04,
"eval_loss": 2.7248668670654297,
"eval_runtime": 126.8126,
"eval_samples_per_second": 814.391,
"eval_steps_per_second": 50.902,
"step": 128000
},
{
"epoch": 1.11,
"eval_loss": 2.7130985260009766,
"eval_runtime": 126.6262,
"eval_samples_per_second": 815.589,
"eval_steps_per_second": 50.977,
"step": 136000
},
{
"epoch": 1.17,
"learning_rate": 9.405486533811392e-06,
"loss": 2.7152,
"step": 144000
},
{
"epoch": 1.17,
"eval_loss": 2.6999881267547607,
"eval_runtime": 126.4279,
"eval_samples_per_second": 816.868,
"eval_steps_per_second": 51.057,
"step": 144000
},
{
"epoch": 1.24,
"eval_loss": 2.703012704849243,
"eval_runtime": 126.2932,
"eval_samples_per_second": 817.74,
"eval_steps_per_second": 51.111,
"step": 152000
},
{
"epoch": 1.3,
"learning_rate": 9.338780955557409e-06,
"loss": 2.7151,
"step": 160000
},
{
"epoch": 1.3,
"eval_loss": 2.721385955810547,
"eval_runtime": 126.566,
"eval_samples_per_second": 815.977,
"eval_steps_per_second": 51.001,
"step": 160000
},
{
"epoch": 1.37,
"eval_loss": 2.707641839981079,
"eval_runtime": 126.5896,
"eval_samples_per_second": 815.826,
"eval_steps_per_second": 50.992,
"step": 168000
},
{
"epoch": 1.44,
"learning_rate": 9.272075377303427e-06,
"loss": 2.7166,
"step": 176000
},
{
"epoch": 1.44,
"eval_loss": 2.7106387615203857,
"eval_runtime": 126.9356,
"eval_samples_per_second": 813.602,
"eval_steps_per_second": 50.853,
"step": 176000
},
{
"epoch": 1.5,
"eval_loss": 2.719717025756836,
"eval_runtime": 127.5317,
"eval_samples_per_second": 809.798,
"eval_steps_per_second": 50.615,
"step": 184000
},
{
"epoch": 1.57,
"learning_rate": 9.205369799049446e-06,
"loss": 2.7144,
"step": 192000
},
{
"epoch": 1.57,
"eval_loss": 2.7100777626037598,
"eval_runtime": 126.3318,
"eval_samples_per_second": 817.49,
"eval_steps_per_second": 51.096,
"step": 192000
},
{
"epoch": 1.63,
"eval_loss": 2.723472833633423,
"eval_runtime": 127.1568,
"eval_samples_per_second": 812.186,
"eval_steps_per_second": 50.764,
"step": 200000
},
{
"epoch": 1.7,
"learning_rate": 9.138664220795464e-06,
"loss": 2.7179,
"step": 208000
},
{
"epoch": 1.7,
"eval_loss": 2.706564426422119,
"eval_runtime": 127.089,
"eval_samples_per_second": 812.62,
"eval_steps_per_second": 50.791,
"step": 208000
},
{
"epoch": 1.76,
"eval_loss": 2.7282984256744385,
"eval_runtime": 127.4927,
"eval_samples_per_second": 810.047,
"eval_steps_per_second": 50.63,
"step": 216000
},
{
"epoch": 1.83,
"learning_rate": 9.071958642541483e-06,
"loss": 2.7231,
"step": 224000
},
{
"epoch": 1.83,
"eval_loss": 2.7203216552734375,
"eval_runtime": 127.4298,
"eval_samples_per_second": 810.446,
"eval_steps_per_second": 50.655,
"step": 224000
},
{
"epoch": 1.89,
"eval_loss": 2.711085319519043,
"eval_runtime": 126.4739,
"eval_samples_per_second": 816.571,
"eval_steps_per_second": 51.038,
"step": 232000
},
{
"epoch": 1.96,
"learning_rate": 9.005253064287502e-06,
"loss": 2.7284,
"step": 240000
},
{
"epoch": 1.96,
"eval_loss": 2.721714973449707,
"eval_runtime": 126.401,
"eval_samples_per_second": 817.043,
"eval_steps_per_second": 51.068,
"step": 240000
},
{
"epoch": 2.02,
"eval_loss": 2.725090265274048,
"eval_runtime": 127.0199,
"eval_samples_per_second": 813.061,
"eval_steps_per_second": 50.819,
"step": 248000
},
{
"epoch": 2.09,
"learning_rate": 8.93854748603352e-06,
"loss": 2.7242,
"step": 256000
},
{
"epoch": 2.09,
"eval_loss": 2.718090057373047,
"eval_runtime": 127.9402,
"eval_samples_per_second": 807.213,
"eval_steps_per_second": 50.453,
"step": 256000
},
{
"epoch": 2.15,
"eval_loss": 2.723750591278076,
"eval_runtime": 127.1287,
"eval_samples_per_second": 812.366,
"eval_steps_per_second": 50.775,
"step": 264000
},
{
"epoch": 2.22,
"learning_rate": 8.871841907779539e-06,
"loss": 2.7171,
"step": 272000
},
{
"epoch": 2.22,
"eval_loss": 2.748772144317627,
"eval_runtime": 128.6406,
"eval_samples_per_second": 802.818,
"eval_steps_per_second": 50.179,
"step": 272000
},
{
"epoch": 2.28,
"eval_loss": 2.731541633605957,
"eval_runtime": 127.5368,
"eval_samples_per_second": 809.766,
"eval_steps_per_second": 50.613,
"step": 280000
},
{
"epoch": 2.35,
"learning_rate": 8.805136329525557e-06,
"loss": 2.7312,
"step": 288000
},
{
"epoch": 2.35,
"eval_loss": 2.746854305267334,
"eval_runtime": 127.6128,
"eval_samples_per_second": 809.284,
"eval_steps_per_second": 50.583,
"step": 288000
},
{
"epoch": 2.41,
"eval_loss": 2.7363078594207764,
"eval_runtime": 127.9259,
"eval_samples_per_second": 807.303,
"eval_steps_per_second": 50.459,
"step": 296000
},
{
"epoch": 2.48,
"learning_rate": 8.738430751271576e-06,
"loss": 2.7386,
"step": 304000
},
{
"epoch": 2.48,
"eval_loss": 2.7398250102996826,
"eval_runtime": 127.3013,
"eval_samples_per_second": 811.264,
"eval_steps_per_second": 50.706,
"step": 304000
},
{
"epoch": 2.54,
"eval_loss": 2.747743844985962,
"eval_runtime": 127.9865,
"eval_samples_per_second": 806.921,
"eval_steps_per_second": 50.435,
"step": 312000
},
{
"epoch": 2.61,
"learning_rate": 8.671725173017595e-06,
"loss": 2.7457,
"step": 320000
},
{
"epoch": 2.61,
"eval_loss": 2.753558397293091,
"eval_runtime": 128.9208,
"eval_samples_per_second": 801.073,
"eval_steps_per_second": 50.069,
"step": 320000
},
{
"epoch": 2.67,
"eval_loss": 2.748337984085083,
"eval_runtime": 128.758,
"eval_samples_per_second": 802.086,
"eval_steps_per_second": 50.133,
"step": 328000
},
{
"epoch": 2.74,
"learning_rate": 8.605019594763613e-06,
"loss": 2.7496,
"step": 336000
},
{
"epoch": 2.74,
"eval_loss": 2.752856969833374,
"eval_runtime": 128.3684,
"eval_samples_per_second": 804.521,
"eval_steps_per_second": 50.285,
"step": 336000
},
{
"epoch": 2.8,
"eval_loss": 2.749178171157837,
"eval_runtime": 129.8422,
"eval_samples_per_second": 795.388,
"eval_steps_per_second": 49.714,
"step": 344000
},
{
"epoch": 2.87,
"learning_rate": 8.538314016509632e-06,
"loss": 2.7521,
"step": 352000
},
{
"epoch": 2.87,
"eval_loss": 2.761200189590454,
"eval_runtime": 127.6309,
"eval_samples_per_second": 809.169,
"eval_steps_per_second": 50.576,
"step": 352000
},
{
"epoch": 2.94,
"eval_loss": 2.7700963020324707,
"eval_runtime": 128.3946,
"eval_samples_per_second": 804.356,
"eval_steps_per_second": 50.275,
"step": 360000
},
{
"epoch": 3.0,
"learning_rate": 8.471608438255649e-06,
"loss": 2.7649,
"step": 368000
},
{
"epoch": 3.0,
"eval_loss": 2.7705161571502686,
"eval_runtime": 128.8577,
"eval_samples_per_second": 801.466,
"eval_steps_per_second": 50.094,
"step": 368000
},
{
"epoch": 3.07,
"eval_loss": 2.782761335372925,
"eval_runtime": 129.17,
"eval_samples_per_second": 799.528,
"eval_steps_per_second": 49.973,
"step": 376000
},
{
"epoch": 3.13,
"learning_rate": 8.404902860001667e-06,
"loss": 2.7516,
"step": 384000
},
{
"epoch": 3.13,
"eval_loss": 2.7680482864379883,
"eval_runtime": 128.9028,
"eval_samples_per_second": 801.185,
"eval_steps_per_second": 50.077,
"step": 384000
},
{
"epoch": 3.2,
"eval_loss": 2.784294605255127,
"eval_runtime": 128.4737,
"eval_samples_per_second": 803.861,
"eval_steps_per_second": 50.244,
"step": 392000
},
{
"epoch": 3.26,
"learning_rate": 8.338197281747686e-06,
"loss": 2.762,
"step": 400000
},
{
"epoch": 3.26,
"eval_loss": 2.7915961742401123,
"eval_runtime": 128.2651,
"eval_samples_per_second": 805.168,
"eval_steps_per_second": 50.325,
"step": 400000
},
{
"epoch": 3.33,
"eval_loss": 2.7691826820373535,
"eval_runtime": 128.6705,
"eval_samples_per_second": 802.632,
"eval_steps_per_second": 50.167,
"step": 408000
},
{
"epoch": 3.39,
"learning_rate": 8.271491703493705e-06,
"loss": 2.7789,
"step": 416000
},
{
"epoch": 3.39,
"eval_loss": 2.783369302749634,
"eval_runtime": 128.6603,
"eval_samples_per_second": 802.695,
"eval_steps_per_second": 50.171,
"step": 416000
},
{
"epoch": 3.46,
"eval_loss": 2.7788405418395996,
"eval_runtime": 129.7209,
"eval_samples_per_second": 796.132,
"eval_steps_per_second": 49.761,
"step": 424000
},
{
"epoch": 3.52,
"learning_rate": 8.204786125239725e-06,
"loss": 2.7879,
"step": 432000
},
{
"epoch": 3.52,
"eval_loss": 2.803699493408203,
"eval_runtime": 128.2575,
"eval_samples_per_second": 805.216,
"eval_steps_per_second": 50.328,
"step": 432000
},
{
"epoch": 3.59,
"eval_loss": 2.791905403137207,
"eval_runtime": 129.4159,
"eval_samples_per_second": 798.009,
"eval_steps_per_second": 49.878,
"step": 440000
},
{
"epoch": 3.65,
"learning_rate": 8.138080546985743e-06,
"loss": 2.7853,
"step": 448000
},
{
"epoch": 3.65,
"eval_loss": 2.8077127933502197,
"eval_runtime": 127.9753,
"eval_samples_per_second": 806.992,
"eval_steps_per_second": 50.439,
"step": 448000
},
{
"epoch": 3.72,
"eval_loss": 2.7903032302856445,
"eval_runtime": 128.9005,
"eval_samples_per_second": 801.2,
"eval_steps_per_second": 50.077,
"step": 456000
},
{
"epoch": 3.78,
"learning_rate": 8.07137496873176e-06,
"loss": 2.7976,
"step": 464000
},
{
"epoch": 3.78,
"eval_loss": 2.810896158218384,
"eval_runtime": 129.0626,
"eval_samples_per_second": 800.193,
"eval_steps_per_second": 50.014,
"step": 464000
},
{
"epoch": 3.85,
"eval_loss": 2.795713424682617,
"eval_runtime": 128.0638,
"eval_samples_per_second": 806.434,
"eval_steps_per_second": 50.405,
"step": 472000
},
{
"epoch": 3.91,
"learning_rate": 8.004669390477779e-06,
"loss": 2.789,
"step": 480000
},
{
"epoch": 3.91,
"eval_loss": 2.8023178577423096,
"eval_runtime": 128.1962,
"eval_samples_per_second": 805.601,
"eval_steps_per_second": 50.353,
"step": 480000
},
{
"epoch": 3.98,
"eval_loss": 2.8125839233398438,
"eval_runtime": 128.7992,
"eval_samples_per_second": 801.83,
"eval_steps_per_second": 50.117,
"step": 488000
},
{
"epoch": 4.04,
"learning_rate": 7.937963812223798e-06,
"loss": 2.8089,
"step": 496000
},
{
"epoch": 4.04,
"eval_loss": 2.815424919128418,
"eval_runtime": 128.7985,
"eval_samples_per_second": 801.834,
"eval_steps_per_second": 50.117,
"step": 496000
},
{
"epoch": 4.11,
"eval_loss": 2.8122923374176025,
"eval_runtime": 127.4092,
"eval_samples_per_second": 810.577,
"eval_steps_per_second": 50.664,
"step": 504000
},
{
"epoch": 4.17,
"learning_rate": 7.871258233969816e-06,
"loss": 2.7915,
"step": 512000
},
{
"epoch": 4.17,
"eval_loss": 2.8145976066589355,
"eval_runtime": 128.9266,
"eval_samples_per_second": 801.037,
"eval_steps_per_second": 50.067,
"step": 512000
},
{
"epoch": 4.24,
"eval_loss": 2.8249683380126953,
"eval_runtime": 129.1348,
"eval_samples_per_second": 799.746,
"eval_steps_per_second": 49.987,
"step": 520000
},
{
"epoch": 4.31,
"learning_rate": 7.804552655715835e-06,
"loss": 2.8094,
"step": 528000
},
{
"epoch": 4.31,
"eval_loss": 2.820560932159424,
"eval_runtime": 129.6096,
"eval_samples_per_second": 796.816,
"eval_steps_per_second": 49.803,
"step": 528000
},
{
"epoch": 4.37,
"eval_loss": 2.818159341812134,
"eval_runtime": 128.5096,
"eval_samples_per_second": 803.637,
"eval_steps_per_second": 50.23,
"step": 536000
},
{
"epoch": 4.44,
"learning_rate": 7.737847077461853e-06,
"loss": 2.8196,
"step": 544000
},
{
"epoch": 4.44,
"eval_loss": 2.8351361751556396,
"eval_runtime": 129.1287,
"eval_samples_per_second": 799.783,
"eval_steps_per_second": 49.989,
"step": 544000
},
{
"epoch": 4.5,
"eval_loss": 2.839430570602417,
"eval_runtime": 129.5203,
"eval_samples_per_second": 797.365,
"eval_steps_per_second": 49.838,
"step": 552000
},
{
"epoch": 4.57,
"learning_rate": 7.671141499207872e-06,
"loss": 2.8316,
"step": 560000
},
{
"epoch": 4.57,
"eval_loss": 2.8396623134613037,
"eval_runtime": 128.6713,
"eval_samples_per_second": 802.627,
"eval_steps_per_second": 50.167,
"step": 560000
},
{
"epoch": 4.63,
"eval_loss": 2.8402562141418457,
"eval_runtime": 128.654,
"eval_samples_per_second": 802.735,
"eval_steps_per_second": 50.173,
"step": 568000
},
{
"epoch": 4.7,
"learning_rate": 7.604435920953891e-06,
"loss": 2.8444,
"step": 576000
},
{
"epoch": 4.7,
"eval_loss": 2.8350980281829834,
"eval_runtime": 129.3424,
"eval_samples_per_second": 798.462,
"eval_steps_per_second": 49.906,
"step": 576000
},
{
"epoch": 4.76,
"eval_loss": 2.8574254512786865,
"eval_runtime": 129.6206,
"eval_samples_per_second": 796.748,
"eval_steps_per_second": 49.799,
"step": 584000
},
{
"epoch": 4.83,
"learning_rate": 7.537730342699909e-06,
"loss": 2.833,
"step": 592000
},
{
"epoch": 4.83,
"eval_loss": 2.86171293258667,
"eval_runtime": 129.2684,
"eval_samples_per_second": 798.919,
"eval_steps_per_second": 49.935,
"step": 592000
},
{
"epoch": 4.89,
"eval_loss": 2.857750654220581,
"eval_runtime": 128.5027,
"eval_samples_per_second": 803.679,
"eval_steps_per_second": 50.232,
"step": 600000
},
{
"epoch": 4.96,
"learning_rate": 7.471024764445928e-06,
"loss": 2.839,
"step": 608000
},
{
"epoch": 4.96,
"eval_loss": 2.8577184677124023,
"eval_runtime": 128.7081,
"eval_samples_per_second": 802.397,
"eval_steps_per_second": 50.152,
"step": 608000
},
{
"epoch": 5.02,
"eval_loss": 2.8726649284362793,
"eval_runtime": 128.6474,
"eval_samples_per_second": 802.776,
"eval_steps_per_second": 50.176,
"step": 616000
},
{
"epoch": 5.09,
"learning_rate": 7.4043191861919465e-06,
"loss": 2.8427,
"step": 624000
},
{
"epoch": 5.09,
"eval_loss": 2.858550786972046,
"eval_runtime": 129.0947,
"eval_samples_per_second": 799.994,
"eval_steps_per_second": 50.002,
"step": 624000
},
{
"epoch": 5.15,
"eval_loss": 2.880849599838257,
"eval_runtime": 128.221,
"eval_samples_per_second": 805.445,
"eval_steps_per_second": 50.343,
"step": 632000
},
{
"epoch": 5.22,
"learning_rate": 7.337613607937964e-06,
"loss": 2.8599,
"step": 640000
},
{
"epoch": 5.22,
"eval_loss": 2.8959789276123047,
"eval_runtime": 129.9831,
"eval_samples_per_second": 794.527,
"eval_steps_per_second": 49.66,
"step": 640000
},
{
"epoch": 5.28,
"eval_loss": 2.8883421421051025,
"eval_runtime": 129.4941,
"eval_samples_per_second": 797.527,
"eval_steps_per_second": 49.848,
"step": 648000
},
{
"epoch": 5.35,
"learning_rate": 7.270908029683983e-06,
"loss": 2.8694,
"step": 656000
},
{
"epoch": 5.35,
"eval_loss": 2.8884825706481934,
"eval_runtime": 129.3172,
"eval_samples_per_second": 798.618,
"eval_steps_per_second": 49.916,
"step": 656000
},
{
"epoch": 5.41,
"eval_loss": 2.887291431427002,
"eval_runtime": 129.1298,
"eval_samples_per_second": 799.777,
"eval_steps_per_second": 49.988,
"step": 664000
},
{
"epoch": 5.48,
"learning_rate": 7.2042024514300015e-06,
"loss": 2.8626,
"step": 672000
},
{
"epoch": 5.48,
"eval_loss": 2.8929550647735596,
"eval_runtime": 129.4886,
"eval_samples_per_second": 797.56,
"eval_steps_per_second": 49.85,
"step": 672000
},
{
"epoch": 5.54,
"eval_loss": 2.8987772464752197,
"eval_runtime": 129.8683,
"eval_samples_per_second": 795.229,
"eval_steps_per_second": 49.704,
"step": 680000
},
{
"epoch": 5.61,
"learning_rate": 7.13749687317602e-06,
"loss": 2.8921,
"step": 688000
},
{
"epoch": 5.61,
"eval_loss": 2.9117259979248047,
"eval_runtime": 128.3205,
"eval_samples_per_second": 804.821,
"eval_steps_per_second": 50.304,
"step": 688000
},
{
"epoch": 5.68,
"eval_loss": 2.912231206893921,
"eval_runtime": 128.7871,
"eval_samples_per_second": 801.905,
"eval_steps_per_second": 50.121,
"step": 696000
},
{
"epoch": 5.74,
"learning_rate": 7.070791294922038e-06,
"loss": 2.8884,
"step": 704000
},
{
"epoch": 5.74,
"eval_loss": 2.900118827819824,
"eval_runtime": 130.1834,
"eval_samples_per_second": 793.304,
"eval_steps_per_second": 49.584,
"step": 704000
},
{
"epoch": 5.81,
"eval_loss": 2.9093644618988037,
"eval_runtime": 129.4918,
"eval_samples_per_second": 797.541,
"eval_steps_per_second": 49.849,
"step": 712000
},
{
"epoch": 5.87,
"learning_rate": 7.0040857166680564e-06,
"loss": 2.8974,
"step": 720000
},
{
"epoch": 5.87,
"eval_loss": 2.9110264778137207,
"eval_runtime": 129.9051,
"eval_samples_per_second": 795.003,
"eval_steps_per_second": 49.69,
"step": 720000
},
{
"epoch": 5.94,
"eval_loss": 2.9044594764709473,
"eval_runtime": 129.2324,
"eval_samples_per_second": 799.141,
"eval_steps_per_second": 49.949,
"step": 728000
},
{
"epoch": 6.0,
"learning_rate": 6.937380138414076e-06,
"loss": 2.903,
"step": 736000
},
{
"epoch": 6.0,
"eval_loss": 2.933678388595581,
"eval_runtime": 130.3644,
"eval_samples_per_second": 792.202,
"eval_steps_per_second": 49.515,
"step": 736000
},
{
"epoch": 6.07,
"eval_loss": 2.931581735610962,
"eval_runtime": 128.3976,
"eval_samples_per_second": 804.337,
"eval_steps_per_second": 50.274,
"step": 744000
},
{
"epoch": 6.13,
"learning_rate": 6.8706745601600945e-06,
"loss": 2.9057,
"step": 752000
},
{
"epoch": 6.13,
"eval_loss": 2.944746971130371,
"eval_runtime": 128.9912,
"eval_samples_per_second": 800.636,
"eval_steps_per_second": 50.042,
"step": 752000
},
{
"epoch": 6.2,
"eval_loss": 2.936281681060791,
"eval_runtime": 129.9533,
"eval_samples_per_second": 794.709,
"eval_steps_per_second": 49.672,
"step": 760000
},
{
"epoch": 6.26,
"learning_rate": 6.803968981906113e-06,
"loss": 2.9146,
"step": 768000
},
{
"epoch": 6.26,
"eval_loss": 2.943751096725464,
"eval_runtime": 129.9494,
"eval_samples_per_second": 794.732,
"eval_steps_per_second": 49.673,
"step": 768000
},
{
"epoch": 6.33,
"eval_loss": 2.9474806785583496,
"eval_runtime": 130.0993,
"eval_samples_per_second": 793.817,
"eval_steps_per_second": 49.616,
"step": 776000
},
{
"epoch": 6.39,
"learning_rate": 6.737263403652131e-06,
"loss": 2.9221,
"step": 784000
},
{
"epoch": 6.39,
"eval_loss": 2.9394171237945557,
"eval_runtime": 129.1928,
"eval_samples_per_second": 799.387,
"eval_steps_per_second": 49.964,
"step": 784000
},
{
"epoch": 6.46,
"eval_loss": 2.937087297439575,
"eval_runtime": 129.9118,
"eval_samples_per_second": 794.963,
"eval_steps_per_second": 49.688,
"step": 792000
},
{
"epoch": 6.52,
"learning_rate": 6.6705578253981495e-06,
"loss": 2.9316,
"step": 800000
},
{
"epoch": 6.52,
"eval_loss": 2.949429512023926,
"eval_runtime": 129.8602,
"eval_samples_per_second": 795.278,
"eval_steps_per_second": 49.707,
"step": 800000
},
{
"epoch": 6.59,
"eval_loss": 2.9727399349212646,
"eval_runtime": 130.9441,
"eval_samples_per_second": 788.695,
"eval_steps_per_second": 49.296,
"step": 808000
},
{
"epoch": 6.65,
"learning_rate": 6.603852247144168e-06,
"loss": 2.9421,
"step": 816000
},
{
"epoch": 6.65,
"eval_loss": 2.9758830070495605,
"eval_runtime": 129.8861,
"eval_samples_per_second": 795.12,
"eval_steps_per_second": 49.697,
"step": 816000
},
{
"epoch": 6.72,
"eval_loss": 2.966480016708374,
"eval_runtime": 129.44,
"eval_samples_per_second": 797.86,
"eval_steps_per_second": 49.869,
"step": 824000
},
{
"epoch": 6.78,
"learning_rate": 6.537146668890187e-06,
"loss": 2.9538,
"step": 832000
},
{
"epoch": 6.78,
"eval_loss": 2.9650251865386963,
"eval_runtime": 129.4919,
"eval_samples_per_second": 797.54,
"eval_steps_per_second": 49.849,
"step": 832000
},
{
"epoch": 6.85,
"eval_loss": 2.976144313812256,
"eval_runtime": 129.8294,
"eval_samples_per_second": 795.467,
"eval_steps_per_second": 49.719,
"step": 840000
},
{
"epoch": 6.91,
"learning_rate": 6.4704410906362044e-06,
"loss": 2.9594,
"step": 848000
},
{
"epoch": 6.91,
"eval_loss": 2.990086317062378,
"eval_runtime": 129.827,
"eval_samples_per_second": 795.482,
"eval_steps_per_second": 49.72,
"step": 848000
},
{
"epoch": 6.98,
"eval_loss": 2.973181962966919,
"eval_runtime": 131.5126,
"eval_samples_per_second": 785.286,
"eval_steps_per_second": 49.083,
"step": 856000
},
{
"epoch": 7.05,
"learning_rate": 6.403735512382223e-06,
"loss": 2.9564,
"step": 864000
},
{
"epoch": 7.05,
"eval_loss": 2.9896528720855713,
"eval_runtime": 129.878,
"eval_samples_per_second": 795.169,
"eval_steps_per_second": 49.7,
"step": 864000
},
{
"epoch": 7.11,
"eval_loss": 2.980059862136841,
"eval_runtime": 129.5351,
"eval_samples_per_second": 797.274,
"eval_steps_per_second": 49.832,
"step": 872000
},
{
"epoch": 7.18,
"learning_rate": 6.337029934128242e-06,
"loss": 2.9561,
"step": 880000
},
{
"epoch": 7.18,
"eval_loss": 2.983869791030884,
"eval_runtime": 130.0357,
"eval_samples_per_second": 794.205,
"eval_steps_per_second": 49.64,
"step": 880000
},
{
"epoch": 7.24,
"eval_loss": 2.9887585639953613,
"eval_runtime": 130.015,
"eval_samples_per_second": 794.331,
"eval_steps_per_second": 49.648,
"step": 888000
},
{
"epoch": 7.31,
"learning_rate": 6.270324355874261e-06,
"loss": 2.9669,
"step": 896000
},
{
"epoch": 7.31,
"eval_loss": 2.99999737739563,
"eval_runtime": 130.6345,
"eval_samples_per_second": 790.564,
"eval_steps_per_second": 49.413,
"step": 896000
},
{
"epoch": 7.37,
"eval_loss": 2.9786183834075928,
"eval_runtime": 129.9739,
"eval_samples_per_second": 794.582,
"eval_steps_per_second": 49.664,
"step": 904000
},
{
"epoch": 7.44,
"learning_rate": 6.20361877762028e-06,
"loss": 2.9649,
"step": 912000
},
{
"epoch": 7.44,
"eval_loss": 2.994581460952759,
"eval_runtime": 131.0156,
"eval_samples_per_second": 788.265,
"eval_steps_per_second": 49.269,
"step": 912000
},
{
"epoch": 7.5,
"eval_loss": 3.0002031326293945,
"eval_runtime": 131.7355,
"eval_samples_per_second": 783.957,
"eval_steps_per_second": 49.0,
"step": 920000
},
{
"epoch": 7.57,
"learning_rate": 6.1369131993662975e-06,
"loss": 2.9665,
"step": 928000
},
{
"epoch": 7.57,
"eval_loss": 2.9960474967956543,
"eval_runtime": 131.6559,
"eval_samples_per_second": 784.431,
"eval_steps_per_second": 49.029,
"step": 928000
},
{
"epoch": 7.63,
"eval_loss": 3.0067989826202393,
"eval_runtime": 131.8152,
"eval_samples_per_second": 783.483,
"eval_steps_per_second": 48.97,
"step": 936000
},
{
"epoch": 7.7,
"learning_rate": 6.070207621112316e-06,
"loss": 2.9708,
"step": 944000
},
{
"epoch": 7.7,
"eval_loss": 2.993788242340088,
"eval_runtime": 130.3799,
"eval_samples_per_second": 792.108,
"eval_steps_per_second": 49.509,
"step": 944000
},
{
"epoch": 7.76,
"eval_loss": 3.0126230716705322,
"eval_runtime": 130.4447,
"eval_samples_per_second": 791.715,
"eval_steps_per_second": 49.485,
"step": 952000
},
{
"epoch": 7.83,
"learning_rate": 6.003502042858335e-06,
"loss": 2.981,
"step": 960000
},
{
"epoch": 7.83,
"eval_loss": 2.9959194660186768,
"eval_runtime": 132.0738,
"eval_samples_per_second": 781.949,
"eval_steps_per_second": 48.874,
"step": 960000
},
{
"epoch": 7.89,
"eval_loss": 2.995976448059082,
"eval_runtime": 130.9412,
"eval_samples_per_second": 788.713,
"eval_steps_per_second": 49.297,
"step": 968000
},
{
"epoch": 7.96,
"learning_rate": 5.936796464604353e-06,
"loss": 2.9805,
"step": 976000
},
{
"epoch": 7.96,
"eval_loss": 2.991947889328003,
"eval_runtime": 130.0819,
"eval_samples_per_second": 793.923,
"eval_steps_per_second": 49.623,
"step": 976000
},
{
"epoch": 8.02,
"eval_loss": 3.0058255195617676,
"eval_runtime": 130.7007,
"eval_samples_per_second": 790.164,
"eval_steps_per_second": 49.388,
"step": 984000
},
{
"epoch": 8.09,
"learning_rate": 5.870090886350371e-06,
"loss": 2.9705,
"step": 992000
},
{
"epoch": 8.09,
"eval_loss": 3.0232017040252686,
"eval_runtime": 129.9163,
"eval_samples_per_second": 794.935,
"eval_steps_per_second": 49.686,
"step": 992000
},
{
"epoch": 8.15,
"eval_loss": 3.0046939849853516,
"eval_runtime": 130.7903,
"eval_samples_per_second": 789.623,
"eval_steps_per_second": 49.354,
"step": 1000000
},
{
"epoch": 8.22,
"learning_rate": 5.80338530809639e-06,
"loss": 2.9715,
"step": 1008000
},
{
"epoch": 8.22,
"eval_loss": 3.0068600177764893,
"eval_runtime": 131.6119,
"eval_samples_per_second": 784.693,
"eval_steps_per_second": 49.046,
"step": 1008000
},
{
"epoch": 8.28,
"eval_loss": 3.0018742084503174,
"eval_runtime": 131.7567,
"eval_samples_per_second": 783.831,
"eval_steps_per_second": 48.992,
"step": 1016000
},
{
"epoch": 8.35,
"learning_rate": 5.736679729842408e-06,
"loss": 2.9695,
"step": 1024000
},
{
"epoch": 8.35,
"eval_loss": 3.021596670150757,
"eval_runtime": 131.2334,
"eval_samples_per_second": 786.956,
"eval_steps_per_second": 49.187,
"step": 1024000
},
{
"epoch": 8.41,
"eval_loss": 3.0219063758850098,
"eval_runtime": 131.6228,
"eval_samples_per_second": 784.629,
"eval_steps_per_second": 49.042,
"step": 1032000
},
{
"epoch": 8.48,
"learning_rate": 5.669974151588427e-06,
"loss": 2.9762,
"step": 1040000
},
{
"epoch": 8.48,
"eval_loss": 3.018242597579956,
"eval_runtime": 131.898,
"eval_samples_per_second": 782.991,
"eval_steps_per_second": 48.939,
"step": 1040000
},
{
"epoch": 8.55,
"eval_loss": 3.0332210063934326,
"eval_runtime": 132.3771,
"eval_samples_per_second": 780.158,
"eval_steps_per_second": 48.762,
"step": 1048000
},
{
"epoch": 8.61,
"learning_rate": 5.603268573334446e-06,
"loss": 2.9786,
"step": 1056000
},
{
"epoch": 8.61,
"eval_loss": 3.001666307449341,
"eval_runtime": 131.4368,
"eval_samples_per_second": 785.739,
"eval_steps_per_second": 49.111,
"step": 1056000
},
{
"epoch": 8.68,
"eval_loss": 3.0236458778381348,
"eval_runtime": 130.9562,
"eval_samples_per_second": 788.622,
"eval_steps_per_second": 49.291,
"step": 1064000
},
{
"epoch": 8.74,
"learning_rate": 5.536562995080464e-06,
"loss": 2.9889,
"step": 1072000
},
{
"epoch": 8.74,
"eval_loss": 3.0273077487945557,
"eval_runtime": 131.9047,
"eval_samples_per_second": 782.952,
"eval_steps_per_second": 48.937,
"step": 1072000
},
{
"epoch": 8.81,
"eval_loss": 3.01967191696167,
"eval_runtime": 131.9615,
"eval_samples_per_second": 782.615,
"eval_steps_per_second": 48.916,
"step": 1080000
},
{
"epoch": 8.87,
"learning_rate": 5.469857416826483e-06,
"loss": 2.9842,
"step": 1088000
},
{
"epoch": 8.87,
"eval_loss": 3.037600040435791,
"eval_runtime": 131.9507,
"eval_samples_per_second": 782.679,
"eval_steps_per_second": 48.92,
"step": 1088000
},
{
"epoch": 8.94,
"eval_loss": 3.032285213470459,
"eval_runtime": 131.7234,
"eval_samples_per_second": 784.029,
"eval_steps_per_second": 49.004,
"step": 1096000
},
{
"epoch": 9.0,
"learning_rate": 5.403151838572501e-06,
"loss": 2.9912,
"step": 1104000
},
{
"epoch": 9.0,
"eval_loss": 3.031731367111206,
"eval_runtime": 131.8868,
"eval_samples_per_second": 783.058,
"eval_steps_per_second": 48.944,
"step": 1104000
},
{
"epoch": 9.07,
"eval_loss": 3.022475481033325,
"eval_runtime": 131.0568,
"eval_samples_per_second": 788.017,
"eval_steps_per_second": 49.253,
"step": 1112000
},
{
"epoch": 9.13,
"learning_rate": 5.33644626031852e-06,
"loss": 2.9919,
"step": 1120000
},
{
"epoch": 9.13,
"eval_loss": 3.036106824874878,
"eval_runtime": 132.2182,
"eval_samples_per_second": 781.095,
"eval_steps_per_second": 48.821,
"step": 1120000
},
{
"epoch": 9.2,
"eval_loss": 3.0432300567626953,
"eval_runtime": 131.9088,
"eval_samples_per_second": 782.927,
"eval_steps_per_second": 48.935,
"step": 1128000
},
{
"epoch": 9.26,
"learning_rate": 5.269740682064538e-06,
"loss": 2.9872,
"step": 1136000
},
{
"epoch": 9.26,
"eval_loss": 3.0306613445281982,
"eval_runtime": 131.2348,
"eval_samples_per_second": 786.948,
"eval_steps_per_second": 49.187,
"step": 1136000
},
{
"epoch": 9.33,
"eval_loss": 3.0481879711151123,
"eval_runtime": 131.7205,
"eval_samples_per_second": 784.046,
"eval_steps_per_second": 49.005,
"step": 1144000
},
{
"epoch": 9.39,
"learning_rate": 5.203035103810556e-06,
"loss": 2.9823,
"step": 1152000
},
{
"epoch": 9.39,
"eval_loss": 3.035399913787842,
"eval_runtime": 131.2188,
"eval_samples_per_second": 787.044,
"eval_steps_per_second": 49.193,
"step": 1152000
},
{
"epoch": 9.46,
"eval_loss": 3.0419015884399414,
"eval_runtime": 131.8024,
"eval_samples_per_second": 783.559,
"eval_steps_per_second": 48.975,
"step": 1160000
},
{
"epoch": 9.52,
"learning_rate": 5.136329525556575e-06,
"loss": 2.9882,
"step": 1168000
},
{
"epoch": 9.52,
"eval_loss": 3.0567431449890137,
"eval_runtime": 132.7773,
"eval_samples_per_second": 777.806,
"eval_steps_per_second": 48.615,
"step": 1168000
},
{
"epoch": 9.59,
"eval_loss": 3.0395400524139404,
"eval_runtime": 131.6554,
"eval_samples_per_second": 784.434,
"eval_steps_per_second": 49.03,
"step": 1176000
},
{
"epoch": 9.65,
"learning_rate": 5.0696239473025935e-06,
"loss": 3.0079,
"step": 1184000
},
{
"epoch": 9.65,
"eval_loss": 3.0572261810302734,
"eval_runtime": 132.0184,
"eval_samples_per_second": 782.278,
"eval_steps_per_second": 48.895,
"step": 1184000
},
{
"epoch": 9.72,
"eval_loss": 3.04028058052063,
"eval_runtime": 131.8056,
"eval_samples_per_second": 783.54,
"eval_steps_per_second": 48.974,
"step": 1192000
},
{
"epoch": 9.78,
"learning_rate": 5.002918369048611e-06,
"loss": 3.0243,
"step": 1200000
},
{
"epoch": 9.78,
"eval_loss": 3.047227621078491,
"eval_runtime": 131.9863,
"eval_samples_per_second": 782.467,
"eval_steps_per_second": 48.907,
"step": 1200000
},
{
"epoch": 9.85,
"eval_loss": 3.052279472351074,
"eval_runtime": 132.2017,
"eval_samples_per_second": 781.193,
"eval_steps_per_second": 48.827,
"step": 1208000
},
{
"epoch": 9.92,
"learning_rate": 4.936212790794631e-06,
"loss": 3.0127,
"step": 1216000
},
{
"epoch": 9.92,
"eval_loss": 3.053439140319824,
"eval_runtime": 131.3363,
"eval_samples_per_second": 786.34,
"eval_steps_per_second": 49.149,
"step": 1216000
},
{
"epoch": 9.98,
"eval_loss": 3.0434141159057617,
"eval_runtime": 131.7363,
"eval_samples_per_second": 783.952,
"eval_steps_per_second": 48.999,
"step": 1224000
},
{
"epoch": 10.05,
"learning_rate": 4.869507212540649e-06,
"loss": 3.0106,
"step": 1232000
},
{
"epoch": 10.05,
"eval_loss": 3.0687036514282227,
"eval_runtime": 131.4287,
"eval_samples_per_second": 785.788,
"eval_steps_per_second": 49.114,
"step": 1232000
},
{
"epoch": 10.11,
"eval_loss": 3.0677733421325684,
"eval_runtime": 132.6312,
"eval_samples_per_second": 778.663,
"eval_steps_per_second": 48.669,
"step": 1240000
},
{
"epoch": 10.18,
"learning_rate": 4.802801634286667e-06,
"loss": 3.0063,
"step": 1248000
},
{
"epoch": 10.18,
"eval_loss": 3.0652401447296143,
"eval_runtime": 132.5035,
"eval_samples_per_second": 779.413,
"eval_steps_per_second": 48.716,
"step": 1248000
},
{
"epoch": 10.24,
"eval_loss": 3.0768234729766846,
"eval_runtime": 131.7104,
"eval_samples_per_second": 784.107,
"eval_steps_per_second": 49.009,
"step": 1256000
},
{
"epoch": 10.31,
"learning_rate": 4.7360960560326865e-06,
"loss": 3.0187,
"step": 1264000
},
{
"epoch": 10.31,
"eval_loss": 3.069179058074951,
"eval_runtime": 132.7895,
"eval_samples_per_second": 777.735,
"eval_steps_per_second": 48.611,
"step": 1264000
},
{
"epoch": 10.37,
"eval_loss": 3.0621213912963867,
"eval_runtime": 132.041,
"eval_samples_per_second": 782.144,
"eval_steps_per_second": 48.886,
"step": 1272000
},
{
"epoch": 10.44,
"learning_rate": 4.669390477778704e-06,
"loss": 3.0202,
"step": 1280000
},
{
"epoch": 10.44,
"eval_loss": 3.0663187503814697,
"eval_runtime": 132.2635,
"eval_samples_per_second": 780.828,
"eval_steps_per_second": 48.804,
"step": 1280000
},
{
"epoch": 10.5,
"eval_loss": 3.0537171363830566,
"eval_runtime": 132.2536,
"eval_samples_per_second": 780.886,
"eval_steps_per_second": 48.808,
"step": 1288000
},
{
"epoch": 10.57,
"learning_rate": 4.602684899524723e-06,
"loss": 3.0219,
"step": 1296000
},
{
"epoch": 10.57,
"eval_loss": 3.072500705718994,
"eval_runtime": 132.0295,
"eval_samples_per_second": 782.212,
"eval_steps_per_second": 48.891,
"step": 1296000
},
{
"epoch": 10.63,
"eval_loss": 3.0664169788360596,
"eval_runtime": 131.9651,
"eval_samples_per_second": 782.593,
"eval_steps_per_second": 48.914,
"step": 1304000
},
{
"epoch": 10.7,
"learning_rate": 4.5359793212707415e-06,
"loss": 3.0232,
"step": 1312000
},
{
"epoch": 10.7,
"eval_loss": 3.0724074840545654,
"eval_runtime": 133.2104,
"eval_samples_per_second": 775.277,
"eval_steps_per_second": 48.457,
"step": 1312000
},
{
"epoch": 10.76,
"eval_loss": 3.0476126670837402,
"eval_runtime": 132.7171,
"eval_samples_per_second": 778.159,
"eval_steps_per_second": 48.637,
"step": 1320000
},
{
"epoch": 10.83,
"learning_rate": 4.46927374301676e-06,
"loss": 3.0247,
"step": 1328000
},
{
"epoch": 10.83,
"eval_loss": 3.0729353427886963,
"eval_runtime": 132.4018,
"eval_samples_per_second": 780.012,
"eval_steps_per_second": 48.753,
"step": 1328000
},
{
"epoch": 10.89,
"eval_loss": 3.0645902156829834,
"eval_runtime": 133.3334,
"eval_samples_per_second": 774.562,
"eval_steps_per_second": 48.412,
"step": 1336000
},
{
"epoch": 10.96,
"learning_rate": 4.402568164762779e-06,
"loss": 3.0335,
"step": 1344000
},
{
"epoch": 10.96,
"eval_loss": 3.0603559017181396,
"eval_runtime": 131.9232,
"eval_samples_per_second": 782.842,
"eval_steps_per_second": 48.93,
"step": 1344000
},
{
"epoch": 11.02,
"eval_loss": 3.0630509853363037,
"eval_runtime": 132.4502,
"eval_samples_per_second": 779.727,
"eval_steps_per_second": 48.735,
"step": 1352000
},
{
"epoch": 11.09,
"learning_rate": 4.335862586508797e-06,
"loss": 3.0182,
"step": 1360000
},
{
"epoch": 11.09,
"eval_loss": 3.0669026374816895,
"eval_runtime": 133.3499,
"eval_samples_per_second": 774.466,
"eval_steps_per_second": 48.406,
"step": 1360000
},
{
"epoch": 11.15,
"eval_loss": 3.0626471042633057,
"eval_runtime": 133.0041,
"eval_samples_per_second": 776.48,
"eval_steps_per_second": 48.532,
"step": 1368000
},
{
"epoch": 11.22,
"learning_rate": 4.269157008254816e-06,
"loss": 3.0124,
"step": 1376000
},
{
"epoch": 11.22,
"eval_loss": 3.053469181060791,
"eval_runtime": 133.5969,
"eval_samples_per_second": 773.034,
"eval_steps_per_second": 48.317,
"step": 1376000
},
{
"epoch": 11.29,
"eval_loss": 3.076792001724243,
"eval_runtime": 133.0672,
"eval_samples_per_second": 776.112,
"eval_steps_per_second": 48.509,
"step": 1384000
},
{
"epoch": 11.35,
"learning_rate": 4.202451430000834e-06,
"loss": 3.016,
"step": 1392000
},
{
"epoch": 11.35,
"eval_loss": 3.0615081787109375,
"eval_runtime": 133.9693,
"eval_samples_per_second": 770.886,
"eval_steps_per_second": 48.183,
"step": 1392000
},
{
"epoch": 11.42,
"eval_loss": 3.0689148902893066,
"eval_runtime": 134.418,
"eval_samples_per_second": 768.312,
"eval_steps_per_second": 48.022,
"step": 1400000
},
{
"epoch": 11.48,
"learning_rate": 4.135745851746852e-06,
"loss": 3.0133,
"step": 1408000
},
{
"epoch": 11.48,
"eval_loss": 3.069943428039551,
"eval_runtime": 133.7409,
"eval_samples_per_second": 772.202,
"eval_steps_per_second": 48.265,
"step": 1408000
},
{
"epoch": 11.55,
"eval_loss": 3.0647213459014893,
"eval_runtime": 134.5422,
"eval_samples_per_second": 767.603,
"eval_steps_per_second": 47.977,
"step": 1416000
},
{
"epoch": 11.61,
"learning_rate": 4.069040273492872e-06,
"loss": 3.0227,
"step": 1424000
},
{
"epoch": 11.61,
"eval_loss": 3.0704684257507324,
"eval_runtime": 135.8934,
"eval_samples_per_second": 759.97,
"eval_steps_per_second": 47.5,
"step": 1424000
},
{
"epoch": 11.68,
"eval_loss": 3.0705504417419434,
"eval_runtime": 133.4155,
"eval_samples_per_second": 774.086,
"eval_steps_per_second": 48.383,
"step": 1432000
},
{
"epoch": 11.74,
"learning_rate": 4.0023346952388895e-06,
"loss": 3.0267,
"step": 1440000
},
{
"epoch": 11.74,
"eval_loss": 3.069384813308716,
"eval_runtime": 133.2021,
"eval_samples_per_second": 775.326,
"eval_steps_per_second": 48.46,
"step": 1440000
},
{
"epoch": 11.81,
"eval_loss": 3.0720527172088623,
"eval_runtime": 133.9349,
"eval_samples_per_second": 771.083,
"eval_steps_per_second": 48.195,
"step": 1448000
},
{
"epoch": 11.87,
"learning_rate": 3.935629116984908e-06,
"loss": 3.021,
"step": 1456000
},
{
"epoch": 11.87,
"eval_loss": 3.068966865539551,
"eval_runtime": 132.597,
"eval_samples_per_second": 778.864,
"eval_steps_per_second": 48.681,
"step": 1456000
},
{
"epoch": 11.94,
"eval_loss": 3.060349702835083,
"eval_runtime": 134.1972,
"eval_samples_per_second": 769.576,
"eval_steps_per_second": 48.101,
"step": 1464000
},
{
"epoch": 12.0,
"learning_rate": 3.868923538730927e-06,
"loss": 3.0144,
"step": 1472000
},
{
"epoch": 12.0,
"eval_loss": 3.065760374069214,
"eval_runtime": 134.4544,
"eval_samples_per_second": 768.104,
"eval_steps_per_second": 48.009,
"step": 1472000
},
{
"epoch": 12.07,
"eval_loss": 3.0719916820526123,
"eval_runtime": 133.6199,
"eval_samples_per_second": 772.902,
"eval_steps_per_second": 48.309,
"step": 1480000
},
{
"epoch": 12.13,
"learning_rate": 3.8022179604769453e-06,
"loss": 3.0204,
"step": 1488000
},
{
"epoch": 12.13,
"eval_loss": 3.066779851913452,
"eval_runtime": 133.3793,
"eval_samples_per_second": 774.296,
"eval_steps_per_second": 48.396,
"step": 1488000
},
{
"epoch": 12.2,
"eval_loss": 3.0773117542266846,
"eval_runtime": 135.2249,
"eval_samples_per_second": 763.728,
"eval_steps_per_second": 47.735,
"step": 1496000
},
{
"epoch": 12.26,
"learning_rate": 3.735512382222964e-06,
"loss": 3.0085,
"step": 1504000
},
{
"epoch": 12.26,
"eval_loss": 3.0847675800323486,
"eval_runtime": 133.4406,
"eval_samples_per_second": 773.94,
"eval_steps_per_second": 48.374,
"step": 1504000
},
{
"epoch": 12.33,
"eval_loss": 3.0567853450775146,
"eval_runtime": 136.5184,
"eval_samples_per_second": 756.492,
"eval_steps_per_second": 47.283,
"step": 1512000
},
{
"epoch": 12.39,
"learning_rate": 3.668806803968982e-06,
"loss": 3.0146,
"step": 1520000
},
{
"epoch": 12.39,
"eval_loss": 3.0783281326293945,
"eval_runtime": 134.8805,
"eval_samples_per_second": 765.678,
"eval_steps_per_second": 47.857,
"step": 1520000
},
{
"epoch": 12.46,
"eval_loss": 3.073552370071411,
"eval_runtime": 133.8542,
"eval_samples_per_second": 771.549,
"eval_steps_per_second": 48.224,
"step": 1528000
},
{
"epoch": 12.52,
"learning_rate": 3.6021012257150007e-06,
"loss": 3.02,
"step": 1536000
},
{
"epoch": 12.52,
"eval_loss": 3.0533952713012695,
"eval_runtime": 133.5934,
"eval_samples_per_second": 773.055,
"eval_steps_per_second": 48.318,
"step": 1536000
},
{
"epoch": 12.59,
"eval_loss": 3.0684494972229004,
"eval_runtime": 133.2901,
"eval_samples_per_second": 774.814,
"eval_steps_per_second": 48.428,
"step": 1544000
},
{
"epoch": 12.65,
"learning_rate": 3.535395647461019e-06,
"loss": 3.0229,
"step": 1552000
},
{
"epoch": 12.65,
"eval_loss": 3.07673978805542,
"eval_runtime": 134.115,
"eval_samples_per_second": 770.048,
"eval_steps_per_second": 48.13,
"step": 1552000
},
{
"epoch": 12.72,
"eval_loss": 3.0568747520446777,
"eval_runtime": 134.3484,
"eval_samples_per_second": 768.71,
"eval_steps_per_second": 48.047,
"step": 1560000
},
{
"epoch": 12.79,
"learning_rate": 3.468690069207038e-06,
"loss": 3.0152,
"step": 1568000
},
{
"epoch": 12.79,
"eval_loss": 3.0787863731384277,
"eval_runtime": 133.764,
"eval_samples_per_second": 772.068,
"eval_steps_per_second": 48.257,
"step": 1568000
},
{
"epoch": 12.85,
"eval_loss": 3.066344738006592,
"eval_runtime": 133.9216,
"eval_samples_per_second": 771.16,
"eval_steps_per_second": 48.2,
"step": 1576000
},
{
"epoch": 12.92,
"learning_rate": 3.4019844909530565e-06,
"loss": 3.02,
"step": 1584000
},
{
"epoch": 12.92,
"eval_loss": 3.067016839981079,
"eval_runtime": 133.9971,
"eval_samples_per_second": 770.725,
"eval_steps_per_second": 48.173,
"step": 1584000
},
{
"epoch": 12.98,
"eval_loss": 3.0683343410491943,
"eval_runtime": 134.2208,
"eval_samples_per_second": 769.441,
"eval_steps_per_second": 48.092,
"step": 1592000
},
{
"epoch": 13.05,
"learning_rate": 3.3352789126990747e-06,
"loss": 3.0128,
"step": 1600000
},
{
"epoch": 13.05,
"eval_loss": 3.071779489517212,
"eval_runtime": 134.2033,
"eval_samples_per_second": 769.541,
"eval_steps_per_second": 48.099,
"step": 1600000
},
{
"epoch": 13.11,
"eval_loss": 3.0846707820892334,
"eval_runtime": 134.6625,
"eval_samples_per_second": 766.917,
"eval_steps_per_second": 47.935,
"step": 1608000
},
{
"epoch": 13.18,
"learning_rate": 3.2685733344450933e-06,
"loss": 3.016,
"step": 1616000
},
{
"epoch": 13.18,
"eval_loss": 3.066356897354126,
"eval_runtime": 134.4556,
"eval_samples_per_second": 768.097,
"eval_steps_per_second": 48.008,
"step": 1616000
},
{
"epoch": 13.24,
"eval_loss": 3.0687520503997803,
"eval_runtime": 134.3299,
"eval_samples_per_second": 768.816,
"eval_steps_per_second": 48.053,
"step": 1624000
},
{
"epoch": 13.31,
"learning_rate": 3.2018677561911115e-06,
"loss": 3.0007,
"step": 1632000
},
{
"epoch": 13.31,
"eval_loss": 3.0740671157836914,
"eval_runtime": 134.4424,
"eval_samples_per_second": 768.173,
"eval_steps_per_second": 48.013,
"step": 1632000
},
{
"epoch": 13.37,
"eval_loss": 3.0663323402404785,
"eval_runtime": 134.2383,
"eval_samples_per_second": 769.341,
"eval_steps_per_second": 48.086,
"step": 1640000
},
{
"epoch": 13.44,
"learning_rate": 3.1351621779371306e-06,
"loss": 3.0241,
"step": 1648000
},
{
"epoch": 13.44,
"eval_loss": 3.0607213973999023,
"eval_runtime": 134.0502,
"eval_samples_per_second": 770.42,
"eval_steps_per_second": 48.154,
"step": 1648000
},
{
"epoch": 13.5,
"eval_loss": 3.0635085105895996,
"eval_runtime": 133.9453,
"eval_samples_per_second": 771.024,
"eval_steps_per_second": 48.191,
"step": 1656000
},
{
"epoch": 13.57,
"learning_rate": 3.0684565996831487e-06,
"loss": 3.0103,
"step": 1664000
},
{
"epoch": 13.57,
"eval_loss": 3.0730724334716797,
"eval_runtime": 135.0683,
"eval_samples_per_second": 764.613,
"eval_steps_per_second": 47.791,
"step": 1664000
},
{
"epoch": 13.63,
"eval_loss": 3.0649466514587402,
"eval_runtime": 134.138,
"eval_samples_per_second": 769.916,
"eval_steps_per_second": 48.122,
"step": 1672000
},
{
"epoch": 13.7,
"learning_rate": 3.0017510214291673e-06,
"loss": 3.0188,
"step": 1680000
},
{
"epoch": 13.7,
"eval_loss": 3.058675765991211,
"eval_runtime": 134.4659,
"eval_samples_per_second": 768.039,
"eval_steps_per_second": 48.005,
"step": 1680000
},
{
"epoch": 13.76,
"eval_loss": 3.0703861713409424,
"eval_runtime": 134.1628,
"eval_samples_per_second": 769.773,
"eval_steps_per_second": 48.113,
"step": 1688000
},
{
"epoch": 13.83,
"learning_rate": 2.9350454431751855e-06,
"loss": 3.0217,
"step": 1696000
},
{
"epoch": 13.83,
"eval_loss": 3.066443920135498,
"eval_runtime": 135.8944,
"eval_samples_per_second": 759.965,
"eval_steps_per_second": 47.5,
"step": 1696000
},
{
"epoch": 13.89,
"eval_loss": 3.0626626014709473,
"eval_runtime": 135.45,
"eval_samples_per_second": 762.458,
"eval_steps_per_second": 47.656,
"step": 1704000
},
{
"epoch": 13.96,
"learning_rate": 2.868339864921204e-06,
"loss": 3.0282,
"step": 1712000
},
{
"epoch": 13.96,
"eval_loss": 3.071357488632202,
"eval_runtime": 134.3182,
"eval_samples_per_second": 768.883,
"eval_steps_per_second": 48.058,
"step": 1712000
},
{
"epoch": 14.02,
"eval_loss": 3.0688371658325195,
"eval_runtime": 135.2782,
"eval_samples_per_second": 763.427,
"eval_steps_per_second": 47.716,
"step": 1720000
},
{
"epoch": 14.09,
"learning_rate": 2.801634286667223e-06,
"loss": 3.0166,
"step": 1728000
},
{
"epoch": 14.09,
"eval_loss": 3.05212664604187,
"eval_runtime": 135.0648,
"eval_samples_per_second": 764.633,
"eval_steps_per_second": 47.792,
"step": 1728000
},
{
"epoch": 14.16,
"eval_loss": 3.0538179874420166,
"eval_runtime": 134.2844,
"eval_samples_per_second": 769.076,
"eval_steps_per_second": 48.07,
"step": 1736000
},
{
"epoch": 14.22,
"learning_rate": 2.7349287084132413e-06,
"loss": 3.0134,
"step": 1744000
},
{
"epoch": 14.22,
"eval_loss": 3.064086437225342,
"eval_runtime": 135.4053,
"eval_samples_per_second": 762.71,
"eval_steps_per_second": 47.672,
"step": 1744000
},
{
"epoch": 14.29,
"eval_loss": 3.063884735107422,
"eval_runtime": 134.2537,
"eval_samples_per_second": 769.253,
"eval_steps_per_second": 48.081,
"step": 1752000
},
{
"epoch": 14.35,
"learning_rate": 2.66822313015926e-06,
"loss": 3.0032,
"step": 1760000
},
{
"epoch": 14.35,
"eval_loss": 3.0587823390960693,
"eval_runtime": 135.0451,
"eval_samples_per_second": 764.745,
"eval_steps_per_second": 47.799,
"step": 1760000
},
{
"epoch": 14.42,
"eval_loss": 3.064620018005371,
"eval_runtime": 134.9837,
"eval_samples_per_second": 765.092,
"eval_steps_per_second": 47.821,
"step": 1768000
},
{
"epoch": 14.48,
"learning_rate": 2.601517551905278e-06,
"loss": 3.0136,
"step": 1776000
},
{
"epoch": 14.48,
"eval_loss": 3.062889337539673,
"eval_runtime": 134.9119,
"eval_samples_per_second": 765.499,
"eval_steps_per_second": 47.846,
"step": 1776000
},
{
"epoch": 14.55,
"eval_loss": 3.0578110218048096,
"eval_runtime": 136.5221,
"eval_samples_per_second": 756.471,
"eval_steps_per_second": 47.282,
"step": 1784000
},
{
"epoch": 14.61,
"learning_rate": 2.5348119736512967e-06,
"loss": 3.0086,
"step": 1792000
},
{
"epoch": 14.61,
"eval_loss": 3.0528934001922607,
"eval_runtime": 135.6145,
"eval_samples_per_second": 761.534,
"eval_steps_per_second": 47.598,
"step": 1792000
},
{
"epoch": 14.68,
"eval_loss": 3.0615251064300537,
"eval_runtime": 135.3281,
"eval_samples_per_second": 763.145,
"eval_steps_per_second": 47.699,
"step": 1800000
},
{
"epoch": 14.74,
"learning_rate": 2.4681063953973154e-06,
"loss": 3.019,
"step": 1808000
},
{
"epoch": 14.74,
"eval_loss": 3.0565857887268066,
"eval_runtime": 134.9377,
"eval_samples_per_second": 765.353,
"eval_steps_per_second": 47.837,
"step": 1808000
},
{
"epoch": 14.81,
"eval_loss": 3.0658679008483887,
"eval_runtime": 135.2159,
"eval_samples_per_second": 763.778,
"eval_steps_per_second": 47.738,
"step": 1816000
},
{
"epoch": 14.87,
"learning_rate": 2.4014008171433335e-06,
"loss": 3.024,
"step": 1824000
},
{
"epoch": 14.87,
"eval_loss": 3.061464786529541,
"eval_runtime": 135.2789,
"eval_samples_per_second": 763.423,
"eval_steps_per_second": 47.716,
"step": 1824000
},
{
"epoch": 14.94,
"eval_loss": 3.0530033111572266,
"eval_runtime": 135.9081,
"eval_samples_per_second": 759.889,
"eval_steps_per_second": 47.495,
"step": 1832000
},
{
"epoch": 15.0,
"learning_rate": 2.334695238889352e-06,
"loss": 3.0089,
"step": 1840000
},
{
"epoch": 15.0,
"eval_loss": 3.0796985626220703,
"eval_runtime": 135.2715,
"eval_samples_per_second": 763.465,
"eval_steps_per_second": 47.719,
"step": 1840000
},
{
"epoch": 15.07,
"eval_loss": 3.0700411796569824,
"eval_runtime": 136.6273,
"eval_samples_per_second": 755.888,
"eval_steps_per_second": 47.245,
"step": 1848000
},
{
"epoch": 15.13,
"learning_rate": 2.2679896606353707e-06,
"loss": 3.0174,
"step": 1856000
},
{
"epoch": 15.13,
"eval_loss": 3.0748071670532227,
"eval_runtime": 136.44,
"eval_samples_per_second": 756.926,
"eval_steps_per_second": 47.31,
"step": 1856000
},
{
"epoch": 15.2,
"eval_loss": 3.064267635345459,
"eval_runtime": 135.3728,
"eval_samples_per_second": 762.894,
"eval_steps_per_second": 47.683,
"step": 1864000
},
{
"epoch": 15.26,
"learning_rate": 2.2012840823813894e-06,
"loss": 3.0176,
"step": 1872000
},
{
"epoch": 15.26,
"eval_loss": 3.0627517700195312,
"eval_runtime": 135.5713,
"eval_samples_per_second": 761.776,
"eval_steps_per_second": 47.613,
"step": 1872000
},
{
"epoch": 15.33,
"eval_loss": 3.0629563331604004,
"eval_runtime": 135.894,
"eval_samples_per_second": 759.967,
"eval_steps_per_second": 47.5,
"step": 1880000
},
{
"epoch": 15.39,
"learning_rate": 2.134578504127408e-06,
"loss": 3.0164,
"step": 1888000
},
{
"epoch": 15.39,
"eval_loss": 3.0721538066864014,
"eval_runtime": 135.9329,
"eval_samples_per_second": 759.75,
"eval_steps_per_second": 47.487,
"step": 1888000
},
{
"epoch": 15.46,
"eval_loss": 3.0744197368621826,
"eval_runtime": 135.4506,
"eval_samples_per_second": 762.455,
"eval_steps_per_second": 47.656,
"step": 1896000
},
{
"epoch": 15.53,
"learning_rate": 2.067872925873426e-06,
"loss": 3.0302,
"step": 1904000
},
{
"epoch": 15.53,
"eval_loss": 3.0739452838897705,
"eval_runtime": 135.8281,
"eval_samples_per_second": 760.336,
"eval_steps_per_second": 47.523,
"step": 1904000
},
{
"epoch": 15.59,
"eval_loss": 3.0700225830078125,
"eval_runtime": 136.0724,
"eval_samples_per_second": 758.971,
"eval_steps_per_second": 47.438,
"step": 1912000
},
{
"epoch": 15.66,
"learning_rate": 2.0011673476194448e-06,
"loss": 3.0204,
"step": 1920000
},
{
"epoch": 15.66,
"eval_loss": 3.0751476287841797,
"eval_runtime": 136.119,
"eval_samples_per_second": 758.711,
"eval_steps_per_second": 47.422,
"step": 1920000
},
{
"epoch": 15.72,
"eval_loss": 3.0597870349884033,
"eval_runtime": 136.3427,
"eval_samples_per_second": 757.466,
"eval_steps_per_second": 47.344,
"step": 1928000
},
{
"epoch": 15.79,
"learning_rate": 1.9344617693654634e-06,
"loss": 3.0147,
"step": 1936000
},
{
"epoch": 15.79,
"eval_loss": 3.0522122383117676,
"eval_runtime": 136.0082,
"eval_samples_per_second": 759.329,
"eval_steps_per_second": 47.46,
"step": 1936000
},
{
"epoch": 15.85,
"eval_loss": 3.065509557723999,
"eval_runtime": 136.1421,
"eval_samples_per_second": 758.582,
"eval_steps_per_second": 47.414,
"step": 1944000
},
{
"epoch": 15.92,
"learning_rate": 1.867756191111482e-06,
"loss": 3.0245,
"step": 1952000
},
{
"epoch": 15.92,
"eval_loss": 3.0568597316741943,
"eval_runtime": 136.6776,
"eval_samples_per_second": 755.61,
"eval_steps_per_second": 47.228,
"step": 1952000
},
{
"epoch": 15.98,
"eval_loss": 3.062300205230713,
"eval_runtime": 136.0258,
"eval_samples_per_second": 759.231,
"eval_steps_per_second": 47.454,
"step": 1960000
},
{
"epoch": 16.05,
"learning_rate": 1.8010506128575004e-06,
"loss": 3.0069,
"step": 1968000
},
{
"epoch": 16.05,
"eval_loss": 3.059983730316162,
"eval_runtime": 136.4638,
"eval_samples_per_second": 756.794,
"eval_steps_per_second": 47.302,
"step": 1968000
},
{
"epoch": 16.11,
"eval_loss": 3.0638678073883057,
"eval_runtime": 137.569,
"eval_samples_per_second": 750.714,
"eval_steps_per_second": 46.922,
"step": 1976000
},
{
"epoch": 16.18,
"learning_rate": 1.734345034603519e-06,
"loss": 3.0068,
"step": 1984000
},
{
"epoch": 16.18,
"eval_loss": 3.077465534210205,
"eval_runtime": 136.0507,
"eval_samples_per_second": 759.092,
"eval_steps_per_second": 47.446,
"step": 1984000
},
{
"epoch": 16.24,
"eval_loss": 3.0668864250183105,
"eval_runtime": 136.2552,
"eval_samples_per_second": 757.953,
"eval_steps_per_second": 47.374,
"step": 1992000
},
{
"epoch": 16.31,
"learning_rate": 1.6676394563495374e-06,
"loss": 3.0275,
"step": 2000000
},
{
"epoch": 16.31,
"eval_loss": 3.062725782394409,
"eval_runtime": 136.3436,
"eval_samples_per_second": 757.461,
"eval_steps_per_second": 47.344,
"step": 2000000
},
{
"epoch": 16.37,
"eval_loss": 3.0644514560699463,
"eval_runtime": 137.752,
"eval_samples_per_second": 749.717,
"eval_steps_per_second": 46.86,
"step": 2008000
},
{
"epoch": 16.44,
"learning_rate": 1.6009338780955558e-06,
"loss": 3.0164,
"step": 2016000
},
{
"epoch": 16.44,
"eval_loss": 3.0666866302490234,
"eval_runtime": 135.9171,
"eval_samples_per_second": 759.838,
"eval_steps_per_second": 47.492,
"step": 2016000
},
{
"epoch": 16.5,
"eval_loss": 3.048987627029419,
"eval_runtime": 136.0156,
"eval_samples_per_second": 759.288,
"eval_steps_per_second": 47.458,
"step": 2024000
},
{
"epoch": 16.57,
"learning_rate": 1.5342282998415744e-06,
"loss": 3.0148,
"step": 2032000
},
{
"epoch": 16.57,
"eval_loss": 3.061800003051758,
"eval_runtime": 137.187,
"eval_samples_per_second": 752.805,
"eval_steps_per_second": 47.053,
"step": 2032000
},
{
"epoch": 16.63,
"eval_loss": 3.0544731616973877,
"eval_runtime": 137.5014,
"eval_samples_per_second": 751.083,
"eval_steps_per_second": 46.945,
"step": 2040000
},
{
"epoch": 16.7,
"learning_rate": 1.4675227215875928e-06,
"loss": 3.022,
"step": 2048000
},
{
"epoch": 16.7,
"eval_loss": 3.0651352405548096,
"eval_runtime": 137.0124,
"eval_samples_per_second": 753.764,
"eval_steps_per_second": 47.113,
"step": 2048000
},
{
"epoch": 16.76,
"eval_loss": 3.068650484085083,
"eval_runtime": 137.324,
"eval_samples_per_second": 752.053,
"eval_steps_per_second": 47.006,
"step": 2056000
},
{
"epoch": 16.83,
"learning_rate": 1.4008171433336116e-06,
"loss": 3.0235,
"step": 2064000
},
{
"epoch": 16.83,
"eval_loss": 3.0515873432159424,
"eval_runtime": 137.8405,
"eval_samples_per_second": 749.235,
"eval_steps_per_second": 46.829,
"step": 2064000
},
{
"epoch": 16.89,
"eval_loss": 3.0761473178863525,
"eval_runtime": 137.435,
"eval_samples_per_second": 751.446,
"eval_steps_per_second": 46.968,
"step": 2072000
},
{
"epoch": 16.96,
"learning_rate": 1.33411156507963e-06,
"loss": 3.0194,
"step": 2080000
},
{
"epoch": 16.96,
"eval_loss": 3.0807414054870605,
"eval_runtime": 136.8928,
"eval_samples_per_second": 754.423,
"eval_steps_per_second": 47.154,
"step": 2080000
},
{
"epoch": 17.03,
"eval_loss": 3.060075283050537,
"eval_runtime": 136.6441,
"eval_samples_per_second": 755.796,
"eval_steps_per_second": 47.24,
"step": 2088000
},
{
"epoch": 17.09,
"learning_rate": 1.2674059868256484e-06,
"loss": 3.0142,
"step": 2096000
},
{
"epoch": 17.09,
"eval_loss": 3.0721395015716553,
"eval_runtime": 136.5201,
"eval_samples_per_second": 756.482,
"eval_steps_per_second": 47.282,
"step": 2096000
},
{
"epoch": 17.16,
"eval_loss": 3.0653316974639893,
"eval_runtime": 138.2812,
"eval_samples_per_second": 746.848,
"eval_steps_per_second": 46.68,
"step": 2104000
},
{
"epoch": 17.22,
"learning_rate": 1.2007004085716668e-06,
"loss": 3.0183,
"step": 2112000
},
{
"epoch": 17.22,
"eval_loss": 3.061683416366577,
"eval_runtime": 136.6654,
"eval_samples_per_second": 755.678,
"eval_steps_per_second": 47.232,
"step": 2112000
},
{
"epoch": 17.29,
"eval_loss": 3.062178373336792,
"eval_runtime": 137.9621,
"eval_samples_per_second": 748.575,
"eval_steps_per_second": 46.788,
"step": 2120000
},
{
"epoch": 17.35,
"learning_rate": 1.1339948303176854e-06,
"loss": 3.0092,
"step": 2128000
},
{
"epoch": 17.35,
"eval_loss": 3.068242311477661,
"eval_runtime": 137.4752,
"eval_samples_per_second": 751.227,
"eval_steps_per_second": 46.954,
"step": 2128000
},
{
"epoch": 17.42,
"eval_loss": 3.073157787322998,
"eval_runtime": 136.5003,
"eval_samples_per_second": 756.592,
"eval_steps_per_second": 47.289,
"step": 2136000
},
{
"epoch": 17.48,
"learning_rate": 1.067289252063704e-06,
"loss": 3.0071,
"step": 2144000
},
{
"epoch": 17.48,
"eval_loss": 3.0763022899627686,
"eval_runtime": 137.95,
"eval_samples_per_second": 748.641,
"eval_steps_per_second": 46.792,
"step": 2144000
},
{
"epoch": 17.55,
"eval_loss": 3.0674524307250977,
"eval_runtime": 137.3106,
"eval_samples_per_second": 752.127,
"eval_steps_per_second": 47.01,
"step": 2152000
},
{
"epoch": 17.61,
"learning_rate": 1.0005836738097224e-06,
"loss": 3.0272,
"step": 2160000
},
{
"epoch": 17.61,
"eval_loss": 3.0671498775482178,
"eval_runtime": 138.0717,
"eval_samples_per_second": 747.981,
"eval_steps_per_second": 46.751,
"step": 2160000
},
{
"epoch": 17.68,
"eval_loss": 3.062239170074463,
"eval_runtime": 138.0499,
"eval_samples_per_second": 748.099,
"eval_steps_per_second": 46.758,
"step": 2168000
},
{
"epoch": 17.74,
"learning_rate": 9.33878095555741e-07,
"loss": 3.0235,
"step": 2176000
},
{
"epoch": 17.74,
"eval_loss": 3.0789263248443604,
"eval_runtime": 137.5626,
"eval_samples_per_second": 750.749,
"eval_steps_per_second": 46.924,
"step": 2176000
},
{
"epoch": 17.81,
"eval_loss": 3.062295436859131,
"eval_runtime": 138.8694,
"eval_samples_per_second": 743.684,
"eval_steps_per_second": 46.483,
"step": 2184000
},
{
"epoch": 17.87,
"learning_rate": 8.671725173017595e-07,
"loss": 3.0179,
"step": 2192000
},
{
"epoch": 17.87,
"eval_loss": 3.078376054763794,
"eval_runtime": 136.985,
"eval_samples_per_second": 753.914,
"eval_steps_per_second": 47.122,
"step": 2192000
},
{
"epoch": 17.94,
"eval_loss": 3.062905788421631,
"eval_runtime": 137.7472,
"eval_samples_per_second": 749.743,
"eval_steps_per_second": 46.861,
"step": 2200000
},
{
"epoch": 18.0,
"learning_rate": 8.004669390477779e-07,
"loss": 3.0209,
"step": 2208000
},
{
"epoch": 18.0,
"eval_loss": 3.0731070041656494,
"eval_runtime": 138.0906,
"eval_samples_per_second": 747.878,
"eval_steps_per_second": 46.745,
"step": 2208000
},
{
"epoch": 18.07,
"eval_loss": 3.0945563316345215,
"eval_runtime": 137.4959,
"eval_samples_per_second": 751.113,
"eval_steps_per_second": 46.947,
"step": 2216000
},
{
"epoch": 18.13,
"learning_rate": 7.337613607937964e-07,
"loss": 3.0237,
"step": 2224000
},
{
"epoch": 18.13,
"eval_loss": 3.065315008163452,
"eval_runtime": 138.0159,
"eval_samples_per_second": 748.283,
"eval_steps_per_second": 46.77,
"step": 2224000
},
{
"epoch": 18.2,
"eval_loss": 3.0589962005615234,
"eval_runtime": 137.6387,
"eval_samples_per_second": 750.334,
"eval_steps_per_second": 46.898,
"step": 2232000
},
{
"epoch": 18.26,
"learning_rate": 6.67055782539815e-07,
"loss": 3.0164,
"step": 2240000
},
{
"epoch": 18.26,
"eval_loss": 3.070741891860962,
"eval_runtime": 138.4523,
"eval_samples_per_second": 745.925,
"eval_steps_per_second": 46.623,
"step": 2240000
},
{
"epoch": 18.33,
"eval_loss": 3.0545763969421387,
"eval_runtime": 138.1194,
"eval_samples_per_second": 747.723,
"eval_steps_per_second": 46.735,
"step": 2248000
},
{
"epoch": 18.4,
"learning_rate": 6.003502042858334e-07,
"loss": 3.0206,
"step": 2256000
},
{
"epoch": 18.4,
"eval_loss": 3.0741806030273438,
"eval_runtime": 138.8634,
"eval_samples_per_second": 743.717,
"eval_steps_per_second": 46.485,
"step": 2256000
},
{
"epoch": 18.46,
"eval_loss": 3.0793333053588867,
"eval_runtime": 138.6181,
"eval_samples_per_second": 745.032,
"eval_steps_per_second": 46.567,
"step": 2264000
},
{
"epoch": 18.53,
"learning_rate": 5.33644626031852e-07,
"loss": 3.0138,
"step": 2272000
},
{
"epoch": 18.53,
"eval_loss": 3.05604887008667,
"eval_runtime": 139.1325,
"eval_samples_per_second": 742.278,
"eval_steps_per_second": 46.395,
"step": 2272000
},
{
"epoch": 18.59,
"eval_loss": 3.086977958679199,
"eval_runtime": 137.8163,
"eval_samples_per_second": 749.367,
"eval_steps_per_second": 46.838,
"step": 2280000
},
{
"epoch": 18.66,
"learning_rate": 4.669390477778705e-07,
"loss": 3.0377,
"step": 2288000
},
{
"epoch": 18.66,
"eval_loss": 3.07423996925354,
"eval_runtime": 137.4738,
"eval_samples_per_second": 751.234,
"eval_steps_per_second": 46.954,
"step": 2288000
},
{
"epoch": 18.72,
"eval_loss": 3.0675508975982666,
"eval_runtime": 138.0596,
"eval_samples_per_second": 748.046,
"eval_steps_per_second": 46.755,
"step": 2296000
},
{
"epoch": 18.79,
"learning_rate": 4.0023346952388894e-07,
"loss": 3.0227,
"step": 2304000
},
{
"epoch": 18.79,
"eval_loss": 3.06254506111145,
"eval_runtime": 139.3504,
"eval_samples_per_second": 741.117,
"eval_steps_per_second": 46.322,
"step": 2304000
},
{
"epoch": 18.85,
"eval_loss": 3.0736207962036133,
"eval_runtime": 139.5433,
"eval_samples_per_second": 740.093,
"eval_steps_per_second": 46.258,
"step": 2312000
},
{
"epoch": 18.92,
"learning_rate": 3.335278912699075e-07,
"loss": 3.0359,
"step": 2320000
},
{
"epoch": 18.92,
"eval_loss": 3.0800607204437256,
"eval_runtime": 138.2846,
"eval_samples_per_second": 746.829,
"eval_steps_per_second": 46.679,
"step": 2320000
},
{
"epoch": 18.98,
"eval_loss": 3.0709972381591797,
"eval_runtime": 139.0505,
"eval_samples_per_second": 742.716,
"eval_steps_per_second": 46.422,
"step": 2328000
},
{
"epoch": 19.05,
"learning_rate": 2.66822313015926e-07,
"loss": 3.0248,
"step": 2336000
},
{
"epoch": 19.05,
"eval_loss": 3.069218158721924,
"eval_runtime": 138.9779,
"eval_samples_per_second": 743.104,
"eval_steps_per_second": 46.446,
"step": 2336000
},
{
"epoch": 19.11,
"eval_loss": 3.067660331726074,
"eval_runtime": 138.4099,
"eval_samples_per_second": 746.154,
"eval_steps_per_second": 46.637,
"step": 2344000
},
{
"epoch": 19.18,
"learning_rate": 2.0011673476194447e-07,
"loss": 3.0235,
"step": 2352000
},
{
"epoch": 19.18,
"eval_loss": 3.089552879333496,
"eval_runtime": 138.573,
"eval_samples_per_second": 745.275,
"eval_steps_per_second": 46.582,
"step": 2352000
},
{
"epoch": 19.24,
"eval_loss": 3.0777699947357178,
"eval_runtime": 140.4362,
"eval_samples_per_second": 735.388,
"eval_steps_per_second": 45.964,
"step": 2360000
},
{
"epoch": 19.31,
"learning_rate": 1.33411156507963e-07,
"loss": 3.0187,
"step": 2368000
},
{
"epoch": 19.31,
"eval_loss": 3.069951295852661,
"eval_runtime": 140.0319,
"eval_samples_per_second": 737.511,
"eval_steps_per_second": 46.097,
"step": 2368000
},
{
"epoch": 19.37,
"eval_loss": 3.0742506980895996,
"eval_runtime": 139.3192,
"eval_samples_per_second": 741.283,
"eval_steps_per_second": 46.332,
"step": 2376000
},
{
"epoch": 19.44,
"learning_rate": 6.67055782539815e-08,
"loss": 3.0189,
"step": 2384000
},
{
"epoch": 19.44,
"eval_loss": 3.0780065059661865,
"eval_runtime": 138.4114,
"eval_samples_per_second": 746.145,
"eval_steps_per_second": 46.636,
"step": 2384000
},
{
"epoch": 19.5,
"eval_loss": 3.0866599082946777,
"eval_runtime": 138.3665,
"eval_samples_per_second": 746.387,
"eval_steps_per_second": 46.651,
"step": 2392000
},
{
"epoch": 19.57,
"learning_rate": 0.0,
"loss": 3.0184,
"step": 2400000
},
{
"epoch": 19.57,
"eval_loss": 3.079288959503174,
"eval_runtime": 138.4519,
"eval_samples_per_second": 745.927,
"eval_steps_per_second": 46.623,
"step": 2400000
},
{
"epoch": 19.57,
"step": 2400000,
"total_flos": 7.178820925216543e+17,
"train_loss": 2.9400340771484377,
"train_runtime": 198144.865,
"train_samples_per_second": 193.798,
"train_steps_per_second": 12.112
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 20,
"save_steps": 32000,
"total_flos": 7.178820925216543e+17,
"trial_name": null,
"trial_params": null
}