bertweet-2020-Q1-filtered / trainer_state.json
DouglasPontes's picture
End of training
27d0002
{
"best_metric": 2.671285629272461,
"best_model_checkpoint": "./model_tweets_2020_Q2/checkpoint-32000",
"epoch": 19.569471624266146,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"eval_loss": 2.635031223297119,
"eval_runtime": 126.0563,
"eval_samples_per_second": 819.277,
"eval_steps_per_second": 51.207,
"step": 8000
},
{
"epoch": 0.13,
"learning_rate": 9.939131159843243e-06,
"loss": 2.7848,
"step": 16000
},
{
"epoch": 0.13,
"eval_loss": 2.6555588245391846,
"eval_runtime": 126.7417,
"eval_samples_per_second": 814.846,
"eval_steps_per_second": 50.93,
"step": 16000
},
{
"epoch": 0.2,
"eval_loss": 2.6695027351379395,
"eval_runtime": 125.9524,
"eval_samples_per_second": 819.953,
"eval_steps_per_second": 51.25,
"step": 24000
},
{
"epoch": 0.26,
"learning_rate": 9.872425581589261e-06,
"loss": 2.7545,
"step": 32000
},
{
"epoch": 0.26,
"eval_loss": 2.671285629272461,
"eval_runtime": 126.9886,
"eval_samples_per_second": 813.262,
"eval_steps_per_second": 50.831,
"step": 32000
},
{
"epoch": 0.33,
"eval_loss": 2.708911895751953,
"eval_runtime": 126.0433,
"eval_samples_per_second": 819.361,
"eval_steps_per_second": 51.213,
"step": 40000
},
{
"epoch": 0.39,
"learning_rate": 9.80572000333528e-06,
"loss": 2.7717,
"step": 48000
},
{
"epoch": 0.39,
"eval_loss": 2.7143805027008057,
"eval_runtime": 126.2378,
"eval_samples_per_second": 818.099,
"eval_steps_per_second": 51.134,
"step": 48000
},
{
"epoch": 0.46,
"eval_loss": 2.7240307331085205,
"eval_runtime": 125.5002,
"eval_samples_per_second": 822.907,
"eval_steps_per_second": 51.434,
"step": 56000
},
{
"epoch": 0.52,
"learning_rate": 9.739014425081299e-06,
"loss": 2.8043,
"step": 64000
},
{
"epoch": 0.52,
"eval_loss": 2.749925374984741,
"eval_runtime": 126.3275,
"eval_samples_per_second": 817.518,
"eval_steps_per_second": 51.097,
"step": 64000
},
{
"epoch": 0.59,
"eval_loss": 2.770448684692383,
"eval_runtime": 115.1543,
"eval_samples_per_second": 896.84,
"eval_steps_per_second": 56.055,
"step": 72000
},
{
"epoch": 0.65,
"learning_rate": 9.672308846827316e-06,
"loss": 2.8401,
"step": 80000
},
{
"epoch": 0.65,
"eval_loss": 2.782008409500122,
"eval_runtime": 116.1441,
"eval_samples_per_second": 889.197,
"eval_steps_per_second": 55.578,
"step": 80000
},
{
"epoch": 0.72,
"eval_loss": 2.8068478107452393,
"eval_runtime": 116.2984,
"eval_samples_per_second": 888.017,
"eval_steps_per_second": 55.504,
"step": 88000
},
{
"epoch": 0.78,
"learning_rate": 9.605603268573334e-06,
"loss": 2.8723,
"step": 96000
},
{
"epoch": 0.78,
"eval_loss": 2.8150370121002197,
"eval_runtime": 116.0456,
"eval_samples_per_second": 889.952,
"eval_steps_per_second": 55.625,
"step": 96000
},
{
"epoch": 0.85,
"eval_loss": 2.8410351276397705,
"eval_runtime": 114.7666,
"eval_samples_per_second": 899.87,
"eval_steps_per_second": 56.245,
"step": 104000
},
{
"epoch": 0.91,
"learning_rate": 9.538897690319354e-06,
"loss": 2.9004,
"step": 112000
},
{
"epoch": 0.91,
"eval_loss": 2.865703582763672,
"eval_runtime": 115.4628,
"eval_samples_per_second": 894.444,
"eval_steps_per_second": 55.905,
"step": 112000
},
{
"epoch": 0.98,
"eval_loss": 2.882617950439453,
"eval_runtime": 116.5627,
"eval_samples_per_second": 886.004,
"eval_steps_per_second": 55.378,
"step": 120000
},
{
"epoch": 1.04,
"learning_rate": 9.472192112065373e-06,
"loss": 2.9396,
"step": 128000
},
{
"epoch": 1.04,
"eval_loss": 2.9071033000946045,
"eval_runtime": 116.4638,
"eval_samples_per_second": 886.756,
"eval_steps_per_second": 55.425,
"step": 128000
},
{
"epoch": 1.11,
"eval_loss": 2.949030876159668,
"eval_runtime": 115.1354,
"eval_samples_per_second": 896.987,
"eval_steps_per_second": 56.064,
"step": 136000
},
{
"epoch": 1.17,
"learning_rate": 9.405486533811392e-06,
"loss": 2.9801,
"step": 144000
},
{
"epoch": 1.17,
"eval_loss": 2.951450824737549,
"eval_runtime": 114.8755,
"eval_samples_per_second": 899.017,
"eval_steps_per_second": 56.191,
"step": 144000
},
{
"epoch": 1.24,
"eval_loss": 2.9862585067749023,
"eval_runtime": 116.1529,
"eval_samples_per_second": 889.129,
"eval_steps_per_second": 55.573,
"step": 152000
},
{
"epoch": 1.3,
"learning_rate": 9.338780955557409e-06,
"loss": 3.0173,
"step": 160000
},
{
"epoch": 1.3,
"eval_loss": 2.991586685180664,
"eval_runtime": 116.0798,
"eval_samples_per_second": 889.69,
"eval_steps_per_second": 55.608,
"step": 160000
},
{
"epoch": 1.37,
"eval_loss": 3.0230655670166016,
"eval_runtime": 115.2701,
"eval_samples_per_second": 895.939,
"eval_steps_per_second": 55.999,
"step": 168000
},
{
"epoch": 1.44,
"learning_rate": 9.272075377303427e-06,
"loss": 3.0674,
"step": 176000
},
{
"epoch": 1.44,
"eval_loss": 3.0447049140930176,
"eval_runtime": 115.1489,
"eval_samples_per_second": 896.882,
"eval_steps_per_second": 56.058,
"step": 176000
},
{
"epoch": 1.5,
"eval_loss": 3.0638155937194824,
"eval_runtime": 116.1134,
"eval_samples_per_second": 889.432,
"eval_steps_per_second": 55.592,
"step": 184000
},
{
"epoch": 1.57,
"learning_rate": 9.205369799049446e-06,
"loss": 3.1059,
"step": 192000
},
{
"epoch": 1.57,
"eval_loss": 3.094524383544922,
"eval_runtime": 114.9725,
"eval_samples_per_second": 898.258,
"eval_steps_per_second": 56.144,
"step": 192000
},
{
"epoch": 1.63,
"eval_loss": 3.1008002758026123,
"eval_runtime": 116.6453,
"eval_samples_per_second": 885.377,
"eval_steps_per_second": 55.339,
"step": 200000
},
{
"epoch": 1.7,
"learning_rate": 9.138664220795464e-06,
"loss": 3.1283,
"step": 208000
},
{
"epoch": 1.7,
"eval_loss": 3.1256680488586426,
"eval_runtime": 115.0624,
"eval_samples_per_second": 897.556,
"eval_steps_per_second": 56.1,
"step": 208000
},
{
"epoch": 1.76,
"eval_loss": 3.1262004375457764,
"eval_runtime": 114.9392,
"eval_samples_per_second": 898.518,
"eval_steps_per_second": 56.16,
"step": 216000
},
{
"epoch": 1.83,
"learning_rate": 9.071958642541483e-06,
"loss": 3.1684,
"step": 224000
},
{
"epoch": 1.83,
"eval_loss": 3.152285099029541,
"eval_runtime": 115.5854,
"eval_samples_per_second": 893.495,
"eval_steps_per_second": 55.846,
"step": 224000
},
{
"epoch": 1.89,
"eval_loss": 3.1842401027679443,
"eval_runtime": 114.9021,
"eval_samples_per_second": 898.809,
"eval_steps_per_second": 56.178,
"step": 232000
},
{
"epoch": 1.96,
"learning_rate": 9.005253064287502e-06,
"loss": 3.1966,
"step": 240000
},
{
"epoch": 1.96,
"eval_loss": 3.1820068359375,
"eval_runtime": 117.5401,
"eval_samples_per_second": 878.637,
"eval_steps_per_second": 54.917,
"step": 240000
},
{
"epoch": 2.02,
"eval_loss": 3.197575569152832,
"eval_runtime": 119.2185,
"eval_samples_per_second": 866.266,
"eval_steps_per_second": 54.144,
"step": 248000
},
{
"epoch": 2.09,
"learning_rate": 8.93854748603352e-06,
"loss": 3.2055,
"step": 256000
},
{
"epoch": 2.09,
"eval_loss": 3.2012782096862793,
"eval_runtime": 116.0617,
"eval_samples_per_second": 889.829,
"eval_steps_per_second": 55.617,
"step": 256000
},
{
"epoch": 2.15,
"eval_loss": 3.219731092453003,
"eval_runtime": 115.3459,
"eval_samples_per_second": 895.351,
"eval_steps_per_second": 55.962,
"step": 264000
},
{
"epoch": 2.22,
"learning_rate": 8.871841907779539e-06,
"loss": 3.2186,
"step": 272000
},
{
"epoch": 2.22,
"eval_loss": 3.2258596420288086,
"eval_runtime": 117.0782,
"eval_samples_per_second": 882.102,
"eval_steps_per_second": 55.134,
"step": 272000
},
{
"epoch": 2.28,
"eval_loss": 3.2410128116607666,
"eval_runtime": 115.7081,
"eval_samples_per_second": 892.547,
"eval_steps_per_second": 55.787,
"step": 280000
},
{
"epoch": 2.35,
"learning_rate": 8.805136329525557e-06,
"loss": 3.2518,
"step": 288000
},
{
"epoch": 2.35,
"eval_loss": 3.2448806762695312,
"eval_runtime": 116.2706,
"eval_samples_per_second": 888.23,
"eval_steps_per_second": 55.517,
"step": 288000
},
{
"epoch": 2.41,
"eval_loss": 3.2685933113098145,
"eval_runtime": 117.0296,
"eval_samples_per_second": 882.469,
"eval_steps_per_second": 55.157,
"step": 296000
},
{
"epoch": 2.48,
"learning_rate": 8.738430751271576e-06,
"loss": 3.2705,
"step": 304000
},
{
"epoch": 2.48,
"eval_loss": 3.270232915878296,
"eval_runtime": 115.7748,
"eval_samples_per_second": 892.034,
"eval_steps_per_second": 55.755,
"step": 304000
},
{
"epoch": 2.54,
"eval_loss": 3.271563768386841,
"eval_runtime": 114.7956,
"eval_samples_per_second": 899.643,
"eval_steps_per_second": 56.23,
"step": 312000
},
{
"epoch": 2.61,
"learning_rate": 8.671725173017595e-06,
"loss": 3.2677,
"step": 320000
},
{
"epoch": 2.61,
"eval_loss": 3.2934534549713135,
"eval_runtime": 116.4472,
"eval_samples_per_second": 886.883,
"eval_steps_per_second": 55.433,
"step": 320000
},
{
"epoch": 2.67,
"eval_loss": 3.2941575050354004,
"eval_runtime": 115.658,
"eval_samples_per_second": 892.934,
"eval_steps_per_second": 55.811,
"step": 328000
},
{
"epoch": 2.74,
"learning_rate": 8.605019594763613e-06,
"loss": 3.2955,
"step": 336000
},
{
"epoch": 2.74,
"eval_loss": 3.304429054260254,
"eval_runtime": 115.4488,
"eval_samples_per_second": 894.552,
"eval_steps_per_second": 55.912,
"step": 336000
},
{
"epoch": 2.8,
"eval_loss": 3.3109662532806396,
"eval_runtime": 114.8039,
"eval_samples_per_second": 899.577,
"eval_steps_per_second": 56.226,
"step": 344000
},
{
"epoch": 2.87,
"learning_rate": 8.538314016509632e-06,
"loss": 3.2966,
"step": 352000
},
{
"epoch": 2.87,
"eval_loss": 3.3053431510925293,
"eval_runtime": 115.0477,
"eval_samples_per_second": 897.671,
"eval_steps_per_second": 56.107,
"step": 352000
},
{
"epoch": 2.94,
"eval_loss": 3.3276007175445557,
"eval_runtime": 115.8876,
"eval_samples_per_second": 891.165,
"eval_steps_per_second": 55.701,
"step": 360000
},
{
"epoch": 3.0,
"learning_rate": 8.471608438255649e-06,
"loss": 3.311,
"step": 368000
},
{
"epoch": 3.0,
"eval_loss": 3.3256120681762695,
"eval_runtime": 117.3196,
"eval_samples_per_second": 880.288,
"eval_steps_per_second": 55.021,
"step": 368000
},
{
"epoch": 3.07,
"eval_loss": 3.3292236328125,
"eval_runtime": 117.5646,
"eval_samples_per_second": 878.453,
"eval_steps_per_second": 54.906,
"step": 376000
},
{
"epoch": 3.13,
"learning_rate": 8.404902860001667e-06,
"loss": 3.3217,
"step": 384000
},
{
"epoch": 3.13,
"eval_loss": 3.333477258682251,
"eval_runtime": 116.7284,
"eval_samples_per_second": 884.746,
"eval_steps_per_second": 55.299,
"step": 384000
},
{
"epoch": 3.2,
"eval_loss": 3.316025972366333,
"eval_runtime": 118.1544,
"eval_samples_per_second": 874.068,
"eval_steps_per_second": 54.632,
"step": 392000
},
{
"epoch": 3.26,
"learning_rate": 8.338197281747686e-06,
"loss": 3.3145,
"step": 400000
},
{
"epoch": 3.26,
"eval_loss": 3.337838649749756,
"eval_runtime": 116.066,
"eval_samples_per_second": 889.796,
"eval_steps_per_second": 55.615,
"step": 400000
},
{
"epoch": 3.33,
"eval_loss": 3.3306798934936523,
"eval_runtime": 117.4533,
"eval_samples_per_second": 879.285,
"eval_steps_per_second": 54.958,
"step": 408000
},
{
"epoch": 3.39,
"learning_rate": 8.271491703493705e-06,
"loss": 3.3246,
"step": 416000
},
{
"epoch": 3.39,
"eval_loss": 3.342693567276001,
"eval_runtime": 115.6289,
"eval_samples_per_second": 893.159,
"eval_steps_per_second": 55.825,
"step": 416000
},
{
"epoch": 3.46,
"eval_loss": 3.3543155193328857,
"eval_runtime": 115.7056,
"eval_samples_per_second": 892.567,
"eval_steps_per_second": 55.788,
"step": 424000
},
{
"epoch": 3.52,
"learning_rate": 8.204786125239725e-06,
"loss": 3.3131,
"step": 432000
},
{
"epoch": 3.52,
"eval_loss": 3.340524196624756,
"eval_runtime": 116.2105,
"eval_samples_per_second": 888.689,
"eval_steps_per_second": 55.546,
"step": 432000
},
{
"epoch": 3.59,
"eval_loss": 3.336106777191162,
"eval_runtime": 114.9141,
"eval_samples_per_second": 898.714,
"eval_steps_per_second": 56.172,
"step": 440000
},
{
"epoch": 3.65,
"learning_rate": 8.138080546985743e-06,
"loss": 3.3266,
"step": 448000
},
{
"epoch": 3.65,
"eval_loss": 3.370443344116211,
"eval_runtime": 115.193,
"eval_samples_per_second": 896.539,
"eval_steps_per_second": 56.036,
"step": 448000
},
{
"epoch": 3.72,
"eval_loss": 3.354923963546753,
"eval_runtime": 115.5245,
"eval_samples_per_second": 893.967,
"eval_steps_per_second": 55.876,
"step": 456000
},
{
"epoch": 3.78,
"learning_rate": 8.07137496873176e-06,
"loss": 3.3358,
"step": 464000
},
{
"epoch": 3.78,
"eval_loss": 3.360276937484741,
"eval_runtime": 116.1443,
"eval_samples_per_second": 889.196,
"eval_steps_per_second": 55.577,
"step": 464000
},
{
"epoch": 3.85,
"eval_loss": 3.3641881942749023,
"eval_runtime": 115.4508,
"eval_samples_per_second": 894.537,
"eval_steps_per_second": 55.911,
"step": 472000
},
{
"epoch": 3.91,
"learning_rate": 8.004669390477779e-06,
"loss": 3.3385,
"step": 480000
},
{
"epoch": 3.91,
"eval_loss": 3.3572633266448975,
"eval_runtime": 114.9449,
"eval_samples_per_second": 898.474,
"eval_steps_per_second": 56.157,
"step": 480000
},
{
"epoch": 3.98,
"eval_loss": 3.3658275604248047,
"eval_runtime": 115.0066,
"eval_samples_per_second": 897.992,
"eval_steps_per_second": 56.127,
"step": 488000
},
{
"epoch": 4.04,
"learning_rate": 7.937963812223798e-06,
"loss": 3.3375,
"step": 496000
},
{
"epoch": 4.04,
"eval_loss": 3.345881700515747,
"eval_runtime": 115.316,
"eval_samples_per_second": 895.583,
"eval_steps_per_second": 55.977,
"step": 496000
},
{
"epoch": 4.11,
"eval_loss": 3.3702762126922607,
"eval_runtime": 114.9631,
"eval_samples_per_second": 898.331,
"eval_steps_per_second": 56.148,
"step": 504000
},
{
"epoch": 4.17,
"learning_rate": 7.871258233969816e-06,
"loss": 3.3237,
"step": 512000
},
{
"epoch": 4.17,
"eval_loss": 3.3564202785491943,
"eval_runtime": 116.3254,
"eval_samples_per_second": 887.811,
"eval_steps_per_second": 55.491,
"step": 512000
},
{
"epoch": 4.24,
"eval_loss": 3.3553359508514404,
"eval_runtime": 115.6968,
"eval_samples_per_second": 892.635,
"eval_steps_per_second": 55.792,
"step": 520000
},
{
"epoch": 4.31,
"learning_rate": 7.804552655715835e-06,
"loss": 3.34,
"step": 528000
},
{
"epoch": 4.31,
"eval_loss": 3.35756778717041,
"eval_runtime": 114.9307,
"eval_samples_per_second": 898.585,
"eval_steps_per_second": 56.164,
"step": 528000
},
{
"epoch": 4.37,
"eval_loss": 3.3548436164855957,
"eval_runtime": 116.9698,
"eval_samples_per_second": 882.92,
"eval_steps_per_second": 55.185,
"step": 536000
},
{
"epoch": 4.44,
"learning_rate": 7.737847077461853e-06,
"loss": 3.3247,
"step": 544000
},
{
"epoch": 4.44,
"eval_loss": 3.3525540828704834,
"eval_runtime": 114.951,
"eval_samples_per_second": 898.427,
"eval_steps_per_second": 56.154,
"step": 544000
},
{
"epoch": 4.5,
"eval_loss": 3.367372512817383,
"eval_runtime": 116.891,
"eval_samples_per_second": 883.515,
"eval_steps_per_second": 55.222,
"step": 552000
},
{
"epoch": 4.57,
"learning_rate": 7.671141499207872e-06,
"loss": 3.318,
"step": 560000
},
{
"epoch": 4.57,
"eval_loss": 3.3607981204986572,
"eval_runtime": 115.5047,
"eval_samples_per_second": 894.12,
"eval_steps_per_second": 55.885,
"step": 560000
},
{
"epoch": 4.63,
"eval_loss": 3.3527328968048096,
"eval_runtime": 116.278,
"eval_samples_per_second": 888.173,
"eval_steps_per_second": 55.514,
"step": 568000
},
{
"epoch": 4.7,
"learning_rate": 7.604435920953891e-06,
"loss": 3.3318,
"step": 576000
},
{
"epoch": 4.7,
"eval_loss": 3.3600049018859863,
"eval_runtime": 115.0864,
"eval_samples_per_second": 897.369,
"eval_steps_per_second": 56.088,
"step": 576000
},
{
"epoch": 4.76,
"eval_loss": 3.366177797317505,
"eval_runtime": 116.1802,
"eval_samples_per_second": 888.921,
"eval_steps_per_second": 55.56,
"step": 584000
},
{
"epoch": 4.83,
"learning_rate": 7.537730342699909e-06,
"loss": 3.3211,
"step": 592000
},
{
"epoch": 4.83,
"eval_loss": 3.36027193069458,
"eval_runtime": 115.5036,
"eval_samples_per_second": 894.128,
"eval_steps_per_second": 55.886,
"step": 592000
},
{
"epoch": 4.89,
"eval_loss": 3.364029884338379,
"eval_runtime": 114.9019,
"eval_samples_per_second": 898.81,
"eval_steps_per_second": 56.178,
"step": 600000
},
{
"epoch": 4.96,
"learning_rate": 7.471024764445928e-06,
"loss": 3.3344,
"step": 608000
},
{
"epoch": 4.96,
"eval_loss": 3.376020669937134,
"eval_runtime": 115.5882,
"eval_samples_per_second": 893.473,
"eval_steps_per_second": 55.845,
"step": 608000
},
{
"epoch": 5.02,
"eval_loss": 3.3876428604125977,
"eval_runtime": 115.0301,
"eval_samples_per_second": 897.809,
"eval_steps_per_second": 56.116,
"step": 616000
},
{
"epoch": 5.09,
"learning_rate": 7.4043191861919465e-06,
"loss": 3.331,
"step": 624000
},
{
"epoch": 5.09,
"eval_loss": 3.351862668991089,
"eval_runtime": 115.49,
"eval_samples_per_second": 894.233,
"eval_steps_per_second": 55.892,
"step": 624000
},
{
"epoch": 5.15,
"eval_loss": 3.373405933380127,
"eval_runtime": 115.9525,
"eval_samples_per_second": 890.666,
"eval_steps_per_second": 55.669,
"step": 632000
},
{
"epoch": 5.22,
"learning_rate": 7.337613607937964e-06,
"loss": 3.3293,
"step": 640000
},
{
"epoch": 5.22,
"eval_loss": 3.373460531234741,
"eval_runtime": 115.1854,
"eval_samples_per_second": 896.598,
"eval_steps_per_second": 56.04,
"step": 640000
},
{
"epoch": 5.28,
"eval_loss": 3.3703157901763916,
"eval_runtime": 115.0036,
"eval_samples_per_second": 898.016,
"eval_steps_per_second": 56.129,
"step": 648000
},
{
"epoch": 5.35,
"learning_rate": 7.270908029683983e-06,
"loss": 3.3317,
"step": 656000
},
{
"epoch": 5.35,
"eval_loss": 3.382647752761841,
"eval_runtime": 115.8086,
"eval_samples_per_second": 891.773,
"eval_steps_per_second": 55.739,
"step": 656000
},
{
"epoch": 5.41,
"eval_loss": 3.3825886249542236,
"eval_runtime": 115.3628,
"eval_samples_per_second": 895.219,
"eval_steps_per_second": 55.954,
"step": 664000
},
{
"epoch": 5.48,
"learning_rate": 7.2042024514300015e-06,
"loss": 3.3291,
"step": 672000
},
{
"epoch": 5.48,
"eval_loss": 3.391868829727173,
"eval_runtime": 115.4028,
"eval_samples_per_second": 894.909,
"eval_steps_per_second": 55.935,
"step": 672000
},
{
"epoch": 5.54,
"eval_loss": 3.378626585006714,
"eval_runtime": 115.4498,
"eval_samples_per_second": 894.545,
"eval_steps_per_second": 55.912,
"step": 680000
},
{
"epoch": 5.61,
"learning_rate": 7.13749687317602e-06,
"loss": 3.3423,
"step": 688000
},
{
"epoch": 5.61,
"eval_loss": 3.377542734146118,
"eval_runtime": 115.2629,
"eval_samples_per_second": 895.995,
"eval_steps_per_second": 56.002,
"step": 688000
},
{
"epoch": 5.68,
"eval_loss": 3.373429298400879,
"eval_runtime": 115.5205,
"eval_samples_per_second": 893.997,
"eval_steps_per_second": 55.878,
"step": 696000
},
{
"epoch": 5.74,
"learning_rate": 7.070791294922038e-06,
"loss": 3.3364,
"step": 704000
},
{
"epoch": 5.74,
"eval_loss": 3.372532367706299,
"eval_runtime": 115.5543,
"eval_samples_per_second": 893.735,
"eval_steps_per_second": 55.861,
"step": 704000
},
{
"epoch": 5.81,
"eval_loss": 3.3855302333831787,
"eval_runtime": 115.9379,
"eval_samples_per_second": 890.778,
"eval_steps_per_second": 55.676,
"step": 712000
},
{
"epoch": 5.87,
"learning_rate": 7.0040857166680564e-06,
"loss": 3.347,
"step": 720000
},
{
"epoch": 5.87,
"eval_loss": 3.3774046897888184,
"eval_runtime": 114.6511,
"eval_samples_per_second": 900.776,
"eval_steps_per_second": 56.301,
"step": 720000
},
{
"epoch": 5.94,
"eval_loss": 3.3717195987701416,
"eval_runtime": 115.9173,
"eval_samples_per_second": 890.937,
"eval_steps_per_second": 55.686,
"step": 728000
},
{
"epoch": 6.0,
"learning_rate": 6.937380138414076e-06,
"loss": 3.3311,
"step": 736000
},
{
"epoch": 6.0,
"eval_loss": 3.392944097518921,
"eval_runtime": 115.7013,
"eval_samples_per_second": 892.6,
"eval_steps_per_second": 55.79,
"step": 736000
},
{
"epoch": 6.07,
"eval_loss": 3.389941930770874,
"eval_runtime": 117.4363,
"eval_samples_per_second": 879.413,
"eval_steps_per_second": 54.966,
"step": 744000
},
{
"epoch": 6.13,
"learning_rate": 6.8706745601600945e-06,
"loss": 3.3445,
"step": 752000
},
{
"epoch": 6.13,
"eval_loss": 3.3985016345977783,
"eval_runtime": 115.5779,
"eval_samples_per_second": 893.553,
"eval_steps_per_second": 55.85,
"step": 752000
},
{
"epoch": 6.2,
"eval_loss": 3.3865506649017334,
"eval_runtime": 114.8487,
"eval_samples_per_second": 899.227,
"eval_steps_per_second": 56.204,
"step": 760000
},
{
"epoch": 6.26,
"learning_rate": 6.803968981906113e-06,
"loss": 3.345,
"step": 768000
},
{
"epoch": 6.26,
"eval_loss": 3.3942770957946777,
"eval_runtime": 115.533,
"eval_samples_per_second": 893.901,
"eval_steps_per_second": 55.871,
"step": 768000
},
{
"epoch": 6.33,
"eval_loss": 3.373379945755005,
"eval_runtime": 115.2598,
"eval_samples_per_second": 896.019,
"eval_steps_per_second": 56.004,
"step": 776000
},
{
"epoch": 6.39,
"learning_rate": 6.737263403652131e-06,
"loss": 3.3427,
"step": 784000
},
{
"epoch": 6.39,
"eval_loss": 3.383202314376831,
"eval_runtime": 114.9199,
"eval_samples_per_second": 898.669,
"eval_steps_per_second": 56.17,
"step": 784000
},
{
"epoch": 6.46,
"eval_loss": 3.3966336250305176,
"eval_runtime": 115.6206,
"eval_samples_per_second": 893.223,
"eval_steps_per_second": 55.829,
"step": 792000
},
{
"epoch": 6.52,
"learning_rate": 6.6705578253981495e-06,
"loss": 3.3406,
"step": 800000
},
{
"epoch": 6.52,
"eval_loss": 3.3891854286193848,
"eval_runtime": 115.5059,
"eval_samples_per_second": 894.11,
"eval_steps_per_second": 55.885,
"step": 800000
},
{
"epoch": 6.59,
"eval_loss": 3.390401601791382,
"eval_runtime": 116.1612,
"eval_samples_per_second": 889.066,
"eval_steps_per_second": 55.569,
"step": 808000
},
{
"epoch": 6.65,
"learning_rate": 6.603852247144168e-06,
"loss": 3.3406,
"step": 816000
},
{
"epoch": 6.65,
"eval_loss": 3.386686086654663,
"eval_runtime": 115.3671,
"eval_samples_per_second": 895.186,
"eval_steps_per_second": 55.952,
"step": 816000
},
{
"epoch": 6.72,
"eval_loss": 3.390192747116089,
"eval_runtime": 114.8586,
"eval_samples_per_second": 899.149,
"eval_steps_per_second": 56.2,
"step": 824000
},
{
"epoch": 6.78,
"learning_rate": 6.537146668890187e-06,
"loss": 3.3354,
"step": 832000
},
{
"epoch": 6.78,
"eval_loss": 3.371840000152588,
"eval_runtime": 115.0229,
"eval_samples_per_second": 897.865,
"eval_steps_per_second": 56.119,
"step": 832000
},
{
"epoch": 6.85,
"eval_loss": 3.383141279220581,
"eval_runtime": 115.453,
"eval_samples_per_second": 894.52,
"eval_steps_per_second": 55.91,
"step": 840000
},
{
"epoch": 6.91,
"learning_rate": 6.4704410906362044e-06,
"loss": 3.3521,
"step": 848000
},
{
"epoch": 6.91,
"eval_loss": 3.3909192085266113,
"eval_runtime": 115.5241,
"eval_samples_per_second": 893.97,
"eval_steps_per_second": 55.876,
"step": 848000
},
{
"epoch": 6.98,
"eval_loss": 3.3798959255218506,
"eval_runtime": 115.2184,
"eval_samples_per_second": 896.342,
"eval_steps_per_second": 56.024,
"step": 856000
},
{
"epoch": 7.05,
"learning_rate": 6.403735512382223e-06,
"loss": 3.3538,
"step": 864000
},
{
"epoch": 7.05,
"eval_loss": 3.3828136920928955,
"eval_runtime": 115.3784,
"eval_samples_per_second": 895.098,
"eval_steps_per_second": 55.946,
"step": 864000
},
{
"epoch": 7.11,
"eval_loss": 3.378514051437378,
"eval_runtime": 115.0377,
"eval_samples_per_second": 897.749,
"eval_steps_per_second": 56.112,
"step": 872000
},
{
"epoch": 7.18,
"learning_rate": 6.337029934128242e-06,
"loss": 3.3363,
"step": 880000
},
{
"epoch": 7.18,
"eval_loss": 3.3993334770202637,
"eval_runtime": 115.5145,
"eval_samples_per_second": 894.043,
"eval_steps_per_second": 55.88,
"step": 880000
},
{
"epoch": 7.24,
"eval_loss": 3.3849687576293945,
"eval_runtime": 114.7628,
"eval_samples_per_second": 899.9,
"eval_steps_per_second": 56.246,
"step": 888000
},
{
"epoch": 7.31,
"learning_rate": 6.270324355874261e-06,
"loss": 3.3341,
"step": 896000
},
{
"epoch": 7.31,
"eval_loss": 3.3932485580444336,
"eval_runtime": 115.0217,
"eval_samples_per_second": 897.874,
"eval_steps_per_second": 56.12,
"step": 896000
},
{
"epoch": 7.37,
"eval_loss": 3.398083209991455,
"eval_runtime": 115.1782,
"eval_samples_per_second": 896.654,
"eval_steps_per_second": 56.044,
"step": 904000
},
{
"epoch": 7.44,
"learning_rate": 6.20361877762028e-06,
"loss": 3.3458,
"step": 912000
},
{
"epoch": 7.44,
"eval_loss": 3.393594741821289,
"eval_runtime": 116.8302,
"eval_samples_per_second": 883.975,
"eval_steps_per_second": 55.251,
"step": 912000
},
{
"epoch": 7.5,
"eval_loss": 3.4032301902770996,
"eval_runtime": 115.4692,
"eval_samples_per_second": 894.394,
"eval_steps_per_second": 55.902,
"step": 920000
},
{
"epoch": 7.57,
"learning_rate": 6.1369131993662975e-06,
"loss": 3.3327,
"step": 928000
},
{
"epoch": 7.57,
"eval_loss": 3.385192394256592,
"eval_runtime": 115.7558,
"eval_samples_per_second": 892.18,
"eval_steps_per_second": 55.764,
"step": 928000
},
{
"epoch": 7.63,
"eval_loss": 3.38653826713562,
"eval_runtime": 116.1964,
"eval_samples_per_second": 888.797,
"eval_steps_per_second": 55.553,
"step": 936000
},
{
"epoch": 7.7,
"learning_rate": 6.070207621112316e-06,
"loss": 3.3507,
"step": 944000
},
{
"epoch": 7.7,
"eval_loss": 3.390004873275757,
"eval_runtime": 115.6497,
"eval_samples_per_second": 892.999,
"eval_steps_per_second": 55.815,
"step": 944000
},
{
"epoch": 7.76,
"eval_loss": 3.3772072792053223,
"eval_runtime": 115.4517,
"eval_samples_per_second": 894.53,
"eval_steps_per_second": 55.911,
"step": 952000
},
{
"epoch": 7.83,
"learning_rate": 6.003502042858335e-06,
"loss": 3.3493,
"step": 960000
},
{
"epoch": 7.83,
"eval_loss": 3.388688802719116,
"eval_runtime": 115.7986,
"eval_samples_per_second": 891.85,
"eval_steps_per_second": 55.743,
"step": 960000
},
{
"epoch": 7.89,
"eval_loss": 3.395124912261963,
"eval_runtime": 115.4739,
"eval_samples_per_second": 894.358,
"eval_steps_per_second": 55.9,
"step": 968000
},
{
"epoch": 7.96,
"learning_rate": 5.936796464604353e-06,
"loss": 3.3412,
"step": 976000
},
{
"epoch": 7.96,
"eval_loss": 3.3833136558532715,
"eval_runtime": 114.7504,
"eval_samples_per_second": 899.997,
"eval_steps_per_second": 56.253,
"step": 976000
},
{
"epoch": 8.02,
"eval_loss": 3.381627321243286,
"eval_runtime": 115.0253,
"eval_samples_per_second": 897.846,
"eval_steps_per_second": 56.118,
"step": 984000
},
{
"epoch": 8.09,
"learning_rate": 5.870090886350371e-06,
"loss": 3.3232,
"step": 992000
},
{
"epoch": 8.09,
"eval_loss": 3.37522292137146,
"eval_runtime": 114.2933,
"eval_samples_per_second": 903.597,
"eval_steps_per_second": 56.478,
"step": 992000
},
{
"epoch": 8.15,
"eval_loss": 3.384525775909424,
"eval_runtime": 115.119,
"eval_samples_per_second": 897.115,
"eval_steps_per_second": 56.072,
"step": 1000000
},
{
"epoch": 8.22,
"learning_rate": 5.80338530809639e-06,
"loss": 3.333,
"step": 1008000
},
{
"epoch": 8.22,
"eval_loss": 3.3906686305999756,
"eval_runtime": 115.1127,
"eval_samples_per_second": 897.164,
"eval_steps_per_second": 56.075,
"step": 1008000
},
{
"epoch": 8.28,
"eval_loss": 3.3822684288024902,
"eval_runtime": 114.8049,
"eval_samples_per_second": 899.569,
"eval_steps_per_second": 56.226,
"step": 1016000
},
{
"epoch": 8.35,
"learning_rate": 5.736679729842408e-06,
"loss": 3.3449,
"step": 1024000
},
{
"epoch": 8.35,
"eval_loss": 3.3724589347839355,
"eval_runtime": 114.8265,
"eval_samples_per_second": 899.4,
"eval_steps_per_second": 56.215,
"step": 1024000
},
{
"epoch": 8.41,
"eval_loss": 3.37973952293396,
"eval_runtime": 115.0872,
"eval_samples_per_second": 897.363,
"eval_steps_per_second": 56.088,
"step": 1032000
},
{
"epoch": 8.48,
"learning_rate": 5.669974151588427e-06,
"loss": 3.3336,
"step": 1040000
},
{
"epoch": 8.48,
"eval_loss": 3.38781476020813,
"eval_runtime": 116.3835,
"eval_samples_per_second": 887.368,
"eval_steps_per_second": 55.463,
"step": 1040000
},
{
"epoch": 8.55,
"eval_loss": 3.384516716003418,
"eval_runtime": 115.2938,
"eval_samples_per_second": 895.755,
"eval_steps_per_second": 55.987,
"step": 1048000
},
{
"epoch": 8.61,
"learning_rate": 5.603268573334446e-06,
"loss": 3.3307,
"step": 1056000
},
{
"epoch": 8.61,
"eval_loss": 3.390652894973755,
"eval_runtime": 116.7145,
"eval_samples_per_second": 884.851,
"eval_steps_per_second": 55.306,
"step": 1056000
},
{
"epoch": 8.68,
"eval_loss": 3.3857922554016113,
"eval_runtime": 115.6915,
"eval_samples_per_second": 892.676,
"eval_steps_per_second": 55.795,
"step": 1064000
},
{
"epoch": 8.74,
"learning_rate": 5.536562995080464e-06,
"loss": 3.3267,
"step": 1072000
},
{
"epoch": 8.74,
"eval_loss": 3.3951947689056396,
"eval_runtime": 115.1111,
"eval_samples_per_second": 897.177,
"eval_steps_per_second": 56.076,
"step": 1072000
},
{
"epoch": 8.81,
"eval_loss": 3.391402006149292,
"eval_runtime": 114.8898,
"eval_samples_per_second": 898.905,
"eval_steps_per_second": 56.184,
"step": 1080000
},
{
"epoch": 8.87,
"learning_rate": 5.469857416826483e-06,
"loss": 3.335,
"step": 1088000
},
{
"epoch": 8.87,
"eval_loss": 3.3904380798339844,
"eval_runtime": 116.7468,
"eval_samples_per_second": 884.607,
"eval_steps_per_second": 55.291,
"step": 1088000
},
{
"epoch": 8.94,
"eval_loss": 3.3894879817962646,
"eval_runtime": 115.0778,
"eval_samples_per_second": 897.437,
"eval_steps_per_second": 56.092,
"step": 1096000
},
{
"epoch": 9.0,
"learning_rate": 5.403151838572501e-06,
"loss": 3.3411,
"step": 1104000
},
{
"epoch": 9.0,
"eval_loss": 3.395911455154419,
"eval_runtime": 116.3802,
"eval_samples_per_second": 887.393,
"eval_steps_per_second": 55.465,
"step": 1104000
},
{
"epoch": 9.07,
"eval_loss": 3.391462802886963,
"eval_runtime": 115.5689,
"eval_samples_per_second": 893.623,
"eval_steps_per_second": 55.854,
"step": 1112000
},
{
"epoch": 9.13,
"learning_rate": 5.33644626031852e-06,
"loss": 3.3324,
"step": 1120000
},
{
"epoch": 9.13,
"eval_loss": 3.4030401706695557,
"eval_runtime": 115.7261,
"eval_samples_per_second": 892.409,
"eval_steps_per_second": 55.778,
"step": 1120000
},
{
"epoch": 9.2,
"eval_loss": 3.4083750247955322,
"eval_runtime": 118.5809,
"eval_samples_per_second": 870.924,
"eval_steps_per_second": 54.435,
"step": 1128000
},
{
"epoch": 9.26,
"learning_rate": 5.269740682064538e-06,
"loss": 3.3297,
"step": 1136000
},
{
"epoch": 9.26,
"eval_loss": 3.402348518371582,
"eval_runtime": 115.6049,
"eval_samples_per_second": 893.344,
"eval_steps_per_second": 55.837,
"step": 1136000
},
{
"epoch": 9.33,
"eval_loss": 3.3967323303222656,
"eval_runtime": 115.5344,
"eval_samples_per_second": 893.889,
"eval_steps_per_second": 55.871,
"step": 1144000
},
{
"epoch": 9.39,
"learning_rate": 5.203035103810556e-06,
"loss": 3.3492,
"step": 1152000
},
{
"epoch": 9.39,
"eval_loss": 3.393101215362549,
"eval_runtime": 115.5769,
"eval_samples_per_second": 893.561,
"eval_steps_per_second": 55.85,
"step": 1152000
},
{
"epoch": 9.46,
"eval_loss": 3.4064693450927734,
"eval_runtime": 114.7523,
"eval_samples_per_second": 899.982,
"eval_steps_per_second": 56.252,
"step": 1160000
},
{
"epoch": 9.52,
"learning_rate": 5.136329525556575e-06,
"loss": 3.3317,
"step": 1168000
},
{
"epoch": 9.52,
"eval_loss": 3.3905270099639893,
"eval_runtime": 115.5534,
"eval_samples_per_second": 893.743,
"eval_steps_per_second": 55.862,
"step": 1168000
},
{
"epoch": 9.59,
"eval_loss": 3.402090072631836,
"eval_runtime": 114.6435,
"eval_samples_per_second": 900.836,
"eval_steps_per_second": 56.305,
"step": 1176000
},
{
"epoch": 9.65,
"learning_rate": 5.0696239473025935e-06,
"loss": 3.3447,
"step": 1184000
},
{
"epoch": 9.65,
"eval_loss": 3.400120735168457,
"eval_runtime": 116.0858,
"eval_samples_per_second": 889.643,
"eval_steps_per_second": 55.605,
"step": 1184000
},
{
"epoch": 9.72,
"eval_loss": 3.3942949771881104,
"eval_runtime": 114.8922,
"eval_samples_per_second": 898.886,
"eval_steps_per_second": 56.183,
"step": 1192000
},
{
"epoch": 9.78,
"learning_rate": 5.002918369048611e-06,
"loss": 3.3377,
"step": 1200000
},
{
"epoch": 9.78,
"eval_loss": 3.3970954418182373,
"eval_runtime": 114.8942,
"eval_samples_per_second": 898.871,
"eval_steps_per_second": 56.182,
"step": 1200000
},
{
"epoch": 9.85,
"eval_loss": 3.3946433067321777,
"eval_runtime": 114.9828,
"eval_samples_per_second": 898.178,
"eval_steps_per_second": 56.139,
"step": 1208000
},
{
"epoch": 9.92,
"learning_rate": 4.936212790794631e-06,
"loss": 3.3486,
"step": 1216000
},
{
"epoch": 9.92,
"eval_loss": 3.392373561859131,
"eval_runtime": 115.6846,
"eval_samples_per_second": 892.729,
"eval_steps_per_second": 55.798,
"step": 1216000
},
{
"epoch": 9.98,
"eval_loss": 3.398346424102783,
"eval_runtime": 115.4236,
"eval_samples_per_second": 894.747,
"eval_steps_per_second": 55.924,
"step": 1224000
},
{
"epoch": 10.05,
"learning_rate": 4.869507212540649e-06,
"loss": 3.3471,
"step": 1232000
},
{
"epoch": 10.05,
"eval_loss": 3.414100408554077,
"eval_runtime": 115.0455,
"eval_samples_per_second": 897.689,
"eval_steps_per_second": 56.108,
"step": 1232000
},
{
"epoch": 10.11,
"eval_loss": 3.4220006465911865,
"eval_runtime": 115.4764,
"eval_samples_per_second": 894.339,
"eval_steps_per_second": 55.899,
"step": 1240000
},
{
"epoch": 10.18,
"learning_rate": 4.802801634286667e-06,
"loss": 3.3457,
"step": 1248000
},
{
"epoch": 10.18,
"eval_loss": 3.4085357189178467,
"eval_runtime": 115.0154,
"eval_samples_per_second": 897.923,
"eval_steps_per_second": 56.123,
"step": 1248000
},
{
"epoch": 10.24,
"eval_loss": 3.424273729324341,
"eval_runtime": 114.96,
"eval_samples_per_second": 898.356,
"eval_steps_per_second": 56.15,
"step": 1256000
},
{
"epoch": 10.31,
"learning_rate": 4.7360960560326865e-06,
"loss": 3.3278,
"step": 1264000
},
{
"epoch": 10.31,
"eval_loss": 3.4058358669281006,
"eval_runtime": 115.4303,
"eval_samples_per_second": 894.696,
"eval_steps_per_second": 55.921,
"step": 1264000
},
{
"epoch": 10.37,
"eval_loss": 3.403254985809326,
"eval_runtime": 114.783,
"eval_samples_per_second": 899.741,
"eval_steps_per_second": 56.237,
"step": 1272000
},
{
"epoch": 10.44,
"learning_rate": 4.669390477778704e-06,
"loss": 3.325,
"step": 1280000
},
{
"epoch": 10.44,
"eval_loss": 3.3866589069366455,
"eval_runtime": 115.6771,
"eval_samples_per_second": 892.787,
"eval_steps_per_second": 55.802,
"step": 1280000
},
{
"epoch": 10.5,
"eval_loss": 3.3878674507141113,
"eval_runtime": 114.7924,
"eval_samples_per_second": 899.667,
"eval_steps_per_second": 56.232,
"step": 1288000
},
{
"epoch": 10.57,
"learning_rate": 4.602684899524723e-06,
"loss": 3.3248,
"step": 1296000
},
{
"epoch": 10.57,
"eval_loss": 3.380067825317383,
"eval_runtime": 115.2061,
"eval_samples_per_second": 896.437,
"eval_steps_per_second": 56.03,
"step": 1296000
},
{
"epoch": 10.63,
"eval_loss": 3.4026682376861572,
"eval_runtime": 117.5473,
"eval_samples_per_second": 878.583,
"eval_steps_per_second": 54.914,
"step": 1304000
},
{
"epoch": 10.7,
"learning_rate": 4.5359793212707415e-06,
"loss": 3.3217,
"step": 1312000
},
{
"epoch": 10.7,
"eval_loss": 3.3781392574310303,
"eval_runtime": 116.9837,
"eval_samples_per_second": 882.816,
"eval_steps_per_second": 55.179,
"step": 1312000
},
{
"epoch": 10.76,
"eval_loss": 3.38712477684021,
"eval_runtime": 116.1554,
"eval_samples_per_second": 889.111,
"eval_steps_per_second": 55.572,
"step": 1320000
},
{
"epoch": 10.83,
"learning_rate": 4.46927374301676e-06,
"loss": 3.3227,
"step": 1328000
},
{
"epoch": 10.83,
"eval_loss": 3.386099338531494,
"eval_runtime": 116.8959,
"eval_samples_per_second": 883.478,
"eval_steps_per_second": 55.22,
"step": 1328000
},
{
"epoch": 10.89,
"eval_loss": 3.378852605819702,
"eval_runtime": 116.5746,
"eval_samples_per_second": 885.913,
"eval_steps_per_second": 55.372,
"step": 1336000
},
{
"epoch": 10.96,
"learning_rate": 4.402568164762779e-06,
"loss": 3.3259,
"step": 1344000
},
{
"epoch": 10.96,
"eval_loss": 3.386458158493042,
"eval_runtime": 116.5428,
"eval_samples_per_second": 886.155,
"eval_steps_per_second": 55.387,
"step": 1344000
},
{
"epoch": 11.02,
"eval_loss": 3.386268377304077,
"eval_runtime": 115.7105,
"eval_samples_per_second": 892.529,
"eval_steps_per_second": 55.786,
"step": 1352000
},
{
"epoch": 11.09,
"learning_rate": 4.335862586508797e-06,
"loss": 3.3094,
"step": 1360000
},
{
"epoch": 11.09,
"eval_loss": 3.3826916217803955,
"eval_runtime": 118.0068,
"eval_samples_per_second": 875.161,
"eval_steps_per_second": 54.7,
"step": 1360000
},
{
"epoch": 11.15,
"eval_loss": 3.3880295753479004,
"eval_runtime": 115.413,
"eval_samples_per_second": 894.83,
"eval_steps_per_second": 55.93,
"step": 1368000
},
{
"epoch": 11.22,
"learning_rate": 4.269157008254816e-06,
"loss": 3.3128,
"step": 1376000
},
{
"epoch": 11.22,
"eval_loss": 3.365227460861206,
"eval_runtime": 116.1062,
"eval_samples_per_second": 889.487,
"eval_steps_per_second": 55.596,
"step": 1376000
},
{
"epoch": 11.29,
"eval_loss": 3.381347179412842,
"eval_runtime": 119.0899,
"eval_samples_per_second": 867.202,
"eval_steps_per_second": 54.203,
"step": 1384000
},
{
"epoch": 11.35,
"learning_rate": 4.202451430000834e-06,
"loss": 3.3088,
"step": 1392000
},
{
"epoch": 11.35,
"eval_loss": 3.385295867919922,
"eval_runtime": 115.9391,
"eval_samples_per_second": 890.769,
"eval_steps_per_second": 55.676,
"step": 1392000
},
{
"epoch": 11.42,
"eval_loss": 3.3708653450012207,
"eval_runtime": 116.9766,
"eval_samples_per_second": 882.869,
"eval_steps_per_second": 55.182,
"step": 1400000
},
{
"epoch": 11.48,
"learning_rate": 4.135745851746852e-06,
"loss": 3.3067,
"step": 1408000
},
{
"epoch": 11.48,
"eval_loss": 3.3830504417419434,
"eval_runtime": 115.9272,
"eval_samples_per_second": 890.861,
"eval_steps_per_second": 55.682,
"step": 1408000
},
{
"epoch": 11.55,
"eval_loss": 3.370314598083496,
"eval_runtime": 117.2105,
"eval_samples_per_second": 881.107,
"eval_steps_per_second": 55.072,
"step": 1416000
},
{
"epoch": 11.61,
"learning_rate": 4.069040273492872e-06,
"loss": 3.311,
"step": 1424000
},
{
"epoch": 11.61,
"eval_loss": 3.369617223739624,
"eval_runtime": 116.4339,
"eval_samples_per_second": 886.984,
"eval_steps_per_second": 55.439,
"step": 1424000
},
{
"epoch": 11.68,
"eval_loss": 3.3768646717071533,
"eval_runtime": 118.1326,
"eval_samples_per_second": 874.23,
"eval_steps_per_second": 54.642,
"step": 1432000
},
{
"epoch": 11.74,
"learning_rate": 4.0023346952388895e-06,
"loss": 3.3048,
"step": 1440000
},
{
"epoch": 11.74,
"eval_loss": 3.373983860015869,
"eval_runtime": 118.2179,
"eval_samples_per_second": 873.598,
"eval_steps_per_second": 54.603,
"step": 1440000
},
{
"epoch": 11.81,
"eval_loss": 3.3731493949890137,
"eval_runtime": 116.9055,
"eval_samples_per_second": 883.406,
"eval_steps_per_second": 55.216,
"step": 1448000
},
{
"epoch": 11.87,
"learning_rate": 3.935629116984908e-06,
"loss": 3.3055,
"step": 1456000
},
{
"epoch": 11.87,
"eval_loss": 3.365483283996582,
"eval_runtime": 117.1876,
"eval_samples_per_second": 881.279,
"eval_steps_per_second": 55.083,
"step": 1456000
},
{
"epoch": 11.94,
"eval_loss": 3.3697094917297363,
"eval_runtime": 117.1788,
"eval_samples_per_second": 881.346,
"eval_steps_per_second": 55.087,
"step": 1464000
},
{
"epoch": 12.0,
"learning_rate": 3.868923538730927e-06,
"loss": 3.3105,
"step": 1472000
},
{
"epoch": 12.0,
"eval_loss": 3.3741800785064697,
"eval_runtime": 116.7081,
"eval_samples_per_second": 884.9,
"eval_steps_per_second": 55.309,
"step": 1472000
},
{
"epoch": 12.07,
"eval_loss": 3.3614203929901123,
"eval_runtime": 118.1522,
"eval_samples_per_second": 874.084,
"eval_steps_per_second": 54.633,
"step": 1480000
},
{
"epoch": 12.13,
"learning_rate": 3.8022179604769453e-06,
"loss": 3.2977,
"step": 1488000
},
{
"epoch": 12.13,
"eval_loss": 3.370495319366455,
"eval_runtime": 117.0737,
"eval_samples_per_second": 882.137,
"eval_steps_per_second": 55.136,
"step": 1488000
},
{
"epoch": 12.2,
"eval_loss": 3.3746001720428467,
"eval_runtime": 117.4262,
"eval_samples_per_second": 879.489,
"eval_steps_per_second": 54.971,
"step": 1496000
},
{
"epoch": 12.26,
"learning_rate": 3.735512382222964e-06,
"loss": 3.2999,
"step": 1504000
},
{
"epoch": 12.26,
"eval_loss": 3.3690757751464844,
"eval_runtime": 114.9601,
"eval_samples_per_second": 898.355,
"eval_steps_per_second": 56.15,
"step": 1504000
},
{
"epoch": 12.33,
"eval_loss": 3.374530792236328,
"eval_runtime": 115.3595,
"eval_samples_per_second": 895.245,
"eval_steps_per_second": 55.955,
"step": 1512000
},
{
"epoch": 12.39,
"learning_rate": 3.668806803968982e-06,
"loss": 3.2983,
"step": 1520000
},
{
"epoch": 12.39,
"eval_loss": 3.3717198371887207,
"eval_runtime": 114.9666,
"eval_samples_per_second": 898.304,
"eval_steps_per_second": 56.147,
"step": 1520000
},
{
"epoch": 12.46,
"eval_loss": 3.368246555328369,
"eval_runtime": 115.591,
"eval_samples_per_second": 893.452,
"eval_steps_per_second": 55.843,
"step": 1528000
},
{
"epoch": 12.52,
"learning_rate": 3.6021012257150007e-06,
"loss": 3.2957,
"step": 1536000
},
{
"epoch": 12.52,
"eval_loss": 3.369278907775879,
"eval_runtime": 116.1156,
"eval_samples_per_second": 889.416,
"eval_steps_per_second": 55.591,
"step": 1536000
},
{
"epoch": 12.59,
"eval_loss": 3.376443386077881,
"eval_runtime": 114.7209,
"eval_samples_per_second": 900.228,
"eval_steps_per_second": 56.267,
"step": 1544000
},
{
"epoch": 12.65,
"learning_rate": 3.535395647461019e-06,
"loss": 3.293,
"step": 1552000
},
{
"epoch": 12.65,
"eval_loss": 3.3690662384033203,
"eval_runtime": 114.9457,
"eval_samples_per_second": 898.468,
"eval_steps_per_second": 56.157,
"step": 1552000
},
{
"epoch": 12.72,
"eval_loss": 3.380187511444092,
"eval_runtime": 115.2975,
"eval_samples_per_second": 895.726,
"eval_steps_per_second": 55.986,
"step": 1560000
},
{
"epoch": 12.79,
"learning_rate": 3.468690069207038e-06,
"loss": 3.2919,
"step": 1568000
},
{
"epoch": 12.79,
"eval_loss": 3.3626480102539062,
"eval_runtime": 115.0018,
"eval_samples_per_second": 898.03,
"eval_steps_per_second": 56.13,
"step": 1568000
},
{
"epoch": 12.85,
"eval_loss": 3.3604438304901123,
"eval_runtime": 116.2394,
"eval_samples_per_second": 888.468,
"eval_steps_per_second": 55.532,
"step": 1576000
},
{
"epoch": 12.92,
"learning_rate": 3.4019844909530565e-06,
"loss": 3.3023,
"step": 1584000
},
{
"epoch": 12.92,
"eval_loss": 3.374943971633911,
"eval_runtime": 115.4828,
"eval_samples_per_second": 894.289,
"eval_steps_per_second": 55.896,
"step": 1584000
},
{
"epoch": 12.98,
"eval_loss": 3.368828773498535,
"eval_runtime": 114.8626,
"eval_samples_per_second": 899.118,
"eval_steps_per_second": 56.198,
"step": 1592000
},
{
"epoch": 13.05,
"learning_rate": 3.3352789126990747e-06,
"loss": 3.2988,
"step": 1600000
},
{
"epoch": 13.05,
"eval_loss": 3.3666255474090576,
"eval_runtime": 115.7226,
"eval_samples_per_second": 892.436,
"eval_steps_per_second": 55.78,
"step": 1600000
},
{
"epoch": 13.11,
"eval_loss": 3.369481325149536,
"eval_runtime": 116.2492,
"eval_samples_per_second": 888.393,
"eval_steps_per_second": 55.527,
"step": 1608000
},
{
"epoch": 13.18,
"learning_rate": 3.2685733344450933e-06,
"loss": 3.2924,
"step": 1616000
},
{
"epoch": 13.18,
"eval_loss": 3.364980697631836,
"eval_runtime": 114.892,
"eval_samples_per_second": 898.887,
"eval_steps_per_second": 56.183,
"step": 1616000
},
{
"epoch": 13.24,
"eval_loss": 3.3651351928710938,
"eval_runtime": 114.7414,
"eval_samples_per_second": 900.068,
"eval_steps_per_second": 56.257,
"step": 1624000
},
{
"epoch": 13.31,
"learning_rate": 3.2018677561911115e-06,
"loss": 3.2958,
"step": 1632000
},
{
"epoch": 13.31,
"eval_loss": 3.369225263595581,
"eval_runtime": 115.9526,
"eval_samples_per_second": 890.666,
"eval_steps_per_second": 55.669,
"step": 1632000
},
{
"epoch": 13.37,
"eval_loss": 3.3855459690093994,
"eval_runtime": 114.8307,
"eval_samples_per_second": 899.367,
"eval_steps_per_second": 56.213,
"step": 1640000
},
{
"epoch": 13.44,
"learning_rate": 3.1351621779371306e-06,
"loss": 3.2918,
"step": 1648000
},
{
"epoch": 13.44,
"eval_loss": 3.3706300258636475,
"eval_runtime": 115.344,
"eval_samples_per_second": 895.365,
"eval_steps_per_second": 55.963,
"step": 1648000
},
{
"epoch": 13.5,
"eval_loss": 3.3680288791656494,
"eval_runtime": 114.7321,
"eval_samples_per_second": 900.14,
"eval_steps_per_second": 56.261,
"step": 1656000
},
{
"epoch": 13.57,
"learning_rate": 3.0684565996831487e-06,
"loss": 3.2948,
"step": 1664000
},
{
"epoch": 13.57,
"eval_loss": 3.353415012359619,
"eval_runtime": 116.4266,
"eval_samples_per_second": 887.039,
"eval_steps_per_second": 55.443,
"step": 1664000
},
{
"epoch": 13.63,
"eval_loss": 3.369929790496826,
"eval_runtime": 114.8306,
"eval_samples_per_second": 899.369,
"eval_steps_per_second": 56.213,
"step": 1672000
},
{
"epoch": 13.7,
"learning_rate": 3.0017510214291673e-06,
"loss": 3.2996,
"step": 1680000
},
{
"epoch": 13.7,
"eval_loss": 3.3732664585113525,
"eval_runtime": 115.7005,
"eval_samples_per_second": 892.607,
"eval_steps_per_second": 55.791,
"step": 1680000
},
{
"epoch": 13.76,
"eval_loss": 3.3764214515686035,
"eval_runtime": 115.4981,
"eval_samples_per_second": 894.171,
"eval_steps_per_second": 55.888,
"step": 1688000
},
{
"epoch": 13.83,
"learning_rate": 2.9350454431751855e-06,
"loss": 3.2999,
"step": 1696000
},
{
"epoch": 13.83,
"eval_loss": 3.3792943954467773,
"eval_runtime": 116.0913,
"eval_samples_per_second": 889.602,
"eval_steps_per_second": 55.603,
"step": 1696000
},
{
"epoch": 13.89,
"eval_loss": 3.368272304534912,
"eval_runtime": 116.0753,
"eval_samples_per_second": 889.724,
"eval_steps_per_second": 55.61,
"step": 1704000
},
{
"epoch": 13.96,
"learning_rate": 2.868339864921204e-06,
"loss": 3.291,
"step": 1712000
},
{
"epoch": 13.96,
"eval_loss": 3.3653597831726074,
"eval_runtime": 115.5031,
"eval_samples_per_second": 894.132,
"eval_steps_per_second": 55.886,
"step": 1712000
},
{
"epoch": 14.02,
"eval_loss": 3.372131109237671,
"eval_runtime": 115.6199,
"eval_samples_per_second": 893.228,
"eval_steps_per_second": 55.829,
"step": 1720000
},
{
"epoch": 14.09,
"learning_rate": 2.801634286667223e-06,
"loss": 3.2952,
"step": 1728000
},
{
"epoch": 14.09,
"eval_loss": 3.367438316345215,
"eval_runtime": 115.0009,
"eval_samples_per_second": 898.037,
"eval_steps_per_second": 56.13,
"step": 1728000
},
{
"epoch": 14.16,
"eval_loss": 3.3762009143829346,
"eval_runtime": 115.4616,
"eval_samples_per_second": 894.453,
"eval_steps_per_second": 55.906,
"step": 1736000
},
{
"epoch": 14.22,
"learning_rate": 2.7349287084132413e-06,
"loss": 3.2866,
"step": 1744000
},
{
"epoch": 14.22,
"eval_loss": 3.3699355125427246,
"eval_runtime": 114.9346,
"eval_samples_per_second": 898.554,
"eval_steps_per_second": 56.162,
"step": 1744000
},
{
"epoch": 14.29,
"eval_loss": 3.3690149784088135,
"eval_runtime": 115.9293,
"eval_samples_per_second": 890.845,
"eval_steps_per_second": 55.681,
"step": 1752000
},
{
"epoch": 14.35,
"learning_rate": 2.66822313015926e-06,
"loss": 3.2825,
"step": 1760000
},
{
"epoch": 14.35,
"eval_loss": 3.365321636199951,
"eval_runtime": 114.9037,
"eval_samples_per_second": 898.796,
"eval_steps_per_second": 56.177,
"step": 1760000
},
{
"epoch": 14.42,
"eval_loss": 3.368727207183838,
"eval_runtime": 115.3436,
"eval_samples_per_second": 895.369,
"eval_steps_per_second": 55.963,
"step": 1768000
},
{
"epoch": 14.48,
"learning_rate": 2.601517551905278e-06,
"loss": 3.2825,
"step": 1776000
},
{
"epoch": 14.48,
"eval_loss": 3.3617701530456543,
"eval_runtime": 115.7714,
"eval_samples_per_second": 892.06,
"eval_steps_per_second": 55.756,
"step": 1776000
},
{
"epoch": 14.55,
"eval_loss": 3.3609282970428467,
"eval_runtime": 114.879,
"eval_samples_per_second": 898.989,
"eval_steps_per_second": 56.19,
"step": 1784000
},
{
"epoch": 14.61,
"learning_rate": 2.5348119736512967e-06,
"loss": 3.2744,
"step": 1792000
},
{
"epoch": 14.61,
"eval_loss": 3.3552184104919434,
"eval_runtime": 114.6789,
"eval_samples_per_second": 900.558,
"eval_steps_per_second": 56.288,
"step": 1792000
},
{
"epoch": 14.68,
"eval_loss": 3.3549087047576904,
"eval_runtime": 116.3921,
"eval_samples_per_second": 887.303,
"eval_steps_per_second": 55.459,
"step": 1800000
},
{
"epoch": 14.74,
"learning_rate": 2.4681063953973154e-06,
"loss": 3.2811,
"step": 1808000
},
{
"epoch": 14.74,
"eval_loss": 3.3504152297973633,
"eval_runtime": 115.0014,
"eval_samples_per_second": 898.032,
"eval_steps_per_second": 56.13,
"step": 1808000
},
{
"epoch": 14.81,
"eval_loss": 3.3574647903442383,
"eval_runtime": 115.1236,
"eval_samples_per_second": 897.079,
"eval_steps_per_second": 56.07,
"step": 1816000
},
{
"epoch": 14.87,
"learning_rate": 2.4014008171433335e-06,
"loss": 3.2672,
"step": 1824000
},
{
"epoch": 14.87,
"eval_loss": 3.3587796688079834,
"eval_runtime": 116.6416,
"eval_samples_per_second": 885.404,
"eval_steps_per_second": 55.34,
"step": 1824000
},
{
"epoch": 14.94,
"eval_loss": 3.3559627532958984,
"eval_runtime": 116.2457,
"eval_samples_per_second": 888.42,
"eval_steps_per_second": 55.529,
"step": 1832000
},
{
"epoch": 15.0,
"learning_rate": 2.334695238889352e-06,
"loss": 3.2919,
"step": 1840000
},
{
"epoch": 15.0,
"eval_loss": 3.359805107116699,
"eval_runtime": 115.5497,
"eval_samples_per_second": 893.771,
"eval_steps_per_second": 55.863,
"step": 1840000
},
{
"epoch": 15.07,
"eval_loss": 3.344524383544922,
"eval_runtime": 115.5133,
"eval_samples_per_second": 894.053,
"eval_steps_per_second": 55.881,
"step": 1848000
},
{
"epoch": 15.13,
"learning_rate": 2.2679896606353707e-06,
"loss": 3.2724,
"step": 1856000
},
{
"epoch": 15.13,
"eval_loss": 3.3516576290130615,
"eval_runtime": 115.2664,
"eval_samples_per_second": 895.968,
"eval_steps_per_second": 56.001,
"step": 1856000
},
{
"epoch": 15.2,
"eval_loss": 3.359280824661255,
"eval_runtime": 116.0103,
"eval_samples_per_second": 890.223,
"eval_steps_per_second": 55.642,
"step": 1864000
},
{
"epoch": 15.26,
"learning_rate": 2.2012840823813894e-06,
"loss": 3.277,
"step": 1872000
},
{
"epoch": 15.26,
"eval_loss": 3.3597874641418457,
"eval_runtime": 114.9804,
"eval_samples_per_second": 898.197,
"eval_steps_per_second": 56.14,
"step": 1872000
},
{
"epoch": 15.33,
"eval_loss": 3.345801591873169,
"eval_runtime": 116.1901,
"eval_samples_per_second": 888.845,
"eval_steps_per_second": 55.555,
"step": 1880000
},
{
"epoch": 15.39,
"learning_rate": 2.134578504127408e-06,
"loss": 3.2842,
"step": 1888000
},
{
"epoch": 15.39,
"eval_loss": 3.3583106994628906,
"eval_runtime": 114.8266,
"eval_samples_per_second": 899.399,
"eval_steps_per_second": 56.215,
"step": 1888000
},
{
"epoch": 15.46,
"eval_loss": 3.3447749614715576,
"eval_runtime": 114.9801,
"eval_samples_per_second": 898.199,
"eval_steps_per_second": 56.14,
"step": 1896000
},
{
"epoch": 15.53,
"learning_rate": 2.067872925873426e-06,
"loss": 3.2758,
"step": 1904000
},
{
"epoch": 15.53,
"eval_loss": 3.3593051433563232,
"eval_runtime": 114.9092,
"eval_samples_per_second": 898.753,
"eval_steps_per_second": 56.175,
"step": 1904000
},
{
"epoch": 15.59,
"eval_loss": 3.3551743030548096,
"eval_runtime": 115.5179,
"eval_samples_per_second": 894.017,
"eval_steps_per_second": 55.879,
"step": 1912000
},
{
"epoch": 15.66,
"learning_rate": 2.0011673476194448e-06,
"loss": 3.2684,
"step": 1920000
},
{
"epoch": 15.66,
"eval_loss": 3.371454954147339,
"eval_runtime": 114.8944,
"eval_samples_per_second": 898.869,
"eval_steps_per_second": 56.182,
"step": 1920000
},
{
"epoch": 15.72,
"eval_loss": 3.3543806076049805,
"eval_runtime": 115.4862,
"eval_samples_per_second": 894.263,
"eval_steps_per_second": 55.894,
"step": 1928000
},
{
"epoch": 15.79,
"learning_rate": 1.9344617693654634e-06,
"loss": 3.2924,
"step": 1936000
},
{
"epoch": 15.79,
"eval_loss": 3.3514981269836426,
"eval_runtime": 115.0356,
"eval_samples_per_second": 897.766,
"eval_steps_per_second": 56.113,
"step": 1936000
},
{
"epoch": 15.85,
"eval_loss": 3.36460018157959,
"eval_runtime": 115.4242,
"eval_samples_per_second": 894.743,
"eval_steps_per_second": 55.924,
"step": 1944000
},
{
"epoch": 15.92,
"learning_rate": 1.867756191111482e-06,
"loss": 3.2673,
"step": 1952000
},
{
"epoch": 15.92,
"eval_loss": 3.353806495666504,
"eval_runtime": 115.3905,
"eval_samples_per_second": 895.004,
"eval_steps_per_second": 55.94,
"step": 1952000
},
{
"epoch": 15.98,
"eval_loss": 3.3436896800994873,
"eval_runtime": 114.7945,
"eval_samples_per_second": 899.651,
"eval_steps_per_second": 56.231,
"step": 1960000
},
{
"epoch": 16.05,
"learning_rate": 1.8010506128575004e-06,
"loss": 3.2833,
"step": 1968000
},
{
"epoch": 16.05,
"eval_loss": 3.3442821502685547,
"eval_runtime": 116.1629,
"eval_samples_per_second": 889.053,
"eval_steps_per_second": 55.569,
"step": 1968000
},
{
"epoch": 16.11,
"eval_loss": 3.361924886703491,
"eval_runtime": 116.4426,
"eval_samples_per_second": 886.917,
"eval_steps_per_second": 55.435,
"step": 1976000
},
{
"epoch": 16.18,
"learning_rate": 1.734345034603519e-06,
"loss": 3.2636,
"step": 1984000
},
{
"epoch": 16.18,
"eval_loss": 3.3510515689849854,
"eval_runtime": 115.8529,
"eval_samples_per_second": 891.432,
"eval_steps_per_second": 55.717,
"step": 1984000
},
{
"epoch": 16.24,
"eval_loss": 3.3447539806365967,
"eval_runtime": 114.926,
"eval_samples_per_second": 898.622,
"eval_steps_per_second": 56.167,
"step": 1992000
},
{
"epoch": 16.31,
"learning_rate": 1.6676394563495374e-06,
"loss": 3.2753,
"step": 2000000
},
{
"epoch": 16.31,
"eval_loss": 3.355980396270752,
"eval_runtime": 115.4649,
"eval_samples_per_second": 894.427,
"eval_steps_per_second": 55.904,
"step": 2000000
},
{
"epoch": 16.37,
"eval_loss": 3.3524882793426514,
"eval_runtime": 118.2786,
"eval_samples_per_second": 873.151,
"eval_steps_per_second": 54.575,
"step": 2008000
},
{
"epoch": 16.44,
"learning_rate": 1.6009338780955558e-06,
"loss": 3.2701,
"step": 2016000
},
{
"epoch": 16.44,
"eval_loss": 3.355792760848999,
"eval_runtime": 115.0046,
"eval_samples_per_second": 898.008,
"eval_steps_per_second": 56.128,
"step": 2016000
},
{
"epoch": 16.5,
"eval_loss": 3.3558590412139893,
"eval_runtime": 115.5093,
"eval_samples_per_second": 894.084,
"eval_steps_per_second": 55.883,
"step": 2024000
},
{
"epoch": 16.57,
"learning_rate": 1.5342282998415744e-06,
"loss": 3.2761,
"step": 2032000
},
{
"epoch": 16.57,
"eval_loss": 3.3439648151397705,
"eval_runtime": 114.8803,
"eval_samples_per_second": 898.979,
"eval_steps_per_second": 56.189,
"step": 2032000
},
{
"epoch": 16.63,
"eval_loss": 3.3505825996398926,
"eval_runtime": 115.5177,
"eval_samples_per_second": 894.019,
"eval_steps_per_second": 55.879,
"step": 2040000
},
{
"epoch": 16.7,
"learning_rate": 1.4675227215875928e-06,
"loss": 3.2677,
"step": 2048000
},
{
"epoch": 16.7,
"eval_loss": 3.3473587036132812,
"eval_runtime": 115.2604,
"eval_samples_per_second": 896.014,
"eval_steps_per_second": 56.004,
"step": 2048000
},
{
"epoch": 16.76,
"eval_loss": 3.3614845275878906,
"eval_runtime": 114.7851,
"eval_samples_per_second": 899.724,
"eval_steps_per_second": 56.236,
"step": 2056000
},
{
"epoch": 16.83,
"learning_rate": 1.4008171433336116e-06,
"loss": 3.2614,
"step": 2064000
},
{
"epoch": 16.83,
"eval_loss": 3.350660562515259,
"eval_runtime": 116.1258,
"eval_samples_per_second": 889.337,
"eval_steps_per_second": 55.586,
"step": 2064000
},
{
"epoch": 16.89,
"eval_loss": 3.34436297416687,
"eval_runtime": 114.7641,
"eval_samples_per_second": 899.89,
"eval_steps_per_second": 56.246,
"step": 2072000
},
{
"epoch": 16.96,
"learning_rate": 1.33411156507963e-06,
"loss": 3.2608,
"step": 2080000
},
{
"epoch": 16.96,
"eval_loss": 3.352665901184082,
"eval_runtime": 114.9595,
"eval_samples_per_second": 898.36,
"eval_steps_per_second": 56.15,
"step": 2080000
},
{
"epoch": 17.03,
"eval_loss": 3.3398256301879883,
"eval_runtime": 114.8716,
"eval_samples_per_second": 899.047,
"eval_steps_per_second": 56.193,
"step": 2088000
},
{
"epoch": 17.09,
"learning_rate": 1.2674059868256484e-06,
"loss": 3.2643,
"step": 2096000
},
{
"epoch": 17.09,
"eval_loss": 3.3497581481933594,
"eval_runtime": 115.3741,
"eval_samples_per_second": 895.132,
"eval_steps_per_second": 55.948,
"step": 2096000
},
{
"epoch": 17.16,
"eval_loss": 3.3348639011383057,
"eval_runtime": 114.8223,
"eval_samples_per_second": 899.434,
"eval_steps_per_second": 56.217,
"step": 2104000
},
{
"epoch": 17.22,
"learning_rate": 1.2007004085716668e-06,
"loss": 3.2721,
"step": 2112000
},
{
"epoch": 17.22,
"eval_loss": 3.356008291244507,
"eval_runtime": 115.5116,
"eval_samples_per_second": 894.066,
"eval_steps_per_second": 55.882,
"step": 2112000
},
{
"epoch": 17.29,
"eval_loss": 3.3421435356140137,
"eval_runtime": 115.5912,
"eval_samples_per_second": 893.45,
"eval_steps_per_second": 55.843,
"step": 2120000
},
{
"epoch": 17.35,
"learning_rate": 1.1339948303176854e-06,
"loss": 3.266,
"step": 2128000
},
{
"epoch": 17.35,
"eval_loss": 3.342872142791748,
"eval_runtime": 115.0319,
"eval_samples_per_second": 897.794,
"eval_steps_per_second": 56.115,
"step": 2128000
},
{
"epoch": 17.42,
"eval_loss": 3.337078809738159,
"eval_runtime": 114.7057,
"eval_samples_per_second": 900.347,
"eval_steps_per_second": 56.274,
"step": 2136000
},
{
"epoch": 17.48,
"learning_rate": 1.067289252063704e-06,
"loss": 3.2551,
"step": 2144000
},
{
"epoch": 17.48,
"eval_loss": 3.340388774871826,
"eval_runtime": 115.5719,
"eval_samples_per_second": 893.599,
"eval_steps_per_second": 55.853,
"step": 2144000
},
{
"epoch": 17.55,
"eval_loss": 3.349374771118164,
"eval_runtime": 116.2218,
"eval_samples_per_second": 888.603,
"eval_steps_per_second": 55.54,
"step": 2152000
},
{
"epoch": 17.61,
"learning_rate": 1.0005836738097224e-06,
"loss": 3.26,
"step": 2160000
},
{
"epoch": 17.61,
"eval_loss": 3.3389031887054443,
"eval_runtime": 115.0165,
"eval_samples_per_second": 897.915,
"eval_steps_per_second": 56.122,
"step": 2160000
},
{
"epoch": 17.68,
"eval_loss": 3.345613718032837,
"eval_runtime": 114.2481,
"eval_samples_per_second": 903.954,
"eval_steps_per_second": 56.5,
"step": 2168000
},
{
"epoch": 17.74,
"learning_rate": 9.33878095555741e-07,
"loss": 3.2528,
"step": 2176000
},
{
"epoch": 17.74,
"eval_loss": 3.3248987197875977,
"eval_runtime": 115.0558,
"eval_samples_per_second": 897.608,
"eval_steps_per_second": 56.103,
"step": 2176000
},
{
"epoch": 17.81,
"eval_loss": 3.3452157974243164,
"eval_runtime": 116.2164,
"eval_samples_per_second": 888.644,
"eval_steps_per_second": 55.543,
"step": 2184000
},
{
"epoch": 17.87,
"learning_rate": 8.671725173017595e-07,
"loss": 3.2602,
"step": 2192000
},
{
"epoch": 17.87,
"eval_loss": 3.33760929107666,
"eval_runtime": 116.1157,
"eval_samples_per_second": 889.414,
"eval_steps_per_second": 55.591,
"step": 2192000
},
{
"epoch": 17.94,
"eval_loss": 3.351128101348877,
"eval_runtime": 114.6575,
"eval_samples_per_second": 900.726,
"eval_steps_per_second": 56.298,
"step": 2200000
},
{
"epoch": 18.0,
"learning_rate": 8.004669390477779e-07,
"loss": 3.2492,
"step": 2208000
},
{
"epoch": 18.0,
"eval_loss": 3.347473621368408,
"eval_runtime": 115.2092,
"eval_samples_per_second": 896.413,
"eval_steps_per_second": 56.029,
"step": 2208000
},
{
"epoch": 18.07,
"eval_loss": 3.349674940109253,
"eval_runtime": 115.6497,
"eval_samples_per_second": 892.998,
"eval_steps_per_second": 55.815,
"step": 2216000
},
{
"epoch": 18.13,
"learning_rate": 7.337613607937964e-07,
"loss": 3.2469,
"step": 2224000
},
{
"epoch": 18.13,
"eval_loss": 3.3378491401672363,
"eval_runtime": 114.9296,
"eval_samples_per_second": 898.594,
"eval_steps_per_second": 56.165,
"step": 2224000
},
{
"epoch": 18.2,
"eval_loss": 3.332571029663086,
"eval_runtime": 115.4244,
"eval_samples_per_second": 894.742,
"eval_steps_per_second": 55.924,
"step": 2232000
},
{
"epoch": 18.26,
"learning_rate": 6.67055782539815e-07,
"loss": 3.2589,
"step": 2240000
},
{
"epoch": 18.26,
"eval_loss": 3.3277342319488525,
"eval_runtime": 114.9762,
"eval_samples_per_second": 898.229,
"eval_steps_per_second": 56.142,
"step": 2240000
},
{
"epoch": 18.33,
"eval_loss": 3.3456978797912598,
"eval_runtime": 116.0675,
"eval_samples_per_second": 889.784,
"eval_steps_per_second": 55.614,
"step": 2248000
},
{
"epoch": 18.4,
"learning_rate": 6.003502042858334e-07,
"loss": 3.2548,
"step": 2256000
},
{
"epoch": 18.4,
"eval_loss": 3.334270715713501,
"eval_runtime": 115.7666,
"eval_samples_per_second": 892.097,
"eval_steps_per_second": 55.759,
"step": 2256000
},
{
"epoch": 18.46,
"eval_loss": 3.3362197875976562,
"eval_runtime": 115.5031,
"eval_samples_per_second": 894.132,
"eval_steps_per_second": 55.886,
"step": 2264000
},
{
"epoch": 18.53,
"learning_rate": 5.33644626031852e-07,
"loss": 3.2589,
"step": 2272000
},
{
"epoch": 18.53,
"eval_loss": 3.343080997467041,
"eval_runtime": 115.3187,
"eval_samples_per_second": 895.561,
"eval_steps_per_second": 55.975,
"step": 2272000
},
{
"epoch": 18.59,
"eval_loss": 3.3428003787994385,
"eval_runtime": 115.3186,
"eval_samples_per_second": 895.563,
"eval_steps_per_second": 55.975,
"step": 2280000
},
{
"epoch": 18.66,
"learning_rate": 4.669390477778705e-07,
"loss": 3.2674,
"step": 2288000
},
{
"epoch": 18.66,
"eval_loss": 3.3400795459747314,
"eval_runtime": 114.7905,
"eval_samples_per_second": 899.682,
"eval_steps_per_second": 56.233,
"step": 2288000
},
{
"epoch": 18.72,
"eval_loss": 3.337498903274536,
"eval_runtime": 114.9489,
"eval_samples_per_second": 898.443,
"eval_steps_per_second": 56.155,
"step": 2296000
},
{
"epoch": 18.79,
"learning_rate": 4.0023346952388894e-07,
"loss": 3.2561,
"step": 2304000
},
{
"epoch": 18.79,
"eval_loss": 3.3333868980407715,
"eval_runtime": 114.8393,
"eval_samples_per_second": 899.3,
"eval_steps_per_second": 56.209,
"step": 2304000
},
{
"epoch": 18.85,
"eval_loss": 3.3320717811584473,
"eval_runtime": 115.0159,
"eval_samples_per_second": 897.919,
"eval_steps_per_second": 56.123,
"step": 2312000
},
{
"epoch": 18.92,
"learning_rate": 3.335278912699075e-07,
"loss": 3.2452,
"step": 2320000
},
{
"epoch": 18.92,
"eval_loss": 3.3445632457733154,
"eval_runtime": 114.9617,
"eval_samples_per_second": 898.342,
"eval_steps_per_second": 56.149,
"step": 2320000
},
{
"epoch": 18.98,
"eval_loss": 3.3525032997131348,
"eval_runtime": 116.2145,
"eval_samples_per_second": 888.659,
"eval_steps_per_second": 55.544,
"step": 2328000
},
{
"epoch": 19.05,
"learning_rate": 2.66822313015926e-07,
"loss": 3.259,
"step": 2336000
},
{
"epoch": 19.05,
"eval_loss": 3.331772804260254,
"eval_runtime": 115.4929,
"eval_samples_per_second": 894.211,
"eval_steps_per_second": 55.891,
"step": 2336000
},
{
"epoch": 19.11,
"eval_loss": 3.3451852798461914,
"eval_runtime": 115.1546,
"eval_samples_per_second": 896.838,
"eval_steps_per_second": 56.055,
"step": 2344000
},
{
"epoch": 19.18,
"learning_rate": 2.0011673476194447e-07,
"loss": 3.2494,
"step": 2352000
},
{
"epoch": 19.18,
"eval_loss": 3.335479497909546,
"eval_runtime": 114.4583,
"eval_samples_per_second": 902.293,
"eval_steps_per_second": 56.396,
"step": 2352000
},
{
"epoch": 19.24,
"eval_loss": 3.3322434425354004,
"eval_runtime": 116.1476,
"eval_samples_per_second": 889.17,
"eval_steps_per_second": 55.576,
"step": 2360000
},
{
"epoch": 19.31,
"learning_rate": 1.33411156507963e-07,
"loss": 3.2558,
"step": 2368000
},
{
"epoch": 19.31,
"eval_loss": 3.325453281402588,
"eval_runtime": 114.8662,
"eval_samples_per_second": 899.089,
"eval_steps_per_second": 56.196,
"step": 2368000
},
{
"epoch": 19.37,
"eval_loss": 3.3329989910125732,
"eval_runtime": 117.9929,
"eval_samples_per_second": 875.265,
"eval_steps_per_second": 54.707,
"step": 2376000
},
{
"epoch": 19.44,
"learning_rate": 6.67055782539815e-08,
"loss": 3.2436,
"step": 2384000
},
{
"epoch": 19.44,
"eval_loss": 3.3357789516448975,
"eval_runtime": 117.7235,
"eval_samples_per_second": 877.268,
"eval_steps_per_second": 54.832,
"step": 2384000
},
{
"epoch": 19.5,
"eval_loss": 3.3287487030029297,
"eval_runtime": 115.6745,
"eval_samples_per_second": 892.807,
"eval_steps_per_second": 55.803,
"step": 2392000
},
{
"epoch": 19.57,
"learning_rate": 0.0,
"loss": 3.2545,
"step": 2400000
},
{
"epoch": 19.57,
"eval_loss": 3.3321266174316406,
"eval_runtime": 115.8716,
"eval_samples_per_second": 891.289,
"eval_steps_per_second": 55.708,
"step": 2400000
},
{
"epoch": 19.57,
"step": 2400000,
"total_flos": 6.9600759359113e+17,
"train_loss": 3.268406458333333,
"train_runtime": 194422.9949,
"train_samples_per_second": 197.508,
"train_steps_per_second": 12.344
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 20,
"save_steps": 32000,
"total_flos": 6.9600759359113e+17,
"trial_name": null,
"trial_params": null
}