nreimers's picture
upload
6dd993a
raw
history blame
103 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8325008325008325,
"global_step": 230000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 4.996983692635866e-05,
"loss": 9.3743,
"step": 500
},
{
"epoch": 0.0,
"learning_rate": 4.9939673852717336e-05,
"loss": 8.6211,
"step": 1000
},
{
"epoch": 0.0,
"eval_loss": 8.523540496826172,
"eval_runtime": 4.1116,
"eval_samples_per_second": 243.213,
"eval_steps_per_second": 15.322,
"step": 1000
},
{
"epoch": 0.01,
"learning_rate": 4.9909510779075997e-05,
"loss": 8.3996,
"step": 1500
},
{
"epoch": 0.01,
"learning_rate": 4.9879347705434663e-05,
"loss": 8.2439,
"step": 2000
},
{
"epoch": 0.01,
"eval_loss": 8.210819244384766,
"eval_runtime": 4.2077,
"eval_samples_per_second": 237.661,
"eval_steps_per_second": 14.973,
"step": 2000
},
{
"epoch": 0.01,
"learning_rate": 4.984924495794061e-05,
"loss": 8.1275,
"step": 2500
},
{
"epoch": 0.01,
"learning_rate": 4.981908188429928e-05,
"loss": 8.0166,
"step": 3000
},
{
"epoch": 0.01,
"eval_loss": 7.937699317932129,
"eval_runtime": 4.3727,
"eval_samples_per_second": 228.692,
"eval_steps_per_second": 14.408,
"step": 3000
},
{
"epoch": 0.01,
"learning_rate": 4.978891881065794e-05,
"loss": 7.922,
"step": 3500
},
{
"epoch": 0.01,
"learning_rate": 4.975875573701661e-05,
"loss": 7.8304,
"step": 4000
},
{
"epoch": 0.01,
"eval_loss": 7.840174674987793,
"eval_runtime": 4.2845,
"eval_samples_per_second": 233.399,
"eval_steps_per_second": 14.704,
"step": 4000
},
{
"epoch": 0.02,
"learning_rate": 4.9728592663375274e-05,
"loss": 7.7459,
"step": 4500
},
{
"epoch": 0.02,
"learning_rate": 4.9698489915881224e-05,
"loss": 7.705,
"step": 5000
},
{
"epoch": 0.02,
"eval_loss": 7.6650848388671875,
"eval_runtime": 4.3176,
"eval_samples_per_second": 231.609,
"eval_steps_per_second": 14.591,
"step": 5000
},
{
"epoch": 0.02,
"learning_rate": 4.966832684223989e-05,
"loss": 7.653,
"step": 5500
},
{
"epoch": 0.02,
"learning_rate": 4.963816376859856e-05,
"loss": 7.5734,
"step": 6000
},
{
"epoch": 0.02,
"eval_loss": 7.5711894035339355,
"eval_runtime": 4.3326,
"eval_samples_per_second": 230.807,
"eval_steps_per_second": 14.541,
"step": 6000
},
{
"epoch": 0.02,
"learning_rate": 4.960800069495722e-05,
"loss": 7.5223,
"step": 6500
},
{
"epoch": 0.03,
"learning_rate": 4.9577837621315885e-05,
"loss": 7.4853,
"step": 7000
},
{
"epoch": 0.03,
"eval_loss": 7.468273639678955,
"eval_runtime": 4.3863,
"eval_samples_per_second": 227.982,
"eval_steps_per_second": 14.363,
"step": 7000
},
{
"epoch": 0.03,
"learning_rate": 4.9547734873821834e-05,
"loss": 7.419,
"step": 7500
},
{
"epoch": 0.03,
"learning_rate": 4.9517571800180494e-05,
"loss": 7.3788,
"step": 8000
},
{
"epoch": 0.03,
"eval_loss": 7.432778358459473,
"eval_runtime": 4.2901,
"eval_samples_per_second": 233.096,
"eval_steps_per_second": 14.685,
"step": 8000
},
{
"epoch": 0.03,
"learning_rate": 4.948740872653916e-05,
"loss": 7.3405,
"step": 8500
},
{
"epoch": 0.03,
"learning_rate": 4.945724565289783e-05,
"loss": 7.2813,
"step": 9000
},
{
"epoch": 0.03,
"eval_loss": 7.341405868530273,
"eval_runtime": 4.4257,
"eval_samples_per_second": 225.951,
"eval_steps_per_second": 14.235,
"step": 9000
},
{
"epoch": 0.03,
"learning_rate": 4.9427082579256495e-05,
"loss": 7.2549,
"step": 9500
},
{
"epoch": 0.04,
"learning_rate": 4.9396919505615156e-05,
"loss": 7.213,
"step": 10000
},
{
"epoch": 0.04,
"eval_loss": 7.239240646362305,
"eval_runtime": 4.2885,
"eval_samples_per_second": 233.181,
"eval_steps_per_second": 14.69,
"step": 10000
},
{
"epoch": 0.04,
"learning_rate": 4.936675643197383e-05,
"loss": 7.1898,
"step": 10500
},
{
"epoch": 0.04,
"learning_rate": 4.933659335833249e-05,
"loss": 7.1662,
"step": 11000
},
{
"epoch": 0.04,
"eval_loss": 7.204339504241943,
"eval_runtime": 4.306,
"eval_samples_per_second": 232.234,
"eval_steps_per_second": 14.631,
"step": 11000
},
{
"epoch": 0.04,
"learning_rate": 4.930655093698572e-05,
"loss": 7.1373,
"step": 11500
},
{
"epoch": 0.04,
"learning_rate": 4.927638786334439e-05,
"loss": 7.0908,
"step": 12000
},
{
"epoch": 0.04,
"eval_loss": 7.1362996101379395,
"eval_runtime": 4.278,
"eval_samples_per_second": 233.752,
"eval_steps_per_second": 14.726,
"step": 12000
},
{
"epoch": 0.05,
"learning_rate": 4.9246224789703056e-05,
"loss": 7.0629,
"step": 12500
},
{
"epoch": 0.05,
"learning_rate": 4.9216061716061716e-05,
"loss": 7.0458,
"step": 13000
},
{
"epoch": 0.05,
"eval_loss": 7.110766410827637,
"eval_runtime": 4.4164,
"eval_samples_per_second": 226.429,
"eval_steps_per_second": 14.265,
"step": 13000
},
{
"epoch": 0.05,
"learning_rate": 4.918589864242038e-05,
"loss": 7.0147,
"step": 13500
},
{
"epoch": 0.05,
"learning_rate": 4.915573556877905e-05,
"loss": 6.9895,
"step": 14000
},
{
"epoch": 0.05,
"eval_loss": 6.985559940338135,
"eval_runtime": 4.3309,
"eval_samples_per_second": 230.901,
"eval_steps_per_second": 14.547,
"step": 14000
},
{
"epoch": 0.05,
"learning_rate": 4.912563282128499e-05,
"loss": 6.9634,
"step": 14500
},
{
"epoch": 0.05,
"learning_rate": 4.9095469747643666e-05,
"loss": 6.9537,
"step": 15000
},
{
"epoch": 0.05,
"eval_loss": 7.012362480163574,
"eval_runtime": 4.2388,
"eval_samples_per_second": 235.918,
"eval_steps_per_second": 14.863,
"step": 15000
},
{
"epoch": 0.06,
"learning_rate": 4.9065306674002326e-05,
"loss": 6.9295,
"step": 15500
},
{
"epoch": 0.06,
"learning_rate": 4.903514360036099e-05,
"loss": 6.9067,
"step": 16000
},
{
"epoch": 0.06,
"eval_loss": 6.980933666229248,
"eval_runtime": 4.272,
"eval_samples_per_second": 234.082,
"eval_steps_per_second": 14.747,
"step": 16000
},
{
"epoch": 0.06,
"learning_rate": 4.900504085286694e-05,
"loss": 6.8896,
"step": 16500
},
{
"epoch": 0.06,
"learning_rate": 4.897487777922561e-05,
"loss": 6.8678,
"step": 17000
},
{
"epoch": 0.06,
"eval_loss": 6.871150493621826,
"eval_runtime": 4.2596,
"eval_samples_per_second": 234.765,
"eval_steps_per_second": 14.79,
"step": 17000
},
{
"epoch": 0.06,
"learning_rate": 4.894471470558427e-05,
"loss": 6.8301,
"step": 17500
},
{
"epoch": 0.07,
"learning_rate": 4.891455163194294e-05,
"loss": 6.8388,
"step": 18000
},
{
"epoch": 0.07,
"eval_loss": 6.891202449798584,
"eval_runtime": 4.2654,
"eval_samples_per_second": 234.445,
"eval_steps_per_second": 14.77,
"step": 18000
},
{
"epoch": 0.07,
"learning_rate": 4.8884388558301604e-05,
"loss": 6.8197,
"step": 18500
},
{
"epoch": 0.07,
"learning_rate": 4.885422548466027e-05,
"loss": 6.8031,
"step": 19000
},
{
"epoch": 0.07,
"eval_loss": 6.856838703155518,
"eval_runtime": 4.3078,
"eval_samples_per_second": 232.139,
"eval_steps_per_second": 14.625,
"step": 19000
},
{
"epoch": 0.07,
"learning_rate": 4.882406241101893e-05,
"loss": 6.7821,
"step": 19500
},
{
"epoch": 0.07,
"learning_rate": 4.8793899337377605e-05,
"loss": 6.7818,
"step": 20000
},
{
"epoch": 0.07,
"eval_loss": 6.831137180328369,
"eval_runtime": 4.3011,
"eval_samples_per_second": 232.498,
"eval_steps_per_second": 14.647,
"step": 20000
},
{
"epoch": 0.07,
"learning_rate": 4.876379658988355e-05,
"loss": 6.7754,
"step": 20500
},
{
"epoch": 0.08,
"learning_rate": 4.8733633516242215e-05,
"loss": 6.7548,
"step": 21000
},
{
"epoch": 0.08,
"eval_loss": 6.813143253326416,
"eval_runtime": 6.2764,
"eval_samples_per_second": 159.326,
"eval_steps_per_second": 10.038,
"step": 21000
},
{
"epoch": 0.08,
"learning_rate": 4.870347044260088e-05,
"loss": 6.7458,
"step": 21500
},
{
"epoch": 0.08,
"learning_rate": 4.867330736895955e-05,
"loss": 6.7351,
"step": 22000
},
{
"epoch": 0.08,
"eval_loss": 6.724792957305908,
"eval_runtime": 4.2249,
"eval_samples_per_second": 236.695,
"eval_steps_per_second": 14.912,
"step": 22000
},
{
"epoch": 0.08,
"learning_rate": 4.864314429531821e-05,
"loss": 6.7107,
"step": 22500
},
{
"epoch": 0.08,
"learning_rate": 4.8612981221676876e-05,
"loss": 6.6927,
"step": 23000
},
{
"epoch": 0.08,
"eval_loss": 6.75374698638916,
"eval_runtime": 4.3504,
"eval_samples_per_second": 229.864,
"eval_steps_per_second": 14.481,
"step": 23000
},
{
"epoch": 0.09,
"learning_rate": 4.8582878474182825e-05,
"loss": 6.6837,
"step": 23500
},
{
"epoch": 0.09,
"learning_rate": 4.8552715400541485e-05,
"loss": 6.6863,
"step": 24000
},
{
"epoch": 0.09,
"eval_loss": 6.72822380065918,
"eval_runtime": 4.384,
"eval_samples_per_second": 228.102,
"eval_steps_per_second": 14.37,
"step": 24000
},
{
"epoch": 0.09,
"learning_rate": 4.852255232690016e-05,
"loss": 6.6608,
"step": 24500
},
{
"epoch": 0.09,
"learning_rate": 4.849238925325882e-05,
"loss": 6.6452,
"step": 25000
},
{
"epoch": 0.09,
"eval_loss": 6.719581604003906,
"eval_runtime": 4.3427,
"eval_samples_per_second": 230.271,
"eval_steps_per_second": 14.507,
"step": 25000
},
{
"epoch": 0.09,
"learning_rate": 4.8462226179617486e-05,
"loss": 6.6261,
"step": 25500
},
{
"epoch": 0.09,
"learning_rate": 4.8432123432123436e-05,
"loss": 6.6153,
"step": 26000
},
{
"epoch": 0.09,
"eval_loss": 6.65714168548584,
"eval_runtime": 4.3559,
"eval_samples_per_second": 229.574,
"eval_steps_per_second": 14.463,
"step": 26000
},
{
"epoch": 0.1,
"learning_rate": 4.84019603584821e-05,
"loss": 6.6081,
"step": 26500
},
{
"epoch": 0.1,
"learning_rate": 4.837179728484076e-05,
"loss": 6.6114,
"step": 27000
},
{
"epoch": 0.1,
"eval_loss": 6.6556854248046875,
"eval_runtime": 4.4221,
"eval_samples_per_second": 226.135,
"eval_steps_per_second": 14.247,
"step": 27000
},
{
"epoch": 0.1,
"learning_rate": 4.834163421119943e-05,
"loss": 6.5909,
"step": 27500
},
{
"epoch": 0.1,
"learning_rate": 4.831153146370538e-05,
"loss": 6.6003,
"step": 28000
},
{
"epoch": 0.1,
"eval_loss": 6.645580291748047,
"eval_runtime": 4.321,
"eval_samples_per_second": 231.43,
"eval_steps_per_second": 14.58,
"step": 28000
},
{
"epoch": 0.1,
"learning_rate": 4.8281368390064047e-05,
"loss": 6.5773,
"step": 28500
},
{
"epoch": 0.1,
"learning_rate": 4.825120531642271e-05,
"loss": 6.5656,
"step": 29000
},
{
"epoch": 0.1,
"eval_loss": 6.621737480163574,
"eval_runtime": 4.2363,
"eval_samples_per_second": 236.056,
"eval_steps_per_second": 14.872,
"step": 29000
},
{
"epoch": 0.11,
"learning_rate": 4.822104224278138e-05,
"loss": 6.5753,
"step": 29500
},
{
"epoch": 0.11,
"learning_rate": 4.819087916914004e-05,
"loss": 6.5518,
"step": 30000
},
{
"epoch": 0.11,
"eval_loss": 6.60227632522583,
"eval_runtime": 4.3472,
"eval_samples_per_second": 230.034,
"eval_steps_per_second": 14.492,
"step": 30000
},
{
"epoch": 0.11,
"learning_rate": 4.816071609549871e-05,
"loss": 6.5431,
"step": 30500
},
{
"epoch": 0.11,
"learning_rate": 4.813061334800466e-05,
"loss": 6.5407,
"step": 31000
},
{
"epoch": 0.11,
"eval_loss": 6.591599464416504,
"eval_runtime": 4.2945,
"eval_samples_per_second": 232.858,
"eval_steps_per_second": 14.67,
"step": 31000
},
{
"epoch": 0.11,
"learning_rate": 4.810045027436332e-05,
"loss": 6.5197,
"step": 31500
},
{
"epoch": 0.12,
"learning_rate": 4.8070287200721984e-05,
"loss": 6.5069,
"step": 32000
},
{
"epoch": 0.12,
"eval_loss": 6.563199996948242,
"eval_runtime": 4.2853,
"eval_samples_per_second": 233.356,
"eval_steps_per_second": 14.701,
"step": 32000
},
{
"epoch": 0.12,
"learning_rate": 4.804012412708065e-05,
"loss": 6.508,
"step": 32500
},
{
"epoch": 0.12,
"learning_rate": 4.800996105343932e-05,
"loss": 6.5031,
"step": 33000
},
{
"epoch": 0.12,
"eval_loss": 6.495542526245117,
"eval_runtime": 4.2956,
"eval_samples_per_second": 232.797,
"eval_steps_per_second": 14.666,
"step": 33000
},
{
"epoch": 0.12,
"learning_rate": 4.797979797979798e-05,
"loss": 6.4874,
"step": 33500
},
{
"epoch": 0.12,
"learning_rate": 4.7949634906156645e-05,
"loss": 6.4795,
"step": 34000
},
{
"epoch": 0.12,
"eval_loss": 6.54636812210083,
"eval_runtime": 4.3345,
"eval_samples_per_second": 230.706,
"eval_steps_per_second": 14.534,
"step": 34000
},
{
"epoch": 0.12,
"learning_rate": 4.7919532158662595e-05,
"loss": 6.4765,
"step": 34500
},
{
"epoch": 0.13,
"learning_rate": 4.788936908502126e-05,
"loss": 6.4749,
"step": 35000
},
{
"epoch": 0.13,
"eval_loss": 6.562394142150879,
"eval_runtime": 4.214,
"eval_samples_per_second": 237.303,
"eval_steps_per_second": 14.95,
"step": 35000
},
{
"epoch": 0.13,
"learning_rate": 4.785920601137993e-05,
"loss": 6.4512,
"step": 35500
},
{
"epoch": 0.13,
"learning_rate": 4.7829042937738596e-05,
"loss": 6.4453,
"step": 36000
},
{
"epoch": 0.13,
"eval_loss": 6.480181694030762,
"eval_runtime": 4.3457,
"eval_samples_per_second": 230.112,
"eval_steps_per_second": 14.497,
"step": 36000
},
{
"epoch": 0.13,
"learning_rate": 4.779894019024454e-05,
"loss": 6.471,
"step": 36500
},
{
"epoch": 0.13,
"learning_rate": 4.7768777116603206e-05,
"loss": 6.4371,
"step": 37000
},
{
"epoch": 0.13,
"eval_loss": 6.513635635375977,
"eval_runtime": 4.3597,
"eval_samples_per_second": 229.373,
"eval_steps_per_second": 14.45,
"step": 37000
},
{
"epoch": 0.14,
"learning_rate": 4.773861404296187e-05,
"loss": 6.4295,
"step": 37500
},
{
"epoch": 0.14,
"learning_rate": 4.770845096932054e-05,
"loss": 6.4302,
"step": 38000
},
{
"epoch": 0.14,
"eval_loss": 6.495160102844238,
"eval_runtime": 4.2204,
"eval_samples_per_second": 236.944,
"eval_steps_per_second": 14.927,
"step": 38000
},
{
"epoch": 0.14,
"learning_rate": 4.76782878956792e-05,
"loss": 6.4215,
"step": 38500
},
{
"epoch": 0.14,
"learning_rate": 4.764818514818515e-05,
"loss": 6.4062,
"step": 39000
},
{
"epoch": 0.14,
"eval_loss": 6.4935173988342285,
"eval_runtime": 4.366,
"eval_samples_per_second": 229.044,
"eval_steps_per_second": 14.43,
"step": 39000
},
{
"epoch": 0.14,
"learning_rate": 4.7618022074543816e-05,
"loss": 6.3908,
"step": 39500
},
{
"epoch": 0.14,
"learning_rate": 4.7587859000902476e-05,
"loss": 6.4109,
"step": 40000
},
{
"epoch": 0.14,
"eval_loss": 6.459984302520752,
"eval_runtime": 4.3414,
"eval_samples_per_second": 230.343,
"eval_steps_per_second": 14.512,
"step": 40000
},
{
"epoch": 0.15,
"learning_rate": 4.755769592726115e-05,
"loss": 6.3959,
"step": 40500
},
{
"epoch": 0.15,
"learning_rate": 4.752753285361981e-05,
"loss": 6.3828,
"step": 41000
},
{
"epoch": 0.15,
"eval_loss": 6.429795265197754,
"eval_runtime": 4.3398,
"eval_samples_per_second": 230.427,
"eval_steps_per_second": 14.517,
"step": 41000
},
{
"epoch": 0.15,
"learning_rate": 4.749736977997848e-05,
"loss": 6.3775,
"step": 41500
},
{
"epoch": 0.15,
"learning_rate": 4.7467206706337144e-05,
"loss": 6.3739,
"step": 42000
},
{
"epoch": 0.15,
"eval_loss": 6.406271457672119,
"eval_runtime": 4.3187,
"eval_samples_per_second": 231.553,
"eval_steps_per_second": 14.588,
"step": 42000
},
{
"epoch": 0.15,
"learning_rate": 4.743704363269581e-05,
"loss": 6.3638,
"step": 42500
},
{
"epoch": 0.16,
"learning_rate": 4.7406940885201754e-05,
"loss": 6.3449,
"step": 43000
},
{
"epoch": 0.16,
"eval_loss": 6.363914489746094,
"eval_runtime": 6.5899,
"eval_samples_per_second": 151.748,
"eval_steps_per_second": 9.56,
"step": 43000
},
{
"epoch": 0.16,
"learning_rate": 4.737677781156043e-05,
"loss": 6.3654,
"step": 43500
},
{
"epoch": 0.16,
"learning_rate": 4.734661473791909e-05,
"loss": 6.3473,
"step": 44000
},
{
"epoch": 0.16,
"eval_loss": 6.375613212585449,
"eval_runtime": 4.323,
"eval_samples_per_second": 231.319,
"eval_steps_per_second": 14.573,
"step": 44000
},
{
"epoch": 0.16,
"learning_rate": 4.7316451664277755e-05,
"loss": 6.3486,
"step": 44500
},
{
"epoch": 0.16,
"learning_rate": 4.7286348916783704e-05,
"loss": 6.3462,
"step": 45000
},
{
"epoch": 0.16,
"eval_loss": 6.379289150238037,
"eval_runtime": 4.2624,
"eval_samples_per_second": 234.609,
"eval_steps_per_second": 14.78,
"step": 45000
},
{
"epoch": 0.16,
"learning_rate": 4.725624616928965e-05,
"loss": 6.3313,
"step": 45500
},
{
"epoch": 0.17,
"learning_rate": 4.7226083095648314e-05,
"loss": 6.3323,
"step": 46000
},
{
"epoch": 0.17,
"eval_loss": 6.387815475463867,
"eval_runtime": 4.3758,
"eval_samples_per_second": 228.53,
"eval_steps_per_second": 14.397,
"step": 46000
},
{
"epoch": 0.17,
"learning_rate": 4.719592002200698e-05,
"loss": 6.3256,
"step": 46500
},
{
"epoch": 0.17,
"learning_rate": 4.716575694836565e-05,
"loss": 6.3098,
"step": 47000
},
{
"epoch": 0.17,
"eval_loss": 6.394986629486084,
"eval_runtime": 4.3311,
"eval_samples_per_second": 230.889,
"eval_steps_per_second": 14.546,
"step": 47000
},
{
"epoch": 0.17,
"learning_rate": 4.713559387472431e-05,
"loss": 6.3082,
"step": 47500
},
{
"epoch": 0.17,
"learning_rate": 4.7105430801082975e-05,
"loss": 6.298,
"step": 48000
},
{
"epoch": 0.17,
"eval_loss": 6.3571929931640625,
"eval_runtime": 4.3274,
"eval_samples_per_second": 231.084,
"eval_steps_per_second": 14.558,
"step": 48000
},
{
"epoch": 0.18,
"learning_rate": 4.707526772744164e-05,
"loss": 6.3032,
"step": 48500
},
{
"epoch": 0.18,
"learning_rate": 4.704510465380031e-05,
"loss": 6.3095,
"step": 49000
},
{
"epoch": 0.18,
"eval_loss": 6.37529182434082,
"eval_runtime": 4.2983,
"eval_samples_per_second": 232.652,
"eval_steps_per_second": 14.657,
"step": 49000
},
{
"epoch": 0.18,
"learning_rate": 4.701500190630625e-05,
"loss": 6.2991,
"step": 49500
},
{
"epoch": 0.18,
"learning_rate": 4.6984838832664926e-05,
"loss": 6.2891,
"step": 50000
},
{
"epoch": 0.18,
"eval_loss": 6.396151542663574,
"eval_runtime": 4.4316,
"eval_samples_per_second": 225.65,
"eval_steps_per_second": 14.216,
"step": 50000
},
{
"epoch": 0.18,
"learning_rate": 4.6954675759023586e-05,
"loss": 6.2902,
"step": 50500
},
{
"epoch": 0.18,
"learning_rate": 4.692451268538225e-05,
"loss": 6.2847,
"step": 51000
},
{
"epoch": 0.18,
"eval_loss": 6.3496623039245605,
"eval_runtime": 4.3566,
"eval_samples_per_second": 229.537,
"eval_steps_per_second": 14.461,
"step": 51000
},
{
"epoch": 0.19,
"learning_rate": 4.689434961174092e-05,
"loss": 6.278,
"step": 51500
},
{
"epoch": 0.19,
"learning_rate": 4.686424686424687e-05,
"loss": 6.2635,
"step": 52000
},
{
"epoch": 0.19,
"eval_loss": 6.37539005279541,
"eval_runtime": 4.338,
"eval_samples_per_second": 230.523,
"eval_steps_per_second": 14.523,
"step": 52000
},
{
"epoch": 0.19,
"learning_rate": 4.683408379060553e-05,
"loss": 6.2571,
"step": 52500
},
{
"epoch": 0.19,
"learning_rate": 4.68039207169642e-05,
"loss": 6.2554,
"step": 53000
},
{
"epoch": 0.19,
"eval_loss": 6.27804708480835,
"eval_runtime": 4.3755,
"eval_samples_per_second": 228.546,
"eval_steps_per_second": 14.398,
"step": 53000
},
{
"epoch": 0.19,
"learning_rate": 4.6773757643322863e-05,
"loss": 6.2548,
"step": 53500
},
{
"epoch": 0.2,
"learning_rate": 4.6743594569681524e-05,
"loss": 6.2407,
"step": 54000
},
{
"epoch": 0.2,
"eval_loss": 6.285305023193359,
"eval_runtime": 4.3656,
"eval_samples_per_second": 229.064,
"eval_steps_per_second": 14.431,
"step": 54000
},
{
"epoch": 0.2,
"learning_rate": 4.67134314960402e-05,
"loss": 6.2262,
"step": 54500
},
{
"epoch": 0.2,
"learning_rate": 4.668326842239886e-05,
"loss": 6.2222,
"step": 55000
},
{
"epoch": 0.2,
"eval_loss": 6.287919044494629,
"eval_runtime": 4.3582,
"eval_samples_per_second": 229.452,
"eval_steps_per_second": 14.455,
"step": 55000
},
{
"epoch": 0.2,
"learning_rate": 4.6653105348757525e-05,
"loss": 6.2463,
"step": 55500
},
{
"epoch": 0.2,
"learning_rate": 4.6623002601263474e-05,
"loss": 6.2233,
"step": 56000
},
{
"epoch": 0.2,
"eval_loss": 6.290223598480225,
"eval_runtime": 4.3327,
"eval_samples_per_second": 230.801,
"eval_steps_per_second": 14.54,
"step": 56000
},
{
"epoch": 0.2,
"learning_rate": 4.659283952762214e-05,
"loss": 6.2253,
"step": 56500
},
{
"epoch": 0.21,
"learning_rate": 4.65626764539808e-05,
"loss": 6.2147,
"step": 57000
},
{
"epoch": 0.21,
"eval_loss": 6.265440464019775,
"eval_runtime": 4.367,
"eval_samples_per_second": 228.992,
"eval_steps_per_second": 14.426,
"step": 57000
},
{
"epoch": 0.21,
"learning_rate": 4.653251338033947e-05,
"loss": 6.2245,
"step": 57500
},
{
"epoch": 0.21,
"learning_rate": 4.650241063284542e-05,
"loss": 6.2118,
"step": 58000
},
{
"epoch": 0.21,
"eval_loss": 6.265470504760742,
"eval_runtime": 4.2827,
"eval_samples_per_second": 233.497,
"eval_steps_per_second": 14.71,
"step": 58000
},
{
"epoch": 0.21,
"learning_rate": 4.6472247559204085e-05,
"loss": 6.2082,
"step": 58500
},
{
"epoch": 0.21,
"learning_rate": 4.6442084485562745e-05,
"loss": 6.2039,
"step": 59000
},
{
"epoch": 0.21,
"eval_loss": 6.2890191078186035,
"eval_runtime": 4.3624,
"eval_samples_per_second": 229.229,
"eval_steps_per_second": 14.441,
"step": 59000
},
{
"epoch": 0.22,
"learning_rate": 4.641192141192142e-05,
"loss": 6.2145,
"step": 59500
},
{
"epoch": 0.22,
"learning_rate": 4.638175833828008e-05,
"loss": 6.1805,
"step": 60000
},
{
"epoch": 0.22,
"eval_loss": 6.290516376495361,
"eval_runtime": 4.3339,
"eval_samples_per_second": 230.741,
"eval_steps_per_second": 14.537,
"step": 60000
},
{
"epoch": 0.22,
"learning_rate": 4.6351595264638746e-05,
"loss": 6.1993,
"step": 60500
},
{
"epoch": 0.22,
"learning_rate": 4.632143219099741e-05,
"loss": 6.1947,
"step": 61000
},
{
"epoch": 0.22,
"eval_loss": 6.219234466552734,
"eval_runtime": 4.3446,
"eval_samples_per_second": 230.172,
"eval_steps_per_second": 14.501,
"step": 61000
},
{
"epoch": 0.22,
"learning_rate": 4.6291329443503356e-05,
"loss": 6.2058,
"step": 61500
},
{
"epoch": 0.22,
"learning_rate": 4.626116636986202e-05,
"loss": 6.1748,
"step": 62000
},
{
"epoch": 0.22,
"eval_loss": 6.209784030914307,
"eval_runtime": 4.371,
"eval_samples_per_second": 228.781,
"eval_steps_per_second": 14.413,
"step": 62000
},
{
"epoch": 0.23,
"learning_rate": 4.623100329622069e-05,
"loss": 6.1758,
"step": 62500
},
{
"epoch": 0.23,
"learning_rate": 4.6200840222579356e-05,
"loss": 6.1761,
"step": 63000
},
{
"epoch": 0.23,
"eval_loss": 6.212859153747559,
"eval_runtime": 4.4203,
"eval_samples_per_second": 226.231,
"eval_steps_per_second": 14.253,
"step": 63000
},
{
"epoch": 0.23,
"learning_rate": 4.617067714893802e-05,
"loss": 6.1741,
"step": 63500
},
{
"epoch": 0.23,
"learning_rate": 4.6140514075296684e-05,
"loss": 6.156,
"step": 64000
},
{
"epoch": 0.23,
"eval_loss": 6.214317321777344,
"eval_runtime": 4.3952,
"eval_samples_per_second": 227.52,
"eval_steps_per_second": 14.334,
"step": 64000
},
{
"epoch": 0.23,
"learning_rate": 4.611035100165535e-05,
"loss": 6.1674,
"step": 64500
},
{
"epoch": 0.24,
"learning_rate": 4.608018792801402e-05,
"loss": 6.1669,
"step": 65000
},
{
"epoch": 0.24,
"eval_loss": 6.188581943511963,
"eval_runtime": 4.4051,
"eval_samples_per_second": 227.008,
"eval_steps_per_second": 14.302,
"step": 65000
},
{
"epoch": 0.24,
"learning_rate": 4.605008518051997e-05,
"loss": 6.16,
"step": 65500
},
{
"epoch": 0.24,
"learning_rate": 4.6019922106878634e-05,
"loss": 6.1483,
"step": 66000
},
{
"epoch": 0.24,
"eval_loss": 6.198948383331299,
"eval_runtime": 4.3438,
"eval_samples_per_second": 230.215,
"eval_steps_per_second": 14.504,
"step": 66000
},
{
"epoch": 0.24,
"learning_rate": 4.5989759033237294e-05,
"loss": 6.1433,
"step": 66500
},
{
"epoch": 0.24,
"learning_rate": 4.595959595959596e-05,
"loss": 6.1573,
"step": 67000
},
{
"epoch": 0.24,
"eval_loss": 6.161040782928467,
"eval_runtime": 6.3766,
"eval_samples_per_second": 156.823,
"eval_steps_per_second": 9.88,
"step": 67000
},
{
"epoch": 0.24,
"learning_rate": 4.592943288595463e-05,
"loss": 6.1463,
"step": 67500
},
{
"epoch": 0.25,
"learning_rate": 4.589933013846058e-05,
"loss": 6.1414,
"step": 68000
},
{
"epoch": 0.25,
"eval_loss": 6.161304473876953,
"eval_runtime": 4.3109,
"eval_samples_per_second": 231.972,
"eval_steps_per_second": 14.614,
"step": 68000
},
{
"epoch": 0.25,
"learning_rate": 4.586916706481924e-05,
"loss": 6.1435,
"step": 68500
},
{
"epoch": 0.25,
"learning_rate": 4.583900399117791e-05,
"loss": 6.1202,
"step": 69000
},
{
"epoch": 0.25,
"eval_loss": 6.174992561340332,
"eval_runtime": 4.3065,
"eval_samples_per_second": 232.209,
"eval_steps_per_second": 14.629,
"step": 69000
},
{
"epoch": 0.25,
"learning_rate": 4.580884091753657e-05,
"loss": 6.1219,
"step": 69500
},
{
"epoch": 0.25,
"learning_rate": 4.577867784389523e-05,
"loss": 6.115,
"step": 70000
},
{
"epoch": 0.25,
"eval_loss": 6.18692684173584,
"eval_runtime": 4.3917,
"eval_samples_per_second": 227.7,
"eval_steps_per_second": 14.345,
"step": 70000
},
{
"epoch": 0.26,
"learning_rate": 4.5748514770253906e-05,
"loss": 6.1257,
"step": 70500
},
{
"epoch": 0.26,
"learning_rate": 4.571841202275985e-05,
"loss": 6.1381,
"step": 71000
},
{
"epoch": 0.26,
"eval_loss": 6.09669828414917,
"eval_runtime": 4.3181,
"eval_samples_per_second": 231.583,
"eval_steps_per_second": 14.59,
"step": 71000
},
{
"epoch": 0.26,
"learning_rate": 4.5688248949118515e-05,
"loss": 6.1158,
"step": 71500
},
{
"epoch": 0.26,
"learning_rate": 4.565808587547718e-05,
"loss": 6.1182,
"step": 72000
},
{
"epoch": 0.26,
"eval_loss": 6.138221740722656,
"eval_runtime": 4.3099,
"eval_samples_per_second": 232.023,
"eval_steps_per_second": 14.617,
"step": 72000
},
{
"epoch": 0.26,
"learning_rate": 4.562792280183585e-05,
"loss": 6.1063,
"step": 72500
},
{
"epoch": 0.26,
"learning_rate": 4.559775972819451e-05,
"loss": 6.096,
"step": 73000
},
{
"epoch": 0.26,
"eval_loss": 6.156211853027344,
"eval_runtime": 4.3788,
"eval_samples_per_second": 228.373,
"eval_steps_per_second": 14.388,
"step": 73000
},
{
"epoch": 0.27,
"learning_rate": 4.5567596654553177e-05,
"loss": 6.117,
"step": 73500
},
{
"epoch": 0.27,
"learning_rate": 4.5537433580911844e-05,
"loss": 6.1019,
"step": 74000
},
{
"epoch": 0.27,
"eval_loss": 6.182168483734131,
"eval_runtime": 4.3493,
"eval_samples_per_second": 229.923,
"eval_steps_per_second": 14.485,
"step": 74000
},
{
"epoch": 0.27,
"learning_rate": 4.550727050727051e-05,
"loss": 6.0936,
"step": 74500
},
{
"epoch": 0.27,
"learning_rate": 4.547716775977646e-05,
"loss": 6.0907,
"step": 75000
},
{
"epoch": 0.27,
"eval_loss": 6.1323113441467285,
"eval_runtime": 4.3544,
"eval_samples_per_second": 229.654,
"eval_steps_per_second": 14.468,
"step": 75000
},
{
"epoch": 0.27,
"learning_rate": 4.544706501228241e-05,
"loss": 6.086,
"step": 75500
},
{
"epoch": 0.28,
"learning_rate": 4.541690193864107e-05,
"loss": 6.0914,
"step": 76000
},
{
"epoch": 0.28,
"eval_loss": 6.1409149169921875,
"eval_runtime": 4.3453,
"eval_samples_per_second": 230.134,
"eval_steps_per_second": 14.498,
"step": 76000
},
{
"epoch": 0.28,
"learning_rate": 4.538673886499974e-05,
"loss": 6.0774,
"step": 76500
},
{
"epoch": 0.28,
"learning_rate": 4.5356575791358404e-05,
"loss": 6.0911,
"step": 77000
},
{
"epoch": 0.28,
"eval_loss": 6.122730255126953,
"eval_runtime": 4.2779,
"eval_samples_per_second": 233.759,
"eval_steps_per_second": 14.727,
"step": 77000
},
{
"epoch": 0.28,
"learning_rate": 4.5326412717717064e-05,
"loss": 6.0764,
"step": 77500
},
{
"epoch": 0.28,
"learning_rate": 4.5296309970223013e-05,
"loss": 6.0789,
"step": 78000
},
{
"epoch": 0.28,
"eval_loss": 6.096778392791748,
"eval_runtime": 6.4313,
"eval_samples_per_second": 155.49,
"eval_steps_per_second": 9.796,
"step": 78000
},
{
"epoch": 0.28,
"learning_rate": 4.526614689658168e-05,
"loss": 6.0659,
"step": 78500
},
{
"epoch": 0.29,
"learning_rate": 4.523598382294035e-05,
"loss": 6.0649,
"step": 79000
},
{
"epoch": 0.29,
"eval_loss": 6.110874176025391,
"eval_runtime": 4.3543,
"eval_samples_per_second": 229.655,
"eval_steps_per_second": 14.468,
"step": 79000
},
{
"epoch": 0.29,
"learning_rate": 4.520582074929901e-05,
"loss": 6.0598,
"step": 79500
},
{
"epoch": 0.29,
"learning_rate": 4.517565767565768e-05,
"loss": 6.0721,
"step": 80000
},
{
"epoch": 0.29,
"eval_loss": 6.144255638122559,
"eval_runtime": 4.2916,
"eval_samples_per_second": 233.012,
"eval_steps_per_second": 14.68,
"step": 80000
},
{
"epoch": 0.29,
"learning_rate": 4.5145554928163624e-05,
"loss": 6.0663,
"step": 80500
},
{
"epoch": 0.29,
"learning_rate": 4.511539185452229e-05,
"loss": 6.0543,
"step": 81000
},
{
"epoch": 0.29,
"eval_loss": 6.099493980407715,
"eval_runtime": 4.4139,
"eval_samples_per_second": 226.556,
"eval_steps_per_second": 14.273,
"step": 81000
},
{
"epoch": 0.29,
"learning_rate": 4.508522878088096e-05,
"loss": 6.0371,
"step": 81500
},
{
"epoch": 0.3,
"learning_rate": 4.5055065707239625e-05,
"loss": 6.0491,
"step": 82000
},
{
"epoch": 0.3,
"eval_loss": 6.0733723640441895,
"eval_runtime": 4.3153,
"eval_samples_per_second": 231.736,
"eval_steps_per_second": 14.599,
"step": 82000
},
{
"epoch": 0.3,
"learning_rate": 4.5024902633598285e-05,
"loss": 6.0488,
"step": 82500
},
{
"epoch": 0.3,
"learning_rate": 4.499473955995695e-05,
"loss": 6.0392,
"step": 83000
},
{
"epoch": 0.3,
"eval_loss": 6.062073230743408,
"eval_runtime": 4.372,
"eval_samples_per_second": 228.728,
"eval_steps_per_second": 14.41,
"step": 83000
},
{
"epoch": 0.3,
"learning_rate": 4.496457648631562e-05,
"loss": 6.0402,
"step": 83500
},
{
"epoch": 0.3,
"learning_rate": 4.4934413412674286e-05,
"loss": 6.0487,
"step": 84000
},
{
"epoch": 0.3,
"eval_loss": 6.08383321762085,
"eval_runtime": 6.5143,
"eval_samples_per_second": 153.507,
"eval_steps_per_second": 9.671,
"step": 84000
},
{
"epoch": 0.31,
"learning_rate": 4.4904310665180236e-05,
"loss": 6.0402,
"step": 84500
},
{
"epoch": 0.31,
"learning_rate": 4.487420791768618e-05,
"loss": 6.0314,
"step": 85000
},
{
"epoch": 0.31,
"eval_loss": 6.082727432250977,
"eval_runtime": 4.3196,
"eval_samples_per_second": 231.502,
"eval_steps_per_second": 14.585,
"step": 85000
},
{
"epoch": 0.31,
"learning_rate": 4.4844044844044845e-05,
"loss": 6.0249,
"step": 85500
},
{
"epoch": 0.31,
"learning_rate": 4.481388177040351e-05,
"loss": 6.0329,
"step": 86000
},
{
"epoch": 0.31,
"eval_loss": 6.075023651123047,
"eval_runtime": 4.2685,
"eval_samples_per_second": 234.274,
"eval_steps_per_second": 14.759,
"step": 86000
},
{
"epoch": 0.31,
"learning_rate": 4.478371869676218e-05,
"loss": 6.0329,
"step": 86500
},
{
"epoch": 0.31,
"learning_rate": 4.475361594926812e-05,
"loss": 6.0233,
"step": 87000
},
{
"epoch": 0.31,
"eval_loss": 6.072445392608643,
"eval_runtime": 4.3773,
"eval_samples_per_second": 228.453,
"eval_steps_per_second": 14.393,
"step": 87000
},
{
"epoch": 0.32,
"learning_rate": 4.472345287562679e-05,
"loss": 6.032,
"step": 87500
},
{
"epoch": 0.32,
"learning_rate": 4.4693289801985456e-05,
"loss": 6.0296,
"step": 88000
},
{
"epoch": 0.32,
"eval_loss": 6.064884185791016,
"eval_runtime": 4.3333,
"eval_samples_per_second": 230.771,
"eval_steps_per_second": 14.539,
"step": 88000
},
{
"epoch": 0.32,
"learning_rate": 4.466312672834412e-05,
"loss": 6.0324,
"step": 88500
},
{
"epoch": 0.32,
"learning_rate": 4.463296365470278e-05,
"loss": 6.0141,
"step": 89000
},
{
"epoch": 0.32,
"eval_loss": 6.0311689376831055,
"eval_runtime": 4.3204,
"eval_samples_per_second": 231.461,
"eval_steps_per_second": 14.582,
"step": 89000
},
{
"epoch": 0.32,
"learning_rate": 4.460280058106146e-05,
"loss": 6.0171,
"step": 89500
},
{
"epoch": 0.33,
"learning_rate": 4.457263750742012e-05,
"loss": 6.0109,
"step": 90000
},
{
"epoch": 0.33,
"eval_loss": 6.072292804718018,
"eval_runtime": 6.2611,
"eval_samples_per_second": 159.717,
"eval_steps_per_second": 10.062,
"step": 90000
},
{
"epoch": 0.33,
"learning_rate": 4.4542474433778784e-05,
"loss": 6.001,
"step": 90500
},
{
"epoch": 0.33,
"learning_rate": 4.4512371686284734e-05,
"loss": 5.9988,
"step": 91000
},
{
"epoch": 0.33,
"eval_loss": 6.042818546295166,
"eval_runtime": 4.3121,
"eval_samples_per_second": 231.908,
"eval_steps_per_second": 14.61,
"step": 91000
},
{
"epoch": 0.33,
"learning_rate": 4.44822086126434e-05,
"loss": 6.0103,
"step": 91500
},
{
"epoch": 0.33,
"learning_rate": 4.445210586514934e-05,
"loss": 6.0061,
"step": 92000
},
{
"epoch": 0.33,
"eval_loss": 6.017780303955078,
"eval_runtime": 4.3583,
"eval_samples_per_second": 229.448,
"eval_steps_per_second": 14.455,
"step": 92000
},
{
"epoch": 0.33,
"learning_rate": 4.442194279150801e-05,
"loss": 6.0035,
"step": 92500
},
{
"epoch": 0.34,
"learning_rate": 4.439177971786668e-05,
"loss": 6.0009,
"step": 93000
},
{
"epoch": 0.34,
"eval_loss": 6.078599452972412,
"eval_runtime": 4.3628,
"eval_samples_per_second": 229.211,
"eval_steps_per_second": 14.44,
"step": 93000
},
{
"epoch": 0.34,
"learning_rate": 4.436161664422534e-05,
"loss": 6.0004,
"step": 93500
},
{
"epoch": 0.34,
"learning_rate": 4.433145357058401e-05,
"loss": 5.9976,
"step": 94000
},
{
"epoch": 0.34,
"eval_loss": 6.076462745666504,
"eval_runtime": 4.4135,
"eval_samples_per_second": 226.579,
"eval_steps_per_second": 14.274,
"step": 94000
},
{
"epoch": 0.34,
"learning_rate": 4.4301350823089954e-05,
"loss": 6.004,
"step": 94500
},
{
"epoch": 0.34,
"learning_rate": 4.427118774944862e-05,
"loss": 5.9857,
"step": 95000
},
{
"epoch": 0.34,
"eval_loss": 6.037008762359619,
"eval_runtime": 4.3059,
"eval_samples_per_second": 232.239,
"eval_steps_per_second": 14.631,
"step": 95000
},
{
"epoch": 0.35,
"learning_rate": 4.424102467580729e-05,
"loss": 5.9964,
"step": 95500
},
{
"epoch": 0.35,
"learning_rate": 4.4210861602165955e-05,
"loss": 5.9927,
"step": 96000
},
{
"epoch": 0.35,
"eval_loss": 6.015842914581299,
"eval_runtime": 4.2524,
"eval_samples_per_second": 235.159,
"eval_steps_per_second": 14.815,
"step": 96000
},
{
"epoch": 0.35,
"learning_rate": 4.4180698528524615e-05,
"loss": 5.999,
"step": 96500
},
{
"epoch": 0.35,
"learning_rate": 4.415053545488328e-05,
"loss": 5.9784,
"step": 97000
},
{
"epoch": 0.35,
"eval_loss": 6.02786111831665,
"eval_runtime": 4.3676,
"eval_samples_per_second": 228.957,
"eval_steps_per_second": 14.424,
"step": 97000
},
{
"epoch": 0.35,
"learning_rate": 4.412037238124195e-05,
"loss": 5.9761,
"step": 97500
},
{
"epoch": 0.35,
"learning_rate": 4.4090329959895174e-05,
"loss": 5.9748,
"step": 98000
},
{
"epoch": 0.35,
"eval_loss": 6.014493465423584,
"eval_runtime": 4.2894,
"eval_samples_per_second": 233.134,
"eval_steps_per_second": 14.687,
"step": 98000
},
{
"epoch": 0.36,
"learning_rate": 4.406016688625385e-05,
"loss": 5.9804,
"step": 98500
},
{
"epoch": 0.36,
"learning_rate": 4.403000381261251e-05,
"loss": 5.9741,
"step": 99000
},
{
"epoch": 0.36,
"eval_loss": 6.044219493865967,
"eval_runtime": 4.3247,
"eval_samples_per_second": 231.229,
"eval_steps_per_second": 14.567,
"step": 99000
},
{
"epoch": 0.36,
"learning_rate": 4.3999840738971175e-05,
"loss": 5.9778,
"step": 99500
},
{
"epoch": 0.36,
"learning_rate": 4.396967766532984e-05,
"loss": 5.9699,
"step": 100000
},
{
"epoch": 0.36,
"eval_loss": 6.005799293518066,
"eval_runtime": 4.3718,
"eval_samples_per_second": 228.741,
"eval_steps_per_second": 14.411,
"step": 100000
},
{
"epoch": 0.36,
"learning_rate": 4.393951459168851e-05,
"loss": 5.9631,
"step": 100500
},
{
"epoch": 0.37,
"learning_rate": 4.390935151804717e-05,
"loss": 5.9899,
"step": 101000
},
{
"epoch": 0.37,
"eval_loss": 5.946277618408203,
"eval_runtime": 4.3243,
"eval_samples_per_second": 231.25,
"eval_steps_per_second": 14.569,
"step": 101000
},
{
"epoch": 0.37,
"learning_rate": 4.3879188444405836e-05,
"loss": 5.9565,
"step": 101500
},
{
"epoch": 0.37,
"learning_rate": 4.38490253707645e-05,
"loss": 5.9728,
"step": 102000
},
{
"epoch": 0.37,
"eval_loss": 6.005879878997803,
"eval_runtime": 4.4401,
"eval_samples_per_second": 225.223,
"eval_steps_per_second": 14.189,
"step": 102000
},
{
"epoch": 0.37,
"learning_rate": 4.381886229712317e-05,
"loss": 5.9722,
"step": 102500
},
{
"epoch": 0.37,
"learning_rate": 4.378869922348183e-05,
"loss": 5.9503,
"step": 103000
},
{
"epoch": 0.37,
"eval_loss": 5.96213436126709,
"eval_runtime": 4.349,
"eval_samples_per_second": 229.939,
"eval_steps_per_second": 14.486,
"step": 103000
},
{
"epoch": 0.37,
"learning_rate": 4.3758536149840504e-05,
"loss": 5.9631,
"step": 103500
},
{
"epoch": 0.38,
"learning_rate": 4.3728373076199164e-05,
"loss": 5.9573,
"step": 104000
},
{
"epoch": 0.38,
"eval_loss": 6.019954681396484,
"eval_runtime": 4.3963,
"eval_samples_per_second": 227.464,
"eval_steps_per_second": 14.33,
"step": 104000
},
{
"epoch": 0.38,
"learning_rate": 4.3698270328705114e-05,
"loss": 5.9604,
"step": 104500
},
{
"epoch": 0.38,
"learning_rate": 4.366810725506378e-05,
"loss": 5.9563,
"step": 105000
},
{
"epoch": 0.38,
"eval_loss": 6.00137996673584,
"eval_runtime": 4.4013,
"eval_samples_per_second": 227.207,
"eval_steps_per_second": 14.314,
"step": 105000
},
{
"epoch": 0.38,
"learning_rate": 4.363794418142245e-05,
"loss": 5.9479,
"step": 105500
},
{
"epoch": 0.38,
"learning_rate": 4.360778110778111e-05,
"loss": 5.9414,
"step": 106000
},
{
"epoch": 0.38,
"eval_loss": 5.9667510986328125,
"eval_runtime": 4.2409,
"eval_samples_per_second": 235.8,
"eval_steps_per_second": 14.855,
"step": 106000
},
{
"epoch": 0.39,
"learning_rate": 4.357773868643434e-05,
"loss": 5.9349,
"step": 106500
},
{
"epoch": 0.39,
"learning_rate": 4.354757561279301e-05,
"loss": 5.9311,
"step": 107000
},
{
"epoch": 0.39,
"eval_loss": 6.016119956970215,
"eval_runtime": 6.3445,
"eval_samples_per_second": 157.617,
"eval_steps_per_second": 9.93,
"step": 107000
},
{
"epoch": 0.39,
"learning_rate": 4.351741253915167e-05,
"loss": 5.9398,
"step": 107500
},
{
"epoch": 0.39,
"learning_rate": 4.348724946551034e-05,
"loss": 5.9357,
"step": 108000
},
{
"epoch": 0.39,
"eval_loss": 5.970701217651367,
"eval_runtime": 4.3817,
"eval_samples_per_second": 228.222,
"eval_steps_per_second": 14.378,
"step": 108000
},
{
"epoch": 0.39,
"learning_rate": 4.3457086391869e-05,
"loss": 5.9368,
"step": 108500
},
{
"epoch": 0.39,
"learning_rate": 4.342698364437495e-05,
"loss": 5.9312,
"step": 109000
},
{
"epoch": 0.39,
"eval_loss": 5.961437702178955,
"eval_runtime": 4.3658,
"eval_samples_per_second": 229.052,
"eval_steps_per_second": 14.43,
"step": 109000
},
{
"epoch": 0.4,
"learning_rate": 4.339682057073362e-05,
"loss": 5.9306,
"step": 109500
},
{
"epoch": 0.4,
"learning_rate": 4.3366657497092285e-05,
"loss": 5.9337,
"step": 110000
},
{
"epoch": 0.4,
"eval_loss": 5.9670586585998535,
"eval_runtime": 4.3639,
"eval_samples_per_second": 229.152,
"eval_steps_per_second": 14.437,
"step": 110000
},
{
"epoch": 0.4,
"learning_rate": 4.3336494423450945e-05,
"loss": 5.9424,
"step": 110500
},
{
"epoch": 0.4,
"learning_rate": 4.330633134980961e-05,
"loss": 5.9361,
"step": 111000
},
{
"epoch": 0.4,
"eval_loss": 6.001840591430664,
"eval_runtime": 4.3398,
"eval_samples_per_second": 230.423,
"eval_steps_per_second": 14.517,
"step": 111000
},
{
"epoch": 0.4,
"learning_rate": 4.327616827616828e-05,
"loss": 5.9411,
"step": 111500
},
{
"epoch": 0.41,
"learning_rate": 4.324606552867423e-05,
"loss": 5.9163,
"step": 112000
},
{
"epoch": 0.41,
"eval_loss": 5.968799114227295,
"eval_runtime": 4.2961,
"eval_samples_per_second": 232.767,
"eval_steps_per_second": 14.664,
"step": 112000
},
{
"epoch": 0.41,
"learning_rate": 4.321596278118018e-05,
"loss": 5.9305,
"step": 112500
},
{
"epoch": 0.41,
"learning_rate": 4.318579970753884e-05,
"loss": 5.9078,
"step": 113000
},
{
"epoch": 0.41,
"eval_loss": 5.962240219116211,
"eval_runtime": 4.405,
"eval_samples_per_second": 227.013,
"eval_steps_per_second": 14.302,
"step": 113000
},
{
"epoch": 0.41,
"learning_rate": 4.3155636633897505e-05,
"loss": 5.9165,
"step": 113500
},
{
"epoch": 0.41,
"learning_rate": 4.312547356025617e-05,
"loss": 5.9351,
"step": 114000
},
{
"epoch": 0.41,
"eval_loss": 5.99934196472168,
"eval_runtime": 4.3193,
"eval_samples_per_second": 231.518,
"eval_steps_per_second": 14.586,
"step": 114000
},
{
"epoch": 0.41,
"learning_rate": 4.309531048661484e-05,
"loss": 5.9126,
"step": 114500
},
{
"epoch": 0.42,
"learning_rate": 4.30651474129735e-05,
"loss": 5.9191,
"step": 115000
},
{
"epoch": 0.42,
"eval_loss": 5.963631629943848,
"eval_runtime": 6.2259,
"eval_samples_per_second": 160.621,
"eval_steps_per_second": 10.119,
"step": 115000
},
{
"epoch": 0.42,
"learning_rate": 4.3034984339332166e-05,
"loss": 5.9279,
"step": 115500
},
{
"epoch": 0.42,
"learning_rate": 4.300482126569083e-05,
"loss": 5.9058,
"step": 116000
},
{
"epoch": 0.42,
"eval_loss": 5.989487171173096,
"eval_runtime": 4.3475,
"eval_samples_per_second": 230.019,
"eval_steps_per_second": 14.491,
"step": 116000
},
{
"epoch": 0.42,
"learning_rate": 4.29746581920495e-05,
"loss": 5.9151,
"step": 116500
},
{
"epoch": 0.42,
"learning_rate": 4.294449511840816e-05,
"loss": 5.9031,
"step": 117000
},
{
"epoch": 0.42,
"eval_loss": 5.9690752029418945,
"eval_runtime": 4.1678,
"eval_samples_per_second": 239.936,
"eval_steps_per_second": 15.116,
"step": 117000
},
{
"epoch": 0.43,
"learning_rate": 4.2914332044766834e-05,
"loss": 5.9153,
"step": 117500
},
{
"epoch": 0.43,
"learning_rate": 4.288422929727278e-05,
"loss": 5.9167,
"step": 118000
},
{
"epoch": 0.43,
"eval_loss": 5.953476428985596,
"eval_runtime": 4.2534,
"eval_samples_per_second": 235.107,
"eval_steps_per_second": 14.812,
"step": 118000
},
{
"epoch": 0.43,
"learning_rate": 4.2854066223631444e-05,
"loss": 5.8958,
"step": 118500
},
{
"epoch": 0.43,
"learning_rate": 4.282390314999011e-05,
"loss": 5.9013,
"step": 119000
},
{
"epoch": 0.43,
"eval_loss": 5.934565544128418,
"eval_runtime": 4.1119,
"eval_samples_per_second": 243.194,
"eval_steps_per_second": 15.321,
"step": 119000
},
{
"epoch": 0.43,
"learning_rate": 4.279374007634878e-05,
"loss": 5.8967,
"step": 119500
},
{
"epoch": 0.43,
"learning_rate": 4.276357700270744e-05,
"loss": 5.9066,
"step": 120000
},
{
"epoch": 0.43,
"eval_loss": 5.935710906982422,
"eval_runtime": 4.1492,
"eval_samples_per_second": 241.009,
"eval_steps_per_second": 15.184,
"step": 120000
},
{
"epoch": 0.44,
"learning_rate": 4.2733413929066105e-05,
"loss": 5.9031,
"step": 120500
},
{
"epoch": 0.44,
"learning_rate": 4.270325085542477e-05,
"loss": 5.8929,
"step": 121000
},
{
"epoch": 0.44,
"eval_loss": 5.922500133514404,
"eval_runtime": 3.9626,
"eval_samples_per_second": 252.362,
"eval_steps_per_second": 15.899,
"step": 121000
},
{
"epoch": 0.44,
"learning_rate": 4.2673148107930715e-05,
"loss": 5.9005,
"step": 121500
},
{
"epoch": 0.44,
"learning_rate": 4.264298503428938e-05,
"loss": 5.9015,
"step": 122000
},
{
"epoch": 0.44,
"eval_loss": 5.941558361053467,
"eval_runtime": 4.1057,
"eval_samples_per_second": 243.562,
"eval_steps_per_second": 15.344,
"step": 122000
},
{
"epoch": 0.44,
"learning_rate": 4.261282196064805e-05,
"loss": 5.8931,
"step": 122500
},
{
"epoch": 0.45,
"learning_rate": 4.2582658887006715e-05,
"loss": 5.8834,
"step": 123000
},
{
"epoch": 0.45,
"eval_loss": 5.9397172927856445,
"eval_runtime": 4.1106,
"eval_samples_per_second": 243.275,
"eval_steps_per_second": 15.326,
"step": 123000
},
{
"epoch": 0.45,
"learning_rate": 4.2552495813365376e-05,
"loss": 5.8849,
"step": 123500
},
{
"epoch": 0.45,
"learning_rate": 4.252239306587133e-05,
"loss": 5.8941,
"step": 124000
},
{
"epoch": 0.45,
"eval_loss": 5.900291919708252,
"eval_runtime": 4.2978,
"eval_samples_per_second": 232.678,
"eval_steps_per_second": 14.659,
"step": 124000
},
{
"epoch": 0.45,
"learning_rate": 4.249222999222999e-05,
"loss": 5.8869,
"step": 124500
},
{
"epoch": 0.45,
"learning_rate": 4.246206691858866e-05,
"loss": 5.8926,
"step": 125000
},
{
"epoch": 0.45,
"eval_loss": 5.94087028503418,
"eval_runtime": 4.2807,
"eval_samples_per_second": 233.609,
"eval_steps_per_second": 14.717,
"step": 125000
},
{
"epoch": 0.45,
"learning_rate": 4.2431903844947326e-05,
"loss": 5.8719,
"step": 125500
},
{
"epoch": 0.46,
"learning_rate": 4.2401801097453276e-05,
"loss": 5.8807,
"step": 126000
},
{
"epoch": 0.46,
"eval_loss": 5.909360408782959,
"eval_runtime": 4.3061,
"eval_samples_per_second": 232.228,
"eval_steps_per_second": 14.63,
"step": 126000
},
{
"epoch": 0.46,
"learning_rate": 4.2371638023811936e-05,
"loss": 5.8892,
"step": 126500
},
{
"epoch": 0.46,
"learning_rate": 4.234147495017061e-05,
"loss": 5.8809,
"step": 127000
},
{
"epoch": 0.46,
"eval_loss": 5.925731658935547,
"eval_runtime": 4.3535,
"eval_samples_per_second": 229.703,
"eval_steps_per_second": 14.471,
"step": 127000
},
{
"epoch": 0.46,
"learning_rate": 4.231131187652927e-05,
"loss": 5.8775,
"step": 127500
},
{
"epoch": 0.46,
"learning_rate": 4.228120912903521e-05,
"loss": 5.8816,
"step": 128000
},
{
"epoch": 0.46,
"eval_loss": 5.949062347412109,
"eval_runtime": 6.324,
"eval_samples_per_second": 158.127,
"eval_steps_per_second": 9.962,
"step": 128000
},
{
"epoch": 0.47,
"learning_rate": 4.2251046055393886e-05,
"loss": 5.8887,
"step": 128500
},
{
"epoch": 0.47,
"learning_rate": 4.2220882981752546e-05,
"loss": 5.8612,
"step": 129000
},
{
"epoch": 0.47,
"eval_loss": 5.9013543128967285,
"eval_runtime": 4.2793,
"eval_samples_per_second": 233.683,
"eval_steps_per_second": 14.722,
"step": 129000
},
{
"epoch": 0.47,
"learning_rate": 4.2190719908111213e-05,
"loss": 5.876,
"step": 129500
},
{
"epoch": 0.47,
"learning_rate": 4.216055683446988e-05,
"loss": 5.8685,
"step": 130000
},
{
"epoch": 0.47,
"eval_loss": 5.895120143890381,
"eval_runtime": 4.4394,
"eval_samples_per_second": 225.255,
"eval_steps_per_second": 14.191,
"step": 130000
},
{
"epoch": 0.47,
"learning_rate": 4.213039376082855e-05,
"loss": 5.8975,
"step": 130500
},
{
"epoch": 0.47,
"learning_rate": 4.210029101333449e-05,
"loss": 5.8759,
"step": 131000
},
{
"epoch": 0.47,
"eval_loss": 5.904821872711182,
"eval_runtime": 6.1491,
"eval_samples_per_second": 162.626,
"eval_steps_per_second": 10.245,
"step": 131000
},
{
"epoch": 0.48,
"learning_rate": 4.207012793969316e-05,
"loss": 5.8635,
"step": 131500
},
{
"epoch": 0.48,
"learning_rate": 4.2039964866051824e-05,
"loss": 5.875,
"step": 132000
},
{
"epoch": 0.48,
"eval_loss": 5.9182329177856445,
"eval_runtime": 4.3366,
"eval_samples_per_second": 230.597,
"eval_steps_per_second": 14.528,
"step": 132000
},
{
"epoch": 0.48,
"learning_rate": 4.200980179241049e-05,
"loss": 5.8747,
"step": 132500
},
{
"epoch": 0.48,
"learning_rate": 4.197963871876915e-05,
"loss": 5.8686,
"step": 133000
},
{
"epoch": 0.48,
"eval_loss": 5.91906213760376,
"eval_runtime": 4.3148,
"eval_samples_per_second": 231.759,
"eval_steps_per_second": 14.601,
"step": 133000
},
{
"epoch": 0.48,
"learning_rate": 4.194953597127511e-05,
"loss": 5.8716,
"step": 133500
},
{
"epoch": 0.49,
"learning_rate": 4.191937289763377e-05,
"loss": 5.8808,
"step": 134000
},
{
"epoch": 0.49,
"eval_loss": 5.845741271972656,
"eval_runtime": 4.3065,
"eval_samples_per_second": 232.206,
"eval_steps_per_second": 14.629,
"step": 134000
},
{
"epoch": 0.49,
"learning_rate": 4.1889209823992435e-05,
"loss": 5.8677,
"step": 134500
},
{
"epoch": 0.49,
"learning_rate": 4.18590467503511e-05,
"loss": 5.8624,
"step": 135000
},
{
"epoch": 0.49,
"eval_loss": 5.901210308074951,
"eval_runtime": 4.2572,
"eval_samples_per_second": 234.896,
"eval_steps_per_second": 14.798,
"step": 135000
},
{
"epoch": 0.49,
"learning_rate": 4.182888367670977e-05,
"loss": 5.8513,
"step": 135500
},
{
"epoch": 0.49,
"learning_rate": 4.179872060306843e-05,
"loss": 5.865,
"step": 136000
},
{
"epoch": 0.49,
"eval_loss": 5.93388557434082,
"eval_runtime": 4.3004,
"eval_samples_per_second": 232.539,
"eval_steps_per_second": 14.65,
"step": 136000
},
{
"epoch": 0.49,
"learning_rate": 4.17685575294271e-05,
"loss": 5.8666,
"step": 136500
},
{
"epoch": 0.5,
"learning_rate": 4.173839445578576e-05,
"loss": 5.8595,
"step": 137000
},
{
"epoch": 0.5,
"eval_loss": 5.921290874481201,
"eval_runtime": 4.3562,
"eval_samples_per_second": 229.559,
"eval_steps_per_second": 14.462,
"step": 137000
},
{
"epoch": 0.5,
"learning_rate": 4.170823138214443e-05,
"loss": 5.8616,
"step": 137500
},
{
"epoch": 0.5,
"learning_rate": 4.167812863465038e-05,
"loss": 5.8664,
"step": 138000
},
{
"epoch": 0.5,
"eval_loss": 5.8725409507751465,
"eval_runtime": 4.2779,
"eval_samples_per_second": 233.757,
"eval_steps_per_second": 14.727,
"step": 138000
},
{
"epoch": 0.5,
"learning_rate": 4.164796556100904e-05,
"loss": 5.8355,
"step": 138500
},
{
"epoch": 0.5,
"learning_rate": 4.1617802487367706e-05,
"loss": 5.8529,
"step": 139000
},
{
"epoch": 0.5,
"eval_loss": 5.918557643890381,
"eval_runtime": 4.3121,
"eval_samples_per_second": 231.903,
"eval_steps_per_second": 14.61,
"step": 139000
},
{
"epoch": 0.5,
"learning_rate": 4.1587699739873656e-05,
"loss": 5.8408,
"step": 139500
},
{
"epoch": 0.51,
"learning_rate": 4.155753666623232e-05,
"loss": 5.8419,
"step": 140000
},
{
"epoch": 0.51,
"eval_loss": 5.910894870758057,
"eval_runtime": 4.3311,
"eval_samples_per_second": 230.888,
"eval_steps_per_second": 14.546,
"step": 140000
},
{
"epoch": 0.51,
"learning_rate": 4.152737359259098e-05,
"loss": 5.8523,
"step": 140500
},
{
"epoch": 0.51,
"learning_rate": 4.149721051894965e-05,
"loss": 5.8436,
"step": 141000
},
{
"epoch": 0.51,
"eval_loss": 5.868130207061768,
"eval_runtime": 4.3137,
"eval_samples_per_second": 231.819,
"eval_steps_per_second": 14.605,
"step": 141000
},
{
"epoch": 0.51,
"learning_rate": 4.146704744530832e-05,
"loss": 5.8501,
"step": 141500
},
{
"epoch": 0.51,
"learning_rate": 4.143694469781427e-05,
"loss": 5.8394,
"step": 142000
},
{
"epoch": 0.51,
"eval_loss": 5.902928829193115,
"eval_runtime": 4.3712,
"eval_samples_per_second": 228.77,
"eval_steps_per_second": 14.412,
"step": 142000
},
{
"epoch": 0.52,
"learning_rate": 4.140678162417293e-05,
"loss": 5.843,
"step": 142500
},
{
"epoch": 0.52,
"learning_rate": 4.13766185505316e-05,
"loss": 5.8433,
"step": 143000
},
{
"epoch": 0.52,
"eval_loss": 5.870312690734863,
"eval_runtime": 4.2876,
"eval_samples_per_second": 233.229,
"eval_steps_per_second": 14.693,
"step": 143000
},
{
"epoch": 0.52,
"learning_rate": 4.134645547689026e-05,
"loss": 5.8436,
"step": 143500
},
{
"epoch": 0.52,
"learning_rate": 4.131629240324893e-05,
"loss": 5.8415,
"step": 144000
},
{
"epoch": 0.52,
"eval_loss": 5.892139434814453,
"eval_runtime": 4.3156,
"eval_samples_per_second": 231.716,
"eval_steps_per_second": 14.598,
"step": 144000
},
{
"epoch": 0.52,
"learning_rate": 4.1286129329607595e-05,
"loss": 5.8496,
"step": 144500
},
{
"epoch": 0.52,
"learning_rate": 4.125602658211354e-05,
"loss": 5.8489,
"step": 145000
},
{
"epoch": 0.52,
"eval_loss": 5.9164910316467285,
"eval_runtime": 4.382,
"eval_samples_per_second": 228.207,
"eval_steps_per_second": 14.377,
"step": 145000
},
{
"epoch": 0.53,
"learning_rate": 4.1225863508472204e-05,
"loss": 5.8353,
"step": 145500
},
{
"epoch": 0.53,
"learning_rate": 4.119570043483087e-05,
"loss": 5.8531,
"step": 146000
},
{
"epoch": 0.53,
"eval_loss": 5.896515846252441,
"eval_runtime": 4.3161,
"eval_samples_per_second": 231.693,
"eval_steps_per_second": 14.597,
"step": 146000
},
{
"epoch": 0.53,
"learning_rate": 4.116553736118954e-05,
"loss": 5.833,
"step": 146500
},
{
"epoch": 0.53,
"learning_rate": 4.11353742875482e-05,
"loss": 5.8285,
"step": 147000
},
{
"epoch": 0.53,
"eval_loss": 5.857729911804199,
"eval_runtime": 4.3275,
"eval_samples_per_second": 231.083,
"eval_steps_per_second": 14.558,
"step": 147000
},
{
"epoch": 0.53,
"learning_rate": 4.110521121390687e-05,
"loss": 5.8368,
"step": 147500
},
{
"epoch": 0.54,
"learning_rate": 4.107504814026553e-05,
"loss": 5.8258,
"step": 148000
},
{
"epoch": 0.54,
"eval_loss": 5.907439231872559,
"eval_runtime": 4.3373,
"eval_samples_per_second": 230.556,
"eval_steps_per_second": 14.525,
"step": 148000
},
{
"epoch": 0.54,
"learning_rate": 4.10448850666242e-05,
"loss": 5.8369,
"step": 148500
},
{
"epoch": 0.54,
"learning_rate": 4.101478231913015e-05,
"loss": 5.8342,
"step": 149000
},
{
"epoch": 0.54,
"eval_loss": 5.867617130279541,
"eval_runtime": 4.3431,
"eval_samples_per_second": 230.249,
"eval_steps_per_second": 14.506,
"step": 149000
},
{
"epoch": 0.54,
"learning_rate": 4.0984619245488816e-05,
"loss": 5.8303,
"step": 149500
},
{
"epoch": 0.54,
"learning_rate": 4.0954456171847476e-05,
"loss": 5.8182,
"step": 150000
},
{
"epoch": 0.54,
"eval_loss": 5.870373725891113,
"eval_runtime": 4.346,
"eval_samples_per_second": 230.099,
"eval_steps_per_second": 14.496,
"step": 150000
},
{
"epoch": 0.54,
"learning_rate": 4.092429309820614e-05,
"loss": 5.8311,
"step": 150500
},
{
"epoch": 0.55,
"learning_rate": 4.089419035071209e-05,
"loss": 5.8423,
"step": 151000
},
{
"epoch": 0.55,
"eval_loss": 5.850360870361328,
"eval_runtime": 4.3581,
"eval_samples_per_second": 229.46,
"eval_steps_per_second": 14.456,
"step": 151000
},
{
"epoch": 0.55,
"learning_rate": 4.086402727707076e-05,
"loss": 5.8341,
"step": 151500
},
{
"epoch": 0.55,
"learning_rate": 4.083386420342942e-05,
"loss": 5.8201,
"step": 152000
},
{
"epoch": 0.55,
"eval_loss": 5.889283180236816,
"eval_runtime": 6.4869,
"eval_samples_per_second": 154.156,
"eval_steps_per_second": 9.712,
"step": 152000
},
{
"epoch": 0.55,
"learning_rate": 4.080370112978809e-05,
"loss": 5.8177,
"step": 152500
},
{
"epoch": 0.55,
"learning_rate": 4.0773538056146754e-05,
"loss": 5.8277,
"step": 153000
},
{
"epoch": 0.55,
"eval_loss": 5.80609130859375,
"eval_runtime": 4.3836,
"eval_samples_per_second": 228.121,
"eval_steps_per_second": 14.372,
"step": 153000
},
{
"epoch": 0.56,
"learning_rate": 4.0743374982505414e-05,
"loss": 5.8206,
"step": 153500
},
{
"epoch": 0.56,
"learning_rate": 4.071321190886409e-05,
"loss": 5.8245,
"step": 154000
},
{
"epoch": 0.56,
"eval_loss": 5.86394739151001,
"eval_runtime": 4.2559,
"eval_samples_per_second": 234.966,
"eval_steps_per_second": 14.803,
"step": 154000
},
{
"epoch": 0.56,
"learning_rate": 4.068310916137003e-05,
"loss": 5.8145,
"step": 154500
},
{
"epoch": 0.56,
"learning_rate": 4.06529460877287e-05,
"loss": 5.8244,
"step": 155000
},
{
"epoch": 0.56,
"eval_loss": 5.874474048614502,
"eval_runtime": 4.2932,
"eval_samples_per_second": 232.925,
"eval_steps_per_second": 14.674,
"step": 155000
},
{
"epoch": 0.56,
"learning_rate": 4.0622783014087364e-05,
"loss": 5.828,
"step": 155500
},
{
"epoch": 0.56,
"learning_rate": 4.059261994044603e-05,
"loss": 5.8241,
"step": 156000
},
{
"epoch": 0.56,
"eval_loss": 5.891319274902344,
"eval_runtime": 4.1898,
"eval_samples_per_second": 238.672,
"eval_steps_per_second": 15.036,
"step": 156000
},
{
"epoch": 0.57,
"learning_rate": 4.0562517192951974e-05,
"loss": 5.8099,
"step": 156500
},
{
"epoch": 0.57,
"learning_rate": 4.053235411931065e-05,
"loss": 5.816,
"step": 157000
},
{
"epoch": 0.57,
"eval_loss": 5.805520534515381,
"eval_runtime": 4.1865,
"eval_samples_per_second": 238.861,
"eval_steps_per_second": 15.048,
"step": 157000
},
{
"epoch": 0.57,
"learning_rate": 4.050219104566931e-05,
"loss": 5.8202,
"step": 157500
},
{
"epoch": 0.57,
"learning_rate": 4.0472027972027975e-05,
"loss": 5.8128,
"step": 158000
},
{
"epoch": 0.57,
"eval_loss": 5.84921407699585,
"eval_runtime": 5.9007,
"eval_samples_per_second": 169.472,
"eval_steps_per_second": 10.677,
"step": 158000
},
{
"epoch": 0.57,
"learning_rate": 4.044186489838664e-05,
"loss": 5.8166,
"step": 158500
},
{
"epoch": 0.58,
"learning_rate": 4.041170182474531e-05,
"loss": 5.8186,
"step": 159000
},
{
"epoch": 0.58,
"eval_loss": 5.864505290985107,
"eval_runtime": 4.302,
"eval_samples_per_second": 232.451,
"eval_steps_per_second": 14.644,
"step": 159000
},
{
"epoch": 0.58,
"learning_rate": 4.038159907725125e-05,
"loss": 5.7966,
"step": 159500
},
{
"epoch": 0.58,
"learning_rate": 4.035143600360992e-05,
"loss": 5.8179,
"step": 160000
},
{
"epoch": 0.58,
"eval_loss": 5.826276779174805,
"eval_runtime": 4.3186,
"eval_samples_per_second": 231.558,
"eval_steps_per_second": 14.588,
"step": 160000
},
{
"epoch": 0.58,
"learning_rate": 4.0321272929968586e-05,
"loss": 5.8101,
"step": 160500
},
{
"epoch": 0.58,
"learning_rate": 4.0291109856327246e-05,
"loss": 5.8193,
"step": 161000
},
{
"epoch": 0.58,
"eval_loss": 5.851203918457031,
"eval_runtime": 6.3135,
"eval_samples_per_second": 158.39,
"eval_steps_per_second": 9.979,
"step": 161000
},
{
"epoch": 0.58,
"learning_rate": 4.0261007108833195e-05,
"loss": 5.8154,
"step": 161500
},
{
"epoch": 0.59,
"learning_rate": 4.023084403519186e-05,
"loss": 5.7945,
"step": 162000
},
{
"epoch": 0.59,
"eval_loss": 5.839649677276611,
"eval_runtime": 4.357,
"eval_samples_per_second": 229.515,
"eval_steps_per_second": 14.459,
"step": 162000
},
{
"epoch": 0.59,
"learning_rate": 4.020068096155053e-05,
"loss": 5.801,
"step": 162500
},
{
"epoch": 0.59,
"learning_rate": 4.017051788790919e-05,
"loss": 5.804,
"step": 163000
},
{
"epoch": 0.59,
"eval_loss": 5.798958778381348,
"eval_runtime": 4.3729,
"eval_samples_per_second": 228.679,
"eval_steps_per_second": 14.407,
"step": 163000
},
{
"epoch": 0.59,
"learning_rate": 4.014035481426786e-05,
"loss": 5.8083,
"step": 163500
},
{
"epoch": 0.59,
"learning_rate": 4.0110252066773806e-05,
"loss": 5.8002,
"step": 164000
},
{
"epoch": 0.59,
"eval_loss": 5.807581424713135,
"eval_runtime": 6.5612,
"eval_samples_per_second": 152.412,
"eval_steps_per_second": 9.602,
"step": 164000
},
{
"epoch": 0.6,
"learning_rate": 4.008008899313247e-05,
"loss": 5.8106,
"step": 164500
},
{
"epoch": 0.6,
"learning_rate": 4.004992591949114e-05,
"loss": 5.8056,
"step": 165000
},
{
"epoch": 0.6,
"eval_loss": 5.79323673248291,
"eval_runtime": 4.3218,
"eval_samples_per_second": 231.385,
"eval_steps_per_second": 14.577,
"step": 165000
},
{
"epoch": 0.6,
"learning_rate": 4.001976284584981e-05,
"loss": 5.8112,
"step": 165500
},
{
"epoch": 0.6,
"learning_rate": 3.998966009835575e-05,
"loss": 5.7934,
"step": 166000
},
{
"epoch": 0.6,
"eval_loss": 5.81195592880249,
"eval_runtime": 4.3688,
"eval_samples_per_second": 228.895,
"eval_steps_per_second": 14.42,
"step": 166000
},
{
"epoch": 0.6,
"learning_rate": 3.9959497024714423e-05,
"loss": 5.7962,
"step": 166500
},
{
"epoch": 0.6,
"learning_rate": 3.9929333951073084e-05,
"loss": 5.8154,
"step": 167000
},
{
"epoch": 0.6,
"eval_loss": 5.7888407707214355,
"eval_runtime": 4.3098,
"eval_samples_per_second": 232.032,
"eval_steps_per_second": 14.618,
"step": 167000
},
{
"epoch": 0.61,
"learning_rate": 3.9899170877431744e-05,
"loss": 5.802,
"step": 167500
},
{
"epoch": 0.61,
"learning_rate": 3.986900780379042e-05,
"loss": 5.8044,
"step": 168000
},
{
"epoch": 0.61,
"eval_loss": 5.808478355407715,
"eval_runtime": 4.3839,
"eval_samples_per_second": 228.11,
"eval_steps_per_second": 14.371,
"step": 168000
},
{
"epoch": 0.61,
"learning_rate": 3.983884473014908e-05,
"loss": 5.7986,
"step": 168500
},
{
"epoch": 0.61,
"learning_rate": 3.980874198265503e-05,
"loss": 5.8019,
"step": 169000
},
{
"epoch": 0.61,
"eval_loss": 5.862877368927002,
"eval_runtime": 4.306,
"eval_samples_per_second": 232.233,
"eval_steps_per_second": 14.631,
"step": 169000
},
{
"epoch": 0.61,
"learning_rate": 3.9778578909013694e-05,
"loss": 5.7873,
"step": 169500
},
{
"epoch": 0.62,
"learning_rate": 3.974841583537236e-05,
"loss": 5.7818,
"step": 170000
},
{
"epoch": 0.62,
"eval_loss": 5.815363883972168,
"eval_runtime": 6.2718,
"eval_samples_per_second": 159.444,
"eval_steps_per_second": 10.045,
"step": 170000
},
{
"epoch": 0.62,
"learning_rate": 3.971825276173102e-05,
"loss": 5.801,
"step": 170500
},
{
"epoch": 0.62,
"learning_rate": 3.968808968808969e-05,
"loss": 5.8158,
"step": 171000
},
{
"epoch": 0.62,
"eval_loss": 5.889772415161133,
"eval_runtime": 4.3105,
"eval_samples_per_second": 231.99,
"eval_steps_per_second": 14.615,
"step": 171000
},
{
"epoch": 0.62,
"learning_rate": 3.965798694059564e-05,
"loss": 5.7831,
"step": 171500
},
{
"epoch": 0.62,
"learning_rate": 3.9627823866954305e-05,
"loss": 5.7964,
"step": 172000
},
{
"epoch": 0.62,
"eval_loss": 5.8176398277282715,
"eval_runtime": 4.2897,
"eval_samples_per_second": 233.118,
"eval_steps_per_second": 14.686,
"step": 172000
},
{
"epoch": 0.62,
"learning_rate": 3.9597660793312965e-05,
"loss": 5.8032,
"step": 172500
},
{
"epoch": 0.63,
"learning_rate": 3.956749771967164e-05,
"loss": 5.7874,
"step": 173000
},
{
"epoch": 0.63,
"eval_loss": 5.859738349914551,
"eval_runtime": 4.2313,
"eval_samples_per_second": 236.336,
"eval_steps_per_second": 14.889,
"step": 173000
},
{
"epoch": 0.63,
"learning_rate": 3.953739497217758e-05,
"loss": 5.7942,
"step": 173500
},
{
"epoch": 0.63,
"learning_rate": 3.950723189853625e-05,
"loss": 5.7831,
"step": 174000
},
{
"epoch": 0.63,
"eval_loss": 5.804076194763184,
"eval_runtime": 4.3301,
"eval_samples_per_second": 230.94,
"eval_steps_per_second": 14.549,
"step": 174000
},
{
"epoch": 0.63,
"learning_rate": 3.9477068824894915e-05,
"loss": 5.7912,
"step": 174500
},
{
"epoch": 0.63,
"learning_rate": 3.9446905751253576e-05,
"loss": 5.7917,
"step": 175000
},
{
"epoch": 0.63,
"eval_loss": 5.769659042358398,
"eval_runtime": 4.3337,
"eval_samples_per_second": 230.752,
"eval_steps_per_second": 14.537,
"step": 175000
},
{
"epoch": 0.64,
"learning_rate": 3.941674267761224e-05,
"loss": 5.7979,
"step": 175500
},
{
"epoch": 0.64,
"learning_rate": 3.938663993011819e-05,
"loss": 5.7897,
"step": 176000
},
{
"epoch": 0.64,
"eval_loss": 5.812716007232666,
"eval_runtime": 6.3065,
"eval_samples_per_second": 158.566,
"eval_steps_per_second": 9.99,
"step": 176000
},
{
"epoch": 0.64,
"learning_rate": 3.935647685647686e-05,
"loss": 5.7918,
"step": 176500
},
{
"epoch": 0.64,
"learning_rate": 3.932631378283552e-05,
"loss": 5.787,
"step": 177000
},
{
"epoch": 0.64,
"eval_loss": 5.8294172286987305,
"eval_runtime": 4.4089,
"eval_samples_per_second": 226.814,
"eval_steps_per_second": 14.289,
"step": 177000
},
{
"epoch": 0.64,
"learning_rate": 3.929615070919419e-05,
"loss": 5.78,
"step": 177500
},
{
"epoch": 0.64,
"learning_rate": 3.926598763555285e-05,
"loss": 5.7881,
"step": 178000
},
{
"epoch": 0.64,
"eval_loss": 5.794092178344727,
"eval_runtime": 4.3295,
"eval_samples_per_second": 230.971,
"eval_steps_per_second": 14.551,
"step": 178000
},
{
"epoch": 0.65,
"learning_rate": 3.92358848880588e-05,
"loss": 5.776,
"step": 178500
},
{
"epoch": 0.65,
"learning_rate": 3.920572181441747e-05,
"loss": 5.7757,
"step": 179000
},
{
"epoch": 0.65,
"eval_loss": 5.8026957511901855,
"eval_runtime": 6.3375,
"eval_samples_per_second": 157.79,
"eval_steps_per_second": 9.941,
"step": 179000
},
{
"epoch": 0.65,
"learning_rate": 3.917555874077614e-05,
"loss": 5.7966,
"step": 179500
},
{
"epoch": 0.65,
"learning_rate": 3.91453956671348e-05,
"loss": 5.7837,
"step": 180000
},
{
"epoch": 0.65,
"eval_loss": 5.851541519165039,
"eval_runtime": 4.3261,
"eval_samples_per_second": 231.153,
"eval_steps_per_second": 14.563,
"step": 180000
},
{
"epoch": 0.65,
"learning_rate": 3.9115232593493464e-05,
"loss": 5.7852,
"step": 180500
},
{
"epoch": 0.66,
"learning_rate": 3.908506951985213e-05,
"loss": 5.7766,
"step": 181000
},
{
"epoch": 0.66,
"eval_loss": 5.806175708770752,
"eval_runtime": 4.2789,
"eval_samples_per_second": 233.704,
"eval_steps_per_second": 14.723,
"step": 181000
},
{
"epoch": 0.66,
"learning_rate": 3.90549064462108e-05,
"loss": 5.7912,
"step": 181500
},
{
"epoch": 0.66,
"learning_rate": 3.902474337256946e-05,
"loss": 5.7799,
"step": 182000
},
{
"epoch": 0.66,
"eval_loss": 5.806212425231934,
"eval_runtime": 4.4158,
"eval_samples_per_second": 226.461,
"eval_steps_per_second": 14.267,
"step": 182000
},
{
"epoch": 0.66,
"learning_rate": 3.899464062507541e-05,
"loss": 5.787,
"step": 182500
},
{
"epoch": 0.66,
"learning_rate": 3.896453787758136e-05,
"loss": 5.7831,
"step": 183000
},
{
"epoch": 0.66,
"eval_loss": 5.78677225112915,
"eval_runtime": 4.3143,
"eval_samples_per_second": 231.789,
"eval_steps_per_second": 14.603,
"step": 183000
},
{
"epoch": 0.66,
"learning_rate": 3.8934374803940024e-05,
"loss": 5.7735,
"step": 183500
},
{
"epoch": 0.67,
"learning_rate": 3.890421173029869e-05,
"loss": 5.7758,
"step": 184000
},
{
"epoch": 0.67,
"eval_loss": 5.771059513092041,
"eval_runtime": 4.2501,
"eval_samples_per_second": 235.289,
"eval_steps_per_second": 14.823,
"step": 184000
},
{
"epoch": 0.67,
"learning_rate": 3.887404865665735e-05,
"loss": 5.7717,
"step": 184500
},
{
"epoch": 0.67,
"learning_rate": 3.884388558301602e-05,
"loss": 5.7854,
"step": 185000
},
{
"epoch": 0.67,
"eval_loss": 5.744887828826904,
"eval_runtime": 4.38,
"eval_samples_per_second": 228.313,
"eval_steps_per_second": 14.384,
"step": 185000
},
{
"epoch": 0.67,
"learning_rate": 3.881378283552197e-05,
"loss": 5.7759,
"step": 185500
},
{
"epoch": 0.67,
"learning_rate": 3.8783619761880635e-05,
"loss": 5.782,
"step": 186000
},
{
"epoch": 0.67,
"eval_loss": 5.777120113372803,
"eval_runtime": 4.3199,
"eval_samples_per_second": 231.487,
"eval_steps_per_second": 14.584,
"step": 186000
},
{
"epoch": 0.68,
"learning_rate": 3.8753456688239295e-05,
"loss": 5.7735,
"step": 186500
},
{
"epoch": 0.68,
"learning_rate": 3.872329361459797e-05,
"loss": 5.7806,
"step": 187000
},
{
"epoch": 0.68,
"eval_loss": 5.791393280029297,
"eval_runtime": 4.2777,
"eval_samples_per_second": 233.768,
"eval_steps_per_second": 14.727,
"step": 187000
},
{
"epoch": 0.68,
"learning_rate": 3.869319086710391e-05,
"loss": 5.7717,
"step": 187500
},
{
"epoch": 0.68,
"learning_rate": 3.866302779346258e-05,
"loss": 5.7649,
"step": 188000
},
{
"epoch": 0.68,
"eval_loss": 5.816986560821533,
"eval_runtime": 6.3859,
"eval_samples_per_second": 156.596,
"eval_steps_per_second": 9.866,
"step": 188000
},
{
"epoch": 0.68,
"learning_rate": 3.8632864719821245e-05,
"loss": 5.7693,
"step": 188500
},
{
"epoch": 0.68,
"learning_rate": 3.8602701646179906e-05,
"loss": 5.776,
"step": 189000
},
{
"epoch": 0.68,
"eval_loss": 5.79604959487915,
"eval_runtime": 4.3399,
"eval_samples_per_second": 230.421,
"eval_steps_per_second": 14.516,
"step": 189000
},
{
"epoch": 0.69,
"learning_rate": 3.857253857253857e-05,
"loss": 5.7631,
"step": 189500
},
{
"epoch": 0.69,
"learning_rate": 3.854243582504452e-05,
"loss": 5.7604,
"step": 190000
},
{
"epoch": 0.69,
"eval_loss": 5.783555507659912,
"eval_runtime": 4.4137,
"eval_samples_per_second": 226.565,
"eval_steps_per_second": 14.274,
"step": 190000
},
{
"epoch": 0.69,
"learning_rate": 3.851227275140319e-05,
"loss": 5.7713,
"step": 190500
},
{
"epoch": 0.69,
"learning_rate": 3.848210967776185e-05,
"loss": 5.7778,
"step": 191000
},
{
"epoch": 0.69,
"eval_loss": 5.775600433349609,
"eval_runtime": 4.2997,
"eval_samples_per_second": 232.575,
"eval_steps_per_second": 14.652,
"step": 191000
},
{
"epoch": 0.69,
"learning_rate": 3.845194660412052e-05,
"loss": 5.7759,
"step": 191500
},
{
"epoch": 0.69,
"learning_rate": 3.842178353047918e-05,
"loss": 5.7705,
"step": 192000
},
{
"epoch": 0.69,
"eval_loss": 5.826145648956299,
"eval_runtime": 4.2643,
"eval_samples_per_second": 234.504,
"eval_steps_per_second": 14.774,
"step": 192000
},
{
"epoch": 0.7,
"learning_rate": 3.839162045683785e-05,
"loss": 5.7729,
"step": 192500
},
{
"epoch": 0.7,
"learning_rate": 3.836145738319652e-05,
"loss": 5.7658,
"step": 193000
},
{
"epoch": 0.7,
"eval_loss": 5.824615001678467,
"eval_runtime": 4.3664,
"eval_samples_per_second": 229.021,
"eval_steps_per_second": 14.428,
"step": 193000
},
{
"epoch": 0.7,
"learning_rate": 3.833135463570247e-05,
"loss": 5.767,
"step": 193500
},
{
"epoch": 0.7,
"learning_rate": 3.830119156206113e-05,
"loss": 5.7687,
"step": 194000
},
{
"epoch": 0.7,
"eval_loss": 5.796684741973877,
"eval_runtime": 4.322,
"eval_samples_per_second": 231.375,
"eval_steps_per_second": 14.577,
"step": 194000
},
{
"epoch": 0.7,
"learning_rate": 3.8271028488419794e-05,
"loss": 5.7601,
"step": 194500
},
{
"epoch": 0.71,
"learning_rate": 3.824086541477846e-05,
"loss": 5.7589,
"step": 195000
},
{
"epoch": 0.71,
"eval_loss": 5.798003673553467,
"eval_runtime": 6.0617,
"eval_samples_per_second": 164.97,
"eval_steps_per_second": 10.393,
"step": 195000
},
{
"epoch": 0.71,
"learning_rate": 3.821070234113713e-05,
"loss": 5.7691,
"step": 195500
},
{
"epoch": 0.71,
"learning_rate": 3.818053926749579e-05,
"loss": 5.7713,
"step": 196000
},
{
"epoch": 0.71,
"eval_loss": 5.799909591674805,
"eval_runtime": 4.3023,
"eval_samples_per_second": 232.433,
"eval_steps_per_second": 14.643,
"step": 196000
},
{
"epoch": 0.71,
"learning_rate": 3.815043652000174e-05,
"loss": 5.7676,
"step": 196500
},
{
"epoch": 0.71,
"learning_rate": 3.8120273446360404e-05,
"loss": 5.7616,
"step": 197000
},
{
"epoch": 0.71,
"eval_loss": 5.801938533782959,
"eval_runtime": 4.062,
"eval_samples_per_second": 246.183,
"eval_steps_per_second": 15.51,
"step": 197000
},
{
"epoch": 0.71,
"learning_rate": 3.8090110372719065e-05,
"loss": 5.7518,
"step": 197500
},
{
"epoch": 0.72,
"learning_rate": 3.805994729907774e-05,
"loss": 5.7643,
"step": 198000
},
{
"epoch": 0.72,
"eval_loss": 5.786435127258301,
"eval_runtime": 4.2697,
"eval_samples_per_second": 234.21,
"eval_steps_per_second": 14.755,
"step": 198000
},
{
"epoch": 0.72,
"learning_rate": 3.802984455158368e-05,
"loss": 5.7582,
"step": 198500
},
{
"epoch": 0.72,
"learning_rate": 3.799968147794235e-05,
"loss": 5.7634,
"step": 199000
},
{
"epoch": 0.72,
"eval_loss": 5.755306720733643,
"eval_runtime": 4.4039,
"eval_samples_per_second": 227.072,
"eval_steps_per_second": 14.306,
"step": 199000
},
{
"epoch": 0.72,
"learning_rate": 3.7969518404301015e-05,
"loss": 5.7523,
"step": 199500
},
{
"epoch": 0.72,
"learning_rate": 3.793935533065968e-05,
"loss": 5.7582,
"step": 200000
},
{
"epoch": 0.72,
"eval_loss": 5.787529468536377,
"eval_runtime": 4.2787,
"eval_samples_per_second": 233.717,
"eval_steps_per_second": 14.724,
"step": 200000
},
{
"epoch": 0.73,
"learning_rate": 3.790919225701834e-05,
"loss": 5.7647,
"step": 200500
},
{
"epoch": 0.73,
"learning_rate": 3.7879029183377016e-05,
"loss": 5.7548,
"step": 201000
},
{
"epoch": 0.73,
"eval_loss": 5.7919721603393555,
"eval_runtime": 4.347,
"eval_samples_per_second": 230.043,
"eval_steps_per_second": 14.493,
"step": 201000
},
{
"epoch": 0.73,
"learning_rate": 3.7848866109735676e-05,
"loss": 5.7659,
"step": 201500
},
{
"epoch": 0.73,
"learning_rate": 3.781870303609434e-05,
"loss": 5.7602,
"step": 202000
},
{
"epoch": 0.73,
"eval_loss": 5.796759605407715,
"eval_runtime": 4.367,
"eval_samples_per_second": 228.988,
"eval_steps_per_second": 14.426,
"step": 202000
},
{
"epoch": 0.73,
"learning_rate": 3.778860028860029e-05,
"loss": 5.7507,
"step": 202500
},
{
"epoch": 0.73,
"learning_rate": 3.7758497541106235e-05,
"loss": 5.7575,
"step": 203000
},
{
"epoch": 0.73,
"eval_loss": 5.792579174041748,
"eval_runtime": 4.2917,
"eval_samples_per_second": 233.009,
"eval_steps_per_second": 14.68,
"step": 203000
},
{
"epoch": 0.74,
"learning_rate": 3.77283344674649e-05,
"loss": 5.7526,
"step": 203500
},
{
"epoch": 0.74,
"learning_rate": 3.769817139382357e-05,
"loss": 5.76,
"step": 204000
},
{
"epoch": 0.74,
"eval_loss": 5.783504962921143,
"eval_runtime": 4.2736,
"eval_samples_per_second": 233.996,
"eval_steps_per_second": 14.742,
"step": 204000
},
{
"epoch": 0.74,
"learning_rate": 3.7668008320182236e-05,
"loss": 5.7642,
"step": 204500
},
{
"epoch": 0.74,
"learning_rate": 3.7637845246540896e-05,
"loss": 5.7455,
"step": 205000
},
{
"epoch": 0.74,
"eval_loss": 5.809587478637695,
"eval_runtime": 6.5273,
"eval_samples_per_second": 153.202,
"eval_steps_per_second": 9.652,
"step": 205000
},
{
"epoch": 0.74,
"learning_rate": 3.760774249904685e-05,
"loss": 5.7621,
"step": 205500
},
{
"epoch": 0.75,
"learning_rate": 3.757757942540551e-05,
"loss": 5.7508,
"step": 206000
},
{
"epoch": 0.75,
"eval_loss": 5.8115620613098145,
"eval_runtime": 4.3241,
"eval_samples_per_second": 231.263,
"eval_steps_per_second": 14.57,
"step": 206000
},
{
"epoch": 0.75,
"learning_rate": 3.754741635176418e-05,
"loss": 5.7566,
"step": 206500
},
{
"epoch": 0.75,
"learning_rate": 3.751725327812285e-05,
"loss": 5.7472,
"step": 207000
},
{
"epoch": 0.75,
"eval_loss": 5.75007963180542,
"eval_runtime": 4.2843,
"eval_samples_per_second": 233.408,
"eval_steps_per_second": 14.705,
"step": 207000
},
{
"epoch": 0.75,
"learning_rate": 3.7487090204481514e-05,
"loss": 5.7621,
"step": 207500
},
{
"epoch": 0.75,
"learning_rate": 3.745698745698746e-05,
"loss": 5.7459,
"step": 208000
},
{
"epoch": 0.75,
"eval_loss": 5.7589802742004395,
"eval_runtime": 4.249,
"eval_samples_per_second": 235.35,
"eval_steps_per_second": 14.827,
"step": 208000
},
{
"epoch": 0.75,
"learning_rate": 3.7426824383346124e-05,
"loss": 5.748,
"step": 208500
},
{
"epoch": 0.76,
"learning_rate": 3.739666130970479e-05,
"loss": 5.7457,
"step": 209000
},
{
"epoch": 0.76,
"eval_loss": 5.755953788757324,
"eval_runtime": 4.381,
"eval_samples_per_second": 228.257,
"eval_steps_per_second": 14.38,
"step": 209000
},
{
"epoch": 0.76,
"learning_rate": 3.736649823606346e-05,
"loss": 5.7559,
"step": 209500
},
{
"epoch": 0.76,
"learning_rate": 3.733633516242212e-05,
"loss": 5.7514,
"step": 210000
},
{
"epoch": 0.76,
"eval_loss": 5.743694305419922,
"eval_runtime": 4.3893,
"eval_samples_per_second": 227.825,
"eval_steps_per_second": 14.353,
"step": 210000
},
{
"epoch": 0.76,
"learning_rate": 3.730623241492807e-05,
"loss": 5.7522,
"step": 210500
},
{
"epoch": 0.76,
"learning_rate": 3.7276069341286734e-05,
"loss": 5.758,
"step": 211000
},
{
"epoch": 0.76,
"eval_loss": 5.809187889099121,
"eval_runtime": 4.3578,
"eval_samples_per_second": 229.472,
"eval_steps_per_second": 14.457,
"step": 211000
},
{
"epoch": 0.77,
"learning_rate": 3.7245906267645394e-05,
"loss": 5.7386,
"step": 211500
},
{
"epoch": 0.77,
"learning_rate": 3.721574319400407e-05,
"loss": 5.7412,
"step": 212000
},
{
"epoch": 0.77,
"eval_loss": 5.790631294250488,
"eval_runtime": 6.9155,
"eval_samples_per_second": 144.603,
"eval_steps_per_second": 9.11,
"step": 212000
},
{
"epoch": 0.77,
"learning_rate": 3.718564044651001e-05,
"loss": 5.7447,
"step": 212500
},
{
"epoch": 0.77,
"learning_rate": 3.715547737286868e-05,
"loss": 5.7564,
"step": 213000
},
{
"epoch": 0.77,
"eval_loss": 5.766078472137451,
"eval_runtime": 4.3057,
"eval_samples_per_second": 232.248,
"eval_steps_per_second": 14.632,
"step": 213000
},
{
"epoch": 0.77,
"learning_rate": 3.7125314299227345e-05,
"loss": 5.7403,
"step": 213500
},
{
"epoch": 0.77,
"learning_rate": 3.7095211551733294e-05,
"loss": 5.7473,
"step": 214000
},
{
"epoch": 0.77,
"eval_loss": 5.826783657073975,
"eval_runtime": 4.2915,
"eval_samples_per_second": 233.02,
"eval_steps_per_second": 14.68,
"step": 214000
},
{
"epoch": 0.78,
"learning_rate": 3.7065048478091955e-05,
"loss": 5.7359,
"step": 214500
},
{
"epoch": 0.78,
"learning_rate": 3.703488540445063e-05,
"loss": 5.7439,
"step": 215000
},
{
"epoch": 0.78,
"eval_loss": 5.752542018890381,
"eval_runtime": 6.2452,
"eval_samples_per_second": 160.123,
"eval_steps_per_second": 10.088,
"step": 215000
},
{
"epoch": 0.78,
"learning_rate": 3.700472233080929e-05,
"loss": 5.7517,
"step": 215500
},
{
"epoch": 0.78,
"learning_rate": 3.6974559257167956e-05,
"loss": 5.7482,
"step": 216000
},
{
"epoch": 0.78,
"eval_loss": 5.75383996963501,
"eval_runtime": 4.2741,
"eval_samples_per_second": 233.968,
"eval_steps_per_second": 14.74,
"step": 216000
},
{
"epoch": 0.78,
"learning_rate": 3.694439618352662e-05,
"loss": 5.7552,
"step": 216500
},
{
"epoch": 0.79,
"learning_rate": 3.691429343603257e-05,
"loss": 5.7498,
"step": 217000
},
{
"epoch": 0.79,
"eval_loss": 5.778792858123779,
"eval_runtime": 4.3819,
"eval_samples_per_second": 228.21,
"eval_steps_per_second": 14.377,
"step": 217000
},
{
"epoch": 0.79,
"learning_rate": 3.688413036239123e-05,
"loss": 5.7418,
"step": 217500
},
{
"epoch": 0.79,
"learning_rate": 3.68539672887499e-05,
"loss": 5.7276,
"step": 218000
},
{
"epoch": 0.79,
"eval_loss": 5.756528377532959,
"eval_runtime": 4.3385,
"eval_samples_per_second": 230.495,
"eval_steps_per_second": 14.521,
"step": 218000
},
{
"epoch": 0.79,
"learning_rate": 3.6823804215108566e-05,
"loss": 5.7493,
"step": 218500
},
{
"epoch": 0.79,
"learning_rate": 3.6793641141467226e-05,
"loss": 5.7559,
"step": 219000
},
{
"epoch": 0.79,
"eval_loss": 5.770053863525391,
"eval_runtime": 4.3798,
"eval_samples_per_second": 228.323,
"eval_steps_per_second": 14.384,
"step": 219000
},
{
"epoch": 0.79,
"learning_rate": 3.676347806782589e-05,
"loss": 5.7309,
"step": 219500
},
{
"epoch": 0.8,
"learning_rate": 3.673331499418456e-05,
"loss": 5.7466,
"step": 220000
},
{
"epoch": 0.8,
"eval_loss": 5.751831531524658,
"eval_runtime": 4.3885,
"eval_samples_per_second": 227.868,
"eval_steps_per_second": 14.356,
"step": 220000
},
{
"epoch": 0.8,
"learning_rate": 3.670315192054323e-05,
"loss": 5.754,
"step": 220500
},
{
"epoch": 0.8,
"learning_rate": 3.667304917304917e-05,
"loss": 5.7441,
"step": 221000
},
{
"epoch": 0.8,
"eval_loss": 5.766170024871826,
"eval_runtime": 4.4111,
"eval_samples_per_second": 226.7,
"eval_steps_per_second": 14.282,
"step": 221000
},
{
"epoch": 0.8,
"learning_rate": 3.6642886099407844e-05,
"loss": 5.739,
"step": 221500
},
{
"epoch": 0.8,
"learning_rate": 3.6612723025766504e-05,
"loss": 5.7352,
"step": 222000
},
{
"epoch": 0.8,
"eval_loss": 5.74399995803833,
"eval_runtime": 4.3561,
"eval_samples_per_second": 229.561,
"eval_steps_per_second": 14.462,
"step": 222000
},
{
"epoch": 0.81,
"learning_rate": 3.658255995212517e-05,
"loss": 5.7368,
"step": 222500
},
{
"epoch": 0.81,
"learning_rate": 3.655239687848384e-05,
"loss": 5.739,
"step": 223000
},
{
"epoch": 0.81,
"eval_loss": 5.763323783874512,
"eval_runtime": 4.3227,
"eval_samples_per_second": 231.335,
"eval_steps_per_second": 14.574,
"step": 223000
},
{
"epoch": 0.81,
"learning_rate": 3.652229413098979e-05,
"loss": 5.7403,
"step": 223500
},
{
"epoch": 0.81,
"learning_rate": 3.649213105734845e-05,
"loss": 5.7391,
"step": 224000
},
{
"epoch": 0.81,
"eval_loss": 5.7649736404418945,
"eval_runtime": 4.3622,
"eval_samples_per_second": 229.24,
"eval_steps_per_second": 14.442,
"step": 224000
},
{
"epoch": 0.81,
"learning_rate": 3.646196798370712e-05,
"loss": 5.744,
"step": 224500
},
{
"epoch": 0.81,
"learning_rate": 3.643180491006578e-05,
"loss": 5.7363,
"step": 225000
},
{
"epoch": 0.81,
"eval_loss": 5.732951641082764,
"eval_runtime": 4.3989,
"eval_samples_per_second": 227.33,
"eval_steps_per_second": 14.322,
"step": 225000
},
{
"epoch": 0.82,
"learning_rate": 3.6401702162571724e-05,
"loss": 5.7421,
"step": 225500
},
{
"epoch": 0.82,
"learning_rate": 3.63715390889304e-05,
"loss": 5.7414,
"step": 226000
},
{
"epoch": 0.82,
"eval_loss": 5.73438024520874,
"eval_runtime": 4.3577,
"eval_samples_per_second": 229.481,
"eval_steps_per_second": 14.457,
"step": 226000
},
{
"epoch": 0.82,
"learning_rate": 3.634137601528906e-05,
"loss": 5.7274,
"step": 226500
},
{
"epoch": 0.82,
"learning_rate": 3.6311212941647725e-05,
"loss": 5.7385,
"step": 227000
},
{
"epoch": 0.82,
"eval_loss": 5.781483173370361,
"eval_runtime": 4.3808,
"eval_samples_per_second": 228.267,
"eval_steps_per_second": 14.381,
"step": 227000
},
{
"epoch": 0.82,
"learning_rate": 3.628104986800639e-05,
"loss": 5.7458,
"step": 227500
},
{
"epoch": 0.83,
"learning_rate": 3.625094712051234e-05,
"loss": 5.7521,
"step": 228000
},
{
"epoch": 0.83,
"eval_loss": 5.737205982208252,
"eval_runtime": 4.3898,
"eval_samples_per_second": 227.799,
"eval_steps_per_second": 14.351,
"step": 228000
},
{
"epoch": 0.83,
"learning_rate": 3.6220784046871e-05,
"loss": 5.7272,
"step": 228500
},
{
"epoch": 0.83,
"learning_rate": 3.619062097322967e-05,
"loss": 5.7258,
"step": 229000
},
{
"epoch": 0.83,
"eval_loss": 5.73113489151001,
"eval_runtime": 4.264,
"eval_samples_per_second": 234.524,
"eval_steps_per_second": 14.775,
"step": 229000
},
{
"epoch": 0.83,
"learning_rate": 3.6160457899588336e-05,
"loss": 5.7354,
"step": 229500
},
{
"epoch": 0.83,
"learning_rate": 3.6130294825947e-05,
"loss": 5.7256,
"step": 230000
},
{
"epoch": 0.83,
"eval_loss": 5.771120071411133,
"eval_runtime": 4.2667,
"eval_samples_per_second": 234.371,
"eval_steps_per_second": 14.765,
"step": 230000
}
],
"max_steps": 828828,
"num_train_epochs": 3,
"total_flos": 2.819512759114629e+17,
"trial_name": null,
"trial_params": null
}