2020-Q4-75p-filtered-random / trainer_state.json
DouglasPontes's picture
End of training
a7c86b1 verified
raw
history blame contribute delete
No virus
81 kB
{
"best_metric": 2.249734878540039,
"best_model_checkpoint": "./model_tweets_2020_Q4_75/checkpoint-1824000",
"epoch": 20.210356123316856,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"eval_loss": 2.578623056411743,
"eval_runtime": 123.0205,
"eval_samples_per_second": 812.873,
"eval_steps_per_second": 50.805,
"step": 8000
},
{
"epoch": 0.13,
"learning_rate": 4.0726666666666665e-07,
"loss": 2.8197,
"step": 16000
},
{
"epoch": 0.13,
"eval_loss": 2.4787847995758057,
"eval_runtime": 121.9308,
"eval_samples_per_second": 820.137,
"eval_steps_per_second": 51.259,
"step": 16000
},
{
"epoch": 0.2,
"eval_loss": 2.434333086013794,
"eval_runtime": 121.4115,
"eval_samples_per_second": 823.645,
"eval_steps_per_second": 51.478,
"step": 24000
},
{
"epoch": 0.27,
"learning_rate": 4.0453333333333336e-07,
"loss": 2.5564,
"step": 32000
},
{
"epoch": 0.27,
"eval_loss": 2.4142847061157227,
"eval_runtime": 121.0734,
"eval_samples_per_second": 825.945,
"eval_steps_per_second": 51.622,
"step": 32000
},
{
"epoch": 0.34,
"eval_loss": 2.3825528621673584,
"eval_runtime": 121.0153,
"eval_samples_per_second": 826.341,
"eval_steps_per_second": 51.646,
"step": 40000
},
{
"epoch": 0.4,
"learning_rate": 4.018e-07,
"loss": 2.4967,
"step": 48000
},
{
"epoch": 0.4,
"eval_loss": 2.3654873371124268,
"eval_runtime": 120.7184,
"eval_samples_per_second": 828.374,
"eval_steps_per_second": 51.773,
"step": 48000
},
{
"epoch": 0.47,
"eval_loss": 2.3449532985687256,
"eval_runtime": 121.1097,
"eval_samples_per_second": 825.698,
"eval_steps_per_second": 51.606,
"step": 56000
},
{
"epoch": 0.54,
"learning_rate": 3.9906666666666667e-07,
"loss": 2.476,
"step": 64000
},
{
"epoch": 0.54,
"eval_loss": 2.3501105308532715,
"eval_runtime": 121.0094,
"eval_samples_per_second": 826.382,
"eval_steps_per_second": 51.649,
"step": 64000
},
{
"epoch": 0.61,
"eval_loss": 2.331531524658203,
"eval_runtime": 121.5665,
"eval_samples_per_second": 822.595,
"eval_steps_per_second": 51.412,
"step": 72000
},
{
"epoch": 0.67,
"learning_rate": 3.963333333333333e-07,
"loss": 2.4525,
"step": 80000
},
{
"epoch": 0.67,
"eval_loss": 2.3285999298095703,
"eval_runtime": 122.0584,
"eval_samples_per_second": 819.28,
"eval_steps_per_second": 51.205,
"step": 80000
},
{
"epoch": 0.74,
"eval_loss": 2.327819585800171,
"eval_runtime": 121.7307,
"eval_samples_per_second": 821.486,
"eval_steps_per_second": 51.343,
"step": 88000
},
{
"epoch": 0.81,
"learning_rate": 3.936e-07,
"loss": 2.445,
"step": 96000
},
{
"epoch": 0.81,
"eval_loss": 2.3187131881713867,
"eval_runtime": 121.9578,
"eval_samples_per_second": 819.956,
"eval_steps_per_second": 51.247,
"step": 96000
},
{
"epoch": 0.88,
"eval_loss": 2.3113441467285156,
"eval_runtime": 121.6182,
"eval_samples_per_second": 822.245,
"eval_steps_per_second": 51.39,
"step": 104000
},
{
"epoch": 0.94,
"learning_rate": 3.908666666666667e-07,
"loss": 2.438,
"step": 112000
},
{
"epoch": 0.94,
"eval_loss": 2.3128702640533447,
"eval_runtime": 120.7683,
"eval_samples_per_second": 828.032,
"eval_steps_per_second": 51.752,
"step": 112000
},
{
"epoch": 1.01,
"eval_loss": 2.3160533905029297,
"eval_runtime": 120.7418,
"eval_samples_per_second": 828.214,
"eval_steps_per_second": 51.763,
"step": 120000
},
{
"epoch": 1.08,
"learning_rate": 3.8813333333333334e-07,
"loss": 2.4233,
"step": 128000
},
{
"epoch": 1.08,
"eval_loss": 2.3008599281311035,
"eval_runtime": 121.6785,
"eval_samples_per_second": 821.838,
"eval_steps_per_second": 51.365,
"step": 128000
},
{
"epoch": 1.15,
"eval_loss": 2.3072261810302734,
"eval_runtime": 121.7737,
"eval_samples_per_second": 821.196,
"eval_steps_per_second": 51.325,
"step": 136000
},
{
"epoch": 1.21,
"learning_rate": 3.854e-07,
"loss": 2.4182,
"step": 144000
},
{
"epoch": 1.21,
"eval_loss": 2.3069441318511963,
"eval_runtime": 121.4616,
"eval_samples_per_second": 823.306,
"eval_steps_per_second": 51.457,
"step": 144000
},
{
"epoch": 1.28,
"eval_loss": 2.3059937953948975,
"eval_runtime": 121.1919,
"eval_samples_per_second": 825.137,
"eval_steps_per_second": 51.571,
"step": 152000
},
{
"epoch": 1.35,
"learning_rate": 3.8266666666666665e-07,
"loss": 2.418,
"step": 160000
},
{
"epoch": 1.35,
"eval_loss": 2.296349048614502,
"eval_runtime": 120.8775,
"eval_samples_per_second": 827.284,
"eval_steps_per_second": 51.705,
"step": 160000
},
{
"epoch": 1.41,
"eval_loss": 2.3016867637634277,
"eval_runtime": 122.1083,
"eval_samples_per_second": 818.945,
"eval_steps_per_second": 51.184,
"step": 168000
},
{
"epoch": 1.48,
"learning_rate": 3.799333333333333e-07,
"loss": 2.4106,
"step": 176000
},
{
"epoch": 1.48,
"eval_loss": 2.2863121032714844,
"eval_runtime": 121.754,
"eval_samples_per_second": 821.328,
"eval_steps_per_second": 51.333,
"step": 176000
},
{
"epoch": 1.55,
"eval_loss": 2.2870755195617676,
"eval_runtime": 121.9197,
"eval_samples_per_second": 820.212,
"eval_steps_per_second": 51.263,
"step": 184000
},
{
"epoch": 1.62,
"learning_rate": 3.772e-07,
"loss": 2.4093,
"step": 192000
},
{
"epoch": 1.62,
"eval_loss": 2.2869644165039062,
"eval_runtime": 123.0294,
"eval_samples_per_second": 812.814,
"eval_steps_per_second": 50.801,
"step": 192000
},
{
"epoch": 1.68,
"eval_loss": 2.2844796180725098,
"eval_runtime": 122.1632,
"eval_samples_per_second": 818.577,
"eval_steps_per_second": 51.161,
"step": 200000
},
{
"epoch": 1.75,
"learning_rate": 3.7446666666666667e-07,
"loss": 2.4124,
"step": 208000
},
{
"epoch": 1.75,
"eval_loss": 2.2971222400665283,
"eval_runtime": 122.9029,
"eval_samples_per_second": 813.651,
"eval_steps_per_second": 50.853,
"step": 208000
},
{
"epoch": 1.82,
"eval_loss": 2.283277988433838,
"eval_runtime": 122.8004,
"eval_samples_per_second": 814.33,
"eval_steps_per_second": 50.896,
"step": 216000
},
{
"epoch": 1.89,
"learning_rate": 3.7173333333333333e-07,
"loss": 2.4031,
"step": 224000
},
{
"epoch": 1.89,
"eval_loss": 2.286567449569702,
"eval_runtime": 123.6365,
"eval_samples_per_second": 808.822,
"eval_steps_per_second": 50.551,
"step": 224000
},
{
"epoch": 1.95,
"eval_loss": 2.2832698822021484,
"eval_runtime": 124.0411,
"eval_samples_per_second": 806.184,
"eval_steps_per_second": 50.387,
"step": 232000
},
{
"epoch": 2.02,
"learning_rate": 3.69e-07,
"loss": 2.4056,
"step": 240000
},
{
"epoch": 2.02,
"eval_loss": 2.2877089977264404,
"eval_runtime": 123.3004,
"eval_samples_per_second": 811.028,
"eval_steps_per_second": 50.689,
"step": 240000
},
{
"epoch": 2.09,
"eval_loss": 2.278871774673462,
"eval_runtime": 121.4394,
"eval_samples_per_second": 823.456,
"eval_steps_per_second": 51.466,
"step": 248000
},
{
"epoch": 2.16,
"learning_rate": 3.6626666666666664e-07,
"loss": 2.4035,
"step": 256000
},
{
"epoch": 2.16,
"eval_loss": 2.287219762802124,
"eval_runtime": 122.192,
"eval_samples_per_second": 818.384,
"eval_steps_per_second": 51.149,
"step": 256000
},
{
"epoch": 2.22,
"eval_loss": 2.277144193649292,
"eval_runtime": 122.8125,
"eval_samples_per_second": 814.25,
"eval_steps_per_second": 50.891,
"step": 264000
},
{
"epoch": 2.29,
"learning_rate": 3.6353333333333335e-07,
"loss": 2.4068,
"step": 272000
},
{
"epoch": 2.29,
"eval_loss": 2.2823851108551025,
"eval_runtime": 122.2322,
"eval_samples_per_second": 818.115,
"eval_steps_per_second": 51.132,
"step": 272000
},
{
"epoch": 2.36,
"eval_loss": 2.268087148666382,
"eval_runtime": 122.1078,
"eval_samples_per_second": 818.949,
"eval_steps_per_second": 51.184,
"step": 280000
},
{
"epoch": 2.43,
"learning_rate": 3.608e-07,
"loss": 2.4069,
"step": 288000
},
{
"epoch": 2.43,
"eval_loss": 2.2866132259368896,
"eval_runtime": 122.7666,
"eval_samples_per_second": 814.554,
"eval_steps_per_second": 50.91,
"step": 288000
},
{
"epoch": 2.49,
"eval_loss": 2.2837841510772705,
"eval_runtime": 122.6627,
"eval_samples_per_second": 815.244,
"eval_steps_per_second": 50.953,
"step": 296000
},
{
"epoch": 2.56,
"learning_rate": 3.5806666666666666e-07,
"loss": 2.4059,
"step": 304000
},
{
"epoch": 2.56,
"eval_loss": 2.280437707901001,
"eval_runtime": 121.9348,
"eval_samples_per_second": 820.111,
"eval_steps_per_second": 51.257,
"step": 304000
},
{
"epoch": 2.63,
"eval_loss": 2.275745391845703,
"eval_runtime": 121.6125,
"eval_samples_per_second": 822.284,
"eval_steps_per_second": 51.393,
"step": 312000
},
{
"epoch": 2.69,
"learning_rate": 3.553333333333333e-07,
"loss": 2.3997,
"step": 320000
},
{
"epoch": 2.69,
"eval_loss": 2.2774524688720703,
"eval_runtime": 121.7427,
"eval_samples_per_second": 821.404,
"eval_steps_per_second": 51.338,
"step": 320000
},
{
"epoch": 2.76,
"eval_loss": 2.2692716121673584,
"eval_runtime": 121.9867,
"eval_samples_per_second": 819.762,
"eval_steps_per_second": 51.235,
"step": 328000
},
{
"epoch": 2.83,
"learning_rate": 3.5259999999999997e-07,
"loss": 2.4025,
"step": 336000
},
{
"epoch": 2.83,
"eval_loss": 2.2751219272613525,
"eval_runtime": 123.4237,
"eval_samples_per_second": 810.217,
"eval_steps_per_second": 50.639,
"step": 336000
},
{
"epoch": 2.9,
"eval_loss": 2.2686424255371094,
"eval_runtime": 122.8579,
"eval_samples_per_second": 813.948,
"eval_steps_per_second": 50.872,
"step": 344000
},
{
"epoch": 2.96,
"learning_rate": 3.498666666666667e-07,
"loss": 2.399,
"step": 352000
},
{
"epoch": 2.96,
"eval_loss": 2.2784106731414795,
"eval_runtime": 123.6612,
"eval_samples_per_second": 808.661,
"eval_steps_per_second": 50.541,
"step": 352000
},
{
"epoch": 3.03,
"eval_loss": 2.278184175491333,
"eval_runtime": 122.1432,
"eval_samples_per_second": 818.711,
"eval_steps_per_second": 51.169,
"step": 360000
},
{
"epoch": 3.1,
"learning_rate": 3.4713333333333333e-07,
"loss": 2.3953,
"step": 368000
},
{
"epoch": 3.1,
"eval_loss": 2.2693536281585693,
"eval_runtime": 123.798,
"eval_samples_per_second": 807.767,
"eval_steps_per_second": 50.485,
"step": 368000
},
{
"epoch": 3.17,
"eval_loss": 2.263842821121216,
"eval_runtime": 123.5987,
"eval_samples_per_second": 809.07,
"eval_steps_per_second": 50.567,
"step": 376000
},
{
"epoch": 3.23,
"learning_rate": 3.444e-07,
"loss": 2.4002,
"step": 384000
},
{
"epoch": 3.23,
"eval_loss": 2.2785000801086426,
"eval_runtime": 123.8322,
"eval_samples_per_second": 807.544,
"eval_steps_per_second": 50.472,
"step": 384000
},
{
"epoch": 3.3,
"eval_loss": 2.278545618057251,
"eval_runtime": 123.0521,
"eval_samples_per_second": 812.664,
"eval_steps_per_second": 50.791,
"step": 392000
},
{
"epoch": 3.37,
"learning_rate": 3.416666666666667e-07,
"loss": 2.4035,
"step": 400000
},
{
"epoch": 3.37,
"eval_loss": 2.277418375015259,
"eval_runtime": 121.6188,
"eval_samples_per_second": 822.241,
"eval_steps_per_second": 51.39,
"step": 400000
},
{
"epoch": 3.44,
"eval_loss": 2.273623466491699,
"eval_runtime": 123.5249,
"eval_samples_per_second": 809.554,
"eval_steps_per_second": 50.597,
"step": 408000
},
{
"epoch": 3.5,
"learning_rate": 3.3893333333333335e-07,
"loss": 2.3985,
"step": 416000
},
{
"epoch": 3.5,
"eval_loss": 2.28084397315979,
"eval_runtime": 124.25,
"eval_samples_per_second": 804.829,
"eval_steps_per_second": 50.302,
"step": 416000
},
{
"epoch": 3.57,
"eval_loss": 2.2672338485717773,
"eval_runtime": 122.2338,
"eval_samples_per_second": 818.104,
"eval_steps_per_second": 51.132,
"step": 424000
},
{
"epoch": 3.64,
"learning_rate": 3.3619999999999995e-07,
"loss": 2.3996,
"step": 432000
},
{
"epoch": 3.64,
"eval_loss": 2.2764840126037598,
"eval_runtime": 121.8322,
"eval_samples_per_second": 820.801,
"eval_steps_per_second": 51.3,
"step": 432000
},
{
"epoch": 3.71,
"eval_loss": 2.2748093605041504,
"eval_runtime": 123.7721,
"eval_samples_per_second": 807.937,
"eval_steps_per_second": 50.496,
"step": 440000
},
{
"epoch": 3.77,
"learning_rate": 3.3346666666666666e-07,
"loss": 2.4052,
"step": 448000
},
{
"epoch": 3.77,
"eval_loss": 2.2646682262420654,
"eval_runtime": 124.0373,
"eval_samples_per_second": 806.209,
"eval_steps_per_second": 50.388,
"step": 448000
},
{
"epoch": 3.84,
"eval_loss": 2.2776055335998535,
"eval_runtime": 122.9793,
"eval_samples_per_second": 813.145,
"eval_steps_per_second": 50.822,
"step": 456000
},
{
"epoch": 3.91,
"learning_rate": 3.307333333333333e-07,
"loss": 2.4025,
"step": 464000
},
{
"epoch": 3.91,
"eval_loss": 2.2734146118164062,
"eval_runtime": 122.7865,
"eval_samples_per_second": 814.422,
"eval_steps_per_second": 50.901,
"step": 464000
},
{
"epoch": 3.97,
"eval_loss": 2.258847713470459,
"eval_runtime": 122.3563,
"eval_samples_per_second": 817.285,
"eval_steps_per_second": 51.08,
"step": 472000
},
{
"epoch": 4.04,
"learning_rate": 3.28e-07,
"loss": 2.4082,
"step": 480000
},
{
"epoch": 4.04,
"eval_loss": 2.272439479827881,
"eval_runtime": 121.6654,
"eval_samples_per_second": 821.927,
"eval_steps_per_second": 51.37,
"step": 480000
},
{
"epoch": 4.11,
"eval_loss": 2.2740166187286377,
"eval_runtime": 121.9787,
"eval_samples_per_second": 819.815,
"eval_steps_per_second": 51.238,
"step": 488000
},
{
"epoch": 4.18,
"learning_rate": 3.252666666666667e-07,
"loss": 2.3993,
"step": 496000
},
{
"epoch": 4.18,
"eval_loss": 2.272566556930542,
"eval_runtime": 122.2763,
"eval_samples_per_second": 817.82,
"eval_steps_per_second": 51.114,
"step": 496000
},
{
"epoch": 4.24,
"eval_loss": 2.282744884490967,
"eval_runtime": 122.9895,
"eval_samples_per_second": 813.078,
"eval_steps_per_second": 50.817,
"step": 504000
},
{
"epoch": 4.31,
"learning_rate": 3.2253333333333334e-07,
"loss": 2.4029,
"step": 512000
},
{
"epoch": 4.31,
"eval_loss": 2.2727839946746826,
"eval_runtime": 123.1921,
"eval_samples_per_second": 811.74,
"eval_steps_per_second": 50.734,
"step": 512000
},
{
"epoch": 4.38,
"eval_loss": 2.2833046913146973,
"eval_runtime": 122.6812,
"eval_samples_per_second": 815.121,
"eval_steps_per_second": 50.945,
"step": 520000
},
{
"epoch": 4.45,
"learning_rate": 3.198e-07,
"loss": 2.407,
"step": 528000
},
{
"epoch": 4.45,
"eval_loss": 2.2636303901672363,
"eval_runtime": 121.8819,
"eval_samples_per_second": 820.467,
"eval_steps_per_second": 51.279,
"step": 528000
},
{
"epoch": 4.51,
"eval_loss": 2.2689170837402344,
"eval_runtime": 121.6415,
"eval_samples_per_second": 822.088,
"eval_steps_per_second": 51.38,
"step": 536000
},
{
"epoch": 4.58,
"learning_rate": 3.1706666666666665e-07,
"loss": 2.4039,
"step": 544000
},
{
"epoch": 4.58,
"eval_loss": 2.2741005420684814,
"eval_runtime": 122.571,
"eval_samples_per_second": 815.854,
"eval_steps_per_second": 50.991,
"step": 544000
},
{
"epoch": 4.65,
"eval_loss": 2.271497964859009,
"eval_runtime": 121.9112,
"eval_samples_per_second": 820.269,
"eval_steps_per_second": 51.267,
"step": 552000
},
{
"epoch": 4.72,
"learning_rate": 3.1433333333333336e-07,
"loss": 2.3983,
"step": 560000
},
{
"epoch": 4.72,
"eval_loss": 2.280510187149048,
"eval_runtime": 122.4175,
"eval_samples_per_second": 816.877,
"eval_steps_per_second": 51.055,
"step": 560000
},
{
"epoch": 4.78,
"eval_loss": 2.274358034133911,
"eval_runtime": 123.0247,
"eval_samples_per_second": 812.845,
"eval_steps_per_second": 50.803,
"step": 568000
},
{
"epoch": 4.85,
"learning_rate": 3.116e-07,
"loss": 2.3974,
"step": 576000
},
{
"epoch": 4.85,
"eval_loss": 2.2677698135375977,
"eval_runtime": 122.4679,
"eval_samples_per_second": 816.54,
"eval_steps_per_second": 51.034,
"step": 576000
},
{
"epoch": 4.92,
"eval_loss": 2.27225399017334,
"eval_runtime": 121.7124,
"eval_samples_per_second": 821.609,
"eval_steps_per_second": 51.351,
"step": 584000
},
{
"epoch": 4.99,
"learning_rate": 3.0886666666666667e-07,
"loss": 2.388,
"step": 592000
},
{
"epoch": 4.99,
"eval_loss": 2.2655186653137207,
"eval_runtime": 123.2663,
"eval_samples_per_second": 811.252,
"eval_steps_per_second": 50.703,
"step": 592000
},
{
"epoch": 5.05,
"eval_loss": 2.2716100215911865,
"eval_runtime": 122.2663,
"eval_samples_per_second": 817.887,
"eval_steps_per_second": 51.118,
"step": 600000
},
{
"epoch": 5.12,
"learning_rate": 3.061333333333333e-07,
"loss": 2.3921,
"step": 608000
},
{
"epoch": 5.12,
"eval_loss": 2.27711820602417,
"eval_runtime": 123.6023,
"eval_samples_per_second": 809.046,
"eval_steps_per_second": 50.565,
"step": 608000
},
{
"epoch": 5.19,
"eval_loss": 2.276036262512207,
"eval_runtime": 122.8165,
"eval_samples_per_second": 814.223,
"eval_steps_per_second": 50.889,
"step": 616000
},
{
"epoch": 5.25,
"learning_rate": 3.034e-07,
"loss": 2.3963,
"step": 624000
},
{
"epoch": 5.25,
"eval_loss": 2.2806384563446045,
"eval_runtime": 124.0725,
"eval_samples_per_second": 805.98,
"eval_steps_per_second": 50.374,
"step": 624000
},
{
"epoch": 5.32,
"eval_loss": 2.2697396278381348,
"eval_runtime": 123.9418,
"eval_samples_per_second": 806.831,
"eval_steps_per_second": 50.427,
"step": 632000
},
{
"epoch": 5.39,
"learning_rate": 3.0066666666666663e-07,
"loss": 2.3891,
"step": 640000
},
{
"epoch": 5.39,
"eval_loss": 2.270462989807129,
"eval_runtime": 123.8524,
"eval_samples_per_second": 807.412,
"eval_steps_per_second": 50.463,
"step": 640000
},
{
"epoch": 5.46,
"eval_loss": 2.270759105682373,
"eval_runtime": 125.0675,
"eval_samples_per_second": 799.568,
"eval_steps_per_second": 49.973,
"step": 648000
},
{
"epoch": 5.52,
"learning_rate": 2.9793333333333334e-07,
"loss": 2.3968,
"step": 656000
},
{
"epoch": 5.52,
"eval_loss": 2.2689437866210938,
"eval_runtime": 123.4597,
"eval_samples_per_second": 809.981,
"eval_steps_per_second": 50.624,
"step": 656000
},
{
"epoch": 5.59,
"eval_loss": 2.265118360519409,
"eval_runtime": 123.1184,
"eval_samples_per_second": 812.226,
"eval_steps_per_second": 50.764,
"step": 664000
},
{
"epoch": 5.66,
"learning_rate": 2.952e-07,
"loss": 2.3951,
"step": 672000
},
{
"epoch": 5.66,
"eval_loss": 2.276594638824463,
"eval_runtime": 122.2204,
"eval_samples_per_second": 818.194,
"eval_steps_per_second": 51.137,
"step": 672000
},
{
"epoch": 5.73,
"eval_loss": 2.2717082500457764,
"eval_runtime": 121.454,
"eval_samples_per_second": 823.357,
"eval_steps_per_second": 51.46,
"step": 680000
},
{
"epoch": 5.79,
"learning_rate": 2.9246666666666665e-07,
"loss": 2.3986,
"step": 688000
},
{
"epoch": 5.79,
"eval_loss": 2.262892246246338,
"eval_runtime": 121.1279,
"eval_samples_per_second": 825.574,
"eval_steps_per_second": 51.598,
"step": 688000
},
{
"epoch": 5.86,
"eval_loss": 2.2623705863952637,
"eval_runtime": 122.6601,
"eval_samples_per_second": 815.261,
"eval_steps_per_second": 50.954,
"step": 696000
},
{
"epoch": 5.93,
"learning_rate": 2.897333333333333e-07,
"loss": 2.3985,
"step": 704000
},
{
"epoch": 5.93,
"eval_loss": 2.2692642211914062,
"eval_runtime": 122.4517,
"eval_samples_per_second": 816.648,
"eval_steps_per_second": 51.041,
"step": 704000
},
{
"epoch": 6.0,
"eval_loss": 2.2631709575653076,
"eval_runtime": 122.6658,
"eval_samples_per_second": 815.223,
"eval_steps_per_second": 50.951,
"step": 712000
},
{
"epoch": 6.06,
"learning_rate": 2.8699999999999996e-07,
"loss": 2.4009,
"step": 720000
},
{
"epoch": 6.06,
"eval_loss": 2.271480083465576,
"eval_runtime": 123.6589,
"eval_samples_per_second": 808.676,
"eval_steps_per_second": 50.542,
"step": 720000
},
{
"epoch": 6.13,
"eval_loss": 2.2654306888580322,
"eval_runtime": 122.4949,
"eval_samples_per_second": 816.36,
"eval_steps_per_second": 51.023,
"step": 728000
},
{
"epoch": 6.2,
"learning_rate": 2.8426666666666667e-07,
"loss": 2.4015,
"step": 736000
},
{
"epoch": 6.2,
"eval_loss": 2.2700347900390625,
"eval_runtime": 123.7479,
"eval_samples_per_second": 808.095,
"eval_steps_per_second": 50.506,
"step": 736000
},
{
"epoch": 6.27,
"eval_loss": 2.2672719955444336,
"eval_runtime": 122.6192,
"eval_samples_per_second": 815.533,
"eval_steps_per_second": 50.971,
"step": 744000
},
{
"epoch": 6.33,
"learning_rate": 2.815333333333333e-07,
"loss": 2.3927,
"step": 752000
},
{
"epoch": 6.33,
"eval_loss": 2.2701234817504883,
"eval_runtime": 122.5955,
"eval_samples_per_second": 815.691,
"eval_steps_per_second": 50.981,
"step": 752000
},
{
"epoch": 6.4,
"eval_loss": 2.266552448272705,
"eval_runtime": 122.7923,
"eval_samples_per_second": 814.383,
"eval_steps_per_second": 50.899,
"step": 760000
},
{
"epoch": 6.47,
"learning_rate": 2.7880000000000003e-07,
"loss": 2.3941,
"step": 768000
},
{
"epoch": 6.47,
"eval_loss": 2.2584707736968994,
"eval_runtime": 122.6812,
"eval_samples_per_second": 815.121,
"eval_steps_per_second": 50.945,
"step": 768000
},
{
"epoch": 6.53,
"eval_loss": 2.2679240703582764,
"eval_runtime": 123.1769,
"eval_samples_per_second": 811.84,
"eval_steps_per_second": 50.74,
"step": 776000
},
{
"epoch": 6.6,
"learning_rate": 2.7606666666666664e-07,
"loss": 2.393,
"step": 784000
},
{
"epoch": 6.6,
"eval_loss": 2.2623932361602783,
"eval_runtime": 122.5019,
"eval_samples_per_second": 816.314,
"eval_steps_per_second": 51.02,
"step": 784000
},
{
"epoch": 6.67,
"eval_loss": 2.2706010341644287,
"eval_runtime": 121.9417,
"eval_samples_per_second": 820.064,
"eval_steps_per_second": 51.254,
"step": 792000
},
{
"epoch": 6.74,
"learning_rate": 2.733333333333333e-07,
"loss": 2.4025,
"step": 800000
},
{
"epoch": 6.74,
"eval_loss": 2.278485059738159,
"eval_runtime": 121.1371,
"eval_samples_per_second": 825.511,
"eval_steps_per_second": 51.594,
"step": 800000
},
{
"epoch": 6.8,
"eval_loss": 2.265829086303711,
"eval_runtime": 122.5144,
"eval_samples_per_second": 816.231,
"eval_steps_per_second": 51.014,
"step": 808000
},
{
"epoch": 6.87,
"learning_rate": 2.706e-07,
"loss": 2.3992,
"step": 816000
},
{
"epoch": 6.87,
"eval_loss": 2.2557029724121094,
"eval_runtime": 122.5492,
"eval_samples_per_second": 815.999,
"eval_steps_per_second": 51.0,
"step": 816000
},
{
"epoch": 6.94,
"eval_loss": 2.2580955028533936,
"eval_runtime": 122.7074,
"eval_samples_per_second": 814.947,
"eval_steps_per_second": 50.934,
"step": 824000
},
{
"epoch": 7.01,
"learning_rate": 2.6786666666666666e-07,
"loss": 2.4055,
"step": 832000
},
{
"epoch": 7.01,
"eval_loss": 2.2724661827087402,
"eval_runtime": 123.1095,
"eval_samples_per_second": 812.285,
"eval_steps_per_second": 50.768,
"step": 832000
},
{
"epoch": 7.07,
"eval_loss": 2.2608320713043213,
"eval_runtime": 123.2985,
"eval_samples_per_second": 811.04,
"eval_steps_per_second": 50.69,
"step": 840000
},
{
"epoch": 7.14,
"learning_rate": 2.651333333333333e-07,
"loss": 2.3965,
"step": 848000
},
{
"epoch": 7.14,
"eval_loss": 2.2716965675354004,
"eval_runtime": 123.0254,
"eval_samples_per_second": 812.84,
"eval_steps_per_second": 50.803,
"step": 848000
},
{
"epoch": 7.21,
"eval_loss": 2.2643144130706787,
"eval_runtime": 124.1312,
"eval_samples_per_second": 805.599,
"eval_steps_per_second": 50.35,
"step": 856000
},
{
"epoch": 7.28,
"learning_rate": 2.624e-07,
"loss": 2.4028,
"step": 864000
},
{
"epoch": 7.28,
"eval_loss": 2.269681930541992,
"eval_runtime": 124.1538,
"eval_samples_per_second": 805.453,
"eval_steps_per_second": 50.341,
"step": 864000
},
{
"epoch": 7.34,
"eval_loss": 2.269113063812256,
"eval_runtime": 122.6058,
"eval_samples_per_second": 815.622,
"eval_steps_per_second": 50.976,
"step": 872000
},
{
"epoch": 7.41,
"learning_rate": 2.596666666666667e-07,
"loss": 2.3943,
"step": 880000
},
{
"epoch": 7.41,
"eval_loss": 2.2628066539764404,
"eval_runtime": 123.121,
"eval_samples_per_second": 812.209,
"eval_steps_per_second": 50.763,
"step": 880000
},
{
"epoch": 7.48,
"eval_loss": 2.262988567352295,
"eval_runtime": 122.3602,
"eval_samples_per_second": 817.259,
"eval_steps_per_second": 51.079,
"step": 888000
},
{
"epoch": 7.55,
"learning_rate": 2.5693333333333333e-07,
"loss": 2.3918,
"step": 896000
},
{
"epoch": 7.55,
"eval_loss": 2.269148349761963,
"eval_runtime": 121.9179,
"eval_samples_per_second": 820.224,
"eval_steps_per_second": 51.264,
"step": 896000
},
{
"epoch": 7.61,
"eval_loss": 2.2778499126434326,
"eval_runtime": 121.8423,
"eval_samples_per_second": 820.733,
"eval_steps_per_second": 51.296,
"step": 904000
},
{
"epoch": 7.68,
"learning_rate": 2.542e-07,
"loss": 2.3897,
"step": 912000
},
{
"epoch": 7.68,
"eval_loss": 2.2576756477355957,
"eval_runtime": 122.7735,
"eval_samples_per_second": 814.508,
"eval_steps_per_second": 50.907,
"step": 912000
},
{
"epoch": 7.75,
"eval_loss": 2.2689757347106934,
"eval_runtime": 122.5565,
"eval_samples_per_second": 815.95,
"eval_steps_per_second": 50.997,
"step": 920000
},
{
"epoch": 7.81,
"learning_rate": 2.5146666666666664e-07,
"loss": 2.3996,
"step": 928000
},
{
"epoch": 7.81,
"eval_loss": 2.2631115913391113,
"eval_runtime": 122.8254,
"eval_samples_per_second": 814.164,
"eval_steps_per_second": 50.885,
"step": 928000
},
{
"epoch": 7.88,
"eval_loss": 2.260575294494629,
"eval_runtime": 123.4359,
"eval_samples_per_second": 810.137,
"eval_steps_per_second": 50.634,
"step": 936000
},
{
"epoch": 7.95,
"learning_rate": 2.4873333333333335e-07,
"loss": 2.4016,
"step": 944000
},
{
"epoch": 7.95,
"eval_loss": 2.274198293685913,
"eval_runtime": 122.9226,
"eval_samples_per_second": 813.52,
"eval_steps_per_second": 50.845,
"step": 944000
},
{
"epoch": 8.02,
"eval_loss": 2.270534038543701,
"eval_runtime": 122.6348,
"eval_samples_per_second": 815.429,
"eval_steps_per_second": 50.964,
"step": 952000
},
{
"epoch": 8.08,
"learning_rate": 2.46e-07,
"loss": 2.3989,
"step": 960000
},
{
"epoch": 8.08,
"eval_loss": 2.26939058303833,
"eval_runtime": 122.9893,
"eval_samples_per_second": 813.079,
"eval_steps_per_second": 50.817,
"step": 960000
},
{
"epoch": 8.15,
"eval_loss": 2.26764178276062,
"eval_runtime": 123.268,
"eval_samples_per_second": 811.241,
"eval_steps_per_second": 50.703,
"step": 968000
},
{
"epoch": 8.22,
"learning_rate": 2.4326666666666666e-07,
"loss": 2.3989,
"step": 976000
},
{
"epoch": 8.22,
"eval_loss": 2.265949010848999,
"eval_runtime": 122.977,
"eval_samples_per_second": 813.16,
"eval_steps_per_second": 50.823,
"step": 976000
},
{
"epoch": 8.29,
"eval_loss": 2.2675843238830566,
"eval_runtime": 123.7208,
"eval_samples_per_second": 808.272,
"eval_steps_per_second": 50.517,
"step": 984000
},
{
"epoch": 8.35,
"learning_rate": 2.405333333333333e-07,
"loss": 2.3995,
"step": 992000
},
{
"epoch": 8.35,
"eval_loss": 2.2751786708831787,
"eval_runtime": 124.1169,
"eval_samples_per_second": 805.692,
"eval_steps_per_second": 50.356,
"step": 992000
},
{
"epoch": 8.42,
"eval_loss": 2.276003360748291,
"eval_runtime": 122.615,
"eval_samples_per_second": 815.561,
"eval_steps_per_second": 50.973,
"step": 1000000
},
{
"epoch": 8.49,
"learning_rate": 2.3779999999999997e-07,
"loss": 2.3958,
"step": 1008000
},
{
"epoch": 8.49,
"eval_loss": 2.2779273986816406,
"eval_runtime": 124.3332,
"eval_samples_per_second": 804.29,
"eval_steps_per_second": 50.268,
"step": 1008000
},
{
"epoch": 8.56,
"eval_loss": 2.2626090049743652,
"eval_runtime": 124.2615,
"eval_samples_per_second": 804.754,
"eval_steps_per_second": 50.297,
"step": 1016000
},
{
"epoch": 8.62,
"learning_rate": 2.3506666666666668e-07,
"loss": 2.3962,
"step": 1024000
},
{
"epoch": 8.62,
"eval_loss": 2.264645576477051,
"eval_runtime": 125.4442,
"eval_samples_per_second": 797.167,
"eval_steps_per_second": 49.823,
"step": 1024000
},
{
"epoch": 8.69,
"eval_loss": 2.264505386352539,
"eval_runtime": 123.3616,
"eval_samples_per_second": 810.625,
"eval_steps_per_second": 50.664,
"step": 1032000
},
{
"epoch": 8.76,
"learning_rate": 2.3233333333333334e-07,
"loss": 2.3966,
"step": 1040000
},
{
"epoch": 8.76,
"eval_loss": 2.2602715492248535,
"eval_runtime": 125.0544,
"eval_samples_per_second": 799.652,
"eval_steps_per_second": 49.978,
"step": 1040000
},
{
"epoch": 8.83,
"eval_loss": 2.254918098449707,
"eval_runtime": 125.3507,
"eval_samples_per_second": 797.762,
"eval_steps_per_second": 49.86,
"step": 1048000
},
{
"epoch": 8.89,
"learning_rate": 2.2960000000000002e-07,
"loss": 2.3934,
"step": 1056000
},
{
"epoch": 8.89,
"eval_loss": 2.2668979167938232,
"eval_runtime": 124.2577,
"eval_samples_per_second": 804.779,
"eval_steps_per_second": 50.299,
"step": 1056000
},
{
"epoch": 8.96,
"eval_loss": 2.2576217651367188,
"eval_runtime": 124.3543,
"eval_samples_per_second": 804.154,
"eval_steps_per_second": 50.26,
"step": 1064000
},
{
"epoch": 9.03,
"learning_rate": 2.2686666666666667e-07,
"loss": 2.3918,
"step": 1072000
},
{
"epoch": 9.03,
"eval_loss": 2.270716905593872,
"eval_runtime": 124.4458,
"eval_samples_per_second": 803.563,
"eval_steps_per_second": 50.223,
"step": 1072000
},
{
"epoch": 9.09,
"eval_loss": 2.261819362640381,
"eval_runtime": 123.2038,
"eval_samples_per_second": 811.663,
"eval_steps_per_second": 50.729,
"step": 1080000
},
{
"epoch": 9.16,
"learning_rate": 2.2413333333333333e-07,
"loss": 2.401,
"step": 1088000
},
{
"epoch": 9.16,
"eval_loss": 2.2680015563964844,
"eval_runtime": 122.7384,
"eval_samples_per_second": 814.741,
"eval_steps_per_second": 50.921,
"step": 1088000
},
{
"epoch": 9.23,
"eval_loss": 2.2721402645111084,
"eval_runtime": 122.6217,
"eval_samples_per_second": 815.516,
"eval_steps_per_second": 50.97,
"step": 1096000
},
{
"epoch": 9.3,
"learning_rate": 2.214e-07,
"loss": 2.3938,
"step": 1104000
},
{
"epoch": 9.3,
"eval_loss": 2.2636525630950928,
"eval_runtime": 123.3896,
"eval_samples_per_second": 810.441,
"eval_steps_per_second": 50.653,
"step": 1104000
},
{
"epoch": 9.36,
"eval_loss": 2.265684127807617,
"eval_runtime": 124.4365,
"eval_samples_per_second": 803.622,
"eval_steps_per_second": 50.226,
"step": 1112000
},
{
"epoch": 9.43,
"learning_rate": 2.1866666666666667e-07,
"loss": 2.3982,
"step": 1120000
},
{
"epoch": 9.43,
"eval_loss": 2.2576382160186768,
"eval_runtime": 123.4041,
"eval_samples_per_second": 810.346,
"eval_steps_per_second": 50.647,
"step": 1120000
},
{
"epoch": 9.5,
"eval_loss": 2.2633111476898193,
"eval_runtime": 124.0094,
"eval_samples_per_second": 806.39,
"eval_steps_per_second": 50.399,
"step": 1128000
},
{
"epoch": 9.57,
"learning_rate": 2.1593333333333332e-07,
"loss": 2.4006,
"step": 1136000
},
{
"epoch": 9.57,
"eval_loss": 2.2667734622955322,
"eval_runtime": 124.2747,
"eval_samples_per_second": 804.669,
"eval_steps_per_second": 50.292,
"step": 1136000
},
{
"epoch": 9.63,
"eval_loss": 2.265963077545166,
"eval_runtime": 123.6085,
"eval_samples_per_second": 809.006,
"eval_steps_per_second": 50.563,
"step": 1144000
},
{
"epoch": 9.7,
"learning_rate": 2.132e-07,
"loss": 2.3971,
"step": 1152000
},
{
"epoch": 9.7,
"eval_loss": 2.265885829925537,
"eval_runtime": 122.706,
"eval_samples_per_second": 814.956,
"eval_steps_per_second": 50.935,
"step": 1152000
},
{
"epoch": 9.77,
"eval_loss": 2.272273063659668,
"eval_runtime": 122.3266,
"eval_samples_per_second": 817.483,
"eval_steps_per_second": 51.093,
"step": 1160000
},
{
"epoch": 9.84,
"learning_rate": 2.1046666666666666e-07,
"loss": 2.4004,
"step": 1168000
},
{
"epoch": 9.84,
"eval_loss": 2.2626895904541016,
"eval_runtime": 122.5255,
"eval_samples_per_second": 816.157,
"eval_steps_per_second": 51.01,
"step": 1168000
},
{
"epoch": 9.9,
"eval_loss": 2.2708263397216797,
"eval_runtime": 123.4094,
"eval_samples_per_second": 810.311,
"eval_steps_per_second": 50.644,
"step": 1176000
},
{
"epoch": 9.97,
"learning_rate": 2.0773333333333334e-07,
"loss": 2.3903,
"step": 1184000
},
{
"epoch": 9.97,
"eval_loss": 2.257643222808838,
"eval_runtime": 124.6842,
"eval_samples_per_second": 802.026,
"eval_steps_per_second": 50.127,
"step": 1184000
},
{
"epoch": 10.04,
"eval_loss": 2.2625293731689453,
"eval_runtime": 123.3124,
"eval_samples_per_second": 810.949,
"eval_steps_per_second": 50.684,
"step": 1192000
},
{
"epoch": 10.11,
"learning_rate": 2.05e-07,
"loss": 2.3909,
"step": 1200000
},
{
"epoch": 10.11,
"eval_loss": 2.254274606704712,
"eval_runtime": 123.3737,
"eval_samples_per_second": 810.545,
"eval_steps_per_second": 50.659,
"step": 1200000
},
{
"epoch": 10.17,
"eval_loss": 2.259488582611084,
"eval_runtime": 123.697,
"eval_samples_per_second": 808.427,
"eval_steps_per_second": 50.527,
"step": 1208000
},
{
"epoch": 10.24,
"learning_rate": 2.0226666666666668e-07,
"loss": 2.4004,
"step": 1216000
},
{
"epoch": 10.24,
"eval_loss": 2.2560548782348633,
"eval_runtime": 124.7416,
"eval_samples_per_second": 801.657,
"eval_steps_per_second": 50.104,
"step": 1216000
},
{
"epoch": 10.31,
"eval_loss": 2.26065993309021,
"eval_runtime": 124.3446,
"eval_samples_per_second": 804.217,
"eval_steps_per_second": 50.264,
"step": 1224000
},
{
"epoch": 10.37,
"learning_rate": 1.9953333333333333e-07,
"loss": 2.3964,
"step": 1232000
},
{
"epoch": 10.37,
"eval_loss": 2.2605860233306885,
"eval_runtime": 124.6826,
"eval_samples_per_second": 802.037,
"eval_steps_per_second": 50.127,
"step": 1232000
},
{
"epoch": 10.44,
"eval_loss": 2.2635273933410645,
"eval_runtime": 124.8015,
"eval_samples_per_second": 801.272,
"eval_steps_per_second": 50.08,
"step": 1240000
},
{
"epoch": 10.51,
"learning_rate": 1.968e-07,
"loss": 2.4007,
"step": 1248000
},
{
"epoch": 10.51,
"eval_loss": 2.262328863143921,
"eval_runtime": 124.2584,
"eval_samples_per_second": 804.775,
"eval_steps_per_second": 50.298,
"step": 1248000
},
{
"epoch": 10.58,
"eval_loss": 2.2696375846862793,
"eval_runtime": 124.2659,
"eval_samples_per_second": 804.726,
"eval_steps_per_second": 50.295,
"step": 1256000
},
{
"epoch": 10.64,
"learning_rate": 1.9406666666666667e-07,
"loss": 2.3993,
"step": 1264000
},
{
"epoch": 10.64,
"eval_loss": 2.2700283527374268,
"eval_runtime": 125.1405,
"eval_samples_per_second": 799.102,
"eval_steps_per_second": 49.944,
"step": 1264000
},
{
"epoch": 10.71,
"eval_loss": 2.2731199264526367,
"eval_runtime": 124.158,
"eval_samples_per_second": 805.425,
"eval_steps_per_second": 50.339,
"step": 1272000
},
{
"epoch": 10.78,
"learning_rate": 1.9133333333333333e-07,
"loss": 2.4048,
"step": 1280000
},
{
"epoch": 10.78,
"eval_loss": 2.2701127529144287,
"eval_runtime": 123.8055,
"eval_samples_per_second": 807.719,
"eval_steps_per_second": 50.482,
"step": 1280000
},
{
"epoch": 10.85,
"eval_loss": 2.270094871520996,
"eval_runtime": 124.7299,
"eval_samples_per_second": 801.732,
"eval_steps_per_second": 50.108,
"step": 1288000
},
{
"epoch": 10.91,
"learning_rate": 1.886e-07,
"loss": 2.3936,
"step": 1296000
},
{
"epoch": 10.91,
"eval_loss": 2.2705767154693604,
"eval_runtime": 124.6548,
"eval_samples_per_second": 802.216,
"eval_steps_per_second": 50.138,
"step": 1296000
},
{
"epoch": 10.98,
"eval_loss": 2.259617567062378,
"eval_runtime": 124.0017,
"eval_samples_per_second": 806.441,
"eval_steps_per_second": 50.403,
"step": 1304000
},
{
"epoch": 11.05,
"learning_rate": 1.8586666666666666e-07,
"loss": 2.3951,
"step": 1312000
},
{
"epoch": 11.05,
"eval_loss": 2.2812488079071045,
"eval_runtime": 123.9447,
"eval_samples_per_second": 806.811,
"eval_steps_per_second": 50.426,
"step": 1312000
},
{
"epoch": 11.12,
"eval_loss": 2.25225830078125,
"eval_runtime": 122.9659,
"eval_samples_per_second": 813.234,
"eval_steps_per_second": 50.827,
"step": 1320000
},
{
"epoch": 11.18,
"learning_rate": 1.8313333333333332e-07,
"loss": 2.39,
"step": 1328000
},
{
"epoch": 11.18,
"eval_loss": 2.259617805480957,
"eval_runtime": 122.4932,
"eval_samples_per_second": 816.372,
"eval_steps_per_second": 51.023,
"step": 1328000
},
{
"epoch": 11.25,
"eval_loss": 2.272284507751465,
"eval_runtime": 123.4502,
"eval_samples_per_second": 810.043,
"eval_steps_per_second": 50.628,
"step": 1336000
},
{
"epoch": 11.32,
"learning_rate": 1.804e-07,
"loss": 2.393,
"step": 1344000
},
{
"epoch": 11.32,
"eval_loss": 2.2695858478546143,
"eval_runtime": 124.1104,
"eval_samples_per_second": 805.734,
"eval_steps_per_second": 50.358,
"step": 1344000
},
{
"epoch": 11.39,
"eval_loss": 2.261369466781616,
"eval_runtime": 123.2613,
"eval_samples_per_second": 811.285,
"eval_steps_per_second": 50.705,
"step": 1352000
},
{
"epoch": 11.45,
"learning_rate": 1.7766666666666666e-07,
"loss": 2.3915,
"step": 1360000
},
{
"epoch": 11.45,
"eval_loss": 2.268724203109741,
"eval_runtime": 122.9565,
"eval_samples_per_second": 813.296,
"eval_steps_per_second": 50.831,
"step": 1360000
},
{
"epoch": 11.52,
"eval_loss": 2.2566559314727783,
"eval_runtime": 123.2157,
"eval_samples_per_second": 811.585,
"eval_steps_per_second": 50.724,
"step": 1368000
},
{
"epoch": 11.59,
"learning_rate": 1.7493333333333334e-07,
"loss": 2.405,
"step": 1376000
},
{
"epoch": 11.59,
"eval_loss": 2.2717325687408447,
"eval_runtime": 122.7605,
"eval_samples_per_second": 814.594,
"eval_steps_per_second": 50.912,
"step": 1376000
},
{
"epoch": 11.65,
"eval_loss": 2.2732982635498047,
"eval_runtime": 123.3107,
"eval_samples_per_second": 810.959,
"eval_steps_per_second": 50.685,
"step": 1384000
},
{
"epoch": 11.72,
"learning_rate": 1.722e-07,
"loss": 2.3898,
"step": 1392000
},
{
"epoch": 11.72,
"eval_loss": 2.2679965496063232,
"eval_runtime": 122.9066,
"eval_samples_per_second": 813.626,
"eval_steps_per_second": 50.852,
"step": 1392000
},
{
"epoch": 11.79,
"eval_loss": 2.2626819610595703,
"eval_runtime": 123.3147,
"eval_samples_per_second": 810.933,
"eval_steps_per_second": 50.683,
"step": 1400000
},
{
"epoch": 11.86,
"learning_rate": 1.6946666666666668e-07,
"loss": 2.3956,
"step": 1408000
},
{
"epoch": 11.86,
"eval_loss": 2.268873929977417,
"eval_runtime": 125.7341,
"eval_samples_per_second": 795.329,
"eval_steps_per_second": 49.708,
"step": 1408000
},
{
"epoch": 11.92,
"eval_loss": 2.2668938636779785,
"eval_runtime": 125.1617,
"eval_samples_per_second": 798.966,
"eval_steps_per_second": 49.935,
"step": 1416000
},
{
"epoch": 11.99,
"learning_rate": 1.6673333333333333e-07,
"loss": 2.4041,
"step": 1424000
},
{
"epoch": 11.99,
"eval_loss": 2.2610137462615967,
"eval_runtime": 125.5606,
"eval_samples_per_second": 796.428,
"eval_steps_per_second": 49.777,
"step": 1424000
},
{
"epoch": 12.06,
"eval_loss": 2.2688722610473633,
"eval_runtime": 126.602,
"eval_samples_per_second": 789.877,
"eval_steps_per_second": 49.367,
"step": 1432000
},
{
"epoch": 12.13,
"learning_rate": 1.64e-07,
"loss": 2.3968,
"step": 1440000
},
{
"epoch": 12.13,
"eval_loss": 2.274923086166382,
"eval_runtime": 126.5506,
"eval_samples_per_second": 790.198,
"eval_steps_per_second": 49.387,
"step": 1440000
},
{
"epoch": 12.19,
"eval_loss": 2.264037609100342,
"eval_runtime": 125.1647,
"eval_samples_per_second": 798.948,
"eval_steps_per_second": 49.934,
"step": 1448000
},
{
"epoch": 12.26,
"learning_rate": 1.6126666666666667e-07,
"loss": 2.4048,
"step": 1456000
},
{
"epoch": 12.26,
"eval_loss": 2.260209798812866,
"eval_runtime": 124.3269,
"eval_samples_per_second": 804.331,
"eval_steps_per_second": 50.271,
"step": 1456000
},
{
"epoch": 12.33,
"eval_loss": 2.269806146621704,
"eval_runtime": 123.3478,
"eval_samples_per_second": 810.715,
"eval_steps_per_second": 50.67,
"step": 1464000
},
{
"epoch": 12.4,
"learning_rate": 1.5853333333333332e-07,
"loss": 2.4025,
"step": 1472000
},
{
"epoch": 12.4,
"eval_loss": 2.2544891834259033,
"eval_runtime": 125.1006,
"eval_samples_per_second": 799.357,
"eval_steps_per_second": 49.96,
"step": 1472000
},
{
"epoch": 12.46,
"eval_loss": 2.2685253620147705,
"eval_runtime": 123.5222,
"eval_samples_per_second": 809.571,
"eval_steps_per_second": 50.598,
"step": 1480000
},
{
"epoch": 12.53,
"learning_rate": 1.558e-07,
"loss": 2.3977,
"step": 1488000
},
{
"epoch": 12.53,
"eval_loss": 2.262343406677246,
"eval_runtime": 123.0303,
"eval_samples_per_second": 812.808,
"eval_steps_per_second": 50.8,
"step": 1488000
},
{
"epoch": 12.6,
"eval_loss": 2.2679247856140137,
"eval_runtime": 123.7046,
"eval_samples_per_second": 808.378,
"eval_steps_per_second": 50.524,
"step": 1496000
},
{
"epoch": 12.67,
"learning_rate": 1.5306666666666666e-07,
"loss": 2.3965,
"step": 1504000
},
{
"epoch": 12.67,
"eval_loss": 2.250502109527588,
"eval_runtime": 124.441,
"eval_samples_per_second": 803.594,
"eval_steps_per_second": 50.225,
"step": 1504000
},
{
"epoch": 12.73,
"eval_loss": 2.270815134048462,
"eval_runtime": 124.209,
"eval_samples_per_second": 805.095,
"eval_steps_per_second": 50.318,
"step": 1512000
},
{
"epoch": 12.8,
"learning_rate": 1.5033333333333332e-07,
"loss": 2.3945,
"step": 1520000
},
{
"epoch": 12.8,
"eval_loss": 2.2654542922973633,
"eval_runtime": 124.6421,
"eval_samples_per_second": 802.297,
"eval_steps_per_second": 50.144,
"step": 1520000
},
{
"epoch": 12.87,
"eval_loss": 2.267200231552124,
"eval_runtime": 125.0498,
"eval_samples_per_second": 799.681,
"eval_steps_per_second": 49.98,
"step": 1528000
},
{
"epoch": 12.93,
"learning_rate": 1.476e-07,
"loss": 2.3957,
"step": 1536000
},
{
"epoch": 12.93,
"eval_loss": 2.269829273223877,
"eval_runtime": 124.6935,
"eval_samples_per_second": 801.966,
"eval_steps_per_second": 50.123,
"step": 1536000
},
{
"epoch": 13.0,
"eval_loss": 2.2660622596740723,
"eval_runtime": 125.1539,
"eval_samples_per_second": 799.016,
"eval_steps_per_second": 49.939,
"step": 1544000
},
{
"epoch": 13.07,
"learning_rate": 1.4486666666666665e-07,
"loss": 2.3951,
"step": 1552000
},
{
"epoch": 13.07,
"eval_loss": 2.263535737991333,
"eval_runtime": 124.319,
"eval_samples_per_second": 804.382,
"eval_steps_per_second": 50.274,
"step": 1552000
},
{
"epoch": 13.14,
"eval_loss": 2.2597036361694336,
"eval_runtime": 124.2784,
"eval_samples_per_second": 804.645,
"eval_steps_per_second": 50.29,
"step": 1560000
},
{
"epoch": 13.2,
"learning_rate": 1.4213333333333334e-07,
"loss": 2.4005,
"step": 1568000
},
{
"epoch": 13.2,
"eval_loss": 2.2575347423553467,
"eval_runtime": 123.3519,
"eval_samples_per_second": 810.689,
"eval_steps_per_second": 50.668,
"step": 1568000
},
{
"epoch": 13.27,
"eval_loss": 2.2647805213928223,
"eval_runtime": 124.82,
"eval_samples_per_second": 801.154,
"eval_steps_per_second": 50.072,
"step": 1576000
},
{
"epoch": 13.34,
"learning_rate": 1.3940000000000002e-07,
"loss": 2.394,
"step": 1584000
},
{
"epoch": 13.34,
"eval_loss": 2.2745957374572754,
"eval_runtime": 124.719,
"eval_samples_per_second": 801.803,
"eval_steps_per_second": 50.113,
"step": 1584000
},
{
"epoch": 13.41,
"eval_loss": 2.272183656692505,
"eval_runtime": 125.5079,
"eval_samples_per_second": 796.762,
"eval_steps_per_second": 49.798,
"step": 1592000
},
{
"epoch": 13.47,
"learning_rate": 1.3666666666666665e-07,
"loss": 2.4016,
"step": 1600000
},
{
"epoch": 13.47,
"eval_loss": 2.2566604614257812,
"eval_runtime": 123.9875,
"eval_samples_per_second": 806.533,
"eval_steps_per_second": 50.408,
"step": 1600000
},
{
"epoch": 13.54,
"eval_loss": 2.259911298751831,
"eval_runtime": 126.1542,
"eval_samples_per_second": 792.681,
"eval_steps_per_second": 49.543,
"step": 1608000
},
{
"epoch": 13.61,
"learning_rate": 1.3393333333333333e-07,
"loss": 2.392,
"step": 1616000
},
{
"epoch": 13.61,
"eval_loss": 2.258845090866089,
"eval_runtime": 126.2191,
"eval_samples_per_second": 792.273,
"eval_steps_per_second": 49.517,
"step": 1616000
},
{
"epoch": 13.68,
"eval_loss": 2.2644309997558594,
"eval_runtime": 125.6333,
"eval_samples_per_second": 795.968,
"eval_steps_per_second": 49.748,
"step": 1624000
},
{
"epoch": 13.74,
"learning_rate": 1.312e-07,
"loss": 2.3936,
"step": 1632000
},
{
"epoch": 13.74,
"eval_loss": 2.266770839691162,
"eval_runtime": 125.3086,
"eval_samples_per_second": 798.03,
"eval_steps_per_second": 49.877,
"step": 1632000
},
{
"epoch": 13.81,
"eval_loss": 2.2446775436401367,
"eval_runtime": 124.755,
"eval_samples_per_second": 801.571,
"eval_steps_per_second": 50.098,
"step": 1640000
},
{
"epoch": 13.88,
"learning_rate": 1.2846666666666667e-07,
"loss": 2.3954,
"step": 1648000
},
{
"epoch": 13.88,
"eval_loss": 2.25016713142395,
"eval_runtime": 125.12,
"eval_samples_per_second": 799.233,
"eval_steps_per_second": 49.952,
"step": 1648000
},
{
"epoch": 13.95,
"eval_loss": 2.2736761569976807,
"eval_runtime": 124.2206,
"eval_samples_per_second": 805.019,
"eval_steps_per_second": 50.314,
"step": 1656000
},
{
"epoch": 14.01,
"learning_rate": 1.2573333333333332e-07,
"loss": 2.3901,
"step": 1664000
},
{
"epoch": 14.01,
"eval_loss": 2.2700750827789307,
"eval_runtime": 124.082,
"eval_samples_per_second": 805.919,
"eval_steps_per_second": 50.37,
"step": 1664000
},
{
"epoch": 14.08,
"eval_loss": 2.263190746307373,
"eval_runtime": 123.8135,
"eval_samples_per_second": 807.667,
"eval_steps_per_second": 50.479,
"step": 1672000
},
{
"epoch": 14.15,
"learning_rate": 1.23e-07,
"loss": 2.3963,
"step": 1680000
},
{
"epoch": 14.15,
"eval_loss": 2.2660787105560303,
"eval_runtime": 123.6011,
"eval_samples_per_second": 809.054,
"eval_steps_per_second": 50.566,
"step": 1680000
},
{
"epoch": 14.21,
"eval_loss": 2.2628071308135986,
"eval_runtime": 124.3028,
"eval_samples_per_second": 804.487,
"eval_steps_per_second": 50.28,
"step": 1688000
},
{
"epoch": 14.28,
"learning_rate": 1.2026666666666666e-07,
"loss": 2.4005,
"step": 1696000
},
{
"epoch": 14.28,
"eval_loss": 2.2605719566345215,
"eval_runtime": 124.679,
"eval_samples_per_second": 802.06,
"eval_steps_per_second": 50.129,
"step": 1696000
},
{
"epoch": 14.35,
"eval_loss": 2.257826328277588,
"eval_runtime": 124.1165,
"eval_samples_per_second": 805.695,
"eval_steps_per_second": 50.356,
"step": 1704000
},
{
"epoch": 14.42,
"learning_rate": 1.1753333333333334e-07,
"loss": 2.3877,
"step": 1712000
},
{
"epoch": 14.42,
"eval_loss": 2.267404079437256,
"eval_runtime": 123.854,
"eval_samples_per_second": 807.402,
"eval_steps_per_second": 50.463,
"step": 1712000
},
{
"epoch": 14.48,
"eval_loss": 2.263066053390503,
"eval_runtime": 124.1271,
"eval_samples_per_second": 805.626,
"eval_steps_per_second": 50.352,
"step": 1720000
},
{
"epoch": 14.55,
"learning_rate": 1.1480000000000001e-07,
"loss": 2.3958,
"step": 1728000
},
{
"epoch": 14.55,
"eval_loss": 2.267526149749756,
"eval_runtime": 124.1257,
"eval_samples_per_second": 805.635,
"eval_steps_per_second": 50.352,
"step": 1728000
},
{
"epoch": 14.62,
"eval_loss": 2.2751998901367188,
"eval_runtime": 123.9013,
"eval_samples_per_second": 807.094,
"eval_steps_per_second": 50.443,
"step": 1736000
},
{
"epoch": 14.69,
"learning_rate": 1.1206666666666666e-07,
"loss": 2.3858,
"step": 1744000
},
{
"epoch": 14.69,
"eval_loss": 2.2622976303100586,
"eval_runtime": 124.0119,
"eval_samples_per_second": 806.374,
"eval_steps_per_second": 50.398,
"step": 1744000
},
{
"epoch": 14.75,
"eval_loss": 2.2577433586120605,
"eval_runtime": 124.3224,
"eval_samples_per_second": 804.36,
"eval_steps_per_second": 50.273,
"step": 1752000
},
{
"epoch": 14.82,
"learning_rate": 1.0933333333333333e-07,
"loss": 2.403,
"step": 1760000
},
{
"epoch": 14.82,
"eval_loss": 2.251173496246338,
"eval_runtime": 124.4564,
"eval_samples_per_second": 803.494,
"eval_steps_per_second": 50.218,
"step": 1760000
},
{
"epoch": 14.89,
"eval_loss": 2.2610392570495605,
"eval_runtime": 124.6434,
"eval_samples_per_second": 802.289,
"eval_steps_per_second": 50.143,
"step": 1768000
},
{
"epoch": 14.96,
"learning_rate": 1.066e-07,
"loss": 2.3969,
"step": 1776000
},
{
"epoch": 14.96,
"eval_loss": 2.259674310684204,
"eval_runtime": 125.2932,
"eval_samples_per_second": 798.128,
"eval_steps_per_second": 49.883,
"step": 1776000
},
{
"epoch": 15.02,
"eval_loss": 2.274836778640747,
"eval_runtime": 125.8602,
"eval_samples_per_second": 794.532,
"eval_steps_per_second": 49.658,
"step": 1784000
},
{
"epoch": 15.09,
"learning_rate": 1.0386666666666667e-07,
"loss": 2.4016,
"step": 1792000
},
{
"epoch": 15.09,
"eval_loss": 2.2631607055664062,
"eval_runtime": 126.3601,
"eval_samples_per_second": 791.389,
"eval_steps_per_second": 49.462,
"step": 1792000
},
{
"epoch": 15.16,
"eval_loss": 2.2650434970855713,
"eval_runtime": 125.0464,
"eval_samples_per_second": 799.703,
"eval_steps_per_second": 49.981,
"step": 1800000
},
{
"epoch": 15.23,
"learning_rate": 1.0113333333333334e-07,
"loss": 2.4018,
"step": 1808000
},
{
"epoch": 15.23,
"eval_loss": 2.266899824142456,
"eval_runtime": 125.1187,
"eval_samples_per_second": 799.241,
"eval_steps_per_second": 49.953,
"step": 1808000
},
{
"epoch": 15.29,
"eval_loss": 2.252521276473999,
"eval_runtime": 124.7901,
"eval_samples_per_second": 801.346,
"eval_steps_per_second": 50.084,
"step": 1816000
},
{
"epoch": 15.36,
"learning_rate": 9.84e-08,
"loss": 2.3954,
"step": 1824000
},
{
"epoch": 15.36,
"eval_loss": 2.249734878540039,
"eval_runtime": 125.3184,
"eval_samples_per_second": 797.967,
"eval_steps_per_second": 49.873,
"step": 1824000
},
{
"epoch": 15.43,
"eval_loss": 2.274369716644287,
"eval_runtime": 125.8354,
"eval_samples_per_second": 794.689,
"eval_steps_per_second": 49.668,
"step": 1832000
},
{
"epoch": 15.49,
"learning_rate": 9.566666666666666e-08,
"loss": 2.396,
"step": 1840000
},
{
"epoch": 15.49,
"eval_loss": 2.267287492752075,
"eval_runtime": 124.6245,
"eval_samples_per_second": 802.41,
"eval_steps_per_second": 50.151,
"step": 1840000
},
{
"epoch": 15.56,
"eval_loss": 2.263681173324585,
"eval_runtime": 124.8384,
"eval_samples_per_second": 801.035,
"eval_steps_per_second": 50.065,
"step": 1848000
},
{
"epoch": 15.63,
"learning_rate": 9.293333333333333e-08,
"loss": 2.3951,
"step": 1856000
},
{
"epoch": 15.63,
"eval_loss": 2.2615184783935547,
"eval_runtime": 125.4015,
"eval_samples_per_second": 797.439,
"eval_steps_per_second": 49.84,
"step": 1856000
},
{
"epoch": 15.7,
"eval_loss": 2.2643656730651855,
"eval_runtime": 124.6138,
"eval_samples_per_second": 802.479,
"eval_steps_per_second": 50.155,
"step": 1864000
},
{
"epoch": 15.76,
"learning_rate": 9.02e-08,
"loss": 2.4017,
"step": 1872000
},
{
"epoch": 15.76,
"eval_loss": 2.265629768371582,
"eval_runtime": 124.2729,
"eval_samples_per_second": 804.681,
"eval_steps_per_second": 50.293,
"step": 1872000
},
{
"epoch": 15.83,
"eval_loss": 2.268179178237915,
"eval_runtime": 124.3886,
"eval_samples_per_second": 803.932,
"eval_steps_per_second": 50.246,
"step": 1880000
},
{
"epoch": 15.9,
"learning_rate": 8.746666666666667e-08,
"loss": 2.3962,
"step": 1888000
},
{
"epoch": 15.9,
"eval_loss": 2.2591919898986816,
"eval_runtime": 124.0671,
"eval_samples_per_second": 806.015,
"eval_steps_per_second": 50.376,
"step": 1888000
},
{
"epoch": 15.97,
"eval_loss": 2.264333724975586,
"eval_runtime": 123.9572,
"eval_samples_per_second": 806.73,
"eval_steps_per_second": 50.421,
"step": 1896000
},
{
"epoch": 16.03,
"learning_rate": 8.473333333333334e-08,
"loss": 2.3996,
"step": 1904000
},
{
"epoch": 16.03,
"eval_loss": 2.264843225479126,
"eval_runtime": 124.1244,
"eval_samples_per_second": 805.644,
"eval_steps_per_second": 50.353,
"step": 1904000
},
{
"epoch": 16.1,
"eval_loss": 2.2705702781677246,
"eval_runtime": 124.9853,
"eval_samples_per_second": 800.094,
"eval_steps_per_second": 50.006,
"step": 1912000
},
{
"epoch": 16.17,
"learning_rate": 8.2e-08,
"loss": 2.3994,
"step": 1920000
},
{
"epoch": 16.17,
"eval_loss": 2.270008087158203,
"eval_runtime": 124.4796,
"eval_samples_per_second": 803.345,
"eval_steps_per_second": 50.209,
"step": 1920000
},
{
"epoch": 16.24,
"eval_loss": 2.2626538276672363,
"eval_runtime": 124.7698,
"eval_samples_per_second": 801.476,
"eval_steps_per_second": 50.092,
"step": 1928000
},
{
"epoch": 16.3,
"learning_rate": 7.926666666666666e-08,
"loss": 2.3976,
"step": 1936000
},
{
"epoch": 16.3,
"eval_loss": 2.259154796600342,
"eval_runtime": 125.5881,
"eval_samples_per_second": 796.254,
"eval_steps_per_second": 49.766,
"step": 1936000
},
{
"epoch": 16.37,
"eval_loss": 2.2606401443481445,
"eval_runtime": 124.5151,
"eval_samples_per_second": 803.116,
"eval_steps_per_second": 50.195,
"step": 1944000
},
{
"epoch": 16.44,
"learning_rate": 7.653333333333333e-08,
"loss": 2.3971,
"step": 1952000
},
{
"epoch": 16.44,
"eval_loss": 2.2588131427764893,
"eval_runtime": 125.6386,
"eval_samples_per_second": 795.934,
"eval_steps_per_second": 49.746,
"step": 1952000
},
{
"epoch": 16.51,
"eval_loss": 2.2607157230377197,
"eval_runtime": 125.7855,
"eval_samples_per_second": 795.004,
"eval_steps_per_second": 49.688,
"step": 1960000
},
{
"epoch": 16.57,
"learning_rate": 7.38e-08,
"loss": 2.3991,
"step": 1968000
},
{
"epoch": 16.57,
"eval_loss": 2.2692136764526367,
"eval_runtime": 124.174,
"eval_samples_per_second": 805.322,
"eval_steps_per_second": 50.333,
"step": 1968000
},
{
"epoch": 16.64,
"eval_loss": 2.2548389434814453,
"eval_runtime": 126.1957,
"eval_samples_per_second": 792.42,
"eval_steps_per_second": 49.526,
"step": 1976000
},
{
"epoch": 16.71,
"learning_rate": 7.106666666666667e-08,
"loss": 2.3952,
"step": 1984000
},
{
"epoch": 16.71,
"eval_loss": 2.2572038173675537,
"eval_runtime": 126.1234,
"eval_samples_per_second": 792.874,
"eval_steps_per_second": 49.555,
"step": 1984000
},
{
"epoch": 16.77,
"eval_loss": 2.262612819671631,
"eval_runtime": 126.0135,
"eval_samples_per_second": 793.566,
"eval_steps_per_second": 49.598,
"step": 1992000
},
{
"epoch": 16.84,
"learning_rate": 6.833333333333332e-08,
"loss": 2.4002,
"step": 2000000
},
{
"epoch": 16.84,
"eval_loss": 2.268040657043457,
"eval_runtime": 126.4057,
"eval_samples_per_second": 791.104,
"eval_steps_per_second": 49.444,
"step": 2000000
},
{
"epoch": 16.91,
"eval_loss": 2.268988847732544,
"eval_runtime": 126.2757,
"eval_samples_per_second": 791.918,
"eval_steps_per_second": 49.495,
"step": 2008000
},
{
"epoch": 16.98,
"learning_rate": 6.56e-08,
"loss": 2.3937,
"step": 2016000
},
{
"epoch": 16.98,
"eval_loss": 2.2523088455200195,
"eval_runtime": 125.8129,
"eval_samples_per_second": 794.831,
"eval_steps_per_second": 49.677,
"step": 2016000
},
{
"epoch": 17.04,
"eval_loss": 2.2699527740478516,
"eval_runtime": 126.3749,
"eval_samples_per_second": 791.296,
"eval_steps_per_second": 49.456,
"step": 2024000
},
{
"epoch": 17.11,
"learning_rate": 6.286666666666666e-08,
"loss": 2.3999,
"step": 2032000
},
{
"epoch": 17.11,
"eval_loss": 2.265235662460327,
"eval_runtime": 125.4726,
"eval_samples_per_second": 796.987,
"eval_steps_per_second": 49.812,
"step": 2032000
},
{
"epoch": 17.18,
"eval_loss": 2.2670557498931885,
"eval_runtime": 125.952,
"eval_samples_per_second": 793.953,
"eval_steps_per_second": 49.622,
"step": 2040000
},
{
"epoch": 17.25,
"learning_rate": 6.013333333333333e-08,
"loss": 2.3891,
"step": 2048000
},
{
"epoch": 17.25,
"eval_loss": 2.2700319290161133,
"eval_runtime": 126.128,
"eval_samples_per_second": 792.845,
"eval_steps_per_second": 49.553,
"step": 2048000
},
{
"epoch": 17.31,
"eval_loss": 2.258948802947998,
"eval_runtime": 126.6616,
"eval_samples_per_second": 789.505,
"eval_steps_per_second": 49.344,
"step": 2056000
},
{
"epoch": 17.38,
"learning_rate": 5.7400000000000004e-08,
"loss": 2.397,
"step": 2064000
},
{
"epoch": 17.38,
"eval_loss": 2.2626419067382812,
"eval_runtime": 125.2648,
"eval_samples_per_second": 798.309,
"eval_steps_per_second": 49.894,
"step": 2064000
},
{
"epoch": 17.45,
"eval_loss": 2.260664463043213,
"eval_runtime": 124.8328,
"eval_samples_per_second": 801.071,
"eval_steps_per_second": 50.067,
"step": 2072000
},
{
"epoch": 17.52,
"learning_rate": 5.4666666666666666e-08,
"loss": 2.3968,
"step": 2080000
},
{
"epoch": 17.52,
"eval_loss": 2.2663474082946777,
"eval_runtime": 126.0403,
"eval_samples_per_second": 793.397,
"eval_steps_per_second": 49.587,
"step": 2080000
},
{
"epoch": 17.58,
"eval_loss": 2.263674020767212,
"eval_runtime": 125.5699,
"eval_samples_per_second": 796.369,
"eval_steps_per_second": 49.773,
"step": 2088000
},
{
"epoch": 17.65,
"learning_rate": 5.1933333333333335e-08,
"loss": 2.3932,
"step": 2096000
},
{
"epoch": 17.65,
"eval_loss": 2.2622973918914795,
"eval_runtime": 125.0302,
"eval_samples_per_second": 799.807,
"eval_steps_per_second": 49.988,
"step": 2096000
},
{
"epoch": 17.72,
"eval_loss": 2.267320394515991,
"eval_runtime": 126.5313,
"eval_samples_per_second": 790.318,
"eval_steps_per_second": 49.395,
"step": 2104000
},
{
"epoch": 17.79,
"learning_rate": 4.92e-08,
"loss": 2.3981,
"step": 2112000
},
{
"epoch": 17.79,
"eval_loss": 2.2546768188476562,
"eval_runtime": 126.1555,
"eval_samples_per_second": 792.673,
"eval_steps_per_second": 49.542,
"step": 2112000
},
{
"epoch": 17.85,
"eval_loss": 2.2597532272338867,
"eval_runtime": 126.0713,
"eval_samples_per_second": 793.202,
"eval_steps_per_second": 49.575,
"step": 2120000
},
{
"epoch": 17.92,
"learning_rate": 4.6466666666666666e-08,
"loss": 2.3964,
"step": 2128000
},
{
"epoch": 17.92,
"eval_loss": 2.2690351009368896,
"eval_runtime": 127.1255,
"eval_samples_per_second": 786.624,
"eval_steps_per_second": 49.164,
"step": 2128000
},
{
"epoch": 17.99,
"eval_loss": 2.2618870735168457,
"eval_runtime": 126.4894,
"eval_samples_per_second": 790.58,
"eval_steps_per_second": 49.411,
"step": 2136000
},
{
"epoch": 18.05,
"learning_rate": 4.3733333333333335e-08,
"loss": 2.3941,
"step": 2144000
},
{
"epoch": 18.05,
"eval_loss": 2.255786657333374,
"eval_runtime": 125.2961,
"eval_samples_per_second": 798.11,
"eval_steps_per_second": 49.882,
"step": 2144000
},
{
"epoch": 18.12,
"eval_loss": 2.2658987045288086,
"eval_runtime": 125.33,
"eval_samples_per_second": 797.893,
"eval_steps_per_second": 49.868,
"step": 2152000
},
{
"epoch": 18.19,
"learning_rate": 4.1e-08,
"loss": 2.3926,
"step": 2160000
},
{
"epoch": 18.19,
"eval_loss": 2.255154848098755,
"eval_runtime": 125.9241,
"eval_samples_per_second": 794.129,
"eval_steps_per_second": 49.633,
"step": 2160000
},
{
"epoch": 18.26,
"eval_loss": 2.267132043838501,
"eval_runtime": 126.3428,
"eval_samples_per_second": 791.498,
"eval_steps_per_second": 49.469,
"step": 2168000
},
{
"epoch": 18.32,
"learning_rate": 3.8266666666666665e-08,
"loss": 2.399,
"step": 2176000
},
{
"epoch": 18.32,
"eval_loss": 2.2660810947418213,
"eval_runtime": 126.922,
"eval_samples_per_second": 787.886,
"eval_steps_per_second": 49.243,
"step": 2176000
},
{
"epoch": 18.39,
"eval_loss": 2.259093999862671,
"eval_runtime": 126.8243,
"eval_samples_per_second": 788.493,
"eval_steps_per_second": 49.281,
"step": 2184000
},
{
"epoch": 18.46,
"learning_rate": 3.5533333333333334e-08,
"loss": 2.3941,
"step": 2192000
},
{
"epoch": 18.46,
"eval_loss": 2.2567954063415527,
"eval_runtime": 126.0771,
"eval_samples_per_second": 793.165,
"eval_steps_per_second": 49.573,
"step": 2192000
},
{
"epoch": 18.53,
"eval_loss": 2.258824348449707,
"eval_runtime": 126.912,
"eval_samples_per_second": 787.947,
"eval_steps_per_second": 49.247,
"step": 2200000
},
{
"epoch": 18.59,
"learning_rate": 3.28e-08,
"loss": 2.3975,
"step": 2208000
},
{
"epoch": 18.59,
"eval_loss": 2.2631142139434814,
"eval_runtime": 126.1148,
"eval_samples_per_second": 792.928,
"eval_steps_per_second": 49.558,
"step": 2208000
},
{
"epoch": 18.66,
"eval_loss": 2.265528917312622,
"eval_runtime": 125.083,
"eval_samples_per_second": 799.469,
"eval_steps_per_second": 49.967,
"step": 2216000
},
{
"epoch": 18.73,
"learning_rate": 3.0066666666666665e-08,
"loss": 2.3884,
"step": 2224000
},
{
"epoch": 18.73,
"eval_loss": 2.262817859649658,
"eval_runtime": 126.358,
"eval_samples_per_second": 791.402,
"eval_steps_per_second": 49.463,
"step": 2224000
},
{
"epoch": 18.8,
"eval_loss": 2.265577554702759,
"eval_runtime": 125.5796,
"eval_samples_per_second": 796.308,
"eval_steps_per_second": 49.769,
"step": 2232000
},
{
"epoch": 18.86,
"learning_rate": 2.7333333333333333e-08,
"loss": 2.399,
"step": 2240000
},
{
"epoch": 18.86,
"eval_loss": 2.264435052871704,
"eval_runtime": 125.6584,
"eval_samples_per_second": 795.808,
"eval_steps_per_second": 49.738,
"step": 2240000
},
{
"epoch": 18.93,
"eval_loss": 2.2607643604278564,
"eval_runtime": 124.8913,
"eval_samples_per_second": 800.697,
"eval_steps_per_second": 50.044,
"step": 2248000
},
{
"epoch": 19.0,
"learning_rate": 2.46e-08,
"loss": 2.4064,
"step": 2256000
},
{
"epoch": 19.0,
"eval_loss": 2.256071090698242,
"eval_runtime": 124.5565,
"eval_samples_per_second": 802.849,
"eval_steps_per_second": 50.178,
"step": 2256000
},
{
"epoch": 19.07,
"eval_loss": 2.2680041790008545,
"eval_runtime": 125.2469,
"eval_samples_per_second": 798.423,
"eval_steps_per_second": 49.901,
"step": 2264000
},
{
"epoch": 19.13,
"learning_rate": 2.1866666666666667e-08,
"loss": 2.3999,
"step": 2272000
},
{
"epoch": 19.13,
"eval_loss": 2.27026104927063,
"eval_runtime": 124.766,
"eval_samples_per_second": 801.501,
"eval_steps_per_second": 50.094,
"step": 2272000
},
{
"epoch": 19.2,
"eval_loss": 2.2624101638793945,
"eval_runtime": 124.8172,
"eval_samples_per_second": 801.172,
"eval_steps_per_second": 50.073,
"step": 2280000
},
{
"epoch": 19.27,
"learning_rate": 1.9133333333333333e-08,
"loss": 2.398,
"step": 2288000
},
{
"epoch": 19.27,
"eval_loss": 2.270665407180786,
"eval_runtime": 124.8089,
"eval_samples_per_second": 801.225,
"eval_steps_per_second": 50.077,
"step": 2288000
},
{
"epoch": 19.33,
"eval_loss": 2.264582872390747,
"eval_runtime": 124.6344,
"eval_samples_per_second": 802.347,
"eval_steps_per_second": 50.147,
"step": 2296000
},
{
"epoch": 19.4,
"learning_rate": 1.64e-08,
"loss": 2.4007,
"step": 2304000
},
{
"epoch": 19.4,
"eval_loss": 2.2658579349517822,
"eval_runtime": 125.9424,
"eval_samples_per_second": 794.014,
"eval_steps_per_second": 49.626,
"step": 2304000
},
{
"epoch": 19.47,
"eval_loss": 2.2709732055664062,
"eval_runtime": 125.5781,
"eval_samples_per_second": 796.317,
"eval_steps_per_second": 49.77,
"step": 2312000
},
{
"epoch": 19.54,
"learning_rate": 1.3666666666666667e-08,
"loss": 2.3955,
"step": 2320000
},
{
"epoch": 19.54,
"eval_loss": 2.2720258235931396,
"eval_runtime": 125.3571,
"eval_samples_per_second": 797.721,
"eval_steps_per_second": 49.858,
"step": 2320000
},
{
"epoch": 19.6,
"eval_loss": 2.2569046020507812,
"eval_runtime": 126.2089,
"eval_samples_per_second": 792.337,
"eval_steps_per_second": 49.521,
"step": 2328000
},
{
"epoch": 19.67,
"learning_rate": 1.0933333333333334e-08,
"loss": 2.3973,
"step": 2336000
},
{
"epoch": 19.67,
"eval_loss": 2.264122486114502,
"eval_runtime": 124.8579,
"eval_samples_per_second": 800.91,
"eval_steps_per_second": 50.057,
"step": 2336000
},
{
"epoch": 19.74,
"eval_loss": 2.263296127319336,
"eval_runtime": 126.0432,
"eval_samples_per_second": 793.379,
"eval_steps_per_second": 49.586,
"step": 2344000
},
{
"epoch": 19.81,
"learning_rate": 8.2e-09,
"loss": 2.4059,
"step": 2352000
},
{
"epoch": 19.81,
"eval_loss": 2.262235164642334,
"eval_runtime": 126.1477,
"eval_samples_per_second": 792.722,
"eval_steps_per_second": 49.545,
"step": 2352000
},
{
"epoch": 19.87,
"eval_loss": 2.253866672515869,
"eval_runtime": 126.3644,
"eval_samples_per_second": 791.362,
"eval_steps_per_second": 49.46,
"step": 2360000
},
{
"epoch": 19.94,
"learning_rate": 5.466666666666667e-09,
"loss": 2.3899,
"step": 2368000
},
{
"epoch": 19.94,
"eval_loss": 2.266547441482544,
"eval_runtime": 128.0069,
"eval_samples_per_second": 781.208,
"eval_steps_per_second": 48.825,
"step": 2368000
},
{
"epoch": 20.01,
"eval_loss": 2.262890577316284,
"eval_runtime": 128.4011,
"eval_samples_per_second": 778.809,
"eval_steps_per_second": 48.676,
"step": 2376000
},
{
"epoch": 20.08,
"learning_rate": 2.7333333333333334e-09,
"loss": 2.4025,
"step": 2384000
},
{
"epoch": 20.08,
"eval_loss": 2.2551090717315674,
"eval_runtime": 127.3686,
"eval_samples_per_second": 785.123,
"eval_steps_per_second": 49.07,
"step": 2384000
},
{
"epoch": 20.14,
"eval_loss": 2.2545723915100098,
"eval_runtime": 127.74,
"eval_samples_per_second": 782.84,
"eval_steps_per_second": 48.928,
"step": 2392000
},
{
"epoch": 20.21,
"learning_rate": 0.0,
"loss": 2.3956,
"step": 2400000
},
{
"epoch": 20.21,
"eval_loss": 2.2619526386260986,
"eval_runtime": 127.428,
"eval_samples_per_second": 784.757,
"eval_steps_per_second": 49.047,
"step": 2400000
},
{
"epoch": 20.21,
"step": 2400000,
"total_flos": 8.365104666768984e+17,
"train_loss": 2.404444656575521,
"train_runtime": 193218.8586,
"train_samples_per_second": 198.738,
"train_steps_per_second": 12.421
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 21,
"save_steps": 32000,
"total_flos": 8.365104666768984e+17,
"trial_name": null,
"trial_params": null
}