phi-ft-1000000-fp-newsplit / trainer_state.json
KaranChand's picture
Model save
8591e44 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 8736,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005723443223443223,
"grad_norm": 6.21875,
"learning_rate": 1.4302059496567508e-06,
"loss": 3.2916,
"step": 50
},
{
"epoch": 0.011446886446886446,
"grad_norm": 3.453125,
"learning_rate": 2.8604118993135015e-06,
"loss": 3.1002,
"step": 100
},
{
"epoch": 0.011446886446886446,
"eval_loss": 3.050471782684326,
"eval_runtime": 124.1248,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 100
},
{
"epoch": 0.017170329670329672,
"grad_norm": 1.8671875,
"learning_rate": 4.290617848970252e-06,
"loss": 2.6977,
"step": 150
},
{
"epoch": 0.022893772893772892,
"grad_norm": 0.91015625,
"learning_rate": 5.720823798627003e-06,
"loss": 2.1929,
"step": 200
},
{
"epoch": 0.022893772893772892,
"eval_loss": 2.049286365509033,
"eval_runtime": 124.1321,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 200
},
{
"epoch": 0.028617216117216116,
"grad_norm": 0.625,
"learning_rate": 7.151029748283754e-06,
"loss": 1.8298,
"step": 250
},
{
"epoch": 0.034340659340659344,
"grad_norm": 0.486328125,
"learning_rate": 8.581235697940504e-06,
"loss": 1.6369,
"step": 300
},
{
"epoch": 0.034340659340659344,
"eval_loss": 1.6432359218597412,
"eval_runtime": 124.1394,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 300
},
{
"epoch": 0.04006410256410257,
"grad_norm": 0.5078125,
"learning_rate": 1.0011441647597253e-05,
"loss": 1.531,
"step": 350
},
{
"epoch": 0.045787545787545784,
"grad_norm": 0.328125,
"learning_rate": 1.1441647597254006e-05,
"loss": 1.4618,
"step": 400
},
{
"epoch": 0.045787545787545784,
"eval_loss": 1.5580341815948486,
"eval_runtime": 124.1039,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 400
},
{
"epoch": 0.05151098901098901,
"grad_norm": 0.28515625,
"learning_rate": 1.2871853546910755e-05,
"loss": 1.4061,
"step": 450
},
{
"epoch": 0.05723443223443223,
"grad_norm": 0.2890625,
"learning_rate": 1.4302059496567508e-05,
"loss": 1.317,
"step": 500
},
{
"epoch": 0.05723443223443223,
"eval_loss": 1.5409735441207886,
"eval_runtime": 124.0925,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 500
},
{
"epoch": 0.06295787545787546,
"grad_norm": 0.265625,
"learning_rate": 1.5732265446224257e-05,
"loss": 1.2334,
"step": 550
},
{
"epoch": 0.06868131868131869,
"grad_norm": 0.341796875,
"learning_rate": 1.716247139588101e-05,
"loss": 1.1329,
"step": 600
},
{
"epoch": 0.06868131868131869,
"eval_loss": 1.6269210577011108,
"eval_runtime": 124.0849,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 600
},
{
"epoch": 0.0744047619047619,
"grad_norm": 0.275390625,
"learning_rate": 1.859267734553776e-05,
"loss": 1.039,
"step": 650
},
{
"epoch": 0.08012820512820513,
"grad_norm": 0.33984375,
"learning_rate": 2.0022883295194507e-05,
"loss": 0.9505,
"step": 700
},
{
"epoch": 0.08012820512820513,
"eval_loss": 1.738716959953308,
"eval_runtime": 124.1002,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 700
},
{
"epoch": 0.08585164835164835,
"grad_norm": 0.349609375,
"learning_rate": 2.145308924485126e-05,
"loss": 0.8827,
"step": 750
},
{
"epoch": 0.09157509157509157,
"grad_norm": 0.4296875,
"learning_rate": 2.2883295194508012e-05,
"loss": 0.8334,
"step": 800
},
{
"epoch": 0.09157509157509157,
"eval_loss": 1.7442790269851685,
"eval_runtime": 124.087,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 800
},
{
"epoch": 0.0972985347985348,
"grad_norm": 0.431640625,
"learning_rate": 2.431350114416476e-05,
"loss": 0.8127,
"step": 850
},
{
"epoch": 0.10302197802197802,
"grad_norm": 0.400390625,
"learning_rate": 2.574370709382151e-05,
"loss": 0.7692,
"step": 900
},
{
"epoch": 0.10302197802197802,
"eval_loss": 1.7634161710739136,
"eval_runtime": 124.1852,
"eval_samples_per_second": 2.255,
"eval_steps_per_second": 0.564,
"step": 900
},
{
"epoch": 0.10874542124542125,
"grad_norm": 0.455078125,
"learning_rate": 2.7173913043478262e-05,
"loss": 0.74,
"step": 950
},
{
"epoch": 0.11446886446886446,
"grad_norm": 0.48828125,
"learning_rate": 2.8604118993135016e-05,
"loss": 0.6983,
"step": 1000
},
{
"epoch": 0.11446886446886446,
"eval_loss": 1.7546391487121582,
"eval_runtime": 124.0637,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 1000
},
{
"epoch": 0.1201923076923077,
"grad_norm": 0.46484375,
"learning_rate": 3.0034324942791764e-05,
"loss": 0.7277,
"step": 1050
},
{
"epoch": 0.1259157509157509,
"grad_norm": 0.5703125,
"learning_rate": 3.1464530892448515e-05,
"loss": 0.6859,
"step": 1100
},
{
"epoch": 0.1259157509157509,
"eval_loss": 1.7593410015106201,
"eval_runtime": 124.125,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1100
},
{
"epoch": 0.13163919413919414,
"grad_norm": 0.4921875,
"learning_rate": 3.289473684210527e-05,
"loss": 0.6844,
"step": 1150
},
{
"epoch": 0.13736263736263737,
"grad_norm": 0.5546875,
"learning_rate": 3.432494279176202e-05,
"loss": 0.6671,
"step": 1200
},
{
"epoch": 0.13736263736263737,
"eval_loss": 1.7647184133529663,
"eval_runtime": 124.0962,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1200
},
{
"epoch": 0.14308608058608058,
"grad_norm": 0.4921875,
"learning_rate": 3.5755148741418764e-05,
"loss": 0.6553,
"step": 1250
},
{
"epoch": 0.1488095238095238,
"grad_norm": 0.5546875,
"learning_rate": 3.718535469107552e-05,
"loss": 0.6285,
"step": 1300
},
{
"epoch": 0.1488095238095238,
"eval_loss": 1.7950905561447144,
"eval_runtime": 124.0984,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1300
},
{
"epoch": 0.15453296703296704,
"grad_norm": 0.5234375,
"learning_rate": 3.8615560640732266e-05,
"loss": 0.6192,
"step": 1350
},
{
"epoch": 0.16025641025641027,
"grad_norm": 0.5703125,
"learning_rate": 4.0045766590389014e-05,
"loss": 0.6121,
"step": 1400
},
{
"epoch": 0.16025641025641027,
"eval_loss": 1.7816270589828491,
"eval_runtime": 124.0951,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1400
},
{
"epoch": 0.16597985347985347,
"grad_norm": 0.609375,
"learning_rate": 4.147597254004577e-05,
"loss": 0.6039,
"step": 1450
},
{
"epoch": 0.1717032967032967,
"grad_norm": 0.5625,
"learning_rate": 4.290617848970252e-05,
"loss": 0.5923,
"step": 1500
},
{
"epoch": 0.1717032967032967,
"eval_loss": 1.8131866455078125,
"eval_runtime": 124.0957,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1500
},
{
"epoch": 0.17742673992673993,
"grad_norm": 0.61328125,
"learning_rate": 4.433638443935927e-05,
"loss": 0.5973,
"step": 1550
},
{
"epoch": 0.18315018315018314,
"grad_norm": 0.62890625,
"learning_rate": 4.5766590389016025e-05,
"loss": 0.5908,
"step": 1600
},
{
"epoch": 0.18315018315018314,
"eval_loss": 1.7663753032684326,
"eval_runtime": 124.103,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1600
},
{
"epoch": 0.18887362637362637,
"grad_norm": 0.63671875,
"learning_rate": 4.719679633867277e-05,
"loss": 0.5869,
"step": 1650
},
{
"epoch": 0.1945970695970696,
"grad_norm": 0.6015625,
"learning_rate": 4.862700228832952e-05,
"loss": 0.5662,
"step": 1700
},
{
"epoch": 0.1945970695970696,
"eval_loss": 1.830661416053772,
"eval_runtime": 124.107,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1700
},
{
"epoch": 0.20032051282051283,
"grad_norm": 0.65234375,
"learning_rate": 4.9999989894357565e-05,
"loss": 0.5641,
"step": 1750
},
{
"epoch": 0.20604395604395603,
"grad_norm": 0.6171875,
"learning_rate": 4.999316889636665e-05,
"loss": 0.5637,
"step": 1800
},
{
"epoch": 0.20604395604395603,
"eval_loss": 1.7863534688949585,
"eval_runtime": 124.0697,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 1800
},
{
"epoch": 0.21176739926739926,
"grad_norm": 0.63671875,
"learning_rate": 4.9973719827852006e-05,
"loss": 0.5566,
"step": 1850
},
{
"epoch": 0.2174908424908425,
"grad_norm": 0.578125,
"learning_rate": 4.994165251566713e-05,
"loss": 0.5475,
"step": 1900
},
{
"epoch": 0.2174908424908425,
"eval_loss": 1.7987805604934692,
"eval_runtime": 124.1161,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 1900
},
{
"epoch": 0.22321428571428573,
"grad_norm": 0.6328125,
"learning_rate": 4.98969831621704e-05,
"loss": 0.5544,
"step": 1950
},
{
"epoch": 0.22893772893772893,
"grad_norm": 0.6953125,
"learning_rate": 4.98397343370387e-05,
"loss": 0.5421,
"step": 2000
},
{
"epoch": 0.22893772893772893,
"eval_loss": 1.7876337766647339,
"eval_runtime": 124.1072,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2000
},
{
"epoch": 0.23466117216117216,
"grad_norm": 0.58984375,
"learning_rate": 4.976993496586383e-05,
"loss": 0.5386,
"step": 2050
},
{
"epoch": 0.2403846153846154,
"grad_norm": 0.70703125,
"learning_rate": 4.968762031553753e-05,
"loss": 0.529,
"step": 2100
},
{
"epoch": 0.2403846153846154,
"eval_loss": 1.7660707235336304,
"eval_runtime": 124.1155,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2100
},
{
"epoch": 0.24610805860805862,
"grad_norm": 0.625,
"learning_rate": 4.959283197643249e-05,
"loss": 0.5278,
"step": 2150
},
{
"epoch": 0.2518315018315018,
"grad_norm": 0.64453125,
"learning_rate": 4.948561784138841e-05,
"loss": 0.5202,
"step": 2200
},
{
"epoch": 0.2518315018315018,
"eval_loss": 1.770936131477356,
"eval_runtime": 124.1257,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2200
},
{
"epoch": 0.25755494505494503,
"grad_norm": 0.59375,
"learning_rate": 4.936603208151355e-05,
"loss": 0.5071,
"step": 2250
},
{
"epoch": 0.2632783882783883,
"grad_norm": 0.6796875,
"learning_rate": 4.9234135118814246e-05,
"loss": 0.5287,
"step": 2300
},
{
"epoch": 0.2632783882783883,
"eval_loss": 1.7680959701538086,
"eval_runtime": 124.1059,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2300
},
{
"epoch": 0.2690018315018315,
"grad_norm": 0.6171875,
"learning_rate": 4.908999359566602e-05,
"loss": 0.5116,
"step": 2350
},
{
"epoch": 0.27472527472527475,
"grad_norm": 0.61328125,
"learning_rate": 4.8933680341141775e-05,
"loss": 0.514,
"step": 2400
},
{
"epoch": 0.27472527472527475,
"eval_loss": 1.7765259742736816,
"eval_runtime": 124.1161,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2400
},
{
"epoch": 0.28044871794871795,
"grad_norm": 0.6328125,
"learning_rate": 4.8765274334214116e-05,
"loss": 0.5099,
"step": 2450
},
{
"epoch": 0.28617216117216115,
"grad_norm": 0.6875,
"learning_rate": 4.8584860663850404e-05,
"loss": 0.5026,
"step": 2500
},
{
"epoch": 0.28617216117216115,
"eval_loss": 1.7931022644042969,
"eval_runtime": 124.0904,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2500
},
{
"epoch": 0.2918956043956044,
"grad_norm": 0.609375,
"learning_rate": 4.839253048602059e-05,
"loss": 0.5044,
"step": 2550
},
{
"epoch": 0.2976190476190476,
"grad_norm": 0.6484375,
"learning_rate": 4.818838097763967e-05,
"loss": 0.5038,
"step": 2600
},
{
"epoch": 0.2976190476190476,
"eval_loss": 1.7807551622390747,
"eval_runtime": 124.0961,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2600
},
{
"epoch": 0.3033424908424908,
"grad_norm": 0.65625,
"learning_rate": 4.7972515287468e-05,
"loss": 0.4828,
"step": 2650
},
{
"epoch": 0.3090659340659341,
"grad_norm": 0.64453125,
"learning_rate": 4.774504248399427e-05,
"loss": 0.5052,
"step": 2700
},
{
"epoch": 0.3090659340659341,
"eval_loss": 1.7688714265823364,
"eval_runtime": 124.0947,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2700
},
{
"epoch": 0.3147893772893773,
"grad_norm": 0.62890625,
"learning_rate": 4.750607750032748e-05,
"loss": 0.4938,
"step": 2750
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.74609375,
"learning_rate": 4.725574107612567e-05,
"loss": 0.4918,
"step": 2800
},
{
"epoch": 0.32051282051282054,
"eval_loss": 1.7862409353256226,
"eval_runtime": 124.0923,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 2800
},
{
"epoch": 0.32623626373626374,
"grad_norm": 0.6796875,
"learning_rate": 4.699415969659098e-05,
"loss": 0.4847,
"step": 2850
},
{
"epoch": 0.33195970695970695,
"grad_norm": 0.69140625,
"learning_rate": 4.672146552856155e-05,
"loss": 0.4817,
"step": 2900
},
{
"epoch": 0.33195970695970695,
"eval_loss": 1.7916373014450073,
"eval_runtime": 124.0857,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 2900
},
{
"epoch": 0.3376831501831502,
"grad_norm": 0.67578125,
"learning_rate": 4.6437796353732824e-05,
"loss": 0.4908,
"step": 2950
},
{
"epoch": 0.3434065934065934,
"grad_norm": 0.68359375,
"learning_rate": 4.614329549904187e-05,
"loss": 0.4806,
"step": 3000
},
{
"epoch": 0.3434065934065934,
"eval_loss": 1.7795602083206177,
"eval_runtime": 124.1973,
"eval_samples_per_second": 2.254,
"eval_steps_per_second": 0.564,
"step": 3000
},
{
"epoch": 0.3491300366300366,
"grad_norm": 0.65234375,
"learning_rate": 4.583811176424984e-05,
"loss": 0.4831,
"step": 3050
},
{
"epoch": 0.35485347985347987,
"grad_norm": 0.6640625,
"learning_rate": 4.5522399346759304e-05,
"loss": 0.4849,
"step": 3100
},
{
"epoch": 0.35485347985347987,
"eval_loss": 1.76537024974823,
"eval_runtime": 124.1057,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 3100
},
{
"epoch": 0.3605769230769231,
"grad_norm": 0.6171875,
"learning_rate": 4.51963177637043e-05,
"loss": 0.4787,
"step": 3150
},
{
"epoch": 0.3663003663003663,
"grad_norm": 0.640625,
"learning_rate": 4.4860031771352626e-05,
"loss": 0.4784,
"step": 3200
},
{
"epoch": 0.3663003663003663,
"eval_loss": 1.7576422691345215,
"eval_runtime": 124.109,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 3200
},
{
"epoch": 0.37202380952380953,
"grad_norm": 0.70703125,
"learning_rate": 4.451371128186087e-05,
"loss": 0.485,
"step": 3250
},
{
"epoch": 0.37774725274725274,
"grad_norm": 0.64453125,
"learning_rate": 4.4157531277424503e-05,
"loss": 0.4712,
"step": 3300
},
{
"epoch": 0.37774725274725274,
"eval_loss": 1.7745938301086426,
"eval_runtime": 124.0704,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 3300
},
{
"epoch": 0.383470695970696,
"grad_norm": 0.6953125,
"learning_rate": 4.379167172186619e-05,
"loss": 0.4899,
"step": 3350
},
{
"epoch": 0.3891941391941392,
"grad_norm": 0.671875,
"learning_rate": 4.3416317469707125e-05,
"loss": 0.4715,
"step": 3400
},
{
"epoch": 0.3891941391941392,
"eval_loss": 1.7567591667175293,
"eval_runtime": 124.0669,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 3400
},
{
"epoch": 0.3949175824175824,
"grad_norm": 0.65625,
"learning_rate": 4.3031658172767266e-05,
"loss": 0.472,
"step": 3450
},
{
"epoch": 0.40064102564102566,
"grad_norm": 0.65625,
"learning_rate": 4.263788818434168e-05,
"loss": 0.4608,
"step": 3500
},
{
"epoch": 0.40064102564102566,
"eval_loss": 1.7424110174179077,
"eval_runtime": 124.063,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 3500
},
{
"epoch": 0.40636446886446886,
"grad_norm": 0.63671875,
"learning_rate": 4.223520646100145e-05,
"loss": 0.4586,
"step": 3550
},
{
"epoch": 0.41208791208791207,
"grad_norm": 0.68359375,
"learning_rate": 4.182381646206868e-05,
"loss": 0.4629,
"step": 3600
},
{
"epoch": 0.41208791208791207,
"eval_loss": 1.7561120986938477,
"eval_runtime": 124.1336,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 3600
},
{
"epoch": 0.4178113553113553,
"grad_norm": 0.6875,
"learning_rate": 4.140392604681646e-05,
"loss": 0.4655,
"step": 3650
},
{
"epoch": 0.42353479853479853,
"grad_norm": 0.7109375,
"learning_rate": 4.097574736944575e-05,
"loss": 0.4591,
"step": 3700
},
{
"epoch": 0.42353479853479853,
"eval_loss": 1.7497508525848389,
"eval_runtime": 124.0856,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 3700
},
{
"epoch": 0.42925824175824173,
"grad_norm": 0.7578125,
"learning_rate": 4.053949677189208e-05,
"loss": 0.4658,
"step": 3750
},
{
"epoch": 0.434981684981685,
"grad_norm": 0.671875,
"learning_rate": 4.0095394674516506e-05,
"loss": 0.4652,
"step": 3800
},
{
"epoch": 0.434981684981685,
"eval_loss": 1.736577033996582,
"eval_runtime": 124.0959,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 3800
},
{
"epoch": 0.4407051282051282,
"grad_norm": 0.7109375,
"learning_rate": 3.96436654647358e-05,
"loss": 0.4588,
"step": 3850
},
{
"epoch": 0.44642857142857145,
"grad_norm": 0.66796875,
"learning_rate": 3.918453738364824e-05,
"loss": 0.461,
"step": 3900
},
{
"epoch": 0.44642857142857145,
"eval_loss": 1.7393635511398315,
"eval_runtime": 124.0934,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 3900
},
{
"epoch": 0.45215201465201466,
"grad_norm": 0.6875,
"learning_rate": 3.871824241071236e-05,
"loss": 0.4493,
"step": 3950
},
{
"epoch": 0.45787545787545786,
"grad_norm": 0.6796875,
"learning_rate": 3.824501614653676e-05,
"loss": 0.4469,
"step": 4000
},
{
"epoch": 0.45787545787545786,
"eval_loss": 1.7396734952926636,
"eval_runtime": 124.0876,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4000
},
{
"epoch": 0.4635989010989011,
"grad_norm": 0.73828125,
"learning_rate": 3.7765097693840385e-05,
"loss": 0.4496,
"step": 4050
},
{
"epoch": 0.4693223443223443,
"grad_norm": 0.74609375,
"learning_rate": 3.727872953664322e-05,
"loss": 0.4521,
"step": 4100
},
{
"epoch": 0.4693223443223443,
"eval_loss": 1.7555357217788696,
"eval_runtime": 124.0832,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 4100
},
{
"epoch": 0.4750457875457875,
"grad_norm": 0.671875,
"learning_rate": 3.678615741774861e-05,
"loss": 0.4598,
"step": 4150
},
{
"epoch": 0.4807692307692308,
"grad_norm": 0.77734375,
"learning_rate": 3.628763021457909e-05,
"loss": 0.4498,
"step": 4200
},
{
"epoch": 0.4807692307692308,
"eval_loss": 1.7651796340942383,
"eval_runtime": 124.0848,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 4200
},
{
"epoch": 0.486492673992674,
"grad_norm": 0.68359375,
"learning_rate": 3.57833998134283e-05,
"loss": 0.4471,
"step": 4250
},
{
"epoch": 0.49221611721611724,
"grad_norm": 0.6953125,
"learning_rate": 3.5273720982192716e-05,
"loss": 0.4541,
"step": 4300
},
{
"epoch": 0.49221611721611724,
"eval_loss": 1.758300542831421,
"eval_runtime": 124.0895,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4300
},
{
"epoch": 0.49793956043956045,
"grad_norm": 0.76953125,
"learning_rate": 3.475885124164737e-05,
"loss": 0.4595,
"step": 4350
},
{
"epoch": 0.5036630036630036,
"grad_norm": 0.65625,
"learning_rate": 3.4239050735330754e-05,
"loss": 0.4594,
"step": 4400
},
{
"epoch": 0.5036630036630036,
"eval_loss": 1.7604867219924927,
"eval_runtime": 124.0898,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4400
},
{
"epoch": 0.5093864468864469,
"grad_norm": 0.67578125,
"learning_rate": 3.371458209810437e-05,
"loss": 0.4584,
"step": 4450
},
{
"epoch": 0.5151098901098901,
"grad_norm": 0.90625,
"learning_rate": 3.3185710323453684e-05,
"loss": 0.4514,
"step": 4500
},
{
"epoch": 0.5151098901098901,
"eval_loss": 1.7685655355453491,
"eval_runtime": 124.0911,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4500
},
{
"epoch": 0.5208333333333334,
"grad_norm": 0.67578125,
"learning_rate": 3.265270262959723e-05,
"loss": 0.4523,
"step": 4550
},
{
"epoch": 0.5265567765567766,
"grad_norm": 0.65625,
"learning_rate": 3.211582832447175e-05,
"loss": 0.4395,
"step": 4600
},
{
"epoch": 0.5265567765567766,
"eval_loss": 1.7713632583618164,
"eval_runtime": 124.0855,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 4600
},
{
"epoch": 0.5322802197802198,
"grad_norm": 0.78125,
"learning_rate": 3.1575358669661356e-05,
"loss": 0.4464,
"step": 4650
},
{
"epoch": 0.538003663003663,
"grad_norm": 0.72265625,
"learning_rate": 3.103156674333976e-05,
"loss": 0.4384,
"step": 4700
},
{
"epoch": 0.538003663003663,
"eval_loss": 1.788908839225769,
"eval_runtime": 124.0945,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4700
},
{
"epoch": 0.5437271062271062,
"grad_norm": 0.69921875,
"learning_rate": 3.0484727302294475e-05,
"loss": 0.4446,
"step": 4750
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.6875,
"learning_rate": 2.9935116643102983e-05,
"loss": 0.4392,
"step": 4800
},
{
"epoch": 0.5494505494505495,
"eval_loss": 1.7709113359451294,
"eval_runtime": 124.088,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4800
},
{
"epoch": 0.5551739926739927,
"grad_norm": 0.7109375,
"learning_rate": 2.9383012462530895e-05,
"loss": 0.4406,
"step": 4850
},
{
"epoch": 0.5608974358974359,
"grad_norm": 0.6953125,
"learning_rate": 2.8828693717222625e-05,
"loss": 0.4495,
"step": 4900
},
{
"epoch": 0.5608974358974359,
"eval_loss": 1.7554136514663696,
"eval_runtime": 124.0986,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 4900
},
{
"epoch": 0.5666208791208791,
"grad_norm": 0.7109375,
"learning_rate": 2.8272440482755535e-05,
"loss": 0.4433,
"step": 4950
},
{
"epoch": 0.5723443223443223,
"grad_norm": 0.7890625,
"learning_rate": 2.771453381212865e-05,
"loss": 0.4375,
"step": 5000
},
{
"epoch": 0.5723443223443223,
"eval_loss": 1.7531843185424805,
"eval_runtime": 124.0918,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 5000
},
{
"epoch": 0.5780677655677655,
"grad_norm": 0.7578125,
"learning_rate": 2.715525559375764e-05,
"loss": 0.4405,
"step": 5050
},
{
"epoch": 0.5837912087912088,
"grad_norm": 0.72265625,
"learning_rate": 2.6594888409047557e-05,
"loss": 0.4441,
"step": 5100
},
{
"epoch": 0.5837912087912088,
"eval_loss": 1.7770174741744995,
"eval_runtime": 124.0878,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 5100
},
{
"epoch": 0.589514652014652,
"grad_norm": 0.77734375,
"learning_rate": 2.6033715389615588e-05,
"loss": 0.4403,
"step": 5150
},
{
"epoch": 0.5952380952380952,
"grad_norm": 0.7890625,
"learning_rate": 2.5472020074235635e-05,
"loss": 0.4458,
"step": 5200
},
{
"epoch": 0.5952380952380952,
"eval_loss": 1.7528095245361328,
"eval_runtime": 124.0953,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 5200
},
{
"epoch": 0.6009615384615384,
"grad_norm": 0.62890625,
"learning_rate": 2.4910086265577364e-05,
"loss": 0.4368,
"step": 5250
},
{
"epoch": 0.6066849816849816,
"grad_norm": 0.734375,
"learning_rate": 2.4348197886811702e-05,
"loss": 0.4343,
"step": 5300
},
{
"epoch": 0.6066849816849816,
"eval_loss": 1.7645584344863892,
"eval_runtime": 124.0831,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 5300
},
{
"epoch": 0.612408424908425,
"grad_norm": 0.73828125,
"learning_rate": 2.3786638838155694e-05,
"loss": 0.4444,
"step": 5350
},
{
"epoch": 0.6181318681318682,
"grad_norm": 0.7578125,
"learning_rate": 2.32256928534287e-05,
"loss": 0.433,
"step": 5400
},
{
"epoch": 0.6181318681318682,
"eval_loss": 1.7689203023910522,
"eval_runtime": 124.0857,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 5400
},
{
"epoch": 0.6238553113553114,
"grad_norm": 0.70703125,
"learning_rate": 2.2665643356692923e-05,
"loss": 0.4344,
"step": 5450
},
{
"epoch": 0.6295787545787546,
"grad_norm": 0.65625,
"learning_rate": 2.210677331905019e-05,
"loss": 0.4371,
"step": 5500
},
{
"epoch": 0.6295787545787546,
"eval_loss": 1.7737501859664917,
"eval_runtime": 124.0948,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 5500
},
{
"epoch": 0.6353021978021978,
"grad_norm": 0.83984375,
"learning_rate": 2.1549365115667853e-05,
"loss": 0.4377,
"step": 5550
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.76171875,
"learning_rate": 2.099370038310553e-05,
"loss": 0.4376,
"step": 5600
},
{
"epoch": 0.6410256410256411,
"eval_loss": 1.7633239030838013,
"eval_runtime": 124.0669,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 5600
},
{
"epoch": 0.6467490842490843,
"grad_norm": 0.71875,
"learning_rate": 2.044005987701531e-05,
"loss": 0.4388,
"step": 5650
},
{
"epoch": 0.6524725274725275,
"grad_norm": 0.7734375,
"learning_rate": 1.9888723330286763e-05,
"loss": 0.4366,
"step": 5700
},
{
"epoch": 0.6524725274725275,
"eval_loss": 1.7809503078460693,
"eval_runtime": 124.1026,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 5700
},
{
"epoch": 0.6581959706959707,
"grad_norm": 0.78515625,
"learning_rate": 1.933996931170898e-05,
"loss": 0.4333,
"step": 5750
},
{
"epoch": 0.6639194139194139,
"grad_norm": 0.7421875,
"learning_rate": 1.879407508522056e-05,
"loss": 0.43,
"step": 5800
},
{
"epoch": 0.6639194139194139,
"eval_loss": 1.7684820890426636,
"eval_runtime": 124.0911,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 5800
},
{
"epoch": 0.6696428571428571,
"grad_norm": 0.82421875,
"learning_rate": 1.8251316469819075e-05,
"loss": 0.4318,
"step": 5850
},
{
"epoch": 0.6753663003663004,
"grad_norm": 0.765625,
"learning_rate": 1.7711967700200435e-05,
"loss": 0.4345,
"step": 5900
},
{
"epoch": 0.6753663003663004,
"eval_loss": 1.7761142253875732,
"eval_runtime": 124.0752,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 5900
},
{
"epoch": 0.6810897435897436,
"grad_norm": 0.78125,
"learning_rate": 1.7176301288198894e-05,
"loss": 0.4362,
"step": 5950
},
{
"epoch": 0.6868131868131868,
"grad_norm": 0.8671875,
"learning_rate": 1.6644587885097457e-05,
"loss": 0.4379,
"step": 6000
},
{
"epoch": 0.6868131868131868,
"eval_loss": 1.7782317399978638,
"eval_runtime": 124.0838,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 6000
},
{
"epoch": 0.69253663003663,
"grad_norm": 0.80078125,
"learning_rate": 1.611709614487835e-05,
"loss": 0.4379,
"step": 6050
},
{
"epoch": 0.6982600732600732,
"grad_norm": 0.75,
"learning_rate": 1.5594092588482718e-05,
"loss": 0.4294,
"step": 6100
},
{
"epoch": 0.6982600732600732,
"eval_loss": 1.7736785411834717,
"eval_runtime": 124.0841,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 6100
},
{
"epoch": 0.7039835164835165,
"grad_norm": 0.75,
"learning_rate": 1.5075841469147974e-05,
"loss": 0.4356,
"step": 6150
},
{
"epoch": 0.7097069597069597,
"grad_norm": 0.84375,
"learning_rate": 1.456260463889102e-05,
"loss": 0.4441,
"step": 6200
},
{
"epoch": 0.7097069597069597,
"eval_loss": 1.7646363973617554,
"eval_runtime": 124.1036,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6200
},
{
"epoch": 0.7154304029304029,
"grad_norm": 0.7109375,
"learning_rate": 1.4054641416204609e-05,
"loss": 0.4347,
"step": 6250
},
{
"epoch": 0.7211538461538461,
"grad_norm": 0.828125,
"learning_rate": 1.3552208455033932e-05,
"loss": 0.4396,
"step": 6300
},
{
"epoch": 0.7211538461538461,
"eval_loss": 1.7779277563095093,
"eval_runtime": 124.0892,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6300
},
{
"epoch": 0.7268772893772893,
"grad_norm": 0.75,
"learning_rate": 1.3055559615099353e-05,
"loss": 0.4399,
"step": 6350
},
{
"epoch": 0.7326007326007326,
"grad_norm": 0.76953125,
"learning_rate": 1.256494583363104e-05,
"loss": 0.4307,
"step": 6400
},
{
"epoch": 0.7326007326007326,
"eval_loss": 1.7766470909118652,
"eval_runtime": 124.1063,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6400
},
{
"epoch": 0.7383241758241759,
"grad_norm": 0.79296875,
"learning_rate": 1.2080614998580212e-05,
"loss": 0.4351,
"step": 6450
},
{
"epoch": 0.7440476190476191,
"grad_norm": 0.7265625,
"learning_rate": 1.1602811823371069e-05,
"loss": 0.4331,
"step": 6500
},
{
"epoch": 0.7440476190476191,
"eval_loss": 1.7733304500579834,
"eval_runtime": 124.0732,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 6500
},
{
"epoch": 0.7497710622710623,
"grad_norm": 0.734375,
"learning_rate": 1.1131777723256629e-05,
"loss": 0.4373,
"step": 6550
},
{
"epoch": 0.7554945054945055,
"grad_norm": 0.6484375,
"learning_rate": 1.0667750693341072e-05,
"loss": 0.4326,
"step": 6600
},
{
"epoch": 0.7554945054945055,
"eval_loss": 1.7796032428741455,
"eval_runtime": 124.0885,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6600
},
{
"epoch": 0.7612179487179487,
"grad_norm": 0.71875,
"learning_rate": 1.0210965188330119e-05,
"loss": 0.4314,
"step": 6650
},
{
"epoch": 0.766941391941392,
"grad_norm": 0.76171875,
"learning_rate": 9.761652004070241e-06,
"loss": 0.4286,
"step": 6700
},
{
"epoch": 0.766941391941392,
"eval_loss": 1.7802751064300537,
"eval_runtime": 124.0901,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6700
},
{
"epoch": 0.7726648351648352,
"grad_norm": 0.84375,
"learning_rate": 9.320038160936568e-06,
"loss": 0.44,
"step": 6750
},
{
"epoch": 0.7783882783882784,
"grad_norm": 0.7890625,
"learning_rate": 8.886346789128305e-06,
"loss": 0.4294,
"step": 6800
},
{
"epoch": 0.7783882783882784,
"eval_loss": 1.7787123918533325,
"eval_runtime": 124.0869,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6800
},
{
"epoch": 0.7841117216117216,
"grad_norm": 0.73828125,
"learning_rate": 8.460797015929873e-06,
"loss": 0.4401,
"step": 6850
},
{
"epoch": 0.7898351648351648,
"grad_norm": 0.71484375,
"learning_rate": 8.04360385499437e-06,
"loss": 0.4294,
"step": 6900
},
{
"epoch": 0.7898351648351648,
"eval_loss": 1.779537558555603,
"eval_runtime": 124.0971,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 6900
},
{
"epoch": 0.795558608058608,
"grad_norm": 0.80859375,
"learning_rate": 7.63497809770566e-06,
"loss": 0.4327,
"step": 6950
},
{
"epoch": 0.8012820512820513,
"grad_norm": 0.76953125,
"learning_rate": 7.235126206673651e-06,
"loss": 0.4364,
"step": 7000
},
{
"epoch": 0.8012820512820513,
"eval_loss": 1.7765251398086548,
"eval_runtime": 124.0761,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 7000
},
{
"epoch": 0.8070054945054945,
"grad_norm": 0.734375,
"learning_rate": 6.844250211416903e-06,
"loss": 0.4392,
"step": 7050
},
{
"epoch": 0.8127289377289377,
"grad_norm": 0.73828125,
"learning_rate": 6.462547606284947e-06,
"loss": 0.4414,
"step": 7100
},
{
"epoch": 0.8127289377289377,
"eval_loss": 1.7783187627792358,
"eval_runtime": 124.1204,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7100
},
{
"epoch": 0.8184523809523809,
"grad_norm": 0.7421875,
"learning_rate": 6.09021125067217e-06,
"loss": 0.4354,
"step": 7150
},
{
"epoch": 0.8241758241758241,
"grad_norm": 0.84765625,
"learning_rate": 5.7274292715734315e-06,
"loss": 0.4336,
"step": 7200
},
{
"epoch": 0.8241758241758241,
"eval_loss": 1.7745906114578247,
"eval_runtime": 124.1664,
"eval_samples_per_second": 2.255,
"eval_steps_per_second": 0.564,
"step": 7200
},
{
"epoch": 0.8298992673992674,
"grad_norm": 0.7421875,
"learning_rate": 5.374384968530952e-06,
"loss": 0.4296,
"step": 7250
},
{
"epoch": 0.8356227106227107,
"grad_norm": 0.67578125,
"learning_rate": 5.031256721020139e-06,
"loss": 0.4324,
"step": 7300
},
{
"epoch": 0.8356227106227107,
"eval_loss": 1.772754192352295,
"eval_runtime": 124.0892,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7300
},
{
"epoch": 0.8413461538461539,
"grad_norm": 0.78125,
"learning_rate": 4.698217898321483e-06,
"loss": 0.4304,
"step": 7350
},
{
"epoch": 0.8470695970695971,
"grad_norm": 0.71875,
"learning_rate": 4.375436771923766e-06,
"loss": 0.4414,
"step": 7400
},
{
"epoch": 0.8470695970695971,
"eval_loss": 1.7765103578567505,
"eval_runtime": 124.0978,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7400
},
{
"epoch": 0.8527930402930403,
"grad_norm": 0.77734375,
"learning_rate": 4.063076430503138e-06,
"loss": 0.4258,
"step": 7450
},
{
"epoch": 0.8585164835164835,
"grad_norm": 0.76953125,
"learning_rate": 3.761294697520751e-06,
"loss": 0.4288,
"step": 7500
},
{
"epoch": 0.8585164835164835,
"eval_loss": 1.7792127132415771,
"eval_runtime": 124.1222,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7500
},
{
"epoch": 0.8642399267399268,
"grad_norm": 0.72265625,
"learning_rate": 3.4702440514807366e-06,
"loss": 0.4365,
"step": 7550
},
{
"epoch": 0.86996336996337,
"grad_norm": 0.79296875,
"learning_rate": 3.1900715488887873e-06,
"loss": 0.4359,
"step": 7600
},
{
"epoch": 0.86996336996337,
"eval_loss": 1.777583122253418,
"eval_runtime": 124.1273,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7600
},
{
"epoch": 0.8756868131868132,
"grad_norm": 0.7109375,
"learning_rate": 2.9209187499502604e-06,
"loss": 0.4388,
"step": 7650
},
{
"epoch": 0.8814102564102564,
"grad_norm": 0.6953125,
"learning_rate": 2.662921647045355e-06,
"loss": 0.4242,
"step": 7700
},
{
"epoch": 0.8814102564102564,
"eval_loss": 1.7761567831039429,
"eval_runtime": 124.1269,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7700
},
{
"epoch": 0.8871336996336996,
"grad_norm": 0.73828125,
"learning_rate": 2.4162105960174486e-06,
"loss": 0.4352,
"step": 7750
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.77734375,
"learning_rate": 2.180910250309423e-06,
"loss": 0.4413,
"step": 7800
},
{
"epoch": 0.8928571428571429,
"eval_loss": 1.7751096487045288,
"eval_runtime": 124.1103,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7800
},
{
"epoch": 0.8985805860805861,
"grad_norm": 0.734375,
"learning_rate": 1.957139497981131e-06,
"loss": 0.4293,
"step": 7850
},
{
"epoch": 0.9043040293040293,
"grad_norm": 0.8359375,
"learning_rate": 1.74501140163994e-06,
"loss": 0.4402,
"step": 7900
},
{
"epoch": 0.9043040293040293,
"eval_loss": 1.7753708362579346,
"eval_runtime": 124.1127,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 7900
},
{
"epoch": 0.9100274725274725,
"grad_norm": 0.75,
"learning_rate": 1.5446331413145887e-06,
"loss": 0.4436,
"step": 7950
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.703125,
"learning_rate": 1.3561059603013265e-06,
"loss": 0.4452,
"step": 8000
},
{
"epoch": 0.9157509157509157,
"eval_loss": 1.7749762535095215,
"eval_runtime": 124.0977,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8000
},
{
"epoch": 0.9214743589743589,
"grad_norm": 0.6640625,
"learning_rate": 1.1795251140096358e-06,
"loss": 0.4458,
"step": 8050
},
{
"epoch": 0.9271978021978022,
"grad_norm": 0.74609375,
"learning_rate": 1.014979821833395e-06,
"loss": 0.4346,
"step": 8100
},
{
"epoch": 0.9271978021978022,
"eval_loss": 1.775481104850769,
"eval_runtime": 124.1005,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8100
},
{
"epoch": 0.9329212454212454,
"grad_norm": 0.73046875,
"learning_rate": 8.625532220718186e-07,
"loss": 0.4361,
"step": 8150
},
{
"epoch": 0.9386446886446886,
"grad_norm": 0.7578125,
"learning_rate": 7.223223299229198e-07,
"loss": 0.4396,
"step": 8200
},
{
"epoch": 0.9386446886446886,
"eval_loss": 1.7751343250274658,
"eval_runtime": 124.089,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8200
},
{
"epoch": 0.9443681318681318,
"grad_norm": 0.765625,
"learning_rate": 5.943579985707409e-07,
"loss": 0.4337,
"step": 8250
},
{
"epoch": 0.950091575091575,
"grad_norm": 0.78125,
"learning_rate": 4.787248833860114e-07,
"loss": 0.44,
"step": 8300
},
{
"epoch": 0.950091575091575,
"eval_loss": 1.7752093076705933,
"eval_runtime": 124.0951,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8300
},
{
"epoch": 0.9558150183150184,
"grad_norm": 0.7578125,
"learning_rate": 3.7548140925833806e-07,
"loss": 0.4362,
"step": 8350
},
{
"epoch": 0.9615384615384616,
"grad_norm": 0.7578125,
"learning_rate": 2.8467974107636017e-07,
"loss": 0.4333,
"step": 8400
},
{
"epoch": 0.9615384615384616,
"eval_loss": 1.7753241062164307,
"eval_runtime": 124.0863,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8400
},
{
"epoch": 0.9672619047619048,
"grad_norm": 0.83984375,
"learning_rate": 2.063657573708966e-07,
"loss": 0.4338,
"step": 8450
},
{
"epoch": 0.972985347985348,
"grad_norm": 0.84375,
"learning_rate": 1.4057902713431327e-07,
"loss": 0.4348,
"step": 8500
},
{
"epoch": 0.972985347985348,
"eval_loss": 1.7753559350967407,
"eval_runtime": 124.0677,
"eval_samples_per_second": 2.257,
"eval_steps_per_second": 0.564,
"step": 8500
},
{
"epoch": 0.9787087912087912,
"grad_norm": 0.734375,
"learning_rate": 8.735278982785755e-08,
"loss": 0.4306,
"step": 8550
},
{
"epoch": 0.9844322344322345,
"grad_norm": 0.74609375,
"learning_rate": 4.671393858705908e-08,
"loss": 0.4331,
"step": 8600
},
{
"epoch": 0.9844322344322345,
"eval_loss": 1.77518630027771,
"eval_runtime": 124.0873,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8600
},
{
"epoch": 0.9901556776556777,
"grad_norm": 0.78125,
"learning_rate": 1.868300663367406e-08,
"loss": 0.4329,
"step": 8650
},
{
"epoch": 0.9958791208791209,
"grad_norm": 0.76171875,
"learning_rate": 3.2741569010674712e-09,
"loss": 0.4326,
"step": 8700
},
{
"epoch": 0.9958791208791209,
"eval_loss": 1.7753793001174927,
"eval_runtime": 124.103,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.564,
"step": 8700
},
{
"epoch": 1.0,
"step": 8736,
"total_flos": 1.6067287852253184e+18,
"train_loss": 0.5878726873860691,
"train_runtime": 61938.6999,
"train_samples_per_second": 0.564,
"train_steps_per_second": 0.141
}
],
"logging_steps": 50,
"max_steps": 8736,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"total_flos": 1.6067287852253184e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}