|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 8736, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005723443223443223, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 1.4302059496567508e-06, |
|
"loss": 3.2916, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.011446886446886446, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 2.8604118993135015e-06, |
|
"loss": 3.1002, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011446886446886446, |
|
"eval_loss": 3.050471782684326, |
|
"eval_runtime": 124.1248, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.017170329670329672, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.290617848970252e-06, |
|
"loss": 2.6977, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.022893772893772892, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 5.720823798627003e-06, |
|
"loss": 2.1929, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.022893772893772892, |
|
"eval_loss": 2.049286365509033, |
|
"eval_runtime": 124.1321, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.028617216117216116, |
|
"grad_norm": 0.625, |
|
"learning_rate": 7.151029748283754e-06, |
|
"loss": 1.8298, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.034340659340659344, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 8.581235697940504e-06, |
|
"loss": 1.6369, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.034340659340659344, |
|
"eval_loss": 1.6432359218597412, |
|
"eval_runtime": 124.1394, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.04006410256410257, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.0011441647597253e-05, |
|
"loss": 1.531, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.045787545787545784, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 1.1441647597254006e-05, |
|
"loss": 1.4618, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.045787545787545784, |
|
"eval_loss": 1.5580341815948486, |
|
"eval_runtime": 124.1039, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05151098901098901, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.2871853546910755e-05, |
|
"loss": 1.4061, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.05723443223443223, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.4302059496567508e-05, |
|
"loss": 1.317, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05723443223443223, |
|
"eval_loss": 1.5409735441207886, |
|
"eval_runtime": 124.0925, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06295787545787546, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 1.5732265446224257e-05, |
|
"loss": 1.2334, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.06868131868131869, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 1.716247139588101e-05, |
|
"loss": 1.1329, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06868131868131869, |
|
"eval_loss": 1.6269210577011108, |
|
"eval_runtime": 124.0849, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0744047619047619, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 1.859267734553776e-05, |
|
"loss": 1.039, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.08012820512820513, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.0022883295194507e-05, |
|
"loss": 0.9505, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.08012820512820513, |
|
"eval_loss": 1.738716959953308, |
|
"eval_runtime": 124.1002, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.08585164835164835, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 2.145308924485126e-05, |
|
"loss": 0.8827, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.09157509157509157, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 2.2883295194508012e-05, |
|
"loss": 0.8334, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.09157509157509157, |
|
"eval_loss": 1.7442790269851685, |
|
"eval_runtime": 124.087, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.0972985347985348, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 2.431350114416476e-05, |
|
"loss": 0.8127, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.10302197802197802, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 2.574370709382151e-05, |
|
"loss": 0.7692, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.10302197802197802, |
|
"eval_loss": 1.7634161710739136, |
|
"eval_runtime": 124.1852, |
|
"eval_samples_per_second": 2.255, |
|
"eval_steps_per_second": 0.564, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.10874542124542125, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 2.7173913043478262e-05, |
|
"loss": 0.74, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.11446886446886446, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 2.8604118993135016e-05, |
|
"loss": 0.6983, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11446886446886446, |
|
"eval_loss": 1.7546391487121582, |
|
"eval_runtime": 124.0637, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1201923076923077, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 3.0034324942791764e-05, |
|
"loss": 0.7277, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.1259157509157509, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.1464530892448515e-05, |
|
"loss": 0.6859, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1259157509157509, |
|
"eval_loss": 1.7593410015106201, |
|
"eval_runtime": 124.125, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.13163919413919414, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 3.289473684210527e-05, |
|
"loss": 0.6844, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.13736263736263737, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 3.432494279176202e-05, |
|
"loss": 0.6671, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.13736263736263737, |
|
"eval_loss": 1.7647184133529663, |
|
"eval_runtime": 124.0962, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.14308608058608058, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 3.5755148741418764e-05, |
|
"loss": 0.6553, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.1488095238095238, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 3.718535469107552e-05, |
|
"loss": 0.6285, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.1488095238095238, |
|
"eval_loss": 1.7950905561447144, |
|
"eval_runtime": 124.0984, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.15453296703296704, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.8615560640732266e-05, |
|
"loss": 0.6192, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.16025641025641027, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 4.0045766590389014e-05, |
|
"loss": 0.6121, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.16025641025641027, |
|
"eval_loss": 1.7816270589828491, |
|
"eval_runtime": 124.0951, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.16597985347985347, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.147597254004577e-05, |
|
"loss": 0.6039, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.1717032967032967, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 4.290617848970252e-05, |
|
"loss": 0.5923, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1717032967032967, |
|
"eval_loss": 1.8131866455078125, |
|
"eval_runtime": 124.0957, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.17742673992673993, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.433638443935927e-05, |
|
"loss": 0.5973, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.18315018315018314, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.5766590389016025e-05, |
|
"loss": 0.5908, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.18315018315018314, |
|
"eval_loss": 1.7663753032684326, |
|
"eval_runtime": 124.103, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.18887362637362637, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.719679633867277e-05, |
|
"loss": 0.5869, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.1945970695970696, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.862700228832952e-05, |
|
"loss": 0.5662, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.1945970695970696, |
|
"eval_loss": 1.830661416053772, |
|
"eval_runtime": 124.107, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.20032051282051283, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.9999989894357565e-05, |
|
"loss": 0.5641, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.20604395604395603, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.999316889636665e-05, |
|
"loss": 0.5637, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.20604395604395603, |
|
"eval_loss": 1.7863534688949585, |
|
"eval_runtime": 124.0697, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.21176739926739926, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.9973719827852006e-05, |
|
"loss": 0.5566, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.2174908424908425, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 4.994165251566713e-05, |
|
"loss": 0.5475, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.2174908424908425, |
|
"eval_loss": 1.7987805604934692, |
|
"eval_runtime": 124.1161, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.22321428571428573, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.98969831621704e-05, |
|
"loss": 0.5544, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.22893772893772893, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 4.98397343370387e-05, |
|
"loss": 0.5421, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.22893772893772893, |
|
"eval_loss": 1.7876337766647339, |
|
"eval_runtime": 124.1072, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.23466117216117216, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 4.976993496586383e-05, |
|
"loss": 0.5386, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.2403846153846154, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.968762031553753e-05, |
|
"loss": 0.529, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.2403846153846154, |
|
"eval_loss": 1.7660707235336304, |
|
"eval_runtime": 124.1155, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.24610805860805862, |
|
"grad_norm": 0.625, |
|
"learning_rate": 4.959283197643249e-05, |
|
"loss": 0.5278, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.2518315018315018, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.948561784138841e-05, |
|
"loss": 0.5202, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.2518315018315018, |
|
"eval_loss": 1.770936131477356, |
|
"eval_runtime": 124.1257, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.25755494505494503, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 4.936603208151355e-05, |
|
"loss": 0.5071, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.2632783882783883, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.9234135118814246e-05, |
|
"loss": 0.5287, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.2632783882783883, |
|
"eval_loss": 1.7680959701538086, |
|
"eval_runtime": 124.1059, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.2690018315018315, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.908999359566602e-05, |
|
"loss": 0.5116, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.27472527472527475, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.8933680341141775e-05, |
|
"loss": 0.514, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.27472527472527475, |
|
"eval_loss": 1.7765259742736816, |
|
"eval_runtime": 124.1161, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.28044871794871795, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.8765274334214116e-05, |
|
"loss": 0.5099, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.28617216117216115, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.8584860663850404e-05, |
|
"loss": 0.5026, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.28617216117216115, |
|
"eval_loss": 1.7931022644042969, |
|
"eval_runtime": 124.0904, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.2918956043956044, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 4.839253048602059e-05, |
|
"loss": 0.5044, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.818838097763967e-05, |
|
"loss": 0.5038, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"eval_loss": 1.7807551622390747, |
|
"eval_runtime": 124.0961, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.3033424908424908, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.7972515287468e-05, |
|
"loss": 0.4828, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.3090659340659341, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.774504248399427e-05, |
|
"loss": 0.5052, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.3090659340659341, |
|
"eval_loss": 1.7688714265823364, |
|
"eval_runtime": 124.0947, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.3147893772893773, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.750607750032748e-05, |
|
"loss": 0.4938, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.32051282051282054, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.725574107612567e-05, |
|
"loss": 0.4918, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.32051282051282054, |
|
"eval_loss": 1.7862409353256226, |
|
"eval_runtime": 124.0923, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.32623626373626374, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.699415969659098e-05, |
|
"loss": 0.4847, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.33195970695970695, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 4.672146552856155e-05, |
|
"loss": 0.4817, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.33195970695970695, |
|
"eval_loss": 1.7916373014450073, |
|
"eval_runtime": 124.0857, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.3376831501831502, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 4.6437796353732824e-05, |
|
"loss": 0.4908, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.3434065934065934, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.614329549904187e-05, |
|
"loss": 0.4806, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3434065934065934, |
|
"eval_loss": 1.7795602083206177, |
|
"eval_runtime": 124.1973, |
|
"eval_samples_per_second": 2.254, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3491300366300366, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.583811176424984e-05, |
|
"loss": 0.4831, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.35485347985347987, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 4.5522399346759304e-05, |
|
"loss": 0.4849, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.35485347985347987, |
|
"eval_loss": 1.76537024974823, |
|
"eval_runtime": 124.1057, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.3605769230769231, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 4.51963177637043e-05, |
|
"loss": 0.4787, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.3663003663003663, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.4860031771352626e-05, |
|
"loss": 0.4784, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.3663003663003663, |
|
"eval_loss": 1.7576422691345215, |
|
"eval_runtime": 124.109, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.37202380952380953, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 4.451371128186087e-05, |
|
"loss": 0.485, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.37774725274725274, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4.4157531277424503e-05, |
|
"loss": 0.4712, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.37774725274725274, |
|
"eval_loss": 1.7745938301086426, |
|
"eval_runtime": 124.0704, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.383470695970696, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 4.379167172186619e-05, |
|
"loss": 0.4899, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.3891941391941392, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.3416317469707125e-05, |
|
"loss": 0.4715, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3891941391941392, |
|
"eval_loss": 1.7567591667175293, |
|
"eval_runtime": 124.0669, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.3949175824175824, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.3031658172767266e-05, |
|
"loss": 0.472, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.40064102564102566, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.263788818434168e-05, |
|
"loss": 0.4608, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.40064102564102566, |
|
"eval_loss": 1.7424110174179077, |
|
"eval_runtime": 124.063, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.40636446886446886, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 4.223520646100145e-05, |
|
"loss": 0.4586, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.41208791208791207, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 4.182381646206868e-05, |
|
"loss": 0.4629, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.41208791208791207, |
|
"eval_loss": 1.7561120986938477, |
|
"eval_runtime": 124.1336, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.4178113553113553, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.140392604681646e-05, |
|
"loss": 0.4655, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.42353479853479853, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 4.097574736944575e-05, |
|
"loss": 0.4591, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.42353479853479853, |
|
"eval_loss": 1.7497508525848389, |
|
"eval_runtime": 124.0856, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.42925824175824173, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 4.053949677189208e-05, |
|
"loss": 0.4658, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.434981684981685, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 4.0095394674516506e-05, |
|
"loss": 0.4652, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.434981684981685, |
|
"eval_loss": 1.736577033996582, |
|
"eval_runtime": 124.0959, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.4407051282051282, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 3.96436654647358e-05, |
|
"loss": 0.4588, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 3.918453738364824e-05, |
|
"loss": 0.461, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"eval_loss": 1.7393635511398315, |
|
"eval_runtime": 124.0934, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.45215201465201466, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.871824241071236e-05, |
|
"loss": 0.4493, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.45787545787545786, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 3.824501614653676e-05, |
|
"loss": 0.4469, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.45787545787545786, |
|
"eval_loss": 1.7396734952926636, |
|
"eval_runtime": 124.0876, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4635989010989011, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.7765097693840385e-05, |
|
"loss": 0.4496, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.4693223443223443, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.727872953664322e-05, |
|
"loss": 0.4521, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.4693223443223443, |
|
"eval_loss": 1.7555357217788696, |
|
"eval_runtime": 124.0832, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.4750457875457875, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 3.678615741774861e-05, |
|
"loss": 0.4598, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.4807692307692308, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 3.628763021457909e-05, |
|
"loss": 0.4498, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.4807692307692308, |
|
"eval_loss": 1.7651796340942383, |
|
"eval_runtime": 124.0848, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.486492673992674, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.57833998134283e-05, |
|
"loss": 0.4471, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.49221611721611724, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 3.5273720982192716e-05, |
|
"loss": 0.4541, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.49221611721611724, |
|
"eval_loss": 1.758300542831421, |
|
"eval_runtime": 124.0895, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.49793956043956045, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.475885124164737e-05, |
|
"loss": 0.4595, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.5036630036630036, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 3.4239050735330754e-05, |
|
"loss": 0.4594, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5036630036630036, |
|
"eval_loss": 1.7604867219924927, |
|
"eval_runtime": 124.0898, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5093864468864469, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 3.371458209810437e-05, |
|
"loss": 0.4584, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.5151098901098901, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.3185710323453684e-05, |
|
"loss": 0.4514, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5151098901098901, |
|
"eval_loss": 1.7685655355453491, |
|
"eval_runtime": 124.0911, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 3.265270262959723e-05, |
|
"loss": 0.4523, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.5265567765567766, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 3.211582832447175e-05, |
|
"loss": 0.4395, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5265567765567766, |
|
"eval_loss": 1.7713632583618164, |
|
"eval_runtime": 124.0855, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.5322802197802198, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.1575358669661356e-05, |
|
"loss": 0.4464, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.538003663003663, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.103156674333976e-05, |
|
"loss": 0.4384, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.538003663003663, |
|
"eval_loss": 1.788908839225769, |
|
"eval_runtime": 124.0945, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.5437271062271062, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.0484727302294475e-05, |
|
"loss": 0.4446, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.5494505494505495, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 2.9935116643102983e-05, |
|
"loss": 0.4392, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5494505494505495, |
|
"eval_loss": 1.7709113359451294, |
|
"eval_runtime": 124.088, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.5551739926739927, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.9383012462530895e-05, |
|
"loss": 0.4406, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.5608974358974359, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 2.8828693717222625e-05, |
|
"loss": 0.4495, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.5608974358974359, |
|
"eval_loss": 1.7554136514663696, |
|
"eval_runtime": 124.0986, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.5666208791208791, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.8272440482755535e-05, |
|
"loss": 0.4433, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.5723443223443223, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 2.771453381212865e-05, |
|
"loss": 0.4375, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5723443223443223, |
|
"eval_loss": 1.7531843185424805, |
|
"eval_runtime": 124.0918, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5780677655677655, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.715525559375764e-05, |
|
"loss": 0.4405, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.5837912087912088, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.6594888409047557e-05, |
|
"loss": 0.4441, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5837912087912088, |
|
"eval_loss": 1.7770174741744995, |
|
"eval_runtime": 124.0878, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.589514652014652, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.6033715389615588e-05, |
|
"loss": 0.4403, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 2.5472020074235635e-05, |
|
"loss": 0.4458, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"eval_loss": 1.7528095245361328, |
|
"eval_runtime": 124.0953, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.6009615384615384, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.4910086265577364e-05, |
|
"loss": 0.4368, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.6066849816849816, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.4348197886811702e-05, |
|
"loss": 0.4343, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.6066849816849816, |
|
"eval_loss": 1.7645584344863892, |
|
"eval_runtime": 124.0831, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.612408424908425, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.3786638838155694e-05, |
|
"loss": 0.4444, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.6181318681318682, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.32256928534287e-05, |
|
"loss": 0.433, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6181318681318682, |
|
"eval_loss": 1.7689203023910522, |
|
"eval_runtime": 124.0857, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.6238553113553114, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.2665643356692923e-05, |
|
"loss": 0.4344, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.6295787545787546, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 2.210677331905019e-05, |
|
"loss": 0.4371, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6295787545787546, |
|
"eval_loss": 1.7737501859664917, |
|
"eval_runtime": 124.0948, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6353021978021978, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.1549365115667853e-05, |
|
"loss": 0.4377, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.6410256410256411, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 2.099370038310553e-05, |
|
"loss": 0.4376, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6410256410256411, |
|
"eval_loss": 1.7633239030838013, |
|
"eval_runtime": 124.0669, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.6467490842490843, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.044005987701531e-05, |
|
"loss": 0.4388, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.6524725274725275, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.9888723330286763e-05, |
|
"loss": 0.4366, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.6524725274725275, |
|
"eval_loss": 1.7809503078460693, |
|
"eval_runtime": 124.1026, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.6581959706959707, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.933996931170898e-05, |
|
"loss": 0.4333, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.6639194139194139, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 1.879407508522056e-05, |
|
"loss": 0.43, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.6639194139194139, |
|
"eval_loss": 1.7684820890426636, |
|
"eval_runtime": 124.0911, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.6696428571428571, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 1.8251316469819075e-05, |
|
"loss": 0.4318, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.6753663003663004, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.7711967700200435e-05, |
|
"loss": 0.4345, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.6753663003663004, |
|
"eval_loss": 1.7761142253875732, |
|
"eval_runtime": 124.0752, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.6810897435897436, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.7176301288198894e-05, |
|
"loss": 0.4362, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.6868131868131868, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 1.6644587885097457e-05, |
|
"loss": 0.4379, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6868131868131868, |
|
"eval_loss": 1.7782317399978638, |
|
"eval_runtime": 124.0838, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.69253663003663, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.611709614487835e-05, |
|
"loss": 0.4379, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.6982600732600732, |
|
"grad_norm": 0.75, |
|
"learning_rate": 1.5594092588482718e-05, |
|
"loss": 0.4294, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.6982600732600732, |
|
"eval_loss": 1.7736785411834717, |
|
"eval_runtime": 124.0841, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.7039835164835165, |
|
"grad_norm": 0.75, |
|
"learning_rate": 1.5075841469147974e-05, |
|
"loss": 0.4356, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.7097069597069597, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.456260463889102e-05, |
|
"loss": 0.4441, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.7097069597069597, |
|
"eval_loss": 1.7646363973617554, |
|
"eval_runtime": 124.1036, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.7154304029304029, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.4054641416204609e-05, |
|
"loss": 0.4347, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.7211538461538461, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.3552208455033932e-05, |
|
"loss": 0.4396, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.7211538461538461, |
|
"eval_loss": 1.7779277563095093, |
|
"eval_runtime": 124.0892, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.7268772893772893, |
|
"grad_norm": 0.75, |
|
"learning_rate": 1.3055559615099353e-05, |
|
"loss": 0.4399, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.256494583363104e-05, |
|
"loss": 0.4307, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"eval_loss": 1.7766470909118652, |
|
"eval_runtime": 124.1063, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.7383241758241759, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.2080614998580212e-05, |
|
"loss": 0.4351, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.7440476190476191, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 1.1602811823371069e-05, |
|
"loss": 0.4331, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7440476190476191, |
|
"eval_loss": 1.7733304500579834, |
|
"eval_runtime": 124.0732, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7497710622710623, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 1.1131777723256629e-05, |
|
"loss": 0.4373, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.7554945054945055, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 1.0667750693341072e-05, |
|
"loss": 0.4326, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.7554945054945055, |
|
"eval_loss": 1.7796032428741455, |
|
"eval_runtime": 124.0885, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.7612179487179487, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.0210965188330119e-05, |
|
"loss": 0.4314, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.766941391941392, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 9.761652004070241e-06, |
|
"loss": 0.4286, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.766941391941392, |
|
"eval_loss": 1.7802751064300537, |
|
"eval_runtime": 124.0901, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.7726648351648352, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 9.320038160936568e-06, |
|
"loss": 0.44, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.7783882783882784, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 8.886346789128305e-06, |
|
"loss": 0.4294, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.7783882783882784, |
|
"eval_loss": 1.7787123918533325, |
|
"eval_runtime": 124.0869, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.7841117216117216, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 8.460797015929873e-06, |
|
"loss": 0.4401, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.7898351648351648, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 8.04360385499437e-06, |
|
"loss": 0.4294, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.7898351648351648, |
|
"eval_loss": 1.779537558555603, |
|
"eval_runtime": 124.0971, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.795558608058608, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.63497809770566e-06, |
|
"loss": 0.4327, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.8012820512820513, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.235126206673651e-06, |
|
"loss": 0.4364, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.8012820512820513, |
|
"eval_loss": 1.7765251398086548, |
|
"eval_runtime": 124.0761, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.8070054945054945, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 6.844250211416903e-06, |
|
"loss": 0.4392, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.8127289377289377, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.462547606284947e-06, |
|
"loss": 0.4414, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.8127289377289377, |
|
"eval_loss": 1.7783187627792358, |
|
"eval_runtime": 124.1204, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.8184523809523809, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.09021125067217e-06, |
|
"loss": 0.4354, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.8241758241758241, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.7274292715734315e-06, |
|
"loss": 0.4336, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.8241758241758241, |
|
"eval_loss": 1.7745906114578247, |
|
"eval_runtime": 124.1664, |
|
"eval_samples_per_second": 2.255, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.8298992673992674, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 5.374384968530952e-06, |
|
"loss": 0.4296, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.8356227106227107, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 5.031256721020139e-06, |
|
"loss": 0.4324, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.8356227106227107, |
|
"eval_loss": 1.772754192352295, |
|
"eval_runtime": 124.0892, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.8413461538461539, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.698217898321483e-06, |
|
"loss": 0.4304, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.8470695970695971, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.375436771923766e-06, |
|
"loss": 0.4414, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.8470695970695971, |
|
"eval_loss": 1.7765103578567505, |
|
"eval_runtime": 124.0978, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.8527930402930403, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.063076430503138e-06, |
|
"loss": 0.4258, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.8585164835164835, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.761294697520751e-06, |
|
"loss": 0.4288, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.8585164835164835, |
|
"eval_loss": 1.7792127132415771, |
|
"eval_runtime": 124.1222, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.8642399267399268, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.4702440514807366e-06, |
|
"loss": 0.4365, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.86996336996337, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.1900715488887873e-06, |
|
"loss": 0.4359, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.86996336996337, |
|
"eval_loss": 1.777583122253418, |
|
"eval_runtime": 124.1273, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.8756868131868132, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.9209187499502604e-06, |
|
"loss": 0.4388, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.8814102564102564, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 2.662921647045355e-06, |
|
"loss": 0.4242, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.8814102564102564, |
|
"eval_loss": 1.7761567831039429, |
|
"eval_runtime": 124.1269, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.8871336996336996, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.4162105960174486e-06, |
|
"loss": 0.4352, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.180910250309423e-06, |
|
"loss": 0.4413, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"eval_loss": 1.7751096487045288, |
|
"eval_runtime": 124.1103, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.8985805860805861, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 1.957139497981131e-06, |
|
"loss": 0.4293, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.9043040293040293, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 1.74501140163994e-06, |
|
"loss": 0.4402, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.9043040293040293, |
|
"eval_loss": 1.7753708362579346, |
|
"eval_runtime": 124.1127, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.9100274725274725, |
|
"grad_norm": 0.75, |
|
"learning_rate": 1.5446331413145887e-06, |
|
"loss": 0.4436, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.9157509157509157, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.3561059603013265e-06, |
|
"loss": 0.4452, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.9157509157509157, |
|
"eval_loss": 1.7749762535095215, |
|
"eval_runtime": 124.0977, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.9214743589743589, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 1.1795251140096358e-06, |
|
"loss": 0.4458, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.9271978021978022, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 1.014979821833395e-06, |
|
"loss": 0.4346, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.9271978021978022, |
|
"eval_loss": 1.775481104850769, |
|
"eval_runtime": 124.1005, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.9329212454212454, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 8.625532220718186e-07, |
|
"loss": 0.4361, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.9386446886446886, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.223223299229198e-07, |
|
"loss": 0.4396, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.9386446886446886, |
|
"eval_loss": 1.7751343250274658, |
|
"eval_runtime": 124.089, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.9443681318681318, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 5.943579985707409e-07, |
|
"loss": 0.4337, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.950091575091575, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.787248833860114e-07, |
|
"loss": 0.44, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.950091575091575, |
|
"eval_loss": 1.7752093076705933, |
|
"eval_runtime": 124.0951, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.9558150183150184, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.7548140925833806e-07, |
|
"loss": 0.4362, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.8467974107636017e-07, |
|
"loss": 0.4333, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"eval_loss": 1.7753241062164307, |
|
"eval_runtime": 124.0863, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.9672619047619048, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.063657573708966e-07, |
|
"loss": 0.4338, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.972985347985348, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.4057902713431327e-07, |
|
"loss": 0.4348, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.972985347985348, |
|
"eval_loss": 1.7753559350967407, |
|
"eval_runtime": 124.0677, |
|
"eval_samples_per_second": 2.257, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.9787087912087912, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 8.735278982785755e-08, |
|
"loss": 0.4306, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.9844322344322345, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.671393858705908e-08, |
|
"loss": 0.4331, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.9844322344322345, |
|
"eval_loss": 1.77518630027771, |
|
"eval_runtime": 124.0873, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.9901556776556777, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.868300663367406e-08, |
|
"loss": 0.4329, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.9958791208791209, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.2741569010674712e-09, |
|
"loss": 0.4326, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.9958791208791209, |
|
"eval_loss": 1.7753793001174927, |
|
"eval_runtime": 124.103, |
|
"eval_samples_per_second": 2.256, |
|
"eval_steps_per_second": 0.564, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 8736, |
|
"total_flos": 1.6067287852253184e+18, |
|
"train_loss": 0.5878726873860691, |
|
"train_runtime": 61938.6999, |
|
"train_samples_per_second": 0.564, |
|
"train_steps_per_second": 0.141 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 8736, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 250, |
|
"total_flos": 1.6067287852253184e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|