|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 66.66666666666667, |
|
"eval_steps": 100, |
|
"global_step": 13000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 73.68424224853516, |
|
"learning_rate": 9.9907e-06, |
|
"loss": 3.1457, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"eval_loss": 2.2357213497161865, |
|
"eval_runtime": 36.4689, |
|
"eval_samples_per_second": 10.776, |
|
"eval_steps_per_second": 1.371, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 57.47202682495117, |
|
"learning_rate": 9.980800000000001e-06, |
|
"loss": 2.1614, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"eval_loss": 2.0913825035095215, |
|
"eval_runtime": 36.349, |
|
"eval_samples_per_second": 10.812, |
|
"eval_steps_per_second": 1.376, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 176.88357543945312, |
|
"learning_rate": 9.970800000000001e-06, |
|
"loss": 2.0388, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 2.003911018371582, |
|
"eval_runtime": 36.2551, |
|
"eval_samples_per_second": 10.84, |
|
"eval_steps_per_second": 1.379, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 55.31132507324219, |
|
"learning_rate": 9.960800000000001e-06, |
|
"loss": 1.9285, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"eval_loss": 1.9796075820922852, |
|
"eval_runtime": 36.4437, |
|
"eval_samples_per_second": 10.784, |
|
"eval_steps_per_second": 1.372, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 41.900753021240234, |
|
"learning_rate": 9.9508e-06, |
|
"loss": 1.9523, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"eval_loss": 1.936122179031372, |
|
"eval_runtime": 36.5845, |
|
"eval_samples_per_second": 10.742, |
|
"eval_steps_per_second": 1.367, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 50.21903991699219, |
|
"learning_rate": 9.9408e-06, |
|
"loss": 1.8452, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 1.8883634805679321, |
|
"eval_runtime": 36.6015, |
|
"eval_samples_per_second": 10.737, |
|
"eval_steps_per_second": 1.366, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"grad_norm": 45.193939208984375, |
|
"learning_rate": 9.930900000000002e-06, |
|
"loss": 1.8403, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"eval_loss": 1.8506474494934082, |
|
"eval_runtime": 36.454, |
|
"eval_samples_per_second": 10.781, |
|
"eval_steps_per_second": 1.372, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 27.302494049072266, |
|
"learning_rate": 9.920900000000002e-06, |
|
"loss": 1.7976, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"eval_loss": 1.8370662927627563, |
|
"eval_runtime": 36.834, |
|
"eval_samples_per_second": 10.67, |
|
"eval_steps_per_second": 1.357, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 52.6607666015625, |
|
"learning_rate": 9.9109e-06, |
|
"loss": 1.7508, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 1.8037244081497192, |
|
"eval_runtime": 36.9939, |
|
"eval_samples_per_second": 10.623, |
|
"eval_steps_per_second": 1.352, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 59.508033752441406, |
|
"learning_rate": 9.9009e-06, |
|
"loss": 1.7383, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"eval_loss": 1.7986633777618408, |
|
"eval_runtime": 36.8356, |
|
"eval_samples_per_second": 10.669, |
|
"eval_steps_per_second": 1.357, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.641025641025641, |
|
"grad_norm": 71.58872985839844, |
|
"learning_rate": 9.8909e-06, |
|
"loss": 1.7361, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.641025641025641, |
|
"eval_loss": 1.7810852527618408, |
|
"eval_runtime": 37.0957, |
|
"eval_samples_per_second": 10.594, |
|
"eval_steps_per_second": 1.348, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 41.782066345214844, |
|
"learning_rate": 9.8809e-06, |
|
"loss": 1.682, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"eval_loss": 1.777554988861084, |
|
"eval_runtime": 36.9173, |
|
"eval_samples_per_second": 10.645, |
|
"eval_steps_per_second": 1.354, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 40.28728485107422, |
|
"learning_rate": 9.8709e-06, |
|
"loss": 1.7216, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"eval_loss": 1.7382572889328003, |
|
"eval_runtime": 36.8918, |
|
"eval_samples_per_second": 10.653, |
|
"eval_steps_per_second": 1.355, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"grad_norm": 104.99211883544922, |
|
"learning_rate": 9.8609e-06, |
|
"loss": 1.6534, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"eval_loss": 1.76559579372406, |
|
"eval_runtime": 36.3398, |
|
"eval_samples_per_second": 10.815, |
|
"eval_steps_per_second": 1.376, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 29.326631546020508, |
|
"learning_rate": 9.8509e-06, |
|
"loss": 1.707, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"eval_loss": 1.750089406967163, |
|
"eval_runtime": 36.4098, |
|
"eval_samples_per_second": 10.794, |
|
"eval_steps_per_second": 1.373, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 37.941261291503906, |
|
"learning_rate": 9.840900000000001e-06, |
|
"loss": 1.6554, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"eval_loss": 1.6900651454925537, |
|
"eval_runtime": 36.3226, |
|
"eval_samples_per_second": 10.82, |
|
"eval_steps_per_second": 1.377, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.717948717948717, |
|
"grad_norm": 44.60703659057617, |
|
"learning_rate": 9.830900000000001e-06, |
|
"loss": 1.6334, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 8.717948717948717, |
|
"eval_loss": 1.7162973880767822, |
|
"eval_runtime": 36.2672, |
|
"eval_samples_per_second": 10.836, |
|
"eval_steps_per_second": 1.379, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 34.127254486083984, |
|
"learning_rate": 9.820900000000001e-06, |
|
"loss": 1.6345, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"eval_loss": 1.6906001567840576, |
|
"eval_runtime": 36.2264, |
|
"eval_samples_per_second": 10.848, |
|
"eval_steps_per_second": 1.38, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 9.743589743589745, |
|
"grad_norm": 60.377540588378906, |
|
"learning_rate": 9.810900000000001e-06, |
|
"loss": 1.598, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 9.743589743589745, |
|
"eval_loss": 1.6555503606796265, |
|
"eval_runtime": 36.3896, |
|
"eval_samples_per_second": 10.8, |
|
"eval_steps_per_second": 1.374, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"grad_norm": 20.264404296875, |
|
"learning_rate": 9.800900000000001e-06, |
|
"loss": 1.5466, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.256410256410255, |
|
"eval_loss": 1.648037075996399, |
|
"eval_runtime": 36.4136, |
|
"eval_samples_per_second": 10.793, |
|
"eval_steps_per_second": 1.373, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"grad_norm": 27.18608856201172, |
|
"learning_rate": 9.790900000000001e-06, |
|
"loss": 1.5865, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 10.76923076923077, |
|
"eval_loss": 1.6171936988830566, |
|
"eval_runtime": 36.1051, |
|
"eval_samples_per_second": 10.885, |
|
"eval_steps_per_second": 1.385, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 11.282051282051283, |
|
"grad_norm": 32.486331939697266, |
|
"learning_rate": 9.780900000000002e-06, |
|
"loss": 1.5284, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 11.282051282051283, |
|
"eval_loss": 1.5915095806121826, |
|
"eval_runtime": 36.1781, |
|
"eval_samples_per_second": 10.863, |
|
"eval_steps_per_second": 1.382, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 11.794871794871796, |
|
"grad_norm": 65.88719940185547, |
|
"learning_rate": 9.770900000000002e-06, |
|
"loss": 1.5514, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 11.794871794871796, |
|
"eval_loss": 1.5879931449890137, |
|
"eval_runtime": 36.3934, |
|
"eval_samples_per_second": 10.799, |
|
"eval_steps_per_second": 1.374, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"grad_norm": 31.737024307250977, |
|
"learning_rate": 9.760900000000002e-06, |
|
"loss": 1.4941, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 12.307692307692308, |
|
"eval_loss": 1.583853006362915, |
|
"eval_runtime": 36.3797, |
|
"eval_samples_per_second": 10.803, |
|
"eval_steps_per_second": 1.374, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"grad_norm": 45.48268508911133, |
|
"learning_rate": 9.7509e-06, |
|
"loss": 1.5097, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"eval_loss": 1.5559026002883911, |
|
"eval_runtime": 36.1605, |
|
"eval_samples_per_second": 10.868, |
|
"eval_steps_per_second": 1.383, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 27.500398635864258, |
|
"learning_rate": 9.7409e-06, |
|
"loss": 1.5018, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 13.333333333333334, |
|
"eval_loss": 1.5453521013259888, |
|
"eval_runtime": 36.2887, |
|
"eval_samples_per_second": 10.83, |
|
"eval_steps_per_second": 1.378, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"grad_norm": 32.49728775024414, |
|
"learning_rate": 9.7309e-06, |
|
"loss": 1.4804, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 13.846153846153847, |
|
"eval_loss": 1.5424816608428955, |
|
"eval_runtime": 36.359, |
|
"eval_samples_per_second": 10.809, |
|
"eval_steps_per_second": 1.375, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 14.35897435897436, |
|
"grad_norm": 38.46280288696289, |
|
"learning_rate": 9.7209e-06, |
|
"loss": 1.4826, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 14.35897435897436, |
|
"eval_loss": 1.5317177772521973, |
|
"eval_runtime": 36.3362, |
|
"eval_samples_per_second": 10.816, |
|
"eval_steps_per_second": 1.376, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 14.871794871794872, |
|
"grad_norm": 16.075960159301758, |
|
"learning_rate": 9.7109e-06, |
|
"loss": 1.4568, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 14.871794871794872, |
|
"eval_loss": 1.5241832733154297, |
|
"eval_runtime": 36.2025, |
|
"eval_samples_per_second": 10.856, |
|
"eval_steps_per_second": 1.381, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"grad_norm": 27.334318161010742, |
|
"learning_rate": 9.7009e-06, |
|
"loss": 1.4176, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 15.384615384615385, |
|
"eval_loss": 1.520580768585205, |
|
"eval_runtime": 36.398, |
|
"eval_samples_per_second": 10.797, |
|
"eval_steps_per_second": 1.374, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 15.897435897435898, |
|
"grad_norm": 94.90784454345703, |
|
"learning_rate": 9.6909e-06, |
|
"loss": 1.4681, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 15.897435897435898, |
|
"eval_loss": 1.5268648862838745, |
|
"eval_runtime": 36.0541, |
|
"eval_samples_per_second": 10.9, |
|
"eval_steps_per_second": 1.387, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 16.41025641025641, |
|
"grad_norm": 16.697856903076172, |
|
"learning_rate": 9.6809e-06, |
|
"loss": 1.454, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 16.41025641025641, |
|
"eval_loss": 1.5157753229141235, |
|
"eval_runtime": 36.3172, |
|
"eval_samples_per_second": 10.821, |
|
"eval_steps_per_second": 1.377, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"grad_norm": 54.05553436279297, |
|
"learning_rate": 9.670900000000001e-06, |
|
"loss": 1.4309, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 16.923076923076923, |
|
"eval_loss": 1.516249179840088, |
|
"eval_runtime": 36.2632, |
|
"eval_samples_per_second": 10.837, |
|
"eval_steps_per_second": 1.379, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 17.435897435897434, |
|
"grad_norm": 47.010475158691406, |
|
"learning_rate": 9.660900000000001e-06, |
|
"loss": 1.4571, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 17.435897435897434, |
|
"eval_loss": 1.5018247365951538, |
|
"eval_runtime": 36.2118, |
|
"eval_samples_per_second": 10.853, |
|
"eval_steps_per_second": 1.381, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 17.94871794871795, |
|
"grad_norm": 52.865718841552734, |
|
"learning_rate": 9.650900000000001e-06, |
|
"loss": 1.4168, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 17.94871794871795, |
|
"eval_loss": 1.4993616342544556, |
|
"eval_runtime": 36.2572, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 1.379, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"grad_norm": 30.117380142211914, |
|
"learning_rate": 9.640900000000001e-06, |
|
"loss": 1.4275, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 18.46153846153846, |
|
"eval_loss": 1.4899998903274536, |
|
"eval_runtime": 36.4696, |
|
"eval_samples_per_second": 10.776, |
|
"eval_steps_per_second": 1.371, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 18.974358974358974, |
|
"grad_norm": 31.10028076171875, |
|
"learning_rate": 9.630900000000001e-06, |
|
"loss": 1.4148, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 18.974358974358974, |
|
"eval_loss": 1.5231629610061646, |
|
"eval_runtime": 36.4604, |
|
"eval_samples_per_second": 10.779, |
|
"eval_steps_per_second": 1.371, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 19.487179487179485, |
|
"grad_norm": 44.06697082519531, |
|
"learning_rate": 9.620900000000001e-06, |
|
"loss": 1.4057, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 19.487179487179485, |
|
"eval_loss": 1.4841217994689941, |
|
"eval_runtime": 36.3382, |
|
"eval_samples_per_second": 10.815, |
|
"eval_steps_per_second": 1.376, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 53.86429214477539, |
|
"learning_rate": 9.610900000000001e-06, |
|
"loss": 1.4302, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.477772831916809, |
|
"eval_runtime": 36.1794, |
|
"eval_samples_per_second": 10.863, |
|
"eval_steps_per_second": 1.382, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 20.51282051282051, |
|
"grad_norm": 80.95457458496094, |
|
"learning_rate": 9.600900000000002e-06, |
|
"loss": 1.4076, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 20.51282051282051, |
|
"eval_loss": 1.4769134521484375, |
|
"eval_runtime": 36.4725, |
|
"eval_samples_per_second": 10.775, |
|
"eval_steps_per_second": 1.371, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 21.025641025641026, |
|
"grad_norm": 32.276214599609375, |
|
"learning_rate": 9.5909e-06, |
|
"loss": 1.3868, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 21.025641025641026, |
|
"eval_loss": 1.463292121887207, |
|
"eval_runtime": 36.5192, |
|
"eval_samples_per_second": 10.761, |
|
"eval_steps_per_second": 1.369, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"grad_norm": 54.65959167480469, |
|
"learning_rate": 9.5809e-06, |
|
"loss": 1.3795, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 21.53846153846154, |
|
"eval_loss": 1.4630039930343628, |
|
"eval_runtime": 36.5288, |
|
"eval_samples_per_second": 10.759, |
|
"eval_steps_per_second": 1.369, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 22.05128205128205, |
|
"grad_norm": 42.31818389892578, |
|
"learning_rate": 9.5709e-06, |
|
"loss": 1.3787, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 22.05128205128205, |
|
"eval_loss": 1.4471133947372437, |
|
"eval_runtime": 36.2949, |
|
"eval_samples_per_second": 10.828, |
|
"eval_steps_per_second": 1.378, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 22.564102564102566, |
|
"grad_norm": 34.44257736206055, |
|
"learning_rate": 9.5609e-06, |
|
"loss": 1.4027, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 22.564102564102566, |
|
"eval_loss": 1.4606964588165283, |
|
"eval_runtime": 36.5672, |
|
"eval_samples_per_second": 10.747, |
|
"eval_steps_per_second": 1.367, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"grad_norm": 42.65989303588867, |
|
"learning_rate": 9.5509e-06, |
|
"loss": 1.3459, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 23.076923076923077, |
|
"eval_loss": 1.454709768295288, |
|
"eval_runtime": 37.2024, |
|
"eval_samples_per_second": 10.564, |
|
"eval_steps_per_second": 1.344, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 23.58974358974359, |
|
"grad_norm": 35.11396789550781, |
|
"learning_rate": 9.5409e-06, |
|
"loss": 1.3367, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 23.58974358974359, |
|
"eval_loss": 1.4562979936599731, |
|
"eval_runtime": 36.4579, |
|
"eval_samples_per_second": 10.78, |
|
"eval_steps_per_second": 1.371, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 24.102564102564102, |
|
"grad_norm": 32.71805953979492, |
|
"learning_rate": 9.5309e-06, |
|
"loss": 1.3575, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 24.102564102564102, |
|
"eval_loss": 1.4620144367218018, |
|
"eval_runtime": 36.4055, |
|
"eval_samples_per_second": 10.795, |
|
"eval_steps_per_second": 1.373, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"grad_norm": 28.839757919311523, |
|
"learning_rate": 9.5209e-06, |
|
"loss": 1.3549, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 24.615384615384617, |
|
"eval_loss": 1.4431304931640625, |
|
"eval_runtime": 36.5027, |
|
"eval_samples_per_second": 10.766, |
|
"eval_steps_per_second": 1.37, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 25.128205128205128, |
|
"grad_norm": 45.3994140625, |
|
"learning_rate": 9.5109e-06, |
|
"loss": 1.3885, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 25.128205128205128, |
|
"eval_loss": 1.4312200546264648, |
|
"eval_runtime": 36.3039, |
|
"eval_samples_per_second": 10.825, |
|
"eval_steps_per_second": 1.377, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 25.641025641025642, |
|
"grad_norm": 22.972829818725586, |
|
"learning_rate": 9.5009e-06, |
|
"loss": 1.3469, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 25.641025641025642, |
|
"eval_loss": 1.416171669960022, |
|
"eval_runtime": 36.6695, |
|
"eval_samples_per_second": 10.717, |
|
"eval_steps_per_second": 1.364, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"grad_norm": 77.40106964111328, |
|
"learning_rate": 9.490900000000001e-06, |
|
"loss": 1.3363, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 26.153846153846153, |
|
"eval_loss": 1.4090278148651123, |
|
"eval_runtime": 36.2573, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 1.379, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"grad_norm": 29.757932662963867, |
|
"learning_rate": 9.480900000000001e-06, |
|
"loss": 1.3183, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 26.666666666666668, |
|
"eval_loss": 1.4073749780654907, |
|
"eval_runtime": 36.2439, |
|
"eval_samples_per_second": 10.843, |
|
"eval_steps_per_second": 1.38, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 27.17948717948718, |
|
"grad_norm": 56.78797149658203, |
|
"learning_rate": 9.470900000000001e-06, |
|
"loss": 1.3568, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 27.17948717948718, |
|
"eval_loss": 1.4153741598129272, |
|
"eval_runtime": 36.2756, |
|
"eval_samples_per_second": 10.834, |
|
"eval_steps_per_second": 1.378, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"grad_norm": 62.353477478027344, |
|
"learning_rate": 9.460900000000001e-06, |
|
"loss": 1.3304, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 27.692307692307693, |
|
"eval_loss": 1.4334921836853027, |
|
"eval_runtime": 36.2626, |
|
"eval_samples_per_second": 10.838, |
|
"eval_steps_per_second": 1.379, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 28.205128205128204, |
|
"grad_norm": 100.7852554321289, |
|
"learning_rate": 9.450900000000001e-06, |
|
"loss": 1.2897, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 28.205128205128204, |
|
"eval_loss": 1.4160270690917969, |
|
"eval_runtime": 36.6139, |
|
"eval_samples_per_second": 10.734, |
|
"eval_steps_per_second": 1.366, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 28.71794871794872, |
|
"grad_norm": 62.06657409667969, |
|
"learning_rate": 9.440900000000001e-06, |
|
"loss": 1.3233, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 28.71794871794872, |
|
"eval_loss": 1.431317687034607, |
|
"eval_runtime": 36.5341, |
|
"eval_samples_per_second": 10.757, |
|
"eval_steps_per_second": 1.369, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"grad_norm": 32.661346435546875, |
|
"learning_rate": 9.4309e-06, |
|
"loss": 1.305, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 29.23076923076923, |
|
"eval_loss": 1.3954827785491943, |
|
"eval_runtime": 36.5903, |
|
"eval_samples_per_second": 10.741, |
|
"eval_steps_per_second": 1.366, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 29.743589743589745, |
|
"grad_norm": 25.690454483032227, |
|
"learning_rate": 9.421000000000002e-06, |
|
"loss": 1.2961, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 29.743589743589745, |
|
"eval_loss": 1.4036046266555786, |
|
"eval_runtime": 36.5935, |
|
"eval_samples_per_second": 10.74, |
|
"eval_steps_per_second": 1.366, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 30.256410256410255, |
|
"grad_norm": 45.45426940917969, |
|
"learning_rate": 9.411000000000002e-06, |
|
"loss": 1.3175, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 30.256410256410255, |
|
"eval_loss": 1.3845292329788208, |
|
"eval_runtime": 36.5594, |
|
"eval_samples_per_second": 10.75, |
|
"eval_steps_per_second": 1.368, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"grad_norm": 41.62439727783203, |
|
"learning_rate": 9.401000000000002e-06, |
|
"loss": 1.3242, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 30.76923076923077, |
|
"eval_loss": 1.3939634561538696, |
|
"eval_runtime": 36.4895, |
|
"eval_samples_per_second": 10.77, |
|
"eval_steps_per_second": 1.37, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 31.28205128205128, |
|
"grad_norm": 26.999319076538086, |
|
"learning_rate": 9.391e-06, |
|
"loss": 1.2886, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 31.28205128205128, |
|
"eval_loss": 1.3804558515548706, |
|
"eval_runtime": 36.2599, |
|
"eval_samples_per_second": 10.838, |
|
"eval_steps_per_second": 1.379, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 31.794871794871796, |
|
"grad_norm": 24.70287322998047, |
|
"learning_rate": 9.381e-06, |
|
"loss": 1.2893, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 31.794871794871796, |
|
"eval_loss": 1.3821990489959717, |
|
"eval_runtime": 36.2613, |
|
"eval_samples_per_second": 10.838, |
|
"eval_steps_per_second": 1.379, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"grad_norm": 41.910606384277344, |
|
"learning_rate": 9.371e-06, |
|
"loss": 1.3093, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 32.30769230769231, |
|
"eval_loss": 1.3875064849853516, |
|
"eval_runtime": 36.4998, |
|
"eval_samples_per_second": 10.767, |
|
"eval_steps_per_second": 1.37, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 32.82051282051282, |
|
"grad_norm": 82.20216369628906, |
|
"learning_rate": 9.361e-06, |
|
"loss": 1.3184, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 32.82051282051282, |
|
"eval_loss": 1.3840612173080444, |
|
"eval_runtime": 36.4787, |
|
"eval_samples_per_second": 10.773, |
|
"eval_steps_per_second": 1.371, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 797.53271484375, |
|
"learning_rate": 9.351e-06, |
|
"loss": 1.2939, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"eval_loss": 1.3881950378417969, |
|
"eval_runtime": 36.4166, |
|
"eval_samples_per_second": 10.792, |
|
"eval_steps_per_second": 1.373, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"grad_norm": 35.14693069458008, |
|
"learning_rate": 9.341000000000001e-06, |
|
"loss": 1.2881, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 33.84615384615385, |
|
"eval_loss": 1.4039666652679443, |
|
"eval_runtime": 36.4141, |
|
"eval_samples_per_second": 10.793, |
|
"eval_steps_per_second": 1.373, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 34.35897435897436, |
|
"grad_norm": 38.676666259765625, |
|
"learning_rate": 9.331000000000001e-06, |
|
"loss": 1.2699, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 34.35897435897436, |
|
"eval_loss": 1.3800386190414429, |
|
"eval_runtime": 36.2367, |
|
"eval_samples_per_second": 10.845, |
|
"eval_steps_per_second": 1.38, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 34.87179487179487, |
|
"grad_norm": 26.051170349121094, |
|
"learning_rate": 9.321000000000001e-06, |
|
"loss": 1.3079, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 34.87179487179487, |
|
"eval_loss": 1.3784704208374023, |
|
"eval_runtime": 36.3119, |
|
"eval_samples_per_second": 10.823, |
|
"eval_steps_per_second": 1.377, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"grad_norm": 42.667236328125, |
|
"learning_rate": 9.311000000000001e-06, |
|
"loss": 1.2622, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 35.38461538461539, |
|
"eval_loss": 1.3637058734893799, |
|
"eval_runtime": 36.1116, |
|
"eval_samples_per_second": 10.883, |
|
"eval_steps_per_second": 1.385, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 35.8974358974359, |
|
"grad_norm": 38.78388977050781, |
|
"learning_rate": 9.301000000000001e-06, |
|
"loss": 1.2652, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 35.8974358974359, |
|
"eval_loss": 1.3452589511871338, |
|
"eval_runtime": 36.2593, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 1.379, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 36.41025641025641, |
|
"grad_norm": 44.43967056274414, |
|
"learning_rate": 9.291000000000001e-06, |
|
"loss": 1.2378, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 36.41025641025641, |
|
"eval_loss": 1.3494073152542114, |
|
"eval_runtime": 35.4402, |
|
"eval_samples_per_second": 11.089, |
|
"eval_steps_per_second": 1.411, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"grad_norm": 49.02131271362305, |
|
"learning_rate": 9.281000000000001e-06, |
|
"loss": 1.2932, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 36.92307692307692, |
|
"eval_loss": 1.3460158109664917, |
|
"eval_runtime": 35.9361, |
|
"eval_samples_per_second": 10.936, |
|
"eval_steps_per_second": 1.391, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 37.43589743589744, |
|
"grad_norm": 28.279098510742188, |
|
"learning_rate": 9.271000000000002e-06, |
|
"loss": 1.2598, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 37.43589743589744, |
|
"eval_loss": 1.36253023147583, |
|
"eval_runtime": 36.2944, |
|
"eval_samples_per_second": 10.828, |
|
"eval_steps_per_second": 1.378, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 37.94871794871795, |
|
"grad_norm": 35.21017074584961, |
|
"learning_rate": 9.261000000000002e-06, |
|
"loss": 1.2703, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 37.94871794871795, |
|
"eval_loss": 1.3509865999221802, |
|
"eval_runtime": 36.2661, |
|
"eval_samples_per_second": 10.837, |
|
"eval_steps_per_second": 1.379, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 51.673316955566406, |
|
"learning_rate": 9.251000000000002e-06, |
|
"loss": 1.2393, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"eval_loss": 1.3402855396270752, |
|
"eval_runtime": 36.392, |
|
"eval_samples_per_second": 10.799, |
|
"eval_steps_per_second": 1.374, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 38.97435897435897, |
|
"grad_norm": 53.73936462402344, |
|
"learning_rate": 9.241000000000002e-06, |
|
"loss": 1.2577, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 38.97435897435897, |
|
"eval_loss": 1.3487578630447388, |
|
"eval_runtime": 36.2119, |
|
"eval_samples_per_second": 10.853, |
|
"eval_steps_per_second": 1.381, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 39.48717948717949, |
|
"grad_norm": 55.994686126708984, |
|
"learning_rate": 9.231000000000002e-06, |
|
"loss": 1.229, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 39.48717948717949, |
|
"eval_loss": 1.340031623840332, |
|
"eval_runtime": 36.2063, |
|
"eval_samples_per_second": 10.854, |
|
"eval_steps_per_second": 1.381, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 86.7531509399414, |
|
"learning_rate": 9.221e-06, |
|
"loss": 1.2941, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 1.3422337770462036, |
|
"eval_runtime": 36.7462, |
|
"eval_samples_per_second": 10.695, |
|
"eval_steps_per_second": 1.361, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 40.51282051282051, |
|
"grad_norm": 60.86371612548828, |
|
"learning_rate": 9.211e-06, |
|
"loss": 1.2423, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 40.51282051282051, |
|
"eval_loss": 1.3336257934570312, |
|
"eval_runtime": 36.2441, |
|
"eval_samples_per_second": 10.843, |
|
"eval_steps_per_second": 1.38, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 41.02564102564103, |
|
"grad_norm": 28.535411834716797, |
|
"learning_rate": 9.2011e-06, |
|
"loss": 1.2676, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 41.02564102564103, |
|
"eval_loss": 1.338461995124817, |
|
"eval_runtime": 36.2212, |
|
"eval_samples_per_second": 10.85, |
|
"eval_steps_per_second": 1.38, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"grad_norm": 35.707183837890625, |
|
"learning_rate": 9.1911e-06, |
|
"loss": 1.2428, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 41.53846153846154, |
|
"eval_loss": 1.3225666284561157, |
|
"eval_runtime": 36.2043, |
|
"eval_samples_per_second": 10.855, |
|
"eval_steps_per_second": 1.381, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 42.05128205128205, |
|
"grad_norm": 29.23111343383789, |
|
"learning_rate": 9.181100000000001e-06, |
|
"loss": 1.2269, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 42.05128205128205, |
|
"eval_loss": 1.3405094146728516, |
|
"eval_runtime": 36.2498, |
|
"eval_samples_per_second": 10.841, |
|
"eval_steps_per_second": 1.379, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 42.56410256410256, |
|
"grad_norm": 20.379304885864258, |
|
"learning_rate": 9.171100000000001e-06, |
|
"loss": 1.2187, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 42.56410256410256, |
|
"eval_loss": 1.3247309923171997, |
|
"eval_runtime": 36.3237, |
|
"eval_samples_per_second": 10.819, |
|
"eval_steps_per_second": 1.377, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"grad_norm": 44.43791198730469, |
|
"learning_rate": 9.161100000000001e-06, |
|
"loss": 1.2321, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 43.07692307692308, |
|
"eval_loss": 1.334086298942566, |
|
"eval_runtime": 36.2039, |
|
"eval_samples_per_second": 10.855, |
|
"eval_steps_per_second": 1.381, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 43.58974358974359, |
|
"grad_norm": 30.97890853881836, |
|
"learning_rate": 9.151100000000001e-06, |
|
"loss": 1.2071, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 43.58974358974359, |
|
"eval_loss": 1.3306576013565063, |
|
"eval_runtime": 36.3547, |
|
"eval_samples_per_second": 10.81, |
|
"eval_steps_per_second": 1.375, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 44.1025641025641, |
|
"grad_norm": 40.07706832885742, |
|
"learning_rate": 9.141100000000001e-06, |
|
"loss": 1.25, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 44.1025641025641, |
|
"eval_loss": 1.3270679712295532, |
|
"eval_runtime": 36.1754, |
|
"eval_samples_per_second": 10.864, |
|
"eval_steps_per_second": 1.382, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"grad_norm": 27.011186599731445, |
|
"learning_rate": 9.1311e-06, |
|
"loss": 1.1968, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 44.61538461538461, |
|
"eval_loss": 1.3117327690124512, |
|
"eval_runtime": 36.1727, |
|
"eval_samples_per_second": 10.865, |
|
"eval_steps_per_second": 1.382, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 45.12820512820513, |
|
"grad_norm": 25.976228713989258, |
|
"learning_rate": 9.1211e-06, |
|
"loss": 1.2492, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 45.12820512820513, |
|
"eval_loss": 1.3281300067901611, |
|
"eval_runtime": 36.2531, |
|
"eval_samples_per_second": 10.84, |
|
"eval_steps_per_second": 1.379, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 45.64102564102564, |
|
"grad_norm": 21.215715408325195, |
|
"learning_rate": 9.1111e-06, |
|
"loss": 1.221, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 45.64102564102564, |
|
"eval_loss": 1.3373734951019287, |
|
"eval_runtime": 36.2242, |
|
"eval_samples_per_second": 10.849, |
|
"eval_steps_per_second": 1.38, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"grad_norm": 48.8258171081543, |
|
"learning_rate": 9.1011e-06, |
|
"loss": 1.2123, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 46.15384615384615, |
|
"eval_loss": 1.3303853273391724, |
|
"eval_runtime": 36.4019, |
|
"eval_samples_per_second": 10.796, |
|
"eval_steps_per_second": 1.374, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"grad_norm": 36.76605224609375, |
|
"learning_rate": 9.0911e-06, |
|
"loss": 1.1951, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 46.666666666666664, |
|
"eval_loss": 1.3182373046875, |
|
"eval_runtime": 36.387, |
|
"eval_samples_per_second": 10.801, |
|
"eval_steps_per_second": 1.374, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 47.17948717948718, |
|
"grad_norm": 40.79771423339844, |
|
"learning_rate": 9.0811e-06, |
|
"loss": 1.2155, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 47.17948717948718, |
|
"eval_loss": 1.3303160667419434, |
|
"eval_runtime": 36.2265, |
|
"eval_samples_per_second": 10.848, |
|
"eval_steps_per_second": 1.38, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"grad_norm": 48.06431579589844, |
|
"learning_rate": 9.0711e-06, |
|
"loss": 1.2236, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 47.69230769230769, |
|
"eval_loss": 1.3128286600112915, |
|
"eval_runtime": 36.3665, |
|
"eval_samples_per_second": 10.807, |
|
"eval_steps_per_second": 1.375, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 48.205128205128204, |
|
"grad_norm": 31.19647216796875, |
|
"learning_rate": 9.0611e-06, |
|
"loss": 1.2033, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 48.205128205128204, |
|
"eval_loss": 1.3134888410568237, |
|
"eval_runtime": 36.5129, |
|
"eval_samples_per_second": 10.763, |
|
"eval_steps_per_second": 1.369, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 48.717948717948715, |
|
"grad_norm": 20.11866569519043, |
|
"learning_rate": 9.0511e-06, |
|
"loss": 1.1955, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 48.717948717948715, |
|
"eval_loss": 1.3154560327529907, |
|
"eval_runtime": 36.6023, |
|
"eval_samples_per_second": 10.737, |
|
"eval_steps_per_second": 1.366, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"grad_norm": 36.3143424987793, |
|
"learning_rate": 9.0411e-06, |
|
"loss": 1.2067, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 49.23076923076923, |
|
"eval_loss": 1.3158589601516724, |
|
"eval_runtime": 36.4851, |
|
"eval_samples_per_second": 10.772, |
|
"eval_steps_per_second": 1.37, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 49.743589743589745, |
|
"grad_norm": 48.41688537597656, |
|
"learning_rate": 9.0311e-06, |
|
"loss": 1.2295, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 49.743589743589745, |
|
"eval_loss": 1.306788682937622, |
|
"eval_runtime": 36.291, |
|
"eval_samples_per_second": 10.829, |
|
"eval_steps_per_second": 1.378, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 50.256410256410255, |
|
"grad_norm": 26.129995346069336, |
|
"learning_rate": 9.0211e-06, |
|
"loss": 1.1809, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 50.256410256410255, |
|
"eval_loss": 1.3299375772476196, |
|
"eval_runtime": 36.4651, |
|
"eval_samples_per_second": 10.777, |
|
"eval_steps_per_second": 1.371, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"grad_norm": 19.543821334838867, |
|
"learning_rate": 9.011100000000001e-06, |
|
"loss": 1.2179, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 50.76923076923077, |
|
"eval_loss": 1.317457675933838, |
|
"eval_runtime": 36.2864, |
|
"eval_samples_per_second": 10.831, |
|
"eval_steps_per_second": 1.378, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 51.282051282051285, |
|
"grad_norm": 25.619775772094727, |
|
"learning_rate": 9.001100000000001e-06, |
|
"loss": 1.1653, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 51.282051282051285, |
|
"eval_loss": 1.31196928024292, |
|
"eval_runtime": 36.3263, |
|
"eval_samples_per_second": 10.819, |
|
"eval_steps_per_second": 1.376, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 51.794871794871796, |
|
"grad_norm": 45.30315017700195, |
|
"learning_rate": 8.991100000000001e-06, |
|
"loss": 1.2391, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 51.794871794871796, |
|
"eval_loss": 1.305709719657898, |
|
"eval_runtime": 36.2103, |
|
"eval_samples_per_second": 10.853, |
|
"eval_steps_per_second": 1.381, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"grad_norm": 26.942337036132812, |
|
"learning_rate": 8.981100000000001e-06, |
|
"loss": 1.2195, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 52.30769230769231, |
|
"eval_loss": 1.3068941831588745, |
|
"eval_runtime": 36.2565, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 1.379, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 52.82051282051282, |
|
"grad_norm": 26.20073890686035, |
|
"learning_rate": 8.9711e-06, |
|
"loss": 1.1639, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 52.82051282051282, |
|
"eval_loss": 1.3013452291488647, |
|
"eval_runtime": 36.2146, |
|
"eval_samples_per_second": 10.852, |
|
"eval_steps_per_second": 1.381, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"grad_norm": 41.40350341796875, |
|
"learning_rate": 8.9611e-06, |
|
"loss": 1.2033, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 53.333333333333336, |
|
"eval_loss": 1.305737853050232, |
|
"eval_runtime": 36.4865, |
|
"eval_samples_per_second": 10.771, |
|
"eval_steps_per_second": 1.37, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"grad_norm": 28.133567810058594, |
|
"learning_rate": 8.9511e-06, |
|
"loss": 1.1906, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 53.84615384615385, |
|
"eval_loss": 1.2961195707321167, |
|
"eval_runtime": 36.2734, |
|
"eval_samples_per_second": 10.834, |
|
"eval_steps_per_second": 1.378, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 54.35897435897436, |
|
"grad_norm": 44.07390213012695, |
|
"learning_rate": 8.9411e-06, |
|
"loss": 1.1899, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 54.35897435897436, |
|
"eval_loss": 1.3024916648864746, |
|
"eval_runtime": 36.4774, |
|
"eval_samples_per_second": 10.774, |
|
"eval_steps_per_second": 1.371, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 54.87179487179487, |
|
"grad_norm": 19.120830535888672, |
|
"learning_rate": 8.9311e-06, |
|
"loss": 1.1697, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 54.87179487179487, |
|
"eval_loss": 1.3056432008743286, |
|
"eval_runtime": 36.2367, |
|
"eval_samples_per_second": 10.845, |
|
"eval_steps_per_second": 1.38, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"grad_norm": 52.376529693603516, |
|
"learning_rate": 8.9211e-06, |
|
"loss": 1.1759, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 55.38461538461539, |
|
"eval_loss": 1.3018929958343506, |
|
"eval_runtime": 36.1478, |
|
"eval_samples_per_second": 10.872, |
|
"eval_steps_per_second": 1.383, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 55.8974358974359, |
|
"grad_norm": 41.84946060180664, |
|
"learning_rate": 8.9112e-06, |
|
"loss": 1.1973, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 55.8974358974359, |
|
"eval_loss": 1.3166084289550781, |
|
"eval_runtime": 36.4967, |
|
"eval_samples_per_second": 10.768, |
|
"eval_steps_per_second": 1.37, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 56.41025641025641, |
|
"grad_norm": 48.97800064086914, |
|
"learning_rate": 8.9012e-06, |
|
"loss": 1.1942, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 56.41025641025641, |
|
"eval_loss": 1.3040730953216553, |
|
"eval_runtime": 36.3391, |
|
"eval_samples_per_second": 10.815, |
|
"eval_steps_per_second": 1.376, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"grad_norm": 24.18547821044922, |
|
"learning_rate": 8.8912e-06, |
|
"loss": 1.1544, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 56.92307692307692, |
|
"eval_loss": 1.2837135791778564, |
|
"eval_runtime": 36.2839, |
|
"eval_samples_per_second": 10.831, |
|
"eval_steps_per_second": 1.378, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 57.43589743589744, |
|
"grad_norm": 34.69540023803711, |
|
"learning_rate": 8.8812e-06, |
|
"loss": 1.1998, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 57.43589743589744, |
|
"eval_loss": 1.2983756065368652, |
|
"eval_runtime": 36.2024, |
|
"eval_samples_per_second": 10.856, |
|
"eval_steps_per_second": 1.381, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 57.94871794871795, |
|
"grad_norm": 30.074583053588867, |
|
"learning_rate": 8.8712e-06, |
|
"loss": 1.1352, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 57.94871794871795, |
|
"eval_loss": 1.2913649082183838, |
|
"eval_runtime": 36.0977, |
|
"eval_samples_per_second": 10.887, |
|
"eval_steps_per_second": 1.385, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"grad_norm": 26.75031852722168, |
|
"learning_rate": 8.8612e-06, |
|
"loss": 1.1728, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 58.46153846153846, |
|
"eval_loss": 1.288116216659546, |
|
"eval_runtime": 36.2588, |
|
"eval_samples_per_second": 10.839, |
|
"eval_steps_per_second": 1.379, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 58.97435897435897, |
|
"grad_norm": 49.548213958740234, |
|
"learning_rate": 8.851200000000001e-06, |
|
"loss": 1.1738, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 58.97435897435897, |
|
"eval_loss": 1.2846206426620483, |
|
"eval_runtime": 36.5245, |
|
"eval_samples_per_second": 10.76, |
|
"eval_steps_per_second": 1.369, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 59.48717948717949, |
|
"grad_norm": 23.007057189941406, |
|
"learning_rate": 8.841200000000001e-06, |
|
"loss": 1.1501, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 59.48717948717949, |
|
"eval_loss": 1.297021746635437, |
|
"eval_runtime": 36.2678, |
|
"eval_samples_per_second": 10.836, |
|
"eval_steps_per_second": 1.379, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 39.79067611694336, |
|
"learning_rate": 8.831200000000001e-06, |
|
"loss": 1.1836, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_loss": 1.2865136861801147, |
|
"eval_runtime": 36.1127, |
|
"eval_samples_per_second": 10.883, |
|
"eval_steps_per_second": 1.385, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 60.51282051282051, |
|
"grad_norm": 24.281373977661133, |
|
"learning_rate": 8.821200000000001e-06, |
|
"loss": 1.1548, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 60.51282051282051, |
|
"eval_loss": 1.2812024354934692, |
|
"eval_runtime": 36.2399, |
|
"eval_samples_per_second": 10.844, |
|
"eval_steps_per_second": 1.38, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 61.02564102564103, |
|
"grad_norm": 30.851072311401367, |
|
"learning_rate": 8.811200000000001e-06, |
|
"loss": 1.1794, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 61.02564102564103, |
|
"eval_loss": 1.2902381420135498, |
|
"eval_runtime": 36.1264, |
|
"eval_samples_per_second": 10.878, |
|
"eval_steps_per_second": 1.384, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"grad_norm": 44.42039108276367, |
|
"learning_rate": 8.801200000000001e-06, |
|
"loss": 1.1385, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 61.53846153846154, |
|
"eval_loss": 1.2793415784835815, |
|
"eval_runtime": 36.266, |
|
"eval_samples_per_second": 10.837, |
|
"eval_steps_per_second": 1.379, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 62.05128205128205, |
|
"grad_norm": 57.410274505615234, |
|
"learning_rate": 8.791200000000001e-06, |
|
"loss": 1.1697, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 62.05128205128205, |
|
"eval_loss": 1.2847199440002441, |
|
"eval_runtime": 36.1783, |
|
"eval_samples_per_second": 10.863, |
|
"eval_steps_per_second": 1.382, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 62.56410256410256, |
|
"grad_norm": 70.70729064941406, |
|
"learning_rate": 8.781200000000002e-06, |
|
"loss": 1.1518, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 62.56410256410256, |
|
"eval_loss": 1.2760446071624756, |
|
"eval_runtime": 36.0527, |
|
"eval_samples_per_second": 10.901, |
|
"eval_steps_per_second": 1.387, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"grad_norm": 32.417388916015625, |
|
"learning_rate": 8.7712e-06, |
|
"loss": 1.1677, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 63.07692307692308, |
|
"eval_loss": 1.2847411632537842, |
|
"eval_runtime": 36.3923, |
|
"eval_samples_per_second": 10.799, |
|
"eval_steps_per_second": 1.374, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 63.58974358974359, |
|
"grad_norm": 24.372791290283203, |
|
"learning_rate": 8.7612e-06, |
|
"loss": 1.1433, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 63.58974358974359, |
|
"eval_loss": 1.2779407501220703, |
|
"eval_runtime": 36.5015, |
|
"eval_samples_per_second": 10.767, |
|
"eval_steps_per_second": 1.37, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 64.1025641025641, |
|
"grad_norm": 19.632272720336914, |
|
"learning_rate": 8.7512e-06, |
|
"loss": 1.1607, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 64.1025641025641, |
|
"eval_loss": 1.2792208194732666, |
|
"eval_runtime": 36.4617, |
|
"eval_samples_per_second": 10.778, |
|
"eval_steps_per_second": 1.371, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"grad_norm": 34.54841613769531, |
|
"learning_rate": 8.7412e-06, |
|
"loss": 1.1371, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 64.61538461538461, |
|
"eval_loss": 1.2620294094085693, |
|
"eval_runtime": 36.4969, |
|
"eval_samples_per_second": 10.768, |
|
"eval_steps_per_second": 1.37, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 65.12820512820512, |
|
"grad_norm": 19.67386817932129, |
|
"learning_rate": 8.7312e-06, |
|
"loss": 1.1332, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 65.12820512820512, |
|
"eval_loss": 1.2682890892028809, |
|
"eval_runtime": 36.6309, |
|
"eval_samples_per_second": 10.729, |
|
"eval_steps_per_second": 1.365, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 65.64102564102564, |
|
"grad_norm": 65.07030487060547, |
|
"learning_rate": 8.7212e-06, |
|
"loss": 1.1571, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 65.64102564102564, |
|
"eval_loss": 1.2490720748901367, |
|
"eval_runtime": 36.3056, |
|
"eval_samples_per_second": 10.825, |
|
"eval_steps_per_second": 1.377, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"grad_norm": 20.384132385253906, |
|
"learning_rate": 8.7112e-06, |
|
"loss": 1.1619, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 66.15384615384616, |
|
"eval_loss": 1.2465028762817383, |
|
"eval_runtime": 36.1678, |
|
"eval_samples_per_second": 10.866, |
|
"eval_steps_per_second": 1.382, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 36.320674896240234, |
|
"learning_rate": 8.7012e-06, |
|
"loss": 1.1176, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"eval_loss": 1.2594362497329712, |
|
"eval_runtime": 36.3117, |
|
"eval_samples_per_second": 10.823, |
|
"eval_steps_per_second": 1.377, |
|
"step": 13000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 513, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.969161160704e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|