diff --git "a/checkpoint-31500/trainer_state.json" "b/checkpoint-31500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-31500/trainer_state.json" @@ -0,0 +1,4758 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 161.53846153846155, + "eval_steps": 100, + "global_step": 31500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.5128205128205128, + "grad_norm": 73.68424224853516, + "learning_rate": 9.9907e-06, + "loss": 3.1457, + "step": 100 + }, + { + "epoch": 0.5128205128205128, + "eval_loss": 2.2357213497161865, + "eval_runtime": 36.4689, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 100 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 57.47202682495117, + "learning_rate": 9.980800000000001e-06, + "loss": 2.1614, + "step": 200 + }, + { + "epoch": 1.0256410256410255, + "eval_loss": 2.0913825035095215, + "eval_runtime": 36.349, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 200 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 176.88357543945312, + "learning_rate": 9.970800000000001e-06, + "loss": 2.0388, + "step": 300 + }, + { + "epoch": 1.5384615384615383, + "eval_loss": 2.003911018371582, + "eval_runtime": 36.2551, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 300 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 55.31132507324219, + "learning_rate": 9.960800000000001e-06, + "loss": 1.9285, + "step": 400 + }, + { + "epoch": 2.051282051282051, + "eval_loss": 1.9796075820922852, + "eval_runtime": 36.4437, + "eval_samples_per_second": 10.784, + "eval_steps_per_second": 1.372, + "step": 400 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 41.900753021240234, + "learning_rate": 9.9508e-06, + "loss": 1.9523, + "step": 500 + }, + { + "epoch": 2.564102564102564, + "eval_loss": 1.936122179031372, + "eval_runtime": 36.5845, + "eval_samples_per_second": 10.742, + "eval_steps_per_second": 1.367, + "step": 500 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 50.21903991699219, + "learning_rate": 9.9408e-06, + "loss": 1.8452, + "step": 600 + }, + { + "epoch": 3.076923076923077, + "eval_loss": 1.8883634805679321, + "eval_runtime": 36.6015, + "eval_samples_per_second": 10.737, + "eval_steps_per_second": 1.366, + "step": 600 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 45.193939208984375, + "learning_rate": 9.930900000000002e-06, + "loss": 1.8403, + "step": 700 + }, + { + "epoch": 3.58974358974359, + "eval_loss": 1.8506474494934082, + "eval_runtime": 36.454, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 700 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 27.302494049072266, + "learning_rate": 9.920900000000002e-06, + "loss": 1.7976, + "step": 800 + }, + { + "epoch": 4.102564102564102, + "eval_loss": 1.8370662927627563, + "eval_runtime": 36.834, + "eval_samples_per_second": 10.67, + "eval_steps_per_second": 1.357, + "step": 800 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 52.6607666015625, + "learning_rate": 9.9109e-06, + "loss": 1.7508, + "step": 900 + }, + { + "epoch": 4.615384615384615, + "eval_loss": 1.8037244081497192, + "eval_runtime": 36.9939, + "eval_samples_per_second": 10.623, + "eval_steps_per_second": 1.352, + "step": 900 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 59.508033752441406, + "learning_rate": 9.9009e-06, + "loss": 1.7383, + "step": 1000 + }, + { + "epoch": 5.128205128205128, + "eval_loss": 1.7986633777618408, + "eval_runtime": 36.8356, + "eval_samples_per_second": 10.669, + "eval_steps_per_second": 1.357, + "step": 1000 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 71.58872985839844, + "learning_rate": 9.8909e-06, + "loss": 1.7361, + "step": 1100 + }, + { + "epoch": 5.641025641025641, + "eval_loss": 1.7810852527618408, + "eval_runtime": 37.0957, + "eval_samples_per_second": 10.594, + "eval_steps_per_second": 1.348, + "step": 1100 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 41.782066345214844, + "learning_rate": 9.8809e-06, + "loss": 1.682, + "step": 1200 + }, + { + "epoch": 6.153846153846154, + "eval_loss": 1.777554988861084, + "eval_runtime": 36.9173, + "eval_samples_per_second": 10.645, + "eval_steps_per_second": 1.354, + "step": 1200 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 40.28728485107422, + "learning_rate": 9.8709e-06, + "loss": 1.7216, + "step": 1300 + }, + { + "epoch": 6.666666666666667, + "eval_loss": 1.7382572889328003, + "eval_runtime": 36.8918, + "eval_samples_per_second": 10.653, + "eval_steps_per_second": 1.355, + "step": 1300 + }, + { + "epoch": 7.17948717948718, + "grad_norm": 104.99211883544922, + "learning_rate": 9.8609e-06, + "loss": 1.6534, + "step": 1400 + }, + { + "epoch": 7.17948717948718, + "eval_loss": 1.76559579372406, + "eval_runtime": 36.3398, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 1400 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 29.326631546020508, + "learning_rate": 9.8509e-06, + "loss": 1.707, + "step": 1500 + }, + { + "epoch": 7.6923076923076925, + "eval_loss": 1.750089406967163, + "eval_runtime": 36.4098, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 1500 + }, + { + "epoch": 8.205128205128204, + "grad_norm": 37.941261291503906, + "learning_rate": 9.840900000000001e-06, + "loss": 1.6554, + "step": 1600 + }, + { + "epoch": 8.205128205128204, + "eval_loss": 1.6900651454925537, + "eval_runtime": 36.3226, + "eval_samples_per_second": 10.82, + "eval_steps_per_second": 1.377, + "step": 1600 + }, + { + "epoch": 8.717948717948717, + "grad_norm": 44.60703659057617, + "learning_rate": 9.830900000000001e-06, + "loss": 1.6334, + "step": 1700 + }, + { + "epoch": 8.717948717948717, + "eval_loss": 1.7162973880767822, + "eval_runtime": 36.2672, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 1700 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 34.127254486083984, + "learning_rate": 9.820900000000001e-06, + "loss": 1.6345, + "step": 1800 + }, + { + "epoch": 9.23076923076923, + "eval_loss": 1.6906001567840576, + "eval_runtime": 36.2264, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 1800 + }, + { + "epoch": 9.743589743589745, + "grad_norm": 60.377540588378906, + "learning_rate": 9.810900000000001e-06, + "loss": 1.598, + "step": 1900 + }, + { + "epoch": 9.743589743589745, + "eval_loss": 1.6555503606796265, + "eval_runtime": 36.3896, + "eval_samples_per_second": 10.8, + "eval_steps_per_second": 1.374, + "step": 1900 + }, + { + "epoch": 10.256410256410255, + "grad_norm": 20.264404296875, + "learning_rate": 9.800900000000001e-06, + "loss": 1.5466, + "step": 2000 + }, + { + "epoch": 10.256410256410255, + "eval_loss": 1.648037075996399, + "eval_runtime": 36.4136, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 2000 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 27.18608856201172, + "learning_rate": 9.790900000000001e-06, + "loss": 1.5865, + "step": 2100 + }, + { + "epoch": 10.76923076923077, + "eval_loss": 1.6171936988830566, + "eval_runtime": 36.1051, + "eval_samples_per_second": 10.885, + "eval_steps_per_second": 1.385, + "step": 2100 + }, + { + "epoch": 11.282051282051283, + "grad_norm": 32.486331939697266, + "learning_rate": 9.780900000000002e-06, + "loss": 1.5284, + "step": 2200 + }, + { + "epoch": 11.282051282051283, + "eval_loss": 1.5915095806121826, + "eval_runtime": 36.1781, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 2200 + }, + { + "epoch": 11.794871794871796, + "grad_norm": 65.88719940185547, + "learning_rate": 9.770900000000002e-06, + "loss": 1.5514, + "step": 2300 + }, + { + "epoch": 11.794871794871796, + "eval_loss": 1.5879931449890137, + "eval_runtime": 36.3934, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 2300 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 31.737024307250977, + "learning_rate": 9.760900000000002e-06, + "loss": 1.4941, + "step": 2400 + }, + { + "epoch": 12.307692307692308, + "eval_loss": 1.583853006362915, + "eval_runtime": 36.3797, + "eval_samples_per_second": 10.803, + "eval_steps_per_second": 1.374, + "step": 2400 + }, + { + "epoch": 12.820512820512821, + "grad_norm": 45.48268508911133, + "learning_rate": 9.7509e-06, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 12.820512820512821, + "eval_loss": 1.5559026002883911, + "eval_runtime": 36.1605, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 2500 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 27.500398635864258, + "learning_rate": 9.7409e-06, + "loss": 1.5018, + "step": 2600 + }, + { + "epoch": 13.333333333333334, + "eval_loss": 1.5453521013259888, + "eval_runtime": 36.2887, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 2600 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 32.49728775024414, + "learning_rate": 9.7309e-06, + "loss": 1.4804, + "step": 2700 + }, + { + "epoch": 13.846153846153847, + "eval_loss": 1.5424816608428955, + "eval_runtime": 36.359, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 2700 + }, + { + "epoch": 14.35897435897436, + "grad_norm": 38.46280288696289, + "learning_rate": 9.7209e-06, + "loss": 1.4826, + "step": 2800 + }, + { + "epoch": 14.35897435897436, + "eval_loss": 1.5317177772521973, + "eval_runtime": 36.3362, + "eval_samples_per_second": 10.816, + "eval_steps_per_second": 1.376, + "step": 2800 + }, + { + "epoch": 14.871794871794872, + "grad_norm": 16.075960159301758, + "learning_rate": 9.7109e-06, + "loss": 1.4568, + "step": 2900 + }, + { + "epoch": 14.871794871794872, + "eval_loss": 1.5241832733154297, + "eval_runtime": 36.2025, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 2900 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 27.334318161010742, + "learning_rate": 9.7009e-06, + "loss": 1.4176, + "step": 3000 + }, + { + "epoch": 15.384615384615385, + "eval_loss": 1.520580768585205, + "eval_runtime": 36.398, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 3000 + }, + { + "epoch": 15.897435897435898, + "grad_norm": 94.90784454345703, + "learning_rate": 9.6909e-06, + "loss": 1.4681, + "step": 3100 + }, + { + "epoch": 15.897435897435898, + "eval_loss": 1.5268648862838745, + "eval_runtime": 36.0541, + "eval_samples_per_second": 10.9, + "eval_steps_per_second": 1.387, + "step": 3100 + }, + { + "epoch": 16.41025641025641, + "grad_norm": 16.697856903076172, + "learning_rate": 9.6809e-06, + "loss": 1.454, + "step": 3200 + }, + { + "epoch": 16.41025641025641, + "eval_loss": 1.5157753229141235, + "eval_runtime": 36.3172, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 3200 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 54.05553436279297, + "learning_rate": 9.670900000000001e-06, + "loss": 1.4309, + "step": 3300 + }, + { + "epoch": 16.923076923076923, + "eval_loss": 1.516249179840088, + "eval_runtime": 36.2632, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 3300 + }, + { + "epoch": 17.435897435897434, + "grad_norm": 47.010475158691406, + "learning_rate": 9.660900000000001e-06, + "loss": 1.4571, + "step": 3400 + }, + { + "epoch": 17.435897435897434, + "eval_loss": 1.5018247365951538, + "eval_runtime": 36.2118, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 3400 + }, + { + "epoch": 17.94871794871795, + "grad_norm": 52.865718841552734, + "learning_rate": 9.650900000000001e-06, + "loss": 1.4168, + "step": 3500 + }, + { + "epoch": 17.94871794871795, + "eval_loss": 1.4993616342544556, + "eval_runtime": 36.2572, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 3500 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 30.117380142211914, + "learning_rate": 9.640900000000001e-06, + "loss": 1.4275, + "step": 3600 + }, + { + "epoch": 18.46153846153846, + "eval_loss": 1.4899998903274536, + "eval_runtime": 36.4696, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 3600 + }, + { + "epoch": 18.974358974358974, + "grad_norm": 31.10028076171875, + "learning_rate": 9.630900000000001e-06, + "loss": 1.4148, + "step": 3700 + }, + { + "epoch": 18.974358974358974, + "eval_loss": 1.5231629610061646, + "eval_runtime": 36.4604, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 3700 + }, + { + "epoch": 19.487179487179485, + "grad_norm": 44.06697082519531, + "learning_rate": 9.620900000000001e-06, + "loss": 1.4057, + "step": 3800 + }, + { + "epoch": 19.487179487179485, + "eval_loss": 1.4841217994689941, + "eval_runtime": 36.3382, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 3800 + }, + { + "epoch": 20.0, + "grad_norm": 53.86429214477539, + "learning_rate": 9.610900000000001e-06, + "loss": 1.4302, + "step": 3900 + }, + { + "epoch": 20.0, + "eval_loss": 1.477772831916809, + "eval_runtime": 36.1794, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 3900 + }, + { + "epoch": 20.51282051282051, + "grad_norm": 80.95457458496094, + "learning_rate": 9.600900000000002e-06, + "loss": 1.4076, + "step": 4000 + }, + { + "epoch": 20.51282051282051, + "eval_loss": 1.4769134521484375, + "eval_runtime": 36.4725, + "eval_samples_per_second": 10.775, + "eval_steps_per_second": 1.371, + "step": 4000 + }, + { + "epoch": 21.025641025641026, + "grad_norm": 32.276214599609375, + "learning_rate": 9.5909e-06, + "loss": 1.3868, + "step": 4100 + }, + { + "epoch": 21.025641025641026, + "eval_loss": 1.463292121887207, + "eval_runtime": 36.5192, + "eval_samples_per_second": 10.761, + "eval_steps_per_second": 1.369, + "step": 4100 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 54.65959167480469, + "learning_rate": 9.5809e-06, + "loss": 1.3795, + "step": 4200 + }, + { + "epoch": 21.53846153846154, + "eval_loss": 1.4630039930343628, + "eval_runtime": 36.5288, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 4200 + }, + { + "epoch": 22.05128205128205, + "grad_norm": 42.31818389892578, + "learning_rate": 9.5709e-06, + "loss": 1.3787, + "step": 4300 + }, + { + "epoch": 22.05128205128205, + "eval_loss": 1.4471133947372437, + "eval_runtime": 36.2949, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 4300 + }, + { + "epoch": 22.564102564102566, + "grad_norm": 34.44257736206055, + "learning_rate": 9.5609e-06, + "loss": 1.4027, + "step": 4400 + }, + { + "epoch": 22.564102564102566, + "eval_loss": 1.4606964588165283, + "eval_runtime": 36.5672, + "eval_samples_per_second": 10.747, + "eval_steps_per_second": 1.367, + "step": 4400 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 42.65989303588867, + "learning_rate": 9.5509e-06, + "loss": 1.3459, + "step": 4500 + }, + { + "epoch": 23.076923076923077, + "eval_loss": 1.454709768295288, + "eval_runtime": 37.2024, + "eval_samples_per_second": 10.564, + "eval_steps_per_second": 1.344, + "step": 4500 + }, + { + "epoch": 23.58974358974359, + "grad_norm": 35.11396789550781, + "learning_rate": 9.5409e-06, + "loss": 1.3367, + "step": 4600 + }, + { + "epoch": 23.58974358974359, + "eval_loss": 1.4562979936599731, + "eval_runtime": 36.4579, + "eval_samples_per_second": 10.78, + "eval_steps_per_second": 1.371, + "step": 4600 + }, + { + "epoch": 24.102564102564102, + "grad_norm": 32.71805953979492, + "learning_rate": 9.5309e-06, + "loss": 1.3575, + "step": 4700 + }, + { + "epoch": 24.102564102564102, + "eval_loss": 1.4620144367218018, + "eval_runtime": 36.4055, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 4700 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 28.839757919311523, + "learning_rate": 9.5209e-06, + "loss": 1.3549, + "step": 4800 + }, + { + "epoch": 24.615384615384617, + "eval_loss": 1.4431304931640625, + "eval_runtime": 36.5027, + "eval_samples_per_second": 10.766, + "eval_steps_per_second": 1.37, + "step": 4800 + }, + { + "epoch": 25.128205128205128, + "grad_norm": 45.3994140625, + "learning_rate": 9.5109e-06, + "loss": 1.3885, + "step": 4900 + }, + { + "epoch": 25.128205128205128, + "eval_loss": 1.4312200546264648, + "eval_runtime": 36.3039, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 4900 + }, + { + "epoch": 25.641025641025642, + "grad_norm": 22.972829818725586, + "learning_rate": 9.5009e-06, + "loss": 1.3469, + "step": 5000 + }, + { + "epoch": 25.641025641025642, + "eval_loss": 1.416171669960022, + "eval_runtime": 36.6695, + "eval_samples_per_second": 10.717, + "eval_steps_per_second": 1.364, + "step": 5000 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 77.40106964111328, + "learning_rate": 9.490900000000001e-06, + "loss": 1.3363, + "step": 5100 + }, + { + "epoch": 26.153846153846153, + "eval_loss": 1.4090278148651123, + "eval_runtime": 36.2573, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 5100 + }, + { + "epoch": 26.666666666666668, + "grad_norm": 29.757932662963867, + "learning_rate": 9.480900000000001e-06, + "loss": 1.3183, + "step": 5200 + }, + { + "epoch": 26.666666666666668, + "eval_loss": 1.4073749780654907, + "eval_runtime": 36.2439, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.38, + "step": 5200 + }, + { + "epoch": 27.17948717948718, + "grad_norm": 56.78797149658203, + "learning_rate": 9.470900000000001e-06, + "loss": 1.3568, + "step": 5300 + }, + { + "epoch": 27.17948717948718, + "eval_loss": 1.4153741598129272, + "eval_runtime": 36.2756, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 5300 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 62.353477478027344, + "learning_rate": 9.460900000000001e-06, + "loss": 1.3304, + "step": 5400 + }, + { + "epoch": 27.692307692307693, + "eval_loss": 1.4334921836853027, + "eval_runtime": 36.2626, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 5400 + }, + { + "epoch": 28.205128205128204, + "grad_norm": 100.7852554321289, + "learning_rate": 9.450900000000001e-06, + "loss": 1.2897, + "step": 5500 + }, + { + "epoch": 28.205128205128204, + "eval_loss": 1.4160270690917969, + "eval_runtime": 36.6139, + "eval_samples_per_second": 10.734, + "eval_steps_per_second": 1.366, + "step": 5500 + }, + { + "epoch": 28.71794871794872, + "grad_norm": 62.06657409667969, + "learning_rate": 9.440900000000001e-06, + "loss": 1.3233, + "step": 5600 + }, + { + "epoch": 28.71794871794872, + "eval_loss": 1.431317687034607, + "eval_runtime": 36.5341, + "eval_samples_per_second": 10.757, + "eval_steps_per_second": 1.369, + "step": 5600 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 32.661346435546875, + "learning_rate": 9.4309e-06, + "loss": 1.305, + "step": 5700 + }, + { + "epoch": 29.23076923076923, + "eval_loss": 1.3954827785491943, + "eval_runtime": 36.5903, + "eval_samples_per_second": 10.741, + "eval_steps_per_second": 1.366, + "step": 5700 + }, + { + "epoch": 29.743589743589745, + "grad_norm": 25.690454483032227, + "learning_rate": 9.421000000000002e-06, + "loss": 1.2961, + "step": 5800 + }, + { + "epoch": 29.743589743589745, + "eval_loss": 1.4036046266555786, + "eval_runtime": 36.5935, + "eval_samples_per_second": 10.74, + "eval_steps_per_second": 1.366, + "step": 5800 + }, + { + "epoch": 30.256410256410255, + "grad_norm": 45.45426940917969, + "learning_rate": 9.411000000000002e-06, + "loss": 1.3175, + "step": 5900 + }, + { + "epoch": 30.256410256410255, + "eval_loss": 1.3845292329788208, + "eval_runtime": 36.5594, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 5900 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 41.62439727783203, + "learning_rate": 9.401000000000002e-06, + "loss": 1.3242, + "step": 6000 + }, + { + "epoch": 30.76923076923077, + "eval_loss": 1.3939634561538696, + "eval_runtime": 36.4895, + "eval_samples_per_second": 10.77, + "eval_steps_per_second": 1.37, + "step": 6000 + }, + { + "epoch": 31.28205128205128, + "grad_norm": 26.999319076538086, + "learning_rate": 9.391e-06, + "loss": 1.2886, + "step": 6100 + }, + { + "epoch": 31.28205128205128, + "eval_loss": 1.3804558515548706, + "eval_runtime": 36.2599, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 6100 + }, + { + "epoch": 31.794871794871796, + "grad_norm": 24.70287322998047, + "learning_rate": 9.381e-06, + "loss": 1.2893, + "step": 6200 + }, + { + "epoch": 31.794871794871796, + "eval_loss": 1.3821990489959717, + "eval_runtime": 36.2613, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 6200 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 41.910606384277344, + "learning_rate": 9.371e-06, + "loss": 1.3093, + "step": 6300 + }, + { + "epoch": 32.30769230769231, + "eval_loss": 1.3875064849853516, + "eval_runtime": 36.4998, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 6300 + }, + { + "epoch": 32.82051282051282, + "grad_norm": 82.20216369628906, + "learning_rate": 9.361e-06, + "loss": 1.3184, + "step": 6400 + }, + { + "epoch": 32.82051282051282, + "eval_loss": 1.3840612173080444, + "eval_runtime": 36.4787, + "eval_samples_per_second": 10.773, + "eval_steps_per_second": 1.371, + "step": 6400 + }, + { + "epoch": 33.333333333333336, + "grad_norm": 797.53271484375, + "learning_rate": 9.351e-06, + "loss": 1.2939, + "step": 6500 + }, + { + "epoch": 33.333333333333336, + "eval_loss": 1.3881950378417969, + "eval_runtime": 36.4166, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 6500 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 35.14693069458008, + "learning_rate": 9.341000000000001e-06, + "loss": 1.2881, + "step": 6600 + }, + { + "epoch": 33.84615384615385, + "eval_loss": 1.4039666652679443, + "eval_runtime": 36.4141, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 6600 + }, + { + "epoch": 34.35897435897436, + "grad_norm": 38.676666259765625, + "learning_rate": 9.331000000000001e-06, + "loss": 1.2699, + "step": 6700 + }, + { + "epoch": 34.35897435897436, + "eval_loss": 1.3800386190414429, + "eval_runtime": 36.2367, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 6700 + }, + { + "epoch": 34.87179487179487, + "grad_norm": 26.051170349121094, + "learning_rate": 9.321000000000001e-06, + "loss": 1.3079, + "step": 6800 + }, + { + "epoch": 34.87179487179487, + "eval_loss": 1.3784704208374023, + "eval_runtime": 36.3119, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 6800 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 42.667236328125, + "learning_rate": 9.311000000000001e-06, + "loss": 1.2622, + "step": 6900 + }, + { + "epoch": 35.38461538461539, + "eval_loss": 1.3637058734893799, + "eval_runtime": 36.1116, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 6900 + }, + { + "epoch": 35.8974358974359, + "grad_norm": 38.78388977050781, + "learning_rate": 9.301000000000001e-06, + "loss": 1.2652, + "step": 7000 + }, + { + "epoch": 35.8974358974359, + "eval_loss": 1.3452589511871338, + "eval_runtime": 36.2593, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 7000 + }, + { + "epoch": 36.41025641025641, + "grad_norm": 44.43967056274414, + "learning_rate": 9.291000000000001e-06, + "loss": 1.2378, + "step": 7100 + }, + { + "epoch": 36.41025641025641, + "eval_loss": 1.3494073152542114, + "eval_runtime": 35.4402, + "eval_samples_per_second": 11.089, + "eval_steps_per_second": 1.411, + "step": 7100 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 49.02131271362305, + "learning_rate": 9.281000000000001e-06, + "loss": 1.2932, + "step": 7200 + }, + { + "epoch": 36.92307692307692, + "eval_loss": 1.3460158109664917, + "eval_runtime": 35.9361, + "eval_samples_per_second": 10.936, + "eval_steps_per_second": 1.391, + "step": 7200 + }, + { + "epoch": 37.43589743589744, + "grad_norm": 28.279098510742188, + "learning_rate": 9.271000000000002e-06, + "loss": 1.2598, + "step": 7300 + }, + { + "epoch": 37.43589743589744, + "eval_loss": 1.36253023147583, + "eval_runtime": 36.2944, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 7300 + }, + { + "epoch": 37.94871794871795, + "grad_norm": 35.21017074584961, + "learning_rate": 9.261000000000002e-06, + "loss": 1.2703, + "step": 7400 + }, + { + "epoch": 37.94871794871795, + "eval_loss": 1.3509865999221802, + "eval_runtime": 36.2661, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 7400 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 51.673316955566406, + "learning_rate": 9.251000000000002e-06, + "loss": 1.2393, + "step": 7500 + }, + { + "epoch": 38.46153846153846, + "eval_loss": 1.3402855396270752, + "eval_runtime": 36.392, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 7500 + }, + { + "epoch": 38.97435897435897, + "grad_norm": 53.73936462402344, + "learning_rate": 9.241000000000002e-06, + "loss": 1.2577, + "step": 7600 + }, + { + "epoch": 38.97435897435897, + "eval_loss": 1.3487578630447388, + "eval_runtime": 36.2119, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 7600 + }, + { + "epoch": 39.48717948717949, + "grad_norm": 55.994686126708984, + "learning_rate": 9.231000000000002e-06, + "loss": 1.229, + "step": 7700 + }, + { + "epoch": 39.48717948717949, + "eval_loss": 1.340031623840332, + "eval_runtime": 36.2063, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 7700 + }, + { + "epoch": 40.0, + "grad_norm": 86.7531509399414, + "learning_rate": 9.221e-06, + "loss": 1.2941, + "step": 7800 + }, + { + "epoch": 40.0, + "eval_loss": 1.3422337770462036, + "eval_runtime": 36.7462, + "eval_samples_per_second": 10.695, + "eval_steps_per_second": 1.361, + "step": 7800 + }, + { + "epoch": 40.51282051282051, + "grad_norm": 60.86371612548828, + "learning_rate": 9.211e-06, + "loss": 1.2423, + "step": 7900 + }, + { + "epoch": 40.51282051282051, + "eval_loss": 1.3336257934570312, + "eval_runtime": 36.2441, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.38, + "step": 7900 + }, + { + "epoch": 41.02564102564103, + "grad_norm": 28.535411834716797, + "learning_rate": 9.2011e-06, + "loss": 1.2676, + "step": 8000 + }, + { + "epoch": 41.02564102564103, + "eval_loss": 1.338461995124817, + "eval_runtime": 36.2212, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 8000 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 35.707183837890625, + "learning_rate": 9.1911e-06, + "loss": 1.2428, + "step": 8100 + }, + { + "epoch": 41.53846153846154, + "eval_loss": 1.3225666284561157, + "eval_runtime": 36.2043, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 8100 + }, + { + "epoch": 42.05128205128205, + "grad_norm": 29.23111343383789, + "learning_rate": 9.181100000000001e-06, + "loss": 1.2269, + "step": 8200 + }, + { + "epoch": 42.05128205128205, + "eval_loss": 1.3405094146728516, + "eval_runtime": 36.2498, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 8200 + }, + { + "epoch": 42.56410256410256, + "grad_norm": 20.379304885864258, + "learning_rate": 9.171100000000001e-06, + "loss": 1.2187, + "step": 8300 + }, + { + "epoch": 42.56410256410256, + "eval_loss": 1.3247309923171997, + "eval_runtime": 36.3237, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.377, + "step": 8300 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 44.43791198730469, + "learning_rate": 9.161100000000001e-06, + "loss": 1.2321, + "step": 8400 + }, + { + "epoch": 43.07692307692308, + "eval_loss": 1.334086298942566, + "eval_runtime": 36.2039, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 8400 + }, + { + "epoch": 43.58974358974359, + "grad_norm": 30.97890853881836, + "learning_rate": 9.151100000000001e-06, + "loss": 1.2071, + "step": 8500 + }, + { + "epoch": 43.58974358974359, + "eval_loss": 1.3306576013565063, + "eval_runtime": 36.3547, + "eval_samples_per_second": 10.81, + "eval_steps_per_second": 1.375, + "step": 8500 + }, + { + "epoch": 44.1025641025641, + "grad_norm": 40.07706832885742, + "learning_rate": 9.141100000000001e-06, + "loss": 1.25, + "step": 8600 + }, + { + "epoch": 44.1025641025641, + "eval_loss": 1.3270679712295532, + "eval_runtime": 36.1754, + "eval_samples_per_second": 10.864, + "eval_steps_per_second": 1.382, + "step": 8600 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 27.011186599731445, + "learning_rate": 9.1311e-06, + "loss": 1.1968, + "step": 8700 + }, + { + "epoch": 44.61538461538461, + "eval_loss": 1.3117327690124512, + "eval_runtime": 36.1727, + "eval_samples_per_second": 10.865, + "eval_steps_per_second": 1.382, + "step": 8700 + }, + { + "epoch": 45.12820512820513, + "grad_norm": 25.976228713989258, + "learning_rate": 9.1211e-06, + "loss": 1.2492, + "step": 8800 + }, + { + "epoch": 45.12820512820513, + "eval_loss": 1.3281300067901611, + "eval_runtime": 36.2531, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 8800 + }, + { + "epoch": 45.64102564102564, + "grad_norm": 21.215715408325195, + "learning_rate": 9.1111e-06, + "loss": 1.221, + "step": 8900 + }, + { + "epoch": 45.64102564102564, + "eval_loss": 1.3373734951019287, + "eval_runtime": 36.2242, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 8900 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 48.8258171081543, + "learning_rate": 9.1011e-06, + "loss": 1.2123, + "step": 9000 + }, + { + "epoch": 46.15384615384615, + "eval_loss": 1.3303853273391724, + "eval_runtime": 36.4019, + "eval_samples_per_second": 10.796, + "eval_steps_per_second": 1.374, + "step": 9000 + }, + { + "epoch": 46.666666666666664, + "grad_norm": 36.76605224609375, + "learning_rate": 9.0911e-06, + "loss": 1.1951, + "step": 9100 + }, + { + "epoch": 46.666666666666664, + "eval_loss": 1.3182373046875, + "eval_runtime": 36.387, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 9100 + }, + { + "epoch": 47.17948717948718, + "grad_norm": 40.79771423339844, + "learning_rate": 9.0811e-06, + "loss": 1.2155, + "step": 9200 + }, + { + "epoch": 47.17948717948718, + "eval_loss": 1.3303160667419434, + "eval_runtime": 36.2265, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 9200 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 48.06431579589844, + "learning_rate": 9.0711e-06, + "loss": 1.2236, + "step": 9300 + }, + { + "epoch": 47.69230769230769, + "eval_loss": 1.3128286600112915, + "eval_runtime": 36.3665, + "eval_samples_per_second": 10.807, + "eval_steps_per_second": 1.375, + "step": 9300 + }, + { + "epoch": 48.205128205128204, + "grad_norm": 31.19647216796875, + "learning_rate": 9.0611e-06, + "loss": 1.2033, + "step": 9400 + }, + { + "epoch": 48.205128205128204, + "eval_loss": 1.3134888410568237, + "eval_runtime": 36.5129, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 9400 + }, + { + "epoch": 48.717948717948715, + "grad_norm": 20.11866569519043, + "learning_rate": 9.0511e-06, + "loss": 1.1955, + "step": 9500 + }, + { + "epoch": 48.717948717948715, + "eval_loss": 1.3154560327529907, + "eval_runtime": 36.6023, + "eval_samples_per_second": 10.737, + "eval_steps_per_second": 1.366, + "step": 9500 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 36.3143424987793, + "learning_rate": 9.0411e-06, + "loss": 1.2067, + "step": 9600 + }, + { + "epoch": 49.23076923076923, + "eval_loss": 1.3158589601516724, + "eval_runtime": 36.4851, + "eval_samples_per_second": 10.772, + "eval_steps_per_second": 1.37, + "step": 9600 + }, + { + "epoch": 49.743589743589745, + "grad_norm": 48.41688537597656, + "learning_rate": 9.0311e-06, + "loss": 1.2295, + "step": 9700 + }, + { + "epoch": 49.743589743589745, + "eval_loss": 1.306788682937622, + "eval_runtime": 36.291, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 9700 + }, + { + "epoch": 50.256410256410255, + "grad_norm": 26.129995346069336, + "learning_rate": 9.0211e-06, + "loss": 1.1809, + "step": 9800 + }, + { + "epoch": 50.256410256410255, + "eval_loss": 1.3299375772476196, + "eval_runtime": 36.4651, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 9800 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 19.543821334838867, + "learning_rate": 9.011100000000001e-06, + "loss": 1.2179, + "step": 9900 + }, + { + "epoch": 50.76923076923077, + "eval_loss": 1.317457675933838, + "eval_runtime": 36.2864, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 9900 + }, + { + "epoch": 51.282051282051285, + "grad_norm": 25.619775772094727, + "learning_rate": 9.001100000000001e-06, + "loss": 1.1653, + "step": 10000 + }, + { + "epoch": 51.282051282051285, + "eval_loss": 1.31196928024292, + "eval_runtime": 36.3263, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 10000 + }, + { + "epoch": 51.794871794871796, + "grad_norm": 45.30315017700195, + "learning_rate": 8.991100000000001e-06, + "loss": 1.2391, + "step": 10100 + }, + { + "epoch": 51.794871794871796, + "eval_loss": 1.305709719657898, + "eval_runtime": 36.2103, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 10100 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 26.942337036132812, + "learning_rate": 8.981100000000001e-06, + "loss": 1.2195, + "step": 10200 + }, + { + "epoch": 52.30769230769231, + "eval_loss": 1.3068941831588745, + "eval_runtime": 36.2565, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 10200 + }, + { + "epoch": 52.82051282051282, + "grad_norm": 26.20073890686035, + "learning_rate": 8.9711e-06, + "loss": 1.1639, + "step": 10300 + }, + { + "epoch": 52.82051282051282, + "eval_loss": 1.3013452291488647, + "eval_runtime": 36.2146, + "eval_samples_per_second": 10.852, + "eval_steps_per_second": 1.381, + "step": 10300 + }, + { + "epoch": 53.333333333333336, + "grad_norm": 41.40350341796875, + "learning_rate": 8.9611e-06, + "loss": 1.2033, + "step": 10400 + }, + { + "epoch": 53.333333333333336, + "eval_loss": 1.305737853050232, + "eval_runtime": 36.4865, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 10400 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 28.133567810058594, + "learning_rate": 8.9511e-06, + "loss": 1.1906, + "step": 10500 + }, + { + "epoch": 53.84615384615385, + "eval_loss": 1.2961195707321167, + "eval_runtime": 36.2734, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 10500 + }, + { + "epoch": 54.35897435897436, + "grad_norm": 44.07390213012695, + "learning_rate": 8.9411e-06, + "loss": 1.1899, + "step": 10600 + }, + { + "epoch": 54.35897435897436, + "eval_loss": 1.3024916648864746, + "eval_runtime": 36.4774, + "eval_samples_per_second": 10.774, + "eval_steps_per_second": 1.371, + "step": 10600 + }, + { + "epoch": 54.87179487179487, + "grad_norm": 19.120830535888672, + "learning_rate": 8.9311e-06, + "loss": 1.1697, + "step": 10700 + }, + { + "epoch": 54.87179487179487, + "eval_loss": 1.3056432008743286, + "eval_runtime": 36.2367, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 10700 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 52.376529693603516, + "learning_rate": 8.9211e-06, + "loss": 1.1759, + "step": 10800 + }, + { + "epoch": 55.38461538461539, + "eval_loss": 1.3018929958343506, + "eval_runtime": 36.1478, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 10800 + }, + { + "epoch": 55.8974358974359, + "grad_norm": 41.84946060180664, + "learning_rate": 8.9112e-06, + "loss": 1.1973, + "step": 10900 + }, + { + "epoch": 55.8974358974359, + "eval_loss": 1.3166084289550781, + "eval_runtime": 36.4967, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 10900 + }, + { + "epoch": 56.41025641025641, + "grad_norm": 48.97800064086914, + "learning_rate": 8.9012e-06, + "loss": 1.1942, + "step": 11000 + }, + { + "epoch": 56.41025641025641, + "eval_loss": 1.3040730953216553, + "eval_runtime": 36.3391, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 11000 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 24.18547821044922, + "learning_rate": 8.8912e-06, + "loss": 1.1544, + "step": 11100 + }, + { + "epoch": 56.92307692307692, + "eval_loss": 1.2837135791778564, + "eval_runtime": 36.2839, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 11100 + }, + { + "epoch": 57.43589743589744, + "grad_norm": 34.69540023803711, + "learning_rate": 8.8812e-06, + "loss": 1.1998, + "step": 11200 + }, + { + "epoch": 57.43589743589744, + "eval_loss": 1.2983756065368652, + "eval_runtime": 36.2024, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 11200 + }, + { + "epoch": 57.94871794871795, + "grad_norm": 30.074583053588867, + "learning_rate": 8.8712e-06, + "loss": 1.1352, + "step": 11300 + }, + { + "epoch": 57.94871794871795, + "eval_loss": 1.2913649082183838, + "eval_runtime": 36.0977, + "eval_samples_per_second": 10.887, + "eval_steps_per_second": 1.385, + "step": 11300 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 26.75031852722168, + "learning_rate": 8.8612e-06, + "loss": 1.1728, + "step": 11400 + }, + { + "epoch": 58.46153846153846, + "eval_loss": 1.288116216659546, + "eval_runtime": 36.2588, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 11400 + }, + { + "epoch": 58.97435897435897, + "grad_norm": 49.548213958740234, + "learning_rate": 8.851200000000001e-06, + "loss": 1.1738, + "step": 11500 + }, + { + "epoch": 58.97435897435897, + "eval_loss": 1.2846206426620483, + "eval_runtime": 36.5245, + "eval_samples_per_second": 10.76, + "eval_steps_per_second": 1.369, + "step": 11500 + }, + { + "epoch": 59.48717948717949, + "grad_norm": 23.007057189941406, + "learning_rate": 8.841200000000001e-06, + "loss": 1.1501, + "step": 11600 + }, + { + "epoch": 59.48717948717949, + "eval_loss": 1.297021746635437, + "eval_runtime": 36.2678, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 11600 + }, + { + "epoch": 60.0, + "grad_norm": 39.79067611694336, + "learning_rate": 8.831200000000001e-06, + "loss": 1.1836, + "step": 11700 + }, + { + "epoch": 60.0, + "eval_loss": 1.2865136861801147, + "eval_runtime": 36.1127, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 11700 + }, + { + "epoch": 60.51282051282051, + "grad_norm": 24.281373977661133, + "learning_rate": 8.821200000000001e-06, + "loss": 1.1548, + "step": 11800 + }, + { + "epoch": 60.51282051282051, + "eval_loss": 1.2812024354934692, + "eval_runtime": 36.2399, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 11800 + }, + { + "epoch": 61.02564102564103, + "grad_norm": 30.851072311401367, + "learning_rate": 8.811200000000001e-06, + "loss": 1.1794, + "step": 11900 + }, + { + "epoch": 61.02564102564103, + "eval_loss": 1.2902381420135498, + "eval_runtime": 36.1264, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 11900 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 44.42039108276367, + "learning_rate": 8.801200000000001e-06, + "loss": 1.1385, + "step": 12000 + }, + { + "epoch": 61.53846153846154, + "eval_loss": 1.2793415784835815, + "eval_runtime": 36.266, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 12000 + }, + { + "epoch": 62.05128205128205, + "grad_norm": 57.410274505615234, + "learning_rate": 8.791200000000001e-06, + "loss": 1.1697, + "step": 12100 + }, + { + "epoch": 62.05128205128205, + "eval_loss": 1.2847199440002441, + "eval_runtime": 36.1783, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 12100 + }, + { + "epoch": 62.56410256410256, + "grad_norm": 70.70729064941406, + "learning_rate": 8.781200000000002e-06, + "loss": 1.1518, + "step": 12200 + }, + { + "epoch": 62.56410256410256, + "eval_loss": 1.2760446071624756, + "eval_runtime": 36.0527, + "eval_samples_per_second": 10.901, + "eval_steps_per_second": 1.387, + "step": 12200 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 32.417388916015625, + "learning_rate": 8.7712e-06, + "loss": 1.1677, + "step": 12300 + }, + { + "epoch": 63.07692307692308, + "eval_loss": 1.2847411632537842, + "eval_runtime": 36.3923, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 12300 + }, + { + "epoch": 63.58974358974359, + "grad_norm": 24.372791290283203, + "learning_rate": 8.7612e-06, + "loss": 1.1433, + "step": 12400 + }, + { + "epoch": 63.58974358974359, + "eval_loss": 1.2779407501220703, + "eval_runtime": 36.5015, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 12400 + }, + { + "epoch": 64.1025641025641, + "grad_norm": 19.632272720336914, + "learning_rate": 8.7512e-06, + "loss": 1.1607, + "step": 12500 + }, + { + "epoch": 64.1025641025641, + "eval_loss": 1.2792208194732666, + "eval_runtime": 36.4617, + "eval_samples_per_second": 10.778, + "eval_steps_per_second": 1.371, + "step": 12500 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 34.54841613769531, + "learning_rate": 8.7412e-06, + "loss": 1.1371, + "step": 12600 + }, + { + "epoch": 64.61538461538461, + "eval_loss": 1.2620294094085693, + "eval_runtime": 36.4969, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 12600 + }, + { + "epoch": 65.12820512820512, + "grad_norm": 19.67386817932129, + "learning_rate": 8.7312e-06, + "loss": 1.1332, + "step": 12700 + }, + { + "epoch": 65.12820512820512, + "eval_loss": 1.2682890892028809, + "eval_runtime": 36.6309, + "eval_samples_per_second": 10.729, + "eval_steps_per_second": 1.365, + "step": 12700 + }, + { + "epoch": 65.64102564102564, + "grad_norm": 65.07030487060547, + "learning_rate": 8.7212e-06, + "loss": 1.1571, + "step": 12800 + }, + { + "epoch": 65.64102564102564, + "eval_loss": 1.2490720748901367, + "eval_runtime": 36.3056, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 12800 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 20.384132385253906, + "learning_rate": 8.7112e-06, + "loss": 1.1619, + "step": 12900 + }, + { + "epoch": 66.15384615384616, + "eval_loss": 1.2465028762817383, + "eval_runtime": 36.1678, + "eval_samples_per_second": 10.866, + "eval_steps_per_second": 1.382, + "step": 12900 + }, + { + "epoch": 66.66666666666667, + "grad_norm": 36.320674896240234, + "learning_rate": 8.7012e-06, + "loss": 1.1176, + "step": 13000 + }, + { + "epoch": 66.66666666666667, + "eval_loss": 1.2594362497329712, + "eval_runtime": 36.3117, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 13000 + }, + { + "epoch": 67.17948717948718, + "grad_norm": 30.79526138305664, + "learning_rate": 8.6912e-06, + "loss": 1.1311, + "step": 13100 + }, + { + "epoch": 67.17948717948718, + "eval_loss": 1.2553967237472534, + "eval_runtime": 36.2667, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 13100 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 40.0444450378418, + "learning_rate": 8.6812e-06, + "loss": 1.165, + "step": 13200 + }, + { + "epoch": 67.6923076923077, + "eval_loss": 1.2607430219650269, + "eval_runtime": 36.339, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 13200 + }, + { + "epoch": 68.2051282051282, + "grad_norm": 25.53687286376953, + "learning_rate": 8.671200000000001e-06, + "loss": 1.1334, + "step": 13300 + }, + { + "epoch": 68.2051282051282, + "eval_loss": 1.2592159509658813, + "eval_runtime": 36.2847, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 13300 + }, + { + "epoch": 68.71794871794872, + "grad_norm": 28.198373794555664, + "learning_rate": 8.661200000000001e-06, + "loss": 1.1481, + "step": 13400 + }, + { + "epoch": 68.71794871794872, + "eval_loss": 1.2755882740020752, + "eval_runtime": 36.6238, + "eval_samples_per_second": 10.731, + "eval_steps_per_second": 1.365, + "step": 13400 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 44.77776336669922, + "learning_rate": 8.651200000000001e-06, + "loss": 1.1138, + "step": 13500 + }, + { + "epoch": 69.23076923076923, + "eval_loss": 1.259446144104004, + "eval_runtime": 36.4048, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 13500 + }, + { + "epoch": 69.74358974358974, + "grad_norm": 73.0951156616211, + "learning_rate": 8.641200000000001e-06, + "loss": 1.149, + "step": 13600 + }, + { + "epoch": 69.74358974358974, + "eval_loss": 1.2759552001953125, + "eval_runtime": 35.9924, + "eval_samples_per_second": 10.919, + "eval_steps_per_second": 1.389, + "step": 13600 + }, + { + "epoch": 70.25641025641026, + "grad_norm": 43.95732116699219, + "learning_rate": 8.631200000000001e-06, + "loss": 1.15, + "step": 13700 + }, + { + "epoch": 70.25641025641026, + "eval_loss": 1.2675697803497314, + "eval_runtime": 36.2305, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 13700 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 28.56161117553711, + "learning_rate": 8.621200000000001e-06, + "loss": 1.1065, + "step": 13800 + }, + { + "epoch": 70.76923076923077, + "eval_loss": 1.256949782371521, + "eval_runtime": 36.2833, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 13800 + }, + { + "epoch": 71.28205128205128, + "grad_norm": 80.90894317626953, + "learning_rate": 8.611200000000002e-06, + "loss": 1.1111, + "step": 13900 + }, + { + "epoch": 71.28205128205128, + "eval_loss": 1.2672077417373657, + "eval_runtime": 36.2401, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 13900 + }, + { + "epoch": 71.7948717948718, + "grad_norm": 15.566130638122559, + "learning_rate": 8.6012e-06, + "loss": 1.1487, + "step": 14000 + }, + { + "epoch": 71.7948717948718, + "eval_loss": 1.2434508800506592, + "eval_runtime": 36.3731, + "eval_samples_per_second": 10.805, + "eval_steps_per_second": 1.375, + "step": 14000 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 41.25931930541992, + "learning_rate": 8.5912e-06, + "loss": 1.1357, + "step": 14100 + }, + { + "epoch": 72.3076923076923, + "eval_loss": 1.25618314743042, + "eval_runtime": 36.1143, + "eval_samples_per_second": 10.882, + "eval_steps_per_second": 1.384, + "step": 14100 + }, + { + "epoch": 72.82051282051282, + "grad_norm": 24.271724700927734, + "learning_rate": 8.5812e-06, + "loss": 1.1039, + "step": 14200 + }, + { + "epoch": 72.82051282051282, + "eval_loss": 1.2586956024169922, + "eval_runtime": 36.2558, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 14200 + }, + { + "epoch": 73.33333333333333, + "grad_norm": 23.148712158203125, + "learning_rate": 8.5713e-06, + "loss": 1.1332, + "step": 14300 + }, + { + "epoch": 73.33333333333333, + "eval_loss": 1.241268515586853, + "eval_runtime": 36.617, + "eval_samples_per_second": 10.733, + "eval_steps_per_second": 1.365, + "step": 14300 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 39.42832946777344, + "learning_rate": 8.5613e-06, + "loss": 1.1276, + "step": 14400 + }, + { + "epoch": 73.84615384615384, + "eval_loss": 1.261016607284546, + "eval_runtime": 36.2412, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 14400 + }, + { + "epoch": 74.35897435897436, + "grad_norm": 41.2990837097168, + "learning_rate": 8.5513e-06, + "loss": 1.1259, + "step": 14500 + }, + { + "epoch": 74.35897435897436, + "eval_loss": 1.254787802696228, + "eval_runtime": 36.1616, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 14500 + }, + { + "epoch": 74.87179487179488, + "grad_norm": 75.0270767211914, + "learning_rate": 8.5413e-06, + "loss": 1.0919, + "step": 14600 + }, + { + "epoch": 74.87179487179488, + "eval_loss": 1.2456586360931396, + "eval_runtime": 36.3316, + "eval_samples_per_second": 10.817, + "eval_steps_per_second": 1.376, + "step": 14600 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 23.520156860351562, + "learning_rate": 8.5313e-06, + "loss": 1.1415, + "step": 14700 + }, + { + "epoch": 75.38461538461539, + "eval_loss": 1.260399580001831, + "eval_runtime": 36.3274, + "eval_samples_per_second": 10.818, + "eval_steps_per_second": 1.376, + "step": 14700 + }, + { + "epoch": 75.8974358974359, + "grad_norm": 23.925405502319336, + "learning_rate": 8.521300000000001e-06, + "loss": 1.1435, + "step": 14800 + }, + { + "epoch": 75.8974358974359, + "eval_loss": 1.2679468393325806, + "eval_runtime": 36.2707, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.379, + "step": 14800 + }, + { + "epoch": 76.41025641025641, + "grad_norm": 19.826759338378906, + "learning_rate": 8.511300000000001e-06, + "loss": 1.1034, + "step": 14900 + }, + { + "epoch": 76.41025641025641, + "eval_loss": 1.260425090789795, + "eval_runtime": 36.2564, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 14900 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 36.41432189941406, + "learning_rate": 8.501300000000001e-06, + "loss": 1.1181, + "step": 15000 + }, + { + "epoch": 76.92307692307692, + "eval_loss": 1.275189757347107, + "eval_runtime": 36.226, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 15000 + }, + { + "epoch": 77.43589743589743, + "grad_norm": 56.50348663330078, + "learning_rate": 8.491300000000001e-06, + "loss": 1.117, + "step": 15100 + }, + { + "epoch": 77.43589743589743, + "eval_loss": 1.2597932815551758, + "eval_runtime": 36.1736, + "eval_samples_per_second": 10.864, + "eval_steps_per_second": 1.382, + "step": 15100 + }, + { + "epoch": 77.94871794871794, + "grad_norm": 16.227319717407227, + "learning_rate": 8.481300000000001e-06, + "loss": 1.1287, + "step": 15200 + }, + { + "epoch": 77.94871794871794, + "eval_loss": 1.2599974870681763, + "eval_runtime": 36.2653, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 15200 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 34.07974624633789, + "learning_rate": 8.471300000000001e-06, + "loss": 1.1484, + "step": 15300 + }, + { + "epoch": 78.46153846153847, + "eval_loss": 1.2516891956329346, + "eval_runtime": 36.47, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 15300 + }, + { + "epoch": 78.97435897435898, + "grad_norm": 48.17190933227539, + "learning_rate": 8.461300000000001e-06, + "loss": 1.0917, + "step": 15400 + }, + { + "epoch": 78.97435897435898, + "eval_loss": 1.244437336921692, + "eval_runtime": 36.2847, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 15400 + }, + { + "epoch": 79.48717948717949, + "grad_norm": 34.02452087402344, + "learning_rate": 8.451300000000002e-06, + "loss": 1.0924, + "step": 15500 + }, + { + "epoch": 79.48717948717949, + "eval_loss": 1.2553346157073975, + "eval_runtime": 36.3446, + "eval_samples_per_second": 10.813, + "eval_steps_per_second": 1.376, + "step": 15500 + }, + { + "epoch": 80.0, + "grad_norm": 12.990086555480957, + "learning_rate": 8.441300000000002e-06, + "loss": 1.1319, + "step": 15600 + }, + { + "epoch": 80.0, + "eval_loss": 1.2469161748886108, + "eval_runtime": 36.3089, + "eval_samples_per_second": 10.824, + "eval_steps_per_second": 1.377, + "step": 15600 + }, + { + "epoch": 80.51282051282051, + "grad_norm": 31.65691375732422, + "learning_rate": 8.431300000000002e-06, + "loss": 1.12, + "step": 15700 + }, + { + "epoch": 80.51282051282051, + "eval_loss": 1.2401645183563232, + "eval_runtime": 36.2741, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 15700 + }, + { + "epoch": 81.02564102564102, + "grad_norm": 21.566389083862305, + "learning_rate": 8.421300000000002e-06, + "loss": 1.1089, + "step": 15800 + }, + { + "epoch": 81.02564102564102, + "eval_loss": 1.2469390630722046, + "eval_runtime": 36.3979, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 15800 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 27.354368209838867, + "learning_rate": 8.411300000000002e-06, + "loss": 1.1259, + "step": 15900 + }, + { + "epoch": 81.53846153846153, + "eval_loss": 1.2625091075897217, + "eval_runtime": 36.6689, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 15900 + }, + { + "epoch": 82.05128205128206, + "grad_norm": 29.60841178894043, + "learning_rate": 8.4013e-06, + "loss": 1.0668, + "step": 16000 + }, + { + "epoch": 82.05128205128206, + "eval_loss": 1.2272534370422363, + "eval_runtime": 36.5139, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 16000 + }, + { + "epoch": 82.56410256410257, + "grad_norm": 19.469263076782227, + "learning_rate": 8.3913e-06, + "loss": 1.1236, + "step": 16100 + }, + { + "epoch": 82.56410256410257, + "eval_loss": 1.232078194618225, + "eval_runtime": 36.3175, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 16100 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 40.06100082397461, + "learning_rate": 8.3813e-06, + "loss": 1.0685, + "step": 16200 + }, + { + "epoch": 83.07692307692308, + "eval_loss": 1.254823088645935, + "eval_runtime": 36.5367, + "eval_samples_per_second": 10.756, + "eval_steps_per_second": 1.368, + "step": 16200 + }, + { + "epoch": 83.58974358974359, + "grad_norm": 25.563325881958008, + "learning_rate": 8.3713e-06, + "loss": 1.0911, + "step": 16300 + }, + { + "epoch": 83.58974358974359, + "eval_loss": 1.2462764978408813, + "eval_runtime": 36.1602, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 16300 + }, + { + "epoch": 84.1025641025641, + "grad_norm": 41.810020446777344, + "learning_rate": 8.3613e-06, + "loss": 1.1009, + "step": 16400 + }, + { + "epoch": 84.1025641025641, + "eval_loss": 1.2400792837142944, + "eval_runtime": 36.2464, + "eval_samples_per_second": 10.842, + "eval_steps_per_second": 1.379, + "step": 16400 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 40.818111419677734, + "learning_rate": 8.3513e-06, + "loss": 1.0857, + "step": 16500 + }, + { + "epoch": 84.61538461538461, + "eval_loss": 1.2257955074310303, + "eval_runtime": 36.2933, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 16500 + }, + { + "epoch": 85.12820512820512, + "grad_norm": 33.36876678466797, + "learning_rate": 8.341300000000001e-06, + "loss": 1.1033, + "step": 16600 + }, + { + "epoch": 85.12820512820512, + "eval_loss": 1.2494174242019653, + "eval_runtime": 36.2089, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 16600 + }, + { + "epoch": 85.64102564102564, + "grad_norm": 41.88273620605469, + "learning_rate": 8.331300000000001e-06, + "loss": 1.0674, + "step": 16700 + }, + { + "epoch": 85.64102564102564, + "eval_loss": 1.2431418895721436, + "eval_runtime": 36.3141, + "eval_samples_per_second": 10.822, + "eval_steps_per_second": 1.377, + "step": 16700 + }, + { + "epoch": 86.15384615384616, + "grad_norm": 26.26712989807129, + "learning_rate": 8.321300000000001e-06, + "loss": 1.1154, + "step": 16800 + }, + { + "epoch": 86.15384615384616, + "eval_loss": 1.2349519729614258, + "eval_runtime": 36.3687, + "eval_samples_per_second": 10.806, + "eval_steps_per_second": 1.375, + "step": 16800 + }, + { + "epoch": 86.66666666666667, + "grad_norm": 28.719078063964844, + "learning_rate": 8.311300000000001e-06, + "loss": 1.0821, + "step": 16900 + }, + { + "epoch": 86.66666666666667, + "eval_loss": 1.228055715560913, + "eval_runtime": 36.3697, + "eval_samples_per_second": 10.806, + "eval_steps_per_second": 1.375, + "step": 16900 + }, + { + "epoch": 87.17948717948718, + "grad_norm": 35.35209655761719, + "learning_rate": 8.301300000000001e-06, + "loss": 1.0829, + "step": 17000 + }, + { + "epoch": 87.17948717948718, + "eval_loss": 1.241512417793274, + "eval_runtime": 36.4655, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 17000 + }, + { + "epoch": 87.6923076923077, + "grad_norm": 24.084636688232422, + "learning_rate": 8.291300000000001e-06, + "loss": 1.0926, + "step": 17100 + }, + { + "epoch": 87.6923076923077, + "eval_loss": 1.227530598640442, + "eval_runtime": 36.183, + "eval_samples_per_second": 10.861, + "eval_steps_per_second": 1.382, + "step": 17100 + }, + { + "epoch": 88.2051282051282, + "grad_norm": 35.995765686035156, + "learning_rate": 8.281300000000002e-06, + "loss": 1.076, + "step": 17200 + }, + { + "epoch": 88.2051282051282, + "eval_loss": 1.231921911239624, + "eval_runtime": 36.1958, + "eval_samples_per_second": 10.858, + "eval_steps_per_second": 1.381, + "step": 17200 + }, + { + "epoch": 88.71794871794872, + "grad_norm": 23.116085052490234, + "learning_rate": 8.271300000000002e-06, + "loss": 1.0993, + "step": 17300 + }, + { + "epoch": 88.71794871794872, + "eval_loss": 1.242146611213684, + "eval_runtime": 36.2666, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 17300 + }, + { + "epoch": 89.23076923076923, + "grad_norm": 18.24385643005371, + "learning_rate": 8.261300000000002e-06, + "loss": 1.1213, + "step": 17400 + }, + { + "epoch": 89.23076923076923, + "eval_loss": 1.230137825012207, + "eval_runtime": 36.153, + "eval_samples_per_second": 10.87, + "eval_steps_per_second": 1.383, + "step": 17400 + }, + { + "epoch": 89.74358974358974, + "grad_norm": 44.21913146972656, + "learning_rate": 8.251300000000002e-06, + "loss": 1.045, + "step": 17500 + }, + { + "epoch": 89.74358974358974, + "eval_loss": 1.2343608140945435, + "eval_runtime": 36.3254, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 17500 + }, + { + "epoch": 90.25641025641026, + "grad_norm": 29.37706756591797, + "learning_rate": 8.2413e-06, + "loss": 1.0805, + "step": 17600 + }, + { + "epoch": 90.25641025641026, + "eval_loss": 1.2186076641082764, + "eval_runtime": 36.3582, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 17600 + }, + { + "epoch": 90.76923076923077, + "grad_norm": 40.11149597167969, + "learning_rate": 8.2313e-06, + "loss": 1.0732, + "step": 17700 + }, + { + "epoch": 90.76923076923077, + "eval_loss": 1.2361598014831543, + "eval_runtime": 36.4168, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 17700 + }, + { + "epoch": 91.28205128205128, + "grad_norm": 33.00440216064453, + "learning_rate": 8.2213e-06, + "loss": 1.0912, + "step": 17800 + }, + { + "epoch": 91.28205128205128, + "eval_loss": 1.2296538352966309, + "eval_runtime": 36.2928, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 17800 + }, + { + "epoch": 91.7948717948718, + "grad_norm": 30.941469192504883, + "learning_rate": 8.2113e-06, + "loss": 1.064, + "step": 17900 + }, + { + "epoch": 91.7948717948718, + "eval_loss": 1.250794529914856, + "eval_runtime": 36.4692, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 17900 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 41.63932800292969, + "learning_rate": 8.2013e-06, + "loss": 1.0529, + "step": 18000 + }, + { + "epoch": 92.3076923076923, + "eval_loss": 1.2209473848342896, + "eval_runtime": 36.5494, + "eval_samples_per_second": 10.753, + "eval_steps_per_second": 1.368, + "step": 18000 + }, + { + "epoch": 92.82051282051282, + "grad_norm": 34.083587646484375, + "learning_rate": 8.1913e-06, + "loss": 1.0849, + "step": 18100 + }, + { + "epoch": 92.82051282051282, + "eval_loss": 1.2245945930480957, + "eval_runtime": 36.3755, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 18100 + }, + { + "epoch": 93.33333333333333, + "grad_norm": 33.740848541259766, + "learning_rate": 8.1813e-06, + "loss": 1.0853, + "step": 18200 + }, + { + "epoch": 93.33333333333333, + "eval_loss": 1.2368453741073608, + "eval_runtime": 36.1346, + "eval_samples_per_second": 10.876, + "eval_steps_per_second": 1.384, + "step": 18200 + }, + { + "epoch": 93.84615384615384, + "grad_norm": 22.13953971862793, + "learning_rate": 8.171300000000001e-06, + "loss": 1.09, + "step": 18300 + }, + { + "epoch": 93.84615384615384, + "eval_loss": 1.2331533432006836, + "eval_runtime": 36.4043, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 18300 + }, + { + "epoch": 94.35897435897436, + "grad_norm": 45.70988082885742, + "learning_rate": 8.161300000000001e-06, + "loss": 1.0543, + "step": 18400 + }, + { + "epoch": 94.35897435897436, + "eval_loss": 1.216800570487976, + "eval_runtime": 36.5884, + "eval_samples_per_second": 10.741, + "eval_steps_per_second": 1.367, + "step": 18400 + }, + { + "epoch": 94.87179487179488, + "grad_norm": 38.62083435058594, + "learning_rate": 8.151300000000001e-06, + "loss": 1.09, + "step": 18500 + }, + { + "epoch": 94.87179487179488, + "eval_loss": 1.24717378616333, + "eval_runtime": 36.2986, + "eval_samples_per_second": 10.827, + "eval_steps_per_second": 1.377, + "step": 18500 + }, + { + "epoch": 95.38461538461539, + "grad_norm": 40.52507400512695, + "learning_rate": 8.141300000000001e-06, + "loss": 1.1019, + "step": 18600 + }, + { + "epoch": 95.38461538461539, + "eval_loss": 1.2494611740112305, + "eval_runtime": 36.3553, + "eval_samples_per_second": 10.81, + "eval_steps_per_second": 1.375, + "step": 18600 + }, + { + "epoch": 95.8974358974359, + "grad_norm": 70.1895523071289, + "learning_rate": 8.131300000000001e-06, + "loss": 1.0711, + "step": 18700 + }, + { + "epoch": 95.8974358974359, + "eval_loss": 1.2522521018981934, + "eval_runtime": 36.1421, + "eval_samples_per_second": 10.874, + "eval_steps_per_second": 1.383, + "step": 18700 + }, + { + "epoch": 96.41025641025641, + "grad_norm": 45.69275665283203, + "learning_rate": 8.121300000000001e-06, + "loss": 1.1066, + "step": 18800 + }, + { + "epoch": 96.41025641025641, + "eval_loss": 1.250695824623108, + "eval_runtime": 36.2669, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 18800 + }, + { + "epoch": 96.92307692307692, + "grad_norm": 23.61644744873047, + "learning_rate": 8.111300000000001e-06, + "loss": 1.0967, + "step": 18900 + }, + { + "epoch": 96.92307692307692, + "eval_loss": 1.2277597188949585, + "eval_runtime": 36.3005, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 18900 + }, + { + "epoch": 97.43589743589743, + "grad_norm": 21.942943572998047, + "learning_rate": 8.101300000000002e-06, + "loss": 1.0704, + "step": 19000 + }, + { + "epoch": 97.43589743589743, + "eval_loss": 1.2279075384140015, + "eval_runtime": 36.189, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 19000 + }, + { + "epoch": 97.94871794871794, + "grad_norm": 36.2983512878418, + "learning_rate": 8.091300000000002e-06, + "loss": 1.0719, + "step": 19100 + }, + { + "epoch": 97.94871794871794, + "eval_loss": 1.2093193531036377, + "eval_runtime": 36.5953, + "eval_samples_per_second": 10.739, + "eval_steps_per_second": 1.366, + "step": 19100 + }, + { + "epoch": 98.46153846153847, + "grad_norm": 73.0156021118164, + "learning_rate": 8.0813e-06, + "loss": 1.0538, + "step": 19200 + }, + { + "epoch": 98.46153846153847, + "eval_loss": 1.2311538457870483, + "eval_runtime": 36.4102, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 19200 + }, + { + "epoch": 98.97435897435898, + "grad_norm": 51.309017181396484, + "learning_rate": 8.0713e-06, + "loss": 1.0818, + "step": 19300 + }, + { + "epoch": 98.97435897435898, + "eval_loss": 1.2250592708587646, + "eval_runtime": 36.4117, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 19300 + }, + { + "epoch": 99.48717948717949, + "grad_norm": 15.101311683654785, + "learning_rate": 8.0613e-06, + "loss": 1.0656, + "step": 19400 + }, + { + "epoch": 99.48717948717949, + "eval_loss": 1.233995795249939, + "eval_runtime": 36.3917, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 19400 + }, + { + "epoch": 100.0, + "grad_norm": 39.63221740722656, + "learning_rate": 8.0513e-06, + "loss": 1.0716, + "step": 19500 + }, + { + "epoch": 100.0, + "eval_loss": 1.2169172763824463, + "eval_runtime": 36.466, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 19500 + }, + { + "epoch": 100.51282051282051, + "grad_norm": 28.275875091552734, + "learning_rate": 8.0413e-06, + "loss": 1.0863, + "step": 19600 + }, + { + "epoch": 100.51282051282051, + "eval_loss": 1.235645055770874, + "eval_runtime": 36.4873, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 19600 + }, + { + "epoch": 101.02564102564102, + "grad_norm": 46.1825065612793, + "learning_rate": 8.0313e-06, + "loss": 1.0254, + "step": 19700 + }, + { + "epoch": 101.02564102564102, + "eval_loss": 1.2021634578704834, + "eval_runtime": 36.3574, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 19700 + }, + { + "epoch": 101.53846153846153, + "grad_norm": 33.101219177246094, + "learning_rate": 8.0213e-06, + "loss": 1.0802, + "step": 19800 + }, + { + "epoch": 101.53846153846153, + "eval_loss": 1.2263848781585693, + "eval_runtime": 36.4102, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 19800 + }, + { + "epoch": 102.05128205128206, + "grad_norm": 28.070240020751953, + "learning_rate": 8.0113e-06, + "loss": 1.0209, + "step": 19900 + }, + { + "epoch": 102.05128205128206, + "eval_loss": 1.2010384798049927, + "eval_runtime": 36.5083, + "eval_samples_per_second": 10.765, + "eval_steps_per_second": 1.37, + "step": 19900 + }, + { + "epoch": 102.56410256410257, + "grad_norm": 32.505916595458984, + "learning_rate": 8.0013e-06, + "loss": 1.0738, + "step": 20000 + }, + { + "epoch": 102.56410256410257, + "eval_loss": 1.1892540454864502, + "eval_runtime": 36.8658, + "eval_samples_per_second": 10.66, + "eval_steps_per_second": 1.356, + "step": 20000 + }, + { + "epoch": 103.07692307692308, + "grad_norm": 116.78060150146484, + "learning_rate": 7.991300000000001e-06, + "loss": 1.0417, + "step": 20100 + }, + { + "epoch": 103.07692307692308, + "eval_loss": 1.211946964263916, + "eval_runtime": 36.0941, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 20100 + }, + { + "epoch": 103.58974358974359, + "grad_norm": 47.81857681274414, + "learning_rate": 7.981300000000001e-06, + "loss": 1.0576, + "step": 20200 + }, + { + "epoch": 103.58974358974359, + "eval_loss": 1.2092243432998657, + "eval_runtime": 36.2906, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 20200 + }, + { + "epoch": 104.1025641025641, + "grad_norm": 44.64072036743164, + "learning_rate": 7.971300000000001e-06, + "loss": 1.053, + "step": 20300 + }, + { + "epoch": 104.1025641025641, + "eval_loss": 1.2181479930877686, + "eval_runtime": 36.2566, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 20300 + }, + { + "epoch": 104.61538461538461, + "grad_norm": 25.469749450683594, + "learning_rate": 7.961300000000001e-06, + "loss": 1.0532, + "step": 20400 + }, + { + "epoch": 104.61538461538461, + "eval_loss": 1.2095298767089844, + "eval_runtime": 36.1952, + "eval_samples_per_second": 10.858, + "eval_steps_per_second": 1.381, + "step": 20400 + }, + { + "epoch": 105.12820512820512, + "grad_norm": 18.33926773071289, + "learning_rate": 7.951300000000001e-06, + "loss": 1.0778, + "step": 20500 + }, + { + "epoch": 105.12820512820512, + "eval_loss": 1.2227920293807983, + "eval_runtime": 36.3372, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 20500 + }, + { + "epoch": 105.64102564102564, + "grad_norm": 67.40721130371094, + "learning_rate": 7.941300000000001e-06, + "loss": 1.0777, + "step": 20600 + }, + { + "epoch": 105.64102564102564, + "eval_loss": 1.2083336114883423, + "eval_runtime": 36.3182, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 20600 + }, + { + "epoch": 106.15384615384616, + "grad_norm": 24.97282600402832, + "learning_rate": 7.931300000000001e-06, + "loss": 1.0512, + "step": 20700 + }, + { + "epoch": 106.15384615384616, + "eval_loss": 1.1997463703155518, + "eval_runtime": 36.3993, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 20700 + }, + { + "epoch": 106.66666666666667, + "grad_norm": 56.40156173706055, + "learning_rate": 7.9213e-06, + "loss": 1.0712, + "step": 20800 + }, + { + "epoch": 106.66666666666667, + "eval_loss": 1.2138112783432007, + "eval_runtime": 35.549, + "eval_samples_per_second": 11.055, + "eval_steps_per_second": 1.407, + "step": 20800 + }, + { + "epoch": 107.17948717948718, + "grad_norm": 28.606220245361328, + "learning_rate": 7.9113e-06, + "loss": 1.0497, + "step": 20900 + }, + { + "epoch": 107.17948717948718, + "eval_loss": 1.2097153663635254, + "eval_runtime": 36.5101, + "eval_samples_per_second": 10.764, + "eval_steps_per_second": 1.369, + "step": 20900 + }, + { + "epoch": 107.6923076923077, + "grad_norm": 32.19448471069336, + "learning_rate": 7.9013e-06, + "loss": 1.0383, + "step": 21000 + }, + { + "epoch": 107.6923076923077, + "eval_loss": 1.2029515504837036, + "eval_runtime": 36.5264, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 21000 + }, + { + "epoch": 108.2051282051282, + "grad_norm": 14.996597290039062, + "learning_rate": 7.8913e-06, + "loss": 1.0571, + "step": 21100 + }, + { + "epoch": 108.2051282051282, + "eval_loss": 1.2292343378067017, + "eval_runtime": 36.1323, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 21100 + }, + { + "epoch": 108.71794871794872, + "grad_norm": 15.809494018554688, + "learning_rate": 7.8814e-06, + "loss": 1.0725, + "step": 21200 + }, + { + "epoch": 108.71794871794872, + "eval_loss": 1.2223467826843262, + "eval_runtime": 36.1509, + "eval_samples_per_second": 10.871, + "eval_steps_per_second": 1.383, + "step": 21200 + }, + { + "epoch": 109.23076923076923, + "grad_norm": 55.448875427246094, + "learning_rate": 7.8714e-06, + "loss": 1.0401, + "step": 21300 + }, + { + "epoch": 109.23076923076923, + "eval_loss": 1.2152210474014282, + "eval_runtime": 36.4536, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 21300 + }, + { + "epoch": 109.74358974358974, + "grad_norm": 47.51198959350586, + "learning_rate": 7.8614e-06, + "loss": 1.0232, + "step": 21400 + }, + { + "epoch": 109.74358974358974, + "eval_loss": 1.2074100971221924, + "eval_runtime": 36.2117, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 21400 + }, + { + "epoch": 110.25641025641026, + "grad_norm": 22.616479873657227, + "learning_rate": 7.8514e-06, + "loss": 1.0837, + "step": 21500 + }, + { + "epoch": 110.25641025641026, + "eval_loss": 1.2034764289855957, + "eval_runtime": 36.2909, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 21500 + }, + { + "epoch": 110.76923076923077, + "grad_norm": 40.14712905883789, + "learning_rate": 7.841400000000001e-06, + "loss": 1.044, + "step": 21600 + }, + { + "epoch": 110.76923076923077, + "eval_loss": 1.1942527294158936, + "eval_runtime": 36.2254, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 21600 + }, + { + "epoch": 111.28205128205128, + "grad_norm": 89.68008422851562, + "learning_rate": 7.831400000000001e-06, + "loss": 1.0301, + "step": 21700 + }, + { + "epoch": 111.28205128205128, + "eval_loss": 1.2042152881622314, + "eval_runtime": 36.2099, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 21700 + }, + { + "epoch": 111.7948717948718, + "grad_norm": 40.1873664855957, + "learning_rate": 7.8214e-06, + "loss": 1.0513, + "step": 21800 + }, + { + "epoch": 111.7948717948718, + "eval_loss": 1.201180100440979, + "eval_runtime": 36.1821, + "eval_samples_per_second": 10.862, + "eval_steps_per_second": 1.382, + "step": 21800 + }, + { + "epoch": 112.3076923076923, + "grad_norm": 30.849868774414062, + "learning_rate": 7.8114e-06, + "loss": 1.0514, + "step": 21900 + }, + { + "epoch": 112.3076923076923, + "eval_loss": 1.2128338813781738, + "eval_runtime": 36.2828, + "eval_samples_per_second": 10.832, + "eval_steps_per_second": 1.378, + "step": 21900 + }, + { + "epoch": 112.82051282051282, + "grad_norm": 31.983945846557617, + "learning_rate": 7.8014e-06, + "loss": 1.0288, + "step": 22000 + }, + { + "epoch": 112.82051282051282, + "eval_loss": 1.194412350654602, + "eval_runtime": 36.4995, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 22000 + }, + { + "epoch": 113.33333333333333, + "grad_norm": 27.589929580688477, + "learning_rate": 7.791400000000001e-06, + "loss": 1.0131, + "step": 22100 + }, + { + "epoch": 113.33333333333333, + "eval_loss": 1.2023619413375854, + "eval_runtime": 36.2873, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 22100 + }, + { + "epoch": 113.84615384615384, + "grad_norm": 98.88153076171875, + "learning_rate": 7.781400000000001e-06, + "loss": 1.0648, + "step": 22200 + }, + { + "epoch": 113.84615384615384, + "eval_loss": 1.1987013816833496, + "eval_runtime": 36.3531, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 22200 + }, + { + "epoch": 114.35897435897436, + "grad_norm": 32.19725036621094, + "learning_rate": 7.771400000000002e-06, + "loss": 1.0401, + "step": 22300 + }, + { + "epoch": 114.35897435897436, + "eval_loss": 1.2003728151321411, + "eval_runtime": 36.294, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 22300 + }, + { + "epoch": 114.87179487179488, + "grad_norm": 29.627470016479492, + "learning_rate": 7.761400000000002e-06, + "loss": 1.0638, + "step": 22400 + }, + { + "epoch": 114.87179487179488, + "eval_loss": 1.2163525819778442, + "eval_runtime": 36.4332, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 22400 + }, + { + "epoch": 115.38461538461539, + "grad_norm": 38.58709716796875, + "learning_rate": 7.751400000000002e-06, + "loss": 1.0307, + "step": 22500 + }, + { + "epoch": 115.38461538461539, + "eval_loss": 1.199642300605774, + "eval_runtime": 36.3023, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 22500 + }, + { + "epoch": 115.8974358974359, + "grad_norm": 28.66075897216797, + "learning_rate": 7.741400000000002e-06, + "loss": 1.0276, + "step": 22600 + }, + { + "epoch": 115.8974358974359, + "eval_loss": 1.213975429534912, + "eval_runtime": 36.2211, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 22600 + }, + { + "epoch": 116.41025641025641, + "grad_norm": 39.12740707397461, + "learning_rate": 7.731400000000002e-06, + "loss": 1.0163, + "step": 22700 + }, + { + "epoch": 116.41025641025641, + "eval_loss": 1.2231982946395874, + "eval_runtime": 36.2424, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 22700 + }, + { + "epoch": 116.92307692307692, + "grad_norm": 46.517051696777344, + "learning_rate": 7.7214e-06, + "loss": 1.0463, + "step": 22800 + }, + { + "epoch": 116.92307692307692, + "eval_loss": 1.183910846710205, + "eval_runtime": 36.4609, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 22800 + }, + { + "epoch": 117.43589743589743, + "grad_norm": 49.0311393737793, + "learning_rate": 7.7114e-06, + "loss": 1.0236, + "step": 22900 + }, + { + "epoch": 117.43589743589743, + "eval_loss": 1.209437608718872, + "eval_runtime": 36.3858, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 22900 + }, + { + "epoch": 117.94871794871794, + "grad_norm": 25.398332595825195, + "learning_rate": 7.7014e-06, + "loss": 1.07, + "step": 23000 + }, + { + "epoch": 117.94871794871794, + "eval_loss": 1.2132450342178345, + "eval_runtime": 36.4343, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 23000 + }, + { + "epoch": 118.46153846153847, + "grad_norm": 49.871639251708984, + "learning_rate": 7.6914e-06, + "loss": 0.9935, + "step": 23100 + }, + { + "epoch": 118.46153846153847, + "eval_loss": 1.2235440015792847, + "eval_runtime": 36.3755, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 23100 + }, + { + "epoch": 118.97435897435898, + "grad_norm": 66.67717742919922, + "learning_rate": 7.6814e-06, + "loss": 1.0672, + "step": 23200 + }, + { + "epoch": 118.97435897435898, + "eval_loss": 1.1966772079467773, + "eval_runtime": 36.3466, + "eval_samples_per_second": 10.813, + "eval_steps_per_second": 1.376, + "step": 23200 + }, + { + "epoch": 119.48717948717949, + "grad_norm": 43.98142623901367, + "learning_rate": 7.6714e-06, + "loss": 1.019, + "step": 23300 + }, + { + "epoch": 119.48717948717949, + "eval_loss": 1.1935210227966309, + "eval_runtime": 36.4031, + "eval_samples_per_second": 10.796, + "eval_steps_per_second": 1.374, + "step": 23300 + }, + { + "epoch": 120.0, + "grad_norm": 29.781999588012695, + "learning_rate": 7.661400000000001e-06, + "loss": 1.0456, + "step": 23400 + }, + { + "epoch": 120.0, + "eval_loss": 1.19161856174469, + "eval_runtime": 36.6677, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 23400 + }, + { + "epoch": 120.51282051282051, + "grad_norm": 38.971858978271484, + "learning_rate": 7.651400000000001e-06, + "loss": 1.0488, + "step": 23500 + }, + { + "epoch": 120.51282051282051, + "eval_loss": 1.2005729675292969, + "eval_runtime": 36.8304, + "eval_samples_per_second": 10.671, + "eval_steps_per_second": 1.358, + "step": 23500 + }, + { + "epoch": 121.02564102564102, + "grad_norm": 37.437660217285156, + "learning_rate": 7.641400000000001e-06, + "loss": 1.0196, + "step": 23600 + }, + { + "epoch": 121.02564102564102, + "eval_loss": 1.1815942525863647, + "eval_runtime": 36.5578, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 23600 + }, + { + "epoch": 121.53846153846153, + "grad_norm": 45.006248474121094, + "learning_rate": 7.631500000000001e-06, + "loss": 1.0247, + "step": 23700 + }, + { + "epoch": 121.53846153846153, + "eval_loss": 1.1985987424850464, + "eval_runtime": 36.9519, + "eval_samples_per_second": 10.635, + "eval_steps_per_second": 1.353, + "step": 23700 + }, + { + "epoch": 122.05128205128206, + "grad_norm": 28.17504119873047, + "learning_rate": 7.621500000000001e-06, + "loss": 1.0278, + "step": 23800 + }, + { + "epoch": 122.05128205128206, + "eval_loss": 1.1994553804397583, + "eval_runtime": 36.7025, + "eval_samples_per_second": 10.708, + "eval_steps_per_second": 1.362, + "step": 23800 + }, + { + "epoch": 122.56410256410257, + "grad_norm": 36.315181732177734, + "learning_rate": 7.6116e-06, + "loss": 1.0073, + "step": 23900 + }, + { + "epoch": 122.56410256410257, + "eval_loss": 1.201116919517517, + "eval_runtime": 36.7114, + "eval_samples_per_second": 10.705, + "eval_steps_per_second": 1.362, + "step": 23900 + }, + { + "epoch": 123.07692307692308, + "grad_norm": 23.902212142944336, + "learning_rate": 7.6016e-06, + "loss": 1.0238, + "step": 24000 + }, + { + "epoch": 123.07692307692308, + "eval_loss": 1.2095754146575928, + "eval_runtime": 36.3531, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 24000 + }, + { + "epoch": 123.58974358974359, + "grad_norm": 29.814146041870117, + "learning_rate": 7.5916e-06, + "loss": 0.9958, + "step": 24100 + }, + { + "epoch": 123.58974358974359, + "eval_loss": 1.1929839849472046, + "eval_runtime": 36.7481, + "eval_samples_per_second": 10.694, + "eval_steps_per_second": 1.361, + "step": 24100 + }, + { + "epoch": 124.1025641025641, + "grad_norm": 52.22597885131836, + "learning_rate": 7.5816e-06, + "loss": 1.0466, + "step": 24200 + }, + { + "epoch": 124.1025641025641, + "eval_loss": 1.189598798751831, + "eval_runtime": 36.3774, + "eval_samples_per_second": 10.803, + "eval_steps_per_second": 1.374, + "step": 24200 + }, + { + "epoch": 124.61538461538461, + "grad_norm": 23.292490005493164, + "learning_rate": 7.571600000000001e-06, + "loss": 1.0451, + "step": 24300 + }, + { + "epoch": 124.61538461538461, + "eval_loss": 1.2092961072921753, + "eval_runtime": 36.3753, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 24300 + }, + { + "epoch": 125.12820512820512, + "grad_norm": 42.846275329589844, + "learning_rate": 7.5616000000000014e-06, + "loss": 1.0122, + "step": 24400 + }, + { + "epoch": 125.12820512820512, + "eval_loss": 1.1754584312438965, + "eval_runtime": 36.676, + "eval_samples_per_second": 10.715, + "eval_steps_per_second": 1.363, + "step": 24400 + }, + { + "epoch": 125.64102564102564, + "grad_norm": 23.48101234436035, + "learning_rate": 7.5516000000000015e-06, + "loss": 1.0127, + "step": 24500 + }, + { + "epoch": 125.64102564102564, + "eval_loss": 1.1745011806488037, + "eval_runtime": 36.5027, + "eval_samples_per_second": 10.766, + "eval_steps_per_second": 1.37, + "step": 24500 + }, + { + "epoch": 126.15384615384616, + "grad_norm": 32.6221809387207, + "learning_rate": 7.541600000000001e-06, + "loss": 1.0416, + "step": 24600 + }, + { + "epoch": 126.15384615384616, + "eval_loss": 1.1841143369674683, + "eval_runtime": 36.5596, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 24600 + }, + { + "epoch": 126.66666666666667, + "grad_norm": 44.3466682434082, + "learning_rate": 7.531600000000001e-06, + "loss": 1.0134, + "step": 24700 + }, + { + "epoch": 126.66666666666667, + "eval_loss": 1.1873056888580322, + "eval_runtime": 36.2722, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 24700 + }, + { + "epoch": 127.17948717948718, + "grad_norm": 35.96662902832031, + "learning_rate": 7.521600000000001e-06, + "loss": 1.0157, + "step": 24800 + }, + { + "epoch": 127.17948717948718, + "eval_loss": 1.1866871118545532, + "eval_runtime": 36.0895, + "eval_samples_per_second": 10.89, + "eval_steps_per_second": 1.385, + "step": 24800 + }, + { + "epoch": 127.6923076923077, + "grad_norm": 26.58826446533203, + "learning_rate": 7.511600000000001e-06, + "loss": 1.0562, + "step": 24900 + }, + { + "epoch": 127.6923076923077, + "eval_loss": 1.1805154085159302, + "eval_runtime": 36.3065, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 24900 + }, + { + "epoch": 128.2051282051282, + "grad_norm": 60.84516906738281, + "learning_rate": 7.501600000000001e-06, + "loss": 0.9951, + "step": 25000 + }, + { + "epoch": 128.2051282051282, + "eval_loss": 1.1816545724868774, + "eval_runtime": 36.491, + "eval_samples_per_second": 10.77, + "eval_steps_per_second": 1.37, + "step": 25000 + }, + { + "epoch": 128.71794871794873, + "grad_norm": 102.91582489013672, + "learning_rate": 7.491600000000001e-06, + "loss": 1.0385, + "step": 25100 + }, + { + "epoch": 128.71794871794873, + "eval_loss": 1.1889567375183105, + "eval_runtime": 36.3865, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 25100 + }, + { + "epoch": 129.23076923076923, + "grad_norm": 33.94929885864258, + "learning_rate": 7.481600000000001e-06, + "loss": 1.0105, + "step": 25200 + }, + { + "epoch": 129.23076923076923, + "eval_loss": 1.1977964639663696, + "eval_runtime": 36.2056, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 25200 + }, + { + "epoch": 129.74358974358975, + "grad_norm": 38.72896957397461, + "learning_rate": 7.4716000000000014e-06, + "loss": 0.983, + "step": 25300 + }, + { + "epoch": 129.74358974358975, + "eval_loss": 1.179408311843872, + "eval_runtime": 36.362, + "eval_samples_per_second": 10.808, + "eval_steps_per_second": 1.375, + "step": 25300 + }, + { + "epoch": 130.25641025641025, + "grad_norm": 26.789093017578125, + "learning_rate": 7.461600000000001e-06, + "loss": 1.0294, + "step": 25400 + }, + { + "epoch": 130.25641025641025, + "eval_loss": 1.18820321559906, + "eval_runtime": 36.265, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 25400 + }, + { + "epoch": 130.76923076923077, + "grad_norm": 39.0494384765625, + "learning_rate": 7.451600000000001e-06, + "loss": 0.9926, + "step": 25500 + }, + { + "epoch": 130.76923076923077, + "eval_loss": 1.1823586225509644, + "eval_runtime": 36.4492, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 25500 + }, + { + "epoch": 131.28205128205127, + "grad_norm": 24.54622459411621, + "learning_rate": 7.441600000000001e-06, + "loss": 0.9796, + "step": 25600 + }, + { + "epoch": 131.28205128205127, + "eval_loss": 1.176493525505066, + "eval_runtime": 36.3072, + "eval_samples_per_second": 10.824, + "eval_steps_per_second": 1.377, + "step": 25600 + }, + { + "epoch": 131.7948717948718, + "grad_norm": 47.54056930541992, + "learning_rate": 7.431600000000001e-06, + "loss": 1.0261, + "step": 25700 + }, + { + "epoch": 131.7948717948718, + "eval_loss": 1.189279317855835, + "eval_runtime": 36.2551, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 25700 + }, + { + "epoch": 132.30769230769232, + "grad_norm": 173.0323028564453, + "learning_rate": 7.421600000000001e-06, + "loss": 1.0701, + "step": 25800 + }, + { + "epoch": 132.30769230769232, + "eval_loss": 1.1980550289154053, + "eval_runtime": 36.5, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 25800 + }, + { + "epoch": 132.82051282051282, + "grad_norm": 20.522178649902344, + "learning_rate": 7.411600000000001e-06, + "loss": 0.9994, + "step": 25900 + }, + { + "epoch": 132.82051282051282, + "eval_loss": 1.1794248819351196, + "eval_runtime": 36.1653, + "eval_samples_per_second": 10.867, + "eval_steps_per_second": 1.383, + "step": 25900 + }, + { + "epoch": 133.33333333333334, + "grad_norm": 58.22213363647461, + "learning_rate": 7.401600000000001e-06, + "loss": 0.9745, + "step": 26000 + }, + { + "epoch": 133.33333333333334, + "eval_loss": 1.185817837715149, + "eval_runtime": 36.5417, + "eval_samples_per_second": 10.755, + "eval_steps_per_second": 1.368, + "step": 26000 + }, + { + "epoch": 133.84615384615384, + "grad_norm": 42.3514518737793, + "learning_rate": 7.391600000000001e-06, + "loss": 1.0218, + "step": 26100 + }, + { + "epoch": 133.84615384615384, + "eval_loss": 1.1769567728042603, + "eval_runtime": 36.3485, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 26100 + }, + { + "epoch": 134.35897435897436, + "grad_norm": 21.729629516601562, + "learning_rate": 7.381600000000001e-06, + "loss": 1.0041, + "step": 26200 + }, + { + "epoch": 134.35897435897436, + "eval_loss": 1.1934856176376343, + "eval_runtime": 36.513, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 26200 + }, + { + "epoch": 134.87179487179486, + "grad_norm": 31.473798751831055, + "learning_rate": 7.371600000000001e-06, + "loss": 1.0194, + "step": 26300 + }, + { + "epoch": 134.87179487179486, + "eval_loss": 1.176289677619934, + "eval_runtime": 36.4474, + "eval_samples_per_second": 10.783, + "eval_steps_per_second": 1.372, + "step": 26300 + }, + { + "epoch": 135.3846153846154, + "grad_norm": 47.508140563964844, + "learning_rate": 7.361600000000001e-06, + "loss": 0.9971, + "step": 26400 + }, + { + "epoch": 135.3846153846154, + "eval_loss": 1.1870988607406616, + "eval_runtime": 36.3471, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 26400 + }, + { + "epoch": 135.89743589743588, + "grad_norm": 28.1704158782959, + "learning_rate": 7.351600000000001e-06, + "loss": 0.9994, + "step": 26500 + }, + { + "epoch": 135.89743589743588, + "eval_loss": 1.1857614517211914, + "eval_runtime": 36.4793, + "eval_samples_per_second": 10.773, + "eval_steps_per_second": 1.371, + "step": 26500 + }, + { + "epoch": 136.4102564102564, + "grad_norm": 21.236408233642578, + "learning_rate": 7.341600000000001e-06, + "loss": 1.0042, + "step": 26600 + }, + { + "epoch": 136.4102564102564, + "eval_loss": 1.183176875114441, + "eval_runtime": 36.6587, + "eval_samples_per_second": 10.721, + "eval_steps_per_second": 1.364, + "step": 26600 + }, + { + "epoch": 136.92307692307693, + "grad_norm": 35.42839813232422, + "learning_rate": 7.331600000000001e-06, + "loss": 1.0016, + "step": 26700 + }, + { + "epoch": 136.92307692307693, + "eval_loss": 1.1894404888153076, + "eval_runtime": 36.3426, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 26700 + }, + { + "epoch": 137.43589743589743, + "grad_norm": 33.13557815551758, + "learning_rate": 7.321600000000001e-06, + "loss": 1.0378, + "step": 26800 + }, + { + "epoch": 137.43589743589743, + "eval_loss": 1.1841119527816772, + "eval_runtime": 36.4508, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 26800 + }, + { + "epoch": 137.94871794871796, + "grad_norm": 31.94637107849121, + "learning_rate": 7.311600000000001e-06, + "loss": 0.9938, + "step": 26900 + }, + { + "epoch": 137.94871794871796, + "eval_loss": 1.1836310625076294, + "eval_runtime": 36.4314, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 26900 + }, + { + "epoch": 138.46153846153845, + "grad_norm": 53.63075637817383, + "learning_rate": 7.3016000000000005e-06, + "loss": 0.9617, + "step": 27000 + }, + { + "epoch": 138.46153846153845, + "eval_loss": 1.1768561601638794, + "eval_runtime": 36.3252, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 27000 + }, + { + "epoch": 138.97435897435898, + "grad_norm": 21.395553588867188, + "learning_rate": 7.291600000000001e-06, + "loss": 1.0385, + "step": 27100 + }, + { + "epoch": 138.97435897435898, + "eval_loss": 1.1800990104675293, + "eval_runtime": 36.2628, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 27100 + }, + { + "epoch": 139.48717948717947, + "grad_norm": 49.950172424316406, + "learning_rate": 7.281600000000001e-06, + "loss": 0.993, + "step": 27200 + }, + { + "epoch": 139.48717948717947, + "eval_loss": 1.1754857301712036, + "eval_runtime": 36.4227, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 27200 + }, + { + "epoch": 140.0, + "grad_norm": 28.073486328125, + "learning_rate": 7.271600000000001e-06, + "loss": 0.9907, + "step": 27300 + }, + { + "epoch": 140.0, + "eval_loss": 1.184246301651001, + "eval_runtime": 36.2844, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 27300 + }, + { + "epoch": 140.51282051282053, + "grad_norm": 24.810503005981445, + "learning_rate": 7.261600000000001e-06, + "loss": 1.0236, + "step": 27400 + }, + { + "epoch": 140.51282051282053, + "eval_loss": 1.1773159503936768, + "eval_runtime": 36.3242, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 27400 + }, + { + "epoch": 141.02564102564102, + "grad_norm": 40.216854095458984, + "learning_rate": 7.251600000000001e-06, + "loss": 0.9461, + "step": 27500 + }, + { + "epoch": 141.02564102564102, + "eval_loss": 1.1806586980819702, + "eval_runtime": 36.2095, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 27500 + }, + { + "epoch": 141.53846153846155, + "grad_norm": 40.43361282348633, + "learning_rate": 7.241600000000001e-06, + "loss": 0.9822, + "step": 27600 + }, + { + "epoch": 141.53846153846155, + "eval_loss": 1.1734962463378906, + "eval_runtime": 36.3354, + "eval_samples_per_second": 10.816, + "eval_steps_per_second": 1.376, + "step": 27600 + }, + { + "epoch": 142.05128205128204, + "grad_norm": 66.24449920654297, + "learning_rate": 7.231600000000001e-06, + "loss": 1.0241, + "step": 27700 + }, + { + "epoch": 142.05128205128204, + "eval_loss": 1.1698683500289917, + "eval_runtime": 36.5375, + "eval_samples_per_second": 10.756, + "eval_steps_per_second": 1.368, + "step": 27700 + }, + { + "epoch": 142.56410256410257, + "grad_norm": 25.957368850708008, + "learning_rate": 7.2216000000000004e-06, + "loss": 0.974, + "step": 27800 + }, + { + "epoch": 142.56410256410257, + "eval_loss": 1.1907858848571777, + "eval_runtime": 36.4875, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 27800 + }, + { + "epoch": 143.07692307692307, + "grad_norm": 27.866281509399414, + "learning_rate": 7.211700000000001e-06, + "loss": 1.0346, + "step": 27900 + }, + { + "epoch": 143.07692307692307, + "eval_loss": 1.1854588985443115, + "eval_runtime": 36.3736, + "eval_samples_per_second": 10.805, + "eval_steps_per_second": 1.375, + "step": 27900 + }, + { + "epoch": 143.5897435897436, + "grad_norm": 35.50307846069336, + "learning_rate": 7.2017e-06, + "loss": 0.9947, + "step": 28000 + }, + { + "epoch": 143.5897435897436, + "eval_loss": 1.1978769302368164, + "eval_runtime": 36.3032, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 28000 + }, + { + "epoch": 144.10256410256412, + "grad_norm": 30.067153930664062, + "learning_rate": 7.1917e-06, + "loss": 0.9954, + "step": 28100 + }, + { + "epoch": 144.10256410256412, + "eval_loss": 1.1732500791549683, + "eval_runtime": 36.1784, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 28100 + }, + { + "epoch": 144.6153846153846, + "grad_norm": 40.56125259399414, + "learning_rate": 7.1817e-06, + "loss": 0.9954, + "step": 28200 + }, + { + "epoch": 144.6153846153846, + "eval_loss": 1.178220510482788, + "eval_runtime": 36.2456, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 28200 + }, + { + "epoch": 145.12820512820514, + "grad_norm": 23.636066436767578, + "learning_rate": 7.171800000000001e-06, + "loss": 1.0048, + "step": 28300 + }, + { + "epoch": 145.12820512820514, + "eval_loss": 1.1718521118164062, + "eval_runtime": 36.7366, + "eval_samples_per_second": 10.698, + "eval_steps_per_second": 1.361, + "step": 28300 + }, + { + "epoch": 145.64102564102564, + "grad_norm": 97.56835174560547, + "learning_rate": 7.161800000000001e-06, + "loss": 0.9895, + "step": 28400 + }, + { + "epoch": 145.64102564102564, + "eval_loss": 1.1722639799118042, + "eval_runtime": 36.2683, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 28400 + }, + { + "epoch": 146.15384615384616, + "grad_norm": 54.8230094909668, + "learning_rate": 7.151800000000001e-06, + "loss": 1.001, + "step": 28500 + }, + { + "epoch": 146.15384615384616, + "eval_loss": 1.1968903541564941, + "eval_runtime": 36.2527, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 28500 + }, + { + "epoch": 146.66666666666666, + "grad_norm": 15.167412757873535, + "learning_rate": 7.141800000000001e-06, + "loss": 0.9852, + "step": 28600 + }, + { + "epoch": 146.66666666666666, + "eval_loss": 1.188344120979309, + "eval_runtime": 36.3001, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 28600 + }, + { + "epoch": 147.17948717948718, + "grad_norm": 69.213134765625, + "learning_rate": 7.131800000000001e-06, + "loss": 1.016, + "step": 28700 + }, + { + "epoch": 147.17948717948718, + "eval_loss": 1.1813448667526245, + "eval_runtime": 36.0937, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 28700 + }, + { + "epoch": 147.69230769230768, + "grad_norm": 43.38339614868164, + "learning_rate": 7.121800000000001e-06, + "loss": 1.0048, + "step": 28800 + }, + { + "epoch": 147.69230769230768, + "eval_loss": 1.189638376235962, + "eval_runtime": 36.1286, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 28800 + }, + { + "epoch": 148.2051282051282, + "grad_norm": 43.99717712402344, + "learning_rate": 7.111800000000001e-06, + "loss": 0.9838, + "step": 28900 + }, + { + "epoch": 148.2051282051282, + "eval_loss": 1.1951574087142944, + "eval_runtime": 36.2031, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 28900 + }, + { + "epoch": 148.71794871794873, + "grad_norm": 31.055850982666016, + "learning_rate": 7.101800000000001e-06, + "loss": 0.9683, + "step": 29000 + }, + { + "epoch": 148.71794871794873, + "eval_loss": 1.1835789680480957, + "eval_runtime": 36.2271, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 29000 + }, + { + "epoch": 149.23076923076923, + "grad_norm": 32.13520050048828, + "learning_rate": 7.091800000000001e-06, + "loss": 1.0131, + "step": 29100 + }, + { + "epoch": 149.23076923076923, + "eval_loss": 1.1654757261276245, + "eval_runtime": 36.5086, + "eval_samples_per_second": 10.765, + "eval_steps_per_second": 1.37, + "step": 29100 + }, + { + "epoch": 149.74358974358975, + "grad_norm": 31.264759063720703, + "learning_rate": 7.0818000000000005e-06, + "loss": 1.0073, + "step": 29200 + }, + { + "epoch": 149.74358974358975, + "eval_loss": 1.1849567890167236, + "eval_runtime": 36.7699, + "eval_samples_per_second": 10.688, + "eval_steps_per_second": 1.36, + "step": 29200 + }, + { + "epoch": 150.25641025641025, + "grad_norm": 32.43284225463867, + "learning_rate": 7.071800000000001e-06, + "loss": 0.9656, + "step": 29300 + }, + { + "epoch": 150.25641025641025, + "eval_loss": 1.168485403060913, + "eval_runtime": 36.2215, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 29300 + }, + { + "epoch": 150.76923076923077, + "grad_norm": 59.16147994995117, + "learning_rate": 7.061800000000001e-06, + "loss": 0.9923, + "step": 29400 + }, + { + "epoch": 150.76923076923077, + "eval_loss": 1.1792685985565186, + "eval_runtime": 36.3853, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 29400 + }, + { + "epoch": 151.28205128205127, + "grad_norm": 101.80491638183594, + "learning_rate": 7.051800000000001e-06, + "loss": 1.0074, + "step": 29500 + }, + { + "epoch": 151.28205128205127, + "eval_loss": 1.1642369031906128, + "eval_runtime": 36.3983, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 29500 + }, + { + "epoch": 151.7948717948718, + "grad_norm": 25.33744239807129, + "learning_rate": 7.041800000000001e-06, + "loss": 0.9715, + "step": 29600 + }, + { + "epoch": 151.7948717948718, + "eval_loss": 1.1821907758712769, + "eval_runtime": 36.0667, + "eval_samples_per_second": 10.896, + "eval_steps_per_second": 1.386, + "step": 29600 + }, + { + "epoch": 152.30769230769232, + "grad_norm": 116.95513916015625, + "learning_rate": 7.031800000000001e-06, + "loss": 0.9958, + "step": 29700 + }, + { + "epoch": 152.30769230769232, + "eval_loss": 1.1610026359558105, + "eval_runtime": 36.2008, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 29700 + }, + { + "epoch": 152.82051282051282, + "grad_norm": 90.85464477539062, + "learning_rate": 7.021800000000001e-06, + "loss": 0.9767, + "step": 29800 + }, + { + "epoch": 152.82051282051282, + "eval_loss": 1.1621683835983276, + "eval_runtime": 36.2957, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 29800 + }, + { + "epoch": 153.33333333333334, + "grad_norm": 24.217872619628906, + "learning_rate": 7.011800000000001e-06, + "loss": 0.9786, + "step": 29900 + }, + { + "epoch": 153.33333333333334, + "eval_loss": 1.158172845840454, + "eval_runtime": 35.9046, + "eval_samples_per_second": 10.946, + "eval_steps_per_second": 1.393, + "step": 29900 + }, + { + "epoch": 153.84615384615384, + "grad_norm": 30.70758628845215, + "learning_rate": 7.001900000000001e-06, + "loss": 1.0029, + "step": 30000 + }, + { + "epoch": 153.84615384615384, + "eval_loss": 1.1670671701431274, + "eval_runtime": 36.1463, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 30000 + }, + { + "epoch": 154.35897435897436, + "grad_norm": 29.265003204345703, + "learning_rate": 6.991900000000001e-06, + "loss": 1.0156, + "step": 30100 + }, + { + "epoch": 154.35897435897436, + "eval_loss": 1.1782671213150024, + "eval_runtime": 36.3854, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 30100 + }, + { + "epoch": 154.87179487179486, + "grad_norm": 28.448328018188477, + "learning_rate": 6.9819e-06, + "loss": 0.9709, + "step": 30200 + }, + { + "epoch": 154.87179487179486, + "eval_loss": 1.16402268409729, + "eval_runtime": 36.815, + "eval_samples_per_second": 10.675, + "eval_steps_per_second": 1.358, + "step": 30200 + }, + { + "epoch": 155.3846153846154, + "grad_norm": 65.755615234375, + "learning_rate": 6.9719e-06, + "loss": 0.9669, + "step": 30300 + }, + { + "epoch": 155.3846153846154, + "eval_loss": 1.1739165782928467, + "eval_runtime": 36.34, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 30300 + }, + { + "epoch": 155.89743589743588, + "grad_norm": 45.11248779296875, + "learning_rate": 6.9619e-06, + "loss": 1.0136, + "step": 30400 + }, + { + "epoch": 155.89743589743588, + "eval_loss": 1.143236756324768, + "eval_runtime": 36.4174, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 30400 + }, + { + "epoch": 156.4102564102564, + "grad_norm": 50.76137924194336, + "learning_rate": 6.9519e-06, + "loss": 0.9565, + "step": 30500 + }, + { + "epoch": 156.4102564102564, + "eval_loss": 1.1797446012496948, + "eval_runtime": 36.3814, + "eval_samples_per_second": 10.802, + "eval_steps_per_second": 1.374, + "step": 30500 + }, + { + "epoch": 156.92307692307693, + "grad_norm": 35.62437438964844, + "learning_rate": 6.9419e-06, + "loss": 1.0004, + "step": 30600 + }, + { + "epoch": 156.92307692307693, + "eval_loss": 1.1688662767410278, + "eval_runtime": 36.529, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 30600 + }, + { + "epoch": 157.43589743589743, + "grad_norm": 43.17647933959961, + "learning_rate": 6.9319000000000005e-06, + "loss": 0.9897, + "step": 30700 + }, + { + "epoch": 157.43589743589743, + "eval_loss": 1.1841660737991333, + "eval_runtime": 36.4235, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 30700 + }, + { + "epoch": 157.94871794871796, + "grad_norm": 56.77204513549805, + "learning_rate": 6.921900000000001e-06, + "loss": 0.9817, + "step": 30800 + }, + { + "epoch": 157.94871794871796, + "eval_loss": 1.171298861503601, + "eval_runtime": 36.4571, + "eval_samples_per_second": 10.78, + "eval_steps_per_second": 1.371, + "step": 30800 + }, + { + "epoch": 158.46153846153845, + "grad_norm": 26.698223114013672, + "learning_rate": 6.911900000000001e-06, + "loss": 0.9528, + "step": 30900 + }, + { + "epoch": 158.46153846153845, + "eval_loss": 1.1642067432403564, + "eval_runtime": 36.2888, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 30900 + }, + { + "epoch": 158.97435897435898, + "grad_norm": 23.478918075561523, + "learning_rate": 6.9019e-06, + "loss": 0.9874, + "step": 31000 + }, + { + "epoch": 158.97435897435898, + "eval_loss": 1.1525483131408691, + "eval_runtime": 36.1434, + "eval_samples_per_second": 10.873, + "eval_steps_per_second": 1.383, + "step": 31000 + }, + { + "epoch": 159.48717948717947, + "grad_norm": 38.60530471801758, + "learning_rate": 6.8919e-06, + "loss": 0.9591, + "step": 31100 + }, + { + "epoch": 159.48717948717947, + "eval_loss": 1.1493828296661377, + "eval_runtime": 36.2104, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 31100 + }, + { + "epoch": 160.0, + "grad_norm": 20.907501220703125, + "learning_rate": 6.8819e-06, + "loss": 0.9884, + "step": 31200 + }, + { + "epoch": 160.0, + "eval_loss": 1.1617672443389893, + "eval_runtime": 36.2202, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 31200 + }, + { + "epoch": 160.51282051282053, + "grad_norm": 46.7100715637207, + "learning_rate": 6.8719e-06, + "loss": 0.9584, + "step": 31300 + }, + { + "epoch": 160.51282051282053, + "eval_loss": 1.1728988885879517, + "eval_runtime": 36.0326, + "eval_samples_per_second": 10.907, + "eval_steps_per_second": 1.388, + "step": 31300 + }, + { + "epoch": 161.02564102564102, + "grad_norm": 26.359403610229492, + "learning_rate": 6.8619e-06, + "loss": 0.9797, + "step": 31400 + }, + { + "epoch": 161.02564102564102, + "eval_loss": 1.1622204780578613, + "eval_runtime": 36.3492, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 31400 + }, + { + "epoch": 161.53846153846155, + "grad_norm": 19.935714721679688, + "learning_rate": 6.8519e-06, + "loss": 0.9914, + "step": 31500 + }, + { + "epoch": 161.53846153846155, + "eval_loss": 1.1545765399932861, + "eval_runtime": 36.0783, + "eval_samples_per_second": 10.893, + "eval_steps_per_second": 1.386, + "step": 31500 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 513, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2040659735552e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}