diff --git "a/checkpoint-93500/trainer_state.json" "b/checkpoint-93500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-93500/trainer_state.json" @@ -0,0 +1,14058 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 479.4871794871795, + "eval_steps": 100, + "global_step": 93500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.5128205128205128, + "grad_norm": 73.68424224853516, + "learning_rate": 9.9907e-06, + "loss": 3.1457, + "step": 100 + }, + { + "epoch": 0.5128205128205128, + "eval_loss": 2.2357213497161865, + "eval_runtime": 36.4689, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 100 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 57.47202682495117, + "learning_rate": 9.980800000000001e-06, + "loss": 2.1614, + "step": 200 + }, + { + "epoch": 1.0256410256410255, + "eval_loss": 2.0913825035095215, + "eval_runtime": 36.349, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 200 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 176.88357543945312, + "learning_rate": 9.970800000000001e-06, + "loss": 2.0388, + "step": 300 + }, + { + "epoch": 1.5384615384615383, + "eval_loss": 2.003911018371582, + "eval_runtime": 36.2551, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 300 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 55.31132507324219, + "learning_rate": 9.960800000000001e-06, + "loss": 1.9285, + "step": 400 + }, + { + "epoch": 2.051282051282051, + "eval_loss": 1.9796075820922852, + "eval_runtime": 36.4437, + "eval_samples_per_second": 10.784, + "eval_steps_per_second": 1.372, + "step": 400 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 41.900753021240234, + "learning_rate": 9.9508e-06, + "loss": 1.9523, + "step": 500 + }, + { + "epoch": 2.564102564102564, + "eval_loss": 1.936122179031372, + "eval_runtime": 36.5845, + "eval_samples_per_second": 10.742, + "eval_steps_per_second": 1.367, + "step": 500 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 50.21903991699219, + "learning_rate": 9.9408e-06, + "loss": 1.8452, + "step": 600 + }, + { + "epoch": 3.076923076923077, + "eval_loss": 1.8883634805679321, + "eval_runtime": 36.6015, + "eval_samples_per_second": 10.737, + "eval_steps_per_second": 1.366, + "step": 600 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 45.193939208984375, + "learning_rate": 9.930900000000002e-06, + "loss": 1.8403, + "step": 700 + }, + { + "epoch": 3.58974358974359, + "eval_loss": 1.8506474494934082, + "eval_runtime": 36.454, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 700 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 27.302494049072266, + "learning_rate": 9.920900000000002e-06, + "loss": 1.7976, + "step": 800 + }, + { + "epoch": 4.102564102564102, + "eval_loss": 1.8370662927627563, + "eval_runtime": 36.834, + "eval_samples_per_second": 10.67, + "eval_steps_per_second": 1.357, + "step": 800 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 52.6607666015625, + "learning_rate": 9.9109e-06, + "loss": 1.7508, + "step": 900 + }, + { + "epoch": 4.615384615384615, + "eval_loss": 1.8037244081497192, + "eval_runtime": 36.9939, + "eval_samples_per_second": 10.623, + "eval_steps_per_second": 1.352, + "step": 900 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 59.508033752441406, + "learning_rate": 9.9009e-06, + "loss": 1.7383, + "step": 1000 + }, + { + "epoch": 5.128205128205128, + "eval_loss": 1.7986633777618408, + "eval_runtime": 36.8356, + "eval_samples_per_second": 10.669, + "eval_steps_per_second": 1.357, + "step": 1000 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 71.58872985839844, + "learning_rate": 9.8909e-06, + "loss": 1.7361, + "step": 1100 + }, + { + "epoch": 5.641025641025641, + "eval_loss": 1.7810852527618408, + "eval_runtime": 37.0957, + "eval_samples_per_second": 10.594, + "eval_steps_per_second": 1.348, + "step": 1100 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 41.782066345214844, + "learning_rate": 9.8809e-06, + "loss": 1.682, + "step": 1200 + }, + { + "epoch": 6.153846153846154, + "eval_loss": 1.777554988861084, + "eval_runtime": 36.9173, + "eval_samples_per_second": 10.645, + "eval_steps_per_second": 1.354, + "step": 1200 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 40.28728485107422, + "learning_rate": 9.8709e-06, + "loss": 1.7216, + "step": 1300 + }, + { + "epoch": 6.666666666666667, + "eval_loss": 1.7382572889328003, + "eval_runtime": 36.8918, + "eval_samples_per_second": 10.653, + "eval_steps_per_second": 1.355, + "step": 1300 + }, + { + "epoch": 7.17948717948718, + "grad_norm": 104.99211883544922, + "learning_rate": 9.8609e-06, + "loss": 1.6534, + "step": 1400 + }, + { + "epoch": 7.17948717948718, + "eval_loss": 1.76559579372406, + "eval_runtime": 36.3398, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 1400 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 29.326631546020508, + "learning_rate": 9.8509e-06, + "loss": 1.707, + "step": 1500 + }, + { + "epoch": 7.6923076923076925, + "eval_loss": 1.750089406967163, + "eval_runtime": 36.4098, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 1500 + }, + { + "epoch": 8.205128205128204, + "grad_norm": 37.941261291503906, + "learning_rate": 9.840900000000001e-06, + "loss": 1.6554, + "step": 1600 + }, + { + "epoch": 8.205128205128204, + "eval_loss": 1.6900651454925537, + "eval_runtime": 36.3226, + "eval_samples_per_second": 10.82, + "eval_steps_per_second": 1.377, + "step": 1600 + }, + { + "epoch": 8.717948717948717, + "grad_norm": 44.60703659057617, + "learning_rate": 9.830900000000001e-06, + "loss": 1.6334, + "step": 1700 + }, + { + "epoch": 8.717948717948717, + "eval_loss": 1.7162973880767822, + "eval_runtime": 36.2672, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 1700 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 34.127254486083984, + "learning_rate": 9.820900000000001e-06, + "loss": 1.6345, + "step": 1800 + }, + { + "epoch": 9.23076923076923, + "eval_loss": 1.6906001567840576, + "eval_runtime": 36.2264, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 1800 + }, + { + "epoch": 9.743589743589745, + "grad_norm": 60.377540588378906, + "learning_rate": 9.810900000000001e-06, + "loss": 1.598, + "step": 1900 + }, + { + "epoch": 9.743589743589745, + "eval_loss": 1.6555503606796265, + "eval_runtime": 36.3896, + "eval_samples_per_second": 10.8, + "eval_steps_per_second": 1.374, + "step": 1900 + }, + { + "epoch": 10.256410256410255, + "grad_norm": 20.264404296875, + "learning_rate": 9.800900000000001e-06, + "loss": 1.5466, + "step": 2000 + }, + { + "epoch": 10.256410256410255, + "eval_loss": 1.648037075996399, + "eval_runtime": 36.4136, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 2000 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 27.18608856201172, + "learning_rate": 9.790900000000001e-06, + "loss": 1.5865, + "step": 2100 + }, + { + "epoch": 10.76923076923077, + "eval_loss": 1.6171936988830566, + "eval_runtime": 36.1051, + "eval_samples_per_second": 10.885, + "eval_steps_per_second": 1.385, + "step": 2100 + }, + { + "epoch": 11.282051282051283, + "grad_norm": 32.486331939697266, + "learning_rate": 9.780900000000002e-06, + "loss": 1.5284, + "step": 2200 + }, + { + "epoch": 11.282051282051283, + "eval_loss": 1.5915095806121826, + "eval_runtime": 36.1781, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 2200 + }, + { + "epoch": 11.794871794871796, + "grad_norm": 65.88719940185547, + "learning_rate": 9.770900000000002e-06, + "loss": 1.5514, + "step": 2300 + }, + { + "epoch": 11.794871794871796, + "eval_loss": 1.5879931449890137, + "eval_runtime": 36.3934, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 2300 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 31.737024307250977, + "learning_rate": 9.760900000000002e-06, + "loss": 1.4941, + "step": 2400 + }, + { + "epoch": 12.307692307692308, + "eval_loss": 1.583853006362915, + "eval_runtime": 36.3797, + "eval_samples_per_second": 10.803, + "eval_steps_per_second": 1.374, + "step": 2400 + }, + { + "epoch": 12.820512820512821, + "grad_norm": 45.48268508911133, + "learning_rate": 9.7509e-06, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 12.820512820512821, + "eval_loss": 1.5559026002883911, + "eval_runtime": 36.1605, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 2500 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 27.500398635864258, + "learning_rate": 9.7409e-06, + "loss": 1.5018, + "step": 2600 + }, + { + "epoch": 13.333333333333334, + "eval_loss": 1.5453521013259888, + "eval_runtime": 36.2887, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 2600 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 32.49728775024414, + "learning_rate": 9.7309e-06, + "loss": 1.4804, + "step": 2700 + }, + { + "epoch": 13.846153846153847, + "eval_loss": 1.5424816608428955, + "eval_runtime": 36.359, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 2700 + }, + { + "epoch": 14.35897435897436, + "grad_norm": 38.46280288696289, + "learning_rate": 9.7209e-06, + "loss": 1.4826, + "step": 2800 + }, + { + "epoch": 14.35897435897436, + "eval_loss": 1.5317177772521973, + "eval_runtime": 36.3362, + "eval_samples_per_second": 10.816, + "eval_steps_per_second": 1.376, + "step": 2800 + }, + { + "epoch": 14.871794871794872, + "grad_norm": 16.075960159301758, + "learning_rate": 9.7109e-06, + "loss": 1.4568, + "step": 2900 + }, + { + "epoch": 14.871794871794872, + "eval_loss": 1.5241832733154297, + "eval_runtime": 36.2025, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 2900 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 27.334318161010742, + "learning_rate": 9.7009e-06, + "loss": 1.4176, + "step": 3000 + }, + { + "epoch": 15.384615384615385, + "eval_loss": 1.520580768585205, + "eval_runtime": 36.398, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 3000 + }, + { + "epoch": 15.897435897435898, + "grad_norm": 94.90784454345703, + "learning_rate": 9.6909e-06, + "loss": 1.4681, + "step": 3100 + }, + { + "epoch": 15.897435897435898, + "eval_loss": 1.5268648862838745, + "eval_runtime": 36.0541, + "eval_samples_per_second": 10.9, + "eval_steps_per_second": 1.387, + "step": 3100 + }, + { + "epoch": 16.41025641025641, + "grad_norm": 16.697856903076172, + "learning_rate": 9.6809e-06, + "loss": 1.454, + "step": 3200 + }, + { + "epoch": 16.41025641025641, + "eval_loss": 1.5157753229141235, + "eval_runtime": 36.3172, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 3200 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 54.05553436279297, + "learning_rate": 9.670900000000001e-06, + "loss": 1.4309, + "step": 3300 + }, + { + "epoch": 16.923076923076923, + "eval_loss": 1.516249179840088, + "eval_runtime": 36.2632, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 3300 + }, + { + "epoch": 17.435897435897434, + "grad_norm": 47.010475158691406, + "learning_rate": 9.660900000000001e-06, + "loss": 1.4571, + "step": 3400 + }, + { + "epoch": 17.435897435897434, + "eval_loss": 1.5018247365951538, + "eval_runtime": 36.2118, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 3400 + }, + { + "epoch": 17.94871794871795, + "grad_norm": 52.865718841552734, + "learning_rate": 9.650900000000001e-06, + "loss": 1.4168, + "step": 3500 + }, + { + "epoch": 17.94871794871795, + "eval_loss": 1.4993616342544556, + "eval_runtime": 36.2572, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 3500 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 30.117380142211914, + "learning_rate": 9.640900000000001e-06, + "loss": 1.4275, + "step": 3600 + }, + { + "epoch": 18.46153846153846, + "eval_loss": 1.4899998903274536, + "eval_runtime": 36.4696, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 3600 + }, + { + "epoch": 18.974358974358974, + "grad_norm": 31.10028076171875, + "learning_rate": 9.630900000000001e-06, + "loss": 1.4148, + "step": 3700 + }, + { + "epoch": 18.974358974358974, + "eval_loss": 1.5231629610061646, + "eval_runtime": 36.4604, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 3700 + }, + { + "epoch": 19.487179487179485, + "grad_norm": 44.06697082519531, + "learning_rate": 9.620900000000001e-06, + "loss": 1.4057, + "step": 3800 + }, + { + "epoch": 19.487179487179485, + "eval_loss": 1.4841217994689941, + "eval_runtime": 36.3382, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 3800 + }, + { + "epoch": 20.0, + "grad_norm": 53.86429214477539, + "learning_rate": 9.610900000000001e-06, + "loss": 1.4302, + "step": 3900 + }, + { + "epoch": 20.0, + "eval_loss": 1.477772831916809, + "eval_runtime": 36.1794, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 3900 + }, + { + "epoch": 20.51282051282051, + "grad_norm": 80.95457458496094, + "learning_rate": 9.600900000000002e-06, + "loss": 1.4076, + "step": 4000 + }, + { + "epoch": 20.51282051282051, + "eval_loss": 1.4769134521484375, + "eval_runtime": 36.4725, + "eval_samples_per_second": 10.775, + "eval_steps_per_second": 1.371, + "step": 4000 + }, + { + "epoch": 21.025641025641026, + "grad_norm": 32.276214599609375, + "learning_rate": 9.5909e-06, + "loss": 1.3868, + "step": 4100 + }, + { + "epoch": 21.025641025641026, + "eval_loss": 1.463292121887207, + "eval_runtime": 36.5192, + "eval_samples_per_second": 10.761, + "eval_steps_per_second": 1.369, + "step": 4100 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 54.65959167480469, + "learning_rate": 9.5809e-06, + "loss": 1.3795, + "step": 4200 + }, + { + "epoch": 21.53846153846154, + "eval_loss": 1.4630039930343628, + "eval_runtime": 36.5288, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 4200 + }, + { + "epoch": 22.05128205128205, + "grad_norm": 42.31818389892578, + "learning_rate": 9.5709e-06, + "loss": 1.3787, + "step": 4300 + }, + { + "epoch": 22.05128205128205, + "eval_loss": 1.4471133947372437, + "eval_runtime": 36.2949, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 4300 + }, + { + "epoch": 22.564102564102566, + "grad_norm": 34.44257736206055, + "learning_rate": 9.5609e-06, + "loss": 1.4027, + "step": 4400 + }, + { + "epoch": 22.564102564102566, + "eval_loss": 1.4606964588165283, + "eval_runtime": 36.5672, + "eval_samples_per_second": 10.747, + "eval_steps_per_second": 1.367, + "step": 4400 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 42.65989303588867, + "learning_rate": 9.5509e-06, + "loss": 1.3459, + "step": 4500 + }, + { + "epoch": 23.076923076923077, + "eval_loss": 1.454709768295288, + "eval_runtime": 37.2024, + "eval_samples_per_second": 10.564, + "eval_steps_per_second": 1.344, + "step": 4500 + }, + { + "epoch": 23.58974358974359, + "grad_norm": 35.11396789550781, + "learning_rate": 9.5409e-06, + "loss": 1.3367, + "step": 4600 + }, + { + "epoch": 23.58974358974359, + "eval_loss": 1.4562979936599731, + "eval_runtime": 36.4579, + "eval_samples_per_second": 10.78, + "eval_steps_per_second": 1.371, + "step": 4600 + }, + { + "epoch": 24.102564102564102, + "grad_norm": 32.71805953979492, + "learning_rate": 9.5309e-06, + "loss": 1.3575, + "step": 4700 + }, + { + "epoch": 24.102564102564102, + "eval_loss": 1.4620144367218018, + "eval_runtime": 36.4055, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 4700 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 28.839757919311523, + "learning_rate": 9.5209e-06, + "loss": 1.3549, + "step": 4800 + }, + { + "epoch": 24.615384615384617, + "eval_loss": 1.4431304931640625, + "eval_runtime": 36.5027, + "eval_samples_per_second": 10.766, + "eval_steps_per_second": 1.37, + "step": 4800 + }, + { + "epoch": 25.128205128205128, + "grad_norm": 45.3994140625, + "learning_rate": 9.5109e-06, + "loss": 1.3885, + "step": 4900 + }, + { + "epoch": 25.128205128205128, + "eval_loss": 1.4312200546264648, + "eval_runtime": 36.3039, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 4900 + }, + { + "epoch": 25.641025641025642, + "grad_norm": 22.972829818725586, + "learning_rate": 9.5009e-06, + "loss": 1.3469, + "step": 5000 + }, + { + "epoch": 25.641025641025642, + "eval_loss": 1.416171669960022, + "eval_runtime": 36.6695, + "eval_samples_per_second": 10.717, + "eval_steps_per_second": 1.364, + "step": 5000 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 77.40106964111328, + "learning_rate": 9.490900000000001e-06, + "loss": 1.3363, + "step": 5100 + }, + { + "epoch": 26.153846153846153, + "eval_loss": 1.4090278148651123, + "eval_runtime": 36.2573, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 5100 + }, + { + "epoch": 26.666666666666668, + "grad_norm": 29.757932662963867, + "learning_rate": 9.480900000000001e-06, + "loss": 1.3183, + "step": 5200 + }, + { + "epoch": 26.666666666666668, + "eval_loss": 1.4073749780654907, + "eval_runtime": 36.2439, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.38, + "step": 5200 + }, + { + "epoch": 27.17948717948718, + "grad_norm": 56.78797149658203, + "learning_rate": 9.470900000000001e-06, + "loss": 1.3568, + "step": 5300 + }, + { + "epoch": 27.17948717948718, + "eval_loss": 1.4153741598129272, + "eval_runtime": 36.2756, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 5300 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 62.353477478027344, + "learning_rate": 9.460900000000001e-06, + "loss": 1.3304, + "step": 5400 + }, + { + "epoch": 27.692307692307693, + "eval_loss": 1.4334921836853027, + "eval_runtime": 36.2626, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 5400 + }, + { + "epoch": 28.205128205128204, + "grad_norm": 100.7852554321289, + "learning_rate": 9.450900000000001e-06, + "loss": 1.2897, + "step": 5500 + }, + { + "epoch": 28.205128205128204, + "eval_loss": 1.4160270690917969, + "eval_runtime": 36.6139, + "eval_samples_per_second": 10.734, + "eval_steps_per_second": 1.366, + "step": 5500 + }, + { + "epoch": 28.71794871794872, + "grad_norm": 62.06657409667969, + "learning_rate": 9.440900000000001e-06, + "loss": 1.3233, + "step": 5600 + }, + { + "epoch": 28.71794871794872, + "eval_loss": 1.431317687034607, + "eval_runtime": 36.5341, + "eval_samples_per_second": 10.757, + "eval_steps_per_second": 1.369, + "step": 5600 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 32.661346435546875, + "learning_rate": 9.4309e-06, + "loss": 1.305, + "step": 5700 + }, + { + "epoch": 29.23076923076923, + "eval_loss": 1.3954827785491943, + "eval_runtime": 36.5903, + "eval_samples_per_second": 10.741, + "eval_steps_per_second": 1.366, + "step": 5700 + }, + { + "epoch": 29.743589743589745, + "grad_norm": 25.690454483032227, + "learning_rate": 9.421000000000002e-06, + "loss": 1.2961, + "step": 5800 + }, + { + "epoch": 29.743589743589745, + "eval_loss": 1.4036046266555786, + "eval_runtime": 36.5935, + "eval_samples_per_second": 10.74, + "eval_steps_per_second": 1.366, + "step": 5800 + }, + { + "epoch": 30.256410256410255, + "grad_norm": 45.45426940917969, + "learning_rate": 9.411000000000002e-06, + "loss": 1.3175, + "step": 5900 + }, + { + "epoch": 30.256410256410255, + "eval_loss": 1.3845292329788208, + "eval_runtime": 36.5594, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 5900 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 41.62439727783203, + "learning_rate": 9.401000000000002e-06, + "loss": 1.3242, + "step": 6000 + }, + { + "epoch": 30.76923076923077, + "eval_loss": 1.3939634561538696, + "eval_runtime": 36.4895, + "eval_samples_per_second": 10.77, + "eval_steps_per_second": 1.37, + "step": 6000 + }, + { + "epoch": 31.28205128205128, + "grad_norm": 26.999319076538086, + "learning_rate": 9.391e-06, + "loss": 1.2886, + "step": 6100 + }, + { + "epoch": 31.28205128205128, + "eval_loss": 1.3804558515548706, + "eval_runtime": 36.2599, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 6100 + }, + { + "epoch": 31.794871794871796, + "grad_norm": 24.70287322998047, + "learning_rate": 9.381e-06, + "loss": 1.2893, + "step": 6200 + }, + { + "epoch": 31.794871794871796, + "eval_loss": 1.3821990489959717, + "eval_runtime": 36.2613, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 6200 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 41.910606384277344, + "learning_rate": 9.371e-06, + "loss": 1.3093, + "step": 6300 + }, + { + "epoch": 32.30769230769231, + "eval_loss": 1.3875064849853516, + "eval_runtime": 36.4998, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 6300 + }, + { + "epoch": 32.82051282051282, + "grad_norm": 82.20216369628906, + "learning_rate": 9.361e-06, + "loss": 1.3184, + "step": 6400 + }, + { + "epoch": 32.82051282051282, + "eval_loss": 1.3840612173080444, + "eval_runtime": 36.4787, + "eval_samples_per_second": 10.773, + "eval_steps_per_second": 1.371, + "step": 6400 + }, + { + "epoch": 33.333333333333336, + "grad_norm": 797.53271484375, + "learning_rate": 9.351e-06, + "loss": 1.2939, + "step": 6500 + }, + { + "epoch": 33.333333333333336, + "eval_loss": 1.3881950378417969, + "eval_runtime": 36.4166, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 6500 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 35.14693069458008, + "learning_rate": 9.341000000000001e-06, + "loss": 1.2881, + "step": 6600 + }, + { + "epoch": 33.84615384615385, + "eval_loss": 1.4039666652679443, + "eval_runtime": 36.4141, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 6600 + }, + { + "epoch": 34.35897435897436, + "grad_norm": 38.676666259765625, + "learning_rate": 9.331000000000001e-06, + "loss": 1.2699, + "step": 6700 + }, + { + "epoch": 34.35897435897436, + "eval_loss": 1.3800386190414429, + "eval_runtime": 36.2367, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 6700 + }, + { + "epoch": 34.87179487179487, + "grad_norm": 26.051170349121094, + "learning_rate": 9.321000000000001e-06, + "loss": 1.3079, + "step": 6800 + }, + { + "epoch": 34.87179487179487, + "eval_loss": 1.3784704208374023, + "eval_runtime": 36.3119, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 6800 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 42.667236328125, + "learning_rate": 9.311000000000001e-06, + "loss": 1.2622, + "step": 6900 + }, + { + "epoch": 35.38461538461539, + "eval_loss": 1.3637058734893799, + "eval_runtime": 36.1116, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 6900 + }, + { + "epoch": 35.8974358974359, + "grad_norm": 38.78388977050781, + "learning_rate": 9.301000000000001e-06, + "loss": 1.2652, + "step": 7000 + }, + { + "epoch": 35.8974358974359, + "eval_loss": 1.3452589511871338, + "eval_runtime": 36.2593, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 7000 + }, + { + "epoch": 36.41025641025641, + "grad_norm": 44.43967056274414, + "learning_rate": 9.291000000000001e-06, + "loss": 1.2378, + "step": 7100 + }, + { + "epoch": 36.41025641025641, + "eval_loss": 1.3494073152542114, + "eval_runtime": 35.4402, + "eval_samples_per_second": 11.089, + "eval_steps_per_second": 1.411, + "step": 7100 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 49.02131271362305, + "learning_rate": 9.281000000000001e-06, + "loss": 1.2932, + "step": 7200 + }, + { + "epoch": 36.92307692307692, + "eval_loss": 1.3460158109664917, + "eval_runtime": 35.9361, + "eval_samples_per_second": 10.936, + "eval_steps_per_second": 1.391, + "step": 7200 + }, + { + "epoch": 37.43589743589744, + "grad_norm": 28.279098510742188, + "learning_rate": 9.271000000000002e-06, + "loss": 1.2598, + "step": 7300 + }, + { + "epoch": 37.43589743589744, + "eval_loss": 1.36253023147583, + "eval_runtime": 36.2944, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 7300 + }, + { + "epoch": 37.94871794871795, + "grad_norm": 35.21017074584961, + "learning_rate": 9.261000000000002e-06, + "loss": 1.2703, + "step": 7400 + }, + { + "epoch": 37.94871794871795, + "eval_loss": 1.3509865999221802, + "eval_runtime": 36.2661, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 7400 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 51.673316955566406, + "learning_rate": 9.251000000000002e-06, + "loss": 1.2393, + "step": 7500 + }, + { + "epoch": 38.46153846153846, + "eval_loss": 1.3402855396270752, + "eval_runtime": 36.392, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 7500 + }, + { + "epoch": 38.97435897435897, + "grad_norm": 53.73936462402344, + "learning_rate": 9.241000000000002e-06, + "loss": 1.2577, + "step": 7600 + }, + { + "epoch": 38.97435897435897, + "eval_loss": 1.3487578630447388, + "eval_runtime": 36.2119, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 7600 + }, + { + "epoch": 39.48717948717949, + "grad_norm": 55.994686126708984, + "learning_rate": 9.231000000000002e-06, + "loss": 1.229, + "step": 7700 + }, + { + "epoch": 39.48717948717949, + "eval_loss": 1.340031623840332, + "eval_runtime": 36.2063, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 7700 + }, + { + "epoch": 40.0, + "grad_norm": 86.7531509399414, + "learning_rate": 9.221e-06, + "loss": 1.2941, + "step": 7800 + }, + { + "epoch": 40.0, + "eval_loss": 1.3422337770462036, + "eval_runtime": 36.7462, + "eval_samples_per_second": 10.695, + "eval_steps_per_second": 1.361, + "step": 7800 + }, + { + "epoch": 40.51282051282051, + "grad_norm": 60.86371612548828, + "learning_rate": 9.211e-06, + "loss": 1.2423, + "step": 7900 + }, + { + "epoch": 40.51282051282051, + "eval_loss": 1.3336257934570312, + "eval_runtime": 36.2441, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.38, + "step": 7900 + }, + { + "epoch": 41.02564102564103, + "grad_norm": 28.535411834716797, + "learning_rate": 9.2011e-06, + "loss": 1.2676, + "step": 8000 + }, + { + "epoch": 41.02564102564103, + "eval_loss": 1.338461995124817, + "eval_runtime": 36.2212, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 8000 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 35.707183837890625, + "learning_rate": 9.1911e-06, + "loss": 1.2428, + "step": 8100 + }, + { + "epoch": 41.53846153846154, + "eval_loss": 1.3225666284561157, + "eval_runtime": 36.2043, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 8100 + }, + { + "epoch": 42.05128205128205, + "grad_norm": 29.23111343383789, + "learning_rate": 9.181100000000001e-06, + "loss": 1.2269, + "step": 8200 + }, + { + "epoch": 42.05128205128205, + "eval_loss": 1.3405094146728516, + "eval_runtime": 36.2498, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 8200 + }, + { + "epoch": 42.56410256410256, + "grad_norm": 20.379304885864258, + "learning_rate": 9.171100000000001e-06, + "loss": 1.2187, + "step": 8300 + }, + { + "epoch": 42.56410256410256, + "eval_loss": 1.3247309923171997, + "eval_runtime": 36.3237, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.377, + "step": 8300 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 44.43791198730469, + "learning_rate": 9.161100000000001e-06, + "loss": 1.2321, + "step": 8400 + }, + { + "epoch": 43.07692307692308, + "eval_loss": 1.334086298942566, + "eval_runtime": 36.2039, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 8400 + }, + { + "epoch": 43.58974358974359, + "grad_norm": 30.97890853881836, + "learning_rate": 9.151100000000001e-06, + "loss": 1.2071, + "step": 8500 + }, + { + "epoch": 43.58974358974359, + "eval_loss": 1.3306576013565063, + "eval_runtime": 36.3547, + "eval_samples_per_second": 10.81, + "eval_steps_per_second": 1.375, + "step": 8500 + }, + { + "epoch": 44.1025641025641, + "grad_norm": 40.07706832885742, + "learning_rate": 9.141100000000001e-06, + "loss": 1.25, + "step": 8600 + }, + { + "epoch": 44.1025641025641, + "eval_loss": 1.3270679712295532, + "eval_runtime": 36.1754, + "eval_samples_per_second": 10.864, + "eval_steps_per_second": 1.382, + "step": 8600 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 27.011186599731445, + "learning_rate": 9.1311e-06, + "loss": 1.1968, + "step": 8700 + }, + { + "epoch": 44.61538461538461, + "eval_loss": 1.3117327690124512, + "eval_runtime": 36.1727, + "eval_samples_per_second": 10.865, + "eval_steps_per_second": 1.382, + "step": 8700 + }, + { + "epoch": 45.12820512820513, + "grad_norm": 25.976228713989258, + "learning_rate": 9.1211e-06, + "loss": 1.2492, + "step": 8800 + }, + { + "epoch": 45.12820512820513, + "eval_loss": 1.3281300067901611, + "eval_runtime": 36.2531, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 8800 + }, + { + "epoch": 45.64102564102564, + "grad_norm": 21.215715408325195, + "learning_rate": 9.1111e-06, + "loss": 1.221, + "step": 8900 + }, + { + "epoch": 45.64102564102564, + "eval_loss": 1.3373734951019287, + "eval_runtime": 36.2242, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 8900 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 48.8258171081543, + "learning_rate": 9.1011e-06, + "loss": 1.2123, + "step": 9000 + }, + { + "epoch": 46.15384615384615, + "eval_loss": 1.3303853273391724, + "eval_runtime": 36.4019, + "eval_samples_per_second": 10.796, + "eval_steps_per_second": 1.374, + "step": 9000 + }, + { + "epoch": 46.666666666666664, + "grad_norm": 36.76605224609375, + "learning_rate": 9.0911e-06, + "loss": 1.1951, + "step": 9100 + }, + { + "epoch": 46.666666666666664, + "eval_loss": 1.3182373046875, + "eval_runtime": 36.387, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 9100 + }, + { + "epoch": 47.17948717948718, + "grad_norm": 40.79771423339844, + "learning_rate": 9.0811e-06, + "loss": 1.2155, + "step": 9200 + }, + { + "epoch": 47.17948717948718, + "eval_loss": 1.3303160667419434, + "eval_runtime": 36.2265, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 9200 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 48.06431579589844, + "learning_rate": 9.0711e-06, + "loss": 1.2236, + "step": 9300 + }, + { + "epoch": 47.69230769230769, + "eval_loss": 1.3128286600112915, + "eval_runtime": 36.3665, + "eval_samples_per_second": 10.807, + "eval_steps_per_second": 1.375, + "step": 9300 + }, + { + "epoch": 48.205128205128204, + "grad_norm": 31.19647216796875, + "learning_rate": 9.0611e-06, + "loss": 1.2033, + "step": 9400 + }, + { + "epoch": 48.205128205128204, + "eval_loss": 1.3134888410568237, + "eval_runtime": 36.5129, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 9400 + }, + { + "epoch": 48.717948717948715, + "grad_norm": 20.11866569519043, + "learning_rate": 9.0511e-06, + "loss": 1.1955, + "step": 9500 + }, + { + "epoch": 48.717948717948715, + "eval_loss": 1.3154560327529907, + "eval_runtime": 36.6023, + "eval_samples_per_second": 10.737, + "eval_steps_per_second": 1.366, + "step": 9500 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 36.3143424987793, + "learning_rate": 9.0411e-06, + "loss": 1.2067, + "step": 9600 + }, + { + "epoch": 49.23076923076923, + "eval_loss": 1.3158589601516724, + "eval_runtime": 36.4851, + "eval_samples_per_second": 10.772, + "eval_steps_per_second": 1.37, + "step": 9600 + }, + { + "epoch": 49.743589743589745, + "grad_norm": 48.41688537597656, + "learning_rate": 9.0311e-06, + "loss": 1.2295, + "step": 9700 + }, + { + "epoch": 49.743589743589745, + "eval_loss": 1.306788682937622, + "eval_runtime": 36.291, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 9700 + }, + { + "epoch": 50.256410256410255, + "grad_norm": 26.129995346069336, + "learning_rate": 9.0211e-06, + "loss": 1.1809, + "step": 9800 + }, + { + "epoch": 50.256410256410255, + "eval_loss": 1.3299375772476196, + "eval_runtime": 36.4651, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 9800 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 19.543821334838867, + "learning_rate": 9.011100000000001e-06, + "loss": 1.2179, + "step": 9900 + }, + { + "epoch": 50.76923076923077, + "eval_loss": 1.317457675933838, + "eval_runtime": 36.2864, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 9900 + }, + { + "epoch": 51.282051282051285, + "grad_norm": 25.619775772094727, + "learning_rate": 9.001100000000001e-06, + "loss": 1.1653, + "step": 10000 + }, + { + "epoch": 51.282051282051285, + "eval_loss": 1.31196928024292, + "eval_runtime": 36.3263, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 10000 + }, + { + "epoch": 51.794871794871796, + "grad_norm": 45.30315017700195, + "learning_rate": 8.991100000000001e-06, + "loss": 1.2391, + "step": 10100 + }, + { + "epoch": 51.794871794871796, + "eval_loss": 1.305709719657898, + "eval_runtime": 36.2103, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 10100 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 26.942337036132812, + "learning_rate": 8.981100000000001e-06, + "loss": 1.2195, + "step": 10200 + }, + { + "epoch": 52.30769230769231, + "eval_loss": 1.3068941831588745, + "eval_runtime": 36.2565, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 10200 + }, + { + "epoch": 52.82051282051282, + "grad_norm": 26.20073890686035, + "learning_rate": 8.9711e-06, + "loss": 1.1639, + "step": 10300 + }, + { + "epoch": 52.82051282051282, + "eval_loss": 1.3013452291488647, + "eval_runtime": 36.2146, + "eval_samples_per_second": 10.852, + "eval_steps_per_second": 1.381, + "step": 10300 + }, + { + "epoch": 53.333333333333336, + "grad_norm": 41.40350341796875, + "learning_rate": 8.9611e-06, + "loss": 1.2033, + "step": 10400 + }, + { + "epoch": 53.333333333333336, + "eval_loss": 1.305737853050232, + "eval_runtime": 36.4865, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 10400 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 28.133567810058594, + "learning_rate": 8.9511e-06, + "loss": 1.1906, + "step": 10500 + }, + { + "epoch": 53.84615384615385, + "eval_loss": 1.2961195707321167, + "eval_runtime": 36.2734, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 10500 + }, + { + "epoch": 54.35897435897436, + "grad_norm": 44.07390213012695, + "learning_rate": 8.9411e-06, + "loss": 1.1899, + "step": 10600 + }, + { + "epoch": 54.35897435897436, + "eval_loss": 1.3024916648864746, + "eval_runtime": 36.4774, + "eval_samples_per_second": 10.774, + "eval_steps_per_second": 1.371, + "step": 10600 + }, + { + "epoch": 54.87179487179487, + "grad_norm": 19.120830535888672, + "learning_rate": 8.9311e-06, + "loss": 1.1697, + "step": 10700 + }, + { + "epoch": 54.87179487179487, + "eval_loss": 1.3056432008743286, + "eval_runtime": 36.2367, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 10700 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 52.376529693603516, + "learning_rate": 8.9211e-06, + "loss": 1.1759, + "step": 10800 + }, + { + "epoch": 55.38461538461539, + "eval_loss": 1.3018929958343506, + "eval_runtime": 36.1478, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 10800 + }, + { + "epoch": 55.8974358974359, + "grad_norm": 41.84946060180664, + "learning_rate": 8.9112e-06, + "loss": 1.1973, + "step": 10900 + }, + { + "epoch": 55.8974358974359, + "eval_loss": 1.3166084289550781, + "eval_runtime": 36.4967, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 10900 + }, + { + "epoch": 56.41025641025641, + "grad_norm": 48.97800064086914, + "learning_rate": 8.9012e-06, + "loss": 1.1942, + "step": 11000 + }, + { + "epoch": 56.41025641025641, + "eval_loss": 1.3040730953216553, + "eval_runtime": 36.3391, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 11000 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 24.18547821044922, + "learning_rate": 8.8912e-06, + "loss": 1.1544, + "step": 11100 + }, + { + "epoch": 56.92307692307692, + "eval_loss": 1.2837135791778564, + "eval_runtime": 36.2839, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 11100 + }, + { + "epoch": 57.43589743589744, + "grad_norm": 34.69540023803711, + "learning_rate": 8.8812e-06, + "loss": 1.1998, + "step": 11200 + }, + { + "epoch": 57.43589743589744, + "eval_loss": 1.2983756065368652, + "eval_runtime": 36.2024, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 11200 + }, + { + "epoch": 57.94871794871795, + "grad_norm": 30.074583053588867, + "learning_rate": 8.8712e-06, + "loss": 1.1352, + "step": 11300 + }, + { + "epoch": 57.94871794871795, + "eval_loss": 1.2913649082183838, + "eval_runtime": 36.0977, + "eval_samples_per_second": 10.887, + "eval_steps_per_second": 1.385, + "step": 11300 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 26.75031852722168, + "learning_rate": 8.8612e-06, + "loss": 1.1728, + "step": 11400 + }, + { + "epoch": 58.46153846153846, + "eval_loss": 1.288116216659546, + "eval_runtime": 36.2588, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 11400 + }, + { + "epoch": 58.97435897435897, + "grad_norm": 49.548213958740234, + "learning_rate": 8.851200000000001e-06, + "loss": 1.1738, + "step": 11500 + }, + { + "epoch": 58.97435897435897, + "eval_loss": 1.2846206426620483, + "eval_runtime": 36.5245, + "eval_samples_per_second": 10.76, + "eval_steps_per_second": 1.369, + "step": 11500 + }, + { + "epoch": 59.48717948717949, + "grad_norm": 23.007057189941406, + "learning_rate": 8.841200000000001e-06, + "loss": 1.1501, + "step": 11600 + }, + { + "epoch": 59.48717948717949, + "eval_loss": 1.297021746635437, + "eval_runtime": 36.2678, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 11600 + }, + { + "epoch": 60.0, + "grad_norm": 39.79067611694336, + "learning_rate": 8.831200000000001e-06, + "loss": 1.1836, + "step": 11700 + }, + { + "epoch": 60.0, + "eval_loss": 1.2865136861801147, + "eval_runtime": 36.1127, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 11700 + }, + { + "epoch": 60.51282051282051, + "grad_norm": 24.281373977661133, + "learning_rate": 8.821200000000001e-06, + "loss": 1.1548, + "step": 11800 + }, + { + "epoch": 60.51282051282051, + "eval_loss": 1.2812024354934692, + "eval_runtime": 36.2399, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 11800 + }, + { + "epoch": 61.02564102564103, + "grad_norm": 30.851072311401367, + "learning_rate": 8.811200000000001e-06, + "loss": 1.1794, + "step": 11900 + }, + { + "epoch": 61.02564102564103, + "eval_loss": 1.2902381420135498, + "eval_runtime": 36.1264, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 11900 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 44.42039108276367, + "learning_rate": 8.801200000000001e-06, + "loss": 1.1385, + "step": 12000 + }, + { + "epoch": 61.53846153846154, + "eval_loss": 1.2793415784835815, + "eval_runtime": 36.266, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 12000 + }, + { + "epoch": 62.05128205128205, + "grad_norm": 57.410274505615234, + "learning_rate": 8.791200000000001e-06, + "loss": 1.1697, + "step": 12100 + }, + { + "epoch": 62.05128205128205, + "eval_loss": 1.2847199440002441, + "eval_runtime": 36.1783, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 12100 + }, + { + "epoch": 62.56410256410256, + "grad_norm": 70.70729064941406, + "learning_rate": 8.781200000000002e-06, + "loss": 1.1518, + "step": 12200 + }, + { + "epoch": 62.56410256410256, + "eval_loss": 1.2760446071624756, + "eval_runtime": 36.0527, + "eval_samples_per_second": 10.901, + "eval_steps_per_second": 1.387, + "step": 12200 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 32.417388916015625, + "learning_rate": 8.7712e-06, + "loss": 1.1677, + "step": 12300 + }, + { + "epoch": 63.07692307692308, + "eval_loss": 1.2847411632537842, + "eval_runtime": 36.3923, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 12300 + }, + { + "epoch": 63.58974358974359, + "grad_norm": 24.372791290283203, + "learning_rate": 8.7612e-06, + "loss": 1.1433, + "step": 12400 + }, + { + "epoch": 63.58974358974359, + "eval_loss": 1.2779407501220703, + "eval_runtime": 36.5015, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 12400 + }, + { + "epoch": 64.1025641025641, + "grad_norm": 19.632272720336914, + "learning_rate": 8.7512e-06, + "loss": 1.1607, + "step": 12500 + }, + { + "epoch": 64.1025641025641, + "eval_loss": 1.2792208194732666, + "eval_runtime": 36.4617, + "eval_samples_per_second": 10.778, + "eval_steps_per_second": 1.371, + "step": 12500 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 34.54841613769531, + "learning_rate": 8.7412e-06, + "loss": 1.1371, + "step": 12600 + }, + { + "epoch": 64.61538461538461, + "eval_loss": 1.2620294094085693, + "eval_runtime": 36.4969, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 12600 + }, + { + "epoch": 65.12820512820512, + "grad_norm": 19.67386817932129, + "learning_rate": 8.7312e-06, + "loss": 1.1332, + "step": 12700 + }, + { + "epoch": 65.12820512820512, + "eval_loss": 1.2682890892028809, + "eval_runtime": 36.6309, + "eval_samples_per_second": 10.729, + "eval_steps_per_second": 1.365, + "step": 12700 + }, + { + "epoch": 65.64102564102564, + "grad_norm": 65.07030487060547, + "learning_rate": 8.7212e-06, + "loss": 1.1571, + "step": 12800 + }, + { + "epoch": 65.64102564102564, + "eval_loss": 1.2490720748901367, + "eval_runtime": 36.3056, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 12800 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 20.384132385253906, + "learning_rate": 8.7112e-06, + "loss": 1.1619, + "step": 12900 + }, + { + "epoch": 66.15384615384616, + "eval_loss": 1.2465028762817383, + "eval_runtime": 36.1678, + "eval_samples_per_second": 10.866, + "eval_steps_per_second": 1.382, + "step": 12900 + }, + { + "epoch": 66.66666666666667, + "grad_norm": 36.320674896240234, + "learning_rate": 8.7012e-06, + "loss": 1.1176, + "step": 13000 + }, + { + "epoch": 66.66666666666667, + "eval_loss": 1.2594362497329712, + "eval_runtime": 36.3117, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 13000 + }, + { + "epoch": 67.17948717948718, + "grad_norm": 30.79526138305664, + "learning_rate": 8.6912e-06, + "loss": 1.1311, + "step": 13100 + }, + { + "epoch": 67.17948717948718, + "eval_loss": 1.2553967237472534, + "eval_runtime": 36.2667, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 13100 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 40.0444450378418, + "learning_rate": 8.6812e-06, + "loss": 1.165, + "step": 13200 + }, + { + "epoch": 67.6923076923077, + "eval_loss": 1.2607430219650269, + "eval_runtime": 36.339, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 13200 + }, + { + "epoch": 68.2051282051282, + "grad_norm": 25.53687286376953, + "learning_rate": 8.671200000000001e-06, + "loss": 1.1334, + "step": 13300 + }, + { + "epoch": 68.2051282051282, + "eval_loss": 1.2592159509658813, + "eval_runtime": 36.2847, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 13300 + }, + { + "epoch": 68.71794871794872, + "grad_norm": 28.198373794555664, + "learning_rate": 8.661200000000001e-06, + "loss": 1.1481, + "step": 13400 + }, + { + "epoch": 68.71794871794872, + "eval_loss": 1.2755882740020752, + "eval_runtime": 36.6238, + "eval_samples_per_second": 10.731, + "eval_steps_per_second": 1.365, + "step": 13400 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 44.77776336669922, + "learning_rate": 8.651200000000001e-06, + "loss": 1.1138, + "step": 13500 + }, + { + "epoch": 69.23076923076923, + "eval_loss": 1.259446144104004, + "eval_runtime": 36.4048, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 13500 + }, + { + "epoch": 69.74358974358974, + "grad_norm": 73.0951156616211, + "learning_rate": 8.641200000000001e-06, + "loss": 1.149, + "step": 13600 + }, + { + "epoch": 69.74358974358974, + "eval_loss": 1.2759552001953125, + "eval_runtime": 35.9924, + "eval_samples_per_second": 10.919, + "eval_steps_per_second": 1.389, + "step": 13600 + }, + { + "epoch": 70.25641025641026, + "grad_norm": 43.95732116699219, + "learning_rate": 8.631200000000001e-06, + "loss": 1.15, + "step": 13700 + }, + { + "epoch": 70.25641025641026, + "eval_loss": 1.2675697803497314, + "eval_runtime": 36.2305, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 13700 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 28.56161117553711, + "learning_rate": 8.621200000000001e-06, + "loss": 1.1065, + "step": 13800 + }, + { + "epoch": 70.76923076923077, + "eval_loss": 1.256949782371521, + "eval_runtime": 36.2833, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 13800 + }, + { + "epoch": 71.28205128205128, + "grad_norm": 80.90894317626953, + "learning_rate": 8.611200000000002e-06, + "loss": 1.1111, + "step": 13900 + }, + { + "epoch": 71.28205128205128, + "eval_loss": 1.2672077417373657, + "eval_runtime": 36.2401, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 13900 + }, + { + "epoch": 71.7948717948718, + "grad_norm": 15.566130638122559, + "learning_rate": 8.6012e-06, + "loss": 1.1487, + "step": 14000 + }, + { + "epoch": 71.7948717948718, + "eval_loss": 1.2434508800506592, + "eval_runtime": 36.3731, + "eval_samples_per_second": 10.805, + "eval_steps_per_second": 1.375, + "step": 14000 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 41.25931930541992, + "learning_rate": 8.5912e-06, + "loss": 1.1357, + "step": 14100 + }, + { + "epoch": 72.3076923076923, + "eval_loss": 1.25618314743042, + "eval_runtime": 36.1143, + "eval_samples_per_second": 10.882, + "eval_steps_per_second": 1.384, + "step": 14100 + }, + { + "epoch": 72.82051282051282, + "grad_norm": 24.271724700927734, + "learning_rate": 8.5812e-06, + "loss": 1.1039, + "step": 14200 + }, + { + "epoch": 72.82051282051282, + "eval_loss": 1.2586956024169922, + "eval_runtime": 36.2558, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 14200 + }, + { + "epoch": 73.33333333333333, + "grad_norm": 23.148712158203125, + "learning_rate": 8.5713e-06, + "loss": 1.1332, + "step": 14300 + }, + { + "epoch": 73.33333333333333, + "eval_loss": 1.241268515586853, + "eval_runtime": 36.617, + "eval_samples_per_second": 10.733, + "eval_steps_per_second": 1.365, + "step": 14300 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 39.42832946777344, + "learning_rate": 8.5613e-06, + "loss": 1.1276, + "step": 14400 + }, + { + "epoch": 73.84615384615384, + "eval_loss": 1.261016607284546, + "eval_runtime": 36.2412, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 14400 + }, + { + "epoch": 74.35897435897436, + "grad_norm": 41.2990837097168, + "learning_rate": 8.5513e-06, + "loss": 1.1259, + "step": 14500 + }, + { + "epoch": 74.35897435897436, + "eval_loss": 1.254787802696228, + "eval_runtime": 36.1616, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 14500 + }, + { + "epoch": 74.87179487179488, + "grad_norm": 75.0270767211914, + "learning_rate": 8.5413e-06, + "loss": 1.0919, + "step": 14600 + }, + { + "epoch": 74.87179487179488, + "eval_loss": 1.2456586360931396, + "eval_runtime": 36.3316, + "eval_samples_per_second": 10.817, + "eval_steps_per_second": 1.376, + "step": 14600 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 23.520156860351562, + "learning_rate": 8.5313e-06, + "loss": 1.1415, + "step": 14700 + }, + { + "epoch": 75.38461538461539, + "eval_loss": 1.260399580001831, + "eval_runtime": 36.3274, + "eval_samples_per_second": 10.818, + "eval_steps_per_second": 1.376, + "step": 14700 + }, + { + "epoch": 75.8974358974359, + "grad_norm": 23.925405502319336, + "learning_rate": 8.521300000000001e-06, + "loss": 1.1435, + "step": 14800 + }, + { + "epoch": 75.8974358974359, + "eval_loss": 1.2679468393325806, + "eval_runtime": 36.2707, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.379, + "step": 14800 + }, + { + "epoch": 76.41025641025641, + "grad_norm": 19.826759338378906, + "learning_rate": 8.511300000000001e-06, + "loss": 1.1034, + "step": 14900 + }, + { + "epoch": 76.41025641025641, + "eval_loss": 1.260425090789795, + "eval_runtime": 36.2564, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 14900 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 36.41432189941406, + "learning_rate": 8.501300000000001e-06, + "loss": 1.1181, + "step": 15000 + }, + { + "epoch": 76.92307692307692, + "eval_loss": 1.275189757347107, + "eval_runtime": 36.226, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 15000 + }, + { + "epoch": 77.43589743589743, + "grad_norm": 56.50348663330078, + "learning_rate": 8.491300000000001e-06, + "loss": 1.117, + "step": 15100 + }, + { + "epoch": 77.43589743589743, + "eval_loss": 1.2597932815551758, + "eval_runtime": 36.1736, + "eval_samples_per_second": 10.864, + "eval_steps_per_second": 1.382, + "step": 15100 + }, + { + "epoch": 77.94871794871794, + "grad_norm": 16.227319717407227, + "learning_rate": 8.481300000000001e-06, + "loss": 1.1287, + "step": 15200 + }, + { + "epoch": 77.94871794871794, + "eval_loss": 1.2599974870681763, + "eval_runtime": 36.2653, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 15200 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 34.07974624633789, + "learning_rate": 8.471300000000001e-06, + "loss": 1.1484, + "step": 15300 + }, + { + "epoch": 78.46153846153847, + "eval_loss": 1.2516891956329346, + "eval_runtime": 36.47, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 15300 + }, + { + "epoch": 78.97435897435898, + "grad_norm": 48.17190933227539, + "learning_rate": 8.461300000000001e-06, + "loss": 1.0917, + "step": 15400 + }, + { + "epoch": 78.97435897435898, + "eval_loss": 1.244437336921692, + "eval_runtime": 36.2847, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 15400 + }, + { + "epoch": 79.48717948717949, + "grad_norm": 34.02452087402344, + "learning_rate": 8.451300000000002e-06, + "loss": 1.0924, + "step": 15500 + }, + { + "epoch": 79.48717948717949, + "eval_loss": 1.2553346157073975, + "eval_runtime": 36.3446, + "eval_samples_per_second": 10.813, + "eval_steps_per_second": 1.376, + "step": 15500 + }, + { + "epoch": 80.0, + "grad_norm": 12.990086555480957, + "learning_rate": 8.441300000000002e-06, + "loss": 1.1319, + "step": 15600 + }, + { + "epoch": 80.0, + "eval_loss": 1.2469161748886108, + "eval_runtime": 36.3089, + "eval_samples_per_second": 10.824, + "eval_steps_per_second": 1.377, + "step": 15600 + }, + { + "epoch": 80.51282051282051, + "grad_norm": 31.65691375732422, + "learning_rate": 8.431300000000002e-06, + "loss": 1.12, + "step": 15700 + }, + { + "epoch": 80.51282051282051, + "eval_loss": 1.2401645183563232, + "eval_runtime": 36.2741, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 15700 + }, + { + "epoch": 81.02564102564102, + "grad_norm": 21.566389083862305, + "learning_rate": 8.421300000000002e-06, + "loss": 1.1089, + "step": 15800 + }, + { + "epoch": 81.02564102564102, + "eval_loss": 1.2469390630722046, + "eval_runtime": 36.3979, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 15800 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 27.354368209838867, + "learning_rate": 8.411300000000002e-06, + "loss": 1.1259, + "step": 15900 + }, + { + "epoch": 81.53846153846153, + "eval_loss": 1.2625091075897217, + "eval_runtime": 36.6689, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 15900 + }, + { + "epoch": 82.05128205128206, + "grad_norm": 29.60841178894043, + "learning_rate": 8.4013e-06, + "loss": 1.0668, + "step": 16000 + }, + { + "epoch": 82.05128205128206, + "eval_loss": 1.2272534370422363, + "eval_runtime": 36.5139, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 16000 + }, + { + "epoch": 82.56410256410257, + "grad_norm": 19.469263076782227, + "learning_rate": 8.3913e-06, + "loss": 1.1236, + "step": 16100 + }, + { + "epoch": 82.56410256410257, + "eval_loss": 1.232078194618225, + "eval_runtime": 36.3175, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 16100 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 40.06100082397461, + "learning_rate": 8.3813e-06, + "loss": 1.0685, + "step": 16200 + }, + { + "epoch": 83.07692307692308, + "eval_loss": 1.254823088645935, + "eval_runtime": 36.5367, + "eval_samples_per_second": 10.756, + "eval_steps_per_second": 1.368, + "step": 16200 + }, + { + "epoch": 83.58974358974359, + "grad_norm": 25.563325881958008, + "learning_rate": 8.3713e-06, + "loss": 1.0911, + "step": 16300 + }, + { + "epoch": 83.58974358974359, + "eval_loss": 1.2462764978408813, + "eval_runtime": 36.1602, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 16300 + }, + { + "epoch": 84.1025641025641, + "grad_norm": 41.810020446777344, + "learning_rate": 8.3613e-06, + "loss": 1.1009, + "step": 16400 + }, + { + "epoch": 84.1025641025641, + "eval_loss": 1.2400792837142944, + "eval_runtime": 36.2464, + "eval_samples_per_second": 10.842, + "eval_steps_per_second": 1.379, + "step": 16400 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 40.818111419677734, + "learning_rate": 8.3513e-06, + "loss": 1.0857, + "step": 16500 + }, + { + "epoch": 84.61538461538461, + "eval_loss": 1.2257955074310303, + "eval_runtime": 36.2933, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 16500 + }, + { + "epoch": 85.12820512820512, + "grad_norm": 33.36876678466797, + "learning_rate": 8.341300000000001e-06, + "loss": 1.1033, + "step": 16600 + }, + { + "epoch": 85.12820512820512, + "eval_loss": 1.2494174242019653, + "eval_runtime": 36.2089, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 16600 + }, + { + "epoch": 85.64102564102564, + "grad_norm": 41.88273620605469, + "learning_rate": 8.331300000000001e-06, + "loss": 1.0674, + "step": 16700 + }, + { + "epoch": 85.64102564102564, + "eval_loss": 1.2431418895721436, + "eval_runtime": 36.3141, + "eval_samples_per_second": 10.822, + "eval_steps_per_second": 1.377, + "step": 16700 + }, + { + "epoch": 86.15384615384616, + "grad_norm": 26.26712989807129, + "learning_rate": 8.321300000000001e-06, + "loss": 1.1154, + "step": 16800 + }, + { + "epoch": 86.15384615384616, + "eval_loss": 1.2349519729614258, + "eval_runtime": 36.3687, + "eval_samples_per_second": 10.806, + "eval_steps_per_second": 1.375, + "step": 16800 + }, + { + "epoch": 86.66666666666667, + "grad_norm": 28.719078063964844, + "learning_rate": 8.311300000000001e-06, + "loss": 1.0821, + "step": 16900 + }, + { + "epoch": 86.66666666666667, + "eval_loss": 1.228055715560913, + "eval_runtime": 36.3697, + "eval_samples_per_second": 10.806, + "eval_steps_per_second": 1.375, + "step": 16900 + }, + { + "epoch": 87.17948717948718, + "grad_norm": 35.35209655761719, + "learning_rate": 8.301300000000001e-06, + "loss": 1.0829, + "step": 17000 + }, + { + "epoch": 87.17948717948718, + "eval_loss": 1.241512417793274, + "eval_runtime": 36.4655, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 17000 + }, + { + "epoch": 87.6923076923077, + "grad_norm": 24.084636688232422, + "learning_rate": 8.291300000000001e-06, + "loss": 1.0926, + "step": 17100 + }, + { + "epoch": 87.6923076923077, + "eval_loss": 1.227530598640442, + "eval_runtime": 36.183, + "eval_samples_per_second": 10.861, + "eval_steps_per_second": 1.382, + "step": 17100 + }, + { + "epoch": 88.2051282051282, + "grad_norm": 35.995765686035156, + "learning_rate": 8.281300000000002e-06, + "loss": 1.076, + "step": 17200 + }, + { + "epoch": 88.2051282051282, + "eval_loss": 1.231921911239624, + "eval_runtime": 36.1958, + "eval_samples_per_second": 10.858, + "eval_steps_per_second": 1.381, + "step": 17200 + }, + { + "epoch": 88.71794871794872, + "grad_norm": 23.116085052490234, + "learning_rate": 8.271300000000002e-06, + "loss": 1.0993, + "step": 17300 + }, + { + "epoch": 88.71794871794872, + "eval_loss": 1.242146611213684, + "eval_runtime": 36.2666, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 17300 + }, + { + "epoch": 89.23076923076923, + "grad_norm": 18.24385643005371, + "learning_rate": 8.261300000000002e-06, + "loss": 1.1213, + "step": 17400 + }, + { + "epoch": 89.23076923076923, + "eval_loss": 1.230137825012207, + "eval_runtime": 36.153, + "eval_samples_per_second": 10.87, + "eval_steps_per_second": 1.383, + "step": 17400 + }, + { + "epoch": 89.74358974358974, + "grad_norm": 44.21913146972656, + "learning_rate": 8.251300000000002e-06, + "loss": 1.045, + "step": 17500 + }, + { + "epoch": 89.74358974358974, + "eval_loss": 1.2343608140945435, + "eval_runtime": 36.3254, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 17500 + }, + { + "epoch": 90.25641025641026, + "grad_norm": 29.37706756591797, + "learning_rate": 8.2413e-06, + "loss": 1.0805, + "step": 17600 + }, + { + "epoch": 90.25641025641026, + "eval_loss": 1.2186076641082764, + "eval_runtime": 36.3582, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 17600 + }, + { + "epoch": 90.76923076923077, + "grad_norm": 40.11149597167969, + "learning_rate": 8.2313e-06, + "loss": 1.0732, + "step": 17700 + }, + { + "epoch": 90.76923076923077, + "eval_loss": 1.2361598014831543, + "eval_runtime": 36.4168, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 17700 + }, + { + "epoch": 91.28205128205128, + "grad_norm": 33.00440216064453, + "learning_rate": 8.2213e-06, + "loss": 1.0912, + "step": 17800 + }, + { + "epoch": 91.28205128205128, + "eval_loss": 1.2296538352966309, + "eval_runtime": 36.2928, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 17800 + }, + { + "epoch": 91.7948717948718, + "grad_norm": 30.941469192504883, + "learning_rate": 8.2113e-06, + "loss": 1.064, + "step": 17900 + }, + { + "epoch": 91.7948717948718, + "eval_loss": 1.250794529914856, + "eval_runtime": 36.4692, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 17900 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 41.63932800292969, + "learning_rate": 8.2013e-06, + "loss": 1.0529, + "step": 18000 + }, + { + "epoch": 92.3076923076923, + "eval_loss": 1.2209473848342896, + "eval_runtime": 36.5494, + "eval_samples_per_second": 10.753, + "eval_steps_per_second": 1.368, + "step": 18000 + }, + { + "epoch": 92.82051282051282, + "grad_norm": 34.083587646484375, + "learning_rate": 8.1913e-06, + "loss": 1.0849, + "step": 18100 + }, + { + "epoch": 92.82051282051282, + "eval_loss": 1.2245945930480957, + "eval_runtime": 36.3755, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 18100 + }, + { + "epoch": 93.33333333333333, + "grad_norm": 33.740848541259766, + "learning_rate": 8.1813e-06, + "loss": 1.0853, + "step": 18200 + }, + { + "epoch": 93.33333333333333, + "eval_loss": 1.2368453741073608, + "eval_runtime": 36.1346, + "eval_samples_per_second": 10.876, + "eval_steps_per_second": 1.384, + "step": 18200 + }, + { + "epoch": 93.84615384615384, + "grad_norm": 22.13953971862793, + "learning_rate": 8.171300000000001e-06, + "loss": 1.09, + "step": 18300 + }, + { + "epoch": 93.84615384615384, + "eval_loss": 1.2331533432006836, + "eval_runtime": 36.4043, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 18300 + }, + { + "epoch": 94.35897435897436, + "grad_norm": 45.70988082885742, + "learning_rate": 8.161300000000001e-06, + "loss": 1.0543, + "step": 18400 + }, + { + "epoch": 94.35897435897436, + "eval_loss": 1.216800570487976, + "eval_runtime": 36.5884, + "eval_samples_per_second": 10.741, + "eval_steps_per_second": 1.367, + "step": 18400 + }, + { + "epoch": 94.87179487179488, + "grad_norm": 38.62083435058594, + "learning_rate": 8.151300000000001e-06, + "loss": 1.09, + "step": 18500 + }, + { + "epoch": 94.87179487179488, + "eval_loss": 1.24717378616333, + "eval_runtime": 36.2986, + "eval_samples_per_second": 10.827, + "eval_steps_per_second": 1.377, + "step": 18500 + }, + { + "epoch": 95.38461538461539, + "grad_norm": 40.52507400512695, + "learning_rate": 8.141300000000001e-06, + "loss": 1.1019, + "step": 18600 + }, + { + "epoch": 95.38461538461539, + "eval_loss": 1.2494611740112305, + "eval_runtime": 36.3553, + "eval_samples_per_second": 10.81, + "eval_steps_per_second": 1.375, + "step": 18600 + }, + { + "epoch": 95.8974358974359, + "grad_norm": 70.1895523071289, + "learning_rate": 8.131300000000001e-06, + "loss": 1.0711, + "step": 18700 + }, + { + "epoch": 95.8974358974359, + "eval_loss": 1.2522521018981934, + "eval_runtime": 36.1421, + "eval_samples_per_second": 10.874, + "eval_steps_per_second": 1.383, + "step": 18700 + }, + { + "epoch": 96.41025641025641, + "grad_norm": 45.69275665283203, + "learning_rate": 8.121300000000001e-06, + "loss": 1.1066, + "step": 18800 + }, + { + "epoch": 96.41025641025641, + "eval_loss": 1.250695824623108, + "eval_runtime": 36.2669, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 18800 + }, + { + "epoch": 96.92307692307692, + "grad_norm": 23.61644744873047, + "learning_rate": 8.111300000000001e-06, + "loss": 1.0967, + "step": 18900 + }, + { + "epoch": 96.92307692307692, + "eval_loss": 1.2277597188949585, + "eval_runtime": 36.3005, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 18900 + }, + { + "epoch": 97.43589743589743, + "grad_norm": 21.942943572998047, + "learning_rate": 8.101300000000002e-06, + "loss": 1.0704, + "step": 19000 + }, + { + "epoch": 97.43589743589743, + "eval_loss": 1.2279075384140015, + "eval_runtime": 36.189, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 19000 + }, + { + "epoch": 97.94871794871794, + "grad_norm": 36.2983512878418, + "learning_rate": 8.091300000000002e-06, + "loss": 1.0719, + "step": 19100 + }, + { + "epoch": 97.94871794871794, + "eval_loss": 1.2093193531036377, + "eval_runtime": 36.5953, + "eval_samples_per_second": 10.739, + "eval_steps_per_second": 1.366, + "step": 19100 + }, + { + "epoch": 98.46153846153847, + "grad_norm": 73.0156021118164, + "learning_rate": 8.0813e-06, + "loss": 1.0538, + "step": 19200 + }, + { + "epoch": 98.46153846153847, + "eval_loss": 1.2311538457870483, + "eval_runtime": 36.4102, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 19200 + }, + { + "epoch": 98.97435897435898, + "grad_norm": 51.309017181396484, + "learning_rate": 8.0713e-06, + "loss": 1.0818, + "step": 19300 + }, + { + "epoch": 98.97435897435898, + "eval_loss": 1.2250592708587646, + "eval_runtime": 36.4117, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 19300 + }, + { + "epoch": 99.48717948717949, + "grad_norm": 15.101311683654785, + "learning_rate": 8.0613e-06, + "loss": 1.0656, + "step": 19400 + }, + { + "epoch": 99.48717948717949, + "eval_loss": 1.233995795249939, + "eval_runtime": 36.3917, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 19400 + }, + { + "epoch": 100.0, + "grad_norm": 39.63221740722656, + "learning_rate": 8.0513e-06, + "loss": 1.0716, + "step": 19500 + }, + { + "epoch": 100.0, + "eval_loss": 1.2169172763824463, + "eval_runtime": 36.466, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 19500 + }, + { + "epoch": 100.51282051282051, + "grad_norm": 28.275875091552734, + "learning_rate": 8.0413e-06, + "loss": 1.0863, + "step": 19600 + }, + { + "epoch": 100.51282051282051, + "eval_loss": 1.235645055770874, + "eval_runtime": 36.4873, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 19600 + }, + { + "epoch": 101.02564102564102, + "grad_norm": 46.1825065612793, + "learning_rate": 8.0313e-06, + "loss": 1.0254, + "step": 19700 + }, + { + "epoch": 101.02564102564102, + "eval_loss": 1.2021634578704834, + "eval_runtime": 36.3574, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 19700 + }, + { + "epoch": 101.53846153846153, + "grad_norm": 33.101219177246094, + "learning_rate": 8.0213e-06, + "loss": 1.0802, + "step": 19800 + }, + { + "epoch": 101.53846153846153, + "eval_loss": 1.2263848781585693, + "eval_runtime": 36.4102, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 19800 + }, + { + "epoch": 102.05128205128206, + "grad_norm": 28.070240020751953, + "learning_rate": 8.0113e-06, + "loss": 1.0209, + "step": 19900 + }, + { + "epoch": 102.05128205128206, + "eval_loss": 1.2010384798049927, + "eval_runtime": 36.5083, + "eval_samples_per_second": 10.765, + "eval_steps_per_second": 1.37, + "step": 19900 + }, + { + "epoch": 102.56410256410257, + "grad_norm": 32.505916595458984, + "learning_rate": 8.0013e-06, + "loss": 1.0738, + "step": 20000 + }, + { + "epoch": 102.56410256410257, + "eval_loss": 1.1892540454864502, + "eval_runtime": 36.8658, + "eval_samples_per_second": 10.66, + "eval_steps_per_second": 1.356, + "step": 20000 + }, + { + "epoch": 103.07692307692308, + "grad_norm": 116.78060150146484, + "learning_rate": 7.991300000000001e-06, + "loss": 1.0417, + "step": 20100 + }, + { + "epoch": 103.07692307692308, + "eval_loss": 1.211946964263916, + "eval_runtime": 36.0941, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 20100 + }, + { + "epoch": 103.58974358974359, + "grad_norm": 47.81857681274414, + "learning_rate": 7.981300000000001e-06, + "loss": 1.0576, + "step": 20200 + }, + { + "epoch": 103.58974358974359, + "eval_loss": 1.2092243432998657, + "eval_runtime": 36.2906, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 20200 + }, + { + "epoch": 104.1025641025641, + "grad_norm": 44.64072036743164, + "learning_rate": 7.971300000000001e-06, + "loss": 1.053, + "step": 20300 + }, + { + "epoch": 104.1025641025641, + "eval_loss": 1.2181479930877686, + "eval_runtime": 36.2566, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 20300 + }, + { + "epoch": 104.61538461538461, + "grad_norm": 25.469749450683594, + "learning_rate": 7.961300000000001e-06, + "loss": 1.0532, + "step": 20400 + }, + { + "epoch": 104.61538461538461, + "eval_loss": 1.2095298767089844, + "eval_runtime": 36.1952, + "eval_samples_per_second": 10.858, + "eval_steps_per_second": 1.381, + "step": 20400 + }, + { + "epoch": 105.12820512820512, + "grad_norm": 18.33926773071289, + "learning_rate": 7.951300000000001e-06, + "loss": 1.0778, + "step": 20500 + }, + { + "epoch": 105.12820512820512, + "eval_loss": 1.2227920293807983, + "eval_runtime": 36.3372, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 20500 + }, + { + "epoch": 105.64102564102564, + "grad_norm": 67.40721130371094, + "learning_rate": 7.941300000000001e-06, + "loss": 1.0777, + "step": 20600 + }, + { + "epoch": 105.64102564102564, + "eval_loss": 1.2083336114883423, + "eval_runtime": 36.3182, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 20600 + }, + { + "epoch": 106.15384615384616, + "grad_norm": 24.97282600402832, + "learning_rate": 7.931300000000001e-06, + "loss": 1.0512, + "step": 20700 + }, + { + "epoch": 106.15384615384616, + "eval_loss": 1.1997463703155518, + "eval_runtime": 36.3993, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 20700 + }, + { + "epoch": 106.66666666666667, + "grad_norm": 56.40156173706055, + "learning_rate": 7.9213e-06, + "loss": 1.0712, + "step": 20800 + }, + { + "epoch": 106.66666666666667, + "eval_loss": 1.2138112783432007, + "eval_runtime": 35.549, + "eval_samples_per_second": 11.055, + "eval_steps_per_second": 1.407, + "step": 20800 + }, + { + "epoch": 107.17948717948718, + "grad_norm": 28.606220245361328, + "learning_rate": 7.9113e-06, + "loss": 1.0497, + "step": 20900 + }, + { + "epoch": 107.17948717948718, + "eval_loss": 1.2097153663635254, + "eval_runtime": 36.5101, + "eval_samples_per_second": 10.764, + "eval_steps_per_second": 1.369, + "step": 20900 + }, + { + "epoch": 107.6923076923077, + "grad_norm": 32.19448471069336, + "learning_rate": 7.9013e-06, + "loss": 1.0383, + "step": 21000 + }, + { + "epoch": 107.6923076923077, + "eval_loss": 1.2029515504837036, + "eval_runtime": 36.5264, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 21000 + }, + { + "epoch": 108.2051282051282, + "grad_norm": 14.996597290039062, + "learning_rate": 7.8913e-06, + "loss": 1.0571, + "step": 21100 + }, + { + "epoch": 108.2051282051282, + "eval_loss": 1.2292343378067017, + "eval_runtime": 36.1323, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 21100 + }, + { + "epoch": 108.71794871794872, + "grad_norm": 15.809494018554688, + "learning_rate": 7.8814e-06, + "loss": 1.0725, + "step": 21200 + }, + { + "epoch": 108.71794871794872, + "eval_loss": 1.2223467826843262, + "eval_runtime": 36.1509, + "eval_samples_per_second": 10.871, + "eval_steps_per_second": 1.383, + "step": 21200 + }, + { + "epoch": 109.23076923076923, + "grad_norm": 55.448875427246094, + "learning_rate": 7.8714e-06, + "loss": 1.0401, + "step": 21300 + }, + { + "epoch": 109.23076923076923, + "eval_loss": 1.2152210474014282, + "eval_runtime": 36.4536, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 21300 + }, + { + "epoch": 109.74358974358974, + "grad_norm": 47.51198959350586, + "learning_rate": 7.8614e-06, + "loss": 1.0232, + "step": 21400 + }, + { + "epoch": 109.74358974358974, + "eval_loss": 1.2074100971221924, + "eval_runtime": 36.2117, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 21400 + }, + { + "epoch": 110.25641025641026, + "grad_norm": 22.616479873657227, + "learning_rate": 7.8514e-06, + "loss": 1.0837, + "step": 21500 + }, + { + "epoch": 110.25641025641026, + "eval_loss": 1.2034764289855957, + "eval_runtime": 36.2909, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 21500 + }, + { + "epoch": 110.76923076923077, + "grad_norm": 40.14712905883789, + "learning_rate": 7.841400000000001e-06, + "loss": 1.044, + "step": 21600 + }, + { + "epoch": 110.76923076923077, + "eval_loss": 1.1942527294158936, + "eval_runtime": 36.2254, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 21600 + }, + { + "epoch": 111.28205128205128, + "grad_norm": 89.68008422851562, + "learning_rate": 7.831400000000001e-06, + "loss": 1.0301, + "step": 21700 + }, + { + "epoch": 111.28205128205128, + "eval_loss": 1.2042152881622314, + "eval_runtime": 36.2099, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 21700 + }, + { + "epoch": 111.7948717948718, + "grad_norm": 40.1873664855957, + "learning_rate": 7.8214e-06, + "loss": 1.0513, + "step": 21800 + }, + { + "epoch": 111.7948717948718, + "eval_loss": 1.201180100440979, + "eval_runtime": 36.1821, + "eval_samples_per_second": 10.862, + "eval_steps_per_second": 1.382, + "step": 21800 + }, + { + "epoch": 112.3076923076923, + "grad_norm": 30.849868774414062, + "learning_rate": 7.8114e-06, + "loss": 1.0514, + "step": 21900 + }, + { + "epoch": 112.3076923076923, + "eval_loss": 1.2128338813781738, + "eval_runtime": 36.2828, + "eval_samples_per_second": 10.832, + "eval_steps_per_second": 1.378, + "step": 21900 + }, + { + "epoch": 112.82051282051282, + "grad_norm": 31.983945846557617, + "learning_rate": 7.8014e-06, + "loss": 1.0288, + "step": 22000 + }, + { + "epoch": 112.82051282051282, + "eval_loss": 1.194412350654602, + "eval_runtime": 36.4995, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 22000 + }, + { + "epoch": 113.33333333333333, + "grad_norm": 27.589929580688477, + "learning_rate": 7.791400000000001e-06, + "loss": 1.0131, + "step": 22100 + }, + { + "epoch": 113.33333333333333, + "eval_loss": 1.2023619413375854, + "eval_runtime": 36.2873, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 22100 + }, + { + "epoch": 113.84615384615384, + "grad_norm": 98.88153076171875, + "learning_rate": 7.781400000000001e-06, + "loss": 1.0648, + "step": 22200 + }, + { + "epoch": 113.84615384615384, + "eval_loss": 1.1987013816833496, + "eval_runtime": 36.3531, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 22200 + }, + { + "epoch": 114.35897435897436, + "grad_norm": 32.19725036621094, + "learning_rate": 7.771400000000002e-06, + "loss": 1.0401, + "step": 22300 + }, + { + "epoch": 114.35897435897436, + "eval_loss": 1.2003728151321411, + "eval_runtime": 36.294, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 22300 + }, + { + "epoch": 114.87179487179488, + "grad_norm": 29.627470016479492, + "learning_rate": 7.761400000000002e-06, + "loss": 1.0638, + "step": 22400 + }, + { + "epoch": 114.87179487179488, + "eval_loss": 1.2163525819778442, + "eval_runtime": 36.4332, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 22400 + }, + { + "epoch": 115.38461538461539, + "grad_norm": 38.58709716796875, + "learning_rate": 7.751400000000002e-06, + "loss": 1.0307, + "step": 22500 + }, + { + "epoch": 115.38461538461539, + "eval_loss": 1.199642300605774, + "eval_runtime": 36.3023, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 22500 + }, + { + "epoch": 115.8974358974359, + "grad_norm": 28.66075897216797, + "learning_rate": 7.741400000000002e-06, + "loss": 1.0276, + "step": 22600 + }, + { + "epoch": 115.8974358974359, + "eval_loss": 1.213975429534912, + "eval_runtime": 36.2211, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 22600 + }, + { + "epoch": 116.41025641025641, + "grad_norm": 39.12740707397461, + "learning_rate": 7.731400000000002e-06, + "loss": 1.0163, + "step": 22700 + }, + { + "epoch": 116.41025641025641, + "eval_loss": 1.2231982946395874, + "eval_runtime": 36.2424, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 22700 + }, + { + "epoch": 116.92307692307692, + "grad_norm": 46.517051696777344, + "learning_rate": 7.7214e-06, + "loss": 1.0463, + "step": 22800 + }, + { + "epoch": 116.92307692307692, + "eval_loss": 1.183910846710205, + "eval_runtime": 36.4609, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 22800 + }, + { + "epoch": 117.43589743589743, + "grad_norm": 49.0311393737793, + "learning_rate": 7.7114e-06, + "loss": 1.0236, + "step": 22900 + }, + { + "epoch": 117.43589743589743, + "eval_loss": 1.209437608718872, + "eval_runtime": 36.3858, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 22900 + }, + { + "epoch": 117.94871794871794, + "grad_norm": 25.398332595825195, + "learning_rate": 7.7014e-06, + "loss": 1.07, + "step": 23000 + }, + { + "epoch": 117.94871794871794, + "eval_loss": 1.2132450342178345, + "eval_runtime": 36.4343, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 23000 + }, + { + "epoch": 118.46153846153847, + "grad_norm": 49.871639251708984, + "learning_rate": 7.6914e-06, + "loss": 0.9935, + "step": 23100 + }, + { + "epoch": 118.46153846153847, + "eval_loss": 1.2235440015792847, + "eval_runtime": 36.3755, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 23100 + }, + { + "epoch": 118.97435897435898, + "grad_norm": 66.67717742919922, + "learning_rate": 7.6814e-06, + "loss": 1.0672, + "step": 23200 + }, + { + "epoch": 118.97435897435898, + "eval_loss": 1.1966772079467773, + "eval_runtime": 36.3466, + "eval_samples_per_second": 10.813, + "eval_steps_per_second": 1.376, + "step": 23200 + }, + { + "epoch": 119.48717948717949, + "grad_norm": 43.98142623901367, + "learning_rate": 7.6714e-06, + "loss": 1.019, + "step": 23300 + }, + { + "epoch": 119.48717948717949, + "eval_loss": 1.1935210227966309, + "eval_runtime": 36.4031, + "eval_samples_per_second": 10.796, + "eval_steps_per_second": 1.374, + "step": 23300 + }, + { + "epoch": 120.0, + "grad_norm": 29.781999588012695, + "learning_rate": 7.661400000000001e-06, + "loss": 1.0456, + "step": 23400 + }, + { + "epoch": 120.0, + "eval_loss": 1.19161856174469, + "eval_runtime": 36.6677, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 23400 + }, + { + "epoch": 120.51282051282051, + "grad_norm": 38.971858978271484, + "learning_rate": 7.651400000000001e-06, + "loss": 1.0488, + "step": 23500 + }, + { + "epoch": 120.51282051282051, + "eval_loss": 1.2005729675292969, + "eval_runtime": 36.8304, + "eval_samples_per_second": 10.671, + "eval_steps_per_second": 1.358, + "step": 23500 + }, + { + "epoch": 121.02564102564102, + "grad_norm": 37.437660217285156, + "learning_rate": 7.641400000000001e-06, + "loss": 1.0196, + "step": 23600 + }, + { + "epoch": 121.02564102564102, + "eval_loss": 1.1815942525863647, + "eval_runtime": 36.5578, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 23600 + }, + { + "epoch": 121.53846153846153, + "grad_norm": 45.006248474121094, + "learning_rate": 7.631500000000001e-06, + "loss": 1.0247, + "step": 23700 + }, + { + "epoch": 121.53846153846153, + "eval_loss": 1.1985987424850464, + "eval_runtime": 36.9519, + "eval_samples_per_second": 10.635, + "eval_steps_per_second": 1.353, + "step": 23700 + }, + { + "epoch": 122.05128205128206, + "grad_norm": 28.17504119873047, + "learning_rate": 7.621500000000001e-06, + "loss": 1.0278, + "step": 23800 + }, + { + "epoch": 122.05128205128206, + "eval_loss": 1.1994553804397583, + "eval_runtime": 36.7025, + "eval_samples_per_second": 10.708, + "eval_steps_per_second": 1.362, + "step": 23800 + }, + { + "epoch": 122.56410256410257, + "grad_norm": 36.315181732177734, + "learning_rate": 7.6116e-06, + "loss": 1.0073, + "step": 23900 + }, + { + "epoch": 122.56410256410257, + "eval_loss": 1.201116919517517, + "eval_runtime": 36.7114, + "eval_samples_per_second": 10.705, + "eval_steps_per_second": 1.362, + "step": 23900 + }, + { + "epoch": 123.07692307692308, + "grad_norm": 23.902212142944336, + "learning_rate": 7.6016e-06, + "loss": 1.0238, + "step": 24000 + }, + { + "epoch": 123.07692307692308, + "eval_loss": 1.2095754146575928, + "eval_runtime": 36.3531, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 24000 + }, + { + "epoch": 123.58974358974359, + "grad_norm": 29.814146041870117, + "learning_rate": 7.5916e-06, + "loss": 0.9958, + "step": 24100 + }, + { + "epoch": 123.58974358974359, + "eval_loss": 1.1929839849472046, + "eval_runtime": 36.7481, + "eval_samples_per_second": 10.694, + "eval_steps_per_second": 1.361, + "step": 24100 + }, + { + "epoch": 124.1025641025641, + "grad_norm": 52.22597885131836, + "learning_rate": 7.5816e-06, + "loss": 1.0466, + "step": 24200 + }, + { + "epoch": 124.1025641025641, + "eval_loss": 1.189598798751831, + "eval_runtime": 36.3774, + "eval_samples_per_second": 10.803, + "eval_steps_per_second": 1.374, + "step": 24200 + }, + { + "epoch": 124.61538461538461, + "grad_norm": 23.292490005493164, + "learning_rate": 7.571600000000001e-06, + "loss": 1.0451, + "step": 24300 + }, + { + "epoch": 124.61538461538461, + "eval_loss": 1.2092961072921753, + "eval_runtime": 36.3753, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 24300 + }, + { + "epoch": 125.12820512820512, + "grad_norm": 42.846275329589844, + "learning_rate": 7.5616000000000014e-06, + "loss": 1.0122, + "step": 24400 + }, + { + "epoch": 125.12820512820512, + "eval_loss": 1.1754584312438965, + "eval_runtime": 36.676, + "eval_samples_per_second": 10.715, + "eval_steps_per_second": 1.363, + "step": 24400 + }, + { + "epoch": 125.64102564102564, + "grad_norm": 23.48101234436035, + "learning_rate": 7.5516000000000015e-06, + "loss": 1.0127, + "step": 24500 + }, + { + "epoch": 125.64102564102564, + "eval_loss": 1.1745011806488037, + "eval_runtime": 36.5027, + "eval_samples_per_second": 10.766, + "eval_steps_per_second": 1.37, + "step": 24500 + }, + { + "epoch": 126.15384615384616, + "grad_norm": 32.6221809387207, + "learning_rate": 7.541600000000001e-06, + "loss": 1.0416, + "step": 24600 + }, + { + "epoch": 126.15384615384616, + "eval_loss": 1.1841143369674683, + "eval_runtime": 36.5596, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 24600 + }, + { + "epoch": 126.66666666666667, + "grad_norm": 44.3466682434082, + "learning_rate": 7.531600000000001e-06, + "loss": 1.0134, + "step": 24700 + }, + { + "epoch": 126.66666666666667, + "eval_loss": 1.1873056888580322, + "eval_runtime": 36.2722, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 24700 + }, + { + "epoch": 127.17948717948718, + "grad_norm": 35.96662902832031, + "learning_rate": 7.521600000000001e-06, + "loss": 1.0157, + "step": 24800 + }, + { + "epoch": 127.17948717948718, + "eval_loss": 1.1866871118545532, + "eval_runtime": 36.0895, + "eval_samples_per_second": 10.89, + "eval_steps_per_second": 1.385, + "step": 24800 + }, + { + "epoch": 127.6923076923077, + "grad_norm": 26.58826446533203, + "learning_rate": 7.511600000000001e-06, + "loss": 1.0562, + "step": 24900 + }, + { + "epoch": 127.6923076923077, + "eval_loss": 1.1805154085159302, + "eval_runtime": 36.3065, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 24900 + }, + { + "epoch": 128.2051282051282, + "grad_norm": 60.84516906738281, + "learning_rate": 7.501600000000001e-06, + "loss": 0.9951, + "step": 25000 + }, + { + "epoch": 128.2051282051282, + "eval_loss": 1.1816545724868774, + "eval_runtime": 36.491, + "eval_samples_per_second": 10.77, + "eval_steps_per_second": 1.37, + "step": 25000 + }, + { + "epoch": 128.71794871794873, + "grad_norm": 102.91582489013672, + "learning_rate": 7.491600000000001e-06, + "loss": 1.0385, + "step": 25100 + }, + { + "epoch": 128.71794871794873, + "eval_loss": 1.1889567375183105, + "eval_runtime": 36.3865, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 25100 + }, + { + "epoch": 129.23076923076923, + "grad_norm": 33.94929885864258, + "learning_rate": 7.481600000000001e-06, + "loss": 1.0105, + "step": 25200 + }, + { + "epoch": 129.23076923076923, + "eval_loss": 1.1977964639663696, + "eval_runtime": 36.2056, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 25200 + }, + { + "epoch": 129.74358974358975, + "grad_norm": 38.72896957397461, + "learning_rate": 7.4716000000000014e-06, + "loss": 0.983, + "step": 25300 + }, + { + "epoch": 129.74358974358975, + "eval_loss": 1.179408311843872, + "eval_runtime": 36.362, + "eval_samples_per_second": 10.808, + "eval_steps_per_second": 1.375, + "step": 25300 + }, + { + "epoch": 130.25641025641025, + "grad_norm": 26.789093017578125, + "learning_rate": 7.461600000000001e-06, + "loss": 1.0294, + "step": 25400 + }, + { + "epoch": 130.25641025641025, + "eval_loss": 1.18820321559906, + "eval_runtime": 36.265, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 25400 + }, + { + "epoch": 130.76923076923077, + "grad_norm": 39.0494384765625, + "learning_rate": 7.451600000000001e-06, + "loss": 0.9926, + "step": 25500 + }, + { + "epoch": 130.76923076923077, + "eval_loss": 1.1823586225509644, + "eval_runtime": 36.4492, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 25500 + }, + { + "epoch": 131.28205128205127, + "grad_norm": 24.54622459411621, + "learning_rate": 7.441600000000001e-06, + "loss": 0.9796, + "step": 25600 + }, + { + "epoch": 131.28205128205127, + "eval_loss": 1.176493525505066, + "eval_runtime": 36.3072, + "eval_samples_per_second": 10.824, + "eval_steps_per_second": 1.377, + "step": 25600 + }, + { + "epoch": 131.7948717948718, + "grad_norm": 47.54056930541992, + "learning_rate": 7.431600000000001e-06, + "loss": 1.0261, + "step": 25700 + }, + { + "epoch": 131.7948717948718, + "eval_loss": 1.189279317855835, + "eval_runtime": 36.2551, + "eval_samples_per_second": 10.84, + "eval_steps_per_second": 1.379, + "step": 25700 + }, + { + "epoch": 132.30769230769232, + "grad_norm": 173.0323028564453, + "learning_rate": 7.421600000000001e-06, + "loss": 1.0701, + "step": 25800 + }, + { + "epoch": 132.30769230769232, + "eval_loss": 1.1980550289154053, + "eval_runtime": 36.5, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 25800 + }, + { + "epoch": 132.82051282051282, + "grad_norm": 20.522178649902344, + "learning_rate": 7.411600000000001e-06, + "loss": 0.9994, + "step": 25900 + }, + { + "epoch": 132.82051282051282, + "eval_loss": 1.1794248819351196, + "eval_runtime": 36.1653, + "eval_samples_per_second": 10.867, + "eval_steps_per_second": 1.383, + "step": 25900 + }, + { + "epoch": 133.33333333333334, + "grad_norm": 58.22213363647461, + "learning_rate": 7.401600000000001e-06, + "loss": 0.9745, + "step": 26000 + }, + { + "epoch": 133.33333333333334, + "eval_loss": 1.185817837715149, + "eval_runtime": 36.5417, + "eval_samples_per_second": 10.755, + "eval_steps_per_second": 1.368, + "step": 26000 + }, + { + "epoch": 133.84615384615384, + "grad_norm": 42.3514518737793, + "learning_rate": 7.391600000000001e-06, + "loss": 1.0218, + "step": 26100 + }, + { + "epoch": 133.84615384615384, + "eval_loss": 1.1769567728042603, + "eval_runtime": 36.3485, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 26100 + }, + { + "epoch": 134.35897435897436, + "grad_norm": 21.729629516601562, + "learning_rate": 7.381600000000001e-06, + "loss": 1.0041, + "step": 26200 + }, + { + "epoch": 134.35897435897436, + "eval_loss": 1.1934856176376343, + "eval_runtime": 36.513, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 26200 + }, + { + "epoch": 134.87179487179486, + "grad_norm": 31.473798751831055, + "learning_rate": 7.371600000000001e-06, + "loss": 1.0194, + "step": 26300 + }, + { + "epoch": 134.87179487179486, + "eval_loss": 1.176289677619934, + "eval_runtime": 36.4474, + "eval_samples_per_second": 10.783, + "eval_steps_per_second": 1.372, + "step": 26300 + }, + { + "epoch": 135.3846153846154, + "grad_norm": 47.508140563964844, + "learning_rate": 7.361600000000001e-06, + "loss": 0.9971, + "step": 26400 + }, + { + "epoch": 135.3846153846154, + "eval_loss": 1.1870988607406616, + "eval_runtime": 36.3471, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 26400 + }, + { + "epoch": 135.89743589743588, + "grad_norm": 28.1704158782959, + "learning_rate": 7.351600000000001e-06, + "loss": 0.9994, + "step": 26500 + }, + { + "epoch": 135.89743589743588, + "eval_loss": 1.1857614517211914, + "eval_runtime": 36.4793, + "eval_samples_per_second": 10.773, + "eval_steps_per_second": 1.371, + "step": 26500 + }, + { + "epoch": 136.4102564102564, + "grad_norm": 21.236408233642578, + "learning_rate": 7.341600000000001e-06, + "loss": 1.0042, + "step": 26600 + }, + { + "epoch": 136.4102564102564, + "eval_loss": 1.183176875114441, + "eval_runtime": 36.6587, + "eval_samples_per_second": 10.721, + "eval_steps_per_second": 1.364, + "step": 26600 + }, + { + "epoch": 136.92307692307693, + "grad_norm": 35.42839813232422, + "learning_rate": 7.331600000000001e-06, + "loss": 1.0016, + "step": 26700 + }, + { + "epoch": 136.92307692307693, + "eval_loss": 1.1894404888153076, + "eval_runtime": 36.3426, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 26700 + }, + { + "epoch": 137.43589743589743, + "grad_norm": 33.13557815551758, + "learning_rate": 7.321600000000001e-06, + "loss": 1.0378, + "step": 26800 + }, + { + "epoch": 137.43589743589743, + "eval_loss": 1.1841119527816772, + "eval_runtime": 36.4508, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 26800 + }, + { + "epoch": 137.94871794871796, + "grad_norm": 31.94637107849121, + "learning_rate": 7.311600000000001e-06, + "loss": 0.9938, + "step": 26900 + }, + { + "epoch": 137.94871794871796, + "eval_loss": 1.1836310625076294, + "eval_runtime": 36.4314, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 26900 + }, + { + "epoch": 138.46153846153845, + "grad_norm": 53.63075637817383, + "learning_rate": 7.3016000000000005e-06, + "loss": 0.9617, + "step": 27000 + }, + { + "epoch": 138.46153846153845, + "eval_loss": 1.1768561601638794, + "eval_runtime": 36.3252, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 27000 + }, + { + "epoch": 138.97435897435898, + "grad_norm": 21.395553588867188, + "learning_rate": 7.291600000000001e-06, + "loss": 1.0385, + "step": 27100 + }, + { + "epoch": 138.97435897435898, + "eval_loss": 1.1800990104675293, + "eval_runtime": 36.2628, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 27100 + }, + { + "epoch": 139.48717948717947, + "grad_norm": 49.950172424316406, + "learning_rate": 7.281600000000001e-06, + "loss": 0.993, + "step": 27200 + }, + { + "epoch": 139.48717948717947, + "eval_loss": 1.1754857301712036, + "eval_runtime": 36.4227, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 27200 + }, + { + "epoch": 140.0, + "grad_norm": 28.073486328125, + "learning_rate": 7.271600000000001e-06, + "loss": 0.9907, + "step": 27300 + }, + { + "epoch": 140.0, + "eval_loss": 1.184246301651001, + "eval_runtime": 36.2844, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 27300 + }, + { + "epoch": 140.51282051282053, + "grad_norm": 24.810503005981445, + "learning_rate": 7.261600000000001e-06, + "loss": 1.0236, + "step": 27400 + }, + { + "epoch": 140.51282051282053, + "eval_loss": 1.1773159503936768, + "eval_runtime": 36.3242, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 27400 + }, + { + "epoch": 141.02564102564102, + "grad_norm": 40.216854095458984, + "learning_rate": 7.251600000000001e-06, + "loss": 0.9461, + "step": 27500 + }, + { + "epoch": 141.02564102564102, + "eval_loss": 1.1806586980819702, + "eval_runtime": 36.2095, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 27500 + }, + { + "epoch": 141.53846153846155, + "grad_norm": 40.43361282348633, + "learning_rate": 7.241600000000001e-06, + "loss": 0.9822, + "step": 27600 + }, + { + "epoch": 141.53846153846155, + "eval_loss": 1.1734962463378906, + "eval_runtime": 36.3354, + "eval_samples_per_second": 10.816, + "eval_steps_per_second": 1.376, + "step": 27600 + }, + { + "epoch": 142.05128205128204, + "grad_norm": 66.24449920654297, + "learning_rate": 7.231600000000001e-06, + "loss": 1.0241, + "step": 27700 + }, + { + "epoch": 142.05128205128204, + "eval_loss": 1.1698683500289917, + "eval_runtime": 36.5375, + "eval_samples_per_second": 10.756, + "eval_steps_per_second": 1.368, + "step": 27700 + }, + { + "epoch": 142.56410256410257, + "grad_norm": 25.957368850708008, + "learning_rate": 7.2216000000000004e-06, + "loss": 0.974, + "step": 27800 + }, + { + "epoch": 142.56410256410257, + "eval_loss": 1.1907858848571777, + "eval_runtime": 36.4875, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 27800 + }, + { + "epoch": 143.07692307692307, + "grad_norm": 27.866281509399414, + "learning_rate": 7.211700000000001e-06, + "loss": 1.0346, + "step": 27900 + }, + { + "epoch": 143.07692307692307, + "eval_loss": 1.1854588985443115, + "eval_runtime": 36.3736, + "eval_samples_per_second": 10.805, + "eval_steps_per_second": 1.375, + "step": 27900 + }, + { + "epoch": 143.5897435897436, + "grad_norm": 35.50307846069336, + "learning_rate": 7.2017e-06, + "loss": 0.9947, + "step": 28000 + }, + { + "epoch": 143.5897435897436, + "eval_loss": 1.1978769302368164, + "eval_runtime": 36.3032, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 28000 + }, + { + "epoch": 144.10256410256412, + "grad_norm": 30.067153930664062, + "learning_rate": 7.1917e-06, + "loss": 0.9954, + "step": 28100 + }, + { + "epoch": 144.10256410256412, + "eval_loss": 1.1732500791549683, + "eval_runtime": 36.1784, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 28100 + }, + { + "epoch": 144.6153846153846, + "grad_norm": 40.56125259399414, + "learning_rate": 7.1817e-06, + "loss": 0.9954, + "step": 28200 + }, + { + "epoch": 144.6153846153846, + "eval_loss": 1.178220510482788, + "eval_runtime": 36.2456, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 28200 + }, + { + "epoch": 145.12820512820514, + "grad_norm": 23.636066436767578, + "learning_rate": 7.171800000000001e-06, + "loss": 1.0048, + "step": 28300 + }, + { + "epoch": 145.12820512820514, + "eval_loss": 1.1718521118164062, + "eval_runtime": 36.7366, + "eval_samples_per_second": 10.698, + "eval_steps_per_second": 1.361, + "step": 28300 + }, + { + "epoch": 145.64102564102564, + "grad_norm": 97.56835174560547, + "learning_rate": 7.161800000000001e-06, + "loss": 0.9895, + "step": 28400 + }, + { + "epoch": 145.64102564102564, + "eval_loss": 1.1722639799118042, + "eval_runtime": 36.2683, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 28400 + }, + { + "epoch": 146.15384615384616, + "grad_norm": 54.8230094909668, + "learning_rate": 7.151800000000001e-06, + "loss": 1.001, + "step": 28500 + }, + { + "epoch": 146.15384615384616, + "eval_loss": 1.1968903541564941, + "eval_runtime": 36.2527, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 28500 + }, + { + "epoch": 146.66666666666666, + "grad_norm": 15.167412757873535, + "learning_rate": 7.141800000000001e-06, + "loss": 0.9852, + "step": 28600 + }, + { + "epoch": 146.66666666666666, + "eval_loss": 1.188344120979309, + "eval_runtime": 36.3001, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 28600 + }, + { + "epoch": 147.17948717948718, + "grad_norm": 69.213134765625, + "learning_rate": 7.131800000000001e-06, + "loss": 1.016, + "step": 28700 + }, + { + "epoch": 147.17948717948718, + "eval_loss": 1.1813448667526245, + "eval_runtime": 36.0937, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 28700 + }, + { + "epoch": 147.69230769230768, + "grad_norm": 43.38339614868164, + "learning_rate": 7.121800000000001e-06, + "loss": 1.0048, + "step": 28800 + }, + { + "epoch": 147.69230769230768, + "eval_loss": 1.189638376235962, + "eval_runtime": 36.1286, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 28800 + }, + { + "epoch": 148.2051282051282, + "grad_norm": 43.99717712402344, + "learning_rate": 7.111800000000001e-06, + "loss": 0.9838, + "step": 28900 + }, + { + "epoch": 148.2051282051282, + "eval_loss": 1.1951574087142944, + "eval_runtime": 36.2031, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 28900 + }, + { + "epoch": 148.71794871794873, + "grad_norm": 31.055850982666016, + "learning_rate": 7.101800000000001e-06, + "loss": 0.9683, + "step": 29000 + }, + { + "epoch": 148.71794871794873, + "eval_loss": 1.1835789680480957, + "eval_runtime": 36.2271, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 29000 + }, + { + "epoch": 149.23076923076923, + "grad_norm": 32.13520050048828, + "learning_rate": 7.091800000000001e-06, + "loss": 1.0131, + "step": 29100 + }, + { + "epoch": 149.23076923076923, + "eval_loss": 1.1654757261276245, + "eval_runtime": 36.5086, + "eval_samples_per_second": 10.765, + "eval_steps_per_second": 1.37, + "step": 29100 + }, + { + "epoch": 149.74358974358975, + "grad_norm": 31.264759063720703, + "learning_rate": 7.0818000000000005e-06, + "loss": 1.0073, + "step": 29200 + }, + { + "epoch": 149.74358974358975, + "eval_loss": 1.1849567890167236, + "eval_runtime": 36.7699, + "eval_samples_per_second": 10.688, + "eval_steps_per_second": 1.36, + "step": 29200 + }, + { + "epoch": 150.25641025641025, + "grad_norm": 32.43284225463867, + "learning_rate": 7.071800000000001e-06, + "loss": 0.9656, + "step": 29300 + }, + { + "epoch": 150.25641025641025, + "eval_loss": 1.168485403060913, + "eval_runtime": 36.2215, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 29300 + }, + { + "epoch": 150.76923076923077, + "grad_norm": 59.16147994995117, + "learning_rate": 7.061800000000001e-06, + "loss": 0.9923, + "step": 29400 + }, + { + "epoch": 150.76923076923077, + "eval_loss": 1.1792685985565186, + "eval_runtime": 36.3853, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 29400 + }, + { + "epoch": 151.28205128205127, + "grad_norm": 101.80491638183594, + "learning_rate": 7.051800000000001e-06, + "loss": 1.0074, + "step": 29500 + }, + { + "epoch": 151.28205128205127, + "eval_loss": 1.1642369031906128, + "eval_runtime": 36.3983, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 29500 + }, + { + "epoch": 151.7948717948718, + "grad_norm": 25.33744239807129, + "learning_rate": 7.041800000000001e-06, + "loss": 0.9715, + "step": 29600 + }, + { + "epoch": 151.7948717948718, + "eval_loss": 1.1821907758712769, + "eval_runtime": 36.0667, + "eval_samples_per_second": 10.896, + "eval_steps_per_second": 1.386, + "step": 29600 + }, + { + "epoch": 152.30769230769232, + "grad_norm": 116.95513916015625, + "learning_rate": 7.031800000000001e-06, + "loss": 0.9958, + "step": 29700 + }, + { + "epoch": 152.30769230769232, + "eval_loss": 1.1610026359558105, + "eval_runtime": 36.2008, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 29700 + }, + { + "epoch": 152.82051282051282, + "grad_norm": 90.85464477539062, + "learning_rate": 7.021800000000001e-06, + "loss": 0.9767, + "step": 29800 + }, + { + "epoch": 152.82051282051282, + "eval_loss": 1.1621683835983276, + "eval_runtime": 36.2957, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 29800 + }, + { + "epoch": 153.33333333333334, + "grad_norm": 24.217872619628906, + "learning_rate": 7.011800000000001e-06, + "loss": 0.9786, + "step": 29900 + }, + { + "epoch": 153.33333333333334, + "eval_loss": 1.158172845840454, + "eval_runtime": 35.9046, + "eval_samples_per_second": 10.946, + "eval_steps_per_second": 1.393, + "step": 29900 + }, + { + "epoch": 153.84615384615384, + "grad_norm": 30.70758628845215, + "learning_rate": 7.001900000000001e-06, + "loss": 1.0029, + "step": 30000 + }, + { + "epoch": 153.84615384615384, + "eval_loss": 1.1670671701431274, + "eval_runtime": 36.1463, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 30000 + }, + { + "epoch": 154.35897435897436, + "grad_norm": 29.265003204345703, + "learning_rate": 6.991900000000001e-06, + "loss": 1.0156, + "step": 30100 + }, + { + "epoch": 154.35897435897436, + "eval_loss": 1.1782671213150024, + "eval_runtime": 36.3854, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 30100 + }, + { + "epoch": 154.87179487179486, + "grad_norm": 28.448328018188477, + "learning_rate": 6.9819e-06, + "loss": 0.9709, + "step": 30200 + }, + { + "epoch": 154.87179487179486, + "eval_loss": 1.16402268409729, + "eval_runtime": 36.815, + "eval_samples_per_second": 10.675, + "eval_steps_per_second": 1.358, + "step": 30200 + }, + { + "epoch": 155.3846153846154, + "grad_norm": 65.755615234375, + "learning_rate": 6.9719e-06, + "loss": 0.9669, + "step": 30300 + }, + { + "epoch": 155.3846153846154, + "eval_loss": 1.1739165782928467, + "eval_runtime": 36.34, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 30300 + }, + { + "epoch": 155.89743589743588, + "grad_norm": 45.11248779296875, + "learning_rate": 6.9619e-06, + "loss": 1.0136, + "step": 30400 + }, + { + "epoch": 155.89743589743588, + "eval_loss": 1.143236756324768, + "eval_runtime": 36.4174, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 30400 + }, + { + "epoch": 156.4102564102564, + "grad_norm": 50.76137924194336, + "learning_rate": 6.9519e-06, + "loss": 0.9565, + "step": 30500 + }, + { + "epoch": 156.4102564102564, + "eval_loss": 1.1797446012496948, + "eval_runtime": 36.3814, + "eval_samples_per_second": 10.802, + "eval_steps_per_second": 1.374, + "step": 30500 + }, + { + "epoch": 156.92307692307693, + "grad_norm": 35.62437438964844, + "learning_rate": 6.9419e-06, + "loss": 1.0004, + "step": 30600 + }, + { + "epoch": 156.92307692307693, + "eval_loss": 1.1688662767410278, + "eval_runtime": 36.529, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 30600 + }, + { + "epoch": 157.43589743589743, + "grad_norm": 43.17647933959961, + "learning_rate": 6.9319000000000005e-06, + "loss": 0.9897, + "step": 30700 + }, + { + "epoch": 157.43589743589743, + "eval_loss": 1.1841660737991333, + "eval_runtime": 36.4235, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 30700 + }, + { + "epoch": 157.94871794871796, + "grad_norm": 56.77204513549805, + "learning_rate": 6.921900000000001e-06, + "loss": 0.9817, + "step": 30800 + }, + { + "epoch": 157.94871794871796, + "eval_loss": 1.171298861503601, + "eval_runtime": 36.4571, + "eval_samples_per_second": 10.78, + "eval_steps_per_second": 1.371, + "step": 30800 + }, + { + "epoch": 158.46153846153845, + "grad_norm": 26.698223114013672, + "learning_rate": 6.911900000000001e-06, + "loss": 0.9528, + "step": 30900 + }, + { + "epoch": 158.46153846153845, + "eval_loss": 1.1642067432403564, + "eval_runtime": 36.2888, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 30900 + }, + { + "epoch": 158.97435897435898, + "grad_norm": 23.478918075561523, + "learning_rate": 6.9019e-06, + "loss": 0.9874, + "step": 31000 + }, + { + "epoch": 158.97435897435898, + "eval_loss": 1.1525483131408691, + "eval_runtime": 36.1434, + "eval_samples_per_second": 10.873, + "eval_steps_per_second": 1.383, + "step": 31000 + }, + { + "epoch": 159.48717948717947, + "grad_norm": 38.60530471801758, + "learning_rate": 6.8919e-06, + "loss": 0.9591, + "step": 31100 + }, + { + "epoch": 159.48717948717947, + "eval_loss": 1.1493828296661377, + "eval_runtime": 36.2104, + "eval_samples_per_second": 10.853, + "eval_steps_per_second": 1.381, + "step": 31100 + }, + { + "epoch": 160.0, + "grad_norm": 20.907501220703125, + "learning_rate": 6.8819e-06, + "loss": 0.9884, + "step": 31200 + }, + { + "epoch": 160.0, + "eval_loss": 1.1617672443389893, + "eval_runtime": 36.2202, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 31200 + }, + { + "epoch": 160.51282051282053, + "grad_norm": 46.7100715637207, + "learning_rate": 6.8719e-06, + "loss": 0.9584, + "step": 31300 + }, + { + "epoch": 160.51282051282053, + "eval_loss": 1.1728988885879517, + "eval_runtime": 36.0326, + "eval_samples_per_second": 10.907, + "eval_steps_per_second": 1.388, + "step": 31300 + }, + { + "epoch": 161.02564102564102, + "grad_norm": 26.359403610229492, + "learning_rate": 6.8619e-06, + "loss": 0.9797, + "step": 31400 + }, + { + "epoch": 161.02564102564102, + "eval_loss": 1.1622204780578613, + "eval_runtime": 36.3492, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 31400 + }, + { + "epoch": 161.53846153846155, + "grad_norm": 19.935714721679688, + "learning_rate": 6.8519e-06, + "loss": 0.9914, + "step": 31500 + }, + { + "epoch": 161.53846153846155, + "eval_loss": 1.1545765399932861, + "eval_runtime": 36.0783, + "eval_samples_per_second": 10.893, + "eval_steps_per_second": 1.386, + "step": 31500 + }, + { + "epoch": 162.05128205128204, + "grad_norm": 33.71799850463867, + "learning_rate": 6.8419000000000005e-06, + "loss": 0.9906, + "step": 31600 + }, + { + "epoch": 162.05128205128204, + "eval_loss": 1.1707539558410645, + "eval_runtime": 36.3207, + "eval_samples_per_second": 10.82, + "eval_steps_per_second": 1.377, + "step": 31600 + }, + { + "epoch": 162.56410256410257, + "grad_norm": 18.32308006286621, + "learning_rate": 6.831900000000001e-06, + "loss": 0.9557, + "step": 31700 + }, + { + "epoch": 162.56410256410257, + "eval_loss": 1.167324423789978, + "eval_runtime": 36.655, + "eval_samples_per_second": 10.722, + "eval_steps_per_second": 1.364, + "step": 31700 + }, + { + "epoch": 163.07692307692307, + "grad_norm": 48.19999313354492, + "learning_rate": 6.8219e-06, + "loss": 0.9933, + "step": 31800 + }, + { + "epoch": 163.07692307692307, + "eval_loss": 1.1636213064193726, + "eval_runtime": 36.3559, + "eval_samples_per_second": 10.81, + "eval_steps_per_second": 1.375, + "step": 31800 + }, + { + "epoch": 163.5897435897436, + "grad_norm": 50.3586540222168, + "learning_rate": 6.8119e-06, + "loss": 0.9807, + "step": 31900 + }, + { + "epoch": 163.5897435897436, + "eval_loss": 1.1503098011016846, + "eval_runtime": 36.5851, + "eval_samples_per_second": 10.742, + "eval_steps_per_second": 1.367, + "step": 31900 + }, + { + "epoch": 164.10256410256412, + "grad_norm": 26.934223175048828, + "learning_rate": 6.8019e-06, + "loss": 0.9659, + "step": 32000 + }, + { + "epoch": 164.10256410256412, + "eval_loss": 1.1598249673843384, + "eval_runtime": 36.3889, + "eval_samples_per_second": 10.8, + "eval_steps_per_second": 1.374, + "step": 32000 + }, + { + "epoch": 164.6153846153846, + "grad_norm": 19.510753631591797, + "learning_rate": 6.7919e-06, + "loss": 0.9309, + "step": 32100 + }, + { + "epoch": 164.6153846153846, + "eval_loss": 1.1527222394943237, + "eval_runtime": 36.5229, + "eval_samples_per_second": 10.76, + "eval_steps_per_second": 1.369, + "step": 32100 + }, + { + "epoch": 165.12820512820514, + "grad_norm": 35.315731048583984, + "learning_rate": 6.7819e-06, + "loss": 1.0032, + "step": 32200 + }, + { + "epoch": 165.12820512820514, + "eval_loss": 1.1454689502716064, + "eval_runtime": 36.6685, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 32200 + }, + { + "epoch": 165.64102564102564, + "grad_norm": 43.165409088134766, + "learning_rate": 6.7719e-06, + "loss": 0.9481, + "step": 32300 + }, + { + "epoch": 165.64102564102564, + "eval_loss": 1.1533300876617432, + "eval_runtime": 36.4611, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 32300 + }, + { + "epoch": 166.15384615384616, + "grad_norm": 24.036327362060547, + "learning_rate": 6.7619000000000004e-06, + "loss": 0.981, + "step": 32400 + }, + { + "epoch": 166.15384615384616, + "eval_loss": 1.1508747339248657, + "eval_runtime": 36.6157, + "eval_samples_per_second": 10.733, + "eval_steps_per_second": 1.366, + "step": 32400 + }, + { + "epoch": 166.66666666666666, + "grad_norm": 44.49024200439453, + "learning_rate": 6.7519000000000005e-06, + "loss": 0.9824, + "step": 32500 + }, + { + "epoch": 166.66666666666666, + "eval_loss": 1.1492385864257812, + "eval_runtime": 36.7467, + "eval_samples_per_second": 10.695, + "eval_steps_per_second": 1.361, + "step": 32500 + }, + { + "epoch": 167.17948717948718, + "grad_norm": 58.19458770751953, + "learning_rate": 6.7419e-06, + "loss": 0.9992, + "step": 32600 + }, + { + "epoch": 167.17948717948718, + "eval_loss": 1.176912784576416, + "eval_runtime": 36.4221, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 32600 + }, + { + "epoch": 167.69230769230768, + "grad_norm": 24.772140502929688, + "learning_rate": 6.7319e-06, + "loss": 0.9341, + "step": 32700 + }, + { + "epoch": 167.69230769230768, + "eval_loss": 1.150542140007019, + "eval_runtime": 36.6285, + "eval_samples_per_second": 10.729, + "eval_steps_per_second": 1.365, + "step": 32700 + }, + { + "epoch": 168.2051282051282, + "grad_norm": 31.784894943237305, + "learning_rate": 6.7219e-06, + "loss": 0.9599, + "step": 32800 + }, + { + "epoch": 168.2051282051282, + "eval_loss": 1.1548517942428589, + "eval_runtime": 36.613, + "eval_samples_per_second": 10.734, + "eval_steps_per_second": 1.366, + "step": 32800 + }, + { + "epoch": 168.71794871794873, + "grad_norm": 25.822301864624023, + "learning_rate": 6.7119e-06, + "loss": 0.9657, + "step": 32900 + }, + { + "epoch": 168.71794871794873, + "eval_loss": 1.1672749519348145, + "eval_runtime": 36.4185, + "eval_samples_per_second": 10.791, + "eval_steps_per_second": 1.373, + "step": 32900 + }, + { + "epoch": 169.23076923076923, + "grad_norm": 23.413084030151367, + "learning_rate": 6.7019e-06, + "loss": 0.9609, + "step": 33000 + }, + { + "epoch": 169.23076923076923, + "eval_loss": 1.1891629695892334, + "eval_runtime": 35.8404, + "eval_samples_per_second": 10.965, + "eval_steps_per_second": 1.395, + "step": 33000 + }, + { + "epoch": 169.74358974358975, + "grad_norm": 46.016353607177734, + "learning_rate": 6.6919e-06, + "loss": 1.0016, + "step": 33100 + }, + { + "epoch": 169.74358974358975, + "eval_loss": 1.1610451936721802, + "eval_runtime": 36.5132, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 33100 + }, + { + "epoch": 170.25641025641025, + "grad_norm": 48.33186721801758, + "learning_rate": 6.6819e-06, + "loss": 0.9395, + "step": 33200 + }, + { + "epoch": 170.25641025641025, + "eval_loss": 1.1638587713241577, + "eval_runtime": 35.833, + "eval_samples_per_second": 10.968, + "eval_steps_per_second": 1.395, + "step": 33200 + }, + { + "epoch": 170.76923076923077, + "grad_norm": 66.21365356445312, + "learning_rate": 6.6719000000000004e-06, + "loss": 0.9914, + "step": 33300 + }, + { + "epoch": 170.76923076923077, + "eval_loss": 1.1556510925292969, + "eval_runtime": 36.4668, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 33300 + }, + { + "epoch": 171.28205128205127, + "grad_norm": 76.94286346435547, + "learning_rate": 6.6619e-06, + "loss": 0.9762, + "step": 33400 + }, + { + "epoch": 171.28205128205127, + "eval_loss": 1.148744821548462, + "eval_runtime": 36.4485, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 33400 + }, + { + "epoch": 171.7948717948718, + "grad_norm": 93.9037094116211, + "learning_rate": 6.6519e-06, + "loss": 0.9483, + "step": 33500 + }, + { + "epoch": 171.7948717948718, + "eval_loss": 1.1647981405258179, + "eval_runtime": 36.589, + "eval_samples_per_second": 10.741, + "eval_steps_per_second": 1.367, + "step": 33500 + }, + { + "epoch": 172.30769230769232, + "grad_norm": 29.575817108154297, + "learning_rate": 6.6419e-06, + "loss": 0.9438, + "step": 33600 + }, + { + "epoch": 172.30769230769232, + "eval_loss": 1.1736277341842651, + "eval_runtime": 36.6333, + "eval_samples_per_second": 10.728, + "eval_steps_per_second": 1.365, + "step": 33600 + }, + { + "epoch": 172.82051282051282, + "grad_norm": 27.817867279052734, + "learning_rate": 6.6319e-06, + "loss": 0.9667, + "step": 33700 + }, + { + "epoch": 172.82051282051282, + "eval_loss": 1.181942105293274, + "eval_runtime": 36.67, + "eval_samples_per_second": 10.717, + "eval_steps_per_second": 1.364, + "step": 33700 + }, + { + "epoch": 173.33333333333334, + "grad_norm": 30.455278396606445, + "learning_rate": 6.6219e-06, + "loss": 0.973, + "step": 33800 + }, + { + "epoch": 173.33333333333334, + "eval_loss": 1.1556087732315063, + "eval_runtime": 36.4992, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 33800 + }, + { + "epoch": 173.84615384615384, + "grad_norm": 49.9559211730957, + "learning_rate": 6.611900000000001e-06, + "loss": 0.9537, + "step": 33900 + }, + { + "epoch": 173.84615384615384, + "eval_loss": 1.159470558166504, + "eval_runtime": 36.9007, + "eval_samples_per_second": 10.65, + "eval_steps_per_second": 1.355, + "step": 33900 + }, + { + "epoch": 174.35897435897436, + "grad_norm": 21.05950355529785, + "learning_rate": 6.601900000000001e-06, + "loss": 0.9383, + "step": 34000 + }, + { + "epoch": 174.35897435897436, + "eval_loss": 1.1624013185501099, + "eval_runtime": 37.0722, + "eval_samples_per_second": 10.601, + "eval_steps_per_second": 1.349, + "step": 34000 + }, + { + "epoch": 174.87179487179486, + "grad_norm": 39.16794967651367, + "learning_rate": 6.591900000000001e-06, + "loss": 0.9841, + "step": 34100 + }, + { + "epoch": 174.87179487179486, + "eval_loss": 1.1565006971359253, + "eval_runtime": 36.6378, + "eval_samples_per_second": 10.727, + "eval_steps_per_second": 1.365, + "step": 34100 + }, + { + "epoch": 175.3846153846154, + "grad_norm": 30.419639587402344, + "learning_rate": 6.581900000000001e-06, + "loss": 0.9389, + "step": 34200 + }, + { + "epoch": 175.3846153846154, + "eval_loss": 1.1742419004440308, + "eval_runtime": 36.2064, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 34200 + }, + { + "epoch": 175.89743589743588, + "grad_norm": 36.81096267700195, + "learning_rate": 6.571900000000001e-06, + "loss": 0.9573, + "step": 34300 + }, + { + "epoch": 175.89743589743588, + "eval_loss": 1.1683204174041748, + "eval_runtime": 36.759, + "eval_samples_per_second": 10.691, + "eval_steps_per_second": 1.36, + "step": 34300 + }, + { + "epoch": 176.4102564102564, + "grad_norm": 41.405521392822266, + "learning_rate": 6.561900000000001e-06, + "loss": 0.931, + "step": 34400 + }, + { + "epoch": 176.4102564102564, + "eval_loss": 1.1893439292907715, + "eval_runtime": 36.6676, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 34400 + }, + { + "epoch": 176.92307692307693, + "grad_norm": 78.70719909667969, + "learning_rate": 6.551900000000001e-06, + "loss": 1.0, + "step": 34500 + }, + { + "epoch": 176.92307692307693, + "eval_loss": 1.161646842956543, + "eval_runtime": 36.3148, + "eval_samples_per_second": 10.822, + "eval_steps_per_second": 1.377, + "step": 34500 + }, + { + "epoch": 177.43589743589743, + "grad_norm": 38.149898529052734, + "learning_rate": 6.541900000000001e-06, + "loss": 0.9591, + "step": 34600 + }, + { + "epoch": 177.43589743589743, + "eval_loss": 1.1639251708984375, + "eval_runtime": 36.3126, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 34600 + }, + { + "epoch": 177.94871794871796, + "grad_norm": 36.89656066894531, + "learning_rate": 6.531900000000001e-06, + "loss": 0.9682, + "step": 34700 + }, + { + "epoch": 177.94871794871796, + "eval_loss": 1.1663053035736084, + "eval_runtime": 36.4388, + "eval_samples_per_second": 10.785, + "eval_steps_per_second": 1.372, + "step": 34700 + }, + { + "epoch": 178.46153846153845, + "grad_norm": 23.646724700927734, + "learning_rate": 6.521900000000001e-06, + "loss": 0.9498, + "step": 34800 + }, + { + "epoch": 178.46153846153845, + "eval_loss": 1.159184455871582, + "eval_runtime": 36.3522, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 34800 + }, + { + "epoch": 178.97435897435898, + "grad_norm": 31.551183700561523, + "learning_rate": 6.511900000000001e-06, + "loss": 0.9634, + "step": 34900 + }, + { + "epoch": 178.97435897435898, + "eval_loss": 1.1479530334472656, + "eval_runtime": 36.3066, + "eval_samples_per_second": 10.824, + "eval_steps_per_second": 1.377, + "step": 34900 + }, + { + "epoch": 179.48717948717947, + "grad_norm": 20.25978660583496, + "learning_rate": 6.501900000000001e-06, + "loss": 0.9554, + "step": 35000 + }, + { + "epoch": 179.48717948717947, + "eval_loss": 1.1639673709869385, + "eval_runtime": 36.5013, + "eval_samples_per_second": 10.767, + "eval_steps_per_second": 1.37, + "step": 35000 + }, + { + "epoch": 180.0, + "grad_norm": 34.67188262939453, + "learning_rate": 6.491900000000001e-06, + "loss": 0.9649, + "step": 35100 + }, + { + "epoch": 180.0, + "eval_loss": 1.1612266302108765, + "eval_runtime": 36.5383, + "eval_samples_per_second": 10.756, + "eval_steps_per_second": 1.368, + "step": 35100 + }, + { + "epoch": 180.51282051282053, + "grad_norm": 29.289005279541016, + "learning_rate": 6.4819000000000006e-06, + "loss": 0.9538, + "step": 35200 + }, + { + "epoch": 180.51282051282053, + "eval_loss": 1.147358775138855, + "eval_runtime": 36.3617, + "eval_samples_per_second": 10.808, + "eval_steps_per_second": 1.375, + "step": 35200 + }, + { + "epoch": 181.02564102564102, + "grad_norm": 23.1491641998291, + "learning_rate": 6.471900000000001e-06, + "loss": 0.9584, + "step": 35300 + }, + { + "epoch": 181.02564102564102, + "eval_loss": 1.1400158405303955, + "eval_runtime": 35.9302, + "eval_samples_per_second": 10.938, + "eval_steps_per_second": 1.392, + "step": 35300 + }, + { + "epoch": 181.53846153846155, + "grad_norm": 31.366552352905273, + "learning_rate": 6.461900000000001e-06, + "loss": 0.9624, + "step": 35400 + }, + { + "epoch": 181.53846153846155, + "eval_loss": 1.1689287424087524, + "eval_runtime": 35.9251, + "eval_samples_per_second": 10.939, + "eval_steps_per_second": 1.392, + "step": 35400 + }, + { + "epoch": 182.05128205128204, + "grad_norm": 67.5167236328125, + "learning_rate": 6.451900000000001e-06, + "loss": 0.9427, + "step": 35500 + }, + { + "epoch": 182.05128205128204, + "eval_loss": 1.1505628824234009, + "eval_runtime": 36.6043, + "eval_samples_per_second": 10.736, + "eval_steps_per_second": 1.366, + "step": 35500 + }, + { + "epoch": 182.56410256410257, + "grad_norm": 41.26319122314453, + "learning_rate": 6.441900000000001e-06, + "loss": 0.9355, + "step": 35600 + }, + { + "epoch": 182.56410256410257, + "eval_loss": 1.1368300914764404, + "eval_runtime": 36.5073, + "eval_samples_per_second": 10.765, + "eval_steps_per_second": 1.37, + "step": 35600 + }, + { + "epoch": 183.07692307692307, + "grad_norm": 20.184959411621094, + "learning_rate": 6.431900000000001e-06, + "loss": 0.9581, + "step": 35700 + }, + { + "epoch": 183.07692307692307, + "eval_loss": 1.1507443189620972, + "eval_runtime": 36.6174, + "eval_samples_per_second": 10.733, + "eval_steps_per_second": 1.365, + "step": 35700 + }, + { + "epoch": 183.5897435897436, + "grad_norm": 74.46769714355469, + "learning_rate": 6.421900000000001e-06, + "loss": 0.9348, + "step": 35800 + }, + { + "epoch": 183.5897435897436, + "eval_loss": 1.1444846391677856, + "eval_runtime": 36.3345, + "eval_samples_per_second": 10.816, + "eval_steps_per_second": 1.376, + "step": 35800 + }, + { + "epoch": 184.10256410256412, + "grad_norm": 59.410648345947266, + "learning_rate": 6.411900000000001e-06, + "loss": 0.9935, + "step": 35900 + }, + { + "epoch": 184.10256410256412, + "eval_loss": 1.1530898809432983, + "eval_runtime": 36.1224, + "eval_samples_per_second": 10.88, + "eval_steps_per_second": 1.384, + "step": 35900 + }, + { + "epoch": 184.6153846153846, + "grad_norm": 35.07084274291992, + "learning_rate": 6.4019000000000005e-06, + "loss": 0.9724, + "step": 36000 + }, + { + "epoch": 184.6153846153846, + "eval_loss": 1.1684391498565674, + "eval_runtime": 36.2728, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 36000 + }, + { + "epoch": 185.12820512820514, + "grad_norm": 29.23606300354004, + "learning_rate": 6.391900000000001e-06, + "loss": 0.9098, + "step": 36100 + }, + { + "epoch": 185.12820512820514, + "eval_loss": 1.158069372177124, + "eval_runtime": 36.3213, + "eval_samples_per_second": 10.82, + "eval_steps_per_second": 1.377, + "step": 36100 + }, + { + "epoch": 185.64102564102564, + "grad_norm": 40.36886215209961, + "learning_rate": 6.381900000000001e-06, + "loss": 0.936, + "step": 36200 + }, + { + "epoch": 185.64102564102564, + "eval_loss": 1.1593129634857178, + "eval_runtime": 36.2993, + "eval_samples_per_second": 10.827, + "eval_steps_per_second": 1.377, + "step": 36200 + }, + { + "epoch": 186.15384615384616, + "grad_norm": 34.383018493652344, + "learning_rate": 6.371900000000001e-06, + "loss": 0.9353, + "step": 36300 + }, + { + "epoch": 186.15384615384616, + "eval_loss": 1.1560229063034058, + "eval_runtime": 36.2044, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 36300 + }, + { + "epoch": 186.66666666666666, + "grad_norm": 40.90024185180664, + "learning_rate": 6.361900000000001e-06, + "loss": 0.9722, + "step": 36400 + }, + { + "epoch": 186.66666666666666, + "eval_loss": 1.145377278327942, + "eval_runtime": 36.3434, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 36400 + }, + { + "epoch": 187.17948717948718, + "grad_norm": 22.60957908630371, + "learning_rate": 6.352e-06, + "loss": 0.9376, + "step": 36500 + }, + { + "epoch": 187.17948717948718, + "eval_loss": 1.1557235717773438, + "eval_runtime": 36.1426, + "eval_samples_per_second": 10.874, + "eval_steps_per_second": 1.383, + "step": 36500 + }, + { + "epoch": 187.69230769230768, + "grad_norm": 64.978271484375, + "learning_rate": 6.3420000000000004e-06, + "loss": 0.9624, + "step": 36600 + }, + { + "epoch": 187.69230769230768, + "eval_loss": 1.1466400623321533, + "eval_runtime": 36.4648, + "eval_samples_per_second": 10.778, + "eval_steps_per_second": 1.371, + "step": 36600 + }, + { + "epoch": 188.2051282051282, + "grad_norm": 58.62668991088867, + "learning_rate": 6.3320000000000005e-06, + "loss": 0.9113, + "step": 36700 + }, + { + "epoch": 188.2051282051282, + "eval_loss": 1.1506292819976807, + "eval_runtime": 36.4044, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 36700 + }, + { + "epoch": 188.71794871794873, + "grad_norm": 43.32209396362305, + "learning_rate": 6.322000000000001e-06, + "loss": 0.9861, + "step": 36800 + }, + { + "epoch": 188.71794871794873, + "eval_loss": 1.1522414684295654, + "eval_runtime": 36.3402, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 36800 + }, + { + "epoch": 189.23076923076923, + "grad_norm": 41.845062255859375, + "learning_rate": 6.312000000000001e-06, + "loss": 0.9719, + "step": 36900 + }, + { + "epoch": 189.23076923076923, + "eval_loss": 1.1591370105743408, + "eval_runtime": 36.4574, + "eval_samples_per_second": 10.78, + "eval_steps_per_second": 1.371, + "step": 36900 + }, + { + "epoch": 189.74358974358975, + "grad_norm": 24.911863327026367, + "learning_rate": 6.302e-06, + "loss": 0.9191, + "step": 37000 + }, + { + "epoch": 189.74358974358975, + "eval_loss": 1.1516185998916626, + "eval_runtime": 36.349, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 37000 + }, + { + "epoch": 190.25641025641025, + "grad_norm": 31.848737716674805, + "learning_rate": 6.292e-06, + "loss": 0.9232, + "step": 37100 + }, + { + "epoch": 190.25641025641025, + "eval_loss": 1.1466903686523438, + "eval_runtime": 36.2024, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 37100 + }, + { + "epoch": 190.76923076923077, + "grad_norm": 118.13239288330078, + "learning_rate": 6.282e-06, + "loss": 0.925, + "step": 37200 + }, + { + "epoch": 190.76923076923077, + "eval_loss": 1.156925916671753, + "eval_runtime": 36.6009, + "eval_samples_per_second": 10.737, + "eval_steps_per_second": 1.366, + "step": 37200 + }, + { + "epoch": 191.28205128205127, + "grad_norm": 38.65251159667969, + "learning_rate": 6.272e-06, + "loss": 0.9754, + "step": 37300 + }, + { + "epoch": 191.28205128205127, + "eval_loss": 1.151605248451233, + "eval_runtime": 36.3302, + "eval_samples_per_second": 10.817, + "eval_steps_per_second": 1.376, + "step": 37300 + }, + { + "epoch": 191.7948717948718, + "grad_norm": 18.521682739257812, + "learning_rate": 6.262e-06, + "loss": 0.9626, + "step": 37400 + }, + { + "epoch": 191.7948717948718, + "eval_loss": 1.165377140045166, + "eval_runtime": 36.4212, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 37400 + }, + { + "epoch": 192.30769230769232, + "grad_norm": 26.990888595581055, + "learning_rate": 6.2520000000000004e-06, + "loss": 0.9254, + "step": 37500 + }, + { + "epoch": 192.30769230769232, + "eval_loss": 1.1528462171554565, + "eval_runtime": 36.5754, + "eval_samples_per_second": 10.745, + "eval_steps_per_second": 1.367, + "step": 37500 + }, + { + "epoch": 192.82051282051282, + "grad_norm": 51.070289611816406, + "learning_rate": 6.2420000000000005e-06, + "loss": 0.9171, + "step": 37600 + }, + { + "epoch": 192.82051282051282, + "eval_loss": 1.15065336227417, + "eval_runtime": 35.7158, + "eval_samples_per_second": 11.004, + "eval_steps_per_second": 1.4, + "step": 37600 + }, + { + "epoch": 193.33333333333334, + "grad_norm": 39.551231384277344, + "learning_rate": 6.232000000000001e-06, + "loss": 0.9448, + "step": 37700 + }, + { + "epoch": 193.33333333333334, + "eval_loss": 1.1428643465042114, + "eval_runtime": 36.367, + "eval_samples_per_second": 10.807, + "eval_steps_per_second": 1.375, + "step": 37700 + }, + { + "epoch": 193.84615384615384, + "grad_norm": 28.688800811767578, + "learning_rate": 6.222e-06, + "loss": 0.9848, + "step": 37800 + }, + { + "epoch": 193.84615384615384, + "eval_loss": 1.1568922996520996, + "eval_runtime": 36.5107, + "eval_samples_per_second": 10.764, + "eval_steps_per_second": 1.369, + "step": 37800 + }, + { + "epoch": 194.35897435897436, + "grad_norm": 16.188304901123047, + "learning_rate": 6.212e-06, + "loss": 0.906, + "step": 37900 + }, + { + "epoch": 194.35897435897436, + "eval_loss": 1.13926362991333, + "eval_runtime": 36.6812, + "eval_samples_per_second": 10.714, + "eval_steps_per_second": 1.363, + "step": 37900 + }, + { + "epoch": 194.87179487179486, + "grad_norm": 20.75755500793457, + "learning_rate": 6.202e-06, + "loss": 0.9438, + "step": 38000 + }, + { + "epoch": 194.87179487179486, + "eval_loss": 1.1322276592254639, + "eval_runtime": 36.3793, + "eval_samples_per_second": 10.803, + "eval_steps_per_second": 1.374, + "step": 38000 + }, + { + "epoch": 195.3846153846154, + "grad_norm": 25.585630416870117, + "learning_rate": 6.192e-06, + "loss": 0.9399, + "step": 38100 + }, + { + "epoch": 195.3846153846154, + "eval_loss": 1.1462476253509521, + "eval_runtime": 36.4846, + "eval_samples_per_second": 10.772, + "eval_steps_per_second": 1.37, + "step": 38100 + }, + { + "epoch": 195.89743589743588, + "grad_norm": 26.052518844604492, + "learning_rate": 6.182e-06, + "loss": 0.9641, + "step": 38200 + }, + { + "epoch": 195.89743589743588, + "eval_loss": 1.1364027261734009, + "eval_runtime": 36.4301, + "eval_samples_per_second": 10.788, + "eval_steps_per_second": 1.372, + "step": 38200 + }, + { + "epoch": 196.4102564102564, + "grad_norm": 25.223285675048828, + "learning_rate": 6.172e-06, + "loss": 0.9562, + "step": 38300 + }, + { + "epoch": 196.4102564102564, + "eval_loss": 1.164099931716919, + "eval_runtime": 36.6269, + "eval_samples_per_second": 10.73, + "eval_steps_per_second": 1.365, + "step": 38300 + }, + { + "epoch": 196.92307692307693, + "grad_norm": 69.42765808105469, + "learning_rate": 6.1620000000000005e-06, + "loss": 0.917, + "step": 38400 + }, + { + "epoch": 196.92307692307693, + "eval_loss": 1.1439764499664307, + "eval_runtime": 36.3533, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 38400 + }, + { + "epoch": 197.43589743589743, + "grad_norm": 39.35880661010742, + "learning_rate": 6.1520000000000006e-06, + "loss": 0.9202, + "step": 38500 + }, + { + "epoch": 197.43589743589743, + "eval_loss": 1.1464428901672363, + "eval_runtime": 36.4825, + "eval_samples_per_second": 10.772, + "eval_steps_per_second": 1.371, + "step": 38500 + }, + { + "epoch": 197.94871794871796, + "grad_norm": 37.69295120239258, + "learning_rate": 6.142e-06, + "loss": 0.9692, + "step": 38600 + }, + { + "epoch": 197.94871794871796, + "eval_loss": 1.1464636325836182, + "eval_runtime": 36.4627, + "eval_samples_per_second": 10.778, + "eval_steps_per_second": 1.371, + "step": 38600 + }, + { + "epoch": 198.46153846153845, + "grad_norm": 33.59278106689453, + "learning_rate": 6.132e-06, + "loss": 0.9222, + "step": 38700 + }, + { + "epoch": 198.46153846153845, + "eval_loss": 1.1497478485107422, + "eval_runtime": 36.4592, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 38700 + }, + { + "epoch": 198.97435897435898, + "grad_norm": 35.24226379394531, + "learning_rate": 6.122e-06, + "loss": 0.9512, + "step": 38800 + }, + { + "epoch": 198.97435897435898, + "eval_loss": 1.1409236192703247, + "eval_runtime": 36.3124, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 38800 + }, + { + "epoch": 199.48717948717947, + "grad_norm": 32.13850402832031, + "learning_rate": 6.112e-06, + "loss": 0.9694, + "step": 38900 + }, + { + "epoch": 199.48717948717947, + "eval_loss": 1.1598166227340698, + "eval_runtime": 36.3308, + "eval_samples_per_second": 10.817, + "eval_steps_per_second": 1.376, + "step": 38900 + }, + { + "epoch": 200.0, + "grad_norm": 32.88125228881836, + "learning_rate": 6.102e-06, + "loss": 0.94, + "step": 39000 + }, + { + "epoch": 200.0, + "eval_loss": 1.1506444215774536, + "eval_runtime": 36.5128, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 39000 + }, + { + "epoch": 200.51282051282053, + "grad_norm": 39.92708206176758, + "learning_rate": 6.0921000000000005e-06, + "loss": 0.9351, + "step": 39100 + }, + { + "epoch": 200.51282051282053, + "eval_loss": 1.1523412466049194, + "eval_runtime": 36.3692, + "eval_samples_per_second": 10.806, + "eval_steps_per_second": 1.375, + "step": 39100 + }, + { + "epoch": 201.02564102564102, + "grad_norm": 22.772733688354492, + "learning_rate": 6.082100000000001e-06, + "loss": 0.9189, + "step": 39200 + }, + { + "epoch": 201.02564102564102, + "eval_loss": 1.1522444486618042, + "eval_runtime": 36.2076, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 39200 + }, + { + "epoch": 201.53846153846155, + "grad_norm": 47.06113815307617, + "learning_rate": 6.072100000000001e-06, + "loss": 0.9172, + "step": 39300 + }, + { + "epoch": 201.53846153846155, + "eval_loss": 1.1399579048156738, + "eval_runtime": 36.6602, + "eval_samples_per_second": 10.72, + "eval_steps_per_second": 1.364, + "step": 39300 + }, + { + "epoch": 202.05128205128204, + "grad_norm": 34.18037414550781, + "learning_rate": 6.062100000000001e-06, + "loss": 0.9559, + "step": 39400 + }, + { + "epoch": 202.05128205128204, + "eval_loss": 1.1355735063552856, + "eval_runtime": 36.2157, + "eval_samples_per_second": 10.852, + "eval_steps_per_second": 1.381, + "step": 39400 + }, + { + "epoch": 202.56410256410257, + "grad_norm": 31.199600219726562, + "learning_rate": 6.052100000000001e-06, + "loss": 0.9353, + "step": 39500 + }, + { + "epoch": 202.56410256410257, + "eval_loss": 1.1327228546142578, + "eval_runtime": 36.2862, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 39500 + }, + { + "epoch": 203.07692307692307, + "grad_norm": 26.310894012451172, + "learning_rate": 6.042100000000001e-06, + "loss": 0.9386, + "step": 39600 + }, + { + "epoch": 203.07692307692307, + "eval_loss": 1.145666241645813, + "eval_runtime": 36.5137, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 39600 + }, + { + "epoch": 203.5897435897436, + "grad_norm": 16.017013549804688, + "learning_rate": 6.032100000000001e-06, + "loss": 0.9649, + "step": 39700 + }, + { + "epoch": 203.5897435897436, + "eval_loss": 1.1428974866867065, + "eval_runtime": 36.6366, + "eval_samples_per_second": 10.727, + "eval_steps_per_second": 1.365, + "step": 39700 + }, + { + "epoch": 204.10256410256412, + "grad_norm": 28.163827896118164, + "learning_rate": 6.0221e-06, + "loss": 0.8773, + "step": 39800 + }, + { + "epoch": 204.10256410256412, + "eval_loss": 1.1463987827301025, + "eval_runtime": 36.5278, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 39800 + }, + { + "epoch": 204.6153846153846, + "grad_norm": 32.74943161010742, + "learning_rate": 6.0121000000000004e-06, + "loss": 0.9237, + "step": 39900 + }, + { + "epoch": 204.6153846153846, + "eval_loss": 1.1489200592041016, + "eval_runtime": 36.2296, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 39900 + }, + { + "epoch": 205.12820512820514, + "grad_norm": 23.800464630126953, + "learning_rate": 6.0021000000000005e-06, + "loss": 0.952, + "step": 40000 + }, + { + "epoch": 205.12820512820514, + "eval_loss": 1.1407150030136108, + "eval_runtime": 36.622, + "eval_samples_per_second": 10.731, + "eval_steps_per_second": 1.365, + "step": 40000 + }, + { + "epoch": 205.64102564102564, + "grad_norm": 65.57909393310547, + "learning_rate": 5.992100000000001e-06, + "loss": 0.9231, + "step": 40100 + }, + { + "epoch": 205.64102564102564, + "eval_loss": 1.1432756185531616, + "eval_runtime": 36.3839, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 40100 + }, + { + "epoch": 206.15384615384616, + "grad_norm": 43.498634338378906, + "learning_rate": 5.982100000000001e-06, + "loss": 0.928, + "step": 40200 + }, + { + "epoch": 206.15384615384616, + "eval_loss": 1.1423068046569824, + "eval_runtime": 36.5746, + "eval_samples_per_second": 10.745, + "eval_steps_per_second": 1.367, + "step": 40200 + }, + { + "epoch": 206.66666666666666, + "grad_norm": 91.03629302978516, + "learning_rate": 5.972100000000001e-06, + "loss": 0.9418, + "step": 40300 + }, + { + "epoch": 206.66666666666666, + "eval_loss": 1.1347699165344238, + "eval_runtime": 36.5471, + "eval_samples_per_second": 10.753, + "eval_steps_per_second": 1.368, + "step": 40300 + }, + { + "epoch": 207.17948717948718, + "grad_norm": 35.1215934753418, + "learning_rate": 5.962100000000001e-06, + "loss": 0.8969, + "step": 40400 + }, + { + "epoch": 207.17948717948718, + "eval_loss": 1.1392910480499268, + "eval_runtime": 36.2656, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 40400 + }, + { + "epoch": 207.69230769230768, + "grad_norm": 38.79172897338867, + "learning_rate": 5.952100000000001e-06, + "loss": 0.949, + "step": 40500 + }, + { + "epoch": 207.69230769230768, + "eval_loss": 1.1519299745559692, + "eval_runtime": 36.454, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 40500 + }, + { + "epoch": 208.2051282051282, + "grad_norm": 34.377132415771484, + "learning_rate": 5.9421e-06, + "loss": 0.9209, + "step": 40600 + }, + { + "epoch": 208.2051282051282, + "eval_loss": 1.147558331489563, + "eval_runtime": 36.3742, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 40600 + }, + { + "epoch": 208.71794871794873, + "grad_norm": 58.57600402832031, + "learning_rate": 5.9321e-06, + "loss": 0.9094, + "step": 40700 + }, + { + "epoch": 208.71794871794873, + "eval_loss": 1.1457767486572266, + "eval_runtime": 36.4311, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 40700 + }, + { + "epoch": 209.23076923076923, + "grad_norm": 27.23153305053711, + "learning_rate": 5.9221000000000004e-06, + "loss": 0.9392, + "step": 40800 + }, + { + "epoch": 209.23076923076923, + "eval_loss": 1.139640212059021, + "eval_runtime": 36.4391, + "eval_samples_per_second": 10.785, + "eval_steps_per_second": 1.372, + "step": 40800 + }, + { + "epoch": 209.74358974358975, + "grad_norm": 51.57414627075195, + "learning_rate": 5.9121000000000005e-06, + "loss": 0.9168, + "step": 40900 + }, + { + "epoch": 209.74358974358975, + "eval_loss": 1.1544536352157593, + "eval_runtime": 36.4469, + "eval_samples_per_second": 10.783, + "eval_steps_per_second": 1.372, + "step": 40900 + }, + { + "epoch": 210.25641025641025, + "grad_norm": 38.73366165161133, + "learning_rate": 5.902100000000001e-06, + "loss": 0.9552, + "step": 41000 + }, + { + "epoch": 210.25641025641025, + "eval_loss": 1.1501624584197998, + "eval_runtime": 36.5568, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 41000 + }, + { + "epoch": 210.76923076923077, + "grad_norm": 26.72366714477539, + "learning_rate": 5.8922e-06, + "loss": 0.9036, + "step": 41100 + }, + { + "epoch": 210.76923076923077, + "eval_loss": 1.1485997438430786, + "eval_runtime": 36.5958, + "eval_samples_per_second": 10.739, + "eval_steps_per_second": 1.366, + "step": 41100 + }, + { + "epoch": 211.28205128205127, + "grad_norm": 26.981584548950195, + "learning_rate": 5.8822e-06, + "loss": 0.9206, + "step": 41200 + }, + { + "epoch": 211.28205128205127, + "eval_loss": 1.1338543891906738, + "eval_runtime": 36.4819, + "eval_samples_per_second": 10.772, + "eval_steps_per_second": 1.371, + "step": 41200 + }, + { + "epoch": 211.7948717948718, + "grad_norm": 49.47190475463867, + "learning_rate": 5.8722e-06, + "loss": 0.8931, + "step": 41300 + }, + { + "epoch": 211.7948717948718, + "eval_loss": 1.150622844696045, + "eval_runtime": 36.547, + "eval_samples_per_second": 10.753, + "eval_steps_per_second": 1.368, + "step": 41300 + }, + { + "epoch": 212.30769230769232, + "grad_norm": 37.64232635498047, + "learning_rate": 5.8622e-06, + "loss": 0.98, + "step": 41400 + }, + { + "epoch": 212.30769230769232, + "eval_loss": 1.152109980583191, + "eval_runtime": 36.3385, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 41400 + }, + { + "epoch": 212.82051282051282, + "grad_norm": 144.03382873535156, + "learning_rate": 5.8522000000000005e-06, + "loss": 0.9199, + "step": 41500 + }, + { + "epoch": 212.82051282051282, + "eval_loss": 1.1535152196884155, + "eval_runtime": 35.665, + "eval_samples_per_second": 11.019, + "eval_steps_per_second": 1.402, + "step": 41500 + }, + { + "epoch": 213.33333333333334, + "grad_norm": 18.488162994384766, + "learning_rate": 5.8422e-06, + "loss": 0.9188, + "step": 41600 + }, + { + "epoch": 213.33333333333334, + "eval_loss": 1.143506407737732, + "eval_runtime": 36.5314, + "eval_samples_per_second": 10.758, + "eval_steps_per_second": 1.369, + "step": 41600 + }, + { + "epoch": 213.84615384615384, + "grad_norm": 21.914854049682617, + "learning_rate": 5.8322e-06, + "loss": 0.9403, + "step": 41700 + }, + { + "epoch": 213.84615384615384, + "eval_loss": 1.1514562368392944, + "eval_runtime": 36.4983, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 41700 + }, + { + "epoch": 214.35897435897436, + "grad_norm": 34.050079345703125, + "learning_rate": 5.8222e-06, + "loss": 0.9165, + "step": 41800 + }, + { + "epoch": 214.35897435897436, + "eval_loss": 1.1493028402328491, + "eval_runtime": 36.436, + "eval_samples_per_second": 10.786, + "eval_steps_per_second": 1.372, + "step": 41800 + }, + { + "epoch": 214.87179487179486, + "grad_norm": 41.65437316894531, + "learning_rate": 5.8122e-06, + "loss": 0.9135, + "step": 41900 + }, + { + "epoch": 214.87179487179486, + "eval_loss": 1.1457487344741821, + "eval_runtime": 36.564, + "eval_samples_per_second": 10.748, + "eval_steps_per_second": 1.367, + "step": 41900 + }, + { + "epoch": 215.3846153846154, + "grad_norm": 31.771461486816406, + "learning_rate": 5.8022e-06, + "loss": 0.9484, + "step": 42000 + }, + { + "epoch": 215.3846153846154, + "eval_loss": 1.1541823148727417, + "eval_runtime": 36.2575, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 42000 + }, + { + "epoch": 215.89743589743588, + "grad_norm": 29.232608795166016, + "learning_rate": 5.7922e-06, + "loss": 0.9214, + "step": 42100 + }, + { + "epoch": 215.89743589743588, + "eval_loss": 1.1500135660171509, + "eval_runtime": 36.4682, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 42100 + }, + { + "epoch": 216.4102564102564, + "grad_norm": 24.89523696899414, + "learning_rate": 5.7822e-06, + "loss": 0.8954, + "step": 42200 + }, + { + "epoch": 216.4102564102564, + "eval_loss": 1.1545988321304321, + "eval_runtime": 36.2508, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 42200 + }, + { + "epoch": 216.92307692307693, + "grad_norm": 22.55742645263672, + "learning_rate": 5.7722e-06, + "loss": 0.9234, + "step": 42300 + }, + { + "epoch": 216.92307692307693, + "eval_loss": 1.1607162952423096, + "eval_runtime": 36.6996, + "eval_samples_per_second": 10.709, + "eval_steps_per_second": 1.362, + "step": 42300 + }, + { + "epoch": 217.43589743589743, + "grad_norm": 59.15635681152344, + "learning_rate": 5.7622e-06, + "loss": 0.9537, + "step": 42400 + }, + { + "epoch": 217.43589743589743, + "eval_loss": 1.1494605541229248, + "eval_runtime": 36.6637, + "eval_samples_per_second": 10.719, + "eval_steps_per_second": 1.364, + "step": 42400 + }, + { + "epoch": 217.94871794871796, + "grad_norm": 62.2183837890625, + "learning_rate": 5.7522e-06, + "loss": 0.8931, + "step": 42500 + }, + { + "epoch": 217.94871794871796, + "eval_loss": 1.1521434783935547, + "eval_runtime": 36.6099, + "eval_samples_per_second": 10.735, + "eval_steps_per_second": 1.366, + "step": 42500 + }, + { + "epoch": 218.46153846153845, + "grad_norm": 27.152162551879883, + "learning_rate": 5.7422e-06, + "loss": 0.9379, + "step": 42600 + }, + { + "epoch": 218.46153846153845, + "eval_loss": 1.143193244934082, + "eval_runtime": 36.7002, + "eval_samples_per_second": 10.708, + "eval_steps_per_second": 1.362, + "step": 42600 + }, + { + "epoch": 218.97435897435898, + "grad_norm": 70.6044692993164, + "learning_rate": 5.7322e-06, + "loss": 0.9201, + "step": 42700 + }, + { + "epoch": 218.97435897435898, + "eval_loss": 1.1616748571395874, + "eval_runtime": 36.5128, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 42700 + }, + { + "epoch": 219.48717948717947, + "grad_norm": 29.45513153076172, + "learning_rate": 5.7222e-06, + "loss": 0.9551, + "step": 42800 + }, + { + "epoch": 219.48717948717947, + "eval_loss": 1.1373143196105957, + "eval_runtime": 36.7105, + "eval_samples_per_second": 10.705, + "eval_steps_per_second": 1.362, + "step": 42800 + }, + { + "epoch": 220.0, + "grad_norm": 42.406620025634766, + "learning_rate": 5.7122e-06, + "loss": 0.883, + "step": 42900 + }, + { + "epoch": 220.0, + "eval_loss": 1.1591565608978271, + "eval_runtime": 36.6157, + "eval_samples_per_second": 10.733, + "eval_steps_per_second": 1.366, + "step": 42900 + }, + { + "epoch": 220.51282051282053, + "grad_norm": 44.156959533691406, + "learning_rate": 5.7023000000000004e-06, + "loss": 0.9512, + "step": 43000 + }, + { + "epoch": 220.51282051282053, + "eval_loss": 1.1618258953094482, + "eval_runtime": 36.4218, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 43000 + }, + { + "epoch": 221.02564102564102, + "grad_norm": 29.019187927246094, + "learning_rate": 5.6923000000000005e-06, + "loss": 0.8984, + "step": 43100 + }, + { + "epoch": 221.02564102564102, + "eval_loss": 1.1506043672561646, + "eval_runtime": 36.3417, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 43100 + }, + { + "epoch": 221.53846153846155, + "grad_norm": 37.82948303222656, + "learning_rate": 5.682300000000001e-06, + "loss": 0.9196, + "step": 43200 + }, + { + "epoch": 221.53846153846155, + "eval_loss": 1.1627445220947266, + "eval_runtime": 36.8605, + "eval_samples_per_second": 10.662, + "eval_steps_per_second": 1.356, + "step": 43200 + }, + { + "epoch": 222.05128205128204, + "grad_norm": 26.116413116455078, + "learning_rate": 5.672300000000001e-06, + "loss": 0.9087, + "step": 43300 + }, + { + "epoch": 222.05128205128204, + "eval_loss": 1.1551017761230469, + "eval_runtime": 36.3739, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 43300 + }, + { + "epoch": 222.56410256410257, + "grad_norm": 38.41297912597656, + "learning_rate": 5.662300000000001e-06, + "loss": 0.8872, + "step": 43400 + }, + { + "epoch": 222.56410256410257, + "eval_loss": 1.1744037866592407, + "eval_runtime": 36.6211, + "eval_samples_per_second": 10.732, + "eval_steps_per_second": 1.365, + "step": 43400 + }, + { + "epoch": 223.07692307692307, + "grad_norm": 62.05165481567383, + "learning_rate": 5.652300000000001e-06, + "loss": 0.9606, + "step": 43500 + }, + { + "epoch": 223.07692307692307, + "eval_loss": 1.1693332195281982, + "eval_runtime": 36.5668, + "eval_samples_per_second": 10.747, + "eval_steps_per_second": 1.367, + "step": 43500 + }, + { + "epoch": 223.5897435897436, + "grad_norm": 21.745861053466797, + "learning_rate": 5.6423e-06, + "loss": 0.9192, + "step": 43600 + }, + { + "epoch": 223.5897435897436, + "eval_loss": 1.1761411428451538, + "eval_runtime": 36.4307, + "eval_samples_per_second": 10.788, + "eval_steps_per_second": 1.372, + "step": 43600 + }, + { + "epoch": 224.10256410256412, + "grad_norm": 51.64875030517578, + "learning_rate": 5.6323e-06, + "loss": 0.9394, + "step": 43700 + }, + { + "epoch": 224.10256410256412, + "eval_loss": 1.1744928359985352, + "eval_runtime": 37.1908, + "eval_samples_per_second": 10.567, + "eval_steps_per_second": 1.344, + "step": 43700 + }, + { + "epoch": 224.6153846153846, + "grad_norm": 84.01808166503906, + "learning_rate": 5.6223e-06, + "loss": 0.9037, + "step": 43800 + }, + { + "epoch": 224.6153846153846, + "eval_loss": 1.1551580429077148, + "eval_runtime": 36.8417, + "eval_samples_per_second": 10.667, + "eval_steps_per_second": 1.357, + "step": 43800 + }, + { + "epoch": 225.12820512820514, + "grad_norm": 50.05611801147461, + "learning_rate": 5.6123000000000005e-06, + "loss": 0.9323, + "step": 43900 + }, + { + "epoch": 225.12820512820514, + "eval_loss": 1.153321385383606, + "eval_runtime": 35.9736, + "eval_samples_per_second": 10.925, + "eval_steps_per_second": 1.39, + "step": 43900 + }, + { + "epoch": 225.64102564102564, + "grad_norm": 41.83252716064453, + "learning_rate": 5.6023000000000006e-06, + "loss": 0.8957, + "step": 44000 + }, + { + "epoch": 225.64102564102564, + "eval_loss": 1.1556611061096191, + "eval_runtime": 36.4143, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 44000 + }, + { + "epoch": 226.15384615384616, + "grad_norm": 35.9349250793457, + "learning_rate": 5.592300000000001e-06, + "loss": 0.9552, + "step": 44100 + }, + { + "epoch": 226.15384615384616, + "eval_loss": 1.1485646963119507, + "eval_runtime": 36.5155, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 44100 + }, + { + "epoch": 226.66666666666666, + "grad_norm": 33.7932014465332, + "learning_rate": 5.582300000000001e-06, + "loss": 0.896, + "step": 44200 + }, + { + "epoch": 226.66666666666666, + "eval_loss": 1.137686014175415, + "eval_runtime": 36.6143, + "eval_samples_per_second": 10.734, + "eval_steps_per_second": 1.366, + "step": 44200 + }, + { + "epoch": 227.17948717948718, + "grad_norm": 29.460514068603516, + "learning_rate": 5.572300000000001e-06, + "loss": 0.9006, + "step": 44300 + }, + { + "epoch": 227.17948717948718, + "eval_loss": 1.142223596572876, + "eval_runtime": 36.7414, + "eval_samples_per_second": 10.696, + "eval_steps_per_second": 1.361, + "step": 44300 + }, + { + "epoch": 227.69230769230768, + "grad_norm": 25.321260452270508, + "learning_rate": 5.5623e-06, + "loss": 0.9338, + "step": 44400 + }, + { + "epoch": 227.69230769230768, + "eval_loss": 1.1675009727478027, + "eval_runtime": 36.4865, + "eval_samples_per_second": 10.771, + "eval_steps_per_second": 1.37, + "step": 44400 + }, + { + "epoch": 228.2051282051282, + "grad_norm": 84.12693786621094, + "learning_rate": 5.5523e-06, + "loss": 0.9006, + "step": 44500 + }, + { + "epoch": 228.2051282051282, + "eval_loss": 1.147350788116455, + "eval_runtime": 36.7099, + "eval_samples_per_second": 10.706, + "eval_steps_per_second": 1.362, + "step": 44500 + }, + { + "epoch": 228.71794871794873, + "grad_norm": 61.523162841796875, + "learning_rate": 5.5423e-06, + "loss": 0.9248, + "step": 44600 + }, + { + "epoch": 228.71794871794873, + "eval_loss": 1.141809344291687, + "eval_runtime": 36.528, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 44600 + }, + { + "epoch": 229.23076923076923, + "grad_norm": 24.00448989868164, + "learning_rate": 5.5323e-06, + "loss": 0.8957, + "step": 44700 + }, + { + "epoch": 229.23076923076923, + "eval_loss": 1.1519194841384888, + "eval_runtime": 36.3046, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 44700 + }, + { + "epoch": 229.74358974358975, + "grad_norm": 20.595897674560547, + "learning_rate": 5.5223000000000005e-06, + "loss": 0.9049, + "step": 44800 + }, + { + "epoch": 229.74358974358975, + "eval_loss": 1.129428505897522, + "eval_runtime": 36.8306, + "eval_samples_per_second": 10.67, + "eval_steps_per_second": 1.358, + "step": 44800 + }, + { + "epoch": 230.25641025641025, + "grad_norm": 53.32012939453125, + "learning_rate": 5.5123000000000006e-06, + "loss": 0.9057, + "step": 44900 + }, + { + "epoch": 230.25641025641025, + "eval_loss": 1.1530910730361938, + "eval_runtime": 36.4156, + "eval_samples_per_second": 10.792, + "eval_steps_per_second": 1.373, + "step": 44900 + }, + { + "epoch": 230.76923076923077, + "grad_norm": 26.143396377563477, + "learning_rate": 5.502300000000001e-06, + "loss": 0.918, + "step": 45000 + }, + { + "epoch": 230.76923076923077, + "eval_loss": 1.1326847076416016, + "eval_runtime": 36.9328, + "eval_samples_per_second": 10.641, + "eval_steps_per_second": 1.354, + "step": 45000 + }, + { + "epoch": 231.28205128205127, + "grad_norm": 41.85169982910156, + "learning_rate": 5.492300000000001e-06, + "loss": 0.8931, + "step": 45100 + }, + { + "epoch": 231.28205128205127, + "eval_loss": 1.1300909519195557, + "eval_runtime": 36.9354, + "eval_samples_per_second": 10.64, + "eval_steps_per_second": 1.354, + "step": 45100 + }, + { + "epoch": 231.7948717948718, + "grad_norm": 62.00379180908203, + "learning_rate": 5.4823e-06, + "loss": 0.918, + "step": 45200 + }, + { + "epoch": 231.7948717948718, + "eval_loss": 1.1503008604049683, + "eval_runtime": 36.6285, + "eval_samples_per_second": 10.729, + "eval_steps_per_second": 1.365, + "step": 45200 + }, + { + "epoch": 232.30769230769232, + "grad_norm": 45.8463134765625, + "learning_rate": 5.4723e-06, + "loss": 0.8912, + "step": 45300 + }, + { + "epoch": 232.30769230769232, + "eval_loss": 1.1568548679351807, + "eval_runtime": 36.4888, + "eval_samples_per_second": 10.77, + "eval_steps_per_second": 1.37, + "step": 45300 + }, + { + "epoch": 232.82051282051282, + "grad_norm": 20.30961036682129, + "learning_rate": 5.4623e-06, + "loss": 0.9026, + "step": 45400 + }, + { + "epoch": 232.82051282051282, + "eval_loss": 1.1477478742599487, + "eval_runtime": 36.4826, + "eval_samples_per_second": 10.772, + "eval_steps_per_second": 1.371, + "step": 45400 + }, + { + "epoch": 233.33333333333334, + "grad_norm": 27.44028663635254, + "learning_rate": 5.4523e-06, + "loss": 0.9315, + "step": 45500 + }, + { + "epoch": 233.33333333333334, + "eval_loss": 1.1411335468292236, + "eval_runtime": 36.5154, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 45500 + }, + { + "epoch": 233.84615384615384, + "grad_norm": 40.86202621459961, + "learning_rate": 5.4423e-06, + "loss": 0.9053, + "step": 45600 + }, + { + "epoch": 233.84615384615384, + "eval_loss": 1.1318960189819336, + "eval_runtime": 36.7403, + "eval_samples_per_second": 10.697, + "eval_steps_per_second": 1.361, + "step": 45600 + }, + { + "epoch": 234.35897435897436, + "grad_norm": 95.98432922363281, + "learning_rate": 5.4323000000000005e-06, + "loss": 0.8835, + "step": 45700 + }, + { + "epoch": 234.35897435897436, + "eval_loss": 1.1261402368545532, + "eval_runtime": 36.558, + "eval_samples_per_second": 10.75, + "eval_steps_per_second": 1.368, + "step": 45700 + }, + { + "epoch": 234.87179487179486, + "grad_norm": 51.0683479309082, + "learning_rate": 5.422300000000001e-06, + "loss": 0.9177, + "step": 45800 + }, + { + "epoch": 234.87179487179486, + "eval_loss": 1.1533235311508179, + "eval_runtime": 37.4128, + "eval_samples_per_second": 10.504, + "eval_steps_per_second": 1.336, + "step": 45800 + }, + { + "epoch": 235.3846153846154, + "grad_norm": 38.637596130371094, + "learning_rate": 5.412300000000001e-06, + "loss": 0.9359, + "step": 45900 + }, + { + "epoch": 235.3846153846154, + "eval_loss": 1.1480993032455444, + "eval_runtime": 37.3495, + "eval_samples_per_second": 10.522, + "eval_steps_per_second": 1.339, + "step": 45900 + }, + { + "epoch": 235.89743589743588, + "grad_norm": 29.30744171142578, + "learning_rate": 5.4023e-06, + "loss": 0.909, + "step": 46000 + }, + { + "epoch": 235.89743589743588, + "eval_loss": 1.1343969106674194, + "eval_runtime": 37.675, + "eval_samples_per_second": 10.431, + "eval_steps_per_second": 1.327, + "step": 46000 + }, + { + "epoch": 236.4102564102564, + "grad_norm": 28.268306732177734, + "learning_rate": 5.3923e-06, + "loss": 0.902, + "step": 46100 + }, + { + "epoch": 236.4102564102564, + "eval_loss": 1.138815999031067, + "eval_runtime": 36.5934, + "eval_samples_per_second": 10.74, + "eval_steps_per_second": 1.366, + "step": 46100 + }, + { + "epoch": 236.92307692307693, + "grad_norm": 26.0909366607666, + "learning_rate": 5.3823e-06, + "loss": 0.9115, + "step": 46200 + }, + { + "epoch": 236.92307692307693, + "eval_loss": 1.1440030336380005, + "eval_runtime": 37.0639, + "eval_samples_per_second": 10.603, + "eval_steps_per_second": 1.349, + "step": 46200 + }, + { + "epoch": 237.43589743589743, + "grad_norm": 31.87432289123535, + "learning_rate": 5.3723e-06, + "loss": 0.9055, + "step": 46300 + }, + { + "epoch": 237.43589743589743, + "eval_loss": 1.1343128681182861, + "eval_runtime": 36.5526, + "eval_samples_per_second": 10.752, + "eval_steps_per_second": 1.368, + "step": 46300 + }, + { + "epoch": 237.94871794871796, + "grad_norm": 19.258453369140625, + "learning_rate": 5.3623e-06, + "loss": 0.8851, + "step": 46400 + }, + { + "epoch": 237.94871794871796, + "eval_loss": 1.1451404094696045, + "eval_runtime": 36.9483, + "eval_samples_per_second": 10.636, + "eval_steps_per_second": 1.353, + "step": 46400 + }, + { + "epoch": 238.46153846153845, + "grad_norm": 38.91690444946289, + "learning_rate": 5.3523e-06, + "loss": 0.8827, + "step": 46500 + }, + { + "epoch": 238.46153846153845, + "eval_loss": 1.164791226387024, + "eval_runtime": 37.4914, + "eval_samples_per_second": 10.482, + "eval_steps_per_second": 1.334, + "step": 46500 + }, + { + "epoch": 238.97435897435898, + "grad_norm": 58.735591888427734, + "learning_rate": 5.3423000000000005e-06, + "loss": 0.9089, + "step": 46600 + }, + { + "epoch": 238.97435897435898, + "eval_loss": 1.1569896936416626, + "eval_runtime": 36.9641, + "eval_samples_per_second": 10.632, + "eval_steps_per_second": 1.353, + "step": 46600 + }, + { + "epoch": 239.48717948717947, + "grad_norm": 37.801090240478516, + "learning_rate": 5.332300000000001e-06, + "loss": 0.8978, + "step": 46700 + }, + { + "epoch": 239.48717948717947, + "eval_loss": 1.140226125717163, + "eval_runtime": 37.1838, + "eval_samples_per_second": 10.569, + "eval_steps_per_second": 1.345, + "step": 46700 + }, + { + "epoch": 240.0, + "grad_norm": 33.55605697631836, + "learning_rate": 5.3223e-06, + "loss": 0.9203, + "step": 46800 + }, + { + "epoch": 240.0, + "eval_loss": 1.1476918458938599, + "eval_runtime": 36.9833, + "eval_samples_per_second": 10.626, + "eval_steps_per_second": 1.352, + "step": 46800 + }, + { + "epoch": 240.51282051282053, + "grad_norm": 32.43328094482422, + "learning_rate": 5.3123e-06, + "loss": 0.8925, + "step": 46900 + }, + { + "epoch": 240.51282051282053, + "eval_loss": 1.1360310316085815, + "eval_runtime": 37.2737, + "eval_samples_per_second": 10.544, + "eval_steps_per_second": 1.341, + "step": 46900 + }, + { + "epoch": 241.02564102564102, + "grad_norm": 38.85364532470703, + "learning_rate": 5.3023e-06, + "loss": 0.912, + "step": 47000 + }, + { + "epoch": 241.02564102564102, + "eval_loss": 1.1512075662612915, + "eval_runtime": 37.1426, + "eval_samples_per_second": 10.581, + "eval_steps_per_second": 1.346, + "step": 47000 + }, + { + "epoch": 241.53846153846155, + "grad_norm": 31.19080924987793, + "learning_rate": 5.292400000000001e-06, + "loss": 0.8962, + "step": 47100 + }, + { + "epoch": 241.53846153846155, + "eval_loss": 1.138826608657837, + "eval_runtime": 37.1566, + "eval_samples_per_second": 10.577, + "eval_steps_per_second": 1.346, + "step": 47100 + }, + { + "epoch": 242.05128205128204, + "grad_norm": 49.27690124511719, + "learning_rate": 5.2824000000000004e-06, + "loss": 0.9029, + "step": 47200 + }, + { + "epoch": 242.05128205128204, + "eval_loss": 1.1300182342529297, + "eval_runtime": 36.9043, + "eval_samples_per_second": 10.649, + "eval_steps_per_second": 1.355, + "step": 47200 + }, + { + "epoch": 242.56410256410257, + "grad_norm": 17.137653350830078, + "learning_rate": 5.2724000000000005e-06, + "loss": 0.8813, + "step": 47300 + }, + { + "epoch": 242.56410256410257, + "eval_loss": 1.1398885250091553, + "eval_runtime": 36.8835, + "eval_samples_per_second": 10.655, + "eval_steps_per_second": 1.356, + "step": 47300 + }, + { + "epoch": 243.07692307692307, + "grad_norm": 90.67513275146484, + "learning_rate": 5.262400000000001e-06, + "loss": 0.8976, + "step": 47400 + }, + { + "epoch": 243.07692307692307, + "eval_loss": 1.141875147819519, + "eval_runtime": 36.6734, + "eval_samples_per_second": 10.716, + "eval_steps_per_second": 1.363, + "step": 47400 + }, + { + "epoch": 243.5897435897436, + "grad_norm": 62.24956512451172, + "learning_rate": 5.252400000000001e-06, + "loss": 0.8952, + "step": 47500 + }, + { + "epoch": 243.5897435897436, + "eval_loss": 1.1468392610549927, + "eval_runtime": 36.9371, + "eval_samples_per_second": 10.64, + "eval_steps_per_second": 1.354, + "step": 47500 + }, + { + "epoch": 244.10256410256412, + "grad_norm": 33.8979377746582, + "learning_rate": 5.242400000000001e-06, + "loss": 0.897, + "step": 47600 + }, + { + "epoch": 244.10256410256412, + "eval_loss": 1.1225271224975586, + "eval_runtime": 37.0422, + "eval_samples_per_second": 10.61, + "eval_steps_per_second": 1.35, + "step": 47600 + }, + { + "epoch": 244.6153846153846, + "grad_norm": 33.14387130737305, + "learning_rate": 5.232400000000001e-06, + "loss": 0.8782, + "step": 47700 + }, + { + "epoch": 244.6153846153846, + "eval_loss": 1.14120352268219, + "eval_runtime": 36.9739, + "eval_samples_per_second": 10.629, + "eval_steps_per_second": 1.352, + "step": 47700 + }, + { + "epoch": 245.12820512820514, + "grad_norm": 18.191484451293945, + "learning_rate": 5.222400000000001e-06, + "loss": 0.9106, + "step": 47800 + }, + { + "epoch": 245.12820512820514, + "eval_loss": 1.1563894748687744, + "eval_runtime": 36.7815, + "eval_samples_per_second": 10.685, + "eval_steps_per_second": 1.359, + "step": 47800 + }, + { + "epoch": 245.64102564102564, + "grad_norm": 33.353023529052734, + "learning_rate": 5.212400000000001e-06, + "loss": 0.8699, + "step": 47900 + }, + { + "epoch": 245.64102564102564, + "eval_loss": 1.1522945165634155, + "eval_runtime": 36.8975, + "eval_samples_per_second": 10.651, + "eval_steps_per_second": 1.355, + "step": 47900 + }, + { + "epoch": 246.15384615384616, + "grad_norm": 48.964202880859375, + "learning_rate": 5.2024e-06, + "loss": 0.9211, + "step": 48000 + }, + { + "epoch": 246.15384615384616, + "eval_loss": 1.140363335609436, + "eval_runtime": 36.9006, + "eval_samples_per_second": 10.65, + "eval_steps_per_second": 1.355, + "step": 48000 + }, + { + "epoch": 246.66666666666666, + "grad_norm": 39.334320068359375, + "learning_rate": 5.1924000000000005e-06, + "loss": 0.9005, + "step": 48100 + }, + { + "epoch": 246.66666666666666, + "eval_loss": 1.146032452583313, + "eval_runtime": 36.6159, + "eval_samples_per_second": 10.733, + "eval_steps_per_second": 1.366, + "step": 48100 + }, + { + "epoch": 247.17948717948718, + "grad_norm": 37.416343688964844, + "learning_rate": 5.1824000000000006e-06, + "loss": 0.8876, + "step": 48200 + }, + { + "epoch": 247.17948717948718, + "eval_loss": 1.1454395055770874, + "eval_runtime": 36.8641, + "eval_samples_per_second": 10.661, + "eval_steps_per_second": 1.356, + "step": 48200 + }, + { + "epoch": 247.69230769230768, + "grad_norm": 45.49391555786133, + "learning_rate": 5.172400000000001e-06, + "loss": 0.9004, + "step": 48300 + }, + { + "epoch": 247.69230769230768, + "eval_loss": 1.1497219800949097, + "eval_runtime": 36.839, + "eval_samples_per_second": 10.668, + "eval_steps_per_second": 1.357, + "step": 48300 + }, + { + "epoch": 248.2051282051282, + "grad_norm": 28.48965835571289, + "learning_rate": 5.162400000000001e-06, + "loss": 0.9081, + "step": 48400 + }, + { + "epoch": 248.2051282051282, + "eval_loss": 1.139384388923645, + "eval_runtime": 36.6619, + "eval_samples_per_second": 10.72, + "eval_steps_per_second": 1.364, + "step": 48400 + }, + { + "epoch": 248.71794871794873, + "grad_norm": 66.45464324951172, + "learning_rate": 5.152400000000001e-06, + "loss": 0.9006, + "step": 48500 + }, + { + "epoch": 248.71794871794873, + "eval_loss": 1.1441779136657715, + "eval_runtime": 36.6227, + "eval_samples_per_second": 10.731, + "eval_steps_per_second": 1.365, + "step": 48500 + }, + { + "epoch": 249.23076923076923, + "grad_norm": 37.261810302734375, + "learning_rate": 5.142400000000001e-06, + "loss": 0.8986, + "step": 48600 + }, + { + "epoch": 249.23076923076923, + "eval_loss": 1.1535238027572632, + "eval_runtime": 36.5662, + "eval_samples_per_second": 10.748, + "eval_steps_per_second": 1.367, + "step": 48600 + }, + { + "epoch": 249.74358974358975, + "grad_norm": 47.28608322143555, + "learning_rate": 5.132400000000001e-06, + "loss": 0.8932, + "step": 48700 + }, + { + "epoch": 249.74358974358975, + "eval_loss": 1.1534796953201294, + "eval_runtime": 36.8759, + "eval_samples_per_second": 10.657, + "eval_steps_per_second": 1.356, + "step": 48700 + }, + { + "epoch": 250.25641025641025, + "grad_norm": 79.1075439453125, + "learning_rate": 5.1224e-06, + "loss": 0.898, + "step": 48800 + }, + { + "epoch": 250.25641025641025, + "eval_loss": 1.1351540088653564, + "eval_runtime": 38.4993, + "eval_samples_per_second": 10.208, + "eval_steps_per_second": 1.299, + "step": 48800 + }, + { + "epoch": 250.76923076923077, + "grad_norm": 33.8562126159668, + "learning_rate": 5.1124e-06, + "loss": 0.9157, + "step": 48900 + }, + { + "epoch": 250.76923076923077, + "eval_loss": 1.14054274559021, + "eval_runtime": 38.2458, + "eval_samples_per_second": 10.276, + "eval_steps_per_second": 1.307, + "step": 48900 + }, + { + "epoch": 251.28205128205127, + "grad_norm": 47.562599182128906, + "learning_rate": 5.1024000000000005e-06, + "loss": 0.8721, + "step": 49000 + }, + { + "epoch": 251.28205128205127, + "eval_loss": 1.145168662071228, + "eval_runtime": 38.1055, + "eval_samples_per_second": 10.313, + "eval_steps_per_second": 1.312, + "step": 49000 + }, + { + "epoch": 251.7948717948718, + "grad_norm": 14.716377258300781, + "learning_rate": 5.092600000000001e-06, + "loss": 0.9125, + "step": 49100 + }, + { + "epoch": 251.7948717948718, + "eval_loss": 1.150905966758728, + "eval_runtime": 38.263, + "eval_samples_per_second": 10.271, + "eval_steps_per_second": 1.307, + "step": 49100 + }, + { + "epoch": 252.30769230769232, + "grad_norm": 43.26768493652344, + "learning_rate": 5.082600000000001e-06, + "loss": 0.8869, + "step": 49200 + }, + { + "epoch": 252.30769230769232, + "eval_loss": 1.1392353773117065, + "eval_runtime": 38.3086, + "eval_samples_per_second": 10.259, + "eval_steps_per_second": 1.305, + "step": 49200 + }, + { + "epoch": 252.82051282051282, + "grad_norm": 48.02055358886719, + "learning_rate": 5.072600000000001e-06, + "loss": 0.8663, + "step": 49300 + }, + { + "epoch": 252.82051282051282, + "eval_loss": 1.128859043121338, + "eval_runtime": 38.4908, + "eval_samples_per_second": 10.21, + "eval_steps_per_second": 1.299, + "step": 49300 + }, + { + "epoch": 253.33333333333334, + "grad_norm": 54.31551742553711, + "learning_rate": 5.0626000000000005e-06, + "loss": 0.8789, + "step": 49400 + }, + { + "epoch": 253.33333333333334, + "eval_loss": 1.1284832954406738, + "eval_runtime": 38.2031, + "eval_samples_per_second": 10.287, + "eval_steps_per_second": 1.309, + "step": 49400 + }, + { + "epoch": 253.84615384615384, + "grad_norm": 29.443904876708984, + "learning_rate": 5.0526000000000005e-06, + "loss": 0.9105, + "step": 49500 + }, + { + "epoch": 253.84615384615384, + "eval_loss": 1.1321011781692505, + "eval_runtime": 38.3842, + "eval_samples_per_second": 10.239, + "eval_steps_per_second": 1.303, + "step": 49500 + }, + { + "epoch": 254.35897435897436, + "grad_norm": 15.285839080810547, + "learning_rate": 5.043100000000001e-06, + "loss": 0.8987, + "step": 49600 + }, + { + "epoch": 254.35897435897436, + "eval_loss": 1.147458553314209, + "eval_runtime": 36.1009, + "eval_samples_per_second": 10.886, + "eval_steps_per_second": 1.385, + "step": 49600 + }, + { + "epoch": 254.87179487179486, + "grad_norm": 40.69172668457031, + "learning_rate": 5.0332e-06, + "loss": 0.8706, + "step": 49700 + }, + { + "epoch": 254.87179487179486, + "eval_loss": 1.124189019203186, + "eval_runtime": 35.7208, + "eval_samples_per_second": 11.002, + "eval_steps_per_second": 1.4, + "step": 49700 + }, + { + "epoch": 255.3846153846154, + "grad_norm": 37.004058837890625, + "learning_rate": 5.0232000000000005e-06, + "loss": 0.8796, + "step": 49800 + }, + { + "epoch": 255.3846153846154, + "eval_loss": 1.1304963827133179, + "eval_runtime": 35.799, + "eval_samples_per_second": 10.978, + "eval_steps_per_second": 1.397, + "step": 49800 + }, + { + "epoch": 255.89743589743588, + "grad_norm": 22.531829833984375, + "learning_rate": 5.0132000000000006e-06, + "loss": 0.8779, + "step": 49900 + }, + { + "epoch": 255.89743589743588, + "eval_loss": 1.1285786628723145, + "eval_runtime": 35.7929, + "eval_samples_per_second": 10.98, + "eval_steps_per_second": 1.397, + "step": 49900 + }, + { + "epoch": 256.4102564102564, + "grad_norm": 35.553382873535156, + "learning_rate": 5.003200000000001e-06, + "loss": 0.8916, + "step": 50000 + }, + { + "epoch": 256.4102564102564, + "eval_loss": 1.1436452865600586, + "eval_runtime": 35.7123, + "eval_samples_per_second": 11.005, + "eval_steps_per_second": 1.4, + "step": 50000 + }, + { + "epoch": 256.9230769230769, + "grad_norm": 59.00286102294922, + "learning_rate": 4.9932e-06, + "loss": 0.8951, + "step": 50100 + }, + { + "epoch": 256.9230769230769, + "eval_loss": 1.13074791431427, + "eval_runtime": 36.4482, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 50100 + }, + { + "epoch": 257.43589743589746, + "grad_norm": 17.311477661132812, + "learning_rate": 4.9832e-06, + "loss": 0.9174, + "step": 50200 + }, + { + "epoch": 257.43589743589746, + "eval_loss": 1.1362484693527222, + "eval_runtime": 36.3271, + "eval_samples_per_second": 10.818, + "eval_steps_per_second": 1.376, + "step": 50200 + }, + { + "epoch": 257.94871794871796, + "grad_norm": 63.69939422607422, + "learning_rate": 4.9733e-06, + "loss": 0.851, + "step": 50300 + }, + { + "epoch": 257.94871794871796, + "eval_loss": 1.1349260807037354, + "eval_runtime": 36.2883, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 50300 + }, + { + "epoch": 258.46153846153845, + "grad_norm": 22.72454071044922, + "learning_rate": 4.9633e-06, + "loss": 0.9153, + "step": 50400 + }, + { + "epoch": 258.46153846153845, + "eval_loss": 1.1272227764129639, + "eval_runtime": 36.1818, + "eval_samples_per_second": 10.862, + "eval_steps_per_second": 1.382, + "step": 50400 + }, + { + "epoch": 258.97435897435895, + "grad_norm": 76.05549621582031, + "learning_rate": 4.9533000000000005e-06, + "loss": 0.8684, + "step": 50500 + }, + { + "epoch": 258.97435897435895, + "eval_loss": 1.1530264616012573, + "eval_runtime": 36.2941, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 50500 + }, + { + "epoch": 259.4871794871795, + "grad_norm": 28.523717880249023, + "learning_rate": 4.943300000000001e-06, + "loss": 0.8615, + "step": 50600 + }, + { + "epoch": 259.4871794871795, + "eval_loss": 1.147382140159607, + "eval_runtime": 36.5458, + "eval_samples_per_second": 10.754, + "eval_steps_per_second": 1.368, + "step": 50600 + }, + { + "epoch": 260.0, + "grad_norm": 46.74496078491211, + "learning_rate": 4.933300000000001e-06, + "loss": 0.9206, + "step": 50700 + }, + { + "epoch": 260.0, + "eval_loss": 1.1495282649993896, + "eval_runtime": 35.9668, + "eval_samples_per_second": 10.927, + "eval_steps_per_second": 1.39, + "step": 50700 + }, + { + "epoch": 260.5128205128205, + "grad_norm": 48.6566047668457, + "learning_rate": 4.9233e-06, + "loss": 0.8654, + "step": 50800 + }, + { + "epoch": 260.5128205128205, + "eval_loss": 1.1383757591247559, + "eval_runtime": 36.4034, + "eval_samples_per_second": 10.796, + "eval_steps_per_second": 1.373, + "step": 50800 + }, + { + "epoch": 261.02564102564105, + "grad_norm": 32.860172271728516, + "learning_rate": 4.9133e-06, + "loss": 0.9161, + "step": 50900 + }, + { + "epoch": 261.02564102564105, + "eval_loss": 1.1320137977600098, + "eval_runtime": 36.5697, + "eval_samples_per_second": 10.747, + "eval_steps_per_second": 1.367, + "step": 50900 + }, + { + "epoch": 261.53846153846155, + "grad_norm": 17.922121047973633, + "learning_rate": 4.9033e-06, + "loss": 0.8807, + "step": 51000 + }, + { + "epoch": 261.53846153846155, + "eval_loss": 1.1474146842956543, + "eval_runtime": 36.4706, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 51000 + }, + { + "epoch": 262.05128205128204, + "grad_norm": 97.9308090209961, + "learning_rate": 4.8933e-06, + "loss": 0.9001, + "step": 51100 + }, + { + "epoch": 262.05128205128204, + "eval_loss": 1.1349908113479614, + "eval_runtime": 36.4107, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 51100 + }, + { + "epoch": 262.56410256410254, + "grad_norm": 45.31386184692383, + "learning_rate": 4.8833e-06, + "loss": 0.9144, + "step": 51200 + }, + { + "epoch": 262.56410256410254, + "eval_loss": 1.1408582925796509, + "eval_runtime": 36.4323, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 51200 + }, + { + "epoch": 263.0769230769231, + "grad_norm": 44.512184143066406, + "learning_rate": 4.8733e-06, + "loss": 0.8513, + "step": 51300 + }, + { + "epoch": 263.0769230769231, + "eval_loss": 1.1530704498291016, + "eval_runtime": 37.0089, + "eval_samples_per_second": 10.619, + "eval_steps_per_second": 1.351, + "step": 51300 + }, + { + "epoch": 263.5897435897436, + "grad_norm": 51.15203094482422, + "learning_rate": 4.8633000000000005e-06, + "loss": 0.8842, + "step": 51400 + }, + { + "epoch": 263.5897435897436, + "eval_loss": 1.1536325216293335, + "eval_runtime": 36.0911, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.385, + "step": 51400 + }, + { + "epoch": 264.1025641025641, + "grad_norm": 45.99526596069336, + "learning_rate": 4.853300000000001e-06, + "loss": 0.9085, + "step": 51500 + }, + { + "epoch": 264.1025641025641, + "eval_loss": 1.1356266736984253, + "eval_runtime": 36.683, + "eval_samples_per_second": 10.713, + "eval_steps_per_second": 1.363, + "step": 51500 + }, + { + "epoch": 264.61538461538464, + "grad_norm": 46.314334869384766, + "learning_rate": 4.8433e-06, + "loss": 0.8542, + "step": 51600 + }, + { + "epoch": 264.61538461538464, + "eval_loss": 1.1203433275222778, + "eval_runtime": 36.4813, + "eval_samples_per_second": 10.773, + "eval_steps_per_second": 1.371, + "step": 51600 + }, + { + "epoch": 265.12820512820514, + "grad_norm": 24.765853881835938, + "learning_rate": 4.8333e-06, + "loss": 0.8937, + "step": 51700 + }, + { + "epoch": 265.12820512820514, + "eval_loss": 1.1401312351226807, + "eval_runtime": 36.3129, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 51700 + }, + { + "epoch": 265.64102564102564, + "grad_norm": 38.81180191040039, + "learning_rate": 4.8233e-06, + "loss": 0.8871, + "step": 51800 + }, + { + "epoch": 265.64102564102564, + "eval_loss": 1.1477760076522827, + "eval_runtime": 36.5281, + "eval_samples_per_second": 10.759, + "eval_steps_per_second": 1.369, + "step": 51800 + }, + { + "epoch": 266.15384615384613, + "grad_norm": 30.124187469482422, + "learning_rate": 4.8133e-06, + "loss": 0.8654, + "step": 51900 + }, + { + "epoch": 266.15384615384613, + "eval_loss": 1.1316016912460327, + "eval_runtime": 36.4179, + "eval_samples_per_second": 10.791, + "eval_steps_per_second": 1.373, + "step": 51900 + }, + { + "epoch": 266.6666666666667, + "grad_norm": 44.90863037109375, + "learning_rate": 4.8033e-06, + "loss": 0.8635, + "step": 52000 + }, + { + "epoch": 266.6666666666667, + "eval_loss": 1.1263864040374756, + "eval_runtime": 36.2895, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 52000 + }, + { + "epoch": 267.1794871794872, + "grad_norm": 34.481868743896484, + "learning_rate": 4.7933e-06, + "loss": 0.9176, + "step": 52100 + }, + { + "epoch": 267.1794871794872, + "eval_loss": 1.130428433418274, + "eval_runtime": 36.13, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 52100 + }, + { + "epoch": 267.6923076923077, + "grad_norm": 34.233551025390625, + "learning_rate": 4.7833000000000004e-06, + "loss": 0.8856, + "step": 52200 + }, + { + "epoch": 267.6923076923077, + "eval_loss": 1.1517621278762817, + "eval_runtime": 36.3257, + "eval_samples_per_second": 10.819, + "eval_steps_per_second": 1.376, + "step": 52200 + }, + { + "epoch": 268.20512820512823, + "grad_norm": 60.05598831176758, + "learning_rate": 4.7733000000000005e-06, + "loss": 0.8862, + "step": 52300 + }, + { + "epoch": 268.20512820512823, + "eval_loss": 1.1423187255859375, + "eval_runtime": 36.2202, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 52300 + }, + { + "epoch": 268.71794871794873, + "grad_norm": 23.428115844726562, + "learning_rate": 4.7633e-06, + "loss": 0.8737, + "step": 52400 + }, + { + "epoch": 268.71794871794873, + "eval_loss": 1.140769124031067, + "eval_runtime": 36.2936, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 52400 + }, + { + "epoch": 269.2307692307692, + "grad_norm": 47.76045608520508, + "learning_rate": 4.7533e-06, + "loss": 0.8973, + "step": 52500 + }, + { + "epoch": 269.2307692307692, + "eval_loss": 1.1517248153686523, + "eval_runtime": 36.3193, + "eval_samples_per_second": 10.821, + "eval_steps_per_second": 1.377, + "step": 52500 + }, + { + "epoch": 269.7435897435897, + "grad_norm": 22.47121810913086, + "learning_rate": 4.7433e-06, + "loss": 0.8472, + "step": 52600 + }, + { + "epoch": 269.7435897435897, + "eval_loss": 1.1354777812957764, + "eval_runtime": 36.2675, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 52600 + }, + { + "epoch": 270.2564102564103, + "grad_norm": 43.68316650390625, + "learning_rate": 4.7333e-06, + "loss": 0.8872, + "step": 52700 + }, + { + "epoch": 270.2564102564103, + "eval_loss": 1.1220067739486694, + "eval_runtime": 36.2778, + "eval_samples_per_second": 10.833, + "eval_steps_per_second": 1.378, + "step": 52700 + }, + { + "epoch": 270.7692307692308, + "grad_norm": 37.78057861328125, + "learning_rate": 4.723300000000001e-06, + "loss": 0.8783, + "step": 52800 + }, + { + "epoch": 270.7692307692308, + "eval_loss": 1.1229535341262817, + "eval_runtime": 36.2405, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 52800 + }, + { + "epoch": 271.28205128205127, + "grad_norm": 35.45234298706055, + "learning_rate": 4.7133e-06, + "loss": 0.8582, + "step": 52900 + }, + { + "epoch": 271.28205128205127, + "eval_loss": 1.1541651487350464, + "eval_runtime": 36.4608, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 52900 + }, + { + "epoch": 271.79487179487177, + "grad_norm": 48.900718688964844, + "learning_rate": 4.7033e-06, + "loss": 0.8792, + "step": 53000 + }, + { + "epoch": 271.79487179487177, + "eval_loss": 1.1478773355484009, + "eval_runtime": 36.3942, + "eval_samples_per_second": 10.798, + "eval_steps_per_second": 1.374, + "step": 53000 + }, + { + "epoch": 272.3076923076923, + "grad_norm": 30.410303115844727, + "learning_rate": 4.6933000000000004e-06, + "loss": 0.8946, + "step": 53100 + }, + { + "epoch": 272.3076923076923, + "eval_loss": 1.1503480672836304, + "eval_runtime": 36.1212, + "eval_samples_per_second": 10.88, + "eval_steps_per_second": 1.384, + "step": 53100 + }, + { + "epoch": 272.8205128205128, + "grad_norm": 26.446216583251953, + "learning_rate": 4.6833000000000005e-06, + "loss": 0.8368, + "step": 53200 + }, + { + "epoch": 272.8205128205128, + "eval_loss": 1.1520763635635376, + "eval_runtime": 36.1636, + "eval_samples_per_second": 10.867, + "eval_steps_per_second": 1.383, + "step": 53200 + }, + { + "epoch": 273.3333333333333, + "grad_norm": 25.304534912109375, + "learning_rate": 4.673300000000001e-06, + "loss": 0.9014, + "step": 53300 + }, + { + "epoch": 273.3333333333333, + "eval_loss": 1.1331719160079956, + "eval_runtime": 36.1314, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 53300 + }, + { + "epoch": 273.84615384615387, + "grad_norm": 51.88557815551758, + "learning_rate": 4.663300000000001e-06, + "loss": 0.8674, + "step": 53400 + }, + { + "epoch": 273.84615384615387, + "eval_loss": 1.1463605165481567, + "eval_runtime": 35.909, + "eval_samples_per_second": 10.944, + "eval_steps_per_second": 1.392, + "step": 53400 + }, + { + "epoch": 274.35897435897436, + "grad_norm": 53.21933364868164, + "learning_rate": 4.653300000000001e-06, + "loss": 0.886, + "step": 53500 + }, + { + "epoch": 274.35897435897436, + "eval_loss": 1.1393417119979858, + "eval_runtime": 36.0941, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 53500 + }, + { + "epoch": 274.87179487179486, + "grad_norm": 47.593536376953125, + "learning_rate": 4.643300000000001e-06, + "loss": 0.8866, + "step": 53600 + }, + { + "epoch": 274.87179487179486, + "eval_loss": 1.1355082988739014, + "eval_runtime": 35.9594, + "eval_samples_per_second": 10.929, + "eval_steps_per_second": 1.39, + "step": 53600 + }, + { + "epoch": 275.38461538461536, + "grad_norm": 35.31821060180664, + "learning_rate": 4.6333e-06, + "loss": 0.8758, + "step": 53700 + }, + { + "epoch": 275.38461538461536, + "eval_loss": 1.1380585432052612, + "eval_runtime": 36.1258, + "eval_samples_per_second": 10.879, + "eval_steps_per_second": 1.384, + "step": 53700 + }, + { + "epoch": 275.8974358974359, + "grad_norm": 31.84410858154297, + "learning_rate": 4.6233e-06, + "loss": 0.8584, + "step": 53800 + }, + { + "epoch": 275.8974358974359, + "eval_loss": 1.139506459236145, + "eval_runtime": 36.0685, + "eval_samples_per_second": 10.896, + "eval_steps_per_second": 1.386, + "step": 53800 + }, + { + "epoch": 276.4102564102564, + "grad_norm": 14.109840393066406, + "learning_rate": 4.6133e-06, + "loss": 0.8712, + "step": 53900 + }, + { + "epoch": 276.4102564102564, + "eval_loss": 1.1366515159606934, + "eval_runtime": 36.1556, + "eval_samples_per_second": 10.87, + "eval_steps_per_second": 1.383, + "step": 53900 + }, + { + "epoch": 276.9230769230769, + "grad_norm": 19.018056869506836, + "learning_rate": 4.6033000000000005e-06, + "loss": 0.8978, + "step": 54000 + }, + { + "epoch": 276.9230769230769, + "eval_loss": 1.1294822692871094, + "eval_runtime": 35.9845, + "eval_samples_per_second": 10.921, + "eval_steps_per_second": 1.389, + "step": 54000 + }, + { + "epoch": 277.43589743589746, + "grad_norm": 61.409645080566406, + "learning_rate": 4.5933000000000006e-06, + "loss": 0.8481, + "step": 54100 + }, + { + "epoch": 277.43589743589746, + "eval_loss": 1.1292353868484497, + "eval_runtime": 36.1859, + "eval_samples_per_second": 10.861, + "eval_steps_per_second": 1.382, + "step": 54100 + }, + { + "epoch": 277.94871794871796, + "grad_norm": 108.62322235107422, + "learning_rate": 4.583300000000001e-06, + "loss": 0.9033, + "step": 54200 + }, + { + "epoch": 277.94871794871796, + "eval_loss": 1.1410841941833496, + "eval_runtime": 36.145, + "eval_samples_per_second": 10.873, + "eval_steps_per_second": 1.383, + "step": 54200 + }, + { + "epoch": 278.46153846153845, + "grad_norm": 40.35408401489258, + "learning_rate": 4.573300000000001e-06, + "loss": 0.8857, + "step": 54300 + }, + { + "epoch": 278.46153846153845, + "eval_loss": 1.1335570812225342, + "eval_runtime": 36.073, + "eval_samples_per_second": 10.895, + "eval_steps_per_second": 1.386, + "step": 54300 + }, + { + "epoch": 278.97435897435895, + "grad_norm": 58.30024337768555, + "learning_rate": 4.5634e-06, + "loss": 0.8501, + "step": 54400 + }, + { + "epoch": 278.97435897435895, + "eval_loss": 1.1359648704528809, + "eval_runtime": 36.4765, + "eval_samples_per_second": 10.774, + "eval_steps_per_second": 1.371, + "step": 54400 + }, + { + "epoch": 279.4871794871795, + "grad_norm": 39.219383239746094, + "learning_rate": 4.5534e-06, + "loss": 0.8622, + "step": 54500 + }, + { + "epoch": 279.4871794871795, + "eval_loss": 1.1281332969665527, + "eval_runtime": 36.1223, + "eval_samples_per_second": 10.88, + "eval_steps_per_second": 1.384, + "step": 54500 + }, + { + "epoch": 280.0, + "grad_norm": 30.55599594116211, + "learning_rate": 4.5434e-06, + "loss": 0.8966, + "step": 54600 + }, + { + "epoch": 280.0, + "eval_loss": 1.1225199699401855, + "eval_runtime": 36.155, + "eval_samples_per_second": 10.87, + "eval_steps_per_second": 1.383, + "step": 54600 + }, + { + "epoch": 280.5128205128205, + "grad_norm": 46.56840515136719, + "learning_rate": 4.5334000000000005e-06, + "loss": 0.863, + "step": 54700 + }, + { + "epoch": 280.5128205128205, + "eval_loss": 1.1088685989379883, + "eval_runtime": 36.275, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 54700 + }, + { + "epoch": 281.02564102564105, + "grad_norm": 41.028480529785156, + "learning_rate": 4.523400000000001e-06, + "loss": 0.878, + "step": 54800 + }, + { + "epoch": 281.02564102564105, + "eval_loss": 1.1299177408218384, + "eval_runtime": 36.2427, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 54800 + }, + { + "epoch": 281.53846153846155, + "grad_norm": 28.16250991821289, + "learning_rate": 4.513400000000001e-06, + "loss": 0.8539, + "step": 54900 + }, + { + "epoch": 281.53846153846155, + "eval_loss": 1.1401152610778809, + "eval_runtime": 36.3881, + "eval_samples_per_second": 10.8, + "eval_steps_per_second": 1.374, + "step": 54900 + }, + { + "epoch": 282.05128205128204, + "grad_norm": 52.706581115722656, + "learning_rate": 4.503400000000001e-06, + "loss": 0.8937, + "step": 55000 + }, + { + "epoch": 282.05128205128204, + "eval_loss": 1.13363778591156, + "eval_runtime": 36.2229, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 55000 + }, + { + "epoch": 282.56410256410254, + "grad_norm": 41.943607330322266, + "learning_rate": 4.493400000000001e-06, + "loss": 0.825, + "step": 55100 + }, + { + "epoch": 282.56410256410254, + "eval_loss": 1.130061149597168, + "eval_runtime": 36.2193, + "eval_samples_per_second": 10.851, + "eval_steps_per_second": 1.38, + "step": 55100 + }, + { + "epoch": 283.0769230769231, + "grad_norm": 27.637081146240234, + "learning_rate": 4.4834e-06, + "loss": 0.9255, + "step": 55200 + }, + { + "epoch": 283.0769230769231, + "eval_loss": 1.1405057907104492, + "eval_runtime": 36.0458, + "eval_samples_per_second": 10.903, + "eval_steps_per_second": 1.387, + "step": 55200 + }, + { + "epoch": 283.5897435897436, + "grad_norm": 31.099653244018555, + "learning_rate": 4.4734e-06, + "loss": 0.8334, + "step": 55300 + }, + { + "epoch": 283.5897435897436, + "eval_loss": 1.1467561721801758, + "eval_runtime": 36.213, + "eval_samples_per_second": 10.852, + "eval_steps_per_second": 1.381, + "step": 55300 + }, + { + "epoch": 284.1025641025641, + "grad_norm": 30.28895378112793, + "learning_rate": 4.4634e-06, + "loss": 0.863, + "step": 55400 + }, + { + "epoch": 284.1025641025641, + "eval_loss": 1.131995677947998, + "eval_runtime": 36.0232, + "eval_samples_per_second": 10.91, + "eval_steps_per_second": 1.388, + "step": 55400 + }, + { + "epoch": 284.61538461538464, + "grad_norm": 75.0301742553711, + "learning_rate": 4.4534000000000004e-06, + "loss": 0.8553, + "step": 55500 + }, + { + "epoch": 284.61538461538464, + "eval_loss": 1.1213085651397705, + "eval_runtime": 36.0481, + "eval_samples_per_second": 10.902, + "eval_steps_per_second": 1.387, + "step": 55500 + }, + { + "epoch": 285.12820512820514, + "grad_norm": 26.00135040283203, + "learning_rate": 4.4434000000000005e-06, + "loss": 0.8894, + "step": 55600 + }, + { + "epoch": 285.12820512820514, + "eval_loss": 1.116700530052185, + "eval_runtime": 36.1471, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 55600 + }, + { + "epoch": 285.64102564102564, + "grad_norm": 32.3271484375, + "learning_rate": 4.433400000000001e-06, + "loss": 0.8414, + "step": 55700 + }, + { + "epoch": 285.64102564102564, + "eval_loss": 1.13357412815094, + "eval_runtime": 36.2345, + "eval_samples_per_second": 10.846, + "eval_steps_per_second": 1.38, + "step": 55700 + }, + { + "epoch": 286.15384615384613, + "grad_norm": 29.941709518432617, + "learning_rate": 4.423400000000001e-06, + "loss": 0.8901, + "step": 55800 + }, + { + "epoch": 286.15384615384613, + "eval_loss": 1.1295361518859863, + "eval_runtime": 36.2685, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 55800 + }, + { + "epoch": 286.6666666666667, + "grad_norm": 69.54522705078125, + "learning_rate": 4.413400000000001e-06, + "loss": 0.8598, + "step": 55900 + }, + { + "epoch": 286.6666666666667, + "eval_loss": 1.1274800300598145, + "eval_runtime": 36.1893, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 55900 + }, + { + "epoch": 287.1794871794872, + "grad_norm": 43.70930862426758, + "learning_rate": 4.4034e-06, + "loss": 0.8827, + "step": 56000 + }, + { + "epoch": 287.1794871794872, + "eval_loss": 1.1295197010040283, + "eval_runtime": 36.2576, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 56000 + }, + { + "epoch": 287.6923076923077, + "grad_norm": 35.178009033203125, + "learning_rate": 4.3934e-06, + "loss": 0.8551, + "step": 56100 + }, + { + "epoch": 287.6923076923077, + "eval_loss": 1.1459099054336548, + "eval_runtime": 36.144, + "eval_samples_per_second": 10.873, + "eval_steps_per_second": 1.383, + "step": 56100 + }, + { + "epoch": 288.20512820512823, + "grad_norm": 63.10675811767578, + "learning_rate": 4.3834e-06, + "loss": 0.8614, + "step": 56200 + }, + { + "epoch": 288.20512820512823, + "eval_loss": 1.1338379383087158, + "eval_runtime": 36.0428, + "eval_samples_per_second": 10.904, + "eval_steps_per_second": 1.387, + "step": 56200 + }, + { + "epoch": 288.71794871794873, + "grad_norm": 26.874658584594727, + "learning_rate": 4.3734e-06, + "loss": 0.8831, + "step": 56300 + }, + { + "epoch": 288.71794871794873, + "eval_loss": 1.1341028213500977, + "eval_runtime": 36.0473, + "eval_samples_per_second": 10.902, + "eval_steps_per_second": 1.387, + "step": 56300 + }, + { + "epoch": 289.2307692307692, + "grad_norm": 44.12113571166992, + "learning_rate": 4.3634000000000004e-06, + "loss": 0.834, + "step": 56400 + }, + { + "epoch": 289.2307692307692, + "eval_loss": 1.142713189125061, + "eval_runtime": 36.2172, + "eval_samples_per_second": 10.851, + "eval_steps_per_second": 1.381, + "step": 56400 + }, + { + "epoch": 289.7435897435897, + "grad_norm": 32.364463806152344, + "learning_rate": 4.353500000000001e-06, + "loss": 0.8986, + "step": 56500 + }, + { + "epoch": 289.7435897435897, + "eval_loss": 1.1315776109695435, + "eval_runtime": 36.4739, + "eval_samples_per_second": 10.775, + "eval_steps_per_second": 1.371, + "step": 56500 + }, + { + "epoch": 290.2564102564103, + "grad_norm": 17.216848373413086, + "learning_rate": 4.343500000000001e-06, + "loss": 0.8301, + "step": 56600 + }, + { + "epoch": 290.2564102564103, + "eval_loss": 1.1266990900039673, + "eval_runtime": 35.9428, + "eval_samples_per_second": 10.934, + "eval_steps_per_second": 1.391, + "step": 56600 + }, + { + "epoch": 290.7692307692308, + "grad_norm": 45.26468276977539, + "learning_rate": 4.3335e-06, + "loss": 0.8923, + "step": 56700 + }, + { + "epoch": 290.7692307692308, + "eval_loss": 1.1295489072799683, + "eval_runtime": 36.0228, + "eval_samples_per_second": 10.91, + "eval_steps_per_second": 1.388, + "step": 56700 + }, + { + "epoch": 291.28205128205127, + "grad_norm": 26.592758178710938, + "learning_rate": 4.3235e-06, + "loss": 0.851, + "step": 56800 + }, + { + "epoch": 291.28205128205127, + "eval_loss": 1.1603025197982788, + "eval_runtime": 36.1616, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 56800 + }, + { + "epoch": 291.79487179487177, + "grad_norm": 38.846736907958984, + "learning_rate": 4.3135e-06, + "loss": 0.8777, + "step": 56900 + }, + { + "epoch": 291.79487179487177, + "eval_loss": 1.1385276317596436, + "eval_runtime": 36.017, + "eval_samples_per_second": 10.912, + "eval_steps_per_second": 1.388, + "step": 56900 + }, + { + "epoch": 292.3076923076923, + "grad_norm": 56.09749221801758, + "learning_rate": 4.3035e-06, + "loss": 0.8709, + "step": 57000 + }, + { + "epoch": 292.3076923076923, + "eval_loss": 1.1382383108139038, + "eval_runtime": 36.2928, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 57000 + }, + { + "epoch": 292.8205128205128, + "grad_norm": 36.632686614990234, + "learning_rate": 4.2935000000000005e-06, + "loss": 0.8461, + "step": 57100 + }, + { + "epoch": 292.8205128205128, + "eval_loss": 1.1344859600067139, + "eval_runtime": 36.109, + "eval_samples_per_second": 10.884, + "eval_steps_per_second": 1.385, + "step": 57100 + }, + { + "epoch": 293.3333333333333, + "grad_norm": 39.6942253112793, + "learning_rate": 4.283500000000001e-06, + "loss": 0.8683, + "step": 57200 + }, + { + "epoch": 293.3333333333333, + "eval_loss": 1.1459788084030151, + "eval_runtime": 36.0625, + "eval_samples_per_second": 10.898, + "eval_steps_per_second": 1.386, + "step": 57200 + }, + { + "epoch": 293.84615384615387, + "grad_norm": 35.99827575683594, + "learning_rate": 4.273500000000001e-06, + "loss": 0.8956, + "step": 57300 + }, + { + "epoch": 293.84615384615387, + "eval_loss": 1.1262143850326538, + "eval_runtime": 36.0178, + "eval_samples_per_second": 10.911, + "eval_steps_per_second": 1.388, + "step": 57300 + }, + { + "epoch": 294.35897435897436, + "grad_norm": 41.230899810791016, + "learning_rate": 4.263500000000001e-06, + "loss": 0.817, + "step": 57400 + }, + { + "epoch": 294.35897435897436, + "eval_loss": 1.1344050168991089, + "eval_runtime": 36.0438, + "eval_samples_per_second": 10.903, + "eval_steps_per_second": 1.387, + "step": 57400 + }, + { + "epoch": 294.87179487179486, + "grad_norm": 43.20774841308594, + "learning_rate": 4.2536e-06, + "loss": 0.8668, + "step": 57500 + }, + { + "epoch": 294.87179487179486, + "eval_loss": 1.136633276939392, + "eval_runtime": 36.0645, + "eval_samples_per_second": 10.897, + "eval_steps_per_second": 1.386, + "step": 57500 + }, + { + "epoch": 295.38461538461536, + "grad_norm": 36.17228698730469, + "learning_rate": 4.2436e-06, + "loss": 0.8744, + "step": 57600 + }, + { + "epoch": 295.38461538461536, + "eval_loss": 1.1418293714523315, + "eval_runtime": 36.2511, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 57600 + }, + { + "epoch": 295.8974358974359, + "grad_norm": 24.944168090820312, + "learning_rate": 4.2336000000000004e-06, + "loss": 0.8718, + "step": 57700 + }, + { + "epoch": 295.8974358974359, + "eval_loss": 1.1473342180252075, + "eval_runtime": 36.2603, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 57700 + }, + { + "epoch": 296.4102564102564, + "grad_norm": 61.16934585571289, + "learning_rate": 4.2236000000000005e-06, + "loss": 0.8366, + "step": 57800 + }, + { + "epoch": 296.4102564102564, + "eval_loss": 1.144806981086731, + "eval_runtime": 36.236, + "eval_samples_per_second": 10.846, + "eval_steps_per_second": 1.38, + "step": 57800 + }, + { + "epoch": 296.9230769230769, + "grad_norm": 43.54716491699219, + "learning_rate": 4.213600000000001e-06, + "loss": 0.8657, + "step": 57900 + }, + { + "epoch": 296.9230769230769, + "eval_loss": 1.1215286254882812, + "eval_runtime": 36.2582, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 57900 + }, + { + "epoch": 297.43589743589746, + "grad_norm": 24.557117462158203, + "learning_rate": 4.203600000000001e-06, + "loss": 0.889, + "step": 58000 + }, + { + "epoch": 297.43589743589746, + "eval_loss": 1.120747685432434, + "eval_runtime": 36.2706, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.379, + "step": 58000 + }, + { + "epoch": 297.94871794871796, + "grad_norm": 51.271366119384766, + "learning_rate": 4.193600000000001e-06, + "loss": 0.8405, + "step": 58100 + }, + { + "epoch": 297.94871794871796, + "eval_loss": 1.1216601133346558, + "eval_runtime": 36.3155, + "eval_samples_per_second": 10.822, + "eval_steps_per_second": 1.377, + "step": 58100 + }, + { + "epoch": 298.46153846153845, + "grad_norm": 29.24059295654297, + "learning_rate": 4.1836e-06, + "loss": 0.8447, + "step": 58200 + }, + { + "epoch": 298.46153846153845, + "eval_loss": 1.1474366188049316, + "eval_runtime": 36.2846, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 58200 + }, + { + "epoch": 298.97435897435895, + "grad_norm": 34.42660903930664, + "learning_rate": 4.1736e-06, + "loss": 0.8895, + "step": 58300 + }, + { + "epoch": 298.97435897435895, + "eval_loss": 1.1352310180664062, + "eval_runtime": 36.0562, + "eval_samples_per_second": 10.9, + "eval_steps_per_second": 1.387, + "step": 58300 + }, + { + "epoch": 299.4871794871795, + "grad_norm": 46.0521354675293, + "learning_rate": 4.1636e-06, + "loss": 0.87, + "step": 58400 + }, + { + "epoch": 299.4871794871795, + "eval_loss": 1.1243605613708496, + "eval_runtime": 36.3719, + "eval_samples_per_second": 10.805, + "eval_steps_per_second": 1.375, + "step": 58400 + }, + { + "epoch": 300.0, + "grad_norm": 12.998165130615234, + "learning_rate": 4.1536e-06, + "loss": 0.8581, + "step": 58500 + }, + { + "epoch": 300.0, + "eval_loss": 1.130947470664978, + "eval_runtime": 36.2878, + "eval_samples_per_second": 10.83, + "eval_steps_per_second": 1.378, + "step": 58500 + }, + { + "epoch": 300.5128205128205, + "grad_norm": 28.472652435302734, + "learning_rate": 4.1436000000000004e-06, + "loss": 0.8438, + "step": 58600 + }, + { + "epoch": 300.5128205128205, + "eval_loss": 1.1358656883239746, + "eval_runtime": 36.2581, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 58600 + }, + { + "epoch": 301.02564102564105, + "grad_norm": 39.82411193847656, + "learning_rate": 4.1336000000000005e-06, + "loss": 0.8793, + "step": 58700 + }, + { + "epoch": 301.02564102564105, + "eval_loss": 1.151535153388977, + "eval_runtime": 36.271, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.379, + "step": 58700 + }, + { + "epoch": 301.53846153846155, + "grad_norm": 33.88236618041992, + "learning_rate": 4.123600000000001e-06, + "loss": 0.8573, + "step": 58800 + }, + { + "epoch": 301.53846153846155, + "eval_loss": 1.1491762399673462, + "eval_runtime": 36.2736, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 58800 + }, + { + "epoch": 302.05128205128204, + "grad_norm": 74.63835906982422, + "learning_rate": 4.113600000000001e-06, + "loss": 0.8678, + "step": 58900 + }, + { + "epoch": 302.05128205128204, + "eval_loss": 1.1325905323028564, + "eval_runtime": 36.1605, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 58900 + }, + { + "epoch": 302.56410256410254, + "grad_norm": 18.790489196777344, + "learning_rate": 4.1036e-06, + "loss": 0.8579, + "step": 59000 + }, + { + "epoch": 302.56410256410254, + "eval_loss": 1.1320217847824097, + "eval_runtime": 36.1183, + "eval_samples_per_second": 10.881, + "eval_steps_per_second": 1.384, + "step": 59000 + }, + { + "epoch": 303.0769230769231, + "grad_norm": 53.02982711791992, + "learning_rate": 4.0936e-06, + "loss": 0.8744, + "step": 59100 + }, + { + "epoch": 303.0769230769231, + "eval_loss": 1.1578673124313354, + "eval_runtime": 36.2429, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 59100 + }, + { + "epoch": 303.5897435897436, + "grad_norm": 29.44129753112793, + "learning_rate": 4.0836e-06, + "loss": 0.8168, + "step": 59200 + }, + { + "epoch": 303.5897435897436, + "eval_loss": 1.13029146194458, + "eval_runtime": 36.3476, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 59200 + }, + { + "epoch": 304.1025641025641, + "grad_norm": 23.5069522857666, + "learning_rate": 4.0736e-06, + "loss": 0.8519, + "step": 59300 + }, + { + "epoch": 304.1025641025641, + "eval_loss": 1.1346431970596313, + "eval_runtime": 36.2725, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 59300 + }, + { + "epoch": 304.61538461538464, + "grad_norm": 33.96268844604492, + "learning_rate": 4.0636e-06, + "loss": 0.8649, + "step": 59400 + }, + { + "epoch": 304.61538461538464, + "eval_loss": 1.138556718826294, + "eval_runtime": 36.2069, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 59400 + }, + { + "epoch": 305.12820512820514, + "grad_norm": 36.306922912597656, + "learning_rate": 4.0536000000000005e-06, + "loss": 0.8503, + "step": 59500 + }, + { + "epoch": 305.12820512820514, + "eval_loss": 1.1353572607040405, + "eval_runtime": 36.2673, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 59500 + }, + { + "epoch": 305.64102564102564, + "grad_norm": 57.436378479003906, + "learning_rate": 4.0436000000000006e-06, + "loss": 0.8518, + "step": 59600 + }, + { + "epoch": 305.64102564102564, + "eval_loss": 1.1379770040512085, + "eval_runtime": 36.3601, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 59600 + }, + { + "epoch": 306.15384615384613, + "grad_norm": 40.419063568115234, + "learning_rate": 4.033600000000001e-06, + "loss": 0.8434, + "step": 59700 + }, + { + "epoch": 306.15384615384613, + "eval_loss": 1.117214322090149, + "eval_runtime": 36.4071, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 59700 + }, + { + "epoch": 306.6666666666667, + "grad_norm": 24.74186134338379, + "learning_rate": 4.0236e-06, + "loss": 0.8518, + "step": 59800 + }, + { + "epoch": 306.6666666666667, + "eval_loss": 1.1457172632217407, + "eval_runtime": 36.3384, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 59800 + }, + { + "epoch": 307.1794871794872, + "grad_norm": 24.197113037109375, + "learning_rate": 4.0136e-06, + "loss": 0.8503, + "step": 59900 + }, + { + "epoch": 307.1794871794872, + "eval_loss": 1.1278254985809326, + "eval_runtime": 36.3113, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 59900 + }, + { + "epoch": 307.6923076923077, + "grad_norm": 20.559213638305664, + "learning_rate": 4.0036e-06, + "loss": 0.8422, + "step": 60000 + }, + { + "epoch": 307.6923076923077, + "eval_loss": 1.1234817504882812, + "eval_runtime": 36.4132, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 60000 + }, + { + "epoch": 308.20512820512823, + "grad_norm": 39.70729064941406, + "learning_rate": 3.9936e-06, + "loss": 0.879, + "step": 60100 + }, + { + "epoch": 308.20512820512823, + "eval_loss": 1.1375590562820435, + "eval_runtime": 36.3416, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 60100 + }, + { + "epoch": 308.71794871794873, + "grad_norm": 36.46154022216797, + "learning_rate": 3.9836e-06, + "loss": 0.8612, + "step": 60200 + }, + { + "epoch": 308.71794871794873, + "eval_loss": 1.139458179473877, + "eval_runtime": 36.4076, + "eval_samples_per_second": 10.794, + "eval_steps_per_second": 1.373, + "step": 60200 + }, + { + "epoch": 309.2307692307692, + "grad_norm": 39.92527770996094, + "learning_rate": 3.9736e-06, + "loss": 0.8556, + "step": 60300 + }, + { + "epoch": 309.2307692307692, + "eval_loss": 1.138722538948059, + "eval_runtime": 36.2737, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 60300 + }, + { + "epoch": 309.7435897435897, + "grad_norm": 35.99172592163086, + "learning_rate": 3.9636000000000005e-06, + "loss": 0.8799, + "step": 60400 + }, + { + "epoch": 309.7435897435897, + "eval_loss": 1.1376827955245972, + "eval_runtime": 36.2745, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 60400 + }, + { + "epoch": 310.2564102564103, + "grad_norm": 28.24496078491211, + "learning_rate": 3.9536000000000006e-06, + "loss": 0.8489, + "step": 60500 + }, + { + "epoch": 310.2564102564103, + "eval_loss": 1.132504940032959, + "eval_runtime": 36.295, + "eval_samples_per_second": 10.828, + "eval_steps_per_second": 1.378, + "step": 60500 + }, + { + "epoch": 310.7692307692308, + "grad_norm": 34.48954772949219, + "learning_rate": 3.9436e-06, + "loss": 0.8638, + "step": 60600 + }, + { + "epoch": 310.7692307692308, + "eval_loss": 1.1368725299835205, + "eval_runtime": 36.1508, + "eval_samples_per_second": 10.871, + "eval_steps_per_second": 1.383, + "step": 60600 + }, + { + "epoch": 311.28205128205127, + "grad_norm": 27.438060760498047, + "learning_rate": 3.9336e-06, + "loss": 0.8261, + "step": 60700 + }, + { + "epoch": 311.28205128205127, + "eval_loss": 1.135315179824829, + "eval_runtime": 36.1366, + "eval_samples_per_second": 10.875, + "eval_steps_per_second": 1.384, + "step": 60700 + }, + { + "epoch": 311.79487179487177, + "grad_norm": 35.011592864990234, + "learning_rate": 3.9236e-06, + "loss": 0.8545, + "step": 60800 + }, + { + "epoch": 311.79487179487177, + "eval_loss": 1.1467878818511963, + "eval_runtime": 36.3756, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 60800 + }, + { + "epoch": 312.3076923076923, + "grad_norm": 69.58039855957031, + "learning_rate": 3.9136e-06, + "loss": 0.8148, + "step": 60900 + }, + { + "epoch": 312.3076923076923, + "eval_loss": 1.1344656944274902, + "eval_runtime": 36.2717, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 60900 + }, + { + "epoch": 312.8205128205128, + "grad_norm": 46.799896240234375, + "learning_rate": 3.9036e-06, + "loss": 0.8826, + "step": 61000 + }, + { + "epoch": 312.8205128205128, + "eval_loss": 1.116567850112915, + "eval_runtime": 36.3393, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 61000 + }, + { + "epoch": 313.3333333333333, + "grad_norm": 20.304611206054688, + "learning_rate": 3.8936e-06, + "loss": 0.8471, + "step": 61100 + }, + { + "epoch": 313.3333333333333, + "eval_loss": 1.121153473854065, + "eval_runtime": 36.2902, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 61100 + }, + { + "epoch": 313.84615384615387, + "grad_norm": 34.00658416748047, + "learning_rate": 3.8836e-06, + "loss": 0.8661, + "step": 61200 + }, + { + "epoch": 313.84615384615387, + "eval_loss": 1.1305875778198242, + "eval_runtime": 36.3498, + "eval_samples_per_second": 10.812, + "eval_steps_per_second": 1.376, + "step": 61200 + }, + { + "epoch": 314.35897435897436, + "grad_norm": 34.77503204345703, + "learning_rate": 3.8736000000000005e-06, + "loss": 0.8547, + "step": 61300 + }, + { + "epoch": 314.35897435897436, + "eval_loss": 1.1209969520568848, + "eval_runtime": 36.3654, + "eval_samples_per_second": 10.807, + "eval_steps_per_second": 1.375, + "step": 61300 + }, + { + "epoch": 314.87179487179486, + "grad_norm": 22.354047775268555, + "learning_rate": 3.8636e-06, + "loss": 0.8603, + "step": 61400 + }, + { + "epoch": 314.87179487179486, + "eval_loss": 1.1311761140823364, + "eval_runtime": 36.5129, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 61400 + }, + { + "epoch": 315.38461538461536, + "grad_norm": 54.11902618408203, + "learning_rate": 3.8537e-06, + "loss": 0.8196, + "step": 61500 + }, + { + "epoch": 315.38461538461536, + "eval_loss": 1.1318939924240112, + "eval_runtime": 36.4949, + "eval_samples_per_second": 10.769, + "eval_steps_per_second": 1.37, + "step": 61500 + }, + { + "epoch": 315.8974358974359, + "grad_norm": 30.516218185424805, + "learning_rate": 3.8437e-06, + "loss": 0.8569, + "step": 61600 + }, + { + "epoch": 315.8974358974359, + "eval_loss": 1.13310706615448, + "eval_runtime": 36.626, + "eval_samples_per_second": 10.73, + "eval_steps_per_second": 1.365, + "step": 61600 + }, + { + "epoch": 316.4102564102564, + "grad_norm": 23.061357498168945, + "learning_rate": 3.8337e-06, + "loss": 0.8591, + "step": 61700 + }, + { + "epoch": 316.4102564102564, + "eval_loss": 1.130542278289795, + "eval_runtime": 36.4064, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 61700 + }, + { + "epoch": 316.9230769230769, + "grad_norm": 35.596534729003906, + "learning_rate": 3.8237e-06, + "loss": 0.8619, + "step": 61800 + }, + { + "epoch": 316.9230769230769, + "eval_loss": 1.123049020767212, + "eval_runtime": 36.3821, + "eval_samples_per_second": 10.802, + "eval_steps_per_second": 1.374, + "step": 61800 + }, + { + "epoch": 317.43589743589746, + "grad_norm": 35.28330993652344, + "learning_rate": 3.8137e-06, + "loss": 0.817, + "step": 61900 + }, + { + "epoch": 317.43589743589746, + "eval_loss": 1.140207052230835, + "eval_runtime": 36.3773, + "eval_samples_per_second": 10.803, + "eval_steps_per_second": 1.374, + "step": 61900 + }, + { + "epoch": 317.94871794871796, + "grad_norm": 27.04100799560547, + "learning_rate": 3.8037e-06, + "loss": 0.8687, + "step": 62000 + }, + { + "epoch": 317.94871794871796, + "eval_loss": 1.1311331987380981, + "eval_runtime": 36.2836, + "eval_samples_per_second": 10.831, + "eval_steps_per_second": 1.378, + "step": 62000 + }, + { + "epoch": 318.46153846153845, + "grad_norm": 37.69451141357422, + "learning_rate": 3.7937e-06, + "loss": 0.8476, + "step": 62100 + }, + { + "epoch": 318.46153846153845, + "eval_loss": 1.1316076517105103, + "eval_runtime": 36.2475, + "eval_samples_per_second": 10.842, + "eval_steps_per_second": 1.379, + "step": 62100 + }, + { + "epoch": 318.97435897435895, + "grad_norm": 38.90933609008789, + "learning_rate": 3.7837000000000003e-06, + "loss": 0.8372, + "step": 62200 + }, + { + "epoch": 318.97435897435895, + "eval_loss": 1.1558796167373657, + "eval_runtime": 36.2606, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 62200 + }, + { + "epoch": 319.4871794871795, + "grad_norm": 30.965660095214844, + "learning_rate": 3.7737e-06, + "loss": 0.8714, + "step": 62300 + }, + { + "epoch": 319.4871794871795, + "eval_loss": 1.144376277923584, + "eval_runtime": 36.2641, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 62300 + }, + { + "epoch": 320.0, + "grad_norm": 30.53451919555664, + "learning_rate": 3.7637e-06, + "loss": 0.8316, + "step": 62400 + }, + { + "epoch": 320.0, + "eval_loss": 1.1326403617858887, + "eval_runtime": 36.2455, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 62400 + }, + { + "epoch": 320.5128205128205, + "grad_norm": 81.33617401123047, + "learning_rate": 3.7537e-06, + "loss": 0.8453, + "step": 62500 + }, + { + "epoch": 320.5128205128205, + "eval_loss": 1.1305010318756104, + "eval_runtime": 36.2462, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 62500 + }, + { + "epoch": 321.02564102564105, + "grad_norm": 33.47633743286133, + "learning_rate": 3.7437000000000003e-06, + "loss": 0.8419, + "step": 62600 + }, + { + "epoch": 321.02564102564105, + "eval_loss": 1.1317977905273438, + "eval_runtime": 36.3302, + "eval_samples_per_second": 10.817, + "eval_steps_per_second": 1.376, + "step": 62600 + }, + { + "epoch": 321.53846153846155, + "grad_norm": 24.6075382232666, + "learning_rate": 3.7337e-06, + "loss": 0.8981, + "step": 62700 + }, + { + "epoch": 321.53846153846155, + "eval_loss": 1.1273932456970215, + "eval_runtime": 36.343, + "eval_samples_per_second": 10.814, + "eval_steps_per_second": 1.376, + "step": 62700 + }, + { + "epoch": 322.05128205128204, + "grad_norm": 28.492483139038086, + "learning_rate": 3.7237e-06, + "loss": 0.8088, + "step": 62800 + }, + { + "epoch": 322.05128205128204, + "eval_loss": 1.137994647026062, + "eval_runtime": 36.3578, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 62800 + }, + { + "epoch": 322.56410256410254, + "grad_norm": 27.50029182434082, + "learning_rate": 3.7137e-06, + "loss": 0.8147, + "step": 62900 + }, + { + "epoch": 322.56410256410254, + "eval_loss": 1.1336575746536255, + "eval_runtime": 36.2268, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 62900 + }, + { + "epoch": 323.0769230769231, + "grad_norm": 17.780519485473633, + "learning_rate": 3.7037000000000002e-06, + "loss": 0.8634, + "step": 63000 + }, + { + "epoch": 323.0769230769231, + "eval_loss": 1.12924325466156, + "eval_runtime": 36.4323, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 63000 + }, + { + "epoch": 323.5897435897436, + "grad_norm": 37.044708251953125, + "learning_rate": 3.6937e-06, + "loss": 0.8586, + "step": 63100 + }, + { + "epoch": 323.5897435897436, + "eval_loss": 1.1343861818313599, + "eval_runtime": 36.3524, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 63100 + }, + { + "epoch": 324.1025641025641, + "grad_norm": 39.34878921508789, + "learning_rate": 3.6837e-06, + "loss": 0.8077, + "step": 63200 + }, + { + "epoch": 324.1025641025641, + "eval_loss": 1.1381350755691528, + "eval_runtime": 36.2233, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 63200 + }, + { + "epoch": 324.61538461538464, + "grad_norm": 88.59685516357422, + "learning_rate": 3.6737e-06, + "loss": 0.8242, + "step": 63300 + }, + { + "epoch": 324.61538461538464, + "eval_loss": 1.1444284915924072, + "eval_runtime": 36.3756, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 63300 + }, + { + "epoch": 325.12820512820514, + "grad_norm": 37.44831848144531, + "learning_rate": 3.6637e-06, + "loss": 0.8602, + "step": 63400 + }, + { + "epoch": 325.12820512820514, + "eval_loss": 1.1373730897903442, + "eval_runtime": 36.3154, + "eval_samples_per_second": 10.822, + "eval_steps_per_second": 1.377, + "step": 63400 + }, + { + "epoch": 325.64102564102564, + "grad_norm": 28.210973739624023, + "learning_rate": 3.6537e-06, + "loss": 0.8754, + "step": 63500 + }, + { + "epoch": 325.64102564102564, + "eval_loss": 1.1477177143096924, + "eval_runtime": 36.2632, + "eval_samples_per_second": 10.837, + "eval_steps_per_second": 1.379, + "step": 63500 + }, + { + "epoch": 326.15384615384613, + "grad_norm": 35.3448486328125, + "learning_rate": 3.6437000000000004e-06, + "loss": 0.8314, + "step": 63600 + }, + { + "epoch": 326.15384615384613, + "eval_loss": 1.1356128454208374, + "eval_runtime": 36.028, + "eval_samples_per_second": 10.908, + "eval_steps_per_second": 1.388, + "step": 63600 + }, + { + "epoch": 326.6666666666667, + "grad_norm": 25.399267196655273, + "learning_rate": 3.6337000000000005e-06, + "loss": 0.8351, + "step": 63700 + }, + { + "epoch": 326.6666666666667, + "eval_loss": 1.1474655866622925, + "eval_runtime": 36.1367, + "eval_samples_per_second": 10.875, + "eval_steps_per_second": 1.384, + "step": 63700 + }, + { + "epoch": 327.1794871794872, + "grad_norm": 20.2020320892334, + "learning_rate": 3.6238e-06, + "loss": 0.8422, + "step": 63800 + }, + { + "epoch": 327.1794871794872, + "eval_loss": 1.1537506580352783, + "eval_runtime": 36.0015, + "eval_samples_per_second": 10.916, + "eval_steps_per_second": 1.389, + "step": 63800 + }, + { + "epoch": 327.6923076923077, + "grad_norm": 20.147663116455078, + "learning_rate": 3.6139000000000002e-06, + "loss": 0.8267, + "step": 63900 + }, + { + "epoch": 327.6923076923077, + "eval_loss": 1.1340347528457642, + "eval_runtime": 36.2054, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 63900 + }, + { + "epoch": 328.20512820512823, + "grad_norm": 48.518009185791016, + "learning_rate": 3.6039000000000003e-06, + "loss": 0.8798, + "step": 64000 + }, + { + "epoch": 328.20512820512823, + "eval_loss": 1.1437115669250488, + "eval_runtime": 36.1832, + "eval_samples_per_second": 10.861, + "eval_steps_per_second": 1.382, + "step": 64000 + }, + { + "epoch": 328.71794871794873, + "grad_norm": 26.967958450317383, + "learning_rate": 3.5939e-06, + "loss": 0.8371, + "step": 64100 + }, + { + "epoch": 328.71794871794873, + "eval_loss": 1.1382254362106323, + "eval_runtime": 35.9994, + "eval_samples_per_second": 10.917, + "eval_steps_per_second": 1.389, + "step": 64100 + }, + { + "epoch": 329.2307692307692, + "grad_norm": 21.064455032348633, + "learning_rate": 3.5839e-06, + "loss": 0.8527, + "step": 64200 + }, + { + "epoch": 329.2307692307692, + "eval_loss": 1.13132643699646, + "eval_runtime": 35.9125, + "eval_samples_per_second": 10.943, + "eval_steps_per_second": 1.392, + "step": 64200 + }, + { + "epoch": 329.7435897435897, + "grad_norm": 54.478694915771484, + "learning_rate": 3.5739e-06, + "loss": 0.8274, + "step": 64300 + }, + { + "epoch": 329.7435897435897, + "eval_loss": 1.13289475440979, + "eval_runtime": 35.958, + "eval_samples_per_second": 10.929, + "eval_steps_per_second": 1.391, + "step": 64300 + }, + { + "epoch": 330.2564102564103, + "grad_norm": 20.714599609375, + "learning_rate": 3.5639000000000003e-06, + "loss": 0.8464, + "step": 64400 + }, + { + "epoch": 330.2564102564103, + "eval_loss": 1.138232946395874, + "eval_runtime": 36.0472, + "eval_samples_per_second": 10.902, + "eval_steps_per_second": 1.387, + "step": 64400 + }, + { + "epoch": 330.7692307692308, + "grad_norm": 32.57964324951172, + "learning_rate": 3.5539e-06, + "loss": 0.8201, + "step": 64500 + }, + { + "epoch": 330.7692307692308, + "eval_loss": 1.1389477252960205, + "eval_runtime": 36.1958, + "eval_samples_per_second": 10.858, + "eval_steps_per_second": 1.381, + "step": 64500 + }, + { + "epoch": 331.28205128205127, + "grad_norm": 20.443450927734375, + "learning_rate": 3.5439e-06, + "loss": 0.8895, + "step": 64600 + }, + { + "epoch": 331.28205128205127, + "eval_loss": 1.1412203311920166, + "eval_runtime": 36.8154, + "eval_samples_per_second": 10.675, + "eval_steps_per_second": 1.358, + "step": 64600 + }, + { + "epoch": 331.79487179487177, + "grad_norm": 52.59553146362305, + "learning_rate": 3.5339e-06, + "loss": 0.831, + "step": 64700 + }, + { + "epoch": 331.79487179487177, + "eval_loss": 1.143002986907959, + "eval_runtime": 36.9539, + "eval_samples_per_second": 10.635, + "eval_steps_per_second": 1.353, + "step": 64700 + }, + { + "epoch": 332.3076923076923, + "grad_norm": 82.77082824707031, + "learning_rate": 3.5239000000000003e-06, + "loss": 0.7722, + "step": 64800 + }, + { + "epoch": 332.3076923076923, + "eval_loss": 1.1405818462371826, + "eval_runtime": 36.9377, + "eval_samples_per_second": 10.64, + "eval_steps_per_second": 1.354, + "step": 64800 + }, + { + "epoch": 332.8205128205128, + "grad_norm": 41.14771270751953, + "learning_rate": 3.5139e-06, + "loss": 0.8565, + "step": 64900 + }, + { + "epoch": 332.8205128205128, + "eval_loss": 1.1375489234924316, + "eval_runtime": 36.9952, + "eval_samples_per_second": 10.623, + "eval_steps_per_second": 1.352, + "step": 64900 + }, + { + "epoch": 333.3333333333333, + "grad_norm": 75.23269653320312, + "learning_rate": 3.5039e-06, + "loss": 0.8179, + "step": 65000 + }, + { + "epoch": 333.3333333333333, + "eval_loss": 1.136751413345337, + "eval_runtime": 36.9622, + "eval_samples_per_second": 10.632, + "eval_steps_per_second": 1.353, + "step": 65000 + }, + { + "epoch": 333.84615384615387, + "grad_norm": 46.79996109008789, + "learning_rate": 3.4939e-06, + "loss": 0.8433, + "step": 65100 + }, + { + "epoch": 333.84615384615387, + "eval_loss": 1.1267091035842896, + "eval_runtime": 36.8058, + "eval_samples_per_second": 10.678, + "eval_steps_per_second": 1.358, + "step": 65100 + }, + { + "epoch": 334.35897435897436, + "grad_norm": 35.528564453125, + "learning_rate": 3.4839000000000002e-06, + "loss": 0.8579, + "step": 65200 + }, + { + "epoch": 334.35897435897436, + "eval_loss": 1.1351813077926636, + "eval_runtime": 36.2196, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 65200 + }, + { + "epoch": 334.87179487179486, + "grad_norm": 22.288785934448242, + "learning_rate": 3.4739e-06, + "loss": 0.8481, + "step": 65300 + }, + { + "epoch": 334.87179487179486, + "eval_loss": 1.1294851303100586, + "eval_runtime": 35.9674, + "eval_samples_per_second": 10.927, + "eval_steps_per_second": 1.39, + "step": 65300 + }, + { + "epoch": 335.38461538461536, + "grad_norm": 30.772056579589844, + "learning_rate": 3.4639e-06, + "loss": 0.8161, + "step": 65400 + }, + { + "epoch": 335.38461538461536, + "eval_loss": 1.1349191665649414, + "eval_runtime": 35.9238, + "eval_samples_per_second": 10.94, + "eval_steps_per_second": 1.392, + "step": 65400 + }, + { + "epoch": 335.8974358974359, + "grad_norm": 42.20217514038086, + "learning_rate": 3.4539e-06, + "loss": 0.8608, + "step": 65500 + }, + { + "epoch": 335.8974358974359, + "eval_loss": 1.1342188119888306, + "eval_runtime": 36.0745, + "eval_samples_per_second": 10.894, + "eval_steps_per_second": 1.386, + "step": 65500 + }, + { + "epoch": 336.4102564102564, + "grad_norm": 20.258020401000977, + "learning_rate": 3.4439e-06, + "loss": 0.8137, + "step": 65600 + }, + { + "epoch": 336.4102564102564, + "eval_loss": 1.1354069709777832, + "eval_runtime": 35.9246, + "eval_samples_per_second": 10.94, + "eval_steps_per_second": 1.392, + "step": 65600 + }, + { + "epoch": 336.9230769230769, + "grad_norm": 42.6712532043457, + "learning_rate": 3.4339e-06, + "loss": 0.8424, + "step": 65700 + }, + { + "epoch": 336.9230769230769, + "eval_loss": 1.1437708139419556, + "eval_runtime": 36.4952, + "eval_samples_per_second": 10.769, + "eval_steps_per_second": 1.37, + "step": 65700 + }, + { + "epoch": 337.43589743589746, + "grad_norm": 35.17815017700195, + "learning_rate": 3.424e-06, + "loss": 0.8378, + "step": 65800 + }, + { + "epoch": 337.43589743589746, + "eval_loss": 1.1268436908721924, + "eval_runtime": 36.0699, + "eval_samples_per_second": 10.895, + "eval_steps_per_second": 1.386, + "step": 65800 + }, + { + "epoch": 337.94871794871796, + "grad_norm": 34.09093475341797, + "learning_rate": 3.4140000000000003e-06, + "loss": 0.8401, + "step": 65900 + }, + { + "epoch": 337.94871794871796, + "eval_loss": 1.131771206855774, + "eval_runtime": 36.1819, + "eval_samples_per_second": 10.862, + "eval_steps_per_second": 1.382, + "step": 65900 + }, + { + "epoch": 338.46153846153845, + "grad_norm": 30.245824813842773, + "learning_rate": 3.404e-06, + "loss": 0.7949, + "step": 66000 + }, + { + "epoch": 338.46153846153845, + "eval_loss": 1.1323456764221191, + "eval_runtime": 36.2258, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 66000 + }, + { + "epoch": 338.97435897435895, + "grad_norm": 37.53680419921875, + "learning_rate": 3.394e-06, + "loss": 0.8774, + "step": 66100 + }, + { + "epoch": 338.97435897435895, + "eval_loss": 1.1351007223129272, + "eval_runtime": 36.2451, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 66100 + }, + { + "epoch": 339.4871794871795, + "grad_norm": 49.84280776977539, + "learning_rate": 3.384e-06, + "loss": 0.8372, + "step": 66200 + }, + { + "epoch": 339.4871794871795, + "eval_loss": 1.1467829942703247, + "eval_runtime": 36.0955, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 66200 + }, + { + "epoch": 340.0, + "grad_norm": 31.46631622314453, + "learning_rate": 3.3740000000000002e-06, + "loss": 0.8384, + "step": 66300 + }, + { + "epoch": 340.0, + "eval_loss": 1.1329807043075562, + "eval_runtime": 36.1181, + "eval_samples_per_second": 10.881, + "eval_steps_per_second": 1.384, + "step": 66300 + }, + { + "epoch": 340.5128205128205, + "grad_norm": 36.72136688232422, + "learning_rate": 3.364e-06, + "loss": 0.8709, + "step": 66400 + }, + { + "epoch": 340.5128205128205, + "eval_loss": 1.12633216381073, + "eval_runtime": 36.2045, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 66400 + }, + { + "epoch": 341.02564102564105, + "grad_norm": 48.11905288696289, + "learning_rate": 3.354e-06, + "loss": 0.8127, + "step": 66500 + }, + { + "epoch": 341.02564102564105, + "eval_loss": 1.145508050918579, + "eval_runtime": 36.2252, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 66500 + }, + { + "epoch": 341.53846153846155, + "grad_norm": 51.75871658325195, + "learning_rate": 3.344e-06, + "loss": 0.847, + "step": 66600 + }, + { + "epoch": 341.53846153846155, + "eval_loss": 1.1242154836654663, + "eval_runtime": 36.029, + "eval_samples_per_second": 10.908, + "eval_steps_per_second": 1.388, + "step": 66600 + }, + { + "epoch": 342.05128205128204, + "grad_norm": 51.20391845703125, + "learning_rate": 3.334e-06, + "loss": 0.806, + "step": 66700 + }, + { + "epoch": 342.05128205128204, + "eval_loss": 1.1406984329223633, + "eval_runtime": 36.2198, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 66700 + }, + { + "epoch": 342.56410256410254, + "grad_norm": 38.16461944580078, + "learning_rate": 3.324e-06, + "loss": 0.8356, + "step": 66800 + }, + { + "epoch": 342.56410256410254, + "eval_loss": 1.1333765983581543, + "eval_runtime": 36.0679, + "eval_samples_per_second": 10.896, + "eval_steps_per_second": 1.386, + "step": 66800 + }, + { + "epoch": 343.0769230769231, + "grad_norm": 27.25999641418457, + "learning_rate": 3.314e-06, + "loss": 0.8142, + "step": 66900 + }, + { + "epoch": 343.0769230769231, + "eval_loss": 1.1365360021591187, + "eval_runtime": 36.2, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 66900 + }, + { + "epoch": 343.5897435897436, + "grad_norm": 29.00643539428711, + "learning_rate": 3.3040000000000005e-06, + "loss": 0.819, + "step": 67000 + }, + { + "epoch": 343.5897435897436, + "eval_loss": 1.134656548500061, + "eval_runtime": 36.2992, + "eval_samples_per_second": 10.827, + "eval_steps_per_second": 1.377, + "step": 67000 + }, + { + "epoch": 344.1025641025641, + "grad_norm": 53.16897964477539, + "learning_rate": 3.2940000000000006e-06, + "loss": 0.83, + "step": 67100 + }, + { + "epoch": 344.1025641025641, + "eval_loss": 1.1237508058547974, + "eval_runtime": 36.2587, + "eval_samples_per_second": 10.839, + "eval_steps_per_second": 1.379, + "step": 67100 + }, + { + "epoch": 344.61538461538464, + "grad_norm": 15.668848037719727, + "learning_rate": 3.2840000000000007e-06, + "loss": 0.7977, + "step": 67200 + }, + { + "epoch": 344.61538461538464, + "eval_loss": 1.1372902393341064, + "eval_runtime": 36.2367, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 67200 + }, + { + "epoch": 345.12820512820514, + "grad_norm": 32.75185775756836, + "learning_rate": 3.2740000000000003e-06, + "loss": 0.8818, + "step": 67300 + }, + { + "epoch": 345.12820512820514, + "eval_loss": 1.133774995803833, + "eval_runtime": 36.1807, + "eval_samples_per_second": 10.862, + "eval_steps_per_second": 1.382, + "step": 67300 + }, + { + "epoch": 345.64102564102564, + "grad_norm": 29.76172637939453, + "learning_rate": 3.2640000000000004e-06, + "loss": 0.79, + "step": 67400 + }, + { + "epoch": 345.64102564102564, + "eval_loss": 1.1439337730407715, + "eval_runtime": 36.1486, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 67400 + }, + { + "epoch": 346.15384615384613, + "grad_norm": 35.10476303100586, + "learning_rate": 3.2540000000000005e-06, + "loss": 0.8502, + "step": 67500 + }, + { + "epoch": 346.15384615384613, + "eval_loss": 1.141124963760376, + "eval_runtime": 36.1119, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 67500 + }, + { + "epoch": 346.6666666666667, + "grad_norm": 42.05295944213867, + "learning_rate": 3.2440000000000006e-06, + "loss": 0.8235, + "step": 67600 + }, + { + "epoch": 346.6666666666667, + "eval_loss": 1.1455899477005005, + "eval_runtime": 36.0364, + "eval_samples_per_second": 10.906, + "eval_steps_per_second": 1.387, + "step": 67600 + }, + { + "epoch": 347.1794871794872, + "grad_norm": 29.23992347717285, + "learning_rate": 3.2340000000000003e-06, + "loss": 0.8475, + "step": 67700 + }, + { + "epoch": 347.1794871794872, + "eval_loss": 1.1290616989135742, + "eval_runtime": 35.9874, + "eval_samples_per_second": 10.92, + "eval_steps_per_second": 1.389, + "step": 67700 + }, + { + "epoch": 347.6923076923077, + "grad_norm": 27.160301208496094, + "learning_rate": 3.2240000000000004e-06, + "loss": 0.8455, + "step": 67800 + }, + { + "epoch": 347.6923076923077, + "eval_loss": 1.1164175271987915, + "eval_runtime": 36.3119, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 67800 + }, + { + "epoch": 348.20512820512823, + "grad_norm": 36.495513916015625, + "learning_rate": 3.2140000000000005e-06, + "loss": 0.8431, + "step": 67900 + }, + { + "epoch": 348.20512820512823, + "eval_loss": 1.1352448463439941, + "eval_runtime": 36.1878, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 67900 + }, + { + "epoch": 348.71794871794873, + "grad_norm": 42.4805793762207, + "learning_rate": 3.2040000000000006e-06, + "loss": 0.8132, + "step": 68000 + }, + { + "epoch": 348.71794871794873, + "eval_loss": 1.1500545740127563, + "eval_runtime": 36.3381, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 68000 + }, + { + "epoch": 349.2307692307692, + "grad_norm": 23.529930114746094, + "learning_rate": 3.1940000000000003e-06, + "loss": 0.8156, + "step": 68100 + }, + { + "epoch": 349.2307692307692, + "eval_loss": 1.1303083896636963, + "eval_runtime": 36.8554, + "eval_samples_per_second": 10.663, + "eval_steps_per_second": 1.357, + "step": 68100 + }, + { + "epoch": 349.7435897435897, + "grad_norm": 40.1572265625, + "learning_rate": 3.1840000000000003e-06, + "loss": 0.8297, + "step": 68200 + }, + { + "epoch": 349.7435897435897, + "eval_loss": 1.1410999298095703, + "eval_runtime": 36.1892, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 68200 + }, + { + "epoch": 350.2564102564103, + "grad_norm": 24.163389205932617, + "learning_rate": 3.1740000000000004e-06, + "loss": 0.8343, + "step": 68300 + }, + { + "epoch": 350.2564102564103, + "eval_loss": 1.131314992904663, + "eval_runtime": 36.137, + "eval_samples_per_second": 10.875, + "eval_steps_per_second": 1.384, + "step": 68300 + }, + { + "epoch": 350.7692307692308, + "grad_norm": 24.996288299560547, + "learning_rate": 3.1640000000000005e-06, + "loss": 0.8017, + "step": 68400 + }, + { + "epoch": 350.7692307692308, + "eval_loss": 1.148737907409668, + "eval_runtime": 36.0411, + "eval_samples_per_second": 10.904, + "eval_steps_per_second": 1.387, + "step": 68400 + }, + { + "epoch": 351.28205128205127, + "grad_norm": 19.23929786682129, + "learning_rate": 3.154e-06, + "loss": 0.8303, + "step": 68500 + }, + { + "epoch": 351.28205128205127, + "eval_loss": 1.1406868696212769, + "eval_runtime": 36.0645, + "eval_samples_per_second": 10.897, + "eval_steps_per_second": 1.386, + "step": 68500 + }, + { + "epoch": 351.79487179487177, + "grad_norm": 44.152137756347656, + "learning_rate": 3.1440000000000003e-06, + "loss": 0.8594, + "step": 68600 + }, + { + "epoch": 351.79487179487177, + "eval_loss": 1.1420936584472656, + "eval_runtime": 36.1482, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 68600 + }, + { + "epoch": 352.3076923076923, + "grad_norm": 29.038612365722656, + "learning_rate": 3.1340000000000004e-06, + "loss": 0.7655, + "step": 68700 + }, + { + "epoch": 352.3076923076923, + "eval_loss": 1.131518006324768, + "eval_runtime": 36.1493, + "eval_samples_per_second": 10.872, + "eval_steps_per_second": 1.383, + "step": 68700 + }, + { + "epoch": 352.8205128205128, + "grad_norm": 59.23828887939453, + "learning_rate": 3.1240000000000005e-06, + "loss": 0.8358, + "step": 68800 + }, + { + "epoch": 352.8205128205128, + "eval_loss": 1.1423251628875732, + "eval_runtime": 36.2319, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 68800 + }, + { + "epoch": 353.3333333333333, + "grad_norm": 113.09857940673828, + "learning_rate": 3.114e-06, + "loss": 0.8039, + "step": 68900 + }, + { + "epoch": 353.3333333333333, + "eval_loss": 1.1355007886886597, + "eval_runtime": 36.1727, + "eval_samples_per_second": 10.865, + "eval_steps_per_second": 1.382, + "step": 68900 + }, + { + "epoch": 353.84615384615387, + "grad_norm": 49.12174606323242, + "learning_rate": 3.1040000000000003e-06, + "loss": 0.8552, + "step": 69000 + }, + { + "epoch": 353.84615384615387, + "eval_loss": 1.141284465789795, + "eval_runtime": 36.0837, + "eval_samples_per_second": 10.891, + "eval_steps_per_second": 1.386, + "step": 69000 + }, + { + "epoch": 354.35897435897436, + "grad_norm": 26.018787384033203, + "learning_rate": 3.0940000000000004e-06, + "loss": 0.8186, + "step": 69100 + }, + { + "epoch": 354.35897435897436, + "eval_loss": 1.1342588663101196, + "eval_runtime": 36.0812, + "eval_samples_per_second": 10.892, + "eval_steps_per_second": 1.386, + "step": 69100 + }, + { + "epoch": 354.87179487179486, + "grad_norm": 40.32272720336914, + "learning_rate": 3.0840000000000005e-06, + "loss": 0.8227, + "step": 69200 + }, + { + "epoch": 354.87179487179486, + "eval_loss": 1.1433744430541992, + "eval_runtime": 36.2782, + "eval_samples_per_second": 10.833, + "eval_steps_per_second": 1.378, + "step": 69200 + }, + { + "epoch": 355.38461538461536, + "grad_norm": 40.22716522216797, + "learning_rate": 3.074e-06, + "loss": 0.8137, + "step": 69300 + }, + { + "epoch": 355.38461538461536, + "eval_loss": 1.1444528102874756, + "eval_runtime": 36.2701, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.379, + "step": 69300 + }, + { + "epoch": 355.8974358974359, + "grad_norm": 49.00700378417969, + "learning_rate": 3.0640000000000002e-06, + "loss": 0.8585, + "step": 69400 + }, + { + "epoch": 355.8974358974359, + "eval_loss": 1.140838623046875, + "eval_runtime": 36.1031, + "eval_samples_per_second": 10.885, + "eval_steps_per_second": 1.385, + "step": 69400 + }, + { + "epoch": 356.4102564102564, + "grad_norm": 24.257936477661133, + "learning_rate": 3.0540000000000003e-06, + "loss": 0.7991, + "step": 69500 + }, + { + "epoch": 356.4102564102564, + "eval_loss": 1.1431113481521606, + "eval_runtime": 36.2388, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 69500 + }, + { + "epoch": 356.9230769230769, + "grad_norm": 37.413692474365234, + "learning_rate": 3.0440000000000004e-06, + "loss": 0.8436, + "step": 69600 + }, + { + "epoch": 356.9230769230769, + "eval_loss": 1.1497875452041626, + "eval_runtime": 35.987, + "eval_samples_per_second": 10.921, + "eval_steps_per_second": 1.389, + "step": 69600 + }, + { + "epoch": 357.43589743589746, + "grad_norm": 33.238773345947266, + "learning_rate": 3.034e-06, + "loss": 0.8296, + "step": 69700 + }, + { + "epoch": 357.43589743589746, + "eval_loss": 1.1500591039657593, + "eval_runtime": 36.3101, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 69700 + }, + { + "epoch": 357.94871794871796, + "grad_norm": 24.451854705810547, + "learning_rate": 3.024e-06, + "loss": 0.828, + "step": 69800 + }, + { + "epoch": 357.94871794871796, + "eval_loss": 1.1296894550323486, + "eval_runtime": 36.4963, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 69800 + }, + { + "epoch": 358.46153846153845, + "grad_norm": 34.535491943359375, + "learning_rate": 3.0140000000000003e-06, + "loss": 0.8162, + "step": 69900 + }, + { + "epoch": 358.46153846153845, + "eval_loss": 1.1353371143341064, + "eval_runtime": 36.4286, + "eval_samples_per_second": 10.788, + "eval_steps_per_second": 1.373, + "step": 69900 + }, + { + "epoch": 358.97435897435895, + "grad_norm": 27.939172744750977, + "learning_rate": 3.0040000000000004e-06, + "loss": 0.8357, + "step": 70000 + }, + { + "epoch": 358.97435897435895, + "eval_loss": 1.1424373388290405, + "eval_runtime": 36.1009, + "eval_samples_per_second": 10.886, + "eval_steps_per_second": 1.385, + "step": 70000 + }, + { + "epoch": 359.4871794871795, + "grad_norm": 48.748897552490234, + "learning_rate": 2.994e-06, + "loss": 0.8322, + "step": 70100 + }, + { + "epoch": 359.4871794871795, + "eval_loss": 1.1328743696212769, + "eval_runtime": 36.135, + "eval_samples_per_second": 10.876, + "eval_steps_per_second": 1.384, + "step": 70100 + }, + { + "epoch": 360.0, + "grad_norm": 45.24883270263672, + "learning_rate": 2.984e-06, + "loss": 0.8142, + "step": 70200 + }, + { + "epoch": 360.0, + "eval_loss": 1.1296157836914062, + "eval_runtime": 36.1363, + "eval_samples_per_second": 10.875, + "eval_steps_per_second": 1.384, + "step": 70200 + }, + { + "epoch": 360.5128205128205, + "grad_norm": 30.030261993408203, + "learning_rate": 2.9740000000000002e-06, + "loss": 0.8231, + "step": 70300 + }, + { + "epoch": 360.5128205128205, + "eval_loss": 1.1310758590698242, + "eval_runtime": 36.0671, + "eval_samples_per_second": 10.896, + "eval_steps_per_second": 1.386, + "step": 70300 + }, + { + "epoch": 361.02564102564105, + "grad_norm": 55.29873275756836, + "learning_rate": 2.9640000000000003e-06, + "loss": 0.8219, + "step": 70400 + }, + { + "epoch": 361.02564102564105, + "eval_loss": 1.140838861465454, + "eval_runtime": 36.0393, + "eval_samples_per_second": 10.905, + "eval_steps_per_second": 1.387, + "step": 70400 + }, + { + "epoch": 361.53846153846155, + "grad_norm": 35.925048828125, + "learning_rate": 2.954e-06, + "loss": 0.7706, + "step": 70500 + }, + { + "epoch": 361.53846153846155, + "eval_loss": 1.1380531787872314, + "eval_runtime": 36.1313, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 70500 + }, + { + "epoch": 362.05128205128204, + "grad_norm": 70.1023941040039, + "learning_rate": 2.944e-06, + "loss": 0.8313, + "step": 70600 + }, + { + "epoch": 362.05128205128204, + "eval_loss": 1.1221985816955566, + "eval_runtime": 36.004, + "eval_samples_per_second": 10.915, + "eval_steps_per_second": 1.389, + "step": 70600 + }, + { + "epoch": 362.56410256410254, + "grad_norm": 58.735076904296875, + "learning_rate": 2.934e-06, + "loss": 0.8484, + "step": 70700 + }, + { + "epoch": 362.56410256410254, + "eval_loss": 1.1278297901153564, + "eval_runtime": 36.1907, + "eval_samples_per_second": 10.859, + "eval_steps_per_second": 1.382, + "step": 70700 + }, + { + "epoch": 363.0769230769231, + "grad_norm": 19.338993072509766, + "learning_rate": 2.9240000000000003e-06, + "loss": 0.82, + "step": 70800 + }, + { + "epoch": 363.0769230769231, + "eval_loss": 1.1378103494644165, + "eval_runtime": 36.1268, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 70800 + }, + { + "epoch": 363.5897435897436, + "grad_norm": 32.785926818847656, + "learning_rate": 2.914e-06, + "loss": 0.8498, + "step": 70900 + }, + { + "epoch": 363.5897435897436, + "eval_loss": 1.13233482837677, + "eval_runtime": 36.5698, + "eval_samples_per_second": 10.747, + "eval_steps_per_second": 1.367, + "step": 70900 + }, + { + "epoch": 364.1025641025641, + "grad_norm": 34.11168670654297, + "learning_rate": 2.904e-06, + "loss": 0.7935, + "step": 71000 + }, + { + "epoch": 364.1025641025641, + "eval_loss": 1.1397004127502441, + "eval_runtime": 36.2462, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 71000 + }, + { + "epoch": 364.61538461538464, + "grad_norm": 43.881141662597656, + "learning_rate": 2.894e-06, + "loss": 0.8399, + "step": 71100 + }, + { + "epoch": 364.61538461538464, + "eval_loss": 1.138384461402893, + "eval_runtime": 36.1269, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 71100 + }, + { + "epoch": 365.12820512820514, + "grad_norm": 33.7010498046875, + "learning_rate": 2.8840000000000003e-06, + "loss": 0.8359, + "step": 71200 + }, + { + "epoch": 365.12820512820514, + "eval_loss": 1.1436262130737305, + "eval_runtime": 36.1127, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 71200 + }, + { + "epoch": 365.64102564102564, + "grad_norm": 53.668922424316406, + "learning_rate": 2.874e-06, + "loss": 0.8081, + "step": 71300 + }, + { + "epoch": 365.64102564102564, + "eval_loss": 1.1465517282485962, + "eval_runtime": 36.1268, + "eval_samples_per_second": 10.878, + "eval_steps_per_second": 1.384, + "step": 71300 + }, + { + "epoch": 366.15384615384613, + "grad_norm": 48.464298248291016, + "learning_rate": 2.864e-06, + "loss": 0.8604, + "step": 71400 + }, + { + "epoch": 366.15384615384613, + "eval_loss": 1.14792799949646, + "eval_runtime": 36.3571, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 71400 + }, + { + "epoch": 366.6666666666667, + "grad_norm": 29.980213165283203, + "learning_rate": 2.854e-06, + "loss": 0.8283, + "step": 71500 + }, + { + "epoch": 366.6666666666667, + "eval_loss": 1.144291639328003, + "eval_runtime": 36.468, + "eval_samples_per_second": 10.777, + "eval_steps_per_second": 1.371, + "step": 71500 + }, + { + "epoch": 367.1794871794872, + "grad_norm": 55.52314758300781, + "learning_rate": 2.8440000000000002e-06, + "loss": 0.8078, + "step": 71600 + }, + { + "epoch": 367.1794871794872, + "eval_loss": 1.1428290605545044, + "eval_runtime": 36.4962, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 71600 + }, + { + "epoch": 367.6923076923077, + "grad_norm": 69.9741439819336, + "learning_rate": 2.834e-06, + "loss": 0.8323, + "step": 71700 + }, + { + "epoch": 367.6923076923077, + "eval_loss": 1.1439061164855957, + "eval_runtime": 36.337, + "eval_samples_per_second": 10.815, + "eval_steps_per_second": 1.376, + "step": 71700 + }, + { + "epoch": 368.20512820512823, + "grad_norm": 41.96189498901367, + "learning_rate": 2.824e-06, + "loss": 0.798, + "step": 71800 + }, + { + "epoch": 368.20512820512823, + "eval_loss": 1.1313480138778687, + "eval_runtime": 36.414, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 71800 + }, + { + "epoch": 368.71794871794873, + "grad_norm": 75.76729583740234, + "learning_rate": 2.8141000000000003e-06, + "loss": 0.8205, + "step": 71900 + }, + { + "epoch": 368.71794871794873, + "eval_loss": 1.1258835792541504, + "eval_runtime": 36.5426, + "eval_samples_per_second": 10.755, + "eval_steps_per_second": 1.368, + "step": 71900 + }, + { + "epoch": 369.2307692307692, + "grad_norm": 33.178993225097656, + "learning_rate": 2.8041e-06, + "loss": 0.8074, + "step": 72000 + }, + { + "epoch": 369.2307692307692, + "eval_loss": 1.122310996055603, + "eval_runtime": 36.7857, + "eval_samples_per_second": 10.683, + "eval_steps_per_second": 1.359, + "step": 72000 + }, + { + "epoch": 369.7435897435897, + "grad_norm": 23.60536003112793, + "learning_rate": 2.7941e-06, + "loss": 0.8444, + "step": 72100 + }, + { + "epoch": 369.7435897435897, + "eval_loss": 1.1236257553100586, + "eval_runtime": 36.4632, + "eval_samples_per_second": 10.778, + "eval_steps_per_second": 1.371, + "step": 72100 + }, + { + "epoch": 370.2564102564103, + "grad_norm": 25.935991287231445, + "learning_rate": 2.7841e-06, + "loss": 0.7812, + "step": 72200 + }, + { + "epoch": 370.2564102564103, + "eval_loss": 1.12736976146698, + "eval_runtime": 36.5811, + "eval_samples_per_second": 10.743, + "eval_steps_per_second": 1.367, + "step": 72200 + }, + { + "epoch": 370.7692307692308, + "grad_norm": 43.20096969604492, + "learning_rate": 2.7741000000000003e-06, + "loss": 0.7978, + "step": 72300 + }, + { + "epoch": 370.7692307692308, + "eval_loss": 1.1197290420532227, + "eval_runtime": 36.3035, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 72300 + }, + { + "epoch": 371.28205128205127, + "grad_norm": 32.30807876586914, + "learning_rate": 2.7641e-06, + "loss": 0.8523, + "step": 72400 + }, + { + "epoch": 371.28205128205127, + "eval_loss": 1.1308066844940186, + "eval_runtime": 36.0938, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 72400 + }, + { + "epoch": 371.79487179487177, + "grad_norm": 60.18589782714844, + "learning_rate": 2.7541e-06, + "loss": 0.8296, + "step": 72500 + }, + { + "epoch": 371.79487179487177, + "eval_loss": 1.1326649188995361, + "eval_runtime": 36.3116, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 72500 + }, + { + "epoch": 372.3076923076923, + "grad_norm": 47.34400939941406, + "learning_rate": 2.7441e-06, + "loss": 0.8178, + "step": 72600 + }, + { + "epoch": 372.3076923076923, + "eval_loss": 1.1379941701889038, + "eval_runtime": 36.1257, + "eval_samples_per_second": 10.879, + "eval_steps_per_second": 1.384, + "step": 72600 + }, + { + "epoch": 372.8205128205128, + "grad_norm": 28.95159149169922, + "learning_rate": 2.7341000000000002e-06, + "loss": 0.8049, + "step": 72700 + }, + { + "epoch": 372.8205128205128, + "eval_loss": 1.1204009056091309, + "eval_runtime": 36.0777, + "eval_samples_per_second": 10.893, + "eval_steps_per_second": 1.386, + "step": 72700 + }, + { + "epoch": 373.3333333333333, + "grad_norm": 50.76661682128906, + "learning_rate": 2.7241e-06, + "loss": 0.8488, + "step": 72800 + }, + { + "epoch": 373.3333333333333, + "eval_loss": 1.1152757406234741, + "eval_runtime": 36.1965, + "eval_samples_per_second": 10.857, + "eval_steps_per_second": 1.381, + "step": 72800 + }, + { + "epoch": 373.84615384615387, + "grad_norm": 26.5462589263916, + "learning_rate": 2.7141e-06, + "loss": 0.802, + "step": 72900 + }, + { + "epoch": 373.84615384615387, + "eval_loss": 1.1345009803771973, + "eval_runtime": 36.1161, + "eval_samples_per_second": 10.882, + "eval_steps_per_second": 1.384, + "step": 72900 + }, + { + "epoch": 374.35897435897436, + "grad_norm": 30.196237564086914, + "learning_rate": 2.7041e-06, + "loss": 0.8061, + "step": 73000 + }, + { + "epoch": 374.35897435897436, + "eval_loss": 1.1305344104766846, + "eval_runtime": 36.5074, + "eval_samples_per_second": 10.765, + "eval_steps_per_second": 1.37, + "step": 73000 + }, + { + "epoch": 374.87179487179486, + "grad_norm": 42.895442962646484, + "learning_rate": 2.6941e-06, + "loss": 0.8368, + "step": 73100 + }, + { + "epoch": 374.87179487179486, + "eval_loss": 1.1243641376495361, + "eval_runtime": 36.2221, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 73100 + }, + { + "epoch": 375.38461538461536, + "grad_norm": 63.908836364746094, + "learning_rate": 2.6841e-06, + "loss": 0.7962, + "step": 73200 + }, + { + "epoch": 375.38461538461536, + "eval_loss": 1.1395690441131592, + "eval_runtime": 36.1915, + "eval_samples_per_second": 10.859, + "eval_steps_per_second": 1.382, + "step": 73200 + }, + { + "epoch": 375.8974358974359, + "grad_norm": 36.449241638183594, + "learning_rate": 2.6741e-06, + "loss": 0.7806, + "step": 73300 + }, + { + "epoch": 375.8974358974359, + "eval_loss": 1.131882905960083, + "eval_runtime": 36.2045, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 73300 + }, + { + "epoch": 376.4102564102564, + "grad_norm": 52.753299713134766, + "learning_rate": 2.6641e-06, + "loss": 0.8499, + "step": 73400 + }, + { + "epoch": 376.4102564102564, + "eval_loss": 1.133449912071228, + "eval_runtime": 36.2498, + "eval_samples_per_second": 10.841, + "eval_steps_per_second": 1.379, + "step": 73400 + }, + { + "epoch": 376.9230769230769, + "grad_norm": 51.7213020324707, + "learning_rate": 2.6541e-06, + "loss": 0.8019, + "step": 73500 + }, + { + "epoch": 376.9230769230769, + "eval_loss": 1.1361366510391235, + "eval_runtime": 36.242, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 73500 + }, + { + "epoch": 377.43589743589746, + "grad_norm": 25.267913818359375, + "learning_rate": 2.6441e-06, + "loss": 0.8363, + "step": 73600 + }, + { + "epoch": 377.43589743589746, + "eval_loss": 1.1475539207458496, + "eval_runtime": 36.2237, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 73600 + }, + { + "epoch": 377.94871794871796, + "grad_norm": 37.4945182800293, + "learning_rate": 2.6341e-06, + "loss": 0.7926, + "step": 73700 + }, + { + "epoch": 377.94871794871796, + "eval_loss": 1.1402689218521118, + "eval_runtime": 36.1073, + "eval_samples_per_second": 10.884, + "eval_steps_per_second": 1.385, + "step": 73700 + }, + { + "epoch": 378.46153846153845, + "grad_norm": 39.47886657714844, + "learning_rate": 2.6241e-06, + "loss": 0.7776, + "step": 73800 + }, + { + "epoch": 378.46153846153845, + "eval_loss": 1.1334457397460938, + "eval_runtime": 36.1895, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 73800 + }, + { + "epoch": 378.97435897435895, + "grad_norm": 93.54617309570312, + "learning_rate": 2.6142e-06, + "loss": 0.8509, + "step": 73900 + }, + { + "epoch": 378.97435897435895, + "eval_loss": 1.130581259727478, + "eval_runtime": 36.1763, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 73900 + }, + { + "epoch": 379.4871794871795, + "grad_norm": 23.877254486083984, + "learning_rate": 2.6042e-06, + "loss": 0.8302, + "step": 74000 + }, + { + "epoch": 379.4871794871795, + "eval_loss": 1.1330444812774658, + "eval_runtime": 36.452, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 74000 + }, + { + "epoch": 380.0, + "grad_norm": 40.691104888916016, + "learning_rate": 2.5942e-06, + "loss": 0.797, + "step": 74100 + }, + { + "epoch": 380.0, + "eval_loss": 1.132405400276184, + "eval_runtime": 36.0309, + "eval_samples_per_second": 10.907, + "eval_steps_per_second": 1.388, + "step": 74100 + }, + { + "epoch": 380.5128205128205, + "grad_norm": 46.088722229003906, + "learning_rate": 2.5842e-06, + "loss": 0.8083, + "step": 74200 + }, + { + "epoch": 380.5128205128205, + "eval_loss": 1.1357730627059937, + "eval_runtime": 35.9925, + "eval_samples_per_second": 10.919, + "eval_steps_per_second": 1.389, + "step": 74200 + }, + { + "epoch": 381.02564102564105, + "grad_norm": 28.777494430541992, + "learning_rate": 2.5742e-06, + "loss": 0.8177, + "step": 74300 + }, + { + "epoch": 381.02564102564105, + "eval_loss": 1.1488385200500488, + "eval_runtime": 35.8263, + "eval_samples_per_second": 10.97, + "eval_steps_per_second": 1.396, + "step": 74300 + }, + { + "epoch": 381.53846153846155, + "grad_norm": 39.972625732421875, + "learning_rate": 2.5642e-06, + "loss": 0.8237, + "step": 74400 + }, + { + "epoch": 381.53846153846155, + "eval_loss": 1.132332444190979, + "eval_runtime": 36.0918, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.385, + "step": 74400 + }, + { + "epoch": 382.05128205128204, + "grad_norm": 24.858991622924805, + "learning_rate": 2.5542e-06, + "loss": 0.8343, + "step": 74500 + }, + { + "epoch": 382.05128205128204, + "eval_loss": 1.1276569366455078, + "eval_runtime": 35.9622, + "eval_samples_per_second": 10.928, + "eval_steps_per_second": 1.39, + "step": 74500 + }, + { + "epoch": 382.56410256410254, + "grad_norm": 120.7697525024414, + "learning_rate": 2.5442e-06, + "loss": 0.7879, + "step": 74600 + }, + { + "epoch": 382.56410256410254, + "eval_loss": 1.13145112991333, + "eval_runtime": 35.8935, + "eval_samples_per_second": 10.949, + "eval_steps_per_second": 1.393, + "step": 74600 + }, + { + "epoch": 383.0769230769231, + "grad_norm": 27.130176544189453, + "learning_rate": 2.5342e-06, + "loss": 0.8334, + "step": 74700 + }, + { + "epoch": 383.0769230769231, + "eval_loss": 1.1278948783874512, + "eval_runtime": 35.9175, + "eval_samples_per_second": 10.942, + "eval_steps_per_second": 1.392, + "step": 74700 + }, + { + "epoch": 383.5897435897436, + "grad_norm": 20.454666137695312, + "learning_rate": 2.5242e-06, + "loss": 0.8002, + "step": 74800 + }, + { + "epoch": 383.5897435897436, + "eval_loss": 1.1529312133789062, + "eval_runtime": 36.0851, + "eval_samples_per_second": 10.891, + "eval_steps_per_second": 1.386, + "step": 74800 + }, + { + "epoch": 384.1025641025641, + "grad_norm": 24.886058807373047, + "learning_rate": 2.5142e-06, + "loss": 0.8623, + "step": 74900 + }, + { + "epoch": 384.1025641025641, + "eval_loss": 1.1320093870162964, + "eval_runtime": 35.9454, + "eval_samples_per_second": 10.933, + "eval_steps_per_second": 1.391, + "step": 74900 + }, + { + "epoch": 384.61538461538464, + "grad_norm": 23.75406265258789, + "learning_rate": 2.5042e-06, + "loss": 0.8038, + "step": 75000 + }, + { + "epoch": 384.61538461538464, + "eval_loss": 1.1403261423110962, + "eval_runtime": 36.0943, + "eval_samples_per_second": 10.888, + "eval_steps_per_second": 1.385, + "step": 75000 + }, + { + "epoch": 385.12820512820514, + "grad_norm": 51.40729904174805, + "learning_rate": 2.4942e-06, + "loss": 0.8049, + "step": 75100 + }, + { + "epoch": 385.12820512820514, + "eval_loss": 1.1415657997131348, + "eval_runtime": 35.9402, + "eval_samples_per_second": 10.935, + "eval_steps_per_second": 1.391, + "step": 75100 + }, + { + "epoch": 385.64102564102564, + "grad_norm": 39.571861267089844, + "learning_rate": 2.4842000000000003e-06, + "loss": 0.7928, + "step": 75200 + }, + { + "epoch": 385.64102564102564, + "eval_loss": 1.1306287050247192, + "eval_runtime": 35.9067, + "eval_samples_per_second": 10.945, + "eval_steps_per_second": 1.392, + "step": 75200 + }, + { + "epoch": 386.15384615384613, + "grad_norm": 35.23868942260742, + "learning_rate": 2.4742000000000004e-06, + "loss": 0.8258, + "step": 75300 + }, + { + "epoch": 386.15384615384613, + "eval_loss": 1.1376850605010986, + "eval_runtime": 35.9784, + "eval_samples_per_second": 10.923, + "eval_steps_per_second": 1.39, + "step": 75300 + }, + { + "epoch": 386.6666666666667, + "grad_norm": 27.087814331054688, + "learning_rate": 2.4642e-06, + "loss": 0.7465, + "step": 75400 + }, + { + "epoch": 386.6666666666667, + "eval_loss": 1.126760721206665, + "eval_runtime": 35.9663, + "eval_samples_per_second": 10.927, + "eval_steps_per_second": 1.39, + "step": 75400 + }, + { + "epoch": 387.1794871794872, + "grad_norm": 33.293907165527344, + "learning_rate": 2.4542e-06, + "loss": 0.8666, + "step": 75500 + }, + { + "epoch": 387.1794871794872, + "eval_loss": 1.1381648778915405, + "eval_runtime": 35.8929, + "eval_samples_per_second": 10.949, + "eval_steps_per_second": 1.393, + "step": 75500 + }, + { + "epoch": 387.6923076923077, + "grad_norm": 34.061317443847656, + "learning_rate": 2.4442000000000002e-06, + "loss": 0.8187, + "step": 75600 + }, + { + "epoch": 387.6923076923077, + "eval_loss": 1.1330041885375977, + "eval_runtime": 35.9525, + "eval_samples_per_second": 10.931, + "eval_steps_per_second": 1.391, + "step": 75600 + }, + { + "epoch": 388.20512820512823, + "grad_norm": 31.500200271606445, + "learning_rate": 2.4342000000000003e-06, + "loss": 0.7931, + "step": 75700 + }, + { + "epoch": 388.20512820512823, + "eval_loss": 1.1351202726364136, + "eval_runtime": 36.0554, + "eval_samples_per_second": 10.9, + "eval_steps_per_second": 1.387, + "step": 75700 + }, + { + "epoch": 388.71794871794873, + "grad_norm": 20.645219802856445, + "learning_rate": 2.4242e-06, + "loss": 0.8143, + "step": 75800 + }, + { + "epoch": 388.71794871794873, + "eval_loss": 1.1449867486953735, + "eval_runtime": 36.1867, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 75800 + }, + { + "epoch": 389.2307692307692, + "grad_norm": 50.396514892578125, + "learning_rate": 2.4142e-06, + "loss": 0.8262, + "step": 75900 + }, + { + "epoch": 389.2307692307692, + "eval_loss": 1.1388434171676636, + "eval_runtime": 35.9932, + "eval_samples_per_second": 10.919, + "eval_steps_per_second": 1.389, + "step": 75900 + }, + { + "epoch": 389.7435897435897, + "grad_norm": 28.243499755859375, + "learning_rate": 2.4043000000000004e-06, + "loss": 0.818, + "step": 76000 + }, + { + "epoch": 389.7435897435897, + "eval_loss": 1.1377149820327759, + "eval_runtime": 36.1995, + "eval_samples_per_second": 10.857, + "eval_steps_per_second": 1.381, + "step": 76000 + }, + { + "epoch": 390.2564102564103, + "grad_norm": 31.120445251464844, + "learning_rate": 2.3943e-06, + "loss": 0.7816, + "step": 76100 + }, + { + "epoch": 390.2564102564103, + "eval_loss": 1.1347707509994507, + "eval_runtime": 36.7154, + "eval_samples_per_second": 10.704, + "eval_steps_per_second": 1.362, + "step": 76100 + }, + { + "epoch": 390.7692307692308, + "grad_norm": 25.865686416625977, + "learning_rate": 2.3843e-06, + "loss": 0.8171, + "step": 76200 + }, + { + "epoch": 390.7692307692308, + "eval_loss": 1.1412353515625, + "eval_runtime": 36.4116, + "eval_samples_per_second": 10.793, + "eval_steps_per_second": 1.373, + "step": 76200 + }, + { + "epoch": 391.28205128205127, + "grad_norm": 28.151960372924805, + "learning_rate": 2.3743000000000003e-06, + "loss": 0.8331, + "step": 76300 + }, + { + "epoch": 391.28205128205127, + "eval_loss": 1.1397844552993774, + "eval_runtime": 36.454, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 76300 + }, + { + "epoch": 391.79487179487177, + "grad_norm": 38.25983428955078, + "learning_rate": 2.3643000000000004e-06, + "loss": 0.8108, + "step": 76400 + }, + { + "epoch": 391.79487179487177, + "eval_loss": 1.137779712677002, + "eval_runtime": 36.4316, + "eval_samples_per_second": 10.787, + "eval_steps_per_second": 1.372, + "step": 76400 + }, + { + "epoch": 392.3076923076923, + "grad_norm": 29.177074432373047, + "learning_rate": 2.3543e-06, + "loss": 0.8086, + "step": 76500 + }, + { + "epoch": 392.3076923076923, + "eval_loss": 1.131404995918274, + "eval_runtime": 36.5348, + "eval_samples_per_second": 10.757, + "eval_steps_per_second": 1.369, + "step": 76500 + }, + { + "epoch": 392.8205128205128, + "grad_norm": 14.619794845581055, + "learning_rate": 2.3443e-06, + "loss": 0.8196, + "step": 76600 + }, + { + "epoch": 392.8205128205128, + "eval_loss": 1.1232109069824219, + "eval_runtime": 35.9601, + "eval_samples_per_second": 10.929, + "eval_steps_per_second": 1.39, + "step": 76600 + }, + { + "epoch": 393.3333333333333, + "grad_norm": 46.24609375, + "learning_rate": 2.3343000000000002e-06, + "loss": 0.8066, + "step": 76700 + }, + { + "epoch": 393.3333333333333, + "eval_loss": 1.1361371278762817, + "eval_runtime": 36.1158, + "eval_samples_per_second": 10.882, + "eval_steps_per_second": 1.384, + "step": 76700 + }, + { + "epoch": 393.84615384615387, + "grad_norm": 50.48100662231445, + "learning_rate": 2.3243000000000003e-06, + "loss": 0.7843, + "step": 76800 + }, + { + "epoch": 393.84615384615387, + "eval_loss": 1.138421893119812, + "eval_runtime": 36.0517, + "eval_samples_per_second": 10.901, + "eval_steps_per_second": 1.387, + "step": 76800 + }, + { + "epoch": 394.35897435897436, + "grad_norm": 20.45284080505371, + "learning_rate": 2.3143e-06, + "loss": 0.84, + "step": 76900 + }, + { + "epoch": 394.35897435897436, + "eval_loss": 1.1426371335983276, + "eval_runtime": 35.9851, + "eval_samples_per_second": 10.921, + "eval_steps_per_second": 1.389, + "step": 76900 + }, + { + "epoch": 394.87179487179486, + "grad_norm": 38.49494934082031, + "learning_rate": 2.3043e-06, + "loss": 0.7862, + "step": 77000 + }, + { + "epoch": 394.87179487179486, + "eval_loss": 1.1350079774856567, + "eval_runtime": 36.0134, + "eval_samples_per_second": 10.913, + "eval_steps_per_second": 1.388, + "step": 77000 + }, + { + "epoch": 395.38461538461536, + "grad_norm": 46.25963592529297, + "learning_rate": 2.2943e-06, + "loss": 0.8173, + "step": 77100 + }, + { + "epoch": 395.38461538461536, + "eval_loss": 1.13560152053833, + "eval_runtime": 36.5982, + "eval_samples_per_second": 10.738, + "eval_steps_per_second": 1.366, + "step": 77100 + }, + { + "epoch": 395.8974358974359, + "grad_norm": 32.53540802001953, + "learning_rate": 2.2843000000000003e-06, + "loss": 0.7886, + "step": 77200 + }, + { + "epoch": 395.8974358974359, + "eval_loss": 1.1367639303207397, + "eval_runtime": 36.7387, + "eval_samples_per_second": 10.697, + "eval_steps_per_second": 1.361, + "step": 77200 + }, + { + "epoch": 396.4102564102564, + "grad_norm": 37.32951354980469, + "learning_rate": 2.2743e-06, + "loss": 0.8223, + "step": 77300 + }, + { + "epoch": 396.4102564102564, + "eval_loss": 1.1398205757141113, + "eval_runtime": 36.887, + "eval_samples_per_second": 10.654, + "eval_steps_per_second": 1.355, + "step": 77300 + }, + { + "epoch": 396.9230769230769, + "grad_norm": 28.554519653320312, + "learning_rate": 2.2643e-06, + "loss": 0.807, + "step": 77400 + }, + { + "epoch": 396.9230769230769, + "eval_loss": 1.1388484239578247, + "eval_runtime": 36.9574, + "eval_samples_per_second": 10.634, + "eval_steps_per_second": 1.353, + "step": 77400 + }, + { + "epoch": 397.43589743589746, + "grad_norm": 30.867055892944336, + "learning_rate": 2.2543e-06, + "loss": 0.8237, + "step": 77500 + }, + { + "epoch": 397.43589743589746, + "eval_loss": 1.1461577415466309, + "eval_runtime": 36.9834, + "eval_samples_per_second": 10.626, + "eval_steps_per_second": 1.352, + "step": 77500 + }, + { + "epoch": 397.94871794871796, + "grad_norm": 48.92927169799805, + "learning_rate": 2.2443000000000003e-06, + "loss": 0.7726, + "step": 77600 + }, + { + "epoch": 397.94871794871796, + "eval_loss": 1.1421867609024048, + "eval_runtime": 36.1926, + "eval_samples_per_second": 10.859, + "eval_steps_per_second": 1.381, + "step": 77600 + }, + { + "epoch": 398.46153846153845, + "grad_norm": 23.895078659057617, + "learning_rate": 2.2343e-06, + "loss": 0.8147, + "step": 77700 + }, + { + "epoch": 398.46153846153845, + "eval_loss": 1.1238188743591309, + "eval_runtime": 36.153, + "eval_samples_per_second": 10.87, + "eval_steps_per_second": 1.383, + "step": 77700 + }, + { + "epoch": 398.97435897435895, + "grad_norm": 44.79777526855469, + "learning_rate": 2.2243e-06, + "loss": 0.8078, + "step": 77800 + }, + { + "epoch": 398.97435897435895, + "eval_loss": 1.135284185409546, + "eval_runtime": 36.2147, + "eval_samples_per_second": 10.852, + "eval_steps_per_second": 1.381, + "step": 77800 + }, + { + "epoch": 399.4871794871795, + "grad_norm": 21.840147018432617, + "learning_rate": 2.2143e-06, + "loss": 0.8137, + "step": 77900 + }, + { + "epoch": 399.4871794871795, + "eval_loss": 1.1369855403900146, + "eval_runtime": 36.5906, + "eval_samples_per_second": 10.74, + "eval_steps_per_second": 1.366, + "step": 77900 + }, + { + "epoch": 400.0, + "grad_norm": 20.729894638061523, + "learning_rate": 2.2043000000000002e-06, + "loss": 0.7894, + "step": 78000 + }, + { + "epoch": 400.0, + "eval_loss": 1.1269853115081787, + "eval_runtime": 36.5406, + "eval_samples_per_second": 10.755, + "eval_steps_per_second": 1.368, + "step": 78000 + }, + { + "epoch": 400.5128205128205, + "grad_norm": 36.30642318725586, + "learning_rate": 2.1944e-06, + "loss": 0.8045, + "step": 78100 + }, + { + "epoch": 400.5128205128205, + "eval_loss": 1.1333670616149902, + "eval_runtime": 36.0969, + "eval_samples_per_second": 10.887, + "eval_steps_per_second": 1.385, + "step": 78100 + }, + { + "epoch": 401.02564102564105, + "grad_norm": 29.74964714050293, + "learning_rate": 2.1844e-06, + "loss": 0.7984, + "step": 78200 + }, + { + "epoch": 401.02564102564105, + "eval_loss": 1.1333872079849243, + "eval_runtime": 36.5113, + "eval_samples_per_second": 10.764, + "eval_steps_per_second": 1.369, + "step": 78200 + }, + { + "epoch": 401.53846153846155, + "grad_norm": 31.322322845458984, + "learning_rate": 2.1744000000000003e-06, + "loss": 0.7891, + "step": 78300 + }, + { + "epoch": 401.53846153846155, + "eval_loss": 1.137516975402832, + "eval_runtime": 35.9843, + "eval_samples_per_second": 10.921, + "eval_steps_per_second": 1.389, + "step": 78300 + }, + { + "epoch": 402.05128205128204, + "grad_norm": 45.945350646972656, + "learning_rate": 2.1644e-06, + "loss": 0.8127, + "step": 78400 + }, + { + "epoch": 402.05128205128204, + "eval_loss": 1.1432017087936401, + "eval_runtime": 36.0904, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.385, + "step": 78400 + }, + { + "epoch": 402.56410256410254, + "grad_norm": 18.730331420898438, + "learning_rate": 2.1544e-06, + "loss": 0.7953, + "step": 78500 + }, + { + "epoch": 402.56410256410254, + "eval_loss": 1.1325803995132446, + "eval_runtime": 36.3616, + "eval_samples_per_second": 10.808, + "eval_steps_per_second": 1.375, + "step": 78500 + }, + { + "epoch": 403.0769230769231, + "grad_norm": 38.38829803466797, + "learning_rate": 2.1444e-06, + "loss": 0.8242, + "step": 78600 + }, + { + "epoch": 403.0769230769231, + "eval_loss": 1.1348460912704468, + "eval_runtime": 36.7911, + "eval_samples_per_second": 10.682, + "eval_steps_per_second": 1.359, + "step": 78600 + }, + { + "epoch": 403.5897435897436, + "grad_norm": 31.04094123840332, + "learning_rate": 2.1344000000000003e-06, + "loss": 0.7804, + "step": 78700 + }, + { + "epoch": 403.5897435897436, + "eval_loss": 1.1315021514892578, + "eval_runtime": 36.6467, + "eval_samples_per_second": 10.724, + "eval_steps_per_second": 1.364, + "step": 78700 + }, + { + "epoch": 404.1025641025641, + "grad_norm": 36.32188415527344, + "learning_rate": 2.1245e-06, + "loss": 0.8275, + "step": 78800 + }, + { + "epoch": 404.1025641025641, + "eval_loss": 1.1283265352249146, + "eval_runtime": 36.8323, + "eval_samples_per_second": 10.67, + "eval_steps_per_second": 1.358, + "step": 78800 + }, + { + "epoch": 404.61538461538464, + "grad_norm": 42.415252685546875, + "learning_rate": 2.1145000000000003e-06, + "loss": 0.7933, + "step": 78900 + }, + { + "epoch": 404.61538461538464, + "eval_loss": 1.1314613819122314, + "eval_runtime": 37.0337, + "eval_samples_per_second": 10.612, + "eval_steps_per_second": 1.35, + "step": 78900 + }, + { + "epoch": 405.12820512820514, + "grad_norm": 32.10728073120117, + "learning_rate": 2.1045000000000003e-06, + "loss": 0.8274, + "step": 79000 + }, + { + "epoch": 405.12820512820514, + "eval_loss": 1.118557095527649, + "eval_runtime": 36.6721, + "eval_samples_per_second": 10.717, + "eval_steps_per_second": 1.363, + "step": 79000 + }, + { + "epoch": 405.64102564102564, + "grad_norm": 32.93065643310547, + "learning_rate": 2.0945e-06, + "loss": 0.7974, + "step": 79100 + }, + { + "epoch": 405.64102564102564, + "eval_loss": 1.1386582851409912, + "eval_runtime": 36.0858, + "eval_samples_per_second": 10.891, + "eval_steps_per_second": 1.386, + "step": 79100 + }, + { + "epoch": 406.15384615384613, + "grad_norm": 93.33120727539062, + "learning_rate": 2.0845e-06, + "loss": 0.8241, + "step": 79200 + }, + { + "epoch": 406.15384615384613, + "eval_loss": 1.136437177658081, + "eval_runtime": 36.3859, + "eval_samples_per_second": 10.801, + "eval_steps_per_second": 1.374, + "step": 79200 + }, + { + "epoch": 406.6666666666667, + "grad_norm": 14.437223434448242, + "learning_rate": 2.0745000000000002e-06, + "loss": 0.8091, + "step": 79300 + }, + { + "epoch": 406.6666666666667, + "eval_loss": 1.1350340843200684, + "eval_runtime": 36.52, + "eval_samples_per_second": 10.761, + "eval_steps_per_second": 1.369, + "step": 79300 + }, + { + "epoch": 407.1794871794872, + "grad_norm": 23.937789916992188, + "learning_rate": 2.0645000000000003e-06, + "loss": 0.7806, + "step": 79400 + }, + { + "epoch": 407.1794871794872, + "eval_loss": 1.1395503282546997, + "eval_runtime": 36.5054, + "eval_samples_per_second": 10.766, + "eval_steps_per_second": 1.37, + "step": 79400 + }, + { + "epoch": 407.6923076923077, + "grad_norm": 51.89796829223633, + "learning_rate": 2.0545e-06, + "loss": 0.8034, + "step": 79500 + }, + { + "epoch": 407.6923076923077, + "eval_loss": 1.1486047506332397, + "eval_runtime": 36.4513, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 79500 + }, + { + "epoch": 408.20512820512823, + "grad_norm": 62.49965286254883, + "learning_rate": 2.0445e-06, + "loss": 0.8091, + "step": 79600 + }, + { + "epoch": 408.20512820512823, + "eval_loss": 1.1327706575393677, + "eval_runtime": 36.3997, + "eval_samples_per_second": 10.797, + "eval_steps_per_second": 1.374, + "step": 79600 + }, + { + "epoch": 408.71794871794873, + "grad_norm": 36.0612678527832, + "learning_rate": 2.0345e-06, + "loss": 0.8373, + "step": 79700 + }, + { + "epoch": 408.71794871794873, + "eval_loss": 1.133113980293274, + "eval_runtime": 36.403, + "eval_samples_per_second": 10.796, + "eval_steps_per_second": 1.374, + "step": 79700 + }, + { + "epoch": 409.2307692307692, + "grad_norm": 28.11672019958496, + "learning_rate": 2.0245000000000003e-06, + "loss": 0.7991, + "step": 79800 + }, + { + "epoch": 409.2307692307692, + "eval_loss": 1.130694031715393, + "eval_runtime": 36.2037, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 79800 + }, + { + "epoch": 409.7435897435897, + "grad_norm": 24.10468101501465, + "learning_rate": 2.0145e-06, + "loss": 0.7952, + "step": 79900 + }, + { + "epoch": 409.7435897435897, + "eval_loss": 1.1308884620666504, + "eval_runtime": 36.49, + "eval_samples_per_second": 10.77, + "eval_steps_per_second": 1.37, + "step": 79900 + }, + { + "epoch": 410.2564102564103, + "grad_norm": 59.5438232421875, + "learning_rate": 2.0045e-06, + "loss": 0.7788, + "step": 80000 + }, + { + "epoch": 410.2564102564103, + "eval_loss": 1.1568071842193604, + "eval_runtime": 36.2602, + "eval_samples_per_second": 10.838, + "eval_steps_per_second": 1.379, + "step": 80000 + }, + { + "epoch": 410.7692307692308, + "grad_norm": 21.68340301513672, + "learning_rate": 1.9945e-06, + "loss": 0.8156, + "step": 80100 + }, + { + "epoch": 410.7692307692308, + "eval_loss": 1.1372692584991455, + "eval_runtime": 36.469, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 80100 + }, + { + "epoch": 411.28205128205127, + "grad_norm": 44.26573944091797, + "learning_rate": 1.9845000000000002e-06, + "loss": 0.8135, + "step": 80200 + }, + { + "epoch": 411.28205128205127, + "eval_loss": 1.1349170207977295, + "eval_runtime": 36.3583, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 80200 + }, + { + "epoch": 411.79487179487177, + "grad_norm": 35.368370056152344, + "learning_rate": 1.9745e-06, + "loss": 0.7686, + "step": 80300 + }, + { + "epoch": 411.79487179487177, + "eval_loss": 1.1509037017822266, + "eval_runtime": 36.3039, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 80300 + }, + { + "epoch": 412.3076923076923, + "grad_norm": 26.248321533203125, + "learning_rate": 1.9645000000000004e-06, + "loss": 0.8213, + "step": 80400 + }, + { + "epoch": 412.3076923076923, + "eval_loss": 1.1445726156234741, + "eval_runtime": 36.2998, + "eval_samples_per_second": 10.827, + "eval_steps_per_second": 1.377, + "step": 80400 + }, + { + "epoch": 412.8205128205128, + "grad_norm": 43.753517150878906, + "learning_rate": 1.9545e-06, + "loss": 0.7912, + "step": 80500 + }, + { + "epoch": 412.8205128205128, + "eval_loss": 1.1356618404388428, + "eval_runtime": 36.2017, + "eval_samples_per_second": 10.856, + "eval_steps_per_second": 1.381, + "step": 80500 + }, + { + "epoch": 413.3333333333333, + "grad_norm": 46.324249267578125, + "learning_rate": 1.9445e-06, + "loss": 0.7898, + "step": 80600 + }, + { + "epoch": 413.3333333333333, + "eval_loss": 1.1311746835708618, + "eval_runtime": 36.0672, + "eval_samples_per_second": 10.896, + "eval_steps_per_second": 1.386, + "step": 80600 + }, + { + "epoch": 413.84615384615387, + "grad_norm": 37.151611328125, + "learning_rate": 1.9345000000000003e-06, + "loss": 0.8159, + "step": 80700 + }, + { + "epoch": 413.84615384615387, + "eval_loss": 1.1330280303955078, + "eval_runtime": 36.0902, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.385, + "step": 80700 + }, + { + "epoch": 414.35897435897436, + "grad_norm": 18.428199768066406, + "learning_rate": 1.9245000000000004e-06, + "loss": 0.8127, + "step": 80800 + }, + { + "epoch": 414.35897435897436, + "eval_loss": 1.1370999813079834, + "eval_runtime": 36.0648, + "eval_samples_per_second": 10.897, + "eval_steps_per_second": 1.386, + "step": 80800 + }, + { + "epoch": 414.87179487179486, + "grad_norm": 26.18221664428711, + "learning_rate": 1.9145e-06, + "loss": 0.7786, + "step": 80900 + }, + { + "epoch": 414.87179487179486, + "eval_loss": 1.1384717226028442, + "eval_runtime": 35.973, + "eval_samples_per_second": 10.925, + "eval_steps_per_second": 1.39, + "step": 80900 + }, + { + "epoch": 415.38461538461536, + "grad_norm": 29.011056900024414, + "learning_rate": 1.9045000000000001e-06, + "loss": 0.7738, + "step": 81000 + }, + { + "epoch": 415.38461538461536, + "eval_loss": 1.1393600702285767, + "eval_runtime": 36.1218, + "eval_samples_per_second": 10.88, + "eval_steps_per_second": 1.384, + "step": 81000 + }, + { + "epoch": 415.8974358974359, + "grad_norm": 34.290645599365234, + "learning_rate": 1.8945000000000002e-06, + "loss": 0.8409, + "step": 81100 + }, + { + "epoch": 415.8974358974359, + "eval_loss": 1.1381124258041382, + "eval_runtime": 36.092, + "eval_samples_per_second": 10.889, + "eval_steps_per_second": 1.385, + "step": 81100 + }, + { + "epoch": 416.4102564102564, + "grad_norm": 26.203128814697266, + "learning_rate": 1.8845000000000001e-06, + "loss": 0.8074, + "step": 81200 + }, + { + "epoch": 416.4102564102564, + "eval_loss": 1.128665566444397, + "eval_runtime": 36.4617, + "eval_samples_per_second": 10.778, + "eval_steps_per_second": 1.371, + "step": 81200 + }, + { + "epoch": 416.9230769230769, + "grad_norm": 47.903987884521484, + "learning_rate": 1.8745000000000002e-06, + "loss": 0.7934, + "step": 81300 + }, + { + "epoch": 416.9230769230769, + "eval_loss": 1.1310197114944458, + "eval_runtime": 36.4533, + "eval_samples_per_second": 10.781, + "eval_steps_per_second": 1.372, + "step": 81300 + }, + { + "epoch": 417.43589743589746, + "grad_norm": 66.6627197265625, + "learning_rate": 1.8645e-06, + "loss": 0.8069, + "step": 81400 + }, + { + "epoch": 417.43589743589746, + "eval_loss": 1.1379177570343018, + "eval_runtime": 36.1075, + "eval_samples_per_second": 10.884, + "eval_steps_per_second": 1.385, + "step": 81400 + }, + { + "epoch": 417.94871794871796, + "grad_norm": 35.06187438964844, + "learning_rate": 1.8545000000000002e-06, + "loss": 0.7859, + "step": 81500 + }, + { + "epoch": 417.94871794871796, + "eval_loss": 1.1314971446990967, + "eval_runtime": 36.0446, + "eval_samples_per_second": 10.903, + "eval_steps_per_second": 1.387, + "step": 81500 + }, + { + "epoch": 418.46153846153845, + "grad_norm": 38.089481353759766, + "learning_rate": 1.8445e-06, + "loss": 0.8019, + "step": 81600 + }, + { + "epoch": 418.46153846153845, + "eval_loss": 1.1358174085617065, + "eval_runtime": 36.0218, + "eval_samples_per_second": 10.91, + "eval_steps_per_second": 1.388, + "step": 81600 + }, + { + "epoch": 418.97435897435895, + "grad_norm": 27.354896545410156, + "learning_rate": 1.8345000000000002e-06, + "loss": 0.8007, + "step": 81700 + }, + { + "epoch": 418.97435897435895, + "eval_loss": 1.1441881656646729, + "eval_runtime": 36.1967, + "eval_samples_per_second": 10.857, + "eval_steps_per_second": 1.381, + "step": 81700 + }, + { + "epoch": 419.4871794871795, + "grad_norm": 32.9517707824707, + "learning_rate": 1.8245e-06, + "loss": 0.8196, + "step": 81800 + }, + { + "epoch": 419.4871794871795, + "eval_loss": 1.1395628452301025, + "eval_runtime": 36.2204, + "eval_samples_per_second": 10.85, + "eval_steps_per_second": 1.38, + "step": 81800 + }, + { + "epoch": 420.0, + "grad_norm": 30.36843490600586, + "learning_rate": 1.8145000000000002e-06, + "loss": 0.7927, + "step": 81900 + }, + { + "epoch": 420.0, + "eval_loss": 1.1398494243621826, + "eval_runtime": 36.6657, + "eval_samples_per_second": 10.718, + "eval_steps_per_second": 1.364, + "step": 81900 + }, + { + "epoch": 420.5128205128205, + "grad_norm": 60.86655044555664, + "learning_rate": 1.8045e-06, + "loss": 0.8022, + "step": 82000 + }, + { + "epoch": 420.5128205128205, + "eval_loss": 1.1431193351745605, + "eval_runtime": 36.2928, + "eval_samples_per_second": 10.829, + "eval_steps_per_second": 1.378, + "step": 82000 + }, + { + "epoch": 421.02564102564105, + "grad_norm": 14.138097763061523, + "learning_rate": 1.7945000000000001e-06, + "loss": 0.7717, + "step": 82100 + }, + { + "epoch": 421.02564102564105, + "eval_loss": 1.1445454359054565, + "eval_runtime": 36.0382, + "eval_samples_per_second": 10.905, + "eval_steps_per_second": 1.387, + "step": 82100 + }, + { + "epoch": 421.53846153846155, + "grad_norm": 32.59247589111328, + "learning_rate": 1.7845e-06, + "loss": 0.7955, + "step": 82200 + }, + { + "epoch": 421.53846153846155, + "eval_loss": 1.1323463916778564, + "eval_runtime": 36.0304, + "eval_samples_per_second": 10.907, + "eval_steps_per_second": 1.388, + "step": 82200 + }, + { + "epoch": 422.05128205128204, + "grad_norm": 39.05498123168945, + "learning_rate": 1.7745000000000001e-06, + "loss": 0.8065, + "step": 82300 + }, + { + "epoch": 422.05128205128204, + "eval_loss": 1.141575574874878, + "eval_runtime": 36.5483, + "eval_samples_per_second": 10.753, + "eval_steps_per_second": 1.368, + "step": 82300 + }, + { + "epoch": 422.56410256410254, + "grad_norm": 35.87183380126953, + "learning_rate": 1.7645e-06, + "loss": 0.7939, + "step": 82400 + }, + { + "epoch": 422.56410256410254, + "eval_loss": 1.1375387907028198, + "eval_runtime": 36.0782, + "eval_samples_per_second": 10.893, + "eval_steps_per_second": 1.386, + "step": 82400 + }, + { + "epoch": 423.0769230769231, + "grad_norm": 61.73817825317383, + "learning_rate": 1.7545e-06, + "loss": 0.7774, + "step": 82500 + }, + { + "epoch": 423.0769230769231, + "eval_loss": 1.1390148401260376, + "eval_runtime": 36.0111, + "eval_samples_per_second": 10.913, + "eval_steps_per_second": 1.388, + "step": 82500 + }, + { + "epoch": 423.5897435897436, + "grad_norm": 36.2542724609375, + "learning_rate": 1.7445e-06, + "loss": 0.8218, + "step": 82600 + }, + { + "epoch": 423.5897435897436, + "eval_loss": 1.1377121210098267, + "eval_runtime": 36.2269, + "eval_samples_per_second": 10.848, + "eval_steps_per_second": 1.38, + "step": 82600 + }, + { + "epoch": 424.1025641025641, + "grad_norm": 56.437477111816406, + "learning_rate": 1.7345e-06, + "loss": 0.7736, + "step": 82700 + }, + { + "epoch": 424.1025641025641, + "eval_loss": 1.1443400382995605, + "eval_runtime": 36.2984, + "eval_samples_per_second": 10.827, + "eval_steps_per_second": 1.377, + "step": 82700 + }, + { + "epoch": 424.61538461538464, + "grad_norm": 18.46978759765625, + "learning_rate": 1.7245e-06, + "loss": 0.7731, + "step": 82800 + }, + { + "epoch": 424.61538461538464, + "eval_loss": 1.1458019018173218, + "eval_runtime": 36.1666, + "eval_samples_per_second": 10.866, + "eval_steps_per_second": 1.382, + "step": 82800 + }, + { + "epoch": 425.12820512820514, + "grad_norm": 25.277870178222656, + "learning_rate": 1.7145e-06, + "loss": 0.8232, + "step": 82900 + }, + { + "epoch": 425.12820512820514, + "eval_loss": 1.138476848602295, + "eval_runtime": 37.1081, + "eval_samples_per_second": 10.591, + "eval_steps_per_second": 1.347, + "step": 82900 + }, + { + "epoch": 425.64102564102564, + "grad_norm": 21.484840393066406, + "learning_rate": 1.7045e-06, + "loss": 0.8052, + "step": 83000 + }, + { + "epoch": 425.64102564102564, + "eval_loss": 1.140100121498108, + "eval_runtime": 37.1907, + "eval_samples_per_second": 10.567, + "eval_steps_per_second": 1.344, + "step": 83000 + }, + { + "epoch": 426.15384615384613, + "grad_norm": 27.945289611816406, + "learning_rate": 1.6946e-06, + "loss": 0.7932, + "step": 83100 + }, + { + "epoch": 426.15384615384613, + "eval_loss": 1.131630301475525, + "eval_runtime": 37.2152, + "eval_samples_per_second": 10.56, + "eval_steps_per_second": 1.344, + "step": 83100 + }, + { + "epoch": 426.6666666666667, + "grad_norm": 34.187442779541016, + "learning_rate": 1.6846000000000001e-06, + "loss": 0.7697, + "step": 83200 + }, + { + "epoch": 426.6666666666667, + "eval_loss": 1.1404935121536255, + "eval_runtime": 37.1997, + "eval_samples_per_second": 10.565, + "eval_steps_per_second": 1.344, + "step": 83200 + }, + { + "epoch": 427.1794871794872, + "grad_norm": 53.82330322265625, + "learning_rate": 1.6746e-06, + "loss": 0.825, + "step": 83300 + }, + { + "epoch": 427.1794871794872, + "eval_loss": 1.1366734504699707, + "eval_runtime": 37.6629, + "eval_samples_per_second": 10.435, + "eval_steps_per_second": 1.328, + "step": 83300 + }, + { + "epoch": 427.6923076923077, + "grad_norm": 32.13728332519531, + "learning_rate": 1.6646000000000001e-06, + "loss": 0.7926, + "step": 83400 + }, + { + "epoch": 427.6923076923077, + "eval_loss": 1.1375055313110352, + "eval_runtime": 37.1569, + "eval_samples_per_second": 10.577, + "eval_steps_per_second": 1.346, + "step": 83400 + }, + { + "epoch": 428.20512820512823, + "grad_norm": 14.809713363647461, + "learning_rate": 1.6546e-06, + "loss": 0.7953, + "step": 83500 + }, + { + "epoch": 428.20512820512823, + "eval_loss": 1.1445286273956299, + "eval_runtime": 37.1963, + "eval_samples_per_second": 10.566, + "eval_steps_per_second": 1.344, + "step": 83500 + }, + { + "epoch": 428.71794871794873, + "grad_norm": 26.210575103759766, + "learning_rate": 1.6446e-06, + "loss": 0.7997, + "step": 83600 + }, + { + "epoch": 428.71794871794873, + "eval_loss": 1.1372207403182983, + "eval_runtime": 37.1439, + "eval_samples_per_second": 10.58, + "eval_steps_per_second": 1.346, + "step": 83600 + }, + { + "epoch": 429.2307692307692, + "grad_norm": 35.37520980834961, + "learning_rate": 1.6346e-06, + "loss": 0.817, + "step": 83700 + }, + { + "epoch": 429.2307692307692, + "eval_loss": 1.1383602619171143, + "eval_runtime": 37.4204, + "eval_samples_per_second": 10.502, + "eval_steps_per_second": 1.336, + "step": 83700 + }, + { + "epoch": 429.7435897435897, + "grad_norm": 37.56960678100586, + "learning_rate": 1.6246e-06, + "loss": 0.7785, + "step": 83800 + }, + { + "epoch": 429.7435897435897, + "eval_loss": 1.139742374420166, + "eval_runtime": 37.5322, + "eval_samples_per_second": 10.471, + "eval_steps_per_second": 1.332, + "step": 83800 + }, + { + "epoch": 430.2564102564103, + "grad_norm": 19.520986557006836, + "learning_rate": 1.6146e-06, + "loss": 0.7949, + "step": 83900 + }, + { + "epoch": 430.2564102564103, + "eval_loss": 1.1356620788574219, + "eval_runtime": 37.3317, + "eval_samples_per_second": 10.527, + "eval_steps_per_second": 1.339, + "step": 83900 + }, + { + "epoch": 430.7692307692308, + "grad_norm": 29.4014949798584, + "learning_rate": 1.6046e-06, + "loss": 0.8071, + "step": 84000 + }, + { + "epoch": 430.7692307692308, + "eval_loss": 1.138593316078186, + "eval_runtime": 37.0678, + "eval_samples_per_second": 10.602, + "eval_steps_per_second": 1.349, + "step": 84000 + }, + { + "epoch": 431.28205128205127, + "grad_norm": 35.618621826171875, + "learning_rate": 1.5946e-06, + "loss": 0.7629, + "step": 84100 + }, + { + "epoch": 431.28205128205127, + "eval_loss": 1.1490651369094849, + "eval_runtime": 36.3907, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.374, + "step": 84100 + }, + { + "epoch": 431.79487179487177, + "grad_norm": 46.74853515625, + "learning_rate": 1.5846e-06, + "loss": 0.8251, + "step": 84200 + }, + { + "epoch": 431.79487179487177, + "eval_loss": 1.1478149890899658, + "eval_runtime": 36.418, + "eval_samples_per_second": 10.791, + "eval_steps_per_second": 1.373, + "step": 84200 + }, + { + "epoch": 432.3076923076923, + "grad_norm": 36.4643669128418, + "learning_rate": 1.5746e-06, + "loss": 0.7825, + "step": 84300 + }, + { + "epoch": 432.3076923076923, + "eval_loss": 1.1478251218795776, + "eval_runtime": 36.1189, + "eval_samples_per_second": 10.881, + "eval_steps_per_second": 1.384, + "step": 84300 + }, + { + "epoch": 432.8205128205128, + "grad_norm": 35.388328552246094, + "learning_rate": 1.5646e-06, + "loss": 0.7868, + "step": 84400 + }, + { + "epoch": 432.8205128205128, + "eval_loss": 1.1359634399414062, + "eval_runtime": 36.1927, + "eval_samples_per_second": 10.859, + "eval_steps_per_second": 1.381, + "step": 84400 + }, + { + "epoch": 433.3333333333333, + "grad_norm": 54.64905548095703, + "learning_rate": 1.5546e-06, + "loss": 0.769, + "step": 84500 + }, + { + "epoch": 433.3333333333333, + "eval_loss": 1.1468294858932495, + "eval_runtime": 36.0976, + "eval_samples_per_second": 10.887, + "eval_steps_per_second": 1.385, + "step": 84500 + }, + { + "epoch": 433.84615384615387, + "grad_norm": 30.190101623535156, + "learning_rate": 1.5446e-06, + "loss": 0.8016, + "step": 84600 + }, + { + "epoch": 433.84615384615387, + "eval_loss": 1.1358263492584229, + "eval_runtime": 36.7151, + "eval_samples_per_second": 10.704, + "eval_steps_per_second": 1.362, + "step": 84600 + }, + { + "epoch": 434.35897435897436, + "grad_norm": 41.01110076904297, + "learning_rate": 1.5346000000000003e-06, + "loss": 0.7951, + "step": 84700 + }, + { + "epoch": 434.35897435897436, + "eval_loss": 1.141539454460144, + "eval_runtime": 36.1351, + "eval_samples_per_second": 10.876, + "eval_steps_per_second": 1.384, + "step": 84700 + }, + { + "epoch": 434.87179487179486, + "grad_norm": 72.61260986328125, + "learning_rate": 1.5246000000000002e-06, + "loss": 0.7761, + "step": 84800 + }, + { + "epoch": 434.87179487179486, + "eval_loss": 1.1440916061401367, + "eval_runtime": 36.407, + "eval_samples_per_second": 10.795, + "eval_steps_per_second": 1.373, + "step": 84800 + }, + { + "epoch": 435.38461538461536, + "grad_norm": 31.35848617553711, + "learning_rate": 1.5146000000000003e-06, + "loss": 0.8177, + "step": 84900 + }, + { + "epoch": 435.38461538461536, + "eval_loss": 1.1316170692443848, + "eval_runtime": 36.1146, + "eval_samples_per_second": 10.882, + "eval_steps_per_second": 1.384, + "step": 84900 + }, + { + "epoch": 435.8974358974359, + "grad_norm": 73.97954559326172, + "learning_rate": 1.5046000000000002e-06, + "loss": 0.7681, + "step": 85000 + }, + { + "epoch": 435.8974358974359, + "eval_loss": 1.1298261880874634, + "eval_runtime": 36.088, + "eval_samples_per_second": 10.89, + "eval_steps_per_second": 1.386, + "step": 85000 + }, + { + "epoch": 436.4102564102564, + "grad_norm": 43.618160247802734, + "learning_rate": 1.4946000000000003e-06, + "loss": 0.8046, + "step": 85100 + }, + { + "epoch": 436.4102564102564, + "eval_loss": 1.1228266954421997, + "eval_runtime": 36.3074, + "eval_samples_per_second": 10.824, + "eval_steps_per_second": 1.377, + "step": 85100 + }, + { + "epoch": 436.9230769230769, + "grad_norm": 26.855504989624023, + "learning_rate": 1.4846000000000002e-06, + "loss": 0.7967, + "step": 85200 + }, + { + "epoch": 436.9230769230769, + "eval_loss": 1.1299227476119995, + "eval_runtime": 36.1305, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 85200 + }, + { + "epoch": 437.43589743589746, + "grad_norm": 25.799026489257812, + "learning_rate": 1.4747e-06, + "loss": 0.7733, + "step": 85300 + }, + { + "epoch": 437.43589743589746, + "eval_loss": 1.1327317953109741, + "eval_runtime": 36.2722, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 85300 + }, + { + "epoch": 437.94871794871796, + "grad_norm": 77.15607452392578, + "learning_rate": 1.4647e-06, + "loss": 0.8059, + "step": 85400 + }, + { + "epoch": 437.94871794871796, + "eval_loss": 1.1374785900115967, + "eval_runtime": 36.7014, + "eval_samples_per_second": 10.708, + "eval_steps_per_second": 1.362, + "step": 85400 + }, + { + "epoch": 438.46153846153845, + "grad_norm": 19.759017944335938, + "learning_rate": 1.4547e-06, + "loss": 0.794, + "step": 85500 + }, + { + "epoch": 438.46153846153845, + "eval_loss": 1.1408315896987915, + "eval_runtime": 35.9936, + "eval_samples_per_second": 10.919, + "eval_steps_per_second": 1.389, + "step": 85500 + }, + { + "epoch": 438.97435897435895, + "grad_norm": 38.38291549682617, + "learning_rate": 1.4447e-06, + "loss": 0.769, + "step": 85600 + }, + { + "epoch": 438.97435897435895, + "eval_loss": 1.134137749671936, + "eval_runtime": 36.2259, + "eval_samples_per_second": 10.849, + "eval_steps_per_second": 1.38, + "step": 85600 + }, + { + "epoch": 439.4871794871795, + "grad_norm": 31.742353439331055, + "learning_rate": 1.4347e-06, + "loss": 0.8237, + "step": 85700 + }, + { + "epoch": 439.4871794871795, + "eval_loss": 1.1402748823165894, + "eval_runtime": 36.1103, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 85700 + }, + { + "epoch": 440.0, + "grad_norm": 27.051218032836914, + "learning_rate": 1.4247000000000003e-06, + "loss": 0.7846, + "step": 85800 + }, + { + "epoch": 440.0, + "eval_loss": 1.1372736692428589, + "eval_runtime": 36.0759, + "eval_samples_per_second": 10.894, + "eval_steps_per_second": 1.386, + "step": 85800 + }, + { + "epoch": 440.5128205128205, + "grad_norm": 22.370813369750977, + "learning_rate": 1.4147000000000002e-06, + "loss": 0.7808, + "step": 85900 + }, + { + "epoch": 440.5128205128205, + "eval_loss": 1.1368041038513184, + "eval_runtime": 36.2725, + "eval_samples_per_second": 10.835, + "eval_steps_per_second": 1.378, + "step": 85900 + }, + { + "epoch": 441.02564102564105, + "grad_norm": 42.83647537231445, + "learning_rate": 1.4047000000000003e-06, + "loss": 0.8001, + "step": 86000 + }, + { + "epoch": 441.02564102564105, + "eval_loss": 1.1312806606292725, + "eval_runtime": 36.1429, + "eval_samples_per_second": 10.874, + "eval_steps_per_second": 1.383, + "step": 86000 + }, + { + "epoch": 441.53846153846155, + "grad_norm": 39.096534729003906, + "learning_rate": 1.3947000000000002e-06, + "loss": 0.7667, + "step": 86100 + }, + { + "epoch": 441.53846153846155, + "eval_loss": 1.1320886611938477, + "eval_runtime": 36.3951, + "eval_samples_per_second": 10.798, + "eval_steps_per_second": 1.374, + "step": 86100 + }, + { + "epoch": 442.05128205128204, + "grad_norm": 58.20417022705078, + "learning_rate": 1.3847000000000003e-06, + "loss": 0.8213, + "step": 86200 + }, + { + "epoch": 442.05128205128204, + "eval_loss": 1.140681505203247, + "eval_runtime": 36.4571, + "eval_samples_per_second": 10.78, + "eval_steps_per_second": 1.371, + "step": 86200 + }, + { + "epoch": 442.56410256410254, + "grad_norm": 63.09733200073242, + "learning_rate": 1.3747000000000002e-06, + "loss": 0.7951, + "step": 86300 + }, + { + "epoch": 442.56410256410254, + "eval_loss": 1.1364511251449585, + "eval_runtime": 36.4226, + "eval_samples_per_second": 10.79, + "eval_steps_per_second": 1.373, + "step": 86300 + }, + { + "epoch": 443.0769230769231, + "grad_norm": 57.014122009277344, + "learning_rate": 1.3647000000000002e-06, + "loss": 0.7802, + "step": 86400 + }, + { + "epoch": 443.0769230769231, + "eval_loss": 1.1391345262527466, + "eval_runtime": 36.973, + "eval_samples_per_second": 10.629, + "eval_steps_per_second": 1.352, + "step": 86400 + }, + { + "epoch": 443.5897435897436, + "grad_norm": 25.986827850341797, + "learning_rate": 1.3547000000000001e-06, + "loss": 0.805, + "step": 86500 + }, + { + "epoch": 443.5897435897436, + "eval_loss": 1.1400049924850464, + "eval_runtime": 36.4446, + "eval_samples_per_second": 10.783, + "eval_steps_per_second": 1.372, + "step": 86500 + }, + { + "epoch": 444.1025641025641, + "grad_norm": 36.341148376464844, + "learning_rate": 1.3447000000000002e-06, + "loss": 0.7409, + "step": 86600 + }, + { + "epoch": 444.1025641025641, + "eval_loss": 1.1340398788452148, + "eval_runtime": 36.0454, + "eval_samples_per_second": 10.903, + "eval_steps_per_second": 1.387, + "step": 86600 + }, + { + "epoch": 444.61538461538464, + "grad_norm": 39.169891357421875, + "learning_rate": 1.3347000000000001e-06, + "loss": 0.7742, + "step": 86700 + }, + { + "epoch": 444.61538461538464, + "eval_loss": 1.1242835521697998, + "eval_runtime": 35.9622, + "eval_samples_per_second": 10.928, + "eval_steps_per_second": 1.39, + "step": 86700 + }, + { + "epoch": 445.12820512820514, + "grad_norm": 29.04669952392578, + "learning_rate": 1.3247000000000002e-06, + "loss": 0.8122, + "step": 86800 + }, + { + "epoch": 445.12820512820514, + "eval_loss": 1.1295175552368164, + "eval_runtime": 36.0804, + "eval_samples_per_second": 10.892, + "eval_steps_per_second": 1.386, + "step": 86800 + }, + { + "epoch": 445.64102564102564, + "grad_norm": 68.22357940673828, + "learning_rate": 1.3148000000000003e-06, + "loss": 0.8073, + "step": 86900 + }, + { + "epoch": 445.64102564102564, + "eval_loss": 1.127049207687378, + "eval_runtime": 36.1675, + "eval_samples_per_second": 10.866, + "eval_steps_per_second": 1.382, + "step": 86900 + }, + { + "epoch": 446.15384615384613, + "grad_norm": 57.1582145690918, + "learning_rate": 1.3048000000000002e-06, + "loss": 0.7537, + "step": 87000 + }, + { + "epoch": 446.15384615384613, + "eval_loss": 1.143984079360962, + "eval_runtime": 36.1107, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 87000 + }, + { + "epoch": 446.6666666666667, + "grad_norm": 41.092411041259766, + "learning_rate": 1.2948000000000003e-06, + "loss": 0.8052, + "step": 87100 + }, + { + "epoch": 446.6666666666667, + "eval_loss": 1.1349208354949951, + "eval_runtime": 36.846, + "eval_samples_per_second": 10.666, + "eval_steps_per_second": 1.357, + "step": 87100 + }, + { + "epoch": 447.1794871794872, + "grad_norm": 22.109609603881836, + "learning_rate": 1.2848000000000002e-06, + "loss": 0.8011, + "step": 87200 + }, + { + "epoch": 447.1794871794872, + "eval_loss": 1.141160011291504, + "eval_runtime": 37.0028, + "eval_samples_per_second": 10.621, + "eval_steps_per_second": 1.351, + "step": 87200 + }, + { + "epoch": 447.6923076923077, + "grad_norm": 31.559099197387695, + "learning_rate": 1.2748000000000003e-06, + "loss": 0.8042, + "step": 87300 + }, + { + "epoch": 447.6923076923077, + "eval_loss": 1.13499915599823, + "eval_runtime": 36.9785, + "eval_samples_per_second": 10.628, + "eval_steps_per_second": 1.352, + "step": 87300 + }, + { + "epoch": 448.20512820512823, + "grad_norm": 34.05754470825195, + "learning_rate": 1.2648000000000002e-06, + "loss": 0.778, + "step": 87400 + }, + { + "epoch": 448.20512820512823, + "eval_loss": 1.1388235092163086, + "eval_runtime": 36.8823, + "eval_samples_per_second": 10.656, + "eval_steps_per_second": 1.356, + "step": 87400 + }, + { + "epoch": 448.71794871794873, + "grad_norm": 14.186123847961426, + "learning_rate": 1.2548000000000003e-06, + "loss": 0.8001, + "step": 87500 + }, + { + "epoch": 448.71794871794873, + "eval_loss": 1.1438807249069214, + "eval_runtime": 37.2954, + "eval_samples_per_second": 10.537, + "eval_steps_per_second": 1.341, + "step": 87500 + }, + { + "epoch": 449.2307692307692, + "grad_norm": 53.54055404663086, + "learning_rate": 1.2448000000000001e-06, + "loss": 0.7851, + "step": 87600 + }, + { + "epoch": 449.2307692307692, + "eval_loss": 1.139961838722229, + "eval_runtime": 36.4595, + "eval_samples_per_second": 10.779, + "eval_steps_per_second": 1.371, + "step": 87600 + }, + { + "epoch": 449.7435897435897, + "grad_norm": 35.1504020690918, + "learning_rate": 1.2348000000000002e-06, + "loss": 0.7617, + "step": 87700 + }, + { + "epoch": 449.7435897435897, + "eval_loss": 1.1491321325302124, + "eval_runtime": 36.3532, + "eval_samples_per_second": 10.811, + "eval_steps_per_second": 1.375, + "step": 87700 + }, + { + "epoch": 450.2564102564103, + "grad_norm": 35.47263717651367, + "learning_rate": 1.2248000000000001e-06, + "loss": 0.8067, + "step": 87800 + }, + { + "epoch": 450.2564102564103, + "eval_loss": 1.14724600315094, + "eval_runtime": 36.4509, + "eval_samples_per_second": 10.782, + "eval_steps_per_second": 1.372, + "step": 87800 + }, + { + "epoch": 450.7692307692308, + "grad_norm": 25.25591278076172, + "learning_rate": 1.2148000000000002e-06, + "loss": 0.783, + "step": 87900 + }, + { + "epoch": 450.7692307692308, + "eval_loss": 1.1455408334732056, + "eval_runtime": 36.3609, + "eval_samples_per_second": 10.808, + "eval_steps_per_second": 1.375, + "step": 87900 + }, + { + "epoch": 451.28205128205127, + "grad_norm": 30.956676483154297, + "learning_rate": 1.2048e-06, + "loss": 0.7667, + "step": 88000 + }, + { + "epoch": 451.28205128205127, + "eval_loss": 1.1487252712249756, + "eval_runtime": 36.4708, + "eval_samples_per_second": 10.776, + "eval_steps_per_second": 1.371, + "step": 88000 + }, + { + "epoch": 451.79487179487177, + "grad_norm": 23.540725708007812, + "learning_rate": 1.1948000000000002e-06, + "loss": 0.8008, + "step": 88100 + }, + { + "epoch": 451.79487179487177, + "eval_loss": 1.1447594165802002, + "eval_runtime": 36.2301, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 88100 + }, + { + "epoch": 452.3076923076923, + "grad_norm": 44.291500091552734, + "learning_rate": 1.1848e-06, + "loss": 0.7914, + "step": 88200 + }, + { + "epoch": 452.3076923076923, + "eval_loss": 1.1462591886520386, + "eval_runtime": 36.0974, + "eval_samples_per_second": 10.887, + "eval_steps_per_second": 1.385, + "step": 88200 + }, + { + "epoch": 452.8205128205128, + "grad_norm": 26.443422317504883, + "learning_rate": 1.1748000000000002e-06, + "loss": 0.778, + "step": 88300 + }, + { + "epoch": 452.8205128205128, + "eval_loss": 1.142282485961914, + "eval_runtime": 36.058, + "eval_samples_per_second": 10.899, + "eval_steps_per_second": 1.387, + "step": 88300 + }, + { + "epoch": 453.3333333333333, + "grad_norm": 34.64956283569336, + "learning_rate": 1.1648e-06, + "loss": 0.7695, + "step": 88400 + }, + { + "epoch": 453.3333333333333, + "eval_loss": 1.1501014232635498, + "eval_runtime": 36.1764, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 88400 + }, + { + "epoch": 453.84615384615387, + "grad_norm": 40.976863861083984, + "learning_rate": 1.1548000000000001e-06, + "loss": 0.7839, + "step": 88500 + }, + { + "epoch": 453.84615384615387, + "eval_loss": 1.1512559652328491, + "eval_runtime": 36.5113, + "eval_samples_per_second": 10.764, + "eval_steps_per_second": 1.369, + "step": 88500 + }, + { + "epoch": 454.35897435897436, + "grad_norm": 22.397504806518555, + "learning_rate": 1.1448e-06, + "loss": 0.7994, + "step": 88600 + }, + { + "epoch": 454.35897435897436, + "eval_loss": 1.1445460319519043, + "eval_runtime": 35.9809, + "eval_samples_per_second": 10.922, + "eval_steps_per_second": 1.39, + "step": 88600 + }, + { + "epoch": 454.87179487179486, + "grad_norm": 29.209510803222656, + "learning_rate": 1.1348000000000001e-06, + "loss": 0.7852, + "step": 88700 + }, + { + "epoch": 454.87179487179486, + "eval_loss": 1.1429709196090698, + "eval_runtime": 36.0705, + "eval_samples_per_second": 10.895, + "eval_steps_per_second": 1.386, + "step": 88700 + }, + { + "epoch": 455.38461538461536, + "grad_norm": 43.59063720703125, + "learning_rate": 1.1248e-06, + "loss": 0.7879, + "step": 88800 + }, + { + "epoch": 455.38461538461536, + "eval_loss": 1.1381382942199707, + "eval_runtime": 36.1315, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 88800 + }, + { + "epoch": 455.8974358974359, + "grad_norm": 27.261241912841797, + "learning_rate": 1.1148000000000001e-06, + "loss": 0.7665, + "step": 88900 + }, + { + "epoch": 455.8974358974359, + "eval_loss": 1.1429065465927124, + "eval_runtime": 35.993, + "eval_samples_per_second": 10.919, + "eval_steps_per_second": 1.389, + "step": 88900 + }, + { + "epoch": 456.4102564102564, + "grad_norm": 30.30113983154297, + "learning_rate": 1.1048e-06, + "loss": 0.7905, + "step": 89000 + }, + { + "epoch": 456.4102564102564, + "eval_loss": 1.146630883216858, + "eval_runtime": 36.1909, + "eval_samples_per_second": 10.859, + "eval_steps_per_second": 1.382, + "step": 89000 + }, + { + "epoch": 456.9230769230769, + "grad_norm": 33.64459991455078, + "learning_rate": 1.0948e-06, + "loss": 0.785, + "step": 89100 + }, + { + "epoch": 456.9230769230769, + "eval_loss": 1.1532384157180786, + "eval_runtime": 35.9659, + "eval_samples_per_second": 10.927, + "eval_steps_per_second": 1.39, + "step": 89100 + }, + { + "epoch": 457.43589743589746, + "grad_norm": 37.836669921875, + "learning_rate": 1.0848e-06, + "loss": 0.7889, + "step": 89200 + }, + { + "epoch": 457.43589743589746, + "eval_loss": 1.1464803218841553, + "eval_runtime": 36.113, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 89200 + }, + { + "epoch": 457.94871794871796, + "grad_norm": 32.36075210571289, + "learning_rate": 1.0748e-06, + "loss": 0.7917, + "step": 89300 + }, + { + "epoch": 457.94871794871796, + "eval_loss": 1.1373164653778076, + "eval_runtime": 36.164, + "eval_samples_per_second": 10.867, + "eval_steps_per_second": 1.383, + "step": 89300 + }, + { + "epoch": 458.46153846153845, + "grad_norm": 32.23586654663086, + "learning_rate": 1.0648000000000002e-06, + "loss": 0.7804, + "step": 89400 + }, + { + "epoch": 458.46153846153845, + "eval_loss": 1.1462308168411255, + "eval_runtime": 36.0829, + "eval_samples_per_second": 10.892, + "eval_steps_per_second": 1.386, + "step": 89400 + }, + { + "epoch": 458.97435897435895, + "grad_norm": 36.703941345214844, + "learning_rate": 1.0548e-06, + "loss": 0.7868, + "step": 89500 + }, + { + "epoch": 458.97435897435895, + "eval_loss": 1.1507140398025513, + "eval_runtime": 36.1367, + "eval_samples_per_second": 10.875, + "eval_steps_per_second": 1.384, + "step": 89500 + }, + { + "epoch": 459.4871794871795, + "grad_norm": 26.663591384887695, + "learning_rate": 1.0448000000000001e-06, + "loss": 0.7795, + "step": 89600 + }, + { + "epoch": 459.4871794871795, + "eval_loss": 1.1461986303329468, + "eval_runtime": 36.7628, + "eval_samples_per_second": 10.69, + "eval_steps_per_second": 1.36, + "step": 89600 + }, + { + "epoch": 460.0, + "grad_norm": 52.451419830322266, + "learning_rate": 1.0348e-06, + "loss": 0.7909, + "step": 89700 + }, + { + "epoch": 460.0, + "eval_loss": 1.1439627408981323, + "eval_runtime": 36.3037, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 89700 + }, + { + "epoch": 460.5128205128205, + "grad_norm": 26.186614990234375, + "learning_rate": 1.0248000000000001e-06, + "loss": 0.8083, + "step": 89800 + }, + { + "epoch": 460.5128205128205, + "eval_loss": 1.1412297487258911, + "eval_runtime": 36.2306, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 89800 + }, + { + "epoch": 461.02564102564105, + "grad_norm": 34.80671691894531, + "learning_rate": 1.0148e-06, + "loss": 0.7622, + "step": 89900 + }, + { + "epoch": 461.02564102564105, + "eval_loss": 1.1351121664047241, + "eval_runtime": 36.1308, + "eval_samples_per_second": 10.877, + "eval_steps_per_second": 1.384, + "step": 89900 + }, + { + "epoch": 461.53846153846155, + "grad_norm": 27.466503143310547, + "learning_rate": 1.0048e-06, + "loss": 0.7809, + "step": 90000 + }, + { + "epoch": 461.53846153846155, + "eval_loss": 1.1431870460510254, + "eval_runtime": 36.2384, + "eval_samples_per_second": 10.845, + "eval_steps_per_second": 1.38, + "step": 90000 + }, + { + "epoch": 462.05128205128204, + "grad_norm": 39.38359069824219, + "learning_rate": 9.948e-07, + "loss": 0.7929, + "step": 90100 + }, + { + "epoch": 462.05128205128204, + "eval_loss": 1.138918399810791, + "eval_runtime": 36.2457, + "eval_samples_per_second": 10.843, + "eval_steps_per_second": 1.379, + "step": 90100 + }, + { + "epoch": 462.56410256410254, + "grad_norm": 67.79429626464844, + "learning_rate": 9.848e-07, + "loss": 0.7943, + "step": 90200 + }, + { + "epoch": 462.56410256410254, + "eval_loss": 1.1377341747283936, + "eval_runtime": 36.2669, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 90200 + }, + { + "epoch": 463.0769230769231, + "grad_norm": 86.01333618164062, + "learning_rate": 9.748e-07, + "loss": 0.7886, + "step": 90300 + }, + { + "epoch": 463.0769230769231, + "eval_loss": 1.1376402378082275, + "eval_runtime": 36.1662, + "eval_samples_per_second": 10.867, + "eval_steps_per_second": 1.383, + "step": 90300 + }, + { + "epoch": 463.5897435897436, + "grad_norm": 19.905719757080078, + "learning_rate": 9.648e-07, + "loss": 0.786, + "step": 90400 + }, + { + "epoch": 463.5897435897436, + "eval_loss": 1.1363729238510132, + "eval_runtime": 36.5216, + "eval_samples_per_second": 10.761, + "eval_steps_per_second": 1.369, + "step": 90400 + }, + { + "epoch": 464.1025641025641, + "grad_norm": 46.60888671875, + "learning_rate": 9.548e-07, + "loss": 0.8002, + "step": 90500 + }, + { + "epoch": 464.1025641025641, + "eval_loss": 1.1395928859710693, + "eval_runtime": 36.3596, + "eval_samples_per_second": 10.809, + "eval_steps_per_second": 1.375, + "step": 90500 + }, + { + "epoch": 464.61538461538464, + "grad_norm": 35.5651741027832, + "learning_rate": 9.448e-07, + "loss": 0.7943, + "step": 90600 + }, + { + "epoch": 464.61538461538464, + "eval_loss": 1.1389234066009521, + "eval_runtime": 36.3026, + "eval_samples_per_second": 10.826, + "eval_steps_per_second": 1.377, + "step": 90600 + }, + { + "epoch": 465.12820512820514, + "grad_norm": 41.65739440917969, + "learning_rate": 9.348e-07, + "loss": 0.7508, + "step": 90700 + }, + { + "epoch": 465.12820512820514, + "eval_loss": 1.1372681856155396, + "eval_runtime": 36.5153, + "eval_samples_per_second": 10.763, + "eval_steps_per_second": 1.369, + "step": 90700 + }, + { + "epoch": 465.64102564102564, + "grad_norm": 38.36830520629883, + "learning_rate": 9.248000000000001e-07, + "loss": 0.7813, + "step": 90800 + }, + { + "epoch": 465.64102564102564, + "eval_loss": 1.1375644207000732, + "eval_runtime": 36.2421, + "eval_samples_per_second": 10.844, + "eval_steps_per_second": 1.38, + "step": 90800 + }, + { + "epoch": 466.15384615384613, + "grad_norm": 27.81422996520996, + "learning_rate": 9.148000000000001e-07, + "loss": 0.7686, + "step": 90900 + }, + { + "epoch": 466.15384615384613, + "eval_loss": 1.1425598859786987, + "eval_runtime": 36.2682, + "eval_samples_per_second": 10.836, + "eval_steps_per_second": 1.379, + "step": 90900 + }, + { + "epoch": 466.6666666666667, + "grad_norm": 29.16278076171875, + "learning_rate": 9.048000000000001e-07, + "loss": 0.8038, + "step": 91000 + }, + { + "epoch": 466.6666666666667, + "eval_loss": 1.1381059885025024, + "eval_runtime": 36.1829, + "eval_samples_per_second": 10.861, + "eval_steps_per_second": 1.382, + "step": 91000 + }, + { + "epoch": 467.1794871794872, + "grad_norm": 38.36164093017578, + "learning_rate": 8.949000000000001e-07, + "loss": 0.7959, + "step": 91100 + }, + { + "epoch": 467.1794871794872, + "eval_loss": 1.1415718793869019, + "eval_runtime": 36.2758, + "eval_samples_per_second": 10.834, + "eval_steps_per_second": 1.378, + "step": 91100 + }, + { + "epoch": 467.6923076923077, + "grad_norm": 39.69020080566406, + "learning_rate": 8.849000000000001e-07, + "loss": 0.7693, + "step": 91200 + }, + { + "epoch": 467.6923076923077, + "eval_loss": 1.1395233869552612, + "eval_runtime": 36.6971, + "eval_samples_per_second": 10.709, + "eval_steps_per_second": 1.363, + "step": 91200 + }, + { + "epoch": 468.20512820512823, + "grad_norm": 19.789915084838867, + "learning_rate": 8.749000000000001e-07, + "loss": 0.7648, + "step": 91300 + }, + { + "epoch": 468.20512820512823, + "eval_loss": 1.1467856168746948, + "eval_runtime": 36.1577, + "eval_samples_per_second": 10.869, + "eval_steps_per_second": 1.383, + "step": 91300 + }, + { + "epoch": 468.71794871794873, + "grad_norm": 22.790111541748047, + "learning_rate": 8.649000000000001e-07, + "loss": 0.7654, + "step": 91400 + }, + { + "epoch": 468.71794871794873, + "eval_loss": 1.1393033266067505, + "eval_runtime": 36.3954, + "eval_samples_per_second": 10.798, + "eval_steps_per_second": 1.374, + "step": 91400 + }, + { + "epoch": 469.2307692307692, + "grad_norm": 38.26460266113281, + "learning_rate": 8.549000000000001e-07, + "loss": 0.7779, + "step": 91500 + }, + { + "epoch": 469.2307692307692, + "eval_loss": 1.1418498754501343, + "eval_runtime": 36.3106, + "eval_samples_per_second": 10.823, + "eval_steps_per_second": 1.377, + "step": 91500 + }, + { + "epoch": 469.7435897435897, + "grad_norm": 23.214740753173828, + "learning_rate": 8.449000000000001e-07, + "loss": 0.7658, + "step": 91600 + }, + { + "epoch": 469.7435897435897, + "eval_loss": 1.1464049816131592, + "eval_runtime": 36.204, + "eval_samples_per_second": 10.855, + "eval_steps_per_second": 1.381, + "step": 91600 + }, + { + "epoch": 470.2564102564103, + "grad_norm": 24.108579635620117, + "learning_rate": 8.349e-07, + "loss": 0.8129, + "step": 91700 + }, + { + "epoch": 470.2564102564103, + "eval_loss": 1.1496237516403198, + "eval_runtime": 36.4781, + "eval_samples_per_second": 10.774, + "eval_steps_per_second": 1.371, + "step": 91700 + }, + { + "epoch": 470.7692307692308, + "grad_norm": 16.09745979309082, + "learning_rate": 8.249e-07, + "loss": 0.7935, + "step": 91800 + }, + { + "epoch": 470.7692307692308, + "eval_loss": 1.146942138671875, + "eval_runtime": 36.3763, + "eval_samples_per_second": 10.804, + "eval_steps_per_second": 1.375, + "step": 91800 + }, + { + "epoch": 471.28205128205127, + "grad_norm": 41.664974212646484, + "learning_rate": 8.149000000000001e-07, + "loss": 0.7413, + "step": 91900 + }, + { + "epoch": 471.28205128205127, + "eval_loss": 1.1448837518692017, + "eval_runtime": 36.2081, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 91900 + }, + { + "epoch": 471.79487179487177, + "grad_norm": 76.44750213623047, + "learning_rate": 8.049000000000001e-07, + "loss": 0.8009, + "step": 92000 + }, + { + "epoch": 471.79487179487177, + "eval_loss": 1.1425203084945679, + "eval_runtime": 36.3063, + "eval_samples_per_second": 10.825, + "eval_steps_per_second": 1.377, + "step": 92000 + }, + { + "epoch": 472.3076923076923, + "grad_norm": 53.231346130371094, + "learning_rate": 7.949000000000001e-07, + "loss": 0.7828, + "step": 92100 + }, + { + "epoch": 472.3076923076923, + "eval_loss": 1.143397569656372, + "eval_runtime": 36.5895, + "eval_samples_per_second": 10.741, + "eval_steps_per_second": 1.367, + "step": 92100 + }, + { + "epoch": 472.8205128205128, + "grad_norm": 27.0950984954834, + "learning_rate": 7.849000000000001e-07, + "loss": 0.8098, + "step": 92200 + }, + { + "epoch": 472.8205128205128, + "eval_loss": 1.1414209604263306, + "eval_runtime": 36.2062, + "eval_samples_per_second": 10.854, + "eval_steps_per_second": 1.381, + "step": 92200 + }, + { + "epoch": 473.3333333333333, + "grad_norm": 48.05419158935547, + "learning_rate": 7.749000000000001e-07, + "loss": 0.7622, + "step": 92300 + }, + { + "epoch": 473.3333333333333, + "eval_loss": 1.1400697231292725, + "eval_runtime": 36.1118, + "eval_samples_per_second": 10.883, + "eval_steps_per_second": 1.385, + "step": 92300 + }, + { + "epoch": 473.84615384615387, + "grad_norm": 43.447879791259766, + "learning_rate": 7.649000000000001e-07, + "loss": 0.8016, + "step": 92400 + }, + { + "epoch": 473.84615384615387, + "eval_loss": 1.1430341005325317, + "eval_runtime": 36.1914, + "eval_samples_per_second": 10.859, + "eval_steps_per_second": 1.382, + "step": 92400 + }, + { + "epoch": 474.35897435897436, + "grad_norm": 43.409297943115234, + "learning_rate": 7.549000000000001e-07, + "loss": 0.7696, + "step": 92500 + }, + { + "epoch": 474.35897435897436, + "eval_loss": 1.1409395933151245, + "eval_runtime": 36.0834, + "eval_samples_per_second": 10.891, + "eval_steps_per_second": 1.386, + "step": 92500 + }, + { + "epoch": 474.87179487179486, + "grad_norm": 16.421245574951172, + "learning_rate": 7.449000000000001e-07, + "loss": 0.8062, + "step": 92600 + }, + { + "epoch": 474.87179487179486, + "eval_loss": 1.1461801528930664, + "eval_runtime": 36.0863, + "eval_samples_per_second": 10.891, + "eval_steps_per_second": 1.386, + "step": 92600 + }, + { + "epoch": 475.38461538461536, + "grad_norm": 53.00939178466797, + "learning_rate": 7.349e-07, + "loss": 0.7604, + "step": 92700 + }, + { + "epoch": 475.38461538461536, + "eval_loss": 1.140365481376648, + "eval_runtime": 36.6267, + "eval_samples_per_second": 10.73, + "eval_steps_per_second": 1.365, + "step": 92700 + }, + { + "epoch": 475.8974358974359, + "grad_norm": 36.78721618652344, + "learning_rate": 7.249e-07, + "loss": 0.7818, + "step": 92800 + }, + { + "epoch": 475.8974358974359, + "eval_loss": 1.1321120262145996, + "eval_runtime": 36.1749, + "eval_samples_per_second": 10.864, + "eval_steps_per_second": 1.382, + "step": 92800 + }, + { + "epoch": 476.4102564102564, + "grad_norm": 33.232357025146484, + "learning_rate": 7.149e-07, + "loss": 0.8078, + "step": 92900 + }, + { + "epoch": 476.4102564102564, + "eval_loss": 1.1404153108596802, + "eval_runtime": 36.1628, + "eval_samples_per_second": 10.868, + "eval_steps_per_second": 1.383, + "step": 92900 + }, + { + "epoch": 476.9230769230769, + "grad_norm": 63.29995346069336, + "learning_rate": 7.049e-07, + "loss": 0.7685, + "step": 93000 + }, + { + "epoch": 476.9230769230769, + "eval_loss": 1.1407204866409302, + "eval_runtime": 36.231, + "eval_samples_per_second": 10.847, + "eval_steps_per_second": 1.38, + "step": 93000 + }, + { + "epoch": 477.43589743589746, + "grad_norm": 19.1306209564209, + "learning_rate": 6.949e-07, + "loss": 0.7823, + "step": 93100 + }, + { + "epoch": 477.43589743589746, + "eval_loss": 1.142924427986145, + "eval_runtime": 36.0022, + "eval_samples_per_second": 10.916, + "eval_steps_per_second": 1.389, + "step": 93100 + }, + { + "epoch": 477.94871794871796, + "grad_norm": 73.6246566772461, + "learning_rate": 6.850000000000001e-07, + "loss": 0.7934, + "step": 93200 + }, + { + "epoch": 477.94871794871796, + "eval_loss": 1.1422728300094604, + "eval_runtime": 36.0364, + "eval_samples_per_second": 10.906, + "eval_steps_per_second": 1.387, + "step": 93200 + }, + { + "epoch": 478.46153846153845, + "grad_norm": 52.62312316894531, + "learning_rate": 6.750000000000001e-07, + "loss": 0.7946, + "step": 93300 + }, + { + "epoch": 478.46153846153845, + "eval_loss": 1.1408541202545166, + "eval_runtime": 36.1775, + "eval_samples_per_second": 10.863, + "eval_steps_per_second": 1.382, + "step": 93300 + }, + { + "epoch": 478.97435897435895, + "grad_norm": 59.98847198486328, + "learning_rate": 6.650000000000001e-07, + "loss": 0.7738, + "step": 93400 + }, + { + "epoch": 478.97435897435895, + "eval_loss": 1.1421667337417603, + "eval_runtime": 36.4964, + "eval_samples_per_second": 10.768, + "eval_steps_per_second": 1.37, + "step": 93400 + }, + { + "epoch": 479.4871794871795, + "grad_norm": 24.55941390991211, + "learning_rate": 6.550000000000001e-07, + "loss": 0.7751, + "step": 93500 + }, + { + "epoch": 479.4871794871795, + "eval_loss": 1.1392768621444702, + "eval_runtime": 36.1874, + "eval_samples_per_second": 10.86, + "eval_steps_per_second": 1.382, + "step": 93500 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 513, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.5739736040448e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}