{ "best_metric": null, "best_model_checkpoint": null, "epoch": 128.2051282051282, "eval_steps": 100, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5128205128205128, "grad_norm": 73.68424224853516, "learning_rate": 9.9907e-06, "loss": 3.1457, "step": 100 }, { "epoch": 0.5128205128205128, "eval_loss": 2.2357213497161865, "eval_runtime": 36.4689, "eval_samples_per_second": 10.776, "eval_steps_per_second": 1.371, "step": 100 }, { "epoch": 1.0256410256410255, "grad_norm": 57.47202682495117, "learning_rate": 9.980800000000001e-06, "loss": 2.1614, "step": 200 }, { "epoch": 1.0256410256410255, "eval_loss": 2.0913825035095215, "eval_runtime": 36.349, "eval_samples_per_second": 10.812, "eval_steps_per_second": 1.376, "step": 200 }, { "epoch": 1.5384615384615383, "grad_norm": 176.88357543945312, "learning_rate": 9.970800000000001e-06, "loss": 2.0388, "step": 300 }, { "epoch": 1.5384615384615383, "eval_loss": 2.003911018371582, "eval_runtime": 36.2551, "eval_samples_per_second": 10.84, "eval_steps_per_second": 1.379, "step": 300 }, { "epoch": 2.051282051282051, "grad_norm": 55.31132507324219, "learning_rate": 9.960800000000001e-06, "loss": 1.9285, "step": 400 }, { "epoch": 2.051282051282051, "eval_loss": 1.9796075820922852, "eval_runtime": 36.4437, "eval_samples_per_second": 10.784, "eval_steps_per_second": 1.372, "step": 400 }, { "epoch": 2.564102564102564, "grad_norm": 41.900753021240234, "learning_rate": 9.9508e-06, "loss": 1.9523, "step": 500 }, { "epoch": 2.564102564102564, "eval_loss": 1.936122179031372, "eval_runtime": 36.5845, "eval_samples_per_second": 10.742, "eval_steps_per_second": 1.367, "step": 500 }, { "epoch": 3.076923076923077, "grad_norm": 50.21903991699219, "learning_rate": 9.9408e-06, "loss": 1.8452, "step": 600 }, { "epoch": 3.076923076923077, "eval_loss": 1.8883634805679321, "eval_runtime": 36.6015, "eval_samples_per_second": 10.737, "eval_steps_per_second": 1.366, "step": 600 }, { "epoch": 3.58974358974359, "grad_norm": 45.193939208984375, "learning_rate": 9.930900000000002e-06, "loss": 1.8403, "step": 700 }, { "epoch": 3.58974358974359, "eval_loss": 1.8506474494934082, "eval_runtime": 36.454, "eval_samples_per_second": 10.781, "eval_steps_per_second": 1.372, "step": 700 }, { "epoch": 4.102564102564102, "grad_norm": 27.302494049072266, "learning_rate": 9.920900000000002e-06, "loss": 1.7976, "step": 800 }, { "epoch": 4.102564102564102, "eval_loss": 1.8370662927627563, "eval_runtime": 36.834, "eval_samples_per_second": 10.67, "eval_steps_per_second": 1.357, "step": 800 }, { "epoch": 4.615384615384615, "grad_norm": 52.6607666015625, "learning_rate": 9.9109e-06, "loss": 1.7508, "step": 900 }, { "epoch": 4.615384615384615, "eval_loss": 1.8037244081497192, "eval_runtime": 36.9939, "eval_samples_per_second": 10.623, "eval_steps_per_second": 1.352, "step": 900 }, { "epoch": 5.128205128205128, "grad_norm": 59.508033752441406, "learning_rate": 9.9009e-06, "loss": 1.7383, "step": 1000 }, { "epoch": 5.128205128205128, "eval_loss": 1.7986633777618408, "eval_runtime": 36.8356, "eval_samples_per_second": 10.669, "eval_steps_per_second": 1.357, "step": 1000 }, { "epoch": 5.641025641025641, "grad_norm": 71.58872985839844, "learning_rate": 9.8909e-06, "loss": 1.7361, "step": 1100 }, { "epoch": 5.641025641025641, "eval_loss": 1.7810852527618408, "eval_runtime": 37.0957, "eval_samples_per_second": 10.594, "eval_steps_per_second": 1.348, "step": 1100 }, { "epoch": 6.153846153846154, "grad_norm": 41.782066345214844, "learning_rate": 9.8809e-06, "loss": 1.682, "step": 1200 }, { "epoch": 6.153846153846154, "eval_loss": 1.777554988861084, "eval_runtime": 36.9173, "eval_samples_per_second": 10.645, "eval_steps_per_second": 1.354, "step": 1200 }, { "epoch": 6.666666666666667, "grad_norm": 40.28728485107422, "learning_rate": 9.8709e-06, "loss": 1.7216, "step": 1300 }, { "epoch": 6.666666666666667, "eval_loss": 1.7382572889328003, "eval_runtime": 36.8918, "eval_samples_per_second": 10.653, "eval_steps_per_second": 1.355, "step": 1300 }, { "epoch": 7.17948717948718, "grad_norm": 104.99211883544922, "learning_rate": 9.8609e-06, "loss": 1.6534, "step": 1400 }, { "epoch": 7.17948717948718, "eval_loss": 1.76559579372406, "eval_runtime": 36.3398, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 1400 }, { "epoch": 7.6923076923076925, "grad_norm": 29.326631546020508, "learning_rate": 9.8509e-06, "loss": 1.707, "step": 1500 }, { "epoch": 7.6923076923076925, "eval_loss": 1.750089406967163, "eval_runtime": 36.4098, "eval_samples_per_second": 10.794, "eval_steps_per_second": 1.373, "step": 1500 }, { "epoch": 8.205128205128204, "grad_norm": 37.941261291503906, "learning_rate": 9.840900000000001e-06, "loss": 1.6554, "step": 1600 }, { "epoch": 8.205128205128204, "eval_loss": 1.6900651454925537, "eval_runtime": 36.3226, "eval_samples_per_second": 10.82, "eval_steps_per_second": 1.377, "step": 1600 }, { "epoch": 8.717948717948717, "grad_norm": 44.60703659057617, "learning_rate": 9.830900000000001e-06, "loss": 1.6334, "step": 1700 }, { "epoch": 8.717948717948717, "eval_loss": 1.7162973880767822, "eval_runtime": 36.2672, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.379, "step": 1700 }, { "epoch": 9.23076923076923, "grad_norm": 34.127254486083984, "learning_rate": 9.820900000000001e-06, "loss": 1.6345, "step": 1800 }, { "epoch": 9.23076923076923, "eval_loss": 1.6906001567840576, "eval_runtime": 36.2264, "eval_samples_per_second": 10.848, "eval_steps_per_second": 1.38, "step": 1800 }, { "epoch": 9.743589743589745, "grad_norm": 60.377540588378906, "learning_rate": 9.810900000000001e-06, "loss": 1.598, "step": 1900 }, { "epoch": 9.743589743589745, "eval_loss": 1.6555503606796265, "eval_runtime": 36.3896, "eval_samples_per_second": 10.8, "eval_steps_per_second": 1.374, "step": 1900 }, { "epoch": 10.256410256410255, "grad_norm": 20.264404296875, "learning_rate": 9.800900000000001e-06, "loss": 1.5466, "step": 2000 }, { "epoch": 10.256410256410255, "eval_loss": 1.648037075996399, "eval_runtime": 36.4136, "eval_samples_per_second": 10.793, "eval_steps_per_second": 1.373, "step": 2000 }, { "epoch": 10.76923076923077, "grad_norm": 27.18608856201172, "learning_rate": 9.790900000000001e-06, "loss": 1.5865, "step": 2100 }, { "epoch": 10.76923076923077, "eval_loss": 1.6171936988830566, "eval_runtime": 36.1051, "eval_samples_per_second": 10.885, "eval_steps_per_second": 1.385, "step": 2100 }, { "epoch": 11.282051282051283, "grad_norm": 32.486331939697266, "learning_rate": 9.780900000000002e-06, "loss": 1.5284, "step": 2200 }, { "epoch": 11.282051282051283, "eval_loss": 1.5915095806121826, "eval_runtime": 36.1781, "eval_samples_per_second": 10.863, "eval_steps_per_second": 1.382, "step": 2200 }, { "epoch": 11.794871794871796, "grad_norm": 65.88719940185547, "learning_rate": 9.770900000000002e-06, "loss": 1.5514, "step": 2300 }, { "epoch": 11.794871794871796, "eval_loss": 1.5879931449890137, "eval_runtime": 36.3934, "eval_samples_per_second": 10.799, "eval_steps_per_second": 1.374, "step": 2300 }, { "epoch": 12.307692307692308, "grad_norm": 31.737024307250977, "learning_rate": 9.760900000000002e-06, "loss": 1.4941, "step": 2400 }, { "epoch": 12.307692307692308, "eval_loss": 1.583853006362915, "eval_runtime": 36.3797, "eval_samples_per_second": 10.803, "eval_steps_per_second": 1.374, "step": 2400 }, { "epoch": 12.820512820512821, "grad_norm": 45.48268508911133, "learning_rate": 9.7509e-06, "loss": 1.5097, "step": 2500 }, { "epoch": 12.820512820512821, "eval_loss": 1.5559026002883911, "eval_runtime": 36.1605, "eval_samples_per_second": 10.868, "eval_steps_per_second": 1.383, "step": 2500 }, { "epoch": 13.333333333333334, "grad_norm": 27.500398635864258, "learning_rate": 9.7409e-06, "loss": 1.5018, "step": 2600 }, { "epoch": 13.333333333333334, "eval_loss": 1.5453521013259888, "eval_runtime": 36.2887, "eval_samples_per_second": 10.83, "eval_steps_per_second": 1.378, "step": 2600 }, { "epoch": 13.846153846153847, "grad_norm": 32.49728775024414, "learning_rate": 9.7309e-06, "loss": 1.4804, "step": 2700 }, { "epoch": 13.846153846153847, "eval_loss": 1.5424816608428955, "eval_runtime": 36.359, "eval_samples_per_second": 10.809, "eval_steps_per_second": 1.375, "step": 2700 }, { "epoch": 14.35897435897436, "grad_norm": 38.46280288696289, "learning_rate": 9.7209e-06, "loss": 1.4826, "step": 2800 }, { "epoch": 14.35897435897436, "eval_loss": 1.5317177772521973, "eval_runtime": 36.3362, "eval_samples_per_second": 10.816, "eval_steps_per_second": 1.376, "step": 2800 }, { "epoch": 14.871794871794872, "grad_norm": 16.075960159301758, "learning_rate": 9.7109e-06, "loss": 1.4568, "step": 2900 }, { "epoch": 14.871794871794872, "eval_loss": 1.5241832733154297, "eval_runtime": 36.2025, "eval_samples_per_second": 10.856, "eval_steps_per_second": 1.381, "step": 2900 }, { "epoch": 15.384615384615385, "grad_norm": 27.334318161010742, "learning_rate": 9.7009e-06, "loss": 1.4176, "step": 3000 }, { "epoch": 15.384615384615385, "eval_loss": 1.520580768585205, "eval_runtime": 36.398, "eval_samples_per_second": 10.797, "eval_steps_per_second": 1.374, "step": 3000 }, { "epoch": 15.897435897435898, "grad_norm": 94.90784454345703, "learning_rate": 9.6909e-06, "loss": 1.4681, "step": 3100 }, { "epoch": 15.897435897435898, "eval_loss": 1.5268648862838745, "eval_runtime": 36.0541, "eval_samples_per_second": 10.9, "eval_steps_per_second": 1.387, "step": 3100 }, { "epoch": 16.41025641025641, "grad_norm": 16.697856903076172, "learning_rate": 9.6809e-06, "loss": 1.454, "step": 3200 }, { "epoch": 16.41025641025641, "eval_loss": 1.5157753229141235, "eval_runtime": 36.3172, "eval_samples_per_second": 10.821, "eval_steps_per_second": 1.377, "step": 3200 }, { "epoch": 16.923076923076923, "grad_norm": 54.05553436279297, "learning_rate": 9.670900000000001e-06, "loss": 1.4309, "step": 3300 }, { "epoch": 16.923076923076923, "eval_loss": 1.516249179840088, "eval_runtime": 36.2632, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.379, "step": 3300 }, { "epoch": 17.435897435897434, "grad_norm": 47.010475158691406, "learning_rate": 9.660900000000001e-06, "loss": 1.4571, "step": 3400 }, { "epoch": 17.435897435897434, "eval_loss": 1.5018247365951538, "eval_runtime": 36.2118, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.381, "step": 3400 }, { "epoch": 17.94871794871795, "grad_norm": 52.865718841552734, "learning_rate": 9.650900000000001e-06, "loss": 1.4168, "step": 3500 }, { "epoch": 17.94871794871795, "eval_loss": 1.4993616342544556, "eval_runtime": 36.2572, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 3500 }, { "epoch": 18.46153846153846, "grad_norm": 30.117380142211914, "learning_rate": 9.640900000000001e-06, "loss": 1.4275, "step": 3600 }, { "epoch": 18.46153846153846, "eval_loss": 1.4899998903274536, "eval_runtime": 36.4696, "eval_samples_per_second": 10.776, "eval_steps_per_second": 1.371, "step": 3600 }, { "epoch": 18.974358974358974, "grad_norm": 31.10028076171875, "learning_rate": 9.630900000000001e-06, "loss": 1.4148, "step": 3700 }, { "epoch": 18.974358974358974, "eval_loss": 1.5231629610061646, "eval_runtime": 36.4604, "eval_samples_per_second": 10.779, "eval_steps_per_second": 1.371, "step": 3700 }, { "epoch": 19.487179487179485, "grad_norm": 44.06697082519531, "learning_rate": 9.620900000000001e-06, "loss": 1.4057, "step": 3800 }, { "epoch": 19.487179487179485, "eval_loss": 1.4841217994689941, "eval_runtime": 36.3382, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 3800 }, { "epoch": 20.0, "grad_norm": 53.86429214477539, "learning_rate": 9.610900000000001e-06, "loss": 1.4302, "step": 3900 }, { "epoch": 20.0, "eval_loss": 1.477772831916809, "eval_runtime": 36.1794, "eval_samples_per_second": 10.863, "eval_steps_per_second": 1.382, "step": 3900 }, { "epoch": 20.51282051282051, "grad_norm": 80.95457458496094, "learning_rate": 9.600900000000002e-06, "loss": 1.4076, "step": 4000 }, { "epoch": 20.51282051282051, "eval_loss": 1.4769134521484375, "eval_runtime": 36.4725, "eval_samples_per_second": 10.775, "eval_steps_per_second": 1.371, "step": 4000 }, { "epoch": 21.025641025641026, "grad_norm": 32.276214599609375, "learning_rate": 9.5909e-06, "loss": 1.3868, "step": 4100 }, { "epoch": 21.025641025641026, "eval_loss": 1.463292121887207, "eval_runtime": 36.5192, "eval_samples_per_second": 10.761, "eval_steps_per_second": 1.369, "step": 4100 }, { "epoch": 21.53846153846154, "grad_norm": 54.65959167480469, "learning_rate": 9.5809e-06, "loss": 1.3795, "step": 4200 }, { "epoch": 21.53846153846154, "eval_loss": 1.4630039930343628, "eval_runtime": 36.5288, "eval_samples_per_second": 10.759, "eval_steps_per_second": 1.369, "step": 4200 }, { "epoch": 22.05128205128205, "grad_norm": 42.31818389892578, "learning_rate": 9.5709e-06, "loss": 1.3787, "step": 4300 }, { "epoch": 22.05128205128205, "eval_loss": 1.4471133947372437, "eval_runtime": 36.2949, "eval_samples_per_second": 10.828, "eval_steps_per_second": 1.378, "step": 4300 }, { "epoch": 22.564102564102566, "grad_norm": 34.44257736206055, "learning_rate": 9.5609e-06, "loss": 1.4027, "step": 4400 }, { "epoch": 22.564102564102566, "eval_loss": 1.4606964588165283, "eval_runtime": 36.5672, "eval_samples_per_second": 10.747, "eval_steps_per_second": 1.367, "step": 4400 }, { "epoch": 23.076923076923077, "grad_norm": 42.65989303588867, "learning_rate": 9.5509e-06, "loss": 1.3459, "step": 4500 }, { "epoch": 23.076923076923077, "eval_loss": 1.454709768295288, "eval_runtime": 37.2024, "eval_samples_per_second": 10.564, "eval_steps_per_second": 1.344, "step": 4500 }, { "epoch": 23.58974358974359, "grad_norm": 35.11396789550781, "learning_rate": 9.5409e-06, "loss": 1.3367, "step": 4600 }, { "epoch": 23.58974358974359, "eval_loss": 1.4562979936599731, "eval_runtime": 36.4579, "eval_samples_per_second": 10.78, "eval_steps_per_second": 1.371, "step": 4600 }, { "epoch": 24.102564102564102, "grad_norm": 32.71805953979492, "learning_rate": 9.5309e-06, "loss": 1.3575, "step": 4700 }, { "epoch": 24.102564102564102, "eval_loss": 1.4620144367218018, "eval_runtime": 36.4055, "eval_samples_per_second": 10.795, "eval_steps_per_second": 1.373, "step": 4700 }, { "epoch": 24.615384615384617, "grad_norm": 28.839757919311523, "learning_rate": 9.5209e-06, "loss": 1.3549, "step": 4800 }, { "epoch": 24.615384615384617, "eval_loss": 1.4431304931640625, "eval_runtime": 36.5027, "eval_samples_per_second": 10.766, "eval_steps_per_second": 1.37, "step": 4800 }, { "epoch": 25.128205128205128, "grad_norm": 45.3994140625, "learning_rate": 9.5109e-06, "loss": 1.3885, "step": 4900 }, { "epoch": 25.128205128205128, "eval_loss": 1.4312200546264648, "eval_runtime": 36.3039, "eval_samples_per_second": 10.825, "eval_steps_per_second": 1.377, "step": 4900 }, { "epoch": 25.641025641025642, "grad_norm": 22.972829818725586, "learning_rate": 9.5009e-06, "loss": 1.3469, "step": 5000 }, { "epoch": 25.641025641025642, "eval_loss": 1.416171669960022, "eval_runtime": 36.6695, "eval_samples_per_second": 10.717, "eval_steps_per_second": 1.364, "step": 5000 }, { "epoch": 26.153846153846153, "grad_norm": 77.40106964111328, "learning_rate": 9.490900000000001e-06, "loss": 1.3363, "step": 5100 }, { "epoch": 26.153846153846153, "eval_loss": 1.4090278148651123, "eval_runtime": 36.2573, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 5100 }, { "epoch": 26.666666666666668, "grad_norm": 29.757932662963867, "learning_rate": 9.480900000000001e-06, "loss": 1.3183, "step": 5200 }, { "epoch": 26.666666666666668, "eval_loss": 1.4073749780654907, "eval_runtime": 36.2439, "eval_samples_per_second": 10.843, "eval_steps_per_second": 1.38, "step": 5200 }, { "epoch": 27.17948717948718, "grad_norm": 56.78797149658203, "learning_rate": 9.470900000000001e-06, "loss": 1.3568, "step": 5300 }, { "epoch": 27.17948717948718, "eval_loss": 1.4153741598129272, "eval_runtime": 36.2756, "eval_samples_per_second": 10.834, "eval_steps_per_second": 1.378, "step": 5300 }, { "epoch": 27.692307692307693, "grad_norm": 62.353477478027344, "learning_rate": 9.460900000000001e-06, "loss": 1.3304, "step": 5400 }, { "epoch": 27.692307692307693, "eval_loss": 1.4334921836853027, "eval_runtime": 36.2626, "eval_samples_per_second": 10.838, "eval_steps_per_second": 1.379, "step": 5400 }, { "epoch": 28.205128205128204, "grad_norm": 100.7852554321289, "learning_rate": 9.450900000000001e-06, "loss": 1.2897, "step": 5500 }, { "epoch": 28.205128205128204, "eval_loss": 1.4160270690917969, "eval_runtime": 36.6139, "eval_samples_per_second": 10.734, "eval_steps_per_second": 1.366, "step": 5500 }, { "epoch": 28.71794871794872, "grad_norm": 62.06657409667969, "learning_rate": 9.440900000000001e-06, "loss": 1.3233, "step": 5600 }, { "epoch": 28.71794871794872, "eval_loss": 1.431317687034607, "eval_runtime": 36.5341, "eval_samples_per_second": 10.757, "eval_steps_per_second": 1.369, "step": 5600 }, { "epoch": 29.23076923076923, "grad_norm": 32.661346435546875, "learning_rate": 9.4309e-06, "loss": 1.305, "step": 5700 }, { "epoch": 29.23076923076923, "eval_loss": 1.3954827785491943, "eval_runtime": 36.5903, "eval_samples_per_second": 10.741, "eval_steps_per_second": 1.366, "step": 5700 }, { "epoch": 29.743589743589745, "grad_norm": 25.690454483032227, "learning_rate": 9.421000000000002e-06, "loss": 1.2961, "step": 5800 }, { "epoch": 29.743589743589745, "eval_loss": 1.4036046266555786, "eval_runtime": 36.5935, "eval_samples_per_second": 10.74, "eval_steps_per_second": 1.366, "step": 5800 }, { "epoch": 30.256410256410255, "grad_norm": 45.45426940917969, "learning_rate": 9.411000000000002e-06, "loss": 1.3175, "step": 5900 }, { "epoch": 30.256410256410255, "eval_loss": 1.3845292329788208, "eval_runtime": 36.5594, "eval_samples_per_second": 10.75, "eval_steps_per_second": 1.368, "step": 5900 }, { "epoch": 30.76923076923077, "grad_norm": 41.62439727783203, "learning_rate": 9.401000000000002e-06, "loss": 1.3242, "step": 6000 }, { "epoch": 30.76923076923077, "eval_loss": 1.3939634561538696, "eval_runtime": 36.4895, "eval_samples_per_second": 10.77, "eval_steps_per_second": 1.37, "step": 6000 }, { "epoch": 31.28205128205128, "grad_norm": 26.999319076538086, "learning_rate": 9.391e-06, "loss": 1.2886, "step": 6100 }, { "epoch": 31.28205128205128, "eval_loss": 1.3804558515548706, "eval_runtime": 36.2599, "eval_samples_per_second": 10.838, "eval_steps_per_second": 1.379, "step": 6100 }, { "epoch": 31.794871794871796, "grad_norm": 24.70287322998047, "learning_rate": 9.381e-06, "loss": 1.2893, "step": 6200 }, { "epoch": 31.794871794871796, "eval_loss": 1.3821990489959717, "eval_runtime": 36.2613, "eval_samples_per_second": 10.838, "eval_steps_per_second": 1.379, "step": 6200 }, { "epoch": 32.30769230769231, "grad_norm": 41.910606384277344, "learning_rate": 9.371e-06, "loss": 1.3093, "step": 6300 }, { "epoch": 32.30769230769231, "eval_loss": 1.3875064849853516, "eval_runtime": 36.4998, "eval_samples_per_second": 10.767, "eval_steps_per_second": 1.37, "step": 6300 }, { "epoch": 32.82051282051282, "grad_norm": 82.20216369628906, "learning_rate": 9.361e-06, "loss": 1.3184, "step": 6400 }, { "epoch": 32.82051282051282, "eval_loss": 1.3840612173080444, "eval_runtime": 36.4787, "eval_samples_per_second": 10.773, "eval_steps_per_second": 1.371, "step": 6400 }, { "epoch": 33.333333333333336, "grad_norm": 797.53271484375, "learning_rate": 9.351e-06, "loss": 1.2939, "step": 6500 }, { "epoch": 33.333333333333336, "eval_loss": 1.3881950378417969, "eval_runtime": 36.4166, "eval_samples_per_second": 10.792, "eval_steps_per_second": 1.373, "step": 6500 }, { "epoch": 33.84615384615385, "grad_norm": 35.14693069458008, "learning_rate": 9.341000000000001e-06, "loss": 1.2881, "step": 6600 }, { "epoch": 33.84615384615385, "eval_loss": 1.4039666652679443, "eval_runtime": 36.4141, "eval_samples_per_second": 10.793, "eval_steps_per_second": 1.373, "step": 6600 }, { "epoch": 34.35897435897436, "grad_norm": 38.676666259765625, "learning_rate": 9.331000000000001e-06, "loss": 1.2699, "step": 6700 }, { "epoch": 34.35897435897436, "eval_loss": 1.3800386190414429, "eval_runtime": 36.2367, "eval_samples_per_second": 10.845, "eval_steps_per_second": 1.38, "step": 6700 }, { "epoch": 34.87179487179487, "grad_norm": 26.051170349121094, "learning_rate": 9.321000000000001e-06, "loss": 1.3079, "step": 6800 }, { "epoch": 34.87179487179487, "eval_loss": 1.3784704208374023, "eval_runtime": 36.3119, "eval_samples_per_second": 10.823, "eval_steps_per_second": 1.377, "step": 6800 }, { "epoch": 35.38461538461539, "grad_norm": 42.667236328125, "learning_rate": 9.311000000000001e-06, "loss": 1.2622, "step": 6900 }, { "epoch": 35.38461538461539, "eval_loss": 1.3637058734893799, "eval_runtime": 36.1116, "eval_samples_per_second": 10.883, "eval_steps_per_second": 1.385, "step": 6900 }, { "epoch": 35.8974358974359, "grad_norm": 38.78388977050781, "learning_rate": 9.301000000000001e-06, "loss": 1.2652, "step": 7000 }, { "epoch": 35.8974358974359, "eval_loss": 1.3452589511871338, "eval_runtime": 36.2593, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 7000 }, { "epoch": 36.41025641025641, "grad_norm": 44.43967056274414, "learning_rate": 9.291000000000001e-06, "loss": 1.2378, "step": 7100 }, { "epoch": 36.41025641025641, "eval_loss": 1.3494073152542114, "eval_runtime": 35.4402, "eval_samples_per_second": 11.089, "eval_steps_per_second": 1.411, "step": 7100 }, { "epoch": 36.92307692307692, "grad_norm": 49.02131271362305, "learning_rate": 9.281000000000001e-06, "loss": 1.2932, "step": 7200 }, { "epoch": 36.92307692307692, "eval_loss": 1.3460158109664917, "eval_runtime": 35.9361, "eval_samples_per_second": 10.936, "eval_steps_per_second": 1.391, "step": 7200 }, { "epoch": 37.43589743589744, "grad_norm": 28.279098510742188, "learning_rate": 9.271000000000002e-06, "loss": 1.2598, "step": 7300 }, { "epoch": 37.43589743589744, "eval_loss": 1.36253023147583, "eval_runtime": 36.2944, "eval_samples_per_second": 10.828, "eval_steps_per_second": 1.378, "step": 7300 }, { "epoch": 37.94871794871795, "grad_norm": 35.21017074584961, "learning_rate": 9.261000000000002e-06, "loss": 1.2703, "step": 7400 }, { "epoch": 37.94871794871795, "eval_loss": 1.3509865999221802, "eval_runtime": 36.2661, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.379, "step": 7400 }, { "epoch": 38.46153846153846, "grad_norm": 51.673316955566406, "learning_rate": 9.251000000000002e-06, "loss": 1.2393, "step": 7500 }, { "epoch": 38.46153846153846, "eval_loss": 1.3402855396270752, "eval_runtime": 36.392, "eval_samples_per_second": 10.799, "eval_steps_per_second": 1.374, "step": 7500 }, { "epoch": 38.97435897435897, "grad_norm": 53.73936462402344, "learning_rate": 9.241000000000002e-06, "loss": 1.2577, "step": 7600 }, { "epoch": 38.97435897435897, "eval_loss": 1.3487578630447388, "eval_runtime": 36.2119, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.381, "step": 7600 }, { "epoch": 39.48717948717949, "grad_norm": 55.994686126708984, "learning_rate": 9.231000000000002e-06, "loss": 1.229, "step": 7700 }, { "epoch": 39.48717948717949, "eval_loss": 1.340031623840332, "eval_runtime": 36.2063, "eval_samples_per_second": 10.854, "eval_steps_per_second": 1.381, "step": 7700 }, { "epoch": 40.0, "grad_norm": 86.7531509399414, "learning_rate": 9.221e-06, "loss": 1.2941, "step": 7800 }, { "epoch": 40.0, "eval_loss": 1.3422337770462036, "eval_runtime": 36.7462, "eval_samples_per_second": 10.695, "eval_steps_per_second": 1.361, "step": 7800 }, { "epoch": 40.51282051282051, "grad_norm": 60.86371612548828, "learning_rate": 9.211e-06, "loss": 1.2423, "step": 7900 }, { "epoch": 40.51282051282051, "eval_loss": 1.3336257934570312, "eval_runtime": 36.2441, "eval_samples_per_second": 10.843, "eval_steps_per_second": 1.38, "step": 7900 }, { "epoch": 41.02564102564103, "grad_norm": 28.535411834716797, "learning_rate": 9.2011e-06, "loss": 1.2676, "step": 8000 }, { "epoch": 41.02564102564103, "eval_loss": 1.338461995124817, "eval_runtime": 36.2212, "eval_samples_per_second": 10.85, "eval_steps_per_second": 1.38, "step": 8000 }, { "epoch": 41.53846153846154, "grad_norm": 35.707183837890625, "learning_rate": 9.1911e-06, "loss": 1.2428, "step": 8100 }, { "epoch": 41.53846153846154, "eval_loss": 1.3225666284561157, "eval_runtime": 36.2043, "eval_samples_per_second": 10.855, "eval_steps_per_second": 1.381, "step": 8100 }, { "epoch": 42.05128205128205, "grad_norm": 29.23111343383789, "learning_rate": 9.181100000000001e-06, "loss": 1.2269, "step": 8200 }, { "epoch": 42.05128205128205, "eval_loss": 1.3405094146728516, "eval_runtime": 36.2498, "eval_samples_per_second": 10.841, "eval_steps_per_second": 1.379, "step": 8200 }, { "epoch": 42.56410256410256, "grad_norm": 20.379304885864258, "learning_rate": 9.171100000000001e-06, "loss": 1.2187, "step": 8300 }, { "epoch": 42.56410256410256, "eval_loss": 1.3247309923171997, "eval_runtime": 36.3237, "eval_samples_per_second": 10.819, "eval_steps_per_second": 1.377, "step": 8300 }, { "epoch": 43.07692307692308, "grad_norm": 44.43791198730469, "learning_rate": 9.161100000000001e-06, "loss": 1.2321, "step": 8400 }, { "epoch": 43.07692307692308, "eval_loss": 1.334086298942566, "eval_runtime": 36.2039, "eval_samples_per_second": 10.855, "eval_steps_per_second": 1.381, "step": 8400 }, { "epoch": 43.58974358974359, "grad_norm": 30.97890853881836, "learning_rate": 9.151100000000001e-06, "loss": 1.2071, "step": 8500 }, { "epoch": 43.58974358974359, "eval_loss": 1.3306576013565063, "eval_runtime": 36.3547, "eval_samples_per_second": 10.81, "eval_steps_per_second": 1.375, "step": 8500 }, { "epoch": 44.1025641025641, "grad_norm": 40.07706832885742, "learning_rate": 9.141100000000001e-06, "loss": 1.25, "step": 8600 }, { "epoch": 44.1025641025641, "eval_loss": 1.3270679712295532, "eval_runtime": 36.1754, "eval_samples_per_second": 10.864, "eval_steps_per_second": 1.382, "step": 8600 }, { "epoch": 44.61538461538461, "grad_norm": 27.011186599731445, "learning_rate": 9.1311e-06, "loss": 1.1968, "step": 8700 }, { "epoch": 44.61538461538461, "eval_loss": 1.3117327690124512, "eval_runtime": 36.1727, "eval_samples_per_second": 10.865, "eval_steps_per_second": 1.382, "step": 8700 }, { "epoch": 45.12820512820513, "grad_norm": 25.976228713989258, "learning_rate": 9.1211e-06, "loss": 1.2492, "step": 8800 }, { "epoch": 45.12820512820513, "eval_loss": 1.3281300067901611, "eval_runtime": 36.2531, "eval_samples_per_second": 10.84, "eval_steps_per_second": 1.379, "step": 8800 }, { "epoch": 45.64102564102564, "grad_norm": 21.215715408325195, "learning_rate": 9.1111e-06, "loss": 1.221, "step": 8900 }, { "epoch": 45.64102564102564, "eval_loss": 1.3373734951019287, "eval_runtime": 36.2242, "eval_samples_per_second": 10.849, "eval_steps_per_second": 1.38, "step": 8900 }, { "epoch": 46.15384615384615, "grad_norm": 48.8258171081543, "learning_rate": 9.1011e-06, "loss": 1.2123, "step": 9000 }, { "epoch": 46.15384615384615, "eval_loss": 1.3303853273391724, "eval_runtime": 36.4019, "eval_samples_per_second": 10.796, "eval_steps_per_second": 1.374, "step": 9000 }, { "epoch": 46.666666666666664, "grad_norm": 36.76605224609375, "learning_rate": 9.0911e-06, "loss": 1.1951, "step": 9100 }, { "epoch": 46.666666666666664, "eval_loss": 1.3182373046875, "eval_runtime": 36.387, "eval_samples_per_second": 10.801, "eval_steps_per_second": 1.374, "step": 9100 }, { "epoch": 47.17948717948718, "grad_norm": 40.79771423339844, "learning_rate": 9.0811e-06, "loss": 1.2155, "step": 9200 }, { "epoch": 47.17948717948718, "eval_loss": 1.3303160667419434, "eval_runtime": 36.2265, "eval_samples_per_second": 10.848, "eval_steps_per_second": 1.38, "step": 9200 }, { "epoch": 47.69230769230769, "grad_norm": 48.06431579589844, "learning_rate": 9.0711e-06, "loss": 1.2236, "step": 9300 }, { "epoch": 47.69230769230769, "eval_loss": 1.3128286600112915, "eval_runtime": 36.3665, "eval_samples_per_second": 10.807, "eval_steps_per_second": 1.375, "step": 9300 }, { "epoch": 48.205128205128204, "grad_norm": 31.19647216796875, "learning_rate": 9.0611e-06, "loss": 1.2033, "step": 9400 }, { "epoch": 48.205128205128204, "eval_loss": 1.3134888410568237, "eval_runtime": 36.5129, "eval_samples_per_second": 10.763, "eval_steps_per_second": 1.369, "step": 9400 }, { "epoch": 48.717948717948715, "grad_norm": 20.11866569519043, "learning_rate": 9.0511e-06, "loss": 1.1955, "step": 9500 }, { "epoch": 48.717948717948715, "eval_loss": 1.3154560327529907, "eval_runtime": 36.6023, "eval_samples_per_second": 10.737, "eval_steps_per_second": 1.366, "step": 9500 }, { "epoch": 49.23076923076923, "grad_norm": 36.3143424987793, "learning_rate": 9.0411e-06, "loss": 1.2067, "step": 9600 }, { "epoch": 49.23076923076923, "eval_loss": 1.3158589601516724, "eval_runtime": 36.4851, "eval_samples_per_second": 10.772, "eval_steps_per_second": 1.37, "step": 9600 }, { "epoch": 49.743589743589745, "grad_norm": 48.41688537597656, "learning_rate": 9.0311e-06, "loss": 1.2295, "step": 9700 }, { "epoch": 49.743589743589745, "eval_loss": 1.306788682937622, "eval_runtime": 36.291, "eval_samples_per_second": 10.829, "eval_steps_per_second": 1.378, "step": 9700 }, { "epoch": 50.256410256410255, "grad_norm": 26.129995346069336, "learning_rate": 9.0211e-06, "loss": 1.1809, "step": 9800 }, { "epoch": 50.256410256410255, "eval_loss": 1.3299375772476196, "eval_runtime": 36.4651, "eval_samples_per_second": 10.777, "eval_steps_per_second": 1.371, "step": 9800 }, { "epoch": 50.76923076923077, "grad_norm": 19.543821334838867, "learning_rate": 9.011100000000001e-06, "loss": 1.2179, "step": 9900 }, { "epoch": 50.76923076923077, "eval_loss": 1.317457675933838, "eval_runtime": 36.2864, "eval_samples_per_second": 10.831, "eval_steps_per_second": 1.378, "step": 9900 }, { "epoch": 51.282051282051285, "grad_norm": 25.619775772094727, "learning_rate": 9.001100000000001e-06, "loss": 1.1653, "step": 10000 }, { "epoch": 51.282051282051285, "eval_loss": 1.31196928024292, "eval_runtime": 36.3263, "eval_samples_per_second": 10.819, "eval_steps_per_second": 1.376, "step": 10000 }, { "epoch": 51.794871794871796, "grad_norm": 45.30315017700195, "learning_rate": 8.991100000000001e-06, "loss": 1.2391, "step": 10100 }, { "epoch": 51.794871794871796, "eval_loss": 1.305709719657898, "eval_runtime": 36.2103, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.381, "step": 10100 }, { "epoch": 52.30769230769231, "grad_norm": 26.942337036132812, "learning_rate": 8.981100000000001e-06, "loss": 1.2195, "step": 10200 }, { "epoch": 52.30769230769231, "eval_loss": 1.3068941831588745, "eval_runtime": 36.2565, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 10200 }, { "epoch": 52.82051282051282, "grad_norm": 26.20073890686035, "learning_rate": 8.9711e-06, "loss": 1.1639, "step": 10300 }, { "epoch": 52.82051282051282, "eval_loss": 1.3013452291488647, "eval_runtime": 36.2146, "eval_samples_per_second": 10.852, "eval_steps_per_second": 1.381, "step": 10300 }, { "epoch": 53.333333333333336, "grad_norm": 41.40350341796875, "learning_rate": 8.9611e-06, "loss": 1.2033, "step": 10400 }, { "epoch": 53.333333333333336, "eval_loss": 1.305737853050232, "eval_runtime": 36.4865, "eval_samples_per_second": 10.771, "eval_steps_per_second": 1.37, "step": 10400 }, { "epoch": 53.84615384615385, "grad_norm": 28.133567810058594, "learning_rate": 8.9511e-06, "loss": 1.1906, "step": 10500 }, { "epoch": 53.84615384615385, "eval_loss": 1.2961195707321167, "eval_runtime": 36.2734, "eval_samples_per_second": 10.834, "eval_steps_per_second": 1.378, "step": 10500 }, { "epoch": 54.35897435897436, "grad_norm": 44.07390213012695, "learning_rate": 8.9411e-06, "loss": 1.1899, "step": 10600 }, { "epoch": 54.35897435897436, "eval_loss": 1.3024916648864746, "eval_runtime": 36.4774, "eval_samples_per_second": 10.774, "eval_steps_per_second": 1.371, "step": 10600 }, { "epoch": 54.87179487179487, "grad_norm": 19.120830535888672, "learning_rate": 8.9311e-06, "loss": 1.1697, "step": 10700 }, { "epoch": 54.87179487179487, "eval_loss": 1.3056432008743286, "eval_runtime": 36.2367, "eval_samples_per_second": 10.845, "eval_steps_per_second": 1.38, "step": 10700 }, { "epoch": 55.38461538461539, "grad_norm": 52.376529693603516, "learning_rate": 8.9211e-06, "loss": 1.1759, "step": 10800 }, { "epoch": 55.38461538461539, "eval_loss": 1.3018929958343506, "eval_runtime": 36.1478, "eval_samples_per_second": 10.872, "eval_steps_per_second": 1.383, "step": 10800 }, { "epoch": 55.8974358974359, "grad_norm": 41.84946060180664, "learning_rate": 8.9112e-06, "loss": 1.1973, "step": 10900 }, { "epoch": 55.8974358974359, "eval_loss": 1.3166084289550781, "eval_runtime": 36.4967, "eval_samples_per_second": 10.768, "eval_steps_per_second": 1.37, "step": 10900 }, { "epoch": 56.41025641025641, "grad_norm": 48.97800064086914, "learning_rate": 8.9012e-06, "loss": 1.1942, "step": 11000 }, { "epoch": 56.41025641025641, "eval_loss": 1.3040730953216553, "eval_runtime": 36.3391, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 11000 }, { "epoch": 56.92307692307692, "grad_norm": 24.18547821044922, "learning_rate": 8.8912e-06, "loss": 1.1544, "step": 11100 }, { "epoch": 56.92307692307692, "eval_loss": 1.2837135791778564, "eval_runtime": 36.2839, "eval_samples_per_second": 10.831, "eval_steps_per_second": 1.378, "step": 11100 }, { "epoch": 57.43589743589744, "grad_norm": 34.69540023803711, "learning_rate": 8.8812e-06, "loss": 1.1998, "step": 11200 }, { "epoch": 57.43589743589744, "eval_loss": 1.2983756065368652, "eval_runtime": 36.2024, "eval_samples_per_second": 10.856, "eval_steps_per_second": 1.381, "step": 11200 }, { "epoch": 57.94871794871795, "grad_norm": 30.074583053588867, "learning_rate": 8.8712e-06, "loss": 1.1352, "step": 11300 }, { "epoch": 57.94871794871795, "eval_loss": 1.2913649082183838, "eval_runtime": 36.0977, "eval_samples_per_second": 10.887, "eval_steps_per_second": 1.385, "step": 11300 }, { "epoch": 58.46153846153846, "grad_norm": 26.75031852722168, "learning_rate": 8.8612e-06, "loss": 1.1728, "step": 11400 }, { "epoch": 58.46153846153846, "eval_loss": 1.288116216659546, "eval_runtime": 36.2588, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 11400 }, { "epoch": 58.97435897435897, "grad_norm": 49.548213958740234, "learning_rate": 8.851200000000001e-06, "loss": 1.1738, "step": 11500 }, { "epoch": 58.97435897435897, "eval_loss": 1.2846206426620483, "eval_runtime": 36.5245, "eval_samples_per_second": 10.76, "eval_steps_per_second": 1.369, "step": 11500 }, { "epoch": 59.48717948717949, "grad_norm": 23.007057189941406, "learning_rate": 8.841200000000001e-06, "loss": 1.1501, "step": 11600 }, { "epoch": 59.48717948717949, "eval_loss": 1.297021746635437, "eval_runtime": 36.2678, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.379, "step": 11600 }, { "epoch": 60.0, "grad_norm": 39.79067611694336, "learning_rate": 8.831200000000001e-06, "loss": 1.1836, "step": 11700 }, { "epoch": 60.0, "eval_loss": 1.2865136861801147, "eval_runtime": 36.1127, "eval_samples_per_second": 10.883, "eval_steps_per_second": 1.385, "step": 11700 }, { "epoch": 60.51282051282051, "grad_norm": 24.281373977661133, "learning_rate": 8.821200000000001e-06, "loss": 1.1548, "step": 11800 }, { "epoch": 60.51282051282051, "eval_loss": 1.2812024354934692, "eval_runtime": 36.2399, "eval_samples_per_second": 10.844, "eval_steps_per_second": 1.38, "step": 11800 }, { "epoch": 61.02564102564103, "grad_norm": 30.851072311401367, "learning_rate": 8.811200000000001e-06, "loss": 1.1794, "step": 11900 }, { "epoch": 61.02564102564103, "eval_loss": 1.2902381420135498, "eval_runtime": 36.1264, "eval_samples_per_second": 10.878, "eval_steps_per_second": 1.384, "step": 11900 }, { "epoch": 61.53846153846154, "grad_norm": 44.42039108276367, "learning_rate": 8.801200000000001e-06, "loss": 1.1385, "step": 12000 }, { "epoch": 61.53846153846154, "eval_loss": 1.2793415784835815, "eval_runtime": 36.266, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.379, "step": 12000 }, { "epoch": 62.05128205128205, "grad_norm": 57.410274505615234, "learning_rate": 8.791200000000001e-06, "loss": 1.1697, "step": 12100 }, { "epoch": 62.05128205128205, "eval_loss": 1.2847199440002441, "eval_runtime": 36.1783, "eval_samples_per_second": 10.863, "eval_steps_per_second": 1.382, "step": 12100 }, { "epoch": 62.56410256410256, "grad_norm": 70.70729064941406, "learning_rate": 8.781200000000002e-06, "loss": 1.1518, "step": 12200 }, { "epoch": 62.56410256410256, "eval_loss": 1.2760446071624756, "eval_runtime": 36.0527, "eval_samples_per_second": 10.901, "eval_steps_per_second": 1.387, "step": 12200 }, { "epoch": 63.07692307692308, "grad_norm": 32.417388916015625, "learning_rate": 8.7712e-06, "loss": 1.1677, "step": 12300 }, { "epoch": 63.07692307692308, "eval_loss": 1.2847411632537842, "eval_runtime": 36.3923, "eval_samples_per_second": 10.799, "eval_steps_per_second": 1.374, "step": 12300 }, { "epoch": 63.58974358974359, "grad_norm": 24.372791290283203, "learning_rate": 8.7612e-06, "loss": 1.1433, "step": 12400 }, { "epoch": 63.58974358974359, "eval_loss": 1.2779407501220703, "eval_runtime": 36.5015, "eval_samples_per_second": 10.767, "eval_steps_per_second": 1.37, "step": 12400 }, { "epoch": 64.1025641025641, "grad_norm": 19.632272720336914, "learning_rate": 8.7512e-06, "loss": 1.1607, "step": 12500 }, { "epoch": 64.1025641025641, "eval_loss": 1.2792208194732666, "eval_runtime": 36.4617, "eval_samples_per_second": 10.778, "eval_steps_per_second": 1.371, "step": 12500 }, { "epoch": 64.61538461538461, "grad_norm": 34.54841613769531, "learning_rate": 8.7412e-06, "loss": 1.1371, "step": 12600 }, { "epoch": 64.61538461538461, "eval_loss": 1.2620294094085693, "eval_runtime": 36.4969, "eval_samples_per_second": 10.768, "eval_steps_per_second": 1.37, "step": 12600 }, { "epoch": 65.12820512820512, "grad_norm": 19.67386817932129, "learning_rate": 8.7312e-06, "loss": 1.1332, "step": 12700 }, { "epoch": 65.12820512820512, "eval_loss": 1.2682890892028809, "eval_runtime": 36.6309, "eval_samples_per_second": 10.729, "eval_steps_per_second": 1.365, "step": 12700 }, { "epoch": 65.64102564102564, "grad_norm": 65.07030487060547, "learning_rate": 8.7212e-06, "loss": 1.1571, "step": 12800 }, { "epoch": 65.64102564102564, "eval_loss": 1.2490720748901367, "eval_runtime": 36.3056, "eval_samples_per_second": 10.825, "eval_steps_per_second": 1.377, "step": 12800 }, { "epoch": 66.15384615384616, "grad_norm": 20.384132385253906, "learning_rate": 8.7112e-06, "loss": 1.1619, "step": 12900 }, { "epoch": 66.15384615384616, "eval_loss": 1.2465028762817383, "eval_runtime": 36.1678, "eval_samples_per_second": 10.866, "eval_steps_per_second": 1.382, "step": 12900 }, { "epoch": 66.66666666666667, "grad_norm": 36.320674896240234, "learning_rate": 8.7012e-06, "loss": 1.1176, "step": 13000 }, { "epoch": 66.66666666666667, "eval_loss": 1.2594362497329712, "eval_runtime": 36.3117, "eval_samples_per_second": 10.823, "eval_steps_per_second": 1.377, "step": 13000 }, { "epoch": 67.17948717948718, "grad_norm": 30.79526138305664, "learning_rate": 8.6912e-06, "loss": 1.1311, "step": 13100 }, { "epoch": 67.17948717948718, "eval_loss": 1.2553967237472534, "eval_runtime": 36.2667, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.379, "step": 13100 }, { "epoch": 67.6923076923077, "grad_norm": 40.0444450378418, "learning_rate": 8.6812e-06, "loss": 1.165, "step": 13200 }, { "epoch": 67.6923076923077, "eval_loss": 1.2607430219650269, "eval_runtime": 36.339, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 13200 }, { "epoch": 68.2051282051282, "grad_norm": 25.53687286376953, "learning_rate": 8.671200000000001e-06, "loss": 1.1334, "step": 13300 }, { "epoch": 68.2051282051282, "eval_loss": 1.2592159509658813, "eval_runtime": 36.2847, "eval_samples_per_second": 10.831, "eval_steps_per_second": 1.378, "step": 13300 }, { "epoch": 68.71794871794872, "grad_norm": 28.198373794555664, "learning_rate": 8.661200000000001e-06, "loss": 1.1481, "step": 13400 }, { "epoch": 68.71794871794872, "eval_loss": 1.2755882740020752, "eval_runtime": 36.6238, "eval_samples_per_second": 10.731, "eval_steps_per_second": 1.365, "step": 13400 }, { "epoch": 69.23076923076923, "grad_norm": 44.77776336669922, "learning_rate": 8.651200000000001e-06, "loss": 1.1138, "step": 13500 }, { "epoch": 69.23076923076923, "eval_loss": 1.259446144104004, "eval_runtime": 36.4048, "eval_samples_per_second": 10.795, "eval_steps_per_second": 1.373, "step": 13500 }, { "epoch": 69.74358974358974, "grad_norm": 73.0951156616211, "learning_rate": 8.641200000000001e-06, "loss": 1.149, "step": 13600 }, { "epoch": 69.74358974358974, "eval_loss": 1.2759552001953125, "eval_runtime": 35.9924, "eval_samples_per_second": 10.919, "eval_steps_per_second": 1.389, "step": 13600 }, { "epoch": 70.25641025641026, "grad_norm": 43.95732116699219, "learning_rate": 8.631200000000001e-06, "loss": 1.15, "step": 13700 }, { "epoch": 70.25641025641026, "eval_loss": 1.2675697803497314, "eval_runtime": 36.2305, "eval_samples_per_second": 10.847, "eval_steps_per_second": 1.38, "step": 13700 }, { "epoch": 70.76923076923077, "grad_norm": 28.56161117553711, "learning_rate": 8.621200000000001e-06, "loss": 1.1065, "step": 13800 }, { "epoch": 70.76923076923077, "eval_loss": 1.256949782371521, "eval_runtime": 36.2833, "eval_samples_per_second": 10.831, "eval_steps_per_second": 1.378, "step": 13800 }, { "epoch": 71.28205128205128, "grad_norm": 80.90894317626953, "learning_rate": 8.611200000000002e-06, "loss": 1.1111, "step": 13900 }, { "epoch": 71.28205128205128, "eval_loss": 1.2672077417373657, "eval_runtime": 36.2401, "eval_samples_per_second": 10.844, "eval_steps_per_second": 1.38, "step": 13900 }, { "epoch": 71.7948717948718, "grad_norm": 15.566130638122559, "learning_rate": 8.6012e-06, "loss": 1.1487, "step": 14000 }, { "epoch": 71.7948717948718, "eval_loss": 1.2434508800506592, "eval_runtime": 36.3731, "eval_samples_per_second": 10.805, "eval_steps_per_second": 1.375, "step": 14000 }, { "epoch": 72.3076923076923, "grad_norm": 41.25931930541992, "learning_rate": 8.5912e-06, "loss": 1.1357, "step": 14100 }, { "epoch": 72.3076923076923, "eval_loss": 1.25618314743042, "eval_runtime": 36.1143, "eval_samples_per_second": 10.882, "eval_steps_per_second": 1.384, "step": 14100 }, { "epoch": 72.82051282051282, "grad_norm": 24.271724700927734, "learning_rate": 8.5812e-06, "loss": 1.1039, "step": 14200 }, { "epoch": 72.82051282051282, "eval_loss": 1.2586956024169922, "eval_runtime": 36.2558, "eval_samples_per_second": 10.84, "eval_steps_per_second": 1.379, "step": 14200 }, { "epoch": 73.33333333333333, "grad_norm": 23.148712158203125, "learning_rate": 8.5713e-06, "loss": 1.1332, "step": 14300 }, { "epoch": 73.33333333333333, "eval_loss": 1.241268515586853, "eval_runtime": 36.617, "eval_samples_per_second": 10.733, "eval_steps_per_second": 1.365, "step": 14300 }, { "epoch": 73.84615384615384, "grad_norm": 39.42832946777344, "learning_rate": 8.5613e-06, "loss": 1.1276, "step": 14400 }, { "epoch": 73.84615384615384, "eval_loss": 1.261016607284546, "eval_runtime": 36.2412, "eval_samples_per_second": 10.844, "eval_steps_per_second": 1.38, "step": 14400 }, { "epoch": 74.35897435897436, "grad_norm": 41.2990837097168, "learning_rate": 8.5513e-06, "loss": 1.1259, "step": 14500 }, { "epoch": 74.35897435897436, "eval_loss": 1.254787802696228, "eval_runtime": 36.1616, "eval_samples_per_second": 10.868, "eval_steps_per_second": 1.383, "step": 14500 }, { "epoch": 74.87179487179488, "grad_norm": 75.0270767211914, "learning_rate": 8.5413e-06, "loss": 1.0919, "step": 14600 }, { "epoch": 74.87179487179488, "eval_loss": 1.2456586360931396, "eval_runtime": 36.3316, "eval_samples_per_second": 10.817, "eval_steps_per_second": 1.376, "step": 14600 }, { "epoch": 75.38461538461539, "grad_norm": 23.520156860351562, "learning_rate": 8.5313e-06, "loss": 1.1415, "step": 14700 }, { "epoch": 75.38461538461539, "eval_loss": 1.260399580001831, "eval_runtime": 36.3274, "eval_samples_per_second": 10.818, "eval_steps_per_second": 1.376, "step": 14700 }, { "epoch": 75.8974358974359, "grad_norm": 23.925405502319336, "learning_rate": 8.521300000000001e-06, "loss": 1.1435, "step": 14800 }, { "epoch": 75.8974358974359, "eval_loss": 1.2679468393325806, "eval_runtime": 36.2707, "eval_samples_per_second": 10.835, "eval_steps_per_second": 1.379, "step": 14800 }, { "epoch": 76.41025641025641, "grad_norm": 19.826759338378906, "learning_rate": 8.511300000000001e-06, "loss": 1.1034, "step": 14900 }, { "epoch": 76.41025641025641, "eval_loss": 1.260425090789795, "eval_runtime": 36.2564, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 14900 }, { "epoch": 76.92307692307692, "grad_norm": 36.41432189941406, "learning_rate": 8.501300000000001e-06, "loss": 1.1181, "step": 15000 }, { "epoch": 76.92307692307692, "eval_loss": 1.275189757347107, "eval_runtime": 36.226, "eval_samples_per_second": 10.849, "eval_steps_per_second": 1.38, "step": 15000 }, { "epoch": 77.43589743589743, "grad_norm": 56.50348663330078, "learning_rate": 8.491300000000001e-06, "loss": 1.117, "step": 15100 }, { "epoch": 77.43589743589743, "eval_loss": 1.2597932815551758, "eval_runtime": 36.1736, "eval_samples_per_second": 10.864, "eval_steps_per_second": 1.382, "step": 15100 }, { "epoch": 77.94871794871794, "grad_norm": 16.227319717407227, "learning_rate": 8.481300000000001e-06, "loss": 1.1287, "step": 15200 }, { "epoch": 77.94871794871794, "eval_loss": 1.2599974870681763, "eval_runtime": 36.2653, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.379, "step": 15200 }, { "epoch": 78.46153846153847, "grad_norm": 34.07974624633789, "learning_rate": 8.471300000000001e-06, "loss": 1.1484, "step": 15300 }, { "epoch": 78.46153846153847, "eval_loss": 1.2516891956329346, "eval_runtime": 36.47, "eval_samples_per_second": 10.776, "eval_steps_per_second": 1.371, "step": 15300 }, { "epoch": 78.97435897435898, "grad_norm": 48.17190933227539, "learning_rate": 8.461300000000001e-06, "loss": 1.0917, "step": 15400 }, { "epoch": 78.97435897435898, "eval_loss": 1.244437336921692, "eval_runtime": 36.2847, "eval_samples_per_second": 10.831, "eval_steps_per_second": 1.378, "step": 15400 }, { "epoch": 79.48717948717949, "grad_norm": 34.02452087402344, "learning_rate": 8.451300000000002e-06, "loss": 1.0924, "step": 15500 }, { "epoch": 79.48717948717949, "eval_loss": 1.2553346157073975, "eval_runtime": 36.3446, "eval_samples_per_second": 10.813, "eval_steps_per_second": 1.376, "step": 15500 }, { "epoch": 80.0, "grad_norm": 12.990086555480957, "learning_rate": 8.441300000000002e-06, "loss": 1.1319, "step": 15600 }, { "epoch": 80.0, "eval_loss": 1.2469161748886108, "eval_runtime": 36.3089, "eval_samples_per_second": 10.824, "eval_steps_per_second": 1.377, "step": 15600 }, { "epoch": 80.51282051282051, "grad_norm": 31.65691375732422, "learning_rate": 8.431300000000002e-06, "loss": 1.12, "step": 15700 }, { "epoch": 80.51282051282051, "eval_loss": 1.2401645183563232, "eval_runtime": 36.2741, "eval_samples_per_second": 10.834, "eval_steps_per_second": 1.378, "step": 15700 }, { "epoch": 81.02564102564102, "grad_norm": 21.566389083862305, "learning_rate": 8.421300000000002e-06, "loss": 1.1089, "step": 15800 }, { "epoch": 81.02564102564102, "eval_loss": 1.2469390630722046, "eval_runtime": 36.3979, "eval_samples_per_second": 10.797, "eval_steps_per_second": 1.374, "step": 15800 }, { "epoch": 81.53846153846153, "grad_norm": 27.354368209838867, "learning_rate": 8.411300000000002e-06, "loss": 1.1259, "step": 15900 }, { "epoch": 81.53846153846153, "eval_loss": 1.2625091075897217, "eval_runtime": 36.6689, "eval_samples_per_second": 10.718, "eval_steps_per_second": 1.364, "step": 15900 }, { "epoch": 82.05128205128206, "grad_norm": 29.60841178894043, "learning_rate": 8.4013e-06, "loss": 1.0668, "step": 16000 }, { "epoch": 82.05128205128206, "eval_loss": 1.2272534370422363, "eval_runtime": 36.5139, "eval_samples_per_second": 10.763, "eval_steps_per_second": 1.369, "step": 16000 }, { "epoch": 82.56410256410257, "grad_norm": 19.469263076782227, "learning_rate": 8.3913e-06, "loss": 1.1236, "step": 16100 }, { "epoch": 82.56410256410257, "eval_loss": 1.232078194618225, "eval_runtime": 36.3175, "eval_samples_per_second": 10.821, "eval_steps_per_second": 1.377, "step": 16100 }, { "epoch": 83.07692307692308, "grad_norm": 40.06100082397461, "learning_rate": 8.3813e-06, "loss": 1.0685, "step": 16200 }, { "epoch": 83.07692307692308, "eval_loss": 1.254823088645935, "eval_runtime": 36.5367, "eval_samples_per_second": 10.756, "eval_steps_per_second": 1.368, "step": 16200 }, { "epoch": 83.58974358974359, "grad_norm": 25.563325881958008, "learning_rate": 8.3713e-06, "loss": 1.0911, "step": 16300 }, { "epoch": 83.58974358974359, "eval_loss": 1.2462764978408813, "eval_runtime": 36.1602, "eval_samples_per_second": 10.868, "eval_steps_per_second": 1.383, "step": 16300 }, { "epoch": 84.1025641025641, "grad_norm": 41.810020446777344, "learning_rate": 8.3613e-06, "loss": 1.1009, "step": 16400 }, { "epoch": 84.1025641025641, "eval_loss": 1.2400792837142944, "eval_runtime": 36.2464, "eval_samples_per_second": 10.842, "eval_steps_per_second": 1.379, "step": 16400 }, { "epoch": 84.61538461538461, "grad_norm": 40.818111419677734, "learning_rate": 8.3513e-06, "loss": 1.0857, "step": 16500 }, { "epoch": 84.61538461538461, "eval_loss": 1.2257955074310303, "eval_runtime": 36.2933, "eval_samples_per_second": 10.828, "eval_steps_per_second": 1.378, "step": 16500 }, { "epoch": 85.12820512820512, "grad_norm": 33.36876678466797, "learning_rate": 8.341300000000001e-06, "loss": 1.1033, "step": 16600 }, { "epoch": 85.12820512820512, "eval_loss": 1.2494174242019653, "eval_runtime": 36.2089, "eval_samples_per_second": 10.854, "eval_steps_per_second": 1.381, "step": 16600 }, { "epoch": 85.64102564102564, "grad_norm": 41.88273620605469, "learning_rate": 8.331300000000001e-06, "loss": 1.0674, "step": 16700 }, { "epoch": 85.64102564102564, "eval_loss": 1.2431418895721436, "eval_runtime": 36.3141, "eval_samples_per_second": 10.822, "eval_steps_per_second": 1.377, "step": 16700 }, { "epoch": 86.15384615384616, "grad_norm": 26.26712989807129, "learning_rate": 8.321300000000001e-06, "loss": 1.1154, "step": 16800 }, { "epoch": 86.15384615384616, "eval_loss": 1.2349519729614258, "eval_runtime": 36.3687, "eval_samples_per_second": 10.806, "eval_steps_per_second": 1.375, "step": 16800 }, { "epoch": 86.66666666666667, "grad_norm": 28.719078063964844, "learning_rate": 8.311300000000001e-06, "loss": 1.0821, "step": 16900 }, { "epoch": 86.66666666666667, "eval_loss": 1.228055715560913, "eval_runtime": 36.3697, "eval_samples_per_second": 10.806, "eval_steps_per_second": 1.375, "step": 16900 }, { "epoch": 87.17948717948718, "grad_norm": 35.35209655761719, "learning_rate": 8.301300000000001e-06, "loss": 1.0829, "step": 17000 }, { "epoch": 87.17948717948718, "eval_loss": 1.241512417793274, "eval_runtime": 36.4655, "eval_samples_per_second": 10.777, "eval_steps_per_second": 1.371, "step": 17000 }, { "epoch": 87.6923076923077, "grad_norm": 24.084636688232422, "learning_rate": 8.291300000000001e-06, "loss": 1.0926, "step": 17100 }, { "epoch": 87.6923076923077, "eval_loss": 1.227530598640442, "eval_runtime": 36.183, "eval_samples_per_second": 10.861, "eval_steps_per_second": 1.382, "step": 17100 }, { "epoch": 88.2051282051282, "grad_norm": 35.995765686035156, "learning_rate": 8.281300000000002e-06, "loss": 1.076, "step": 17200 }, { "epoch": 88.2051282051282, "eval_loss": 1.231921911239624, "eval_runtime": 36.1958, "eval_samples_per_second": 10.858, "eval_steps_per_second": 1.381, "step": 17200 }, { "epoch": 88.71794871794872, "grad_norm": 23.116085052490234, "learning_rate": 8.271300000000002e-06, "loss": 1.0993, "step": 17300 }, { "epoch": 88.71794871794872, "eval_loss": 1.242146611213684, "eval_runtime": 36.2666, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.379, "step": 17300 }, { "epoch": 89.23076923076923, "grad_norm": 18.24385643005371, "learning_rate": 8.261300000000002e-06, "loss": 1.1213, "step": 17400 }, { "epoch": 89.23076923076923, "eval_loss": 1.230137825012207, "eval_runtime": 36.153, "eval_samples_per_second": 10.87, "eval_steps_per_second": 1.383, "step": 17400 }, { "epoch": 89.74358974358974, "grad_norm": 44.21913146972656, "learning_rate": 8.251300000000002e-06, "loss": 1.045, "step": 17500 }, { "epoch": 89.74358974358974, "eval_loss": 1.2343608140945435, "eval_runtime": 36.3254, "eval_samples_per_second": 10.819, "eval_steps_per_second": 1.376, "step": 17500 }, { "epoch": 90.25641025641026, "grad_norm": 29.37706756591797, "learning_rate": 8.2413e-06, "loss": 1.0805, "step": 17600 }, { "epoch": 90.25641025641026, "eval_loss": 1.2186076641082764, "eval_runtime": 36.3582, "eval_samples_per_second": 10.809, "eval_steps_per_second": 1.375, "step": 17600 }, { "epoch": 90.76923076923077, "grad_norm": 40.11149597167969, "learning_rate": 8.2313e-06, "loss": 1.0732, "step": 17700 }, { "epoch": 90.76923076923077, "eval_loss": 1.2361598014831543, "eval_runtime": 36.4168, "eval_samples_per_second": 10.792, "eval_steps_per_second": 1.373, "step": 17700 }, { "epoch": 91.28205128205128, "grad_norm": 33.00440216064453, "learning_rate": 8.2213e-06, "loss": 1.0912, "step": 17800 }, { "epoch": 91.28205128205128, "eval_loss": 1.2296538352966309, "eval_runtime": 36.2928, "eval_samples_per_second": 10.829, "eval_steps_per_second": 1.378, "step": 17800 }, { "epoch": 91.7948717948718, "grad_norm": 30.941469192504883, "learning_rate": 8.2113e-06, "loss": 1.064, "step": 17900 }, { "epoch": 91.7948717948718, "eval_loss": 1.250794529914856, "eval_runtime": 36.4692, "eval_samples_per_second": 10.776, "eval_steps_per_second": 1.371, "step": 17900 }, { "epoch": 92.3076923076923, "grad_norm": 41.63932800292969, "learning_rate": 8.2013e-06, "loss": 1.0529, "step": 18000 }, { "epoch": 92.3076923076923, "eval_loss": 1.2209473848342896, "eval_runtime": 36.5494, "eval_samples_per_second": 10.753, "eval_steps_per_second": 1.368, "step": 18000 }, { "epoch": 92.82051282051282, "grad_norm": 34.083587646484375, "learning_rate": 8.1913e-06, "loss": 1.0849, "step": 18100 }, { "epoch": 92.82051282051282, "eval_loss": 1.2245945930480957, "eval_runtime": 36.3755, "eval_samples_per_second": 10.804, "eval_steps_per_second": 1.375, "step": 18100 }, { "epoch": 93.33333333333333, "grad_norm": 33.740848541259766, "learning_rate": 8.1813e-06, "loss": 1.0853, "step": 18200 }, { "epoch": 93.33333333333333, "eval_loss": 1.2368453741073608, "eval_runtime": 36.1346, "eval_samples_per_second": 10.876, "eval_steps_per_second": 1.384, "step": 18200 }, { "epoch": 93.84615384615384, "grad_norm": 22.13953971862793, "learning_rate": 8.171300000000001e-06, "loss": 1.09, "step": 18300 }, { "epoch": 93.84615384615384, "eval_loss": 1.2331533432006836, "eval_runtime": 36.4043, "eval_samples_per_second": 10.795, "eval_steps_per_second": 1.373, "step": 18300 }, { "epoch": 94.35897435897436, "grad_norm": 45.70988082885742, "learning_rate": 8.161300000000001e-06, "loss": 1.0543, "step": 18400 }, { "epoch": 94.35897435897436, "eval_loss": 1.216800570487976, "eval_runtime": 36.5884, "eval_samples_per_second": 10.741, "eval_steps_per_second": 1.367, "step": 18400 }, { "epoch": 94.87179487179488, "grad_norm": 38.62083435058594, "learning_rate": 8.151300000000001e-06, "loss": 1.09, "step": 18500 }, { "epoch": 94.87179487179488, "eval_loss": 1.24717378616333, "eval_runtime": 36.2986, "eval_samples_per_second": 10.827, "eval_steps_per_second": 1.377, "step": 18500 }, { "epoch": 95.38461538461539, "grad_norm": 40.52507400512695, "learning_rate": 8.141300000000001e-06, "loss": 1.1019, "step": 18600 }, { "epoch": 95.38461538461539, "eval_loss": 1.2494611740112305, "eval_runtime": 36.3553, "eval_samples_per_second": 10.81, "eval_steps_per_second": 1.375, "step": 18600 }, { "epoch": 95.8974358974359, "grad_norm": 70.1895523071289, "learning_rate": 8.131300000000001e-06, "loss": 1.0711, "step": 18700 }, { "epoch": 95.8974358974359, "eval_loss": 1.2522521018981934, "eval_runtime": 36.1421, "eval_samples_per_second": 10.874, "eval_steps_per_second": 1.383, "step": 18700 }, { "epoch": 96.41025641025641, "grad_norm": 45.69275665283203, "learning_rate": 8.121300000000001e-06, "loss": 1.1066, "step": 18800 }, { "epoch": 96.41025641025641, "eval_loss": 1.250695824623108, "eval_runtime": 36.2669, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.379, "step": 18800 }, { "epoch": 96.92307692307692, "grad_norm": 23.61644744873047, "learning_rate": 8.111300000000001e-06, "loss": 1.0967, "step": 18900 }, { "epoch": 96.92307692307692, "eval_loss": 1.2277597188949585, "eval_runtime": 36.3005, "eval_samples_per_second": 10.826, "eval_steps_per_second": 1.377, "step": 18900 }, { "epoch": 97.43589743589743, "grad_norm": 21.942943572998047, "learning_rate": 8.101300000000002e-06, "loss": 1.0704, "step": 19000 }, { "epoch": 97.43589743589743, "eval_loss": 1.2279075384140015, "eval_runtime": 36.189, "eval_samples_per_second": 10.86, "eval_steps_per_second": 1.382, "step": 19000 }, { "epoch": 97.94871794871794, "grad_norm": 36.2983512878418, "learning_rate": 8.091300000000002e-06, "loss": 1.0719, "step": 19100 }, { "epoch": 97.94871794871794, "eval_loss": 1.2093193531036377, "eval_runtime": 36.5953, "eval_samples_per_second": 10.739, "eval_steps_per_second": 1.366, "step": 19100 }, { "epoch": 98.46153846153847, "grad_norm": 73.0156021118164, "learning_rate": 8.0813e-06, "loss": 1.0538, "step": 19200 }, { "epoch": 98.46153846153847, "eval_loss": 1.2311538457870483, "eval_runtime": 36.4102, "eval_samples_per_second": 10.794, "eval_steps_per_second": 1.373, "step": 19200 }, { "epoch": 98.97435897435898, "grad_norm": 51.309017181396484, "learning_rate": 8.0713e-06, "loss": 1.0818, "step": 19300 }, { "epoch": 98.97435897435898, "eval_loss": 1.2250592708587646, "eval_runtime": 36.4117, "eval_samples_per_second": 10.793, "eval_steps_per_second": 1.373, "step": 19300 }, { "epoch": 99.48717948717949, "grad_norm": 15.101311683654785, "learning_rate": 8.0613e-06, "loss": 1.0656, "step": 19400 }, { "epoch": 99.48717948717949, "eval_loss": 1.233995795249939, "eval_runtime": 36.3917, "eval_samples_per_second": 10.799, "eval_steps_per_second": 1.374, "step": 19400 }, { "epoch": 100.0, "grad_norm": 39.63221740722656, "learning_rate": 8.0513e-06, "loss": 1.0716, "step": 19500 }, { "epoch": 100.0, "eval_loss": 1.2169172763824463, "eval_runtime": 36.466, "eval_samples_per_second": 10.777, "eval_steps_per_second": 1.371, "step": 19500 }, { "epoch": 100.51282051282051, "grad_norm": 28.275875091552734, "learning_rate": 8.0413e-06, "loss": 1.0863, "step": 19600 }, { "epoch": 100.51282051282051, "eval_loss": 1.235645055770874, "eval_runtime": 36.4873, "eval_samples_per_second": 10.771, "eval_steps_per_second": 1.37, "step": 19600 }, { "epoch": 101.02564102564102, "grad_norm": 46.1825065612793, "learning_rate": 8.0313e-06, "loss": 1.0254, "step": 19700 }, { "epoch": 101.02564102564102, "eval_loss": 1.2021634578704834, "eval_runtime": 36.3574, "eval_samples_per_second": 10.809, "eval_steps_per_second": 1.375, "step": 19700 }, { "epoch": 101.53846153846153, "grad_norm": 33.101219177246094, "learning_rate": 8.0213e-06, "loss": 1.0802, "step": 19800 }, { "epoch": 101.53846153846153, "eval_loss": 1.2263848781585693, "eval_runtime": 36.4102, "eval_samples_per_second": 10.794, "eval_steps_per_second": 1.373, "step": 19800 }, { "epoch": 102.05128205128206, "grad_norm": 28.070240020751953, "learning_rate": 8.0113e-06, "loss": 1.0209, "step": 19900 }, { "epoch": 102.05128205128206, "eval_loss": 1.2010384798049927, "eval_runtime": 36.5083, "eval_samples_per_second": 10.765, "eval_steps_per_second": 1.37, "step": 19900 }, { "epoch": 102.56410256410257, "grad_norm": 32.505916595458984, "learning_rate": 8.0013e-06, "loss": 1.0738, "step": 20000 }, { "epoch": 102.56410256410257, "eval_loss": 1.1892540454864502, "eval_runtime": 36.8658, "eval_samples_per_second": 10.66, "eval_steps_per_second": 1.356, "step": 20000 }, { "epoch": 103.07692307692308, "grad_norm": 116.78060150146484, "learning_rate": 7.991300000000001e-06, "loss": 1.0417, "step": 20100 }, { "epoch": 103.07692307692308, "eval_loss": 1.211946964263916, "eval_runtime": 36.0941, "eval_samples_per_second": 10.888, "eval_steps_per_second": 1.385, "step": 20100 }, { "epoch": 103.58974358974359, "grad_norm": 47.81857681274414, "learning_rate": 7.981300000000001e-06, "loss": 1.0576, "step": 20200 }, { "epoch": 103.58974358974359, "eval_loss": 1.2092243432998657, "eval_runtime": 36.2906, "eval_samples_per_second": 10.829, "eval_steps_per_second": 1.378, "step": 20200 }, { "epoch": 104.1025641025641, "grad_norm": 44.64072036743164, "learning_rate": 7.971300000000001e-06, "loss": 1.053, "step": 20300 }, { "epoch": 104.1025641025641, "eval_loss": 1.2181479930877686, "eval_runtime": 36.2566, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 20300 }, { "epoch": 104.61538461538461, "grad_norm": 25.469749450683594, "learning_rate": 7.961300000000001e-06, "loss": 1.0532, "step": 20400 }, { "epoch": 104.61538461538461, "eval_loss": 1.2095298767089844, "eval_runtime": 36.1952, "eval_samples_per_second": 10.858, "eval_steps_per_second": 1.381, "step": 20400 }, { "epoch": 105.12820512820512, "grad_norm": 18.33926773071289, "learning_rate": 7.951300000000001e-06, "loss": 1.0778, "step": 20500 }, { "epoch": 105.12820512820512, "eval_loss": 1.2227920293807983, "eval_runtime": 36.3372, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 20500 }, { "epoch": 105.64102564102564, "grad_norm": 67.40721130371094, "learning_rate": 7.941300000000001e-06, "loss": 1.0777, "step": 20600 }, { "epoch": 105.64102564102564, "eval_loss": 1.2083336114883423, "eval_runtime": 36.3182, "eval_samples_per_second": 10.821, "eval_steps_per_second": 1.377, "step": 20600 }, { "epoch": 106.15384615384616, "grad_norm": 24.97282600402832, "learning_rate": 7.931300000000001e-06, "loss": 1.0512, "step": 20700 }, { "epoch": 106.15384615384616, "eval_loss": 1.1997463703155518, "eval_runtime": 36.3993, "eval_samples_per_second": 10.797, "eval_steps_per_second": 1.374, "step": 20700 }, { "epoch": 106.66666666666667, "grad_norm": 56.40156173706055, "learning_rate": 7.9213e-06, "loss": 1.0712, "step": 20800 }, { "epoch": 106.66666666666667, "eval_loss": 1.2138112783432007, "eval_runtime": 35.549, "eval_samples_per_second": 11.055, "eval_steps_per_second": 1.407, "step": 20800 }, { "epoch": 107.17948717948718, "grad_norm": 28.606220245361328, "learning_rate": 7.9113e-06, "loss": 1.0497, "step": 20900 }, { "epoch": 107.17948717948718, "eval_loss": 1.2097153663635254, "eval_runtime": 36.5101, "eval_samples_per_second": 10.764, "eval_steps_per_second": 1.369, "step": 20900 }, { "epoch": 107.6923076923077, "grad_norm": 32.19448471069336, "learning_rate": 7.9013e-06, "loss": 1.0383, "step": 21000 }, { "epoch": 107.6923076923077, "eval_loss": 1.2029515504837036, "eval_runtime": 36.5264, "eval_samples_per_second": 10.759, "eval_steps_per_second": 1.369, "step": 21000 }, { "epoch": 108.2051282051282, "grad_norm": 14.996597290039062, "learning_rate": 7.8913e-06, "loss": 1.0571, "step": 21100 }, { "epoch": 108.2051282051282, "eval_loss": 1.2292343378067017, "eval_runtime": 36.1323, "eval_samples_per_second": 10.877, "eval_steps_per_second": 1.384, "step": 21100 }, { "epoch": 108.71794871794872, "grad_norm": 15.809494018554688, "learning_rate": 7.8814e-06, "loss": 1.0725, "step": 21200 }, { "epoch": 108.71794871794872, "eval_loss": 1.2223467826843262, "eval_runtime": 36.1509, "eval_samples_per_second": 10.871, "eval_steps_per_second": 1.383, "step": 21200 }, { "epoch": 109.23076923076923, "grad_norm": 55.448875427246094, "learning_rate": 7.8714e-06, "loss": 1.0401, "step": 21300 }, { "epoch": 109.23076923076923, "eval_loss": 1.2152210474014282, "eval_runtime": 36.4536, "eval_samples_per_second": 10.781, "eval_steps_per_second": 1.372, "step": 21300 }, { "epoch": 109.74358974358974, "grad_norm": 47.51198959350586, "learning_rate": 7.8614e-06, "loss": 1.0232, "step": 21400 }, { "epoch": 109.74358974358974, "eval_loss": 1.2074100971221924, "eval_runtime": 36.2117, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.381, "step": 21400 }, { "epoch": 110.25641025641026, "grad_norm": 22.616479873657227, "learning_rate": 7.8514e-06, "loss": 1.0837, "step": 21500 }, { "epoch": 110.25641025641026, "eval_loss": 1.2034764289855957, "eval_runtime": 36.2909, "eval_samples_per_second": 10.829, "eval_steps_per_second": 1.378, "step": 21500 }, { "epoch": 110.76923076923077, "grad_norm": 40.14712905883789, "learning_rate": 7.841400000000001e-06, "loss": 1.044, "step": 21600 }, { "epoch": 110.76923076923077, "eval_loss": 1.1942527294158936, "eval_runtime": 36.2254, "eval_samples_per_second": 10.849, "eval_steps_per_second": 1.38, "step": 21600 }, { "epoch": 111.28205128205128, "grad_norm": 89.68008422851562, "learning_rate": 7.831400000000001e-06, "loss": 1.0301, "step": 21700 }, { "epoch": 111.28205128205128, "eval_loss": 1.2042152881622314, "eval_runtime": 36.2099, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.381, "step": 21700 }, { "epoch": 111.7948717948718, "grad_norm": 40.1873664855957, "learning_rate": 7.8214e-06, "loss": 1.0513, "step": 21800 }, { "epoch": 111.7948717948718, "eval_loss": 1.201180100440979, "eval_runtime": 36.1821, "eval_samples_per_second": 10.862, "eval_steps_per_second": 1.382, "step": 21800 }, { "epoch": 112.3076923076923, "grad_norm": 30.849868774414062, "learning_rate": 7.8114e-06, "loss": 1.0514, "step": 21900 }, { "epoch": 112.3076923076923, "eval_loss": 1.2128338813781738, "eval_runtime": 36.2828, "eval_samples_per_second": 10.832, "eval_steps_per_second": 1.378, "step": 21900 }, { "epoch": 112.82051282051282, "grad_norm": 31.983945846557617, "learning_rate": 7.8014e-06, "loss": 1.0288, "step": 22000 }, { "epoch": 112.82051282051282, "eval_loss": 1.194412350654602, "eval_runtime": 36.4995, "eval_samples_per_second": 10.767, "eval_steps_per_second": 1.37, "step": 22000 }, { "epoch": 113.33333333333333, "grad_norm": 27.589929580688477, "learning_rate": 7.791400000000001e-06, "loss": 1.0131, "step": 22100 }, { "epoch": 113.33333333333333, "eval_loss": 1.2023619413375854, "eval_runtime": 36.2873, "eval_samples_per_second": 10.83, "eval_steps_per_second": 1.378, "step": 22100 }, { "epoch": 113.84615384615384, "grad_norm": 98.88153076171875, "learning_rate": 7.781400000000001e-06, "loss": 1.0648, "step": 22200 }, { "epoch": 113.84615384615384, "eval_loss": 1.1987013816833496, "eval_runtime": 36.3531, "eval_samples_per_second": 10.811, "eval_steps_per_second": 1.375, "step": 22200 }, { "epoch": 114.35897435897436, "grad_norm": 32.19725036621094, "learning_rate": 7.771400000000002e-06, "loss": 1.0401, "step": 22300 }, { "epoch": 114.35897435897436, "eval_loss": 1.2003728151321411, "eval_runtime": 36.294, "eval_samples_per_second": 10.828, "eval_steps_per_second": 1.378, "step": 22300 }, { "epoch": 114.87179487179488, "grad_norm": 29.627470016479492, "learning_rate": 7.761400000000002e-06, "loss": 1.0638, "step": 22400 }, { "epoch": 114.87179487179488, "eval_loss": 1.2163525819778442, "eval_runtime": 36.4332, "eval_samples_per_second": 10.787, "eval_steps_per_second": 1.372, "step": 22400 }, { "epoch": 115.38461538461539, "grad_norm": 38.58709716796875, "learning_rate": 7.751400000000002e-06, "loss": 1.0307, "step": 22500 }, { "epoch": 115.38461538461539, "eval_loss": 1.199642300605774, "eval_runtime": 36.3023, "eval_samples_per_second": 10.826, "eval_steps_per_second": 1.377, "step": 22500 }, { "epoch": 115.8974358974359, "grad_norm": 28.66075897216797, "learning_rate": 7.741400000000002e-06, "loss": 1.0276, "step": 22600 }, { "epoch": 115.8974358974359, "eval_loss": 1.213975429534912, "eval_runtime": 36.2211, "eval_samples_per_second": 10.85, "eval_steps_per_second": 1.38, "step": 22600 }, { "epoch": 116.41025641025641, "grad_norm": 39.12740707397461, "learning_rate": 7.731400000000002e-06, "loss": 1.0163, "step": 22700 }, { "epoch": 116.41025641025641, "eval_loss": 1.2231982946395874, "eval_runtime": 36.2424, "eval_samples_per_second": 10.844, "eval_steps_per_second": 1.38, "step": 22700 }, { "epoch": 116.92307692307692, "grad_norm": 46.517051696777344, "learning_rate": 7.7214e-06, "loss": 1.0463, "step": 22800 }, { "epoch": 116.92307692307692, "eval_loss": 1.183910846710205, "eval_runtime": 36.4609, "eval_samples_per_second": 10.779, "eval_steps_per_second": 1.371, "step": 22800 }, { "epoch": 117.43589743589743, "grad_norm": 49.0311393737793, "learning_rate": 7.7114e-06, "loss": 1.0236, "step": 22900 }, { "epoch": 117.43589743589743, "eval_loss": 1.209437608718872, "eval_runtime": 36.3858, "eval_samples_per_second": 10.801, "eval_steps_per_second": 1.374, "step": 22900 }, { "epoch": 117.94871794871794, "grad_norm": 25.398332595825195, "learning_rate": 7.7014e-06, "loss": 1.07, "step": 23000 }, { "epoch": 117.94871794871794, "eval_loss": 1.2132450342178345, "eval_runtime": 36.4343, "eval_samples_per_second": 10.787, "eval_steps_per_second": 1.372, "step": 23000 }, { "epoch": 118.46153846153847, "grad_norm": 49.871639251708984, "learning_rate": 7.6914e-06, "loss": 0.9935, "step": 23100 }, { "epoch": 118.46153846153847, "eval_loss": 1.2235440015792847, "eval_runtime": 36.3755, "eval_samples_per_second": 10.804, "eval_steps_per_second": 1.375, "step": 23100 }, { "epoch": 118.97435897435898, "grad_norm": 66.67717742919922, "learning_rate": 7.6814e-06, "loss": 1.0672, "step": 23200 }, { "epoch": 118.97435897435898, "eval_loss": 1.1966772079467773, "eval_runtime": 36.3466, "eval_samples_per_second": 10.813, "eval_steps_per_second": 1.376, "step": 23200 }, { "epoch": 119.48717948717949, "grad_norm": 43.98142623901367, "learning_rate": 7.6714e-06, "loss": 1.019, "step": 23300 }, { "epoch": 119.48717948717949, "eval_loss": 1.1935210227966309, "eval_runtime": 36.4031, "eval_samples_per_second": 10.796, "eval_steps_per_second": 1.374, "step": 23300 }, { "epoch": 120.0, "grad_norm": 29.781999588012695, "learning_rate": 7.661400000000001e-06, "loss": 1.0456, "step": 23400 }, { "epoch": 120.0, "eval_loss": 1.19161856174469, "eval_runtime": 36.6677, "eval_samples_per_second": 10.718, "eval_steps_per_second": 1.364, "step": 23400 }, { "epoch": 120.51282051282051, "grad_norm": 38.971858978271484, "learning_rate": 7.651400000000001e-06, "loss": 1.0488, "step": 23500 }, { "epoch": 120.51282051282051, "eval_loss": 1.2005729675292969, "eval_runtime": 36.8304, "eval_samples_per_second": 10.671, "eval_steps_per_second": 1.358, "step": 23500 }, { "epoch": 121.02564102564102, "grad_norm": 37.437660217285156, "learning_rate": 7.641400000000001e-06, "loss": 1.0196, "step": 23600 }, { "epoch": 121.02564102564102, "eval_loss": 1.1815942525863647, "eval_runtime": 36.5578, "eval_samples_per_second": 10.75, "eval_steps_per_second": 1.368, "step": 23600 }, { "epoch": 121.53846153846153, "grad_norm": 45.006248474121094, "learning_rate": 7.631500000000001e-06, "loss": 1.0247, "step": 23700 }, { "epoch": 121.53846153846153, "eval_loss": 1.1985987424850464, "eval_runtime": 36.9519, "eval_samples_per_second": 10.635, "eval_steps_per_second": 1.353, "step": 23700 }, { "epoch": 122.05128205128206, "grad_norm": 28.17504119873047, "learning_rate": 7.621500000000001e-06, "loss": 1.0278, "step": 23800 }, { "epoch": 122.05128205128206, "eval_loss": 1.1994553804397583, "eval_runtime": 36.7025, "eval_samples_per_second": 10.708, "eval_steps_per_second": 1.362, "step": 23800 }, { "epoch": 122.56410256410257, "grad_norm": 36.315181732177734, "learning_rate": 7.6116e-06, "loss": 1.0073, "step": 23900 }, { "epoch": 122.56410256410257, "eval_loss": 1.201116919517517, "eval_runtime": 36.7114, "eval_samples_per_second": 10.705, "eval_steps_per_second": 1.362, "step": 23900 }, { "epoch": 123.07692307692308, "grad_norm": 23.902212142944336, "learning_rate": 7.6016e-06, "loss": 1.0238, "step": 24000 }, { "epoch": 123.07692307692308, "eval_loss": 1.2095754146575928, "eval_runtime": 36.3531, "eval_samples_per_second": 10.811, "eval_steps_per_second": 1.375, "step": 24000 }, { "epoch": 123.58974358974359, "grad_norm": 29.814146041870117, "learning_rate": 7.5916e-06, "loss": 0.9958, "step": 24100 }, { "epoch": 123.58974358974359, "eval_loss": 1.1929839849472046, "eval_runtime": 36.7481, "eval_samples_per_second": 10.694, "eval_steps_per_second": 1.361, "step": 24100 }, { "epoch": 124.1025641025641, "grad_norm": 52.22597885131836, "learning_rate": 7.5816e-06, "loss": 1.0466, "step": 24200 }, { "epoch": 124.1025641025641, "eval_loss": 1.189598798751831, "eval_runtime": 36.3774, "eval_samples_per_second": 10.803, "eval_steps_per_second": 1.374, "step": 24200 }, { "epoch": 124.61538461538461, "grad_norm": 23.292490005493164, "learning_rate": 7.571600000000001e-06, "loss": 1.0451, "step": 24300 }, { "epoch": 124.61538461538461, "eval_loss": 1.2092961072921753, "eval_runtime": 36.3753, "eval_samples_per_second": 10.804, "eval_steps_per_second": 1.375, "step": 24300 }, { "epoch": 125.12820512820512, "grad_norm": 42.846275329589844, "learning_rate": 7.5616000000000014e-06, "loss": 1.0122, "step": 24400 }, { "epoch": 125.12820512820512, "eval_loss": 1.1754584312438965, "eval_runtime": 36.676, "eval_samples_per_second": 10.715, "eval_steps_per_second": 1.363, "step": 24400 }, { "epoch": 125.64102564102564, "grad_norm": 23.48101234436035, "learning_rate": 7.5516000000000015e-06, "loss": 1.0127, "step": 24500 }, { "epoch": 125.64102564102564, "eval_loss": 1.1745011806488037, "eval_runtime": 36.5027, "eval_samples_per_second": 10.766, "eval_steps_per_second": 1.37, "step": 24500 }, { "epoch": 126.15384615384616, "grad_norm": 32.6221809387207, "learning_rate": 7.541600000000001e-06, "loss": 1.0416, "step": 24600 }, { "epoch": 126.15384615384616, "eval_loss": 1.1841143369674683, "eval_runtime": 36.5596, "eval_samples_per_second": 10.75, "eval_steps_per_second": 1.368, "step": 24600 }, { "epoch": 126.66666666666667, "grad_norm": 44.3466682434082, "learning_rate": 7.531600000000001e-06, "loss": 1.0134, "step": 24700 }, { "epoch": 126.66666666666667, "eval_loss": 1.1873056888580322, "eval_runtime": 36.2722, "eval_samples_per_second": 10.835, "eval_steps_per_second": 1.378, "step": 24700 }, { "epoch": 127.17948717948718, "grad_norm": 35.96662902832031, "learning_rate": 7.521600000000001e-06, "loss": 1.0157, "step": 24800 }, { "epoch": 127.17948717948718, "eval_loss": 1.1866871118545532, "eval_runtime": 36.0895, "eval_samples_per_second": 10.89, "eval_steps_per_second": 1.385, "step": 24800 }, { "epoch": 127.6923076923077, "grad_norm": 26.58826446533203, "learning_rate": 7.511600000000001e-06, "loss": 1.0562, "step": 24900 }, { "epoch": 127.6923076923077, "eval_loss": 1.1805154085159302, "eval_runtime": 36.3065, "eval_samples_per_second": 10.825, "eval_steps_per_second": 1.377, "step": 24900 }, { "epoch": 128.2051282051282, "grad_norm": 60.84516906738281, "learning_rate": 7.501600000000001e-06, "loss": 0.9951, "step": 25000 }, { "epoch": 128.2051282051282, "eval_loss": 1.1816545724868774, "eval_runtime": 36.491, "eval_samples_per_second": 10.77, "eval_steps_per_second": 1.37, "step": 25000 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 513, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.5560791552e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }