{ "best_metric": null, "best_model_checkpoint": null, "epoch": 23.076923076923077, "eval_steps": 100, "global_step": 4500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5128205128205128, "grad_norm": 73.68424224853516, "learning_rate": 9.9907e-06, "loss": 3.1457, "step": 100 }, { "epoch": 0.5128205128205128, "eval_loss": 2.2357213497161865, "eval_runtime": 36.4689, "eval_samples_per_second": 10.776, "eval_steps_per_second": 1.371, "step": 100 }, { "epoch": 1.0256410256410255, "grad_norm": 57.47202682495117, "learning_rate": 9.980800000000001e-06, "loss": 2.1614, "step": 200 }, { "epoch": 1.0256410256410255, "eval_loss": 2.0913825035095215, "eval_runtime": 36.349, "eval_samples_per_second": 10.812, "eval_steps_per_second": 1.376, "step": 200 }, { "epoch": 1.5384615384615383, "grad_norm": 176.88357543945312, "learning_rate": 9.970800000000001e-06, "loss": 2.0388, "step": 300 }, { "epoch": 1.5384615384615383, "eval_loss": 2.003911018371582, "eval_runtime": 36.2551, "eval_samples_per_second": 10.84, "eval_steps_per_second": 1.379, "step": 300 }, { "epoch": 2.051282051282051, "grad_norm": 55.31132507324219, "learning_rate": 9.960800000000001e-06, "loss": 1.9285, "step": 400 }, { "epoch": 2.051282051282051, "eval_loss": 1.9796075820922852, "eval_runtime": 36.4437, "eval_samples_per_second": 10.784, "eval_steps_per_second": 1.372, "step": 400 }, { "epoch": 2.564102564102564, "grad_norm": 41.900753021240234, "learning_rate": 9.9508e-06, "loss": 1.9523, "step": 500 }, { "epoch": 2.564102564102564, "eval_loss": 1.936122179031372, "eval_runtime": 36.5845, "eval_samples_per_second": 10.742, "eval_steps_per_second": 1.367, "step": 500 }, { "epoch": 3.076923076923077, "grad_norm": 50.21903991699219, "learning_rate": 9.9408e-06, "loss": 1.8452, "step": 600 }, { "epoch": 3.076923076923077, "eval_loss": 1.8883634805679321, "eval_runtime": 36.6015, "eval_samples_per_second": 10.737, "eval_steps_per_second": 1.366, "step": 600 }, { "epoch": 3.58974358974359, "grad_norm": 45.193939208984375, "learning_rate": 9.930900000000002e-06, "loss": 1.8403, "step": 700 }, { "epoch": 3.58974358974359, "eval_loss": 1.8506474494934082, "eval_runtime": 36.454, "eval_samples_per_second": 10.781, "eval_steps_per_second": 1.372, "step": 700 }, { "epoch": 4.102564102564102, "grad_norm": 27.302494049072266, "learning_rate": 9.920900000000002e-06, "loss": 1.7976, "step": 800 }, { "epoch": 4.102564102564102, "eval_loss": 1.8370662927627563, "eval_runtime": 36.834, "eval_samples_per_second": 10.67, "eval_steps_per_second": 1.357, "step": 800 }, { "epoch": 4.615384615384615, "grad_norm": 52.6607666015625, "learning_rate": 9.9109e-06, "loss": 1.7508, "step": 900 }, { "epoch": 4.615384615384615, "eval_loss": 1.8037244081497192, "eval_runtime": 36.9939, "eval_samples_per_second": 10.623, "eval_steps_per_second": 1.352, "step": 900 }, { "epoch": 5.128205128205128, "grad_norm": 59.508033752441406, "learning_rate": 9.9009e-06, "loss": 1.7383, "step": 1000 }, { "epoch": 5.128205128205128, "eval_loss": 1.7986633777618408, "eval_runtime": 36.8356, "eval_samples_per_second": 10.669, "eval_steps_per_second": 1.357, "step": 1000 }, { "epoch": 5.641025641025641, "grad_norm": 71.58872985839844, "learning_rate": 9.8909e-06, "loss": 1.7361, "step": 1100 }, { "epoch": 5.641025641025641, "eval_loss": 1.7810852527618408, "eval_runtime": 37.0957, "eval_samples_per_second": 10.594, "eval_steps_per_second": 1.348, "step": 1100 }, { "epoch": 6.153846153846154, "grad_norm": 41.782066345214844, "learning_rate": 9.8809e-06, "loss": 1.682, "step": 1200 }, { "epoch": 6.153846153846154, "eval_loss": 1.777554988861084, "eval_runtime": 36.9173, "eval_samples_per_second": 10.645, "eval_steps_per_second": 1.354, "step": 1200 }, { "epoch": 6.666666666666667, "grad_norm": 40.28728485107422, "learning_rate": 9.8709e-06, "loss": 1.7216, "step": 1300 }, { "epoch": 6.666666666666667, "eval_loss": 1.7382572889328003, "eval_runtime": 36.8918, "eval_samples_per_second": 10.653, "eval_steps_per_second": 1.355, "step": 1300 }, { "epoch": 7.17948717948718, "grad_norm": 104.99211883544922, "learning_rate": 9.8609e-06, "loss": 1.6534, "step": 1400 }, { "epoch": 7.17948717948718, "eval_loss": 1.76559579372406, "eval_runtime": 36.3398, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 1400 }, { "epoch": 7.6923076923076925, "grad_norm": 29.326631546020508, "learning_rate": 9.8509e-06, "loss": 1.707, "step": 1500 }, { "epoch": 7.6923076923076925, "eval_loss": 1.750089406967163, "eval_runtime": 36.4098, "eval_samples_per_second": 10.794, "eval_steps_per_second": 1.373, "step": 1500 }, { "epoch": 8.205128205128204, "grad_norm": 37.941261291503906, "learning_rate": 9.840900000000001e-06, "loss": 1.6554, "step": 1600 }, { "epoch": 8.205128205128204, "eval_loss": 1.6900651454925537, "eval_runtime": 36.3226, "eval_samples_per_second": 10.82, "eval_steps_per_second": 1.377, "step": 1600 }, { "epoch": 8.717948717948717, "grad_norm": 44.60703659057617, "learning_rate": 9.830900000000001e-06, "loss": 1.6334, "step": 1700 }, { "epoch": 8.717948717948717, "eval_loss": 1.7162973880767822, "eval_runtime": 36.2672, "eval_samples_per_second": 10.836, "eval_steps_per_second": 1.379, "step": 1700 }, { "epoch": 9.23076923076923, "grad_norm": 34.127254486083984, "learning_rate": 9.820900000000001e-06, "loss": 1.6345, "step": 1800 }, { "epoch": 9.23076923076923, "eval_loss": 1.6906001567840576, "eval_runtime": 36.2264, "eval_samples_per_second": 10.848, "eval_steps_per_second": 1.38, "step": 1800 }, { "epoch": 9.743589743589745, "grad_norm": 60.377540588378906, "learning_rate": 9.810900000000001e-06, "loss": 1.598, "step": 1900 }, { "epoch": 9.743589743589745, "eval_loss": 1.6555503606796265, "eval_runtime": 36.3896, "eval_samples_per_second": 10.8, "eval_steps_per_second": 1.374, "step": 1900 }, { "epoch": 10.256410256410255, "grad_norm": 20.264404296875, "learning_rate": 9.800900000000001e-06, "loss": 1.5466, "step": 2000 }, { "epoch": 10.256410256410255, "eval_loss": 1.648037075996399, "eval_runtime": 36.4136, "eval_samples_per_second": 10.793, "eval_steps_per_second": 1.373, "step": 2000 }, { "epoch": 10.76923076923077, "grad_norm": 27.18608856201172, "learning_rate": 9.790900000000001e-06, "loss": 1.5865, "step": 2100 }, { "epoch": 10.76923076923077, "eval_loss": 1.6171936988830566, "eval_runtime": 36.1051, "eval_samples_per_second": 10.885, "eval_steps_per_second": 1.385, "step": 2100 }, { "epoch": 11.282051282051283, "grad_norm": 32.486331939697266, "learning_rate": 9.780900000000002e-06, "loss": 1.5284, "step": 2200 }, { "epoch": 11.282051282051283, "eval_loss": 1.5915095806121826, "eval_runtime": 36.1781, "eval_samples_per_second": 10.863, "eval_steps_per_second": 1.382, "step": 2200 }, { "epoch": 11.794871794871796, "grad_norm": 65.88719940185547, "learning_rate": 9.770900000000002e-06, "loss": 1.5514, "step": 2300 }, { "epoch": 11.794871794871796, "eval_loss": 1.5879931449890137, "eval_runtime": 36.3934, "eval_samples_per_second": 10.799, "eval_steps_per_second": 1.374, "step": 2300 }, { "epoch": 12.307692307692308, "grad_norm": 31.737024307250977, "learning_rate": 9.760900000000002e-06, "loss": 1.4941, "step": 2400 }, { "epoch": 12.307692307692308, "eval_loss": 1.583853006362915, "eval_runtime": 36.3797, "eval_samples_per_second": 10.803, "eval_steps_per_second": 1.374, "step": 2400 }, { "epoch": 12.820512820512821, "grad_norm": 45.48268508911133, "learning_rate": 9.7509e-06, "loss": 1.5097, "step": 2500 }, { "epoch": 12.820512820512821, "eval_loss": 1.5559026002883911, "eval_runtime": 36.1605, "eval_samples_per_second": 10.868, "eval_steps_per_second": 1.383, "step": 2500 }, { "epoch": 13.333333333333334, "grad_norm": 27.500398635864258, "learning_rate": 9.7409e-06, "loss": 1.5018, "step": 2600 }, { "epoch": 13.333333333333334, "eval_loss": 1.5453521013259888, "eval_runtime": 36.2887, "eval_samples_per_second": 10.83, "eval_steps_per_second": 1.378, "step": 2600 }, { "epoch": 13.846153846153847, "grad_norm": 32.49728775024414, "learning_rate": 9.7309e-06, "loss": 1.4804, "step": 2700 }, { "epoch": 13.846153846153847, "eval_loss": 1.5424816608428955, "eval_runtime": 36.359, "eval_samples_per_second": 10.809, "eval_steps_per_second": 1.375, "step": 2700 }, { "epoch": 14.35897435897436, "grad_norm": 38.46280288696289, "learning_rate": 9.7209e-06, "loss": 1.4826, "step": 2800 }, { "epoch": 14.35897435897436, "eval_loss": 1.5317177772521973, "eval_runtime": 36.3362, "eval_samples_per_second": 10.816, "eval_steps_per_second": 1.376, "step": 2800 }, { "epoch": 14.871794871794872, "grad_norm": 16.075960159301758, "learning_rate": 9.7109e-06, "loss": 1.4568, "step": 2900 }, { "epoch": 14.871794871794872, "eval_loss": 1.5241832733154297, "eval_runtime": 36.2025, "eval_samples_per_second": 10.856, "eval_steps_per_second": 1.381, "step": 2900 }, { "epoch": 15.384615384615385, "grad_norm": 27.334318161010742, "learning_rate": 9.7009e-06, "loss": 1.4176, "step": 3000 }, { "epoch": 15.384615384615385, "eval_loss": 1.520580768585205, "eval_runtime": 36.398, "eval_samples_per_second": 10.797, "eval_steps_per_second": 1.374, "step": 3000 }, { "epoch": 15.897435897435898, "grad_norm": 94.90784454345703, "learning_rate": 9.6909e-06, "loss": 1.4681, "step": 3100 }, { "epoch": 15.897435897435898, "eval_loss": 1.5268648862838745, "eval_runtime": 36.0541, "eval_samples_per_second": 10.9, "eval_steps_per_second": 1.387, "step": 3100 }, { "epoch": 16.41025641025641, "grad_norm": 16.697856903076172, "learning_rate": 9.6809e-06, "loss": 1.454, "step": 3200 }, { "epoch": 16.41025641025641, "eval_loss": 1.5157753229141235, "eval_runtime": 36.3172, "eval_samples_per_second": 10.821, "eval_steps_per_second": 1.377, "step": 3200 }, { "epoch": 16.923076923076923, "grad_norm": 54.05553436279297, "learning_rate": 9.670900000000001e-06, "loss": 1.4309, "step": 3300 }, { "epoch": 16.923076923076923, "eval_loss": 1.516249179840088, "eval_runtime": 36.2632, "eval_samples_per_second": 10.837, "eval_steps_per_second": 1.379, "step": 3300 }, { "epoch": 17.435897435897434, "grad_norm": 47.010475158691406, "learning_rate": 9.660900000000001e-06, "loss": 1.4571, "step": 3400 }, { "epoch": 17.435897435897434, "eval_loss": 1.5018247365951538, "eval_runtime": 36.2118, "eval_samples_per_second": 10.853, "eval_steps_per_second": 1.381, "step": 3400 }, { "epoch": 17.94871794871795, "grad_norm": 52.865718841552734, "learning_rate": 9.650900000000001e-06, "loss": 1.4168, "step": 3500 }, { "epoch": 17.94871794871795, "eval_loss": 1.4993616342544556, "eval_runtime": 36.2572, "eval_samples_per_second": 10.839, "eval_steps_per_second": 1.379, "step": 3500 }, { "epoch": 18.46153846153846, "grad_norm": 30.117380142211914, "learning_rate": 9.640900000000001e-06, "loss": 1.4275, "step": 3600 }, { "epoch": 18.46153846153846, "eval_loss": 1.4899998903274536, "eval_runtime": 36.4696, "eval_samples_per_second": 10.776, "eval_steps_per_second": 1.371, "step": 3600 }, { "epoch": 18.974358974358974, "grad_norm": 31.10028076171875, "learning_rate": 9.630900000000001e-06, "loss": 1.4148, "step": 3700 }, { "epoch": 18.974358974358974, "eval_loss": 1.5231629610061646, "eval_runtime": 36.4604, "eval_samples_per_second": 10.779, "eval_steps_per_second": 1.371, "step": 3700 }, { "epoch": 19.487179487179485, "grad_norm": 44.06697082519531, "learning_rate": 9.620900000000001e-06, "loss": 1.4057, "step": 3800 }, { "epoch": 19.487179487179485, "eval_loss": 1.4841217994689941, "eval_runtime": 36.3382, "eval_samples_per_second": 10.815, "eval_steps_per_second": 1.376, "step": 3800 }, { "epoch": 20.0, "grad_norm": 53.86429214477539, "learning_rate": 9.610900000000001e-06, "loss": 1.4302, "step": 3900 }, { "epoch": 20.0, "eval_loss": 1.477772831916809, "eval_runtime": 36.1794, "eval_samples_per_second": 10.863, "eval_steps_per_second": 1.382, "step": 3900 }, { "epoch": 20.51282051282051, "grad_norm": 80.95457458496094, "learning_rate": 9.600900000000002e-06, "loss": 1.4076, "step": 4000 }, { "epoch": 20.51282051282051, "eval_loss": 1.4769134521484375, "eval_runtime": 36.4725, "eval_samples_per_second": 10.775, "eval_steps_per_second": 1.371, "step": 4000 }, { "epoch": 21.025641025641026, "grad_norm": 32.276214599609375, "learning_rate": 9.5909e-06, "loss": 1.3868, "step": 4100 }, { "epoch": 21.025641025641026, "eval_loss": 1.463292121887207, "eval_runtime": 36.5192, "eval_samples_per_second": 10.761, "eval_steps_per_second": 1.369, "step": 4100 }, { "epoch": 21.53846153846154, "grad_norm": 54.65959167480469, "learning_rate": 9.5809e-06, "loss": 1.3795, "step": 4200 }, { "epoch": 21.53846153846154, "eval_loss": 1.4630039930343628, "eval_runtime": 36.5288, "eval_samples_per_second": 10.759, "eval_steps_per_second": 1.369, "step": 4200 }, { "epoch": 22.05128205128205, "grad_norm": 42.31818389892578, "learning_rate": 9.5709e-06, "loss": 1.3787, "step": 4300 }, { "epoch": 22.05128205128205, "eval_loss": 1.4471133947372437, "eval_runtime": 36.2949, "eval_samples_per_second": 10.828, "eval_steps_per_second": 1.378, "step": 4300 }, { "epoch": 22.564102564102566, "grad_norm": 34.44257736206055, "learning_rate": 9.5609e-06, "loss": 1.4027, "step": 4400 }, { "epoch": 22.564102564102566, "eval_loss": 1.4606964588165283, "eval_runtime": 36.5672, "eval_samples_per_second": 10.747, "eval_steps_per_second": 1.367, "step": 4400 }, { "epoch": 23.076923076923077, "grad_norm": 42.65989303588867, "learning_rate": 9.5509e-06, "loss": 1.3459, "step": 4500 }, { "epoch": 23.076923076923077, "eval_loss": 1.454709768295288, "eval_runtime": 37.2024, "eval_samples_per_second": 10.564, "eval_steps_per_second": 1.344, "step": 4500 } ], "logging_steps": 100, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 513, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.720094247936e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }