{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.026055237102657634, "eval_steps": 4, "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010422094841063053, "grad_norm": 3.42587947845459, "learning_rate": 1.0000000000000002e-06, "loss": 6.4631, "step": 1 }, { "epoch": 0.0010422094841063053, "eval_loss": 6.671364784240723, "eval_runtime": 9.44, "eval_samples_per_second": 42.903, "eval_steps_per_second": 10.805, "step": 1 }, { "epoch": 0.0020844189682126106, "grad_norm": 3.787745714187622, "learning_rate": 2.0000000000000003e-06, "loss": 6.619, "step": 2 }, { "epoch": 0.003126628452318916, "grad_norm": 3.6949379444122314, "learning_rate": 3e-06, "loss": 6.5476, "step": 3 }, { "epoch": 0.004168837936425221, "grad_norm": 4.343378067016602, "learning_rate": 4.000000000000001e-06, "loss": 6.6851, "step": 4 }, { "epoch": 0.004168837936425221, "eval_loss": 6.672913074493408, "eval_runtime": 7.8311, "eval_samples_per_second": 51.717, "eval_steps_per_second": 13.025, "step": 4 }, { "epoch": 0.005211047420531527, "grad_norm": 4.449336528778076, "learning_rate": 5e-06, "loss": 6.7603, "step": 5 }, { "epoch": 0.006253256904637832, "grad_norm": 4.981034755706787, "learning_rate": 6e-06, "loss": 6.7741, "step": 6 }, { "epoch": 0.007295466388744137, "grad_norm": 4.62288236618042, "learning_rate": 7e-06, "loss": 7.3853, "step": 7 }, { "epoch": 0.008337675872850442, "grad_norm": 4.797128677368164, "learning_rate": 8.000000000000001e-06, "loss": 7.1027, "step": 8 }, { "epoch": 0.008337675872850442, "eval_loss": 6.669874668121338, "eval_runtime": 7.7933, "eval_samples_per_second": 51.968, "eval_steps_per_second": 13.088, "step": 8 }, { "epoch": 0.009379885356956748, "grad_norm": 4.196491718292236, "learning_rate": 9e-06, "loss": 6.7359, "step": 9 }, { "epoch": 0.010422094841063054, "grad_norm": 3.4589948654174805, "learning_rate": 1e-05, "loss": 6.5552, "step": 10 }, { "epoch": 0.011464304325169358, "grad_norm": 3.497850179672241, "learning_rate": 9.890738003669029e-06, "loss": 6.8426, "step": 11 }, { "epoch": 0.012506513809275664, "grad_norm": 3.9026501178741455, "learning_rate": 9.567727288213005e-06, "loss": 6.9201, "step": 12 }, { "epoch": 0.012506513809275664, "eval_loss": 6.656294822692871, "eval_runtime": 7.8499, "eval_samples_per_second": 51.593, "eval_steps_per_second": 12.994, "step": 12 }, { "epoch": 0.01354872329338197, "grad_norm": 3.805992603302002, "learning_rate": 9.045084971874738e-06, "loss": 6.4677, "step": 13 }, { "epoch": 0.014590932777488274, "grad_norm": 3.989011526107788, "learning_rate": 8.345653031794292e-06, "loss": 6.8528, "step": 14 }, { "epoch": 0.015633142261594582, "grad_norm": 4.253493785858154, "learning_rate": 7.500000000000001e-06, "loss": 6.8437, "step": 15 }, { "epoch": 0.016675351745700884, "grad_norm": 4.157368183135986, "learning_rate": 6.545084971874738e-06, "loss": 7.0895, "step": 16 }, { "epoch": 0.016675351745700884, "eval_loss": 6.632657051086426, "eval_runtime": 7.8431, "eval_samples_per_second": 51.638, "eval_steps_per_second": 13.005, "step": 16 }, { "epoch": 0.01771756122980719, "grad_norm": 4.2801594734191895, "learning_rate": 5.522642316338268e-06, "loss": 6.8602, "step": 17 }, { "epoch": 0.018759770713913496, "grad_norm": 3.379889726638794, "learning_rate": 4.477357683661734e-06, "loss": 6.3006, "step": 18 }, { "epoch": 0.019801980198019802, "grad_norm": 4.244846820831299, "learning_rate": 3.4549150281252635e-06, "loss": 6.6127, "step": 19 }, { "epoch": 0.020844189682126108, "grad_norm": 4.060556411743164, "learning_rate": 2.5000000000000015e-06, "loss": 6.3001, "step": 20 }, { "epoch": 0.020844189682126108, "eval_loss": 6.617997646331787, "eval_runtime": 7.8463, "eval_samples_per_second": 51.617, "eval_steps_per_second": 13.0, "step": 20 }, { "epoch": 0.021886399166232414, "grad_norm": 4.195801258087158, "learning_rate": 1.6543469682057105e-06, "loss": 7.1121, "step": 21 }, { "epoch": 0.022928608650338717, "grad_norm": 3.2973010540008545, "learning_rate": 9.549150281252633e-07, "loss": 5.6943, "step": 22 }, { "epoch": 0.023970818134445022, "grad_norm": 3.863523244857788, "learning_rate": 4.322727117869951e-07, "loss": 6.3595, "step": 23 }, { "epoch": 0.02501302761855133, "grad_norm": 3.6141159534454346, "learning_rate": 1.0926199633097156e-07, "loss": 6.1388, "step": 24 }, { "epoch": 0.02501302761855133, "eval_loss": 6.613474369049072, "eval_runtime": 7.81, "eval_samples_per_second": 51.856, "eval_steps_per_second": 13.06, "step": 24 }, { "epoch": 0.026055237102657634, "grad_norm": 4.716870307922363, "learning_rate": 0.0, "loss": 6.6627, "step": 25 } ], "logging_steps": 1, "max_steps": 25, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 347093965209600.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }