{ "best_metric": null, "best_model_checkpoint": null, "epoch": 120.0, "eval_steps": 17748, "global_step": 177480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 12.0, "grad_norm": 22.951730728149414, "learning_rate": 9.000676132521975e-06, "loss": 2.3088, "step": 17748 }, { "epoch": 12.0, "eval_loss": 3.187467336654663, "eval_runtime": 33.5588, "eval_samples_per_second": 289.492, "eval_steps_per_second": 6.585, "step": 17748 }, { "epoch": 24.0, "grad_norm": 2.020862579345703, "learning_rate": 8.001126887536624e-06, "loss": 2.0566, "step": 35496 }, { "epoch": 24.0, "eval_loss": 3.3423843383789062, "eval_runtime": 31.6561, "eval_samples_per_second": 306.892, "eval_steps_per_second": 6.981, "step": 35496 }, { "epoch": 36.0, "grad_norm": 8.087705612182617, "learning_rate": 7.0016339869281044e-06, "loss": 1.9815, "step": 53244 }, { "epoch": 36.0, "eval_loss": 3.454538345336914, "eval_runtime": 31.7681, "eval_samples_per_second": 305.81, "eval_steps_per_second": 6.957, "step": 53244 }, { "epoch": 48.0, "grad_norm": 7.905551433563232, "learning_rate": 6.0020847419427546e-06, "loss": 1.9566, "step": 70992 }, { "epoch": 48.0, "eval_loss": 3.566826343536377, "eval_runtime": 31.9459, "eval_samples_per_second": 304.108, "eval_steps_per_second": 6.918, "step": 70992 }, { "epoch": 60.0, "grad_norm": 1.7085500955581665, "learning_rate": 5.0025918413342355e-06, "loss": 1.9488, "step": 88740 }, { "epoch": 60.0, "eval_loss": 3.5228500366210938, "eval_runtime": 31.684, "eval_samples_per_second": 306.622, "eval_steps_per_second": 6.975, "step": 88740 }, { "epoch": 72.0, "grad_norm": 0.28708475828170776, "learning_rate": 4.003098940725716e-06, "loss": 1.9424, "step": 106488 }, { "epoch": 72.0, "eval_loss": 3.677123785018921, "eval_runtime": 33.4796, "eval_samples_per_second": 290.177, "eval_steps_per_second": 6.601, "step": 106488 }, { "epoch": 84.0, "grad_norm": 0.5243535041809082, "learning_rate": 3.0035496957403653e-06, "loss": 1.9411, "step": 124236 }, { "epoch": 84.0, "eval_loss": 3.786760091781616, "eval_runtime": 31.6666, "eval_samples_per_second": 306.79, "eval_steps_per_second": 6.979, "step": 124236 }, { "epoch": 96.0, "grad_norm": 0.4514042139053345, "learning_rate": 2.004056795131846e-06, "loss": 1.9388, "step": 141984 }, { "epoch": 96.0, "eval_loss": 3.7067413330078125, "eval_runtime": 33.5223, "eval_samples_per_second": 289.807, "eval_steps_per_second": 6.593, "step": 141984 }, { "epoch": 108.0, "grad_norm": 0.2749279737472534, "learning_rate": 1.0045638945233266e-06, "loss": 1.9352, "step": 159732 }, { "epoch": 108.0, "eval_loss": 3.850806951522827, "eval_runtime": 31.5804, "eval_samples_per_second": 307.628, "eval_steps_per_second": 6.998, "step": 159732 }, { "epoch": 120.0, "grad_norm": 0.5212497115135193, "learning_rate": 5.01464953797611e-09, "loss": 1.9318, "step": 177480 }, { "epoch": 120.0, "eval_loss": 3.8198835849761963, "eval_runtime": 33.3012, "eval_samples_per_second": 291.731, "eval_steps_per_second": 6.636, "step": 177480 }, { "epoch": 120.0, "step": 177480, "total_flos": 2.803155588736635e+18, "train_loss": 1.9941566479321613, "train_runtime": 61251.1099, "train_samples_per_second": 115.842, "train_steps_per_second": 2.898 } ], "logging_steps": 17748, "max_steps": 177480, "num_input_tokens_seen": 0, "num_train_epochs": 120, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.803155588736635e+18, "train_batch_size": 40, "trial_name": null, "trial_params": null }