{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 11080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18050541516245489, "grad_norm": 31.555688858032227, "learning_rate": 2.864620938628159e-05, "loss": 1.3005, "step": 500 }, { "epoch": 0.36101083032490977, "grad_norm": 20.2050838470459, "learning_rate": 2.729241877256318e-05, "loss": 0.9978, "step": 1000 }, { "epoch": 0.5415162454873647, "grad_norm": 25.661306381225586, "learning_rate": 2.5938628158844765e-05, "loss": 0.9259, "step": 1500 }, { "epoch": 0.7220216606498195, "grad_norm": 22.348859786987305, "learning_rate": 2.4584837545126353e-05, "loss": 0.8798, "step": 2000 }, { "epoch": 0.9025270758122743, "grad_norm": 32.197166442871094, "learning_rate": 2.3231046931407943e-05, "loss": 0.8471, "step": 2500 }, { "epoch": 1.0830324909747293, "grad_norm": 20.764020919799805, "learning_rate": 2.1877256317689534e-05, "loss": 0.771, "step": 3000 }, { "epoch": 1.263537906137184, "grad_norm": 15.258258819580078, "learning_rate": 2.0523465703971117e-05, "loss": 0.6937, "step": 3500 }, { "epoch": 1.444043321299639, "grad_norm": 24.816614151000977, "learning_rate": 1.9169675090252708e-05, "loss": 0.709, "step": 4000 }, { "epoch": 1.6245487364620939, "grad_norm": 38.912071228027344, "learning_rate": 1.7815884476534298e-05, "loss": 0.6831, "step": 4500 }, { "epoch": 1.8050541516245486, "grad_norm": 8.910807609558105, "learning_rate": 1.6462093862815885e-05, "loss": 0.6742, "step": 5000 }, { "epoch": 1.9855595667870036, "grad_norm": 11.664189338684082, "learning_rate": 1.5108303249097474e-05, "loss": 0.69, "step": 5500 }, { "epoch": 2.1660649819494586, "grad_norm": 10.968308448791504, "learning_rate": 1.3754512635379063e-05, "loss": 0.5436, "step": 6000 }, { "epoch": 2.3465703971119134, "grad_norm": 11.711438179016113, "learning_rate": 1.240072202166065e-05, "loss": 0.5357, "step": 6500 }, { "epoch": 2.527075812274368, "grad_norm": 13.477335929870605, "learning_rate": 1.1046931407942239e-05, "loss": 0.5359, "step": 7000 }, { "epoch": 2.707581227436823, "grad_norm": 10.649256706237793, "learning_rate": 9.693140794223826e-06, "loss": 0.5394, "step": 7500 }, { "epoch": 2.888086642599278, "grad_norm": 10.525208473205566, "learning_rate": 8.339350180505416e-06, "loss": 0.5254, "step": 8000 }, { "epoch": 3.068592057761733, "grad_norm": 19.402320861816406, "learning_rate": 6.985559566787004e-06, "loss": 0.4775, "step": 8500 }, { "epoch": 3.2490974729241877, "grad_norm": 41.23615646362305, "learning_rate": 5.631768953068592e-06, "loss": 0.4003, "step": 9000 }, { "epoch": 3.4296028880866425, "grad_norm": 21.56231689453125, "learning_rate": 4.277978339350181e-06, "loss": 0.3952, "step": 9500 }, { "epoch": 3.6101083032490973, "grad_norm": 11.254490852355957, "learning_rate": 2.924187725631769e-06, "loss": 0.4007, "step": 10000 }, { "epoch": 3.7906137184115525, "grad_norm": 29.451414108276367, "learning_rate": 1.5703971119133576e-06, "loss": 0.3962, "step": 10500 }, { "epoch": 3.9711191335740073, "grad_norm": 16.022735595703125, "learning_rate": 2.1660649819494586e-07, "loss": 0.3853, "step": 11000 }, { "epoch": 4.0, "step": 11080, "total_flos": 1.0399493167607808e+16, "train_loss": 0.6484355885199261, "train_runtime": 8433.1347, "train_samples_per_second": 42.043, "train_steps_per_second": 1.314 } ], "logging_steps": 500, "max_steps": 11080, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0399493167607808e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }