{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 2820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.71, "grad_norm": 42.0918083190918, "learning_rate": 1.929078014184397e-06, "loss": 10.0975, "step": 100 }, { "epoch": 1.42, "grad_norm": 54.439884185791016, "learning_rate": 1.8581560283687943e-06, "loss": 5.8147, "step": 200 }, { "epoch": 2.13, "grad_norm": 60.155399322509766, "learning_rate": 1.7872340425531913e-06, "loss": 4.1279, "step": 300 }, { "epoch": 2.84, "grad_norm": 49.89198684692383, "learning_rate": 1.7163120567375885e-06, "loss": 2.8192, "step": 400 }, { "epoch": 3.55, "grad_norm": 24.662052154541016, "learning_rate": 1.6453900709219858e-06, "loss": 2.3071, "step": 500 }, { "epoch": 4.26, "grad_norm": 46.319374084472656, "learning_rate": 1.574468085106383e-06, "loss": 1.788, "step": 600 }, { "epoch": 4.96, "grad_norm": 19.011852264404297, "learning_rate": 1.50354609929078e-06, "loss": 1.6626, "step": 700 }, { "epoch": 5.67, "grad_norm": 24.53301239013672, "learning_rate": 1.4326241134751774e-06, "loss": 1.5502, "step": 800 }, { "epoch": 6.38, "grad_norm": 29.92524528503418, "learning_rate": 1.3617021276595744e-06, "loss": 1.4522, "step": 900 }, { "epoch": 7.09, "grad_norm": 49.022037506103516, "learning_rate": 1.2907801418439716e-06, "loss": 1.3627, "step": 1000 }, { "epoch": 7.8, "grad_norm": 15.343302726745605, "learning_rate": 1.2198581560283688e-06, "loss": 1.2928, "step": 1100 }, { "epoch": 8.51, "grad_norm": 20.46482276916504, "learning_rate": 1.148936170212766e-06, "loss": 1.2246, "step": 1200 }, { "epoch": 9.22, "grad_norm": 26.57284927368164, "learning_rate": 1.078014184397163e-06, "loss": 1.153, "step": 1300 }, { "epoch": 9.93, "grad_norm": 15.145112991333008, "learning_rate": 1.00709219858156e-06, "loss": 1.1536, "step": 1400 }, { "epoch": 10.64, "grad_norm": 19.93513298034668, "learning_rate": 9.361702127659575e-07, "loss": 1.0598, "step": 1500 }, { "epoch": 11.35, "grad_norm": 27.3331298828125, "learning_rate": 8.652482269503546e-07, "loss": 1.047, "step": 1600 }, { "epoch": 12.06, "grad_norm": 43.462093353271484, "learning_rate": 7.943262411347518e-07, "loss": 1.0587, "step": 1700 }, { "epoch": 12.77, "grad_norm": 36.50919723510742, "learning_rate": 7.23404255319149e-07, "loss": 1.0709, "step": 1800 }, { "epoch": 13.48, "grad_norm": 19.904586791992188, "learning_rate": 6.524822695035461e-07, "loss": 0.972, "step": 1900 }, { "epoch": 14.18, "grad_norm": 19.49388313293457, "learning_rate": 5.815602836879432e-07, "loss": 1.0102, "step": 2000 }, { "epoch": 14.89, "grad_norm": 16.17923927307129, "learning_rate": 5.106382978723403e-07, "loss": 0.9229, "step": 2100 }, { "epoch": 15.6, "grad_norm": 20.12999153137207, "learning_rate": 4.397163120567376e-07, "loss": 0.9488, "step": 2200 }, { "epoch": 16.31, "grad_norm": 26.802228927612305, "learning_rate": 3.687943262411347e-07, "loss": 0.9173, "step": 2300 }, { "epoch": 17.02, "grad_norm": 30.8055419921875, "learning_rate": 2.978723404255319e-07, "loss": 0.9306, "step": 2400 }, { "epoch": 17.73, "grad_norm": 15.404391288757324, "learning_rate": 2.2695035460992907e-07, "loss": 0.898, "step": 2500 }, { "epoch": 18.44, "grad_norm": 11.279256820678711, "learning_rate": 1.5602836879432623e-07, "loss": 0.9177, "step": 2600 }, { "epoch": 19.15, "grad_norm": 20.938844680786133, "learning_rate": 8.51063829787234e-08, "loss": 0.8923, "step": 2700 }, { "epoch": 19.86, "grad_norm": 48.48252868652344, 
"learning_rate": 1.4184397163120567e-08, "loss": 0.9238, "step": 2800 } ], "logging_steps": 100, "max_steps": 2820, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 7.55100305726976e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }