{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.054764512595837894, "grad_norm": 3.8052456378936768, "learning_rate": 1.8248175182481753e-05, "loss": 2.2411, "step": 100 }, { "epoch": 0.10952902519167579, "grad_norm": 2.646080255508423, "learning_rate": 3.649635036496351e-05, "loss": 1.8231, "step": 200 }, { "epoch": 0.16429353778751368, "grad_norm": 2.700340986251831, "learning_rate": 5.474452554744526e-05, "loss": 1.7334, "step": 300 }, { "epoch": 0.21905805038335158, "grad_norm": 3.046970844268799, "learning_rate": 7.299270072992701e-05, "loss": 1.6861, "step": 400 }, { "epoch": 0.2738225629791895, "grad_norm": 2.5365347862243652, "learning_rate": 9.124087591240877e-05, "loss": 1.6711, "step": 500 }, { "epoch": 0.32858707557502737, "grad_norm": 2.219165802001953, "learning_rate": 9.997255186358079e-05, "loss": 1.6651, "step": 600 }, { "epoch": 0.3833515881708653, "grad_norm": 2.6104276180267334, "learning_rate": 9.976563458663239e-05, "loss": 1.5924, "step": 700 }, { "epoch": 0.43811610076670315, "grad_norm": 2.4602837562561035, "learning_rate": 9.93567000457336e-05, "loss": 1.5787, "step": 800 }, { "epoch": 0.4928806133625411, "grad_norm": 2.33683705329895, "learning_rate": 9.874740825864108e-05, "loss": 1.5269, "step": 900 }, { "epoch": 0.547645125958379, "grad_norm": 3.0263447761535645, "learning_rate": 9.794023256786919e-05, "loss": 1.5251, "step": 1000 }, { "epoch": 0.6024096385542169, "grad_norm": 2.4210712909698486, "learning_rate": 9.693844960047051e-05, "loss": 1.4886, "step": 1100 }, { "epoch": 0.6571741511500547, "grad_norm": 2.1745235919952393, "learning_rate": 9.574612596698522e-05, "loss": 1.4715, "step": 1200 }, { "epoch": 0.7119386637458927, "grad_norm": 2.216663360595703, "learning_rate": 9.436810175355263e-05, "loss": 1.4191, "step": 1300 }, { "epoch": 0.7667031763417306, "grad_norm": 2.010982036590576, "learning_rate": 9.280997087419733e-05, "loss": 1.4411, "step": 1400 }, { "epoch": 0.8214676889375685, "grad_norm": 2.0168168544769287, "learning_rate": 9.107805836304658e-05, "loss": 1.4264, "step": 1500 }, { "epoch": 0.8762322015334063, "grad_norm": 2.2413294315338135, "learning_rate": 8.91793946986587e-05, "loss": 1.4056, "step": 1600 }, { "epoch": 0.9309967141292442, "grad_norm": 2.189241409301758, "learning_rate": 8.712168726468965e-05, "loss": 1.368, "step": 1700 }, { "epoch": 0.9857612267250822, "grad_norm": 2.0814454555511475, "learning_rate": 8.491328906274937e-05, "loss": 1.3536, "step": 1800 }, { "epoch": 1.04052573932092, "grad_norm": 2.4284451007843018, "learning_rate": 8.256316480445412e-05, "loss": 1.1463, "step": 1900 }, { "epoch": 1.095290251916758, "grad_norm": 2.5235214233398438, "learning_rate": 8.008085452031985e-05, "loss": 1.0484, "step": 2000 }, { "epoch": 1.1500547645125958, "grad_norm": 2.3191304206848145, "learning_rate": 7.747643483322187e-05, "loss": 1.0878, "step": 2100 }, { "epoch": 1.2048192771084336, "grad_norm": 2.206143856048584, "learning_rate": 7.476047805362569e-05, "loss": 1.0593, "step": 2200 }, { "epoch": 1.2595837897042717, "grad_norm": 2.5182549953460693, "learning_rate": 7.194400926263763e-05, "loss": 1.0407, "step": 2300 }, { "epoch": 1.3143483023001095, "grad_norm": 2.556648015975952, "learning_rate": 6.903846155709005e-05, "loss": 1.0629, "step": 2400 }, { "epoch": 1.3691128148959475, "grad_norm": 2.1211650371551514, "learning_rate": 6.605562963833925e-05, "loss": 1.0505, "step": 2500 }, { "epoch": 1.4238773274917853, "grad_norm": 2.1135342121124268, "learning_rate": 6.300762193317518e-05, "loss": 0.9912, "step": 2600 }, { "epoch": 1.4786418400876231, "grad_norm": 2.554298162460327, "learning_rate": 5.9906811441203135e-05, "loss": 1.0007, "step": 2700 }, { "epoch": 1.5334063526834611, "grad_norm": 1.823740839958191, "learning_rate": 5.67657855082255e-05, "loss": 0.9865, "step": 2800 }, { "epoch": 1.588170865279299, "grad_norm": 1.8818488121032715, "learning_rate": 5.359729472951246e-05, "loss": 0.9782, "step": 2900 }, { "epoch": 1.642935377875137, "grad_norm": 2.093569040298462, "learning_rate": 5.041420119038218e-05, "loss": 0.9874, "step": 3000 }, { "epoch": 1.6976998904709748, "grad_norm": 2.0732619762420654, "learning_rate": 4.72294262542015e-05, "loss": 0.984, "step": 3100 }, { "epoch": 1.7524644030668126, "grad_norm": 1.9700407981872559, "learning_rate": 4.405589810975468e-05, "loss": 0.9442, "step": 3200 }, { "epoch": 1.8072289156626506, "grad_norm": 1.8880764245986938, "learning_rate": 4.090649929090541e-05, "loss": 0.9511, "step": 3300 }, { "epoch": 1.8619934282584885, "grad_norm": 1.8169358968734741, "learning_rate": 3.7794014381589125e-05, "loss": 0.9123, "step": 3400 }, { "epoch": 1.9167579408543265, "grad_norm": 2.062743663787842, "learning_rate": 3.473107811842055e-05, "loss": 0.9308, "step": 3500 }, { "epoch": 1.9715224534501643, "grad_norm": 1.9522345066070557, "learning_rate": 3.173012410158744e-05, "loss": 0.9195, "step": 3600 }, { "epoch": 2.026286966046002, "grad_norm": 1.4607131481170654, "learning_rate": 2.8803334322232017e-05, "loss": 0.7255, "step": 3700 }, { "epoch": 2.08105147864184, "grad_norm": 2.2295525074005127, "learning_rate": 2.596258971120737e-05, "loss": 0.5014, "step": 3800 }, { "epoch": 2.135815991237678, "grad_norm": 1.6707744598388672, "learning_rate": 2.3219421909949735e-05, "loss": 0.5206, "step": 3900 }, { "epoch": 2.190580503833516, "grad_norm": 1.4230283498764038, "learning_rate": 2.0584966459246906e-05, "loss": 0.4994, "step": 4000 }, { "epoch": 2.245345016429354, "grad_norm": 1.987844467163086, "learning_rate": 1.8069917595926504e-05, "loss": 0.4941, "step": 4100 }, { "epoch": 2.3001095290251916, "grad_norm": 1.8899997472763062, "learning_rate": 1.568448484096205e-05, "loss": 0.4926, "step": 4200 }, { "epoch": 2.3548740416210294, "grad_norm": 2.2514610290527344, "learning_rate": 1.3438351555220874e-05, "loss": 0.4809, "step": 4300 }, { "epoch": 2.4096385542168672, "grad_norm": 1.9894077777862549, "learning_rate": 1.1340635631092428e-05, "loss": 0.4834, "step": 4400 }, { "epoch": 2.4644030668127055, "grad_norm": 2.134925603866577, "learning_rate": 9.399852479563775e-06, "loss": 0.494, "step": 4500 }, { "epoch": 2.5191675794085433, "grad_norm": 1.5433528423309326, "learning_rate": 7.623880462991801e-06, "loss": 0.475, "step": 4600 }, { "epoch": 2.573932092004381, "grad_norm": 1.8533953428268433, "learning_rate": 6.019928913893208e-06, "loss": 0.4692, "step": 4700 }, { "epoch": 2.628696604600219, "grad_norm": 1.9672038555145264, "learning_rate": 4.594508869576164e-06, "loss": 0.4806, "step": 4800 }, { "epoch": 2.6834611171960567, "grad_norm": 1.722920536994934, "learning_rate": 3.35340664141246e-06, "loss": 0.4629, "step": 4900 }, { "epoch": 2.738225629791895, "grad_norm": 2.070141315460205, "learning_rate": 2.301660326042443e-06, "loss": 0.4596, "step": 5000 }, { "epoch": 2.792990142387733, "grad_norm": 1.5731868743896484, "learning_rate": 1.4435393538625497e-06, "loss": 0.4652, "step": 5100 }, { "epoch": 2.8477546549835706, "grad_norm": 1.4046640396118164, "learning_rate": 7.825271578155602e-07, "loss": 0.4419, "step": 5200 }, { "epoch": 2.9025191675794084, "grad_norm": 2.1482276916503906, "learning_rate": 3.2130703283768106e-07, "loss": 0.4663, "step": 5300 }, { "epoch": 2.9572836801752462, "grad_norm": 1.5635905265808105, "learning_rate": 6.175124336402217e-08, "loss": 0.4757, "step": 5400 } ], "logging_steps": 100, "max_steps": 5478, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.633057747449823e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }