|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 5478, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.054764512595837894, |
|
"grad_norm": 3.8052456378936768, |
|
"learning_rate": 1.8248175182481753e-05, |
|
"loss": 2.2411, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10952902519167579, |
|
"grad_norm": 2.646080255508423, |
|
"learning_rate": 3.649635036496351e-05, |
|
"loss": 1.8231, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16429353778751368, |
|
"grad_norm": 2.700340986251831, |
|
"learning_rate": 5.474452554744526e-05, |
|
"loss": 1.7334, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21905805038335158, |
|
"grad_norm": 3.046970844268799, |
|
"learning_rate": 7.299270072992701e-05, |
|
"loss": 1.6861, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2738225629791895, |
|
"grad_norm": 2.5365347862243652, |
|
"learning_rate": 9.124087591240877e-05, |
|
"loss": 1.6711, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.32858707557502737, |
|
"grad_norm": 2.219165802001953, |
|
"learning_rate": 9.997255186358079e-05, |
|
"loss": 1.6651, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3833515881708653, |
|
"grad_norm": 2.6104276180267334, |
|
"learning_rate": 9.976563458663239e-05, |
|
"loss": 1.5924, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.43811610076670315, |
|
"grad_norm": 2.4602837562561035, |
|
"learning_rate": 9.93567000457336e-05, |
|
"loss": 1.5787, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4928806133625411, |
|
"grad_norm": 2.33683705329895, |
|
"learning_rate": 9.874740825864108e-05, |
|
"loss": 1.5269, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.547645125958379, |
|
"grad_norm": 3.0263447761535645, |
|
"learning_rate": 9.794023256786919e-05, |
|
"loss": 1.5251, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 2.4210712909698486, |
|
"learning_rate": 9.693844960047051e-05, |
|
"loss": 1.4886, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6571741511500547, |
|
"grad_norm": 2.1745235919952393, |
|
"learning_rate": 9.574612596698522e-05, |
|
"loss": 1.4715, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7119386637458927, |
|
"grad_norm": 2.216663360595703, |
|
"learning_rate": 9.436810175355263e-05, |
|
"loss": 1.4191, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7667031763417306, |
|
"grad_norm": 2.010982036590576, |
|
"learning_rate": 9.280997087419733e-05, |
|
"loss": 1.4411, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8214676889375685, |
|
"grad_norm": 2.0168168544769287, |
|
"learning_rate": 9.107805836304658e-05, |
|
"loss": 1.4264, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8762322015334063, |
|
"grad_norm": 2.2413294315338135, |
|
"learning_rate": 8.91793946986587e-05, |
|
"loss": 1.4056, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9309967141292442, |
|
"grad_norm": 2.189241409301758, |
|
"learning_rate": 8.712168726468965e-05, |
|
"loss": 1.368, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9857612267250822, |
|
"grad_norm": 2.0814454555511475, |
|
"learning_rate": 8.491328906274937e-05, |
|
"loss": 1.3536, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.04052573932092, |
|
"grad_norm": 2.4284451007843018, |
|
"learning_rate": 8.256316480445412e-05, |
|
"loss": 1.1463, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.095290251916758, |
|
"grad_norm": 2.5235214233398438, |
|
"learning_rate": 8.008085452031985e-05, |
|
"loss": 1.0484, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1500547645125958, |
|
"grad_norm": 2.3191304206848145, |
|
"learning_rate": 7.747643483322187e-05, |
|
"loss": 1.0878, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.2048192771084336, |
|
"grad_norm": 2.206143856048584, |
|
"learning_rate": 7.476047805362569e-05, |
|
"loss": 1.0593, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.2595837897042717, |
|
"grad_norm": 2.5182549953460693, |
|
"learning_rate": 7.194400926263763e-05, |
|
"loss": 1.0407, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.3143483023001095, |
|
"grad_norm": 2.556648015975952, |
|
"learning_rate": 6.903846155709005e-05, |
|
"loss": 1.0629, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.3691128148959475, |
|
"grad_norm": 2.1211650371551514, |
|
"learning_rate": 6.605562963833925e-05, |
|
"loss": 1.0505, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.4238773274917853, |
|
"grad_norm": 2.1135342121124268, |
|
"learning_rate": 6.300762193317518e-05, |
|
"loss": 0.9912, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.4786418400876231, |
|
"grad_norm": 2.554298162460327, |
|
"learning_rate": 5.9906811441203135e-05, |
|
"loss": 1.0007, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.5334063526834611, |
|
"grad_norm": 1.823740839958191, |
|
"learning_rate": 5.67657855082255e-05, |
|
"loss": 0.9865, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.588170865279299, |
|
"grad_norm": 1.8818488121032715, |
|
"learning_rate": 5.359729472951246e-05, |
|
"loss": 0.9782, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.642935377875137, |
|
"grad_norm": 2.093569040298462, |
|
"learning_rate": 5.041420119038218e-05, |
|
"loss": 0.9874, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.6976998904709748, |
|
"grad_norm": 2.0732619762420654, |
|
"learning_rate": 4.72294262542015e-05, |
|
"loss": 0.984, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.7524644030668126, |
|
"grad_norm": 1.9700407981872559, |
|
"learning_rate": 4.405589810975468e-05, |
|
"loss": 0.9442, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.8072289156626506, |
|
"grad_norm": 1.8880764245986938, |
|
"learning_rate": 4.090649929090541e-05, |
|
"loss": 0.9511, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.8619934282584885, |
|
"grad_norm": 1.8169358968734741, |
|
"learning_rate": 3.7794014381589125e-05, |
|
"loss": 0.9123, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.9167579408543265, |
|
"grad_norm": 2.062743663787842, |
|
"learning_rate": 3.473107811842055e-05, |
|
"loss": 0.9308, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.9715224534501643, |
|
"grad_norm": 1.9522345066070557, |
|
"learning_rate": 3.173012410158744e-05, |
|
"loss": 0.9195, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.026286966046002, |
|
"grad_norm": 1.4607131481170654, |
|
"learning_rate": 2.8803334322232017e-05, |
|
"loss": 0.7255, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.08105147864184, |
|
"grad_norm": 2.2295525074005127, |
|
"learning_rate": 2.596258971120737e-05, |
|
"loss": 0.5014, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.135815991237678, |
|
"grad_norm": 1.6707744598388672, |
|
"learning_rate": 2.3219421909949735e-05, |
|
"loss": 0.5206, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.190580503833516, |
|
"grad_norm": 1.4230283498764038, |
|
"learning_rate": 2.0584966459246906e-05, |
|
"loss": 0.4994, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.245345016429354, |
|
"grad_norm": 1.987844467163086, |
|
"learning_rate": 1.8069917595926504e-05, |
|
"loss": 0.4941, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.3001095290251916, |
|
"grad_norm": 1.8899997472763062, |
|
"learning_rate": 1.568448484096205e-05, |
|
"loss": 0.4926, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.3548740416210294, |
|
"grad_norm": 2.2514610290527344, |
|
"learning_rate": 1.3438351555220874e-05, |
|
"loss": 0.4809, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 1.9894077777862549, |
|
"learning_rate": 1.1340635631092428e-05, |
|
"loss": 0.4834, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.4644030668127055, |
|
"grad_norm": 2.134925603866577, |
|
"learning_rate": 9.399852479563775e-06, |
|
"loss": 0.494, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.5191675794085433, |
|
"grad_norm": 1.5433528423309326, |
|
"learning_rate": 7.623880462991801e-06, |
|
"loss": 0.475, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.573932092004381, |
|
"grad_norm": 1.8533953428268433, |
|
"learning_rate": 6.019928913893208e-06, |
|
"loss": 0.4692, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.628696604600219, |
|
"grad_norm": 1.9672038555145264, |
|
"learning_rate": 4.594508869576164e-06, |
|
"loss": 0.4806, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.6834611171960567, |
|
"grad_norm": 1.722920536994934, |
|
"learning_rate": 3.35340664141246e-06, |
|
"loss": 0.4629, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.738225629791895, |
|
"grad_norm": 2.070141315460205, |
|
"learning_rate": 2.301660326042443e-06, |
|
"loss": 0.4596, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.792990142387733, |
|
"grad_norm": 1.5731868743896484, |
|
"learning_rate": 1.4435393538625497e-06, |
|
"loss": 0.4652, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.8477546549835706, |
|
"grad_norm": 1.4046640396118164, |
|
"learning_rate": 7.825271578155602e-07, |
|
"loss": 0.4419, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.9025191675794084, |
|
"grad_norm": 2.1482276916503906, |
|
"learning_rate": 3.2130703283768106e-07, |
|
"loss": 0.4663, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.9572836801752462, |
|
"grad_norm": 1.5635905265808105, |
|
"learning_rate": 6.175124336402217e-08, |
|
"loss": 0.4757, |
|
"step": 5400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5478, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.633057747449823e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|