{ "best_metric": 3.009352207183838, "best_model_checkpoint": "models/babylm-unablated_seed-42_1e-3/checkpoint-35580", "epoch": 19.99718982717437, "eval_steps": 500, "global_step": 35580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5620345651257552, "grad_norm": 0.747050404548645, "learning_rate": 3.125e-05, "loss": 6.1639, "step": 1000 }, { "epoch": 0.9998594913587185, "eval_accuracy": 0.3043492726599091, "eval_loss": 4.264366149902344, "eval_runtime": 127.2817, "eval_samples_per_second": 354.67, "eval_steps_per_second": 5.547, "step": 1779 }, { "epoch": 1.1240691302515105, "grad_norm": 0.6358739733695984, "learning_rate": 6.25e-05, "loss": 4.4584, "step": 2000 }, { "epoch": 1.6861036953772657, "grad_norm": 0.710475742816925, "learning_rate": 9.375e-05, "loss": 4.0418, "step": 3000 }, { "epoch": 1.999718982717437, "eval_accuracy": 0.34673388660782967, "eval_loss": 3.7366583347320557, "eval_runtime": 128.0547, "eval_samples_per_second": 352.529, "eval_steps_per_second": 5.513, "step": 3558 }, { "epoch": 2.248138260503021, "grad_norm": 0.6259163022041321, "learning_rate": 0.000125, "loss": 3.7842, "step": 4000 }, { "epoch": 2.810172825628776, "grad_norm": 0.5914742350578308, "learning_rate": 0.00015625, "loss": 3.6074, "step": 5000 }, { "epoch": 2.9995784740761557, "eval_accuracy": 0.3700874736621273, "eval_loss": 3.4760947227478027, "eval_runtime": 128.4007, "eval_samples_per_second": 351.579, "eval_steps_per_second": 5.498, "step": 5337 }, { "epoch": 3.3722073907545314, "grad_norm": 0.532345712184906, "learning_rate": 0.0001875, "loss": 3.4689, "step": 6000 }, { "epoch": 3.9342419558802866, "grad_norm": 0.49571847915649414, "learning_rate": 0.00021875, "loss": 3.3836, "step": 7000 }, { "epoch": 4.0, "eval_accuracy": 0.38321377861114986, "eval_loss": 3.33798885345459, "eval_runtime": 127.867, "eval_samples_per_second": 353.047, "eval_steps_per_second": 5.521, "step": 7117 }, { "epoch": 4.496276521006042, "grad_norm": 0.4585084319114685, "learning_rate": 0.00025, "loss": 3.2939, "step": 8000 }, { "epoch": 4.999859491358719, "eval_accuracy": 0.3912746987459893, "eval_loss": 3.2569355964660645, "eval_runtime": 128.3798, "eval_samples_per_second": 351.636, "eval_steps_per_second": 5.499, "step": 8896 }, { "epoch": 5.0583110861317975, "grad_norm": 0.3779755234718323, "learning_rate": 0.00028125000000000003, "loss": 3.2499, "step": 9000 }, { "epoch": 5.620345651257552, "grad_norm": 0.3809504210948944, "learning_rate": 0.0003125, "loss": 3.1911, "step": 10000 }, { "epoch": 5.999718982717437, "eval_accuracy": 0.3958713334923053, "eval_loss": 3.208075523376465, "eval_runtime": 128.1646, "eval_samples_per_second": 352.227, "eval_steps_per_second": 5.509, "step": 10675 }, { "epoch": 6.182380216383308, "grad_norm": 0.361873596906662, "learning_rate": 0.00034375, "loss": 3.1594, "step": 11000 }, { "epoch": 6.744414781509063, "grad_norm": 0.3257278501987457, "learning_rate": 0.000375, "loss": 3.1276, "step": 12000 }, { "epoch": 6.999578474076156, "eval_accuracy": 0.3993205903853245, "eval_loss": 3.174260377883911, "eval_runtime": 128.5693, "eval_samples_per_second": 351.118, "eval_steps_per_second": 5.491, "step": 12454 }, { "epoch": 7.306449346634818, "grad_norm": 0.3252241015434265, "learning_rate": 0.0004061875, "loss": 3.0935, "step": 13000 }, { "epoch": 7.868483911760573, "grad_norm": 0.31484290957450867, "learning_rate": 0.0004374375, "loss": 3.0834, "step": 14000 }, { "epoch": 8.0, "eval_accuracy": 0.4013973026022318, "eval_loss": 3.1533353328704834, "eval_runtime": 128.32, "eval_samples_per_second": 351.8, "eval_steps_per_second": 5.502, "step": 14234 }, { "epoch": 8.430518476886329, "grad_norm": 0.31745150685310364, "learning_rate": 0.0004686875, "loss": 3.0435, "step": 15000 }, { "epoch": 8.992553042012084, "grad_norm": 0.2775208353996277, "learning_rate": 0.0004999375, "loss": 3.0525, "step": 16000 }, { "epoch": 8.999859491358718, "eval_accuracy": 0.4032936728730878, "eval_loss": 3.1344425678253174, "eval_runtime": 128.3823, "eval_samples_per_second": 351.629, "eval_steps_per_second": 5.499, "step": 16013 }, { "epoch": 9.554587607137838, "grad_norm": 0.28058525919914246, "learning_rate": 0.00053115625, "loss": 3.0036, "step": 17000 }, { "epoch": 9.999718982717438, "eval_accuracy": 0.405010656766971, "eval_loss": 3.122068405151367, "eval_runtime": 128.3587, "eval_samples_per_second": 351.694, "eval_steps_per_second": 5.5, "step": 17792 }, { "epoch": 10.116622172263595, "grad_norm": 0.2726403772830963, "learning_rate": 0.00056240625, "loss": 3.0137, "step": 18000 }, { "epoch": 10.67865673738935, "grad_norm": 0.255198210477829, "learning_rate": 0.000593625, "loss": 2.9901, "step": 19000 }, { "epoch": 10.999578474076156, "eval_accuracy": 0.4056058894328394, "eval_loss": 3.1155996322631836, "eval_runtime": 128.3954, "eval_samples_per_second": 351.594, "eval_steps_per_second": 5.499, "step": 19571 }, { "epoch": 11.240691302515105, "grad_norm": 0.26096227765083313, "learning_rate": 0.000624875, "loss": 2.9801, "step": 20000 }, { "epoch": 11.80272586764086, "grad_norm": 0.2451123744249344, "learning_rate": 0.000656125, "loss": 2.9798, "step": 21000 }, { "epoch": 12.0, "eval_accuracy": 0.4068902611439986, "eval_loss": 3.1083483695983887, "eval_runtime": 128.4387, "eval_samples_per_second": 351.475, "eval_steps_per_second": 5.497, "step": 21351 }, { "epoch": 12.364760432766616, "grad_norm": 0.251653254032135, "learning_rate": 0.0006873749999999999, "loss": 2.9544, "step": 22000 }, { "epoch": 12.92679499789237, "grad_norm": 0.23384283483028412, "learning_rate": 0.00071859375, "loss": 2.9729, "step": 23000 }, { "epoch": 12.999859491358718, "eval_accuracy": 0.40739992694240046, "eval_loss": 3.1014933586120605, "eval_runtime": 128.3695, "eval_samples_per_second": 351.665, "eval_steps_per_second": 5.5, "step": 23130 }, { "epoch": 13.488829563018125, "grad_norm": 0.21852266788482666, "learning_rate": 0.0007498437500000001, "loss": 2.9356, "step": 24000 }, { "epoch": 13.999718982717438, "eval_accuracy": 0.40764333644761985, "eval_loss": 3.0996627807617188, "eval_runtime": 128.4805, "eval_samples_per_second": 351.361, "eval_steps_per_second": 5.495, "step": 24909 }, { "epoch": 14.05086412814388, "grad_norm": 0.2264845222234726, "learning_rate": 0.0007810625, "loss": 2.9577, "step": 25000 }, { "epoch": 14.612898693269637, "grad_norm": 0.21793274581432343, "learning_rate": 0.0008123125, "loss": 2.9282, "step": 26000 }, { "epoch": 14.999578474076156, "eval_accuracy": 0.4083614900449248, "eval_loss": 3.0948216915130615, "eval_runtime": 128.3484, "eval_samples_per_second": 351.722, "eval_steps_per_second": 5.501, "step": 26688 }, { "epoch": 15.174933258395392, "grad_norm": 0.21455030143260956, "learning_rate": 0.00084353125, "loss": 2.9359, "step": 27000 }, { "epoch": 15.736967823521146, "grad_norm": 0.20253227651119232, "learning_rate": 0.00087478125, "loss": 2.9332, "step": 28000 }, { "epoch": 16.0, "eval_accuracy": 0.40898035132800215, "eval_loss": 3.0927815437316895, "eval_runtime": 128.2022, "eval_samples_per_second": 352.123, "eval_steps_per_second": 5.507, "step": 28468 }, { "epoch": 16.2990023886469, "grad_norm": 0.20522606372833252, "learning_rate": 0.0009060312499999999, "loss": 2.9202, "step": 29000 }, { "epoch": 16.861036953772658, "grad_norm": 0.19934597611427307, "learning_rate": 0.00093728125, "loss": 2.9346, "step": 30000 }, { "epoch": 16.99985949135872, "eval_accuracy": 0.4089585469790335, "eval_loss": 3.089961290359497, "eval_runtime": 128.5393, "eval_samples_per_second": 351.2, "eval_steps_per_second": 5.492, "step": 30247 }, { "epoch": 17.42307151889841, "grad_norm": 0.2042018175125122, "learning_rate": 0.00096853125, "loss": 2.9069, "step": 31000 }, { "epoch": 17.985106084024167, "grad_norm": 0.1865146905183792, "learning_rate": 0.00099978125, "loss": 2.9394, "step": 32000 }, { "epoch": 17.999718982717436, "eval_accuracy": 0.40965750232485615, "eval_loss": 3.087404489517212, "eval_runtime": 128.1261, "eval_samples_per_second": 352.332, "eval_steps_per_second": 5.51, "step": 32026 }, { "epoch": 18.547140649149924, "grad_norm": 0.1902616024017334, "learning_rate": 0.0007229050279329609, "loss": 2.8743, "step": 33000 }, { "epoch": 18.999578474076156, "eval_accuracy": 0.4162921053054498, "eval_loss": 3.0349581241607666, "eval_runtime": 128.4097, "eval_samples_per_second": 351.554, "eval_steps_per_second": 5.498, "step": 33805 }, { "epoch": 19.109175214275677, "grad_norm": 0.20329073071479797, "learning_rate": 0.0004435754189944134, "loss": 2.8348, "step": 34000 }, { "epoch": 19.671209779401433, "grad_norm": 0.18743157386779785, "learning_rate": 0.0001645251396648045, "loss": 2.729, "step": 35000 }, { "epoch": 19.99718982717437, "eval_accuracy": 0.4210676920791576, "eval_loss": 3.009352207183838, "eval_runtime": 129.4424, "eval_samples_per_second": 348.75, "eval_steps_per_second": 5.454, "step": 35580 }, { "epoch": 19.99718982717437, "step": 35580, "total_flos": 1.18991215558656e+18, "train_loss": 3.2179960425614373, "train_runtime": 62492.0089, "train_samples_per_second": 145.765, "train_steps_per_second": 0.569 } ], "logging_steps": 1000, "max_steps": 35580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.18991215558656e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }