babylm-default_seed-42_1e-3 / trainer_state.json
qing-yao's picture
Model save
f6afe72 verified
{
"best_metric": 3.009352207183838,
"best_model_checkpoint": "models/babylm-unablated_seed-42_1e-3/checkpoint-35580",
"epoch": 19.99718982717437,
"eval_steps": 500,
"global_step": 35580,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.5620345651257552,
"grad_norm": 0.747050404548645,
"learning_rate": 3.125e-05,
"loss": 6.1639,
"step": 1000
},
{
"epoch": 0.9998594913587185,
"eval_accuracy": 0.3043492726599091,
"eval_loss": 4.264366149902344,
"eval_runtime": 127.2817,
"eval_samples_per_second": 354.67,
"eval_steps_per_second": 5.547,
"step": 1779
},
{
"epoch": 1.1240691302515105,
"grad_norm": 0.6358739733695984,
"learning_rate": 6.25e-05,
"loss": 4.4584,
"step": 2000
},
{
"epoch": 1.6861036953772657,
"grad_norm": 0.710475742816925,
"learning_rate": 9.375e-05,
"loss": 4.0418,
"step": 3000
},
{
"epoch": 1.999718982717437,
"eval_accuracy": 0.34673388660782967,
"eval_loss": 3.7366583347320557,
"eval_runtime": 128.0547,
"eval_samples_per_second": 352.529,
"eval_steps_per_second": 5.513,
"step": 3558
},
{
"epoch": 2.248138260503021,
"grad_norm": 0.6259163022041321,
"learning_rate": 0.000125,
"loss": 3.7842,
"step": 4000
},
{
"epoch": 2.810172825628776,
"grad_norm": 0.5914742350578308,
"learning_rate": 0.00015625,
"loss": 3.6074,
"step": 5000
},
{
"epoch": 2.9995784740761557,
"eval_accuracy": 0.3700874736621273,
"eval_loss": 3.4760947227478027,
"eval_runtime": 128.4007,
"eval_samples_per_second": 351.579,
"eval_steps_per_second": 5.498,
"step": 5337
},
{
"epoch": 3.3722073907545314,
"grad_norm": 0.532345712184906,
"learning_rate": 0.0001875,
"loss": 3.4689,
"step": 6000
},
{
"epoch": 3.9342419558802866,
"grad_norm": 0.49571847915649414,
"learning_rate": 0.00021875,
"loss": 3.3836,
"step": 7000
},
{
"epoch": 4.0,
"eval_accuracy": 0.38321377861114986,
"eval_loss": 3.33798885345459,
"eval_runtime": 127.867,
"eval_samples_per_second": 353.047,
"eval_steps_per_second": 5.521,
"step": 7117
},
{
"epoch": 4.496276521006042,
"grad_norm": 0.4585084319114685,
"learning_rate": 0.00025,
"loss": 3.2939,
"step": 8000
},
{
"epoch": 4.999859491358719,
"eval_accuracy": 0.3912746987459893,
"eval_loss": 3.2569355964660645,
"eval_runtime": 128.3798,
"eval_samples_per_second": 351.636,
"eval_steps_per_second": 5.499,
"step": 8896
},
{
"epoch": 5.0583110861317975,
"grad_norm": 0.3779755234718323,
"learning_rate": 0.00028125000000000003,
"loss": 3.2499,
"step": 9000
},
{
"epoch": 5.620345651257552,
"grad_norm": 0.3809504210948944,
"learning_rate": 0.0003125,
"loss": 3.1911,
"step": 10000
},
{
"epoch": 5.999718982717437,
"eval_accuracy": 0.3958713334923053,
"eval_loss": 3.208075523376465,
"eval_runtime": 128.1646,
"eval_samples_per_second": 352.227,
"eval_steps_per_second": 5.509,
"step": 10675
},
{
"epoch": 6.182380216383308,
"grad_norm": 0.361873596906662,
"learning_rate": 0.00034375,
"loss": 3.1594,
"step": 11000
},
{
"epoch": 6.744414781509063,
"grad_norm": 0.3257278501987457,
"learning_rate": 0.000375,
"loss": 3.1276,
"step": 12000
},
{
"epoch": 6.999578474076156,
"eval_accuracy": 0.3993205903853245,
"eval_loss": 3.174260377883911,
"eval_runtime": 128.5693,
"eval_samples_per_second": 351.118,
"eval_steps_per_second": 5.491,
"step": 12454
},
{
"epoch": 7.306449346634818,
"grad_norm": 0.3252241015434265,
"learning_rate": 0.0004061875,
"loss": 3.0935,
"step": 13000
},
{
"epoch": 7.868483911760573,
"grad_norm": 0.31484290957450867,
"learning_rate": 0.0004374375,
"loss": 3.0834,
"step": 14000
},
{
"epoch": 8.0,
"eval_accuracy": 0.4013973026022318,
"eval_loss": 3.1533353328704834,
"eval_runtime": 128.32,
"eval_samples_per_second": 351.8,
"eval_steps_per_second": 5.502,
"step": 14234
},
{
"epoch": 8.430518476886329,
"grad_norm": 0.31745150685310364,
"learning_rate": 0.0004686875,
"loss": 3.0435,
"step": 15000
},
{
"epoch": 8.992553042012084,
"grad_norm": 0.2775208353996277,
"learning_rate": 0.0004999375,
"loss": 3.0525,
"step": 16000
},
{
"epoch": 8.999859491358718,
"eval_accuracy": 0.4032936728730878,
"eval_loss": 3.1344425678253174,
"eval_runtime": 128.3823,
"eval_samples_per_second": 351.629,
"eval_steps_per_second": 5.499,
"step": 16013
},
{
"epoch": 9.554587607137838,
"grad_norm": 0.28058525919914246,
"learning_rate": 0.00053115625,
"loss": 3.0036,
"step": 17000
},
{
"epoch": 9.999718982717438,
"eval_accuracy": 0.405010656766971,
"eval_loss": 3.122068405151367,
"eval_runtime": 128.3587,
"eval_samples_per_second": 351.694,
"eval_steps_per_second": 5.5,
"step": 17792
},
{
"epoch": 10.116622172263595,
"grad_norm": 0.2726403772830963,
"learning_rate": 0.00056240625,
"loss": 3.0137,
"step": 18000
},
{
"epoch": 10.67865673738935,
"grad_norm": 0.255198210477829,
"learning_rate": 0.000593625,
"loss": 2.9901,
"step": 19000
},
{
"epoch": 10.999578474076156,
"eval_accuracy": 0.4056058894328394,
"eval_loss": 3.1155996322631836,
"eval_runtime": 128.3954,
"eval_samples_per_second": 351.594,
"eval_steps_per_second": 5.499,
"step": 19571
},
{
"epoch": 11.240691302515105,
"grad_norm": 0.26096227765083313,
"learning_rate": 0.000624875,
"loss": 2.9801,
"step": 20000
},
{
"epoch": 11.80272586764086,
"grad_norm": 0.2451123744249344,
"learning_rate": 0.000656125,
"loss": 2.9798,
"step": 21000
},
{
"epoch": 12.0,
"eval_accuracy": 0.4068902611439986,
"eval_loss": 3.1083483695983887,
"eval_runtime": 128.4387,
"eval_samples_per_second": 351.475,
"eval_steps_per_second": 5.497,
"step": 21351
},
{
"epoch": 12.364760432766616,
"grad_norm": 0.251653254032135,
"learning_rate": 0.0006873749999999999,
"loss": 2.9544,
"step": 22000
},
{
"epoch": 12.92679499789237,
"grad_norm": 0.23384283483028412,
"learning_rate": 0.00071859375,
"loss": 2.9729,
"step": 23000
},
{
"epoch": 12.999859491358718,
"eval_accuracy": 0.40739992694240046,
"eval_loss": 3.1014933586120605,
"eval_runtime": 128.3695,
"eval_samples_per_second": 351.665,
"eval_steps_per_second": 5.5,
"step": 23130
},
{
"epoch": 13.488829563018125,
"grad_norm": 0.21852266788482666,
"learning_rate": 0.0007498437500000001,
"loss": 2.9356,
"step": 24000
},
{
"epoch": 13.999718982717438,
"eval_accuracy": 0.40764333644761985,
"eval_loss": 3.0996627807617188,
"eval_runtime": 128.4805,
"eval_samples_per_second": 351.361,
"eval_steps_per_second": 5.495,
"step": 24909
},
{
"epoch": 14.05086412814388,
"grad_norm": 0.2264845222234726,
"learning_rate": 0.0007810625,
"loss": 2.9577,
"step": 25000
},
{
"epoch": 14.612898693269637,
"grad_norm": 0.21793274581432343,
"learning_rate": 0.0008123125,
"loss": 2.9282,
"step": 26000
},
{
"epoch": 14.999578474076156,
"eval_accuracy": 0.4083614900449248,
"eval_loss": 3.0948216915130615,
"eval_runtime": 128.3484,
"eval_samples_per_second": 351.722,
"eval_steps_per_second": 5.501,
"step": 26688
},
{
"epoch": 15.174933258395392,
"grad_norm": 0.21455030143260956,
"learning_rate": 0.00084353125,
"loss": 2.9359,
"step": 27000
},
{
"epoch": 15.736967823521146,
"grad_norm": 0.20253227651119232,
"learning_rate": 0.00087478125,
"loss": 2.9332,
"step": 28000
},
{
"epoch": 16.0,
"eval_accuracy": 0.40898035132800215,
"eval_loss": 3.0927815437316895,
"eval_runtime": 128.2022,
"eval_samples_per_second": 352.123,
"eval_steps_per_second": 5.507,
"step": 28468
},
{
"epoch": 16.2990023886469,
"grad_norm": 0.20522606372833252,
"learning_rate": 0.0009060312499999999,
"loss": 2.9202,
"step": 29000
},
{
"epoch": 16.861036953772658,
"grad_norm": 0.19934597611427307,
"learning_rate": 0.00093728125,
"loss": 2.9346,
"step": 30000
},
{
"epoch": 16.99985949135872,
"eval_accuracy": 0.4089585469790335,
"eval_loss": 3.089961290359497,
"eval_runtime": 128.5393,
"eval_samples_per_second": 351.2,
"eval_steps_per_second": 5.492,
"step": 30247
},
{
"epoch": 17.42307151889841,
"grad_norm": 0.2042018175125122,
"learning_rate": 0.00096853125,
"loss": 2.9069,
"step": 31000
},
{
"epoch": 17.985106084024167,
"grad_norm": 0.1865146905183792,
"learning_rate": 0.00099978125,
"loss": 2.9394,
"step": 32000
},
{
"epoch": 17.999718982717436,
"eval_accuracy": 0.40965750232485615,
"eval_loss": 3.087404489517212,
"eval_runtime": 128.1261,
"eval_samples_per_second": 352.332,
"eval_steps_per_second": 5.51,
"step": 32026
},
{
"epoch": 18.547140649149924,
"grad_norm": 0.1902616024017334,
"learning_rate": 0.0007229050279329609,
"loss": 2.8743,
"step": 33000
},
{
"epoch": 18.999578474076156,
"eval_accuracy": 0.4162921053054498,
"eval_loss": 3.0349581241607666,
"eval_runtime": 128.4097,
"eval_samples_per_second": 351.554,
"eval_steps_per_second": 5.498,
"step": 33805
},
{
"epoch": 19.109175214275677,
"grad_norm": 0.20329073071479797,
"learning_rate": 0.0004435754189944134,
"loss": 2.8348,
"step": 34000
},
{
"epoch": 19.671209779401433,
"grad_norm": 0.18743157386779785,
"learning_rate": 0.0001645251396648045,
"loss": 2.729,
"step": 35000
},
{
"epoch": 19.99718982717437,
"eval_accuracy": 0.4210676920791576,
"eval_loss": 3.009352207183838,
"eval_runtime": 129.4424,
"eval_samples_per_second": 348.75,
"eval_steps_per_second": 5.454,
"step": 35580
},
{
"epoch": 19.99718982717437,
"step": 35580,
"total_flos": 1.18991215558656e+18,
"train_loss": 3.2179960425614373,
"train_runtime": 62492.0089,
"train_samples_per_second": 145.765,
"train_steps_per_second": 0.569
}
],
"logging_steps": 1000,
"max_steps": 35580,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.18991215558656e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}