{ "best_metric": 2.9599573612213135, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-32k-earlystop-40epochs_seed-42_1e-3/checkpoint-48293", "epoch": 28.0, "eval_steps": 500, "global_step": 54089, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5176653293645658, "grad_norm": 0.516268789768219, "learning_rate": 3.125e-05, "loss": 5.9216, "step": 1000 }, { "epoch": 0.9996117510029766, "eval_accuracy": 0.32528310960329715, "eval_loss": 4.013359069824219, "eval_runtime": 112.6568, "eval_samples_per_second": 463.265, "eval_steps_per_second": 7.243, "step": 1931 }, { "epoch": 1.0353306587291315, "grad_norm": 0.6370756030082703, "learning_rate": 6.25e-05, "loss": 4.1987, "step": 2000 }, { "epoch": 1.5529959880936974, "grad_norm": 0.6017232537269592, "learning_rate": 9.375e-05, "loss": 3.7977, "step": 3000 }, { "epoch": 1.9997411673353178, "eval_accuracy": 0.3639096213308086, "eval_loss": 3.544811725616455, "eval_runtime": 112.8804, "eval_samples_per_second": 462.348, "eval_steps_per_second": 7.229, "step": 3863 }, { "epoch": 2.070661317458263, "grad_norm": 0.5702515840530396, "learning_rate": 0.000125, "loss": 3.5582, "step": 4000 }, { "epoch": 2.588326646822829, "grad_norm": 0.48817870020866394, "learning_rate": 0.00015625, "loss": 3.3887, "step": 5000 }, { "epoch": 2.9998705836676587, "eval_accuracy": 0.3840780857274889, "eval_loss": 3.324249744415283, "eval_runtime": 112.7528, "eval_samples_per_second": 462.871, "eval_steps_per_second": 7.237, "step": 5795 }, { "epoch": 3.105991976187395, "grad_norm": 0.45357462763786316, "learning_rate": 0.0001875, "loss": 3.2719, "step": 6000 }, { "epoch": 3.6236573055519608, "grad_norm": 0.42539018392562866, "learning_rate": 0.00021875, "loss": 3.1805, "step": 7000 }, { "epoch": 4.0, "eval_accuracy": 0.3949032381682315, "eval_loss": 3.2081618309020996, "eval_runtime": 112.6095, "eval_samples_per_second": 463.46, "eval_steps_per_second": 7.246, "step": 7727 }, { "epoch": 4.141322634916526, "grad_norm": 0.41741499304771423, "learning_rate": 0.00025, "loss": 3.1173, "step": 8000 }, { "epoch": 4.658987964281092, "grad_norm": 0.3810145854949951, "learning_rate": 0.00028125000000000003, "loss": 3.0632, "step": 9000 }, { "epoch": 4.999611751002977, "eval_accuracy": 0.401180377880219, "eval_loss": 3.143218517303467, "eval_runtime": 112.4876, "eval_samples_per_second": 463.962, "eval_steps_per_second": 7.254, "step": 9658 }, { "epoch": 5.176653293645658, "grad_norm": 0.3555419445037842, "learning_rate": 0.0003125, "loss": 3.0212, "step": 10000 }, { "epoch": 5.694318623010224, "grad_norm": 0.3318658173084259, "learning_rate": 0.00034375, "loss": 2.9865, "step": 11000 }, { "epoch": 5.999741167335317, "eval_accuracy": 0.4055885546400971, "eval_loss": 3.101013422012329, "eval_runtime": 112.8154, "eval_samples_per_second": 462.614, "eval_steps_per_second": 7.233, "step": 11590 }, { "epoch": 6.21198395237479, "grad_norm": 0.3243854343891144, "learning_rate": 0.000375, "loss": 2.9568, "step": 12000 }, { "epoch": 6.729649281739356, "grad_norm": 0.3086845874786377, "learning_rate": 0.00040625000000000004, "loss": 2.9347, "step": 13000 }, { "epoch": 6.999870583667659, "eval_accuracy": 0.4087078510269791, "eval_loss": 3.071547746658325, "eval_runtime": 113.6377, "eval_samples_per_second": 459.267, "eval_steps_per_second": 7.181, "step": 13522 }, { "epoch": 7.2473146111039215, "grad_norm": 0.29632025957107544, "learning_rate": 0.0004375, "loss": 2.9084, "step": 14000 }, { "epoch": 7.764979940468487, "grad_norm": 0.28605663776397705, "learning_rate": 0.0004686875, "loss": 2.8953, "step": 15000 }, { "epoch": 8.0, "eval_accuracy": 0.4107785654978604, "eval_loss": 3.053938388824463, "eval_runtime": 112.9423, "eval_samples_per_second": 462.094, "eval_steps_per_second": 7.225, "step": 15454 }, { "epoch": 8.282645269833052, "grad_norm": 0.2786637246608734, "learning_rate": 0.0004999375, "loss": 2.8698, "step": 16000 }, { "epoch": 8.80031059919762, "grad_norm": 0.2667602002620697, "learning_rate": 0.00053115625, "loss": 2.8689, "step": 17000 }, { "epoch": 8.999611751002977, "eval_accuracy": 0.4122456033572655, "eval_loss": 3.039193868637085, "eval_runtime": 112.7699, "eval_samples_per_second": 462.801, "eval_steps_per_second": 7.236, "step": 17385 }, { "epoch": 9.317975928562184, "grad_norm": 0.25813835859298706, "learning_rate": 0.00056240625, "loss": 2.8401, "step": 18000 }, { "epoch": 9.835641257926751, "grad_norm": 0.2392367571592331, "learning_rate": 0.00059365625, "loss": 2.8456, "step": 19000 }, { "epoch": 9.999741167335317, "eval_accuracy": 0.4133619617611367, "eval_loss": 3.0309925079345703, "eval_runtime": 112.8237, "eval_samples_per_second": 462.58, "eval_steps_per_second": 7.233, "step": 19317 }, { "epoch": 10.353306587291316, "grad_norm": 0.2465026080608368, "learning_rate": 0.00062490625, "loss": 2.8163, "step": 20000 }, { "epoch": 10.870971916655883, "grad_norm": 0.21547040343284607, "learning_rate": 0.000656125, "loss": 2.8298, "step": 21000 }, { "epoch": 10.99987058366766, "eval_accuracy": 0.41438394403555634, "eval_loss": 3.0251340866088867, "eval_runtime": 112.6781, "eval_samples_per_second": 463.178, "eval_steps_per_second": 7.242, "step": 21249 }, { "epoch": 11.388637246020448, "grad_norm": 0.23142270743846893, "learning_rate": 0.0006873749999999999, "loss": 2.798, "step": 22000 }, { "epoch": 11.906302575385013, "grad_norm": 0.2184012234210968, "learning_rate": 0.00071859375, "loss": 2.817, "step": 23000 }, { "epoch": 12.0, "eval_accuracy": 0.4152235609706615, "eval_loss": 3.0175206661224365, "eval_runtime": 112.4939, "eval_samples_per_second": 463.936, "eval_steps_per_second": 7.254, "step": 23181 }, { "epoch": 12.42396790474958, "grad_norm": 0.211518794298172, "learning_rate": 0.0007498437500000001, "loss": 2.7828, "step": 24000 }, { "epoch": 12.941633234114144, "grad_norm": 0.2092377245426178, "learning_rate": 0.00078109375, "loss": 2.8069, "step": 25000 }, { "epoch": 12.999611751002977, "eval_accuracy": 0.41580334298885296, "eval_loss": 3.0118961334228516, "eval_runtime": 112.5807, "eval_samples_per_second": 463.578, "eval_steps_per_second": 7.248, "step": 25112 }, { "epoch": 13.459298563478711, "grad_norm": 0.20688970386981964, "learning_rate": 0.00081234375, "loss": 2.7707, "step": 26000 }, { "epoch": 13.976963892843276, "grad_norm": 0.20183704793453217, "learning_rate": 0.00084353125, "loss": 2.7996, "step": 27000 }, { "epoch": 13.999741167335317, "eval_accuracy": 0.4162821365373128, "eval_loss": 3.005990743637085, "eval_runtime": 112.5919, "eval_samples_per_second": 463.533, "eval_steps_per_second": 7.247, "step": 27044 }, { "epoch": 14.494629222207843, "grad_norm": 0.19293886423110962, "learning_rate": 0.00087478125, "loss": 2.7615, "step": 28000 }, { "epoch": 14.99987058366766, "eval_accuracy": 0.4170801257847458, "eval_loss": 3.0038492679595947, "eval_runtime": 112.6692, "eval_samples_per_second": 463.214, "eval_steps_per_second": 7.242, "step": 28976 }, { "epoch": 15.012294551572408, "grad_norm": 0.1958475559949875, "learning_rate": 0.0009060312499999999, "loss": 2.7934, "step": 29000 }, { "epoch": 15.529959880936975, "grad_norm": 0.18981003761291504, "learning_rate": 0.00093728125, "loss": 2.7575, "step": 30000 }, { "epoch": 16.0, "eval_accuracy": 0.4168810041740398, "eval_loss": 3.0022430419921875, "eval_runtime": 112.632, "eval_samples_per_second": 463.367, "eval_steps_per_second": 7.245, "step": 30908 }, { "epoch": 16.04762521030154, "grad_norm": 0.21319861710071564, "learning_rate": 0.00096853125, "loss": 2.7826, "step": 31000 }, { "epoch": 16.565290539666105, "grad_norm": 0.188828244805336, "learning_rate": 0.00099975, "loss": 2.7573, "step": 32000 }, { "epoch": 16.999611751002977, "eval_accuracy": 0.4178921662552739, "eval_loss": 2.9961841106414795, "eval_runtime": 112.6097, "eval_samples_per_second": 463.459, "eval_steps_per_second": 7.246, "step": 32839 }, { "epoch": 17.082955869030673, "grad_norm": 0.1903599202632904, "learning_rate": 0.0009780725022104334, "loss": 2.7729, "step": 33000 }, { "epoch": 17.60062119839524, "grad_norm": 0.18594609200954437, "learning_rate": 0.0009559902740937224, "loss": 2.7451, "step": 34000 }, { "epoch": 17.99974116733532, "eval_accuracy": 0.41887905804207104, "eval_loss": 2.98665452003479, "eval_runtime": 112.7053, "eval_samples_per_second": 463.066, "eval_steps_per_second": 7.24, "step": 34771 }, { "epoch": 18.118286527759803, "grad_norm": 0.19103111326694489, "learning_rate": 0.0009338859416445623, "loss": 2.7475, "step": 35000 }, { "epoch": 18.63595185712437, "grad_norm": 0.17397448420524597, "learning_rate": 0.0009118037135278515, "loss": 2.7275, "step": 36000 }, { "epoch": 18.999870583667658, "eval_accuracy": 0.4201490782172229, "eval_loss": 2.98036527633667, "eval_runtime": 112.4975, "eval_samples_per_second": 463.921, "eval_steps_per_second": 7.253, "step": 36703 }, { "epoch": 19.153617186488933, "grad_norm": 0.18536260724067688, "learning_rate": 0.0008896993810786914, "loss": 2.7238, "step": 37000 }, { "epoch": 19.671282515853502, "grad_norm": 0.17299328744411469, "learning_rate": 0.0008676171529619805, "loss": 2.7099, "step": 38000 }, { "epoch": 20.0, "eval_accuracy": 0.42075012492063313, "eval_loss": 2.9760243892669678, "eval_runtime": 112.7131, "eval_samples_per_second": 463.034, "eval_steps_per_second": 7.24, "step": 38635 }, { "epoch": 20.188947845218067, "grad_norm": 0.19630002975463867, "learning_rate": 0.0008455128205128205, "loss": 2.7028, "step": 39000 }, { "epoch": 20.706613174582632, "grad_norm": 0.18252891302108765, "learning_rate": 0.0008234084880636605, "loss": 2.693, "step": 40000 }, { "epoch": 20.999611751002977, "eval_accuracy": 0.4216216764536817, "eval_loss": 2.968324899673462, "eval_runtime": 112.7059, "eval_samples_per_second": 463.064, "eval_steps_per_second": 7.24, "step": 40566 }, { "epoch": 21.224278503947197, "grad_norm": 0.19568035006523132, "learning_rate": 0.0008013262599469496, "loss": 2.6802, "step": 41000 }, { "epoch": 21.741943833311765, "grad_norm": 0.2092587798833847, "learning_rate": 0.0007792219274977895, "loss": 2.6785, "step": 42000 }, { "epoch": 21.99974116733532, "eval_accuracy": 0.4221156483286934, "eval_loss": 2.96663761138916, "eval_runtime": 112.5823, "eval_samples_per_second": 463.572, "eval_steps_per_second": 7.248, "step": 42498 }, { "epoch": 22.25960916267633, "grad_norm": 0.2045181393623352, "learning_rate": 0.0007571175950486296, "loss": 2.6616, "step": 43000 }, { "epoch": 22.777274492040895, "grad_norm": 0.19560641050338745, "learning_rate": 0.0007350132625994696, "loss": 2.6628, "step": 44000 }, { "epoch": 22.999870583667658, "eval_accuracy": 0.4226662007972378, "eval_loss": 2.9646129608154297, "eval_runtime": 112.7148, "eval_samples_per_second": 463.027, "eval_steps_per_second": 7.24, "step": 44430 }, { "epoch": 23.29493982140546, "grad_norm": 0.19825534522533417, "learning_rate": 0.0007129089301503095, "loss": 2.6395, "step": 45000 }, { "epoch": 23.812605150770025, "grad_norm": 0.1957850456237793, "learning_rate": 0.0006908267020335986, "loss": 2.6501, "step": 46000 }, { "epoch": 24.0, "eval_accuracy": 0.42281715752022214, "eval_loss": 2.9626119136810303, "eval_runtime": 111.9607, "eval_samples_per_second": 466.146, "eval_steps_per_second": 7.288, "step": 46362 }, { "epoch": 24.330270480134594, "grad_norm": 0.2085314244031906, "learning_rate": 0.0006687223695844385, "loss": 2.6181, "step": 47000 }, { "epoch": 24.84793580949916, "grad_norm": 0.21406565606594086, "learning_rate": 0.0006466401414677277, "loss": 2.6343, "step": 48000 }, { "epoch": 24.999611751002977, "eval_accuracy": 0.42334742212654364, "eval_loss": 2.9599573612213135, "eval_runtime": 111.9994, "eval_samples_per_second": 465.985, "eval_steps_per_second": 7.286, "step": 48293 }, { "epoch": 25.365601138863724, "grad_norm": 0.20139683783054352, "learning_rate": 0.0006245358090185677, "loss": 2.598, "step": 49000 }, { "epoch": 25.88326646822829, "grad_norm": 0.20590080320835114, "learning_rate": 0.0006024535809018568, "loss": 2.6198, "step": 50000 }, { "epoch": 25.99974116733532, "eval_accuracy": 0.42357351907998303, "eval_loss": 2.9637885093688965, "eval_runtime": 112.1406, "eval_samples_per_second": 465.398, "eval_steps_per_second": 7.277, "step": 50225 }, { "epoch": 26.400931797592857, "grad_norm": 0.2109968364238739, "learning_rate": 0.0005803492484526968, "loss": 2.5789, "step": 51000 }, { "epoch": 26.918597126957422, "grad_norm": 0.21926765143871307, "learning_rate": 0.0005582670203359858, "loss": 2.604, "step": 52000 }, { "epoch": 26.999870583667658, "eval_accuracy": 0.4239798023060537, "eval_loss": 2.9604480266571045, "eval_runtime": 112.1419, "eval_samples_per_second": 465.392, "eval_steps_per_second": 7.276, "step": 52157 }, { "epoch": 27.436262456321987, "grad_norm": 0.2280665636062622, "learning_rate": 0.0005361626878868259, "loss": 2.5576, "step": 53000 }, { "epoch": 27.953927785686552, "grad_norm": 0.218441903591156, "learning_rate": 0.000514080459770115, "loss": 2.5876, "step": 54000 }, { "epoch": 28.0, "eval_accuracy": 0.42452238991016983, "eval_loss": 2.960141658782959, "eval_runtime": 111.7102, "eval_samples_per_second": 467.191, "eval_steps_per_second": 7.305, "step": 54089 }, { "epoch": 28.0, "step": 54089, "total_flos": 1.808986925039616e+18, "train_loss": 2.9159927132606205, "train_runtime": 57317.9169, "train_samples_per_second": 345.105, "train_steps_per_second": 1.348 } ], "logging_steps": 1000, "max_steps": 77240, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.808986925039616e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }