{ "best_metric": 1.268411636352539, "best_model_checkpoint": "/gscratch/stf/seunguk/dipika/olmo-code-sft/train/result_outputs/7b_py23_mix_10k_normal/allenai_OLMo-2-1124-7B-Instruct/r64_lr1.5e-05/checkpoint-429", "epoch": 3.0, "eval_steps": 39, "global_step": 459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006535947712418301, "grad_norm": 3.6004388332366943, "learning_rate": 3.0000000000000004e-07, "loss": 7.4653, "step": 1 }, { "epoch": 0.06535947712418301, "grad_norm": 3.2401626110076904, "learning_rate": 3e-06, "loss": 7.7982, "step": 10 }, { "epoch": 0.13071895424836602, "grad_norm": 2.898332118988037, "learning_rate": 6e-06, "loss": 7.8085, "step": 20 }, { "epoch": 0.19607843137254902, "grad_norm": 2.342164993286133, "learning_rate": 9e-06, "loss": 7.0685, "step": 30 }, { "epoch": 0.2549019607843137, "eval_loss": 1.5568805932998657, "eval_runtime": 6.1754, "eval_samples_per_second": 16.193, "eval_steps_per_second": 2.105, "step": 39 }, { "epoch": 0.26143790849673204, "grad_norm": 1.3029459714889526, "learning_rate": 1.2e-05, "loss": 6.2268, "step": 40 }, { "epoch": 0.32679738562091504, "grad_norm": 0.8066064715385437, "learning_rate": 1.5e-05, "loss": 5.6646, "step": 50 }, { "epoch": 0.39215686274509803, "grad_norm": 0.640289843082428, "learning_rate": 1.4977885819099278e-05, "loss": 5.4284, "step": 60 }, { "epoch": 0.45751633986928103, "grad_norm": 0.5137796401977539, "learning_rate": 1.4911673686262952e-05, "loss": 5.4141, "step": 70 }, { "epoch": 0.5098039215686274, "eval_loss": 1.329179286956787, "eval_runtime": 6.0801, "eval_samples_per_second": 16.447, "eval_steps_per_second": 2.138, "step": 78 }, { "epoch": 0.5228758169934641, "grad_norm": 0.4564005732536316, "learning_rate": 1.4801754062046587e-05, "loss": 5.243, "step": 80 }, { "epoch": 0.5882352941176471, "grad_norm": 0.4929277300834656, "learning_rate": 1.4648775155104705e-05, "loss": 5.2285, "step": 90 }, { "epoch": 0.6535947712418301, "grad_norm": 0.5199493169784546, "learning_rate": 1.4453639099629869e-05, "loss": 5.2129, "step": 100 }, { "epoch": 0.7189542483660131, "grad_norm": 0.41562795639038086, "learning_rate": 1.4217496635363684e-05, "loss": 5.2174, "step": 110 }, { "epoch": 0.7647058823529411, "eval_loss": 1.2994059324264526, "eval_runtime": 6.1041, "eval_samples_per_second": 16.382, "eval_steps_per_second": 2.13, "step": 117 }, { "epoch": 0.7843137254901961, "grad_norm": 0.4347153902053833, "learning_rate": 1.3941740321552318e-05, "loss": 5.1946, "step": 120 }, { "epoch": 0.8496732026143791, "grad_norm": 0.4713277816772461, "learning_rate": 1.3627996324864611e-05, "loss": 5.1453, "step": 130 }, { "epoch": 0.9150326797385621, "grad_norm": 0.4941399395465851, "learning_rate": 1.3278114829700362e-05, "loss": 5.1508, "step": 140 }, { "epoch": 0.9803921568627451, "grad_norm": 0.5141309499740601, "learning_rate": 1.2894159127440344e-05, "loss": 5.0553, "step": 150 }, { "epoch": 1.0196078431372548, "eval_loss": 1.2868971824645996, "eval_runtime": 6.1033, "eval_samples_per_second": 16.385, "eval_steps_per_second": 2.13, "step": 156 }, { "epoch": 1.0457516339869282, "grad_norm": 0.4459021985530853, "learning_rate": 1.2478393448979922e-05, "loss": 5.0354, "step": 160 }, { "epoch": 1.1111111111111112, "grad_norm": 0.518979549407959, "learning_rate": 1.2033269612299312e-05, "loss": 5.0699, "step": 170 }, { "epoch": 1.1764705882352942, "grad_norm": 0.5595972537994385, "learning_rate": 1.1561412563811198e-05, "loss": 5.0097, "step": 180 }, { "epoch": 1.2418300653594772, "grad_norm": 0.49540629982948303, "learning_rate": 1.1065604898750127e-05, "loss": 5.075, "step": 190 }, { "epoch": 1.2745098039215685, "eval_loss": 1.2794718742370605, "eval_runtime": 6.0945, "eval_samples_per_second": 16.408, "eval_steps_per_second": 2.133, "step": 195 }, { "epoch": 1.3071895424836601, "grad_norm": 0.5955595970153809, "learning_rate": 1.0548770451888665e-05, "loss": 4.9689, "step": 200 }, { "epoch": 1.3725490196078431, "grad_norm": 0.5468981862068176, "learning_rate": 1.0013957055347779e-05, "loss": 5.1004, "step": 210 }, { "epoch": 1.4379084967320261, "grad_norm": 0.5972552299499512, "learning_rate": 9.464318565180596e-06, "loss": 5.0697, "step": 220 }, { "epoch": 1.5032679738562091, "grad_norm": 0.5224605202674866, "learning_rate": 8.903096262720867e-06, "loss": 5.1168, "step": 230 }, { "epoch": 1.5294117647058822, "eval_loss": 1.274889349937439, "eval_runtime": 6.1193, "eval_samples_per_second": 16.342, "eval_steps_per_second": 2.124, "step": 234 }, { "epoch": 1.5686274509803921, "grad_norm": 0.563854455947876, "learning_rate": 8.333599740374487e-06, "loss": 5.0523, "step": 240 }, { "epoch": 1.6339869281045751, "grad_norm": 0.5406912565231323, "learning_rate": 7.75918738457279e-06, "loss": 4.9938, "step": 250 }, { "epoch": 1.6993464052287581, "grad_norm": 0.5734691023826599, "learning_rate": 7.183246570981859e-06, "loss": 5.0574, "step": 260 }, { "epoch": 1.7647058823529411, "grad_norm": 0.5452589392662048, "learning_rate": 6.609173688758989e-06, "loss": 5.0522, "step": 270 }, { "epoch": 1.784313725490196, "eval_loss": 1.2728018760681152, "eval_runtime": 6.1187, "eval_samples_per_second": 16.343, "eval_steps_per_second": 2.125, "step": 273 }, { "epoch": 1.8300653594771243, "grad_norm": 0.5308836102485657, "learning_rate": 6.0403541116555636e-06, "loss": 5.0906, "step": 280 }, { "epoch": 1.8954248366013071, "grad_norm": 0.5624706745147705, "learning_rate": 5.480142234079027e-06, "loss": 5.0739, "step": 290 }, { "epoch": 1.9607843137254903, "grad_norm": 0.6330104470252991, "learning_rate": 4.9318416898436404e-06, "loss": 5.0924, "step": 300 }, { "epoch": 2.026143790849673, "grad_norm": 0.550271213054657, "learning_rate": 4.398685870262254e-06, "loss": 4.9703, "step": 310 }, { "epoch": 2.0392156862745097, "eval_loss": 1.2705397605895996, "eval_runtime": 6.092, "eval_samples_per_second": 16.415, "eval_steps_per_second": 2.134, "step": 312 }, { "epoch": 2.0915032679738563, "grad_norm": 0.5277569890022278, "learning_rate": 3.883818856466194e-06, "loss": 5.0368, "step": 320 }, { "epoch": 2.156862745098039, "grad_norm": 0.49453046917915344, "learning_rate": 3.390276878397574e-06, "loss": 4.9466, "step": 330 }, { "epoch": 2.2222222222222223, "grad_norm": 0.4974152445793152, "learning_rate": 2.9209704098124204e-06, "loss": 5.1187, "step": 340 }, { "epoch": 2.287581699346405, "grad_norm": 0.6691598892211914, "learning_rate": 2.47866700488251e-06, "loss": 4.9278, "step": 350 }, { "epoch": 2.2941176470588234, "eval_loss": 1.269243836402893, "eval_runtime": 6.1135, "eval_samples_per_second": 16.357, "eval_steps_per_second": 2.126, "step": 351 }, { "epoch": 2.3529411764705883, "grad_norm": 0.6030656099319458, "learning_rate": 2.0659749776104147e-06, "loss": 4.8925, "step": 360 }, { "epoch": 2.418300653594771, "grad_norm": 0.6159414052963257, "learning_rate": 1.6853280203020998e-06, "loss": 5.0115, "step": 370 }, { "epoch": 2.4836601307189543, "grad_norm": 0.5232154726982117, "learning_rate": 1.3389708518037574e-06, "loss": 5.015, "step": 380 }, { "epoch": 2.549019607843137, "grad_norm": 0.5964416265487671, "learning_rate": 1.0289459801368406e-06, "loss": 5.0732, "step": 390 }, { "epoch": 2.549019607843137, "eval_loss": 1.2685757875442505, "eval_runtime": 6.0856, "eval_samples_per_second": 16.432, "eval_steps_per_second": 2.136, "step": 390 }, { "epoch": 2.6143790849673203, "grad_norm": 0.5542371869087219, "learning_rate": 7.570816575935527e-07, "loss": 5.0339, "step": 400 }, { "epoch": 2.6797385620915035, "grad_norm": 0.6067601442337036, "learning_rate": 5.249810993230036e-07, "loss": 5.0544, "step": 410 }, { "epoch": 2.7450980392156863, "grad_norm": 0.5335155129432678, "learning_rate": 3.3401302898726395e-07, "loss": 4.9646, "step": 420 }, { "epoch": 2.803921568627451, "eval_loss": 1.268411636352539, "eval_runtime": 6.0899, "eval_samples_per_second": 16.421, "eval_steps_per_second": 2.135, "step": 429 }, { "epoch": 2.810457516339869, "grad_norm": 0.5043578743934631, "learning_rate": 1.853036072406436e-07, "loss": 4.9906, "step": 430 }, { "epoch": 2.8758169934640523, "grad_norm": 0.6804988384246826, "learning_rate": 7.972979063091468e-08, "loss": 5.096, "step": 440 }, { "epoch": 2.9411764705882355, "grad_norm": 0.6172552704811096, "learning_rate": 1.7914160085782116e-08, "loss": 5.0006, "step": 450 } ], "logging_steps": 10, "max_steps": 459, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 39, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.001 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.087892566070788e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }