|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.0064525495636463355, |
|
"eval_steps": 50, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.226274781823168e-05, |
|
"eval_loss": 3.6124956607818604, |
|
"eval_runtime": 307.4909, |
|
"eval_samples_per_second": 42.444, |
|
"eval_steps_per_second": 21.223, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0001613137390911584, |
|
"grad_norm": 0.14286714792251587, |
|
"learning_rate": 5e-05, |
|
"loss": 3.2497, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0003226274781823168, |
|
"grad_norm": 0.14805500209331512, |
|
"learning_rate": 0.0001, |
|
"loss": 3.3692, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0004839412172734752, |
|
"grad_norm": 0.18735414743423462, |
|
"learning_rate": 9.98292246503335e-05, |
|
"loss": 3.4622, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0006452549563646336, |
|
"grad_norm": 0.22555524110794067, |
|
"learning_rate": 9.931806517013612e-05, |
|
"loss": 3.3926, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0008065686954557919, |
|
"grad_norm": 0.27350375056266785, |
|
"learning_rate": 9.847001329696653e-05, |
|
"loss": 3.4236, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0009678824345469504, |
|
"grad_norm": 0.33609363436698914, |
|
"learning_rate": 9.729086208503174e-05, |
|
"loss": 3.6329, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0011291961736381087, |
|
"grad_norm": 0.38991883397102356, |
|
"learning_rate": 9.578866633275288e-05, |
|
"loss": 3.707, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0012905099127292672, |
|
"grad_norm": 0.4257192015647888, |
|
"learning_rate": 9.397368756032445e-05, |
|
"loss": 3.7489, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0014518236518204256, |
|
"grad_norm": 0.4441923499107361, |
|
"learning_rate": 9.185832391312644e-05, |
|
"loss": 3.5653, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0016131373909115839, |
|
"grad_norm": 0.6182786226272583, |
|
"learning_rate": 8.945702546981969e-05, |
|
"loss": 3.6585, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0016131373909115839, |
|
"eval_loss": 3.442338705062866, |
|
"eval_runtime": 308.2071, |
|
"eval_samples_per_second": 42.345, |
|
"eval_steps_per_second": 21.174, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0017744511300027423, |
|
"grad_norm": 0.18179060518741608, |
|
"learning_rate": 8.678619553365659e-05, |
|
"loss": 3.2418, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0019357648690939008, |
|
"grad_norm": 0.2533986568450928, |
|
"learning_rate": 8.386407858128706e-05, |
|
"loss": 3.2397, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0020970786081850592, |
|
"grad_norm": 0.24411137402057648, |
|
"learning_rate": 8.07106356344834e-05, |
|
"loss": 3.2814, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0022583923472762175, |
|
"grad_norm": 0.24782365560531616, |
|
"learning_rate": 7.734740790612136e-05, |
|
"loss": 3.2308, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.002419706086367376, |
|
"grad_norm": 0.24265138804912567, |
|
"learning_rate": 7.379736965185368e-05, |
|
"loss": 3.2418, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0025810198254585344, |
|
"grad_norm": 0.29976701736450195, |
|
"learning_rate": 7.008477123264848e-05, |
|
"loss": 3.4099, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0027423335645496926, |
|
"grad_norm": 0.36268311738967896, |
|
"learning_rate": 6.623497346023418e-05, |
|
"loss": 3.3549, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0029036473036408513, |
|
"grad_norm": 0.36004823446273804, |
|
"learning_rate": 6.227427435703997e-05, |
|
"loss": 3.3432, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0030649610427320095, |
|
"grad_norm": 0.526280403137207, |
|
"learning_rate": 5.8229729514036705e-05, |
|
"loss": 3.5244, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0032262747818231677, |
|
"grad_norm": 0.8874719738960266, |
|
"learning_rate": 5.4128967273616625e-05, |
|
"loss": 3.7889, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0032262747818231677, |
|
"eval_loss": 3.360200881958008, |
|
"eval_runtime": 307.9178, |
|
"eval_samples_per_second": 42.385, |
|
"eval_steps_per_second": 21.194, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0033875885209143264, |
|
"grad_norm": 0.23607748746871948, |
|
"learning_rate": 5e-05, |
|
"loss": 3.2003, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0035489022600054846, |
|
"grad_norm": 0.26606330275535583, |
|
"learning_rate": 4.5871032726383386e-05, |
|
"loss": 3.0856, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.003710215999096643, |
|
"grad_norm": 0.2770606577396393, |
|
"learning_rate": 4.17702704859633e-05, |
|
"loss": 3.2055, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0038715297381878015, |
|
"grad_norm": 0.25630635023117065, |
|
"learning_rate": 3.772572564296005e-05, |
|
"loss": 3.3092, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00403284347727896, |
|
"grad_norm": 0.27507317066192627, |
|
"learning_rate": 3.3765026539765834e-05, |
|
"loss": 3.3265, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0041941572163701184, |
|
"grad_norm": 0.28478625416755676, |
|
"learning_rate": 2.991522876735154e-05, |
|
"loss": 3.3682, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.004355470955461276, |
|
"grad_norm": 0.3185293972492218, |
|
"learning_rate": 2.6202630348146324e-05, |
|
"loss": 3.3184, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.004516784694552435, |
|
"grad_norm": 0.37928304076194763, |
|
"learning_rate": 2.2652592093878666e-05, |
|
"loss": 3.4225, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.004678098433643594, |
|
"grad_norm": 0.5270272493362427, |
|
"learning_rate": 1.928936436551661e-05, |
|
"loss": 3.3142, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.004839412172734752, |
|
"grad_norm": 1.016218662261963, |
|
"learning_rate": 1.6135921418712956e-05, |
|
"loss": 3.6347, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004839412172734752, |
|
"eval_loss": 3.3451364040374756, |
|
"eval_runtime": 307.2314, |
|
"eval_samples_per_second": 42.479, |
|
"eval_steps_per_second": 21.241, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.00500072591182591, |
|
"grad_norm": 0.20172300934791565, |
|
"learning_rate": 1.3213804466343421e-05, |
|
"loss": 3.1708, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.005162039650917069, |
|
"grad_norm": 0.23905648291110992, |
|
"learning_rate": 1.0542974530180327e-05, |
|
"loss": 3.1856, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.005323353390008227, |
|
"grad_norm": 0.2721221148967743, |
|
"learning_rate": 8.141676086873572e-06, |
|
"loss": 3.2851, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.005484667129099385, |
|
"grad_norm": 0.25581252574920654, |
|
"learning_rate": 6.026312439675552e-06, |
|
"loss": 3.2723, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.005645980868190544, |
|
"grad_norm": 0.2833029329776764, |
|
"learning_rate": 4.2113336672471245e-06, |
|
"loss": 3.3305, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0058072946072817025, |
|
"grad_norm": 0.3108144700527191, |
|
"learning_rate": 2.7091379149682685e-06, |
|
"loss": 3.4338, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00596860834637286, |
|
"grad_norm": 0.3516363501548767, |
|
"learning_rate": 1.5299867030334814e-06, |
|
"loss": 3.2965, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.006129922085464019, |
|
"grad_norm": 0.39249828457832336, |
|
"learning_rate": 6.819348298638839e-07, |
|
"loss": 3.4179, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.006291235824555178, |
|
"grad_norm": 0.44656136631965637, |
|
"learning_rate": 1.7077534966650766e-07, |
|
"loss": 3.2208, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.0064525495636463355, |
|
"grad_norm": 0.875480592250824, |
|
"learning_rate": 0.0, |
|
"loss": 3.6721, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0064525495636463355, |
|
"eval_loss": 3.3430535793304443, |
|
"eval_runtime": 308.6144, |
|
"eval_samples_per_second": 42.289, |
|
"eval_steps_per_second": 21.146, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1072035764895744.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|