{
  "best_metric": 0.8340622782707214,
  "best_model_checkpoint": "./results/checkpoint-1000",
  "epoch": 2.9940436796823295,
  "eval_steps": 500,
  "global_step": 1131,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05294506949040371,
      "grad_norm": 62.576904296875,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 16.7099,
      "step": 20
    },
    {
      "epoch": 0.10589013898080742,
      "grad_norm": 35.65088653564453,
      "learning_rate": 4.000000000000001e-06,
      "loss": 14.7891,
      "step": 40
    },
    {
      "epoch": 0.15883520847121113,
      "grad_norm": 30.480260848999023,
      "learning_rate": 6e-06,
      "loss": 11.6833,
      "step": 60
    },
    {
      "epoch": 0.21178027796161483,
      "grad_norm": 24.958763122558594,
      "learning_rate": 8.000000000000001e-06,
      "loss": 7.878,
      "step": 80
    },
    {
      "epoch": 0.26472534745201853,
      "grad_norm": 4.896631717681885,
      "learning_rate": 1e-05,
      "loss": 4.1299,
      "step": 100
    },
    {
      "epoch": 0.31767041694242226,
      "grad_norm": 1.8821851015090942,
      "learning_rate": 1.2e-05,
      "loss": 2.512,
      "step": 120
    },
    {
      "epoch": 0.37061548643282594,
      "grad_norm": 0.7400406002998352,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 1.8807,
      "step": 140
    },
    {
      "epoch": 0.42356055592322966,
      "grad_norm": 0.601634681224823,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.5617,
      "step": 160
    },
    {
      "epoch": 0.47650562541363334,
      "grad_norm": 0.5250083208084106,
      "learning_rate": 1.8e-05,
      "loss": 1.461,
      "step": 180
    },
    {
      "epoch": 0.5294506949040371,
      "grad_norm": 0.5037821531295776,
      "learning_rate": 2e-05,
      "loss": 1.3703,
      "step": 200
    },
    {
      "epoch": 0.5823957643944407,
      "grad_norm": 0.47873854637145996,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.2966,
      "step": 220
    },
    {
      "epoch": 0.6353408338848445,
      "grad_norm": 3.9664485454559326,
      "learning_rate": 2.4e-05,
      "loss": 1.2427,
      "step": 240
    },
    {
      "epoch": 0.6882859033752482,
      "grad_norm": 0.45016345381736755,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.2139,
      "step": 260
    },
    {
      "epoch": 0.7412309728656519,
      "grad_norm": 0.5136398077011108,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 1.1679,
      "step": 280
    },
    {
      "epoch": 0.7941760423560555,
      "grad_norm": 0.3736862242221832,
      "learning_rate": 3e-05,
      "loss": 1.1604,
      "step": 300
    },
    {
      "epoch": 0.8471211118464593,
      "grad_norm": 0.38698282837867737,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.1407,
      "step": 320
    },
    {
      "epoch": 0.900066181336863,
      "grad_norm": 0.4257580637931824,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.1177,
      "step": 340
    },
    {
      "epoch": 0.9530112508272667,
      "grad_norm": 0.3982521891593933,
      "learning_rate": 3.6e-05,
      "loss": 1.1199,
      "step": 360
    },
    {
      "epoch": 1.0059563203176705,
      "grad_norm": 0.3849237859249115,
      "learning_rate": 3.8e-05,
      "loss": 1.0925,
      "step": 380
    },
    {
      "epoch": 1.0589013898080741,
      "grad_norm": 0.3753887414932251,
      "learning_rate": 4e-05,
      "loss": 1.0605,
      "step": 400
    },
    {
      "epoch": 1.1118464592984778,
      "grad_norm": 0.3810591697692871,
      "learning_rate": 4.2e-05,
      "loss": 1.0541,
      "step": 420
    },
    {
      "epoch": 1.1647915287888815,
      "grad_norm": 0.3707886040210724,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.0334,
      "step": 440
    },
    {
      "epoch": 1.2177365982792852,
      "grad_norm": 0.36902502179145813,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.0249,
      "step": 460
    },
    {
      "epoch": 1.270681667769689,
      "grad_norm": 0.3862062692642212,
      "learning_rate": 4.8e-05,
      "loss": 1.0141,
      "step": 480
    },
    {
      "epoch": 1.3236267372600927,
      "grad_norm": 0.36468520760536194,
      "learning_rate": 5e-05,
      "loss": 1.063,
      "step": 500
    },
    {
      "epoch": 1.3236267372600927,
      "eval_loss": 0.9045532941818237,
      "eval_runtime": 74.2771,
      "eval_samples_per_second": 10.165,
      "eval_steps_per_second": 2.545,
      "step": 500
    },
    {
      "epoch": 1.3765718067504964,
      "grad_norm": 0.3426459729671478,
      "learning_rate": 4.8415213946117275e-05,
      "loss": 1.01,
      "step": 520
    },
    {
      "epoch": 1.4295168762409,
      "grad_norm": 0.3736313581466675,
      "learning_rate": 4.6830427892234554e-05,
      "loss": 0.984,
      "step": 540
    },
    {
      "epoch": 1.4824619457313037,
      "grad_norm": 0.36285898089408875,
      "learning_rate": 4.524564183835183e-05,
      "loss": 0.9816,
      "step": 560
    },
    {
      "epoch": 1.5354070152217076,
      "grad_norm": 0.37838441133499146,
      "learning_rate": 4.36608557844691e-05,
      "loss": 0.9807,
      "step": 580
    },
    {
      "epoch": 1.588352084712111,
      "grad_norm": 0.3449678421020508,
      "learning_rate": 4.207606973058637e-05,
      "loss": 0.982,
      "step": 600
    },
    {
      "epoch": 1.641297154202515,
      "grad_norm": 0.3467804789543152,
      "learning_rate": 4.0491283676703644e-05,
      "loss": 0.9553,
      "step": 620
    },
    {
      "epoch": 1.6942422236929184,
      "grad_norm": 0.3551880419254303,
      "learning_rate": 3.8906497622820917e-05,
      "loss": 0.9701,
      "step": 640
    },
    {
      "epoch": 1.7471872931833223,
      "grad_norm": 0.3425547182559967,
      "learning_rate": 3.7321711568938196e-05,
      "loss": 0.973,
      "step": 660
    },
    {
      "epoch": 1.800132362673726,
      "grad_norm": 0.32189810276031494,
      "learning_rate": 3.573692551505547e-05,
      "loss": 0.9599,
      "step": 680
    },
    {
      "epoch": 1.8530774321641297,
      "grad_norm": 0.34214696288108826,
      "learning_rate": 3.415213946117274e-05,
      "loss": 0.952,
      "step": 700
    },
    {
      "epoch": 1.9060225016545336,
      "grad_norm": 0.33412784337997437,
      "learning_rate": 3.256735340729002e-05,
      "loss": 0.9453,
      "step": 720
    },
    {
      "epoch": 1.958967571144937,
      "grad_norm": 0.33273905515670776,
      "learning_rate": 3.098256735340729e-05,
      "loss": 0.9387,
      "step": 740
    },
    {
      "epoch": 2.011912640635341,
      "grad_norm": 0.32698702812194824,
      "learning_rate": 2.939778129952457e-05,
      "loss": 0.9456,
      "step": 760
    },
    {
      "epoch": 2.0648577101257444,
      "grad_norm": 0.36428529024124146,
      "learning_rate": 2.7812995245641837e-05,
      "loss": 0.9445,
      "step": 780
    },
    {
      "epoch": 2.1178027796161483,
      "grad_norm": 0.3233266770839691,
      "learning_rate": 2.6228209191759113e-05,
      "loss": 0.9185,
      "step": 800
    },
    {
      "epoch": 2.1707478491065517,
      "grad_norm": 0.3173067569732666,
      "learning_rate": 2.4643423137876386e-05,
      "loss": 0.9146,
      "step": 820
    },
    {
      "epoch": 2.2236929185969556,
      "grad_norm": 0.33917301893234253,
      "learning_rate": 2.305863708399366e-05,
      "loss": 0.9195,
      "step": 840
    },
    {
      "epoch": 2.2766379880873595,
      "grad_norm": 0.3438282608985901,
      "learning_rate": 2.1473851030110938e-05,
      "loss": 0.9356,
      "step": 860
    },
    {
      "epoch": 2.329583057577763,
      "grad_norm": 0.33590319752693176,
      "learning_rate": 1.988906497622821e-05,
      "loss": 0.9319,
      "step": 880
    },
    {
      "epoch": 2.382528127068167,
      "grad_norm": 0.5414553880691528,
      "learning_rate": 1.8304278922345483e-05,
      "loss": 0.9208,
      "step": 900
    },
    {
      "epoch": 2.4354731965585703,
      "grad_norm": 0.34509792923927307,
      "learning_rate": 1.671949286846276e-05,
      "loss": 0.9026,
      "step": 920
    },
    {
      "epoch": 2.488418266048974,
      "grad_norm": 0.30984020233154297,
      "learning_rate": 1.5134706814580033e-05,
      "loss": 0.9066,
      "step": 940
    },
    {
      "epoch": 2.541363335539378,
      "grad_norm": 0.31895536184310913,
      "learning_rate": 1.3549920760697307e-05,
      "loss": 0.9275,
      "step": 960
    },
    {
      "epoch": 2.5943084050297816,
      "grad_norm": 0.3005692660808563,
      "learning_rate": 1.1965134706814581e-05,
      "loss": 0.9241,
      "step": 980
    },
    {
      "epoch": 2.6472534745201854,
      "grad_norm": 0.325959712266922,
      "learning_rate": 1.0380348652931855e-05,
      "loss": 0.9181,
      "step": 1000
    },
    {
      "epoch": 2.6472534745201854,
      "eval_loss": 0.8340622782707214,
      "eval_runtime": 74.3137,
      "eval_samples_per_second": 10.16,
      "eval_steps_per_second": 2.543,
      "step": 1000
    },
    {
      "epoch": 2.700198544010589,
      "grad_norm": 0.37024152278900146,
      "learning_rate": 8.79556259904913e-06,
      "loss": 0.9156,
      "step": 1020
    },
    {
      "epoch": 2.753143613500993,
      "grad_norm": 0.3021298050880432,
      "learning_rate": 7.2107765451664034e-06,
      "loss": 0.9204,
      "step": 1040
    },
    {
      "epoch": 2.8060886829913967,
      "grad_norm": 0.35478419065475464,
      "learning_rate": 5.625990491283677e-06,
      "loss": 0.9342,
      "step": 1060
    },
    {
      "epoch": 2.8590337524818,
      "grad_norm": 0.34113916754722595,
      "learning_rate": 4.041204437400952e-06,
      "loss": 0.9187,
      "step": 1080
    },
    {
      "epoch": 2.9119788219722036,
      "grad_norm": 0.33516696095466614,
      "learning_rate": 2.456418383518225e-06,
      "loss": 0.9238,
      "step": 1100
    },
    {
      "epoch": 2.9649238914626075,
      "grad_norm": 0.2989369034767151,
      "learning_rate": 8.716323296354993e-07,
      "loss": 0.9188,
      "step": 1120
    }
  ],
  "logging_steps": 20,
  "max_steps": 1131,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.101970198757376e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}