|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.2941626306398164, |
|
"eval_steps": 500, |
|
"global_step": 9000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1274534794799898, |
|
"grad_norm": 855.9707641601562, |
|
"learning_rate": 9.999156208166614e-06, |
|
"loss": 74.3469, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2549069589599796, |
|
"grad_norm": 302.50543212890625, |
|
"learning_rate": 9.89824533543787e-06, |
|
"loss": 61.1889, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3823604384399694, |
|
"grad_norm": 495.7685546875, |
|
"learning_rate": 9.632470336074009e-06, |
|
"loss": 57.9995, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5098139179199592, |
|
"grad_norm": 265.8482666015625, |
|
"learning_rate": 9.21077660993783e-06, |
|
"loss": 57.4618, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.637267397399949, |
|
"grad_norm": 1070.9898681640625, |
|
"learning_rate": 8.647357437093104e-06, |
|
"loss": 53.0734, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7647208768799388, |
|
"grad_norm": 607.1848754882812, |
|
"learning_rate": 7.961176263324902e-06, |
|
"loss": 54.122, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8921743563599286, |
|
"grad_norm": 299.444091796875, |
|
"learning_rate": 7.175328432410367e-06, |
|
"loss": 55.9574, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2435.821044921875, |
|
"eval_runtime": 13.0829, |
|
"eval_samples_per_second": 266.532, |
|
"eval_steps_per_second": 33.326, |
|
"step": 3923 |
|
}, |
|
{ |
|
"epoch": 1.0196278358399185, |
|
"grad_norm": 313.2464294433594, |
|
"learning_rate": 6.3162638478229965e-06, |
|
"loss": 52.9767, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.1470813153199082, |
|
"grad_norm": 406.19097900390625, |
|
"learning_rate": 5.412896727361663e-06, |
|
"loss": 49.7189, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.274534794799898, |
|
"grad_norm": 227.44325256347656, |
|
"learning_rate": 4.495632414398659e-06, |
|
"loss": 48.3298, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.401988274279888, |
|
"grad_norm": 292.7037353515625, |
|
"learning_rate": 3.595344001132154e-06, |
|
"loss": 50.4245, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.5294417537598777, |
|
"grad_norm": 522.2669677734375, |
|
"learning_rate": 2.7423332084455543e-06, |
|
"loss": 49.8634, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.6568952332398674, |
|
"grad_norm": 408.0614929199219, |
|
"learning_rate": 1.965310496864217e-06, |
|
"loss": 49.0125, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.7843487127198574, |
|
"grad_norm": 722.4154052734375, |
|
"learning_rate": 1.290428735823593e-06, |
|
"loss": 44.8823, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.911802192199847, |
|
"grad_norm": 383.4438171386719, |
|
"learning_rate": 7.404029558083653e-07, |
|
"loss": 47.8235, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2325.86083984375, |
|
"eval_runtime": 13.1669, |
|
"eval_samples_per_second": 264.831, |
|
"eval_steps_per_second": 33.113, |
|
"step": 7846 |
|
}, |
|
{ |
|
"epoch": 2.039255671679837, |
|
"grad_norm": 408.5860290527344, |
|
"learning_rate": 3.337458105627145e-07, |
|
"loss": 44.1611, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.166709151159827, |
|
"grad_norm": 252.09278869628906, |
|
"learning_rate": 8.414448202622494e-08, |
|
"loss": 45.9323, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.2941626306398164, |
|
"grad_norm": 174.15538024902344, |
|
"learning_rate": 0.0, |
|
"loss": 47.7162, |
|
"step": 9000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 9000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 3000, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|