{
  "best_metric": 0.08136745542287827,
  "best_model_checkpoint": "model/checkpoint-8500",
  "epoch": 2.903997266826102,
  "eval_steps": 500,
  "global_step": 8500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.17,
      "grad_norm": 1.9025481939315796,
      "learning_rate": 9.430588771210569e-06,
      "loss": 8.0821,
      "step": 500
    },
    {
      "epoch": 0.17,
      "eval_loss": 0.15024334192276,
      "eval_runtime": 39.7831,
      "eval_samples_per_second": 61.961,
      "eval_steps_per_second": 12.392,
      "step": 500
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.37238627672195435,
      "learning_rate": 8.861177542421137e-06,
      "loss": 0.1353,
      "step": 1000
    },
    {
      "epoch": 0.34,
      "eval_loss": 0.09424333274364471,
      "eval_runtime": 39.7674,
      "eval_samples_per_second": 61.986,
      "eval_steps_per_second": 12.397,
      "step": 1000
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.28026077151298523,
      "learning_rate": 8.291766313631705e-06,
      "loss": 0.1056,
      "step": 1500
    },
    {
      "epoch": 0.51,
      "eval_loss": 0.08929836004972458,
      "eval_runtime": 39.8141,
      "eval_samples_per_second": 61.913,
      "eval_steps_per_second": 12.383,
      "step": 1500
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.242831751704216,
      "learning_rate": 7.722355084842274e-06,
      "loss": 0.0994,
      "step": 2000
    },
    {
      "epoch": 0.68,
      "eval_loss": 0.08687467128038406,
      "eval_runtime": 39.8819,
      "eval_samples_per_second": 61.808,
      "eval_steps_per_second": 12.362,
      "step": 2000
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.31068557500839233,
      "learning_rate": 7.152943856052842e-06,
      "loss": 0.0957,
      "step": 2500
    },
    {
      "epoch": 0.85,
      "eval_loss": 0.08578581362962723,
      "eval_runtime": 39.8992,
      "eval_samples_per_second": 61.781,
      "eval_steps_per_second": 12.356,
      "step": 2500
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.1952580064535141,
      "learning_rate": 6.583532627263411e-06,
      "loss": 0.0951,
      "step": 3000
    },
    {
      "epoch": 1.02,
      "eval_loss": 0.0845487117767334,
      "eval_runtime": 39.7723,
      "eval_samples_per_second": 61.978,
      "eval_steps_per_second": 12.396,
      "step": 3000
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.1910097897052765,
      "learning_rate": 6.014121398473979e-06,
      "loss": 0.0923,
      "step": 3500
    },
    {
      "epoch": 1.2,
      "eval_loss": 0.08382081985473633,
      "eval_runtime": 39.606,
      "eval_samples_per_second": 62.238,
      "eval_steps_per_second": 12.448,
      "step": 3500
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.18350045382976532,
      "learning_rate": 5.444710169684546e-06,
      "loss": 0.0899,
      "step": 4000
    },
    {
      "epoch": 1.37,
      "eval_loss": 0.08325745165348053,
      "eval_runtime": 39.6287,
      "eval_samples_per_second": 62.202,
      "eval_steps_per_second": 12.44,
      "step": 4000
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.2158266305923462,
      "learning_rate": 4.8752989408951145e-06,
      "loss": 0.0899,
      "step": 4500
    },
    {
      "epoch": 1.54,
      "eval_loss": 0.08282188326120377,
      "eval_runtime": 39.6573,
      "eval_samples_per_second": 62.158,
      "eval_steps_per_second": 12.432,
      "step": 4500
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.2279905378818512,
      "learning_rate": 4.305887712105683e-06,
      "loss": 0.0894,
      "step": 5000
    },
    {
      "epoch": 1.71,
      "eval_loss": 0.08228254318237305,
      "eval_runtime": 39.8586,
      "eval_samples_per_second": 61.844,
      "eval_steps_per_second": 12.369,
      "step": 5000
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.18786436319351196,
      "learning_rate": 3.7364764833162513e-06,
      "loss": 0.089,
      "step": 5500
    },
    {
      "epoch": 1.88,
      "eval_loss": 0.08236683160066605,
      "eval_runtime": 39.8662,
      "eval_samples_per_second": 61.832,
      "eval_steps_per_second": 12.366,
      "step": 5500
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.20143625140190125,
      "learning_rate": 3.1670652545268194e-06,
      "loss": 0.0882,
      "step": 6000
    },
    {
      "epoch": 2.05,
      "eval_loss": 0.08215593546628952,
      "eval_runtime": 39.8891,
      "eval_samples_per_second": 61.796,
      "eval_steps_per_second": 12.359,
      "step": 6000
    },
    {
      "epoch": 2.22,
      "grad_norm": 0.3449649512767792,
      "learning_rate": 2.5976540257373876e-06,
      "loss": 0.0876,
      "step": 6500
    },
    {
      "epoch": 2.22,
      "eval_loss": 0.08190125972032547,
      "eval_runtime": 39.9241,
      "eval_samples_per_second": 61.742,
      "eval_steps_per_second": 12.348,
      "step": 6500
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.17690658569335938,
      "learning_rate": 2.028242796947956e-06,
      "loss": 0.0876,
      "step": 7000
    },
    {
      "epoch": 2.39,
      "eval_loss": 0.08178862929344177,
      "eval_runtime": 39.8989,
      "eval_samples_per_second": 61.781,
      "eval_steps_per_second": 12.356,
      "step": 7000
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.16777564585208893,
      "learning_rate": 1.4588315681585242e-06,
      "loss": 0.0874,
      "step": 7500
    },
    {
      "epoch": 2.56,
      "eval_loss": 0.08147595077753067,
      "eval_runtime": 39.629,
      "eval_samples_per_second": 62.202,
      "eval_steps_per_second": 12.44,
      "step": 7500
    },
    {
      "epoch": 2.73,
      "grad_norm": 0.2061876803636551,
      "learning_rate": 8.894203393690924e-07,
      "loss": 0.088,
      "step": 8000
    },
    {
      "epoch": 2.73,
      "eval_loss": 0.08150825649499893,
      "eval_runtime": 39.6669,
      "eval_samples_per_second": 62.142,
      "eval_steps_per_second": 12.428,
      "step": 8000
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.21220079064369202,
      "learning_rate": 3.2000911057966066e-07,
      "loss": 0.0871,
      "step": 8500
    },
    {
      "epoch": 2.9,
      "eval_loss": 0.08136745542287827,
      "eval_runtime": 39.796,
      "eval_samples_per_second": 61.941,
      "eval_steps_per_second": 12.388,
      "step": 8500
    }
  ],
  "logging_steps": 500,
  "max_steps": 8781,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 2.7281294636544e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}