|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 13.636363636363637, |
|
"eval_steps": 500, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 220.0, |
|
"learning_rate": 2.5e-05, |
|
"loss": 24.9691, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 35.25, |
|
"learning_rate": 0.000125, |
|
"loss": 21.9058, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 7.6562018394470215, |
|
"eval_runtime": 0.5686, |
|
"eval_samples_per_second": 3.517, |
|
"eval_steps_per_second": 1.759, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 9.0, |
|
"learning_rate": 0.00019956059820218982, |
|
"loss": 13.5645, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 6.635939121246338, |
|
"eval_runtime": 0.5597, |
|
"eval_samples_per_second": 3.573, |
|
"eval_steps_per_second": 1.787, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 0.00019466156752904343, |
|
"loss": 10.2613, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"eval_loss": 6.0754241943359375, |
|
"eval_runtime": 0.5741, |
|
"eval_samples_per_second": 3.484, |
|
"eval_steps_per_second": 1.742, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 14.6875, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 9.903, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 3.1116435527801514, |
|
"eval_runtime": 0.5647, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 1.771, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 32.75, |
|
"learning_rate": 0.00016987694277788417, |
|
"loss": 4.594, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"eval_loss": 1.6371122598648071, |
|
"eval_runtime": 0.5752, |
|
"eval_samples_per_second": 3.477, |
|
"eval_steps_per_second": 1.739, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 3.125, |
|
"learning_rate": 0.0001513474193514842, |
|
"loss": 1.6122, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.4159561395645142, |
|
"eval_runtime": 0.5662, |
|
"eval_samples_per_second": 3.532, |
|
"eval_steps_per_second": 1.766, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.0001300084635000341, |
|
"loss": 1.3971, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"eval_loss": 1.3411411046981812, |
|
"eval_runtime": 0.5782, |
|
"eval_samples_per_second": 3.459, |
|
"eval_steps_per_second": 1.73, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.0001070276188945293, |
|
"loss": 1.2757, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.307417631149292, |
|
"eval_runtime": 0.5668, |
|
"eval_samples_per_second": 3.529, |
|
"eval_steps_per_second": 1.764, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 8.366226381814697e-05, |
|
"loss": 1.1233, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 8.91, |
|
"eval_loss": 1.275590419769287, |
|
"eval_runtime": 0.5782, |
|
"eval_samples_per_second": 3.459, |
|
"eval_steps_per_second": 1.729, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 6.119081473277501e-05, |
|
"loss": 1.0599, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.084277875864776e-05, |
|
"loss": 0.9741, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.273629903793335, |
|
"eval_runtime": 0.5636, |
|
"eval_samples_per_second": 3.549, |
|
"eval_steps_per_second": 1.774, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.3731482188961818e-05, |
|
"loss": 0.9266, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"eval_loss": 1.279077410697937, |
|
"eval_runtime": 0.5768, |
|
"eval_samples_per_second": 3.467, |
|
"eval_steps_per_second": 1.734, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 11.82, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 1.0793155744261351e-05, |
|
"loss": 0.8584, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.2753326892852783, |
|
"eval_runtime": 0.5671, |
|
"eval_samples_per_second": 3.527, |
|
"eval_steps_per_second": 1.763, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 12.73, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 2.735709467518699e-06, |
|
"loss": 0.8714, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"eval_loss": 1.2842026948928833, |
|
"eval_runtime": 0.7036, |
|
"eval_samples_per_second": 2.842, |
|
"eval_steps_per_second": 1.421, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0, |
|
"loss": 0.8421, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"eval_loss": 1.2808171510696411, |
|
"eval_runtime": 0.5646, |
|
"eval_samples_per_second": 3.543, |
|
"eval_steps_per_second": 1.771, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"step": 75, |
|
"total_flos": 1.1503415671442637e+17, |
|
"train_loss": 4.785474745432536, |
|
"train_runtime": 283.6494, |
|
"train_samples_per_second": 4.654, |
|
"train_steps_per_second": 0.264 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 75, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 100, |
|
"total_flos": 1.1503415671442637e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|