|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 13.636363636363637, |
|
"eval_steps": 500, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 220.0, |
|
"learning_rate": 2.5e-05, |
|
"loss": 24.9691, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 35.25, |
|
"learning_rate": 0.000125, |
|
"loss": 21.906, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 7.653346538543701, |
|
"eval_runtime": 0.5608, |
|
"eval_samples_per_second": 3.567, |
|
"eval_steps_per_second": 1.783, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 0.00019956059820218982, |
|
"loss": 13.5603, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 6.644189357757568, |
|
"eval_runtime": 0.5604, |
|
"eval_samples_per_second": 3.569, |
|
"eval_steps_per_second": 1.784, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 4.625, |
|
"learning_rate": 0.00019466156752904343, |
|
"loss": 10.2605, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"eval_loss": 6.081549644470215, |
|
"eval_runtime": 0.5806, |
|
"eval_samples_per_second": 3.445, |
|
"eval_steps_per_second": 1.722, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 15.125, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 9.9129, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 3.114753484725952, |
|
"eval_runtime": 0.5656, |
|
"eval_samples_per_second": 3.536, |
|
"eval_steps_per_second": 1.768, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 0.00016987694277788417, |
|
"loss": 4.5895, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"eval_loss": 1.6582958698272705, |
|
"eval_runtime": 0.5842, |
|
"eval_samples_per_second": 3.424, |
|
"eval_steps_per_second": 1.712, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 0.0001513474193514842, |
|
"loss": 1.6316, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.415539264678955, |
|
"eval_runtime": 0.5662, |
|
"eval_samples_per_second": 3.532, |
|
"eval_steps_per_second": 1.766, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 0.0001300084635000341, |
|
"loss": 1.4115, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"eval_loss": 1.3542958498001099, |
|
"eval_runtime": 0.58, |
|
"eval_samples_per_second": 3.448, |
|
"eval_steps_per_second": 1.724, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0001070276188945293, |
|
"loss": 1.2971, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.313336968421936, |
|
"eval_runtime": 0.5751, |
|
"eval_samples_per_second": 3.478, |
|
"eval_steps_per_second": 1.739, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 8.366226381814697e-05, |
|
"loss": 1.1321, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 8.91, |
|
"eval_loss": 1.2903474569320679, |
|
"eval_runtime": 0.5848, |
|
"eval_samples_per_second": 3.42, |
|
"eval_steps_per_second": 1.71, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 6.119081473277501e-05, |
|
"loss": 1.062, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.084277875864776e-05, |
|
"loss": 0.9739, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.2820332050323486, |
|
"eval_runtime": 0.5646, |
|
"eval_samples_per_second": 3.542, |
|
"eval_steps_per_second": 1.771, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.3731482188961818e-05, |
|
"loss": 0.917, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"eval_loss": 1.2888375520706177, |
|
"eval_runtime": 0.5836, |
|
"eval_samples_per_second": 3.427, |
|
"eval_steps_per_second": 1.714, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 11.82, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 1.0793155744261351e-05, |
|
"loss": 0.8541, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.27806556224823, |
|
"eval_runtime": 0.5683, |
|
"eval_samples_per_second": 3.519, |
|
"eval_steps_per_second": 1.76, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 12.73, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 2.735709467518699e-06, |
|
"loss": 0.8659, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"eval_loss": 1.2891546487808228, |
|
"eval_runtime": 0.7414, |
|
"eval_samples_per_second": 2.698, |
|
"eval_steps_per_second": 1.349, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0, |
|
"loss": 0.8354, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"eval_loss": 1.2830009460449219, |
|
"eval_runtime": 0.5659, |
|
"eval_samples_per_second": 3.534, |
|
"eval_steps_per_second": 1.767, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 13.64, |
|
"step": 75, |
|
"total_flos": 1.1503415671442637e+17, |
|
"train_loss": 4.788167775472005, |
|
"train_runtime": 283.7979, |
|
"train_samples_per_second": 4.651, |
|
"train_steps_per_second": 0.264 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 75, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 100, |
|
"total_flos": 1.1503415671442637e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|