{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.971223021582734,
  "eval_steps": 25,
  "global_step": 468,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.64,
      "grad_norm": 494.05772193159083,
      "learning_rate": 9.999874838141888e-05,
      "loss": 1.7591,
      "step": 25
    },
    {
      "epoch": 0.64,
      "eval_loss": 4.927945613861084,
      "eval_runtime": 213.4346,
      "eval_samples_per_second": 46.857,
      "eval_steps_per_second": 0.736,
      "step": 25
    },
    {
      "epoch": 1.28,
      "grad_norm": 41.85279295369776,
      "learning_rate": 9.915628588978522e-05,
      "loss": 2.0299,
      "step": 50
    },
    {
      "epoch": 1.28,
      "eval_loss": 0.818238377571106,
      "eval_runtime": 211.9322,
      "eval_samples_per_second": 47.19,
      "eval_steps_per_second": 0.741,
      "step": 50
    },
    {
      "epoch": 1.92,
      "grad_norm": 10.844358526437997,
      "learning_rate": 9.67797005288181e-05,
      "loss": 0.6558,
      "step": 75
    },
    {
      "epoch": 1.92,
      "eval_loss": 0.5749660134315491,
      "eval_runtime": 211.6596,
      "eval_samples_per_second": 47.25,
      "eval_steps_per_second": 0.742,
      "step": 75
    },
    {
      "epoch": 2.56,
      "grad_norm": 6.085279119700608,
      "learning_rate": 9.294316336102132e-05,
      "loss": 0.4785,
      "step": 100
    },
    {
      "epoch": 2.56,
      "eval_loss": 0.38230499625205994,
      "eval_runtime": 213.3859,
      "eval_samples_per_second": 46.868,
      "eval_steps_per_second": 0.736,
      "step": 100
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.033775912026443,
      "learning_rate": 8.776640921382584e-05,
      "loss": 0.3837,
      "step": 125
    },
    {
      "epoch": 3.2,
      "eval_loss": 0.29413464665412903,
      "eval_runtime": 213.545,
      "eval_samples_per_second": 46.833,
      "eval_steps_per_second": 0.735,
      "step": 125
    },
    {
      "epoch": 3.84,
      "grad_norm": 1.1383110535612262,
      "learning_rate": 8.141099986478212e-05,
      "loss": 0.3073,
      "step": 150
    },
    {
      "epoch": 3.84,
      "eval_loss": 0.23183606564998627,
      "eval_runtime": 212.4796,
      "eval_samples_per_second": 47.068,
      "eval_steps_per_second": 0.739,
      "step": 150
    },
    {
      "epoch": 4.48,
      "grad_norm": 1.081127753088496,
      "learning_rate": 7.407528184577019e-05,
      "loss": 0.2119,
      "step": 175
    },
    {
      "epoch": 4.48,
      "eval_loss": 0.18712490797042847,
      "eval_runtime": 213.5483,
      "eval_samples_per_second": 46.833,
      "eval_steps_per_second": 0.735,
      "step": 175
    },
    {
      "epoch": 5.12,
      "grad_norm": 0.36145826033813083,
      "learning_rate": 6.598819622856227e-05,
      "loss": 0.1632,
      "step": 200
    },
    {
      "epoch": 5.12,
      "eval_loss": 0.15954019129276276,
      "eval_runtime": 211.2408,
      "eval_samples_per_second": 47.344,
      "eval_steps_per_second": 0.743,
      "step": 200
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.3381871309692255,
      "learning_rate": 5.7402133582686576e-05,
      "loss": 0.1297,
      "step": 225
    },
    {
      "epoch": 5.76,
      "eval_loss": 0.14867956936359406,
      "eval_runtime": 214.0779,
      "eval_samples_per_second": 46.717,
      "eval_steps_per_second": 0.733,
      "step": 225
    },
    {
      "epoch": 6.39,
      "grad_norm": 0.3059190090918364,
      "learning_rate": 4.85850570958441e-05,
      "loss": 0.1035,
      "step": 250
    },
    {
      "epoch": 6.39,
      "eval_loss": 0.14759863913059235,
      "eval_runtime": 212.6859,
      "eval_samples_per_second": 47.022,
      "eval_steps_per_second": 0.738,
      "step": 250
    },
    {
      "epoch": 7.03,
      "grad_norm": 0.28211565488590556,
      "learning_rate": 3.9812139687108815e-05,
      "loss": 0.0856,
      "step": 275
    },
    {
      "epoch": 7.03,
      "eval_loss": 0.14268410205841064,
      "eval_runtime": 210.2857,
      "eval_samples_per_second": 47.559,
      "eval_steps_per_second": 0.747,
      "step": 275
    },
    {
      "epoch": 7.67,
      "grad_norm": 0.3745125634810898,
      "learning_rate": 3.135717611098458e-05,
      "loss": 0.0574,
      "step": 300
    },
    {
      "epoch": 7.67,
      "eval_loss": 0.14823941886425018,
      "eval_runtime": 214.2104,
      "eval_samples_per_second": 46.688,
      "eval_steps_per_second": 0.733,
      "step": 300
    },
    {
      "epoch": 8.31,
      "grad_norm": 0.3453636602728437,
      "learning_rate": 2.3484038072721758e-05,
      "loss": 0.0448,
      "step": 325
    },
    {
      "epoch": 8.31,
      "eval_loss": 0.1552451252937317,
      "eval_runtime": 210.5548,
      "eval_samples_per_second": 47.498,
      "eval_steps_per_second": 0.746,
      "step": 325
    },
    {
      "epoch": 8.95,
      "grad_norm": 0.28973521029613436,
      "learning_rate": 1.6438439032954855e-05,
      "loss": 0.0318,
      "step": 350
    },
    {
      "epoch": 8.95,
      "eval_loss": 0.15622578561306,
      "eval_runtime": 210.6816,
      "eval_samples_per_second": 47.47,
      "eval_steps_per_second": 0.745,
      "step": 350
    },
    {
      "epoch": 9.59,
      "grad_norm": 0.20130206793775604,
      "learning_rate": 1.0440265714600572e-05,
      "loss": 0.0196,
      "step": 375
    },
    {
      "epoch": 9.59,
      "eval_loss": 0.1708817183971405,
      "eval_runtime": 213.4033,
      "eval_samples_per_second": 46.864,
      "eval_steps_per_second": 0.736,
      "step": 375
    },
    {
      "epoch": 10.23,
      "grad_norm": 0.12286721385196743,
      "learning_rate": 5.676715638695063e-06,
      "loss": 0.0146,
      "step": 400
    },
    {
      "epoch": 10.23,
      "eval_loss": 0.1793455183506012,
      "eval_runtime": 212.2341,
      "eval_samples_per_second": 47.122,
      "eval_steps_per_second": 0.74,
      "step": 400
    },
    {
      "epoch": 10.87,
      "grad_norm": 0.07993244910369399,
      "learning_rate": 2.2964548604209213e-06,
      "loss": 0.0084,
      "step": 425
    },
    {
      "epoch": 10.87,
      "eval_loss": 0.1853875368833542,
      "eval_runtime": 213.6886,
      "eval_samples_per_second": 46.802,
      "eval_steps_per_second": 0.735,
      "step": 425
    },
    {
      "epoch": 11.51,
      "grad_norm": 0.05627841974218994,
      "learning_rate": 4.049782370561583e-07,
      "loss": 0.0058,
      "step": 450
    },
    {
      "epoch": 11.51,
      "eval_loss": 0.1918596774339676,
      "eval_runtime": 211.3759,
      "eval_samples_per_second": 47.314,
      "eval_steps_per_second": 0.743,
      "step": 450
    },
    {
      "epoch": 11.97,
      "step": 468,
      "total_flos": 3135460343808000.0,
      "train_loss": 0.346917372992915,
      "train_runtime": 79535.2013,
      "train_samples_per_second": 12.071,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 25,
  "max_steps": 468,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 2000,
  "total_flos": 3135460343808000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}