|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5, |
|
"eval_steps": 3, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008333333333333333, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 1.1097, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 1.1424, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4957225914582386, |
|
"learning_rate": 6.000000000000001e-07, |
|
"loss": 1.1714, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 1.0203233450225075, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 1.1459, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.1952212091375145, |
|
"learning_rate": 3e-06, |
|
"loss": 1.1465, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.832578954318156, |
|
"learning_rate": 2.839090909090909e-06, |
|
"loss": 1.1517, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.0877339332216878, |
|
"learning_rate": 2.678181818181818e-06, |
|
"loss": 1.0991, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 1.4203314600042332, |
|
"learning_rate": 2.5172727272727275e-06, |
|
"loss": 1.1481, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.543863444651869, |
|
"learning_rate": 2.3563636363636366e-06, |
|
"loss": 1.151, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 2.2672694357791805, |
|
"learning_rate": 2.1954545454545456e-06, |
|
"loss": 1.1132, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7570282713584158, |
|
"learning_rate": 2.0345454545454546e-06, |
|
"loss": 1.1043, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 0.7556695763204282, |
|
"learning_rate": 1.9272727272727273e-06, |
|
"loss": 1.0492, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.5250919282965105, |
|
"learning_rate": 1.8199999999999997e-06, |
|
"loss": 1.1206, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 2.402860972914852, |
|
"learning_rate": 1.659090909090909e-06, |
|
"loss": 1.0908, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.9250227812759156, |
|
"learning_rate": 1.4981818181818184e-06, |
|
"loss": 1.0982, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 1.0119045287556745, |
|
"learning_rate": 1.3372727272727274e-06, |
|
"loss": 1.1147, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.7972164809866773, |
|
"learning_rate": 1.1763636363636364e-06, |
|
"loss": 1.1004, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 1.4489209555210352, |
|
"learning_rate": 1.0154545454545454e-06, |
|
"loss": 1.0982, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7531358827177462, |
|
"learning_rate": 8.545454545454544e-07, |
|
"loss": 1.0794, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 0.7368058065522421, |
|
"learning_rate": 6.936363636363635e-07, |
|
"loss": 1.0948, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.2350079863263905, |
|
"learning_rate": 5.327272727272729e-07, |
|
"loss": 1.1091, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"step": 60, |
|
"total_flos": 238563958456320.0, |
|
"train_loss": 1.1158998648325602, |
|
"train_runtime": 19061.2793, |
|
"train_samples_per_second": 0.402, |
|
"train_steps_per_second": 0.003 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 60, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 12, |
|
"total_flos": 238563958456320.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|