{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 7,
  "global_step": 132,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030303030303030304,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 2.2128,
      "step": 1
    },
    {
      "epoch": 0.21212121212121213,
      "grad_norm": 2.591930533784769,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 2.2112,
      "step": 7
    },
    {
      "epoch": 0.21212121212121213,
      "eval_loss": 2.200307846069336,
      "eval_runtime": 142.2614,
      "eval_samples_per_second": 3.269,
      "eval_steps_per_second": 0.028,
      "step": 7
    },
    {
      "epoch": 0.42424242424242425,
      "grad_norm": 3.209520918668776,
      "learning_rate": 2.1e-06,
      "loss": 2.1943,
      "step": 14
    },
    {
      "epoch": 0.42424242424242425,
      "eval_loss": 2.147017478942871,
      "eval_runtime": 143.3053,
      "eval_samples_per_second": 3.245,
      "eval_steps_per_second": 0.028,
      "step": 14
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 2.1132781771027593,
      "learning_rate": 2.5095609265912853e-06,
      "loss": 2.1573,
      "step": 21
    },
    {
      "epoch": 0.6363636363636364,
      "eval_loss": 2.0779454708099365,
      "eval_runtime": 143.4582,
      "eval_samples_per_second": 3.241,
      "eval_steps_per_second": 0.028,
      "step": 21
    },
    {
      "epoch": 0.8484848484848485,
      "grad_norm": 1.9056335733865675,
      "learning_rate": 1.3197749551783641e-06,
      "loss": 2.0595,
      "step": 28
    },
    {
      "epoch": 0.8484848484848485,
      "eval_loss": 2.0294580459594727,
      "eval_runtime": 137.3982,
      "eval_samples_per_second": 3.384,
      "eval_steps_per_second": 0.029,
      "step": 28
    },
    {
      "epoch": 1.0606060606060606,
      "grad_norm": 2.1736242058284025,
      "learning_rate": 6.783887430182062e-07,
      "loss": 1.9961,
      "step": 35
    },
    {
      "epoch": 1.0606060606060606,
      "eval_loss": 2.0057485103607178,
      "eval_runtime": 138.1845,
      "eval_samples_per_second": 3.365,
      "eval_steps_per_second": 0.029,
      "step": 35
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 3.1170870533174737,
      "learning_rate": 4.1931673730025623e-07,
      "loss": 1.9332,
      "step": 42
    },
    {
      "epoch": 1.2727272727272727,
      "eval_loss": 1.9999465942382812,
      "eval_runtime": 139.0169,
      "eval_samples_per_second": 3.345,
      "eval_steps_per_second": 0.029,
      "step": 42
    },
    {
      "epoch": 1.4848484848484849,
      "grad_norm": 2.023380630221034,
      "learning_rate": 2.1759855432049637e-07,
      "loss": 1.9101,
      "step": 49
    },
    {
      "epoch": 1.4848484848484849,
      "eval_loss": 1.9939130544662476,
      "eval_runtime": 141.1776,
      "eval_samples_per_second": 3.294,
      "eval_steps_per_second": 0.028,
      "step": 49
    },
    {
      "epoch": 1.696969696969697,
      "grad_norm": 2.6277704707226777,
      "learning_rate": 1.2154440189415328e-07,
      "loss": 1.906,
      "step": 56
    },
    {
      "epoch": 1.696969696969697,
      "eval_loss": 1.990719199180603,
      "eval_runtime": 138.5039,
      "eval_samples_per_second": 3.357,
      "eval_steps_per_second": 0.029,
      "step": 56
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 2.0677909551788045,
      "learning_rate": 7.843503292553053e-08,
      "loss": 1.9054,
      "step": 63
    },
    {
      "epoch": 1.9090909090909092,
      "eval_loss": 1.9889289140701294,
      "eval_runtime": 142.8032,
      "eval_samples_per_second": 3.256,
      "eval_steps_per_second": 0.028,
      "step": 63
    },
    {
      "epoch": 2.121212121212121,
      "grad_norm": 1.4792478673423035,
      "learning_rate": 6.038521136361391e-08,
      "loss": 1.9037,
      "step": 70
    },
    {
      "epoch": 2.121212121212121,
      "eval_loss": 1.9878106117248535,
      "eval_runtime": 139.6825,
      "eval_samples_per_second": 3.329,
      "eval_steps_per_second": 0.029,
      "step": 70
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 5.077301685002183,
      "learning_rate": 5.342647186003563e-08,
      "loss": 1.8786,
      "step": 77
    },
    {
      "epoch": 2.3333333333333335,
      "eval_loss": 1.9871684312820435,
      "eval_runtime": 151.1266,
      "eval_samples_per_second": 3.077,
      "eval_steps_per_second": 0.026,
      "step": 77
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 1.6345215057101947,
      "learning_rate": 5.099824238664556e-08,
      "loss": 1.8962,
      "step": 84
    },
    {
      "epoch": 2.5454545454545454,
      "eval_loss": 1.9865626096725464,
      "eval_runtime": 140.1891,
      "eval_samples_per_second": 3.317,
      "eval_steps_per_second": 0.029,
      "step": 84
    },
    {
      "epoch": 2.757575757575758,
      "grad_norm": 1.7960257700653,
      "learning_rate": 5.024882880767712e-08,
      "loss": 1.8668,
      "step": 91
    },
    {
      "epoch": 2.757575757575758,
      "eval_loss": 1.9858595132827759,
      "eval_runtime": 141.572,
      "eval_samples_per_second": 3.285,
      "eval_steps_per_second": 0.028,
      "step": 91
    },
    {
      "epoch": 2.9696969696969697,
      "grad_norm": 2.888661327751688,
      "learning_rate": 5.0050722602692304e-08,
      "loss": 1.8988,
      "step": 98
    },
    {
      "epoch": 2.9696969696969697,
      "eval_loss": 1.984999179840088,
      "eval_runtime": 139.6459,
      "eval_samples_per_second": 3.33,
      "eval_steps_per_second": 0.029,
      "step": 98
    },
    {
      "epoch": 3.1818181818181817,
      "grad_norm": 1.5620758554693681,
      "learning_rate": 5.000789738737886e-08,
      "loss": 1.8966,
      "step": 105
    },
    {
      "epoch": 3.1818181818181817,
      "eval_loss": 1.9841891527175903,
      "eval_runtime": 144.3125,
      "eval_samples_per_second": 3.222,
      "eval_steps_per_second": 0.028,
      "step": 105
    },
    {
      "epoch": 3.393939393939394,
      "grad_norm": 2.0865509846490657,
      "learning_rate": 5.0000841090079794e-08,
      "loss": 1.8847,
      "step": 112
    },
    {
      "epoch": 3.393939393939394,
      "eval_loss": 1.983519196510315,
      "eval_runtime": 143.6862,
      "eval_samples_per_second": 3.236,
      "eval_steps_per_second": 0.028,
      "step": 112
    },
    {
      "epoch": 3.606060606060606,
      "grad_norm": 1.8436999654324047,
      "learning_rate": 5.000005037180778e-08,
      "loss": 1.8748,
      "step": 119
    },
    {
      "epoch": 3.606060606060606,
      "eval_loss": 1.9828811883926392,
      "eval_runtime": 141.3381,
      "eval_samples_per_second": 3.29,
      "eval_steps_per_second": 0.028,
      "step": 119
    },
    {
      "epoch": 3.8181818181818183,
      "grad_norm": 2.307936602188417,
      "learning_rate": 5.00000011344935e-08,
      "loss": 1.851,
      "step": 126
    },
    {
      "epoch": 3.8181818181818183,
      "eval_loss": 1.9823122024536133,
      "eval_runtime": 142.0143,
      "eval_samples_per_second": 3.274,
      "eval_steps_per_second": 0.028,
      "step": 126
    },
    {
      "epoch": 4.0,
      "step": 132,
      "total_flos": 130691559849984.0,
      "train_loss": 1.9532976656249075,
      "train_runtime": 25190.3368,
      "train_samples_per_second": 0.664,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 7,
  "max_steps": 132,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 27,
  "total_flos": 130691559849984.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}