{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.777777777777779,
  "eval_steps": 500,
  "global_step": 330,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.83837890625,
      "learning_rate": 0.00019954719225730847,
      "loss": 1.6268,
      "step": 10
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.63134765625,
      "learning_rate": 0.00019819286972627066,
      "loss": 1.0878,
      "step": 20
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.63623046875,
      "learning_rate": 0.00019594929736144976,
      "loss": 0.9758,
      "step": 30
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 0.54345703125,
      "learning_rate": 0.00019283679330160726,
      "loss": 0.8484,
      "step": 40
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 0.355712890625,
      "learning_rate": 0.00018888354486549237,
      "loss": 0.7642,
      "step": 50
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.90869140625,
      "learning_rate": 0.00018412535328311814,
      "loss": 0.7435,
      "step": 60
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 0.35498046875,
      "learning_rate": 0.00017860530947427875,
      "loss": 0.7328,
      "step": 70
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.37548828125,
      "learning_rate": 0.00017237340381050703,
      "loss": 0.7266,
      "step": 80
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.319091796875,
      "learning_rate": 0.00016548607339452853,
      "loss": 0.7161,
      "step": 90
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.446044921875,
      "learning_rate": 0.00015800569095711982,
      "loss": 0.6811,
      "step": 100
    },
    {
      "epoch": 3.259259259259259,
      "grad_norm": 0.3486328125,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.6885,
      "step": 110
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.312255859375,
      "learning_rate": 0.00014154150130018866,
      "loss": 0.6989,
      "step": 120
    },
    {
      "epoch": 3.851851851851852,
      "grad_norm": 0.402099609375,
      "learning_rate": 0.00013270679633174218,
      "loss": 0.6812,
      "step": 130
    },
    {
      "epoch": 4.148148148148148,
      "grad_norm": 0.3740234375,
      "learning_rate": 0.00012357589355094275,
      "loss": 0.6903,
      "step": 140
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.400390625,
      "learning_rate": 0.00011423148382732853,
      "loss": 0.6746,
      "step": 150
    },
    {
      "epoch": 4.7407407407407405,
      "grad_norm": 0.3525390625,
      "learning_rate": 0.00010475819158237425,
      "loss": 0.6626,
      "step": 160
    },
    {
      "epoch": 5.037037037037037,
      "grad_norm": 0.328857421875,
      "learning_rate": 9.524180841762577e-05,
      "loss": 0.6716,
      "step": 170
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.336669921875,
      "learning_rate": 8.57685161726715e-05,
      "loss": 0.6314,
      "step": 180
    },
    {
      "epoch": 5.62962962962963,
      "grad_norm": 0.336669921875,
      "learning_rate": 7.642410644905726e-05,
      "loss": 0.649,
      "step": 190
    },
    {
      "epoch": 5.925925925925926,
      "grad_norm": 1.4248046875,
      "learning_rate": 6.729320366825784e-05,
      "loss": 0.6906,
      "step": 200
    },
    {
      "epoch": 6.222222222222222,
      "grad_norm": 0.349365234375,
      "learning_rate": 5.845849869981137e-05,
      "loss": 0.644,
      "step": 210
    },
    {
      "epoch": 6.518518518518518,
      "grad_norm": 0.36474609375,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.6315,
      "step": 220
    },
    {
      "epoch": 6.814814814814815,
      "grad_norm": 0.377197265625,
      "learning_rate": 4.19943090428802e-05,
      "loss": 0.6505,
      "step": 230
    },
    {
      "epoch": 7.111111111111111,
      "grad_norm": 0.349853515625,
      "learning_rate": 3.45139266054715e-05,
      "loss": 0.6518,
      "step": 240
    },
    {
      "epoch": 7.407407407407407,
      "grad_norm": 0.40966796875,
      "learning_rate": 2.7626596189492983e-05,
      "loss": 0.6243,
      "step": 250
    },
    {
      "epoch": 7.703703703703704,
      "grad_norm": 0.359619140625,
      "learning_rate": 2.139469052572127e-05,
      "loss": 0.6406,
      "step": 260
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.385498046875,
      "learning_rate": 1.587464671688187e-05,
      "loss": 0.655,
      "step": 270
    },
    {
      "epoch": 8.296296296296296,
      "grad_norm": 0.386474609375,
      "learning_rate": 1.1116455134507664e-05,
      "loss": 0.6513,
      "step": 280
    },
    {
      "epoch": 8.592592592592592,
      "grad_norm": 0.384765625,
      "learning_rate": 7.163206698392744e-06,
      "loss": 0.6311,
      "step": 290
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.341064453125,
      "learning_rate": 4.050702638550275e-06,
      "loss": 0.6223,
      "step": 300
    },
    {
      "epoch": 9.185185185185185,
      "grad_norm": 0.36572265625,
      "learning_rate": 1.8071302737293295e-06,
      "loss": 0.6275,
      "step": 310
    },
    {
      "epoch": 9.481481481481481,
      "grad_norm": 0.380859375,
      "learning_rate": 4.5280774269154115e-07,
      "loss": 0.6198,
      "step": 320
    },
    {
      "epoch": 9.777777777777779,
      "grad_norm": 0.39892578125,
      "learning_rate": 0.0,
      "loss": 0.6337,
      "step": 330
    },
    {
      "epoch": 9.777777777777779,
      "step": 330,
      "total_flos": 1.610313816342528e+16,
      "train_loss": 0.7250095107338645,
      "train_runtime": 297.4234,
      "train_samples_per_second": 4.539,
      "train_steps_per_second": 1.11
    }
  ],
  "logging_steps": 10,
  "max_steps": 330,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 1.610313816342528e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}