{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 252,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12,
      "grad_norm": 350422.84375,
      "learning_rate": 0.0002988358809900258,
      "loss": 1.2276,
      "step": 10
    },
    {
      "epoch": 0.24,
      "grad_norm": 127592.3203125,
      "learning_rate": 0.00029536159293436166,
      "loss": 0.7731,
      "step": 20
    },
    {
      "epoch": 0.36,
      "grad_norm": 113769.28125,
      "learning_rate": 0.00028963106229663063,
      "loss": 0.7033,
      "step": 30
    },
    {
      "epoch": 0.48,
      "grad_norm": 109609.40625,
      "learning_rate": 0.0002817332360055343,
      "loss": 0.7912,
      "step": 40
    },
    {
      "epoch": 0.6,
      "grad_norm": 130476.6171875,
      "learning_rate": 0.0002717907008573785,
      "loss": 0.7413,
      "step": 50
    },
    {
      "epoch": 0.71,
      "grad_norm": 86021.2265625,
      "learning_rate": 0.0002599577807744739,
      "loss": 0.7233,
      "step": 60
    },
    {
      "epoch": 0.83,
      "grad_norm": 87099.53125,
      "learning_rate": 0.0002464181414529809,
      "loss": 0.759,
      "step": 70
    },
    {
      "epoch": 0.95,
      "grad_norm": 79031.671875,
      "learning_rate": 0.0002313819395798639,
      "loss": 0.7152,
      "step": 80
    },
    {
      "epoch": 1.07,
      "grad_norm": 96164.9609375,
      "learning_rate": 0.00021508256086763368,
      "loss": 0.6194,
      "step": 90
    },
    {
      "epoch": 1.19,
      "grad_norm": 82587.0859375,
      "learning_rate": 0.00019777299753775265,
      "loss": 0.5308,
      "step": 100
    },
    {
      "epoch": 1.31,
      "grad_norm": 83371.2578125,
      "learning_rate": 0.0001797219214799096,
      "loss": 0.5415,
      "step": 110
    },
    {
      "epoch": 1.43,
      "grad_norm": 107494.8984375,
      "learning_rate": 0.00016120951403796364,
      "loss": 0.5953,
      "step": 120
    },
    {
      "epoch": 1.55,
      "grad_norm": 110181.2578125,
      "learning_rate": 0.0001425231171508954,
      "loss": 0.591,
      "step": 130
    },
    {
      "epoch": 1.67,
      "grad_norm": 84472.59375,
      "learning_rate": 0.00012395277334996044,
      "loss": 0.5071,
      "step": 140
    },
    {
      "epoch": 1.79,
      "grad_norm": 72922.546875,
      "learning_rate": 0.00010578672383836435,
      "loss": 0.5188,
      "step": 150
    },
    {
      "epoch": 1.9,
      "grad_norm": 84903.71875,
      "learning_rate": 8.830693453040829e-05,
      "loss": 0.5508,
      "step": 160
    },
    {
      "epoch": 2.02,
      "grad_norm": 71174.5546875,
      "learning_rate": 7.17847194930753e-05,
      "loss": 0.4979,
      "step": 170
    },
    {
      "epoch": 2.14,
      "grad_norm": 190946.0625,
      "learning_rate": 5.6476529721189974e-05,
      "loss": 0.3451,
      "step": 180
    },
    {
      "epoch": 2.26,
      "grad_norm": 73154.4609375,
      "learning_rate": 4.261997261104223e-05,
      "loss": 0.3543,
      "step": 190
    },
    {
      "epoch": 2.38,
      "grad_norm": 97452.4453125,
      "learning_rate": 3.0430123916561672e-05,
      "loss": 0.3384,
      "step": 200
    },
    {
      "epoch": 2.5,
      "grad_norm": 75840.453125,
      "learning_rate": 2.009618943233419e-05,
      "loss": 0.3708,
      "step": 210
    },
    {
      "epoch": 2.62,
      "grad_norm": 101251.2578125,
      "learning_rate": 1.1778568219438839e-05,
      "loss": 0.354,
      "step": 220
    },
    {
      "epoch": 2.74,
      "grad_norm": 77960.1015625,
      "learning_rate": 5.606362957498195e-06,
      "loss": 0.3275,
      "step": 230
    },
    {
      "epoch": 2.86,
      "grad_norm": 92760.578125,
      "learning_rate": 1.6753760662307215e-06,
      "loss": 0.3273,
      "step": 240
    },
    {
      "epoch": 2.98,
      "grad_norm": 78068.015625,
      "learning_rate": 4.662269987756317e-08,
      "loss": 0.3638,
      "step": 250
    },
    {
      "epoch": 3.0,
      "step": 252,
      "total_flos": 3.744392573136077e+16,
      "train_loss": 0.5652910582129917,
      "train_runtime": 2830.2027,
      "train_samples_per_second": 0.353,
      "train_steps_per_second": 0.089
    }
  ],
  "logging_steps": 10,
  "max_steps": 252,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10,
  "total_flos": 3.744392573136077e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}