|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5000.0, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 0.0051079667173326015, |
|
"learning_rate": 0.00019616000000000002, |
|
"loss": 0.1768, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"grad_norm": 0.0013037772150710225, |
|
"learning_rate": 0.00019216, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 300.0, |
|
"grad_norm": 0.00030190523830242455, |
|
"learning_rate": 0.00018816000000000001, |
|
"loss": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 400.0, |
|
"grad_norm": 0.00017417919298168272, |
|
"learning_rate": 0.00018416, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 500.0, |
|
"grad_norm": 0.00014137968537397683, |
|
"learning_rate": 0.00018016, |
|
"loss": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 600.0, |
|
"grad_norm": 0.0001240275305463001, |
|
"learning_rate": 0.00017616000000000002, |
|
"loss": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 700.0, |
|
"grad_norm": 9.807997412281111e-05, |
|
"learning_rate": 0.00017216, |
|
"loss": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 800.0, |
|
"grad_norm": 6.768624734831974e-05, |
|
"learning_rate": 0.00016816000000000002, |
|
"loss": 0.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 900.0, |
|
"grad_norm": 5.961491842754185e-05, |
|
"learning_rate": 0.00016416, |
|
"loss": 0.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1000.0, |
|
"grad_norm": 5.017322473577224e-05, |
|
"learning_rate": 0.00016016, |
|
"loss": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1100.0, |
|
"grad_norm": 5.257365410216153e-05, |
|
"learning_rate": 0.00015616000000000002, |
|
"loss": 0.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1200.0, |
|
"grad_norm": 5.0212354835821316e-05, |
|
"learning_rate": 0.00015216, |
|
"loss": 0.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1300.0, |
|
"grad_norm": 0.00011130324128316715, |
|
"learning_rate": 0.00014816000000000002, |
|
"loss": 0.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1400.0, |
|
"grad_norm": 3.4537704777903855e-05, |
|
"learning_rate": 0.00014416, |
|
"loss": 0.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1500.0, |
|
"grad_norm": 2.7689882699633017e-05, |
|
"learning_rate": 0.00014016, |
|
"loss": 0.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1600.0, |
|
"grad_norm": 2.726606180658564e-05, |
|
"learning_rate": 0.00013616, |
|
"loss": 0.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1700.0, |
|
"grad_norm": 2.1775686036562547e-05, |
|
"learning_rate": 0.00013216, |
|
"loss": 0.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1800.0, |
|
"grad_norm": 2.3525770302512683e-05, |
|
"learning_rate": 0.00012816000000000002, |
|
"loss": 0.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1900.0, |
|
"grad_norm": 1.902567055367399e-05, |
|
"learning_rate": 0.00012416, |
|
"loss": 0.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2000.0, |
|
"grad_norm": 2.1888447008677758e-05, |
|
"learning_rate": 0.00012016, |
|
"loss": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2100.0, |
|
"grad_norm": 1.896571302495431e-05, |
|
"learning_rate": 0.00011616, |
|
"loss": 0.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2200.0, |
|
"grad_norm": 1.5480936781386845e-05, |
|
"learning_rate": 0.00011216, |
|
"loss": 0.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2300.0, |
|
"grad_norm": 1.3961292097519618e-05, |
|
"learning_rate": 0.00010816, |
|
"loss": 0.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2400.0, |
|
"grad_norm": 1.4109475159784779e-05, |
|
"learning_rate": 0.00010416000000000002, |
|
"loss": 0.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2500.0, |
|
"grad_norm": 1.2665558642765973e-05, |
|
"learning_rate": 0.00010016, |
|
"loss": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2600.0, |
|
"grad_norm": 1.5646817701053806e-05, |
|
"learning_rate": 9.616e-05, |
|
"loss": 0.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2700.0, |
|
"grad_norm": 1.2876950677309651e-05, |
|
"learning_rate": 9.216e-05, |
|
"loss": 0.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2800.0, |
|
"grad_norm": 1.2121616236981936e-05, |
|
"learning_rate": 8.816000000000001e-05, |
|
"loss": 0.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2900.0, |
|
"grad_norm": 1.4524578546115663e-05, |
|
"learning_rate": 8.416000000000001e-05, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3000.0, |
|
"grad_norm": 1.1223896763112862e-05, |
|
"learning_rate": 8.016e-05, |
|
"loss": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3100.0, |
|
"grad_norm": 8.85269673744915e-06, |
|
"learning_rate": 7.616e-05, |
|
"loss": 0.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3200.0, |
|
"grad_norm": 1.264509955944959e-05, |
|
"learning_rate": 7.216e-05, |
|
"loss": 0.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3300.0, |
|
"grad_norm": 8.284540854219813e-06, |
|
"learning_rate": 6.816e-05, |
|
"loss": 0.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3400.0, |
|
"grad_norm": 8.871616046235431e-06, |
|
"learning_rate": 6.416e-05, |
|
"loss": 0.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3500.0, |
|
"grad_norm": 9.966872312361374e-06, |
|
"learning_rate": 6.016000000000001e-05, |
|
"loss": 0.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3600.0, |
|
"grad_norm": 2.9739601814071648e-05, |
|
"learning_rate": 5.6160000000000004e-05, |
|
"loss": 0.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3700.0, |
|
"grad_norm": 7.714033927186392e-06, |
|
"learning_rate": 5.2159999999999995e-05, |
|
"loss": 0.0, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3800.0, |
|
"grad_norm": 1.497406901762588e-05, |
|
"learning_rate": 4.816e-05, |
|
"loss": 0.0, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3900.0, |
|
"grad_norm": 7.307490250241244e-06, |
|
"learning_rate": 4.4160000000000004e-05, |
|
"loss": 0.0, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 4000.0, |
|
"grad_norm": 6.682894763798686e-06, |
|
"learning_rate": 4.016e-05, |
|
"loss": 0.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4100.0, |
|
"grad_norm": 7.749928954581264e-06, |
|
"learning_rate": 3.616e-05, |
|
"loss": 0.0, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 4200.0, |
|
"grad_norm": 1.01770574474358e-05, |
|
"learning_rate": 3.2160000000000004e-05, |
|
"loss": 0.0, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 4300.0, |
|
"grad_norm": 6.606936040043365e-06, |
|
"learning_rate": 2.816e-05, |
|
"loss": 0.0, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4400.0, |
|
"grad_norm": 6.749212843715213e-06, |
|
"learning_rate": 2.4160000000000002e-05, |
|
"loss": 0.0, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4500.0, |
|
"grad_norm": 8.575744686822873e-06, |
|
"learning_rate": 2.016e-05, |
|
"loss": 0.0, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4600.0, |
|
"grad_norm": 6.673930329270661e-06, |
|
"learning_rate": 1.616e-05, |
|
"loss": 0.0, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4700.0, |
|
"grad_norm": 6.32612272966071e-06, |
|
"learning_rate": 1.216e-05, |
|
"loss": 0.0, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4800.0, |
|
"grad_norm": 6.985771960899001e-06, |
|
"learning_rate": 8.160000000000001e-06, |
|
"loss": 0.0, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4900.0, |
|
"grad_norm": 5.245818101684563e-06, |
|
"learning_rate": 4.16e-06, |
|
"loss": 0.0, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 5000.0, |
|
"grad_norm": 5.854470146005042e-06, |
|
"learning_rate": 1.6e-07, |
|
"loss": 0.0, |
|
"step": 5000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5000, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6755965747200000.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|