Gemma-2-9B-It-SFT / trainer_state.json
chchen's picture
End of training
17ba8fd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.986666666666667,
"eval_steps": 500,
"global_step": 168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.17777777777777778,
"grad_norm": 6.429335594177246,
"learning_rate": 2.9411764705882355e-06,
"loss": 2.8371,
"step": 10
},
{
"epoch": 0.35555555555555557,
"grad_norm": 6.359095573425293,
"learning_rate": 4.995131923687488e-06,
"loss": 2.5866,
"step": 20
},
{
"epoch": 0.5333333333333333,
"grad_norm": 6.2563042640686035,
"learning_rate": 4.90911473983908e-06,
"loss": 2.1317,
"step": 30
},
{
"epoch": 0.7111111111111111,
"grad_norm": 5.7428483963012695,
"learning_rate": 4.71919261421297e-06,
"loss": 1.6483,
"step": 40
},
{
"epoch": 0.8888888888888888,
"grad_norm": 5.202849388122559,
"learning_rate": 4.43355687413747e-06,
"loss": 1.258,
"step": 50
},
{
"epoch": 1.0666666666666667,
"grad_norm": 3.334800958633423,
"learning_rate": 4.064526968101844e-06,
"loss": 0.9961,
"step": 60
},
{
"epoch": 1.2444444444444445,
"grad_norm": 5.201640605926514,
"learning_rate": 3.6280191288478437e-06,
"loss": 0.6742,
"step": 70
},
{
"epoch": 1.4222222222222223,
"grad_norm": 4.489432334899902,
"learning_rate": 3.142859907420615e-06,
"loss": 0.4521,
"step": 80
},
{
"epoch": 1.6,
"grad_norm": 3.431414842605591,
"learning_rate": 2.629974185404951e-06,
"loss": 0.3905,
"step": 90
},
{
"epoch": 1.7777777777777777,
"grad_norm": 2.042440414428711,
"learning_rate": 2.1114826863194882e-06,
"loss": 0.2197,
"step": 100
},
{
"epoch": 1.9555555555555557,
"grad_norm": 2.322110891342163,
"learning_rate": 1.6097479104361328e-06,
"loss": 0.235,
"step": 110
},
{
"epoch": 2.1333333333333333,
"grad_norm": 2.210110664367676,
"learning_rate": 1.1464096417858821e-06,
"loss": 0.177,
"step": 120
},
{
"epoch": 2.311111111111111,
"grad_norm": 1.0164854526519775,
"learning_rate": 7.414516258630245e-07,
"loss": 0.1756,
"step": 130
},
{
"epoch": 2.488888888888889,
"grad_norm": 4.7322163581848145,
"learning_rate": 4.123396721497977e-07,
"loss": 0.1916,
"step": 140
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.4119203090667725,
"learning_rate": 1.7326835503629542e-07,
"loss": 0.1296,
"step": 150
},
{
"epoch": 2.8444444444444446,
"grad_norm": 1.8030685186386108,
"learning_rate": 3.4548802869627806e-08,
"loss": 0.1469,
"step": 160
},
{
"epoch": 2.986666666666667,
"step": 168,
"total_flos": 1.7300235104796672e+16,
"train_loss": 0.8548009863921574,
"train_runtime": 612.7985,
"train_samples_per_second": 4.406,
"train_steps_per_second": 0.274
}
],
"logging_steps": 10,
"max_steps": 168,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7300235104796672e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}