|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 24651, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 19.664987564086914, |
|
"learning_rate": 2.939515638310819e-05, |
|
"loss": 2.1242, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 15.668974876403809, |
|
"learning_rate": 2.8786661798709992e-05, |
|
"loss": 1.3535, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 19.882505416870117, |
|
"learning_rate": 2.817816721431179e-05, |
|
"loss": 1.22, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 10.92320442199707, |
|
"learning_rate": 2.7569672629913596e-05, |
|
"loss": 1.1381, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 10.699568748474121, |
|
"learning_rate": 2.6961178045515395e-05, |
|
"loss": 1.0921, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 12.03032112121582, |
|
"learning_rate": 2.6352683461117197e-05, |
|
"loss": 1.0828, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 13.232436180114746, |
|
"learning_rate": 2.5744188876719e-05, |
|
"loss": 1.0373, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 14.177654266357422, |
|
"learning_rate": 2.5135694292320798e-05, |
|
"loss": 0.9764, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 21.473983764648438, |
|
"learning_rate": 2.4528416697091396e-05, |
|
"loss": 0.9691, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 14.361074447631836, |
|
"learning_rate": 2.3919922112693198e-05, |
|
"loss": 0.9319, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 14.396862983703613, |
|
"learning_rate": 2.3311427528294997e-05, |
|
"loss": 0.9323, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 15.474726676940918, |
|
"learning_rate": 2.2702932943896803e-05, |
|
"loss": 0.8969, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 10.854999542236328, |
|
"learning_rate": 2.2095655348667397e-05, |
|
"loss": 0.905, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 7.904343128204346, |
|
"learning_rate": 2.14871607642692e-05, |
|
"loss": 0.9025, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 7.447906970977783, |
|
"learning_rate": 2.0878666179871e-05, |
|
"loss": 0.8784, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 15.822772979736328, |
|
"learning_rate": 2.02701715954728e-05, |
|
"loss": 0.8754, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 13.860732078552246, |
|
"learning_rate": 1.9662894000243398e-05, |
|
"loss": 0.758, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 9.411458015441895, |
|
"learning_rate": 1.9054399415845197e-05, |
|
"loss": 0.6682, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 12.759841918945312, |
|
"learning_rate": 1.8445904831447e-05, |
|
"loss": 0.6745, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 12.465913772583008, |
|
"learning_rate": 1.7837410247048804e-05, |
|
"loss": 0.6599, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 13.312762260437012, |
|
"learning_rate": 1.72301326518194e-05, |
|
"loss": 0.6538, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 8.506616592407227, |
|
"learning_rate": 1.66216380674212e-05, |
|
"loss": 0.6591, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 11.091550827026367, |
|
"learning_rate": 1.6013143483023003e-05, |
|
"loss": 0.6634, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 10.133756637573242, |
|
"learning_rate": 1.5404648898624802e-05, |
|
"loss": 0.6748, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 11.615654945373535, |
|
"learning_rate": 1.4796154314226602e-05, |
|
"loss": 0.6759, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 6.393186092376709, |
|
"learning_rate": 1.4188876718997202e-05, |
|
"loss": 0.6475, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 14.697447776794434, |
|
"learning_rate": 1.3580382134599002e-05, |
|
"loss": 0.6493, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 15.398512840270996, |
|
"learning_rate": 1.2971887550200803e-05, |
|
"loss": 0.6444, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 15.842305183410645, |
|
"learning_rate": 1.2363392965802603e-05, |
|
"loss": 0.6396, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 12.496230125427246, |
|
"learning_rate": 1.1756115370573203e-05, |
|
"loss": 0.628, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 8.957124710083008, |
|
"learning_rate": 1.1147620786175003e-05, |
|
"loss": 0.643, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 9.984686851501465, |
|
"learning_rate": 1.0539126201776804e-05, |
|
"loss": 0.612, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 12.69090747833252, |
|
"learning_rate": 9.930631617378604e-06, |
|
"loss": 0.6215, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 7.916914939880371, |
|
"learning_rate": 9.323354022149204e-06, |
|
"loss": 0.4676, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 13.55048942565918, |
|
"learning_rate": 8.7160764269198e-06, |
|
"loss": 0.4779, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 27.892988204956055, |
|
"learning_rate": 8.107581842521602e-06, |
|
"loss": 0.4771, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 13.891279220581055, |
|
"learning_rate": 7.499087258123403e-06, |
|
"loss": 0.4612, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 19.246742248535156, |
|
"learning_rate": 6.890592673725204e-06, |
|
"loss": 0.4852, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 26.485008239746094, |
|
"learning_rate": 6.282098089327005e-06, |
|
"loss": 0.4841, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 12.629850387573242, |
|
"learning_rate": 5.673603504928806e-06, |
|
"loss": 0.4732, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 21.771028518676758, |
|
"learning_rate": 5.065108920530608e-06, |
|
"loss": 0.4841, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 8.484288215637207, |
|
"learning_rate": 4.457831325301205e-06, |
|
"loss": 0.4703, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 12.666841506958008, |
|
"learning_rate": 3.849336740903006e-06, |
|
"loss": 0.4647, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 6.814643383026123, |
|
"learning_rate": 3.2408421565048075e-06, |
|
"loss": 0.4589, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 10.161438941955566, |
|
"learning_rate": 2.6323475721066084e-06, |
|
"loss": 0.4435, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 17.461462020874023, |
|
"learning_rate": 2.0250699768772058e-06, |
|
"loss": 0.4658, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 6.685905933380127, |
|
"learning_rate": 1.416575392479007e-06, |
|
"loss": 0.4635, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 20.97271156311035, |
|
"learning_rate": 8.080808080808081e-07, |
|
"loss": 0.4541, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 8.495226860046387, |
|
"learning_rate": 1.9958622368260923e-07, |
|
"loss": 0.4637, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 24651, |
|
"total_flos": 3.864867977455206e+16, |
|
"train_loss": 0.7326071534426255, |
|
"train_runtime": 12303.0667, |
|
"train_samples_per_second": 32.057, |
|
"train_steps_per_second": 2.004 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 24651, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 5000, |
|
"total_flos": 3.864867977455206e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|