|
{ |
|
"best_metric": 0.8734126984126984, |
|
"best_model_checkpoint": "21BAI1229/checkpoint-474", |
|
"epoch": 19.746835443037973, |
|
"eval_steps": 500, |
|
"global_step": 780, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.9873417721518988, |
|
"grad_norm": 11.754476547241211, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.6034, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.9873417721518988, |
|
"eval_accuracy": 0.451984126984127, |
|
"eval_loss": 2.054410696029663, |
|
"eval_runtime": 36.1954, |
|
"eval_samples_per_second": 69.622, |
|
"eval_steps_per_second": 1.105, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 7.275434970855713, |
|
"learning_rate": 4.992877492877493e-05, |
|
"loss": 1.4429, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.7849206349206349, |
|
"eval_loss": 0.7735527157783508, |
|
"eval_runtime": 35.4184, |
|
"eval_samples_per_second": 71.149, |
|
"eval_steps_per_second": 1.129, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.9873417721518987, |
|
"grad_norm": 7.623991012573242, |
|
"learning_rate": 4.7150997150997157e-05, |
|
"loss": 0.8307, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.9873417721518987, |
|
"eval_accuracy": 0.8412698412698413, |
|
"eval_loss": 0.5455929636955261, |
|
"eval_runtime": 35.3707, |
|
"eval_samples_per_second": 71.245, |
|
"eval_steps_per_second": 1.131, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 8.973851203918457, |
|
"learning_rate": 4.4301994301994304e-05, |
|
"loss": 0.6814, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.8515873015873016, |
|
"eval_loss": 0.48805657029151917, |
|
"eval_runtime": 35.4085, |
|
"eval_samples_per_second": 71.169, |
|
"eval_steps_per_second": 1.13, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 4.987341772151899, |
|
"grad_norm": 8.185949325561523, |
|
"learning_rate": 4.152421652421652e-05, |
|
"loss": 0.6199, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 4.987341772151899, |
|
"eval_accuracy": 0.8527777777777777, |
|
"eval_loss": 0.46135592460632324, |
|
"eval_runtime": 35.2536, |
|
"eval_samples_per_second": 71.482, |
|
"eval_steps_per_second": 1.135, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 11.136569023132324, |
|
"learning_rate": 3.867521367521368e-05, |
|
"loss": 0.5578, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.8615079365079366, |
|
"eval_loss": 0.44191327691078186, |
|
"eval_runtime": 35.2038, |
|
"eval_samples_per_second": 71.583, |
|
"eval_steps_per_second": 1.136, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 6.987341772151899, |
|
"grad_norm": 6.935160160064697, |
|
"learning_rate": 3.58974358974359e-05, |
|
"loss": 0.5198, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 6.987341772151899, |
|
"eval_accuracy": 0.8603174603174604, |
|
"eval_loss": 0.4485108256340027, |
|
"eval_runtime": 35.2921, |
|
"eval_samples_per_second": 71.404, |
|
"eval_steps_per_second": 1.133, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 7.163381576538086, |
|
"learning_rate": 3.304843304843305e-05, |
|
"loss": 0.4811, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8658730158730159, |
|
"eval_loss": 0.4355041980743408, |
|
"eval_runtime": 35.4396, |
|
"eval_samples_per_second": 71.107, |
|
"eval_steps_per_second": 1.129, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 8.987341772151899, |
|
"grad_norm": 7.22255277633667, |
|
"learning_rate": 3.0270655270655275e-05, |
|
"loss": 0.4568, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 8.987341772151899, |
|
"eval_accuracy": 0.8650793650793651, |
|
"eval_loss": 0.4182125926017761, |
|
"eval_runtime": 35.5074, |
|
"eval_samples_per_second": 70.971, |
|
"eval_steps_per_second": 1.127, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 7.7428879737854, |
|
"learning_rate": 2.7421652421652423e-05, |
|
"loss": 0.4268, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.8702380952380953, |
|
"eval_loss": 0.4093915522098541, |
|
"eval_runtime": 35.1709, |
|
"eval_samples_per_second": 71.65, |
|
"eval_steps_per_second": 1.137, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 10.987341772151899, |
|
"grad_norm": 8.56812572479248, |
|
"learning_rate": 2.4643874643874645e-05, |
|
"loss": 0.4281, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 10.987341772151899, |
|
"eval_accuracy": 0.8706349206349207, |
|
"eval_loss": 0.41577932238578796, |
|
"eval_runtime": 35.2893, |
|
"eval_samples_per_second": 71.41, |
|
"eval_steps_per_second": 1.133, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 9.711762428283691, |
|
"learning_rate": 2.1794871794871795e-05, |
|
"loss": 0.4143, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.8734126984126984, |
|
"eval_loss": 0.40782999992370605, |
|
"eval_runtime": 35.0211, |
|
"eval_samples_per_second": 71.957, |
|
"eval_steps_per_second": 1.142, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 12.987341772151899, |
|
"grad_norm": 7.874723434448242, |
|
"learning_rate": 1.9017094017094017e-05, |
|
"loss": 0.4009, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 12.987341772151899, |
|
"eval_accuracy": 0.8714285714285714, |
|
"eval_loss": 0.4066493511199951, |
|
"eval_runtime": 35.2449, |
|
"eval_samples_per_second": 71.5, |
|
"eval_steps_per_second": 1.135, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 8.416353225708008, |
|
"learning_rate": 1.6168091168091168e-05, |
|
"loss": 0.3642, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8682539682539683, |
|
"eval_loss": 0.4131360352039337, |
|
"eval_runtime": 35.3914, |
|
"eval_samples_per_second": 71.204, |
|
"eval_steps_per_second": 1.13, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 14.987341772151899, |
|
"grad_norm": 8.845190048217773, |
|
"learning_rate": 1.3390313390313392e-05, |
|
"loss": 0.3659, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 14.987341772151899, |
|
"eval_accuracy": 0.8726190476190476, |
|
"eval_loss": 0.40469926595687866, |
|
"eval_runtime": 35.2434, |
|
"eval_samples_per_second": 71.503, |
|
"eval_steps_per_second": 1.135, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 7.056828022003174, |
|
"learning_rate": 1.0541310541310543e-05, |
|
"loss": 0.3487, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.871031746031746, |
|
"eval_loss": 0.4053677022457123, |
|
"eval_runtime": 35.2106, |
|
"eval_samples_per_second": 71.569, |
|
"eval_steps_per_second": 1.136, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 16.9873417721519, |
|
"grad_norm": 7.8862199783325195, |
|
"learning_rate": 7.763532763532765e-06, |
|
"loss": 0.35, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 16.9873417721519, |
|
"eval_accuracy": 0.8722222222222222, |
|
"eval_loss": 0.41073036193847656, |
|
"eval_runtime": 35.125, |
|
"eval_samples_per_second": 71.744, |
|
"eval_steps_per_second": 1.139, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 9.344978332519531, |
|
"learning_rate": 4.914529914529915e-06, |
|
"loss": 0.3291, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8698412698412699, |
|
"eval_loss": 0.40985915064811707, |
|
"eval_runtime": 35.2658, |
|
"eval_samples_per_second": 71.457, |
|
"eval_steps_per_second": 1.134, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 18.9873417721519, |
|
"grad_norm": 6.548698902130127, |
|
"learning_rate": 2.136752136752137e-06, |
|
"loss": 0.338, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 18.9873417721519, |
|
"eval_accuracy": 0.8718253968253968, |
|
"eval_loss": 0.40625905990600586, |
|
"eval_runtime": 35.4023, |
|
"eval_samples_per_second": 71.182, |
|
"eval_steps_per_second": 1.13, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 19.746835443037973, |
|
"grad_norm": 6.30403470993042, |
|
"learning_rate": 0.0, |
|
"loss": 0.3419, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 19.746835443037973, |
|
"eval_accuracy": 0.8702380952380953, |
|
"eval_loss": 0.4066447913646698, |
|
"eval_runtime": 35.3364, |
|
"eval_samples_per_second": 71.315, |
|
"eval_steps_per_second": 1.132, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 19.746835443037973, |
|
"step": 780, |
|
"total_flos": 1.5428282771770638e+19, |
|
"train_loss": 0.6176073722350292, |
|
"train_runtime": 7965.5555, |
|
"train_samples_per_second": 25.309, |
|
"train_steps_per_second": 0.098 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5428282771770638e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|