{ "best_metric": 0.8734126984126984, "best_model_checkpoint": "21BAI1229/checkpoint-474", "epoch": 19.746835443037973, "eval_steps": 500, "global_step": 780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9873417721518988, "grad_norm": 11.754476547241211, "learning_rate": 2.5e-05, "loss": 2.6034, "step": 39 }, { "epoch": 0.9873417721518988, "eval_accuracy": 0.451984126984127, "eval_loss": 2.054410696029663, "eval_runtime": 36.1954, "eval_samples_per_second": 69.622, "eval_steps_per_second": 1.105, "step": 39 }, { "epoch": 2.0, "grad_norm": 7.275434970855713, "learning_rate": 4.992877492877493e-05, "loss": 1.4429, "step": 79 }, { "epoch": 2.0, "eval_accuracy": 0.7849206349206349, "eval_loss": 0.7735527157783508, "eval_runtime": 35.4184, "eval_samples_per_second": 71.149, "eval_steps_per_second": 1.129, "step": 79 }, { "epoch": 2.9873417721518987, "grad_norm": 7.623991012573242, "learning_rate": 4.7150997150997157e-05, "loss": 0.8307, "step": 118 }, { "epoch": 2.9873417721518987, "eval_accuracy": 0.8412698412698413, "eval_loss": 0.5455929636955261, "eval_runtime": 35.3707, "eval_samples_per_second": 71.245, "eval_steps_per_second": 1.131, "step": 118 }, { "epoch": 4.0, "grad_norm": 8.973851203918457, "learning_rate": 4.4301994301994304e-05, "loss": 0.6814, "step": 158 }, { "epoch": 4.0, "eval_accuracy": 0.8515873015873016, "eval_loss": 0.48805657029151917, "eval_runtime": 35.4085, "eval_samples_per_second": 71.169, "eval_steps_per_second": 1.13, "step": 158 }, { "epoch": 4.987341772151899, "grad_norm": 8.185949325561523, "learning_rate": 4.152421652421652e-05, "loss": 0.6199, "step": 197 }, { "epoch": 4.987341772151899, "eval_accuracy": 0.8527777777777777, "eval_loss": 0.46135592460632324, "eval_runtime": 35.2536, "eval_samples_per_second": 71.482, "eval_steps_per_second": 1.135, "step": 197 }, { "epoch": 6.0, "grad_norm": 11.136569023132324, "learning_rate": 3.867521367521368e-05, "loss": 0.5578, "step": 237 }, { "epoch": 6.0, "eval_accuracy": 0.8615079365079366, "eval_loss": 0.44191327691078186, "eval_runtime": 35.2038, "eval_samples_per_second": 71.583, "eval_steps_per_second": 1.136, "step": 237 }, { "epoch": 6.987341772151899, "grad_norm": 6.935160160064697, "learning_rate": 3.58974358974359e-05, "loss": 0.5198, "step": 276 }, { "epoch": 6.987341772151899, "eval_accuracy": 0.8603174603174604, "eval_loss": 0.4485108256340027, "eval_runtime": 35.2921, "eval_samples_per_second": 71.404, "eval_steps_per_second": 1.133, "step": 276 }, { "epoch": 8.0, "grad_norm": 7.163381576538086, "learning_rate": 3.304843304843305e-05, "loss": 0.4811, "step": 316 }, { "epoch": 8.0, "eval_accuracy": 0.8658730158730159, "eval_loss": 0.4355041980743408, "eval_runtime": 35.4396, "eval_samples_per_second": 71.107, "eval_steps_per_second": 1.129, "step": 316 }, { "epoch": 8.987341772151899, "grad_norm": 7.22255277633667, "learning_rate": 3.0270655270655275e-05, "loss": 0.4568, "step": 355 }, { "epoch": 8.987341772151899, "eval_accuracy": 0.8650793650793651, "eval_loss": 0.4182125926017761, "eval_runtime": 35.5074, "eval_samples_per_second": 70.971, "eval_steps_per_second": 1.127, "step": 355 }, { "epoch": 10.0, "grad_norm": 7.7428879737854, "learning_rate": 2.7421652421652423e-05, "loss": 0.4268, "step": 395 }, { "epoch": 10.0, "eval_accuracy": 0.8702380952380953, "eval_loss": 0.4093915522098541, "eval_runtime": 35.1709, "eval_samples_per_second": 71.65, "eval_steps_per_second": 1.137, "step": 395 }, { "epoch": 10.987341772151899, "grad_norm": 8.56812572479248, "learning_rate": 2.4643874643874645e-05, "loss": 0.4281, "step": 434 }, { "epoch": 10.987341772151899, "eval_accuracy": 0.8706349206349207, "eval_loss": 0.41577932238578796, "eval_runtime": 35.2893, "eval_samples_per_second": 71.41, "eval_steps_per_second": 1.133, "step": 434 }, { "epoch": 12.0, "grad_norm": 9.711762428283691, "learning_rate": 2.1794871794871795e-05, "loss": 0.4143, "step": 474 }, { "epoch": 12.0, "eval_accuracy": 0.8734126984126984, "eval_loss": 0.40782999992370605, "eval_runtime": 35.0211, "eval_samples_per_second": 71.957, "eval_steps_per_second": 1.142, "step": 474 }, { "epoch": 12.987341772151899, "grad_norm": 7.874723434448242, "learning_rate": 1.9017094017094017e-05, "loss": 0.4009, "step": 513 }, { "epoch": 12.987341772151899, "eval_accuracy": 0.8714285714285714, "eval_loss": 0.4066493511199951, "eval_runtime": 35.2449, "eval_samples_per_second": 71.5, "eval_steps_per_second": 1.135, "step": 513 }, { "epoch": 14.0, "grad_norm": 8.416353225708008, "learning_rate": 1.6168091168091168e-05, "loss": 0.3642, "step": 553 }, { "epoch": 14.0, "eval_accuracy": 0.8682539682539683, "eval_loss": 0.4131360352039337, "eval_runtime": 35.3914, "eval_samples_per_second": 71.204, "eval_steps_per_second": 1.13, "step": 553 }, { "epoch": 14.987341772151899, "grad_norm": 8.845190048217773, "learning_rate": 1.3390313390313392e-05, "loss": 0.3659, "step": 592 }, { "epoch": 14.987341772151899, "eval_accuracy": 0.8726190476190476, "eval_loss": 0.40469926595687866, "eval_runtime": 35.2434, "eval_samples_per_second": 71.503, "eval_steps_per_second": 1.135, "step": 592 }, { "epoch": 16.0, "grad_norm": 7.056828022003174, "learning_rate": 1.0541310541310543e-05, "loss": 0.3487, "step": 632 }, { "epoch": 16.0, "eval_accuracy": 0.871031746031746, "eval_loss": 0.4053677022457123, "eval_runtime": 35.2106, "eval_samples_per_second": 71.569, "eval_steps_per_second": 1.136, "step": 632 }, { "epoch": 16.9873417721519, "grad_norm": 7.8862199783325195, "learning_rate": 7.763532763532765e-06, "loss": 0.35, "step": 671 }, { "epoch": 16.9873417721519, "eval_accuracy": 0.8722222222222222, "eval_loss": 0.41073036193847656, "eval_runtime": 35.125, "eval_samples_per_second": 71.744, "eval_steps_per_second": 1.139, "step": 671 }, { "epoch": 18.0, "grad_norm": 9.344978332519531, "learning_rate": 4.914529914529915e-06, "loss": 0.3291, "step": 711 }, { "epoch": 18.0, "eval_accuracy": 0.8698412698412699, "eval_loss": 0.40985915064811707, "eval_runtime": 35.2658, "eval_samples_per_second": 71.457, "eval_steps_per_second": 1.134, "step": 711 }, { "epoch": 18.9873417721519, "grad_norm": 6.548698902130127, "learning_rate": 2.136752136752137e-06, "loss": 0.338, "step": 750 }, { "epoch": 18.9873417721519, "eval_accuracy": 0.8718253968253968, "eval_loss": 0.40625905990600586, "eval_runtime": 35.4023, "eval_samples_per_second": 71.182, "eval_steps_per_second": 1.13, "step": 750 }, { "epoch": 19.746835443037973, "grad_norm": 6.30403470993042, "learning_rate": 0.0, "loss": 0.3419, "step": 780 }, { "epoch": 19.746835443037973, "eval_accuracy": 0.8702380952380953, "eval_loss": 0.4066447913646698, "eval_runtime": 35.3364, "eval_samples_per_second": 71.315, "eval_steps_per_second": 1.132, "step": 780 }, { "epoch": 19.746835443037973, "step": 780, "total_flos": 1.5428282771770638e+19, "train_loss": 0.6176073722350292, "train_runtime": 7965.5555, "train_samples_per_second": 25.309, "train_steps_per_second": 0.098 } ], "logging_steps": 500, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5428282771770638e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }