{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08708430849616285, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001741686169923257, "grad_norm": 3.0921738147735596, "learning_rate": 5e-05, "loss": 1.9628, "step": 1 }, { "epoch": 0.001741686169923257, "eval_loss": 2.522160053253174, "eval_runtime": 351.8165, "eval_samples_per_second": 10.994, "eval_steps_per_second": 2.749, "step": 1 }, { "epoch": 0.003483372339846514, "grad_norm": 2.935767412185669, "learning_rate": 0.0001, "loss": 1.9583, "step": 2 }, { "epoch": 0.005225058509769771, "grad_norm": 2.823258399963379, "learning_rate": 9.989294616193017e-05, "loss": 1.834, "step": 3 }, { "epoch": 0.006966744679693028, "grad_norm": 2.366340160369873, "learning_rate": 9.957224306869053e-05, "loss": 1.4925, "step": 4 }, { "epoch": 0.008708430849616285, "grad_norm": 1.9907324314117432, "learning_rate": 9.903926402016153e-05, "loss": 1.1095, "step": 5 }, { "epoch": 0.010450117019539542, "grad_norm": 1.629361867904663, "learning_rate": 9.829629131445342e-05, "loss": 0.7857, "step": 6 }, { "epoch": 0.012191803189462799, "grad_norm": 1.6523230075836182, "learning_rate": 9.73465064747553e-05, "loss": 0.6975, "step": 7 }, { "epoch": 0.013933489359386056, "grad_norm": 1.8730814456939697, "learning_rate": 9.619397662556435e-05, "loss": 0.5556, "step": 8 }, { "epoch": 0.01567517552930931, "grad_norm": 1.6527363061904907, "learning_rate": 9.484363707663442e-05, "loss": 0.3817, "step": 9 }, { "epoch": 0.01741686169923257, "grad_norm": 1.103572964668274, "learning_rate": 9.330127018922194e-05, "loss": 0.2157, "step": 10 }, { "epoch": 0.019158547869155825, "grad_norm": 1.1308457851409912, "learning_rate": 9.157348061512727e-05, "loss": 0.2139, "step": 11 }, { "epoch": 0.020900234039079084, "grad_norm": 1.191890001296997, "learning_rate": 8.966766701456177e-05, "loss": 0.1758, "step": 12 }, { "epoch": 0.02264192020900234, "grad_norm": 0.7013289928436279, "learning_rate": 8.759199037394887e-05, "loss": 0.149, "step": 13 }, { "epoch": 0.02264192020900234, "eval_loss": 0.14659395813941956, "eval_runtime": 352.1285, "eval_samples_per_second": 10.985, "eval_steps_per_second": 2.746, "step": 13 }, { "epoch": 0.024383606378925598, "grad_norm": 0.8086413145065308, "learning_rate": 8.535533905932738e-05, "loss": 0.147, "step": 14 }, { "epoch": 0.026125292548848853, "grad_norm": 0.9576364755630493, "learning_rate": 8.296729075500344e-05, "loss": 0.1463, "step": 15 }, { "epoch": 0.027866978718772112, "grad_norm": 0.5651424527168274, "learning_rate": 8.043807145043604e-05, "loss": 0.1489, "step": 16 }, { "epoch": 0.029608664888695367, "grad_norm": 0.7270596623420715, "learning_rate": 7.777851165098012e-05, "loss": 0.1385, "step": 17 }, { "epoch": 0.03135035105861862, "grad_norm": 0.7695695161819458, "learning_rate": 7.500000000000001e-05, "loss": 0.1344, "step": 18 }, { "epoch": 0.033092037228541885, "grad_norm": 0.5415513515472412, "learning_rate": 7.211443451095007e-05, "loss": 0.1271, "step": 19 }, { "epoch": 0.03483372339846514, "grad_norm": 0.5811421871185303, "learning_rate": 6.91341716182545e-05, "loss": 0.1157, "step": 20 }, { "epoch": 0.036575409568388395, "grad_norm": 0.5054365992546082, "learning_rate": 6.607197326515808e-05, "loss": 0.1075, "step": 21 }, { "epoch": 0.03831709573831165, "grad_norm": 0.43094682693481445, "learning_rate": 6.294095225512603e-05, "loss": 0.0977, "step": 22 }, { "epoch": 0.04005878190823491, 
"grad_norm": 0.35248297452926636, "learning_rate": 5.9754516100806423e-05, "loss": 0.0912, "step": 23 }, { "epoch": 0.04180046807815817, "grad_norm": 0.46628761291503906, "learning_rate": 5.6526309611002594e-05, "loss": 0.1272, "step": 24 }, { "epoch": 0.04354215424808142, "grad_norm": 0.37774521112442017, "learning_rate": 5.327015646150716e-05, "loss": 0.0911, "step": 25 }, { "epoch": 0.04528384041800468, "grad_norm": 0.318975567817688, "learning_rate": 5e-05, "loss": 0.0788, "step": 26 }, { "epoch": 0.04528384041800468, "eval_loss": 0.08614670485258102, "eval_runtime": 352.1165, "eval_samples_per_second": 10.985, "eval_steps_per_second": 2.746, "step": 26 }, { "epoch": 0.04702552658792794, "grad_norm": 0.3819328546524048, "learning_rate": 4.6729843538492847e-05, "loss": 0.0898, "step": 27 }, { "epoch": 0.048767212757851196, "grad_norm": 0.38801172375679016, "learning_rate": 4.347369038899744e-05, "loss": 0.0728, "step": 28 }, { "epoch": 0.05050889892777445, "grad_norm": 0.3896499276161194, "learning_rate": 4.0245483899193595e-05, "loss": 0.0931, "step": 29 }, { "epoch": 0.052250585097697706, "grad_norm": 0.44811469316482544, "learning_rate": 3.705904774487396e-05, "loss": 0.104, "step": 30 }, { "epoch": 0.05399227126762097, "grad_norm": 0.3136563003063202, "learning_rate": 3.392802673484193e-05, "loss": 0.071, "step": 31 }, { "epoch": 0.055733957437544224, "grad_norm": 0.4052649736404419, "learning_rate": 3.086582838174551e-05, "loss": 0.0825, "step": 32 }, { "epoch": 0.05747564360746748, "grad_norm": 0.3598843216896057, "learning_rate": 2.7885565489049946e-05, "loss": 0.0904, "step": 33 }, { "epoch": 0.059217329777390734, "grad_norm": 0.3175484538078308, "learning_rate": 2.500000000000001e-05, "loss": 0.0929, "step": 34 }, { "epoch": 0.060959015947313996, "grad_norm": 0.3411295413970947, "learning_rate": 2.2221488349019903e-05, "loss": 0.0826, "step": 35 }, { "epoch": 0.06270070211723724, "grad_norm": 0.4295560419559479, "learning_rate": 1.9561928549563968e-05, "loss": 0.0854, "step": 36 }, { "epoch": 0.0644423882871605, "grad_norm": 0.336063414812088, "learning_rate": 1.703270924499656e-05, "loss": 0.0659, "step": 37 }, { "epoch": 0.06618407445708377, "grad_norm": 0.3456127643585205, "learning_rate": 1.4644660940672627e-05, "loss": 0.0928, "step": 38 }, { "epoch": 0.06792576062700702, "grad_norm": 0.37239643931388855, "learning_rate": 1.2408009626051137e-05, "loss": 0.0807, "step": 39 }, { "epoch": 0.06792576062700702, "eval_loss": 0.07401972264051437, "eval_runtime": 352.2856, "eval_samples_per_second": 10.98, "eval_steps_per_second": 2.745, "step": 39 }, { "epoch": 0.06966744679693028, "grad_norm": 0.39408788084983826, "learning_rate": 1.0332332985438248e-05, "loss": 0.1016, "step": 40 }, { "epoch": 0.07140913296685354, "grad_norm": 0.3007913827896118, "learning_rate": 8.426519384872733e-06, "loss": 0.0745, "step": 41 }, { "epoch": 0.07315081913677679, "grad_norm": 0.33744704723358154, "learning_rate": 6.698729810778065e-06, "loss": 0.086, "step": 42 }, { "epoch": 0.07489250530670005, "grad_norm": 0.37013643980026245, "learning_rate": 5.156362923365588e-06, "loss": 0.0786, "step": 43 }, { "epoch": 0.0766341914766233, "grad_norm": 0.3822749853134155, "learning_rate": 3.8060233744356633e-06, "loss": 0.0882, "step": 44 }, { "epoch": 0.07837587764654656, "grad_norm": 0.27670085430145264, "learning_rate": 2.653493525244721e-06, "loss": 0.0661, "step": 45 }, { "epoch": 0.08011756381646983, "grad_norm": 0.33175578713417053, "learning_rate": 1.70370868554659e-06, "loss": 0.0792, "step": 46 
}, { "epoch": 0.08185924998639307, "grad_norm": 0.2966095209121704, "learning_rate": 9.607359798384785e-07, "loss": 0.0705, "step": 47 }, { "epoch": 0.08360093615631634, "grad_norm": 0.3922402262687683, "learning_rate": 4.277569313094809e-07, "loss": 0.0891, "step": 48 }, { "epoch": 0.08534262232623958, "grad_norm": 0.40399035811424255, "learning_rate": 1.0705383806982606e-07, "loss": 0.0725, "step": 49 }, { "epoch": 0.08708430849616285, "grad_norm": 0.34264397621154785, "learning_rate": 0.0, "loss": 0.0832, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1870196536967168e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }