{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1154068090017311, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001154068090017311, "eval_loss": 2.0465598106384277, "eval_runtime": 103.0211, "eval_samples_per_second": 14.172, "eval_steps_per_second": 1.776, "step": 1 }, { "epoch": 0.003462204270051933, "grad_norm": 8.097893714904785, "learning_rate": 3e-05, "loss": 8.0719, "step": 3 }, { "epoch": 0.006924408540103866, "grad_norm": 5.514200210571289, "learning_rate": 6e-05, "loss": 8.0211, "step": 6 }, { "epoch": 0.0103866128101558, "grad_norm": 5.214906692504883, "learning_rate": 9e-05, "loss": 8.0571, "step": 9 }, { "epoch": 0.0103866128101558, "eval_loss": 1.9573053121566772, "eval_runtime": 104.5011, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.751, "step": 9 }, { "epoch": 0.013848817080207732, "grad_norm": 4.61670446395874, "learning_rate": 9.987820251299122e-05, "loss": 7.7237, "step": 12 }, { "epoch": 0.017311021350259664, "grad_norm": 4.776439666748047, "learning_rate": 9.924038765061042e-05, "loss": 7.5723, "step": 15 }, { "epoch": 0.0207732256203116, "grad_norm": 3.863844394683838, "learning_rate": 9.806308479691595e-05, "loss": 7.8625, "step": 18 }, { "epoch": 0.0207732256203116, "eval_loss": 1.938367247581482, "eval_runtime": 104.6138, "eval_samples_per_second": 13.956, "eval_steps_per_second": 1.749, "step": 18 }, { "epoch": 0.024235429890363532, "grad_norm": 3.75036358833313, "learning_rate": 9.635919272833938e-05, "loss": 7.5972, "step": 21 }, { "epoch": 0.027697634160415464, "grad_norm": 3.694056510925293, "learning_rate": 9.414737964294636e-05, "loss": 7.325, "step": 24 }, { "epoch": 0.031159838430467397, "grad_norm": 3.767008066177368, "learning_rate": 9.145187862775209e-05, "loss": 7.8502, "step": 27 }, { "epoch": 0.031159838430467397, "eval_loss": 1.9282969236373901, "eval_runtime": 104.5512, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.75, "step": 27 }, { "epoch": 0.03462204270051933, "grad_norm": 3.892972230911255, "learning_rate": 8.83022221559489e-05, "loss": 7.737, "step": 30 }, { "epoch": 0.038084246970571264, "grad_norm": 4.205379962921143, "learning_rate": 8.473291852294987e-05, "loss": 7.7768, "step": 33 }, { "epoch": 0.0415464512406232, "grad_norm": 4.217511177062988, "learning_rate": 8.07830737662829e-05, "loss": 7.7462, "step": 36 }, { "epoch": 0.0415464512406232, "eval_loss": 1.9257190227508545, "eval_runtime": 104.5925, "eval_samples_per_second": 13.959, "eval_steps_per_second": 1.75, "step": 36 }, { "epoch": 0.04500865551067513, "grad_norm": 4.240529537200928, "learning_rate": 7.649596321166024e-05, "loss": 7.9651, "step": 39 }, { "epoch": 0.048470859780727064, "grad_norm": 4.575836658477783, "learning_rate": 7.191855733945387e-05, "loss": 7.7481, "step": 42 }, { "epoch": 0.05193306405077899, "grad_norm": 4.351076126098633, "learning_rate": 6.710100716628344e-05, "loss": 7.9443, "step": 45 }, { "epoch": 0.05193306405077899, "eval_loss": 1.9247784614562988, "eval_runtime": 104.5498, "eval_samples_per_second": 13.965, "eval_steps_per_second": 1.75, "step": 45 }, { "epoch": 0.05539526832083093, "grad_norm": 4.322741508483887, "learning_rate": 6.209609477998338e-05, "loss": 7.5892, "step": 48 }, { "epoch": 0.058857472590882864, "grad_norm": 3.949061870574951, "learning_rate": 5.695865504800327e-05, "loss": 7.5119, "step": 51 }, { "epoch": 0.06231967686093479, "grad_norm": 4.710406303405762, "learning_rate": 5.174497483512506e-05, "loss": 7.8128, "step": 54 }, { "epoch": 0.06231967686093479, "eval_loss": 1.9215248823165894, "eval_runtime": 104.5083, "eval_samples_per_second": 13.97, "eval_steps_per_second": 1.751, "step": 54 }, { "epoch": 0.06578188113098672, "grad_norm": 3.8922553062438965, "learning_rate": 4.6512176312793736e-05, "loss": 7.6261, "step": 57 }, { "epoch": 0.06924408540103866, "grad_norm": 3.532529830932617, "learning_rate": 4.131759111665349e-05, "loss": 7.799, "step": 60 }, { "epoch": 0.0727062896710906, "grad_norm": 3.5513439178466797, "learning_rate": 3.6218132209150045e-05, "loss": 7.5004, "step": 63 }, { "epoch": 0.0727062896710906, "eval_loss": 1.9200663566589355, "eval_runtime": 104.4709, "eval_samples_per_second": 13.975, "eval_steps_per_second": 1.752, "step": 63 }, { "epoch": 0.07616849394114253, "grad_norm": 3.730220317840576, "learning_rate": 3.12696703292044e-05, "loss": 7.9821, "step": 66 }, { "epoch": 0.07963069821119446, "grad_norm": 3.3771941661834717, "learning_rate": 2.6526421860705473e-05, "loss": 7.2337, "step": 69 }, { "epoch": 0.0830929024812464, "grad_norm": 3.681811809539795, "learning_rate": 2.2040354826462668e-05, "loss": 7.4931, "step": 72 }, { "epoch": 0.0830929024812464, "eval_loss": 1.9186850786209106, "eval_runtime": 104.5007, "eval_samples_per_second": 13.971, "eval_steps_per_second": 1.751, "step": 72 }, { "epoch": 0.08655510675129832, "grad_norm": 3.645982027053833, "learning_rate": 1.7860619515673033e-05, "loss": 7.5561, "step": 75 }, { "epoch": 0.09001731102135026, "grad_norm": 4.192688941955566, "learning_rate": 1.4033009983067452e-05, "loss": 7.7532, "step": 78 }, { "epoch": 0.0934795152914022, "grad_norm": 4.39183235168457, "learning_rate": 1.0599462319663905e-05, "loss": 7.7928, "step": 81 }, { "epoch": 0.0934795152914022, "eval_loss": 1.9180028438568115, "eval_runtime": 104.5172, "eval_samples_per_second": 13.969, "eval_steps_per_second": 1.751, "step": 81 }, { "epoch": 0.09694171956145413, "grad_norm": 3.5582807064056396, "learning_rate": 7.597595192178702e-06, "loss": 7.6637, "step": 84 }, { "epoch": 0.10040392383150606, "grad_norm": 4.048159122467041, "learning_rate": 5.060297685041659e-06, "loss": 7.3083, "step": 87 }, { "epoch": 0.10386612810155799, "grad_norm": 3.8467483520507812, "learning_rate": 3.0153689607045845e-06, "loss": 7.7565, "step": 90 }, { "epoch": 0.10386612810155799, "eval_loss": 1.91769278049469, "eval_runtime": 104.4893, "eval_samples_per_second": 13.973, "eval_steps_per_second": 1.751, "step": 90 }, { "epoch": 0.10732833237160992, "grad_norm": 3.6378891468048096, "learning_rate": 1.4852136862001764e-06, "loss": 7.2697, "step": 93 }, { "epoch": 0.11079053664166186, "grad_norm": 3.763043165206909, "learning_rate": 4.865965629214819e-07, "loss": 7.5429, "step": 96 }, { "epoch": 0.1142527409117138, "grad_norm": 3.8854165077209473, "learning_rate": 3.04586490452119e-08, "loss": 7.6568, "step": 99 }, { "epoch": 0.1142527409117138, "eval_loss": 1.9176751375198364, "eval_runtime": 104.5578, "eval_samples_per_second": 13.964, "eval_steps_per_second": 1.75, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.402137439371264e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }