{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0202020202020203, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020202020202020204, "eval_loss": 4.9802985191345215, "eval_runtime": 2.3092, "eval_samples_per_second": 36.376, "eval_steps_per_second": 4.764, "step": 1 }, { "epoch": 0.06060606060606061, "grad_norm": 3.5151710510253906, "learning_rate": 1.5e-05, "loss": 4.8465, "step": 3 }, { "epoch": 0.12121212121212122, "grad_norm": 3.6303155422210693, "learning_rate": 3e-05, "loss": 4.8307, "step": 6 }, { "epoch": 0.18181818181818182, "grad_norm": 3.675507068634033, "learning_rate": 4.5e-05, "loss": 4.7838, "step": 9 }, { "epoch": 0.18181818181818182, "eval_loss": 4.756908893585205, "eval_runtime": 2.3145, "eval_samples_per_second": 36.293, "eval_steps_per_second": 4.753, "step": 9 }, { "epoch": 0.24242424242424243, "grad_norm": 3.3855478763580322, "learning_rate": 4.993910125649561e-05, "loss": 4.3757, "step": 12 }, { "epoch": 0.30303030303030304, "grad_norm": 2.789888858795166, "learning_rate": 4.962019382530521e-05, "loss": 4.1669, "step": 15 }, { "epoch": 0.36363636363636365, "grad_norm": 1.9098223447799683, "learning_rate": 4.9031542398457974e-05, "loss": 3.689, "step": 18 }, { "epoch": 0.36363636363636365, "eval_loss": 3.8252432346343994, "eval_runtime": 2.3162, "eval_samples_per_second": 36.267, "eval_steps_per_second": 4.749, "step": 18 }, { "epoch": 0.42424242424242425, "grad_norm": 1.645618200302124, "learning_rate": 4.817959636416969e-05, "loss": 3.6643, "step": 21 }, { "epoch": 0.48484848484848486, "grad_norm": 1.723616361618042, "learning_rate": 4.707368982147318e-05, "loss": 3.5012, "step": 24 }, { "epoch": 0.5454545454545454, "grad_norm": 1.6863574981689453, "learning_rate": 4.572593931387604e-05, "loss": 3.3348, "step": 27 }, { "epoch": 0.5454545454545454, "eval_loss": 3.322596311569214, "eval_runtime": 2.3165, "eval_samples_per_second": 36.261, "eval_steps_per_second": 4.748, "step": 27 }, { "epoch": 0.6060606060606061, "grad_norm": 1.399096131324768, "learning_rate": 4.415111107797445e-05, "loss": 3.1167, "step": 30 }, { "epoch": 0.6666666666666666, "grad_norm": 1.5978809595108032, "learning_rate": 4.2366459261474933e-05, "loss": 2.9797, "step": 33 }, { "epoch": 0.7272727272727273, "grad_norm": 1.3575938940048218, "learning_rate": 4.039153688314145e-05, "loss": 2.9187, "step": 36 }, { "epoch": 0.7272727272727273, "eval_loss": 2.8548290729522705, "eval_runtime": 2.3198, "eval_samples_per_second": 36.21, "eval_steps_per_second": 4.742, "step": 36 }, { "epoch": 0.7878787878787878, "grad_norm": 1.374647855758667, "learning_rate": 3.824798160583012e-05, "loss": 2.797, "step": 39 }, { "epoch": 0.8484848484848485, "grad_norm": 1.2123751640319824, "learning_rate": 3.5959278669726935e-05, "loss": 2.5678, "step": 42 }, { "epoch": 0.9090909090909091, "grad_norm": 1.1229029893875122, "learning_rate": 3.355050358314172e-05, "loss": 2.5997, "step": 45 }, { "epoch": 0.9090909090909091, "eval_loss": 2.5246951580047607, "eval_runtime": 2.3112, "eval_samples_per_second": 36.344, "eval_steps_per_second": 4.759, "step": 45 }, { "epoch": 0.9696969696969697, "grad_norm": 1.132219672203064, "learning_rate": 3.104804738999169e-05, "loss": 2.4134, "step": 48 }, { "epoch": 1.0303030303030303, "grad_norm": 1.2697609663009644, "learning_rate": 2.8479327524001636e-05, "loss": 2.9878, "step": 51 }, { "epoch": 1.0909090909090908, "grad_norm": 0.9924309849739075, 
"learning_rate": 2.587248741756253e-05, "loss": 2.2329, "step": 54 }, { "epoch": 1.0909090909090908, "eval_loss": 2.328364372253418, "eval_runtime": 2.3181, "eval_samples_per_second": 36.237, "eval_steps_per_second": 4.745, "step": 54 }, { "epoch": 1.1515151515151516, "grad_norm": 0.9968785047531128, "learning_rate": 2.3256088156396868e-05, "loss": 2.2676, "step": 57 }, { "epoch": 1.2121212121212122, "grad_norm": 1.0607465505599976, "learning_rate": 2.0658795558326743e-05, "loss": 2.2769, "step": 60 }, { "epoch": 1.2727272727272727, "grad_norm": 1.071947693824768, "learning_rate": 1.8109066104575023e-05, "loss": 2.1025, "step": 63 }, { "epoch": 1.2727272727272727, "eval_loss": 2.20631742477417, "eval_runtime": 2.3202, "eval_samples_per_second": 36.203, "eval_steps_per_second": 4.741, "step": 63 }, { "epoch": 1.3333333333333333, "grad_norm": 1.0798910856246948, "learning_rate": 1.56348351646022e-05, "loss": 2.1019, "step": 66 }, { "epoch": 1.393939393939394, "grad_norm": 1.1480613946914673, "learning_rate": 1.3263210930352737e-05, "loss": 2.1791, "step": 69 }, { "epoch": 1.4545454545454546, "grad_norm": 0.9959827661514282, "learning_rate": 1.1020177413231334e-05, "loss": 2.0875, "step": 72 }, { "epoch": 1.4545454545454546, "eval_loss": 2.130913257598877, "eval_runtime": 2.3152, "eval_samples_per_second": 36.283, "eval_steps_per_second": 4.751, "step": 72 }, { "epoch": 1.5151515151515151, "grad_norm": 0.9272772073745728, "learning_rate": 8.930309757836517e-06, "loss": 1.9071, "step": 75 }, { "epoch": 1.5757575757575757, "grad_norm": 1.047183871269226, "learning_rate": 7.016504991533726e-06, "loss": 2.116, "step": 78 }, { "epoch": 1.6363636363636362, "grad_norm": 1.016968011856079, "learning_rate": 5.299731159831953e-06, "loss": 1.9271, "step": 81 }, { "epoch": 1.6363636363636362, "eval_loss": 2.092479705810547, "eval_runtime": 2.3272, "eval_samples_per_second": 36.095, "eval_steps_per_second": 4.727, "step": 81 }, { "epoch": 1.696969696969697, "grad_norm": 0.9477640986442566, "learning_rate": 3.798797596089351e-06, "loss": 2.0882, "step": 84 }, { "epoch": 1.7575757575757576, "grad_norm": 0.9164983630180359, "learning_rate": 2.5301488425208296e-06, "loss": 2.046, "step": 87 }, { "epoch": 1.8181818181818183, "grad_norm": 1.078360915184021, "learning_rate": 1.5076844803522922e-06, "loss": 2.0393, "step": 90 }, { "epoch": 1.8181818181818183, "eval_loss": 2.0764904022216797, "eval_runtime": 2.319, "eval_samples_per_second": 36.223, "eval_steps_per_second": 4.743, "step": 90 }, { "epoch": 1.878787878787879, "grad_norm": 0.9819637537002563, "learning_rate": 7.426068431000882e-07, "loss": 2.0016, "step": 93 }, { "epoch": 1.9393939393939394, "grad_norm": 1.2420907020568848, "learning_rate": 2.4329828146074095e-07, "loss": 1.9479, "step": 96 }, { "epoch": 2.0, "grad_norm": 1.456129789352417, "learning_rate": 1.522932452260595e-08, "loss": 2.2999, "step": 99 }, { "epoch": 2.0, "eval_loss": 2.0743141174316406, "eval_runtime": 2.3243, "eval_samples_per_second": 36.14, "eval_steps_per_second": 4.733, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.610904837600051e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }