{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.021194673072167863, "eval_steps": 13, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014129782048111908, "eval_loss": NaN, "eval_runtime": 235.7619, "eval_samples_per_second": 50.559, "eval_steps_per_second": 6.32, "step": 1 }, { "epoch": 0.0004238934614433572, "grad_norm": NaN, "learning_rate": 3e-05, "loss": 0.0, "step": 3 }, { "epoch": 0.0008477869228867144, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 0.0, "step": 6 }, { "epoch": 0.0012716803843300717, "grad_norm": NaN, "learning_rate": 9e-05, "loss": 0.0, "step": 9 }, { "epoch": 0.0016955738457734289, "grad_norm": NaN, "learning_rate": 9.994965332706573e-05, "loss": 0.0, "step": 12 }, { "epoch": 0.001836871666254548, "eval_loss": NaN, "eval_runtime": 235.922, "eval_samples_per_second": 50.525, "eval_steps_per_second": 6.316, "step": 13 }, { "epoch": 0.002119467307216786, "grad_norm": NaN, "learning_rate": 9.968561049466214e-05, "loss": 0.0, "step": 15 }, { "epoch": 0.0025433607686601435, "grad_norm": NaN, "learning_rate": 9.919647942993148e-05, "loss": 0.0, "step": 18 }, { "epoch": 0.002967254230103501, "grad_norm": NaN, "learning_rate": 9.848447601883435e-05, "loss": 0.0, "step": 21 }, { "epoch": 0.0033911476915468577, "grad_norm": NaN, "learning_rate": 9.755282581475769e-05, "loss": 0.0, "step": 24 }, { "epoch": 0.003673743332509096, "eval_loss": NaN, "eval_runtime": 235.9223, "eval_samples_per_second": 50.525, "eval_steps_per_second": 6.316, "step": 26 }, { "epoch": 0.003815041152990215, "grad_norm": NaN, "learning_rate": 9.640574942595196e-05, "loss": 0.0, "step": 27 }, { "epoch": 0.004238934614433572, "grad_norm": NaN, "learning_rate": 9.504844339512095e-05, "loss": 0.0, "step": 30 }, { "epoch": 0.00466282807587693, "grad_norm": NaN, "learning_rate": 9.348705665778478e-05, "loss": 0.0, "step": 33 }, { "epoch": 0.005086721537320287, "grad_norm": NaN, "learning_rate": 9.172866268606513e-05, "loss": 0.0, "step": 36 }, { "epoch": 0.005510614998763644, "grad_norm": NaN, "learning_rate": 8.978122744408906e-05, "loss": 0.0, "step": 39 }, { "epoch": 0.005510614998763644, "eval_loss": NaN, "eval_runtime": 235.965, "eval_samples_per_second": 50.516, "eval_steps_per_second": 6.314, "step": 39 }, { "epoch": 0.005934508460207002, "grad_norm": NaN, "learning_rate": 8.765357330018056e-05, "loss": 0.0, "step": 42 }, { "epoch": 0.006358401921650359, "grad_norm": NaN, "learning_rate": 8.535533905932738e-05, "loss": 0.0, "step": 45 }, { "epoch": 0.006782295383093715, "grad_norm": NaN, "learning_rate": 8.289693629698564e-05, "loss": 0.0, "step": 48 }, { "epoch": 0.007206188844537073, "grad_norm": NaN, "learning_rate": 8.0289502192041e-05, "loss": 0.0, "step": 51 }, { "epoch": 0.007347486665018192, "eval_loss": NaN, "eval_runtime": 235.958, "eval_samples_per_second": 50.517, "eval_steps_per_second": 6.315, "step": 52 }, { "epoch": 0.00763008230598043, "grad_norm": NaN, "learning_rate": 7.754484907260513e-05, "loss": 0.0, "step": 54 }, { "epoch": 0.008053975767423787, "grad_norm": NaN, "learning_rate": 7.467541090321735e-05, "loss": 0.0, "step": 57 }, { "epoch": 0.008477869228867145, "grad_norm": NaN, "learning_rate": 7.169418695587791e-05, "loss": 0.0, "step": 60 }, { "epoch": 0.008901762690310502, "grad_norm": NaN, "learning_rate": 6.861468292009727e-05, "loss": 0.0, "step": 63 }, { "epoch": 0.00918435833127274, "eval_loss": NaN, "eval_runtime": 235.8169, "eval_samples_per_second": 50.548, "eval_steps_per_second": 6.318, "step": 65 }, { "epoch": 0.00932565615175386, "grad_norm": NaN, "learning_rate": 6.545084971874738e-05, "loss": 0.0, "step": 66 }, { "epoch": 0.009749549613197217, "grad_norm": NaN, "learning_rate": 6.22170203068947e-05, "loss": 0.0, "step": 69 }, { "epoch": 0.010173443074640574, "grad_norm": NaN, "learning_rate": 5.8927844739931834e-05, "loss": 0.0, "step": 72 }, { "epoch": 0.010597336536083931, "grad_norm": NaN, "learning_rate": 5.559822380516539e-05, "loss": 0.0, "step": 75 }, { "epoch": 0.011021229997527289, "grad_norm": NaN, "learning_rate": 5.2243241517525754e-05, "loss": 0.0, "step": 78 }, { "epoch": 0.011021229997527289, "eval_loss": NaN, "eval_runtime": 235.731, "eval_samples_per_second": 50.566, "eval_steps_per_second": 6.321, "step": 78 }, { "epoch": 0.011445123458970646, "grad_norm": NaN, "learning_rate": 4.887809678520976e-05, "loss": 0.0, "step": 81 }, { "epoch": 0.011869016920414003, "grad_norm": NaN, "learning_rate": 4.551803455482833e-05, "loss": 0.0, "step": 84 }, { "epoch": 0.01229291038185736, "grad_norm": NaN, "learning_rate": 4.2178276747988446e-05, "loss": 0.0, "step": 87 }, { "epoch": 0.012716803843300718, "grad_norm": NaN, "learning_rate": 3.887395330218429e-05, "loss": 0.0, "step": 90 }, { "epoch": 0.012858101663781835, "eval_loss": NaN, "eval_runtime": 235.7733, "eval_samples_per_second": 50.557, "eval_steps_per_second": 6.32, "step": 91 }, { "epoch": 0.013140697304744074, "grad_norm": NaN, "learning_rate": 3.562003362839914e-05, "loss": 0.0, "step": 93 }, { "epoch": 0.01356459076618743, "grad_norm": NaN, "learning_rate": 3.243125879593286e-05, "loss": 0.0, "step": 96 }, { "epoch": 0.013988484227630788, "grad_norm": NaN, "learning_rate": 2.932207475167398e-05, "loss": 0.0, "step": 99 }, { "epoch": 0.014412377689074145, "grad_norm": NaN, "learning_rate": 2.630656687635007e-05, "loss": 0.0, "step": 102 }, { "epoch": 0.014694973330036384, "eval_loss": NaN, "eval_runtime": 235.7296, "eval_samples_per_second": 50.566, "eval_steps_per_second": 6.321, "step": 104 }, { "epoch": 0.014836271150517503, "grad_norm": NaN, "learning_rate": 2.3398396174233178e-05, "loss": 0.0, "step": 105 }, { "epoch": 0.01526016461196086, "grad_norm": NaN, "learning_rate": 2.061073738537635e-05, "loss": 0.0, "step": 108 }, { "epoch": 0.015684058073404217, "grad_norm": NaN, "learning_rate": 1.7956219300748793e-05, "loss": 0.0, "step": 111 }, { "epoch": 0.016107951534847575, "grad_norm": NaN, "learning_rate": 1.544686755065677e-05, "loss": 0.0, "step": 114 }, { "epoch": 0.016531844996290932, "grad_norm": NaN, "learning_rate": 1.3094050125632972e-05, "loss": 0.0, "step": 117 }, { "epoch": 0.016531844996290932, "eval_loss": NaN, "eval_runtime": 235.7863, "eval_samples_per_second": 50.554, "eval_steps_per_second": 6.319, "step": 117 }, { "epoch": 0.01695573845773429, "grad_norm": NaN, "learning_rate": 1.090842587659851e-05, "loss": 0.0, "step": 120 }, { "epoch": 0.017379631919177647, "grad_norm": NaN, "learning_rate": 8.899896227604509e-06, "loss": 0.0, "step": 123 }, { "epoch": 0.017803525380621004, "grad_norm": NaN, "learning_rate": 7.077560319906695e-06, "loss": 0.0, "step": 126 }, { "epoch": 0.01822741884206436, "grad_norm": NaN, "learning_rate": 5.449673790581611e-06, "loss": 0.0, "step": 129 }, { "epoch": 0.01836871666254548, "eval_loss": NaN, "eval_runtime": 235.7866, "eval_samples_per_second": 50.554, "eval_steps_per_second": 6.319, "step": 130 }, { "epoch": 0.01865131230350772, "grad_norm": NaN, "learning_rate": 4.023611372427471e-06, "loss": 0.0, "step": 132 }, { "epoch": 0.019075205764951076, "grad_norm": NaN, "learning_rate": 2.8058334845816213e-06, "loss": 0.0, "step": 135 }, { "epoch": 0.019499099226394433, "grad_norm": NaN, "learning_rate": 1.8018569652073381e-06, "loss": 0.0, "step": 138 }, { "epoch": 0.01992299268783779, "grad_norm": NaN, "learning_rate": 1.016230078838226e-06, "loss": 0.0, "step": 141 }, { "epoch": 0.02020558832880003, "eval_loss": NaN, "eval_runtime": 235.8437, "eval_samples_per_second": 50.542, "eval_steps_per_second": 6.318, "step": 143 }, { "epoch": 0.020346886149281148, "grad_norm": NaN, "learning_rate": 4.52511911603265e-07, "loss": 0.0, "step": 144 }, { "epoch": 0.020770779610724505, "grad_norm": NaN, "learning_rate": 1.132562476771959e-07, "loss": 0.0, "step": 147 }, { "epoch": 0.021194673072167863, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.0, "step": 150 } ], "logging_steps": 3, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.19972177870848e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }