{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2631578947368421, "eval_steps": 4, "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010526315789473684, "grad_norm": 29.17266082763672, "learning_rate": 1.0000000000000002e-06, "loss": 5.424, "step": 1 }, { "epoch": 0.010526315789473684, "eval_loss": NaN, "eval_runtime": 5.3901, "eval_samples_per_second": 7.421, "eval_steps_per_second": 1.855, "step": 1 }, { "epoch": 0.021052631578947368, "grad_norm": 35.16109848022461, "learning_rate": 2.0000000000000003e-06, "loss": 4.8392, "step": 2 }, { "epoch": 0.031578947368421054, "grad_norm": 26.968463897705078, "learning_rate": 3e-06, "loss": 5.4673, "step": 3 }, { "epoch": 0.042105263157894736, "grad_norm": 32.266021728515625, "learning_rate": 4.000000000000001e-06, "loss": 5.3509, "step": 4 }, { "epoch": 0.042105263157894736, "eval_loss": NaN, "eval_runtime": 5.0573, "eval_samples_per_second": 7.909, "eval_steps_per_second": 1.977, "step": 4 }, { "epoch": 0.05263157894736842, "grad_norm": 39.015323638916016, "learning_rate": 5e-06, "loss": 4.2175, "step": 5 }, { "epoch": 0.06315789473684211, "grad_norm": 27.262535095214844, "learning_rate": 6e-06, "loss": 5.0985, "step": 6 }, { "epoch": 0.07368421052631578, "grad_norm": 31.20633316040039, "learning_rate": 7e-06, "loss": 5.267, "step": 7 }, { "epoch": 0.08421052631578947, "grad_norm": 31.28679084777832, "learning_rate": 8.000000000000001e-06, "loss": 4.7477, "step": 8 }, { "epoch": 0.08421052631578947, "eval_loss": NaN, "eval_runtime": 5.0562, "eval_samples_per_second": 7.911, "eval_steps_per_second": 1.978, "step": 8 }, { "epoch": 0.09473684210526316, "grad_norm": 37.779701232910156, "learning_rate": 9e-06, "loss": 5.0317, "step": 9 }, { "epoch": 0.10526315789473684, "grad_norm": 31.49554443359375, "learning_rate": 1e-05, "loss": 4.7971, "step": 10 }, { "epoch": 0.11578947368421053, "grad_norm": 30.3826847076416, "learning_rate": 9.890738003669029e-06, "loss": 4.3297, "step": 11 }, { "epoch": 0.12631578947368421, "grad_norm": 29.516464233398438, "learning_rate": 9.567727288213005e-06, "loss": 4.4154, "step": 12 }, { "epoch": 0.12631578947368421, "eval_loss": NaN, "eval_runtime": 5.0722, "eval_samples_per_second": 7.886, "eval_steps_per_second": 1.972, "step": 12 }, { "epoch": 0.1368421052631579, "grad_norm": 28.98780632019043, "learning_rate": 9.045084971874738e-06, "loss": 3.7288, "step": 13 }, { "epoch": 0.14736842105263157, "grad_norm": 40.040767669677734, "learning_rate": 8.345653031794292e-06, "loss": 3.8781, "step": 14 }, { "epoch": 0.15789473684210525, "grad_norm": 25.301918029785156, "learning_rate": 7.500000000000001e-06, "loss": 3.185, "step": 15 }, { "epoch": 0.16842105263157894, "grad_norm": 26.731529235839844, "learning_rate": 6.545084971874738e-06, "loss": 2.6103, "step": 16 }, { "epoch": 0.16842105263157894, "eval_loss": NaN, "eval_runtime": 5.1077, "eval_samples_per_second": 7.831, "eval_steps_per_second": 1.958, "step": 16 }, { "epoch": 0.17894736842105263, "grad_norm": 31.212656021118164, "learning_rate": 5.522642316338268e-06, "loss": 2.5839, "step": 17 }, { "epoch": 0.18947368421052632, "grad_norm": 19.863901138305664, "learning_rate": 4.477357683661734e-06, "loss": 2.1497, "step": 18 }, { "epoch": 0.2, "grad_norm": 19.72088050842285, "learning_rate": 3.4549150281252635e-06, "loss": 1.8183, "step": 19 }, { "epoch": 0.21052631578947367, "grad_norm": 20.8995361328125, "learning_rate": 2.5000000000000015e-06, "loss": 1.6411, "step": 20 }, { "epoch": 0.21052631578947367, "eval_loss": NaN, "eval_runtime": 5.1024, "eval_samples_per_second": 7.839, "eval_steps_per_second": 1.96, "step": 20 }, { "epoch": 0.22105263157894736, "grad_norm": 19.47292137145996, "learning_rate": 1.6543469682057105e-06, "loss": 1.6535, "step": 21 }, { "epoch": 0.23157894736842105, "grad_norm": 16.744619369506836, "learning_rate": 9.549150281252633e-07, "loss": 1.3746, "step": 22 }, { "epoch": 0.24210526315789474, "grad_norm": 16.703203201293945, "learning_rate": 4.322727117869951e-07, "loss": 1.4706, "step": 23 }, { "epoch": 0.25263157894736843, "grad_norm": 16.96294403076172, "learning_rate": 1.0926199633097156e-07, "loss": 1.4195, "step": 24 }, { "epoch": 0.25263157894736843, "eval_loss": NaN, "eval_runtime": 5.0979, "eval_samples_per_second": 7.846, "eval_steps_per_second": 1.962, "step": 24 }, { "epoch": 0.2631578947368421, "grad_norm": 15.989789009094238, "learning_rate": 0.0, "loss": 1.4447, "step": 25 } ], "logging_steps": 1, "max_steps": 25, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9273591044505600.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }