{ "best_metric": 1.22141432762146, "best_model_checkpoint": "outputs/checkpoint-45", "epoch": 1.0, "eval_steps": 15, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 42.25, "learning_rate": 0.001, "loss": 2.0126, "step": 1 }, { "epoch": 0.03389830508474576, "grad_norm": 14.375, "learning_rate": 0.001, "loss": 1.8512, "step": 2 }, { "epoch": 0.05084745762711865, "grad_norm": 7.09375, "learning_rate": 0.001, "loss": 1.4776, "step": 3 }, { "epoch": 0.06779661016949153, "grad_norm": 6.9375, "learning_rate": 0.001, "loss": 1.4508, "step": 4 }, { "epoch": 0.0847457627118644, "grad_norm": 1.5625, "learning_rate": 0.001, "loss": 1.342, "step": 5 }, { "epoch": 0.1016949152542373, "grad_norm": 1.5234375, "learning_rate": 0.001, "loss": 1.3831, "step": 6 }, { "epoch": 0.11864406779661017, "grad_norm": 3.671875, "learning_rate": 0.001, "loss": 1.3233, "step": 7 }, { "epoch": 0.13559322033898305, "grad_norm": 16.875, "learning_rate": 0.001, "loss": 1.8094, "step": 8 }, { "epoch": 0.15254237288135594, "grad_norm": 6.8125, "learning_rate": 0.001, "loss": 1.4678, "step": 9 }, { "epoch": 0.1694915254237288, "grad_norm": 1.6796875, "learning_rate": 0.001, "loss": 1.3086, "step": 10 }, { "epoch": 0.1864406779661017, "grad_norm": 1.09375, "learning_rate": 0.001, "loss": 1.1842, "step": 11 }, { "epoch": 0.2033898305084746, "grad_norm": 1.28125, "learning_rate": 0.001, "loss": 1.2429, "step": 12 }, { "epoch": 0.22033898305084745, "grad_norm": 1.796875, "learning_rate": 0.001, "loss": 1.368, "step": 13 }, { "epoch": 0.23728813559322035, "grad_norm": 1.09375, "learning_rate": 0.001, "loss": 1.2236, "step": 14 }, { "epoch": 0.2542372881355932, "grad_norm": 1.21875, "learning_rate": 0.001, "loss": 1.2481, "step": 15 }, { "epoch": 0.2542372881355932, "eval_loss": 1.2657548189163208, "eval_runtime": 2.241, "eval_samples_per_second": 83.891, "eval_steps_per_second": 10.71, "step": 15 }, { "epoch": 0.2711864406779661, "grad_norm": 1.3359375, "learning_rate": 0.001, "loss": 1.2124, "step": 16 }, { "epoch": 0.288135593220339, "grad_norm": 1.34375, "learning_rate": 0.001, "loss": 1.3263, "step": 17 }, { "epoch": 0.3050847457627119, "grad_norm": 1.140625, "learning_rate": 0.001, "loss": 1.2269, "step": 18 }, { "epoch": 0.3220338983050847, "grad_norm": 1.421875, "learning_rate": 0.001, "loss": 1.3701, "step": 19 }, { "epoch": 0.3389830508474576, "grad_norm": 1.53125, "learning_rate": 0.001, "loss": 1.2618, "step": 20 }, { "epoch": 0.3559322033898305, "grad_norm": 1.6484375, "learning_rate": 0.001, "loss": 1.3644, "step": 21 }, { "epoch": 0.3728813559322034, "grad_norm": 1.4921875, "learning_rate": 0.001, "loss": 1.3143, "step": 22 }, { "epoch": 0.3898305084745763, "grad_norm": 1.8125, "learning_rate": 0.001, "loss": 1.2497, "step": 23 }, { "epoch": 0.4067796610169492, "grad_norm": 1.3828125, "learning_rate": 0.001, "loss": 1.1473, "step": 24 }, { "epoch": 0.423728813559322, "grad_norm": 1.109375, "learning_rate": 0.001, "loss": 1.1923, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 1.390625, "learning_rate": 0.001, "loss": 1.2122, "step": 26 }, { "epoch": 0.4576271186440678, "grad_norm": 1.4375, "learning_rate": 0.001, "loss": 1.1393, "step": 27 }, { "epoch": 0.4745762711864407, "grad_norm": 1.1796875, "learning_rate": 0.001, "loss": 1.1894, "step": 28 }, { "epoch": 0.4915254237288136, "grad_norm": 7.3125, "learning_rate": 0.001, "loss": 1.2592, "step": 29 }, { "epoch": 0.5084745762711864, "grad_norm": 2.109375, "learning_rate": 0.001, "loss": 1.2445, "step": 30 }, { "epoch": 0.5084745762711864, "eval_loss": 1.2405431270599365, "eval_runtime": 2.2457, "eval_samples_per_second": 83.715, "eval_steps_per_second": 10.687, "step": 30 }, { "epoch": 0.5254237288135594, "grad_norm": 3.921875, "learning_rate": 0.001, "loss": 1.2998, "step": 31 }, { "epoch": 0.5423728813559322, "grad_norm": 39.25, "learning_rate": 0.001, "loss": 1.626, "step": 32 }, { "epoch": 0.559322033898305, "grad_norm": 5.4375, "learning_rate": 0.001, "loss": 1.2702, "step": 33 }, { "epoch": 0.576271186440678, "grad_norm": 2.53125, "learning_rate": 0.001, "loss": 1.2658, "step": 34 }, { "epoch": 0.5932203389830508, "grad_norm": 28.5, "learning_rate": 0.001, "loss": 1.2946, "step": 35 }, { "epoch": 0.6101694915254238, "grad_norm": 31.25, "learning_rate": 0.001, "loss": 1.4802, "step": 36 }, { "epoch": 0.6271186440677966, "grad_norm": 5.59375, "learning_rate": 0.001, "loss": 1.5868, "step": 37 }, { "epoch": 0.6440677966101694, "grad_norm": 3.1875, "learning_rate": 0.001, "loss": 1.4699, "step": 38 }, { "epoch": 0.6610169491525424, "grad_norm": 2.625, "learning_rate": 0.001, "loss": 1.359, "step": 39 }, { "epoch": 0.6779661016949152, "grad_norm": 2.515625, "learning_rate": 0.001, "loss": 1.4783, "step": 40 }, { "epoch": 0.6949152542372882, "grad_norm": 1.6640625, "learning_rate": 0.001, "loss": 1.304, "step": 41 }, { "epoch": 0.711864406779661, "grad_norm": 1.6953125, "learning_rate": 0.001, "loss": 1.3992, "step": 42 }, { "epoch": 0.7288135593220338, "grad_norm": 2.234375, "learning_rate": 0.001, "loss": 1.4327, "step": 43 }, { "epoch": 0.7457627118644068, "grad_norm": 3.9375, "learning_rate": 0.001, "loss": 1.2919, "step": 44 }, { "epoch": 0.7627118644067796, "grad_norm": 8.3125, "learning_rate": 0.001, "loss": 1.5113, "step": 45 }, { "epoch": 0.7627118644067796, "eval_loss": 1.22141432762146, "eval_runtime": 2.2509, "eval_samples_per_second": 83.524, "eval_steps_per_second": 10.663, "step": 45 }, { "epoch": 0.7796610169491526, "grad_norm": 3.3125, "learning_rate": 0.001, "loss": 1.3297, "step": 46 }, { "epoch": 0.7966101694915254, "grad_norm": 6.25, "learning_rate": 0.001, "loss": 1.3788, "step": 47 }, { "epoch": 0.8135593220338984, "grad_norm": 2.359375, "learning_rate": 0.001, "loss": 1.3912, "step": 48 }, { "epoch": 0.8305084745762712, "grad_norm": 1.25, "learning_rate": 0.001, "loss": 1.3249, "step": 49 }, { "epoch": 0.847457627118644, "grad_norm": 1.625, "learning_rate": 0.001, "loss": 1.3285, "step": 50 }, { "epoch": 0.864406779661017, "grad_norm": 1.4140625, "learning_rate": 0.001, "loss": 1.2734, "step": 51 }, { "epoch": 0.8813559322033898, "grad_norm": 1.3828125, "learning_rate": 0.001, "loss": 1.2826, "step": 52 }, { "epoch": 0.8983050847457628, "grad_norm": 1.5, "learning_rate": 0.001, "loss": 1.1952, "step": 53 }, { "epoch": 0.9152542372881356, "grad_norm": 1.2109375, "learning_rate": 0.001, "loss": 1.4186, "step": 54 }, { "epoch": 0.9322033898305084, "grad_norm": 1.5078125, "learning_rate": 0.001, "loss": 1.3943, "step": 55 }, { "epoch": 0.9491525423728814, "grad_norm": 1.796875, "learning_rate": 0.001, "loss": 1.3948, "step": 56 }, { "epoch": 0.9661016949152542, "grad_norm": 1.7734375, "learning_rate": 0.001, "loss": 1.3806, "step": 57 }, { "epoch": 0.9830508474576272, "grad_norm": 1.4609375, "learning_rate": 0.001, "loss": 1.4173, "step": 58 }, { "epoch": 1.0, "grad_norm": 1.5234375, "learning_rate": 0.001, "loss": 1.3113, "step": 59 }, { "epoch": 1.0, "step": 59, "total_flos": 8392016473620480.0, "train_loss": 1.3609151254265994, "train_runtime": 66.1562, "train_samples_per_second": 28.327, "train_steps_per_second": 0.892 } ], "logging_steps": 1, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 15, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8392016473620480.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }