{ "best_metric": 6.83237886428833, "best_model_checkpoint": "./results/models/checkpoint-25230", "epoch": 6.0, "eval_steps": 500, "global_step": 25230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11890606420927467, "grad_norm": 0.4375, "learning_rate": 0.003990487514863258, "loss": 6.8648, "step": 500 }, { "epoch": 0.23781212841854935, "grad_norm": 3.78125, "learning_rate": 0.003980975029726516, "loss": 6.8372, "step": 1000 }, { "epoch": 0.356718192627824, "grad_norm": 3.21875, "learning_rate": 0.003971462544589774, "loss": 6.872, "step": 1500 }, { "epoch": 0.4756242568370987, "grad_norm": 1.8359375, "learning_rate": 0.0039619500594530324, "loss": 6.8835, "step": 2000 }, { "epoch": 0.5945303210463734, "grad_norm": 15.25, "learning_rate": 0.00395243757431629, "loss": 6.8784, "step": 2500 }, { "epoch": 0.713436385255648, "grad_norm": 2.328125, "learning_rate": 0.003942925089179548, "loss": 6.8964, "step": 3000 }, { "epoch": 0.8323424494649228, "grad_norm": 17.0, "learning_rate": 0.003933412604042806, "loss": 6.8779, "step": 3500 }, { "epoch": 0.9512485136741974, "grad_norm": 0.98828125, "learning_rate": 0.003923900118906064, "loss": 6.8788, "step": 4000 }, { "epoch": 1.0, "eval_loss": 6.88191556930542, "eval_runtime": 26.2747, "eval_samples_per_second": 76.119, "eval_steps_per_second": 0.609, "step": 4205 }, { "epoch": 1.070154577883472, "grad_norm": 5.1875, "learning_rate": 0.003914387633769323, "loss": 6.8778, "step": 4500 }, { "epoch": 1.1890606420927468, "grad_norm": 2.296875, "learning_rate": 0.00390487514863258, "loss": 6.8945, "step": 5000 }, { "epoch": 1.3079667063020213, "grad_norm": 2.625, "learning_rate": 0.0038953626634958384, "loss": 6.8862, "step": 5500 }, { "epoch": 1.426872770511296, "grad_norm": 0.71875, "learning_rate": 0.0038858501783590963, "loss": 6.8846, "step": 6000 }, { "epoch": 1.5457788347205708, "grad_norm": 1.3046875, "learning_rate": 0.0038763376932223546, "loss": 6.8757, "step": 6500 }, { "epoch": 1.6646848989298455, "grad_norm": 1.0078125, "learning_rate": 0.0038668252080856125, "loss": 6.8802, "step": 7000 }, { "epoch": 1.7835909631391202, "grad_norm": 1.5625, "learning_rate": 0.0038573127229488704, "loss": 6.8663, "step": 7500 }, { "epoch": 1.9024970273483948, "grad_norm": 12.75, "learning_rate": 0.0038478002378121287, "loss": 6.8694, "step": 8000 }, { "epoch": 2.0, "eval_loss": 6.880330562591553, "eval_runtime": 23.7249, "eval_samples_per_second": 84.299, "eval_steps_per_second": 0.674, "step": 8410 }, { "epoch": 2.0214030915576693, "grad_norm": 14.375, "learning_rate": 0.003838287752675386, "loss": 6.8764, "step": 8500 }, { "epoch": 2.140309155766944, "grad_norm": 5.09375, "learning_rate": 0.0038287752675386444, "loss": 6.8742, "step": 9000 }, { "epoch": 2.2592152199762188, "grad_norm": 17.125, "learning_rate": 0.0038192627824019027, "loss": 6.8829, "step": 9500 }, { "epoch": 2.3781212841854935, "grad_norm": 2.171875, "learning_rate": 0.0038097502972651606, "loss": 6.8883, "step": 10000 }, { "epoch": 2.4970273483947683, "grad_norm": 1.0, "learning_rate": 0.0038002378121284185, "loss": 6.8851, "step": 10500 }, { "epoch": 2.6159334126040426, "grad_norm": 0.84375, "learning_rate": 0.003790725326991677, "loss": 6.8849, "step": 11000 }, { "epoch": 2.7348394768133177, "grad_norm": 2.0625, "learning_rate": 0.0037812128418549347, "loss": 6.8784, "step": 11500 }, { "epoch": 2.853745541022592, "grad_norm": 5.78125, "learning_rate": 0.003771700356718193, "loss": 6.8769, "step": 12000 }, { "epoch": 2.972651605231867, "grad_norm": 5.125, "learning_rate": 0.0037621878715814504, "loss": 6.876, "step": 12500 }, { "epoch": 3.0, "eval_loss": 6.865922451019287, "eval_runtime": 20.8083, "eval_samples_per_second": 96.115, "eval_steps_per_second": 0.769, "step": 12615 }, { "epoch": 3.0915576694411415, "grad_norm": 3.8125, "learning_rate": 0.0037526753864447087, "loss": 6.8658, "step": 13000 }, { "epoch": 3.2104637336504163, "grad_norm": 6.21875, "learning_rate": 0.003743162901307967, "loss": 6.8673, "step": 13500 }, { "epoch": 3.329369797859691, "grad_norm": 3.71875, "learning_rate": 0.003733650416171225, "loss": 6.8629, "step": 14000 }, { "epoch": 3.4482758620689653, "grad_norm": 4.5, "learning_rate": 0.0037241379310344828, "loss": 6.866, "step": 14500 }, { "epoch": 3.56718192627824, "grad_norm": 2.546875, "learning_rate": 0.003714625445897741, "loss": 6.8627, "step": 15000 }, { "epoch": 3.686087990487515, "grad_norm": 7.0625, "learning_rate": 0.003705112960760999, "loss": 6.859, "step": 15500 }, { "epoch": 3.8049940546967895, "grad_norm": 42.0, "learning_rate": 0.003695600475624257, "loss": 6.852, "step": 16000 }, { "epoch": 3.9239001189060643, "grad_norm": 24.75, "learning_rate": 0.0036860879904875147, "loss": 6.8541, "step": 16500 }, { "epoch": 4.0, "eval_loss": 6.84573221206665, "eval_runtime": 22.0613, "eval_samples_per_second": 90.656, "eval_steps_per_second": 0.725, "step": 16820 }, { "epoch": 4.042806183115339, "grad_norm": 19.75, "learning_rate": 0.003676575505350773, "loss": 6.85, "step": 17000 }, { "epoch": 4.161712247324614, "grad_norm": 18.0, "learning_rate": 0.0036670630202140313, "loss": 6.8548, "step": 17500 }, { "epoch": 4.280618311533888, "grad_norm": 1.7421875, "learning_rate": 0.0036575505350772888, "loss": 6.8497, "step": 18000 }, { "epoch": 4.399524375743163, "grad_norm": 11.25, "learning_rate": 0.003648038049940547, "loss": 6.8469, "step": 18500 }, { "epoch": 4.5184304399524375, "grad_norm": 181.0, "learning_rate": 0.003638525564803805, "loss": 6.8481, "step": 19000 }, { "epoch": 4.637336504161713, "grad_norm": 13.4375, "learning_rate": 0.0036290130796670633, "loss": 6.854, "step": 19500 }, { "epoch": 4.756242568370987, "grad_norm": 19.75, "learning_rate": 0.003619500594530321, "loss": 6.848, "step": 20000 }, { "epoch": 4.875148632580261, "grad_norm": 3.71875, "learning_rate": 0.003609988109393579, "loss": 6.8439, "step": 20500 }, { "epoch": 4.9940546967895365, "grad_norm": 3.78125, "learning_rate": 0.0036004756242568373, "loss": 6.8516, "step": 21000 }, { "epoch": 5.0, "eval_loss": 6.845646381378174, "eval_runtime": 24.3842, "eval_samples_per_second": 82.02, "eval_steps_per_second": 0.656, "step": 21025 }, { "epoch": 5.112960760998811, "grad_norm": 43.0, "learning_rate": 0.003590963139120095, "loss": 6.8426, "step": 21500 }, { "epoch": 5.231866825208086, "grad_norm": 14.5, "learning_rate": 0.003581450653983353, "loss": 6.8388, "step": 22000 }, { "epoch": 5.35077288941736, "grad_norm": 23.25, "learning_rate": 0.0035719381688466114, "loss": 6.843, "step": 22500 }, { "epoch": 5.469678953626635, "grad_norm": 3.546875, "learning_rate": 0.0035624256837098692, "loss": 6.8391, "step": 23000 }, { "epoch": 5.58858501783591, "grad_norm": 4.03125, "learning_rate": 0.003552913198573127, "loss": 6.838, "step": 23500 }, { "epoch": 5.707491082045184, "grad_norm": 2.953125, "learning_rate": 0.0035434007134363854, "loss": 6.8373, "step": 24000 }, { "epoch": 5.826397146254459, "grad_norm": 16.25, "learning_rate": 0.0035338882282996433, "loss": 6.8372, "step": 24500 }, { "epoch": 5.945303210463734, "grad_norm": 49.0, "learning_rate": 0.0035243757431629016, "loss": 6.8363, "step": 25000 }, { "epoch": 6.0, "eval_loss": 6.83237886428833, "eval_runtime": 24.8165, "eval_samples_per_second": 80.591, "eval_steps_per_second": 0.645, "step": 25230 } ], "logging_steps": 500, "max_steps": 210250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.337350063694092e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }