{
  "best_metric": 6.83237886428833,
  "best_model_checkpoint": "./results/models/checkpoint-25230",
  "epoch": 6.0,
  "eval_steps": 500,
  "global_step": 25230,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11890606420927467,
      "grad_norm": 0.4375,
      "learning_rate": 0.003990487514863258,
      "loss": 6.8648,
      "step": 500
    },
    {
      "epoch": 0.23781212841854935,
      "grad_norm": 3.78125,
      "learning_rate": 0.003980975029726516,
      "loss": 6.8372,
      "step": 1000
    },
    {
      "epoch": 0.356718192627824,
      "grad_norm": 3.21875,
      "learning_rate": 0.003971462544589774,
      "loss": 6.872,
      "step": 1500
    },
    {
      "epoch": 0.4756242568370987,
      "grad_norm": 1.8359375,
      "learning_rate": 0.0039619500594530324,
      "loss": 6.8835,
      "step": 2000
    },
    {
      "epoch": 0.5945303210463734,
      "grad_norm": 15.25,
      "learning_rate": 0.00395243757431629,
      "loss": 6.8784,
      "step": 2500
    },
    {
      "epoch": 0.713436385255648,
      "grad_norm": 2.328125,
      "learning_rate": 0.003942925089179548,
      "loss": 6.8964,
      "step": 3000
    },
    {
      "epoch": 0.8323424494649228,
      "grad_norm": 17.0,
      "learning_rate": 0.003933412604042806,
      "loss": 6.8779,
      "step": 3500
    },
    {
      "epoch": 0.9512485136741974,
      "grad_norm": 0.98828125,
      "learning_rate": 0.003923900118906064,
      "loss": 6.8788,
      "step": 4000
    },
    {
      "epoch": 1.0,
      "eval_loss": 6.88191556930542,
      "eval_runtime": 26.2747,
      "eval_samples_per_second": 76.119,
      "eval_steps_per_second": 0.609,
      "step": 4205
    },
    {
      "epoch": 1.070154577883472,
      "grad_norm": 5.1875,
      "learning_rate": 0.003914387633769323,
      "loss": 6.8778,
      "step": 4500
    },
    {
      "epoch": 1.1890606420927468,
      "grad_norm": 2.296875,
      "learning_rate": 0.00390487514863258,
      "loss": 6.8945,
      "step": 5000
    },
    {
      "epoch": 1.3079667063020213,
      "grad_norm": 2.625,
      "learning_rate": 0.0038953626634958384,
      "loss": 6.8862,
      "step": 5500
    },
    {
      "epoch": 1.426872770511296,
      "grad_norm": 0.71875,
      "learning_rate": 0.0038858501783590963,
      "loss": 6.8846,
      "step": 6000
    },
    {
      "epoch": 1.5457788347205708,
      "grad_norm": 1.3046875,
      "learning_rate": 0.0038763376932223546,
      "loss": 6.8757,
      "step": 6500
    },
    {
      "epoch": 1.6646848989298455,
      "grad_norm": 1.0078125,
      "learning_rate": 0.0038668252080856125,
      "loss": 6.8802,
      "step": 7000
    },
    {
      "epoch": 1.7835909631391202,
      "grad_norm": 1.5625,
      "learning_rate": 0.0038573127229488704,
      "loss": 6.8663,
      "step": 7500
    },
    {
      "epoch": 1.9024970273483948,
      "grad_norm": 12.75,
      "learning_rate": 0.0038478002378121287,
      "loss": 6.8694,
      "step": 8000
    },
    {
      "epoch": 2.0,
      "eval_loss": 6.880330562591553,
      "eval_runtime": 23.7249,
      "eval_samples_per_second": 84.299,
      "eval_steps_per_second": 0.674,
      "step": 8410
    },
    {
      "epoch": 2.0214030915576693,
      "grad_norm": 14.375,
      "learning_rate": 0.003838287752675386,
      "loss": 6.8764,
      "step": 8500
    },
    {
      "epoch": 2.140309155766944,
      "grad_norm": 5.09375,
      "learning_rate": 0.0038287752675386444,
      "loss": 6.8742,
      "step": 9000
    },
    {
      "epoch": 2.2592152199762188,
      "grad_norm": 17.125,
      "learning_rate": 0.0038192627824019027,
      "loss": 6.8829,
      "step": 9500
    },
    {
      "epoch": 2.3781212841854935,
      "grad_norm": 2.171875,
      "learning_rate": 0.0038097502972651606,
      "loss": 6.8883,
      "step": 10000
    },
    {
      "epoch": 2.4970273483947683,
      "grad_norm": 1.0,
      "learning_rate": 0.0038002378121284185,
      "loss": 6.8851,
      "step": 10500
    },
    {
      "epoch": 2.6159334126040426,
      "grad_norm": 0.84375,
      "learning_rate": 0.003790725326991677,
      "loss": 6.8849,
      "step": 11000
    },
    {
      "epoch": 2.7348394768133177,
      "grad_norm": 2.0625,
      "learning_rate": 0.0037812128418549347,
      "loss": 6.8784,
      "step": 11500
    },
    {
      "epoch": 2.853745541022592,
      "grad_norm": 5.78125,
      "learning_rate": 0.003771700356718193,
      "loss": 6.8769,
      "step": 12000
    },
    {
      "epoch": 2.972651605231867,
      "grad_norm": 5.125,
      "learning_rate": 0.0037621878715814504,
      "loss": 6.876,
      "step": 12500
    },
    {
      "epoch": 3.0,
      "eval_loss": 6.865922451019287,
      "eval_runtime": 20.8083,
      "eval_samples_per_second": 96.115,
      "eval_steps_per_second": 0.769,
      "step": 12615
    },
    {
      "epoch": 3.0915576694411415,
      "grad_norm": 3.8125,
      "learning_rate": 0.0037526753864447087,
      "loss": 6.8658,
      "step": 13000
    },
    {
      "epoch": 3.2104637336504163,
      "grad_norm": 6.21875,
      "learning_rate": 0.003743162901307967,
      "loss": 6.8673,
      "step": 13500
    },
    {
      "epoch": 3.329369797859691,
      "grad_norm": 3.71875,
      "learning_rate": 0.003733650416171225,
      "loss": 6.8629,
      "step": 14000
    },
    {
      "epoch": 3.4482758620689653,
      "grad_norm": 4.5,
      "learning_rate": 0.0037241379310344828,
      "loss": 6.866,
      "step": 14500
    },
    {
      "epoch": 3.56718192627824,
      "grad_norm": 2.546875,
      "learning_rate": 0.003714625445897741,
      "loss": 6.8627,
      "step": 15000
    },
    {
      "epoch": 3.686087990487515,
      "grad_norm": 7.0625,
      "learning_rate": 0.003705112960760999,
      "loss": 6.859,
      "step": 15500
    },
    {
      "epoch": 3.8049940546967895,
      "grad_norm": 42.0,
      "learning_rate": 0.003695600475624257,
      "loss": 6.852,
      "step": 16000
    },
    {
      "epoch": 3.9239001189060643,
      "grad_norm": 24.75,
      "learning_rate": 0.0036860879904875147,
      "loss": 6.8541,
      "step": 16500
    },
    {
      "epoch": 4.0,
      "eval_loss": 6.84573221206665,
      "eval_runtime": 22.0613,
      "eval_samples_per_second": 90.656,
      "eval_steps_per_second": 0.725,
      "step": 16820
    },
    {
      "epoch": 4.042806183115339,
      "grad_norm": 19.75,
      "learning_rate": 0.003676575505350773,
      "loss": 6.85,
      "step": 17000
    },
    {
      "epoch": 4.161712247324614,
      "grad_norm": 18.0,
      "learning_rate": 0.0036670630202140313,
      "loss": 6.8548,
      "step": 17500
    },
    {
      "epoch": 4.280618311533888,
      "grad_norm": 1.7421875,
      "learning_rate": 0.0036575505350772888,
      "loss": 6.8497,
      "step": 18000
    },
    {
      "epoch": 4.399524375743163,
      "grad_norm": 11.25,
      "learning_rate": 0.003648038049940547,
      "loss": 6.8469,
      "step": 18500
    },
    {
      "epoch": 4.5184304399524375,
      "grad_norm": 181.0,
      "learning_rate": 0.003638525564803805,
      "loss": 6.8481,
      "step": 19000
    },
    {
      "epoch": 4.637336504161713,
      "grad_norm": 13.4375,
      "learning_rate": 0.0036290130796670633,
      "loss": 6.854,
      "step": 19500
    },
    {
      "epoch": 4.756242568370987,
      "grad_norm": 19.75,
      "learning_rate": 0.003619500594530321,
      "loss": 6.848,
      "step": 20000
    },
    {
      "epoch": 4.875148632580261,
      "grad_norm": 3.71875,
      "learning_rate": 0.003609988109393579,
      "loss": 6.8439,
      "step": 20500
    },
    {
      "epoch": 4.9940546967895365,
      "grad_norm": 3.78125,
      "learning_rate": 0.0036004756242568373,
      "loss": 6.8516,
      "step": 21000
    },
    {
      "epoch": 5.0,
      "eval_loss": 6.845646381378174,
      "eval_runtime": 24.3842,
      "eval_samples_per_second": 82.02,
      "eval_steps_per_second": 0.656,
      "step": 21025
    },
    {
      "epoch": 5.112960760998811,
      "grad_norm": 43.0,
      "learning_rate": 0.003590963139120095,
      "loss": 6.8426,
      "step": 21500
    },
    {
      "epoch": 5.231866825208086,
      "grad_norm": 14.5,
      "learning_rate": 0.003581450653983353,
      "loss": 6.8388,
      "step": 22000
    },
    {
      "epoch": 5.35077288941736,
      "grad_norm": 23.25,
      "learning_rate": 0.0035719381688466114,
      "loss": 6.843,
      "step": 22500
    },
    {
      "epoch": 5.469678953626635,
      "grad_norm": 3.546875,
      "learning_rate": 0.0035624256837098692,
      "loss": 6.8391,
      "step": 23000
    },
    {
      "epoch": 5.58858501783591,
      "grad_norm": 4.03125,
      "learning_rate": 0.003552913198573127,
      "loss": 6.838,
      "step": 23500
    },
    {
      "epoch": 5.707491082045184,
      "grad_norm": 2.953125,
      "learning_rate": 0.0035434007134363854,
      "loss": 6.8373,
      "step": 24000
    },
    {
      "epoch": 5.826397146254459,
      "grad_norm": 16.25,
      "learning_rate": 0.0035338882282996433,
      "loss": 6.8372,
      "step": 24500
    },
    {
      "epoch": 5.945303210463734,
      "grad_norm": 49.0,
      "learning_rate": 0.0035243757431629016,
      "loss": 6.8363,
      "step": 25000
    },
    {
      "epoch": 6.0,
      "eval_loss": 6.83237886428833,
      "eval_runtime": 24.8165,
      "eval_samples_per_second": 80.591,
      "eval_steps_per_second": 0.645,
      "step": 25230
    }
  ],
  "logging_steps": 500,
  "max_steps": 210250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.337350063694092e+18,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}