{ "best_metric": 0.8958781361579895, "best_model_checkpoint": "Llama-3.1-8B-medquad-V2/checkpoint-180", "epoch": 2.630937880633374, "eval_steps": 10, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.146163215590743, "grad_norm": 0.3197868764400482, "learning_rate": 0.0002, "loss": 1.2503, "step": 10 }, { "epoch": 0.146163215590743, "eval_loss": 1.1359219551086426, "eval_runtime": 294.7811, "eval_samples_per_second": 5.567, "eval_steps_per_second": 0.699, "step": 10 }, { "epoch": 0.292326431181486, "grad_norm": 0.24248264729976654, "learning_rate": 0.0002, "loss": 1.1182, "step": 20 }, { "epoch": 0.292326431181486, "eval_loss": 1.0198906660079956, "eval_runtime": 294.7536, "eval_samples_per_second": 5.567, "eval_steps_per_second": 0.699, "step": 20 }, { "epoch": 0.438489646772229, "grad_norm": 0.21424424648284912, "learning_rate": 0.0002, "loss": 1.0864, "step": 30 }, { "epoch": 0.438489646772229, "eval_loss": 0.9855759143829346, "eval_runtime": 294.9371, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.698, "step": 30 }, { "epoch": 0.584652862362972, "grad_norm": 0.1644379198551178, "learning_rate": 0.0002, "loss": 0.9031, "step": 40 }, { "epoch": 0.584652862362972, "eval_loss": 0.9680554866790771, "eval_runtime": 294.8273, "eval_samples_per_second": 5.566, "eval_steps_per_second": 0.699, "step": 40 }, { "epoch": 0.730816077953715, "grad_norm": 0.3253229260444641, "learning_rate": 0.0002, "loss": 1.0773, "step": 50 }, { "epoch": 0.730816077953715, "eval_loss": 0.9498887062072754, "eval_runtime": 294.9458, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.698, "step": 50 }, { "epoch": 0.876979293544458, "grad_norm": 0.17308901250362396, "learning_rate": 0.0002, "loss": 0.9575, "step": 60 }, { "epoch": 0.876979293544458, "eval_loss": 0.9426676630973816, "eval_runtime": 294.8574, "eval_samples_per_second": 5.565, "eval_steps_per_second": 0.699, "step": 60 }, { "epoch": 1.0231425091352009, "grad_norm": 0.16445040702819824, "learning_rate": 0.0002, "loss": 0.9768, "step": 70 }, { "epoch": 1.0231425091352009, "eval_loss": 0.9452133178710938, "eval_runtime": 294.8787, "eval_samples_per_second": 5.565, "eval_steps_per_second": 0.699, "step": 70 }, { "epoch": 1.169305724725944, "grad_norm": 0.16264739632606506, "learning_rate": 0.0002, "loss": 0.9673, "step": 80 }, { "epoch": 1.169305724725944, "eval_loss": 0.9263980984687805, "eval_runtime": 294.9078, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.699, "step": 80 }, { "epoch": 1.315468940316687, "grad_norm": 0.1224495992064476, "learning_rate": 0.0002, "loss": 0.8541, "step": 90 }, { "epoch": 1.315468940316687, "eval_loss": 0.9281975626945496, "eval_runtime": 294.9471, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.698, "step": 90 }, { "epoch": 1.46163215590743, "grad_norm": 0.20104917883872986, "learning_rate": 0.0002, "loss": 0.9772, "step": 100 }, { "epoch": 1.46163215590743, "eval_loss": 0.918040931224823, "eval_runtime": 294.8931, "eval_samples_per_second": 5.565, "eval_steps_per_second": 0.699, "step": 100 }, { "epoch": 1.607795371498173, "grad_norm": 0.1838410645723343, "learning_rate": 0.0002, "loss": 0.8427, "step": 110 }, { "epoch": 1.607795371498173, "eval_loss": 0.9211308360099792, "eval_runtime": 294.9367, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.698, "step": 110 }, { "epoch": 1.753958587088916, "grad_norm": 0.2129116952419281, "learning_rate": 0.0002, "loss": 0.9317, "step": 120 }, { "epoch": 1.753958587088916, "eval_loss": 0.9142090678215027, "eval_runtime": 294.8478, "eval_samples_per_second": 5.566, "eval_steps_per_second": 0.699, "step": 120 }, { "epoch": 1.900121802679659, "grad_norm": 0.15467825531959534, "learning_rate": 0.0002, "loss": 0.9498, "step": 130 }, { "epoch": 1.900121802679659, "eval_loss": 0.901090681552887, "eval_runtime": 294.95, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.698, "step": 130 }, { "epoch": 2.0462850182704018, "grad_norm": 0.13313396275043488, "learning_rate": 0.0002, "loss": 0.8412, "step": 140 }, { "epoch": 2.0462850182704018, "eval_loss": 0.9035805463790894, "eval_runtime": 294.9282, "eval_samples_per_second": 5.564, "eval_steps_per_second": 0.698, "step": 140 }, { "epoch": 2.192448233861145, "grad_norm": 0.18854761123657227, "learning_rate": 0.0002, "loss": 0.899, "step": 150 }, { "epoch": 2.192448233861145, "eval_loss": 0.9030548334121704, "eval_runtime": 294.8581, "eval_samples_per_second": 5.565, "eval_steps_per_second": 0.699, "step": 150 }, { "epoch": 2.338611449451888, "grad_norm": 0.14641670882701874, "learning_rate": 0.0002, "loss": 0.7488, "step": 160 }, { "epoch": 2.338611449451888, "eval_loss": 0.8989503383636475, "eval_runtime": 294.8764, "eval_samples_per_second": 5.565, "eval_steps_per_second": 0.699, "step": 160 }, { "epoch": 2.484774665042631, "grad_norm": 0.24440337717533112, "learning_rate": 0.0002, "loss": 0.8824, "step": 170 }, { "epoch": 2.484774665042631, "eval_loss": 0.9033491611480713, "eval_runtime": 294.8116, "eval_samples_per_second": 5.566, "eval_steps_per_second": 0.699, "step": 170 }, { "epoch": 2.630937880633374, "grad_norm": 0.19349223375320435, "learning_rate": 0.0002, "loss": 0.8334, "step": 180 }, { "epoch": 2.630937880633374, "eval_loss": 0.8958781361579895, "eval_runtime": 294.9602, "eval_samples_per_second": 5.563, "eval_steps_per_second": 0.698, "step": 180 }, { "epoch": 2.630937880633374, "step": 180, "total_flos": 3.658397647269151e+17, "train_loss": 0.9498361190160115, "train_runtime": 17857.7741, "train_samples_per_second": 5.145, "train_steps_per_second": 0.027 } ], "logging_steps": 10, "max_steps": 476, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 10, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.658397647269151e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }