{ "best_metric": 0.08136745542287827, "best_model_checkpoint": "model/checkpoint-8500", "epoch": 2.903997266826102, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17, "grad_norm": 1.9025481939315796, "learning_rate": 9.430588771210569e-06, "loss": 8.0821, "step": 500 }, { "epoch": 0.17, "eval_loss": 0.15024334192276, "eval_runtime": 39.7831, "eval_samples_per_second": 61.961, "eval_steps_per_second": 12.392, "step": 500 }, { "epoch": 0.34, "grad_norm": 0.37238627672195435, "learning_rate": 8.861177542421137e-06, "loss": 0.1353, "step": 1000 }, { "epoch": 0.34, "eval_loss": 0.09424333274364471, "eval_runtime": 39.7674, "eval_samples_per_second": 61.986, "eval_steps_per_second": 12.397, "step": 1000 }, { "epoch": 0.51, "grad_norm": 0.28026077151298523, "learning_rate": 8.291766313631705e-06, "loss": 0.1056, "step": 1500 }, { "epoch": 0.51, "eval_loss": 0.08929836004972458, "eval_runtime": 39.8141, "eval_samples_per_second": 61.913, "eval_steps_per_second": 12.383, "step": 1500 }, { "epoch": 0.68, "grad_norm": 0.242831751704216, "learning_rate": 7.722355084842274e-06, "loss": 0.0994, "step": 2000 }, { "epoch": 0.68, "eval_loss": 0.08687467128038406, "eval_runtime": 39.8819, "eval_samples_per_second": 61.808, "eval_steps_per_second": 12.362, "step": 2000 }, { "epoch": 0.85, "grad_norm": 0.31068557500839233, "learning_rate": 7.152943856052842e-06, "loss": 0.0957, "step": 2500 }, { "epoch": 0.85, "eval_loss": 0.08578581362962723, "eval_runtime": 39.8992, "eval_samples_per_second": 61.781, "eval_steps_per_second": 12.356, "step": 2500 }, { "epoch": 1.02, "grad_norm": 0.1952580064535141, "learning_rate": 6.583532627263411e-06, "loss": 0.0951, "step": 3000 }, { "epoch": 1.02, "eval_loss": 0.0845487117767334, "eval_runtime": 39.7723, "eval_samples_per_second": 61.978, "eval_steps_per_second": 12.396, "step": 3000 }, { "epoch": 1.2, "grad_norm": 0.1910097897052765, "learning_rate": 6.014121398473979e-06, "loss": 0.0923, "step": 3500 }, { "epoch": 1.2, "eval_loss": 0.08382081985473633, "eval_runtime": 39.606, "eval_samples_per_second": 62.238, "eval_steps_per_second": 12.448, "step": 3500 }, { "epoch": 1.37, "grad_norm": 0.18350045382976532, "learning_rate": 5.444710169684546e-06, "loss": 0.0899, "step": 4000 }, { "epoch": 1.37, "eval_loss": 0.08325745165348053, "eval_runtime": 39.6287, "eval_samples_per_second": 62.202, "eval_steps_per_second": 12.44, "step": 4000 }, { "epoch": 1.54, "grad_norm": 0.2158266305923462, "learning_rate": 4.8752989408951145e-06, "loss": 0.0899, "step": 4500 }, { "epoch": 1.54, "eval_loss": 0.08282188326120377, "eval_runtime": 39.6573, "eval_samples_per_second": 62.158, "eval_steps_per_second": 12.432, "step": 4500 }, { "epoch": 1.71, "grad_norm": 0.2279905378818512, "learning_rate": 4.305887712105683e-06, "loss": 0.0894, "step": 5000 }, { "epoch": 1.71, "eval_loss": 0.08228254318237305, "eval_runtime": 39.8586, "eval_samples_per_second": 61.844, "eval_steps_per_second": 12.369, "step": 5000 }, { "epoch": 1.88, "grad_norm": 0.18786436319351196, "learning_rate": 3.7364764833162513e-06, "loss": 0.089, "step": 5500 }, { "epoch": 1.88, "eval_loss": 0.08236683160066605, "eval_runtime": 39.8662, "eval_samples_per_second": 61.832, "eval_steps_per_second": 12.366, "step": 5500 }, { "epoch": 2.05, "grad_norm": 0.20143625140190125, "learning_rate": 3.1670652545268194e-06, "loss": 0.0882, "step": 6000 }, { "epoch": 2.05, "eval_loss": 0.08215593546628952, "eval_runtime": 39.8891, "eval_samples_per_second": 61.796, "eval_steps_per_second": 12.359, "step": 6000 }, { "epoch": 2.22, "grad_norm": 0.3449649512767792, "learning_rate": 2.5976540257373876e-06, "loss": 0.0876, "step": 6500 }, { "epoch": 2.22, "eval_loss": 0.08190125972032547, "eval_runtime": 39.9241, "eval_samples_per_second": 61.742, "eval_steps_per_second": 12.348, "step": 6500 }, { "epoch": 2.39, "grad_norm": 0.17690658569335938, "learning_rate": 2.028242796947956e-06, "loss": 0.0876, "step": 7000 }, { "epoch": 2.39, "eval_loss": 0.08178862929344177, "eval_runtime": 39.8989, "eval_samples_per_second": 61.781, "eval_steps_per_second": 12.356, "step": 7000 }, { "epoch": 2.56, "grad_norm": 0.16777564585208893, "learning_rate": 1.4588315681585242e-06, "loss": 0.0874, "step": 7500 }, { "epoch": 2.56, "eval_loss": 0.08147595077753067, "eval_runtime": 39.629, "eval_samples_per_second": 62.202, "eval_steps_per_second": 12.44, "step": 7500 }, { "epoch": 2.73, "grad_norm": 0.2061876803636551, "learning_rate": 8.894203393690924e-07, "loss": 0.088, "step": 8000 }, { "epoch": 2.73, "eval_loss": 0.08150825649499893, "eval_runtime": 39.6669, "eval_samples_per_second": 62.142, "eval_steps_per_second": 12.428, "step": 8000 }, { "epoch": 2.9, "grad_norm": 0.21220079064369202, "learning_rate": 3.2000911057966066e-07, "loss": 0.0871, "step": 8500 }, { "epoch": 2.9, "eval_loss": 0.08136745542287827, "eval_runtime": 39.796, "eval_samples_per_second": 61.941, "eval_steps_per_second": 12.388, "step": 8500 } ], "logging_steps": 500, "max_steps": 8781, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.7281294636544e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }