{ "best_metric": null, "best_model_checkpoint": null, "epoch": 21.224489795918366, "eval_steps": 20, "global_step": 260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.82, "grad_norm": 0.18377472460269928, "learning_rate": 2.9999999999999997e-05, "loss": 1.861, "step": 10 }, { "epoch": 1.63, "grad_norm": 0.35202744603157043, "learning_rate": 5.9999999999999995e-05, "loss": 1.7263, "step": 20 }, { "epoch": 1.63, "eval_loss": 1.4457755088806152, "eval_runtime": 89.8938, "eval_samples_per_second": 4.305, "eval_steps_per_second": 0.545, "step": 20 }, { "epoch": 2.45, "grad_norm": 0.928983747959137, "learning_rate": 8.999999999999999e-05, "loss": 1.1718, "step": 30 }, { "epoch": 3.27, "grad_norm": 0.253262996673584, "learning_rate": 0.00011999999999999999, "loss": 0.4789, "step": 40 }, { "epoch": 3.27, "eval_loss": 0.3332095146179199, "eval_runtime": 89.9804, "eval_samples_per_second": 4.301, "eval_steps_per_second": 0.545, "step": 40 }, { "epoch": 4.08, "grad_norm": 0.12236642092466354, "learning_rate": 0.00015, "loss": 0.3568, "step": 50 }, { "epoch": 4.9, "grad_norm": 0.09160923212766647, "learning_rate": 0.00017999999999999998, "loss": 0.3256, "step": 60 }, { "epoch": 4.9, "eval_loss": 0.2753114104270935, "eval_runtime": 90.3206, "eval_samples_per_second": 4.285, "eval_steps_per_second": 0.543, "step": 60 }, { "epoch": 5.71, "grad_norm": 0.10242326557636261, "learning_rate": 0.00020999999999999998, "loss": 0.2841, "step": 70 }, { "epoch": 6.53, "grad_norm": 0.1305350810289383, "learning_rate": 0.00023999999999999998, "loss": 0.2615, "step": 80 }, { "epoch": 6.53, "eval_loss": 0.2476309835910797, "eval_runtime": 90.525, "eval_samples_per_second": 4.275, "eval_steps_per_second": 0.541, "step": 80 }, { "epoch": 7.35, "grad_norm": 0.17941106855869293, "learning_rate": 0.00027, "loss": 0.2216, "step": 90 }, { "epoch": 8.16, "grad_norm": 0.20095375180244446, "learning_rate": 0.0003, "loss": 0.1832, "step": 100 }, { "epoch": 8.16, "eval_loss": 0.2350914180278778, "eval_runtime": 90.3919, "eval_samples_per_second": 4.281, "eval_steps_per_second": 0.542, "step": 100 }, { "epoch": 8.98, "grad_norm": 0.2600422501564026, "learning_rate": 0.00029, "loss": 0.1441, "step": 110 }, { "epoch": 9.8, "grad_norm": 0.20544037222862244, "learning_rate": 0.00028, "loss": 0.1186, "step": 120 }, { "epoch": 9.8, "eval_loss": 0.23090216517448425, "eval_runtime": 90.3144, "eval_samples_per_second": 4.285, "eval_steps_per_second": 0.543, "step": 120 }, { "epoch": 10.61, "grad_norm": 0.2158157229423523, "learning_rate": 0.00027, "loss": 0.0947, "step": 130 }, { "epoch": 11.43, "grad_norm": 0.18916285037994385, "learning_rate": 0.00026, "loss": 0.0768, "step": 140 }, { "epoch": 11.43, "eval_loss": 0.24214179813861847, "eval_runtime": 90.2597, "eval_samples_per_second": 4.288, "eval_steps_per_second": 0.543, "step": 140 }, { "epoch": 12.24, "grad_norm": 0.22263498604297638, "learning_rate": 0.00025, "loss": 0.0615, "step": 150 }, { "epoch": 13.06, "grad_norm": 0.21315976977348328, "learning_rate": 0.00023999999999999998, "loss": 0.054, "step": 160 }, { "epoch": 13.06, "eval_loss": 0.25932466983795166, "eval_runtime": 89.8439, "eval_samples_per_second": 4.307, "eval_steps_per_second": 0.545, "step": 160 }, { "epoch": 13.88, "grad_norm": 0.18338361382484436, "learning_rate": 0.00023, "loss": 0.0455, "step": 170 }, { "epoch": 14.69, "grad_norm": 0.17157459259033203, "learning_rate": 0.00021999999999999995, "loss": 0.0393, "step": 180 }, { "epoch": 14.69, "eval_loss": 0.27233538031578064, "eval_runtime": 90.1364, "eval_samples_per_second": 4.293, "eval_steps_per_second": 0.544, "step": 180 }, { "epoch": 15.51, "grad_norm": 0.1541435867547989, "learning_rate": 0.00020999999999999998, "loss": 0.0352, "step": 190 }, { "epoch": 16.33, "grad_norm": 0.1553652435541153, "learning_rate": 0.00019999999999999998, "loss": 0.0325, "step": 200 }, { "epoch": 16.33, "eval_loss": 0.28704825043678284, "eval_runtime": 89.7951, "eval_samples_per_second": 4.31, "eval_steps_per_second": 0.546, "step": 200 }, { "epoch": 17.14, "grad_norm": 0.13403691351413727, "learning_rate": 0.00018999999999999998, "loss": 0.0297, "step": 210 }, { "epoch": 17.96, "grad_norm": 0.14512716233730316, "learning_rate": 0.00017999999999999998, "loss": 0.0279, "step": 220 }, { "epoch": 17.96, "eval_loss": 0.2964874505996704, "eval_runtime": 89.7009, "eval_samples_per_second": 4.314, "eval_steps_per_second": 0.546, "step": 220 }, { "epoch": 18.78, "grad_norm": 0.12400835007429123, "learning_rate": 0.00016999999999999999, "loss": 0.0263, "step": 230 }, { "epoch": 19.59, "grad_norm": 0.1139909029006958, "learning_rate": 0.00015999999999999999, "loss": 0.0246, "step": 240 }, { "epoch": 19.59, "eval_loss": 0.30519917607307434, "eval_runtime": 89.8387, "eval_samples_per_second": 4.308, "eval_steps_per_second": 0.545, "step": 240 }, { "epoch": 20.41, "grad_norm": 0.12317101657390594, "learning_rate": 0.00015, "loss": 0.0235, "step": 250 }, { "epoch": 21.22, "grad_norm": 0.12494686245918274, "learning_rate": 0.00014, "loss": 0.0224, "step": 260 }, { "epoch": 21.22, "eval_loss": 0.314134418964386, "eval_runtime": 89.7974, "eval_samples_per_second": 4.31, "eval_steps_per_second": 0.546, "step": 260 } ], "logging_steps": 10, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 34, "save_steps": 20, "total_flos": 1.49113643104469e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }