{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2675227394328518, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 7.3019086777975994, "learning_rate": 5e-06, "loss": 0.6939, "step": 10 }, { "epoch": 0.01, "grad_norm": 5.150989333266983, "learning_rate": 1e-05, "loss": 0.7167, "step": 20 }, { "epoch": 0.02, "grad_norm": 3.640558809610037, "learning_rate": 1.5e-05, "loss": 0.5683, "step": 30 }, { "epoch": 0.02, "grad_norm": 7.517999397731128, "learning_rate": 2e-05, "loss": 0.5472, "step": 40 }, { "epoch": 0.03, "grad_norm": 1.9687061679463425, "learning_rate": 2.5e-05, "loss": 0.4439, "step": 50 }, { "epoch": 0.03, "grad_norm": 3.643479206523606, "learning_rate": 3e-05, "loss": 0.2486, "step": 60 }, { "epoch": 0.04, "grad_norm": 2.2754773308695095, "learning_rate": 3.5e-05, "loss": 0.2217, "step": 70 }, { "epoch": 0.04, "grad_norm": 1.7144730049127388, "learning_rate": 4e-05, "loss": 0.169, "step": 80 }, { "epoch": 0.05, "grad_norm": 3.4702829704135114, "learning_rate": 4.5e-05, "loss": 0.1994, "step": 90 }, { "epoch": 0.05, "grad_norm": 1.3985127340985621, "learning_rate": 5e-05, "loss": 0.1612, "step": 100 }, { "epoch": 0.06, "grad_norm": 1.375992184386137, "learning_rate": 4.982758620689655e-05, "loss": 0.1576, "step": 110 }, { "epoch": 0.06, "grad_norm": 1.9528635753013313, "learning_rate": 4.9655172413793107e-05, "loss": 0.1393, "step": 120 }, { "epoch": 0.07, "grad_norm": 4.075169010198401, "learning_rate": 4.9482758620689655e-05, "loss": 0.1969, "step": 130 }, { "epoch": 0.07, "grad_norm": 2.0953991165751207, "learning_rate": 4.931034482758621e-05, "loss": 0.1294, "step": 140 }, { "epoch": 0.08, "grad_norm": 1.942660591044849, "learning_rate": 4.913793103448276e-05, "loss": 0.1306, "step": 150 }, { "epoch": 0.09, "grad_norm": 3.1508904015728345, "learning_rate": 4.896551724137931e-05, "loss": 0.1526, "step": 160 }, { "epoch": 0.09, "grad_norm": 1.9862795165358471, "learning_rate": 4.8793103448275864e-05, "loss": 0.1186, "step": 170 }, { "epoch": 0.1, "grad_norm": 2.633061833817991, "learning_rate": 4.862068965517241e-05, "loss": 0.1457, "step": 180 }, { "epoch": 0.1, "grad_norm": 1.8017052368446178, "learning_rate": 4.844827586206897e-05, "loss": 0.1234, "step": 190 }, { "epoch": 0.11, "grad_norm": 2.1560694100709803, "learning_rate": 4.827586206896552e-05, "loss": 0.1346, "step": 200 }, { "epoch": 0.11, "grad_norm": 1.5737689267430703, "learning_rate": 4.810344827586207e-05, "loss": 0.116, "step": 210 }, { "epoch": 0.12, "grad_norm": 1.957864677854788, "learning_rate": 4.793103448275863e-05, "loss": 0.1692, "step": 220 }, { "epoch": 0.12, "grad_norm": 2.215039223521855, "learning_rate": 4.7758620689655176e-05, "loss": 0.1245, "step": 230 }, { "epoch": 0.13, "grad_norm": 1.370517239734168, "learning_rate": 4.7586206896551725e-05, "loss": 0.1476, "step": 240 }, { "epoch": 0.13, "grad_norm": 1.7341334563022532, "learning_rate": 4.741379310344828e-05, "loss": 0.1236, "step": 250 }, { "epoch": 0.14, "grad_norm": 1.5994298113068974, "learning_rate": 4.724137931034483e-05, "loss": 0.1161, "step": 260 }, { "epoch": 0.14, "grad_norm": 1.5317433190951963, "learning_rate": 4.7068965517241385e-05, "loss": 0.1035, "step": 270 }, { "epoch": 0.15, "grad_norm": 2.191977732539556, "learning_rate": 4.689655172413793e-05, "loss": 0.1427, "step": 280 }, { "epoch": 0.16, "grad_norm": 1.6038667570691656, "learning_rate": 4.672413793103448e-05, "loss": 0.1225, "step": 290 }, { "epoch": 0.16, "grad_norm": 2.577572731831179, "learning_rate": 4.655172413793104e-05, "loss": 0.1399, "step": 300 }, { "epoch": 0.17, "grad_norm": 1.6199241001441385, "learning_rate": 4.6379310344827586e-05, "loss": 0.1242, "step": 310 }, { "epoch": 0.17, "grad_norm": 2.236577821186196, "learning_rate": 4.6206896551724135e-05, "loss": 0.1656, "step": 320 }, { "epoch": 0.18, "grad_norm": 1.7294690605254757, "learning_rate": 4.603448275862069e-05, "loss": 0.1382, "step": 330 }, { "epoch": 0.18, "grad_norm": 2.196527516378511, "learning_rate": 4.586206896551724e-05, "loss": 0.1257, "step": 340 }, { "epoch": 0.19, "grad_norm": 2.1057444340221463, "learning_rate": 4.5689655172413794e-05, "loss": 0.1238, "step": 350 }, { "epoch": 0.19, "grad_norm": 1.5409556870328274, "learning_rate": 4.551724137931035e-05, "loss": 0.1383, "step": 360 }, { "epoch": 0.2, "grad_norm": 1.5204083616874053, "learning_rate": 4.53448275862069e-05, "loss": 0.1068, "step": 370 }, { "epoch": 0.2, "grad_norm": 2.3557725298931746, "learning_rate": 4.5172413793103454e-05, "loss": 0.1071, "step": 380 }, { "epoch": 0.21, "grad_norm": 3.2601538460418644, "learning_rate": 4.5e-05, "loss": 0.125, "step": 390 }, { "epoch": 0.21, "grad_norm": 1.9031725385762286, "learning_rate": 4.482758620689655e-05, "loss": 0.0991, "step": 400 }, { "epoch": 0.22, "grad_norm": 1.3946050262183123, "learning_rate": 4.465517241379311e-05, "loss": 0.1156, "step": 410 }, { "epoch": 0.22, "grad_norm": 1.097644875106397, "learning_rate": 4.4482758620689656e-05, "loss": 0.1366, "step": 420 }, { "epoch": 0.23, "grad_norm": 1.37846299019108, "learning_rate": 4.431034482758621e-05, "loss": 0.126, "step": 430 }, { "epoch": 0.24, "grad_norm": 1.8340152889320331, "learning_rate": 4.413793103448276e-05, "loss": 0.1066, "step": 440 }, { "epoch": 0.24, "grad_norm": 1.8304505611337867, "learning_rate": 4.396551724137931e-05, "loss": 0.0868, "step": 450 }, { "epoch": 0.25, "grad_norm": 1.550196490898523, "learning_rate": 4.3793103448275864e-05, "loss": 0.1286, "step": 460 }, { "epoch": 0.25, "grad_norm": 2.176112247796248, "learning_rate": 4.362068965517241e-05, "loss": 0.1206, "step": 470 }, { "epoch": 0.26, "grad_norm": 1.6589263894091213, "learning_rate": 4.344827586206897e-05, "loss": 0.1008, "step": 480 }, { "epoch": 0.26, "grad_norm": 1.8349611508902046, "learning_rate": 4.327586206896552e-05, "loss": 0.1198, "step": 490 }, { "epoch": 0.27, "grad_norm": 2.1218964920724126, "learning_rate": 4.3103448275862066e-05, "loss": 0.1166, "step": 500 }, { "epoch": 0.27, "eval_loss": 0.6078919172286987, "eval_runtime": 116.8471, "eval_samples_per_second": 11.288, "eval_steps_per_second": 2.824, "step": 500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 14449508352000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }