{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03184713375796178, "grad_norm": 5.6323702767298744, "learning_rate": 5e-06, "loss": 0.942, "step": 10 }, { "epoch": 0.06369426751592357, "grad_norm": 1.8772154794170284, "learning_rate": 5e-06, "loss": 0.8332, "step": 20 }, { "epoch": 0.09554140127388536, "grad_norm": 1.15240428378361, "learning_rate": 5e-06, "loss": 0.7857, "step": 30 }, { "epoch": 0.12738853503184713, "grad_norm": 0.9723964053330802, "learning_rate": 5e-06, "loss": 0.7718, "step": 40 }, { "epoch": 0.1592356687898089, "grad_norm": 1.180485085299224, "learning_rate": 5e-06, "loss": 0.7492, "step": 50 }, { "epoch": 0.1910828025477707, "grad_norm": 1.047903477889757, "learning_rate": 5e-06, "loss": 0.7413, "step": 60 }, { "epoch": 0.2229299363057325, "grad_norm": 0.8341131630541828, "learning_rate": 5e-06, "loss": 0.719, "step": 70 }, { "epoch": 0.25477707006369427, "grad_norm": 0.9122278937918772, "learning_rate": 5e-06, "loss": 0.7184, "step": 80 }, { "epoch": 0.28662420382165604, "grad_norm": 0.7073501886373434, "learning_rate": 5e-06, "loss": 0.7121, "step": 90 }, { "epoch": 0.3184713375796178, "grad_norm": 0.8281770199094491, "learning_rate": 5e-06, "loss": 0.7092, "step": 100 }, { "epoch": 0.3503184713375796, "grad_norm": 0.7210347968056159, "learning_rate": 5e-06, "loss": 0.7013, "step": 110 }, { "epoch": 0.3821656050955414, "grad_norm": 0.5738125544201067, "learning_rate": 5e-06, "loss": 0.7022, "step": 120 }, { "epoch": 0.4140127388535032, "grad_norm": 1.2444329803886918, "learning_rate": 5e-06, "loss": 0.7044, "step": 130 }, { "epoch": 0.445859872611465, "grad_norm": 0.75470283252076, "learning_rate": 5e-06, "loss": 0.692, "step": 140 }, { "epoch": 0.47770700636942676, "grad_norm": 0.6449167354505037, "learning_rate": 5e-06, "loss": 0.7023, "step": 150 }, { "epoch": 0.5095541401273885, "grad_norm": 0.5910288139996942, "learning_rate": 5e-06, "loss": 0.6944, "step": 160 }, { "epoch": 0.5414012738853503, "grad_norm": 0.5604434405342383, "learning_rate": 5e-06, "loss": 0.6966, "step": 170 }, { "epoch": 0.5732484076433121, "grad_norm": 0.6654582315660812, "learning_rate": 5e-06, "loss": 0.6876, "step": 180 }, { "epoch": 0.6050955414012739, "grad_norm": 0.9166199094846909, "learning_rate": 5e-06, "loss": 0.6891, "step": 190 }, { "epoch": 0.6369426751592356, "grad_norm": 0.7178723052852772, "learning_rate": 5e-06, "loss": 0.6832, "step": 200 }, { "epoch": 0.6687898089171974, "grad_norm": 0.6541344411840553, "learning_rate": 5e-06, "loss": 0.6803, "step": 210 }, { "epoch": 0.7006369426751592, "grad_norm": 0.637825014720494, "learning_rate": 5e-06, "loss": 0.6768, "step": 220 }, { "epoch": 0.732484076433121, "grad_norm": 0.5549500561748851, "learning_rate": 5e-06, "loss": 0.6825, "step": 230 }, { "epoch": 0.7643312101910829, "grad_norm": 0.654612026735291, "learning_rate": 5e-06, "loss": 0.6806, "step": 240 }, { "epoch": 0.7961783439490446, "grad_norm": 0.5538249059296838, "learning_rate": 5e-06, "loss": 0.6848, "step": 250 }, { "epoch": 0.8280254777070064, "grad_norm": 0.5723698853730642, "learning_rate": 5e-06, "loss": 0.6842, "step": 260 }, { "epoch": 0.8598726114649682, "grad_norm": 0.5768848558596881, "learning_rate": 5e-06, "loss": 0.6826, "step": 270 }, { "epoch": 0.89171974522293, "grad_norm": 0.531302038623778, "learning_rate": 5e-06, "loss": 0.6812, "step": 280 }, { "epoch": 
0.9235668789808917, "grad_norm": 0.543420146899659, "learning_rate": 5e-06, "loss": 0.6774, "step": 290 }, { "epoch": 0.9554140127388535, "grad_norm": 0.509668170466935, "learning_rate": 5e-06, "loss": 0.6757, "step": 300 }, { "epoch": 0.9872611464968153, "grad_norm": 0.5426815942872757, "learning_rate": 5e-06, "loss": 0.6774, "step": 310 }, { "epoch": 1.0, "eval_loss": 0.6760347485542297, "eval_runtime": 30.8722, "eval_samples_per_second": 273.871, "eval_steps_per_second": 1.101, "step": 314 }, { "epoch": 1.019108280254777, "grad_norm": 0.7477023983107832, "learning_rate": 5e-06, "loss": 0.6476, "step": 320 }, { "epoch": 1.0509554140127388, "grad_norm": 0.6149216256910138, "learning_rate": 5e-06, "loss": 0.6341, "step": 330 }, { "epoch": 1.0828025477707006, "grad_norm": 0.7013272885905804, "learning_rate": 5e-06, "loss": 0.6285, "step": 340 }, { "epoch": 1.1146496815286624, "grad_norm": 0.6645252684100534, "learning_rate": 5e-06, "loss": 0.6293, "step": 350 }, { "epoch": 1.1464968152866242, "grad_norm": 0.6691516868475139, "learning_rate": 5e-06, "loss": 0.624, "step": 360 }, { "epoch": 1.178343949044586, "grad_norm": 0.6277574945118575, "learning_rate": 5e-06, "loss": 0.628, "step": 370 }, { "epoch": 1.2101910828025477, "grad_norm": 0.5594126038913051, "learning_rate": 5e-06, "loss": 0.6293, "step": 380 }, { "epoch": 1.2420382165605095, "grad_norm": 0.6377404973712193, "learning_rate": 5e-06, "loss": 0.6272, "step": 390 }, { "epoch": 1.2738853503184713, "grad_norm": 0.5512534318917951, "learning_rate": 5e-06, "loss": 0.6314, "step": 400 }, { "epoch": 1.305732484076433, "grad_norm": 0.6708898369555326, "learning_rate": 5e-06, "loss": 0.6309, "step": 410 }, { "epoch": 1.3375796178343948, "grad_norm": 0.662681777058966, "learning_rate": 5e-06, "loss": 0.6322, "step": 420 }, { "epoch": 1.3694267515923566, "grad_norm": 0.5855564523983591, "learning_rate": 5e-06, "loss": 0.6293, "step": 430 }, { "epoch": 1.4012738853503186, "grad_norm": 0.7027168380973281, "learning_rate": 5e-06, "loss": 0.6231, "step": 440 }, { "epoch": 1.4331210191082802, "grad_norm": 0.5875190666099424, "learning_rate": 5e-06, "loss": 0.6308, "step": 450 }, { "epoch": 1.4649681528662422, "grad_norm": 0.5695855789396209, "learning_rate": 5e-06, "loss": 0.6316, "step": 460 }, { "epoch": 1.4968152866242037, "grad_norm": 0.5640229543248884, "learning_rate": 5e-06, "loss": 0.6216, "step": 470 }, { "epoch": 1.5286624203821657, "grad_norm": 0.5284708915831005, "learning_rate": 5e-06, "loss": 0.634, "step": 480 }, { "epoch": 1.5605095541401273, "grad_norm": 0.6601370586932329, "learning_rate": 5e-06, "loss": 0.6211, "step": 490 }, { "epoch": 1.5923566878980893, "grad_norm": 0.5141465072879543, "learning_rate": 5e-06, "loss": 0.6263, "step": 500 }, { "epoch": 1.6242038216560508, "grad_norm": 0.7951970752704245, "learning_rate": 5e-06, "loss": 0.6283, "step": 510 }, { "epoch": 1.6560509554140128, "grad_norm": 0.6915600915890423, "learning_rate": 5e-06, "loss": 0.6296, "step": 520 }, { "epoch": 1.6878980891719744, "grad_norm": 0.5352783895102649, "learning_rate": 5e-06, "loss": 0.6243, "step": 530 }, { "epoch": 1.7197452229299364, "grad_norm": 0.6389694940943629, "learning_rate": 5e-06, "loss": 0.6262, "step": 540 }, { "epoch": 1.7515923566878981, "grad_norm": 0.638658826251017, "learning_rate": 5e-06, "loss": 0.6233, "step": 550 }, { "epoch": 1.78343949044586, "grad_norm": 0.5164381003657269, "learning_rate": 5e-06, "loss": 0.6325, "step": 560 }, { "epoch": 1.8152866242038217, "grad_norm": 0.5462044960453822, "learning_rate": 5e-06, 
"loss": 0.6262, "step": 570 }, { "epoch": 1.8471337579617835, "grad_norm": 0.5320271319641363, "learning_rate": 5e-06, "loss": 0.6238, "step": 580 }, { "epoch": 1.8789808917197452, "grad_norm": 0.6783189674697148, "learning_rate": 5e-06, "loss": 0.6195, "step": 590 }, { "epoch": 1.910828025477707, "grad_norm": 0.6838086842362504, "learning_rate": 5e-06, "loss": 0.6242, "step": 600 }, { "epoch": 1.9426751592356688, "grad_norm": 0.6094514581891239, "learning_rate": 5e-06, "loss": 0.6282, "step": 610 }, { "epoch": 1.9745222929936306, "grad_norm": 0.627522298883166, "learning_rate": 5e-06, "loss": 0.6192, "step": 620 }, { "epoch": 2.0, "eval_loss": 0.6656389832496643, "eval_runtime": 31.0662, "eval_samples_per_second": 272.161, "eval_steps_per_second": 1.094, "step": 628 }, { "epoch": 2.0063694267515926, "grad_norm": 0.9573075915273472, "learning_rate": 5e-06, "loss": 0.6157, "step": 630 }, { "epoch": 2.038216560509554, "grad_norm": 0.7301893995079715, "learning_rate": 5e-06, "loss": 0.5735, "step": 640 }, { "epoch": 2.070063694267516, "grad_norm": 0.7084282739053801, "learning_rate": 5e-06, "loss": 0.5778, "step": 650 }, { "epoch": 2.1019108280254777, "grad_norm": 0.6585976344607456, "learning_rate": 5e-06, "loss": 0.5784, "step": 660 }, { "epoch": 2.1337579617834397, "grad_norm": 0.6540062522321651, "learning_rate": 5e-06, "loss": 0.5751, "step": 670 }, { "epoch": 2.1656050955414012, "grad_norm": 0.5639591666878391, "learning_rate": 5e-06, "loss": 0.5766, "step": 680 }, { "epoch": 2.1974522292993632, "grad_norm": 0.8212954305221217, "learning_rate": 5e-06, "loss": 0.5798, "step": 690 }, { "epoch": 2.229299363057325, "grad_norm": 0.8109275454168715, "learning_rate": 5e-06, "loss": 0.5767, "step": 700 }, { "epoch": 2.261146496815287, "grad_norm": 0.6107672621726716, "learning_rate": 5e-06, "loss": 0.5773, "step": 710 }, { "epoch": 2.2929936305732483, "grad_norm": 0.6385570175836712, "learning_rate": 5e-06, "loss": 0.579, "step": 720 }, { "epoch": 2.3248407643312103, "grad_norm": 0.5907370356401401, "learning_rate": 5e-06, "loss": 0.5796, "step": 730 }, { "epoch": 2.356687898089172, "grad_norm": 0.5656820007312235, "learning_rate": 5e-06, "loss": 0.5748, "step": 740 }, { "epoch": 2.388535031847134, "grad_norm": 0.5760709804095724, "learning_rate": 5e-06, "loss": 0.5834, "step": 750 }, { "epoch": 2.4203821656050954, "grad_norm": 0.6305043111751713, "learning_rate": 5e-06, "loss": 0.5758, "step": 760 }, { "epoch": 2.4522292993630574, "grad_norm": 0.5901331102554143, "learning_rate": 5e-06, "loss": 0.5856, "step": 770 }, { "epoch": 2.484076433121019, "grad_norm": 0.6608011253619817, "learning_rate": 5e-06, "loss": 0.5786, "step": 780 }, { "epoch": 2.515923566878981, "grad_norm": 0.6413426727699342, "learning_rate": 5e-06, "loss": 0.577, "step": 790 }, { "epoch": 2.5477707006369426, "grad_norm": 0.5719942317033819, "learning_rate": 5e-06, "loss": 0.581, "step": 800 }, { "epoch": 2.5796178343949046, "grad_norm": 0.5846298214850376, "learning_rate": 5e-06, "loss": 0.5809, "step": 810 }, { "epoch": 2.611464968152866, "grad_norm": 0.6682316534474163, "learning_rate": 5e-06, "loss": 0.5845, "step": 820 }, { "epoch": 2.643312101910828, "grad_norm": 0.5948936776108811, "learning_rate": 5e-06, "loss": 0.5837, "step": 830 }, { "epoch": 2.6751592356687897, "grad_norm": 0.8151554129031496, "learning_rate": 5e-06, "loss": 0.5871, "step": 840 }, { "epoch": 2.7070063694267517, "grad_norm": 0.5916261196390682, "learning_rate": 5e-06, "loss": 0.5797, "step": 850 }, { "epoch": 2.738853503184713, "grad_norm": 
0.6442969800604952, "learning_rate": 5e-06, "loss": 0.5814, "step": 860 }, { "epoch": 2.770700636942675, "grad_norm": 0.6420974104025357, "learning_rate": 5e-06, "loss": 0.585, "step": 870 }, { "epoch": 2.802547770700637, "grad_norm": 0.6318903979436119, "learning_rate": 5e-06, "loss": 0.5813, "step": 880 }, { "epoch": 2.8343949044585988, "grad_norm": 0.6105841622731739, "learning_rate": 5e-06, "loss": 0.5823, "step": 890 }, { "epoch": 2.8662420382165603, "grad_norm": 0.5727955806455897, "learning_rate": 5e-06, "loss": 0.5799, "step": 900 }, { "epoch": 2.8980891719745223, "grad_norm": 0.5970248698087671, "learning_rate": 5e-06, "loss": 0.583, "step": 910 }, { "epoch": 2.9299363057324843, "grad_norm": 0.6176747095322584, "learning_rate": 5e-06, "loss": 0.5816, "step": 920 }, { "epoch": 2.961783439490446, "grad_norm": 0.550436105577002, "learning_rate": 5e-06, "loss": 0.585, "step": 930 }, { "epoch": 2.9936305732484074, "grad_norm": 0.6334063259900834, "learning_rate": 5e-06, "loss": 0.5826, "step": 940 }, { "epoch": 3.0, "eval_loss": 0.6691574454307556, "eval_runtime": 30.8039, "eval_samples_per_second": 274.478, "eval_steps_per_second": 1.104, "step": 942 }, { "epoch": 3.0, "step": 942, "total_flos": 1577885085204480.0, "train_loss": 0.6401269947141093, "train_runtime": 5912.9311, "train_samples_per_second": 81.505, "train_steps_per_second": 0.159 } ], "logging_steps": 10, "max_steps": 942, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1577885085204480.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }
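
A note on the record above: it matches the schema of a Hugging Face Trainer state dump (trainer_state.json), with per-step training records and end-of-epoch eval records collected under log_history. The sketch below is one minimal way to inspect it; the file path and the matplotlib plotting choices are illustrative assumptions, not part of the log itself.

# Minimal sketch: load the state dump above and plot its loss curves.
# Assumptions (not part of the original log): the JSON is saved locally as
# "trainer_state.json", and matplotlib is available for plotting.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training records carry "loss"; end-of-epoch records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e and "eval_loss" not in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train_logs], [e["loss"] for e in train_logs], label="train loss")
plt.plot([e["step"] for e in eval_logs], [e["eval_loss"] for e in eval_logs], "o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.tight_layout()
plt.show()

With the values logged above, the plot would show training loss falling from roughly 0.94 at step 10 to about 0.58 by step 940, while eval loss reaches its minimum (~0.666) at epoch 2 and ticks back up to ~0.669 at epoch 3, which reads as a mild sign of overfitting in the final epoch.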