{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 2028, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "grad_norm": 1.6387559175491333, "learning_rate": 2.4630541871921184e-05, "loss": 1.3637, "step": 25 }, { "epoch": 0.15, "grad_norm": 1.9441598653793335, "learning_rate": 4.926108374384237e-05, "loss": 1.2291, "step": 50 }, { "epoch": 0.22, "grad_norm": 2.4013047218322754, "learning_rate": 7.389162561576355e-05, "loss": 1.1296, "step": 75 }, { "epoch": 0.3, "grad_norm": 2.915468215942383, "learning_rate": 9.852216748768474e-05, "loss": 1.0603, "step": 100 }, { "epoch": 0.37, "grad_norm": 2.2165613174438477, "learning_rate": 0.00012315270935960593, "loss": 0.9742, "step": 125 }, { "epoch": 0.44, "grad_norm": 2.0160369873046875, "learning_rate": 0.0001477832512315271, "loss": 1.0095, "step": 150 }, { "epoch": 0.52, "grad_norm": 1.840472936630249, "learning_rate": 0.00017241379310344826, "loss": 0.9758, "step": 175 }, { "epoch": 0.59, "grad_norm": 1.7259827852249146, "learning_rate": 0.00019704433497536947, "loss": 0.9755, "step": 200 }, { "epoch": 0.67, "grad_norm": 1.8914766311645508, "learning_rate": 0.00019758904109589044, "loss": 1.0167, "step": 225 }, { "epoch": 0.74, "grad_norm": 1.5285598039627075, "learning_rate": 0.00019484931506849316, "loss": 0.9205, "step": 250 }, { "epoch": 0.81, "grad_norm": 1.6174041032791138, "learning_rate": 0.00019210958904109592, "loss": 0.9171, "step": 275 }, { "epoch": 0.89, "grad_norm": 1.6485415697097778, "learning_rate": 0.00018936986301369864, "loss": 0.9214, "step": 300 }, { "epoch": 0.96, "grad_norm": 1.6646627187728882, "learning_rate": 0.0001866301369863014, "loss": 0.9161, "step": 325 }, { "epoch": 1.04, "grad_norm": 1.58356511592865, "learning_rate": 0.00018400000000000003, "loss": 0.8968, "step": 350 }, { "epoch": 1.11, "grad_norm": 1.722945213317871, "learning_rate": 0.00018126027397260275, "loss": 0.8365, "step": 375 }, { "epoch": 1.18, "grad_norm": 1.9991916418075562, "learning_rate": 0.0001785205479452055, "loss": 0.8648, "step": 400 }, { "epoch": 1.26, "grad_norm": 3.0022475719451904, "learning_rate": 0.00017578082191780823, "loss": 0.8409, "step": 425 }, { "epoch": 1.33, "grad_norm": 1.5056065320968628, "learning_rate": 0.00017304109589041098, "loss": 0.7653, "step": 450 }, { "epoch": 1.41, "grad_norm": 1.5764682292938232, "learning_rate": 0.0001703013698630137, "loss": 0.8132, "step": 475 }, { "epoch": 1.48, "grad_norm": 1.6800600290298462, "learning_rate": 0.00016756164383561646, "loss": 0.8424, "step": 500 }, { "epoch": 1.55, "grad_norm": 1.5748885869979858, "learning_rate": 0.00016482191780821918, "loss": 0.8225, "step": 525 }, { "epoch": 1.63, "grad_norm": 1.8514370918273926, "learning_rate": 0.00016208219178082193, "loss": 0.782, "step": 550 }, { "epoch": 1.7, "grad_norm": 1.6620662212371826, "learning_rate": 0.00015934246575342466, "loss": 0.8057, "step": 575 }, { "epoch": 1.78, "grad_norm": 1.5210013389587402, "learning_rate": 0.0001566027397260274, "loss": 0.8187, "step": 600 }, { "epoch": 1.85, "grad_norm": 1.831774353981018, "learning_rate": 0.00015386301369863013, "loss": 0.8243, "step": 625 }, { "epoch": 1.92, "grad_norm": 1.578306794166565, "learning_rate": 0.0001511232876712329, "loss": 0.8339, "step": 650 }, { "epoch": 2.0, "grad_norm": 1.6106500625610352, "learning_rate": 0.0001483835616438356, "loss": 0.8359, "step": 675 }, { "epoch": 2.07, "grad_norm": 2.1241354942321777, "learning_rate": 0.00014564383561643836, "loss": 0.711, "step": 700 }, { "epoch": 2.14, "grad_norm": 2.0171968936920166, "learning_rate": 0.00014290410958904112, "loss": 0.6632, "step": 725 }, { "epoch": 2.22, "grad_norm": 1.9910348653793335, "learning_rate": 0.00014016438356164384, "loss": 0.6418, "step": 750 }, { "epoch": 2.29, "grad_norm": 2.025341749191284, "learning_rate": 0.0001374246575342466, "loss": 0.673, "step": 775 }, { "epoch": 2.37, "grad_norm": 1.8271619081497192, "learning_rate": 0.00013468493150684934, "loss": 0.6392, "step": 800 }, { "epoch": 2.44, "grad_norm": 2.3863108158111572, "learning_rate": 0.00013194520547945207, "loss": 0.6239, "step": 825 }, { "epoch": 2.51, "grad_norm": 2.1700851917266846, "learning_rate": 0.00012920547945205482, "loss": 0.6683, "step": 850 }, { "epoch": 2.59, "grad_norm": 1.662989616394043, "learning_rate": 0.00012646575342465755, "loss": 0.6566, "step": 875 }, { "epoch": 2.66, "grad_norm": 2.041293144226074, "learning_rate": 0.00012372602739726027, "loss": 0.6954, "step": 900 }, { "epoch": 2.74, "grad_norm": 1.8853856325149536, "learning_rate": 0.00012098630136986302, "loss": 0.6824, "step": 925 }, { "epoch": 2.81, "grad_norm": 2.2064335346221924, "learning_rate": 0.00011824657534246575, "loss": 0.638, "step": 950 }, { "epoch": 2.88, "grad_norm": 2.4988508224487305, "learning_rate": 0.0001155068493150685, "loss": 0.6586, "step": 975 }, { "epoch": 2.96, "grad_norm": 1.8918757438659668, "learning_rate": 0.00011276712328767122, "loss": 0.6704, "step": 1000 }, { "epoch": 3.03, "grad_norm": 2.117297410964966, "learning_rate": 0.00011002739726027397, "loss": 0.5681, "step": 1025 }, { "epoch": 3.11, "grad_norm": 1.9822639226913452, "learning_rate": 0.00010728767123287671, "loss": 0.507, "step": 1050 }, { "epoch": 3.18, "grad_norm": 2.404207706451416, "learning_rate": 0.00010454794520547946, "loss": 0.5017, "step": 1075 }, { "epoch": 3.25, "grad_norm": 2.122673273086548, "learning_rate": 0.00010180821917808219, "loss": 0.4803, "step": 1100 }, { "epoch": 3.33, "grad_norm": 2.590771436691284, "learning_rate": 9.906849315068494e-05, "loss": 0.5102, "step": 1125 }, { "epoch": 3.4, "grad_norm": 2.141523838043213, "learning_rate": 9.632876712328768e-05, "loss": 0.5011, "step": 1150 }, { "epoch": 3.48, "grad_norm": 2.4209084510803223, "learning_rate": 9.358904109589042e-05, "loss": 0.5235, "step": 1175 }, { "epoch": 3.55, "grad_norm": 2.436467170715332, "learning_rate": 9.084931506849316e-05, "loss": 0.5421, "step": 1200 }, { "epoch": 3.62, "grad_norm": 2.52020001411438, "learning_rate": 8.81095890410959e-05, "loss": 0.5186, "step": 1225 }, { "epoch": 3.7, "grad_norm": 2.1507699489593506, "learning_rate": 8.536986301369863e-05, "loss": 0.5135, "step": 1250 }, { "epoch": 3.77, "grad_norm": 2.4909422397613525, "learning_rate": 8.263013698630137e-05, "loss": 0.5494, "step": 1275 }, { "epoch": 3.85, "grad_norm": 2.0679502487182617, "learning_rate": 7.989041095890411e-05, "loss": 0.4808, "step": 1300 }, { "epoch": 3.92, "grad_norm": 2.8978271484375, "learning_rate": 7.715068493150686e-05, "loss": 0.5453, "step": 1325 }, { "epoch": 3.99, "grad_norm": 2.301755428314209, "learning_rate": 7.44109589041096e-05, "loss": 0.4995, "step": 1350 }, { "epoch": 4.07, "grad_norm": 2.578835964202881, "learning_rate": 7.167123287671234e-05, "loss": 0.3984, "step": 1375 }, { "epoch": 4.14, "grad_norm": 2.900747299194336, "learning_rate": 6.893150684931508e-05, "loss": 0.3675, "step": 1400 }, { "epoch": 4.22, "grad_norm": 3.3370769023895264, "learning_rate": 6.619178082191781e-05, "loss": 0.3662, "step": 1425 }, { "epoch": 4.29, "grad_norm": 2.989133834838867, "learning_rate": 6.345205479452055e-05, "loss": 0.3756, "step": 1450 }, { "epoch": 4.36, "grad_norm": 2.7858242988586426, "learning_rate": 6.071232876712329e-05, "loss": 0.3816, "step": 1475 }, { "epoch": 4.44, "grad_norm": 2.5152714252471924, "learning_rate": 5.7972602739726036e-05, "loss": 0.3561, "step": 1500 }, { "epoch": 4.51, "grad_norm": 2.771308660507202, "learning_rate": 5.5232876712328775e-05, "loss": 0.3699, "step": 1525 }, { "epoch": 4.59, "grad_norm": 3.1365509033203125, "learning_rate": 5.249315068493151e-05, "loss": 0.3843, "step": 1550 }, { "epoch": 4.66, "grad_norm": 3.3356099128723145, "learning_rate": 4.9753424657534244e-05, "loss": 0.419, "step": 1575 }, { "epoch": 4.73, "grad_norm": 3.6290457248687744, "learning_rate": 4.701369863013699e-05, "loss": 0.3759, "step": 1600 }, { "epoch": 4.81, "grad_norm": 2.340773344039917, "learning_rate": 4.427397260273973e-05, "loss": 0.3608, "step": 1625 }, { "epoch": 4.88, "grad_norm": 2.5572738647460938, "learning_rate": 4.1534246575342466e-05, "loss": 0.3827, "step": 1650 }, { "epoch": 4.96, "grad_norm": 2.8399617671966553, "learning_rate": 3.8794520547945204e-05, "loss": 0.3946, "step": 1675 }, { "epoch": 5.03, "grad_norm": 2.2290821075439453, "learning_rate": 3.605479452054795e-05, "loss": 0.3155, "step": 1700 }, { "epoch": 5.1, "grad_norm": 2.295093536376953, "learning_rate": 3.331506849315069e-05, "loss": 0.2648, "step": 1725 }, { "epoch": 5.18, "grad_norm": 2.754384756088257, "learning_rate": 3.0575342465753426e-05, "loss": 0.2921, "step": 1750 }, { "epoch": 5.25, "grad_norm": 3.0329742431640625, "learning_rate": 2.7835616438356164e-05, "loss": 0.2801, "step": 1775 }, { "epoch": 5.33, "grad_norm": 2.5985870361328125, "learning_rate": 2.5095890410958906e-05, "loss": 0.262, "step": 1800 }, { "epoch": 5.4, "grad_norm": 3.2596547603607178, "learning_rate": 2.2356164383561644e-05, "loss": 0.2742, "step": 1825 }, { "epoch": 5.47, "grad_norm": 2.7161920070648193, "learning_rate": 1.9616438356164386e-05, "loss": 0.2609, "step": 1850 }, { "epoch": 5.55, "grad_norm": 2.671105146408081, "learning_rate": 1.6876712328767124e-05, "loss": 0.2906, "step": 1875 }, { "epoch": 5.62, "grad_norm": 2.5027403831481934, "learning_rate": 1.4136986301369864e-05, "loss": 0.2491, "step": 1900 }, { "epoch": 5.7, "grad_norm": 3.0660693645477295, "learning_rate": 1.1397260273972603e-05, "loss": 0.2869, "step": 1925 }, { "epoch": 5.77, "grad_norm": 2.5826847553253174, "learning_rate": 8.657534246575343e-06, "loss": 0.2624, "step": 1950 }, { "epoch": 5.84, "grad_norm": 2.695078134536743, "learning_rate": 5.9178082191780825e-06, "loss": 0.2477, "step": 1975 }, { "epoch": 5.92, "grad_norm": 2.729051113128662, "learning_rate": 3.178082191780822e-06, "loss": 0.2862, "step": 2000 }, { "epoch": 5.99, "grad_norm": 3.2778682708740234, "learning_rate": 4.3835616438356164e-07, "loss": 0.2722, "step": 2025 } ], "logging_steps": 25, "max_steps": 2028, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "total_flos": 8.868397672562688e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }