{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 500,
  "global_step": 2028,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07,
      "grad_norm": 1.6387559175491333,
      "learning_rate": 2.4630541871921184e-05,
      "loss": 1.3637,
      "step": 25
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.9441598653793335,
      "learning_rate": 4.926108374384237e-05,
      "loss": 1.2291,
      "step": 50
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.4013047218322754,
      "learning_rate": 7.389162561576355e-05,
      "loss": 1.1296,
      "step": 75
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.915468215942383,
      "learning_rate": 9.852216748768474e-05,
      "loss": 1.0603,
      "step": 100
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.2165613174438477,
      "learning_rate": 0.00012315270935960593,
      "loss": 0.9742,
      "step": 125
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.0160369873046875,
      "learning_rate": 0.0001477832512315271,
      "loss": 1.0095,
      "step": 150
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.840472936630249,
      "learning_rate": 0.00017241379310344826,
      "loss": 0.9758,
      "step": 175
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.7259827852249146,
      "learning_rate": 0.00019704433497536947,
      "loss": 0.9755,
      "step": 200
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.8914766311645508,
      "learning_rate": 0.00019758904109589044,
      "loss": 1.0167,
      "step": 225
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.5285598039627075,
      "learning_rate": 0.00019484931506849316,
      "loss": 0.9205,
      "step": 250
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.6174041032791138,
      "learning_rate": 0.00019210958904109592,
      "loss": 0.9171,
      "step": 275
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.6485415697097778,
      "learning_rate": 0.00018936986301369864,
      "loss": 0.9214,
      "step": 300
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.6646627187728882,
      "learning_rate": 0.0001866301369863014,
      "loss": 0.9161,
      "step": 325
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.58356511592865,
      "learning_rate": 0.00018400000000000003,
      "loss": 0.8968,
      "step": 350
    },
    {
      "epoch": 1.11,
      "grad_norm": 1.722945213317871,
      "learning_rate": 0.00018126027397260275,
      "loss": 0.8365,
      "step": 375
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.9991916418075562,
      "learning_rate": 0.0001785205479452055,
      "loss": 0.8648,
      "step": 400
    },
    {
      "epoch": 1.26,
      "grad_norm": 3.0022475719451904,
      "learning_rate": 0.00017578082191780823,
      "loss": 0.8409,
      "step": 425
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.5056065320968628,
      "learning_rate": 0.00017304109589041098,
      "loss": 0.7653,
      "step": 450
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.5764682292938232,
      "learning_rate": 0.0001703013698630137,
      "loss": 0.8132,
      "step": 475
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.6800600290298462,
      "learning_rate": 0.00016756164383561646,
      "loss": 0.8424,
      "step": 500
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.5748885869979858,
      "learning_rate": 0.00016482191780821918,
      "loss": 0.8225,
      "step": 525
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.8514370918273926,
      "learning_rate": 0.00016208219178082193,
      "loss": 0.782,
      "step": 550
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.6620662212371826,
      "learning_rate": 0.00015934246575342466,
      "loss": 0.8057,
      "step": 575
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.5210013389587402,
      "learning_rate": 0.0001566027397260274,
      "loss": 0.8187,
      "step": 600
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.831774353981018,
      "learning_rate": 0.00015386301369863013,
      "loss": 0.8243,
      "step": 625
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.578306794166565,
      "learning_rate": 0.0001511232876712329,
      "loss": 0.8339,
      "step": 650
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.6106500625610352,
      "learning_rate": 0.0001483835616438356,
      "loss": 0.8359,
      "step": 675
    },
    {
      "epoch": 2.07,
      "grad_norm": 2.1241354942321777,
      "learning_rate": 0.00014564383561643836,
      "loss": 0.711,
      "step": 700
    },
    {
      "epoch": 2.14,
      "grad_norm": 2.0171968936920166,
      "learning_rate": 0.00014290410958904112,
      "loss": 0.6632,
      "step": 725
    },
    {
      "epoch": 2.22,
      "grad_norm": 1.9910348653793335,
      "learning_rate": 0.00014016438356164384,
      "loss": 0.6418,
      "step": 750
    },
    {
      "epoch": 2.29,
      "grad_norm": 2.025341749191284,
      "learning_rate": 0.0001374246575342466,
      "loss": 0.673,
      "step": 775
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.8271619081497192,
      "learning_rate": 0.00013468493150684934,
      "loss": 0.6392,
      "step": 800
    },
    {
      "epoch": 2.44,
      "grad_norm": 2.3863108158111572,
      "learning_rate": 0.00013194520547945207,
      "loss": 0.6239,
      "step": 825
    },
    {
      "epoch": 2.51,
      "grad_norm": 2.1700851917266846,
      "learning_rate": 0.00012920547945205482,
      "loss": 0.6683,
      "step": 850
    },
    {
      "epoch": 2.59,
      "grad_norm": 1.662989616394043,
      "learning_rate": 0.00012646575342465755,
      "loss": 0.6566,
      "step": 875
    },
    {
      "epoch": 2.66,
      "grad_norm": 2.041293144226074,
      "learning_rate": 0.00012372602739726027,
      "loss": 0.6954,
      "step": 900
    },
    {
      "epoch": 2.74,
      "grad_norm": 1.8853856325149536,
      "learning_rate": 0.00012098630136986302,
      "loss": 0.6824,
      "step": 925
    },
    {
      "epoch": 2.81,
      "grad_norm": 2.2064335346221924,
      "learning_rate": 0.00011824657534246575,
      "loss": 0.638,
      "step": 950
    },
    {
      "epoch": 2.88,
      "grad_norm": 2.4988508224487305,
      "learning_rate": 0.0001155068493150685,
      "loss": 0.6586,
      "step": 975
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.8918757438659668,
      "learning_rate": 0.00011276712328767122,
      "loss": 0.6704,
      "step": 1000
    },
    {
      "epoch": 3.03,
      "grad_norm": 2.117297410964966,
      "learning_rate": 0.00011002739726027397,
      "loss": 0.5681,
      "step": 1025
    },
    {
      "epoch": 3.11,
      "grad_norm": 1.9822639226913452,
      "learning_rate": 0.00010728767123287671,
      "loss": 0.507,
      "step": 1050
    },
    {
      "epoch": 3.18,
      "grad_norm": 2.404207706451416,
      "learning_rate": 0.00010454794520547946,
      "loss": 0.5017,
      "step": 1075
    },
    {
      "epoch": 3.25,
      "grad_norm": 2.122673273086548,
      "learning_rate": 0.00010180821917808219,
      "loss": 0.4803,
      "step": 1100
    },
    {
      "epoch": 3.33,
      "grad_norm": 2.590771436691284,
      "learning_rate": 9.906849315068494e-05,
      "loss": 0.5102,
      "step": 1125
    },
    {
      "epoch": 3.4,
      "grad_norm": 2.141523838043213,
      "learning_rate": 9.632876712328768e-05,
      "loss": 0.5011,
      "step": 1150
    },
    {
      "epoch": 3.48,
      "grad_norm": 2.4209084510803223,
      "learning_rate": 9.358904109589042e-05,
      "loss": 0.5235,
      "step": 1175
    },
    {
      "epoch": 3.55,
      "grad_norm": 2.436467170715332,
      "learning_rate": 9.084931506849316e-05,
      "loss": 0.5421,
      "step": 1200
    },
    {
      "epoch": 3.62,
      "grad_norm": 2.52020001411438,
      "learning_rate": 8.81095890410959e-05,
      "loss": 0.5186,
      "step": 1225
    },
    {
      "epoch": 3.7,
      "grad_norm": 2.1507699489593506,
      "learning_rate": 8.536986301369863e-05,
      "loss": 0.5135,
      "step": 1250
    },
    {
      "epoch": 3.77,
      "grad_norm": 2.4909422397613525,
      "learning_rate": 8.263013698630137e-05,
      "loss": 0.5494,
      "step": 1275
    },
    {
      "epoch": 3.85,
      "grad_norm": 2.0679502487182617,
      "learning_rate": 7.989041095890411e-05,
      "loss": 0.4808,
      "step": 1300
    },
    {
      "epoch": 3.92,
      "grad_norm": 2.8978271484375,
      "learning_rate": 7.715068493150686e-05,
      "loss": 0.5453,
      "step": 1325
    },
    {
      "epoch": 3.99,
      "grad_norm": 2.301755428314209,
      "learning_rate": 7.44109589041096e-05,
      "loss": 0.4995,
      "step": 1350
    },
    {
      "epoch": 4.07,
      "grad_norm": 2.578835964202881,
      "learning_rate": 7.167123287671234e-05,
      "loss": 0.3984,
      "step": 1375
    },
    {
      "epoch": 4.14,
      "grad_norm": 2.900747299194336,
      "learning_rate": 6.893150684931508e-05,
      "loss": 0.3675,
      "step": 1400
    },
    {
      "epoch": 4.22,
      "grad_norm": 3.3370769023895264,
      "learning_rate": 6.619178082191781e-05,
      "loss": 0.3662,
      "step": 1425
    },
    {
      "epoch": 4.29,
      "grad_norm": 2.989133834838867,
      "learning_rate": 6.345205479452055e-05,
      "loss": 0.3756,
      "step": 1450
    },
    {
      "epoch": 4.36,
      "grad_norm": 2.7858242988586426,
      "learning_rate": 6.071232876712329e-05,
      "loss": 0.3816,
      "step": 1475
    },
    {
      "epoch": 4.44,
      "grad_norm": 2.5152714252471924,
      "learning_rate": 5.7972602739726036e-05,
      "loss": 0.3561,
      "step": 1500
    },
    {
      "epoch": 4.51,
      "grad_norm": 2.771308660507202,
      "learning_rate": 5.5232876712328775e-05,
      "loss": 0.3699,
      "step": 1525
    },
    {
      "epoch": 4.59,
      "grad_norm": 3.1365509033203125,
      "learning_rate": 5.249315068493151e-05,
      "loss": 0.3843,
      "step": 1550
    },
    {
      "epoch": 4.66,
      "grad_norm": 3.3356099128723145,
      "learning_rate": 4.9753424657534244e-05,
      "loss": 0.419,
      "step": 1575
    },
    {
      "epoch": 4.73,
      "grad_norm": 3.6290457248687744,
      "learning_rate": 4.701369863013699e-05,
      "loss": 0.3759,
      "step": 1600
    },
    {
      "epoch": 4.81,
      "grad_norm": 2.340773344039917,
      "learning_rate": 4.427397260273973e-05,
      "loss": 0.3608,
      "step": 1625
    },
    {
      "epoch": 4.88,
      "grad_norm": 2.5572738647460938,
      "learning_rate": 4.1534246575342466e-05,
      "loss": 0.3827,
      "step": 1650
    },
    {
      "epoch": 4.96,
      "grad_norm": 2.8399617671966553,
      "learning_rate": 3.8794520547945204e-05,
      "loss": 0.3946,
      "step": 1675
    },
    {
      "epoch": 5.03,
      "grad_norm": 2.2290821075439453,
      "learning_rate": 3.605479452054795e-05,
      "loss": 0.3155,
      "step": 1700
    },
    {
      "epoch": 5.1,
      "grad_norm": 2.295093536376953,
      "learning_rate": 3.331506849315069e-05,
      "loss": 0.2648,
      "step": 1725
    },
    {
      "epoch": 5.18,
      "grad_norm": 2.754384756088257,
      "learning_rate": 3.0575342465753426e-05,
      "loss": 0.2921,
      "step": 1750
    },
    {
      "epoch": 5.25,
      "grad_norm": 3.0329742431640625,
      "learning_rate": 2.7835616438356164e-05,
      "loss": 0.2801,
      "step": 1775
    },
    {
      "epoch": 5.33,
      "grad_norm": 2.5985870361328125,
      "learning_rate": 2.5095890410958906e-05,
      "loss": 0.262,
      "step": 1800
    },
    {
      "epoch": 5.4,
      "grad_norm": 3.2596547603607178,
      "learning_rate": 2.2356164383561644e-05,
      "loss": 0.2742,
      "step": 1825
    },
    {
      "epoch": 5.47,
      "grad_norm": 2.7161920070648193,
      "learning_rate": 1.9616438356164386e-05,
      "loss": 0.2609,
      "step": 1850
    },
    {
      "epoch": 5.55,
      "grad_norm": 2.671105146408081,
      "learning_rate": 1.6876712328767124e-05,
      "loss": 0.2906,
      "step": 1875
    },
    {
      "epoch": 5.62,
      "grad_norm": 2.5027403831481934,
      "learning_rate": 1.4136986301369864e-05,
      "loss": 0.2491,
      "step": 1900
    },
    {
      "epoch": 5.7,
      "grad_norm": 3.0660693645477295,
      "learning_rate": 1.1397260273972603e-05,
      "loss": 0.2869,
      "step": 1925
    },
    {
      "epoch": 5.77,
      "grad_norm": 2.5826847553253174,
      "learning_rate": 8.657534246575343e-06,
      "loss": 0.2624,
      "step": 1950
    },
    {
      "epoch": 5.84,
      "grad_norm": 2.695078134536743,
      "learning_rate": 5.9178082191780825e-06,
      "loss": 0.2477,
      "step": 1975
    },
    {
      "epoch": 5.92,
      "grad_norm": 2.729051113128662,
      "learning_rate": 3.178082191780822e-06,
      "loss": 0.2862,
      "step": 2000
    },
    {
      "epoch": 5.99,
      "grad_norm": 3.2778682708740234,
      "learning_rate": 4.3835616438356164e-07,
      "loss": 0.2722,
      "step": 2025
    }
  ],
  "logging_steps": 25,
  "max_steps": 2028,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "total_flos": 8.868397672562688e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}