{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 942,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03184713375796178,
      "grad_norm": 5.6323702767298744,
      "learning_rate": 5e-06,
      "loss": 0.942,
      "step": 10
    },
    {
      "epoch": 0.06369426751592357,
      "grad_norm": 1.8772154794170284,
      "learning_rate": 5e-06,
      "loss": 0.8332,
      "step": 20
    },
    {
      "epoch": 0.09554140127388536,
      "grad_norm": 1.15240428378361,
      "learning_rate": 5e-06,
      "loss": 0.7857,
      "step": 30
    },
    {
      "epoch": 0.12738853503184713,
      "grad_norm": 0.9723964053330802,
      "learning_rate": 5e-06,
      "loss": 0.7718,
      "step": 40
    },
    {
      "epoch": 0.1592356687898089,
      "grad_norm": 1.180485085299224,
      "learning_rate": 5e-06,
      "loss": 0.7492,
      "step": 50
    },
    {
      "epoch": 0.1910828025477707,
      "grad_norm": 1.047903477889757,
      "learning_rate": 5e-06,
      "loss": 0.7413,
      "step": 60
    },
    {
      "epoch": 0.2229299363057325,
      "grad_norm": 0.8341131630541828,
      "learning_rate": 5e-06,
      "loss": 0.719,
      "step": 70
    },
    {
      "epoch": 0.25477707006369427,
      "grad_norm": 0.9122278937918772,
      "learning_rate": 5e-06,
      "loss": 0.7184,
      "step": 80
    },
    {
      "epoch": 0.28662420382165604,
      "grad_norm": 0.7073501886373434,
      "learning_rate": 5e-06,
      "loss": 0.7121,
      "step": 90
    },
    {
      "epoch": 0.3184713375796178,
      "grad_norm": 0.8281770199094491,
      "learning_rate": 5e-06,
      "loss": 0.7092,
      "step": 100
    },
    {
      "epoch": 0.3503184713375796,
      "grad_norm": 0.7210347968056159,
      "learning_rate": 5e-06,
      "loss": 0.7013,
      "step": 110
    },
    {
      "epoch": 0.3821656050955414,
      "grad_norm": 0.5738125544201067,
      "learning_rate": 5e-06,
      "loss": 0.7022,
      "step": 120
    },
    {
      "epoch": 0.4140127388535032,
      "grad_norm": 1.2444329803886918,
      "learning_rate": 5e-06,
      "loss": 0.7044,
      "step": 130
    },
    {
      "epoch": 0.445859872611465,
      "grad_norm": 0.75470283252076,
      "learning_rate": 5e-06,
      "loss": 0.692,
      "step": 140
    },
    {
      "epoch": 0.47770700636942676,
      "grad_norm": 0.6449167354505037,
      "learning_rate": 5e-06,
      "loss": 0.7023,
      "step": 150
    },
    {
      "epoch": 0.5095541401273885,
      "grad_norm": 0.5910288139996942,
      "learning_rate": 5e-06,
      "loss": 0.6944,
      "step": 160
    },
    {
      "epoch": 0.5414012738853503,
      "grad_norm": 0.5604434405342383,
      "learning_rate": 5e-06,
      "loss": 0.6966,
      "step": 170
    },
    {
      "epoch": 0.5732484076433121,
      "grad_norm": 0.6654582315660812,
      "learning_rate": 5e-06,
      "loss": 0.6876,
      "step": 180
    },
    {
      "epoch": 0.6050955414012739,
      "grad_norm": 0.9166199094846909,
      "learning_rate": 5e-06,
      "loss": 0.6891,
      "step": 190
    },
    {
      "epoch": 0.6369426751592356,
      "grad_norm": 0.7178723052852772,
      "learning_rate": 5e-06,
      "loss": 0.6832,
      "step": 200
    },
    {
      "epoch": 0.6687898089171974,
      "grad_norm": 0.6541344411840553,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 210
    },
    {
      "epoch": 0.7006369426751592,
      "grad_norm": 0.637825014720494,
      "learning_rate": 5e-06,
      "loss": 0.6768,
      "step": 220
    },
    {
      "epoch": 0.732484076433121,
      "grad_norm": 0.5549500561748851,
      "learning_rate": 5e-06,
      "loss": 0.6825,
      "step": 230
    },
    {
      "epoch": 0.7643312101910829,
      "grad_norm": 0.654612026735291,
      "learning_rate": 5e-06,
      "loss": 0.6806,
      "step": 240
    },
    {
      "epoch": 0.7961783439490446,
      "grad_norm": 0.5538249059296838,
      "learning_rate": 5e-06,
      "loss": 0.6848,
      "step": 250
    },
    {
      "epoch": 0.8280254777070064,
      "grad_norm": 0.5723698853730642,
      "learning_rate": 5e-06,
      "loss": 0.6842,
      "step": 260
    },
    {
      "epoch": 0.8598726114649682,
      "grad_norm": 0.5768848558596881,
      "learning_rate": 5e-06,
      "loss": 0.6826,
      "step": 270
    },
    {
      "epoch": 0.89171974522293,
      "grad_norm": 0.531302038623778,
      "learning_rate": 5e-06,
      "loss": 0.6812,
      "step": 280
    },
    {
      "epoch": 0.9235668789808917,
      "grad_norm": 0.543420146899659,
      "learning_rate": 5e-06,
      "loss": 0.6774,
      "step": 290
    },
    {
      "epoch": 0.9554140127388535,
      "grad_norm": 0.509668170466935,
      "learning_rate": 5e-06,
      "loss": 0.6757,
      "step": 300
    },
    {
      "epoch": 0.9872611464968153,
      "grad_norm": 0.5426815942872757,
      "learning_rate": 5e-06,
      "loss": 0.6774,
      "step": 310
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6760347485542297,
      "eval_runtime": 30.8722,
      "eval_samples_per_second": 273.871,
      "eval_steps_per_second": 1.101,
      "step": 314
    },
    {
      "epoch": 1.019108280254777,
      "grad_norm": 0.7477023983107832,
      "learning_rate": 5e-06,
      "loss": 0.6476,
      "step": 320
    },
    {
      "epoch": 1.0509554140127388,
      "grad_norm": 0.6149216256910138,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 330
    },
    {
      "epoch": 1.0828025477707006,
      "grad_norm": 0.7013272885905804,
      "learning_rate": 5e-06,
      "loss": 0.6285,
      "step": 340
    },
    {
      "epoch": 1.1146496815286624,
      "grad_norm": 0.6645252684100534,
      "learning_rate": 5e-06,
      "loss": 0.6293,
      "step": 350
    },
    {
      "epoch": 1.1464968152866242,
      "grad_norm": 0.6691516868475139,
      "learning_rate": 5e-06,
      "loss": 0.624,
      "step": 360
    },
    {
      "epoch": 1.178343949044586,
      "grad_norm": 0.6277574945118575,
      "learning_rate": 5e-06,
      "loss": 0.628,
      "step": 370
    },
    {
      "epoch": 1.2101910828025477,
      "grad_norm": 0.5594126038913051,
      "learning_rate": 5e-06,
      "loss": 0.6293,
      "step": 380
    },
    {
      "epoch": 1.2420382165605095,
      "grad_norm": 0.6377404973712193,
      "learning_rate": 5e-06,
      "loss": 0.6272,
      "step": 390
    },
    {
      "epoch": 1.2738853503184713,
      "grad_norm": 0.5512534318917951,
      "learning_rate": 5e-06,
      "loss": 0.6314,
      "step": 400
    },
    {
      "epoch": 1.305732484076433,
      "grad_norm": 0.6708898369555326,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 410
    },
    {
      "epoch": 1.3375796178343948,
      "grad_norm": 0.662681777058966,
      "learning_rate": 5e-06,
      "loss": 0.6322,
      "step": 420
    },
    {
      "epoch": 1.3694267515923566,
      "grad_norm": 0.5855564523983591,
      "learning_rate": 5e-06,
      "loss": 0.6293,
      "step": 430
    },
    {
      "epoch": 1.4012738853503186,
      "grad_norm": 0.7027168380973281,
      "learning_rate": 5e-06,
      "loss": 0.6231,
      "step": 440
    },
    {
      "epoch": 1.4331210191082802,
      "grad_norm": 0.5875190666099424,
      "learning_rate": 5e-06,
      "loss": 0.6308,
      "step": 450
    },
    {
      "epoch": 1.4649681528662422,
      "grad_norm": 0.5695855789396209,
      "learning_rate": 5e-06,
      "loss": 0.6316,
      "step": 460
    },
    {
      "epoch": 1.4968152866242037,
      "grad_norm": 0.5640229543248884,
      "learning_rate": 5e-06,
      "loss": 0.6216,
      "step": 470
    },
    {
      "epoch": 1.5286624203821657,
      "grad_norm": 0.5284708915831005,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 480
    },
    {
      "epoch": 1.5605095541401273,
      "grad_norm": 0.6601370586932329,
      "learning_rate": 5e-06,
      "loss": 0.6211,
      "step": 490
    },
    {
      "epoch": 1.5923566878980893,
      "grad_norm": 0.5141465072879543,
      "learning_rate": 5e-06,
      "loss": 0.6263,
      "step": 500
    },
    {
      "epoch": 1.6242038216560508,
      "grad_norm": 0.7951970752704245,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 510
    },
    {
      "epoch": 1.6560509554140128,
      "grad_norm": 0.6915600915890423,
      "learning_rate": 5e-06,
      "loss": 0.6296,
      "step": 520
    },
    {
      "epoch": 1.6878980891719744,
      "grad_norm": 0.5352783895102649,
      "learning_rate": 5e-06,
      "loss": 0.6243,
      "step": 530
    },
    {
      "epoch": 1.7197452229299364,
      "grad_norm": 0.6389694940943629,
      "learning_rate": 5e-06,
      "loss": 0.6262,
      "step": 540
    },
    {
      "epoch": 1.7515923566878981,
      "grad_norm": 0.638658826251017,
      "learning_rate": 5e-06,
      "loss": 0.6233,
      "step": 550
    },
    {
      "epoch": 1.78343949044586,
      "grad_norm": 0.5164381003657269,
      "learning_rate": 5e-06,
      "loss": 0.6325,
      "step": 560
    },
    {
      "epoch": 1.8152866242038217,
      "grad_norm": 0.5462044960453822,
      "learning_rate": 5e-06,
      "loss": 0.6262,
      "step": 570
    },
    {
      "epoch": 1.8471337579617835,
      "grad_norm": 0.5320271319641363,
      "learning_rate": 5e-06,
      "loss": 0.6238,
      "step": 580
    },
    {
      "epoch": 1.8789808917197452,
      "grad_norm": 0.6783189674697148,
      "learning_rate": 5e-06,
      "loss": 0.6195,
      "step": 590
    },
    {
      "epoch": 1.910828025477707,
      "grad_norm": 0.6838086842362504,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 600
    },
    {
      "epoch": 1.9426751592356688,
      "grad_norm": 0.6094514581891239,
      "learning_rate": 5e-06,
      "loss": 0.6282,
      "step": 610
    },
    {
      "epoch": 1.9745222929936306,
      "grad_norm": 0.627522298883166,
      "learning_rate": 5e-06,
      "loss": 0.6192,
      "step": 620
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6656389832496643,
      "eval_runtime": 31.0662,
      "eval_samples_per_second": 272.161,
      "eval_steps_per_second": 1.094,
      "step": 628
    },
    {
      "epoch": 2.0063694267515926,
      "grad_norm": 0.9573075915273472,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 630
    },
    {
      "epoch": 2.038216560509554,
      "grad_norm": 0.7301893995079715,
      "learning_rate": 5e-06,
      "loss": 0.5735,
      "step": 640
    },
    {
      "epoch": 2.070063694267516,
      "grad_norm": 0.7084282739053801,
      "learning_rate": 5e-06,
      "loss": 0.5778,
      "step": 650
    },
    {
      "epoch": 2.1019108280254777,
      "grad_norm": 0.6585976344607456,
      "learning_rate": 5e-06,
      "loss": 0.5784,
      "step": 660
    },
    {
      "epoch": 2.1337579617834397,
      "grad_norm": 0.6540062522321651,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 670
    },
    {
      "epoch": 2.1656050955414012,
      "grad_norm": 0.5639591666878391,
      "learning_rate": 5e-06,
      "loss": 0.5766,
      "step": 680
    },
    {
      "epoch": 2.1974522292993632,
      "grad_norm": 0.8212954305221217,
      "learning_rate": 5e-06,
      "loss": 0.5798,
      "step": 690
    },
    {
      "epoch": 2.229299363057325,
      "grad_norm": 0.8109275454168715,
      "learning_rate": 5e-06,
      "loss": 0.5767,
      "step": 700
    },
    {
      "epoch": 2.261146496815287,
      "grad_norm": 0.6107672621726716,
      "learning_rate": 5e-06,
      "loss": 0.5773,
      "step": 710
    },
    {
      "epoch": 2.2929936305732483,
      "grad_norm": 0.6385570175836712,
      "learning_rate": 5e-06,
      "loss": 0.579,
      "step": 720
    },
    {
      "epoch": 2.3248407643312103,
      "grad_norm": 0.5907370356401401,
      "learning_rate": 5e-06,
      "loss": 0.5796,
      "step": 730
    },
    {
      "epoch": 2.356687898089172,
      "grad_norm": 0.5656820007312235,
      "learning_rate": 5e-06,
      "loss": 0.5748,
      "step": 740
    },
    {
      "epoch": 2.388535031847134,
      "grad_norm": 0.5760709804095724,
      "learning_rate": 5e-06,
      "loss": 0.5834,
      "step": 750
    },
    {
      "epoch": 2.4203821656050954,
      "grad_norm": 0.6305043111751713,
      "learning_rate": 5e-06,
      "loss": 0.5758,
      "step": 760
    },
    {
      "epoch": 2.4522292993630574,
      "grad_norm": 0.5901331102554143,
      "learning_rate": 5e-06,
      "loss": 0.5856,
      "step": 770
    },
    {
      "epoch": 2.484076433121019,
      "grad_norm": 0.6608011253619817,
      "learning_rate": 5e-06,
      "loss": 0.5786,
      "step": 780
    },
    {
      "epoch": 2.515923566878981,
      "grad_norm": 0.6413426727699342,
      "learning_rate": 5e-06,
      "loss": 0.577,
      "step": 790
    },
    {
      "epoch": 2.5477707006369426,
      "grad_norm": 0.5719942317033819,
      "learning_rate": 5e-06,
      "loss": 0.581,
      "step": 800
    },
    {
      "epoch": 2.5796178343949046,
      "grad_norm": 0.5846298214850376,
      "learning_rate": 5e-06,
      "loss": 0.5809,
      "step": 810
    },
    {
      "epoch": 2.611464968152866,
      "grad_norm": 0.6682316534474163,
      "learning_rate": 5e-06,
      "loss": 0.5845,
      "step": 820
    },
    {
      "epoch": 2.643312101910828,
      "grad_norm": 0.5948936776108811,
      "learning_rate": 5e-06,
      "loss": 0.5837,
      "step": 830
    },
    {
      "epoch": 2.6751592356687897,
      "grad_norm": 0.8151554129031496,
      "learning_rate": 5e-06,
      "loss": 0.5871,
      "step": 840
    },
    {
      "epoch": 2.7070063694267517,
      "grad_norm": 0.5916261196390682,
      "learning_rate": 5e-06,
      "loss": 0.5797,
      "step": 850
    },
    {
      "epoch": 2.738853503184713,
      "grad_norm": 0.6442969800604952,
      "learning_rate": 5e-06,
      "loss": 0.5814,
      "step": 860
    },
    {
      "epoch": 2.770700636942675,
      "grad_norm": 0.6420974104025357,
      "learning_rate": 5e-06,
      "loss": 0.585,
      "step": 870
    },
    {
      "epoch": 2.802547770700637,
      "grad_norm": 0.6318903979436119,
      "learning_rate": 5e-06,
      "loss": 0.5813,
      "step": 880
    },
    {
      "epoch": 2.8343949044585988,
      "grad_norm": 0.6105841622731739,
      "learning_rate": 5e-06,
      "loss": 0.5823,
      "step": 890
    },
    {
      "epoch": 2.8662420382165603,
      "grad_norm": 0.5727955806455897,
      "learning_rate": 5e-06,
      "loss": 0.5799,
      "step": 900
    },
    {
      "epoch": 2.8980891719745223,
      "grad_norm": 0.5970248698087671,
      "learning_rate": 5e-06,
      "loss": 0.583,
      "step": 910
    },
    {
      "epoch": 2.9299363057324843,
      "grad_norm": 0.6176747095322584,
      "learning_rate": 5e-06,
      "loss": 0.5816,
      "step": 920
    },
    {
      "epoch": 2.961783439490446,
      "grad_norm": 0.550436105577002,
      "learning_rate": 5e-06,
      "loss": 0.585,
      "step": 930
    },
    {
      "epoch": 2.9936305732484074,
      "grad_norm": 0.6334063259900834,
      "learning_rate": 5e-06,
      "loss": 0.5826,
      "step": 940
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6691574454307556,
      "eval_runtime": 30.8039,
      "eval_samples_per_second": 274.478,
      "eval_steps_per_second": 1.104,
      "step": 942
    },
    {
      "epoch": 3.0,
      "step": 942,
      "total_flos": 1577885085204480.0,
      "train_loss": 0.6401269947141093,
      "train_runtime": 5912.9311,
      "train_samples_per_second": 81.505,
      "train_steps_per_second": 0.159
    }
  ],
  "logging_steps": 10,
  "max_steps": 942,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1577885085204480.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}