|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 808,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0024752475247524753,
      "grad_norm": 38.88609991114745,
      "learning_rate": 2.469135802469136e-07,
      "loss": 1.8604,
      "step": 1
    },
    {
      "epoch": 0.012376237623762377,
      "grad_norm": 41.27939914036261,
      "learning_rate": 1.234567901234568e-06,
      "loss": 1.8561,
      "step": 5
    },
    {
      "epoch": 0.024752475247524754,
      "grad_norm": 5.095317934955837,
      "learning_rate": 2.469135802469136e-06,
      "loss": 1.6195,
      "step": 10
    },
    {
      "epoch": 0.03712871287128713,
      "grad_norm": 2.761116338631256,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 1.3811,
      "step": 15
    },
    {
      "epoch": 0.04950495049504951,
      "grad_norm": 1.6927067748367093,
      "learning_rate": 4.938271604938272e-06,
      "loss": 1.1736,
      "step": 20
    },
    {
      "epoch": 0.06188118811881188,
      "grad_norm": 1.637997489475765,
      "learning_rate": 6.17283950617284e-06,
      "loss": 1.0247,
      "step": 25
    },
    {
      "epoch": 0.07425742574257425,
      "grad_norm": 0.9921453622709451,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 1.0275,
      "step": 30
    },
    {
      "epoch": 0.08663366336633663,
      "grad_norm": 0.9409275181475548,
      "learning_rate": 8.641975308641975e-06,
      "loss": 0.9832,
      "step": 35
    },
    {
      "epoch": 0.09900990099009901,
      "grad_norm": 1.0325275844641013,
      "learning_rate": 9.876543209876543e-06,
      "loss": 0.9685,
      "step": 40
    },
    {
      "epoch": 0.11138613861386139,
      "grad_norm": 1.1218773632200145,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.9528,
      "step": 45
    },
    {
      "epoch": 0.12376237623762376,
      "grad_norm": 0.9572180193224005,
      "learning_rate": 1.234567901234568e-05,
      "loss": 0.9201,
      "step": 50
    },
    {
      "epoch": 0.13613861386138615,
      "grad_norm": 1.0009650827934435,
      "learning_rate": 1.3580246913580248e-05,
      "loss": 0.9542,
      "step": 55
    },
    {
      "epoch": 0.1485148514851485,
      "grad_norm": 1.3065165374147407,
      "learning_rate": 1.4814814814814815e-05,
      "loss": 0.9437,
      "step": 60
    },
    {
      "epoch": 0.1608910891089109,
      "grad_norm": 1.0022123167697776,
      "learning_rate": 1.6049382716049385e-05,
      "loss": 0.9228,
      "step": 65
    },
    {
      "epoch": 0.17326732673267325,
      "grad_norm": 1.2184130543835685,
      "learning_rate": 1.728395061728395e-05,
      "loss": 0.9772,
      "step": 70
    },
    {
      "epoch": 0.18564356435643564,
      "grad_norm": 0.9408942641640691,
      "learning_rate": 1.851851851851852e-05,
      "loss": 0.9284,
      "step": 75
    },
    {
      "epoch": 0.19801980198019803,
      "grad_norm": 0.9712584145776969,
      "learning_rate": 1.9753086419753087e-05,
      "loss": 0.9484,
      "step": 80
    },
    {
      "epoch": 0.2103960396039604,
      "grad_norm": 0.9980192527761912,
      "learning_rate": 1.999850613931615e-05,
      "loss": 0.9536,
      "step": 85
    },
    {
      "epoch": 0.22277227722772278,
      "grad_norm": 0.9890748697192007,
      "learning_rate": 1.9992438095219886e-05,
      "loss": 0.9539,
      "step": 90
    },
    {
      "epoch": 0.23514851485148514,
      "grad_norm": 0.9155960993755162,
      "learning_rate": 1.9981705331953295e-05,
      "loss": 0.936,
      "step": 95
    },
    {
      "epoch": 0.24752475247524752,
      "grad_norm": 0.8051611539712368,
      "learning_rate": 1.996631285983779e-05,
      "loss": 0.9456,
      "step": 100
    },
    {
      "epoch": 0.2599009900990099,
      "grad_norm": 0.9230091255662991,
      "learning_rate": 1.9946267864463027e-05,
      "loss": 0.9136,
      "step": 105
    },
    {
      "epoch": 0.2722772277227723,
      "grad_norm": 0.8382334460112006,
      "learning_rate": 1.9921579703332475e-05,
      "loss": 0.93,
      "step": 110
    },
    {
      "epoch": 0.28465346534653463,
      "grad_norm": 0.8774240428827245,
      "learning_rate": 1.989225990149512e-05,
      "loss": 0.9245,
      "step": 115
    },
    {
      "epoch": 0.297029702970297,
      "grad_norm": 0.8987251464829349,
      "learning_rate": 1.9858322146165272e-05,
      "loss": 0.9341,
      "step": 120
    },
    {
      "epoch": 0.3094059405940594,
      "grad_norm": 0.9049306018652525,
      "learning_rate": 1.981978228033304e-05,
      "loss": 0.9257,
      "step": 125
    },
    {
      "epoch": 0.3217821782178218,
      "grad_norm": 0.9029524926074908,
      "learning_rate": 1.977665829536842e-05,
      "loss": 0.916,
      "step": 130
    },
    {
      "epoch": 0.3341584158415842,
      "grad_norm": 0.8591727043131795,
      "learning_rate": 1.9728970322622485e-05,
      "loss": 0.9168,
      "step": 135
    },
    {
      "epoch": 0.3465346534653465,
      "grad_norm": 0.8712548282993846,
      "learning_rate": 1.9676740624029566e-05,
      "loss": 0.9347,
      "step": 140
    },
    {
      "epoch": 0.3589108910891089,
      "grad_norm": 0.8442378198048982,
      "learning_rate": 1.961999358171482e-05,
      "loss": 0.9286,
      "step": 145
    },
    {
      "epoch": 0.3712871287128713,
      "grad_norm": 0.8448891617644472,
      "learning_rate": 1.955875568661206e-05,
      "loss": 0.9247,
      "step": 150
    },
    {
      "epoch": 0.38366336633663367,
      "grad_norm": 0.8506188708401377,
      "learning_rate": 1.94930555260971e-05,
      "loss": 0.9089,
      "step": 155
    },
    {
      "epoch": 0.39603960396039606,
      "grad_norm": 0.7615649650166222,
      "learning_rate": 1.9422923770642494e-05,
      "loss": 0.9121,
      "step": 160
    },
    {
      "epoch": 0.4084158415841584,
      "grad_norm": 0.8438076926475072,
      "learning_rate": 1.934839315949976e-05,
      "loss": 0.9133,
      "step": 165
    },
    {
      "epoch": 0.4207920792079208,
      "grad_norm": 0.9246754527016784,
      "learning_rate": 1.9269498485415897e-05,
      "loss": 0.9298,
      "step": 170
    },
    {
      "epoch": 0.43316831683168316,
      "grad_norm": 0.9627433039386685,
      "learning_rate": 1.9186276578391268e-05,
      "loss": 0.9421,
      "step": 175
    },
    {
      "epoch": 0.44554455445544555,
      "grad_norm": 0.8087976752191404,
      "learning_rate": 1.9098766288486426e-05,
      "loss": 0.9261,
      "step": 180
    },
    {
      "epoch": 0.45792079207920794,
      "grad_norm": 0.774521835464554,
      "learning_rate": 1.9007008467685947e-05,
      "loss": 0.9328,
      "step": 185
    },
    {
      "epoch": 0.47029702970297027,
      "grad_norm": 0.7725284205468596,
      "learning_rate": 1.8911045950827693e-05,
      "loss": 0.9093,
      "step": 190
    },
    {
      "epoch": 0.48267326732673266,
      "grad_norm": 0.8054030551821266,
      "learning_rate": 1.881092353560646e-05,
      "loss": 0.9013,
      "step": 195
    },
    {
      "epoch": 0.49504950495049505,
      "grad_norm": 0.7758400337225813,
      "learning_rate": 1.870668796166129e-05,
      "loss": 0.9142,
      "step": 200
    },
    {
      "epoch": 0.5074257425742574,
      "grad_norm": 0.8158556574805117,
      "learning_rate": 1.8598387888756224e-05,
      "loss": 0.9222,
      "step": 205
    },
    {
      "epoch": 0.5198019801980198,
      "grad_norm": 0.7595417385088049,
      "learning_rate": 1.8486073874064745e-05,
      "loss": 0.9061,
      "step": 210
    },
    {
      "epoch": 0.5321782178217822,
      "grad_norm": 0.7472799231884507,
      "learning_rate": 1.8369798348568403e-05,
      "loss": 0.9083,
      "step": 215
    },
    {
      "epoch": 0.5445544554455446,
      "grad_norm": 0.7606289538163604,
      "learning_rate": 1.8249615592580733e-05,
      "loss": 0.9328,
      "step": 220
    },
    {
      "epoch": 0.556930693069307,
      "grad_norm": 0.805186253739664,
      "learning_rate": 1.8125581710407864e-05,
      "loss": 0.9138,
      "step": 225
    },
    {
      "epoch": 0.5693069306930693,
      "grad_norm": 0.767139039892534,
      "learning_rate": 1.7997754604157607e-05,
      "loss": 0.9075,
      "step": 230
    },
    {
      "epoch": 0.5816831683168316,
      "grad_norm": 0.7674924368854081,
      "learning_rate": 1.786619394670933e-05,
      "loss": 0.9094,
      "step": 235
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 0.7416157345263308,
      "learning_rate": 1.7730961153857155e-05,
      "loss": 0.9146,
      "step": 240
    },
    {
      "epoch": 0.6064356435643564,
      "grad_norm": 0.7972512834290945,
      "learning_rate": 1.7592119355639545e-05,
      "loss": 0.8986,
      "step": 245
    },
    {
      "epoch": 0.6188118811881188,
      "grad_norm": 0.7413943937305464,
      "learning_rate": 1.744973336686862e-05,
      "loss": 0.9261,
      "step": 250
    },
    {
      "epoch": 0.6311881188118812,
      "grad_norm": 0.7288799636758897,
      "learning_rate": 1.7303869656872994e-05,
      "loss": 0.9004,
      "step": 255
    },
    {
      "epoch": 0.6435643564356436,
      "grad_norm": 0.7554557666231643,
      "learning_rate": 1.715459631846824e-05,
      "loss": 0.9097,
      "step": 260
    },
    {
      "epoch": 0.655940594059406,
      "grad_norm": 0.7267320540898269,
      "learning_rate": 1.700198303616944e-05,
      "loss": 0.902,
      "step": 265
    },
    {
      "epoch": 0.6683168316831684,
      "grad_norm": 0.6823040717527771,
      "learning_rate": 1.684610105366076e-05,
      "loss": 0.8913,
      "step": 270
    },
    {
      "epoch": 0.6806930693069307,
      "grad_norm": 0.6787438356187852,
      "learning_rate": 1.6687023140537082e-05,
      "loss": 0.882,
      "step": 275
    },
    {
      "epoch": 0.693069306930693,
      "grad_norm": 0.7420049831517054,
      "learning_rate": 1.6524823558333362e-05,
      "loss": 0.8985,
      "step": 280
    },
    {
      "epoch": 0.7054455445544554,
      "grad_norm": 0.7627405635266166,
      "learning_rate": 1.6359578025857495e-05,
      "loss": 0.8836,
      "step": 285
    },
    {
      "epoch": 0.7178217821782178,
      "grad_norm": 0.8778998231303615,
      "learning_rate": 1.6191363683842883e-05,
      "loss": 0.8871,
      "step": 290
    },
    {
      "epoch": 0.7301980198019802,
      "grad_norm": 0.8167069067691852,
      "learning_rate": 1.6020259058937228e-05,
      "loss": 0.8866,
      "step": 295
    },
    {
      "epoch": 0.7425742574257426,
      "grad_norm": 0.7980390861743758,
      "learning_rate": 1.5846344027044307e-05,
      "loss": 0.9047,
      "step": 300
    },
    {
      "epoch": 0.754950495049505,
      "grad_norm": 0.7563819762216695,
      "learning_rate": 1.5669699776035958e-05,
      "loss": 0.921,
      "step": 305
    },
    {
      "epoch": 0.7673267326732673,
      "grad_norm": 0.7797762078681911,
      "learning_rate": 1.5490408767851506e-05,
      "loss": 0.8869,
      "step": 310
    },
    {
      "epoch": 0.7797029702970297,
      "grad_norm": 0.7666252485199382,
      "learning_rate": 1.530855470000251e-05,
      "loss": 0.9151,
      "step": 315
    },
    {
      "epoch": 0.7920792079207921,
      "grad_norm": 0.7051563046414802,
      "learning_rate": 1.5124222466500665e-05,
      "loss": 0.9024,
      "step": 320
    },
    {
      "epoch": 0.8044554455445545,
      "grad_norm": 0.7203691895762679,
      "learning_rate": 1.4937498118227156e-05,
      "loss": 0.9098,
      "step": 325
    },
    {
      "epoch": 0.8168316831683168,
      "grad_norm": 0.7125293698622911,
      "learning_rate": 1.4748468822761974e-05,
      "loss": 0.9076,
      "step": 330
    },
    {
      "epoch": 0.8292079207920792,
      "grad_norm": 0.7428102709759479,
      "learning_rate": 1.4557222823691913e-05,
      "loss": 0.9082,
      "step": 335
    },
    {
      "epoch": 0.8415841584158416,
      "grad_norm": 0.7230219909417678,
      "learning_rate": 1.4363849399416254e-05,
      "loss": 0.9004,
      "step": 340
    },
    {
      "epoch": 0.8539603960396039,
      "grad_norm": 0.788928114591889,
      "learning_rate": 1.4168438821469402e-05,
      "loss": 0.8845,
      "step": 345
    },
    {
      "epoch": 0.8663366336633663,
      "grad_norm": 0.7456926963995498,
      "learning_rate": 1.3971082312379864e-05,
      "loss": 0.9013,
      "step": 350
    },
    {
      "epoch": 0.8787128712871287,
      "grad_norm": 0.7757067249831519,
      "learning_rate": 1.3771872003085315e-05,
      "loss": 0.8913,
      "step": 355
    },
    {
      "epoch": 0.8910891089108911,
      "grad_norm": 0.7040237849520753,
      "learning_rate": 1.3570900889923566e-05,
      "loss": 0.9178,
      "step": 360
    },
    {
      "epoch": 0.9034653465346535,
      "grad_norm": 0.7028463213970978,
      "learning_rate": 1.3368262791219568e-05,
      "loss": 0.8864,
      "step": 365
    },
    {
      "epoch": 0.9158415841584159,
      "grad_norm": 0.6865489126472716,
      "learning_rate": 1.3164052303488673e-05,
      "loss": 0.8958,
      "step": 370
    },
    {
      "epoch": 0.9282178217821783,
      "grad_norm": 0.7124826316517546,
      "learning_rate": 1.2958364757276616e-05,
      "loss": 0.8927,
      "step": 375
    },
    {
      "epoch": 0.9405940594059405,
      "grad_norm": 0.71875227320249,
      "learning_rate": 1.2751296172656862e-05,
      "loss": 0.897,
      "step": 380
    },
    {
      "epoch": 0.9529702970297029,
      "grad_norm": 0.7298632107047348,
      "learning_rate": 1.2542943214406012e-05,
      "loss": 0.9051,
      "step": 385
    },
    {
      "epoch": 0.9653465346534653,
      "grad_norm": 0.6885898822393058,
      "learning_rate": 1.23334031468783e-05,
      "loss": 0.8546,
      "step": 390
    },
    {
      "epoch": 0.9777227722772277,
      "grad_norm": 0.7078778884772863,
      "learning_rate": 1.2122773788600164e-05,
      "loss": 0.9019,
      "step": 395
    },
    {
      "epoch": 0.9900990099009901,
      "grad_norm": 0.7533302944336393,
      "learning_rate": 1.1911153466606105e-05,
      "loss": 0.895,
      "step": 400
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.06302809715271,
      "eval_runtime": 43.2706,
      "eval_samples_per_second": 75.871,
      "eval_steps_per_second": 1.202,
      "step": 404
    },
    {
      "epoch": 1.0024752475247525,
      "grad_norm": 1.3960472434954319,
      "learning_rate": 1.1698640970537195e-05,
      "loss": 0.8479,
      "step": 405
    },
    {
      "epoch": 1.0148514851485149,
      "grad_norm": 0.8222898474391185,
      "learning_rate": 1.14853355065236e-05,
      "loss": 0.7492,
      "step": 410
    },
    {
      "epoch": 1.0272277227722773,
      "grad_norm": 1.146533181549737,
      "learning_rate": 1.1271336650872687e-05,
      "loss": 0.7352,
      "step": 415
    },
    {
      "epoch": 1.0396039603960396,
      "grad_norm": 1.2500390362477898,
      "learning_rate": 1.1056744303584322e-05,
      "loss": 0.7107,
      "step": 420
    },
    {
      "epoch": 1.051980198019802,
      "grad_norm": 0.8882156471951722,
      "learning_rate": 1.0841658641715064e-05,
      "loss": 0.7027,
      "step": 425
    },
    {
      "epoch": 1.0643564356435644,
      "grad_norm": 0.8783518588023339,
      "learning_rate": 1.0626180072613011e-05,
      "loss": 0.7199,
      "step": 430
    },
    {
      "epoch": 1.0767326732673268,
      "grad_norm": 0.7956936034755372,
      "learning_rate": 1.0410409187045145e-05,
      "loss": 0.6972,
      "step": 435
    },
    {
      "epoch": 1.0891089108910892,
      "grad_norm": 0.7957451930016578,
      "learning_rate": 1.0194446712239076e-05,
      "loss": 0.7194,
      "step": 440
    },
    {
      "epoch": 1.1014851485148516,
      "grad_norm": 0.8085539993808099,
      "learning_rate": 9.978393464861036e-06,
      "loss": 0.7085,
      "step": 445
    },
    {
      "epoch": 1.113861386138614,
      "grad_norm": 0.7629426599829563,
      "learning_rate": 9.76235030395215e-06,
      "loss": 0.7221,
      "step": 450
    },
    {
      "epoch": 1.1262376237623761,
      "grad_norm": 0.7099439929036686,
      "learning_rate": 9.546418083844944e-06,
      "loss": 0.7228,
      "step": 455
    },
    {
      "epoch": 1.1386138613861387,
      "grad_norm": 0.768143602885412,
      "learning_rate": 9.330697607081995e-06,
      "loss": 0.7055,
      "step": 460
    },
    {
      "epoch": 1.150990099009901,
      "grad_norm": 0.7399932448725586,
      "learning_rate": 9.115289577358826e-06,
      "loss": 0.7126,
      "step": 465
    },
    {
      "epoch": 1.1633663366336633,
      "grad_norm": 0.7720020506192959,
      "learning_rate": 8.900294552512878e-06,
      "loss": 0.7095,
      "step": 470
    },
    {
      "epoch": 1.1757425742574257,
      "grad_norm": 0.7452407090275405,
      "learning_rate": 8.68581289758063e-06,
      "loss": 0.7028,
      "step": 475
    },
    {
      "epoch": 1.188118811881188,
      "grad_norm": 0.7571246453363882,
      "learning_rate": 8.471944737944687e-06,
      "loss": 0.7184,
      "step": 480
    },
    {
      "epoch": 1.2004950495049505,
      "grad_norm": 0.7494453898237526,
      "learning_rate": 8.25878991259276e-06,
      "loss": 0.6982,
      "step": 485
    },
    {
      "epoch": 1.2128712871287128,
      "grad_norm": 0.7254138409040641,
      "learning_rate": 8.046447927510335e-06,
      "loss": 0.7175,
      "step": 490
    },
    {
      "epoch": 1.2252475247524752,
      "grad_norm": 0.7709180774648441,
      "learning_rate": 7.835017909228801e-06,
      "loss": 0.7075,
      "step": 495
    },
    {
      "epoch": 1.2376237623762376,
      "grad_norm": 0.760113272357165,
      "learning_rate": 7.624598558550707e-06,
      "loss": 0.7224,
      "step": 500
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.7251746178009261,
      "learning_rate": 7.415288104473774e-06,
      "loss": 0.7059,
      "step": 505
    },
    {
      "epoch": 1.2623762376237624,
      "grad_norm": 0.7604221994294873,
      "learning_rate": 7.207184258335163e-06,
      "loss": 0.7022,
      "step": 510
    },
    {
      "epoch": 1.2747524752475248,
      "grad_norm": 0.7947902441474564,
      "learning_rate": 7.000384168197354e-06,
      "loss": 0.7076,
      "step": 515
    },
    {
      "epoch": 1.2871287128712872,
      "grad_norm": 0.7054449499686205,
      "learning_rate": 6.7949843734970475e-06,
      "loss": 0.7133,
      "step": 520
    },
    {
      "epoch": 1.2995049504950495,
      "grad_norm": 0.708545411017501,
      "learning_rate": 6.5910807599781135e-06,
      "loss": 0.7106,
      "step": 525
    },
    {
      "epoch": 1.311881188118812,
      "grad_norm": 0.7262363459114348,
      "learning_rate": 6.388768514929768e-06,
      "loss": 0.7114,
      "step": 530
    },
    {
      "epoch": 1.3242574257425743,
      "grad_norm": 0.6998186137578639,
      "learning_rate": 6.18814208275075e-06,
      "loss": 0.7122,
      "step": 535
    },
    {
      "epoch": 1.3366336633663367,
      "grad_norm": 0.7095860494984699,
      "learning_rate": 5.989295120860334e-06,
      "loss": 0.7252,
      "step": 540
    },
    {
      "epoch": 1.349009900990099,
      "grad_norm": 0.6915165561447487,
      "learning_rate": 5.792320455976714e-06,
      "loss": 0.7048,
      "step": 545
    },
    {
      "epoch": 1.3613861386138613,
      "grad_norm": 0.7119909691342722,
      "learning_rate": 5.597310040783161e-06,
      "loss": 0.6962,
      "step": 550
    },
    {
      "epoch": 1.3737623762376239,
      "grad_norm": 0.7504970044336818,
      "learning_rate": 5.404354911002243e-06,
      "loss": 0.707,
      "step": 555
    },
    {
      "epoch": 1.386138613861386,
      "grad_norm": 0.7146866752363458,
      "learning_rate": 5.213545142898061e-06,
      "loss": 0.7223,
      "step": 560
    },
    {
      "epoch": 1.3985148514851486,
      "grad_norm": 0.7356980622491117,
      "learning_rate": 5.024969811226419e-06,
      "loss": 0.6998,
      "step": 565
    },
    {
      "epoch": 1.4108910891089108,
      "grad_norm": 0.7061674308512749,
      "learning_rate": 4.838716947652485e-06,
      "loss": 0.6958,
      "step": 570
    },
    {
      "epoch": 1.4232673267326732,
      "grad_norm": 0.7284088021218859,
      "learning_rate": 4.654873499655449e-06,
      "loss": 0.6964,
      "step": 575
    },
    {
      "epoch": 1.4356435643564356,
      "grad_norm": 0.6886931942419792,
      "learning_rate": 4.4735252899392335e-06,
      "loss": 0.7023,
      "step": 580
    },
    {
      "epoch": 1.448019801980198,
      "grad_norm": 0.7166576506174707,
      "learning_rate": 4.294756976368351e-06,
      "loss": 0.7069,
      "step": 585
    },
    {
      "epoch": 1.4603960396039604,
      "grad_norm": 0.7032864356512767,
      "learning_rate": 4.118652012447486e-06,
      "loss": 0.7211,
      "step": 590
    },
    {
      "epoch": 1.4727722772277227,
      "grad_norm": 0.7242483728321348,
      "learning_rate": 3.945292608363312e-06,
      "loss": 0.7119,
      "step": 595
    },
    {
      "epoch": 1.4851485148514851,
      "grad_norm": 0.6918844982337927,
      "learning_rate": 3.7747596926067485e-06,
      "loss": 0.7221,
      "step": 600
    },
    {
      "epoch": 1.4975247524752475,
      "grad_norm": 0.7132050642871574,
      "learning_rate": 3.6071328741934985e-06,
      "loss": 0.7022,
      "step": 605
    },
    {
      "epoch": 1.50990099009901,
      "grad_norm": 0.7347264550601227,
      "learning_rate": 3.442490405500598e-06,
      "loss": 0.6975,
      "step": 610
    },
    {
      "epoch": 1.5222772277227723,
      "grad_norm": 0.6602507777994507,
      "learning_rate": 3.2809091457362464e-06,
      "loss": 0.7065,
      "step": 615
    },
    {
      "epoch": 1.5346534653465347,
      "grad_norm": 0.6740627682629335,
      "learning_rate": 3.122464525060013e-06,
      "loss": 0.6978,
      "step": 620
    },
    {
      "epoch": 1.547029702970297,
      "grad_norm": 0.7142221920558823,
      "learning_rate": 2.96723050937015e-06,
      "loss": 0.709,
      "step": 625
    },
    {
      "epoch": 1.5594059405940595,
      "grad_norm": 0.7335799725765082,
      "learning_rate": 2.8152795657744882e-06,
      "loss": 0.6893,
      "step": 630
    },
    {
      "epoch": 1.5717821782178216,
      "grad_norm": 0.7032104638603995,
      "learning_rate": 2.666682628760958e-06,
      "loss": 0.6961,
      "step": 635
    },
    {
      "epoch": 1.5841584158415842,
      "grad_norm": 0.6921897923437655,
      "learning_rate": 2.521509067083631e-06,
      "loss": 0.6938,
      "step": 640
    },
    {
      "epoch": 1.5965346534653464,
      "grad_norm": 0.6863228632248336,
      "learning_rate": 2.379826651379632e-06,
      "loss": 0.7033,
      "step": 645
    },
    {
      "epoch": 1.608910891089109,
      "grad_norm": 0.6778666475373859,
      "learning_rate": 2.241701522532136e-06,
      "loss": 0.7077,
      "step": 650
    },
    {
      "epoch": 1.6212871287128712,
      "grad_norm": 0.978566748682469,
      "learning_rate": 2.107198160794136e-06,
      "loss": 0.7102,
      "step": 655
    },
    {
      "epoch": 1.6336633663366338,
      "grad_norm": 0.7103004954900359,
      "learning_rate": 1.9763793556874655e-06,
      "loss": 0.6983,
      "step": 660
    },
    {
      "epoch": 1.646039603960396,
      "grad_norm": 0.676195777058198,
      "learning_rate": 1.849306176691088e-06,
      "loss": 0.7176,
      "step": 665
    },
    {
      "epoch": 1.6584158415841586,
      "grad_norm": 0.7079808939239,
      "learning_rate": 1.7260379447323327e-06,
      "loss": 0.6998,
      "step": 670
    },
    {
      "epoch": 1.6707920792079207,
      "grad_norm": 0.6910118308885802,
      "learning_rate": 1.6066322044944126e-06,
      "loss": 0.6847,
      "step": 675
    },
    {
      "epoch": 1.6831683168316833,
      "grad_norm": 0.6904569392825494,
      "learning_rate": 1.4911446975531329e-06,
      "loss": 0.7014,
      "step": 680
    },
    {
      "epoch": 1.6955445544554455,
      "grad_norm": 0.6979836145792466,
      "learning_rate": 1.3796293363553259e-06,
      "loss": 0.7252,
      "step": 685
    },
    {
      "epoch": 1.7079207920792079,
      "grad_norm": 0.6898195721719471,
      "learning_rate": 1.2721381790511832e-06,
      "loss": 0.7096,
      "step": 690
    },
    {
      "epoch": 1.7202970297029703,
      "grad_norm": 0.7872710836498652,
      "learning_rate": 1.168721405192218e-06,
      "loss": 0.7118,
      "step": 695
    },
    {
      "epoch": 1.7326732673267327,
      "grad_norm": 0.6351928901398578,
      "learning_rate": 1.0694272923061933e-06,
      "loss": 0.7073,
      "step": 700
    },
    {
      "epoch": 1.745049504950495,
      "grad_norm": 0.6550150479810684,
      "learning_rate": 9.743021933599695e-07,
      "loss": 0.6879,
      "step": 705
    },
    {
      "epoch": 1.7574257425742574,
      "grad_norm": 0.6762595095414403,
      "learning_rate": 8.833905151207833e-07,
      "loss": 0.6972,
      "step": 710
    },
    {
      "epoch": 1.7698019801980198,
      "grad_norm": 0.7046793104440422,
      "learning_rate": 7.967346974260626e-07,
      "loss": 0.7119,
      "step": 715
    },
    {
      "epoch": 1.7821782178217822,
      "grad_norm": 0.6803563399348503,
      "learning_rate": 7.143751933714583e-07,
      "loss": 0.7064,
      "step": 720
    },
    {
      "epoch": 1.7945544554455446,
      "grad_norm": 0.6569063272572411,
      "learning_rate": 6.363504504263207e-07,
      "loss": 0.694,
      "step": 725
    },
    {
      "epoch": 1.806930693069307,
      "grad_norm": 0.672487356470194,
      "learning_rate": 5.626968924854714e-07,
      "loss": 0.7133,
      "step": 730
    },
    {
      "epoch": 1.8193069306930694,
      "grad_norm": 0.6612411849360775,
      "learning_rate": 4.934489028656164e-07,
      "loss": 0.6933,
      "step": 735
    },
    {
      "epoch": 1.8316831683168315,
      "grad_norm": 0.7156254594176831,
      "learning_rate": 4.2863880825435687e-07,
      "loss": 0.7057,
      "step": 740
    },
    {
      "epoch": 1.8440594059405941,
      "grad_norm": 0.6833411888683371,
      "learning_rate": 3.682968636192863e-07,
      "loss": 0.7112,
      "step": 745
    },
    {
      "epoch": 1.8564356435643563,
      "grad_norm": 0.6969326586876218,
      "learning_rate": 3.124512380842204e-07,
      "loss": 0.6912,
      "step": 750
    },
    {
      "epoch": 1.868811881188119,
      "grad_norm": 0.6742873155450232,
      "learning_rate": 2.61128001779144e-07,
      "loss": 0.6823,
      "step": 755
    },
    {
      "epoch": 1.881188118811881,
      "grad_norm": 0.6414949104474531,
      "learning_rate": 2.1435111367002826e-07,
      "loss": 0.7126,
      "step": 760
    },
    {
      "epoch": 1.8935643564356437,
      "grad_norm": 0.6525583867438978,
      "learning_rate": 1.7214241037418312e-07,
      "loss": 0.6969,
      "step": 765
    },
    {
      "epoch": 1.9059405940594059,
      "grad_norm": 0.65521211994519,
      "learning_rate": 1.345215959663837e-07,
      "loss": 0.6903,
      "step": 770
    },
    {
      "epoch": 1.9183168316831685,
      "grad_norm": 0.6944361037558262,
      "learning_rate": 1.0150623278051719e-07,
      "loss": 0.6912,
      "step": 775
    },
    {
      "epoch": 1.9306930693069306,
      "grad_norm": 0.648743159889204,
      "learning_rate": 7.311173321104648e-08,
      "loss": 0.683,
      "step": 780
    },
    {
      "epoch": 1.943069306930693,
      "grad_norm": 0.6925205112702544,
      "learning_rate": 4.935135251811995e-08,
      "loss": 0.6934,
      "step": 785
    },
    {
      "epoch": 1.9554455445544554,
      "grad_norm": 0.6504666964480196,
      "learning_rate": 3.023618263968797e-08,
      "loss": 0.7186,
      "step": 790
    },
    {
      "epoch": 1.9678217821782178,
      "grad_norm": 0.7265261251377658,
      "learning_rate": 1.577514701350591e-08,
      "loss": 0.7216,
      "step": 795
    },
    {
      "epoch": 1.9801980198019802,
      "grad_norm": 0.6739382260611008,
      "learning_rate": 5.97499641145416e-09,
      "loss": 0.6928,
      "step": 800
    },
    {
      "epoch": 1.9925742574257426,
      "grad_norm": 0.6660006595388417,
      "learning_rate": 8.403057881067877e-10,
      "loss": 0.6941,
      "step": 805
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.07817542552948,
      "eval_runtime": 43.2932,
      "eval_samples_per_second": 75.832,
      "eval_steps_per_second": 1.201,
      "step": 808
    },
    {
      "epoch": 2.0,
      "step": 808,
      "total_flos": 169178761789440.0,
      "train_loss": 0.8271688128461933,
      "train_runtime": 2411.6836,
      "train_samples_per_second": 21.427,
      "train_steps_per_second": 0.335
    }
  ],
  "logging_steps": 5,
  "max_steps": 808,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 169178761789440.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}