|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.999581764951903, |
|
"eval_steps": 500, |
|
"global_step": 1195, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000836470096194061, |
|
"grad_norm": 109.33856201171875, |
|
"learning_rate": 2.4999999999999998e-06, |
|
"loss": 24.0328, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004182350480970306, |
|
"grad_norm": 97.91401672363281, |
|
"learning_rate": 1.2499999999999999e-05, |
|
"loss": 22.4374, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.008364700961940611, |
|
"grad_norm": 36.984405517578125, |
|
"learning_rate": 2.4999999999999998e-05, |
|
"loss": 20.9539, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.012547051442910916, |
|
"grad_norm": 20.562854766845703, |
|
"learning_rate": 3.75e-05, |
|
"loss": 18.0157, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.016729401923881223, |
|
"grad_norm": 7.916600227355957, |
|
"learning_rate": 4.9999999999999996e-05, |
|
"loss": 15.4241, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.020911752404851526, |
|
"grad_norm": 6.871030807495117, |
|
"learning_rate": 6.25e-05, |
|
"loss": 14.6159, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.025094102885821833, |
|
"grad_norm": 6.986784934997559, |
|
"learning_rate": 7.5e-05, |
|
"loss": 13.6058, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.029276453366792136, |
|
"grad_norm": 3.074172019958496, |
|
"learning_rate": 8.75e-05, |
|
"loss": 12.7264, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.033458803847762446, |
|
"grad_norm": 2.548049211502075, |
|
"learning_rate": 9.999999999999999e-05, |
|
"loss": 12.7985, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.037641154328732745, |
|
"grad_norm": 3.184255361557007, |
|
"learning_rate": 0.0001125, |
|
"loss": 12.2921, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04182350480970305, |
|
"grad_norm": 4.834357738494873, |
|
"learning_rate": 0.000125, |
|
"loss": 11.7192, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04600585529067336, |
|
"grad_norm": 8.06936264038086, |
|
"learning_rate": 0.00013749999999999998, |
|
"loss": 11.0815, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.050188205771643665, |
|
"grad_norm": 12.874434471130371, |
|
"learning_rate": 0.00015, |
|
"loss": 9.8728, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05437055625261397, |
|
"grad_norm": 18.794525146484375, |
|
"learning_rate": 0.00016249999999999997, |
|
"loss": 7.7488, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05855290673358427, |
|
"grad_norm": 21.42744255065918, |
|
"learning_rate": 0.000175, |
|
"loss": 4.8002, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06273525721455459, |
|
"grad_norm": 7.021483898162842, |
|
"learning_rate": 0.00018749999999999998, |
|
"loss": 2.2667, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06691760769552489, |
|
"grad_norm": 4.35729455947876, |
|
"learning_rate": 0.00019999999999999998, |
|
"loss": 1.7236, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07109995817649518, |
|
"grad_norm": 2.531404972076416, |
|
"learning_rate": 0.0002125, |
|
"loss": 1.4388, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07528230865746549, |
|
"grad_norm": 1.7126120328903198, |
|
"learning_rate": 0.000225, |
|
"loss": 1.2603, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0794646591384358, |
|
"grad_norm": 1.713827133178711, |
|
"learning_rate": 0.00023749999999999997, |
|
"loss": 1.1149, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0836470096194061, |
|
"grad_norm": 0.6418541073799133, |
|
"learning_rate": 0.00025, |
|
"loss": 1.0645, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08782936010037641, |
|
"grad_norm": 0.9687772989273071, |
|
"learning_rate": 0.0002625, |
|
"loss": 1.0047, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09201171058134672, |
|
"grad_norm": 0.8204954266548157, |
|
"learning_rate": 0.00027499999999999996, |
|
"loss": 0.986, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09619406106231702, |
|
"grad_norm": 0.5046463012695312, |
|
"learning_rate": 0.0002875, |
|
"loss": 0.9229, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.10037641154328733, |
|
"grad_norm": 1.1709442138671875, |
|
"learning_rate": 0.0003, |
|
"loss": 0.9344, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.10455876202425764, |
|
"grad_norm": 0.9831328392028809, |
|
"learning_rate": 0.0002999839868651235, |
|
"loss": 0.8812, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.10874111250522794, |
|
"grad_norm": 1.4019944667816162, |
|
"learning_rate": 0.0002999359508794339, |
|
"loss": 0.8814, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11292346298619825, |
|
"grad_norm": 0.78233802318573, |
|
"learning_rate": 0.00029985590229902073, |
|
"loss": 0.8701, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11710581346716854, |
|
"grad_norm": 1.4517076015472412, |
|
"learning_rate": 0.0002997438582149335, |
|
"loss": 0.8753, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12128816394813885, |
|
"grad_norm": 2.320331335067749, |
|
"learning_rate": 0.0002995998425495327, |
|
"loss": 0.8464, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.12547051442910917, |
|
"grad_norm": 1.0486135482788086, |
|
"learning_rate": 0.000299423886051382, |
|
"loss": 0.8498, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12965286491007946, |
|
"grad_norm": 0.9131810069084167, |
|
"learning_rate": 0.0002992160262886831, |
|
"loss": 0.8468, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.13383521539104978, |
|
"grad_norm": 1.4284504652023315, |
|
"learning_rate": 0.0002989763076412549, |
|
"loss": 0.8088, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.13801756587202008, |
|
"grad_norm": 0.6106438040733337, |
|
"learning_rate": 0.000298704781291058, |
|
"loss": 0.8215, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14219991635299037, |
|
"grad_norm": 0.5358127951622009, |
|
"learning_rate": 0.0002984015052112665, |
|
"loss": 0.8201, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1463822668339607, |
|
"grad_norm": 1.5017443895339966, |
|
"learning_rate": 0.0002980665441538907, |
|
"loss": 0.7957, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.15056461731493098, |
|
"grad_norm": 1.1214942932128906, |
|
"learning_rate": 0.00029769996963595184, |
|
"loss": 0.8083, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1547469677959013, |
|
"grad_norm": 2.1036202907562256, |
|
"learning_rate": 0.0002973018599242125, |
|
"loss": 0.7929, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1589293182768716, |
|
"grad_norm": 1.0557819604873657, |
|
"learning_rate": 0.0002968723000184662, |
|
"loss": 0.7868, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16311166875784192, |
|
"grad_norm": 0.9558168649673462, |
|
"learning_rate": 0.00029641138163338907, |
|
"loss": 0.7812, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1672940192388122, |
|
"grad_norm": 0.771851122379303, |
|
"learning_rate": 0.0002959192031789579, |
|
"loss": 0.7846, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17147636971978253, |
|
"grad_norm": 1.288865089416504, |
|
"learning_rate": 0.0002953958697394391, |
|
"loss": 0.777, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.17565872020075282, |
|
"grad_norm": 2.001302480697632, |
|
"learning_rate": 0.000294841493050952, |
|
"loss": 0.7797, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.17984107068172314, |
|
"grad_norm": 0.7828574776649475, |
|
"learning_rate": 0.0002942561914776124, |
|
"loss": 0.7815, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.18402342116269343, |
|
"grad_norm": 1.4854490756988525, |
|
"learning_rate": 0.00029364008998626086, |
|
"loss": 0.7608, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.18820577164366373, |
|
"grad_norm": 1.1406800746917725, |
|
"learning_rate": 0.00029299332011978107, |
|
"loss": 0.747, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.19238812212463405, |
|
"grad_norm": 1.7346460819244385, |
|
"learning_rate": 0.00029231601996901433, |
|
"loss": 0.7555, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.19657047260560434, |
|
"grad_norm": 1.7754813432693481, |
|
"learning_rate": 0.0002916083341432763, |
|
"loss": 0.7626, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.20075282308657466, |
|
"grad_norm": 1.2126661539077759, |
|
"learning_rate": 0.00029087041373948135, |
|
"loss": 0.7237, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.20493517356754495, |
|
"grad_norm": 1.930538535118103, |
|
"learning_rate": 0.00029010241630988217, |
|
"loss": 0.7672, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.20911752404851527, |
|
"grad_norm": 2.1792216300964355, |
|
"learning_rate": 0.0002893045058284311, |
|
"loss": 0.7416, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21329987452948557, |
|
"grad_norm": 1.416754961013794, |
|
"learning_rate": 0.0002884768526557703, |
|
"loss": 0.7196, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.2174822250104559, |
|
"grad_norm": 1.6103583574295044, |
|
"learning_rate": 0.0002876196335028581, |
|
"loss": 0.7397, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.22166457549142618, |
|
"grad_norm": 1.0755459070205688, |
|
"learning_rate": 0.0002867330313932402, |
|
"loss": 0.7644, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.2258469259723965, |
|
"grad_norm": 0.8303298354148865, |
|
"learning_rate": 0.000285817235623972, |
|
"loss": 0.7393, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2300292764533668, |
|
"grad_norm": 1.4747998714447021, |
|
"learning_rate": 0.00028487244172520246, |
|
"loss": 0.7121, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.23421162693433709, |
|
"grad_norm": 2.582953929901123, |
|
"learning_rate": 0.0002838988514184267, |
|
"loss": 0.7361, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2383939774153074, |
|
"grad_norm": 2.413325309753418, |
|
"learning_rate": 0.0002828966725734167, |
|
"loss": 0.74, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.2425763278962777, |
|
"grad_norm": 0.7637856006622314, |
|
"learning_rate": 0.0002818661191638393, |
|
"loss": 0.7096, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.24675867837724802, |
|
"grad_norm": 1.757056713104248, |
|
"learning_rate": 0.0002808074112215711, |
|
"loss": 0.7205, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.25094102885821834, |
|
"grad_norm": 0.8766753077507019, |
|
"learning_rate": 0.0002797207747897198, |
|
"loss": 0.7098, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2551233793391886, |
|
"grad_norm": 1.449209213256836, |
|
"learning_rate": 0.00027860644187436195, |
|
"loss": 0.725, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2593057298201589, |
|
"grad_norm": 0.6825206875801086, |
|
"learning_rate": 0.0002774646503950078, |
|
"loss": 0.6938, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.26348808030112925, |
|
"grad_norm": 1.119585394859314, |
|
"learning_rate": 0.0002762956441338036, |
|
"loss": 0.698, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.26767043078209957, |
|
"grad_norm": 0.9425824880599976, |
|
"learning_rate": 0.0002750996726834817, |
|
"loss": 0.7189, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.27185278126306983, |
|
"grad_norm": 0.5979897975921631, |
|
"learning_rate": 0.0002738769913940706, |
|
"loss": 0.7039, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.27603513174404015, |
|
"grad_norm": 1.8769757747650146, |
|
"learning_rate": 0.00027262786131837573, |
|
"loss": 0.7035, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2802174822250105, |
|
"grad_norm": 1.1395800113677979, |
|
"learning_rate": 0.0002713525491562421, |
|
"loss": 0.6898, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.28439983270598074, |
|
"grad_norm": 1.0573526620864868, |
|
"learning_rate": 0.0002700513271976119, |
|
"loss": 0.7042, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.28858218318695106, |
|
"grad_norm": 0.5185459852218628, |
|
"learning_rate": 0.0002687244732643881, |
|
"loss": 0.6914, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.2927645336679214, |
|
"grad_norm": 2.7914602756500244, |
|
"learning_rate": 0.0002673722706511174, |
|
"loss": 0.7049, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2969468841488917, |
|
"grad_norm": 3.0459792613983154, |
|
"learning_rate": 0.000265995008064504, |
|
"loss": 0.7148, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.30112923462986196, |
|
"grad_norm": 2.1906723976135254, |
|
"learning_rate": 0.00026459297956176885, |
|
"loss": 0.7074, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3053115851108323, |
|
"grad_norm": 1.6257227659225464, |
|
"learning_rate": 0.00026316648448786536, |
|
"loss": 0.6985, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3094939355918026, |
|
"grad_norm": 0.7152910828590393, |
|
"learning_rate": 0.00026171582741156725, |
|
"loss": 0.6875, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3136762860727729, |
|
"grad_norm": 2.4449851512908936, |
|
"learning_rate": 0.0002602413180604401, |
|
"loss": 0.6787, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.3178586365537432, |
|
"grad_norm": 0.5180588960647583, |
|
"learning_rate": 0.000258743271254712, |
|
"loss": 0.6724, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3220409870347135, |
|
"grad_norm": 1.5739381313323975, |
|
"learning_rate": 0.00025722200684005715, |
|
"loss": 0.7076, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.32622333751568383, |
|
"grad_norm": 0.8701817989349365, |
|
"learning_rate": 0.00025567784961930546, |
|
"loss": 0.6841, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3304056879966541, |
|
"grad_norm": 1.474747896194458, |
|
"learning_rate": 0.0002541111292830951, |
|
"loss": 0.713, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3345880384776244, |
|
"grad_norm": 1.8884798288345337, |
|
"learning_rate": 0.00025252218033947993, |
|
"loss": 0.6893, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.33877038895859474, |
|
"grad_norm": 0.8834472894668579, |
|
"learning_rate": 0.00025091134204250997, |
|
"loss": 0.6966, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.34295273943956506, |
|
"grad_norm": 0.6324520707130432, |
|
"learning_rate": 0.00024927895831979745, |
|
"loss": 0.6882, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3471350899205353, |
|
"grad_norm": 2.353163480758667, |
|
"learning_rate": 0.00024762537769908535, |
|
"loss": 0.6829, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.35131744040150564, |
|
"grad_norm": 1.3682096004486084, |
|
"learning_rate": 0.00024595095323383365, |
|
"loss": 0.6912, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.35549979088247596, |
|
"grad_norm": 0.9962055087089539, |
|
"learning_rate": 0.0002442560424278399, |
|
"loss": 0.6857, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3596821413634463, |
|
"grad_norm": 1.1282930374145508, |
|
"learning_rate": 0.00024254100715890846, |
|
"loss": 0.6696, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.36386449184441655, |
|
"grad_norm": 0.934388279914856, |
|
"learning_rate": 0.00024080621360158717, |
|
"loss": 0.6841, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.36804684232538687, |
|
"grad_norm": 1.4339077472686768, |
|
"learning_rate": 0.00023905203214898558, |
|
"loss": 0.6705, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3722291928063572, |
|
"grad_norm": 1.0309265851974487, |
|
"learning_rate": 0.00023727883733369292, |
|
"loss": 0.6706, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.37641154328732745, |
|
"grad_norm": 1.9208811521530151, |
|
"learning_rate": 0.00023548700774781242, |
|
"loss": 0.6637, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3805938937682978, |
|
"grad_norm": 1.0379974842071533, |
|
"learning_rate": 0.00023367692596212858, |
|
"loss": 0.68, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.3847762442492681, |
|
"grad_norm": 1.852662444114685, |
|
"learning_rate": 0.00023184897844442495, |
|
"loss": 0.6589, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3889585947302384, |
|
"grad_norm": 1.1750479936599731, |
|
"learning_rate": 0.00023000355547697027, |
|
"loss": 0.6675, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3931409452112087, |
|
"grad_norm": 1.6473002433776855, |
|
"learning_rate": 0.00022814105107318952, |
|
"loss": 0.6709, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.397323295692179, |
|
"grad_norm": 1.2356650829315186, |
|
"learning_rate": 0.00022626186289353913, |
|
"loss": 0.6652, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4015056461731493, |
|
"grad_norm": 1.1605840921401978, |
|
"learning_rate": 0.00022436639216060275, |
|
"loss": 0.6698, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.40568799665411964, |
|
"grad_norm": 1.5935866832733154, |
|
"learning_rate": 0.00022245504357342716, |
|
"loss": 0.6688, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.4098703471350899, |
|
"grad_norm": 0.810558557510376, |
|
"learning_rate": 0.00022052822522111522, |
|
"loss": 0.6524, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.41405269761606023, |
|
"grad_norm": 0.7008018493652344, |
|
"learning_rate": 0.00021858634849569576, |
|
"loss": 0.6924, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.41823504809703055, |
|
"grad_norm": 1.7558863162994385, |
|
"learning_rate": 0.0002166298280042877, |
|
"loss": 0.6711, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4224173985780008, |
|
"grad_norm": 1.573688268661499, |
|
"learning_rate": 0.00021465908148057787, |
|
"loss": 0.6674, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.42659974905897113, |
|
"grad_norm": 1.4761265516281128, |
|
"learning_rate": 0.00021267452969563153, |
|
"loss": 0.6706, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.43078209953994145, |
|
"grad_norm": 1.7749208211898804, |
|
"learning_rate": 0.00021067659636805403, |
|
"loss": 0.6469, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.4349644500209118, |
|
"grad_norm": 1.0164939165115356, |
|
"learning_rate": 0.00020866570807352337, |
|
"loss": 0.6764, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.43914680050188204, |
|
"grad_norm": 1.6237319707870483, |
|
"learning_rate": 0.00020664229415371266, |
|
"loss": 0.6694, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.44332915098285236, |
|
"grad_norm": 1.5586035251617432, |
|
"learning_rate": 0.00020460678662462194, |
|
"loss": 0.6562, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4475115014638227, |
|
"grad_norm": 1.771645188331604, |
|
"learning_rate": 0.0002025596200843394, |
|
"loss": 0.6622, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.451693851944793, |
|
"grad_norm": 0.5951160788536072, |
|
"learning_rate": 0.0002005012316202506, |
|
"loss": 0.651, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.45587620242576327, |
|
"grad_norm": 0.793093740940094, |
|
"learning_rate": 0.00019843206071571692, |
|
"loss": 0.6634, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.4600585529067336, |
|
"grad_norm": 1.0352814197540283, |
|
"learning_rate": 0.0001963525491562421, |
|
"loss": 0.6636, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4642409033877039, |
|
"grad_norm": 0.843008816242218, |
|
"learning_rate": 0.00019426314093514717, |
|
"loss": 0.6407, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.46842325386867417, |
|
"grad_norm": 1.7540709972381592, |
|
"learning_rate": 0.00019216428215877425, |
|
"loss": 0.638, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.4726056043496445, |
|
"grad_norm": 0.5828922390937805, |
|
"learning_rate": 0.00019005642095123895, |
|
"loss": 0.6625, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.4767879548306148, |
|
"grad_norm": 0.7700462937355042, |
|
"learning_rate": 0.00018794000735875208, |
|
"loss": 0.6428, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.48097030531158513, |
|
"grad_norm": 0.8344655632972717, |
|
"learning_rate": 0.00018581549325353126, |
|
"loss": 0.6553, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.4851526557925554, |
|
"grad_norm": 1.2676873207092285, |
|
"learning_rate": 0.000183683332237322, |
|
"loss": 0.6645, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.4893350062735257, |
|
"grad_norm": 1.4888837337493896, |
|
"learning_rate": 0.00018154397954454993, |
|
"loss": 0.6859, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.49351735675449604, |
|
"grad_norm": 0.7020601034164429, |
|
"learning_rate": 0.00017939789194512472, |
|
"loss": 0.6456, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.49769970723546636, |
|
"grad_norm": 1.1964813470840454, |
|
"learning_rate": 0.00017724552764691545, |
|
"loss": 0.6594, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5018820577164367, |
|
"grad_norm": 1.1332772970199585, |
|
"learning_rate": 0.00017508734619791966, |
|
"loss": 0.6606, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.506064408197407, |
|
"grad_norm": 1.6122368574142456, |
|
"learning_rate": 0.00017292380838814577, |
|
"loss": 0.6468, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5102467586783772, |
|
"grad_norm": 0.8950415849685669, |
|
"learning_rate": 0.00017075537615123042, |
|
"loss": 0.6615, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5144291091593476, |
|
"grad_norm": 1.9138753414154053, |
|
"learning_rate": 0.00016858251246581216, |
|
"loss": 0.6683, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5186114596403179, |
|
"grad_norm": 0.9320158362388611, |
|
"learning_rate": 0.00016640568125668117, |
|
"loss": 0.6734, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5227938101212881, |
|
"grad_norm": 1.2331713438034058, |
|
"learning_rate": 0.00016422534729572738, |
|
"loss": 0.6582, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5269761606022585, |
|
"grad_norm": 1.1182340383529663, |
|
"learning_rate": 0.00016204197610270816, |
|
"loss": 0.6533, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5311585110832288, |
|
"grad_norm": 0.6500148773193359, |
|
"learning_rate": 0.00015985603384585542, |
|
"loss": 0.6396, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5353408615641991, |
|
"grad_norm": 0.9531376361846924, |
|
"learning_rate": 0.00015766798724234506, |
|
"loss": 0.6337, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5395232120451694, |
|
"grad_norm": 0.7729771733283997, |
|
"learning_rate": 0.00015547830345864885, |
|
"loss": 0.6498, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5437055625261397, |
|
"grad_norm": 0.6831007599830627, |
|
"learning_rate": 0.0001532874500107902, |
|
"loss": 0.6404, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.54788791300711, |
|
"grad_norm": 1.2160038948059082, |
|
"learning_rate": 0.00015109589466452594, |
|
"loss": 0.658, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5520702634880803, |
|
"grad_norm": 1.015773057937622, |
|
"learning_rate": 0.00014890410533547404, |
|
"loss": 0.6507, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5562526139690506, |
|
"grad_norm": 1.208256721496582, |
|
"learning_rate": 0.00014671254998920976, |
|
"loss": 0.6399, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.560434964450021, |
|
"grad_norm": 0.7431871294975281, |
|
"learning_rate": 0.00014452169654135115, |
|
"loss": 0.6534, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5646173149309912, |
|
"grad_norm": 0.6661595702171326, |
|
"learning_rate": 0.00014233201275765494, |
|
"loss": 0.6343, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5687996654119615, |
|
"grad_norm": 1.2753201723098755, |
|
"learning_rate": 0.00014014396615414458, |
|
"loss": 0.6296, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5729820158929319, |
|
"grad_norm": 1.4110949039459229, |
|
"learning_rate": 0.00013795802389729184, |
|
"loss": 0.6452, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5771643663739021, |
|
"grad_norm": 1.4824358224868774, |
|
"learning_rate": 0.00013577465270427262, |
|
"loss": 0.6348, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5813467168548725, |
|
"grad_norm": 1.8900264501571655, |
|
"learning_rate": 0.00013359431874331886, |
|
"loss": 0.6509, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5855290673358428, |
|
"grad_norm": 1.652632236480713, |
|
"learning_rate": 0.0001314174875341878, |
|
"loss": 0.6206, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.589711417816813, |
|
"grad_norm": 1.1248772144317627, |
|
"learning_rate": 0.00012924462384876953, |
|
"loss": 0.6299, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5938937682977834, |
|
"grad_norm": 0.7448098659515381, |
|
"learning_rate": 0.00012707619161185423, |
|
"loss": 0.6483, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5980761187787537, |
|
"grad_norm": 0.6708864569664001, |
|
"learning_rate": 0.00012491265380208032, |
|
"loss": 0.6473, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6022584692597239, |
|
"grad_norm": 0.8381022810935974, |
|
"learning_rate": 0.00012275447235308453, |
|
"loss": 0.6356, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6064408197406943, |
|
"grad_norm": 1.3462333679199219, |
|
"learning_rate": 0.00012060210805487529, |
|
"loss": 0.6388, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6106231702216646, |
|
"grad_norm": 1.2015129327774048, |
|
"learning_rate": 0.00011845602045545008, |
|
"loss": 0.6258, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6148055207026348, |
|
"grad_norm": 0.7825962901115417, |
|
"learning_rate": 0.00011631666776267803, |
|
"loss": 0.6401, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6189878711836052, |
|
"grad_norm": 0.9470372200012207, |
|
"learning_rate": 0.00011418450674646868, |
|
"loss": 0.6501, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6231702216645755, |
|
"grad_norm": 0.9243600368499756, |
|
"learning_rate": 0.00011205999264124786, |
|
"loss": 0.6195, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6273525721455459, |
|
"grad_norm": 1.931402325630188, |
|
"learning_rate": 0.00010994357904876106, |
|
"loss": 0.6264, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6315349226265161, |
|
"grad_norm": 1.7754958868026733, |
|
"learning_rate": 0.00010783571784122577, |
|
"loss": 0.6351, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.6357172731074864, |
|
"grad_norm": 0.627479076385498, |
|
"learning_rate": 0.00010573685906485282, |
|
"loss": 0.6395, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6398996235884568, |
|
"grad_norm": 2.4568421840667725, |
|
"learning_rate": 0.0001036474508437579, |
|
"loss": 0.6257, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.644081974069427, |
|
"grad_norm": 1.9016671180725098, |
|
"learning_rate": 0.0001015679392842831, |
|
"loss": 0.6446, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6482643245503973, |
|
"grad_norm": 0.7173120975494385, |
|
"learning_rate": 9.949876837974944e-05, |
|
"loss": 0.6312, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6524466750313677, |
|
"grad_norm": 1.9048292636871338, |
|
"learning_rate": 9.744037991566058e-05, |
|
"loss": 0.622, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6566290255123379, |
|
"grad_norm": 1.2454332113265991, |
|
"learning_rate": 9.5393213375378e-05, |
|
"loss": 0.6219, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6608113759933082, |
|
"grad_norm": 1.0283215045928955, |
|
"learning_rate": 9.33577058462873e-05, |
|
"loss": 0.6236, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6649937264742786, |
|
"grad_norm": 1.8116226196289062, |
|
"learning_rate": 9.133429192647661e-05, |
|
"loss": 0.6244, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6691760769552488, |
|
"grad_norm": 0.8519335389137268, |
|
"learning_rate": 8.932340363194595e-05, |
|
"loss": 0.6253, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6733584274362192, |
|
"grad_norm": 1.004647135734558, |
|
"learning_rate": 8.73254703043685e-05, |
|
"loss": 0.6278, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.6775407779171895, |
|
"grad_norm": 0.945331871509552, |
|
"learning_rate": 8.534091851942214e-05, |
|
"loss": 0.6251, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6817231283981597, |
|
"grad_norm": 0.4643709659576416, |
|
"learning_rate": 8.337017199571235e-05, |
|
"loss": 0.6298, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6859054788791301, |
|
"grad_norm": 0.8933060169219971, |
|
"learning_rate": 8.141365150430421e-05, |
|
"loss": 0.6419, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6900878293601004, |
|
"grad_norm": 2.5518875122070312, |
|
"learning_rate": 7.947177477888472e-05, |
|
"loss": 0.6424, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6942701798410706, |
|
"grad_norm": 0.7830976247787476, |
|
"learning_rate": 7.754495642657282e-05, |
|
"loss": 0.6292, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.698452530322041, |
|
"grad_norm": 1.2565546035766602, |
|
"learning_rate": 7.563360783939722e-05, |
|
"loss": 0.6308, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7026348808030113, |
|
"grad_norm": 1.588156819343567, |
|
"learning_rate": 7.373813710646083e-05, |
|
"loss": 0.6249, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7068172312839816, |
|
"grad_norm": 0.6486766934394836, |
|
"learning_rate": 7.185894892681048e-05, |
|
"loss": 0.6308, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7109995817649519, |
|
"grad_norm": 0.7454473972320557, |
|
"learning_rate": 6.999644452302975e-05, |
|
"loss": 0.6267, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7151819322459222, |
|
"grad_norm": 0.6658061146736145, |
|
"learning_rate": 6.815102155557501e-05, |
|
"loss": 0.6162, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.7193642827268926, |
|
"grad_norm": 1.0052908658981323, |
|
"learning_rate": 6.632307403787138e-05, |
|
"loss": 0.644, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7235466332078628, |
|
"grad_norm": 0.7472626566886902, |
|
"learning_rate": 6.451299225218754e-05, |
|
"loss": 0.616, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.7277289836888331, |
|
"grad_norm": 0.587893009185791, |
|
"learning_rate": 6.27211626663071e-05, |
|
"loss": 0.6318, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7319113341698035, |
|
"grad_norm": 0.898607611656189, |
|
"learning_rate": 6.0947967851014405e-05, |
|
"loss": 0.6409, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.7360936846507737, |
|
"grad_norm": 0.7444003224372864, |
|
"learning_rate": 5.919378639841281e-05, |
|
"loss": 0.6214, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.740276035131744, |
|
"grad_norm": 1.199029564857483, |
|
"learning_rate": 5.745899284109154e-05, |
|
"loss": 0.6184, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7444583856127144, |
|
"grad_norm": 1.2874826192855835, |
|
"learning_rate": 5.57439575721601e-05, |
|
"loss": 0.6233, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7486407360936846, |
|
"grad_norm": 1.3848364353179932, |
|
"learning_rate": 5.4049046766166335e-05, |
|
"loss": 0.6043, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7528230865746549, |
|
"grad_norm": 0.6463631987571716, |
|
"learning_rate": 5.237462230091467e-05, |
|
"loss": 0.6361, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7570054370556253, |
|
"grad_norm": 1.0429089069366455, |
|
"learning_rate": 5.07210416802025e-05, |
|
"loss": 0.6206, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.7611877875365956, |
|
"grad_norm": 0.7253705263137817, |
|
"learning_rate": 4.908865795748999e-05, |
|
"loss": 0.6312, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.7653701380175659, |
|
"grad_norm": 0.498542457818985, |
|
"learning_rate": 4.74778196605201e-05, |
|
"loss": 0.6421, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.7695524884985362, |
|
"grad_norm": 0.6464426517486572, |
|
"learning_rate": 4.58888707169049e-05, |
|
"loss": 0.6047, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.7737348389795065, |
|
"grad_norm": 0.6658442616462708, |
|
"learning_rate": 4.432215038069449e-05, |
|
"loss": 0.623, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.7779171894604768, |
|
"grad_norm": 0.6936389207839966, |
|
"learning_rate": 4.277799315994286e-05, |
|
"loss": 0.6226, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7820995399414471, |
|
"grad_norm": 0.5900949835777283, |
|
"learning_rate": 4.125672874528797e-05, |
|
"loss": 0.6314, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.7862818904224174, |
|
"grad_norm": 0.9340611100196838, |
|
"learning_rate": 3.97586819395599e-05, |
|
"loss": 0.6252, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.7904642409033877, |
|
"grad_norm": 1.272733211517334, |
|
"learning_rate": 3.8284172588432716e-05, |
|
"loss": 0.6236, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.794646591384358, |
|
"grad_norm": 0.7782835364341736, |
|
"learning_rate": 3.6833515512134606e-05, |
|
"loss": 0.6096, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7988289418653283, |
|
"grad_norm": 0.6020464301109314, |
|
"learning_rate": 3.540702043823113e-05, |
|
"loss": 0.6124, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8030112923462986, |
|
"grad_norm": 0.675359845161438, |
|
"learning_rate": 3.4004991935496004e-05, |
|
"loss": 0.5955, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8071936428272689, |
|
"grad_norm": 0.5639395117759705, |
|
"learning_rate": 3.262772934888265e-05, |
|
"loss": 0.6069, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8113759933082393, |
|
"grad_norm": 0.4711320698261261, |
|
"learning_rate": 3.1275526735611896e-05, |
|
"loss": 0.6102, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8155583437892095, |
|
"grad_norm": 1.0711911916732788, |
|
"learning_rate": 2.9948672802388135e-05, |
|
"loss": 0.6391, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.8197406942701798, |
|
"grad_norm": 0.4709874987602234, |
|
"learning_rate": 2.8647450843757897e-05, |
|
"loss": 0.6186, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8239230447511502, |
|
"grad_norm": 0.5402533411979675, |
|
"learning_rate": 2.7372138681624244e-05, |
|
"loss": 0.613, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8281053952321205, |
|
"grad_norm": 0.6864106059074402, |
|
"learning_rate": 2.6123008605929375e-05, |
|
"loss": 0.6215, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8322877457130907, |
|
"grad_norm": 0.5939123630523682, |
|
"learning_rate": 2.4900327316518326e-05, |
|
"loss": 0.6168, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8364700961940611, |
|
"grad_norm": 0.7122395038604736, |
|
"learning_rate": 2.3704355866196373e-05, |
|
"loss": 0.6053, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8406524466750314, |
|
"grad_norm": 0.6920621395111084, |
|
"learning_rate": 2.2535349604992153e-05, |
|
"loss": 0.6097, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.8448347971560016, |
|
"grad_norm": 1.7864691019058228, |
|
"learning_rate": 2.1393558125638066e-05, |
|
"loss": 0.6382, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.849017147636972, |
|
"grad_norm": 0.7472600936889648, |
|
"learning_rate": 2.027922521028018e-05, |
|
"loss": 0.6159, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.8531994981179423, |
|
"grad_norm": 1.722579836845398, |
|
"learning_rate": 1.9192588778428842e-05, |
|
"loss": 0.6011, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8573818485989126, |
|
"grad_norm": 0.5121240019798279, |
|
"learning_rate": 1.813388083616068e-05, |
|
"loss": 0.6031, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.8615641990798829, |
|
"grad_norm": 0.5083288550376892, |
|
"learning_rate": 1.7103327426583265e-05, |
|
"loss": 0.5845, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.8657465495608532, |
|
"grad_norm": 0.46453267335891724, |
|
"learning_rate": 1.6101148581573274e-05, |
|
"loss": 0.6031, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.8699289000418235, |
|
"grad_norm": 0.8352246880531311, |
|
"learning_rate": 1.5127558274797535e-05, |
|
"loss": 0.6024, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.8741112505227938, |
|
"grad_norm": 0.8040021061897278, |
|
"learning_rate": 1.4182764376028006e-05, |
|
"loss": 0.635, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.8782936010037641, |
|
"grad_norm": 0.7152721881866455, |
|
"learning_rate": 1.326696860675981e-05, |
|
"loss": 0.6162, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8824759514847345, |
|
"grad_norm": 0.6833348274230957, |
|
"learning_rate": 1.2380366497141886e-05, |
|
"loss": 0.6217, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.8866583019657047, |
|
"grad_norm": 0.6803585886955261, |
|
"learning_rate": 1.1523147344229716e-05, |
|
"loss": 0.6218, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.890840652446675, |
|
"grad_norm": 0.9396490454673767, |
|
"learning_rate": 1.069549417156887e-05, |
|
"loss": 0.6176, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.8950230029276454, |
|
"grad_norm": 0.7006051540374756, |
|
"learning_rate": 9.89758369011781e-06, |
|
"loss": 0.6123, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.8992053534086156, |
|
"grad_norm": 0.5886880159378052, |
|
"learning_rate": 9.129586260518634e-06, |
|
"loss": 0.5923, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.903387703889586, |
|
"grad_norm": 0.5032349228858948, |
|
"learning_rate": 8.391665856723655e-06, |
|
"loss": 0.619, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.9075700543705563, |
|
"grad_norm": 0.51763916015625, |
|
"learning_rate": 7.683980030985654e-06, |
|
"loss": 0.6039, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.9117524048515265, |
|
"grad_norm": 0.588211715221405, |
|
"learning_rate": 7.006679880218974e-06, |
|
"loss": 0.6057, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.9159347553324969, |
|
"grad_norm": 0.6631506681442261, |
|
"learning_rate": 6.359910013739122e-06, |
|
"loss": 0.6106, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.9201171058134672, |
|
"grad_norm": 0.5523665547370911, |
|
"learning_rate": 5.743808522387544e-06, |
|
"loss": 0.6058, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.9242994562944374, |
|
"grad_norm": 0.5903011560440063, |
|
"learning_rate": 5.158506949047975e-06, |
|
"loss": 0.6321, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.9284818067754078, |
|
"grad_norm": 0.6151677966117859, |
|
"learning_rate": 4.604130260560873e-06, |
|
"loss": 0.6171, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.9326641572563781, |
|
"grad_norm": 0.5324861407279968, |
|
"learning_rate": 4.080796821042082e-06, |
|
"loss": 0.6184, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.9368465077373483, |
|
"grad_norm": 0.5442612171173096, |
|
"learning_rate": 3.5886183666109405e-06, |
|
"loss": 0.6069, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.9410288582183187, |
|
"grad_norm": 0.4981847405433655, |
|
"learning_rate": 3.1276999815337544e-06, |
|
"loss": 0.62, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.945211208699289, |
|
"grad_norm": 0.45795848965644836, |
|
"learning_rate": 2.6981400757874584e-06, |
|
"loss": 0.6027, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9493935591802594, |
|
"grad_norm": 0.78538978099823, |
|
"learning_rate": 2.3000303640481386e-06, |
|
"loss": 0.6084, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.9535759096612296, |
|
"grad_norm": 0.5453292727470398, |
|
"learning_rate": 1.9334558461092663e-06, |
|
"loss": 0.6043, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.9577582601421999, |
|
"grad_norm": 0.5924518704414368, |
|
"learning_rate": 1.598494788733462e-06, |
|
"loss": 0.6033, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.9619406106231703, |
|
"grad_norm": 0.5484139323234558, |
|
"learning_rate": 1.2952187089419642e-06, |
|
"loss": 0.616, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.9661229611041405, |
|
"grad_norm": 0.5363529324531555, |
|
"learning_rate": 1.0236923587450263e-06, |
|
"loss": 0.6196, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.9703053115851108, |
|
"grad_norm": 0.6777392625808716, |
|
"learning_rate": 7.839737113168931e-07, |
|
"loss": 0.6102, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.9744876620660812, |
|
"grad_norm": 0.5920884609222412, |
|
"learning_rate": 5.761139486180178e-07, |
|
"loss": 0.6075, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.9786700125470514, |
|
"grad_norm": 0.4196658730506897, |
|
"learning_rate": 4.0015745046725336e-07, |
|
"loss": 0.6017, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.9828523630280217, |
|
"grad_norm": 0.6661626100540161, |
|
"learning_rate": 2.5614178506644934e-07, |
|
"loss": 0.5787, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.9870347135089921, |
|
"grad_norm": 0.6183582544326782, |
|
"learning_rate": 1.4409770097926765e-07, |
|
"loss": 0.6188, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.9912170639899623, |
|
"grad_norm": 0.5524072051048279, |
|
"learning_rate": 6.40491205661009e-08, |
|
"loss": 0.6089, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.9953994144709327, |
|
"grad_norm": 0.5690078735351562, |
|
"learning_rate": 1.6013134876491362e-08, |
|
"loss": 0.6079, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.999581764951903, |
|
"grad_norm": 0.4828049838542938, |
|
"learning_rate": 0.0, |
|
"loss": 0.5901, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.999581764951903, |
|
"eval_loss": 1.2799842357635498, |
|
"eval_runtime": 0.8401, |
|
"eval_samples_per_second": 5.951, |
|
"eval_steps_per_second": 1.19, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.999581764951903, |
|
"step": 1195, |
|
"total_flos": 9.109418934146171e+17, |
|
"train_loss": 1.440422640086218, |
|
"train_runtime": 6570.4384, |
|
"train_samples_per_second": 2.911, |
|
"train_steps_per_second": 0.182 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1195, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.109418934146171e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|